summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--.gitignore18
-rw-r--r--ANNOUNCE-3.098
-rw-r--r--ANNOUNCE-3.0.122
-rw-r--r--ANNOUNCE-3.0.221
-rw-r--r--ANNOUNCE-3.0.329
-rw-r--r--ANNOUNCE-3.133
-rw-r--r--ANNOUNCE-3.1.139
-rw-r--r--ANNOUNCE-3.1.246
-rw-r--r--ANNOUNCE-3.1.346
-rw-r--r--ANNOUNCE-3.1.437
-rw-r--r--ANNOUNCE-3.1.542
-rw-r--r--ANNOUNCE-3.277
-rw-r--r--ANNOUNCE-3.2.175
-rw-r--r--ANNOUNCE-3.2.236
-rw-r--r--ANNOUNCE-3.2.324
-rw-r--r--ANNOUNCE-3.2.4144
-rw-r--r--ANNOUNCE-3.2.531
-rw-r--r--ANNOUNCE-3.2.657
-rw-r--r--ANNOUNCE-3.363
-rw-r--r--ANNOUNCE-3.3.123
-rw-r--r--ANNOUNCE-3.3.216
-rw-r--r--ANNOUNCE-3.3.318
-rw-r--r--ANNOUNCE-3.3.437
-rw-r--r--ANNOUNCE-3.424
-rw-r--r--Assemble.c2070
-rw-r--r--Build.c292
-rw-r--r--COPYING339
-rw-r--r--ChangeLog306
-rw-r--r--Create.c1071
-rw-r--r--Detail.c768
-rw-r--r--Dump.c311
-rw-r--r--Examine.c225
-rwxr-xr-xGrow.c4985
-rw-r--r--INSTALL13
-rw-r--r--Incremental.c1808
-rw-r--r--Kill.c146
-rw-r--r--Makefile345
-rw-r--r--Manage.c1786
-rw-r--r--Monitor.c1143
-rw-r--r--Query.c126
-rw-r--r--README.initramfs123
-rw-r--r--ReadMe.c642
-rw-r--r--TODO213
-rw-r--r--bitmap.c485
-rw-r--r--bitmap.h291
-rw-r--r--config.c1206
-rw-r--r--crc32.c360
-rw-r--r--crc32.h441
-rw-r--r--crc32c.c104
-rw-r--r--debian/FAQ581
-rw-r--r--debian/NEWS107
-rw-r--r--debian/README.Debian148
-rw-r--r--debian/README.checkarray33
-rw-r--r--debian/README.recipes149
-rw-r--r--debian/TODO29
-rwxr-xr-xdebian/bugscript219
-rw-r--r--debian/changelog1889
-rw-r--r--debian/checkarray219
-rw-r--r--debian/compat1
-rw-r--r--debian/control32
-rw-r--r--debian/copyright21
-rw-r--r--debian/initramfs/hook110
-rw-r--r--debian/initramfs/script.local-block44
-rw-r--r--debian/initramfs/script.local-bottom3
-rw-r--r--debian/mdadm-waitidle56
-rw-r--r--debian/mdadm.config43
-rw-r--r--debian/mdadm.cron.d12
-rw-r--r--debian/mdadm.cron.daily18
-rw-r--r--debian/mdadm.doc-base.faq9
-rw-r--r--debian/mdadm.doc-base.recipes9
-rw-r--r--debian/mdadm.docs7
-rw-r--r--debian/mdadm.init100
-rw-r--r--debian/mdadm.logcheck.ignore.server23
-rw-r--r--debian/mdadm.logcheck.violations3
-rw-r--r--debian/mdadm.maintscript1
-rw-r--r--debian/mdadm.modules8
-rw-r--r--debian/mdadm.postinst107
-rw-r--r--debian/mdadm.postrm25
-rw-r--r--debian/mdadm.templates38
-rw-r--r--debian/mkconf97
-rw-r--r--debian/patches/debian-conffile-location.diff115
-rw-r--r--debian/patches/debian-no-Werror.diff24
-rw-r--r--debian/patches/mdmonitor-service-simplify.diff20
-rw-r--r--debian/patches/readlink-path.patch15
-rw-r--r--debian/patches/series5
-rw-r--r--debian/patches/sha1-includes.diff40
-rw-r--r--debian/po/POTFILES.in1
-rw-r--r--debian/po/ca.po184
-rw-r--r--debian/po/cs.po228
-rw-r--r--debian/po/da.po175
-rw-r--r--debian/po/de.po249
-rw-r--r--debian/po/es.po218
-rw-r--r--debian/po/eu.po176
-rw-r--r--debian/po/fi.po173
-rw-r--r--debian/po/fr.po186
-rw-r--r--debian/po/gl.po177
-rw-r--r--debian/po/it.po177
-rw-r--r--debian/po/ja.po233
-rw-r--r--debian/po/nl.po188
-rw-r--r--debian/po/pt.po179
-rw-r--r--debian/po/pt_BR.po304
-rw-r--r--debian/po/ru.po189
-rw-r--r--debian/po/sk.po176
-rw-r--r--debian/po/sv.po186
-rw-r--r--debian/po/templates.pot78
-rw-r--r--debian/po/vi.po179
-rw-r--r--debian/presubj32
-rwxr-xr-xdebian/rules103
-rw-r--r--debian/source/format1
-rw-r--r--debian/watch2
-rw-r--r--dlink.c74
-rw-r--r--dlink.h25
-rw-r--r--external-reshape-design.txt280
-rwxr-xr-xinventory255
-rw-r--r--kernel-patch-2.6.1835
-rw-r--r--kernel-patch-2.6.18.635
-rw-r--r--kernel-patch-2.6.1934
-rw-r--r--kernel-patch-2.6.25199
-rw-r--r--kernel-patch-2.6.2736
-rw-r--r--lib.c475
-rwxr-xr-xmakedist96
-rw-r--r--managemon.c926
-rw-r--r--mapfile.c508
-rw-r--r--maps.c150
-rw-r--r--md.41145
-rw-r--r--md5.h136
-rw-r--r--md_p.h269
-rw-r--r--md_u.h123
-rw-r--r--mdadm.8.in3258
-rw-r--r--mdadm.c1936
-rw-r--r--mdadm.conf-example65
-rw-r--r--mdadm.conf.5641
-rwxr-xr-xmdadm.h1691
-rw-r--r--mdadm.spec45
-rw-r--r--mdassemble.865
-rw-r--r--mdassemble.c80
-rw-r--r--mdmon-design.txt146
-rw-r--r--mdmon.8257
-rw-r--r--mdmon.c602
-rw-r--r--mdmon.h110
-rw-r--r--mdopen.c468
-rw-r--r--mdstat.c414
-rw-r--r--misc/mdcheck159
-rw-r--r--misc/syslog-events27
-rw-r--r--mkinitramfs55
-rw-r--r--monitor.c712
-rw-r--r--msg.c475
-rw-r--r--msg.h37
-rw-r--r--part.h79
-rw-r--r--platform-intel.c741
-rw-r--r--platform-intel.h247
-rw-r--r--policy.c911
-rw-r--r--probe_roms.c317
-rw-r--r--probe_roms.h24
-rw-r--r--pwgr.c17
-rw-r--r--raid5extend.c80
-rw-r--r--raid6check.896
-rw-r--r--raid6check.c713
-rw-r--r--restripe.c1008
-rw-r--r--sg_io.c42
-rw-r--r--sha1.c415
-rw-r--r--sha1.h136
-rw-r--r--super-ddf.c5273
-rw-r--r--super-gpt.c216
-rw-r--r--super-intel.c10765
-rw-r--r--super-mbr.c204
-rw-r--r--super0.c1332
-rw-r--r--super1.c2656
-rw-r--r--swap_super.c81
-rw-r--r--sysfs.c931
-rw-r--r--systemd/SUSE-mdadm_env.sh45
-rw-r--r--systemd/mdadm-grow-continue@.service17
-rw-r--r--systemd/mdadm-last-resort@.service8
-rw-r--r--systemd/mdadm-last-resort@.timer7
-rw-r--r--systemd/mdadm.shutdown4
-rw-r--r--systemd/mdmon@.service28
-rw-r--r--systemd/mdmonitor.service13
-rwxr-xr-xtest440
-rw-r--r--tests/00linear25
-rw-r--r--tests/00multipath29
-rw-r--r--tests/00names13
-rw-r--r--tests/00raid043
-rw-r--r--tests/00raid134
-rw-r--r--tests/00raid1018
-rw-r--r--tests/00raid416
-rw-r--r--tests/00raid533
-rw-r--r--tests/00raid616
-rw-r--r--tests/01r1fail29
-rw-r--r--tests/01r5fail27
-rw-r--r--tests/01r5integ33
-rw-r--r--tests/01raid6integ57
-rw-r--r--tests/01replace52
-rw-r--r--tests/02lineargrow23
-rw-r--r--tests/02r1add40
-rw-r--r--tests/02r1grow36
-rw-r--r--tests/02r5grow36
-rw-r--r--tests/02r6grow36
-rw-r--r--tests/03assem-incr17
-rw-r--r--tests/03r0assem137
-rw-r--r--tests/03r5assem109
-rw-r--r--tests/03r5assem-failed12
-rw-r--r--tests/03r5assemV1128
-rw-r--r--tests/04r0update20
-rw-r--r--tests/04r1update15
-rw-r--r--tests/04r5swap18
-rw-r--r--tests/04update-metadata48
-rw-r--r--tests/04update-uuid82
-rw-r--r--tests/05r1-add-internalbitmap20
-rw-r--r--tests/05r1-add-internalbitmap-v1a20
-rw-r--r--tests/05r1-add-internalbitmap-v1b20
-rw-r--r--tests/05r1-add-internalbitmap-v1c20
-rw-r--r--tests/05r1-bitmapfile49
-rw-r--r--tests/05r1-grow-external33
-rw-r--r--tests/05r1-grow-internal31
-rw-r--r--tests/05r1-grow-internal-131
-rw-r--r--tests/05r1-internalbitmap47
-rw-r--r--tests/05r1-internalbitmap-v1a48
-rw-r--r--tests/05r1-internalbitmap-v1b49
-rw-r--r--tests/05r1-internalbitmap-v1c48
-rw-r--r--tests/05r1-n3-bitmapfile53
-rw-r--r--tests/05r1-re-add39
-rw-r--r--tests/05r1-re-add-nosuper38
-rw-r--r--tests/05r1-remove-internalbitmap18
-rw-r--r--tests/05r1-remove-internalbitmap-v1a18
-rw-r--r--tests/05r1-remove-internalbitmap-v1b18
-rw-r--r--tests/05r1-remove-internalbitmap-v1c18
-rw-r--r--tests/05r5-bitmapfile49
-rw-r--r--tests/05r5-internalbitmap47
-rw-r--r--tests/05r6-bitmapfile49
-rw-r--r--tests/05r6tor027
-rw-r--r--tests/06name12
-rw-r--r--tests/06sysfs11
-rw-r--r--tests/06wrmostly13
-rw-r--r--tests/07autoassemble24
-rw-r--r--tests/07autodetect34
-rw-r--r--tests/07changelevelintr61
-rw-r--r--tests/07changelevels114
-rw-r--r--tests/07layouts91
-rw-r--r--tests/07reshape5intr41
-rw-r--r--tests/07revert-grow52
-rw-r--r--tests/07revert-inplace44
-rw-r--r--tests/07revert-shrink56
-rw-r--r--tests/07testreshape545
-rw-r--r--tests/09imsm-assemble73
-rw-r--r--tests/09imsm-create-fail-rebuild78
-rw-r--r--tests/09imsm-overlap30
-rw-r--r--tests/10ddf-assemble-missing61
-rw-r--r--tests/10ddf-create89
-rw-r--r--tests/10ddf-create-fail-rebuild77
-rw-r--r--tests/10ddf-fail-create-race66
-rw-r--r--tests/10ddf-fail-readd55
-rw-r--r--tests/10ddf-fail-readd-readonly71
-rw-r--r--tests/10ddf-fail-spare86
-rw-r--r--tests/10ddf-fail-stop-readd66
-rw-r--r--tests/10ddf-fail-twice59
-rw-r--r--tests/10ddf-fail-two-spares86
-rw-r--r--tests/10ddf-geometry82
-rw-r--r--tests/10ddf-incremental-wrong-order131
-rw-r--r--tests/10ddf-sudden-degraded18
-rw-r--r--tests/11spare-migration454
-rw-r--r--tests/12imsm-r0_2d-grow-r0_3d20
-rw-r--r--tests/12imsm-r0_2d-grow-r0_4d20
-rw-r--r--tests/12imsm-r0_2d-grow-r0_5d20
-rw-r--r--tests/12imsm-r0_3d-grow-r0_4d20
-rw-r--r--tests/12imsm-r5_3d-grow-r5_4d20
-rw-r--r--tests/12imsm-r5_3d-grow-r5_5d20
-rw-r--r--tests/13imsm-r0_r0_2d-grow-r0_r0_4d29
-rw-r--r--tests/13imsm-r0_r0_2d-grow-r0_r0_5d29
-rw-r--r--tests/13imsm-r0_r0_3d-grow-r0_r0_4d29
-rw-r--r--tests/13imsm-r0_r5_3d-grow-r0_r5_4d29
-rw-r--r--tests/13imsm-r0_r5_3d-grow-r0_r5_5d29
-rw-r--r--tests/13imsm-r5_r0_3d-grow-r5_r0_4d29
-rw-r--r--tests/13imsm-r5_r0_3d-grow-r5_r0_5d29
-rw-r--r--tests/14imsm-r0_3d-r5_3d-migrate-r5_4d-r5_4d29
-rw-r--r--tests/14imsm-r0_3d_no_spares-migrate-r5_3d21
-rw-r--r--tests/14imsm-r0_r0_2d-takeover-r10_4d30
-rw-r--r--tests/14imsm-r10_4d-grow-r10_5d20
-rw-r--r--tests/14imsm-r10_r5_4d-takeover-r0_2d30
-rw-r--r--tests/14imsm-r1_2d-grow-r1_3d20
-rw-r--r--tests/14imsm-r1_2d-takeover-r0_2d22
-rw-r--r--tests/14imsm-r5_3d-grow-r5_5d-no-spares20
-rw-r--r--tests/14imsm-r5_3d-migrate-r4_3d21
-rw-r--r--tests/15imsm-r0_3d_64k-migrate-r0_3d_256k21
-rw-r--r--tests/15imsm-r5_3d_4k-migrate-r5_3d_256k21
-rw-r--r--tests/15imsm-r5_3d_64k-migrate-r5_3d_256k21
-rw-r--r--tests/15imsm-r5_6d_4k-migrate-r5_6d_256k21
-rw-r--r--tests/15imsm-r5_r0_3d_64k-migrate-r5_r0_3d_256k34
-rw-r--r--tests/16imsm-r0_3d-migrate-r5_4d21
-rw-r--r--tests/16imsm-r0_5d-migrate-r5_6d21
-rw-r--r--tests/16imsm-r5_3d-migrate-r0_3d21
-rw-r--r--tests/16imsm-r5_5d-migrate-r0_5d21
-rw-r--r--tests/18imsm-1d-takeover-r0_1d22
-rw-r--r--tests/18imsm-1d-takeover-r1_2d20
-rw-r--r--tests/18imsm-r0_2d-takeover-r10_4d22
-rw-r--r--tests/18imsm-r10_4d-takeover-r0_2d22
-rw-r--r--tests/18imsm-r1_2d-takeover-r0_1d22
-rw-r--r--tests/19raid6auto-repair49
-rw-r--r--tests/19raid6check27
-rw-r--r--tests/19raid6repair56
-rw-r--r--tests/19repair-does-not-destroy28
-rw-r--r--tests/20raid5journal64
-rw-r--r--tests/ToTest44
-rw-r--r--tests/check35
-rw-r--r--tests/env-ddf-template113
-rw-r--r--tests/env-imsm-template74
-rw-r--r--tests/imsm-grow-template106
-rw-r--r--tests/testdev13
-rw-r--r--tests/utils191
-rw-r--r--udev-md-raid-arrays.rules41
-rw-r--r--udev-md-raid-assembly.rules35
-rw-r--r--util.c2205
-rw-r--r--xmalloc.c84
312 files changed, 79862 insertions, 0 deletions
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..217fe76
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,18 @@
+/*.o
+/*.man
+/*-stamp
+/mdadm
+/mdadm.8
+/mdadm.udeb
+/mdassemble
+/mdmon
+/swap_super
+/test_stripe
+/TAGS
+/mdadm.O2
+/mdadm.Os
+/mdadm.static
+/mdassemble.auto
+/mdassemble.static
+/mdmon.O2
+/raid6check
diff --git a/ANNOUNCE-3.0 b/ANNOUNCE-3.0
new file mode 100644
index 0000000..f2d4f84
--- /dev/null
+++ b/ANNOUNCE-3.0
@@ -0,0 +1,98 @@
+Subject: ANNOUNCE: mdadm 3.0 - A tool for managing Soft RAID under Linux
+
+I am pleased to (finally) announce the availability of
+ mdadm version 3.0
+
+It is available at the usual places:
+ countrycode=xx.
+ http://www.${countrycode}kernel.org/pub/linux/utils/raid/mdadm/
+and via git at
+ git://neil.brown.name/mdadm
+ http://neil.brown.name/git?p=mdadm
+
+
+This is a major new version and as such should be treated with some
+caution. However it has seen substantial testing and is considered
+to be ready for wide use.
+
+
+The significant change which justifies the new major version number is
+that mdadm can now handle metadata updates entirely in userspace.
+This allows mdadm to support metadata formats that the kernel knows
+nothing about.
+
+Currently two such metadata formats are supported:
+ - DDF - The SNIA standard format
+ - Intel Matrix - The metadata used by recent Intel ICH controllers.
+
+Also the approach to device names has changed significantly.
+
+If udev is installed on the system, mdadm will not create any devices
+in /dev. Rather it allows udev to manage those devices. For this to work
+as expected, the included udev rules file should be installed.
+
+If udev is not installed, mdadm will still create devices and symlinks
+as required, and will also remove them when the array is stopped.
+
+mdadm now requires all devices which do not have a standard name (mdX
+or md_dX) to live in the directory /dev/md/. Names in this directory
+will always be created as symlinks back to the standard name in /dev.
+
+The man pages contain some information about the new externally managed
+metadata. However see below for a more condensed overview.
+
+Externally managed metadata introduces the concept of a 'container'.
+A container is a collection of (normally) physical devices which have
+a common set of metadata. A container is assembled as an md array, but
+is left 'inactive'.
+
+A container can contain one or more data arrays. These are composed from
+slices (partitions?) of various devices in the container.
+
+For example, a 5 device DDF set can contain a RAID1 using the first
+half of two devices, a RAID0 using the first half of the remaining 3 devices,
+and a RAID5 over the second half of all 5 devices.
+
+A container can be created with
+
+ mdadm --create /dev/md0 -e ddf -n5 /dev/sd[abcde]
+
+or "-e imsm" to use the Intel Matrix Storage Manager.
+
+An array can be created within a container either by giving the
+container name and the only member:
+
+ mdadm -C /dev/md1 --level raid1 -n 2 /dev/md0
+
+or by listing the component devices
+
+ mdadm -C /dev/md2 --level raid0 -n 3 /dev/sd[cde]
+
+To assemble a container, it is easiest just to pass each device in turn to
+mdadm -I
+
+ for i in /dev/sd[abcde]
+ do mdadm -I $i
+ done
+
+This will assemble the container and the components.
+
+Alternately the container can be assembled explicitly
+
+ mdadm -A /dev/md0 /dev/sd[abcde]
+
+Then the components can all be assembled with
+
+ mdadm -I /dev/md0
+
+For each container, mdadm will start a program called "mdmon" which will
+monitor the array and effect any metadata updates needed. The array is
+initially assembled readonly. It is up to "mdmon" to mark the metadata
+as 'dirty' and switch the array to 'read-write'.
+
+The version 0.90 and 1.x metadata formats supported by previous
+versions for mdadm are still supported and the kernel still performs
+the same updates it used to. The new 'mdmon' approach is only used for
+newly introduced metadata types.
+
+NeilBrown 2nd June 2009
diff --git a/ANNOUNCE-3.0.1 b/ANNOUNCE-3.0.1
new file mode 100644
index 0000000..91b4428
--- /dev/null
+++ b/ANNOUNCE-3.0.1
@@ -0,0 +1,22 @@
+Subject: ANNOUNCE: mdadm 3.0.1 - A tool for managing Soft RAID under Linux
+
+I am pleased to announce the availability of
+ mdadm version 3.0.1
+
+It is available at the usual places:
+ countrycode=xx.
+ http://www.${countrycode}kernel.org/pub/linux/utils/raid/mdadm/
+and via git at
+ git://neil.brown.name/mdadm
+ http://neil.brown.name/git?p=mdadm
+
+
+This contains only minor bug fixes over 3.0. If you are using
+3.0, you could consider upgrading.
+
+The brief change log is:
+ - Fix various segfaults
+ - Fixes for --examine with containers
+ - Lots of other little fixes.
+
+NeilBrown 25th September 2009
diff --git a/ANNOUNCE-3.0.2 b/ANNOUNCE-3.0.2
new file mode 100644
index 0000000..93643d1
--- /dev/null
+++ b/ANNOUNCE-3.0.2
@@ -0,0 +1,21 @@
+Subject: ANNOUNCE: mdadm 3.0.2 - A tool for managing Soft RAID under Linux
+
+I am pleased to announce the availability of
+ mdadm version 3.0.2
+
+It is available at the usual places:
+ countrycode=xx.
+ http://www.${countrycode}kernel.org/pub/linux/utils/raid/mdadm/
+and via git at
+ git://neil.brown.name/mdadm
+ http://neil.brown.name/git?p=mdadm
+
+
+This just contains one bugfix over 3.0.1 - I was obviously a bit hasty
+in releasing that one.
+
+The brief change log is:
+ - Fix crash when homehost is not set, as often happens in
+ early boot.
+
+NeilBrown 25th September 2009
diff --git a/ANNOUNCE-3.0.3 b/ANNOUNCE-3.0.3
new file mode 100644
index 0000000..d6117a1
--- /dev/null
+++ b/ANNOUNCE-3.0.3
@@ -0,0 +1,29 @@
+Subject: ANNOUNCE: mdadm 3.0.3 - A tool for managing Soft RAID under Linux
+
+I am pleased to announce the availability of
+ mdadm version 3.0.3
+
+It is available at the usual places:
+ countrycode=xx.
+ http://www.${countrycode}kernel.org/pub/linux/utils/raid/mdadm/
+and via git at
+ git://neil.brown.name/mdadm
+ http://neil.brown.name/git?p=mdadm
+
+
+This contains a collection of bug fixes and minor enhancements over
+3.0.1.
+
+The brief change log is:
+ - Improvements for creating arrays giving just a name, like 'foo',
+ rather than the full '/dev/md/foo'.
+ - Improvements for assembling member arrays of containers.
+ - Improvements to test suite
+ - Add option to change increment for RebuildNN messages reported
+ by "mdadm --monitor"
+ - Improvements to mdmon 'hand-over' from initrd to final root.
+ - Handle merging of devices that have left an IMSM array and are
+ being re-incorporated.
+ - Add missing space in "--detail --brief" output.
+
+NeilBrown 22nd October 2009
diff --git a/ANNOUNCE-3.1 b/ANNOUNCE-3.1
new file mode 100644
index 0000000..343b85d
--- /dev/null
+++ b/ANNOUNCE-3.1
@@ -0,0 +1,33 @@
+Subject: ANNOUNCE: mdadm 3.1 - A tool for managing Soft RAID under Linux
+
+Hot on the heels of 3.0.3 I am pleased to announce the availability of
+ mdadm version 3.1
+
+It is available at the usual places:
+ countrycode=xx.
+ http://www.${countrycode}kernel.org/pub/linux/utils/raid/mdadm/
+and via git at
+ git://neil.brown.name/mdadm
+ http://neil.brown.name/git?p=mdadm
+
+
+It contains significant feature enhancements over 3.0.x
+
+The brief change log is:
+ - Support --grow to change the layout of RAID4/5/6
+ - Support --grow to change the chunksize of raid 4/5/6
+ - Support --grow to change level from RAID1 -> RAID5 -> RAID6 and
+ back.
+ - Support --grow to reduce the number of devices in RAID4/5/6.
+ - Support restart of these grow options when assembling an array
+ which is partially grown.
+ - Assorted tests of this code, and of different RAID6 layouts.
+
+Note that a 2.6.31 or later kernel is needed to have access to these.
+Reducing devices in a RAID4/5/6 requires 2.6.32.
+Changing RAID5 to RAID1 requires 2.6.33.
+
+You should only upgrade if you need to use, or wish to test, these
+features.
+
+NeilBrown 22nd October 2009
diff --git a/ANNOUNCE-3.1.1 b/ANNOUNCE-3.1.1
new file mode 100644
index 0000000..9e480dc
--- /dev/null
+++ b/ANNOUNCE-3.1.1
@@ -0,0 +1,39 @@
+Subject: ANNOUNCE: mdadm 3.1.1 - A tool for managing Soft RAID under Linux
+
+I am pleased to announce the availability of
+ mdadm version 3.1.1
+
+It is available at the usual places:
+ countrycode=xx.
+ http://www.${countrycode}kernel.org/pub/linux/utils/raid/mdadm/
+and via git at
+ git://neil.brown.name/mdadm
+ http://neil.brown.name/git?p=mdadm
+
+This is a bugfix release over 3.1, which was withdrawn due to serious
+bugs. So it might be best to ignore 3.1 and say that this is a significant
+feature release over 3.0.x
+
+Significant changes are:
+ - RAID level conversion between RAID1, RAID5, and RAID6 are
+ possible where the kernel supports it (2.6.32 at least)
+ - online chunksize and layout changing for RAID5 and RAID6
+ where the kernel supports it.
+ - reduce the number of devices in a RAID4/5/6 array.
+
+ - The default metadata is now v1.1. This metadata is stored at the
+ start of the device so is safer in many ways but could interfere with
+ boot loaders. The old default (0.90) is still available and fully
+ supported.
+
+ - The default chunksize is now 512K rather than 64K. This seems more
+ appropriate for modern devices.
+
+ - The default bitmap chunksize for internal bitmaps is now at least
+ 64Meg as fine grained bitmaps tend to impact performance more for
+ little extra gain.
+
+This release is believed to be stable and you should feel free to
+upgrade to 3.1.1.
+
+NeilBrown 19th November 2009
diff --git a/ANNOUNCE-3.1.2 b/ANNOUNCE-3.1.2
new file mode 100644
index 0000000..321b8be
--- /dev/null
+++ b/ANNOUNCE-3.1.2
@@ -0,0 +1,46 @@
+Subject: ANNOUNCE: mdadm 3.1.2 - A tool for managing Soft RAID under Linux
+
+I am pleased to announce the availability of
+ mdadm version 3.1.2
+
+It is available at the usual places:
+ countrycode=xx.
+ http://www.${countrycode}kernel.org/pub/linux/utils/raid/mdadm/
+and via git at
+ git://neil.brown.name/mdadm
+ http://neil.brown.name/git?p=mdadm
+
+This is a bugfix/stability release over 3.1.1.
+
+Significant changes are:
+ - The default metadata has changed again (sorry about that).
+ It is now v1.2 and will hopefully stay that way. It turned
+ out there were boot-block issues with v1.1 which make it
+ unsuitable for a default, though in many cases it is still
+ suitable to use.
+ - Stopping a container is not permitted when members are still
+ active
+ - Add 'homehost' to the valid words for the "AUTO" config file
+ line. When followed by "-all", this causes mdadm to
+ auto-assemble any array belonging to this host, but not
+ auto-assemble anything else.
+ - Fix some bugs with "--grow --chunksize=" for changing chunksize.
+ - VAR_RUN can be easily changed at compile time just like ALT_RUN.
+ This gives distros more flexibility in how to manage the
+ pid and sock files that mdmon needs.
+ - Various mdmon fixes
+ - Always make bitmap 4K-aligned if at all possible.
+ - If mdadm.conf lists arrays which have inter-dependencies,
+ they previously had to be listed in the "right" order. Now
+ any order should work.
+ - Fix --force assembly of v1.x arrays which are in the process
+ of recovering.
+ - Add section on 'scrubbing' to 'md' man page.
+ - Various command-line-option parsing improvements.
+ - ... and lots of other bug fixes.
+
+
+This release is believed to be stable and you should feel free to
+upgrade to 3.1.2
+
+NeilBrown 10th March 2010
diff --git a/ANNOUNCE-3.1.3 b/ANNOUNCE-3.1.3
new file mode 100644
index 0000000..95b2b6c
--- /dev/null
+++ b/ANNOUNCE-3.1.3
@@ -0,0 +1,46 @@
+Subject: ANNOUNCE: mdadm 3.1.3 - A tool for managing Soft RAID under Linux
+
+I am pleased to announce the availability of
+ mdadm version 3.1.3
+
+It is available at the usual places:
+ countrycode=xx.
+ http://www.${countrycode}kernel.org/pub/linux/utils/raid/mdadm/
+and via git at
+ git://neil.brown.name/mdadm
+ http://neil.brown.name/git?p=mdadm
+
+This is a bugfix/stability release over 3.1.2
+
+Significant changes are:
+ - mapfile now lives in a fixed location which defaults to
+ /dev/.mdadm/map but can be changed at compile time. This
+ location is chosen as most distros provide it during early
+ boot and preserve it through. As long as /dev exists and is
+ writable, /dev/.mdadm will be created.
+ Other files for communication with mdmon live here too.
+ This fixes a bug reported by Debian and Gentoo users where
+ udev would spin in early-boot.
+ - IMSM and DDF metadata will not be recognised on partitions
+ as they should only be used on whole-disks.
+ - Various overflows caused by 2G drives have been addressed.
+ - A subarray of an IMSM container can now be killed with
+ --kill-subarray. Also subarrays can be renamed with
+ --update-subarray
+ - -If (or --incremental --fail) can be used from udev to
+ fail and remove from all arrays a device which has been
+ unplugged from the system. i.e. hot-unplug-support.
+ - "mdadm /dev/mdX --re-add missing" will look for any device
+ that looks like it should be a member of /dev/mdX but isn't
+ and will automatically --re-add it
+ - Now compile with -Wextra to get extra warnings.
+ - Lots of minor bug fixes, documentation improvements, etc.
+
+This release is believed to be stable and you should feel free to
+upgrade to 3.1.3
+
+It is expected that the next release will be 3.2 with a number of new
+features. 3.1.4 will only happen if important bugs show up before 3.2
+is stable.
+
+NeilBrown 6th August 2010
diff --git a/ANNOUNCE-3.1.4 b/ANNOUNCE-3.1.4
new file mode 100644
index 0000000..c157a36
--- /dev/null
+++ b/ANNOUNCE-3.1.4
@@ -0,0 +1,37 @@
+Subject: ANNOUNCE: mdadm 3.1.4 - A tool for managing Soft RAID under Linux
+
+I am pleased to announce the availability of
+ mdadm version 3.1.4
+
+It is available at the usual places:
+ countrycode=xx.
+ http://www.${countrycode}kernel.org/pub/linux/utils/raid/mdadm/
+and via git at
+ git://neil.brown.name/mdadm
+ http://neil.brown.name/git?p=mdadm
+
+This is a bugfix/stability release over 3.1.3.
+3.1.3 had a couple of embarrassing regressions and a couple of other
+issues surfaced which had easy fixes so I decided to make a 3.1.4
+release after all.
+
+Two fixes related to configs that aren't using udev:
+ - Don't remove md devices with 'standard' names on --stop
+ - Allow dev_open to work on read-only /dev
+And fixed regressions:
+ - Allow --incremental to add spares to an array
+ - Accept --no-degraded as a deprecated option rather than
+ throwing an error
+ - Return correct success status when --incremental assembling
+ a container which does not yet have enough devices.
+ - Don't link mdadm with pthreads, only mdmon needs it.
+ - Fix compiler warning due to bad use of snprintf
+ - Fix spare migration
+
+This release is believed to be stable and you should feel free to
+upgrade to 3.1.4
+
+It is expected that the next release will be 3.2 with a number of new
+features.
+
+NeilBrown 31st August 2010
diff --git a/ANNOUNCE-3.1.5 b/ANNOUNCE-3.1.5
new file mode 100644
index 0000000..baa1f92
--- /dev/null
+++ b/ANNOUNCE-3.1.5
@@ -0,0 +1,42 @@
+Subject: ANNOUNCE: mdadm 3.1.5 - A tool for managing Soft RAID under Linux
+
+I am pleased to announce the availability of
+ mdadm version 3.1.5
+
+It is available at the usual places:
+ countrycode=xx.
+ http://www.${countrycode}kernel.org/pub/linux/utils/raid/mdadm/
+and via git at
+ git://neil.brown.name/mdadm
+ http://neil.brown.name/git?p=mdadm
+
+This is a bugfix/stability release over 3.1.4. It contains all the
+important bugfixes found while working on 3.2 and 3.2.1. It will be
+the last 3.1.x release - 3.2.1 is expected to be released in a few days.
+
+Changes include:
+ - Fixes for v1.x metadata on big-endian machines.
+ - man page improvements
+ - Improve '--detail --export' when run on partitions of an md array.
+ - Fix regression with removing 'failed' or 'detached' devices.
+ - Fixes for "--assemble --force" in various unusual cases.
+ - Allow '-Y' to mean --export. This was documented but not implemented.
+ - Various fixes for handling 'ddf' metadata. This is now more reliable
+ but could benefit from more interoperability testing.
+ - Correctly list subarrays of a container in "--detail" output.
+ - Improve checks on whether the requested number of devices is supported
+ by the metadata - both for --create and --grow.
+ - Don't remove partitions from a device that is being included in an
+ array until we are fully committed to including it.
+ - Allow "--assemble --update=no-bitmap" so an array with a corrupt
+ bitmap can still be assembled.
+ - Don't allow --add to succeed if it looks like a "--re-add" is probably
+ wanted, but cannot succeed. This avoids inadvertently turning
+ devices into spares when an array is failed.
+
+This release is believed to be stable and you should feel free to
+upgrade to 3.1.5
+
+
+NeilBrown 23rd March 2011
+
diff --git a/ANNOUNCE-3.2 b/ANNOUNCE-3.2
new file mode 100644
index 0000000..9e282bc
--- /dev/null
+++ b/ANNOUNCE-3.2
@@ -0,0 +1,77 @@
+Subject: ANNOUNCE: mdadm 3.2 - A tool for managing Soft RAID under Linux (DEVEL ONLY)
+
+I am pleased to announce the availability of
+ mdadm version 3.2
+
+It is available at the usual places:
+ countrycode=xx.
+ http://www.${countrycode}kernel.org/pub/linux/utils/raid/mdadm/
+and via git at
+ git://neil.brown.name/mdadm devel-3.2
+ http://neil.brown.name/git?p=mdadm
+
+This is a "Developers only" release. Please don't consider using it
+or making it available to others without reading the following.
+
+
+By far the most significant change in this release relates to the
+management of reshaping arrays. This code has been substantially
+re-written so that it can work with 'externally managed metadata' -
+Intel's IMSM in particular. We now support level migration and
+OnLine Capacity Expansion on these arrays.
+
+However, while the code largely works it has not been tested
+exhaustively so there are likely to be problems. As the reshape code
+for native metadata arrays was changed as part of this rewrite these
+problems could also result in regressions for reshape of native
+metadata.
+
+It is partly to encourage greater testing that this release is being
+made. Any reports of problems - particularly reproducible recipes for
+triggering the problems - will be gratefully received.
+
+It is hoped that a "3.2.1" release will be available in early March
+which will be a bugfix release over this and can be considered
+suitable for general use.
+
+Other changes of note:
+
+ - Policy framework.
+ Various policy statements can be made in the mdadm.conf to guide
+ the behaviour of mdadm, particularly with regards to how new devices
+ are treated by "mdadm -I".
+ Depending on the 'action' associated with a device (identified by
+ its 'path') such new devices can be automatically re-added to an
+ existing array that they previously fell out of, or automatically
+ added as a spare if they appear to contain no data.
+
+ - mdadm now has a limited understanding of partition tables. This
+ allows the policy framework to make decisions about partitioned
+ devices as well.
+
+ - --incremental --remove can be told what --path the device was on,
+ and this info will be recorded so that another device appearing at
+ the same physical location can be preferentially added to the same
+ array (provided the spare-same-slot action policy applies to the
+ path).
+
+ - A new "--invalid-backup" flag is available in --assemble
+ mode. This can be used to re-assemble an array which was stopped
+ in the middle of a reshape, and for which the 'backup file' is no
+ longer available or is corrupted. The array may have some
+ corruption in it at the point where reshape was up to, but at least
+ the rest of the array will become available.
+
+
+ - Various internal restructuring - more is needed.
+
+
+Any feedback and bug reports are always welcomed at:
+ linux-raid@vger.kernel.org
+
+And please: don't use this in production - particularly not the
+--grow functionality.
+
+NeilBrown 1st February 2011
+
+
diff --git a/ANNOUNCE-3.2.1 b/ANNOUNCE-3.2.1
new file mode 100644
index 0000000..0e7826c
--- /dev/null
+++ b/ANNOUNCE-3.2.1
@@ -0,0 +1,75 @@
+
+
+I am pleased to announce the availability of
+ mdadm version 3.2.1
+
+It is available at the usual places:
+ countrycode=xx.
+ http://www.${countrycode}kernel.org/pub/linux/utils/raid/mdadm/
+and via git at
+ git://neil.brown.name/mdadm
+ http://neil.brown.name/git/mdadm
+
+Many of the changes in this release are of internal interest only,
+restructuring and refactoring code and so forth.
+
+Most of the bugs found and fixed during development for 3.2.1 have been
+back-ported for the recently-released 3.1.5 so this release primarily
+provides a few new features over 3.1.5.
+
+They include:
+ - policy framework
+ Policy can be expressed for moving spare devices between arrays, and
+ for how to handle hot-plugged devices. This policy can be different
+ for devices plugged in to different controllers etc.
+ This, for example, allows a configuration where when a device is plugged
+ in it is immediately included in an md array as a hot spare and
+ possibly starts recovery immediately if an array is degraded.
+
+ - some understanding of mbr and gpt partition tables
+ This is primarily to support the new hot-plug support. If a
+ device is plugged in and policy suggests it should have a partition table,
+ the partition table will be copied from a suitably similar device, and
+ then the partitions will hot-plug and can then be added to md arrays.
+
+ - "--incremental --remove" can remember where a device was removed from
+ so if a device gets plugged back in the same place, special policy applies
+ to it, allowing it to be included in an array even if a general hotplug
+ will not be included.
+
+ - enhanced reshape options, including growing a RAID0 by converting to RAID4,
+ restriping, and converting back. Also conversions between RAID0 and
+ RAID10 and between RAID1 and RAID10 are possible (with a suitably recent
+ kernel).
+
+ - spare migration for IMSM arrays.
+ Spare migration can now work across 'containers' using non-native metadata
+ and specifically Intel's IMSM arrays support spare migrations.
+
+ - OLCE and level migration for Intel IMSM arrays.
+ OnLine Capacity Expansion and level migration (e.g. RAID0 -> RAID5) is
+ supported for Intel Matrix Storage Manager arrays.
+ This support is currently 'experimental' for technical reasons. It can
+ be enabled with "export MDADM_EXPERIMENTAL=1"
+
+ - avoid including wayward devices
+ If you split a RAID1, mount the two halves as two separate degraded RAID1s,
+ and then later bring the two back together, it is possible that the md
+ metadata won't properly show that one must over-ride the other.
+ mdadm now does extra checking to detect this possibility and avoid
+ potentially corrupting data.
+
+ - remove any possible confusion between similar options.
+ e.g. --brief and --bitmap were mapped to 'b' and mdadm wouldn't
+ notice if one was used where the other was expected.
+
+ - allow K,M,G suffixes on chunk sizes
+
+
+While mdadm-3.2.1 is considered to be reasonably stable, you should
+only use it if you want to try out the new features, or if you
+generally like to be on the bleeding edge. If the new features are not
+important to you, then 3.1.5 is probably the appropriate version to be using
+until 3.2.2 comes out.
+
+NeilBrown 28th March 2011
diff --git a/ANNOUNCE-3.2.2 b/ANNOUNCE-3.2.2
new file mode 100644
index 0000000..b70d18b
--- /dev/null
+++ b/ANNOUNCE-3.2.2
@@ -0,0 +1,36 @@
+Subject: ANNOUNCE: mdadm 3.2.2 - A tool for managing Soft RAID under Linux
+
+I am pleased to announce the availability of
+ mdadm version 3.2.2
+
+It is available at the usual places:
+ countrycode=xx.
+ http://www.${countrycode}kernel.org/pub/linux/utils/raid/mdadm/
+and via git at
+ git://neil.brown.name/mdadm
+ http://neil.brown.name/git/mdadm
+
+This release is largely a stabilising release for the 3.2 series.
+Many of the changes just fix bugs introduced in 3.2 or 3.2.1.
+
+There are some new features. They are:
+ - reshaping IMSM (Intel metadata) arrays is no longer 'experimental',
+ it should work properly and be largely compatible with IMSM drivers in
+ other platforms.
+ - --assume-clean can be used with --grow --size to avoid resyncing the
+ new part of the array. This is only supported with very new kernels.
+ - RAID0 arrays can have chunksize which is not a power of 2. This has been
+ supported in the kernel for a while but is only now supported by
+ mdadm.
+
+ - A new tool 'raid6check' is available which can check a RAID6 array,
+ or part of it, and report which device is most inconsistent with the
+ others if any stripe is inconsistent. This is still under development
+ and does not have a man page yet. If anyone tries it out and has any
+ questions or experience to report, they would be most welcome on
+ linux-raid@vger.kernel.org.
+
+Future releases in the 3.2 series will only be made if bugfixes are needed.
+The next release to add features is expected to be 3.3.
+
+NeilBrown 17th June 2011
diff --git a/ANNOUNCE-3.2.3 b/ANNOUNCE-3.2.3
new file mode 100644
index 0000000..8a8dba4
--- /dev/null
+++ b/ANNOUNCE-3.2.3
@@ -0,0 +1,24 @@
+Subject: ANNOUNCE: mdadm 3.2.3 - A tool for managing Soft RAID under Linux
+
+I am pleased to announce the availability of
+ mdadm version 3.2.3
+
+It is available at the usual places:
+ countrycode=xx.
+ http://www.${countrycode}kernel.org/pub/linux/utils/raid/mdadm/
+and via git at
+ git://neil.brown.name/mdadm
+ http://neil.brown.name/git/mdadm
+
+This release is largely a bugfix release for the 3.2 series with many
+minor fixes with little or no impact.
+
+The largest single area of change is support for reshape of Intel
+IMSM arrays (OnLine Capacity Expansion and Level Migration).
+Among other fixes, this now has a better chance of surviving if a
+device fails during reshape.
+
+Upgrading is recommended - particularly if you use mdadm for IMSM
+arrays - but not essential.
+
+NeilBrown 23rd December 2011
diff --git a/ANNOUNCE-3.2.4 b/ANNOUNCE-3.2.4
new file mode 100644
index 0000000..e321678
--- /dev/null
+++ b/ANNOUNCE-3.2.4
@@ -0,0 +1,144 @@
+Subject: ANNOUNCE: mdadm 3.2.4 - A tool for managing Soft RAID under Linux
+
+I am pleased to announce the availability of
+ mdadm version 3.2.4
+
+It is available at the usual places, now including github:
+ countrycode=xx.
+ http://www.${countrycode}kernel.org/pub/linux/utils/raid/mdadm/
+and via git at
+ git://github.com/neilbrown/mdadm
+ git://neil.brown.name/mdadm
+ http://neil.brown.name/git/mdadm
+
+This release is largely a bugfix release for the 3.2 series with many
+minor fixes with little or no impact.
+
+"--oneline" log of changes is below. Some notable ones are:
+
+ - --offroot argument to improve interactions between mdmon and initrd
+ - --prefer argument to select which /dev names to display in some
+ circumstances.
+ - relax restrictions on when "--add" will be allowed
+ - Fix bug with adding write-intent-bitmap to active array
+ - Now defaults to "/run/mdadm" for storing run-time files.
+
+Upgrading is encouraged.
+
+The next mdadm release is expected to be 3.3 with a number of new
+features.
+
+NeilBrown 9th May 2012
+
+77b3ac8 monitor: make return from read_and_act more symbolic.
+68226a8 monitor: ensure we retry soon when 'remove' fails.
+8453f8d fix: Monitor sometimes crashes
+90fa1a2 Work around gcc-4.7's strict aliasing checks
+0c4304c fix: container creation with --incremental used.
+5d1c7cd FIX: External metadata sometimes is not updated
+3c20f98 FIX: mdmon check in reshape_container() can cause a problem
+59ab9f5 FIX: Typo error in fprint command
+9587c37 imsm: load_super_imsm_all function refactoring
+ec50f7b imsm: load_imsm_super_all supports loading metadata from the device list
+ca9de18 imsm: validate the number of imsm volumes per controller
+30602f5 imsm: display fd in error trace when when store_imsm_mpb failes
+eb155f6 mdmon: Use getopt_long() to parse command line options
+08ca2ad Add --offroot argument to mdadm
+da82751 Add --offroot argument to mdmon
+a0963a8 Spawn mdmon with --offroot if mdadm was launched with --offroot
+f878b24 imsm: fix, the second array need to have the whole available space on devices
+d597705 getinfo_super1: Use MaxSector in place of sb->size
+6ef8905 super1: make aread/awrite always use an aligned buffer.
+de5a472 Remove avail_disks arg from 'enough'.
+da8fe5a Assemble: fix --force assemble during reshape.
+b10c663 config: fix handing of 'homehost' in AUTO line.
+92d49ec FIX: NULL pointer to strdup() can be passed
+d2bde6d imsm: FIX: No new missing disks are allowed during general migration
+111e9fd FIX: Array is not run when expansion disks are added
+bf5cf7c imsm: FIX: imsm_get_allowed_degradation() doesn't count degradation for raid1
+50927b1 Fix: Sometimes mdmon throws core dump during reshape
+78340e2 Flush mdmon before next reshape step during container operation
+e174219 imsm: FIX: Chunk size migration problem
+f93346e FIX: use md position to reshape restart
+6a75c8c imsm: FIX: use md position to reshape restart
+51d83f5 imsm: FIX: Clear migration record when migration switches to next volume.
+e1dd332 FIX: restart reshape when reshape process is stopped just between 2 reshapes
+1ca90aa FIX: Do not try to (continue) reshape using inactive array
+9f1b0f0 config: conf_match should ignore devname when not set.
+d669228 Use posix_memalign() for memory used to write bitmaps
+178950e FIX: Changes in '0' case for reshape position verification
+9200d41 avoid double-free upon "old buggy kernel" sysfs_read failure
+4011421 Print error message if failing to write super for 1.x metadata
+0011874 Use MDMON_DIR for pid files created in Monitor.c
+56d1885 Assemble: don't use O_EXCL until we have checked device content.
+b720636 Assemble: support assembling of a RAID0 being reshaped.
+c69ffac Manage: allow --re-add to failed array.
+52f07f5 Reset bad flag on map update
+911cead super1: support superblocks up to 4K.
+ad6db3c Create: reduce the verbosity of 'default_layout'.
+b2bfdfa super1.c don't keep recalculating bitmap pointer
+4122675 Define and use SUPER1_SIZE for allocations
+1afa930 init_super1() memset full buffer allocated for superblock
+2de0b8a match_metadata_desc1(): Use calloc instead of malloc+memset
+3c0bcd4 Use 4K buffer alignment for superblock allocations
+308340a Use struct align_fd to cache fd's block size for aligned reads/writes
+65ed615 match_metadata_desc0(): Use calloc instead of malloc+memset
+de89706 Generalize ROUND_UP() macro and introduce matching ROUND_UP_PTR()
+0a2f189 super1.c: use ROUND_UP/ROUND_UP_PTR
+654a381 super-intel.c: Use ROUND_UP() instead of manually coding it
+42d5dfd __write_init_super_ddf(): Use posix_memalign() instead of static aligned buffer
+d4633e0 Examine: fix array size calculation for RAID10.
+e62b778 Assemble: improve verbose logging when including old devices.
+0073a6e Remove possible crash during RAID6 -> RAID5 reshape.
+69fe207 Incremental: fix adding devices with --incremental
+bcbb311 Manage: replace 'return 1' with 'goto abort'.
+9f58469 Manage: freeze recovery while adding multiple devices.
+ae6c05a Create: round off size for RAID1 arrays.
+5ca3a90 Grow: print useful error when converting RAID1->RAID5 will fail.
+c07d640 Fix tests/05r1-re-add-nosupper
+2d762ad Fix the new ROUND_UP macro.
+fd324b0 sysfs: fixed sysfs_freeze_array array to work properly with Manage_subdevs.
+5551b11 imsm: avoid overflows for disks over 1TB
+97f81ee clear hi bits if not used after loading metadata from disk
+e03640b simplify calculating array_blocks
+29cd082 show 2TB volumes/disks support in --detail-platform
+2cc699a check volume size in validate_geometry_imsm_orom
+9126b9a check that no disk over 2TB is used to create container when no support
+027c374 imsm: set 2tb disk attribute for spare
+3556c2f Fix typo: wan -> want
+15632a9 parse_size: distinguish between 0 and error.
+fbdef49 Bitmap_offset is a signed number
+508a7f1 super1: leave more space in front of data by default.
+40110b9 Fix two typos in fprintf messages
+342460c mdadm man page: fix typo
+0e7f69a imsm: display maximum volumes per controller and array
+36fd8cc imsm: FIX: Update function imsm_num_data_members() for Raid1/10
+7abc987 imsm: FIX: Add volume size expand support to imsm_analyze_change()
+f3871fd imsm: Add new metadata update for volume size expansion
+54397ed imsm: Execute size change for external metatdata
+016e00f FIX: Support metadata changes rollback
+fbf3d20 imsm: FIX: Support metadata changes rollback
+44f6f18 FIX: Extend size of raid0 array
+7e7e9a4 FIX: Respect metadata size limitations
+65a9798 FIX: Detect error and rollback metadata
+13bcac9 imsm: Add function imsm_get_free_size()
+b130333 imsm: Support setting max size for size change operation
+c41e00b imsm: FIX: Component size alignment check
+58d26a2 FIX: Size change is possible as standalone change only
+4aecb54 FIX: Assembled second array is in read only state during reshape
+ae2416e FIX: resolve make everything compilation error
+480f356 Raid limit of 1024 when scanning for devices.
+c2ecf5f Add --prefer option for --detail and --monitor
+0a99975 Relax restrictions on when --add is permitted.
+7ce0570 imsm: fix: rebuild does not continue after reboot
+b51702b fix: correct extending size of raid0 array
+34a1395 Fix sign extension of bitmap_offset in super1.c
+012a864 Introduce sysfs_set_num_signed() and use it to set bitmap/offset
+5d7b407 imsm: fix: thunderdome may drop 2tb attribute
+5ffdc2d Update test for "is udev active".
+96fd06e Adjust to new standard of /run
+974e039 test: don't worry too much about array size.
+b0a658f Grow: failing the set the per-device size is not an error.
+36614e9 super-intel.c: Don't try to close negative fd
+562aa10 super-intel.c: Fix resource leak from opendir()
+
diff --git a/ANNOUNCE-3.2.5 b/ANNOUNCE-3.2.5
new file mode 100644
index 0000000..396da12
--- /dev/null
+++ b/ANNOUNCE-3.2.5
@@ -0,0 +1,31 @@
+Subject: ANNOUNCE: mdadm 3.2.5 - A tool for managing Soft RAID under Linux
+
+I am somewhat disappointed to have to announce the availability of
+ mdadm version 3.2.5
+
+It is available at the usual places, now including github:
+ countrycode=xx.
+ http://www.${countrycode}kernel.org/pub/linux/utils/raid/mdadm/
+and via git at
+ git://github.com/neilbrown/mdadm
+ git://neil.brown.name/mdadm
+ http://neil.brown.name/git/mdadm
+
+This release primarily fixes a serious regression in 3.2.4.
+This regression does *not* cause any risk to data. It simply
+means that adding a device with "--add" would sometimes fail
+when it should not.
+
+The fix also includes a couple of minor fixes such as making
+the "--layout=preserve" option to "--grow" work again.
+
+A reminder that the default location for runtime files is now
+"/run/mdadm". If you compile this for a distro that does not
+have "/run", you will need to compile with an alternate setting for
+MAP_DIR. e.g.
+ make MAP_DIR=/var/run/mdadm
+or
+ make MAP_DIR=/dev/.mdadm
+
+NeilBrown 18th May 2012
+
diff --git a/ANNOUNCE-3.2.6 b/ANNOUNCE-3.2.6
new file mode 100644
index 0000000..f5cfd49
--- /dev/null
+++ b/ANNOUNCE-3.2.6
@@ -0,0 +1,57 @@
+Subject: ANNOUNCE: mdadm 3.2.6 - A tool for managing Soft RAID under Linux
+
+I am pleased to announce the availability of
+ mdadm version 3.2.6
+
+It is available at the usual places, now including github:
+ countrycode=xx.
+ http://www.${countrycode}kernel.org/pub/linux/utils/raid/mdadm/
+and via git at
+ git://github.com/neilbrown/mdadm
+ git://neil.brown.name/mdadm
+ http://neil.brown.name/git/mdadm
+
+This is a stability release which adds a number of bugfixes to 3.2.5.
+There are no real stand-out fixes, just lots of little bits and pieces.
+
+Below is the "git log --oneline --reverse" list of changes since
+3.2.5.
+
+NeilBrown 25th October 2012
+
+b7e05d2 udev-rules: prevent systemd from mount devices before they are ready.
+0d478e2 mdadm: Fix Segmentation fault.
+42f0ca1 imsm: fix: correct checking volume's degradation
+fcf2195 Monitor: fix inconsistencies in values for ->percent
+5f862fb Monitor: Report NewArray when an array the disappeared, reappears.
+6f51b1c Monitor: fix reporting for Fail vs FailSpare etc.
+68ad53b mdmon: fix arg parsing.
+517f135 Assemble: don't leak memory with fdlist.
+090900c udev-rules: prevent systemd from mount devices before they are ready.
+446e000 sha1.h: remove ansidecl.h header inclusion
+ec894f5 Manage: zero metadata before adding to 'external' array.
+3a84db5 ddf: allow a non-spare to be used to recovery a missing device.
+c5d61ca ddf: hack to fix container recognition.
+23084aa mdmon: fix arg processing for -a
+c4e96a3 mdmon: allow --takeover when original was started with --offroot
+80841df find_free_devnum: avoid auto-using names in /etc/mdadm.conf
+c5c56d6 mapfile: fix mapfile rebuild for containers
+aec89f6 fix segfaults in Detail()
+2117ad1 Fix 'enough' function for RAID10.
+0bc300d Use --offroot flag when assembling md arrays via --incrmental
+ac78f24 Grow: make warning about old metadata more explicit.
+14026ab Replace sha1.h with slightly older version.
+6f6809f Add zlib license to crc32.c
+5267ba0 Handles spaces in array names better.
+c51f288 imsm: allow --assume-clean to work.
+acf7076 Grow: allow --grow --continue to work for native metadata.
+335d2a6 Grow: fix a couple of typos with --assume-clean usage
+9ff1427 Fix open_container
+3713633 mdadm: super0: do not override uuid with homehost
+31bff58 Trivial bugfix and spelling fixes.
+e1e539f Detail: don't report a faulty device as 'spare' or 'rebuilding'.
+22a6461 super0: allow creation of array on 2TB+ devices.
+a5d47a2 Create new md devices consistently
+eb48676 Monitor: don't complain about non-monitorable arrays in mdadm.conf
+ecdf2d7 Query: don't be confused by partition tables.
+f7b75c1 Query: allow member of non-0.90 arrays to be better reported.
diff --git a/ANNOUNCE-3.3 b/ANNOUNCE-3.3
new file mode 100644
index 0000000..f770aa1
--- /dev/null
+++ b/ANNOUNCE-3.3
@@ -0,0 +1,63 @@
+Subject: ANNOUNCE: mdadm 3.3 - A tool for managing md Soft RAID under Linux
+
+I am pleased to announce the availability of
+ mdadm version 3.3
+
+It is available at the usual places:
+ http://www.kernel.org/pub/linux/utils/raid/mdadm/
+and via git at
+ git://github.com/neilbrown/mdadm
+ git://neil.brown.name/mdadm
+ http://git.neil.brown.name/git/mdadm
+
+This is a major new release so don't be too surprised if there are a
+few issues. If I hear about them they will be fixed in 3.3.1.
+git log reports nearly 500 changes since 3.2.6 so I won't list them
+all.
+
+Some highlights are:
+
+- Some array reshapes can proceed without needing backup file.
+ This is done by changing the 'data_offset' so we never need to write
+ any data back over where it was before. If there is no "head space"
+ or "tail space" to allow data_offset to change, the old mechanism
+ with a backup file can still be used.
+- RAID10 arrays can be reshaped to change the number of devices,
+ change the chunk size, or change the layout between 'near'
+ and 'offset'.
+ This will always change data_offset, and will fail if there is no
+ room for data_offset to be moved.
+- "--assemble --update=metadata" can convert a 0.90 array to a 1.0 array.
+- bad-block-logs are supported (but not heavily tested yet)
+- "--assemble --update=revert-reshape" can be used to undo a reshape
+ that has just been started but isn't really wanted. This is very
+ new and while it passes basic tests it cannot be guaranteed.
+- improved locking between --incremental and --assemble
+- uses systemd to run "mdmon" if systemd is configured to do that.
+- kernel names of md devices can be non-numeric. e.g. "md_home" rather than
+ "md0". This will probably confuse lots of other tools, so you need to
+ echo CREATE names=yes >> /etc/mdadm.conf
+ or the feature will not be used. (you also need a reasonably new kernel).
+- "--stop" can be given a kernel name instead of a device name. i.e
+ mdadm --stop md4
+ will work even if /dev/md4 doesn't exist.
+- "--detail --export" has some information about the devices in the array
+- --dump and --restore can be used to backup and restore the metadata on an
+ array.
+- Hot-replace is supported with
+ mdadm /dev/mdX --replace /dev/foo
+ and
+ mdadm /dev/mdX --replace /dev/foo --with /dev/bar
+- Config file can be a directory in which case all "*.conf" files are
+ read in lexical order.
+ Default is to read /etc/mdadm.conf and then /etc/mdadm.conf.d
+ Thus
+ echo CREATE names=yes > /etc/mdadm.conf.d/names.conf
+ will also enable the use of named md devices.
+
+- Lots of improvements to DDF support including adding support for
+ RAID10 (thanks Martin Wilck).
+
+and lots of bugfixes and other little changes.
+
+NeilBrown 3rd September 2013
diff --git a/ANNOUNCE-3.3.1 b/ANNOUNCE-3.3.1
new file mode 100644
index 0000000..7d5e666
--- /dev/null
+++ b/ANNOUNCE-3.3.1
@@ -0,0 +1,23 @@
+Subject: ANNOUNCE: mdadm 3.3.1 - A tool for managing md Soft RAID under Linux
+
+I am pleased to announce the availability of
+ mdadm version 3.3.1
+
+It is available at the usual places:
+ http://www.kernel.org/pub/linux/utils/raid/mdadm/
+and via git at
+ git://github.com/neilbrown/mdadm
+ git://neil.brown.name/mdadm
+ http://git.neil.brown.name/git/mdadm.git
+
+The main changes are:
+ - lots of work on "DDF" support. Hopefully it will be more stable
+ now. Bug reports are always welcome.
+ - improved interactions with 'systemd'. Where possible, background
+ tasks are run from systemd (if it is present) rather than forking and
+ disassociating from the session. This is important because udev
+ doesn't really let you disassociate.
+
+though there are a number of other little bug fixes too.
+
+NeilBrown 5th June 2014
diff --git a/ANNOUNCE-3.3.2 b/ANNOUNCE-3.3.2
new file mode 100644
index 0000000..6b54961
--- /dev/null
+++ b/ANNOUNCE-3.3.2
@@ -0,0 +1,16 @@
+Subject: ANNOUNCE: mdadm 3.3.2 - A tool for managing md Soft RAID under Linux
+
+I am pleased to announce the availability of
+ mdadm version 3.3.2
+
+It is available at the usual places:
+ http://www.kernel.org/pub/linux/utils/raid/mdadm/
+and via git at
+ git://github.com/neilbrown/mdadm
+ git://neil.brown.name/mdadm
+ http://git.neil.brown.name/git/mdadm.git
+
+Changes since 3.3.1 are mostly little bugfixes and some man-page
+updates.
+
+NeilBrown 21st August 2014
diff --git a/ANNOUNCE-3.3.3 b/ANNOUNCE-3.3.3
new file mode 100644
index 0000000..ac1b217
--- /dev/null
+++ b/ANNOUNCE-3.3.3
@@ -0,0 +1,18 @@
+Subject: ANNOUNCE: mdadm 3.3.3 - A tool for managing md Soft RAID under Linux
+
+I am pleased to announce the availability of
+ mdadm version 3.3.3
+
+It is available at the usual places:
+ http://www.kernel.org/pub/linux/utils/raid/mdadm/
+and via git at
+ git://github.com/neilbrown/mdadm
+ git://neil.brown.name/mdadm
+ http://git.neil.brown.name/git/mdadm.git
+
+The 100 changes since 3.3.2 are mostly little bugfixes and some improvements
+to the selftests.
+raid6check now handles all RAID6 layouts including DDF correctly.
+See git log for the rest.
+
+NeilBrown 24th July 2015
diff --git a/ANNOUNCE-3.3.4 b/ANNOUNCE-3.3.4
new file mode 100644
index 0000000..52b9456
--- /dev/null
+++ b/ANNOUNCE-3.3.4
@@ -0,0 +1,37 @@
+Subject: ANNOUNCE: mdadm 3.3.4 - A tool for managing md Soft RAID under Linux
+
+I am somewhat disappointed to have to announce the availability of
+ mdadm version 3.3.4
+
+It is available at the usual places:
+ http://www.kernel.org/pub/linux/utils/raid/mdadm/
+and via git at
+ git://github.com/neilbrown/mdadm
+ git://neil.brown.name/mdadm
+ http://git.neil.brown.name/git/mdadm.git
+
+In mdadm-3.3 a change was made to how IMSM (Intel Matrix Storage
+Manager) metadata was handled. Previously an IMSM array would only
+be assembled if it was attached to an IMSM controller.
+
+In 3.3 this was relaxed as there are circumstances where the
+controller is not properly detected. Unfortunately this has negative
+consequences which have only just come to light.
+
+If you have an IMSM RAID1 configured and then disable RAID in the
+BIOS, the metadata will remain on the devices. If you then install
+some other OS on one device and then install Linux on the other, Linux
+might eventually start noticing the IMSM metadata (depending a bit on whether
+mdadm is included in the initramfs) and might start up the RAID1. This could
+copy one device over the other, thus trashing one of the installations.
+
+Not good.
+
+So with this release IMSM arrays will only be assembled if attached to
+an IMSM controller, or if "--force" is given to --assemble, or if the
+environment variable IMSM_NO_PLATFORM is set (used primarily for
+testing).
+
+I strongly recommend upgrading to 3.3.4 if you are using 3.3 or later.
+
+NeilBrown 3rd August 2015.
diff --git a/ANNOUNCE-3.4 b/ANNOUNCE-3.4
new file mode 100644
index 0000000..2689732
--- /dev/null
+++ b/ANNOUNCE-3.4
@@ -0,0 +1,24 @@
+Subject: ANNOUNCE: mdadm 3.4 - A tool for managing md Soft RAID under Linux
+
+I am pleased to announce the availability of
+ mdadm version 3.4
+
+It is available at the usual places:
+ http://www.kernel.org/pub/linux/utils/raid/mdadm/
+and via git at
+ git://github.com/neilbrown/mdadm
+ git://neil.brown.name/mdadm
+ http://git.neil.brown.name/git/mdadm
+
+The new second-level version number reflects significant new
+functionality, particularly support for journalled RAID5/6 and clustered
+RAID1. This new support is probably still buggy. Please report bugs.
+
+There are also a number of fixes for Intel's IMSM metadata support,
+and an assortment of minor bug fixes.
+
+I plan for this to be the last release of mdadm that I provide as I am
+retiring from MD and mdadm maintenance. Jes Sorensen has volunteered
+to oversee mdadm for the next while. Thanks Jes!
+
+NeilBrown 28th January 2016
diff --git a/Assemble.c b/Assemble.c
new file mode 100644
index 0000000..d199afc
--- /dev/null
+++ b/Assemble.c
@@ -0,0 +1,2070 @@
+/*
+ * mdadm - manage Linux "md" devices aka RAID arrays.
+ *
+ * Copyright (C) 2001-2016 Neil Brown <neilb@suse.com>
+ *
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * Author: Neil Brown
+ * Email: <neilb@suse.de>
+ */
+
+#include "mdadm.h"
+#include <ctype.h>
+
+static int name_matches(char *found, char *required, char *homehost, int require_homehost)
+{
+ /* Return 1 if 'found' equals 'required' exactly, or if 'found' has the
+ * form "<prefix>:<required>" and the prefix passes the test below; return 0 otherwise.
+ */
+ char *sep; /* first ':' in 'found', or NULL */
+ unsigned int l; /* number of characters in 'found' before 'sep' */
+
+ if (strcmp(found, required)==0)
+ return 1;
+ sep = strchr(found, ':');
+ if (!sep)
+ return 0; /* no ':' to split on, and the exact compare already failed */
+ l = sep - found;
+ if (strncmp(found, "any:", 4) == 0 || /* 'found' starts with "any:" */
+ (homehost && strcmp(homehost, "any") == 0) || /* homehost is the string "any" */
+ !require_homehost || /* every prefix is accepted when require_homehost is 0 */
+ (homehost && strlen(homehost) == l &&
+ strncmp(found, homehost, l) == 0)) { /* prefix is exactly homehost */
+ /* prefix accepted: the text after the first ':' must equal 'required' */
+ if (strcmp(sep+1, required) == 0)
+ return 1;
+ }
+ return 0;
+}
+
+static int is_member_busy(char *metadata_version)
+{
+ /* Return 1 if the list from mdstat_read() has an "external:" entry that is_subarray() accepts and whose text matches 'metadata_version' (see the compare below), else 0. */
+ struct mdstat_ent *mdstat = mdstat_read(0, 0); /* list head; released with free_mdstat() before returning */
+ struct mdstat_ent *ent;
+ int busy = 0;
+
+ for (ent = mdstat; ent; ent = ent->next) {
+ if (ent->metadata_version == NULL)
+ continue;
+ if (strncmp(ent->metadata_version, "external:", 9) != 0)
+ continue; /* only entries starting with "external:" are considered */
+ if (!is_subarray(&ent->metadata_version[9]))
+ continue; /* is_subarray() rejected the text after "external:" */
+ /* Compare from index 10 (one character past "external:") against 'metadata_version' without its first character - that character can be '/' or '-' */
+ if (strcmp(&ent->metadata_version[10], metadata_version+1) == 0) {
+ busy = 1;
+ break;
+ }
+ }
+ free_mdstat(mdstat);
+
+ return busy;
+}
+
+static int ident_matches(struct mddev_ident *ident,
+ struct mdinfo *content,
+ struct supertype *tst,
+ char *homehost, int require_homehost,
+ char *update, char *devname)
+{
+ /* Return 1 if 'content' passes every criterion that is set in 'ident', else 0 at the first failure; the failure is reported with pr_err() only when 'devname' is non-NULL. */
+ if (ident->uuid_set && (!update || strcmp(update, "uuid")!= 0) && /* uuid check is skipped when update is "uuid" */
+ same_uuid(content->uuid, ident->uuid, tst->ss->swapuuid)==0 &&
+ memcmp(content->uuid, uuid_zero, sizeof(int[4])) != 0) { /* a content uuid equal to uuid_zero is never rejected here */
+ if (devname)
+ pr_err("%s has wrong uuid.\n", devname);
+ return 0;
+ }
+ if (ident->name[0] && (!update || strcmp(update, "name")!= 0) && /* name check is skipped when update is "name" */
+ name_matches(content->name, ident->name, homehost, require_homehost)==0) {
+ if (devname)
+ pr_err("%s has wrong name.\n", devname);
+ return 0;
+ }
+ if (ident->super_minor != UnSet && /* each remaining numeric criterion applies only when it is not UnSet */
+ ident->super_minor != content->array.md_minor) {
+ if (devname)
+ pr_err("%s has wrong super-minor.\n",
+ devname);
+ return 0;
+ }
+ if (ident->level != UnSet &&
+ ident->level != content->array.level) {
+ if (devname)
+ pr_err("%s has wrong raid level.\n",
+ devname);
+ return 0;
+ }
+ if (ident->raid_disks != UnSet &&
+ content->array.raid_disks != 0 && /* 0 means the metadata doesn't know how many to expect, so no check */
+ ident->raid_disks!= content->array.raid_disks) {
+ if (devname)
+ pr_err("%s requires wrong number of drives.\n",
+ devname);
+ return 0;
+ }
+ if (ident->member && ident->member[0]) {
+ /* the text after the first '/' found from content->text_version+1 onward must equal ident->member */
+ char *s = strchr(content->text_version+1, '/');
+ if (s == NULL) {
+ if (devname)
+ pr_err("%s is not a container and one is required.\n",
+ devname);
+ return 0;
+ } else if (strcmp(ident->member, s+1) != 0) {
+ if (devname)
+ pr_err("skipping wrong member %s is %s\n",
+ content->text_version, devname);
+ return 0;
+ }
+ }
+ return 1;
+}
+
+static int select_devices(struct mddev_dev *devlist,
+ struct mddev_ident *ident,
+ struct supertype **stp,
+ struct mdinfo **contentp,
+ struct context *c,
+ int inargv, int auto_assem)
+{
+ struct mddev_dev *tmpdev;
+ int num_devs;
+ struct supertype *st = *stp;
+ struct mdinfo *content = NULL;
+ int report_mismatch = ((inargv && c->verbose >= 0) || c->verbose > 0);
+ struct domainlist *domains = NULL;
+
+ tmpdev = devlist; num_devs = 0;
+ while (tmpdev) {
+ if (tmpdev->used)
+ tmpdev->used = 2;
+ else
+ num_devs++;
+ tmpdev->disposition = 0;
+ tmpdev = tmpdev->next;
+ }
+
+ /* first walk the list of devices to find a consistent set
+ * that match the criterea, if that is possible.
+ * We flag the ones we like with 'used'.
+ */
+ for (tmpdev = devlist;
+ tmpdev;
+ tmpdev = tmpdev ? tmpdev->next : NULL) {
+ char *devname = tmpdev->devname;
+ int dfd;
+ struct stat stb;
+ struct supertype *tst;
+ struct dev_policy *pol = NULL;
+ int found_container = 0;
+
+ if (tmpdev->used > 1)
+ continue;
+
+ if (ident->container) {
+ if (ident->container[0] == '/' &&
+ !same_dev(ident->container, devname)) {
+ if (report_mismatch)
+ pr_err("%s is not the container required (%s)\n",
+ devname, ident->container);
+ continue;
+ }
+ } else if (ident->devices &&
+ !match_oneof(ident->devices, devname)) {
+ /* Note that we ignore the "device=" identifier if a
+ * "container=" is given. Checking both is unnecessarily
+ * complicated.
+ */
+ if (report_mismatch)
+ pr_err("%s is not one of %s\n", devname, ident->devices);
+ continue;
+ }
+
+ tst = dup_super(st);
+
+ dfd = dev_open(devname, O_RDONLY);
+ if (dfd < 0) {
+ if (report_mismatch)
+ pr_err("cannot open device %s: %s\n",
+ devname, strerror(errno));
+ tmpdev->used = 2;
+ } else if (fstat(dfd, &stb)< 0) {
+ /* Impossible! */
+ pr_err("fstat failed for %s: %s\n",
+ devname, strerror(errno));
+ tmpdev->used = 2;
+ } else if ((stb.st_mode & S_IFMT) != S_IFBLK) {
+ pr_err("%s is not a block device.\n",
+ devname);
+ tmpdev->used = 2;
+ } else if (must_be_container(dfd)) {
+ if (st) {
+ /* already found some components, this cannot
+ * be another one.
+ */
+ if (report_mismatch)
+ pr_err("%s is a container, but we are looking for components\n",
+ devname);
+ tmpdev->used = 2;
+#if !defined(MDASSEMBLE) || defined(MDASSEMBLE) && defined(MDASSEMBLE_AUTO)
+ } if (!tst && (tst = super_by_fd(dfd, NULL)) == NULL) {
+ if (report_mismatch)
+ pr_err("not a recognisable container: %s\n",
+ devname);
+ tmpdev->used = 2;
+#endif
+ } else if (!tst->ss->load_container
+ || tst->ss->load_container(tst, dfd, NULL)) {
+ if (report_mismatch)
+ pr_err("no correct container type: %s\n",
+ devname);
+ tmpdev->used = 2;
+ } else if (auto_assem &&
+ !conf_test_metadata(tst->ss->name, (pol = devid_policy(stb.st_rdev)),
+ tst->ss->match_home(tst, c->homehost) == 1)) {
+ if (report_mismatch)
+ pr_err("%s has metadata type %s for which auto-assembly is disabled\n",
+ devname, tst->ss->name);
+ tmpdev->used = 2;
+ } else
+ found_container = 1;
+ } else {
+ if (!tst && (tst = guess_super(dfd)) == NULL) {
+ if (report_mismatch)
+ pr_err("no recogniseable superblock on %s\n",
+ devname);
+ tmpdev->used = 2;
+ } else if ((tst->ignore_hw_compat = 0),
+ tst->ss->load_super(tst, dfd,
+ report_mismatch ? devname : NULL)) {
+ if (report_mismatch)
+ pr_err("no RAID superblock on %s\n",
+ devname);
+ tmpdev->used = 2;
+ } else if (tst->ss->compare_super == NULL) {
+ if (report_mismatch)
+ pr_err("Cannot assemble %s metadata on %s\n",
+ tst->ss->name, devname);
+ tmpdev->used = 2;
+ } else if (auto_assem && st == NULL &&
+ !conf_test_metadata(tst->ss->name, (pol = devid_policy(stb.st_rdev)),
+ tst->ss->match_home(tst, c->homehost) == 1)) {
+ if (report_mismatch)
+ pr_err("%s has metadata type %s for which auto-assembly is disabled\n",
+ devname, tst->ss->name);
+ tmpdev->used = 2;
+ }
+ }
+ if (dfd >= 0) close(dfd);
+ if (tmpdev->used == 2) {
+ if (auto_assem || !inargv)
+ /* Ignore unrecognised devices during auto-assembly */
+ goto loop;
+ if (ident->uuid_set || ident->name[0] ||
+ ident->super_minor != UnSet)
+ /* Ignore unrecognised device if looking for
+ * specific array */
+ goto loop;
+
+ pr_err("%s has no superblock - assembly aborted\n",
+ devname);
+ if (st)
+ st->ss->free_super(st);
+ dev_policy_free(pol);
+ domain_free(domains);
+ return -1;
+ }
+
+ if (found_container) {
+ /* tmpdev is a container. We need to be either
+ * looking for a member, or auto-assembling
+ */
+ /* should be safe to try an exclusive open now, we
+ * have rejected anything that some other mdadm might
+ * be looking at
+ */
+ dfd = dev_open(devname, O_RDONLY | O_EXCL);
+ if (dfd < 0) {
+ if (report_mismatch)
+ pr_err("%s is busy - skipping\n", devname);
+ goto loop;
+ }
+ close(dfd);
+
+ if (ident->container && ident->container[0] != '/') {
+ /* we have a uuid */
+ int uuid[4];
+
+ content = *contentp;
+ tst->ss->getinfo_super(tst, content, NULL);
+
+ if (!parse_uuid(ident->container, uuid) ||
+ !same_uuid(content->uuid, uuid, tst->ss->swapuuid)) {
+ if (report_mismatch)
+ pr_err("%s has wrong UUID to be required container\n",
+ devname);
+ goto loop;
+ }
+ }
+ /* It is worth looking inside this container.
+ */
+ if (c->verbose > 0)
+ pr_err("looking in container %s\n",
+ devname);
+
+ for (content = tst->ss->container_content(tst, NULL);
+ content;
+ content = content->next) {
+
+ if (!ident_matches(ident, content, tst,
+ c->homehost, c->require_homehost,
+ c->update,
+ report_mismatch ? devname : NULL))
+ /* message already printed */;
+ else if (is_member_busy(content->text_version)) {
+ if (report_mismatch)
+ pr_err("member %s in %s is already assembled\n",
+ content->text_version,
+ devname);
+ } else if (content->array.state & (1<<MD_SB_BLOCK_VOLUME)) {
+ /* do not assemble arrays with unsupported configurations */
+ pr_err("Cannot activate member %s in %s.\n",
+ content->text_version,
+ devname);
+ } else
+ break;
+ }
+ if (!content) {
+ tmpdev->used = 2;
+ goto loop; /* empty container */
+ }
+
+ st = tst; tst = NULL;
+ if (!auto_assem && inargv && tmpdev->next != NULL) {
+ pr_err("%s is a container, but is not only device given: confused and aborting\n",
+ devname);
+ st->ss->free_super(st);
+ dev_policy_free(pol);
+ domain_free(domains);
+ return -1;
+ }
+ if (c->verbose > 0)
+ pr_err("found match on member %s in %s\n",
+ content->text_version, devname);
+
+ /* make sure we finished the loop */
+ tmpdev = NULL;
+ goto loop;
+ } else {
+ content = *contentp;
+ tst->ss->getinfo_super(tst, content, NULL);
+
+ if (!ident_matches(ident, content, tst,
+ c->homehost, c->require_homehost,
+ c->update,
+ report_mismatch ? devname : NULL))
+ goto loop;
+
+ if (auto_assem) {
+ /* Never auto-assemble things that conflict
+ * with mdadm.conf in some way
+ */
+ struct mddev_ident *match;
+ int rv = 0;
+
+ match = conf_match(tst, content, devname,
+ report_mismatch ? c->verbose : -1,
+ &rv);
+ if (!match && rv == 2)
+ goto loop;
+ if (match && match->devname &&
+ strcasecmp(match->devname, "<ignore>") == 0) {
+ if (report_mismatch)
+ pr_err("%s is a member of an explicitly ignored array\n",
+ devname);
+ goto loop;
+ }
+ if (match && !ident_matches(match, content, tst,
+ c->homehost, c->require_homehost,
+ c->update,
+ report_mismatch ? devname : NULL))
+ /* Array exists in mdadm.conf but some
+ * details don't match, so reject it
+ */
+ goto loop;
+ }
+
+ /* should be safe to try an exclusive open now, we
+ * have rejected anything that some other mdadm might
+ * be looking at
+ */
+ dfd = dev_open(devname, O_RDONLY | O_EXCL);
+ if (dfd < 0) {
+ if (report_mismatch)
+ pr_err("%s is busy - skipping\n", devname);
+ goto loop;
+ }
+ close(dfd);
+
+ if (st == NULL)
+ st = dup_super(tst);
+ if (st->minor_version == -1)
+ st->minor_version = tst->minor_version;
+
+ if (memcmp(content->uuid, uuid_zero,
+ sizeof(int[4])) == 0) {
+ /* this is a floating spare. It cannot define
+ * an array unless there are no more arrays of
+ * this type to be found. It can be included
+ * in an array of this type though.
+ */
+ tmpdev->used = 3;
+ goto loop;
+ }
+
+ if (st->ss != tst->ss ||
+ st->minor_version != tst->minor_version ||
+ st->ss->compare_super(st, tst) != 0) {
+ /* Some mismatch. If exactly one array matches this host,
+ * we can resolve on that one.
+ * Or, if we are auto assembling, we just ignore the second
+ * for now.
+ */
+ if (auto_assem)
+ goto loop;
+ if (c->homehost) {
+ int first = st->ss->match_home(st, c->homehost);
+ int last = tst->ss->match_home(tst, c->homehost);
+ if (first != last &&
+ (first == 1 || last == 1)) {
+ /* We can do something */
+ if (first) {/* just ignore this one */
+ if (report_mismatch)
+ pr_err("%s misses out due to wrong homehost\n",
+ devname);
+ goto loop;
+ } else { /* reject all those sofar */
+ struct mddev_dev *td;
+ if (report_mismatch)
+ pr_err("%s overrides previous devices due to good homehost\n",
+ devname);
+ for (td=devlist; td != tmpdev; td=td->next)
+ if (td->used == 1)
+ td->used = 0;
+ tmpdev->used = 1;
+ goto loop;
+ }
+ }
+ }
+ pr_err("superblock on %s doesn't match others - assembly aborted\n",
+ devname);
+ tst->ss->free_super(tst);
+ st->ss->free_super(st);
+ dev_policy_free(pol);
+ domain_free(domains);
+ return -1;
+ }
+ tmpdev->used = 1;
+ }
+ loop:
+ /* Collect domain information from members only */
+ if (tmpdev && tmpdev->used == 1) {
+ if (!pol)
+ pol = devid_policy(stb.st_rdev);
+ domain_merge(&domains, pol, tst?tst->ss->name:NULL);
+ }
+ dev_policy_free(pol);
+ pol = NULL;
+ if (tst)
+ tst->ss->free_super(tst);
+ }
+
+ /* Check if we found some imsm spares but no members */
+ if ((auto_assem ||
+ (ident->uuid_set &&
+ memcmp(uuid_zero, ident->uuid,sizeof(uuid_zero)) == 0)) &&
+ (!st || !st->sb))
+ for (tmpdev = devlist; tmpdev; tmpdev = tmpdev->next) {
+ if (tmpdev->used != 3)
+ continue;
+ tmpdev->used = 1;
+ content = *contentp;
+
+ if (!st->sb) {
+ /* we need sb from one of the spares */
+ int dfd = dev_open(tmpdev->devname, O_RDONLY);
+ if (dfd < 0 ||
+ st->ss->load_super(st, dfd, NULL))
+ tmpdev->used = 2;
+ if (dfd > 0)
+ close(dfd);
+ }
+ }
+
+ /* Now reject spares that don't match domains of identified members */
+ for (tmpdev = devlist; tmpdev; tmpdev = tmpdev->next) {
+ struct stat stb;
+ if (tmpdev->used != 3)
+ continue;
+ if (stat(tmpdev->devname, &stb)< 0) {
+ pr_err("fstat failed for %s: %s\n",
+ tmpdev->devname, strerror(errno));
+ tmpdev->used = 2;
+ } else {
+ struct dev_policy *pol = devid_policy(stb.st_rdev);
+ int dt = domain_test(domains, pol, NULL);
+ if (inargv && dt != 0)
+ /* take this spare as domains match
+ * if there are any */
+ tmpdev->used = 1;
+ else if (!inargv && dt == 1)
+ /* device wasn't explicitly listed, so need
+ * explicit domain match - which we have */
+ tmpdev->used = 1;
+ else
+ /* if domains don't match mark as unused */
+ tmpdev->used = 0;
+ dev_policy_free(pol);
+ }
+ }
+ domain_free(domains);
+ *stp = st;
+ if (st && st->sb && content == *contentp)
+ st->ss->getinfo_super(st, content, NULL);
+ *contentp = content;
+
+ return num_devs;
+}
+
+struct devs { /* one candidate member device, filled in by load_devices() */
+ char *devname; /* device path; points at the mddev_dev entry's string, not a copy */
+ int uptodate; /* set once we decide that this device is as
+ * recent as everything else in the array.
+ * load_devices() initialises it to 0.
+ */
+ int included; /* set if the device is already in the array
+ * due to a previous '-I'
+ * (i.e. the list entry's disposition is 'I')
+ */
+ struct mdinfo i; /* copy of the getinfo_super() result for this device, with disk.major/minor taken from fstat() */
+};
+
+static int load_devices(struct devs *devices, char *devmap,
+ struct mddev_ident *ident, struct supertype **stp,
+ struct mddev_dev *devlist, struct context *c,
+ struct mdinfo *content,
+ int mdfd, char *mddev,
+ int *most_recentp, int *bestcntp, int **bestp,
+ int inargv)
+{ /* Re-read the metadata of every entry of devlist marked used == 1,
+ * apply c->update to it when one is requested (unless built with
+ * MDASSEMBLE), and record the device in devices[] and in the slot
+ * table best[].
+ *
+ * devices - output array; entry devcnt is filled for each device loaded
+ * devmap - handed to getinfo_super(), offset by devcnt * raid_disks
+ * stp - in/out reference supertype; on return it holds the
+ * superblock of the device with the highest event count among
+ * those whose disk.state == 6 (unchanged if there was none)
+ * content - scratch mdinfo, overwritten for every device
+ * most_recentp - written only if such a device was found
+ * bestcntp, bestp - slot table and its length; written on success only.
+ * Index is raid_disk*2 (+1 for a replacement); devices
+ * with raid_disk -1 or MD_DISK_ROLE_JOURNAL are placed
+ * from raid_disks*2 upwards.
+ * inargv - only selects the wording of the duplicate-superblock warning
+ *
+ * Returns the number of devices loaded, or -1 on error. On every error
+ * return mdfd has been closed and devices and devmap have been freed,
+ * so the caller must not touch them again.
+ */
+ struct mddev_dev *tmpdev;
+ int devcnt = 0;
+ int nextspare = 0;
+#ifndef MDASSEMBLE
+ int bitmap_done = 0;
+#endif
+ int most_recent = -1;
+ int bestcnt = 0;
+ int *best = *bestp;
+ struct supertype *st = *stp;
+
+ for (tmpdev = devlist; tmpdev; tmpdev=tmpdev->next) {
+ char *devname = tmpdev->devname;
+ struct stat stb;
+ struct supertype *tst;
+ int i;
+ int dfd;
+
+ if (tmpdev->used != 1)
+ continue;
+ /* looks like a good enough match to update the super block if needed */
+#ifndef MDASSEMBLE
+ if (c->update) {
+ /* prepare useful information in info structures */
+ struct stat stb2;
+ int err;
+ fstat(mdfd, &stb2); /* result unchecked; only st_rdev is used, for md_minor below */
+
+ if (strcmp(c->update, "uuid")==0 &&
+ !ident->uuid_set) { /* no uuid supplied: invent one */
+ int rfd;
+ if ((rfd = open("/dev/urandom", O_RDONLY)) < 0 ||
+ read(rfd, ident->uuid, 16) != 16) { /* fall back to random() */
+ *(__u32*)(ident->uuid) = random();
+ *(__u32*)(ident->uuid+1) = random();
+ *(__u32*)(ident->uuid+2) = random();
+ *(__u32*)(ident->uuid+3) = random();
+ }
+ if (rfd >= 0) close(rfd);
+ }
+ dfd = dev_open(devname,
+ tmpdev->disposition == 'I'
+ ? O_RDWR : (O_RDWR|O_EXCL)); /* no O_EXCL for entries flagged 'I' */
+
+ tst = dup_super(st);
+ if (dfd < 0 || tst->ss->load_super(tst, dfd, NULL) != 0) {
+ pr_err("cannot re-read metadata from %s - aborting\n",
+ devname);
+ if (dfd >= 0)
+ close(dfd);
+ close(mdfd);
+ free(devices);
+ free(devmap);
+ tst->ss->free_super(tst);
+ free(tst);
+ *stp = st;
+ return -1;
+ }
+ tst->ss->getinfo_super(tst, content, devmap + devcnt * content->array.raid_disks);
+
+ memcpy(content->uuid, ident->uuid, 16); /* these three fields are set before update_super() is called below */
+ strcpy(content->name, ident->name);
+ content->array.md_minor = minor(stb2.st_rdev);
+
+ if (strcmp(c->update, "byteorder") == 0)
+ err = 0; /* no update_super() call is made for "byteorder" */
+ else if (strcmp(c->update, "home-cluster") == 0) {
+ tst->cluster_name = c->homecluster;
+ err = tst->ss->write_bitmap(tst, dfd, NameUpdate);
+ } else if (strcmp(c->update, "nodes") == 0) {
+ tst->nodes = c->nodes;
+ err = tst->ss->write_bitmap(tst, dfd, NodeNumUpdate);
+ } else if (strcmp(c->update, "revert-reshape") == 0 &&
+ c->invalid_backup)
+ err = tst->ss->update_super(tst, content,
+ "revert-reshape-nobackup",
+ devname, c->verbose,
+ ident->uuid_set,
+ c->homehost);
+ else
+ err = tst->ss->update_super(tst, content, c->update,
+ devname, c->verbose,
+ ident->uuid_set,
+ c->homehost);
+ if (err < 0) {
+ if (err == -1) /* other negative values abort without this message */
+ pr_err("--update=%s not understood for %s metadata\n",
+ c->update, tst->ss->name);
+ tst->ss->free_super(tst);
+ free(tst);
+ close(mdfd);
+ close(dfd);
+ free(devices);
+ free(devmap);
+ *stp = st;
+ return -1;
+ }
+ if (strcmp(c->update, "uuid")==0 &&
+ !ident->uuid_set) { /* remember the uuid so later devices get the same one */
+ ident->uuid_set = 1;
+ memcpy(ident->uuid, content->uuid, 16);
+ }
+ if (tst->ss->store_super(tst, dfd)) /* a write failure is reported but does not abort */
+ pr_err("Could not re-write superblock on %s.\n",
+ devname);
+
+ if (strcmp(c->update, "uuid")==0 &&
+ ident->bitmap_fd >= 0 && !bitmap_done) { /* external bitmap: retried per device until one update succeeds */
+ if (bitmap_update_uuid(ident->bitmap_fd,
+ content->uuid,
+ tst->ss->swapuuid) != 0)
+ pr_err("Could not update uuid on external bitmap.\n");
+ else
+ bitmap_done = 1;
+ }
+ } else
+#endif
+ { /* no update requested (or MDASSEMBLE build): just load and inspect */
+ dfd = dev_open(devname,
+ tmpdev->disposition == 'I'
+ ? O_RDWR : (O_RDWR|O_EXCL));
+ tst = dup_super(st);
+
+ if (dfd < 0 || tst->ss->load_super(tst, dfd, NULL) != 0) {
+ pr_err("cannot re-read metadata from %s - aborting\n",
+ devname);
+ if (dfd >= 0)
+ close(dfd);
+ close(mdfd);
+ free(devices);
+ free(devmap);
+ tst->ss->free_super(tst);
+ free(tst);
+ *stp = st;
+ return -1;
+ }
+ tst->ss->getinfo_super(tst, content, devmap + devcnt * content->array.raid_disks);
+ }
+
+ fstat(dfd, &stb); /* result unchecked; st_rdev supplies major/minor below */
+ close(dfd);
+
+ if (c->verbose > 0)
+ pr_err("%s is identified as a member of %s, slot %d%s.\n",
+ devname, mddev, content->disk.raid_disk,
+ (content->disk.state & (1<<MD_DISK_REPLACEMENT)) ? " replacement":"");
+ devices[devcnt].devname = devname;
+ devices[devcnt].uptodate = 0;
+ devices[devcnt].included = (tmpdev->disposition == 'I');
+ devices[devcnt].i = *content;
+ devices[devcnt].i.disk.major = major(stb.st_rdev);
+ devices[devcnt].i.disk.minor = minor(stb.st_rdev);
+
+ if (devices[devcnt].i.disk.state == 6) { /* NOTE(review): 6 is presumably active|in-sync - confirm against the MD_DISK_* bits */
+ if (most_recent < 0 ||
+ devices[devcnt].i.events
+ > devices[most_recent].i.events) {
+ struct supertype *tmp = tst; /* swap: keep this superblock as st, the old st is freed below as tst */
+ tst = st;
+ st = tmp;
+ most_recent = devcnt;
+ }
+ }
+ tst->ss->free_super(tst);
+ free(tst);
+
+ if (content->array.level == LEVEL_MULTIPATH)
+ /* with multipath, the raid_disk from the superblock is meaningless */
+ i = devcnt;
+ else
+ i = devices[devcnt].i.disk.raid_disk;
+ if (i+1 == 0 || i == MD_DISK_ROLE_JOURNAL) { /* raid_disk of -1, or a journal: park after the raid slots */
+ if (nextspare < content->array.raid_disks*2)
+ nextspare = content->array.raid_disks*2;
+ i = nextspare++;
+ } else {
+ /* i is raid_disk - double it so there is room for
+ * replacements */
+ i *= 2;
+ if (devices[devcnt].i.disk.state & (1<<MD_DISK_REPLACEMENT))
+ i++;
+ if (i >= content->array.raid_disks*2 &&
+ i >= nextspare)
+ nextspare = i+1;
+ }
+ if (i < 10000) { /* devices with a larger index are counted but not entered in best[] */
+ if (i >= bestcnt) { /* grow best[]: copy old entries, new ones start at -1 */
+ int newbestcnt = i+10;
+ int *newbest = xmalloc(sizeof(int)*newbestcnt);
+ int c; /* shadows the context parameter inside this block */
+ for (c=0; c < newbestcnt; c++)
+ if (c < bestcnt)
+ newbest[c] = best[c];
+ else
+ newbest[c] = -1;
+ if (best)free(best);
+ best = newbest;
+ bestcnt = newbestcnt;
+ }
+ if (best[i] >=0 &&
+ devices[best[i]].i.events
+ == devices[devcnt].i.events
+ && (devices[best[i]].i.disk.minor
+ != devices[devcnt].i.disk.minor)
+ && st->ss == &super0
+ && content->array.level != LEVEL_MULTIPATH) {
+ /* two different devices with identical superblock.
+ * Could be a mis-detection caused by overlapping
+ * partitions. fail-safe.
+ */
+ pr_err("WARNING %s and %s appear to have very similar superblocks.\n"
+ " If they are really different, please --zero the superblock on one\n"
+ " If they are the same or overlap, please remove one from %s.\n",
+ devices[best[i]].devname, devname,
+ inargv ? "the list" :
+ "the\n DEVICE list in mdadm.conf"
+ );
+ close(mdfd);
+ free(devices);
+ free(devmap);
+ *stp = st; /* NOTE(review): best is neither freed nor stored in *bestp on this path */
+ return -1;
+ }
+ if (best[i] == -1
+ || (devices[best[i]].i.events
+ < devices[devcnt].i.events))
+ best[i] = devcnt; /* the device with the highest event count wins the slot */
+ }
+ devcnt++;
+ }
+ if (most_recent >= 0)
+ *most_recentp = most_recent;
+ *bestcntp = bestcnt;
+ *bestp = best;
+ *stp = st;
+ return devcnt;
+}
+
+/*
+ * force_array - raise the event count of stale devices until the array
+ * has enough members to be started.
+ *
+ * The loop runs while enough() rejects 'avail' for the current geometry
+ * or, when a reshape with delta_disks > 0 is active, for raid_disks
+ * reduced by delta_disks with new_layout.  Each round picks, among the
+ * even-numbered entries of best[] (the odd ones are replacement slots),
+ * the device that is not yet up to date and has the highest event count.
+ * Its superblock is re-read, updated with update_super("force-one") using
+ * the event count of devices[most_recent], and written back.  Any other
+ * fully recovered device that had the same original event count is then
+ * forced too.
+ *
+ * content     - array info; content->events is overwritten
+ * devices     - device table; events/uptodate of forced entries are updated
+ * best        - slot table, bestcnt entries
+ * avail       - availability map passed to enough(); updated for forced devices
+ * most_recent - index in devices[] of the device supplying the event count
+ * st          - reference supertype, duplicated for every device touched
+ * c           - context; only c->verbose is read
+ *
+ * Returns the number of devices whose superblock was successfully forced.
+ */
+static int force_array(struct mdinfo *content,
+		       struct devs *devices,
+		       int *best, int bestcnt, char *avail,
+		       int most_recent,
+		       struct supertype *st,
+		       struct context *c)
+{
+	int okcnt = 0;
+
+	while (!enough(content->array.level, content->array.raid_disks,
+		       content->array.layout, 1,
+		       avail)
+	       ||
+	       (content->reshape_active && content->delta_disks > 0 &&
+		!enough(content->array.level, (content->array.raid_disks
+					       - content->delta_disks),
+			content->new_layout, 1,
+			avail)
+		       )) {
+		/* Choose the newest best drive which is
+		 * not up-to-date, update the superblock
+		 * and add it.
+		 */
+		int fd;
+		struct supertype *tst;
+		unsigned long long current_events;
+		int chosen_drive = -1;
+		int i;
+
+		for (i = 0;
+		     i < content->array.raid_disks * 2 && i < bestcnt;
+		     i += 2) {
+			int j = best[i];
+			if (j < 0)
+				continue;
+			if (devices[j].uptodate)
+				continue;
+			if (devices[j].i.recovery_start != MaxSector) {
+				int delta;
+				if (!devices[j].i.reshape_active ||
+				    devices[j].i.delta_disks <= 0)
+					continue;
+				/* When increasing number of devices, an
+				 * added device also appears to be
+				 * recovering.  It is safe to include it
+				 * as long as it won't be a source of
+				 * data.
+				 * For now, just allow for last data
+				 * devices in RAID4 or last devices in RAID4/5/6.
+				 */
+				delta = devices[j].i.delta_disks;
+				if (devices[j].i.array.level >= 4 &&
+				    devices[j].i.array.level <= 6 &&
+				    i/2 >= content->array.raid_disks - delta)
+					/* OK */;
+				else if (devices[j].i.array.level == 4 &&
+					 i/2 >= content->array.raid_disks - delta - 1)
+					/* OK */;
+				else
+					continue;
+			}
+			if (chosen_drive < 0 ||
+			    devices[j].i.events
+			    > devices[chosen_drive].i.events)
+				chosen_drive = j;
+		}
+		if (chosen_drive < 0)
+			break;
+		/* Remember the original count: the failure paths below and the
+		 * success path both overwrite devices[chosen_drive].i.events.
+		 */
+		current_events = devices[chosen_drive].i.events;
+	add_another:
+		/* Event counts are wider than int, so print them in full
+		 * rather than truncating them.
+		 */
+		if (c->verbose >= 0)
+			pr_err("forcing event count in %s(%d) from %llu upto %llu\n",
+			       devices[chosen_drive].devname,
+			       devices[chosen_drive].i.disk.raid_disk,
+			       (unsigned long long)devices[chosen_drive].i.events,
+			       (unsigned long long)devices[most_recent].i.events);
+		fd = dev_open(devices[chosen_drive].devname,
+			      devices[chosen_drive].included ? O_RDWR
+			      : (O_RDWR|O_EXCL));
+		if (fd < 0) {
+			pr_err("Couldn't open %s for write - not updating\n",
+			       devices[chosen_drive].devname);
+			/* NOTE(review): the device stays eligible with an
+			 * event count of 0; if it is the only candidate and
+			 * keeps failing, this loop looks like it would retry
+			 * it indefinitely - confirm.
+			 */
+			devices[chosen_drive].i.events = 0;
+			continue;
+		}
+		tst = dup_super(st);
+		if (tst->ss->load_super(tst,fd, NULL)) {
+			close(fd);
+			pr_err("RAID superblock disappeared from %s - not updating.\n",
+			       devices[chosen_drive].devname);
+			devices[chosen_drive].i.events = 0;
+			/* release the duplicate, as load_devices() does on
+			 * its own load_super() failure path
+			 */
+			tst->ss->free_super(tst);
+			free(tst);
+			continue;
+		}
+		content->events = devices[most_recent].i.events;
+		tst->ss->update_super(tst, content, "force-one",
+				      devices[chosen_drive].devname, c->verbose,
+				      0, NULL);
+
+		if (tst->ss->store_super(tst, fd)) {
+			close(fd);
+			pr_err("Could not re-write superblock on %s\n",
+			       devices[chosen_drive].devname);
+			devices[chosen_drive].i.events = 0;
+			tst->ss->free_super(tst);
+			free(tst);
+			continue;
+		}
+		close(fd);
+		devices[chosen_drive].i.events = devices[most_recent].i.events;
+		devices[chosen_drive].uptodate = 1;
+		/* NOTE(review): avail is indexed here by position in devices[],
+		 * not by raid slot - confirm this matches how the caller
+		 * sizes and indexes avail.
+		 */
+		avail[chosen_drive] = 1;
+		okcnt++;
+		/* free_super() only drops the loaded superblock; the
+		 * supertype returned by dup_super() must be freed as well.
+		 */
+		tst->ss->free_super(tst);
+		free(tst);
+		/* If there are any other drives of the same vintage,
+		 * add them in as well.  We can't lose and we might gain
+		 */
+		for (i = 0;
+		     i < content->array.raid_disks * 2 && i < bestcnt ;
+		     i += 2) {
+			int j = best[i];
+			if (j >= 0 &&
+			    !devices[j].uptodate &&
+			    devices[j].i.recovery_start == MaxSector &&
+			    devices[j].i.events == current_events) {
+				chosen_drive = j;
+				goto add_another;
+			}
+		}
+	}
+	return okcnt;
+}
+
+static int start_array(int mdfd,
+ char *mddev,
+ struct mdinfo *content,
+ struct supertype *st,
+ struct mddev_ident *ident,
+ int *best, int bestcnt,
+ int chosen_drive,
+ struct devs *devices,
+ unsigned int okcnt,
+ unsigned int sparecnt,
+ unsigned int rebuilding_cnt,
+ unsigned int journalcnt,
+ struct context *c,
+ int clean, char *avail,
+ int start_partial_ok,
+ int err_ok,
+ int was_forced
+ )
+{ /* Configure the md device mdfd (array info, optional bitmap file),
+ * add the devices listed in best[] with chosen_drive added last, and
+ * then either report a container as assembled or try to run the array.
+ *
+ * okcnt, sparecnt, rebuilding_cnt, journalcnt - device counts from the
+ * caller; okcnt/sparecnt are decremented locally when add_disk() fails
+ * clean, avail - passed to enough() to decide whether the array may run
+ * start_partial_ok - allow running with fewer than working_disks devices
+ * err_ok - tolerate a set_array_info() failure; for containers it also
+ * turns a still-partial assembly into a non-zero return
+ * was_forced - with RAID6 missing exactly one device, request a repair
+ *
+ * Returns 0 when the array was started, or when a container was
+ * assembled (unless err_ok is set and it is still partial); 2 when
+ * c->runstop == -1 left the array assembled but not started; 1 in all
+ * other cases. May set c->readonly.
+ */
+ int rv;
+ int i;
+ unsigned int req_cnt;
+
+ if (content->journal_device_required && (content->journal_clean == 0)) {
+ if (!c->force) {
+ pr_err("Not safe to assemble with missing or stale journal device, consider --force.\n");
+ return 1;
+ }
+ pr_err("Journal is missing or stale, starting array read only.\n");
+ c->readonly = 1; /* modifies the caller's context */
+ }
+
+ rv = set_array_info(mdfd, st, content);
+ if (rv && !err_ok) { /* with err_ok set, a failure here is ignored */
+ pr_err("failed to set array info for %s: %s\n",
+ mddev, strerror(errno));
+ return 1;
+ }
+ if (ident->bitmap_fd >= 0) { /* an already-open bitmap fd takes precedence over bitmap_file */
+ if (ioctl(mdfd, SET_BITMAP_FILE, ident->bitmap_fd) != 0) {
+ pr_err("SET_BITMAP_FILE failed.\n");
+ return 1;
+ }
+ } else if (ident->bitmap_file) {
+ /* From config file */
+ int bmfd = open(ident->bitmap_file, O_RDWR);
+ if (bmfd < 0) {
+ pr_err("Could not open bitmap file %s\n",
+ ident->bitmap_file);
+ return 1;
+ }
+ if (ioctl(mdfd, SET_BITMAP_FILE, bmfd) != 0) {
+ pr_err("Failed to set bitmapfile for %s\n", mddev);
+ close(bmfd);
+ return 1;
+ }
+ close(bmfd);
+ }
+
+ /* First, add the raid disks, but add the chosen one last */
+ for (i=0; i<= bestcnt; i++) { /* the extra i == bestcnt pass is the one that adds chosen_drive */
+ int j;
+ if (i < bestcnt) {
+ j = best[i];
+ if (j == chosen_drive)
+ continue;
+ } else
+ j = chosen_drive;
+
+ if (j >= 0 && !devices[j].included) {
+ int dfd = dev_open(devices[j].devname,
+ O_RDWR|O_EXCL);
+ if (dfd >= 0) { /* an open failure is ignored; add_disk() is attempted regardless */
+ remove_partitions(dfd);
+ close(dfd);
+ }
+ rv = add_disk(mdfd, st, content, &devices[j].i);
+
+ if (rv) {
+ pr_err("failed to add %s to %s: %s\n",
+ devices[j].devname,
+ mddev,
+ strerror(errno));
+ if (i < content->array.raid_disks * 2
+ || i == bestcnt)
+ okcnt--; /* a raid slot, or the chosen drive */
+ else
+ sparecnt--; /* entry beyond the raid slots */
+ } else if (c->verbose > 0)
+ pr_err("added %s to %s as %d%s%s\n",
+ devices[j].devname, mddev,
+ devices[j].i.disk.raid_disk,
+ devices[j].uptodate?"":
+ " (possibly out of date)",
+ (devices[j].i.disk.state & (1<<MD_DISK_REPLACEMENT))?" replacement":"");
+ } else if (j >= 0) {
+ if (c->verbose > 0)
+ pr_err("%s is already in %s as %d\n",
+ devices[j].devname, mddev,
+ devices[j].i.disk.raid_disk);
+ } else if (c->verbose > 0 && i < content->array.raid_disks*2
+ && (i&1) == 0) /* only even (non-replacement) raid slots are reported */
+ pr_err("no uptodate device for slot %d of %s\n",
+ i/2, mddev);
+ }
+
+ if (content->array.level == LEVEL_CONTAINER) { /* containers are reported and validated here, never run */
+ if (c->verbose >= 0) {
+ pr_err("Container %s has been assembled with %d drive%s",
+ mddev, okcnt+sparecnt+journalcnt,
+ okcnt+sparecnt+journalcnt==1?"":"s");
+ if (okcnt < (unsigned)content->array.raid_disks)
+ fprintf(stderr, " (out of %d)",
+ content->array.raid_disks);
+ fprintf(stderr, "\n");
+ }
+
+ if (st->ss->validate_container) {
+ struct mdinfo *devices_list;
+ struct mdinfo *info_devices = xmalloc(sizeof(struct mdinfo)*(okcnt+sparecnt));
+ unsigned int count;
+ devices_list = NULL;
+ for (count = 0; count < okcnt+sparecnt; count++) { /* builds a linked list, last entry first, from copies of devices[0..okcnt+sparecnt-1] */
+ info_devices[count] = devices[count].i;
+ info_devices[count].next = devices_list;
+ devices_list = &info_devices[count];
+ }
+ if (st->ss->validate_container(devices_list))
+ pr_err("Mismatch detected!\n");
+ free(info_devices);
+ }
+
+ st->ss->free_super(st);
+ sysfs_uevent(content, "change");
+ if (err_ok && okcnt < (unsigned)content->array.raid_disks)
+ /* Was partial, is still partial, so signal an error
+ * to ensure we don't retry */
+ return 1;
+ return 0;
+ }
+
+ /* Get number of in-sync devices according to the superblock.
+ * We must have this number to start the array without -s or -R
+ */
+ req_cnt = content->array.working_disks;
+
+ if (c->runstop == 1 ||
+ (c->runstop <= 0 &&
+ ( enough(content->array.level, content->array.raid_disks,
+ content->array.layout, clean, avail) &&
+ (okcnt + rebuilding_cnt >= req_cnt || start_partial_ok)
+ ))) {
+ /* This array is good-to-go.
+ * If a reshape is in progress then we might need to
+ * continue monitoring it. In that case we start
+ * it read-only and let the grow code make it writable.
+ */
+ int rv; /* shadows the function-level rv */
+#ifndef MDASSEMBLE
+ if (content->reshape_active &&
+ !(content->reshape_active & RESHAPE_NO_BACKUP) &&
+ content->delta_disks <= 0) { /* active reshape without RESHAPE_NO_BACKUP that does not add devices */
+ if (!c->backup_file) {
+ pr_err("%s: Need a backup file to complete reshape of this array.\n",
+ mddev);
+ pr_err("Please provided one with \"--backup-file=...\"\n");
+ if (c->update &&
+ strcmp(c->update, "revert-reshape") == 0)
+ pr_err("(Don't specify --update=revert-reshape again, that part succeeded.)\n");
+ return 1;
+ }
+ rv = sysfs_set_str(content, NULL,
+ "array_state", "readonly");
+ if (rv == 0)
+ rv = Grow_continue(mdfd, st, content,
+ c->backup_file, 0,
+ c->freeze_reshape);
+ } else if (c->readonly &&
+ sysfs_attribute_available(
+ content, NULL, "array_state")) {
+ rv = sysfs_set_str(content, NULL,
+ "array_state", "readonly");
+ } else
+#endif
+ rv = ioctl(mdfd, RUN_ARRAY, NULL); /* plain start; the only path in MDASSEMBLE builds */
+ reopen_mddev(mdfd); /* drop O_EXCL */
+ if (rv == 0) {
+ if (c->verbose >= 0) {
+ pr_err("%s has been started with %d drive%s",
+ mddev, okcnt, okcnt==1?"":"s");
+ if (okcnt < (unsigned)content->array.raid_disks)
+ fprintf(stderr, " (out of %d)", content->array.raid_disks);
+ if (rebuilding_cnt)
+ fprintf(stderr, "%s %d rebuilding", sparecnt?",":" and", rebuilding_cnt);
+ if (sparecnt)
+ fprintf(stderr, " and %d spare%s", sparecnt, sparecnt==1?"":"s");
+ if (content->journal_clean)
+ fprintf(stderr, " and %d journal", journalcnt);
+ fprintf(stderr, ".\n");
+ }
+ if (content->reshape_active &&
+ content->array.level >= 4 &&
+ content->array.level <= 6) {
+ /* might need to increase the size
+ * of the stripe cache - default is 256
+ */
+ int chunk_size = content->array.chunk_size;
+ if (content->reshape_active &&
+ content->new_chunk > chunk_size)
+ chunk_size = content->new_chunk;
+ if (256 < 4 * ((chunk_size+4065)/4096)) { /* NOTE(review): 4065 looks like a typo for 4095 (round up to 4096) - confirm */
+ struct mdinfo *sra = sysfs_read(mdfd, NULL, 0);
+ if (sra)
+ sysfs_set_num(sra, NULL,
+ "stripe_cache_size",
+ (4 * chunk_size / 4096) + 1);
+ sysfs_free(sra); /* also called when sra is NULL */
+ }
+ }
+ if (okcnt < (unsigned)content->array.raid_disks) {
+ /* If any devices did not get added
+ * because the kernel rejected them based
+ * on event count, try adding them
+ * again providing the action policy is
+ * 're-add' or greater. The bitmap
+ * might allow them to be included, or
+ * they will become spares.
+ */
+ for (i = 0; i < bestcnt; i++) {
+ int j = best[i];
+ if (j >= 0 && !devices[j].uptodate) {
+ if (!disk_action_allows(&devices[j].i, st->ss->name, act_re_add))
+ continue;
+ rv = add_disk(mdfd, st, content,
+ &devices[j].i);
+ if (rv == 0 && c->verbose >= 0)
+ pr_err("%s has been re-added.\n",
+ devices[j].devname);
+ }
+ }
+ }
+ if (content->array.level == 6 &&
+ okcnt + 1 == (unsigned)content->array.raid_disks &&
+ was_forced) { /* forced RAID6 with exactly one device missing: ask for a repair pass */
+ struct mdinfo *sra = sysfs_read(mdfd, NULL, 0);
+ if (sra)
+ sysfs_set_str(sra, NULL,
+ "sync_action", "repair");
+ sysfs_free(sra);
+ }
+ return 0; /* started; a failed re-add above does not change this */
+ }
+ pr_err("failed to RUN_ARRAY %s: %s\n",
+ mddev, strerror(errno)); /* NOTE(review): rv may come from the sysfs/Grow_continue path instead, and reopen_mddev() ran in between, so errno may not describe the failure - confirm */
+
+ if (!enough(content->array.level, content->array.raid_disks,
+ content->array.layout, 1, avail))
+ pr_err("Not enough devices to start the array.\n");
+ else if (!enough(content->array.level,
+ content->array.raid_disks,
+ content->array.layout, clean,
+ avail))
+ pr_err("Not enough devices to start the array while not clean - consider --force.\n");
+
+ return 1;
+ }
+ if (c->runstop == -1) { /* runstop of -1: leave the array assembled but do not start it */
+ pr_err("%s assembled from %d drive%s",
+ mddev, okcnt, okcnt==1?"":"s");
+ if (okcnt != (unsigned)content->array.raid_disks)
+ fprintf(stderr, " (out of %d)", content->array.raid_disks);
+ fprintf(stderr, ", but not started.\n");
+ return 2;
+ }
+ if (c->verbose >= -1) { /* not started: explain what is missing */
+ pr_err("%s assembled from %d drive%s", mddev, okcnt, okcnt==1?"":"s");
+ if (rebuilding_cnt)
+ fprintf(stderr, "%s %d rebuilding", sparecnt?",":" and", rebuilding_cnt);
+ if (sparecnt)
+ fprintf(stderr, " and %d spare%s", sparecnt, sparecnt==1?"":"s");
+ if (!enough(content->array.level, content->array.raid_disks,
+ content->array.layout, 1, avail))
+ fprintf(stderr, " - not enough to start the array.\n");
+ else if (!enough(content->array.level,
+ content->array.raid_disks,
+ content->array.layout, clean,
+ avail))
+ fprintf(stderr, " - not enough to start the array while not clean - consider --force.\n");
+ else {
+ if (req_cnt == (unsigned)content->array.raid_disks)
+ fprintf(stderr, " - need all %d to start it", req_cnt);
+ else
+ fprintf(stderr, " - need %d to start", req_cnt);
+ fprintf(stderr, " (use --run to insist).\n");
+ }
+ }
+ return 1;
+}
+
+int Assemble(struct supertype *st, char *mddev,
+ struct mddev_ident *ident,
+ struct mddev_dev *devlist,
+ struct context *c)
+{
+ /*
+ * The task of Assemble is to find a collection of
+ * devices that should (according to their superblocks)
+ * form an array, and to give this collection to the MD driver.
+ * In Linux-2.4 and later, this involves submitting a
+ * SET_ARRAY_INFO ioctl with no arg - to prepare
+ * the array - and then submit a number of
+ * ADD_NEW_DISK ioctls to add disks into
+ * the array. Finally RUN_ARRAY might
+ * be submitted to start the array.
+ *
+ * Much of the work of Assemble is in finding and/or
+ * checking the disks to make sure they look right.
+ *
+ * If mddev is not set, then scan must be set and we
+ * read through the config file for dev+uuid mapping
+ * We recurse, setting mddev, for each device that
+ * - isn't running
+ * - has a valid uuid (or any uuid if !uuidset)
+ *
+ * If mddev is set, we try to determine state of md.
+ * check version - must be at least 0.90.0
+ * check kernel version. must be at least 2.4.
+ * If not, we can possibly fall back on START_ARRAY
+ * Try to GET_ARRAY_INFO.
+ * If possible, give up
+ * If not, try to STOP_ARRAY just to make sure
+ *
+ * If !uuidset and scan, look in conf-file for uuid
+ * If not found, give up
+ * If !devlist and scan and uuidset, get list of devs from conf-file
+ *
+ * For each device:
+ * Check superblock - discard if bad
+ * Check uuid (set if we don't have one) - discard if no match
+ * Check superblock similarity if we have a superblock - discard if different
+ * Record events, devicenum
+ * This should give us a list of devices for the array
+ * We should collect the most recent event number
+ *
+ * Count disks with recent enough event count
+ * While force && !enough disks
+ * Choose newest rejected disks, update event count
+ * mark clean and rewrite superblock
+ * If recent kernel:
+ * SET_ARRAY_INFO
+ * foreach device with recent events : ADD_NEW_DISK
+ * if runstop == 1 || "enough" disks and runstop==0 -> RUN_ARRAY
+ * If old kernel:
+ * Check the device numbers in superblock are right
+ * update superblock if any changes
+ * START_ARRAY
+ *
+ */
+ int rv;
+ int mdfd;
+ int clean;
+ int auto_assem = (mddev == NULL && !ident->uuid_set &&
+ ident->super_minor == UnSet && ident->name[0] == 0
+ && (ident->container == NULL || ident->member == NULL));
+ struct devs *devices;
+ char *devmap;
+ int *best = NULL; /* indexed by raid_disk */
+ int bestcnt = 0;
+ int devcnt;
+ unsigned int okcnt, sparecnt, rebuilding_cnt, replcnt, journalcnt;
+ int journal_clean = 0;
+ int i;
+ int was_forced = 0;
+ int most_recent = 0;
+ int chosen_drive;
+ int change = 0;
+ int inargv = 0;
+ int start_partial_ok = (c->runstop >= 0) &&
+ (c->force || devlist==NULL || auto_assem);
+ int num_devs;
+ struct mddev_dev *tmpdev;
+ struct mdinfo info;
+ struct mdinfo *content = NULL;
+ struct mdinfo *pre_exist = NULL;
+ char *avail;
+ char *name = NULL;
+ char chosen_name[1024];
+ struct map_ent *map = NULL;
+ struct map_ent *mp;
+
+ /*
+ * If any subdevs are listed, then any that don't
+ * match ident are discarded. Remainder must all match and
+ * become the array.
+ * If no subdevs, then we scan all devices in the config file, but
+ * there must be something in the identity
+ */
+
+ if (!devlist &&
+ ident->uuid_set == 0 &&
+ (ident->super_minor < 0 || ident->super_minor == UnSet) &&
+ ident->name[0] == 0 &&
+ (ident->container == NULL || ident->member == NULL) &&
+ ident->devices == NULL) {
+ pr_err("No identity information available for %s - cannot assemble.\n",
+ mddev ? mddev : "further assembly");
+ return 1;
+ }
+
+ if (devlist == NULL)
+ devlist = conf_get_devs();
+ else if (mddev)
+ inargv = 1;
+
+try_again:
+ /* We come back here when doing auto-assembly and attempting some
+ * set of devices failed. Those are now marked as ->used==2 and
+ * we ignore them and try again
+ */
+ if (!st && ident->st)
+ st = ident->st;
+ if (c->verbose>0)
+ pr_err("looking for devices for %s\n",
+ mddev ? mddev : "further assembly");
+
+ content = &info;
+ if (st && c->force)
+ st->ignore_hw_compat = 1;
+ num_devs = select_devices(devlist, ident, &st, &content, c,
+ inargv, auto_assem);
+ if (num_devs < 0)
+ return 1;
+
+ if (!st || !st->sb || !content)
+ return 2;
+
+ /* We have a full set of devices - we now need to find the
+ * array device.
+ * However there is a risk that we are racing with "mdadm -I"
+ * and the array is already partially assembled - we will have
+ * rejected any devices already in this address.
+ * So we take a lock on the map file - to prevent further races -
+ * and look for the uuid in there. If found and the array is
+ * active, we abort. If found and the array is not active
+ * we commit to that md device and add all the contained devices
+ * to our list. We flag them so that we don't try to re-add,
+ * but can remove if they turn out to not be wanted.
+ */
+ if (map_lock(&map))
+ pr_err("failed to get exclusive lock on mapfile - continue anyway...\n");
+ if (c->update && strcmp(c->update,"uuid") == 0)
+ mp = NULL;
+ else
+ mp = map_by_uuid(&map, content->uuid);
+ if (mp) {
+ struct mdinfo *dv;
+ /* array already exists. */
+ pre_exist = sysfs_read(-1, mp->devnm, GET_LEVEL|GET_DEVS);
+ if (pre_exist->array.level != UnSet) {
+ pr_err("Found some drive for an array that is already active: %s\n",
+ mp->path);
+ pr_err("giving up.\n");
+ return 1;
+ }
+ for (dv = pre_exist->devs; dv; dv = dv->next) {
+ /* We want to add this device to our list,
+ * but it could already be there if "mdadm -I"
+ * started *after* we checked for O_EXCL.
+ * If we add it to the top of the list
+ * it will be preferred over later copies.
+ */
+ struct mddev_dev *newdev;
+ char *devname = map_dev(dv->disk.major,
+ dv->disk.minor,
+ 0);
+ if (!devname)
+ continue;
+ newdev = xmalloc(sizeof(*newdev));
+ newdev->devname = devname;
+ newdev->disposition = 'I';
+ newdev->used = 1;
+ newdev->next = devlist;
+ devlist = newdev;
+ num_devs++;
+ }
+ strcpy(chosen_name, mp->path);
+ if (c->verbose > 0 || mddev == NULL ||
+ strcmp(mddev, chosen_name) != 0)
+ pr_err("Merging with already-assembled %s\n",
+ chosen_name);
+ mdfd = open_dev_excl(mp->devnm);
+ } else {
+ int trustworthy = FOREIGN;
+ name = content->name;
+ switch (st->ss->match_home(st, c->homehost)
+ ?: st->ss->match_home(st, "any")) {
+ case 1:
+ trustworthy = LOCAL;
+ name = strchr(content->name, ':');
+ if (name)
+ name++;
+ else
+ name = content->name;
+ break;
+ }
+ if (!auto_assem)
+ /* If the array is listed in mdadm.conf or on
+ * command line, then we trust the name
+ * even if the array doesn't look local
+ */
+ trustworthy = LOCAL;
+
+ if (name[0] == 0 &&
+ content->array.level == LEVEL_CONTAINER) {
+ name = content->text_version;
+ trustworthy = METADATA;
+ }
+
+ if (name[0] && trustworthy != LOCAL &&
+ ! c->require_homehost &&
+ conf_name_is_free(name))
+ trustworthy = LOCAL;
+
+ if (trustworthy == LOCAL &&
+ strchr(name, ':'))
+ /* Ignore 'host:' prefix of name */
+ name = strchr(name, ':')+1;
+
+ mdfd = create_mddev(mddev, name, ident->autof, trustworthy,
+ chosen_name);
+ }
+ if (mdfd < 0) {
+ st->ss->free_super(st);
+ if (auto_assem)
+ goto try_again;
+ return 1;
+ }
+ mddev = chosen_name;
+ if (get_linux_version() < 2004000 ||
+ md_get_version(mdfd) < 9000) {
+ pr_err("Assemble requires Linux 2.4 or later, and\n"
+ " md driver version 0.90.0 or later.\n"
+ " Upgrade your kernel or try --build\n");
+ close(mdfd);
+ return 1;
+ }
+ if (pre_exist == NULL) {
+ if (mddev_busy(fd2devnm(mdfd))) {
+ pr_err("%s already active, cannot restart it!\n",
+ mddev);
+ for (tmpdev = devlist ;
+ tmpdev && tmpdev->used != 1;
+ tmpdev = tmpdev->next)
+ ;
+ if (tmpdev && auto_assem)
+ pr_err("%s needed for %s...\n",
+ mddev, tmpdev->devname);
+ close(mdfd);
+ mdfd = -3;
+ st->ss->free_super(st);
+ if (auto_assem)
+ goto try_again;
+ return 1;
+ }
+ /* just incase it was started but has no content */
+ ioctl(mdfd, STOP_ARRAY, NULL);
+ }
+
+#ifndef MDASSEMBLE
+ if (content != &info) {
+ /* This is a member of a container. Try starting the array. */
+ int err;
+ err = assemble_container_content(st, mdfd, content, c,
+ chosen_name, NULL);
+ close(mdfd);
+ return err;
+ }
+#endif
+ /* Ok, no bad inconsistancy, we can try updating etc */
+ devices = xcalloc(num_devs, sizeof(*devices));
+ devmap = xcalloc(num_devs, content->array.raid_disks);
+ devcnt = load_devices(devices, devmap, ident, &st, devlist,
+ c, content, mdfd, mddev,
+ &most_recent, &bestcnt, &best, inargv);
+ if (devcnt < 0)
+ return 1;
+
+ if (devcnt == 0) {
+ pr_err("no devices found for %s\n",
+ mddev);
+ if (st)
+ st->ss->free_super(st);
+ close(mdfd);
+ free(devices);
+ free(devmap);
+ return 1;
+ }
+
+ if (c->update && strcmp(c->update, "byteorder")==0)
+ st->minor_version = 90;
+
+ st->ss->getinfo_super(st, content, NULL);
+ clean = content->array.state & 1;
+
+ /* now we have some devices that might be suitable.
+ * I wonder how many
+ */
+ avail = xcalloc(content->array.raid_disks, 1);
+ okcnt = 0;
+ replcnt = 0;
+ sparecnt=0;
+ journalcnt=0;
+ rebuilding_cnt=0;
+ for (i=0; i< bestcnt; i++) {
+ int j = best[i];
+ int event_margin = 1; /* always allow a difference of '1'
+ * like the kernel does
+ */
+ if (j < 0) continue;
+ /* note: we ignore error flags in multipath arrays
+ * as they don't make sense
+ */
+ if (content->array.level != LEVEL_MULTIPATH) {
+ if (devices[j].i.disk.state & (1<<MD_DISK_JOURNAL)) {
+ if (content->journal_device_required)
+ journalcnt++;
+ else /* unexpected journal, mark as faulty */
+ devices[j].i.disk.state |= (1<<MD_DISK_FAULTY);
+ } else if (!(devices[j].i.disk.state & (1<<MD_DISK_ACTIVE))) {
+ if (!(devices[j].i.disk.state
+ & (1<<MD_DISK_FAULTY))) {
+ devices[j].uptodate = 1;
+ sparecnt++;
+ }
+ continue;
+ }
+ }
+ /* If this device thinks that 'most_recent' has failed, then
+ * we must reject this device.
+ */
+ if (j != most_recent && !c->force &&
+ content->array.raid_disks > 0 &&
+ devices[most_recent].i.disk.raid_disk >= 0 &&
+ devmap[j * content->array.raid_disks + devices[most_recent].i.disk.raid_disk] == 0) {
+ if (c->verbose > -1)
+ pr_err("ignoring %s as it reports %s as failed\n",
+ devices[j].devname, devices[most_recent].devname);
+ best[i] = -1;
+ continue;
+ }
+ /* Require event counter to be same as, or just less than,
+ * most recent. If it is bigger, it must be a stray spare and
+ * should be ignored.
+ */
+ if (devices[j].i.events+event_margin >=
+ devices[most_recent].i.events &&
+ devices[j].i.events <=
+ devices[most_recent].i.events
+ ) {
+ devices[j].uptodate = 1;
+ if (devices[j].i.disk.state & (1<<MD_DISK_JOURNAL))
+ journal_clean = 1;
+ if (i < content->array.raid_disks * 2) {
+ if (devices[j].i.recovery_start == MaxSector ||
+ (content->reshape_active &&
+ i >= content->array.raid_disks - content->delta_disks)) {
+ if (!avail[i/2]) {
+ okcnt++;
+ avail[i/2]=1;
+ } else
+ replcnt++;
+ } else
+ rebuilding_cnt++;
+ } else if (devices[j].i.disk.raid_disk != MD_DISK_ROLE_JOURNAL)
+ sparecnt++;
+ }
+ }
+ free(devmap);
+ if (c->force) {
+ int force_ok = force_array(content, devices, best, bestcnt,
+ avail, most_recent, st, c);
+ okcnt += force_ok;
+ if (force_ok)
+ was_forced = 1;
+ }
+ /* Now we want to look at the superblock which the kernel will base things on
+ * and compare the devices that we think are working with the devices that the
+ * superblock thinks are working.
+ * If there are differences and --force is given, then update this chosen
+ * superblock.
+ */
+ chosen_drive = -1;
+ st->ss->free_super(st);
+ for (i=0; chosen_drive < 0 && i<bestcnt; i+=2) {
+ int j = best[i];
+ int fd;
+
+ if (j<0)
+ continue;
+ if (!devices[j].uptodate)
+ continue;
+ if (devices[j].i.events < devices[most_recent].i.events)
+ continue;
+ chosen_drive = j;
+ if ((fd=dev_open(devices[j].devname,
+ devices[j].included ? O_RDONLY
+ : (O_RDONLY|O_EXCL)))< 0) {
+ pr_err("Cannot open %s: %s\n",
+ devices[j].devname, strerror(errno));
+ close(mdfd);
+ free(devices);
+ return 1;
+ }
+ if (st->ss->load_super(st,fd, NULL)) {
+ close(fd);
+ pr_err("RAID superblock has disappeared from %s\n",
+ devices[j].devname);
+ close(mdfd);
+ free(devices);
+ return 1;
+ }
+ close(fd);
+ }
+ if (st->sb == NULL) {
+ pr_err("No suitable drives found for %s\n", mddev);
+ close(mdfd);
+ free(devices);
+ return 1;
+ }
+ st->ss->getinfo_super(st, content, NULL);
+#ifndef MDASSEMBLE
+ sysfs_init(content, mdfd, NULL);
+#endif
+ /* after reload context, store journal_clean in context */
+ content->journal_clean = journal_clean;
+ for (i=0; i<bestcnt; i++) {
+ int j = best[i];
+ unsigned int desired_state;
+
+ if (devices[j].i.disk.raid_disk == MD_DISK_ROLE_JOURNAL)
+ desired_state = (1<<MD_DISK_JOURNAL);
+ else if (i >= content->array.raid_disks * 2)
+ desired_state = 0;
+ else if (i & 1)
+ desired_state = (1<<MD_DISK_ACTIVE) | (1<<MD_DISK_REPLACEMENT);
+ else
+ desired_state = (1<<MD_DISK_ACTIVE) | (1<<MD_DISK_SYNC);
+
+ if (j<0)
+ continue;
+ if (!devices[j].uptodate)
+ continue;
+
+ devices[j].i.disk.state = desired_state;
+ if (!(devices[j].i.array.state & 1))
+ clean = 0;
+
+ if (st->ss->update_super(st, &devices[j].i, "assemble", NULL,
+ c->verbose, 0, NULL)) {
+ if (c->force) {
+ if (c->verbose >= 0)
+ pr_err("clearing FAULTY flag for device %d in %s for %s\n",
+ j, mddev, devices[j].devname);
+ change = 1;
+ } else {
+ if (c->verbose >= -1)
+ pr_err("device %d in %s has wrong state in superblock, but %s seems ok\n",
+ i, mddev, devices[j].devname);
+ }
+ }
+#if 0
+ if (!(super.disks[i].i.disk.state & (1 << MD_DISK_FAULTY))) {
+ pr_err("devices %d of %s is not marked FAULTY in superblock, but cannot be found\n",
+ i, mddev);
+ }
+#endif
+ }
+ if (c->force && !clean &&
+ !enough(content->array.level, content->array.raid_disks,
+ content->array.layout, clean,
+ avail)) {
+ change += st->ss->update_super(st, content, "force-array",
+ devices[chosen_drive].devname, c->verbose,
+ 0, NULL);
+ was_forced = 1;
+ clean = 1;
+ }
+
+ if (change) {
+ int fd;
+ fd = dev_open(devices[chosen_drive].devname,
+ devices[chosen_drive].included ?
+ O_RDWR : (O_RDWR|O_EXCL));
+ if (fd < 0) {
+ pr_err("Could not open %s for write - cannot Assemble array.\n",
+ devices[chosen_drive].devname);
+ close(mdfd);
+ free(devices);
+ return 1;
+ }
+ if (st->ss->store_super(st, fd)) {
+ close(fd);
+ pr_err("Could not re-write superblock on %s\n",
+ devices[chosen_drive].devname);
+ close(mdfd);
+ free(devices);
+ return 1;
+ }
+ if (c->verbose >= 0)
+ pr_err("Marking array %s as 'clean'\n",
+ mddev);
+ close(fd);
+ }
+
+ /* If we are in the middle of a reshape we may need to restore saved data
+ * that was moved aside due to the reshape overwriting live data
+ * The code of doing this lives in Grow.c
+ */
+#ifndef MDASSEMBLE
+ if (content->reshape_active &&
+ !(content->reshape_active & RESHAPE_NO_BACKUP)) {
+ int err = 0;
+ int *fdlist = xmalloc(sizeof(int)* bestcnt);
+ if (c->verbose > 0)
+ pr_err("%s has an active reshape - checking if critical section needs to be restored\n",
+ chosen_name);
+ if (!c->backup_file)
+ c->backup_file = locate_backup(content->sys_name);
+ enable_fds(bestcnt/2);
+ for (i = 0; i < bestcnt/2; i++) {
+ int j = best[i*2];
+ if (j >= 0) {
+ fdlist[i] = dev_open(devices[j].devname,
+ devices[j].included
+ ? O_RDWR : (O_RDWR|O_EXCL));
+ if (fdlist[i] < 0) {
+ pr_err("Could not open %s for write - cannot Assemble array.\n",
+ devices[j].devname);
+ err = 1;
+ break;
+ }
+ } else
+ fdlist[i] = -1;
+ }
+ if (!err) {
+ if (st->ss->external && st->ss->recover_backup)
+ err = st->ss->recover_backup(st, content);
+ else
+ err = Grow_restart(st, content, fdlist, bestcnt/2,
+ c->backup_file, c->verbose > 0);
+ if (err && c->invalid_backup) {
+ if (c->verbose > 0)
+ pr_err("continuing without restoring backup\n");
+ err = 0;
+ }
+ }
+ while (i>0) {
+ i--;
+ if (fdlist[i]>=0) close(fdlist[i]);
+ }
+ free(fdlist);
+ if (err) {
+ pr_err("Failed to restore critical section for reshape, sorry.\n");
+ if (c->backup_file == NULL)
+ cont_err("Possibly you needed to specify the --backup-file\n");
+ close(mdfd);
+ free(devices);
+ return err;
+ }
+ }
+#endif
+
+ /* Almost ready to actually *do* something */
+ /* First, fill in the map, so that udev can find our name
+ * as soon as we become active.
+ */
+ if (c->update && strcmp(c->update, "metadata")==0) {
+ content->array.major_version = 1;
+ content->array.minor_version = 0;
+ strcpy(content->text_version, "1.0");
+ }
+
+ map_update(&map, fd2devnm(mdfd), content->text_version,
+ content->uuid, chosen_name);
+
+ rv = start_array(mdfd, mddev, content,
+ st, ident, best, bestcnt,
+ chosen_drive, devices, okcnt, sparecnt,
+ rebuilding_cnt, journalcnt,
+ c,
+ clean, avail, start_partial_ok,
+ pre_exist != NULL,
+ was_forced);
+ if (rv == 1 && !pre_exist)
+ ioctl(mdfd, STOP_ARRAY, NULL);
+ free(devices);
+ map_unlock(&map);
+ if (rv == 0) {
+ wait_for(chosen_name, mdfd);
+ close(mdfd);
+ if (auto_assem) {
+ int usecs = 1;
+ /* There is a nasty race with 'mdadm --monitor'.
+ * If it opens this device before we close it,
+ * it gets an incomplete open on which IO
+ * doesn't work and the capacity is
+ * wrong.
+ * If we reopen (to check for layered devices)
+ * before --monitor closes, we loose.
+ *
+ * So: wait upto 1 second for there to be
+ * a non-zero capacity.
+ */
+ while (usecs < 1000) {
+ mdfd = open(mddev, O_RDONLY);
+ if (mdfd >= 0) {
+ unsigned long long size;
+ if (get_dev_size(mdfd, NULL, &size) &&
+ size > 0)
+ break;
+ close(mdfd);
+ }
+ usleep(usecs);
+ usecs <<= 1;
+ }
+ }
+ } else
+ close(mdfd);
+
+ /* '2' means 'OK, but not started yet' */
+ return rv == 2 ? 0 : rv;
+}
+
+#ifndef MDASSEMBLE
+/*
+ * Assemble one member array of a container and try to start it.
+ *
+ * st          - metadata handler; st->ss->external and st->container_devnm
+ *               are consulted, and st->update_tail may be set before a
+ *               reshape is continued.
+ * mdfd        - open descriptor of the md device being assembled.
+ * content     - description of the member array; its device list
+ *               (content->devs) is added to the array through sysfs.
+ * c           - run-time options (readonly, runstop, export, verbose,
+ *               backup_file, freeze_reshape).
+ * chosen_name - name used in messages, in the map entry and by wait_for().
+ * result      - may be NULL.  When it is set and c->export is on, one of
+ *               INCR_NO, INCR_UNSAFE or INCR_YES is OR-ed into *result
+ *               instead of printing a message.
+ *
+ * Returns 0 when the array was started.  Returns 1 on the early failure
+ * paths, or the non-zero status of sysfs_set_str()/Grow_continue() when
+ * starting the array failed.
+ */
+int assemble_container_content(struct supertype *st, int mdfd,
+			       struct mdinfo *content, struct context *c,
+			       char *chosen_name, int *result)
+{
+	struct mdinfo *dev, *sra, *dev2;
+	int working = 0, preexist = 0;
+	int expansion = 0;
+	struct map_ent *map = NULL;
+	int old_raid_disks;
+	int start_reshape;
+	char *avail = NULL;
+	int err;
+
+	sysfs_init(content, mdfd, NULL);
+
+	/* (Re)configure the array through sysfs unless it already reports
+	 * the metadata version we are about to assemble.
+	 */
+	sra = sysfs_read(mdfd, NULL, GET_VERSION|GET_DEVS);
+	if (sra == NULL || strcmp(sra->text_version, content->text_version) != 0) {
+		if (content->array.major_version == -1 &&
+		    content->array.minor_version == -2 &&
+		    c->readonly &&
+		    content->text_version[0] == '/')
+			content->text_version[0] = '-';
+		if (sysfs_set_array(content, md_get_version(mdfd)) != 0) {
+			if (sra)
+				sysfs_free(sra);
+			return 1;
+		}
+	}
+
+	/* There are two types of reshape: container wide or sub-array specific
+	 * Check if metadata requests blocking container wide reshapes
+	 */
+	start_reshape = (content->reshape_active &&
+		!((content->reshape_active == CONTAINER_RESHAPE) &&
+		  (content->array.state & (1<<MD_SB_BLOCK_CONTAINER_RESHAPE))));
+
+	/* Block subarray here if it is under reshape now
+	 * Do not allow for any changes in this array
+	 */
+	if (st->ss->external && content->recovery_blocked && start_reshape)
+		block_subarray(content);
+
+	/* Drop devices that are currently attached but are not listed in
+	 * 'content'.  sra is NULL when sysfs_read() failed; in that case
+	 * there is no list of attached devices to prune, so the loop is
+	 * skipped rather than dereferencing a NULL pointer.
+	 */
+	for (dev2 = sra ? sra->devs : NULL; dev2; dev2 = dev2->next) {
+		for (dev = content->devs; dev; dev = dev->next)
+			if (dev2->disk.major == dev->disk.major &&
+			    dev2->disk.minor == dev->disk.minor)
+				break;
+		if (dev)
+			continue;
+		/* Don't want this one any more */
+		if (sysfs_set_str(sra, dev2, "slot", "none") < 0 &&
+		    errno == EBUSY) {
+			pr_err("Cannot remove old device %s: not updating %s\n", dev2->sys_name, sra->sys_name);
+			sysfs_free(sra);
+			return 1;
+		}
+		sysfs_set_str(sra, dev2, "state", "remove");
+	}
+	old_raid_disks = content->array.raid_disks - content->delta_disks;
+	avail = xcalloc(content->array.raid_disks, 1);
+	/* Add every device from 'content' and count how each one fared:
+	 * newly added (working), newly added in a slot beyond the
+	 * pre-reshape disk count (expansion), or already present (preexist).
+	 */
+	for (dev = content->devs; dev; dev = dev->next) {
+		if (dev->disk.raid_disk >= 0)
+			avail[dev->disk.raid_disk] = 1;
+		if (sysfs_add_disk(content, dev, 1) == 0) {
+			if (dev->disk.raid_disk >= old_raid_disks &&
+			    content->reshape_active)
+				expansion++;
+			else
+				working++;
+		} else if (errno == EEXIST)
+			preexist++;
+	}
+	if (sra)
+		sysfs_free(sra);
+	if (working + expansion == 0 && c->runstop <= 0) {
+		free(avail);
+		return 1;/* Nothing new, don't try to start */
+	}
+	map_update(&map, fd2devnm(mdfd),
+		   content->text_version,
+		   content->uuid, chosen_name);
+
+	if (enough(content->array.level, content->array.raid_disks,
+		   content->array.layout, content->array.state & 1, avail) == 0) {
+		if (c->export && result)
+			*result |= INCR_NO;
+		else if (c->verbose >= 0) {
+			pr_err("%s assembled with %d device%s",
+			       chosen_name, preexist + working,
+			       preexist + working == 1 ? "":"s");
+			if (preexist)
+				fprintf(stderr, " (%d new)", working);
+			fprintf(stderr, " but not started\n");
+		}
+		free(avail);
+		return 1;
+	}
+	free(avail);
+
+	/* Unless a run was explicitly requested (c->runstop > 0), refuse to
+	 * start with fewer devices than the metadata counts as working.
+	 */
+	if (c->runstop <= 0 &&
+	    (working + preexist + expansion) <
+	    content->array.working_disks) {
+		if (c->export && result)
+			*result |= INCR_UNSAFE;
+		else if (c->verbose >= 0) {
+			pr_err("%s assembled with %d device%s",
+			       chosen_name, preexist + working,
+			       preexist + working == 1 ? "":"s");
+			if (preexist)
+				fprintf(stderr, " (%d new)", working);
+			fprintf(stderr, " but not safe to start\n");
+		}
+		return 1;
+	}
+
+	if (start_reshape) {
+		int spare = content->array.raid_disks + expansion;
+		if (restore_backup(st, content,
+				   working,
+				   spare, &c->backup_file, c->verbose) == 1)
+			return 1;
+
+		err = sysfs_set_str(content, NULL,
+				    "array_state", "readonly");
+		if (err)
+			return 1;
+
+		if (st->ss->external) {
+			if (!mdmon_running(st->container_devnm))
+				start_mdmon(st->container_devnm);
+			ping_monitor(st->container_devnm);
+			if (mdmon_running(st->container_devnm) &&
+			    st->update_tail == NULL)
+				st->update_tail = &st->updates;
+		}
+
+		err = Grow_continue(mdfd, st, content, c->backup_file,
+				    0, c->freeze_reshape);
+	} else switch(content->array.level) {
+	case LEVEL_LINEAR:
+	case LEVEL_MULTIPATH:
+	case 0:
+		err = sysfs_set_str(content, NULL, "array_state",
+				    c->readonly ? "readonly" : "active");
+		break;
+	default:
+		err = sysfs_set_str(content, NULL, "array_state",
+				    "readonly");
+		/* start mdmon if needed. */
+		if (!err) {
+			if (!mdmon_running(st->container_devnm))
+				start_mdmon(st->container_devnm);
+			ping_monitor(st->container_devnm);
+		}
+		break;
+	}
+	if (!err)
+		sysfs_set_safemode(content, content->safe_mode_delay);
+
+	/* Block subarray here if it is not reshaped now
+	 * It has be blocked a little later to allow mdmon to switch in
+	 * in to R/W state
+	 */
+	if (st->ss->external && content->recovery_blocked &&
+	    !start_reshape)
+		block_subarray(content);
+
+	if (c->export && result) {
+		if (err)
+			*result |= INCR_NO;
+		else
+			*result |= INCR_YES;
+	} else if (c->verbose >= 0) {
+		if (err)
+			pr_err("array %s now has %d device%s",
+			       chosen_name, working + preexist,
+			       working + preexist == 1 ? "":"s");
+		else
+			pr_err("Started %s with %d device%s",
+			       chosen_name, working + preexist,
+			       working + preexist == 1 ? "":"s");
+		if (preexist)
+			fprintf(stderr, " (%d new)", working);
+		if (expansion)
+			fprintf(stderr, " ( + %d for expansion)",
+				expansion);
+		fprintf(stderr, "\n");
+	}
+	if (!err)
+		wait_for(chosen_name, mdfd);
+	return err;
+	/* FIXME should have an O_EXCL and wait for read-auto */
+}
+#endif
diff --git a/Build.c b/Build.c
new file mode 100644
index 0000000..8603c71
--- /dev/null
+++ b/Build.c
@@ -0,0 +1,292 @@
+/*
+ * mdadm - manage Linux "md" devices aka RAID arrays.
+ *
+ * Copyright (C) 2001-2009 Neil Brown <neilb@suse.de>
+ *
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * Author: Neil Brown
+ * Email: <neilb@suse.de>
+ */
+
+#include "mdadm.h"
+
+#define REGISTER_DEV _IO (MD_MAJOR, 1)
+#define START_MD _IO (MD_MAJOR, 2)
+#define STOP_MD _IO (MD_MAJOR, 3)
+
+int Build(char *mddev, struct mddev_dev *devlist,
+ struct shape *s, struct context *c)
+{
+ /* Build a linear or raid0 array without superblocks.
+ * We cannot really do any checks, we just do it.
+ * For md_version < 0.90.0, we call REGISTER_DEV
+ * with the device numbers, and then
+ * START_MD giving the "geometry"
+ * geometry is 0xpp00cc
+ * where pp is personality: 1==linear, 2=raid0
+ * cc = chunk size factor: 0==4k, 1==8k etc.
+ *
+ * For md_version >= 0.90.0 we call
+ * SET_ARRAY_INFO, ADD_NEW_DISK, RUN_ARRAY
+ *
+ * mddev   - requested device name, handed to create_mddev(); after
+ *           that the name written into chosen_name is used instead.
+ * devlist - member devices; an entry named "missing" keeps its slot
+ *           number but is not added to the array.
+ * s       - array shape; layout, size, chunk and bitmap_file may be
+ *           modified by this function.
+ * c       - run-time options (verbose, autof, delay).
+ *
+ * Returns 0 once the array has been started, 1 on any failure.
+ */
+ int i;
+ int vers;
+ struct stat stb;
+ int subdevs = 0, missing_disks = 0; /* entries in devlist / those named "missing" */
+ struct mddev_dev *dv;
+ int bitmap_fd;
+ unsigned long long bitmapsize;
+ int mdfd;
+ char chosen_name[1024];
+ int uuid[4] = {0,0,0,0}; /* all-zero uuid recorded in the map entry */
+ struct map_ent *map = NULL;
+
+ /* scan all devices, make sure they really are block devices */
+ for (dv = devlist; dv; dv=dv->next) {
+ subdevs++;
+ if (strcmp("missing", dv->devname) == 0) {
+ missing_disks++;
+ continue;
+ }
+ if (stat(dv->devname, &stb)) {
+ pr_err("Cannot find %s: %s\n",
+ dv->devname, strerror(errno));
+ return 1;
+ }
+ if ((stb.st_mode & S_IFMT) != S_IFBLK) {
+ pr_err("%s is not a block device.\n",
+ dv->devname);
+ return 1;
+ }
+ }
+
+ if (s->raiddisks != subdevs) { /* "missing" entries are included in subdevs */
+ pr_err("requested %d devices in array but listed %d\n",
+ s->raiddisks, subdevs);
+ return 1;
+ }
+
+ if (s->layout == UnSet) /* no layout requested: pick a per-level default */
+ switch(s->level) {
+ default: /* no layout */
+ s->layout = 0;
+ break;
+ case 10:
+ s->layout = 0x102; /* near=2, far=1 */
+ if (c->verbose > 0)
+ pr_err("layout defaults to n1\n");
+ break;
+ case 5:
+ case 6:
+ s->layout = map_name(r5layout, "default");
+ if (c->verbose > 0)
+ pr_err("layout defaults to %s\n", map_num(r5layout, s->layout));
+ break;
+ case LEVEL_FAULTY:
+ s->layout = map_name(faultylayout, "default");
+
+ if (c->verbose > 0)
+ pr_err("layout defaults to %s\n", map_num(faultylayout, s->layout));
+ break;
+ }
+
+ /* We need to create the device. It can have no name.
+ * The map lock is taken before create_mddev() and released either
+ * on failure or right after the new device has been recorded with
+ * map_update().
+ */
+ map_lock(&map);
+ mdfd = create_mddev(mddev, NULL, c->autof, LOCAL,
+ chosen_name);
+ if (mdfd < 0) {
+ map_unlock(&map);
+ return 1;
+ }
+ mddev = chosen_name; /* later messages and wait_for() use the chosen name */
+
+ map_update(&map, fd2devnm(mdfd), "none", uuid, chosen_name);
+ map_unlock(&map);
+
+ vers = md_get_version(mdfd);
+
+ /* looks Ok, go for it */
+ if (vers >= 9000) {
+ mdu_array_info_t array;
+ array.level = s->level;
+ if (s->size == MAX_SIZE) /* MAX_SIZE is handed to the kernel as 0 */
+ s->size = 0;
+ array.size = s->size;
+ array.nr_disks = s->raiddisks;
+ array.raid_disks = s->raiddisks;
+ array.md_minor = 0;
+ if (fstat(mdfd, &stb)==0)
+ array.md_minor = minor(stb.st_rdev);
+ array.not_persistent = 1;
+ array.state = 0; /* not clean, but no errors */
+ if (s->assume_clean)
+ array.state |= 1;
+ array.active_disks = s->raiddisks - missing_disks;
+ array.working_disks = s->raiddisks - missing_disks;
+ array.spare_disks = 0;
+ array.failed_disks = missing_disks;
+ if (s->chunk == 0 && (s->level==0 || s->level==LEVEL_LINEAR))
+ s->chunk = 64; /* default for raid0/linear; scaled by 1024 just below */
+ array.chunk_size = s->chunk*1024;
+ array.layout = s->layout;
+ if (ioctl(mdfd, SET_ARRAY_INFO, &array)) {
+ pr_err("SET_ARRAY_INFO failed for %s: %s\n",
+ mddev, strerror(errno));
+ goto abort;
+ }
+ } else if (s->bitmap_file) { /* vers < 9000 and a bitmap was requested */
+ pr_err("bitmaps not supported with this kernel\n");
+ goto abort;
+ }
+
+ if (s->bitmap_file && strcmp(s->bitmap_file, "none") == 0)
+ s->bitmap_file = NULL; /* "none" is treated as no bitmap at all */
+ if (s->bitmap_file && s->level <= 0) {
+ pr_err("bitmaps not meaningful with level %s\n",
+ map_num(pers, s->level)?:"given");
+ goto abort;
+ }
+ /* now add the devices.
+ * Each device is stat()ed again, as it may have changed since the
+ * scan at the top of the function.  'i' is the slot number and also
+ * advances for "missing" entries, so their slots stay empty.
+ */
+ for ((i=0), (dv = devlist) ; dv ; i++, dv=dv->next) {
+ unsigned long long dsize;
+ int fd;
+ if (strcmp("missing", dv->devname) == 0)
+ continue;
+ if (stat(dv->devname, &stb)) {
+ pr_err("Weird: %s has disappeared.\n",
+ dv->devname);
+ goto abort;
+ }
+ if ((stb.st_mode & S_IFMT)!= S_IFBLK) {
+ pr_err("Weird: %s is no longer a block device.\n",
+ dv->devname);
+ goto abort;
+ }
+ fd = open(dv->devname, O_RDONLY|O_EXCL);
+ if (fd < 0) {
+ pr_err("Cannot open %s: %s\n",
+ dv->devname, strerror(errno));
+ goto abort;
+ }
+ if (get_dev_size(fd, NULL, &dsize) &&
+ (s->size == 0 || s->size == MAX_SIZE || dsize < s->size))
+ s->size = dsize; /* s->size was unset or larger: take this device's size */
+ close(fd);
+ if (vers >= 9000) {
+ mdu_disk_info_t disk;
+ disk.number = i;
+ disk.raid_disk = i;
+ disk.state = (1<<MD_DISK_SYNC) | (1<<MD_DISK_ACTIVE);
+ if (dv->writemostly == 1)
+ disk.state |= 1<<MD_DISK_WRITEMOSTLY;
+ disk.major = major(stb.st_rdev);
+ disk.minor = minor(stb.st_rdev);
+ if (ioctl(mdfd, ADD_NEW_DISK, &disk)) {
+ pr_err("ADD_NEW_DISK failed for %s: %s\n",
+ dv->devname, strerror(errno));
+ goto abort;
+ }
+ } else {
+ if (ioctl(mdfd, REGISTER_DEV, &stb.st_rdev)) {
+ pr_err("REGISTER_DEV failed for %s: %s.\n",
+ dv->devname, strerror(errno));
+ goto abort;
+ }
+ }
+ }
+ /* now to start it */
+ if (vers >= 9000) {
+ mdu_param_t param; /* not used by syscall */
+ if (s->bitmap_file) {
+ bitmap_fd = open(s->bitmap_file, O_RDWR);
+ if (bitmap_fd < 0) { /* could not open: create the bitmap file, then open it again */
+ int major = BITMAP_MAJOR_HI;
+#if 0
+ if (s->bitmap_chunk == UnSet) {
+ pr_err("%s cannot be openned.",
+ s->bitmap_file);
+ goto abort;
+ }
+#endif
+ if (vers < 9003) {
+ major = BITMAP_MAJOR_HOSTENDIAN;
+#ifdef __BIG_ENDIAN
+ pr_err("Warning - bitmaps created on this kernel are not portable\n"
+ " between different architectures. Consider upgrading the Linux kernel.\n");
+#endif
+ }
+ bitmapsize = s->size>>9; /* FIXME wrong for RAID10 */
+ if (CreateBitmap(s->bitmap_file, 1, NULL, s->bitmap_chunk,
+ c->delay, s->write_behind, bitmapsize, major)) {
+ goto abort;
+ }
+ bitmap_fd = open(s->bitmap_file, O_RDWR);
+ if (bitmap_fd < 0) {
+ pr_err("%s cannot be openned.",
+ s->bitmap_file);
+ goto abort;
+ }
+ }
+ if (bitmap_fd >= 0) { /* always true here: every failure above jumps to abort.
+ * NOTE(review): bitmap_fd is never closed in this
+ * function - confirm whether that is intentional.
+ */
+ if (ioctl(mdfd, SET_BITMAP_FILE, bitmap_fd) < 0) {
+ pr_err("Cannot set bitmap file for %s: %s\n",
+ mddev, strerror(errno));
+ goto abort;
+ }
+ }
+ }
+ if (ioctl(mdfd, RUN_ARRAY, &param)) {
+ pr_err("RUN_ARRAY failed: %s\n",
+ strerror(errno));
+ if (s->chunk & (s->chunk-1)) { /* non-zero when chunk is not a power of 2 */
+ cont_err("Problem may be that chunk size is not a power of 2\n");
+ }
+ goto abort;
+ }
+ } else {
+ unsigned long arg;
+ arg=0;
+ while (s->chunk > 4096) { /* count the halvings: this becomes the cc geometry factor */
+ arg++;
+ s->chunk >>= 1;
+ }
+ if (s->level == 0) /* pp = 2 for level 0, pp = 1 for anything else */
+ arg |= 0x20000;
+ else
+ arg |= 0x10000;
+ if (ioctl(mdfd, START_MD, arg)) {
+ pr_err("START_MD failed: %s\n",
+ strerror(errno));
+ goto abort;
+ }
+ }
+ if (c->verbose >= 0)
+ pr_err("array %s built and started.\n",
+ mddev);
+ wait_for(mddev, mdfd);
+ close(mdfd);
+ return 0;
+
+ abort: /* common failure exit: stop the array with the ioctl matching the driver version */
+ if (vers >= 9000)
+ ioctl(mdfd, STOP_ARRAY, 0);
+ else
+ ioctl(mdfd, STOP_MD, 0);
+ close(mdfd);
+ return 1;
+}
diff --git a/COPYING b/COPYING
new file mode 100644
index 0000000..d159169
--- /dev/null
+++ b/COPYING
@@ -0,0 +1,339 @@
+ GNU GENERAL PUBLIC LICENSE
+ Version 2, June 1991
+
+ Copyright (C) 1989, 1991 Free Software Foundation, Inc.,
+ 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+ Preamble
+
+ The licenses for most software are designed to take away your
+freedom to share and change it. By contrast, the GNU General Public
+License is intended to guarantee your freedom to share and change free
+software--to make sure the software is free for all its users. This
+General Public License applies to most of the Free Software
+Foundation's software and to any other program whose authors commit to
+using it. (Some other Free Software Foundation software is covered by
+the GNU Lesser General Public License instead.) You can apply it to
+your programs, too.
+
+ When we speak of free software, we are referring to freedom, not
+price. Our General Public Licenses are designed to make sure that you
+have the freedom to distribute copies of free software (and charge for
+this service if you wish), that you receive source code or can get it
+if you want it, that you can change the software or use pieces of it
+in new free programs; and that you know you can do these things.
+
+ To protect your rights, we need to make restrictions that forbid
+anyone to deny you these rights or to ask you to surrender the rights.
+These restrictions translate to certain responsibilities for you if you
+distribute copies of the software, or if you modify it.
+
+ For example, if you distribute copies of such a program, whether
+gratis or for a fee, you must give the recipients all the rights that
+you have. You must make sure that they, too, receive or can get the
+source code. And you must show them these terms so they know their
+rights.
+
+ We protect your rights with two steps: (1) copyright the software, and
+(2) offer you this license which gives you legal permission to copy,
+distribute and/or modify the software.
+
+ Also, for each author's protection and ours, we want to make certain
+that everyone understands that there is no warranty for this free
+software. If the software is modified by someone else and passed on, we
+want its recipients to know that what they have is not the original, so
+that any problems introduced by others will not reflect on the original
+authors' reputations.
+
+ Finally, any free program is threatened constantly by software
+patents. We wish to avoid the danger that redistributors of a free
+program will individually obtain patent licenses, in effect making the
+program proprietary. To prevent this, we have made it clear that any
+patent must be licensed for everyone's free use or not licensed at all.
+
+ The precise terms and conditions for copying, distribution and
+modification follow.
+
+ GNU GENERAL PUBLIC LICENSE
+ TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
+
+ 0. This License applies to any program or other work which contains
+a notice placed by the copyright holder saying it may be distributed
+under the terms of this General Public License. The "Program", below,
+refers to any such program or work, and a "work based on the Program"
+means either the Program or any derivative work under copyright law:
+that is to say, a work containing the Program or a portion of it,
+either verbatim or with modifications and/or translated into another
+language. (Hereinafter, translation is included without limitation in
+the term "modification".) Each licensee is addressed as "you".
+
+Activities other than copying, distribution and modification are not
+covered by this License; they are outside its scope. The act of
+running the Program is not restricted, and the output from the Program
+is covered only if its contents constitute a work based on the
+Program (independent of having been made by running the Program).
+Whether that is true depends on what the Program does.
+
+ 1. You may copy and distribute verbatim copies of the Program's
+source code as you receive it, in any medium, provided that you
+conspicuously and appropriately publish on each copy an appropriate
+copyright notice and disclaimer of warranty; keep intact all the
+notices that refer to this License and to the absence of any warranty;
+and give any other recipients of the Program a copy of this License
+along with the Program.
+
+You may charge a fee for the physical act of transferring a copy, and
+you may at your option offer warranty protection in exchange for a fee.
+
+ 2. You may modify your copy or copies of the Program or any portion
+of it, thus forming a work based on the Program, and copy and
+distribute such modifications or work under the terms of Section 1
+above, provided that you also meet all of these conditions:
+
+ a) You must cause the modified files to carry prominent notices
+ stating that you changed the files and the date of any change.
+
+ b) You must cause any work that you distribute or publish, that in
+ whole or in part contains or is derived from the Program or any
+ part thereof, to be licensed as a whole at no charge to all third
+ parties under the terms of this License.
+
+ c) If the modified program normally reads commands interactively
+ when run, you must cause it, when started running for such
+ interactive use in the most ordinary way, to print or display an
+ announcement including an appropriate copyright notice and a
+ notice that there is no warranty (or else, saying that you provide
+ a warranty) and that users may redistribute the program under
+ these conditions, and telling the user how to view a copy of this
+ License. (Exception: if the Program itself is interactive but
+ does not normally print such an announcement, your work based on
+ the Program is not required to print an announcement.)
+
+These requirements apply to the modified work as a whole. If
+identifiable sections of that work are not derived from the Program,
+and can be reasonably considered independent and separate works in
+themselves, then this License, and its terms, do not apply to those
+sections when you distribute them as separate works. But when you
+distribute the same sections as part of a whole which is a work based
+on the Program, the distribution of the whole must be on the terms of
+this License, whose permissions for other licensees extend to the
+entire whole, and thus to each and every part regardless of who wrote it.
+
+Thus, it is not the intent of this section to claim rights or contest
+your rights to work written entirely by you; rather, the intent is to
+exercise the right to control the distribution of derivative or
+collective works based on the Program.
+
+In addition, mere aggregation of another work not based on the Program
+with the Program (or with a work based on the Program) on a volume of
+a storage or distribution medium does not bring the other work under
+the scope of this License.
+
+ 3. You may copy and distribute the Program (or a work based on it,
+under Section 2) in object code or executable form under the terms of
+Sections 1 and 2 above provided that you also do one of the following:
+
+ a) Accompany it with the complete corresponding machine-readable
+ source code, which must be distributed under the terms of Sections
+ 1 and 2 above on a medium customarily used for software interchange; or,
+
+ b) Accompany it with a written offer, valid for at least three
+ years, to give any third party, for a charge no more than your
+ cost of physically performing source distribution, a complete
+ machine-readable copy of the corresponding source code, to be
+ distributed under the terms of Sections 1 and 2 above on a medium
+ customarily used for software interchange; or,
+
+ c) Accompany it with the information you received as to the offer
+ to distribute corresponding source code. (This alternative is
+ allowed only for noncommercial distribution and only if you
+ received the program in object code or executable form with such
+ an offer, in accord with Subsection b above.)
+
+The source code for a work means the preferred form of the work for
+making modifications to it. For an executable work, complete source
+code means all the source code for all modules it contains, plus any
+associated interface definition files, plus the scripts used to
+control compilation and installation of the executable. However, as a
+special exception, the source code distributed need not include
+anything that is normally distributed (in either source or binary
+form) with the major components (compiler, kernel, and so on) of the
+operating system on which the executable runs, unless that component
+itself accompanies the executable.
+
+If distribution of executable or object code is made by offering
+access to copy from a designated place, then offering equivalent
+access to copy the source code from the same place counts as
+distribution of the source code, even though third parties are not
+compelled to copy the source along with the object code.
+
+ 4. You may not copy, modify, sublicense, or distribute the Program
+except as expressly provided under this License. Any attempt
+otherwise to copy, modify, sublicense or distribute the Program is
+void, and will automatically terminate your rights under this License.
+However, parties who have received copies, or rights, from you under
+this License will not have their licenses terminated so long as such
+parties remain in full compliance.
+
+ 5. You are not required to accept this License, since you have not
+signed it. However, nothing else grants you permission to modify or
+distribute the Program or its derivative works. These actions are
+prohibited by law if you do not accept this License. Therefore, by
+modifying or distributing the Program (or any work based on the
+Program), you indicate your acceptance of this License to do so, and
+all its terms and conditions for copying, distributing or modifying
+the Program or works based on it.
+
+ 6. Each time you redistribute the Program (or any work based on the
+Program), the recipient automatically receives a license from the
+original licensor to copy, distribute or modify the Program subject to
+these terms and conditions. You may not impose any further
+restrictions on the recipients' exercise of the rights granted herein.
+You are not responsible for enforcing compliance by third parties to
+this License.
+
+ 7. If, as a consequence of a court judgment or allegation of patent
+infringement or for any other reason (not limited to patent issues),
+conditions are imposed on you (whether by court order, agreement or
+otherwise) that contradict the conditions of this License, they do not
+excuse you from the conditions of this License. If you cannot
+distribute so as to satisfy simultaneously your obligations under this
+License and any other pertinent obligations, then as a consequence you
+may not distribute the Program at all. For example, if a patent
+license would not permit royalty-free redistribution of the Program by
+all those who receive copies directly or indirectly through you, then
+the only way you could satisfy both it and this License would be to
+refrain entirely from distribution of the Program.
+
+If any portion of this section is held invalid or unenforceable under
+any particular circumstance, the balance of the section is intended to
+apply and the section as a whole is intended to apply in other
+circumstances.
+
+It is not the purpose of this section to induce you to infringe any
+patents or other property right claims or to contest validity of any
+such claims; this section has the sole purpose of protecting the
+integrity of the free software distribution system, which is
+implemented by public license practices. Many people have made
+generous contributions to the wide range of software distributed
+through that system in reliance on consistent application of that
+system; it is up to the author/donor to decide if he or she is willing
+to distribute software through any other system and a licensee cannot
+impose that choice.
+
+This section is intended to make thoroughly clear what is believed to
+be a consequence of the rest of this License.
+
+ 8. If the distribution and/or use of the Program is restricted in
+certain countries either by patents or by copyrighted interfaces, the
+original copyright holder who places the Program under this License
+may add an explicit geographical distribution limitation excluding
+those countries, so that distribution is permitted only in or among
+countries not thus excluded. In such case, this License incorporates
+the limitation as if written in the body of this License.
+
+ 9. The Free Software Foundation may publish revised and/or new versions
+of the General Public License from time to time. Such new versions will
+be similar in spirit to the present version, but may differ in detail to
+address new problems or concerns.
+
+Each version is given a distinguishing version number. If the Program
+specifies a version number of this License which applies to it and "any
+later version", you have the option of following the terms and conditions
+either of that version or of any later version published by the Free
+Software Foundation. If the Program does not specify a version number of
+this License, you may choose any version ever published by the Free Software
+Foundation.
+
+ 10. If you wish to incorporate parts of the Program into other free
+programs whose distribution conditions are different, write to the author
+to ask for permission. For software which is copyrighted by the Free
+Software Foundation, write to the Free Software Foundation; we sometimes
+make exceptions for this. Our decision will be guided by the two goals
+of preserving the free status of all derivatives of our free software and
+of promoting the sharing and reuse of software generally.
+
+ NO WARRANTY
+
+ 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY
+FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN
+OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES
+PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED
+OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS
+TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE
+PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING,
+REPAIR OR CORRECTION.
+
+ 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
+WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR
+REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES,
+INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING
+OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED
+TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY
+YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER
+PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGES.
+
+ END OF TERMS AND CONDITIONS
+
+ How to Apply These Terms to Your New Programs
+
+ If you develop a new program, and you want it to be of the greatest
+possible use to the public, the best way to achieve this is to make it
+free software which everyone can redistribute and change under these terms.
+
+ To do so, attach the following notices to the program. It is safest
+to attach them to the start of each source file to most effectively
+convey the exclusion of warranty; and each file should have at least
+the "copyright" line and a pointer to where the full notice is found.
+
+ <one line to give the program's name and a brief idea of what it does.>
+ Copyright (C) <year> <name of author>
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License along
+ with this program; if not, write to the Free Software Foundation, Inc.,
+ 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+Also add information on how to contact you by electronic and paper mail.
+
+If the program is interactive, make it output a short notice like this
+when it starts in an interactive mode:
+
+ Gnomovision version 69, Copyright (C) year name of author
+ Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
+ This is free software, and you are welcome to redistribute it
+ under certain conditions; type `show c' for details.
+
+The hypothetical commands `show w' and `show c' should show the appropriate
+parts of the General Public License. Of course, the commands you use may
+be called something other than `show w' and `show c'; they could even be
+mouse-clicks or menu items--whatever suits your program.
+
+You should also get your employer (if you work as a programmer) or your
+school, if any, to sign a "copyright disclaimer" for the program, if
+necessary. Here is a sample; alter the names:
+
+ Yoyodyne, Inc., hereby disclaims all copyright interest in the program
+ `Gnomovision' (which makes passes at compilers) written by James Hacker.
+
+ <signature of Ty Coon>, 1 April 1989
+ Ty Coon, President of Vice
+
+This General Public License does not permit incorporating your program into
+proprietary programs. If your program is a subroutine library, you may
+consider it more useful to permit linking proprietary applications with the
+library. If this is what you want to do, use the GNU Lesser General
+Public License instead of this License.
diff --git a/ChangeLog b/ChangeLog
new file mode 100644
index 0000000..a3bf700
--- /dev/null
+++ b/ChangeLog
@@ -0,0 +1,306 @@
+Please see git logs for detailed change log.
+This file just contains highlights.
+
+Changes Prior to release 3.3
+- Some array reshapes can proceed without needing backup file.
+ This is done by changing the 'data_offset' so we never need to write
+ any data back over where it was before. If there is no "head space"
+ or "tail space" to allow data_offset to change, the old mechanism
+ with a backup file can still be used.
+- RAID10 arrays can be reshaped to change the number of devices,
+ change the chunk size, or change the layout between 'near'
+ and 'offset'.
+ This will always change data_offset, and will fail if there is no
+ room for data_offset to be moved.
+- "--assemble --update=metadata" can convert a 0.90 array to a 1.0 array.
+- bad-block-logs are supported (but not heavily tested yet)
+- "--assemble --update=revert-reshape" can be used to undo a reshape
+ that has just been started but isn't really wanted. This is very
+ new and while it passes basic tests it cannot be guaranteed.
+- improved locking between --incremental and --assemble
+- uses systemd to run "mdmon" if systemd is configured to do that.
+- kernel names of md devices can be non-numeric. e.g. "md_home" rather than
+ "md0". This will probably confuse lots of other tools, so you need to
+ echo CREATE names=yes >> /etc/mdadm.conf
+ or the feature will not be used. (you also need a reasonably new kernel).
+- "--stop" can be given a kernel name instead of a device name. i.e.
+ mdadm --stop md4
+ will work even if /dev/md4 doesn't exist.
+- "--detail --export" has some information about the devices in the array
+- --dump and --restore can be used to backup and restore the metadata on an
+ array.
+- Hot-replace is supported with
+ mdadm /dev/mdX --replace /dev/foo
+ and
+ mdadm /dev/mdX --replace /dev/foo --with /dev/bar
+- Config file can be a directory in which case all "*.conf" files are
+ read in lexical order.
+ Default is to read /etc/mdadm.conf and then /etc/mdadm.conf.d
+ Thus
+ echo CREATE names=yes > /etc/mdadm.conf.d/names.conf
+ will also enable the use of named md devices.
+
+- Lots of improvements to DDF support including adding support for
+ RAID10 (thanks Martin Wilck).
+
+Changes Prior to release 3.2.6
+ - There are no real stand-out fixes, just lots of little bits and pieces.
+
+Changes Prior to release 3.2.5
+ - This release primarily fixes a serious regression in 3.2.4.
+ This regression does *not* cause any risk to data. It simply
+ means that adding a device with "--add" would sometimes fail
+ when it should not.
+
+ - The fix also includes a couple of minor fixes such as making
+ the "--layout=preserve" option to "--grow" work again.
+
+
+Changes Prior to release 3.2.4
+"--oneline" log of changes is below. Some notable ones are:
+
+ - --offroot argument to improve interactions between mdmon and initrd
+ - --prefer argument to select which /dev names to display in some
+ circumstances.
+ - relax restrictions on when "--add" will be allowed
+ - Fix bug with adding write-intent-bitmap to active array
+ - Now defaults to "/run/mdadm" for storing run-time files.
+
+Changes Prior to release 3.2.3
+ - The largest single area of change is support for reshape of Intel
+ IMSM arrays (OnLine Capacity Expansion and Level Migration).
+ - Among other fixes, this now has a better chance of surviving if a
+ device fails during reshape.
+
+Changes Prior to release 3.2.2
+ - reshaping IMSM (Intel metadata) arrays is no longer 'experimental',
+ it should work properly and be largely compatible with IMSM drivers in
+ other platforms.
+ - --assume-clean can be used with --grow --size to avoid resyncing the
+ new part of the array. This is only supported with very new kernels.
+ - RAID0 arrays can have chunksize which is not a power of 2. This has been
+ supported in the kernel for a while but is only now supported by
+ mdadm.
+
+ - A new tool 'raid6check' is available which can check a RAID6 array,
+ or part of it, and report which device is most inconsistent with the
+ others if any stripe is inconsistent. This is still under development
+ and does not have a man page yet. If anyone tries it out and has any
+ questions or experience to report, they would be most welcome on
+ linux-raid@vger.kernel.org.
+
+Changes Prior to release 3.2.1
+ - policy framework
+ Policy can be expressed for moving spare devices between arrays, and
+ for how to handle hot-plugged devices. This policy can be different
+ for devices plugged in to different controllers etc.
+ This, for example, allows a configuration where when a device is plugged
+ in it is immediately included in an md array as a hot spare and
+ possibly starts recovery immediately if an array is degraded.
+
+ - some understanding of mbr and gpt partition tables
+ This is primarily to support the new hot-plug support. If a
+ device is plugged in and policy suggests it should have a partition table,
+ the partition table will be copied from a suitably similar device, and
+ then the partitions will hot-plug and can then be added to md arrays.
+
+ - "--incremental --remove" can remember where a device was removed from
+ so if a device gets plugged back in the same place, special policy applies
+ to it, allowing it to be included in an array even if a general hotplug
+ will not be included.
+
+ - enhanced reshape options, including growing a RAID0 by converting to RAID4,
+ restriping, and converting back. Also conversions between RAID0 and
+ RAID10 and between RAID1 and RAID10 are possible (with a suitably recent
+ kernel).
+
+ - spare migration for IMSM arrays.
+ Spare migration can now work across 'containers' using non-native metadata
+ and specifically Intel's IMSM arrays support spare migrations.
+
+ - OLCE and level migration for Intel IMSM arrays.
+ OnLine Capacity Expansion and level migration (e.g. RAID0 -> RAID5) is
+ supported for Intel Matrix Storage Manager arrays.
+ This support is currently 'experimental' for technical reasons. It can
+ be enabled with "export MDADM_EXPERIMENTAL=1"
+
+ - avoid including wayward devices
+ If you split a RAID1, mount the two halves as two separate degraded RAID1s,
+ and then later bring the two back together, it is possible that the md
+ metadata won't properly show that one must over-ride the other.
+ mdadm now does extra checking to detect this possibility and avoid
+ potentially corrupting data.
+
+ - remove any possible confusion between similar options.
+ e.g. --brief and --bitmap were mapped to 'b' and mdadm wouldn't
+ notice if one was used where the other was expected.
+
+ - allow K,M,G suffixes on chunk sizes
+
+Changes Prior to release 3.2
+ - By far the most significant change in this release relates to the
+ management of reshaping arrays. This code has been substantially
+ re-written so that it can work with 'externally managed metadata' -
+ Intel's IMSM in particular. We now support level migration and
+ OnLine Capacity Expansion on these arrays.
+ - Policy framework.
+ Various policy statements can be made in the mdadm.conf to guide
+ the behaviour of mdadm, particularly with regard to how new devices
+ are treated by "mdadm -I".
+ Depending on the 'action' associated with a device (identified by
+ its 'path') such new devices can be automatically re-added to an
+ existing array that they previously fell out of, or automatically
+ added as a spare if they appear to contain no data.
+
+ - mdadm now has a limited understanding of partition tables. This
+ allows the policy framework to make decisions about partitioned
+ devices as well.
+
+ - --incremental --remove can be told what --path the device was on,
+ and this info will be recorded so that another device appearing at
+ the same physical location can be preferentially added to the same
+ array (provided the spare-same-slot action policy applies to the
+ path).
+
+ - A new "--invalid-backup" flag is available in --assemble
+ mode. This can be used to re-assemble an array which was stopped
+ in the middle of a reshape, and for which the 'backup file' is no
+ longer available or is corrupted. The array may have some
+ corruption in it at the point where reshape was up to, but at least
+ the rest of the array will become available.
+
+
+ - Various internal restructuring - more is needed.
+
+Changes Prior to release 3.1.5
+ - Fixes for v1.x metadata on big-endian machines.
+ - man page improvements
+ - Improve '--detail --export' when run on partitions of an md array.
+ - Fix regression with removing 'failed' or 'detached' devices.
+ - Fixes for "--assemble --force" in various unusual cases.
+ - Allow '-Y' to mean --export. This was documented but not implemented.
+ - Various fixes for handling 'ddf' metadata. This is now more reliable
+ but could benefit from more interoperability testing.
+ - Correctly list subarrays of a container in "--detail" output.
+ - Improve checks on whether the requested number of devices is supported
+ by the metadata - both for --create and --grow.
+ - Don't remove partitions from a device that is being included in an
+ array until we are fully committed to including it.
+ - Allow "--assemble --update=no-bitmap" so an array with a corrupt
+ bitmap can still be assembled.
+ - Don't allow --add to succeed if it looks like a "--re-add" is probably
+ wanted, but cannot succeed. This avoids inadvertently turning
+ devices into spares when an array is failed.
+
+Changes Prior to release 3.1.4
+ Two fixes related to configs that aren't using udev:
+ - Don't remove md devices with 'standard' names on --stop
+ - Allow dev_open to work on read-only /dev
+ And fixed regressions:
+ - Allow --incremental to add spares to an array
+ - Accept --no-degraded as a deprecated option rather than
+ throwing an error
+ - Return correct success status when --incremental assembling
+ a container which does not yet have enough devices.
+ - Don't link mdadm with pthreads, only mdmon needs it.
+ - Fix compiler warning due to bad use of snprintf
+
+Changes Prior to release 3.1.3
+ - mapfile now lives in a fixed location which defaults to
+ /dev/.mdadm/map but can be changed at compile time. This
+ location is chosen as most distros provide it during early
+ boot and preserve it through. As long as /dev exists and is
+ writable, /dev/.mdadm will be created.
+ Other files for communication with mdmon live here too.
+ This fixes a bug reported by Debian and Gentoo users where
+ udev would spin in early-boot.
+ - IMSM and DDF metadata will not be recognised on partitions
+ as they should only be used on whole-disks.
+ - Various overflows caused by 2G drives have been addressed.
+ - A subarray of an IMSM container can now be killed with
+ --kill-subarray. Also subarrays can be renamed with
+ --update-subarray
+ - -If (or --incremental --fail) can be used from udev to
+ fail and remove from all arrays a device which has been
+ unplugged from the system. i.e. hot-unplug-support.
+ - "mdadm /dev/mdX --re-add missing" will look for any device
+ that looks like it should be a member of /dev/mdX but isn't
+ and will automatically --re-add it
+ - Now compile with -Wextra to get extra warnings.
+ - Lots of minor bug fixes, documentation improvements, etc.
+
+Changes Prior to release 3.1.2
+ - The default metadata has changed again (sorry about that).
+ It is now v1.2 and will hopefully stay that way. It turned
+ out there were boot-block issues with v1.1 which make it
+ unsuitable for a default, though in many cases it is still
+ suitable to use.
+ - Stopping a container is not permitted when members are still
+ active
+ - Add 'homehost' to the valid words for the "AUTO" config file
+ line. When followed by "-all", this causes mdadm to
+ auto-assemble any array belonging to this host, but not
+ auto-assemble anything else.
+ - Fix some bugs with "--grow --chunksize=" for changing chunksize.
+ - VAR_RUN can be easily changed at compile time just like ALT_RUN.
+ This gives distros more flexibility in how to manage the
+ pid and sock files that mdmon needs.
+ - Various mdmon fixes
+ - Always make bitmap 4K-aligned if at all possible.
+ - If mdadm.conf lists arrays which have inter-dependencies,
+ they previously had to be listed in the "right" order. Now
+ any order should work.
+ - Fix --force assembly of v1.x arrays which are in the process
+ of recovering.
+ - Add section on 'scrubbing' to 'md' man page.
+ - Various command-line-option parsing improvements.
+ - ... and lots of other bug fixes.
+
+Changes Prior to release 3.1.1
+ - Multiple fixes for new --grow levels including fixes for
+ serious data corruption problems.
+ - Change default metadata to v1.1
+ - Change default chunk size to 512K
+ - Change default bitmap chunk size to 64Meg
+ - When --re-add is used, don't fall back to
+ --add if --re-add fails as this can destroy data.
+
+Changes Prior to release 3.1
+ - Support --grow to change the layout of RAID4/5/6
+ - Support --grow to change the chunksize of raid 4/5/6
+ - Support --grow to change level from RAID1 -> RAID5 -> RAID6 and
+ back.
+ - Support --grow to reduce the number of devices in RAID4/5/6.
+ - Support restart of these grow options when assembling an array
+ which is partially grown.
+ - Assorted tests of this code, and of different RAID6 layouts.
+
+Changes Prior to release 3.0.3
+ - Improvements for creating arrays giving just a name, like 'foo',
+ rather than the full '/dev/md/foo'.
+ - Improvements for assembling member arrays of containers.
+ - Improvements to test suite
+ - Add option to change increment for RebuildNN messages reported
+ by "mdadm --monitor"
+ - Improvements to mdmon 'hand-over' from initrd to final root.
+ - Handle merging of devices that have left an IMSM array and are
+ being re-incorporated.
+ - Add missing space in "--detail --brief" output.
+
+Changes Prior to release 3.0.2
+ - Fix crash when homehost is not set, as often happens in
+ early boot.
+
+Changes Prior to release 3.0.1
+ - Fix various segfaults
+ - Fixes for --examine with containers
+ - Lots of other little fixes.
+
+Changes Prior to release 3.0
+ - Support for externally managed metadata, specifically DDF and IMSM.
+ - Depend on udev to create entries in /dev, rather than creating them
+ ourselves.
+ - remove --auto-update-home-hosts
+ - new config file line "auto"
+ - new "<ignore>" and "any" options for "homehost"
+ - numerous bug fixes and minor enhancements.
diff --git a/Create.c b/Create.c
new file mode 100644
index 0000000..1e4a6ee
--- /dev/null
+++ b/Create.c
@@ -0,0 +1,1071 @@
+/*
+ * mdadm - manage Linux "md" devices aka RAID arrays.
+ *
+ * Copyright (C) 2001-2013 Neil Brown <neilb@suse.de>
+ *
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * Author: Neil Brown
+ * Email: <neilb@suse.de>
+ */
+
+#include "mdadm.h"
+#include "md_u.h"
+#include "md_p.h"
+#include <ctype.h>
+
+/*
+ * Work out the layout to use when the caller did not specify one.
+ *
+ * The metadata handler (when there is one and it implements
+ * default_geometry) is asked first; only if it leaves the layout as
+ * UnSet is a per-level default applied.  With verbose > 0 the default
+ * picked for levels 10, 5, 6 and faulty is reported through pr_err().
+ */
+static int default_layout(struct supertype *st, int level, int verbose)
+{
+	int chosen = UnSet;
+
+	if (st != NULL && st->ss->default_geometry != NULL)
+		st->ss->default_geometry(st, &level, &chosen, NULL);
+
+	/* the handler supplied a layout: nothing more to decide */
+	if (chosen != UnSet)
+		return chosen;
+
+	if (level == 10) {
+		chosen = 0x102; /* near=2, far=1 */
+		if (verbose > 0)
+			pr_err("layout defaults to n2\n");
+	} else if (level == 5 || level == 6) {
+		chosen = map_name(r5layout, "default");
+		if (verbose > 0)
+			pr_err("layout defaults to %s\n", map_num(r5layout, chosen));
+	} else if (level == LEVEL_FAULTY) {
+		chosen = map_name(faultylayout, "default");
+
+		if (verbose > 0)
+			pr_err("layout defaults to %s\n", map_num(faultylayout, chosen));
+	} else {
+		/* every other level has no layout */
+		chosen = 0;
+	}
+
+	return chosen;
+}
+
+/*
+ * Create() - create a new md array from the devices in 'devlist'.
+ *
+ * @st:          metadata handler to use; when NULL one is taken from the
+ *               configured create-info or chosen from superlist[] while
+ *               the first real device is examined.
+ * @mddev:       requested device name, handed to create_mddev().
+ * @name:        array name; when NULL or empty it is derived from the
+ *               device name chosen by create_mddev().
+ * @uuid:        passed through to the metadata handler's init_super().
+ * @subdevs:     number of entries in 'devlist' ("missing" included).
+ * @devlist:     component devices, in slot order.
+ * @s:           requested geometry (level, layout, chunk, size, disk
+ *               counts, bitmap settings); unset fields are defaulted here.
+ * @c:           run-time options (verbose, runstop, force, readonly, ...).
+ * @data_offset: data offset for every device, or VARIABLE_OFFSET to take a
+ *               per-device "name:offset" suffix from each device name.
+ *
+ * Returns 0 on success and 1 on failure.  Some device errors found while
+ * scanning 'devlist' terminate the process with exit(2) instead.
+ */
+int Create(struct supertype *st, char *mddev,
+	   char *name, int *uuid,
+	   int subdevs, struct mddev_dev *devlist,
+	   struct shape *s,
+	   struct context *c, unsigned long long data_offset)
+{
+	/*
+	 * Create a new raid array.
+	 *
+	 * First check that necessary details are available
+	 * (i.e. level, raid-disks)
+	 *
+	 * Then check each disk to see what might be on it
+	 * and report anything interesting.
+	 *
+	 * If anything looks odd, and runstop not set,
+	 * abort.
+	 *
+	 * SET_ARRAY_INFO and ADD_NEW_DISK, and
+	 * if runstop==run, or raiddisks disks were used,
+	 * RUN_ARRAY
+	 */
+	int mdfd;
+	unsigned long long minsize=0, maxsize=0;
+	char *mindisc = NULL;
+	char *maxdisc = NULL;
+	int dnum, raid_disk_num;
+	struct mddev_dev *dv;
+	int fail=0, warn=0;
+	struct stat stb;
+	int first_missing = subdevs * 2;
+	int second_missing = subdevs * 2;
+	int missing_disks = 0;
+	int insert_point = subdevs * 2; /* where to insert a missing drive */
+	int total_slots;
+	int pass;
+	int vers;
+	int rv;
+	int bitmap_fd = -1;	/* -1 until the external bitmap file is opened */
+	int have_container = 0;
+	int container_fd = -1;
+	int need_mdmon = 0;
+	unsigned long long bitmapsize;
+	struct mdinfo info, *infos = NULL;	/* infos is released on every exit path */
+	int did_default = 0;
+	int do_default_layout = 0;
+	int do_default_chunk = 0;
+	unsigned long safe_mode_delay = 0;
+	char chosen_name[1024];
+	struct map_ent *map = NULL;
+	unsigned long long newsize;
+
+	int major_num = BITMAP_MAJOR_HI;
+	if (s->bitmap_file && strcmp(s->bitmap_file, "clustered") == 0)
+		major_num = BITMAP_MAJOR_CLUSTERED;
+
+	memset(&info, 0, sizeof(info));
+	if (s->level == UnSet && st && st->ss->default_geometry)
+		st->ss->default_geometry(st, &s->level, NULL, NULL);
+	if (s->level == UnSet) {
+		pr_err("a RAID level is needed to create an array.\n");
+		return 1;
+	}
+	if (s->raiddisks < 4 && s->level == 6) {
+		pr_err("at least 4 raid-devices needed for level 6\n");
+		return 1;
+	}
+	if (s->raiddisks > 256 && s->level == 6) {
+		pr_err("no more than 256 raid-devices supported for level 6\n");
+		return 1;
+	}
+	if (s->raiddisks < 2 && s->level >= 4) {
+		pr_err("at least 2 raid-devices needed for level 4 or 5\n");
+		return 1;
+	}
+	if (s->level <= 0 && s->sparedisks) {
+		pr_err("This level does not support spare devices\n");
+		return 1;
+	}
+
+	if (subdevs == 1 && strcmp(devlist->devname, "missing") != 0) {
+		/* If given a single device, it might be a container, and we can
+		 * extract a device list from there
+		 */
+		mdu_array_info_t inf;
+		int fd;
+
+		memset(&inf, 0, sizeof(inf));
+		fd = open(devlist->devname, O_RDONLY);
+		if (fd >= 0 &&
+		    ioctl(fd, GET_ARRAY_INFO, &inf) == 0 &&
+		    inf.raid_disks == 0) {
+			/* yep, looks like a container */
+			if (st) {
+				rv = st->ss->load_container(st, fd,
+							    devlist->devname);
+				if (rv == 0)
+					have_container = 1;
+			} else {
+				st = super_by_fd(fd, NULL);
+				if (st && !(rv = st->ss->
+					    load_container(st, fd,
+							   devlist->devname)))
+					have_container = 1;
+				else
+					st = NULL;
+			}
+			if (have_container) {
+				subdevs = s->raiddisks;
+				first_missing = subdevs * 2;
+				second_missing = subdevs * 2;
+				insert_point = subdevs * 2;
+			}
+		}
+		if (fd >= 0)
+			close(fd);
+	}
+	if (st && st->ss->external && s->sparedisks) {
+		pr_err("This metadata type does not support spare disks at create time\n");
+		return 1;
+	}
+	if (subdevs > s->raiddisks+s->sparedisks+s->journaldisks) {
+		/* report the same total that was just compared against */
+		pr_err("You have listed more devices (%d) than are in the array(%d)!\n", subdevs, s->raiddisks+s->sparedisks+s->journaldisks);
+		return 1;
+	}
+	if (!have_container && subdevs < s->raiddisks+s->sparedisks+s->journaldisks) {
+		pr_err("You haven't given enough devices (real or missing) to create this array\n");
+		return 1;
+	}
+	if (s->bitmap_file && s->level <= 0) {
+		pr_err("bitmaps not meaningful with level %s\n",
+		       map_num(pers, s->level)?:"given");
+		return 1;
+	}
+
+	/* now set some defaults */
+
+	if (s->layout == UnSet) {
+		do_default_layout = 1;
+		s->layout = default_layout(st, s->level, c->verbose);
+	}
+
+	if (s->level == 10)
+		/* check layout fits in array*/
+		if ((s->layout&255) * ((s->layout>>8)&255) > s->raiddisks) {
+			pr_err("that layout requires at least %d devices\n",
+			       (s->layout&255) * ((s->layout>>8)&255));
+			return 1;
+		}
+
+	switch(s->level) {
+	case 4:
+	case 5:
+	case 10:
+	case 6:
+	case 0:
+		if (s->chunk == 0 || s->chunk == UnSet) {
+			s->chunk = UnSet;
+			do_default_chunk = 1;
+			/* chunk will be set later */
+		}
+		break;
+	case LEVEL_LINEAR:
+		/* a chunksize of zero is perfectly valid (and preferred) since 2.6.16 */
+		if (get_linux_version() < 2006016 && s->chunk == 0) {
+			s->chunk = 64;
+			if (c->verbose > 0)
+				pr_err("chunk size defaults to 64K\n");
+		}
+		break;
+	case 1:
+	case LEVEL_FAULTY:
+	case LEVEL_MULTIPATH:
+	case LEVEL_CONTAINER:
+		if (s->chunk) {
+			s->chunk = 0;
+			if (c->verbose > 0)
+				pr_err("chunk size ignored for this level\n");
+		}
+		break;
+	default:
+		pr_err("unknown level %d\n", s->level);
+		return 1;
+	}
+	if (s->size == MAX_SIZE)
+		/* use '0' to mean 'max' now... */
+		s->size = 0;
+	if (s->size && s->chunk && s->chunk != UnSet)
+		s->size &= ~(unsigned long long)(s->chunk - 1);
+	newsize = s->size * 2;
+	if (st && ! st->ss->validate_geometry(st, s->level, s->layout, s->raiddisks,
+					      &s->chunk, s->size*2,
+					      data_offset, NULL,
+					      &newsize, c->verbose>=0))
+		return 1;
+
+	if (s->chunk && s->chunk != UnSet) {
+		newsize &= ~(unsigned long long)(s->chunk*2 - 1);
+		if (do_default_chunk) {
+			/* default chunk was just set */
+			if (c->verbose > 0)
+				pr_err("chunk size defaults to %dK\n", s->chunk);
+			s->size &= ~(unsigned long long)(s->chunk - 1);
+			do_default_chunk = 0;
+		}
+	}
+
+	if (s->size == 0) {
+		s->size = newsize / 2;
+		if (s->level == 1)
+			/* If this is ever reshaped to RAID5, we will
+			 * need a chunksize. So round it off a bit
+			 * now just to be safe
+			 */
+			s->size &= ~(64ULL-1);
+
+		if (s->size && c->verbose > 0)
+			pr_err("setting size to %lluK\n", s->size);
+	}
+
+	/* now look at the subdevs */
+	info.array.active_disks = 0;
+	info.array.working_disks = 0;
+	dnum = 0;
+	for (dv = devlist; dv ; dv = dv->next)
+		if (data_offset == VARIABLE_OFFSET)
+			dv->data_offset = INVALID_SECTORS;
+		else
+			dv->data_offset = data_offset;
+
+	for (dv=devlist; dv && !have_container; dv=dv->next, dnum++) {
+		char *dname = dv->devname;
+		unsigned long long freesize;
+		int dfd;
+		char *doff;
+
+		if (strcasecmp(dname, "missing")==0) {
+			if (first_missing > dnum)
+				first_missing = dnum;
+			if (second_missing > dnum && dnum > first_missing)
+				second_missing = dnum;
+			missing_disks ++;
+			continue;
+		}
+		if (data_offset == VARIABLE_OFFSET) {
+			doff = strchr(dname, ':');
+			if (doff) {
+				*doff++ = 0;
+				dv->data_offset = parse_size(doff);
+			} else
+				dv->data_offset = INVALID_SECTORS;
+		} else
+			dv->data_offset = data_offset;
+
+		dfd = open(dname, O_RDONLY);
+		if (dfd < 0) {
+			pr_err("cannot open %s: %s\n",
+			       dname, strerror(errno));
+			exit(2);
+		}
+		if (fstat(dfd, &stb) != 0 ||
+		    (stb.st_mode & S_IFMT) != S_IFBLK) {
+			close(dfd);
+			pr_err("%s is not a block device\n",
+			       dname);
+			exit(2);
+		}
+		close(dfd);
+		info.array.working_disks++;
+		if (dnum < s->raiddisks && dv->disposition != 'j')
+			info.array.active_disks++;
+		if (st == NULL) {
+			struct createinfo *ci = conf_get_create_info();
+			if (ci)
+				st = ci->supertype;
+		}
+		if (st == NULL) {
+			/* Need to choose a default metadata, which is different
+			 * depending on geometry of array.
+			 */
+			int i;
+			char *name = "default";
+			for(i=0; !st && superlist[i]; i++) {
+				st = superlist[i]->match_metadata_desc(name);
+				if (!st)
+					continue;
+				if (do_default_layout)
+					s->layout = default_layout(st, s->level, c->verbose);
+				switch (st->ss->validate_geometry(
+						st, s->level, s->layout, s->raiddisks,
+						&s->chunk, s->size*2,
+						dv->data_offset, dname,
+						&freesize, c->verbose > 0)) {
+				case -1: /* Not valid, message printed, and not
+					  * worth checking any further */
+					exit(2);
+					break;
+				case 0: /* Geometry not valid */
+					free(st);
+					st = NULL;
+					s->chunk = do_default_chunk ? UnSet : s->chunk;
+					break;
+				case 1: /* All happy */
+					break;
+				}
+			}
+
+			if (!st) {
+				int dfd = open(dname, O_RDONLY|O_EXCL);
+				if (dfd < 0) {
+					pr_err("cannot open %s: %s\n",
+					       dname, strerror(errno));
+					exit(2);
+				}
+				pr_err("device %s not suitable for any style of array\n",
+				       dname);
+				exit(2);
+			}
+			if (st->ss != &super0 ||
+			    st->minor_version != 90)
+				did_default = 1;
+		} else {
+			if (do_default_layout)
+				s->layout = default_layout(st, s->level, 0);
+			if (!st->ss->validate_geometry(st, s->level, s->layout,
+						       s->raiddisks,
+						       &s->chunk, s->size*2,
+						       dv->data_offset,
+						       dname, &freesize,
+						       c->verbose >= 0)) {
+
+				pr_err("%s is not suitable for this array.\n",
+				       dname);
+				fail = 1;
+				continue;
+			}
+		}
+
+		if (dv->disposition == 'j')
+			goto skip_size_check; /* skip write journal for size check */
+
+		freesize /= 2; /* convert to K */
+		if (s->chunk && s->chunk != UnSet) {
+			/* round to chunk size */
+			freesize = freesize & ~(s->chunk-1);
+			if (do_default_chunk) {
+				/* default chunk was just set */
+				if (c->verbose > 0)
+					pr_err("chunk size defaults to %dK\n", s->chunk);
+				s->size &= ~(unsigned long long)(s->chunk - 1);
+				do_default_chunk = 0;
+			}
+		}
+		if (!freesize) {
+			pr_err("no free space left on %s\n", dname);
+			fail = 1;
+			continue;
+		}
+
+		if (s->size && freesize < s->size) {
+			pr_err("%s is smaller than given size. %lluK < %lluK + metadata\n",
+			       dname, freesize, s->size);
+			fail = 1;
+			continue;
+		}
+		if (maxdisc == NULL || (maxdisc && freesize > maxsize)) {
+			maxdisc = dname;
+			maxsize = freesize;
+		}
+		if (mindisc ==NULL || (mindisc && freesize < minsize)) {
+			mindisc = dname;
+			minsize = freesize;
+		}
+	skip_size_check:
+		if (c->runstop != 1 || c->verbose >= 0) {
+			int fd = open(dname, O_RDONLY);
+			if (fd <0 ) {
+				pr_err("Cannot open %s: %s\n",
+				       dname, strerror(errno));
+				fail=1;
+				continue;
+			}
+			warn |= check_ext2(fd, dname);
+			warn |= check_reiser(fd, dname);
+			warn |= check_raid(fd, dname);
+			if (strcmp(st->ss->name, "1.x") == 0 &&
+			    st->minor_version >= 1)
+				/* metadata at front */
+				warn |= check_partitions(fd, dname, 0, 0);
+			else if (s->level == 1 || s->level == LEVEL_CONTAINER
+				 || (s->level == 0 && s->raiddisks == 1))
+				/* partitions could be meaningful */
+				warn |= check_partitions(fd, dname, freesize*2, s->size*2);
+			else
+				/* partitions cannot be meaningful */
+				warn |= check_partitions(fd, dname, 0, 0);
+			if (strcmp(st->ss->name, "1.x") == 0 &&
+			    st->minor_version >= 1 &&
+			    did_default &&
+			    s->level == 1 &&
+			    (warn & 1024) == 0) {
+				warn |= 1024;
+				pr_err("Note: this array has metadata at the start and\n"
+				       " may not be suitable as a boot device. If you plan to\n"
+				       " store '/boot' on this device please ensure that\n"
+				       " your boot-loader understands md/v1.x metadata, or use\n"
+				       " --metadata=0.90\n");
+			}
+			close(fd);
+		}
+	}
+	if (st == NULL) {
+		/* A metadata handler is only selected while examining a real
+		 * device.  If every listed device was "missing" and none was
+		 * supplied by the caller, there is no handler to consult for
+		 * the checks below, so fail cleanly instead of dereferencing
+		 * a NULL pointer.
+		 */
+		pr_err("no real devices given and no metadata type chosen - aborting create.\n");
+		return 1;
+	}
+	if (s->raiddisks + s->sparedisks > st->max_devs) {
+		pr_err("Too many devices: %s metadata only supports %d\n",
+		       st->ss->name, st->max_devs);
+		return 1;
+	}
+	if (have_container)
+		info.array.working_disks = s->raiddisks;
+	if (fail) {
+		pr_err("create aborted\n");
+		return 1;
+	}
+	if (s->size == 0) {
+		if (mindisc == NULL && !have_container) {
+			pr_err("no size and no drives given - aborting create.\n");
+			return 1;
+		}
+		if (s->level > 0 || s->level == LEVEL_MULTIPATH
+		    || s->level == LEVEL_FAULTY
+		    || st->ss->external ) {
+			/* size is meaningful */
+			if (!st->ss->validate_geometry(st, s->level, s->layout,
+						       s->raiddisks,
+						       &s->chunk, minsize*2,
+						       data_offset,
+						       NULL, NULL, 0)) {
+				pr_err("devices too large for RAID level %d\n", s->level);
+				return 1;
+			}
+			s->size = minsize;
+			if (s->level == 1)
+				/* If this is ever reshaped to RAID5, we will
+				 * need a chunksize. So round it off a bit
+				 * now just to be safe
+				 */
+				s->size &= ~(64ULL-1);
+			if (c->verbose > 0)
+				pr_err("size set to %lluK\n", s->size);
+		}
+	}
+
+	if (!s->bitmap_file &&
+	    s->level >= 1 &&
+	    st->ss->add_internal_bitmap &&
+	    (s->write_behind || s->size > 100*1024*1024ULL)) {
+		if (c->verbose > 0)
+			pr_err("automatically enabling write-intent bitmap on large array\n");
+		s->bitmap_file = "internal";
+	}
+	if (s->bitmap_file && strcmp(s->bitmap_file, "none") == 0)
+		s->bitmap_file = NULL;
+
+	if (!have_container && s->level > 0 && ((maxsize-s->size)*100 > maxsize)) {
+		if (c->runstop != 1 || c->verbose >= 0)
+			pr_err("largest drive (%s) exceeds size (%lluK) by more than 1%%\n",
+			       maxdisc, s->size);
+		warn = 1;
+	}
+
+	if (st->ss->detail_platform && st->ss->detail_platform(0, 1, NULL) != 0) {
+		if (c->runstop != 1 || c->verbose >= 0)
+			pr_err("%s unable to enumerate platform support\n"
+			       " array may not be compatible with hardware/firmware\n",
+			       st->ss->name);
+		warn = 1;
+	}
+	st->nodes = c->nodes;
+	st->cluster_name = c->homecluster;
+
+	if (warn) {
+		if (c->runstop!= 1) {
+			if (!ask("Continue creating array? ")) {
+				pr_err("create aborted.\n");
+				return 1;
+			}
+		} else {
+			if (c->verbose > 0)
+				pr_err("creation continuing despite oddities due to --run\n");
+		}
+	}
+
+	/* If this is raid4/5, we want to configure the last active slot
+	 * as missing, so that a reconstruct happens (faster than re-parity)
+	 * FIX: Can we do this for raid6 as well?
+	 */
+	if (st->ss->external == 0 &&
+	    s->assume_clean==0 && c->force == 0 && first_missing >= s->raiddisks) {
+		switch ( s->level ) {
+		case 4:
+		case 5:
+			insert_point = s->raiddisks-1;
+			s->sparedisks++;
+			info.array.active_disks--;
+			missing_disks++;
+			break;
+		default:
+			break;
+		}
+	}
+	/* For raid6, if creating with 1 missing drive, make a good drive
+	 * into a spare, else the create will fail
+	 */
+	if (s->assume_clean == 0 && c->force == 0 && first_missing < s->raiddisks &&
+	    st->ss->external == 0 &&
+	    second_missing >= s->raiddisks && s->level == 6) {
+		insert_point = s->raiddisks - 1;
+		if (insert_point == first_missing)
+			insert_point--;
+		s->sparedisks ++;
+		info.array.active_disks--;
+		missing_disks++;
+	}
+
+	if (s->level <= 0 && first_missing < subdevs * 2) {
+		pr_err("This level does not support missing devices\n");
+		return 1;
+	}
+
+	/* We need to create the device */
+	map_lock(&map);
+	mdfd = create_mddev(mddev, name, c->autof, LOCAL, chosen_name);
+	if (mdfd < 0) {
+		map_unlock(&map);
+		return 1;
+	}
+	/* verify if chosen_name is not in use,
+	 * it could be in conflict with already existing device
+	 * e.g. container, array
+	 */
+	if (strncmp(chosen_name, "/dev/md/", 8) == 0
+	    && map_by_name(&map, chosen_name+8) != NULL) {
+		pr_err("Array name %s is in use already.\n",
+		       chosen_name);
+		close(mdfd);
+		map_unlock(&map);
+		return 1;
+	}
+	mddev = chosen_name;
+
+	vers = md_get_version(mdfd);
+	if (vers < 9000) {
+		pr_err("Create requires md driver version 0.90.0 or later\n");
+		goto abort_locked;
+	} else {
+		mdu_array_info_t inf;
+		memset(&inf, 0, sizeof(inf));
+		ioctl(mdfd, GET_ARRAY_INFO, &inf);
+		if (inf.working_disks != 0) {
+			pr_err("another array by this name is already running.\n");
+			goto abort_locked;
+		}
+	}
+
+	/* Ok, lets try some ioctls */
+
+	info.array.level = s->level;
+	info.array.size = s->size;
+	info.array.raid_disks = s->raiddisks;
+	/* The kernel should *know* what md_minor we are dealing
+	 * with, but it chooses to trust me instead. Sigh
+	 */
+	info.array.md_minor = 0;
+	if (fstat(mdfd, &stb)==0)
+		info.array.md_minor = minor(stb.st_rdev);
+	info.array.not_persistent = 0;
+
+	if ( ( (s->level == 4 || s->level == 5) &&
+	       (insert_point < s->raiddisks || first_missing < s->raiddisks) )
+	     ||
+	     ( s->level == 6 && (insert_point < s->raiddisks
+				 || second_missing < s->raiddisks))
+	     ||
+	     ( s->level <= 0 )
+	     ||
+	     s->assume_clean
+		) {
+		info.array.state = 1; /* clean, but one+ drive will be missing*/
+		info.resync_start = MaxSector;
+	} else {
+		info.array.state = 0; /* not clean, but no errors */
+		info.resync_start = 0;
+	}
+	if (s->level == 10) {
+		/* for raid10, the bitmap size is the capacity of the array,
+		 * which is array.size * raid_disks / ncopies;
+		 * .. but convert to sectors.
+		 */
+		int ncopies = ((s->layout>>8) & 255) * (s->layout & 255);
+		bitmapsize = s->size * s->raiddisks / ncopies * 2;
+/* printf("bms=%llu as=%d rd=%d nc=%d\n", bitmapsize, s->size, s->raiddisks, ncopies);*/
+	} else
+		bitmapsize = s->size * 2;
+
+	/* There is lots of redundancy in these disk counts,
+	 * raid_disks is the most meaningful value
+	 * it describes the geometry of the array
+	 * it is constant
+	 * nr_disks is total number of used slots.
+	 * it should be raid_disks+spare_disks
+	 * spare_disks is the number of extra disks present
+	 * see above
+	 * active_disks is the number of working disks in
+	 * active slots. (With raid_disks)
+	 * working_disks is the total number of working disks,
+	 * including spares
+	 * failed_disks is the number of disks marked failed
+	 *
+	 * Ideally, the kernel would keep these (except raid_disks)
+	 * up-to-date as we ADD_NEW_DISK, but it doesn't (yet).
+	 * So for now, we assume that all raid and spare
+	 * devices will be given.
+	 */
+	info.array.spare_disks=s->sparedisks;
+	info.array.failed_disks=missing_disks;
+	info.array.nr_disks = info.array.working_disks
+		+ info.array.failed_disks;
+	info.array.layout = s->layout;
+	info.array.chunk_size = s->chunk*1024;
+
+	if (name == NULL || *name == 0) {
+		/* base name on mddev */
+		/* /dev/md0 -> 0
+		 * /dev/md_d0 -> d0
+		 * /dev/md_foo -> foo
+		 * /dev/md/1 -> 1
+		 * /dev/md/d1 -> d1
+		 * /dev/md/home -> home
+		 * /dev/mdhome -> home
+		 */
+		/* FIXME compare this with rules in create_mddev */
+		name = strrchr(mddev, '/');
+		if (name) {
+			name++;
+			if (strncmp(name, "md_", 3)==0 &&
+			    strlen(name) > 3 &&
+			    (name-mddev) == 5 /* /dev/ */)
+				name += 3;
+			else if (strncmp(name, "md", 2)==0 &&
+				 strlen(name) > 2 &&
+				 isdigit(name[2]) &&
+				 (name-mddev) == 5 /* /dev/ */)
+				name += 2;
+		}
+	}
+	if (!st->ss->init_super(st, &info.array, s->size, name, c->homehost, uuid,
+				data_offset))
+		goto abort_locked;
+
+	total_slots = info.array.nr_disks;
+	st->ss->getinfo_super(st, &info, NULL);
+	sysfs_init(&info, mdfd, NULL);
+
+	if (did_default && c->verbose >= 0) {
+		if (is_subarray(info.text_version)) {
+			char devnm[32];
+			char *ep;
+			struct mdinfo *mdi;
+
+			strncpy(devnm, info.text_version+1, 32);
+			devnm[31] = 0;
+			ep = strchr(devnm, '/');
+			if (ep)
+				*ep = 0;
+
+			mdi = sysfs_read(-1, devnm, GET_VERSION);
+
+			pr_err("Creating array inside %s container %s\n",
+			       mdi?mdi->text_version:"managed", devnm);
+			sysfs_free(mdi);
+		} else
+			pr_err("Defaulting to version %s metadata\n", info.text_version);
+	}
+
+	map_update(&map, fd2devnm(mdfd), info.text_version,
+		   info.uuid, chosen_name);
+	/* Keep map locked until devices have been added to array
+	 * to stop another mdadm from finding and using those devices.
+	 */
+
+	if (s->bitmap_file && vers < 9003) {
+		major_num = BITMAP_MAJOR_HOSTENDIAN;
+#ifdef __BIG_ENDIAN
+		pr_err("Warning - bitmaps created on this kernel are not portable\n"
+		       " between different architectured. Consider upgrading the Linux kernel.\n");
+#endif
+	}
+
+	if (s->bitmap_file && (strcmp(s->bitmap_file, "internal")==0 ||
+			       strcmp(s->bitmap_file, "clustered")==0)) {
+		if ((vers%100) < 2) {
+			pr_err("internal bitmaps not supported by this kernel.\n");
+			goto abort_locked;
+		}
+		if (!st->ss->add_internal_bitmap) {
+			pr_err("internal bitmaps not supported with %s metadata\n",
+			       st->ss->name);
+			goto abort_locked;
+		}
+		if (!st->ss->add_internal_bitmap(st, &s->bitmap_chunk,
+						 c->delay, s->write_behind,
+						 bitmapsize, 1, major_num)) {
+			pr_err("Given bitmap chunk size not supported.\n");
+			goto abort_locked;
+		}
+		s->bitmap_file = NULL;
+	}
+
+	sysfs_init(&info, mdfd, NULL);
+
+	if (st->ss->external && st->container_devnm[0]) {
+		/* member */
+
+		/* When creating a member, we need to be careful
+		 * to negotiate with mdmon properly.
+		 * If it is already running, we cannot write to
+		 * the devices and must ask it to do that part.
+		 * If it isn't running, we write to the devices,
+		 * and then start it.
+		 * We hold an exclusive open on the container
+		 * device to make sure mdmon doesn't exit after
+		 * we checked that it is running.
+		 *
+		 * For now, fail if it is already running.
+		 */
+		container_fd = open_dev_excl(st->container_devnm);
+		if (container_fd < 0) {
+			pr_err("Cannot get exclusive open on container - weird.\n");
+			goto abort_locked;
+		}
+		if (mdmon_running(st->container_devnm)) {
+			if (c->verbose)
+				pr_err("reusing mdmon for %s.\n",
+				       st->container_devnm);
+			st->update_tail = &st->updates;
+		} else
+			need_mdmon = 1;
+	}
+	rv = set_array_info(mdfd, st, &info);
+	if (rv) {
+		pr_err("failed to set array info for %s: %s\n",
+		       mddev, strerror(errno));
+		goto abort_locked;
+	}
+
+	if (s->bitmap_file) {
+		int uuid[4];
+
+		st->ss->uuid_from_super(st, uuid);
+		if (CreateBitmap(s->bitmap_file, c->force, (char*)uuid, s->bitmap_chunk,
+				 c->delay, s->write_behind,
+				 bitmapsize,
+				 major_num)) {
+			goto abort_locked;
+		}
+		bitmap_fd = open(s->bitmap_file, O_RDWR);
+		if (bitmap_fd < 0) {
+			pr_err("weird: %s cannot be openned\n",
+			       s->bitmap_file);
+			goto abort_locked;
+		}
+		if (ioctl(mdfd, SET_BITMAP_FILE, bitmap_fd) < 0) {
+			pr_err("Cannot set bitmap file for %s: %s\n",
+			       mddev, strerror(errno));
+			goto abort_locked;
+		}
+	}
+
+	infos = xmalloc(sizeof(*infos) * total_slots);
+	enable_fds(total_slots);
+	for (pass=1; pass <=2 ; pass++) {
+		struct mddev_dev *moved_disk = NULL; /* the disk that was moved out of the insert point */
+
+		for (dnum=0, raid_disk_num=0, dv = devlist ; dv ;
+		     dv=(dv->next)?(dv->next):moved_disk, dnum++) {
+			int fd;
+			struct stat stb;
+			struct mdinfo *inf = &infos[dnum];
+
+			if (dnum >= total_slots)
+				abort();
+			if (dnum == insert_point) {
+				raid_disk_num += 1;
+				moved_disk = dv;
+				continue;
+			}
+			if (strcasecmp(dv->devname, "missing")==0) {
+				raid_disk_num += 1;
+				continue;
+			}
+			if (have_container)
+				moved_disk = NULL;
+			if (have_container && dnum < info.array.raid_disks - 1)
+				/* repeatedly use the container */
+				moved_disk = dv;
+
+			switch(pass) {
+			case 1:
+				*inf = info;
+
+				inf->disk.number = dnum;
+				inf->disk.raid_disk = raid_disk_num++;
+
+				if (dv->disposition == 'j') {
+					inf->disk.raid_disk = MD_DISK_ROLE_JOURNAL;
+					inf->disk.state = (1<<MD_DISK_JOURNAL);
+					raid_disk_num--;
+				} else if (inf->disk.raid_disk < s->raiddisks)
+					inf->disk.state = (1<<MD_DISK_ACTIVE) |
+						(1<<MD_DISK_SYNC);
+				else
+					inf->disk.state = 0;
+
+				if (dv->writemostly == 1)
+					inf->disk.state |= (1<<MD_DISK_WRITEMOSTLY);
+
+				if (have_container)
+					fd = -1;
+				else {
+					if (st->ss->external &&
+					    st->container_devnm[0])
+						fd = open(dv->devname, O_RDWR);
+					else
+						fd = open(dv->devname, O_RDWR|O_EXCL);
+
+					if (fd < 0) {
+						pr_err("failed to open %s after earlier success - aborting\n",
+						       dv->devname);
+						goto abort_locked;
+					}
+					/* stb.st_rdev is used below, so a failed
+					 * fstat must not be ignored
+					 */
+					if (fstat(fd, &stb) != 0) {
+						pr_err("cannot stat %s: %s\n",
+						       dv->devname, strerror(errno));
+						close(fd);
+						goto abort_locked;
+					}
+					inf->disk.major = major(stb.st_rdev);
+					inf->disk.minor = minor(stb.st_rdev);
+				}
+				if (fd >= 0)
+					remove_partitions(fd);
+				if (st->ss->add_to_super(st, &inf->disk,
+							 fd, dv->devname,
+							 dv->data_offset)) {
+					ioctl(mdfd, STOP_ARRAY, NULL);
+					goto abort_locked;
+				}
+				st->ss->getinfo_super(st, inf, NULL);
+				safe_mode_delay = inf->safe_mode_delay;
+
+				if (have_container && c->verbose > 0)
+					pr_err("Using %s for device %d\n",
+					       map_dev(inf->disk.major,
+						       inf->disk.minor,
+						       0), dnum);
+
+				if (!have_container) {
+					/* getinfo_super might have lost these ... */
+					inf->disk.major = major(stb.st_rdev);
+					inf->disk.minor = minor(stb.st_rdev);
+				}
+				break;
+			case 2:
+				inf->errors = 0;
+
+				rv = add_disk(mdfd, st, &info, inf);
+
+				if (rv) {
+					pr_err("ADD_NEW_DISK for %s failed: %s\n",
+					       dv->devname, strerror(errno));
+					goto abort_locked;
+				}
+				break;
+			}
+			if (!have_container &&
+			    dv == moved_disk && dnum != insert_point) break;
+		}
+		if (pass == 1) {
+			struct mdinfo info_new;
+			struct map_ent *me = NULL;
+
+			/* check to see if the uuid has changed due to these
+			 * metadata changes, and if so update the member array
+			 * and container uuid. Note ->write_init_super clears
+			 * the subarray cursor such that ->getinfo_super once
+			 * again returns container info.
+			 */
+			st->ss->getinfo_super(st, &info_new, NULL);
+			if (st->ss->external && s->level != LEVEL_CONTAINER &&
+			    !same_uuid(info_new.uuid, info.uuid, 0)) {
+				map_update(&map, fd2devnm(mdfd),
+					   info_new.text_version,
+					   info_new.uuid, chosen_name);
+				me = map_by_devnm(&map, st->container_devnm);
+			}
+
+			if (st->ss->write_init_super(st)) {
+				st->ss->free_super(st);
+				goto abort_locked;
+			}
+
+			/* update parent container uuid */
+			if (me) {
+				char *path = xstrdup(me->path);
+
+				st->ss->getinfo_super(st, &info_new, NULL);
+				map_update(&map, st->container_devnm,
+					   info_new.text_version,
+					   info_new.uuid, path);
+				free(path);
+			}
+
+			flush_metadata_updates(st);
+			st->ss->free_super(st);
+		}
+	}
+	map_unlock(&map);
+	free(infos);
+	infos = NULL;	/* the abort path below frees infos as well */
+
+	if (s->level == LEVEL_CONTAINER) {
+		/* No need to start. But we should signal udev to
+		 * create links */
+		sysfs_uevent(&info, "change");
+		if (c->verbose >= 0)
+			pr_err("container %s prepared.\n", mddev);
+		wait_for(chosen_name, mdfd);
+	} else if (c->runstop == 1 || subdevs >= s->raiddisks) {
+		if (st->ss->external) {
+			int err;
+			switch(s->level) {
+			case LEVEL_LINEAR:
+			case LEVEL_MULTIPATH:
+			case 0:
+				err = sysfs_set_str(&info, NULL, "array_state",
+						    c->readonly
+						    ? "readonly"
+						    : "active");
+				need_mdmon = 0;
+				break;
+			default:
+				err = sysfs_set_str(&info, NULL, "array_state",
+						    "readonly");
+				break;
+			}
+			sysfs_set_safemode(&info, safe_mode_delay);
+			if (err) {
+				pr_err("failed to activate array.\n");
+				ioctl(mdfd, STOP_ARRAY, NULL);
+				goto abort;
+			}
+		} else if (c->readonly &&
+			   sysfs_attribute_available(
+				   &info, NULL, "array_state")) {
+			if (sysfs_set_str(&info, NULL,
+					  "array_state", "readonly") < 0) {
+				pr_err("Failed to start array: %s\n",
+				       strerror(errno));
+				ioctl(mdfd, STOP_ARRAY, NULL);
+				goto abort;
+			}
+		} else {
+			/* param is not actually used */
+			mdu_param_t param;
+			if (ioctl(mdfd, RUN_ARRAY, &param)) {
+				pr_err("RUN_ARRAY failed: %s\n",
+				       strerror(errno));
+				if (info.array.chunk_size & (info.array.chunk_size-1)) {
+					cont_err("Problem may be that chunk size is not a power of 2\n");
+				}
+				ioctl(mdfd, STOP_ARRAY, NULL);
+				goto abort;
+			}
+			/* if start_ro module parameter is set, array is
+			 * auto-read-only, which is bad as the resync won't
+			 * start. So lets make it read-write now.
+			 */
+			ioctl(mdfd, RESTART_ARRAY_RW, NULL);
+		}
+		if (c->verbose >= 0)
+			pr_err("array %s started.\n", mddev);
+		if (st->ss->external && st->container_devnm[0]) {
+			if (need_mdmon)
+				start_mdmon(st->container_devnm);
+
+			ping_monitor(st->container_devnm);
+			close(container_fd);
+		}
+		wait_for(chosen_name, mdfd);
+	} else {
+		pr_err("not starting array - not enough devices.\n");
+	}
+	close(mdfd);
+	return 0;
+
+ abort:
+	map_lock(&map);
+ abort_locked:
+	map_remove(&map, fd2devnm(mdfd));
+	map_unlock(&map);
+
+	/* release whatever this call still owns; each was initialised to
+	 * NULL / -1 so this is safe however early the abort happened
+	 */
+	free(infos);
+	if (bitmap_fd >= 0)
+		close(bitmap_fd);
+	if (container_fd >= 0)
+		close(container_fd);
+	if (mdfd >= 0)
+		close(mdfd);
+	return 1;
+}
diff --git a/Detail.c b/Detail.c
new file mode 100644
index 0000000..0cfccad
--- /dev/null
+++ b/Detail.c
@@ -0,0 +1,768 @@
+/*
+ * mdadm - manage Linux "md" devices aka RAID arrays.
+ *
+ * Copyright (C) 2001-2013 Neil Brown <neilb@suse.de>
+ *
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * Author: Neil Brown
+ * Email: <neilb@suse.de>
+ */
+
+#include "mdadm.h"
+#include "md_p.h"
+#include "md_u.h"
+#include <dirent.h>
+
+/* qsort() comparator for an array of C strings: each argument points at
+ * one 'char *' element and the two strings are ordered with strcmp().
+ */
+static int cmpstringp(const void *p1, const void *p2)
+{
+	char * const *lhs = p1;
+	char * const *rhs = p2;
+
+	return strcmp(*lhs, *rhs);
+}
+
+/* Store a duplicate of 'dev' at index 'n_devices' of the string array
+ * *p_devices, enlarging the array by 16 entries whenever the new count
+ * would reach the current capacity *p_max_devices.
+ * Returns the new number of stored names, or 0 (with the capacity reset
+ * to 0) if the reallocation hands back NULL.
+ */
+static int add_device(const char *dev, char ***p_devices,
+		      int *p_max_devices, int n_devices)
+{
+	const int new_count = n_devices + 1;
+
+	if (new_count >= *p_max_devices) {
+		char **grown;
+
+		*p_max_devices += 16;
+		grown = xrealloc(*p_devices,
+				 *p_max_devices * sizeof(**p_devices));
+		*p_devices = grown;
+		if (grown == NULL) {
+			*p_max_devices = 0;
+			return 0;
+		}
+	}
+
+	(*p_devices)[n_devices] = xstrdup(dev);
+	return new_count;
+}
+
+int Detail(char *dev, struct context *c)
+{
+ /*
+ * Print out details for an md array by using
+ * GET_ARRAY_INFO and GET_DISK_INFO ioctl calls
+ *
+ * dev: path of the md device node to open and describe.
+ * c:   context; the fields read here are test, export, brief,
+ *      verbose, prefer and homehost.
+ *
+ * Output goes to stdout in one of three shapes: KEY=value lines
+ * (c->export), a single "ARRAY ..." line (c->brief), or the full
+ * human readable report.
+ *
+ * Returns 0 on success.  If the device cannot be opened or queried
+ * the result is 1, or 4 when c->test is set.  Outside export mode,
+ * with c->test set, bit 0 is raised when some raid slot has no
+ * in-sync device and the result is 2 when enough() reports that
+ * the array does not have enough devices.
+ */
+
+ int fd = open(dev, O_RDONLY);
+ int vers;
+ mdu_array_info_t array;
+ mdu_disk_info_t *disks;
+ int next;
+ int d;
+ time_t atime;
+ char *str;
+ char **devices = NULL;
+ int max_devices = 0, n_devices = 0;
+ int spares = 0;
+ struct stat stb;
+ int is_26 = get_linux_version() >= 2006000;
+ int is_rebuilding = 0;
+ int failed = 0;
+ struct supertype *st;
+ char *subarray = NULL;
+ int max_disks = MD_SB_DISKS; /* just a default */
+ struct mdinfo *info = NULL;
+ struct mdinfo *sra;
+ struct mdinfo *subdev;
+ char *member = NULL;
+ char *container = NULL;
+
+ int rv = c->test ? 4 : 1;
+ int avail_disks = 0;
+ char *avail = NULL;
+ int external;
+ int inactive;
+
+ if (fd < 0) {
+ pr_err("cannot open %s: %s\n",
+ dev, strerror(errno));
+ return rv;
+ }
+ vers = md_get_version(fd);
+ if (vers < 0) {
+ pr_err("%s does not appear to be an md device\n",
+ dev);
+ close(fd);
+ return rv;
+ }
+ if (vers < 9000) {
+ pr_err("cannot get detail for md device %s: driver version too old.\n",
+ dev);
+ close(fd);
+ return rv;
+ }
+ sra = sysfs_read(fd, NULL, GET_VERSION|GET_DEVS);
+ /* 'external' is set when sysfs reports metadata version -1.-2 */
+ external = (sra != NULL && sra->array.major_version == -1
+ && sra->array.minor_version == -2);
+ st = super_by_fd(fd, &subarray);
+ if (ioctl(fd, GET_ARRAY_INFO, &array) == 0) {
+ inactive = 0;
+ } else if (errno == ENODEV && sra) {
+ /* ioctl says ENODEV but sysfs has data: report the
+ * array as inactive using the sysfs copy of the info.
+ */
+ array = sra->array;
+ inactive = 1;
+ } else {
+ pr_err("cannot get array detail for %s: %s\n",
+ dev, strerror(errno));
+ close(fd);
+ return rv;
+ }
+
+ /* Clear st_rdev when fstat() fails OR the fd is not a block
+ * device.  (With '&&' the S_ISBLK test was only evaluated on an
+ * stb that fstat() had failed to fill in, and a successful
+ * fstat() of a non-block fd was never caught.)
+ */
+ if (fstat(fd, &stb) != 0 || !S_ISBLK(stb.st_mode))
+ stb.st_rdev = 0;
+ rv = 0;
+
+ if (st)
+ max_disks = st->max_devs;
+
+ if (subarray) {
+ /* This is a subarray of some container.
+ * We want the name of the container, and the member
+ */
+ int devid = devnm2devid(st->container_devnm);
+ int cfd, err;
+
+ member = subarray;
+ container = map_dev_preferred(major(devid), minor(devid),
+ 1, c->prefer);
+ cfd = open_dev(st->container_devnm);
+ if (cfd >= 0) {
+ err = st->ss->load_container(st, cfd, NULL);
+ close(cfd);
+ if (err == 0)
+ info = st->ss->container_content(st, subarray);
+ }
+ }
+
+ /* try to load a superblock. Try sra->devs first, then try ioctl */
+ /* The loop walks the sysfs device list while 'subdev' is non-NULL
+ * and afterwards counts 'd' up to max_disks, using GET_DISK_INFO.
+ */
+ if (st && !info) for (d = 0, subdev = sra ? sra->devs : NULL;
+ d < max_disks || subdev;
+ subdev ? (void)(subdev = subdev->next) : (void)(d++)){
+ mdu_disk_info_t disk;
+ char *dv;
+ int fd2;
+ int err;
+ if (subdev)
+ disk = subdev->disk;
+ else {
+ disk.number = d;
+ if (ioctl(fd, GET_DISK_INFO, &disk) < 0)
+ continue;
+ if (d >= array.raid_disks &&
+ disk.major == 0 &&
+ disk.minor == 0)
+ continue;
+ }
+
+ if (array.raid_disks > 0 &&
+ (disk.state & (1 << MD_DISK_ACTIVE)) == 0)
+ continue;
+
+ dv = map_dev(disk.major, disk.minor, 1);
+ if (!dv)
+ continue;
+
+ fd2 = dev_open(dv, O_RDONLY);
+ if (fd2 < 0)
+ continue;
+
+ if (st->sb)
+ st->ss->free_super(st);
+
+ err = st->ss->load_super(st, fd2, NULL);
+ close(fd2);
+ if (err)
+ continue;
+ if (info)
+ free(info);
+ if (subarray)
+ info = st->ss->container_content(st, subarray);
+ else {
+ info = xmalloc(sizeof(*info));
+ st->ss->getinfo_super(st, info, NULL);
+ }
+ if (!info)
+ continue;
+
+ if (array.raid_disks != 0 && /* container */
+ (info->array.ctime != array.ctime ||
+ info->array.level != array.level)) {
+ st->ss->free_super(st);
+ continue;
+ }
+ /* some formats (imsm) have free-floating-spares
+ * with a uuid of uuid_zero, they don't
+ * have very good info about the rest of the
+ * container, so keep searching when
+ * encountering such a device. Otherwise, stop
+ * after the first successful call to
+ * ->load_super.
+ */
+ if (memcmp(uuid_zero,
+ info->uuid,
+ sizeof(uuid_zero)) == 0) {
+ st->ss->free_super(st);
+ continue;
+ }
+ break;
+ }
+
+ /* Ok, we have some info to print... */
+ str = map_num(pers, array.level);
+
+ if (c->export) {
+ if (array.raid_disks) {
+ if (str)
+ printf("MD_LEVEL=%s\n", str);
+ printf("MD_DEVICES=%d\n", array.raid_disks);
+ } else {
+ if (!inactive)
+ printf("MD_LEVEL=container\n");
+ printf("MD_DEVICES=%d\n", array.nr_disks);
+ }
+ if (container) {
+ printf("MD_CONTAINER=%s\n", container);
+ printf("MD_MEMBER=%s\n", member);
+ } else {
+ if (sra && sra->array.major_version < 0)
+ printf("MD_METADATA=%s\n", sra->text_version);
+ else
+ printf("MD_METADATA=%d.%d\n",
+ array.major_version, array.minor_version);
+ }
+
+ if (st && st->sb && info) {
+ char nbuf[64];
+ struct map_ent *mp, *map = NULL;
+
+ fname_from_uuid(st, info, nbuf, ':');
+ printf("MD_UUID=%s\n", nbuf+5);
+ mp = map_by_uuid(&map, info->uuid);
+ if (mp && mp->path &&
+ strncmp(mp->path, "/dev/md/", 8) == 0) {
+ printf("MD_DEVNAME=");
+ print_escape(mp->path+8);
+ putchar('\n');
+ }
+
+ if (st->ss->export_detail_super)
+ st->ss->export_detail_super(st);
+ } else {
+ struct map_ent *mp, *map = NULL;
+ char nbuf[64];
+ mp = map_by_devnm(&map, fd2devnm(fd));
+ if (mp) {
+ __fname_from_uuid(mp->uuid, 0, nbuf, ':');
+ printf("MD_UUID=%s\n", nbuf+5);
+ }
+ if (mp && mp->path &&
+ strncmp(mp->path, "/dev/md/", 8) == 0) {
+ printf("MD_DEVNAME=");
+ print_escape(mp->path+8);
+ putchar('\n');
+ }
+ }
+ if (sra) {
+ struct mdinfo *mdi;
+ for (mdi = sra->devs; mdi; mdi = mdi->next) {
+ char *path =
+ map_dev(mdi->disk.major,
+ mdi->disk.minor, 0);
+
+ if (mdi->disk.raid_disk >= 0)
+ printf("MD_DEVICE_%s_ROLE=%d\n",
+ mdi->sys_name+4,
+ mdi->disk.raid_disk);
+ else
+ printf("MD_DEVICE_%s_ROLE=spare\n",
+ mdi->sys_name+4);
+ if (path)
+ printf("MD_DEVICE_%s_DEV=%s\n",
+ mdi->sys_name+4, path);
+ }
+ }
+ /* the export path skips the free(info) further down,
+ * so release it here before leaving.
+ */
+ free(info);
+ goto out;
+ }
+
+ /* disks[] holds two slots per raid device (2*n and 2*n+1);
+ * devices that fit neither slot are appended from index
+ * raid_disks*2 onwards via 'next'.
+ */
+ disks = xmalloc(max_disks * 2 * sizeof(mdu_disk_info_t));
+ for (d = 0; d < max_disks * 2; d++) {
+ disks[d].state = (1<<MD_DISK_REMOVED);
+ disks[d].major = disks[d].minor = 0;
+ disks[d].number = -1;
+ disks[d].raid_disk = d/2;
+ }
+
+ next = array.raid_disks*2;
+ if (inactive) {
+ struct mdinfo *mdi;
+ if (sra != NULL)
+ /* stop at the end of disks[], as the active path does */
+ for (mdi = sra->devs; mdi && next < max_disks*2; mdi = mdi->next) {
+ disks[next++] = mdi->disk;
+ disks[next-1].number = -1;
+ }
+ } else for (d = 0; d < max_disks; d++) {
+ mdu_disk_info_t disk;
+ disk.number = d;
+ if (ioctl(fd, GET_DISK_INFO, &disk) < 0) {
+ if (d < array.raid_disks)
+ pr_err("cannot get device detail for device %d: %s\n",
+ d, strerror(errno));
+ continue;
+ }
+ if (disk.major == 0 && disk.minor == 0)
+ continue;
+ if (disk.raid_disk >= 0 && disk.raid_disk < array.raid_disks
+ && disks[disk.raid_disk*2].state == (1<<MD_DISK_REMOVED))
+ disks[disk.raid_disk*2] = disk;
+ else if (disk.raid_disk >= 0 && disk.raid_disk < array.raid_disks
+ && disks[disk.raid_disk*2+1].state == (1<<MD_DISK_REMOVED)
+ && !(disk.state & (1<<MD_DISK_JOURNAL)))
+ disks[disk.raid_disk*2+1] = disk;
+ else if (next < max_disks*2)
+ disks[next++] = disk;
+ }
+
+ /* avail[d] is set when either slot of raid device d is in sync */
+ avail = xcalloc(array.raid_disks, 1);
+
+ for (d= 0; d < array.raid_disks; d++) {
+
+ if ((disks[d*2].state & (1<<MD_DISK_SYNC)) ||
+ (disks[d*2+1].state & (1<<MD_DISK_SYNC))) {
+ avail_disks ++;
+ avail[d] = 1;
+ } else
+ rv |= !! c->test;
+ }
+
+ if (c->brief) {
+ mdu_bitmap_file_t bmf;
+ printf("%sARRAY %s", inactive ? "INACTIVE-":"", dev);
+ if (c->verbose > 0) {
+ if (array.raid_disks)
+ printf(" level=%s num-devices=%d",
+ str?str:"-unknown-",
+ array.raid_disks );
+ else if (!inactive)
+ printf(" level=container num-devices=%d",
+ array.nr_disks);
+ else
+ printf(" num-devices=%d", array.nr_disks);
+ }
+ if (container) {
+ printf(" container=%s", container);
+ printf(" member=%s", member);
+ } else {
+ if (sra && sra->array.major_version < 0)
+ printf(" metadata=%s", sra->text_version);
+ else
+ printf(" metadata=%d.%d",
+ array.major_version, array.minor_version);
+ }
+
+ /* Only try GET_BITMAP_FILE for 0.90.01 and later */
+ if (vers >= 9001 &&
+ ioctl(fd, GET_BITMAP_FILE, &bmf) == 0 &&
+ bmf.pathname[0]) {
+ printf(" bitmap=%s", bmf.pathname);
+ }
+ } else {
+ mdu_bitmap_file_t bmf;
+ unsigned long long larray_size;
+ struct mdstat_ent *ms = mdstat_read(0, 0);
+ struct mdstat_ent *e;
+ char *devnm;
+
+ /* 'e' ends up as the mdstat entry for this device, or NULL */
+ devnm = stat2devnm(&stb);
+ for (e=ms; e; e=e->next)
+ if (strcmp(e->devnm, devnm) == 0)
+ break;
+ if (!get_dev_size(fd, NULL, &larray_size))
+ larray_size = 0;
+
+ printf("%s:\n", dev);
+
+ if (container)
+ printf(" Container : %s, member %s\n", container, member);
+ else {
+ if (sra && sra->array.major_version < 0)
+ printf(" Version : %s\n", sra->text_version);
+ else
+ printf(" Version : %d.%d\n",
+ array.major_version, array.minor_version);
+ }
+
+ atime = array.ctime;
+ if (atime)
+ printf(" Creation Time : %.24s\n", ctime(&atime));
+ if (array.raid_disks == 0 && external)
+ str = "container";
+ if (str)
+ printf(" Raid Level : %s\n", str);
+ if (larray_size)
+ printf(" Array Size : %llu%s\n", (larray_size>>10),
+ human_size(larray_size));
+ if (array.level >= 1) {
+ if (sra)
+ array.major_version = sra->array.major_version;
+ if (array.major_version != 0 &&
+ (larray_size >= 0xFFFFFFFFULL|| array.size == 0)) {
+ unsigned long long dsize = get_component_size(fd);
+ if (dsize > 0)
+ printf(" Used Dev Size : %llu%s\n",
+ dsize/2,
+ human_size((long long)dsize<<9));
+ else
+ printf(" Used Dev Size : unknown\n");
+ } else
+ printf(" Used Dev Size : %lu%s\n",
+ (unsigned long)array.size,
+ human_size((unsigned long long)array.size<<10));
+ }
+ if (array.raid_disks)
+ printf(" Raid Devices : %d\n", array.raid_disks);
+ printf(" Total Devices : %d\n", array.nr_disks);
+ if (!container &&
+ ((sra == NULL && array.major_version == 0) ||
+ (sra && sra->array.major_version == 0)))
+ printf("Preferred Minor : %d\n", array.md_minor);
+ if (sra == NULL || sra->array.major_version >= 0)
+ printf(" Persistence : Superblock is %spersistent\n",
+ array.not_persistent?"not ":"");
+ printf("\n");
+ /* Only try GET_BITMAP_FILE for 0.90.01 and later */
+ if (vers >= 9001 &&
+ ioctl(fd, GET_BITMAP_FILE, &bmf) == 0 &&
+ bmf.pathname[0]) {
+ printf(" Intent Bitmap : %s\n", bmf.pathname);
+ printf("\n");
+ } else if (array.state & (1<<MD_SB_BITMAP_PRESENT))
+ printf(" Intent Bitmap : Internal\n\n");
+ atime = array.utime;
+ if (atime)
+ printf(" Update Time : %.24s\n", ctime(&atime));
+ if (array.raid_disks) {
+ static char *sync_action[] = {
+ ", recovering", ", resyncing",
+ ", reshaping", ", checking" };
+ char *st;
+ if (avail_disks == array.raid_disks)
+ st = "";
+ else if (!enough(array.level, array.raid_disks,
+ array.layout, 1, avail))
+ st = ", FAILED";
+ else
+ st = ", degraded";
+
+ printf(" State : %s%s%s%s%s%s \n",
+ (array.state&(1<<MD_SB_CLEAN))?"clean":"active", st,
+ (!e || (e->percent < 0 && e->percent != RESYNC_PENDING &&
+ e->percent != RESYNC_DELAYED)) ? "" : sync_action[e->resync],
+ larray_size ? "": ", Not Started",
+ (e && e->percent == RESYNC_DELAYED) ? " (DELAYED)": "",
+ (e && e->percent == RESYNC_PENDING) ? " (PENDING)": "");
+ } else if (inactive) {
+ printf(" State : inactive\n");
+ }
+ if (array.raid_disks)
+ printf(" Active Devices : %d\n", array.active_disks);
+ if (array.working_disks > 0)
+ printf("Working Devices : %d\n", array.working_disks);
+ if (array.raid_disks) {
+ printf(" Failed Devices : %d\n", array.failed_disks);
+ printf(" Spare Devices : %d\n", array.spare_disks);
+ }
+ printf("\n");
+ if (array.level == 5) {
+ str = map_num(r5layout, array.layout);
+ printf(" Layout : %s\n", str?str:"-unknown-");
+ }
+ if (array.level == 6) {
+ str = map_num(r6layout, array.layout);
+ printf(" Layout : %s\n", str?str:"-unknown-");
+ }
+ if (array.level == 10) {
+ printf(" Layout :");
+ print_r10_layout(array.layout);
+ printf("\n");
+ }
+ switch (array.level) {
+ case 0:
+ case 4:
+ case 5:
+ case 10:
+ case 6:
+ if (array.chunk_size)
+ printf(" Chunk Size : %dK\n\n",
+ array.chunk_size/1024);
+ break;
+ case -1:
+ printf(" Rounding : %dK\n\n", array.chunk_size/1024);
+ break;
+ default: break;
+ }
+
+ if (e && e->percent >= 0) {
+ static char *sync_action[] = {
+ "Rebuild", "Resync",
+ "Reshape", "Check"};
+ printf(" %7s Status : %d%% complete\n", sync_action[e->resync], e->percent);
+ is_rebuilding = 1;
+ }
+ free_mdstat(ms);
+
+ if ((st && st->sb) && (info && info->reshape_active)) {
+#if 0
+This is pretty boring
+ printf(" Reshape pos'n : %llu%s\n", (unsigned long long) info->reshape_progress<<9,
+ human_size((unsigned long long)info->reshape_progress<<9));
+#endif
+ if (info->delta_disks != 0)
+ printf(" Delta Devices : %d, (%d->%d)\n",
+ info->delta_disks,
+ array.raid_disks - info->delta_disks,
+ array.raid_disks);
+ if (info->new_level != array.level) {
+ str = map_num(pers, info->new_level);
+ printf(" New Level : %s\n", str?str:"-unknown-");
+ }
+ if (info->new_level != array.level ||
+ info->new_layout != array.layout) {
+ if (info->new_level == 5) {
+ str = map_num(r5layout, info->new_layout);
+ printf(" New Layout : %s\n",
+ str?str:"-unknown-");
+ }
+ if (info->new_level == 6) {
+ str = map_num(r6layout, info->new_layout);
+ printf(" New Layout : %s\n",
+ str?str:"-unknown-");
+ }
+ if (info->new_level == 10) {
+ printf(" New Layout : near=%d, %s=%d\n",
+ info->new_layout&255,
+ (info->new_layout&0x10000)?"offset":"far",
+ (info->new_layout>>8)&255);
+ }
+ }
+ if (info->new_chunk != array.chunk_size)
+ printf(" New Chunksize : %dK\n", info->new_chunk/1024);
+ printf("\n");
+ } else if (e && e->percent >= 0)
+ printf("\n");
+ if (st && st->sb)
+ st->ss->detail_super(st, c->homehost);
+
+ if (array.raid_disks == 0 && sra && sra->array.major_version == -1
+ && sra->array.minor_version == -2 && sra->text_version[0] != '/') {
+ /* This looks like a container. Find any active arrays
+ * That claim to be a member.
+ */
+ DIR *dir = opendir("/sys/block");
+ struct dirent *de;
+
+ printf(" Member Arrays :");
+
+ while (dir && (de = readdir(dir)) != NULL) {
+ char path[200];
+ char vbuf[1024];
+ int nlen = strlen(sra->sys_name);
+ int devid;
+ if (de->d_name[0] == '.')
+ continue;
+ sprintf(path, "/sys/block/%s/md/metadata_version",
+ de->d_name);
+ if (load_sys(path, vbuf) < 0)
+ continue;
+ if (strncmp(vbuf, "external:", 9) != 0 ||
+ !is_subarray(vbuf+9) ||
+ strncmp(vbuf+10, sra->sys_name, nlen) != 0 ||
+ vbuf[10+nlen] != '/')
+ continue;
+ devid = devnm2devid(de->d_name);
+ printf(" %s", map_dev_preferred(
+ major(devid),
+ minor(devid), 1, c->prefer));
+ }
+ if (dir)
+ closedir(dir);
+ printf("\n\n");
+ }
+
+ if (array.raid_disks)
+ printf(" Number Major Minor RaidDevice State\n");
+ else
+ printf(" Number Major Minor RaidDevice\n");
+ }
+ free(info);
+
+ /* One output row (or one collected device name in brief mode)
+ * per populated entry of disks[].
+ */
+ for (d= 0; d < max_disks * 2; d++) {
+ char *dv;
+ mdu_disk_info_t disk = disks[d];
+
+ if (d >= array.raid_disks*2 &&
+ disk.major == 0 &&
+ disk.minor == 0)
+ continue;
+ if ((d & 1) &&
+ disk.major == 0 &&
+ disk.minor == 0)
+ continue;
+ if (!c->brief) {
+ if (d == array.raid_disks*2) printf("\n");
+ if (disk.number < 0 && disk.raid_disk < 0)
+ printf(" - %5d %5d - ",
+ disk.major, disk.minor);
+ else if (disk.raid_disk < 0 || disk.state & (1<<MD_DISK_JOURNAL))
+ printf(" %5d %5d %5d - ",
+ disk.number, disk.major, disk.minor);
+ else if (disk.number < 0)
+ printf(" - %5d %5d %5d ",
+ disk.major, disk.minor, disk.raid_disk);
+ else
+ printf(" %5d %5d %5d %5d ",
+ disk.number, disk.major, disk.minor, disk.raid_disk);
+ }
+ if (!c->brief && array.raid_disks) {
+
+ if (disk.state & (1<<MD_DISK_FAULTY)) {
+ printf(" faulty");
+ if (disk.raid_disk < array.raid_disks &&
+ disk.raid_disk >= 0)
+ failed++;
+ }
+ if (disk.state & (1<<MD_DISK_ACTIVE)) printf(" active");
+ if (disk.state & (1<<MD_DISK_SYNC)) {
+ printf(" sync");
+ if (array.level == 10 && (array.layout & ~0x1FFFF) == 0) {
+ int nc = array.layout & 0xff;
+ int fc = (array.layout >> 8) & 0xff;
+ int copies = nc*fc;
+ if (fc == 1 && array.raid_disks % copies == 0 && copies <= 26) {
+ /* We can divide the devices into 'sets' */
+ int set = disk.raid_disk % copies;
+ printf(" set-%c", set + 'A');
+ }
+ }
+ }
+ if (disk.state & (1<<MD_DISK_REMOVED)) printf(" removed");
+ if (disk.state & (1<<MD_DISK_WRITEMOSTLY)) printf(" writemostly");
+ if (disk.state & (1<<MD_DISK_JOURNAL)) printf(" journal");
+ if ((disk.state &
+ ((1<<MD_DISK_ACTIVE)|(1<<MD_DISK_SYNC)
+ |(1<<MD_DISK_REMOVED)|(1<<MD_DISK_FAULTY)|(1<<MD_DISK_JOURNAL)))
+ == 0) {
+ printf(" spare");
+ if (is_26) {
+ if (disk.raid_disk < array.raid_disks && disk.raid_disk >= 0)
+ printf(" rebuilding");
+ } else if (is_rebuilding && failed) {
+ /* Taking a bit of a risk here, we remove the
+ * device from the array, and then put it back.
+ * If this fails, we are rebuilding
+ */
+ int err = ioctl(fd, HOT_REMOVE_DISK, makedev(disk.major, disk.minor));
+ if (err == 0) ioctl(fd, HOT_ADD_DISK, makedev(disk.major, disk.minor));
+ if (err && errno == EBUSY)
+ printf(" rebuilding");
+ }
+ }
+ }
+ if (disk.state == 0) spares++;
+ dv=map_dev_preferred(disk.major, disk.minor, 0, c->prefer);
+ if (dv != NULL) {
+ if (c->brief)
+ n_devices = add_device(dv, &devices,
+ &max_devices,
+ n_devices);
+ else
+ printf(" %s", dv);
+ }
+ if (!c->brief) printf("\n");
+ }
+ if (spares && c->brief && array.raid_disks) printf(" spares=%d", spares);
+ if (c->brief && st && st->sb)
+ st->ss->brief_detail_super(st);
+ if (st)
+ st->ss->free_super(st);
+
+ if (c->brief && c->verbose > 0 && devices) {
+ qsort(devices, n_devices, sizeof(*devices), cmpstringp);
+ printf("\n devices=%s", devices[0]);
+ for (d = 1; d < n_devices; d++)
+ printf(",%s", devices[d]);
+ }
+ if (c->brief)
+ printf("\n");
+ if (c->test &&
+ !enough(array.level, array.raid_disks, array.layout,
+ 1, avail))
+ rv = 2;
+
+ free(disks);
+out:
+ close(fd);
+ free(subarray);
+ free(avail);
+ for (d = 0; d < n_devices; d++)
+ free(devices[d]);
+ free(devices);
+ sysfs_free(sra);
+ return rv;
+}
+
+int Detail_Platform(struct superswitch *ss, int scan, int verbose, int export, char *controller_path)
+{
+	/* Report platform capabilities for metadata handler 'ss' (if one
+	 * is given) and, when 'scan' is set, for every other handler in
+	 * superlist[] as well.  'export' selects the handler's
+	 * export_detail_platform method where it exists.
+	 * Without 'scan' the result is the handler's own return value
+	 * (1 if nothing was run); with 'scan' it is the OR of the values
+	 * returned while walking superlist[].
+	 */
+	int result = 1;
+	int idx;
+
+	if (ss) {
+		if (export && ss->export_detail_platform)
+			result = ss->export_detail_platform(verbose,
+							    controller_path);
+		else if (ss->detail_platform)
+			result = ss->detail_platform(verbose, 0,
+						     controller_path);
+		else if (verbose > 0)
+			pr_err("%s metadata is platform independent\n",
+			       ss->name ? : "[no name]");
+	} else if (!scan && verbose > 0)
+		pr_err("specify a metadata type or --scan\n");
+
+	if (!scan)
+		return result;
+
+	/* scanning: start afresh and accumulate over all other formats */
+	result = 0;
+	for (idx = 0; superlist[idx]; idx++) {
+		struct superswitch *meta = superlist[idx];
+
+		/* the explicitly requested handler was dealt with above */
+		if (meta == ss)
+			continue;
+
+		if (verbose > 0)
+			pr_err("checking metadata %s\n",
+			       meta->name ? : "[no name]");
+
+		if (!meta->detail_platform) {
+			if (verbose > 0)
+				pr_err("%s metadata is platform independent\n",
+				       meta->name ? : "[no name]");
+			continue;
+		}
+
+		if (export && meta->export_detail_platform)
+			result |= meta->export_detail_platform(verbose,
+							       controller_path);
+		else
+			result |= meta->detail_platform(verbose, 0,
+							controller_path);
+	}
+
+	return result;
+}
diff --git a/Dump.c b/Dump.c
new file mode 100644
index 0000000..7bdbf6f
--- /dev/null
+++ b/Dump.c
@@ -0,0 +1,311 @@
+/*
+ * mdadm - manage Linux "md" devices aka RAID arrays.
+ *
+ * Copyright (C) 2013 Neil Brown <neilb@suse.de>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+
+ * Author: Neil Brown
+ * Email: <neilb@suse.de>
+ */
+
+#include "mdadm.h"
+#include <sys/dir.h>
+
+int Dump_metadata(char *dev, char *dir, struct context *c,
+		  struct supertype *st)
+{
+	/* Save the metadata of 'dev' in directory 'dir'.
+	 * A new file named after the basename of 'dev' is created there,
+	 * sized with ftruncate() to the size of 'dev', and the metadata
+	 * handler copies the metadata into it.
+	 * Afterwards, for a block device, every entry of /dev/disk/by-id
+	 * that resolves to the same device gets a hard link of the same
+	 * name in 'dir'; a link that cannot be made is reported but does
+	 * not change the result.
+	 * Returns 16 if 'dir' is not an existing directory, 1 on any
+	 * other failure and 0 on success.
+	 */
+	struct stat dirstat, devstat;
+	struct dirent *ent;
+	unsigned long long devsize;
+	const char *leaf;
+	char *dumpname = NULL;
+	DIR *byid;
+	int devfd;
+	int dumpfd = -1;
+	int remove_dump = 0;
+
+	if (stat(dir, &dirstat) != 0 || !S_ISDIR(dirstat.st_mode)) {
+		pr_err("--dump requires an existing directory, not: %s\n",
+		       dir);
+		return 16;
+	}
+
+	devfd = dev_open(dev, O_RDONLY);
+	if (devfd < 0) {
+		pr_err("Cannot open %s to dump metadata: %s\n",
+		       dev, strerror(errno));
+		return 1;
+	}
+	if (!get_dev_size(devfd, dev, &devsize))
+		goto fail;
+
+	if (!st)
+		st = guess_super_type(devfd, guess_array);
+	if (!st) {
+		pr_err("Cannot find RAID metadata on %s\n", dev);
+		goto fail;
+	}
+
+	st->ignore_hw_compat = 1;
+	if (st->ss->load_super(st, devfd, NULL) != 0) {
+		pr_err("No %s metadata found on %s\n",
+		       st->ss->name, dev);
+		goto fail;
+	}
+	if (!st->ss->copy_metadata) {
+		pr_err("%s metadata on %s cannot be copied\n",
+		       st->ss->name, dev);
+		goto fail;
+	}
+
+	/* name the dump after the last path component of 'dev' */
+	leaf = strrchr(dev, '/');
+	leaf = leaf ? leaf + 1 : dev;
+	xasprintf(&dumpname, "%s/%s", dir, leaf);
+
+	dumpfd = open(dumpname, O_RDWR|O_CREAT|O_EXCL, 0666);
+	if (dumpfd < 0) {
+		pr_err("Cannot create dump file %s: %s\n",
+		       dumpname, strerror(errno));
+		goto fail;
+	}
+	if (ftruncate(dumpfd, devsize) < 0) {
+		pr_err("failed to set size of dump file: %s\n",
+		       strerror(errno));
+		goto fail;
+	}
+	if (st->ss->copy_metadata(st, devfd, dumpfd) != 0) {
+		pr_err("Failed to copy metadata from %s to %s\n",
+		       dev, dumpname);
+		/* only a failed copy removes the file again */
+		remove_dump = 1;
+		goto fail;
+	}
+
+	if (c->verbose >= 0)
+		printf("%s saved as %s.\n", dev, dumpname);
+	fstat(devfd, &devstat);
+	close(devfd);
+	close(dumpfd);
+
+	/* mostly done: the by-id aliases only apply to block devices */
+	byid = S_ISBLK(devstat.st_mode) ? opendir("/dev/disk/by-id") : NULL;
+	if (!byid) {
+		free(dumpname);
+		return 0;
+	}
+
+	while ((ent = readdir(byid)) != NULL) {
+		struct stat aliasstat;
+		char *path = NULL;
+		int same_dev;
+
+		if (ent->d_name[0] == '.')
+			continue;
+
+		xasprintf(&path, "/dev/disk/by-id/%s", ent->d_name);
+		same_dev = stat(path, &aliasstat) == 0 &&
+			   S_ISBLK(aliasstat.st_mode) &&
+			   aliasstat.st_rdev == devstat.st_rdev;
+		free(path);
+		if (!same_dev)
+			continue;
+
+		path = NULL;
+		xasprintf(&path, "%s/%s", dir, ent->d_name);
+		if (link(dumpname, path) != 0)
+			pr_err("Could not save %s as %s!!\n",
+			       dev, path);
+		else if (c->verbose >= 0)
+			printf("%s also saved as %s.\n",
+			       dev, path);
+		free(path);
+	}
+	closedir(byid);
+	free(dumpname);
+	return 0;
+
+fail:
+	close(devfd);
+	if (dumpfd >= 0)
+		close(dumpfd);
+	if (remove_dump)
+		unlink(dumpname);
+	free(dumpname);
+	return 1;
+}
+
+int Restore_metadata(char *dev, char *dir, struct context *c,
+		     struct supertype *st, int only)
+{
+	/* If 'dir' really is a directory we choose a name
+	 * from it that matches a suitable name in /dev/disk/by-id,
+	 * and copy metadata from the file to the device.
+	 * If two names from by-id match and aren't both the same
+	 * inode, we fail.  If none match and basename of 'dev'
+	 * can be found in dir, use that.
+	 * If 'dir' is really a file then it is only permitted if
+	 * 'only' is set (meaning there was only one device given)
+	 * and the metadata is restored irrespective of file names.
+	 *
+	 * The chosen file must be exactly as large as 'dev'.
+	 *
+	 * Returns 16 if 'dir' does not exist, or is not a directory
+	 * while 'only' is clear; 1 on any other failure; 0 on success.
+	 * Both descriptors and the file name are released on every
+	 * path that gets as far as opening the file.
+	 */
+	int fd;
+	int fl = -1;
+	int rv = 1;
+	struct stat stb, dstb;
+	char *fname = NULL;
+	unsigned long long size;
+
+	if (stat(dir, &stb) != 0) {
+		pr_err("%s does not exist: cannot restore from there.\n",
+		       dir);
+		return 16;
+	} else if ((S_IFMT & stb.st_mode) != S_IFDIR && !only) {
+		pr_err("--restore requires a directory when multiple devices given\n");
+		return 16;
+	}
+
+	fd = dev_open(dev, O_RDWR);
+	if (fd < 0) {
+		pr_err("Cannot open %s to restore metadata: %s\n",
+		       dev, strerror(errno));
+		return 1;
+	}
+	if (!get_dev_size(fd, dev, &size)) {
+		close(fd);
+		return 1;
+	}
+
+	if ((S_IFMT & stb.st_mode) == S_IFDIR) {
+		/* choose one name from the directory. */
+		DIR *d = opendir(dir);
+		struct dirent *de;
+		char *chosen = NULL;
+		/* ino_t, not unsigned int, so large inode numbers
+		 * are not truncated before being compared.
+		 */
+		ino_t chosen_inode = 0;
+
+		fstat(fd, &dstb);
+
+		while (d && (de = readdir(d)) != NULL) {
+			if (de->d_name[0] == '.')
+				continue;
+			xasprintf(&fname, "/dev/disk/by-id/%s", de->d_name);
+			if (stat(fname, &stb) != 0) {
+				free(fname);
+				continue;
+			}
+			free(fname);
+			if ((S_IFMT & stb.st_mode) != S_IFBLK)
+				continue;
+			if (stb.st_rdev != dstb.st_rdev)
+				continue;
+			/* This file is a good match for our device. */
+			xasprintf(&fname, "%s/%s", dir, de->d_name);
+			if (stat(fname, &stb) != 0) {
+				/* Weird! */
+				free(fname);
+				continue;
+			}
+			if (chosen == NULL) {
+				chosen = fname;
+				chosen_inode = stb.st_ino;
+				continue;
+			}
+			if (chosen_inode == stb.st_ino) {
+				/* same, no need to change */
+				free(fname);
+				continue;
+			}
+			/* Oh dear, two names both match.  Must give up. */
+			pr_err("Both %s and %s seem suitable for %s.  Please choose one.\n",
+			       chosen, fname, dev);
+			free(fname);
+			free(chosen);
+			close(fd);
+			closedir(d);
+			return 1;
+		}
+		/* opendir() may have failed; closedir(NULL) is not valid */
+		if (d)
+			closedir(d);
+		if (!chosen) {
+			/* One last chance: try basename of device */
+			char *base = strrchr(dev, '/');
+			if (base)
+				base++;
+			else
+				base = dev;
+			xasprintf(&fname, "%s/%s", dir, base);
+			if (stat(fname, &stb) == 0)
+				chosen = fname;
+			else
+				free(fname);
+		}
+		fname = chosen;
+	} else
+		fname = strdup(dir);
+
+	if (!fname) {
+		pr_err("Cannot find suitable file in %s for %s\n",
+		       dir, dev);
+		close(fd);
+		return 1;
+	}
+
+	/* open() reports failure with -1 (0 is a valid descriptor).
+	 * Re-read the size from the opened file: by now 'stb' may
+	 * describe whichever path the directory scan looked at last.
+	 */
+	fl = open(fname, O_RDONLY);
+	if (fl < 0 || fstat(fl, &stb) != 0) {
+		pr_err("Could not open %s for --restore.\n",
+		       fname);
+		goto out;
+	}
+	if (((unsigned long long)stb.st_size) != size) {
+		pr_err("%s is not the same size as %s - cannot restore.\n",
+		       fname, dev);
+		goto out;
+	}
+	if (st == NULL)
+		st = guess_super_type(fl, guess_array);
+	if (!st) {
+		pr_err("Cannot find metadata on %s\n", fname);
+		goto out;
+	}
+	st->ignore_hw_compat = 1;
+	if (st->ss->load_super(st, fl, NULL) != 0) {
+		pr_err("No %s metadata found on %s\n",
+		       st->ss->name, fname);
+		goto out;
+	}
+	if (st->ss->copy_metadata == NULL) {
+		pr_err("%s metadata on %s cannot be copied\n",
+		       st->ss->name, dev);
+		goto out;
+	}
+	if (st->ss->copy_metadata(st, fl, fd) != 0) {
+		pr_err("Failed to copy metadata from %s to %s\n",
+		       fname, dev);
+		goto out;
+	}
+	if (c->verbose >= 0)
+		printf("%s restored from %s.\n", dev, fname);
+	rv = 0;
+
+out:
+	/* shared by success and failure so nothing is leaked */
+	close(fd);
+	if (fl >= 0)
+		close(fl);
+	free(fname);
+	return rv;
+}
diff --git a/Examine.c b/Examine.c
new file mode 100644
index 0000000..953b8ee
--- /dev/null
+++ b/Examine.c
@@ -0,0 +1,225 @@
+/*
+ * mdadm - manage Linux "md" devices aka RAID arrays.
+ *
+ * Copyright (C) 2001-2013 Neil Brown <neilb@suse.de>
+ *
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * Author: Neil Brown
+ * Email: <neilb@suse.de>
+ */
+
+#include "mdadm.h"
+#include "dlink.h"
+
+#if ! defined(__BIG_ENDIAN) && ! defined(__LITTLE_ENDIAN)
+#error no endian defined
+#endif
+#include "md_u.h"
+#include "md_p.h"
+int Examine(struct mddev_dev *devlist,
+ struct context *c,
+ struct supertype *forcest)
+{
+
+ /* Read the raid superblock from a device and
+ * display important content.
+ *
+ * If cannot be found, print reason: too small, bad magic
+ *
+ * Print:
+ * version, ctime, level, size, raid+spare+
+ * preferred minor
+ * uuid
+ *
+ * utime, state etc
+ *
+ * If (brief) gather devices for same array and just print a mdadm.conf
+ * line including devices=
+ * if devlist==NULL, use conf_get_devs()
+ *
+ * devlist: list of devices to examine, walked via ->next.
+ * c:       context; scan, brief, export, verbose, SparcAdjust and
+ *          homehost are read here.
+ * forcest: if non-NULL, a copy of this supertype is used for every
+ *          device instead of guessing the metadata format.
+ *
+ * Returns 1 if some device could not be opened (and c->scan is
+ * clear) or had no detectable superblock (and c->brief is clear),
+ * otherwise 0.
+ */
+ int fd;
+ int rv = 0;
+ int err = 0;
+
+ /* In brief mode devices are grouped per array in this list */
+ struct array {
+ struct supertype *st;
+ struct mdinfo info;
+ void *devs;
+ struct array *next;
+ int spares;
+ } *arrays = NULL;
+
+ for (; devlist ; devlist = devlist->next) {
+ struct supertype *st;
+ int have_container = 0;
+
+ fd = dev_open(devlist->devname, O_RDONLY);
+ if (fd < 0) {
+ if (!c->scan) {
+ pr_err("cannot open %s: %s\n",
+ devlist->devname, strerror(errno));
+ rv = 1;
+ }
+ err = 1;
+ }
+ else {
+ int container = 0;
+ if (forcest)
+ st = dup_super(forcest);
+ else if (must_be_container(fd)) {
+ /* might be a container */
+ st = super_by_fd(fd, NULL);
+ container = 1;
+ } else
+ st = guess_super(fd);
+ if (st) {
+ err = 1;
+ st->ignore_hw_compat = 1;
+ if (!container)
+ err = st->ss->load_super(st, fd,
+ (c->brief||c->scan) ? NULL
+ :devlist->devname);
+ /* fall back to (or, for a container, go straight
+ * to) load_container when the handler has one
+ */
+ if (err && st->ss->load_container) {
+ err = st->ss->load_container(st, fd,
+ (c->brief||c->scan) ? NULL
+ :devlist->devname);
+ if (!err)
+ have_container = 1;
+ }
+ st->ignore_hw_compat = 0;
+ } else {
+ if (!c->brief) {
+ pr_err("No md superblock detected on %s.\n", devlist->devname);
+ rv = 1;
+ }
+ err = 1;
+ }
+ close(fd);
+ }
+ /* every path above sets err; non-zero means nothing was loaded */
+ if (err)
+ continue;
+
+ if (c->SparcAdjust)
+ st->ss->update_super(st, NULL, "sparc2.2",
+ devlist->devname, 0, 0, NULL);
+ /* Ok, its good enough to try, though the checksum could be wrong */
+
+ if (c->brief && st->ss->brief_examine_super == NULL) {
+ if (!c->scan)
+ pr_err("No brief listing for %s on %s\n",
+ st->ss->name, devlist->devname);
+ } else if (c->brief) {
+ struct array *ap;
+ char *d;
+ /* look for an array already seen with the same handler
+ * for which compare_super() returns 0
+ */
+ for (ap = arrays; ap; ap = ap->next) {
+ if (st->ss == ap->st->ss &&
+ st->ss->compare_super(ap->st, st) == 0)
+ break;
+ }
+ if (!ap) {
+ ap = xmalloc(sizeof(*ap));
+ ap->devs = dl_head();
+ ap->next = arrays;
+ ap->spares = 0;
+ ap->st = st;
+ arrays = ap;
+ st->ss->getinfo_super(st, &ap->info, NULL);
+ } else
+ st->ss->getinfo_super(st, &ap->info, NULL);
+ /* a non-container device without the SYNC state bit
+ * is counted as a spare
+ */
+ if (!have_container &&
+ !(ap->info.disk.state & (1<<MD_DISK_SYNC)))
+ ap->spares++;
+ d = dl_strdup(devlist->devname);
+ dl_add(ap->devs, d);
+ } else if (c->export) {
+ if (st->ss->export_examine_super)
+ st->ss->export_examine_super(st);
+ st->ss->free_super(st);
+ } else {
+ printf("%s:\n",devlist->devname);
+ st->ss->examine_super(st, c->homehost);
+ st->ss->free_super(st);
+ }
+ }
+ if (c->brief) {
+ struct array *ap;
+ /* print one brief entry per collected array */
+ for (ap = arrays; ap; ap = ap->next) {
+ char sep='=';
+ char *d;
+ /* non-zero once something was printed after the
+ * brief_examine_super() output
+ */
+ int newline = 0;
+
+ ap->st->ss->brief_examine_super(ap->st, c->verbose > 0);
+ if (ap->spares)
+ newline += printf(" spares=%d", ap->spares);
+ if (c->verbose > 0) {
+ newline += printf(" devices");
+ for (d = dl_next(ap->devs);
+ d != ap->devs;
+ d=dl_next(d)) {
+ printf("%c%s", sep, d);
+ sep=',';
+ }
+ }
+ if (ap->st->ss->brief_examine_subarrays) {
+ if (newline)
+ printf("\n");
+ ap->st->ss->brief_examine_subarrays(ap->st, c->verbose);
+ }
+ ap->st->ss->free_super(ap->st);
+ /* FIXME free ap */
+ if (ap->spares || c->verbose > 0)
+ printf("\n");
+ }
+ }
+ return rv;
+}
+
+int ExamineBadblocks(char *devname, int brief, struct supertype *forcest)
+{
+	/* Load the superblock of 'devname' and hand it to the metadata
+	 * handler's examine_badblocks method.  'forcest', when given, is
+	 * used as the supertype instead of guessing one from the device.
+	 * Returns 1 if the device cannot be opened, no superblock is
+	 * detected or the format has no bad-block support; otherwise the
+	 * value of load_super() if that is non-zero, else the value of
+	 * examine_badblocks().
+	 * The supertype in use, including a caller-supplied 'forcest',
+	 * is freed before returning once the device has been opened.
+	 */
+	struct supertype *st;
+	int result = 1;
+	int fd;
+
+	fd = dev_open(devname, O_RDONLY);
+	if (fd < 0) {
+		pr_err("cannot open %s: %s\n", devname, strerror(errno));
+		return 1;
+	}
+
+	st = forcest ? forcest : guess_super(fd);
+
+	if (st == NULL) {
+		if (!brief)
+			pr_err("No md superblock detected on %s\n", devname);
+	} else if (st->ss->examine_badblocks == NULL) {
+		pr_err("%s metadata does not support badblocks\n", st->ss->name);
+	} else {
+		result = st->ss->load_super(st, fd, brief ? NULL : devname);
+		if (result == 0)
+			result = st->ss->examine_badblocks(st, fd, devname);
+	}
+
+	close(fd);
+	if (st != NULL) {
+		st->ss->free_super(st);
+		free(st);
+	}
+	return result;
+}
diff --git a/Grow.c b/Grow.c
new file mode 100755
index 0000000..bbdd46c
--- /dev/null
+++ b/Grow.c
@@ -0,0 +1,4985 @@
+/*
+ * mdadm - manage Linux "md" devices aka RAID arrays.
+ *
+ * Copyright (C) 2001-2013 Neil Brown <neilb@suse.de>
+ *
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * Author: Neil Brown
+ * Email: <neilb@suse.de>
+ */
+#include "mdadm.h"
+#include "dlink.h"
+#include <sys/mman.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <signal.h>
+#include <sys/wait.h>
+
+#if ! defined(__BIG_ENDIAN) && ! defined(__LITTLE_ENDIAN)
+#error no endian defined
+#endif
+#include "md_u.h"
+#include "md_p.h"
+
+ /*
+  * Open the member devices listed in content->devs and restore the
+  * critical section of an interrupted reshape from its backup.
+  *
+  * st:            metadata handler for the array.
+  * content:       array description; its devs list names the members.
+  * working_disks: added to next_spare to size the descriptor table.
+  * next_spare:    index at which descriptors of devices without a raid
+  *                slot (raid_disk < 0) start being stored.
+  * backup_filep:  in/out; when *backup_filep is NULL it is replaced by the
+  *                result of locate_backup(content->sys_name).
+  * verbose:       passed on to Grow_restart() as (verbose > 0).
+  *
+  * Returns 0 on success, 1 if the restore failed.
+  */
+ int restore_backup(struct supertype *st,
+ 		   struct mdinfo *content,
+ 		   int working_disks,
+ 		   int next_spare,
+ 		   char **backup_filep,
+ 		   int verbose)
+ {
+ 	int i;
+ 	int *fdlist;
+ 	struct mdinfo *dev;
+ 	int err;
+ 	int disk_count = next_spare + working_disks;
+ 	char *backup_file = *backup_filep;
+
+ 	dprintf("Called restore_backup()\n");
+ 	fdlist = xmalloc(sizeof(*fdlist) * disk_count);
+
+ 	enable_fds(next_spare);
+ 	/* Mark every slot unused, not only the first next_spare ones, so
+ 	 * no uninitialised descriptor can ever be read or closed.
+ 	 */
+ 	for (i = 0; i < disk_count; i++)
+ 		fdlist[i] = -1;
+ 	for (dev = content->devs; dev; dev = dev->next) {
+ 		/* "%d:%d" needs up to 24 bytes for two arbitrary ints */
+ 		char buf[32];
+ 		int fd;
+
+ 		snprintf(buf, sizeof(buf), "%d:%d",
+ 			 dev->disk.major,
+ 			 dev->disk.minor);
+ 		fd = dev_open(buf, O_RDWR);
+
+ 		/* Never store outside the table; a device that does not
+ 		 * fit is simply not used.
+ 		 */
+ 		if (dev->disk.raid_disk >= 0 &&
+ 		    dev->disk.raid_disk < disk_count)
+ 			fdlist[dev->disk.raid_disk] = fd;
+ 		else if (dev->disk.raid_disk < 0 && next_spare < disk_count)
+ 			fdlist[next_spare++] = fd;
+ 		else if (fd >= 0)
+ 			close(fd);
+ 	}
+
+ 	if (!backup_file) {
+ 		backup_file = locate_backup(content->sys_name);
+ 		*backup_filep = backup_file;
+ 	}
+
+ 	if (st->ss->external && st->ss->recover_backup)
+ 		err = st->ss->recover_backup(st, content);
+ 	else
+ 		err = Grow_restart(st, content, fdlist, next_spare,
+ 				   backup_file, verbose > 0);
+
+ 	/* next_spare now counts the used slots; close them all */
+ 	while (next_spare > 0) {
+ 		next_spare--;
+ 		if (fdlist[next_spare] >= 0)
+ 			close(fdlist[next_spare]);
+ 	}
+ 	free(fdlist);
+ 	if (err) {
+ 		pr_err("Failed to restore critical section for reshape - sorry.\n");
+ 		if (!backup_file)
+ 			pr_err("Possibly you need to specify a --backup-file\n");
+ 		return 1;
+ 	}
+
+ 	dprintf("restore_backup() returns status OK.\n");
+ 	return 0;
+ }
+
+ /*
+  * Add the device 'newdev' to the active linear array open on 'fd'.
+  *
+  * Only linear arrays (level -1) that are not sub-arrays are handled.
+  * The steps are: check that the superblock of every current member can
+  * be read, write a new superblock to newdev, ask the kernel to add the
+  * disk, then rewrite the superblock of every member with the new disk
+  * counts.
+  *
+  * devname: array name, used only in messages.
+  * fd:      open descriptor on the array; it is left open for the caller.
+  * newdev:  path of the block device to append.
+  *
+  * Returns 0 on success, 1 on any failure.
+  */
+ int Grow_Add_device(char *devname, int fd, char *newdev)
+ {
+ 	struct mdinfo info;
+
+ 	struct stat stb;
+ 	int nfd = -1, fd2 = -1;
+ 	int d, nd;
+ 	int rv = 1;
+ 	struct supertype *st = NULL;
+ 	char *subarray = NULL;
+
+ 	if (ioctl(fd, GET_ARRAY_INFO, &info.array) < 0) {
+ 		pr_err("cannot get array info for %s\n", devname);
+ 		return 1;
+ 	}
+
+ 	if (info.array.level != -1) {
+ 		pr_err("can only add devices to linear arrays\n");
+ 		return 1;
+ 	}
+
+ 	st = super_by_fd(fd, &subarray);
+ 	if (!st) {
+ 		pr_err("cannot handle arrays with superblock version %d\n",
+ 		       info.array.major_version);
+ 		return 1;
+ 	}
+
+ 	if (subarray) {
+ 		pr_err("Cannot grow linear sub-arrays yet\n");
+ 		free(subarray);
+ 		goto out;
+ 	}
+
+ 	nfd = open(newdev, O_RDWR|O_EXCL|O_DIRECT);
+ 	if (nfd < 0) {
+ 		pr_err("cannot open %s\n", newdev);
+ 		goto out;
+ 	}
+ 	/* a failed fstat leaves stb undefined, so treat it as "not a
+ 	 * block device" rather than inspecting garbage
+ 	 */
+ 	if (fstat(nfd, &stb) != 0 ||
+ 	    (stb.st_mode & S_IFMT) != S_IFBLK) {
+ 		pr_err("%s is not a block device!\n", newdev);
+ 		goto out;
+ 	}
+ 	/* now check out all the devices and make sure we can read the
+ 	 * superblock */
+ 	for (d = 0; d < info.array.raid_disks; d++) {
+ 		mdu_disk_info_t disk;
+ 		char *dv;
+
+ 		st->ss->free_super(st);
+
+ 		disk.number = d;
+ 		if (ioctl(fd, GET_DISK_INFO, &disk) < 0) {
+ 			pr_err("cannot get device detail for device %d\n",
+ 			       d);
+ 			goto out;
+ 		}
+ 		dv = map_dev(disk.major, disk.minor, 1);
+ 		if (!dv) {
+ 			pr_err("cannot find device file for device %d\n",
+ 			       d);
+ 			goto out;
+ 		}
+ 		fd2 = dev_open(dv, O_RDWR);
+ 		if (fd2 < 0) {
+ 			pr_err("cannot open device file %s\n", dv);
+ 			goto out;
+ 		}
+
+ 		if (st->ss->load_super(st, fd2, NULL)) {
+ 			pr_err("cannot find super block on %s\n", dv);
+ 			goto out;
+ 		}
+ 		close(fd2);
+ 		fd2 = -1;
+ 	}
+ 	/* Ok, looks good. Lets update the superblock and write it out to
+ 	 * newdev.  'st' still holds the superblock loaded from the last
+ 	 * member and 'd' is the slot the new device will take.
+ 	 */
+
+ 	info.disk.number = d;
+ 	info.disk.major = major(stb.st_rdev);
+ 	info.disk.minor = minor(stb.st_rdev);
+ 	info.disk.raid_disk = d;
+ 	info.disk.state = (1 << MD_DISK_SYNC) | (1 << MD_DISK_ACTIVE);
+ 	st->ss->update_super(st, &info, "linear-grow-new", newdev,
+ 			     0, 0, NULL);
+
+ 	if (st->ss->store_super(st, nfd)) {
+ 		pr_err("Cannot store new superblock on %s\n",
+ 		       newdev);
+ 		goto out;
+ 	}
+ 	close(nfd);
+ 	nfd = -1;
+
+ 	if (ioctl(fd, ADD_NEW_DISK, &info.disk) != 0) {
+ 		pr_err("Cannot add new disk to this array\n");
+ 		goto out;
+ 	}
+ 	/* Well, that seems to have worked.
+ 	 * Now go through and update all superblocks
+ 	 */
+
+ 	if (ioctl(fd, GET_ARRAY_INFO, &info.array) < 0) {
+ 		pr_err("cannot get array info for %s\n", devname);
+ 		goto out;
+ 	}
+
+ 	nd = d;
+ 	for (d = 0; d < info.array.raid_disks; d++) {
+ 		mdu_disk_info_t disk;
+ 		char *dv;
+
+ 		disk.number = d;
+ 		if (ioctl(fd, GET_DISK_INFO, &disk) < 0) {
+ 			pr_err("cannot get device detail for device %d\n",
+ 			       d);
+ 			goto out;
+ 		}
+ 		dv = map_dev(disk.major, disk.minor, 1);
+ 		if (!dv) {
+ 			pr_err("cannot find device file for device %d\n",
+ 			       d);
+ 			goto out;
+ 		}
+ 		fd2 = dev_open(dv, O_RDWR);
+ 		if (fd2 < 0) {
+ 			pr_err("cannot open device file %s\n", dv);
+ 			goto out;
+ 		}
+ 		if (st->ss->load_super(st, fd2, NULL)) {
+ 			pr_err("cannot find super block on %s\n", dv);
+ 			/* the member descriptor fd2 is released at 'out';
+ 			 * the array descriptor fd belongs to the caller
+ 			 * and must stay open
+ 			 */
+ 			goto out;
+ 		}
+ 		info.array.raid_disks = nd+1;
+ 		info.array.nr_disks = nd+1;
+ 		info.array.active_disks = nd+1;
+ 		info.array.working_disks = nd+1;
+
+ 		st->ss->update_super(st, &info, "linear-grow-update", dv,
+ 				     0, 0, NULL);
+
+ 		if (st->ss->store_super(st, fd2)) {
+ 			pr_err("Cannot store new superblock on %s\n", dv);
+ 			goto out;
+ 		}
+ 		close(fd2);
+ 		fd2 = -1;
+ 	}
+
+ 	rv = 0;
+ out:
+ 	/* single exit: release whatever is still held */
+ 	if (fd2 >= 0)
+ 		close(fd2);
+ 	if (nfd >= 0)
+ 		close(nfd);
+ 	st->ss->free_super(st);
+ 	free(st);
+ 	return rv;
+ }
+
+ /*
+  * Add a bitmap to, or remove a bitmap from, the running array open on 'fd'.
+  *
+  * s->bitmap_file selects the operation:
+  *   "none"                    remove the bitmap the array currently has
+  *   "internal" / "clustered"  write a bitmap into the metadata of every
+  *                             in-sync member and enable it
+  *   anything else             treated as the path of a bitmap file to
+  *                             create and attach
+  *
+  * devname: array name, used only in messages.
+  * c:       supplies nodes, homecluster, delay and force.
+  * s:       supplies bitmap_file, bitmap_chunk and write_behind.
+  *
+  * Returns 0 on success, 1 on failure.
+  */
+ int Grow_addbitmap(char *devname, int fd, struct context *c, struct shape *s)
+ {
+ /*
+ * First check that array doesn't have a bitmap
+ * Then create the bitmap
+ * Then add it
+ *
+ * For internal bitmaps, we need to check the version,
+ * find all the active devices, and write the bitmap block
+ * to all devices
+ */
+ mdu_bitmap_file_t bmf;
+ mdu_array_info_t array;
+ struct supertype *st;
+ char *subarray = NULL;
+ int major = BITMAP_MAJOR_HI;
+ int vers = md_get_version(fd);
+ unsigned long long bitmapsize, array_size;
+
+ /* md versions below 9003 get a host-endian bitmap, with a warning */
+ if (vers < 9003) {
+ major = BITMAP_MAJOR_HOSTENDIAN;
+ pr_err("Warning - bitmaps created on this kernel are not portable\n"
+ " between different architectures. Consider upgrading the Linux kernel.\n");
+ }
+
+ if (s->bitmap_file && strcmp(s->bitmap_file, "clustered") == 0)
+ major = BITMAP_MAJOR_CLUSTERED;
+
+ if (ioctl(fd, GET_BITMAP_FILE, &bmf) != 0) {
+ if (errno == ENOMEM)
+ pr_err("Memory allocation failure.\n");
+ else
+ pr_err("bitmaps not supported by this kernel.\n");
+ return 1;
+ }
+ /* A non-empty pathname means a bitmap file is already attached:
+ * "none" detaches it, any other request is refused.
+ */
+ if (bmf.pathname[0]) {
+ if (strcmp(s->bitmap_file,"none")==0) {
+ if (ioctl(fd, SET_BITMAP_FILE, -1)!= 0) {
+ pr_err("failed to remove bitmap %s\n",
+ bmf.pathname);
+ return 1;
+ }
+ return 0;
+ }
+ pr_err("%s already has a bitmap (%s)\n",
+ devname, bmf.pathname);
+ return 1;
+ }
+ if (ioctl(fd, GET_ARRAY_INFO, &array) != 0) {
+ pr_err("cannot get array status for %s\n", devname);
+ return 1;
+ }
+ /* Same decision for a bitmap flagged in the array state: "none"
+ * clears the flag via SET_ARRAY_INFO, anything else is refused.
+ */
+ if (array.state & (1<<MD_SB_BITMAP_PRESENT)) {
+ if (strcmp(s->bitmap_file, "none")==0) {
+ array.state &= ~(1<<MD_SB_BITMAP_PRESENT);
+ if (ioctl(fd, SET_ARRAY_INFO, &array)!= 0) {
+ if (array.state & (1<<MD_SB_CLUSTERED))
+ pr_err("failed to remove clustered bitmap.\n");
+ else
+ pr_err("failed to remove internal bitmap.\n");
+ return 1;
+ }
+ return 0;
+ }
+ pr_err("bitmap already present on %s\n", devname);
+ return 1;
+ }
+
+ /* From here on the array has no bitmap, so "none" has nothing to do */
+ if (strcmp(s->bitmap_file, "none") == 0) {
+ pr_err("no bitmap found on %s\n", devname);
+ return 1;
+ }
+ if (array.level <= 0) {
+ pr_err("Bitmaps not meaningful with level %s\n",
+ map_num(pers, array.level)?:"of this array");
+ return 1;
+ }
+ /* presumably array.size is in KiB and the doubling converts it to
+ * 512-byte sectors - TODO confirm against mdu_array_info_t
+ */
+ bitmapsize = array.size;
+ bitmapsize <<= 1;
+ if (get_dev_size(fd, NULL, &array_size) &&
+ array_size > (0x7fffffffULL<<9)) {
+ /* Array is big enough that we cannot trust array.size
+ * try other approaches
+ */
+ bitmapsize = get_component_size(fd);
+ }
+ if (bitmapsize == 0) {
+ pr_err("Cannot reliably determine size of array to create bitmap - sorry.\n");
+ return 1;
+ }
+
+ /* RAID10: scale by raid_disks / copies, where copies is the product
+ * of the two low bytes of the layout
+ */
+ if (array.level == 10) {
+ int ncopies = (array.layout&255)*((array.layout>>8)&255);
+ bitmapsize = bitmapsize * array.raid_disks / ncopies;
+ }
+
+ st = super_by_fd(fd, &subarray);
+ if (!st) {
+ pr_err("Cannot understand version %d.%d\n",
+ array.major_version, array.minor_version);
+ return 1;
+ }
+ if (subarray) {
+ pr_err("Cannot add bitmaps to sub-arrays yet\n");
+ free(subarray);
+ free(st);
+ return 1;
+ }
+ if (strcmp(s->bitmap_file, "internal") == 0 ||
+ strcmp(s->bitmap_file, "clustered") == 0) {
+ int rv;
+ int d;
+ int offset_setable = 0;
+ struct mdinfo *mdi;
+ if (st->ss->add_internal_bitmap == NULL) {
+ pr_err("Internal bitmaps not supported with %s metadata\n", st->ss->name);
+ return 1;
+ }
+ st->nodes = c->nodes;
+ st->cluster_name = c->homecluster;
+ /* if sysfs reports a bitmap location, the bitmap is enabled
+ * below by writing "bitmap/location" instead of SET_ARRAY_INFO
+ */
+ mdi = sysfs_read(fd, NULL, GET_BITMAP_LOCATION);
+ if (mdi)
+ offset_setable = 1;
+ /* write the bitmap to every present, in-sync member */
+ for (d=0; d< st->max_devs; d++) {
+ mdu_disk_info_t disk;
+ char *dv;
+ disk.number = d;
+ if (ioctl(fd, GET_DISK_INFO, &disk) < 0)
+ continue;
+ if (disk.major == 0 &&
+ disk.minor == 0)
+ continue;
+ if ((disk.state & (1<<MD_DISK_SYNC))==0)
+ continue;
+ dv = map_dev(disk.major, disk.minor, 1);
+ if (dv) {
+ int fd2 = dev_open(dv, O_RDWR);
+ if (fd2 < 0)
+ continue;
+ if (st->ss->load_super(st, fd2, NULL)==0) {
+ /* a non-zero result from add_internal_bitmap is
+ * treated as success here
+ */
+ if (st->ss->add_internal_bitmap(
+ st,
+ &s->bitmap_chunk, c->delay, s->write_behind,
+ bitmapsize, offset_setable,
+ major)
+ )
+ st->ss->write_bitmap(st, fd2, NoUpdate);
+ else {
+ pr_err("failed to create internal bitmap - chunksize problem.\n");
+ close(fd2);
+ return 1;
+ }
+ }
+ close(fd2);
+ }
+ }
+ if (offset_setable) {
+ st->ss->getinfo_super(st, mdi, NULL);
+ sysfs_init(mdi, fd, NULL);
+ rv = sysfs_set_num_signed(mdi, NULL, "bitmap/location",
+ mdi->bitmap_offset);
+ } else {
+ if (strcmp(s->bitmap_file, "clustered") == 0)
+ array.state |= (1<<MD_SB_CLUSTERED);
+ array.state |= (1<<MD_SB_BITMAP_PRESENT);
+ rv = ioctl(fd, SET_ARRAY_INFO, &array);
+ }
+ if (rv < 0) {
+ if (errno == EBUSY)
+ pr_err("Cannot add bitmap while array is resyncing or reshaping etc.\n");
+ pr_err("failed to set internal bitmap.\n");
+ return 1;
+ }
+ } else {
+ /* external bitmap: s->bitmap_file is the path of the file */
+ int uuid[4];
+ int bitmap_fd;
+ int d;
+ int max_devs = st->max_devs;
+
+ /* try to load a superblock */
+ for (d = 0; d < max_devs; d++) {
+ mdu_disk_info_t disk;
+ char *dv;
+ int fd2;
+ disk.number = d;
+ if (ioctl(fd, GET_DISK_INFO, &disk) < 0)
+ continue;
+ if ((disk.major==0 && disk.minor==0) ||
+ (disk.state & (1<<MD_DISK_REMOVED)))
+ continue;
+ dv = map_dev(disk.major, disk.minor, 1);
+ if (!dv)
+ continue;
+ fd2 = dev_open(dv, O_RDONLY);
+ if (fd2 >= 0) {
+ /* the first superblock that loads provides the UUID */
+ if (st->ss->load_super(st, fd2, NULL) == 0) {
+ close(fd2);
+ st->ss->uuid_from_super(st, uuid);
+ break;
+ }
+ close(fd2);
+ }
+ }
+ /* the loop ran to completion: no member gave a superblock */
+ if (d == max_devs) {
+ pr_err("cannot find UUID for array!\n");
+ return 1;
+ }
+ if (CreateBitmap(s->bitmap_file, c->force, (char*)uuid, s->bitmap_chunk,
+ c->delay, s->write_behind, bitmapsize, major)) {
+ return 1;
+ }
+ bitmap_fd = open(s->bitmap_file, O_RDWR);
+ if (bitmap_fd < 0) {
+ pr_err("weird: %s cannot be opened\n",
+ s->bitmap_file);
+ return 1;
+ }
+ if (ioctl(fd, SET_BITMAP_FILE, bitmap_fd) < 0) {
+ /* save errno before pr_err() has a chance to change it */
+ int err = errno;
+ if (errno == EBUSY)
+ pr_err("Cannot add bitmap while array is resyncing or reshaping etc.\n");
+ pr_err("Cannot set bitmap file for %s: %s\n",
+ devname, strerror(err));
+ return 1;
+ }
+ }
+
+ return 0;
+ }
+
+/*
+ * When reshaping an array we might need to backup some data.
+ * This is written to all spares with a 'super_block' describing it.
+ * The superblock goes 4K from the end of the used space on the
+ * device.
+ * It is written after the backup is complete.
+ * It has the following structure.
+ */
+
+ /*
+  * Backup superblock layout, with two file-scope instances bsb and bsb2.
+  * The fields up to and including sb_csum occupy 68 bytes, pad1 through
+  * sb_csum2 occupy a further 32, and pad[] fills the rest so the whole
+  * structure is 512 bytes; it is also declared 512-byte aligned.
+  */
+ static struct mdp_backup_super {
+ char magic[16]; /* md_backup_data-1 or -2 */
+ __u8 set_uuid[16];
+ __u64 mtime;
+ /* start/sizes in 512byte sectors */
+ __u64 devstart; /* address on backup device/file of data */
+ __u64 arraystart;
+ __u64 length;
+ __u32 sb_csum; /* csum of preceding bytes. */
+ __u32 pad1;
+ __u64 devstart2; /* offset into data of second section */
+ __u64 arraystart2;
+ __u64 length2;
+ __u32 sb_csum2; /* csum of preceding bytes. */
+ __u8 pad[512-68-32]; /* pads the structure out to 512 bytes */
+ } __attribute__((aligned(512))) bsb, bsb2;
+
+ /*
+  * Checksum used for the backup superblock.
+  *
+  * Runs 'len' rounds of "shift the sum left by 3 and add a byte" and
+  * returns the result converted with __cpu_to_le32().
+  *
+  * NOTE(review): every round adds buf[0], never a later byte of buf.
+  * That looks unintended, but the result is presumably stored in backup
+  * metadata, so changing it would change existing checksums - confirm
+  * before touching.
+  */
+ static __u32 bsb_csum(char *buf, int len)
+ {
+ 	int sum = 0;
+ 	int remaining;
+
+ 	for (remaining = len; remaining > 0; remaining--)
+ 		sum = (sum << 3) + buf[0];
+
+ 	return __cpu_to_le32(sum);
+ }
+
+ /*
+  * Report whether every array that belongs to this container (or to the
+  * container of this array) is idle.
+  *
+  * The container name is st->container_devnm when that is non-empty,
+  * otherwise st->devnm.  A member counts as busy when its mdstat entry
+  * has percent >= 0.
+  *
+  * Returns 1 if all members are idle, 0 as soon as one is busy.
+  */
+ static int check_idle(struct supertype *st)
+ {
+ 	struct mdstat_ent *list, *cur;
+ 	char *container;
+ 	int busy = 0;
+
+ 	if (st->container_devnm[0])
+ 		container = st->container_devnm;
+ 	else
+ 		container = st->devnm;
+
+ 	list = mdstat_read(0, 0);
+ 	for (cur = list; cur && !busy; cur = cur->next)
+ 		if (is_container_member(cur, container) &&
+ 		    cur->percent >= 0)
+ 			busy = 1;
+ 	free_mdstat(list);
+
+ 	return !busy;
+ }
+
+ /*
+  * Freeze the container of an external-metadata array.
+  *
+  * Returns -1 if check_idle() reports a busy member, -2 if
+  * block_monitor() fails, 1 on success.
+  */
+ static int freeze_container(struct supertype *st)
+ {
+ 	char *container;
+
+ 	if (st->container_devnm[0])
+ 		container = st->container_devnm;
+ 	else
+ 		container = st->devnm;
+
+ 	if (!check_idle(st))
+ 		return -1;
+
+ 	if (block_monitor(container, 1) == 0)
+ 		return 1;
+
+ 	pr_err("failed to freeze container\n");
+ 	return -2;
+ }
+
+ /*
+  * Undo freeze_container(): call unblock_monitor() on the container,
+  * which is st->container_devnm when non-empty, otherwise st->devnm.
+  */
+ static void unfreeze_container(struct supertype *st)
+ {
+ 	char *container;
+
+ 	if (st->container_devnm[0])
+ 		container = st->container_devnm;
+ 	else
+ 		container = st->devnm;
+
+ 	unblock_monitor(container, 1);
+ }
+
+ /*
+  * Try to freeze resync/rebuild on this array or container.
+  *
+  * External metadata is delegated to freeze_container().  For native
+  * metadata an array_state of "read-auto" is first changed to "clean",
+  * then sysfs_freeze_array() is called and its result returned.
+  *
+  * Returns:
+  *   -1  the array is busy, or its sysfs information could not be read
+  *   -2  the container cannot be frozen
+  *    0  this kernel doesn't support 'frozen'
+  *    1  it worked
+  */
+ static int freeze(struct supertype *st)
+ {
+ 	struct mdinfo *sra;
+ 	char state[20];
+ 	int rv;
+
+ 	if (st->ss->external)
+ 		return freeze_container(st);
+
+ 	sra = sysfs_read(-1, st->devnm, GET_VERSION);
+ 	if (sra == NULL)
+ 		return -1;
+
+ 	/* Need to clear any 'read-auto' status */
+ 	if (sysfs_get_str(sra, NULL, "array_state", state, sizeof(state)) > 0 &&
+ 	    strncmp(state, "read-auto", 9) == 0)
+ 		sysfs_set_str(sra, NULL, "array_state", "clean");
+
+ 	rv = sysfs_freeze_array(sra);
+ 	sysfs_free(sra);
+ 	return rv;
+ }
+
+ /*
+  * Undo freeze().
+  *
+  * External metadata is delegated to unfreeze_container().  For native
+  * metadata, sync_action is set to "idle" only when it currently reads
+  * exactly "frozen\n".
+  */
+ static void unfreeze(struct supertype *st)
+ {
+ 	struct mdinfo *sra;
+ 	char action[20];
+
+ 	if (st->ss->external) {
+ 		unfreeze_container(st);
+ 		return;
+ 	}
+
+ 	sra = sysfs_read(-1, st->devnm, GET_VERSION);
+ 	if (sra != NULL &&
+ 	    sysfs_get_str(sra, NULL, "sync_action", action, sizeof(action)) > 0 &&
+ 	    strcmp(action, "frozen\n") == 0)
+ 		sysfs_set_str(sra, NULL, "sync_action", "idle");
+ 	/* called even when sysfs_read() returned NULL */
+ 	sysfs_free(sra);
+ }
+
+ /*
+  * Block until the sync_action attribute of 'sra' no longer starts with
+  * "reshape".  Returns at once if the attribute cannot be opened, and
+  * stops waiting if it can no longer be read.
+  */
+ static void wait_reshape(struct mdinfo *sra)
+ {
+ 	char action[20];
+ 	int fd;
+
+ 	fd = sysfs_get_fd(sra, NULL, "sync_action");
+ 	if (fd < 0)
+ 		return;
+
+ 	for (;;) {
+ 		if (sysfs_fd_get_str(fd, action, sizeof(action)) <= 0)
+ 			break;
+ 		if (strncmp(action, "reshape", 7) != 0)
+ 			break;
+ 		sysfs_wait(fd, NULL);
+ 	}
+ 	close(fd);
+ }
+
+ /*
+  * Give external metadata the chance to validate/prepare a reshape.
+  *
+  * Native metadata has nothing extra to check, so 0 is returned at once.
+  * External metadata must provide both reshape_super and manage_reshape;
+  * if either is missing an error is printed and 1 returned.  Otherwise
+  * all arguments are handed to the handler's reshape_super method and
+  * its result is returned.
+  */
+ static int reshape_super(struct supertype *st, unsigned long long size,
+ 			 int level, int layout, int chunksize, int raid_disks,
+ 			 int delta_disks, char *backup_file, char *dev,
+ 			 int direction, int verbose)
+ {
+ 	if (!st->ss->external)
+ 		return 0;
+
+ 	if (st->ss->reshape_super && st->ss->manage_reshape)
+ 		return st->ss->reshape_super(st, size, level, layout,
+ 					     chunksize, raid_disks,
+ 					     delta_disks, backup_file, dev,
+ 					     direction, verbose);
+
+ 	pr_err("%s metadata does not support reshape\n",
+ 	       st->ss->name);
+ 	return 1;
+ }
+
+ /*
+  * Push pending metadata changes out for external metadata; a no-op for
+  * native metadata.
+  *
+  * When st->update_tail is set the queued updates are flushed and the
+  * tail is reset to &st->updates; otherwise the handler's sync_metadata
+  * method is called.
+  */
+ static void sync_metadata(struct supertype *st)
+ {
+ 	if (!st->ss->external)
+ 		return;
+
+ 	if (!st->update_tail) {
+ 		st->ss->sync_metadata(st);
+ 		return;
+ 	}
+
+ 	flush_metadata_updates(st);
+ 	st->update_tail = &st->updates;
+ }
+
+ /*
+  * Write the number 'n' to sysfs attribute 'name' of array 'sra'.
+  *
+  * When dealing with external metadata subarrays we need to be prepared
+  * to handle EAGAIN.  The kernel may need to wait for mdmon to mark the
+  * array active so the kernel can handle allocations/writeback when
+  * preparing the reshape action (md_allow_write()).  We temporarily
+  * disable safe_mode_delay to close a race with the array_state going
+  * clean before the next write to raid_disks / stripe_cache_size.
+  *
+  * Only 'raid_disks' and 'stripe_cache_size' trigger md_allow_write, so
+  * every other attribute - and every call without a container - is a
+  * plain sysfs_set_num().
+  *
+  * Returns the result of the (last) sysfs_set_num(), or -1 if
+  * safe_mode_delay could not be read.
+  */
+ static int subarray_set_num(char *container, struct mdinfo *sra, char *name, int n)
+ {
+ 	char saved_delay[50];
+ 	int needs_allow_write;
+ 	int rc;
+
+ 	needs_allow_write = container &&
+ 		(strcmp(name, "raid_disks") == 0 ||
+ 		 strcmp(name, "stripe_cache_size") == 0);
+ 	if (!needs_allow_write)
+ 		return sysfs_set_num(sra, NULL, name, n);
+
+ 	if (sysfs_get_str(sra, NULL, "safe_mode_delay",
+ 			  saved_delay, sizeof(saved_delay)) <= 0)
+ 		return -1;
+
+ 	sysfs_set_num(sra, NULL, "safe_mode_delay", 0);
+ 	rc = sysfs_set_num(sra, NULL, name, n);
+ 	if (rc < 0 && errno == EAGAIN) {
+ 		ping_monitor(container);
+ 		/* if we get EAGAIN here then the monitor is not active
+ 		 * so stop trying
+ 		 */
+ 		rc = sysfs_set_num(sra, NULL, name, n);
+ 	}
+ 	/* put the original delay back whatever the outcome */
+ 	sysfs_set_str(sra, NULL, "safe_mode_delay", saved_delay);
+ 	return rc;
+ }
+
+ /*
+  * Program the suspend and sync windows for a reshape and, unless one is
+  * already running, start it.
+  *
+  * sra:               array; reshape_progress and component_size are read.
+  * already_running:   non-zero skips writing sync_min and the request to
+  *                    start the reshape.
+  * before_data_disks,
+  * data_disks:        when before_data_disks <= data_disks the sync limit
+  *                    is reshape_progress / data_disks, otherwise it is
+  *                    measured from the other end of the component.
+  *
+  * Writing "reshape" to sync_action is retried, one second apart, while
+  * it fails with EBUSY (at most six attempts in total).
+  *
+  * Returns 0 on success, otherwise the first failing sysfs result.
+  */
+ int start_reshape(struct mdinfo *sra, int already_running,
+ 		  int before_data_disks, int data_disks)
+ {
+ 	unsigned long long sync_limit;
+ 	int retries_left = 5;
+ 	int err;
+
+ 	sysfs_set_num(sra, NULL, "suspend_lo", 0x7FFFFFFFFFFFFFFFULL);
+ 	err = sysfs_set_num(sra, NULL, "suspend_hi", sra->reshape_progress);
+ 	if (!err)
+ 		err = sysfs_set_num(sra, NULL, "suspend_lo",
+ 				    sra->reshape_progress);
+
+ 	if (before_data_disks <= data_disks)
+ 		sync_limit = sra->reshape_progress / data_disks;
+ 	else
+ 		sync_limit = (sra->component_size * data_disks
+ 			      - sra->reshape_progress) / data_disks;
+
+ 	/* sync_min is written regardless of any earlier error */
+ 	if (!already_running)
+ 		sysfs_set_num(sra, NULL, "sync_min", sync_limit);
+ 	if (!err)
+ 		err = sysfs_set_num(sra, NULL, "sync_max", sync_limit);
+
+ 	if (already_running || err != 0)
+ 		return err;
+
+ 	for (;;) {
+ 		err = sysfs_set_str(sra, NULL, "sync_action", "reshape");
+ 		if (!err)
+ 			break;
+ 		sleep(1);
+ 		if (errno != EBUSY)
+ 			break;
+ 		if (retries_left-- <= 0)
+ 			break;
+ 	}
+ 	return err;
+ }
+
+ /*
+  * Abandon a reshape: set sync_action to "idle", then reset the suspend
+  * window and sync_min to zero.  Results of the sysfs writes are ignored.
+  */
+ void abort_reshape(struct mdinfo *sra)
+ {
+ 	const unsigned long long max_sector = 0x7FFFFFFFFFFFFFFFULL;
+
+ 	sysfs_set_str(sra, NULL, "sync_action", "idle");
+
+ 	/* suspend_lo is raised to the maximum before both limits are
+ 	 * written as zero
+ 	 */
+ 	sysfs_set_num(sra, NULL, "suspend_lo", max_sector);
+ 	sysfs_set_num(sra, NULL, "suspend_hi", 0);
+ 	sysfs_set_num(sra, NULL, "suspend_lo", 0);
+
+ 	sysfs_set_num(sra, NULL, "sync_min", 0);
+ 	/* It isn't safe to reset sync_max as we aren't monitoring.
+ 	 * Array really should be stopped at this point.
+ 	 */
+ }
+
+ /*
+  * Reduce a RAID1 or RAID10 array to one device per group of copies.
+  *
+  * The group size is the low byte of 'layout' for RAID10 and raid_disks
+  * for RAID1; any other level returns 1 immediately.  For each group of
+  * that many consecutive slots, one device that is in sync and neither
+  * removed nor faulty is kept.  Every other device is marked faulty, its
+  * slot set to "none" and, for native metadata only, removed through
+  * sysfs; its disk.state is updated to match.
+  *
+  * Returns 0 on success.  Returns 1 if some group has no usable device;
+  * nothing is written to sysfs in that case and sra->devs gets all its
+  * devices back, though possibly in a different order.
+  */
+ int remove_disks_for_takeover(struct supertype *st,
+ struct mdinfo *sra,
+ int layout)
+ {
+ int nr_of_copies;
+ struct mdinfo *remaining;
+ int slot;
+
+ if (sra->array.level == 10)
+ nr_of_copies = layout & 0xff;
+ else if (sra->array.level == 1)
+ nr_of_copies = sra->array.raid_disks;
+ else
+ return 1;
+
+ /* Move the whole device list to 'remaining'; sra->devs is rebuilt
+ * from the devices that are kept.
+ */
+ remaining = sra->devs;
+ sra->devs = NULL;
+ /* for each 'copy', select one device and remove from the list. */
+ for (slot = 0; slot < sra->array.raid_disks; slot += nr_of_copies) {
+ struct mdinfo **diskp;
+ int found = 0;
+
+ /* Find a working device to keep */
+ for (diskp = &remaining; *diskp ; diskp = &(*diskp)->next) {
+ struct mdinfo *disk = *diskp;
+
+ /* only devices in slots [slot, slot + nr_of_copies) qualify */
+ if (disk->disk.raid_disk < slot)
+ continue;
+ if (disk->disk.raid_disk >= slot + nr_of_copies)
+ continue;
+ if (disk->disk.state & (1<<MD_DISK_REMOVED))
+ continue;
+ if (disk->disk.state & (1<<MD_DISK_FAULTY))
+ continue;
+ if (!(disk->disk.state & (1<<MD_DISK_SYNC)))
+ continue;
+
+ /* We have found a good disk to use! */
+ /* unlink it from 'remaining' and push it onto sra->devs */
+ *diskp = disk->next;
+ disk->next = sra->devs;
+ sra->devs = disk;
+ found = 1;
+ break;
+ }
+ if (!found)
+ break;
+ }
+
+ if (slot < sra->array.raid_disks) {
+ /* didn't find all slots */
+ /* append the devices already chosen to the tail of 'remaining'
+ * and hand the combined list back to sra->devs
+ */
+ struct mdinfo **e;
+ e = &remaining;
+ while (*e)
+ e = &(*e)->next;
+ *e = sra->devs;
+ sra->devs = remaining;
+ return 1;
+ }
+
+ /* Remove all 'remaining' devices from the array */
+ while (remaining) {
+ struct mdinfo *sd = remaining;
+ remaining = sd->next;
+
+ sysfs_set_str(sra, sd, "state", "faulty");
+ sysfs_set_str(sra, sd, "slot", "none");
+ /* for external metadata disks should be removed in mdmon */
+ if (!st->ss->external)
+ sysfs_set_str(sra, sd, "state", "remove");
+ sd->disk.state |= (1<<MD_DISK_REMOVED);
+ sd->disk.state &= ~(1<<MD_DISK_SYNC);
+ /* keep the entry on sra->devs, now flagged as removed */
+ sd->next = sra->devs;
+ sra->devs = sd;
+ }
+ return 0;
+ }
+
+ /*
+  * Release the tables used during a reshape: close each of the first
+  * 'size' descriptors in fdlist that is >= 0, then free both fdlist and
+  * offsets.
+  */
+ void reshape_free_fdlist(int *fdlist,
+ 			 unsigned long long *offsets,
+ 			 int size)
+ {
+ 	int *cur = fdlist;
+ 	int left;
+
+ 	for (left = size; left > 0; left--, cur++) {
+ 		if (*cur < 0)
+ 			continue;
+ 		close(*cur);
+ 	}
+
+ 	free(fdlist);
+ 	free(offsets);
+ }
+
+ /*
+  * Open the component devices needed for a reshape.
+  *
+  * Entries 0..nrdisks of fdlist are first set to -1.  Each non-faulty
+  * device that is in sync with raid_disk < raid_disks is opened
+  * read-only into fdlist[raid_disk], with its data offset in bytes
+  * recorded in offsets[].  When no backup_file is given, every other
+  * non-faulty device is opened read-write into the next entry at or
+  * after index raid_disks, with an offset placed (blocks + 8) sectors
+  * before the end of the component.
+  *
+  * devname is used only in messages.
+  *
+  * Returns the index following the last such entry filled (raid_disks
+  * when there were none), or -1 if a device could not be opened.
+  */
+ int reshape_prepare_fdlist(char *devname,
+ 			   struct mdinfo *sra,
+ 			   int raid_disks,
+ 			   int nrdisks,
+ 			   unsigned long blocks,
+ 			   char *backup_file,
+ 			   int *fdlist,
+ 			   unsigned long long *offsets)
+ {
+ 	struct mdinfo *sd;
+ 	int next_spare;
+ 	int i;
+
+ 	enable_fds(nrdisks);
+ 	for (i = 0; i <= nrdisks; i++)
+ 		fdlist[i] = -1;
+
+ 	next_spare = raid_disks;
+ 	for (sd = sra->devs; sd; sd = sd->next) {
+ 		int in_sync_member;
+ 		int idx;
+ 		char *dn;
+
+ 		if (sd->disk.state & (1<<MD_DISK_FAULTY))
+ 			continue;
+
+ 		in_sync_member = (sd->disk.state & (1<<MD_DISK_SYNC)) &&
+ 				 sd->disk.raid_disk < raid_disks;
+ 		/* other devices only matter when there is no backup file */
+ 		if (!in_sync_member && backup_file != NULL)
+ 			continue;
+
+ 		dn = map_dev(sd->disk.major, sd->disk.minor, 1);
+ 		if (in_sync_member) {
+ 			idx = sd->disk.raid_disk;
+ 			fdlist[idx] = dev_open(dn, O_RDONLY);
+ 			offsets[idx] = sd->data_offset*512;
+ 		} else {
+ 			/* spare */
+ 			idx = next_spare;
+ 			fdlist[idx] = dev_open(dn, O_RDWR);
+ 			offsets[idx] = (sd->data_offset + sra->component_size - blocks - 8)*512;
+ 		}
+
+ 		if (fdlist[idx] < 0) {
+ 			pr_err("%s: cannot open component %s\n",
+ 			       devname, dn ? dn : "-unknown-");
+ 			return -1;
+ 		}
+ 		if (!in_sync_member)
+ 			next_spare++;
+ 	}
+ 	return next_spare;
+ }
+
+ /*
+  * Create the backup file used while reshaping and fill it with zeros.
+  *
+  * backup_file: path of the file; created with O_EXCL, or truncated when
+  *              'restart' is set.
+  * fd:          open descriptor on the array, used only to check that the
+  *              backup file does not live on the array itself.
+  * devname:     array name, used only in messages.
+  * blocks:      the file is filled with (blocks + 8) zeroed 512-byte
+  *              sectors.
+  * fdlist:      out; receives the backup file descriptor.
+  * offsets:     out; set to 8 * 512.
+  * sys_name:    passed to make_backup() to build the name of the symlink
+  *              that records the backup file location.
+  * restart:     non-zero when re-opening for a restarted reshape; no
+  *              symlink is made in that case.
+  *
+  * Return 1 on success, 0 on any form of failure.  On failure the file
+  * descriptor is closed and *fdlist is left < 0, so a caller that later
+  * closes every descriptor >= 0 can neither leak nor double-close it.
+  */
+ int reshape_open_backup_file(char *backup_file,
+ 			     int fd,
+ 			     char *devname,
+ 			     long blocks,
+ 			     int *fdlist,
+ 			     unsigned long long *offsets,
+ 			     char *sys_name,
+ 			     int restart)
+ {
+ 	char buf[512];
+ 	struct stat stb;
+ 	long i;
+
+ 	*fdlist = open(backup_file, O_RDWR|O_CREAT|(restart ? O_TRUNC : O_EXCL),
+ 		       S_IRUSR | S_IWUSR);
+ 	*offsets = 8 * 512;
+ 	if (*fdlist < 0) {
+ 		pr_err("%s: cannot create backup file %s: %s\n",
+ 		       devname, backup_file, strerror(errno));
+ 		return 0;
+ 	}
+ 	/* Guard against backup file being on array device.
+ 	 * If array is partitioned or if LVM etc is in the
+ 	 * way this will not notice, but it is better than
+ 	 * nothing.
+ 	 * The comparison is only made when both fstat() calls succeed,
+ 	 * and uses dev_t so no bits of the device number are lost.
+ 	 */
+ 	if (fstat(*fdlist, &stb) == 0) {
+ 		dev_t backup_dev = stb.st_dev;
+
+ 		if (fstat(fd, &stb) == 0 && stb.st_rdev == backup_dev) {
+ 			pr_err("backup file must NOT be on the array being reshaped.\n");
+ 			goto fail;
+ 		}
+ 	}
+
+ 	/* need to check backup file is large enough: write it out in full */
+ 	memset(buf, 0, sizeof(buf));
+ 	for (i = 0; i < blocks + 8; i++) {
+ 		if (write(*fdlist, buf, sizeof(buf)) != (ssize_t)sizeof(buf)) {
+ 			pr_err("%s: cannot create backup file %s: %s\n",
+ 			       devname, backup_file, strerror(errno));
+ 			goto fail;
+ 		}
+ 	}
+ 	if (fsync(*fdlist) != 0) {
+ 		pr_err("%s: cannot create backup file %s: %s\n",
+ 		       devname, backup_file, strerror(errno));
+ 		goto fail;
+ 	}
+
+ 	/* Record where the backup file is, unless it is already under
+ 	 * MAP_DIR.  Failure to do so is reported but is not fatal.
+ 	 */
+ 	if (!restart && strncmp(backup_file, MAP_DIR, strlen(MAP_DIR)) != 0) {
+ 		char *bu = make_backup(sys_name);
+ 		if (symlink(backup_file, bu))
+ 			pr_err("Recording backup file in " MAP_DIR " failed: %s\n",
+ 			       strerror(errno));
+ 		free(bu);
+ 	}
+
+ 	return 1;
+
+ fail:
+ 	/* the message has already been printed, so errno may be clobbered */
+ 	close(*fdlist);
+ 	*fdlist = -1;
+ 	return 0;
+ }
+
+ /*
+  * Work out how many 512-byte sectors must be backed up at a time.
+  *
+  * We need an amount of data which is both a whole number of old stripes
+  * and a whole number of new stripes, i.e. the LCM of
+  * (chunk sectors * data disks) for the old and the new geometry.
+  *
+  * nchunk, ochunk: new and old chunk size in bytes.
+  * ndata, odata:   new and old number of data disks.
+  *
+  * Returns the LCM, or 0 if either stripe size is zero.
+  */
+ unsigned long compute_backup_blocks(int nchunk, int ochunk,
+ 				    unsigned int ndata, unsigned int odata)
+ {
+ 	unsigned long old_stripe, new_stripe, gcd;
+
+ 	/* widen before multiplying so large chunks times many disks do
+ 	 * not wrap in 32-bit arithmetic
+ 	 */
+ 	old_stripe = (unsigned long)(ochunk/512) * odata;
+ 	new_stripe = (unsigned long)(nchunk/512) * ndata;
+
+ 	/* a zero stripe has no meaningful GCD, and would lead to a
+ 	 * division by zero below
+ 	 */
+ 	if (old_stripe == 0 || new_stripe == 0)
+ 		return 0;
+
+ 	/* Find GCD */
+ 	gcd = GCD(old_stripe, new_stripe);
+ 	if (gcd == 0)
+ 		return 0;
+
+ 	/* LCM == product / GCD; dividing first keeps the intermediate
+ 	 * value no larger than the result
+ 	 */
+ 	return (old_stripe / gcd) * new_stripe;
+ }
+
+char *analyse_change(char *devname, struct mdinfo *info, struct reshape *re)
+{
+ /* Based on the current array state in info->array and
+ * the changes in info->new_* etc, determine:
+ * - whether the change is possible
+ * - Intermediate level/raid_disks/layout
+ * - whether a restriping reshape is needed
+ * - number of sectors in minimum change unit. This
+ * will cover a whole number of stripes in 'before' and
+ * 'after'.
+ *
+ * Return message if the change should be rejected
+ * NULL if the change can be achieved
+ *
+ * This can be called as part of starting a reshape, or
+ * when assembling an array that is undergoing reshape.
+ */
+ int near, far, offset, copies;
+ int new_disks;
+ int old_chunk, new_chunk;
+ /* delta_parity records change in number of devices
+ * caused by level change
+ */
+ int delta_parity = 0;
+
+ memset(re, 0, sizeof(*re));
+
+ /* If a new level not explicitly given, we assume no-change */
+ if (info->new_level == UnSet)
+ info->new_level = info->array.level;
+
+ if (info->new_chunk)
+ switch (info->new_level) {
+ case 0:
+ case 4:
+ case 5:
+ case 6:
+ case 10:
+ /* chunk size is meaningful, must divide component_size
+ * evenly
+ */
+ if (info->component_size % (info->new_chunk/512)) {
+ unsigned long long shrink = info->component_size;
+ shrink &= ~(unsigned long long)(info->new_chunk/512-1);
+ pr_err("New chunk size (%dK) does not evenly divide device size (%lluk)\n",
+ info->new_chunk/1024, info->component_size/2);
+ pr_err("After shrinking any filesystem, \"mdadm --grow %s --size %llu\"\n",
+ devname, shrink/2);
+ pr_err("will shrink the array so the given chunk size would work.\n");
+ return "";
+ }
+ break;
+ default:
+ return "chunk size not meaningful for this level";
+ }
+ else
+ info->new_chunk = info->array.chunk_size;
+
+ switch (info->array.level) {
+ default:
+ return "No reshape is possibly for this RAID level";
+ case LEVEL_LINEAR:
+ if (info->delta_disks != UnSet)
+ return "Only --add is supported for LINEAR, setting --raid-disks is not needed";
+ else
+ return "Only --add is supported for LINEAR, other --grow options are not meaningful";
+ case 1:
+ /* RAID1 can convert to RAID1 with different disks, or
+ * raid5 with 2 disks, or
+ * raid0 with 1 disk
+ */
+ if (info->new_level > 1 &&
+ (info->component_size & 7))
+ return "Cannot convert RAID1 of this size - reduce size to multiple of 4K first.";
+ if (info->new_level == 0) {
+ if (info->delta_disks != UnSet &&
+ info->delta_disks != 0)
+ return "Cannot change number of disks with RAID1->RAID0 conversion";
+ re->level = 0;
+ re->before.data_disks = 1;
+ re->after.data_disks = 1;
+ return NULL;
+ }
+ if (info->new_level == 1) {
+ if (info->delta_disks == UnSet)
+ /* Don't know what to do */
+ return "no change requested for Growing RAID1";
+ re->level = 1;
+ return NULL;
+ }
+ if (info->array.raid_disks == 2 &&
+ info->new_level == 5) {
+
+ re->level = 5;
+ re->before.data_disks = 1;
+ if (info->delta_disks != UnSet &&
+ info->delta_disks != 0)
+ re->after.data_disks = 1 + info->delta_disks;
+ else
+ re->after.data_disks = 1;
+ if (re->after.data_disks < 1)
+ return "Number of disks too small for RAID5";
+
+ re->before.layout = ALGORITHM_LEFT_SYMMETRIC;
+ info->array.chunk_size = 65536;
+ break;
+ }
+ /* Could do some multi-stage conversions, but leave that to
+ * later.
+ */
+ return "Impossibly level change request for RAID1";
+
+ case 10:
+ /* RAID10 can be converted from near mode to
+ * RAID0 by removing some devices.
+ * It can also be reshaped if the kernel supports
+ * new_data_offset.
+ */
+ switch (info->new_level) {
+ case 0:
+ if ((info->array.layout & ~0xff) != 0x100)
+ return "Cannot Grow RAID10 with far/offset layout";
+ /* number of devices must be multiple of number of copies */
+ if (info->array.raid_disks % (info->array.layout & 0xff))
+ return "RAID10 layout too complex for Grow operation";
+
+ new_disks = (info->array.raid_disks
+ / (info->array.layout & 0xff));
+ if (info->delta_disks == UnSet)
+ info->delta_disks = (new_disks
+ - info->array.raid_disks);
+
+ if (info->delta_disks != new_disks - info->array.raid_disks)
+ return "New number of raid-devices impossible for RAID10";
+ if (info->new_chunk &&
+ info->new_chunk != info->array.chunk_size)
+ return "Cannot change chunk-size with RAID10 Grow";
+
+ /* looks good */
+ re->level = 0;
+ re->before.data_disks = new_disks;
+ re->after.data_disks = re->before.data_disks;
+ return NULL;
+
+ case 10:
+ near = info->array.layout & 0xff;
+ far = (info->array.layout >> 8) & 0xff;
+ offset = info->array.layout & 0x10000;
+ if (far > 1 && !offset)
+ return "Cannot reshape RAID10 in far-mode";
+ copies = near * far;
+
+ old_chunk = info->array.chunk_size * far;
+
+ if (info->new_layout == UnSet)
+ info->new_layout = info->array.layout;
+ else {
+ near = info->new_layout & 0xff;
+ far = (info->new_layout >> 8) & 0xff;
+ offset = info->new_layout & 0x10000;
+ if (far > 1 && !offset)
+ return "Cannot reshape RAID10 to far-mode";
+ if (near * far != copies)
+ return "Cannot change number of copies when reshaping RAID10";
+ }
+ if (info->delta_disks == UnSet)
+ info->delta_disks = 0;
+ new_disks = (info->array.raid_disks +
+ info->delta_disks);
+
+ new_chunk = info->new_chunk * far;
+
+ re->level = 10;
+ re->before.layout = info->array.layout;
+ re->before.data_disks = info->array.raid_disks;
+ re->after.layout = info->new_layout;
+ re->after.data_disks = new_disks;
+ /* For RAID10 we don't do backup but do allow reshape,
+ * so set backup_blocks to INVALID_SECTORS rather than
+ * zero.
+ * And there is no need to synchronise stripes on both
+ * 'old' and 'new'. So the important
+ * number is the minimum data_offset difference
+ * which is the larger of (offset copies * chunk).
+ */
+ re->backup_blocks = INVALID_SECTORS;
+ re->min_offset_change = max(old_chunk, new_chunk) / 512;
+ if (new_disks < re->before.data_disks &&
+ info->space_after < re->min_offset_change)
+ /* Reduce component size by one chunk */
+ re->new_size = (info->component_size -
+ re->min_offset_change);
+ else
+ re->new_size = info->component_size;
+ re->new_size = re->new_size * new_disks / copies;
+ return NULL;
+
+ default:
+ return "RAID10 can only be changed to RAID0";
+ }
+ case 0:
+ /* RAID0 can be converted to RAID10, or to RAID456 */
+ if (info->new_level == 10) {
+ if (info->new_layout == UnSet && info->delta_disks == UnSet) {
+ /* Assume near=2 layout */
+ info->new_layout = 0x102;
+ info->delta_disks = info->array.raid_disks;
+ }
+ if (info->new_layout == UnSet) {
+ int copies = 1 + (info->delta_disks
+ / info->array.raid_disks);
+ if (info->array.raid_disks * (copies-1)
+ != info->delta_disks)
+ return "Impossible number of devices for RAID0->RAID10";
+ info->new_layout = 0x100 + copies;
+ }
+ if (info->delta_disks == UnSet) {
+ int copies = info->new_layout & 0xff;
+ if (info->new_layout != 0x100 + copies)
+ return "New layout impossible for RAID0->RAID10";;
+ info->delta_disks = (copies - 1) *
+ info->array.raid_disks;
+ }
+ if (info->new_chunk &&
+ info->new_chunk != info->array.chunk_size)
+ return "Cannot change chunk-size with RAID0->RAID10";
+ /* looks good */
+ re->level = 10;
+ re->before.data_disks = (info->array.raid_disks +
+ info->delta_disks);
+ re->after.data_disks = re->before.data_disks;
+ re->before.layout = info->new_layout;
+ return NULL;
+ }
+
+ /* RAID0 can also covert to RAID0/4/5/6 by first converting to
+ * a raid4 style layout of the final level.
+ */
+ switch (info->new_level) {
+ case 4:
+ delta_parity = 1;
+ case 0:
+ re->level = 4;
+ re->before.layout = 0;
+ break;
+ case 5:
+ delta_parity = 1;
+ re->level = 5;
+ re->before.layout = ALGORITHM_PARITY_N;
+ if (info->new_layout == UnSet)
+ info->new_layout = map_name(r5layout, "default");
+ break;
+ case 6:
+ delta_parity = 2;
+ re->level = 6;
+ re->before.layout = ALGORITHM_PARITY_N;
+ if (info->new_layout == UnSet)
+ info->new_layout = map_name(r6layout, "default");
+ break;
+ default:
+ return "Impossible level change requested";
+ }
+ re->before.data_disks = info->array.raid_disks;
+ /* determining 'after' layout happens outside this 'switch' */
+ break;
+
+ case 4:
+ info->array.layout = ALGORITHM_PARITY_N;
+ case 5:
+ switch (info->new_level) {
+ case 0:
+ delta_parity = -1;
+ case 4:
+ re->level = info->array.level;
+ re->before.data_disks = info->array.raid_disks - 1;
+ re->before.layout = info->array.layout;
+ break;
+ case 5:
+ re->level = 5;
+ re->before.data_disks = info->array.raid_disks - 1;
+ re->before.layout = info->array.layout;
+ break;
+ case 6:
+ delta_parity = 1;
+ re->level = 6;
+ re->before.data_disks = info->array.raid_disks - 1;
+ switch (info->array.layout) {
+ case ALGORITHM_LEFT_ASYMMETRIC:
+ re->before.layout = ALGORITHM_LEFT_ASYMMETRIC_6;
+ break;
+ case ALGORITHM_RIGHT_ASYMMETRIC:
+ re->before.layout = ALGORITHM_RIGHT_ASYMMETRIC_6;
+ break;
+ case ALGORITHM_LEFT_SYMMETRIC:
+ re->before.layout = ALGORITHM_LEFT_SYMMETRIC_6;
+ break;
+ case ALGORITHM_RIGHT_SYMMETRIC:
+ re->before.layout = ALGORITHM_RIGHT_SYMMETRIC_6;
+ break;
+ case ALGORITHM_PARITY_0:
+ re->before.layout = ALGORITHM_PARITY_0_6;
+ break;
+ case ALGORITHM_PARITY_N:
+ re->before.layout = ALGORITHM_PARITY_N_6;
+ break;
+ default:
+ return "Cannot convert an array with this layout";
+ }
+ break;
+ case 1:
+ if (info->array.raid_disks != 2)
+ return "Can only convert a 2-device array to RAID1";
+ if (info->delta_disks != UnSet &&
+ info->delta_disks != 0)
+ return "Cannot set raid_disk when converting RAID5->RAID1";
+ re->level = 1;
+ info->new_chunk = 0;
+ return NULL;
+ default:
+ return "Impossible level change requested";
+ }
+ break;
+ case 6:
+ switch (info->new_level) {
+ case 4:
+ case 5:
+ delta_parity = -1;
+ case 6:
+ re->level = 6;
+ re->before.data_disks = info->array.raid_disks - 2;
+ re->before.layout = info->array.layout;
+ break;
+ default:
+ return "Impossible level change requested";
+ }
+ break;
+ }
+
+ /* If we reached here then it looks like a re-stripe is
+ * happening. We have determined the intermediate level
+ * and initial raid_disks/layout and stored these in 're'.
+ *
+ * We need to deduce the final layout that can be atomically
+ * converted to the end state.
+ */
+ switch (info->new_level) {
+ case 0:
+ /* We can only get to RAID0 from RAID4 or RAID5
+ * with appropriate layout and one extra device
+ */
+ if (re->level != 4 && re->level != 5)
+ return "Cannot covert to RAID0 from this level";
+
+ switch (re->level) {
+ case 4:
+ re->before.layout = 0;
+ re->after.layout = 0;
+ break;
+ case 5:
+ re->after.layout = ALGORITHM_PARITY_N;
+ break;
+ }
+ break;
+
+ case 4:
+ /* We can only get to RAID4 from RAID5 */
+ if (re->level != 4 && re->level != 5)
+ return "Cannot convert to RAID4 from this level";
+
+ switch (re->level) {
+ case 4:
+ re->after.layout = 0;
+ break;
+ case 5:
+ re->after.layout = ALGORITHM_PARITY_N;
+ break;
+ }
+ break;
+
+ case 5:
+ /* We get to RAID5 from RAID5 or RAID6 */
+ if (re->level != 5 && re->level != 6)
+ return "Cannot convert to RAID5 from this level";
+
+ switch (re->level) {
+ case 5:
+ if (info->new_layout == UnSet)
+ re->after.layout = re->before.layout;
+ else
+ re->after.layout = info->new_layout;
+ break;
+ case 6:
+ if (info->new_layout == UnSet)
+ info->new_layout = re->before.layout;
+
+ /* after.layout needs to be raid6 version of new_layout */
+ if (info->new_layout == ALGORITHM_PARITY_N)
+ re->after.layout = ALGORITHM_PARITY_N;
+ else {
+ char layout[40];
+ char *ls = map_num(r5layout, info->new_layout);
+ int l;
+ if (ls) {
+ /* Current RAID6 layout has a RAID5
+ * equivalent - good
+ */
+ strcat(strcpy(layout, ls), "-6");
+ l = map_name(r6layout, layout);
+ if (l == UnSet)
+ return "Cannot find RAID6 layout to convert to";
+ } else {
+ /* Current RAID6 has no equivalent.
+ * If it is already a '-6' layout we
+ * can leave it unchanged, else we must
+ * fail
+ */
+ ls = map_num(r6layout, info->new_layout);
+ if (!ls ||
+ strcmp(ls+strlen(ls)-2, "-6") != 0)
+ return "Please specify new layout";
+ l = info->new_layout;
+ }
+ re->after.layout = l;
+ }
+ }
+ break;
+
+ case 6:
+ /* We must already be at level 6 */
+ if (re->level != 6)
+ return "Impossible level change";
+ if (info->new_layout == UnSet)
+ re->after.layout = info->array.layout;
+ else
+ re->after.layout = info->new_layout;
+ break;
+ default:
+ return "Impossible level change requested";
+ }
+ if (info->delta_disks == UnSet)
+ info->delta_disks = delta_parity;
+
+ re->after.data_disks = (re->before.data_disks
+ + info->delta_disks
+ - delta_parity);
+ switch (re->level) {
+ case 6: re->parity = 2;
+ break;
+ case 4:
+ case 5: re->parity = 1;
+ break;
+ default: re->parity = 0;
+ break;
+ }
+ /* So we have a restripe operation, we need to calculate the number
+ * of blocks per reshape operation.
+ */
+ re->new_size = info->component_size * re->before.data_disks;
+ if (info->new_chunk == 0)
+ info->new_chunk = info->array.chunk_size;
+ if (re->after.data_disks == re->before.data_disks &&
+ re->after.layout == re->before.layout &&
+ info->new_chunk == info->array.chunk_size) {
+ /* Nothing to change, can change level immediately. */
+ re->level = info->new_level;
+ re->backup_blocks = 0;
+ return NULL;
+ }
+ if (re->after.data_disks == 1 && re->before.data_disks == 1) {
+ /* chunk and layout changes make no difference */
+ re->level = info->new_level;
+ re->backup_blocks = 0;
+ return NULL;
+ }
+
+ if (re->after.data_disks == re->before.data_disks &&
+ get_linux_version() < 2006032)
+ return "in-place reshape is not safe before 2.6.32 - sorry.";
+
+ if (re->after.data_disks < re->before.data_disks &&
+ get_linux_version() < 2006030)
+ return "reshape to fewer devices is not supported before 2.6.30 - sorry.";
+
+ re->backup_blocks = compute_backup_blocks(
+ info->new_chunk, info->array.chunk_size,
+ re->after.data_disks,
+ re->before.data_disks);
+ re->min_offset_change = re->backup_blocks / re->before.data_disks;
+
+ re->new_size = info->component_size * re->after.data_disks;
+ return NULL;
+}
+
+ /*
+  * set_array_size() - raise the kernel's "array_size" sysfs attribute to the
+  * size recorded in external metadata.
+  *
+  * st:           metadata handler; its container_content() is queried for the
+  *               subarray named in text_version
+  * sra:          sysfs handle of the array whose "array_size" is read/written
+  * text_version: metadata version string; when NULL, sra->text_version is
+  *               used.  The subarray name is whatever follows the first '/'
+  *               found after the leading character.
+  *
+  * The new size is custom_array_size/2 from the metadata (presumably a
+  * sectors -> KiB conversion - confirm against the metadata handler).
+  *
+  * Returns 0 only when "array_size" was actually increased.  Returns -1 in
+  * every other case: bad arguments, no subarray component in text_version,
+  * no metadata content, current size unreadable, new size not larger than
+  * the current one, or the sysfs write failing.
+  */
+ static int set_array_size(struct supertype *st, struct mdinfo *sra,
+ 			  char *text_version)
+ {
+ 	struct mdinfo *info;
+ 	char *slash;
+ 	char *subarray;
+ 	int ret_val = -1;
+
+ 	if ((st == NULL) || (sra == NULL))
+ 		return ret_val;
+
+ 	if (text_version == NULL)
+ 		text_version = sra->text_version;
+
+ 	/* The search starts one character in, so an empty string must be
+ 	 * rejected before we step past its terminator.
+ 	 */
+ 	if (text_version[0] == '\0') {
+ 		dprintf("Error: set_array_size(): empty text_version\n");
+ 		return ret_val;
+ 	}
+ 	/* Without a separator there is no subarray name; strchr() would
+ 	 * return NULL and NULL+1 must never be passed on.
+ 	 */
+ 	slash = strchr(text_version+1, '/');
+ 	if (slash == NULL) {
+ 		dprintf("Error: set_array_size(): no subarray in text_version\n");
+ 		return ret_val;
+ 	}
+ 	subarray = slash + 1;
+
+ 	info = st->ss->container_content(st, subarray);
+ 	if (info) {
+ 		unsigned long long current_size = 0;
+ 		unsigned long long new_size =
+ 			info->custom_array_size/2;
+
+ 		/* Only ever grow the exported size, never shrink it here */
+ 		if (sysfs_get_ll(sra, NULL, "array_size", &current_size) == 0 &&
+ 		    new_size > current_size) {
+ 			if (sysfs_set_num(sra, NULL, "array_size", new_size)
+ 			    < 0)
+ 				dprintf("Error: Cannot set array size");
+ 			else {
+ 				ret_val = 0;
+ 				dprintf("Array size changed");
+ 			}
+ 			/* completes whichever message was started above */
+ 			dprintf_cont(" from %llu to %llu.\n",
+ 				     current_size, new_size);
+ 		}
+ 		sysfs_free(info);
+ 	} else
+ 		dprintf("Error: set_array_size(): info pointer in NULL\n");
+
+ 	return ret_val;
+ }
+
+ /* Forward declarations of two file-local helpers.  Grow_reshape() below
+  * calls both of them before their definitions are seen:
+  *  - reshape_array() is used to reshape a single array;
+  *  - reshape_container() is used when the change applies to a whole
+  *    container (array.level == LEVEL_CONTAINER).
+  */
+ static int reshape_array(char *container, int fd, char *devname,
+ struct supertype *st, struct mdinfo *info,
+ int force, struct mddev_dev *devlist,
+ unsigned long long data_offset,
+ char *backup_file, int verbose, int forked,
+ int restart, int freeze_reshape);
+ static int reshape_container(char *container, char *devname,
+ int mdfd,
+ struct supertype *st,
+ struct mdinfo *info,
+ int force,
+ char *backup_file, int verbose,
+ int forked, int restart, int freeze_reshape);
+
+ /*
+  * Grow_reshape() - apply a --grow request (component size, level, layout,
+  * chunk size, number of devices and/or data offset) to an active md array.
+  *
+  * devname:     name of the array, used in messages
+  * fd:          open descriptor for the array.  For an external-metadata
+  *              container (no subarray) it is closed here and replaced by
+  *              an exclusive descriptor on the container.
+  * devlist:     extra devices; counted here as potential spares and then
+  *              passed on to reshape_array()
+  * data_offset: requested new data offset, or INVALID_SECTORS when unset
+  * c:           general context (force, verbose, backup_file are read)
+  * s:           requested shape (size, level, layout_str, chunk, raiddisks,
+  *              assume_clean are read); s->size is overwritten with the
+  *              component size actually in effect
+  *
+  * Returns 0 on success.  Errors detected here return 1; otherwise the
+  * status of reshape_array() / reshape_container() is returned.
+  */
+ int Grow_reshape(char *devname, int fd,
+ 		 struct mddev_dev *devlist,
+ 		 unsigned long long data_offset,
+ 		 struct context *c, struct shape *s)
+ {
+ 	/* Make some changes in the shape of an array.
+ 	 * The kernel must support the change.
+ 	 *
+ 	 * There are three different changes. Each can trigger
+ 	 * a resync or recovery so we freeze that until we have
+ 	 * requested everything (if kernel supports freezing - 2.6.30).
+ 	 * The steps are:
+ 	 * - change size (i.e. component_size)
+ 	 * - change level
+ 	 * - change layout/chunksize/ndisks
+ 	 *
+ 	 * The last can require a reshape. It is different on different
+ 	 * levels so we need to check the level before actioning it.
+ 	 * Sometimes the level change needs to be requested after the
+ 	 * reshape (e.g. raid6->raid5, raid5->raid0)
+ 	 *
+ 	 */
+ 	struct mdu_array_info_s array;
+ 	int rv = 0;
+ 	struct supertype *st;
+ 	char *subarray = NULL;
+
+ 	int frozen;
+ 	int changed = 0;
+ 	char *container = NULL;
+ 	int cfd = -1;
+
+ 	struct mddev_dev *dv;
+ 	int added_disks;
+
+ 	struct mdinfo info;
+ 	struct mdinfo *sra;
+
+ 	if (ioctl(fd, GET_ARRAY_INFO, &array) < 0) {
+ 		pr_err("%s is not an active md array - aborting\n",
+ 		       devname);
+ 		return 1;
+ 	}
+ 	/* --data-offset is only accepted for RAID10 and RAID4/5/6 */
+ 	if (data_offset != INVALID_SECTORS && array.level != 10
+ 	    && (array.level < 4 || array.level > 6)) {
+ 		pr_err("--grow --data-offset not yet supported\n");
+ 		return 1;
+ 	}
+
+ 	if (s->size > 0 &&
+ 	    (s->chunk || s->level!= UnSet || s->layout_str || s->raiddisks)) {
+ 		pr_err("cannot change component size at the same time as other changes.\n"
+ 		       " Change size first, then check data is intact before making other changes.\n");
+ 		return 1;
+ 	}
+
+ 	if (s->raiddisks && s->raiddisks < array.raid_disks && array.level > 1 &&
+ 	    get_linux_version() < 2006032 &&
+ 	    !check_env("MDADM_FORCE_FEWER")) {
+ 		pr_err("reducing the number of devices is not safe before Linux 2.6.32\n"
+ 		       " Please use a newer kernel\n");
+ 		return 1;
+ 	}
+
+ 	st = super_by_fd(fd, &subarray);
+ 	if (!st) {
+ 		pr_err("Unable to determine metadata format for %s\n", devname);
+ 		return 1;
+ 	}
+ 	if (s->raiddisks > st->max_devs) {
+ 		pr_err("Cannot increase raid-disks on this array beyond %d\n", st->max_devs);
+ 		return 1;
+ 	}
+ 	/* Converting to RAID0: drop a (non-clustered) internal bitmap first */
+ 	if (s->level == 0 &&
+ 	    (array.state & (1<<MD_SB_BITMAP_PRESENT)) &&
+ 	    !(array.state & (1<<MD_SB_CLUSTERED))) {
+ 		array.state &= ~(1<<MD_SB_BITMAP_PRESENT);
+ 		if (ioctl(fd, SET_ARRAY_INFO, &array)!= 0) {
+ 			pr_err("failed to remove internal bitmap.\n");
+ 			return 1;
+ 		}
+ 	}
+
+ 	/* in the external case we need to check that the requested reshape is
+ 	 * supported, and perform an initial check that the container holds the
+ 	 * pre-requisite spare devices (mdmon owns final validation)
+ 	 */
+ 	if (st->ss->external) {
+ 		if (subarray) {
+ 			container = st->container_devnm;
+ 			cfd = open_dev_excl(st->container_devnm);
+ 		} else {
+ 			/* We were handed the container itself: swap the
+ 			 * caller's descriptor for an exclusive one.
+ 			 */
+ 			container = st->devnm;
+ 			close(fd);
+ 			cfd = open_dev_excl(st->devnm);
+ 			fd = cfd;
+ 		}
+ 		if (cfd < 0) {
+ 			pr_err("Unable to open container for %s\n",
+ 			       devname);
+ 			free(subarray);
+ 			return 1;
+ 		}
+
+ 		/* rv is 0 again on success, so later code is unaffected */
+ 		rv = st->ss->load_container(st, cfd, NULL);
+
+ 		if (rv) {
+ 			pr_err("Cannot read superblock for %s\n",
+ 			       devname);
+ 			free(subarray);
+ 			return 1;
+ 		}
+
+ 		/* check if operation is supported for metadata handler */
+ 		if (st->ss->container_content) {
+ 			struct mdinfo *cc = NULL;
+ 			struct mdinfo *content = NULL;
+
+ 			cc = st->ss->container_content(st, subarray);
+ 			for (content = cc; content ; content = content->next) {
+ 				int allow_reshape = 1;
+
+ 				/* check if reshape is allowed based on metadata
+ 				 * indications stored in content.array.status
+ 				 */
+ 				if (content->array.state & (1<<MD_SB_BLOCK_VOLUME))
+ 					allow_reshape = 0;
+ 				if (content->array.state
+ 				    & (1<<MD_SB_BLOCK_CONTAINER_RESHAPE))
+ 					allow_reshape = 0;
+ 				if (!allow_reshape) {
+ 					pr_err("cannot reshape arrays in container with unsupported metadata: %s(%s)\n",
+ 					       devname, container);
+ 					sysfs_free(cc);
+ 					free(subarray);
+ 					return 1;
+ 				}
+ 			}
+ 			sysfs_free(cc);
+ 		}
+ 		if (mdmon_running(container))
+ 			st->update_tail = &st->updates;
+ 	}
+
+ 	/* Devices given on the command line count towards available spares */
+ 	added_disks = 0;
+ 	for (dv = devlist; dv; dv = dv->next)
+ 		added_disks++;
+ 	if (s->raiddisks > array.raid_disks &&
+ 	    array.spare_disks +added_disks < (s->raiddisks - array.raid_disks) &&
+ 	    !c->force) {
+ 		pr_err("Need %d spare%s to avoid degraded array, and only have %d.\n"
+ 		       " Use --force to over-ride this check.\n",
+ 		       s->raiddisks - array.raid_disks,
+ 		       s->raiddisks - array.raid_disks == 1 ? "" : "s",
+ 		       array.spare_disks + added_disks);
+ 		return 1;
+ 	}
+
+ 	sra = sysfs_read(fd, NULL, GET_LEVEL | GET_DISKS | GET_DEVS
+ 			 | GET_STATE | GET_VERSION);
+ 	if (sra) {
+ 		if (st->ss->external && subarray == NULL) {
+ 			array.level = LEVEL_CONTAINER;
+ 			sra->array.level = LEVEL_CONTAINER;
+ 		}
+ 	} else {
+ 		pr_err("failed to read sysfs parameters for %s\n",
+ 		       devname);
+ 		return 1;
+ 	}
+ 	frozen = freeze(st);
+ 	if (frozen < -1) {
+ 		/* freeze() already spewed the reason */
+ 		sysfs_free(sra);
+ 		return 1;
+ 	} else if (frozen < 0) {
+ 		pr_err("%s is performing resync/recovery and cannot be reshaped\n", devname);
+ 		sysfs_free(sra);
+ 		return 1;
+ 	}
+
+ 	/* ========= set size =============== */
+ 	if (s->size > 0 && (s->size == MAX_SIZE || s->size != (unsigned)array.size)) {
+ 		unsigned long long orig_size = get_component_size(fd)/2;
+ 		unsigned long long min_csize;
+ 		struct mdinfo *mdi;
+ 		int raid0_takeover = 0;
+
+ 		if (orig_size == 0)
+ 			orig_size = (unsigned) array.size;
+
+ 		if (orig_size == 0) {
+ 			pr_err("Cannot set device size in this type of array.\n");
+ 			rv = 1;
+ 			goto release;
+ 		}
+
+ 		if (reshape_super(st, s->size, UnSet, UnSet, 0, 0, UnSet, NULL,
+ 				  devname, APPLY_METADATA_CHANGES, c->verbose > 0)) {
+ 			rv = 1;
+ 			goto release;
+ 		}
+ 		sync_metadata(st);
+ 		if (st->ss->external) {
+ 			/* metadata can have size limitation
+ 			 * update size value according to metadata information
+ 			 */
+ 			struct mdinfo *sizeinfo =
+ 				st->ss->container_content(st, subarray);
+ 			if (sizeinfo) {
+ 				unsigned long long new_size =
+ 					sizeinfo->custom_array_size/2;
+ 				int data_disks = get_data_disks(
+ 					sizeinfo->array.level,
+ 					sizeinfo->array.layout,
+ 					sizeinfo->array.raid_disks);
+ 				new_size /= data_disks;
+ 				dprintf("Metadata size correction from %llu to %llu (%llu)\n", orig_size, new_size,
+ 					new_size * data_disks);
+ 				s->size = new_size;
+ 				sysfs_free(sizeinfo);
+ 			}
+ 		}
+
+ 		/* Update the size of each member device in case
+ 		 * they have been resized. This will never reduce
+ 		 * below the current used-size. The "size" attribute
+ 		 * understands '0' to mean 'max'.
+ 		 */
+ 		min_csize = 0;
+ 		rv = 0;
+ 		for (mdi = sra->devs; mdi; mdi = mdi->next) {
+ 			if (sysfs_set_num(sra, mdi, "size",
+ 					  s->size == MAX_SIZE ? 0 : s->size) < 0) {
+ 				/* Probably kernel refusing to let us
+ 				 * reduce the size - not an error.
+ 				 */
+ 				break;
+ 			}
+ 			if (array.not_persistent == 0 &&
+ 			    array.major_version == 0 &&
+ 			    get_linux_version() < 3001000) {
+ 				/* Dangerous to allow size to exceed 2TB */
+ 				unsigned long long csize;
+ 				if (sysfs_get_ll(sra, mdi, "size", &csize) == 0) {
+ 					if (csize >= 2ULL*1024*1024*1024)
+ 						csize = 2ULL*1024*1024*1024;
+ 					if ((min_csize == 0 || (min_csize
+ 								> csize)))
+ 						min_csize = csize;
+ 				}
+ 			}
+ 		}
+ 		/* NOTE(review): rv is not assigned inside the loop above, so
+ 		 * this check cannot currently trigger; kept as-is.
+ 		 */
+ 		if (rv) {
+ 			pr_err("Cannot set size on array members.\n");
+ 			goto size_change_error;
+ 		}
+ 		if (min_csize && s->size > min_csize) {
+ 			pr_err("Cannot safely make this array use more than 2TB per device on this kernel.\n");
+ 			rv = 1;
+ 			goto size_change_error;
+ 		}
+ 		if (min_csize && s->size == MAX_SIZE) {
+ 			/* Don't let the kernel choose a size - it will get
+ 			 * it wrong
+ 			 */
+ 			pr_err("Limited v0.90 array to 2TB per device\n");
+ 			s->size = min_csize;
+ 		}
+ 		if (st->ss->external) {
+ 			if (sra->array.level == 0) {
+ 				rv = sysfs_set_str(sra, NULL, "level",
+ 						   "raid5");
+ 				if (!rv) {
+ 					raid0_takeover = 1;
+ 					/* get array parameters after takeover
+ 					 * to change one parameter at time only
+ 					 */
+ 					rv = ioctl(fd, GET_ARRAY_INFO, &array);
+ 				}
+ 			}
+ 			/* make sure mdmon is
+ 			 * aware of the new level */
+ 			if (!mdmon_running(st->container_devnm))
+ 				start_mdmon(st->container_devnm);
+ 			ping_monitor(container);
+ 			if (mdmon_running(st->container_devnm) &&
+ 			    st->update_tail == NULL)
+ 				st->update_tail = &st->updates;
+ 		}
+
+ 		if (s->size == MAX_SIZE)
+ 			s->size = 0;
+ 		array.size = s->size;
+ 		if (s->size & ~INT32_MAX) {
+ 			/* got truncated to 32bit, write to
+ 			 * component_size instead
+ 			 */
+ 			if (sra)
+ 				rv = sysfs_set_num(sra, NULL,
+ 						   "component_size", s->size);
+ 			else
+ 				rv = -1;
+ 		} else {
+ 			rv = ioctl(fd, SET_ARRAY_INFO, &array);
+
+ 			/* manage array size when it is managed externally
+ 			 */
+ 			if ((rv == 0) && st->ss->external)
+ 				rv = set_array_size(st, sra, sra->text_version);
+ 		}
+
+ 		if (raid0_takeover) {
+ 			/* do not resync non-existing parity,
+ 			 * we will drop it anyway
+ 			 */
+ 			sysfs_set_str(sra, NULL, "sync_action", "frozen");
+ 			/* go back to raid0, drop parity disk
+ 			 */
+ 			sysfs_set_str(sra, NULL, "level", "raid0");
+ 			ioctl(fd, GET_ARRAY_INFO, &array);
+ 		}
+
+ size_change_error:
+ 		if (rv != 0) {
+ 			int err = errno;
+
+ 			/* restore metadata */
+ 			if (reshape_super(st, orig_size, UnSet, UnSet, 0, 0,
+ 					  UnSet, NULL, devname,
+ 					  ROLLBACK_METADATA_CHANGES,
+ 					  c->verbose) == 0)
+ 				sync_metadata(st);
+ 			pr_err("Cannot set device size for %s: %s\n",
+ 			       devname, strerror(err));
+ 			if (err == EBUSY &&
+ 			    (array.state & (1<<MD_SB_BITMAP_PRESENT)))
+ 				cont_err("Bitmap must be removed before size can be changed\n");
+ 			rv = 1;
+ 			goto release;
+ 		}
+ 		if (s->assume_clean) {
+ 			/* This will fail on kernels older than 3.0 unless
+ 			 * a backport has been arranged.
+ 			 */
+ 			if (sra == NULL ||
+ 			    sysfs_set_str(sra, NULL, "resync_start", "none") < 0)
+ 				pr_err("--assume-clean not supported with --grow on this kernel\n");
+ 		}
+ 		/* Re-read what the kernel actually settled on */
+ 		ioctl(fd, GET_ARRAY_INFO, &array);
+ 		s->size = get_component_size(fd)/2;
+ 		if (s->size == 0)
+ 			s->size = array.size;
+ 		if (c->verbose >= 0) {
+ 			if (s->size == orig_size)
+ 				pr_err("component size of %s unchanged at %lluK\n",
+ 				       devname, s->size);
+ 			else
+ 				pr_err("component size of %s has been set to %lluK\n",
+ 				       devname, s->size);
+ 		}
+ 		changed = 1;
+ 	} else if (array.level != LEVEL_CONTAINER) {
+ 		s->size = get_component_size(fd)/2;
+ 		if (s->size == 0)
+ 			s->size = array.size;
+ 	}
+
+ 	/* See if there is anything else to do */
+ 	if ((s->level == UnSet || s->level == array.level) &&
+ 	    (s->layout_str == NULL) &&
+ 	    (s->chunk == 0 || s->chunk == array.chunk_size) &&
+ 	    data_offset == INVALID_SECTORS &&
+ 	    (s->raiddisks == 0 || s->raiddisks == array.raid_disks)) {
+ 		/* Nothing more to do */
+ 		if (!changed && c->verbose >= 0)
+ 			pr_err("%s: no change requested\n",
+ 			       devname);
+ 		goto release;
+ 	}
+
+ 	/* ========= check for Raid10/Raid1 -> Raid0 conversion ===============
+ 	 * current implementation assumes that following conditions must be met:
+ 	 * - RAID10:
+ 	 * - far_copies == 1
+ 	 * - near_copies == 2
+ 	 */
+ 	if ((s->level == 0 && array.level == 10 && sra &&
+ 	     array.layout == ((1 << 8) + 2) && !(array.raid_disks & 1)) ||
+ 	    (s->level == 0 && array.level == 1 && sra)) {
+ 		int err;
+ 		err = remove_disks_for_takeover(st, sra, array.layout);
+ 		if (err) {
+ 			dprintf("Array cannot be reshaped\n");
+ 			if (cfd > -1)
+ 				close(cfd);
+ 			rv = 1;
+ 			goto release;
+ 		}
+ 		/* Make sure mdmon has seen the device removal
+ 		 * and updated metadata before we continue with
+ 		 * level change
+ 		 */
+ 		if (container)
+ 			ping_monitor(container);
+ 	}
+
+ 	/* Describe the requested change in 'info' for the reshape helpers */
+ 	memset(&info, 0, sizeof(info));
+ 	info.array = array;
+ 	sysfs_init(&info, fd, NULL);
+ 	strcpy(info.text_version, sra->text_version);
+ 	info.component_size = s->size*2;
+ 	info.new_level = s->level;
+ 	info.new_chunk = s->chunk * 1024;
+ 	if (info.array.level == LEVEL_CONTAINER) {
+ 		info.delta_disks = UnSet;
+ 		info.array.raid_disks = s->raiddisks;
+ 	} else if (s->raiddisks)
+ 		info.delta_disks = s->raiddisks - info.array.raid_disks;
+ 	else
+ 		info.delta_disks = UnSet;
+ 	if (s->layout_str == NULL) {
+ 		info.new_layout = UnSet;
+ 		if (info.array.level == 6 &&
+ 		    (info.new_level == 6 || info.new_level == UnSet) &&
+ 		    info.array.layout >= 16) {
+ 			pr_err("%s has a non-standard layout. If you wish to preserve this\n", devname);
+ 			cont_err("during the reshape, please specify --layout=preserve\n");
+ 			cont_err("If you want to change it, specify a layout or use --layout=normalise\n");
+ 			rv = 1;
+ 			goto release;
+ 		}
+ 	} else if (strcmp(s->layout_str, "normalise") == 0 ||
+ 		   strcmp(s->layout_str, "normalize") == 0) {
+ 		/* If we have a -6 RAID6 layout, remove the '-6'. */
+ 		info.new_layout = UnSet;
+ 		if (info.array.level == 6 && info.new_level == UnSet) {
+ 			char l[40], *h;
+ 			char *cur = map_num(r6layout, info.array.layout);
+
+ 			/* map_num() returns NULL for a layout number it does
+ 			 * not know; never feed that to strcpy().
+ 			 */
+ 			if (cur == NULL || strlen(cur) >= sizeof(l)) {
+ 				pr_err("%s has an unrecognised RAID6 layout (%d)\n",
+ 				       devname, info.array.layout);
+ 				rv = 1;
+ 				goto release;
+ 			}
+ 			strcpy(l, cur);
+ 			h = strrchr(l, '-');
+ 			if (h && strcmp(h, "-6") == 0) {
+ 				*h = 0;
+ 				info.new_layout = map_name(r6layout, l);
+ 			}
+ 		} else {
+ 			pr_err("%s is only meaningful when reshaping a RAID6 array.\n", s->layout_str);
+ 			rv = 1;
+ 			goto release;
+ 		}
+ 	} else if (strcmp(s->layout_str, "preserve") == 0) {
+ 		/* This means that a non-standard RAID6 layout
+ 		 * is OK.
+ 		 * In particular:
+ 		 * - When reshape a RAID6 (e.g. adding a device)
+ 		 * which is in a non-standard layout, it is OK
+ 		 * to preserve that layout.
+ 		 * - When converting a RAID5 to RAID6, leave it in
+ 		 * the XXX-6 layout, don't re-layout.
+ 		 */
+ 		if (info.array.level == 6 && info.new_level == UnSet)
+ 			info.new_layout = info.array.layout;
+ 		else if (info.array.level == 5 && info.new_level == 6) {
+ 			char l[40];
+ 			char *cur = map_num(r5layout, info.array.layout);
+
+ 			/* Need room for the name plus the "-6" suffix */
+ 			if (cur == NULL || strlen(cur) + 2 >= sizeof(l)) {
+ 				pr_err("%s has an unrecognised RAID5 layout (%d)\n",
+ 				       devname, info.array.layout);
+ 				rv = 1;
+ 				goto release;
+ 			}
+ 			strcpy(l, cur);
+ 			strcat(l, "-6");
+ 			info.new_layout = map_name(r6layout, l);
+ 		} else {
+ 			pr_err("%s in only meaningful when reshaping to RAID6\n", s->layout_str);
+ 			rv = 1;
+ 			goto release;
+ 		}
+ 	} else {
+ 		/* An explicit layout name: parse it for the target level */
+ 		int l = info.new_level;
+ 		if (l == UnSet)
+ 			l = info.array.level;
+ 		switch (l) {
+ 		case 5:
+ 			info.new_layout = map_name(r5layout, s->layout_str);
+ 			break;
+ 		case 6:
+ 			info.new_layout = map_name(r6layout, s->layout_str);
+ 			break;
+ 		case 10:
+ 			info.new_layout = parse_layout_10(s->layout_str);
+ 			break;
+ 		case LEVEL_FAULTY:
+ 			info.new_layout = parse_layout_faulty(s->layout_str);
+ 			break;
+ 		default:
+ 			pr_err("layout not meaningful with this level\n");
+ 			rv = 1;
+ 			goto release;
+ 		}
+ 		if (info.new_layout == UnSet) {
+ 			pr_err("layout %s not understood for this level\n",
+ 			       s->layout_str);
+ 			rv = 1;
+ 			goto release;
+ 		}
+ 	}
+
+ 	if (array.level == LEVEL_FAULTY) {
+ 		if (s->level != UnSet && s->level != array.level) {
+ 			pr_err("cannot change level of Faulty device\n");
+ 			rv = 1;
+ 		}
+ 		if (s->chunk) {
+ 			pr_err("cannot set chunksize of Faulty device\n");
+ 			rv = 1;
+ 		}
+ 		if (s->raiddisks && s->raiddisks != 1) {
+ 			pr_err("cannot set raid_disks of Faulty device\n");
+ 			rv = 1;
+ 		}
+ 		if (s->layout_str) {
+ 			if (ioctl(fd, GET_ARRAY_INFO, &array) != 0) {
+ 				dprintf("Cannot get array information.\n");
+ 				/* the layout was not applied: report failure */
+ 				rv = 1;
+ 				goto release;
+ 			}
+ 			array.layout = info.new_layout;
+ 			if (ioctl(fd, SET_ARRAY_INFO, &array) != 0) {
+ 				pr_err("failed to set new layout\n");
+ 				rv = 1;
+ 			} else if (c->verbose >= 0)
+ 				printf("layout for %s set to %d\n",
+ 				       devname, array.layout);
+ 		}
+ 	} else if (array.level == LEVEL_CONTAINER) {
+ 		/* This change is to be applied to every array in the
+ 		 * container. This is only needed when the metadata imposes
+ 		 * restraints of the various arrays in the container.
+ 		 * Currently we only know that IMSM requires all arrays
+ 		 * to have the same number of devices so changing the
+ 		 * number of devices (On-Line Capacity Expansion) must be
+ 		 * performed at the level of the container
+ 		 */
+ 		rv = reshape_container(container, devname, -1, st, &info,
+ 				       c->force, c->backup_file, c->verbose, 0, 0, 0);
+ 		/* no unfreeze at 'release' on this path */
+ 		frozen = 0;
+ 	} else {
+ 		/* get spare devices from external metadata
+ 		 */
+ 		if (st->ss->external) {
+ 			struct mdinfo *info2;
+
+ 			info2 = st->ss->container_content(st, subarray);
+ 			if (info2) {
+ 				info.array.spare_disks =
+ 					info2->array.spare_disks;
+ 				sysfs_free(info2);
+ 			}
+ 		}
+
+ 		/* Impose these changes on a single array. First
+ 		 * check that the metadata is OK with the change. */
+
+ 		if (reshape_super(st, 0, info.new_level,
+ 				  info.new_layout, info.new_chunk,
+ 				  info.array.raid_disks, info.delta_disks,
+ 				  c->backup_file, devname, APPLY_METADATA_CHANGES,
+ 				  c->verbose)) {
+ 			rv = 1;
+ 			goto release;
+ 		}
+ 		sync_metadata(st);
+ 		rv = reshape_array(container, fd, devname, st, &info, c->force,
+ 				   devlist, data_offset, c->backup_file, c->verbose,
+ 				   0, 0, 0);
+ 		/* no unfreeze at 'release' on this path */
+ 		frozen = 0;
+ 	}
+ release:
+ 	sysfs_free(sra);
+ 	if (frozen > 0)
+ 		unfreeze(st);
+ 	return rv;
+ }
+
+ /* verify_reshape_position()
+  * Checks that the reshape position recorded in the metadata
+  * (info->reshape_progress) is not further along than the position
+  * md reports through the "sync_max" sysfs attribute.
+  *
+  * info:  array description; reshape_progress may be advanced to the
+  *        md position
+  * level: RAID level used to turn the per-device sync_max value into an
+  *        array position via get_data_disks()
+  *
+  * Return value:
+  *  0 : no valid sysfs entry (unreadable or not a number).
+  *      This can happen when the reshape has not been started yet (it
+  *      should be started by reshape_array), or for a raid0 array
+  *      before takeover.
+  * -1 : error, the reshape position is obviously wrong (metadata is
+  *      ahead of md), or sync_max was read as empty
+  *  1 : success, reshape progress is correct or has been updated
+  */
+ static int verify_reshape_position(struct mdinfo *info, int level)
+ {
+ 	int ret_val = 0;
+ 	char buf[40];
+ 	int rv;
+
+ 	/* read sync_max, failure can mean raid0 array */
+ 	rv = sysfs_get_str(info, NULL, "sync_max", buf, sizeof(buf));
+
+ 	if (rv > 0) {
+ 		char *ep;
+ 		unsigned long long position = strtoull(buf, &ep, 0);
+
+ 		dprintf("Read sync_max sysfs entry is: %s\n", buf);
+ 		/* Accept only a number that is followed by end of string,
+ 		 * a newline or a space; anything else leaves ret_val at 0.
+ 		 */
+ 		if (!(ep == buf || (*ep != 0 && *ep != '\n' && *ep != ' '))) {
+ 			/* scale the per-device value by the data disk count */
+ 			position *= get_data_disks(level,
+ 						   info->new_layout,
+ 						   info->array.raid_disks);
+ 			if (info->reshape_progress < position) {
+ 				dprintf("Corrected reshape progress (%llu) to md position (%llu)\n",
+ 					info->reshape_progress, position);
+ 				info->reshape_progress = position;
+ 				ret_val = 1;
+ 			} else if (info->reshape_progress > position) {
+ 				pr_err("Fatal error: array reshape was not properly frozen (expected reshape position is %llu, but reshape progress is %llu).\n",
+ 				       position, info->reshape_progress);
+ 				ret_val = -1;
+ 			} else {
+ 				dprintf("Reshape position in md and metadata are the same\n");
+ 				ret_val = 1;
+ 			}
+ 		}
+ 	} else if (rv == 0) {
+ 		/* for valid sysfs entry, 0-length content
+ 		 * should be indicated as error
+ 		 */
+ 		ret_val = -1;
+ 	}
+
+ 	return ret_val;
+ }
+
+ /*
+  * choose_offset() - pick a data offset in [min, max], starting from the
+  * midpoint of lo..hi and nudging it towards coarser alignment.
+  *
+  * Beginning with (lo + hi) / 2, every set bit below 2*1024 is cleared in
+  * turn, lowest first, by either adding or subtracting that bit.  A move
+  * that would leave [min, max] is never taken; when both moves stay in
+  * range, the one leaving more room to its nearer end of lo..hi wins (ties
+  * go downwards).  If neither move fits, the search stops with the current
+  * value.  Units are sectors, so 2*1024 corresponds to 1MB alignment.
+  */
+ static unsigned long long choose_offset(unsigned long long lo,
+ 					unsigned long long hi,
+ 					unsigned long long min,
+ 					unsigned long long max)
+ {
+ 	const unsigned long long align_limit = 2*1024;
+ 	unsigned long long pick = (lo + hi) / 2;
+ 	unsigned long long mask;
+
+ 	for (mask = 1; mask < align_limit; mask <<= 1) {
+ 		unsigned long long up, down;
+ 		int up_fits, down_fits;
+
+ 		/* bit already clear: nothing to align at this step */
+ 		if ((pick & mask) == 0)
+ 			continue;
+
+ 		up = pick + mask;
+ 		down = pick - mask;
+ 		up_fits = !(up > max);
+ 		down_fits = !(down < min);
+
+ 		if (!up_fits && !down_fits)
+ 			break;
+
+ 		if (!up_fits)
+ 			pick = down;
+ 		else if (!down_fits)
+ 			pick = up;
+ 		else
+ 			/* both legal: keep furthest from the boundary */
+ 			pick = (hi - up > down - lo) ? up : down;
+ 	}
+ 	return pick;
+ }
+
+ /*
+  * set_new_data_offset() - choose and write a "new_offset" for every
+  * non-faulty member device ahead of a reshape.
+  *
+  * sra:          sysfs description of the array; sra->devs is walked twice
+  * st:           metadata handler, duplicated to load each member's superblock
+  * devname:      array name, used in messages
+  * delta_disks:  <0 array is shrinking (offset moves up by at least 'min'),
+  *               >0 array is growing (offset moves down by at least 'min'),
+  *               0  direction is taken from data_offset or from whichever
+  *                  side has more free space
+  * data_offset:  explicit offset requested by the user, or INVALID_SECTORS
+  * min:          minimum distance the offset must move
+  * can_fallback: when set, lack of space or metadata support is not
+  *               reported as an error; 1 is returned so the caller can use
+  *               a backup file instead
+  *
+  * Returns 0 when all offsets were set, 1 when the caller should fall back
+  * (backup file, or a kernel without usable "new_offset"), and a negative
+  * value on error.
+  */
+ static int set_new_data_offset(struct mdinfo *sra, struct supertype *st,
+ 			       char *devname, int delta_disks,
+ 			       unsigned long long data_offset,
+ 			       unsigned long long min,
+ 			       int can_fallback)
+ {
+ 	struct mdinfo *sd;
+ 	int dir = 0;
+ 	int err = 0;
+ 	unsigned long long before, after;
+
+ 	/* Need to find min space before and after so same is used
+ 	 * on all devices
+ 	 */
+ 	before = UINT64_MAX;
+ 	after = UINT64_MAX;
+ 	for (sd = sra->devs; sd; sd = sd->next) {
+ 		char *dn;
+ 		int dfd;
+ 		int rv;
+ 		struct supertype *st2;
+ 		struct mdinfo info2;
+
+ 		if (sd->disk.state & (1<<MD_DISK_FAULTY))
+ 			continue;
+ 		dn = map_dev(sd->disk.major, sd->disk.minor, 0);
+ 		dfd = dev_open(dn, O_RDONLY);
+ 		if (dfd < 0) {
+ 			pr_err("%s: cannot open component %s\n",
+ 			       devname, dn ? dn : "-unknown-");
+ 			goto release;
+ 		}
+ 		st2 = dup_super(st);
+ 		rv = st2->ss->load_super(st2,dfd, NULL);
+ 		close(dfd);
+ 		if (rv) {
+ 			free(st2);
+ 			pr_err("%s: cannot get superblock from %s\n",
+ 			       devname, dn);
+ 			goto release;
+ 		}
+ 		st2->ss->getinfo_super(st2, &info2, NULL);
+ 		st2->ss->free_super(st2);
+ 		free(st2);
+ 		if (info2.space_before == 0 &&
+ 		    info2.space_after == 0) {
+ 			/* Metadata doesn't support data_offset changes */
+ 			if (!can_fallback)
+ 				pr_err("%s: Metadata version doesn't support data_offset changes\n",
+ 				       devname);
+ 			goto fallback;
+ 		}
+ 		if (before > info2.space_before)
+ 			before = info2.space_before;
+ 		if (after > info2.space_after)
+ 			after = info2.space_after;
+
+ 		if (data_offset != INVALID_SECTORS) {
+ 			/* The first device fixes the direction; every later
+ 			 * device must need a move the same way.
+ 			 */
+ 			if (dir == 0) {
+ 				if (info2.data_offset == data_offset) {
+ 					pr_err("%s: already has that data_offset\n",
+ 					       dn);
+ 					goto release;
+ 				}
+ 				if (data_offset < info2.data_offset)
+ 					dir = -1;
+ 				else
+ 					dir = 1;
+ 			} else if ((data_offset <= info2.data_offset && dir == 1) ||
+ 				   (data_offset >= info2.data_offset && dir == -1)) {
+ 				pr_err("%s: differing data offsets on devices make this --data-offset setting impossible\n",
+ 				       dn);
+ 				goto release;
+ 			}
+ 		}
+ 	}
+ 	if (before == UINT64_MAX)
+ 		/* impossible really, there must be no devices */
+ 		return 1;
+
+ 	for (sd = sra->devs; sd; sd = sd->next) {
+ 		char *dn = map_dev(sd->disk.major, sd->disk.minor, 0);
+ 		unsigned long long new_data_offset;
+
+ 		if (sd->disk.state & (1<<MD_DISK_FAULTY))
+ 			continue;
+ 		if (delta_disks < 0) {
+ 			/* Don't need any space as array is shrinking
+ 			 * just move data_offset up by min
+ 			 */
+ 			if (data_offset == INVALID_SECTORS)
+ 				new_data_offset = sd->data_offset + min;
+ 			else {
+ 				if (data_offset < sd->data_offset + min) {
+ 					pr_err("--data-offset too small for %s\n",
+ 					       dn);
+ 					goto release;
+ 				}
+ 				new_data_offset = data_offset;
+ 			}
+ 		} else if (delta_disks > 0) {
+ 			/* need space before */
+ 			if (before < min) {
+ 				if (can_fallback)
+ 					goto fallback;
+ 				pr_err("Insufficient head-space for reshape on %s\n",
+ 				       dn);
+ 				goto release;
+ 			}
+ 			if (data_offset == INVALID_SECTORS)
+ 				new_data_offset = sd->data_offset - min;
+ 			else {
+ 				if (data_offset > sd->data_offset - min) {
+ 					pr_err("--data-offset too large for %s\n",
+ 					       dn);
+ 					goto release;
+ 				}
+ 				new_data_offset = data_offset;
+ 			}
+ 		} else {
+ 			if (dir == 0) {
+ 				/* can move up or down. If 'data_offset'
+ 				 * was set we would have already decided,
+ 				 * so just choose direction with most space.
+ 				 */
+ 				if (before > after)
+ 					dir = -1;
+ 				else
+ 					dir = 1;
+ 			}
+ 			sysfs_set_str(sra, NULL, "reshape_direction",
+ 				      dir == 1 ? "backwards" : "forwards");
+ 			if (dir > 0) {
+ 				/* Increase data offset */
+ 				if (after < min) {
+ 					if (can_fallback)
+ 						goto fallback;
+ 					pr_err("Insufficient tail-space for reshape on %s\n",
+ 					       dn);
+ 					goto release;
+ 				}
+ 				if (data_offset != INVALID_SECTORS &&
+ 				    data_offset < sd->data_offset + min) {
+ 					pr_err("--data-offset too small on %s\n",
+ 					       dn);
+ 					goto release;
+ 				}
+ 				if (data_offset != INVALID_SECTORS)
+ 					new_data_offset = data_offset;
+ 				else
+ 					new_data_offset = choose_offset(sd->data_offset,
+ 									sd->data_offset + after,
+ 									sd->data_offset + min,
+ 									sd->data_offset + after);
+ 			} else {
+ 				/* Decrease data offset */
+ 				if (before < min) {
+ 					if (can_fallback)
+ 						goto fallback;
+ 					pr_err("insufficient head-room on %s\n",
+ 					       dn);
+ 					goto release;
+ 				}
+ 				/* Moving down, the offset must drop by at
+ 				 * least 'min', so a requested value above
+ 				 * (current - min) is too large.  This mirrors
+ 				 * the delta_disks > 0 case above.
+ 				 */
+ 				if (data_offset != INVALID_SECTORS &&
+ 				    data_offset > sd->data_offset - min) {
+ 					pr_err("--data-offset too large on %s\n",
+ 					       dn);
+ 					goto release;
+ 				}
+ 				if (data_offset != INVALID_SECTORS)
+ 					new_data_offset = data_offset;
+ 				else
+ 					new_data_offset = choose_offset(sd->data_offset - before,
+ 									sd->data_offset,
+ 									sd->data_offset - before,
+ 									sd->data_offset - min);
+ 			}
+ 		}
+ 		err = sysfs_set_num(sra, sd, "new_offset", new_data_offset);
+ 		if (err < 0 && errno == E2BIG) {
+ 			/* try again after increasing data size to max */
+ 			err = sysfs_set_num(sra, sd, "size", 0);
+ 			if (err < 0 && errno == EINVAL &&
+ 			    !(sd->disk.state & (1<<MD_DISK_SYNC))) {
+ 				/* some kernels have a bug where you cannot
+ 				 * use '0' on spare devices. */
+ 				sysfs_set_num(sra, sd, "size",
+ 					      (sra->component_size + after)/2);
+ 			}
+ 			err = sysfs_set_num(sra, sd, "new_offset",
+ 					    new_data_offset);
+ 		}
+ 		if (err < 0) {
+ 			if (errno == E2BIG && data_offset != INVALID_SECTORS) {
+ 				pr_err("data-offset is too big for %s\n",
+ 				       dn);
+ 				goto release;
+ 			}
+ 			if (sd == sra->devs &&
+ 			    (errno == ENOENT || errno == E2BIG))
+ 				/* Early kernel, no 'new_offset' file,
+ 				 * or kernel doesn't like us.
+ 				 * For RAID5/6 this is not fatal
+ 				 */
+ 				return 1;
+ 			pr_err("Cannot set new_offset for %s\n",
+ 			       dn);
+ 			break;
+ 		}
+ 	}
+ 	return err;
+ release:
+ 	return -1;
+ fallback:
+ 	/* Just use a backup file */
+ 	return 1;
+ }
+
+/*
+ * Start a RAID10 reshape (raid_disks, layout, chunk size and/or data_offset).
+ *
+ * 'container', 'force' and 'verbose' are accepted but not used here.
+ * Returns 0 once the reshape has been requested, 1 on any failure.
+ */
+static int raid10_reshape(char *container, int fd, char *devname,
+			  struct supertype *st, struct mdinfo *info,
+			  struct reshape *reshape,
+			  unsigned long long data_offset,
+			  int force, int verbose)
+{
+	/* A RAID10 reshape always moves data_offset, by at least
+	 * ->min_offset_change (the largest of the old and new chunk sizes):
+	 *  - raid_disks increasing: data_offset must decrease by at least
+	 *    that amount.
+	 *  - raid_disks unchanged: data_offset may move either way by at
+	 *    least that amount, preferably much more; half of the available
+	 *    space is chosen.
+	 *  - raid_disks decreasing: data_offset must increase by at least
+	 *    that amount, and component_size must shrink by the same amount
+	 *    to make room.
+	 *
+	 * Sequence: work out the minimum and direction, shrink
+	 * component_size if needed, set new_data_offset on every device,
+	 * then set chunk_size, layout and raid_disks and finally ask for
+	 * 'reshape'.
+	 */
+	struct mdinfo *sra;
+	unsigned long long min_change;
+	int result = 1;
+	int err = 0;
+
+	sra = sysfs_read(fd, NULL,
+			 GET_COMPONENT|GET_DEVS|GET_OFFSET|GET_STATE|GET_CHUNK
+		);
+	if (sra == NULL) {
+		pr_err("%s: Cannot get array details from sysfs\n",
+		       devname);
+		goto out;
+	}
+	min_change = reshape->min_offset_change;
+
+	/* Direction only needs to be stated when the disk count changes */
+	if (info->delta_disks < 0)
+		sysfs_set_str(sra, NULL, "reshape_direction", "backwards");
+	else if (info->delta_disks > 0)
+		sysfs_set_str(sra, NULL, "reshape_direction", "forwards");
+
+	if (info->delta_disks < 0 && info->space_after < min_change) {
+		/* Not enough room after the data: shrink the component */
+		if (sysfs_set_num(sra, NULL, "component_size",
+				  (sra->component_size -
+				   min_change)/2)) {
+			pr_err("cannot reduce component size\n");
+			goto out;
+		}
+	}
+
+	err = set_new_data_offset(sra, st, devname, info->delta_disks,
+				  data_offset, min_change, 0);
+	if (err == 1) {
+		/* '1' means "fall back to a backup file", which RAID10
+		 * cannot do, so treat it as a hard error.
+		 */
+		pr_err("Cannot set new_data_offset: RAID10 reshape not\n");
+		cont_err("supported on this kernel\n");
+		err = -1;
+	}
+	if (err < 0)
+		goto out;
+
+	/* Apply the new shape one attribute at a time, stopping at the
+	 * first one that reports a non-zero errno.
+	 */
+	if (!err &&
+	    sysfs_set_num(sra, NULL, "chunk_size", info->new_chunk) < 0)
+		err = errno;
+	if (!err &&
+	    sysfs_set_num(sra, NULL, "layout", reshape->after.layout) < 0)
+		err = errno;
+	if (!err &&
+	    sysfs_set_num(sra, NULL, "raid_disks",
+			  info->array.raid_disks + info->delta_disks) < 0)
+		err = errno;
+	if (!err &&
+	    sysfs_set_str(sra, NULL, "sync_action", "reshape") < 0)
+		err = errno;
+
+	if (err) {
+		pr_err("Cannot set array shape for %s\n",
+		       devname);
+		if (err == EBUSY &&
+		    (info->array.state & (1<<MD_SB_BITMAP_PRESENT)))
+			cont_err(" Bitmap must be removed before shape can be changed\n");
+		goto out;
+	}
+	result = 0;
+out:
+	sysfs_free(sra);
+	return result;
+}
+
+/*
+ * Fill info->space_before / info->space_after with the smallest values
+ * reported by the metadata of the array's non-faulty member devices.
+ * 'info' is left untouched if sysfs cannot be read, if any device cannot
+ * be opened or loaded, or if no usable device was found.
+ */
+static void get_space_after(int fd, struct supertype *st, struct mdinfo *info)
+{
+	struct mdinfo *sra, *sd;
+	/* Initialisation to silence compiler warning */
+	unsigned long long lowest_before = 0, lowest_after = 0;
+	int have_value = 0;	/* at least one device contributed */
+	int all_examined;	/* loop ran to the end of the list */
+
+	sra = sysfs_read(fd, NULL, GET_DEVS);
+	if (sra == NULL)
+		return;
+
+	for (sd = sra->devs; sd != NULL; sd = sd->next) {
+		struct mdinfo member_info;
+		struct supertype *member_st;
+		char *path;
+		int member_fd;
+
+		if (sd->disk.state & (1<<MD_DISK_FAULTY))
+			continue;
+
+		path = map_dev(sd->disk.major, sd->disk.minor, 0);
+		member_fd = dev_open(path, O_RDONLY);
+		if (member_fd < 0)
+			break;
+
+		member_st = dup_super(st);
+		if (member_st->ss->load_super(member_st, member_fd, NULL)) {
+			close(member_fd);
+			free(member_st);
+			break;
+		}
+		close(member_fd);
+		member_st->ss->getinfo_super(member_st, &member_info, NULL);
+		member_st->ss->free_super(member_st);
+		free(member_st);
+
+		if (!have_value) {
+			/* first usable device seeds both minima */
+			lowest_before = member_info.space_before;
+			lowest_after = member_info.space_after;
+			have_value = 1;
+		} else {
+			if (member_info.space_before < lowest_before)
+				lowest_before = member_info.space_before;
+			if (member_info.space_after < lowest_after)
+				lowest_after = member_info.space_after;
+		}
+	}
+
+	/* A 'break' above leaves sd pointing at the device that failed */
+	all_examined = (sd == NULL);
+	if (all_examined && have_value) {
+		info->space_after = lowest_after;
+		info->space_before = lowest_before;
+	}
+	sysfs_free(sra);
+}
+
+/*
+ * Raise "stripe_cache_size" if the current value (sra->cache_size) is too
+ * small for the reshape; never lowers it.
+ * 'blocks' is in sectors and is spread over 'disks' devices.
+ */
+static void update_cache_size(char *container, struct mdinfo *sra,
+			      struct mdinfo *info,
+			      int disks, unsigned long long blocks)
+{
+	/* The internal stripe cache must hold at least 4 stripes of the
+	 * larger chunk size, or the reshape won't work.
+	 */
+	unsigned long long per_disk;
+	unsigned long wanted;
+
+	wanted = max(info->array.chunk_size, info->new_chunk);
+	wanted = wanted * 4;	/* 4 stripes minimum */
+	wanted = wanted / 512;	/* convert to sectors */
+
+	/* make sure there is room for 'blocks' with a bit to spare */
+	per_disk = 16 + blocks / disks;
+	if (wanted < per_disk)
+		wanted = per_disk;
+
+	wanted = wanted / (4096/512);	/* Convert from sectors to pages */
+
+	if (sra->cache_size < wanted)
+		subarray_set_num(container, sra, "stripe_cache_size",
+				 wanted+1);
+}
+
+/*
+ * Tell the kernel about the new array shape (chunk size, layout, number
+ * of devices) and record the starting reshape position in 'sra'.
+ *
+ *  sra       - sysfs view of the array; ->new_chunk and ->reshape_progress
+ *              are updated here
+ *  info      - requested change and, on restart, the recorded progress
+ *  st        - metadata handler (only ->ss->external is consulted)
+ *  fd        - open descriptor on the array device
+ *  restart   - non-zero when continuing an earlier reshape; the shape is
+ *              then not re-imposed
+ *  devname   - name used in messages
+ *  container - passed through to subarray_set_num()
+ *  reshape   - before/after geometry
+ *
+ * Returns 0 on success, -1 on failure.
+ */
+static int impose_reshape(struct mdinfo *sra,
+			  struct mdinfo *info,
+			  struct supertype *st,
+			  int fd,
+			  int restart,
+			  char *devname, char *container,
+			  struct reshape *reshape)
+{
+	struct mdu_array_info_s array;
+	int have_array;
+
+	sra->new_chunk = info->new_chunk;
+
+	if (restart) {
+		/* for external metadata checkpoint saved by mdmon can be lost
+		 * or missed /due to e.g. crash/. Check if md is not during
+		 * restart farther than metadata points to.
+		 * If so, this means metadata information is obsolete.
+		 */
+		if (st->ss->external)
+			verify_reshape_position(info, reshape->level);
+		sra->reshape_progress = info->reshape_progress;
+	} else {
+		sra->reshape_progress = 0;
+		if (reshape->after.data_disks < reshape->before.data_disks)
+			/* start from the end of the new array */
+			sra->reshape_progress = (sra->component_size
+						 * reshape->after.data_disks);
+	}
+
+	/* Remember whether 'array' was actually filled in: if the ioctl
+	 * fails its contents are indeterminate and must neither be written
+	 * back with SET_ARRAY_INFO nor inspected for the bitmap hint.
+	 */
+	have_array = (ioctl(fd, GET_ARRAY_INFO, &array) == 0);
+
+	if (info->array.chunk_size == info->new_chunk &&
+	    reshape->before.layout == reshape->after.layout &&
+	    st->ss->external == 0) {
+		/* use SET_ARRAY_INFO but only if reshape hasn't started */
+		if (!restart) {
+			if (!have_array) {
+				pr_err("Cannot get array information for %s\n",
+				       devname);
+				goto release;
+			}
+			array.raid_disks = reshape->after.data_disks +
+				reshape->parity;
+			if (ioctl(fd, SET_ARRAY_INFO, &array) != 0) {
+				int err = errno;
+
+				pr_err("Cannot set device shape for %s: %s\n",
+				       devname, strerror(err));
+
+				if (err == EBUSY &&
+				    (array.state & (1<<MD_SB_BITMAP_PRESENT)))
+					cont_err("Bitmap must be removed before shape can be changed\n");
+
+				goto release;
+			}
+		}
+	} else if (!restart) {
+		/* set them all just in case some old 'new_*' value
+		 * persists from some earlier problem.
+		 */
+		int err = 0;
+		if (sysfs_set_num(sra, NULL, "chunk_size", info->new_chunk) < 0)
+			err = errno;
+		if (!err && sysfs_set_num(sra, NULL, "layout",
+					  reshape->after.layout) < 0)
+			err = errno;
+		if (!err && subarray_set_num(container, sra, "raid_disks",
+					     reshape->after.data_disks +
+					     reshape->parity) < 0)
+			err = errno;
+		if (err) {
+			pr_err("Cannot set device shape for %s\n",
+			       devname);
+
+			if (err == EBUSY && have_array &&
+			    (array.state & (1<<MD_SB_BITMAP_PRESENT)))
+				cont_err("Bitmap must be removed before shape can be changed\n");
+			goto release;
+		}
+	}
+	return 0;
+release:
+	return -1;
+}
+
+/*
+ * Change the RAID level of the array open on 'fd' to 'level' by writing
+ * the mapped personality name to the sysfs "level" attribute.
+ *
+ * When converting RAID4/5/6 to RAID0 the non-data devices are first
+ * removed (spares) or failed and removed (parity); this is only allowed
+ * from the parity-last layouts (ALGORITHM_PARITY_N / ALGORITHM_PARITY_N_6).
+ *
+ * Returns 0 on success (or if 'level' has no name in 'pers'), -1 if the
+ * array information cannot be read or the layout does not allow the
+ * conversion, or the errno from the failed sysfs write.
+ */
+static int impose_level(int fd, int level, char *devname, int verbose)
+{
+	char *c;
+	struct mdu_array_info_s array;
+	struct mdinfo info;
+	sysfs_init(&info, fd, NULL);
+
+	/* Everything below is driven by array.level / raid_disks / layout,
+	 * so do not continue with an uninitialised structure: a garbage
+	 * level could send us into the fail-and-remove loops.
+	 */
+	if (ioctl(fd, GET_ARRAY_INFO, &array) != 0) {
+		pr_err("%s: Cannot get array information.\n", devname);
+		return -1;
+	}
+	if (level == 0 &&
+	    (array.level >= 4 && array.level <= 6)) {
+		/* To convert to RAID0 we need to fail and
+		 * remove any non-data devices. */
+		int found = 0;
+		int d;
+		int data_disks = array.raid_disks - 1;
+		if (array.level == 6)
+			data_disks -= 1;
+		if (array.level == 5 &&
+		    array.layout != ALGORITHM_PARITY_N)
+			return -1;
+		if (array.level == 6 &&
+		    array.layout != ALGORITHM_PARITY_N_6)
+			return -1;
+		sysfs_set_str(&info, NULL,"sync_action", "idle");
+		/* First remove any spares so no recovery starts */
+		for (d = 0, found = 0;
+		     d < MAX_DISKS && found < array.nr_disks;
+		     d++) {
+			mdu_disk_info_t disk;
+			disk.number = d;
+			if (ioctl(fd, GET_DISK_INFO, &disk) < 0)
+				continue;
+			if (disk.major == 0 && disk.minor == 0)
+				continue;
+			found++;
+			if ((disk.state & (1 << MD_DISK_ACTIVE))
+			    && disk.raid_disk < data_disks)
+				/* keep this */
+				continue;
+			ioctl(fd, HOT_REMOVE_DISK,
+			      makedev(disk.major, disk.minor));
+		}
+		/* Now fail anything left */
+		/* Best effort refresh of nr_disks: on failure 'array' keeps
+		 * the values read at the top of the function.
+		 */
+		ioctl(fd, GET_ARRAY_INFO, &array);
+		for (d = 0, found = 0;
+		     d < MAX_DISKS && found < array.nr_disks;
+		     d++) {
+			int cnt;
+			mdu_disk_info_t disk;
+			disk.number = d;
+			if (ioctl(fd, GET_DISK_INFO, &disk) < 0)
+				continue;
+			if (disk.major == 0 && disk.minor == 0)
+				continue;
+			found++;
+			if ((disk.state & (1 << MD_DISK_ACTIVE))
+			    && disk.raid_disk < data_disks)
+				/* keep this */
+				continue;
+			ioctl(fd, SET_DISK_FAULTY,
+			      makedev(disk.major, disk.minor));
+			/* Removal can report EBUSY right after the device
+			 * was failed, so retry a few times with 10ms pauses.
+			 */
+			cnt = 5;
+			while (ioctl(fd, HOT_REMOVE_DISK,
+				     makedev(disk.major, disk.minor)) < 0
+			       && errno == EBUSY
+			       && cnt--) {
+				usleep(10000);
+			}
+		}
+	}
+	c = map_num(pers, level);
+	if (c) {
+		int err = sysfs_set_str(&info, NULL, "level", c);
+		if (err) {
+			err = errno;
+			pr_err("%s: could not set level to %s\n",
+			       devname, c);
+			if (err == EBUSY &&
+			    (array.state & (1<<MD_SB_BITMAP_PRESENT)))
+				cont_err("Bitmap must be removed before level can be changed\n");
+			return err;
+		}
+		if (verbose >= 0)
+			pr_err("level of %s changed to %s\n",
+			       devname, c);
+	}
+	return 0;
+}
+
+int sigterm = 0; /* flag: becomes 1 once catch_term() has run */
+static void catch_term(int sig) /* handler installed for SIGTERM by reshape_array() */
+{
+ sigterm = 1; /* only records the event; 'sig' is not examined */
+}
+
+/*
+ * Ask systemd to start "mdadm-grow-continue@<devnm>.service" so that the
+ * reshape is monitored by a service instead of a background child.
+ *
+ * Returns 1 if systemctl ran and exited with status 0, otherwise 0
+ * (fork failed, systemctl missing, or it reported failure), in which
+ * case the caller is expected to monitor the reshape itself.
+ */
+static int continue_via_systemd(char *devnm)
+{
+	int skipped, i, status;
+	pid_t pid;
+	char pathbuf[1024];
+	/* In a systemd/udev world, it is best to get systemd to
+	 * run "mdadm --grow --continue" rather than running in the
+	 * background.
+	 */
+	pid = fork();
+	switch (pid) {
+	case  0:
+		/* FIXME yuk. CLOSE_EXEC?? */
+		/* Close inherited descriptors from 3 upwards, stopping
+		 * after 20 consecutive descriptors that were not open.
+		 */
+		skipped = 0;
+		for (i = 3; skipped < 20; i++)
+			if (close(i) < 0)
+				skipped++;
+			else
+				skipped = 0;
+
+		/* Don't want to see error messages from
+		 * systemctl.  If the service doesn't exist,
+		 * we fork ourselves.
+		 */
+		close(2);
+		open("/dev/null", O_WRONLY);
+		snprintf(pathbuf, sizeof(pathbuf), "mdadm-grow-continue@%s.service",
+			 devnm);
+		/* execl() only returns on failure, so simply try the
+		 * second location if the first is missing.  The list
+		 * terminator of a variadic call must be a real pointer.
+		 */
+		execl("/usr/bin/systemctl", "systemctl",
+		      "start",
+		      pathbuf, (char *)NULL);
+		execl("/bin/systemctl", "systemctl", "start",
+		      pathbuf, (char *)NULL);
+		exit(1);
+	case -1: /* Just do it ourselves. */
+		break;
+	default: /* parent - good */
+		/* Wait for the systemctl child specifically, so that the
+		 * exit status of some unrelated child is never mistaken
+		 * for it.
+		 */
+		if (waitpid(pid, &status, 0) >= 0 && status == 0)
+			return 1;
+	}
+	return 0;
+}
+
+/*
+ * Perform (or continue) the reshape of a single array.
+ *
+ *  container      - container name for external metadata, else NULL
+ *  fd             - open descriptor on the array; closed here only in a
+ *                   forked child
+ *  devname        - array name used in messages
+ *  st             - metadata handler
+ *  info           - current array state and the requested changes
+ *  force          - skip the "would be degraded" spare check
+ *  devlist        - extra devices to add before reshaping
+ *  data_offset    - requested new data offset, or INVALID_SECTORS
+ *  backup_file    - backup file for the critical section, or NULL
+ *  verbose        - message verbosity
+ *  forked         - non-zero if the caller already runs in a background
+ *                   process: no further fork is done, the 'out' path
+ *                   returns instead of exiting, and the 'release' path
+ *                   does not unfreeze
+ *  restart        - non-zero to continue a reshape that already started
+ *  freeze_reshape - start the reshape but leave it frozen and return 1
+ *
+ * Returns 0 when the reshape was started or handed over, 1 on failure
+ * (or when freeze_reshape is set).  The background child that monitors
+ * the reshape normally leaves through exit(0).
+ */
+static int reshape_array(char *container, int fd, char *devname,
+			 struct supertype *st, struct mdinfo *info,
+			 int force, struct mddev_dev *devlist,
+			 unsigned long long data_offset,
+			 char *backup_file, int verbose, int forked,
+			 int restart, int freeze_reshape)
+{
+	struct reshape reshape;
+	int spares_needed;
+	char *msg;
+	int orig_level = UnSet;
+	int odisks;
+	int delayed;
+
+	struct mdu_array_info_s array;
+	char *c;
+
+	struct mddev_dev *dv;
+	int added_disks;
+
+	int *fdlist = NULL;
+	unsigned long long *offsets = NULL;
+	int d;
+	int nrdisks;
+	int err;
+	unsigned long blocks;
+	unsigned long long array_size;
+	int done;
+	struct mdinfo *sra = NULL;
+	char buf[20];
+
+	/* when reshaping a RAID0, the component_size might be zero.
+	 * So try to fix that up.
+	 */
+	if (ioctl(fd, GET_ARRAY_INFO, &array) != 0) {
+		dprintf("Cannot get array information.\n");
+		goto release;
+	}
+	if (array.level == 0 && info->component_size == 0) {
+		get_dev_size(fd, NULL, &array_size);
+		info->component_size = array_size / array.raid_disks;
+	}
+
+	if (array.level == 10)
+		/* Need space_after info */
+		get_space_after(fd, st, info);
+
+	if (info->reshape_active) {
+		int new_level = info->new_level;
+		info->new_level = UnSet;
+		if (info->delta_disks > 0)
+			info->array.raid_disks -= info->delta_disks;
+		msg = analyse_change(devname, info, &reshape);
+		info->new_level = new_level;
+		if (info->delta_disks > 0)
+			info->array.raid_disks += info->delta_disks;
+		if (!restart)
+			/* Make sure the array isn't read-only */
+			ioctl(fd, RESTART_ARRAY_RW, 0);
+	} else
+		msg = analyse_change(devname, info, &reshape);
+	if (msg) {
+		/* if msg == "", error has already been printed */
+		if (msg[0])
+			pr_err("%s\n", msg);
+		goto release;
+	}
+	if (restart &&
+	    (reshape.level != info->array.level ||
+	     reshape.before.layout != info->array.layout ||
+	     reshape.before.data_disks + reshape.parity
+	     != info->array.raid_disks - max(0, info->delta_disks))) {
+		pr_err("reshape info is not in native format - cannot continue.\n");
+		goto release;
+	}
+
+	if (st->ss->external && restart && (info->reshape_progress == 0) &&
+	    !((sysfs_get_str(info, NULL, "sync_action", buf, sizeof(buf)) > 0) &&
+	      (strncmp(buf, "reshape", 7) == 0))) {
+		/* When reshape is restarted from '0', very begin of array
+		 * it is possible that for external metadata reshape and array
+		 * configuration doesn't happen.
+		 * Check if md has the same opinion, and reshape is restarted
+		 * from 0. If so, this is regular reshape start after reshape
+		 * switch in metadata to next array only.
+		 */
+		if ((verify_reshape_position(info, reshape.level) >= 0) &&
+		    (info->reshape_progress == 0))
+			restart = 0;
+	}
+	if (restart) {
+		/* reshape already started. just skip to monitoring the reshape */
+		if (reshape.backup_blocks == 0)
+			return 0;
+		if (restart & RESHAPE_NO_BACKUP)
+			return 0;
+
+		/* Need 'sra' down at 'started:' */
+		sra = sysfs_read(fd, NULL,
+				 GET_COMPONENT|GET_DEVS|GET_OFFSET|GET_STATE|GET_CHUNK|
+				 GET_CACHE);
+		if (!sra) {
+			pr_err("%s: Cannot get array details from sysfs\n",
+			       devname);
+			goto release;
+		}
+
+		if (!backup_file)
+			backup_file = locate_backup(sra->sys_name);
+
+		goto started;
+	}
+	/* The container is frozen but the array may not be.
+	 * So freeze the array so spares don't get put to the wrong use
+	 * FIXME there should probably be a cleaner separation between
+	 * freeze_array and freeze_container.
+	 */
+	sysfs_freeze_array(info);
+	/* Check we have enough spares to not be degraded */
+	added_disks = 0;
+	for (dv = devlist; dv ; dv=dv->next)
+		added_disks++;
+	spares_needed = max(reshape.before.data_disks,
+			    reshape.after.data_disks)
+		+ reshape.parity - array.raid_disks;
+
+	if (!force &&
+	    info->new_level > 1 && info->array.level > 1 &&
+	    spares_needed > info->array.spare_disks + added_disks) {
+		pr_err("Need %d spare%s to avoid degraded array, and only have %d.\n"
+		       " Use --force to over-ride this check.\n",
+		       spares_needed,
+		       spares_needed == 1 ? "" : "s",
+		       info->array.spare_disks + added_disks);
+		goto release;
+	}
+	/* Check we have enough spares to not fail */
+	spares_needed = max(reshape.before.data_disks,
+			    reshape.after.data_disks)
+		- array.raid_disks;
+	if ((info->new_level > 1 || info->new_level == 0) &&
+	    spares_needed > info->array.spare_disks +added_disks) {
+		pr_err("Need %d spare%s to create working array, and only have %d.\n",
+		       spares_needed,
+		       spares_needed == 1 ? "" : "s",
+		       info->array.spare_disks + added_disks);
+		goto release;
+	}
+
+	if (reshape.level != array.level) {
+		int err = impose_level(fd, reshape.level, devname, verbose);
+		if (err)
+			goto release;
+		info->new_layout = UnSet; /* after level change,
+					   * layout is meaningless */
+		orig_level = array.level;
+		sysfs_freeze_array(info);
+
+		if (reshape.level > 0 && st->ss->external) {
+			/* make sure mdmon is aware of the new level */
+			if (mdmon_running(container))
+				flush_mdmon(container);
+
+			if (!mdmon_running(container))
+				start_mdmon(container);
+			ping_monitor(container);
+			if (mdmon_running(container) &&
+			    st->update_tail == NULL)
+				st->update_tail = &st->updates;
+		}
+	}
+	/* ->reshape_super might have chosen some spares from the
+	 * container that it wants to be part of the new array.
+	 * We can collect them with ->container_content and give
+	 * them to the kernel.
+	 */
+	if (st->ss->reshape_super && st->ss->container_content) {
+		char *subarray = strchr(info->text_version+1, '/')+1;
+		struct mdinfo *info2 =
+			st->ss->container_content(st, subarray);
+		struct mdinfo *d;
+
+		if (info2) {
+			sysfs_init(info2, fd, st->devnm);
+			/* When increasing number of devices, we need to set
+			 * new raid_disks before adding these, or they might
+			 * be rejected.
+			 */
+			if (reshape.backup_blocks &&
+			    reshape.after.data_disks > reshape.before.data_disks)
+				subarray_set_num(container, info2, "raid_disks",
+						 reshape.after.data_disks +
+						 reshape.parity);
+			for (d = info2->devs; d; d = d->next) {
+				if (d->disk.state == 0 &&
+				    d->disk.raid_disk >= 0) {
+					/* This is a spare that wants to
+					 * be part of the array.
+					 */
+					add_disk(fd, st, info2, d);
+				}
+			}
+			sysfs_free(info2);
+		}
+	}
+	/* We might have been given some devices to add to the
+	 * array.  Now that the array has been changed to the right
+	 * level and frozen, we can safely add them.
+	 */
+	if (devlist)
+		Manage_subdevs(devname, fd, devlist, verbose,
+			       0,NULL, 0);
+
+	if (reshape.backup_blocks == 0 && data_offset != INVALID_SECTORS)
+		reshape.backup_blocks = reshape.before.data_disks * info->array.chunk_size/512;
+	if (reshape.backup_blocks == 0) {
+		/* No restriping needed, but we might need to impose
+		 * some more changes: layout, raid_disks, chunk_size
+		 */
+		/* read current array info */
+		if (ioctl(fd, GET_ARRAY_INFO, &array) != 0) {
+			dprintf("Cannot get array information.\n");
+			goto release;
+		}
+		/* compare current array info with new values and if
+		 * it is different update them to new */
+		if (info->new_layout != UnSet &&
+		    info->new_layout != array.layout) {
+			array.layout = info->new_layout;
+			if (ioctl(fd, SET_ARRAY_INFO, &array) != 0) {
+				pr_err("failed to set new layout\n");
+				goto release;
+			} else if (verbose >= 0)
+				printf("layout for %s set to %d\n",
+				       devname, array.layout);
+		}
+		if (info->delta_disks != UnSet &&
+		    info->delta_disks != 0 &&
+		    array.raid_disks != (info->array.raid_disks + info->delta_disks)) {
+			array.raid_disks += info->delta_disks;
+			if (ioctl(fd, SET_ARRAY_INFO, &array) != 0) {
+				pr_err("failed to set raid disks\n");
+				goto release;
+			} else if (verbose >= 0) {
+				printf("raid_disks for %s set to %d\n",
+				       devname, array.raid_disks);
+			}
+		}
+		if (info->new_chunk != 0 &&
+		    info->new_chunk != array.chunk_size) {
+			if (sysfs_set_num(info, NULL,
+					  "chunk_size", info->new_chunk) != 0) {
+				pr_err("failed to set chunk size\n");
+				goto release;
+			} else if (verbose >= 0)
+				/* 'array' was read before the change, so
+				 * report the value that was just written
+				 * rather than the old array.chunk_size.
+				 */
+				printf("chunk size for %s set to %d\n",
+				       devname, info->new_chunk);
+		}
+		unfreeze(st);
+		return 0;
+	}
+
+	/*
+	 * There are three possibilities.
+	 * 1/ The array will shrink.
+	 *    We need to ensure the reshape will pause before reaching
+	 *    the 'critical section'.  We also need to fork and wait for
+	 *    that to happen.  When it does we
+	 *       suspend/backup/complete/unfreeze
+	 *
+	 * 2/ The array will not change size.
+	 *    This requires that we keep a backup of a sliding window
+	 *    so that we can restore data after a crash.  So we need
+	 *    to fork and monitor progress.
+	 *    In future we will allow the data_offset to change, so
+	 *    a sliding backup becomes unnecessary.
+	 *
+	 * 3/ The array will grow. This is relatively easy.
+	 *    However the kernel's restripe routines will cheerfully
+	 *    overwrite some early data before it is safe.  So we
+	 *    need to make a backup of the early parts of the array
+	 *    and be ready to restore it if rebuild aborts very early.
+	 *    For externally managed metadata, we still need a forked
+	 *    child to monitor the reshape and suspend IO over the region
+	 *    that is being reshaped.
+	 *
+	 *    We backup data by writing it to one spare, or to a
+	 *    file which was given on command line.
+	 *
+	 * In each case, we first make sure that storage is available
+	 * for the required backup.
+	 * Then we:
+	 *   -  request the shape change.
+	 *   -  fork to handle backup etc.
+	 */
+	/* Check that we can hold all the data */
+	get_dev_size(fd, NULL, &array_size);
+	if (reshape.new_size < (array_size/512)) {
+		pr_err("this change will reduce the size of the array.\n"
+		       " use --grow --array-size first to truncate array.\n"
+		       " e.g. mdadm --grow %s --array-size %llu\n",
+		       devname, reshape.new_size/2);
+		goto release;
+	}
+
+	if (array.level == 10) {
+		/* Reshaping RAID10 does not require any data backup by
+		 * user-space.  Instead it requires that the data_offset
+		 * is changed to avoid the need for backup.
+		 * So this is handled very separately
+		 */
+		if (restart)
+			/* Nothing to do. */
+			return 0;
+		return raid10_reshape(container, fd, devname, st, info,
+				      &reshape, data_offset,
+				      force, verbose);
+	}
+	sra = sysfs_read(fd, NULL,
+			 GET_COMPONENT|GET_DEVS|GET_OFFSET|GET_STATE|GET_CHUNK|
+			 GET_CACHE);
+	if (!sra) {
+		pr_err("%s: Cannot get array details from sysfs\n",
+		       devname);
+		goto release;
+	}
+
+	if (!backup_file)
+		switch(set_new_data_offset(sra, st, devname,
+					   reshape.after.data_disks - reshape.before.data_disks,
+					   data_offset,
+					   reshape.min_offset_change, 1)) {
+	case -1:
+		goto release;
+	case 0:
+		/* Updated data_offset, so it's easy now */
+		update_cache_size(container, sra, info,
+				  min(reshape.before.data_disks,
+				      reshape.after.data_disks),
+				  reshape.backup_blocks);
+
+		/* Right, everything seems fine. Let's kick things off.
+		 */
+		sync_metadata(st);
+
+		if (impose_reshape(sra, info, st, fd, restart,
+				   devname, container, &reshape) < 0)
+			goto release;
+		if (sysfs_set_str(sra, NULL, "sync_action", "reshape") < 0) {
+			struct mdinfo *sd;
+			if (errno != EINVAL) {
+				pr_err("Failed to initiate reshape!\n");
+				goto release;
+			}
+			/* revert data_offset and try the old way */
+			for (sd = sra->devs; sd; sd = sd->next) {
+				sysfs_set_num(sra, sd, "new_offset",
+					      sd->data_offset);
+				sysfs_set_str(sra, NULL, "reshape_direction",
+					      "forwards");
+			}
+			/* leaves the enclosing switch: continue at 'started' */
+			break;
+		}
+		if (info->new_level == reshape.level)
+			return 0;
+		/* need to adjust level when reshape completes */
+		switch(fork()) {
+		case -1: /* ignore error, but don't wait */
+			return 0;
+		default: /* parent */
+			return 0;
+		case 0:
+			map_fork();
+			break;
+		}
+		close(fd);
+		wait_reshape(sra);
+		fd = open_dev(sra->sys_name);
+		if (fd >= 0)
+			impose_level(fd, info->new_level, devname, verbose);
+		return 0;
+	case 1: /* Couldn't set data_offset, try the old way */
+		if (data_offset != INVALID_SECTORS) {
+			pr_err("Cannot update data_offset on this array\n");
+			goto release;
+		}
+		break;
+	}
+
+started:
+	/* Decide how many blocks (sectors) for a reshape
+	 * unit.  The number we have so far is just a minimum
+	 */
+	blocks = reshape.backup_blocks;
+	if (reshape.before.data_disks ==
+	    reshape.after.data_disks) {
+		/* Make 'blocks' bigger for better throughput, but
+		 * not so big that we reject it below.
+		 * Try for 16 megabytes
+		 */
+		while (blocks * 32 < sra->component_size &&
+		       blocks < 16*1024*2)
+			blocks *= 2;
+	} else
+		pr_err("Need to backup %luK of critical section..\n", blocks/2);
+
+	if (blocks >= sra->component_size/2) {
+		pr_err("%s: Something wrong - reshape aborted\n",
+		       devname);
+		goto release;
+	}
+
+	/* Now we need to open all these devices so we can read/write.
+	 */
+	nrdisks = max(reshape.before.data_disks,
+		      reshape.after.data_disks) + reshape.parity
+		+ sra->array.spare_disks;
+	fdlist = xcalloc((1+nrdisks), sizeof(int));
+	offsets = xcalloc((1+nrdisks), sizeof(offsets[0]));
+
+	odisks = reshape.before.data_disks + reshape.parity;
+	d = reshape_prepare_fdlist(devname, sra, odisks,
+				   nrdisks, blocks, backup_file,
+				   fdlist, offsets);
+	if (d < odisks) {
+		goto release;
+	}
+	if ((st->ss->manage_reshape == NULL) ||
+	    (st->ss->recover_backup == NULL)) {
+		if (backup_file == NULL) {
+			if (reshape.after.data_disks <=
+			    reshape.before.data_disks) {
+				pr_err("%s: Cannot grow - need backup-file\n",
+				       devname);
+				pr_err(" Please provide one with \"--backup=...\"\n");
+				goto release;
+			} else if (d == odisks) {
+				pr_err("%s: Cannot grow - need a spare or backup-file to backup critical section\n", devname);
+				goto release;
+			}
+		} else {
+			if (!reshape_open_backup_file(backup_file, fd, devname,
+						      (signed)blocks,
+						      fdlist+d, offsets+d,
+						      sra->sys_name,
+						      restart)) {
+				goto release;
+			}
+			d++;
+		}
+	}
+
+	update_cache_size(container, sra, info,
+			  min(reshape.before.data_disks, reshape.after.data_disks),
+			  blocks);
+
+	/* Right, everything seems fine. Let's kick things off.
+	 * If only changing raid_disks, use ioctl, else use
+	 * sysfs.
+	 */
+	sync_metadata(st);
+
+	if (impose_reshape(sra, info, st, fd, restart,
+			   devname, container, &reshape) < 0)
+		goto release;
+
+	err = start_reshape(sra, restart, reshape.before.data_disks,
+			    reshape.after.data_disks);
+	if (err) {
+		pr_err("Cannot %s reshape for %s\n",
+		       restart ? "continue" : "start",
+		       devname);
+		goto release;
+	}
+	if (restart)
+		sysfs_set_str(sra, NULL, "array_state", "active");
+	if (freeze_reshape) {
+		/* The message reads sra->reshape_progress, so it has to be
+		 * printed before 'sra' is released.
+		 */
+		pr_err("Reshape has to be continued from location %llu when root filesystem has been mounted.\n",
+		       sra->reshape_progress);
+		free(fdlist);
+		free(offsets);
+		sysfs_free(sra);
+		return 1;
+	}
+
+	if (!forked && !check_env("MDADM_NO_SYSTEMCTL"))
+		if (continue_via_systemd(container ?: sra->sys_name)) {
+			free(fdlist);
+			free(offsets);
+			sysfs_free(sra);
+			return 0;
+		}
+
+	/* Now we just need to kick off the reshape and watch, while
+	 * handling backups of the data...
+	 * This is all done by a forked background process.
+	 */
+	switch(forked ? 0 : fork()) {
+	case -1:
+		pr_err("Cannot run child to monitor reshape: %s\n",
+		       strerror(errno));
+		abort_reshape(sra);
+		goto release;
+	default:
+		free(fdlist);
+		free(offsets);
+		sysfs_free(sra);
+		return 0;
+	case 0:
+		map_fork();
+		break;
+	}
+
+	/* If another array on the same devices is busy, the
+	 * reshape will wait for them.  This would mean that
+	 * the first section that we suspend will stay suspended
+	 * for a long time.  So check on that possibility
+	 * by looking for "DELAYED" in /proc/mdstat, and if found,
+	 * wait a while
+	 */
+	do {
+		struct mdstat_ent *mds, *m;
+		delayed = 0;
+		mds = mdstat_read(1, 0);
+		for (m = mds; m; m = m->next)
+			if (strcmp(m->devnm, sra->sys_name) == 0) {
+				if (m->resync &&
+				    m->percent == RESYNC_DELAYED)
+					delayed = 1;
+				if (m->resync == 0)
+					/* Haven't started the reshape thread
+					 * yet, wait a bit
+					 */
+					delayed = 2;
+				break;
+			}
+		free_mdstat(mds);
+		if (delayed == 1 && get_linux_version() < 3007000) {
+			pr_err("Reshape is delayed, but cannot wait carefully with this kernel.\n"
+			       " You might experience problems until other reshapes complete.\n");
+			delayed = 0;
+		}
+		if (delayed)
+			mdstat_wait(30 - (delayed-1) * 25);
+	} while (delayed);
+	mdstat_close();
+	close(fd);
+	if (check_env("MDADM_GROW_VERIFY"))
+		fd = open(devname, O_RDONLY | O_DIRECT);
+	else
+		fd = -1;
+	mlockall(MCL_FUTURE);
+
+	signal(SIGTERM, catch_term);
+
+	if (st->ss->external) {
+		/* metadata handler takes it from here */
+		done = st->ss->manage_reshape(
+			fd, sra, &reshape, st, blocks,
+			fdlist, offsets,
+			d - odisks, fdlist+odisks,
+			offsets+odisks);
+	} else
+		done = child_monitor(
+			fd, sra, &reshape, st, blocks,
+			fdlist, offsets,
+			d - odisks, fdlist+odisks,
+			offsets+odisks);
+
+	free(fdlist);
+	free(offsets);
+
+	if (backup_file && done) {
+		char *bul;
+		bul = make_backup(sra->sys_name);
+		if (bul) {
+			char buf[1024];
+			int l = readlink(bul, buf, sizeof(buf) - 1);
+			if (l > 0) {
+				buf[l]=0;
+				unlink(buf);
+			}
+			unlink(bul);
+			free(bul);
+		}
+		unlink(backup_file);
+	}
+	if (!done) {
+		abort_reshape(sra);
+		goto out;
+	}
+
+	if (!st->ss->external &&
+	    !(reshape.before.data_disks != reshape.after.data_disks
+	      && info->custom_array_size) &&
+	    info->new_level == reshape.level &&
+	    !forked) {
+		/* no need to wait for the reshape to finish as
+		 * there is nothing more to do.
+		 */
+		sysfs_free(sra);
+		exit(0);
+	}
+	wait_reshape(sra);
+
+	if (st->ss->external) {
+		/* Re-load the metadata as much could have changed */
+		int cfd = open_dev(st->container_devnm);
+		if (cfd >= 0) {
+			flush_mdmon(container);
+			st->ss->free_super(st);
+			st->ss->load_container(st, cfd, container);
+			close(cfd);
+		}
+	}
+
+	/* set new array size if required customer_array_size is used
+	 * by this metadata.
+	 */
+	if (reshape.before.data_disks !=
+	    reshape.after.data_disks &&
+	    info->custom_array_size)
+		set_array_size(st, info, info->text_version);
+
+	if (info->new_level != reshape.level) {
+		if (fd < 0)
+			fd = open(devname, O_RDONLY);
+		impose_level(fd, info->new_level, devname, verbose);
+		close(fd);
+		if (info->new_level == 0)
+			st->update_tail = NULL;
+	}
+out:
+	sysfs_free(sra);
+	if (forked)
+		return 0;
+	unfreeze(st);
+	exit(0);
+
+release:
+	/* Common failure exit: fdlist/offsets/sra may still be NULL here */
+	free(fdlist);
+	free(offsets);
+	if (orig_level != UnSet && sra) {
+		c = map_num(pers, orig_level);
+		if (c && sysfs_set_str(sra, NULL, "level", c) == 0)
+			pr_err("aborting level change\n");
+	}
+	sysfs_free(sra);
+	if (!forked)
+		unfreeze(st);
+	return 1;
+}
+
+/*
+ * Reshape every member array of 'container' that the metadata marks as
+ * reshape_active, one after another, in a background process.
+ *
+ *  container      - container name
+ *  devname        - name passed to reshape_super() for messages
+ *  mdfd           - handle that is closed in the child process (after
+ *                   fork); ignored if negative
+ *  st             - metadata handler for the container
+ *  info           - requested change
+ *  force, backup_file, verbose, freeze_reshape
+ *                 - passed through to reshape_array()
+ *  forked         - non-zero if already running in the background, so no
+ *                   fork is done
+ *  restart        - non-zero to continue an interrupted reshape; the
+ *                   metadata update is then skipped
+ *
+ * In the calling process: returns 0 once the work has been handed to
+ * systemd or to a forked child, 1 if the metadata update or the fork
+ * failed.  The process that runs the reshape loop never returns; it
+ * leaves through exit(0).
+ */
+int reshape_container(char *container, char *devname,
+		      int mdfd,
+		      struct supertype *st,
+		      struct mdinfo *info,
+		      int force,
+		      char *backup_file, int verbose,
+		      int forked, int restart, int freeze_reshape)
+{
+	struct mdinfo *cc = NULL;
+	int rv = restart;
+	char last_devnm[32] = "";
+
+	/* component_size is not meaningful for a container,
+	 * so pass '0' meaning 'no change'
+	 */
+	if (!restart &&
+	    reshape_super(st, 0, info->new_level,
+			  info->new_layout, info->new_chunk,
+			  info->array.raid_disks, info->delta_disks,
+			  backup_file, devname, APPLY_METADATA_CHANGES,
+			  verbose)) {
+		unfreeze(st);
+		return 1;
+	}
+
+	sync_metadata(st);
+
+	/* ping monitor to be sure that update is on disk
+	 */
+	ping_monitor(container);
+
+	if (!forked && !freeze_reshape && !check_env("MDADM_NO_SYSTEMCTL"))
+		if (continue_via_systemd(container))
+			return 0;
+
+	switch (forked ? 0 : fork()) {
+	case -1: /* error */
+		/* Reported the same way as the fork failure in
+		 * reshape_array(); perror() with a string ending in a
+		 * newline would split the message from the error text.
+		 */
+		pr_err("Cannot fork to complete reshape: %s\n",
+		       strerror(errno));
+		unfreeze(st);
+		return 1;
+	default: /* parent */
+		if (!freeze_reshape)
+			printf("%s: multi-array reshape continues in background\n", Name);
+		return 0;
+	case 0: /* child */
+		map_fork();
+		break;
+	}
+
+	/* close unused handle in child process
+	 */
+	if (mdfd > -1)
+		close(mdfd);
+
+	while(1) {
+		/* For each member array with reshape_active,
+		 * we need to perform the reshape.
+		 * We pick the first array that needs reshaping and
+		 * reshape it.  reshape_array() will re-read the metadata
+		 * so the next time through a different array should be
+		 * ready for reshape.
+		 * It is possible that the 'different' array will not
+		 * be assembled yet.  In that case we simple exit.
+		 * When it is assembled, the mdadm which assembles it
+		 * will take over the reshape.
+		 */
+		struct mdinfo *content;
+		int fd;
+		struct mdstat_ent *mdstat;
+		char *adev;
+		int devid;
+
+		sysfs_free(cc);
+
+		cc = st->ss->container_content(st, NULL);
+
+		for (content = cc; content ; content = content->next) {
+			char *subarray;
+			if (!content->reshape_active)
+				continue;
+
+			subarray = strchr(content->text_version+1, '/')+1;
+			mdstat = mdstat_by_subdev(subarray, container);
+			if (!mdstat)
+				continue;
+			if (mdstat->active == 0) {
+				pr_err("Skipping inactive array %s.\n",
+				       mdstat->devnm);
+				free_mdstat(mdstat);
+				mdstat = NULL;
+				continue;
+			}
+			break;
+		}
+		/* 'mdstat' is only valid when a candidate was found */
+		if (!content)
+			break;
+
+		/* NOTE(review): 'mdstat' is not released on this path;
+		 * the process exits soon after - confirm whether a
+		 * free_mdstat() is wanted once the name has been copied.
+		 */
+		devid = devnm2devid(mdstat->devnm);
+		adev = map_dev(major(devid), minor(devid), 0);
+		if (!adev)
+			adev = content->text_version;
+
+		fd = open_dev(mdstat->devnm);
+		if (fd < 0) {
+			pr_err("Device %s cannot be opened for reshape.\n", adev);
+			break;
+		}
+
+		if (strcmp(last_devnm, mdstat->devnm) == 0) {
+			/* Do not allow for multiple reshape_array() calls for
+			 * the same array.
+			 * It can happen when reshape_array() returns without
+			 * error, when reshape is not finished (wrong reshape
+			 * starting/continuation conditions).  Mdmon doesn't
+			 * switch to next array in container and reentry
+			 * conditions for the same array occur.
+			 * This is possibly interim until the behaviour of
+			 * reshape_array is resolved().
+			 */
+			printf("%s: Multiple reshape execution detected for device %s.\n", Name, adev);
+			close(fd);
+			break;
+		}
+		strcpy(last_devnm, mdstat->devnm);
+
+		sysfs_init(content, fd, mdstat->devnm);
+
+		if (mdmon_running(container))
+			flush_mdmon(container);
+
+		rv = reshape_array(container, fd, adev, st,
+				   content, force, NULL, INVALID_SECTORS,
+				   backup_file, verbose, 1, restart,
+				   freeze_reshape);
+		close(fd);
+
+		if (freeze_reshape) {
+			sysfs_free(cc);
+			exit(0);
+		}
+
+		/* only the first array can be a continuation */
+		restart = 0;
+		if (rv)
+			break;
+
+		if (mdmon_running(container))
+			flush_mdmon(container);
+	}
+	if (!rv)
+		unfreeze(st);
+	sysfs_free(cc);
+	exit(0);
+}
+
+/*
+ * We run a child process in the background which performs the following
+ * steps:
+ * - wait for resync to reach a certain point
+ * - suspend io to the following section
+ * - backup that section
+ * - allow resync to proceed further
+ * - resume io
+ * - discard the backup.
+ *
+ * These are combined in slightly different ways in the three cases.
+ * Grow:
+ * - suspend/backup/allow/wait/resume/discard
+ * Shrink:
+ * - allow/wait/suspend/backup/allow/wait/resume/discard
+ * same-size:
+ * - wait/resume/discard/suspend/backup/allow
+ *
+ * suspend/backup/allow always come together
+ * wait/resume/discard do too.
+ * For the same-size case we have two backups to improve flow.
+ *
+ */
+
+int progress_reshape(struct mdinfo *info, struct reshape *reshape,
+		     unsigned long long backup_point,
+		     unsigned long long wait_point,
+		     unsigned long long *suspend_point,
+		     unsigned long long *reshape_completed, int *frozen)
+{
+	/* This function is called repeatedly by the reshape manager.
+	 * It determines how much progress can safely be made and allows
+	 * that progress.
+	 * - 'info' identifies the array and particularly records in
+	 *    ->reshape_progress the metadata's knowledge of progress
+	 *      This is a sector offset from the start of the array
+	 *      of the next array block to be relocated.  This number
+	 *      may increase from 0 or decrease from array_size, depending
+	 *      on the type of reshape that is happening.
+	 *    Note that in contrast, 'sync_completed' is a block count of the
+	 *    reshape so far.  It gives the distance between the start point
+	 *    (head or tail of device) and the next place that data will be
+	 *    written.  It always increases.
+	 * - 'reshape' is the structure created by analyse_change
+	 * - 'backup_point' shows how much the metadata manager has backed-up
+	 *   data.  For reshapes with increasing progress, it is the next address
+	 *   to be backed up, previous addresses have been backed-up.  For
+	 *   decreasing progress, it is the earliest address that has been
+	 *   backed up - later addresses are also backed up.
+	 *   So addresses between reshape_progress and backup_point are
+	 *   backed up providing those are in the 'correct' order.
+	 * - 'wait_point' is an array address.  When reshape_completed
+	 *   passes this point, progress_reshape should return.  It might
+	 *   return earlier if it determines that ->reshape_progress needs
+	 *   to be updated or further backup is needed.
+	 * - suspend_point is maintained by progress_reshape and the caller
+	 *   should not touch it except to initialise to zero.
+	 *   It is an array address and it only increases in 2.6.37 and earlier.
+	 *   This makes it difficult to handle reducing reshapes with
+	 *   external metadata.
+	 *   However: it is similar to backup_point in that it records the
+	 *     other end of a suspended region from  reshape_progress.
+	 *     it is moved to extend the region that is safe to backup and/or
+	 *     reshape
+	 * - reshape_completed is read from sysfs and returned.  The caller
+	 *   should copy this into ->reshape_progress when it has reason to
+	 *   believe that the metadata knows this, and any backup outside this
+	 *   has been erased.
+	 * - frozen: while *frozen is non-zero this function does not write
+	 *   'sync_max'.  *frozen is updated only on the recovery path at
+	 *   'check_progress', where it records whether 'sync_max' differs
+	 *   from the value computed here.
+	 *
+	 * Return value is:
+	 *   1 if more data from backup_point - but only as far as suspend_point,
+	 *     should be backed up (also returned when 'sync_completed'
+	 *     becomes readable again after a temporary abort)
+	 *   0 if things are progressing smoothly
+	 *  -1 if the reshape is finished because it is all done,
+	 *  -2 if the reshape is finished due to an error.
+	 */
+
+	int advancing = (reshape->after.data_disks
+			 >= reshape->before.data_disks);
+	unsigned long long need_backup; /* All data between start of array and
+					 * here will at some point need to
+					 * be backed up.
+					 */
+	unsigned long long read_offset, write_offset;
+	unsigned long long write_range;
+	unsigned long long max_progress, target, completed;
+	unsigned long long array_size = (info->component_size
+					 * reshape->before.data_disks);
+	int fd;
+	char buf[20];
+
+	/* First, we unsuspend any region that is now known to be safe.
+	 * If suspend_point is on the 'wrong' side of reshape_progress, then
+	 * we don't have or need suspension at the moment.  This is true for
+	 * native metadata when we don't need to back-up.
+	 */
+	if (advancing) {
+		if (info->reshape_progress <= *suspend_point)
+			sysfs_set_num(info, NULL, "suspend_lo",
+				      info->reshape_progress);
+	} else {
+		/* Note: this won't work in 2.6.37 and before.
+		 * Something somewhere should make sure we don't need it!
+		 */
+		if (info->reshape_progress >= *suspend_point)
+			sysfs_set_num(info, NULL, "suspend_hi",
+				      info->reshape_progress);
+	}
+
+	/* Now work out how far it is safe to progress.
+	 * If the read_offset for ->reshape_progress is less than
+	 * 'blocks' beyond the write_offset, we can only progress as far
+	 * as a backup.
+	 * Otherwise we can progress until the write_offset for the new location
+	 * reaches (within 'blocks' of) the read_offset at the current location.
+	 * However that region must be suspended unless we are using native
+	 * metadata.
+	 * If we need to suspend more, we limit it to 128M per device, which is
+	 * rather arbitrary and should be some time-based calculation.
+	 */
+	read_offset = info->reshape_progress / reshape->before.data_disks;
+	write_offset = info->reshape_progress / reshape->after.data_disks;
+	write_range = info->new_chunk/512;
+	if (reshape->before.data_disks == reshape->after.data_disks)
+		need_backup = array_size;
+	else
+		need_backup = reshape->backup_blocks;
+	if (advancing) {
+		if (read_offset < write_offset + write_range)
+			max_progress = backup_point;
+		else
+			max_progress =
+				read_offset *
+				reshape->after.data_disks;
+	} else {
+		if (read_offset > write_offset - write_range)
+			/* Can only progress as far as has been backed up,
+			 * which must be suspended */
+			max_progress = backup_point;
+		else if (info->reshape_progress <= need_backup)
+			max_progress = backup_point;
+		else {
+			if (info->array.major_version >= 0)
+				/* Can progress until backup is needed */
+				max_progress = need_backup;
+			else {
+				/* Can progress until metadata update is required */
+				max_progress =
+					read_offset *
+					reshape->after.data_disks;
+				/* but data must be suspended */
+				if (max_progress < *suspend_point)
+					max_progress = *suspend_point;
+			}
+		}
+	}
+
+	/* We know it is safe to progress to 'max_progress' providing
+	 * it is suspended or we are using native metadata.
+	 * Consider extending suspend_point 128M per device if it
+	 * is less than 64M per device beyond reshape_progress.
+	 * But always do a multiple of 'blocks'
+	 * FIXME this is too big - it takes too long to complete
+	 * this much.
+	 */
+	target = 64*1024*2 * min(reshape->before.data_disks,
+				 reshape->after.data_disks);
+	target /= reshape->backup_blocks;
+	if (target < 2)
+		target = 2;
+	target *= reshape->backup_blocks;
+
+	/* For externally managed metadata we always need to suspend IO to
+	 * the area being reshaped so we regularly push suspend_point forward.
+	 * For native metadata we only need the suspend if we are going to do
+	 * a backup.
+	 */
+	if (advancing) {
+		if ((need_backup > info->reshape_progress
+		     || info->array.major_version < 0) &&
+		    *suspend_point < info->reshape_progress + target) {
+			if (need_backup < *suspend_point + 2 * target)
+				*suspend_point = need_backup;
+			else if (*suspend_point + 2 * target < array_size)
+				*suspend_point += 2 * target;
+			else
+				*suspend_point = array_size;
+			sysfs_set_num(info, NULL, "suspend_hi", *suspend_point);
+			if (max_progress > *suspend_point)
+				max_progress = *suspend_point;
+		}
+	} else {
+		if (info->array.major_version >= 0) {
+			/* Only need to suspend when about to backup */
+			if (info->reshape_progress < need_backup * 2 &&
+			    *suspend_point > 0) {
+				*suspend_point = 0;
+				sysfs_set_num(info, NULL, "suspend_lo", 0);
+				sysfs_set_num(info, NULL, "suspend_hi", need_backup);
+			}
+		} else {
+			/* Need to suspend continually */
+			if (info->reshape_progress < *suspend_point)
+				*suspend_point = info->reshape_progress;
+			if (*suspend_point + target < info->reshape_progress)
+				/* No need to move suspend region yet */;
+			else {
+				if (*suspend_point >= 2 * target)
+					*suspend_point -= 2 * target;
+				else
+					*suspend_point = 0;
+				sysfs_set_num(info, NULL, "suspend_lo",
+					      *suspend_point);
+			}
+			if (max_progress < *suspend_point)
+				max_progress = *suspend_point;
+		}
+	}
+
+	/* now set sync_max to allow that progress. sync_max, like
+	 * sync_completed is a count of sectors written per device, so
+	 * we find the difference between max_progress and the start point,
+	 * and divide that by after.data_disks to get a sync_max
+	 * number.
+	 * At the same time we convert wait_point to a similar number
+	 * for comparing against sync_completed.
+	 */
+	/* scale down max_progress to per_disk */
+	max_progress /= reshape->after.data_disks;
+	/* Round to chunk size as some kernels give an erroneously high number */
+	max_progress /= info->new_chunk/512;
+	max_progress *= info->new_chunk/512;
+	/* And round to old chunk size as the kernel wants that */
+	max_progress /= info->array.chunk_size/512;
+	max_progress *= info->array.chunk_size/512;
+	/* Limit progress to the whole device */
+	if (max_progress > info->component_size)
+		max_progress = info->component_size;
+	wait_point /= reshape->after.data_disks;
+	if (!advancing) {
+		/* switch from 'device offset' to 'processed block count' */
+		max_progress = info->component_size - max_progress;
+		wait_point = info->component_size - wait_point;
+	}
+
+	/* While frozen, leave sync_max alone. */
+	if (!*frozen)
+		sysfs_set_num(info, NULL, "sync_max", max_progress);
+
+	/* Now wait.  If we have already reached the point that we were
+	 * asked to wait to, don't wait at all, else wait for any change.
+	 * We need to select on 'sync_completed' as that is the place that
+	 * notifications happen, but we are really interested in
+	 * 'reshape_position'
+	 */
+	fd = sysfs_get_fd(info, NULL, "sync_completed");
+	if (fd < 0)
+		goto check_progress;
+
+	if (sysfs_fd_get_ll(fd, &completed) < 0)
+		goto check_progress;
+
+	while (completed < max_progress && completed < wait_point) {
+		/* Check that sync_action is still 'reshape' to avoid
+		 * waiting forever on a dead array
+		 */
+		char action[20];
+		if (sysfs_get_str(info, NULL, "sync_action",
+				  action, 20) <= 0 ||
+		    strncmp(action, "reshape", 7) != 0)
+			break;
+		/* Some kernels reset 'sync_completed' to zero
+		 * before setting 'sync_action' to 'idle'.
+		 * So we need these extra tests.
+		 * NOTE(review): the test above already leaves the loop
+		 * unless 'action' starts with "reshape", so as written the
+		 * two "idle" comparisons below can never be true.
+		 */
+		if (completed == 0 && advancing
+		    && strncmp(action, "idle", 4) == 0
+		    && info->reshape_progress > 0)
+			break;
+		if (completed == 0 && !advancing
+		    && strncmp(action, "idle", 4) == 0
+		    && info->reshape_progress < (info->component_size
+						 * reshape->after.data_disks))
+			break;
+		sysfs_wait(fd, NULL);
+		if (sysfs_fd_get_ll(fd, &completed) < 0)
+			goto check_progress;
+	}
+	/* Some kernels reset 'sync_completed' to zero,
+	 * we need to have real point we are in md.
+	 * So in that case, read 'reshape_position' from sysfs.
+	 * If that cannot be read, *reshape_completed is left as the
+	 * caller supplied it.
+	 */
+	if (completed == 0) {
+		unsigned long long reshapep;
+		char action[20];
+		if (sysfs_get_str(info, NULL, "sync_action",
+				  action, 20) > 0 &&
+		    strncmp(action, "idle", 4) == 0 &&
+		    sysfs_get_ll(info, NULL,
+				 "reshape_position", &reshapep) == 0)
+			*reshape_completed = reshapep;
+	} else {
+		/* some kernels can give an incorrectly high
+		 * 'completed' number, so round down */
+		completed /= (info->new_chunk/512);
+		completed *= (info->new_chunk/512);
+		/* Convert 'completed' back in to a 'progress' number */
+		completed *= reshape->after.data_disks;
+		if (!advancing)
+			completed = (info->component_size
+				     * reshape->after.data_disks
+				     - completed);
+		*reshape_completed = completed;
+	}
+
+	close(fd);
+
+	/* We return the need_backup flag.  Caller will decide
+	 * how much - a multiple of ->backup_blocks up to *suspend_point
+	 */
+	if (advancing)
+		return need_backup > info->reshape_progress;
+	else
+		return need_backup >= info->reshape_progress;
+
+check_progress:
+	/* if we couldn't read a number from sync_completed, then
+	 * either the reshape did complete, or it aborted.
+	 * We can tell which by checking for 'none' in reshape_position.
+	 * If it did abort, then it might immediately restart if
+	 * it was just a device failure that leaves us degraded but
+	 * functioning.
+	 */
+	if (sysfs_get_str(info, NULL, "reshape_position", buf, sizeof(buf)) < 0
+	    || strncmp(buf, "none", 4) != 0) {
+		/* The abort might only be temporary.  Wait up to 10
+		 * seconds for fd to contain a valid number again.
+		 */
+		int wait = 10000;
+		int rv = -2;
+		/* The result of reading 'sync_max' below is not checked.
+		 * Seed the variable with the value computed above so that,
+		 * should the read leave it untouched, the comparison is
+		 * against a defined value rather than an indeterminate one.
+		 */
+		unsigned long long new_sync_max = max_progress;
+		while (fd >= 0 && rv < 0 && wait > 0) {
+			if (sysfs_wait(fd, &wait) != 1)
+				break;
+			switch (sysfs_fd_get_ll(fd, &completed)) {
+			case 0:
+				/* all good again */
+				rv = 1;
+				/* If "sync_max" is no longer max_progress
+				 * we need to freeze things
+				 */
+				sysfs_get_ll(info, NULL, "sync_max", &new_sync_max);
+				*frozen = (new_sync_max != max_progress);
+				break;
+			case -2: /* read error - abort */
+				wait = 0;
+				break;
+			}
+		}
+		if (fd >= 0)
+			close(fd);
+		return rv; /* abort */
+	} else {
+		/* Maybe racing with array shutdown - check state */
+		if (fd >= 0)
+			close(fd);
+		if (sysfs_get_str(info, NULL, "array_state", buf, sizeof(buf)) < 0
+		    || strncmp(buf, "inactive", 8) == 0
+		    || strncmp(buf, "clear",5) == 0)
+			return -2; /* abort */
+		return -1; /* complete */
+	}
+}
+
+/* FIXME return status is never checked */
+static int grow_backup(struct mdinfo *sra,
+		       unsigned long long offset, /* per device */
+		       unsigned long stripes, /* per device, in old chunks */
+		       int *sources, unsigned long long *offsets,
+		       int disks, int chunk, int level, int layout,
+		       int dests, int *destfd, unsigned long long *destoffsets,
+		       int part, int *degraded,
+		       char *buf)
+{
+	/* Backup 'blocks' sectors at 'offset' on each device of the array,
+	 * to storage 'destfd' (offset 'destoffsets'), after first
+	 * suspending IO.  Then allow resync to continue
+	 * over the suspended section.
+	 * Use part 'part' of the backup-super-block.
+	 *
+	 * 'sources'/'offsets' and 'buf' are handed to save_stripes().
+	 * '*degraded' is the degraded count seen on the previous call; when
+	 * the array's current count differs, member devices are re-checked,
+	 * the file descriptors of dead ones are closed, and '*degraded' is
+	 * updated.
+	 *
+	 * Returns 0 on success, the non-zero result of save_stripes() if
+	 * that fails, and -1 on any other failure.
+	 */
+	int odata = disks;
+	int rv = 0;
+	int i;
+	int sync_failed = 0;
+	unsigned long long ll;
+	unsigned long long backup_sectors, backup_bytes;
+	int new_degraded;
+	//printf("offset %llu\n", offset);
+	if (level >= 4)
+		odata--;
+	if (level == 6)
+		odata--;
+
+	/* Widen before multiplying so that the size of the backup region
+	 * cannot wrap where 'unsigned long' is only 32 bits.
+	 */
+	backup_sectors = (unsigned long long)stripes * (chunk/512) * odata;
+	backup_bytes = (unsigned long long)stripes * chunk * odata;
+
+	/* Check that array hasn't become degraded, else we might backup the wrong data */
+	if (sysfs_get_ll(sra, NULL, "degraded", &ll) < 0)
+		return -1; /* FIXME this error is ignored */
+	new_degraded = (int)ll;
+	if (new_degraded != *degraded) {
+		/* check each device to ensure it is still working */
+		struct mdinfo *sd;
+		for (sd = sra->devs ; sd ; sd = sd->next) {
+			if (sd->disk.state & (1<<MD_DISK_FAULTY))
+				continue;
+			if (sd->disk.state & (1<<MD_DISK_SYNC)) {
+				char sbuf[20];
+				if (sysfs_get_str(sra, sd, "state", sbuf, 20) < 0 ||
+				    strstr(sbuf, "faulty") ||
+				    strstr(sbuf, "in_sync") == NULL) {
+					/* this device is dead */
+					sd->disk.state = (1<<MD_DISK_FAULTY);
+					if (sd->disk.raid_disk >= 0 &&
+					    sources[sd->disk.raid_disk] >= 0) {
+						close(sources[sd->disk.raid_disk]);
+						sources[sd->disk.raid_disk] = -1;
+					}
+				}
+			}
+		}
+		*degraded = new_degraded;
+	}
+	if (part) {
+		bsb.arraystart2 = __cpu_to_le64(offset * odata);
+		bsb.length2 = __cpu_to_le64(backup_sectors);
+	} else {
+		bsb.arraystart = __cpu_to_le64(offset * odata);
+		bsb.length = __cpu_to_le64(backup_sectors);
+	}
+	if (part)
+		bsb.magic[15] = '2';
+
+	/* Position every destination at the start of the slot for 'part'.
+	 * A failed seek would leave the data written at the wrong place,
+	 * so give up before writing anything.
+	 */
+	for (i = 0; i < dests; i++) {
+		unsigned long long pos = destoffsets[i];
+
+		if (part)
+			pos += __le64_to_cpu(bsb.devstart2)*512;
+		if ((unsigned long long)lseek64(destfd[i], pos, 0) != pos)
+			return -1;
+	}
+
+	rv = save_stripes(sources, offsets,
+			  disks, chunk, level, layout,
+			  dests, destfd,
+			  offset*512*odata, backup_bytes,
+			  buf);
+
+	if (rv)
+		return rv;
+	bsb.mtime = __cpu_to_le64(time(0));
+	for (i = 0; i < dests; i++) {
+		bsb.devstart = __cpu_to_le64(destoffsets[i]/512);
+
+		bsb.sb_csum = bsb_csum((char*)&bsb, ((char*)&bsb.sb_csum)-((char*)&bsb));
+		if (memcmp(bsb.magic, "md_backup_data-2", 16) == 0)
+			bsb.sb_csum2 = bsb_csum((char*)&bsb,
+						((char*)&bsb.sb_csum2)-((char*)&bsb));
+
+		/* rv stays -1 unless this destination is written completely */
+		rv = -1;
+		/* One copy of the backup super-block goes 4K before the data */
+		if ((unsigned long long)lseek64(destfd[i], destoffsets[i] - 4096, 0)
+		    != destoffsets[i] - 4096)
+			break;
+		if (write(destfd[i], &bsb, 512) != 512)
+			break;
+		if (destoffsets[i] > 4096) {
+			/* and a second copy directly after the data */
+			if ((unsigned long long)lseek64(destfd[i], destoffsets[i]+backup_bytes, 0) !=
+			    destoffsets[i]+backup_bytes)
+				break;
+			if (write(destfd[i], &bsb, 512) != 512)
+				break;
+		}
+		/* Remember a failed flush but still write the remaining
+		 * destinations.
+		 */
+		if (fsync(destfd[i]) != 0)
+			sync_failed = 1;
+		rv = 0;
+	}
+	if (rv == 0 && sync_failed)
+		rv = -1;
+
+	return rv;
+}
+
+/* in 2.6.30, the value reported by sync_completed can be
+ * less than it should be by one stripe.
+ * This only happens when reshape hits sync_max and pauses.
+ * So allow wait_backup to either extend sync_max further
+ * than strictly necessary, or return before the
+ * sync has got quite as far as we would really like.
+ * This is what 'blocks2' is for.
+ * The various callers give appropriate values so that
+ * everything works.
+ */
+/* FIXME return value is often ignored */
+static int forget_backup(int dests, int *destfd,
+			 unsigned long long *destoffsets,
+			 int part)
+{
+	/*
+	 * Erase backup 'part' (which is 0 or 1)
+	 *
+	 * The start and length of that part are zeroed in the in-memory
+	 * backup super-block, which is then rewritten 4K before
+	 * destoffsets[i] on each of the 'dests' destinations.
+	 *
+	 * Every destination is attempted, even when an earlier one failed,
+	 * so that one bad destination does not leave a stale backup record
+	 * on the others.
+	 *
+	 * Returns 0 if every destination was rewritten, -1 if a seek or
+	 * write failed on any of them.
+	 */
+	int i;
+	int rv = 0;
+
+	if (part) {
+		bsb.arraystart2 = __cpu_to_le64(0);
+		bsb.length2 = __cpu_to_le64(0);
+	} else {
+		bsb.arraystart = __cpu_to_le64(0);
+		bsb.length = __cpu_to_le64(0);
+	}
+	bsb.mtime = __cpu_to_le64(time(0));
+	for (i = 0; i < dests; i++) {
+		unsigned long long sb_pos = destoffsets[i] - 4096;
+
+		/* devstart differs per destination, so checksum each time */
+		bsb.devstart = __cpu_to_le64(destoffsets[i]/512);
+		bsb.sb_csum = bsb_csum((char*)&bsb, ((char*)&bsb.sb_csum)-((char*)&bsb));
+		if (memcmp(bsb.magic, "md_backup_data-2", 16) == 0)
+			bsb.sb_csum2 = bsb_csum((char*)&bsb,
+						((char*)&bsb.sb_csum2)-((char*)&bsb));
+		/* Only write if the seek landed where intended */
+		if ((unsigned long long)lseek64(destfd[i], sb_pos, 0) != sb_pos ||
+		    write(destfd[i], &bsb, 512) != 512)
+			rv = -1;
+		/* result of the flush is not checked */
+		fsync(destfd[i]);
+	}
+	return rv;
+}
+
+/* Write 'msg' and a newline to file descriptor 2, then terminate.
+ * The exit status is 2 when both writes were complete and 1 otherwise.
+ */
+static void fail(char *msg)
+{
+	int short_write = 0;
+
+	if (write(2, msg, strlen(msg)) != (int)strlen(msg))
+		short_write = 1;
+	/* the newline is attempted even if the message was cut short */
+	if (write(2, "\n", 1) != 1)
+		short_write = 1;
+
+	if (short_write)
+		exit(1);
+	exit(2);
+}
+
+/* Scratch buffers shared by successive validate() calls: abuf holds data
+ * read from the array, bbuf data read from the backup.  Both are abuflen
+ * bytes long (0 when nothing is allocated).
+ */
+static char *abuf, *bbuf;
+static unsigned long long abuflen;
+
+/* Make sure abuf and bbuf can each hold 'len' bytes.
+ * Returns 0 on success.  On allocation failure returns -1 with both
+ * pointers NULL and abuflen 0, so that a later call cannot free a
+ * stale pointer a second time.
+ */
+static int validate_buffers(unsigned long long len)
+{
+	if (abuflen >= len)
+		return 0;
+
+	free(abuf);
+	free(bbuf);
+	abuf = NULL;
+	bbuf = NULL;
+	abuflen = 0;
+
+	if (posix_memalign((void**)&abuf, 4096, len) ||
+	    posix_memalign((void**)&bbuf, 4096, len)) {
+		/* one of the two may have been allocated; drop it */
+		free(abuf);
+		free(bbuf);
+		abuf = NULL;
+		bbuf = NULL;
+		return -1;
+	}
+	abuflen = len;
+	return 0;
+}
+
+static void validate(int afd, int bfd, unsigned long long offset)
+{
+	/* check the data in the backup against the array.
+	 * This is only used for regression testing and should not
+	 * be used while the array is active
+	 *
+	 * 'afd' is the array (nothing is done when it is negative),
+	 * 'bfd' the backup, and 'offset' the byte position of the backup
+	 * data within 'bfd'; the backup super-block is read from 4K
+	 * before it.  Any mismatch ends the process through fail().
+	 * If the comparison buffers cannot be allocated, validation is
+	 * silently abandoned.
+	 */
+	if (afd < 0)
+		return;
+	lseek64(bfd, offset - 4096, 0);
+	if (read(bfd, &bsb2, 512) != 512)
+		fail("cannot read bsb");
+	if (bsb2.sb_csum != bsb_csum((char*)&bsb2,
+				     ((char*)&bsb2.sb_csum)-((char*)&bsb2)))
+		fail("first csum bad");
+	if (memcmp(bsb2.magic, "md_backup_data", 14) != 0)
+		fail("magic is bad");
+	if (memcmp(bsb2.magic, "md_backup_data-2", 16) == 0 &&
+	    bsb2.sb_csum2 != bsb_csum((char*)&bsb2,
+				      ((char*)&bsb2.sb_csum2)-((char*)&bsb2)))
+		fail("second csum bad");
+
+	if (__le64_to_cpu(bsb2.devstart)*512 != offset)
+		fail("devstart is wrong");
+
+	if (bsb2.length) {
+		unsigned long long len = __le64_to_cpu(bsb2.length)*512;
+
+		if (validate_buffers(len) < 0)
+			/* just stop validating on mem-alloc failure */
+			return;
+
+		lseek64(bfd, offset, 0);
+		if ((unsigned long long)read(bfd, bbuf, len) != len) {
+			//printf("len %llu\n", len);
+			fail("read first backup failed");
+		}
+		lseek64(afd, __le64_to_cpu(bsb2.arraystart)*512, 0);
+		if ((unsigned long long)read(afd, abuf, len) != len)
+			fail("read first from array failed");
+		if (memcmp(bbuf, abuf, len) != 0) {
+#if 0
+			int i;
+			printf("offset=%llu len=%llu\n",
+			       (unsigned long long)__le64_to_cpu(bsb2.arraystart)*512, len);
+			for (i=0; i<len; i++)
+				if (bbuf[i] != abuf[i]) {
+					printf("first diff byte %d\n", i);
+					break;
+				}
+#endif
+			fail("data1 compare failed");
+		}
+	}
+	if (bsb2.length2) {
+		unsigned long long len = __le64_to_cpu(bsb2.length2)*512;
+
+		/* same allocation policy as for the first region */
+		if (validate_buffers(len) < 0)
+			return;
+
+		lseek64(bfd, offset+__le64_to_cpu(bsb2.devstart2)*512, 0);
+		if ((unsigned long long)read(bfd, bbuf, len) != len)
+			fail("read second backup failed");
+		lseek64(afd, __le64_to_cpu(bsb2.arraystart2)*512, 0);
+		if ((unsigned long long)read(afd, abuf, len) != len)
+			fail("read second from array failed");
+		if (memcmp(bbuf, abuf, len) != 0)
+			fail("data2 compare failed");
+	}
+}
+
+int child_monitor(int afd, struct mdinfo *sra, struct reshape *reshape,
+		  struct supertype *st, unsigned long blocks,
+		  int *fds, unsigned long long *offsets,
+		  int dests, int *destfd, unsigned long long *destoffsets)
+{
+	/* Monitor a reshape where backup is being performed using
+	 * 'native' mechanism - either to a backup file, or
+	 * to some space in a spare.
+	 *
+	 * - 'afd' is only handed to validate(), which does nothing when
+	 *   it is negative.
+	 * - 'sra' describes the array and its member devices;
+	 *   ->reshape_progress is updated as the reshape proceeds.
+	 * - 'reshape' gives the before/after geometry.
+	 * - 'st' is used to load a superblock and obtain the array uuid.
+	 * - 'blocks' is the size in sectors of one backup part; it is also
+	 *   recorded as the offset of the second part (devstart2).
+	 * - 'fds'/'offsets' are passed to grow_backup() as the data sources.
+	 * - 'dests', 'destfd', 'destoffsets' describe where backups go.
+	 *
+	 * Returns 1 if monitoring ended because the reshape completed or no
+	 * longer needs monitoring, 0 otherwise (including when no
+	 * superblock or buffer could be obtained).
+	 */
+	char *buf;
+	int degraded = -1;
+	unsigned long long speed;
+	unsigned long long suspend_point, array_size;
+	unsigned long long backup_point, wait_point;
+	unsigned long long reshape_completed;
+	int done = 0;
+	int increasing = reshape->after.data_disks >= reshape->before.data_disks;
+	int part = 0; /* The next part of the backup area to fill.  It may already
+		       * be full, so we need to check */
+	int level = reshape->level;
+	int layout = reshape->before.layout;
+	int data = reshape->before.data_disks;
+	int disks = reshape->before.data_disks + reshape->parity;
+	int chunk = sra->array.chunk_size;
+	struct mdinfo *sd;
+	unsigned long stripes;
+	int uuid[4];
+	int frozen = 0;
+
+	/* set up the backup-super-block.  This requires the
+	 * uuid from the array.
+	 */
+	/* Find a superblock */
+	for (sd = sra->devs; sd; sd = sd->next) {
+		char *dn;
+		int devfd;
+		int ok;
+		if (sd->disk.state & (1<<MD_DISK_FAULTY))
+			continue;
+		dn = map_dev(sd->disk.major, sd->disk.minor, 1);
+		devfd = dev_open(dn, O_RDONLY);
+		if (devfd < 0)
+			continue;
+		ok = st->ss->load_super(st, devfd, NULL);
+		close(devfd);
+		if (ok == 0)
+			break;
+	}
+	if (!sd) {
+		pr_err("Cannot find a superblock\n");
+		return 0;
+	}
+
+	memset(&bsb, 0, 512);
+	memcpy(bsb.magic, "md_backup_data-1", 16);
+	st->ss->uuid_from_super(st, uuid);
+	memcpy(bsb.set_uuid, uuid, 16);
+	bsb.mtime = __cpu_to_le64(time(0));
+	/* Every reader of devstart2 converts it with __le64_to_cpu(), so
+	 * it must be stored little-endian like the other bsb fields.
+	 */
+	bsb.devstart2 = __cpu_to_le64(blocks);
+
+	stripes = blocks / (sra->array.chunk_size/512) /
+		reshape->before.data_disks;
+
+	if (posix_memalign((void**)&buf, 4096, disks * chunk))
+		/* Don't start the 'reshape' */
+		return 0;
+	if (reshape->before.data_disks == reshape->after.data_disks) {
+		/* NOTE(review): the result of this read is not checked, yet
+		 * 'speed' is written back unconditionally at the end.
+		 */
+		sysfs_get_ll(sra, NULL, "sync_speed_min", &speed);
+		sysfs_set_num(sra, NULL, "sync_speed_min", 200000);
+	}
+
+	if (increasing) {
+		array_size = sra->component_size * reshape->after.data_disks;
+		backup_point = sra->reshape_progress;
+		suspend_point = 0;
+	} else {
+		array_size = sra->component_size * reshape->before.data_disks;
+		backup_point = reshape->backup_blocks;
+		suspend_point = array_size;
+	}
+
+	while (!done) {
+		int rv;
+
+		/* Want to return as soon the oldest backup slot can
+		 * be released as that allows us to start backing up
+		 * some more, providing suspend_point has been
+		 * advanced, which it should have.
+		 */
+		if (increasing) {
+			wait_point = array_size;
+			if (part == 0 && __le64_to_cpu(bsb.length) > 0)
+				wait_point = (__le64_to_cpu(bsb.arraystart) +
+					      __le64_to_cpu(bsb.length));
+			if (part == 1 && __le64_to_cpu(bsb.length2) > 0)
+				wait_point = (__le64_to_cpu(bsb.arraystart2) +
+					      __le64_to_cpu(bsb.length2));
+		} else {
+			wait_point = 0;
+			if (part == 0 && __le64_to_cpu(bsb.length) > 0)
+				wait_point = __le64_to_cpu(bsb.arraystart);
+			if (part == 1 && __le64_to_cpu(bsb.length2) > 0)
+				wait_point = __le64_to_cpu(bsb.arraystart2);
+		}
+
+		/* Preset so that the old value is kept if progress_reshape()
+		 * does not update it.
+		 */
+		reshape_completed = sra->reshape_progress;
+		rv = progress_reshape(sra, reshape,
+				      backup_point, wait_point,
+				      &suspend_point, &reshape_completed,
+				      &frozen);
+		/* external metadata would need to ping_monitor here */
+		sra->reshape_progress = reshape_completed;
+
+		/* Clear any backup region that is before 'here' */
+		if (increasing) {
+			if (__le64_to_cpu(bsb.length) > 0 &&
+			    reshape_completed >= (__le64_to_cpu(bsb.arraystart) +
+						  __le64_to_cpu(bsb.length)))
+				forget_backup(dests, destfd,
+					      destoffsets, 0);
+			if (__le64_to_cpu(bsb.length2) > 0 &&
+			    reshape_completed >= (__le64_to_cpu(bsb.arraystart2) +
+						  __le64_to_cpu(bsb.length2)))
+				forget_backup(dests, destfd,
+					      destoffsets, 1);
+		} else {
+			if (__le64_to_cpu(bsb.length) > 0 &&
+			    reshape_completed <= (__le64_to_cpu(bsb.arraystart)))
+				forget_backup(dests, destfd,
+					      destoffsets, 0);
+			if (__le64_to_cpu(bsb.length2) > 0 &&
+			    reshape_completed <= (__le64_to_cpu(bsb.arraystart2)))
+				forget_backup(dests, destfd,
+					      destoffsets, 1);
+		}
+		/* A pending sigterm is handled like an error from
+		 * progress_reshape()
+		 */
+		if (sigterm)
+			rv = -2;
+		if (rv < 0) {
+			if (rv == -1)
+				done = 1;
+			break;
+		}
+		if (rv == 0 && increasing && !st->ss->external) {
+			/* No longer need to monitor this reshape */
+			sysfs_set_str(sra, NULL, "sync_max", "max");
+			done = 1;
+			break;
+		}
+
+		while (rv) {
+			unsigned long long offset;
+			unsigned long actual_stripes;
+			/* Need to backup some data.
+			 * If 'part' is not used and the desired
+			 * backup size is suspended, do a backup,
+			 * then consider the next part.
+			 */
+			/* Check that 'part' is unused */
+			if (part == 0 && __le64_to_cpu(bsb.length) != 0)
+				break;
+			if (part == 1 && __le64_to_cpu(bsb.length2) != 0)
+				break;
+
+			offset = backup_point / data;
+			actual_stripes = stripes;
+			if (increasing) {
+				/* do not run past the end of the device */
+				if (offset + actual_stripes * (chunk/512) >
+				    sra->component_size)
+					actual_stripes = ((sra->component_size - offset)
+							  / (chunk/512));
+				/* nor past what is currently suspended */
+				if (offset + actual_stripes * (chunk/512) >
+				    suspend_point/data)
+					break;
+			} else {
+				/* do not run past the start of the device */
+				if (offset < actual_stripes * (chunk/512))
+					actual_stripes = offset / (chunk/512);
+				offset -= actual_stripes * (chunk/512);
+				if (offset < suspend_point/data)
+					break;
+			}
+			if (actual_stripes == 0)
+				break;
+			grow_backup(sra, offset, actual_stripes,
+				    fds, offsets,
+				    disks, chunk, level, layout,
+				    dests, destfd, destoffsets,
+				    part, &degraded, buf);
+			validate(afd, destfd[0], destoffsets[0]);
+			/* record where 'part' is up to */
+			part = !part;
+			if (increasing)
+				backup_point += actual_stripes * (chunk/512) * data;
+			else
+				backup_point -= actual_stripes * (chunk/512) * data;
+		}
+	}
+
+	/* FIXME maybe call progress_reshape one more time instead */
+	/* remove any remaining suspension */
+	sysfs_set_num(sra, NULL, "suspend_lo", 0x7FFFFFFFFFFFFFFFULL);
+	sysfs_set_num(sra, NULL, "suspend_hi", 0);
+	sysfs_set_num(sra, NULL, "suspend_lo", 0);
+	sysfs_set_num(sra, NULL, "sync_min", 0);
+
+	if (reshape->before.data_disks == reshape->after.data_disks)
+		sysfs_set_num(sra, NULL, "sync_speed_min", speed);
+	free(buf);
+	return done;
+}
+
+/*
+ * If any spare contains md_backup_data-1 which is recent wrt mtime,
+ * write that data into the array and update the super blocks with
+ * the new reshape_progress
+ */
+int Grow_restart(struct supertype *st, struct mdinfo *info, int *fdlist, int cnt,
+ char *backup_file, int verbose)
+{
+ int i, j;
+ int old_disks;
+ unsigned long long *offsets;
+ unsigned long long nstripe, ostripe;
+ int ndata, odata;
+
+ odata = info->array.raid_disks - info->delta_disks - 1;
+ if (info->array.level == 6) odata--; /* number of data disks */
+ ndata = info->array.raid_disks - 1;
+ if (info->new_level == 6) ndata--;
+
+ old_disks = info->array.raid_disks - info->delta_disks;
+
+ if (info->delta_disks <= 0)
+ /* Didn't grow, so the backup file must have
+ * been used
+ */
+ old_disks = cnt;
+ for (i=old_disks-(backup_file?1:0); i<cnt; i++) {
+ struct mdinfo dinfo;
+ int fd;
+ int bsbsize;
+ char *devname, namebuf[20];
+ unsigned long long lo, hi;
+
+ /* This was a spare and may have some saved data on it.
+ * Load the superblock, find and load the
+ * backup_super_block.
+ * If either fail, go on to next device.
+ * If the backup contains no new info, just return
+ * else restore data and update all superblocks
+ */
+ if (i == old_disks-1) {
+ fd = open(backup_file, O_RDONLY);
+ if (fd<0) {
+ pr_err("backup file %s inaccessible: %s\n",
+ backup_file, strerror(errno));
+ continue;
+ }
+ devname = backup_file;
+ } else {
+ fd = fdlist[i];
+ if (fd < 0)
+ continue;
+ if (st->ss->load_super(st, fd, NULL))
+ continue;
+
+ st->ss->getinfo_super(st, &dinfo, NULL);
+ st->ss->free_super(st);
+
+ if (lseek64(fd,
+ (dinfo.data_offset + dinfo.component_size - 8) <<9,
+ 0) < 0) {
+ pr_err("Cannot seek on device %d\n", i);
+ continue; /* Cannot seek */
+ }
+ sprintf(namebuf, "device-%d", i);
+ devname = namebuf;
+ }
+ if (read(fd, &bsb, sizeof(bsb)) != sizeof(bsb)) {
+ if (verbose)
+ pr_err("Cannot read from %s\n", devname);
+ continue; /* Cannot read */
+ }
+ if (memcmp(bsb.magic, "md_backup_data-1", 16) != 0 &&
+ memcmp(bsb.magic, "md_backup_data-2", 16) != 0) {
+ if (verbose)
+ pr_err("No backup metadata on %s\n", devname);
+ continue;
+ }
+ if (bsb.sb_csum != bsb_csum((char*)&bsb, ((char*)&bsb.sb_csum)-((char*)&bsb))) {
+ if (verbose)
+ pr_err("Bad backup-metadata checksum on %s\n", devname);
+ continue; /* bad checksum */
+ }
+ if (memcmp(bsb.magic, "md_backup_data-2", 16) == 0 &&
+ bsb.sb_csum2 != bsb_csum((char*)&bsb, ((char*)&bsb.sb_csum2)-((char*)&bsb))) {
+ if (verbose)
+ pr_err("Bad backup-metadata checksum2 on %s\n", devname);
+ continue; /* Bad second checksum */
+ }
+ if (memcmp(bsb.set_uuid,info->uuid, 16) != 0) {
+ if (verbose)
+ pr_err("Wrong uuid on backup-metadata on %s\n", devname);
+ continue; /* Wrong uuid */
+ }
+
+ /* array utime and backup-mtime should be updated at much the same time, but it seems that
+ * sometimes they aren't... So allow considerable flexability in matching, and allow
+ * this test to be overridden by an environment variable.
+ */
+ if(time_after(info->array.utime, (unsigned int)__le64_to_cpu(bsb.mtime) + 2*60*60) ||
+ time_before(info->array.utime, (unsigned int)__le64_to_cpu(bsb.mtime) - 10*60)) {
+ if (check_env("MDADM_GROW_ALLOW_OLD")) {
+ pr_err("accepting backup with timestamp %lu for array with timestamp %lu\n",
+ (unsigned long)__le64_to_cpu(bsb.mtime),
+ (unsigned long)info->array.utime);
+ } else {
+ pr_err("too-old timestamp on backup-metadata on %s\n", devname);
+ pr_err("If you think it is should be safe, try 'export MDADM_GROW_ALLOW_OLD=1'\n");
+ continue; /* time stamp is too bad */
+ }
+ }
+
+ if (bsb.magic[15] == '1') {
+ if (bsb.length == 0)
+ continue;
+ if (info->delta_disks >= 0) {
+ /* reshape_progress is increasing */
+ if (__le64_to_cpu(bsb.arraystart)
+ + __le64_to_cpu(bsb.length)
+ < info->reshape_progress) {
+ nonew:
+ if (verbose)
+ pr_err("backup-metadata found on %s but is not needed\n", devname);
+ continue; /* No new data here */
+ }
+ } else {
+ /* reshape_progress is decreasing */
+ if (__le64_to_cpu(bsb.arraystart) >=
+ info->reshape_progress)
+ goto nonew; /* No new data here */
+ }
+ } else {
+ if (bsb.length == 0 && bsb.length2 == 0)
+ continue;
+ if (info->delta_disks >= 0) {
+ /* reshape_progress is increasing */
+ if ((__le64_to_cpu(bsb.arraystart)
+ + __le64_to_cpu(bsb.length)
+ < info->reshape_progress)
+ &&
+ (__le64_to_cpu(bsb.arraystart2)
+ + __le64_to_cpu(bsb.length2)
+ < info->reshape_progress))
+ goto nonew; /* No new data here */
+ } else {
+ /* reshape_progress is decreasing */
+ if (__le64_to_cpu(bsb.arraystart) >=
+ info->reshape_progress &&
+ __le64_to_cpu(bsb.arraystart2) >=
+ info->reshape_progress)
+ goto nonew; /* No new data here */
+ }
+ }
+ if (lseek64(fd, __le64_to_cpu(bsb.devstart)*512, 0)< 0) {
+ second_fail:
+ if (verbose)
+ pr_err("Failed to verify secondary backup-metadata block on %s\n",
+ devname);
+ continue; /* Cannot seek */
+ }
+ /* There should be a duplicate backup superblock 4k before here */
+ if (lseek64(fd, -4096, 1) < 0 ||
+ read(fd, &bsb2, sizeof(bsb2)) != sizeof(bsb2))
+ goto second_fail; /* Cannot find leading superblock */
+ if (bsb.magic[15] == '1')
+ bsbsize = offsetof(struct mdp_backup_super, pad1);
+ else
+ bsbsize = offsetof(struct mdp_backup_super, pad);
+ if (memcmp(&bsb2, &bsb, bsbsize) != 0)
+ goto second_fail; /* Cannot find leading superblock */
+
+ /* Now need the data offsets for all devices. */
+ offsets = xmalloc(sizeof(*offsets)*info->array.raid_disks);
+ for(j=0; j<info->array.raid_disks; j++) {
+ if (fdlist[j] < 0)
+ continue;
+ if (st->ss->load_super(st, fdlist[j], NULL))
+ /* FIXME should be this be an error */
+ continue;
+ st->ss->getinfo_super(st, &dinfo, NULL);
+ st->ss->free_super(st);
+ offsets[j] = dinfo.data_offset * 512;
+ }
+ printf("%s: restoring critical section\n", Name);
+
+ if (restore_stripes(fdlist, offsets,
+ info->array.raid_disks,
+ info->new_chunk,
+ info->new_level,
+ info->new_layout,
+ fd, __le64_to_cpu(bsb.devstart)*512,
+ __le64_to_cpu(bsb.arraystart)*512,
+ __le64_to_cpu(bsb.length)*512, NULL)) {
+ /* didn't succeed, so giveup */
+ if (verbose)
+ pr_err("Error restoring backup from %s\n",
+ devname);
+ free(offsets);
+ return 1;
+ }
+
+ if (bsb.magic[15] == '2' &&
+ restore_stripes(fdlist, offsets,
+ info->array.raid_disks,
+ info->new_chunk,
+ info->new_level,
+ info->new_layout,
+ fd, __le64_to_cpu(bsb.devstart)*512 +
+ __le64_to_cpu(bsb.devstart2)*512,
+ __le64_to_cpu(bsb.arraystart2)*512,
+ __le64_to_cpu(bsb.length2)*512, NULL)) {
+ /* didn't succeed, so giveup */
+ if (verbose)
+ pr_err("Error restoring second backup from %s\n",
+ devname);
+ free(offsets);
+ return 1;
+ }
+
+ free(offsets);
+
+ /* Ok, so the data is restored. Let's update those superblocks. */
+
+ lo = hi = 0;
+ if (bsb.length) {
+ lo = __le64_to_cpu(bsb.arraystart);
+ hi = lo + __le64_to_cpu(bsb.length);
+ }
+ if (bsb.magic[15] == '2' && bsb.length2) {
+ unsigned long long lo1, hi1;
+ lo1 = __le64_to_cpu(bsb.arraystart2);
+ hi1 = lo1 + __le64_to_cpu(bsb.length2);
+ if (lo == hi) {
+ lo = lo1;
+ hi = hi1;
+ } else if (lo < lo1)
+ hi = hi1;
+ else
+ lo = lo1;
+ }
+ if (lo < hi &&
+ (info->reshape_progress < lo ||
+ info->reshape_progress > hi))
+ /* backup does not affect reshape_progress*/ ;
+ else if (info->delta_disks >= 0) {
+ info->reshape_progress = __le64_to_cpu(bsb.arraystart) +
+ __le64_to_cpu(bsb.length);
+ if (bsb.magic[15] == '2') {
+ unsigned long long p2 = __le64_to_cpu(bsb.arraystart2) +
+ __le64_to_cpu(bsb.length2);
+ if (p2 > info->reshape_progress)
+ info->reshape_progress = p2;
+ }
+ } else {
+ info->reshape_progress = __le64_to_cpu(bsb.arraystart);
+ if (bsb.magic[15] == '2') {
+ unsigned long long p2 = __le64_to_cpu(bsb.arraystart2);
+ if (p2 < info->reshape_progress)
+ info->reshape_progress = p2;
+ }
+ }
+ for (j=0; j<info->array.raid_disks; j++) {
+ if (fdlist[j] < 0)
+ continue;
+ if (st->ss->load_super(st, fdlist[j], NULL))
+ continue;
+ st->ss->getinfo_super(st, &dinfo, NULL);
+ dinfo.reshape_progress = info->reshape_progress;
+ st->ss->update_super(st, &dinfo,
+ "_reshape_progress",
+ NULL,0, 0, NULL);
+ st->ss->store_super(st, fdlist[j]);
+ st->ss->free_super(st);
+ }
+ return 0;
+ }
+ /* Didn't find any backup data, try to see if any
+ * was needed.
+ */
+ if (info->delta_disks < 0) {
+ /* When shrinking, the critical section is at the end.
+ * So see if we are before the critical section.
+ */
+ unsigned long long first_block;
+ nstripe = ostripe = 0;
+ first_block = 0;
+ while (ostripe >= nstripe) {
+ ostripe += info->array.chunk_size / 512;
+ first_block = ostripe * odata;
+ nstripe = first_block / ndata / (info->new_chunk/512) *
+ (info->new_chunk/512);
+ }
+
+ if (info->reshape_progress >= first_block)
+ return 0;
+ }
+ if (info->delta_disks > 0) {
+ /* See if we are beyond the critical section. */
+ unsigned long long last_block;
+ nstripe = ostripe = 0;
+ last_block = 0;
+ while (nstripe >= ostripe) {
+ nstripe += info->new_chunk / 512;
+ last_block = nstripe * ndata;
+ ostripe = last_block / odata / (info->array.chunk_size/512) *
+ (info->array.chunk_size/512);
+ }
+
+ if (info->reshape_progress >= last_block)
+ return 0;
+ }
+ /* needed to recover critical section! */
+ if (verbose)
+ pr_err("Failed to find backup of critical section\n");
+ return 1;
+}
+
+/*
+ * Grow_continue_command - resume an interrupted reshape when asked to from
+ * the command line.
+ *
+ * devname:     name of the array or container, used in messages.
+ * fd:          open descriptor for that device.  When the device is a
+ *              container given without a subarray, this descriptor is
+ *              closed here and replaced by an exclusive open of the
+ *              container.
+ * backup_file: handed unchanged to Grow_continue().
+ * verbose:     not referenced in this function.
+ *
+ * Returns 1 for any failure detected here, otherwise the value returned
+ * by Grow_continue().
+ */
+int Grow_continue_command(char *devname, int fd,
+			  char *backup_file, int verbose)
+{
+	int ret_val = 0;
+	struct supertype *st = NULL;
+	struct mdinfo *content = NULL;
+	struct mdinfo array;
+	char *subarray = NULL;
+	struct mdinfo *cc = NULL;
+	struct mdstat_ent *mdstat = NULL;
+	int cfd = -1;
+	int fd2 = -1;
+
+	dprintf("Grow continue from command line called for %s\n",
+		devname);
+
+	st = super_by_fd(fd, &subarray);
+	if (!st || !st->ss) {
+		pr_err("Unable to determine metadata format for %s\n",
+		       devname);
+		return 1;
+	}
+	dprintf("Grow continue is run for ");
+	if (st->ss->external == 0) {
+		int d;
+		dprintf_cont("native array (%s)\n", devname);
+		if (ioctl(fd, GET_ARRAY_INFO, &array.array) < 0) {
+			pr_err("%s is not an active md array - aborting\n", devname);
+			ret_val = 1;
+			goto Grow_continue_command_exit;
+		}
+		content = &array;
+		/* Need to load a superblock.
+		 * FIXME we should really get what we need from
+		 * sysfs
+		 */
+		for (d = 0; d < MAX_DISKS; d++) {
+			mdu_disk_info_t disk;
+			char *dv;
+			int err;
+			disk.number = d;
+			if (ioctl(fd, GET_DISK_INFO, &disk) < 0)
+				continue;
+			if (disk.major == 0 && disk.minor == 0)
+				continue;
+			if ((disk.state & (1 << MD_DISK_ACTIVE)) == 0)
+				continue;
+			dv = map_dev(disk.major, disk.minor, 1);
+			if (!dv)
+				continue;
+			fd2 = dev_open(dv, O_RDONLY);
+			if (fd2 < 0)
+				continue;
+			err = st->ss->load_super(st, fd2, NULL);
+			close(fd2);
+			/* invalidate fd2 to avoid possible double close() */
+			fd2 = -1;
+			if (err)
+				continue;
+			/* first active member whose superblock loads wins */
+			break;
+		}
+		if (d == MAX_DISKS) {
+			pr_err("Unable to load metadata for %s\n",
+			       devname);
+			ret_val = 1;
+			goto Grow_continue_command_exit;
+		}
+		st->ss->getinfo_super(st, content, NULL);
+	} else {
+		char *container;
+
+		if (subarray) {
+			dprintf_cont("subarray (%s)\n", subarray);
+			container = st->container_devnm;
+			cfd = open_dev_excl(st->container_devnm);
+		} else {
+			container = st->devnm;
+			close(fd);
+			cfd = open_dev_excl(st->devnm);
+			dprintf_cont("container (%s)\n", container);
+			fd = cfd;
+		}
+		if (cfd < 0) {
+			pr_err("Unable to open container for %s\n", devname);
+			ret_val = 1;
+			goto Grow_continue_command_exit;
+		}
+
+		/* find in container array under reshape
+		 */
+		ret_val = st->ss->load_container(st, cfd, NULL);
+		if (ret_val) {
+			pr_err("Cannot read superblock for %s\n",
+			       devname);
+			ret_val = 1;
+			goto Grow_continue_command_exit;
+		}
+
+		cc = st->ss->container_content(st, subarray);
+		for (content = cc; content ; content = content->next) {
+			char *member;
+			int allow_reshape = 1;
+
+			if (content->reshape_active == 0)
+				continue;
+			/* The decision about array or container wide
+			 * reshape is taken in Grow_continue based
+			 * content->reshape_active state, therefore we
+			 * need to check_reshape based on
+			 * reshape_active and subarray name
+			 */
+			if (content->array.state & (1<<MD_SB_BLOCK_VOLUME))
+				allow_reshape = 0;
+			if (content->reshape_active == CONTAINER_RESHAPE &&
+			    (content->array.state
+			     & (1<<MD_SB_BLOCK_CONTAINER_RESHAPE)))
+				allow_reshape = 0;
+
+			if (!allow_reshape) {
+				pr_err("cannot continue reshape of an array in container with unsupported metadata: %s(%s)\n",
+				       devname, container);
+				ret_val = 1;
+				goto Grow_continue_command_exit;
+			}
+
+			/* The member name is whatever follows the second '/'
+			 * in text_version.  Skip the entry when there is no
+			 * second '/', rather than handing an invalid pointer
+			 * to mdstat_by_subdev().
+			 */
+			member = strchr(content->text_version+1, '/');
+			if (!member)
+				continue;
+			member++;
+			mdstat = mdstat_by_subdev(member, container);
+			if (!mdstat)
+				continue;
+			if (mdstat->active == 0) {
+				pr_err("Skipping inactive array %s.\n",
+				       mdstat->devnm);
+				free_mdstat(mdstat);
+				mdstat = NULL;
+				continue;
+			}
+			break;
+		}
+		if (!content) {
+			pr_err("Unable to determine reshaped array for %s\n", devname);
+			ret_val = 1;
+			goto Grow_continue_command_exit;
+		}
+		fd2 = open_dev(mdstat->devnm);
+		if (fd2 < 0) {
+			pr_err("cannot open (%s)\n", mdstat->devnm);
+			ret_val = 1;
+			goto Grow_continue_command_exit;
+		}
+
+		sysfs_init(content, fd2, mdstat->devnm);
+
+		close(fd2);
+		fd2 = -1;
+
+		/* start mdmon in case it is not running
+		 */
+		if (!mdmon_running(container))
+			start_mdmon(container);
+		ping_monitor(container);
+
+		if (mdmon_running(container))
+			st->update_tail = &st->updates;
+		else {
+			pr_err("No mdmon found. Grow cannot continue.\n");
+			ret_val = 1;
+			goto Grow_continue_command_exit;
+		}
+	}
+
+	/* verify that array under reshape is started from
+	 * correct position
+	 */
+	if (verify_reshape_position(content, content->array.level) < 0) {
+		ret_val = 1;
+		goto Grow_continue_command_exit;
+	}
+
+	/* continue reshape
+	 */
+	ret_val = Grow_continue(fd, st, content, backup_file, 1, 0);
+
+Grow_continue_command_exit:
+	/* Single cleanup point: every descriptor and list is either still
+	 * at its initial "unset" value or owned by this function.
+	 */
+	if (fd2 > -1)
+		close(fd2);
+	if (cfd > -1)
+		close(cfd);
+	st->ss->free_super(st);
+	free_mdstat(mdstat);
+	sysfs_free(cc);
+	free(subarray);
+
+	return ret_val;
+}
+
+/*
+ * Grow_continue - restart a reshape that metadata records as in progress.
+ *
+ * Returns 2 when info->reshape_active is clear, 1 when the container of an
+ * externally-managed array cannot be opened, and otherwise the result of
+ * reshape_container() (external metadata) or reshape_array() (native).
+ */
+int Grow_continue(int mdfd, struct supertype *st, struct mdinfo *info,
+		  char *backup_file, int forked, int freeze_reshape)
+{
+	int container_fd;
+
+	/* Nothing to resume. */
+	if (info->reshape_active == 0)
+		return 2;
+
+	/* Native metadata: the array is reshaped directly. */
+	if (!st->ss->external)
+		return reshape_array(NULL, mdfd, "array", st, info, 1,
+				     NULL, INVALID_SECTORS,
+				     backup_file, 0, forked,
+				     1 | info->reshape_active,
+				     freeze_reshape);
+
+	/* External metadata: reload the container before reshaping. */
+	container_fd = open_dev(st->container_devnm);
+	if (container_fd < 0)
+		return 1;
+
+	st->ss->load_container(st, container_fd, st->container_devnm);
+	close(container_fd);
+
+	return reshape_container(st->container_devnm, NULL, mdfd,
+				 st, info, 0, backup_file,
+				 0, forked,
+				 1 | info->reshape_active,
+				 freeze_reshape);
+}
+
+/*
+ * make_backup - build the path "<MAP_DIR>/backup_file-<name>".
+ *
+ * The string is allocated with xmalloc(); the caller releases it.
+ */
+char *make_backup(char *name)
+{
+	char *prefix = "backup_file-";
+	char *path;
+	int need;
+
+	/* directory + '/' + prefix + name + terminating NUL */
+	need = strlen(MAP_DIR) + 1 + strlen(prefix) + strlen(name) + 1;
+	path = xmalloc(need);
+	sprintf(path, "%s/%s%s", MAP_DIR, prefix, name);
+
+	return path;
+}
+
+/*
+ * locate_backup - return the backup-file path for 'name' if it exists as a
+ * regular file, otherwise NULL.
+ *
+ * On success the returned string is the one produced by make_backup() and
+ * is owned by the caller.
+ */
+char *locate_backup(char *name)
+{
+	struct stat sb;
+	char *candidate = make_backup(name);
+
+	if (stat(candidate, &sb) != 0 || !S_ISREG(sb.st_mode)) {
+		/* missing, or present but not a regular file */
+		free(candidate);
+		return NULL;
+	}
+
+	return candidate;
+}
diff --git a/INSTALL b/INSTALL
new file mode 100644
index 0000000..f7bcc3e
--- /dev/null
+++ b/INSTALL
@@ -0,0 +1,13 @@
+
+To build mdadm, simply run:
+
+ make
+
+to install, run
+
+ make install
+
+as root.
+
+
+No configuration is necessary.
diff --git a/Incremental.c b/Incremental.c
new file mode 100644
index 0000000..24fd827
--- /dev/null
+++ b/Incremental.c
@@ -0,0 +1,1808 @@
+/*
+ * Incremental.c - support --incremental. Part of:
+ * mdadm - manage Linux "md" devices aka RAID arrays.
+ *
+ * Copyright (C) 2006-2013 Neil Brown <neilb@suse.de>
+ *
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * Author: Neil Brown
+ * Email: <neilb@suse.de>
+ * Paper: Neil Brown
+ * Novell Inc
+ * GPO Box Q1283
+ * QVB Post Office, NSW 1230
+ * Australia
+ */
+
+#include "mdadm.h"
+#include <sys/wait.h>
+#include <dirent.h>
+#include <ctype.h>
+
+/* Forward declarations for file-local helpers that Incremental() calls
+ * before their definitions appear.
+ */
+static int count_active(struct supertype *st, struct mdinfo *sra,
+ int mdfd, char **availp,
+ struct mdinfo *info);
+static void find_reject(int mdfd, struct supertype *st, struct mdinfo *sra,
+ int number, __u64 events, int verbose,
+ char *array_name);
+static int try_spare(char *devname, int *dfdp, struct dev_policy *pol,
+ struct map_ent *target,
+ struct supertype *st, int verbose);
+
+static int Incremental_container(struct supertype *st, char *devname,
+ struct context *c, char *only);
+
+/*
+ * Incremental - handle --incremental for a single device.
+ *
+ * devlist: the device to add is devlist->devname; the list is also walked
+ *          when checking whether mdadm.conf permits the device.
+ * c:       command context.  verbose, export, runstop, autof, homehost and
+ *          require_homehost are read; autof may be overwritten.
+ * st:      metadata handler supplied by the caller, or NULL to have it
+ *          guessed from the device.
+ *
+ * Returns 0 on success (including "attached, but the array cannot be
+ * started yet") and non-zero on failure.
+ */
+int Incremental(struct mddev_dev *devlist, struct context *c,
+		struct supertype *st)
+{
+	/* Add this device to an array, creating the array if necessary
+	 * and starting the array if sensible or - if runstop>0 - if possible.
+	 *
+	 * This has several steps:
+	 *
+	 * 1/ Check if device is permitted by mdadm.conf, reject if not.
+	 * 2/ Find metadata, reject if none appropriate (check
+	 *    version/name from args)
+	 * 3/ Check if there is a match in mdadm.conf
+	 * 3a/ if not, check for homehost match.  If no match, assemble as
+	 *    a 'foreign' array.
+	 * 4/ Determine device number.
+	 * - If in mdadm.conf with std name, use that
+	 * - UUID in /var/run/mdadm.map  use that
+	 * - If name is suggestive, use that. unless in use with different uuid.
+	 * - Choose a free, high number.
+	 * - Use a partitioned device unless strong suggestion not to.
+	 *         e.g. auto=md
+	 *   Don't choose partitioned for containers.
+	 * 5/ Find out if array already exists
+	 * 5a/ if it does not
+	 * - choose a name, from mdadm.conf or 'name' field in array.
+	 * - create the array
+	 * - add the device
+	 * 5b/ if it does
+	 * - check one drive in array to make sure metadata is a reasonably
+	 *       close match.  Reject if not (e.g. different type)
+	 * - add the device
+	 * 6/ Make sure /var/run/mdadm.map contains this array.
+	 * 7/ Is there enough devices to possibly start the array?
+	 *     For a container, this means running Incremental_container.
+	 * 7a/ if not, finish with success.
+	 * 7b/ if yes,
+	 * - read all metadata and arrange devices like -A does
+	 * - if number of OK devices match expected, or -R and there are enough,
+	 *   start the array (auto-readonly).
+	 */
+	struct stat stb;
+	struct mdinfo info, dinfo;
+	struct mdinfo *sra = NULL, *d;
+	struct mddev_ident *match;
+	char chosen_name[1024];
+	char *md_devname;
+	int rv = 1;
+	struct map_ent *mp, *map = NULL;
+	int dfd = -1, mdfd = -1;
+	char *avail = NULL;
+	int active_disks;
+	int trustworthy;
+	char *name_to_use;
+	mdu_array_info_t ainf;
+	struct dev_policy *policy = NULL;
+	struct map_ent target_array;
+	int have_target;
+	char *devname = devlist->devname;
+	int journal_device_missing = 0;
+
+	struct createinfo *ci = conf_get_create_info();
+
+	if (stat(devname, &stb) < 0) {
+		if (c->verbose >= 0)
+			pr_err("stat failed for %s: %s.\n",
+			       devname, strerror(errno));
+		return rv;
+	}
+	if ((stb.st_mode & S_IFMT) != S_IFBLK) {
+		if (c->verbose >= 0)
+			pr_err("%s is not a block device.\n",
+			       devname);
+		return rv;
+	}
+	dfd = dev_open(devname, O_RDONLY);
+	if (dfd < 0) {
+		if (c->verbose >= 0)
+			pr_err("cannot open %s: %s.\n",
+			       devname, strerror(errno));
+		return rv;
+	}
+	/* If the device is a container, we do something very different */
+	if (must_be_container(dfd)) {
+		if (!st)
+			st = super_by_fd(dfd, NULL);
+		if (st && st->ss->load_container)
+			rv = st->ss->load_container(st, dfd, NULL);
+
+		close(dfd);
+		/* rv is still non-zero if st is NULL, so st is safe below */
+		if (!rv && st->ss->container_content) {
+			if (map_lock(&map))
+				pr_err("failed to get exclusive lock on mapfile\n");
+			if (c->export)
+				printf("MD_DEVNAME=%s\n", devname);
+			rv = Incremental_container(st, devname, c, NULL);
+			map_unlock(&map);
+			return rv;
+		}
+
+		pr_err("%s is not part of an md array.\n",
+		       devname);
+		return rv;
+	}
+
+	/* 1/ Check if device is permitted by mdadm.conf */
+
+	for (;devlist; devlist = devlist->next)
+		if (conf_test_dev(devlist->devname))
+			break;
+	if (!devlist) {
+		devlist = conf_get_devs();
+		for (;devlist; devlist = devlist->next) {
+			struct stat st2;
+			if (stat(devlist->devname, &st2) == 0 &&
+			    (st2.st_mode & S_IFMT) == S_IFBLK &&
+			    st2.st_rdev == stb.st_rdev)
+				break;
+		}
+	}
+	if (!devlist) {
+		if (c->verbose >= 0)
+			pr_err("%s not permitted by mdadm.conf.\n",
+			       devname);
+		goto out;
+	}
+
+	/* 2/ Find metadata, reject if none appropriate (check
+	 *            version/name from args) */
+
+	if (fstat(dfd, &stb) < 0) {
+		if (c->verbose >= 0)
+			pr_err("fstat failed for %s: %s.\n",
+			       devname, strerror(errno));
+		goto out;
+	}
+	if ((stb.st_mode & S_IFMT) != S_IFBLK) {
+		if (c->verbose >= 0)
+			pr_err("%s is not a block device.\n",
+			       devname);
+		goto out;
+	}
+
+	dinfo.disk.major = major(stb.st_rdev);
+	dinfo.disk.minor = minor(stb.st_rdev);
+
+	policy = disk_policy(&dinfo);
+	have_target = policy_check_path(&dinfo, &target_array);
+
+	if (st == NULL && (st = guess_super_type(dfd, guess_array)) == NULL) {
+		if (c->verbose >= 0)
+			pr_err("no recognisable superblock on %s.\n",
+			       devname);
+		rv = try_spare(devname, &dfd, policy,
+			       have_target ? &target_array : NULL,
+			       NULL, c->verbose);
+		goto out;
+	}
+	st->ignore_hw_compat = 0;
+
+	if (st->ss->compare_super == NULL ||
+	    st->ss->load_super(st, dfd, c->verbose >= 0 ? devname : NULL)) {
+		if (c->verbose >= 0)
+			pr_err("no RAID superblock on %s.\n",
+			       devname);
+		rv = try_spare(devname, &dfd, policy,
+			       have_target ? &target_array : NULL,
+			       st, c->verbose);
+		free(st);
+		goto out;
+	}
+	close (dfd); dfd = -1;
+
+	st->ss->getinfo_super(st, &info, NULL);
+
+	/* 3/ Check if there is a match in mdadm.conf */
+	match = conf_match(st, &info, devname, c->verbose, &rv);
+	if (!match && rv == 2)
+		goto out;
+
+	if (match && match->devname
+	    && strcasecmp(match->devname, "<ignore>") == 0) {
+		if (c->verbose >= 0)
+			pr_err("array containing %s is explicitly ignored by mdadm.conf\n",
+			       devname);
+		goto out;
+	}
+
+	/* 3a/ if not, check for homehost match.  If no match, continue
+	 * but don't trust the 'name' in the array. Thus a 'random' minor
+	 * number will be assigned, and the device name will be based
+	 * on that. */
+	if (match)
+		trustworthy = LOCAL;
+	else if (st->ss->match_home(st, c->homehost) == 1)
+		trustworthy = LOCAL;
+	else if (st->ss->match_home(st, "any") == 1)
+		trustworthy = LOCAL_ANY;
+	else
+		trustworthy = FOREIGN;
+
+	if (!match && !conf_test_metadata(st->ss->name, policy,
+					  (trustworthy == LOCAL))) {
+		if (c->verbose >= 1)
+			pr_err("%s has metadata type %s for which auto-assembly is disabled\n",
+			       devname, st->ss->name);
+		goto out;
+	}
+	if (trustworthy == LOCAL_ANY)
+		trustworthy = LOCAL;
+
+	/* There are three possible sources for 'autof':  command line,
+	 * ARRAY line in mdadm.conf, or CREATE line in mdadm.conf.
+	 * ARRAY takes precedence, then command line, then
+	 * CREATE.
+	 */
+	if (match && match->autof)
+		c->autof = match->autof;
+	if (c->autof == 0)
+		c->autof = ci->autof;
+
+	name_to_use = info.name;
+	if (name_to_use[0] == 0 &&
+	    info.array.level == LEVEL_CONTAINER) {
+		name_to_use = info.text_version;
+		trustworthy = METADATA;
+	}
+	if (name_to_use[0] && trustworthy != LOCAL &&
+	    ! c->require_homehost &&
+	    conf_name_is_free(name_to_use))
+		trustworthy = LOCAL;
+
+	/* strip "hostname:" prefix from name if we have decided
+	 * to treat it as LOCAL
+	 */
+	if (trustworthy == LOCAL && strchr(name_to_use, ':') != NULL)
+		name_to_use = strchr(name_to_use, ':')+1;
+
+	/* 4/ Check if array exists.
+	 */
+	if (map_lock(&map))
+		pr_err("failed to get exclusive lock on mapfile\n");
+	/* Now check we can get O_EXCL.  If not, probably "mdadm -A" has
+	 * taken over
+	 */
+	dfd = dev_open(devname, O_RDONLY|O_EXCL);
+	if (dfd < 0) {
+		if (c->verbose >= 0)
+			pr_err("cannot reopen %s: %s.\n",
+			       devname, strerror(errno));
+		goto out_unlock;
+	}
+	/* Cannot hold it open while we add the device to the array,
+	 * so we must release the O_EXCL and depend on the map_lock()
+	 * So now is the best time to remove any partitions.
+	 */
+	remove_partitions(dfd);
+	close(dfd);
+	dfd = -1;
+
+	mp = map_by_uuid(&map, info.uuid);
+	if (mp)
+		mdfd = open_dev(mp->devnm);
+	else
+		mdfd = -1;
+
+	if (mdfd < 0) {
+
+		/* Skip the clustered ones. This should be started by
+		 * clustering resource agents.
+		 * The map lock was taken above, so leave through
+		 * out_unlock like every other exit in this region.
+		 */
+		if (info.array.state & (1 << MD_SB_CLUSTERED))
+			goto out_unlock;
+
+		/* Couldn't find an existing array, maybe make a new one */
+		mdfd = create_mddev(match ? match->devname : NULL,
+				    name_to_use, c->autof, trustworthy, chosen_name);
+
+		if (mdfd < 0)
+			goto out_unlock;
+
+		sysfs_init(&info, mdfd, NULL);
+
+		if (set_array_info(mdfd, st, &info) != 0) {
+			pr_err("failed to set array info for %s: %s\n",
+			       chosen_name, strerror(errno));
+			rv = 2;
+			goto out_unlock;
+		}
+
+		dinfo = info;
+		dinfo.disk.major = major(stb.st_rdev);
+		dinfo.disk.minor = minor(stb.st_rdev);
+		if (add_disk(mdfd, st, &info, &dinfo) != 0) {
+			pr_err("failed to add %s to new array %s: %s.\n",
+			       devname, chosen_name, strerror(errno));
+			ioctl(mdfd, STOP_ARRAY, 0);
+			rv = 2;
+			goto out_unlock;
+		}
+		sra = sysfs_read(mdfd, NULL, (GET_DEVS | GET_STATE |
+					      GET_OFFSET | GET_SIZE));
+
+		if (!sra || !sra->devs || sra->devs->disk.raid_disk >= 0) {
+			/* It really should be 'none' - must be old buggy
+			 * kernel, and mdadm -I may not be able to complete.
+			 * So reject it.
+			 */
+			ioctl(mdfd, STOP_ARRAY, NULL);
+			pr_err("You have an old buggy kernel which cannot support\n      --incremental reliably.  Aborting.\n");
+			rv = 2;
+			goto out_unlock;
+		}
+		info.array.working_disks = 1;
+		/* 6/ Make sure /var/run/mdadm.map contains this array. */
+		map_update(&map, fd2devnm(mdfd),
+			   info.text_version,
+			   info.uuid, chosen_name);
+	} else {
+	/* 5b/ if it does */
+	/* - check one drive in array to make sure metadata is a reasonably */
+	/*        close match.  Reject if not (e.g. different type) */
+	/* - add the device */
+		/* "%d:%d": two ints of up to 11 chars each, ':' and NUL */
+		char dn[24];
+		int dfd2;
+		int err;
+		struct supertype *st2;
+		struct mdinfo info2, *d;
+
+		sra = sysfs_read(mdfd, NULL, (GET_DEVS | GET_STATE |
+					      GET_OFFSET | GET_SIZE));
+
+		if (mp->path)
+			strcpy(chosen_name, mp->path);
+		else
+			strcpy(chosen_name, mp->devnm);
+
+		/* It is generally not OK to add non-spare drives to a
+		 * running array as they are probably missing because
+		 * they failed.  However if runstop is 1, then the
+		 * array was possibly started early and our best bet is
+		 * to add this anyway.
+		 * Also if action policy is re-add or better we allow
+		 * re-add.
+		 * This doesn't apply to containers as the 'non-spare'
+		 * flag has a different meaning.  The test has to happen
+		 * at the device level there
+		 */
+		if (!st->ss->external
+		    && (info.disk.state & (1<<MD_DISK_SYNC)) != 0
+		    && ! policy_action_allows(policy, st->ss->name,
+					      act_re_add)
+		    && c->runstop < 1) {
+			if (ioctl(mdfd, GET_ARRAY_INFO, &ainf) == 0) {
+				pr_err("not adding %s to active array (without --run) %s\n",
+				       devname, chosen_name);
+				rv = 2;
+				goto out_unlock;
+			}
+		}
+		if (!sra) {
+			rv = 2;
+			goto out_unlock;
+		}
+		if (sra->devs) {
+			sprintf(dn, "%d:%d", sra->devs->disk.major,
+				sra->devs->disk.minor);
+			dfd2 = dev_open(dn, O_RDONLY);
+			if (dfd2 < 0) {
+				/* report the member that failed to open,
+				 * not the device being added
+				 */
+				pr_err("unable to open %s\n", dn);
+				rv = 2;
+				goto out_unlock;
+			}
+			/* NOTE(review): st2 itself is never released on any
+			 * path below - confirm ownership of the dup_super()
+			 * result before adding a free().
+			 */
+			st2 = dup_super(st);
+			if (st2->ss->load_super(st2, dfd2, NULL) ||
+			    st->ss->compare_super(st, st2) != 0) {
+				pr_err("metadata mismatch between %s and chosen array %s\n",
+				       devname, chosen_name);
+				close(dfd2);
+				rv = 2;
+				goto out_unlock;
+			}
+			close(dfd2);
+			st2->ss->getinfo_super(st2, &info2, NULL);
+			st2->ss->free_super(st2);
+			if (info.array.level != info2.array.level ||
+			    memcmp(info.uuid, info2.uuid, 16) != 0 ||
+			    info.array.raid_disks != info2.array.raid_disks) {
+				pr_err("unexpected difference between %s and %s.\n",
+				       chosen_name, devname);
+				rv = 2;
+				goto out_unlock;
+			}
+		}
+		info.disk.major = major(stb.st_rdev);
+		info.disk.minor = minor(stb.st_rdev);
+		/* add disk needs to know about containers */
+		if (st->ss->external)
+			sra->array.level = LEVEL_CONTAINER;
+
+		if (info.array.state & (1 << MD_SB_CLUSTERED))
+			info.disk.state |= (1 << MD_DISK_CLUSTER_ADD);
+
+		err = add_disk(mdfd, st, sra, &info);
+		if (err < 0 && errno == EBUSY) {
+			/* could be another device present with the same
+			 * disk.number. Find and reject any such
+			 */
+			find_reject(mdfd, st, sra, info.disk.number,
+				    info.events, c->verbose, chosen_name);
+			err = add_disk(mdfd, st, sra, &info);
+		}
+		if (err < 0 && errno == EINVAL &&
+		    info.disk.state & (1<<MD_DISK_SYNC)) {
+			/* Maybe it needs to be added as a spare */
+			if (policy_action_allows(policy, st->ss->name,
+						 act_force_spare)) {
+				info.disk.state &= ~(1<<MD_DISK_SYNC);
+				err = add_disk(mdfd, st, sra, &info);
+			} else
+				if (c->verbose >= 0)
+					pr_err("can only add %s to %s as a spare, and force-spare is not set.\n",
+					       devname, chosen_name);
+		}
+		if (err < 0) {
+			pr_err("failed to add %s to existing array %s: %s.\n",
+			       devname, chosen_name, strerror(errno));
+			rv = 2;
+			goto out_unlock;
+		}
+		info.array.working_disks = 0;
+		for (d = sra->devs; d; d=d->next)
+			info.array.working_disks ++;
+
+	}
+	if (strncmp(chosen_name, "/dev/md/", 8) == 0)
+		md_devname = chosen_name+8;
+	else
+		md_devname = chosen_name;
+	if (c->export) {
+		printf("MD_DEVICE=%s\n", fd2devnm(mdfd));
+		printf("MD_DEVNAME=%s\n", md_devname);
+		printf("MD_FOREIGN=%s\n", trustworthy == FOREIGN ? "yes" : "no");
+	}
+
+	/* 7/ Is there enough devices to possibly start the array? */
+	/* 7a/ if not, finish with success. */
+	if (info.array.level == LEVEL_CONTAINER) {
+		char devnm[32];
+		/* Try to assemble within the container */
+		sysfs_uevent(sra, "change");
+		if (!c->export && c->verbose >= 0)
+			pr_err("container %s now has %d device%s\n",
+			       chosen_name, info.array.working_disks,
+			       info.array.working_disks == 1?"":"s");
+		wait_for(chosen_name, mdfd);
+		if (st->ss->external)
+			strcpy(devnm, fd2devnm(mdfd));
+		if (st->ss->load_container)
+			rv = st->ss->load_container(st, mdfd, NULL);
+		close(mdfd);
+		sysfs_free(sra);
+		if (!rv)
+			rv = Incremental_container(st, chosen_name, c, NULL);
+		map_unlock(&map);
+		/* after spare is added, ping monitor for external metadata
+		 * so that it can eg. try to rebuild degraded array */
+		if (st->ss->external)
+			ping_monitor(devnm);
+		/* This path returns without passing through 'out', so the
+		 * policy list has to be released here.
+		 */
+		if (policy)
+			dev_policy_free(policy);
+		return rv;
+	}
+
+	/* We have added something to the array, so need to re-read the
+	 * state.  Eventually this state should be kept up-to-date as
+	 * things change.
+	 */
+	sysfs_free(sra);
+	sra = sysfs_read(mdfd, NULL, (GET_DEVS | GET_STATE |
+				      GET_OFFSET | GET_SIZE));
+	active_disks = count_active(st, sra, mdfd, &avail, &info);
+
+	journal_device_missing = (info.journal_device_required) && (info.journal_clean == 0);
+
+	if (enough(info.array.level, info.array.raid_disks,
+		   info.array.layout, info.array.state & 1,
+		   avail) == 0) {
+		if (c->export) {
+			printf("MD_STARTED=no\n");
+		} else if (c->verbose >= 0)
+			pr_err("%s attached to %s, not enough to start (%d).\n",
+			       devname, chosen_name, active_disks);
+		rv = 0;
+		goto out_unlock;
+	}
+
+	/* 7b/ if yes, */
+	/* - if number of OK devices match expected, or -R and there */
+	/*             are enough, */
+	/*   + add any bitmap file  */
+	/*   + start the array (auto-readonly). */
+
+	if (ioctl(mdfd, GET_ARRAY_INFO, &ainf) == 0) {
+		if (c->export) {
+			printf("MD_STARTED=already\n");
+		} else if (c->verbose >= 0)
+			pr_err("%s attached to %s which is already active.\n",
+			       devname, chosen_name);
+		rv = 0;
+		goto out_unlock;
+	}
+
+	/* The map lock is dropped here, so later exits use 'out'. */
+	map_unlock(&map);
+	if (c->runstop > 0 || (!journal_device_missing && active_disks >= info.array.working_disks)) {
+		struct mdinfo *dsk;
+		/* Let's try to start it */
+
+		if (journal_device_missing)
+			pr_err("Trying to run with missing journal device\n");
+		if (info.reshape_active && !(info.reshape_active & RESHAPE_NO_BACKUP)) {
+			pr_err("%s: This array is being reshaped and cannot be started\n",
+			       chosen_name);
+			cont_err("by --incremental.  Please use --assemble\n");
+			goto out;
+		}
+		if (match && match->bitmap_file) {
+			int bmfd = open(match->bitmap_file, O_RDWR);
+			if (bmfd < 0) {
+				pr_err("Could not open bitmap file %s.\n",
+				       match->bitmap_file);
+				goto out;
+			}
+			if (ioctl(mdfd, SET_BITMAP_FILE, bmfd) != 0) {
+				close(bmfd);
+				pr_err("Failed to set bitmapfile for %s.\n",
+				       chosen_name);
+				goto out;
+			}
+			close(bmfd);
+		}
+		/* Need to remove from the array any devices which
+		 * 'count_active' discerned were too old or inappropriate
+		 */
+		for (d = sra ? sra->devs : NULL ; d ; d = d->next)
+			if (d->disk.state & (1<<MD_DISK_REMOVED))
+				remove_disk(mdfd, st, sra, d);
+
+		if ((sra == NULL || active_disks >= info.array.working_disks)
+		    && trustworthy != FOREIGN)
+			rv = ioctl(mdfd, RUN_ARRAY, NULL);
+		else
+			rv = sysfs_set_str(sra, NULL,
+					   "array_state", "read-auto");
+		/* Array might be O_EXCL which  will interfere with
+		 * fsck and mount.  So re-open without O_EXCL.
+		 */
+		reopen_mddev(mdfd);
+		if (rv == 0) {
+			if (c->export) {
+				printf("MD_STARTED=yes\n");
+			} else if (c->verbose >= 0)
+				pr_err("%s attached to %s, which has been started.\n",
+				       devname, chosen_name);
+			rv = 0;
+			wait_for(chosen_name, mdfd);
+			/* We just started the array, so some devices
+			 * might have been evicted from the array
+			 * because their event counts were too old.
+			 * If the action=re-add policy is in-force for
+			 * those devices we should re-add them now.
+			 * sra may be NULL here (see the RUN_ARRAY test
+			 * above), so guard the list head.
+			 */
+			for (dsk = sra ? sra->devs : NULL; dsk ; dsk = dsk->next) {
+				if (disk_action_allows(dsk, st->ss->name, act_re_add) &&
+				    add_disk(mdfd, st, sra, dsk) == 0)
+					pr_err("%s re-added to %s\n",
+					       dsk->sys_name, chosen_name);
+			}
+		} else {
+			pr_err("%s attached to %s, but failed to start: %s.\n",
+			       devname, chosen_name, strerror(errno));
+			rv = 1;
+		}
+	} else {
+		if (c->export) {
+			printf("MD_STARTED=unsafe\n");
+		} else if (journal_device_missing) {
+			pr_err("Journal device is missing, not safe to start yet.\n");
+		} else if (c->verbose >= 0)
+			pr_err("%s attached to %s, not enough to start safely.\n",
+			       devname, chosen_name);
+		rv = 0;
+	}
+out:
+	free(avail);
+	if (dfd >= 0)
+		close(dfd);
+	if (mdfd >= 0)
+		close(mdfd);
+	if (policy)
+		dev_policy_free(policy);
+	if (sra)
+		sysfs_free(sra);
+	return rv;
+out_unlock:
+	map_unlock(&map);
+	goto out;
+}
+
+/*
+ * find_reject - evict stale members that claim a given disk number.
+ *
+ * Walks the devices listed in sra, reads each one's superblock, and for
+ * every device whose disk.number equals 'number' and whose event count is
+ * lower than 'events', clears its slot (if it has one) and asks for its
+ * removal via sysfs.  Does nothing when GET_ARRAY_INFO succeeds on mdfd.
+ */
+static void find_reject(int mdfd, struct supertype *st, struct mdinfo *sra,
+			int number, __u64 events, int verbose,
+			char *array_name)
+{
+	struct mdinfo *member;
+	mdu_array_info_t ainfo;
+
+	/* not safe to remove from active arrays without thinking more */
+	if (ioctl(mdfd, GET_ARRAY_INFO, &ainfo) == 0)
+		return;
+
+	for (member = sra->devs; member; member = member->next) {
+		/* 2*11 bytes for ints (including sign) + colon + null byte */
+		char devid[24];
+		struct mdinfo sbinfo;
+		int devfd;
+		int load_failed;
+
+		sprintf(devid, "%d:%d", member->disk.major, member->disk.minor);
+		devfd = dev_open(devid, O_RDONLY);
+		if (devfd < 0)
+			continue;
+
+		load_failed = st->ss->load_super(st, devfd, NULL);
+		if (!load_failed) {
+			st->ss->getinfo_super(st, &sbinfo, NULL);
+			st->ss->free_super(st);
+		}
+		close(devfd);
+		if (load_failed)
+			continue;
+
+		/* Only a same-numbered device with an older event count
+		 * is a candidate for removal.
+		 */
+		if (sbinfo.disk.number == number && sbinfo.events < events) {
+			if (member->disk.raid_disk > -1)
+				sysfs_set_str(sra, member, "slot", "none");
+			if (sysfs_set_str(sra, member, "state", "remove") == 0 &&
+			    verbose >= 0)
+				pr_err("removing old device %s from %s\n",
+				       member->sys_name+4, array_name);
+		}
+	}
+}
+
+/*
+ * count_active - work out which member devices are usable for starting.
+ *
+ * Reads the superblock of every device listed in sra and classifies the
+ * ones marked MD_DISK_SYNC by event count relative to the highest count
+ * seen.
+ *
+ * On return:
+ *  - *availp points to a newly allocated array of raid_disks bytes, one per
+ *    slot: 2 = device at the highest event count, 1 = one event behind,
+ *    0 = nothing usable.  It is left untouched if no superblock loads.
+ *  - bestinfo holds the info of the most recent SYNC device seen, and
+ *    bestinfo->journal_clean is set to 1 if any device has the journal role.
+ *  - devices that should be dropped have MD_DISK_REMOVED set in
+ *    d->disk.state.
+ *
+ * Returns the number of slots still marked available plus the number of
+ * replacement devices, or 0 if sra is NULL or no superblock could be read.
+ * mdfd is not referenced.
+ */
+static int count_active(struct supertype *st, struct mdinfo *sra,
+ int mdfd, char **availp,
+ struct mdinfo *bestinfo)
+{
+ /* count how many devices in sra think they are active */
+ struct mdinfo *d;
+ int cnt = 0;
+ int replcnt = 0;
+ __u64 max_events = 0;
+ char *avail = NULL;
+ int *best = NULL;
+ char *devmap = NULL;
+ int numdevs = 0;
+ int devnum;
+ int b, i;
+ int raid_disks = 0;
+
+ if (!sra)
+ return 0;
+
+ /* numdevs sizes devmap: one row of raid_disks bytes per device */
+ for (d = sra->devs ; d ; d = d->next)
+ numdevs++;
+ for (d = sra->devs, devnum = 0 ; d ; d = d->next, devnum++) {
+ char dn[30];
+ int dfd;
+ int ok;
+ struct mdinfo info;
+
+ sprintf(dn, "%d:%d", d->disk.major, d->disk.minor);
+ dfd = dev_open(dn, O_RDONLY);
+ if (dfd < 0)
+ continue;
+ ok = st->ss->load_super(st, dfd, NULL);
+ close(dfd);
+ if (ok != 0)
+ continue;
+
+ /* Until the first superblock has been seen, devmap is NULL and
+ * raid_disks is 0, so the map argument below is NULL.
+ */
+ info.array.raid_disks = raid_disks;
+ st->ss->getinfo_super(st, &info, devmap + raid_disks * devnum);
+ if (info.disk.raid_disk == MD_DISK_ROLE_JOURNAL)
+ bestinfo->journal_clean = 1;
+ if (!avail) {
+ /* First readable superblock: size the tables from it, then
+ * re-read so this device's row of devmap gets filled in.
+ */
+ raid_disks = info.array.raid_disks;
+ avail = xcalloc(raid_disks, 1);
+ *availp = avail;
+
+ best = xcalloc(raid_disks, sizeof(int));
+ devmap = xcalloc(raid_disks, numdevs);
+
+ st->ss->getinfo_super(st, &info, devmap);
+ }
+
+ if (info.disk.state & (1<<MD_DISK_SYNC))
+ {
+ /* cnt is only used here as a "have we seen a SYNC device yet"
+ * flag; it is recomputed after the loop.
+ */
+ if (cnt == 0) {
+ cnt++;
+ max_events = info.events;
+ avail[info.disk.raid_disk] = 2;
+ best[info.disk.raid_disk] = devnum;
+ st->ss->getinfo_super(st, bestinfo, NULL);
+ } else if (info.events == max_events) {
+ avail[info.disk.raid_disk] = 2;
+ best[info.disk.raid_disk] = devnum;
+ } else if (info.events == max_events-1) {
+ /* one behind: only take the slot if nothing better holds it */
+ if (avail[info.disk.raid_disk] == 0) {
+ avail[info.disk.raid_disk] = 1;
+ best[info.disk.raid_disk] = devnum;
+ }
+ } else if (info.events < max_events - 1)
+ ;
+ else if (info.events == max_events+1) {
+ /* new maximum, one ahead: demote every existing entry a step */
+ int i;
+ max_events = info.events;
+ for (i = 0; i < raid_disks; i++)
+ if (avail[i])
+ avail[i]--;
+ avail[info.disk.raid_disk] = 2;
+ best[info.disk.raid_disk] = devnum;
+ st->ss->getinfo_super(st, bestinfo, NULL);
+ } else { /* info.events much bigger */
+ memset(avail, 0, raid_disks);
+ max_events = info.events;
+ avail[info.disk.raid_disk] = 2;
+ best[info.disk.raid_disk] = devnum;
+ st->ss->getinfo_super(st, bestinfo, NULL);
+ }
+ } else if (info.disk.state & (1<<MD_DISK_REPLACEMENT))
+ replcnt++;
+ st->ss->free_super(st);
+ }
+
+ if (!avail)
+ return 0;
+ /* We need to reject any device that thinks the best device is
+ * failed or missing */
+ /* b becomes the first slot holding a device at max_events */
+ for (b = 0; b < raid_disks; b++)
+ if (avail[b] == 2)
+ break;
+ cnt = 0;
+ for (i = 0 ; i < raid_disks ; i++) {
+ if (i != b && avail[i])
+ if (devmap[raid_disks * best[i] + b] == 0) {
+ /* This device thinks 'b' is failed -
+ * don't use it */
+ /* walk the list to the best[i]-th device */
+ devnum = best[i];
+ for (d=sra->devs ; devnum; d = d->next)
+ devnum--;
+ d->disk.state |= (1 << MD_DISK_REMOVED);
+ avail[i] = 0;
+ }
+ if (avail[i])
+ cnt++;
+ }
+ /* Also need to reject any spare device with an event count that
+ * is too high
+ */
+ for (d = sra->devs; d; d = d->next) {
+ if (!(d->disk.state & (1<<MD_DISK_SYNC)) &&
+ d->events > max_events)
+ d->disk.state |= (1 << MD_DISK_REMOVED);
+ }
+ /* avail is handed back through *availp; only the scratch tables go */
+ free(best);
+ free(devmap);
+ return cnt + replcnt;
+}
+
+/*
+ * Walk the map list and, for every entry whose metadata names container
+ * 'me', ask the kernel how degraded that array is.  Returns the largest
+ * (raid_disks - active_disks - spare_disks) found, or 0 if none was
+ * positive or no member could be queried.
+ */
+static int container_members_max_degradation(struct map_ent *map, struct map_ent *me)
+{
+	int worst = 0;
+
+	while (map) {
+		struct map_ent *cur = map;
+		mdu_array_info_t info;
+		int fd;
+
+		map = map->next;
+
+		if (!metadata_container_matches(cur->metadata, me->devnm))
+			continue;
+		fd = open_dev(cur->devnm);
+		if (fd < 0)
+			continue;
+		/* most accurate information regarding array degradation */
+		if (ioctl(fd, GET_ARRAY_INFO, &info) >= 0) {
+			int missing = info.raid_disks - info.active_disks -
+				      info.spare_disks;
+
+			if (missing > worst)
+				worst = missing;
+		}
+		close(fd);
+	}
+	return worst;
+}
+
+/*
+ * array_try_spare - try to add a metadata-less device to an array or
+ * container as a spare.
+ *
+ * @devname: name used in messages
+ * @dfdp:    open descriptor for the device; closed and set to -1 once an
+ *           array has been chosen and could be opened
+ * @pol:     policy of the device
+ * @target:  preferred array (may be NULL)
+ * @bare:    non-zero if the device looked blank
+ * @st:      if set, only arrays with this metadata are considered
+ * @verbose: message level
+ *
+ * Return 0 on success, or some exit code on failure, probably 1.
+ */
+static int array_try_spare(char *devname, int *dfdp, struct dev_policy *pol,
+			   struct map_ent *target, int bare,
+			   struct supertype *st, int verbose)
+{
+	/* This device doesn't have any md metadata
+	 * The device policy allows 'spare' and if !bare, it allows spare-same-slot.
+	 * If 'st' is not set, then we only know that some metadata allows this,
+	 * others possibly don't.
+	 * So look for a container or array to attach the device to.
+	 * Prefer 'target' if that is set and the array is found.
+	 */
+	int rv = 1;
+	struct stat stb;
+	struct map_ent *mp, *map = NULL;
+	struct mdinfo *chosen = NULL;
+	int dfd = *dfdp;
+
+	if (fstat(dfd, &stb) != 0)
+		return 1;
+
+	/*
+	 * Now we need to find a suitable array to add this to.
+	 * We only accept arrays that:
+	 *  - match 'st'
+	 *  - are in the same domains as the device
+	 *  - are of a size for which the device will be useful
+	 * and we choose the one that is the most degraded
+	 */
+
+	if (map_lock(&map)) {
+		pr_err("failed to get exclusive lock on mapfile\n");
+		return 1;
+	}
+	for (mp = map ; mp ; mp = mp->next) {
+		struct supertype *st2;
+		struct domainlist *dl = NULL;
+		struct mdinfo *sra;
+		unsigned long long devsize = 0;
+		unsigned long long component_size = 0;
+
+		if (is_subarray(mp->metadata))
+			continue;
+		if (st) {
+			st2 = st->ss->match_metadata_desc(mp->metadata);
+			if (!st2 ||
+			    (st->minor_version >= 0 &&
+			     st->minor_version != st2->minor_version)) {
+				if (verbose > 1)
+					pr_err("not adding %s to %s as metadata type doesn't match\n",
+					       devname, mp->path);
+				free(st2);
+				continue;
+			}
+			free(st2);
+		}
+		sra = sysfs_read(-1, mp->devnm,
+				 GET_DEVS|GET_OFFSET|GET_SIZE|GET_STATE|
+				 GET_DEGRADED|GET_COMPONENT|GET_VERSION);
+		if (!sra) {
+			/* Probably a container - no degraded info */
+			sra = sysfs_read(-1, mp->devnm,
+					 GET_DEVS|GET_OFFSET|GET_SIZE|GET_STATE|
+					 GET_COMPONENT|GET_VERSION);
+			if (sra)
+				sra->array.failed_disks = -1;
+		}
+		if (!sra)
+			continue;
+		if (st == NULL) {
+			int i;
+			st2 = NULL;
+			for(i = 0; !st2 && superlist[i]; i++)
+				st2 = superlist[i]->match_metadata_desc(
+					sra->text_version);
+			if (!st2) {
+				if (verbose > 1)
+					pr_err("not adding %s to %s as metadata not recognised.\n",
+					       devname, mp->path);
+				goto next;
+			}
+			/* Need to double check the 'act_spare' permissions applies
+			 * to this metadata.
+			 */
+			if (!policy_action_allows(pol, st2->ss->name, act_spare))
+				goto next;
+			if (!bare && !policy_action_allows(pol, st2->ss->name,
+							   act_spare_same_slot))
+				goto next;
+		} else
+			st2 = st;
+		/* update number of failed disks for mostly degraded
+		 * container member */
+		if (sra->array.failed_disks == -1)
+			sra->array.failed_disks = container_members_max_degradation(map, mp);
+
+		/* Without a size we cannot judge whether the device is big
+		 * enough, so do not compare against an uninitialised value.
+		 */
+		if (get_dev_size(dfd, NULL, &devsize) == 0)
+			goto next;
+		if (sra->component_size == 0) {
+			/* true for containers, here we must read superblock
+			 * to obtain minimum spare size */
+			struct supertype *st3 = dup_super(st2);
+			int mdfd = open_dev(mp->devnm);
+			if (mdfd < 0) {
+				free(st3);
+				goto next;
+			}
+			if (st3->ss->load_container &&
+			    !st3->ss->load_container(st3, mdfd, mp->path)) {
+				/* the handler is optional; with none the
+				 * minimum stays 0 (no size restriction)
+				 */
+				if (st3->ss->min_acceptable_spare_size)
+					component_size = st3->ss->min_acceptable_spare_size(st3);
+				st3->ss->free_super(st3);
+			}
+			free(st3);
+			close(mdfd);
+		}
+		if ((sra->component_size > 0 &&
+		     st2->ss->avail_size(st2, devsize,
+					 sra->devs
+					 ? sra->devs->data_offset
+					 : INVALID_SECTORS)
+		     < sra->component_size)
+		    ||
+		    (sra->component_size == 0 && devsize < component_size)) {
+			if (verbose > 1)
+				pr_err("not adding %s to %s as it is too small\n",
+				       devname, mp->path);
+			goto next;
+		}
+		/* test against target.
+		 * If 'target' is set and 'bare' is false, we only accept
+		 * arrays/containers that match 'target'.
+		 * If 'target' is set and 'bare' is true, we prefer the
+		 * array which matches 'target'.
+		 * target is considered only if we deal with degraded array
+		 */
+		if (target && policy_action_allows(pol, st2->ss->name,
+						   act_spare_same_slot)) {
+			if (strcmp(target->metadata, mp->metadata) == 0 &&
+			    memcmp(target->uuid, mp->uuid,
+				   sizeof(target->uuid)) == 0 &&
+			    sra->array.failed_disks > 0) {
+				/* This is our target!! */
+				if (chosen)
+					sysfs_free(chosen);
+				chosen = sra;
+				sra = NULL;
+				/* skip to end so we don't check any more */
+				while (mp->next)
+					mp = mp->next;
+				goto next;
+			}
+			/* not our target */
+			if (!bare)
+				goto next;
+		}
+
+		dl = domain_from_array(sra, st2->ss->name);
+		if (domain_test(dl, pol, st2->ss->name) != 1) {
+			/* domain test fails */
+			if (verbose > 1)
+				pr_err("not adding %s to %s as it is not in a compatible domain\n",
+				       devname, mp->path);
+
+			goto next;
+		}
+		/* all tests passed, OK to add to this array */
+		if (!chosen) {
+			chosen = sra;
+			sra = NULL;
+		} else if (chosen->array.failed_disks < sra->array.failed_disks) {
+			sysfs_free(chosen);
+			chosen = sra;
+			sra = NULL;
+		}
+	next:
+		if (sra)
+			sysfs_free(sra);
+		if (st != st2)
+			free(st2);
+		if (dl)
+			domain_free(dl);
+	}
+	if (chosen) {
+		/* add current device to chosen array as a spare */
+		int mdfd = open_dev(chosen->sys_name);
+		if (mdfd >= 0) {
+			struct mddev_dev devlist;
+			char chosen_devname[24]; // 2*11 for int (including signs) + colon + null
+
+			/* clear every field, including any not set below */
+			memset(&devlist, 0, sizeof(devlist));
+			devlist.next = NULL;
+			devlist.used = 0;
+			devlist.writemostly = 0;
+			devlist.devname = chosen_devname;
+			sprintf(chosen_devname, "%d:%d", major(stb.st_rdev),
+				minor(stb.st_rdev));
+			devlist.disposition = 'a';
+			close(dfd);
+			*dfdp = -1;
+			rv = Manage_subdevs(chosen->sys_name, mdfd, &devlist,
+					    -1, 0, NULL, 0);
+			close(mdfd);
+		}
+		if (verbose > 0) {
+			if (rv == 0)
+				pr_err("added %s as spare for %s\n",
+				       devname, chosen->sys_name);
+			else
+				pr_err("failed to add %s as spare for %s\n",
+				       devname, chosen->sys_name);
+		}
+		sysfs_free(chosen);
+	}
+	map_unlock(&map);
+	return rv;
+}
+
+/*
+ * partition_try_spare - give 'devname' the partition metadata of a
+ * compatible whole-disk device.
+ *
+ * @devname: device to write to (opened here read-write)
+ * @dfdp:    caller's descriptor for the device; not used because it is
+ *           read-only
+ * @pol:     policy of the device
+ * @st:      if set, only this (partition) metadata type is considered
+ * @verbose: message level (currently unused here)
+ *
+ * Returns 0 when metadata was copied to 'devname', 1 otherwise.
+ */
+static int partition_try_spare(char *devname, int *dfdp, struct dev_policy *pol,
+			       struct supertype *st, int verbose)
+{
+	/* we know that at least one partition virtual-metadata is
+	 * allowed to incorporate spares like this device.  We need to
+	 * find a suitable device to copy partition information from.
+	 *
+	 * Getting a list of all disk (not partition) devices is
+	 * slightly non-trivial.  We could look at /sys/block, but
+	 * that is theoretically due to be removed.  Maybe best to use
+	 * /dev/disk/by-path/?* and ignore names ending '-partNN' as
+	 * we depend on this directory of 'path' info.  But that fails
+	 * to find loop devices and probably others.  Maybe don't
+	 * worry about that, they aren't the real target.
+	 *
+	 * So: check things in /dev/disk/by-path to see if they are in
+	 * a compatible domain, then load the partition table and see
+	 * if it is OK for the new device, and choose the largest
+	 * partition table that fits.
+	 */
+	DIR *dir;
+	struct dirent *de;
+	char *chosen = NULL;
+	unsigned long long chosen_size = 0;
+	struct supertype *chosen_st = NULL;
+	int wfd;
+	int rv = 1;
+
+	dir = opendir("/dev/disk/by-path");
+	if (!dir)
+		return 1;
+	while ((de = readdir(dir)) != NULL) {
+		char *ep;
+		struct dev_policy *pol2 = NULL;
+		struct domainlist *domlist = NULL;
+		int cfd = -1;		/* candidate device */
+		struct mdinfo info;
+		struct supertype *st2 = NULL;
+		char *cand = NULL;	/* path of candidate device */
+		unsigned long long devsectors;
+
+		if (de->d_ino == 0 ||
+		    de->d_name[0] == '.' ||
+		    (de->d_type != DT_LNK && de->d_type != DT_UNKNOWN))
+			goto next;
+
+		/* strip trailing digits, then look for "-part" before them */
+		ep = de->d_name + strlen(de->d_name);
+		while (ep > de->d_name &&
+		       isdigit((unsigned char)ep[-1]))
+			ep--;
+		if (ep > de->d_name + 5 &&
+		    strncmp(ep-5, "-part", 5) == 0)
+			/* This is a partition - skip it */
+			goto next;
+
+		pol2 = path_policy(de->d_name, type_disk);
+
+		domain_merge(&domlist, pol2, st ? st->ss->name : NULL);
+		if (domain_test(domlist, pol, st ? st->ss->name : NULL) != 1)
+			/* new device is incompatible with this device. */
+			goto next;
+
+		domain_free(domlist);
+		domlist = NULL;
+
+		/* asprintf() returns the length written, or a negative
+		 * value on failure (leaving the pointer undefined).
+		 */
+		if (asprintf(&cand, "/dev/disk/by-path/%s", de->d_name) < 0) {
+			cand = NULL;
+			goto next;
+		}
+		cfd = open(cand, O_RDONLY);
+		if (cfd < 0)
+			goto next;
+		if (get_dev_size(cfd, cand, &devsectors) == 0)
+			goto next;
+		devsectors >>= 9;
+
+		if (st)
+			st2 = dup_super(st);
+		else
+			st2 = guess_super_type(cfd, guess_partitions);
+		if (st2 == NULL ||
+		    st2->ss->load_super(st2, cfd, NULL) < 0)
+			goto next;
+		st2->ignore_hw_compat = 0;
+
+		if (!st) {
+			/* Check domain policy again, this time referring to metadata */
+			domain_merge(&domlist, pol2, st2->ss->name);
+			if (domain_test(domlist, pol, st2->ss->name) != 1)
+				/* Incompatible devices for this metadata type */
+				goto next;
+			if (!policy_action_allows(pol, st2->ss->name, act_spare))
+				/* Some partition types allow sparing, but not
+				 * this one.
+				 */
+				goto next;
+		}
+
+		st2->ss->getinfo_super(st2, &info, NULL);
+		if (info.component_size > devsectors)
+			/* This partitioning doesn't fit in the device */
+			goto next;
+
+		/* This is an acceptable device to copy partition
+		 * metadata from.  We could just stop here, but I
+		 * think I want to keep looking in case a larger
+		 * metadata which makes better use of the device can
+		 * be found.
+		 */
+		if (chosen == NULL ||
+		    chosen_size < info.component_size) {
+			chosen_size = info.component_size;
+			free(chosen);
+			chosen = cand;
+			cand = NULL;
+			if (chosen_st) {
+				chosen_st->ss->free_super(chosen_st);
+				free(chosen_st);
+			}
+			chosen_st = st2;
+			st2 = NULL;
+		}
+
+	next:
+		free(cand);
+		domain_free(domlist);
+		dev_policy_free(pol2);
+		if (st2)
+			st2->ss->free_super(st2);
+		free(st2);
+
+		if (cfd >= 0)
+			close(cfd);
+	}
+
+	closedir(dir);
+
+	if (!chosen)
+		return 1;
+
+	/* 'chosen' is the best device we can find.  Let's write its
+	 * metadata to devname.  dfd is read-only so don't use that.
+	 * Only report success if the metadata really was stored.
+	 */
+	wfd = open(devname, O_RDWR);
+	if (wfd >= 0) {
+		if (chosen_st->ss->store_super(chosen_st, wfd) == 0)
+			rv = 0;
+		close(wfd);
+	}
+	free(chosen);
+	chosen_st->ss->free_super(chosen_st);
+	free(chosen_st);
+	return rv;
+}
+
+/* A 4096-byte block counts as blank when every byte is the same and that
+ * byte is one of 0x00, 0x5a or 0xff.
+ */
+static int block_is_blank(const char *blk)
+{
+	if (blk[0] != '\0' && blk[0] != '\x5a' && blk[0] != '\xff')
+		return 0;
+	/* comparing the block with itself shifted by one checks that all
+	 * bytes are equal to the first
+	 */
+	return memcmp(blk, blk+1, 4095) == 0;
+}
+
+/*
+ * is_bare - report whether the device behind 'dfd' looks unused.
+ * Both the first and the last 4K of the device must be blank.
+ * Returns 1 if so, 0 if not or if either block cannot be read.
+ */
+static int is_bare(int dfd)
+{
+	unsigned long long size = 0;
+	char bufpad[4096 + 4096];
+	/* 4K-aligned window inside bufpad */
+	char *buf = (char*)(((long)bufpad + 4096) & ~4095);
+
+	if (lseek(dfd, 0, SEEK_SET) != 0)
+		return 0;
+	if (read(dfd, buf, 4096) != 4096)
+		return 0;
+	if (!block_is_blank(buf))
+		return 0;
+
+	/* OK, first 4K appear blank, try the end. */
+	get_dev_size(dfd, NULL, &size);
+	if (lseek(dfd, size-4096, SEEK_SET) < 0)
+		return 0;
+	if (read(dfd, buf, 4096) != 4096)
+		return 0;
+
+	return block_is_blank(buf) ? 1 : 0;
+}
+
+/* Adding a spare to a regular array and adding one to a set-of-partitions
+ * virtual array are quite different operations.
+ * try_spare() works out which of the two is worth attempting for this
+ * device and attempts it, giving arrays priority over partitions.
+ * Returns 0 if the device was used, non-zero otherwise.
+ */
+static int try_spare(char *devname, int *dfdp, struct dev_policy *pol,
+		     struct map_ent *target,
+		     struct supertype *st, int verbose)
+{
+	int arrays_ok = 0;
+	int partitions_ok = 0;
+	int bare;
+	int idx;
+	int result;
+
+	/* A device with no domain can never be a spare */
+	if (pol_find(pol, pol_domain) == NULL)
+		return 1;
+	/* ... nor can one for which no action permits spares */
+	if (!policy_action_allows(pol, st?st->ss->name:NULL, act_spare))
+		return 1;
+
+	/* A bare device can always be added as a spare.
+	 * A non-bare device can only be added if spare-same-slot is
+	 * permitted and it is replacing a previous device, in which case
+	 * 'target' will be set.
+	 * Later - may allow force_spare without target.
+	 */
+	bare = is_bare(*dfdp) ? 1 : 0;
+	if (!bare &&
+	    (!target ||
+	     !policy_action_allows(pol, st?st->ss->name:NULL,
+				   act_spare_same_slot))) {
+		if (verbose > 1)
+			pr_err("%s is not bare, so not considering as a spare\n",
+			       devname);
+		return 1;
+	}
+
+	if (st) {
+		/* Metadata is known: it alone decides between the 'array'
+		 * and the 'partition' approach.
+		 */
+		if (!st->ss->add_to_super)
+			return partition_try_spare(devname, dfdp, pol,
+						   st, verbose);
+		return array_try_spare(devname, dfdp, pol, target, bare,
+				       st, verbose);
+	}
+
+	/* No metadata was specified or found so options are open.
+	 * Find out whether any array metadata, and whether any partition
+	 * metadata, would allow the spare.  This only serves to avoid a
+	 * more costly scan of all arrays when that is certain to fail.
+	 */
+	idx = 0;
+	while ((!arrays_ok || !partitions_ok) && superlist[idx]) {
+		/* handlers with add_to_super manage arrays, the others
+		 * manage partition tables
+		 */
+		int *flag = superlist[idx]->add_to_super ? &arrays_ok
+							 : &partitions_ok;
+
+		if (!*flag &&
+		    policy_action_allows(pol, superlist[idx]->name, act_spare))
+			*flag = 1;
+		idx++;
+	}
+
+	result = 1;
+	if (arrays_ok)
+		result = array_try_spare(devname, dfdp, pol, target, bare,
+					 st, verbose);
+	if (result != 0 && partitions_ok)
+		result = partition_try_spare(devname, dfdp, pol, st, verbose);
+	return result;
+}
+
+/*
+ * IncrementalScan - try to start arrays recorded in the map file that
+ * are not yet running.
+ *
+ * @c:     run-time context (verbosity etc.)
+ * @devnm: if not NULL, restrict the scan to this array; when it names a
+ *         member of a container the container is processed instead and
+ *         only that member is assembled
+ *
+ * Returns 0 on success, 1 if any array or container failed to start.
+ */
+int IncrementalScan(struct context *c, char *devnm)
+{
+	/* look at every device listed in the 'map' file.
+	 * If one is found that is not running then:
+	 *   look in mdadm.conf for bitmap file.
+	 *   if one exists, but array has none, add it.
+	 *   try to start array in auto-readonly mode
+	 */
+	struct map_ent *mapl = NULL;
+	struct map_ent *me;
+	struct mddev_ident *devs, *mddev;
+	int rv = 0;
+	char container[32];
+	char *only = NULL;
+
+	map_read(&mapl);
+	devs = conf_get_ident(NULL);
+
+restart:
+	for (me = mapl ; me ; me = me->next) {
+		mdu_array_info_t array;
+		mdu_bitmap_file_t bmf;
+		struct mdinfo *sra;
+		int mdfd;
+
+		if (devnm && strcmp(devnm, me->devnm) != 0)
+			continue;
+		if (devnm && me->metadata[0] == '/') {
+			char *sl;
+			/* member array, need to work on container */
+			strncpy(container, me->metadata+1, 32);
+			container[31] = 0;
+			sl = strchr(container, '/');
+			if (sl)
+				*sl = 0;
+			only = devnm;
+			devnm = container;
+			goto restart;
+		}
+		mdfd = open_dev(me->devnm);
+
+		if (mdfd < 0)
+			continue;
+		if (!isdigit((unsigned char)me->metadata[0])) {
+			/* must be a container */
+			struct supertype *st = super_by_fd(mdfd, NULL);
+			int ret = 0;
+			struct map_ent *map = NULL;
+
+			if (st && st->ss->load_container)
+				ret = st->ss->load_container(st, mdfd, NULL);
+			close(mdfd);
+			if (!ret && st && st->ss->container_content) {
+				if (map_lock(&map))
+					pr_err("failed to get exclusive lock on mapfile\n");
+				ret = Incremental_container(st, me->path, c, only);
+				map_unlock(&map);
+			}
+			if (ret)
+				rv = 1;
+			continue;
+		}
+		/* only arrays that report ENODEV (not yet set up) are
+		 * candidates for starting
+		 */
+		if (ioctl(mdfd, GET_ARRAY_INFO, &array) == 0 ||
+		    errno != ENODEV) {
+			close(mdfd);
+			continue;
+		}
+		/* Ok, we can try this one.	 Maybe it needs a bitmap */
+		for (mddev = devs ; mddev ; mddev = mddev->next)
+			if (mddev->devname && me->path
+			    && devname_matches(mddev->devname, me->path))
+				break;
+		if (mddev && mddev->bitmap_file) {
+			/*
+			 * Note: early kernels will wrongly fail this, so it
+			 * is a hint only
+			 */
+			int added = -1;
+			/* NOTE(review): this queries GET_ARRAY_INFO into a
+			 * bitmap-file buffer; left unchanged because the
+			 * add below depends on this call failing - confirm
+			 * the intended request before touching it.
+			 */
+			if (ioctl(mdfd, GET_ARRAY_INFO, &bmf) < 0) {
+				int bmfd = open(mddev->bitmap_file, O_RDWR);
+				if (bmfd >= 0) {
+					added = ioctl(mdfd, SET_BITMAP_FILE,
+						      bmfd);
+					close(bmfd);
+				}
+			}
+			if (c->verbose >= 0) {
+				if (added == 0)
+					pr_err("Added bitmap %s to %s\n",
+					       mddev->bitmap_file, me->path);
+				else if (errno != EEXIST)
+					pr_err("Failed to add bitmap to %s: %s\n",
+					       me->path, strerror(errno));
+			}
+		}
+		/* FIXME check for reshape_active and consider not
+		 * starting array.
+		 */
+		sra = sysfs_read(mdfd, NULL, 0);
+		if (sra) {
+			if (sysfs_set_str(sra, NULL,
+					  "array_state", "read-auto") == 0) {
+				if (c->verbose >= 0)
+					pr_err("started array %s\n",
+					       me->path ?: me->devnm);
+			} else {
+				pr_err("failed to start array %s: %s\n",
+				       me->path ?: me->devnm,
+				       strerror(errno));
+				rv = 1;
+			}
+			sysfs_free(sra);
+		}
+	}
+	/* release the list obtained from map_read() */
+	map_free(mapl);
+	return rv;
+}
+
+/*
+ * Translate a container reference into a freshly allocated md device
+ * name.  A reference beginning with '/' is opened as a path; anything
+ * else is parsed as a UUID and looked up in the map.
+ * Returns NULL when the path cannot be opened, the UUID does not parse,
+ * or no map entry carries that UUID.
+ */
+static char *container2devname(char *devname)
+{
+	struct map_ent *map = NULL;
+	struct map_ent *hit;
+	char *result = NULL;
+	int uuid[4];
+	int fd;
+
+	if (devname[0] == '/') {
+		fd = open(devname, O_RDONLY);
+		if (fd < 0)
+			return NULL;
+		result = xstrdup(fd2devnm(fd));
+		close(fd);
+		return result;
+	}
+
+	if (!parse_uuid(devname, uuid))
+		return NULL;
+
+	hit = map_by_uuid(&map, uuid);
+	if (hit)
+		result = xstrdup(hit->devnm);
+	map_free(map);
+
+	return result;
+}
+
+/*
+ * Incremental_container - assemble the arrays found inside a loaded
+ * container.
+ *
+ * @st:      supertype with the container's metadata already loaded
+ * @devname: name of the container, used in messages and as the
+ *           destination when spares are moved
+ * @c:       run-time context (runstop, export, verbose, homehost, autof)
+ * @only:    if not NULL, only the member already mapped to this md
+ *           device name is assembled
+ *
+ * Returns 0 normally, 2 when a member device could not be opened or
+ * created, or when conf_match() reported 2 without a match.
+ */
+static int Incremental_container(struct supertype *st, char *devname,
+				 struct context *c, char *only)
+{
+	/* Collect the contents of this container and for each
+	 * array, choose a device name and assemble the array.
+	 */
+
+	struct mdinfo *list;
+	struct mdinfo *ra;
+	struct map_ent *map = NULL;
+	struct mdinfo info;
+	int trustworthy;
+	struct mddev_ident *match;
+	int rv = 0;
+	struct domainlist *domains;
+	struct map_ent *smp;
+	int suuid[4];
+	int sfd;
+	int ra_blocked = 0;
+	int ra_all = 0;
+	int result = 0;
+
+	st->ss->getinfo_super(st, &info, NULL);
+
+	if ((c->runstop > 0 && info.container_enough >= 0) ||
+	    info.container_enough > 0)
+		/* pass */;
+	else {
+		if (c->export) {
+			printf("MD_STARTED=no\n");
+		} else if (c->verbose)
+			pr_err("not enough devices to start the container\n");
+		return 0;
+	}
+
+	match = conf_match(st, &info, devname, c->verbose, &rv);
+	if (match == NULL && rv == 2)
+		return rv;
+
+	/* Need to compute 'trustworthy' */
+	if (match)
+		trustworthy = LOCAL;
+	else if (st->ss->match_home(st, c->homehost) == 1)
+		trustworthy = LOCAL;
+	else if (st->ss->match_home(st, "any") == 1)
+		trustworthy = LOCAL;
+	else
+		trustworthy = FOREIGN;
+
+	list = st->ss->container_content(st, NULL);
+	/* when nothing to activate - quit */
+	if (list == NULL) {
+		if (c->export) {
+			printf("MD_STARTED=nothing\n");
+		}
+		return 0;
+	}
+	for (ra = list ; ra ; ra = ra->next) {
+		int mdfd;
+		char chosen_name[1024];
+		struct map_ent *mp;
+		struct mddev_ident *member_match = NULL;
+
+		ra_all++;
+		/* do not activate arrays blocked by metadata handler */
+		if (ra->array.state & (1 << MD_SB_BLOCK_VOLUME)) {
+			pr_err("Cannot activate array %s in %s.\n",
+				ra->text_version, devname);
+			ra_blocked++;
+			continue;
+		}
+		mp = map_by_uuid(&map, ra->uuid);
+
+		/* When restricted to one member, skip everything else
+		 * before opening anything so no descriptor is left open.
+		 */
+		if (only && (!mp || strcmp(mp->devnm, only) != 0))
+			continue;
+
+		if (mp) {
+			mdfd = open_dev(mp->devnm);
+			if (mp->path)
+				strcpy(chosen_name, mp->path);
+			else
+				strcpy(chosen_name, mp->devnm);
+		} else {
+
+			/* Check in mdadm.conf for container == devname and
+			 * member == ra->text_version after second slash.
+			 */
+			char *sub = strchr(ra->text_version+1, '/');
+			struct mddev_ident *array_list;
+			if (sub) {
+				sub++;
+				array_list = conf_get_ident(NULL);
+			} else
+				array_list = NULL;
+			for(; array_list ; array_list = array_list->next) {
+				char *dn;
+				if (array_list->member == NULL ||
+				    array_list->container == NULL)
+					continue;
+				if (strcmp(array_list->member, sub) != 0)
+					continue;
+				if (array_list->uuid_set &&
+				    !same_uuid(ra->uuid, array_list->uuid, st->ss->swapuuid))
+					continue;
+				dn = container2devname(array_list->container);
+				if (dn == NULL)
+					continue;
+				if (strncmp(dn, ra->text_version+1,
+					    strlen(dn)) != 0 ||
+				    ra->text_version[strlen(dn)+1] != '/') {
+					free(dn);
+					continue;
+				}
+				free(dn);
+				/* we have a match */
+				member_match = array_list;
+				if (c->verbose>0)
+					pr_err("match found for member %s\n",
+						array_list->member);
+				break;
+			}
+
+			if (member_match && member_match->devname &&
+			    strcasecmp(member_match->devname, "<ignore>") == 0) {
+				if (c->verbose > 0)
+					pr_err("array %s/%s is explicitly ignored by mdadm.conf\n",
+					       member_match->container,
+					       member_match->member);
+				continue;
+			}
+			if (member_match)
+				trustworthy = LOCAL;
+
+			mdfd = create_mddev(member_match ? member_match->devname : NULL,
+					    ra->name,
+					    c->autof,
+					    trustworthy,
+					    chosen_name);
+		}
+
+		if (mdfd < 0) {
+			pr_err("failed to open %s: %s.\n",
+				chosen_name, strerror(errno));
+			rv = 2;
+			goto release;
+		}
+
+		assemble_container_content(st, mdfd, ra, c,
+					   chosen_name, &result);
+		close(mdfd);
+	}
+	/* from here on the function reports success */
+	rv = 0;
+	if (c->export && result) {
+		char sep = '=';
+		printf("MD_STARTED");
+		if (result & INCR_NO) {
+			printf("%cno", sep);
+			sep = ',';
+		}
+		if (result & INCR_UNSAFE) {
+			printf("%cunsafe", sep);
+			sep = ',';
+		}
+		if (result & INCR_ALREADY) {
+			printf("%calready", sep);
+			sep = ',';
+		}
+		if (result & INCR_YES) {
+			printf("%cyes", sep);
+			sep = ',';
+		}
+		printf("\n");
+	}
+
+	/* don't move spares to container with volume being activated
+	   when all volumes are blocked */
+	if (ra_all == ra_blocked)
+		goto release;
+
+	/* Now move all suitable spares from spare container */
+	domains = domain_from_array(list, st->ss->name);
+	memcpy(suuid, uuid_zero, sizeof(int[4]));
+	if (domains &&
+	    (smp = map_by_uuid(&map, suuid)) != NULL &&
+	    (sfd = open(smp->path, O_RDONLY)) >= 0) {
+		/* spare container found */
+		struct supertype *sst =
+			super_imsm.match_metadata_desc("imsm");
+		struct mdinfo *sinfo;
+		unsigned long long min_size = 0;
+		if (st->ss->min_acceptable_spare_size)
+			min_size = st->ss->min_acceptable_spare_size(st);
+		if (sst && !sst->ss->load_container(sst, sfd, NULL)) {
+			close(sfd);
+			sinfo = container_choose_spares(sst, min_size,
+							domains, NULL,
+							st->ss->name, 0);
+			sst->ss->free_super(sst);
+			if (sinfo){
+				int count = 0;
+				struct mdinfo *disks = sinfo->devs;
+				while (disks) {
+					/* move spare from spare
+					 * container to currently
+					 * assembled one
+					 */
+					if (move_spare(
+						    smp->path,
+						    devname,
+						    makedev(disks->disk.major,
+							    disks->disk.minor)))
+						count++;
+					disks = disks->next;
+				}
+				if (count)
+					pr_err("Added %d spare%s to %s\n",
+					       count, count>1?"s":"", devname);
+			}
+			sysfs_free(sinfo);
+		} else
+			close(sfd);
+		/* the supertype itself was allocated for us */
+		free(sst);
+	}
+	domain_free(domains);
+
+release:
+	/* map is loaded lazily by map_by_uuid(); 'mp' and 'smp' point into
+	 * it, so it can only be dropped once they are no longer needed
+	 */
+	map_free(map);
+	return rv;
+}
+
+/*
+ * run_udisks - run "udisks arg1 arg2" and wait for it to finish.
+ * /usr/bin/udisks is tried first, then /bin/udisks.  The exit status of
+ * the helper is not reported; failure to fork or exec is silently
+ * ignored (the call is best-effort).
+ */
+static void run_udisks(char *arg1, char *arg2)
+{
+	int pid = fork();
+	int status;
+
+	if (pid == 0) {
+		execl("/usr/bin/udisks", "udisks", arg1, arg2, NULL);
+		execl("/bin/udisks", "udisks", arg1, arg2, NULL);
+		/* exec failed: leave without running the parent's atexit
+		 * handlers or flushing its stdio buffers a second time
+		 */
+		_exit(1);
+	}
+	if (pid < 0)
+		return;
+	/* Wait for our child only, retrying when interrupted by a signal.
+	 * Any other error (e.g. ECHILD) ends the wait instead of looping
+	 * forever.
+	 */
+	while (waitpid(pid, &status, 0) < 0 && errno == EINTR)
+		;
+}
+
+/*
+ * IncrementalRemove - Attempt to see if the passed in device belongs to any
+ * raid arrays, and if so first fail (if needed) and then remove the device.
+ *
+ * @devname - The device we want to remove
+ * @id_path - name as found in /dev/disk/by-path for this device
+ * @verbose - message level; negative means quiet
+ *
+ * Note: the device name must be a kernel name like "sda", so
+ * that we can find it in /proc/mdstat
+ *
+ * Returns 0 on success, non-zero on failure.  If the device cannot be
+ * failed because the array is busy, an attempt is made to stop the
+ * whole array instead.
+ */
+int IncrementalRemove(char *devname, char *id_path, int verbose)
+{
+	int mdfd;
+	int rv = 0;
+	struct mdstat_ent *ent;
+	struct mddev_dev devlist;
+	struct mdinfo mdi;
+	char buf[32];
+
+	if (!id_path)
+		dprintf("incremental removal without --path <id_path> lacks the possibility to re-add new device in this port\n");
+
+	if (strchr(devname, '/')) {
+		pr_err("incremental removal requires a kernel device name, not a file: %s\n", devname);
+		return 1;
+	}
+	ent = mdstat_by_component(devname);
+	if (!ent) {
+		if (verbose >= 0)
+			pr_err("%s does not appear to be a component of any array\n", devname);
+		return 1;
+	}
+	sysfs_init(&mdi, -1, ent->devnm);
+	/* If the array can be opened exclusively, drop an active/clean
+	 * array back to read-auto.  Descriptor 0 is a valid result too.
+	 */
+	mdfd = open_dev_excl(ent->devnm);
+	if (mdfd >= 0) {
+		close(mdfd);
+		if (sysfs_get_str(&mdi, NULL, "array_state",
+				  buf, sizeof(buf)) > 0) {
+			if (strncmp(buf, "active", 6) == 0 ||
+			    strncmp(buf, "clean", 5) == 0)
+				sysfs_set_str(&mdi, NULL,
+					      "array_state", "read-auto");
+		}
+	}
+	mdfd = open_dev(ent->devnm);
+	if (mdfd < 0) {
+		if (verbose >= 0)
+			pr_err("Cannot open array %s!!\n", ent->devnm);
+		free_mdstat(ent);
+		return 1;
+	}
+
+	if (id_path) {
+		/* remember which path the device occupied */
+		struct map_ent *map = NULL, *me;
+		me = map_by_devnm(&map, ent->devnm);
+		if (me)
+			policy_save_path(id_path, me);
+		map_free(map);
+	}
+
+	memset(&devlist, 0, sizeof(devlist));
+	devlist.devname = devname;
+	devlist.disposition = 'f';
+	/* for a container, we must fail each member array */
+	if (ent->metadata_version &&
+	    strncmp(ent->metadata_version, "external:", 9) == 0) {
+		struct mdstat_ent *mdstat = mdstat_read(0, 0);
+		struct mdstat_ent *memb;
+		for (memb = mdstat ; memb ; memb = memb->next)
+			if (is_container_member(memb, ent->devnm)) {
+				int subfd = open_dev(memb->devnm);
+				if (subfd >= 0) {
+					rv |= Manage_subdevs(
+						memb->devnm, subfd,
+						&devlist, verbose, 0,
+						NULL, 0);
+					close(subfd);
+				}
+			}
+		free_mdstat(mdstat);
+	} else
+		rv |= Manage_subdevs(ent->devnm, mdfd, &devlist,
+				    verbose, 0, NULL, 0);
+	if (rv & 2) {
+		/* Failed due to EBUSY, try to stop the array.
+		 * Give udisks a chance to unmount it first.
+		 */
+		int devid = devnm2devid(ent->devnm);
+		run_udisks("--unmount", map_dev(major(devid),minor(devid), 0));
+		rv = Manage_stop(ent->devnm, mdfd, verbose, 1);
+		if (rv)
+			/* At least we can try to trigger a 'remove' */
+			sysfs_uevent(&mdi, "remove");
+		/* report like every other error here: unless quiet */
+		if (verbose >= 0) {
+			if (rv)
+				pr_err("Fail to stop %s too.\n", ent->devnm);
+		}
+	} else {
+		devlist.disposition = 'r';
+		rv = Manage_subdevs(ent->devnm, mdfd, &devlist,
+				    verbose, 0, NULL, 0);
+	}
+	close(mdfd);
+	free_mdstat(ent);
+	return rv;
+}
diff --git a/Kill.c b/Kill.c
new file mode 100644
index 0000000..f2fdb85
--- /dev/null
+++ b/Kill.c
@@ -0,0 +1,146 @@
+/*
+ * mdadm - manage Linux "md" devices aka RAID arrays.
+ *
+ * Copyright (C) 2001-2009 Neil Brown <neilb@suse.de>
+ *
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * Author: Neil Brown
+ * Email: <neilb@suse.de>
+ *
+ * Added by Dale Stephenson
+ * steph@snapserver.com
+ */
+
+#include "mdadm.h"
+#include "md_u.h"
+#include "md_p.h"
+
+int Kill(char *dev, struct supertype *st, int force, int verbose, int noexcl)
+{
+	/*
+	 * Kill overwrites the superblock on 'dev' with a zeroed one.
+	 * There is nothing clever here and it is definitely not safe.
+	 * The device is opened exclusively unless 'force' or 'noexcl'
+	 * is given.  With 'force', a superblock that fails to load with
+	 * a code of 2 or more is zeroed regardless.
+	 * Returns:
+	 *  0 - a zero superblock was successfully written out
+	 *  1 - failed to write the zero superblock
+	 *  2 - failed to open the device or find a superblock.
+	 */
+	int open_flags = O_RDWR;
+	int fd;
+	int rv;
+
+	if (!force && !noexcl)
+		open_flags |= O_EXCL;
+
+	fd = open(dev, open_flags);
+	if (fd < 0) {
+		if (verbose >= 0)
+			pr_err("Couldn't open %s for write - not zeroing\n",
+			       dev);
+		return 2;
+	}
+
+	if (!st)
+		st = guess_super(fd);
+	if (!st || !st->ss->init_super) {
+		if (verbose >= 0)
+			pr_err("Unrecognised md component device - %s\n", dev);
+		close(fd);
+		return 2;
+	}
+
+	st->ignore_hw_compat = 1;
+	rv = st->ss->load_super(st, fd, dev);
+	if (rv != 0 && !(force && rv >= 2)) {
+		/* could not load it and not told to press on */
+		close(fd);
+		return rv;
+	}
+
+	st->ss->free_super(st);
+	st->ss->init_super(st, NULL, 0, "", NULL, NULL,
+			   INVALID_SECTORS);
+	if (st->ss->store_super(st, fd) != 0) {
+		if (verbose >= 0)
+			pr_err("Could not zero superblock on %s\n",
+			       dev);
+		rv = 1;
+	} else if (rv != 0) {
+		/* the load had failed, but 'force' got us through */
+		if (verbose >= 0)
+			pr_err("superblock zeroed anyway\n");
+		rv = 0;
+	}
+
+	close(fd);
+	return rv;
+}
+
+int Kill_subarray(char *dev, char *subarray, int verbose)
+{
+	/* Remove one subarray from the container 'dev'.  The subarray
+	 * has to be inactive, and 'subarray' must be its index number
+	 * given as a string.
+	 *
+	 * 0 = successfully deleted subarray from all container members
+	 * 1 = failed to sync metadata to one or more devices
+	 * 2 = failed to find the container, subarray, or other resource
+	 *     issue
+	 */
+	struct supertype supertype;
+	struct supertype *st = &supertype;
+	int quiet = verbose < 0;
+	int rv = 2;
+	int fd;
+
+	memset(&supertype, 0, sizeof(supertype));
+
+	fd = open_subarray(dev, subarray, st, quiet);
+	if (fd < 0)
+		return 2;
+
+	if (st->ss->kill_subarray == NULL) {
+		if (!quiet)
+			pr_err("Operation not supported for %s metadata\n",
+			       st->ss->name);
+	} else if (is_subarray_active(subarray, st->devnm)) {
+		if (!quiet)
+			pr_err("Subarray-%s still active, aborting\n",
+			       subarray);
+	} else {
+		/* with mdmon running, updates are queued for it rather
+		 * than written directly
+		 */
+		if (mdmon_running(st->devnm))
+			st->update_tail = &st->updates;
+
+		/* ok we've found our victim, drop the axe */
+		rv = st->ss->kill_subarray(st);
+		if (rv != 0) {
+			if (!quiet)
+				pr_err("Failed to delete subarray-%s from %s\n",
+				       subarray, dev);
+		} else {
+			/* FIXME these routines do not report success/failure */
+			if (st->update_tail)
+				flush_metadata_updates(st);
+			else
+				st->ss->sync_metadata(st);
+
+			if (!quiet)
+				pr_err("Deleted subarray-%s from %s, UUIDs may have changed\n",
+				       subarray, dev);
+		}
+	}
+
+	st->ss->free_super(st);
+	close(fd);
+
+	return rv;
+}
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..664c79f
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,345 @@
+#
+# mdadm - manage Linux "md" devices aka RAID arrays.
+#
+# Copyright (C) 2001-2002 Neil Brown <neilb@cse.unsw.edu.au>
+# Copyright (C) 2013 Neil Brown <neilb@suse.de>
+#
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+#
+# Author: Neil Brown
+# Email: <neilb@cse.unsw.edu.au>
+# Paper: Neil Brown
+# School of Computer Science and Engineering
+# The University of New South Wales
+# Sydney, 2052
+# Australia
+#
+
+# define "CXFLAGS" to give extra flags to CC.
+# e.g. make CXFLAGS=-O to optimise
+TCC = tcc
+UCLIBC_GCC = $(shell for nm in i386-uclibc-linux-gcc i386-uclibc-gcc; do which $$nm > /dev/null && { echo $$nm ; exit; } ; done; echo false No uclibc found )
+#DIET_GCC = diet gcc
+# sorry, but diet-libc doesn't know about posix_memalign,
+# so we cannot use it any more.
+DIET_GCC = gcc -DHAVE_STDINT_H
+
+KLIBC=/home/src/klibc/klibc-0.77
+
+KLIBC_GCC = gcc -nostdinc -iwithprefix include -I$(KLIBC)/klibc/include -I$(KLIBC)/linux/include -I$(KLIBC)/klibc/arch/i386/include -I$(KLIBC)/klibc/include/bits32
+
+CC = $(CROSS_COMPILE)gcc
+CXFLAGS ?= -ggdb
+CWFLAGS = -Wall -Wstrict-prototypes -Wextra -Wno-unused-parameter
+ifdef WARN_UNUSED
+CWFLAGS += -Wp,-D_FORTIFY_SOURCE=2 -O3
+endif
+
+ifdef DEBIAN
+CPPFLAGS += -DDEBIAN
+endif
+ifdef DEFAULT_OLD_METADATA
+ CPPFLAGS += -DDEFAULT_OLD_METADATA
+ DEFAULT_METADATA=0.90
+else
+ DEFAULT_METADATA=1.2
+endif
+CPPFLAGS += -DBINDIR=\"$(BINDIR)\"
+
+PKG_CONFIG ?= pkg-config
+
+SYSCONFDIR = /etc
+CONFFILE = $(SYSCONFDIR)/mdadm/mdadm.conf
+CONFFILE2 = $(SYSCONFDIR)/mdadm.conf
+MAILCMD =/usr/sbin/sendmail -t
+CONFFILEFLAGS = -DCONFFILE=\"$(CONFFILE)\" -DCONFFILE2=\"$(CONFFILE2)\"
+# Both MAP_DIR and MDMON_DIR should be somewhere that persists across the
+# pivotroot from early boot to late boot.
+# /run is best, but for distros that don't support it
+# /dev can work, in which case you probably want /dev/.mdadm
+RUN_DIR=/run/mdadm
+CHECK_RUN_DIR=1
+MAP_DIR=$(RUN_DIR)
+MAP_FILE = map
+MAP_PATH = $(MAP_DIR)/$(MAP_FILE)
+MDMON_DIR = $(RUN_DIR)
+# place for autoreplace cookies
+FAILED_SLOTS_DIR = $(RUN_DIR)/failed-slots
+SYSTEMD_DIR=/lib/systemd/system
+
+COROSYNC:=$(shell [ -d /usr/include/corosync ] || echo -DNO_COROSYNC)
+DLM:=$(shell [ -f /usr/include/libdlm.h ] || echo -DNO_DLM)
+
+DIRFLAGS = -DMAP_DIR=\"$(MAP_DIR)\" -DMAP_FILE=\"$(MAP_FILE)\"
+DIRFLAGS += -DMDMON_DIR=\"$(MDMON_DIR)\"
+DIRFLAGS += -DFAILED_SLOTS_DIR=\"$(FAILED_SLOTS_DIR)\"
+CFLAGS = $(CWFLAGS) $(CXFLAGS) -DSendmail=\""$(MAILCMD)"\" $(CONFFILEFLAGS) $(DIRFLAGS) $(COROSYNC) $(DLM)
+
+# When building from a git checkout, derive the version string and the
+# date of the last commit and pass them to the compiler.  Outside a
+# checkout both are empty and no -D flag is added.
+# VERS_DATE is rendered as e.g. "03rd March 2014": the day is printed
+# with a "th" suffix and sed then corrects 1st/2nd/3rd, putting "th"
+# back for 11th/12th/13th.
+VERSION = $(shell [ -d .git ] && git describe HEAD | sed 's/mdadm-//')
+VERS_DATE = $(shell [ -d .git ] && date --date="`git log -n1 --format=format:%cd --date=short`" '+%0dth %B %Y' | sed -e 's/1th/1st/' -e 's/2th/2nd/' -e 's/3th/3rd/' -e 's/11st/11th/' -e 's/12nd/12th/' -e 's/13rd/13th/')
+DVERS = $(if $(VERSION),-DVERSION=\"$(VERSION)\",)
+DDATE = $(if $(VERS_DATE),-DVERS_DATE="\"$(VERS_DATE)\"",)
+CFLAGS += $(DVERS) $(DDATE)
+
+# The glibc TLS ABI requires applications that call clone(2) to set up
+# TLS data structures, use pthreads until mdmon implements this support
+USE_PTHREADS = 1
+ifdef USE_PTHREADS
+CFLAGS += -DUSE_PTHREADS
+MON_LDFLAGS += -pthread
+endif
+
+# If you want a static binary, you might uncomment these
+# LDFLAGS = -static
+# STRIP = -s
+LDLIBS=-ldl
+
+INSTALL = /usr/bin/install
+DESTDIR =
+BINDIR = /sbin
+MANDIR = /usr/share/man
+MAN4DIR = $(MANDIR)/man4
+MAN5DIR = $(MANDIR)/man5
+MAN8DIR = $(MANDIR)/man8
+
+UDEVDIR := $(shell $(PKG_CONFIG) --variable=udevdir udev 2>/dev/null)
+ifndef UDEVDIR
+ UDEVDIR = /lib/udev
+endif
+
+ifeq (,$(findstring s,$(MAKEFLAGS)))
+ ECHO=echo
+else
+ ECHO=:
+endif
+
+OBJS = mdadm.o config.o policy.o mdstat.o ReadMe.o util.o maps.o lib.o \
+ Manage.o Assemble.o Build.o \
+ Create.o Detail.o Examine.o Grow.o Monitor.o dlink.o Kill.o Query.o \
+ Incremental.o Dump.o \
+ mdopen.o super0.o super1.o super-ddf.o super-intel.o bitmap.o \
+ super-mbr.o super-gpt.o \
+ restripe.o sysfs.o sha1.o mapfile.o crc32.o sg_io.o msg.o xmalloc.o \
+ platform-intel.o probe_roms.o crc32c.o
+
+CHECK_OBJS = restripe.o sysfs.o maps.o lib.o xmalloc.o dlink.o
+
+SRCS = $(patsubst %.o,%.c,$(OBJS))
+
+INCL = mdadm.h part.h bitmap.h
+
+MON_OBJS = mdmon.o monitor.o managemon.o util.o maps.o mdstat.o sysfs.o \
+ policy.o lib.o \
+ Kill.o sg_io.o dlink.o ReadMe.o super-intel.o \
+ super-mbr.o super-gpt.o \
+ super-ddf.o sha1.o crc32.o msg.o bitmap.o xmalloc.o \
+ platform-intel.o probe_roms.o
+
+MON_SRCS = $(patsubst %.o,%.c,$(MON_OBJS))
+
+STATICSRC = pwgr.c
+STATICOBJS = pwgr.o
+
+ASSEMBLE_SRCS := mdassemble.c Assemble.c Manage.c config.c policy.c dlink.c util.c \
+ maps.c lib.c xmalloc.c \
+ super0.c super1.c super-ddf.c super-intel.c sha1.c crc32.c sg_io.c mdstat.c \
+ platform-intel.c probe_roms.c sysfs.c super-mbr.c super-gpt.c mapfile.c
+ASSEMBLE_AUTO_SRCS := mdopen.c
+ASSEMBLE_FLAGS:= $(CFLAGS) -DMDASSEMBLE
+ifdef MDASSEMBLE_AUTO
+ASSEMBLE_SRCS += $(ASSEMBLE_AUTO_SRCS)
+ASSEMBLE_FLAGS += -DMDASSEMBLE_AUTO
+endif
+
+all : mdadm mdmon
+man : mdadm.man md.man mdadm.conf.man mdmon.man raid6check.man
+
+check_rundir:
+ @if [ ! -d "$(dir $(RUN_DIR))" -a "$(CHECK_RUN_DIR)" = 1 ]; then \
+ echo "***** Parent of $(RUN_DIR) does not exist. Maybe set different RUN_DIR="; \
+ echo "***** e.g. make RUN_DIR=/dev/.mdadm" ; \
+ echo "***** or set CHECK_RUN_DIR=0"; exit 1; \
+ fi
+
+everything: all mdadm.static swap_super test_stripe raid6check \
+ mdassemble mdassemble.auto mdassemble.static mdassemble.man \
+ mdadm.Os mdadm.O2 man
+everything-test: all mdadm.static swap_super test_stripe \
+ mdassemble.auto mdassemble.static mdassemble.man \
+ mdadm.Os mdadm.O2 man
+# mdadm.uclibc and mdassemble.uclibc don't work on x86-64
+# mdadm.tcc doesn't work..
+
+mdadm : $(OBJS) | check_rundir
+ $(CC) $(CFLAGS) $(LDFLAGS) -o mdadm $(OBJS) $(LDLIBS)
+
+mdadm.static : $(OBJS) $(STATICOBJS)
+ $(CC) $(CFLAGS) $(LDFLAGS) -static -o mdadm.static $(OBJS) $(STATICOBJS) $(LDLIBS)
+
+mdadm.tcc : $(SRCS) $(INCL)
+ $(TCC) -o mdadm.tcc $(SRCS)
+
+mdadm.klibc : $(SRCS) $(INCL)
+ rm -f $(OBJS)
+ $(CC) -nostdinc -iwithprefix include -I$(KLIBC)/klibc/include -I$(KLIBC)/linux/include -I$(KLIBC)/klibc/arch/i386/include -I$(KLIBC)/klibc/include/bits32 $(CFLAGS) $(SRCS)
+
+mdadm.Os : $(SRCS) $(INCL)
+ $(CC) -o mdadm.Os $(CFLAGS) $(CPPFLAGS) $(LDFLAGS) -DHAVE_STDINT_H -Os $(SRCS) $(LDLIBS)
+
+mdadm.O2 : $(SRCS) $(INCL) mdmon.O2
+ $(CC) -o mdadm.O2 $(CFLAGS) $(CPPFLAGS) $(LDFLAGS) -DHAVE_STDINT_H -O2 -D_FORTIFY_SOURCE=2 $(SRCS) $(LDLIBS)
+
+mdmon.O2 : $(MON_SRCS) $(INCL) mdmon.h
+ $(CC) -o mdmon.O2 $(CFLAGS) $(CPPFLAGS) $(LDFLAGS) $(MON_LDFLAGS) -DHAVE_STDINT_H -O2 -D_FORTIFY_SOURCE=2 $(MON_SRCS) $(LDLIBS)
+
+# use '-z now' to guarantee no dynamic linker interactions with the monitor thread
+mdmon : $(MON_OBJS) | check_rundir
+ $(CC) $(CFLAGS) $(LDFLAGS) $(MON_LDFLAGS) -Wl,-z,now -o mdmon $(MON_OBJS) $(LDLIBS)
+msg.o: msg.c msg.h
+
+test_stripe : restripe.c xmalloc.o mdadm.h
+ $(CC) $(CFLAGS) $(CXFLAGS) $(LDFLAGS) -o test_stripe xmalloc.o -DMAIN restripe.c
+
+raid6check : raid6check.o mdadm.h $(CHECK_OBJS)
+ $(CC) $(CXFLAGS) $(LDFLAGS) -o raid6check raid6check.o $(CHECK_OBJS)
+
+mdassemble : $(ASSEMBLE_SRCS) $(INCL)
+ $(CC) $(CFLAGS) $(CPPFLAGS) $(LDFLAGS) $(ASSEMBLE_FLAGS) -o mdassemble $(ASSEMBLE_SRCS) $(STATICSRC)
+
+mdassemble.diet : $(ASSEMBLE_SRCS) $(INCL)
+ rm -f $(OBJS)
+ $(DIET_GCC) $(ASSEMBLE_FLAGS) -o mdassemble $(ASSEMBLE_SRCS) $(STATICSRC)
+
+mdassemble.static : $(ASSEMBLE_SRCS) $(INCL)
+ rm -f $(OBJS)
+ $(CC) $(LDFLAGS) $(CPPFLAGS) $(ASSEMBLE_FLAGS) -static -DHAVE_STDINT_H -o mdassemble.static $(ASSEMBLE_SRCS) $(STATICSRC)
+
+mdassemble.auto : $(ASSEMBLE_SRCS) $(INCL) $(ASSEMBLE_AUTO_SRCS)
+ rm -f mdassemble.static
+ $(MAKE) MDASSEMBLE_AUTO=1 mdassemble.static
+ mv mdassemble.static mdassemble.auto
+
+# Static mdassemble built with the uclibc cross compiler.
+# Remove existing objects first, as the other mdassemble.* rules do.
+mdassemble.uclibc : $(ASSEMBLE_SRCS) $(INCL)
+	rm -f $(OBJS)
+	$(UCLIBC_GCC) $(ASSEMBLE_FLAGS) -DUCLIBC -DHAVE_STDINT_H -static -o mdassemble.uclibc $(ASSEMBLE_SRCS) $(STATICSRC)
+
+# This doesn't work
+mdassemble.klibc : $(ASSEMBLE_SRCS) $(INCL)
+ rm -f $(OBJS)
+ $(KLIBC_GCC) $(ASSEMBLE_FLAGS) -o mdassemble $(ASSEMBLE_SRCS)
+
+mdadm.8 : mdadm.8.in
+ sed -e 's/{DEFAULT_METADATA}/$(DEFAULT_METADATA)/g' \
+ -e 's,{MAP_PATH},$(MAP_PATH),g' mdadm.8.in > mdadm.8
+
+mdadm.man : mdadm.8
+ man -l mdadm.8 > mdadm.man
+
+mdmon.man : mdmon.8
+ man -l mdmon.8 > mdmon.man
+
+md.man : md.4
+ man -l md.4 > md.man
+
+mdadm.conf.man : mdadm.conf.5
+ man -l mdadm.conf.5 > mdadm.conf.man
+
+mdassemble.man : mdassemble.8
+ man -l mdassemble.8 > mdassemble.man
+
+raid6check.man : raid6check.8
+ man -l raid6check.8 > raid6check.man
+
+$(OBJS) : $(INCL) mdmon.h
+$(MON_OBJS) : $(INCL) mdmon.h
+
+sha1.o : sha1.c sha1.h md5.h
+ $(CC) $(CFLAGS) -DHAVE_STDINT_H -o sha1.o -c sha1.c
+
+install : mdadm mdmon install-man install-udev
+ $(INSTALL) -D $(STRIP) -m 755 mdadm $(DESTDIR)$(BINDIR)/mdadm
+ $(INSTALL) -D $(STRIP) -m 755 mdmon $(DESTDIR)$(BINDIR)/mdmon
+
+install-static : mdadm.static install-man
+ $(INSTALL) -D $(STRIP) -m 755 mdadm.static $(DESTDIR)$(BINDIR)/mdadm
+
+install-tcc : mdadm.tcc install-man
+ $(INSTALL) -D $(STRIP) -m 755 mdadm.tcc $(DESTDIR)$(BINDIR)/mdadm
+
+install-uclibc : mdadm.uclibc install-man
+ $(INSTALL) -D $(STRIP) -m 755 mdadm.uclibc $(DESTDIR)$(BINDIR)/mdadm
+
+install-klibc : mdadm.klibc install-man
+ $(INSTALL) -D $(STRIP) -m 755 mdadm.klibc $(DESTDIR)$(BINDIR)/mdadm
+
+install-man: mdadm.8 md.4 mdadm.conf.5 mdmon.8
+ $(INSTALL) -D -m 644 mdadm.8 $(DESTDIR)$(MAN8DIR)/mdadm.8
+ $(INSTALL) -D -m 644 mdmon.8 $(DESTDIR)$(MAN8DIR)/mdmon.8
+ $(INSTALL) -D -m 644 md.4 $(DESTDIR)$(MAN4DIR)/md.4
+ $(INSTALL) -D -m 644 mdadm.conf.5 $(DESTDIR)$(MAN5DIR)/mdadm.conf.5
+
+install-udev: udev-md-raid-arrays.rules udev-md-raid-assembly.rules
+ @for file in 63-md-raid-arrays.rules 64-md-raid-assembly.rules ; \
+ do sed -e 's,BINDIR,$(BINDIR),g' udev-$${file#??-} > .install.tmp.1 && \
+ $(ECHO) $(INSTALL) -D -m 644 udev-$${file#??-} $(DESTDIR)$(UDEVDIR)/rules.d/$$file ; \
+ $(INSTALL) -D -m 644 .install.tmp.1 $(DESTDIR)$(UDEVDIR)/rules.d/$$file ; \
+ rm -f .install.tmp.1; \
+ done
+
+install-systemd: systemd/mdmon@.service
+ @for file in mdmon@.service mdmonitor.service mdadm-last-resort@.timer \
+ mdadm-last-resort@.service mdadm-grow-continue@.service; \
+ do sed -e 's,BINDIR,$(BINDIR),g' systemd/$$file > .install.tmp.2 && \
+ $(ECHO) $(INSTALL) -D -m 644 systemd/$$file $(DESTDIR)$(SYSTEMD_DIR)/$$file ; \
+ $(INSTALL) -D -m 644 .install.tmp.2 $(DESTDIR)$(SYSTEMD_DIR)/$$file ; \
+ rm -f .install.tmp.2; \
+ done
+ @for file in mdadm.shutdown ; \
+ do sed -e 's,BINDIR,$(BINDIR),g' systemd/$$file > .install.tmp.3 && \
+ $(ECHO) $(INSTALL) -D -m 755 systemd/$$file $(DESTDIR)$(SYSTEMD_DIR)-shutdown/$$file ; \
+ $(INSTALL) -D -m 755 .install.tmp.3 $(DESTDIR)$(SYSTEMD_DIR)-shutdown/$$file ; \
+ rm -f .install.tmp.3; \
+ done
+ if [ -f /etc/SuSE-release -o -n "$(SUSE)" ] ;then $(INSTALL) -D -m 755 systemd/SUSE-mdadm_env.sh $(DESTDIR)$(SYSTEMD_DIR)/../scripts/mdadm_env.sh ;fi
+
+# Remove everything the 'install' target (with its install-man and
+# install-udev prerequisites) puts in place.
+uninstall:
+	rm -f $(DESTDIR)$(MAN8DIR)/mdadm.8 $(DESTDIR)$(MAN8DIR)/mdmon.8 $(DESTDIR)$(MAN4DIR)/md.4 $(DESTDIR)$(MAN5DIR)/mdadm.conf.5 $(DESTDIR)$(BINDIR)/mdadm
+	rm -f $(DESTDIR)$(BINDIR)/mdmon
+	rm -f $(DESTDIR)$(UDEVDIR)/rules.d/63-md-raid-arrays.rules $(DESTDIR)$(UDEVDIR)/rules.d/64-md-raid-assembly.rules
+
+test: mdadm mdmon test_stripe swap_super raid6check
+ @echo "Please run './test' as root"
+
+clean :
+ rm -f mdadm mdmon $(OBJS) $(MON_OBJS) $(STATICOBJS) core *.man \
+ mdadm.tcc mdadm.uclibc mdadm.static *.orig *.porig *.rej *.alt .merge_file_* \
+ mdadm.Os mdadm.O2 mdmon.O2 \
+ mdassemble mdassemble.static mdassemble.auto mdassemble.uclibc \
+ mdassemble.klibc swap_super \
+ init.cpio.gz mdadm.uclibc.static test_stripe raid6check raid6check.o mdmon \
+ mdadm.8
+
+dist : clean
+ ./makedist
+
+testdist : everything-test clean
+ ./makedist test
+
+TAGS :
+ etags *.h *.c
+
+DISTRO_MAKEFILE := $(wildcard distropkg/Makefile)
+ifdef DISTRO_MAKEFILE
+include $(DISTRO_MAKEFILE)
+endif
diff --git a/Manage.c b/Manage.c
new file mode 100644
index 0000000..7e1b94b
--- /dev/null
+++ b/Manage.c
@@ -0,0 +1,1786 @@
+/*
+ * mdadm - manage Linux "md" devices aka RAID arrays.
+ *
+ * Copyright (C) 2001-2013 Neil Brown <neilb@suse.de>
+ *
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * Author: Neil Brown
+ * Email: <neilb@suse.de>
+ */
+
+#include "mdadm.h"
+#include "md_u.h"
+#include "md_p.h"
+#include <ctype.h>
+
+#define REGISTER_DEV _IO (MD_MAJOR, 1)
+#define START_MD _IO (MD_MAJOR, 2)
+#define STOP_MD _IO (MD_MAJOR, 3)
+
+int Manage_ro(char *devname, int fd, int readonly)
+{
+	/* Switch an array to readonly (readonly > 0) or back to
+	 * read-write (readonly < 0).
+	 *
+	 * requires >= 0.90.0
+	 * first check that array is running
+	 * use RESTART_ARRAY_RW or STOP_ARRAY_RO
+	 *
+	 * Externally-managed subarrays are handled through sysfs
+	 * instead; in that branch any readonly <= 0 is treated as a
+	 * request for read-write.
+	 *
+	 * Returns 0 on success, 1 on failure.
+	 */
+	mdu_array_info_t array;
+#ifndef MDASSEMBLE
+	struct mdinfo *mdi;
+#endif
+	int rv = 0;
+
+	if (md_get_version(fd) < 9000) {
+		pr_err("need md driver version 0.90.0 or later\n");
+		return 1;
+	}
+#ifndef MDASSEMBLE
+	/* If this is an externally-managed array, we need to modify the
+	 * metadata_version so that mdmon doesn't undo our change.
+	 */
+	mdi = sysfs_read(fd, NULL, GET_LEVEL|GET_VERSION);
+	if (mdi &&
+	    mdi->array.major_version == -1 &&
+	    is_subarray(mdi->text_version)) {
+		char vers[64];
+		strcpy(vers, "external:");
+		strcat(vers, mdi->text_version);
+		/* vers[9] is the first character of text_version */
+		if (readonly > 0) {
+			/* Use a separate variable for the sysfs result:
+			 * the function's return value 'rv' must be the
+			 * one that is set to 1 on failure below.
+			 */
+			int err;
+			/* We set readonly ourselves. */
+			vers[9] = '-';
+			sysfs_set_str(mdi, NULL, "metadata_version", vers);
+
+			close(fd);
+			err = sysfs_set_str(mdi, NULL, "array_state", "readonly");
+
+			if (err < 0) {
+				pr_err("failed to set readonly for %s: %s\n",
+				       devname, strerror(errno));
+
+				/* put the original metadata_version back */
+				vers[9] = mdi->text_version[0];
+				sysfs_set_str(mdi, NULL, "metadata_version", vers);
+				rv = 1;
+				goto out;
+			}
+		} else {
+			char *cp;
+			/* We cannot set read/write - must signal mdmon */
+			vers[9] = '/';
+			sysfs_set_str(mdi, NULL, "metadata_version", vers);
+
+			/* cut vers+10 at the next '/' before pinging */
+			cp = strchr(vers+10, '/');
+			if (cp)
+				*cp = 0;
+			ping_monitor(vers+10);
+			if (mdi->array.level <= 0)
+				sysfs_set_str(mdi, NULL, "array_state", "active");
+		}
+		goto out;
+	}
+#endif
+	if (ioctl(fd, GET_ARRAY_INFO, &array)) {
+		pr_err("%s does not appear to be active.\n",
+		       devname);
+		rv = 1;
+		goto out;
+	}
+
+	if (readonly > 0) {
+		if (ioctl(fd, STOP_ARRAY_RO, NULL)) {
+			pr_err("failed to set readonly for %s: %s\n",
+			       devname, strerror(errno));
+			rv = 1;
+			goto out;
+		}
+	} else if (readonly < 0) {
+		if (ioctl(fd, RESTART_ARRAY_RW, NULL)) {
+			pr_err("failed to set writable for %s: %s\n",
+			       devname, strerror(errno));
+			rv = 1;
+			goto out;
+		}
+	}
+out:
+#ifndef MDASSEMBLE
+	if (mdi)
+		sysfs_free(mdi);
+#endif
+	return rv;
+}
+
+#ifndef MDASSEMBLE
+
+static void remove_devices(char *devnm, char *path)
+{
+	/*
+	 * Remove names at 'path' - possibly with
+	 * partition suffixes - which link to the 'standard'
+	 * name for devnm.  These were probably created
+	 * by mdadm when the array was assembled.
+	 *
+	 * 'path' itself and its partition names 1..15 are examined.
+	 * A name is only unlinked when it is a symlink whose target
+	 * is exactly /dev/<devnm> (or the matching /dev/<devnm>pN).
+	 * Nothing is done if 'path' is NULL or empty.
+	 */
+	char base[40];
+	char *path2;
+	char link[1024];
+	int n;
+	int part;
+	char *be;
+	char *pe;
+
+	/* An empty path has no last character to inspect below, and
+	 * would turn the partition names into bare relative names
+	 * such as "1".
+	 */
+	if (!path || !*path)
+		return;
+
+	sprintf(base, "/dev/%s", devnm);
+	be = base + strlen(base);
+
+	/* room for the "pNN" suffix appended below */
+	path2 = xmalloc(strlen(path)+20);
+	strcpy(path2, path);
+	pe = path2 + strlen(path2);
+
+	for (part = 0; part < 16; part++) {
+		if (part) {
+			sprintf(be, "p%d", part);
+
+			/* A name ending in a digit gets a 'p' separator
+			 * before the partition number.  The cast keeps
+			 * the isdigit() argument in its defined range.
+			 */
+			if (isdigit((unsigned char)pe[-1]))
+				sprintf(pe, "p%d", part);
+			else
+				sprintf(pe, "%d", part);
+		}
+		/* readlink() does not NUL-terminate, so compare by length */
+		n = readlink(path2, link, sizeof(link));
+		if (n > 0 && (int)strlen(base) == n &&
+		    strncmp(link, base, n) == 0)
+			unlink(path2);
+	}
+	free(path2);
+}
+
+int Manage_run(char *devname, int fd, struct context *c)
+{
+	/* Start an array that has already been configured.
+	 * Needs md driver 0.90.0 or later.
+	 *
+	 * The kernel name of the device open on 'fd' is looked up
+	 * and passed to IncrementalScan(), whose result is returned.
+	 * Returns 1 if the driver is too old or the name is unknown.
+	 */
+	char nm[32];
+	char *sysname;
+
+	if (md_get_version(fd) < 9000) {
+		pr_err("need md driver version 0.90.0 or later\n");
+		return 1;
+	}
+
+	sysname = fd2devnm(fd);
+	if (sysname == NULL) {
+		pr_err("Cannot find %s in sysfs!!\n", devname);
+		return 1;
+	}
+
+	/* take a private copy of the name before using it */
+	strcpy(nm, sysname);
+
+	return IncrementalScan(c, nm);
+}
+
+static int stop_fd_matches_devnm(int fd, const char *devnm)
+{
+	/* Return 1 if 'fd' is open on the md device whose kernel name
+	 * is 'devnm'.  Return 0 if 'fd' is not open, if its name
+	 * cannot be determined, or if it names a different device.
+	 */
+	char *nm;
+
+	if (fd < 0)
+		return 0;
+	nm = fd2devnm(fd);
+	return nm != NULL && strcmp(nm, devnm) == 0;
+}
+
+int Manage_stop(char *devname, int fd, int verbose, int will_retry)
+{
+	/* Stop the array.  Array must already be configured
+	 * 'will_retry' means that error messages are not wanted.
+	 *
+	 * With an md driver >= 0.90.0 the 'fd' passed in is closed and
+	 * the device is re-opened O_EXCL; that descriptor is closed
+	 * again before returning.
+	 *
+	 * Returns 0 on success, 1 on failure.
+	 */
+	int rv = 0;
+	struct map_ent *map = NULL;
+	struct mdinfo *mdi;
+	char devnm[32];
+	char container[32];
+	char *nmp;
+	int err;
+	int count;
+	char buf[32];
+	unsigned long long rd1, rd2;
+
+	if (will_retry && verbose == 0)
+		verbose = -1;
+
+	if (md_get_version(fd) < 9000) {
+		if (ioctl(fd, STOP_MD, 0) == 0)
+			return 0;
+		pr_err("stopping device %s failed: %s\n",
+		       devname, strerror(errno));
+		return 1;
+	}
+
+	/* fd2devnm() can fail; do not hand NULL to strcpy() */
+	nmp = fd2devnm(fd);
+	if (!nmp) {
+		if (verbose >= 0)
+			pr_err("Cannot find %s in sysfs!!\n", devname);
+		return 1;
+	}
+	strcpy(devnm, nmp);
+	/* Get EXCL access first.  If this fails, then attempting
+	 * to stop is probably a bad idea.
+	 */
+	mdi = sysfs_read(fd, NULL, GET_LEVEL|GET_COMPONENT|GET_VERSION);
+	if (mdi && is_subarray(mdi->text_version)) {
+		char *sl;
+		/* container name is text_version without its first
+		 * character, up to the next '/'
+		 */
+		strncpy(container, mdi->text_version+1, sizeof(container));
+		container[sizeof(container)-1] = 0;
+		sl = strchr(container, '/');
+		if (sl)
+			*sl = 0;
+	} else
+		container[0] = 0;
+	close(fd);
+	count = 5;
+	for (;;) {
+		if (devname[0] == '/')
+			fd = open(devname, O_RDONLY|O_EXCL);
+		else
+			fd = open_dev_flags(devnm, O_RDONLY|O_EXCL);
+		if (stop_fd_matches_devnm(fd, devnm))
+			break;
+		if (!container[0] || !mdmon_running(container) || !count)
+			break;
+		/* Can't open, so something might be wrong.  However it
+		 * is a container, so we might be racing with mdmon, so
+		 * retry for a bit.
+		 */
+		if (fd >= 0)
+			close(fd);
+		flush_mdmon(container);
+		count--;
+	}
+	if (!stop_fd_matches_devnm(fd, devnm)) {
+		if (fd >= 0)
+			close(fd);
+		fd = -1;
+		if (verbose >= 0)
+			pr_err("Cannot get exclusive access to %s:Perhaps a running process, mounted filesystem or active volume group?\n",
+			       devname);
+		/* leave through 'out' so that mdi is released */
+		rv = 1;
+		goto out;
+	}
+	/* If this is an mdmon managed array, just write 'inactive'
+	 * to the array state and let mdmon clear up.
+	 */
+	if (mdi &&
+	    mdi->array.level > 0 &&
+	    is_subarray(mdi->text_version)) {
+		/* This is mdmon managed. */
+		close(fd);
+		fd = -1;
+
+		/* As we had an O_EXCL open, any use of the device
+		 * which blocks STOP_ARRAY is probably a transient use,
+		 * so it is reasonable to retry for a while - 5 seconds.
+		 */
+		count = 25;
+		while (count &&
+		       (err = sysfs_set_str(mdi, NULL,
+					    "array_state",
+					    "inactive")) < 0
+		       && errno == EBUSY) {
+			usleep(200000);
+			count--;
+		}
+		if (err) {
+			if (verbose >= 0)
+				pr_err("failed to stop array %s: %s\n",
+				       devname, strerror(errno));
+			rv = 1;
+			goto out;
+		}
+
+		/* Give monitor a chance to act */
+		ping_monitor(mdi->text_version);
+
+		fd = open_dev_excl(devnm);
+		if (fd < 0) {
+			if (verbose >= 0)
+				pr_err("failed to completely stop %s: Device is busy\n",
+				       devname);
+			rv = 1;
+			goto out;
+		}
+	} else if (mdi &&
+		   mdi->array.major_version == -1 &&
+		   mdi->array.minor_version == -2 &&
+		   !is_subarray(mdi->text_version)) {
+		struct mdstat_ent *mds, *m;
+		/* container, possibly mdmon-managed.
+		 * Make sure mdmon isn't opening it, which
+		 * would interfere with the 'stop'
+		 */
+		ping_monitor(mdi->sys_name);
+
+		/* now check that there are no existing arrays
+		 * which are members of this array
+		 */
+		mds = mdstat_read(0, 0);
+		for (m = mds; m; m = m->next)
+			if (m->metadata_version &&
+			    strncmp(m->metadata_version, "external:", 9)==0 &&
+			    metadata_container_matches(m->metadata_version+9,
+						       devnm)) {
+				if (verbose >= 0)
+					pr_err("Cannot stop container %s: member %s still active\n",
+					       devname, m->devnm);
+				free_mdstat(mds);
+				rv = 1;
+				goto out;
+			}
+		/* no active member found: the list is not needed any more */
+		if (mds)
+			free_mdstat(mds);
+	}
+
+	/* If the array is undergoing a reshape which changes the number
+	 * of devices, then it would be nice to stop it at a point where
+	 * it has completed a full number of stripes in both old and
+	 * new layouts as this will allow the reshape to be reverted.
+	 * So if 'sync_action' is "reshape" and 'raid_disks' shows two
+	 * different numbers, then
+	 *  - freeze reshape
+	 *  - set sync_max to next multiple of both data_disks and
+	 *    chunk sizes (or next but one)
+	 *  - unfreeze reshape
+	 *  - wait on 'sync_completed' for that point to be reached.
+	 */
+	if (mdi && (mdi->array.level >= 4 && mdi->array.level <= 6) &&
+	    sysfs_attribute_available(mdi, NULL, "sync_action") &&
+	    sysfs_attribute_available(mdi, NULL, "reshape_direction") &&
+	    sysfs_get_str(mdi, NULL, "sync_action", buf, 20) > 0 &&
+	    strcmp(buf, "reshape\n") == 0 &&
+	    sysfs_get_two(mdi, NULL, "raid_disks", &rd1, &rd2) == 2) {
+		unsigned long long position, curr;
+		unsigned long long chunk1, chunk2;
+		unsigned long long rddiv, chunkdiv;
+		unsigned long long sectors;
+		unsigned long long sync_max, old_sync_max;
+		unsigned long long completed;
+		int backwards = 0;
+		int delay;
+		int scfd;
+
+		delay = 40;
+		while (rd1 > rd2 && delay > 0 &&
+		       sysfs_get_ll(mdi, NULL, "sync_max", &old_sync_max) == 0) {
+			/* must be in the critical section - wait a bit */
+			delay -= 1;
+			usleep(100000);
+		}
+
+		if (sysfs_set_str(mdi, NULL, "sync_action", "frozen") != 0)
+			goto done;
+		/* Array is frozen */
+
+		/* convert raid_disks to data disks */
+		rd1 -= mdi->array.level == 6 ? 2 : 1;
+		rd2 -= mdi->array.level == 6 ? 2 : 1;
+		sysfs_get_str(mdi, NULL, "reshape_direction", buf, sizeof(buf));
+		if (strncmp(buf, "back", 4) == 0)
+			backwards = 1;
+		if (sysfs_get_ll(mdi, NULL, "reshape_position", &position) != 0) {
+			/* reshape must have finished now */
+			sysfs_set_str(mdi, NULL, "sync_action", "idle");
+			goto done;
+		}
+		sysfs_get_two(mdi, NULL, "chunk_size", &chunk1, &chunk2);
+		chunk1 /= 512;
+		chunk2 /= 512;
+		rddiv = GCD(rd1, rd2);
+		chunkdiv = GCD(chunk1, chunk2);
+		sectors = (chunk1/chunkdiv) * chunk2 * (rd1/rddiv) * rd2;
+
+		if (backwards) {
+			/* Need to subtract 'reshape_position' from
+			 * array size to get equivalent of sync_max.
+			 * Size calculation based on raid5_size in kernel.
+			 */
+			unsigned long long size = mdi->component_size;
+			size &= ~(chunk1-1);
+			size &= ~(chunk2-1);
+			/* rd1 must be smaller */
+			/* Reshape may have progressed further backwards than
+			 * recorded, so target even further back (hence "-1")
+			 */
+			position = (position / sectors - 1) * sectors;
+			/* rd1 is always the conversion factor between 'sync'
+			 * position and 'reshape' position.
+			 * We read 1 "new" stripe worth of data from where-ever,
+			 * and then write out that full stripe.
+			 */
+			sync_max = size - position/rd1;
+		} else {
+			/* Reshape will very likely be beyond position, and it may
+			 * be too late to stop at '+1', so aim for '+2'
+			 */
+			position = (position / sectors + 2) * sectors;
+			sync_max = position/rd1;
+		}
+		if (sysfs_get_ll(mdi, NULL, "sync_max", &old_sync_max) < 0)
+			old_sync_max = mdi->component_size;
+		/* Must not advance sync_max as that could confuse
+		 * the reshape monitor */
+		if (sync_max < old_sync_max)
+			sysfs_set_num(mdi, NULL, "sync_max", sync_max);
+		sysfs_set_str(mdi, NULL, "sync_action", "idle");
+
+		/* That should have set things going again.  Now we
+		 * wait a little while (3 second max) for sync_completed
+		 * to reach the target.
+		 * The reshape process can block for 500msec if
+		 * the sync speed limit is hit, so we need to wait
+		 * a lot longer than that. 1 second is usually
+		 * enough.  3 is safe.
+		 */
+		delay = 3000;
+		scfd = sysfs_open(mdi->sys_name, NULL, "sync_completed");
+		while (scfd >= 0 && delay > 0 && old_sync_max > 0) {
+			unsigned long long max_completed;
+			sysfs_get_ll(mdi, NULL, "reshape_position", &curr);
+			sysfs_fd_get_str(scfd, buf, sizeof(buf));
+			if (strncmp(buf, "none", 4) == 0) {
+				/* Either reshape has aborted, or hasn't
+				 * quite started yet.  Wait a bit and
+				 * check  'sync_action' to see.
+				 */
+				usleep(10000);
+				sysfs_get_str(mdi, NULL, "sync_action", buf, sizeof(buf));
+				if (strncmp(buf, "reshape", 7) != 0)
+					break;
+			}
+
+			if (sysfs_fd_get_two(scfd, &completed,
+					     &max_completed) == 2 &&
+			    /* 'completed' sometimes reads as max-uulong */
+			    completed < max_completed &&
+			    (completed > sync_max ||
+			     (completed == sync_max && curr != position))) {
+				while (completed > sync_max) {
+					sync_max += sectors / rd1;
+					if (backwards)
+						position -= sectors;
+					else
+						position += sectors;
+				}
+				if (sync_max < old_sync_max)
+					sysfs_set_num(mdi, NULL, "sync_max", sync_max);
+			}
+
+			if (!backwards && curr >= position)
+				break;
+			if (backwards && curr <= position)
+				break;
+			sysfs_wait(scfd, &delay);
+		}
+		if (scfd >= 0)
+			close(scfd);
+
+	}
+done:
+
+	/* As we have an O_EXCL open, any use of the device
+	 * which blocks STOP_ARRAY is probably a transient use,
+	 * so it is reasonable to retry for a while - 5 seconds.
+	 */
+	count = 25; err = 0;
+	while (count && fd >= 0
+	       && (err = ioctl(fd, STOP_ARRAY, NULL)) < 0
+	       && errno == EBUSY) {
+		usleep(200000);
+		count --;
+	}
+	if (fd >= 0 && err) {
+		if (verbose >= 0) {
+			pr_err("failed to stop array %s: %s\n",
+			       devname, strerror(errno));
+			if (errno == EBUSY)
+				cont_err("Perhaps a running process, mounted filesystem or active volume group?\n");
+		}
+		rv = 1;
+		goto out;
+	}
+	/* prior to 2.6.28, KOBJ_CHANGE was not sent when an md array
+	 * was stopped, so We'll do it here just to be sure.  Drop any
+	 * partitions as well...
+	 */
+	if (fd >= 0)
+		ioctl(fd, BLKRRPART, 0);
+	if (mdi)
+		sysfs_uevent(mdi, "change");
+
+	if (devnm[0] && use_udev()) {
+		struct map_ent *mp = map_by_devnm(&map, devnm);
+		remove_devices(devnm, mp ? mp->path : NULL);
+	}
+
+	if (verbose >= 0)
+		pr_err("stopped %s\n", devname);
+	map_lock(&map);
+	map_remove(&map, devnm);
+	map_unlock(&map);
+out:
+	/* release the descriptor this function opened itself */
+	if (fd >= 0)
+		close(fd);
+	if (mdi)
+		sysfs_free(mdi);
+
+	return rv;
+}
+
+static struct mddev_dev *add_one(struct mddev_dev *dv, char *name, char disp)
+{
+	/* Allocate a zeroed list entry for device 'name' with
+	 * disposition 'disp', link it in directly after 'dv' and
+	 * return the new entry.
+	 */
+	struct mddev_dev *entry = xmalloc(sizeof(*entry));
+
+	memset(entry, 0, sizeof(*entry));
+	entry->devname = xstrdup(name);
+	entry->disposition = disp;
+
+	/* splice in between 'dv' and its current successor */
+	entry->next = dv->next;
+	dv->next = entry;
+
+	return entry;
+}
+
+static void add_faulty(struct mddev_dev *dv, int fd, char disp)
+{
+	/* For every device of the array open on 'fd' whose state has
+	 * bit 0 (faulty) set, insert a "major:minor" entry with
+	 * disposition 'disp' into the list after 'dv'.
+	 * Does nothing if GET_ARRAY_INFO fails.
+	 */
+	mdu_array_info_t array;
+	mdu_disk_info_t disk;
+	int todo;
+	int slot;
+
+	if (ioctl(fd, GET_ARRAY_INFO, &array) != 0)
+		return;
+
+	/* stop scanning once nr_disks devices have been seen */
+	todo = array.nr_disks;
+	for (slot = 0; slot < MAX_DISKS && todo > 0; slot++) {
+		char devid[40];
+
+		disk.number = slot;
+		if (ioctl(fd, GET_DISK_INFO, &disk) != 0)
+			continue;
+		/* 0:0 means the slot is empty */
+		if (disk.major == 0 && disk.minor == 0)
+			continue;
+		todo--;
+
+		if (disk.state & 1) {
+			/* faulty */
+			sprintf(devid, "%d:%d", disk.major, disk.minor);
+			dv = add_one(dv, devid, disp);
+		}
+	}
+}
+
+static void add_detached(struct mddev_dev *dv, int fd, char disp)
+{
+	/* For every device of the array open on 'fd' which can no
+	 * longer be opened (open fails with ENXIO), insert a
+	 * "major:minor" entry with disposition 'disp' into the list
+	 * after 'dv'.  When 'disp' is 'f', devices that are already
+	 * faulty are left out.
+	 * Does nothing if GET_ARRAY_INFO fails.
+	 */
+	mdu_array_info_t array;
+	mdu_disk_info_t disk;
+	int todo;
+	int slot;
+
+	if (ioctl(fd, GET_ARRAY_INFO, &array) != 0)
+		return;
+
+	/* stop scanning once nr_disks devices have been seen */
+	todo = array.nr_disks;
+	for (slot = 0; slot < MAX_DISKS && todo > 0; slot++) {
+		char devid[40];
+		int probe;
+
+		disk.number = slot;
+		if (ioctl(fd, GET_DISK_INFO, &disk) != 0)
+			continue;
+		/* 0:0 means the slot is empty */
+		if (disk.major == 0 && disk.minor == 0)
+			continue;
+		todo--;
+
+		if (disp == 'f' && (disk.state & 1) != 0)
+			/* already faulty */
+			continue;
+
+		sprintf(devid, "%d:%d", disk.major, disk.minor);
+		probe = dev_open(devid, O_RDONLY);
+		if (probe >= 0) {
+			/* Not detached */
+			close(probe);
+			continue;
+		}
+		/* Any error other than ENXIO: probably not detached */
+		if (errno == ENXIO)
+			dv = add_one(dv, devid, disp);
+	}
+}
+
+static void add_set(struct mddev_dev *dv, int fd, char set_char)
+{
+	/* For a level-10 array open on 'fd', insert a "major:minor"
+	 * entry after 'dv' for every device that belongs to the set
+	 * named by 'set_char' ('A', 'B', ...).  A device's set is its
+	 * raid_disk number modulo the number of copies.  New entries
+	 * take their disposition from the entry they are added after.
+	 *
+	 * Does nothing if GET_ARRAY_INFO fails, the array is not
+	 * level 10, or raid_disks is not a multiple of the number of
+	 * copies.
+	 */
+	mdu_array_info_t array;
+	mdu_disk_info_t disk;
+	int todo;
+	int lo_copies, hi_copies, copies;
+	int slot;
+
+	if (ioctl(fd, GET_ARRAY_INFO, &array) != 0)
+		return;
+	if (array.level != 10)
+		return;
+
+	/* the two low bytes of the layout multiply to the copy count */
+	lo_copies = array.layout & 0xff;
+	hi_copies = (array.layout >> 8) & 0xff;
+	copies = lo_copies * hi_copies;
+	if (array.raid_disks % copies)
+		return;
+
+	/* stop scanning once nr_disks devices have been seen */
+	todo = array.nr_disks;
+	for (slot = 0; slot < MAX_DISKS && todo > 0; slot++) {
+		char devid[40];
+
+		disk.number = slot;
+		if (ioctl(fd, GET_DISK_INFO, &disk) != 0)
+			continue;
+		/* 0:0 means the slot is empty */
+		if (disk.major == 0 && disk.minor == 0)
+			continue;
+		todo--;
+
+		if (disk.raid_disk % copies + 'A' == set_char) {
+			sprintf(devid, "%d:%d", disk.major, disk.minor);
+			dv = add_one(dv, devid, dv->disposition);
+		}
+	}
+}
+
+int attempt_re_add(int fd, int tfd, struct mddev_dev *dv,
+ struct supertype *dev_st, struct supertype *tst,
+ unsigned long rdev,
+ char *update, char *devname, int verbose,
+ mdu_array_info_t *array)
+{
+ /* Try to put the device described by 'dv' (superblock in 'dev_st',
+ * device number 'rdev') back into the array open on 'fd', using
+ * the slot recorded in the device's own superblock.
+ *
+ * Returns:
+ *  1 - ADD_NEW_DISK succeeded, the device was re-added
+ *  0 - re-add was not attempted or did not succeed
+ * -1 - the device could not be opened or its superblock could not
+ *      be updated, or ADD_NEW_DISK failed with ENOMEM/EROFS and
+ *      the disposition is not 'M'
+ */
+ struct mdinfo mdi;
+ int duuid[4];
+ int ouuid[4];
+
+ dev_st->ss->getinfo_super(dev_st, &mdi, NULL);
+ dev_st->ss->uuid_from_super(dev_st, ouuid);
+ if (tst->sb)
+ tst->ss->uuid_from_super(tst, duuid);
+ else
+ /* Assume uuid matches: kernel will check */
+ memcpy(duuid, ouuid, sizeof(ouuid));
+ /* Only a device recorded as active, not faulty, and carrying the
+ * same uuid as the array is a candidate.
+ */
+ if ((mdi.disk.state & (1<<MD_DISK_ACTIVE)) &&
+ !(mdi.disk.state & (1<<MD_DISK_FAULTY)) &&
+ memcmp(duuid, ouuid, sizeof(ouuid))==0) {
+ /* Looks like it is worth a
+ * try. Need to make sure
+ * kernel will accept it
+ * though.
+ */
+ mdu_disk_info_t disc;
+ /* re-add doesn't work for version-1 superblocks
+ * before 2.6.18 :-(
+ */
+ if (array->major_version == 1 &&
+ get_linux_version() <= 2006018)
+ goto skip_re_add;
+ /* The slot the device used to occupy must currently be
+ * empty (device number 0:0).
+ */
+ disc.number = mdi.disk.number;
+ if (ioctl(fd, GET_DISK_INFO, &disc) != 0
+ || disc.major != 0 || disc.minor != 0
+ )
+ goto skip_re_add;
+ disc.major = major(rdev);
+ disc.minor = minor(rdev);
+ disc.number = mdi.disk.number;
+ disc.raid_disk = mdi.disk.raid_disk;
+ disc.state = mdi.disk.state;
+ if (array->state & (1 << MD_SB_CLUSTERED)) {
+ /* extra flags are needed when adding to a cluster as
+ * there are two cases to distinguish
+ */
+ if (dv->disposition == 'c')
+ disc.state |= (1 << MD_DISK_CANDIDATE);
+ else
+ disc.state |= (1 << MD_DISK_CLUSTER_ADD);
+ }
+ /* writemostly: 1 sets the flag, 2 clears it */
+ if (dv->writemostly == 1)
+ disc.state |= 1 << MD_DISK_WRITEMOSTLY;
+ if (dv->writemostly == 2)
+ disc.state &= ~(1 << MD_DISK_WRITEMOSTLY);
+ remove_partitions(tfd);
+ if (update || dv->writemostly > 0) {
+ /* The superblock on the device has to be changed
+ * before the re-add; the device is re-opened
+ * read-write for that.
+ */
+ int rv = -1;
+ tfd = dev_open(dv->devname, O_RDWR);
+ if (tfd < 0) {
+ pr_err("failed to open %s for superblock update during re-add\n", dv->devname);
+ return -1;
+ }
+
+ if (dv->writemostly == 1)
+ rv = dev_st->ss->update_super(
+ dev_st, NULL, "writemostly",
+ devname, verbose, 0, NULL);
+ if (dv->writemostly == 2)
+ rv = dev_st->ss->update_super(
+ dev_st, NULL, "readwrite",
+ devname, verbose, 0, NULL);
+ if (update)
+ rv = dev_st->ss->update_super(
+ dev_st, NULL, update,
+ devname, verbose, 0, NULL);
+ /* write the superblock only if the last update succeeded */
+ if (rv == 0)
+ rv = dev_st->ss->store_super(dev_st, tfd);
+ close(tfd);
+ if (rv != 0) {
+ pr_err("failed to update superblock during re-add\n");
+ return -1;
+ }
+ }
+ /* don't even try if disk is marked as faulty */
+ /* errno is cleared so the test after the ioctl only sees
+ * an error set by ADD_NEW_DISK itself
+ */
+ errno = 0;
+ if (ioctl(fd, ADD_NEW_DISK, &disc) == 0) {
+ if (verbose >= 0)
+ pr_err("re-added %s\n", dv->devname);
+ return 1;
+ }
+ /* ENOMEM and EROFS are reported; any other failure falls
+ * through and returns 0.
+ */
+ if (errno == ENOMEM || errno == EROFS) {
+ pr_err("add new device failed for %s: %s\n",
+ dv->devname, strerror(errno));
+ if (dv->disposition == 'M')
+ return 0;
+ return -1;
+ }
+ }
+skip_re_add:
+ return 0;
+}
+
+/*
+ * Manage_add - add the device described by dv to the array open on fd.
+ *
+ * fd         - open descriptor of the md array (or container)
+ * tfd        - open descriptor of the device being added
+ * dv         - device entry; dv->disposition selects the flavour of add
+ * tst        - metadata handler for the array
+ * array      - array info supplied by the caller
+ * force      - allow an over-sized device to be added to a v0.90 array
+ * verbose    - success messages are printed when >= 0
+ * devname    - name of the array, used in messages
+ * update     - optional superblock update handed to attempt_re_add()
+ * rdev       - device number of the device being added
+ * array_size - required component size, in sectors
+ * raid_slot  - explicit disk number to use, or < 0 to pick a free one
+ *
+ * Returns 1 if the device was added (or re-added), 0 if a 'missing' ('M')
+ * candidate was skipped without error, and -1 on failure.
+ */
+int Manage_add(int fd, int tfd, struct mddev_dev *dv,
+	       struct supertype *tst, mdu_array_info_t *array,
+	       int force, int verbose, char *devname,
+	       char *update, unsigned long rdev, unsigned long long array_size,
+	       int raid_slot)
+{
+	unsigned long long ldsize;
+	struct supertype *dev_st = NULL;
+	int j;
+	mdu_disk_info_t disc;
+
+	if (!get_dev_size(tfd, dv->devname, &ldsize)) {
+		if (dv->disposition == 'M')
+			return 0;
+		else
+			return -1;
+	}
+
+	if (tst->ss == &super0 && ldsize > 4ULL*1024*1024*1024*1024) {
+		/* More than 4TB is wasted on v0.90 */
+		if (!force) {
+			pr_err("%s is larger than %s can effectively use.\n"
+			       " Add --force if you really want to add this device.\n",
+			       dv->devname, devname);
+			return -1;
+		}
+		pr_err("%s is larger than %s can effectively use.\n"
+		       " Adding anyway as --force was given.\n",
+		       dv->devname, devname);
+	}
+	if (!tst->ss->external &&
+	    array->major_version == 0 &&
+	    md_get_version(fd)%100 < 2) {
+		if (ioctl(fd, HOT_ADD_DISK, rdev)==0) {
+			if (verbose >= 0)
+				pr_err("hot added %s\n",
+				       dv->devname);
+			return 1;
+		}
+
+		pr_err("hot add failed for %s: %s\n",
+		       dv->devname, strerror(errno));
+		return -1;
+	}
+
+	if (array->not_persistent == 0 || tst->ss->external) {
+
+		/* need to find a sample superblock to copy, and
+		 * a spare slot to use.
+		 * For 'external' array (well, container based),
+		 * We can just load the metadata for the array.
+		 */
+		int array_failed;
+		if (tst->sb)
+			/* already loaded */;
+		else if (tst->ss->external) {
+			tst->ss->load_container(tst, fd, NULL);
+		} else for (j = 0; j < tst->max_devs; j++) {
+			char *dev;
+			int dfd;
+			disc.number = j;
+			if (ioctl(fd, GET_DISK_INFO, &disc))
+				continue;
+			if (disc.major==0 && disc.minor==0)
+				continue;
+			if ((disc.state & 4)==0) /* sync */
+				continue;
+			/* Looks like a good device to try */
+			dev = map_dev(disc.major, disc.minor, 1);
+			if (!dev)
+				continue;
+			dfd = dev_open(dev, O_RDONLY);
+			if (dfd < 0)
+				continue;
+			if (tst->ss->load_super(tst, dfd,
+						NULL)) {
+				close(dfd);
+				continue;
+			}
+			close(dfd);
+			break;
+		}
+		/* FIXME this is a bad test to be using */
+		if (!tst->sb && (dv->disposition != 'a'
+				 && dv->disposition != 'S')) {
+			/* we are re-adding a device to a
+			 * completely dead array - have to depend
+			 * on kernel to check
+			 */
+		} else if (!tst->sb) {
+			pr_err("cannot load array metadata from %s\n", devname);
+			return -1;
+		}
+
+		/* Make sure device is large enough */
+		if (dv->disposition != 'j' && /* skip size check for Journal */
+		    tst->sb &&
+		    tst->ss->avail_size(tst, ldsize/512, INVALID_SECTORS) <
+		    array_size) {
+			if (dv->disposition == 'M')
+				return 0;
+			pr_err("%s not large enough to join array\n",
+			       dv->devname);
+			return -1;
+		}
+
+		/* Possibly this device was recently part of
+		 * the array and was temporarily removed, and
+		 * is now being re-added. If so, we can
+		 * simply re-add it.
+		 */
+
+		if (array->not_persistent==0) {
+			dev_st = dup_super(tst);
+			dev_st->ss->load_super(dev_st, tfd, NULL);
+		}
+		if (dev_st && dev_st->sb && dv->disposition != 'S') {
+			int rv = attempt_re_add(fd, tfd, dv,
+						dev_st, tst,
+						rdev,
+						update, devname,
+						verbose,
+						array);
+			dev_st->ss->free_super(dev_st);
+			if (rv)
+				return rv;
+		}
+		if (dv->disposition == 'M') {
+			if (verbose > 0)
+				pr_err("--re-add for %s to %s is not possible\n",
+				       dv->devname, devname);
+			return 0;
+		}
+		if (dv->disposition == 'A') {
+			pr_err("--re-add for %s to %s is not possible\n",
+			       dv->devname, devname);
+			return -1;
+		}
+		if (array->active_disks < array->raid_disks) {
+			char *avail = xcalloc(array->raid_disks, 1);
+			int d;
+			int found = 0;
+
+			for (d = 0; d < MAX_DISKS && found < array->nr_disks; d++) {
+				disc.number = d;
+				if (ioctl(fd, GET_DISK_INFO, &disc))
+					continue;
+				if (disc.major == 0 && disc.minor == 0)
+					continue;
+				found++;
+				if (!(disc.state & (1<<MD_DISK_SYNC)))
+					continue;
+				/* avail[] has raid_disks entries; never
+				 * index outside it */
+				if (disc.raid_disk < 0 ||
+				    disc.raid_disk >= array->raid_disks)
+					continue;
+				avail[disc.raid_disk] = 1;
+			}
+			array_failed = !enough(array->level, array->raid_disks,
+					       array->layout, 1, avail);
+			free(avail);
+		} else
+			array_failed = 0;
+		if (array_failed) {
+			pr_err("%s has failed so using --add cannot work and might destroy\n",
+			       devname);
+			pr_err("data on %s. You should stop the array and re-assemble it.\n",
+			       dv->devname);
+			return -1;
+		}
+	} else {
+		/* non-persistent. Must ensure that new drive
+		 * is at least array->size big.
+		 */
+		if (ldsize/512 < array_size) {
+			pr_err("%s not large enough to join array\n",
+			       dv->devname);
+			return -1;
+		}
+	}
+	/* committed to really trying this device now*/
+	remove_partitions(tfd);
+
+	/* in 2.6.17 and earlier, version-1 superblocks won't
+	 * use the number we write, but will choose a free number.
+	 * we must choose the same free number, which requires
+	 * starting at 'raid_disks' and counting up
+	 */
+	for (j = array->raid_disks; j < tst->max_devs; j++) {
+		disc.number = j;
+		if (ioctl(fd, GET_DISK_INFO, &disc))
+			break;
+		if (disc.major==0 && disc.minor==0)
+			break;
+		if (disc.state & 8) /* removed */
+			break;
+	}
+	disc.major = major(rdev);
+	disc.minor = minor(rdev);
+	if (raid_slot < 0)
+		disc.number = j;
+	else
+		disc.number = raid_slot;
+	disc.state = 0;
+
+	/* only add journal to array that supports journaling */
+	if (dv->disposition == 'j') {
+		struct mdinfo mdi;
+		struct mdinfo *mdp;
+		int is_readonly;
+
+		mdp = sysfs_read(fd, NULL, GET_ARRAY_STATE);
+		if (!mdp) {
+			/* cannot tell the array state, so refuse rather
+			 * than dereference a NULL pointer */
+			pr_err("%s unable to read array state.\n", devname);
+			return -1;
+		}
+		is_readonly = strncmp(mdp->sysfs_array_state,
+				      "readonly", 8) == 0;
+		/* the state has been inspected; do not leak the info */
+		sysfs_free(mdp);
+
+		if (!is_readonly) {
+			pr_err("%s is not readonly, cannot add journal.\n", devname);
+			return -1;
+		}
+
+		tst->ss->getinfo_super(tst, &mdi, NULL);
+		if (mdi.journal_device_required == 0) {
+			pr_err("%s does not support journal device.\n", devname);
+			return -1;
+		}
+		disc.raid_disk = 0;
+	}
+
+	if (array->not_persistent==0) {
+		int dfd;
+		if (dv->disposition == 'j')
+			disc.state |= (1 << MD_DISK_JOURNAL) | (1 << MD_DISK_SYNC);
+		if (dv->writemostly == 1)
+			disc.state |= 1 << MD_DISK_WRITEMOSTLY;
+		dfd = dev_open(dv->devname, O_RDWR | O_EXCL|O_DIRECT);
+		if (tst->ss->add_to_super(tst, &disc, dfd,
+					  dv->devname, INVALID_SECTORS)) {
+			/* same clean-up as the external-metadata path below */
+			if (dfd >= 0)
+				close(dfd);
+			return -1;
+		}
+		if (tst->ss->write_init_super(tst))
+			return -1;
+	} else if (dv->disposition == 'A') {
+		/* this had better be raid1.
+		 * As we are "--re-add"ing we must find a spare slot
+		 * to fill.
+		 */
+		char *used = xcalloc(array->raid_disks, 1);
+		for (j = 0; j < tst->max_devs; j++) {
+			mdu_disk_info_t disc2;
+			disc2.number = j;
+			if (ioctl(fd, GET_DISK_INFO, &disc2))
+				continue;
+			if (disc2.major==0 && disc2.minor==0)
+				continue;
+			if (disc2.state & 8) /* removed */
+				continue;
+			if (disc2.raid_disk < 0)
+				continue;
+			/* used[] has raid_disks entries, so raid_disks
+			 * itself is already out of range */
+			if (disc2.raid_disk >= array->raid_disks)
+				continue;
+			used[disc2.raid_disk] = 1;
+		}
+		for (j = 0 ; j < array->raid_disks; j++)
+			if (!used[j]) {
+				disc.raid_disk = j;
+				disc.state |= (1<<MD_DISK_SYNC);
+				break;
+			}
+		free(used);
+	}
+
+	if (array->state & (1 << MD_SB_CLUSTERED)) {
+		if (dv->disposition == 'c')
+			disc.state |= (1 << MD_DISK_CANDIDATE);
+		else
+			disc.state |= (1 << MD_DISK_CLUSTER_ADD);
+	}
+
+	if (dv->writemostly == 1)
+		disc.state |= (1 << MD_DISK_WRITEMOSTLY);
+	if (tst->ss->external) {
+		/* add a disk
+		 * to an external metadata container */
+		struct mdinfo new_mdi;
+		struct mdinfo *sra;
+		int container_fd;
+		char devnm[32];
+		char *nm;
+		int dfd;
+
+		nm = fd2devnm(fd);
+		if (!nm) {
+			pr_err("unable to get container name\n");
+			tst->ss->free_super(tst);
+			return -1;
+		}
+		strcpy(devnm, nm);
+
+		container_fd = open_dev_excl(devnm);
+		if (container_fd < 0) {
+			pr_err("add failed for %s: could not get exclusive access to container\n",
+			       dv->devname);
+			tst->ss->free_super(tst);
+			return -1;
+		}
+
+		Kill(dv->devname, NULL, 0, -1, 0);
+		dfd = dev_open(dv->devname, O_RDWR | O_EXCL|O_DIRECT);
+		if (mdmon_running(tst->container_devnm))
+			tst->update_tail = &tst->updates;
+		if (tst->ss->add_to_super(tst, &disc, dfd,
+					  dv->devname, INVALID_SECTORS)) {
+			close(dfd);
+			close(container_fd);
+			return -1;
+		}
+		if (tst->update_tail)
+			flush_metadata_updates(tst);
+		else
+			tst->ss->sync_metadata(tst);
+
+		sra = sysfs_read(container_fd, NULL, 0);
+		if (!sra) {
+			pr_err("add failed for %s: sysfs_read failed\n",
+			       dv->devname);
+			close(container_fd);
+			tst->ss->free_super(tst);
+			return -1;
+		}
+		sra->array.level = LEVEL_CONTAINER;
+		/* Need to set data_offset and component_size */
+		tst->ss->getinfo_super(tst, &new_mdi, NULL);
+		new_mdi.disk.major = disc.major;
+		new_mdi.disk.minor = disc.minor;
+		new_mdi.recovery_start = 0;
+		/* Make sure fds are closed as they are O_EXCL which
+		 * would block add_disk */
+		tst->ss->free_super(tst);
+		if (sysfs_add_disk(sra, &new_mdi, 0) != 0) {
+			pr_err("add new device to external metadata failed for %s\n", dv->devname);
+			close(container_fd);
+			sysfs_free(sra);
+			return -1;
+		}
+		ping_monitor(devnm);
+		sysfs_free(sra);
+		close(container_fd);
+	} else {
+		tst->ss->free_super(tst);
+		if (ioctl(fd, ADD_NEW_DISK, &disc)) {
+			if (dv->disposition == 'j')
+				pr_err("Failed to hot add %s as journal, "
+				       "please try restart %s.\n", dv->devname, devname);
+			else
+				/* report the number actually requested; 'j'
+				 * may have been reused as a loop index above */
+				pr_err("add new device failed for %s as %d: %s\n",
+				       dv->devname, disc.number, strerror(errno));
+			return -1;
+		}
+		if (dv->disposition == 'j') {
+			pr_err("Journal added successfully, making %s read-write\n", devname);
+			if (Manage_ro(devname, fd, -1))
+				pr_err("Failed to make %s read-write\n", devname);
+		}
+
+	}
+	if (verbose >= 0)
+		pr_err("added %s\n", dv->devname);
+	return 1;
+}
+
+/*
+ * Manage_remove - hot-remove the device described by dv from the array on fd.
+ *
+ * tst     - metadata handler for the array
+ * fd      - open descriptor of the md array (or container)
+ * dv      - device entry being removed (used for messages)
+ * sysfd   - if >= 0, an open sysfs 'state' file to write "remove" to, used
+ *           when the device number is not known
+ * rdev    - device number of the device, or 0 if unknown
+ * verbose - the success message is printed when >= 0
+ * devname - name of the array, used in messages
+ *
+ * Returns 1 on success and -1 on failure.
+ */
+int Manage_remove(struct supertype *tst, int fd, struct mddev_dev *dv,
+		  int sysfd, unsigned long rdev, int verbose, char *devname)
+{
+	int lfd = -1;
+	int err;
+
+	if (tst->ss->external) {
+		/* To remove a device from a container, we must
+		 * check that it isn't in use in an array.
+		 * This involves looking in the 'holders'
+		 * directory - there must be just one entry,
+		 * the container.
+		 * To ensure that it doesn't get used as a
+		 * hot spare while we are checking, we
+		 * get an O_EXCL open on the container
+		 */
+		int ret;
+		char devnm[32];
+		char *nm = fd2devnm(fd);
+
+		if (!nm) {
+			pr_err("unable to get container name\n");
+			return -1;
+		}
+		strcpy(devnm, nm);
+		lfd = open_dev_excl(devnm);
+		if (lfd < 0) {
+			pr_err("Cannot get exclusive access to container - odd\n");
+			return -1;
+		}
+		/* We may not be able to check on holders in
+		 * sysfs, either because we don't have the dev num
+		 * (rdev == 0) or because the device has been detached
+		 * and the 'holders' directory no longer exists
+		 * (ret == -1). In that case, assume it is OK to
+		 * remove.
+		 */
+		if (rdev == 0)
+			ret = -1;
+		else
+			ret = sysfs_unique_holder(devnm, rdev);
+		if (ret == 0) {
+			pr_err("%s is not a member, cannot remove.\n",
+			       dv->devname);
+			close(lfd);
+			return -1;
+		}
+		if (ret >= 2) {
+			pr_err("%s is still in use, cannot remove.\n",
+			       dv->devname);
+			close(lfd);
+			return -1;
+		}
+	}
+	/* FIXME check that it is a current member */
+	if (sysfd >= 0) {
+		/* device has been removed and we don't know
+		 * the major:minor number
+		 */
+		int n = write(sysfd, "remove", 6);
+		if (n != 6)
+			err = -1;
+		else
+			err = 0;
+	} else {
+		err = ioctl(fd, HOT_REMOVE_DISK, rdev);
+		if (err && errno == ENODEV) {
+			/* Old kernels rejected this if no personality
+			 * is registered */
+			struct mdinfo *sra = sysfs_read(fd, NULL, GET_DEVS);
+			/* named 'sd' so that it does not shadow the
+			 * 'dv' parameter */
+			struct mdinfo *sd = NULL;
+			if (sra)
+				sd = sra->devs;
+			for ( ; sd ; sd = sd->next)
+				if (sd->disk.major == (int)major(rdev) &&
+				    sd->disk.minor == (int)minor(rdev))
+					break;
+			if (sd)
+				err = sysfs_set_str(sra, sd,
+						    "state", "remove");
+			else
+				err = -1;
+			if (sra)
+				sysfs_free(sra);
+		}
+	}
+	if (err) {
+		pr_err("hot remove failed for %s: %s\n", dv->devname,
+		       strerror(errno));
+		if (lfd >= 0)
+			close(lfd);
+		return -1;
+	}
+	if (tst->ss->external) {
+		/*
+		 * Before dropping our exclusive open we make an
+		 * attempt at preventing mdmon from seeing an
+		 * 'add' event before reconciling this 'remove'
+		 * event.
+		 */
+		char *devnm = fd2devnm(fd);
+
+		if (!devnm) {
+			pr_err("unable to get container name\n");
+			/* do not leak the exclusive open */
+			if (lfd >= 0)
+				close(lfd);
+			return -1;
+		}
+
+		ping_manager(devnm);
+	}
+	if (lfd >= 0)
+		close(lfd);
+	if (verbose >= 0)
+		pr_err("hot removed %s from %s\n",
+		       dv->devname, devname);
+	return 1;
+}
+
+/*
+ * Manage_replace - mark the device described by dv as wanting replacement.
+ *
+ * Looks the device up (by rdev) among the array's sysfs devices and writes
+ * "want_replacement" to its 'state'.  If a later entry in the dv list has
+ * disposition 'W', it is switched to 'w' and told which raid disk it is to
+ * replace (via its 'used' field).
+ *
+ * Returns 1 on success and -1 on failure.
+ */
+int Manage_replace(struct supertype *tst, int fd, struct mddev_dev *dv,
+		   unsigned long rdev, int verbose, char *devname)
+{
+	struct mdinfo *mdi, *di;
+	if (tst->ss->external) {
+		pr_err("--replace only supported for native metadata (0.90 or 1.x)\n");
+		return -1;
+	}
+	/* Need to find the device in sysfs and add 'want_replacement' to the
+	 * status.
+	 */
+	mdi = sysfs_read(fd, NULL, GET_DEVS);
+	if (!mdi || !mdi->devs) {
+		pr_err("Cannot find status of %s to enable replacement - strange\n",
+		       devname);
+		/* an info block without devices must still be released */
+		if (mdi)
+			sysfs_free(mdi);
+		return -1;
+	}
+	for (di = mdi->devs; di; di = di->next)
+		if (di->disk.major == (int)major(rdev) &&
+		    di->disk.minor == (int)minor(rdev))
+			break;
+	if (di) {
+		int rv;
+		if (di->disk.raid_disk < 0) {
+			pr_err("%s is not active and so cannot be replaced.\n",
+			       dv->devname);
+			sysfs_free(mdi);
+			return -1;
+		}
+		rv = sysfs_set_str(mdi, di,
+				   "state", "want_replacement");
+		if (rv) {
+			sysfs_free(mdi);
+			pr_err("Failed to request replacement for %s\n",
+			       dv->devname);
+			return -1;
+		}
+		if (verbose >= 0)
+			pr_err("Marked %s (device %d in %s) for replacement\n",
+			       dv->devname, di->disk.raid_disk, devname);
+		/* If there is a matching 'with', we need to tell it which
+		 * raid disk
+		 */
+		while (dv && dv->disposition != 'W')
+			dv = dv->next;
+		if (dv) {
+			dv->disposition = 'w';
+			dv->used = di->disk.raid_disk;
+		}
+		/* 'di' points into 'mdi', so free only once it is no
+		 * longer needed */
+		sysfs_free(mdi);
+		return 1;
+	}
+	sysfs_free(mdi);
+	pr_err("%s not found in %s so cannot --replace it\n",
+	       dv->devname, devname);
+	return -1;
+}
+
+/*
+ * Manage_with - make the device described by dv the preferred replacement
+ * for raid disk dv->used.
+ *
+ * Looks the device up (by rdev) among the array's sysfs devices, refuses it
+ * if it is faulty or already active, and otherwise writes dv->used to its
+ * 'slot' attribute.
+ *
+ * Returns 1 on success and -1 on failure.
+ */
+int Manage_with(struct supertype *tst, int fd, struct mddev_dev *dv,
+		unsigned long rdev, int verbose, char *devname)
+{
+	struct mdinfo *mdi, *di;
+	/* try to set 'slot' for 'rdev' in 'fd' to 'dv->used' */
+	mdi = sysfs_read(fd, NULL, GET_DEVS|GET_STATE);
+	if (!mdi || !mdi->devs) {
+		pr_err("Cannot find status of %s to enable replacement - strange\n",
+		       devname);
+		/* an info block without devices must still be released */
+		if (mdi)
+			sysfs_free(mdi);
+		return -1;
+	}
+	for (di = mdi->devs; di; di = di->next)
+		if (di->disk.major == (int)major(rdev) &&
+		    di->disk.minor == (int)minor(rdev))
+			break;
+	if (di) {
+		int rv;
+		if (di->disk.state & (1<<MD_DISK_FAULTY)) {
+			pr_err("%s is faulty and cannot be a replacement\n",
+			       dv->devname);
+			sysfs_free(mdi);
+			return -1;
+		}
+		if (di->disk.raid_disk >= 0) {
+			pr_err("%s is active and cannot be a replacement\n",
+			       dv->devname);
+			sysfs_free(mdi);
+			return -1;
+		}
+		rv = sysfs_set_num(mdi, di,
+				   "slot", dv->used);
+		if (rv) {
+			sysfs_free(mdi);
+			pr_err("Failed to set %s as preferred replacement.\n",
+			       dv->devname);
+			return -1;
+		}
+		if (verbose >= 0)
+			pr_err("Marked %s in %s as replacement for device %d\n",
+			       dv->devname, devname, dv->used);
+		/* success path must release the sysfs info as well */
+		sysfs_free(mdi);
+		return 1;
+	}
+	sysfs_free(mdi);
+	pr_err("%s not found in %s so cannot make it preferred replacement\n",
+	       dv->devname, devname);
+	return -1;
+}
+
+/*
+ * Manage_subdevs - apply the requested action to every device in devlist.
+ *
+ * devname - name of the array, used in messages
+ * fd      - open descriptor of the md array
+ * devlist - devices to act on; each entry's 'disposition' selects the action
+ * verbose - informational messages are printed when >= 0
+ * test    - when set, return 2 if no action was taken
+ * update  - optional superblock update passed to Manage_add()
+ * force   - passed to Manage_add()
+ *
+ * Returns 0 on success (2 if 'test' is set and nothing was done).  On failure
+ * returns 1, or 2 when 'test' is not set and a set-faulty request was
+ * refused with EBUSY.
+ */
+int Manage_subdevs(char *devname, int fd,
+		   struct mddev_dev *devlist, int verbose, int test,
+		   char *update, int force)
+{
+	/* Do something to each dev.
+	 * devmode can be
+	 * 'a' - add the device
+	 * try HOT_ADD_DISK
+	 * If that fails EINVAL, try ADD_NEW_DISK
+	 * 'S' - add the device as a spare - don't try re-add
+	 * 'j' - add the device as a journal device
+	 * 'A' - re-add the device
+	 * 'r' - remove the device: HOT_REMOVE_DISK
+	 * device can be 'faulty' or 'detached' in which case all
+	 * matching devices are removed.
+	 * 'f' - set the device faulty SET_DISK_FAULTY
+	 * device can be 'detached' in which case any device that
+	 * is inaccessible will be marked faulty.
+	 * 'R' - mark this device as wanting replacement.
+	 * 'W' - this device is added if necessary and activated as
+	 * a replacement for a previous 'R' device.
+	 * -----
+	 * 'w' - 'W' will be changed to 'w' when it is paired with
+	 * a 'R' device. If a 'W' is found while walking the list
+	 * it must be unpaired, and is an error.
+	 * 'M' - this is created by a 'missing' target. It is a slight
+	 * variant on 'A'
+	 * 'F' - Another variant of 'A', where the device was faulty
+	 * so must be removed from the array first.
+	 * 'c' - confirm the device as found (for clustered environments)
+	 *
+	 * For 'f' and 'r', the device can also be a kernel-internal
+	 * name such as 'sdb'.
+	 */
+	mdu_array_info_t array;
+	unsigned long long array_size;
+	struct mddev_dev *dv;
+	int tfd = -1;
+	struct supertype *tst;
+	char *subarray = NULL;
+	int sysfd = -1;
+	int count = 0; /* number of actions taken */
+	struct mdinfo info;
+	struct mdinfo devinfo;
+	int frozen = 0;
+	int busy = 0;
+	int raid_slot = -1;
+
+	if (ioctl(fd, GET_ARRAY_INFO, &array)) {
+		pr_err("Cannot get array info for %s\n",
+		       devname);
+		goto abort;
+	}
+	sysfs_init(&info, fd, NULL);
+
+	/* array.size is only 32 bits and may be truncated.
+	 * So read from sysfs if possible, and record number of sectors
+	 */
+
+	array_size = get_component_size(fd);
+	if (array_size == 0)
+		/* widen before doubling so the multiply cannot
+		 * overflow 'int' */
+		array_size = (unsigned long long)array.size * 2;
+
+	tst = super_by_fd(fd, &subarray);
+	if (!tst) {
+		pr_err("unsupport array - version %d.%d\n",
+		       array.major_version, array.minor_version);
+		goto abort;
+	}
+
+	for (dv = devlist; dv; dv = dv->next) {
+		unsigned long rdev = 0; /* device to add/remove etc */
+		int rv;
+		int mj,mn;
+
+		raid_slot = -1;
+		if (dv->disposition == 'c') {
+			rv = parse_cluster_confirm_arg(dv->devname,
+						       &dv->devname,
+						       &raid_slot);
+			if (rv) {
+				pr_err("Could not get the devname of cluster\n");
+				goto abort;
+			}
+		}
+
+		if (strcmp(dv->devname, "failed") == 0 ||
+		    strcmp(dv->devname, "faulty") == 0) {
+			if (dv->disposition != 'A'
+			    && dv->disposition != 'r') {
+				pr_err("%s only meaningful with -r or --re-add, not -%c\n",
+				       dv->devname, dv->disposition);
+				goto abort;
+			}
+			add_faulty(dv, fd, (dv->disposition == 'A'
+					    ? 'F' : 'r'));
+			continue;
+		}
+		if (strcmp(dv->devname, "detached") == 0) {
+			if (dv->disposition != 'r' && dv->disposition != 'f') {
+				pr_err("%s only meaningful with -r or -f, not -%c\n",
+				       dv->devname, dv->disposition);
+				goto abort;
+			}
+			add_detached(dv, fd, dv->disposition);
+			continue;
+		}
+
+		if (strcmp(dv->devname, "missing") == 0) {
+			struct mddev_dev *add_devlist = NULL;
+			struct mddev_dev **dp;
+			if (dv->disposition == 'c') {
+				rv = ioctl(fd, CLUSTERED_DISK_NACK, NULL);
+				break;
+			}
+
+			if (dv->disposition != 'A') {
+				pr_err("'missing' only meaningful with --re-add\n");
+				goto abort;
+			}
+			add_devlist = conf_get_devs();
+			if (add_devlist == NULL) {
+				pr_err("no devices to scan for missing members.\n");
+				continue;
+			}
+			for (dp = &add_devlist; *dp; dp = & (*dp)->next)
+				/* 'M' (for 'missing') is like 'A' without errors */
+				(*dp)->disposition = 'M';
+			*dp = dv->next;
+			dv->next = add_devlist;
+			continue;
+		}
+
+		if (strncmp(dv->devname, "set-", 4) == 0 &&
+		    strlen(dv->devname) == 5) {
+			int copies;
+
+			if (dv->disposition != 'r' &&
+			    dv->disposition != 'f') {
+				pr_err("'%s' only meaningful with -r or -f\n",
+				       dv->devname);
+				goto abort;
+			}
+			if (array.level != 10) {
+				pr_err("'%s' only meaningful with RAID10 arrays\n",
+				       dv->devname);
+				goto abort;
+			}
+			copies = ((array.layout & 0xff) *
+				  ((array.layout >> 8) & 0xff));
+			if (array.raid_disks % copies != 0 ||
+			    dv->devname[4] < 'A' ||
+			    dv->devname[4] >= 'A' + copies ||
+			    copies > 26) {
+				pr_err("'%s' not meaningful with this array\n",
+				       dv->devname);
+				goto abort;
+			}
+			add_set(dv, fd, dv->devname[4]);
+			continue;
+		}
+
+		if (strchr(dv->devname, '/') == NULL &&
+		    strchr(dv->devname, ':') == NULL &&
+		    strlen(dv->devname) < 50) {
+			/* Assume this is a kernel-internal name like 'sda1' */
+			int found = 0;
+			char dname[55];
+			if (dv->disposition != 'r' && dv->disposition != 'f') {
+				pr_err("%s only meaningful with -r or -f, not -%c\n",
+				       dv->devname, dv->disposition);
+				goto abort;
+			}
+
+			sprintf(dname, "dev-%s", dv->devname);
+			sysfd = sysfs_open(fd2devnm(fd), dname, "block/dev");
+			if (sysfd >= 0) {
+				char dn[20];
+				if (sysfs_fd_get_str(sysfd, dn, 20) > 0 &&
+				    sscanf(dn, "%d:%d", &mj,&mn) == 2) {
+					rdev = makedev(mj,mn);
+					found = 1;
+				}
+				close(sysfd);
+				sysfd = -1;
+			}
+			if (!found) {
+				sysfd = sysfs_open(fd2devnm(fd), dname, "state");
+				if (sysfd < 0) {
+					pr_err("%s does not appear to be a component of %s\n",
+					       dv->devname, devname);
+					goto abort;
+				}
+			}
+		} else if ((dv->disposition == 'r' || dv->disposition == 'f')
+			   && get_maj_min(dv->devname, &mj, &mn)) {
+			/* for 'fail' and 'remove', the device might
+			 * not exist.
+			 */
+			rdev = makedev(mj, mn);
+		} else {
+			struct stat stb;
+			tfd = dev_open(dv->devname, O_RDONLY);
+			if (tfd >= 0) {
+				fstat(tfd, &stb);
+				/* This open was only needed to learn the
+				 * device number; the add path below opens
+				 * the device again, so close it here rather
+				 * than leak it.
+				 */
+				close(tfd);
+				tfd = -1;
+			} else {
+				int open_err = errno;
+				if (stat(dv->devname, &stb) != 0) {
+					pr_err("Cannot find %s: %s\n",
+					       dv->devname, strerror(errno));
+					goto abort;
+				}
+				if ((stb.st_mode & S_IFMT) != S_IFBLK) {
+					if (dv->disposition == 'M')
+						/* non-fatal. Also improbable */
+						continue;
+					pr_err("%s is not a block device.\n",
+					       dv->devname);
+					goto abort;
+				}
+				if (dv->disposition == 'r')
+					/* Be happy, the stat worked, that is
+					 * enough for --remove
+					 */
+					;
+				else {
+					if (dv->disposition == 'M')
+						/* non-fatal */
+						continue;
+					pr_err("Cannot open %s: %s\n",
+					       dv->devname, strerror(open_err));
+					goto abort;
+				}
+			}
+			rdev = stb.st_rdev;
+		}
+		switch(dv->disposition){
+		default:
+			pr_err("internal error - devmode[%s]=%d\n",
+			       dv->devname, dv->disposition);
+			goto abort;
+		case 'a':
+		case 'S': /* --add-spare */
+		case 'j': /* --add-journal */
+		case 'A':
+		case 'M': /* --re-add missing */
+		case 'F': /* --re-add faulty */
+		case 'c': /* --cluster-confirm */
+			/* add the device */
+			if (subarray) {
+				pr_err("Cannot add disks to a \'member\' array, perform this operation on the parent container\n");
+				goto abort;
+			}
+
+			/* Let's first try to write re-add to sysfs */
+			if (rdev != 0 &&
+			    (dv->disposition == 'A' || dv->disposition == 'F')) {
+				sysfs_init_dev(&devinfo, rdev);
+				if (sysfs_set_str(&info, &devinfo, "state", "re-add") == 0) {
+					pr_err("re-add %s to %s succeed\n",
+					       dv->devname, info.sys_name);
+					break;
+				}
+			}
+
+			if (dv->disposition == 'F')
+				/* Need to remove first */
+				ioctl(fd, HOT_REMOVE_DISK, rdev);
+			/* Make sure it isn't in use (in 2.6 or later) */
+			tfd = dev_open(dv->devname, O_RDONLY|O_EXCL);
+			if (tfd >= 0) {
+				/* We know no-one else is using it. We'll
+				 * need non-exclusive access to add it, so
+				 * do that now.
+				 */
+				close(tfd);
+				tfd = dev_open(dv->devname, O_RDONLY);
+			}
+			if (tfd < 0) {
+				if (dv->disposition == 'M')
+					continue;
+				pr_err("Cannot open %s: %s\n",
+				       dv->devname, strerror(errno));
+				goto abort;
+			}
+			if (!frozen) {
+				if (sysfs_freeze_array(&info) == 1)
+					frozen = 1;
+				else
+					frozen = -1;
+			}
+			rv = Manage_add(fd, tfd, dv, tst, &array,
+					force, verbose, devname, update,
+					rdev, array_size, raid_slot);
+			close(tfd);
+			tfd = -1;
+			if (rv < 0)
+				goto abort;
+			if (rv > 0)
+				count++;
+			break;
+
+		case 'r':
+			/* hot remove */
+			if (subarray) {
+				pr_err("Cannot remove disks from a \'member\' array, perform this operation on the parent container\n");
+				rv = -1;
+			} else
+				rv = Manage_remove(tst, fd, dv, sysfd,
+						   rdev, verbose,
+						   devname);
+			if (sysfd >= 0)
+				close(sysfd);
+			sysfd = -1;
+			if (rv < 0)
+				goto abort;
+			if (rv > 0)
+				count++;
+			break;
+
+		case 'f': /* set faulty */
+			/* FIXME check current member */
+			if ((sysfd >= 0 && write(sysfd, "faulty", 6) != 6) ||
+			    (sysfd < 0 && ioctl(fd, SET_DISK_FAULTY,
+						rdev))) {
+				if (errno == EBUSY)
+					busy = 1;
+				pr_err("set device faulty failed for %s: %s\n",
+				       dv->devname, strerror(errno));
+				if (sysfd >= 0)
+					close(sysfd);
+				goto abort;
+			}
+			if (sysfd >= 0)
+				close(sysfd);
+			sysfd = -1;
+			count++;
+			if (verbose >= 0)
+				pr_err("set %s faulty in %s\n",
+				       dv->devname, devname);
+			break;
+		case 'R': /* Mark as replaceable */
+			if (subarray) {
+				pr_err("Cannot replace disks in a \'member\' array, perform this operation on the parent container\n");
+				rv = -1;
+			} else {
+				if (!frozen) {
+					if (sysfs_freeze_array(&info) == 1)
+						frozen = 1;
+					else
+						frozen = -1;
+				}
+				rv = Manage_replace(tst, fd, dv,
+						    rdev, verbose,
+						    devname);
+			}
+			if (rv < 0)
+				goto abort;
+			if (rv > 0)
+				count++;
+			break;
+		case 'W': /* --with device that doesn't match */
+			pr_err("No matching --replace device for --with %s\n",
+			       dv->devname);
+			goto abort;
+		case 'w': /* --with device which was matched */
+			rv = Manage_with(tst, fd, dv,
+					 rdev, verbose, devname);
+			if (rv < 0)
+				goto abort;
+			break;
+		}
+	}
+	if (frozen > 0)
+		sysfs_set_str(&info, NULL, "sync_action","idle");
+	if (test && count == 0)
+		return 2;
+	return 0;
+
+abort:
+	if (frozen > 0)
+		sysfs_set_str(&info, NULL, "sync_action","idle");
+	return !test && busy ? 2 : 1;
+}
+
+/*
+ * autodetect - open the md device "9:0" and issue the RAID_AUTORUN ioctl
+ * on it.
+ *
+ * Returns 0 if the ioctl succeeded, 1 if the device could not be opened or
+ * the ioctl failed.
+ */
+int autodetect(void)
+{
+	int md_fd;
+	int failed;
+
+	md_fd = dev_open("9:0", O_RDONLY);
+	if (md_fd < 0)
+		return 1;
+
+	failed = (ioctl(md_fd, RAID_AUTORUN, 0) != 0);
+	close(md_fd);
+	return failed;
+}
+
+/*
+ * Update_subarray - apply 'update' to the subarray 'subarray' of 'dev'
+ * through the metadata handler's update_subarray method.
+ *
+ * Returns 2 if the subarray cannot be opened or the metadata format has no
+ * update_subarray method; otherwise returns whatever update_subarray
+ * returned (0 on success).
+ */
+int Update_subarray(char *dev, char *subarray, char *update, struct mddev_ident *ident, int verbose)
+{
+	struct supertype supertype;
+	struct supertype *st = &supertype;
+	int rv = 2;
+	int fd;
+
+	memset(st, 0, sizeof(*st));
+
+	fd = open_subarray(dev, subarray, st, verbose < 0);
+	if (fd < 0)
+		return 2;
+
+	if (st->ss->update_subarray) {
+		/* queue updates for mdmon when it is running */
+		if (mdmon_running(st->devnm))
+			st->update_tail = &st->updates;
+
+		rv = st->ss->update_subarray(st, subarray, update, ident);
+
+		if (rv != 0) {
+			if (verbose >= 0)
+				pr_err("Failed to update %s of subarray-%s in %s\n",
+				       update, subarray, dev);
+		} else {
+			if (st->update_tail)
+				flush_metadata_updates(st);
+			else
+				st->ss->sync_metadata(st);
+
+			if (strcmp(update, "name") == 0 && verbose >= 0)
+				pr_err("Updated subarray-%s name from %s, UUIDs may have changed\n",
+				       subarray, dev);
+		}
+	} else if (verbose >= 0) {
+		pr_err("Operation not supported for %s metadata\n",
+		       st->ss->name);
+	}
+
+	st->ss->free_super(st);
+	close(fd);
+
+	return rv;
+}
+
+/* Move spare from one array to another. If adding to destination array fails
+ * add back to original array.
+ *
+ * from_devname - array the spare is taken from
+ * to_devname   - array the spare is given to
+ * devid        - device number of the spare
+ *
+ * Returns 1 on success, 0 on failure */
+int move_spare(char *from_devname, char *to_devname, dev_t devid)
+{
+	struct mddev_dev devlist;
+	/* large enough for two full-width unsigned ints plus ':' and NUL */
+	char devname[32];
+
+	/* try to remove and add */
+	int fd1 = open(to_devname, O_RDONLY);
+	int fd2 = open(from_devname, O_RDONLY);
+
+	if (fd1 < 0 || fd2 < 0) {
+		if (fd1>=0) close(fd1);
+		if (fd2>=0) close(fd2);
+		return 0;
+	}
+
+	/* start from a fully zeroed entry so that no field is handed to
+	 * Manage_subdevs() uninitialised */
+	memset(&devlist, 0, sizeof(devlist));
+	devlist.next = NULL;
+	devlist.used = 0;
+	devlist.writemostly = 0;
+	devlist.devname = devname;
+	snprintf(devname, sizeof(devname), "%u:%u",
+		 (unsigned int)major(devid), (unsigned int)minor(devid));
+
+	devlist.disposition = 'r';
+	if (Manage_subdevs(from_devname, fd2, &devlist, -1, 0, NULL, 0) == 0) {
+		devlist.disposition = 'a';
+		if (Manage_subdevs(to_devname, fd1, &devlist, -1, 0, NULL, 0) == 0) {
+			/* make sure manager is aware of changes */
+			ping_manager(to_devname);
+			ping_manager(from_devname);
+			close(fd1);
+			close(fd2);
+			return 1;
+		}
+		/* disposition is still 'a': put the spare back */
+		else Manage_subdevs(from_devname, fd2, &devlist, -1, 0, NULL, 0);
+	}
+	close(fd1);
+	close(fd2);
+	return 0;
+}
+#endif
diff --git a/Monitor.c b/Monitor.c
new file mode 100644
index 0000000..f19c2e5
--- /dev/null
+++ b/Monitor.c
@@ -0,0 +1,1143 @@
+/*
+ * mdadm - manage Linux "md" devices aka RAID arrays.
+ *
+ * Copyright (C) 2001-2009 Neil Brown <neilb@suse.de>
+ *
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * Author: Neil Brown
+ * Email: <neilb@suse.de>
+ */
+
+#include "mdadm.h"
+#include "md_p.h"
+#include "md_u.h"
+#include <sys/wait.h>
+#include <signal.h>
+#include <limits.h>
+#include <syslog.h>
+
+struct state { /* what Monitor remembers about one array between checks */
+ char *devname; /* path used to open the array (heap allocated) */
+ char devnm[32]; /* to sync with mdstat info */
+ long utime; /* array.utime seen at the last full check; 0 until then */
+ int err; /* incremented on each failed check, reset to 0 on success */
+ char *spare_group; /* spare-group copied from the config entry, or NULL */
+ int active, working, failed, spare, raid; /* disk counts from GET_ARRAY_INFO */
+ int from_config; /* set when the entry was created from the config file */
+ int from_auto; /* set when the entry was discovered in /proc/mdstat */
+ int expected_spares; /* spare_disks from the config entry, -1 if none */
+ int devstate[MAX_DISKS]; /* last recorded state flags of each slot */
+ dev_t devid[MAX_DISKS]; /* last recorded device number of each slot */
+ int percent; /* last resync percentage copied from mdstat, or RESYNC_* */
+ char parent_devnm[32]; /* For subarray, devnm of parent.
+ * For others, ""
+ */
+ struct supertype *metadata; /* filled by check_array() via super_by_fd() */
+ struct state *subarray;/* for a container it is a link to first subarray
+ * for a subarray it is a link to next subarray
+ * in the same container */
+ struct state *parent; /* for a subarray it is a link to its container
+ */
+ struct state *next; /* next entry in the monitor's state list */
+};
+
+struct alert_info { /* where alert() should deliver an event */
+ char *mailaddr; /* "To:" address for mail alerts, or NULL for no mail */
+ char *mailfrom; /* "From:" header to use, or NULL for the default */
+ char *alert_cmd; /* program run for every event, or NULL */
+ int dosyslog; /* non-zero to report events through syslog() */
+};
+static int make_daemon(char *pidfile);
+static int check_one_sharer(int scan);
+static void alert(char *event, char *dev, char *disc, struct alert_info *info);
+static int check_array(struct state *st, struct mdstat_ent *mdstat,
+ int test, struct alert_info *info,
+ int increments, char *prefer);
+static int add_new_arrays(struct mdstat_ent *mdstat, struct state **statelist,
+ int test, struct alert_info *info);
+static void try_spare_migration(struct state *statelist, struct alert_info *info);
+static void link_containers_with_subarrays(struct state *list);
+
+int Monitor(struct mddev_dev *devlist,
+	    char *mailaddr, char *alert_cmd,
+	    struct context *c,
+	    int daemonise, int oneshot,
+	    int dosyslog, char *pidfile, int increments,
+	    int share)
+{
+	/*
+	 * Every few seconds, scan every md device looking for changes
+	 * When a change is found, log it, possibly run the alert command,
+	 * and possibly send Email
+	 *
+	 * Arguments:
+	 *   devlist    arrays to monitor; NULL means take them from the
+	 *              config file (and, with c->scan, from /proc/mdstat)
+	 *   mailaddr   alert mail address; NULL falls back to the config file
+	 *   alert_cmd  alert program; NULL falls back to the config file
+	 *   c          run-time context (scan, test, delay, prefer are used)
+	 *   daemonise  non-zero to fork into the background first
+	 *   oneshot    non-zero to stop after a pass that found no new arrays
+	 *   dosyslog   non-zero to report events via syslog
+	 *   pidfile    pid file written when daemonising, unlinked on return
+	 *   increments percentage step between RebuildNN events
+	 *   share      non-zero to enable spare migration between arrays
+	 *
+	 * Returns 1 when there is nothing to alert to in scan mode or when
+	 * check_one_sharer() refuses to run, the make_daemon() result in
+	 * the parent of a daemonised monitor, and 0 otherwise.
+	 *
+	 * For each array, we record:
+	 *   Update time
+	 *   active/working/failed/spare drives
+	 *   State of each device.
+	 *   %rebuilt if rebuilding
+	 *
+	 * If the update time changes, check out all the data again
+	 * It is possible that we cannot get the state of each device
+	 * due to bugs in the md kernel module.
+	 * We also read /proc/mdstat to get rebuild percent,
+	 * and to get state on all active devices in case of kernel bug.
+	 *
+	 * Events are:
+	 *    Fail
+	 *	An active device had Faulty set or Active/Sync removed
+	 *    FailSpare
+	 *      A spare device had Faulty set
+	 *    SpareActive
+	 *      An active device had a reverse transition
+	 *    RebuildStarted
+	 *      percent went from -1 to +ve
+	 *    RebuildNN
+	 *      percent went from below to not-below NN%
+	 *    DeviceDisappeared
+	 *      Couldn't access a device which was previously visible
+	 *
+	 * if we detect an array with active<raid and spare==0
+	 * we look at other arrays that have same spare-group
+	 * If we find one with active==raid and spare>0,
+	 *  and if we can get_disk_info and find a name
+	 *  Then we hot-remove and hot-add to the other array
+	 *
+	 * If devlist is NULL, then we can monitor everything because --scan
+	 * was given.  We get an initial list from config file and add anything
+	 * that appears in /proc/mdstat
+	 */
+
+	struct state *statelist = NULL;
+	struct state *st2;
+	int finished = 0;
+	struct mdstat_ent *mdstat = NULL;
+	char *mailfrom = NULL;
+	struct alert_info info;
+
+	if (!mailaddr) {
+		mailaddr = conf_get_mailaddr();
+		if (mailaddr && ! c->scan)
+			pr_err("Monitor using email address \"%s\" from config file\n",
+			       mailaddr);
+	}
+	mailfrom = conf_get_mailfrom();
+
+	if (!alert_cmd) {
+		alert_cmd = conf_get_program();
+		if (alert_cmd && ! c->scan)
+			pr_err("Monitor using program \"%s\" from config file\n",
+			       alert_cmd);
+	}
+	if (c->scan && !mailaddr && !alert_cmd && !dosyslog) {
+		pr_err("No mail address or alert command - not monitoring.\n");
+		return 1;
+	}
+	info.alert_cmd = alert_cmd;
+	info.mailaddr = mailaddr;
+	info.mailfrom = mailfrom;
+	info.dosyslog = dosyslog;
+
+	if (daemonise) {
+		/* only the forked child (rv == -1) carries on monitoring */
+		int rv = make_daemon(pidfile);
+		if (rv >= 0)
+			return rv;
+	}
+
+	if (share)
+		if (check_one_sharer(c->scan))
+			return 1;
+
+	if (devlist == NULL) {
+		/* seed the list from every named array in the config file */
+		struct mddev_ident *mdlist = conf_get_ident(NULL);
+		for (; mdlist; mdlist=mdlist->next) {
+			struct state *st;
+			if (mdlist->devname == NULL)
+				continue;
+			if (strcasecmp(mdlist->devname, "<ignore>") == 0)
+				continue;
+			st = xcalloc(1, sizeof *st);
+			if (mdlist->devname[0] == '/')
+				st->devname = xstrdup(mdlist->devname);
+			else {
+				/* relative names live under /dev/md/ (8 chars) */
+				st->devname = xmalloc(8+strlen(mdlist->devname)+1);
+				strcpy(strcpy(st->devname, "/dev/md/"),
+				       mdlist->devname);
+			}
+			st->next = statelist;
+			st->devnm[0] = 0;
+			st->percent = RESYNC_UNKNOWN;
+			st->from_config = 1;
+			st->expected_spares = mdlist->spare_disks;
+			if (mdlist->spare_group)
+				st->spare_group = xstrdup(mdlist->spare_group);
+			statelist = st;
+		}
+	} else {
+		/* monitor exactly the devices named by the caller */
+		struct mddev_dev *dv;
+		for (dv=devlist ; dv; dv=dv->next) {
+			struct mddev_ident *mdlist = conf_get_ident(dv->devname);
+			struct state *st = xcalloc(1, sizeof *st);
+			st->devname = xstrdup(dv->devname);
+			st->next = statelist;
+			st->devnm[0] = 0;
+			st->percent = RESYNC_UNKNOWN;
+			st->expected_spares = -1;
+			if (mdlist) {
+				st->expected_spares = mdlist->spare_disks;
+				if (mdlist->spare_group)
+					st->spare_group = xstrdup(mdlist->spare_group);
+			}
+			statelist = st;
+		}
+	}
+
+	while (! finished) {
+		int new_found = 0;
+		struct state *st, **stp;
+		int anydegraded = 0;
+
+		if (mdstat)
+			free_mdstat(mdstat);
+		mdstat = mdstat_read(oneshot?0:1, 0);
+
+		for (st=statelist; st; st=st->next)
+			if (check_array(st, mdstat, c->test, &info,
+					increments, c->prefer))
+				anydegraded = 1;
+
+		/* now check if there are any new devices found in mdstat */
+		if (c->scan)
+			new_found = add_new_arrays(mdstat, &statelist, c->test,
+						   &info);
+
+		/* If an array has active < raid && spare == 0 && spare_group != NULL
+		 * Look for another array with spare > 0 and active == raid and same spare_group
+		 *  if found, choose a device and hotremove/hotadd
+		 */
+		if (share && anydegraded)
+			try_spare_migration(statelist, &info);
+		if (!new_found) {
+			if (oneshot)
+				break;
+			else
+				mdstat_wait(c->delay);
+		}
+		/* test messages are only sent on the first pass */
+		c->test = 0;
+
+		/* drop auto-discovered arrays that keep failing */
+		for (stp = &statelist; (st = *stp) != NULL; ) {
+			if (st->from_auto && st->err > 5) {
+				*stp = st->next;
+				free(st->devname);
+				free(st->spare_group);
+				free(st);
+			} else
+				stp = &st->next;
+		}
+	}
+	/* release the whole list, including the strings each entry owns
+	 * (both come from xstrdup/xmalloc or are NULL)
+	 */
+	for (st2 = statelist; st2; st2 = statelist) {
+		statelist = st2->next;
+		free(st2->devname);
+		free(st2->spare_group);
+		free(st2);
+	}
+
+	if (pidfile)
+		unlink(pidfile);
+	return 0;
+}
+
+static int make_daemon(char *pidfile)
+{
+	/* Fork into the background.  The parent reports the child's pid,
+	 * either in 'pidfile' or, when no pidfile is given, on stdout.
+	 * Return:
+	 *   -1 in the forked daemon
+	 *    0 in the parent
+	 *    1 on error
+	 * so a non-negative value becomes the exit code.
+	 */
+	int pid = fork();
+
+	if (pid < 0) {
+		perror("daemonise");
+		return 1;
+	}
+
+	if (pid == 0) {
+		/* child: point stdin/stdout/stderr at /dev/null and
+		 * start a new session
+		 */
+		close(0);
+		open("/dev/null", O_RDWR);
+		dup2(0,1);
+		dup2(0,2);
+		setsid();
+		return -1;
+	}
+
+	/* parent */
+	if (pidfile) {
+		FILE *fp = fopen(pidfile, "w");
+
+		if (fp) {
+			fprintf(fp,"%d\n", pid);
+			fclose(fp);
+		} else
+			perror("cannot create pid file");
+	} else
+		printf("%d\n", pid);
+	return 0;
+}
+
+/* Make sure only one spare-sharing ("autorebuild") monitor runs in scan
+ * mode.  The pid of a previous instance is read from
+ * MDMON_DIR/autorebuild.pid and looked up under /proc.
+ * If that process exists: in scan mode an error is printed and 1 is
+ * returned, otherwise only a warning is printed.
+ * In scan mode our own pid is then written to the pid file; failure to
+ * do so is reported but is not fatal.
+ * Returns 1 if this monitor must not run, 0 otherwise.
+ */
+static int check_one_sharer(int scan)
+{
+	int pid, rv;
+	FILE *fp;
+	char dir[20];
+	char path[100];
+	struct stat buf;
+
+	/* bounded formatting: MDMON_DIR is a build-time setting of
+	 * unknown length, so never write past the fixed-size buffers
+	 */
+	snprintf(path, sizeof(path), "%s/autorebuild.pid", MDMON_DIR);
+	fp = fopen(path, "r");
+	if (fp) {
+		if (fscanf(fp, "%d", &pid) != 1)
+			pid = -1;
+		fclose(fp);
+		snprintf(dir, sizeof(dir), "/proc/%d", pid);
+		rv = stat(dir, &buf);
+		if (rv != -1) {
+			if (scan) {
+				pr_err("Only one autorebuild process allowed in scan mode, aborting\n");
+				return 1;
+			} else {
+				pr_err("Warning: One autorebuild process already running.\n");
+			}
+		}
+	}
+	if (scan) {
+		if (mkdir(MDMON_DIR, S_IRWXU) < 0 &&
+		    errno != EEXIST) {
+			pr_err("Can't create autorebuild.pid file\n");
+		} else {
+			fp = fopen(path, "w");
+			if (!fp)
+				pr_err("Cannot create autorebuild.pidfile\n");
+			else {
+				pid = getpid();
+				fprintf(fp, "%d\n", pid);
+				fclose(fp);
+			}
+		}
+	}
+	return 0;
+}
+
+/* Report one event through every channel configured in 'info'.
+ *   event  event name, e.g. "Fail", "RebuildStarted", "TestMessage"
+ *   dev    name of the md array the event is about
+ *   disc   component device name, or extra information when it starts
+ *          with a space, or NULL
+ *   info   alert program, mail addresses and syslog switch
+ * With no program, no mail address and no syslog the event is printed
+ * on stdout.  Mail is only sent for Fail*, Test*, Spares* and Degrade*
+ * events.
+ */
+static void alert(char *event, char *dev, char *disc, struct alert_info *info)
+{
+	int priority;
+
+	if (!info->alert_cmd && !info->mailaddr && !info->dosyslog) {
+		time_t now = time(0);
+
+		printf("%1.15s: %s on %s %s\n", ctime(&now)+4, event, dev, disc?disc:"unknown device");
+	}
+	if (info->alert_cmd) {
+		int pid = fork();
+		switch(pid) {
+		default:
+			waitpid(pid, NULL, 0);
+			break;
+		case -1:
+			break;
+		case 0:
+			/* the argument list of a variadic exec must end with
+			 * a null *pointer*; a bare NULL may be a plain int
+			 */
+			execl(info->alert_cmd, info->alert_cmd,
+			      event, dev, disc, (char *)NULL);
+			exit(2);
+		}
+	}
+	if (info->mailaddr &&
+	    (strncmp(event, "Fail", 4)==0 ||
+	     strncmp(event, "Test", 4)==0 ||
+	     strncmp(event, "Spares", 6)==0 ||
+	     strncmp(event, "Degrade", 7)==0)) {
+		FILE *mp = popen(Sendmail, "w");
+		if (mp) {
+			FILE *mdstat;
+			char hname[256];
+			gethostname(hname, sizeof(hname));
+			signal(SIGPIPE, SIG_IGN);
+			if (info->mailfrom)
+				fprintf(mp, "From: %s\n", info->mailfrom);
+			else
+				fprintf(mp, "From: %s monitoring <root>\n", Name);
+			fprintf(mp, "To: %s\n", info->mailaddr);
+			fprintf(mp, "Subject: %s event on %s:%s\n\n",
+				event, dev, hname);
+
+			fprintf(mp,
+				"This is an automatically generated mail message from %s\n", Name);
+			fprintf(mp, "running on %s\n\n", hname);
+
+			fprintf(mp,
+				"A %s event had been detected on md device %s.\n\n", event, dev);
+
+			if (disc && disc[0] != ' ')
+				fprintf(mp,
+					"It could be related to component device %s.\n\n", disc);
+			if (disc && disc[0] == ' ')
+				fprintf(mp, "Extra information:%s.\n\n", disc);
+
+			fprintf(mp, "Faithfully yours, etc.\n");
+
+			/* append a copy of /proc/mdstat for context */
+			mdstat = fopen("/proc/mdstat", "r");
+			if (mdstat) {
+				char buf[8192];
+				int n;
+				fprintf(mp,
+					"\nP.S. The /proc/mdstat file currently contains the following:\n\n");
+				while ( (n=fread(buf, 1, sizeof(buf), mdstat)) > 0)
+					n=fwrite(buf, 1, n, mp);
+				fclose(mdstat);
+			}
+			pclose(mp);
+		}
+	}
+
+	/* log the event to syslog maybe */
+	if (info->dosyslog) {
+		/* Log at a different severity depending on the event.
+		 *
+		 * These are the critical events:  */
+		if (strncmp(event, "Fail", 4)==0 ||
+		    strncmp(event, "Degrade", 7)==0 ||
+		    strncmp(event, "DeviceDisappeared", 17)==0)
+			priority = LOG_CRIT;
+		/* Good to know about, but are not failures:
+		 * (the "Spares" test must be == 0; with != 0 every
+		 * remaining event matched and LOG_INFO was only ever
+		 * used for Spares* events)
+		 */
+		else if (strncmp(event, "Rebuild", 7)==0 ||
+			 strncmp(event, "MoveSpare", 9)==0 ||
+			 strncmp(event, "Spares", 6)==0)
+			priority = LOG_WARNING;
+		/* Everything else: */
+		else
+			priority = LOG_INFO;
+
+		if (disc && disc[0] != ' ')
+			syslog(priority,
+			       "%s event detected on md device %s, component device %s", event, dev, disc);
+		else if (disc)
+			syslog(priority,
+			       "%s event detected on md device %s: %s",
+			       event, dev, disc);
+		else
+			syslog(priority,
+			       "%s event detected on md device %s",
+			       event, dev);
+	}
+}
+
+static int check_array(struct state *st, struct mdstat_ent *mdstat,
+ int test, struct alert_info *ainfo,
+ int increments, char *prefer)
+{
+ /* Update the state 'st' to reflect any changes shown in mdstat,
+ * or found by directly examining the array, and return
+ * '1' if the array is degraded, or '0' if it is optimal (or dead).
+ * "Degraded" here means active < raid with no spare present.
+ *
+ * 'mdstat' is the list of /proc/mdstat entries; the entry matching
+ * this array gets its devnm cleared to flag it as "used".
+ * 'test' non-zero sends a "TestMessage" alert for the array.
+ * 'ainfo' describes where alerts are delivered.
+ * 'increments' is the percentage step between RebuildNN alerts.
+ * 'prefer' is handed to map_dev_preferred() when naming
+ * component devices.
+ *
+ * Every failure path increments st->err; "DeviceDisappeared" is
+ * only raised while st->err is still zero.
+ */
+ struct { int state, major, minor; } info[MAX_DISKS];
+ mdu_array_info_t array;
+ struct mdstat_ent *mse = NULL, *mse2;
+ char *dev = st->devname;
+ int fd = -1;
+ int i;
+ int remaining_disks; /* disks still to be found by GET_DISK_INFO */
+ int last_disk; /* one past the last slot that was queried */
+ int new_array = 0; /* set when the array reappeared after an error */
+
+ if (test)
+ alert("TestMessage", dev, NULL, ainfo);
+ if (st->devnm[0])
+ fd = open("/sys/block", O_RDONLY|O_DIRECTORY);
+ if (fd >= 0) {
+ /* Don't open the device unless it is present and
+ * active in sysfs.
+ */
+ char buf[10];
+ close(fd);
+ fd = sysfs_open(st->devnm, NULL, "array_state");
+ if (fd < 0 ||
+ read(fd, buf, 10) < 5 ||
+ strncmp(buf,"clear",5) == 0 ||
+ strncmp(buf,"inact",5) == 0) {
+ if (fd >= 0)
+ close(fd);
+ fd = sysfs_open(st->devnm, NULL, "level");
+ if (fd < 0 || read(fd, buf, 10) != 0) {
+ if (fd >= 0)
+ close(fd);
+ if (!st->err)
+ alert("DeviceDisappeared", dev, NULL, ainfo);
+ st->err++;
+ return 0;
+ }
+ }
+ close(fd);
+ }
+ fd = open(dev, O_RDONLY);
+ if (fd < 0) {
+ if (!st->err)
+ alert("DeviceDisappeared", dev, NULL, ainfo);
+ st->err++;
+ return 0;
+ }
+ fcntl(fd, F_SETFD, FD_CLOEXEC);
+ if (ioctl(fd, GET_ARRAY_INFO, &array)<0) {
+ if (!st->err)
+ alert("DeviceDisappeared", dev, NULL, ainfo);
+ st->err++;
+ close(fd);
+ return 0;
+ }
+ /* It's much easier to list what array levels can't
+ * have a device disappear than all of them that can
+ */
+ if (array.level == 0 || array.level == -1) {
+ if (!st->err && !st->from_config)
+ alert("DeviceDisappeared", dev, " Wrong-Level", ainfo);
+ st->err++;
+ close(fd);
+ return 0;
+ }
+ if (st->devnm[0] == 0)
+ strcpy(st->devnm, fd2devnm(fd));
+
+ for (mse2 = mdstat ; mse2 ; mse2=mse2->next)
+ if (strcmp(mse2->devnm, st->devnm) == 0) {
+ mse2->devnm[0] = 0; /* flag it as "used" */
+ mse = mse2;
+ }
+
+ if (!mse) {
+ /* duplicated array in statelist
+ * or re-created after reading mdstat*/
+ st->err++;
+ close(fd);
+ return 0;
+ }
+ /* this array is in /proc/mdstat */
+ if (array.utime == 0)
+ /* external arrays don't update utime, so
+ * just make sure it is always different. */
+ array.utime = st->utime + 1;;
+
+ if (st->err) {
+ /* New array appeared where previously had an error */
+ st->err = 0;
+ st->percent = RESYNC_NONE;
+ new_array = 1;
+ alert("NewArray", st->devname, NULL, ainfo);
+ }
+
+ if (st->utime == array.utime && /* nothing changed since the last check */
+ st->failed == array.failed_disks &&
+ st->working == array.working_disks &&
+ st->spare == array.spare_disks &&
+ (mse == NULL || (
+ mse->percent == st->percent
+ ))) {
+ close(fd);
+ if ((st->active < st->raid) && st->spare == 0)
+ return 1;
+ else
+ return 0;
+ }
+ if (st->utime == 0 && /* new array */
+ mse->pattern && strchr(mse->pattern, '_') /* degraded */
+ )
+ alert("DegradedArray", dev, NULL, ainfo);
+
+ if (st->utime == 0 && /* new array */
+ st->expected_spares > 0 &&
+ array.spare_disks < st->expected_spares)
+ alert("SparesMissing", dev, NULL, ainfo);
+ if (st->percent < 0 && st->percent != RESYNC_UNKNOWN &&
+ mse->percent >= 0)
+ alert("RebuildStarted", dev, NULL, ainfo);
+ if (st->percent >= 0 &&
+ mse->percent >= 0 &&
+ (mse->percent / increments) > (st->percent / increments)) {
+ char percentalert[15]; // "RebuildNN" (10 chars) or "RebuildStarted" (15 chars)
+
+ if((mse->percent / increments) == 0)
+ snprintf(percentalert, sizeof(percentalert), "RebuildStarted");
+ else
+ snprintf(percentalert, sizeof(percentalert), "Rebuild%02d", mse->percent);
+
+ alert(percentalert, dev, NULL, ainfo);
+ }
+
+ if (mse->percent == RESYNC_NONE &&
+ st->percent >= 0) {
+ /* Rebuild/sync/whatever just finished.
+ * If there is a number in /mismatch_cnt,
+ * we should report that.
+ */
+ struct mdinfo *sra =
+ sysfs_read(-1, st->devnm, GET_MISMATCH);
+ if (sra && sra->mismatch_cnt > 0) {
+ char cnt[80];
+ snprintf(cnt, sizeof(cnt),
+ " mismatches found: %d (on raid level %d)",
+ sra->mismatch_cnt, array.level);
+ alert("RebuildFinished", dev, cnt, ainfo);
+ } else
+ alert("RebuildFinished", dev, NULL, ainfo);
+ if (sra)
+ free(sra);
+ }
+ st->percent = mse->percent;
+
+ remaining_disks = array.nr_disks;
+ for (i=0; i<MAX_DISKS && remaining_disks > 0; /* snapshot each slot while fd is open */
+ i++) {
+ mdu_disk_info_t disc;
+ disc.number = i;
+ if (ioctl(fd, GET_DISK_INFO, &disc) >= 0) {
+ info[i].state = disc.state;
+ info[i].major = disc.major;
+ info[i].minor = disc.minor;
+ if (disc.major || disc.minor)
+ remaining_disks --;
+ } else
+ info[i].major = info[i].minor = 0;
+ }
+ last_disk = i;
+
+ if (mse->metadata_version &&
+ strncmp(mse->metadata_version, "external:", 9) == 0 &&
+ is_subarray(mse->metadata_version+9)) {
+ char *sl;
+ strcpy(st->parent_devnm, /* text after "external:" plus one more character */
+ mse->metadata_version+10);
+ sl = strchr(st->parent_devnm, '/');
+ if (sl)
+ *sl = 0;
+ } else
+ st->parent_devnm[0] = 0;
+ if (st->metadata == NULL &&
+ st->parent_devnm[0] == 0)
+ st->metadata = super_by_fd(fd, NULL);
+
+ close(fd);
+
+ for (i=0; i<MAX_DISKS; i++) { /* compare each slot with its recorded state */
+ mdu_disk_info_t disc = {0,0,0,0,0};
+ int newstate=0;
+ int change; /* state bits that differ from the recorded state */
+ char *dv = NULL;
+ disc.number = i;
+ if (i < last_disk &&
+ (info[i].major || info[i].minor)) {
+ newstate = info[i].state;
+ dv = map_dev_preferred(
+ info[i].major, info[i].minor, 1,
+ prefer);
+ disc.state = newstate;
+ disc.major = info[i].major;
+ disc.minor = info[i].minor;
+ } else
+ newstate = (1 << MD_DISK_REMOVED);
+
+ if (dv == NULL && st->devid[i])
+ dv = map_dev_preferred(
+ major(st->devid[i]),
+ minor(st->devid[i]), 1, prefer);
+ change = newstate ^ st->devstate[i];
+ if (st->utime && change && !st->err && !new_array) {
+ if ((st->devstate[i]&change)&(1<<MD_DISK_SYNC))
+ alert("Fail", dev, dv, ainfo);
+ else if ((newstate & (1<<MD_DISK_FAULTY)) &&
+ (disc.major || disc.minor) &&
+ st->devid[i] == makedev(disc.major, disc.minor))
+ alert("FailSpare", dev, dv, ainfo);
+ else if ((newstate&change)&(1<<MD_DISK_SYNC))
+ alert("SpareActive", dev, dv, ainfo);
+ }
+ st->devstate[i] = newstate;
+ st->devid[i] = makedev(disc.major, disc.minor);
+ }
+ st->active = array.active_disks;
+ st->working = array.working_disks;
+ st->spare = array.spare_disks;
+ st->failed = array.failed_disks;
+ st->utime = array.utime;
+ st->raid = array.raid_disks;
+ st->err = 0;
+ if ((st->active < st->raid) && st->spare == 0)
+ return 1;
+ return 0;
+}
+
+/* Add to '*statelist' every array listed in 'mdstat' that is not being
+ * monitored yet.  check_array() clears the devnm of entries it has
+ * matched, so only entries with a non-empty devnm are considered;
+ * raid0 and linear arrays are skipped, entries without a level
+ * (containers) are kept.
+ * New entries start with err = 1 and from_auto = 1.
+ * 'test' non-zero sends a "TestMessage" alert for each new array.
+ * Returns 1 if at least one array was added, else 0.
+ */
+static int add_new_arrays(struct mdstat_ent *mdstat, struct state **statelist,
+			  int test, struct alert_info *info)
+{
+	struct mdstat_ent *mse;
+	int new_found = 0;
+	char *name;
+
+	for (mse=mdstat; mse; mse=mse->next)
+		if (mse->devnm[0] &&
+		    (!mse->level  || /* retrieve containers */
+		     (strcmp(mse->level, "raid0") != 0 &&
+		      strcmp(mse->level, "linear") != 0))
+			) {
+			struct state *st = xcalloc(1, sizeof *st);
+			mdu_array_info_t array;
+			int fd;
+
+			name = get_md_name(mse->devnm);
+			if (!name) {
+				free(st);
+				continue;
+			}
+
+			st->devname = xstrdup(name);
+			if ((fd = open(st->devname, O_RDONLY)) < 0 ||
+			    ioctl(fd, GET_ARRAY_INFO, &array)< 0) {
+				/* no such array */
+				if (fd >=0) close(fd);
+				put_md_name(st->devname);
+				free(st->devname);
+				if (st->metadata) {
+					st->metadata->ss->free_super(st->metadata);
+					free(st->metadata);
+				}
+				free(st);
+				continue;
+			}
+			close(fd);
+			st->next = *statelist;
+			st->err = 1;
+			st->from_auto = 1;
+			strcpy(st->devnm, mse->devnm);
+			st->percent = RESYNC_UNKNOWN;
+			st->expected_spares = -1;
+			if (mse->metadata_version &&
+			    strncmp(mse->metadata_version, "external:", 9) == 0 &&
+			    is_subarray(mse->metadata_version+9)) {
+				char *sl;
+				strcpy(st->parent_devnm,
+					mse->metadata_version+10);
+				sl = strchr(st->parent_devnm, '/');
+				/* strchr() returns NULL when there is no '/';
+				 * guard the write just as check_array() does
+				 */
+				if (sl)
+					*sl = 0;
+			} else
+				st->parent_devnm[0] = 0;
+			*statelist = st;
+			if (test)
+				alert("TestMessage", st->devname, NULL, info);
+			new_found = 1;
+		}
+	return new_found;
+}
+
+/* Store in '*sizep' the smallest spare size acceptable to the array
+ * described by 'st'.  When no metadata handler is known, or the handler
+ * has no min_acceptable_spare_size method, 0 is stored.
+ * Returns 0 on success, 1 if the array cannot be opened or its
+ * metadata cannot be loaded.
+ */
+static int get_min_spare_size_required(struct state *st, unsigned long long *sizep)
+{
+	struct supertype *meta = st->metadata;
+	int fd;
+
+	if (meta == NULL || meta->ss->min_acceptable_spare_size == NULL) {
+		*sizep = 0;
+		return 0;
+	}
+
+	fd = open(st->devname, O_RDONLY);
+	if (fd < 0)
+		return 1;
+	/* external metadata is loaded per container, native per device */
+	if (meta->ss->external)
+		meta->ss->load_container(meta, fd, st->devname);
+	else
+		meta->ss->load_super(meta, fd, st->devname);
+	close(fd);
+
+	/* the load result is judged by whether a superblock is present */
+	if (meta->sb == NULL)
+		return 1;
+
+	*sizep = meta->ss->min_acceptable_spare_size(meta);
+	meta->ss->free_super(meta);
+	return 0;
+}
+
+/* Decide whether array 'from' may give up a spare for array 'to'.
+ * Returns 1 if 'from' is an acceptable donor, 0 otherwise.
+ */
+static int check_donor(struct state *from, struct state *to)
+{
+	struct state *sub;
+
+	if (from == to)
+		return 0;
+	if (from->parent)
+		/* Cannot move from a member */
+		return 0;
+	if (from->err)
+		return 0;
+	if (!from->metadata)
+		/* No metadata handler was recorded for this entry (e.g. a
+		 * subarray whose container is not being monitored), so
+		 * nothing below can be evaluated; it cannot be a donor.
+		 */
+		return 0;
+	for (sub = from->subarray; sub; sub = sub->subarray)
+		/* If source array has degraded subarrays, don't
+		 * remove anything
+		 */
+		if (sub->active < sub->raid)
+			return 0;
+	if (from->metadata->ss->external == 0)
+		/* a native array must itself be fully active */
+		if (from->active < from->raid)
+			return 0;
+	if (from->spare <= 0)
+		return 0;
+	return 1;
+}
+
+/* Pick a spare of native array 'from' that may be moved to 'to'.
+ * Slots from from->raid upwards are scanned; a slot qualifies when it
+ * holds a device whose recorded state is 0, passes the partition and
+ * minimum-size checks, and whose policy matches 'domlist'.
+ * Returns the device number of the first match, or 0 if there is none.
+ */
+static dev_t choose_spare(struct state *from, struct state *to,
+			struct domainlist *domlist, unsigned long long min_size)
+{
+	dev_t chosen = 0;
+	int slot;
+
+	for (slot = from->raid; !chosen && slot < MAX_DISKS; slot++) {
+		struct dev_policy *pol;
+		unsigned long long dev_size;
+
+		if (!(from->devid[slot] > 0) || from->devstate[slot] != 0)
+			continue;
+
+		/* external metadata is not offered devices that
+		 * test_partition_from_id() reports as partitions
+		 */
+		if (to->metadata->ss->external &&
+		    test_partition_from_id(from->devid[slot]))
+			continue;
+
+		/* too small for the destination */
+		if (min_size &&
+		    dev_size_from_id(from->devid[slot], &dev_size) &&
+		    dev_size < min_size)
+			continue;
+
+		pol = devid_policy(from->devid[slot]);
+		if (from->spare_group)
+			pol_add(&pol, pol_domain,
+				from->spare_group, NULL);
+		if (domain_test(domlist, pol, to->metadata->ss->name) == 1)
+			chosen = from->devid[slot];
+		dev_policy_free(pol);
+	}
+	return chosen;
+}
+
+static dev_t container_choose_spare(struct state *from, struct state *to,
+ struct domainlist *domlist,
+ unsigned long long min_size, int active)
+{
+ /* This is similar to choose_spare, but we cannot trust devstate,
+ * so we need to read the metadata instead
+ *
+ * 'from' is the container searched for a spare, 'to' the array
+ * that needs one. 'domlist' and 'min_size' restrict which
+ * devices are acceptable. 'active' is only used when
+ * from == to: it is the active-disk count seen earlier and is
+ * compared with the count found in the freshly loaded metadata.
+ *
+ * Returns the device number of a suitable spare, or 0 if none
+ * was found or the container could not be read. When
+ * from == to, 1 is returned to say that no spare needs to be
+ * looked for.
+ */
+ struct mdinfo *list;
+ struct supertype *st = from->metadata;
+ int fd = open(from->devname, O_RDONLY);
+ int err;
+ dev_t dev = 0;
+
+ if (fd < 0)
+ return 0;
+ if (!st->ss->getinfo_super_disks) {
+ close(fd);
+ return 0;
+ }
+
+ err = st->ss->load_container(st, fd, NULL);
+ close(fd);
+ if (err)
+ return 0;
+
+ if (from == to) {
+ /* We must check if number of active disks has not increased
+ * since ioctl in main loop. mdmon may have added spare
+ * to subarray. If so we do not need to look for more spares
+ * so return non zero value */
+ int active_cnt = 0; /* disks that are in sync and not faulty */
+ struct mdinfo *dp;
+ list = st->ss->getinfo_super_disks(st);
+ if (!list) {
+ st->ss->free_super(st);
+ return 1;
+ }
+ dp = list->devs;
+ while (dp) {
+ if (dp->disk.state & (1<<MD_DISK_SYNC) &&
+ !(dp->disk.state & (1<<MD_DISK_FAULTY)))
+ active_cnt++;
+ dp = dp->next;
+ }
+ sysfs_free(list);
+ if (active < active_cnt) {
+ /* Spare just activated.*/
+ st->ss->free_super(st);
+ return 1;
+ }
+ }
+
+ /* We only need one spare so full list not needed */
+ list = container_choose_spares(st, min_size, domlist, from->spare_group,
+ to->metadata->ss->name, 1);
+ if (list) {
+ struct mdinfo *disks = list->devs;
+ if (disks)
+ dev = makedev(disks->disk.major, disks->disk.minor);
+ sysfs_free(list);
+ }
+ st->ss->free_super(st); /* release what load_container() read */
+ return dev;
+}
+
+/* For every array in 'statelist' that is degraded and has no spare,
+ * try to find a donor array holding a suitable spare and move that
+ * spare over.  A "MoveSpare" alert is raised through 'info' for each
+ * successful move.
+ */
+static void try_spare_migration(struct state *statelist, struct alert_info *info)
+{
+	struct state *from;
+	struct state *st;
+
+	link_containers_with_subarrays(statelist);
+	for (st = statelist; st; st = st->next)
+		if (st->active < st->raid &&
+		    st->spare == 0 && !st->err) {
+			struct domainlist *domlist = NULL;
+			int d;
+			struct state *to = st;
+			unsigned long long min_size;
+
+			if (to->parent_devnm[0] && !to->parent)
+				/* subarray monitored without parent container
+				 * we can't move spares here */
+				continue;
+
+			if (to->parent)
+				/* member of a container */
+				to = to->parent;
+
+			if (!to->metadata)
+				/* No metadata handler is known for the
+				 * destination, so neither its type nor its
+				 * domain can be determined.  Skip it rather
+				 * than dereference a NULL pointer below.
+				 */
+				continue;
+
+			if (get_min_spare_size_required(to, &min_size))
+				continue;
+			if (to->metadata->ss->external) {
+				/* We must make sure there is
+				 * no suitable spare in container already.
+				 * If there is we don't add more */
+				dev_t devid = container_choose_spare(
+					to, to, NULL, min_size, st->active);
+				if (devid > 0)
+					continue;
+			}
+			for (d = 0; d < MAX_DISKS; d++)
+				if (to->devid[d])
+					domainlist_add_dev(&domlist,
+							   to->devid[d],
+							   to->metadata->ss->name);
+			if (to->spare_group)
+				domain_add(&domlist, to->spare_group);
+			/*
+			 * No spare migration if the destination
+			 * has no domain. Skip this array.
+			 */
+			if (!domlist)
+				continue;
+			for (from=statelist ; from ; from=from->next) {
+				dev_t devid;
+				if (!check_donor(from, to))
+					continue;
+				if (from->metadata->ss->external)
+					devid = container_choose_spare(
+						from, to, domlist, min_size, 0);
+				else
+					devid = choose_spare(from, to, domlist,
+							     min_size);
+				if (devid > 0
+				    && move_spare(from->devname, to->devname, devid)) {
+					alert("MoveSpare", to->devname, from->devname, info);
+					break;
+				}
+			}
+			domain_free(domlist);
+		}
+}
+
+/* Connect external-metadata subarrays in the state list with their
+ * containers.
+ * The tree is always rebuilt completely from scratch, as that is
+ * safest considering the possibility of entries disappearing or
+ * changing.
+ * A container is an entry without error and with an empty parent_devnm
+ * whose devnm equals the subarray's parent_devnm; each subarray is
+ * pushed onto the front of its container's subarray chain.
+ */
+static void link_containers_with_subarrays(struct state *list)
+{
+	struct state *member;
+	struct state *cand;
+
+	/* forget all links made on a previous pass */
+	for (member = list; member; member = member->next) {
+		member->parent = NULL;
+		member->subarray = NULL;
+	}
+
+	for (member = list; member; member = member->next) {
+		if (member->parent_devnm[0] == 0)
+			continue;
+		for (cand = list; cand; cand = cand->next) {
+			if (cand->err || cand->parent_devnm[0] != 0)
+				continue;
+			if (strcmp(cand->devnm, member->parent_devnm) != 0)
+				continue;
+			member->parent = cand;
+			member->subarray = cand->subarray;
+			cand->subarray = member;
+			break;
+		}
+	}
+}
+
+/* Not really Monitor but ...
+ * Wait until /proc/mdstat no longer reports resync/recovery activity
+ * for 'dev'.
+ * Returns 2 if 'dev' cannot be stat()ed, 1 if the very first check
+ * already found nothing in progress (or the device is not listed in
+ * mdstat), and 0 once at least one wait was needed.
+ */
+int Wait(char *dev)
+{
+ struct stat stb;
+ char devnm[32];
+ int rv = 1; /* stays 1 until we have had to wait at least once */
+ int frozen_remaining = 3; /* times "frozen" is still treated as busy */
+
+ if (stat(dev, &stb) != 0) {
+ pr_err("Cannot find %s: %s\n", dev,
+ strerror(errno));
+ return 2;
+ }
+ strcpy(devnm, stat2devnm(&stb));
+
+ while(1) {
+ struct mdstat_ent *ms = mdstat_read(1, 0);
+ struct mdstat_ent *e;
+
+ for (e=ms ; e; e=e->next)
+ if (strcmp(e->devnm, devnm) == 0)
+ break;
+
+ if (e && e->percent == RESYNC_NONE) {
+ /* We could be in the brief pause before something
+ * starts. /proc/mdstat doesn't show that, but
+ * sync_action does.
+ */
+ struct mdinfo mdi;
+ char buf[21];
+ sysfs_init(&mdi, -1, devnm);
+ if (sysfs_get_str(&mdi, NULL, "sync_action",
+ buf, 20) > 0 &&
+ strcmp(buf,"idle\n") != 0) {
+ e->percent = RESYNC_UNKNOWN;
+ if (strcmp(buf, "frozen\n") == 0) {
+ if (frozen_remaining == 0)
+ e->percent = RESYNC_NONE;
+ else
+ frozen_remaining -= 1;
+ }
+ }
+ }
+ if (!e || e->percent == RESYNC_NONE) {
+ if (e && e->metadata_version &&
+ strncmp(e->metadata_version, "external:", 9) == 0) {
+ if (is_subarray(&e->metadata_version[9]))
+ ping_monitor(&e->metadata_version[9]);
+ else
+ ping_monitor(devnm);
+ }
+ free_mdstat(ms);
+ return rv;
+ }
+ free_mdstat(ms);
+ rv = 0;
+ mdstat_wait(5);
+ }
+}
+
+#ifndef MDASSEMBLE
+
+static char *clean_states[] = { /* array_state words WaitClean() stops waiting on */
+ "clear", "inactive", "readonly", "read-auto", "clean", NULL };
+
+int WaitClean(char *dev, int sock, int verbose)
+{ /* Wait for array 'dev' to reach one of the clean_states.
+ * No wait is done (and 0 is returned) for linear, multipath and
+ * level-0 arrays, for arrays that are not subarrays, and when the
+ * safe mode delay is 0.
+ * 'sock' is handed to fping_monitor(); 'verbose' non-zero enables
+ * error messages.
+ * Returns 0 when no wait was needed, or when the wait ended without
+ * error and a monitor ping succeeded; 1 if 'dev' cannot be opened
+ * or the wait/ping failed.
+ * NOTE: failure to read the sysfs attributes also returns 0.
+ */
+ int fd;
+ struct mdinfo *mdi;
+ int rv = 1;
+ char devnm[32];
+
+ fd = open(dev, O_RDONLY);
+ if (fd < 0) {
+ if (verbose)
+ pr_err("Couldn't open %s: %s\n", dev, strerror(errno));
+ return 1;
+ }
+
+ strcpy(devnm, fd2devnm(fd));
+ mdi = sysfs_read(fd, devnm, GET_VERSION|GET_LEVEL|GET_SAFEMODE);
+ if (!mdi) {
+ if (verbose)
+ pr_err("Failed to read sysfs attributes for %s\n", dev);
+ close(fd);
+ return 0;
+ }
+
+ switch(mdi->array.level) {
+ case LEVEL_LINEAR:
+ case LEVEL_MULTIPATH:
+ case 0:
+ /* safemode delay is irrelevant for these levels */
+ rv = 0;
+ }
+
+ /* for internal metadata the kernel handles the final clean
+ * transition, containers can never be dirty
+ */
+ if (!is_subarray(mdi->text_version))
+ rv = 0;
+
+ /* safemode disabled ? */
+ if (mdi->safe_mode_delay == 0)
+ rv = 0;
+
+ if (rv) {
+ int state_fd = sysfs_open(fd2devnm(fd), NULL, "array_state");
+ char buf[20];
+ int delay = 5000;
+
+ /* minimize the safe_mode_delay and prepare to wait up to 5s
+ * for writes to quiesce
+ */
+ sysfs_set_safemode(mdi, 1);
+
+ /* wait for array_state to be clean */
+ while (1) {
+ rv = read(state_fd, buf, sizeof(buf));
+ if (rv < 0)
+ break;
+ if (sysfs_match_word(buf, clean_states) <= 4) /* presumably the index of the matched word; 0..4 are the clean states - TODO confirm */
+ break;
+ rv = sysfs_wait(state_fd, &delay);
+ if (rv < 0 && errno != EINTR)
+ break;
+ lseek(state_fd, 0, SEEK_SET);
+ }
+ if (rv < 0)
+ rv = 1;
+ else if (fping_monitor(sock) == 0 ||
+ ping_monitor(mdi->text_version) == 0) {
+ /* we need to ping to close the window between array
+ * state transitioning to clean and the metadata being
+ * marked clean
+ */
+ rv = 0;
+ } else
+ rv = 1;
+ if (rv && verbose)
+ pr_err("Error waiting for %s to be clean\n",
+ dev);
+
+ /* restore the original safe_mode_delay */
+ sysfs_set_safemode(mdi, mdi->safe_mode_delay);
+ close(state_fd);
+ }
+
+ sysfs_free(mdi);
+ close(fd);
+
+ return rv;
+}
+#endif /* MDASSEMBLE */
diff --git a/Query.c b/Query.c
new file mode 100644
index 0000000..fbc1d10
--- /dev/null
+++ b/Query.c
@@ -0,0 +1,126 @@
+/*
+ * mdadm - manage Linux "md" devices aka RAID arrays.
+ *
+ * Copyright (C) 2002-2009 Neil Brown <neilb@suse.de>
+ *
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * Author: Neil Brown
+ * Email: <neilb@suse.de>
+ */
+
+#include "mdadm.h"
+#include "md_p.h"
+#include "md_u.h"
+
+int Query(char *dev)
+{
+ /* Give a brief description of the device,
+ * whether it is an md device and whether it has
+ * a superblock.
+ * The first line printed describes 'dev' as an md array (or says
+ * why it is not one); if a superblock can be loaded from 'dev',
+ * a second line describes it as a component of an array.
+ * Returns 1 if 'dev' cannot be opened, otherwise 0.
+ */
+ int fd = open(dev, O_RDONLY);
+ int vers;
+ int ioctlerr; /* errno from GET_ARRAY_INFO, or 0 on success */
+ int superror; /* 0 when a superblock was loaded from dev */
+ struct mdinfo info;
+ mdu_array_info_t array;
+ struct supertype *st = NULL;
+
+ unsigned long long larray_size; /* only set and used when vers>=9000 and the ioctl worked */
+ struct stat stb;
+ char *mddev;
+ mdu_disk_info_t disc;
+ char *activity;
+
+ if (fd < 0){
+ pr_err("cannot open %s: %s\n",
+ dev, strerror(errno));
+ return 1;
+ }
+
+ vers = md_get_version(fd);
+ if (ioctl(fd, GET_ARRAY_INFO, &array)<0)
+ ioctlerr = errno;
+ else ioctlerr = 0;
+
+ fstat(fd, &stb);
+
+ if (vers>=9000 && !ioctlerr) {
+ if (!get_dev_size(fd, NULL, &larray_size))
+ larray_size = 0;
+ }
+
+ if (vers < 0)
+ printf("%s: is not an md array\n", dev);
+ else if (vers < 9000)
+ printf("%s: is an md device, but kernel cannot provide details\n", dev);
+ else if (ioctlerr == ENODEV)
+ printf("%s: is an md device which is not active\n", dev);
+ else if (ioctlerr)
+ printf("%s: is an md device, but gives \"%s\" when queried\n",
+ dev, strerror(ioctlerr));
+ else {
+ printf("%s: %s %s %d devices, %d spare%s. Use mdadm --detail for more detail.\n",
+ dev,
+ human_size_brief(larray_size,IEC),
+ map_num(pers, array.level),
+ array.raid_disks,
+ array.spare_disks, array.spare_disks==1?"":"s");
+ }
+ st = guess_super(fd);
+ if (st && st->ss->compare_super != NULL)
+ superror = st->ss->load_super(st, fd, dev);
+ else
+ superror = -1;
+ close(fd);
+ if (superror == 0) {
+ /* array might be active... */
+ int uuid[4];
+ struct map_ent *me, *map = NULL;
+ st->ss->getinfo_super(st, &info, NULL);
+ st->ss->uuid_from_super(st, uuid);
+ me = map_by_uuid(&map, uuid);
+ if (me) {
+ mddev = me->path;
+ disc.number = info.disk.number;
+ activity = "undetected";
+ if (mddev && (fd = open(mddev, O_RDONLY))>=0) {
+ if (md_get_version(fd) >= 9000 &&
+ ioctl(fd, GET_ARRAY_INFO, &array)>= 0) {
+ if (ioctl(fd, GET_DISK_INFO, &disc) >= 0 && /* "active" only if that slot holds this very device */
+ makedev((unsigned)disc.major,(unsigned)disc.minor) == stb.st_rdev)
+ activity = "active";
+ else
+ activity = "mismatch";
+ }
+ close(fd);
+ }
+ } else {
+ activity = "inactive";
+ mddev = "array";
+ }
+ printf("%s: device %d in %d device %s %s %s. Use mdadm --examine for more detail.\n",
+ dev,
+ info.disk.number, info.array.raid_disks,
+ activity,
+ map_num(pers, info.array.level),
+ mddev);
+ if (st->ss == &super0)
+ put_md_name(mddev);
+ }
+ return 0;
+}
diff --git a/README.initramfs b/README.initramfs
new file mode 100644
index 0000000..8f9b8dd
--- /dev/null
+++ b/README.initramfs
@@ -0,0 +1,123 @@
+Assembling md arrays at boot time.
+---------------------------------
+December 2005
+
+These notes apply to 2.6 kernels only and, in some cases,
+to 2.6.15 or later.
+
+Md arrays can be assembled at boot time using the 'autodetect' functionality
+which is triggered by storing components of an array in partitions of type
+'fd' - Linux Raid Autodetect.
+They can also be assembled by specifying the component devices in a
+kernel parameter such as
+ md=0,/dev/sda,/dev/sdb
+In this case, /dev/md0 will be assembled (because of the 0) from the listed
+devices.
+
+These mechanisms, while useful, do not provide complete functionality
+and are unlikely to be extended. The preferred way to assemble md
+arrays at boot time is using 'mdadm' or 'mdassemble' (which is a
+trimmed-down mdadm). To assemble an array which contains the root
+filesystem, mdadm needs to be run before that filesystem is mounted,
+and so needs to be run from an initial-ram-fs. How this can be made to
+work is the primary focus of this document.
+
+It should be noted up front that only the array containing the root
+filesystem should be assembled from the initramfs. Any other arrays
+should be assembled under the control of files on the main filesystem
+as this enhances flexibility and maintainability.
+
+A minimal initramfs for assembling md arrays can be created using 3
+files and one directory. These are:
+
+/bin Directory
+/bin/mdadm statically linked mdadm binary
+/bin/busybox statically linked busybox binary
+/bin/sh hard link to /bin/busybox
+/init a shell script which calls mdadm appropriately.
+
+An example init script is:
+
+==============================================
+#!/bin/sh
+
+echo 'Auto-assembling boot md array'
+mkdir /proc
+mount -t proc proc /proc
+if [ -n "$rootuuid" ]
+then arg=--uuid=$rootuuid
+elif [ -n "$mdminor" ]
+then arg=--super-minor=$mdminor
+else arg=--super-minor=0
+fi
+echo "Using $arg"
+mdadm -Acpartitions $arg --auto=part /dev/mda
+cd /
+mount /dev/mda1 /root || mount /dev/mda /root
+umount /proc
+cd /root
+exec chroot . /sbin/init < /dev/console > /dev/console 2>&1
+=============================================
+
+This could certainly be extended, or merged into a larger init script.
+Though tested and in production use, it is not presented here as
+"The Right Way" to do it, but as a useful example.
+Some key points are:
+
+ /proc needs to be mounted so that /proc/partitions can be accessed
+ by mdadm, and so that /proc/filesystems can be accessed by mount.
+
+ The uuid of the array can be passed in as a kernel parameter
+ (rootuuid). As the kernel doesn't use this value, it is made available
+ in the environment for /init.
+
+ If no uuid is given, we default to md0 (--super-minor=0), which is
+ commonly used to store the root filesystem. This may not work in
+ all situations.
+
+ We assemble the array as a partitionable array (/dev/mda) even if we
+ end up using the whole array. There is no cost in using the partitionable
+ interface, and in this context it is simpler.
+
+ We try mounting both /dev/mda1 and /dev/mda as they are the most likely
+ parts of the array to contain the root filesystem.
+
+ The --auto flag is given to mdadm so that it will create /dev/md*
+ files auto