diff options
author | Yann Soubeyrand <yann-externe.soubeyrand@edf.fr> | 2015-11-10 10:18:39 +0000 |
---|---|---|
committer | Yann Soubeyrand <yann-externe.soubeyrand@edf.fr> | 2015-11-10 10:18:39 +0000 |
commit | 910e28f2b4651d0e8533c577d3bc65338cea69f8 (patch) | |
tree | fdca1d7c60e15fb527a171285d8917f68699f64a |
mdadm (3.3.4-1.1) unstable; urgency=medium
* Non-maintainer upload.
* disable-incremental-assembly.patch: incremental assembly prevents booting
in degraded mode (Closes: #784070)
# imported from the archive
312 files changed, 79815 insertions, 0 deletions
diff --git a/.gitignore b/.gitignore new file mode 100644 index 00000000..217fe76d --- /dev/null +++ b/.gitignore @@ -0,0 +1,18 @@ +/*.o +/*.man +/*-stamp +/mdadm +/mdadm.8 +/mdadm.udeb +/mdassemble +/mdmon +/swap_super +/test_stripe +/TAGS +/mdadm.O2 +/mdadm.Os +/mdadm.static +/mdassemble.auto +/mdassemble.static +/mdmon.O2 +/raid6check diff --git a/ANNOUNCE-3.0 b/ANNOUNCE-3.0 new file mode 100644 index 00000000..f2d4f847 --- /dev/null +++ b/ANNOUNCE-3.0 @@ -0,0 +1,98 @@ +Subject: ANNOUNCE: mdadm 3.0 - A tool for managing Soft RAID under Linux + +I am pleased to (finally) announce the availability of + mdadm version 3.0 + +It is available at the usual places: + countrycode=xx. + http://www.${countrycode}kernel.org/pub/linux/utils/raid/mdadm/ +and via git at + git://neil.brown.name/mdadm + http://neil.brown.name/git?p=mdadm + + +This is a major new version and as such should be treated with some +caution. However it has seen substantial testing and is considerred +to be ready for wide use. + + +The significant change which justifies the new major version number is +that mdadm can now handle metadata updates entirely in userspace. +This allows mdadm to support metadata formats that the kernel knows +nothing about. + +Currently two such metadata formats are supported: + - DDF - The SNIA standard format + - Intel Matrix - The metadata used by recent Intel ICH controlers. + +Also the approach to device names has changed significantly. + +If udev is installed on the system, mdadm will not create any devices +in /dev. Rather it allows udev to manage those devices. For this to work +as expected, the included udev rules file should be installed. + +If udev is not installed, mdadm will still create devices and symlinks +as required, and will also remove them when the array is stopped. + +mdadm now requires all devices which do not have a standard name (mdX +or md_dX) to live in the directory /dev/md/. Names in this directory +will always be created as symlinks back to the standard name in /dev. + +The man pages contain some information about the new externally managed +metadata. However see below for a more condensed overview. + +Externally managed metadata introduces the concept of a 'container'. +A container is a collection of (normally) physical devices which have +a common set of metadata. A container is assembled as an md array, but +is left 'inactive'. + +A container can contain one or more data arrays. These are composed from +slices (partitions?) of various devices in the container. + +For example, a 5 devices DDF set can container a RAID1 using the first +half of two devices, a RAID0 using the first half of the remain 3 devices, +and a RAID5 over thte second half of all 5 devices. + +A container can be created with + + mdadm --create /dev/md0 -e ddf -n5 /dev/sd[abcde] + +or "-e imsm" to use the Intel Matrix Storage Manager. + +An array can be created within a container either by giving the +container name and the only member: + + mdadm -C /dev/md1 --level raid1 -n 2 /dev/md0 + +or by listing the component devices + + mdadm -C /dev/md2 --level raid0 -n 3 /dev/sd[cde] + +To assemble a container, it is easiest just to pass each device in turn to +mdadm -I + + for i in /dev/sd[abcde] + do mdadm -I $i + done + +This will assemble the container and the components. + +Alternately the container can be assembled explicitly + + mdadm -A /dev/md0 /dev/sd[abcde] + +Then the components can all be assembled with + + mdadm -I /dev/md0 + +For each container, mdadm will start a program called "mdmon" which will +monitor the array and effect any metadata updates needed. The array is +initially assembled readonly. It is up to "mdmon" to mark the metadata +as 'dirty' and which the array to 'read-write'. + +The version 0.90 and 1.x metadata formats supported by previous +versions for mdadm are still supported and the kernel still performs +the same updates it use to. The new 'mdmon' approach is only used for +newly introduced metadata types. + +NeilBrown 2nd June 2009 diff --git a/ANNOUNCE-3.0.1 b/ANNOUNCE-3.0.1 new file mode 100644 index 00000000..91b44284 --- /dev/null +++ b/ANNOUNCE-3.0.1 @@ -0,0 +1,22 @@ +Subject: ANNOUNCE: mdadm 3.0.1 - A tool for managing Soft RAID under Linux + +I am pleased to announce the availability of + mdadm version 3.0.1 + +It is available at the usual places: + countrycode=xx. + http://www.${countrycode}kernel.org/pub/linux/utils/raid/mdadm/ +and via git at + git://neil.brown.name/mdadm + http://neil.brown.name/git?p=mdadm + + +This contains only minor bug fixes over 3.0. If you are using +3.0, you could consider upgrading. + +The brief change log is: + - Fix various segfaults + - Fixed for --examine with containers + - Lots of other little fixes. + +NeilBrown 25th September 2009 diff --git a/ANNOUNCE-3.0.2 b/ANNOUNCE-3.0.2 new file mode 100644 index 00000000..93643d17 --- /dev/null +++ b/ANNOUNCE-3.0.2 @@ -0,0 +1,21 @@ +Subject: ANNOUNCE: mdadm 3.0.2 - A tool for managing Soft RAID under Linux + +I am pleased to announce the availability of + mdadm version 3.0.2 + +It is available at the usual places: + countrycode=xx. + http://www.${countrycode}kernel.org/pub/linux/utils/raid/mdadm/ +and via git at + git://neil.brown.name/mdadm + http://neil.brown.name/git?p=mdadm + + +This just contains one bugfix over 3.0.1 - I was obviously a bit hasty +in releasing that one. + +The brief change log is: + - Fix crash when hosthost is not set, as often happens in + early boot. + +NeilBrown 25th September 2009 diff --git a/ANNOUNCE-3.0.3 b/ANNOUNCE-3.0.3 new file mode 100644 index 00000000..d6117a1d --- /dev/null +++ b/ANNOUNCE-3.0.3 @@ -0,0 +1,29 @@ +Subject: ANNOUNCE: mdadm 3.0.3 - A tool for managing Soft RAID under Linux + +I am pleased to announce the availability of + mdadm version 3.0.3 + +It is available at the usual places: + countrycode=xx. + http://www.${countrycode}kernel.org/pub/linux/utils/raid/mdadm/ +and via git at + git://neil.brown.name/mdadm + http://neil.brown.name/git?p=mdadm + + +This contains a collection of bug fixes and minor enhancements over +3.0.1. + +The brief change log is: + - Improvements for creating arrays giving just a name, like 'foo', + rather than the full '/dev/md/foo'. + - Improvements for assembling member arrays of containers. + - Improvements to test suite + - Add option to change increment for RebuildNN messages reported + by "mdadm --monitor" + - Improvements to mdmon 'hand-over' from initrd to final root. + - Handle merging of devices that have left an IMSM array and are + being re-incorporated. + - Add missing space in "--detail --brief" output. + +NeilBrown 22nd October 2009 diff --git a/ANNOUNCE-3.1 b/ANNOUNCE-3.1 new file mode 100644 index 00000000..343b85da --- /dev/null +++ b/ANNOUNCE-3.1 @@ -0,0 +1,33 @@ +Subject: ANNOUNCE: mdadm 3.1 - A tool for managing Soft RAID under Linux + +Hot on the heals of 3.0.3 I am pleased to announce the availability of + mdadm version 3.1 + +It is available at the usual places: + countrycode=xx. + http://www.${countrycode}kernel.org/pub/linux/utils/raid/mdadm/ +and via git at + git://neil.brown.name/mdadm + http://neil.brown.name/git?p=mdadm + + +It contains significant feature enhancements over 3.0.x + +The brief change log is: + - Support --grow to change the layout of RAID4/5/6 + - Support --grow to change the chunksize of raid 4/5/6 + - Support --grow to change level from RAID1 -> RAID5 -> RAID6 and + back. + - Support --grow to reduce the number of devices in RAID4/5/6. + - Support restart of these grow options which assembling an array + which is partially grown. + - Assorted tests of this code, and of different RAID6 layouts. + +Note that a 2.6.31 or later is needed to have access to these. +Reducing devices in a RAID4/5/6 requires 2.6.32. +Changing RAID5 to RAID1 requires 2.6.33. + +You should only upgrade if you need to use, or which to test, these +features. + +NeilBrown 22nd October 2009 diff --git a/ANNOUNCE-3.1.1 b/ANNOUNCE-3.1.1 new file mode 100644 index 00000000..9e480dc0 --- /dev/null +++ b/ANNOUNCE-3.1.1 @@ -0,0 +1,39 @@ +Subject: ANNOUNCE: mdadm 3.1.1 - A tool for managing Soft RAID under Linux + +I am pleased to announce the availability of + mdadm version 3.1.1 + +It is available at the usual places: + countrycode=xx. + http://www.${countrycode}kernel.org/pub/linux/utils/raid/mdadm/ +and via git at + git://neil.brown.name/mdadm + http://neil.brown.name/git?p=mdadm + +This is a bugfix release over 3.1, which was withdrawn due to serious +bugs. So it might be best to ignore 3.1 and say that this is a significant +feature release over 3.0.x + +Significant changes are: + - RAID level conversion between RAID1, RAID5, and RAID6 are + possible were the kernel supports it (2.6.32 at least) + - online chunksize and layout changing for RAID5 and RAID6 + where the kernel supports it. + - reduce the number of devices in a RAID4/5/6 array. + + - The default metadata is not v1.1. This metadata is stored at the + start of the device so is safer in many ways but could interfere with + boot loaded. The old default (0.90) is still available and fully + supported. + + - The default chunksize is now 512K rather than 64K. This seems more + appropriate for modern devices. + + - The default bitmap chunksize for internal bitmaps is now at least + 64Meg as fine grained bitmaps tend to impact performance more for + little extra gain. + +This release is believed to be stable and you should feel free to +upgrade to 3.1.1. + +NeilBrown 19th November 2009 diff --git a/ANNOUNCE-3.1.2 b/ANNOUNCE-3.1.2 new file mode 100644 index 00000000..321b8bef --- /dev/null +++ b/ANNOUNCE-3.1.2 @@ -0,0 +1,46 @@ +Subject: ANNOUNCE: mdadm 3.1.2 - A tool for managing Soft RAID under Linux + +I am pleased to announce the availability of + mdadm version 3.1.2 + +It is available at the usual places: + countrycode=xx. + http://www.${countrycode}kernel.org/pub/linux/utils/raid/mdadm/ +and via git at + git://neil.brown.name/mdadm + http://neil.brown.name/git?p=mdadm + +This is a bugfix/stability release over 3.1.1. + +Significant changes are: + - The default metadata has change again (sorry about that). + It is now v1.2 and will hopefully stay that way. It turned + out there with boot-block issues with v1.1 which make it + unsuitable for a default, though in many cases it is still + suitable to use. + - Stopping a container is not permitted when members are still + active + - Add 'homehost' to the valid words for the "AUTO" config file + line. When followed by "-all", this causes mdadm to + auto-assemble any array belonging to this host, but not + auto-assemble anything else. + - Fix some bugs with "--grow --chunksize=" for changing chunksize. + - VAR_RUN can be easily changed at compile time just like ALT_RUN. + This gives distros more flexability in how to manage the + pid and sock files that mdmon needs. + - Various mdmon fixes + - Alway make bitmap 4K-aligned if at all possible. + - If mdadm.conf lists arrays which have inter-dependencies, + the previously had to be listed in the "right" order. Now + any order should work. + - Fix --force assembly of v1.x arrays which are in the process + of recovering. + - Add section on 'scrubbing' to 'md' man page. + - Various command-line-option parsing improvements. + - ... and lots of other bug fixes. + + +This release is believed to be stable and you should feel free to +upgrade to 3.1.2 + +NeilBrown 10th March 2010 diff --git a/ANNOUNCE-3.1.3 b/ANNOUNCE-3.1.3 new file mode 100644 index 00000000..95b2b6c1 --- /dev/null +++ b/ANNOUNCE-3.1.3 @@ -0,0 +1,46 @@ +Subject: ANNOUNCE: mdadm 3.1.3 - A tool for managing Soft RAID under Linux + +I am pleased to announce the availability of + mdadm version 3.1.3 + +It is available at the usual places: + countrycode=xx. + http://www.${countrycode}kernel.org/pub/linux/utils/raid/mdadm/ +and via git at + git://neil.brown.name/mdadm + http://neil.brown.name/git?p=mdadm + +This is a bugfix/stability release over 3.1.2 + +Significant changes are: + - mapfile now lives in a fixed location which default to + /dev/.mdadm/map but can be changed at compile time. This + location is choses and most distros provide it during early + boot and preserve it through. As long a /dev exists and is + writable, /dev/.mdadm will be created. + Other files file communication with mdmon live here too. + This fixes a bug reported by Debian and Gentoo users where + udev would spin in early-boot. + - IMSM and DDF metadata will not be recognised on partitions + as they should only be used on whole-disks. + - Various overflows causes by 2G drives have been addressed. + - A subarray of an IMSM contain can now be killed with + --kill-subarray. Also subarrays can be renamed with + --update-subarray + - -If (or --incremental --fail) can be used from udev to + fail and remove from all arrays a device which has been + unplugged from the system. i.e. hot-unplug-support. + - "mdadm /dev/mdX --re-add missing" will look for any device + that looks like it should be a member of /dev/mdX but isn't + and will automatically --re-add it + - Now compile with -Wextra to get extra warnings. + - Lots of minor bug fixes, documentation improvements, etcc + +This release is believed to be stable and you should feel free to +upgrade to 3.1.3 + +It is expected that the next release will be 3.2 with a number of new +features. 3.1.4 will only happen if important bugs show up before 3.2 +is stable. + +NeilBrown 6th August 2010 diff --git a/ANNOUNCE-3.1.4 b/ANNOUNCE-3.1.4 new file mode 100644 index 00000000..c157a36a --- /dev/null +++ b/ANNOUNCE-3.1.4 @@ -0,0 +1,37 @@ +Subject: ANNOUNCE: mdadm 3.1.4 - A tool for managing Soft RAID under Linux + +I am pleased to announce the availability of + mdadm version 3.1.4 + +It is available at the usual places: + countrycode=xx. + http://www.${countrycode}kernel.org/pub/linux/utils/raid/mdadm/ +and via git at + git://neil.brown.name/mdadm + http://neil.brown.name/git?p=mdadm + +This is a bugfix/stability release over 3.1.3. +3.1.3 had a couple of embarrasing regressions and a couple of other +issues surfaces which had easy fixes so I decided to make a 3.1.4 +release after all. + +Two fixes related to configs that aren't using udev: + - Don't remove md devices which 'standard' names on --stop + - Allow dev_open to work on read-only /dev +And fixed regressions: + - Allow --incremental to add spares to an array + - Accept --no-degraded as a deprecated option rather than + throwing an error + - Return correct success status when --incrmental assembling + a container which does not yet have enough devices. + - Don't link mdadm with pthreads, only mdmon needs it. + - Fix compiler warning due to bad use of snprintf + - Fix spare migration + +This release is believed to be stable and you should feel free to +upgrade to 3.1.4 + +It is expected that the next release will be 3.2 with a number of new +features. + +NeilBrown 31st August 2010 diff --git a/ANNOUNCE-3.1.5 b/ANNOUNCE-3.1.5 new file mode 100644 index 00000000..baa1f921 --- /dev/null +++ b/ANNOUNCE-3.1.5 @@ -0,0 +1,42 @@ +Subject: ANNOUNCE: mdadm 3.1.5 - A tool for managing Soft RAID under Linux + +I am pleased to announce the availability of + mdadm version 3.1.5 + +It is available at the usual places: + countrycode=xx. + http://www.${countrycode}kernel.org/pub/linux/utils/raid/mdadm/ +and via git at + git://neil.brown.name/mdadm + http://neil.brown.name/git?p=mdadm + +This is a bugfix/stability release over 3.1.4. It contains all the +important bugfixes found while working on 3.2 and 3.2.1. It will be +the last 3.1.x release - 3.2.1 is expected to be released in a few days. + +Changes include: + - Fixes for v1.x metadata on big-endian machines. + - man page improvements + - Improve '--detail --export' when run on partitions of an md array. + - Fix regression with removing 'failed' or 'detached' devices. + - Fixes for "--assemble --force" in various unusual cases. + - Allow '-Y' to mean --export. This was documented but not implemented. + - Various fixed for handling 'ddf' metadata. This is now more reliable + but could benefit from more interoperability testing. + - Correctly list subarrays of a container in "--detail" output. + - Improve checks on whether the requested number of devices is supported + by the metadata - both for --create and --grow. + - Don't remove partitions from a device that is being included in an + array until we are fully committed to including it. + - Allow "--assemble --update=no-bitmap" so an array with a corrupt + bitmap can still be assembled. + - Don't allow --add to succeed if it looks like a "--re-add" is probably + wanted, but cannot succeed. This avoids inadvertently turning + devices into spares when an array is failed. + +This release is believed to be stable and you should feel free to +upgrade to 3.1.5 + + +NeilBrown 23rd March 2011 + diff --git a/ANNOUNCE-3.2 b/ANNOUNCE-3.2 new file mode 100644 index 00000000..9e282bc6 --- /dev/null +++ b/ANNOUNCE-3.2 @@ -0,0 +1,77 @@ +Subject: ANNOUNCE: mdadm 3.2 - A tool for managing Soft RAID under Linux (DEVEL ONLY) + +I am pleased to announce the availability of + mdadm version 3.2 + +It is available at the usual places: + countrycode=xx. + http://www.${countrycode}kernel.org/pub/linux/utils/raid/mdadm/ +and via git at + git://neil.brown.name/mdadm devel-3.2 + http://neil.brown.name/git?p=mdadm + +This is a "Developers only" release. Please don't consider using it +or making it available to others without reading the following. + + +By far the most significant change in this release related to the +management of reshaping arrays. This code has been substantially +re-written so that it can work with 'externally managed metadata' - +Intel's IMSM in particular. We now support level migration and +OnLine Capacity Expansion on these arrays. + +However, while the code largely works it has not been tested +exhaustively so there are likely to be problems. As the reshape code +for native metadata arrays was changed as part of this rewrite these +problems could also result in regressions for reshape of native +metadata. + +It is partly to encourage greater testing that this release is being +made. Any reports of problem - particular reproducible recipes for +triggering the problems - will be gratefully received. + +It is hopped that a "3.2.1" release will be available in early March +which will be a bugfix release over this and can be considered +suitable for general use. + +Other changes of note: + + - Policy framework. + Various policy statements can be made in the mdadm.conf to guide + the behaviour of mdadm, particular with regards to how new devices + are treated by "mdadm -I". + Depending on the 'action' associated with a device (identified by + its 'path') such need devices can be automatically re-added to and + existing array that they previously fell out off, or automatically + added as a spare if they appear to contain no data. + + - mdadm now has a limited understanding of partition tables. This + allows the policy framework to make decisions about partitioned + devices as well. + + - --incremental --remove can be told what --path the device was on, + and this info will be recorded so that another device appearing at + the same physical location can be preferentially added to the same + array (provides the spare-same-slot action policy applied to the + path). + + - A new flags "--invalid-backup" flag is available in --assemble + mode. This can be used to re-assemble an array which was stopping + in the middle of a reshape, and for which the 'backup file' is no + longer available or is corrupted. The array may have some + corruption in it at the point where reshape was up to, but at least + the rest of the array will become available. + + + - Various internal restructuring - more is needed. + + +Any feed back and bug reports are always welcomed at: + linux-raid@vger.kernel.org + +And please: don't use this in production - particularly not the +--grow functionality. + +NeilBrown 1st February 2011 + + diff --git a/ANNOUNCE-3.2.1 b/ANNOUNCE-3.2.1 new file mode 100644 index 00000000..0e7826ca --- /dev/null +++ b/ANNOUNCE-3.2.1 @@ -0,0 +1,75 @@ + + +I am pleased to announce the availability of + mdadm version 3.2.1 + +It is available at the usual places: + countrycode=xx. + http://www.${countrycode}kernel.org/pub/linux/utils/raid/mdadm/ +and via git at + git://neil.brown.name/mdadm + http://neil.brown.name/git/mdadm + +Many of the changes in this release are of internal interest only, +restructuring and refactoring code and so forth. + +Most of the bugs found and fixed during development for 3.2.1 have been +back-ported for the recently-release 3.1.5 so this release primarily +provides a few new features over 3.1.5. + +They include: + - policy framework + Policy can be expressed for moving spare devices between arrays, and + for how to handle hot-plugged devices. This policy can be different + for devices plugged in to different controllers etc. + This, for example, allows a configuration where when a device is plugged + in it is immediately included in an md array as a hot spare and + possibly starts recovery immediately if an array is degraded. + + - some understanding of mbr and gpt paritition tables + This is primarly to support the new hot-plug support. If a + device is plugged in and policy suggests it should have a partition table, + the partition table will be copied from a suitably similar device, and + then the partitions will hot-plug and can then be added to md arrays. + + - "--incremental --remove" can remember where a device was removed from + so if a device gets plugged back in the same place, special policy applies + to it, allowing it to be included in an array even if a general hotplug + will not be included. + + - enhanced reshape options, including growing a RAID0 by converting to RAID4, + restriping, and converting back. Also convertions between RAID0 and + RAID10 and between RAID1 and RAID10 are possible (with a suitably recent + kernel). + + - spare migration for IMSM arrays. + Spare migration can now work across 'containers' using non-native metadata + and specifically Intel's IMSM arrays support spare migrations. + + - OLCE and level migration for Intel IMSM arrays. + OnLine Capacity Expansion and level migration (e.g. RAID0 -> RAID5) is + supported for Intel Matrix Storage Manager arrays. + This support is currently 'experimental' for technical reasons. It can + be enabled with "export MDADM_EXPERIMENTAL=1" + + - avoid including wayward devices + If you split a RAID1, mount the two halves as two separate degraded RAID1s, + and then later bring the two back together, it is possible that the md + metadata won't properly show that one must over-ride the other. + mdadm now does extra checking to detect this possibilty and avoid + potentially corrupting data. + + - remove any possible confusion between similar options. + e.g. --brief and --bitmap were mapped to 'b' and mdadm wouldn't + notice if one was used where the other was expected. + + - allow K,M,G suffixes on chunk sizes + + +While mdadm-3.2.1 is considered to be reasonably stable, you should +only use it if you want to try out the new features, or if you +generally like to be on the bleeding edge. If the new features are not +important to you, then 3.1.5 is probably the appropriate version to be using +until 3.2.2 comes out. + +NeilBrown 28th March 2011 diff --git a/ANNOUNCE-3.2.2 b/ANNOUNCE-3.2.2 new file mode 100644 index 00000000..b70d18b9 --- /dev/null +++ b/ANNOUNCE-3.2.2 @@ -0,0 +1,36 @@ +Subject: ANNOUNCE: mdadm 3.2.2 - A tool for managing Soft RAID under Linux + +I am pleased to announce the availability of + mdadm version 3.2.2 + +It is available at the usual places: + countrycode=xx. + http://www.${countrycode}kernel.org/pub/linux/utils/raid/mdadm/ +and via git at + git://neil.brown.name/mdadm + http://neil.brown.name/git/mdadm + +This release is largely a stablising release for the 3.2 series. +Many of the changes just fix bugs introduces in 3.2 or 3.2.1. + +There are some new features. They are: + - reshaping IMSM (Intel metadata) arrays is no longer 'experimental', + it should work properly and be largely compatible with IMSM drivers in + other platforms. + - --assume-clean can be used with --grow --size to avoid resyncing the + new part of the array. This is only support with very new kernels. + - RAID0 arrays can have chunksize which is not a power of 2. This has been + supported in the kernel for a while but is only now supprted by + mdadm. + + - A new tool 'raid6check' is available which can check a RAID6 array, + or part of it, and report which device is most inconsistent with the + others if any stripe is inconsistent. This is still under development + and does not have a man page yet. If anyone tries it out and has any + questions or experience to report, they would be most welcome on + linux-raid@vger.kernel.org. + +Future releases in the 3.2 series will only be made if bugfixes are needed. +The next release to add features is expected to be 3.3. + +NeilBrown 17th June 2011 diff --git a/ANNOUNCE-3.2.3 b/ANNOUNCE-3.2.3 new file mode 100644 index 00000000..8a8dba46 --- /dev/null +++ b/ANNOUNCE-3.2.3 @@ -0,0 +1,24 @@ +Subject: ANNOUNCE: mdadm 3.2.3 - A tool for managing Soft RAID under Linux + +I am pleased to announce the availability of + mdadm version 3.2.3 + +It is available at the usual places: + countrycode=xx. + http://www.${countrycode}kernel.org/pub/linux/utils/raid/mdadm/ +and via git at + git://neil.brown.name/mdadm + http://neil.brown.name/git/mdadm + +This release is largely a bugfix release for the 3.2 series with many +minor fixes with little or no impact. + +The largest single area of change is support for reshape of Intel +IMSM arrays (OnLine Capacity Explansion and Level Migtration). +Among other fixes, this now has a better chance of surviving if a +device fails during reshape. + +Upgrading is recommended - particularly if you use mdadm for IMSM +arrays - but not essential. + +NeilBrown 23rd December 2011 diff --git a/ANNOUNCE-3.2.4 b/ANNOUNCE-3.2.4 new file mode 100644 index 00000000..e3216786 --- /dev/null +++ b/ANNOUNCE-3.2.4 @@ -0,0 +1,144 @@ +Subject: ANNOUNCE: mdadm 3.2.4 - A tool for managing Soft RAID under Linux + +I am pleased to announce the availability of + mdadm version 3.2.4 + +It is available at the usual places, now including github: + countrycode=xx. + http://www.${countrycode}kernel.org/pub/linux/utils/raid/mdadm/ +and via git at + git://github.com/neilbrown/mdadm + git://neil.brown.name/mdadm + http://neil.brown.name/git/mdadm + +This release is largely a bugfix release for the 3.2 series with many +minor fixes with little or no impact. + +"--oneline" log of changes is below. Some notable ones are: + + - --offroot argument to improve interactions between mdmon and initrd + - --prefer argument to select which /dev names to display in some + circumstances. + - relax restructions on when "--add" will be allowed + - Fix bug with adding write-intent-bitmap to active array + - Now defaults to "/run/mdadm" for storing run-time files. + +Upgrading is encouraged. + +The next mdadm release is expected to be 3.3 with a number of new +features. + +NeilBrown 9th May 2012 + +77b3ac8 monitor: make return from read_and_act more symbolic. +68226a8 monitor: ensure we retry soon when 'remove' fails. +8453f8d fix: Monitor sometimes crashes +90fa1a2 Work around gcc-4.7's strict aliasing checks +0c4304c fix: container creation with --incremental used. +5d1c7cd FIX: External metadata sometimes is not updated +3c20f98 FIX: mdmon check in reshape_container() can cause a problem +59ab9f5 FIX: Typo error in fprint command +9587c37 imsm: load_super_imsm_all function refactoring +ec50f7b imsm: load_imsm_super_all supports loading metadata from the device list +ca9de18 imsm: validate the number of imsm volumes per controller +30602f5 imsm: display fd in error trace when when store_imsm_mpb failes +eb155f6 mdmon: Use getopt_long() to parse command line options +08ca2ad Add --offroot argument to mdadm +da82751 Add --offroot argument to mdmon +a0963a8 Spawn mdmon with --offroot if mdadm was launched with --offroot +f878b24 imsm: fix, the second array need to have the whole available space on devices +d597705 getinfo_super1: Use MaxSector in place of sb->size +6ef8905 super1: make aread/awrite always use an aligned buffer. +de5a472 Remove avail_disks arg from 'enough'. +da8fe5a Assemble: fix --force assemble during reshape. +b10c663 config: fix handing of 'homehost' in AUTO line. +92d49ec FIX: NULL pointer to strdup() can be passed +d2bde6d imsm: FIX: No new missing disks are allowed during general migration +111e9fd FIX: Array is not run when expansion disks are added +bf5cf7c imsm: FIX: imsm_get_allowed_degradation() doesn't count degradation for raid1 +50927b1 Fix: Sometimes mdmon throws core dump during reshape +78340e2 Flush mdmon before next reshape step during container operation +e174219 imsm: FIX: Chunk size migration problem +f93346e FIX: use md position to reshape restart +6a75c8c imsm: FIX: use md position to reshape restart +51d83f5 imsm: FIX: Clear migration record when migration switches to next volume. +e1dd332 FIX: restart reshape when reshape process is stopped just between 2 reshapes +1ca90aa FIX: Do not try to (continue) reshape using inactive array +9f1b0f0 config: conf_match should ignore devname when not set. +d669228 Use posix_memalign() for memory used to write bitmaps +178950e FIX: Changes in '0' case for reshape position verification +9200d41 avoid double-free upon "old buggy kernel" sysfs_read failure +4011421 Print error message if failing to write super for 1.x metadata +0011874 Use MDMON_DIR for pid files created in Monitor.c +56d1885 Assemble: don't use O_EXCL until we have checked device content. +b720636 Assemble: support assembling of a RAID0 being reshaped. +c69ffac Manage: allow --re-add to failed array. +52f07f5 Reset bad flag on map update +911cead super1: support superblocks up to 4K. +ad6db3c Create: reduce the verbosity of 'default_layout'. +b2bfdfa super1.c don't keep recalculating bitmap pointer +4122675 Define and use SUPER1_SIZE for allocations +1afa930 init_super1() memset full buffer allocated for superblock +2de0b8a match_metadata_desc1(): Use calloc instead of malloc+memset +3c0bcd4 Use 4K buffer alignment for superblock allocations +308340a Use struct align_fd to cache fd's block size for aligned reads/writes +65ed615 match_metadata_desc0(): Use calloc instead of malloc+memset +de89706 Generalize ROUND_UP() macro and introduce matching ROUND_UP_PTR() +0a2f189 super1.c: use ROUND_UP/ROUND_UP_PTR +654a381 super-intel.c: Use ROUND_UP() instead of manually coding it +42d5dfd __write_init_super_ddf(): Use posix_memalign() instead of static aligned buffer +d4633e0 Examine: fix array size calculation for RAID10. +e62b778 Assemble: improve verbose logging when including old devices. +0073a6e Remove possible crash during RAID6 -> RAID5 reshape. +69fe207 Incremental: fix adding devices with --incremental +bcbb311 Manage: replace 'return 1' with 'goto abort'. +9f58469 Manage: freeze recovery while adding multiple devices. +ae6c05a Create: round off size for RAID1 arrays. +5ca3a90 Grow: print useful error when converting RAID1->RAID5 will fail. +c07d640 Fix tests/05r1-re-add-nosupper +2d762ad Fix the new ROUND_UP macro. +fd324b0 sysfs: fixed sysfs_freeze_array array to work properly with Manage_subdevs. +5551b11 imsm: avoid overflows for disks over 1TB +97f81ee clear hi bits if not used after loading metadata from disk +e03640b simplify calculating array_blocks +29cd082 show 2TB volumes/disks support in --detail-platform +2cc699a check volume size in validate_geometry_imsm_orom +9126b9a check that no disk over 2TB is used to create container when no support +027c374 imsm: set 2tb disk attribute for spare +3556c2f Fix typo: wan -> want +15632a9 parse_size: distinguish between 0 and error. +fbdef49 Bitmap_offset is a signed number +508a7f1 super1: leave more space in front of data by default. +40110b9 Fix two typos in fprintf messages +342460c mdadm man page: fix typo +0e7f69a imsm: display maximum volumes per controller and array +36fd8cc imsm: FIX: Update function imsm_num_data_members() for Raid1/10 +7abc987 imsm: FIX: Add volume size expand support to imsm_analyze_change() +f3871fd imsm: Add new metadata update for volume size expansion +54397ed imsm: Execute size change for external metatdata +016e00f FIX: Support metadata changes rollback +fbf3d20 imsm: FIX: Support metadata changes rollback +44f6f18 FIX: Extend size of raid0 array +7e7e9a4 FIX: Respect metadata size limitations +65a9798 FIX: Detect error and rollback metadata +13bcac9 imsm: Add function imsm_get_free_size() +b130333 imsm: Support setting max size for size change operation +c41e00b imsm: FIX: Component size alignment check +58d26a2 FIX: Size change is possible as standalone change only +4aecb54 FIX: Assembled second array is in read only state during reshape +ae2416e FIX: resolve make everything compilation error +480f356 Raid limit of 1024 when scanning for devices. +c2ecf5f Add --prefer option for --detail and --monitor +0a99975 Relax restrictions on when --add is permitted. +7ce0570 imsm: fix: rebuild does not continue after reboot +b51702b fix: correct extending size of raid0 array +34a1395 Fix sign extension of bitmap_offset in super1.c +012a864 Introduce sysfs_set_num_signed() and use it to set bitmap/offset +5d7b407 imsm: fix: thunderdome may drop 2tb attribute +5ffdc2d Update test for "is udev active". +96fd06e Adjust to new standard of /run +974e039 test: don't worry too much about array size. +b0a658f Grow: failing the set the per-device size is not an error. +36614e9 super-intel.c: Don't try to close negative fd +562aa10 super-intel.c: Fix resource leak from opendir() + diff --git a/ANNOUNCE-3.2.5 b/ANNOUNCE-3.2.5 new file mode 100644 index 00000000..396da12a --- /dev/null +++ b/ANNOUNCE-3.2.5 @@ -0,0 +1,31 @@ +Subject: ANNOUNCE: mdadm 3.2.5 - A tool for managing Soft RAID under Linux + +I am somewhat disappointed to have to announce the availability of + mdadm version 3.2.5 + +It is available at the usual places, now including github: + countrycode=xx. + http://www.${countrycode}kernel.org/pub/linux/utils/raid/mdadm/ +and via git at + git://github.com/neilbrown/mdadm + git://neil.brown.name/mdadm + http://neil.brown.name/git/mdadm + +This release primarily fixes a serious regression in 3.2.4. +This regression does *not* cause any risk to data. It simply +means that adding a device with "--add" would sometime fail +when it should not. + +The fix also includes a couple of minor fixes such as making +the "--layout=preserve" option to "--grow" work again. + +A reminder that the default location for runtime files is now +"/run/mdadm". If you compile this for a distro that does not +have "/run", you will need to compile with an alternate setting for +MAP_DIR. e.g. + make MAP_DIR=/var/run/mdadm +or + make MAP_DIR=/dev/.mdadm + +NeilBrown 18th May 2012 + diff --git a/ANNOUNCE-3.2.6 b/ANNOUNCE-3.2.6 new file mode 100644 index 00000000..f5cfd492 --- /dev/null +++ b/ANNOUNCE-3.2.6 @@ -0,0 +1,57 @@ +Subject: ANNOUNCE: mdadm 3.2.6 - A tool for managing Soft RAID under Linux + +I am pleased to announce the availability of + mdadm version 3.2.6 + +It is available at the usual places, now including github: + countrycode=xx. + http://www.${countrycode}kernel.org/pub/linux/utils/raid/mdadm/ +and via git at + git://github.com/neilbrown/mdadm + git://neil.brown.name/mdadm + http://neil.brown.name/git/mdadm + +This is a stablity release which adds a number of bugfixs to 3.2.5. +There are no real stand-out fixes, just lots of little bits and pieces. + +Below is the "git log --oneline --reverse" list of changes since +3.2.5. + +NeilBrown 25th October 2012 + +b7e05d2 udev-rules: prevent systemd from mount devices before they are ready. +0d478e2 mdadm: Fix Segmentation fault. +42f0ca1 imsm: fix: correct checking volume's degradation +fcf2195 Monitor: fix inconsistencies in values for ->percent +5f862fb Monitor: Report NewArray when an array the disappeared, reappears. +6f51b1c Monitor: fix reporting for Fail vs FailSpare etc. +68ad53b mdmon: fix arg parsing. +517f135 Assemble: don't leak memory with fdlist. +090900c udev-rules: prevent systemd from mount devices before they are ready. +446e000 sha1.h: remove ansidecl.h header inclusion +ec894f5 Manage: zero metadata before adding to 'external' array. +3a84db5 ddf: allow a non-spare to be used to recovery a missing device. +c5d61ca ddf: hack to fix container recognition. +23084aa mdmon: fix arg processing for -a +c4e96a3 mdmon: allow --takeover when original was started with --offroot +80841df find_free_devnum: avoid auto-using names in /etc/mdadm.conf +c5c56d6 mapfile: fix mapfile rebuild for containers +aec89f6 fix segfaults in Detail() +2117ad1 Fix 'enough' function for RAID10. +0bc300d Use --offroot flag when assembling md arrays via --incrmental +ac78f24 Grow: make warning about old metadata more explicit. +14026ab Replace sha1.h with slightly older version. +6f6809f Add zlib license to crc32.c +5267ba0 Handles spaces in array names better. +c51f288 imsm: allow --assume-clean to work. +acf7076 Grow: allow --grow --continue to work for native metadata. +335d2a6 Grow: fix a couple of typos with --assume-clean usage +9ff1427 Fix open_container +3713633 mdadm: super0: do not override uuid with homehost +31bff58 Trivial bugfix and spelling fixes. +e1e539f Detail: don't report a faulty device as 'spare' or 'rebuilding'. +22a6461 super0: allow creation of array on 2TB+ devices. +a5d47a2 Create new md devices consistently +eb48676 Monitor: don't complain about non-monitorable arrays in mdadm.conf +ecdf2d7 Query: don't be confused by partition tables. +f7b75c1 Query: allow member of non-0.90 arrays to be better reported. diff --git a/ANNOUNCE-3.3 b/ANNOUNCE-3.3 new file mode 100644 index 00000000..f770aa13 --- /dev/null +++ b/ANNOUNCE-3.3 @@ -0,0 +1,63 @@ +Subject: ANNOUNCE: mdadm 3.3 - A tools for managing md Soft RAID under Linux + +I am pleased to announce the availability of + mdadm version 3.3 + +It is available at the usual places: + http://www.kernel.org/pub/linux/utils/raid/mdadm/ +and via git at + git://github.com/neilbrown/mdadm + git://neil.brown.name/mdadm + http://git.neil.brown.name/git/mdadm + +This is a major new release so don't be too surprised if there are a +few issues. If I hear about them they will be fixed in 3.3.1. +git log reports nearly 500 changes since 3.2.6 so I won't list them +all. + +Some highlights are: + +- Some array reshapes can proceed without needing backup file. + This is done by changing the 'data_offset' so we never need to write + any data back over where it was before. If there is no "head space" + or "tail space" to allow data_offset to change, the old mechanism + with a backup file can still be used. +- RAID10 arrays can be reshaped to change the number of devices, + change the chunk size, or change the layout between 'near' + and 'offset'. + This will always change data_offset, and will fail if there is no + room for data_offset to be moved. +- "--assemble --update=metadata" can convert a 0.90 array to a 1.0 array. +- bad-block-logs are supported (but not heavily tested yet) +- "--assemble --update=revert-reshape" can be used to undo a reshape + that has just been started but isn't really wanted. This is very + new and while it passes basic tests it cannot be guaranteed. +- improved locking between --incremental and --assemble +- uses systemd to run "mdmon" if systemd is configured to do that. +- kernel names of md devices can be non-numeric. e.g. "md_home" rather than + "md0". This will probably confuse lots of other tools, so you need to + echo CREATE names=yes >> /etc/mdadm.conf + or the feature will not be used. (you also need a reasonably new kernel). +- "--stop" can be given a kernel name instead of a device name. i.e + mdadm --stop md4 + will work even if /dev/md4 doesn't exist. +- "--detail --export" has some information about the devices in the array +- --dump and --restore can be used to backup and restore the metadata on an + array. +- Hot-replace is supported with + mdadm /dev/mdX --replace /dev/foo + and + mdadm /dev/mdX --replace /dev/foo --with /dev/bar +- Config file can be a directory in which case all "*.conf" files are + read in lexical order. + Default is to read /etc/mdadm.conf and then /etc/mdadm.conf.d + Thus + echo CREATE name=yes > /etc/mdadm.conf.d/names.conf + will also enable the use of named md devices. + +- Lots of improvements to DDF support including adding support for + RAID10 (thanks Martin Wilck). + +and lots of bugfixes and other little changes. + +NeilBrown 3rd September 2013 diff --git a/ANNOUNCE-3.3.1 b/ANNOUNCE-3.3.1 new file mode 100644 index 00000000..7d5e666e --- /dev/null +++ b/ANNOUNCE-3.3.1 @@ -0,0 +1,23 @@ +Subject: ANNOUNCE: mdadm 3.3.1 - A tool for managing md Soft RAID under Linux + +I am pleased to announce the availability of + mdadm version 3.3.1 + +It is available at the usual places: + http://www.kernel.org/pub/linux/utils/raid/mdadm/ +and via git at + git://github.com/neilbrown/mdadm + git://neil.brown.name/mdadm + http://git.neil.brown.name/git/mdadm.git + +The main changes are: + - lots of work on "DDF" support. Hopefully it will be more stable + now. Bug reports are always welcome. + - improved interactions with 'systemd'. Where possible, background + tasks are run from systemd (if it is present) rather then forking + disassociationg from the session. This is important because udev + doesn't really let you disassociate. + +though there are a number of other little bug fixes too. + +NeilBrown 5th June 2014 diff --git a/ANNOUNCE-3.3.2 b/ANNOUNCE-3.3.2 new file mode 100644 index 00000000..6b549611 --- /dev/null +++ b/ANNOUNCE-3.3.2 @@ -0,0 +1,16 @@ +Subject: ANNOUNCE: mdadm 3.3.2 - A tool for managing md Soft RAID under Linux + +I am pleased to announce the availability of + mdadm version 3.3.2 + +It is available at the usual places: + http://www.kernel.org/pub/linux/utils/raid/mdadm/ +and via git at + git://github.com/neilbrown/mdadm + git://neil.brown.name/mdadm + http://git.neil.brown.name/git/mdadm.git + +Changes since 3.3.1 are mostly little bugfixes and some man-page +updates. + +NeilBrown 21st August 2014 diff --git a/ANNOUNCE-3.3.3 b/ANNOUNCE-3.3.3 new file mode 100644 index 00000000..ac1b2173 --- /dev/null +++ b/ANNOUNCE-3.3.3 @@ -0,0 +1,18 @@ +Subject: ANNOUNCE: mdadm 3.3.3 - A tool for managing md Soft RAID under Linux + +I am pleased to announce the availability of + mdadm version 3.3.3 + +It is available at the usual places: + http://www.kernel.org/pub/linux/utils/raid/mdadm/ +and via git at + git://github.com/neilbrown/mdadm + git://neil.brown.name/mdadm + http://git.neil.brown.name/git/mdadm.git + +The 100 changes since 3.3.3 are mostly little bugfixes and some improvements +to the selftests. +raid6check now handle all RAID6 layouts including DDF correctly. +See git log for the rest. + +NeilBrown 24th July 2015 diff --git a/ANNOUNCE-3.3.4 b/ANNOUNCE-3.3.4 new file mode 100644 index 00000000..52b94562 --- /dev/null +++ b/ANNOUNCE-3.3.4 @@ -0,0 +1,37 @@ +Subject: ANNOUNCE: mdadm 3.3.4 - A tool for managing md Soft RAID under Linux + +I am somewhat disappointed to have to announce the availability of + mdadm version 3.3.4 + +It is available at the usual places: + http://www.kernel.org/pub/linux/utils/raid/mdadm/ +and via git at + git://github.com/neilbrown/mdadm + git://neil.brown.name/mdadm + http://git.neil.brown.name/git/mdadm.git + +In mdadm-3.3 a change was made to how IMSM (Intel Matrix Storage +Manager) metadata was handled. Previously an IMSM array would only +be assembled if it was attached to an IMSM controller. + +In 3.3 this was relaxed as there are circumstances where the +controller is not properly detected. Unfortunately this has negative +consequences which have only just come to light. + +If you have an IMSM RAID1 configured and then disable RAID in the +BIOS, the metadata will remain on the devices. If you then install +some other OS on one device and then install Linux on the other, Linux +might eventually start noticing the IMSM metadata (depending a bit on whether +mdadm is included in the initramfs) and might start up the RAID1. This could +copy one device over the other, thus trashing one of the installations. + +Not good. + +So with this release IMSM arrays will only be assembled if attached to +an IMSM controller, or if "--force" is given to --assemble, or if the +environment variable IMSM_NO_PLATFORM is set (used primarily for +testing). + +I strongly recommend upgrading to 3.3.4 if you are using 3.3 or later. + +NeilBrown 3rd August 2015. diff --git a/Assemble.c b/Assemble.c new file mode 100644 index 00000000..29257330 --- /dev/null +++ b/Assemble.c @@ -0,0 +1,2031 @@ +/* + * mdadm - manage Linux "md" devices aka RAID arrays. + * + * Copyright (C) 2001-2013 Neil Brown <neilb@suse.de> + * + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Author: Neil Brown + * Email: <neilb@suse.de> + */ + +#include "mdadm.h" +#include <ctype.h> + +static int name_matches(char *found, char *required, char *homehost, int require_homehost) +{ + /* See if the name found matches the required name, possibly + * prefixed with 'homehost' + */ + char *sep; + unsigned int l; + + if (strcmp(found, required)==0) + return 1; + sep = strchr(found, ':'); + if (!sep) + return 0; + l = sep - found; + if (strncmp(found, "any:", 4) == 0 || + (homehost && strcmp(homehost, "any") == 0) || + !require_homehost || + (homehost && strlen(homehost) == l && + strncmp(found, homehost, l) == 0)) { + /* matching homehost */ + if (strcmp(sep+1, required) == 0) + return 1; + } + return 0; +} + +static int is_member_busy(char *metadata_version) +{ + /* check if the given member array is active */ + struct mdstat_ent *mdstat = mdstat_read(0, 0); + struct mdstat_ent *ent; + int busy = 0; + + for (ent = mdstat; ent; ent = ent->next) { + if (ent->metadata_version == NULL) + continue; + if (strncmp(ent->metadata_version, "external:", 9) != 0) + continue; + if (!is_subarray(&ent->metadata_version[9])) + continue; + /* Skip first char - it can be '/' or '-' */ + if (strcmp(&ent->metadata_version[10], metadata_version+1) == 0) { + busy = 1; + break; + } + } + free_mdstat(mdstat); + + return busy; +} + +static int ident_matches(struct mddev_ident *ident, + struct mdinfo *content, + struct supertype *tst, + char *homehost, int require_homehost, + char *update, char *devname) +{ + + if (ident->uuid_set && (!update || strcmp(update, "uuid")!= 0) && + same_uuid(content->uuid, ident->uuid, tst->ss->swapuuid)==0 && + memcmp(content->uuid, uuid_zero, sizeof(int[4])) != 0) { + if (devname) + pr_err("%s has wrong uuid.\n", devname); + return 0; + } + if (ident->name[0] && (!update || strcmp(update, "name")!= 0) && + name_matches(content->name, ident->name, homehost, require_homehost)==0) { + if (devname) + pr_err("%s has wrong name.\n", devname); + return 0; + } + if (ident->super_minor != UnSet && + ident->super_minor != content->array.md_minor) { + if (devname) + pr_err("%s has wrong super-minor.\n", + devname); + return 0; + } + if (ident->level != UnSet && + ident->level != content->array.level) { + if (devname) + pr_err("%s has wrong raid level.\n", + devname); + return 0; + } + if (ident->raid_disks != UnSet && + content->array.raid_disks != 0 && /* metadata doesn't know how many to expect */ + ident->raid_disks!= content->array.raid_disks) { + if (devname) + pr_err("%s requires wrong number of drives.\n", + devname); + return 0; + } + if (ident->member && ident->member[0]) { + /* content->text_version must match */ + char *s = strchr(content->text_version+1, '/'); + if (s == NULL) { + if (devname) + pr_err("%s is not a container and one is required.\n", + devname); + return 0; + } else if (strcmp(ident->member, s+1) != 0) { + if (devname) + pr_err("skipping wrong member %s is %s\n", + content->text_version, devname); + return 0; + } + } + return 1; +} + +static int select_devices(struct mddev_dev *devlist, + struct mddev_ident *ident, + struct supertype **stp, + struct mdinfo **contentp, + struct context *c, + int inargv, int auto_assem) +{ + struct mddev_dev *tmpdev; + int num_devs; + struct supertype *st = *stp; + struct mdinfo *content = NULL; + int report_mismatch = ((inargv && c->verbose >= 0) || c->verbose > 0); + struct domainlist *domains = NULL; + + tmpdev = devlist; num_devs = 0; + while (tmpdev) { + if (tmpdev->used) + tmpdev->used = 2; + else + num_devs++; + tmpdev->disposition = 0; + tmpdev = tmpdev->next; + } + + /* first walk the list of devices to find a consistent set + * that match the criterea, if that is possible. + * We flag the ones we like with 'used'. + */ + for (tmpdev = devlist; + tmpdev; + tmpdev = tmpdev ? tmpdev->next : NULL) { + char *devname = tmpdev->devname; + int dfd; + struct stat stb; + struct supertype *tst; + struct dev_policy *pol = NULL; + int found_container = 0; + + if (tmpdev->used > 1) + continue; + + if (ident->container) { + if (ident->container[0] == '/' && + !same_dev(ident->container, devname)) { + if (report_mismatch) + pr_err("%s is not the container required (%s)\n", + devname, ident->container); + continue; + } + } else if (ident->devices && + !match_oneof(ident->devices, devname)) { + /* Note that we ignore the "device=" identifier if a + * "container=" is given. Checking both is unnecessarily + * complicated. + */ + if (report_mismatch) + pr_err("%s is not one of %s\n", devname, ident->devices); + continue; + } + + tst = dup_super(st); + + dfd = dev_open(devname, O_RDONLY); + if (dfd < 0) { + if (report_mismatch) + pr_err("cannot open device %s: %s\n", + devname, strerror(errno)); + tmpdev->used = 2; + } else if (fstat(dfd, &stb)< 0) { + /* Impossible! */ + pr_err("fstat failed for %s: %s\n", + devname, strerror(errno)); + tmpdev->used = 2; + } else if ((stb.st_mode & S_IFMT) != S_IFBLK) { + pr_err("%s is not a block device.\n", + devname); + tmpdev->used = 2; + } else if (must_be_container(dfd)) { + if (st) { + /* already found some components, this cannot + * be another one. + */ + if (report_mismatch) + pr_err("%s is a container, but we are looking for components\n", + devname); + tmpdev->used = 2; +#if !defined(MDASSEMBLE) || defined(MDASSEMBLE) && defined(MDASSEMBLE_AUTO) + } if (!tst && (tst = super_by_fd(dfd, NULL)) == NULL) { + if (report_mismatch) + pr_err("not a recognisable container: %s\n", + devname); + tmpdev->used = 2; +#endif + } else if (!tst->ss->load_container + || tst->ss->load_container(tst, dfd, NULL)) { + if (report_mismatch) + pr_err("no correct container type: %s\n", + devname); + tmpdev->used = 2; + } else if (auto_assem && + !conf_test_metadata(tst->ss->name, (pol = devid_policy(stb.st_rdev)), + tst->ss->match_home(tst, c->homehost) == 1)) { + if (report_mismatch) + pr_err("%s has metadata type %s for which auto-assembly is disabled\n", + devname, tst->ss->name); + tmpdev->used = 2; + } else + found_container = 1; + } else { + if (!tst && (tst = guess_super(dfd)) == NULL) { + if (report_mismatch) + pr_err("no recogniseable superblock on %s\n", + devname); + tmpdev->used = 2; + } else if ((tst->ignore_hw_compat = 0), + tst->ss->load_super(tst, dfd, + report_mismatch ? devname : NULL)) { + if (report_mismatch) + pr_err("no RAID superblock on %s\n", + devname); + tmpdev->used = 2; + } else if (tst->ss->compare_super == NULL) { + if (report_mismatch) + pr_err("Cannot assemble %s metadata on %s\n", + tst->ss->name, devname); + tmpdev->used = 2; + } else if (auto_assem && st == NULL && + !conf_test_metadata(tst->ss->name, (pol = devid_policy(stb.st_rdev)), + tst->ss->match_home(tst, c->homehost) == 1)) { + if (report_mismatch) + pr_err("%s has metadata type %s for which auto-assembly is disabled\n", + devname, tst->ss->name); + tmpdev->used = 2; + } + } + if (dfd >= 0) close(dfd); + if (tmpdev->used == 2) { + if (auto_assem || !inargv) + /* Ignore unrecognised devices during auto-assembly */ + goto loop; + if (ident->uuid_set || ident->name[0] || + ident->super_minor != UnSet) + /* Ignore unrecognised device if looking for + * specific array */ + goto loop; + + pr_err("%s has no superblock - assembly aborted\n", + devname); + if (st) + st->ss->free_super(st); + dev_policy_free(pol); + domain_free(domains); + return -1; + } + + if (found_container) { + /* tmpdev is a container. We need to be either + * looking for a member, or auto-assembling + */ + /* should be safe to try an exclusive open now, we + * have rejected anything that some other mdadm might + * be looking at + */ + dfd = dev_open(devname, O_RDONLY | O_EXCL); + if (dfd < 0) { + if (report_mismatch) + pr_err("%s is busy - skipping\n", devname); + goto loop; + } + close(dfd); + + if (ident->container && ident->container[0] != '/') { + /* we have a uuid */ + int uuid[4]; + + content = *contentp; + tst->ss->getinfo_super(tst, content, NULL); + + if (!parse_uuid(ident->container, uuid) || + !same_uuid(content->uuid, uuid, tst->ss->swapuuid)) { + if (report_mismatch) + pr_err("%s has wrong UUID to be required container\n", + devname); + goto loop; + } + } + /* It is worth looking inside this container. + */ + if (c->verbose > 0) + pr_err("looking in container %s\n", + devname); + + for (content = tst->ss->container_content(tst, NULL); + content; + content = content->next) { + + if (!ident_matches(ident, content, tst, + c->homehost, c->require_homehost, + c->update, + report_mismatch ? devname : NULL)) + /* message already printed */; + else if (is_member_busy(content->text_version)) { + if (report_mismatch) + pr_err("member %s in %s is already assembled\n", + content->text_version, + devname); + } else if (content->array.state & (1<<MD_SB_BLOCK_VOLUME)) { + /* do not assemble arrays with unsupported configurations */ + pr_err("Cannot activate member %s in %s.\n", + content->text_version, + devname); + } else + break; + } + if (!content) { + tmpdev->used = 2; + goto loop; /* empty container */ + } + + st = tst; tst = NULL; + if (!auto_assem && inargv && tmpdev->next != NULL) { + pr_err("%s is a container, but is not only device given: confused and aborting\n", + devname); + st->ss->free_super(st); + dev_policy_free(pol); + domain_free(domains); + return -1; + } + if (c->verbose > 0) + pr_err("found match on member %s in %s\n", + content->text_version, devname); + + /* make sure we finished the loop */ + tmpdev = NULL; + goto loop; + } else { + content = *contentp; + tst->ss->getinfo_super(tst, content, NULL); + + if (!ident_matches(ident, content, tst, + c->homehost, c->require_homehost, + c->update, + report_mismatch ? devname : NULL)) + goto loop; + + if (auto_assem) { + /* Never auto-assemble things that conflict + * with mdadm.conf in some way + */ + struct mddev_ident *match; + int rv = 0; + + match = conf_match(tst, content, devname, + report_mismatch ? c->verbose : -1, + &rv); + if (!match && rv == 2) + goto loop; + if (match && match->devname && + strcasecmp(match->devname, "<ignore>") == 0) { + if (report_mismatch) + pr_err("%s is a member of an explicitly ignored array\n", + devname); + goto loop; + } + if (match && !ident_matches(match, content, tst, + c->homehost, c->require_homehost, + c->update, + report_mismatch ? devname : NULL)) + /* Array exists in mdadm.conf but some + * details don't match, so reject it + */ + goto loop; + } + + /* should be safe to try an exclusive open now, we + * have rejected anything that some other mdadm might + * be looking at + */ + dfd = dev_open(devname, O_RDONLY | O_EXCL); + if (dfd < 0) { + if (report_mismatch) + pr_err("%s is busy - skipping\n", devname); + goto loop; + } + close(dfd); + + if (st == NULL) + st = dup_super(tst); + if (st->minor_version == -1) + st->minor_version = tst->minor_version; + + if (memcmp(content->uuid, uuid_zero, + sizeof(int[4])) == 0) { + /* this is a floating spare. It cannot define + * an array unless there are no more arrays of + * this type to be found. It can be included + * in an array of this type though. + */ + tmpdev->used = 3; + goto loop; + } + + if (st->ss != tst->ss || + st->minor_version != tst->minor_version || + st->ss->compare_super(st, tst) != 0) { + /* Some mismatch. If exactly one array matches this host, + * we can resolve on that one. + * Or, if we are auto assembling, we just ignore the second + * for now. + */ + if (auto_assem) + goto loop; + if (c->homehost) { + int first = st->ss->match_home(st, c->homehost); + int last = tst->ss->match_home(tst, c->homehost); + if (first != last && + (first == 1 || last == 1)) { + /* We can do something */ + if (first) {/* just ignore this one */ + if (report_mismatch) + pr_err("%s misses out due to wrong homehost\n", + devname); + goto loop; + } else { /* reject all those sofar */ + struct mddev_dev *td; + if (report_mismatch) + pr_err("%s overrides previous devices due to good homehost\n", + devname); + for (td=devlist; td != tmpdev; td=td->next) + if (td->used == 1) + td->used = 0; + tmpdev->used = 1; + goto loop; + } + } + } + pr_err("superblock on %s doesn't match others - assembly aborted\n", + devname); + tst->ss->free_super(tst); + st->ss->free_super(st); + dev_policy_free(pol); + domain_free(domains); + return -1; + } + tmpdev->used = 1; + } + loop: + /* Collect domain information from members only */ + if (tmpdev && tmpdev->used == 1) { + if (!pol) + pol = devid_policy(stb.st_rdev); + domain_merge(&domains, pol, tst?tst->ss->name:NULL); + } + dev_policy_free(pol); + pol = NULL; + if (tst) + tst->ss->free_super(tst); + } + + /* Check if we found some imsm spares but no members */ + if ((auto_assem || + (ident->uuid_set && + memcmp(uuid_zero, ident->uuid,sizeof(uuid_zero)) == 0)) && + (!st || !st->sb)) + for (tmpdev = devlist; tmpdev; tmpdev = tmpdev->next) { + if (tmpdev->used != 3) + continue; + tmpdev->used = 1; + content = *contentp; + + if (!st->sb) { + /* we need sb from one of the spares */ + int dfd = dev_open(tmpdev->devname, O_RDONLY); + if (dfd < 0 || + st->ss->load_super(st, dfd, NULL)) + tmpdev->used = 2; + if (dfd > 0) + close(dfd); + } + } + + /* Now reject spares that don't match domains of identified members */ + for (tmpdev = devlist; tmpdev; tmpdev = tmpdev->next) { + struct stat stb; + if (tmpdev->used != 3) + continue; + if (stat(tmpdev->devname, &stb)< 0) { + pr_err("fstat failed for %s: %s\n", + tmpdev->devname, strerror(errno)); + tmpdev->used = 2; + } else { + struct dev_policy *pol = devid_policy(stb.st_rdev); + int dt = domain_test(domains, pol, NULL); + if (inargv && dt != 0) + /* take this spare as domains match + * if there are any */ + tmpdev->used = 1; + else if (!inargv && dt == 1) + /* device wasn't explicitly listed, so need + * explicit domain match - which we have */ + tmpdev->used = 1; + else + /* if domains don't match mark as unused */ + tmpdev->used = 0; + dev_policy_free(pol); + } + } + domain_free(domains); + *stp = st; + if (st && st->sb && content == *contentp) + st->ss->getinfo_super(st, content, NULL); + *contentp = content; + + return num_devs; +} + +struct devs { + char *devname; + int uptodate; /* set once we decide that this device is as + * recent as everything else in the array. + */ + int included; /* set if the device is already in the array + * due to a previous '-I' + */ + struct mdinfo i; +}; + +static int load_devices(struct devs *devices, char *devmap, + struct mddev_ident *ident, struct supertype **stp, + struct mddev_dev *devlist, struct context *c, + struct mdinfo *content, + int mdfd, char *mddev, + int *most_recentp, int *bestcntp, int **bestp, + int inargv) +{ + struct mddev_dev *tmpdev; + int devcnt = 0; + int nextspare = 0; +#ifndef MDASSEMBLE + int bitmap_done = 0; +#endif + int most_recent = -1; + int bestcnt = 0; + int *best = *bestp; + struct supertype *st = *stp; + + for (tmpdev = devlist; tmpdev; tmpdev=tmpdev->next) { + char *devname = tmpdev->devname; + struct stat stb; + struct supertype *tst; + int i; + int dfd; + + if (tmpdev->used != 1) + continue; + /* looks like a good enough match to update the super block if needed */ +#ifndef MDASSEMBLE + if (c->update) { + /* prepare useful information in info structures */ + struct stat stb2; + int err; + fstat(mdfd, &stb2); + + if (strcmp(c->update, "uuid")==0 && + !ident->uuid_set) { + int rfd; + if ((rfd = open("/dev/urandom", O_RDONLY)) < 0 || + read(rfd, ident->uuid, 16) != 16) { + *(__u32*)(ident->uuid) = random(); + *(__u32*)(ident->uuid+1) = random(); + *(__u32*)(ident->uuid+2) = random(); + *(__u32*)(ident->uuid+3) = random(); + } + if (rfd >= 0) close(rfd); + } + dfd = dev_open(devname, + tmpdev->disposition == 'I' + ? O_RDWR : (O_RDWR|O_EXCL)); + + tst = dup_super(st); + if (dfd < 0 || tst->ss->load_super(tst, dfd, NULL) != 0) { + pr_err("cannot re-read metadata from %s - aborting\n", + devname); + if (dfd >= 0) + close(dfd); + close(mdfd); + free(devices); + free(devmap); + tst->ss->free_super(tst); + free(tst); + *stp = st; + return -1; + } + tst->ss->getinfo_super(tst, content, devmap + devcnt * content->array.raid_disks); + + memcpy(content->uuid, ident->uuid, 16); + strcpy(content->name, ident->name); + content->array.md_minor = minor(stb2.st_rdev); + + if (strcmp(c->update, "byteorder") == 0) + err = 0; + else + err = tst->ss->update_super(tst, content, c->update, + devname, c->verbose, + ident->uuid_set, + c->homehost); + if (err < 0) { + if (err == -1) + pr_err("--update=%s not understood for %s metadata\n", + c->update, tst->ss->name); + tst->ss->free_super(tst); + free(tst); + close(mdfd); + close(dfd); + free(devices); + free(devmap); + *stp = st; + return -1; + } + if (strcmp(c->update, "uuid")==0 && + !ident->uuid_set) { + ident->uuid_set = 1; + memcpy(ident->uuid, content->uuid, 16); + } + if (tst->ss->store_super(tst, dfd)) + pr_err("Could not re-write superblock on %s.\n", + devname); + + if (strcmp(c->update, "uuid")==0 && + ident->bitmap_fd >= 0 && !bitmap_done) { + if (bitmap_update_uuid(ident->bitmap_fd, + content->uuid, + tst->ss->swapuuid) != 0) + pr_err("Could not update uuid on external bitmap.\n"); + else + bitmap_done = 1; + } + } else +#endif + { + dfd = dev_open(devname, + tmpdev->disposition == 'I' + ? O_RDWR : (O_RDWR|O_EXCL)); + tst = dup_super(st); + + if (dfd < 0 || tst->ss->load_super(tst, dfd, NULL) != 0) { + pr_err("cannot re-read metadata from %s - aborting\n", + devname); + if (dfd >= 0) + close(dfd); + close(mdfd); + free(devices); + free(devmap); + tst->ss->free_super(tst); + free(tst); + *stp = st; + return -1; + } + tst->ss->getinfo_super(tst, content, devmap + devcnt * content->array.raid_disks); + } + + fstat(dfd, &stb); + close(dfd); + + if (c->verbose > 0) + pr_err("%s is identified as a member of %s, slot %d%s.\n", + devname, mddev, content->disk.raid_disk, + (content->disk.state & (1<<MD_DISK_REPLACEMENT)) ? " replacement":""); + devices[devcnt].devname = devname; + devices[devcnt].uptodate = 0; + devices[devcnt].included = (tmpdev->disposition == 'I'); + devices[devcnt].i = *content; + devices[devcnt].i.disk.major = major(stb.st_rdev); + devices[devcnt].i.disk.minor = minor(stb.st_rdev); + + if (devices[devcnt].i.disk.state == 6) { + if (most_recent < 0 || + devices[devcnt].i.events + > devices[most_recent].i.events) { + struct supertype *tmp = tst; + tst = st; + st = tmp; + most_recent = devcnt; + } + } + tst->ss->free_super(tst); + free(tst); + + if (content->array.level == LEVEL_MULTIPATH) + /* with multipath, the raid_disk from the superblock is meaningless */ + i = devcnt; + else + i = devices[devcnt].i.disk.raid_disk; + if (i+1 == 0) { + if (nextspare < content->array.raid_disks*2) + nextspare = content->array.raid_disks*2; + i = nextspare++; + } else { + /* i is raid_disk - double it so there is room for + * replacements */ + i *= 2; + if (devices[devcnt].i.disk.state & (1<<MD_DISK_REPLACEMENT)) + i++; + if (i >= content->array.raid_disks*2 && + i >= nextspare) + nextspare = i+1; + } + if (i < 10000) { + if (i >= bestcnt) { + int newbestcnt = i+10; + int *newbest = xmalloc(sizeof(int)*newbestcnt); + int c; + for (c=0; c < newbestcnt; c++) + if (c < bestcnt) + newbest[c] = best[c]; + else + newbest[c] = -1; + if (best)free(best); + best = newbest; + bestcnt = newbestcnt; + } + if (best[i] >=0 && + devices[best[i]].i.events + == devices[devcnt].i.events + && (devices[best[i]].i.disk.minor + != devices[devcnt].i.disk.minor) + && st->ss == &super0 + && content->array.level != LEVEL_MULTIPATH) { + /* two different devices with identical superblock. + * Could be a mis-detection caused by overlapping + * partitions. fail-safe. + */ + pr_err("WARNING %s and %s appear to have very similar superblocks.\n" + " If they are really different, please --zero the superblock on one\n" + " If they are the same or overlap, please remove one from %s.\n", + devices[best[i]].devname, devname, + inargv ? "the list" : + "the\n DEVICE list in mdadm.conf" + ); + close(mdfd); + free(devices); + free(devmap); + *stp = st; + return -1; + } + if (best[i] == -1 + || (devices[best[i]].i.events + < devices[devcnt].i.events)) + best[i] = devcnt; + } + devcnt++; + } + if (most_recent >= 0) + *most_recentp = most_recent; + *bestcntp = bestcnt; + *bestp = best; + *stp = st; + return devcnt; +} + +static int force_array(struct mdinfo *content, + struct devs *devices, + int *best, int bestcnt, char *avail, + int most_recent, + struct supertype *st, + struct context *c) +{ + int okcnt = 0; + while (!enough(content->array.level, content->array.raid_disks, + content->array.layout, 1, + avail) + || + (content->reshape_active && content->delta_disks > 0 && + !enough(content->array.level, (content->array.raid_disks + - content->delta_disks), + content->new_layout, 1, + avail) + )) { + /* Choose the newest best drive which is + * not up-to-date, update the superblock + * and add it. + */ + int fd; + struct supertype *tst; + unsigned long long current_events; + int chosen_drive = -1; + int i; + + for (i = 0; + i < content->array.raid_disks * 2 && i < bestcnt; + i += 2) { + int j = best[i]; + if (j < 0) + continue; + if (devices[j].uptodate) + continue; + if (devices[j].i.recovery_start != MaxSector) { + int delta; + if (!devices[j].i.reshape_active || + devices[j].i.delta_disks <= 0) + continue; + /* When increasing number of devices, an + * added device also appears to be + * recovering. It is safe to include it + * as long as it won't be a source of + * data. + * For now, just allow for last data + * devices in RAID4 or last devices in RAID4/5/6. + */ + delta = devices[j].i.delta_disks; + if (devices[j].i.array.level >= 4 && + devices[j].i.array.level <= 6 && + i/2 >= content->array.raid_disks - delta) + /* OK */; + else if (devices[j].i.array.level == 4 && + i/2 >= content->array.raid_disks - delta - 1) + /* OK */; + else + continue; + } + if (chosen_drive < 0 || + devices[j].i.events + > devices[chosen_drive].i.events) + chosen_drive = j; + } + if (chosen_drive < 0) + break; + current_events = devices[chosen_drive].i.events; + add_another: + if (c->verbose >= 0) + pr_err("forcing event count in %s(%d) from %d upto %d\n", + devices[chosen_drive].devname, + devices[chosen_drive].i.disk.raid_disk, + (int)(devices[chosen_drive].i.events), + (int)(devices[most_recent].i.events)); + fd = dev_open(devices[chosen_drive].devname, + devices[chosen_drive].included ? O_RDWR + : (O_RDWR|O_EXCL)); + if (fd < 0) { + pr_err("Couldn't open %s for write - not updating\n", + devices[chosen_drive].devname); + devices[chosen_drive].i.events = 0; + continue; + } + tst = dup_super(st); + if (tst->ss->load_super(tst,fd, NULL)) { + close(fd); + pr_err("RAID superblock disappeared from %s - not updating.\n", + devices[chosen_drive].devname); + devices[chosen_drive].i.events = 0; + continue; + } + content->events = devices[most_recent].i.events; + tst->ss->update_super(tst, content, "force-one", + devices[chosen_drive].devname, c->verbose, + 0, NULL); + + if (tst->ss->store_super(tst, fd)) { + close(fd); + pr_err("Could not re-write superblock on %s\n", + devices[chosen_drive].devname); + devices[chosen_drive].i.events = 0; + tst->ss->free_super(tst); + continue; + } + close(fd); + devices[chosen_drive].i.events = devices[most_recent].i.events; + devices[chosen_drive].uptodate = 1; + avail[chosen_drive] = 1; + okcnt++; + tst->ss->free_super(tst); + + /* If there are any other drives of the same vintage, + * add them in as well. We can't lose and we might gain + */ + for (i = 0; + i < content->array.raid_disks * 2 && i < bestcnt ; + i += 2) { + int j = best[i]; + if (j >= 0 && + !devices[j].uptodate && + devices[j].i.recovery_start == MaxSector && + devices[j].i.events == current_events) { + chosen_drive = j; + goto add_another; + } + } + } + return okcnt; +} + +static int start_array(int mdfd, + char *mddev, + struct mdinfo *content, + struct supertype *st, + struct mddev_ident *ident, + int *best, int bestcnt, + int chosen_drive, + struct devs *devices, + unsigned int okcnt, + unsigned int sparecnt, + unsigned int rebuilding_cnt, + struct context *c, + int clean, char *avail, + int start_partial_ok, + int err_ok, + int was_forced + ) +{ + int rv; + int i; + unsigned int req_cnt; + + rv = set_array_info(mdfd, st, content); + if (rv && !err_ok) { + pr_err("failed to set array info for %s: %s\n", + mddev, strerror(errno)); + return 1; + } + if (ident->bitmap_fd >= 0) { + if (ioctl(mdfd, SET_BITMAP_FILE, ident->bitmap_fd) != 0) { + pr_err("SET_BITMAP_FILE failed.\n"); + return 1; + } + } else if (ident->bitmap_file) { + /* From config file */ + int bmfd = open(ident->bitmap_file, O_RDWR); + if (bmfd < 0) { + pr_err("Could not open bitmap file %s\n", + ident->bitmap_file); + return 1; + } + if (ioctl(mdfd, SET_BITMAP_FILE, bmfd) != 0) { + pr_err("Failed to set bitmapfile for %s\n", mddev); + close(bmfd); + return 1; + } + close(bmfd); + } + + /* First, add the raid disks, but add the chosen one last */ + for (i=0; i<= bestcnt; i++) { + int j; + if (i < bestcnt) { + j = best[i]; + if (j == chosen_drive) + continue; + } else + j = chosen_drive; + + if (j >= 0 && !devices[j].included) { + int dfd = dev_open(devices[j].devname, + O_RDWR|O_EXCL); + if (dfd >= 0) { + remove_partitions(dfd); + close(dfd); + } + rv = add_disk(mdfd, st, content, &devices[j].i); + + if (rv) { + pr_err("failed to add %s to %s: %s\n", + devices[j].devname, + mddev, + strerror(errno)); + if (i < content->array.raid_disks * 2 + || i == bestcnt) + okcnt--; + else + sparecnt--; + } else if (c->verbose > 0) + pr_err("added %s to %s as %d%s%s\n", + devices[j].devname, mddev, + devices[j].i.disk.raid_disk, + devices[j].uptodate?"": + " (possibly out of date)", + (devices[j].i.disk.state & (1<<MD_DISK_REPLACEMENT))?" replacement":""); + } else if (j >= 0) { + if (c->verbose > 0) + pr_err("%s is already in %s as %d\n", + devices[j].devname, mddev, + devices[j].i.disk.raid_disk); + } else if (c->verbose > 0 && i < content->array.raid_disks*2 + && (i&1) == 0) + pr_err("no uptodate device for slot %d of %s\n", + i/2, mddev); + } + + if (content->array.level == LEVEL_CONTAINER) { + if (c->verbose >= 0) { + pr_err("Container %s has been assembled with %d drive%s", + mddev, okcnt+sparecnt, okcnt+sparecnt==1?"":"s"); + if (okcnt < (unsigned)content->array.raid_disks) + fprintf(stderr, " (out of %d)", + content->array.raid_disks); + fprintf(stderr, "\n"); + } + + if (st->ss->validate_container) { + struct mdinfo *devices_list; + struct mdinfo *info_devices = xmalloc(sizeof(struct mdinfo)*(okcnt+sparecnt)); + unsigned int count; + devices_list = NULL; + for (count = 0; count < okcnt+sparecnt; count++) { + info_devices[count] = devices[count].i; + info_devices[count].next = devices_list; + devices_list = &info_devices[count]; + } + if (st->ss->validate_container(devices_list)) + pr_err("Mismatch detected!\n"); + free(info_devices); + } + + st->ss->free_super(st); + sysfs_uevent(content, "change"); + if (err_ok && okcnt < (unsigned)content->array.raid_disks) + /* Was partial, is still partial, so signal an error + * to ensure we don't retry */ + return 1; + return 0; + } + + /* Get number of in-sync devices according to the superblock. + * We must have this number to start the array without -s or -R + */ + req_cnt = content->array.working_disks; + + if (c->runstop == 1 || + (c->runstop <= 0 && + ( enough(content->array.level, content->array.raid_disks, + content->array.layout, clean, avail) && + (okcnt + rebuilding_cnt >= req_cnt || start_partial_ok) + ))) { + /* This array is good-to-go. + * If a reshape is in progress then we might need to + * continue monitoring it. In that case we start + * it read-only and let the grow code make it writable. + */ + int rv; +#ifndef MDASSEMBLE + if (content->reshape_active && + !(content->reshape_active & RESHAPE_NO_BACKUP) && + content->delta_disks <= 0) { + if (!c->backup_file) { + pr_err("%s: Need a backup file to complete reshape of this array.\n", + mddev); + pr_err("Please provided one with \"--backup-file=...\"\n"); + if (c->update && + strcmp(c->update, "revert-reshape") == 0) + pr_err("(Don't specify --update=revert-reshape again, that part succeeded.)\n"); + return 1; + } + rv = sysfs_set_str(content, NULL, + "array_state", "readonly"); + if (rv == 0) + rv = Grow_continue(mdfd, st, content, + c->backup_file, 0, + c->freeze_reshape); + } else if (c->readonly && + sysfs_attribute_available( + content, NULL, "array_state")) { + rv = sysfs_set_str(content, NULL, + "array_state", "readonly"); + } else +#endif + rv = ioctl(mdfd, RUN_ARRAY, NULL); + reopen_mddev(mdfd); /* drop O_EXCL */ + if (rv == 0) { + if (c->verbose >= 0) { + pr_err("%s has been started with %d drive%s", + mddev, okcnt, okcnt==1?"":"s"); + if (okcnt < (unsigned)content->array.raid_disks) + fprintf(stderr, " (out of %d)", content->array.raid_disks); + if (rebuilding_cnt) + fprintf(stderr, "%s %d rebuilding", sparecnt?",":" and", rebuilding_cnt); + if (sparecnt) + fprintf(stderr, " and %d spare%s", sparecnt, sparecnt==1?"":"s"); + fprintf(stderr, ".\n"); + } + if (content->reshape_active && + content->array.level >= 4 && + content->array.level <= 6) { + /* might need to increase the size + * of the stripe cache - default is 256 + */ + int chunk_size = content->array.chunk_size; + if (content->reshape_active && + content->new_chunk > chunk_size) + chunk_size = content->new_chunk; + if (256 < 4 * ((chunk_size+4065)/4096)) { + struct mdinfo *sra = sysfs_read(mdfd, NULL, 0); + if (sra) + sysfs_set_num(sra, NULL, + "stripe_cache_size", + (4 * chunk_size / 4096) + 1); + sysfs_free(sra); + } + } + if (okcnt < (unsigned)content->array.raid_disks) { + /* If any devices did not get added + * because the kernel rejected them based + * on event count, try adding them + * again providing the action policy is + * 're-add' or greater. The bitmap + * might allow them to be included, or + * they will become spares. + */ + for (i = 0; i < bestcnt; i++) { + int j = best[i]; + if (j >= 0 && !devices[j].uptodate) { + if (!disk_action_allows(&devices[j].i, st->ss->name, act_re_add)) + continue; + rv = add_disk(mdfd, st, content, + &devices[j].i); + if (rv == 0 && c->verbose >= 0) + pr_err("%s has been re-added.\n", + devices[j].devname); + } + } + } + if (content->array.level == 6 && + okcnt + 1 == (unsigned)content->array.raid_disks && + was_forced) { + struct mdinfo *sra = sysfs_read(mdfd, NULL, 0); + if (sra) + sysfs_set_str(sra, NULL, + "sync_action", "repair"); + sysfs_free(sra); + } + return 0; + } + pr_err("failed to RUN_ARRAY %s: %s\n", + mddev, strerror(errno)); + + if (!enough(content->array.level, content->array.raid_disks, + content->array.layout, 1, avail)) + pr_err("Not enough devices to start the array.\n"); + else if (!enough(content->array.level, + content->array.raid_disks, + content->array.layout, clean, + avail)) + pr_err("Not enough devices to start the array while not clean - consider --force.\n"); + + return 1; + } + if (c->runstop == -1) { + pr_err("%s assembled from %d drive%s", + mddev, okcnt, okcnt==1?"":"s"); + if (okcnt != (unsigned)content->array.raid_disks) + fprintf(stderr, " (out of %d)", content->array.raid_disks); + fprintf(stderr, ", but not started.\n"); + return 2; + } + if (c->verbose >= -1) { + pr_err("%s assembled from %d drive%s", mddev, okcnt, okcnt==1?"":"s"); + if (rebuilding_cnt) + fprintf(stderr, "%s %d rebuilding", sparecnt?",":" and", rebuilding_cnt); + if (sparecnt) + fprintf(stderr, " and %d spare%s", sparecnt, sparecnt==1?"":"s"); + if (!enough(content->array.level, content->array.raid_disks, + content->array.layout, 1, avail)) + fprintf(stderr, " - not enough to start the array.\n"); + else if (!enough(content->array.level, + content->array.raid_disks, + content->array.layout, clean, + avail)) + fprintf(stderr, " - not enough to start the array while not clean - consider --force.\n"); + else { + if (req_cnt == (unsigned)content->array.raid_disks) + fprintf(stderr, " - need all %d to start it", req_cnt); + else + fprintf(stderr, " - need %d to start", req_cnt); + fprintf(stderr, " (use --run to insist).\n"); + } + } + return 1; +} + +int Assemble(struct supertype *st, char *mddev, + struct mddev_ident *ident, + struct mddev_dev *devlist, + struct context *c) +{ + /* + * The task of Assemble is to find a collection of + * devices that should (according to their superblocks) + * form an array, and to give this collection to the MD driver. + * In Linux-2.4 and later, this involves submitting a + * SET_ARRAY_INFO ioctl with no arg - to prepare + * the array - and then submit a number of + * ADD_NEW_DISK ioctls to add disks into + * the array. Finally RUN_ARRAY might + * be submitted to start the array. + * + * Much of the work of Assemble is in finding and/or + * checking the disks to make sure they look right. + * + * If mddev is not set, then scan must be set and we + * read through the config file for dev+uuid mapping + * We recurse, setting mddev, for each device that + * - isn't running + * - has a valid uuid (or any uuid if !uuidset) + * + * If mddev is set, we try to determine state of md. + * check version - must be at least 0.90.0 + * check kernel version. must be at least 2.4. + * If not, we can possibly fall back on START_ARRAY + * Try to GET_ARRAY_INFO. + * If possible, give up + * If not, try to STOP_ARRAY just to make sure + * + * If !uuidset and scan, look in conf-file for uuid + * If not found, give up + * If !devlist and scan and uuidset, get list of devs from conf-file + * + * For each device: + * Check superblock - discard if bad + * Check uuid (set if we don't have one) - discard if no match + * Check superblock similarity if we have a superblock - discard if different + * Record events, devicenum + * This should give us a list of devices for the array + * We should collect the most recent event number + * + * Count disks with recent enough event count + * While force && !enough disks + * Choose newest rejected disks, update event count + * mark clean and rewrite superblock + * If recent kernel: + * SET_ARRAY_INFO + * foreach device with recent events : ADD_NEW_DISK + * if runstop == 1 || "enough" disks and runstop==0 -> RUN_ARRAY + * If old kernel: + * Check the device numbers in superblock are right + * update superblock if any changes + * START_ARRAY + * + */ + int rv; + int mdfd; + int clean; + int auto_assem = (mddev == NULL && !ident->uuid_set && + ident->super_minor == UnSet && ident->name[0] == 0 + && (ident->container == NULL || ident->member == NULL)); + struct devs *devices; + char *devmap; + int *best = NULL; /* indexed by raid_disk */ + int bestcnt = 0; + int devcnt; + unsigned int okcnt, sparecnt, rebuilding_cnt, replcnt; + int i; + int was_forced = 0; + int most_recent = 0; + int chosen_drive; + int change = 0; + int inargv = 0; + int start_partial_ok = (c->runstop >= 0) && + (c->force || devlist==NULL || auto_assem); + int num_devs; + struct mddev_dev *tmpdev; + struct mdinfo info; + struct mdinfo *content = NULL; + struct mdinfo *pre_exist = NULL; + char *avail; + char *name = NULL; + char chosen_name[1024]; + struct map_ent *map = NULL; + struct map_ent *mp; + + /* + * If any subdevs are listed, then any that don't + * match ident are discarded. Remainder must all match and + * become the array. + * If no subdevs, then we scan all devices in the config file, but + * there must be something in the identity + */ + + if (!devlist && + ident->uuid_set == 0 && + (ident->super_minor < 0 || ident->super_minor == UnSet) && + ident->name[0] == 0 && + (ident->container == NULL || ident->member == NULL) && + ident->devices == NULL) { + pr_err("No identity information available for %s - cannot assemble.\n", + mddev ? mddev : "further assembly"); + return 1; + } + + if (devlist == NULL) + devlist = conf_get_devs(); + else if (mddev) + inargv = 1; + +try_again: + /* We come back here when doing auto-assembly and attempting some + * set of devices failed. Those are now marked as ->used==2 and + * we ignore them and try again + */ + if (!st && ident->st) + st = ident->st; + if (c->verbose>0) + pr_err("looking for devices for %s\n", + mddev ? mddev : "further assembly"); + + content = &info; + if (st && c->force) + st->ignore_hw_compat = 1; + num_devs = select_devices(devlist, ident, &st, &content, c, + inargv, auto_assem); + if (num_devs < 0) + return 1; + + if (!st || !st->sb || !content) + return 2; + + /* We have a full set of devices - we now need to find the + * array device. + * However there is a risk that we are racing with "mdadm -I" + * and the array is already partially assembled - we will have + * rejected any devices already in this address. + * So we take a lock on the map file - to prevent further races - + * and look for the uuid in there. If found and the array is + * active, we abort. If found and the array is not active + * we commit to that md device and add all the contained devices + * to our list. We flag them so that we don't try to re-add, + * but can remove if they turn out to not be wanted. + */ + if (map_lock(&map)) + pr_err("failed to get exclusive lock on mapfile - continue anyway...\n"); + if (c->update && strcmp(c->update,"uuid") == 0) + mp = NULL; + else + mp = map_by_uuid(&map, content->uuid); + if (mp) { + struct mdinfo *dv; + /* array already exists. */ + pre_exist = sysfs_read(-1, mp->devnm, GET_LEVEL|GET_DEVS); + if (pre_exist->array.level != UnSet) { + pr_err("Found some drive for an array that is already active: %s\n", + mp->path); + pr_err("giving up.\n"); + return 1; + } + for (dv = pre_exist->devs; dv; dv = dv->next) { + /* We want to add this device to our list, + * but it could already be there if "mdadm -I" + * started *after* we checked for O_EXCL. + * If we add it to the top of the list + * it will be preferred over later copies. + */ + struct mddev_dev *newdev; + char *devname = map_dev(dv->disk.major, + dv->disk.minor, + 0); + if (!devname) + continue; + newdev = xmalloc(sizeof(*newdev)); + newdev->devname = devname; + newdev->disposition = 'I'; + newdev->used = 1; + newdev->next = devlist; + devlist = newdev; + num_devs++; + } + strcpy(chosen_name, mp->path); + if (c->verbose > 0 || mddev == NULL || + strcmp(mddev, chosen_name) != 0) + pr_err("Merging with already-assembled %s\n", + chosen_name); + mdfd = open_dev_excl(mp->devnm); + } else { + int trustworthy = FOREIGN; + name = content->name; + switch (st->ss->match_home(st, c->homehost) + ?: st->ss->match_home(st, "any")) { + case 1: + trustworthy = LOCAL; + name = strchr(content->name, ':'); + if (name) + name++; + else + name = content->name; + break; + } + if (!auto_assem) + /* If the array is listed in mdadm.conf or on + * command line, then we trust the name + * even if the array doesn't look local + */ + trustworthy = LOCAL; + + if (name[0] == 0 && + content->array.level == LEVEL_CONTAINER) { + name = content->text_version; + trustworthy = METADATA; + } + + if (name[0] && trustworthy != LOCAL && + ! c->require_homehost && + conf_name_is_free(name)) + trustworthy = LOCAL; + + if (trustworthy == LOCAL && + strchr(name, ':')) + /* Ignore 'host:' prefix of name */ + name = strchr(name, ':')+1; + + mdfd = create_mddev(mddev, name, ident->autof, trustworthy, + chosen_name); + } + if (mdfd < 0) { + st->ss->free_super(st); + if (auto_assem) + goto try_again; + return 1; + } + mddev = chosen_name; + if (get_linux_version() < 2004000 || + md_get_version(mdfd) < 9000) { + pr_err("Assemble requires Linux 2.4 or later, and\n" + " md driver version 0.90.0 or later.\n" + " Upgrade your kernel or try --build\n"); + close(mdfd); + return 1; + } + if (pre_exist == NULL) { + if (mddev_busy(fd2devnm(mdfd))) { + pr_err("%s already active, cannot restart it!\n", + mddev); + for (tmpdev = devlist ; + tmpdev && tmpdev->used != 1; + tmpdev = tmpdev->next) + ; + if (tmpdev && auto_assem) + pr_err("%s needed for %s...\n", + mddev, tmpdev->devname); + close(mdfd); + mdfd = -3; + st->ss->free_super(st); + if (auto_assem) + goto try_again; + return 1; + } + /* just incase it was started but has no content */ + ioctl(mdfd, STOP_ARRAY, NULL); + } + +#ifndef MDASSEMBLE + if (content != &info) { + /* This is a member of a container. Try starting the array. */ + int err; + err = assemble_container_content(st, mdfd, content, c, + chosen_name, NULL); + close(mdfd); + return err; + } +#endif + /* Ok, no bad inconsistancy, we can try updating etc */ + devices = xcalloc(num_devs, sizeof(*devices)); + devmap = xcalloc(num_devs, content->array.raid_disks); + devcnt = load_devices(devices, devmap, ident, &st, devlist, + c, content, mdfd, mddev, + &most_recent, &bestcnt, &best, inargv); + if (devcnt < 0) + return 1; + + if (devcnt == 0) { + pr_err("no devices found for %s\n", + mddev); + if (st) + st->ss->free_super(st); + close(mdfd); + free(devices); + free(devmap); + return 1; + } + + if (c->update && strcmp(c->update, "byteorder")==0) + st->minor_version = 90; + + st->ss->getinfo_super(st, content, NULL); + clean = content->array.state & 1; + + /* now we have some devices that might be suitable. + * I wonder how many + */ + avail = xcalloc(content->array.raid_disks, 1); + okcnt = 0; + replcnt = 0; + sparecnt=0; + rebuilding_cnt=0; + for (i=0; i< bestcnt; i++) { + int j = best[i]; + int event_margin = 1; /* always allow a difference of '1' + * like the kernel does + */ + if (j < 0) continue; + /* note: we ignore error flags in multipath arrays + * as they don't make sense + */ + if (content->array.level != LEVEL_MULTIPATH) + if (!(devices[j].i.disk.state & (1<<MD_DISK_ACTIVE))) { + if (!(devices[j].i.disk.state + & (1<<MD_DISK_FAULTY))) { + devices[j].uptodate = 1; + sparecnt++; + } + continue; + } + /* If this device thinks that 'most_recent' has failed, then + * we must reject this device. + */ + if (j != most_recent && !c->force && + content->array.raid_disks > 0 && + devices[most_recent].i.disk.raid_disk >= 0 && + devmap[j * content->array.raid_disks + devices[most_recent].i.disk.raid_disk] == 0) { + if (c->verbose > -1) + pr_err("ignoring %s as it reports %s as failed\n", + devices[j].devname, devices[most_recent].devname); + best[i] = -1; + continue; + } + /* Require event counter to be same as, or just less than, + * most recent. If it is bigger, it must be a stray spare and + * should be ignored. + */ + if (devices[j].i.events+event_margin >= + devices[most_recent].i.events && + devices[j].i.events <= + devices[most_recent].i.events + ) { + devices[j].uptodate = 1; + if (i < content->array.raid_disks * 2) { + if (devices[j].i.recovery_start == MaxSector || + (content->reshape_active && + i >= content->array.raid_disks - content->delta_disks)) { + if (!avail[i/2]) { + okcnt++; + avail[i/2]=1; + } else + replcnt++; + } else + rebuilding_cnt++; + } else + sparecnt++; + } + } + free(devmap); + if (c->force) { + int force_ok = force_array(content, devices, best, bestcnt, + avail, most_recent, st, c); + okcnt += force_ok; + if (force_ok) + was_forced = 1; + } + /* Now we want to look at the superblock which the kernel will base things on + * and compare the devices that we think are working with the devices that the + * superblock thinks are working. + * If there are differences and --force is given, then update this chosen + * superblock. + */ + chosen_drive = -1; + st->ss->free_super(st); + for (i=0; chosen_drive < 0 && i<bestcnt; i+=2) { + int j = best[i]; + int fd; + + if (j<0) + continue; + if (!devices[j].uptodate) + continue; + if (devices[j].i.events < devices[most_recent].i.events) + continue; + chosen_drive = j; + if ((fd=dev_open(devices[j].devname, + devices[j].included ? O_RDONLY + : (O_RDONLY|O_EXCL)))< 0) { + pr_err("Cannot open %s: %s\n", + devices[j].devname, strerror(errno)); + close(mdfd); + free(devices); + return 1; + } + if (st->ss->load_super(st,fd, NULL)) { + close(fd); + pr_err("RAID superblock has disappeared from %s\n", + devices[j].devname); + close(mdfd); + free(devices); + return 1; + } + close(fd); + } + if (st->sb == NULL) { + pr_err("No suitable drives found for %s\n", mddev); + close(mdfd); + free(devices); + return 1; + } + st->ss->getinfo_super(st, content, NULL); +#ifndef MDASSEMBLE + sysfs_init(content, mdfd, NULL); +#endif + for (i=0; i<bestcnt; i++) { + int j = best[i]; + unsigned int desired_state; + + if (i >= content->array.raid_disks * 2) + desired_state = 0; + else if (i & 1) + desired_state = (1<<MD_DISK_ACTIVE) | (1<<MD_DISK_REPLACEMENT); + else + desired_state = (1<<MD_DISK_ACTIVE) | (1<<MD_DISK_SYNC); + + if (j<0) + continue; + if (!devices[j].uptodate) + continue; + + devices[j].i.disk.state = desired_state; + if (!(devices[j].i.array.state & 1)) + clean = 0; + + if (st->ss->update_super(st, &devices[j].i, "assemble", NULL, + c->verbose, 0, NULL)) { + if (c->force) { + if (c->verbose >= 0) + pr_err("clearing FAULTY flag for device %d in %s for %s\n", + j, mddev, devices[j].devname); + change = 1; + } else { + if (c->verbose >= -1) + pr_err("device %d in %s has wrong state in superblock, but %s seems ok\n", + i, mddev, devices[j].devname); + } + } +#if 0 + if (!(super.disks[i].i.disk.state & (1 << MD_DISK_FAULTY))) { + pr_err("devices %d of %s is not marked FAULTY in superblock, but cannot be found\n", + i, mddev); + } +#endif + } + if (c->force && !clean && + !enough(content->array.level, content->array.raid_disks, + content->array.layout, clean, + avail)) { + change += st->ss->update_super(st, content, "force-array", + devices[chosen_drive].devname, c->verbose, + 0, NULL); + was_forced = 1; + clean = 1; + } + + if (change) { + int fd; + fd = dev_open(devices[chosen_drive].devname, + devices[chosen_drive].included ? + O_RDWR : (O_RDWR|O_EXCL)); + if (fd < 0) { + pr_err("Could not open %s for write - cannot Assemble array.\n", + devices[chosen_drive].devname); + close(mdfd); + free(devices); + return 1; + } + if (st->ss->store_super(st, fd)) { + close(fd); + pr_err("Could not re-write superblock on %s\n", + devices[chosen_drive].devname); + close(mdfd); + free(devices); + return 1; + } + if (c->verbose >= 0) + pr_err("Marking array %s as 'clean'\n", + mddev); + close(fd); + } + + /* If we are in the middle of a reshape we may need to restore saved data + * that was moved aside due to the reshape overwriting live data + * The code of doing this lives in Grow.c + */ +#ifndef MDASSEMBLE + if (content->reshape_active && + !(content->reshape_active & RESHAPE_NO_BACKUP)) { + int err = 0; + int *fdlist = xmalloc(sizeof(int)* bestcnt); + if (c->verbose > 0) + pr_err("%s has an active reshape - checking if critical section needs to be restored\n", + chosen_name); + if (!c->backup_file) + c->backup_file = locate_backup(content->sys_name); + enable_fds(bestcnt/2); + for (i = 0; i < bestcnt/2; i++) { + int j = best[i*2]; + if (j >= 0) { + fdlist[i] = dev_open(devices[j].devname, + devices[j].included + ? O_RDWR : (O_RDWR|O_EXCL)); + if (fdlist[i] < 0) { + pr_err("Could not open %s for write - cannot Assemble array.\n", + devices[j].devname); + err = 1; + break; + } + } else + fdlist[i] = -1; + } + if (!err) { + if (st->ss->external && st->ss->recover_backup) + err = st->ss->recover_backup(st, content); + else + err = Grow_restart(st, content, fdlist, bestcnt/2, + c->backup_file, c->verbose > 0); + if (err && c->invalid_backup) { + if (c->verbose > 0) + pr_err("continuing without restoring backup\n"); + err = 0; + } + } + while (i>0) { + i--; + if (fdlist[i]>=0) close(fdlist[i]); + } + free(fdlist); + if (err) { + pr_err("Failed to restore critical section for reshape, sorry.\n"); + if (c->backup_file == NULL) + cont_err("Possibly you needed to specify the --backup-file\n"); + close(mdfd); + free(devices); + return err; + } + } +#endif + + /* Almost ready to actually *do* something */ + /* First, fill in the map, so that udev can find our name + * as soon as we become active. + */ + if (c->update && strcmp(c->update, "metadata")==0) { + content->array.major_version = 1; + content->array.minor_version = 0; + strcpy(content->text_version, "1.0"); + } + + map_update(&map, fd2devnm(mdfd), content->text_version, + content->uuid, chosen_name); + + rv = start_array(mdfd, mddev, content, + st, ident, best, bestcnt, + chosen_drive, devices, okcnt, sparecnt, + rebuilding_cnt, + c, + clean, avail, start_partial_ok, + pre_exist != NULL, + was_forced); + if (rv == 1 && !pre_exist) + ioctl(mdfd, STOP_ARRAY, NULL); + free(devices); + map_unlock(&map); + if (rv == 0) { + wait_for(chosen_name, mdfd); + close(mdfd); + if (auto_assem) { + int usecs = 1; + /* There is a nasty race with 'mdadm --monitor'. + * If it opens this device before we close it, + * it gets an incomplete open on which IO + * doesn't work and the capacity is + * wrong. + * If we reopen (to check for layered devices) + * before --monitor closes, we loose. + * + * So: wait upto 1 second for there to be + * a non-zero capacity. + */ + while (usecs < 1000) { + mdfd = open(mddev, O_RDONLY); + if (mdfd >= 0) { + unsigned long long size; + if (get_dev_size(mdfd, NULL, &size) && + size > 0) + break; + close(mdfd); + } + usleep(usecs); + usecs <<= 1; + } + } + } else + close(mdfd); + + /* '2' means 'OK, but not started yet' */ + return rv == 2 ? 0 : rv; +} + +#ifndef MDASSEMBLE +int assemble_container_content(struct supertype *st, int mdfd, + struct mdinfo *content, struct context *c, + char *chosen_name, int *result) +{ + struct mdinfo *dev, *sra, *dev2; + int working = 0, preexist = 0; + int expansion = 0; + struct map_ent *map = NULL; + int old_raid_disks; + int start_reshape; + char *avail = NULL; + int err; + + sysfs_init(content, mdfd, NULL); + + sra = sysfs_read(mdfd, NULL, GET_VERSION|GET_DEVS); + if (sra == NULL || strcmp(sra->text_version, content->text_version) != 0) { + if (content->array.major_version == -1 && + content->array.minor_version == -2 && + c->readonly && + content->text_version[0] == '/') + content->text_version[0] = '-'; + if (sysfs_set_array(content, md_get_version(mdfd)) != 0) { + if (sra) + sysfs_free(sra); + return 1; + } + } + + /* There are two types of reshape: container wide or sub-array specific + * Check if metadata requests blocking container wide reshapes + */ + start_reshape = (content->reshape_active && + !((content->reshape_active == CONTAINER_RESHAPE) && + (content->array.state & (1<<MD_SB_BLOCK_CONTAINER_RESHAPE)))); + + /* Block subarray here if it is under reshape now + * Do not allow for any changes in this array + */ + if (st->ss->external && content->recovery_blocked && start_reshape) + block_subarray(content); + + for (dev2 = sra->devs; dev2; dev2 = dev2->next) { + for (dev = content->devs; dev; dev = dev->next) + if (dev2->disk.major == dev->disk.major && + dev2->disk.minor == dev->disk.minor) + break; + if (dev) + continue; + /* Don't want this one any more */ + if (sysfs_set_str(sra, dev2, "slot", "none") < 0 && + errno == EBUSY) { + pr_err("Cannot remove old device %s: not updating %s\n", dev2->sys_name, sra->sys_name); + sysfs_free(sra); + return 1; + } + sysfs_set_str(sra, dev2, "state", "remove"); + } + old_raid_disks = content->array.raid_disks - content->delta_disks; + avail = xcalloc(content->array.raid_disks, 1); + for (dev = content->devs; dev; dev = dev->next) { + if (dev->disk.raid_disk >= 0) + avail[dev->disk.raid_disk] = 1; + if (sysfs_add_disk(content, dev, 1) == 0) { + if (dev->disk.raid_disk >= old_raid_disks && + content->reshape_active) + expansion++; + else + working++; + } else if (errno == EEXIST) + preexist++; + } + sysfs_free(sra); + if (working + expansion == 0 && c->runstop <= 0) { + free(avail); + return 1;/* Nothing new, don't try to start */ + } + map_update(&map, fd2devnm(mdfd), + content->text_version, + content->uuid, chosen_name); + + + if (enough(content->array.level, content->array.raid_disks, + content->array.layout, content->array.state & 1, avail) == 0) { + if (c->export && result) + *result |= INCR_NO; + else if (c->verbose >= 0) { + pr_err("%s assembled with %d device%s", + chosen_name, preexist + working, + preexist + working == 1 ? "":"s"); + if (preexist) + fprintf(stderr, " (%d new)", working); + fprintf(stderr, " but not started\n"); + } + free(avail); + return 1; + } + free(avail); + + if (c->runstop <= 0 && + (working + preexist + expansion) < + content->array.working_disks) { + if (c->export && result) + *result |= INCR_UNSAFE; + else if (c->verbose >= 0) { + pr_err("%s assembled with %d device%s", + chosen_name, preexist + working, + preexist + working == 1 ? "":"s"); + if (preexist) + fprintf(stderr, " (%d new)", working); + fprintf(stderr, " but not safe to start\n"); + } + return 1; + } + + + if (start_reshape) { + int spare = content->array.raid_disks + expansion; + if (restore_backup(st, content, + working, + spare, &c->backup_file, c->verbose) == 1) + return 1; + + err = sysfs_set_str(content, NULL, + "array_state", "readonly"); + if (err) + return 1; + + if (st->ss->external) { + if (!mdmon_running(st->container_devnm)) + start_mdmon(st->container_devnm); + ping_monitor(st->container_devnm); + if (mdmon_running(st->container_devnm) && + st->update_tail == NULL) + st->update_tail = &st->updates; + } + + err = Grow_continue(mdfd, st, content, c->backup_file, + 0, c->freeze_reshape); + } else switch(content->array.level) { + case LEVEL_LINEAR: + case LEVEL_MULTIPATH: + case 0: + err = sysfs_set_str(content, NULL, "array_state", + c->readonly ? "readonly" : "active"); + break; + default: + err = sysfs_set_str(content, NULL, "array_state", + "readonly"); + /* start mdmon if needed. */ + if (!err) { + if (!mdmon_running(st->container_devnm)) + start_mdmon(st->container_devnm); + ping_monitor(st->container_devnm); + } + break; + } + if (!err) + sysfs_set_safemode(content, content->safe_mode_delay); + + /* Block subarray here if it is not reshaped now + * It has be blocked a little later to allow mdmon to switch in + * in to R/W state + */ + if (st->ss->external && content->recovery_blocked && + !start_reshape) + block_subarray(content); + + if (c->export && result) { + if (err) + *result |= INCR_NO; + else + *result |= INCR_YES; + } else if (c->verbose >= 0) { + if (err) + pr_err("array %s now has %d device%s", + chosen_name, working + preexist, + working + preexist == 1 ? "":"s"); + else + pr_err("Started %s with %d device%s", + chosen_name, working + preexist, + working + preexist == 1 ? "":"s"); + if (preexist) + fprintf(stderr, " (%d new)", working); + if (expansion) + fprintf(stderr, " ( + %d for expansion)", + expansion); + fprintf(stderr, "\n"); + } + if (!err) + wait_for(chosen_name, mdfd); + return err; + /* FIXME should have an O_EXCL and wait for read-auto */ +} +#endif diff --git a/Build.c b/Build.c new file mode 100644 index 00000000..8603c710 --- /dev/null +++ b/Build.c @@ -0,0 +1,292 @@ +/* + * mdadm - manage Linux "md" devices aka RAID arrays. + * + * Copyright (C) 2001-2009 Neil Brown <neilb@suse.de> + * + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Author: Neil Brown + * Email: <neilb@suse.de> + */ + +#include "mdadm.h" + +#define REGISTER_DEV _IO (MD_MAJOR, 1) +#define START_MD _IO (MD_MAJOR, 2) +#define STOP_MD _IO (MD_MAJOR, 3) + +int Build(char *mddev, struct mddev_dev *devlist, + struct shape *s, struct context *c) +{ + /* Build a linear or raid0 arrays without superblocks + * We cannot really do any checks, we just do it. + * For md_version < 0.90.0, we call REGISTER_DEV + * with the device numbers, and then + * START_MD giving the "geometry" + * geometry is 0xpp00cc + * where pp is personality: 1==linear, 2=raid0 + * cc = chunk size factor: 0==4k, 1==8k etc. + * + * For md_version >= 0.90.0 we call + * SET_ARRAY_INFO, ADD_NEW_DISK, RUN_ARRAY + * + */ + int i; + int vers; + struct stat stb; + int subdevs = 0, missing_disks = 0; + struct mddev_dev *dv; + int bitmap_fd; + unsigned long long bitmapsize; + int mdfd; + char chosen_name[1024]; + int uuid[4] = {0,0,0,0}; + struct map_ent *map = NULL; + + /* scan all devices, make sure they really are block devices */ + for (dv = devlist; dv; dv=dv->next) { + subdevs++; + if (strcmp("missing", dv->devname) == 0) { + missing_disks++; + continue; + } + if (stat(dv->devname, &stb)) { + pr_err("Cannot find %s: %s\n", + dv->devname, strerror(errno)); + return 1; + } + if ((stb.st_mode & S_IFMT) != S_IFBLK) { + pr_err("%s is not a block device.\n", + dv->devname); + return 1; + } + } + + if (s->raiddisks != subdevs) { + pr_err("requested %d devices in array but listed %d\n", + s->raiddisks, subdevs); + return 1; + } + + if (s->layout == UnSet) + switch(s->level) { + default: /* no layout */ + s->layout = 0; + break; + case 10: + s->layout = 0x102; /* near=2, far=1 */ + if (c->verbose > 0) + pr_err("layout defaults to n1\n"); + break; + case 5: + case 6: + s->layout = map_name(r5layout, "default"); + if (c->verbose > 0) + pr_err("layout defaults to %s\n", map_num(r5layout, s->layout)); + break; + case LEVEL_FAULTY: + s->layout = map_name(faultylayout, "default"); + + if (c->verbose > 0) + pr_err("layout defaults to %s\n", map_num(faultylayout, s->layout)); + break; + } + + /* We need to create the device. It can have no name. */ + map_lock(&map); + mdfd = create_mddev(mddev, NULL, c->autof, LOCAL, + chosen_name); + if (mdfd < 0) { + map_unlock(&map); + return 1; + } + mddev = chosen_name; + + map_update(&map, fd2devnm(mdfd), "none", uuid, chosen_name); + map_unlock(&map); + + vers = md_get_version(mdfd); + + /* looks Ok, go for it */ + if (vers >= 9000) { + mdu_array_info_t array; + array.level = s->level; + if (s->size == MAX_SIZE) + s->size = 0; + array.size = s->size; + array.nr_disks = s->raiddisks; + array.raid_disks = s->raiddisks; + array.md_minor = 0; + if (fstat(mdfd, &stb)==0) + array.md_minor = minor(stb.st_rdev); + array.not_persistent = 1; + array.state = 0; /* not clean, but no errors */ + if (s->assume_clean) + array.state |= 1; + array.active_disks = s->raiddisks - missing_disks; + array.working_disks = s->raiddisks - missing_disks; + array.spare_disks = 0; + array.failed_disks = missing_disks; + if (s->chunk == 0 && (s->level==0 || s->level==LEVEL_LINEAR)) + s->chunk = 64; + array.chunk_size = s->chunk*1024; + array.layout = s->layout; + if (ioctl(mdfd, SET_ARRAY_INFO, &array)) { + pr_err("SET_ARRAY_INFO failed for %s: %s\n", + mddev, strerror(errno)); + goto abort; + } + } else if (s->bitmap_file) { + pr_err("bitmaps not supported with this kernel\n"); + goto abort; + } + + if (s->bitmap_file && strcmp(s->bitmap_file, "none") == 0) + s->bitmap_file = NULL; + if (s->bitmap_file && s->level <= 0) { + pr_err("bitmaps not meaningful with level %s\n", + map_num(pers, s->level)?:"given"); + goto abort; + } + /* now add the devices */ + for ((i=0), (dv = devlist) ; dv ; i++, dv=dv->next) { + unsigned long long dsize; + int fd; + if (strcmp("missing", dv->devname) == 0) + continue; + if (stat(dv->devname, &stb)) { + pr_err("Weird: %s has disappeared.\n", + dv->devname); + goto abort; + } + if ((stb.st_mode & S_IFMT)!= S_IFBLK) { + pr_err("Weird: %s is no longer a block device.\n", + dv->devname); + goto abort; + } + fd = open(dv->devname, O_RDONLY|O_EXCL); + if (fd < 0) { + pr_err("Cannot open %s: %s\n", + dv->devname, strerror(errno)); + goto abort; + } + if (get_dev_size(fd, NULL, &dsize) && + (s->size == 0 || s->size == MAX_SIZE || dsize < s->size)) + s->size = dsize; + close(fd); + if (vers >= 9000) { + mdu_disk_info_t disk; + disk.number = i; + disk.raid_disk = i; + disk.state = (1<<MD_DISK_SYNC) | (1<<MD_DISK_ACTIVE); + if (dv->writemostly == 1) + disk.state |= 1<<MD_DISK_WRITEMOSTLY; + disk.major = major(stb.st_rdev); + disk.minor = minor(stb.st_rdev); + if (ioctl(mdfd, ADD_NEW_DISK, &disk)) { + pr_err("ADD_NEW_DISK failed for %s: %s\n", + dv->devname, strerror(errno)); + goto abort; + } + } else { + if (ioctl(mdfd, REGISTER_DEV, &stb.st_rdev)) { + pr_err("REGISTER_DEV failed for %s: %s.\n", + dv->devname, strerror(errno)); + goto abort; + } + } + } + /* now to start it */ + if (vers >= 9000) { + mdu_param_t param; /* not used by syscall */ + if (s->bitmap_file) { + bitmap_fd = open(s->bitmap_file, O_RDWR); + if (bitmap_fd < 0) { + int major = BITMAP_MAJOR_HI; +#if 0 + if (s->bitmap_chunk == UnSet) { + pr_err("%s cannot be openned.", + s->bitmap_file); + goto abort; + } +#endif + if (vers < 9003) { + major = BITMAP_MAJOR_HOSTENDIAN; +#ifdef __BIG_ENDIAN + pr_err("Warning - bitmaps created on this kernel are not portable\n" + " between different architectures. Consider upgrading the Linux kernel.\n"); +#endif + } + bitmapsize = s->size>>9; /* FIXME wrong for RAID10 */ + if (CreateBitmap(s->bitmap_file, 1, NULL, s->bitmap_chunk, + c->delay, s->write_behind, bitmapsize, major)) { + goto abort; + } + bitmap_fd = open(s->bitmap_file, O_RDWR); + if (bitmap_fd < 0) { + pr_err("%s cannot be openned.", + s->bitmap_file); + goto abort; + } + } + if (bitmap_fd >= 0) { + if (ioctl(mdfd, SET_BITMAP_FILE, bitmap_fd) < 0) { + pr_err("Cannot set bitmap file for %s: %s\n", + mddev, strerror(errno)); + goto abort; + } + } + } + if (ioctl(mdfd, RUN_ARRAY, ¶m)) { + pr_err("RUN_ARRAY failed: %s\n", + strerror(errno)); + if (s->chunk & (s->chunk-1)) { + cont_err("Problem may be that chunk size is not a power of 2\n"); + } + goto abort; + } + } else { + unsigned long arg; + arg=0; + while (s->chunk > 4096) { + arg++; + s->chunk >>= 1; + } + if (s->level == 0) + arg |= 0x20000; + else + arg |= 0x10000; + if (ioctl(mdfd, START_MD, arg)) { + pr_err("START_MD failed: %s\n", + strerror(errno)); + goto abort; + } + } + if (c->verbose >= 0) + pr_err("array %s built and started.\n", + mddev); + wait_for(mddev, mdfd); + close(mdfd); + return 0; + + abort: + if (vers >= 9000) + ioctl(mdfd, STOP_ARRAY, 0); + else + ioctl(mdfd, STOP_MD, 0); + close(mdfd); + return 1; +} diff --git a/COPYING b/COPYING new file mode 100644 index 00000000..d159169d --- /dev/null +++ b/COPYING @@ -0,0 +1,339 @@ + GNU GENERAL PUBLIC LICENSE + Version 2, June 1991 + + Copyright (C) 1989, 1991 Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The licenses for most software are designed to take away your +freedom to share and change it. By contrast, the GNU General Public +License is intended to guarantee your freedom to share and change free +software--to make sure the software is free for all its users. This +General Public License applies to most of the Free Software +Foundation's software and to any other program whose authors commit to +using it. (Some other Free Software Foundation software is covered by +the GNU Lesser General Public License instead.) You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +this service if you wish), that you receive source code or can get it +if you want it, that you can change the software or use pieces of it +in new free programs; and that you know you can do these things. + + To protect your rights, we need to make restrictions that forbid +anyone to deny you these rights or to ask you to surrender the rights. +These restrictions translate to certain responsibilities for you if you +distribute copies of the software, or if you modify it. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must give the recipients all the rights that +you have. You must make sure that they, too, receive or can get the +source code. And you must show them these terms so they know their +rights. + + We protect your rights with two steps: (1) copyright the software, and +(2) offer you this license which gives you legal permission to copy, +distribute and/or modify the software. + + Also, for each author's protection and ours, we want to make certain +that everyone understands that there is no warranty for this free +software. If the software is modified by someone else and passed on, we +want its recipients to know that what they have is not the original, so +that any problems introduced by others will not reflect on the original +authors' reputations. + + Finally, any free program is threatened constantly by software +patents. We wish to avoid the danger that redistributors of a free +program will individually obtain patent licenses, in effect making the +program proprietary. To prevent this, we have made it clear that any +patent must be licensed for everyone's free use or not licensed at all. + + The precise terms and conditions for copying, distribution and +modification follow. + + GNU GENERAL PUBLIC LICENSE + TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION + + 0. This License applies to any program or other work which contains +a notice placed by the copyright holder saying it may be distributed +under the terms of this General Public License. The "Program", below, +refers to any such program or work, and a "work based on the Program" +means either the Program or any derivative work under copyright law: +that is to say, a work containing the Program or a portion of it, +either verbatim or with modifications and/or translated into another +language. (Hereinafter, translation is included without limitation in +the term "modification".) Each licensee is addressed as "you". + +Activities other than copying, distribution and modification are not +covered by this License; they are outside its scope. The act of +running the Program is not restricted, and the output from the Program +is covered only if its contents constitute a work based on the +Program (independent of having been made by running the Program). +Whether that is true depends on what the Program does. + + 1. You may copy and distribute verbatim copies of the Program's +source code as you receive it, in any medium, provided that you +conspicuously and appropriately publish on each copy an appropriate +copyright notice and disclaimer of warranty; keep intact all the +notices that refer to this License and to the absence of any warranty; +and give any other recipients of the Program a copy of this License +along with the Program. + +You may charge a fee for the physical act of transferring a copy, and +you may at your option offer warranty protection in exchange for a fee. + + 2. You may modify your copy or copies of the Program or any portion +of it, thus forming a work based on the Program, and copy and +distribute such modifications or work under the terms of Section 1 +above, provided that you also meet all of these conditions: + + a) You must cause the modified files to carry prominent notices + stating that you changed the files and the date of any change. + + b) You must cause any work that you distribute or publish, that in + whole or in part contains or is derived from the Program or any + part thereof, to be licensed as a whole at no charge to all third + parties under the terms of this License. + + c) If the modified program normally reads commands interactively + when run, you must cause it, when started running for such + interactive use in the most ordinary way, to print or display an + announcement including an appropriate copyright notice and a + notice that there is no warranty (or else, saying that you provide + a warranty) and that users may redistribute the program under + these conditions, and telling the user how to view a copy of this + License. (Exception: if the Program itself is interactive but + does not normally print such an announcement, your work based on + the Program is not required to print an announcement.) + +These requirements apply to the modified work as a whole. If +identifiable sections of that work are not derived from the Program, +and can be reasonably considered independent and separate works in +themselves, then this License, and its terms, do not apply to those +sections when you distribute them as separate works. But when you +distribute the same sections as part of a whole which is a work based +on the Program, the distribution of the whole must be on the terms of +this License, whose permissions for other licensees extend to the +entire whole, and thus to each and every part regardless of who wrote it. + +Thus, it is not the intent of this section to claim rights or contest +your rights to work written entirely by you; rather, the intent is to +exercise the right to control the distribution of derivative or +collective works based on the Program. + +In addition, mere aggregation of another work not based on the Program +with the Program (or with a work based on the Program) on a volume of +a storage or distribution medium does not bring the other work under +the scope of this License. + + 3. You may copy and distribute the Program (or a work based on it, +under Section 2) in object code or executable form under the terms of +Sections 1 and 2 above provided that you also do one of the following: + + a) Accompany it with the complete corresponding machine-readable + source code, which must be distributed under the terms of Sections + 1 and 2 above on a medium customarily used for software interchange; or, + + b) Accompany it with a written offer, valid for at least three + years, to give any third party, for a charge no more than your + cost of physically performing source distribution, a complete + machine-readable copy of the corresponding source code, to be + distributed under the terms of Sections 1 and 2 above on a medium + customarily used for software interchange; or, + + c) Accompany it with the information you received as to the offer + to distribute corresponding source code. (This alternative is + allowed only for noncommercial distribution and only if you + received the program in object code or executable form with such + an offer, in accord with Subsection b above.) + +The source code for a work means the preferred form of the work for +making modifications to it. For an executable work, complete source +code means all the source code for all modules it contains, plus any +associated interface definition files, plus the scripts used to +control compilation and installation of the executable. However, as a +special exception, the source code distributed need not include +anything that is normally distributed (in either source or binary +form) with the major components (compiler, kernel, and so on) of the +operating system on which the executable runs, unless that component +itself accompanies the executable. + +If distribution of executable or object code is made by offering +access to copy from a designated place, then offering equivalent +access to copy the source code from the same place counts as +distribution of the source code, even though third parties are not +compelled to copy the source along with the object code. + + 4. You may not copy, modify, sublicense, or distribute the Program +except as expressly provided under this License. Any attempt +otherwise to copy, modify, sublicense or distribute the Program is +void, and will automatically terminate your rights under this License. +However, parties who have received copies, or rights, from you under +this License will not have their licenses terminated so long as such +parties remain in full compliance. + + 5. You are not required to accept this License, since you have not +signed it. However, nothing else grants you permission to modify or +distribute the Program or its derivative works. These actions are +prohibited by law if you do not accept this License. Therefore, by +modifying or distributing the Program (or any work based on the +Program), you indicate your acceptance of this License to do so, and +all its terms and conditions for copying, distributing or modifying +the Program or works based on it. + + 6. Each time you redistribute the Program (or any work based on the +Program), the recipient automatically receives a license from the +original licensor to copy, distribute or modify the Program subject to +these terms and conditions. You may not impose any further +restrictions on the recipients' exercise of the rights granted herein. +You are not responsible for enforcing compliance by third parties to +this License. + + 7. If, as a consequence of a court judgment or allegation of patent +infringement or for any other reason (not limited to patent issues), +conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot +distribute so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you +may not distribute the Program at all. For example, if a patent +license would not permit royalty-free redistribution of the Program by +all those who receive copies directly or indirectly through you, then +the only way you could satisfy both it and this License would be to +refrain entirely from distribution of the Program. + +If any portion of this section is held invalid or unenforceable under +any particular circumstance, the balance of the section is intended to +apply and the section as a whole is intended to apply in other +circumstances. + +It is not the purpose of this section to induce you to infringe any +patents or other property right claims or to contest validity of any +such claims; this section has the sole purpose of protecting the +integrity of the free software distribution system, which is +implemented by public license practices. Many people have made +generous contributions to the wide range of software distributed +through that system in reliance on consistent application of that +system; it is up to the author/donor to decide if he or she is willing +to distribute software through any other system and a licensee cannot +impose that choice. + +This section is intended to make thoroughly clear what is believed to +be a consequence of the rest of this License. + + 8. If the distribution and/or use of the Program is restricted in +certain countries either by patents or by copyrighted interfaces, the +original copyright holder who places the Program under this License +may add an explicit geographical distribution limitation excluding +those countries, so that distribution is permitted only in or among +countries not thus excluded. In such case, this License incorporates +the limitation as if written in the body of this License. + + 9. The Free Software Foundation may publish revised and/or new versions +of the General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + +Each version is given a distinguishing version number. If the Program +specifies a version number of this License which applies to it and "any +later version", you have the option of following the terms and conditions +either of that version or of any later version published by the Free +Software Foundation. If the Program does not specify a version number of +this License, you may choose any version ever published by the Free Software +Foundation. + + 10. If you wish to incorporate parts of the Program into other free +programs whose distribution conditions are different, write to the author +to ask for permission. For software which is copyrighted by the Free +Software Foundation, write to the Free Software Foundation; we sometimes +make exceptions for this. Our decision will be guided by the two goals +of preserving the free status of all derivatives of our free software and +of promoting the sharing and reuse of software generally. + + NO WARRANTY + + 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY +FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN +OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES +PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED +OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS +TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE +PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, +REPAIR OR CORRECTION. + + 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR +REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, +INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING +OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED +TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY +YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER +PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE +POSSIBILITY OF SUCH DAMAGES. + + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +convey the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. + + <one line to give the program's name and a brief idea of what it does.> + Copyright (C) <year> <name of author> + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + +Also add information on how to contact you by electronic and paper mail. + +If the program is interactive, make it output a short notice like this +when it starts in an interactive mode: + + Gnomovision version 69, Copyright (C) year name of author + Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. + This is free software, and you are welcome to redistribute it + under certain conditions; type `show c' for details. + +The hypothetical commands `show w' and `show c' should show the appropriate +parts of the General Public License. Of course, the commands you use may +be called something other than `show w' and `show c'; they could even be +mouse-clicks or menu items--whatever suits your program. + +You should also get your employer (if you work as a programmer) or your +school, if any, to sign a "copyright disclaimer" for the program, if +necessary. Here is a sample; alter the names: + + Yoyodyne, Inc., hereby disclaims all copyright interest in the program + `Gnomovision' (which makes passes at compilers) written by James Hacker. + + <signature of Ty Coon>, 1 April 1989 + Ty Coon, President of Vice + +This General Public License does not permit incorporating your program into +proprietary programs. If your program is a subroutine library, you may +consider it more useful to permit linking proprietary applications with the +library. If this is what you want to do, use the GNU Lesser General +Public License instead of this License. diff --git a/ChangeLog b/ChangeLog new file mode 100644 index 00000000..a3bf7007 --- /dev/null +++ b/ChangeLog @@ -0,0 +1,306 @@ +Please see git logs for detailed change log. +This file just contains highlight. + +Changes Prior to release 3.3 +- Some array reshapes can proceed without needing backup file. + This is done by changing the 'data_offset' so we never need to write + any data back over where it was before. If there is no "head space" + or "tail space" to allow data_offset to change, the old mechanism + with a backup file can still be used. +- RAID10 arrays can be reshaped to change the number of devices, + change the chunk size, or change the layout between 'near' + and 'offset'. + This will always change data_offset, and will fail if there is no + room for data_offset to be moved. +- "--assemble --update=metadata" can convert a 0.90 array to a 1.0 array. +- bad-block-logs are supported (but not heavily tested yet) +- "--assemble --update=revert-reshape" can be used to undo a reshape + that has just been started but isn't really wanted. This is very + new and while it passes basic tests it cannot be guaranteed. +- improved locking between --incremental and --assemble +- uses systemd to run "mdmon" if systemd is configured to do that. +- kernel names of md devices can be non-numeric. e.g. "md_home" rather than + "md0". This will probably confuse lots of other tools, so you need to + echo CREATE names=yes >> /etc/mdadm.conf + or the feature will not be used. (you also need a reasonably new kernel). +- "--stop" can be given a kernel name instead of a device name. i.e + mdadm --stop md4 + will work even if /dev/md4 doesn't exist. +- "--detail --export" has some information about the devices in the array +- --dump and --restore can be used to backup and restore the metadata on an + array. +- Hot-replace is supported with + mdadm /dev/mdX --replace /dev/foo + and + mdadm /dev/mdX --replace /dev/foo --with /dev/bar +- Config file can be a directory in which case all "*.conf" files are + read in lexical order. + Default is to read /etc/mdadm.conf and then /etc/mdadm.conf.d + Thus + echo CREATE name=yes > /etc/mdadm.conf.d/names.conf + will also enable the use of named md devices. + +- Lots of improvements to DDF support including adding support for + RAID10 (thanks Martin Wilck). + +Changes Prior to release 3.2.6 + - There are no real stand-out fixes, just lots of little bits and pieces. + +Changes Prior to release 3.2.5 + - This release primarily fixes a serious regression in 3.2.4. + This regression does *not* cause any risk to data. It simply + means that adding a device with "--add" would sometime fail + when it should not. + + - The fix also includes a couple of minor fixes such as making + the "--layout=preserve" option to "--grow" work again. + + +Changes Prior to release 3.2.4 +"--oneline" log of changes is below. Some notable ones are: + + - --offroot argument to improve interactions between mdmon and initrd + - --prefer argument to select which /dev names to display in some + circumstances. + - relax restructions on when "--add" will be allowed + - Fix bug with adding write-intent-bitmap to active array + - Now defaults to "/run/mdadm" for storing run-time files. + +Changes Prior to release 3.2.3 + - The largest single area of change is support for reshape of Intel + IMSM arrays (OnLine Capacity Explansion and Level Migration). + - Among other fixes, this now has a better chance of surviving if a + device fails during reshape. + +Changes Prior to release 3.2.2 + - reshaping IMSM (Intel metadata) arrays is no longer 'experimental', + it should work properly and be largely compatible with IMSM drivers in + other platforms. + - --assume-clean can be used with --grow --size to avoid resyncing the + new part of the array. This is only support with very new kernels. + - RAID0 arrays can have chunksize which is not a power of 2. This has been + supported in the kernel for a while but is only now supprted by + mdadm. + + - A new tool 'raid6check' is available which can check a RAID6 array, + or part of it, and report which device is most inconsistent with the + others if any stripe is inconsistent. This is still under development + and does not have a man page yet. If anyone tries it out and has any + questions or experience to report, they would be most welcome on + linux-raid@vger.kernel.org. + +Changes Prior to release 3.2.1 + - policy framework + Policy can be expressed for moving spare devices between arrays, and + for how to handle hot-plugged devices. This policy can be different + for devices plugged in to different controllers etc. + This, for example, allows a configuration where when a device is plugged + in it is immediately included in an md array as a hot spare and + possibly starts recovery immediately if an array is degraded. + + - some understanding of mbr and gpt paritition tables + This is primarly to support the new hot-plug support. If a + device is plugged in and policy suggests it should have a partition table, + the partition table will be copied from a suitably similar device, and + then the partitions will hot-plug and can then be added to md arrays. + + - "--incremental --remove" can remember where a device was removed from + so if a device gets plugged back in the same place, special policy applies + to it, allowing it to be included in an array even if a general hotplug + will not be included. + + - enhanced reshape options, including growing a RAID0 by converting to RAID4, + restriping, and converting back. Also convertions between RAID0 and + RAID10 and between RAID1 and RAID10 are possible (with a suitably recent + kernel). + + - spare migration for IMSM arrays. + Spare migration can now work across 'containers' using non-native metadata + and specifically Intel's IMSM arrays support spare migrations. + + - OLCE and level migration for Intel IMSM arrays. + OnLine Capacity Expansion and level migration (e.g. RAID0 -> RAID5) is + supported for Intel Matrix Storage Manager arrays. + This support is currently 'experimental' for technical reasons. It can + be enabled with "export MDADM_EXPERIMENTAL=1" + + - avoid including wayward devices + If you split a RAID1, mount the two halves as two separate degraded RAID1s, + and then later bring the two back together, it is possible that the md + metadata won't properly show that one must over-ride the other. + mdadm now does extra checking to detect this possibilty and avoid + potentially corrupting data. + + - remove any possible confusion between similar options. + e.g. --brief and --bitmap were mapped to 'b' and mdadm wouldn't + notice if one was used where the other was expected. + + - allow K,M,G suffixes on chunk sizes + +Changes Prior to release 3.2 + - By far the most significant change in this release related to the + management of reshaping arrays. This code has been substantially + re-written so that it can work with 'externally managed metadata' - + Intel's IMSM in particular. We now support level migration and + OnLine Capacity Expansion on these arrays. + - Policy framework. + Various policy statements can be made in the mdadm.conf to guide + the behaviour of mdadm, particular with regards to how new devices + are treated by "mdadm -I". + Depending on the 'action' associated with a device (identified by + its 'path') such need devices can be automatically re-added to and + existing array that they previously fell out off, or automatically + added as a spare if they appear to contain no data. + + - mdadm now has a limited understanding of partition tables. This + allows the policy framework to make decisions about partitioned + devices as well. + + - --incremental --remove can be told what --path the device was on, + and this info will be recorded so that another device appearing at + the same physical location can be preferentially added to the same + array (provides the spare-same-slot action policy applied to the + path). + + - A new flags "--invalid-backup" flag is available in --assemble + mode. This can be used to re-assemble an array which was stopping + in the middle of a reshape, and for which the 'backup file' is no + longer available or is corrupted. The array may have some + corruption in it at the point where reshape was up to, but at least + the rest of the array will become available. + + + - Various internal restructuring - more is needed. + +Changes Prior to release 3.1.5 + - Fixes for v1.x metadata on big-endian machines. + - man page improvements + - Improve '--detail --export' when run on partitions of an md array. + - Fix regression with removing 'failed' or 'detached' devices. + - Fixes for "--assemble --force" in various unusual cases. + - Allow '-Y' to mean --export. This was documented but not implemented. + - Various fixed for handling 'ddf' metadata. This is now more reliable + but could benefit from more interoperability testing. + - Correctly list subarrays of a container in "--detail" output. + - Improve checks on whether the requested number of devices is supported + by the metadata - both for --create and --grow. + - Don't remove partitions from a device that is being included in an + array until we are fully committed to including it. + - Allow "--assemble --update=no-bitmap" so an array with a corrupt + bitmap can still be assembled. + - Don't allow --add to succeed if it looks like a "--re-add" is probably + wanted, but cannot succeed. This avoids inadvertently turning + devices into spares when an array is failed. + +Changes Prior to release 3.1.4 + Two fixes related to configs that aren't using udev: + - Don't remove md devices which 'standard' names on --stop + - Allow dev_open to work on read-only /dev + And fixed regressions: + - Allow --incremental to add spares to an array + - Accept --no-degraded as a deprecated option rather than + throwing an error + - Return correct success status when --incrmental assembling + a container which does not yet have enough devices. + - Don't link mdadm with pthreads, only mdmon needs it. + - Fix compiler warning due to bad use of snprintf + +Changes Prior to release 3.1.3 + - mapfile now lives in a fixed location which default to + /dev/.mdadm/map but can be changed at compile time. This + location is choses and most distros provide it during early + boot and preserve it through. As long a /dev exists and is + writable, /dev/.mdadm will be created. + Other files file communication with mdmon live here too. + This fixes a bug reported by Debian and Gentoo users where + udev would spin in early-boot. + - IMSM and DDF metadata will not be recognised on partitions + as they should only be used on whole-disks. + - Various overflows causes by 2G drives have been addressed. + - A subarray of an IMSM contain can now be killed with + --kill-subarray. Also subarrays can be renamed with + --update-subarray + - -If (or --incremental --fail) can be used from udev to + fail and remove from all arrays a device which has been + unplugged from the system. i.e. hot-unplug-support. + - "mdadm /dev/mdX --re-add missing" will look for any device + that looks like it should be a member of /dev/mdX but isn't + and will automatically --re-add it + - Now compile with -Wextra to get extra warnings. + - Lots of minor bug fixes, documentation improvements, etcc + +Changes Prior to release 3.1.2 + - The default metadata has change again (sorry about that). + It is now v1.2 and will hopefully stay that way. It turned + out there with boot-block issues with v1.1 which make it + unsuitable for a default, though in many cases it is still + suitable to use. + - Stopping a container is not permitted when members are still + active + - Add 'homehost' to the valid words for the "AUTO" config file + line. When followed by "-all", this causes mdadm to + auto-assemble any array belonging to this host, but not + auto-assemble anything else. + - Fix some bugs with "--grow --chunksize=" for changing chunksize. + - VAR_RUN can be easily changed at compile time just like ALT_RUN. + This gives distros more flexability in how to manage the + pid and sock files that mdmon needs. + - Various mdmon fixes + - Alway make bitmap 4K-aligned if at all possible. + - If mdadm.conf lists arrays which have inter-dependencies, + the previously had to be listed in the "right" order. Now + any order should work. + - Fix --force assembly of v1.x arrays which are in the process + of recovering. + - Add section on 'scrubbing' to 'md' man page. + - Various command-line-option parsing improvements. + - ... and lots of other bug fixes. + +Changes Prior to release 3.1.1 + - Multiple fixes for new --grow levels including fixes for + serious data corruption problems. + - Change default metadata to v1.1 + - Change default chunk size to 512K + - Change default bitmap chunk size to 64Meg + - When --re-add is used, don't fall back to + --add if --re-add fails as this can destroy data. + +Changes Prior to release 3.1 + - Support --grow to change the layout of RAID4/5/6 + - Support --grow to change the chunksize of raid 4/5/6 + - Support --grow to change level from RAID1 -> RAID5 -> RAID6 and + back. + - Support --grow to reduce the number of devices in RAID4/5/6. + - Support restart of these grow options which assembling an array + which is partially grown. + - Assorted tests of this code, and of different RAID6 layouts. + +Changes Prior to release 3.0.3 + - Improvements for creating arrays giving just a name, like 'foo', + rather than the full '/dev/md/foo'. + - Improvements for assembling member arrays of containers. + - Improvements to test suite + - Add option to change increment for RebuildNN messages reported + by "mdadm --monitor" + - Improvements to mdmon 'hand-over' from initrd to final root. + - Handle merging of devices that have left an IMSM array and are + being re-incorporated. + - Add missing space in "--detail --brief" output. + +Changes Prior to release 3.0.2 + - Fix crash when hosthost is not set, as often happens in + early boot. + +Changes Prior to release 3.0.1 + - Fix various segfaults + - Fixed for --examine with containers + - Lots of other little fixes. + +Changes Prior to release 3.0 + - Support for externally managed metadata, specifically DDF and IMSM. + - Depend on udev to create entries in /dev, rather than creating them + ourselves. + - remove --auto-update-home-hosts + - new config file line "auto" + - new "<ignore>" and "any" options for "homehost" + - numerous bug fixes and minor enhancements. diff --git a/Create.c b/Create.c new file mode 100644 index 00000000..ef28da0c --- /dev/null +++ b/Create.c @@ -0,0 +1,1054 @@ +/* + * mdadm - manage Linux "md" devices aka RAID arrays. + * + * Copyright (C) 2001-2013 Neil Brown <neilb@suse.de> + * + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Author: Neil Brown + * Email: <neilb@suse.de> + */ + +#include "mdadm.h" +#include "md_u.h" +#include "md_p.h" +#include <ctype.h> + +static int default_layout(struct supertype *st, int level, int verbose) +{ + int layout = UnSet; + + if (st && st->ss->default_geometry) + st->ss->default_geometry(st, &level, &layout, NULL); + + if (layout == UnSet) + switch(level) { + default: /* no layout */ + layout = 0; + break; + case 10: + layout = 0x102; /* near=2, far=1 */ + if (verbose > 0) + pr_err("layout defaults to n2\n"); + break; + case 5: + case 6: + layout = map_name(r5layout, "default"); + if (verbose > 0) + pr_err("layout defaults to %s\n", map_num(r5layout, layout)); + break; + case LEVEL_FAULTY: + layout = map_name(faultylayout, "default"); + + if (verbose > 0) + pr_err("layout defaults to %s\n", map_num(faultylayout, layout)); + break; + } + + return layout; +} + +int Create(struct supertype *st, char *mddev, + char *name, int *uuid, + int subdevs, struct mddev_dev *devlist, + struct shape *s, + struct context *c, unsigned long long data_offset) +{ + /* + * Create a new raid array. + * + * First check that necessary details are available + * (i.e. level, raid-disks) + * + * Then check each disk to see what might be on it + * and report anything interesting. + * + * If anything looks odd, and runstop not set, + * abort. + * + * SET_ARRAY_INFO and ADD_NEW_DISK, and + * if runstop==run, or raiddisks disks were used, + * RUN_ARRAY + */ + int mdfd; + unsigned long long minsize=0, maxsize=0; + char *mindisc = NULL; + char *maxdisc = NULL; + int dnum; + struct mddev_dev *dv; + int fail=0, warn=0; + struct stat stb; + int first_missing = subdevs * 2; + int second_missing = subdevs * 2; + int missing_disks = 0; + int insert_point = subdevs * 2; /* where to insert a missing drive */ + int total_slots; + int pass; + int vers; + int rv; + int bitmap_fd; + int have_container = 0; + int container_fd = -1; + int need_mdmon = 0; + unsigned long long bitmapsize; + struct mdinfo info, *infos; + int did_default = 0; + int do_default_layout = 0; + int do_default_chunk = 0; + unsigned long safe_mode_delay = 0; + char chosen_name[1024]; + struct map_ent *map = NULL; + unsigned long long newsize; + + int major_num = BITMAP_MAJOR_HI; + + memset(&info, 0, sizeof(info)); + if (s->level == UnSet && st && st->ss->default_geometry) + st->ss->default_geometry(st, &s->level, NULL, NULL); + if (s->level == UnSet) { + pr_err("a RAID level is needed to create an array.\n"); + return 1; + } + if (s->raiddisks < 4 && s->level == 6) { + pr_err("at least 4 raid-devices needed for level 6\n"); + return 1; + } + if (s->raiddisks > 256 && s->level == 6) { + pr_err("no more than 256 raid-devices supported for level 6\n"); + return 1; + } + if (s->raiddisks < 2 && s->level >= 4) { + pr_err("at least 2 raid-devices needed for level 4 or 5\n"); + return 1; + } + if (s->level <= 0 && s->sparedisks) { + pr_err("This level does not support spare devices\n"); + return 1; + } + + if (subdevs == 1 && strcmp(devlist->devname, "missing") != 0) { + /* If given a single device, it might be a container, and we can + * extract a device list from there + */ + mdu_array_info_t inf; + int fd; + + memset(&inf, 0, sizeof(inf)); + fd = open(devlist->devname, O_RDONLY); + if (fd >= 0 && + ioctl(fd, GET_ARRAY_INFO, &inf) == 0 && + inf.raid_disks == 0) { + /* yep, looks like a container */ + if (st) { + rv = st->ss->load_container(st, fd, + devlist->devname); + if (rv == 0) + have_container = 1; + } else { + st = super_by_fd(fd, NULL); + if (st && !(rv = st->ss-> + load_container(st, fd, + devlist->devname))) + have_container = 1; + else + st = NULL; + } + if (have_container) { + subdevs = s->raiddisks; + first_missing = subdevs * 2; + second_missing = subdevs * 2; + insert_point = subdevs * 2; + } + } + if (fd >= 0) + close(fd); + } + if (st && st->ss->external && s->sparedisks) { + pr_err("This metadata type does not support spare disks at create time\n"); + return 1; + } + if (subdevs > s->raiddisks+s->sparedisks) { + pr_err("You have listed more devices (%d) than are in the array(%d)!\n", subdevs, s->raiddisks+s->sparedisks); + return 1; + } + if (!have_container && subdevs < s->raiddisks+s->sparedisks) { + pr_err("You haven't given enough devices (real or missing) to create this array\n"); + return 1; + } + if (s->bitmap_file && s->level <= 0) { + pr_err("bitmaps not meaningful with level %s\n", + map_num(pers, s->level)?:"given"); + return 1; + } + + /* now set some defaults */ + + if (s->layout == UnSet) { + do_default_layout = 1; + s->layout = default_layout(st, s->level, c->verbose); + } + + if (s->level == 10) + /* check layout fits in array*/ + if ((s->layout&255) * ((s->layout>>8)&255) > s->raiddisks) { + pr_err("that layout requires at least %d devices\n", + (s->layout&255) * ((s->layout>>8)&255)); + return 1; + } + + switch(s->level) { + case 4: + case 5: + case 10: + case 6: + case 0: + if (s->chunk == 0 || s->chunk == UnSet) { + s->chunk = UnSet; + do_default_chunk = 1; + /* chunk will be set later */ + } + break; + case LEVEL_LINEAR: + /* a chunksize of zero 0s perfectly valid (and preferred) since 2.6.16 */ + if (get_linux_version() < 2006016 && s->chunk == 0) { + s->chunk = 64; + if (c->verbose > 0) + pr_err("chunk size defaults to 64K\n"); + } + break; + case 1: + case LEVEL_FAULTY: + case LEVEL_MULTIPATH: + case LEVEL_CONTAINER: + if (s->chunk) { + s->chunk = 0; + if (c->verbose > 0) + pr_err("chunk size ignored for this level\n"); + } + break; + default: + pr_err("unknown level %d\n", s->level); + return 1; + } + if (s->size == MAX_SIZE) + /* use '0' to mean 'max' now... */ + s->size = 0; + if (s->size && s->chunk && s->chunk != UnSet) + s->size &= ~(unsigned long long)(s->chunk - 1); + newsize = s->size * 2; + if (st && ! st->ss->validate_geometry(st, s->level, s->layout, s->raiddisks, + &s->chunk, s->size*2, + data_offset, NULL, + &newsize, c->verbose>=0)) + return 1; + + if (s->chunk && s->chunk != UnSet) { + newsize &= ~(unsigned long long)(s->chunk*2 - 1); + if (do_default_chunk) { + /* default chunk was just set */ + if (c->verbose > 0) + pr_err("chunk size defaults to %dK\n", s->chunk); + s->size &= ~(unsigned long long)(s->chunk - 1); + do_default_chunk = 0; + } + } + + if (s->size == 0) { + s->size = newsize / 2; + if (s->level == 1) + /* If this is ever reshaped to RAID5, we will + * need a chunksize. So round it off a bit + * now just to be safe + */ + s->size &= ~(64ULL-1); + + if (s->size && c->verbose > 0) + pr_err("setting size to %lluK\n", s->size); + } + + /* now look at the subdevs */ + info.array.active_disks = 0; + info.array.working_disks = 0; + dnum = 0; + for (dv = devlist; dv ; dv = dv->next) + if (data_offset == VARIABLE_OFFSET) + dv->data_offset = INVALID_SECTORS; + else + dv->data_offset = data_offset; + + for (dv=devlist; dv && !have_container; dv=dv->next, dnum++) { + char *dname = dv->devname; + unsigned long long freesize; + int dfd; + char *doff; + + if (strcasecmp(dname, "missing")==0) { + if (first_missing > dnum) + first_missing = dnum; + if (second_missing > dnum && dnum > first_missing) + second_missing = dnum; + missing_disks ++; + continue; + } + if (data_offset == VARIABLE_OFFSET) { + doff = strchr(dname, ':'); + if (doff) { + *doff++ = 0; + dv->data_offset = parse_size(doff); + } else + dv->data_offset = INVALID_SECTORS; + } else + dv->data_offset = data_offset; + + dfd = open(dname, O_RDONLY); + if (dfd < 0) { + pr_err("cannot open %s: %s\n", + dname, strerror(errno)); + exit(2); + } + if (fstat(dfd, &stb) != 0 || + (stb.st_mode & S_IFMT) != S_IFBLK) { + close(dfd); + pr_err("%s is not a block device\n", + dname); + exit(2); + } + close(dfd); + info.array.working_disks++; + if (dnum < s->raiddisks) + info.array.active_disks++; + if (st == NULL) { + struct createinfo *ci = conf_get_create_info(); + if (ci) + st = ci->supertype; + } + if (st == NULL) { + /* Need to choose a default metadata, which is different + * depending on geometry of array. + */ + int i; + char *name = "default"; + for(i=0; !st && superlist[i]; i++) { + st = superlist[i]->match_metadata_desc(name); + if (!st) + continue; + if (do_default_layout) + s->layout = default_layout(st, s->level, c->verbose); + switch (st->ss->validate_geometry( + st, s->level, s->layout, s->raiddisks, + &s->chunk, s->size*2, + dv->data_offset, dname, + &freesize, c->verbose > 0)) { + case -1: /* Not valid, message printed, and not + * worth checking any further */ + exit(2); + break; + case 0: /* Geometry not valid */ + free(st); + st = NULL; + s->chunk = do_default_chunk ? UnSet : s->chunk; + break; + case 1: /* All happy */ + break; + } + } + + if (!st) { + int dfd = open(dname, O_RDONLY|O_EXCL); + if (dfd < 0) { + pr_err("cannot open %s: %s\n", + dname, strerror(errno)); + exit(2); + } + pr_err("device %s not suitable for any style of array\n", + dname); + exit(2); + } + if (st->ss != &super0 || + st->minor_version != 90) + did_default = 1; + } else { + if (do_default_layout) + s->layout = default_layout(st, s->level, 0); + if (!st->ss->validate_geometry(st, s->level, s->layout, + s->raiddisks, + &s->chunk, s->size*2, + dv->data_offset, + dname, &freesize, + c->verbose >= 0)) { + + pr_err("%s is not suitable for this array.\n", + dname); + fail = 1; + continue; + } + } + + freesize /= 2; /* convert to K */ + if (s->chunk && s->chunk != UnSet) { + /* round to chunk size */ + freesize = freesize & ~(s->chunk-1); + if (do_default_chunk) { + /* default chunk was just set */ + if (c->verbose > 0) + pr_err("chunk size defaults to %dK\n", s->chunk); + s->size &= ~(unsigned long long)(s->chunk - 1); + do_default_chunk = 0; + } + } + if (!freesize) { + pr_err("no free space left on %s\n", dname); + fail = 1; + continue; + } + + if (s->size && freesize < s->size) { + pr_err("%s is smaller than given size. %lluK < %lluK + metadata\n", + dname, freesize, s->size); + fail = 1; + continue; + } + if (maxdisc == NULL || (maxdisc && freesize > maxsize)) { + maxdisc = dname; + maxsize = freesize; + } + if (mindisc ==NULL || (mindisc && freesize < minsize)) { + mindisc = dname; + minsize = freesize; + } + if (c->runstop != 1 || c->verbose >= 0) { + int fd = open(dname, O_RDONLY); + if (fd <0 ) { + pr_err("Cannot open %s: %s\n", + dname, strerror(errno)); + fail=1; + continue; + } + warn |= check_ext2(fd, dname); + warn |= check_reiser(fd, dname); + warn |= check_raid(fd, dname); + if (strcmp(st->ss->name, "1.x") == 0 && + st->minor_version >= 1) + /* metadata at front */ + warn |= check_partitions(fd, dname, 0, 0); + else if (s->level == 1 || s->level == LEVEL_CONTAINER + || (s->level == 0 && s->raiddisks == 1)) + /* partitions could be meaningful */ + warn |= check_partitions(fd, dname, freesize*2, s->size*2); + else + /* partitions cannot be meaningful */ + warn |= check_partitions(fd, dname, 0, 0); + if (strcmp(st->ss->name, "1.x") == 0 && + st->minor_version >= 1 && + did_default && + s->level == 1 && + (warn & 1024) == 0) { + warn |= 1024; + pr_err("Note: this array has metadata at the start and\n" + " may not be suitable as a boot device. If you plan to\n" + " store '/boot' on this device please ensure that\n" + " your boot-loader understands md/v1.x metadata, or use\n" + " --metadata=0.90\n"); + } + close(fd); + } + } + if (s->raiddisks + s->sparedisks > st->max_devs) { + pr_err("Too many devices: %s metadata only supports %d\n", + st->ss->name, st->max_devs); + return 1; + } + if (have_container) + info.array.working_disks = s->raiddisks; + if (fail) { + pr_err("create aborted\n"); + return 1; + } + if (s->size == 0) { + if (mindisc == NULL && !have_container) { + pr_err("no size and no drives given - aborting create.\n"); + return 1; + } + if (s->level > 0 || s->level == LEVEL_MULTIPATH + || s->level == LEVEL_FAULTY + || st->ss->external ) { + /* size is meaningful */ + if (!st->ss->validate_geometry(st, s->level, s->layout, + s->raiddisks, + &s->chunk, minsize*2, + data_offset, + NULL, NULL, 0)) { + pr_err("devices too large for RAID level %d\n", s->level); + return 1; + } + s->size = minsize; + if (s->level == 1) + /* If this is ever reshaped to RAID5, we will + * need a chunksize. So round it off a bit + * now just to be safe + */ + s->size &= ~(64ULL-1); + if (c->verbose > 0) + pr_err("size set to %lluK\n", s->size); + } + } + + if (!s->bitmap_file && + s->level >= 1 && + st->ss->add_internal_bitmap && + (s->write_behind || s->size > 100*1024*1024ULL)) { + if (c->verbose > 0) + pr_err("automatically enabling write-intent bitmap on large array\n"); + s->bitmap_file = "internal"; + } + if (s->bitmap_file && strcmp(s->bitmap_file, "none") == 0) + s->bitmap_file = NULL; + + if (!have_container && s->level > 0 && ((maxsize-s->size)*100 > maxsize)) { + if (c->runstop != 1 || c->verbose >= 0) + pr_err("largest drive (%s) exceeds size (%lluK) by more than 1%%\n", + maxdisc, s->size); + warn = 1; + } + + if (st->ss->detail_platform && st->ss->detail_platform(0, 1, NULL) != 0) { + if (c->runstop != 1 || c->verbose >= 0) + pr_err("%s unable to enumerate platform support\n" + " array may not be compatible with hardware/firmware\n", + st->ss->name); + warn = 1; + } + + if (warn) { + if (c->runstop!= 1) { + if (!ask("Continue creating array? ")) { + pr_err("create aborted.\n"); + return 1; + } + } else { + if (c->verbose > 0) + pr_err("creation continuing despite oddities due to --run\n"); + } + } + + /* If this is raid4/5, we want to configure the last active slot + * as missing, so that a reconstruct happens (faster than re-parity) + * FIX: Can we do this for raid6 as well? + */ + if (st->ss->external == 0 && + s->assume_clean==0 && c->force == 0 && first_missing >= s->raiddisks) { + switch ( s->level ) { + case 4: + case 5: + insert_point = s->raiddisks-1; + s->sparedisks++; + info.array.active_disks--; + missing_disks++; + break; + default: + break; + } + } + /* For raid6, if creating with 1 missing drive, make a good drive + * into a spare, else the create will fail + */ + if (s->assume_clean == 0 && c->force == 0 && first_missing < s->raiddisks && + st->ss->external == 0 && + second_missing >= s->raiddisks && s->level == 6) { + insert_point = s->raiddisks - 1; + if (insert_point == first_missing) + insert_point--; + s->sparedisks ++; + info.array.active_disks--; + missing_disks++; + } + + if (s->level <= 0 && first_missing < subdevs * 2) { + pr_err("This level does not support missing devices\n"); + return 1; + } + + /* We need to create the device */ + map_lock(&map); + mdfd = create_mddev(mddev, name, c->autof, LOCAL, chosen_name); + if (mdfd < 0) { + map_unlock(&map); + return 1; + } + /* verify if chosen_name is not in use, + * it could be in conflict with already existing device + * e.g. container, array + */ + if (strncmp(chosen_name, "/dev/md/", 8) == 0 + && map_by_name(&map, chosen_name+8) != NULL) { + pr_err("Array name %s is in use already.\n", + chosen_name); + close(mdfd); + map_unlock(&map); + return 1; + } + mddev = chosen_name; + + vers = md_get_version(mdfd); + if (vers < 9000) { + pr_err("Create requires md driver version 0.90.0 or later\n"); + goto abort_locked; + } else { + mdu_array_info_t inf; + memset(&inf, 0, sizeof(inf)); + ioctl(mdfd, GET_ARRAY_INFO, &inf); + if (inf.working_disks != 0) { + pr_err("another array by this name is already running.\n"); + goto abort_locked; + } + } + + /* Ok, lets try some ioctls */ + + info.array.level = s->level; + info.array.size = s->size; + info.array.raid_disks = s->raiddisks; + /* The kernel should *know* what md_minor we are dealing + * with, but it chooses to trust me instead. Sigh + */ + info.array.md_minor = 0; + if (fstat(mdfd, &stb)==0) + info.array.md_minor = minor(stb.st_rdev); + info.array.not_persistent = 0; + + if ( ( (s->level == 4 || s->level == 5) && + (insert_point < s->raiddisks || first_missing < s->raiddisks) ) + || + ( s->level == 6 && (insert_point < s->raiddisks + || second_missing < s->raiddisks)) + || + ( s->level <= 0 ) + || + s->assume_clean + ) { + info.array.state = 1; /* clean, but one+ drive will be missing*/ + info.resync_start = MaxSector; + } else { + info.array.state = 0; /* not clean, but no errors */ + info.resync_start = 0; + } + if (s->level == 10) { + /* for raid10, the bitmap size is the capacity of the array, + * which is array.size * raid_disks / ncopies; + * .. but convert to sectors. + */ + int ncopies = ((s->layout>>8) & 255) * (s->layout & 255); + bitmapsize = s->size * s->raiddisks / ncopies * 2; +/* printf("bms=%llu as=%d rd=%d nc=%d\n", bitmapsize, s->size, s->raiddisks, ncopies);*/ + } else + bitmapsize = s->size * 2; + + /* There is lots of redundancy in these disk counts, + * raid_disks is the most meaningful value + * it describes the geometry of the array + * it is constant + * nr_disks is total number of used slots. + * it should be raid_disks+spare_disks + * spare_disks is the number of extra disks present + * see above + * active_disks is the number of working disks in + * active slots. (With raid_disks) + * working_disks is the total number of working disks, + * including spares + * failed_disks is the number of disks marked failed + * + * Ideally, the kernel would keep these (except raid_disks) + * up-to-date as we ADD_NEW_DISK, but it doesn't (yet). + * So for now, we assume that all raid and spare + * devices will be given. + */ + info.array.spare_disks=s->sparedisks; + info.array.failed_disks=missing_disks; + info.array.nr_disks = info.array.working_disks + + info.array.failed_disks; + info.array.layout = s->layout; + info.array.chunk_size = s->chunk*1024; + + if (name == NULL || *name == 0) { + /* base name on mddev */ + /* /dev/md0 -> 0 + * /dev/md_d0 -> d0 + * /dev/md_foo -> foo + * /dev/md/1 -> 1 + * /dev/md/d1 -> d1 + * /dev/md/home -> home + * /dev/mdhome -> home + */ + /* FIXME compare this with rules in create_mddev */ + name = strrchr(mddev, '/'); + if (name) { + name++; + if (strncmp(name, "md_", 3)==0 && + strlen(name) > 3 && + (name-mddev) == 5 /* /dev/ */) + name += 3; + else if (strncmp(name, "md", 2)==0 && + strlen(name) > 2 && + isdigit(name[2]) && + (name-mddev) == 5 /* /dev/ */) + name += 2; + } + } + if (!st->ss->init_super(st, &info.array, s->size, name, c->homehost, uuid, + data_offset)) + goto abort_locked; + + total_slots = info.array.nr_disks; + st->ss->getinfo_super(st, &info, NULL); + sysfs_init(&info, mdfd, NULL); + + if (did_default && c->verbose >= 0) { + if (is_subarray(info.text_version)) { + char devnm[32]; + char *ep; + struct mdinfo *mdi; + + strncpy(devnm, info.text_version+1, 32); + devnm[31] = 0; + ep = strchr(devnm, '/'); + if (ep) + *ep = 0; + + mdi = sysfs_read(-1, devnm, GET_VERSION); + + pr_err("Creating array inside %s container %s\n", + mdi?mdi->text_version:"managed", devnm); + sysfs_free(mdi); + } else + pr_err("Defaulting to version %s metadata\n", info.text_version); + } + + map_update(&map, fd2devnm(mdfd), info.text_version, + info.uuid, chosen_name); + /* Keep map locked until devices have been added to array + * to stop another mdadm from finding and using those devices. + */ + + if (s->bitmap_file && vers < 9003) { + major_num = BITMAP_MAJOR_HOSTENDIAN; +#ifdef __BIG_ENDIAN + pr_err("Warning - bitmaps created on this kernel are not portable\n" + " between different architectured. Consider upgrading the Linux kernel.\n"); +#endif + } + + if (s->bitmap_file && strcmp(s->bitmap_file, "internal")==0) { + if ((vers%100) < 2) { + pr_err("internal bitmaps not supported by this kernel.\n"); + goto abort_locked; + } + if (!st->ss->add_internal_bitmap) { + pr_err("internal bitmaps not supported with %s metadata\n", + st->ss->name); + goto abort_locked; + } + if (!st->ss->add_internal_bitmap(st, &s->bitmap_chunk, + c->delay, s->write_behind, + bitmapsize, 1, major_num)) { + pr_err("Given bitmap chunk size not supported.\n"); + goto abort_locked; + } + s->bitmap_file = NULL; + } + + sysfs_init(&info, mdfd, NULL); + + if (st->ss->external && st->container_devnm[0]) { + /* member */ + + /* When creating a member, we need to be careful + * to negotiate with mdmon properly. + * If it is already running, we cannot write to + * the devices and must ask it to do that part. + * If it isn't running, we write to the devices, + * and then start it. + * We hold an exclusive open on the container + * device to make sure mdmon doesn't exit after + * we checked that it is running. + * + * For now, fail if it is already running. + */ + container_fd = open_dev_excl(st->container_devnm); + if (container_fd < 0) { + pr_err("Cannot get exclusive open on container - weird.\n"); + goto abort_locked; + } + if (mdmon_running(st->container_devnm)) { + if (c->verbose) + pr_err("reusing mdmon for %s.\n", + st->container_devnm); + st->update_tail = &st->updates; + } else + need_mdmon = 1; + } + rv = set_array_info(mdfd, st, &info); + if (rv) { + pr_err("failed to set array info for %s: %s\n", + mddev, strerror(errno)); + goto abort_locked; + } + + if (s->bitmap_file) { + int uuid[4]; + + st->ss->uuid_from_super(st, uuid); + if (CreateBitmap(s->bitmap_file, c->force, (char*)uuid, s->bitmap_chunk, + c->delay, s->write_behind, + bitmapsize, + major_num)) { + goto abort_locked; + } + bitmap_fd = open(s->bitmap_file, O_RDWR); + if (bitmap_fd < 0) { + pr_err("weird: %s cannot be openned\n", + s->bitmap_file); + goto abort_locked; + } + if (ioctl(mdfd, SET_BITMAP_FILE, bitmap_fd) < 0) { + pr_err("Cannot set bitmap file for %s: %s\n", + mddev, strerror(errno)); + goto abort_locked; + } + } + + infos = xmalloc(sizeof(*infos) * total_slots); + enable_fds(total_slots); + for (pass=1; pass <=2 ; pass++) { + struct mddev_dev *moved_disk = NULL; /* the disk that was moved out of the insert point */ + + for (dnum=0, dv = devlist ; dv ; + dv=(dv->next)?(dv->next):moved_disk, dnum++) { + int fd; + struct stat stb; + struct mdinfo *inf = &infos[dnum]; + + if (dnum >= total_slots) + abort(); + if (dnum == insert_point) { + moved_disk = dv; + continue; + } + if (strcasecmp(dv->devname, "missing")==0) + continue; + if (have_container) + moved_disk = NULL; + if (have_container && dnum < info.array.raid_disks - 1) + /* repeatedly use the container */ + moved_disk = dv; + + switch(pass) { + case 1: + *inf = info; + + inf->disk.number = dnum; + inf->disk.raid_disk = dnum; + if (inf->disk.raid_disk < s->raiddisks) + inf->disk.state = (1<<MD_DISK_ACTIVE) | + (1<<MD_DISK_SYNC); + else + inf->disk.state = 0; + + if (dv->writemostly == 1) + inf->disk.state |= (1<<MD_DISK_WRITEMOSTLY); + + if (have_container) + fd = -1; + else { + if (st->ss->external && + st->container_devnm[0]) + fd = open(dv->devname, O_RDWR); + else + fd = open(dv->devname, O_RDWR|O_EXCL); + + if (fd < 0) { + pr_err("failed to open %s after earlier success - aborting\n", + dv->devname); + goto abort_locked; + } + fstat(fd, &stb); + inf->disk.major = major(stb.st_rdev); + inf->disk.minor = minor(stb.st_rdev); + } + if (fd >= 0) + remove_partitions(fd); + if (st->ss->add_to_super(st, &inf->disk, + fd, dv->devname, + dv->data_offset)) { + ioctl(mdfd, STOP_ARRAY, NULL); + goto abort_locked; + } + st->ss->getinfo_super(st, inf, NULL); + safe_mode_delay = inf->safe_mode_delay; + + if (have_container && c->verbose > 0) + pr_err("Using %s for device %d\n", + map_dev(inf->disk.major, + inf->disk.minor, + 0), dnum); + + if (!have_container) { + /* getinfo_super might have lost these ... */ + inf->disk.major = major(stb.st_rdev); + inf->disk.minor = minor(stb.st_rdev); + } + break; + case 2: + inf->errors = 0; + + rv = add_disk(mdfd, st, &info, inf); + + if (rv) { + pr_err("ADD_NEW_DISK for %s failed: %s\n", + dv->devname, strerror(errno)); + goto abort_locked; + } + break; + } + if (!have_container && + dv == moved_disk && dnum != insert_point) break; + } + if (pass == 1) { + struct mdinfo info_new; + struct map_ent *me = NULL; + + /* check to see if the uuid has changed due to these + * metadata changes, and if so update the member array + * and container uuid. Note ->write_init_super clears + * the subarray cursor such that ->getinfo_super once + * again returns container info. + */ + st->ss->getinfo_super(st, &info_new, NULL); + if (st->ss->external && s->level != LEVEL_CONTAINER && + !same_uuid(info_new.uuid, info.uuid, 0)) { + map_update(&map, fd2devnm(mdfd), + info_new.text_version, + info_new.uuid, chosen_name); + me = map_by_devnm(&map, st->container_devnm); + } + + if (st->ss->write_init_super(st)) { + st->ss->free_super(st); + goto abort_locked; + } + + /* update parent container uuid */ + if (me) { + char *path = xstrdup(me->path); + + st->ss->getinfo_super(st, &info_new, NULL); + map_update(&map, st->container_devnm, + info_new.text_version, + info_new.uuid, path); + free(path); + } + + flush_metadata_updates(st); + st->ss->free_super(st); + } + } + map_unlock(&map); + free(infos); + + if (s->level == LEVEL_CONTAINER) { + /* No need to start. But we should signal udev to + * create links */ + sysfs_uevent(&info, "change"); + if (c->verbose >= 0) + pr_err("container %s prepared.\n", mddev); + wait_for(chosen_name, mdfd); + } else if (c->runstop == 1 || subdevs >= s->raiddisks) { + if (st->ss->external) { + int err; + switch(s->level) { + case LEVEL_LINEAR: + case LEVEL_MULTIPATH: + case 0: + err = sysfs_set_str(&info, NULL, "array_state", + c->readonly + ? "readonly" + : "active"); + need_mdmon = 0; + break; + default: + err = sysfs_set_str(&info, NULL, "array_state", + "readonly"); + break; + } + sysfs_set_safemode(&info, safe_mode_delay); + if (err) { + pr_err("failed to activate array.\n"); + ioctl(mdfd, STOP_ARRAY, NULL); + goto abort; + } + } else if (c->readonly && + sysfs_attribute_available( + &info, NULL, "array_state")) { + if (sysfs_set_str(&info, NULL, + "array_state", "readonly") < 0) { + pr_err("Failed to start array: %s\n", + strerror(errno)); + ioctl(mdfd, STOP_ARRAY, NULL); + goto abort; + } + } else { + /* param is not actually used */ + mdu_param_t param; + if (ioctl(mdfd, RUN_ARRAY, ¶m)) { + pr_err("RUN_ARRAY failed: %s\n", + strerror(errno)); + if (info.array.chunk_size & (info.array.chunk_size-1)) { + cont_err("Problem may be that chunk size is not a power of 2\n"); + } + ioctl(mdfd, STOP_ARRAY, NULL); + goto abort; + } + /* if start_ro module parameter is set, array is + * auto-read-only, which is bad as the resync won't + * start. So lets make it read-write now. + */ + ioctl(mdfd, RESTART_ARRAY_RW, NULL); + } + if (c->verbose >= 0) + pr_err("array %s started.\n", mddev); + if (st->ss->external && st->container_devnm[0]) { + if (need_mdmon) + start_mdmon(st->container_devnm); + + ping_monitor(st->container_devnm); + close(container_fd); + } + wait_for(chosen_name, mdfd); + } else { + pr_err("not starting array - not enough devices.\n"); + } + close(mdfd); + return 0; + + abort: + map_lock(&map); + abort_locked: + map_remove(&map, fd2devnm(mdfd)); + map_unlock(&map); + + if (mdfd >= 0) + close(mdfd); + return 1; +} diff --git a/Detail.c b/Detail.c new file mode 100644 index 00000000..dd72eded --- /dev/null +++ b/Detail.c @@ -0,0 +1,763 @@ +/* + * mdadm - manage Linux "md" devices aka RAID arrays. + * + * Copyright (C) 2001-2013 Neil Brown <neilb@suse.de> + * + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Author: Neil Brown + * Email: <neilb@suse.de> + */ + +#include "mdadm.h" +#include "md_p.h" +#include "md_u.h" +#include <dirent.h> + +static int cmpstringp(const void *p1, const void *p2) +{ + return strcmp(* (char * const *) p1, * (char * const *) p2); +} + +static int add_device(const char *dev, char ***p_devices, + int *p_max_devices, int n_devices) +{ + if (n_devices + 1 >= *p_max_devices) { + *p_max_devices += 16; + *p_devices = xrealloc(*p_devices, *p_max_devices * + sizeof(**p_devices)); + if (!*p_devices) { + *p_max_devices = 0; + return 0; + } + }; + (*p_devices)[n_devices] = xstrdup(dev); + return n_devices + 1; +} + +int Detail(char *dev, struct context *c) +{ + /* + * Print out details for an md array by using + * GET_ARRAY_INFO and GET_DISK_INFO ioctl calls + */ + + int fd = open(dev, O_RDONLY); + int vers; + mdu_array_info_t array; + mdu_disk_info_t *disks; + int next; + int d; + time_t atime; + char *str; + char **devices = NULL; + int max_devices = 0, n_devices = 0; + int spares = 0; + struct stat stb; + int is_26 = get_linux_version() >= 2006000; + int is_rebuilding = 0; + int failed = 0; + struct supertype *st; + char *subarray = NULL; + int max_disks = MD_SB_DISKS; /* just a default */ + struct mdinfo *info = NULL; + struct mdinfo *sra; + struct mdinfo *subdev; + char *member = NULL; + char *container = NULL; + + int rv = c->test ? 4 : 1; + int avail_disks = 0; + char *avail = NULL; + int external; + int inactive; + + if (fd < 0) { + pr_err("cannot open %s: %s\n", + dev, strerror(errno)); + return rv; + } + vers = md_get_version(fd); + if (vers < 0) { + pr_err("%s does not appear to be an md device\n", + dev); + close(fd); + return rv; + } + if (vers < 9000) { + pr_err("cannot get detail for md device %s: driver version too old.\n", + dev); + close(fd); + return rv; + } + sra = sysfs_read(fd, NULL, GET_VERSION|GET_DEVS); + external = (sra != NULL && sra->array.major_version == -1 + && sra->array.minor_version == -2); + st = super_by_fd(fd, &subarray); + if (ioctl(fd, GET_ARRAY_INFO, &array) == 0) { + inactive = 0; + } else if (errno == ENODEV && sra) { + array = sra->array; + inactive = 1; + } else { + pr_err("cannot get array detail for %s: %s\n", + dev, strerror(errno)); + close(fd); + return rv; + } + + if (fstat(fd, &stb) != 0 && !S_ISBLK(stb.st_mode)) + stb.st_rdev = 0; + rv = 0; + + if (st) + max_disks = st->max_devs; + + if (subarray) { + /* This is a subarray of some container. + * We want the name of the container, and the member + */ + int devid = devnm2devid(st->container_devnm); + int cfd, err; + + member = subarray; + container = map_dev_preferred(major(devid), minor(devid), + 1, c->prefer); + cfd = open_dev(st->container_devnm); + if (cfd >= 0) { + err = st->ss->load_container(st, cfd, NULL); + close(cfd); + if (err == 0) + info = st->ss->container_content(st, subarray); + } + } + + /* try to load a superblock. Try sra->devs first, then try ioctl */ + if (st && !info) for (d = 0, subdev = sra ? sra->devs : NULL; + d < max_disks || subdev; + subdev ? (void)(subdev = subdev->next) : (void)(d++)){ + mdu_disk_info_t disk; + char *dv; + int fd2; + int err; + if (subdev) + disk = subdev->disk; + else { + disk.number = d; + if (ioctl(fd, GET_DISK_INFO, &disk) < 0) + continue; + if (d >= array.raid_disks && + disk.major == 0 && + disk.minor == 0) + continue; + } + + if (array.raid_disks > 0 && + (disk.state & (1 << MD_DISK_ACTIVE)) == 0) + continue; + + dv = map_dev(disk.major, disk.minor, 1); + if (!dv) + continue; + + fd2 = dev_open(dv, O_RDONLY); + if (fd2 < 0) + continue; + + if (st->sb) + st->ss->free_super(st); + + err = st->ss->load_super(st, fd2, NULL); + close(fd2); + if (err) + continue; + if (info) + free(info); + if (subarray) + info = st->ss->container_content(st, subarray); + else { + info = xmalloc(sizeof(*info)); + st->ss->getinfo_super(st, info, NULL); + } + if (!info) + continue; + + if (array.raid_disks != 0 && /* container */ + (info->array.ctime != array.ctime || + info->array.level != array.level)) { + st->ss->free_super(st); + continue; + } + /* some formats (imsm) have free-floating-spares + * with a uuid of uuid_zero, they don't + * have very good info about the rest of the + * container, so keep searching when + * encountering such a device. Otherwise, stop + * after the first successful call to + * ->load_super. + */ + if (memcmp(uuid_zero, + info->uuid, + sizeof(uuid_zero)) == 0) { + st->ss->free_super(st); + continue; + } + break; + } + + /* Ok, we have some info to print... */ + str = map_num(pers, array.level); + + if (c->export) { + if (array.raid_disks) { + if (str) + printf("MD_LEVEL=%s\n", str); + printf("MD_DEVICES=%d\n", array.raid_disks); + } else { + if (!inactive) + printf("MD_LEVEL=container\n"); + printf("MD_DEVICES=%d\n", array.nr_disks); + } + if (container) { + printf("MD_CONTAINER=%s\n", container); + printf("MD_MEMBER=%s\n", member); + } else { + if (sra && sra->array.major_version < 0) + printf("MD_METADATA=%s\n", sra->text_version); + else + printf("MD_METADATA=%d.%d\n", + array.major_version, array.minor_version); + } + + if (st && st->sb && info) { + char nbuf[64]; + struct map_ent *mp, *map = NULL; + + fname_from_uuid(st, info, nbuf, ':'); + printf("MD_UUID=%s\n", nbuf+5); + mp = map_by_uuid(&map, info->uuid); + if (mp && mp->path && + strncmp(mp->path, "/dev/md/", 8) == 0) { + printf("MD_DEVNAME="); + print_escape(mp->path+8); + putchar('\n'); + } + + if (st->ss->export_detail_super) + st->ss->export_detail_super(st); + } else { + struct map_ent *mp, *map = NULL; + char nbuf[64]; + mp = map_by_devnm(&map, fd2devnm(fd)); + if (mp) { + __fname_from_uuid(mp->uuid, 0, nbuf, ':'); + printf("MD_UUID=%s\n", nbuf+5); + } + if (mp && mp->path && + strncmp(mp->path, "/dev/md/", 8) == 0) { + printf("MD_DEVNAME="); + print_escape(mp->path+8); + putchar('\n'); + } + } + if (sra) { + struct mdinfo *mdi; + for (mdi = sra->devs; mdi; mdi = mdi->next) { + char *path = + map_dev(mdi->disk.major, + mdi->disk.minor, 0); + + if (mdi->disk.raid_disk >= 0) + printf("MD_DEVICE_%s_ROLE=%d\n", + mdi->sys_name+4, + mdi->disk.raid_disk); + else + printf("MD_DEVICE_%s_ROLE=spare\n", + mdi->sys_name+4); + if (path) + printf("MD_DEVICE_%s_DEV=%s\n", + mdi->sys_name+4, path); + } + } + goto out; + } + + disks = xmalloc(max_disks * 2 * sizeof(mdu_disk_info_t)); + for (d = 0; d < max_disks * 2; d++) { + disks[d].state = (1<<MD_DISK_REMOVED); + disks[d].major = disks[d].minor = 0; + disks[d].number = disks[d].raid_disk = d; + } + + next = array.raid_disks*2; + if (inactive) { + struct mdinfo *mdi; + if (sra != NULL) + for (mdi = sra->devs; mdi; mdi = mdi->next) { + disks[next++] = mdi->disk; + disks[next-1].number = -1; + } + } else for (d = 0; d < max_disks; d++) { + mdu_disk_info_t disk; + disk.number = d; + if (ioctl(fd, GET_DISK_INFO, &disk) < 0) { + if (d < array.raid_disks) + pr_err("cannot get device detail for device %d: %s\n", + d, strerror(errno)); + continue; + } + if (disk.major == 0 && disk.minor == 0) + continue; + if (disk.raid_disk >= 0 && disk.raid_disk < array.raid_disks + && disks[disk.raid_disk*2].state == (1<<MD_DISK_REMOVED)) + disks[disk.raid_disk*2] = disk; + else if (disk.raid_disk >= 0 && disk.raid_disk < array.raid_disks + && disks[disk.raid_disk*2+1].state == (1<<MD_DISK_REMOVED)) + disks[disk.raid_disk*2+1] = disk; + else if (next < max_disks*2) + disks[next++] = disk; + } + + avail = xcalloc(array.raid_disks, 1); + + for (d= 0; d < array.raid_disks; d++) { + + if ((disks[d*2].state & (1<<MD_DISK_SYNC)) || + (disks[d*2+1].state & (1<<MD_DISK_SYNC))) { + avail_disks ++; + avail[d] = 1; + } + } + + if (c->brief) { + mdu_bitmap_file_t bmf; + printf("%sARRAY %s", inactive ? "INACTIVE-":"", dev); + if (c->verbose > 0) { + if (array.raid_disks) + printf(" level=%s num-devices=%d", + str?str:"-unknown-", + array.raid_disks ); + else if (!inactive) + printf(" level=container num-devices=%d", + array.nr_disks); + else + printf(" num-devices=%d", array.nr_disks); + } + if (container) { + printf(" container=%s", container); + printf(" member=%s", member); + } else { + if (sra && sra->array.major_version < 0) + printf(" metadata=%s", sra->text_version); + else + printf(" metadata=%d.%d", + array.major_version, array.minor_version); + } + + /* Only try GET_BITMAP_FILE for 0.90.01 and later */ + if (vers >= 9001 && + ioctl(fd, GET_BITMAP_FILE, &bmf) == 0 && + bmf.pathname[0]) { + printf(" bitmap=%s", bmf.pathname); + } + } else { + mdu_bitmap_file_t bmf; + unsigned long long larray_size; + struct mdstat_ent *ms = mdstat_read(0, 0); + struct mdstat_ent *e; + char *devnm; + + devnm = stat2devnm(&stb); + for (e=ms; e; e=e->next) + if (strcmp(e->devnm, devnm) == 0) + break; + if (!get_dev_size(fd, NULL, &larray_size)) + larray_size = 0; + + printf("%s:\n", dev); + + if (container) + printf(" Container : %s, member %s\n", container, member); + else { + if (sra && sra->array.major_version < 0) + printf(" Version : %s\n", sra->text_version); + else + printf(" Version : %d.%d\n", + array.major_version, array.minor_version); + } + + atime = array.ctime; + if (atime) + printf(" Creation Time : %.24s\n", ctime(&atime)); + if (array.raid_disks == 0 && external) + str = "container"; + if (str) + printf(" Raid Level : %s\n", str); + if (larray_size) + printf(" Array Size : %llu%s\n", (larray_size>>10), + human_size(larray_size)); + if (array.level >= 1) { + if (sra) + array.major_version = sra->array.major_version; + if (array.major_version != 0 && + (larray_size >= 0xFFFFFFFFULL|| array.size == 0)) { + unsigned long long dsize = get_component_size(fd); + if (dsize > 0) + printf(" Used Dev Size : %llu%s\n", + dsize/2, + human_size((long long)dsize<<9)); + else + printf(" Used Dev Size : unknown\n"); + } else + printf(" Used Dev Size : %d%s\n", array.size, + human_size((long long)array.size<<10)); + } + if (array.raid_disks) + printf(" Raid Devices : %d\n", array.raid_disks); + printf(" Total Devices : %d\n", array.nr_disks); + if (!container && + ((sra == NULL && array.major_version == 0) || + (sra && sra->array.major_version == 0))) + printf("Preferred Minor : %d\n", array.md_minor); + if (sra == NULL || sra->array.major_version >= 0) + printf(" Persistence : Superblock is %spersistent\n", + array.not_persistent?"not ":""); + printf("\n"); + /* Only try GET_BITMAP_FILE for 0.90.01 and later */ + if (vers >= 9001 && + ioctl(fd, GET_BITMAP_FILE, &bmf) == 0 && + bmf.pathname[0]) { + printf(" Intent Bitmap : %s\n", bmf.pathname); + printf("\n"); + } else if (array.state & (1<<MD_SB_BITMAP_PRESENT)) + printf(" Intent Bitmap : Internal\n\n"); + atime = array.utime; + if (atime) + printf(" Update Time : %.24s\n", ctime(&atime)); + if (array.raid_disks) { + static char *sync_action[] = { + ", recovering", ", resyncing", + ", reshaping", ", checking" }; + char *st; + if (avail_disks == array.raid_disks) + st = ""; + else if (!enough(array.level, array.raid_disks, + array.layout, 1, avail)) + st = ", FAILED"; + else + st = ", degraded"; + + printf(" State : %s%s%s%s%s%s \n", + (array.state&(1<<MD_SB_CLEAN))?"clean":"active", st, + (!e || (e->percent < 0 && e->percent != RESYNC_PENDING && + e->percent != RESYNC_DELAYED)) ? "" : sync_action[e->resync], + larray_size ? "": ", Not Started", + (e && e->percent == RESYNC_DELAYED) ? " (DELAYED)": "", + (e && e->percent == RESYNC_PENDING) ? " (PENDING)": ""); + } else if (inactive) { + printf(" State : inactive\n"); + } + if (array.raid_disks) + printf(" Active Devices : %d\n", array.active_disks); + if (array.working_disks > 0) + printf("Working Devices : %d\n", array.working_disks); + if (array.raid_disks) { + printf(" Failed Devices : %d\n", array.failed_disks); + printf(" Spare Devices : %d\n", array.spare_disks); + } + printf("\n"); + if (array.level == 5) { + str = map_num(r5layout, array.layout); + printf(" Layout : %s\n", str?str:"-unknown-"); + } + if (array.level == 6) { + str = map_num(r6layout, array.layout); + printf(" Layout : %s\n", str?str:"-unknown-"); + } + if (array.level == 10) { + printf(" Layout :"); + print_r10_layout(array.layout); + printf("\n"); + } + switch (array.level) { + case 0: + case 4: + case 5: + case 10: + case 6: + if (array.chunk_size) + printf(" Chunk Size : %dK\n\n", + array.chunk_size/1024); + break; + case -1: + printf(" Rounding : %dK\n\n", array.chunk_size/1024); + break; + default: break; + } + + if (e && e->percent >= 0) { + static char *sync_action[] = { + "Rebuild", "Resync", + "Reshape", "Check"}; + printf(" %7s Status : %d%% complete\n", sync_action[e->resync], e->percent); + is_rebuilding = 1; + } + free_mdstat(ms); + + if ((st && st->sb) && (info && info->reshape_active)) { +#if 0 +This is pretty boring + printf(" Reshape pos'n : %llu%s\n", (unsigned long long) info->reshape_progress<<9, + human_size((unsigned long long)info->reshape_progress<<9)); +#endif + if (info->delta_disks != 0) + printf(" Delta Devices : %d, (%d->%d)\n", + info->delta_disks, + array.raid_disks - info->delta_disks, + array.raid_disks); + if (info->new_level != array.level) { + str = map_num(pers, info->new_level); + printf(" New Level : %s\n", str?str:"-unknown-"); + } + if (info->new_level != array.level || + info->new_layout != array.layout) { + if (info->new_level == 5) { + str = map_num(r5layout, info->new_layout); + printf(" New Layout : %s\n", + str?str:"-unknown-"); + } + if (info->new_level == 6) { + str = map_num(r6layout, info->new_layout); + printf(" New Layout : %s\n", + str?str:"-unknown-"); + } + if (info->new_level == 10) { + printf(" New Layout : near=%d, %s=%d\n", + info->new_layout&255, + (info->new_layout&0x10000)?"offset":"far", + (info->new_layout>>8)&255); + } + } + if (info->new_chunk != array.chunk_size) + printf(" New Chunksize : %dK\n", info->new_chunk/1024); + printf("\n"); + } else if (e && e->percent >= 0) + printf("\n"); + if (st && st->sb) + st->ss->detail_super(st, c->homehost); + + if (array.raid_disks == 0 && sra && sra->array.major_version == -1 + && sra->array.minor_version == -2 && sra->text_version[0] != '/') { + /* This looks like a container. Find any active arrays + * That claim to be a member. + */ + DIR *dir = opendir("/sys/block"); + struct dirent *de; + + printf(" Member Arrays :"); + + while (dir && (de = readdir(dir)) != NULL) { + char path[200]; + char vbuf[1024]; + int nlen = strlen(sra->sys_name); + int devid; + if (de->d_name[0] == '.') + continue; + sprintf(path, "/sys/block/%s/md/metadata_version", + de->d_name); + if (load_sys(path, vbuf) < 0) + continue; + if (strncmp(vbuf, "external:", 9) != 0 || + !is_subarray(vbuf+9) || + strncmp(vbuf+10, sra->sys_name, nlen) != 0 || + vbuf[10+nlen] != '/') + continue; + devid = devnm2devid(de->d_name); + printf(" %s", map_dev_preferred( + major(devid), + minor(devid), 1, c->prefer)); + } + if (dir) + closedir(dir); + printf("\n\n"); + } + + if (array.raid_disks) + printf(" Number Major Minor RaidDevice State\n"); + else + printf(" Number Major Minor RaidDevice\n"); + } + free(info); + + for (d= 0; d < max_disks * 2; d++) { + char *dv; + mdu_disk_info_t disk = disks[d]; + + if (d >= array.raid_disks*2 && + disk.major == 0 && + disk.minor == 0) + continue; + if ((d & 1) && + disk.major == 0 && + disk.minor == 0) + continue; + if (!c->brief) { + if (d == array.raid_disks*2) printf("\n"); + if (disk.number < 0) + printf(" - %5d %5d - ", + disk.major, disk.minor); + else if (disk.raid_disk < 0) + printf(" %5d %5d %5d - ", + disk.number, disk.major, disk.minor); + else + printf(" %5d %5d %5d %5d ", + disk.number, disk.major, disk.minor, disk.raid_disk); + } + if (!c->brief && array.raid_disks) { + + if (disk.state & (1<<MD_DISK_FAULTY)) { + printf(" faulty"); + if (disk.raid_disk < array.raid_disks && + disk.raid_disk >= 0) + failed++; + } + if (disk.state & (1<<MD_DISK_ACTIVE)) printf(" active"); + if (disk.state & (1<<MD_DISK_SYNC)) { + printf(" sync"); + if (array.level == 10 && (array.layout & ~0x1FFFF) == 0) { + int nc = array.layout & 0xff; + int fc = (array.layout >> 8) & 0xff; + int copies = nc*fc; + if (fc == 1 && array.raid_disks % copies == 0 && copies <= 26) { + /* We can divide the devices into 'sets' */ + int set = disk.raid_disk % copies; + printf(" set-%c", set + 'A'); + } + } + } + if (disk.state & (1<<MD_DISK_REMOVED)) printf(" removed"); + if (disk.state & (1<<MD_DISK_WRITEMOSTLY)) printf(" writemostly"); + if ((disk.state & + ((1<<MD_DISK_ACTIVE)|(1<<MD_DISK_SYNC) + |(1<<MD_DISK_REMOVED)|(1<<MD_DISK_FAULTY))) + == 0) { + printf(" spare"); + if (is_26) { + if (disk.raid_disk < array.raid_disks && disk.raid_disk >= 0) + printf(" rebuilding"); + } else if (is_rebuilding && failed) { + /* Taking a bit of a risk here, we remove the + * device from the array, and then put it back. + * If this fails, we are rebuilding + */ + int err = ioctl(fd, HOT_REMOVE_DISK, makedev(disk.major, disk.minor)); + if (err == 0) ioctl(fd, HOT_ADD_DISK, makedev(disk.major, disk.minor)); + if (err && errno == EBUSY) + printf(" rebuilding"); + } + } + } + if (disk.state == 0) spares++; + if (c->test && d < array.raid_disks + && !(disk.state & (1<<MD_DISK_SYNC))) + rv |= 1; + dv=map_dev_preferred(disk.major, disk.minor, 0, c->prefer); + if (dv != NULL) { + if (c->brief) + n_devices = add_device(dv, &devices, + &max_devices, + n_devices); + else + printf(" %s", dv); + } + if (!c->brief) printf("\n"); + } + if (spares && c->brief && array.raid_disks) printf(" spares=%d", spares); + if (c->brief && st && st->sb) + st->ss->brief_detail_super(st); + if (st) + st->ss->free_super(st); + + if (c->brief && c->verbose > 0 && devices) { + qsort(devices, n_devices, sizeof(*devices), cmpstringp); + printf("\n devices=%s", devices[0]); + for (d = 1; d < n_devices; d++) + printf(",%s", devices[d]); + } + if (c->brief) + printf("\n"); + if (c->test && + !enough(array.level, array.raid_disks, array.layout, + 1, avail)) + rv = 2; + + free(disks); +out: + close(fd); + free(subarray); + free(avail); + for (d = 0; d < n_devices; d++) + free(devices[d]); + free(devices); + sysfs_free(sra); + return rv; +} + +int Detail_Platform(struct superswitch *ss, int scan, int verbose, int export, char *controller_path) +{ + /* display platform capabilities for the given metadata format + * 'scan' in this context means iterate over all metadata types + */ + int i; + int err = 1; + + if (ss && export && ss->export_detail_platform) + err = ss->export_detail_platform(verbose, controller_path); + else if (ss && ss->detail_platform) + err = ss->detail_platform(verbose, 0, controller_path); + else if (ss) { + if (verbose > 0) + pr_err("%s metadata is platform independent\n", + ss->name ? : "[no name]"); + } else if (!scan) { + if (verbose > 0) + pr_err("specify a metadata type or --scan\n"); + } + + if (!scan) + return err; + + err = 0; + for (i = 0; superlist[i]; i++) { + struct superswitch *meta = superlist[i]; + + if (meta == ss) + continue; + if (verbose > 0) + pr_err("checking metadata %s\n", + meta->name ? : "[no name]"); + if (!meta->detail_platform) { + if (verbose > 0) + pr_err("%s metadata is platform independent\n", + meta->name ? : "[no name]"); + } else if (export && meta->export_detail_platform) { + err |= meta->export_detail_platform(verbose, controller_path); + } else + err |= meta->detail_platform(verbose, 0, controller_path); + } + + return err; +} @@ -0,0 +1,311 @@ +/* + * mdadm - manage Linux "md" devices aka RAID arrays. + * + * Copyright (C) 2013 Neil Brown <neilb@suse.de> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + + * Author: Neil Brown + * Email: <neilb@suse.de> + */ + +#include "mdadm.h" +#include <sys/dir.h> + +int Dump_metadata(char *dev, char *dir, struct context *c, + struct supertype *st) +{ + /* create a new file in 'dir' named for the basename of 'dev'. + * Truncate to the same size as 'dev' and ask the metadata + * handler to copy metadata there. + * For every name in /dev/disk/by-id that points to this device, + * create a hardlink in 'dir'. + * Complain if any of those hardlinks cannot be created. + */ + int fd, fl; + struct stat stb, dstb; + char *base; + char *fname = NULL; + unsigned long long size; + DIR *dirp; + struct dirent *de; + + if (stat(dir, &stb) != 0 || + (S_IFMT & stb.st_mode) != S_IFDIR) { + pr_err("--dump requires an existing directory, not: %s\n", + dir); + return 16; + } + + fd = dev_open(dev, O_RDONLY); + if (fd < 0) { + pr_err("Cannot open %s to dump metadata: %s\n", + dev, strerror(errno)); + return 1; + } + if (!get_dev_size(fd, dev, &size)) { + close(fd); + return 1; + } + + if (st == NULL) + st = guess_super_type(fd, guess_array); + if (!st) { + pr_err("Cannot find RAID metadata on %s\n", dev); + close(fd); + return 1; + } + + st->ignore_hw_compat = 1; + if (st->ss->load_super(st, fd, NULL) != 0) { + pr_err("No %s metadata found on %s\n", + st->ss->name, dev); + close(fd); + return 1; + } + if (st->ss->copy_metadata == NULL) { + pr_err("%s metadata on %s cannot be copied\n", + st->ss->name, dev); + close(fd); + return 1; + } + + base = strrchr(dev, '/'); + if (base) + base++; + else + base = dev; + xasprintf(&fname, "%s/%s", dir, base); + fl = open(fname, O_RDWR|O_CREAT|O_EXCL, 0666); + if (fl < 0) { + pr_err("Cannot create dump file %s: %s\n", + fname, strerror(errno)); + close(fd); + free(fname); + return 1; + } + if (ftruncate(fl, size) < 0) { + pr_err("failed to set size of dump file: %s\n", + strerror(errno)); + close(fd); + close(fl); + free(fname); + return 1; + } + + if (st->ss->copy_metadata(st, fd, fl) != 0) { + pr_err("Failed to copy metadata from %s to %s\n", + dev, fname); + close(fd); + close(fl); + unlink(fname); + free(fname); + return 1; + } + if (c->verbose >= 0) + printf("%s saved as %s.\n", dev, fname); + fstat(fd, &dstb); + close(fd); + close(fl); + if ((dstb.st_mode & S_IFMT) != S_IFBLK) { + /* Not a block device, so cannot create links */ + free(fname); + return 0; + } + /* mostly done: just want to find some other names */ + dirp = opendir("/dev/disk/by-id"); + if (!dirp) { + free(fname); + return 0; + } + while ((de = readdir(dirp)) != NULL) { + char *p = NULL; + if (de->d_name[0] == '.') + continue; + xasprintf(&p, "/dev/disk/by-id/%s", de->d_name); + if (stat(p, &stb) != 0 || + (stb.st_mode & S_IFMT) != S_IFBLK || + stb.st_rdev != dstb.st_rdev) { + /* Not this one */ + free(p); + continue; + } + free(p); + xasprintf(&p, "%s/%s", dir, de->d_name); + if (link(fname, p) == 0) { + if (c->verbose >= 0) + printf("%s also saved as %s.\n", + dev, p); + } else { + pr_err("Could not save %s as %s!!\n", + dev, p); + } + free(p); + } + closedir(dirp); + free(fname); + return 0; +} + +int Restore_metadata(char *dev, char *dir, struct context *c, + struct supertype *st, int only) +{ + /* If 'dir' really is a directory we choose a name + * from it that matches a suitable name in /dev/disk/by-id, + * and copy metadata from the file to the device. + * If two names from by-id match and aren't both the same + * inode, we fail. If none match and basename of 'dev' + * can be found in dir, use that. + * If 'dir' is really a file then it is only permitted if + * 'only' is set (meaning there was only one device given) + * and the metadata is restored irrespective of file names. + */ + int fd, fl; + struct stat stb, dstb; + char *fname = NULL; + unsigned long long size; + + if (stat(dir, &stb) != 0) { + pr_err("%s does not exist: cannot restore from there.\n", + dir); + return 16; + } else if ((S_IFMT & stb.st_mode) != S_IFDIR && !only) { + pr_err("--restore requires a directory when multiple devices given\n"); + return 16; + } + + fd = dev_open(dev, O_RDWR); + if (fd < 0) { + pr_err("Cannot open %s to restore metadata: %s\n", + dev, strerror(errno)); + return 1; + } + if (!get_dev_size(fd, dev, &size)) { + close(fd); + return 1; + } + + if ((S_IFMT & stb.st_mode) == S_IFDIR) { + /* choose one name from the directory. */ + DIR *d = opendir(dir); + struct dirent *de; + char *chosen = NULL; + unsigned int chosen_inode = 0; + + fstat(fd, &dstb); + + while (d && (de = readdir(d)) != NULL) { + if (de->d_name[0] == '.') + continue; + xasprintf(&fname, "/dev/disk/by-id/%s", de->d_name); + if (stat(fname, &stb) != 0) { + free(fname); + continue; + } + free(fname); + if ((S_IFMT & stb.st_mode) != S_IFBLK) + continue; + if (stb.st_rdev != dstb.st_rdev) + continue; + /* This file is a good match for our device. */ + xasprintf(&fname, "%s/%s", dir, de->d_name); + if (stat(fname, &stb) != 0) { + /* Weird! */ + free(fname); + continue; + } + if (chosen == NULL) { + chosen = fname; + chosen_inode = stb.st_ino; + continue; + } + if (chosen_inode == stb.st_ino) { + /* same, no need to change */ + free(fname); + continue; + } + /* Oh dear, two names both match. Must give up. */ + pr_err("Both %s and %s seem suitable for %s. Please choose one.\n", + chosen, fname, dev); + free(fname); + free(chosen); + close(fd); + closedir(d); + return 1; + } + closedir(d); + if (!chosen) { + /* One last chance: try basename of device */ + char *base = strrchr(dev, '/'); + if (base) + base++; + else + base = dev; + xasprintf(&fname, "%s/%s", dir, base); + if (stat(fname, &stb) == 0) + chosen = fname; + else + free(fname); + } + fname = chosen; + } else + fname = strdup(dir); + + if (!fname) { + pr_err("Cannot find suitable file in %s for %s\n", + dir, dev); + close(fd); + return 1; + } + + fl = open(fname, O_RDONLY); + if (!fl) { + pr_err("Could not open %s for --restore.\n", + fname); + goto err; + } + if (((unsigned long long)stb.st_size) != size) { + pr_err("%s is not the same size as %s - cannot restore.\n", + fname, dev); + goto err; + } + if (st == NULL) + st = guess_super_type(fl, guess_array); + if (!st) { + pr_err("Cannot find metadata on %s\n", fname); + goto err; + } + st->ignore_hw_compat = 1; + if (st->ss->load_super(st, fl, NULL) != 0) { + pr_err("No %s metadata found on %s\n", + st->ss->name, fname); + goto err; + } + if (st->ss->copy_metadata == NULL) { + pr_err("%s metadata on %s cannot be copied\n", + st->ss->name, dev); + goto err; + } + if (st->ss->copy_metadata(st, fl, fd) != 0) { + pr_err("Failed to copy metadata from %s to %s\n", + fname, dev); + goto err; + } + if (c->verbose >= 0) + printf("%s restored from %s.\n", dev, fname); + return 0; + +err: + close(fd); + close(fl); + free(fname); + return 1; +} diff --git a/Examine.c b/Examine.c new file mode 100644 index 00000000..953b8eee --- /dev/null +++ b/Examine.c @@ -0,0 +1,225 @@ +/* + * mdadm - manage Linux "md" devices aka RAID arrays. + * + * Copyright (C) 2001-2013 Neil Brown <neilb@suse.de> + * + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Author: Neil Brown + * Email: <neilb@suse.de> + */ + +#include "mdadm.h" +#include "dlink.h" + +#if ! defined(__BIG_ENDIAN) && ! defined(__LITTLE_ENDIAN) +#error no endian defined +#endif +#include "md_u.h" +#include "md_p.h" +int Examine(struct mddev_dev *devlist, + struct context *c, + struct supertype *forcest) +{ + + /* Read the raid superblock from a device and + * display important content. + * + * If cannot be found, print reason: too small, bad magic + * + * Print: + * version, ctime, level, size, raid+spare+ + * prefered minor + * uuid + * + * utime, state etc + * + * If (brief) gather devices for same array and just print a mdadm.conf + * line including devices= + * if devlist==NULL, use conf_get_devs() + */ + int fd; + int rv = 0; + int err = 0; + + struct array { + struct supertype *st; + struct mdinfo info; + void *devs; + struct array *next; + int spares; + } *arrays = NULL; + + for (; devlist ; devlist = devlist->next) { + struct supertype *st; + int have_container = 0; + + fd = dev_open(devlist->devname, O_RDONLY); + if (fd < 0) { + if (!c->scan) { + pr_err("cannot open %s: %s\n", + devlist->devname, strerror(errno)); + rv = 1; + } + err = 1; + } + else { + int container = 0; + if (forcest) + st = dup_super(forcest); + else if (must_be_container(fd)) { + /* might be a container */ + st = super_by_fd(fd, NULL); + container = 1; + } else + st = guess_super(fd); + if (st) { + err = 1; + st->ignore_hw_compat = 1; + if (!container) + err = st->ss->load_super(st, fd, + (c->brief||c->scan) ? NULL + :devlist->devname); + if (err && st->ss->load_container) { + err = st->ss->load_container(st, fd, + (c->brief||c->scan) ? NULL + :devlist->devname); + if (!err) + have_container = 1; + } + st->ignore_hw_compat = 0; + } else { + if (!c->brief) { + pr_err("No md superblock detected on %s.\n", devlist->devname); + rv = 1; + } + err = 1; + } + close(fd); + } + if (err) + continue; + + if (c->SparcAdjust) + st->ss->update_super(st, NULL, "sparc2.2", + devlist->devname, 0, 0, NULL); + /* Ok, its good enough to try, though the checksum could be wrong */ + + if (c->brief && st->ss->brief_examine_super == NULL) { + if (!c->scan) + pr_err("No brief listing for %s on %s\n", + st->ss->name, devlist->devname); + } else if (c->brief) { + struct array *ap; + char *d; + for (ap = arrays; ap; ap = ap->next) { + if (st->ss == ap->st->ss && + st->ss->compare_super(ap->st, st) == 0) + break; + } + if (!ap) { + ap = xmalloc(sizeof(*ap)); + ap->devs = dl_head(); + ap->next = arrays; + ap->spares = 0; + ap->st = st; + arrays = ap; + st->ss->getinfo_super(st, &ap->info, NULL); + } else + st->ss->getinfo_super(st, &ap->info, NULL); + if (!have_container && + !(ap->info.disk.state & (1<<MD_DISK_SYNC))) + ap->spares++; + d = dl_strdup(devlist->devname); + dl_add(ap->devs, d); + } else if (c->export) { + if (st->ss->export_examine_super) + st->ss->export_examine_super(st); + st->ss->free_super(st); + } else { + printf("%s:\n",devlist->devname); + st->ss->examine_super(st, c->homehost); + st->ss->free_super(st); + } + } + if (c->brief) { + struct array *ap; + for (ap = arrays; ap; ap = ap->next) { + char sep='='; + char *d; + int newline = 0; + + ap->st->ss->brief_examine_super(ap->st, c->verbose > 0); + if (ap->spares) + newline += printf(" spares=%d", ap->spares); + if (c->verbose > 0) { + newline += printf(" devices"); + for (d = dl_next(ap->devs); + d != ap->devs; + d=dl_next(d)) { + printf("%c%s", sep, d); + sep=','; + } + } + if (ap->st->ss->brief_examine_subarrays) { + if (newline) + printf("\n"); + ap->st->ss->brief_examine_subarrays(ap->st, c->verbose); + } + ap->st->ss->free_super(ap->st); + /* FIXME free ap */ + if (ap->spares || c->verbose > 0) + printf("\n"); + } + } + return rv; +} + +int ExamineBadblocks(char *devname, int brief, struct supertype *forcest) +{ + int fd = dev_open(devname, O_RDONLY); + struct supertype *st = forcest; + int err = 1; + + if (fd < 0) { + pr_err("cannot open %s: %s\n", devname, strerror(errno)); + return 1; + } + if (!st) + st = guess_super(fd); + if (!st) { + if (!brief) + pr_err("No md superblock detected on %s\n", devname); + goto out; + } + if (!st->ss->examine_badblocks) { + pr_err("%s metadata does not support badblocks\n", st->ss->name); + goto out; + } + err = st->ss->load_super(st, fd, brief ? NULL : devname); + if (err) + goto out; + err = st->ss->examine_badblocks(st, fd, devname); + +out: + if (fd >= 0) + close(fd); + if (st) { + st->ss->free_super(st); + free(st); + } + return err; +} @@ -0,0 +1,4963 @@ +/* + * mdadm - manage Linux "md" devices aka RAID arrays. + * + * Copyright (C) 2001-2013 Neil Brown <neilb@suse.de> + * + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Author: Neil Brown + * Email: <neilb@suse.de> + */ +#include "mdadm.h" +#include "dlink.h" +#include <sys/mman.h> +#include <stddef.h> +#include <stdint.h> +#include <signal.h> +#include <sys/wait.h> + +#if ! defined(__BIG_ENDIAN) && ! defined(__LITTLE_ENDIAN) +#error no endian defined +#endif +#include "md_u.h" +#include "md_p.h" + +int restore_backup(struct supertype *st, + struct mdinfo *content, + int working_disks, + int next_spare, + char **backup_filep, + int verbose) +{ + int i; + int *fdlist; + struct mdinfo *dev; + int err; + int disk_count = next_spare + working_disks; + char *backup_file = *backup_filep; + + dprintf("Called restore_backup()\n"); + fdlist = xmalloc(sizeof(int) * disk_count); + + enable_fds(next_spare); + for (i = 0; i < next_spare; i++) + fdlist[i] = -1; + for (dev = content->devs; dev; dev = dev->next) { + char buf[22]; + int fd; + sprintf(buf, "%d:%d", + dev->disk.major, + dev->disk.minor); + fd = dev_open(buf, O_RDWR); + + if (dev->disk.raid_disk >= 0) + fdlist[dev->disk.raid_disk] = fd; + else + fdlist[next_spare++] = fd; + } + + if (!backup_file) { + backup_file = locate_backup(content->sys_name); + *backup_filep = backup_file; + } + + if (st->ss->external && st->ss->recover_backup) + err = st->ss->recover_backup(st, content); + else + err = Grow_restart(st, content, fdlist, next_spare, + backup_file, verbose > 0); + + while (next_spare > 0) { + next_spare--; + if (fdlist[next_spare] >= 0) + close(fdlist[next_spare]); + } + free(fdlist); + if (err) { + pr_err("Failed to restore critical section for reshape - sorry.\n"); + if (!backup_file) + pr_err("Possibly you need to specify a --backup-file\n"); + return 1; + } + + dprintf("restore_backup() returns status OK.\n"); + return 0; +} + +int Grow_Add_device(char *devname, int fd, char *newdev) +{ + /* Add a device to an active array. + * Currently, just extend a linear array. + * This requires writing a new superblock on the + * new device, calling the kernel to add the device, + * and if that succeeds, update the superblock on + * all other devices. + * This means that we need to *find* all other devices. + */ + struct mdinfo info; + + struct stat stb; + int nfd, fd2; + int d, nd; + struct supertype *st = NULL; + char *subarray = NULL; + + if (ioctl(fd, GET_ARRAY_INFO, &info.array) < 0) { + pr_err("cannot get array info for %s\n", devname); + return 1; + } + + if (info.array.level != -1) { + pr_err("can only add devices to linear arrays\n"); + return 1; + } + + st = super_by_fd(fd, &subarray); + if (!st) { + pr_err("cannot handle arrays with superblock version %d\n", + info.array.major_version); + return 1; + } + + if (subarray) { + pr_err("Cannot grow linear sub-arrays yet\n"); + free(subarray); + free(st); + return 1; + } + + nfd = open(newdev, O_RDWR|O_EXCL|O_DIRECT); + if (nfd < 0) { + pr_err("cannot open %s\n", newdev); + free(st); + return 1; + } + fstat(nfd, &stb); + if ((stb.st_mode & S_IFMT) != S_IFBLK) { + pr_err("%s is not a block device!\n", newdev); + close(nfd); + free(st); + return 1; + } + /* now check out all the devices and make sure we can read the + * superblock */ + for (d=0 ; d < info.array.raid_disks ; d++) { + mdu_disk_info_t disk; + char *dv; + + st->ss->free_super(st); + + disk.number = d; + if (ioctl(fd, GET_DISK_INFO, &disk) < 0) { + pr_err("cannot get device detail for device %d\n", + d); + close(nfd); + free(st); + return 1; + } + dv = map_dev(disk.major, disk.minor, 1); + if (!dv) { + pr_err("cannot find device file for device %d\n", + d); + close(nfd); + free(st); + return 1; + } + fd2 = dev_open(dv, O_RDWR); + if (fd2 < 0) { + pr_err("cannot open device file %s\n", dv); + close(nfd); + free(st); + return 1; + } + + if (st->ss->load_super(st, fd2, NULL)) { + pr_err("cannot find super block on %s\n", dv); + close(nfd); + close(fd2); + free(st); + return 1; + } + close(fd2); + } + /* Ok, looks good. Lets update the superblock and write it out to + * newdev. + */ + + info.disk.number = d; + info.disk.major = major(stb.st_rdev); + info.disk.minor = minor(stb.st_rdev); + info.disk.raid_disk = d; + info.disk.state = (1 << MD_DISK_SYNC) | (1 << MD_DISK_ACTIVE); + st->ss->update_super(st, &info, "linear-grow-new", newdev, + 0, 0, NULL); + + if (st->ss->store_super(st, nfd)) { + pr_err("Cannot store new superblock on %s\n", + newdev); + close(nfd); + return 1; + } + close(nfd); + + if (ioctl(fd, ADD_NEW_DISK, &info.disk) != 0) { + pr_err("Cannot add new disk to this array\n"); + return 1; + } + /* Well, that seems to have worked. + * Now go through and update all superblocks + */ + + if (ioctl(fd, GET_ARRAY_INFO, &info.array) < 0) { + pr_err("cannot get array info for %s\n", devname); + return 1; + } + + nd = d; + for (d=0 ; d < info.array.raid_disks ; d++) { + mdu_disk_info_t disk; + char *dv; + + disk.number = d; + if (ioctl(fd, GET_DISK_INFO, &disk) < 0) { + pr_err("cannot get device detail for device %d\n", + d); + return 1; + } + dv = map_dev(disk.major, disk.minor, 1); + if (!dv) { + pr_err("cannot find device file for device %d\n", + d); + return 1; + } + fd2 = dev_open(dv, O_RDWR); + if (fd2 < 0) { + pr_err("cannot open device file %s\n", dv); + return 1; + } + if (st->ss->load_super(st, fd2, NULL)) { + pr_err("cannot find super block on %s\n", dv); + close(fd); + return 1; + } + info.array.raid_disks = nd+1; + info.array.nr_disks = nd+1; + info.array.active_disks = nd+1; + info.array.working_disks = nd+1; + + st->ss->update_super(st, &info, "linear-grow-update", dv, + 0, 0, NULL); + + if (st->ss->store_super(st, fd2)) { + pr_err("Cannot store new superblock on %s\n", dv); + close(fd2); + return 1; + } + close(fd2); + } + + return 0; +} + +int Grow_addbitmap(char *devname, int fd, struct context *c, struct shape *s) +{ + /* + * First check that array doesn't have a bitmap + * Then create the bitmap + * Then add it + * + * For internal bitmaps, we need to check the version, + * find all the active devices, and write the bitmap block + * to all devices + */ + mdu_bitmap_file_t bmf; + mdu_array_info_t array; + struct supertype *st; + char *subarray = NULL; + int major = BITMAP_MAJOR_HI; + int vers = md_get_version(fd); + unsigned long long bitmapsize, array_size; + + if (vers < 9003) { + major = BITMAP_MAJOR_HOSTENDIAN; + pr_err("Warning - bitmaps created on this kernel are not portable\n" + " between different architectures. Consider upgrading the Linux kernel.\n"); + } + + if (ioctl(fd, GET_BITMAP_FILE, &bmf) != 0) { + if (errno == ENOMEM) + pr_err("Memory allocation failure.\n"); + else + pr_err("bitmaps not supported by this kernel.\n"); + return 1; + } + if (bmf.pathname[0]) { + if (strcmp(s->bitmap_file,"none")==0) { + if (ioctl(fd, SET_BITMAP_FILE, -1)!= 0) { + pr_err("failed to remove bitmap %s\n", + bmf.pathname); + return 1; + } + return 0; + } + pr_err("%s already has a bitmap (%s)\n", + devname, bmf.pathname); + return 1; + } + if (ioctl(fd, GET_ARRAY_INFO, &array) != 0) { + pr_err("cannot get array status for %s\n", devname); + return 1; + } + if (array.state & (1<<MD_SB_BITMAP_PRESENT)) { + if (strcmp(s->bitmap_file, "none")==0) { + array.state &= ~(1<<MD_SB_BITMAP_PRESENT); + if (ioctl(fd, SET_ARRAY_INFO, &array)!= 0) { + pr_err("failed to remove internal bitmap.\n"); + return 1; + } + return 0; + } + pr_err("Internal bitmap already present on %s\n", + devname); + return 1; + } + + if (strcmp(s->bitmap_file, "none") == 0) { + pr_err("no bitmap found on %s\n", devname); + return 1; + } + if (array.level <= 0) { + pr_err("Bitmaps not meaningful with level %s\n", + map_num(pers, array.level)?:"of this array"); + return 1; + } + bitmapsize = array.size; + bitmapsize <<= 1; + if (get_dev_size(fd, NULL, &array_size) && + array_size > (0x7fffffffULL<<9)) { + /* Array is big enough that we cannot trust array.size + * try other approaches + */ + bitmapsize = get_component_size(fd); + } + if (bitmapsize == 0) { + pr_err("Cannot reliably determine size of array to create bitmap - sorry.\n"); + return 1; + } + + if (array.level == 10) { + int ncopies = (array.layout&255)*((array.layout>>8)&255); + bitmapsize = bitmapsize * array.raid_disks / ncopies; + } + + st = super_by_fd(fd, &subarray); + if (!st) { + pr_err("Cannot understand version %d.%d\n", + array.major_version, array.minor_version); + return 1; + } + if (subarray) { + pr_err("Cannot add bitmaps to sub-arrays yet\n"); + free(subarray); + free(st); + return 1; + } + if (strcmp(s->bitmap_file, "internal") == 0) { + int rv; + int d; + int offset_setable = 0; + struct mdinfo *mdi; + if (st->ss->add_internal_bitmap == NULL) { + pr_err("Internal bitmaps not supported with %s metadata\n", st->ss->name); + return 1; + } + mdi = sysfs_read(fd, NULL, GET_BITMAP_LOCATION); + if (mdi) + offset_setable = 1; + for (d=0; d< st->max_devs; d++) { + mdu_disk_info_t disk; + char *dv; + disk.number = d; + if (ioctl(fd, GET_DISK_INFO, &disk) < 0) + continue; + if (disk.major == 0 && + disk.minor == 0) + continue; + if ((disk.state & (1<<MD_DISK_SYNC))==0) + continue; + dv = map_dev(disk.major, disk.minor, 1); + if (dv) { + int fd2 = dev_open(dv, O_RDWR); + if (fd2 < 0) + continue; + if (st->ss->load_super(st, fd2, NULL)==0) { + if (st->ss->add_internal_bitmap( + st, + &s->bitmap_chunk, c->delay, s->write_behind, + bitmapsize, offset_setable, + major) + ) + st->ss->write_bitmap(st, fd2); + else { + pr_err("failed to create internal bitmap - chunksize problem.\n"); + close(fd2); + return 1; + } + } + close(fd2); + } + } + if (offset_setable) { + st->ss->getinfo_super(st, mdi, NULL); + sysfs_init(mdi, fd, NULL); + rv = sysfs_set_num_signed(mdi, NULL, "bitmap/location", + mdi->bitmap_offset); + } else { + array.state |= (1<<MD_SB_BITMAP_PRESENT); + rv = ioctl(fd, SET_ARRAY_INFO, &array); + } + if (rv < 0) { + if (errno == EBUSY) + pr_err("Cannot add bitmap while array is resyncing or reshaping etc.\n"); + pr_err("failed to set internal bitmap.\n"); + return 1; + } + } else { + int uuid[4]; + int bitmap_fd; + int d; + int max_devs = st->max_devs; + + /* try to load a superblock */ + for (d = 0; d < max_devs; d++) { + mdu_disk_info_t disk; + char *dv; + int fd2; + disk.number = d; + if (ioctl(fd, GET_DISK_INFO, &disk) < 0) + continue; + if ((disk.major==0 && disk.minor==0) || + (disk.state & (1<<MD_DISK_REMOVED))) + continue; + dv = map_dev(disk.major, disk.minor, 1); + if (!dv) + continue; + fd2 = dev_open(dv, O_RDONLY); + if (fd2 >= 0) { + if (st->ss->load_super(st, fd2, NULL) == 0) { + close(fd2); + st->ss->uuid_from_super(st, uuid); + break; + } + close(fd2); + } + } + if (d == max_devs) { + pr_err("cannot find UUID for array!\n"); + return 1; + } + if (CreateBitmap(s->bitmap_file, c->force, (char*)uuid, s->bitmap_chunk, + c->delay, s->write_behind, bitmapsize, major)) { + return 1; + } + bitmap_fd = open(s->bitmap_file, O_RDWR); + if (bitmap_fd < 0) { + pr_err("weird: %s cannot be opened\n", + s->bitmap_file); + return 1; + } + if (ioctl(fd, SET_BITMAP_FILE, bitmap_fd) < 0) { + int err = errno; + if (errno == EBUSY) + pr_err("Cannot add bitmap while array is resyncing or reshaping etc.\n"); + pr_err("Cannot set bitmap file for %s: %s\n", + devname, strerror(err)); + return 1; + } + } + + return 0; +} + +/* + * When reshaping an array we might need to backup some data. + * This is written to all spares with a 'super_block' describing it. + * The superblock goes 4K from the end of the used space on the + * device. + * It if written after the backup is complete. + * It has the following structure. + */ + +static struct mdp_backup_super { + char magic[16]; /* md_backup_data-1 or -2 */ + __u8 set_uuid[16]; + __u64 mtime; + /* start/sizes in 512byte sectors */ + __u64 devstart; /* address on backup device/file of data */ + __u64 arraystart; + __u64 length; + __u32 sb_csum; /* csum of preceeding bytes. */ + __u32 pad1; + __u64 devstart2; /* offset in to data of second section */ + __u64 arraystart2; + __u64 length2; + __u32 sb_csum2; /* csum of preceeding bytes. */ + __u8 pad[512-68-32]; +} __attribute__((aligned(512))) bsb, bsb2; + +static __u32 bsb_csum(char *buf, int len) +{ + int i; + int csum = 0; + for (i = 0; i < len; i++) + csum = (csum<<3) + buf[0]; + return __cpu_to_le32(csum); +} + +static int check_idle(struct supertype *st) +{ + /* Check that all member arrays for this container, or the + * container of this array, are idle + */ + char *container = (st->container_devnm[0] + ? st->container_devnm : st->devnm); + struct mdstat_ent *ent, *e; + int is_idle = 1; + + ent = mdstat_read(0, 0); + for (e = ent ; e; e = e->next) { + if (!is_container_member(e, container)) + continue; + if (e->percent >= 0) { + is_idle = 0; + break; + } + } + free_mdstat(ent); + return is_idle; +} + +static int freeze_container(struct supertype *st) +{ + char *container = (st->container_devnm[0] + ? st->container_devnm : st->devnm); + + if (!check_idle(st)) + return -1; + + if (block_monitor(container, 1)) { + pr_err("failed to freeze container\n"); + return -2; + } + + return 1; +} + +static void unfreeze_container(struct supertype *st) +{ + char *container = (st->container_devnm[0] + ? st->container_devnm : st->devnm); + + unblock_monitor(container, 1); +} + +static int freeze(struct supertype *st) +{ + /* Try to freeze resync/rebuild on this array/container. + * Return -1 if the array is busy, + * return -2 container cannot be frozen, + * return 0 if this kernel doesn't support 'frozen' + * return 1 if it worked. + */ + if (st->ss->external) + return freeze_container(st); + else { + struct mdinfo *sra = sysfs_read(-1, st->devnm, GET_VERSION); + int err; + char buf[20]; + + if (!sra) + return -1; + /* Need to clear any 'read-auto' status */ + if (sysfs_get_str(sra, NULL, "array_state", buf, 20) > 0 && + strncmp(buf, "read-auto", 9) == 0) + sysfs_set_str(sra, NULL, "array_state", "clean"); + + err = sysfs_freeze_array(sra); + sysfs_free(sra); + return err; + } +} + +static void unfreeze(struct supertype *st) +{ + if (st->ss->external) + return unfreeze_container(st); + else { + struct mdinfo *sra = sysfs_read(-1, st->devnm, GET_VERSION); + char buf[20]; + + if (sra && + sysfs_get_str(sra, NULL, "sync_action", buf, 20) > 0 + && strcmp(buf, "frozen\n") == 0) + sysfs_set_str(sra, NULL, "sync_action", "idle"); + sysfs_free(sra); + } +} + +static void wait_reshape(struct mdinfo *sra) +{ + int fd = sysfs_get_fd(sra, NULL, "sync_action"); + char action[20]; + + if (fd < 0) + return; + + while (sysfs_fd_get_str(fd, action, 20) > 0 && + strncmp(action, "reshape", 7) == 0) + sysfs_wait(fd, NULL); + close(fd); +} + +static int reshape_super(struct supertype *st, unsigned long long size, + int level, int layout, int chunksize, int raid_disks, + int delta_disks, char *backup_file, char *dev, + int direction, int verbose) +{ + /* nothing extra to check in the native case */ + if (!st->ss->external) + return 0; + if (!st->ss->reshape_super || + !st->ss->manage_reshape) { + pr_err("%s metadata does not support reshape\n", + st->ss->name); + return 1; + } + + return st->ss->reshape_super(st, size, level, layout, chunksize, + raid_disks, delta_disks, backup_file, dev, + direction, verbose); +} + +static void sync_metadata(struct supertype *st) +{ + if (st->ss->external) { + if (st->update_tail) { + flush_metadata_updates(st); + st->update_tail = &st->updates; + } else + st->ss->sync_metadata(st); + } +} + +static int subarray_set_num(char *container, struct mdinfo *sra, char *name, int n) +{ + /* when dealing with external metadata subarrays we need to be + * prepared to handle EAGAIN. The kernel may need to wait for + * mdmon to mark the array active so the kernel can handle + * allocations/writeback when preparing the reshape action + * (md_allow_write()). We temporarily disable safe_mode_delay + * to close a race with the array_state going clean before the + * next write to raid_disks / stripe_cache_size + */ + char safe[50]; + int rc; + + /* only 'raid_disks' and 'stripe_cache_size' trigger md_allow_write */ + if (!container || + (strcmp(name, "raid_disks") != 0 && + strcmp(name, "stripe_cache_size") != 0)) + return sysfs_set_num(sra, NULL, name, n); + + rc = sysfs_get_str(sra, NULL, "safe_mode_delay", safe, sizeof(safe)); + if (rc <= 0) + return -1; + sysfs_set_num(sra, NULL, "safe_mode_delay", 0); + rc = sysfs_set_num(sra, NULL, name, n); + if (rc < 0 && errno == EAGAIN) { + ping_monitor(container); + /* if we get EAGAIN here then the monitor is not active + * so stop trying + */ + rc = sysfs_set_num(sra, NULL, name, n); + } + sysfs_set_str(sra, NULL, "safe_mode_delay", safe); + return rc; +} + +int start_reshape(struct mdinfo *sra, int already_running, + int before_data_disks, int data_disks) +{ + int err; + unsigned long long sync_max_to_set; + + sysfs_set_num(sra, NULL, "suspend_lo", 0x7FFFFFFFFFFFFFFFULL); + err = sysfs_set_num(sra, NULL, "suspend_hi", sra->reshape_progress); + err = err ?: sysfs_set_num(sra, NULL, "suspend_lo", + sra->reshape_progress); + if (before_data_disks <= data_disks) + sync_max_to_set = sra->reshape_progress / data_disks; + else + sync_max_to_set = (sra->component_size * data_disks + - sra->reshape_progress) / data_disks; + if (!already_running) + sysfs_set_num(sra, NULL, "sync_min", sync_max_to_set); + err = err ?: sysfs_set_num(sra, NULL, "sync_max", sync_max_to_set); + if (!already_running && err == 0) { + int cnt = 5; + do { + err = sysfs_set_str(sra, NULL, "sync_action", "reshape"); + if (err) + sleep(1); + } while (err && errno == EBUSY && cnt-- > 0); + } + return err; +} + +void abort_reshape(struct mdinfo *sra) +{ + sysfs_set_str(sra, NULL, "sync_action", "idle"); + sysfs_set_num(sra, NULL, "suspend_lo", 0x7FFFFFFFFFFFFFFFULL); + sysfs_set_num(sra, NULL, "suspend_hi", 0); + sysfs_set_num(sra, NULL, "suspend_lo", 0); + sysfs_set_num(sra, NULL, "sync_min", 0); + // It isn't safe to reset sync_max as we aren't monitoring. + // Array really should be stopped at this point. +} + +int remove_disks_for_takeover(struct supertype *st, + struct mdinfo *sra, + int layout) +{ + int nr_of_copies; + struct mdinfo *remaining; + int slot; + + if (sra->array.level == 10) + nr_of_copies = layout & 0xff; + else if (sra->array.level == 1) + nr_of_copies = sra->array.raid_disks; + else + return 1; + + remaining = sra->devs; + sra->devs = NULL; + /* for each 'copy', select one device and remove from the list. */ + for (slot = 0; slot < sra->array.raid_disks; slot += nr_of_copies) { + struct mdinfo **diskp; + int found = 0; + + /* Find a working device to keep */ + for (diskp = &remaining; *diskp ; diskp = &(*diskp)->next) { + struct mdinfo *disk = *diskp; + + if (disk->disk.raid_disk < slot) + continue; + if (disk->disk.raid_disk >= slot + nr_of_copies) + continue; + if (disk->disk.state & (1<<MD_DISK_REMOVED)) + continue; + if (disk->disk.state & (1<<MD_DISK_FAULTY)) + continue; + if (!(disk->disk.state & (1<<MD_DISK_SYNC))) + continue; + + /* We have found a good disk to use! */ + *diskp = disk->next; + disk->next = sra->devs; + sra->devs = disk; + found = 1; + break; + } + if (!found) + break; + } + + if (slot < sra->array.raid_disks) { + /* didn't find all slots */ + struct mdinfo **e; + e = &remaining; + while (*e) + e = &(*e)->next; + *e = sra->devs; + sra->devs = remaining; + return 1; + } + + /* Remove all 'remaining' devices from the array */ + while (remaining) { + struct mdinfo *sd = remaining; + remaining = sd->next; + + sysfs_set_str(sra, sd, "state", "faulty"); + sysfs_set_str(sra, sd, "slot", "none"); + /* for external metadata disks should be removed in mdmon */ + if (!st->ss->external) + sysfs_set_str(sra, sd, "state", "remove"); + sd->disk.state |= (1<<MD_DISK_REMOVED); + sd->disk.state &= ~(1<<MD_DISK_SYNC); + sd->next = sra->devs; + sra->devs = sd; + } + return 0; +} + +void reshape_free_fdlist(int *fdlist, + unsigned long long *offsets, + int size) +{ + int i; + + for (i = 0; i < size; i++) + if (fdlist[i] >= 0) + close(fdlist[i]); + + free(fdlist); + free(offsets); +} + +int reshape_prepare_fdlist(char *devname, + struct mdinfo *sra, + int raid_disks, + int nrdisks, + unsigned long blocks, + char *backup_file, + int *fdlist, + unsigned long long *offsets) +{ + int d = 0; + struct mdinfo *sd; + + enable_fds(nrdisks); + for (d = 0; d <= nrdisks; d++) + fdlist[d] = -1; + d = raid_disks; + for (sd = sra->devs; sd; sd = sd->next) { + if (sd->disk.state & (1<<MD_DISK_FAULTY)) + continue; + if (sd->disk.state & (1<<MD_DISK_SYNC) && + sd->disk.raid_disk < raid_disks) { + char *dn = map_dev(sd->disk.major, + sd->disk.minor, 1); + fdlist[sd->disk.raid_disk] + = dev_open(dn, O_RDONLY); + offsets[sd->disk.raid_disk] = sd->data_offset*512; + if (fdlist[sd->disk.raid_disk] < 0) { + pr_err("%s: cannot open component %s\n", + devname, dn ? dn : "-unknown-"); + d = -1; + goto release; + } + } else if (backup_file == NULL) { + /* spare */ + char *dn = map_dev(sd->disk.major, + sd->disk.minor, 1); + fdlist[d] = dev_open(dn, O_RDWR); + offsets[d] = (sd->data_offset + sra->component_size - blocks - 8)*512; + if (fdlist[d] < 0) { + pr_err("%s: cannot open component %s\n", + devname, dn ? dn : "-unknown-"); + d = -1; + goto release; + } + d++; + } + } +release: + return d; +} + +int reshape_open_backup_file(char *backup_file, + int fd, + char *devname, + long blocks, + int *fdlist, + unsigned long long *offsets, + char *sys_name, + int restart) +{ + /* Return 1 on success, 0 on any form of failure */ + /* need to check backup file is large enough */ + char buf[512]; + struct stat stb; + unsigned int dev; + int i; + + *fdlist = open(backup_file, O_RDWR|O_CREAT|(restart ? O_TRUNC : O_EXCL), + S_IRUSR | S_IWUSR); + *offsets = 8 * 512; + if (*fdlist < 0) { + pr_err("%s: cannot create backup file %s: %s\n", + devname, backup_file, strerror(errno)); + return 0; + } + /* Guard against backup file being on array device. + * If array is partitioned or if LVM etc is in the + * way this will not notice, but it is better than + * nothing. + */ + fstat(*fdlist, &stb); + dev = stb.st_dev; + fstat(fd, &stb); + if (stb.st_rdev == dev) { + pr_err("backup file must NOT be on the array being reshaped.\n"); + close(*fdlist); + return 0; + } + + memset(buf, 0, 512); + for (i=0; i < blocks + 8 ; i++) { + if (write(*fdlist, buf, 512) != 512) { + pr_err("%s: cannot create backup file %s: %s\n", + devname, backup_file, strerror(errno)); + return 0; + } + } + if (fsync(*fdlist) != 0) { + pr_err("%s: cannot create backup file %s: %s\n", + devname, backup_file, strerror(errno)); + return 0; + } + + if (!restart && strncmp(backup_file, MAP_DIR, strlen(MAP_DIR)) != 0) { + char *bu = make_backup(sys_name); + if (symlink(backup_file, bu)) + pr_err("Recording backup file in " MAP_DIR " failed: %s\n", + strerror(errno)); + free(bu); + } + + return 1; +} + +unsigned long compute_backup_blocks(int nchunk, int ochunk, + unsigned int ndata, unsigned int odata) +{ + unsigned long a, b, blocks; + /* So how much do we need to backup. + * We need an amount of data which is both a whole number of + * old stripes and a whole number of new stripes. + * So LCM for (chunksize*datadisks). + */ + a = (ochunk/512) * odata; + b = (nchunk/512) * ndata; + /* Find GCD */ + a = GCD(a, b); + /* LCM == product / GCD */ + blocks = (ochunk/512) * (nchunk/512) * odata * ndata / a; + + return blocks; +} + +char *analyse_change(char *devname, struct mdinfo *info, struct reshape *re) +{ + /* Based on the current array state in info->array and + * the changes in info->new_* etc, determine: + * - whether the change is possible + * - Intermediate level/raid_disks/layout + * - whether a restriping reshape is needed + * - number of sectors in minimum change unit. This + * will cover a whole number of stripes in 'before' and + * 'after'. + * + * Return message if the change should be rejected + * NULL if the change can be achieved + * + * This can be called as part of starting a reshape, or + * when assembling an array that is undergoing reshape. + */ + int near, far, offset, copies; + int new_disks; + int old_chunk, new_chunk; + /* delta_parity records change in number of devices + * caused by level change + */ + int delta_parity = 0; + + memset(re, 0, sizeof(*re)); + + /* If a new level not explicitly given, we assume no-change */ + if (info->new_level == UnSet) + info->new_level = info->array.level; + + if (info->new_chunk) + switch (info->new_level) { + case 0: + case 4: + case 5: + case 6: + case 10: + /* chunk size is meaningful, must divide component_size + * evenly + */ + if (info->component_size % (info->new_chunk/512)) { + unsigned long long shrink = info->component_size; + shrink &= ~(unsigned long long)(info->new_chunk/512-1); + pr_err("New chunk size (%dK) does not evenly divide device size (%lluk)\n", + info->new_chunk/1024, info->component_size/2); + pr_err("After shrinking any filesystem, \"mdadm --grow %s --size %llu\"\n", + devname, shrink/2); + pr_err("will shrink the array so the given chunk size would work.\n"); + return ""; + } + break; + default: + return "chunk size not meaningful for this level"; + } + else + info->new_chunk = info->array.chunk_size; + + switch (info->array.level) { + default: + return "No reshape is possibly for this RAID level"; + case LEVEL_LINEAR: + if (info->delta_disks != UnSet) + return "Only --add is supported for LINEAR, setting --raid-disks is not needed"; + else + return "Only --add is supported for LINEAR, other --grow options are not meaningful"; + case 1: + /* RAID1 can convert to RAID1 with different disks, or + * raid5 with 2 disks, or + * raid0 with 1 disk + */ + if (info->new_level > 1 && + (info->component_size & 7)) + return "Cannot convert RAID1 of this size - reduce size to multiple of 4K first."; + if (info->new_level == 0) { + if (info->delta_disks != UnSet && + info->delta_disks != 0) + return "Cannot change number of disks with RAID1->RAID0 conversion"; + re->level = 0; + re->before.data_disks = 1; + re->after.data_disks = 1; + return NULL; + } + if (info->new_level == 1) { + if (info->delta_disks == UnSet) + /* Don't know what to do */ + return "no change requested for Growing RAID1"; + re->level = 1; + return NULL; + } + if (info->array.raid_disks == 2 && + info->new_level == 5) { + + re->level = 5; + re->before.data_disks = 1; + if (info->delta_disks != UnSet && + info->delta_disks != 0) + re->after.data_disks = 1 + info->delta_disks; + else + re->after.data_disks = 1; + if (re->after.data_disks < 1) + return "Number of disks too small for RAID5"; + + re->before.layout = ALGORITHM_LEFT_SYMMETRIC; + info->array.chunk_size = 65536; + break; + } + /* Could do some multi-stage conversions, but leave that to + * later. + */ + return "Impossibly level change request for RAID1"; + + case 10: + /* RAID10 can be converted from near mode to + * RAID0 by removing some devices. + * It can also be reshaped if the kernel supports + * new_data_offset. + */ + switch (info->new_level) { + case 0: + if ((info->array.layout & ~0xff) != 0x100) + return "Cannot Grow RAID10 with far/offset layout"; + /* number of devices must be multiple of number of copies */ + if (info->array.raid_disks % (info->array.layout & 0xff)) + return "RAID10 layout too complex for Grow operation"; + + new_disks = (info->array.raid_disks + / (info->array.layout & 0xff)); + if (info->delta_disks == UnSet) + info->delta_disks = (new_disks + - info->array.raid_disks); + + if (info->delta_disks != new_disks - info->array.raid_disks) + return "New number of raid-devices impossible for RAID10"; + if (info->new_chunk && + info->new_chunk != info->array.chunk_size) + return "Cannot change chunk-size with RAID10 Grow"; + + /* looks good */ + re->level = 0; + re->before.data_disks = new_disks; + re->after.data_disks = re->before.data_disks; + return NULL; + + case 10: + near = info->array.layout & 0xff; + far = (info->array.layout >> 8) & 0xff; + offset = info->array.layout & 0x10000; + if (far > 1 && !offset) + return "Cannot reshape RAID10 in far-mode"; + copies = near * far; + + old_chunk = info->array.chunk_size * far; + + if (info->new_layout == UnSet) + info->new_layout = info->array.layout; + else { + near = info->new_layout & 0xff; + far = (info->new_layout >> 8) & 0xff; + offset = info->new_layout & 0x10000; + if (far > 1 && !offset) + return "Cannot reshape RAID10 to far-mode"; + if (near * far != copies) + return "Cannot change number of copies when reshaping RAID10"; + } + if (info->delta_disks == UnSet) + info->delta_disks = 0; + new_disks = (info->array.raid_disks + + info->delta_disks); + + new_chunk = info->new_chunk * far; + + re->level = 10; + re->before.layout = info->array.layout; + re->before.data_disks = info->array.raid_disks; + re->after.layout = info->new_layout; + re->after.data_disks = new_disks; + /* For RAID10 we don't do backup but do allow reshape, + * so set backup_blocks to INVALID_SECTORS rather than + * zero. + * And there is no need to synchronise stripes on both + * 'old' and 'new'. So the important + * number is the minimum data_offset difference + * which is the larger of (offset copies * chunk). + */ + re->backup_blocks = INVALID_SECTORS; + re->min_offset_change = max(old_chunk, new_chunk) / 512; + if (new_disks < re->before.data_disks && + info->space_after < re->min_offset_change) + /* Reduce component size by one chunk */ + re->new_size = (info->component_size - + re->min_offset_change); + else + re->new_size = info->component_size; + re->new_size = re->new_size * new_disks / copies; + return NULL; + + default: + return "RAID10 can only be changed to RAID0"; + } + case 0: + /* RAID0 can be converted to RAID10, or to RAID456 */ + if (info->new_level == 10) { + if (info->new_layout == UnSet && info->delta_disks == UnSet) { + /* Assume near=2 layout */ + info->new_layout = 0x102; + info->delta_disks = info->array.raid_disks; + } + if (info->new_layout == UnSet) { + int copies = 1 + (info->delta_disks + / info->array.raid_disks); + if (info->array.raid_disks * (copies-1) + != info->delta_disks) + return "Impossible number of devices for RAID0->RAID10"; + info->new_layout = 0x100 + copies; + } + if (info->delta_disks == UnSet) { + int copies = info->new_layout & 0xff; + if (info->new_layout != 0x100 + copies) + return "New layout impossible for RAID0->RAID10";; + info->delta_disks = (copies - 1) * + info->array.raid_disks; + } + if (info->new_chunk && + info->new_chunk != info->array.chunk_size) + return "Cannot change chunk-size with RAID0->RAID10"; + /* looks good */ + re->level = 10; + re->before.data_disks = (info->array.raid_disks + + info->delta_disks); + re->after.data_disks = re->before.data_disks; + re->before.layout = info->new_layout; + return NULL; + } + + /* RAID0 can also covert to RAID0/4/5/6 by first converting to + * a raid4 style layout of the final level. + */ + switch (info->new_level) { + case 4: + delta_parity = 1; + case 0: + re->level = 4; + re->before.layout = 0; + break; + case 5: + delta_parity = 1; + re->level = 5; + re->before.layout = ALGORITHM_PARITY_N; + if (info->new_layout == UnSet) + info->new_layout = map_name(r5layout, "default"); + break; + case 6: + delta_parity = 2; + re->level = 6; + re->before.layout = ALGORITHM_PARITY_N; + if (info->new_layout == UnSet) + info->new_layout = map_name(r6layout, "default"); + break; + default: + return "Impossible level change requested"; + } + re->before.data_disks = info->array.raid_disks; + /* determining 'after' layout happens outside this 'switch' */ + break; + + case 4: + info->array.layout = ALGORITHM_PARITY_N; + case 5: + switch (info->new_level) { + case 0: + delta_parity = -1; + case 4: + re->level = info->array.level; + re->before.data_disks = info->array.raid_disks - 1; + re->before.layout = info->array.layout; + break; + case 5: + re->level = 5; + re->before.data_disks = info->array.raid_disks - 1; + re->before.layout = info->array.layout; + break; + case 6: + delta_parity = 1; + re->level = 6; + re->before.data_disks = info->array.raid_disks - 1; + switch (info->array.layout) { + case ALGORITHM_LEFT_ASYMMETRIC: + re->before.layout = ALGORITHM_LEFT_ASYMMETRIC_6; + break; + case ALGORITHM_RIGHT_ASYMMETRIC: + re->before.layout = ALGORITHM_RIGHT_ASYMMETRIC_6; + break; + case ALGORITHM_LEFT_SYMMETRIC: + re->before.layout = ALGORITHM_LEFT_SYMMETRIC_6; + break; + case ALGORITHM_RIGHT_SYMMETRIC: + re->before.layout = ALGORITHM_RIGHT_SYMMETRIC_6; + break; + case ALGORITHM_PARITY_0: + re->before.layout = ALGORITHM_PARITY_0_6; + break; + case ALGORITHM_PARITY_N: + re->before.layout = ALGORITHM_PARITY_N_6; + break; + default: + return "Cannot convert an array with this layout"; + } + break; + case 1: + if (info->array.raid_disks != 2) + return "Can only convert a 2-device array to RAID1"; + if (info->delta_disks != UnSet && + info->delta_disks != 0) + return "Cannot set raid_disk when converting RAID5->RAID1"; + re->level = 1; + info->new_chunk = 0; + return NULL; + default: + return "Impossible level change requested"; + } + break; + case 6: + switch (info->new_level) { + case 4: + case 5: + delta_parity = -1; + case 6: + re->level = 6; + re->before.data_disks = info->array.raid_disks - 2; + re->before.layout = info->array.layout; + break; + default: + return "Impossible level change requested"; + } + break; + } + + /* If we reached here then it looks like a re-stripe is + * happening. We have determined the intermediate level + * and initial raid_disks/layout and stored these in 're'. + * + * We need to deduce the final layout that can be atomically + * converted to the end state. + */ + switch (info->new_level) { + case 0: + /* We can only get to RAID0 from RAID4 or RAID5 + * with appropriate layout and one extra device + */ + if (re->level != 4 && re->level != 5) + return "Cannot covert to RAID0 from this level"; + + switch (re->level) { + case 4: + re->before.layout = 0; + re->after.layout = 0; + break; + case 5: + re->after.layout = ALGORITHM_PARITY_N; + break; + } + break; + + case 4: + /* We can only get to RAID4 from RAID5 */ + if (re->level != 4 && re->level != 5) + return "Cannot convert to RAID4 from this level"; + + switch (re->level) { + case 4: + re->after.layout = 0; + break; + case 5: + re->after.layout = ALGORITHM_PARITY_N; + break; + } + break; + + case 5: + /* We get to RAID5 from RAID5 or RAID6 */ + if (re->level != 5 && re->level != 6) + return "Cannot convert to RAID5 from this level"; + + switch (re->level) { + case 5: + if (info->new_layout == UnSet) + re->after.layout = re->before.layout; + else + re->after.layout = info->new_layout; + break; + case 6: + if (info->new_layout == UnSet) + info->new_layout = re->before.layout; + + /* after.layout needs to be raid6 version of new_layout */ + if (info->new_layout == ALGORITHM_PARITY_N) + re->after.layout = ALGORITHM_PARITY_N; + else { + char layout[40]; + char *ls = map_num(r5layout, info->new_layout); + int l; + if (ls) { + /* Current RAID6 layout has a RAID5 + * equivalent - good + */ + strcat(strcpy(layout, ls), "-6"); + l = map_name(r6layout, layout); + if (l == UnSet) + return "Cannot find RAID6 layout to convert to"; + } else { + /* Current RAID6 has no equivalent. + * If it is already a '-6' layout we + * can leave it unchanged, else we must + * fail + */ + ls = map_num(r6layout, info->new_layout); + if (!ls || + strcmp(ls+strlen(ls)-2, "-6") != 0) + return "Please specify new layout"; + l = info->new_layout; + } + re->after.layout = l; + } + } + break; + + case 6: + /* We must already be at level 6 */ + if (re->level != 6) + return "Impossible level change"; + if (info->new_layout == UnSet) + re->after.layout = info->array.layout; + else + re->after.layout = info->new_layout; + break; + default: + return "Impossible level change requested"; + } + if (info->delta_disks == UnSet) + info->delta_disks = delta_parity; + + re->after.data_disks = (re->before.data_disks + + info->delta_disks + - delta_parity); + switch (re->level) { + case 6: re->parity = 2; + break; + case 4: + case 5: re->parity = 1; + break; + default: re->parity = 0; + break; + } + /* So we have a restripe operation, we need to calculate the number + * of blocks per reshape operation. + */ + re->new_size = info->component_size * re->before.data_disks; + if (info->new_chunk == 0) + info->new_chunk = info->array.chunk_size; + if (re->after.data_disks == re->before.data_disks && + re->after.layout == re->before.layout && + info->new_chunk == info->array.chunk_size) { + /* Nothing to change, can change level immediately. */ + re->level = info->new_level; + re->backup_blocks = 0; + return NULL; + } + if (re->after.data_disks == 1 && re->before.data_disks == 1) { + /* chunk and layout changes make no difference */ + re->level = info->new_level; + re->backup_blocks = 0; + return NULL; + } + + if (re->after.data_disks == re->before.data_disks && + get_linux_version() < 2006032) + return "in-place reshape is not safe before 2.6.32 - sorry."; + + if (re->after.data_disks < re->before.data_disks && + get_linux_version() < 2006030) + return "reshape to fewer devices is not supported before 2.6.30 - sorry."; + + re->backup_blocks = compute_backup_blocks( + info->new_chunk, info->array.chunk_size, + re->after.data_disks, + re->before.data_disks); + re->min_offset_change = re->backup_blocks / re->before.data_disks; + + re->new_size = info->component_size * re->after.data_disks; + return NULL; +} + +static int set_array_size(struct supertype *st, struct mdinfo *sra, + char *text_version) +{ + struct mdinfo *info; + char *subarray; + int ret_val = -1; + + if ((st == NULL) || (sra == NULL)) + return ret_val; + + if (text_version == NULL) + text_version = sra->text_version; + subarray = strchr(text_version+1, '/')+1; + info = st->ss->container_content(st, subarray); + if (info) { + unsigned long long current_size = 0; + unsigned long long new_size = + info->custom_array_size/2; + + if (sysfs_get_ll(sra, NULL, "array_size", ¤t_size) == 0 && + new_size > current_size) { + if (sysfs_set_num(sra, NULL, "array_size", new_size) + < 0) + dprintf("Error: Cannot set array size"); + else { + ret_val = 0; + dprintf("Array size changed"); + } + dprintf_cont(" from %llu to %llu.\n", + current_size, new_size); + } + sysfs_free(info); + } else + dprintf("Error: set_array_size(): info pointer in NULL\n"); + + return ret_val; +} + +static int reshape_array(char *container, int fd, char *devname, + struct supertype *st, struct mdinfo *info, + int force, struct mddev_dev *devlist, + unsigned long long data_offset, + char *backup_file, int verbose, int forked, + int restart, int freeze_reshape); +static int reshape_container(char *container, char *devname, + int mdfd, + struct supertype *st, + struct mdinfo *info, + int force, + char *backup_file, int verbose, + int forked, int restart, int freeze_reshape); + +int Grow_reshape(char *devname, int fd, + struct mddev_dev *devlist, + unsigned long long data_offset, + struct context *c, struct shape *s) +{ + /* Make some changes in the shape of an array. + * The kernel must support the change. + * + * There are three different changes. Each can trigger + * a resync or recovery so we freeze that until we have + * requested everything (if kernel supports freezing - 2.6.30). + * The steps are: + * - change size (i.e. component_size) + * - change level + * - change layout/chunksize/ndisks + * + * The last can require a reshape. It is different on different + * levels so we need to check the level before actioning it. + * Some times the level change needs to be requested after the + * reshape (e.g. raid6->raid5, raid5->raid0) + * + */ + struct mdu_array_info_s array; + int rv = 0; + struct supertype *st; + char *subarray = NULL; + + int frozen; + int changed = 0; + char *container = NULL; + int cfd = -1; + + struct mddev_dev *dv; + int added_disks; + + struct mdinfo info; + struct mdinfo *sra; + + if (ioctl(fd, GET_ARRAY_INFO, &array) < 0) { + pr_err("%s is not an active md array - aborting\n", + devname); + return 1; + } + if (data_offset != INVALID_SECTORS && array.level != 10 + && (array.level < 4 || array.level > 6)) { + pr_err("--grow --data-offset not yet supported\n"); + return 1; + } + + if (s->size > 0 && + (s->chunk || s->level!= UnSet || s->layout_str || s->raiddisks)) { + pr_err("cannot change component size at the same time as other changes.\n" + " Change size first, then check data is intact before making other changes.\n"); + return 1; + } + + if (s->raiddisks && s->raiddisks < array.raid_disks && array.level > 1 && + get_linux_version() < 2006032 && + !check_env("MDADM_FORCE_FEWER")) { + pr_err("reducing the number of devices is not safe before Linux 2.6.32\n" + " Please use a newer kernel\n"); + return 1; + } + + st = super_by_fd(fd, &subarray); + if (!st) { + pr_err("Unable to determine metadata format for %s\n", devname); + return 1; + } + if (s->raiddisks > st->max_devs) { + pr_err("Cannot increase raid-disks on this array beyond %d\n", st->max_devs); + return 1; + } + + /* in the external case we need to check that the requested reshape is + * supported, and perform an initial check that the container holds the + * pre-requisite spare devices (mdmon owns final validation) + */ + if (st->ss->external) { + int rv; + + if (subarray) { + container = st->container_devnm; + cfd = open_dev_excl(st->container_devnm); + } else { + container = st->devnm; + close(fd); + cfd = open_dev_excl(st->devnm); + fd = cfd; + } + if (cfd < 0) { + pr_err("Unable to open container for %s\n", + devname); + free(subarray); + return 1; + } + + rv = st->ss->load_container(st, cfd, NULL); + + if (rv) { + pr_err("Cannot read superblock for %s\n", + devname); + free(subarray); + return 1; + } + + /* check if operation is supported for metadata handler */ + if (st->ss->container_content) { + struct mdinfo *cc = NULL; + struct mdinfo *content = NULL; + + cc = st->ss->container_content(st, subarray); + for (content = cc; content ; content = content->next) { + int allow_reshape = 1; + + /* check if reshape is allowed based on metadata + * indications stored in content.array.status + */ + if (content->array.state & (1<<MD_SB_BLOCK_VOLUME)) + allow_reshape = 0; + if (content->array.state + & (1<<MD_SB_BLOCK_CONTAINER_RESHAPE)) + allow_reshape = 0; + if (!allow_reshape) { + pr_err("cannot reshape arrays in container with unsupported metadata: %s(%s)\n", + devname, container); + sysfs_free(cc); + free(subarray); + return 1; + } + } + sysfs_free(cc); + } + if (mdmon_running(container)) + st->update_tail = &st->updates; + } + + added_disks = 0; + for (dv = devlist; dv; dv = dv->next) + added_disks++; + if (s->raiddisks > array.raid_disks && + array.spare_disks +added_disks < (s->raiddisks - array.raid_disks) && + !c->force) { + pr_err("Need %d spare%s to avoid degraded array, and only have %d.\n" + " Use --force to over-ride this check.\n", + s->raiddisks - array.raid_disks, + s->raiddisks - array.raid_disks == 1 ? "" : "s", + array.spare_disks + added_disks); + return 1; + } + + sra = sysfs_read(fd, NULL, GET_LEVEL | GET_DISKS | GET_DEVS + | GET_STATE | GET_VERSION); + if (sra) { + if (st->ss->external && subarray == NULL) { + array.level = LEVEL_CONTAINER; + sra->array.level = LEVEL_CONTAINER; + } + } else { + pr_err("failed to read sysfs parameters for %s\n", + devname); + return 1; + } + frozen = freeze(st); + if (frozen < -1) { + /* freeze() already spewed the reason */ + sysfs_free(sra); + return 1; + } else if (frozen < 0) { + pr_err("%s is performing resync/recovery and cannot be reshaped\n", devname); + sysfs_free(sra); + return 1; + } + + /* ========= set size =============== */ + if (s->size > 0 && (s->size == MAX_SIZE || s->size != (unsigned)array.size)) { + unsigned long long orig_size = get_component_size(fd)/2; + unsigned long long min_csize; + struct mdinfo *mdi; + int raid0_takeover = 0; + + if (orig_size == 0) + orig_size = (unsigned) array.size; + + if (orig_size == 0) { + pr_err("Cannot set device size in this type of array.\n"); + rv = 1; + goto release; + } + + if (reshape_super(st, s->size, UnSet, UnSet, 0, 0, UnSet, NULL, + devname, APPLY_METADATA_CHANGES, c->verbose > 0)) { + rv = 1; + goto release; + } + sync_metadata(st); + if (st->ss->external) { + /* metadata can have size limitation + * update size value according to metadata information + */ + struct mdinfo *sizeinfo = + st->ss->container_content(st, subarray); + if (sizeinfo) { + unsigned long long new_size = + sizeinfo->custom_array_size/2; + int data_disks = get_data_disks( + sizeinfo->array.level, + sizeinfo->array.layout, + sizeinfo->array.raid_disks); + new_size /= data_disks; + dprintf("Metadata size correction from %llu to %llu (%llu)\n", orig_size, new_size, + new_size * data_disks); + s->size = new_size; + sysfs_free(sizeinfo); + } + } + + /* Update the size of each member device in case + * they have been resized. This will never reduce + * below the current used-size. The "size" attribute + * understands '0' to mean 'max'. + */ + min_csize = 0; + rv = 0; + for (mdi = sra->devs; mdi; mdi = mdi->next) { + if (sysfs_set_num(sra, mdi, "size", + s->size == MAX_SIZE ? 0 : s->size) < 0) { + /* Probably kernel refusing to let us + * reduce the size - not an error. + */ + break; + } + if (array.not_persistent == 0 && + array.major_version == 0 && + get_linux_version() < 3001000) { + /* Dangerous to allow size to exceed 2TB */ + unsigned long long csize; + if (sysfs_get_ll(sra, mdi, "size", &csize) == 0) { + if (csize >= 2ULL*1024*1024*1024) + csize = 2ULL*1024*1024*1024; + if ((min_csize == 0 || (min_csize + > csize))) + min_csize = csize; + } + } + } + if (rv) { + pr_err("Cannot set size on array members.\n"); + goto size_change_error; + } + if (min_csize && s->size > min_csize) { + pr_err("Cannot safely make this array use more than 2TB per device on this kernel.\n"); + rv = 1; + goto size_change_error; + } + if (min_csize && s->size == MAX_SIZE) { + /* Don't let the kernel choose a size - it will get + * it wrong + */ + pr_err("Limited v0.90 array to 2TB per device\n"); + s->size = min_csize; + } + if (st->ss->external) { + if (sra->array.level == 0) { + rv = sysfs_set_str(sra, NULL, "level", + "raid5"); + if (!rv) { + raid0_takeover = 1; + /* get array parameters after takeover + * to change one parameter at time only + */ + rv = ioctl(fd, GET_ARRAY_INFO, &array); + } + } + /* make sure mdmon is + * aware of the new level */ + if (!mdmon_running(st->container_devnm)) + start_mdmon(st->container_devnm); + ping_monitor(container); + if (mdmon_running(st->container_devnm) && + st->update_tail == NULL) + st->update_tail = &st->updates; + } + + if (s->size == MAX_SIZE) + s->size = 0; + array.size = s->size; + if (s->size & ~INT32_MAX) { + /* got truncated to 32bit, write to + * component_size instead + */ + if (sra) + rv = sysfs_set_num(sra, NULL, + "component_size", s->size); + else + rv = -1; + } else { + rv = ioctl(fd, SET_ARRAY_INFO, &array); + + /* manage array size when it is managed externally + */ + if ((rv == 0) && st->ss->external) + rv = set_array_size(st, sra, sra->text_version); + } + + if (raid0_takeover) { + /* do not recync non-existing parity, + * we will drop it anyway + */ + sysfs_set_str(sra, NULL, "sync_action", "frozen"); + /* go back to raid0, drop parity disk + */ + sysfs_set_str(sra, NULL, "level", "raid0"); + ioctl(fd, GET_ARRAY_INFO, &array); + } + +size_change_error: + if (rv != 0) { + int err = errno; + + /* restore metadata */ + if (reshape_super(st, orig_size, UnSet, UnSet, 0, 0, + UnSet, NULL, devname, + ROLLBACK_METADATA_CHANGES, + c->verbose) == 0) + sync_metadata(st); + pr_err("Cannot set device size for %s: %s\n", + devname, strerror(err)); + if (err == EBUSY && + (array.state & (1<<MD_SB_BITMAP_PRESENT))) + cont_err("Bitmap must be removed before size can be changed\n"); + rv = 1; + goto release; + } + if (s->assume_clean) { + /* This will fail on kernels older than 3.0 unless + * a backport has been arranged. + */ + if (sra == NULL || + sysfs_set_str(sra, NULL, "resync_start", "none") < 0) + pr_err("--assume-clean not supported with --grow on this kernel\n"); + } + ioctl(fd, GET_ARRAY_INFO, &array); + s->size = get_component_size(fd)/2; + if (s->size == 0) + s->size = array.size; + if (c->verbose >= 0) { + if (s->size == orig_size) + pr_err("component size of %s unchanged at %lluK\n", + devname, s->size); + else + pr_err("component size of %s has been set to %lluK\n", + devname, s->size); + } + changed = 1; + } else if (array.level != LEVEL_CONTAINER) { + s->size = get_component_size(fd)/2; + if (s->size == 0) + s->size = array.size; + } + + /* See if there is anything else to do */ + if ((s->level == UnSet || s->level == array.level) && + (s->layout_str == NULL) && + (s->chunk == 0 || s->chunk == array.chunk_size) && + data_offset == INVALID_SECTORS && + (s->raiddisks == 0 || s->raiddisks == array.raid_disks)) { + /* Nothing more to do */ + if (!changed && c->verbose >= 0) + pr_err("%s: no change requested\n", + devname); + goto release; + } + + /* ========= check for Raid10/Raid1 -> Raid0 conversion =============== + * current implementation assumes that following conditions must be met: + * - RAID10: + * - far_copies == 1 + * - near_copies == 2 + */ + if ((s->level == 0 && array.level == 10 && sra && + array.layout == ((1 << 8) + 2) && !(array.raid_disks & 1)) || + (s->level == 0 && array.level == 1 && sra)) { + int err; + err = remove_disks_for_takeover(st, sra, array.layout); + if (err) { + dprintf("Array cannot be reshaped\n"); + if (cfd > -1) + close(cfd); + rv = 1; + goto release; + } + /* Make sure mdmon has seen the device removal + * and updated metadata before we continue with + * level change + */ + if (container) + ping_monitor(container); + } + + memset(&info, 0, sizeof(info)); + info.array = array; + sysfs_init(&info, fd, NULL); + strcpy(info.text_version, sra->text_version); + info.component_size = s->size*2; + info.new_level = s->level; + info.new_chunk = s->chunk * 1024; + if (info.array.level == LEVEL_CONTAINER) { + info.delta_disks = UnSet; + info.array.raid_disks = s->raiddisks; + } else if (s->raiddisks) + info.delta_disks = s->raiddisks - info.array.raid_disks; + else + info.delta_disks = UnSet; + if (s->layout_str == NULL) { + info.new_layout = UnSet; + if (info.array.level == 6 && + (info.new_level == 6 || info.new_level == UnSet) && + info.array.layout >= 16) { + pr_err("%s has a non-standard layout. If you wish to preserve this\n", devname); + cont_err("during the reshape, please specify --layout=preserve\n"); + cont_err("If you want to change it, specify a layout or use --layout=normalise\n"); + rv = 1; + goto release; + } + } else if (strcmp(s->layout_str, "normalise") == 0 || + strcmp(s->layout_str, "normalize") == 0) { + /* If we have a -6 RAID6 layout, remove the '-6'. */ + info.new_layout = UnSet; + if (info.array.level == 6 && info.new_level == UnSet) { + char l[40], *h; + strcpy(l, map_num(r6layout, info.array.layout)); + h = strrchr(l, '-'); + if (h && strcmp(h, "-6") == 0) { + *h = 0; + info.new_layout = map_name(r6layout, l); + } + } else { + pr_err("%s is only meaningful when reshaping a RAID6 array.\n", s->layout_str); + rv = 1; + goto release; + } + } else if (strcmp(s->layout_str, "preserve") == 0) { + /* This means that a non-standard RAID6 layout + * is OK. + * In particular: + * - When reshape a RAID6 (e.g. adding a device) + * which is in a non-standard layout, it is OK + * to preserve that layout. + * - When converting a RAID5 to RAID6, leave it in + * the XXX-6 layout, don't re-layout. + */ + if (info.array.level == 6 && info.new_level == UnSet) + info.new_layout = info.array.layout; + else if (info.array.level == 5 && info.new_level == 6) { + char l[40]; + strcpy(l, map_num(r5layout, info.array.layout)); + strcat(l, "-6"); + info.new_layout = map_name(r6layout, l); + } else { + pr_err("%s in only meaningful when reshaping to RAID6\n", s->layout_str); + rv = 1; + goto release; + } + } else { + int l = info.new_level; + if (l == UnSet) + l = info.array.level; + switch (l) { + case 5: + info.new_layout = map_name(r5layout, s->layout_str); + break; + case 6: + info.new_layout = map_name(r6layout, s->layout_str); + break; + case 10: + info.new_layout = parse_layout_10(s->layout_str); + break; + case LEVEL_FAULTY: + info.new_layout = parse_layout_faulty(s->layout_str); + break; + default: + pr_err("layout not meaningful with this level\n"); + rv = 1; + goto release; + } + if (info.new_layout == UnSet) { + pr_err("layout %s not understood for this level\n", + s->layout_str); + rv = 1; + goto release; + } + } + + if (array.level == LEVEL_FAULTY) { + if (s->level != UnSet && s->level != array.level) { + pr_err("cannot change level of Faulty device\n"); + rv =1 ; + } + if (s->chunk) { + pr_err("cannot set chunksize of Faulty device\n"); + rv =1 ; + } + if (s->raiddisks && s->raiddisks != 1) { + pr_err("cannot set raid_disks of Faulty device\n"); + rv =1 ; + } + if (s->layout_str) { + if (ioctl(fd, GET_ARRAY_INFO, &array) != 0) { + dprintf("Cannot get array information.\n"); + goto release; + } + array.layout = info.new_layout; + if (ioctl(fd, SET_ARRAY_INFO, &array) != 0) { + pr_err("failed to set new layout\n"); + rv = 1; + } else if (c->verbose >= 0) + printf("layout for %s set to %d\n", + devname, array.layout); + } + } else if (array.level == LEVEL_CONTAINER) { + /* This change is to be applied to every array in the + * container. This is only needed when the metadata imposes + * restraints of the various arrays in the container. + * Currently we only know that IMSM requires all arrays + * to have the same number of devices so changing the + * number of devices (On-Line Capacity Expansion) must be + * performed at the level of the container + */ + rv = reshape_container(container, devname, -1, st, &info, + c->force, c->backup_file, c->verbose, 0, 0, 0); + frozen = 0; + } else { + /* get spare devices from external metadata + */ + if (st->ss->external) { + struct mdinfo *info2; + + info2 = st->ss->container_content(st, subarray); + if (info2) { + info.array.spare_disks = + info2->array.spare_disks; + sysfs_free(info2); + } + } + + /* Impose these changes on a single array. First + * check that the metadata is OK with the change. */ + + if (reshape_super(st, 0, info.new_level, + info.new_layout, info.new_chunk, + info.array.raid_disks, info.delta_disks, + c->backup_file, devname, APPLY_METADATA_CHANGES, + c->verbose)) { + rv = 1; + goto release; + } + sync_metadata(st); + rv = reshape_array(container, fd, devname, st, &info, c->force, + devlist, data_offset, c->backup_file, c->verbose, + 0, 0, 0); + frozen = 0; + } +release: + sysfs_free(sra); + if (frozen > 0) + unfreeze(st); + return rv; +} + +/* verify_reshape_position() + * Function checks if reshape position in metadata is not farther + * than position in md. + * Return value: + * 0 : not valid sysfs entry + * it can be caused by not started reshape, it should be started + * by reshape array or raid0 array is before takeover + * -1 : error, reshape position is obviously wrong + * 1 : success, reshape progress correct or updated +*/ +static int verify_reshape_position(struct mdinfo *info, int level) +{ + int ret_val = 0; + char buf[40]; + int rv; + + /* read sync_max, failure can mean raid0 array */ + rv = sysfs_get_str(info, NULL, "sync_max", buf, 40); + + if (rv > 0) { + char *ep; + unsigned long long position = strtoull(buf, &ep, 0); + + dprintf("Read sync_max sysfs entry is: %s\n", buf); + if (!(ep == buf || (*ep != 0 && *ep != '\n' && *ep != ' '))) { + position *= get_data_disks(level, + info->new_layout, + info->array.raid_disks); + if (info->reshape_progress < position) { + dprintf("Corrected reshape progress (%llu) to md position (%llu)\n", + info->reshape_progress, position); + info->reshape_progress = position; + ret_val = 1; + } else if (info->reshape_progress > position) { + pr_err("Fatal error: array reshape was not properly frozen (expected reshape position is %llu, but reshape progress is %llu.\n", + position, info->reshape_progress); + ret_val = -1; + } else { + dprintf("Reshape position in md and metadata are the same;"); + ret_val = 1; + } + } + } else if (rv == 0) { + /* for valid sysfs entry, 0-length content + * should be indicated as error + */ + ret_val = -1; + } + + return ret_val; +} + +static unsigned long long choose_offset(unsigned long long lo, + unsigned long long hi, + unsigned long long min, + unsigned long long max) +{ + /* Choose a new offset between hi and lo. + * It must be between min and max, but + * we would prefer something near the middle of hi/lo, and also + * prefer to be aligned to a big power of 2. + * + * So we start with the middle, then for each bit, + * starting at '1' and increasing, if it is set, we either + * add it or subtract it if possible, preferring the option + * which is furthest from the boundary. + * + * We stop once we get a 1MB alignment. As units are in sectors, + * 1MB = 2*1024 sectors. + */ + unsigned long long choice = (lo + hi) / 2; + unsigned long long bit = 1; + + for (bit = 1; bit < 2*1024; bit = bit << 1) { + unsigned long long bigger, smaller; + if (! (bit & choice)) + continue; + bigger = choice + bit; + smaller = choice - bit; + if (bigger > max && smaller < min) + break; + if (bigger > max) + choice = smaller; + else if (smaller < min) + choice = bigger; + else if (hi - bigger > smaller - lo) + choice = bigger; + else + choice = smaller; + } + return choice; +} + +static int set_new_data_offset(struct mdinfo *sra, struct supertype *st, + char *devname, int delta_disks, + unsigned long long data_offset, + unsigned long long min, + int can_fallback) +{ + struct mdinfo *sd; + int dir = 0; + int err = 0; + unsigned long long before, after; + + /* Need to find min space before and after so same is used + * on all devices + */ + before = UINT64_MAX; + after = UINT64_MAX; + for (sd = sra->devs; sd; sd = sd->next) { + char *dn; + int dfd; + int rv; + struct supertype *st2; + struct mdinfo info2; + + if (sd->disk.state & (1<<MD_DISK_FAULTY)) + continue; + dn = map_dev(sd->disk.major, sd->disk.minor, 0); + dfd = dev_open(dn, O_RDONLY); + if (dfd < 0) { + pr_err("%s: cannot open component %s\n", + devname, dn ? dn : "-unknown-"); + goto release; + } + st2 = dup_super(st); + rv = st2->ss->load_super(st2,dfd, NULL); + close(dfd); + if (rv) { + free(st2); + pr_err("%s: cannot get superblock from %s\n", + devname, dn); + goto release; + } + st2->ss->getinfo_super(st2, &info2, NULL); + st2->ss->free_super(st2); + free(st2); + if (info2.space_before == 0 && + info2.space_after == 0) { + /* Metadata doesn't support data_offset changes */ + if (!can_fallback) + pr_err("%s: Metadata version doesn't support data_offset changes\n", + devname); + goto fallback; + } + if (before > info2.space_before) + before = info2.space_before; + if (after > info2.space_after) + after = info2.space_after; + + if (data_offset != INVALID_SECTORS) { + if (dir == 0) { + if (info2.data_offset == data_offset) { + pr_err("%s: already has that data_offset\n", + dn); + goto release; + } + if (data_offset < info2.data_offset) + dir = -1; + else + dir = 1; + } else if ((data_offset <= info2.data_offset && dir == 1) || + (data_offset >= info2.data_offset && dir == -1)) { + pr_err("%s: differing data offsets on devices make this --data-offset setting impossible\n", + dn); + goto release; + } + } + } + if (before == UINT64_MAX) + /* impossible really, there must be no devices */ + return 1; + + for (sd = sra->devs; sd; sd = sd->next) { + char *dn = map_dev(sd->disk.major, sd->disk.minor, 0); + unsigned long long new_data_offset; + + if (sd->disk.state & (1<<MD_DISK_FAULTY)) + continue; + if (delta_disks < 0) { + /* Don't need any space as array is shrinking + * just move data_offset up by min + */ + if (data_offset == INVALID_SECTORS) + new_data_offset = sd->data_offset + min; + else { + if (data_offset < sd->data_offset + min) { + pr_err("--data-offset too small for %s\n", + dn); + goto release; + } + new_data_offset = data_offset; + } + } else if (delta_disks > 0) { + /* need space before */ + if (before < min) { + if (can_fallback) + goto fallback; + pr_err("Insufficient head-space for reshape on %s\n", + dn); + goto release; + } + if (data_offset == INVALID_SECTORS) + new_data_offset = sd->data_offset - min; + else { + if (data_offset > sd->data_offset - min) { + pr_err("--data-offset too large for %s\n", + dn); + goto release; + } + new_data_offset = data_offset; + } + } else { + if (dir == 0) { + /* can move up or down. If 'data_offset' + * was set we would have already decided, + * so just choose direction with most space. + */ + if (before > after) + dir = -1; + else + dir = 1; + } + sysfs_set_str(sra, NULL, "reshape_direction", + dir == 1 ? "backwards" : "forwards"); + if (dir > 0) { + /* Increase data offset */ + if (after < min) { + if (can_fallback) + goto fallback; + pr_err("Insufficient tail-space for reshape on %s\n", + dn); + goto release; + } + if (data_offset != INVALID_SECTORS && + data_offset < sd->data_offset + min) { + pr_err("--data-offset too small on %s\n", + dn); + goto release; + } + if (data_offset != INVALID_SECTORS) + new_data_offset = data_offset; + else + new_data_offset = choose_offset(sd->data_offset, + sd->data_offset + after, + sd->data_offset + min, + sd->data_offset + after); + } else { + /* Decrease data offset */ + if (before < min) { + if (can_fallback) + goto fallback; + pr_err("insufficient head-room on %s\n", + dn); + goto release; + } + if (data_offset != INVALID_SECTORS && + data_offset < sd->data_offset - min) { + pr_err("--data-offset too small on %s\n", + dn); + goto release; + } + if (data_offset != INVALID_SECTORS) + new_data_offset = data_offset; + else + new_data_offset = choose_offset(sd->data_offset - before, + sd->data_offset, + sd->data_offset - before, + sd->data_offset - min); + } + } + err = sysfs_set_num(sra, sd, "new_offset", new_data_offset); + if (err < 0 && errno == E2BIG) { + /* try again after increasing data size to max */ + err = sysfs_set_num(sra, sd, "size", 0); + if (err < 0 && errno == EINVAL && + !(sd->disk.state & (1<<MD_DISK_SYNC))) { + /* some kernels have a bug where you cannot + * use '0' on spare devices. */ + sysfs_set_num(sra, sd, "size", + (sra->component_size + after)/2); + } + err = sysfs_set_num(sra, sd, "new_offset", + new_data_offset); + } + if (err < 0) { + if (errno == E2BIG && data_offset != INVALID_SECTORS) { + pr_err("data-offset is too big for %s\n", + dn); + goto release; + } + if (sd == sra->devs && + (errno == ENOENT || errno == E2BIG)) + /* Early kernel, no 'new_offset' file, + * or kernel doesn't like us. + * For RAID5/6 this is not fatal + */ + return 1; + pr_err("Cannot set new_offset for %s\n", + dn); + break; + } + } + return err; +release: + return -1; +fallback: + /* Just use a backup file */ + return 1; +} + +static int raid10_reshape(char *container, int fd, char *devname, + struct supertype *st, struct mdinfo *info, + struct reshape *reshape, + unsigned long long data_offset, + int force, int verbose) +{ + /* Changing raid_disks, layout, chunksize or possibly + * just data_offset for a RAID10. + * We must always change data_offset. We change by at least + * ->min_offset_change which is the largest of the old and new + * chunk sizes. + * If raid_disks is increasing, then data_offset must decrease + * by at least this copy size. + * If raid_disks is unchanged, data_offset must increase or + * decrease by at least min_offset_change but preferably by much more. + * We choose half of the available space. + * If raid_disks is decreasing, data_offset must increase by + * at least min_offset_change. To allow of this, component_size + * must be decreased by the same amount. + * + * So we calculate the required minimum and direction, possibly + * reduce the component_size, then iterate through the devices + * and set the new_data_offset. + * If that all works, we set chunk_size, layout, raid_disks, and start + * 'reshape' + */ + struct mdinfo *sra; + unsigned long long min; + int err = 0; + + sra = sysfs_read(fd, NULL, + GET_COMPONENT|GET_DEVS|GET_OFFSET|GET_STATE|GET_CHUNK + ); + if (!sra) { + pr_err("%s: Cannot get array details from sysfs\n", + devname); + goto release; + } + min = reshape->min_offset_change; + + if (info->delta_disks) + sysfs_set_str(sra, NULL, "reshape_direction", + info->delta_disks < 0 ? "backwards" : "forwards"); + if (info->delta_disks < 0 && + info->space_after < min) { + int rv = sysfs_set_num(sra, NULL, "component_size", + (sra->component_size - + min)/2); + if (rv) { + pr_err("cannot reduce component size\n"); + goto release; + } + } + err = set_new_data_offset(sra, st, devname, info->delta_disks, data_offset, + min, 0); + if (err == 1) { + pr_err("Cannot set new_data_offset: RAID10 reshape not\n"); + cont_err("supported on this kernel\n"); + err = -1; + } + if (err < 0) + goto release; + + if (!err && sysfs_set_num(sra, NULL, "chunk_size", info->new_chunk) < 0) + err = errno; + if (!err && sysfs_set_num(sra, NULL, "layout", reshape->after.layout) < 0) + err = errno; + if (!err && sysfs_set_num(sra, NULL, "raid_disks", + info->array.raid_disks + info->delta_disks) < 0) + err = errno; + if (!err && sysfs_set_str(sra, NULL, "sync_action", "reshape") < 0) + err = errno; + if (err) { + pr_err("Cannot set array shape for %s\n", + devname); + if (err == EBUSY && + (info->array.state & (1<<MD_SB_BITMAP_PRESENT))) + cont_err(" Bitmap must be removed before shape can be changed\n"); + goto release; + } + sysfs_free(sra); + return 0; +release: + sysfs_free(sra); + return 1; +} + +static void get_space_after(int fd, struct supertype *st, struct mdinfo *info) +{ + struct mdinfo *sra, *sd; + /* Initialisation to silence compiler warning */ + unsigned long long min_space_before = 0, min_space_after = 0; + int first = 1; + + sra = sysfs_read(fd, NULL, GET_DEVS); + if (!sra) + return; + for (sd = sra->devs; sd; sd = sd->next) { + char *dn; + int dfd; + struct supertype *st2; + struct mdinfo info2; + + if (sd->disk.state & (1<<MD_DISK_FAULTY)) + continue; + dn = map_dev(sd->disk.major, sd->disk.minor, 0); + dfd = dev_open(dn, O_RDONLY); + if (dfd < 0) + break; + st2 = dup_super(st); + if (st2->ss->load_super(st2,dfd, NULL)) { + close(dfd); + free(st2); + break; + } + close(dfd); + st2->ss->getinfo_super(st2, &info2, NULL); + st2->ss->free_super(st2); + free(st2); + if (first || + min_space_before > info2.space_before) + min_space_before = info2.space_before; + if (first || + min_space_after > info2.space_after) + min_space_after = info2.space_after; + first = 0; + } + if (sd == NULL && !first) { + info->space_after = min_space_after; + info->space_before = min_space_before; + } + sysfs_free(sra); +} + +static void update_cache_size(char *container, struct mdinfo *sra, + struct mdinfo *info, + int disks, unsigned long long blocks) +{ + /* Check that the internal stripe cache is + * large enough, or it won't work. + * It must hold at least 4 stripes of the larger + * chunk size + */ + unsigned long cache; + cache = max(info->array.chunk_size, info->new_chunk); + cache *= 4; /* 4 stripes minimum */ + cache /= 512; /* convert to sectors */ + /* make sure there is room for 'blocks' with a bit to spare */ + if (cache < 16 + blocks / disks) + cache = 16 + blocks / disks; + cache /= (4096/512); /* Convert from sectors to pages */ + + if (sra->cache_size < cache) + subarray_set_num(container, sra, "stripe_cache_size", + cache+1); +} + +static int impose_reshape(struct mdinfo *sra, + struct mdinfo *info, + struct supertype *st, + int fd, + int restart, + char *devname, char *container, + struct reshape *reshape) +{ + struct mdu_array_info_s array; + + sra->new_chunk = info->new_chunk; + + if (restart) { + /* for external metadata checkpoint saved by mdmon can be lost + * or missed /due to e.g. crash/. Check if md is not during + * restart farther than metadata points to. + * If so, this means metadata information is obsolete. + */ + if (st->ss->external) + verify_reshape_position(info, reshape->level); + sra->reshape_progress = info->reshape_progress; + } else { + sra->reshape_progress = 0; + if (reshape->after.data_disks < reshape->before.data_disks) + /* start from the end of the new array */ + sra->reshape_progress = (sra->component_size + * reshape->after.data_disks); + } + + ioctl(fd, GET_ARRAY_INFO, &array); + if (info->array.chunk_size == info->new_chunk && + reshape->before.layout == reshape->after.layout && + st->ss->external == 0) { + /* use SET_ARRAY_INFO but only if reshape hasn't started */ + array.raid_disks = reshape->after.data_disks + reshape->parity; + if (!restart && + ioctl(fd, SET_ARRAY_INFO, &array) != 0) { + int err = errno; + + pr_err("Cannot set device shape for %s: %s\n", + devname, strerror(errno)); + + if (err == EBUSY && + (array.state & (1<<MD_SB_BITMAP_PRESENT))) + cont_err("Bitmap must be removed before shape can be changed\n"); + + goto release; + } + } else if (!restart) { + /* set them all just in case some old 'new_*' value + * persists from some earlier problem. + */ + int err = 0; + if (sysfs_set_num(sra, NULL, "chunk_size", info->new_chunk) < 0) + err = errno; + if (!err && sysfs_set_num(sra, NULL, "layout", + reshape->after.layout) < 0) + err = errno; + if (!err && subarray_set_num(container, sra, "raid_disks", + reshape->after.data_disks + + reshape->parity) < 0) + err = errno; + if (err) { + pr_err("Cannot set device shape for %s\n", + devname); + + if (err == EBUSY && + (array.state & (1<<MD_SB_BITMAP_PRESENT))) + cont_err("Bitmap must be removed before shape can be changed\n"); + goto release; + } + } + return 0; +release: + return -1; +} + +static int impose_level(int fd, int level, char *devname, int verbose) +{ + char *c; + struct mdu_array_info_s array; + struct mdinfo info; + sysfs_init(&info, fd, NULL); + + ioctl(fd, GET_ARRAY_INFO, &array); + if (level == 0 && + (array.level >= 4 && array.level <= 6)) { + /* To convert to RAID0 we need to fail and + * remove any non-data devices. */ + int found = 0; + int d; + int data_disks = array.raid_disks - 1; + if (array.level == 6) + data_disks -= 1; + if (array.level == 5 && + array.layout != ALGORITHM_PARITY_N) + return -1; + if (array.level == 6 && + array.layout != ALGORITHM_PARITY_N_6) + return -1; + sysfs_set_str(&info, NULL,"sync_action", "idle"); + /* First remove any spares so no recovery starts */ + for (d = 0, found = 0; + d < MAX_DISKS && found < array.nr_disks; + d++) { + mdu_disk_info_t disk; + disk.number = d; + if (ioctl(fd, GET_DISK_INFO, &disk) < 0) + continue; + if (disk.major == 0 && disk.minor == 0) + continue; + found++; + if ((disk.state & (1 << MD_DISK_ACTIVE)) + && disk.raid_disk < data_disks) + /* keep this */ + continue; + ioctl(fd, HOT_REMOVE_DISK, + makedev(disk.major, disk.minor)); + } + /* Now fail anything left */ + ioctl(fd, GET_ARRAY_INFO, &array); + for (d = 0, found = 0; + d < MAX_DISKS && found < array.nr_disks; + d++) { + int cnt; + mdu_disk_info_t disk; + disk.number = d; + if (ioctl(fd, GET_DISK_INFO, &disk) < 0) + continue; + if (disk.major == 0 && disk.minor == 0) + continue; + found++; + if ((disk.state & (1 << MD_DISK_ACTIVE)) + && disk.raid_disk < data_disks) + /* keep this */ + continue; + ioctl(fd, SET_DISK_FAULTY, + makedev(disk.major, disk.minor)); + cnt = 5; + while (ioctl(fd, HOT_REMOVE_DISK, + makedev(disk.major, disk.minor)) < 0 + && errno == EBUSY + && cnt--) { + usleep(10000); + } + } + } + c = map_num(pers, level); + if (c) { + int err = sysfs_set_str(&info, NULL, "level", c); + if (err) { + err = errno; + pr_err("%s: could not set level to %s\n", + devname, c); + if (err == EBUSY && + (array.state & (1<<MD_SB_BITMAP_PRESENT))) + cont_err("Bitmap must be removed before level can be changed\n"); + return err; + } + if (verbose >= 0) + pr_err("level of %s changed to %s\n", + devname, c); + } + return 0; +} + +int sigterm = 0; +static void catch_term(int sig) +{ + sigterm = 1; +} + +static int continue_via_systemd(char *devnm) +{ + int skipped, i, pid, status; + char pathbuf[1024]; + /* In a systemd/udev world, it is best to get systemd to + * run "mdadm --grow --continue" rather than running in the + * background. + */ + switch(fork()) { + case 0: + /* FIXME yuk. CLOSE_EXEC?? */ + skipped = 0; + for (i = 3; skipped < 20; i++) + if (close(i) < 0) + skipped++; + else + skipped = 0; + + /* Don't want to see error messages from + * systemctl. If the service doesn't exist, + * we fork ourselves. + */ + close(2); + open("/dev/null", O_WRONLY); + snprintf(pathbuf, sizeof(pathbuf), "mdadm-grow-continue@%s.service", + devnm); + status = execl("/usr/bin/systemctl", "systemctl", + "start", + pathbuf, NULL); + status = execl("/bin/systemctl", "systemctl", "start", + pathbuf, NULL); + exit(1); + case -1: /* Just do it ourselves. */ + break; + default: /* parent - good */ + pid = wait(&status); + if (pid >= 0 && status == 0) + return 1; + } + return 0; +} + +static int reshape_array(char *container, int fd, char *devname, + struct supertype *st, struct mdinfo *info, + int force, struct mddev_dev *devlist, + unsigned long long data_offset, + char *backup_file, int verbose, int forked, + int restart, int freeze_reshape) +{ + struct reshape reshape; + int spares_needed; + char *msg; + int orig_level = UnSet; + int odisks; + int delayed; + + struct mdu_array_info_s array; + char *c; + + struct mddev_dev *dv; + int added_disks; + + int *fdlist = NULL; + unsigned long long *offsets = NULL; + int d; + int nrdisks; + int err; + unsigned long blocks; + unsigned long long array_size; + int done; + struct mdinfo *sra = NULL; + char buf[20]; + + /* when reshaping a RAID0, the component_size might be zero. + * So try to fix that up. + */ + if (ioctl(fd, GET_ARRAY_INFO, &array) != 0) { + dprintf("Cannot get array information.\n"); + goto release; + } + if (array.level == 0 && info->component_size == 0) { + get_dev_size(fd, NULL, &array_size); + info->component_size = array_size / array.raid_disks; + } + + if (array.level == 10) + /* Need space_after info */ + get_space_after(fd, st, info); + + if (info->reshape_active) { + int new_level = info->new_level; + info->new_level = UnSet; + if (info->delta_disks > 0) + info->array.raid_disks -= info->delta_disks; + msg = analyse_change(devname, info, &reshape); + info->new_level = new_level; + if (info->delta_disks > 0) + info->array.raid_disks += info->delta_disks; + if (!restart) + /* Make sure the array isn't read-only */ + ioctl(fd, RESTART_ARRAY_RW, 0); + } else + msg = analyse_change(devname, info, &reshape); + if (msg) { + /* if msg == "", error has already been printed */ + if (msg[0]) + pr_err("%s\n", msg); + goto release; + } + if (restart && + (reshape.level != info->array.level || + reshape.before.layout != info->array.layout || + reshape.before.data_disks + reshape.parity + != info->array.raid_disks - max(0, info->delta_disks))) { + pr_err("reshape info is not in native format - cannot continue.\n"); + goto release; + } + + if (st->ss->external && restart && (info->reshape_progress == 0) && + !((sysfs_get_str(info, NULL, "sync_action", buf, sizeof(buf)) > 0) && + (strncmp(buf, "reshape", 7) == 0))) { + /* When reshape is restarted from '0', very begin of array + * it is possible that for external metadata reshape and array + * configuration doesn't happen. + * Check if md has the same opinion, and reshape is restarted + * from 0. If so, this is regular reshape start after reshape + * switch in metadata to next array only. + */ + if ((verify_reshape_position(info, reshape.level) >= 0) && + (info->reshape_progress == 0)) + restart = 0; + } + if (restart) { + /* reshape already started. just skip to monitoring the reshape */ + if (reshape.backup_blocks == 0) + return 0; + if (restart & RESHAPE_NO_BACKUP) + return 0; + + /* Need 'sra' down at 'started:' */ + sra = sysfs_read(fd, NULL, + GET_COMPONENT|GET_DEVS|GET_OFFSET|GET_STATE|GET_CHUNK| + GET_CACHE); + if (!sra) { + pr_err("%s: Cannot get array details from sysfs\n", + devname); + goto release; + } + + if (!backup_file) + backup_file = locate_backup(sra->sys_name); + + goto started; + } + /* The container is frozen but the array may not be. + * So freeze the array so spares don't get put to the wrong use + * FIXME there should probably be a cleaner separation between + * freeze_array and freeze_container. + */ + sysfs_freeze_array(info); + /* Check we have enough spares to not be degraded */ + added_disks = 0; + for (dv = devlist; dv ; dv=dv->next) + added_disks++; + spares_needed = max(reshape.before.data_disks, + reshape.after.data_disks) + + reshape.parity - array.raid_disks; + + if (!force && + info->new_level > 1 && info->array.level > 1 && + spares_needed > info->array.spare_disks + added_disks) { + pr_err("Need %d spare%s to avoid degraded array, and only have %d.\n" + " Use --force to over-ride this check.\n", + spares_needed, + spares_needed == 1 ? "" : "s", + info->array.spare_disks + added_disks); + goto release; + } + /* Check we have enough spares to not fail */ + spares_needed = max(reshape.before.data_disks, + reshape.after.data_disks) + - array.raid_disks; + if ((info->new_level > 1 || info->new_level == 0) && + spares_needed > info->array.spare_disks +added_disks) { + pr_err("Need %d spare%s to create working array, and only have %d.\n", + spares_needed, + spares_needed == 1 ? "" : "s", + info->array.spare_disks + added_disks); + goto release; + } + + if (reshape.level != array.level) { + int err = impose_level(fd, reshape.level, devname, verbose); + if (err) + goto release; + info->new_layout = UnSet; /* after level change, + * layout is meaningless */ + orig_level = array.level; + sysfs_freeze_array(info); + + if (reshape.level > 0 && st->ss->external) { + /* make sure mdmon is aware of the new level */ + if (mdmon_running(container)) + flush_mdmon(container); + + if (!mdmon_running(container)) + start_mdmon(container); + ping_monitor(container); + if (mdmon_running(container) && + st->update_tail == NULL) + st->update_tail = &st->updates; + } + } + /* ->reshape_super might have chosen some spares from the + * container that it wants to be part of the new array. + * We can collect them with ->container_content and give + * them to the kernel. + */ + if (st->ss->reshape_super && st->ss->container_content) { + char *subarray = strchr(info->text_version+1, '/')+1; + struct mdinfo *info2 = + st->ss->container_content(st, subarray); + struct mdinfo *d; + + if (info2) { + sysfs_init(info2, fd, st->devnm); + /* When increasing number of devices, we need to set + * new raid_disks before adding these, or they might + * be rejected. + */ + if (reshape.backup_blocks && + reshape.after.data_disks > reshape.before.data_disks) + subarray_set_num(container, info2, "raid_disks", + reshape.after.data_disks + + reshape.parity); + for (d = info2->devs; d; d = d->next) { + if (d->disk.state == 0 && + d->disk.raid_disk >= 0) { + /* This is a spare that wants to + * be part of the array. + */ + add_disk(fd, st, info2, d); + } + } + sysfs_free(info2); + } + } + /* We might have been given some devices to add to the + * array. Now that the array has been changed to the right + * level and frozen, we can safely add them. + */ + if (devlist) + Manage_subdevs(devname, fd, devlist, verbose, + 0,NULL, 0); + + if (reshape.backup_blocks == 0 && data_offset != INVALID_SECTORS) + reshape.backup_blocks = reshape.before.data_disks * info->array.chunk_size/512; + if (reshape.backup_blocks == 0) { + /* No restriping needed, but we might need to impose + * some more changes: layout, raid_disks, chunk_size + */ + /* read current array info */ + if (ioctl(fd, GET_ARRAY_INFO, &array) != 0) { + dprintf("Cannot get array information.\n"); + goto release; + } + /* compare current array info with new values and if + * it is different update them to new */ + if (info->new_layout != UnSet && + info->new_layout != array.layout) { + array.layout = info->new_layout; + if (ioctl(fd, SET_ARRAY_INFO, &array) != 0) { + pr_err("failed to set new layout\n"); + goto release; + } else if (verbose >= 0) + printf("layout for %s set to %d\n", + devname, array.layout); + } + if (info->delta_disks != UnSet && + info->delta_disks != 0 && + array.raid_disks != (info->array.raid_disks + info->delta_disks)) { + array.raid_disks += info->delta_disks; + if (ioctl(fd, SET_ARRAY_INFO, &array) != 0) { + pr_err("failed to set raid disks\n"); + goto release; + } else if (verbose >= 0) { + printf("raid_disks for %s set to %d\n", + devname, array.raid_disks); + } + } + if (info->new_chunk != 0 && + info->new_chunk != array.chunk_size) { + if (sysfs_set_num(info, NULL, + "chunk_size", info->new_chunk) != 0) { + pr_err("failed to set chunk size\n"); + goto release; + } else if (verbose >= 0) + printf("chunk size for %s set to %d\n", + devname, array.chunk_size); + } + unfreeze(st); + return 0; + } + + /* + * There are three possibilities. + * 1/ The array will shrink. + * We need to ensure the reshape will pause before reaching + * the 'critical section'. We also need to fork and wait for + * that to happen. When it does we + * suspend/backup/complete/unfreeze + * + * 2/ The array will not change size. + * This requires that we keep a backup of a sliding window + * so that we can restore data after a crash. So we need + * to fork and monitor progress. + * In future we will allow the data_offset to change, so + * a sliding backup becomes unnecessary. + * + * 3/ The array will grow. This is relatively easy. + * However the kernel's restripe routines will cheerfully + * overwrite some early data before it is safe. So we + * need to make a backup of the early parts of the array + * and be ready to restore it if rebuild aborts very early. + * For externally managed metadata, we still need a forked + * child to monitor the reshape and suspend IO over the region + * that is being reshaped. + * + * We backup data by writing it to one spare, or to a + * file which was given on command line. + * + * In each case, we first make sure that storage is available + * for the required backup. + * Then we: + * - request the shape change. + * - fork to handle backup etc. + */ + /* Check that we can hold all the data */ + get_dev_size(fd, NULL, &array_size); + if (reshape.new_size < (array_size/512)) { + pr_err("this change will reduce the size of the array.\n" + " use --grow --array-size first to truncate array.\n" + " e.g. mdadm --grow %s --array-size %llu\n", + devname, reshape.new_size/2); + goto release; + } + + if (array.level == 10) { + /* Reshaping RAID10 does not require any data backup by + * user-space. Instead it requires that the data_offset + * is changed to avoid the need for backup. + * So this is handled very separately + */ + if (restart) + /* Nothing to do. */ + return 0; + return raid10_reshape(container, fd, devname, st, info, + &reshape, data_offset, + force, verbose); + } + sra = sysfs_read(fd, NULL, + GET_COMPONENT|GET_DEVS|GET_OFFSET|GET_STATE|GET_CHUNK| + GET_CACHE); + if (!sra) { + pr_err("%s: Cannot get array details from sysfs\n", + devname); + goto release; + } + + if (!backup_file) + switch(set_new_data_offset(sra, st, devname, + reshape.after.data_disks - reshape.before.data_disks, + data_offset, + reshape.min_offset_change, 1)) { + case -1: + goto release; + case 0: + /* Updated data_offset, so it's easy now */ + update_cache_size(container, sra, info, + min(reshape.before.data_disks, + reshape.after.data_disks), + reshape.backup_blocks); + + /* Right, everything seems fine. Let's kick things off. + */ + sync_metadata(st); + + if (impose_reshape(sra, info, st, fd, restart, + devname, container, &reshape) < 0) + goto release; + if (sysfs_set_str(sra, NULL, "sync_action", "reshape") < 0) { + struct mdinfo *sd; + if (errno != EINVAL) { + pr_err("Failed to initiate reshape!\n"); + goto release; + } + /* revert data_offset and try the old way */ + for (sd = sra->devs; sd; sd = sd->next) { + sysfs_set_num(sra, sd, "new_offset", + sd->data_offset); + sysfs_set_str(sra, NULL, "reshape_direction", + "forwards"); + } + break; + } + if (info->new_level == reshape.level) + return 0; + /* need to adjust level when reshape completes */ + switch(fork()) { + case -1: /* ignore error, but don't wait */ + return 0; + default: /* parent */ + return 0; + case 0: + map_fork(); + break; + } + close(fd); + wait_reshape(sra); + fd = open_dev(sra->sys_name); + if (fd >= 0) + impose_level(fd, info->new_level, devname, verbose); + return 0; + case 1: /* Couldn't set data_offset, try the old way */ + if (data_offset != INVALID_SECTORS) { + pr_err("Cannot update data_offset on this array\n"); + goto release; + } + break; + } + +started: + /* Decide how many blocks (sectors) for a reshape + * unit. The number we have so far is just a minimum + */ + blocks = reshape.backup_blocks; + if (reshape.before.data_disks == + reshape.after.data_disks) { + /* Make 'blocks' bigger for better throughput, but + * not so big that we reject it below. + * Try for 16 megabytes + */ + while (blocks * 32 < sra->component_size && + blocks < 16*1024*2) + blocks *= 2; + } else + pr_err("Need to backup %luK of critical section..\n", blocks/2); + + if (blocks >= sra->component_size/2) { + pr_err("%s: Something wrong - reshape aborted\n", + devname); + goto release; + } + + /* Now we need to open all these devices so we can read/write. + */ + nrdisks = max(reshape.before.data_disks, + reshape.after.data_disks) + reshape.parity + + sra->array.spare_disks; + fdlist = xcalloc((1+nrdisks), sizeof(int)); + offsets = xcalloc((1+nrdisks), sizeof(offsets[0])); + + odisks = reshape.before.data_disks + reshape.parity; + d = reshape_prepare_fdlist(devname, sra, odisks, + nrdisks, blocks, backup_file, + fdlist, offsets); + if (d < odisks) { + goto release; + } + if ((st->ss->manage_reshape == NULL) || + (st->ss->recover_backup == NULL)) { + if (backup_file == NULL) { + if (reshape.after.data_disks <= + reshape.before.data_disks) { + pr_err("%s: Cannot grow - need backup-file\n", + devname); + pr_err(" Please provide one with \"--backup=...\"\n"); + goto release; + } else if (d == odisks) { + pr_err("%s: Cannot grow - need a spare or backup-file to backup critical section\n", devname); + goto release; + } + } else { + if (!reshape_open_backup_file(backup_file, fd, devname, + (signed)blocks, + fdlist+d, offsets+d, + sra->sys_name, + restart)) { + goto release; + } + d++; + } + } + + update_cache_size(container, sra, info, + min(reshape.before.data_disks, reshape.after.data_disks), + blocks); + + /* Right, everything seems fine. Let's kick things off. + * If only changing raid_disks, use ioctl, else use + * sysfs. + */ + sync_metadata(st); + + if (impose_reshape(sra, info, st, fd, restart, + devname, container, &reshape) < 0) + goto release; + + err = start_reshape(sra, restart, reshape.before.data_disks, + reshape.after.data_disks); + if (err) { + pr_err("Cannot %s reshape for %s\n", + restart ? "continue" : "start", + devname); + goto release; + } + if (restart) + sysfs_set_str(sra, NULL, "array_state", "active"); + if (freeze_reshape) { + free(fdlist); + free(offsets); + sysfs_free(sra); + pr_err("Reshape has to be continued from location %llu when root filesystem has been mounted.\n", + sra->reshape_progress); + return 1; + } + + if (!forked && !check_env("MDADM_NO_SYSTEMCTL")) + if (continue_via_systemd(container ?: sra->sys_name)) { + free(fdlist); + free(offsets); + sysfs_free(sra); + return 0; + } + + /* Now we just need to kick off the reshape and watch, while + * handling backups of the data... + * This is all done by a forked background process. + */ + switch(forked ? 0 : fork()) { + case -1: + pr_err("Cannot run child to monitor reshape: %s\n", + strerror(errno)); + abort_reshape(sra); + goto release; + default: + free(fdlist); + free(offsets); + sysfs_free(sra); + return 0; + case 0: + map_fork(); + break; + } + + /* If another array on the same devices is busy, the + * reshape will wait for them. This would mean that + * the first section that we suspend will stay suspended + * for a long time. So check on that possibility + * by looking for "DELAYED" in /proc/mdstat, and if found, + * wait a while + */ + do { + struct mdstat_ent *mds, *m; + delayed = 0; + mds = mdstat_read(1, 0); + for (m = mds; m; m = m->next) + if (strcmp(m->devnm, sra->sys_name) == 0) { + if (m->resync && + m->percent == RESYNC_DELAYED) + delayed = 1; + if (m->resync == 0) + /* Haven't started the reshape thread + * yet, wait a bit + */ + delayed = 2; + break; + } + free_mdstat(mds); + if (delayed == 1 && get_linux_version() < 3007000) { + pr_err("Reshape is delayed, but cannot wait carefully with this kernel.\n" + " You might experience problems until other reshapes complete.\n"); + delayed = 0; + } + if (delayed) + mdstat_wait(30 - (delayed-1) * 25); + } while (delayed); + mdstat_close(); + close(fd); + if (check_env("MDADM_GROW_VERIFY")) + fd = open(devname, O_RDONLY | O_DIRECT); + else + fd = -1; + mlockall(MCL_FUTURE); + + signal(SIGTERM, catch_term); + + if (st->ss->external) { + /* metadata handler takes it from here */ + done = st->ss->manage_reshape( + fd, sra, &reshape, st, blocks, + fdlist, offsets, + d - odisks, fdlist+odisks, + offsets+odisks); + } else + done = child_monitor( + fd, sra, &reshape, st, blocks, + fdlist, offsets, + d - odisks, fdlist+odisks, + offsets+odisks); + + free(fdlist); + free(offsets); + + if (backup_file && done) { + char *bul; + bul = make_backup(sra->sys_name); + if (bul) { + char buf[1024]; + int l = readlink(bul, buf, sizeof(buf) - 1); + if (l > 0) { + buf[l]=0; + unlink(buf); + } + unlink(bul); + free(bul); + } + unlink(backup_file); + } + if (!done) { + abort_reshape(sra); + goto out; + } + + if (!st->ss->external && + !(reshape.before.data_disks != reshape.after.data_disks + && info->custom_array_size) && + info->new_level == reshape.level && + !forked) { + /* no need to wait for the reshape to finish as + * there is nothing more to do. + */ + sysfs_free(sra); + exit(0); + } + wait_reshape(sra); + + if (st->ss->external) { + /* Re-load the metadata as much could have changed */ + int cfd = open_dev(st->container_devnm); + if (cfd >= 0) { + flush_mdmon(container); + st->ss->free_super(st); + st->ss->load_container(st, cfd, container); + close(cfd); + } + } + + /* set new array size if required customer_array_size is used + * by this metadata. + */ + if (reshape.before.data_disks != + reshape.after.data_disks && + info->custom_array_size) + set_array_size(st, info, info->text_version); + + if (info->new_level != reshape.level) { + if (fd < 0) + fd = open(devname, O_RDONLY); + impose_level(fd, info->new_level, devname, verbose); + close(fd); + if (info->new_level == 0) + st->update_tail = NULL; + } +out: + sysfs_free(sra); + if (forked) + return 0; + unfreeze(st); + exit(0); + +release: + free(fdlist); + free(offsets); + if (orig_level != UnSet && sra) { + c = map_num(pers, orig_level); + if (c && sysfs_set_str(sra, NULL, "level", c) == 0) + pr_err("aborting level change\n"); + } + sysfs_free(sra); + if (!forked) + unfreeze(st); + return 1; +} + +/* mdfd handle is passed to be closed in child process (after fork). + */ +int reshape_container(char *container, char *devname, + int mdfd, + struct supertype *st, + struct mdinfo *info, + int force, + char *backup_file, int verbose, + int forked, int restart, int freeze_reshape) +{ + struct mdinfo *cc = NULL; + int rv = restart; + char last_devnm[32] = ""; + + /* component_size is not meaningful for a container, + * so pass '0' meaning 'no change' + */ + if (!restart && + reshape_super(st, 0, info->new_level, + info->new_layout, info->new_chunk, + info->array.raid_disks, info->delta_disks, + backup_file, devname, APPLY_METADATA_CHANGES, + verbose)) { + unfreeze(st); + return 1; + } + + sync_metadata(st); + + /* ping monitor to be sure that update is on disk + */ + ping_monitor(container); + + if (!forked && !freeze_reshape && !check_env("MDADM_NO_SYSTEMCTL")) + if (continue_via_systemd(container)) + return 0; + + switch (forked ? 0 : fork()) { + case -1: /* error */ + perror("Cannot fork to complete reshape\n"); + unfreeze(st); + return 1; + default: /* parent */ + if (!freeze_reshape) + printf("%s: multi-array reshape continues in background\n", Name); + return 0; + case 0: /* child */ + map_fork(); + break; + } + + /* close unused handle in child process + */ + if (mdfd > -1) + close(mdfd); + + while(1) { + /* For each member array with reshape_active, + * we need to perform the reshape. + * We pick the first array that needs reshaping and + * reshape it. reshape_array() will re-read the metadata + * so the next time through a different array should be + * ready for reshape. + * It is possible that the 'different' array will not + * be assembled yet. In that case we simple exit. + * When it is assembled, the mdadm which assembles it + * will take over the reshape. + */ + struct mdinfo *content; + int fd; + struct mdstat_ent *mdstat; + char *adev; + int devid; + + sysfs_free(cc); + + cc = st->ss->container_content(st, NULL); + + for (content = cc; content ; content = content->next) { + char *subarray; + if (!content->reshape_active) + continue; + + subarray = strchr(content->text_version+1, '/')+1; + mdstat = mdstat_by_subdev(subarray, container); + if (!mdstat) + continue; + if (mdstat->active == 0) { + pr_err("Skipping inactive array %s.\n", + mdstat->devnm); + free_mdstat(mdstat); + mdstat = NULL; + continue; + } + break; + } + if (!content) + break; + + devid = devnm2devid(mdstat->devnm); + adev = map_dev(major(devid), minor(devid), 0); + if (!adev) + adev = content->text_version; + + fd = open_dev(mdstat->devnm); + if (fd < 0) { + pr_err("Device %s cannot be opened for reshape.\n", adev); + break; + } + + if (strcmp(last_devnm, mdstat->devnm) == 0) { + /* Do not allow for multiple reshape_array() calls for + * the same array. + * It can happen when reshape_array() returns without + * error, when reshape is not finished (wrong reshape + * starting/continuation conditions). Mdmon doesn't + * switch to next array in container and reentry + * conditions for the same array occur. + * This is possibly interim until the behaviour of + * reshape_array is resolved(). + */ + printf("%s: Multiple reshape execution detected for device %s.\n", Name, adev); + close(fd); + break; + } + strcpy(last_devnm, mdstat->devnm); + + sysfs_init(content, fd, mdstat->devnm); + + if (mdmon_running(container)) + flush_mdmon(container); + + rv = reshape_array(container, fd, adev, st, + content, force, NULL, INVALID_SECTORS, + backup_file, verbose, 1, restart, + freeze_reshape); + close(fd); + + if (freeze_reshape) { + sysfs_free(cc); + exit(0); + } + + restart = 0; + if (rv) + break; + + if (mdmon_running(container)) + flush_mdmon(container); + } + if (!rv) + unfreeze(st); + sysfs_free(cc); + exit(0); +} + +/* + * We run a child process in the background which performs the following + * steps: + * - wait for resync to reach a certain point + * - suspend io to the following section + * - backup that section + * - allow resync to proceed further + * - resume io + * - discard the backup. + * + * When are combined in slightly different ways in the three cases. + * Grow: + * - suspend/backup/allow/wait/resume/discard + * Shrink: + * - allow/wait/suspend/backup/allow/wait/resume/discard + * same-size: + * - wait/resume/discard/suspend/backup/allow + * + * suspend/backup/allow always come together + * wait/resume/discard do too. + * For the same-size case we have two backups to improve flow. + * + */ + +int progress_reshape(struct mdinfo *info, struct reshape *reshape, + unsigned long long backup_point, + unsigned long long wait_point, + unsigned long long *suspend_point, + unsigned long long *reshape_completed, int *frozen) +{ + /* This function is called repeatedly by the reshape manager. + * It determines how much progress can safely be made and allows + * that progress. + * - 'info' identifies the array and particularly records in + * ->reshape_progress the metadata's knowledge of progress + * This is a sector offset from the start of the array + * of the next array block to be relocated. This number + * may increase from 0 or decrease from array_size, depending + * on the type of reshape that is happening. + * Note that in contrast, 'sync_completed' is a block count of the + * reshape so far. It gives the distance between the start point + * (head or tail of device) and the next place that data will be + * written. It always increases. + * - 'reshape' is the structure created by analyse_change + * - 'backup_point' shows how much the metadata manager has backed-up + * data. For reshapes with increasing progress, it is the next address + * to be backed up, previous addresses have been backed-up. For + * decreasing progress, it is the earliest address that has been + * backed up - later address are also backed up. + * So addresses between reshape_progress and backup_point are + * backed up providing those are in the 'correct' order. + * - 'wait_point' is an array address. When reshape_completed + * passes this point, progress_reshape should return. It might + * return earlier if it determines that ->reshape_progress needs + * to be updated or further backup is needed. + * - suspend_point is maintained by progress_reshape and the caller + * should not touch it except to initialise to zero. + * It is an array address and it only increases in 2.6.37 and earlier. + * This makes it difficult to handle reducing reshapes with + * external metadata. + * However: it is similar to backup_point in that it records the + * other end of a suspended region from reshape_progress. + * it is moved to extend the region that is safe to backup and/or + * reshape + * - reshape_completed is read from sysfs and returned. The caller + * should copy this into ->reshape_progress when it has reason to + * believe that the metadata knows this, and any backup outside this + * has been erased. + * + * Return value is: + * 1 if more data from backup_point - but only as far as suspend_point, + * should be backed up + * 0 if things are progressing smoothly + * -1 if the reshape is finished because it is all done, + * -2 if the reshape is finished due to an error. + */ + + int advancing = (reshape->after.data_disks + >= reshape->before.data_disks); + unsigned long long need_backup; /* All data between start of array and + * here will at some point need to + * be backed up. + */ + unsigned long long read_offset, write_offset; + unsigned long long write_range; + unsigned long long max_progress, target, completed; + unsigned long long array_size = (info->component_size + * reshape->before.data_disks); + int fd; + char buf[20]; + + /* First, we unsuspend any region that is now known to be safe. + * If suspend_point is on the 'wrong' side of reshape_progress, then + * we don't have or need suspension at the moment. This is true for + * native metadata when we don't need to back-up. + */ + if (advancing) { + if (info->reshape_progress <= *suspend_point) + sysfs_set_num(info, NULL, "suspend_lo", + info->reshape_progress); + } else { + /* Note: this won't work in 2.6.37 and before. + * Something somewhere should make sure we don't need it! + */ + if (info->reshape_progress >= *suspend_point) + sysfs_set_num(info, NULL, "suspend_hi", + info->reshape_progress); + } + + /* Now work out how far it is safe to progress. + * If the read_offset for ->reshape_progress is less than + * 'blocks' beyond the write_offset, we can only progress as far + * as a backup. + * Otherwise we can progress until the write_offset for the new location + * reaches (within 'blocks' of) the read_offset at the current location. + * However that region must be suspended unless we are using native + * metadata. + * If we need to suspend more, we limit it to 128M per device, which is + * rather arbitrary and should be some time-based calculation. + */ + read_offset = info->reshape_progress / reshape->before.data_disks; + write_offset = info->reshape_progress / reshape->after.data_disks; + write_range = info->new_chunk/512; + if (reshape->before.data_disks == reshape->after.data_disks) + need_backup = array_size; + else + need_backup = reshape->backup_blocks; + if (advancing) { + if (read_offset < write_offset + write_range) + max_progress = backup_point; + else + max_progress = + read_offset * + reshape->after.data_disks; + } else { + if (read_offset > write_offset - write_range) + /* Can only progress as far as has been backed up, + * which must be suspended */ + max_progress = backup_point; + else if (info->reshape_progress <= need_backup) + max_progress = backup_point; + else { + if (info->array.major_version >= 0) + /* Can progress until backup is needed */ + max_progress = need_backup; + else { + /* Can progress until metadata update is required */ + max_progress = + read_offset * + reshape->after.data_disks; + /* but data must be suspended */ + if (max_progress < *suspend_point) + max_progress = *suspend_point; + } + } + } + + /* We know it is safe to progress to 'max_progress' providing + * it is suspended or we are using native metadata. + * Consider extending suspend_point 128M per device if it + * is less than 64M per device beyond reshape_progress. + * But always do a multiple of 'blocks' + * FIXME this is too big - it takes to long to complete + * this much. + */ + target = 64*1024*2 * min(reshape->before.data_disks, + reshape->after.data_disks); + target /= reshape->backup_blocks; + if (target < 2) + target = 2; + target *= reshape->backup_blocks; + + /* For externally managed metadata we always need to suspend IO to + * the area being reshaped so we regularly push suspend_point forward. + * For native metadata we only need the suspend if we are going to do + * a backup. + */ + if (advancing) { + if ((need_backup > info->reshape_progress + || info->array.major_version < 0) && + *suspend_point < info->reshape_progress + target) { + if (need_backup < *suspend_point + 2 * target) + *suspend_point = need_backup; + else if (*suspend_point + 2 * target < array_size) + *suspend_point += 2 * target; + else + *suspend_point = array_size; + sysfs_set_num(info, NULL, "suspend_hi", *suspend_point); + if (max_progress > *suspend_point) + max_progress = *suspend_point; + } + } else { + if (info->array.major_version >= 0) { + /* Only need to suspend when about to backup */ + if (info->reshape_progress < need_backup * 2 && + *suspend_point > 0) { + *suspend_point = 0; + sysfs_set_num(info, NULL, "suspend_lo", 0); + sysfs_set_num(info, NULL, "suspend_hi", need_backup); + } + } else { + /* Need to suspend continually */ + if (info->reshape_progress < *suspend_point) + *suspend_point = info->reshape_progress; + if (*suspend_point + target < info->reshape_progress) + /* No need to move suspend region yet */; + else { + if (*suspend_point >= 2 * target) + *suspend_point -= 2 * target; + else + *suspend_point = 0; + sysfs_set_num(info, NULL, "suspend_lo", + *suspend_point); + } + if (max_progress < *suspend_point) + max_progress = *suspend_point; + } + } + + /* now set sync_max to allow that progress. sync_max, like + * sync_completed is a count of sectors written per device, so + * we find the difference between max_progress and the start point, + * and divide that by after.data_disks to get a sync_max + * number. + * At the same time we convert wait_point to a similar number + * for comparing against sync_completed. + */ + /* scale down max_progress to per_disk */ + max_progress /= reshape->after.data_disks; + /* Round to chunk size as some kernels give an erroneously high number */ + max_progress /= info->new_chunk/512; + max_progress *= info->new_chunk/512; + /* And round to old chunk size as the kernel wants that */ + max_progress /= info->array.chunk_size/512; + max_progress *= info->array.chunk_size/512; + /* Limit progress to the whole device */ + if (max_progress > info->component_size) + max_progress = info->component_size; + wait_point /= reshape->after.data_disks; + if (!advancing) { + /* switch from 'device offset' to 'processed block count' */ + max_progress = info->component_size - max_progress; + wait_point = info->component_size - wait_point; + } + + if (!*frozen) + sysfs_set_num(info, NULL, "sync_max", max_progress); + + /* Now wait. If we have already reached the point that we were + * asked to wait to, don't wait at all, else wait for any change. + * We need to select on 'sync_completed' as that is the place that + * notifications happen, but we are really interested in + * 'reshape_position' + */ + fd = sysfs_get_fd(info, NULL, "sync_completed"); + if (fd < 0) + goto check_progress; + + if (sysfs_fd_get_ll(fd, &completed) < 0) + goto check_progress; + + while (completed < max_progress && completed < wait_point) { + /* Check that sync_action is still 'reshape' to avoid + * waiting forever on a dead array + */ + char action[20]; + if (sysfs_get_str(info, NULL, "sync_action", + action, 20) <= 0 || + strncmp(action, "reshape", 7) != 0) + break; + /* Some kernels reset 'sync_completed' to zero + * before setting 'sync_action' to 'idle'. + * So we need these extra tests. + */ + if (completed == 0 && advancing + && strncmp(action, "idle", 4) == 0 + && info->reshape_progress > 0) + break; + if (completed == 0 && !advancing + && strncmp(action, "idle", 4) == 0 + && info->reshape_progress < (info->component_size + * reshape->after.data_disks)) + break; + sysfs_wait(fd, NULL); + if (sysfs_fd_get_ll(fd, &completed) < 0) + goto check_progress; + } + /* Some kernels reset 'sync_completed' to zero, + * we need to have real point we are in md. + * So in that case, read 'reshape_position' from sysfs. + */ + if (completed == 0) { + unsigned long long reshapep; + char action[20]; + if (sysfs_get_str(info, NULL, "sync_action", + action, 20) > 0 && + strncmp(action, "idle", 4) == 0 && + sysfs_get_ll(info, NULL, + "reshape_position", &reshapep) == 0) + *reshape_completed = reshapep; + } else { + /* some kernels can give an incorrectly high + * 'completed' number, so round down */ + completed /= (info->new_chunk/512); + completed *= (info->new_chunk/512); + /* Convert 'completed' back in to a 'progress' number */ + completed *= reshape->after.data_disks; + if (!advancing) + completed = (info->component_size + * reshape->after.data_disks + - completed); + *reshape_completed = completed; + } + + close(fd); + + /* We return the need_backup flag. Caller will decide + * how much - a multiple of ->backup_blocks up to *suspend_point + */ + if (advancing) + return need_backup > info->reshape_progress; + else + return need_backup >= info->reshape_progress; + +check_progress: + /* if we couldn't read a number from sync_completed, then + * either the reshape did complete, or it aborted. + * We can tell which by checking for 'none' in reshape_position. + * If it did abort, then it might immediately restart if it + * it was just a device failure that leaves us degraded but + * functioning. + */ + if (sysfs_get_str(info, NULL, "reshape_position", buf, sizeof(buf)) < 0 + || strncmp(buf, "none", 4) != 0) { + /* The abort might only be temporary. Wait up to 10 + * seconds for fd to contain a valid number again. + */ + int wait = 10000; + int rv = -2; + unsigned long long new_sync_max; + while (fd >= 0 && rv < 0 && wait > 0) { + if (sysfs_wait(fd, &wait) != 1) + break; + switch (sysfs_fd_get_ll(fd, &completed)) { + case 0: + /* all good again */ + rv = 1; + /* If "sync_max" is no longer max_progress + * we need to freeze things + */ + sysfs_get_ll(info, NULL, "sync_max", &new_sync_max); + *frozen = (new_sync_max != max_progress); + break; + case -2: /* read error - abort */ + wait = 0; + break; + } + } + if (fd >= 0) + close(fd); + return rv; /* abort */ + } else { + /* Maybe racing with array shutdown - check state */ + if (fd >= 0) + close(fd); + if (sysfs_get_str(info, NULL, "array_state", buf, sizeof(buf)) < 0 + || strncmp(buf, "inactive", 8) == 0 + || strncmp(buf, "clear",5) == 0) + return -2; /* abort */ + return -1; /* complete */ + } +} + +/* FIXME return status is never checked */ +static int grow_backup(struct mdinfo *sra, + unsigned long long offset, /* per device */ + unsigned long stripes, /* per device, in old chunks */ + int *sources, unsigned long long *offsets, + int disks, int chunk, int level, int layout, + int dests, int *destfd, unsigned long long *destoffsets, + int part, int *degraded, + char *buf) +{ + /* Backup 'blocks' sectors at 'offset' on each device of the array, + * to storage 'destfd' (offset 'destoffsets'), after first + * suspending IO. Then allow resync to continue + * over the suspended section. + * Use part 'part' of the backup-super-block. + */ + int odata = disks; + int rv = 0; + int i; + unsigned long long ll; + int new_degraded; + //printf("offset %llu\n", offset); + if (level >= 4) + odata--; + if (level == 6) + odata--; + + /* Check that array hasn't become degraded, else we might backup the wrong data */ + if (sysfs_get_ll(sra, NULL, "degraded", &ll) < 0) + return -1; /* FIXME this error is ignored */ + new_degraded = (int)ll; + if (new_degraded != *degraded) { + /* check each device to ensure it is still working */ + struct mdinfo *sd; + for (sd = sra->devs ; sd ; sd = sd->next) { + if (sd->disk.state & (1<<MD_DISK_FAULTY)) + continue; + if (sd->disk.state & (1<<MD_DISK_SYNC)) { + char sbuf[20]; + if (sysfs_get_str(sra, sd, "state", sbuf, 20) < 0 || + strstr(sbuf, "faulty") || + strstr(sbuf, "in_sync") == NULL) { + /* this device is dead */ + sd->disk.state = (1<<MD_DISK_FAULTY); + if (sd->disk.raid_disk >= 0 && + sources[sd->disk.raid_disk] >= 0) { + close(sources[sd->disk.raid_disk]); + sources[sd->disk.raid_disk] = -1; + } + } + } + } + *degraded = new_degraded; + } + if (part) { + bsb.arraystart2 = __cpu_to_le64(offset * odata); + bsb.length2 = __cpu_to_le64(stripes * (chunk/512) * odata); + } else { + bsb.arraystart = __cpu_to_le64(offset * odata); + bsb.length = __cpu_to_le64(stripes * (chunk/512) * odata); + } + if (part) + bsb.magic[15] = '2'; + for (i = 0; i < dests; i++) + if (part) + lseek64(destfd[i], destoffsets[i] + __le64_to_cpu(bsb.devstart2)*512, 0); + else + lseek64(destfd[i], destoffsets[i], 0); + + rv = save_stripes(sources, offsets, + disks, chunk, level, layout, + dests, destfd, + offset*512*odata, stripes * chunk * odata, + buf); + + if (rv) + return rv; + bsb.mtime = __cpu_to_le64(time(0)); + for (i = 0; i < dests; i++) { + bsb.devstart = __cpu_to_le64(destoffsets[i]/512); + + bsb.sb_csum = bsb_csum((char*)&bsb, ((char*)&bsb.sb_csum)-((char*)&bsb)); + if (memcmp(bsb.magic, "md_backup_data-2", 16) == 0) + bsb.sb_csum2 = bsb_csum((char*)&bsb, + ((char*)&bsb.sb_csum2)-((char*)&bsb)); + + rv = -1; + if ((unsigned long long)lseek64(destfd[i], destoffsets[i] - 4096, 0) + != destoffsets[i] - 4096) + break; + if (write(destfd[i], &bsb, 512) != 512) + break; + if (destoffsets[i] > 4096) { + if ((unsigned long long)lseek64(destfd[i], destoffsets[i]+stripes*chunk*odata, 0) != + destoffsets[i]+stripes*chunk*odata) + break; + if (write(destfd[i], &bsb, 512) != 512) + break; + } + fsync(destfd[i]); + rv = 0; + } + + return rv; +} + +/* in 2.6.30, the value reported by sync_completed can be + * less that it should be by one stripe. + * This only happens when reshape hits sync_max and pauses. + * So allow wait_backup to either extent sync_max further + * than strictly necessary, or return before the + * sync has got quite as far as we would really like. + * This is what 'blocks2' is for. + * The various caller give appropriate values so that + * every works. + */ +/* FIXME return value is often ignored */ +static int forget_backup(int dests, int *destfd, + unsigned long long *destoffsets, + int part) +{ + /* + * Erase backup 'part' (which is 0 or 1) + */ + int i; + int rv; + + if (part) { + bsb.arraystart2 = __cpu_to_le64(0); + bsb.length2 = __cpu_to_le64(0); + } else { + bsb.arraystart = __cpu_to_le64(0); + bsb.length = __cpu_to_le64(0); + } + bsb.mtime = __cpu_to_le64(time(0)); + rv = 0; + for (i = 0; i < dests; i++) { + bsb.devstart = __cpu_to_le64(destoffsets[i]/512); + bsb.sb_csum = bsb_csum((char*)&bsb, ((char*)&bsb.sb_csum)-((char*)&bsb)); + if (memcmp(bsb.magic, "md_backup_data-2", 16) == 0) + bsb.sb_csum2 = bsb_csum((char*)&bsb, + ((char*)&bsb.sb_csum2)-((char*)&bsb)); + if ((unsigned long long)lseek64(destfd[i], destoffsets[i]-4096, 0) != + destoffsets[i]-4096) + rv = -1; + if (rv == 0 && + write(destfd[i], &bsb, 512) != 512) + rv = -1; + fsync(destfd[i]); + } + return rv; +} + +static void fail(char *msg) +{ + int rv; + rv = (write(2, msg, strlen(msg)) != (int)strlen(msg)); + rv |= (write(2, "\n", 1) != 1); + exit(rv ? 1 : 2); +} + +static char *abuf, *bbuf; +static unsigned long long abuflen; +static void validate(int afd, int bfd, unsigned long long offset) +{ + /* check that the data in the backup against the array. + * This is only used for regression testing and should not + * be used while the array is active + */ + if (afd < 0) + return; + lseek64(bfd, offset - 4096, 0); + if (read(bfd, &bsb2, 512) != 512) + fail("cannot read bsb"); + if (bsb2.sb_csum != bsb_csum((char*)&bsb2, + ((char*)&bsb2.sb_csum)-((char*)&bsb2))) + fail("first csum bad"); + if (memcmp(bsb2.magic, "md_backup_data", 14) != 0) + fail("magic is bad"); + if (memcmp(bsb2.magic, "md_backup_data-2", 16) == 0 && + bsb2.sb_csum2 != bsb_csum((char*)&bsb2, + ((char*)&bsb2.sb_csum2)-((char*)&bsb2))) + fail("second csum bad"); + + if (__le64_to_cpu(bsb2.devstart)*512 != offset) + fail("devstart is wrong"); + + if (bsb2.length) { + unsigned long long len = __le64_to_cpu(bsb2.length)*512; + + if (abuflen < len) { + free(abuf); + free(bbuf); + abuflen = len; + if (posix_memalign((void**)&abuf, 4096, abuflen) || + posix_memalign((void**)&bbuf, 4096, abuflen)) { + abuflen = 0; + /* just stop validating on mem-alloc failure */ + return; + } + } + + lseek64(bfd, offset, 0); + if ((unsigned long long)read(bfd, bbuf, len) != len) { + //printf("len %llu\n", len); + fail("read first backup failed"); + } + lseek64(afd, __le64_to_cpu(bsb2.arraystart)*512, 0); + if ((unsigned long long)read(afd, abuf, len) != len) + fail("read first from array failed"); + if (memcmp(bbuf, abuf, len) != 0) { +#if 0 + int i; + printf("offset=%llu len=%llu\n", + (unsigned long long)__le64_to_cpu(bsb2.arraystart)*512, len); + for (i=0; i<len; i++) + if (bbuf[i] != abuf[i]) { + printf("first diff byte %d\n", i); + break; + } +#endif + fail("data1 compare failed"); + } + } + if (bsb2.length2) { + unsigned long long len = __le64_to_cpu(bsb2.length2)*512; + + if (abuflen < len) { + free(abuf); + free(bbuf); + abuflen = len; + abuf = xmalloc(abuflen); + bbuf = xmalloc(abuflen); + } + + lseek64(bfd, offset+__le64_to_cpu(bsb2.devstart2)*512, 0); + if ((unsigned long long)read(bfd, bbuf, len) != len) + fail("read second backup failed"); + lseek64(afd, __le64_to_cpu(bsb2.arraystart2)*512, 0); + if ((unsigned long long)read(afd, abuf, len) != len) + fail("read second from array failed"); + if (memcmp(bbuf, abuf, len) != 0) + fail("data2 compare failed"); + } +} + +int child_monitor(int afd, struct mdinfo *sra, struct reshape *reshape, + struct supertype *st, unsigned long blocks, + int *fds, unsigned long long *offsets, + int dests, int *destfd, unsigned long long *destoffsets) +{ + /* Monitor a reshape where backup is being performed using + * 'native' mechanism - either to a backup file, or + * to some space in a spare. + */ + char *buf; + int degraded = -1; + unsigned long long speed; + unsigned long long suspend_point, array_size; + unsigned long long backup_point, wait_point; + unsigned long long reshape_completed; + int done = 0; + int increasing = reshape->after.data_disks >= reshape->before.data_disks; + int part = 0; /* The next part of the backup area to fill. It may already + * be full, so we need to check */ + int level = reshape->level; + int layout = reshape->before.layout; + int data = reshape->before.data_disks; + int disks = reshape->before.data_disks + reshape->parity; + int chunk = sra->array.chunk_size; + struct mdinfo *sd; + unsigned long stripes; + int uuid[4]; + int frozen = 0; + + /* set up the backup-super-block. This requires the + * uuid from the array. + */ + /* Find a superblock */ + for (sd = sra->devs; sd; sd = sd->next) { + char *dn; + int devfd; + int ok; + if (sd->disk.state & (1<<MD_DISK_FAULTY)) + continue; + dn = map_dev(sd->disk.major, sd->disk.minor, 1); + devfd = dev_open(dn, O_RDONLY); + if (devfd < 0) + continue; + ok = st->ss->load_super(st, devfd, NULL); + close(devfd); + if (ok == 0) + break; + } + if (!sd) { + pr_err("Cannot find a superblock\n"); + return 0; + } + + memset(&bsb, 0, 512); + memcpy(bsb.magic, "md_backup_data-1", 16); + st->ss->uuid_from_super(st, uuid); + memcpy(bsb.set_uuid, uuid, 16); + bsb.mtime = __cpu_to_le64(time(0)); + bsb.devstart2 = blocks; + + stripes = blocks / (sra->array.chunk_size/512) / + reshape->before.data_disks; + + if (posix_memalign((void**)&buf, 4096, disks * chunk)) + /* Don't start the 'reshape' */ + return 0; + if (reshape->before.data_disks == reshape->after.data_disks) { + sysfs_get_ll(sra, NULL, "sync_speed_min", &speed); + sysfs_set_num(sra, NULL, "sync_speed_min", 200000); + } + + if (increasing) { + array_size = sra->component_size * reshape->after.data_disks; + backup_point = sra->reshape_progress; + suspend_point = 0; + } else { + array_size = sra->component_size * reshape->before.data_disks; + backup_point = reshape->backup_blocks; + suspend_point = array_size; + } + + while (!done) { + int rv; + + /* Want to return as soon the oldest backup slot can + * be released as that allows us to start backing up + * some more, providing suspend_point has been + * advanced, which it should have. + */ + if (increasing) { + wait_point = array_size; + if (part == 0 && __le64_to_cpu(bsb.length) > 0) + wait_point = (__le64_to_cpu(bsb.arraystart) + + __le64_to_cpu(bsb.length)); + if (part == 1 && __le64_to_cpu(bsb.length2) > 0) + wait_point = (__le64_to_cpu(bsb.arraystart2) + + __le64_to_cpu(bsb.length2)); + } else { + wait_point = 0; + if (part == 0 && __le64_to_cpu(bsb.length) > 0) + wait_point = __le64_to_cpu(bsb.arraystart); + if (part == 1 && __le64_to_cpu(bsb.length2) > 0) + wait_point = __le64_to_cpu(bsb.arraystart2); + } + + reshape_completed = sra->reshape_progress; + rv = progress_reshape(sra, reshape, + backup_point, wait_point, + &suspend_point, &reshape_completed, + &frozen); + /* external metadata would need to ping_monitor here */ + sra->reshape_progress = reshape_completed; + + /* Clear any backup region that is before 'here' */ + if (increasing) { + if (__le64_to_cpu(bsb.length) > 0 && + reshape_completed >= (__le64_to_cpu(bsb.arraystart) + + __le64_to_cpu(bsb.length))) + forget_backup(dests, destfd, + destoffsets, 0); + if (__le64_to_cpu(bsb.length2) > 0 && + reshape_completed >= (__le64_to_cpu(bsb.arraystart2) + + __le64_to_cpu(bsb.length2))) + forget_backup(dests, destfd, + destoffsets, 1); + } else { + if (__le64_to_cpu(bsb.length) > 0 && + reshape_completed <= (__le64_to_cpu(bsb.arraystart))) + forget_backup(dests, destfd, + destoffsets, 0); + if (__le64_to_cpu(bsb.length2) > 0 && + reshape_completed <= (__le64_to_cpu(bsb.arraystart2))) + forget_backup(dests, destfd, + destoffsets, 1); + } + if (sigterm) + rv = -2; + if (rv < 0) { + if (rv == -1) + done = 1; + break; + } + if (rv == 0 && increasing && !st->ss->external) { + /* No longer need to monitor this reshape */ + sysfs_set_str(sra, NULL, "sync_max", "max"); + done = 1; + break; + } + + while (rv) { + unsigned long long offset; + unsigned long actual_stripes; + /* Need to backup some data. + * If 'part' is not used and the desired + * backup size is suspended, do a backup, + * then consider the next part. + */ + /* Check that 'part' is unused */ + if (part == 0 && __le64_to_cpu(bsb.length) != 0) + break; + if (part == 1 && __le64_to_cpu(bsb.length2) != 0) + break; + + offset = backup_point / data; + actual_stripes = stripes; + if (increasing) { + if (offset + actual_stripes * (chunk/512) > + sra->component_size) + actual_stripes = ((sra->component_size - offset) + / (chunk/512)); + if (offset + actual_stripes * (chunk/512) > + suspend_point/data) + break; + } else { + if (offset < actual_stripes * (chunk/512)) + actual_stripes = offset / (chunk/512); + offset -= actual_stripes * (chunk/512); + if (offset < suspend_point/data) + break; + } + if (actual_stripes == 0) + break; + grow_backup(sra, offset, actual_stripes, + fds, offsets, + disks, chunk, level, layout, + dests, destfd, destoffsets, + part, °raded, buf); + validate(afd, destfd[0], destoffsets[0]); + /* record where 'part' is up to */ + part = !part; + if (increasing) + backup_point += actual_stripes * (chunk/512) * data; + else + backup_point -= actual_stripes * (chunk/512) * data; + } + } + + /* FIXME maybe call progress_reshape one more time instead */ + /* remove any remaining suspension */ + sysfs_set_num(sra, NULL, "suspend_lo", 0x7FFFFFFFFFFFFFFFULL); + sysfs_set_num(sra, NULL, "suspend_hi", 0); + sysfs_set_num(sra, NULL, "suspend_lo", 0); + sysfs_set_num(sra, NULL, "sync_min", 0); + + if (reshape->before.data_disks == reshape->after.data_disks) + sysfs_set_num(sra, NULL, "sync_speed_min", speed); + free(buf); + return done; +} + +/* + * If any spare contains md_back_data-1 which is recent wrt mtime, + * write that data into the array and update the super blocks with + * the new reshape_progress + */ +int Grow_restart(struct supertype *st, struct mdinfo *info, int *fdlist, int cnt, + char *backup_file, int verbose) +{ + int i, j; + int old_disks; + unsigned long long *offsets; + unsigned long long nstripe, ostripe; + int ndata, odata; + + odata = info->array.raid_disks - info->delta_disks - 1; + if (info->array.level == 6) odata--; /* number of data disks */ + ndata = info->array.raid_disks - 1; + if (info->new_level == 6) ndata--; + + old_disks = info->array.raid_disks - info->delta_disks; + + if (info->delta_disks <= 0) + /* Didn't grow, so the backup file must have + * been used + */ + old_disks = cnt; + for (i=old_disks-(backup_file?1:0); i<cnt; i++) { + struct mdinfo dinfo; + int fd; + int bsbsize; + char *devname, namebuf[20]; + unsigned long long lo, hi; + + /* This was a spare and may have some saved data on it. + * Load the superblock, find and load the + * backup_super_block. + * If either fail, go on to next device. + * If the backup contains no new info, just return + * else restore data and update all superblocks + */ + if (i == old_disks-1) { + fd = open(backup_file, O_RDONLY); + if (fd<0) { + pr_err("backup file %s inaccessible: %s\n", + backup_file, strerror(errno)); + continue; + } + devname = backup_file; + } else { + fd = fdlist[i]; + if (fd < 0) + continue; + if (st->ss->load_super(st, fd, NULL)) + continue; + + st->ss->getinfo_super(st, &dinfo, NULL); + st->ss->free_super(st); + + if (lseek64(fd, + (dinfo.data_offset + dinfo.component_size - 8) <<9, + 0) < 0) { + pr_err("Cannot seek on device %d\n", i); + continue; /* Cannot seek */ + } + sprintf(namebuf, "device-%d", i); + devname = namebuf; + } + if (read(fd, &bsb, sizeof(bsb)) != sizeof(bsb)) { + if (verbose) + pr_err("Cannot read from %s\n", devname); + continue; /* Cannot read */ + } + if (memcmp(bsb.magic, "md_backup_data-1", 16) != 0 && + memcmp(bsb.magic, "md_backup_data-2", 16) != 0) { + if (verbose) + pr_err("No backup metadata on %s\n", devname); + continue; + } + if (bsb.sb_csum != bsb_csum((char*)&bsb, ((char*)&bsb.sb_csum)-((char*)&bsb))) { + if (verbose) + pr_err("Bad backup-metadata checksum on %s\n", devname); + continue; /* bad checksum */ + } + if (memcmp(bsb.magic, "md_backup_data-2", 16) == 0 && + bsb.sb_csum2 != bsb_csum((char*)&bsb, ((char*)&bsb.sb_csum2)-((char*)&bsb))) { + if (verbose) + pr_err("Bad backup-metadata checksum2 on %s\n", devname); + continue; /* Bad second checksum */ + } + if (memcmp(bsb.set_uuid,info->uuid, 16) != 0) { + if (verbose) + pr_err("Wrong uuid on backup-metadata on %s\n", devname); + continue; /* Wrong uuid */ + } + + /* array utime and backup-mtime should be updated at much the same time, but it seems that + * sometimes they aren't... So allow considerable flexability in matching, and allow + * this test to be overridden by an environment variable. + */ + if (info->array.utime > (int)__le64_to_cpu(bsb.mtime) + 2*60*60 || + info->array.utime < (int)__le64_to_cpu(bsb.mtime) - 10*60) { + if (check_env("MDADM_GROW_ALLOW_OLD")) { + pr_err("accepting backup with timestamp %lu for array with timestamp %lu\n", + (unsigned long)__le64_to_cpu(bsb.mtime), + (unsigned long)info->array.utime); + } else { + pr_err("too-old timestamp on backup-metadata on %s\n", devname); + pr_err("If you think it is should be safe, try 'export MDADM_GROW_ALLOW_OLD=1'\n"); + continue; /* time stamp is too bad */ + } + } + + if (bsb.magic[15] == '1') { + if (bsb.length == 0) + continue; + if (info->delta_disks >= 0) { + /* reshape_progress is increasing */ + if (__le64_to_cpu(bsb.arraystart) + + __le64_to_cpu(bsb.length) + < info->reshape_progress) { + nonew: + if (verbose) + pr_err("backup-metadata found on %s but is not needed\n", devname); + continue; /* No new data here */ + } + } else { + /* reshape_progress is decreasing */ + if (__le64_to_cpu(bsb.arraystart) >= + info->reshape_progress) + goto nonew; /* No new data here */ + } + } else { + if (bsb.length == 0 && bsb.length2 == 0) + continue; + if (info->delta_disks >= 0) { + /* reshape_progress is increasing */ + if ((__le64_to_cpu(bsb.arraystart) + + __le64_to_cpu(bsb.length) + < info->reshape_progress) + && + (__le64_to_cpu(bsb.arraystart2) + + __le64_to_cpu(bsb.length2) + < info->reshape_progress)) + goto nonew; /* No new data here */ + } else { + /* reshape_progress is decreasing */ + if (__le64_to_cpu(bsb.arraystart) >= + info->reshape_progress && + __le64_to_cpu(bsb.arraystart2) >= + info->reshape_progress) + goto nonew; /* No new data here */ + } + } + if (lseek64(fd, __le64_to_cpu(bsb.devstart)*512, 0)< 0) { + second_fail: + if (verbose) + pr_err("Failed to verify secondary backup-metadata block on %s\n", + devname); + continue; /* Cannot seek */ + } + /* There should be a duplicate backup superblock 4k before here */ + if (lseek64(fd, -4096, 1) < 0 || + read(fd, &bsb2, sizeof(bsb2)) != sizeof(bsb2)) + goto second_fail; /* Cannot find leading superblock */ + if (bsb.magic[15] == '1') + bsbsize = offsetof(struct mdp_backup_super, pad1); + else + bsbsize = offsetof(struct mdp_backup_super, pad); + if (memcmp(&bsb2, &bsb, bsbsize) != 0) + goto second_fail; /* Cannot find leading superblock */ + + /* Now need the data offsets for all devices. */ + offsets = xmalloc(sizeof(*offsets)*info->array.raid_disks); + for(j=0; j<info->array.raid_disks; j++) { + if (fdlist[j] < 0) + continue; + if (st->ss->load_super(st, fdlist[j], NULL)) + /* FIXME should be this be an error */ + continue; + st->ss->getinfo_super(st, &dinfo, NULL); + st->ss->free_super(st); + offsets[j] = dinfo.data_offset * 512; + } + printf("%s: restoring critical section\n", Name); + + if (restore_stripes(fdlist, offsets, + info->array.raid_disks, + info->new_chunk, + info->new_level, + info->new_layout, + fd, __le64_to_cpu(bsb.devstart)*512, + __le64_to_cpu(bsb.arraystart)*512, + __le64_to_cpu(bsb.length)*512, NULL)) { + /* didn't succeed, so giveup */ + if (verbose) + pr_err("Error restoring backup from %s\n", + devname); + free(offsets); + return 1; + } + + if (bsb.magic[15] == '2' && + restore_stripes(fdlist, offsets, + info->array.raid_disks, + info->new_chunk, + info->new_level, + info->new_layout, + fd, __le64_to_cpu(bsb.devstart)*512 + + __le64_to_cpu(bsb.devstart2)*512, + __le64_to_cpu(bsb.arraystart2)*512, + __le64_to_cpu(bsb.length2)*512, NULL)) { + /* didn't succeed, so giveup */ + if (verbose) + pr_err("Error restoring second backup from %s\n", + devname); + free(offsets); + return 1; + } + + free(offsets); + + /* Ok, so the data is restored. Let's update those superblocks. */ + + lo = hi = 0; + if (bsb.length) { + lo = __le64_to_cpu(bsb.arraystart); + hi = lo + __le64_to_cpu(bsb.length); + } + if (bsb.magic[15] == '2' && bsb.length2) { + unsigned long long lo1, hi1; + lo1 = __le64_to_cpu(bsb.arraystart2); + hi1 = lo1 + __le64_to_cpu(bsb.length2); + if (lo == hi) { + lo = lo1; + hi = hi1; + } else if (lo < lo1) + hi = hi1; + else + lo = lo1; + } + if (lo < hi && + (info->reshape_progress < lo || + info->reshape_progress > hi)) + /* backup does not affect reshape_progress*/ ; + else if (info->delta_disks >= 0) { + info->reshape_progress = __le64_to_cpu(bsb.arraystart) + + __le64_to_cpu(bsb.length); + if (bsb.magic[15] == '2') { + unsigned long long p2 = __le64_to_cpu(bsb.arraystart2) + + __le64_to_cpu(bsb.length2); + if (p2 > info->reshape_progress) + info->reshape_progress = p2; + } + } else { + info->reshape_progress = __le64_to_cpu(bsb.arraystart); + if (bsb.magic[15] == '2') { + unsigned long long p2 = __le64_to_cpu(bsb.arraystart2); + if (p2 < info->reshape_progress) + info->reshape_progress = p2; + } + } + for (j=0; j<info->array.raid_disks; j++) { + if (fdlist[j] < 0) + continue; + if (st->ss->load_super(st, fdlist[j], NULL)) + continue; + st->ss->getinfo_super(st, &dinfo, NULL); + dinfo.reshape_progress = info->reshape_progress; + st->ss->update_super(st, &dinfo, + "_reshape_progress", + NULL,0, 0, NULL); + st->ss->store_super(st, fdlist[j]); + st->ss->free_super(st); + } + return 0; + } + /* Didn't find any backup data, try to see if any + * was needed. + */ + if (info->delta_disks < 0) { + /* When shrinking, the critical section is at the end. + * So see if we are before the critical section. + */ + unsigned long long first_block; + nstripe = ostripe = 0; + first_block = 0; + while (ostripe >= nstripe) { + ostripe += info->array.chunk_size / 512; + first_block = ostripe * odata; + nstripe = first_block / ndata / (info->new_chunk/512) * + (info->new_chunk/512); + } + + if (info->reshape_progress >= first_block) + return 0; + } + if (info->delta_disks > 0) { + /* See if we are beyond the critical section. */ + unsigned long long last_block; + nstripe = ostripe = 0; + last_block = 0; + while (nstripe >= ostripe) { + nstripe += info->new_chunk / 512; + last_block = nstripe * ndata; + ostripe = last_block / odata / (info->array.chunk_size/512) * + (info->array.chunk_size/512); + } + + if (info->reshape_progress >= last_block) + return 0; + } + /* needed to recover critical section! */ + if (verbose) + pr_err("Failed to find backup of critical section\n"); + return 1; +} + +int Grow_continue_command(char *devname, int fd, + char *backup_file, int verbose) +{ + int ret_val = 0; + struct supertype *st = NULL; + struct mdinfo *content = NULL; + struct mdinfo array; + char *subarray = NULL; + struct mdinfo *cc = NULL; + struct mdstat_ent *mdstat = NULL; + int cfd = -1; + int fd2 = -1; + + dprintf("Grow continue from command line called for %s\n", + devname); + + st = super_by_fd(fd, &subarray); + if (!st || !st->ss) { + pr_err("Unable to determine metadata format for %s\n", + devname); + return 1; + } + dprintf("Grow continue is run for "); + if (st->ss->external == 0) { + int d; + dprintf_cont("native array (%s)\n", devname); + if (ioctl(fd, GET_ARRAY_INFO, &array.array) < 0) { + pr_err("%s is not an active md array - aborting\n", devname); + ret_val = 1; + goto Grow_continue_command_exit; + } + content = &array; + /* Need to load a superblock. + * FIXME we should really get what we need from + * sysfs + */ + for (d = 0; d < MAX_DISKS; d++) { + mdu_disk_info_t disk; + char *dv; + int err; + disk.number = d; + if (ioctl(fd, GET_DISK_INFO, &disk) < 0) + continue; + if (disk.major == 0 && disk.minor == 0) + continue; + if ((disk.state & (1 << MD_DISK_ACTIVE)) == 0) + continue; + dv = map_dev(disk.major, disk.minor, 1); + if (!dv) + continue; + fd2 = dev_open(dv, O_RDONLY); + if (fd2 < 0) + continue; + err = st->ss->load_super(st, fd2, NULL); + close(fd2); + /* invalidate fd2 to avoid possible double close() */ + fd2 = -1; + if (err) + continue; + break; + } + if (d == MAX_DISKS) { + pr_err("Unable to load metadata for %s\n", + devname); + ret_val = 1; + goto Grow_continue_command_exit; + } + st->ss->getinfo_super(st, content, NULL); + } else { + char *container; + + if (subarray) { + dprintf_cont("subarray (%s)\n", subarray); + container = st->container_devnm; + cfd = open_dev_excl(st->container_devnm); + } else { + container = st->devnm; + close(fd); + cfd = open_dev_excl(st->devnm); + dprintf_cont("container (%s)\n", container); + fd = cfd; + } + if (cfd < 0) { + pr_err("Unable to open container for %s\n", devname); + ret_val = 1; + goto Grow_continue_command_exit; + } + + /* find in container array under reshape + */ + ret_val = st->ss->load_container(st, cfd, NULL); + if (ret_val) { + pr_err("Cannot read superblock for %s\n", + devname); + ret_val = 1; + goto Grow_continue_command_exit; + } + + cc = st->ss->container_content(st, subarray); + for (content = cc; content ; content = content->next) { + char *array; + int allow_reshape = 1; + + if (content->reshape_active == 0) + continue; + /* The decision about array or container wide + * reshape is taken in Grow_continue based + * content->reshape_active state, therefore we + * need to check_reshape based on + * reshape_active and subarray name + */ + if (content->array.state & (1<<MD_SB_BLOCK_VOLUME)) + allow_reshape = 0; + if (content->reshape_active == CONTAINER_RESHAPE && + (content->array.state + & (1<<MD_SB_BLOCK_CONTAINER_RESHAPE))) + allow_reshape = 0; + + if (!allow_reshape) { + pr_err("cannot continue reshape of an array in container with unsupported metadata: %s(%s)\n", + devname, container); + ret_val = 1; + goto Grow_continue_command_exit; + } + + array = strchr(content->text_version+1, '/')+1; + mdstat = mdstat_by_subdev(array, container); + if (!mdstat) + continue; + if (mdstat->active == 0) { + pr_err("Skipping inactive array %s.\n", + mdstat->devnm); + free_mdstat(mdstat); + mdstat = NULL; + continue; + } + break; + } + if (!content) { + pr_err("Unable to determine reshaped array for %s\n", devname); + ret_val = 1; + goto Grow_continue_command_exit; + } + fd2 = open_dev(mdstat->devnm); + if (fd2 < 0) { + pr_err("cannot open (%s)\n", mdstat->devnm); + ret_val = 1; + goto Grow_continue_command_exit; + } + + sysfs_init(content, fd2, mdstat->devnm); + + /* start mdmon in case it is not running + */ + if (!mdmon_running(container)) + start_mdmon(container); + ping_monitor(container); + + if (mdmon_running(container)) + st->update_tail = &st->updates; + else { + pr_err("No mdmon found. Grow cannot continue.\n"); + ret_val = 1; + goto Grow_continue_command_exit; + } + } + + /* verify that array under reshape is started from + * correct position + */ + if (verify_reshape_position(content, content->array.level) < 0) { + ret_val = 1; + goto Grow_continue_command_exit; + } + + /* continue reshape + */ + ret_val = Grow_continue(fd, st, content, backup_file, 1, 0); + +Grow_continue_command_exit: + if (fd2 > -1) + close(fd2); + if (cfd > -1) + close(cfd); + st->ss->free_super(st); + free_mdstat(mdstat); + sysfs_free(cc); + free(subarray); + + return ret_val; +} + +int Grow_continue(int mdfd, struct supertype *st, struct mdinfo *info, + char *backup_file, int forked, int freeze_reshape) +{ + int ret_val = 2; + + if (!info->reshape_active) + return ret_val; + + if (st->ss->external) { + int cfd = open_dev(st->container_devnm); + + if (cfd < 0) + return 1; + + st->ss->load_container(st, cfd, st->container_devnm); + close(cfd); + ret_val = reshape_container(st->container_devnm, NULL, mdfd, + st, info, 0, backup_file, + 0, forked, + 1 | info->reshape_active, + freeze_reshape); + } else + ret_val = reshape_array(NULL, mdfd, "array", st, info, 1, + NULL, INVALID_SECTORS, + backup_file, 0, forked, + 1 | info->reshape_active, + freeze_reshape); + + return ret_val; +} + +char *make_backup(char *name) +{ + char *base = "backup_file-"; + int len; + char *fname; + + len = strlen(MAP_DIR) + 1 + strlen(base) + strlen(name)+1; + fname = xmalloc(len); + sprintf(fname, "%s/%s%s", MAP_DIR, base, name); + return fname; +} + +char *locate_backup(char *name) +{ + char *fl = make_backup(name); + struct stat stb; + + if (stat(fl, &stb) == 0 && + S_ISREG(stb.st_mode)) + return fl; + + free(fl); + return NULL; +} diff --git a/INSTALL b/INSTALL new file mode 100644 index 00000000..f7bcc3e6 --- /dev/null +++ b/INSTALL @@ -0,0 +1,13 @@ + +To build mdadm, simply run: + + make + +to install, run + + make install + +as root. + + +No configuration is necessary. diff --git a/Incremental.c b/Incremental.c new file mode 100644 index 00000000..41876b9e --- /dev/null +++ b/Incremental.c @@ -0,0 +1,1786 @@ +/* + * Incremental.c - support --incremental. Part of: + * mdadm - manage Linux "md" devices aka RAID arrays. + * + * Copyright (C) 2006-2013 Neil Brown <neilb@suse.de> + * + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Author: Neil Brown + * Email: <neilb@suse.de> + * Paper: Neil Brown + * Novell Inc + * GPO Box Q1283 + * QVB Post Office, NSW 1230 + * Australia + */ + +#include "mdadm.h" +#include <sys/wait.h> +#include <dirent.h> +#include <ctype.h> + +static int count_active(struct supertype *st, struct mdinfo *sra, + int mdfd, char **availp, + struct mdinfo *info); +static void find_reject(int mdfd, struct supertype *st, struct mdinfo *sra, + int number, __u64 events, int verbose, + char *array_name); +static int try_spare(char *devname, int *dfdp, struct dev_policy *pol, + struct map_ent *target, + struct supertype *st, int verbose); + +static int Incremental_container(struct supertype *st, char *devname, + struct context *c, char *only); + +int Incremental(struct mddev_dev *devlist, struct context *c, + struct supertype *st) +{ + /* Add this device to an array, creating the array if necessary + * and starting the array if sensible or - if runstop>0 - if possible. + * + * This has several steps: + * + * 1/ Check if device is permitted by mdadm.conf, reject if not. + * 2/ Find metadata, reject if none appropriate (check + * version/name from args) + * 3/ Check if there is a match in mdadm.conf + * 3a/ if not, check for homehost match. If no match, assemble as + * a 'foreign' array. + * 4/ Determine device number. + * - If in mdadm.conf with std name, use that + * - UUID in /var/run/mdadm.map use that + * - If name is suggestive, use that. unless in use with different uuid. + * - Choose a free, high number. + * - Use a partitioned device unless strong suggestion not to. + * e.g. auto=md + * Don't choose partitioned for containers. + * 5/ Find out if array already exists + * 5a/ if it does not + * - choose a name, from mdadm.conf or 'name' field in array. + * - create the array + * - add the device + * 5b/ if it does + * - check one drive in array to make sure metadata is a reasonably + * close match. Reject if not (e.g. different type) + * - add the device + * 6/ Make sure /var/run/mdadm.map contains this array. + * 7/ Is there enough devices to possibly start the array? + * For a container, this means running Incremental_container. + * 7a/ if not, finish with success. + * 7b/ if yes, + * - read all metadata and arrange devices like -A does + * - if number of OK devices match expected, or -R and there are enough, + * start the array (auto-readonly). + */ + struct stat stb; + struct mdinfo info, dinfo; + struct mdinfo *sra = NULL, *d; + struct mddev_ident *match; + char chosen_name[1024]; + char *md_devname; + int rv = 1; + struct map_ent *mp, *map = NULL; + int dfd = -1, mdfd = -1; + char *avail = NULL; + int active_disks; + int trustworthy; + char *name_to_use; + mdu_array_info_t ainf; + struct dev_policy *policy = NULL; + struct map_ent target_array; + int have_target; + char *devname = devlist->devname; + + struct createinfo *ci = conf_get_create_info(); + + if (stat(devname, &stb) < 0) { + if (c->verbose >= 0) + pr_err("stat failed for %s: %s.\n", + devname, strerror(errno)); + return rv; + } + if ((stb.st_mode & S_IFMT) != S_IFBLK) { + if (c->verbose >= 0) + pr_err("%s is not a block device.\n", + devname); + return rv; + } + dfd = dev_open(devname, O_RDONLY); + if (dfd < 0) { + if (c->verbose >= 0) + pr_err("cannot open %s: %s.\n", + devname, strerror(errno)); + return rv; + } + /* If the device is a container, we do something very different */ + if (must_be_container(dfd)) { + if (!st) + st = super_by_fd(dfd, NULL); + if (st && st->ss->load_container) + rv = st->ss->load_container(st, dfd, NULL); + + close(dfd); + if (!rv && st->ss->container_content) { + if (map_lock(&map)) + pr_err("failed to get exclusive lock on mapfile\n"); + if (c->export) + printf("MD_DEVNAME=%s\n", devname); + rv = Incremental_container(st, devname, c, NULL); + map_unlock(&map); + return rv; + } + + pr_err("%s is not part of an md array.\n", + devname); + return rv; + } + + /* 1/ Check if device is permitted by mdadm.conf */ + + for (;devlist; devlist = devlist->next) + if (conf_test_dev(devlist->devname)) + break; + if (!devlist) { + devlist = conf_get_devs(); + for (;devlist; devlist = devlist->next) { + struct stat st2; + if (stat(devlist->devname, &st2) == 0 && + (st2.st_mode & S_IFMT) == S_IFBLK && + st2.st_rdev == stb.st_rdev) + break; + } + } + if (!devlist) { + if (c->verbose >= 0) + pr_err("%s not permitted by mdadm.conf.\n", + devname); + goto out; + } + + /* 2/ Find metadata, reject if none appropriate (check + * version/name from args) */ + + if (fstat(dfd, &stb) < 0) { + if (c->verbose >= 0) + pr_err("fstat failed for %s: %s.\n", + devname, strerror(errno)); + goto out; + } + if ((stb.st_mode & S_IFMT) != S_IFBLK) { + if (c->verbose >= 0) + pr_err("%s is not a block device.\n", + devname); + goto out; + } + + dinfo.disk.major = major(stb.st_rdev); + dinfo.disk.minor = minor(stb.st_rdev); + + policy = disk_policy(&dinfo); + have_target = policy_check_path(&dinfo, &target_array); + + if (st == NULL && (st = guess_super_type(dfd, guess_array)) == NULL) { + if (c->verbose >= 0) + pr_err("no recognisable superblock on %s.\n", + devname); + rv = try_spare(devname, &dfd, policy, + have_target ? &target_array : NULL, + NULL, c->verbose); + goto out; + } + st->ignore_hw_compat = 0; + + if (st->ss->compare_super == NULL || + st->ss->load_super(st, dfd, c->verbose >= 0 ? devname : NULL)) { + if (c->verbose >= 0) + pr_err("no RAID superblock on %s.\n", + devname); + rv = try_spare(devname, &dfd, policy, + have_target ? &target_array : NULL, + st, c->verbose); + free(st); + goto out; + } + close (dfd); dfd = -1; + + st->ss->getinfo_super(st, &info, NULL); + + /* 3/ Check if there is a match in mdadm.conf */ + match = conf_match(st, &info, devname, c->verbose, &rv); + if (!match && rv == 2) + goto out; + + if (match && match->devname + && strcasecmp(match->devname, "<ignore>") == 0) { + if (c->verbose >= 0) + pr_err("array containing %s is explicitly ignored by mdadm.conf\n", + devname); + goto out; + } + + /* 3a/ if not, check for homehost match. If no match, continue + * but don't trust the 'name' in the array. Thus a 'random' minor + * number will be assigned, and the device name will be based + * on that. */ + if (match) + trustworthy = LOCAL; + else if (st->ss->match_home(st, c->homehost) == 1) + trustworthy = LOCAL; + else if (st->ss->match_home(st, "any") == 1) + trustworthy = LOCAL_ANY; + else + trustworthy = FOREIGN; + + if (!match && !conf_test_metadata(st->ss->name, policy, + (trustworthy == LOCAL))) { + if (c->verbose >= 1) + pr_err("%s has metadata type %s for which auto-assembly is disabled\n", + devname, st->ss->name); + goto out; + } + if (trustworthy == LOCAL_ANY) + trustworthy = LOCAL; + + /* There are three possible sources for 'autof': command line, + * ARRAY line in mdadm.conf, or CREATE line in mdadm.conf. + * ARRAY takes precedence, then command line, then + * CREATE. + */ + if (match && match->autof) + c->autof = match->autof; + if (c->autof == 0) + c->autof = ci->autof; + + name_to_use = info.name; + if (name_to_use[0] == 0 && + info.array.level == LEVEL_CONTAINER) { + name_to_use = info.text_version; + trustworthy = METADATA; + } + if (name_to_use[0] && trustworthy != LOCAL && + ! c->require_homehost && + conf_name_is_free(name_to_use)) + trustworthy = LOCAL; + + /* strip "hostname:" prefix from name if we have decided + * to treat it as LOCAL + */ + if (trustworthy == LOCAL && strchr(name_to_use, ':') != NULL) + name_to_use = strchr(name_to_use, ':')+1; + + /* 4/ Check if array exists. + */ + if (map_lock(&map)) + pr_err("failed to get exclusive lock on mapfile\n"); + /* Now check we can get O_EXCL. If not, probably "mdadm -A" has + * taken over + */ + dfd = dev_open(devname, O_RDONLY|O_EXCL); + if (dfd < 0) { + if (c->verbose >= 0) + pr_err("cannot reopen %s: %s.\n", + devname, strerror(errno)); + goto out_unlock; + } + /* Cannot hold it open while we add the device to the array, + * so we must release the O_EXCL and depend on the map_lock() + * So now is the best time to remove any partitions. + */ + remove_partitions(dfd); + close(dfd); + dfd = -1; + + mp = map_by_uuid(&map, info.uuid); + if (mp) + mdfd = open_dev(mp->devnm); + else + mdfd = -1; + + if (mdfd < 0) { + + /* Couldn't find an existing array, maybe make a new one */ + mdfd = create_mddev(match ? match->devname : NULL, + name_to_use, c->autof, trustworthy, chosen_name); + + if (mdfd < 0) + goto out_unlock; + + sysfs_init(&info, mdfd, NULL); + + if (set_array_info(mdfd, st, &info) != 0) { + pr_err("failed to set array info for %s: %s\n", + chosen_name, strerror(errno)); + rv = 2; + goto out_unlock; + } + + dinfo = info; + dinfo.disk.major = major(stb.st_rdev); + dinfo.disk.minor = minor(stb.st_rdev); + if (add_disk(mdfd, st, &info, &dinfo) != 0) { + pr_err("failed to add %s to new array %s: %s.\n", + devname, chosen_name, strerror(errno)); + ioctl(mdfd, STOP_ARRAY, 0); + rv = 2; + goto out_unlock; + } + sra = sysfs_read(mdfd, NULL, (GET_DEVS | GET_STATE | + GET_OFFSET | GET_SIZE)); + + if (!sra || !sra->devs || sra->devs->disk.raid_disk >= 0) { + /* It really should be 'none' - must be old buggy + * kernel, and mdadm -I may not be able to complete. + * So reject it. + */ + ioctl(mdfd, STOP_ARRAY, NULL); + pr_err("You have an old buggy kernel which cannot support\n --incremental reliably. Aborting.\n"); + rv = 2; + goto out_unlock; + } + info.array.working_disks = 1; + /* 6/ Make sure /var/run/mdadm.map contains this array. */ + map_update(&map, fd2devnm(mdfd), + info.text_version, + info.uuid, chosen_name); + } else { + /* 5b/ if it does */ + /* - check one drive in array to make sure metadata is a reasonably */ + /* close match. Reject if not (e.g. different type) */ + /* - add the device */ + char dn[20]; + int dfd2; + int err; + struct supertype *st2; + struct mdinfo info2, *d; + + sra = sysfs_read(mdfd, NULL, (GET_DEVS | GET_STATE | + GET_OFFSET | GET_SIZE)); + + if (mp->path) + strcpy(chosen_name, mp->path); + else + strcpy(chosen_name, mp->devnm); + + /* It is generally not OK to add non-spare drives to a + * running array as they are probably missing because + * they failed. However if runstop is 1, then the + * array was possibly started early and our best bet is + * to add this anyway. + * Also if action policy is re-add or better we allow + * re-add. + * This doesn't apply to containers as the 'non-spare' + * flag has a different meaning. The test has to happen + * at the device level there + */ + if (!st->ss->external + && (info.disk.state & (1<<MD_DISK_SYNC)) != 0 + && ! policy_action_allows(policy, st->ss->name, + act_re_add) + && c->runstop < 1) { + if (ioctl(mdfd, GET_ARRAY_INFO, &ainf) == 0) { + pr_err("not adding %s to active array (without --run) %s\n", + devname, chosen_name); + rv = 2; + goto out_unlock; + } + } + if (!sra) { + rv = 2; + goto out_unlock; + } + if (sra->devs) { + sprintf(dn, "%d:%d", sra->devs->disk.major, + sra->devs->disk.minor); + dfd2 = dev_open(dn, O_RDONLY); + if (dfd2 < 0) { + pr_err("unable to open %s\n", devname); + rv = 2; + goto out_unlock; + } + st2 = dup_super(st); + if (st2->ss->load_super(st2, dfd2, NULL) || + st->ss->compare_super(st, st2) != 0) { + pr_err("metadata mismatch between %s and chosen array %s\n", + devname, chosen_name); + close(dfd2); + rv = 2; + goto out_unlock; + } + close(dfd2); + st2->ss->getinfo_super(st2, &info2, NULL); + st2->ss->free_super(st2); + if (info.array.level != info2.array.level || + memcmp(info.uuid, info2.uuid, 16) != 0 || + info.array.raid_disks != info2.array.raid_disks) { + pr_err("unexpected difference between %s and %s.\n", + chosen_name, devname); + rv = 2; + goto out_unlock; + } + } + info.disk.major = major(stb.st_rdev); + info.disk.minor = minor(stb.st_rdev); + /* add disk needs to know about containers */ + if (st->ss->external) + sra->array.level = LEVEL_CONTAINER; + err = add_disk(mdfd, st, sra, &info); + if (err < 0 && errno == EBUSY) { + /* could be another device present with the same + * disk.number. Find and reject any such + */ + find_reject(mdfd, st, sra, info.disk.number, + info.events, c->verbose, chosen_name); + err = add_disk(mdfd, st, sra, &info); + } + if (err < 0 && errno == EINVAL && + info.disk.state & (1<<MD_DISK_SYNC)) { + /* Maybe it needs to be added as a spare */ + if (policy_action_allows(policy, st->ss->name, + act_force_spare)) { + info.disk.state &= ~(1<<MD_DISK_SYNC); + err = add_disk(mdfd, st, sra, &info); + } else + if (c->verbose >= 0) + pr_err("can only add %s to %s as a spare, and force-spare is not set.\n", + devname, chosen_name); + } + if (err < 0) { + pr_err("failed to add %s to existing array %s: %s.\n", + devname, chosen_name, strerror(errno)); + rv = 2; + goto out_unlock; + } + info.array.working_disks = 0; + for (d = sra->devs; d; d=d->next) + info.array.working_disks ++; + + } + if (strncmp(chosen_name, "/dev/md/", 8) == 0) + md_devname = chosen_name+8; + else + md_devname = chosen_name; + if (c->export) { + printf("MD_DEVICE=%s\n", fd2devnm(mdfd)); + printf("MD_DEVNAME=%s\n", md_devname); + printf("MD_FOREIGN=%s\n", trustworthy == FOREIGN ? "yes" : "no"); + } + + /* 7/ Is there enough devices to possibly start the array? */ + /* 7a/ if not, finish with success. */ + if (info.array.level == LEVEL_CONTAINER) { + char devnm[32]; + /* Try to assemble within the container */ + sysfs_uevent(sra, "change"); + if (!c->export && c->verbose >= 0) + pr_err("container %s now has %d device%s\n", + chosen_name, info.array.working_disks, + info.array.working_disks == 1?"":"s"); + wait_for(chosen_name, mdfd); + if (st->ss->external) + strcpy(devnm, fd2devnm(mdfd)); + if (st->ss->load_container) + rv = st->ss->load_container(st, mdfd, NULL); + close(mdfd); + sysfs_free(sra); + if (!rv) + rv = Incremental_container(st, chosen_name, c, NULL); + map_unlock(&map); + /* after spare is added, ping monitor for external metadata + * so that it can eg. try to rebuild degraded array */ + if (st->ss->external) + ping_monitor(devnm); + return rv; + } + + /* We have added something to the array, so need to re-read the + * state. Eventually this state should be kept up-to-date as + * things change. + */ + sysfs_free(sra); + sra = sysfs_read(mdfd, NULL, (GET_DEVS | GET_STATE | + GET_OFFSET | GET_SIZE)); + active_disks = count_active(st, sra, mdfd, &avail, &info); + if (enough(info.array.level, info.array.raid_disks, + info.array.layout, info.array.state & 1, + avail) == 0) { + if (c->export) { + printf("MD_STARTED=no\n"); + } else if (c->verbose >= 0) + pr_err("%s attached to %s, not enough to start (%d).\n", + devname, chosen_name, active_disks); + rv = 0; + goto out_unlock; + } + + /* 7b/ if yes, */ + /* - if number of OK devices match expected, or -R and there */ + /* are enough, */ + /* + add any bitmap file */ + /* + start the array (auto-readonly). */ + + if (ioctl(mdfd, GET_ARRAY_INFO, &ainf) == 0) { + if (c->export) { + printf("MD_STARTED=already\n"); + } else if (c->verbose >= 0) + pr_err("%s attached to %s which is already active.\n", + devname, chosen_name); + rv = 0; + goto out_unlock; + } + + map_unlock(&map); + if (c->runstop > 0 || active_disks >= info.array.working_disks) { + struct mdinfo *dsk; + /* Let's try to start it */ + + if (info.reshape_active && !(info.reshape_active & RESHAPE_NO_BACKUP)) { + pr_err("%s: This array is being reshaped and cannot be started\n", + chosen_name); + cont_err("by --incremental. Please use --assemble\n"); + goto out; + } + if (match && match->bitmap_file) { + int bmfd = open(match->bitmap_file, O_RDWR); + if (bmfd < 0) { + pr_err("Could not open bitmap file %s.\n", + match->bitmap_file); + goto out; + } + if (ioctl(mdfd, SET_BITMAP_FILE, bmfd) != 0) { + close(bmfd); + pr_err("Failed to set bitmapfile for %s.\n", + chosen_name); + goto out; + } + close(bmfd); + } + /* Need to remove from the array any devices which + * 'count_active' discerned were too old or inappropriate + */ + for (d = sra ? sra->devs : NULL ; d ; d = d->next) + if (d->disk.state & (1<<MD_DISK_REMOVED)) + remove_disk(mdfd, st, sra, d); + + if ((sra == NULL || active_disks >= info.array.working_disks) + && trustworthy != FOREIGN) + rv = ioctl(mdfd, RUN_ARRAY, NULL); + else + rv = sysfs_set_str(sra, NULL, + "array_state", "read-auto"); + /* Array might be O_EXCL which will interfere with + * fsck and mount. So re-open without O_EXCL. + */ + reopen_mddev(mdfd); + if (rv == 0) { + if (c->export) { + printf("MD_STARTED=yes\n"); + } else if (c->verbose >= 0) + pr_err("%s attached to %s, which has been started.\n", + devname, chosen_name); + rv = 0; + wait_for(chosen_name, mdfd); + /* We just started the array, so some devices + * might have been evicted from the array + * because their event counts were too old. + * If the action=re-add policy is in-force for + * those devices we should re-add them now. + */ + for (dsk = sra->devs; dsk ; dsk = dsk->next) { + if (disk_action_allows(dsk, st->ss->name, act_re_add) && + add_disk(mdfd, st, sra, dsk) == 0) + pr_err("%s re-added to %s\n", + dsk->sys_name, chosen_name); + } + } else { + pr_err("%s attached to %s, but failed to start: %s.\n", + devname, chosen_name, strerror(errno)); + rv = 1; + } + } else { + if (c->export) { + printf("MD_STARTED=unsafe\n"); + } else if (c->verbose >= 0) + pr_err("%s attached to %s, not enough to start safely.\n", + devname, chosen_name); + rv = 0; + } +out: + free(avail); + if (dfd >= 0) + close(dfd); + if (mdfd >= 0) + close(mdfd); + if (policy) + dev_policy_free(policy); + if (sra) + sysfs_free(sra); + return rv; +out_unlock: + map_unlock(&map); + goto out; +} + +static void find_reject(int mdfd, struct supertype *st, struct mdinfo *sra, + int number, __u64 events, int verbose, + char *array_name) +{ + /* Find a device attached to this array with a disk.number of number + * and events less than the passed events, and remove the device. + */ + struct mdinfo *d; + mdu_array_info_t ra; + + if (ioctl(mdfd, GET_ARRAY_INFO, &ra) == 0) + return; /* not safe to remove from active arrays + * without thinking more */ + + for (d = sra->devs; d ; d = d->next) { + char dn[10]; + int dfd; + struct mdinfo info; + sprintf(dn, "%d:%d", d->disk.major, d->disk.minor); + dfd = dev_open(dn, O_RDONLY); + if (dfd < 0) + continue; + if (st->ss->load_super(st, dfd, NULL)) { + close(dfd); + continue; + } + st->ss->getinfo_super(st, &info, NULL); + st->ss->free_super(st); + close(dfd); + + if (info.disk.number != number || + info.events >= events) + continue; + + if (d->disk.raid_disk > -1) + sysfs_set_str(sra, d, "slot", "none"); + if (sysfs_set_str(sra, d, "state", "remove") == 0) + if (verbose >= 0) + pr_err("removing old device %s from %s\n", + d->sys_name+4, array_name); + } +} + +static int count_active(struct supertype *st, struct mdinfo *sra, + int mdfd, char **availp, + struct mdinfo *bestinfo) +{ + /* count how many devices in sra think they are active */ + struct mdinfo *d; + int cnt = 0; + int replcnt = 0; + __u64 max_events = 0; + char *avail = NULL; + int *best = NULL; + char *devmap = NULL; + int numdevs = 0; + int devnum; + int b, i; + int raid_disks = 0; + + if (!sra) + return 0; + + for (d = sra->devs ; d ; d = d->next) + numdevs++; + for (d = sra->devs, devnum = 0 ; d ; d = d->next, devnum++) { + char dn[30]; + int dfd; + int ok; + struct mdinfo info; + + sprintf(dn, "%d:%d", d->disk.major, d->disk.minor); + dfd = dev_open(dn, O_RDONLY); + if (dfd < 0) + continue; + ok = st->ss->load_super(st, dfd, NULL); + close(dfd); + if (ok != 0) + continue; + info.array.raid_disks = raid_disks; + st->ss->getinfo_super(st, &info, devmap + raid_disks * devnum); + if (!avail) { + raid_disks = info.array.raid_disks; + avail = xcalloc(raid_disks, 1); + *availp = avail; + + best = xcalloc(raid_disks, sizeof(int)); + devmap = xcalloc(raid_disks, numdevs); + + st->ss->getinfo_super(st, &info, devmap); + } + + if (info.disk.state & (1<<MD_DISK_SYNC)) + { + if (cnt == 0) { + cnt++; + max_events = info.events; + avail[info.disk.raid_disk] = 2; + best[info.disk.raid_disk] = devnum; + st->ss->getinfo_super(st, bestinfo, NULL); + } else if (info.events == max_events) { + avail[info.disk.raid_disk] = 2; + best[info.disk.raid_disk] = devnum; + } else if (info.events == max_events-1) { + if (avail[info.disk.raid_disk] == 0) { + avail[info.disk.raid_disk] = 1; + best[info.disk.raid_disk] = devnum; + } + } else if (info.events < max_events - 1) + ; + else if (info.events == max_events+1) { + int i; + max_events = info.events; + for (i = 0; i < raid_disks; i++) + if (avail[i]) + avail[i]--; + avail[info.disk.raid_disk] = 2; + best[info.disk.raid_disk] = devnum; + st->ss->getinfo_super(st, bestinfo, NULL); + } else { /* info.events much bigger */ + memset(avail, 0, raid_disks); + max_events = info.events; + avail[info.disk.raid_disk] = 2; + best[info.disk.raid_disk] = devnum; + st->ss->getinfo_super(st, bestinfo, NULL); + } + } else if (info.disk.state & (1<<MD_DISK_REPLACEMENT)) + replcnt++; + st->ss->free_super(st); + } + if (!avail) + return 0; + /* We need to reject any device that thinks the best device is + * failed or missing */ + for (b = 0; b < raid_disks; b++) + if (avail[b] == 2) + break; + cnt = 0; + for (i = 0 ; i < raid_disks ; i++) { + if (i != b && avail[i]) + if (devmap[raid_disks * best[i] + b] == 0) { + /* This device thinks 'b' is failed - + * don't use it */ + devnum = best[i]; + for (d=sra->devs ; devnum; d = d->next) + devnum--; + d->disk.state |= (1 << MD_DISK_REMOVED); + avail[i] = 0; + } + if (avail[i]) + cnt++; + } + /* Also need to reject any spare device with an event count that + * is too high + */ + for (d = sra->devs; d; d = d->next) { + if (!(d->disk.state & (1<<MD_DISK_SYNC)) && + d->events > max_events) + d->disk.state |= (1 << MD_DISK_REMOVED); + } + free(best); + free(devmap); + return cnt + replcnt; +} + +/* test if container has degraded member(s) */ +static int container_members_max_degradation(struct map_ent *map, struct map_ent *me) +{ + mdu_array_info_t array; + int afd; + int max_degraded = 0; + + for(; map; map = map->next) { + if (!metadata_container_matches(map->metadata, me->devnm)) + continue; + afd = open_dev(map->devnm); + if (afd < 0) + continue; + /* most accurate information regarding array degradation */ + if (ioctl(afd, GET_ARRAY_INFO, &array) >= 0) { + int degraded = array.raid_disks - array.active_disks - + array.spare_disks; + if (degraded > max_degraded) + max_degraded = degraded; + } + close(afd); + } + return (max_degraded); +} + +static int array_try_spare(char *devname, int *dfdp, struct dev_policy *pol, + struct map_ent *target, int bare, + struct supertype *st, int verbose) +{ + /* This device doesn't have any md metadata + * The device policy allows 'spare' and if !bare, it allows spare-same-slot. + * If 'st' is not set, then we only know that some metadata allows this, + * others possibly don't. + * So look for a container or array to attach the device to. + * Prefer 'target' if that is set and the array is found. + * + * If st is set, then only arrays of that type are considered + * Return 0 on success, or some exit code on failure, probably 1. + */ + int rv = 1; + struct stat stb; + struct map_ent *mp, *map = NULL; + struct mdinfo *chosen = NULL; + int dfd = *dfdp; + + if (fstat(dfd, &stb) != 0) + return 1; + + /* + * Now we need to find a suitable array to add this to. + * We only accept arrays that: + * - match 'st' + * - are in the same domains as the device + * - are of an size for which the device will be useful + * and we choose the one that is the most degraded + */ + + if (map_lock(&map)) { + pr_err("failed to get exclusive lock on mapfile\n"); + return 1; + } + for (mp = map ; mp ; mp = mp->next) { + struct supertype *st2; + struct domainlist *dl = NULL; + struct mdinfo *sra; + unsigned long long devsize; + unsigned long long component_size = 0; + + if (is_subarray(mp->metadata)) + continue; + if (st) { + st2 = st->ss->match_metadata_desc(mp->metadata); + if (!st2 || + (st->minor_version >= 0 && + st->minor_version != st2->minor_version)) { + if (verbose > 1) + pr_err("not adding %s to %s as metadata type doesn't match\n", + devname, mp->path); + free(st2); + continue; + } + free(st2); + } + sra = sysfs_read(-1, mp->devnm, + GET_DEVS|GET_OFFSET|GET_SIZE|GET_STATE| + GET_DEGRADED|GET_COMPONENT|GET_VERSION); + if (!sra) { + /* Probably a container - no degraded info */ + sra = sysfs_read(-1, mp->devnm, + GET_DEVS|GET_OFFSET|GET_SIZE|GET_STATE| + GET_COMPONENT|GET_VERSION); + if (sra) + sra->array.failed_disks = -1; + } + if (!sra) + continue; + if (st == NULL) { + int i; + st2 = NULL; + for(i = 0; !st2 && superlist[i]; i++) + st2 = superlist[i]->match_metadata_desc( + sra->text_version); + if (!st2) { + if (verbose > 1) + pr_err("not adding %s to %s as metadata not recognised.\n", + devname, mp->path); + goto next; + } + /* Need to double check the 'act_spare' permissions applies + * to this metadata. + */ + if (!policy_action_allows(pol, st2->ss->name, act_spare)) + goto next; + if (!bare && !policy_action_allows(pol, st2->ss->name, + act_spare_same_slot)) + goto next; + } else + st2 = st; + /* update number of failed disks for mostly degraded + * container member */ + if (sra->array.failed_disks == -1) + sra->array.failed_disks = container_members_max_degradation(map, mp); + + get_dev_size(dfd, NULL, &devsize); + if (sra->component_size == 0) { + /* true for containers, here we must read superblock + * to obtain minimum spare size */ + struct supertype *st3 = dup_super(st2); + int mdfd = open_dev(mp->devnm); + if (mdfd < 0) { + free(st3); + goto next; + } + if (st3->ss->load_container && + !st3->ss->load_container(st3, mdfd, mp->path)) { + component_size = st3->ss->min_acceptable_spare_size(st3); + st3->ss->free_super(st3); + } + free(st3); + close(mdfd); + } + if ((sra->component_size > 0 && + st2->ss->avail_size(st2, devsize, + sra->devs + ? sra->devs->data_offset + : INVALID_SECTORS) + < sra->component_size) + || + (sra->component_size == 0 && devsize < component_size)) { + if (verbose > 1) + pr_err("not adding %s to %s as it is too small\n", + devname, mp->path); + goto next; + } + /* test against target. + * If 'target' is set and 'bare' is false, we only accept + * arrays/containers that match 'target'. + * If 'target' is set and 'bare' is true, we prefer the + * array which matches 'target'. + * target is considered only if we deal with degraded array + */ + if (target && policy_action_allows(pol, st2->ss->name, + act_spare_same_slot)) { + if (strcmp(target->metadata, mp->metadata) == 0 && + memcmp(target->uuid, mp->uuid, + sizeof(target->uuid)) == 0 && + sra->array.failed_disks > 0) { + /* This is our target!! */ + if (chosen) + sysfs_free(chosen); + chosen = sra; + sra = NULL; + /* skip to end so we don't check any more */ + while (mp->next) + mp = mp->next; + goto next; + } + /* not our target */ + if (!bare) + goto next; + } + + dl = domain_from_array(sra, st2->ss->name); + if (domain_test(dl, pol, st2->ss->name) != 1) { + /* domain test fails */ + if (verbose > 1) + pr_err("not adding %s to %s as it is not in a compatible domain\n", + devname, mp->path); + + goto next; + } + /* all tests passed, OK to add to this array */ + if (!chosen) { + chosen = sra; + sra = NULL; + } else if (chosen->array.failed_disks < sra->array.failed_disks) { + sysfs_free(chosen); + chosen = sra; + sra = NULL; + } + next: + if (sra) + sysfs_free(sra); + if (st != st2) + free(st2); + if (dl) + domain_free(dl); + } + if (chosen) { + /* add current device to chosen array as a spare */ + int mdfd = open_dev(chosen->sys_name); + if (mdfd >= 0) { + struct mddev_dev devlist; + char devname[20]; + devlist.next = NULL; + devlist.used = 0; + devlist.writemostly = 0; + devlist.devname = devname; + sprintf(devname, "%d:%d", major(stb.st_rdev), + minor(stb.st_rdev)); + devlist.disposition = 'a'; + close(dfd); + *dfdp = -1; + rv = Manage_subdevs(chosen->sys_name, mdfd, &devlist, + -1, 0, NULL, 0); + close(mdfd); + } + if (verbose > 0) { + if (rv == 0) + pr_err("added %s as spare for %s\n", + devname, chosen->sys_name); + else + pr_err("failed to add %s as spare for %s\n", + devname, chosen->sys_name); + } + sysfs_free(chosen); + } + map_unlock(&map); + return rv; +} + +static int partition_try_spare(char *devname, int *dfdp, struct dev_policy *pol, + struct supertype *st, int verbose) +{ + /* we know that at least one partition virtual-metadata is + * allowed to incorporate spares like this device. We need to + * find a suitable device to copy partition information from. + * + * Getting a list of all disk (not partition) devices is + * slightly non-trivial. We could look at /sys/block, but + * that is theoretically due to be removed. Maybe best to use + * /dev/disk/by-path/?* and ignore names ending '-partNN' as + * we depend on this directory of 'path' info. But that fails + * to find loop devices and probably others. Maybe don't + * worry about that, they aren't the real target. + * + * So: check things in /dev/disk/by-path to see if they are in + * a compatible domain, then load the partition table and see + * if it is OK for the new device, and choose the largest + * partition table that fits. + */ + DIR *dir; + struct dirent *de; + char *chosen = NULL; + unsigned long long chosen_size = 0; + struct supertype *chosen_st = NULL; + int fd; + + dir = opendir("/dev/disk/by-path"); + if (!dir) + return 1; + while ((de = readdir(dir)) != NULL) { + char *ep; + struct dev_policy *pol2 = NULL; + struct domainlist *domlist = NULL; + int fd = -1; + struct mdinfo info; + struct supertype *st2 = NULL; + char *devname = NULL; + unsigned long long devsectors; + + if (de->d_ino == 0 || + de->d_name[0] == '.' || + (de->d_type != DT_LNK && de->d_type != DT_UNKNOWN)) + goto next; + + ep = de->d_name + strlen(de->d_name); + while (ep > de->d_name && + isdigit(ep[-1])) + ep--; + if (ep > de->d_name + 5 && + strncmp(ep-5, "-part", 5) == 0) + /* This is a partition - skip it */ + goto next; + + pol2 = path_policy(de->d_name, type_disk); + + domain_merge(&domlist, pol2, st ? st->ss->name : NULL); + if (domain_test(domlist, pol, st ? st->ss->name : NULL) != 1) + /* new device is incompatible with this device. */ + goto next; + + domain_free(domlist); + domlist = NULL; + + if (asprintf(&devname, "/dev/disk/by-path/%s", de->d_name) != 1) { + devname = NULL; + goto next; + } + fd = open(devname, O_RDONLY); + if (fd < 0) + goto next; + if (get_dev_size(fd, devname, &devsectors) == 0) + goto next; + devsectors >>= 9; + + if (st) + st2 = dup_super(st); + else + st2 = guess_super_type(fd, guess_partitions); + if (st2 == NULL || + st2->ss->load_super(st2, fd, NULL) < 0) + goto next; + st2->ignore_hw_compat = 0; + + if (!st) { + /* Check domain policy again, this time referring to metadata */ + domain_merge(&domlist, pol2, st2->ss->name); + if (domain_test(domlist, pol, st2->ss->name) != 1) + /* Incompatible devices for this metadata type */ + goto next; + if (!policy_action_allows(pol, st2->ss->name, act_spare)) + /* Some partition types allow sparing, but not + * this one. + */ + goto next; + } + + st2->ss->getinfo_super(st2, &info, NULL); + if (info.component_size > devsectors) + /* This partitioning doesn't fit in the device */ + goto next; + + /* This is an acceptable device to copy partition + * metadata from. We could just stop here, but I + * think I want to keep looking incase a larger + * metadata which makes better use of the device can + * be found. + */ + if (chosen == NULL || + chosen_size < info.component_size) { + chosen_size = info.component_size; + free(chosen); + chosen = devname; + devname = NULL; + if (chosen_st) { + chosen_st->ss->free_super(chosen_st); + free(chosen_st); + } + chosen_st = st2; + st2 = NULL; + } + + next: + free(devname); + domain_free(domlist); + dev_policy_free(pol2); + if (st2) + st2->ss->free_super(st2); + free(st2); + + if (fd >= 0) + close(fd); + } + + closedir(dir); + + if (!chosen) + return 1; + + /* 'chosen' is the best device we can find. Let's write its + * metadata to devname dfd is read-only so don't use that + */ + fd = open(devname, O_RDWR); + if (fd >= 0) { + chosen_st->ss->store_super(chosen_st, fd); + close(fd); + } + free(chosen); + chosen_st->ss->free_super(chosen_st); + free(chosen_st); + return 0; +} + +static int is_bare(int dfd) +{ + unsigned long long size = 0; + char bufpad[4096 + 4096]; + char *buf = (char*)(((long)bufpad + 4096) & ~4095); + + if (lseek(dfd, 0, SEEK_SET) != 0 || + read(dfd, buf, 4096) != 4096) + return 0; + + if (buf[0] != '\0' && buf[0] != '\x5a' && buf[0] != '\xff') + return 0; + if (memcmp(buf, buf+1, 4095) != 0) + return 0; + + /* OK, first 4K appear blank, try the end. */ + get_dev_size(dfd, NULL, &size); + if (lseek(dfd, size-4096, SEEK_SET) < 0 || + read(dfd, buf, 4096) != 4096) + return 0; + + if (buf[0] != '\0' && buf[0] != '\x5a' && buf[0] != '\xff') + return 0; + if (memcmp(buf, buf+1, 4095) != 0) + return 0; + + return 1; +} + +/* adding a spare to a regular array is quite different from adding one to + * a set-of-partitions virtual array. + * This function determines which is worth trying and tries as appropriate. + * Arrays are given priority over partitions. + */ +static int try_spare(char *devname, int *dfdp, struct dev_policy *pol, + struct map_ent *target, + struct supertype *st, int verbose) +{ + int i; + int rv; + int arrays_ok = 0; + int partitions_ok = 0; + int dfd = *dfdp; + int bare; + + /* Can only add a spare if device has at least one domain */ + if (pol_find(pol, pol_domain) == NULL) + return 1; + /* And only if some action allows spares */ + if (!policy_action_allows(pol, st?st->ss->name:NULL, act_spare)) + return 1; + + /* Now check if the device is bare. + * bare devices can always be added as a spare + * non-bare devices can only be added if spare-same-slot is permitted, + * and this device is replacing a previous device - in which case 'target' + * will be set. + */ + if (!is_bare(dfd)) { + /* Must have a target and allow same_slot */ + /* Later - may allow force_spare without target */ + if (!target || + !policy_action_allows(pol, st?st->ss->name:NULL, + act_spare_same_slot)) { + if (verbose > 1) + pr_err("%s is not bare, so not considering as a spare\n", + devname); + return 1; + } + bare = 0; + } else + bare = 1; + + /* It might be OK to add this device to an array - need to see + * what arrays might be candidates. + */ + if (st) { + /* just try try 'array' or 'partition' based on this metadata */ + if (st->ss->add_to_super) + return array_try_spare(devname, dfdp, pol, target, bare, + st, verbose); + else + return partition_try_spare(devname, dfdp, pol, + st, verbose); + } + /* No metadata was specified or found so options are open. + * Check for whether any array metadata, or any partition metadata + * might allow adding the spare. This check is just help to avoid + * a more costly scan of all arrays when we can be sure that will + * fail. + */ + for (i = 0; (!arrays_ok || !partitions_ok) && superlist[i] ; i++) { + if (superlist[i]->add_to_super && !arrays_ok && + policy_action_allows(pol, superlist[i]->name, act_spare)) + arrays_ok = 1; + if (superlist[i]->add_to_super == NULL && !partitions_ok && + policy_action_allows(pol, superlist[i]->name, act_spare)) + partitions_ok = 1; + } + rv = 1; + if (arrays_ok) + rv = array_try_spare(devname, dfdp, pol, target, bare, + st, verbose); + if (rv != 0 && partitions_ok) + rv = partition_try_spare(devname, dfdp, pol, st, verbose); + return rv; +} + +int IncrementalScan(struct context *c, char *devnm) +{ + /* look at every device listed in the 'map' file. + * If one is found that is not running then: + * look in mdadm.conf for bitmap file. + * if one exists, but array has none, add it. + * try to start array in auto-readonly mode + */ + struct map_ent *mapl = NULL; + struct map_ent *me; + struct mddev_ident *devs, *mddev; + int rv = 0; + char container[32]; + char *only = NULL; + + map_read(&mapl); + devs = conf_get_ident(NULL); + +restart: + for (me = mapl ; me ; me = me->next) { + mdu_array_info_t array; + mdu_bitmap_file_t bmf; + struct mdinfo *sra; + int mdfd; + + if (devnm && strcmp(devnm, me->devnm) != 0) + continue; + if (devnm && me->metadata[0] == '/') { + char *sl; + /* member array, need to work on container */ + strncpy(container, me->metadata+1, 32); + container[31] = 0; + sl = strchr(container, '/'); + if (sl) + *sl = 0; + only = devnm; + devnm = container; + goto restart; + } + mdfd = open_dev(me->devnm); + + if (mdfd < 0) + continue; + if (!isdigit(me->metadata[0])) { + /* must be a container */ + struct supertype *st = super_by_fd(mdfd, NULL); + int ret = 0; + struct map_ent *map = NULL; + + if (st && st->ss->load_container) + ret = st->ss->load_container(st, mdfd, NULL); + close(mdfd); + if (!ret && st && st->ss->container_content) { + if (map_lock(&map)) + pr_err("failed to get exclusive lock on mapfile\n"); + ret = Incremental_container(st, me->path, c, only); + map_unlock(&map); + } + if (ret) + rv = 1; + continue; + } + if (ioctl(mdfd, GET_ARRAY_INFO, &array) == 0 || + errno != ENODEV) { + close(mdfd); + continue; + } + /* Ok, we can try this one. Maybe it needs a bitmap */ + for (mddev = devs ; mddev ; mddev = mddev->next) + if (mddev->devname && me->path + && devname_matches(mddev->devname, me->path)) + break; + if (mddev && mddev->bitmap_file) { + /* + * Note: early kernels will wrongly fail this, so it + * is a hint only + */ + int added = -1; + if (ioctl(mdfd, GET_ARRAY_INFO, &bmf) < 0) { + int bmfd = open(mddev->bitmap_file, O_RDWR); + if (bmfd >= 0) { + added = ioctl(mdfd, SET_BITMAP_FILE, + bmfd); + close(bmfd); + } + } + if (c->verbose >= 0) { + if (added == 0) + pr_err("Added bitmap %s to %s\n", + mddev->bitmap_file, me->path); + else if (errno != EEXIST) + pr_err("Failed to add bitmap to %s: %s\n", + me->path, strerror(errno)); + } + } + /* FIXME check for reshape_active and consider not + * starting array. + */ + sra = sysfs_read(mdfd, NULL, 0); + if (sra) { + if (sysfs_set_str(sra, NULL, + "array_state", "read-auto") == 0) { + if (c->verbose >= 0) + pr_err("started array %s\n", + me->path ?: me->devnm); + } else { + pr_err("failed to start array %s: %s\n", + me->path ?: me->devnm, + strerror(errno)); + rv = 1; + } + sysfs_free(sra); + } + } + return rv; +} + +static char *container2devname(char *devname) +{ + char *mdname = NULL; + + if (devname[0] == '/') { + int fd = open(devname, O_RDONLY); + if (fd >= 0) { + mdname = xstrdup(fd2devnm(fd)); + close(fd); + } + } else { + int uuid[4]; + struct map_ent *mp, *map = NULL; + + if (!parse_uuid(devname, uuid)) + return mdname; + mp = map_by_uuid(&map, uuid); + if (mp) + mdname = xstrdup(mp->devnm); + map_free(map); + } + + return mdname; +} + +static int Incremental_container(struct supertype *st, char *devname, + struct context *c, char *only) +{ + /* Collect the contents of this container and for each + * array, choose a device name and assemble the array. + */ + + struct mdinfo *list; + struct mdinfo *ra; + struct map_ent *map = NULL; + struct mdinfo info; + int trustworthy; + struct mddev_ident *match; + int rv = 0; + struct domainlist *domains; + struct map_ent *smp; + int suuid[4]; + int sfd; + int ra_blocked = 0; + int ra_all = 0; + int result = 0; + + st->ss->getinfo_super(st, &info, NULL); + + if ((c->runstop > 0 && info.container_enough >= 0) || + info.container_enough > 0) + /* pass */; + else { + if (c->export) { + printf("MD_STARTED=no\n"); + } else if (c->verbose) + pr_err("not enough devices to start the container\n"); + return 0; + } + + match = conf_match(st, &info, devname, c->verbose, &rv); + if (match == NULL && rv == 2) + return rv; + + /* Need to compute 'trustworthy' */ + if (match) + trustworthy = LOCAL; + else if (st->ss->match_home(st, c->homehost) == 1) + trustworthy = LOCAL; + else if (st->ss->match_home(st, "any") == 1) + trustworthy = LOCAL; + else + trustworthy = FOREIGN; + + list = st->ss->container_content(st, NULL); + /* when nothing to activate - quit */ + if (list == NULL) { + if (c->export) { + printf("MD_STARTED=nothing\n"); + } + return 0; + } + for (ra = list ; ra ; ra = ra->next) { + int mdfd; + char chosen_name[1024]; + struct map_ent *mp; + struct mddev_ident *match = NULL; + + ra_all++; + /* do not activate arrays blocked by metadata handler */ + if (ra->array.state & (1 << MD_SB_BLOCK_VOLUME)) { + pr_err("Cannot activate array %s in %s.\n", + ra->text_version, devname); + ra_blocked++; + continue; + } + mp = map_by_uuid(&map, ra->uuid); + + if (mp) { + mdfd = open_dev(mp->devnm); + if (mp->path) + strcpy(chosen_name, mp->path); + else + strcpy(chosen_name, mp->devnm); + } else if (!only) { + + /* Check in mdadm.conf for container == devname and + * member == ra->text_version after second slash. + */ + char *sub = strchr(ra->text_version+1, '/'); + struct mddev_ident *array_list; + if (sub) { + sub++; + array_list = conf_get_ident(NULL); + } else + array_list = NULL; + for(; array_list ; array_list = array_list->next) { + char *dn; + if (array_list->member == NULL || + array_list->container == NULL) + continue; + if (strcmp(array_list->member, sub) != 0) + continue; + if (array_list->uuid_set && + !same_uuid(ra->uuid, array_list->uuid, st->ss->swapuuid)) + continue; + dn = container2devname(array_list->container); + if (dn == NULL) + continue; + if (strncmp(dn, ra->text_version+1, + strlen(dn)) != 0 || + ra->text_version[strlen(dn)+1] != '/') { + free(dn); + continue; + } + free(dn); + /* we have a match */ + match = array_list; + if (c->verbose>0) + pr_err("match found for member %s\n", + array_list->member); + break; + } + + if (match && match->devname && + strcasecmp(match->devname, "<ignore>") == 0) { + if (c->verbose > 0) + pr_err("array %s/%s is explicitly ignored by mdadm.conf\n", + match->container, match->member); + continue; + } + if (match) + trustworthy = LOCAL; + + mdfd = create_mddev(match ? match->devname : NULL, + ra->name, + c->autof, + trustworthy, + chosen_name); + } + if (only && (!mp || strcmp(mp->devnm, only) != 0)) + continue; + + if (mdfd < 0) { + pr_err("failed to open %s: %s.\n", + chosen_name, strerror(errno)); + return 2; + } + + assemble_container_content(st, mdfd, ra, c, + chosen_name, &result); + close(mdfd); + } + if (c->export && result) { + char sep = '='; + printf("MD_STARTED"); + if (result & INCR_NO) { + printf("%cno", sep); + sep = ','; + } + if (result & INCR_UNSAFE) { + printf("%cunsafe", sep); + sep = ','; + } + if (result & INCR_ALREADY) { + printf("%calready", sep); + sep = ','; + } + if (result & INCR_YES) { + printf("%cyes", sep); + sep = ','; + } + printf("\n"); + } + + /* don't move spares to container with volume being activated + when all volumes are blocked */ + if (ra_all == ra_blocked) + return 0; + + /* Now move all suitable spares from spare container */ + domains = domain_from_array(list, st->ss->name); + memcpy(suuid, uuid_zero, sizeof(int[4])); + if (domains && + (smp = map_by_uuid(&map, suuid)) != NULL && + (sfd = open(smp->path, O_RDONLY)) >= 0) { + /* spare container found */ + struct supertype *sst = + super_imsm.match_metadata_desc("imsm"); + struct mdinfo *sinfo; + unsigned long long min_size = 0; + if (st->ss->min_acceptable_spare_size) + min_size = st->ss->min_acceptable_spare_size(st); + if (!sst->ss->load_container(sst, sfd, NULL)) { + close(sfd); + sinfo = container_choose_spares(sst, min_size, + domains, NULL, + st->ss->name, 0); + sst->ss->free_super(sst); + if (sinfo){ + int count = 0; + struct mdinfo *disks = sinfo->devs; + while (disks) { + /* move spare from spare + * container to currently + * assembled one + */ + if (move_spare( + smp->path, + devname, + makedev(disks->disk.major, + disks->disk.minor))) + count++; + disks = disks->next; + } + if (count) + pr_err("Added %d spare%s to %s\n", + count, count>1?"s":"", devname); + } + sysfs_free(sinfo); + } else + close(sfd); + } + domain_free(domains); + return 0; +} + +static void run_udisks(char *arg1, char *arg2) +{ + int pid = fork(); + int status; + if (pid == 0) { + execl("/usr/bin/udisks", "udisks", arg1, arg2, NULL); + execl("/bin/udisks", "udisks", arg1, arg2, NULL); + exit(1); + } + while (pid > 0 && wait(&status) != pid) + ; +} + +/* + * IncrementalRemove - Attempt to see if the passed in device belongs to any + * raid arrays, and if so first fail (if needed) and then remove the device. + * + * @devname - The device we want to remove + * @id_path - name as found in /dev/disk/by-path for this device + * + * Note: the device name must be a kernel name like "sda", so + * that we can find it in /proc/mdstat + */ +int IncrementalRemove(char *devname, char *id_path, int verbose) +{ + int mdfd; + int rv = 0; + struct mdstat_ent *ent; + struct mddev_dev devlist; + struct mdinfo mdi; + char buf[32]; + + if (!id_path) + dprintf("incremental removal without --path <id_path> lacks the possibility to re-add new device in this port\n"); + + if (strchr(devname, '/')) { + pr_err("incremental removal requires a kernel device name, not a file: %s\n", devname); + return 1; + } + ent = mdstat_by_component(devname); + if (!ent) { + if (verbose >= 0) + pr_err("%s does not appear to be a component of any array\n", devname); + return 1; + } + sysfs_init(&mdi, -1, ent->devnm); + mdfd = open_dev_excl(ent->devnm); + if (mdfd > 0) { + close(mdfd); + if (sysfs_get_str(&mdi, NULL, "array_state", + buf, sizeof(buf)) > 0) { + if (strncmp(buf, "active", 6) == 0 || + strncmp(buf, "clean", 5) == 0) + sysfs_set_str(&mdi, NULL, + "array_state", "read-auto"); + } + } + mdfd = open_dev(ent->devnm); + if (mdfd < 0) { + if (verbose >= 0) + pr_err("Cannot open array %s!!\n", ent->devnm); + free_mdstat(ent); + return 1; + } + + if (id_path) { + struct map_ent *map = NULL, *me; + me = map_by_devnm(&map, ent->devnm); + if (me) + policy_save_path(id_path, me); + map_free(map); + } + + memset(&devlist, 0, sizeof(devlist)); + devlist.devname = devname; + devlist.disposition = 'f'; + /* for a container, we must fail each member array */ + if (ent->metadata_version && + strncmp(ent->metadata_version, "external:", 9) == 0) { + struct mdstat_ent *mdstat = mdstat_read(0, 0); + struct mdstat_ent *memb; + for (memb = mdstat ; memb ; memb = memb->next) + if (is_container_member(memb, ent->devnm)) { + int subfd = open_dev(memb->devnm); + if (subfd >= 0) { + rv |= Manage_subdevs( + memb->devnm, subfd, + &devlist, verbose, 0, + NULL, 0); + close(subfd); + } + } + free_mdstat(mdstat); + } else + rv |= Manage_subdevs(ent->devnm, mdfd, &devlist, + verbose, 0, NULL, 0); + if (rv & 2) { + /* Failed due to EBUSY, try to stop the array. + * Give udisks a chance to unmount it first. + */ + int devid = devnm2devid(ent->devnm); + run_udisks("--unmount", map_dev(major(devid),minor(devid), 0)); + rv = Manage_stop(ent->devnm, mdfd, verbose, 1); + if (rv) + /* At least we can try to trigger a 'remove' */ + sysfs_uevent(&mdi, "remove"); + if (verbose) { + if (rv) + pr_err("Fail to stop %s too.\n", ent->devnm); + } + } else { + devlist.disposition = 'r'; + rv = Manage_subdevs(ent->devnm, mdfd, &devlist, + verbose, 0, NULL, 0); + } + close(mdfd); + free_mdstat(ent); + return rv; +} @@ -0,0 +1,146 @@ +/* + * mdadm - manage Linux "md" devices aka RAID arrays. + * + * Copyright (C) 2001-2009 Neil Brown <neilb@suse.de> + * + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Author: Neil Brown + * Email: <neilb@suse.de> + * + * Added by Dale Stephenson + * steph@snapserver.com + */ + +#include "mdadm.h" +#include "md_u.h" +#include "md_p.h" + +int Kill(char *dev, struct supertype *st, int force, int verbose, int noexcl) +{ + /* + * Nothing fancy about Kill. It just zeroes out a superblock + * Definitely not safe. + * Returns: + * 0 - a zero superblock was successfully written out + * 1 - failed to write the zero superblock + * 2 - failed to open the device or find a superblock. + */ + + int fd, rv = 0; + + if (force) + noexcl = 1; + fd = open(dev, O_RDWR|(noexcl ? 0 : O_EXCL)); + if (fd < 0) { + if (verbose >= 0) + pr_err("Couldn't open %s for write - not zeroing\n", + dev); + return 2; + } + if (st == NULL) + st = guess_super(fd); + if (st == NULL || st->ss->init_super == NULL) { + if (verbose >= 0) + pr_err("Unrecognised md component device - %s\n", dev); + close(fd); + return 2; + } + st->ignore_hw_compat = 1; + rv = st->ss->load_super(st, fd, dev); + if (rv == 0 || (force && rv >= 2)) { + st->ss->free_super(st); + st->ss->init_super(st, NULL, 0, "", NULL, NULL, + INVALID_SECTORS); + if (st->ss->store_super(st, fd)) { + if (verbose >= 0) + pr_err("Could not zero superblock on %s\n", + dev); + rv = 1; + } else if (rv) { + if (verbose >= 0) + pr_err("superblock zeroed anyway\n"); + rv = 0; + } + } + close(fd); + return rv; +} + +int Kill_subarray(char *dev, char *subarray, int verbose) +{ + /* Delete a subarray out of a container, the subarry must be + * inactive. The subarray string must be a subarray index + * number. + * + * 0 = successfully deleted subarray from all container members + * 1 = failed to sync metadata to one or more devices + * 2 = failed to find the container, subarray, or other resource + * issue + */ + struct supertype supertype, *st = &supertype; + int fd, rv = 2; + + memset(st, 0, sizeof(*st)); + + fd = open_subarray(dev, subarray, st, verbose < 0); + if (fd < 0) + return 2; + + if (!st->ss->kill_subarray) { + if (verbose >= 0) + pr_err("Operation not supported for %s metadata\n", + st->ss->name); + goto free_super; + } + + if (is_subarray_active(subarray, st->devnm)) { + if (verbose >= 0) + pr_err("Subarray-%s still active, aborting\n", + subarray); + goto free_super; + } + + if (mdmon_running(st->devnm)) + st->update_tail = &st->updates; + + /* ok we've found our victim, drop the axe */ + rv = st->ss->kill_subarray(st); + if (rv) { + if (verbose >= 0) + pr_err("Failed to delete subarray-%s from %s\n", + subarray, dev); + goto free_super; + } + + /* FIXME these routines do not report success/failure */ + if (st->update_tail) + flush_metadata_updates(st); + else + st->ss->sync_metadata(st); + + if (verbose >= 0) + pr_err("Deleted subarray-%s from %s, UUIDs may have changed\n", + subarray, dev); + + rv = 0; + + free_super: + st->ss->free_super(st); + close(fd); + + return rv; +} diff --git a/Makefile b/Makefile new file mode 100644 index 00000000..a02a97f3 --- /dev/null +++ b/Makefile @@ -0,0 +1,334 @@ +# +# mdadm - manage Linux "md" devices aka RAID arrays. +# +# Copyright (C) 2001-2002 Neil Brown <neilb@cse.unsw.edu.au> +# Copyright (C) 2013 Neil Brown <neilb@suse.de> +# +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +# +# Author: Neil Brown +# Email: <neilb@cse.unsw.edu.au> +# Paper: Neil Brown +# School of Computer Science and Engineering +# The University of New South Wales +# Sydney, 2052 +# Australia +# + +# define "CXFLAGS" to give extra flags to CC. +# e.g. make CXFLAGS=-O to optimise +TCC = tcc +UCLIBC_GCC = $(shell for nm in i386-uclibc-linux-gcc i386-uclibc-gcc; do which $$nm > /dev/null && { echo $$nm ; exit; } ; done; echo false No uclibc found ) +#DIET_GCC = diet gcc +# sorry, but diet-libc doesn't know about posix_memalign, +# so we cannot use it any more. +DIET_GCC = gcc -DHAVE_STDINT_H + +KLIBC=/home/src/klibc/klibc-0.77 + +KLIBC_GCC = gcc -nostdinc -iwithprefix include -I$(KLIBC)/klibc/include -I$(KLIBC)/linux/include -I$(KLIBC)/klibc/arch/i386/include -I$(KLIBC)/klibc/include/bits32 + +CC = $(CROSS_COMPILE)gcc +CXFLAGS ?= -ggdb +CWFLAGS = -Wall -Wstrict-prototypes -Wextra -Wno-unused-parameter +ifdef WARN_UNUSED +CWFLAGS += -Wp,-D_FORTIFY_SOURCE=2 -O3 +endif + +ifdef DEBIAN +CPPFLAGS += -DDEBIAN +endif +ifdef DEFAULT_OLD_METADATA + CPPFLAGS += -DDEFAULT_OLD_METADATA + DEFAULT_METADATA=0.90 +else + DEFAULT_METADATA=1.2 +endif +CPPFLAGS += -DBINDIR=\"$(BINDIR)\" + +PKG_CONFIG ?= pkg-config + +SYSCONFDIR = /etc +CONFFILE = $(SYSCONFDIR)/mdadm/mdadm.conf +CONFFILE2 = $(SYSCONFDIR)/mdadm.conf +MAILCMD =/usr/sbin/sendmail -t +CONFFILEFLAGS = -DCONFFILE=\"$(CONFFILE)\" -DCONFFILE2=\"$(CONFFILE2)\" +# Both MAP_DIR and MDMON_DIR should be somewhere that persists across the +# pivotroot from early boot to late boot. +# /run is best, but for distros that don't support that. +# /dev can work, in which case you probably want /dev/.mdadm +RUN_DIR=/run/mdadm +CHECK_RUN_DIR=1 +MAP_DIR=$(RUN_DIR) +MAP_FILE = map +MAP_PATH = $(MAP_DIR)/$(MAP_FILE) +MDMON_DIR = $(RUN_DIR) +# place for autoreplace cookies +FAILED_SLOTS_DIR = $(RUN_DIR)/failed-slots +SYSTEMD_DIR=/lib/systemd/system +DIRFLAGS = -DMAP_DIR=\"$(MAP_DIR)\" -DMAP_FILE=\"$(MAP_FILE)\" +DIRFLAGS += -DMDMON_DIR=\"$(MDMON_DIR)\" +DIRFLAGS += -DFAILED_SLOTS_DIR=\"$(FAILED_SLOTS_DIR)\" +CFLAGS = $(CWFLAGS) $(CXFLAGS) -DSendmail=\""$(MAILCMD)"\" $(CONFFILEFLAGS) $(DIRFLAGS) + +VERSION = $(shell [ -d .git ] && git describe HEAD | sed 's/mdadm-//') +VERS_DATE = $(shell [ -d .git ] && date --date="`git log -n1 --format=format:%cd --date=short`" '+%0dth %B %Y' | sed -e 's/1th/1st/' -e 's/2th/2nd/' -e 's/11st/11th/' -e 's/12nd/12th/') +DVERS = $(if $(VERSION),-DVERSION=\"$(VERSION)\",) +DDATE = $(if $(VERS_DATE),-DVERS_DATE="\"$(VERS_DATE)\"",) +CFLAGS += $(DVERS) $(DDATE) + +# The glibc TLS ABI requires applications that call clone(2) to set up +# TLS data structures, use pthreads until mdmon implements this support +USE_PTHREADS = 1 +ifdef USE_PTHREADS +CFLAGS += -DUSE_PTHREADS +MON_LDFLAGS += -pthread +endif + +# If you want a static binary, you might uncomment these +# LDFLAGS = -static +# STRIP = -s + +INSTALL = /usr/bin/install +DESTDIR = +BINDIR = /sbin +MANDIR = /usr/share/man +MAN4DIR = $(MANDIR)/man4 +MAN5DIR = $(MANDIR)/man5 +MAN8DIR = $(MANDIR)/man8 + +UDEVDIR := $(shell $(PKG_CONFIG) --variable=udevdir udev 2>/dev/null) +ifndef UDEVDIR + UDEVDIR = /lib/udev +endif + +OBJS = mdadm.o config.o policy.o mdstat.o ReadMe.o util.o maps.o lib.o \ + Manage.o Assemble.o Build.o \ + Create.o Detail.o Examine.o Grow.o Monitor.o dlink.o Kill.o Query.o \ + Incremental.o Dump.o \ + mdopen.o super0.o super1.o super-ddf.o super-intel.o bitmap.o \ + super-mbr.o super-gpt.o \ + restripe.o sysfs.o sha1.o mapfile.o crc32.o sg_io.o msg.o xmalloc.o \ + platform-intel.o probe_roms.o + +CHECK_OBJS = restripe.o sysfs.o maps.o lib.o xmalloc.o dlink.o + +SRCS = $(patsubst %.o,%.c,$(OBJS)) + +INCL = mdadm.h part.h bitmap.h + +MON_OBJS = mdmon.o monitor.o managemon.o util.o maps.o mdstat.o sysfs.o \ + policy.o lib.o \ + Kill.o sg_io.o dlink.o ReadMe.o super-intel.o \ + super-mbr.o super-gpt.o \ + super-ddf.o sha1.o crc32.o msg.o bitmap.o xmalloc.o \ + platform-intel.o probe_roms.o + +MON_SRCS = $(patsubst %.o,%.c,$(MON_OBJS)) + +STATICSRC = pwgr.c +STATICOBJS = pwgr.o + +ASSEMBLE_SRCS := mdassemble.c Assemble.c Manage.c config.c policy.c dlink.c util.c \ + maps.c lib.c xmalloc.c \ + super0.c super1.c super-ddf.c super-intel.c sha1.c crc32.c sg_io.c mdstat.c \ + platform-intel.c probe_roms.c sysfs.c super-mbr.c super-gpt.c mapfile.c +ASSEMBLE_AUTO_SRCS := mdopen.c +ASSEMBLE_FLAGS:= $(CFLAGS) -DMDASSEMBLE +ifdef MDASSEMBLE_AUTO +ASSEMBLE_SRCS += $(ASSEMBLE_AUTO_SRCS) +ASSEMBLE_FLAGS += -DMDASSEMBLE_AUTO +endif + +all : mdadm mdmon +man : mdadm.man md.man mdadm.conf.man mdmon.man raid6check.man + +check_rundir: + @if [ ! -d "$(dir $(RUN_DIR))" -a "$(CHECK_RUN_DIR)" = 1 ]; then \ + echo "***** Parent of $(RUN_DIR) does not exist. Maybe set different RUN_DIR="; \ + echo "***** e.g. make RUN_DIR=/dev/.mdadm" ; \ + echo "***** or set CHECK_RUN_DIR=0"; exit 1; \ + fi + +everything: all mdadm.static swap_super test_stripe raid6check \ + mdassemble mdassemble.auto mdassemble.static mdassemble.man \ + mdadm.Os mdadm.O2 man +everything-test: all mdadm.static swap_super test_stripe \ + mdassemble.auto mdassemble.static mdassemble.man \ + mdadm.Os mdadm.O2 man +# mdadm.uclibc and mdassemble.uclibc don't work on x86-64 +# mdadm.tcc doesn't work.. + +mdadm : $(OBJS) | check_rundir + $(CC) $(CFLAGS) $(LDFLAGS) -o mdadm $(OBJS) $(LDLIBS) + +mdadm.static : $(OBJS) $(STATICOBJS) + $(CC) $(CFLAGS) $(LDFLAGS) -static -o mdadm.static $(OBJS) $(STATICOBJS) + +mdadm.tcc : $(SRCS) $(INCL) + $(TCC) -o mdadm.tcc $(SRCS) + +mdadm.klibc : $(SRCS) $(INCL) + rm -f $(OBJS) + $(CC) -nostdinc -iwithprefix include -I$(KLIBC)/klibc/include -I$(KLIBC)/linux/include -I$(KLIBC)/klibc/arch/i386/include -I$(KLIBC)/klibc/include/bits32 $(CFLAGS) $(SRCS) + +mdadm.Os : $(SRCS) $(INCL) + $(CC) -o mdadm.Os $(CFLAGS) $(CPPFLAGS) $(LDFLAGS) -DHAVE_STDINT_H -Os $(SRCS) + +mdadm.O2 : $(SRCS) $(INCL) mdmon.O2 + $(CC) -o mdadm.O2 $(CFLAGS) $(CPPFLAGS) $(LDFLAGS) -DHAVE_STDINT_H -O2 -D_FORTIFY_SOURCE=2 $(SRCS) + +mdmon.O2 : $(MON_SRCS) $(INCL) mdmon.h + $(CC) -o mdmon.O2 $(CFLAGS) $(CPPFLAGS) $(LDFLAGS) $(MON_LDFLAGS) -DHAVE_STDINT_H -O2 -D_FORTIFY_SOURCE=2 $(MON_SRCS) + +# use '-z now' to guarantee no dynamic linker interactions with the monitor thread +mdmon : $(MON_OBJS) | check_rundir + $(CC) $(CFLAGS) $(LDFLAGS) $(MON_LDFLAGS) -Wl,-z,now -o mdmon $(MON_OBJS) $(LDLIBS) +msg.o: msg.c msg.h + +test_stripe : restripe.c xmalloc.o mdadm.h + $(CC) $(CXFLAGS) $(LDFLAGS) -o test_stripe xmalloc.o -DMAIN restripe.c + +raid6check : raid6check.o mdadm.h $(CHECK_OBJS) + $(CC) $(CXFLAGS) $(LDFLAGS) -o raid6check raid6check.o $(CHECK_OBJS) + +mdassemble : $(ASSEMBLE_SRCS) $(INCL) + $(CC) $(CFLAGS) $(CPPFLAGS) $(LDFLAGS) $(ASSEMBLE_FLAGS) -o mdassemble $(ASSEMBLE_SRCS) $(STATICSRC) + +mdassemble.diet : $(ASSEMBLE_SRCS) $(INCL) + rm -f $(OBJS) + $(DIET_GCC) $(ASSEMBLE_FLAGS) -o mdassemble $(ASSEMBLE_SRCS) $(STATICSRC) + +mdassemble.static : $(ASSEMBLE_SRCS) $(INCL) + rm -f $(OBJS) + $(CC) $(LDFLAGS) $(CPPFLAGS) $(ASSEMBLE_FLAGS) -static -DHAVE_STDINT_H -o mdassemble.static $(ASSEMBLE_SRCS) $(STATICSRC) + +mdassemble.auto : $(ASSEMBLE_SRCS) $(INCL) $(ASSEMBLE_AUTO_SRCS) + rm -f mdassemble.static + $(MAKE) MDASSEMBLE_AUTO=1 mdassemble.static + mv mdassemble.static mdassemble.auto + +mdassemble.uclibc : $(ASSEMBLE_SRCS) $(INCL) + rm -f $(OJS) + $(UCLIBC_GCC) $(ASSEMBLE_FLAGS) -DUCLIBC -DHAVE_STDINT_H -static -o mdassemble.uclibc $(ASSEMBLE_SRCS) $(STATICSRC) + +# This doesn't work +mdassemble.klibc : $(ASSEMBLE_SRCS) $(INCL) + rm -f $(OBJS) + $(KLIBC_GCC) $(ASSEMBLE_FLAGS) -o mdassemble $(ASSEMBLE_SRCS) + +mdadm.8 : mdadm.8.in + sed -e 's/{DEFAULT_METADATA}/$(DEFAULT_METADATA)/g' \ + -e 's,{MAP_PATH},$(MAP_PATH),g' mdadm.8.in > mdadm.8 + +mdadm.man : mdadm.8 + man -l mdadm.8 > mdadm.man + +mdmon.man : mdmon.8 + man -l mdmon.8 > mdmon.man + +md.man : md.4 + man -l md.4 > md.man + +mdadm.conf.man : mdadm.conf.5 + man -l mdadm.conf.5 > mdadm.conf.man + +mdassemble.man : mdassemble.8 + man -l mdassemble.8 > mdassemble.man + +raid6check.man : raid6check.8 + man -l raid6check.8 > raid6check.man + +$(OBJS) : $(INCL) mdmon.h +$(MON_OBJS) : $(INCL) mdmon.h + +sha1.o : sha1.c sha1.h md5.h + $(CC) $(CFLAGS) -DHAVE_STDINT_H -o sha1.o -c sha1.c + +install : mdadm mdmon install-man install-udev + $(INSTALL) -D $(STRIP) -m 755 mdadm $(DESTDIR)$(BINDIR)/mdadm + $(INSTALL) -D $(STRIP) -m 755 mdmon $(DESTDIR)$(BINDIR)/mdmon + +install-static : mdadm.static install-man + $(INSTALL) -D $(STRIP) -m 755 mdadm.static $(DESTDIR)$(BINDIR)/mdadm + +install-tcc : mdadm.tcc install-man + $(INSTALL) -D $(STRIP) -m 755 mdadm.tcc $(DESTDIR)$(BINDIR)/mdadm + +install-uclibc : mdadm.uclibc install-man + $(INSTALL) -D $(STRIP) -m 755 mdadm.uclibc $(DESTDIR)$(BINDIR)/mdadm + +install-klibc : mdadm.klibc install-man + $(INSTALL) -D $(STRIP) -m 755 mdadm.klibc $(DESTDIR)$(BINDIR)/mdadm + +install-man: mdadm.8 md.4 mdadm.conf.5 mdmon.8 + $(INSTALL) -D -m 644 mdadm.8 $(DESTDIR)$(MAN8DIR)/mdadm.8 + $(INSTALL) -D -m 644 mdmon.8 $(DESTDIR)$(MAN8DIR)/mdmon.8 + $(INSTALL) -D -m 644 md.4 $(DESTDIR)$(MAN4DIR)/md.4 + $(INSTALL) -D -m 644 mdadm.conf.5 $(DESTDIR)$(MAN5DIR)/mdadm.conf.5 + +install-udev: udev-md-raid-arrays.rules udev-md-raid-assembly.rules + @for file in 63-md-raid-arrays.rules 64-md-raid-assembly.rules ; \ + do sed -e 's,BINDIR,$(BINDIR),g' udev-$${file#??-} > .install.tmp.1 && \ + echo $(INSTALL) -D -m 644 udev-$${file#??-} $(DESTDIR)$(UDEVDIR)/rules.d/$$file ; \ + $(INSTALL) -D -m 644 .install.tmp.1 $(DESTDIR)$(UDEVDIR)/rules.d/$$file ; \ + rm -f .install.tmp.1; \ + done + +install-systemd: systemd/mdmon@.service + @for file in mdmon@.service mdmonitor.service mdadm-last-resort@.timer \ + mdadm-last-resort@.service mdadm-grow-continue@.service; \ + do sed -e 's,BINDIR,$(BINDIR),g' systemd/$$file > .install.tmp.2 && \ + echo $(INSTALL) -D -m 644 systemd/$$file $(DESTDIR)$(SYSTEMD_DIR)/$$file ; \ + $(INSTALL) -D -m 644 .install.tmp.2 $(DESTDIR)$(SYSTEMD_DIR)/$$file ; \ + rm -f .install.tmp.2; \ + done + @for file in mdadm.shutdown ; \ + do sed -e 's,BINDIR,$(BINDIR),g' systemd/$$file > .install.tmp.3 && \ + echo $(INSTALL) -D -m 755 systemd/$$file $(DESTDIR)$(SYSTEMD_DIR)-shutdown/$$file ; \ + $(INSTALL) -D -m 755 .install.tmp.3 $(DESTDIR)$(SYSTEMD_DIR)-shutdown/$$file ; \ + rm -f .install.tmp.3; \ + done + if [ -f /etc/SuSE-release -o -n "$(SUSE)" ] ;then $(INSTALL) -D -m 755 systemd/SUSE-mdadm_env.sh $(DESTDIR)$(SYSTEMD_DIR)/../scripts/mdadm_env.sh ;fi + +uninstall: + rm -f $(DESTDIR)$(MAN8DIR)/mdadm.8 $(DESTDIR)$(MAN8DIR)/mdmon.8 $(DESTDIR)$(MAN4DIR)/md.4 $(DESTDIR)$(MAN5DIR)/mdadm.conf.5 $(DESTDIR)$(BINDIR)/mdadm + +test: mdadm mdmon test_stripe swap_super raid6check + @echo "Please run './test' as root" + +clean : + rm -f mdadm mdmon $(OBJS) $(MON_OBJS) $(STATICOBJS) core *.man \ + mdadm.tcc mdadm.uclibc mdadm.static *.orig *.porig *.rej *.alt .merge_file_* \ + mdadm.Os mdadm.O2 mdmon.O2 \ + mdassemble mdassemble.static mdassemble.auto mdassemble.uclibc \ + mdassemble.klibc swap_super \ + init.cpio.gz mdadm.uclibc.static test_stripe raid6check raid6check.o mdmon \ + mdadm.8 + +dist : clean + ./makedist + +testdist : everything-test clean + ./makedist test + +TAGS : + etags *.h *.c + +DISTRO_MAKEFILE := $(wildcard distropkg/Makefile) +ifdef DISTRO_MAKEFILE +include $(DISTRO_MAKEFILE) +endif diff --git a/Manage.c b/Manage.c new file mode 100644 index 00000000..47faeedb --- /dev/null +++ b/Manage.c @@ -0,0 +1,1697 @@ +/* + * mdadm - manage Linux "md" devices aka RAID arrays. + * + * Copyright (C) 2001-2013 Neil Brown <neilb@suse.de> + * + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Author: Neil Brown + * Email: <neilb@suse.de> + */ + +#include "mdadm.h" +#include "md_u.h" +#include "md_p.h" +#include <ctype.h> + +#define REGISTER_DEV _IO (MD_MAJOR, 1) +#define START_MD _IO (MD_MAJOR, 2) +#define STOP_MD _IO (MD_MAJOR, 3) + +int Manage_ro(char *devname, int fd, int readonly) +{ + /* switch to readonly or rw + * + * requires >= 0.90.0 + * first check that array is runing + * use RESTART_ARRAY_RW or STOP_ARRAY_RO + * + */ + mdu_array_info_t array; +#ifndef MDASSEMBLE + struct mdinfo *mdi; +#endif + int rv = 0; + + if (md_get_version(fd) < 9000) { + pr_err("need md driver version 0.90.0 or later\n"); + return 1; + } +#ifndef MDASSEMBLE + /* If this is an externally-managed array, we need to modify the + * metadata_version so that mdmon doesn't undo our change. + */ + mdi = sysfs_read(fd, NULL, GET_LEVEL|GET_VERSION); + if (mdi && + mdi->array.major_version == -1 && + is_subarray(mdi->text_version)) { + char vers[64]; + strcpy(vers, "external:"); + strcat(vers, mdi->text_version); + if (readonly > 0) { + int rv; + /* We set readonly ourselves. */ + vers[9] = '-'; + sysfs_set_str(mdi, NULL, "metadata_version", vers); + + close(fd); + rv = sysfs_set_str(mdi, NULL, "array_state", "readonly"); + + if (rv < 0) { + pr_err("failed to set readonly for %s: %s\n", + devname, strerror(errno)); + + vers[9] = mdi->text_version[0]; + sysfs_set_str(mdi, NULL, "metadata_version", vers); + rv = 1; + goto out; + } + } else { + char *cp; + /* We cannot set read/write - must signal mdmon */ + vers[9] = '/'; + sysfs_set_str(mdi, NULL, "metadata_version", vers); + + cp = strchr(vers+10, '/'); + if (cp) + *cp = 0; + ping_monitor(vers+10); + if (mdi->array.level <= 0) + sysfs_set_str(mdi, NULL, "array_state", "active"); + } + goto out; + } +#endif + if (ioctl(fd, GET_ARRAY_INFO, &array)) { + pr_err("%s does not appear to be active.\n", + devname); + rv = 1; + goto out; + } + + if (readonly > 0) { + if (ioctl(fd, STOP_ARRAY_RO, NULL)) { + pr_err("failed to set readonly for %s: %s\n", + devname, strerror(errno)); + rv = 1; + goto out; + } + } else if (readonly < 0) { + if (ioctl(fd, RESTART_ARRAY_RW, NULL)) { + pr_err("failed to set writable for %s: %s\n", + devname, strerror(errno)); + rv = 1; + goto out; + } + } +out: +#ifndef MDASSEMBLE + if (mdi) + sysfs_free(mdi); +#endif + return rv; +} + +#ifndef MDASSEMBLE + +static void remove_devices(char *devnm, char *path) +{ + /* + * Remove names at 'path' - possibly with + * partition suffixes - which link to the 'standard' + * name for devnm. These were probably created + * by mdadm when the array was assembled. + */ + char base[40]; + char *path2; + char link[1024]; + int n; + int part; + char *be; + char *pe; + + if (!path) + return; + + sprintf(base, "/dev/%s", devnm); + be = base + strlen(base); + + path2 = xmalloc(strlen(path)+20); + strcpy(path2, path); + pe = path2 + strlen(path2); + + for (part = 0; part < 16; part++) { + if (part) { + sprintf(be, "p%d", part); + + if (isdigit(pe[-1])) + sprintf(pe, "p%d", part); + else + sprintf(pe, "%d", part); + } + n = readlink(path2, link, sizeof(link)); + if (n > 0 && (int)strlen(base) == n && + strncmp(link, base, n) == 0) + unlink(path2); + } + free(path2); +} + +int Manage_run(char *devname, int fd, struct context *c) +{ + /* Run the array. Array must already be configured + * Requires >= 0.90.0 + */ + char nm[32], *nmp; + + if (md_get_version(fd) < 9000) { + pr_err("need md driver version 0.90.0 or later\n"); + return 1; + } + nmp = fd2devnm(fd); + if (!nmp) { + pr_err("Cannot find %s in sysfs!!\n", devname); + return 1; + } + strcpy(nm, nmp); + return IncrementalScan(c, nm); +} + +int Manage_stop(char *devname, int fd, int verbose, int will_retry) +{ + /* Stop the array. Array must already be configured + * 'will_retry' means that error messages are not wanted. + */ + int rv = 0; + struct map_ent *map = NULL; + struct mdinfo *mdi; + char devnm[32]; + char container[32]; + int err; + int count; + char buf[32]; + unsigned long long rd1, rd2; + + if (will_retry && verbose == 0) + verbose = -1; + + if (md_get_version(fd) < 9000) { + if (ioctl(fd, STOP_MD, 0) == 0) + return 0; + pr_err("stopping device %s failed: %s\n", + devname, strerror(errno)); + return 1; + } + + strcpy(devnm, fd2devnm(fd)); + /* Get EXCL access first. If this fails, then attempting + * to stop is probably a bad idea. + */ + mdi = sysfs_read(fd, NULL, GET_LEVEL|GET_COMPONENT|GET_VERSION); + if (mdi && is_subarray(mdi->text_version)) { + char *sl; + strncpy(container, mdi->text_version+1, sizeof(container)); + container[sizeof(container)-1] = 0; + sl = strchr(container, '/'); + if (sl) + *sl = 0; + } else + container[0] = 0; + close(fd); + count = 5; + while (((fd = ((devname[0] == '/') + ?open(devname, O_RDONLY|O_EXCL) + :open_dev_flags(devnm, O_RDONLY|O_EXCL))) < 0 + || strcmp(fd2devnm(fd), devnm) != 0) + && container[0] + && mdmon_running(container) + && count) { + /* Can't open, so something might be wrong. However it + * is a container, so we might be racing with mdmon, so + * retry for a bit. + */ + if (fd >= 0) + close(fd); + flush_mdmon(container); + count--; + } + if (fd < 0 || strcmp(fd2devnm(fd), devnm) != 0) { + if (fd >= 0) + close(fd); + if (verbose >= 0) + pr_err("Cannot get exclusive access to %s:Perhaps a running process, mounted filesystem or active volume group?\n", + devname); + return 1; + } + /* If this is an mdmon managed array, just write 'inactive' + * to the array state and let mdmon clear up. + */ + if (mdi && + mdi->array.level > 0 && + is_subarray(mdi->text_version)) { + int err; + /* This is mdmon managed. */ + close(fd); + + /* As we had an O_EXCL open, any use of the device + * which blocks STOP_ARRAY is probably a transient use, + * so it is reasonable to retry for a while - 5 seconds. + */ + count = 25; + while (count && + (err = sysfs_set_str(mdi, NULL, + "array_state", + "inactive")) < 0 + && errno == EBUSY) { + usleep(200000); + count--; + } + if (err) { + if (verbose >= 0) + pr_err("failed to stop array %s: %s\n", + devname, strerror(errno)); + rv = 1; + goto out; + } + + /* Give monitor a chance to act */ + ping_monitor(mdi->text_version); + + fd = open_dev_excl(devnm); + if (fd < 0) { + if (verbose >= 0) + pr_err("failed to completely stop %s: Device is busy\n", + devname); + rv = 1; + goto out; + } + } else if (mdi && + mdi->array.major_version == -1 && + mdi->array.minor_version == -2 && + !is_subarray(mdi->text_version)) { + struct mdstat_ent *mds, *m; + /* container, possibly mdmon-managed. + * Make sure mdmon isn't opening it, which + * would interfere with the 'stop' + */ + ping_monitor(mdi->sys_name); + + /* now check that there are no existing arrays + * which are members of this array + */ + mds = mdstat_read(0, 0); + for (m = mds; m; m = m->next) + if (m->metadata_version && + strncmp(m->metadata_version, "external:", 9)==0 && + metadata_container_matches(m->metadata_version+9, + devnm)) { + if (verbose >= 0) + pr_err("Cannot stop container %s: member %s still active\n", + devname, m->devnm); + free_mdstat(mds); + rv = 1; + goto out; + } + } + + /* If the array is undergoing a reshape which changes the number + * of devices, then it would be nice to stop it at a point where + * it has completed a full number of stripes in both old and + * new layouts as this will allow the reshape to be reverted. + * So if 'sync_action' is "reshape" and 'raid_disks' shows two + * different numbers, then + * - freeze reshape + * - set sync_max to next multiple of both data_disks and + * chunk sizes (or next but one) + * - unfreeze reshape + * - wait on 'sync_completed' for that point to be reached. + */ + if (mdi && (mdi->array.level >= 4 && mdi->array.level <= 6) && + sysfs_attribute_available(mdi, NULL, "sync_action") && + sysfs_attribute_available(mdi, NULL, "reshape_direction") && + sysfs_get_str(mdi, NULL, "sync_action", buf, 20) > 0 && + strcmp(buf, "reshape\n") == 0 && + sysfs_get_two(mdi, NULL, "raid_disks", &rd1, &rd2) == 2) { + unsigned long long position, curr; + unsigned long long chunk1, chunk2; + unsigned long long rddiv, chunkdiv; + unsigned long long sectors; + unsigned long long sync_max, old_sync_max; + unsigned long long completed; + int backwards = 0; + int delay; + int scfd; + + delay = 40; + while (rd1 > rd2 && delay > 0 && + sysfs_get_ll(mdi, NULL, "sync_max", &old_sync_max) == 0) { + /* must be in the critical section - wait a bit */ + delay -= 1; + usleep(100000); + } + + if (sysfs_set_str(mdi, NULL, "sync_action", "frozen") != 0) + goto done; + /* Array is frozen */ + + rd1 -= mdi->array.level == 6 ? 2 : 1; + rd2 -= mdi->array.level == 6 ? 2 : 1; + sysfs_get_str(mdi, NULL, "reshape_direction", buf, sizeof(buf)); + if (strncmp(buf, "back", 4) == 0) + backwards = 1; + if (sysfs_get_ll(mdi, NULL, "reshape_position", &position) != 0) { + /* reshape must have finished now */ + sysfs_set_str(mdi, NULL, "sync_action", "idle"); + goto done; + } + sysfs_get_two(mdi, NULL, "chunk_size", &chunk1, &chunk2); + chunk1 /= 512; + chunk2 /= 512; + rddiv = GCD(rd1, rd2); + chunkdiv = GCD(chunk1, chunk2); + sectors = (chunk1/chunkdiv) * chunk2 * (rd1/rddiv) * rd2; + + if (backwards) { + /* Need to subtract 'reshape_position' from + * array size to get equivalent of sync_max. + * Size calculation based on raid5_size in kernel. + */ + unsigned long long size = mdi->component_size; + size &= ~(chunk1-1); + size &= ~(chunk2-1); + /* rd1 must be smaller */ + /* Reshape may have progressed further backwards than + * recorded, so target even further back (hence "-1") + */ + position = (position / sectors - 1) * sectors; + /* rd1 is always the conversion factor between 'sync' + * position and 'reshape' position. + * We read 1 "new" stripe worth of data from where-ever, + * and when write out that full stripe. + */ + sync_max = size - position/rd1; + } else { + /* Reshape will very likely be beyond position, and it may + * be too late to stop at '+1', so aim for '+2' + */ + position = (position / sectors + 2) * sectors; + sync_max = position/rd1; + } + if (sysfs_get_ll(mdi, NULL, "sync_max", &old_sync_max) < 0) + old_sync_max = mdi->component_size; + /* Must not advance sync_max as that could confuse + * the reshape monitor */ + if (sync_max < old_sync_max) + sysfs_set_num(mdi, NULL, "sync_max", sync_max); + sysfs_set_str(mdi, NULL, "sync_action", "idle"); + + /* That should have set things going again. Now we + * wait a little while (3 second max) for sync_completed + * to reach the target. + * The reshape process can block for 500msec if + * the sync speed limit is hit, so we need to wait + * a lot longer than that. 1 second is usually + * enough. 3 is safe. + */ + delay = 3000; + scfd = sysfs_open(mdi->sys_name, NULL, "sync_completed"); + while (scfd >= 0 && delay > 0 && old_sync_max > 0) { + unsigned long long max_completed; + sysfs_get_ll(mdi, NULL, "reshape_position", &curr); + sysfs_fd_get_str(scfd, buf, sizeof(buf)); + if (strncmp(buf, "none", 4) == 0) { + /* Either reshape has aborted, or hasn't + * quite started yet. Wait a bit and + * check 'sync_action' to see. + */ + usleep(10000); + sysfs_get_str(mdi, NULL, "sync_action", buf, sizeof(buf)); + if (strncmp(buf, "reshape", 7) != 0) + break; + } + + if (sysfs_fd_get_two(scfd, &completed, + &max_completed) == 2 && + /* 'completed' sometimes reads as max-uulong */ + completed < max_completed && + (completed > sync_max || + (completed == sync_max && curr != position))) { + while (completed > sync_max) { + sync_max += sectors / rd1; + if (backwards) + position -= sectors; + else + position += sectors; + } + if (sync_max < old_sync_max) + sysfs_set_num(mdi, NULL, "sync_max", sync_max); + } + + if (!backwards && curr >= position) + break; + if (backwards && curr <= position) + break; + sysfs_wait(scfd, &delay); + } + if (scfd >= 0) + close(scfd); + + } +done: + + /* As we have an O_EXCL open, any use of the device + * which blocks STOP_ARRAY is probably a transient use, + * so it is reasonable to retry for a while - 5 seconds. + */ + count = 25; err = 0; + while (count && fd >= 0 + && (err = ioctl(fd, STOP_ARRAY, NULL)) < 0 + && errno == EBUSY) { + usleep(200000); + count --; + } + if (fd >= 0 && err) { + if (verbose >= 0) { + pr_err("failed to stop array %s: %s\n", + devname, strerror(errno)); + if (errno == EBUSY) + cont_err("Perhaps a running process, mounted filesystem or active volume group?\n"); + } + rv = 1; + goto out; + } + /* prior to 2.6.28, KOBJ_CHANGE was not sent when an md array + * was stopped, so We'll do it here just to be sure. Drop any + * partitions as well... + */ + if (fd >= 0) + ioctl(fd, BLKRRPART, 0); + if (mdi) + sysfs_uevent(mdi, "change"); + + if (devnm[0] && use_udev()) { + struct map_ent *mp = map_by_devnm(&map, devnm); + remove_devices(devnm, mp ? mp->path : NULL); + } + + if (verbose >= 0) + pr_err("stopped %s\n", devname); + map_lock(&map); + map_remove(&map, devnm); + map_unlock(&map); +out: + if (mdi) + sysfs_free(mdi); + + return rv; +} + +static struct mddev_dev *add_one(struct mddev_dev *dv, char *name, char disp) +{ + struct mddev_dev *new; + new = xmalloc(sizeof(*new)); + memset(new, 0, sizeof(*new)); + new->devname = xstrdup(name); + new->disposition = disp; + new->next = dv->next; + dv->next = new; + return new; +} + +static void add_faulty(struct mddev_dev *dv, int fd, char disp) +{ + mdu_array_info_t array; + mdu_disk_info_t disk; + int remaining_disks; + int i; + + if (ioctl(fd, GET_ARRAY_INFO, &array) != 0) + return; + + remaining_disks = array.nr_disks; + for (i = 0; i < MAX_DISKS && remaining_disks > 0; i++) { + char buf[40]; + disk.number = i; + if (ioctl(fd, GET_DISK_INFO, &disk) != 0) + continue; + if (disk.major == 0 && disk.minor == 0) + continue; + remaining_disks--; + if ((disk.state & 1) == 0) /* not faulty */ + continue; + sprintf(buf, "%d:%d", disk.major, disk.minor); + dv = add_one(dv, buf, disp); + } +} + +static void add_detached(struct mddev_dev *dv, int fd, char disp) +{ + mdu_array_info_t array; + mdu_disk_info_t disk; + int remaining_disks; + int i; + + if (ioctl(fd, GET_ARRAY_INFO, &array) != 0) + return; + + remaining_disks = array.nr_disks; + for (i = 0; i < MAX_DISKS && remaining_disks > 0; i++) { + char buf[40]; + int sfd; + disk.number = i; + if (ioctl(fd, GET_DISK_INFO, &disk) != 0) + continue; + if (disk.major == 0 && disk.minor == 0) + continue; + remaining_disks--; + if (disp == 'f' && (disk.state & 1) != 0) /* already faulty */ + continue; + sprintf(buf, "%d:%d", disk.major, disk.minor); + sfd = dev_open(buf, O_RDONLY); + if (sfd >= 0) { + /* Not detached */ + close(sfd); + continue; + } + if (errno != ENXIO) + /* Probably not detached */ + continue; + dv = add_one(dv, buf, disp); + } +} + +static void add_set(struct mddev_dev *dv, int fd, char set_char) +{ + mdu_array_info_t array; + mdu_disk_info_t disk; + int remaining_disks; + int copies, set; + int i; + + if (ioctl(fd, GET_ARRAY_INFO, &array) != 0) + return; + if (array.level != 10) + return; + copies = ((array.layout & 0xff) * + ((array.layout >> 8) & 0xff)); + if (array.raid_disks % copies) + return; + + remaining_disks = array.nr_disks; + for (i = 0; i < MAX_DISKS && remaining_disks > 0; i++) { + char buf[40]; + disk.number = i; + if (ioctl(fd, GET_DISK_INFO, &disk) != 0) + continue; + if (disk.major == 0 && disk.minor == 0) + continue; + remaining_disks--; + set = disk.raid_disk % copies; + if (set_char != set + 'A') + continue; + sprintf(buf, "%d:%d", disk.major, disk.minor); + dv = add_one(dv, buf, dv->disposition); + } +} + +int attempt_re_add(int fd, int tfd, struct mddev_dev *dv, + struct supertype *dev_st, struct supertype *tst, + unsigned long rdev, + char *update, char *devname, int verbose, + mdu_array_info_t *array) +{ + struct mdinfo mdi; + int duuid[4]; + int ouuid[4]; + + dev_st->ss->getinfo_super(dev_st, &mdi, NULL); + dev_st->ss->uuid_from_super(dev_st, ouuid); + if (tst->sb) + tst->ss->uuid_from_super(tst, duuid); + else + /* Assume uuid matches: kernel will check */ + memcpy(duuid, ouuid, sizeof(ouuid)); + if ((mdi.disk.state & (1<<MD_DISK_ACTIVE)) && + !(mdi.disk.state & (1<<MD_DISK_FAULTY)) && + memcmp(duuid, ouuid, sizeof(ouuid))==0) { + /* Looks like it is worth a + * try. Need to make sure + * kernel will accept it + * though. + */ + mdu_disk_info_t disc; + /* re-add doesn't work for version-1 superblocks + * before 2.6.18 :-( + */ + if (array->major_version == 1 && + get_linux_version() <= 2006018) + goto skip_re_add; + disc.number = mdi.disk.number; + if (ioctl(fd, GET_DISK_INFO, &disc) != 0 + || disc.major != 0 || disc.minor != 0 + ) + goto skip_re_add; + disc.major = major(rdev); + disc.minor = minor(rdev); + disc.number = mdi.disk.number; + disc.raid_disk = mdi.disk.raid_disk; + disc.state = mdi.disk.state; + if (dv->writemostly == 1) + disc.state |= 1 << MD_DISK_WRITEMOSTLY; + if (dv->writemostly == 2) + disc.state &= ~(1 << MD_DISK_WRITEMOSTLY); + remove_partitions(tfd); + if (update || dv->writemostly > 0) { + int rv = -1; + tfd = dev_open(dv->devname, O_RDWR); + if (tfd < 0) { + pr_err("failed to open %s for superblock update during re-add\n", dv->devname); + return -1; + } + + if (dv->writemostly == 1) + rv = dev_st->ss->update_super( + dev_st, NULL, "writemostly", + devname, verbose, 0, NULL); + if (dv->writemostly == 2) + rv = dev_st->ss->update_super( + dev_st, NULL, "readwrite", + devname, verbose, 0, NULL); + if (update) + rv = dev_st->ss->update_super( + dev_st, NULL, update, + devname, verbose, 0, NULL); + if (rv == 0) + rv = dev_st->ss->store_super(dev_st, tfd); + close(tfd); + if (rv != 0) { + pr_err("failed to update superblock during re-add\n"); + return -1; + } + } + /* don't even try if disk is marked as faulty */ + errno = 0; + if (ioctl(fd, ADD_NEW_DISK, &disc) == 0) { + if (verbose >= 0) + pr_err("re-added %s\n", dv->devname); + return 1; + } + if (errno == ENOMEM || errno == EROFS) { + pr_err("add new device failed for %s: %s\n", + dv->devname, strerror(errno)); + if (dv->disposition == 'M') + return 0; + return -1; + } + } +skip_re_add: + return 0; +} + +int Manage_add(int fd, int tfd, struct mddev_dev *dv, + struct supertype *tst, mdu_array_info_t *array, + int force, int verbose, char *devname, + char *update, unsigned long rdev, unsigned long long array_size) +{ + unsigned long long ldsize; + struct supertype *dev_st = NULL; + int j; + mdu_disk_info_t disc; + + if (!get_dev_size(tfd, dv->devname, &ldsize)) { + if (dv->disposition == 'M') + return 0; + else + return -1; + } + + if (tst->ss == &super0 && ldsize > 4ULL*1024*1024*1024*1024) { + /* More than 4TB is wasted on v0.90 */ + if (!force) { + pr_err("%s is larger than %s can effectively use.\n" + " Add --force is you really want to add this device.\n", + dv->devname, devname); + return -1; + } + pr_err("%s is larger than %s can effectively use.\n" + " Adding anyway as --force was given.\n", + dv->devname, devname); + } + if (!tst->ss->external && + array->major_version == 0 && + md_get_version(fd)%100 < 2) { + if (ioctl(fd, HOT_ADD_DISK, rdev)==0) { + if (verbose >= 0) + pr_err("hot added %s\n", + dv->devname); + return 1; + } + + pr_err("hot add failed for %s: %s\n", + dv->devname, strerror(errno)); + return -1; + } + + if (array->not_persistent == 0 || tst->ss->external) { + + /* need to find a sample superblock to copy, and + * a spare slot to use. + * For 'external' array (well, container based), + * We can just load the metadata for the array-> + */ + int array_failed; + if (tst->sb) + /* already loaded */; + else if (tst->ss->external) { + tst->ss->load_container(tst, fd, NULL); + } else for (j = 0; j < tst->max_devs; j++) { + char *dev; + int dfd; + disc.number = j; + if (ioctl(fd, GET_DISK_INFO, &disc)) + continue; + if (disc.major==0 && disc.minor==0) + continue; + if ((disc.state & 4)==0) /* sync */ + continue; + /* Looks like a good device to try */ + dev = map_dev(disc.major, disc.minor, 1); + if (!dev) + continue; + dfd = dev_open(dev, O_RDONLY); + if (dfd < 0) + continue; + if (tst->ss->load_super(tst, dfd, + NULL)) { + close(dfd); + continue; + } + close(dfd); + break; + } + /* FIXME this is a bad test to be using */ + if (!tst->sb && (dv->disposition != 'a' + && dv->disposition != 'S')) { + /* we are re-adding a device to a + * completely dead array - have to depend + * on kernel to check + */ + } else if (!tst->sb) { + pr_err("cannot load array metadata from %s\n", devname); + return -1; + } + + /* Make sure device is large enough */ + if (tst->sb && + tst->ss->avail_size(tst, ldsize/512, INVALID_SECTORS) < + array_size) { + if (dv->disposition == 'M') + return 0; + pr_err("%s not large enough to join array\n", + dv->devname); + return -1; + } + + /* Possibly this device was recently part of + * the array and was temporarily removed, and + * is now being re-added. If so, we can + * simply re-add it. + */ + + if (array->not_persistent==0) { + dev_st = dup_super(tst); + dev_st->ss->load_super(dev_st, tfd, NULL); + } + if (dev_st && dev_st->sb && dv->disposition != 'S') { + int rv = attempt_re_add(fd, tfd, dv, + dev_st, tst, + rdev, + update, devname, + verbose, + array); + dev_st->ss->free_super(dev_st); + if (rv) + return rv; + } + if (dv->disposition == 'M') { + if (verbose > 0) + pr_err("--re-add for %s to %s is not possible\n", + dv->devname, devname); + return 0; + } + if (dv->disposition == 'A') { + pr_err("--re-add for %s to %s is not possible\n", + dv->devname, devname); + return -1; + } + if (array->active_disks < array->raid_disks) { + char *avail = xcalloc(array->raid_disks, 1); + int d; + int found = 0; + + for (d = 0; d < MAX_DISKS && found < array->nr_disks; d++) { + disc.number = d; + if (ioctl(fd, GET_DISK_INFO, &disc)) + continue; + if (disc.major == 0 && disc.minor == 0) + continue; + found++; + if (!(disc.state & (1<<MD_DISK_SYNC))) + continue; + avail[disc.raid_disk] = 1; + } + array_failed = !enough(array->level, array->raid_disks, + array->layout, 1, avail); + free(avail); + } else + array_failed = 0; + if (array_failed) { + pr_err("%s has failed so using --add cannot work and might destroy\n", + devname); + pr_err("data on %s. You should stop the array and re-assemble it.\n", + dv->devname); + return -1; + } + } else { + /* non-persistent. Must ensure that new drive + * is at least array->size big. + */ + if (ldsize/512 < array_size) { + pr_err("%s not large enough to join array\n", + dv->devname); + return -1; + } + } + /* committed to really trying this device now*/ + remove_partitions(tfd); + + /* in 2.6.17 and earlier, version-1 superblocks won't + * use the number we write, but will choose a free number. + * we must choose the same free number, which requires + * starting at 'raid_disks' and counting up + */ + for (j = array->raid_disks; j < tst->max_devs; j++) { + disc.number = j; + if (ioctl(fd, GET_DISK_INFO, &disc)) + break; + if (disc.major==0 && disc.minor==0) + break; + if (disc.state & 8) /* removed */ + break; + } + disc.major = major(rdev); + disc.minor = minor(rdev); + disc.number =j; + disc.state = 0; + if (array->not_persistent==0) { + int dfd; + if (dv->writemostly == 1) + disc.state |= 1 << MD_DISK_WRITEMOSTLY; + dfd = dev_open(dv->devname, O_RDWR | O_EXCL|O_DIRECT); + if (tst->ss->add_to_super(tst, &disc, dfd, + dv->devname, INVALID_SECTORS)) + return -1; + if (tst->ss->write_init_super(tst)) + return -1; + } else if (dv->disposition == 'A') { + /* this had better be raid1. + * As we are "--re-add"ing we must find a spare slot + * to fill. + */ + char *used = xcalloc(array->raid_disks, 1); + for (j = 0; j < tst->max_devs; j++) { + mdu_disk_info_t disc2; + disc2.number = j; + if (ioctl(fd, GET_DISK_INFO, &disc2)) + continue; + if (disc2.major==0 && disc2.minor==0) + continue; + if (disc2.state & 8) /* removed */ + continue; + if (disc2.raid_disk < 0) + continue; + if (disc2.raid_disk > array->raid_disks) + continue; + used[disc2.raid_disk] = 1; + } + for (j = 0 ; j < array->raid_disks; j++) + if (!used[j]) { + disc.raid_disk = j; + disc.state |= (1<<MD_DISK_SYNC); + break; + } + free(used); + } + if (dv->writemostly == 1) + disc.state |= (1 << MD_DISK_WRITEMOSTLY); + if (tst->ss->external) { + /* add a disk + * to an external metadata container */ + struct mdinfo new_mdi; + struct mdinfo *sra; + int container_fd; + char devnm[32]; + int dfd; + + strcpy(devnm, fd2devnm(fd)); + + container_fd = open_dev_excl(devnm); + if (container_fd < 0) { + pr_err("add failed for %s: could not get exclusive access to container\n", + dv->devname); + tst->ss->free_super(tst); + return -1; + } + + Kill(dv->devname, NULL, 0, -1, 0); + dfd = dev_open(dv->devname, O_RDWR | O_EXCL|O_DIRECT); + if (mdmon_running(tst->container_devnm)) + tst->update_tail = &tst->updates; + if (tst->ss->add_to_super(tst, &disc, dfd, + dv->devname, INVALID_SECTORS)) { + close(dfd); + close(container_fd); + return -1; + } + if (tst->update_tail) + flush_metadata_updates(tst); + else + tst->ss->sync_metadata(tst); + + sra = sysfs_read(container_fd, NULL, 0); + if (!sra) { + pr_err("add failed for %s: sysfs_read failed\n", + dv->devname); + close(container_fd); + tst->ss->free_super(tst); + return -1; + } + sra->array.level = LEVEL_CONTAINER; + /* Need to set data_offset and component_size */ + tst->ss->getinfo_super(tst, &new_mdi, NULL); + new_mdi.disk.major = disc.major; + new_mdi.disk.minor = disc.minor; + new_mdi.recovery_start = 0; + /* Make sure fds are closed as they are O_EXCL which + * would block add_disk */ + tst->ss->free_super(tst); + if (sysfs_add_disk(sra, &new_mdi, 0) != 0) { + pr_err("add new device to external metadata failed for %s\n", dv->devname); + close(container_fd); + sysfs_free(sra); + return -1; + } + ping_monitor(devnm); + sysfs_free(sra); + close(container_fd); + } else { + tst->ss->free_super(tst); + if (ioctl(fd, ADD_NEW_DISK, &disc)) { + pr_err("add new device failed for %s as %d: %s\n", + dv->devname, j, strerror(errno)); + return -1; + } + } + if (verbose >= 0) + pr_err("added %s\n", dv->devname); + return 1; +} + +int Manage_remove(struct supertype *tst, int fd, struct mddev_dev *dv, + int sysfd, unsigned long rdev, int verbose, char *devname) +{ + int lfd = -1; + int err; + + if (tst->ss->external) { + /* To remove a device from a container, we must + * check that it isn't in use in an array. + * This involves looking in the 'holders' + * directory - there must be just one entry, + * the container. + * To ensure that it doesn't get used as a + * hot spare while we are checking, we + * get an O_EXCL open on the container + */ + int ret; + char devnm[32]; + strcpy(devnm, fd2devnm(fd)); + lfd = open_dev_excl(devnm); + if (lfd < 0) { + pr_err("Cannot get exclusive access to container - odd\n"); + return -1; + } + /* We may not be able to check on holders in + * sysfs, either because we don't have the dev num + * (rdev == 0) or because the device has been detached + * and the 'holders' directory no longer exists + * (ret == -1). In that case, assume it is OK to + * remove. + */ + if (rdev == 0) + ret = -1; + else + ret = sysfs_unique_holder(devnm, rdev); + if (ret == 0) { + pr_err("%s is not a member, cannot remove.\n", + dv->devname); + close(lfd); + return -1; + } + if (ret >= 2) { + pr_err("%s is still in use, cannot remove.\n", + dv->devname); + close(lfd); + return -1; + } + } + /* FIXME check that it is a current member */ + if (sysfd >= 0) { + /* device has been removed and we don't know + * the major:minor number + */ + int n = write(sysfd, "remove", 6); + if (n != 6) + err = -1; + else + err = 0; + } else { + err = ioctl(fd, HOT_REMOVE_DISK, rdev); + if (err && errno == ENODEV) { + /* Old kernels rejected this if no personality + * is registered */ + struct mdinfo *sra = sysfs_read(fd, NULL, GET_DEVS); + struct mdinfo *dv = NULL; + if (sra) + dv = sra->devs; + for ( ; dv ; dv=dv->next) + if (dv->disk.major == (int)major(rdev) && + dv->disk.minor == (int)minor(rdev)) + break; + if (dv) + err = sysfs_set_str(sra, dv, + "state", "remove"); + else + err = -1; + if (sra) + sysfs_free(sra); + } + } + if (err) { + pr_err("hot remove failed for %s: %s\n", dv->devname, + strerror(errno)); + if (lfd >= 0) + close(lfd); + return -1; + } + if (tst->ss->external) { + /* + * Before dropping our exclusive open we make an + * attempt at preventing mdmon from seeing an + * 'add' event before reconciling this 'remove' + * event. + */ + char *devnm = fd2devnm(fd); + + if (!devnm) { + pr_err("unable to get container name\n"); + return -1; + } + + ping_manager(devnm); + } + if (lfd >= 0) + close(lfd); + if (verbose >= 0) + pr_err("hot removed %s from %s\n", + dv->devname, devname); + return 1; +} + +int Manage_replace(struct supertype *tst, int fd, struct mddev_dev *dv, + unsigned long rdev, int verbose, char *devname) +{ + struct mdinfo *mdi, *di; + if (tst->ss->external) { + pr_err("--replace only supported for native metadata (0.90 or 1.x)\n"); + return -1; + } + /* Need to find the device in sysfs and add 'want_replacement' to the + * status. + */ + mdi = sysfs_read(fd, NULL, GET_DEVS); + if (!mdi || !mdi->devs) { + pr_err("Cannot find status of %s to enable replacement - strange\n", + devname); + return -1; + } + for (di = mdi->devs; di; di = di->next) + if (di->disk.major == (int)major(rdev) && + di->disk.minor == (int)minor(rdev)) + break; + if (di) { + int rv; + if (di->disk.raid_disk < 0) { + pr_err("%s is not active and so cannot be replaced.\n", + dv->devname); + sysfs_free(mdi); + return -1; + } + rv = sysfs_set_str(mdi, di, + "state", "want_replacement"); + if (rv) { + sysfs_free(mdi); + pr_err("Failed to request replacement for %s\n", + dv->devname); + return -1; + } + if (verbose >= 0) + pr_err("Marked %s (device %d in %s) for replacement\n", + dv->devname, di->disk.raid_disk, devname); + /* If there is a matching 'with', we need to tell it which + * raid disk + */ + while (dv && dv->disposition != 'W') + dv = dv->next; + if (dv) { + dv->disposition = 'w'; + dv->used = di->disk.raid_disk; + } + return 1; + } + sysfs_free(mdi); + pr_err("%s not found in %s so cannot --replace it\n", + dv->devname, devname); + return -1; +} + +int Manage_with(struct supertype *tst, int fd, struct mddev_dev *dv, + unsigned long rdev, int verbose, char *devname) +{ + struct mdinfo *mdi, *di; + /* try to set 'slot' for 'rdev' in 'fd' to 'dv->used' */ + mdi = sysfs_read(fd, NULL, GET_DEVS|GET_STATE); + if (!mdi || !mdi->devs) { + pr_err("Cannot find status of %s to enable replacement - strange\n", + devname); + return -1; + } + for (di = mdi->devs; di; di = di->next) + if (di->disk.major == (int)major(rdev) && + di->disk.minor == (int)minor(rdev)) + break; + if (di) { + int rv; + if (di->disk.state & (1<<MD_DISK_FAULTY)) { + pr_err("%s is faulty and cannot be a replacement\n", + dv->devname); + sysfs_free(mdi); + return -1; + } + if (di->disk.raid_disk >= 0) { + pr_err("%s is active and cannot be a replacement\n", + dv->devname); + sysfs_free(mdi); + return -1; + } + rv = sysfs_set_num(mdi, di, + "slot", dv->used); + if (rv) { + sysfs_free(mdi); + pr_err("Failed to set %s as preferred replacement.\n", + dv->devname); + return -1; + } + if (verbose >= 0) + pr_err("Marked %s in %s as replacement for device %d\n", + dv->devname, devname, dv->used); + return 1; + } + sysfs_free(mdi); + pr_err("%s not found in %s so cannot make it preferred replacement\n", + dv->devname, devname); + return -1; +} + +int Manage_subdevs(char *devname, int fd, + struct mddev_dev *devlist, int verbose, int test, + char *update, int force) +{ + /* Do something to each dev. + * devmode can be + * 'a' - add the device + * try HOT_ADD_DISK + * If that fails EINVAL, try ADD_NEW_DISK + * 'S' - add the device as a spare - don't try re-add + * 'A' - re-add the device + * 'r' - remove the device: HOT_REMOVE_DISK + * device can be 'faulty' or 'detached' in which case all + * matching devices are removed. + * 'f' - set the device faulty SET_DISK_FAULTY + * device can be 'detached' in which case any device that + * is inaccessible will be marked faulty. + * 'R' - mark this device as wanting replacement. + * 'W' - this device is added if necessary and activated as + * a replacement for a previous 'R' device. + * ----- + * 'w' - 'W' will be changed to 'w' when it is paired with + * a 'R' device. If a 'W' is found while walking the list + * it must be unpaired, and is an error. + * 'M' - this is created by a 'missing' target. It is a slight + * variant on 'A' + * 'F' - Another variant of 'A', where the device was faulty + * so must be removed from the array first. + * + * For 'f' and 'r', the device can also be a kernel-internal + * name such as 'sdb'. + */ + mdu_array_info_t array; + unsigned long long array_size; + struct mddev_dev *dv; + int tfd = -1; + struct supertype *tst; + char *subarray = NULL; + int sysfd = -1; + int count = 0; /* number of actions taken */ + struct mdinfo info; + int frozen = 0; + int busy = 0; + + if (ioctl(fd, GET_ARRAY_INFO, &array)) { + pr_err("Cannot get array info for %s\n", + devname); + goto abort; + } + sysfs_init(&info, fd, NULL); + + /* array.size is only 32 bits and may be truncated. + * So read from sysfs if possible, and record number of sectors + */ + + array_size = get_component_size(fd); + if (array_size <= 0) + array_size = array.size * 2; + + tst = super_by_fd(fd, &subarray); + if (!tst) { + pr_err("unsupport array - version %d.%d\n", + array.major_version, array.minor_version); + goto abort; + } + + for (dv = devlist; dv; dv = dv->next) { + unsigned long rdev = 0; /* device to add/remove etc */ + int rv; + int mj,mn; + + if (strcmp(dv->devname, "failed") == 0 || + strcmp(dv->devname, "faulty") == 0) { + if (dv->disposition != 'A' + && dv->disposition != 'r') { + pr_err("%s only meaningful with -r or --re-add, not -%c\n", + dv->devname, dv->disposition); + goto abort; + } + add_faulty(dv, fd, (dv->disposition == 'A' + ? 'F' : 'r')); + continue; + } + if (strcmp(dv->devname, "detached") == 0) { + if (dv->disposition != 'r' && dv->disposition != 'f') { + pr_err("%s only meaningful with -r of -f, not -%c\n", + dv->devname, dv->disposition); + goto abort; + } + add_detached(dv, fd, dv->disposition); + continue; + } + + if (strcmp(dv->devname, "missing") == 0) { + struct mddev_dev *add_devlist = NULL; + struct mddev_dev **dp; + if (dv->disposition != 'A') { + pr_err("'missing' only meaningful with --re-add\n"); + goto abort; + } + add_devlist = conf_get_devs(); + if (add_devlist == NULL) { + pr_err("no devices to scan for missing members."); + continue; + } + for (dp = &add_devlist; *dp; dp = & (*dp)->next) + /* 'M' (for 'missing') is like 'A' without errors */ + (*dp)->disposition = 'M'; + *dp = dv->next; + dv->next = add_devlist; + continue; + } + + if (strncmp(dv->devname, "set-", 4) == 0 && + strlen(dv->devname) == 5) { + int copies; + + if (dv->disposition != 'r' && + dv->disposition != 'f') { + pr_err("'%s' only meaningful with -r or -f\n", + dv->devname); + goto abort; + } + if (array.level != 10) { + pr_err("'%s' only meaningful with RAID10 arrays\n", + dv->devname); + goto abort; + } + copies = ((array.layout & 0xff) * + ((array.layout >> 8) & 0xff)); + if (array.raid_disks % copies != 0 || + dv->devname[4] < 'A' || + dv->devname[4] >= 'A' + copies || + copies > 26) { + pr_err("'%s' not meaningful with this array\n", + dv->devname); + goto abort; + } + add_set(dv, fd, dv->devname[4]); + continue; + } + + if (strchr(dv->devname, '/') == NULL && + strchr(dv->devname, ':') == NULL && + strlen(dv->devname) < 50) { + /* Assume this is a kernel-internal name like 'sda1' */ + int found = 0; + char dname[55]; + if (dv->disposition != 'r' && dv->disposition != 'f') { + pr_err("%s only meaningful with -r or -f, not -%c\n", + dv->devname, dv->disposition); + goto abort; + } + + sprintf(dname, "dev-%s", dv->devname); + sysfd = sysfs_open(fd2devnm(fd), dname, "block/dev"); + if (sysfd >= 0) { + char dn[20]; + if (sysfs_fd_get_str(sysfd, dn, 20) > 0 && + sscanf(dn, "%d:%d", &mj,&mn) == 2) { + rdev = makedev(mj,mn); + found = 1; + } + close(sysfd); + sysfd = -1; + } + if (!found) { + sysfd = sysfs_open(fd2devnm(fd), dname, "state"); + if (sysfd < 0) { + pr_err("%s does not appear to be a component of %s\n", + dv->devname, devname); + goto abort; + } + } + } else if ((dv->disposition == 'r' || dv->disposition == 'f') + && get_maj_min(dv->devname, &mj, &mn)) { + /* for 'fail' and 'remove', the device might + * not exist. + */ + rdev = makedev(mj, mn); + } else { + struct stat stb; + tfd = dev_open(dv->devname, O_RDONLY); + if (tfd >= 0) + fstat(tfd, &stb); + else { + int open_err = errno; + if (stat(dv->devname, &stb) != 0) { + pr_err("Cannot find %s: %s\n", + dv->devname, strerror(errno)); + goto abort; + } + if ((stb.st_mode & S_IFMT) != S_IFBLK) { + if (dv->disposition == 'M') + /* non-fatal. Also improbable */ + continue; + pr_err("%s is not a block device.\n", + dv->devname); + goto abort; + } + if (dv->disposition == 'r') + /* Be happy, the stat worked, that is + * enough for --remove + */ + ; + else { + if (dv->disposition == 'M') + /* non-fatal */ + continue; + pr_err("Cannot open %s: %s\n", + dv->devname, strerror(open_err)); + goto abort; + } + } + rdev = stb.st_rdev; + } + switch(dv->disposition){ + default: + pr_err("internal error - devmode[%s]=%d\n", + dv->devname, dv->disposition); + goto abort; + case 'a': + case 'S': /* --add-spare */ + case 'A': + case 'M': /* --re-add missing */ + case 'F': /* --re-add faulty */ + /* add the device */ + if (subarray) { + pr_err("Cannot add disks to a \'member\' array, perform this operation on the parent container\n"); + goto abort; + } + if (dv->disposition == 'F') + /* Need to remove first */ + ioctl(fd, HOT_REMOVE_DISK, rdev); + /* Make sure it isn't in use (in 2.6 or later) */ + tfd = dev_open(dv->devname, O_RDONLY|O_EXCL); + if (tfd >= 0) { + /* We know no-one else is using it. We'll + * need non-exclusive access to add it, so + * do that now. + */ + close(tfd); + tfd = dev_open(dv->devname, O_RDONLY); + } + if (tfd < 0) { + if (dv->disposition == 'M') + continue; + pr_err("Cannot open %s: %s\n", + dv->devname, strerror(errno)); + goto abort; + } + if (!frozen) { + if (sysfs_freeze_array(&info) == 1) + frozen = 1; + else + frozen = -1; + } + rv = Manage_add(fd, tfd, dv, tst, &array, + force, verbose, devname, update, + rdev, array_size); + close(tfd); + tfd = -1; + if (rv < 0) + goto abort; + if (rv > 0) + count++; + break; + + case 'r': + /* hot remove */ + if (subarray) { + pr_err("Cannot remove disks from a \'member\' array, perform this operation on the parent container\n"); + rv = -1; + } else + rv = Manage_remove(tst, fd, dv, sysfd, + rdev, verbose, + devname); + if (sysfd >= 0) + close(sysfd); + sysfd = -1; + if (rv < 0) + goto abort; + if (rv > 0) + count++; + break; + + case 'f': /* set faulty */ + /* FIXME check current member */ + if ((sysfd >= 0 && write(sysfd, "faulty", 6) != 6) || + (sysfd < 0 && ioctl(fd, SET_DISK_FAULTY, + rdev))) { + if (errno == EBUSY) + busy = 1; + pr_err("set device faulty failed for %s: %s\n", + dv->devname, strerror(errno)); + if (sysfd >= 0) + close(sysfd); + goto abort; + } + if (sysfd >= 0) + close(sysfd); + sysfd = -1; + count++; + if (verbose >= 0) + pr_err("set %s faulty in %s\n", + dv->devname, devname); + break; + case 'R': /* Mark as replaceable */ + if (subarray) { + pr_err("Cannot replace disks in a \'member\' array, perform this operation on the parent container\n"); + rv = -1; + } else { + if (!frozen) { + if (sysfs_freeze_array(&info) == 1) + frozen = 1; + else + frozen = -1; + } + rv = Manage_replace(tst, fd, dv, + rdev, verbose, + devname); + } + if (rv < 0) + goto abort; + if (rv > 0) + count++; + break; + case 'W': /* --with device that doesn't match */ + pr_err("No matching --replace device for --with %s\n", + dv->devname); + goto abort; + case 'w': /* --with device which was matched */ + rv = Manage_with(tst, fd, dv, + rdev, verbose, devname); + if (rv < 0) + goto abort; + break; + } + } + if (frozen > 0) + sysfs_set_str(&info, NULL, "sync_action","idle"); + if (test && count == 0) + return 2; + return 0; + +abort: + if (frozen > 0) + sysfs_set_str(&info, NULL, "sync_action","idle"); + return !test && busy ? 2 : 1; +} + +int autodetect(void) +{ + /* Open any md device, and issue the RAID_AUTORUN ioctl */ + int rv = 1; + int fd = dev_open("9:0", O_RDONLY); + if (fd >= 0) { + if (ioctl(fd, RAID_AUTORUN, 0) == 0) + rv = 0; + close(fd); + } + return rv; +} + +int Update_subarray(char *dev, char *subarray, char *update, struct mddev_ident *ident, int verbose) +{ + struct supertype supertype, *st = &supertype; + int fd, rv = 2; + + memset(st, 0, sizeof(*st)); + + fd = open_subarray(dev, subarray, st, verbose < 0); + if (fd < 0) + return 2; + + if (!st->ss->update_subarray) { + if (verbose >= 0) + pr_err("Operation not supported for %s metadata\n", + st->ss->name); + goto free_super; + } + + if (mdmon_running(st->devnm)) + st->update_tail = &st->updates; + + rv = st->ss->update_subarray(st, subarray, update, ident); + + if (rv) { + if (verbose >= 0) + pr_err("Failed to update %s of subarray-%s in %s\n", + update, subarray, dev); + } else if (st->update_tail) + flush_metadata_updates(st); + else + st->ss->sync_metadata(st); + + if (rv == 0 && strcmp(update, "name") == 0 && verbose >= 0) + pr_err("Updated subarray-%s name from %s, UUIDs may have changed\n", + subarray, dev); + + free_super: + st->ss->free_super(st); + close(fd); + + return rv; +} + +/* Move spare from one array to another If adding to destination array fails + * add back to original array. + * Returns 1 on success, 0 on failure */ +int move_spare(char *from_devname, char *to_devname, dev_t devid) +{ + struct mddev_dev devlist; + char devname[20]; + + /* try to remove and add */ + int fd1 = open(to_devname, O_RDONLY); + int fd2 = open(from_devname, O_RDONLY); + + if (fd1 < 0 || fd2 < 0) { + if (fd1>=0) close(fd1); + if (fd2>=0) close(fd2); + return 0; + } + + devlist.next = NULL; + devlist.used = 0; + devlist.writemostly = 0; + devlist.devname = devname; + sprintf(devname, "%d:%d", major(devid), minor(devid)); + + devlist.disposition = 'r'; + if (Manage_subdevs(from_devname, fd2, &devlist, -1, 0, NULL, 0) == 0) { + devlist.disposition = 'a'; + if (Manage_subdevs(to_devname, fd1, &devlist, -1, 0, NULL, 0) == 0) { + /* make sure manager is aware of changes */ + ping_manager(to_devname); + ping_manager(from_devname); + close(fd1); + close(fd2); + return 1; + } + else Manage_subdevs(from_devname, fd2, &devlist, -1, 0, NULL, 0); + } + close(fd1); + close(fd2); + return 0; +} +#endif diff --git a/Monitor.c b/Monitor.c new file mode 100644 index 00000000..f19c2e58 --- /dev/null +++ b/Monitor.c @@ -0,0 +1,1143 @@ +/* + * mdadm - manage Linux "md" devices aka RAID arrays. + * + * Copyright (C) 2001-2009 Neil Brown <neilb@suse.de> + * + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Author: Neil Brown + * Email: <neilb@suse.de> + */ + +#include "mdadm.h" +#include "md_p.h" +#include "md_u.h" +#include <sys/wait.h> +#include <signal.h> +#include <limits.h> +#include <syslog.h> + +struct state { + char *devname; + char devnm[32]; /* to sync with mdstat info */ + long utime; + int err; + char *spare_group; + int active, working, failed, spare, raid; + int from_config; + int from_auto; + int expected_spares; + int devstate[MAX_DISKS]; + dev_t devid[MAX_DISKS]; + int percent; + char parent_devnm[32]; /* For subarray, devnm of parent. + * For others, "" + */ + struct supertype *metadata; + struct state *subarray;/* for a container it is a link to first subarray + * for a subarray it is a link to next subarray + * in the same container */ + struct state *parent; /* for a subarray it is a link to its container + */ + struct state *next; +}; + +struct alert_info { + char *mailaddr; + char *mailfrom; + char *alert_cmd; + int dosyslog; +}; +static int make_daemon(char *pidfile); +static int check_one_sharer(int scan); +static void alert(char *event, char *dev, char *disc, struct alert_info *info); +static int check_array(struct state *st, struct mdstat_ent *mdstat, + int test, struct alert_info *info, + int increments, char *prefer); +static int add_new_arrays(struct mdstat_ent *mdstat, struct state **statelist, + int test, struct alert_info *info); +static void try_spare_migration(struct state *statelist, struct alert_info *info); +static void link_containers_with_subarrays(struct state *list); + +int Monitor(struct mddev_dev *devlist, + char *mailaddr, char *alert_cmd, + struct context *c, + int daemonise, int oneshot, + int dosyslog, char *pidfile, int increments, + int share) +{ + /* + * Every few seconds, scan every md device looking for changes + * When a change is found, log it, possibly run the alert command, + * and possibly send Email + * + * For each array, we record: + * Update time + * active/working/failed/spare drives + * State of each device. + * %rebuilt if rebuilding + * + * If the update time changes, check out all the data again + * It is possible that we cannot get the state of each device + * due to bugs in the md kernel module. + * We also read /proc/mdstat to get rebuild percent, + * and to get state on all active devices incase of kernel bug. + * + * Events are: + * Fail + * An active device had Faulty set or Active/Sync removed + * FailSpare + * A spare device had Faulty set + * SpareActive + * An active device had a reverse transition + * RebuildStarted + * percent went from -1 to +ve + * RebuildNN + * percent went from below to not-below NN% + * DeviceDisappeared + * Couldn't access a device which was previously visible + * + * if we detect an array with active<raid and spare==0 + * we look at other arrays that have same spare-group + * If we find one with active==raid and spare>0, + * and if we can get_disk_info and find a name + * Then we hot-remove and hot-add to the other array + * + * If devlist is NULL, then we can monitor everything because --scan + * was given. We get an initial list from config file and add anything + * that appears in /proc/mdstat + */ + + struct state *statelist = NULL; + struct state *st2; + int finished = 0; + struct mdstat_ent *mdstat = NULL; + char *mailfrom = NULL; + struct alert_info info; + + if (!mailaddr) { + mailaddr = conf_get_mailaddr(); + if (mailaddr && ! c->scan) + pr_err("Monitor using email address \"%s\" from config file\n", + mailaddr); + } + mailfrom = conf_get_mailfrom(); + + if (!alert_cmd) { + alert_cmd = conf_get_program(); + if (alert_cmd && ! c->scan) + pr_err("Monitor using program \"%s\" from config file\n", + alert_cmd); + } + if (c->scan && !mailaddr && !alert_cmd && !dosyslog) { + pr_err("No mail address or alert command - not monitoring.\n"); + return 1; + } + info.alert_cmd = alert_cmd; + info.mailaddr = mailaddr; + info.mailfrom = mailfrom; + info.dosyslog = dosyslog; + + if (daemonise) { + int rv = make_daemon(pidfile); + if (rv >= 0) + return rv; + } + + if (share) + if (check_one_sharer(c->scan)) + return 1; + + if (devlist == NULL) { + struct mddev_ident *mdlist = conf_get_ident(NULL); + for (; mdlist; mdlist=mdlist->next) { + struct state *st; + if (mdlist->devname == NULL) + continue; + if (strcasecmp(mdlist->devname, "<ignore>") == 0) + continue; + st = xcalloc(1, sizeof *st); + if (mdlist->devname[0] == '/') + st->devname = xstrdup(mdlist->devname); + else { + st->devname = xmalloc(8+strlen(mdlist->devname)+1); + strcpy(strcpy(st->devname, "/dev/md/"), + mdlist->devname); + } + st->next = statelist; + st->devnm[0] = 0; + st->percent = RESYNC_UNKNOWN; + st->from_config = 1; + st->expected_spares = mdlist->spare_disks; + if (mdlist->spare_group) + st->spare_group = xstrdup(mdlist->spare_group); + statelist = st; + } + } else { + struct mddev_dev *dv; + for (dv=devlist ; dv; dv=dv->next) { + struct mddev_ident *mdlist = conf_get_ident(dv->devname); + struct state *st = xcalloc(1, sizeof *st); + st->devname = xstrdup(dv->devname); + st->next = statelist; + st->devnm[0] = 0; + st->percent = RESYNC_UNKNOWN; + st->expected_spares = -1; + if (mdlist) { + st->expected_spares = mdlist->spare_disks; + if (mdlist->spare_group) + st->spare_group = xstrdup(mdlist->spare_group); + } + statelist = st; + } + } + + while (! finished) { + int new_found = 0; + struct state *st, **stp; + int anydegraded = 0; + + if (mdstat) + free_mdstat(mdstat); + mdstat = mdstat_read(oneshot?0:1, 0); + + for (st=statelist; st; st=st->next) + if (check_array(st, mdstat, c->test, &info, + increments, c->prefer)) + anydegraded = 1; + + /* now check if there are any new devices found in mdstat */ + if (c->scan) + new_found = add_new_arrays(mdstat, &statelist, c->test, + &info); + + /* If an array has active < raid && spare == 0 && spare_group != NULL + * Look for another array with spare > 0 and active == raid and same spare_group + * if found, choose a device and hotremove/hotadd + */ + if (share && anydegraded) + try_spare_migration(statelist, &info); + if (!new_found) { + if (oneshot) + break; + else + mdstat_wait(c->delay); + } + c->test = 0; + + for (stp = &statelist; (st = *stp) != NULL; ) { + if (st->from_auto && st->err > 5) { + *stp = st->next; + free(st->devname); + free(st->spare_group); + free(st); + } else + stp = &st->next; + } + } + for (st2 = statelist; st2; st2 = statelist) { + statelist = st2->next; + free(st2); + } + + if (pidfile) + unlink(pidfile); + return 0; +} + +static int make_daemon(char *pidfile) +{ + /* Return: + * -1 in the forked daemon + * 0 in the parent + * 1 on error + * so a none-negative becomes the exit code. + */ + int pid = fork(); + if (pid > 0) { + if (!pidfile) + printf("%d\n", pid); + else { + FILE *pid_file; + pid_file=fopen(pidfile, "w"); + if (!pid_file) + perror("cannot create pid file"); + else { + fprintf(pid_file,"%d\n", pid); + fclose(pid_file); + } + } + return 0; + } + if (pid < 0) { + perror("daemonise"); + return 1; + } + close(0); + open("/dev/null", O_RDWR); + dup2(0,1); + dup2(0,2); + setsid(); + return -1; +} + +static int check_one_sharer(int scan) +{ + int pid, rv; + FILE *fp; + char dir[20]; + char path[100]; + struct stat buf; + sprintf(path, "%s/autorebuild.pid", MDMON_DIR); + fp = fopen(path, "r"); + if (fp) { + if (fscanf(fp, "%d", &pid) != 1) + pid = -1; + sprintf(dir, "/proc/%d", pid); + rv = stat(dir, &buf); + if (rv != -1) { + if (scan) { + pr_err("Only one autorebuild process allowed in scan mode, aborting\n"); + fclose(fp); + return 1; + } else { + pr_err("Warning: One autorebuild process already running.\n"); + } + } + fclose(fp); + } + if (scan) { + if (mkdir(MDMON_DIR, S_IRWXU) < 0 && + errno != EEXIST) { + pr_err("Can't create autorebuild.pid file\n"); + } else { + fp = fopen(path, "w"); + if (!fp) + pr_err("Cannot create autorebuild.pidfile\n"); + else { + pid = getpid(); + fprintf(fp, "%d\n", pid); + fclose(fp); + } + } + } + return 0; +} + +static void alert(char *event, char *dev, char *disc, struct alert_info *info) +{ + int priority; + + if (!info->alert_cmd && !info->mailaddr && !info->dosyslog) { + time_t now = time(0); + + printf("%1.15s: %s on %s %s\n", ctime(&now)+4, event, dev, disc?disc:"unknown device"); + } + if (info->alert_cmd) { + int pid = fork(); + switch(pid) { + default: + waitpid(pid, NULL, 0); + break; + case -1: + break; + case 0: + execl(info->alert_cmd, info->alert_cmd, + event, dev, disc, NULL); + exit(2); + } + } + if (info->mailaddr && + (strncmp(event, "Fail", 4)==0 || + strncmp(event, "Test", 4)==0 || + strncmp(event, "Spares", 6)==0 || + strncmp(event, "Degrade", 7)==0)) { + FILE *mp = popen(Sendmail, "w"); + if (mp) { + FILE *mdstat; + char hname[256]; + gethostname(hname, sizeof(hname)); + signal(SIGPIPE, SIG_IGN); + if (info->mailfrom) + fprintf(mp, "From: %s\n", info->mailfrom); + else + fprintf(mp, "From: %s monitoring <root>\n", Name); + fprintf(mp, "To: %s\n", info->mailaddr); + fprintf(mp, "Subject: %s event on %s:%s\n\n", + event, dev, hname); + + fprintf(mp, + "This is an automatically generated mail message from %s\n", Name); + fprintf(mp, "running on %s\n\n", hname); + + fprintf(mp, + "A %s event had been detected on md device %s.\n\n", event, dev); + + if (disc && disc[0] != ' ') + fprintf(mp, + "It could be related to component device %s.\n\n", disc); + if (disc && disc[0] == ' ') + fprintf(mp, "Extra information:%s.\n\n", disc); + + fprintf(mp, "Faithfully yours, etc.\n"); + + mdstat = fopen("/proc/mdstat", "r"); + if (mdstat) { + char buf[8192]; + int n; + fprintf(mp, + "\nP.S. The /proc/mdstat file currently contains the following:\n\n"); + while ( (n=fread(buf, 1, sizeof(buf), mdstat)) > 0) + n=fwrite(buf, 1, n, mp); + fclose(mdstat); + } + pclose(mp); + } + } + + /* log the event to syslog maybe */ + if (info->dosyslog) { + /* Log at a different severity depending on the event. + * + * These are the critical events: */ + if (strncmp(event, "Fail", 4)==0 || + strncmp(event, "Degrade", 7)==0 || + strncmp(event, "DeviceDisappeared", 17)==0) + priority = LOG_CRIT; + /* Good to know about, but are not failures: */ + else if (strncmp(event, "Rebuild", 7)==0 || + strncmp(event, "MoveSpare", 9)==0 || + strncmp(event, "Spares", 6) != 0) + priority = LOG_WARNING; + /* Everything else: */ + else + priority = LOG_INFO; + + if (disc && disc[0] != ' ') + syslog(priority, + "%s event detected on md device %s, component device %s", event, dev, disc); + else if (disc) + syslog(priority, + "%s event detected on md device %s: %s", + event, dev, disc); + else + syslog(priority, + "%s event detected on md device %s", + event, dev); + } +} + +static int check_array(struct state *st, struct mdstat_ent *mdstat, + int test, struct alert_info *ainfo, + int increments, char *prefer) +{ + /* Update the state 'st' to reflect any changes shown in mdstat, + * or found by directly examining the array, and return + * '1' if the array is degraded, or '0' if it is optimal (or dead). + */ + struct { int state, major, minor; } info[MAX_DISKS]; + mdu_array_info_t array; + struct mdstat_ent *mse = NULL, *mse2; + char *dev = st->devname; + int fd = -1; + int i; + int remaining_disks; + int last_disk; + int new_array = 0; + + if (test) + alert("TestMessage", dev, NULL, ainfo); + if (st->devnm[0]) + fd = open("/sys/block", O_RDONLY|O_DIRECTORY); + if (fd >= 0) { + /* Don't open the device unless it is present and + * active in sysfs. + */ + char buf[10]; + close(fd); + fd = sysfs_open(st->devnm, NULL, "array_state"); + if (fd < 0 || + read(fd, buf, 10) < 5 || + strncmp(buf,"clear",5) == 0 || + strncmp(buf,"inact",5) == 0) { + if (fd >= 0) + close(fd); + fd = sysfs_open(st->devnm, NULL, "level"); + if (fd < 0 || read(fd, buf, 10) != 0) { + if (fd >= 0) + close(fd); + if (!st->err) + alert("DeviceDisappeared", dev, NULL, ainfo); + st->err++; + return 0; + } + } + close(fd); + } + fd = open(dev, O_RDONLY); + if (fd < 0) { + if (!st->err) + alert("DeviceDisappeared", dev, NULL, ainfo); + st->err++; + return 0; + } + fcntl(fd, F_SETFD, FD_CLOEXEC); + if (ioctl(fd, GET_ARRAY_INFO, &array)<0) { + if (!st->err) + alert("DeviceDisappeared", dev, NULL, ainfo); + st->err++; + close(fd); + return 0; + } + /* It's much easier to list what array levels can't + * have a device disappear than all of them that can + */ + if (array.level == 0 || array.level == -1) { + if (!st->err && !st->from_config) + alert("DeviceDisappeared", dev, " Wrong-Level", ainfo); + st->err++; + close(fd); + return 0; + } + if (st->devnm[0] == 0) + strcpy(st->devnm, fd2devnm(fd)); + + for (mse2 = mdstat ; mse2 ; mse2=mse2->next) + if (strcmp(mse2->devnm, st->devnm) == 0) { + mse2->devnm[0] = 0; /* flag it as "used" */ + mse = mse2; + } + + if (!mse) { + /* duplicated array in statelist + * or re-created after reading mdstat*/ + st->err++; + close(fd); + return 0; + } + /* this array is in /proc/mdstat */ + if (array.utime == 0) + /* external arrays don't update utime, so + * just make sure it is always different. */ + array.utime = st->utime + 1;; + + if (st->err) { + /* New array appeared where previously had an error */ + st->err = 0; + st->percent = RESYNC_NONE; + new_array = 1; + alert("NewArray", st->devname, NULL, ainfo); + } + + if (st->utime == array.utime && + st->failed == array.failed_disks && + st->working == array.working_disks && + st->spare == array.spare_disks && + (mse == NULL || ( + mse->percent == st->percent + ))) { + close(fd); + if ((st->active < st->raid) && st->spare == 0) + return 1; + else + return 0; + } + if (st->utime == 0 && /* new array */ + mse->pattern && strchr(mse->pattern, '_') /* degraded */ + ) + alert("DegradedArray", dev, NULL, ainfo); + + if (st->utime == 0 && /* new array */ + st->expected_spares > 0 && + array.spare_disks < st->expected_spares) + alert("SparesMissing", dev, NULL, ainfo); + if (st->percent < 0 && st->percent != RESYNC_UNKNOWN && + mse->percent >= 0) + alert("RebuildStarted", dev, NULL, ainfo); + if (st->percent >= 0 && + mse->percent >= 0 && + (mse->percent / increments) > (st->percent / increments)) { + char percentalert[15]; // "RebuildNN" (10 chars) or "RebuildStarted" (15 chars) + + if((mse->percent / increments) == 0) + snprintf(percentalert, sizeof(percentalert), "RebuildStarted"); + else + snprintf(percentalert, sizeof(percentalert), "Rebuild%02d", mse->percent); + + alert(percentalert, dev, NULL, ainfo); + } + + if (mse->percent == RESYNC_NONE && + st->percent >= 0) { + /* Rebuild/sync/whatever just finished. + * If there is a number in /mismatch_cnt, + * we should report that. + */ + struct mdinfo *sra = + sysfs_read(-1, st->devnm, GET_MISMATCH); + if (sra && sra->mismatch_cnt > 0) { + char cnt[80]; + snprintf(cnt, sizeof(cnt), + " mismatches found: %d (on raid level %d)", + sra->mismatch_cnt, array.level); + alert("RebuildFinished", dev, cnt, ainfo); + } else + alert("RebuildFinished", dev, NULL, ainfo); + if (sra) + free(sra); + } + st->percent = mse->percent; + + remaining_disks = array.nr_disks; + for (i=0; i<MAX_DISKS && remaining_disks > 0; + i++) { + mdu_disk_info_t disc; + disc.number = i; + if (ioctl(fd, GET_DISK_INFO, &disc) >= 0) { + info[i].state = disc.state; + info[i].major = disc.major; + info[i].minor = disc.minor; + if (disc.major || disc.minor) + remaining_disks --; + } else + info[i].major = info[i].minor = 0; + } + last_disk = i; + + if (mse->metadata_version && + strncmp(mse->metadata_version, "external:", 9) == 0 && + is_subarray(mse->metadata_version+9)) { + char *sl; + strcpy(st->parent_devnm, + mse->metadata_version+10); + sl = strchr(st->parent_devnm, '/'); + if (sl) + *sl = 0; + } else + st->parent_devnm[0] = 0; + if (st->metadata == NULL && + st->parent_devnm[0] == 0) + st->metadata = super_by_fd(fd, NULL); + + close(fd); + + for (i=0; i<MAX_DISKS; i++) { + mdu_disk_info_t disc = {0,0,0,0,0}; + int newstate=0; + int change; + char *dv = NULL; + disc.number = i; + if (i < last_disk && + (info[i].major || info[i].minor)) { + newstate = info[i].state; + dv = map_dev_preferred( + info[i].major, info[i].minor, 1, + prefer); + disc.state = newstate; + disc.major = info[i].major; + disc.minor = info[i].minor; + } else + newstate = (1 << MD_DISK_REMOVED); + + if (dv == NULL && st->devid[i]) + dv = map_dev_preferred( + major(st->devid[i]), + minor(st->devid[i]), 1, prefer); + change = newstate ^ st->devstate[i]; + if (st->utime && change && !st->err && !new_array) { + if ((st->devstate[i]&change)&(1<<MD_DISK_SYNC)) + alert("Fail", dev, dv, ainfo); + else if ((newstate & (1<<MD_DISK_FAULTY)) && + (disc.major || disc.minor) && + st->devid[i] == makedev(disc.major, disc.minor)) + alert("FailSpare", dev, dv, ainfo); + else if ((newstate&change)&(1<<MD_DISK_SYNC)) + alert("SpareActive", dev, dv, ainfo); + } + st->devstate[i] = newstate; + st->devid[i] = makedev(disc.major, disc.minor); + } + st->active = array.active_disks; + st->working = array.working_disks; + st->spare = array.spare_disks; + st->failed = array.failed_disks; + st->utime = array.utime; + st->raid = array.raid_disks; + st->err = 0; + if ((st->active < st->raid) && st->spare == 0) + return 1; + return 0; +} + +static int add_new_arrays(struct mdstat_ent *mdstat, struct state **statelist, + int test, struct alert_info *info) +{ + struct mdstat_ent *mse; + int new_found = 0; + char *name; + + for (mse=mdstat; mse; mse=mse->next) + if (mse->devnm[0] && + (!mse->level || /* retrieve containers */ + (strcmp(mse->level, "raid0") != 0 && + strcmp(mse->level, "linear") != 0)) + ) { + struct state *st = xcalloc(1, sizeof *st); + mdu_array_info_t array; + int fd; + + name = get_md_name(mse->devnm); + if (!name) { + free(st); + continue; + } + + st->devname = xstrdup(name); + if ((fd = open(st->devname, O_RDONLY)) < 0 || + ioctl(fd, GET_ARRAY_INFO, &array)< 0) { + /* no such array */ + if (fd >=0) close(fd); + put_md_name(st->devname); + free(st->devname); + if (st->metadata) { + st->metadata->ss->free_super(st->metadata); + free(st->metadata); + } + free(st); + continue; + } + close(fd); + st->next = *statelist; + st->err = 1; + st->from_auto = 1; + strcpy(st->devnm, mse->devnm); + st->percent = RESYNC_UNKNOWN; + st->expected_spares = -1; + if (mse->metadata_version && + strncmp(mse->metadata_version, "external:", 9) == 0 && + is_subarray(mse->metadata_version+9)) { + char *sl; + strcpy(st->parent_devnm, + mse->metadata_version+10); + sl = strchr(st->parent_devnm, '/'); + *sl = 0; + } else + st->parent_devnm[0] = 0; + *statelist = st; + if (test) + alert("TestMessage", st->devname, NULL, info); + new_found = 1; + } + return new_found; +} + +static int get_min_spare_size_required(struct state *st, unsigned long long *sizep) +{ + int fd; + + if (!st->metadata || + !st->metadata->ss->min_acceptable_spare_size) { + *sizep = 0; + return 0; + } + + fd = open(st->devname, O_RDONLY); + if (fd < 0) + return 1; + if (st->metadata->ss->external) + st->metadata->ss->load_container(st->metadata, fd, st->devname); + else + st->metadata->ss->load_super(st->metadata, fd, st->devname); + close(fd); + if (!st->metadata->sb) + return 1; + *sizep = st->metadata->ss->min_acceptable_spare_size(st->metadata); + st->metadata->ss->free_super(st->metadata); + + return 0; +} + +static int check_donor(struct state *from, struct state *to) +{ + struct state *sub; + + if (from == to) + return 0; + if (from->parent) + /* Cannot move from a member */ + return 0; + if (from->err) + return 0; + for (sub = from->subarray; sub; sub = sub->subarray) + /* If source array has degraded subarrays, don't + * remove anything + */ + if (sub->active < sub->raid) + return 0; + if (from->metadata->ss->external == 0) + if (from->active < from->raid) + return 0; + if (from->spare <= 0) + return 0; + return 1; +} + +static dev_t choose_spare(struct state *from, struct state *to, + struct domainlist *domlist, unsigned long long min_size) +{ + int d; + dev_t dev = 0; + + for (d = from->raid; !dev && d < MAX_DISKS; d++) { + if (from->devid[d] > 0 && + from->devstate[d] == 0) { + struct dev_policy *pol; + unsigned long long dev_size; + + if (to->metadata->ss->external && + test_partition_from_id(from->devid[d])) + continue; + + if (min_size && + dev_size_from_id(from->devid[d], &dev_size) && + dev_size < min_size) + continue; + + pol = devid_policy(from->devid[d]); + if (from->spare_group) + pol_add(&pol, pol_domain, + from->spare_group, NULL); + if (domain_test(domlist, pol, to->metadata->ss->name) == 1) + dev = from->devid[d]; + dev_policy_free(pol); + } + } + return dev; +} + +static dev_t container_choose_spare(struct state *from, struct state *to, + struct domainlist *domlist, + unsigned long long min_size, int active) +{ + /* This is similar to choose_spare, but we cannot trust devstate, + * so we need to read the metadata instead + */ + struct mdinfo *list; + struct supertype *st = from->metadata; + int fd = open(from->devname, O_RDONLY); + int err; + dev_t dev = 0; + + if (fd < 0) + return 0; + if (!st->ss->getinfo_super_disks) { + close(fd); + return 0; + } + + err = st->ss->load_container(st, fd, NULL); + close(fd); + if (err) + return 0; + + if (from == to) { + /* We must check if number of active disks has not increased + * since ioctl in main loop. mdmon may have added spare + * to subarray. If so we do not need to look for more spares + * so return non zero value */ + int active_cnt = 0; + struct mdinfo *dp; + list = st->ss->getinfo_super_disks(st); + if (!list) { + st->ss->free_super(st); + return 1; + } + dp = list->devs; + while (dp) { + if (dp->disk.state & (1<<MD_DISK_SYNC) && + !(dp->disk.state & (1<<MD_DISK_FAULTY))) + active_cnt++; + dp = dp->next; + } + sysfs_free(list); + if (active < active_cnt) { + /* Spare just activated.*/ + st->ss->free_super(st); + return 1; + } + } + + /* We only need one spare so full list not needed */ + list = container_choose_spares(st, min_size, domlist, from->spare_group, + to->metadata->ss->name, 1); + if (list) { + struct mdinfo *disks = list->devs; + if (disks) + dev = makedev(disks->disk.major, disks->disk.minor); + sysfs_free(list); + } + st->ss->free_super(st); + return dev; +} + +static void try_spare_migration(struct state *statelist, struct alert_info *info) +{ + struct state *from; + struct state *st; + + link_containers_with_subarrays(statelist); + for (st = statelist; st; st = st->next) + if (st->active < st->raid && + st->spare == 0 && !st->err) { + struct domainlist *domlist = NULL; + int d; + struct state *to = st; + unsigned long long min_size; + + if (to->parent_devnm[0] && !to->parent) + /* subarray monitored without parent container + * we can't move spares here */ + continue; + + if (to->parent) + /* member of a container */ + to = to->parent; + + if (get_min_spare_size_required(to, &min_size)) + continue; + if (to->metadata->ss->external) { + /* We must make sure there is + * no suitable spare in container already. + * If there is we don't add more */ + dev_t devid = container_choose_spare( + to, to, NULL, min_size, st->active); + if (devid > 0) + continue; + } + for (d = 0; d < MAX_DISKS; d++) + if (to->devid[d]) + domainlist_add_dev(&domlist, + to->devid[d], + to->metadata->ss->name); + if (to->spare_group) + domain_add(&domlist, to->spare_group); + /* + * No spare migration if the destination + * has no domain. Skip this array. + */ + if (!domlist) + continue; + for (from=statelist ; from ; from=from->next) { + dev_t devid; + if (!check_donor(from, to)) + continue; + if (from->metadata->ss->external) + devid = container_choose_spare( + from, to, domlist, min_size, 0); + else + devid = choose_spare(from, to, domlist, + min_size); + if (devid > 0 + && move_spare(from->devname, to->devname, devid)) { + alert("MoveSpare", to->devname, from->devname, info); + break; + } + } + domain_free(domlist); + } +} + +/* search the statelist to connect external + * metadata subarrays with their containers + * We always completely rebuild the tree from scratch as + * that is safest considering the possibility of entries + * disappearing or changing. + */ +static void link_containers_with_subarrays(struct state *list) +{ + struct state *st; + struct state *cont; + for (st = list; st; st = st->next) { + st->parent = NULL; + st->subarray = NULL; + } + for (st = list; st; st = st->next) + if (st->parent_devnm[0]) + for (cont = list; cont; cont = cont->next) + if (!cont->err && + cont->parent_devnm[0] == 0 && + strcmp(cont->devnm, st->parent_devnm) == 0) { + st->parent = cont; + st->subarray = cont->subarray; + cont->subarray = st; + break; + } +} + +/* Not really Monitor but ... */ +int Wait(char *dev) +{ + struct stat stb; + char devnm[32]; + int rv = 1; + int frozen_remaining = 3; + + if (stat(dev, &stb) != 0) { + pr_err("Cannot find %s: %s\n", dev, + strerror(errno)); + return 2; + } + strcpy(devnm, stat2devnm(&stb)); + + while(1) { + struct mdstat_ent *ms = mdstat_read(1, 0); + struct mdstat_ent *e; + + for (e=ms ; e; e=e->next) + if (strcmp(e->devnm, devnm) == 0) + break; + + if (e && e->percent == RESYNC_NONE) { + /* We could be in the brief pause before something + * starts. /proc/mdstat doesn't show that, but + * sync_action does. + */ + struct mdinfo mdi; + char buf[21]; + sysfs_init(&mdi, -1, devnm); + if (sysfs_get_str(&mdi, NULL, "sync_action", + buf, 20) > 0 && + strcmp(buf,"idle\n") != 0) { + e->percent = RESYNC_UNKNOWN; + if (strcmp(buf, "frozen\n") == 0) { + if (frozen_remaining == 0) + e->percent = RESYNC_NONE; + else + frozen_remaining -= 1; + } + } + } + if (!e || e->percent == RESYNC_NONE) { + if (e && e->metadata_version && + strncmp(e->metadata_version, "external:", 9) == 0) { + if (is_subarray(&e->metadata_version[9])) + ping_monitor(&e->metadata_version[9]); + else + ping_monitor(devnm); + } + free_mdstat(ms); + return rv; + } + free_mdstat(ms); + rv = 0; + mdstat_wait(5); + } +} + +#ifndef MDASSEMBLE + +static char *clean_states[] = { + "clear", "inactive", "readonly", "read-auto", "clean", NULL }; + +int WaitClean(char *dev, int sock, int verbose) +{ + int fd; + struct mdinfo *mdi; + int rv = 1; + char devnm[32]; + + fd = open(dev, O_RDONLY); + if (fd < 0) { + if (verbose) + pr_err("Couldn't open %s: %s\n", dev, strerror(errno)); + return 1; + } + + strcpy(devnm, fd2devnm(fd)); + mdi = sysfs_read(fd, devnm, GET_VERSION|GET_LEVEL|GET_SAFEMODE); + if (!mdi) { + if (verbose) + pr_err("Failed to read sysfs attributes for %s\n", dev); + close(fd); + return 0; + } + + switch(mdi->array.level) { + case LEVEL_LINEAR: + case LEVEL_MULTIPATH: + case 0: + /* safemode delay is irrelevant for these levels */ + rv = 0; + } + + /* for internal metadata the kernel handles the final clean + * transition, containers can never be dirty + */ + if (!is_subarray(mdi->text_version)) + rv = 0; + + /* safemode disabled ? */ + if (mdi->safe_mode_delay == 0) + rv = 0; + + if (rv) { + int state_fd = sysfs_open(fd2devnm(fd), NULL, "array_state"); + char buf[20]; + int delay = 5000; + + /* minimize the safe_mode_delay and prepare to wait up to 5s + * for writes to quiesce + */ + sysfs_set_safemode(mdi, 1); + + /* wait for array_state to be clean */ + while (1) { + rv = read(state_fd, buf, sizeof(buf)); + if (rv < 0) + break; + if (sysfs_match_word(buf, clean_states) <= 4) + break; + rv = sysfs_wait(state_fd, &delay); + if (rv < 0 && errno != EINTR) + break; + lseek(state_fd, 0, SEEK_SET); + } + if (rv < 0) + rv = 1; + else if (fping_monitor(sock) == 0 || + ping_monitor(mdi->text_version) == 0) { + /* we need to ping to close the window between array + * state transitioning to clean and the metadata being + * marked clean + */ + rv = 0; + } else + rv = 1; + if (rv && verbose) + pr_err("Error waiting for %s to be clean\n", + dev); + + /* restore the original safe_mode_delay */ + sysfs_set_safemode(mdi, mdi->safe_mode_delay); + close(state_fd); + } + + sysfs_free(mdi); + close(fd); + + return rv; +} +#endif /* MDASSEMBLE */ diff --git a/Query.c b/Query.c new file mode 100644 index 00000000..fbc1d103 --- /dev/null +++ b/Query.c @@ -0,0 +1,126 @@ +/* + * mdadm - manage Linux "md" devices aka RAID arrays. + * + * Copyright (C) 2002-2009 Neil Brown <neilb@suse.de> + * + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Author: Neil Brown + * Email: <neilb@suse.de> + */ + +#include "mdadm.h" +#include "md_p.h" +#include "md_u.h" + +int Query(char *dev) +{ + /* Give a brief description of the device, + * whether it is an md device and whether it has + * a superblock + */ + int fd = open(dev, O_RDONLY); + int vers; + int ioctlerr; + int superror; + struct mdinfo info; + mdu_array_info_t array; + struct supertype *st = NULL; + + unsigned long long larray_size; + struct stat stb; + char *mddev; + mdu_disk_info_t disc; + char *activity; + + if (fd < 0){ + pr_err("cannot open %s: %s\n", + dev, strerror(errno)); + return 1; + } + + vers = md_get_version(fd); + if (ioctl(fd, GET_ARRAY_INFO, &array)<0) + ioctlerr = errno; + else ioctlerr = 0; + + fstat(fd, &stb); + + if (vers>=9000 && !ioctlerr) { + if (!get_dev_size(fd, NULL, &larray_size)) + larray_size = 0; + } + + if (vers < 0) + printf("%s: is not an md array\n", dev); + else if (vers < 9000) + printf("%s: is an md device, but kernel cannot provide details\n", dev); + else if (ioctlerr == ENODEV) + printf("%s: is an md device which is not active\n", dev); + else if (ioctlerr) + printf("%s: is an md device, but gives \"%s\" when queried\n", + dev, strerror(ioctlerr)); + else { + printf("%s: %s %s %d devices, %d spare%s. Use mdadm --detail for more detail.\n", + dev, + human_size_brief(larray_size,IEC), + map_num(pers, array.level), + array.raid_disks, + array.spare_disks, array.spare_disks==1?"":"s"); + } + st = guess_super(fd); + if (st && st->ss->compare_super != NULL) + superror = st->ss->load_super(st, fd, dev); + else + superror = -1; + close(fd); + if (superror == 0) { + /* array might be active... */ + int uuid[4]; + struct map_ent *me, *map = NULL; + st->ss->getinfo_super(st, &info, NULL); + st->ss->uuid_from_super(st, uuid); + me = map_by_uuid(&map, uuid); + if (me) { + mddev = me->path; + disc.number = info.disk.number; + activity = "undetected"; + if (mddev && (fd = open(mddev, O_RDONLY))>=0) { + if (md_get_version(fd) >= 9000 && + ioctl(fd, GET_ARRAY_INFO, &array)>= 0) { + if (ioctl(fd, GET_DISK_INFO, &disc) >= 0 && + makedev((unsigned)disc.major,(unsigned)disc.minor) == stb.st_rdev) + activity = "active"; + else + activity = "mismatch"; + } + close(fd); + } + } else { + activity = "inactive"; + mddev = "array"; + } + printf("%s: device %d in %d device %s %s %s. Use mdadm --examine for more detail.\n", + dev, + info.disk.number, info.array.raid_disks, + activity, + map_num(pers, info.array.level), + mddev); + if (st->ss == &super0) + put_md_name(mddev); + } + return 0; +} diff --git a/README.initramfs b/README.initramfs new file mode 100644 index 00000000..8f9b8ddf --- /dev/null +++ b/README.initramfs @@ -0,0 +1,123 @@ +Assembling md arrays at boot time. +--------------------------------- +December 2005 + +These notes apply to 2.6 kernels only and, in some cases, +to 2.6.15 or later. + +Md arrays can be assembled at boot time using the 'autodetect' functionality +which is triggered by storing components of an array in partitions of type +'fd' - Linux Raid Autodetect. +They can also be assembled by specifying the component devices in a +kernel parameter such as + md=0,/dev/sda,/dev/sdb +In this case, /dev/md0 will be assembled (because of the 0) from the listed +devices. + +These mechanisms, while useful, do not provide complete functionality +and are unlikely to be extended. The preferred way to assemble md +arrays at boot time is using 'mdadm' or 'mdassemble' (which is a +trimmed-down mdadm). To assemble an array which contains the root +filesystem, mdadm needs to be run before that filesystem is mounted, +and so needs to be run from an initial-ram-fs. It is how this can +work that is the primary focus of this document. + +It should be noted up front that only the array containing the root +filesystem should be assembled from the initramfs. Any other arrays +should be assembled under the control of files on the main filesystem +as this enhanced flexibility and maintainability. + +A minimal initramfs for assembling md arrays can be created using 3 +files and one directory. These are: + +/bin Directory +/bin/mdadm statically linked mdadm binary +/bin/busybox statically linked busybox binary +/bin/sh hard link to /bin/busybox +/init a shell script which call mdadm appropriately. + +An example init script is: + +============================================== +#!/bin/sh + +echo 'Auto-assembling boot md array' +mkdir /proc +mount -t proc proc /proc +if [ -n "$rootuuid" ] +then arg=--uuid=$rootuuid +elif [ -n "$mdminor" ] +then arg=--super-minor=$mdminor +else arg=--super-minor=0 +fi +echo "Using $arg" +mdadm -Acpartitions $arg --auto=part /dev/mda +cd / +mount /dev/mda1 /root || mount /dev/mda /root +umount /proc +cd /root +exec chroot . /sbin/init < /dev/console > /dev/console 2>&1 +============================================= + +This could certainly be extended, or merged into a larger init script. +Though tested and in production use, it is not presented here as +"The Right Way" to do it, but as a useful example. +Some key points are: + + /proc needs to be mounted so that /proc/partitions can be accessed + by mdadm, and so that /proc/filesystems can be accessed by mount. + + The uuid of the array can be passed in as a kernel parameter + (rootuuid). As the kernel doesn't use this value, it is made available + in the environment for /init + + If no uuid is given, we default to md0, (--super-minor=0) which is a + commonly used to store the root filesystem. This may not work in + all situations. + + We assemble the array as a partitionable array (/dev/mda) even if we + end up using the whole array. There is no cost in using the partitionable + interface, and in this context it is simpler. + + We try mounting both /dev/mda1 and /dev/mda as they are the most like + part of the array to contain the root filesystem. + + The --auto flag is given to mdadm so that it will create /dev/md* + files automatically. This is needed as /dev will not contain + and md files, and udev will not create them (as udev only created device + files after the device exists, and mdadm need the device file to create + the device). Note that the created md files may not exist in /dev + of the mounted root filesystem. This needs to be deal with separately + from mdadm - possibly using udev. + + We do not need to create device files for the components which will + be assembled into /dev/mda. mdadm finds the major/minor numbers from + /proc/partitions and creates a temporary /dev file if one doesn't already + exist. + +The script "mkinitramfs" which is included with the mdadm distribution +can be used to create a minimal initramfs. It creates a file called +'init.cpio.gz' which can be specified as an 'initrd' to lilo or grub +(or whatever boot loader is being used). + + + + +Resume from an md array +----------------------- + +If you want to make use of the suspend-to-disk/resume functionality in Linux, +and want to have swap on an md array, you will need to assemble the array +before resume is possible. +However, because the array is active in the resumed image, you do not want +anything written to any drives during the resume process, such as superblock +updates or array resync. + +This can be achieved in 2.6.15-rc1 and later kernels using the +'start_readonly' module parameter. +Simply include the command + echo 1 > /sys/module/md_mod/parameters/start_ro +before assembling the array with 'mdadm'. +You can then echo + 9:0 +or whatever is appropriate to /sys/power/resume to trigger the resume. diff --git a/ReadMe.c b/ReadMe.c new file mode 100644 index 00000000..a05c74ec --- /dev/null +++ b/ReadMe.c @@ -0,0 +1,636 @@ +/* + * mdadm - manage Linux "md" devices aka RAID arrays. + * + * Copyright (C) 2001-2015 Neil Brown <neilb@suse.de> + * + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Author: Neil Brown + * Email: <neilb@suse.de> + */ + +#include "mdadm.h" + +#ifndef VERSION +#define VERSION "3.3.4" +#endif +#ifndef VERS_DATE +#define VERS_DATE "3rd August 2015" +#endif +char Version[] = "mdadm - v" VERSION " - " VERS_DATE "\n"; + +/* + * File: ReadMe.c + * + * This file contains general comments about the implementation + * and the various usage messages that can be displayed by mdadm + * + */ + +/* + * mdadm has 7 major modes of operation: + * 1/ Create + * This mode is used to create a new array with a superblock + * 2/ Assemble + * This mode is used to assemble the parts of a previously created + * array into an active array. Components can be explicitly given + * or can be searched for. mdadm (optionally) checks that the components + * do form a bona-fide array, and can, on request, fiddle superblock + * version numbers so as to assemble a faulty array. + * 3/ Build + * This is for building legacy arrays without superblocks + * 4/ Manage + * This is for doing something to one or more devices + * in an array, such as add,remove,fail. + * run/stop/readonly/readwrite are also available + * 5/ Misc + * This is for doing things to individual devices. + * They might be parts of an array so + * zero-superblock, examine might be appropriate + * They might be md arrays so + * run,stop,rw,ro,detail might be appropriate + * Also query will treat it as either + * 6/ Monitor + * This mode never exits but just monitors arrays and reports changes. + * 7/ Grow + * This mode allows for changing of key attributes of a raid array, such + * as size, number of devices, and possibly even layout. + * 8/ Incremental + * Is assembles an array incrementally instead of all at once. + * As devices are discovered they can be passed to "mdadm --incremental" + * which will collect them. When enough devices to for an array are + * found, it is started. + */ + +char short_options[]="-ABCDEFGIQhVXYWZ:vqbc:i:l:p:m:n:x:u:c:d:z:U:N:sarfRSow1tye:"; +char short_bitmap_options[]= + "-ABCDEFGIQhVXYWZ:vqb:c:i:l:p:m:n:x:u:c:d:z:U:N:sarfRSow1tye:"; +char short_bitmap_auto_options[]= + "-ABCDEFGIQhVXYWZ:vqb:c:i:l:p:m:n:x:u:c:d:z:U:N:sa:rfRSow1tye:"; + +struct option long_options[] = { + {"manage", 0, 0, ManageOpt}, + {"misc", 0, 0, MiscOpt}, + {"assemble", 0, 0, 'A'}, + {"build", 0, 0, 'B'}, + {"create", 0, 0, 'C'}, + {"detail", 0, 0, 'D'}, + {"examine", 0, 0, 'E'}, + {"follow", 0, 0, 'F'}, + {"grow", 0, 0, 'G'}, + {"incremental",0,0, 'I'}, + {"zero-superblock", 0, 0, KillOpt}, /* deliberately not a short_option */ + {"query", 0, 0, 'Q'}, + {"examine-bitmap", 0, 0, 'X'}, + {"auto-detect", 0, 0, AutoDetect}, + {"detail-platform", 0, 0, DetailPlatform}, + {"kill-subarray", 1, 0, KillSubarray}, + {"update-subarray", 1, 0, UpdateSubarray}, + {"udev-rules", 2, 0, UdevRules}, + {"offroot", 0, 0, OffRootOpt}, + {"examine-badblocks", 0, 0, ExamineBB}, + + {"dump", 1, 0, Dump}, + {"restore", 1, 0, Restore}, + + /* synonyms */ + {"monitor", 0, 0, 'F'}, + + /* after those will normally come the name of the md device */ + + {"help", 0, 0, 'h'}, + {"help-options",0,0, HelpOptions}, + {"version", 0, 0, 'V'}, + {"verbose", 0, 0, 'v'}, + {"quiet", 0, 0, 'q'}, + + /* For create or build: */ + {"chunk", 1, 0, ChunkSize}, + {"rounding", 1, 0, ChunkSize}, /* for linear, chunk is really a + * rounding number */ + {"level", 1, 0, 'l'}, /* 0,1,4,5,6,linear */ + {"parity", 1, 0, Layout}, /* {left,right}-{a,}symmetric */ + {"layout", 1, 0, Layout}, + {"raid-disks",1, 0, 'n'}, + {"raid-devices",1, 0, 'n'}, + {"spare-disks",1,0, 'x'}, + {"spare-devices",1,0, 'x'}, + {"size", 1, 0, 'z'}, + {"auto", 1, 0, Auto}, /* also for --assemble */ + {"assume-clean",0,0, AssumeClean }, + {"metadata", 1, 0, 'e'}, /* superblock format */ + {"bitmap", 1, 0, Bitmap}, + {"bitmap-chunk", 1, 0, BitmapChunk}, + {"write-behind", 2, 0, WriteBehind}, + {"write-mostly",0, 0, WriteMostly}, + {"re-add", 0, 0, ReAdd}, + {"homehost", 1, 0, HomeHost}, + {"symlinks", 1, 0, Symlinks}, + {"data-offset",1, 0, DataOffset}, + + /* For assemble */ + {"uuid", 1, 0, 'u'}, + {"super-minor",1,0, SuperMinor}, + {"name", 1, 0, 'N'}, + {"config", 1, 0, ConfigFile}, + {"scan", 0, 0, 's'}, + {"force", 0, 0, Force}, + {"update", 1, 0, 'U'}, + {"freeze-reshape", 0, 0, FreezeReshape}, + + /* Management */ + {"add", 0, 0, Add}, + {"add-spare", 0, 0, AddSpare}, + {"remove", 0, 0, Remove}, + {"fail", 0, 0, Fail}, + {"set-faulty",0, 0, Fail}, + {"replace", 0, 0, Replace}, + {"with", 0, 0, With}, + {"run", 0, 0, 'R'}, + {"stop", 0, 0, 'S'}, + {"readonly", 0, 0, 'o'}, + {"readwrite", 0, 0, 'w'}, + {"no-degraded",0,0, NoDegraded }, + {"wait", 0, 0, WaitOpt}, + {"wait-clean", 0, 0, Waitclean }, + {"action", 1, 0, Action }, + + /* For Detail/Examine */ + {"brief", 0, 0, Brief}, + {"export", 0, 0, 'Y'}, + {"sparc2.2", 0, 0, Sparc22}, + {"test", 0, 0, 't'}, + {"prefer", 1, 0, Prefer}, + + /* For Follow/monitor */ + {"mail", 1, 0, EMail}, + {"program", 1, 0, ProgramOpt}, + {"alert", 1, 0, ProgramOpt}, + {"increment", 1, 0, Increment}, + {"delay", 1, 0, 'd'}, + {"daemonise", 0, 0, Fork}, + {"daemonize", 0, 0, Fork}, + {"oneshot", 0, 0, '1'}, + {"pid-file", 1, 0, 'i'}, + {"syslog", 0, 0, 'y'}, + {"no-sharing", 0, 0, NoSharing}, + + /* For Grow */ + {"backup-file", 1,0, BackupFile}, + {"invalid-backup",0,0,InvalidBackup}, + {"array-size", 1, 0, 'Z'}, + {"continue", 0, 0, Continue}, + + /* For Incremental */ + {"rebuild-map", 0, 0, RebuildMapOpt}, + {"path", 1, 0, IncrementalPath}, + + {0, 0, 0, 0} +}; + +char Usage[] = +"Usage: mdadm --help\n" +" for help\n" +; + +char Help[] = +"mdadm is used for building, managing, and monitoring\n" +"Linux md devices (aka RAID arrays)\n" +"Usage: mdadm --create device options...\n" +" Create a new array from unused devices.\n" +" mdadm --assemble device options...\n" +" Assemble a previously created array.\n" +" mdadm --build device options...\n" +" Create or assemble an array without metadata.\n" +" mdadm --manage device options...\n" +" make changes to an existing array.\n" +" mdadm --misc options... devices\n" +" report on or modify various md related devices.\n" +" mdadm --grow options device\n" +" resize/reshape an active array\n" +" mdadm --incremental device\n" +" add/remove a device to/from an array as appropriate\n" +" mdadm --monitor options...\n" +" Monitor one or more array for significant changes.\n" +" mdadm device options...\n" +" Shorthand for --manage.\n" +"Any parameter that does not start with '-' is treated as a device name\n" +"or, for --examine-bitmap, a file name.\n" +"The first such name is often the name of an md device. Subsequent\n" +"names are often names of component devices.\n" +"\n" +" For detailed help on the above major modes use --help after the mode\n" +" e.g.\n" +" mdadm --assemble --help\n" +" For general help on options use\n" +" mdadm --help-options\n" +; + +char OptionHelp[] = +"Any parameter that does not start with '-' is treated as a device name\n" +"or, for --examine-bitmap, a file name.\n" +"The first such name is often the name of an md device. Subsequent\n" +"names are often names of component devices.\n" +"\n" +"Some common options are:\n" +" --help -h : General help message or, after above option,\n" +" mode specific help message\n" +" --help-options : This help message\n" +" --version -V : Print version information for mdadm\n" +" --verbose -v : Be more verbose about what is happening\n" +" --quiet -q : Don't print un-necessary messages\n" +" --brief -b : Be less verbose, more brief\n" +" --export -Y : With --detail, --detail-platform or --examine use\n" +" key=value format for easy import into environment\n" +" --force -f : Override normal checks and be more forceful\n" +"\n" +" --assemble -A : Assemble an array\n" +" --build -B : Build an array without metadata\n" +" --create -C : Create a new array\n" +" --detail -D : Display details of an array\n" +" --examine -E : Examine superblock on an array component\n" +" --examine-bitmap -X: Display the detail of a bitmap file\n" +" --examine-badblocks: Display list of known bad blocks on device\n" +" --monitor -F : monitor (follow) some arrays\n" +" --grow -G : resize/ reshape and array\n" +" --incremental -I : add/remove a single device to/from an array as appropriate\n" +" --query -Q : Display general information about how a\n" +" device relates to the md driver\n" +" --auto-detect : Start arrays auto-detected by the kernel\n" +; +/* +"\n" +" For create or build:\n" +" --bitmap= -b : File to store bitmap in - may pre-exist for --build\n" +" --chunk= -c : chunk size of kibibytes\n" +" --rounding= : rounding factor for linear array (==chunk size)\n" +" --level= -l : raid level: 0,1,4,5,6,10,linear, or mp for create.\n" +" : 0,1,10,mp,faulty or linear for build.\n" +" --parity= -p : raid5/6 parity algorithm: {left,right}-{,a}symmetric\n" +" --layout= : same as --parity, for RAID10: [fno]NN \n" +" --raid-devices= -n : number of active devices in array\n" +" --spare-devices= -x: number of spare (eXtra) devices in initial array\n" +" --size= -z : Size (in K) of each drive in RAID1/4/5/6/10 - optional\n" +" --force -f : Honour devices as listed on command line. Don't\n" +" : insert a missing drive for RAID5.\n" +" --assume-clean : Assume the array is already in-sync. This is dangerous for RAID5.\n" +" --bitmap-chunk= : chunksize of bitmap in bitmap file (Kilobytes)\n" +" --delay= -d : seconds between bitmap updates\n" +" --write-behind= : number of simultaneous write-behind requests to allow (requires bitmap)\n" +" --name= -N : Textual name for array - max 32 characters\n" +"\n" +" For assemble:\n" +" --bitmap= -b : File to find bitmap information in\n" +" --uuid= -u : uuid of array to assemble. Devices which don't\n" +" have this uuid are excluded\n" +" --super-minor= -m : minor number to look for in super-block when\n" +" choosing devices to use.\n" +" --name= -N : Array name to look for in super-block.\n" +" --config= -c : config file\n" +" --scan -s : scan config file for missing information\n" +" --force -f : Assemble the array even if some superblocks appear out-of-date\n" +" --update= -U : Update superblock: try '-A --update=?' for list of options.\n" +" --no-degraded : Do not start any degraded arrays - default unless --scan.\n" +"\n" +" For detail or examine:\n" +" --brief -b : Just print device name and UUID\n" +"\n" +" For follow/monitor:\n" +" --mail= -m : Address to mail alerts of failure to\n" +" --program= -p : Program to run when an event is detected\n" +" --alert= : same as --program\n" +" --delay= -d : seconds of delay between polling state. default=60\n" +"\n" +" General management:\n" +" --add -a : add, or hotadd subsequent devices\n" +" --re-add : re-add a recently removed device\n" +" --remove -r : remove subsequent devices\n" +" --fail -f : mark subsequent devices as faulty\n" +" --set-faulty : same as --fail\n" +" --replace : mark a device for replacement\n" +" --run -R : start a partially built array\n" +" --stop -S : deactivate array, releasing all resources\n" +" --readonly -o : mark array as readonly\n" +" --readwrite -w : mark array as readwrite\n" +" --zero-superblock : erase the MD superblock from a device.\n" +" --wait -W : wait for recovery/resync/reshape to finish.\n" +; +*/ + +char Help_create[] = +"Usage: mdadm --create device -chunk=X --level=Y --raid-devices=Z devices\n" +"\n" +" This usage will initialise a new md array, associate some\n" +" devices with it, and activate the array. In order to create an\n" +" array with some devices missing, use the special word 'missing' in\n" +" place of the relevant device name.\n" +"\n" +" Before devices are added, they are checked to see if they already contain\n" +" raid superblocks or filesystems. They are also checked to see if\n" +" the variance in device size exceeds 1%.\n" +" If any discrepancy is found, the user will be prompted for confirmation\n" +" before the array is created. The presence of a '--run' can override this\n" +" caution.\n" +"\n" +" If the --size option is given then only that many kilobytes of each\n" +" device is used, no matter how big each device is.\n" +" If no --size is given, the apparent size of the smallest drive given\n" +" is used for raid level 1 and greater, and the full device is used for\n" +" other levels.\n" +"\n" +" Options that are valid with --create (-C) are:\n" +" --bitmap= : Create a bitmap for the array with the given filename\n" +" : or an internal bitmap is 'internal' is given\n" +" --chunk= -c : chunk size in kibibytes\n" +" --rounding= : rounding factor for linear array (==chunk size)\n" +" --level= -l : raid level: 0,1,4,5,6,10,linear,multipath and synonyms\n" +" --parity= -p : raid5/6 parity algorithm: {left,right}-{,a}symmetric\n" +" --layout= : same as --parity, for RAID10: [fno]NN \n" +" --raid-devices= -n : number of active devices in array\n" +" --spare-devices= -x: number of spare (eXtra) devices in initial array\n" +" --size= -z : Size (in K) of each drive in RAID1/4/5/6/10 - optional\n" +" --data-offset= : Space to leave between start of device and start\n" +" : of array data.\n" +" --force -f : Honour devices as listed on command line. Don't\n" +" : insert a missing drive for RAID5.\n" +" --run -R : insist of running the array even if not all\n" +" : devices are present or some look odd.\n" +" --readonly -o : start the array readonly - not supported yet.\n" +" --name= -N : Textual name for array - max 32 characters\n" +" --bitmap-chunk= : bitmap chunksize in Kilobytes.\n" +" --delay= -d : bitmap update delay in seconds.\n" +"\n" +; + +char Help_build[] = +"Usage: mdadm --build device -chunk=X --level=Y --raid-devices=Z devices\n" +"\n" +" This usage is similar to --create. The difference is that it creates\n" +" a legacy array without a superblock. With these arrays there is no\n" +" different between initially creating the array and subsequently\n" +" assembling the array, except that hopefully there is useful data\n" +" there in the second case.\n" +"\n" +" The level may only be 0, 1, 10, linear, multipath, or faulty.\n" +" All devices must be listed and the array will be started once complete.\n" +" Options that are valid with --build (-B) are:\n" +" --bitmap= : file to store/find bitmap information in.\n" +" --chunk= -c : chunk size of kibibytes\n" +" --rounding= : rounding factor for linear array (==chunk size)\n" +" --level= -l : 0, 1, 10, linear, multipath, faulty\n" +" --raid-devices= -n : number of active devices in array\n" +" --bitmap-chunk= : bitmap chunksize in Kilobytes.\n" +" --delay= -d : bitmap update delay in seconds.\n" +; + +char Help_assemble[] = +"Usage: mdadm --assemble device options...\n" +" mdadm --assemble --scan options...\n" +"\n" +"This usage assembles one or more raid arrays from pre-existing\n" +"components.\n" +"For each array, mdadm needs to know the md device, the identity of\n" +"the array, and a number of sub devices. These can be found in a number\n" +"of ways.\n" +"\n" +"The md device is given on the command line, is found listed in the\n" +"config file, or can be deduced from the array identity.\n" +"The array identity is determined either from the --uuid, --name, or\n" +"--super-minor commandline arguments, from the config file,\n" +"or from the first component device on the command line.\n" +"\n" +"The different combinations of these are as follows:\n" +" If the --scan option is not given, then only devices and identities\n" +" listed on the command line are considered.\n" +" The first device will be the array device, and the remainder will be\n" +" examined when looking for components.\n" +" If an explicit identity is given with --uuid or --super-minor, then\n" +" only devices with a superblock which matches that identity is considered,\n" +" otherwise every device listed is considered.\n" +"\n" +" If the --scan option is given, and no devices are listed, then\n" +" every array listed in the config file is considered for assembly.\n" +" The identity of candidate devices are determined from the config file.\n" +" After these arrays are assembled, mdadm will look for other devices\n" +" that could form further arrays and tries to assemble them. This can\n" +" be disabled using the 'AUTO' option in the config file.\n" +"\n" +" If the --scan option is given as well as one or more devices, then\n" +" Those devices are md devices that are to be assembled. Their identity\n" +" and components are determined from the config file.\n" +"\n" +" If mdadm can not find all of the components for an array, it will assemble\n" +" it but not activate it unless --run or --scan is given. To preserve this\n" +" behaviour even with --scan, add --no-degraded. Note that \"all of the\n" +" components\" means as many as were present the last time the array was running\n" +" as recorded in the superblock. If the array was already degraded, and\n" +" the missing device is not a new problem, it will still be assembled. It\n" +" is only newly missing devices that cause the array not to be started.\n" +"\n" +"Options that are valid with --assemble (-A) are:\n" +" --bitmap= : bitmap file to use with the array\n" +" --uuid= -u : uuid of array to assemble. Devices which don't\n" +" have this uuid are excluded\n" +" --super-minor= -m : minor number to look for in super-block when\n" +" choosing devices to use.\n" +" --name= -N : Array name to look for in super-block.\n" +" --config= -c : config file\n" +" --scan -s : scan config file for missing information\n" +" --run -R : Try to start the array even if not enough devices\n" +" for a full array are present\n" +" --force -f : Assemble the array even if some superblocks appear\n" +" : out-of-date. This involves modifying the superblocks.\n" +" --update= -U : Update superblock: try '-A --update=?' for option list.\n" +" --no-degraded : Assemble but do not start degraded arrays.\n" +" --readonly -o : Mark the array as read-only. No resync will start.\n" +; + +char Help_manage[] = +"Usage: mdadm arraydevice options component devices...\n" +"\n" +"This usage is for managing the component devices within an array.\n" +"The --manage option is not needed and is assumed if the first argument\n" +"is a device name or a management option.\n" +"The first device listed will be taken to be an md array device, any\n" +"subsequent devices are (potential) components of that array.\n" +"\n" +"Options that are valid with management mode are:\n" +" --add -a : hotadd subsequent devices to the array\n" +" --re-add : subsequent devices are re-added if there were\n" +" : recent members of the array\n" +" --remove -r : remove subsequent devices, which must not be active\n" +" --fail -f : mark subsequent devices a faulty\n" +" --set-faulty : same as --fail\n" +" --replace : mark device(s) to be replaced by spares. Once\n" +" : replacement completes, device will be marked faulty\n" +" --with : Indicate which spare a previous '--replace' should\n" +" : prefer to use\n" +" --run -R : start a partially built array\n" +" --stop -S : deactivate array, releasing all resources\n" +" --readonly -o : mark array as readonly\n" +" --readwrite -w : mark array as readwrite\n" +; + +char Help_misc[] = +"Usage: mdadm misc_option devices...\n" +"\n" +"This usage is for performing some task on one or more devices, which\n" +"may be arrays or components, depending on the task.\n" +"The --misc option is not needed (though it is allowed) and is assumed\n" +"if the first argument in a misc option.\n" +"\n" +"Options that are valid with the miscellaneous mode are:\n" +" --query -Q : Display general information about how a\n" +" device relates to the md driver\n" +" --detail -D : Display details of an array\n" +" --detail-platform : Display hardware/firmware details\n" +" --examine -E : Examine superblock on an array component\n" +" --examine-bitmap -X: Display contents of a bitmap file\n" +" --examine-badblocks: Display list of known bad blocks on device\n" +" --zero-superblock : erase the MD superblock from a device.\n" +" --run -R : start a partially built array\n" +" --stop -S : deactivate array, releasing all resources\n" +" --readonly -o : mark array as readonly\n" +" --readwrite -w : mark array as readwrite\n" +" --test -t : exit status 0 if ok, 1 if degrade, 2 if dead, 4 if missing\n" +" --wait -W : wait for resync/rebuild/recovery to finish\n" +" --action= : initiate or abort ('idle' or 'frozen') a 'check' or 'repair'.\n" +; + +char Help_monitor[] = +"Usage: mdadm --monitor options devices\n" +"\n" +"This usage causes mdadm to monitor a number of md arrays by periodically\n" +"polling their status and acting on any changes.\n" +"If any devices are listed then those devices are monitored, otherwise\n" +"all devices listed in the config file are monitored.\n" +"The address for mailing advisories to, and the program to handle\n" +"each change can be specified in the config file or on the command line.\n" +"There must be at least one destination for advisories, whether\n" +"an email address, a program, or --syslog\n" +"\n" +"Options that are valid with the monitor (-F --follow) mode are:\n" +" --mail= -m : Address to mail alerts of failure to\n" +" --program= -p : Program to run when an event is detected\n" +" --alert= : same as --program\n" +" --syslog -y : Report alerts via syslog\n" +" --increment= -r : Report RebuildNN events in the given increment. default=20\n" +" --delay= -d : seconds of delay between polling state. default=60\n" +" --config= -c : specify a different config file\n" +" --scan -s : find mail-address/program in config file\n" +" --daemonise -f : Fork and continue in child, parent exits\n" +" --pid-file= -i : In daemon mode write pid to specified file instead of stdout\n" +" --oneshot -1 : Check for degraded arrays, then exit\n" +" --test -t : Generate a TestMessage event against each array at startup\n" +; + +char Help_grow[] = +"Usage: mdadm --grow device options\n" +"\n" +"This usage causes mdadm to attempt to reconfigure a running array.\n" +"This is only possibly if the kernel being used supports a particular\n" +"reconfiguration.\n" +"\n" +"Options that are valid with the grow (-G --grow) mode are:\n" +" --level= -l : Tell mdadm what level to convert the array to.\n" +" --layout= -p : For a FAULTY array, set/change the error mode.\n" +" : for other arrays, update the layout\n" +" --size= -z : Change the active size of devices in an array.\n" +" : This is useful if all devices have been replaced\n" +" : with larger devices. Value is in Kilobytes, or\n" +" : the special word 'max' meaning 'as large as possible'.\n" +" --assume-clean : When increasing the --size, this flag will avoid\n" +" : a resync of the new space\n" +" --chunk= -c : Change the chunksize of the array\n" +" --raid-devices= -n : Change the number of active devices in an array.\n" +" --add= -a : Add listed devices as part of reshape. This is\n" +" : needed for resizing a RAID0 which cannot have\n" +" : spares already present.\n" +" --bitmap= -b : Add or remove a write-intent bitmap.\n" +" --backup-file= file : A file on a different device to store data for a\n" +" : short time while increasing raid-devices on a\n" +" : RAID4/5/6 array. Also needed throughout a reshape\n" +" : when changing parameters other than raid-devices\n" +" --array-size= -Z : Change visible size of array. This does not change\n" +" : any data on the device, and is not stable across restarts.\n" +" --data-offset= : Location on device to move start of data to.\n" +; + +char Help_incr[] = +"Usage: mdadm --incremental [-Rqrsf] device\n" +"\n" +"This usage allows for incremental assembly of md arrays. Devices can be\n" +"added one at a time as they are discovered. Once an array has all expected\n" +"devices, it will be started.\n" +"\n" +"Optionally, the process can be reversed by using the fail option.\n" +"When fail mode is invoked, mdadm will see if the device belongs to an array\n" +"and then both fail (if needed) and remove the device from that array.\n" +"\n" +"Options that are valid with incremental assembly (-I --incremental) are:\n" +" --run -R : Run arrays as soon as a minimal number of devices are\n" +" : present rather than waiting for all expected.\n" +" --quiet -q : Don't print any information messages, just errors.\n" +" --rebuild-map -r : Rebuild the 'map' file that mdadm uses for tracking\n" +" : partial arrays.\n" +" --scan -s : Use with -R to start any arrays that have the minimal\n" +" : required number of devices, but are not yet started.\n" +" --fail -f : First fail (if needed) and then remove device from\n" +" : any array that it is a member of.\n" +; + +char Help_config[] = +"The /etc/mdadm/mdadm.conf config file:\n\n" +" The config file contains, apart from blank lines and comment lines that\n" +" start with a hash(#), array lines, device lines, and various\n" +" configuration lines.\n" +" Each line is constructed of a number of space separated words, and can\n" +" be continued on subsequent physical lines by indenting those lines.\n" +"\n" +" A device line starts with the word 'device' and then has a number of words\n" +" which identify devices. These words should be names of devices in the\n" +" filesystem, and can contain wildcards. There can be multiple words or each\n" +" device line, and multiple device lines. All devices so listed are checked\n" +" for relevant super blocks when assembling arrays.\n" +"\n" +" An array line start with the word 'array'. This is followed by the name of\n" +" the array device in the filesystem, e.g. '/dev/md2'. Subsequent words\n" +" describe the identity of the array, used to recognise devices to include in the\n" +" array. The identity can be given as a UUID with a word starting 'uuid=', or\n" +" as a minor-number stored in the superblock using 'super-minor=', or as a list\n" +" of devices. This is given as a comma separated list of names, possibly\n" +" containing wildcards, preceded by 'devices='. If multiple critea are given,\n" +" than a device must match all of them to be considered.\n" +"\n" +" Other configuration lines include:\n" +" mailaddr, mailfrom, program used for --monitor mode\n" +" create, auto used when creating device names in /dev\n" +" homehost, policy, part-policy used to guide policy in various\n" +" situations\n" +"\n" +; + +char *mode_help[mode_count] = { + [0] = Help, + [ASSEMBLE] = Help_assemble, + [BUILD] = Help_build, + [CREATE] = Help_create, + [MANAGE] = Help_manage, + [MISC] = Help_misc, + [MONITOR] = Help_monitor, + [GROW] = Help_grow, + [INCREMENTAL] = Help_incr, +}; @@ -0,0 +1,213 @@ + - add 'name' field to metadata type and use it. + - use validate_geometry more + - metadata should be able to check/reject bitmap stuff. + +DDF: + Three new metadata types: + ddf - used only to create a container. + ddf-bvd - used to create an array in a container + ddf-svd - used to create a secondary array from bvds. + + Usage: + mdadm -C /dev/ddf1 /dev/sd[abcdef] + mdadm -C /dev/md1 -e ddf /dev/sd[a-f] + mdadm -C /dev/md1 -l container /dev/sd[a-f] + + Each of these create a new ddf container using all those + devices. The name 'ddf*' signals that ddf metadata should be used. + '-e ddf' only supports one level - 'container'. 'container' is only + supported by ddf. + + mdadm -C /dev/md1 -l0 -n4 /dev/ddf1 # or maybe not ??? + mdadm -C /dev/md1 -l1 -n2 /dev/sda /dev/sdb + If exactly one device is given, and it is a container, we select + devices from that container. + If devices are given that are already in use, they must be in use by + a container, and the array is created in the container. + If devices given are bvds, we slip under the hood to make + the svd arrays. + + mdadm -A /dev/ddf ...... + base drives make a container. Anything in that container is started + auto-read-only. + if /dev/ddf is already assembled, we assemble bvds and svds inside it. + + +2005-dec-20 + Want an incremental assembly mode to work nicely with udev. + Core usage would be something like + mdadm --incr-assemble /dev/newdevice + This would + - examine the device to determine uuid etc. + - look for a match in /etc/mdadm.conf, abort if not found + - find that device and collect current contents + - perform an 'assemble' analysis to make sure we have the best set of devices. + - remove or add devices as appropriate + - possibly start the array if it was complete + + Other usages could involve + - specify which array to auto-add to. + This requires an existing array for uuid matching... is there any point? + + - + + +2004-june-02 + * Don't print 'errors' flag, it is meaningless. DONE + * Handle new superblock format + * create device file on demand, particularly partitionable devices. DONE + BUT figure a way to create the partition devices. + auto=partN + * Use Event: interface to listen for events. DONE, untested + * Make sure mdadm -As can assemble multi-level RAIDs ok. + * --build to build raid1 or multipath arrays + clean or not ??? + +---------------------------------------------------------------------------- +* mdadm --monitor to monitor failed multipath paths and re-instate them. + +* Maybe make "--help" fit in 80x24 and have a --long-help with more info. DONE + + +* maybe "missing" instead of <bold>missing</> in doco DONE +* possibly wait for resync to start, or even finish while assembling.- NO + +* -Db should have a devices= entry if possible. - DONE +* when assembling multipath arrays, ignore any error indicators. - DONE +* rationalise --monitor usage: + mdadm --monitor + doesn't do as expected. DONE + +* --assemble could have a --update option. - DONE + following word can be: + sparc2.2 + super-minor + +* mdadm /dev/md11, where md11 is raid0 can segfault, particularly when looking in the + [UU_UUU] string ... which doesn't exist ! +It should be more sensible. DONE + +Example: + +from Raimund Sacherer <raimund.sacherer@ngit.at> + +mke2fs -m0 -q /dev/ram1 300 +mount -n -t ext2 /dev/ram1 /tmp +echo DEVICE /dev/[sh]* >> /tmp/mdadm.conf +mdadm -Esb /dev/[sh]* 2>/dev/null >> /tmp/mdadm.conf +mdadm -ARsc /tmp/mdadm.conf +umount /tmp + + +?? Allow -S /dev/md? - current complains subsequent not a/d/r - DONE + +* new "Query" mode to subsume --detail and --examine. + --query or -Q, takes a device and tells if it is an MD device, + and also tells in a raid superblock is found. + DONE + +* write mdstat.c to parse /proc/mdstat file + Build list of arrays: name, rebuild-percent + DONE + +* parse /proc/partitions and map major/minor into /dev/* names, + and use that for default DEVICE list ???? + +* --detail --scan to read /proc/mdstat, and then iterate over these, + but assume --brief. --verbose can override + check each subdevice to see if it is in conf_get_devs. + Warn if not. + DONE, but don't warn yet... + +* Support multipath ... maybe... + maybe DONE + +* --follow to syslog + +* --follow to move spares around DONE + +* --follow to notice other events: DONE + rebuild started + spare activated + spare removed + spare added + +------------------------------------ +- --examine --scan scans all drives and build an mdadm.conf file DONE + +- check superblock checksum in examine DONE +- report "chunk" or "rounding" depending on raid level DONE +- report "linear" instead of "-1" for raid level DONE +- decode ayout depending on raid level DONE +- --verbose and --force flags. DONE + +- set md_minor, *_disks for Create - DONE +- for create raid5, how to choose between + all working, but not insync + one missing, one spare, insync DONE (--force) +- and for raid1 - some failed drives... (missing) + +- when RUN_ARRAY, make sure *_disks counts are right + +- get --detail to extract extra stuff from superblock, + like uuid DONE +- --detail --brief to give a config file line DONE +- parse config file. DONE +- test... + +- when --assemble --scan, if an underlying device is an md device, + then try to assemble that device first. + + +- mdadm -S /dev/md0 /dev/md1 gives internal error FIXED + +- mdadm --detail --scan print summary of what it can find? DONE + + +--------- +Assemble doesn't add spares. - DONE +Create to allow "missing" name for devices. +Create to accept "--force" for do exactly what is requested +- get Assemble to upgrade devices if force flag. +ARRAY lines in config file to have super_minor=n +ARRAY lines in config file to have device=pattern, and only accept + those devices + If UUID given, insist on that + If not, but super_minor given, require all found with that minor + to have same uuid + If only device given, all valid supers on those devices must have + same uuid +allow /dev/mdX as first argument before any options +Possible --dry-run option for create and assemble--force + +Assemble to check that all devices mentioned in superblock + are present. + +New mode: --Monitor (or --Follow) + Periodically check status of all arrays (listed in config file). + Log every event and apparent cause - or differences + Email and alert - or run a program - for important events + Move spares around if necessary. + + An Array line can have a spare-group= field that indicates that + the array shares spares with other arrays with the same + spare-group name. + If an array has a failed and no spares, then check all other + arrays in the spare group. If one has no failures and a spare, + then consider that spare. + Choose the smallest considered spare that is large enough. + If there is one, then hot-remove it from it's home, and + hot-add it to the array in question. + + --mail-to address + --alert-handler program + + Will also extract information from /proc/mdstat if present, + and consider 20% marks in rebuild as events. + + Events are: + drive fails - causes mail to be sent + rebuild started + spare activated + spare removed + spare added diff --git a/bitmap.c b/bitmap.c new file mode 100644 index 00000000..bbe9baec --- /dev/null +++ b/bitmap.c @@ -0,0 +1,446 @@ +/* + * mdadm - manage Linux "md" devices aka RAID arrays. + * + * Copyright (C) 2004 Paul Clements, SteelEye Technology, Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include "mdadm.h" + +static inline void sb_le_to_cpu(bitmap_super_t *sb) +{ + sb->magic = __le32_to_cpu(sb->magic); + sb->version = __le32_to_cpu(sb->version); + /* uuid gets no translation */ + sb->events = __le64_to_cpu(sb->events); + sb->events_cleared = __le64_to_cpu(sb->events_cleared); + sb->state = __le32_to_cpu(sb->state); + sb->chunksize = __le32_to_cpu(sb->chunksize); + sb->daemon_sleep = __le32_to_cpu(sb->daemon_sleep); + sb->sync_size = __le64_to_cpu(sb->sync_size); + sb->write_behind = __le32_to_cpu(sb->write_behind); +} + +static inline void sb_cpu_to_le(bitmap_super_t *sb) +{ + sb_le_to_cpu(sb); /* these are really the same thing */ +} + +mapping_t bitmap_states[] = { + { "OK", 0 }, + { "Out of date", 2 }, + { NULL, -1 } +}; + +const char *bitmap_state(int state_num) +{ + char *state = map_num(bitmap_states, state_num); + return state ? state : "Unknown"; +} + +const char *human_chunksize(unsigned long bytes) +{ + static char buf[16]; + char *suffixes[] = { "B", "KB", "MB", "GB", "TB", NULL }; + int i = 0; + + while (bytes >> 10) { + bytes >>= 10; + i++; + } + + snprintf(buf, sizeof(buf), "%lu %s", bytes, suffixes[i]); + + return buf; +} + +typedef struct bitmap_info_s { + bitmap_super_t sb; + unsigned long long total_bits; + unsigned long long dirty_bits; +} bitmap_info_t; + +/* count the dirty bits in the first num_bits of byte */ +static inline int count_dirty_bits_byte(char byte, int num_bits) +{ + int num = 0; + + switch (num_bits) { /* fall through... */ + case 8: if (byte & 128) num++; + case 7: if (byte & 64) num++; + case 6: if (byte & 32) num++; + case 5: if (byte & 16) num++; + case 4: if (byte & 8) num++; + case 3: if (byte & 4) num++; + case 2: if (byte & 2) num++; + case 1: if (byte & 1) num++; + default: break; + } + + return num; +} + +int count_dirty_bits(char *buf, int num_bits) +{ + int i, num = 0; + + for (i = 0; i < num_bits / 8; i++) + num += count_dirty_bits_byte(buf[i], 8); + + if (num_bits % 8) /* not an even byte boundary */ + num += count_dirty_bits_byte(buf[i], num_bits % 8); + + return num; +} + +/* calculate the size of the bitmap given the array size and bitmap chunksize */ +unsigned long long bitmap_bits(unsigned long long array_size, + unsigned long chunksize) +{ + return (array_size * 512 + chunksize - 1) / chunksize; +} + +unsigned long bitmap_sectors(struct bitmap_super_s *bsb) +{ + unsigned long long bits = bitmap_bits(__le64_to_cpu(bsb->sync_size), + __le32_to_cpu(bsb->chunksize)); + int bits_per_sector = 8*512; + return (bits + bits_per_sector - 1) / bits_per_sector; +} + +bitmap_info_t *bitmap_fd_read(int fd, int brief) +{ + /* Note: fd might be open O_DIRECT, so we must be + * careful to align reads properly + */ + unsigned long long total_bits = 0, read_bits = 0, dirty_bits = 0; + bitmap_info_t *info; + void *buf; + unsigned int n, skip; + + if (posix_memalign(&buf, 4096, 8192) != 0) { + pr_err("failed to allocate 8192 bytes\n"); + return NULL; + } + n = read(fd, buf, 8192); + + info = xmalloc(sizeof(*info)); + + if (n < sizeof(info->sb)) { + pr_err("failed to read superblock of bitmap file: %s\n", strerror(errno)); + free(info); + free(buf); + return NULL; + } + memcpy(&info->sb, buf, sizeof(info->sb)); + skip = sizeof(info->sb); + + sb_le_to_cpu(&info->sb); /* convert superblock to CPU byte ordering */ + + if (brief || info->sb.sync_size == 0 || info->sb.chunksize == 0) + goto out; + + /* read the rest of the file counting total bits and dirty bits -- + * we stop when either: + * 1) we hit EOF, in which case we assume the rest of the bits (if any) + * are dirty + * 2) we've read the full bitmap, in which case we ignore any trailing + * data in the file + */ + total_bits = bitmap_bits(info->sb.sync_size, info->sb.chunksize); + + while(read_bits < total_bits) { + unsigned long long remaining = total_bits - read_bits; + + if (n == 0) { + n = read(fd, buf, 8192); + skip = 0; + if (n <= 0) + break; + } + if (remaining > (n-skip) * 8) /* we want the full buffer */ + remaining = (n-skip) * 8; + + dirty_bits += count_dirty_bits(buf+skip, remaining); + + read_bits += remaining; + n = 0; + } + + if (read_bits < total_bits) { /* file truncated... */ + pr_err("WARNING: bitmap file is not large enough for array size %llu!\n\n", + (unsigned long long)info->sb.sync_size); + total_bits = read_bits; + } +out: + free(buf); + info->total_bits = total_bits; + info->dirty_bits = dirty_bits; + return info; +} + +int bitmap_file_open(char *filename, struct supertype **stp) +{ + int fd; + struct stat stb; + struct supertype *st = *stp; + + if (stat(filename, &stb) < 0) { + pr_err("failed to find file %s: %s\n", + filename, strerror(errno)); + return -1; + } + if ((S_IFMT & stb.st_mode) == S_IFBLK) { + fd = open(filename, O_RDONLY|O_DIRECT); + if (fd < 0) { + pr_err("failed to open bitmap file %s: %s\n", + filename, strerror(errno)); + return -1; + } + /* block device, so we are probably after an internal bitmap */ + if (!st) st = guess_super(fd); + if (!st) { + /* just look at device... */ + lseek(fd, 0, 0); + } else if (!st->ss->locate_bitmap) { + pr_err("No bitmap possible with %s metadata\n", + st->ss->name); + return -1; + } else + st->ss->locate_bitmap(st, fd); + + *stp = st; + } else { + fd = open(filename, O_RDONLY|O_DIRECT); + if (fd < 0) { + pr_err("failed to open bitmap file %s: %s\n", + filename, strerror(errno)); + return -1; + } + } + + return fd; +} + +__u32 swapl(__u32 l) +{ + char *c = (char*)&l; + char t= c[0]; + c[0] = c[3]; + c[3] = t; + + t = c[1]; + c[1] = c[2]; + c[2] = t; + return l; +} +int ExamineBitmap(char *filename, int brief, struct supertype *st) +{ + /* + * Read the bitmap file and display its contents + */ + + bitmap_super_t *sb; + bitmap_info_t *info; + int rv = 1; + char buf[64]; + int swap; + int fd; + __u32 uuid32[4]; + + fd = bitmap_file_open(filename, &st); + if (fd < 0) + return rv; + + info = bitmap_fd_read(fd, brief); + if (!info) + return rv; + sb = &info->sb; + if (sb->magic != BITMAP_MAGIC && md_get_version(fd) > 0) { + pr_err("This is an md array. To view a bitmap you need to examine\n"); + pr_err("a member device, not the array.\n"); + pr_err("Reporting bitmap that would be used if this array were used\n"); + pr_err("as a member of some other array\n"); + } + close(fd); + printf(" Filename : %s\n", filename); + printf(" Magic : %08x\n", sb->magic); + if (sb->magic != BITMAP_MAGIC) { + pr_err("invalid bitmap magic 0x%x, the bitmap file appears\n", + sb->magic); + pr_err("to be corrupted or missing.\n"); + } + printf(" Version : %d\n", sb->version); + if (sb->version < BITMAP_MAJOR_LO || + sb->version > BITMAP_MAJOR_HI) { + pr_err("unknown bitmap version %d, either the bitmap file\n", + sb->version); + pr_err("is corrupted or you need to upgrade your tools\n"); + goto free_info; + } + + rv = 0; + if (st) + swap = st->ss->swapuuid; + else +#if __BYTE_ORDER == BIG_ENDIAN + swap = 0; +#else + swap = 1; +#endif + memcpy(uuid32, sb->uuid, 16); + if (swap) + printf(" UUID : %08x:%08x:%08x:%08x\n", + swapl(uuid32[0]), + swapl(uuid32[1]), + swapl(uuid32[2]), + swapl(uuid32[3])); + else + printf(" UUID : %08x:%08x:%08x:%08x\n", + uuid32[0], + uuid32[1], + uuid32[2], + uuid32[3]); + + printf(" Events : %llu\n", (unsigned long long)sb->events); + printf(" Events Cleared : %llu\n", (unsigned long long)sb->events_cleared); + printf(" State : %s\n", bitmap_state(sb->state)); + printf(" Chunksize : %s\n", human_chunksize(sb->chunksize)); + printf(" Daemon : %ds flush period\n", sb->daemon_sleep); + if (sb->write_behind) + sprintf(buf, "Allow write behind, max %d", sb->write_behind); + else + sprintf(buf, "Normal"); + printf(" Write Mode : %s\n", buf); + printf(" Sync Size : %llu%s\n", (unsigned long long)sb->sync_size/2, + human_size(sb->sync_size * 512)); + if (brief) + goto free_info; + printf(" Bitmap : %llu bits (chunks), %llu dirty (%2.1f%%)\n", + info->total_bits, info->dirty_bits, + 100.0 * info->dirty_bits / (info->total_bits?:1)); +free_info: + free(info); + return rv; +} + +int CreateBitmap(char *filename, int force, char uuid[16], + unsigned long chunksize, unsigned long daemon_sleep, + unsigned long write_behind, + unsigned long long array_size /* sectors */, + int major) +{ + /* + * Create a bitmap file with a superblock and (optionally) a full bitmap + */ + + FILE *fp; + int rv = 1; + char block[512]; + bitmap_super_t sb; + long long bytes, filesize; + + if (!force && access(filename, F_OK) == 0) { + pr_err("bitmap file %s already exists, use --force to overwrite\n", filename); + return rv; + } + + fp = fopen(filename, "w"); + if (fp == NULL) { + pr_err("failed to open bitmap file %s: %s\n", + filename, strerror(errno)); + return rv; + } + + if (chunksize == UnSet) { + /* We don't want more than 2^21 chunks, as 2^11 fill up one + * 4K page (2 bytes per chunk), and 2^10 address of those + * fill up a 4K indexing page. 2^20 might be safer, especially + * on 64bit hosts, so use that. + */ + chunksize = DEFAULT_BITMAP_CHUNK; + /* <<20 for 2^20 chunks, >>9 to convert bytes to sectors */ + while (array_size > ((unsigned long long)chunksize << (20-9))) + chunksize <<= 1; + } + + memset(&sb, 0, sizeof(sb)); + sb.magic = BITMAP_MAGIC; + sb.version = major; + if (uuid != NULL) + memcpy(sb.uuid, uuid, 16); + sb.chunksize = chunksize; + sb.daemon_sleep = daemon_sleep; + sb.write_behind = write_behind; + sb.sync_size = array_size; + + sb_cpu_to_le(&sb); /* convert to on-disk byte ordering */ + + if (fwrite(&sb, sizeof(sb), 1, fp) != 1) { + pr_err("failed to write superblock to bitmap file %s: %s\n", filename, strerror(errno)); + goto out; + } + + /* calculate the size of the bitmap and write it to disk */ + bytes = (bitmap_bits(array_size, chunksize) + 7) / 8; + if (!bytes) { + rv = 0; + goto out; + } + + filesize = bytes + sizeof(sb); + + memset(block, 0xff, sizeof(block)); + + while (bytes > 0) { + if (fwrite(block, sizeof(block), 1, fp) != 1) { + pr_err("failed to write bitmap file %s: %s\n", filename, strerror(errno)); + goto out; + } + bytes -= sizeof(block); + } + + rv = 0; + fflush(fp); + /* make the file be the right size (well, to the nearest byte) */ + if (ftruncate(fileno(fp), filesize)) + perror("ftrunace"); +out: + fclose(fp); + if (rv) + unlink(filename); /* possibly corrupted, better get rid of it */ + return rv; +} + +int bitmap_update_uuid(int fd, int *uuid, int swap) +{ + struct bitmap_super_s bm; + if (lseek(fd, 0, 0) != 0) + return 1; + if (read(fd, &bm, sizeof(bm)) != sizeof(bm)) + return 1; + if (bm.magic != __cpu_to_le32(BITMAP_MAGIC)) + return 1; + copy_uuid(bm.uuid, uuid, swap); + if (lseek(fd, 0, 0) != 0) + return 2; + if (write(fd, &bm, sizeof(bm)) != sizeof(bm)) { + lseek(fd, 0, 0); + return 2; + } + lseek(fd, 0, 0); + return 0; +} diff --git a/bitmap.h b/bitmap.h new file mode 100644 index 00000000..c8725a30 --- /dev/null +++ b/bitmap.h @@ -0,0 +1,287 @@ +/* + * bitmap.h: Copyright (C) Peter T. Breuer (ptb@ot.uc3m.es) 2003 + * + * additions: Copyright (C) 2003-2004, Paul Clements, SteelEye Technology, Inc. + */ +#ifndef BITMAP_H +#define BITMAP_H 1 + +#define BITMAP_MAJOR_LO 3 +/* version 4 insists the bitmap is in little-endian order + * with version 3, it is host-endian which is non-portable + */ +#define BITMAP_MAJOR_HI 4 +#define BITMAP_MAJOR_HOSTENDIAN 3 + +#define BITMAP_MINOR 39 + +/* + * in-memory bitmap: + * + * Use 16 bit block counters to track pending writes to each "chunk". + * The 2 high order bits are special-purpose, the first is a flag indicating + * whether a resync is needed. The second is a flag indicating whether a + * resync is active. + * This means that the counter is actually 14 bits: + * + * +--------+--------+------------------------------------------------+ + * | resync | resync | counter | + * | needed | active | | + * | (0-1) | (0-1) | (0-16383) | + * +--------+--------+------------------------------------------------+ + * + * The "resync needed" bit is set when: + * a '1' bit is read from storage at startup. + * a write request fails on some drives + * a resync is aborted on a chunk with 'resync active' set + * It is cleared (and resync-active set) when a resync starts across all drives + * of the chunk. + * + * + * The "resync active" bit is set when: + * a resync is started on all drives, and resync_needed is set. + * resync_needed will be cleared (as long as resync_active wasn't already set). + * It is cleared when a resync completes. + * + * The counter counts pending write requests, plus the on-disk bit. + * When the counter is '1' and the resync bits are clear, the on-disk + * bit can be cleared aswell, thus setting the counter to 0. + * When we set a bit, or in the counter (to start a write), if the fields is + * 0, we first set the disk bit and set the counter to 1. + * + * If the counter is 0, the on-disk bit is clear and the stipe is clean + * Anything that dirties the stipe pushes the counter to 2 (at least) + * and sets the on-disk bit (lazily). + * If a periodic sweep find the counter at 2, it is decremented to 1. + * If the sweep find the counter at 1, the on-disk bit is cleared and the + * counter goes to zero. + * + * Also, we'll hijack the "map" pointer itself and use it as two 16 bit block + * counters as a fallback when "page" memory cannot be allocated: + * + * Normal case (page memory allocated): + * + * page pointer (32-bit) + * + * [ ] ------+ + * | + * +-------> [ ][ ]..[ ] (4096 byte page == 2048 counters) + * c1 c2 c2048 + * + * Hijacked case (page memory allocation failed): + * + * hijacked page pointer (32-bit) + * + * [ ][ ] (no page memory allocated) + * counter #1 (16-bit) counter #2 (16-bit) + * + */ + +#ifdef __KERNEL__ + +#define PAGE_BITS (PAGE_SIZE << 3) +#define PAGE_BIT_SHIFT (PAGE_SHIFT + 3) + +typedef __u16 bitmap_counter_t; +#define COUNTER_BITS 16 +#define COUNTER_BIT_SHIFT 4 +#define COUNTER_BYTE_RATIO (COUNTER_BITS / 8) +#define COUNTER_BYTE_SHIFT (COUNTER_BIT_SHIFT - 3) + +#define NEEDED_MASK ((bitmap_counter_t) (1 << (COUNTER_BITS - 1))) +#define RESYNC_MASK ((bitmap_counter_t) (1 << (COUNTER_BITS - 2))) +#define COUNTER_MAX ((bitmap_counter_t) RESYNC_MASK - 1) +#define NEEDED(x) (((bitmap_counter_t) x) & NEEDED_MASK) +#define RESYNC(x) (((bitmap_counter_t) x) & RESYNC_MASK) +#define COUNTER(x) (((bitmap_counter_t) x) & COUNTER_MAX) + +/* how many counters per page? */ +#define PAGE_COUNTER_RATIO (PAGE_BITS / COUNTER_BITS) +/* same, except a shift value for more efficient bitops */ +#define PAGE_COUNTER_SHIFT (PAGE_BIT_SHIFT - COUNTER_BIT_SHIFT) +/* same, except a mask value for more efficient bitops */ +#define PAGE_COUNTER_MASK (PAGE_COUNTER_RATIO - 1) + +#define BITMAP_BLOCK_SIZE 512 +#define BITMAP_BLOCK_SHIFT 9 + +/* how many blocks per chunk? (this is variable) */ +#define CHUNK_BLOCK_RATIO(bitmap) ((bitmap)->chunksize >> BITMAP_BLOCK_SHIFT) +#define CHUNK_BLOCK_SHIFT(bitmap) ((bitmap)->chunkshift - BITMAP_BLOCK_SHIFT) +#define CHUNK_BLOCK_MASK(bitmap) (CHUNK_BLOCK_RATIO(bitmap) - 1) + +/* when hijacked, the counters and bits represent even larger "chunks" */ +/* there will be 1024 chunks represented by each counter in the page pointers */ +#define PAGEPTR_BLOCK_RATIO(bitmap) \ + (CHUNK_BLOCK_RATIO(bitmap) << PAGE_COUNTER_SHIFT >> 1) +#define PAGEPTR_BLOCK_SHIFT(bitmap) \ + (CHUNK_BLOCK_SHIFT(bitmap) + PAGE_COUNTER_SHIFT - 1) +#define PAGEPTR_BLOCK_MASK(bitmap) (PAGEPTR_BLOCK_RATIO(bitmap) - 1) + +/* + * on-disk bitmap: + * + * Use one bit per "chunk" (block set). We do the disk I/O on the bitmap + * file a page at a time. There's a superblock at the start of the file. + */ + +/* map chunks (bits) to file pages - offset by the size of the superblock */ +#define CHUNK_BIT_OFFSET(chunk) ((chunk) + (sizeof(bitmap_super_t) << 3)) + +#endif + +/* + * bitmap structures: + */ + +#define BITMAP_MAGIC 0x6d746962 + +/* use these for bitmap->flags and bitmap->sb->state bit-fields */ +enum bitmap_state { + BITMAP_ACTIVE = 0x001, /* the bitmap is in use */ + BITMAP_STALE = 0x002 /* the bitmap file is out of date or had -EIO */ +}; + +/* the superblock at the front of the bitmap file -- little endian */ +typedef struct bitmap_super_s { + __u32 magic; /* 0 BITMAP_MAGIC */ + __u32 version; /* 4 the bitmap major for now, could change... */ + __u8 uuid[16]; /* 8 128 bit uuid - must match md device uuid */ + __u64 events; /* 24 event counter for the bitmap (1)*/ + __u64 events_cleared;/*32 event counter when last bit cleared (2) */ + __u64 sync_size; /* 40 the size of the md device's sync range(3) */ + __u32 state; /* 48 bitmap state information */ + __u32 chunksize; /* 52 the bitmap chunk size in bytes */ + __u32 daemon_sleep; /* 56 seconds between disk flushes */ + __u32 write_behind; /* 60 number of outstanding write-behind writes */ + + __u8 pad[256 - 64]; /* set to zero */ +} bitmap_super_t; + +/* notes: + * (1) This event counter is updated before the eventcounter in the md superblock + * When a bitmap is loaded, it is only accepted if this event counter is equal + * to, or one greater than, the event counter in the superblock. + * (2) This event counter is updated when the other one is *if*and*only*if* the + * array is not degraded. As bits are not cleared when the array is degraded, + * this represents the last time that any bits were cleared. + * If a device is being added that has an event count with this value or + * higher, it is accepted as conforming to the bitmap. + * (3)This is the number of sectors represented by the bitmap, and is the range that + * resync happens across. For raid1 and raid5/6 it is the size of individual + * devices. For raid10 it is the size of the array. + */ + +#ifdef __KERNEL__ + +/* the in-memory bitmap is represented by bitmap_pages */ +struct bitmap_page { + /* + * map points to the actual memory page + */ + char *map; + /* + * in emergencies (when map cannot be alloced), hijack the map + * pointer and use it as two counters itself + */ + unsigned int hijacked; + /* + * count of dirty bits on the page + */ + int count; +}; + +/* keep track of bitmap file pages that have pending writes on them */ +struct page_list { + struct list_head list; + struct page *page; +}; + +/* the main bitmap structure - one per mddev */ +struct bitmap { + struct bitmap_page *bp; + unsigned long pages; /* total number of pages in the bitmap */ + unsigned long missing_pages; /* number of pages not yet allocated */ + + mddev_t *mddev; /* the md device that the bitmap is for */ + + int counter_bits; /* how many bits per block counter */ + + /* bitmap chunksize -- how much data does each bit represent? */ + unsigned long chunksize; + unsigned long chunkshift; /* chunksize = 2^chunkshift (for bitops) */ + unsigned long chunks; /* total number of data chunks for the array */ + + /* We hold a count on the chunk currently being synced, and drop + * it when the last block is started. If the resync is aborted + * midway, we need to be able to drop that count, so we remember + * the counted chunk.. + */ + unsigned long syncchunk; + + __u64 events_cleared; + + /* bitmap spinlock */ + spinlock_t lock; + + struct file *file; /* backing disk file */ + struct page *sb_page; /* cached copy of the bitmap file superblock */ + struct page **filemap; /* list of cache pages for the file */ + unsigned long *filemap_attr; /* attributes associated w/ filemap pages */ + unsigned long file_pages; /* number of pages in the file */ + + unsigned long flags; + + /* + * the bitmap daemon - periodically wakes up and sweeps the bitmap + * file, cleaning up bits and flushing out pages to disk as necessary + */ + mdk_thread_t *daemon; + unsigned long daemon_sleep; /* how many seconds between updates? */ + + /* + * bitmap write daemon - this daemon performs writes to the bitmap file + * this thread is only needed because of a limitation in ext3 (jbd) + * that does not allow a task to have two journal transactions ongoing + * simultaneously (even if the transactions are for two different + * filesystems) -- in the case of bitmap, that would be the filesystem + * that the bitmap file resides on and the filesystem that is mounted + * on the md device -- see current->journal_info in jbd/transaction.c + */ + mdk_thread_t *write_daemon; + mdk_thread_t *writeback_daemon; + spinlock_t write_lock; + struct semaphore write_ready; + struct semaphore write_done; + unsigned long writes_pending; + wait_queue_head_t write_wait; + struct list_head write_pages; + struct list_head complete_pages; + mempool_t *write_pool; +}; + +/* the bitmap API */ + +/* these are used only by md/bitmap */ +int bitmap_create(mddev_t *mddev); +void bitmap_destroy(mddev_t *mddev); +int bitmap_active(struct bitmap *bitmap); + +char *file_path(struct file *file, char *buf, int count); +void bitmap_print_sb(struct bitmap *bitmap); +int bitmap_update_sb(struct bitmap *bitmap); + +int bitmap_setallbits(struct bitmap *bitmap); + +/* these are exported */ +void bitmap_startwrite(struct bitmap *bitmap, sector_t offset, unsigned long sectors); +void bitmap_endwrite(struct bitmap *bitmap, sector_t offset, unsigned long sectors, + int success); +int bitmap_start_sync(struct bitmap *bitmap, sector_t offset, int *blocks); +void bitmap_end_sync(struct bitmap *bitmap, sector_t offset, int *blocks, int aborted); +void bitmap_close_sync(struct bitmap *bitmap); + +int bitmap_unplug(struct bitmap *bitmap); +#endif + +#endif diff --git a/config.c b/config.c new file mode 100644 index 00000000..a882ed33 --- /dev/null +++ b/config.c @@ -0,0 +1,1178 @@ +/* + * mdadm - manage Linux "md" devices aka RAID arrays. + * + * Copyright (C) 2001-2009 Neil Brown <neilb@suse.de> + * + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Author: Neil Brown + * Email: <neilb@suse.de> + */ + +#include "mdadm.h" +#include "dlink.h" +#include <dirent.h> +#include <glob.h> +#include <fnmatch.h> +#include <ctype.h> +#include <pwd.h> +#include <grp.h> + +/* + * Read the config file + * + * conf_get_uuids gets a list of devicename+uuid pairs + * conf_get_devs gets device names after expanding wildcards + * + * Each keeps the returned list and frees it when asked to make + * a new list. + * + * The format of the config file needs to be fairly extensible. + * Now, arrays only have names and uuids and devices merely are. + * But later arrays might want names, and devices might want superblock + * versions, and who knows what else. + * I like free format, abhore backslash line continuation, adore + * indentation for structure and am ok about # comments. + * + * So, each line that isn't blank or a #comment must either start + * with a key word, and not be indented, or must start with a + * non-key-word and must be indented. + * + * Keywords are DEVICE and ARRAY ... and several others. + * DEV{ICE} introduces some devices that might contain raid components. + * e.g. + * DEV style=0 /dev/sda* /dev/hd* + * DEV style=1 /dev/sd[b-f]* + * ARR{AY} describes an array giving md device and attributes like uuid=whatever + * e.g. + * ARRAY /dev/md0 uuid=whatever name=something + * Spaces separate words on each line. Quoting, with "" or '' protects them, + * but may not wrap over lines + * + */ + +#ifndef CONFFILE +#define CONFFILE "/etc/mdadm.conf" +#endif +#ifndef CONFFILE2 +/* for Debian compatibility .... */ +#define CONFFILE2 "/etc/mdadm/mdadm.conf" +#endif +char DefaultConfFile[] = CONFFILE; +char DefaultConfDir[] = CONFFILE ".d"; +char DefaultAltConfFile[] = CONFFILE2; +char DefaultAltConfDir[] = CONFFILE2 ".d"; + +enum linetype { Devices, Array, Mailaddr, Mailfrom, Program, CreateDev, + Homehost, AutoMode, Policy, PartPolicy, LTEnd }; +char *keywords[] = { + [Devices] = "devices", + [Array] = "array", + [Mailaddr] = "mailaddr", + [Mailfrom] = "mailfrom", + [Program] = "program", + [CreateDev]= "create", + [Homehost] = "homehost", + [AutoMode] = "auto", + [Policy] = "policy", + [PartPolicy]="part-policy", + [LTEnd] = NULL +}; + +/* + * match_keyword returns an index into the keywords array, or -1 for no match + * case is ignored, and at least three characters must be given + */ + +int match_keyword(char *word) +{ + int len = strlen(word); + int n; + + if (len < 3) return -1; + for (n=0; keywords[n]; n++) { + if (strncasecmp(word, keywords[n], len)==0) + return n; + } + return -1; +} + +struct conf_dev { + struct conf_dev *next; + char *name; +} *cdevlist = NULL; + +struct mddev_dev *load_partitions(void) +{ + FILE *f = fopen("/proc/partitions", "r"); + char buf[1024]; + struct mddev_dev *rv = NULL; + if (f == NULL) { + pr_err("cannot open /proc/partitions\n"); + return NULL; + } + while (fgets(buf, 1024, f)) { + int major, minor; + char *name, *mp; + struct mddev_dev *d; + + buf[1023] = '\0'; + if (buf[0] != ' ') + continue; + major = strtoul(buf, &mp, 10); + if (mp == buf || *mp != ' ') + continue; + minor = strtoul(mp, NULL, 10); + + name = map_dev(major, minor, 1); + if (!name) + continue; + d = xmalloc(sizeof(*d)); + memset(d, 0, sizeof(*d)); + d->devname = xstrdup(name); + d->next = rv; + rv = d; + } + fclose(f); + return rv; +} + +struct mddev_dev *load_containers(void) +{ + struct mdstat_ent *mdstat = mdstat_read(0, 0); + struct mdstat_ent *ent; + struct mddev_dev *d; + struct mddev_dev *rv = NULL; + struct map_ent *map = NULL, *me; + + if (!mdstat) + return NULL; + + for (ent = mdstat; ent; ent = ent->next) + if (ent->metadata_version && + strncmp(ent->metadata_version, "external:", 9) == 0 && + !is_subarray(&ent->metadata_version[9])) { + d = xmalloc(sizeof(*d)); + memset(d, 0, sizeof(*d)); + me = map_by_devnm(&map, ent->devnm); + if (me) + d->devname = xstrdup(me->path); + else if (asprintf(&d->devname, "/dev/%s", ent->devnm) < 0) { + free(d); + continue; + } + d->next = rv; + rv = d; + } + free_mdstat(mdstat); + map_free(map); + + return rv; +} + +struct createinfo createinfo = { + .autof = 2, /* by default, create devices with standard names */ + .symlinks = 1, + .names = 0, /* By default, stick with numbered md devices. */ + .bblist = 1, /* Use a bad block list by default */ +#ifdef DEBIAN + .gid = 6, /* disk */ + .mode = 0660, +#else + .mode = 0600, +#endif +}; + +int parse_auto(char *str, char *msg, int config) +{ + int autof; + if (str == NULL || *str == 0) + autof = 2; + else if (strcasecmp(str,"no")==0) + autof = 1; + else if (strcasecmp(str,"yes")==0) + autof = 2; + else if (strcasecmp(str,"md")==0) + autof = config?5:3; + else { + /* There might be digits, and maybe a hypen, at the end */ + char *e = str + strlen(str); + int num = 4; + int len; + while (e > str && isdigit(e[-1])) + e--; + if (*e) { + num = atoi(e); + if (num <= 0) num = 1; + } + if (e > str && e[-1] == '-') + e--; + len = e - str; + if ((len == 2 && strncasecmp(str,"md",2)==0)) { + autof = config ? 5 : 3; + } else if ((len == 3 && strncasecmp(str,"yes",3)==0)) { + autof = 2; + } else if ((len == 3 && strncasecmp(str,"mdp",3)==0)) { + autof = config ? 6 : 4; + } else if ((len == 1 && strncasecmp(str,"p",1)==0) || + (len >= 4 && strncasecmp(str,"part",4)==0)) { + autof = 6; + } else { + pr_err("%s arg of \"%s\" unrecognised: use no,yes,md,mdp,part\n" + " optionally followed by a number.\n", + msg, str); + exit(2); + } + autof |= num << 3; + } + return autof; +} + +static void createline(char *line) +{ + char *w; + char *ep; + + for (w=dl_next(line); w!=line; w=dl_next(w)) { + if (strncasecmp(w, "auto=", 5) == 0) + createinfo.autof = parse_auto(w+5, "auto=", 1); + else if (strncasecmp(w, "owner=", 6) == 0) { + if (w[6] == 0) { + pr_err("missing owner name\n"); + continue; + } + createinfo.uid = strtoul(w+6, &ep, 10); + if (*ep != 0) { + struct passwd *pw; + /* must be a name */ + pw = getpwnam(w+6); + if (pw) + createinfo.uid = pw->pw_uid; + else + pr_err("CREATE user %s not found\n", w+6); + } + } else if (strncasecmp(w, "group=", 6) == 0) { + if (w[6] == 0) { + pr_err("missing group name\n"); + continue; + } + createinfo.gid = strtoul(w+6, &ep, 10); + if (*ep != 0) { + struct group *gr; + /* must be a name */ + gr = getgrnam(w+6); + if (gr) + createinfo.gid = gr->gr_gid; + else + pr_err("CREATE group %s not found\n", w+6); + } + } else if (strncasecmp(w, "mode=", 5) == 0) { + if (w[5] == 0) { + pr_err("missing CREATE mode\n"); + continue; + } + createinfo.mode = strtoul(w+5, &ep, 8); + if (*ep != 0) { + createinfo.mode = 0600; + pr_err("unrecognised CREATE mode %s\n", + w+5); + } + } else if (strncasecmp(w, "metadata=", 9) == 0) { + /* style of metadata to use by default */ + int i; + for (i=0; superlist[i] && !createinfo.supertype; i++) + createinfo.supertype = + superlist[i]->match_metadata_desc(w+9); + if (!createinfo.supertype) + pr_err("metadata format %s unknown, ignoring\n", + w+9); + } else if (strncasecmp(w, "symlinks=yes", 12) == 0) + createinfo.symlinks = 1; + else if (strncasecmp(w, "symlinks=no", 11) == 0) + createinfo.symlinks = 0; + else if (strncasecmp(w, "names=yes", 12) == 0) + createinfo.names = 1; + else if (strncasecmp(w, "names=no", 11) == 0) + createinfo.names = 0; + else if (strncasecmp(w, "bbl=no", 11) == 0) + createinfo.bblist = 0; + else if (strncasecmp(w, "bbl=yes", 11) == 0) + createinfo.bblist = 1; + else { + pr_err("unrecognised word on CREATE line: %s\n", + w); + } + } +} + +void devline(char *line) +{ + char *w; + struct conf_dev *cd; + + for (w=dl_next(line); w != line; w=dl_next(w)) { + if (w[0] == '/' || strcasecmp(w, "partitions") == 0 || + strcasecmp(w, "containers") == 0) { + cd = xmalloc(sizeof(*cd)); + cd->name = xstrdup(w); + cd->next = cdevlist; + cdevlist = cd; + } else { + pr_err("unreconised word on DEVICE line: %s\n", + w); + } + } +} + +struct mddev_ident *mddevlist = NULL; +struct mddev_ident **mddevlp = &mddevlist; + +static int is_number(char *w) +{ + /* check if there are 1 or more digits and nothing else */ + int digits = 0; + while (*w && isdigit(*w)) { + digits++; + w++; + } + return (digits && ! *w); +} + +void arrayline(char *line) +{ + char *w; + + struct mddev_ident mis; + struct mddev_ident *mi; + + mis.uuid_set = 0; + mis.super_minor = UnSet; + mis.level = UnSet; + mis.raid_disks = UnSet; + mis.spare_disks = 0; + mis.devices = NULL; + mis.devname = NULL; + mis.spare_group = NULL; + mis.autof = 0; + mis.next = NULL; + mis.st = NULL; + mis.bitmap_fd = -1; + mis.bitmap_file = NULL; + mis.name[0] = 0; + mis.container = NULL; + mis.member = NULL; + + for (w=dl_next(line); w!=line; w=dl_next(w)) { + if (w[0] == '/' || strchr(w, '=') == NULL) { + /* This names the device, or is '<ignore>'. + * The rules match those in create_mddev. + * 'w' must be: + * /dev/md/{anything} + * /dev/mdNN + * /dev/md_dNN + * <ignore> + * or anything that doesn't start '/' or '<' + */ + if (strcasecmp(w, "<ignore>") == 0 || + strncmp(w, "/dev/md/", 8) == 0 || + (w[0] != '/' && w[0] != '<') || + (strncmp(w, "/dev/md", 7) == 0 && + is_number(w+7)) || + (strncmp(w, "/dev/md_d", 9) == 0 && + is_number(w+9)) + ) { + /* This is acceptable */; + if (mis.devname) + pr_err("only give one device per ARRAY line: %s and %s\n", + mis.devname, w); + else + mis.devname = w; + }else { + pr_err("%s is an invalid name for an md device - ignored.\n", w); + } + } else if (strncasecmp(w, "uuid=", 5)==0 ) { + if (mis.uuid_set) + pr_err("only specify uuid once, %s ignored.\n", + w); + else { + if (parse_uuid(w+5, mis.uuid)) + mis.uuid_set = 1; + else + pr_err("bad uuid: %s\n", w); + } + } else if (strncasecmp(w, "super-minor=", 12)==0 ) { + if (mis.super_minor != UnSet) + pr_err("only specify super-minor once, %s ignored.\n", + w); + else { + char *endptr; + int minor = strtol(w+12, &endptr, 10); + + if (w[12]==0 || endptr[0]!=0 || minor < 0) + pr_err("invalid super-minor number: %s\n", + w); + else + mis.super_minor = minor; + } + } else if (strncasecmp(w, "name=", 5)==0) { + if (mis.name[0]) + pr_err("only specify name once, %s ignored.\n", + w); + else if (strlen(w+5) > 32) + pr_err("name too long, ignoring %s\n", w); + else + strcpy(mis.name, w+5); + + } else if (strncasecmp(w, "bitmap=", 7) == 0) { + if (mis.bitmap_file) + pr_err("only specify bitmap file once. %s ignored\n", + w); + else + mis.bitmap_file = xstrdup(w+7); + + } else if (strncasecmp(w, "devices=", 8 ) == 0 ) { + if (mis.devices) + pr_err("only specify devices once (use a comma separated list). %s ignored\n", + w); + else + mis.devices = xstrdup(w+8); + } else if (strncasecmp(w, "spare-group=", 12) == 0 ) { + if (mis.spare_group) + pr_err("only specify one spare group per array. %s ignored.\n", + w); + else + mis.spare_group = xstrdup(w+12); + } else if (strncasecmp(w, "level=", 6) == 0 ) { + /* this is mainly for compatability with --brief output */ + mis.level = map_name(pers, w+6); + } else if (strncasecmp(w, "disks=", 6) == 0 ) { + /* again, for compat */ + mis.raid_disks = atoi(w+6); + } else if (strncasecmp(w, "num-devices=", 12) == 0 ) { + /* again, for compat */ + mis.raid_disks = atoi(w+12); + } else if (strncasecmp(w, "spares=", 7) == 0 ) { + /* for warning if not all spares present */ + mis.spare_disks = atoi(w+7); + } else if (strncasecmp(w, "metadata=", 9) == 0) { + /* style of metadata on the devices. */ + int i; + + for(i=0; superlist[i] && !mis.st; i++) + mis.st = superlist[i]->match_metadata_desc(w+9); + + if (!mis.st) + pr_err("metadata format %s unknown, ignored.\n", w+9); + } else if (strncasecmp(w, "auto=", 5) == 0 ) { + /* whether to create device special files as needed */ + mis.autof = parse_auto(w+5, "auto type", 0); + } else if (strncasecmp(w, "member=", 7) == 0) { + /* subarray within a container */ + mis.member = xstrdup(w+7); + } else if (strncasecmp(w, "container=", 10) == 0) { + /* the container holding this subarray. Either a device name + * or a uuid */ + mis.container = xstrdup(w+10); + } else { + pr_err("unrecognised word on ARRAY line: %s\n", + w); + } + } + if (mis.uuid_set == 0 && mis.devices == NULL && + mis.super_minor == UnSet && mis.name[0] == 0 && + (mis.container == NULL || mis.member == NULL)) + pr_err("ARRAY line %s has no identity information.\n", mis.devname); + else { + mi = xmalloc(sizeof(*mi)); + *mi = mis; + mi->devname = mis.devname ? xstrdup(mis.devname) : NULL; + mi->next = NULL; + *mddevlp = mi; + mddevlp = &mi->next; + } +} + +static char *alert_email = NULL; +void mailline(char *line) +{ + char *w; + + for (w=dl_next(line); w != line ; w=dl_next(w)) + if (alert_email == NULL) + alert_email = xstrdup(w); +} + +static char *alert_mail_from = NULL; +void mailfromline(char *line) +{ + char *w; + + for (w=dl_next(line); w != line ; w=dl_next(w)) { + if (alert_mail_from == NULL) + alert_mail_from = xstrdup(w); + else { + char *t = NULL; + + if (xasprintf(&t, "%s %s", alert_mail_from, w) > 0) { + free(alert_mail_from); + alert_mail_from = t; + } + } + } +} + +static char *alert_program = NULL; +void programline(char *line) +{ + char *w; + + for (w=dl_next(line); w != line ; w=dl_next(w)) + if (alert_program == NULL) + alert_program = xstrdup(w); +} + +static char *home_host = NULL; +static int require_homehost = 1; +void homehostline(char *line) +{ + char *w; + + for (w=dl_next(line); w != line ; w=dl_next(w)) { + if (strcasecmp(w, "<ignore>")==0) + require_homehost = 0; + else if (home_host == NULL) { + if (strcasecmp(w, "<none>")==0) + home_host = xstrdup(""); + else + home_host = xstrdup(w); + } + } +} + +char auto_yes[] = "yes"; +char auto_no[] = "no"; +char auto_homehost[] = "homehost"; + +static int auto_seen = 0; +void autoline(char *line) +{ + char *w; + char *seen; + int super_cnt; + char *dflt = auto_yes; + int homehost = 0; + int i; + + if (auto_seen) + return; + auto_seen = 1; + + /* Parse the 'auto' line creating policy statements for the 'auto' policy. + * + * The default is 'yes' but the 'auto' line might over-ride that. + * Words in the line are processed in order with the first + * match winning. + * word can be: + * +version - that version can be assembled + * -version - that version cannot be auto-assembled + * yes or +all - any other version can be assembled + * no or -all - no other version can be assembled. + * homehost - any array associated by 'homehost' to this + * host can be assembled. + * + * Thus: + * +ddf -0.90 homehost -all + * will auto-assemble any ddf array, no 0.90 array, and + * any other array (imsm, 1.x) if and only if it is identified + * as belonging to this host. + * + * We translate that to policy by creating 'auto=yes' when we see + * a '+version' line, 'auto=no' if we see '-version' before 'homehost', + * or 'auto=homehost' if we see '-version' after 'homehost'. + * When we see yes, no, +all or -all we stop and any version that hasn't + * been seen gets an appropriate auto= entry. + */ + + /* If environment variable MDADM_CONF_AUTO is defined, then + * it is prepended to the auto line. This allow a script + * to easily disable some metadata types. + */ + w = getenv("MDADM_CONF_AUTO"); + if (w && *w) { + char *l = xstrdup(w); + char *head = line; + w = strtok(l, " \t"); + while (w) { + char *nw = dl_strdup(w); + dl_insert(head, nw); + head = nw; + w = strtok(NULL, " \t"); + } + free(l); + } + + for (super_cnt = 0; superlist[super_cnt]; super_cnt++) + ; + seen = xcalloc(super_cnt, 1); + + for (w = dl_next(line); w != line ; w = dl_next(w)) { + char *val; + + if (strcasecmp(w, "yes") == 0) { + dflt = auto_yes; + break; + } + if (strcasecmp(w, "no") == 0) { + if (homehost) + dflt = auto_homehost; + else + dflt = auto_no; + break; + } + if (strcasecmp(w, "homehost") == 0) { + homehost = 1; + continue; + } + if (w[0] == '+') + val = auto_yes; + else if (w[0] == '-') { + if (homehost) + val = auto_homehost; + else + val = auto_no; + } else + continue; + + if (strcasecmp(w+1, "all") == 0) { + dflt = val; + break; + } + for (i = 0; superlist[i]; i++) { + const char *version = superlist[i]->name; + if (strcasecmp(w+1, version) == 0) + break; + /* 1 matches 1.x, 0 matches 0.90 */ + if (version[1] == '.' && + strlen(w+1) == 1 && + w[1] == version[0]) + break; + /* 1.anything matches 1.x */ + if (strcmp(version, "1.x") == 0 && + strncmp(w+1, "1.", 2) == 0) + break; + } + if (superlist[i] == NULL) + /* ignore this word */ + continue; + if (seen[i]) + /* already know about this metadata */ + continue; + policy_add(rule_policy, pol_auto, val, pol_metadata, superlist[i]->name, NULL); + seen[i] = 1; + } + for (i = 0; i < super_cnt; i++) + if (!seen[i]) + policy_add(rule_policy, pol_auto, dflt, pol_metadata, superlist[i]->name, NULL); + + free(seen); +} + +int loaded = 0; + +static char *conffile = NULL; +void set_conffile(char *file) +{ + conffile = file; +} + +void conf_file(FILE *f) +{ + char *line; + while ((line=conf_line(f))) { + switch(match_keyword(line)) { + case Devices: + devline(line); + break; + case Array: + arrayline(line); + break; + case Mailaddr: + mailline(line); + break; + case Mailfrom: + mailfromline(line); + break; + case Program: + programline(line); + break; + case CreateDev: + createline(line); + break; + case Homehost: + homehostline(line); + break; + case AutoMode: + autoline(line); + break; + case Policy: + policyline(line, rule_policy); + break; + case PartPolicy: + policyline(line, rule_part); + break; + default: + pr_err("Unknown keyword %s\n", line); + } + free_line(line); + } +} + +struct fname { + struct fname *next; + char name[]; +}; + +void conf_file_or_dir(FILE *f) +{ + struct stat st; + DIR *dir; + struct dirent *dp; + struct fname *list = NULL; + + fstat(fileno(f), &st); + if (S_ISREG(st.st_mode)) + conf_file(f); + else if (!S_ISDIR(st.st_mode)) + return; +#if _XOPEN_SOURCE >= 700 || _POSIX_C_SOURCE >= 200809L + dir = fdopendir(fileno(f)); + if (!dir) + return; + while ((dp = readdir(dir)) != NULL) { + int l; + struct fname *fn, **p; + if (dp->d_ino == 0) + continue; + if (dp->d_name[0] == '.') + continue; + l = strlen(dp->d_name); + if (l < 6 || strcmp(dp->d_name+l-5, ".conf") != 0) + continue; + fn = xmalloc(sizeof(*fn)+l+1); + strcpy(fn->name, dp->d_name); + for (p = &list; + *p && strcmp((*p)->name, fn->name) < 0; + p = & (*p)->next) + ; + fn->next = *p; + *p = fn; + } + while (list) { + int fd; + FILE *f2; + struct fname *fn = list; + list = list->next; + fd = openat(fileno(f), fn->name, O_RDONLY); + free(fn); + if (fd < 0) + continue; + f2 = fdopen(fd, "r"); + if (!f2) { + close(fd); + continue; + } + conf_file(f2); + fclose(f2); + } + closedir(dir); +#endif +} + +void load_conffile(void) +{ + FILE *f; + char *confdir = NULL; + char *head; + + if (loaded) + return; + if (conffile == NULL) { + conffile = DefaultConfFile; + confdir = DefaultConfDir; + } + + if (strcmp(conffile, "partitions")==0) { + char *list = dl_strdup("DEV"); + dl_init(list); + dl_add(list, dl_strdup("partitions")); + devline(list); + free_line(list); + } else if (strcmp(conffile, "none") != 0) { + f = fopen(conffile, "r"); + /* Debian chose to relocate mdadm.conf into /etc/mdadm/. + * To allow Debian users to compile from clean source and still + * have a working mdadm, we read /etc/mdadm/mdadm.conf + * if /etc/mdadm.conf doesn't exist + */ + if (f == NULL && + conffile == DefaultConfFile) { + f = fopen(DefaultAltConfFile, "r"); + if (f) { + conffile = DefaultAltConfFile; + confdir = DefaultAltConfDir; + } + } + if (f) { + conf_file_or_dir(f); + fclose(f); + } + if (confdir) { + f = fopen(confdir, "r"); + if (f) { + conf_file_or_dir(f); + fclose(f); + } + } + } + /* If there was no AUTO line, process an empty line + * now so that the MDADM_CONF_AUTO env var gets processed. + */ + head = dl_strdup("AUTO"); + dl_init(head); + autoline(head); + free_line(head); + + loaded = 1; +} + +char *conf_get_mailaddr(void) +{ + load_conffile(); + return alert_email; +} + +char *conf_get_mailfrom(void) +{ + load_conffile(); + return alert_mail_from; +} + +char *conf_get_program(void) +{ + load_conffile(); + return alert_program; +} + +char *conf_get_homehost(int *require_homehostp) +{ + load_conffile(); + if (require_homehostp) + *require_homehostp = require_homehost; + return home_host; +} + +struct createinfo *conf_get_create_info(void) +{ + load_conffile(); + return &createinfo; +} + +struct mddev_ident *conf_get_ident(char *dev) +{ + struct mddev_ident *rv; + load_conffile(); + rv = mddevlist; + while (dev && rv && (rv->devname == NULL + || !devname_matches(dev, rv->devname))) + rv = rv->next; + return rv; +} + +static void append_dlist(struct mddev_dev **dlp, struct mddev_dev *list) +{ + while (*dlp) + dlp = &(*dlp)->next; + *dlp = list; +} + +struct mddev_dev *conf_get_devs() +{ + glob_t globbuf; + struct conf_dev *cd; + int flags = 0; + static struct mddev_dev *dlist = NULL; + unsigned int i; + + while (dlist) { + struct mddev_dev *t = dlist; + dlist = dlist->next; + free(t->devname); + free(t); + } + + load_conffile(); + + if (cdevlist == NULL) { + /* default to 'partitions' and 'containers' */ + dlist = load_partitions(); + append_dlist(&dlist, load_containers()); + } + + for (cd=cdevlist; cd; cd=cd->next) { + if (strcasecmp(cd->name, "partitions")==0) + append_dlist(&dlist, load_partitions()); + else if (strcasecmp(cd->name, "containers")==0) + append_dlist(&dlist, load_containers()); + else { + glob(cd->name, flags, NULL, &globbuf); + flags |= GLOB_APPEND; + } + } + if (flags & GLOB_APPEND) { + for (i=0; i<globbuf.gl_pathc; i++) { + struct mddev_dev *t = xmalloc(sizeof(*t)); + memset(t, 0, sizeof(*t)); + t->devname = xstrdup(globbuf.gl_pathv[i]); + t->next = dlist; + dlist = t; +/* printf("one dev is %s\n", t->devname);*/ + } + globfree(&globbuf); + } + + return dlist; +} + +int conf_test_dev(char *devname) +{ + struct conf_dev *cd; + if (cdevlist == NULL) + /* allow anything by default */ + return 1; + for (cd = cdevlist ; cd ; cd = cd->next) { + if (strcasecmp(cd->name, "partitions") == 0) + return 1; + if (fnmatch(cd->name, devname, FNM_PATHNAME) == 0) + return 1; + } + return 0; +} + +int conf_test_metadata(const char *version, struct dev_policy *pol, int is_homehost) +{ + /* If anyone said 'yes', that sticks. + * else if homehost applies, use that + * else if there is a 'no', say 'no'. + * else 'yes'. + */ + struct dev_policy *p; + int no=0, found_homehost=0; + load_conffile(); + + pol = pol_find(pol, pol_auto); + pol_for_each(p, pol, version) { + if (strcmp(p->value, "yes") == 0) + return 1; + if (strcmp(p->value, "homehost") == 0) + found_homehost = 1; + if (strcmp(p->value, "no") == 0) + no = 1; + } + if (is_homehost && found_homehost) + return 1; + if (no) + return 0; + return 1; +} + +int match_oneof(char *devices, char *devname) +{ + /* check if one of the comma separated patterns in devices + * matches devname + */ + + while (devices && *devices) { + char patn[1024]; + char *p = devices; + devices = strchr(devices, ','); + if (!devices) + devices = p + strlen(p); + if (devices-p < 1024) { + strncpy(patn, p, devices-p); + patn[devices-p] = 0; + if (fnmatch(patn, devname, FNM_PATHNAME)==0) + return 1; + } + if (*devices == ',') + devices++; + } + return 0; +} + +int devname_matches(char *name, char *match) +{ + /* See if the given array name matches the + * given match from config file. + * + * First strip and /dev/md/ or /dev/, then + * see if there might be a numeric match of + * mdNN with NN + * then just strcmp + */ + if (strncmp(name, "/dev/md/", 8) == 0) + name += 8; + else if (strncmp(name, "/dev/", 5) == 0) + name += 5; + + if (strncmp(match, "/dev/md/", 8) == 0) + match += 8; + else if (strncmp(match, "/dev/", 5) == 0) + match += 5; + + if (strncmp(name, "md", 2) == 0 && + isdigit(name[2])) + name += 2; + if (strncmp(match, "md", 2) == 0 && + isdigit(match[2])) + match += 2; + + return (strcmp(name, match) == 0); +} + +int conf_name_is_free(char *name) +{ + /* Check if this name is already taken by an ARRAY entry in + * the config file. + * It can be taken either by a match on devname, name, or + * even super-minor. + */ + struct mddev_ident *dev; + + load_conffile(); + for (dev = mddevlist; dev; dev = dev->next) { + char nbuf[100]; + if (dev->devname && devname_matches(name, dev->devname)) + return 0; + if (dev->name[0] && devname_matches(name, dev->name)) + return 0; + sprintf(nbuf, "%d", dev->super_minor); + if (dev->super_minor != UnSet && + devname_matches(name, nbuf)) + return 0; + } + return 1; +} + +struct mddev_ident *conf_match(struct supertype *st, + struct mdinfo *info, + char *devname, + int verbose, int *rvp) +{ + struct mddev_ident *array_list, *match; + array_list = conf_get_ident(NULL); + match = NULL; + for (; array_list; array_list = array_list->next) { + if (array_list->uuid_set && + same_uuid(array_list->uuid, info->uuid, st->ss->swapuuid) + == 0) { + if (verbose >= 2 && array_list->devname) + pr_err("UUID differs from %s.\n", + array_list->devname); + continue; + } + if (array_list->name[0] && + strcasecmp(array_list->name, info->name) != 0) { + if (verbose >= 2 && array_list->devname) + pr_err("Name differs from %s.\n", + array_list->devname); + continue; + } + if (array_list->devices && devname && + !match_oneof(array_list->devices, devname)) { + if (verbose >= 2 && array_list->devname) + pr_err("Not a listed device for %s.\n", + array_list->devname); + continue; + } + if (array_list->super_minor != UnSet && + array_list->super_minor != info->array.md_minor) { + if (verbose >= 2 && array_list->devname) + pr_err("Different super-minor to %s.\n", + array_list->devname); + continue; + } + if (!array_list->uuid_set && + !array_list->name[0] && + !array_list->devices && + array_list->super_minor == UnSet) { + if (verbose >= 2 && array_list->devname) + pr_err("%s doesn't have any identifying information.\n", + array_list->devname); + continue; + } + /* FIXME, should I check raid_disks and level too?? */ + + if (match) { + if (verbose >= 0) { + if (match->devname && array_list->devname) + pr_err("we match both %s and %s - cannot decide which to use.\n", + match->devname, + array_list->devname); + else + pr_err("multiple lines in mdadm.conf match\n"); + } + if (rvp) + *rvp = 2; + match = NULL; + break; + } + match = array_list; + } + return match; +} + +int conf_verify_devnames(struct mddev_ident *array_list) +{ + struct mddev_ident *a1, *a2; + + for (a1 = array_list; a1; a1 = a1->next) { + if (!a1->devname) + continue; + if (strcmp(a1->devname, "<ignore>") == 0) + continue; + for (a2 = a1->next; a2; a2 = a2->next) { + if (!a2->devname) + continue; + if (strcmp(a1->devname, a2->devname) != 0) + continue; + + if (a1->uuid_set && a2->uuid_set) { + char nbuf[64]; + __fname_from_uuid(a1->uuid, 0, nbuf, ':'); + pr_err("Devices %s and ", + nbuf); + __fname_from_uuid(a2->uuid, 0, nbuf, ':'); + fprintf(stderr, + "%s have the same name: %s\n", + nbuf, a1->devname); + } else + pr_err("Device %s given twice in config file\n", a1->devname); + return 1; + } + } + + return 0; +} diff --git a/crc32.c b/crc32.c new file mode 100644 index 00000000..94fda06a --- /dev/null +++ b/crc32.c @@ -0,0 +1,360 @@ +/* crc32.c -- compute the CRC-32 of a data stream + * Copyright (C) 1995-2003 Mark Adler + * For conditions of distribution and use, see copyright notice in zlib.h + * + * Note: zlib license from from zlib.h added explicitly as mdadm does + * not include zlib.h. License from v1.2.2 of zlib: + * + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the authors be held liable for any damages + * arising from the use of this software. + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgment in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source distribution. + * + * + * Thanks to Rodney Brown <rbrown64@csc.com.au> for his contribution of faster + * CRC methods: exclusive-oring 32 bits of data at a time, and pre-computing + * tables for updating the shift register in one step with three exclusive-ors + * instead of four steps with four exclusive-ors. This results about a factor + * of two increase in speed on a Power PC G4 (PPC7455) using gcc -O3. + */ + +/* @(#) $Id$ */ + +/* + Note on the use of DYNAMIC_CRC_TABLE: there is no mutex or semaphore + protection on the static variables used to control the first-use generation + of the crc tables. Therefore, if you #define DYNAMIC_CRC_TABLE, you should + first call get_crc_table() to initialize the tables before allowing more than + one thread to use crc32(). + */ + +#ifdef MAKECRCH +# include <stdio.h> +# ifndef DYNAMIC_CRC_TABLE +# define DYNAMIC_CRC_TABLE +# endif /* !DYNAMIC_CRC_TABLE */ +#endif /* MAKECRCH */ + +/* #include "zutil.h" / * for STDC and FAR definitions */ +#define STDC +#define FAR +#define Z_NULL ((void*)0) +#define OF(X) X +#define ZEXPORT +typedef long ptrdiff_t; +#define NOBYFOUR + +#define local static + +/* Find a four-byte integer type for crc32_little() and crc32_big(). */ +#ifndef NOBYFOUR +# ifdef STDC /* need ANSI C limits.h to determine sizes */ +# include <limits.h> +# define BYFOUR +# if (UINT_MAX == 0xffffffffUL) + typedef unsigned int u4; +# else +# if (ULONG_MAX == 0xffffffffUL) + typedef unsigned long u4; +# else +# if (USHRT_MAX == 0xffffffffUL) + typedef unsigned short u4; +# else +# undef BYFOUR /* can't find a four-byte integer type! */ +# endif +# endif +# endif +# endif /* STDC */ +#endif /* !NOBYFOUR */ + +/* Definitions for doing the crc four data bytes at a time. */ +#ifdef BYFOUR +# define REV(w) (((w)>>24)+(((w)>>8)&0xff00)+ \ + (((w)&0xff00)<<8)+(((w)&0xff)<<24)) + local unsigned long crc32_little OF((unsigned long, + const unsigned char FAR *, unsigned)); + local unsigned long crc32_big OF((unsigned long, + const unsigned char FAR *, unsigned)); +# define TBLS 8 +#else +# define TBLS 1 +#endif /* BYFOUR */ + +#ifdef DYNAMIC_CRC_TABLE + +local volatile int crc_table_empty = 1; +local unsigned long FAR crc_table[TBLS][256]; +local void make_crc_table OF((void)); +#ifdef MAKECRCH + local void write_table OF((FILE *, const unsigned long FAR *)); +#endif /* MAKECRCH */ + +/* + Generate tables for a byte-wise 32-bit CRC calculation on the polynomial: + x^32+x^26+x^23+x^22+x^16+x^12+x^11+x^10+x^8+x^7+x^5+x^4+x^2+x+1. + + Polynomials over GF(2) are represented in binary, one bit per coefficient, + with the lowest powers in the most significant bit. Then adding polynomials + is just exclusive-or, and multiplying a polynomial by x is a right shift by + one. If we call the above polynomial p, and represent a byte as the + polynomial q, also with the lowest power in the most significant bit (so the + byte 0xb1 is the polynomial x^7+x^3+x+1), then the CRC is (q*x^32) mod p, + where a mod b means the remainder after dividing a by b. + + This calculation is done using the shift-register method of multiplying and + taking the remainder. The register is initialized to zero, and for each + incoming bit, x^32 is added mod p to the register if the bit is a one (where + x^32 mod p is p+x^32 = x^26+...+1), and the register is multiplied mod p by + x (which is shifting right by one and adding x^32 mod p if the bit shifted + out is a one). We start with the highest power (least significant bit) of + q and repeat for all eight bits of q. + + The first table is simply the CRC of all possible eight bit values. This is + all the information needed to generate CRCs on data a byte at a time for all + combinations of CRC register values and incoming bytes. The remaining tables + allow for word-at-a-time CRC calculation for both big-endian and little- + endian machines, where a word is four bytes. +*/ +local void make_crc_table() +{ + unsigned long c; + int n, k; + unsigned long poly; /* polynomial exclusive-or pattern */ + /* terms of polynomial defining this crc (except x^32): */ + static volatile int first = 1; /* flag to limit concurrent making */ + static const unsigned char p[] = {0,1,2,4,5,7,8,10,11,12,16,22,23,26}; + + /* See if another task is already doing this (not thread-safe, but better + than nothing -- significantly reduces duration of vulnerability in + case the advice about DYNAMIC_CRC_TABLE is ignored) */ + if (first) { + first = 0; + + /* make exclusive-or pattern from polynomial (0xedb88320UL) */ + poly = 0UL; + for (n = 0; n < sizeof(p)/sizeof(unsigned char); n++) + poly |= 1UL << (31 - p[n]); + + /* generate a crc for every 8-bit value */ + for (n = 0; n < 256; n++) { + c = (unsigned long)n; + for (k = 0; k < 8; k++) + c = c & 1 ? poly ^ (c >> 1) : c >> 1; + crc_table[0][n] = c; + } + +#ifdef BYFOUR + /* generate crc for each value followed by one, two, and three zeros, + and then the byte reversal of those as well as the first table */ + for (n = 0; n < 256; n++) { + c = crc_table[0][n]; + crc_table[4][n] = REV(c); + for (k = 1; k < 4; k++) { + c = crc_table[0][c & 0xff] ^ (c >> 8); + crc_table[k][n] = c; + crc_table[k + 4][n] = REV(c); + } + } +#endif /* BYFOUR */ + + crc_table_empty = 0; + } + else { /* not first */ + /* wait for the other guy to finish (not efficient, but rare) */ + while (crc_table_empty) + ; + } + +#ifdef MAKECRCH + /* write out CRC tables to crc32.h */ + { + FILE *out; + + out = fopen("crc32.h", "w"); + if (out == NULL) return; + fprintf(out, "/* crc32.h -- tables for rapid CRC calculation\n"); + fprintf(out, " * Generated automatically by crc32.c\n */\n\n"); + fprintf(out, "local const unsigned long FAR "); + fprintf(out, "crc_table[TBLS][256] =\n{\n {\n"); + write_table(out, crc_table[0]); +# ifdef BYFOUR + fprintf(out, "#ifdef BYFOUR\n"); + for (k = 1; k < 8; k++) { + fprintf(out, " },\n {\n"); + write_table(out, crc_table[k]); + } + fprintf(out, "#endif\n"); +# endif /* BYFOUR */ + fprintf(out, " }\n};\n"); + fclose(out); + } +#endif /* MAKECRCH */ +} + +#ifdef MAKECRCH +local void write_table(out, table) + FILE *out; + const unsigned long FAR *table; +{ + int n; + + for (n = 0; n < 256; n++) + fprintf(out, "%s0x%08lxUL%s", n % 5 ? "" : " ", table[n], + n == 255 ? "\n" : (n % 5 == 4 ? ",\n" : ", ")); +} +#endif /* MAKECRCH */ + +#else /* !DYNAMIC_CRC_TABLE */ +/* ======================================================================== + * Tables of CRC-32s of all single-byte values, made by make_crc_table(). + */ +#include "crc32.h" +#endif /* DYNAMIC_CRC_TABLE */ + +/* ========================================================================= + * This function can be used by asm versions of crc32() + */ +const unsigned long FAR * ZEXPORT get_crc_table(void) +{ +#ifdef DYNAMIC_CRC_TABLE + if (crc_table_empty) + make_crc_table(); +#endif /* DYNAMIC_CRC_TABLE */ + return (const unsigned long FAR *)crc_table; +} + +/* ========================================================================= */ +#define DO1 crc = crc_table[0][((int)crc ^ (*buf++)) & 0xff] ^ (crc >> 8) +#define DO8 DO1; DO1; DO1; DO1; DO1; DO1; DO1; DO1 + +/* ========================================================================= */ +unsigned long ZEXPORT crc32( + unsigned long crc, + const unsigned char FAR *buf, + unsigned len) +{ + if (buf == Z_NULL) return 0UL; + +#ifdef DYNAMIC_CRC_TABLE + if (crc_table_empty) + make_crc_table(); +#endif /* DYNAMIC_CRC_TABLE */ + +#ifdef BYFOUR + if (sizeof(void *) == sizeof(ptrdiff_t)) { + u4 endian; + + endian = 1; + if (*((unsigned char *)(&endian))) + return crc32_little(crc, buf, len); + else + return crc32_big(crc, buf, len); + } +#endif /* BYFOUR */ +/* crc = crc ^ 0xffffffffUL;*/ + while (len >= 8) { + DO8; + len -= 8; + } + if (len) do { + DO1; + } while (--len); + return crc /* ^ 0xffffffffUL*/; +} + +#ifdef BYFOUR + +/* ========================================================================= */ +#define DOLIT4 c ^= *buf4++; \ + c = crc_table[3][c & 0xff] ^ crc_table[2][(c >> 8) & 0xff] ^ \ + crc_table[1][(c >> 16) & 0xff] ^ crc_table[0][c >> 24] +#define DOLIT32 DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4 + +/* ========================================================================= */ +local unsigned long crc32_little(crc, buf, len) + unsigned long crc; + const unsigned char FAR *buf; + unsigned len; +{ + register u4 c; + register const u4 FAR *buf4; + + c = (u4)crc; + c = ~c; + while (len && ((ptrdiff_t)buf & 3)) { + c = crc_table[0][(c ^ *buf++) & 0xff] ^ (c >> 8); + len--; + } + + buf4 = (const u4 FAR *)buf; + while (len >= 32) { + DOLIT32; + len -= 32; + } + while (len >= 4) { + DOLIT4; + len -= 4; + } + buf = (const unsigned char FAR *)buf4; + + if (len) do { + c = crc_table[0][(c ^ *buf++) & 0xff] ^ (c >> 8); + } while (--len); + c = ~c; + return (unsigned long)c; +} + +/* ========================================================================= */ +#define DOBIG4 c ^= *++buf4; \ + c = crc_table[4][c & 0xff] ^ crc_table[5][(c >> 8) & 0xff] ^ \ + crc_table[6][(c >> 16) & 0xff] ^ crc_table[7][c >> 24] +#define DOBIG32 DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4 + +/* ========================================================================= */ +local unsigned long crc32_big(crc, buf, len) + unsigned long crc; + const unsigned char FAR *buf; + unsigned len; +{ + register u4 c; + register const u4 FAR *buf4; + + c = REV((u4)crc); + c = ~c; + while (len && ((ptrdiff_t)buf & 3)) { + c = crc_table[4][(c >> 24) ^ *buf++] ^ (c << 8); + len--; + } + + buf4 = (const u4 FAR *)buf; + buf4--; + while (len >= 32) { + DOBIG32; + len -= 32; + } + while (len >= 4) { + DOBIG4; + len -= 4; + } + buf4++; + buf = (const unsigned char FAR *)buf4; + + if (len) do { + c = crc_table[4][(c >> 24) ^ *buf++] ^ (c << 8); + } while (--len); + c = ~c; + return (unsigned long)(REV(c)); +} + +#endif /* BYFOUR */ diff --git a/crc32.h b/crc32.h new file mode 100644 index 00000000..8053b611 --- /dev/null +++ b/crc32.h @@ -0,0 +1,441 @@ +/* crc32.h -- tables for rapid CRC calculation + * Generated automatically by crc32.c + */ + +local const unsigned long FAR crc_table[TBLS][256] = +{ + { + 0x00000000UL, 0x77073096UL, 0xee0e612cUL, 0x990951baUL, 0x076dc419UL, + 0x706af48fUL, 0xe963a535UL, 0x9e6495a3UL, 0x0edb8832UL, 0x79dcb8a4UL, + 0xe0d5e91eUL, 0x97d2d988UL, 0x09b64c2bUL, 0x7eb17cbdUL, 0xe7b82d07UL, + 0x90bf1d91UL, 0x1db71064UL, 0x6ab020f2UL, 0xf3b97148UL, 0x84be41deUL, + 0x1adad47dUL, 0x6ddde4ebUL, 0xf4d4b551UL, 0x83d385c7UL, 0x136c9856UL, + 0x646ba8c0UL, 0xfd62f97aUL, 0x8a65c9ecUL, 0x14015c4fUL, 0x63066cd9UL, + 0xfa0f3d63UL, 0x8d080df5UL, 0x3b6e20c8UL, 0x4c69105eUL, 0xd56041e4UL, + 0xa2677172UL, 0x3c03e4d1UL, 0x4b04d447UL, 0xd20d85fdUL, 0xa50ab56bUL, + 0x35b5a8faUL, 0x42b2986cUL, 0xdbbbc9d6UL, 0xacbcf940UL, 0x32d86ce3UL, + 0x45df5c75UL, 0xdcd60dcfUL, 0xabd13d59UL, 0x26d930acUL, 0x51de003aUL, + 0xc8d75180UL, 0xbfd06116UL, 0x21b4f4b5UL, 0x56b3c423UL, 0xcfba9599UL, + 0xb8bda50fUL, 0x2802b89eUL, 0x5f058808UL, 0xc60cd9b2UL, 0xb10be924UL, + 0x2f6f7c87UL, 0x58684c11UL, 0xc1611dabUL, 0xb6662d3dUL, 0x76dc4190UL, + 0x01db7106UL, 0x98d220bcUL, 0xefd5102aUL, 0x71b18589UL, 0x06b6b51fUL, + 0x9fbfe4a5UL, 0xe8b8d433UL, 0x7807c9a2UL, 0x0f00f934UL, 0x9609a88eUL, + 0xe10e9818UL, 0x7f6a0dbbUL, 0x086d3d2dUL, 0x91646c97UL, 0xe6635c01UL, + 0x6b6b51f4UL, 0x1c6c6162UL, 0x856530d8UL, 0xf262004eUL, 0x6c0695edUL, + 0x1b01a57bUL, 0x8208f4c1UL, 0xf50fc457UL, 0x65b0d9c6UL, 0x12b7e950UL, + 0x8bbeb8eaUL, 0xfcb9887cUL, 0x62dd1ddfUL, 0x15da2d49UL, 0x8cd37cf3UL, + 0xfbd44c65UL, 0x4db26158UL, 0x3ab551ceUL, 0xa3bc0074UL, 0xd4bb30e2UL, + 0x4adfa541UL, 0x3dd895d7UL, 0xa4d1c46dUL, 0xd3d6f4fbUL, 0x4369e96aUL, + 0x346ed9fcUL, 0xad678846UL, 0xda60b8d0UL, 0x44042d73UL, 0x33031de5UL, + 0xaa0a4c5fUL, 0xdd0d7cc9UL, 0x5005713cUL, 0x270241aaUL, 0xbe0b1010UL, + 0xc90c2086UL, 0x5768b525UL, 0x206f85b3UL, 0xb966d409UL, 0xce61e49fUL, + 0x5edef90eUL, 0x29d9c998UL, 0xb0d09822UL, 0xc7d7a8b4UL, 0x59b33d17UL, + 0x2eb40d81UL, 0xb7bd5c3bUL, 0xc0ba6cadUL, 0xedb88320UL, 0x9abfb3b6UL, + 0x03b6e20cUL, 0x74b1d29aUL, 0xead54739UL, 0x9dd277afUL, 0x04db2615UL, + 0x73dc1683UL, 0xe3630b12UL, 0x94643b84UL, 0x0d6d6a3eUL, 0x7a6a5aa8UL, + 0xe40ecf0bUL, 0x9309ff9dUL, 0x0a00ae27UL, 0x7d079eb1UL, 0xf00f9344UL, + 0x8708a3d2UL, 0x1e01f268UL, 0x6906c2feUL, 0xf762575dUL, 0x806567cbUL, + 0x196c3671UL, 0x6e6b06e7UL, 0xfed41b76UL, 0x89d32be0UL, 0x10da7a5aUL, + 0x67dd4accUL, 0xf9b9df6fUL, 0x8ebeeff9UL, 0x17b7be43UL, 0x60b08ed5UL, + 0xd6d6a3e8UL, 0xa1d1937eUL, 0x38d8c2c4UL, 0x4fdff252UL, 0xd1bb67f1UL, + 0xa6bc5767UL, 0x3fb506ddUL, 0x48b2364bUL, 0xd80d2bdaUL, 0xaf0a1b4cUL, + 0x36034af6UL, 0x41047a60UL, 0xdf60efc3UL, 0xa867df55UL, 0x316e8eefUL, + 0x4669be79UL, 0xcb61b38cUL, 0xbc66831aUL, 0x256fd2a0UL, 0x5268e236UL, + 0xcc0c7795UL, 0xbb0b4703UL, 0x220216b9UL, 0x5505262fUL, 0xc5ba3bbeUL, + 0xb2bd0b28UL, 0x2bb45a92UL, 0x5cb36a04UL, 0xc2d7ffa7UL, 0xb5d0cf31UL, + 0x2cd99e8bUL, 0x5bdeae1dUL, 0x9b64c2b0UL, 0xec63f226UL, 0x756aa39cUL, + 0x026d930aUL, 0x9c0906a9UL, 0xeb0e363fUL, 0x72076785UL, 0x05005713UL, + 0x95bf4a82UL, 0xe2b87a14UL, 0x7bb12baeUL, 0x0cb61b38UL, 0x92d28e9bUL, + 0xe5d5be0dUL, 0x7cdcefb7UL, 0x0bdbdf21UL, 0x86d3d2d4UL, 0xf1d4e242UL, + 0x68ddb3f8UL, 0x1fda836eUL, 0x81be16cdUL, 0xf6b9265bUL, 0x6fb077e1UL, + 0x18b74777UL, 0x88085ae6UL, 0xff0f6a70UL, 0x66063bcaUL, 0x11010b5cUL, + 0x8f659effUL, 0xf862ae69UL, 0x616bffd3UL, 0x166ccf45UL, 0xa00ae278UL, + 0xd70dd2eeUL, 0x4e048354UL, 0x3903b3c2UL, 0xa7672661UL, 0xd06016f7UL, + 0x4969474dUL, 0x3e6e77dbUL, 0xaed16a4aUL, 0xd9d65adcUL, 0x40df0b66UL, + 0x37d83bf0UL, 0xa9bcae53UL, 0xdebb9ec5UL, 0x47b2cf7fUL, 0x30b5ffe9UL, + 0xbdbdf21cUL, 0xcabac28aUL, 0x53b39330UL, 0x24b4a3a6UL, 0xbad03605UL, + 0xcdd70693UL, 0x54de5729UL, 0x23d967bfUL, 0xb3667a2eUL, 0xc4614ab8UL, + 0x5d681b02UL, 0x2a6f2b94UL, 0xb40bbe37UL, 0xc30c8ea1UL, 0x5a05df1bUL, + 0x2d02ef8dUL +#ifdef BYFOUR + }, + { + 0x00000000UL, 0x191b3141UL, 0x32366282UL, 0x2b2d53c3UL, 0x646cc504UL, + 0x7d77f445UL, 0x565aa786UL, 0x4f4196c7UL, 0xc8d98a08UL, 0xd1c2bb49UL, + 0xfaefe88aUL, 0xe3f4d9cbUL, 0xacb54f0cUL, 0xb5ae7e4dUL, 0x9e832d8eUL, + 0x87981ccfUL, 0x4ac21251UL, 0x53d92310UL, 0x78f470d3UL, 0x61ef4192UL, + 0x2eaed755UL, 0x37b5e614UL, 0x1c98b5d7UL, 0x05838496UL, 0x821b9859UL, + 0x9b00a918UL, 0xb02dfadbUL, 0xa936cb9aUL, 0xe6775d5dUL, 0xff6c6c1cUL, + 0xd4413fdfUL, 0xcd5a0e9eUL, 0x958424a2UL, 0x8c9f15e3UL, 0xa7b24620UL, + 0xbea97761UL, 0xf1e8e1a6UL, 0xe8f3d0e7UL, 0xc3de8324UL, 0xdac5b265UL, + 0x5d5daeaaUL, 0x44469febUL, 0x6f6bcc28UL, 0x7670fd69UL, 0x39316baeUL, + 0x202a5aefUL, 0x0b07092cUL, 0x121c386dUL, 0xdf4636f3UL, 0xc65d07b2UL, + 0xed705471UL, 0xf46b6530UL, 0xbb2af3f7UL, 0xa231c2b6UL, 0x891c9175UL, + 0x9007a034UL, 0x179fbcfbUL, 0x0e848dbaUL, 0x25a9de79UL, 0x3cb2ef38UL, + 0x73f379ffUL, 0x6ae848beUL, 0x41c51b7dUL, 0x58de2a3cUL, 0xf0794f05UL, + 0xe9627e44UL, 0xc24f2d87UL, 0xdb541cc6UL, 0x94158a01UL, 0x8d0ebb40UL, + 0xa623e883UL, 0xbf38d9c2UL, 0x38a0c50dUL, 0x21bbf44cUL, 0x0a96a78fUL, + 0x138d96ceUL, 0x5ccc0009UL, 0x45d73148UL, 0x6efa628bUL, 0x77e153caUL, + 0xbabb5d54UL, 0xa3a06c15UL, 0x888d3fd6UL, 0x91960e97UL, 0xded79850UL, + 0xc7cca911UL, 0xece1fad2UL, 0xf5facb93UL, 0x7262d75cUL, 0x6b79e61dUL, + 0x4054b5deUL, 0x594f849fUL, 0x160e1258UL, 0x0f152319UL, 0x243870daUL, + 0x3d23419bUL, 0x65fd6ba7UL, 0x7ce65ae6UL, 0x57cb0925UL, 0x4ed03864UL, + 0x0191aea3UL, 0x188a9fe2UL, 0x33a7cc21UL, 0x2abcfd60UL, 0xad24e1afUL, + 0xb43fd0eeUL, 0x9f12832dUL, 0x8609b26cUL, 0xc94824abUL, 0xd05315eaUL, + 0xfb7e4629UL, 0xe2657768UL, 0x2f3f79f6UL, 0x362448b7UL, 0x1d091b74UL, + 0x04122a35UL, 0x4b53bcf2UL, 0x52488db3UL, 0x7965de70UL, 0x607eef31UL, + 0xe7e6f3feUL, 0xfefdc2bfUL, 0xd5d0917cUL, 0xcccba03dUL, 0x838a36faUL, + 0x9a9107bbUL, 0xb1bc5478UL, 0xa8a76539UL, 0x3b83984bUL, 0x2298a90aUL, + 0x09b5fac9UL, 0x10aecb88UL, 0x5fef5d4fUL, 0x46f46c0eUL, 0x6dd93fcdUL, + 0x74c20e8cUL, 0xf35a1243UL, 0xea412302UL, 0xc16c70c1UL, 0xd8774180UL, + 0x9736d747UL, 0x8e2de606UL, 0xa500b5c5UL, 0xbc1b8484UL, 0x71418a1aUL, + 0x685abb5bUL, 0x4377e898UL, 0x5a6cd9d9UL, 0x152d4f1eUL, 0x0c367e5fUL, + 0x271b2d9cUL, 0x3e001cddUL, 0xb9980012UL, 0xa0833153UL, 0x8bae6290UL, + 0x92b553d1UL, 0xddf4c516UL, 0xc4eff457UL, 0xefc2a794UL, 0xf6d996d5UL, + 0xae07bce9UL, 0xb71c8da8UL, 0x9c31de6bUL, 0x852aef2aUL, 0xca6b79edUL, + 0xd37048acUL, 0xf85d1b6fUL, 0xe1462a2eUL, 0x66de36e1UL, 0x7fc507a0UL, + 0x54e85463UL, 0x4df36522UL, 0x02b2f3e5UL, 0x1ba9c2a4UL, 0x30849167UL, + 0x299fa026UL, 0xe4c5aeb8UL, 0xfdde9ff9UL, 0xd6f3cc3aUL, 0xcfe8fd7bUL, + 0x80a96bbcUL, 0x99b25afdUL, 0xb29f093eUL, 0xab84387fUL, 0x2c1c24b0UL, + 0x350715f1UL, 0x1e2a4632UL, 0x07317773UL, 0x4870e1b4UL, 0x516bd0f5UL, + 0x7a468336UL, 0x635db277UL, 0xcbfad74eUL, 0xd2e1e60fUL, 0xf9ccb5ccUL, + 0xe0d7848dUL, 0xaf96124aUL, 0xb68d230bUL, 0x9da070c8UL, 0x84bb4189UL, + 0x03235d46UL, 0x1a386c07UL, 0x31153fc4UL, 0x280e0e85UL, 0x674f9842UL, + 0x7e54a903UL, 0x5579fac0UL, 0x4c62cb81UL, 0x8138c51fUL, 0x9823f45eUL, + 0xb30ea79dUL, 0xaa1596dcUL, 0xe554001bUL, 0xfc4f315aUL, 0xd7626299UL, + 0xce7953d8UL, 0x49e14f17UL, 0x50fa7e56UL, 0x7bd72d95UL, 0x62cc1cd4UL, + 0x2d8d8a13UL, 0x3496bb52UL, 0x1fbbe891UL, 0x06a0d9d0UL, 0x5e7ef3ecUL, + 0x4765c2adUL, 0x6c48916eUL, 0x7553a02fUL, 0x3a1236e8UL, 0x230907a9UL, + 0x0824546aUL, 0x113f652bUL, 0x96a779e4UL, 0x8fbc48a5UL, 0xa4911b66UL, + 0xbd8a2a27UL, 0xf2cbbce0UL, 0xebd08da1UL, 0xc0fdde62UL, 0xd9e6ef23UL, + 0x14bce1bdUL, 0x0da7d0fcUL, 0x268a833fUL, 0x3f91b27eUL, 0x70d024b9UL, + 0x69cb15f8UL, 0x42e6463bUL, 0x5bfd777aUL, 0xdc656bb5UL, 0xc57e5af4UL, + 0xee530937UL, 0xf7483876UL, 0xb809aeb1UL, 0xa1129ff0UL, 0x8a3fcc33UL, + 0x9324fd72UL + }, + { + 0x00000000UL, 0x01c26a37UL, 0x0384d46eUL, 0x0246be59UL, 0x0709a8dcUL, + 0x06cbc2ebUL, 0x048d7cb2UL, 0x054f1685UL, 0x0e1351b8UL, 0x0fd13b8fUL, + 0x0d9785d6UL, 0x0c55efe1UL, 0x091af964UL, 0x08d89353UL, 0x0a9e2d0aUL, + 0x0b5c473dUL, 0x1c26a370UL, 0x1de4c947UL, 0x1fa2771eUL, 0x1e601d29UL, + 0x1b2f0bacUL, 0x1aed619bUL, 0x18abdfc2UL, 0x1969b5f5UL, 0x1235f2c8UL, + 0x13f798ffUL, 0x11b126a6UL, 0x10734c91UL, 0x153c5a14UL, 0x14fe3023UL, + 0x16b88e7aUL, 0x177ae44dUL, 0x384d46e0UL, 0x398f2cd7UL, 0x3bc9928eUL, + 0x3a0bf8b9UL, 0x3f44ee3cUL, 0x3e86840bUL, 0x3cc03a52UL, 0x3d025065UL, + 0x365e1758UL, 0x379c7d6fUL, 0x35dac336UL, 0x3418a901UL, 0x3157bf84UL, + 0x3095d5b3UL, 0x32d36beaUL, 0x331101ddUL, 0x246be590UL, 0x25a98fa7UL, + 0x27ef31feUL, 0x262d5bc9UL, 0x23624d4cUL, 0x22a0277bUL, 0x20e69922UL, + 0x2124f315UL, 0x2a78b428UL, 0x2bbade1fUL, 0x29fc6046UL, 0x283e0a71UL, + 0x2d711cf4UL, 0x2cb376c3UL, 0x2ef5c89aUL, 0x2f37a2adUL, 0x709a8dc0UL, + 0x7158e7f7UL, 0x731e59aeUL, 0x72dc3399UL, 0x7793251cUL, 0x76514f2bUL, + 0x7417f172UL, 0x75d59b45UL, 0x7e89dc78UL, 0x7f4bb64fUL, 0x7d0d0816UL, + 0x7ccf6221UL, 0x798074a4UL, 0x78421e93UL, 0x7a04a0caUL, 0x7bc6cafdUL, + 0x6cbc2eb0UL, 0x6d7e4487UL, 0x6f38fadeUL, 0x6efa90e9UL, 0x6bb5866cUL, + 0x6a77ec5bUL, 0x68315202UL, 0x69f33835UL, 0x62af7f08UL, 0x636d153fUL, + 0x612bab66UL, 0x60e9c151UL, 0x65a6d7d4UL, 0x6464bde3UL, 0x662203baUL, + 0x67e0698dUL, 0x48d7cb20UL, 0x4915a117UL, 0x4b531f4eUL, 0x4a917579UL, + 0x4fde63fcUL, 0x4e1c09cbUL, 0x4c5ab792UL, 0x4d98dda5UL, 0x46c49a98UL, + 0x4706f0afUL, 0x45404ef6UL, 0x448224c1UL, 0x41cd3244UL, 0x400f5873UL, + 0x4249e62aUL, 0x438b8c1dUL, 0x54f16850UL, 0x55330267UL, 0x5775bc3eUL, + 0x56b7d609UL, 0x53f8c08cUL, 0x523aaabbUL, 0x507c14e2UL, 0x51be7ed5UL, + 0x5ae239e8UL, 0x5b2053dfUL, 0x5966ed86UL, 0x58a487b1UL, 0x5deb9134UL, + 0x5c29fb03UL, 0x5e6f455aUL, 0x5fad2f6dUL, 0xe1351b80UL, 0xe0f771b7UL, + 0xe2b1cfeeUL, 0xe373a5d9UL, 0xe63cb35cUL, 0xe7fed96bUL, 0xe5b86732UL, + 0xe47a0d05UL, 0xef264a38UL, 0xeee4200fUL, 0xeca29e56UL, 0xed60f461UL, + 0xe82fe2e4UL, 0xe9ed88d3UL, 0xebab368aUL, 0xea695cbdUL, 0xfd13b8f0UL, + 0xfcd1d2c7UL, 0xfe976c9eUL, 0xff5506a9UL, 0xfa1a102cUL, 0xfbd87a1bUL, + 0xf99ec442UL, 0xf85cae75UL, 0xf300e948UL, 0xf2c2837fUL, 0xf0843d26UL, + 0xf1465711UL, 0xf4094194UL, 0xf5cb2ba3UL, 0xf78d95faUL, 0xf64fffcdUL, + 0xd9785d60UL, 0xd8ba3757UL, 0xdafc890eUL, 0xdb3ee339UL, 0xde71f5bcUL, + 0xdfb39f8bUL, 0xddf521d2UL, 0xdc374be5UL, 0xd76b0cd8UL, 0xd6a966efUL, + 0xd4efd8b6UL, 0xd52db281UL, 0xd062a404UL, 0xd1a0ce33UL, 0xd3e6706aUL, + 0xd2241a5dUL, 0xc55efe10UL, 0xc49c9427UL, 0xc6da2a7eUL, 0xc7184049UL, + 0xc25756ccUL, 0xc3953cfbUL, 0xc1d382a2UL, 0xc011e895UL, 0xcb4dafa8UL, + 0xca8fc59fUL, 0xc8c97bc6UL, 0xc90b11f1UL, 0xcc440774UL, 0xcd866d43UL, + 0xcfc0d31aUL, 0xce02b92dUL, 0x91af9640UL, 0x906dfc77UL, 0x922b422eUL, + 0x93e92819UL, 0x96a63e9cUL, 0x976454abUL, 0x9522eaf2UL, 0x94e080c5UL, + 0x9fbcc7f8UL, 0x9e7eadcfUL, 0x9c381396UL, 0x9dfa79a1UL, 0x98b56f24UL, + 0x99770513UL, 0x9b31bb4aUL, 0x9af3d17dUL, 0x8d893530UL, 0x8c4b5f07UL, + 0x8e0de15eUL, 0x8fcf8b69UL, 0x8a809decUL, 0x8b42f7dbUL, 0x89044982UL, + 0x88c623b5UL, 0x839a6488UL, 0x82580ebfUL, 0x801eb0e6UL, 0x81dcdad1UL, + 0x8493cc54UL, 0x8551a663UL, 0x8717183aUL, 0x86d5720dUL, 0xa9e2d0a0UL, + 0xa820ba97UL, 0xaa6604ceUL, 0xaba46ef9UL, 0xaeeb787cUL, 0xaf29124bUL, + 0xad6fac12UL, 0xacadc625UL, 0xa7f18118UL, 0xa633eb2fUL, 0xa4755576UL, + 0xa5b73f41UL, 0xa0f829c4UL, 0xa13a43f3UL, 0xa37cfdaaUL, 0xa2be979dUL, + 0xb5c473d0UL, 0xb40619e7UL, 0xb640a7beUL, 0xb782cd89UL, 0xb2cddb0cUL, + 0xb30fb13bUL, 0xb1490f62UL, 0xb08b6555UL, 0xbbd72268UL, 0xba15485fUL, + 0xb853f606UL, 0xb9919c31UL, 0xbcde8ab4UL, 0xbd1ce083UL, 0xbf5a5edaUL, + 0xbe9834edUL + }, + { + 0x00000000UL, 0xb8bc6765UL, 0xaa09c88bUL, 0x12b5afeeUL, 0x8f629757UL, + 0x37def032UL, 0x256b5fdcUL, 0x9dd738b9UL, 0xc5b428efUL, 0x7d084f8aUL, + 0x6fbde064UL, 0xd7018701UL, 0x4ad6bfb8UL, 0xf26ad8ddUL, 0xe0df7733UL, + 0x58631056UL, 0x5019579fUL, 0xe8a530faUL, 0xfa109f14UL, 0x42acf871UL, + 0xdf7bc0c8UL, 0x67c7a7adUL, 0x75720843UL, 0xcdce6f26UL, 0x95ad7f70UL, + 0x2d111815UL, 0x3fa4b7fbUL, 0x8718d09eUL, 0x1acfe827UL, 0xa2738f42UL, + 0xb0c620acUL, 0x087a47c9UL, 0xa032af3eUL, 0x188ec85bUL, 0x0a3b67b5UL, + 0xb28700d0UL, 0x2f503869UL, 0x97ec5f0cUL, 0x8559f0e2UL, 0x3de59787UL, + 0x658687d1UL, 0xdd3ae0b4UL, 0xcf8f4f5aUL, 0x7733283fUL, 0xeae41086UL, + 0x525877e3UL, 0x40edd80dUL, 0xf851bf68UL, 0xf02bf8a1UL, 0x48979fc4UL, + 0x5a22302aUL, 0xe29e574fUL, 0x7f496ff6UL, 0xc7f50893UL, 0xd540a77dUL, + 0x6dfcc018UL, 0x359fd04eUL, 0x8d23b72bUL, 0x9f9618c5UL, 0x272a7fa0UL, + 0xbafd4719UL, 0x0241207cUL, 0x10f48f92UL, 0xa848e8f7UL, 0x9b14583dUL, + 0x23a83f58UL, 0x311d90b6UL, 0x89a1f7d3UL, 0x1476cf6aUL, 0xaccaa80fUL, + 0xbe7f07e1UL, 0x06c36084UL, 0x5ea070d2UL, 0xe61c17b7UL, 0xf4a9b859UL, + 0x4c15df3cUL, 0xd1c2e785UL, 0x697e80e0UL, 0x7bcb2f0eUL, 0xc377486bUL, + 0xcb0d0fa2UL, 0x73b168c7UL, 0x6104c729UL, 0xd9b8a04cUL, 0x446f98f5UL, + 0xfcd3ff90UL, 0xee66507eUL, 0x56da371bUL, 0x0eb9274dUL, 0xb6054028UL, + 0xa4b0efc6UL, 0x1c0c88a3UL, 0x81dbb01aUL, 0x3967d77fUL, 0x2bd27891UL, + 0x936e1ff4UL, 0x3b26f703UL, 0x839a9066UL, 0x912f3f88UL, 0x299358edUL, + 0xb4446054UL, 0x0cf80731UL, 0x1e4da8dfUL, 0xa6f1cfbaUL, 0xfe92dfecUL, + 0x462eb889UL, 0x549b1767UL, 0xec277002UL, 0x71f048bbUL, 0xc94c2fdeUL, + 0xdbf98030UL, 0x6345e755UL, 0x6b3fa09cUL, 0xd383c7f9UL, 0xc1366817UL, + 0x798a0f72UL, 0xe45d37cbUL, 0x5ce150aeUL, 0x4e54ff40UL, 0xf6e89825UL, + 0xae8b8873UL, 0x1637ef16UL, 0x048240f8UL, 0xbc3e279dUL, 0x21e91f24UL, + 0x99557841UL, 0x8be0d7afUL, 0x335cb0caUL, 0xed59b63bUL, 0x55e5d15eUL, + 0x47507eb0UL, 0xffec19d5UL, 0x623b216cUL, 0xda874609UL, 0xc832e9e7UL, + 0x708e8e82UL, 0x28ed9ed4UL, 0x9051f9b1UL, 0x82e4565fUL, 0x3a58313aUL, + 0xa78f0983UL, 0x1f336ee6UL, 0x0d86c108UL, 0xb53aa66dUL, 0xbd40e1a4UL, + 0x05fc86c1UL, 0x1749292fUL, 0xaff54e4aUL, 0x322276f3UL, 0x8a9e1196UL, + 0x982bbe78UL, 0x2097d91dUL, 0x78f4c94bUL, 0xc048ae2eUL, 0xd2fd01c0UL, + 0x6a4166a5UL, 0xf7965e1cUL, 0x4f2a3979UL, 0x5d9f9697UL, 0xe523f1f2UL, + 0x4d6b1905UL, 0xf5d77e60UL, 0xe762d18eUL, 0x5fdeb6ebUL, 0xc2098e52UL, + 0x7ab5e937UL, 0x680046d9UL, 0xd0bc21bcUL, 0x88df31eaUL, 0x3063568fUL, + 0x22d6f961UL, 0x9a6a9e04UL, 0x07bda6bdUL, 0xbf01c1d8UL, 0xadb46e36UL, + 0x15080953UL, 0x1d724e9aUL, 0xa5ce29ffUL, 0xb77b8611UL, 0x0fc7e174UL, + 0x9210d9cdUL, 0x2aacbea8UL, 0x38191146UL, 0x80a57623UL, 0xd8c66675UL, + 0x607a0110UL, 0x72cfaefeUL, 0xca73c99bUL, 0x57a4f122UL, 0xef189647UL, + 0xfdad39a9UL, 0x45115eccUL, 0x764dee06UL, 0xcef18963UL, 0xdc44268dUL, + 0x64f841e8UL, 0xf92f7951UL, 0x41931e34UL, 0x5326b1daUL, 0xeb9ad6bfUL, + 0xb3f9c6e9UL, 0x0b45a18cUL, 0x19f00e62UL, 0xa14c6907UL, 0x3c9b51beUL, + 0x842736dbUL, 0x96929935UL, 0x2e2efe50UL, 0x2654b999UL, 0x9ee8defcUL, + 0x8c5d7112UL, 0x34e11677UL, 0xa9362eceUL, 0x118a49abUL, 0x033fe645UL, + 0xbb838120UL, 0xe3e09176UL, 0x5b5cf613UL, 0x49e959fdUL, 0xf1553e98UL, + 0x6c820621UL, 0xd43e6144UL, 0xc68bceaaUL, 0x7e37a9cfUL, 0xd67f4138UL, + 0x6ec3265dUL, 0x7c7689b3UL, 0xc4caeed6UL, 0x591dd66fUL, 0xe1a1b10aUL, + 0xf3141ee4UL, 0x4ba87981UL, 0x13cb69d7UL, 0xab770eb2UL, 0xb9c2a15cUL, + 0x017ec639UL, 0x9ca9fe80UL, 0x241599e5UL, 0x36a0360bUL, 0x8e1c516eUL, + 0x866616a7UL, 0x3eda71c2UL, 0x2c6fde2cUL, 0x94d3b949UL, 0x090481f0UL, + 0xb1b8e695UL, 0xa30d497bUL, 0x1bb12e1eUL, 0x43d23e48UL, 0xfb6e592dUL, + 0xe9dbf6c3UL, 0x516791a6UL, 0xccb0a91fUL, 0x740cce7aUL, 0x66b96194UL, + 0xde0506f1UL + }, + { + 0x00000000UL, 0x96300777UL, 0x2c610eeeUL, 0xba510999UL, 0x19c46d07UL, + 0x8ff46a70UL, 0x35a563e9UL, 0xa395649eUL, 0x3288db0eUL, 0xa4b8dc79UL, + 0x1ee9d5e0UL, 0x88d9d297UL, 0x2b4cb609UL, 0xbd7cb17eUL, 0x072db8e7UL, + 0x911dbf90UL, 0x6410b71dUL, 0xf220b06aUL, 0x4871b9f3UL, 0xde41be84UL, + 0x7dd4da1aUL, 0xebe4dd6dUL, 0x51b5d4f4UL, 0xc785d383UL, 0x56986c13UL, + 0xc0a86b64UL, 0x7af962fdUL, 0xecc9658aUL, 0x4f5c0114UL, 0xd96c0663UL, + 0x633d0ffaUL, 0xf50d088dUL, 0xc8206e3bUL, 0x5e10694cUL, 0xe44160d5UL, + 0x727167a2UL, 0xd1e4033cUL, 0x47d4044bUL, 0xfd850dd2UL, 0x6bb50aa5UL, + 0xfaa8b535UL, 0x6c98b242UL, 0xd6c9bbdbUL, 0x40f9bcacUL, 0xe36cd832UL, + 0x755cdf45UL, 0xcf0dd6dcUL, 0x593dd1abUL, 0xac30d926UL, 0x3a00de51UL, + 0x8051d7c8UL, 0x1661d0bfUL, 0xb5f4b421UL, 0x23c4b356UL, 0x9995bacfUL, + 0x0fa5bdb8UL, 0x9eb80228UL, 0x0888055fUL, 0xb2d90cc6UL, 0x24e90bb1UL, + 0x877c6f2fUL, 0x114c6858UL, 0xab1d61c1UL, 0x3d2d66b6UL, 0x9041dc76UL, + 0x0671db01UL, 0xbc20d298UL, 0x2a10d5efUL, 0x8985b171UL, 0x1fb5b606UL, + 0xa5e4bf9fUL, 0x33d4b8e8UL, 0xa2c90778UL, 0x34f9000fUL, 0x8ea80996UL, + 0x18980ee1UL, 0xbb0d6a7fUL, 0x2d3d6d08UL, 0x976c6491UL, 0x015c63e6UL, + 0xf4516b6bUL, 0x62616c1cUL, 0xd8306585UL, 0x4e0062f2UL, 0xed95066cUL, + 0x7ba5011bUL, 0xc1f40882UL, 0x57c40ff5UL, 0xc6d9b065UL, 0x50e9b712UL, + 0xeab8be8bUL, 0x7c88b9fcUL, 0xdf1ddd62UL, 0x492dda15UL, 0xf37cd38cUL, + 0x654cd4fbUL, 0x5861b24dUL, 0xce51b53aUL, 0x7400bca3UL, 0xe230bbd4UL, + 0x41a5df4aUL, 0xd795d83dUL, 0x6dc4d1a4UL, 0xfbf4d6d3UL, 0x6ae96943UL, + 0xfcd96e34UL, 0x468867adUL, 0xd0b860daUL, 0x732d0444UL, 0xe51d0333UL, + 0x5f4c0aaaUL, 0xc97c0dddUL, 0x3c710550UL, 0xaa410227UL, 0x10100bbeUL, + 0x86200cc9UL, 0x25b56857UL, 0xb3856f20UL, 0x09d466b9UL, 0x9fe461ceUL, + 0x0ef9de5eUL, 0x98c9d929UL, 0x2298d0b0UL, 0xb4a8d7c7UL, 0x173db359UL, + 0x810db42eUL, 0x3b5cbdb7UL, 0xad6cbac0UL, 0x2083b8edUL, 0xb6b3bf9aUL, + 0x0ce2b603UL, 0x9ad2b174UL, 0x3947d5eaUL, 0xaf77d29dUL, 0x1526db04UL, + 0x8316dc73UL, 0x120b63e3UL, 0x843b6494UL, 0x3e6a6d0dUL, 0xa85a6a7aUL, + 0x0bcf0ee4UL, 0x9dff0993UL, 0x27ae000aUL, 0xb19e077dUL, 0x44930ff0UL, + 0xd2a30887UL, 0x68f2011eUL, 0xfec20669UL, 0x5d5762f7UL, 0xcb676580UL, + 0x71366c19UL, 0xe7066b6eUL, 0x761bd4feUL, 0xe02bd389UL, 0x5a7ada10UL, + 0xcc4add67UL, 0x6fdfb9f9UL, 0xf9efbe8eUL, 0x43beb717UL, 0xd58eb060UL, + 0xe8a3d6d6UL, 0x7e93d1a1UL, 0xc4c2d838UL, 0x52f2df4fUL, 0xf167bbd1UL, + 0x6757bca6UL, 0xdd06b53fUL, 0x4b36b248UL, 0xda2b0dd8UL, 0x4c1b0aafUL, + 0xf64a0336UL, 0x607a0441UL, 0xc3ef60dfUL, 0x55df67a8UL, 0xef8e6e31UL, + 0x79be6946UL, 0x8cb361cbUL, 0x1a8366bcUL, 0xa0d26f25UL, 0x36e26852UL, + 0x95770cccUL, 0x03470bbbUL, 0xb9160222UL, 0x2f260555UL, 0xbe3bbac5UL, + 0x280bbdb2UL, 0x925ab42bUL, 0x046ab35cUL, 0xa7ffd7c2UL, 0x31cfd0b5UL, + 0x8b9ed92cUL, 0x1daede5bUL, 0xb0c2649bUL, 0x26f263ecUL, 0x9ca36a75UL, + 0x0a936d02UL, 0xa906099cUL, 0x3f360eebUL, 0x85670772UL, 0x13570005UL, + 0x824abf95UL, 0x147ab8e2UL, 0xae2bb17bUL, 0x381bb60cUL, 0x9b8ed292UL, + 0x0dbed5e5UL, 0xb7efdc7cUL, 0x21dfdb0bUL, 0xd4d2d386UL, 0x42e2d4f1UL, + 0xf8b3dd68UL, 0x6e83da1fUL, 0xcd16be81UL, 0x5b26b9f6UL, 0xe177b06fUL, + 0x7747b718UL, 0xe65a0888UL, 0x706a0fffUL, 0xca3b0666UL, 0x5c0b0111UL, + 0xff9e658fUL, 0x69ae62f8UL, 0xd3ff6b61UL, 0x45cf6c16UL, 0x78e20aa0UL, + 0xeed20dd7UL, 0x5483044eUL, 0xc2b30339UL, 0x612667a7UL, 0xf71660d0UL, + 0x4d476949UL, 0xdb776e3eUL, 0x4a6ad1aeUL, 0xdc5ad6d9UL, 0x660bdf40UL, + 0xf03bd837UL, 0x53aebca9UL, 0xc59ebbdeUL, 0x7fcfb247UL, 0xe9ffb530UL, + 0x1cf2bdbdUL, 0x8ac2bacaUL, 0x3093b353UL, 0xa6a3b424UL, 0x0536d0baUL, + 0x9306d7cdUL, 0x2957de54UL, 0xbf67d923UL, 0x2e7a66b3UL, 0xb84a61c4UL, + 0x021b685dUL, 0x942b6f2aUL, 0x37be0bb4UL, 0xa18e0cc3UL, 0x1bdf055aUL, + 0x8def022dUL + }, + { + 0x00000000UL, 0x41311b19UL, 0x82623632UL, 0xc3532d2bUL, 0x04c56c64UL, + 0x45f4777dUL, 0x86a75a56UL, 0xc796414fUL, 0x088ad9c8UL, 0x49bbc2d1UL, + 0x8ae8effaUL, 0xcbd9f4e3UL, 0x0c4fb5acUL, 0x4d7eaeb5UL, 0x8e2d839eUL, + 0xcf1c9887UL, 0x5112c24aUL, 0x1023d953UL, 0xd370f478UL, 0x9241ef61UL, + 0x55d7ae2eUL, 0x14e6b537UL, 0xd7b5981cUL, 0x96848305UL, 0x59981b82UL, + 0x18a9009bUL, 0xdbfa2db0UL, 0x9acb36a9UL, 0x5d5d77e6UL, 0x1c6c6cffUL, + 0xdf3f41d4UL, 0x9e0e5acdUL, 0xa2248495UL, 0xe3159f8cUL, 0x2046b2a7UL, + 0x6177a9beUL, 0xa6e1e8f1UL, 0xe7d0f3e8UL, 0x2483dec3UL, 0x65b2c5daUL, + 0xaaae5d5dUL, 0xeb9f4644UL, 0x28cc6b6fUL, 0x69fd7076UL, 0xae6b3139UL, + 0xef5a2a20UL, 0x2c09070bUL, 0x6d381c12UL, 0xf33646dfUL, 0xb2075dc6UL, + 0x715470edUL, 0x30656bf4UL, 0xf7f32abbUL, 0xb6c231a2UL, 0x75911c89UL, + 0x34a00790UL, 0xfbbc9f17UL, 0xba8d840eUL, 0x79dea925UL, 0x38efb23cUL, + 0xff79f373UL, 0xbe48e86aUL, 0x7d1bc541UL, 0x3c2ade58UL, 0x054f79f0UL, + 0x447e62e9UL, 0x872d4fc2UL, 0xc61c54dbUL, 0x018a1594UL, 0x40bb0e8dUL, + 0x83e823a6UL, 0xc2d938bfUL, 0x0dc5a038UL, 0x4cf4bb21UL, 0x8fa7960aUL, + 0xce968d13UL, 0x0900cc5cUL, 0x4831d745UL, 0x8b62fa6eUL, 0xca53e177UL, + 0x545dbbbaUL, 0x156ca0a3UL, 0xd63f8d88UL, 0x970e9691UL, 0x5098d7deUL, + 0x11a9ccc7UL, 0xd2fae1ecUL, 0x93cbfaf5UL, 0x5cd76272UL, 0x1de6796bUL, + 0xdeb55440UL, 0x9f844f59UL, 0x58120e16UL, 0x1923150fUL, 0xda703824UL, + 0x9b41233dUL, 0xa76bfd65UL, 0xe65ae67cUL, 0x2509cb57UL, 0x6438d04eUL, + 0xa3ae9101UL, 0xe29f8a18UL, 0x21cca733UL, 0x60fdbc2aUL, 0xafe124adUL, + 0xeed03fb4UL, 0x2d83129fUL, 0x6cb20986UL, 0xab2448c9UL, 0xea1553d0UL, + 0x29467efbUL, 0x687765e2UL, 0xf6793f2fUL, 0xb7482436UL, 0x741b091dUL, + 0x352a1204UL, 0xf2bc534bUL, 0xb38d4852UL, 0x70de6579UL, 0x31ef7e60UL, + 0xfef3e6e7UL, 0xbfc2fdfeUL, 0x7c91d0d5UL, 0x3da0cbccUL, 0xfa368a83UL, + 0xbb07919aUL, 0x7854bcb1UL, 0x3965a7a8UL, 0x4b98833bUL, 0x0aa99822UL, + 0xc9fab509UL, 0x88cbae10UL, 0x4f5def5fUL, 0x0e6cf446UL, 0xcd3fd96dUL, + 0x8c0ec274UL, 0x43125af3UL, 0x022341eaUL, 0xc1706cc1UL, 0x804177d8UL, + 0x47d73697UL, 0x06e62d8eUL, 0xc5b500a5UL, 0x84841bbcUL, 0x1a8a4171UL, + 0x5bbb5a68UL, 0x98e87743UL, 0xd9d96c5aUL, 0x1e4f2d15UL, 0x5f7e360cUL, + 0x9c2d1b27UL, 0xdd1c003eUL, 0x120098b9UL, 0x533183a0UL, 0x9062ae8bUL, + 0xd153b592UL, 0x16c5f4ddUL, 0x57f4efc4UL, 0x94a7c2efUL, 0xd596d9f6UL, + 0xe9bc07aeUL, 0xa88d1cb7UL, 0x6bde319cUL, 0x2aef2a85UL, 0xed796bcaUL, + 0xac4870d3UL, 0x6f1b5df8UL, 0x2e2a46e1UL, 0xe136de66UL, 0xa007c57fUL, + 0x6354e854UL, 0x2265f34dUL, 0xe5f3b202UL, 0xa4c2a91bUL, 0x67918430UL, + 0x26a09f29UL, 0xb8aec5e4UL, 0xf99fdefdUL, 0x3accf3d6UL, 0x7bfde8cfUL, + 0xbc6ba980UL, 0xfd5ab299UL, 0x3e099fb2UL, 0x7f3884abUL, 0xb0241c2cUL, + 0xf1150735UL, 0x32462a1eUL, 0x73773107UL, 0xb4e17048UL, 0xf5d06b51UL, + 0x3683467aUL, 0x77b25d63UL, 0x4ed7facbUL, 0x0fe6e1d2UL, 0xccb5ccf9UL, + 0x8d84d7e0UL, 0x4a1296afUL, 0x0b238db6UL, 0xc870a09dUL, 0x8941bb84UL, + 0x465d2303UL, 0x076c381aUL, 0xc43f1531UL, 0x850e0e28UL, 0x42984f67UL, + 0x03a9547eUL, 0xc0fa7955UL, 0x81cb624cUL, 0x1fc53881UL, 0x5ef42398UL, + 0x9da70eb3UL, 0xdc9615aaUL, 0x1b0054e5UL, 0x5a314ffcUL, 0x996262d7UL, + 0xd85379ceUL, 0x174fe149UL, 0x567efa50UL, 0x952dd77bUL, 0xd41ccc62UL, + 0x138a8d2dUL, 0x52bb9634UL, 0x91e8bb1fUL, 0xd0d9a006UL, 0xecf37e5eUL, + 0xadc26547UL, 0x6e91486cUL, 0x2fa05375UL, 0xe836123aUL, 0xa9070923UL, + 0x6a542408UL, 0x2b653f11UL, 0xe479a796UL, 0xa548bc8fUL, 0x661b91a4UL, + 0x272a8abdUL, 0xe0bccbf2UL, 0xa18dd0ebUL, 0x62defdc0UL, 0x23efe6d9UL, + 0xbde1bc14UL, 0xfcd0a70dUL, 0x3f838a26UL, 0x7eb2913fUL, 0xb924d070UL, + 0xf815cb69UL, 0x3b46e642UL, 0x7a77fd5bUL, 0xb56b65dcUL, 0xf45a7ec5UL, + 0x370953eeUL, 0x763848f7UL, 0xb1ae09b8UL, 0xf09f12a1UL, 0x33cc3f8aUL, + 0x72fd2493UL + }, + { + 0x00000000UL, 0x376ac201UL, 0x6ed48403UL, 0x59be4602UL, 0xdca80907UL, + 0xebc2cb06UL, 0xb27c8d04UL, 0x85164f05UL, 0xb851130eUL, 0x8f3bd10fUL, + 0xd685970dUL, 0xe1ef550cUL, 0x64f91a09UL, 0x5393d808UL, 0x0a2d9e0aUL, + 0x3d475c0bUL, 0x70a3261cUL, 0x47c9e41dUL, 0x1e77a21fUL, 0x291d601eUL, + 0xac0b2f1bUL, 0x9b61ed1aUL, 0xc2dfab18UL, 0xf5b56919UL, 0xc8f23512UL, + 0xff98f713UL, 0xa626b111UL, 0x914c7310UL, 0x145a3c15UL, 0x2330fe14UL, + 0x7a8eb816UL, 0x4de47a17UL, 0xe0464d38UL, 0xd72c8f39UL, 0x8e92c93bUL, + 0xb9f80b3aUL, 0x3cee443fUL, 0x0b84863eUL, 0x523ac03cUL, 0x6550023dUL, + 0x58175e36UL, 0x6f7d9c37UL, 0x36c3da35UL, 0x01a91834UL, 0x84bf5731UL, + 0xb3d59530UL, 0xea6bd332UL, 0xdd011133UL, 0x90e56b24UL, 0xa78fa925UL, + 0xfe31ef27UL, 0xc95b2d26UL, 0x4c4d6223UL, 0x7b27a022UL, 0x2299e620UL, + 0x15f32421UL, 0x28b4782aUL, 0x1fdeba2bUL, 0x4660fc29UL, 0x710a3e28UL, + 0xf41c712dUL, 0xc376b32cUL, 0x9ac8f52eUL, 0xada2372fUL, 0xc08d9a70UL, + 0xf7e75871UL, 0xae591e73UL, 0x9933dc72UL, 0x1c259377UL, 0x2b4f5176UL, + 0x72f11774UL, 0x459bd575UL, 0x78dc897eUL, 0x4fb64b7fUL, 0x16080d7dUL, + 0x2162cf7cUL, 0xa4748079UL, 0x931e4278UL, 0xcaa0047aUL, 0xfdcac67bUL, + 0xb02ebc6cUL, 0x87447e6dUL, 0xdefa386fUL, 0xe990fa6eUL, 0x6c86b56bUL, + 0x5bec776aUL, 0x02523168UL, 0x3538f369UL, 0x087faf62UL, 0x3f156d63UL, + 0x66ab2b61UL, 0x51c1e960UL, 0xd4d7a665UL, 0xe3bd6464UL, 0xba032266UL, + 0x8d69e067UL, 0x20cbd748UL, 0x17a11549UL, 0x4e1f534bUL, 0x7975914aUL, + 0xfc63de4fUL, 0xcb091c4eUL, 0x92b75a4cUL, 0xa5dd984dUL, 0x989ac446UL, + 0xaff00647UL, 0xf64e4045UL, 0xc1248244UL, 0x4432cd41UL, 0x73580f40UL, + 0x2ae64942UL, 0x1d8c8b43UL, 0x5068f154UL, 0x67023355UL, 0x3ebc7557UL, + 0x09d6b756UL, 0x8cc0f853UL, 0xbbaa3a52UL, 0xe2147c50UL, 0xd57ebe51UL, + 0xe839e25aUL, 0xdf53205bUL, 0x86ed6659UL, 0xb187a458UL, 0x3491eb5dUL, + 0x03fb295cUL, 0x5a456f5eUL, 0x6d2fad5fUL, 0x801b35e1UL, 0xb771f7e0UL, + 0xeecfb1e2UL, 0xd9a573e3UL, 0x5cb33ce6UL, 0x6bd9fee7UL, 0x3267b8e5UL, + 0x050d7ae4UL, 0x384a26efUL, 0x0f20e4eeUL, 0x569ea2ecUL, 0x61f460edUL, + 0xe4e22fe8UL, 0xd388ede9UL, 0x8a36abebUL, 0xbd5c69eaUL, 0xf0b813fdUL, + 0xc7d2d1fcUL, 0x9e6c97feUL, 0xa90655ffUL, 0x2c101afaUL, 0x1b7ad8fbUL, + 0x42c49ef9UL, 0x75ae5cf8UL, 0x48e900f3UL, 0x7f83c2f2UL, 0x263d84f0UL, + 0x115746f1UL, 0x944109f4UL, 0xa32bcbf5UL, 0xfa958df7UL, 0xcdff4ff6UL, + 0x605d78d9UL, 0x5737bad8UL, 0x0e89fcdaUL, 0x39e33edbUL, 0xbcf571deUL, + 0x8b9fb3dfUL, 0xd221f5ddUL, 0xe54b37dcUL, 0xd80c6bd7UL, 0xef66a9d6UL, + 0xb6d8efd4UL, 0x81b22dd5UL, 0x04a462d0UL, 0x33cea0d1UL, 0x6a70e6d3UL, + 0x5d1a24d2UL, 0x10fe5ec5UL, 0x27949cc4UL, 0x7e2adac6UL, 0x494018c7UL, + 0xcc5657c2UL, 0xfb3c95c3UL, 0xa282d3c1UL, 0x95e811c0UL, 0xa8af4dcbUL, + 0x9fc58fcaUL, 0xc67bc9c8UL, 0xf1110bc9UL, 0x740744ccUL, 0x436d86cdUL, + 0x1ad3c0cfUL, 0x2db902ceUL, 0x4096af91UL, 0x77fc6d90UL, 0x2e422b92UL, + 0x1928e993UL, 0x9c3ea696UL, 0xab546497UL, 0xf2ea2295UL, 0xc580e094UL, + 0xf8c7bc9fUL, 0xcfad7e9eUL, 0x9613389cUL, 0xa179fa9dUL, 0x246fb598UL, + 0x13057799UL, 0x4abb319bUL, 0x7dd1f39aUL, 0x3035898dUL, 0x075f4b8cUL, + 0x5ee10d8eUL, 0x698bcf8fUL, 0xec9d808aUL, 0xdbf7428bUL, 0x82490489UL, + 0xb523c688UL, 0x88649a83UL, 0xbf0e5882UL, 0xe6b01e80UL, 0xd1dadc81UL, + 0x54cc9384UL, 0x63a65185UL, 0x3a181787UL, 0x0d72d586UL, 0xa0d0e2a9UL, + 0x97ba20a8UL, 0xce0466aaUL, 0xf96ea4abUL, 0x7c78ebaeUL, 0x4b1229afUL, + 0x12ac6fadUL, 0x25c6adacUL, 0x1881f1a7UL, 0x2feb33a6UL, 0x765575a4UL, + 0x413fb7a5UL, 0xc429f8a0UL, 0xf3433aa1UL, 0xaafd7ca3UL, 0x9d97bea2UL, + 0xd073c4b5UL, 0xe71906b4UL, 0xbea740b6UL, 0x89cd82b7UL, 0x0cdbcdb2UL, + 0x3bb10fb3UL, 0x620f49b1UL, 0x55658bb0UL, 0x6822d7bbUL, 0x5f4815baUL, + 0x06f653b8UL, 0x319c91b9UL, 0xb48adebcUL, 0x83e01cbdUL, 0xda5e5abfUL, + 0xed3498beUL + }, + { + 0x00000000UL, 0x6567bcb8UL, 0x8bc809aaUL, 0xeeafb512UL, 0x5797628fUL, + 0x32f0de37UL, 0xdc5f6b25UL, 0xb938d79dUL, 0xef28b4c5UL, 0x8a4f087dUL, + 0x64e0bd6fUL, 0x018701d7UL, 0xb8bfd64aUL, 0xddd86af2UL, 0x3377dfe0UL, + 0x56106358UL, 0x9f571950UL, 0xfa30a5e8UL, 0x149f10faUL, 0x71f8ac42UL, + 0xc8c07bdfUL, 0xada7c767UL, 0x43087275UL, 0x266fcecdUL, 0x707fad95UL, + 0x1518112dUL, 0xfbb7a43fUL, 0x9ed01887UL, 0x27e8cf1aUL, 0x428f73a2UL, + 0xac20c6b0UL, 0xc9477a08UL, 0x3eaf32a0UL, 0x5bc88e18UL, 0xb5673b0aUL, + 0xd00087b2UL, 0x6938502fUL, 0x0c5fec97UL, 0xe2f05985UL, 0x8797e53dUL, + 0xd1878665UL, 0xb4e03addUL, 0x5a4f8fcfUL, 0x3f283377UL, 0x8610e4eaUL, + 0xe3775852UL, 0x0dd8ed40UL, 0x68bf51f8UL, 0xa1f82bf0UL, 0xc49f9748UL, + 0x2a30225aUL, 0x4f579ee2UL, 0xf66f497fUL, 0x9308f5c7UL, 0x7da740d5UL, + 0x18c0fc6dUL, 0x4ed09f35UL, 0x2bb7238dUL, 0xc518969fUL, 0xa07f2a27UL, + 0x1947fdbaUL, 0x7c204102UL, 0x928ff410UL, 0xf7e848a8UL, 0x3d58149bUL, + 0x583fa823UL, 0xb6901d31UL, 0xd3f7a189UL, 0x6acf7614UL, 0x0fa8caacUL, + 0xe1077fbeUL, 0x8460c306UL, 0xd270a05eUL, 0xb7171ce6UL, 0x59b8a9f4UL, + 0x3cdf154cUL, 0x85e7c2d1UL, 0xe0807e69UL, 0x0e2fcb7bUL, 0x6b4877c3UL, + 0xa20f0dcbUL, 0xc768b173UL, 0x29c70461UL, 0x4ca0b8d9UL, 0xf5986f44UL, + 0x90ffd3fcUL, 0x7e5066eeUL, 0x1b37da56UL, 0x4d27b90eUL, 0x284005b6UL, + 0xc6efb0a4UL, 0xa3880c1cUL, 0x1ab0db81UL, 0x7fd76739UL, 0x9178d22bUL, + 0xf41f6e93UL, 0x03f7263bUL, 0x66909a83UL, 0x883f2f91UL, 0xed589329UL, + 0x546044b4UL, 0x3107f80cUL, 0xdfa84d1eUL, 0xbacff1a6UL, 0xecdf92feUL, + 0x89b82e46UL, 0x67179b54UL, 0x027027ecUL, 0xbb48f071UL, 0xde2f4cc9UL, + 0x3080f9dbUL, 0x55e74563UL, 0x9ca03f6bUL, 0xf9c783d3UL, 0x176836c1UL, + 0x720f8a79UL, 0xcb375de4UL, 0xae50e15cUL, 0x40ff544eUL, 0x2598e8f6UL, + 0x73888baeUL, 0x16ef3716UL, 0xf8408204UL, 0x9d273ebcUL, 0x241fe921UL, + 0x41785599UL, 0xafd7e08bUL, 0xcab05c33UL, 0x3bb659edUL, 0x5ed1e555UL, + 0xb07e5047UL, 0xd519ecffUL, 0x6c213b62UL, 0x094687daUL, 0xe7e932c8UL, + 0x828e8e70UL, 0xd49eed28UL, 0xb1f95190UL, 0x5f56e482UL, 0x3a31583aUL, + 0x83098fa7UL, 0xe66e331fUL, 0x08c1860dUL, 0x6da63ab5UL, 0xa4e140bdUL, + 0xc186fc05UL, 0x2f294917UL, 0x4a4ef5afUL, 0xf3762232UL, 0x96119e8aUL, + 0x78be2b98UL, 0x1dd99720UL, 0x4bc9f478UL, 0x2eae48c0UL, 0xc001fdd2UL, + 0xa566416aUL, 0x1c5e96f7UL, 0x79392a4fUL, 0x97969f5dUL, 0xf2f123e5UL, + 0x05196b4dUL, 0x607ed7f5UL, 0x8ed162e7UL, 0xebb6de5fUL, 0x528e09c2UL, + 0x37e9b57aUL, 0xd9460068UL, 0xbc21bcd0UL, 0xea31df88UL, 0x8f566330UL, + 0x61f9d622UL, 0x049e6a9aUL, 0xbda6bd07UL, 0xd8c101bfUL, 0x366eb4adUL, + 0x53090815UL, 0x9a4e721dUL, 0xff29cea5UL, 0x11867bb7UL, 0x74e1c70fUL, + 0xcdd91092UL, 0xa8beac2aUL, 0x46111938UL, 0x2376a580UL, 0x7566c6d8UL, + 0x10017a60UL, 0xfeaecf72UL, 0x9bc973caUL, 0x22f1a457UL, 0x479618efUL, + 0xa939adfdUL, 0xcc5e1145UL, 0x06ee4d76UL, 0x6389f1ceUL, 0x8d2644dcUL, + 0xe841f864UL, 0x51792ff9UL, 0x341e9341UL, 0xdab12653UL, 0xbfd69aebUL, + 0xe9c6f9b3UL, 0x8ca1450bUL, 0x620ef019UL, 0x07694ca1UL, 0xbe519b3cUL, + 0xdb362784UL, 0x35999296UL, 0x50fe2e2eUL, 0x99b95426UL, 0xfcdee89eUL, + 0x12715d8cUL, 0x7716e134UL, 0xce2e36a9UL, 0xab498a11UL, 0x45e63f03UL, + 0x208183bbUL, 0x7691e0e3UL, 0x13f65c5bUL, 0xfd59e949UL, 0x983e55f1UL, + 0x2106826cUL, 0x44613ed4UL, 0xaace8bc6UL, 0xcfa9377eUL, 0x38417fd6UL, + 0x5d26c36eUL, 0xb389767cUL, 0xd6eecac4UL, 0x6fd61d59UL, 0x0ab1a1e1UL, + 0xe41e14f3UL, 0x8179a84bUL, 0xd769cb13UL, 0xb20e77abUL, 0x5ca1c2b9UL, + 0x39c67e01UL, 0x80fea99cUL, 0xe5991524UL, 0x0b36a036UL, 0x6e511c8eUL, + 0xa7166686UL, 0xc271da3eUL, 0x2cde6f2cUL, 0x49b9d394UL, 0xf0810409UL, + 0x95e6b8b1UL, 0x7b490da3UL, 0x1e2eb11bUL, 0x483ed243UL, 0x2d596efbUL, + 0xc3f6dbe9UL, 0xa6916751UL, 0x1fa9b0ccUL, 0x7ace0c74UL, 0x9461b966UL, + 0xf10605deUL +#endif + } +}; diff --git a/debian/FAQ b/debian/FAQ new file mode 100644 index 00000000..325fcb59 --- /dev/null +++ b/debian/FAQ @@ -0,0 +1,581 @@ +Frequently asked questions -- Debian mdadm +========================================== + +Also see /usr/share/doc/mdadm/README.recipes.gz . + +The latest version of this FAQ is available here: + http://git.debian.org/?p=pkg-mdadm/mdadm.git;a=blob;f=debian/FAQ;hb=HEAD + +0. What does MD stand for? +~~~~~~~~~~~~~~~~~~~~~~~~~~ + MD is an abbreviation for "multiple device" (also often called "multi- + disk"). The Linux MD implementation implements various strategies for + combining multiple physical devices into single logical ones. The most + common use case is commonly known as "Software RAID". Linux supports RAID + levels 1, 4, 5, 6, and 10, as well as the "pseudo-redundant" RAID level 0. + In addition, the MD implementation covers linear and multipath + configurations. + + Most people refer to MD as RAID. Since the original name of the RAID + configuration software is "md"adm, I chose to use MD consistently instead. + +1. How do I overwrite ("zero") the superblock? +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + mdadm --zero-superblock /dev/mdX + + Note that this is a destructive operation. It does not actually delete any + data, but the device will have lost its "authority". You cannot assemble the + array with it anymore, and if you add the device to another array, the + synchronisation process *will* *overwrite* all data on the device. + + Nevertheless, sometimes it is necessary to zero the superblock: + + - If you are reusing a disk that has been part of an array with an different + superblock version and/or location. In this case you zero the superblock + before you assemble the array, or add the device to an array. + + - If you are trying to prevent a device from being recognised as part of an + array. Say for instance you are trying to change an array spanning sd[ab]1 + to sd[bc]1 (maybe because sda is failing or too slow), then automatic + (scan) assembly will still recognise sda1 as a valid device. You can limit + the devices to scan with the DEVICE keyword in the configuration file, but + this may not be what you want. Instead, zeroing the superblock will + (permanently) prevent a device from being considered as part of an array. + +2. How do I change the preferred minor of an MD array (RAID)? +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + See item 12 in /usr/share/doc/mdadm/README.recipes.gz and read the mdadm + manpage (search for 'preferred'). + +3. How does mdadm determine which /dev/mdX or /dev/md/X to use? +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + The logic used by mdadm to determine the device node name in the mdadm + --examine output (which is used to generate mdadm.conf) depends on several + factors. Here's how mdadm determines it: + + It first checks the superblock version of a given array (or each array in + turn when iterating all of them). Run + + mdadm --detail /dev/mdX | sed -ne 's,.*Version : ,,p' + + to determine the superblock version of a running array, or + + mdadm --examine /dev/sdXY | sed -ne 's,.*Version : ,,p' + + to determine the superblock version from a component device of an array. + + Version 0 superblocks (00.90.XX) + '''''''''''''''''''''''''''''''' + You need to know the preferred minor number stored in the superblock, + so run either of + + mdadm --detail /dev/mdX | sed -ne 's,.*Preferred Minor : ,,p' + mdadm --examine /dev/sdXY | sed -ne 's,.*Preferred Minor : ,,p' + + Let's call the resulting number MINOR. Also see FAQ 2 further up. + + Given MINOR, mdadm will output /dev/md<MINOR> if the device node + /dev/md<MINOR> exists. + Otherwise, it outputs /dev/md/<MINOR> + + Version 1 superblocks (01.XX.XX) + '''''''''''''''''''''''''''''''' + Version 1 superblocks actually seem to ignore preferred minors and instead + use the value of the name field in the superblock. Unless specified + explicitly during creation (-N|--name) the name is determined from the + device name used, using the following regexp: 's,/dev/md/?(.*),$1,', thus: + + /dev/md0 -> 0 + /dev/md/0 -> 0 + /dev/md_d0 -> _d0 (d0 in later versions) + /dev/md/d0 -> d0 + /dev/md/name -> name + (/dev/name does not seem to work) + + mdadm will append the name to '/dev/md/', so it will always output device + names under the /dev/md/ directory. Newer versions can create a symlink + from /dev/mdX. See the symlinks option in mdadm.con(5) and mdadm(8). + + If you want to change the name, you can do so during assembly: + + mdadm -A -U name -N newname /dev/mdX /dev/sd[abc]X + + I know this all sounds inconsistent and upstream has some work to do. + We're on it. + +4. Which RAID level should I use? +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Many people seem to prefer RAID4/5/6 because it makes more efficient use of + space. For example, if you have disks of size X, then in order to get 2X + storage, you need 3 disks for RAID5, but 4 if you use RAID10 or RAID1+0 (or + RAID6). + + This gain in usable space comes at a price: performance; RAID1/10 can be up + to four times faster than RAID4/5/6. + + At the same time, however, RAID4/5/6 provide somewhat better redundancy in + the event of two failing disks. In a RAID10 configuration, if one disk is + already dead, the RAID can only survive if any of the two disks in the other + RAID1 array fails, but not if the second disk in the degraded RAID1 array + fails (see next item, 4b). A RAID6 across four disks can cope with any two + disks failing. However, RAID6 is noticeably slower than RAID5. RAID5 and + RAID4 do not differ much, but can only handle single-disk failures. + + If you can afford the extra disks (storage *is* cheap these days), I suggest + RAID1/10 over RAID4/5/6. If you don't care about performance but need as + much space as possible, go with RAID4/5/6, but make sure to have backups. + Heck, make sure to have backups whatever you do. + + Let it be said, however, that I thoroughly regret putting my primary + workstation on RAID5. Anything disk-intensive brings the system to its + knees; I will have to migrate to RAID10 at one point. + +4b. Can a 4-disk RAID10 survive two disk failures? +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + I am assuming that you are talking about a setup with two copies of each + block, so --layout=near2/far2/offset2: + + In two thirds of the cases, yes[0], and it does not matter which layout you + use. When you assemble 4 disks into a RAID10, you essentially stripe a RAID0 + across two RAID1, so the four disks A,B,C,D become two pairs: A,B and C,D. + If A fails, the RAID10 can only survive if the second failing disk is either + C or D; If B fails, your array is dead. + + Thus, if you see a disk failing, replace it as soon as possible! + + If you need to handle two failing disks out of a set of four, you have to + use RAID6, or store more than two copies of each block (see the --layout + option in the mdadm(8) manpage). + + See also question 18 further down. + + 0. it's actually (n-2)/(n-1), where n is the number of disks. I am not + a mathematician, see http://aput.net/~jheiss/raid10/, which gives the + chance of *failure* as 1/(n-1), so the chance of success is 1-1/(n-1), or + (n-2)/(n-1), or 2/3 in the four disk example. + (Thanks to Per Olofsson for clarifying this in #493577). + +5. How to convert RAID5 to RAID10? +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + To convert RAID5 to RAID10, you need a spare disk (either a spare, forth + disk in the array, or a new one). Then you remove the spare and one of the + three disks from the RAID5, create a degraded RAID10 across them, create + the filesystem and copy the data (or do a raw copy), then add the other two + disks to the new RAID10. However, mdadm cannot assemble a RAID10 with 50% + missing devices the way you might like it: + + mdadm --create -l 10 -n4 -pn2 /dev/md1 /dev/sd[cd] missing missing + + For reasons that may be answered by question 20 further down, mdadm actually + cares about the order of devices you give it. If you intersperse the missing + keywords with the physical drives, it should work: + + mdadm --create -l 10 -n4 -pn2 /dev/md1 /dev/sdc missing /dev/sdd missing + + or even + + mdadm --create -l 10 -n4 -pn2 /dev/md1 missing /dev/sd[cd] missing + + Also see item (4b) further up, and this thread: + http://marc.theaimsgroup.com/?l=linux-raid&m=116004333406395&w=2 + +6. What is the difference between RAID1+0 and RAID10? +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + RAID1+0 is a form of RAID in which a RAID0 is striped across two RAID1 + arrays. To assemble it, you create two RAID1 arrays and then create a RAID0 + array with the two md arrays. + + The Linux kernel provides the RAID10 level to do pretty much exactly the + same for you, but with greater flexibility (and somewhat improved + performance). While RAID1+0 makes sense with 4 disks, RAID10 can be + configured to work with only 3 disks. Also, RAID10 has a little less + overhead than RAID1+0, which has data pass the md layer twice. + + I prefer RAID10 over RAID1+0. + +6b. What's the difference between RAID1+0 and RAID0+1? +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + In short: RAID1+0 concatenates two mirrored arrays while RAID0+1 mirrors two + concatenated arrays. However, the two are also often switched. + + The linux MD driver supports RAID10, which is equivalent to the above + RAID1+0 definition. + + RAID1+0/10 has a greater chance to survive two disk failures, its + performance suffers less when in degraded state, and it resyncs faster after + replacing a failed disk. + + See http://aput.net/~jheiss/raid10/ for more details. + +7. Which RAID10 layout scheme should I use +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + RAID10 gives you the choice between three ways of laying out the blocks on + the disk. Assuming a simple 4 drive setup with 2 copies of each block, then + if A,B,C are data blocks, a,b their parts, and 1,2 denote their copies, the + following would be a classic RAID1+0 where 1,2 and 3,4 are RAID0 pairs + combined into a RAID1: + + near=2 would be (this is the classic RAID1+0) + + hdd1 Aa1 Ba1 Ca1 + hdd2 Aa2 Ba2 Ca2 + hdd3 Ab1 Bb1 Cb1 + hdd4 Ab2 Bb2 Cb2 + + offset=2 would be + + hdd1 Aa1 Bb2 Ca1 Db2 + hdd2 Ab1 Aa2 Cb1 Ca2 + hdd3 Ba1 Ab2 Da1 Cb2 + hdd4 Bb1 Ba2 Db1 Da2 + + far=2 would be + + hdd1 Aa1 Ca1 .... Bb2 Db2 + hdd2 Ab1 Cb1 .... Aa2 Ca2 + hdd3 Ba1 Da1 .... Ab2 Cb2 + hdd4 Bb1 Db1 .... Ba2 Da2 + + Where the second set start half-way through the drives. + + The advantage of far= is that you can easily spread a long sequential read + across the drives. The cost is more seeking for writes. offset= can + possibly get similar benefits with large enough chunk size. Neither upstream + nor the package maintainer have tried to understand all the implications of + that layout. It was added simply because it is a supported layout in DDF and + DDF support is a goal. + +8. (One of) my RAID arrays is busy and cannot be stopped. What gives? +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + It is perfectly normal for mdadm to report the array with the root + filesystem to be busy on shutdown. The reason for this is that the root + filesystem must be mounted to be able to stop the array (or otherwise + /sbin/mdadm does not exist), but to stop the array, the root filesystem + cannot be mounted. Catch 22. The kernel actually stops the array just before + halting, so it's all well. + + If mdadm cannot stop other arrays on your system, check that these arrays + aren't used anymore. Common causes for busy/locked arrays are: + + * The array contains a mounted filesystem (check the `mount' output) + * The array is used as a swap backend (check /proc/swaps) + * The array is used by the device-mapper (check with `dmsetup') + * LVM + * dm-crypt + * EVMS + * The array contains a swap partition used for suspend-to-ram + (check /etc/initramfs-tools/conf.d/resume) + * The array is used by a process (check with `lsof') + +9. Should I use RAID0 (or linear)? +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + No. Unless you know what you're doing and keep backups, or use it for data + that can be lost. + +9b. Why not? +~~~~~~~~~~~~ + RAID0 has zero redundancy. If you stripe a RAID0 across X disks, you + increase the likelyhood of complete loss of the filesystem by a factor of X. + + The same applies to LVM by the way. + + If you want/must used LVM or RAID0, stripe it across RAID1 arrays + (RAID10/RAID1+0, or LVM on RAID1), and keep backups! + +10. Can I cancel a running array check (checkarray)? +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + See the -x option in the `/usr/share/mdadm/checkarray --help` output. + +11. mdadm warns about duplicate/similar superblocks; what gives? +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + In certain configurations, especially if your last partition extends all the + way to the end of the disk, mdadm may display a warning like: + + mdadm: WARNING /dev/hdc3 and /dev/hdc appear to have very similar + superblocks. If they are really different, please --zero the superblock on + one. If they are the same or overlap, please remove one from the DEVICE + list in mdadm.conf. + + There are two ways to solve this: + + (a) recreate the arrays with version-1 superblocks, which is not always an + option -- you cannot yet upgrade version-0 to version-1 superblocks for + existing arrays. + + (b) instead of 'DEVICE partitions', list exactly those devices that are + components of MD arrays on your system. So in the above example: + + - DEVICE partitions + + DEVICE /dev/hd[ab]* /dev/hdc[123] + +12. mdadm -E / mkconf report different arrays with the same device + name / minor number. What gives? +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + In almost all cases, mdadm updates the super-minor field in an array's + superblock when assembling the array. It does *not* do this for RAID0 + arrays. Thus, you may end up seeing something like this when you run + mdadm -E or mkconf: + + ARRAY /dev/md0 level=raid0 num-devices=2 UUID=abcd... + ARRAY /dev/md0 level=raid1 num-devices=2 UUID=dcba... + + Note how the two arrays have different UUIDs but both appear as /dev/md0. + + The solution in this case is to explicitly tell mdadm to update the + superblock of the RAID0 array. Assuming that the RAID0 array in the above + example should really be /dev/md1: + + mdadm --stop /dev/md1 + mdadm --assemble --update=super-minor --uuid=abcd... /dev/md1 + + See question 2 of this FAQ, and also http://bugs.debian.org/386315 and + recipe #12 in README.recipes . + +13. Can a MD array be partitioned? +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Since kernel 2.6.28, MD arrays can be partitioned like any other block + device. + + Prior to 2.6.28, for a MD array to be able to hold partitions, it must be + created as a "partitionable array", using the configuration auto=part on the + command line or in the configuration file, or by using the standard naming + scheme (md_d* or md/d*) for partitionable arrays: + + mdadm --create --auto=yes ... /dev/md_d0 ... + # see mdadm(8) manpage about the values of the --auto keyword + +14. When would I partition an array? +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + This answer by Doug Ledford is shamelessly adapted from [0] (with + permission): + + First, not all MD types make sense to be split up, e.g. multipath. For + those types, when a disk fails, the *entire* disk is considered to have + failed, but with different arrays you won't switch over to the next path + until each MD array has attempted to access the bad path. This can have + obvious bad consequences for certain array types that do automatic + failover from one port to another (you can end up getting the array in + a loop of switching ports repeatedly to satisfy the fact that one array + failed over during a path down, then the path came back up, and another + array stayed on the old path because it didn't send any commands during + the path down time period). + + Second, convenience. Assume you have a 6 disk RAID5 array. If a disk + fails and you are using a partitioned MD array, then all the partitions on + the disk will already be handled without using that disk. No need to + manually fail any still active array members from other arrays. + + Third, safety. Again with the raid5 array. If you use multiple arrays on + a single disk, and that disk fails, but it only failed on one array, then + you now need to manually fail that disk from the other arrays before + shutting down or hot swapping the disk. Generally speaking, that's not + a big deal, but people do occasionally have fat finger syndrome and this + is a good opportunity for someone to accidentally fail the wrong disk, and + when you then go to remove the disk you create a two disk failure instead + of one and now you are in real trouble. + + Forth, to respond to what you wrote about independent of each other -- + part of the reason why you partition. I would argue that's not true. If + your goal is to salvage as much use from a failing disk as possible, then + OK. But, generally speaking, people that have something of value on their + disks don't want to salvage any part of a failing disk, they want that + disk gone and replaced immediately. There simply is little to no value in + an already malfunctioning disk. They're too cheap and the data stored on + them too valuable to risk loosing something in an effort to further + utilize broken hardware. This of course is written with the understanding + that the latest MD RAID code will do read error rewrites to compensate for + minor disk issues, so anything that will throw a disk out of an array is + more than just a minor sector glitch. + + 0. http://marc.theaimsgroup.com/?l=linux-raid&m=116117813315590&w=2 + +15. How can I start a dirty degraded array? +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + A degraded array (e.g. a RAID5 with only two disks) that has not been + properly stopped cannot be assembled just like that; mdadm will refuse and + complain about a "dirty degraded array", for good reasons. + + The solution might be to force-assemble it, and then to start it. Please see + recipes 4 and 4b of /usr/share/doc/mdadm/README.recipes.gz and make sure you + know what you're doing. + +16. How can I influence the speed with which an array is resynchronised? +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + For each array, the MD subsystem exports parameters governing the + synchronisation speed via sysfs. The values are in kB/sec. + + /sys/block/mdX/md/sync_speed -- the current speed + /sys/block/mdX/md/sync_speed_max -- the maximum speed + /sys/block/mdX/md/sync_speed_min -- the guaranteed minimum speed + +17. When I create a new array, why does it resynchronise at first? +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + See the mdadm(8) manpage: + When creating a RAID5 array, mdadm will automatically create a degraded + array with an extra spare drive. This is because building the spare into + a degraded array is in general faster than resyncing the parity on + a non-degraded, but not clean, array. This feature can be over-ridden with + the --force option. + + This also applies to RAID levels 4 and 6. + + It does not make much sense for RAID levels 1 and 10 and can thus be + overridden with the --force and --assume-clean options, but it is not + recommended. Read the manpage. + +18. How many failed disks can a RAID10 handle? +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + (see also question 4b) + + The following table shows how many disks you can lose and still have an + operational array. In some cases, you *can* lose more than the given number + of disks, but there is no guarantee that the array survives. Thus, the + following is the guaranteed number of failed disks a RAID10 array survives + and the maximum number of failed disks the array can (but is not guaranteed + to) handle, given the number of disks used and the number of data block + copies. Note that 2 copies means original + 1 copy. Thus, if you only have + one copy (the original), you cannot handle any failures. + + 1 2 3 4 (# of copies) + 1 0/0 0/0 0/0 0/0 + 2 0/0 1/1 1/1 1/1 + 3 0/0 1/1 2/2 2/2 + 4 0/0 1/2 2/2 3/3 + 5 0/0 1/2 2/2 3/3 + 6 0/0 1/3 2/3 3/3 + 7 0/0 1/3 2/3 3/3 + 8 0/0 1/4 2/3 3/4 + (# of disks) + + Note: I have not really verified the above information. Please don't count + on it. If a disk fails, replace it as soon as possible. Corrections welcome. + +19. What should I do if a disk fails? +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Replace it as soon as possible: + + mdadm --remove /dev/md0 /dev/sda1 + halt + <replace disk and start the machine> + mdadm --add /dev/md0 /dev/sda1 + +20. So how do I find out which other disk(s) can fail without killing the + array? +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Did you read the previous question and its answer? + + For cases when you have two copies of each block, the question is easily + answered by looking at the output of /proc/mdstat. For instance on a four + disk array: + + md3 : active raid10 sdg7[3] sde7[0] sdh7[2] sdf7[1] + + you know that sde7/sdf7 form one pair and sdg7/sgh7 the other. + + If sdh now fails, this will become + + md3 : active raid10 sdg7[3] sde7[0] sdh7[4](F) sdf7[1] + + So now the second pair is broken; the array could take another failure in + the first pair, but if sdg now also fails, you're history. + + Now go and read question 19. + + For cases with more copies per block, it becomes more complicated. Let's + think of a seven disk array with three copies: + + md5 : active raid10 sdg7[6] sde7[4] sdb7[5] sdf7[2] sda7[3] sdc7[1] sdd7[0] + + Each mirror now has 7/3 = 2.33 disks to it, so in order to determine groups, + you need to round up. Note how the disks are arranged in increasing order of + their indices (the number in brackes in /proc/mdstat): + + disk: -sdd7- -sdc7- -sdf7- -sda7- -sde7- -sdb7- -sdg7- + group: [ one ][ two ][ three ] + + Basically this means that after two disk failed, you need to make sure that + the third failed disk doesn't destroy all copies of any given block. And + that's not always easy as it depends on the layout chosen: whether the + blocks are near (same offset within each group), far (spread apart in a way + to maximise the mean distance), or offset (offset by size/n within each + block). + + I'll leave it up to you to figure things out. Now go read question 19. + +21. Why does the kernel speak of 'resync' when using checkarray? +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Please see README.checkarray and + http://www.mail-archive.com/linux-raid@vger.kernel.org/msg04835.html . + + In short: it's a bug. checkarray is actually not a resync, but the kernel + does not distinguish between them. + +22. Can I prioritise the sync process and sync certain arrays before others? +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Upon start, md will resynchronise any unclean arrays, starting in somewhat + random order. Sometimes it's desirable to sync e.g. /dev/md3 first (because + it's the most important), but while /dev/md1 is synchronising, /dev/md3 will + be DELAYED (see /proc/mdstat; only if they share the same physical + components. + + It is possible to delay the synchronisation via /sys: + + echo idle >/sys/block/md1/md/sync_action + + This will cause md1 to go idle and md to synchronise md3 (or whatever is + queued next; repeat the above for other devices if necessary). md will also + realise that md1 is still not in sync and queue it for resynchronisation, + so it will sync automatically when its turn has come. + +23. mdadm's init script fails because it cannot find any arrays. What gives? +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + This does not happen anymore, if no arrays present in config file, no arrays + will be started. + +24. What happened to mdrun? How do I replace it? +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + mdrun used to be the sledgehammer approach to assembling arrays. It has + accumulated several problems over the years (e.g. #354705) and thus has been + deprecated and removed with the 2.6.7-2 version of this package. + + If you are still using mdrun, please ensure that you have a valid + /etc/mdadm/mdadm.conf file (run /usr/share/mdadm/mkconf --generate to get + one), and run + + mdadm --assemble --scan --auto=yes + + instead of mdrun. + +25. Why are my arrays marked auto-read-only in /proc/mdstat? +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Arrays are kept read-only until the first write occurs. This allows md to + skip lengthy resynchronisation for arrays that have not been properly shut + down, but which also not have changed. + +26. Why doesn't mdadm find arrays specified in the config file and causes the + boot to fail? +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + My boot process dies at an early stage and drops me into the busybox shell. + The last relevant output seems to be from mdadm and is something like + + "/dev/md2 does not exist" + + or + + "No devices listed in conf file found" + + Why does mdadm break my system? + + Short answer: It doesn't, the underlying devices aren't yet available yet + when mdadm runs during the early boot process. + + Long answer: It doesn't. but the drivers of those devices incorrectly + communicate to the kernel that the devices are ready, when in fact they are + not. I consider this a bug in those drivers. Please consider reporting it. + + Workaround: there is nothing mdadm can or will do against this. Fortunately + though, initramfs provides a method, documented at + http://wiki.debian.org/InitramfsDebug. Please append rootdelay=10 to the + kernel command line and try if the boot now works. + + -- martin f. krafft <madduck@debian.org> Wed, 13 May 2009 09:59:53 +0200 diff --git a/debian/NEWS b/debian/NEWS new file mode 100644 index 00000000..691f7171 --- /dev/null +++ b/debian/NEWS @@ -0,0 +1,107 @@ +mdadm (3.2.2-1) unstable; urgency=low + + Metadata format change requires recent Grub + The following only applies to users who want to let the grub-pc bootloader + load the kernel directly off a RAID device created with mdadm 3.x and + default values, or when the metadata version is explicitly set using -e. + + Specifically, this includes all arrays created during or after the + installation of Debian squeeze (mdadm-3.1.4+8efb9d1). Arrays created with + older mdadm versions, and RAIDs created with the command-line option + -e 0.9 are not affected. + + Versions of grub-pc older than 1.98+20100720-1 will not be able to boot + directly off a RAID with the 1.x metadata formats (the new default is 1.2). + To ensure a bootable system, please make sure to use grub-pc 1.98+20100720-1 + or later, which is provided by Debian squeeze. An unbootable system may be + rescued with Super Grub2 Disk (http://www.supergrubdisk.org/super-grub2-disk/) + or grml (http://grml.org/). + + -- Scott Schaefer <saschaefer@neurodiverse.org> Wed, 27 Jul 2011 20:21:50 -0400 + +mdadm (3.1.4-1+8efb9d1) unstable; urgency=low + + Default metadata format for newly created arrays has changed from + 0.90 to 1.2. Location of the superblock is now 4Kb from the start + of the device, instead of at the end of the device for 0.90. + The change from 0.9 to 1.x lifted many restrictions of the old + metadata format, and change in location (from end to 4k after + start for 1.2) reduced chances to confuse a raid array with + filesystem inside it. It is now less easy to mount a component + device as separate filesystem by incident, thus destroying the + array. + + Also, chunk size by default is 512K (was 64K) and bitmap chunk size + is 64Mb. + + -- Michael Tokarev <mjt@tls.msk.ru> Sat, 10 Sep 2011 13:35:12 +0400 + +mdadm (2.6.7-2) unstable; urgency=low + + /dev/disk symlinks: + mdadm now creates symlinks in /dev/disk/by-id, using the template + md-uuid-* for the array UUIDs and md-name-* for any names assigned to + arrays (version-1 superblocks only). Thanks to Suse for the udev rules + file. + + mdrun removed: + This version also removes mdrun once and for all. If you are still using + mdrun, please ensure that you have a valid /etc/mdadm/mdadm.conf file (run + /usr/share/mdadm/mkconf --generate to get one), and run + + mdadm --assemble --scan --auto=yes + + -- martin f. krafft <madduck@debian.org> Wed, 02 Jul 2008 10:57:32 +0200 + +mdadm (2.5.3.git200608201206-1) unstable; urgency=low + + This version makes mdadm.conf mandatory. If you do not have such a file, it + will be created for you. + + You must verify the contents of this file and ensure that it represents your + local configuration. See /usr/share/doc/mdadm/README.upgrading-2.5.3.gz for + more information. + + -- martin f. krafft <madduck@debian.org> Sun, 20 Aug 2006 21:58:43 +0100 + +mdadm (2.5-1) unstable; urgency=low + + mdrun has been (finally) obsoleted, and an appropriate warning message is + written to the console if you (or a script) attempts to run it. If you + cannot live without mdrun, you can disable the warning by setting + USE_DEPRECATED_MDRUN=1 in /etc/default/mdadm. Note that mdrun will *not* be + supported. Please also see /usr/share/doc/mdadm/README.mdrun . + + -- martin f. krafft <madduck@debian.org> Tue, 30 May 2006 23:25:13 +0200 + +mdadm (2.4.1-5) unstable; urgency=low + + This version drops the automatic generation of the /etc/mdadm/mdadm.conf + file on every boot (if it was missing). This means that you need to ensure + that you have a valid configuration file. If none is present during package + configuration, mdadm *will* try to generate one, but it will only contain + information about arrays that were running at the time of package + configuration. Arrays not listed in the configuration file will *not* be + started automatically after boot (with the exception of the root partition). + + If you want to recreate your configuration file, either figure out what it + should contain from the mdadm.conf(5) manpage, or simply assemble and run + all the arrays the way you like it, then run + /usr/share/mdadm/mkconf force-generate /etc/mdadm/mdadm.conf + + -- martin f. krafft <madduck@debian.org> Sat, 03 Jun 2006 17:45:47 +0200 + +mdadm (2.4.1-1) unstable; urgency=low + + As of version 2.3, mdadm uses /etc/mdadm.conf as its main configuration + file, and falls back to /etc/mdadm/mdadm.conf if the former is not found. + Since Debian uses /etc/mdadm/mdadm.conf as the configuration file path, this + order was reverted: Debian's mdadm reads /etc/mdadm/mdadm.conf as its main + file and falls back to /etc/mdadm.conf if the former is not found. + + An incompatible change in the reshaping of RAID 5 arrays was made in this + upstream release. If you want to reshape a RAID 5 array with a version-1 + superblock, please make sure to use mdadm 2.4.1 and at least a 2.6.17-rc2 + kernel. + + -- martin f. krafft <madduck@debian.org> Tue, 16 May 2006 13:07:49 -0500 diff --git a/debian/README.Debian b/debian/README.Debian new file mode 100644 index 00000000..bfca8cb3 --- /dev/null +++ b/debian/README.Debian @@ -0,0 +1,148 @@ +mdadm for Debian +================ + +Please make sure you read into /usr/share/doc/mdadm/NEWS.Debian.gz and the +documents listed under "further reading" a little later in this file. + +The latest version of this document is available here: + http://git.debian.org/?p=pkg-mdadm/mdadm.gita=blob;f=debian/README.Debian;hb=HEAD + +Autostarting devices +~~~~~~~~~~~~~~~~~~~~ +The mdadm.conf file controls which devices are to be started automatically by +mdadm during boot, and various other parameters about how they are to be started. +The file can also contain some control parameters for the mdadm monitor daemon. +See mdadm.conf(5) for more information. + +Note: this only applies to modular kernels. If you use a monolithic kernel, +you can control which devices are started automatically by changing the +partition type: 0xfd for autostart, 0x83 to prevent autostart. mdadm does not +actually care about the partition type, only the kernel does. + +Common recipes +~~~~~~~~~~~~~~ +Check /usr/share/doc/mdadm/README.recipes.gz for some simple examples of how +to do the most common stuff with mdadm. + +To RAID5 or not to RAID5 +~~~~~~~~~~~~~~~~~~~~~~~~ +See http://www.miracleas.com/BAARF/BAARF2.html . The package maintainer could +not possibly come up with so much emotion over such a technical topic. + +Further reading +~~~~~~~~~~~~~~~ +The documentation for the kernel md driver is included in +/usr/share/doc/mdadm/md.txt.gz. In addition, the md(4) manpage provides +valuable information about the applicable concepts. Do read those! + +Further documents of interest: + - Linux-RAID reference Wiki: + http://linux-raid.osdl.org + - Linux software RAID HOWTO: + http://tldp.org/HOWTO/Software-RAID-HOWTO.html + - linux-raid mailing list info: + http://vger.kernel.org/vger-lists.html#linux-raid + - linux-raid mailing list FAQ: + http://www.faqs.org/contrib/linux-raid/ + +Upstream +~~~~~~~~ +For completeness: The upstream repository is available from + git clone git://neil.brown.name/mdadm + +You can browse Neil's repository here: + http://neil.brown.name/git?p=mdadm + +You can also clone from Debian's Git repository, where upstream's code is in +the 'upstream' branch: + git://git.debian.org/git/pkg-mdadm/mdadm + +Reporting bugs +~~~~~~~~~~~~~~ +For reporting bugs, please use the reportbug tool, as it collects useful +information about the system where you're experiencing the problem. + +If the system is another, please include the output of +/usr/share/bug/mdadm/scripts with your report. + +If you are turning to the linux-raid@vger.kernel.org mailing list because you +already know that the issue is with the md kernel driver and certainly not +Debian-specific, please also include the output of +/usr/share/bug/mdadm/scripts. + +In general, report bugs against the mdadm Debian package, using reportbug. +I am happy to route reports to where they belong. + +Debian package maintenance +~~~~~~~~~~~~~~~~~~~~~~~~~~ +The package is maintained with Git and published on git.debian.org. To obtain +the source: + + git clone git://git.debian.org/git/pkg-mdadm/mdadm + +You can browse the repository here: + http://git.debian.org/?p=pkg-mdadm/mdadm + +If you want to join the mdadm effort, please send me an email. I'll be very +glad for any help I get. + +There are things to do listed in debian/TODO. + +You might also be interested in the following document, which explains how +package maintenance of mdadm was migrated from SVN to Git: + http://blog.madduck.net/debian/2007.10.07_converting-a-package-to-git + +Patches +~~~~~~~ +The best way to submit patches is with git-format-patch, as outlined in the +following. If this is too complicated for you, please feel free to make +normal diffs, or contact me for assistance if you'd like to learn how to use +Git. + +Please try to follow the guidelines outlined in + http://repo.or.cz/w/git.git?a=blob;f=Documentation/SubmittingPatches;hb=HEAD + +First, the setup, which you only have to do once on each machine you work with: + +# leave out --global if you want to set your identity only for mdadm +git config --global user.name 'your name' +git config --global user.email 'your@email.address' +git clone git://git.debian.org/git/pkg-mdadm/mdadm.git + +To prepare the actual patch, do the following: + +git pull +git checkout -b some-name-identifying-my-work +while not finished: + // if resuming after a while, maybe update your branch: + git rebase master + // edit files + git add files + git commit + ... +end + +After you've brought your change to a state where you want to submit it, please +squash it into logical single commits. If you only made one change, then this +will do: + +git checkout -b temp-squash master +git merge --squash some-name-identifying-my-work +git commit // ... remove the "Squashed commit of the following:" leader +git format-patch -M -s master +// now inspect the files this created in $PWD +// when you're ready to submit, do: +git send-email --to your@email.address +// check that it's okay when it arrives +git send-email --to pkg-mdadm-devel@lists.alioth.debian.org + +For multiple logical changes, cherry-pick or squash-merge every commit +belonging to a change to the integration branch and then commit it. + +Also, read the git-send-email manpage in case you're submitting multiple +logical changes, in case you want to thread them. + +The manpage also includes information about adding a prologue message explaining your patch, or how to insert it into an existing +thread (in-reply-to). + + -- martin f. krafft <madduck@debian.org> Tue, 16 Oct 2007 18:12:13 +0100 diff --git a/debian/README.checkarray b/debian/README.checkarray new file mode 100644 index 00000000..8071a4d6 --- /dev/null +++ b/debian/README.checkarray @@ -0,0 +1,33 @@ +checkarray notes +================ + +checkarray will run parity checks across all your redundant arrays. By +default, it is configured to run on the first Sunday of each month, at 01:06 +in the morning. This is realised by asking cron to wake up every Sunday with +/etc/cron.d/mdadm, but then only running the script when the day of the month +is less than or equal to 7. See #380425. + +Cron will try to run the check at "idle I/O priority" (see ionice(1)), so that +the check does not overload the system too much. Note that this will only +work if all the component devices of the array employ the (default) "cfq" I/O +scheduler. See the kernel documentation[0] for information on how to verify +and modify the scheduler. checkarray does not verify this for you. + + 0. http://www.kernel.org/doc/Documentation/block/switching-sched.txt + +If you manually invoke checkarray, it runs with default I/O priority. Should +you need to run a check at a higher (or lower) I/O priority, then have a look +at the --idle, --slow, --fast, and --realtime options. + +'check' is a read-only operation, even though the kernel logs may suggest +otherwise (e.g. /proc/mdstat and several kernel messages will mention +"resync"). Please also see question 21 of the FAQ. + +If, however, while reading, a read error occurs, the check will trigger the +normal response to read errors which is to generate the 'correct' data and try +to write that out - so it is possible that a 'check' will trigger a write. +However in the absence of read errors it is read-only. + +You can cancel a running array check with the -x option to checkarray. + + -- martin f. krafft <madduck@debian.org> Thu, 02 Sep 2010 10:27:29 +0200 diff --git a/debian/README.recipes b/debian/README.recipes new file mode 100644 index 00000000..2b1891e0 --- /dev/null +++ b/debian/README.recipes @@ -0,0 +1,149 @@ +mdadm recipes +============= + +The following examples/recipes may help you with your mdadm experience. I'll +leave it as an exercise to use the correct device names and parameters in each +case. You can find pointers to additional documentation in the README.Debian +file. + +Enjoy. Submissions welcome. + +The latest version of this document is available here: + http://git.debian.org/?p=pkg-mdadm/mdadm.git;a=blob;f=debian/README.recipes;hb=HEAD + +0. create a new array +~~~~~~~~~~~~~~~~~~~~~ + mdadm --create -l1 -n2 -x1 /dev/md0 /dev/sd[abc]1 # RAID 1, 1 spare + mdadm --create -l5 -n3 -x1 /dev/md0 /dev/sd[abcd]1 # RAID 5, 1 spare + mdadm --create -l6 -n4 -x1 /dev/md0 /dev/sd[abcde]1 # RAID 6, 1 spare + +1. create a degraded array +~~~~~~~~~~~~~~~~~~~~~~~~~~ + mdadm --create -l5 -n3 /dev/md0 /dev/sda1 missing /dev/sdb1 + mdadm --create -l6 -n4 /dev/md0 /dev/sda1 missing /dev/sdb1 missing + +2. assemble an existing array +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + mdadm --assemble --auto=yes /dev/md0 /dev/sd[abc]1 + + # if the array is degraded, it won't be started. use --run: + mdadm --assemble --auto=yes --run /dev/md0 /dev/sd[ab]1 + + # or start it by hand: + mdadm --run /dev/md0 + +3. assemble all arrays in /etc/mdadm/mdadm.conf +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + mdadm --assemble --auto=yes --scan + +4. assemble a dirty degraded array +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + mdadm --assemble --auto=yes --force /dev/md0 /dev/sd[ab]1 + mdadm --run /dev/md0 + +4b. assemble a dirty degraded array at boot-time +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + If the array is started at boot time by the kernel (partition type 0xfd), + you can force-assemble it by passing the kernel boot parameter + + md-mod.start_dirty_degraded=1 + +5. stop arrays +~~~~~~~~~~~~~~ + mdadm --stop /dev/md0 + + # to stop all arrays in /etc/mdadm/mdadm.conf + mdadm --stop --scan + +6. hot-add components +~~~~~~~~~~~~~~~~~~~~~ + # on the running array: + mdadm --add /dev/md0 /dev/sdc1 + # if you add more components than the array was setup with, additional + # components will be spares + +7. hot-remove components +~~~~~~~~~~~~~~~~~~~~~~~~ + # on the running array: + mdadm --fail /dev/md0 /dev/sdb1 + # if you have configured spares, watch /proc/mdstat how it fills in + mdadm --remove /dev/md0 /dev/sdb1 + +8. hot-grow a RAID1 by adding new components +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + # on the running array, in either order: + mdadm --grow -n3 /dev/md0 + mdadm --add /dev/md0 /dev/sdc1 + # note: without growing first, additional devices become spares and are + # *not* synchronised after the add. + +9. hot-shrink a RAID1 by removing components +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + mdadm --fail /dev/md0 /dev/sdc1 + mdadm --remove /dev/md0 /dev/sdc1 + mdadm --grow -n2 /dev/md0 + +10. convert existing filesystem to RAID 1 +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + # The idea is to create a degraded RAID 1 on the second partition, move + # data, then hot add the first. This seems safer to me than simply to + # force-add a superblock to the existing filesystem. + # + # Assume /dev/sda1 holds the data (and let's assume it's mounted on + # /home) and /dev/sdb1 is empty and of the same size... + # + mdadm --create /dev/md0 -l1 -n2 /dev/sdb1 missing + mkfs -t <type> /dev/md0 + mount /dev/md0 /mnt + tar -cf- -C /home . | tar -xf- -C /mnt -p + # consider verifying the data + umount /home + umount /mnt + mount /dev/md0 /home # also change /etc/fstab + mdadm --add /dev/md0 /dev/sda1 + + Warren Togami has a document explaining how to convert a filesystem on + a remote system via SSH: http://togami.com/~warren/guides/remoteraidcrazies/ + +10b. convert existing filesystem to RAID 1 in-place +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + In-place conversion of /dev/sda1 to /dev/md0 is effectively + mdadm --create /dev/md0 -l1 -n2 /dev/sda1 missing + however, do NOT do this, as you risk filesystem corruption. + + If you need to do this, first unmount and shrink the filesystem by + a megabyte (if supported). Then run the above command, then (optionally) + again grow the filesystem as much as possible. + + Do make sure you have backups. If you do not yet, consider method (10) + instead (and make backups anyway!). + +11. convert existing filesystem to RAID 5/6 +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + # See (10) for the basics. + mdadm --create /dev/md0 -l5 -n3 /dev/sdb1 /dev/sdc1 missing + #mdadm --create /dev/md0 -l6 -n4 /dev/sdb1 /dev/sdc1 /dev/sdd1 missing + mkfs -t <type> /dev/md0 + mount /dev/md0 /mnt + tar -cf- -C /home . | tar -xf- -C /mnt -p + # consider verifying the data + umount /home + umount /mnt + mount /dev/md0 /home # also change /etc/fstab + mdadm --add /dev/md0 /dev/sda1 + +12. change the preferred minor of an MD array (RAID) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + # you need to manually assemble the array to change the preferred minor + # if you manually assemble, the superblock will be updated to reflect + # the preferred minor as you indicate with the assembly. + # for example, to set the preferred minor to 4: + mdadm --assemble /dev/md4 /dev/sd[abc]1 + + # this only works on 2.6 kernels, and only for RAID levels of 1 and above. + # for other MD arrays, you need to specify --update explicitly: + mdadm --assemble --update=super-minor /dev/md4 /dev/sd[abc]1 + + # see also item 12 in the FAQ contained with the Debian package. + + -- martin f. krafft <madduck@debian.org> Fri, 06 Oct 2006 15:39:58 +0200 diff --git a/debian/TODO b/debian/TODO new file mode 100644 index 00000000..eb3299e0 --- /dev/null +++ b/debian/TODO @@ -0,0 +1,29 @@ +debian mdadm TODO list +====================== + +- version-1 is a nightmare. E.g. on partitionable arrays, with / on + /dev/md_d0p3, mdadm -Es ignores /dev/md_d0 and just uses the name, so + /dev/md/<arrayname>. +- figure out something about device names. +- (better) udev integration + +- check whether mdadm.conf and system are consistent during initramfs creation + and fail otherwise (#381303). +- add code to compare existing and expected configuration, after standardising + the files. In most cases, we'll have to answer DUNNO as to whether the + existing configuration file is okay, but I guess in some cases we can + determine that the configuration is okay. A conservative approach would be + beneficial to the user. Not sure if it's worth the effort though. +- one nice^W important thing would be to check device names and UUIDs at least. + +- verify operation without udev + - udev removed before mdadm installed + - udev removed after mdadm installed + +- more granular handling of init.d starts/stops, don't force all arrays to be + started. +- let user specify when to start/stop which array (#398310). +- also only stop those array we started; this can be easily done with + sentinels in $STATEDIR + +- manage DAEMON_OPTIONS with debconf diff --git a/debian/bugscript b/debian/bugscript new file mode 100755 index 00000000..dcb88ebd --- /dev/null +++ b/debian/bugscript @@ -0,0 +1,219 @@ +#!/bin/bash +# +# mdadm bug submission control script +# +# allows Debian's bug tools to include relevant information in bug reports. +# +# Copyright © martin f. krafft <madduck@debian.org> +# distributed under the terms of the Artistic Licence 2.0 +# +# we need /bin/bash for readline and -n capabalities in the prompt(s) +# + +# maximise information output even in the case of errors +set +eu + +if ! command -v yesno >/dev/null; then + if [ -r /usr/share/reportbug/handle_bugscript ]; then + exec /usr/share/reportbug/handle_bugscript ". $0" /dev/stdout + fi + yesno() { + read -n1 -p"$1" REPLY + case "$REPLY" in + [yY]) REPLY=yep;; + [nN]) REPLY=nop;; + ('') REPLY="$2";; + esac + } + exec 3>&1 +fi + +# do not let people ctrl-c out of the bugscript +trap : INT + +if [ $(id -u) != 0 ]; then + if [ -x "$(command -v sudo)" ]; then + yesno "Gather system information as root using sudo? (Y/n) " yep + if [ "$REPLY" = yep ]; then + echo running sudo "$0" "$@"... + sudo "$0" "$@" >&3 && exit 0 + echo "sudo invocation failed, trying /bin/su..." + fi + fi + + yesno "Gather system information as root using su? (Y/n) " yep + if [ "$REPLY" = yep ]; then + ARGS= + for i in "$@"; do ARGS="${ARGS:+$ARGS }'$1'"; shift; done + echo "running su root -s '/bin/sh -c $0${ARGS:+ $ARGS}'..." + su root -s /bin/sh -c "$0 $ARGS" >&3 && exit 0 + unset ARGS + echo "su invocation failed." + fi + + # arrive here only if neither sudo nor su worked: + yesno "Will you provide system information in the bug report yourself? (N/y) " nop + if [ "$REPLY" = yep ]; then + cat <<_eof >&3 + +IMPORTANT: + please do not forget to include all relevant system information with this + bug report. You could run + /usr/share/bug/mdadm/script 3>&1 + as root and attach or include the output. + +_eof + exit 0 + fi + + # try our best + cat <<_eof >&3 + +WARNING: + the following output was not generated by the root user. If you can, please + replace the following up until "-- System Information:" with the output of + /usr/share/bug/mdadm/script 3>&1 + run as root. Thanks! + +_eof +fi + +if [ ! -r /proc/mdstat ]; then + echo "The local system does not have MD (RAID) support: no drivers loaded." + echo "Without MD support, I cannot collect as much information as I'd like." + + #yesno "Are you sure you want to report a bug at this time? " yep + yesno "Hit any key to continue..." yep + #[ "$REPLY" = yep ] || exit 1 +fi + +echo "--- mdadm.conf" >&3 +if [ -r /etc/mdadm/mdadm.conf ]; then + grep '^[^#]' /etc/mdadm/mdadm.conf >&3 +elif [ -r /etc/mdadm.conf ]; then + grep '^[^#]' /etc/mdadm.conf >&3 +else + echo no mdadm.conf file. >&3 +fi +echo >&3 + +echo "--- /etc/default/mdadm" >&3 +if [ -r /etc/default/mdadm ]; then + grep '^[^#]' /etc/default/mdadm >&3 +else + echo no /etc/default/mdadm file. >&3 +fi +echo >&3 + +echo "--- /proc/mdstat:" >&3 +cat /proc/mdstat >&3 2>&3 || : +echo >&3 + +echo "--- /proc/partitions:" >&3 +cat /proc/partitions >&3 2>&3 || : +echo >&3 + +echo "--- LVM physical volumes:" >&3 +if [ -x "$(command -v pvs)" ]; then + pvs >&3 +else + echo "LVM does not seem to be used." >&3 +fi + +echo "--- mount output" >&3 +mount >&3 +echo >&3 + +echo "--- initrd.img-$(uname -r):" >&3 +if [ -r /boot/initrd.img-$(uname -r) ]; then + TEMPDIR=$(mktemp -d) + OLDPWD="$PWD" + cd "$TEMPDIR" + zcat /boot/initrd.img-$(uname -r) 2>&3 | cpio -i 2>&3 + find -regex '.*/md[a/].+' -type f -exec md5sum {} \; >&3 + + echo >&3 + echo "--- initrd's /conf/conf.d/md:" >&3 + if [ -r conf/conf.d/md ]; then + cat conf/conf.d/md >&3 + else + echo "no conf/md file." >&3 + fi + + cd "$OLDPWD" + rm -rf "$TEMPDIR" + unset TEMPDIR +else + echo "no initrd.img-$(uname -r) found." >&3 +fi +echo >&3 + +if [ -r /proc/modules ]; then + echo "--- /proc/modules:" >&3 + egrep '(dm_|raid|linear|multipath|faulty)' < /proc/modules >&3 || : + echo >&3 +fi + +if [ -f /var/log/syslog ]; then + if [ -r /var/log/syslog ]; then + echo "--- /var/log/syslog:" >&3 + egrep "^\w{3} [ :[:digit:]]{11} ($(hostname)|localhost) (kernel: md|mdadm): " /var/log/syslog >&3 || : + echo >&3 + else + echo "syslog not readable by user." >&3 + fi +fi + +echo "--- volume detail:" >&3 +for dev in /dev/[hsv]d[a-z]*; do + [ ! -r $dev ] && echo "$dev not readable by user." && continue + mdadm -E $dev 2>/dev/null && echo -- || echo "$dev is not recognised by mdadm." +done >&3 +echo >&3 + +if [ -r /proc/cmdline ]; then + echo "--- /proc/cmdline" >&3 + cat /proc/cmdline >&3 + echo >&3 +fi + +if [ -f /boot/grub/grub.cfg ]; then + echo "--- grub2:" >&3 + if [ -r /boot/grub/grub.cfg ]; then + egrep '^[^#].*\<(root=|raid)' /boot/grub/grub.cfg >&3 || : + else + echo grub.cfg file not readable. >&3 + fi + echo >&3 +fi + +if [ -f /boot/grub/menu.lst ]; then + echo "--- grub legacy:" >&3 + if [ -r /boot/grub/menu.lst ]; then + grep '^[^#].*\<root=' /boot/grub/menu.lst >&3 || : + else + echo menu.lst file not readable. >&3 + fi + echo >&3 +fi + +if [ -f /etc/lilo.conf ]; then + echo "--- lilo:" >&3 + if [ -r /etc/lilo.conf ]; then + egrep '^([^#].*)?root=' /etc/lilo.conf >&3 || : + else + echo lilo.conf file not readable. >&3 + fi + echo >&3 +fi + +echo "--- udev:" >&3 +COLUMNS=70 dpkg -l udev | grep '\<udev\>' >&3 +md5sum /etc/udev/rules.d/*md* /lib/udev/rules.d/*md* >&3 2>/dev/null +echo >&3 + +echo "--- /dev:" >&3 +ls -l /dev/md* /dev/disk/by-* >&3 +echo >&3 + +echo "Auto-generated on $(date -R) by mdadm bugscript" >&3 diff --git a/debian/changelog b/debian/changelog new file mode 100644 index 00000000..cd2f9c2b --- /dev/null +++ b/debian/changelog @@ -0,0 +1,1851 @@ +mdadm (3.3.4-1.1) unstable; urgency=medium + + * Non-maintainer upload. + * disable-incremental-assembly.patch: incremental assembly prevents booting + in degraded mode (Closes: #784070) + + -- Yann Soubeyrand <yann-externe.soubeyrand@edf.fr> Tue, 10 Nov 2015 11:18:39 +0100 + +mdadm (3.3.4-1) unstable; urgency=medium + + [ Dimitri John Ledkov ] + * Adopting the package as per mjt delegation. Thank you for your + service mainting this package over the years, your contributions + are highly appreciated by all of Debian and derivate communities. + * New upstream release. + * Bump standards version to 3.9.6.0, no changes required. + * Use dh_prep instead of dh_clean -k + * Drop cherrypicked patches. + + [ Helmut Grohne ] + * Fix FTCBFS. Export CROSS_COMPILE=<triplet>- (Closes: #794335) + + [ Cyril B. ] + * Copy AUTO lines from host into initrd, when updating initrd. + (Closes: #785104) + + [ Martin von Wittich ] + * Ignore errors attempting to apply *nice to checkarray, it may be + done already. (Closes: #791554) + + -- Dimitri John Ledkov <xnox@debian.org> Sun, 08 Nov 2015 11:48:03 +0000 + +mdadm (3.3.2-5) unstable; urgency=medium + + * use-tempnode-not-devnode.patch: change udev rules file to use + $tempnode which works both on wheezy and jessie udev, instead + of $devnode which only works in jessie. At this stage it is + better to make rules file compatible with old version instead + of adding versioned dependency. Should be removed for jessie+1. + (Closes: #770883) + * fix Closes: list in previous entry (Closes: #771852) + + -- Michael Tokarev <mjt@tls.msk.ru> Sat, 20 Dec 2014 11:48:44 +0300 + +mdadm (3.3.2-4) unstable; urgency=medium + + * really remove /var/lib/mdadm in postinst, fixing a brown-paper bag + bug in previous upload (I fixed it earlier but forgot to commit it + before 3.3.2-3 release). (Closes: #764036, #771852) + * mention closing of #588965 #599352 #694513 by 3.3-1 + + -- Michael Tokarev <mjt@tls.msk.ru> Fri, 05 Dec 2014 17:29:22 +0300 + +mdadm (3.3.2-3) unstable; urgency=medium + + * remove /var/lib/mdadm dir in postinst to clean up from old pkg, + remove config files on purge (restore extraneous cleanup from + last change) (Closes: #764036) + * remove set -u (error on unset variables) from maintscripts + (Closes: #766308) + * rebuildmap-strip-local-host-name-from-device-name.patch - a patch + from upstream fixing a bug when mdadm have to re-create device + nodes after assembling arrays (eg, when switching from initramfs + without preserving /dev and /run), to choose the same device names + as when doing inital assembly + * readlink-path.patch: readlink is in /bin not /usr/bin on debian + (Closes: #766416) + * mdmonitor-service-simplify.diff: simplify mdmonitor.service + systemd file, do not try to read non-existing files (Closes: #764647) + + -- Michael Tokarev <mjt@tls.msk.ru> Fri, 28 Nov 2014 09:55:14 +0300 + +mdadm (3.3.2-2) unstable; urgency=medium + + * remove more leftovers from old versions + * do not embed $VERSION to mkconf and bugscript + * removed unneeded lintian-overrides file + * removed examples/mdadd.sh + * removed references to MAIL_TO from /etc/default/mdadm (pre-2.x mdadm) + * removed AUTOSTART variable from /etc/default/mdadm + (system will start arrays listed in mdadm.conf) + * simplify d/rules, build udeb in a subdir (for now, to be removed) + * install systemd services and disable some initscripts (mask them) + when systemd is running (Closes: #763959) + * build-sys-no-check_rundir.patch: stop (re)linking executables + at install time + + -- Michael Tokarev <mjt@tls.msk.ru> Sat, 04 Oct 2014 20:38:36 +0400 + +mdadm (3.3.2-1) unstable; urgency=low + + * new minor/bugfix upstream release (Closes: #731884, #763080) + * removed remove-bashism-from-makefile.patch (applied upstream) + * copy 64-md-raid-assembly.rules to initramfs too, this should + bring us array auto-assemble during initramfs run (Closes: #678691) + * denote inability to initialize md subsystem in local-top script + from fatal error to warning (Closes: #733574) + * create /etc/modprobe.d/mdadm.conf to set start_ro=1 there + instead of doing it in the initramfs + * use install -D in d/rules consistently and remove dirs from d/mdadm.dirs + * remove references to old (2.x and 1.x) mdadm versions (Closes: #728804) + * remove references to very old (pre-wheeze) breaks/replaces/depends + * remove support of makedev + * reformat mdadm-raid initscript a bit + * remove rebuilding-raid.html (had invisible refs) and + RAID5_versus_RAID10.txt docs. Maybe it's a good idea to remove + other docs shipped with the package, since these becoming stale + and these days, better alternatives exist online + * removed debian-specific mdadm-startall script which does nothing more than + "mdadm --assemble --scan". Remove docbook-to-man from build-deps. + * pass dpkg-buildflags to upstream build system and + stop using our own -O0, -Os etc + * remove separate CFLAGS for udeb and simplify *FLAGS handling + * bump Standards-Version to 3.9.5 (no changes needed) + * fix VCS links to point to anonscm.d.o + + -- Michael Tokarev <mjt@tls.msk.ru> Wed, 01 Oct 2014 15:09:07 +0400 + +mdadm (3.3-2) unstable; urgency=low + + * use 63-md-raid-arrays.rules instead of old 64-md-raid.rules + (Closes: #726237) + * do not use builtin blkid in udev rules, as our udev (at least + on wheezy) does not have it (use-external-blkid.diff) + + -- Michael Tokarev <mjt@tls.msk.ru> Mon, 14 Oct 2013 15:49:54 +0400 + +mdadm (3.3-1) unstable; urgency=low + + [ Michael Tokarev ] + * new upstream 3.3 release (Closes: #718896, #588965, #599352, #694513) + See ANNOUNCE-3.3 for details. + Patches: + - refreshed debian-conffile-location.diff + (added .conf.d) + - removed debian-disable-udev-incr-assembly.diff + (do not ship udev-md-raid-assembly.rules for now) + - refreshed debian-no-Werror.diff + - refreshed sha1-includes.diff + - removed patches (included upstream)A: + spelling-and-manpages.patch + fix-enough-function-for-RAID10.patch + fix-segfaults-in-detail.patch + super0-do-not-override-uuid-with-homehost.patch + mdmon-allow-takeover-when-original-was-started-with-.patch + mdmon-fix-arg-parsing.patch + mdmon-fix-arg-processing-for-a.path + Install udev-md-raid-arrays.rules instead of udev-md-raid.rules, + don't install new udev-md-raid-assembly.rules for now. + * remove Martin F. Krafft from uploaders per his request. + Thank you for your contributions! + * added remove-bashism-from-makefile.patch patch to work around + newly introduced bashism + * remove debian/source/options, there's no need to set compression + options for debian.tar.gz. + * remove outdated debian/docs/md_superblock_formats.txt and + debian/docs/md.txt (Closes: #714977, #714978) + * ship ANNOUNCE-*, external-reshape-design.txt, mdmon-design.txt + files as documentation (Closes: #715324) + + [ Dmitrijs Ledkovs ] + * Properly remove 65-mdadm.vol_id.rules, instead of trying to remove a + never-existed 65_mdadm.vol_id.rules (note the 65- vs 65_). + + -- Michael Tokarev <mjt@tls.msk.ru> Fri, 11 Oct 2013 10:12:47 +0400 + +mdadm (3.2.5-6) unstable; urgency=low + + * replace home-grown and not-working-since-etch udevsettle call + in initramfs script with proper wait_for_udev function (from + common initramfs functions). This unbreaks situations when + the underlying device needs some udev magic to happen before + being available, which includes stacked devices (md on lvm) + and other cases. Thanks to Thomas Parmelan and Dave Whitla + for finding the root cause of breakage and for providing + the fix. (Closes: #644876) + + -- Michael Tokarev <mjt@tls.msk.ru> Tue, 05 Mar 2013 13:32:21 +0400 + +mdadm (3.2.5-5) unstable; urgency=low + + * add (empty) restart and force-reload actions to mdadm-waitidle + script -- this script is used only when the system needs to be + shut down or rebooted, there's nothing to start or restart. + + -- Michael Tokarev <mjt@tls.msk.ru> Thu, 24 Jan 2013 17:04:49 +0400 + +mdadm (3.2.5-4+mdmon) experimental; urgency=low + + * fix `/etc/init.d/mdadm-raid status' inverse logic (Closes: #686100) + * /etc/init.d/mdadm: change RUNDIR to /run instead of /var/run. + Mdadm itself uses /run internally, we properly depend on initscripts + version which creates /run, and the initscript itself is started + after local_fs is processed, so this is merely a no-op, but let's + do it for consistency. + * Fix 'enough' function for RAID10, to prevent starting of a RAID10 + array which does not have required minimum of component devices. + (Closes: #691668). + * fix segfaults in Detail() - mdadm --detail may segfault if a drive + has been removed from the array (Closes: #691670) + * super0: do not override uuid with homehost. The bug prevented + re-creating an array with v0.90 superblock with the specified uuid + when homehost is also specified. (Closes: #686703) + * several fixes for mdmon argument processing (Closes: #691671): + - allow --takeover when original was started with --offroot + - fix arg parsing. + - fix arg processing for -a + * Changes based on a patch by Miquel van Smoorenburg (Closes: #684708): + - install mdmon in udeb and initramfs, so imsm arrays can work. + mdadm runs mdmon automatically when needed (currently for imsm + arrays), and mdmon is required to make such arrays read-write + (they're read-only by default) so merely presence of mdmon is + enough to be able to use imsm arrays. + - /etc/init.d/mdadm start: if a mdmon pidfile is found in /run/mdadm, + restart mdmon (--takeover --all) + - /etc/init.d/mdadm stop: link pidfiles of mdmon processes into + /run/sendsigs.omit.d, and make sure that happens before sendsigs runs. + - stop mdadm before sendsigs, so that the above code works + - add script mdadm-waitidle that runs just before reboot/halt. + For each array that is still running, it sets sync_action to idle, + and uses mdadm --wait-clean to wait for all arrays to go idle + (it has a short timeout). + + -- Michael Tokarev <mjt@tls.msk.ru> Sat, 20 Oct 2012 19:20:12 +0400 + +mdadm (3.2.5-3) unstable; urgency=low + + * revert "Drop unused debconf templates" change -- the templates + are actually being (indirectly) used in debian/mdadm.config + * fix initramfs-script config name (slipped in the initramfs fixes) + + -- Michael Tokarev <mjt@tls.msk.ru> Sat, 25 Aug 2012 23:12:50 +0400 + +mdadm (3.2.5-2) unstable; urgency=low + + [ Sergey B Kirpichev ] + * Fix mdadm.lintian-overrides + * Fix spelling in binaries, fix lintian warnings + manpage-has-errors-from-man and hyphen-used-as-minus-sign + * Drop unused debconf templates + * Implement status option for mdadm-raid init.d script + * Fix lintian info's conflicts-with-version: Conflicts -> Breaks + + [ Michael Tokarev ] + * fix spelling mistakes in previous changelog entry + * some cleanups for checkarray: + - change --help printing and shorten/simplify the text + - make --quiet cumulative and stop documenting --real-quiet + - do not produce help in case of incorrect usage, and exit with 1 + * fixes for initramfs integration (Closes: #644389, #678262, #685161): + - check INITRDSTART=none early + - do not explicitly load raid level modules (modprobe/kmod does this) + - do not collect needed raid levels (we include all modules anyway) + - load md_mod explicitly since we need to change global parameter + + [ Dmitrijs Ledkovs ] + * Use dh_installinit with --no-restart-on-upgrade, which will start + arrays, but will not stop them during upgrade. (Closes: 678971) + * Copy local administrator's modified udev rules as well as the system + one. (Closes: #678973) + + -- Michael Tokarev <mjt@tls.msk.ru> Sat, 25 Aug 2012 16:25:37 +0400 + +mdadm (3.2.5-1) unstable; urgency=low + + [ Michael Tokarev ] + * new upstream (bugfix) release, fixing regression when --add'ing + device to an array, introduced in 3.2.4, plus other minor fixes + (Closes: #673104, #673344) + * new patch: sha1-includes.diff to fix #include mess in new sha1.h + * added a check into debian/checkarray to skip checking arrays created + less than 2 weeks ago (Closes: #624273) + + [ Dmitrijs Ledkovs ] + * Remove obsolete documentation dating back to ~etch release + * Remove reference to obsolete documention from debconf templates + * Update debconf templates translations + * Remove compatibility with ancient initramfs-tools + * Remove debian-specific mdadm-startall.8 in clean target + + -- Michael Tokarev <mjt@tls.msk.ru> Fri, 25 May 2012 20:23:52 +0400 + +mdadm (3.2.4-1) unstable; urgency=low + + * new upstream (bugfix) release (Closes: #664088, #661552) + * removed debian-run-udev.diff (applied upstream), and + all RUNDIR handling from debian/rules (it is the default now) + * add build-arch and build-indep targets to debian/rules, and + bump Standards-Version to 3.9.3 + + -- Michael Tokarev <mjt@tls.msk.ru> Thu, 10 May 2012 17:51:41 +0400 + +mdadm (3.2.3-3) unstable; urgency=low + + * switch from topgit to plain 3.0 (quilt) layout, creating + debian/patches. Don't build-depend on quilt as patching + is done automatically by dpkg-source. + * debian/patches/debian-run-udev.diff by Roger Leigh (Closes: #644319, #627774) + * update debian/mdadm.logcheck.ignore.server to recognize "k" in + addition of "blocks" in kernel messages. Thanks to Frédéric Brière + for the patch (Closes: #656038) + + -- Michael Tokarev <mjt@tls.msk.ru> Mon, 30 Apr 2012 14:12:38 +0400 + +mdadm (3.2.3-2) unstable; urgency=low + + [ Michael Tokarev ] + * new upstream bugfix/stable version, with lots of fixes all over. + Closes: #641886, #628667, #645563, #651880, #607375, #633880 + * update Neil's email (Closes: #650630) + * update mdadd.sh to version 1.52 (Closes: #655212) + * fixed a typo (RAID6 vs RAID10) in FAQ (Closes: #637068) + * declare ordering dependency for multipath-tools-boot in + mdadm-raid init script (Closes: #641584) + While at it, remove mention of devfsd + * added Slovak (sk.po) po-debconf translation from Slavko <linux@slavino.sk> + (Closes: #641972) + * set nice value of the check/resync thread too, together with I/O + scheduling class, based on patch by Sergey B Kirpichev (Closes: #652547) + * small changes for debian/checkarray + * (internal) move files from contrib/* topgit branches into debian directory + * remove dh_testroot from clean target + * add myself to uploaders + + [ Peter Eisentraut ] + * Added support for "status" action to mdadm init script (Closes: #651737) + + -- Michael Tokarev <mjt@tls.msk.ru> Wed, 18 Jan 2012 22:33:01 +0400 + +mdadm (3.2.2-1) unstable; urgency=low + + [ martin f. krafft ] + * New upstream version (closes: #615494), which addresses: + - --grow option aware of size change of underlying device (closes: #618463) + - builds with -Werror on gcc-4.6 (closes: #625392). + - new features/functionality: + + Policy framework. + + Improved management of reshaping arrays. + + Support for Intel Matrix Storage Manager (IMSM) + Thanks to Scott Schaefer for his help in preparing this release! + * Do not print io rescheduling info message when run by cron + (closes: #598957). + * Fix checkarray script so that it does not die after scheduling the first + device when there is no scheduling class specified; thanks to Mario + 'BitKoenig' Holbe (closes: #611627). + * Do not include DEVICE line in generated configs but use the built-in value + by default (closes: #604702). + * Make mdadm-raid init script depend on hostname; thanks to Mario + 'BitKoenig' Holbe (closes: #610421). + * Schedule start/stop of mdadm-raid before/after filesystems are + checked&mounted/unmounted; thanks to Mario 'BitKoenig' Holbe + (closes: #611632). + * Work around a shell coding bug for cases when there are zero active + devices (closes: #618561). + * Add NEWS entry about metadata change and requirement on new Grub version + (closes: #595516). + * Update mdadd.sh script from Arno's webpage. + * Update md.txt from latest kernel source. + * Updated debconf translations: + - Catalan by Innocent De Marchi (closes: #628371) + - Danish by Joe Dalton (closes: #621346) + * Small typo fix in RAID5_versus_RAID10.txt + * Bump Standards-Version to 3.9.2; no changes required. + + [ Michael Tokarev ] + * don't print W: auto-read-only in checkarray in quiet mode, + thanks to Bernd Hanisch for the patch (Closes: #605722) + * move initscript metadata from /lib/init/rw/.mdadm to /run/mdadm, + and depend on initscripts (>= 2.88dsf-13.3) for /run (Closes: #633054) + * move runtime stuff from /dev/.mdadm to /run/mdadm + * document defaults change in debian/NEWS (it was forgotten to be + done for squeeze) - metadata & chunk size (Closes: #595516) + + -- martin f. krafft <madduck@debian.org> Mon, 01 Aug 2011 10:41:41 +0200 + +mdadm (3.1.4-1+8efb9d1) unstable; urgency=low + + * Added patch with Makefile fix from upstream (commit 8efb9d1) to fix + compiler/linker problem on non-x86 architectures (closes: #595290). + + -- martin f. krafft <madduck@debian.org> Fri, 03 Sep 2010 10:45:01 +0200 + +mdadm (3.1.4-1) unstable; urgency=low + + * New upstream release, which closes:#595039 and addresses the following + issues too: + - reverts move of incremental map (closes: #585015). + - fixes mdadm monitor in the case of an inactive (or start-failed) raid0 + or linear array (closes: #539154). + - prevent --remove faulty from skipping renumbered devices + (closes: #587550). + - fixed overflow when growing a RAID6 (closes: #589493). + * However, disable the incremental assembly upstream turned on in 3.1.3 for + now, this will have to wait until after the squeeze release. + * initramfs/hook: make sure configuration file exists before accessing it; + thanks to Michael Prokop for the fix and NMU (closes: #589836). + * initramfs/hook: Match UUID case-insensitive while checking for running + arrays not listed in mdadm.conf; thanks to Mario 'BitKoenig' Holbe for the + patch (closes: #583545). + * Fix URL in the bug reporting preamble (presubj) (closes: #589833). + * Add I/O rescheduling functionality to the checkarray script and make the + cronjob use the idle priority; this should now minimise the impact of the + monthly re-check on the running system; Florian Heigl had the idea + (closes: #592149, #508123). + + -- martin f. krafft <madduck@debian.org> Sun, 29 Aug 2010 13:44:59 +0200 + +mdadm (3.1.2-2) unstable; urgency=low + + * Fix logcheck regexp to cure "egrep: Unmatched [ or [^" message + (closes: #583376). + * Cherry-pick 94fcb80 from upstream to fix compiler error due to argument + type error (at least on ia64, sparc, powerpc) (closes: #583495). + + -- martin f. krafft <madduck@debian.org> Fri, 28 May 2010 09:35:42 +0200 + +mdadm (3.1.2-1) unstable; urgency=low + + * New upstream release (closes: #567167). + * Ignore lintian error about not stopping in runlevel 1. + * Include more information about the configuration, initrd, and LVM in + bugscript output. + * Check active devices against configuration file based on UUID, not device + name (closes: #553896). + * When preparing the list of devices, treat /dev/mdX and /dev/md/X equally. + * Bump Standards-Version to 3.8.4 without having to make changes. + * Fix logcheck rule with patch from Frédéric Brière, since Rebuild events + are now arbitrary, no longer multiples of 20 (closes: #570315). + * checkarray: do not exit non-zero when there are no arrays found (see + #582360). + * Do not exit checkarray cronjob non-zero when the tests whether to run + fail (closes: #580825). This is related to a recent change in behaviour in + cron 3.0pl1-110 (see e.g. #581612). Thanks to everyone for their + suggestions! + * Call MAKEDEV to create device nodes only if MAKEDEV is installed + (closes: #569360). + + -- martin f. krafft <madduck@debian.org> Thu, 27 May 2010 09:34:24 +0200 + +mdadm (3.1.1-1) unstable; urgency=low + + * New upstream release. + * Retire fixed/blkid-dev branch. + * Reword warning about unbootable system when mdadm is purged + (closes: #544558). + * Updated FAQ to include s2ram as one of the reasons that can prevent an + array from being stopped; thanks to Pascal Hambourg for writing in. + * Install udev rules into udeb package (closes: #558823). + * Update mdadd.sh (formerly /usr/share/doc/mdadm/examples/newdisk.gz) from + Arno's website, refactoring the script into its own branch/patch + (closes: #539103). + * Do not single-quote homehost in initramfs script (closes: #549083). + + -- martin f. krafft <madduck@debian.org> Wed, 27 Jan 2010 10:14:25 +1300 + +mdadm (3.0.3-2) unstable; urgency=low + + * Bumped Standards-Version to 3.8.3 without having to make changes. + * Fixed init dependencies of mdadm daemon init.d script; thanks Petter + Reinholdtsen (closes: #541396). + * Switched source package to v3-quilt format. + + -- martin f. krafft <madduck@debian.org> Fri, 06 Nov 2009 10:06:03 +0100 + +mdadm (3.0.3-1) unstable; urgency=low + + * New upstream release. + * Acknowledge 3.0-3.1 NMU by Christian Kujau (patch by Marco d'Itri) + (closes: #541884), and add util-linux dependency. + * Copy udev rules into initramfs, which udev stopped doing + (closes: #549535, #549083, #538843, #538143). + * Fix the bug script to write debug information to the correct file + descriptor (closes: #537734). + * Switch to TopGit and split Debian diff into topic branches; add + README.source. + * Drop the outdated rootraiddoc.97.html document. + + -- martin f. krafft <madduck@debian.org> Tue, 27 Oct 2009 18:06:13 +0100 + +mdadm (3.0-3.1) unstable; urgency=medium + + * Non-maintainer upload. + * use blkid instead of vold_id in udev-md-raid.rules (closes: #541884) + + -- christian kujau <lists@nerdbynature.de> Mon, 14 Sep 2009 10:15:21 +0200 + +mdadm (3.0-2) unstable; urgency=low + + * Fixed initramfs script with patch from Steffen Hau: it was still using + --auto-update-homehost, which has been removed and replaced by a better + heuristic: arrays created for a different "homehost" will now be + assembled read-only, rather than shoehorned into the system with + --auto-update-homehost (closes: #537820). + * Add version stamps to bugscript and mkconf scripts to facilitate + debugging. + + -- martin f. krafft <madduck@debian.org> Tue, 21 Jul 2009 10:33:30 +0200 + +mdadm (3.0-1) unstable; urgency=low + + * New stable upstream release. + * Add information about udev and device links in /dev to bugscript output. + * Add pointer to FAQ and in particular rootdelay to the bug script + pre-subject file, which is displayed by tools like reportbug and thus + hopefully reduces the numbers of duplicated bugs. + * Patch from Frédéric Brière to make logcheck rules printk_time aware + (closes: #537460). + * Updated German translation due to typos and old spelling rules; thanks to + Helge Kreutzmann for the patch (closes: #534663). + * Bumped Standard-Version to 3.8.2; no changes necessary. + + -- martin f. krafft <madduck@debian.org> Mon, 20 Jul 2009 16:12:41 +0200 + +mdadm (3.0~devel3-43-g2800528-1) experimental; urgency=low + + * Merge tip of upstream's devel-3.0 branch at commit 2800528. + * Drop our own udev rules in favour of upstream's. If + /etc/udev/rules.d/65_mdadm.vol_id.rules has not been modified (md5sum + check), it is automatically removed; else, a warning is emitted. + * Add information about udev and device links in /dev to bugscript output. + + -- martin f. krafft <madduck@debian.org> Tue, 05 May 2009 15:10:46 +0200 + +mdadm (2.6.9-3) unstable; urgency=low + + * Fix the multipath prereq patch (#516605) and make it exit after printing + the prerequisites (closes: #526793). + * Change my previous recommendation for postfix over to the new virtual + package default-mta (see #522300 and #508644). + * Enhance bugscript, which now asks to run as root (sudo/su) if invoked by + a normal user. + * Include MD5 sums of md-related files in initrd in bug reports. + * Add grub2 information retrieval to bugscript. + * Trap SIGINT and thus prevent ctrl-c from terminating the bugscript + prematurely. + + -- martin f. krafft <madduck@debian.org> Tue, 05 May 2009 11:46:22 +0200 + +mdadm (3.0~devel3-1) experimental; urgency=low + + * Initial release of DEVELOPMENT BRANCH 3.0 to experimental. + + -- martin f. krafft <madduck@debian.org> Thu, 30 Apr 2009 11:51:39 +0200 + +mdadm (2.6.9-2) unstable; urgency=low + + * Fix the check of whether mdadm.conf defines all devices known to the + system; thanks Cristian Ionescu-Idbohrn (closes: #525655). + * No longer pass -k to modprobe, which has been deprecated for a long time; + thanks to Jan Hudec (closes: #519999). + * Remove Mario Joußen from the uploaders list, since his email started + bouncing. + * Prepare mdadm source to use quilt, with the long-term goal to switch to + TopGit, once I find the time. + * Cherry-pick caa0f6c & 667e66d from Neil into a quilt patch to fix gcc-4.4 + compiler issues (closes: #505375). + + -- martin f. krafft <madduck@debian.org> Sun, 26 Apr 2009 16:08:28 +0200 + +mdadm (2.6.9-1) unstable; urgency=low + + * New upstream release. + * Do not set -eu in the bugscript to maximise information output in the case + of errors. + * Make initramfs script depend on multipath to ensure its script is run + before ours (closes: #516605). + * Provide an alternative (postfix) for mail-transport-agent (closes: + #522300). I chose postfix because that's the only one I could recommend, + and since the alternative does not affect people who already have an MTA + installed, or have a preference, it won't affect them. + * Honour debconf pre-selection of mdadm/initrdstart (closes: #516802). + * Incorporate patch from Adrian Bridgett: the initramfs hook now checks to + see if all known arrays are listed in mdadm.conf and issues a warning if + this is not the case (closes: #519328). + * Make checkarray skip over arrays still marked auto-read-only + (closes: #510641). + * Add cron.daily snippet from Paul Slootman to run one-shot scans every day + to ensure that failed arrays don't go unnoticed (closes: #497949). + * Bumped Standards-Version to 3.8.1; no changes necessary. + + -- martin f. krafft <madduck@debian.org> Sat, 25 Apr 2009 19:04:47 +0200 + +mdadm (2.6.8-12-gb47dff6-2) unstable; urgency=low + + * Brown paper bag release: I built from the wrong branch which caused some + Debian-specific changes not to get into the package. Thus build fixes it. + + -- martin f. krafft <madduck@debian.org> Mon, 16 Feb 2009 12:15:37 +0100 + +mdadm (2.6.8-12-gb47dff6-1) unstable; urgency=low + + * New upstream release. + - better checks asprintf() return codes, thanks to patch from Dustin + Kirkland (closes: #509167). + * Fix start/stop runlevels in header of mdadm monitor init.d script + (closes: #514923). + * Use modprobe -q instead of --syslog from initramfs (closes: #502988). + + -- martin f. krafft <madduck@debian.org> Mon, 16 Feb 2009 11:07:18 +0100 + +mdadm (2.6.7.2-1) unstable; urgency=low + + * New upstream release, created for Debian lenny: + - fixes assembly of arrays that are being reshaped (closes: #512475) + - this bug was also responsible for other assembly problems + (closes: #498505, #499643, #496334) + Again, many thanks to Neil Brown for being such an awesome upstream. + + * Documentation updates: + - Actually install David Pashley's blog post added in 2.6.7.1-1, and + register it with doc-base. + - Update md.txt to version 2.6.26 (the lenny kernel). + - Add a dump of a website detailing md superblock formats. + - Register FAQ, md.txt, RAID5-vs-RAID10, README.recipes with doc-base + - Cherry-picked UID/UUID typo in mdadm.conf(5) manpage fix (commit + 0e69da7) (closes: #506245). + + * Added Italian debconf translation; thanks Luca Monducci (closes: #506572). + + -- martin f. krafft <madduck@debian.org> Tue, 03 Feb 2009 21:28:34 +0100 + +mdadm (2.6.7.1-1) unstable; urgency=low + + * New upstream release, specifically created for Debian lenny to fix the RC + bugs, which + - fixes typo in forced assembly code (closes: #496334, #499643, #498505). + - fixes array component size detection (closes: 500309). + Thanks Neil Brown, mdadm upstream: you are spoiling me. :) + + * Minor fixes to documentation: + - Add David Pashley's Rebuilding RAID blog post. + - Add new (bugfix) version 1.40 of the newdisk script + (/usr/share/doc/mdadm/examples/newdisk.gz) (closes: #490955). + - Add link to Warren Togami's writeup about remote RAID-1 conversion to + README.recipes. + - Fix probability of survival in FAQ 4b, since I erroneously labeled the + chance of failure as the chance of survival; thanks to Per Olofssen for + clarification (closes: #493577). + - Cherry-picked 6d6de2e from Neil, which adds HOMEHOST to the manpage and + closes: #489257. + + -- martin f. krafft <madduck@debian.org> Wed, 15 Oct 2008 10:27:23 +0200 + +mdadm (2.6.7-3.1) unstable; urgency=low + + * Non-maintainer upload with permission. + * Ask mdadm/initrdstart at medium priority in chrooted environment. + (Closes: #493099) + * Update Swedish debconf translations. Thanks Martin Ã…gren. + (Closes: #492074) + + -- Jérémy Bobbio <lunar@debian.org> Mon, 25 Aug 2008 22:28:53 +0200 + +mdadm (2.6.7-3) unstable; urgency=low + + * Correct credits in the NEWS file: Suse authored the udev rules, not Ubuntu + (who use the same file without credit). + + -- martin f. krafft <madduck@debian.org> Sat, 05 Jul 2008 12:22:58 +0200 + +mdadm (2.6.7-2) unstable; urgency=low + + * Remove mdrun completely. + * Import udev vol_id handling from Suse, and thus finally export + /dev/disk/by-id/* and /dev/disk/by-uuid symlinks (closes: #435983). + * Update upstream URLs in copyright and watch file (closes: #488364). + * Bump Standards-Version to 3.8.0; no changes required. + * Add mdadm homepage link to debian/control. + + -- martin f. krafft <madduck@debian.org> Wed, 02 Jul 2008 11:13:18 +0200 + +mdadm (2.6.7-1) unstable; urgency=low + + * New upstream version, which + - fixes a segfault when reading /proc/mdstat (closes: #462154). + - fixes a possible bug with v1 bitmap space allocation (closes: #474548). + - supports large files for loop assembly (closes: #463769). + * Moved mdadm-raid init.d script to position S60 (from S50) for runlevels + 0 and 6, so that arrays get stopped after cryptdisks-early; thanks to + J.M.Roth (closes: #486012). + * Fixed a typo in checkarray; thanks to Helmut Grohne (closes: #445540). + * Updated debconf translations: + - Spanish; thanks to Javier Fernández-Sanguino (closes: #477920). + - Basque; thanks to Piarres Beobide (closes: #478676). + * Updated logcheck rule so that it matches mdadm log entries with and without + PID (2.6.5 introduces PIDs in the messages). + + -- martin f. krafft <madduck@debian.org> Wed, 25 Jun 2008 17:31:15 +0200 + +mdadm (2.6.4-2) unstable; urgency=low + + * Adjusted logcheck rules to follow kernel changes; thanks to Frédéric + Brière (closes: #462478). + * Debconf templates and debian/control reviewed by the debian-l10n- + english team as part of the Smith review project. Closes: #463673 + * Debconf translation updates (thanks to Christian Perrier for compiling + them): + * Japanese. Closes: #464438 + * Galician. Closes: #464454 + * French. Closes: #465984 + * Czech. Closes: #466306 + * Dutch. Closes: #466543 + * Russian. Closes: #466577 + * Portuguese. Closes: #466794 + * German. Closes: #466989 + * Vietnamese. Closes: #467118 + * New debconf translations: + * Finnish. Closes: #468048 + * Fixed bashism in mdadm-raid init.d script; thanks to Raphael Geisser + (closes: #471874). + * Do not output warning when run from cron and no arrays are found + (closes: #474542). + * Add doc-base registration file; thanks to Roberto C. Sanchez + (closes: #451684). + * Reschedule "mdadm Sunday" to 00:57 instead of 01:06 to prevent double + invocation on DST change (closes: #449244). + * Bump DH compatibility level to 6; no changes required. + + -- martin f. krafft <madduck@debian.org> Fri, 11 Apr 2008 10:48:45 +0200 + +mdadm (2.6.4-1) unstable; urgency=low + + * New upstream release. + * Apply patch by Petter Reinholdtsen to fix dependency loop in + init.d script (closes: #460256). + * Bump Standards-Version to 3.7.3; no changes needed. + + -- martin f. krafft <madduck@debian.org> Mon, 14 Jan 2008 12:47:14 +0100 + +mdadm (2.6.3+200709292116+4450e59-3) unstable; urgency=low + + * Patch by Jérémy Bobbio which completes the fix for #444682. + + -- martin f. krafft <madduck@debian.org> Mon, 01 Oct 2007 16:16:19 +0100 + +mdadm (2.6.3+200709292116+4450e59-2) unstable; urgency=low + + * Patch the routine loading v1 superblocks to fix a segfault on amd64 + (closes: #444682). + + -- martin f. krafft <madduck@debian.org> Sun, 30 Sep 2007 14:10:41 +0100 + +mdadm (2.6.3+200709292116+4450e59-1) unstable; urgency=low + + * New merge from upstream @4450e59ffaf75623fa4261e244b0717a7463aa84 + - makes "--write-mostly" effective when re-adding a device to an array. + (closes: #442874). + * Do not call update-initramfs -k all, it can be set via + /etc/initramfs-tools/update-initramfs.conf (closes: #439334). + * Depend on udev|makedev instead of just makedev and invoke /dev/MAKEDEV, + not /sbin/MAKEDEV (closes: #436998). + * De-escalate the initramfs hook warning about an "emergency procedure" and + simply note that this involves initramfs assembling arrays it finds + automatically at boot. Also added a FAQ entry on how to turn off the init + script warning when no arrays are found (closes: #434934). + * Add --scan to the single-device-assembly-codepath in the initramfs + local-top script; thanks to Mario 'BitKoenig' Holbe for the patch + (closes: #440703). + * Use short option in initramfs script mkdir call, so make it klibc-utils + compatible; thanks maximilian attems for the patch (closes: #443436). + * Handle module name synonyms in initramfs hook script (closes: #432585). + + -- martin f. krafft <madduck@debian.org> Sat, 29 Sep 2007 21:21:25 +0100 + +mdadm (2.6.2-2) unstable; urgency=low + + * Fix typos in md(4) manpage; thanks Jeroen (closes: #425576). + * Make init script not report failure when there are no arrays defined in + config file. + * Add /usr/share/doc/mdadm/examples/newdisk, a script to integrate + a replacement disk into an existing array with minimal effort; will remain + in examples/ until I had a chance to really test and understand it. Thanks + to Arno van Amersfoort (closes: #427880). + * Does some sanity checking for proper format of level= arguments in + mdadm.conf and bails if an error is found. Since the RAID levels are used + verbatim as module names, a discrepancy might lead to an unusable system. + Instead, we thus use the emergency fallback. Thanks to Andrew + Sackville-West for spotting this and helping me with the fix. + + -- martin f. krafft <madduck@debian.org> Tue, 10 Jul 2007 09:59:45 +0200 + +mdadm (2.6.2-1) unstable; urgency=low + + * New upstream release + - new options --fail detach and --remove faulty can be used to fail and + remove devices that are no longer physically present (closes: #416512). + - --help output now goes to stdout (closes: #416653). + - plenty of manpage fixes, thanks Peter Samuelson (closes: #414688). + * Incorporated patch by Mikko Rasa to fix detecting of raid6 (and raid10) + devices with --scan (closes: #421915). + + -- martin f. krafft <madduck@debian.org> Mon, 21 May 2007 14:25:43 +0200 + +mdadm (2.6.1-1) unstable; urgency=low + + * Release to unstable. + * Start arrays read-only in initramfs to prevent syncing and hence enable + resuming/freezing. The arrays will automatically sync as soon something + writes to it; thanks to Tim Dijkstra, Neil Brown, and Luis Rodrigo + Gallardo Cruz (closes: #415441). + * mkconf now tries to preserve existing values for DEVICE, CREATE and + HOMEHOST (in addition to MAILADDR, which it preserved previously already). + PROGRAM is preserved but only added to mdadm.conf if it occured in the + previously existing configuration file. + * startall is now mdadm-startall and lives in /sbin, thanks to Eduard Bloch. + It now can handle existing mdadm.conf files much more gracefully, mostly + thanks to the above mkconf enhancements (closes: #415336). + + -- martin f. krafft <madduck@debian.org> Sat, 05 May 2007 16:12:29 +0200 + +mdadm (2.6.1-1~exp.5) experimental; urgency=low + + * Fix mdadm.conf typo; thanks Tim Phipps (closes: #416626). + * Execute udevtrigger after assembly of arrays during initramfs processing; + thanks to Michael Prokop (closes: #416658). + + -- martin f. krafft <madduck@debian.org> Sat, 31 Mar 2007 12:12:27 +0200 + +mdadm (2.6.1-1~exp.4) experimental; urgency=low + + * Incorporate patches by Peter Samuelson fixing several typography as well + as typesetting issues in the manpages. Thanks a lot! (closes: #414688) + + -- martin f. krafft <madduck@debian.org> Tue, 13 Mar 2007 19:26:37 +0100 + +mdadm (2.6.1-1~exp.3) experimental; urgency=low + + * Patch by Jørn V. Christensen to properly handle multiple email addresses + for the MAILADDR setting (closes: #413330). + + -- martin f. krafft <madduck@debian.org> Mon, 5 Mar 2007 11:19:18 +0000 + +mdadm (2.6.1-1~exp.2) experimental; urgency=low + + * Updated debconf translations: + - Galician by Jacobo Tarrio (closes: #412203). + + -- martin f. krafft <madduck@debian.org> Sat, 24 Feb 2007 16:41:16 +0100 + +mdadm (2.6.1-1~exp.1) experimental; urgency=low + + * New upstream release, targeted at experimental until etch is out: + - adds --syslog option (closes: #402457). + - now can --wait for sync activity to finish (closes: #328197). + - for other changes, please see /usr/share/doc/mdadm/changelog.gz + * Updated debconf translations: + - Portuguese by Rui Branco (closes: #411745). + + -- martin f. krafft <madduck@debian.org> Thu, 22 Feb 2007 16:35:16 +0100 + +mdadm (2.5.6-9) UNRELEASED; urgency=low + + * More logcheck filters to prevent redundant information logged by mdadm + --syslog. + * Improved mdadm-raid init script to correctly output status information for + drives that are initialising or adding spares. + + -- martin f. krafft <madduck@debian.org> Thu, 11 Jan 2007 16:05:12 +0100 + +mdadm (2.5.6-8) unstable; urgency=low + + * Hard-code path to /sbin/mdadm binary rather than searching the $PATH. This + closes: #403307 and should be a little more robust in the presence of + installations of mdadm in /usr/local. + * Made the bugscript a little more failure-resilient. + * Added more documentation. + + -- martin f. krafft <madduck@debian.org> Mon, 8 Jan 2007 02:04:25 +0100 + +mdadm (2.5.6-7) unstable; urgency=low + + * Only parse ARRAY lines from configuration file when collecting the array + pairs. Thanks to Daniel Dehennin for the bug report and suggested fix + (closes: #402106). + * Prevent modules from being loaded during initramfs time if no arrays are + to be assembled at this stage. + * Export DAEMON_OPTIONS to /etc/default/mdadm, which gets passed to the + mdadm daemon on startup from the init.d script. The value is set to + -y/--syslog by default, and will get incorporated into debconf in a future + version; thanks for the idea by Alex Owen (closes: #401696). + * Incorporate patch by upstream to fix handling of --syslog long option + (closes: #402457). + * Added logcheck filters for new syslog entries by mdadm monitoring daemon. + * Added Spanish debconf translation by Javier Fernández-Sanguino + (closes: #402681). + + -- martin f. krafft <madduck@debian.org> Tue, 12 Dec 2006 11:49:52 +0100 + +mdadm (2.5.6-6) unstable; urgency=medium + + * Fixed a typo in the debconf control script which would cause failures with + a dash shell; thanks to Santiago Garcia Mantinan for reporting this, and + Andreas Metzeler for providing a solution (closes: #399315). + + -- martin f. krafft <madduck@debian.org> Mon, 20 Nov 2006 15:02:34 +0800 + +mdadm (2.5.6-5) unstable; urgency=low + + * Prevent initramfs hook from exiting prematurely when VERBOSE=false. + * Moved debconf question about arrays to start by initramfs to high + priority. + + -- martin f. krafft <madduck@debian.org> Mon, 13 Nov 2006 11:24:21 +0100 + +mdadm (2.5.6-4) unstable; urgency=low + + * Actually remove mdadm.conf on purge; previously, the generation of + a temporary file for initramfs would screw up the purging; thanks to + Fabrice Lorrain for the report (closes: #398088). + + -- martin f. krafft <madduck@debian.org> Sat, 11 Nov 2006 20:07:55 +0100 + +mdadm (2.5.6-3) unstable; urgency=low + + * Fix a syntax error in mdadm-raid script. + + -- martin f. krafft <madduck@debian.org> Thu, 9 Nov 2006 15:47:51 +0100 + +mdadm (2.5.6-2) unstable; urgency=low + + * Small fixes to mkconf, now returns 255 instead of -1. + * Added /usr/share/mdadm/startall, a helper script to facilitate starting + all arrays when booting from rescue/live media. It overrides AUTOSTART in + /etc/default/mdadm and starts arrays even if the variable is set to + a false value. + + -- martin f. krafft <madduck@debian.org> Thu, 9 Nov 2006 14:44:35 +0100 + +mdadm (2.5.6-1) unstable; urgency=low + + * New upstream release: + - added note to mdadm(8)/--metadata about overriding the default in + mdadm.conf (closes: #396914). + - fixed problems that could cause infinite loop with auto assemble. Thanks + to Dan Pascu for pointing this out (closes: #396582). + - fixed problems with bitmap file names lost after reading from + configuration file. + * Merged patch by Dan Pascu to nicely handle situations where a degraded + array only has one drive left nicely by the mdadm-raid script. + * Updated Japanese debconf translation; thanks to Hideki Yamane + (closes: #396400). + + -- martin f. krafft <madduck@debian.org> Thu, 9 Nov 2006 00:47:45 +0100 + +mdadm (2.5.5-1) unstable; urgency=low + + * New upstream release: + - fixes the build problems on several architectures (closes: #393314) by + including the contents of linux/blkpg.h literally, not via #include. + - optimises bitmap file use on 64bit systems. + - does not error out anymore when trying to assemble an already assembled + array without a corresponding /dev device node. + - does not report an error if --assemble --scan only finds already running + arrays. + - fixes several bugs related to RAID10 and the new offset layout. + - improves error message when a wrong '--update' option is given. + * Added FAQ entries about partitionable arrays. + * chroot detection now also works for 2.6.18 and beyond (c.f. kernel commit + 778c1144771f0064b6f51bee865cceb0d996f2f9). + * Now recommends module-init-tools. + * Hides ugly errors during configuration in the absense of module-init-tools + or initramfs-tools. + * Send udev events for arrays assembled by the mdadm-raid init.d script. + This does not close #394193 but it's a good addition anyway. I am not + sending these events from the initramfs as well because it would be + non-trivial to ensure that an event doesn't get sent twice for a given + array. + Anyway, this is all a hack until the kernel sends online/offline events to + udev. See #394193. + * Added more RAID10 information to the FAQ. + * Added filters to logcheck for regular events, even by the md driver; also + promoted messages about non-fresh components to security events. + * Hide informational messages unless VERBOSE is set to a true value in + /etc/default/mdadm. + + -- martin f. krafft <madduck@debian.org> Thu, 26 Oct 2006 22:35:24 +0200 + +mdadm (2.5.4-1) unstable; urgency=low + + * New upstream release: + - --examine now reports chunk size also for RAID6 and RAID10 + - fix endianness issues with v1 superblocks (closes: #385726) and bitmap + metadata. + - improved message when mdadm detects similar superblocks + (closes: #385951). + - documents that the automatic update of the super-minor field in the + superblock when using a 2.6 kernel only applies to RAID levels 1 and + higher. RAID0 array superblocks must be manually updated + (closes: #386315, #388172). + - removes partition table from any whole device added to an array. + - allow --auto=yes to specify a number; if mdadm determines from the + device name that you want a partitionable array, this number determines + the number of sub-device nodes to create. + * Removed patch previously used to fix #385951 because it's not adequate. + See the bug log for reasons. + + -- martin f. krafft <madduck@debian.org> Fri, 13 Oct 2006 08:32:20 +0200 + +mdadm (2.5.3.git200608202239-8) unstable; urgency=low + + * This revision is dedicated to Peter Samuelson for his RAID10 expertise^W + educated guess^W^W pure luck. (: + * Now writes minimal mdadm.conf file even if the MD subsystem has not been + loaded and the scan for arrays thus failed. + * Now tries to ensure that the configuration file used for the initramfs + actually defines arrays. + * Now preserves MAILADDR from an existing mdadm.conf when generating a new + one. + * Documentation updates. + * Updated debconf translations: + - German by Mario Joußen. + - Vietnamese by Clytie Siddall (closes: #390311). + - Dutch by Frans Pop (closes: #390955). + - French by Jean-Luc Coulon (closes: #391215). + + -- martin f. krafft <madduck@debian.org> Fri, 6 Oct 2006 15:03:46 +0200 + +mdadm (2.5.3.git200608202239-7) unstable; urgency=medium + + * Fixed a serious bug in the debconf script which would cause the + configuration to exit prematurely in cases when the root could not be + determined. Since this is RC, the urgency is set to medium. + * Updated the documentation a bit. + * Updated debconf translations: + - Swedish by Daniel Nylander (closes: #389040). + - Czech by Miroslav Kure (closes: #389083). + - Russian by Yuri Kozlov (closes: #389086). + - Brazilian Portuguese by Felipe Augusto. + + -- martin f. krafft <madduck@debian.org> Fri, 29 Sep 2006 16:31:44 +0200 + +mdadm (2.5.3.git200608202239-6) unstable; urgency=high + + * Bumping urgency to high because previous version has been in unstable for + three days and this one really only fixes a stupid segfault: + * Reworked the parsing of /proc/partitions and spotted a mean segfault + (closes: #388355). + + -- martin f. krafft <madduck@debian.org> Thu, 21 Sep 2006 15:25:21 +0200 + +mdadm (2.5.3.git200608202239-5) unstable; urgency=medium + + * Keeping medium urgency due to RC bug. + * Modified the patch responsible for pruning parent devices so that + superblocks at the end of a disk do not get interpreted twice. It now + makes less assumptions about the exact output of /proc/partitions and + should thus be more robust (now closes: #385951). + * Added code that defers mdadm preconfiguration when the debconf backend is + too old (and does not provide debconf-escape). Now configuration is + postponed until the postinst is run in this case. + + -- martin f. krafft <madduck@debian.org> Thu, 14 Sep 2006 11:16:39 +0200 + +mdadm (2.5.3.git200608202239-4) unstable; urgency=medium + + * Correct error related to an unbound variable in postinst. + * Keeping medium urgency. + + -- martin f. krafft <madduck@debian.org> Wed, 13 Sep 2006 20:49:33 +0200 + +mdadm (2.5.3.git200608202239-3) unstable; urgency=medium + + * Urgency medium because of RC bugs. + * Add versioned dependency to debconf (closes: #385994); temporary fix until + we find a proper fix for #386439. + * Add patch by Steinar H. Gunderson to ensure mdadm does not interpret + a superblock as belonging to a device when it's actually part of + a partition on that device (closes: #385951). + * Do not override the superblock default version in mdadm.conf to prevent + creation of superblocks that the kernel can't handle (closes: #384614). + * Added a note to alert people that the warning about arrays not listed in + the configuration file is only relevant if the arrays are needed to be + brought up by mdadm from initramfs during boot (closes: #385017). + * Added bootloader/cmdline info to bugscript so that future bug reports via + bug/reportbug include information on how the system is booted with respect + to RAID (the root partition). + * If mdadm is being configured in a chroot, it now defaults to starting all + arrays from the initial ramdisk, rather than trying to figure out the root + MD array (we're using /proc information, so it would be the one of the + host, not the one of the chroot) (closes: #386468). + * Added LSB headers to init scripts. + * Reworked the documentation with respect to the use of "MD" and "RAID", and + added a FAQ entry on the meaning of "MD". Thanks to Frans Pop for his + help! + * Updated debconf translations: + - Czech by Miroslav Kure (closes: #384754). + - French by Florentin Duneau (closes: #385690). + - Russian by Yuri Kozlov (closes: #387017). + + -- martin f. krafft <madduck@debian.org> Thu, 7 Sep 2006 14:32:04 +0200 + +mdadm (2.5.3.git200608202239-2) unstable; urgency=low + + * Allow ARRAY lines in configuration file to break across lines + (closes: #384222). + * Improved initramfs hook; now does not rely on initramfs to provide RAID + assembly: + - if a checked mdadm.conf file is present, use that. + - if an unchecked mdadm.conf is present, create a temporary one + - if that fails, use the unchecked one iff it contains at least one + ARRAY statement. Otherwise rely on auto-generation from the initramfs + during the book (and hope for the best). + - if no mdadm.conf is found, create one on the fly + - if that fails, hope that the auto-generation will work during boot + * Improved the messages printed by the initramfs script. + * Do not store the debconf answer for whether arrays not listed in the + configuration file should be used. + * Now asks again for the devices to start (preseeded with 'all') if the user + does not want to proceed with devices not listed in the configuration + file. + + -- martin f. krafft <madduck@debian.org> Wed, 30 Aug 2006 16:29:07 +0200 + +mdadm (2.5.3.git200608202239-1) unstable; urgency=low + + * Tracking upstream git releases. + - now the --run switch behaves as stated in the manpage. This properly + fixes #287415 + - new version-1 partitionable arrays are now named X instead of _dX (e.g. + 0 instead of _d0) for device names like md_dX. I actually think this is + a bug and hope upstream will use dX (e.g. d0) instead in a future + version. + - we specify --symlink=no and thus disable the new feature to create + /dev/mdX symlinks to /dev/md/X devices until the entire device node mess + is cleared up. No need to introduce yet another complicating factor at + this stage. + * Instead of trying to do a whole lot of magic with respect to detecting + RAID devices to start, mdadm from now on requires a valid mdadm.conf file + to be installed. It still tries to do what it can, but there are no + heuristics anymore. See /usr/share/doc/mdadm/README.upgrading-2.5.3 . + * Removed the zero-superblock warning because we require the user to sign + off the configuration file anyway. + * This also enables us to use mdadm.conf from the initial ramdisk and thus + closes: #381303. + * In case the user chooses to assemble all arrays from the initial ramdisk, + use the new homehost feature of mdadm (closes: #381057). This will start + only those arrays belonging to the local system, unless it is the first + run on a system, in which case it will start all arrays and mark them as + belonging to the local system. + * Improved the debconf control script: integrated error messages into the + frontends, and made it a bit smarter. Error messages can now be + translated, and the script checks whether the user's choice is listed in + the configuration file and only proceeds if it is, or the user chooses to + ignore that it is not. + * mkconf can now take a generate/force-generate parameter to write directly + to /etc/mdadm/mdadm.conf. A second parameter specifies an alternate + filename. + * mkconf now outputs comments for the settings it suggests. + * Removed all udev-related stuff. We must coexist with udev because there + are setups that assemble arrays without mdadm, so the device nodes must be + created by udev, if that is used. + (closes: #382263, #382450, #383688, #383891, #383806, #382480) + Staying at low urgency since these (RC) bugs only exist in unstable. + * Now installs MD modules and mdadm/mdrun into initial ramdisk regardless of + whether the hook script thinks there are devices to start. This was done + to enable recovery from the initramfs shell. + * Now uses 'MD' instead of 'RAID' consistently in all messages. + * Now rebuilds initramfs for all installed kernels. + * Now breaks the endless config loop only when the user does not see the + question (see #381284, #381007). + * Don't fail mdadm-raid when /dev is on a read-only filesystem + (closes: #382876). + * Updated debconf translations: + - French by Florentin Duneau (closes: #382389). + + -- martin f. krafft <madduck@debian.org> Mon, 21 Aug 2006 00:25:22 +0100 + +mdadm (2.5.2-10) unstable; urgency=low + + * Applied patch by upstream to fix the logic of the --run switch (see + #287415). Thus also reverted the mdadm-raid hack used to fix the bug in + the 2.5.2-9 upload. + * Recognise devfs-style device nodes by fixing a regression bug in the root + RAID autodetection code (closes: #381007), which was introduced as part of + the fix for #380596 in the 2.5.2-9 upload. The bug is RC, but it only + applies to unstable right now, so I am not pumping up the urgency. + The autodetection code now doesn't care about the actual name of the array + device, but instead only insists that it exists, is a block device, and + recognised as an array by mdadm (mdadm --detail). + * Added safety net to prevent endless loops in RAID autodetection. Now just + falls back to starting all arrays from the initramfs if it fails to + determine an acceptable array for the / filesystem in three tries + (closes: #381284, also see #381007). + * Added udev rules file to prevent udev from ever creating md device nodes, + which can get in the way of mdadm (also see next item). + * Added a workaround to the initramfs hook to deal with the problem with + /dev/md/X device nodes when /dev/mdX is also present and version-0 + superblocks are in use (closes: #381181). + * checkarray: correctly recognise when the kernel is too old for parity + checks, or when there are no redundant arrays present (closes: #380746). + * checkarray: now supports -s|--status switch to query parity check status + for given devices. + * checkarray: now supports -x|--cancel switch to cancel running checks. + * mkconf: now also outputs 'MAILADDR root' and 'HOMEHOST <system>'. + * Added README.checkarray with some information about the check process. + * Added /usr/share/doc/mdadm/FAQ to answer some FAQs. + + -- martin f. krafft <madduck@debian.org> Thu, 3 Aug 2006 22:54:04 +0100 + +mdadm (2.5.2-9) unstable; urgency=low + + * Added logcheck rules for kernel messages generated by checkarray, using + logcheck server level. + * Added handling of partitionable arrays to root RAID autodetection script + (closes: #380596). + * Forcing RAID assembly to run the arrays, working around an upstream bug + until that's fixed (closes: #287415). + * Updated documentation in README.initramfs-transition to include + information related to #380089. + * Updated debconf translations: + - Vietnamese by Clytie Syddall, thanks! + + -- martin f. krafft <madduck@debian.org> Mon, 31 Jul 2006 14:35:38 +0100 + +mdadm (2.5.2-8) unstable; urgency=low + + * Re-added rootraiddoc.97.html which was mysteriously lost (closes: #378678). + * Catching modprobe error in case of absence of the kernel modules, or + a non-modular kernel. Thanks to Holger Levsen. + * Copy raid456 kernel module into initramfs, if present (closes: #380152). + * checkarray: check for presence of active RAID arrays and give an + appropriate error if there are none present (closes: #379019). + * checkarray: skip sync for non-redundant devices (closes: #379352, #380424). + * Fixing cron registration for checkarray. crontab(5) is really stupid and + makes me think that they simply documented a bug instead of fixing it, so + now I have to hack around it. See the cron.d file (closes: #380425). + * Removed the code writing auto-detected devices to /var, which was silly + since /var isn't necessarily mounted yet by the time mdadm-raid is called. + Thanks to Maurizio Avogadro for pointing this out. + * Add reference to BAARF to README.Debian and included the RAID5 vs RAID10 + article from the BAARF website. + * Updated debconf translations: + - Japanese by Hideki Yamane, thanks! + - French by Florentin Duneau, thanks! (closes: #379511) + + -- martin f. krafft <madduck@debian.org> Thu, 27 Jul 2006 22:49:32 +0100 + +mdadm (2.5.2-7) unstable; urgency=low + + * Release to unstable. + + -- martin f. krafft <madduck@debian.org> Thu, 20 Jul 2006 17:23:23 +0100 + +mdadm (2.5.2-6) experimental; urgency=low + + * Adding mdrun to generated udeb. I will only remove mdrun after etch. + * Updated debconf translations: + - Dutch, thanks to Frans "Franzerl" Pop! (closes: #377412) + - French, thanks to Florentin Duneau! (closes: #377968) + + -- martin f. krafft <madduck@debian.org> Thu, 13 Jul 2006 23:11:24 +0200 + +mdadm (2.5.2-5) UNRELEASED; urgency=low + + * Remove the check for the lvm prereq in the initramfs hook, as #369617 is + now fixed. Thus conflicts against initramfs-tools (<< 0.65). + * Updated debconf translations: + - German, thanks to Mario Joußen! + * Added short note about maintenance in SVN to README.Debian. + + -- martin f. krafft <madduck@debian.org> Thu, 13 Jul 2006 23:10:36 +0200 + +mdadm (2.5.2-4) UNRELEASED; urgency=low + + * The "it takes two to swing" release. + * Now does not stop arrays on upgrade or remove. Thanks (and sorry) to + Christian Pernegger (and hopefully no others). + * Fixed small problem in debconf configuration script related to unbound + MAIL_TO variable. Thanks to Christian Pernegger. + + -- martin f. krafft <madduck@debian.org> Fri, 7 Jul 2006 16:59:01 +0200 + +mdadm (2.5.2-3) UNRELEASED; urgency=low + + * The initramfs now gets all RAID modules installed. It's a lot safer to + have them all around, the size difference is negligible, and we still only + load the needed ones at boot time. + * Added /usr/share/mdadm/checkarray, which can be used to check arrays for + parity. Also added a debconf question to let the user choose whether + cron should run these checks (closes: #377071). + * Only shut down arrays automatically when they've been automatically + started (closes: #376009). + * Make sure the user has a chance to choose the autostart feature by + elevating the debconf priority to high (see #376009). The warning about + reuse of RAID components has also been elevated to debconf priority high. + * The MAIL_TO setting from /etc/default/mdadm has been removed. Instead, use + MAILADDR in /etc/mdadm/mdadm.conf. See mdadm.conf(5). Your setting should + be automatically migrated. + * Now rewrites /etc/default/mdadm (but preserves settings) instead of trying + to patch it with changes. + * Added note to README.Debian to ensure users know that only the devices + listed in mdadm.conf will be autostarted (see #376009). + * Now includes latest md.txt from kernel documentation in + /usr/share/doc/mdadm/md.txt.gz. + * Added some more recipes to /usr/share/doc/mdadm/README.recipes.gz. + * Updated debconf translations: + - Swedish, thanks to Daniel Nylander! + - Brazilian Portuguese, thanks to Felipe Augusto van de Wiel! + - Czech, thanks to Miroslav Kure! + - Russian, thanks to Yuri Kozlov (closes: #376181). + * Further updates to the debconf templates; I hope the translators aren't + going to kill me. + + -- martin f. krafft <madduck@debian.org> Fri, 7 Jul 2006 15:09:40 +0200 + +mdadm (2.5.2-2) experimental; urgency=low + + * The "if it weren't for Munich's wheat beer, there'd be no" release. + * Removed -fno-strict-aliasing from compiler options, after upstream fixed + the bug that led to its use (see #369779, #356153). Thanks to Elimar + Riesebieter for pointing this out (closes: #375876). + * Moved detection of RAID devices from initramfs hook to debconf control + file, and added a (low-priority) debconf question as to which devices + should be started early in the boot sequence. For the cases where we + failed to auto-detect previously (e.g. root on LVM on RAID), it's paranoid + and suggests to start them all (closes: #375879). Thanks to Alec Berryman + for spotting this. + * Fixed a typo in README.experimental, which could lead to an unbootable + system with initramfs-tools 0.64 or before. Again, thanks to Alec for + spotting this. + * Extended bug script to include --examine output for all components (at + least if called by root, which hopefully should never happen. Err, + wait...) + * Disabled deprecation warning in mdrun until the transition is complete. + * Reworded the debconf templates due to a new question, and also for + readability. + + -- martin f. krafft <madduck@debian.org> Thu, 29 Jun 2006 22:54:47 +0200 + +mdadm (2.5.2-1) experimental; urgency=low + + * New upstream release. + * Implemented checks in the initramfs hooks and scripts for compatibility + with initramfs-tools. Now we do not need a conflict anymore because + mdadm's hooks and scripts will simply do nothing while the ones provided + by initramfs-tools are still present. + * Not using /bin/bash for mdrun, which I thought we'd need for read timeout + support (for the deprecation warning). Since the -n and -t flags to the + read shell builtin are non-POSIX, I dropped them, they were merely + cosmetic anyway. + + -- martin f. krafft <madduck@debian.org> Tue, 27 Jun 2006 15:06:55 +0200 + +mdadm (2.5.1-2) experimental; urgency=low + + * Updating dependency on initramfs-tools, which has not yet adopted to mdadm + taking over the hooks. + + -- martin f. krafft <madduck@debian.org> Mon, 26 Jun 2006 22:35:08 +0200 + +mdadm (2.5.1-1) experimental; urgency=low + + * New upstream release: + - Really fixes return status of examine (closes: #367901). + - Fixes a memory leak in monitor mode (closes: #372618). + - Fixes compiler warnings and errors (closes: #373802, #356153, #369779). + - Fix byte swapping issues (closes: #369765). + - Now lists devices it stops (closes: #369850). This also leads to + beautification of the init.d script's stop action. + * Fixed RAID init script to not complain about missing logger command. + + -- martin f. krafft <madduck@debian.org> Mon, 26 Jun 2006 00:58:36 +0200 + +mdadm (2.5-4) experimental; urgency=low + + * The "would you like fries with your parasite?" release. + * Now does not require RAID support from the kernel just for package + installation; that was silly of me, sorry (closes: Bug#370115). + * Added version to Replaces: initramfs-tools dependency. + * Further init.d script improvements. + * Recommends mail-transport-agent, or the monitor daemon won't be able to + send anything. + * Ignores failures from modprobe in postinst when RAID modules are not + available (closes: #370582). + + -- martin f. krafft <madduck@debian.org> Tue, 6 Jun 2006 12:45:53 +0200 + +mdadm (2.5-3) experimental; urgency=low + + * Added /usr/share/doc/mdadm/README.recipes with some common usage examples. + * Vastly improved the mdadm-raid init.d script output, and removed bashisms. + + -- martin f. krafft <madduck@debian.org> Fri, 2 Jun 2006 00:45:06 +0200 + +mdadm (2.5-2) experimental; urgency=low + + * The "on her majesty's secret service" release. + * Enabled -DDEBIAN during build, which will take care of default permissions + on devices created by mdadm. Together with the CREATE configuration + directive in 2.5, this now certainly closes: Bug#310241. + * Added a patch (incorporating lib/mm/xlate.h from lvm2) to prevent direct + access to kernel headers from userspace (closes: Bug#369765). + * Disabled strict aliasing compiler checks until we find a better + implementation for linked lists in C (closes: Bug#369779, Bug#356153). + * Actually decreased the size of the udeb mdadm binary with -Os + -fomit-frame-pointer (as suggested by Joey Hess) (closes: Bug#314370) + * Added Replaces: initramfs-tools to communicate that we're not conflicting + but replacing instead (see Bug#367567) + * Updated conflict with initramfs-tools to (<< 0.63) per suggestion by the + maintainers. + + -- martin f. krafft <madduck@debian.org> Thu, 1 Jun 2006 20:15:17 +0200 + +mdadm (2.5-1) experimental; urgency=low + + * The "show me the way to the next whiskey bar" release. + * See /usr/share/doc/mdadm/README.experimental or + http://madduck.net/~madduck/scratch/README.experimental + * New upstream release: + - mails include /proc/mdstat output (closes: Bug#355882) + - allows specification of device permissions in config (closes: Bug#310241) + * /sbin/mdrun has been deprecated and replaced by calls to /sbin/mdadm; + a proper deprecation warning is in place (see NEWS). + * Moved initramfs hook and script into the package, and switched it to mdadm + (from mdadm. Thanks to Stephen Frost for his help (closes: Bug#354144). + This should make sure that the right minor numbers are chosen during boot + (mainly because mdadm takes care of it all) (closes: Bug#361408). + * Removing mdrun from the udeb (d-i patch submitted to debian-boot mailing + list) + * Upstream links against openssl for SHA1 support (homehost feature), which + is a problem. An internal SHA1 implementation is provided, however, so + I just link against that. + * Switched init.d scripts to use LSB-compliant output. + * Enhanced init.d script output. + + -- martin f. krafft <madduck@debian.org> Thu, 1 Jun 2006 02:20:22 +0200 + +mdadm (2.4.1-2) unstable; urgency=low + + * The "this took way longer than I thought" release. + * Migrating to unstable. + * If the init.d script creates the mdadm.conf file, it should remove it on + purge. To accomplish this, I create a semaphore in /var/lib/mdadm if it + was generated, and only remove the conffile on purge if the semaphore + exists. + * Added a little helper /usr/share/mdadm/mkconf to aid generation of + configuration file. + * Added a bug script to collect some important information when the user + uses Debian bug reporting tools (such as reportbug). + * Added a debian/watch file. + + -- martin f. krafft <madduck@debian.org> Wed, 31 May 2006 23:07:48 +0200 + +mdadm (2.4.1-1) experimental; urgency=low + + * The "I'll kill that maintainer... uh, wait, it's me" release. Sorry for + the delay, here's the long awaited new upstream release (closes: + Bug#337903, Bug#363592), which gets rid of a bunch of functionality bugs: + - reiserfs size does not overflow anymore (closes: Bug#318230) + - fixed typos in manpages (closes: Bug#352798) + Oh, and we're moving away from that arch nightmare too. Sorry for the + confusion. + * Experimental release, because I really don't want to be responsible for + data loss. Though I am quite sure that the upgrade is painless, I also + don't have access to 18 drive RAID 10 with multipath on s390 or similar + arrangements. + * We now make the /dev/md* devices in postinst unless /dev/md15 exists (no + longer checking for /dev/md0), or unless devfs is in use. If udev is used, + /dev/md15 will only exist in complex setups, so the devices will be made + in /dev/.static by MAKEDEV, which is not really a concern. I opted against + unconditionally calling MAKEDEV until #367407 is fixed so as to preserve + custom permissions or owner settings. This also acknowledges the NMU + (#299623). + closes: Bug#310247, Bug#299623 + * Patched some of the code to make mdadm honour /etc/mdadm/mdadm.conf over + /etc/mdadm.conf (see NEWS). + * Fixed a couple of typos in the mdadm(8) manpage; thanks to Reuben Thomas. + closes: Bug#345669, Bug#345667 + * Pushed Standards-Version to 3.7.2; no changes required. + * Updated Debconf translations: + - Vietnamese by Clytie Siddall (closes: Bug#323950) + - Czech by Miroslav Kure (closes: Bug#360290) + - Russian by Yuri Kozlov (closes: Bug#361116) + - French by Eric Madesclair (closes: Bug#323988) + * Added new Debconf translations: + - Swedish by Daniel Nylander (closes: Bug#333486) + - Dutch by Frans Pop (closes: Bug#344714) + + -- martin f. krafft <madduck@debian.org> Tue, 16 May 2006 18:21:36 -0500 + +mdadm (1.12.0-1) unstable; urgency=low + + * New upstream release. + (obsoletes branches: symlinks) + (reduces branches: gcc4signedness, debian, autoscan) + * Fixed typo in mdadm.conf(5) manpage (closes: Bug#321152). + + -- martin f. krafft <madduck@debian.org> Sun, 24 Jul 2005 19:20:01 +0200 + +mdadm (1.9.0-5) unstable; urgency=low + + * martin f. krafft: (the, "look ma', we're maintained in arch now!" release) + (no functional differences except for added/updated translations) + - Acknowledge NMU by Steve Langasek; thanks! (closes: Bug#299623) + - split diff.gz into different arch branches (see debian/arch-branches). + - debian/control: + - Changed maintainer to pkg-mdadm-devel. + - Reworded some of the descriptions (closes: Bug#304170). + - Pushed Standards-Version to 3.6.2.1; no changes needed. + - fixed po-debconf integration + - debian/rules: + - fixed po-debconf integration + - l10n changes: + - Removed amiguity from debconf template (closes: Bug#312754). + - Added Vietnamese debconf translation; thanks to Clytie Siddall! + (closes: Bug#312753) + - Added Czech debconf translation; thanks to Miroslav Kure! (closes: Bug#319626) + - Updated German debconf translation; thanks to Jens Seidel! (closes: Bug#313981) + - backported upstream's gcc4 signedness fixes from 1.12.0 (gcc4signedness + branch) (closes: Bug#319743). + + -- martin f. krafft <madduck@debian.org> Sun, 24 Jul 2005 17:58:46 +0200 + +mdadm (1.9.0-4.1) unstable; urgency=high + + * Non-maintainer upload. + * High-urgency upload for sarge-targetted RC bugfix + * Make sure error output from MAKEDEV is sent to stderr, to avoid + interfering with debconf; this avoids installation problems on + udev-using systems. Thanks to Jonas Smedegaard for the patch. + Closes: #299623. + + -- Steve Langasek <vorlon@debian.org> Wed, 1 Jun 2005 03:36:42 -0700 + +mdadm (1.9.0-4) unstable; urgency=high + + * High-urgency upload for sarge targeted RC bugfix. + * mdrun: replaced invocation of /usr/bin/seq with hard-coded sequence + (closes: Bug#310671). + + -- martin f. krafft <madduck@debian.org> Wed, 25 May 2005 09:51:41 +0200 + +mdadm (1.9.0-3) unstable; urgency=high + + * High-urgency upload for sarge targeted RC bugfix. + * Applied patch by Peter Samuelson <peter@p12n.org>, which causes mdadm to + follow symlinks of device nodes (closes: #274859, #310412, #310492). + * Added myself as co-maintainer as per agreement with Mario Joussen. + + -- martin f. krafft <madduck@debian.org> Tue, 24 May 2005 00:03:49 +0200 + +mdadm (1.9.0-2.3) unstable; urgency=high + + * Non-maintainer upload. + * Do not prevent postinst node creation when udev is being used; MAKEDEV + puts files into /dev/.static/dev with udev, which is needed so that device + nodes will be there even if udev is removed. Sorry for letting this slip + my mind and thanks to Steve Langasek for spotting this error. + * Leaving urgency at high to make sarge. + + -- martin f. krafft <madduck@debian.org> Sun, 22 May 2005 19:35:04 +0200 + +mdadm (1.9.0-2.2) unstable; urgency=high + + * Non-maintainer upload. + * High-urgency upload for sarge targeted RC bugfix. + * Move mdadm-raid back to S25 as it needs to run after modules have been + loaded at S20 (see followups to #294404, #301560). + * Verified that Steve Langasek's patch to config.c (see item 4 of the + 1.9.0-2.1 changelog) is necessary for `mdadm -A -s` to work. + (closes: #301560) + * Integrated patch by Erik van Konijnenburg to fix mdadm's --auto + option in the presence of --scan. + (closes: #294404, #273182, #284028, #310126). + * Modified mdrun to call mdadm with --auto in assembly mode. Removed code + which would auto-create 24 device nodes during system startup when udev + was used. + * Fixed next_free_md function in mdrun to iterate all 24 nodes instead of + using some fragile shell globbing, which did not work anyway. + * Prevent postinst node creation when udev is being used. + * Added a README.udev file to /usr/share/doc/mdadm. + + -- martin f. krafft <madduck@debian.org> Sun, 22 May 2005 12:57:56 +0200 + +mdadm (1.9.0-2.1) unstable; urgency=high + + * Non-maintainer upload. + * High-urgency upload for sarge targetted RC bugfix. + * Start mdadm-raid before udev on boot-up, so that mdadm device node + creation is honored, and support changing the init script ordering + on upgrades (closes: #294404). + * Fix mdadm --scan to prefer the values contained in /proc/partitions, + instead of picking up device node names at random from /dev. + * Teach mdrun to look at /dev/.static/dev instead of /.dev for udev + mounts requiring autostart (closes: #301560). + + -- Steve Langasek <vorlon@debian.org> Sun, 27 Mar 2005 21:59:12 -0800 + +mdadm (1.9.0-2) unstable; urgency=low + + * Patched is_standard() to accept /dev/md/* names as standard. + Thanks to Colin Watson <cjwatson@debian.org>. + (closes: Bug#296794) + * Added another typecast to make it compilable on amd64 with gcc-4.0. + Thanks to Andreas Jochens <aj@andaco.de>. + (closes: Bug#294217) + * Removed unnecessary second assignment to $BASE in mdrun. + Thanks to Colin Watson <cjwatson@debian.org>. + (closes: Bug#295433) + + -- Mario Joussen <joussen@debian.org> Sun, 6 Mar 2005 14:22:24 +0100 + +mdadm (1.9.0-1) unstable; urgency=high + + * New upstream release. + Solves problems with same UUID for each array. + Again a stable upstream version. + (closes: Bug#292282, Bug#293406, Bug#292784, Bug#290363, Bug#292715) + * Added some typecasts to make it compilable on amd64 with gcc-4.0. + Thanks to Andreas Jochens <aj@andaco.de>. + (closes: Bug#287638) + + -- Mario Joussen <joussen@debian.org> Sun, 6 Feb 2005 12:25:03 +0100 + +mdadm (1.8.1-1) unstable; urgency=low + + * New upstream release. + Fixed segfault if no config file present and --scan is used. + (closes: Bug#283425, Bug#282604, Bug#284024) + * Fixed typo in detailed help of grow mode. + (closes: Bug#286980) + * Added japanese debconf translation. Thanks to Hideki Yamane + <henrich@samba.gr.jp>. + (closes: Bug#281073) + * Fixed missing variable initialization causing segfaults. + + -- Mario Joussen <joussen@debian.org> Sun, 26 Dec 2004 14:44:31 +0100 + +mdadm (1.7.0-2) unstable; urgency=high + + * Changed debconf script to save the settings from the config file. + Thanks to Fabio Massimo Di Nitto <fabbione@fabbione.net> and + Frank Lichtenheld <djpig@debian.org> for the patch. + (closes: Bug#274208) + * Moved try to load md module inside the AUTOSTART if branch in + /etc/init.d/mdadm. + * Removed try to load md module from /etc/init.d/mdadm-raid. + + -- Mario Joussen <joussen@debian.org> Sun, 24 Oct 2004 19:48:06 +0200 + +mdadm (1.7.0-1) unstable; urgency=low + + * New upstream release. + (closes: Bug#267814) + + -- Mario Joussen <joussen@debian.org> Sun, 12 Sep 2004 20:48:33 +0200 + +mdadm (1.6.0-3) unstable; urgency=high + + * Added 'Conflicts: raidtools2 (<< 1.00.3-12.1)' because these packages + contain a mdrun.8 man page also. + (closes: Bug#268634, Bug#266527) + * Updated the french translation. + Thanks to Eric <eric-m@wanadoo.fr> + (closes: Bug#266251) + + -- Mario Joussen <joussen@debian.org> Sat, 28 Aug 2004 18:23:17 +0200 + +mdadm (1.6.0-2) unstable; urgency=low + + * Included version 0.97 of "Debian Software Root Raid Documentation". + * Now mdrun is only used if no mdadm.conf is present. + Thanks to Thomas Prokosch <7nrmi1s02@sneakemail.com>. + (closes: Bug#264059) + * Added man page for mdrun. + Thanks to Robert Collins <robertc@robertcollins.net>. + (closes: Bug#265480) + * Moved /etc/mdadm/debian.conf to /etc/default/mdadm. + (closes: Bug#254922) + * Added a little workaround to mdrun to interact better with udev. + Thanks to Fabio Massimo Di Nitto <fabbione@fabbione.net>. + (closes: Bug#259491) + * Updated Brazilian Portuguese translation. + Thanks to Andre Luis Lopes <andrelop@debian.org>. + (closes: Bug#264220) + + -- Mario Joussen <joussen@debian.org> Mon, 16 Aug 2004 22:10:59 +0200 + +mdadm (1.6.0-1) unstable; urgency=low + + * New upstream release. + Detect degraded arrays in --monitor mode now. + (closes: Bug#257357) + * Changed default to autostart RAID array. + (closes: Bug#250792) + * Fixed mdrun problem with kernel 2.6. + Thanks to Andre Tomt <andre@tomt.net> and Fabio Massimo Di Nitto + <fabbione@fabbione.net> + (closes: Bug#231823) + * Changed reuse warning to be less misleading. + (closes: Bug#253339) + + -- Mario Joussen <joussen@debian.org> Tue, 20 Jul 2004 21:40:33 +0200 + +mdadm (1.5.0-2) unstable; urgency=low + + * Added french debconf template. + Thanks to Eric Madesclair <eric-m@wanadoo.fr>. + (closes: Bug#231968) + + -- Mario Joussen <joussen@debian.org> Tue, 4 May 2004 21:29:19 +0200 + +mdadm (1.5.0-1) unstable; urgency=low + + * New upstream release. + * Rewrote debconf templates to avoid referring to debconf interface + widgets. + (closes: Bug#231221) + * Removed manual scan for RAID devices from init script. + (closes: Bug#233122, Bug#236762) + * Added creation of an udeb package. + (closes: Bug#243609) + * Added "Debian Software Root Raid Documentation". + Thanks to Lucas Albers <albersl@cs.montana.edu>. + + -- Mario Joussen <joussen@debian.org> Sun, 25 Apr 2004 16:16:06 +0200 + +mdadm (1.4.0-3) unstable; urgency=low + + * Updated to standards version 3.6.1.0 and debhelper 4. + * Corrected definition of BLKGETSIZE64 macro to compile with the + 2.6 kernel headers. + (closes: Bug#223191) + * Swichted to po-debconf to provide localized debconf templates. + (closes: Bug#225288) + + -- Mario Joussen <joussen@debian.org> Sun, 1 Feb 2004 19:30:53 +0100 + +mdadm (1.4.0-2) unstable; urgency=low + + * Corrected human readable size calculation. + (closes: Bug#225041) + * Added a warning about reusing hard disks and using the autostart + feature. + (closes: Bug#223790) + + -- Mario Joussen <joussen@debian.org> Thu, 25 Dec 2003 19:52:57 +0100 + +mdadm (1.4.0-1) unstable; urgency=low + + * New upstream release. + + -- Mario Joussen <joussen@debian.org> Sun, 7 Dec 2003 19:39:27 +0100 + +mdadm (1.3.0-2) unstable; urgency=low + + * Added upstream changelog to package. + + -- Mario Joussen <joussen@debian.org> Tue, 12 Aug 2003 21:51:59 +0200 + +mdadm (1.3.0-1) unstable; urgency=low + + * New upstream release. + (closes: Bug#191561, Bug#200921) + + -- Mario Joussen <joussen@debian.org> Thu, 31 Jul 2003 20:59:20 +0200 + +mdadm (1.2.0-1) unstable; urgency=low + + * New upstream release. (closes: Bug#183191) + * New version of mdrun that works properly with devfs and temporary + device directory. + (closes: Bug#182035) + * Added 'Conflicts: raidtools' because of a name clash with mdrun. + (closes: Bug#182960) + + -- Mario Joussen <joussen@debian.org> Sun, 16 Mar 2003 13:32:45 +0100 + +mdadm (1.0.1-4) unstable; urgency=low + + * Changed mdrun so that it can deal with partition statistics in + /proc/partitions. + (closes: Bug#174000, Bug#175130) + * Added russian (ru) debconf template translation. Thanks to Sergey + Spiridonov <sena@hurd.homeunix.org>. + + -- Mario Joussen <joussen@debian.org> Sun, 5 Jan 2003 13:14:45 +0100 + +mdadm (1.0.1-3) unstable; urgency=low + + * Fixed a bug in mdrun. (closes: Bug#167607) + + -- Mario Joussen <joussen@debian.org> Mon, 11 Nov 2002 07:53:23 +0100 + +mdadm (1.0.1-2) unstable; urgency=low + + * Fixed typo in help option. (closes: Bug#151533) + * Added a script that discovers and assembles all arrays automatically. + Thanks to Eduard Bloch <blade@debian.org>. + (closes: Bug#161699) + + -- Mario Joussen <joussen@debian.org> Fri, 1 Nov 2002 13:46:47 +0100 + +mdadm (1.0.1-1) unstable; urgency=low + + * New upstream release. + + -- Mario Joussen <joussen@debian.org> Thu, 30 May 2002 14:01:22 +0200 + +mdadm (0.8.2-1) unstable; urgency=low + + * New upstream release. + * Splitted up mdadm.templates in one file for each language. + * Added brazilian portuguese (pt_BR) debconf template translation. + Thanks to Andre Luis Lopes <andrelop@ig.com.br>. + (closes: Bug#141540) + + -- Mario Joussen <joussen@debian.org> Thu, 18 Apr 2002 19:31:00 +0200 + +mdadm (0.7.2-1) unstable; urgency=low + + * New upstream release. + * 'mdctl' was renamed to 'mdadm' upstream. + * Removed question about updating mdctl init script links. + + -- Mario Joussen <joussen@debian.org> Sat, 23 Mar 2002 02:50:51 +0100 + +mdctl (0.5-4) unstable; urgency=low + + * Added debconf template to ask the user if the init script links + should be updated. + + -- Mario Joussen <joussen@debian.org> Mon, 4 Mar 2002 22:53:37 +0100 + +mdctl (0.5-3) unstable; urgency=low + + * Splitted up init script in two parts. One starts the md array and the + other starts the raid monitor daemon. + (closes: Bug#136184) + + -- Mario Joussen <joussen@debian.org> Thu, 28 Feb 2002 22:45:57 +0100 + +mdctl (0.5-2) unstable; urgency=low + + * Included optimization in Makefile + (closes: Bug#127687) + * Removed Conflicts/Replaces/Provides: mdutils + (closes: Bug#127684, Bug#127719) + * Added an init script, which can start md arrays and the raid monitor + daemon + * MD devices are now created under /dev if necessary + * Added a sample configuration file + + -- Mario Joussen <mario@joussen.org> Sun, 13 Jan 2002 23:43:40 +0100 + +mdctl (0.5-1) unstable; urgency=low + + * Initial Release. + (closes: Bug#126610) + + -- Mario Joussen <mario@joussen.org> Wed, 26 Dec 2001 17:07:09 +0100 diff --git a/debian/checkarray b/debian/checkarray new file mode 100644 index 00000000..1fb97356 --- /dev/null +++ b/debian/checkarray @@ -0,0 +1,219 @@ +#!/bin/sh +# +# checkarray -- initiates a check run of an MD array's redundancy information. +# +# Copyright © martin f. krafft <madduck@debian.org> +# distributed under the terms of the Artistic Licence 2.0 +# +set -eu + +PROGNAME=${0##*/} + +about() +{ + echo "\ +$PROGNAME -- MD array (RAID) redundancy checker tool +Copyright © martin f. krafft <madduck@debian.org> +Released under the terms of the Artistic Licence 2.0" +} + +usage() +{ + about + echo " +Usage: $PROGNAME [options] [arrays] + +Valid options are: + -a|--all check all assembled arrays (ignores arrays in command line). + -s|--status print redundancy check status of devices. + -x|--cancel queue a request to cancel a running redundancy check. + -i|--idle perform check in a lowest scheduling class (idle) + -l|--slow perform check in a lower-than-standard scheduling class + -f|--fast perform check in higher-than-standard scheduling class + --realtime perform check in real-time scheduling class (DANGEROUS!) + -c|--cron honour AUTOCHECK setting in /etc/default/mdadm. + -q|--quiet suppress informational messages + (use twice to suppress error messages too). + -h|--help show this output. + -V|--version show version information. + +Examples: + $PROGNAME --all --idle + $PROGNAME --quiet /dev/md[123] + $PROGNAME -sa + $PROGNAME -x --all + +Devices can be specified in almost any format. The following are equivalent: + /dev/md0, md0, /dev/md/0, /sys/block/md0 + +You can also control the status of a check with /proc/mdstat file." +} + +SHORTOPTS=achVqQsxilf +LONGOPTS=all,cron,help,version,quiet,real-quiet,status,cancel,idle,slow,fast,realtime + +eval set -- $(getopt -o $SHORTOPTS -l $LONGOPTS -n $PROGNAME -- "$@") + +arrays='' +cron=0 +all=0 +quiet=0 +status=0 +action=check +ionice= + +for opt in $@; do + case "$opt" in + -a|--all) all=1;; + -s|--status) action=status;; + -x|--cancel) action=idle;; + -i|--idle) ionice=idle;; + -l|--slow) ionice=low;; + -f|--fast) ionice=high;; + --realtime) ionice=realtime;; + -c|--cron) cron=1;; + -q|--quiet) quiet=$(($quiet+1));; + -Q|--real-quiet) quiet=$(($quiet+2));; # for compatibility + -h|--help) usage; exit 0;; + -V|--version) about; exit 0;; + /dev/md/*|md/*) arrays="${arrays:+$arrays }md${opt#*md/}";; + /dev/md*|md*) arrays="${arrays:+$arrays }${opt#/dev/}";; + /sys/block/md*) arrays="${arrays:+$arrays }${opt#/sys/block/}";; + --) :;; + *) echo "$PROGNAME: E: invalid option: $opt. Try --help." >&2; exit 1;; + esac +done + +is_true() +{ + case "${1:-}" in + [Yy]es|[Yy]|1|[Tt]rue|[Tt]) return 0;; + *) return 1; + esac +} + +DEBIANCONFIG=/etc/default/mdadm +[ -r $DEBIANCONFIG ] && . $DEBIANCONFIG +if [ $cron = 1 ] && ! is_true ${AUTOCHECK:-false}; then + [ $quiet -lt 1 ] && echo "$PROGNAME: I: disabled in $DEBIANCONFIG ." >&2 + exit 0 +fi + +if [ ! -f /proc/mdstat ]; then + [ $quiet -lt 2 ] && echo "$PROGNAME: E: MD subsystem not loaded, or /proc unavailable." >&2 + exit 2 +fi + +if [ ! -d /sys/block ]; then + [ $quiet -lt 2 ] && echo "$PROGNAME: E: /sys filesystem not available." >&2 + exit 7 +fi + +if [ -z "$(ls /sys/block/md* 2>/dev/null)" ]; then + if [ $quiet -lt 2 ] && [ $cron != 1 ]; then + echo "$PROGNAME: W: no active MD arrays found." >&2 + echo "$PROGNAME: W: (maybe uninstall the mdadm package?)" >&2 + fi + exit 0 +fi + +if [ -z "$(ls /sys/block/md*/md/level 2>/dev/null)" ]; then + [ $quiet -lt 2 ] && echo "$PROGNAME: E: kernel too old, no support for redundancy checks." >&2 + exit 6 +fi + +if ! egrep -q '^raid([1456]|10)$' /sys/block/md*/md/level 2>/dev/null; then + [ $quiet -lt 1 ] && echo "$PROGNAME: I: no redundant arrays present; skipping checks..." >&2 + exit 0 +fi + +if [ -z "$(ls /sys/block/md*/md/sync_action 2>/dev/null)" ]; then + [ $quiet -lt 2 ] && echo "$PROGNAME: E: no kernel support for redundancy checks." >&2 + exit 3 +fi + +[ $all = 1 ] && arrays="$(ls -d1 /sys/block/md* | cut -d/ -f4)" + +for array in $arrays; do + MDBASE=/sys/block/$array/md + + if [ ! -e $MDBASE/sync_action ]; then + [ $quiet -lt 1 ] && echo "$PROGNAME: I: skipping non-redundant array $array." >&2 + continue + fi + + read cur_status < $MDBASE/sync_action + + if [ $action = status ]; then + echo "$array: $cur_status" + continue + fi + + if [ ! -w $MDBASE/sync_action ]; then + [ $quiet -lt 2 ] && echo "$PROGNAME: E: $MDBASE/sync_action not writeable." >&2 + exit 4 + fi + + if [ "$(cat $MDBASE/array_state)" = 'read-auto' ]; then + [ $quiet -lt 1 ] && echo "$PROGNAME: W: array $array in auto-read-only state, skipping..." >&2 + continue + fi + + case "$action" in + idle) + echo $action > $MDBASE/sync_action + [ $quiet -lt 1 ] && echo "$PROGNAME: I: cancel request queued for array $array." >&2 + ;; + + check) + if [ "$cur_status" != idle ]; then + [ $quiet -lt 2 ] && echo "$PROGNAME: W: array $array not idle, skipping..." >&2 + continue + fi + + # check if the array created recently and skip test if it is + created=$(mdadm --detail /dev/$array 2>/dev/null | + sed -n 's/.*Creation Time *://p' ) + if [ -n "$created" ]; then + created=$(date +%s -d "$created" 2>/dev/null) + fi + if [ -n "$created" ]; then + now=$(date +%s) + if [ "$created" -lt "$now" -a \ + "$created" -gt "$(($now - 14 * 24 * 60 * 60))" ]; then + [ $quiet -lt 2 ] && echo "$PROGNAME: I: array $array created recently, skipping..." >&2 + continue + fi + fi + + # queue request for the array. The kernel will make sure that these requests + # are properly queued so as to not kill one of the array. + echo $action > $MDBASE/sync_action + [ $quiet -lt 1 ] && echo "$PROGNAME: I: check queued for array $array." >&2 + + case "$ionice" in + idle) ioarg='-c3'; renice=15;; + low) ioarg='-c2 -n7'; renice=5;; + high) ioarg='-c2 -n0'; renice=0;; + realtime) ioarg='-c1 -n4'; renice=-5;; + *) break;; + esac + + resync_pid= wait=5 + while [ $wait -gt 0 ]; do + wait=$((wait - 1)) + resync_pid=$(ps -ef | awk -v dev=$array 'BEGIN { pattern = "^\\[" dev "_resync]$" } $8 ~ pattern { print $2 }') + if [ -n "$resync_pid" ]; then + [ $quiet -lt 1 ] && echo "$PROGNAME: I: selecting $ionice I/O scheduling class and $renice niceness for resync of $array." >&2 + ionice -p "$resync_pid" $ioarg 2>/dev/null || : + renice -n $renice -p "$resync_pid" 1>/dev/null 2>&1 || : + break + fi + sleep 1 + done + ;; + esac + +done + +exit 0 diff --git a/debian/compat b/debian/compat new file mode 100644 index 00000000..1e8b3149 --- /dev/null +++ b/debian/compat @@ -0,0 +1 @@ +6 diff --git a/debian/control b/debian/control new file mode 100644 index 00000000..2d29ac09 --- /dev/null +++ b/debian/control @@ -0,0 +1,32 @@ +Source: mdadm +Section: admin +Priority: optional +Maintainer: Debian mdadm maintainers <pkg-mdadm-devel@lists.alioth.debian.org> +Uploaders: Dimitri John Ledkov <xnox@debian.org> +Build-Depends: debhelper (>= 6.0.7~), po-debconf, groff-base +Standards-Version: 3.9.6 +Vcs-Git: git://anonscm.debian.org/pkg-mdadm/mdadm.git +Vcs-Browser: https://anonscm.debian.org/cgit/pkg-mdadm/mdadm.git +Homepage: http://neil.brown.name/blog/mdadm + +Package: mdadm +Architecture: any +Depends: ${shlibs:Depends}, udev, ${misc:Depends}, lsb-base, debconf, initscripts +Recommends: default-mta | mail-transport-agent, kmod | module-init-tools +Description: tool to administer Linux MD arrays (software RAID) + The mdadm utility can be used to create, manage, and monitor MD + (multi-disk) arrays for software RAID or multipath I/O. + . + This package automatically configures mdadm to assemble arrays during the + system startup process. If not needed, this functionality can be disabled. + +Package: mdadm-udeb +Section: debian-installer +XC-Package-Type: udeb +Architecture: any +Depends: ${shlibs:Depends} +Description: tool to administer Linux MD arrays (software RAID) + The mdadm utility can be used to create, manage, and monitor MD + (multi-disk) arrays for software RAID or multipath I/O. + . + This is a minimal package used by the debian-installer. diff --git a/debian/copyright b/debian/copyright new file mode 100644 index 00000000..f6eaec91 --- /dev/null +++ b/debian/copyright @@ -0,0 +1,21 @@ +This package was debianized by Mario Jou/3en <joussen@debian.org> on +Wed, 26 Dec 2001 17:07:09 +0100. +Martin F. Krafft <madduck@debian.org> took over on +Tue, 16 May 2006 13:21:06 -0500 + +The mdadm source was downloaded from + http://www.kernel.org/pub/linux/utils/raid/mdadm/ + +Upstream Author: Neil Brown <neilb@suse.de> + +Copyright © 2001-2006 Neil Brown <neilb@suse.de> +Packaging copyright © 2001-2005 Mario Jou/3en <joussen@debian.org> +Packaging copyright © 2005-2008 Martin F. Krafft <madduck@debian.org> + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +On Debian GNU/Linux systems, the complete text of the GNU General +Public License can be found in '/usr/share/common-licenses/GPL'. diff --git a/debian/initramfs/hook b/debian/initramfs/hook new file mode 100644 index 00000000..74ed7e38 --- /dev/null +++ b/debian/initramfs/hook @@ -0,0 +1,266 @@ +#!/bin/sh +# +# Copyright © 2006-2008 Martin F. Krafft <madduck@debian.org>, +# 2012 Michael Tokarev <mjt@tls.msk.ru> +# based on the scripts in the initramfs-tools package. +# released under the terms of the Artistic Licence. +# +set -eu + +PREREQ= +prereqs() +{ + echo "$PREREQ" +} + +case "${1:-}" in + prereqs) + prereqs + exit 0 + ;; +esac + +is_true() +{ + case "${1:-}" in + [Yy]es|[Yy]|1|[Tt]rue|[Tt]) return 0;; + *) return 1;; + esac +} + +write() +{ + local PREFIX; PREFIX=$1; shift + echo "${PREFIX}: mdadm: $@" >&2 +} + +info() +{ + is_true ${VERBOSE:-false} && write I "$@" || : +} + +warn() +{ + write W "$@" +} + +err() +{ + write E "$@" +} + +MDADM=/sbin/mdadm +MDMON=/sbin/mdmon +[ -x "$MDADM" ] || exit 0 + +[ -r /usr/share/initramfs-tools/hook-functions ] || exit 0 +. /usr/share/initramfs-tools/hook-functions + +# copy the binary as early as possible +copy_exec $MDADM /sbin +copy_exec $MDMON /sbin + +# copy all modules into the initramfs, just for safety. +# we copy raid456 / raid5+raid6 because the hook script just won't do +# anything when the module cannot be found. +modules="linear multipath raid0 raid1 raid456 raid5 raid6 raid10" +for mod in $modules; do manual_add_modules $mod; done + +# read in the configuration +CONFIG=/etc/mdadm/mdadm.conf +ALTCONFIG=/etc/mdadm.conf +[ ! -f $CONFIG ] && [ -f $ALTCONFIG ] && CONFIG=$ALTCONFIG || : + +DEBIANCONFIG=/etc/default/mdadm +INITRDSTART=all +[ -s $DEBIANCONFIG ] && . $DEBIANCONFIG +[ -z "$INITRDSTART" ] && INITRDSTART=none + +DESTMDADMCONF=$DESTDIR/etc/mdadm/mdadm.conf +DESTCONFIG=$DESTDIR/conf/mdadm + +if [ -f $CONFIG ]; then + homehost="$(sed -ne 's,^[[:space:]]*HOMEHOST[[:space:]]*,,p' $CONFIG)" +fi +if [ -z "${homehost:-}" ] || [ "${homehost:-}" = '<system>' ]; then + echo "MD_HOMEHOST='$(hostname)'" > $DESTCONFIG +fi + +install_config() +{ + # install the configuration file + mkdir -p ${2%/*} + # only copy ARRAY/DEVICE/HOMEHOST/AUTO lines, and merge continuation lines into one + if [ -f "$1" ] ; then + sed -e :a -re '$!N;s/\n[[:space:]]+/ /;ta' -ne '/^(ARRAY|DEVICE|HOMEHOST|AUTO)/P;D' $1 > $2 + fi +} + +if [ ! -f $CONFIG ]; then + # there is no configuration file, so let's create one + + if /usr/share/mdadm/mkconf generate $CONFIG; then + # all is well + install_config $CONFIG $DESTMDADMCONF + info "auto-generated the mdadm.conf configuration file." + else + # we failed to auto-generate, so let the emergency procedure take over + warn "failed to auto-generate the mdadm.conf file." + fi + +else + + if grep -q '^ARRAY' $CONFIG; then + + # this is the ideal case + install_config $CONFIG $DESTMDADMCONF + info "using configuration file: $CONFIG" + + else + + # the file defines no ARRAYs. We better create a temporary file to be sure. + + warn "$CONFIG defines no arrays." + + mkdir --parents ${DESTMDADMCONF%/*} + tmpfile="${DESTMDADMCONF}.tmp" + if /usr/share/mdadm/mkconf > $tmpfile; then + # all is well, we now have a temporary configuration file + info "auto-generated temporary mdadm.conf configuration file." + install_config $tmpfile $DESTMDADMCONF + else + # stuff's really broke, as we failed to generate a temporary file. + # let's hope the unchecked file works, provided it contains at least one + # ARRAY statement... + warn "failed to auto-generate temporary mdadm.conf file." + if grep -q '^ARRAY' $CONFIG; then + warn "using the unchecked file and hoping for the best..." + install_config $CONFIG $DESTMDADMCONF + fi + fi + rm -f $tmpfile + + fi + +fi + +# if at this point, $DESTMDADMCONF does not exist or it does not contain any +# ARRAY statements, we must let the initramfs handle stuff. +if [ ! -f $DESTMDADMCONF ]; then + warn "no configuration file available." + info "letting initramfs assemble auto-detected arrays." + exit 0 +elif ! grep -q '^ARRAY' $DESTMDADMCONF; then + warn "no arrays defined in configuration file." + info "letting initramfs assemble auto-detected arrays." + exit 0 +else + # obtain devices list from config file, honouring multiline entries + devices="$( + dev= + while read line; do + case "$line" in + (ARRAY*) :;; + (*) continue;; + esac + for atom in $line; do + case "$atom" in + (/dev*) dev=$atom;; + esac + done + + # /dev/mdX and /dev/md/X are the same, really + case "$dev" in + "") continue ;; + (/dev/md/*) alt=/dev/md${dev##*/};; + (/dev/md*) alt=/dev/md/${dev#/dev/md};; + (*) + err "unknown device encountered: $dev" + warn_emergency + exit 0 + ;; + esac + if [ ! -b "$dev" ] && [ -b "$alt" ]; then + dev="$alt" + fi + + echo "$dev" + done < $DESTMDADMCONF)" || exit $? +fi + +if [ "$INITRDSTART" != none ] && [ -n "$devices" ]; then + + devs= + for dev in $devices; do + case "$INITRDSTART " in + all|*${dev}[[:space:]]*) + case "$devs " in # uniquiness + (*${dev}\ *) :;; + (*) devs="${devs:+$devs }$dev" ;; + esac + ;; + *) :;; + esac + done + + # make sure the configuration file knows about all running devices + $MDADM --detail --scan | while read array device params; do + uuid=${params#*UUID=}; uuid=${uuid%% *} + if ! grep -qi "uuid=$uuid" $DESTMDADMCONF; then + warn "the array $device with UUID $uuid" + warn "is currently active, but it is not listed in mdadm.conf. if" + warn "it is needed for boot, then YOUR SYSTEM IS NOW UNBOOTABLE!" + warn "please inspect the output of /usr/share/mdadm/mkconf, compare" + warn "it to $CONFIG, and make the necessary changes." + fi + done + + for i in $INITRDSTART; do + case "$INITRDSTART" in all) break;; *) :;; esac + case "$devs" in + *${i}*) continue;; + *) :;; + esac + + warn "I am supposed to start $i from the initial ramdisk," + warn "yet I cannot find the array in the configuration file." + warn "I am thus reverting to starting all arrays." + INITRDSTART=all + break + done + + if [ "$INITRDSTART" = all ]; then + echo "MD_DEVS=all" >> $DESTCONFIG + else + echo "MD_DEVS='$devs'" >> $DESTCONFIG + fi + + if [ "$INITRDSTART" = all ]; then + info "will start all available MD arrays from the initial ramdisk." + else + for i in $devs; do + info "will start MD array $i from the initial ramdisk." + done + fi + + # Copy udev rules, which udev no longer does + for rules_file in 63-md-raid-arrays.rules 64-md-raid-assembly.rules + do + for rules_folder in /lib/udev/rules.d /etc/udev/rules.d; do + if [ -f $rules_folder/$rules_file ]; then + mkdir -p $DESTDIR$rules_folder + cp $rules_folder/$rules_file $DESTDIR$rules_folder/$rules_file + fi + done + done + +else + echo "MD_DEVS=none" >> $DESTCONFIG + info "no MD arrays will be started from the initial ramdisk." +fi + +# only output this on Debian systems +[ -s /etc/default/mdadm ] && \ + info 'use `dpkg-reconfigure --priority=low mdadm` to change this.' + +exit 0 diff --git a/debian/initramfs/script.local-top b/debian/initramfs/script.local-top new file mode 100644 index 00000000..9450bb72 --- /dev/null +++ b/debian/initramfs/script.local-top @@ -0,0 +1,101 @@ +#!/bin/sh +# +# Copyright © 2006-2008 Martin F. Krafft <madduck@debian.org>, +# 2012 Michael Tokarev <mjt@tls.msk.ru> +# based on the scripts in the initramfs-tools package. +# released under the terms of the Artistic Licence. +# +set -eu + +case ${1:-} in + prereqs) echo "multipath"; exit 0;; +esac + +. /scripts/functions + +maybe_break pre-mdadm + +if [ -e /scripts/local-top/md ]; then + log_warning_msg "old md initialisation script found, getting out of its way..." + exit 0 +fi + +MDADM=/sbin/mdadm +[ -x "$MDADM" ] || exit 0 + +verbose() +{ + case "$quiet" in y*|Y*|1|t*|T*) + return 1;; + *) + return 0;; + esac +} + +MD_DEVS=all +[ -s /conf/mdadm ] && . /conf/mdadm + +if [ "$MD_DEVS" = none ]; then + verbose && + log_warning_msg "INITRDSTART set to \"none\" in /etc/default/mdadm, not assembling raid arrays" + exit 0 +fi + +if [ ! -f /proc/mdstat ] && ! modprobe -q md_mod; then + verbose && log_warning_msg "failed to load module md_mod." +fi +if [ ! -f /proc/mdstat ]; then + verbose && log_warning_msg "cannot initialise MD subsystem (/proc/mdstat missing)" + exit 0 +fi + +# handle /dev/md/X nodes +mkdir -p /dev/md + +CONFIG=/etc/mdadm/mdadm.conf +# in case the hook failed to install a configuration file, this is our last +# attempt... the "emergency procedure"... <drumroll> +if [ ! -e $CONFIG ]; then + log_warning_msg "missing mdadm.conf file, trying to create one..." + mkdir -p ${CONFIG%/*} + echo DEVICE partitions > $CONFIG + $MDADM --examine --scan >> $CONFIG + if [ -s $CONFIG ]; then + verbose && log_success_msg "mdadm.conf created." + else + verbose && log_failure_msg "could not create mdadm.conf, the boot will likely fail." + fi + MD_DEVS=all +fi + +if [ "$MD_DEVS" = all ]; then + + verbose && log_begin_msg "Assembling all MD arrays" + extra_args='' + [ -n "${MD_HOMEHOST:-}" ] && extra_args="--homehost=$MD_HOMEHOST" + if $MDADM --assemble --scan --run --auto=yes${extra_args:+ $extra_args}; then + verbose && log_success_msg "assembled all arrays." + else + log_failure_msg "failed to assemble all arrays." + fi + verbose && log_end_msg + +else + for dev in $MD_DEVS; do + + verbose && log_begin_msg "Assembling MD array $dev" + if $MDADM --assemble --scan --run --auto=yes $dev; then + verbose && log_success_msg "started $dev" + else + log_failure_msg "failed to start $dev" + fi + verbose && log_end_msg + + done +fi + +wait_for_udev 10 + +maybe_break post-mdadm + +exit 0 diff --git a/debian/mdadm-raid b/debian/mdadm-raid new file mode 100644 index 00000000..6d4d6a99 --- /dev/null +++ b/debian/mdadm-raid @@ -0,0 +1,256 @@ +#!/bin/sh +# +# Start all arrays specified in the configuration file. +# +# Copyright © 2001-2005 Mario Jou/3en <joussen@debian.org> +# Copyright © 2005-2008 Martin F. Krafft <madduck@debian.org> +# Distributable under the terms of the GNU GPL version 2. +# +### BEGIN INIT INFO +# Provides: mdadm-raid +# Required-Start: mountkernfs hostname +# Should-Start: udev multipath-tools-boot +# X-Start-Before: checkfs mountall +# Required-Stop: mountkernfs +# Should-Stop: udev +# X-Stop-After: umountfs +# Default-Start: S +# Default-Stop: 0 6 +# Short-Description: MD array assembly +# Description: This script assembles a system's MD arrays, according to +# the settings in /etc/mdadm/mdadm.conf and the preferences +# in /etc/default/mdadm. +### END INIT INFO +# +set -eu + +MDADM=/sbin/mdadm +CONFIG=/etc/mdadm/mdadm.conf +ALTCONFIG=/etc/mdadm.conf +DEBIANCONFIG=/etc/default/mdadm + +test -x "$MDADM" || exit 0 + +STATEDIR=/run/mdadm +test -f $DEBIANCONFIG && . $DEBIANCONFIG + +. /lib/lsb/init-functions + +short_dev() +{ + local dev; dev=${1##*/} + case "$dev" in + md*|md_*|mdp*|mdp_*) echo "$dev";; + d*) echo "md_${dev}";; + *) echo "md${dev}";; + esac +} + +log() +{ + case "$1" in + [[:digit:]]*) success=$1; shift;; + *) :;; + esac + log_action_begin_msg "$1"; shift + log_action_end_msg ${success:-0} "$*" +} + +log_dev() +{ + success=${1:-}; shift + dev=${1:-}; shift + log $success "${PREFIX:-} $(short_dev ${dev:-})" "$*" +} + +log_notice() +{ + log 0 "${PREFIX:-}s" "$*" +} + +log_problem() +{ + log 1 "${PREFIX:-}s" "$*" +} + +is_true() +{ + case "${1:-}" in + [Yy]es|[Yy]|1|[Tt]rue|[Tt]) return 0;; + *) return 1;; + esac +} + +case "${1:-}" in + + start) + PREFIX="Assembling MD array" + + if [ ! -f /proc/mdstat ] && [ -x "$(command -v modprobe)" ] ; then + modprobe -q md 2>/dev/null || : + fi + if [ ! -f /proc/mdstat ]; then + log_problem "failed to load MD subsystem" + exit 0 + fi + + if [ ! -f $CONFIG -a ! -f $ALTCONFIG ]; then + log_problem "no $CONFIG file" + exit 0 + fi + + # handle devfs-style names and version-1 devices + # fail gracefully in case we're on a read-only filesystem, in which + # case it's safe to assume that the admin knows what s/he's doing. + # See (#382876). + mkdir --parent /dev/md || : + + # ugly hack because shell sucks + IFSOLD=${IFS:-} + IFS=' +' + for line in $($MDADM --assemble --scan --auto=yes --symlink=no 2>&1); do + IFS=$IFSOLD + set -- $line + shift + + case "$@" in + + 'No arrays found in config file'*) + # no point in carrying on. + shift + log_notice "no $*" + exit 0 + ;; + + 'Unknown keyword'*) + # warn only + if [ -x $(command -v logger >/dev/null) ]; then + logger -t mdadm -p syslog.warning -- "$*" + elif [ -w /dev/console ]; then + echo "mdadm: $*" > /dev/console + else + echo "mdadm: $*" >&2 + fi + ;; + + *' is already active.') + log_dev 0 $1 "already running" + ;; + + *'has been started with '[[:digit:]]*' drive'*' (out of '[[:digit:]]*') and '[[:digit:]]*' spare'*'.') + log_dev 0 $1 "initialising [$6/${10%).}]" + ;; + + *'has been started with '[[:digit:]]*' drive'*' (out of '[[:digit:]]*').') + log_dev 0 $1 "degraded [$6/${10%).}]" + ;; + + *'has been started with '[[:digit:]]*' drive'*'.') + log_dev 0 $1 "started [$6/$6]" + ;; + + *'assembled from '[[:digit:]]*' drive'*' - not enough to start the array.') + log_dev 1 $1 "not enough devices" + ;; + + 'no devices found for '*) + log_dev 1 $5 "no devices found" + ;; + + 'failed to RUN_ARRAY '*': Input/output error') + log_dev 1 ${4%:} "RUN_ARRAY input/output error" + ;; + + *) :;; + esac + done + ret=$? + + log_action_begin_msg "Generating udev events for MD arrays" + [ -d $STATEDIR ] || mkdir -p $STATEDIR + for uevent in /sys/block/md*/uevent; do + test -e $uevent || break + sentinel=${uevent#/sys/block/}; sentinel=${sentinel%/uevent}-uevent + test -e $STATEDIR/$sentinel && continue + test -w $uevent || continue + echo add > $uevent + test -d $STATEDIR && : > $STATEDIR/$sentinel + done + log_action_end_msg 0 + + exit $ret + ;; + + stop) + PREFIX="Stopping MD array" + + if [ ! -f /proc/mdstat ]; then + log_problem "no MD subsystem loaded" + exit 0 + fi + + # ugly hack because shell sucks + IFSOLD=${IFS:-} + IFS=' +' + set +e + for line in $($MDADM --stop --scan 2>&1); do + set -e + IFS=$IFSOLD + set -- $line + shift + case "$@" in + + 'Unknown keyword'*) + # warn only + if [ -x $(command -v logger >/dev/null) ]; then + logger -t mdadm -p syslog.warning -- "$*" + elif [ -w /dev/console ]; then + echo "mdadm: $*" > /dev/console + else + echo "mdadm: $*" >&2 + fi + ;; + + 'stopped '*) + log_dev 0 $2 stopped + ;; + + 'fail to stop array '*': Device or resource busy') + log_dev 1 ${5%:} busy + ;; + + *) :;; + esac + done || exit $? + + rm -f $STATEDIR/md*-uevent + ;; + + restart|force-reload) + ${0:-} stop + ${0:-} start + ;; + + reload) + PREFIX="Reloading MD array" + log_notice "never anything to do" + ;; + + status) + if [ ! -f /proc/mdstat ]; then + log_problem "no MD subsystem loaded" + exit 1 + else + cat /proc/mdstat + fi + ;; + + *) + echo "Usage: ${0:-} {start|stop|restart}" >&2 + exit 1;; + +esac + +exit 0 diff --git a/debian/mdadm-waitidle b/debian/mdadm-waitidle new file mode 100644 index 00000000..920272c7 --- /dev/null +++ b/debian/mdadm-waitidle @@ -0,0 +1,56 @@ +#!/bin/sh +# This script is not used when systemd is running +### BEGIN INIT INFO +# Provides: mdadm-waitidle +# Required-Start: +# Required-Stop: +# Should-Stop: halt reboot kexec +# X-Stop-After: umountroot +# Default-Start: +# Default-Stop: 0 6 +# Short-Description: Wait for MD arrays to become idle +# Description: Waits until all MD arrays are in idle and synced state +# before halt/reboot. +### END INIT INFO +# +set -eu + +MDADM=/sbin/mdadm +test -x "$MDADM" || exit 0 +test -f /proc/mdstat || exit 0 + +. /lib/lsb/init-functions + +case "${1:-}" in + + start|restart|force-reload) + # nothing, the only reason the script is here is to stop arrays + ;; + + stop) + sync + wait= + for md in /sys/block/md*/md ; do + [ -d "$md" ] || continue + [ "$wait" ] || log_action_begin_msg "Waiting for MD arrays to become idle" + wait=y + [ -w $md/sync_action ] && echo idle > $md/sync_action + done + if [ "$wait" ]; then + # mdadm --wait-clean has a short internal timeout + if $MDADM --wait-clean --scan; then + log_action_end_msg 0 + else + log_action_end_msg 1 + sleep 1 + fi + fi + ;; + + *) + echo "Usage: ${0:-} stop" >&2 + exit 1;; + +esac + +exit 0 diff --git a/debian/mdadm.config b/debian/mdadm.config new file mode 100644 index 00000000..97352662 --- /dev/null +++ b/debian/mdadm.config @@ -0,0 +1,174 @@ +#!/bin/sh +# Copyright © 2001-2004 Mario Jou/3en <joussen@debian.org> +# Copyright © martin f. krafft <madduck@debian.org> +# Distributable under the terms of the GNU GPL version 2. +# +. /usr/share/debconf/confmodule +# see #369953 for ordering +set -eu + +CONFIG=/etc/mdadm/mdadm.conf +ALTCONFIG=/etc/mdadm.conf +[ ! -f $CONFIG ] && [ -f $ALTCONFIG ] && CONFIG=$ALTCONFIG + +DEBIANCONFIG=/etc/default/mdadm + +db_get mdadm/initrdstart || : +INITRDSTART="$RET" + +if [ -s $DEBIANCONFIG ] ; then + AUTOCHECK=true + START_DAEMON=true + MAILADDR=root + + [ -f $DEBIANCONFIG ] && . $DEBIANCONFIG + if [ -f $CONFIG ]; then + MAILADDR=$(sed -rne 's/^MAILADDR[[:space:]]*([^[:space:]]+).*/\1/p' $CONFIG) + fi + + [ -n "$AUTOCHECK" ] && db_set mdadm/autocheck "$AUTOCHECK" + [ -n "$START_DAEMON" ] && db_set mdadm/start_daemon "$START_DAEMON" + [ -n "$MAILADDR" ] && db_set mdadm/mail_to "$MAILADDR" +fi + +chrooted() { + test "$(stat -c "%d/%i" /)" != "$(stat -Lc "%d/%i" /proc/1/root 2>/dev/null)" +} + +get_root_raiddev() { + local rootraiddev; + rootraiddev="$(df --portability / | sed -rne 's,^(/dev/[^[:space:]]+).*,\1,p')" + if ! mdadm --detail $rootraiddev >/dev/null 2>&1; then + return 1 + fi + echo ${rootraiddev%p[0-9]*} + return 0 +} + +get_initrdstart() { + db_fget mdadm/initrdstart seen + if chrooted || [ "$RET $INITRDSTART" = "false all" ]; then + echo all + return 1 + fi + + get_root_raiddev || echo all +} + +[ -z "$INITRDSTART" ] && INITRDSTART="$(get_initrdstart)" || : +[ -n "$INITRDSTART" ] && db_set mdadm/initrdstart "$INITRDSTART" + +INITRDSTART_PRIORITY=high +if chrooted; then + INITRDSTART_PRIORITY=medium +fi + +db_capb escape + +msg=intro; suffix=''; error=0 +while true; do + db_metaget mdadm/initrdstart_msg_${msg} extended_description || : + db_subst mdadm/initrdstart msg "$(echo -n "${RET}${suffix}" | debconf-escape -e)" + ret=0; db_input $INITRDSTART_PRIORITY mdadm/initrdstart || ret=$? + db_go + + if [ $error -ne 0 ] && [ $ret -eq 30 ]; then + # there was an error in a previous run of this loop, but the above question + # was not asked, so we better exit the endless loop... + echo "W: mdadm: unable to determine MD arrays needed for boot." >&2 + echo "W: mdadm: falling back to starting all of them..." >&2 + INITRDSTART=all + break + fi + + db_get mdadm/initrdstart + INITRDSTART="$(echo $RET | tr , ' ')" + + case "$INITRDSTART" in + ''|none) INITRDSTART=none; break;; + all) break;; + + *) + arrays='' + for i in $INITRDSTART; do + + # standardise by prefixing /dev/ + i="/dev/${i#/dev/}" + + # remove partition from partitionable array + i="${i%p[0-9]*}" + + if [ ! -e "$i" ]; then + error=1; msg=errexist; suffix=": $i"; break + fi + + if [ ! -b "$i" ]; then + t="$(readlink -nf "$i")" + if [ ! -b "$t" ]; then + suffix=": $t" + else + suffix=": $i" + fi + error=1; msg=errblock; break + fi + + if ! mdadm --detail "$i" >/dev/null 2>&1; then + error=1; msg=errmd; suffix=": $i"; break + fi + + if [ -f $CONFIG ]; then + if ! egrep -q "^(ARRAY)?[[:space:]]+$i([[:space:]]+|$)" $CONFIG; then + t="$(readlink -nf "$i")" + if [ -L "$i" ] \ + && egrep -q "^(ARRAY)?[[:space:]]+$t([[:space:]]|$)+" $CONFIG; then + i="$t" + else + db_set mdadm/initrdstart_notinconf false + db_fset mdadm/initrdstart_notinconf seen false + db_subst mdadm/initrdstart_notinconf array "$i" + db_subst mdadm/initrdstart_notinconf config "$CONFIG" + db_input high mdadm/initrdstart_notinconf || : + db_go + db_get mdadm/initrdstart_notinconf + if [ "$RET" = true ]; then + error=0 + else + db_fset mdadm/initrdstart seen false + db_set mdadm/initrdstart all + error=1; msg=errconf; suffix=": $i"; break + fi + fi + fi + fi + + # uniquely add device name + echo $arrays | egrep -q "\b${i}\b" || arrays="${arrays:+$arrays }$i" + done + + if [ $error -eq 0 ]; then + INITRDSTART="$arrays" + # exit the while true loop + break + fi + ;; + esac +done + +db_set mdadm/initrdstart "$INITRDSTART" + +if [ "$INITRDSTART" != all ]; then + db_input high mdadm/autostart || : + db_go +fi + +db_input medium mdadm/autocheck || : +db_go + +db_input medium mdadm/start_daemon || : +db_go + +db_get mdadm/start_daemon || : +if [ "$RET" = true ]; then + db_input medium mdadm/mail_to || : + db_go +fi diff --git a/debian/mdadm.cron.d b/debian/mdadm.cron.d new file mode 100644 index 00000000..309d180e --- /dev/null +++ b/debian/mdadm.cron.d @@ -0,0 +1,12 @@ +# +# cron.d/mdadm -- schedules periodic redundancy checks of MD devices +# +# Copyright © martin f. krafft <madduck@madduck.net> +# distributed under the terms of the Artistic Licence 2.0 +# + +# By default, run at 00:57 on every Sunday, but do nothing unless the day of +# the month is less than or equal to 7. Thus, only run on the first Sunday of +# each month. crontab(5) sucks, unfortunately, in this regard; therefore this +# hack (see #380425). +57 0 * * 0 root if [ -x /usr/share/mdadm/checkarray ] && [ $(date +\%d) -le 7 ]; then /usr/share/mdadm/checkarray --cron --all --idle --quiet; fi diff --git a/debian/mdadm.cron.daily b/debian/mdadm.cron.daily new file mode 100644 index 00000000..d5ac1ae0 --- /dev/null +++ b/debian/mdadm.cron.daily @@ -0,0 +1,18 @@ +#!/bin/sh +# +# cron.daily/mdadm -- daily check that MD devices are functional +# +# Copyright © 2008 Paul Slootman <paul@debian.org> +# distributed under the terms of the Artistic Licence 2.0 + +# As recommended by the manpage, run +# mdadm --monitor --scan --oneshot +# every day to ensure that any degraded MD devices don't go unnoticed. +# Email will go to the address specified in /etc/mdadm/mdadm.conf . +# +set -eu + +MDADM=/sbin/mdadm +[ -x $MDADM ] || exit 0 # package may be removed but not purged + +exec $MDADM --monitor --scan --oneshot diff --git a/debian/mdadm.doc-base.faq b/debian/mdadm.doc-base.faq new file mode 100644 index 00000000..3fff4504 --- /dev/null +++ b/debian/mdadm.doc-base.faq @@ -0,0 +1,9 @@ +Document: mdadm-faq +Title: mdadm Debian FAQ +Author: martin f. krafft +Abstract: The document answers frequently asked questions about Debian's mdadm +Section: System/Administration + +Format: text +Index: /usr/share/doc/mdadm/FAQ.gz +Files: /usr/share/doc/mdadm/FAQ.gz diff --git a/debian/mdadm.doc-base.recipes b/debian/mdadm.doc-base.recipes new file mode 100644 index 00000000..d1069682 --- /dev/null +++ b/debian/mdadm.doc-base.recipes @@ -0,0 +1,9 @@ +Document: mdadm-readme-recipes +Title: mdadm Debian recipes +Author: David Pashley +Abstract: The document contains some common recipes for mdadm usage on Debian +Section: System/Administration + +Format: text +Index: /usr/share/doc/mdadm/README.recipes.gz +Files: /usr/share/doc/mdadm/README.recipes.gz diff --git a/debian/mdadm.docs b/debian/mdadm.docs new file mode 100644 index 00000000..830665f4 --- /dev/null +++ b/debian/mdadm.docs @@ -0,0 +1,7 @@ +TODO +debian/README.recipes +debian/README.checkarray +debian/FAQ +ANNOUNCE-* +external-reshape-design.txt +mdmon-design.txt diff --git a/debian/mdadm.init b/debian/mdadm.init new file mode 100644 index 00000000..8b603796 --- /dev/null +++ b/debian/mdadm.init @@ -0,0 +1,94 @@ +#!/bin/sh +# +# Start the MD monitor daemon for all active MD arrays if desired. +# This script is not used under systemd. +# +# Copyright © 2001-2005 Mario Jou/3en <joussen@debian.org> +# Copyright © 2005-2009 Martin F. Krafft <madduck@debian.org> +# Distributable under the terms of the GNU GPL version 2. +# +### BEGIN INIT INFO +# Provides: mdadm +# Required-Start: $local_fs $syslog mdadm-raid +# Required-Stop: $local_fs $syslog sendsigs mdadm-raid +# Default-Start: 2 3 4 5 +# Default-Stop: 0 1 6 +# Short-Description: MD monitoring daemon +# Description: mdadm provides a monitor mode, in which it will scan for +# problems with the MD devices. If a problem is found, the +# administrator is alerted via email, or a custom script is +# run. +### END INIT INFO +# +set -eu + +MDADM=/sbin/mdadm +MDMON=/sbin/mdmon +RUNDIR=/run/mdadm +PIDFILE=$RUNDIR/monitor.pid +DEBIANCONFIG=/etc/default/mdadm + +test -x "$MDADM" || exit 0 + +test -f /proc/mdstat || exit 0 + +START_DAEMON=true +test -f $DEBIANCONFIG && . $DEBIANCONFIG + +. /lib/lsb/init-functions + +is_true() +{ + case "${1:-}" in + [Yy]es|[Yy]|1|[Tt]|[Tt]rue) return 0;; + *) return 1; + esac +} + +case "${1:-}" in + start) + if is_true $START_DAEMON; then + log_daemon_msg "Starting MD monitoring service" "mdadm --monitor" + mkdir -p $RUNDIR + set +e + start-stop-daemon -S -p $PIDFILE -x $MDADM -- \ + --monitor --pid-file $PIDFILE --daemonise --scan ${DAEMON_OPTIONS:-} + log_end_msg $? + set -e + fi + if [ "$(echo $RUNDIR/md[0-9]*.pid)" != "$RUNDIR/md[0-9]*.pid" ]; then + log_daemon_msg "Restarting MD external metadata monitor" "mdmon --takeover --all" + set +e + $MDMON --takeover --all + log_end_msg $? + set -e + fi + ;; + stop) + if [ -f $PIDFILE ] ; then + log_daemon_msg "Stopping MD monitoring service" "mdadm --monitor" + set +e + start-stop-daemon -K -p $PIDFILE -x $MDADM + rm -f $PIDFILE + log_end_msg $? + set -e + fi + for file in $RUNDIR/md[0-9]*.pid ; do + [ ! -f "$file" ] && continue + ln -sf $file /run/sendsigs.omit.d/mdmon-${file##*/} + done + ;; + status) + status_of_proc -p $PIDFILE "$MDADM" "mdadm" && exit 0 || exit $? + ;; + restart|reload|force-reload) + ${0:-} stop + ${0:-} start + ;; + *) + echo "Usage: ${0:-} {start|stop|status|restart|reload|force-reload}" >&2 + exit 1 + ;; +esac + +exit 0 diff --git a/debian/mdadm.logcheck.ignore.server b/debian/mdadm.logcheck.ignore.server new file mode 100644 index 00000000..051c4732 --- /dev/null +++ b/debian/mdadm.logcheck.ignore.server @@ -0,0 +1,23 @@ +^\w{3} [ :0-9]{11} [._[:alnum:]-]+ kernel:( \[ *[[:digit:]]+\.[[:digit:]]+\])? md: md driver [.[:digit:]]+ MAX_MD_DEVS=[[:digit:]]+, MD_SB_DISKS=[[:digit:]]+$ +^\w{3} [ :0-9]{11} [._[:alnum:]-]+ kernel:( \[ *[[:digit:]]+\.[[:digit:]]+\])? md: bitmap version [.[:digit:]]+$ +^\w{3} [ :0-9]{11} [._[:alnum:]-]+ kernel:( \[ *[[:digit:]]+\.[[:digit:]]+\])? md: md[[:digit:]]+ stopped\.$ +^\w{3} [ :0-9]{11} [._[:alnum:]-]+ kernel:( \[ *[[:digit:]]+\.[[:digit:]]+\])? md: md[[:digit:]]+ still in use\.$ +^\w{3} [ :0-9]{11} [._[:alnum:]-]+ kernel:( \[ *[[:digit:]]+\.[[:digit:]]+\])? md: cannot remove active disk [[:alnum:]]+ from md[[:digit:]]+ \.\.\. ?$ +^\w{3} [ :0-9]{11} [._[:alnum:]-]+ kernel:( \[ *[[:digit:]]+\.[[:digit:]]+\])? md: raid([01456]|456|10) personality registered for level ([01456]|10)$ +^\w{3} [ :0-9]{11} [._[:alnum:]-]+ kernel:( \[ *[[:digit:]]+\.[[:digit:]]+\])? md: (data-check|requested-resync|resync|reshape|recovery) of RAID array md[[:digit:]]+$ +^\w{3} [ :0-9]{11} [._[:alnum:]-]+ kernel:( \[ *[[:digit:]]+\.[[:digit:]]+\])? md: resuming (data-check|requested-resync|resync|reshape|recovery) of md[[:digit:]]+ from checkpoint\.$ +^\w{3} [ :0-9]{11} [._[:alnum:]-]+ kernel:( \[ *[[:digit:]]+\.[[:digit:]]+\])? md: md[[:digit:]]+: (data-check|requested-resync|resync|reshape|recovery) done\.$ +^\w{3} [ :0-9]{11} [._[:alnum:]-]+ kernel:( \[ *[[:digit:]]+\.[[:digit:]]+\])? md: minimum _guaranteed_ ?speed: [[:digit:]]+ KB/sec/disk\.$ +^\w{3} [ :0-9]{11} [._[:alnum:]-]+ kernel:( \[ *[[:digit:]]+\.[[:digit:]]+\])? md: using maximum available idle IO bandwidth \(but not more than [[:digit:]]+ KB/sec\) for (data-check|requested-resync|resync|reshape|recovery)\.$ +^\w{3} [ :0-9]{11} [._[:alnum:]-]+ kernel:( \[ *[[:digit:]]+\.[[:digit:]]+\])? md: delaying (data-check|requested-resync|resync|reshape|recovery) of md[[:digit:]]+ until md[[:digit:]]+ has finished \(they share one or more physical units\)$ +^\w{3} [ :0-9]{11} [._[:alnum:]-]+ kernel:( \[ *[[:digit:]]+\.[[:digit:]]+\])? md: using [[:digit:]]+k window, over a total of [[:digit:]]+( blocks|k)\.$ +^\w{3} [ :0-9]{11} [._[:alnum:]-]+ kernel:( \[ *[[:digit:]]+\.[[:digit:]]+\])? md: (un)?bind<[^>]+>$ +^\w{3} [ :0-9]{11} [._[:alnum:]-]+ kernel:( \[ *[[:digit:]]+\.[[:digit:]]+\])? md: export_rdev\([^)]+\)$ +^\w{3} [ :0-9]{11} [._[:alnum:]-]+ kernel:( \[ *[[:digit:]]+\.[[:digit:]]+\])? raid[[:digit:]]+: raid set [[:alnum:]]+ active with [[:digit:]]+ out of [[:digit:]]+ mirrors$ +^\w{3} [ :0-9]{11} [._[:alnum:]-]+ kernel:( \[ *[[:digit:]]+\.[[:digit:]]+\])? RAID([01456]|10) conf printout:$ +^\w{3} [ :0-9]{11} [._[:alnum:]-]+ kernel:( \[ *[[:digit:]]+\.[[:digit:]]+\])?[[:space:]]+---( [wrf]d:[[:digit:]]+){2,3}$ +^\w{3} [ :0-9]{11} [._[:alnum:]-]+ kernel:( \[ *[[:digit:]]+\.[[:digit:]]+\])?[[:space:]]+disk [[:digit:]]+,( wo:[[:digit:]]+,)? o:[[:digit:]]+, dev:[[:alnum:]]+$ +^\w{3} [ :0-9]{11} [._[:alnum:]-]+ mdadm(\[[[:digit:]]+\])?: Rebuild((Start|Finish)ed|[[:digit:]]+) event detected on md device /dev/[-_./[:alnum:]]+$ +^\w{3} [ :0-9]{11} [._[:alnum:]-]+ mdadm(\[[[:digit:]]+\])?: SpareActive event detected on md device /dev/[-_./[:alnum:]]+, component device /dev/[-_./[:alnum:]]+$ +^\w{3} [ :0-9]{11} [._[:alnum:]-]+ mdadm(\[[[:digit:]]+\])?: (New|Degraded)Array event detected on md device /dev/[-_./[:alnum:]]+$ +^\w{3} [ :0-9]{11} [._[:alnum:]-]+ mdadm(\[[[:digit:]]+\])?: DeviceDisappeared event detected on md device /dev/[-_./[:alnum:]]+$ diff --git a/debian/mdadm.logcheck.violations b/debian/mdadm.logcheck.violations new file mode 100644 index 00000000..ea8cce72 --- /dev/null +++ b/debian/mdadm.logcheck.violations @@ -0,0 +1,3 @@ +^\w{3} [ :0-9]{11} [._[:alnum:]-]+ kernel:( \[ *[[:digit:]]+\.[[:digit:]]+\])? md: kicking non-fresh [[:alnum:]]+ from array!$ +^\w{3} [ :0-9]{11} [._[:alnum:]-]+ kernel:( \[ *[[:digit:]]+\.[[:digit:]]+\])? raid[[:digit:]]+: Disk failure on [[:alnum:]]+, disabling device\.$ +^\w{3} [ :0-9]{11} [._[:alnum:]-]+ kernel:( \[ *[[:digit:]]+\.[[:digit:]]+\])?[[:space:]]+Operation continuing on [[:digit:]]+ devices?$ diff --git a/debian/mdadm.modules b/debian/mdadm.modules new file mode 100644 index 00000000..5ad12499 --- /dev/null +++ b/debian/mdadm.modules @@ -0,0 +1,8 @@ +# mdadm module configuration file +# set start_ro=1 to make newly assembled arrays read-only initially, +# to prevent metadata writes. This is needed in order to allow +# resume-from-disk to work - new boot should not perform writes +# because it will be done behind the back of the system being +# resumed. See http://bugs.debian.org/415441 for details. + +options md_mod start_ro=1 diff --git a/debian/mdadm.postinst b/debian/mdadm.postinst new file mode 100644 index 00000000..a603376d --- /dev/null +++ b/debian/mdadm.postinst @@ -0,0 +1,112 @@ +#!/bin/sh +# Copyright © 2001-2005 Mario Jou/3en <joussen@debian.org> +# Copyright © 2005-2008 Martin F. Krafft <madduck@debian.org> +# Distributable under the terms of the GNU GPL version 2. +# +set -e + +. /usr/share/debconf/confmodule + +case "${1:-}" in + configure|reconfigure) + + if [ ! -f /proc/mdstat ] && [ -x $(command -v modprobe 2>/dev/null) ]; then + modprobe md >/dev/null 2>&1 || : + fi + if [ ! -f /proc/mdstat ]; then + echo 'W: mdadm: failed to load MD subsystem.' >&2 + fi + + DEBIANCONFIG=/etc/default/mdadm + CONFIG=/etc/mdadm/mdadm.conf + ALTCONFIG=/etc/mdadm.conf + MDADM=/sbin/mdadm + + # load current settings, most of which will be overwritten. + [ -f $DEBIANCONFIG ] && . $DEBIANCONFIG + + db_get mdadm/mail_to + MAILADDR="${RET:-root}" + + [ ! -f $CONFIG ] && [ -f $ALTCONFIG ] && CONFIG=$ALTCONFIG + if [ ! -f $CONFIG ]; then + echo -n 'Generating mdadm.conf... ' >&2 + # pass the MAILADDR variable into the script + MDADM_MAILADDR__="$MAILADDR"; export MDADM_MAILADDR__ + if /usr/share/mdadm/mkconf generate $CONFIG 2>/dev/null; then + echo done. >&2 + else + echo "done (failed to scan arrays; /proc probably not mounted)." >&2 + fi + fi + + if [ -w $CONFIG ] && [ -z "${MDADM_MAILADDR__:-}" ]; then + # if the configuration is writeable but has not been written just + # before, then edit it to reflect the MAILADDR preference + if grep -q '^MAILADDR' $CONFIG; then + sed -i -e "s/^MAILADDR.*/MAILADDR $MAILADDR/" $CONFIG + else + echo "MAILADDR $MAILADDR" >> $CONFIG + fi + fi + unset MDADM_MAILADDR__ + + db_get mdadm/initrdstart + INITRDSTART="${RET:-all}" + db_get mdadm/autocheck + AUTOCHECK="${RET:-true}" + db_get mdadm/start_daemon + START_DAEMON="${RET:-true}" + #db_get mdadm/daemon_options + [ -n "${DAEMON_OPTIONS:-}" ] || DAEMON_OPTIONS='--syslog' + + cat <<_eof > $DEBIANCONFIG +# mdadm Debian configuration +# +# You can run 'dpkg-reconfigure mdadm' to modify the values in this file, if +# you want. You can also change the values here and changes will be preserved. +# Do note that only the values are preserved; the rest of the file is +# rewritten. +# + +# INITRDSTART: +# list of arrays (or 'all') to start automatically when the initial ramdisk +# loads. This list *must* include the array holding your root filesystem. Use +# 'none' to prevent any array from being started from the initial ramdisk. +INITRDSTART='$INITRDSTART' + +# AUTOCHECK: +# should mdadm run periodic redundancy checks over your arrays? See +# /etc/cron.d/mdadm. +AUTOCHECK=$AUTOCHECK + +# START_DAEMON: +# should mdadm start the MD monitoring daemon during boot? +START_DAEMON=$START_DAEMON + +# DAEMON_OPTIONS: +# additional options to pass to the daemon. +DAEMON_OPTIONS="$DAEMON_OPTIONS" + +# VERBOSE: +# if this variable is set to true, mdadm will be a little more verbose e.g. +# when creating the initramfs. +VERBOSE=${VERBOSE:-false} +_eof + + db_stop + + command -v update-initramfs >/dev/null 2>&1 && update-initramfs -u + + if dpkg --compare-versions "$2" le 3.3.2-3; then + rm -f /var/lib/mdadm/CONF-UNCHECKED /var/lib/mdadm/mdadm.conf-generated + if [ -d /var/lib/mdadm ]; then + rmdir --ignore-fail-on-non-empty /var/lib/mdadm + fi + fi + ;; +esac + +[ -d /run/systemd/system ] && systemctl --system daemon-reload >/dev/null || : + +#DEBHELPER# diff --git a/debian/mdadm.postrm b/debian/mdadm.postrm new file mode 100644 index 00000000..30309626 --- /dev/null +++ b/debian/mdadm.postrm @@ -0,0 +1,25 @@ +#! /bin/sh +# Copyright © 2001,2002 Mario Jou/3en <joussen@debian.org> +# Copyright © 2006-2008 Martin F. Krafft <madduck@debian.org> +# Distributable under the terms of the GNU GPL version 2. +# +set -e + +case "${1:-}" in + remove) + if command -v update-initramfs >/dev/null 2>&1; then + echo "W: mdadm: I'll update the initramfs, but if you need MD to boot" >&2 + echo "W: mdadm: with initramfs, your system may be left unbootable!" >&2 + update-initramfs -u + fi + ;; + + purge) + rm -f /etc/default/mdadm /etc/mdadm.conf /etc/mdadm/mdadm.conf + ;; + +esac + +[ -d /run/systemd/system ] && systemctl --system daemon-reload >/dev/null || : + +#DEBHELPER# diff --git a/debian/mdadm.preinst b/debian/mdadm.preinst new file mode 100644 index 00000000..b90ffb87 --- /dev/null +++ b/debian/mdadm.preinst @@ -0,0 +1,45 @@ +#!/bin/sh +# Copyright © martin f. krafft <madduck@debian.org> +# Distributed under the terms of the Artistic Licence 2.0 +# +set -e + +# based on idea from http://www.dpkg.org/dpkg/ConffileHandling +rm_conffile() { + local conffile md5sum package old_md5sum + + conffile="$1" + if [ -e "$conffile" ]; then + md5sum=$(md5sum "$conffile" | cut -d' ' -f1) + package=$(dpkg -S "$conffile" | cut -d: -f1) + old_md5sum=$(dpkg -s $package | sed -rne "s,[[:space:]]+${conffile}[[:space:]]+,,p") + if [ "$md5sum" != "$old_md5sum" ]; then + echo "Obsolete conffile $conffile has been modified by you." + echo "Saving as ${conffile}.dpkg-bak ..." + mv -f "$conffile" "$conffile".dpkg-bak + else + echo "Removing obsolete conffile $conffile ..." + rm -f "$conffile" + fi + fi +} + +case "$1" in + + upgrade|install) + + # migrate old configuration from *way back then* + DEBIANCONFIG=/etc/default/mdadm + OLDCONFIG=/etc/mdadm/debian.conf + if [ -s $OLDCONFIG ] && [ ! -f $DEBIANCONFIG ]; then + mv $OLDCONFIG $DEBIANCONFIG + fi + + # Used incorrect name s/_/-/, keep all throughout until after jessie is released. + rm_conffile /etc/udev/rules.d/65-mdadm.vol_id.rules + ;; + + *) :;; +esac + +#DEBHELPER# diff --git a/debian/mdadm.templates b/debian/mdadm.templates new file mode 100644 index 00000000..ddf63c76 --- /dev/null +++ b/debian/mdadm.templates @@ -0,0 +1,99 @@ +# These templates have been reviewed by the debian-l10n-english +# team +# +# If modifications/additions/rewording are needed, please ask +# debian-l10n-english@lists.debian.org for advice. +# +# Even minor modifications require translation updates and such +# changes should be coordinated with translators and reviewers. + +Template: mdadm/initrdstart +Type: string +Default: all +#flag:translate!:2 +_Description: MD arrays needed for the root file system: + ${msg} + . + Please enter 'all', 'none', or a space-separated list of devices such as + 'md0 md1' or 'md/1 md/d0' (the leading '/dev/' can be omitted). + +Template: mdadm/initrdstart_msg_intro +Type: text +_Description: for internal use - only the long description is needed. + If the system's root file system is located on an MD array (RAID), it needs to be + started early during the boot sequence. If it is located on + a logical volume (LVM), which is on MD, all constituent arrays need to be + started. + . + If you know exactly which arrays are needed to bring up the root file system, + and you want to postpone starting all other arrays to a later point in the + boot sequence, enter the arrays to start here. Alternatively, enter 'all' to + simply start all available arrays. + . + If you do not need or want to start any arrays for the root file system, leave + the answer blank (or enter 'none'). This may be the case if you are using + kernel autostart or do not need any arrays to boot. + +Template: mdadm/initrdstart_msg_errexist +Type: text +_Description: + An error occurred: device node does not exist + +Template: mdadm/initrdstart_msg_errblock +Type: text +_Description: + An error occurred: not a block device + +Template: mdadm/initrdstart_msg_errmd +Type: text +_Description: + An error occurred: not an MD array + +Template: mdadm/initrdstart_msg_errconf +Type: text +_Description: + An error occurred: array not listed in mdadm.conf file + +Template: mdadm/initrdstart_notinconf +Type: boolean +Default: false +_Description: Start arrays not listed in mdadm.conf? + The specified array (${array}) is not listed in the configuration + file (${config}). Therefore, it cannot be started during boot, unless you + correct the configuration file and recreate the initial ramdisk. + . + This warning is only relevant if you need arrays to be started from the + initial ramdisk to be able to boot. If you use kernel autostarting, or do not + need any arrays to be started as early as the initial ramdisk is loaded, you + can simply continue. Alternatively, choose not to continue and enter 'none' + when prompted which arrays to start from the initial ramdisk. + +Template: mdadm/autocheck +Type: boolean +Default: true +_Description: Should mdadm run monthly redundancy checks of the MD arrays? + If the kernel supports it (versions greater than 2.6.14), mdadm can periodically check the + redundancy of MD arrays (RAIDs). This may be a resource-intensive process, + depending on the local setup, but it could help prevent rare cases of data loss. + Note that this is a read-only check unless errors are found; if errors are + found, mdadm will try to correct them, which may result in write access to + the media. + . + The default, if turned on, is to check on the first Sunday of every + month at 01:06. + +Template: mdadm/start_daemon +Type: boolean +Default: true +_Description: Do you want to start the MD monitoring daemon? + The MD (RAID) monitor daemon sends email notifications in response to + important MD events (such as a disk failure). + . + Enabling this option is recommended. + +Template: mdadm/mail_to +Type: string +Default: root +_Description: Recipient for email notifications: + Please enter the email address of the user who should get the email + notifications for important MD events. diff --git a/debian/mkconf b/debian/mkconf new file mode 100644 index 00000000..a580f5b3 --- /dev/null +++ b/debian/mkconf @@ -0,0 +1,101 @@ +#!/bin/sh +# +# mkconf -- outputs valid mdadm.conf contents for the local system +# +# Copyright © martin f. krafft <madduck@madduck.net> +# distributed under the terms of the Artistic Licence 2.0 +# +set -eu + +ME="${0##*/}" +MDADM=/sbin/mdadm +DEBIANCONFIG=/etc/default/mdadm +CONFIG=/etc/mdadm/mdadm.conf + +# initialise config variables in case the environment leaks +MAILADDR= DEVICE= CREATE= HOMEHOST= PROGRAM= + +test -r $DEBIANCONFIG && . $DEBIANCONFIG + +if [ -n "${MDADM_MAILADDR__:-}" ]; then + # honour MAILADDR from the environment (from postinst) + MAILADDR="$MDADM_MAILADDR__" +else + # preserve existing MAILADDR + MAILADDR="$(sed -ne 's/^MAILADDR //p' $CONFIG 2>/dev/null)" || : +fi + +# save existing values as defaults +if [ -r "$CONFIG" ]; then + DEVICE="$(sed -ne 's/^DEVICE //p' $CONFIG)" + CREATE="$(sed -ne 's/^CREATE //p' $CONFIG)" + HOMEHOST="$(sed -ne 's/^HOMEHOST //p' $CONFIG)" + PROGRAM="$(sed -ne 's/^PROGRAM //p' $CONFIG)" +fi + +[ "${1:-}" = force-generate ] && rm -f $CONFIG +case "${1:-}" in + generate|force-generate) + [ -n "${2:-}" ] && CONFIG=$2 + # only barf if the config file specifies anything else than MAILADDR + if egrep -qv '^(MAILADDR.*|#.*|)$' $CONFIG 2>/dev/null; then + echo "E: $ME: $CONFIG already exists." >&2 + exit 255 + fi + + mkdir --parent ${CONFIG%/*} + exec >$CONFIG + ;; +esac + +cat <<_eof +# mdadm.conf +# +# Please refer to mdadm.conf(5) for information about this file. +# + +# by default (built-in), scan all partitions (/proc/partitions) and all +# containers for MD superblocks. alternatively, specify devices to scan, using +# wildcards if desired. +#DEVICE ${DEVICE:-partitions containers} + +# auto-create devices with Debian standard permissions +CREATE ${CREATE:-owner=root group=disk mode=0660 auto=yes} + +# automatically tag new arrays as belonging to the local system +HOMEHOST ${HOMEHOST:-<system>} + +# instruct the monitoring daemon where to send mail alerts +MAILADDR ${MAILADDR:-root} + +_eof + +if [ -n "${PROGRAM:-}" ]; then + cat <<-_eof + # program to run when mdadm monitor detects potentially interesting events + PROGRAM ${PROGRAM} + + _eof +fi + +error=0 +if [ ! -r /proc/mdstat ]; then + echo W: $ME: MD subsystem is not loaded, thus I cannot scan for arrays. >&2 + error=1 +elif [ ! -r /proc/partitions ]; then + echo W: $ME: /proc/partitions cannot be read, thus I cannot scan for arrays. >&2 + error=2 +else + echo "# definitions of existing MD arrays" + if ! $MDADM --examine --scan --config=partitions; then + error=$(($? + 128)) + echo W: $ME: failed to scan for partitions. >&2 + echo "### WARNING: scan failed." + else + echo + fi +fi + +echo "# This configuration was auto-generated on $(date -R) by mkconf" + +exit $error diff --git a/debian/patches/debian-conffile-location.diff b/debian/patches/debian-conffile-location.diff new file mode 100644 index 00000000..8acc6077 --- /dev/null +++ b/debian/patches/debian-conffile-location.diff @@ -0,0 +1,115 @@ +From: martin f. krafft <madduck@debian.org> +Subject: Set /etc/mdadm/mdadm.conf as primary config file location + +On Debian, the configuration file resides primarily in /etc/mdadm/mdadm.conf, +/etc/mdadm.conf is only used as a backup. + +This is a Debian-specific patch. + +Forwarded: not-needed +Reviewed-by: martin f. krafft <madduck@debian.org> + +--- + Makefile | 4 ++-- + ReadMe.c | 2 +- + mdadm.8.in | 14 ++++++-------- + mdadm.conf.5 | 2 +- + mdassemble.8 | 2 +- + 5 files changed, 11 insertions(+), 13 deletions(-) + +--- a/Makefile ++++ b/Makefile +@@ -61,8 +61,8 @@ + PKG_CONFIG ?= pkg-config + + SYSCONFDIR = /etc +-CONFFILE = $(SYSCONFDIR)/mdadm.conf +-CONFFILE2 = $(SYSCONFDIR)/mdadm/mdadm.conf ++CONFFILE = $(SYSCONFDIR)/mdadm/mdadm.conf ++CONFFILE2 = $(SYSCONFDIR)/mdadm.conf + MAILCMD =/usr/sbin/sendmail -t + CONFFILEFLAGS = -DCONFFILE=\"$(CONFFILE)\" -DCONFFILE2=\"$(CONFFILE2)\" + # Both MAP_DIR and MDMON_DIR should be somewhere that persists across the +--- a/ReadMe.c ++++ b/ReadMe.c +@@ -590,7 +590,7 @@ + ; + + char Help_config[] = +-"The /etc/mdadm.conf config file:\n\n" ++"The /etc/mdadm/mdadm.conf config file:\n\n" + " The config file contains, apart from blank lines and comment lines that\n" + " start with a hash(#), array lines, device lines, and various\n" + " configuration lines.\n" +--- a/mdadm.8.in ++++ b/mdadm.8.in +@@ -264,13 +264,13 @@ + .TP + .BR \-c ", " \-\-config= + Specify the config file or directory. Default is to use +-.B /etc/mdadm.conf ++.B /etc/mdadm/mdadm.conf + and +-.BR /etc/mdadm.conf.d , ++.BR /etc/mdadm/mdadm.conf.d , + or if those are missing then +-.B /etc/mdadm/mdadm.conf ++.B /etc/mdadm.conf + and +-.BR /etc/mdadm/mdadm.conf.d . ++.BR /etc/mdadm.conf.d . + If the config file given is + .B "partitions" + then nothing will be read, but +@@ -1742,9 +1742,9 @@ + or requested with (a possibly implicit) + .BR \-\-scan . + In the later case, +-.B /etc/mdadm.conf +-or + .B /etc/mdadm/mdadm.conf ++or ++.B /etc/mdadm.conf + is used. + + If +@@ -3003,7 +3003,7 @@ + is given in Misc mode, and to monitor array reconstruction + on Monitor mode. + +-.SS /etc/mdadm.conf ++.SS /etc/mdadm/mdadm.conf (or /etc/mdadm.conf) + + The config file lists which devices may be scanned to see if + they contain MD super block, and gives identifying information +@@ -3011,7 +3011,7 @@ + .BR mdadm.conf (5) + for more details. + +-.SS /etc/mdadm.conf.d ++.SS /etc/mdadm/mdadm.conf.d (or /etc/mdadm.conf.d) + + A directory containing configuration files which are read in lexical + order. +--- a/mdadm.conf.5 ++++ b/mdadm.conf.5 +@@ -8,7 +8,7 @@ + .SH NAME + mdadm.conf \- configuration for management of Software RAID with mdadm + .SH SYNOPSIS +-/etc/mdadm.conf ++/etc/mdadm/mdadm.conf + .SH DESCRIPTION + .PP + .I mdadm +--- a/mdassemble.8 ++++ b/mdassemble.8 +@@ -40,7 +40,7 @@ + + .SH FILES + +-.SS /etc/mdadm.conf ++.SS /etc/mdadm/mdadm.conf + + The config file lists which devices may be scanned to see if + they contain MD super block, and gives identifying information diff --git a/debian/patches/debian-no-Werror.diff b/debian/patches/debian-no-Werror.diff new file mode 100644 index 00000000..0a427f16 --- /dev/null +++ b/debian/patches/debian-no-Werror.diff @@ -0,0 +1,24 @@ +From: martin f. krafft <madduck@debian.org> +Subject: Remove -Werror from compiler flags + +-Werror seems like a bad idea on released/packaged code because a toolchain +update (introducing new warnings) could break the build. We'll let upstream +use it to beautify the code, but remove it for out builds. + +Signed-off-by: martin f. krafft <madduck@debian.org> + +--- + Makefile | 2 +- + 1 files changed, 1 insertions(+), 1 deletions(-) + +--- a/Makefile ++++ b/Makefile +@@ -43,7 +43,7 @@ + + CC = $(CROSS_COMPILE)gcc + CXFLAGS ?= -ggdb +-CWFLAGS = -Wall -Werror -Wstrict-prototypes -Wextra -Wno-unused-parameter ++CWFLAGS = -Wall -Wstrict-prototypes -Wextra -Wno-unused-parameter + ifdef WARN_UNUSED + CWFLAGS += -Wp,-D_FORTIFY_SOURCE=2 -O3 + endif diff --git a/debian/patches/disable-incremental-assembly.patch b/debian/patches/disable-incremental-assembly.patch new file mode 100644 index 00000000..693a65dc --- /dev/null +++ b/debian/patches/disable-incremental-assembly.patch @@ -0,0 +1,12 @@ +--- a/udev-md-raid-assembly.rules ++++ b/udev-md-raid-assembly.rules +@@ -25,6 +25,9 @@ GOTO="md_inc_end" + + LABEL="md_inc" + ++# Disable incremental assembly to fix Debian bug #784070 ++GOTO="md_inc_end" ++ + # remember you can limit what gets auto/incrementally assembled by + # mdadm.conf(5)'s 'AUTO' and selectively whitelist using 'ARRAY' + ACTION=="add|change", IMPORT{program}="BINDIR/mdadm --incremental --export $tempnode --offroot ${DEVLINKS}" diff --git a/debian/patches/mdmonitor-service-simplify.diff b/debian/patches/mdmonitor-service-simplify.diff new file mode 100644 index 00000000..ea706a68 --- /dev/null +++ b/debian/patches/mdmonitor-service-simplify.diff @@ -0,0 +1,20 @@ +Subject: simplify mdmonitor.service +From: Michael Tokarev <mjt@tls.msk.ru> +Date: Fri, 14 Nov 2014 19:18:05 +0300 +Bug-Debian: http://bugs.debian.org/764647 +Forwarded: no + +There isn't much for customization for mdadm --monitor. +it'll just do what it's supposed to do, so just run it. + +--- a/systemd/mdmonitor.service ++++ b/systemd/mdmonitor.service +@@ -10,7 +10,4 @@ Description=MD array monitor + DefaultDependencies=no + + [Service] +-Environment= MDADM_MONITOR_ARGS=--scan +-EnvironmentFile=-/run/sysconfig/mdadm +-ExecStartPre=-/usr/lib/systemd/scripts/mdadm_env.sh +-ExecStart=BINDIR/mdadm --monitor $MDADM_MONITOR_ARGS ++ExecStart=BINDIR/mdadm --monitor --scan diff --git a/debian/patches/readlink-path.patch b/debian/patches/readlink-path.patch new file mode 100644 index 00000000..86544592 --- /dev/null +++ b/debian/patches/readlink-path.patch @@ -0,0 +1,15 @@ +From: Michael Tokarev <mjt@tls.msk.ru> +Subject: readlink is in /bin not /usr/bin on debian +Date: Fri, 14 Nov 2014 19:11:51 +0300 +Bug-Debian: http://bugs.debian.org/766416 +Forwarded: no + +This is a debian-specific change, upstream ships +the rule to use /usr/bin/readlink while on debian +it is /bin/readlink + +--- a/udev-md-raid-arrays.rules ++++ b/udev-md-raid-arrays.rules +@@ -38 +38 @@ ENV{MD_LEVEL}=="raid[1-9]*", ENV{SYSTEMD_WANTS}+="mdmonitor.service" +-ENV{MD_LEVEL}=="raid[1-9]*", ENV{MD_CONTAINER}=="?*", PROGRAM="/usr/bin/readlink $env{MD_CONTAINER}", ENV{MD_MON_THIS}="%c" ++ENV{MD_LEVEL}=="raid[1-9]*", ENV{MD_CONTAINER}=="?*", PROGRAM="/bin/readlink $env{MD_CONTAINER}", ENV{MD_MON_THIS}="%c" diff --git a/debian/patches/series b/debian/patches/series new file mode 100644 index 00000000..9aa3e3d7 --- /dev/null +++ b/debian/patches/series @@ -0,0 +1,8 @@ +debian-conffile-location.diff +debian-no-Werror.diff +sha1-includes.diff +use-external-blkid.diff +use-tempnode-not-devnode.patch +readlink-path.patch +mdmonitor-service-simplify.diff +disable-incremental-assembly.patch diff --git a/debian/patches/sha1-includes.diff b/debian/patches/sha1-includes.diff new file mode 100644 index 00000000..0dfd7daf --- /dev/null +++ b/debian/patches/sha1-includes.diff @@ -0,0 +1,40 @@ +From: Michael Tokarev <mjt@tls.msk.ru> +Subject: do not #include ansidecl.h from sha1.h, use system headers + +In 3.2.5 version of mdadm, new sha1 implementation has been included +which tries to include ansidecl.h header which is internal to some +other project. But this #include isn't really necessary, since this +implementation does not actually use any defines from ansidecl.h. So +just remove the #include, instead of adding a new external dependency. + +References: http://www.spinics.net/lists/raid/msg38859.html + +While at it, unconditionally include system headers like limits.h and +stdint.h, since on a Linux system these headers are available, and +these contains definitive information about real system types than +any guesses. + +--- a/sha1.h ++++ b/sha1.h +@@ -22,7 +22,7 @@ + + #include <stdio.h> + +-#if defined HAVE_LIMITS_H || _LIBC ++#if 1 /* defined HAVE_LIMITS_H || _LIBC */ + # include <limits.h> + #endif + +@@ -33,9 +33,9 @@ + the resulting executable. Locally running cross-compiled executables + is usually not possible. */ + +-#ifdef _LIBC +-# include <sys/types.h> +-typedef u_int32_t sha1_uint32; ++#if 1 /* def _LIBC */ ++# include <stdint.h> ++typedef uint32_t sha1_uint32; + typedef uintptr_t sha1_uintptr; + #else + # define INT_MAX_32_BITS 2147483647 diff --git a/debian/patches/use-external-blkid.diff b/debian/patches/use-external-blkid.diff new file mode 100644 index 00000000..637f7c3a --- /dev/null +++ b/debian/patches/use-external-blkid.diff @@ -0,0 +1,16 @@ +From: Michael Tokarev <mjt@tls.msk.ru> +Subject: blkid is not udev builtin, use /sbin/blkid + +--- a/udev-md-raid-arrays.rules ++++ b/udev-md-raid-arrays.rules +@@ -26,9 +26,7 @@ ENV{DEVTYPE}=="partition", ENV{MD_UUID}= + ENV{DEVTYPE}=="partition", ENV{MD_DEVNAME}=="*[^0-9]", SYMLINK+="md/$env{MD_DEVNAME}%n" + ENV{DEVTYPE}=="partition", ENV{MD_DEVNAME}=="*[0-9]", SYMLINK+="md/$env{MD_DEVNAME}p%n" + +-IMPORT{builtin}="blkid" +-OPTIONS+="link_priority=100" +-OPTIONS+="watch" ++IMPORT{program}="/sbin/blkid -o udev -p -u noraid $tempnode" + ENV{ID_FS_USAGE}=="filesystem|other|crypto", ENV{ID_FS_UUID_ENC}=="?*", SYMLINK+="disk/by-uuid/$env{ID_FS_UUID_ENC}" + ENV{ID_FS_USAGE}=="filesystem|other", ENV{ID_FS_LABEL_ENC}=="?*", SYMLINK+="disk/by-label/$env{ID_FS_LABEL_ENC}" + diff --git a/debian/patches/use-tempnode-not-devnode.patch b/debian/patches/use-tempnode-not-devnode.patch new file mode 100644 index 00000000..38a55044 --- /dev/null +++ b/debian/patches/use-tempnode-not-devnode.patch @@ -0,0 +1,31 @@ +From: Michael Tokarev <mjt@tls.msk.ru> +Subject: use tempnode not devnode in udev rules +Bug-Debian: http://bugs.debian.org/770883 +Forwarded: no + +udev in wheezy does not understand $devnode construct +in rules file, while upstream uses it in mdadm rules +files. udev in jessie has $devnode and it also supports +old $tempnode which is the way it worked in wheezy and +before, even if $tempnode in jessie's udev is not documented. +So on jessie, both $tempnode and $devnode works fine, while +in wheezy, only $tempnode works. + +Use $tempnode instead of $devnode. Since mdadm is important +enough for system functionality and easily can break system +by making it unbootable, and this is the only incompatibility +between wheezy's and jessie's udev wrt mdadm, it is better than +having a versioned dependency on udev. + +This patch is debian-specific and should be dropped for jessie+1. + +--- a/udev-md-raid-arrays.rules ++++ b/udev-md-raid-arrays.rules +@@ -20 +20 @@ +-IMPORT{program}="BINDIR/mdadm --detail --export $devnode" ++IMPORT{program}="BINDIR/mdadm --detail --export $tempnode" +--- a/udev-md-raid-assembly.rules ++++ b/udev-md-raid-assembly.rules +@@ -30 +30 @@ +-ACTION=="add|change", IMPORT{program}="BINDIR/mdadm --incremental --export $devnode --offroot ${DEVLINKS}" ++ACTION=="add|change", IMPORT{program}="BINDIR/mdadm --incremental --export $tempnode --offroot ${DEVLINKS}" diff --git a/debian/po/POTFILES.in b/debian/po/POTFILES.in new file mode 100644 index 00000000..04922385 --- /dev/null +++ b/debian/po/POTFILES.in @@ -0,0 +1 @@ +[type: gettext/rfc822deb] mdadm.templates diff --git a/debian/po/ca.po b/debian/po/ca.po new file mode 100644 index 00000000..0710fd90 --- /dev/null +++ b/debian/po/ca.po @@ -0,0 +1,220 @@ +# mdadm Catalan translation. +# Copyright (C) 2004-2006 Software in the Public Interest +# This file is distributed under the same license as the squid package. +# Innocent De Marchi <tangram.peces@gmail.com>, 2011. +# +msgid "" +msgstr "" +"Project-Id-Version: 3.1.4-1+8efb9d1\n" +"Report-Msgid-Bugs-To: mdadm@packages.debian.org\n" +"POT-Creation-Date: 2012-05-22 00:50+0100\n" +"PO-Revision-Date: 2011-05-17 16:54+0100\n" +"Last-Translator: Innocent De Marchi <tangram.peces@gmail.com>\n" +"Language-Team: Catalan <debian-l10n-catalan@lists.debian.org>\n" +"Language: ca\n" +"MIME-Version: 1.0\n" +"Content-Type: text/plain; charset=utf-8\n" +"Content-Transfer-Encoding: 8bit\n" +"X-Poedit-Language: Catalan\n" +"X-Poedit-Country: SPAIN\n" + +#. Type: string +#. Description +#: ../mdadm.templates:2001 +msgid "MD arrays needed for the root file system:" +msgstr "Les matrius MD necessaris per al sistema de fitxers arrel:" + +#. Type: string +#. Description +#: ../mdadm.templates:2001 +msgid "" +"Please enter 'all', 'none', or a space-separated list of devices such as " +"'md0 md1' or 'md/1 md/d0' (the leading '/dev/' can be omitted)." +msgstr "" +"Escriviu «all» (tots), «none» (cap) o una llista separada per espais dels " +"dispositius com «md0 md1» o «md/1 md/d0» (podeu ometre el «/dev/» inicial)." + +#. Type: text +#. Description +#: ../mdadm.templates:3001 +msgid "for internal use - only the long description is needed." +msgstr "per a ús intern, només és necessà ria la descripció llarga. " + +#. Type: text +#. Description +#: ../mdadm.templates:3001 +msgid "" +"If the system's root file system is located on an MD array (RAID), it needs " +"to be started early during the boot sequence. If it is located on a logical " +"volume (LVM), which is on MD, all constituent arrays need to be started." +msgstr "" +"Si el sistema de fitxers arrel del sistema està en un conjunt MD (RAID), cal " +"que s'iniciï al principi de la seqüència d'arrencada. Si està en un volum " +"lògic (LVM), que està definit sobre un MD, cal iniciar totes les matrius que " +"el constitueixen." + +#. Type: text +#. Description +#: ../mdadm.templates:3001 +msgid "" +"If you know exactly which arrays are needed to bring up the root file " +"system, and you want to postpone starting all other arrays to a later point " +"in the boot sequence, enter the arrays to start here. Alternatively, enter " +"'all' to simply start all available arrays." +msgstr "" +"Si sabeu exactament quines matrius són necessà ries per arrencar el sistema " +"de fitxers arrel, i vol ajornar l'arrencada de la resta de conjunts a un " +"punt posterior de la seqüència d'arrencada, Introduïu aquà els conjunts que " +"voleu arrencar. També podeu seleccionar «all» per, simplement, arrencar tots " +"els disponibles." + +#. Type: text +#. Description +#: ../mdadm.templates:3001 +msgid "" +"If you do not need or want to start any arrays for the root file system, " +"leave the answer blank (or enter 'none'). This may be the case if you are " +"using kernel autostart or do not need any arrays to boot." +msgstr "" +"Si no necessita o vol arrencar qualsevol matriu per al sistema de fitxers " +"arrel, deixau en blanc la resposta (o escriviu «none»). Pot ésser el seu cas " +"si fa servir l'auto-arrencada del nucli o no necessiteu cap matriu en " +"l'arrencada." + +#. Type: text +#. Description +#: ../mdadm.templates:4001 +msgid "An error occurred: device node does not exist" +msgstr "S'ha produït un error: el node de dispositiu no existeix." + +#. Type: text +#. Description +#: ../mdadm.templates:5001 +msgid "An error occurred: not a block device" +msgstr "S'ha produït un error: no és un dispositiu de blocs." + +#. Type: text +#. Description +#: ../mdadm.templates:6001 +msgid "An error occurred: not an MD array" +msgstr "S'ha produït un error: no és un conjunt («array») MD." + +#. Type: text +#. Description +#: ../mdadm.templates:7001 +msgid "An error occurred: array not listed in mdadm.conf file" +msgstr "" +"S'ha produït un error: la matriu («array») no apareix llistada en el fitxer " +"de configuració «mdadm.conf»." + +#. Type: boolean +#. Description +#: ../mdadm.templates:8001 +msgid "Start arrays not listed in mdadm.conf?" +msgstr "Voleu arrencar les matrius no llistats en el fitxer «mdadm.conf»?" + +#. Type: boolean +#. Description +#: ../mdadm.templates:8001 +msgid "" +"The specified array (${array}) is not listed in the configuration file " +"(${config}). Therefore, it cannot be started during boot, unless you correct " +"the configuration file and recreate the initial ramdisk." +msgstr "" +"La matriu («array») especificada (${array}) no apareix llistada en el " +"fitxer de configuració (${config}). Per tant, no es pot iniciar la matriu " +"durant l'arrencada del sistema, llevat que corregeixi el fitxer de " +"configuració i regenereu el disc RAM inicial." + +#. Type: boolean +#. Description +#: ../mdadm.templates:8001 +msgid "" +"This warning is only relevant if you need arrays to be started from the " +"initial ramdisk to be able to boot. If you use kernel autostarting, or do " +"not need any arrays to be started as early as the initial ramdisk is loaded, " +"you can simply continue. Alternatively, choose not to continue and enter " +"'none' when prompted which arrays to start from the initial ramdisk." +msgstr "" +"Aquest avÃs només és important si necessiteu que les matrius s'arrenquin en " +"el disc RAM inicial per poder arrencar el sistema. Si feu servir l'arrencada " +"automà tica del nucli o no necessiteu que les matrius estiguin arrencats " +"quan es carregui el disc RAM, podeu continuar. També podeu decidir no " +"continuar i introduir «none» quan se li demani quines matrius cal iniciar " +"del disc RAM inicial." + +#. Type: boolean +#. Description +#: ../mdadm.templates:10001 +msgid "Should mdadm run monthly redundancy checks of the MD arrays?" +msgstr "" +"Voleu que «mdadm» executi comprovacions de redundà ncia mensuals de les " +"matrius MD?" + +#. Type: boolean +#. Description +#: ../mdadm.templates:10001 +msgid "" +"If the kernel supports it (versions greater than 2.6.14), mdadm can " +"periodically check the redundancy of MD arrays (RAIDs). This may be a " +"resource-intensive process, depending on the local setup, but it could help " +"prevent rare cases of data loss. Note that this is a read-only check unless " +"errors are found; if errors are found, mdadm will try to correct them, which " +"may result in write access to the media." +msgstr "" +"Si el nucli ho accepta (versions superiors a la 2.6.14), «mdadm» pot fer " +"comprovacions periòdiques de la redundà ncia de les matrius MD (RAIDs). Pot " +"ésser que aquest procés consumeixi molts recursos del sistema, depenent de " +"la configuració, però pot ajudar a prevenir casos poc freqüents de pèrdua de " +"dades. Teniu present que aquestes comprovacions es fan en mode lectura " +"llevat que es detectin errors: si hi ha errors, «mdadm» els corregirà i per " +"això, caldrà que tengui accés d'escriptura als mitjans fÃsics. " + +#. Type: boolean +#. Description +#: ../mdadm.templates:10001 +msgid "" +"The default, if turned on, is to check on the first Sunday of every month at " +"01:06." +msgstr "" +"El valor predeterminat, si s'activa, es fer la comprovació el primer " +"diumenge de cada mes a les 01:06 am." + +#. Type: boolean +#. Description +#: ../mdadm.templates:11001 +msgid "Do you want to start the MD monitoring daemon?" +msgstr "Desitjau arrencar el dimoni monitor MD?" + +#. Type: boolean +#. Description +#: ../mdadm.templates:11001 +msgid "" +"The MD (RAID) monitor daemon sends email notifications in response to " +"important MD events (such as a disk failure)." +msgstr "" +"El dimoni monitor de MD (RAID) envia notificacions per correu electrònic " +"quan es produeixen esdeveniments importants en els dispositius MD (com un " +"error de disc)." + +#. Type: boolean +#. Description +#: ../mdadm.templates:11001 +msgid "Enabling this option is recommended." +msgstr "Es recomana l'activació d'aquesta opció." + +#. Type: string +#. Description +#: ../mdadm.templates:12001 +msgid "Recipient for email notifications:" +msgstr "Destinatari de les notificacions de correu electrònic:" + +#. Type: string +#. Description +#: ../mdadm.templates:12001 +msgid "" +"Please enter the email address of the user who should get the email " +"notifications for important MD events." +msgstr "" +"Introduïu l'adreça de correu electrònic de l'usuari que ha de rebre les " +"notificacions de correu electrònic per a esdeveniments MD rellevants." diff --git a/debian/po/cs.po b/debian/po/cs.po new file mode 100644 index 00000000..40e8668c --- /dev/null +++ b/debian/po/cs.po @@ -0,0 +1,265 @@ +# +# Translators, if you are not familiar with the PO format, gettext +# documentation is worth reading, especially sections dedicated to +# this format, e.g. by running: +# info -n '(gettext)PO Files' +# info -n '(gettext)Header Entry' +# +# Some information specific to po-debconf are available at +# /usr/share/doc/po-debconf/README-trans +# or http://www.debian.org/intl/l10n/po-debconf/README-trans +# +# Developers do not need to manually edit POT or PO files. +# +msgid "" +msgstr "" +"Project-Id-Version: mdadm\n" +"Report-Msgid-Bugs-To: mdadm@packages.debian.org\n" +"POT-Creation-Date: 2012-05-22 00:50+0100\n" +"PO-Revision-Date: 2008-02-17 21:10+0100\n" +"Last-Translator: Miroslav Kure <kurem@debian.cz>\n" +"Language-Team: Czech <debian-l10n-czech@lists.debian.org>\n" +"Language: cs\n" +"MIME-Version: 1.0\n" +"Content-Type: text/plain; charset=UTF-8\n" +"Content-Transfer-Encoding: 8bit\n" + +#. Type: string +#. Description +#: ../mdadm.templates:2001 +msgid "MD arrays needed for the root file system:" +msgstr "MD pole vyžadovaná pro koÅ™enový souborový systém:" + +#. Type: string +#. Description +#: ../mdadm.templates:2001 +msgid "" +"Please enter 'all', 'none', or a space-separated list of devices such as " +"'md0 md1' or 'md/1 md/d0' (the leading '/dev/' can be omitted)." +msgstr "" +"Zadejte prosÃm mezerami oddÄ›lený seznam zaÅ™ÃzenÃ, pÅ™ÃpadnÄ› „all“ nebo " +"„none“. PoÄáteÄnà „/dev/“ můžete vynechat a zadat jen napÅ™. „md0 md1“ nebo " +"„md/1 md/d0“." + +#. Type: text +#. Description +#: ../mdadm.templates:3001 +msgid "for internal use - only the long description is needed." +msgstr "pro vnitÅ™nà použità - pouze kvůli zobrazenà dlouhého popisu." + +#. Type: text +#. Description +#: ../mdadm.templates:3001 +msgid "" +"If the system's root file system is located on an MD array (RAID), it needs " +"to be started early during the boot sequence. If it is located on a logical " +"volume (LVM), which is on MD, all constituent arrays need to be started." +msgstr "" +"Pokud je koÅ™enový souborový systém umÃstÄ›n na MD (RAID) svazku, musà být " +"tento spuÅ¡tÄ›n bÄ›hem zavádÄ›nà systému co nejdÅ™Ãve. Pokud se koÅ™enový " +"souborový systém nacházà na logickém svazku LVM, který je vytvoÅ™en nad MD " +"polem, musà se spustit vÅ¡echna souvisejÃcà pole." + +#. Type: text +#. Description +#: ../mdadm.templates:3001 +msgid "" +"If you know exactly which arrays are needed to bring up the root file " +"system, and you want to postpone starting all other arrays to a later point " +"in the boot sequence, enter the arrays to start here. Alternatively, enter " +"'all' to simply start all available arrays." +msgstr "" +"Jestliže pÅ™esnÄ› vÃte, která pole jsou potÅ™eba pro pÅ™ipojenà koÅ™enového " +"souborového systému a zároveň chcete pozdržet spuÅ¡tÄ›nà ostatnÃch polà na " +"pozdÄ›jÅ¡Ã dobu, zadejte zde prosÃm pole, která se majà spustit. Chcete-li " +"spustit vÅ¡echna dostupná pole, můžete zadat „all“." + +#. Type: text +#. Description +#: ../mdadm.templates:3001 +msgid "" +"If you do not need or want to start any arrays for the root file system, " +"leave the answer blank (or enter 'none'). This may be the case if you are " +"using kernel autostart or do not need any arrays to boot." +msgstr "" +"NepotÅ™ebujete-li nebo nechcete-li spouÅ¡tÄ›t pole pro koÅ™enový souborový " +"systém, ponechte odpovÄ›Ä prázdnou, pÅ™ÃpadnÄ› zadejte „none“. To může nastat " +"tÅ™eba v pÅ™ÃpadÄ›, že použÃváte automatický start pÅ™Ãmo v jádÅ™e, nebo pokud k " +"zavedenà systému žádná pole nepotÅ™ebujete." + +#. Type: text +#. Description +#: ../mdadm.templates:4001 +msgid "An error occurred: device node does not exist" +msgstr "Vyskytla se chyba: uzel zaÅ™Ãzenà neexistuje" + +#. Type: text +#. Description +#: ../mdadm.templates:5001 +msgid "An error occurred: not a block device" +msgstr "Vyskytla se chyba: nenà blokovým zaÅ™ÃzenÃm" + +#. Type: text +#. Description +#: ../mdadm.templates:6001 +msgid "An error occurred: not an MD array" +msgstr "Vyskytla se chyba: nenà MD polem" + +#. Type: text +#. Description +#: ../mdadm.templates:7001 +msgid "An error occurred: array not listed in mdadm.conf file" +msgstr "Vyskytla se chyba: pole nenà uvedeno v souboru mdadm.conf" + +#. Type: boolean +#. Description +#: ../mdadm.templates:8001 +msgid "Start arrays not listed in mdadm.conf?" +msgstr "Spustit pole neuvedená v mdadm.conf?" + +#. Type: boolean +#. Description +#: ../mdadm.templates:8001 +msgid "" +"The specified array (${array}) is not listed in the configuration file " +"(${config}). Therefore, it cannot be started during boot, unless you correct " +"the configuration file and recreate the initial ramdisk." +msgstr "" +"Zadané pole (${array}) nenà uvedeno v konfiguraÄnÃm souboru ${config} a tÃm " +"pádem nemůže být spuÅ¡tÄ›no pÅ™i zavádÄ›nà systému. Napravit to můžete opravou " +"konfiguraÄnÃho souboru a znovuvytvoÅ™enÃm poÄáteÄnÃho ramdisku." + +#. Type: boolean +#. Description +#: ../mdadm.templates:8001 +msgid "" +"This warning is only relevant if you need arrays to be started from the " +"initial ramdisk to be able to boot. If you use kernel autostarting, or do " +"not need any arrays to be started as early as the initial ramdisk is loaded, " +"you can simply continue. Alternatively, choose not to continue and enter " +"'none' when prompted which arrays to start from the initial ramdisk." +msgstr "" +"Toto varovánà je relevantnà pouze pokud k zavedenà systému potÅ™ebujete, aby " +"se pole spustila z poÄáteÄnÃho ramdisku. PoužÃváte-li automatické spouÅ¡tÄ›nà " +"pÅ™Ãmo v jádÅ™e, nebo pokud nepotÅ™ebujete pouÅ¡tÄ›t žádná pole jeÅ¡tÄ› z " +"poÄáteÄnÃho ramdisku, můžete jednoduÅ¡e pokraÄovat. Jinou možnostà je " +"nepokraÄovat dále a pÅ™i dotazu na seznam polÃ, která se majà spouÅ¡tÄ›t z " +"poÄáteÄnÃho ramdisku, zadat 'none'." + +#. Type: boolean +#. Description +#: ../mdadm.templates:10001 +msgid "Should mdadm run monthly redundancy checks of the MD arrays?" +msgstr "Má mdadm spouÅ¡tÄ›t mÄ›sÃÄnà kontroly redundance MD polÃ?" + +#. Type: boolean +#. Description +#: ../mdadm.templates:10001 +msgid "" +"If the kernel supports it (versions greater than 2.6.14), mdadm can " +"periodically check the redundancy of MD arrays (RAIDs). This may be a " +"resource-intensive process, depending on the local setup, but it could help " +"prevent rare cases of data loss. Note that this is a read-only check unless " +"errors are found; if errors are found, mdadm will try to correct them, which " +"may result in write access to the media." +msgstr "" +"Pokud to vaÅ¡e jádro podporuje (verze vÄ›tÅ¡Ã než 2.6.14), může mdadm " +"pravidelnÄ› kontrolovat redundanci MD polà (RAIDů). Podle konfigurace " +"poÄÃtaÄe to může být proces velmi nároÄný na prostÅ™edky, ovÅ¡em může pÅ™edejÃt " +"vzácným pÅ™Ãpadům ztráty dat. Pokud nejsou nalezeny chyby, použÃvá tato " +"kontrola v zásadÄ› jen Ätecà operace. PÅ™i nalezenà chyb se je mdadm pokusà " +"opravit, což může znamenat zápis na médium." + +#. Type: boolean +#. Description +#: ../mdadm.templates:10001 +msgid "" +"The default, if turned on, is to check on the first Sunday of every month at " +"01:06." +msgstr "" +"Pokud kontrolu povolÃte, bude se dle výchozÃho nastavenà spouÅ¡tÄ›t každou " +"prvnà nedÄ›li v mÄ›sÃci v 01:06 ráno." + +#. Type: boolean +#. Description +#: ../mdadm.templates:11001 +msgid "Do you want to start the MD monitoring daemon?" +msgstr "Chcete spustit daemon pro monitorovánà MD?" + +#. Type: boolean +#. Description +#: ../mdadm.templates:11001 +msgid "" +"The MD (RAID) monitor daemon sends email notifications in response to " +"important MD events (such as a disk failure)." +msgstr "" +"Daemon pro monitorovánà MD (RAIDu) zasÃlá emailová upozornÄ›nà na významné MD " +"události, jako je selhánà disku." + +#. Type: boolean +#. Description +#: ../mdadm.templates:11001 +msgid "Enabling this option is recommended." +msgstr "Povolenà této možnosti je doporuÄeno." + +#. Type: string +#. Description +#: ../mdadm.templates:12001 +msgid "Recipient for email notifications:" +msgstr "PÅ™Ãjemce emailových upozornÄ›nÃ:" + +#. Type: string +#. Description +#: ../mdadm.templates:12001 +msgid "" +"Please enter the email address of the user who should get the email " +"notifications for important MD events." +msgstr "" +"Zadejte prosÃm emailovou adresu uživatele, který má dostávat emailová " +"upozornÄ›nà pÅ™i výskytu významných MD událostÃ." + +#~ msgid "Initialise the superblock if you reuse hard disks" +#~ msgstr "PÅ™i znovupoužità starÅ¡Ãch disků inicializujte superblok" + +#~ msgid "" +#~ "WARNING! If you are using hard disks which have RAID superblocks from " +#~ "earlier installations in different RAID arrays, you MUST zero each " +#~ "superblock *before* activating the autostart feature." +#~ msgstr "" +#~ "VAROVÃNÃ! PoužÃváte-li pevné disky, které obsahujà RAID superbloky z " +#~ "dÅ™ÃvÄ›jÅ¡Ã instalace v jiném RAID poli, MUSÃTE vÅ¡echny superbloky pÅ™ed " +#~ "použitÃm automatického spouÅ¡tÄ›nà vynulovat." + +#~ msgid "" +#~ "To do this, do not start the RAID devices automatically. First, zero the " +#~ "superblock (mdadm --zero-superblock /dev/mdX). Next, use `dpkg-" +#~ "reconfigure mdadm` to reactivate the autostart feature." +#~ msgstr "" +#~ "Chcete-li to provést, nespouÅ¡tÄ›jte RAID zaÅ™Ãzenà automaticky. Nejprve " +#~ "vynulujte superblok pÅ™Ãkazem 'mdadm --zero-superblock /dev/mdX' a teprve " +#~ "poté můžete povolit automatické spouÅ¡tÄ›nà RAIDu pÅ™Ãkazem 'dpkg-" +#~ "reconfigure mdadm'." + +#~ msgid "" +#~ "You have the option to start all other arrays (those not needed for the " +#~ "root filesystem) later in the boot sequence. Doing so will give you " +#~ "greater control over the arrays with the mdadm configuration file. " +#~ "Starting all arrays at boot-time may be safer though." +#~ msgstr "" +#~ "VÅ¡echna ostatnà pole (ta, která nejsou potÅ™eba pro koÅ™enový souborový " +#~ "systém) můžete spustit pozdÄ›ji. Pokud tak uÄinÃte, budete mÃt v " +#~ "konfiguraÄnÃm souboru mdadm nad poli vÄ›tÅ¡Ã kontrolu. Na druhou stranu je " +#~ "spouÅ¡tÄ›nà vÅ¡ech polà hned na zaÄátku zavádÄ›nà o nÄ›co jistÄ›jÅ¡Ã volbou." + +#~ msgid "" +#~ "If RAID devices are started automatically, all RAID devices are " +#~ "discovered and assembled automatically at system startup. This option " +#~ "should only be used if the md driver is compiled as a module. If it is " +#~ "compiled into your kernel, the automatic startup will be performed at " +#~ "boot time by the kernel and therefore you should not choose this option." +#~ msgstr "" +#~ "Jestliže jsou RAID zaÅ™Ãzenà spouÅ¡tÄ›na automaticky, jsou vÅ¡echna RAID " +#~ "zaÅ™Ãzenà rozpoznána a poskládána automaticky pÅ™i zavádÄ›nà systému. Tuto " +#~ "volbu byste mÄ›li použÃt pouze v pÅ™ÃpadÄ›, že jste ovladaÄ md zakompilovali " +#~ "jako modul. Pokud jste jej zakompilovali pÅ™Ãmo do jádra, o automatické " +#~ "spuÅ¡tÄ›nà se postará samotné jádro a tedy tuto možnost nepotÅ™ebujete." diff --git a/debian/po/da.po b/debian/po/da.po new file mode 100644 index 00000000..47294504 --- /dev/null +++ b/debian/po/da.po @@ -0,0 +1,210 @@ +# Danish translation mdadm. +# Copyright (C) 2011 mdadm & nedenstÃ¥ende oversættere. +# This file is distributed under the same license as the mdadm package. +# Joe Hansen <joedalton2@yahoo.dk>, 2011. +# +msgid "" +msgstr "" +"Project-Id-Version: mdadm\n" +"Report-Msgid-Bugs-To: mdadm@packages.debian.org\n" +"POT-Creation-Date: 2012-05-22 00:50+0100\n" +"PO-Revision-Date: 2011-04-03 17:30+01:00\n" +"Last-Translator: Joe Hansen <joedalton2@yahoo.dk>\n" +"Language-Team: Danish <debian-l10n-danish@lists.debian.org> \n" +"Language: \n" +"MIME-Version: 1.0\n" +"Content-Type: text/plain; charset=UTF-8\n" +"Content-Transfer-Encoding: 8bit\n" + +#. Type: string +#. Description +#: ../mdadm.templates:2001 +msgid "MD arrays needed for the root file system:" +msgstr "MD arrays krævet for rodfilsystemet:" + +#. Type: string +#. Description +#: ../mdadm.templates:2001 +msgid "" +"Please enter 'all', 'none', or a space-separated list of devices such as " +"'md0 md1' or 'md/1 md/d0' (the leading '/dev/' can be omitted)." +msgstr "" +"Indtast venligst »all«, »none« eller en mellemrumsadskilt liste af enheder " +"sÃ¥som »md0 md1« eller »md/1 md/d0« (det foranstillede »/dev/« kan udelades)." + +#. Type: text +#. Description +#: ../mdadm.templates:3001 +msgid "for internal use - only the long description is needed." +msgstr "for intern brug - kun den lange beskrivelse er krævet." + +#. Type: text +#. Description +#: ../mdadm.templates:3001 +msgid "" +"If the system's root file system is located on an MD array (RAID), it needs " +"to be started early during the boot sequence. If it is located on a logical " +"volume (LVM), which is on MD, all constituent arrays need to be started." +msgstr "" +"Hvis systemets rodfilsystem er placeret pÃ¥ en MD-array (RAID), skal det " +"startes tidligt under opstartssekvensen. Hvis den er placeret pÃ¥ en logisk " +"diskenhed (LVM), som er pÃ¥ MD, skal alle indgÃ¥ende arrays startes." + +#. Type: text +#. Description +#: ../mdadm.templates:3001 +msgid "" +"If you know exactly which arrays are needed to bring up the root file " +"system, and you want to postpone starting all other arrays to a later point " +"in the boot sequence, enter the arrays to start here. Alternatively, enter " +"'all' to simply start all available arrays." +msgstr "" +"Hvis du ved præcis hvilke arrays som er krævet, for at fÃ¥ rodfilsystemet op, " +"og du ønsker at udsætte start af alle andre arrays til et senere tidspunkt i " +"opstartssekvensen, sÃ¥ indtast her de arrays som skal startes. Alternativt " +"kan du indtaste »all« for at starte alle tilgængelige arrays." + +#. Type: text +#. Description +#: ../mdadm.templates:3001 +msgid "" +"If you do not need or want to start any arrays for the root file system, " +"leave the answer blank (or enter 'none'). This may be the case if you are " +"using kernel autostart or do not need any arrays to boot." +msgstr "" +"Hvis du ikke har brug for eller ønsker at starte nogen arrays for " +"rodfilsystemet, sÃ¥ efterlad svaret tomt (eller indtast »none«). Dette kan " +"være tilfældet, hvis du bruger automatisk start af kernen eller ikke skal " +"bruge arrays til at starte op med." + +#. Type: text +#. Description +#: ../mdadm.templates:4001 +msgid "An error occurred: device node does not exist" +msgstr "Der opstod en fejl: Enhedsknude findes ikke" + +#. Type: text +#. Description +#: ../mdadm.templates:5001 +msgid "An error occurred: not a block device" +msgstr "Der opstod en fejl: Ikke en blokenhed" + +#. Type: text +#. Description +#: ../mdadm.templates:6001 +msgid "An error occurred: not an MD array" +msgstr "Der opstod en fejl: Ikke en MD array" + +#. Type: text +#. Description +#: ../mdadm.templates:7001 +msgid "An error occurred: array not listed in mdadm.conf file" +msgstr "Der opstod en fejl: Array er ikke anført i mdadm.conf-filen" + +#. Type: boolean +#. Description +#: ../mdadm.templates:8001 +msgid "Start arrays not listed in mdadm.conf?" +msgstr "Start arrays er ikke anført i mdadm.conf?" + +#. Type: boolean +#. Description +#: ../mdadm.templates:8001 +msgid "" +"The specified array (${array}) is not listed in the configuration file " +"(${config}). Therefore, it cannot be started during boot, unless you correct " +"the configuration file and recreate the initial ramdisk." +msgstr "" +"Den angivne array (${array}) er ikke anført i konfigurationsfilen " +"(${config}). Den kan derfor ikke startes under opstarten, med mindre du " +"retter i konfigurationsfilen og gendanner den oprindleige ramdisk." + +#. Type: boolean +#. Description +#: ../mdadm.templates:8001 +msgid "" +"This warning is only relevant if you need arrays to be started from the " +"initial ramdisk to be able to boot. If you use kernel autostarting, or do " +"not need any arrays to be started as early as the initial ramdisk is loaded, " +"you can simply continue. Alternatively, choose not to continue and enter " +"'none' when prompted which arrays to start from the initial ramdisk." +msgstr "" +"Denne advarsel er kun relevant hvis du skal have arrays til at blive startet " +"fra den oprindelige ramdisk for at kunne starte op. Hvis du bruger den " +"automatiske opstart i kernen, eller ikke skal bruge at arrays startes sÃ¥ " +"tidligt som den oprindelige ramdisk indlæses, sÃ¥ kan du bare fortsætte. " +"Alternativt sÃ¥ vælg at fortsætte og indtaste »none« nÃ¥r du bliver spurgt om " +"hvilke arrays, der skal startes fra den oprindelige ramdisk." + +#. Type: boolean +#. Description +#: ../mdadm.templates:10001 +msgid "Should mdadm run monthly redundancy checks of the MD arrays?" +msgstr "Skal mdadm køre mÃ¥nedlig redundanskontrol af MD arrays?" + +#. Type: boolean +#. Description +#: ../mdadm.templates:10001 +msgid "" +"If the kernel supports it (versions greater than 2.6.14), mdadm can " +"periodically check the redundancy of MD arrays (RAIDs). This may be a " +"resource-intensive process, depending on the local setup, but it could help " +"prevent rare cases of data loss. Note that this is a read-only check unless " +"errors are found; if errors are found, mdadm will try to correct them, which " +"may result in write access to the media." +msgstr "" +"Hvis kernen understøtter det (versioner større end 2.6.14), kan mdadm " +"periodisk kontrollere redundansen pÃ¥ MD arrays (RAID'er). Det kan være en " +"ressourcekrævende proces, afhængig af den lokale opsætning, men det kan " +"hjælpe med at forhindre at du i sjældne tilfælde fÃ¥r datatab. Bemærk at " +"dette er en skrivebeskyttet kontrol med mindre at der findes fejl; hvis der " +"registreres fejl vil mdadm forsøge at rette dem, hvilket kan medføre " +"skriveadgang til mediet." + +#. Type: boolean +#. Description +#: ../mdadm.templates:10001 +msgid "" +"The default, if turned on, is to check on the first Sunday of every month at " +"01:06." +msgstr "" +"Standarden - hvis aktiveret - er at kontrollere pÃ¥ den første søndag i hver " +"mÃ¥ned klokken 01:06." + +#. Type: boolean +#. Description +#: ../mdadm.templates:11001 +msgid "Do you want to start the MD monitoring daemon?" +msgstr "Ønsker du at starte MD-overvÃ¥gningsdæmonen?" + +#. Type: boolean +#. Description +#: ../mdadm.templates:11001 +msgid "" +"The MD (RAID) monitor daemon sends email notifications in response to " +"important MD events (such as a disk failure)." +msgstr "" +"MD-overvÃ¥gningsdæmonem (RAID) sender e-post-pÃ¥mindelser udløst af vigtige MD-" +"hændelser (sÃ¥som en diskfejl)." + +#. Type: boolean +#. Description +#: ../mdadm.templates:11001 +msgid "Enabling this option is recommended." +msgstr "Aktivering af denne indstilling anbefales." + +#. Type: string +#. Description +#: ../mdadm.templates:12001 +msgid "Recipient for email notifications:" +msgstr "Modtager af e-post-pÃ¥mindelser:" + +#. Type: string +#. Description +#: ../mdadm.templates:12001 +msgid "" +"Please enter the email address of the user who should get the email " +"notifications for important MD events." +msgstr "" +"Indtast venligst e-post-adressen pÃ¥ brugeren, som skal modtage e-post-" +"pÃ¥mindelser for vigtige MD-hændelser." diff --git a/debian/po/de.po b/debian/po/de.po new file mode 100644 index 00000000..e1191e21 --- /dev/null +++ b/debian/po/de.po @@ -0,0 +1,284 @@ +# Translators, if you are not familiar with the PO format, gettext +# documentation is worth reading, especially sections dedicated to +# this format, e.g. by running: +# info -n '(gettext)PO Files' +# info -n '(gettext)Header Entry' +# Some information specific to po-debconf are available at +# /usr/share/doc/po-debconf/README-trans +# or http://www.debian.org/intl/l10n/po-debconf/README-trans +# Developers do not need to manually edit POT or PO files. +# +# +msgid "" +msgstr "" +"Project-Id-Version: mdadm 2.6.9-3\n" +"Report-Msgid-Bugs-To: mdadm@packages.debian.org\n" +"POT-Creation-Date: 2012-05-22 00:50+0100\n" +"PO-Revision-Date: 2009-06-24 17:35+0200\n" +"Last-Translator: Mario Joussen <joussen@debian.org>\n" +"Language-Team: German <debian-l10n-german@lists.debian.org>\n" +"Language: de\n" +"MIME-Version: 1.0\n" +"Content-Type: text/plain; charset=ISO-8859-1\n" +"Content-Transfer-Encoding: 8bit\n" + +#. Type: string +#. Description +#: ../mdadm.templates:2001 +msgid "MD arrays needed for the root file system:" +msgstr "Für das Wurzeldateisystem benötigte MD folgende Verbünde:" + +#. Type: string +#. Description +#: ../mdadm.templates:2001 +msgid "" +"Please enter 'all', 'none', or a space-separated list of devices such as " +"'md0 md1' or 'md/1 md/d0' (the leading '/dev/' can be omitted)." +msgstr "" +"Bitte geben Sie »all«, »none« oder eine leerzeichenseparierte Geräteliste " +"wie zum Beispiel »md0 md1« oder »md/1 md/d0« ein (das führende »/dev« kann " +"weggelassen werden)." + +#. Type: text +#. Description +#: ../mdadm.templates:3001 +msgid "for internal use - only the long description is needed." +msgstr "" +"für internen Gebrauch - es wird nur die ausführliche Beschreibung benötigt." + +#. Type: text +#. Description +#: ../mdadm.templates:3001 +msgid "" +"If the system's root file system is located on an MD array (RAID), it needs " +"to be started early during the boot sequence. If it is located on a logical " +"volume (LVM), which is on MD, all constituent arrays need to be started." +msgstr "" +"Wenn das Wurzeldateisystem Ihres Systems auf einem MD-Verbund (RAID) liegt, " +"muss es frühzeitig während des Bootvorgangs gestartet werden. Wenn sich Ihr " +"Wurzeldateisystem auf einem logischen Laufwerk (LVM) befindet, das sich " +"wiederum auf einem MD Verbund befindet, müssen alle zugehörigen Verbünde " +"gestartet werden." + +#. Type: text +#. Description +#: ../mdadm.templates:3001 +msgid "" +"If you know exactly which arrays are needed to bring up the root file " +"system, and you want to postpone starting all other arrays to a later point " +"in the boot sequence, enter the arrays to start here. Alternatively, enter " +"'all' to simply start all available arrays." +msgstr "" +"Wenn Sie genau wissen, welche Verbünde benötigt werden, um das " +"Wurzeldateisystem zu starten, und Sie den Start der anderen Verbünde auf " +"einen späteren Zeitpunkt in der Bootreihenfolge verschieben wollen, geben " +"Sie die zu startenden Verbünde hier ein. Alternativ geben Sie »all« ein, um " +"alle verfügbaren Verbünde zu starten." + +#. Type: text +#. Description +#: ../mdadm.templates:3001 +msgid "" +"If you do not need or want to start any arrays for the root file system, " +"leave the answer blank (or enter 'none'). This may be the case if you are " +"using kernel autostart or do not need any arrays to boot." +msgstr "" +"Falls Sie keine RAID-Verbünde für das Wurzeldateisystem benötigen oder " +"starten wollen, lassen Sie die Antwort leer (oder geben »none« ein). Dies " +"könnte der Fall sein, wenn Sie entweder die Autostartfunktion des Kernels " +"verwenden oder keine Verbünde zum Booten benötigen." + +#. Type: text +#. Description +#: ../mdadm.templates:4001 +msgid "An error occurred: device node does not exist" +msgstr "Ein Fehler ist aufgetreten: Geräteknoten existiert nicht" + +#. Type: text +#. Description +#: ../mdadm.templates:5001 +msgid "An error occurred: not a block device" +msgstr "Ein Fehler ist aufgetreten: kein Blockgerät" + +#. Type: text +#. Description +#: ../mdadm.templates:6001 +msgid "An error occurred: not an MD array" +msgstr "Ein Fehler ist aufgetreten: kein RAID-Verbund" + +#. Type: text +#. Description +#: ../mdadm.templates:7001 +msgid "An error occurred: array not listed in mdadm.conf file" +msgstr "" +"Ein Fehler ist aufgetreten: Verbund nicht in der Datei mdadm.conf aufgeführt" + +#. Type: boolean +#. Description +#: ../mdadm.templates:8001 +msgid "Start arrays not listed in mdadm.conf?" +msgstr "Nicht in mdadm.conf aufgeführte Verbünde starten?" + +#. Type: boolean +#. Description +#: ../mdadm.templates:8001 +msgid "" +"The specified array (${array}) is not listed in the configuration file " +"(${config}). Therefore, it cannot be started during boot, unless you correct " +"the configuration file and recreate the initial ramdisk." +msgstr "" +"Der angegebene Verbund (${array}) ist in der Konfigurationsdatei ${config} " +"nicht aufgeführt. Deshalb kann er während des Bootvorgangs nicht gestartet " +"werden, es sei denn, Sie korrigieren die Konfigurationsdatei und erzeugen " +"die initiale Ramdisk neu." + +#. Type: boolean +#. Description +#: ../mdadm.templates:8001 +msgid "" +"This warning is only relevant if you need arrays to be started from the " +"initial ramdisk to be able to boot. If you use kernel autostarting, or do " +"not need any arrays to be started as early as the initial ramdisk is loaded, " +"you can simply continue. Alternatively, choose not to continue and enter " +"'none' when prompted which arrays to start from the initial ramdisk." +msgstr "" +"Diese Warnung ist nur von Bedeutung, wenn Sie RAID-Verbünde, die von der " +"initialen Ramdisk gestartet werden, benötigen, um booten zu können. Falls " +"Sie die Autostartfunktion des Kernels verwenden oder kein RAID-Verbund zum " +"frühen Zeitpunkt des Ladens der initialen Ramdisk gestartet werden muss, " +"können Sie einfach fortfahren. Alternativ wählen Sie, nicht fortzufahren und " +"geben »none« ein, wenn Sie gefragt werden, welche RAID-Verbünde von der " +"initialen Ramdisk gestartet werden sollen." + +#. Type: boolean +#. Description +#: ../mdadm.templates:10001 +msgid "Should mdadm run monthly redundancy checks of the MD arrays?" +msgstr "" +"Soll mdadm monatlich die Redundanzüberprüfung auf den RAID-Verbünden " +"ausführen?" + +#. Type: boolean +#. Description +#: ../mdadm.templates:10001 +msgid "" +"If the kernel supports it (versions greater than 2.6.14), mdadm can " +"periodically check the redundancy of MD arrays (RAIDs). This may be a " +"resource-intensive process, depending on the local setup, but it could help " +"prevent rare cases of data loss. Note that this is a read-only check unless " +"errors are found; if errors are found, mdadm will try to correct them, which " +"may result in write access to the media." +msgstr "" +"Falls Ihr Kernel es unterstützt (Versionen größer als 2.6.14) kann mdadm " +"regelmäßig die Redundanz Ihrer MD-Verbünde (RAID) überprüfen. Dies kann " +"abhängig von Ihrer Installation ein resourcenintensiver Vorgang sein, der " +"aber helfen kann, seltene Fälle von Datenverlust zu vermeiden. Bitte " +"beachten Sie, dass diese Überprüfung nur lesend erfolgt, solange keine " +"Fehler gefunden werden. Falls Fehler gefunden werden, wird mdadm versuchen, " +"diese zu beheben, was zu schreibendem Zugriff auf das Medium führen kann." + +#. Type: boolean +#. Description +#: ../mdadm.templates:10001 +msgid "" +"The default, if turned on, is to check on the first Sunday of every month at " +"01:06." +msgstr "" +"Die Voreinstellung ist, falls eingeschaltet, die Überprüfung am ersten " +"Sonntag jedes Monats um 01:06 Uhr durchzuführen." + +#. Type: boolean +#. Description +#: ../mdadm.templates:11001 +msgid "Do you want to start the MD monitoring daemon?" +msgstr "Möchten Sie den RAID-Überwachungsdämon starten?" + +#. Type: boolean +#. Description +#: ../mdadm.templates:11001 +msgid "" +"The MD (RAID) monitor daemon sends email notifications in response to " +"important MD events (such as a disk failure)." +msgstr "" +"Der MD- (RAID-)Überwachungsdämon verschickt Benachrichtigungen als Reaktion " +"auf wichtige RAID-Ereignisse (wie zum Beispiel Festplattenfehler)." + +#. Type: boolean +#. Description +#: ../mdadm.templates:11001 +msgid "Enabling this option is recommended." +msgstr "Das Aktivieren dieser Option ist empfohlen." + +#. Type: string +#. Description +#: ../mdadm.templates:12001 +msgid "Recipient for email notifications:" +msgstr "Empfänger der E-Mail-Benachrichtungen:" + +#. Type: string +#. Description +#: ../mdadm.templates:12001 +msgid "" +"Please enter the email address of the user who should get the email " +"notifications for important MD events." +msgstr "" +"Geben Sie bitte die E-Mail-Adresse des Benutzers an, der die E-Mail-" +"Benachrichtigung für wichtigen MD-Ereignisse erhalten soll." + +#~ msgid "Initialise the superblock if you reuse hard disks" +#~ msgstr "" +#~ "Initialisieren Sie den Superblock, wenn Sie Festplatten wieder verwenden." + +#~ msgid "" +#~ "WARNING! If you are using hard disks which have RAID superblocks from " +#~ "earlier installations in different RAID arrays, you MUST zero each " +#~ "superblock *before* activating the autostart feature." +#~ msgstr "" +#~ "WARNUNG! Wenn Sie Festplatten verwenden, die bereits einen md-Superblock " +#~ "von einer vorherigen Installation in einem anderen RAID-Verbund besitzen, " +#~ "so MÜSSEN Sie diesen löschen, *bevor* Sie die Autostart-Funktion " +#~ "aktivieren." + +#~ msgid "" +#~ "To do this, do not start the RAID devices automatically. First, zero the " +#~ "superblock (mdadm --zero-superblock /dev/mdX). Next, use `dpkg-" +#~ "reconfigure mdadm` to reactivate the autostart feature." +#~ msgstr "" +#~ "Dazu starten Sie die RAID-Laufwerke nicht automatisch und löschen dann " +#~ "erst den Superblock (mdadm --zero-superblock /dev/mdX). Danach können Sie " +#~ "mit »dpkg-reconfigure mdadm« die Autostart-Funktion aktivieren." + +#~ msgid "" +#~ "You have the option to start all other arrays (those not needed for the " +#~ "root filesystem) later in the boot sequence. Doing so will give you " +#~ "greater control over the arrays with the mdadm configuration file. " +#~ "Starting all arrays at boot-time may be safer though." +#~ msgstr "" +#~ "Sie haben die Option, alle anderen Verbünde (diese die nicht für das " +#~ "Wurzeldateisystem benötigt werden) später während des Bootvorgangs zu " +#~ "starten. Damit haben Sie größere Kontrolle über die Verbünde mit Hilfe " +#~ "der mdadm-Konfigurationsdatei. Es ist jedoch sicherer, alle Verbünde beim " +#~ "Booten zu starten." + +#~ msgid "" +#~ "If RAID devices are started automatically, all RAID devices are " +#~ "discovered and assembled automatically at system startup. This option " +#~ "should only be used if the md driver is compiled as a module. If it is " +#~ "compiled into your kernel, the automatic startup will be performed at " +#~ "boot time by the kernel and therefore you should not choose this option." +#~ msgstr "" +#~ "Wenn die RAID-Laufwerke automatisch gestartet werden, werden alle RAID-" +#~ "Laufwerke beim Systemstart automatisch gefunden und gestartet. Diese " +#~ "Option sollte nur benutzt werden, falls der md-Treiber als Modul " +#~ "kompiliert wurde. Falls er in den Kernel einkompiliert wurde, führt der " +#~ "Kernel den automatischen Start beim Booten durch und Sie sollten diese " +#~ "Option deshalb nicht auswählen." + +#~ msgid "" +#~ "When the RAID monitor daemon runs, email notifications are sent when a " +#~ "disk belonging to a RAID array fails or changes its status for some " +#~ "reason." +#~ msgstr "" +#~ "Wird der RAID-Überwachungsdaemon gestartet, so werden E-Mail-" +#~ "Benachrichtigungen verschickt, falls ein zum RAID gehörendes Laufwerk " +#~ "ausfällt oder den Status ändert." diff --git a/debian/po/es.po b/debian/po/es.po new file mode 100644 index 00000000..d8809892 --- /dev/null +++ b/debian/po/es.po @@ -0,0 +1,254 @@ +# mdadm po-debconf translation to spanish +# Copyright (C) 2006 Software in the Public Interest, SPI Inc. +# This file is distributed under the same license as the mdadm package. +# +# Changes: +# - Initial translation +# Javier Fernández-Sanguino , 2006 +# - Revision +# Fernando Cerezal +# +# +# Traductores, si no conoce el formato PO, merece la pena leer la +# documentación de gettext, especialmente las secciones dedicadas a este +# formato, por ejemplo ejecutando: +# info -n '(gettext)PO Files' +# info -n '(gettext)Header Entry' +# +# Equipo de traducción al español, por favor lean antes de traducir +# los siguientes documentos: +# +# - El proyecto de traducción de Debian al español +# http://www.debian.org/intl/spanish/ +# especialmente las notas y normas de traducción en +# http://www.debian.org/intl/spanish/notas +# +# - La guía de traducción de po's de debconf: +# /usr/share/doc/po-debconf/README-trans +# o http://www.debian.org/intl/l10n/po-debconf/README-trans +# +# Si tiene dudas o consultas sobre esta traducción consulte con el último +# traductor (campo Last-Translator) y ponga en copia a la lista de +# traducción de Debian al español (<debian-l10n-spanish@lists.debian.org>) +# +# Notas: +# - 'array' no está traducido aán. La traducción como 'arreglo' suena +# fatal (y es poco conocida) [ cambiar cuando se cambie en d-i ] +# +msgid "" +msgstr "" +"Project-Id-Version: mdadm 2.5.6-6\n" +"Report-Msgid-Bugs-To: mdadm@packages.debian.org\n" +"POT-Creation-Date: 2012-05-22 00:50+0100\n" +"PO-Revision-Date: 2008-04-25 17:47+0200\n" +"Last-Translator: Javier Fernández-Sanguino <jfs@debian.org>\n" +"Language-Team: Debian Spanish <debian-l10n-spanish@lists.debian.org>\n" +"Language: \n" +"MIME-Version: 1.0\n" +"Content-Type: text/plain; charset=ISO-8859-15\n" +"Content-Transfer-Encoding: 8bit\n" + +#. Type: string +#. Description +#: ../mdadm.templates:2001 +msgid "MD arrays needed for the root file system:" +msgstr "Arrays MD necesarios para el sistema de ficheros raíz:" + +# No se traduce «all» y «none» porque no aparecen en la plantilla para traducir los elementos individuales +#. Type: string +#. Description +#: ../mdadm.templates:2001 +msgid "" +"Please enter 'all', 'none', or a space-separated list of devices such as " +"'md0 md1' or 'md/1 md/d0' (the leading '/dev/' can be omitted)." +msgstr "" +"Introduzca «all» (todos), «none» (ninguno) o una lista de dispositivos " +"separados por espacios como por ejemplo puede sólo introducir «md0 md1» o " +"«md/1 md/d0» (no tiene que preceder los nombres de dispositivos con «/dev»)." + +#. Type: text +#. Description +#: ../mdadm.templates:3001 +msgid "for internal use - only the long description is needed." +msgstr "para uso interno. Sólo se utiliza la descripción larga." + +#. Type: text +#. Description +#: ../mdadm.templates:3001 +msgid "" +"If the system's root file system is located on an MD array (RAID), it needs " +"to be started early during the boot sequence. If it is located on a logical " +"volume (LVM), which is on MD, all constituent arrays need to be started." +msgstr "" +"Si el sistema de ficheros raíz de su sistema está en un array MD (RAID) " +"tiene que inicializarse antes durante de la secuencia de arranque. Si está " +"en un volumen lógico (LVM), que está definido sobre un MD, todos los arrays " +"que lo forman tienen que haberse inicializado." + +#. Type: text +#. Description +#: ../mdadm.templates:3001 +msgid "" +"If you know exactly which arrays are needed to bring up the root file " +"system, and you want to postpone starting all other arrays to a later point " +"in the boot sequence, enter the arrays to start here. Alternatively, enter " +"'all' to simply start all available arrays." +msgstr "" +"Introduzca los arrays a iniciar aquí, si sabe con exactitud cuáles son " +"necesarios para arrancar el sistema de ficheros raíz y quiere posponer el " +"arranque de todos los demás arrays a un punto posterior de la secuencia de " +"arranque. También puede introducir «all» (todos) para, sencillamente, " +"iniciar todos los arrays disponibles." + +#. Type: text +#. Description +#: ../mdadm.templates:3001 +msgid "" +"If you do not need or want to start any arrays for the root file system, " +"leave the answer blank (or enter 'none'). This may be the case if you are " +"using kernel autostart or do not need any arrays to boot." +msgstr "" +"Puede dejar la respuesta en blanco (o introducir «none») si no necesita o " +"desea arrancar los arrays para el sistema de ficheros raíz. Este puede ser " +"su caso si está utilizando el autoarranque del núcleo o no necesita ningún " +"array para el arranque." + +#. Type: text +#. Description +#: ../mdadm.templates:4001 +msgid "An error occurred: device node does not exist" +msgstr "Se produjo un error: el nodo de dispositivo no existe" + +#. Type: text +#. Description +#: ../mdadm.templates:5001 +msgid "An error occurred: not a block device" +msgstr "Se produjo un error: no es un dispositivo de bloques" + +#. Type: text +#. Description +#: ../mdadm.templates:6001 +msgid "An error occurred: not an MD array" +msgstr "Se produjo un error: no es un array MD" + +#. Type: text +#. Description +#: ../mdadm.templates:7001 +msgid "An error occurred: array not listed in mdadm.conf file" +msgstr "" +"Se produjo un error: el array no está en la lista definida en el archivo " +"mdadm.conf" + +#. Type: boolean +#. Description +#: ../mdadm.templates:8001 +msgid "Start arrays not listed in mdadm.conf?" +msgstr "¿Desea arrancar los arrays no listados en mdadm.conf?" + +#. Type: boolean +#. Description +#: ../mdadm.templates:8001 +msgid "" +"The specified array (${array}) is not listed in the configuration file " +"(${config}). Therefore, it cannot be started during boot, unless you correct " +"the configuration file and recreate the initial ramdisk." +msgstr "" +"El array que ha especificado (${array}) no está listado en el fichero de " +"configuración ${config}. Este array no podrá iniciarse durante el arranque " +"del sistema a no ser que corrija el fichero de configuración y regenere el " +"disco de ram inicial." + +#. Type: boolean +#. Description +#: ../mdadm.templates:8001 +msgid "" +"This warning is only relevant if you need arrays to be started from the " +"initial ramdisk to be able to boot. If you use kernel autostarting, or do " +"not need any arrays to be started as early as the initial ramdisk is loaded, " +"you can simply continue. Alternatively, choose not to continue and enter " +"'none' when prompted which arrays to start from the initial ramdisk." +msgstr "" +"Este aviso sólo es relevante si necesita que los arrays se inicien en el " +"disco de RAM inicial para poder arrancar el sistema. Si utiliza el " +"autoarranque del núcleo o no necesita que los arrays estén arrancados tan " +"pronto como se cargue el disco de RAM, puede continuar simplemente. También " +"puede decidir no continuar e introducir «none» cuando se le pregunte qué " +"arrays deberían arrancarse del disco de RAM inicial." + +#. Type: boolean +#. Description +#: ../mdadm.templates:10001 +msgid "Should mdadm run monthly redundancy checks of the MD arrays?" +msgstr "" +"¿Debería mdadm ejecutar comprobaciones de redundancia mensuales de los " +"arrays MD?" + +#. Type: boolean +#. Description +#: ../mdadm.templates:10001 +msgid "" +"If the kernel supports it (versions greater than 2.6.14), mdadm can " +"periodically check the redundancy of MD arrays (RAIDs). This may be a " +"resource-intensive process, depending on the local setup, but it could help " +"prevent rare cases of data loss. Note that this is a read-only check unless " +"errors are found; if errors are found, mdadm will try to correct them, which " +"may result in write access to the media." +msgstr "" +"Mdadm puede comprobar de forma periódica la redundancia de sus arrays MD " +"(RAIDs) si el núcleo lo soporta (si su versión es superior a la 2.6.14). " +"Esto puede ser un proceso que consuma muchos recursos, dependiendo de su " +"configuración, pero podría ayudar a prevenir casos raros de pérdida de " +"datos. Tenga en cuenta que estas comprobaciones se hacen en modo lectura " +"salvo que se detecten errores, en cuyo caso mdadm necesitará corregirlos, lo " +"que significa que será necesario tener acceso de escritura a los medios " +"físicos." + +#. Type: boolean +#. Description +#: ../mdadm.templates:10001 +msgid "" +"The default, if turned on, is to check on the first Sunday of every month at " +"01:06." +msgstr "" +"El valor por omisio, si se activa, es comprobar el primer Domingo de cada " +"mes a las 01:06 am." + +#. Type: boolean +#. Description +#: ../mdadm.templates:11001 +msgid "Do you want to start the MD monitoring daemon?" +msgstr "¿Desea arrancar el demonio de monitorización MD?" + +#. Type: boolean +#. Description +#: ../mdadm.templates:11001 +msgid "" +"The MD (RAID) monitor daemon sends email notifications in response to " +"important MD events (such as a disk failure)." +msgstr "" +"El demonio de monitorización MD (RAID) envía notificaciones por correo " +"electrónico cuando se producen eventos importantes en los dispositivos MD " +"(como pueda ser el caso de un fallo de un disco)." + +#. Type: boolean +#. Description +#: ../mdadm.templates:11001 +msgid "Enabling this option is recommended." +msgstr "Es opcional habilitar esta opción." + +#. Type: string +#. Description +#: ../mdadm.templates:12001 +msgid "Recipient for email notifications:" +msgstr "Destinatario de las notificaciones por correo:" + +#. Type: string +#. Description +#: ../mdadm.templates:12001 +msgid "" +"Please enter the email address of the user who should get the email " +"notifications for important MD events." +msgstr "" +"Introduzca la dirección de correo electrónico del usuario que debería " +"recibir las notificaciones por correo de eventos relevantes en los " +"dispositivos MD." diff --git a/debian/po/eu.po b/debian/po/eu.po new file mode 100644 index 00000000..06d8e537 --- /dev/null +++ b/debian/po/eu.po @@ -0,0 +1,211 @@ +# mdadm debconf templates basque translation +# Copyright (C) YEAR THE PACKAGE'S COPYRIGHT HOLDER +# This file is distributed under the same license as the PACKAGE package. +# Piarres Beobide <pi@beobide.net>, 2008. +# +msgid "" +msgstr "" +"Project-Id-Version: mdadm-debconf\n" +"Report-Msgid-Bugs-To: mdadm@packages.debian.org\n" +"POT-Creation-Date: 2012-05-22 00:50+0100\n" +"PO-Revision-Date: Y2008-04-30 11:00+0100\n" +"Last-Translator: Piarres Beobide <pi@beobide.net>\n" +"Language-Team: Euskara <debian-l10n-basque@lists.debian.org>\n" +"Language: \n" +"MIME-Version: 1.0\n" +"Content-Type: text/plain; charset=UTF-8\n" +"Content-Transfer-Encoding: 8bit\n" + +#. Type: string +#. Description +#: ../mdadm.templates:2001 +msgid "MD arrays needed for the root file system:" +msgstr "Erro fitxategi-sistemarentzat beharrezko MD array-ak:" + +#. Type: string +#. Description +#: ../mdadm.templates:2001 +msgid "" +"Please enter 'all', 'none', or a space-separated list of devices such as " +"'md0 md1' or 'md/1 md/d0' (the leading '/dev/' can be omitted)." +msgstr "" +"Mesedez idatzi 'denak', 'batez', edo zuriunez bereziriko gailuen zerrenda, " +"adibidez 'md0 md1' edo 'md/1 md/d0' (hasierako '/dev/' baztertu daiteke)." + +#. Type: text +#. Description +#: ../mdadm.templates:3001 +msgid "for internal use - only the long description is needed." +msgstr "barne erabilerako - deskribapen luzea bakarrik behar da." + +#. Type: text +#. Description +#: ../mdadm.templates:3001 +msgid "" +"If the system's root file system is located on an MD array (RAID), it needs " +"to be started early during the boot sequence. If it is located on a logical " +"volume (LVM), which is on MD, all constituent arrays need to be started." +msgstr "" +"Sistemaren erro fitxategi-sistema MD array (RAID) batetan kokaturik badago, " +"berau abio sekuentziaren hasieran abiarazi behar da. MD batetan kokaturiko " +"bolumen logiko (LVM) batetan badago osatzen duten array guztiak abiarazi " +"behar dira." + +#. Type: text +#. Description +#: ../mdadm.templates:3001 +msgid "" +"If you know exactly which arrays are needed to bring up the root file " +"system, and you want to postpone starting all other arrays to a later point " +"in the boot sequence, enter the arrays to start here. Alternatively, enter " +"'all' to simply start all available arrays." +msgstr "" +"Erro fitxategi-sistema erabiltzeko beharrezkoak diren arrayak zehazki jakin " +"eta beste array-en abiaraztea abioaren beranduagoko puntu batetara atzeratu " +"nahi baduzu, idatzi abiarazi beharreko array-ak hemen. Bestela idatzi " +"'denak' array erabilgarri guztiak abiarazteko." + +#. Type: text +#. Description +#: ../mdadm.templates:3001 +msgid "" +"If you do not need or want to start any arrays for the root file system, " +"leave the answer blank (or enter 'none'). This may be the case if you are " +"using kernel autostart or do not need any arrays to boot." +msgstr "" +"Ez baduzu erro fitxategi sistemarako array-rik abiarazi behar, hutsik utzi " +"ezazu (edo 'batez' idatzi). Hau abioan array-rik behar ez duzulako edo " +"kernel auto-abioa erabiltzen duzulako izan daiteke." + +#. Type: text +#. Description +#: ../mdadm.templates:4001 +msgid "An error occurred: device node does not exist" +msgstr "Errore bat gertatu da: gailu nodoa ez dago" + +#. Type: text +#. Description +#: ../mdadm.templates:5001 +msgid "An error occurred: not a block device" +msgstr "Errore bat gertatu da: ez da bloke gailu bat" + +#. Type: text +#. Description +#: ../mdadm.templates:6001 +msgid "An error occurred: not an MD array" +msgstr "Errore bat gertatu da: ez da MD array bat" + +#. Type: text +#. Description +#: ../mdadm.templates:7001 +msgid "An error occurred: array not listed in mdadm.conf file" +msgstr "" +"Errore bat gertatu da: array-a ez dago mdadm.conf fitxategian zerrendaturik" + +#. Type: boolean +#. Description +#: ../mdadm.templates:8001 +msgid "Start arrays not listed in mdadm.conf?" +msgstr "Abiarazi mdadm.conf fitxategian ez dauden array-ak?" + +#. Type: boolean +#. Description +#: ../mdadm.templates:8001 +msgid "" +"The specified array (${array}) is not listed in the configuration file " +"(${config}). Therefore, it cannot be started during boot, unless you correct " +"the configuration file and recreate the initial ramdisk." +msgstr "" +"Zehazturiko (${array}) array-a ez dago (${config}) konfigurazio fitxategian " +"zerrendaturiko. Horregatik ezin da abioan abiarazi zuk konfigurazio " +"fitxategia konpondu eta abio ramdiskoa bersortu arte." + +#. Type: boolean +#. Description +#: ../mdadm.templates:8001 +msgid "" +"This warning is only relevant if you need arrays to be started from the " +"initial ramdisk to be able to boot. If you use kernel autostarting, or do " +"not need any arrays to be started as early as the initial ramdisk is loaded, " +"you can simply continue. Alternatively, choose not to continue and enter " +"'none' when prompted which arrays to start from the initial ramdisk." +msgstr "" +"Abisu hau abiarazi ahal izateko ramdisk.etik array-ak baiaraztea behar " +"baduzu bakarrik da garrantzitsua. Kernel auto-abioa erabiltzen baduzu edo ez " +"baduzu ramdisk-etik hasieran array-rik kargatzea behar aurrera jarraitu " +"dezakezu. Bestela ez jarraitzea hautatu eta 'batez' idatzi hasierako ramdisk-" +"etik kargatu beharreko array-ez galdetzean." + +#. Type: boolean +#. Description +#: ../mdadm.templates:10001 +msgid "Should mdadm run monthly redundancy checks of the MD arrays?" +msgstr "" +"Mdadm-ek hilabetero egin behar al du MD array-en erredundantzia egiaztapena?" + +#. Type: boolean +#. Description +#: ../mdadm.templates:10001 +msgid "" +"If the kernel supports it (versions greater than 2.6.14), mdadm can " +"periodically check the redundancy of MD arrays (RAIDs). This may be a " +"resource-intensive process, depending on the local setup, but it could help " +"prevent rare cases of data loss. Note that this is a read-only check unless " +"errors are found; if errors are found, mdadm will try to correct them, which " +"may result in write access to the media." +msgstr "" +"Kernelak onartzen badu (2.6.14 baino bertsio berriagoak), mdadm-ek aldiro MD " +"array-en (RAID-en) erredundantzia aldiro egiazta dezake. Hau errekurtso-" +"behar handiko prozesu bat izan daiteke, konfigurazio lokalaren arabera, " +"baina datu galera kasuak saihesten lagundu dezake. Kontutan izan errorerik " +"aurkitzen ez bada irakurketa-soileko egiaztapen bat dela; errorerik " +"arukituko balitz mdadm konpontzen saiatuko da, honetarako euskarrian idaztea " +"beharrezko izan daitekeelarik." + +#. Type: boolean +#. Description +#: ../mdadm.templates:10001 +msgid "" +"The default, if turned on, is to check on the first Sunday of every month at " +"01:06." +msgstr "" +"Lehenespen bezala gaiturik dago, egiaztapena hilabete bakoitzeko lehenengo " +"asteleheneko 01:06-etan egingo da." + +#. Type: boolean +#. Description +#: ../mdadm.templates:11001 +msgid "Do you want to start the MD monitoring daemon?" +msgstr "MD monitorizazio deabrua abiarazi nahi al duzu?" + +#. Type: boolean +#. Description +#: ../mdadm.templates:11001 +msgid "" +"The MD (RAID) monitor daemon sends email notifications in response to " +"important MD events (such as a disk failure)." +msgstr "" +"MD (RAID) monitore deabruak eposta bidezko berri-emateak bidaltzen ditu " +"gertaera garrantzitsuetan (disko erroreen antzerakoetan)." + +#. Type: boolean +#. Description +#: ../mdadm.templates:11001 +msgid "Enabling this option is recommended." +msgstr "Aukera hau gaitzea gomendagarria da." + +#. Type: string +#. Description +#: ../mdadm.templates:12001 +msgid "Recipient for email notifications:" +msgstr "Eposta berri-emateen hartzailea:" + +#. Type: string +#. Description +#: ../mdadm.templates:12001 +msgid "" +"Please enter the email address of the user who should get the email " +"notifications for important MD events." +msgstr "" +"Mesedez idatzi MD gertaera garrantzitsuen berri emate mezuak jaso behar " +"dituen erabiltzailearen eposta helbidea." diff --git a/debian/po/fi.po b/debian/po/fi.po new file mode 100644 index 00000000..688baef0 --- /dev/null +++ b/debian/po/fi.po @@ -0,0 +1,208 @@ +msgid "" +msgstr "" +"Project-Id-Version: mdadm\n" +"Report-Msgid-Bugs-To: mdadm@packages.debian.org\n" +"POT-Creation-Date: 2012-05-22 00:50+0100\n" +"PO-Revision-Date: 2008-02-14 11:24+0200\n" +"Last-Translator: Esko Arajärvi <edu@iki.fi>\n" +"Language-Team: Finnish <debian-l10n-finnish@lists.debian.org>\n" +"Language: fi\n" +"MIME-Version: 1.0\n" +"Content-Type: text/plain; charset=UTF-8\n" +"Content-Transfer-Encoding: 8bit\n" +"X-Poedit-Language: Finnish\n" +"X-Poedit-Country: FINLAND\n" + +#. Type: string +#. Description +#: ../mdadm.templates:2001 +msgid "MD arrays needed for the root file system:" +msgstr "Juuritiedostojärjestelmän tarvitsemat MD-pakat:" + +#. Type: string +#. Description +#: ../mdadm.templates:2001 +msgid "" +"Please enter 'all', 'none', or a space-separated list of devices such as " +"'md0 md1' or 'md/1 md/d0' (the leading '/dev/' can be omitted)." +msgstr "" +"Syötä â€all†(kaikki), â€none†(ei mitään) tai välilyönnein eroteltu lista " +"laitteista, esimerkiksi â€md0 md1†tai â€md/1 md/d0†(edeltävä /dev/ voidaan " +"jättää pois)." + +#. Type: text +#. Description +#: ../mdadm.templates:3001 +msgid "for internal use - only the long description is needed." +msgstr "vain sisäiseen käyttöön - vain pitkä kuvaus on tarpeellinen." + +#. Type: text +#. Description +#: ../mdadm.templates:3001 +msgid "" +"If the system's root file system is located on an MD array (RAID), it needs " +"to be started early during the boot sequence. If it is located on a logical " +"volume (LVM), which is on MD, all constituent arrays need to be started." +msgstr "" +"Jos järjestelmän juuritiedostojärjestelmä sijaitsee MD-levypakassa (RAID), " +"pakka tulee käynnistää aikaisessa vaiheessa käynnistettäessä järjestelmää. " +"Jos se sijaitsee loogisella taltiolla (LVM), joka on MD-pakassa, kaikki " +"taltioon liittyvät pakat tulee käynnistää." + +#. Type: text +#. Description +#: ../mdadm.templates:3001 +msgid "" +"If you know exactly which arrays are needed to bring up the root file " +"system, and you want to postpone starting all other arrays to a later point " +"in the boot sequence, enter the arrays to start here. Alternatively, enter " +"'all' to simply start all available arrays." +msgstr "" +"Jos tiedät tarkalleen mitä pakkoja tarvitaan juuritiedostojärjestelmän " +"käynnistämiseen ja haluat viivästyttää muiden pakkojen käynnistystä, syötä " +"käynnistettävät pakat tähän. Vaihtoehtoisesti voit käynnistää kaikki pakat " +"syöttämällä â€allâ€." + +#. Type: text +#. Description +#: ../mdadm.templates:3001 +msgid "" +"If you do not need or want to start any arrays for the root file system, " +"leave the answer blank (or enter 'none'). This may be the case if you are " +"using kernel autostart or do not need any arrays to boot." +msgstr "" +"Jos mitään pakkoja ei tarvitse käynnistää juuritiedostojärjestelmän " +"käyttämiseksi, jätä kenttä tyhjäksi (tai syötä â€noneâ€). Tämä voi olla " +"tilanne, jos käytät ytimen autokäynnistystä tai et tarvitse mitään pakkoja " +"käynnistykseen." + +#. Type: text +#. Description +#: ../mdadm.templates:4001 +msgid "An error occurred: device node does not exist" +msgstr "Tapahtui virhe: laitetiedostoa ei ole olemassa" + +#. Type: text +#. Description +#: ../mdadm.templates:5001 +msgid "An error occurred: not a block device" +msgstr "Tapahtui virhe: ei lohkolaite" + +#. Type: text +#. Description +#: ../mdadm.templates:6001 +msgid "An error occurred: not an MD array" +msgstr "Tapahtui virhe: ei MD-pakka" + +#. Type: text +#. Description +#: ../mdadm.templates:7001 +msgid "An error occurred: array not listed in mdadm.conf file" +msgstr "Tapahtui virhe: pakkaa ei ole listattu tiedostossa mdadm.conf" + +#. Type: boolean +#. Description +#: ../mdadm.templates:8001 +msgid "Start arrays not listed in mdadm.conf?" +msgstr "Käynnistetäänkö pakat, joita ei ole listattu tiedostossa mdadm.conf?" + +#. Type: boolean +#. Description +#: ../mdadm.templates:8001 +msgid "" +"The specified array (${array}) is not listed in the configuration file " +"(${config}). Therefore, it cannot be started during boot, unless you correct " +"the configuration file and recreate the initial ramdisk." +msgstr "" +"Annettua pakkaa (${array}) ei ole listattu asetustiedostossa (${config}). " +"Niinpä sitä ei voida käynnistää käynnistettäessä järjestelmä, ellei " +"asetustiedostoa korjata ja käynnistysmuistilevyä (initrd) luoda uudelleen." + +#. Type: boolean +#. Description +#: ../mdadm.templates:8001 +msgid "" +"This warning is only relevant if you need arrays to be started from the " +"initial ramdisk to be able to boot. If you use kernel autostarting, or do " +"not need any arrays to be started as early as the initial ramdisk is loaded, " +"you can simply continue. Alternatively, choose not to continue and enter " +"'none' when prompted which arrays to start from the initial ramdisk." +msgstr "" +"Tämä varoitus on aiheellinen vain, jos järjestelmän käynnistäminen vaatii " +"pakkojen käynnistämistä käynnistysmuistilevyltä. Jos ytimen autokäynnistys " +"on käytössä tai pakkoja ei tarvita siinä vaiheessa, kun käynnistysmuistilevy " +"ladataan, voit jatkaa. Vaihtoehtoisesti voit olla jatkamatta ja syöttää " +"â€none†kysyttäessä käynnistysmuistilevyltä käynnistettäviä pakkoja." + +#. Type: boolean +#. Description +#: ../mdadm.templates:10001 +msgid "Should mdadm run monthly redundancy checks of the MD arrays?" +msgstr "Tulisiko mdadm:n tarkistaa kuukausittain MD-pakkojen eheys?" + +#. Type: boolean +#. Description +#: ../mdadm.templates:10001 +msgid "" +"If the kernel supports it (versions greater than 2.6.14), mdadm can " +"periodically check the redundancy of MD arrays (RAIDs). This may be a " +"resource-intensive process, depending on the local setup, but it could help " +"prevent rare cases of data loss. Note that this is a read-only check unless " +"errors are found; if errors are found, mdadm will try to correct them, which " +"may result in write access to the media." +msgstr "" +"Ohjelma mdadm voi säännöllisesti tarkistaa MD-pakkojen (RAIDien) tietojen " +"monistuksen, jos ydin tukee tätä (versiosta 2.6.14 eteenpäin). Tämä prosessi " +"voi paikallisesta kokoonpanosta riippuen kuluttaa paljon resursseja, mutta " +"saattaa ehkäistä tietojen menetyksiä tietyissä harvinaisissa tapauksissa. " +"Tarkistus vaatii vain tietojen lukemista, jos virheitä ei löyty. Jos " +"virheitä löytyy, mdadm yrittää korjata ne, jolloin levylle saatetaan myös " +"kirjoittaa." + +#. Type: boolean +#. Description +#: ../mdadm.templates:10001 +msgid "" +"The default, if turned on, is to check on the first Sunday of every month at " +"01:06." +msgstr "" +"Oletuksena, jos tarkistus on käytössä, se tehdään kuukauden ensimmäisenä " +"sunnuntaina kello 01.06." + +#. Type: boolean +#. Description +#: ../mdadm.templates:11001 +msgid "Do you want to start the MD monitoring daemon?" +msgstr "Haluatko käynnistää MD-seurannan?" + +#. Type: boolean +#. Description +#: ../mdadm.templates:11001 +msgid "" +"The MD (RAID) monitor daemon sends email notifications in response to " +"important MD events (such as a disk failure)." +msgstr "" +"MD-pakkoja (RAIDeja) seuraava taustaohjelma lähettää tietoja sähköpostiin " +"tärkeiden MD-tapahtumien (kuten levyrikon) sattuessa." + +#. Type: boolean +#. Description +#: ../mdadm.templates:11001 +msgid "Enabling this option is recommended." +msgstr "Tämän valitseminen on suositeltavaa." + +#. Type: string +#. Description +#: ../mdadm.templates:12001 +msgid "Recipient for email notifications:" +msgstr "Sähköpostiviestien vastaanottaja:" + +#. Type: string +#. Description +#: ../mdadm.templates:12001 +msgid "" +"Please enter the email address of the user who should get the email " +"notifications for important MD events." +msgstr "" +"Anna sähköpostiosoite, johon sähköpostitiedotteet tärkeistä MD-tapahtumista " +"lähetetään." diff --git a/debian/po/fr.po b/debian/po/fr.po new file mode 100644 index 00000000..ce2459c4 --- /dev/null +++ b/debian/po/fr.po @@ -0,0 +1,222 @@ +# Translation of mdadm debconf templates to French +# Copyright (C) 2008 Florentin Duneau <fduneau@gmail.com> +# This file is distributed under the same license as the lurker package. +# +# +# Éric Madesclair <eric-m@wanadoo.fr>, 2005, 2006. +# Jean-Luc Coulon (f5ibh) <jean-luc.coulon@wanadoo.fr>, 2006. +# Florentin Duneau <fduneau@gmail.com>, 2006, 2007, 2008. +msgid "" +msgstr "" +"Project-Id-Version: mdadm\n" +"Report-Msgid-Bugs-To: mdadm@packages.debian.org\n" +"POT-Creation-Date: 2012-05-22 00:50+0100\n" +"PO-Revision-Date: 2008-02-15 20:42+0100\n" +"Last-Translator: Florentin Duneau <fduneau@gmail.com>\n" +"Language-Team: French <debian-l10n-french@lists.debian.org>\n" +"Language: fr\n" +"MIME-Version: 1.0\n" +"Content-Type: text/plain; charset=UTF-8\n" +"Content-Transfer-Encoding: 8bit\n" +"Plural-Forms: nplurals=2; plural=(n > 1);\n" +"X-Generator: KBabel 1.11.4\n" + +#. Type: string +#. Description +#: ../mdadm.templates:2001 +msgid "MD arrays needed for the root file system:" +msgstr "Ensembles MD requis par le système de fichiers racine :" + +#. Type: string +#. Description +#: ../mdadm.templates:2001 +msgid "" +"Please enter 'all', 'none', or a space-separated list of devices such as " +"'md0 md1' or 'md/1 md/d0' (the leading '/dev/' can be omitted)." +msgstr "" +"Veuillez indiquer « all », « none » ou une liste de périphériques, séparés " +"par des espaces, par exemple, « md0 md1 » ou « md/1 md/d0 » (vous pouvez " +"omettre « /dev/ »)." + +#. Type: text +#. Description +#: ../mdadm.templates:3001 +msgid "for internal use - only the long description is needed." +msgstr "" +"Pour une utilisation interne - seule la description longue est nécessaire" + +#. Type: text +#. Description +#: ../mdadm.templates:3001 +msgid "" +"If the system's root file system is located on an MD array (RAID), it needs " +"to be started early during the boot sequence. If it is located on a logical " +"volume (LVM), which is on MD, all constituent arrays need to be started." +msgstr "" +"Si le système de fichiers racine se trouve sur un ensemble MD (RAID), il " +"doit être lancé au début de la procédure de démarrage. Si le système de " +"fichiers racine se trouve sur un volume logique (« LVM »), qui se trouve " +"aussi sur un volume MD, tous les composants de l'ensemble doivent être " +"démarrés." + +#. Type: text +#. Description +#: ../mdadm.templates:3001 +msgid "" +"If you know exactly which arrays are needed to bring up the root file " +"system, and you want to postpone starting all other arrays to a later point " +"in the boot sequence, enter the arrays to start here. Alternatively, enter " +"'all' to simply start all available arrays." +msgstr "" +"Si vous savez exactement quels sont les ensembles RAID nécessaires au " +"démarrage du système de fichiers racine et si vous souhaitez différer le " +"démarrage de tous les autres ensembles, veuillez les indiquer ici. Vous " +"pouvez aussi indiquer « all » pour démarrer tous les ensembles existants." + +#. Type: text +#. Description +#: ../mdadm.templates:3001 +msgid "" +"If you do not need or want to start any arrays for the root file system, " +"leave the answer blank (or enter 'none'). This may be the case if you are " +"using kernel autostart or do not need any arrays to boot." +msgstr "" +"Si vous n'avez pas besoin ou ne souhaitez pas démarrer d'ensemble RAID pour " +"le système de fichiers racine, veuillez laissez l'entrée vide (ou entrez " +"« none »). Ceci peut être le cas si vous utilisez l'option de démarrage " +"automatique (« autostart ») du noyau ou si vous n'avez besoin d'aucun " +"ensemble pour démarrer." + +#. Type: text +#. Description +#: ../mdadm.templates:4001 +msgid "An error occurred: device node does not exist" +msgstr "Erreur : périphérique inconnu" + +#. Type: text +#. Description +#: ../mdadm.templates:5001 +msgid "An error occurred: not a block device" +msgstr "Erreur : ce n'est pas un périphérique en mode bloc" + +#. Type: text +#. Description +#: ../mdadm.templates:6001 +msgid "An error occurred: not an MD array" +msgstr "Erreur : ce n'est pas un ensemble RAID" + +#. Type: text +#. Description +#: ../mdadm.templates:7001 +msgid "An error occurred: array not listed in mdadm.conf file" +msgstr "Erreur : ensemble non mentionné dans le fichier mdadm.conf" + +#. Type: boolean +#. Description +#: ../mdadm.templates:8001 +msgid "Start arrays not listed in mdadm.conf?" +msgstr "Faut-il démarrer les ensembles RAID non mentionnés dans mdadm.conf ?" + +#. Type: boolean +#. Description +#: ../mdadm.templates:8001 +msgid "" +"The specified array (${array}) is not listed in the configuration file " +"(${config}). Therefore, it cannot be started during boot, unless you correct " +"the configuration file and recreate the initial ramdisk." +msgstr "" +"L'ensemble (${array}) que vous avez spécifié n'est pas mentionné dans le " +"fichier de configuration ${config}. Il ne sera donc pas démarré à moins que " +"vous corrigiez le fichier de configuration et que vous génériez de nouveau " +"le disque mémoire initial (« ramdisk »)." + +#. Type: boolean +#. Description +#: ../mdadm.templates:8001 +msgid "" +"This warning is only relevant if you need arrays to be started from the " +"initial ramdisk to be able to boot. If you use kernel autostarting, or do " +"not need any arrays to be started as early as the initial ramdisk is loaded, " +"you can simply continue. Alternatively, choose not to continue and enter " +"'none' when prompted which arrays to start from the initial ramdisk." +msgstr "" +"Cet avertissement n'a de signification que si des ensembles RAID doivent " +"être lancés à partir du disque mémoire initial afin de pouvoir démarrer le " +"système. Si vous utilisez le démarrage automatique par le noyau, ou si vous " +"n'avez pas besoin de lancer d'ensemble RAID depuis le disque mémoire " +"initial, vous pouvez simplement poursuivre. Vous pouvez aussi choisir de ne " +"pas poursuivre et entrer « none » lorsqu'il vous sera demandé le nom des " +"ensembles RAID à démarrer à partir du disque mémoire initial." + +#. Type: boolean +#. Description +#: ../mdadm.templates:10001 +msgid "Should mdadm run monthly redundancy checks of the MD arrays?" +msgstr "Faut-il vérifier chaque mois la redondance des ensembles RAID ?" + +#. Type: boolean +#. Description +#: ../mdadm.templates:10001 +msgid "" +"If the kernel supports it (versions greater than 2.6.14), mdadm can " +"periodically check the redundancy of MD arrays (RAIDs). This may be a " +"resource-intensive process, depending on the local setup, but it could help " +"prevent rare cases of data loss. Note that this is a read-only check unless " +"errors are found; if errors are found, mdadm will try to correct them, which " +"may result in write access to the media." +msgstr "" +"Si le noyau le gère (à partir de la version 2.6.14), mdadm peut vérifier " +"périodiquement la redondance des ensembles RAID. Cette action peut demander " +"beaucoup de ressources selon la configuration, mais cela aide à prévenir les " +"rares cas de pertes de données. Notez que ce test est réalisé en lecture " +"seule à moins que des erreurs ne soient rencontrées. Si des erreurs sont " +"détectées, mdadm essayera de les corriger, ce qui entraînera des écritures " +"sur le média." + +#. Type: boolean +#. Description +#: ../mdadm.templates:10001 +msgid "" +"The default, if turned on, is to check on the first Sunday of every month at " +"01:06." +msgstr "" +"Par défaut, la vérification s'effectuera tous les premiers dimanche du mois " +"à 01 h 06." + +#. Type: boolean +#. Description +#: ../mdadm.templates:11001 +msgid "Do you want to start the MD monitoring daemon?" +msgstr "Faut-il démarrer le démon de surveillance MD ?" + +#. Type: boolean +#. Description +#: ../mdadm.templates:11001 +msgid "" +"The MD (RAID) monitor daemon sends email notifications in response to " +"important MD events (such as a disk failure)." +msgstr "" +"Le démon de surveillance MD envoie des notifications par courriel lors " +"d'importants événements MD (comme une panne de disque dur)." + +#. Type: boolean +#. Description +#: ../mdadm.templates:11001 +msgid "Enabling this option is recommended." +msgstr "Il est recommandé d'activer cette option." + +#. Type: string +#. Description +#: ../mdadm.templates:12001 +msgid "Recipient for email notifications:" +msgstr "Destinataire des notifications par courriel :" + +#. Type: string +#. Description +#: ../mdadm.templates:12001 +msgid "" +"Please enter the email address of the user who should get the email " +"notifications for important MD events." +msgstr "" +"Veuillez indiquer l'adresse électronique de l'utilisateur qui doit recevoir " +"les notifications lors d'importants événements MD." diff --git a/debian/po/gl.po b/debian/po/gl.po new file mode 100644 index 00000000..b454e818 --- /dev/null +++ b/debian/po/gl.po @@ -0,0 +1,210 @@ +# Galician translation of mdadm's debconf templates +# This file is distributed under the same license as the mdadm package. +# Jacobo Tarrio <jtarrio@debian.org>, 2007, 2008. +# +msgid "" +msgstr "" +"Project-Id-Version: mdadm\n" +"Report-Msgid-Bugs-To: mdadm@packages.debian.org\n" +"POT-Creation-Date: 2012-05-22 00:50+0100\n" +"PO-Revision-Date: 2008-02-06 23:45+0000\n" +"Last-Translator: Jacobo Tarrio <jtarrio@debian.org>\n" +"Language-Team: Galician <proxecto@trasno.net>\n" +"Language: gl\n" +"MIME-Version: 1.0\n" +"Content-Type: text/plain; charset=UTF-8\n" +"Content-Transfer-Encoding: 8bit\n" + +#. Type: string +#. Description +#: ../mdadm.templates:2001 +msgid "MD arrays needed for the root file system:" +msgstr "Arrays MD necesarios para o sistema de ficheiros raÃz" + +#. Type: string +#. Description +#: ../mdadm.templates:2001 +msgid "" +"Please enter 'all', 'none', or a space-separated list of devices such as " +"'md0 md1' or 'md/1 md/d0' (the leading '/dev/' can be omitted)." +msgstr "" +"Introduza \"all\" (todos), \"none\" (ningún) ou unha lista de dispositivos " +"separados por espazos, tales coma \"md0 md1\" ou \"md/1 md/0\" (pódese " +"omitir o \"/dev/\" do principio)." + +#. Type: text +#. Description +#: ../mdadm.templates:3001 +msgid "for internal use - only the long description is needed." +msgstr "para uso interno - só se precisa da descrición longa." + +#. Type: text +#. Description +#: ../mdadm.templates:3001 +msgid "" +"If the system's root file system is located on an MD array (RAID), it needs " +"to be started early during the boot sequence. If it is located on a logical " +"volume (LVM), which is on MD, all constituent arrays need to be started." +msgstr "" +"Se o sistema de ficheiros raÃz do sistema está ubicado nun array MD (RAID), " +"hai que o iniciar no principio da secuencia de inicio. Se está ubicado nun " +"volume lóxico (LVM) que está nun MD, hai que iniciar os arrays constituÃntes." + +#. Type: text +#. Description +#: ../mdadm.templates:3001 +msgid "" +"If you know exactly which arrays are needed to bring up the root file " +"system, and you want to postpone starting all other arrays to a later point " +"in the boot sequence, enter the arrays to start here. Alternatively, enter " +"'all' to simply start all available arrays." +msgstr "" +"Se sabe exactamente que arrays son necesarios para erguer o sistema de " +"ficheiros raÃz, e se quere pospor o inicio dos demáis arrays ata un punto " +"posterior da secuencia de inicio, introduza aquà os arrays a iniciar. " +"Alternativamente, introduza \"all\" para iniciar tódolos arrays dispoñibles." + +#. Type: text +#. Description +#: ../mdadm.templates:3001 +msgid "" +"If you do not need or want to start any arrays for the root file system, " +"leave the answer blank (or enter 'none'). This may be the case if you are " +"using kernel autostart or do not need any arrays to boot." +msgstr "" +"Se non quere ou precisa de iniciar ningún array para o sistema de ficheiros " +"raÃz, deixe a resposta en branco (ou introduza \"none\"). Este pode ser o " +"caso se está a empregar o autoinicio do núcleo ou non precisa de ningún " +"array para o inicio." + +#. Type: text +#. Description +#: ../mdadm.templates:4001 +msgid "An error occurred: device node does not exist" +msgstr "Houbo un erro: o nodo do dispositivo non existe" + +#. Type: text +#. Description +#: ../mdadm.templates:5001 +msgid "An error occurred: not a block device" +msgstr "Houbo un erro: non é un dispositivo de bloques" + +#. Type: text +#. Description +#: ../mdadm.templates:6001 +msgid "An error occurred: not an MD array" +msgstr "Houbo un erro: non é un array MD" + +#. Type: text +#. Description +#: ../mdadm.templates:7001 +msgid "An error occurred: array not listed in mdadm.conf file" +msgstr "Houbo un erro: o array non figura no ficheiro mdadm.conf" + +#. Type: boolean +#. Description +#: ../mdadm.templates:8001 +msgid "Start arrays not listed in mdadm.conf?" +msgstr "¿Iniciar os arrays que non figuran en mdadm.conf?" + +#. Type: boolean +#. Description +#: ../mdadm.templates:8001 +msgid "" +"The specified array (${array}) is not listed in the configuration file " +"(${config}). Therefore, it cannot be started during boot, unless you correct " +"the configuration file and recreate the initial ramdisk." +msgstr "" +"O array indicado (${array}) non figura no ficheiro de configuración " +"(${config}). Polo tanto, non se pode arrincar no inicio do sistema, a menos " +"que corrixa o ficheiro de configuración e volva crear o disco RAM inicial." + +#. Type: boolean +#. Description +#: ../mdadm.templates:8001 +msgid "" +"This warning is only relevant if you need arrays to be started from the " +"initial ramdisk to be able to boot. If you use kernel autostarting, or do " +"not need any arrays to be started as early as the initial ramdisk is loaded, " +"you can simply continue. Alternatively, choose not to continue and enter " +"'none' when prompted which arrays to start from the initial ramdisk." +msgstr "" +"Este aviso só é relevante se precisa de iniciar arrays desde o disco RAM " +"inicial para poder iniciar o sistema. Se emprega autoinicio do núcleo ou non " +"precisa de iniciar arrays tan pronto como se cargue o disco RAM inicial, " +"pode continuar. De xeito alternativo, escolla non continuar e introduza " +"\"none\" cando se lle pregunte que arrays quere iniciar do disco RAM inicial." + +#. Type: boolean +#. Description +#: ../mdadm.templates:10001 +msgid "Should mdadm run monthly redundancy checks of the MD arrays?" +msgstr "" +"¿DeberÃa mdadm facer comprobacións mensuais de redundancia dos arrays MD?" + +#. Type: boolean +#. Description +#: ../mdadm.templates:10001 +msgid "" +"If the kernel supports it (versions greater than 2.6.14), mdadm can " +"periodically check the redundancy of MD arrays (RAIDs). This may be a " +"resource-intensive process, depending on the local setup, but it could help " +"prevent rare cases of data loss. Note that this is a read-only check unless " +"errors are found; if errors are found, mdadm will try to correct them, which " +"may result in write access to the media." +msgstr "" +"Se o núcleo ten soporte para iso (en versións superiores á 2.6.14), mdadm " +"pode facer comprobacións periódicas de redundancia dos arrays MD (RAIDs). " +"Este pode ser un proceso intensivo en recursos, dependendo da configuración " +"local, pero pode axudar a evitar algúns casos raros de perdas de datos. Teña " +"en conta que esta é unha comprobación de só lectura a menos que se atopen " +"erros; se se atopan erros, mdadm ha tratar de os arranxar, o que pode " +"producir accesos de escritura aos soportes." + +#. Type: boolean +#. Description +#: ../mdadm.templates:10001 +msgid "" +"The default, if turned on, is to check on the first Sunday of every month at " +"01:06." +msgstr "" +"A opción por defecto, se se activa, é facer as comprobacións o primeiro " +"domingo de cada mes ás 01:16." + +#. Type: boolean +#. Description +#: ../mdadm.templates:11001 +msgid "Do you want to start the MD monitoring daemon?" +msgstr "¿Quere iniciar o servizo de monitorización de MD?" + +#. Type: boolean +#. Description +#: ../mdadm.templates:11001 +msgid "" +"The MD (RAID) monitor daemon sends email notifications in response to " +"important MD events (such as a disk failure)." +msgstr "" +"O servizo de monitorización de MD (RAID) envÃa avisos por email en resposta " +"a eventos importantes de MD (coma fallos nos discos)." + +#. Type: boolean +#. Description +#: ../mdadm.templates:11001 +msgid "Enabling this option is recommended." +msgstr "Recoméndase activar esta opción." + +#. Type: string +#. Description +#: ../mdadm.templates:12001 +msgid "Recipient for email notifications:" +msgstr "Destinatario para os avisos por email:" + +#. Type: string +#. Description +#: ../mdadm.templates:12001 +msgid "" +"Please enter the email address of the user who should get the email " +"notifications for important MD events." +msgstr "" +"Introduza o enderezo de email do usuario que debe recibir os avisos por " +"email de eventos importantes de MD." diff --git a/debian/po/it.po b/debian/po/it.po new file mode 100644 index 00000000..c2d5285e --- /dev/null +++ b/debian/po/it.po @@ -0,0 +1,213 @@ +# Italian (it) translation of debconf templates for mdadm +# Copyright (C) 2008 Software in the Public Interest +# This file is distributed under the same license as the mdadm package. +# Luca Monducci <luca.mo@tiscali.it>, 2008. +# +msgid "" +msgstr "" +"Project-Id-Version: mdadm italian debconf\n" +"Report-Msgid-Bugs-To: mdadm@packages.debian.org\n" +"POT-Creation-Date: 2012-05-22 00:50+0100\n" +"PO-Revision-Date: 2008-11-19 11:02+0100\n" +"Last-Translator: Luca Monducci <luca.mo@tiscali.it>\n" +"Language-Team: Italian <debian-l10n-italian@lists.debian.org>\n" +"Language: it\n" +"MIME-Version: 1.0\n" +"Content-Type: text/plain; charset=UTF-8\n" +"Content-Transfer-Encoding: 8bit\n" + +#. Type: string +#. Description +#: ../mdadm.templates:2001 +msgid "MD arrays needed for the root file system:" +msgstr "Array MD necessari per il file system di root:" + +#. Type: string +#. Description +#: ../mdadm.templates:2001 +msgid "" +"Please enter 'all', 'none', or a space-separated list of devices such as " +"'md0 md1' or 'md/1 md/d0' (the leading '/dev/' can be omitted)." +msgstr "" +"Inserire \"all\", \"none\" oppure un elenco dei device separati da uno " +"spazio, per esempio \"md0 md1\" o \"md/1 md/d0\" (il \"/dev/\" iniziale può " +"essere omesso)." + +#. Type: text +#. Description +#: ../mdadm.templates:3001 +msgid "for internal use - only the long description is needed." +msgstr "uso interno - è necessaria solo la descrizione lunga." + +#. Type: text +#. Description +#: ../mdadm.templates:3001 +msgid "" +"If the system's root file system is located on an MD array (RAID), it needs " +"to be started early during the boot sequence. If it is located on a logical " +"volume (LVM), which is on MD, all constituent arrays need to be started." +msgstr "" +"Se il file system di root è su un array MD (RAID), è necessario attivare " +"tale array all'inizio della sequenza d'avvio. Se è su un volume logico " +"(LVM), il quale è su un MD, è necessario attivare tutti gli array che " +"costituiscono il volume." + +#. Type: text +#. Description +#: ../mdadm.templates:3001 +msgid "" +"If you know exactly which arrays are needed to bring up the root file " +"system, and you want to postpone starting all other arrays to a later point " +"in the boot sequence, enter the arrays to start here. Alternatively, enter " +"'all' to simply start all available arrays." +msgstr "" +"Se si conoscono esattamente quali sono gli array da attivare per il file " +"system di root e si vuole rimandare l'attivazione di tutti gli altri array a " +"una fase successiva della sequenza d'avvio, inserire adesso gli array da " +"attivare. In alternativa, inserire \"all\" per attivare tutti gli array " +"disponibili." + +#. Type: text +#. Description +#: ../mdadm.templates:3001 +msgid "" +"If you do not need or want to start any arrays for the root file system, " +"leave the answer blank (or enter 'none'). This may be the case if you are " +"using kernel autostart or do not need any arrays to boot." +msgstr "" +"Se non si ha bisogno o non si vuole attivare nessun array per il file system " +"di root, lasciare la risposta in bianco (oppure inserire \"none\"). Questo " +"potrebbe essere il caso se si utilizza l'attivazione automatica da kernel " +"oppure se non si ha bisogno di alcun array per l'avvio." + +#. Type: text +#. Description +#: ../mdadm.templates:4001 +msgid "An error occurred: device node does not exist" +msgstr "Errore: il nodo del device non esiste" + +#. Type: text +#. Description +#: ../mdadm.templates:5001 +msgid "An error occurred: not a block device" +msgstr "Errore: non è un device a blocchi" + +#. Type: text +#. Description +#: ../mdadm.templates:6001 +msgid "An error occurred: not an MD array" +msgstr "Errore: non è un array MD" + +#. Type: text +#. Description +#: ../mdadm.templates:7001 +msgid "An error occurred: array not listed in mdadm.conf file" +msgstr "Errore: array non elencato nel file mdadm.conf" + +#. Type: boolean +#. Description +#: ../mdadm.templates:8001 +msgid "Start arrays not listed in mdadm.conf?" +msgstr "Avviare gli array non elencati in mdadm.conf?" + +#. Type: boolean +#. Description +#: ../mdadm.templates:8001 +msgid "" +"The specified array (${array}) is not listed in the configuration file " +"(${config}). Therefore, it cannot be started during boot, unless you correct " +"the configuration file and recreate the initial ramdisk." +msgstr "" +"L'array specificato (${array}) non è presente nel file di configurazione " +"(${config}): quindi non può essere attivato durante l'avvio senza correggere " +"il file di configurazione e ricreare il ramdisk iniziale." + +#. Type: boolean +#. Description +#: ../mdadm.templates:8001 +msgid "" +"This warning is only relevant if you need arrays to be started from the " +"initial ramdisk to be able to boot. If you use kernel autostarting, or do " +"not need any arrays to be started as early as the initial ramdisk is loaded, " +"you can simply continue. Alternatively, choose not to continue and enter " +"'none' when prompted which arrays to start from the initial ramdisk." +msgstr "" +"Questo avviso è pertinente solo se è necessario attivare gli array dal " +"ramdisk iniziale per permettere l'avvio. Con l'avvio automatico da kernel o " +"se non è necessario attivare gli array così presto come al caricamento del " +"ramdisk iniziale, si può proseguire. In alternativa, scegliere di non " +"continuare e inserire \"none\" quando viene chiesto quali array attivare dal " +"ramdisk iniziale." + +#. Type: boolean +#. Description +#: ../mdadm.templates:10001 +msgid "Should mdadm run monthly redundancy checks of the MD arrays?" +msgstr "Far eseguire a mdadm i controlli mensili di ridondanza sugli array MD?" + +#. Type: boolean +#. Description +#: ../mdadm.templates:10001 +msgid "" +"If the kernel supports it (versions greater than 2.6.14), mdadm can " +"periodically check the redundancy of MD arrays (RAIDs). This may be a " +"resource-intensive process, depending on the local setup, but it could help " +"prevent rare cases of data loss. Note that this is a read-only check unless " +"errors are found; if errors are found, mdadm will try to correct them, which " +"may result in write access to the media." +msgstr "" +"Se il kernel lo supporta (tutte le versioni successive la 2.6.14), mdadm può " +"effettuare delle verifiche periodiche sulla ridondanza degli array MD " +"(RAID). Questo è un processo che potrebbe richiedere molte risorse, in base " +"alle impostazioni locali, ma può prevenire i rari casi di perdita di dati. " +"Notare che questa verifica è di sola-lettura tranne quando riscontra degli " +"errori; quando ci sono errori, mdadm prova a correggerli e potrebbe accedere " +"in scrittura al supporto." + +#. Type: boolean +#. Description +#: ../mdadm.templates:10001 +msgid "" +"The default, if turned on, is to check on the first Sunday of every month at " +"01:06." +msgstr "" +"Se attivo, la configurazione predefinita prevede che il controllo sia " +"eseguito la prima domenica di ogni mese alle 01.06." + +#. Type: boolean +#. Description +#: ../mdadm.templates:11001 +msgid "Do you want to start the MD monitoring daemon?" +msgstr "Avviare il demone di monitoraggio MD?" + +#. Type: boolean +#. Description +#: ../mdadm.templates:11001 +msgid "" +"The MD (RAID) monitor daemon sends email notifications in response to " +"important MD events (such as a disk failure)." +msgstr "" +"Il demone di monitoraggio MD (RAID) invia delle notifiche via email quando " +"si verificano eventi importanti (come la rottura di un disco)." + +#. Type: boolean +#. Description +#: ../mdadm.templates:11001 +msgid "Enabling this option is recommended." +msgstr "Si raccomanda l'attivazione di questa funzione." + +#. Type: string +#. Description +#: ../mdadm.templates:12001 +msgid "Recipient for email notifications:" +msgstr "Destinatario delle email di notifica:" + +#. Type: string +#. Description +#: ../mdadm.templates:12001 +msgid "" +"Please enter the email address of the user who should get the email " +"notifications for important MD events." +msgstr "" +"Inserire l'indirizzo email dell'utente che deve ricevere le notifiche di " +"eventi importanti legati al MD." diff --git a/debian/po/ja.po b/debian/po/ja.po new file mode 100644 index 00000000..eeefe8ed --- /dev/null +++ b/debian/po/ja.po @@ -0,0 +1,268 @@ +#
+# Translators, if you are not familiar with the PO format, gettext
+# documentation is worth reading, especially sections dedicated to
+# this format, e.g. by running:
+# info -n '(gettext)PO Files'
+# info -n '(gettext)Header Entry'
+#
+# Some information specific to po-debconf are available at
+# /usr/share/doc/po-debconf/README-trans
+# or http://www.debian.org/intl/l10n/po-debconf/README-trans
+#
+# Developers do not need to manually edit POT or PO files.
+#
+#
+msgid "" +msgstr "" +"Project-Id-Version: mdadm 2.6.3+200709292116+4450e59-4\n" +"Report-Msgid-Bugs-To: mdadm@packages.debian.org\n" +"POT-Creation-Date: 2012-05-22 00:50+0100\n" +"PO-Revision-Date: 2008-02-07 05:52+0900\n" +"Last-Translator: Hideki Yamane (Debian-JP) <henrich@debian.or.jp>\n" +"Language-Team: Japanese <debian-japanese@lists.debian.org>\n" +"Language: ja\n" +"MIME-Version: 1.0\n" +"Content-Type: text/plain; charset=UTF-8\n" +"Content-Transfer-Encoding: 8bit\n" + +#. Type: string +#. Description +#: ../mdadm.templates:2001 +msgid "MD arrays needed for the root file system:" +msgstr "ルートファイルシステムã«å¿…è¦ãª MD アレイ:" + +#. Type: string +#. Description +#: ../mdadm.templates:2001 +msgid "" +"Please enter 'all', 'none', or a space-separated list of devices such as " +"'md0 md1' or 'md/1 md/d0' (the leading '/dev/' can be omitted)." +msgstr "" +"'all' ã¾ãŸã¯ 'none'ã€ã‚ã‚‹ã„ã¯ãƒ‡ãƒã‚¤ã‚¹ã®ãƒªã‚¹ãƒˆã‚’ 'md0 md1' ã‚„ 'md/1 md/d0' ã®" +"よã†ã«ã‚¹ãƒšãƒ¼ã‚¹ã§åŒºåˆ‡ã£ã¦å…¥åŠ›ã—ã¦ãã ã•ã„ (å‰ã«ä»˜ã '/dev/' ã¯çœç•¥å¯èƒ½ã§ã™)。" + +#. Type: text +#. Description +#: ../mdadm.templates:3001 +msgid "for internal use - only the long description is needed." +msgstr "内部ã§ã®åˆ©ç”¨ã«ã¤ã„㦠- ã§ã‚‚ã€é•·ã„説明ãŒå¿…è¦ã§ã™ã€‚" + +#. Type: text +#. Description +#: ../mdadm.templates:3001 +msgid "" +"If the system's root file system is located on an MD array (RAID), it needs " +"to be started early during the boot sequence. If it is located on a logical " +"volume (LVM), which is on MD, all constituent arrays need to be started." +msgstr "" +"ã‚ãªãŸã®ã‚·ã‚¹ãƒ†ãƒ ã®ãƒ«ãƒ¼ãƒˆãƒ•ã‚¡ã‚¤ãƒ«ã‚·ã‚¹ãƒ†ãƒ ㌠MD アレイ (RAID) 上ã«é…ç½®ã•ã‚Œã¦ã„" +"ã‚‹ãªã‚‰ã°ã€ãƒ–ートシーケンスã®åˆæœŸæ®µéšŽã§ MD アレイを開始ã™ã‚‹å¿…è¦ãŒã‚ã‚Šã¾ã™ã€‚" +"ルートファイルシステム㌠MD ã®ã‚ˆã†ãªè«–ç†ãƒœãƒªãƒ¥ãƒ¼ãƒ (LVM) 上ã«ã‚ã‚‹å ´åˆã¯ã€æ§‹æˆ" +"ã—ã¦ã„るアレイ全ã¦ã®é–‹å§‹ãŒå¿…è¦ã§ã™ã€‚" + +#. Type: text +#. Description +#: ../mdadm.templates:3001 +msgid "" +"If you know exactly which arrays are needed to bring up the root file " +"system, and you want to postpone starting all other arrays to a later point " +"in the boot sequence, enter the arrays to start here. Alternatively, enter " +"'all' to simply start all available arrays." +msgstr "" +"ã©ã®ã‚¢ãƒ¬ã‚¤ãŒãƒ«ãƒ¼ãƒˆãƒ•ã‚¡ã‚¤ãƒ«ã‚·ã‚¹ãƒ†ãƒ ã®ç«‹ã¡ä¸Šã’ã«å¿…è¦ã‹ã‚’æ£ç¢ºã«çŸ¥ã£ã¦ãŠã‚Šã€ãƒ–ー" +"トシーケンスã®å¾Œã®æ™‚点ã¾ã§æ„図ã—ã¦ã„ã‚‹ã‚‚ã®ä»¥å¤–å…¨ã¦ã®ã‚¢ãƒ¬ã‚¤èµ·å‹•ã‚’é…らã›ãŸã„å ´" +"åˆã€ã“ã“ã§æœ€åˆã«èµ·å‹•ã™ã‚‹ã‚¢ãƒ¬ã‚¤ã‚’入力ã—ã¦ãã ã•ã„。ãã†ã§ãªã„å ´åˆã€'all' ã¨å…¥" +"力ã—ã¦å˜ã«å…¨ã¦ã®åˆ©ç”¨å¯èƒ½ãªã‚¢ãƒ¬ã‚¤ã‚’最åˆã«ç«‹ã¡ä¸Šã’ã¦ãã ã•ã„。" + +#. Type: text +#. Description +#: ../mdadm.templates:3001 +msgid "" +"If you do not need or want to start any arrays for the root file system, " +"leave the answer blank (or enter 'none'). This may be the case if you are " +"using kernel autostart or do not need any arrays to boot." +msgstr "" +"ルートファイルシステムã®ãŸã‚ã«ã€ã©ã®ã‚¢ãƒ¬ã‚¤ã‚‚å¿…è¦ãªã„ã€ã‚ã‚‹ã„ã¯ã©ã®ã‚¢ãƒ¬ã‚¤ã‚‚èµ·" +"å‹•ã—ãŸãã¯ç„¡ã„ã¨ã„ã†å ´åˆã¯ã€ç©ºç™½ã®ã¾ã¾ã« (ã‚ã‚‹ã„㯠'none' ã¨å…¥åŠ›) ã—ã¦ãã ã•" +"ã„。ã“ã‚Œã¯ã€ã‚«ãƒ¼ãƒãƒ«ã§è‡ªå‹•çš„ã«èµ·å‹•ã•ã‚Œã‚‹å ´åˆã‚„起動時ã«ã¯ã‚¢ãƒ¬ã‚¤ã¯ä¸è¦ã§ã‚ã‚‹ã¨" +"ã„ã†å ´åˆã§ã™ã€‚" + +#. Type: text +#. Description +#: ../mdadm.templates:4001 +msgid "An error occurred: device node does not exist" +msgstr "エラーãŒç™ºç”Ÿã—ã¾ã—ãŸ: デãƒã‚¤ã‚¹ãƒŽãƒ¼ãƒ‰ãŒå˜åœ¨ã—ã¾ã›ã‚“" + +#. Type: text +#. Description +#: ../mdadm.templates:5001 +msgid "An error occurred: not a block device" +msgstr "エラーãŒç™ºç”Ÿã—ã¾ã—ãŸ: ブãƒãƒƒã‚¯ãƒ‡ãƒã‚¤ã‚¹ã§ã¯ã‚ã‚Šã¾ã›ã‚“" + +#. Type: text +#. Description +#: ../mdadm.templates:6001 +msgid "An error occurred: not an MD array" +msgstr "エラーãŒç™ºç”Ÿã—ã¾ã—ãŸ: MD アレイã§ã¯ã‚ã‚Šã¾ã›ã‚“" + +#. Type: text +#. Description +#: ../mdadm.templates:7001 +msgid "An error occurred: array not listed in mdadm.conf file" +msgstr "エラーãŒç™ºç”Ÿã—ã¾ã—ãŸ: mdadm.conf ファイルã«è¨˜è¿°ã•ã‚Œã¦ã„ãªã„アレイã§ã™" + +#. Type: boolean +#. Description +#: ../mdadm.templates:8001 +msgid "Start arrays not listed in mdadm.conf?" +msgstr "mdadm.conf ã«è¨˜è¿°ã•ã‚Œã¦ã„ãªã„アレイを起動ã—ã¾ã™ã‹?" + +#. Type: boolean +#. Description +#: ../mdadm.templates:8001 +msgid "" +"The specified array (${array}) is not listed in the configuration file " +"(${config}). Therefore, it cannot be started during boot, unless you correct " +"the configuration file and recreate the initial ramdisk." +msgstr "" +"指定ã—ãŸã‚¢ãƒ¬ã‚¤ (${array}) ã¯è¨å®šãƒ•ã‚¡ã‚¤ãƒ« (${config}) ã«è¨˜è¿°ã•ã‚Œã¦ã„ã¾ã›ã‚“。ã" +"ã®ãŸã‚ã€è¨å®šãƒ•ã‚¡ã‚¤ãƒ«ã‚’ä¿®æ£ã—㦠initrd ã‚’å†ç”Ÿæˆã—ãªã‘ã‚Œã°ãƒ–ート時ã«èµ·å‹•ã§ãã¾" +"ã›ã‚“。" + +#. Type: boolean +#. Description +#: ../mdadm.templates:8001 +msgid "" +"This warning is only relevant if you need arrays to be started from the " +"initial ramdisk to be able to boot. If you use kernel autostarting, or do " +"not need any arrays to be started as early as the initial ramdisk is loaded, " +"you can simply continue. Alternatively, choose not to continue and enter " +"'none' when prompted which arrays to start from the initial ramdisk." +msgstr "" +"ã“ã®è¦å‘Šã¯ã€ãƒ–ートã§ãるよã†ã«ã‚¢ãƒ¬ã‚¤ã‚’ initrd ã‹ã‚‰èµ·å‹•ã™ã‚‹å¿…è¦ãŒã‚ã‚‹å ´åˆã ã‘" +"関係ã—ã¾ã™ã€‚カーãƒãƒ«ã§è‡ªå‹•çš„ã«ã‚¢ãƒ¬ã‚¤ã‚’èµ·å‹•ã™ã‚‹ã‚ˆã†ã«ã—ã¦ã„ã‚‹å ´åˆã€ã‚ã‚‹ã„㯠" +"initrd ãŒãƒãƒ¼ãƒ‰ã•ã‚Œã‚‹ç¨‹æ—©ã„段階ã§ã©ã®ã‚¢ãƒ¬ã‚¤ã‚‚èµ·å‹•ã—ãŸãã¯ãªã„å ´åˆã¯ãã®ã¾ã¾ç¶š" +"è¡Œã§ãã¾ã™ã€‚ä»–ã®é¸æŠžè‚¢ã¨ã—ã¦ã¯ã€èµ·å‹•ã®ç¶šè¡Œã‚’ä¸æ¢ã—ã€ã©ã®ã‚¢ãƒ¬ã‚¤ã‚’ initrd ã‹ã‚‰" +"èµ·å‹•ã™ã‚‹ã‹ã‚’å°‹ãられãŸéš›ã« 'none' ã¨å…¥åŠ›ã—ã¾ã™ã€‚" + +#. Type: boolean +#. Description +#: ../mdadm.templates:10001 +msgid "Should mdadm run monthly redundancy checks of the MD arrays?" +msgstr "mdadm ã¯ã€æ¯Žæœˆ MD アレイã®å†—長性ãƒã‚§ãƒƒã‚¯ã‚’è¡Œã„ã¾ã™ã‹?" + +#. Type: boolean +#. Description +#: ../mdadm.templates:10001 +msgid "" +"If the kernel supports it (versions greater than 2.6.14), mdadm can " +"periodically check the redundancy of MD arrays (RAIDs). This may be a " +"resource-intensive process, depending on the local setup, but it could help " +"prevent rare cases of data loss. Note that this is a read-only check unless " +"errors are found; if errors are found, mdadm will try to correct them, which " +"may result in write access to the media." +msgstr "" +"カーãƒãƒ«ãŒã‚µãƒãƒ¼ãƒˆã—ã¦ã„ã‚‹å ´åˆ (ãƒãƒ¼ã‚¸ãƒ§ãƒ³ 2.6.14 以é™)ã€mdadm ã¯å®šæœŸçš„ã« MD " +"アレイ (RAID) ã®å†—長性ãƒã‚§ãƒƒã‚¯ã‚’ã™ã‚‹ã“ã¨ãŒå¯èƒ½ã§ã™ã€‚ã“ã‚Œã¯ã€è¨å®šã«ä¾å˜ã—ã¾ã™" +"ãŒãƒªã‚½ãƒ¼ã‚¹ã‚’集ä¸çš„ã«ä½¿ç”¨ã™ã‚‹å‹•ä½œã§ã™ã€‚ã—ã‹ã—ã€ç¨€ãªãƒ‡ãƒ¼ã‚¿æ¶ˆå¤±ã‚’ã‚らã‹ã˜ã‚é¿ã‘" +"ã‚‹ã®ã«å½¹ç«‹ã¤ã§ã—ょã†ã€‚ã“ã‚Œã¯ã€ã‚¨ãƒ©ãƒ¼ãŒè¦‹ã¤ã‹ã‚‰ãªã„é™ã‚Šã¯èªã¿è¾¼ã¿ãƒã‚§ãƒƒã‚¯ã®ã¿" +"ã§ã‚ã‚‹ã®ã«æ³¨æ„ã—ã¦ãã ã•ã„。エラーãŒç™ºè¦‹ã•ã‚ŒãŸå ´åˆã€mdadm ã¯ä¿®æ£ã—よã†ã¨ã—" +"ã¦ã€çµæžœçš„ã«ãƒ¡ãƒ‡ã‚£ã‚¢ã¸æ›¸ãè¾¼ã¿ã‚’è¡Œã„ã¾ã™ã€‚" + +#. Type: boolean +#. Description +#: ../mdadm.templates:10001 +msgid "" +"The default, if turned on, is to check on the first Sunday of every month at " +"01:06." +msgstr "" +"有効ã«ã—ãŸå ´åˆã€ãƒ‡ãƒ•ã‚©ãƒ«ãƒˆã§ã¯æ¯Žæœˆç¬¬ä¸€æ—¥æ›œ 01:06 ã«ãƒã‚§ãƒƒã‚¯ãŒå®Ÿè¡Œã•ã‚Œã¾ã™ã€‚" + +#. Type: boolean +#. Description +#: ../mdadm.templates:11001 +msgid "Do you want to start the MD monitoring daemon?" +msgstr "MD 監視デーモンを起動ã—ã¾ã™ã‹?" + +#. Type: boolean +#. Description +#: ../mdadm.templates:11001 +msgid "" +"The MD (RAID) monitor daemon sends email notifications in response to " +"important MD events (such as a disk failure)." +msgstr "" +"MD (RAID) 監視デーモンã¯ã€é‡å¤§ãª MD 関連ã®ã‚¤ãƒ™ãƒ³ãƒˆ (ディスク障害ãªã©) ã«å¯¾ã—" +"ã¦ãƒ¡ãƒ¼ãƒ«ã§é€šçŸ¥ã‚’é€ã‚Šã¾ã™ã€‚" + +#. Type: boolean +#. Description +#: ../mdadm.templates:11001 +msgid "Enabling this option is recommended." +msgstr "ã“ã®æ©Ÿèƒ½ã‚’有効ã«ã™ã‚‹ã®ã‚’ãŠå‹§ã‚ã—ã¾ã™ã€‚" + +#. Type: string +#. Description +#: ../mdadm.templates:12001 +msgid "Recipient for email notifications:" +msgstr "メール通知ã®å®›å…ˆ:" + +#. Type: string +#. Description +#: ../mdadm.templates:12001 +msgid "" +"Please enter the email address of the user who should get the email " +"notifications for important MD events." +msgstr "" +"MD 関連ã®é‡å¤§ãªã‚¤ãƒ™ãƒ³ãƒˆãŒç™ºç”Ÿã—ãŸéš›ã€ãƒ¡ãƒ¼ãƒ«ã§ã®é€šçŸ¥ã‚’å—ã‘å–ã‚‹å¿…è¦ãŒã‚るユーザ" +"ã®ãƒ¡ãƒ¼ãƒ«ã‚¢ãƒ‰ãƒ¬ã‚¹ã‚’入力ã—ã¦ãã ã•ã„。" + +#~ msgid "" +#~ "WARNING! If you are using hard disks which have RAID superblocks from " +#~ "earlier installations in different RAID arrays, you MUST zero each " +#~ "superblock *before* activating the autostart feature." +#~ msgstr "" +#~ "è¦å‘Š! 以å‰ã®ã‚¤ãƒ³ã‚¹ãƒˆãƒ¼ãƒ«ã«ã‚ˆã£ã¦ä»–ã® RAID アレイã«å¯¾ã™ã‚‹ RAID superblock " +#~ "ã‚’ä¿æŒã—ã¦ã„ã‚‹ãƒãƒ¼ãƒ‰ãƒ‡ã‚£ã‚¹ã‚¯ã‚’使ã£ã¦ã„ã‚‹å ´åˆã€è‡ªå‹•èµ·å‹•æ©Ÿèƒ½ã‚’有効ã«ã™ã‚‹" +#~ "「å‰ã€ã«ã€ãã® superblock をゼãƒã§ä¸Šæ›¸ãã™ã‚‹ã“ã¨ãŒã€Œå¿…è¦ã€ã§ã™ã€‚" + +#~ msgid "" +#~ "To do this, do not start the RAID devices automatically. First, zero the " +#~ "superblock (mdadm --zero-superblock /dev/mdX). Next, use `dpkg-" +#~ "reconfigure mdadm` to reactivate the autostart feature." +#~ msgstr "" +#~ "ã“れを行ã†ã«ã¯ã€RAID デãƒã‚¤ã‚¹ã‚’自動的ã«èµ·å‹•ã—ã¦ã¯ã„ã‘ã¾ã›ã‚“。ã¾ãšã€ " +#~ "superblock をゼãƒã§ä¸Šæ›¸ãã—ã¾ã™ (mdadm --zero-superblock /dev/xxx)。 ãã—" +#~ "ã¦ã€è‡ªå‹•èµ·å‹•æ©Ÿèƒ½ã‚’有効ã«ã™ã‚‹ãŸã‚ã€'dpkg-reconfigure mdadm' コマンドを実行" +#~ "ã—ã¾ã™ã€‚" + +#~ msgid "" +#~ "You have the option to start all other arrays (those not needed for the " +#~ "root filesystem) later in the boot sequence. Doing so will give you " +#~ "greater control over the arrays with the mdadm configuration file. " +#~ "Starting all arrays at boot-time may be safer though." +#~ msgstr "" +#~ "ブートシーケンスã®å¾Œã‚ã®æ–¹ã§ (root ファイルシステムã«ã¯å¿…è¦ãªã„) ä»–ã®ã‚¢ãƒ¬" +#~ "イ全ã¦ã‚’èµ·å‹•ã™ã‚‹ã¨ã„ã†é¸æŠžè‚¢ã‚‚ã‚ã‚Šã¾ã™ã€‚ã“れをé¸ã¹ã°ã€mdadm ã®è¨å®šãƒ•ã‚¡ã‚¤ãƒ«" +#~ "を使ã£ã¦ã€ã‚¢ãƒ¬ã‚¤ã«ã¤ã„ã¦æ§˜ã€…ãªè¨å®šãŒå‡ºæ¥ã‚‹ã‚ˆã†ã«ãªã‚‹ã§ã—ょã†ã€‚ã‚‚ã£ã¨ã‚‚ã€èµ·" +#~ "動時ã«å…¨ã¦ã®ã‚¢ãƒ¬ã‚¤ã‚’èµ·å‹•ã™ã‚‹ã»ã†ãŒå®‰å…¨ã§ã¯ã‚ã‚Šã¾ã™ã€‚" + +#~ msgid "" +#~ "If RAID devices are started automatically, all RAID devices are " +#~ "discovered and assembled automatically at system startup. This option " +#~ "should only be used if the md driver is compiled as a module. If it is " +#~ "compiled into your kernel, the automatic startup will be performed at " +#~ "boot time by the kernel and therefore you should not choose this option." +#~ msgstr "" +#~ "RAID デãƒã‚¤ã‚¹ãŒè‡ªå‹•çš„ã«èµ·å‹•ã™ã‚‹ã‚ˆã†ã«ã™ã‚‹ã¨ã€ã‚·ã‚¹ãƒ†ãƒ 起動時ã«å…¨ã¦ã® RAID " +#~ "デãƒã‚¤ã‚¹ãŒæ¤œå‡ºã•ã‚Œã€è‡ªå‹•çš„ã«æ§‹æˆã•ã‚Œã¾ã™ã€‚ã“ã®ã‚ªãƒ—ション㯠md ドライãƒãŒãƒ¢" +#~ "ジュールã¨ã—ã¦ã‚³ãƒ³ãƒ‘イルã•ã‚Œã¦ã„ã‚‹å ´åˆã®ã¿ã«åˆ©ç”¨ã—ã¾ã™ã€‚カーãƒãƒ«ã«çµ„ã¿è¾¼ã‚“" +#~ "ã§ã‚³ãƒ³ãƒ‘イルã—ã¦ã„ãŸå ´åˆã€ã‚·ã‚¹ãƒ†ãƒ 起動時ã«ã‚«ãƒ¼ãƒãƒ«ã«ã‚ˆã£ã¦è‡ªå‹•èµ·å‹•ãŒå®Ÿè¡Œã•" +#~ "れるã®ã§ã€ã“ã®ã‚ªãƒ—ションã§ã®é¸æŠžã¯ã§ãã¾ã›ã‚“。" + +#~ msgid "" +#~ "When the RAID monitor daemon runs, email notifications are sent when a " +#~ "disk belonging to a RAID array fails or changes its status for some " +#~ "reason." +#~ msgstr "" +#~ "RAID 監視デーモンãŒå‹•ä½œã—ã¦ã„ã‚‹å ´åˆã€RAID アレイã«å±žã—ã¦ã„るディスクãŒæ•…éšœ" +#~ "ã™ã‚‹ã‹ä½•ã‚‰ã‹ã®ç†ç”±ã§å¤‰åŒ–ã—ãŸéš›ã«ãƒ¡ãƒ¼ãƒ«ã§é€šçŸ¥ãŒé€ã‚‰ã‚Œã¾ã™ã€‚" diff --git a/debian/po/nl.po b/debian/po/nl.po new file mode 100644 index 00000000..d3e5f3da --- /dev/null +++ b/debian/po/nl.po @@ -0,0 +1,223 @@ +# translation of mdadm_2.6.3+200709292116+4450e59-4.po to Dutch +# +# Translators, if you are not familiar with the PO format, gettext +# documentation is worth reading, especially sections dedicated to +# this format, e.g. by running: +# info -n '(gettext)PO Files' +# info -n '(gettext)Header Entry' +# Some information specific to po-debconf are available at +# /usr/share/doc/po-debconf/README-trans +# or http://www.debian.org/intl/l10n/po-debconf/README-trans# +# Developers do not need to manually edit POT or PO files. +# +# Frans Pop <aragorn@tiscali.nl>, 2005, 2006. +# Frans Pop <elendil@planet.nl>, 2008. +msgid "" +msgstr "" +"Project-Id-Version: mdadm_2.6.3+200709292116+4450e59-4\n" +"Report-Msgid-Bugs-To: mdadm@packages.debian.org\n" +"POT-Creation-Date: 2012-05-22 00:50+0100\n" +"PO-Revision-Date: 2008-02-19 14:04+0100\n" +"Last-Translator: Frans Pop <elendil@planet.nl>\n" +"Language-Team: Dutch <debian-l10n-dutch@lists.debian.org>\n" +"Language: nl\n" +"MIME-Version: 1.0\n" +"Content-Type: text/plain; charset=UTF-8\n" +"Content-Transfer-Encoding: 8bit\n" +"X-Generator: KBabel 1.11.4\n" + +#. Type: string +#. Description +#: ../mdadm.templates:2001 +msgid "MD arrays needed for the root file system:" +msgstr "Voor het basisbestandssysteem benodigde RAID-reeksen:" + +#. Type: string +#. Description +#: ../mdadm.templates:2001 +msgid "" +"Please enter 'all', 'none', or a space-separated list of devices such as " +"'md0 md1' or 'md/1 md/d0' (the leading '/dev/' can be omitted)." +msgstr "" +"Geef in 'all' (alle), 'none' (geen) of één of meerdere apparaatbestanden " +"(gescheiden door spaties), bijvoorbeeld \"md0 md1\" of \"md/1 md/d0\" (de " +"prefix '/dev/' kan dus worden weggelaten)." + +#. Type: text +#. Description +#: ../mdadm.templates:3001 +msgid "for internal use - only the long description is needed." +msgstr "Voor intern gebruik - alleen de lange omschrijving wordt gebruikt." + +#. Type: text +#. Description +#: ../mdadm.templates:3001 +msgid "" +"If the system's root file system is located on an MD array (RAID), it needs " +"to be started early during the boot sequence. If it is located on a logical " +"volume (LVM), which is on MD, all constituent arrays need to be started." +msgstr "" +"Als het basisbestandssysteem van uw systeem zich op een RAID-volume bevindt, " +"dient dit vroeg in de opstartcyclus geactiveerd te worden. Als het zich op " +"een logisch volume (LVM) op RAID bevindt, dienen alle betrokken reeksen " +"geactiveerd te worden." + +#. Type: text +#. Description +#: ../mdadm.templates:3001 +msgid "" +"If you know exactly which arrays are needed to bring up the root file " +"system, and you want to postpone starting all other arrays to a later point " +"in the boot sequence, enter the arrays to start here. Alternatively, enter " +"'all' to simply start all available arrays." +msgstr "" +"Als u precies weet welke reeksen benodigd zijn voor het basisbestandssysteem " +"en u het activeren van alle overige reeksen wilt uitstellen tot later in de " +"opstartprocedure, geef dan hier de te activeren reeksen in. Anders kunt u " +"'all' ingeven om alle beschikbare reeksen te activeren." + +#. Type: text +#. Description +#: ../mdadm.templates:3001 +msgid "" +"If you do not need or want to start any arrays for the root file system, " +"leave the answer blank (or enter 'none'). This may be the case if you are " +"using kernel autostart or do not need any arrays to boot." +msgstr "" +"Als u geen reeksen hoeft of wenst te activeren voor het " +"basisbestandssysteem, laat dan het antwoord leeg (of geef 'none' in). Dit " +"kan het geval zijn als u \"kernel autostart\" gebruikt of geen reeksen nodig " +"heeft om uw systeem op te starten." + +#. Type: text +#. Description +#: ../mdadm.templates:4001 +msgid "An error occurred: device node does not exist" +msgstr "Er is een fout opgetreden: apparaatbestand bestaat niet" + +#. Type: text +#. Description +#: ../mdadm.templates:5001 +msgid "An error occurred: not a block device" +msgstr "Er is een fout opgetreden: geen blokapparaat" + +#. Type: text +#. Description +#: ../mdadm.templates:6001 +msgid "An error occurred: not an MD array" +msgstr "Er is een fout opgetreden: geen RAID reeks" + +#. Type: text +#. Description +#: ../mdadm.templates:7001 +msgid "An error occurred: array not listed in mdadm.conf file" +msgstr "Er is een fout opgetreden: reeks komt niet voor in bestand mdadm.conf" + +#. Type: boolean +#. Description +#: ../mdadm.templates:8001 +msgid "Start arrays not listed in mdadm.conf?" +msgstr "Reeksen activeren die niet in mdadm.conf voorkomen?" + +#. Type: boolean +#. Description +#: ../mdadm.templates:8001 +msgid "" +"The specified array (${array}) is not listed in the configuration file " +"(${config}). Therefore, it cannot be started during boot, unless you correct " +"the configuration file and recreate the initial ramdisk." +msgstr "" +"De reeks die u heeft opgegeven (${array}) komt niet voor in het " +"configuratiebestand (${config}). Tenzij u het configuratiebestand corrigeert " +"en de initiële ramdisk opnieuw aanmaakt, kan deze reeks tijdens het " +"opstarten van het systeem niet worden geactiveerd." + +#. Type: boolean +#. Description +#: ../mdadm.templates:8001 +msgid "" +"This warning is only relevant if you need arrays to be started from the " +"initial ramdisk to be able to boot. If you use kernel autostarting, or do " +"not need any arrays to be started as early as the initial ramdisk is loaded, " +"you can simply continue. Alternatively, choose not to continue and enter " +"'none' when prompted which arrays to start from the initial ramdisk." +msgstr "" +"Deze waarschuwing is alleen relevant als de reeksen geactiveerd moeten " +"worden vanuit een initiële ramdisk om het systeem te kunnen opstarten. Als u " +"\"kernel autostart\" gebruikt of er geen reeksen zijn die vanuit de initiële " +"ramdisk gestart moeten worden, kunt u gewoon doorgaan. Kies anders nu om " +"niet door te gaan en geef 'none' in bij de vraag welke reeksen vanuit de " +"initiële ramdisk gestart moeten worden." + +#. Type: boolean +#. Description +#: ../mdadm.templates:10001 +msgid "Should mdadm run monthly redundancy checks of the MD arrays?" +msgstr "Maandelijkse redundantiecontrole van RAID-reeksen uitvoeren?" + +#. Type: boolean +#. Description +#: ../mdadm.templates:10001 +msgid "" +"If the kernel supports it (versions greater than 2.6.14), mdadm can " +"periodically check the redundancy of MD arrays (RAIDs). This may be a " +"resource-intensive process, depending on the local setup, but it could help " +"prevent rare cases of data loss. Note that this is a read-only check unless " +"errors are found; if errors are found, mdadm will try to correct them, which " +"may result in write access to the media." +msgstr "" +"Als uw kernel dit ondersteunt (versies groter dan 2.6.14), kan mdadm " +"periodiek de redundantie van uw RAID reeksen controleren. Afhankelijk van uw " +"configuratie kan dit een intensief proces zijn, maar het kan wel helpen om " +"uitzonderlijke gevallen van gegevensverlies te voorkomen. Zolang geen fouten " +"worden gevonden, zal het proces alleen gegevens lezen; als echter wel fouten " +"worden gevonden zal mdadm deze proberen te corrigeren." + +#. Type: boolean +#. Description +#: ../mdadm.templates:10001 +msgid "" +"The default, if turned on, is to check on the first Sunday of every month at " +"01:06." +msgstr "" +"Standaard wordt de controle, indien geactiveerd, uitgevoerd om 01:06 op elke " +"eerste zondag van de maand." + +#. Type: boolean +#. Description +#: ../mdadm.templates:11001 +msgid "Do you want to start the MD monitoring daemon?" +msgstr "Wilt u de achtergronddienst voor de RAID-monitor starten?" + +#. Type: boolean +#. Description +#: ../mdadm.templates:11001 +msgid "" +"The MD (RAID) monitor daemon sends email notifications in response to " +"important MD events (such as a disk failure)." +msgstr "" +"De achtergronddienst voor de RAID-monitor stuurt per e-mail berichten bij " +"belangrijke gebeurtenissen die betrekking hebben op RAID (zoals een falende " +"harde schijf)." + +#. Type: boolean +#. Description +#: ../mdadm.templates:11001 +msgid "Enabling this option is recommended." +msgstr "Gebruik van deze optie wordt aanbevolen." + +#. Type: string +#. Description +#: ../mdadm.templates:12001 +msgid "Recipient for email notifications:" +msgstr "Adres voor e-mailberichten:" + +#. Type: string +#. Description +#: ../mdadm.templates:12001 +msgid "" +"Please enter the email address of the user who should get the email " +"notifications for important MD events." +msgstr "" +"Wat is het e-mailadres van de gebruiker die de e-mailberichten voor " +"belangrijke gebeurtenissen met betrekking tot RAID dient te ontvangen." diff --git a/debian/po/pt.po b/debian/po/pt.po new file mode 100644 index 00000000..e450997a --- /dev/null +++ b/debian/po/pt.po @@ -0,0 +1,214 @@ +# Portuguese translation for mdadm debconf messages. +# Copyright (C) 2008 Pedro Ribeiro <p.m42.ribeiro@gmail.com> +# This file is distributed under the same license as the mdadm package. +# Pedro Ribeiro <p.m42.ribeiro@gmail.com>, 2008 +# +msgid "" +msgstr "" +"Project-Id-Version: mdadm 2.6.3+200709292116+4450e59-4\n" +"Report-Msgid-Bugs-To: mdadm@packages.debian.org\n" +"POT-Creation-Date: 2012-05-22 00:50+0100\n" +"PO-Revision-Date: 2008-02-21 00:15+0000\n" +"Last-Translator: Pedro Ribeiro <p.m42.ribeiro@gmail.com>\n" +"Language-Team: Portuguese <traduz@debianpt.org>\n" +"Language: pt\n" +"MIME-Version: 1.0\n" +"Content-Type: text/plain; charset=utf-8\n" +"Content-Transfer-Encoding: 8bit\n" + +#. Type: string +#. Description +#: ../mdadm.templates:2001 +msgid "MD arrays needed for the root file system:" +msgstr "Grupos MD necessários para o sistema de ficheiros raiz:" + +#. Type: string +#. Description +#: ../mdadm.templates:2001 +msgid "" +"Please enter 'all', 'none', or a space-separated list of devices such as " +"'md0 md1' or 'md/1 md/d0' (the leading '/dev/' can be omitted)." +msgstr "" +"Por favor, introduza 'all', 'none', ou uma lista de dispositivos separados " +"por espaços, tais como 'md0 md1' ou 'md/1 md/d0' (o '/dev/' inicial pode ser " +"omitido)." + +#. Type: text +#. Description +#: ../mdadm.templates:3001 +msgid "for internal use - only the long description is needed." +msgstr "para uso interno - apenas a descrição longa é necessária" + +#. Type: text +#. Description +#: ../mdadm.templates:3001 +msgid "" +"If the system's root file system is located on an MD array (RAID), it needs " +"to be started early during the boot sequence. If it is located on a logical " +"volume (LVM), which is on MD, all constituent arrays need to be started." +msgstr "" +"Se o sistema de ficheiros de raiz do sistema estiver num grupo MD (RAID), " +"necessita de ser iniciado mais cedo na sequência de arranque. Se o seu " +"sistema de ficheiros de raiz estiver num volume lógico (LVM) que está no MD, " +"todos os grupos constituintes necessitam de ser iniciados." + +#. Type: text +#. Description +#: ../mdadm.templates:3001 +msgid "" +"If you know exactly which arrays are needed to bring up the root file " +"system, and you want to postpone starting all other arrays to a later point " +"in the boot sequence, enter the arrays to start here. Alternatively, enter " +"'all' to simply start all available arrays." +msgstr "" +"Se souber exactamente que grupos são necessários para iniciar o sistema de " +"ficheiros raiz, e quiser adiar o inicio de todos os outros grupos para mais " +"tarde no processo de arranque, introduza os grupos aqui. Alternativamente, " +"introduza 'all' para iniciar todos os grupos disponÃveis." + +#. Type: text +#. Description +#: ../mdadm.templates:3001 +msgid "" +"If you do not need or want to start any arrays for the root file system, " +"leave the answer blank (or enter 'none'). This may be the case if you are " +"using kernel autostart or do not need any arrays to boot." +msgstr "" +"Se não necessita ou deseja iniciar grupos para o sistema de ficheiros raiz, " +"deixe a resposta em branco (ou introduza 'none'). Isto vale no caso de usar " +"o auto-arranque do kernel ou não necessitar de grupos para o arranque do " +"sistema." + +#. Type: text +#. Description +#: ../mdadm.templates:4001 +msgid "An error occurred: device node does not exist" +msgstr "Ocorreu um erro: o nó do dispositivo não existe" + +#. Type: text +#. Description +#: ../mdadm.templates:5001 +msgid "An error occurred: not a block device" +msgstr "Ocorreu um erro: não é um dispositivo de bloco" + +#. Type: text +#. Description +#: ../mdadm.templates:6001 +msgid "An error occurred: not an MD array" +msgstr "Ocorreu um erro: não é um grupo MD" + +#. Type: text +#. Description +#: ../mdadm.templates:7001 +msgid "An error occurred: array not listed in mdadm.conf file" +msgstr "Ocorreu um erro: o grupo não está listado no ficheiro mdadm.conf" + +#. Type: boolean +#. Description +#: ../mdadm.templates:8001 +msgid "Start arrays not listed in mdadm.conf?" +msgstr "Iniciar grupos não listados no mdadm.conf?" + +#. Type: boolean +#. Description +#: ../mdadm.templates:8001 +msgid "" +"The specified array (${array}) is not listed in the configuration file " +"(${config}). Therefore, it cannot be started during boot, unless you correct " +"the configuration file and recreate the initial ramdisk." +msgstr "" +"O grupo especificado (${array}) não está listado no ficheiro de configuração " +"(${config}). Portanto, não pode ser iniciado durante o processo de arranque, " +"a não ser que corrija o ficheiro de configuração e recrie o ramdisk inicial." + +#. Type: boolean +#. Description +#: ../mdadm.templates:8001 +msgid "" +"This warning is only relevant if you need arrays to be started from the " +"initial ramdisk to be able to boot. If you use kernel autostarting, or do " +"not need any arrays to be started as early as the initial ramdisk is loaded, " +"you can simply continue. Alternatively, choose not to continue and enter " +"'none' when prompted which arrays to start from the initial ramdisk." +msgstr "" +"Este aviso só é relevante se houver necessidade de iniciar grupos a partir " +"do ramdisk durante o arranque do sistema. Se usar o auto-arranque do kernel, " +"ou não necessitar de iniciar os grupos tão cedo no processo de arranque do " +"sistema, pode simplesmente continuar. Em alternativa, escolha não continuar " +"e introduza 'none' quando perguntado sobre quais grupos iniciar a partir do " +"ramdisk inicial." + +#. Type: boolean +#. Description +#: ../mdadm.templates:10001 +msgid "Should mdadm run monthly redundancy checks of the MD arrays?" +msgstr "" +"O mdadm deve correr verificações de redundância nos grupos MD mensalmente?" + +#. Type: boolean +#. Description +#: ../mdadm.templates:10001 +msgid "" +"If the kernel supports it (versions greater than 2.6.14), mdadm can " +"periodically check the redundancy of MD arrays (RAIDs). This may be a " +"resource-intensive process, depending on the local setup, but it could help " +"prevent rare cases of data loss. Note that this is a read-only check unless " +"errors are found; if errors are found, mdadm will try to correct them, which " +"may result in write access to the media." +msgstr "" +"Se o kernel suportar (versões mais recentes que 2.6.14) o mdadm pode " +"verificar periodicamente a redundância dos grupos MD (RAIDs). Isto pode ser " +"um processo que requer muitos recursos, dependendo da sua configuração, mas " +"pode prevenir casos raros de perda de dados. Notar que esta verificação é " +"feita em modo de leitura a não ser que sejam encontrados erros; se forem " +"encontrados erros, o mdadm tenta corrigi-los, o que pode resultar em " +"acessosde escrita aos discos." + +#. Type: boolean +#. Description +#: ../mdadm.templates:10001 +msgid "" +"The default, if turned on, is to check on the first Sunday of every month at " +"01:06." +msgstr "" +"O pré-definido, se ligado, é os testes serem executados no primeiro Domingo " +"de cada mês à s 01:06." + +#. Type: boolean +#. Description +#: ../mdadm.templates:11001 +msgid "Do you want to start the MD monitoring daemon?" +msgstr "Quer iniciar o deamon de monitorização do MD?" + +#. Type: boolean +#. Description +#: ../mdadm.templates:11001 +msgid "" +"The MD (RAID) monitor daemon sends email notifications in response to " +"important MD events (such as a disk failure)." +msgstr "" +"O daemon monitor MD(RAID) envia notificações por email no caso de eventos " +"importantes (tais como falha de um disco). Provavelmente quer activar esta " +"opção." + +#. Type: boolean +#. Description +#: ../mdadm.templates:11001 +msgid "Enabling this option is recommended." +msgstr "É recomendado activar esta opção." + +#. Type: string +#. Description +#: ../mdadm.templates:12001 +msgid "Recipient for email notifications:" +msgstr "Destinatário de email para notificações:" + +#. Type: string +#. Description +#: ../mdadm.templates:12001 +msgid "" +"Please enter the email address of the user who should get the email " +"notifications for important MD events." +msgstr "" +"Por favor, introduza o endereço de email do utilizador que deve receber as " +"notificações de eventos MD importantes." diff --git a/debian/po/pt_BR.po b/debian/po/pt_BR.po new file mode 100644 index 00000000..118538b8 --- /dev/null +++ b/debian/po/pt_BR.po @@ -0,0 +1,340 @@ +# +# Translators, if you are not familiar with the PO format, gettext +# documentation is worth reading, especially sections dedicated to +# this format, e.g. by running: +# info -n '(gettext)PO Files' +# info -n '(gettext)Header Entry' +# +# Some information specific to po-debconf are available at +# /usr/share/doc/po-debconf/README-trans +# or http://www.debian.org/intl/l10n/po-debconf/README-trans +# +# Developers do not need to manually edit POT or PO files. +# +msgid "" +msgstr "" +"Project-Id-Version: mdadm\n" +"Report-Msgid-Bugs-To: mdadm@packages.debian.org\n" +"POT-Creation-Date: 2012-05-22 00:50+0100\n" +"PO-Revision-Date: 2006-09-24 19:22-0300\n" +"Last-Translator: Felipe Augusto van de Wiel (faw) <faw@cathedrallabs.org>\n" +"Language-Team: l10n portuguese <debian-l10n-portuguese@lists.debian.org>\n" +"Language: \n" +"MIME-Version: 1.0\n" +"Content-Type: text/plain; charset=UTF-8\n" +"Content-Transfer-Encoding: 8bit\n" +"pt_BR utf-8\n" + +#. Type: string +#. Description +#: ../mdadm.templates:2001 +#, fuzzy +#| msgid "MD arrays needed for the root filesystem:" +msgid "MD arrays needed for the root file system:" +msgstr "Dispositivos MD necessários para o sistema de arquivos raiz:" + +#. Type: string +#. Description +#: ../mdadm.templates:2001 +#, fuzzy +#| msgid "" +#| "Please enter a space-separated list of devices, 'all', or 'none'. You may " +#| "omit the leading '/dev/' and just enter e.g. \"md0 md1\", or \"md/1 md/" +#| "d0\"." +msgid "" +"Please enter 'all', 'none', or a space-separated list of devices such as " +"'md0 md1' or 'md/1 md/d0' (the leading '/dev/' can be omitted)." +msgstr "" +"Por favor, informe uma lista separada por espaços dos dispositivos, 'all' ou " +"'none'. Você pode omitir a parte inicial '/dev/' e apenas informar, por " +"exemplo, \"md0 md1\", ou \"md/1 md/d0\"." + +#. Type: text +#. Description +#: ../mdadm.templates:3001 +msgid "for internal use - only the long description is needed." +msgstr "para uso interno - apenas a descrição longa é necessária." + +#. Type: text +#. Description +#: ../mdadm.templates:3001 +#, fuzzy +#| msgid "" +#| "If your system has its root filesystem on an MD array (RAID), it needs to " +#| "be started early during the boot sequence. If your root filesystem is on " +#| "a logical volume (LVM), which is on MD, all constituent arrays need to be " +#| "started." +msgid "" +"If the system's root file system is located on an MD array (RAID), it needs " +"to be started early during the boot sequence. If it is located on a logical " +"volume (LVM), which is on MD, all constituent arrays need to be started." +msgstr "" +"Se o seu sistema tem o sistema de arquivos raiz em um dispositivo MD (RAID), " +"este precisa ser iniciado mais cedo durante a seqüência de inicialização. Se " +"o sistema de arquivos raiz está em um volume lógico (LVM), que está em um " +"MD, todos os dispositivos que o constituem precisam ser iniciados." + +#. Type: text +#. Description +#: ../mdadm.templates:3001 +#, fuzzy +#| msgid "" +#| "If you know exactly which arrays are needed to bring up the root " +#| "filesystem, and you want to postpone starting all other arrays to a later " +#| "point in the boot sequence, enter the arrays to start here. " +#| "Alternatively, enter 'all' to simply start all available arrays." +msgid "" +"If you know exactly which arrays are needed to bring up the root file " +"system, and you want to postpone starting all other arrays to a later point " +"in the boot sequence, enter the arrays to start here. Alternatively, enter " +"'all' to simply start all available arrays." +msgstr "" +"Se você sabe exatamente quais dispositivos são necessários para ativar o " +"sistema de arquivos raiz, e você deseja adiar o inÃcio de todos os outros " +"dispositivos para um ponto posterior na seqüência de inicialização, informe " +"os dispositivos a serem iniciados aqui. Como alternativa, informe 'all' para " +"simplesmente iniciar todos os dispositivos disponÃveis." + +#. Type: text +#. Description +#: ../mdadm.templates:3001 +#, fuzzy +#| msgid "" +#| "If you do not need or want to start any arrays for the root filesystem, " +#| "leave the answer blank (or enter 'none'). This may be the case if you are " +#| "using kernel autostart or do not need any arrays to boot." +msgid "" +"If you do not need or want to start any arrays for the root file system, " +"leave the answer blank (or enter 'none'). This may be the case if you are " +"using kernel autostart or do not need any arrays to boot." +msgstr "" +"Se você não precisa ou não quer iniciar quaisquer dispositivos para o " +"sistema de arquivos raiz, deixe a resposta em branco (ou informe 'none'). " +"Este pode ser o caso se você está usando \"kernel autostart\" ou não precisa " +"de quaisquer dispositivos para a inicialização." + +#. Type: text +#. Description +#: ../mdadm.templates:4001 +msgid "An error occurred: device node does not exist" +msgstr "Um erro ocorreu: o dispositivo (\"device node\") não existe" + +#. Type: text +#. Description +#: ../mdadm.templates:5001 +msgid "An error occurred: not a block device" +msgstr "Um erro ocorreu: não é um dispositivo de blocos" + +#. Type: text +#. Description +#: ../mdadm.templates:6001 +msgid "An error occurred: not an MD array" +msgstr "Um erro ocorreu: não é um dispositivo MD" + +#. Type: text +#. Description +#: ../mdadm.templates:7001 +msgid "An error occurred: array not listed in mdadm.conf file" +msgstr "Um erro ocorreu: dispositivo não listado no arquivo mdadm.conf" + +#. Type: boolean +#. Description +#: ../mdadm.templates:8001 +#, fuzzy +#| msgid "Proceed with starting arrays not listed in mdadm.conf?" +msgid "Start arrays not listed in mdadm.conf?" +msgstr "Continuar com o inÃcio de dispositivos não listados no mdadm.conf?" + +#. Type: boolean +#. Description +#: ../mdadm.templates:8001 +#, fuzzy +#| msgid "" +#| "The array you have specified (${array}) is not listed in the " +#| "configuration file ${config}. Therefore it cannot be started during boot, " +#| "unless you correct the configuration file and recreate the initial " +#| "ramdisk." +msgid "" +"The specified array (${array}) is not listed in the configuration file " +"(${config}). Therefore, it cannot be started during boot, unless you correct " +"the configuration file and recreate the initial ramdisk." +msgstr "" +"O dispositivo que você especificou (${array}) não está listado no arquivo de " +"configuração ${config}. Portanto não pode ser iniciado durante a " +"inicialização, a menos que você corrija o arquivo de configuração e recrie o " +"\"ramdisk\" inicial." + +#. Type: boolean +#. Description +#: ../mdadm.templates:8001 +msgid "" +"This warning is only relevant if you need arrays to be started from the " +"initial ramdisk to be able to boot. If you use kernel autostarting, or do " +"not need any arrays to be started as early as the initial ramdisk is loaded, " +"you can simply continue. Alternatively, choose not to continue and enter " +"'none' when prompted which arrays to start from the initial ramdisk." +msgstr "" +"Este aviso só é relevante se você precisa de dispositivos que sejam " +"iniciados a partir do \"ramdisk\" inicial para que seja possÃvel inicializar " +"o computador. Se você usa \"kernel autostarting\", ou não precisa de " +"quaisquer dispositivos sendo iniciados tão logo o \"ramdisk\" inicial seja " +"carregado, você pode simplesmente continuar. Alternativamente, escolha não " +"continuar e informe 'none' quando perguntado quais dispositivos iniciar a " +"partir do \"ramdisk\" inicial." + +#. Type: boolean +#. Description +#: ../mdadm.templates:10001 +msgid "Should mdadm run monthly redundancy checks of the MD arrays?" +msgstr "" +"O mdadm deve, mensalmente, executar checagens de redundância dos " +"dispositivos MD?" + +#. Type: boolean +#. Description +#: ../mdadm.templates:10001 +#, fuzzy +#| msgid "" +#| "If your kernel supports it (>> 2.6.14), mdadm can periodically check the " +#| "redundancy of your MD arrays (RAIDs). This may be a resource-intensive " +#| "process, depending on your setup, but it could help prevent rare cases of " +#| "data loss. Note that this is a read-only check unless errors are found; " +#| "if errors are found, mdadm will try to correct them, which may result in " +#| "write access to the media." +msgid "" +"If the kernel supports it (versions greater than 2.6.14), mdadm can " +"periodically check the redundancy of MD arrays (RAIDs). This may be a " +"resource-intensive process, depending on the local setup, but it could help " +"prevent rare cases of data loss. Note that this is a read-only check unless " +"errors are found; if errors are found, mdadm will try to correct them, which " +"may result in write access to the media." +msgstr "" +"Se o seu kernel suporta isto (>> 2.6.14), mdadm pode periodicamente checar a " +"redundância dos seus dispositivos MD (RAIDs). Isto pode ser um processo com " +"uso intensivo dos recursos, dependendo da sua configuração, mas pode ajudar " +"a previnir casos raros de perdas de dados. Note que esta é uma checagem " +"somente-leitura a menos que erros sejam encontrados; se erros são " +"encontrados, mdadm tentará corrigÃ-los, o que poderá resultar em acesso de " +"escrita na mÃdia." + +#. Type: boolean +#. Description +#: ../mdadm.templates:10001 +#, fuzzy +#| msgid "" +#| "The default, if turned on, is to run the checks on the first Sunday of " +#| "every month at 01:06 o'clock." +msgid "" +"The default, if turned on, is to check on the first Sunday of every month at " +"01:06." +msgstr "" +"O padrão, se ativado, é executar checagens no primeiro Domingo de cada mês " +"à s 01:06 em ponto." + +#. Type: boolean +#. Description +#: ../mdadm.templates:11001 +msgid "Do you want to start the MD monitoring daemon?" +msgstr "Você deseja iniciar o \"daemon\" de monitoramento MD?" + +#. Type: boolean +#. Description +#: ../mdadm.templates:11001 +#, fuzzy +#| msgid "" +#| "The MD (RAID) monitor daemon sends email notifications in response to " +#| "important MD events (such as a disk failure). You probably want to enable " +#| "it." +msgid "" +"The MD (RAID) monitor daemon sends email notifications in response to " +"important MD events (such as a disk failure)." +msgstr "" +"O \"daemon\" de monitoramento MD (RAID) envia e-mails de notificações em " +"resposta a eventos MD importantes (como uma falha de disco). Você " +"provavelmente quer habilitar esta opção." + +#. Type: boolean +#. Description +#: ../mdadm.templates:11001 +msgid "Enabling this option is recommended." +msgstr "" + +#. Type: string +#. Description +#: ../mdadm.templates:12001 +msgid "Recipient for email notifications:" +msgstr "Destinatário para os e-mails de notificações:" + +#. Type: string +#. Description +#: ../mdadm.templates:12001 +#, fuzzy +#| msgid "" +#| "Please enter the email address of the user who should get the email " +#| "notification for important MD events." +msgid "" +"Please enter the email address of the user who should get the email " +"notifications for important MD events." +msgstr "" +"Por favor, informe o endereço de e-mail do usuário que deverá receber os e-" +"mails de notificações para estes eventos MD importantes." + +#~ msgid "Initialise the superblock if you reuse hard disks" +#~ msgstr "Inicialize o superbloco caso você reutilize discos rÃgidos" + +#~ msgid "" +#~ "WARNING! If you are using hard disks which have RAID superblocks from " +#~ "earlier installations in different RAID arrays, you MUST zero each " +#~ "superblock *before* activating the autostart feature." +#~ msgstr "" +#~ "AVISO! Se você estiver usando discos rÃgidos que já contêm superblocos " +#~ "RAID de instalações anteriores em \"arrays\" RAID diferentes, você DEVE " +#~ "zerar o superbloco *antes* de ativar o recurso de \"autostart\"." + +#~ msgid "" +#~ "To do this, do not start the RAID devices automatically. First, zero the " +#~ "superblock (mdadm --zero-superblock /dev/mdX). Next, use `dpkg-" +#~ "reconfigure mdadm` to reactivate the autostart feature." +#~ msgstr "" +#~ "Para fazê-lo, não inicie os dispositivos RAID automaticamente. Primeiro, " +#~ "zere os superblocos (mdadm --zero-superblock /dev/mdX). Em seguida, use " +#~ "`dpkg-reconfigure mdadm` para reativar o recurso de \"autostart\"." + +#~ msgid "" +#~ "You have the option to start all other arrays (those not needed for the " +#~ "root filesystem) later in the boot sequence. Doing so will give you " +#~ "greater control over the arrays with the mdadm configuration file. " +#~ "Starting all arrays at boot-time may be safer though." +#~ msgstr "" +#~ "Você tem a opção de iniciar todos os \"arrays\" (aqueles que não são " +#~ "necessários pelo sistema de arquivos raiz) posteriormente na seqüência de " +#~ "inicialização. Fazendo isto, você terá um controle maior sobre os \"arrays" +#~ "\" com o arquivo de configuração mdadm. No entanto, iniciar todos os " +#~ "\"arrays\" durante a inicialização pode ser mais seguro." + +#~ msgid "" +#~ "If RAID devices are started automatically, all RAID devices are " +#~ "discovered and assembled automatically at system startup. This option " +#~ "should only be used if the md driver is compiled as a module. If it is " +#~ "compiled into your kernel, the automatic startup will be performed at " +#~ "boot time by the kernel and therefore you should not choose this option." +#~ msgstr "" +#~ "Caso os dispositivos RAID sejam iniciados automaticamente, todos os " +#~ "dispositivos RAID serão detectados e montados automaticamente na " +#~ "inicialização do sistema operacional. Esta opção deverá ser usada somente " +#~ "caso o driver md esteja compilado como módulo. Caso o mesmo esteja " +#~ "compilado embutido em seu kernel, a inicialização automática será " +#~ "executada em tempo de inicialização pelo próprio kernel e, portanto, você " +#~ "não deverá e nem precisará escolher esta opção." + +#~ msgid "" +#~ "When the RAID monitor daemon runs, email notifications are sent when a " +#~ "disk belonging to a RAID array fails or changes its status for some " +#~ "reason." +#~ msgstr "" +#~ "Quando o daemon monitorador RAID é executado, notificações via e-mail são " +#~ "enviadas quando um disco pertencente a uma array RAID falha ou muda seu " +#~ "status por qualquer razão." + +#~ msgid "Which user should get the email notification?" +#~ msgstr "Qual usuário deve receber o e-mail de notificação ?" diff --git a/debian/po/ru.po b/debian/po/ru.po new file mode 100644 index 00000000..a4aff77d --- /dev/null +++ b/debian/po/ru.po @@ -0,0 +1,224 @@ +# translation of ru.po to Russian +# +# Translators, if you are not familiar with the PO format, gettext +# documentation is worth reading, especially sections dedicated to +# this format, e.g. by running: +# info -n '(gettext)PO Files' +# info -n '(gettext)Header Entry' +# Some information specific to po-debconf are available at +# /usr/share/doc/po-debconf/README-trans +# or http://www.debian.org/intl/l10n/po-debconf/README-trans# +# Developers do not need to manually edit POT or PO files. +# +# Yuri Kozlov <kozlov.y@gmail.com>, 2006, 2008. +msgid "" +msgstr "" +"Project-Id-Version: 2.6.3+200709292116+4450e59-4\n" +"Report-Msgid-Bugs-To: mdadm@packages.debian.org\n" +"POT-Creation-Date: 2012-05-22 00:50+0100\n" +"PO-Revision-Date: 2008-02-07 21:02+0300\n" +"Last-Translator: Yuri Kozlov <kozlov.y@gmail.com>\n" +"Language-Team: Russian <debian-l10n-russian@lists.debian.org>\n" +"Language: ru\n" +"MIME-Version: 1.0\n" +"Content-Type: text/plain; charset=UTF-8\n" +"Content-Transfer-Encoding: 8bit\n" +"X-Generator: KBabel 1.11.4\n" +"Plural-Forms: nplurals=3; plural=(n%10==1 && n%100!=11 ? 0 : n%10>=2 && n" +"%10<=4 && (n%100<10 || n%100>=20) ? 1 : 2);\n" + +#. Type: string +#. Description +#: ../mdadm.templates:2001 +msgid "MD arrays needed for the root file system:" +msgstr "MD-маÑÑивы, необходимые Ð´Ð»Ñ ÐºÐ¾Ñ€Ð½ÐµÐ²Ð¾Ð¹ файловой ÑиÑтемы:" + +#. Type: string +#. Description +#: ../mdadm.templates:2001 +msgid "" +"Please enter 'all', 'none', or a space-separated list of devices such as " +"'md0 md1' or 'md/1 md/d0' (the leading '/dev/' can be omitted)." +msgstr "" +"Введите ÑпиÑок уÑтройÑтв через пробел, Ñлово 'all' или 'none'. Ð’Ñ‹ можете не " +"указывать начальную чаÑÑ‚ÑŒ пути типа '/dev/', а проÑто вводить имена " +"уÑтройÑтв, например 'md0 md1' или 'md/1 md/d0'." + +#. Type: text +#. Description +#: ../mdadm.templates:3001 +msgid "for internal use - only the long description is needed." +msgstr "" +"Ð´Ð»Ñ Ð²Ð½ÑƒÑ‚Ñ€ÐµÐ½Ð½ÐµÐ³Ð¾ Ð¿Ð¾Ð»ÑŒÐ·Ð¾Ð²Ð°Ð½Ð¸Ñ - нужно иÑпользовать только длинное опиÑание." + +#. Type: text +#. Description +#: ../mdadm.templates:3001 +msgid "" +"If the system's root file system is located on an MD array (RAID), it needs " +"to be started early during the boot sequence. If it is located on a logical " +"volume (LVM), which is on MD, all constituent arrays need to be started." +msgstr "" +"ЕÑли в ÑиÑтеме ÐºÐ¾Ñ€Ð½ÐµÐ²Ð°Ñ Ñ„Ð°Ð¹Ð»Ð¾Ð²Ð°Ñ ÑиÑтема раÑположена на MD-маÑÑиве (RAID), " +"он должен быть запущен в Ñамом начале процеÑÑа загрузки. ЕÑли ÐºÐ¾Ñ€Ð½ÐµÐ²Ð°Ñ " +"Ñ„Ð°Ð¹Ð»Ð¾Ð²Ð°Ñ ÑиÑтема раÑположена на логичеÑком томе (LVM), который раÑположен на " +"MD, то должны быть запущены вÑе ÑоÑтавлÑющие маÑÑивы." + +#. Type: text +#. Description +#: ../mdadm.templates:3001 +msgid "" +"If you know exactly which arrays are needed to bring up the root file " +"system, and you want to postpone starting all other arrays to a later point " +"in the boot sequence, enter the arrays to start here. Alternatively, enter " +"'all' to simply start all available arrays." +msgstr "" +"ЕÑли вы точно знаете, какие маÑÑивы требуютÑÑ Ð´Ð»Ñ Ð¿Ð¾Ð»ÑƒÑ‡ÐµÐ½Ð¸Ñ Ñ€Ð°Ð±Ð¾Ñ‚Ð¾ÑпоÑобной " +"корневой файловой ÑиÑтемы и хотите отложить запуÑк оÑтальных маÑÑивов на " +"более поздний момент в процеÑÑе загрузки, то введите их здеÑÑŒ. Иначе, " +"введите Ñлово 'all', чтобы проÑто запуÑтить вÑе доÑтупные маÑÑивы." + +#. Type: text +#. Description +#: ../mdadm.templates:3001 +msgid "" +"If you do not need or want to start any arrays for the root file system, " +"leave the answer blank (or enter 'none'). This may be the case if you are " +"using kernel autostart or do not need any arrays to boot." +msgstr "" +"ЕÑли вам Ñто не нужно, или вы хотите запуÑкать вÑе маÑÑивы Ð´Ð»Ñ ÐºÐ¾Ñ€Ð½ÐµÐ²Ð¾Ð¹ " +"файловой ÑиÑтемы, оÑтавьте Ñто поле пуÑтым (или введите Ñлово 'none'). Ðтот " +"вариант подходит, еÑли вы иÑпользуете автоматичеÑкий запуÑк из Ñдра или еÑли " +"Ð´Ð»Ñ Ð·Ð°Ð³Ñ€ÑƒÐ·ÐºÐ¸ маÑÑивы ненужны." + +#. Type: text +#. Description +#: ../mdadm.templates:4001 +msgid "An error occurred: device node does not exist" +msgstr "Произошла ошибка: нода уÑтройÑтва не ÑущеÑтвует" + +#. Type: text +#. Description +#: ../mdadm.templates:5001 +msgid "An error occurred: not a block device" +msgstr "Произошла ошибка: уÑтройÑтво не ÑвлÑетÑÑ Ð±Ð»Ð¾Ñ‡Ð½Ñ‹Ð¼" + +#. Type: text +#. Description +#: ../mdadm.templates:6001 +msgid "An error occurred: not an MD array" +msgstr "Произошла ошибка: Ñто не MD-маÑÑив" + +#. Type: text +#. Description +#: ../mdadm.templates:7001 +msgid "An error occurred: array not listed in mdadm.conf file" +msgstr "Произошла ошибка: маÑÑив не опиÑан в файле mdadm.conf" + +#. Type: boolean +#. Description +#: ../mdadm.templates:8001 +msgid "Start arrays not listed in mdadm.conf?" +msgstr "ЗапуÑтить маÑÑивы, неопиÑанные в mdadm.conf?" + +#. Type: boolean +#. Description +#: ../mdadm.templates:8001 +msgid "" +"The specified array (${array}) is not listed in the configuration file " +"(${config}). Therefore, it cannot be started during boot, unless you correct " +"the configuration file and recreate the initial ramdisk." +msgstr "" +"Указанный вами маÑÑив (${array}) не опиÑан в конфигурационном файле " +"(${config}). ПоÑтому он не может быть запущен при Ñтарте машины, пока вы не " +"иÑправите конфигурационный файл и не переÑоздадите первоначальный ramdisk." + +#. Type: boolean +#. Description +#: ../mdadm.templates:8001 +msgid "" +"This warning is only relevant if you need arrays to be started from the " +"initial ramdisk to be able to boot. If you use kernel autostarting, or do " +"not need any arrays to be started as early as the initial ramdisk is loaded, " +"you can simply continue. Alternatively, choose not to continue and enter " +"'none' when prompted which arrays to start from the initial ramdisk." +msgstr "" +"Данное предупреждение умеÑтно только, еÑли вам требуетÑÑ Ð·Ð°Ð¿ÑƒÑкать маÑÑивы " +"из первоначального ramdisk Ð´Ð»Ñ Ð·Ð°Ð³Ñ€ÑƒÐ·ÐºÐ¸ машины. ЕÑли вы иÑпользуете " +"автоматичеÑкий запуÑк из Ñдра или вам не нужны маÑÑивы Ð´Ð»Ñ Ð·Ð°Ð³Ñ€ÑƒÐ·ÐºÐ¸ на Ñтапе " +"загрузки первоначального ramdisk, вы можете проÑто продолжить. Иначе, " +"выберите не продолжать и введите 'none', когда вам предложат выбрать маÑÑивы " +"Ð´Ð»Ñ Ð·Ð°Ð¿ÑƒÑка из первоначального ramdisk." + +#. Type: boolean +#. Description +#: ../mdadm.templates:10001 +msgid "Should mdadm run monthly redundancy checks of the MD arrays?" +msgstr "" +"Должен ли mdadm запуÑкать ежемеÑÑчную проверку избыточноÑти на MD-маÑÑивах?" + +#. Type: boolean +#. Description +#: ../mdadm.templates:10001 +msgid "" +"If the kernel supports it (versions greater than 2.6.14), mdadm can " +"periodically check the redundancy of MD arrays (RAIDs). This may be a " +"resource-intensive process, depending on the local setup, but it could help " +"prevent rare cases of data loss. Note that this is a read-only check unless " +"errors are found; if errors are found, mdadm will try to correct them, which " +"may result in write access to the media." +msgstr "" +"ЕÑли Ñто поддерживаетÑÑ Ñдром (>> 2.6.14), mdadm может периодичеÑки " +"проверÑÑ‚ÑŒ избыточноÑÑ‚ÑŒ MD маÑÑивов (RAID-ов). Ðто может Ñтать реÑурÑоёмким " +"процеÑÑом в завиÑимоÑти от наÑтройки, но он может помочь предотвратить " +"редкие Ñлучаи потери данных. Заметим, что пока не обнаружено ошибок, работа " +"ведётÑÑ Ð² режиме только чтение; еÑли обнаруживаетÑÑ Ð¾ÑˆÐ¸Ð±ÐºÐ°, mdadm попытаетÑÑ " +"иÑправить её, что может потребовать прав запиÑи на ноÑитель." + +#. Type: boolean +#. Description +#: ../mdadm.templates:10001 +msgid "" +"The default, if turned on, is to check on the first Sunday of every month at " +"01:06." +msgstr "" +"ЕÑли ответить утвердительно, то по умолчанию проверка выполнÑетÑÑ Ð² первое " +"воÑкреÑенье каждого меÑÑца в 01:06." + +#. Type: boolean +#. Description +#: ../mdadm.templates:11001 +msgid "Do you want to start the MD monitoring daemon?" +msgstr "ЗапуÑкать демон-монитор MD?" + +#. Type: boolean +#. Description +#: ../mdadm.templates:11001 +msgid "" +"The MD (RAID) monitor daemon sends email notifications in response to " +"important MD events (such as a disk failure)." +msgstr "" +"Демон-монитор MD (RAID) поÑылает почтовые ÑƒÐ²ÐµÐ´Ð¾Ð¼Ð»ÐµÐ½Ð¸Ñ Ð² Ñлучае Ð²Ð¾Ð·Ð½Ð¸ÐºÐ½Ð¾Ð²ÐµÐ½Ð¸Ñ " +"важных Ñобытий Ñ MD (таких как отказ диÑка)." + +#. Type: boolean +#. Description +#: ../mdadm.templates:11001 +msgid "Enabling this option is recommended." +msgstr "РекомендуетÑÑ Ð¾Ñ‚Ð²ÐµÑ‚Ð¸Ñ‚ÑŒ утвердительно." + +#. Type: string +#. Description +#: ../mdadm.templates:12001 +msgid "Recipient for email notifications:" +msgstr "Получатель уведомительных пиÑем:" + +#. Type: string +#. Description +#: ../mdadm.templates:12001 +msgid "" +"Please enter the email address of the user who should get the email " +"notifications for important MD events." +msgstr "" +"Введите Ð°Ð´Ñ€ÐµÑ Ñлектронной почты пользователÑ, который будет получать " +"почтовые ÑƒÐ²ÐµÐ´Ð¾Ð¼Ð»ÐµÐ½Ð¸Ñ Ð¾ важных изменениÑÑ… в ÑоÑтоÑнии MD." diff --git a/debian/po/sk.po b/debian/po/sk.po new file mode 100644 index 00000000..5d6d5a50 --- /dev/null +++ b/debian/po/sk.po @@ -0,0 +1,212 @@ +# Slovak translations for mdadm package +# Slovenské preklady pre balÃk mdadm. +# Copyright (C) 2011 THE mdadm'S COPYRIGHT HOLDER +# This file is distributed under the same license as the mdadm package. +# Automatically generated, 2011. +# Slavko <linux@slavino.sk>, 2011. +# +msgid "" +msgstr "" +"Project-Id-Version: mdadm 3.2.2-1\n" +"Report-Msgid-Bugs-To: mdadm@packages.debian.org\n" +"POT-Creation-Date: 2012-05-22 00:50+0100\n" +"PO-Revision-Date: 2011-09-18 11:22+0200\n" +"Last-Translator: Slavko <linux@slavino.sk>\n" +"Language-Team: Slovak <debian-l10n-slovak@lists.debian.org>\n" +"Language: sk\n" +"MIME-Version: 1.0\n" +"Content-Type: text/plain; charset=UTF-8\n" +"Content-Transfer-Encoding: 8bit\n" +"Plural-Forms: nplurals=3; plural=(n==1) ? 0 : (n>=2 && n<=4) ? 1 : 2;\n" + +#. Type: string +#. Description +#: ../mdadm.templates:2001 +msgid "MD arrays needed for the root file system:" +msgstr "Polia MD, potrebné pre koreň súborového systému:" + +#. Type: string +#. Description +#: ../mdadm.templates:2001 +msgid "" +"Please enter 'all', 'none', or a space-separated list of devices such as " +"'md0 md1' or 'md/1 md/d0' (the leading '/dev/' can be omitted)." +msgstr "" +"ProsÃm, zadajte „allâ€, „none†alebo medzerou oddelený zoznam zariadenÃ, " +"napr. „md0 md1†alebo „md/1 md/d0†(poÄiatoÄné „/dev/†môže byÅ¥ vynechané)." + +#. Type: text +#. Description +#: ../mdadm.templates:3001 +msgid "for internal use - only the long description is needed." +msgstr "na interné použitie – potrebný je len dlhý popis." + +#. Type: text +#. Description +#: ../mdadm.templates:3001 +msgid "" +"If the system's root file system is located on an MD array (RAID), it needs " +"to be started early during the boot sequence. If it is located on a logical " +"volume (LVM), which is on MD, all constituent arrays need to be started." +msgstr "" +"Ak je koreň súborového systému umiestnený na poli MD (RAID), musà byÅ¥ " +"spustený poÄas zavádzania systému. Ak je koreň umiestnený na logickom zväzku " +"(LVM), ktorý je na MD, musia byÅ¥ spustené vÅ¡etky súvisiace polia." + +#. Type: text +#. Description +#: ../mdadm.templates:3001 +msgid "" +"If you know exactly which arrays are needed to bring up the root file " +"system, and you want to postpone starting all other arrays to a later point " +"in the boot sequence, enter the arrays to start here. Alternatively, enter " +"'all' to simply start all available arrays." +msgstr "" +"Ak viete presne, ktoré polia sú potrebné na pripojenie koreňa súborového " +"systému a chcete odložiÅ¥ Å¡tart vÅ¡etkých ostatných polà na neskorÅ¡Ã okamih " +"zavádzania, zadajte tu polia, ktoré majú byÅ¥ spustené. Alebo zadajte „allâ€, " +"ÄÃm budú jednoducho spustené vÅ¡etky dostupné polia." + +#. Type: text +#. Description +#: ../mdadm.templates:3001 +msgid "" +"If you do not need or want to start any arrays for the root file system, " +"leave the answer blank (or enter 'none'). This may be the case if you are " +"using kernel autostart or do not need any arrays to boot." +msgstr "" +"Ak pre koreň súborového systému nepotrebujete alebo nechcete spúšťaÅ¥ žiadne " +"polia, nechajte odpoveÄ prázdnu (alebo zadajte „noneâ€). Tento prÃpad môže " +"nastaÅ¥, ak použÃvate automatický Å¡tart polà priamo v jadre alebo " +"nepotrebujete pri zavádzanà žiadne polia." + +#. Type: text +#. Description +#: ../mdadm.templates:4001 +msgid "An error occurred: device node does not exist" +msgstr "Nastala chyba: uzol zariadenia neexistuje" + +#. Type: text +#. Description +#: ../mdadm.templates:5001 +msgid "An error occurred: not a block device" +msgstr "Nastala chyba: nie je blokové zariadenie" + +#. Type: text +#. Description +#: ../mdadm.templates:6001 +msgid "An error occurred: not an MD array" +msgstr "Nastala chyba: nie je pole MD" + +#. Type: text +#. Description +#: ../mdadm.templates:7001 +msgid "An error occurred: array not listed in mdadm.conf file" +msgstr "Nastala chyba: pole nie je uvedené v súbore mdadm.conf" + +#. Type: boolean +#. Description +#: ../mdadm.templates:8001 +msgid "Start arrays not listed in mdadm.conf?" +msgstr "SpustiÅ¥ polia, ktoré nie sú uvedené v mdadm.conf?" + +#. Type: boolean +#. Description +#: ../mdadm.templates:8001 +msgid "" +"The specified array (${array}) is not listed in the configuration file " +"(${config}). Therefore, it cannot be started during boot, unless you correct " +"the configuration file and recreate the initial ramdisk." +msgstr "" +"Zadané pole (${array}) nie je uvedené v konfiguraÄnom súbore (${config}), a " +"preto nemôže byÅ¥ spustené poÄas zavádzania, až kým neopravÃte konfiguraÄný " +"súbor a nevytvorÃte nový poÄiatoÄný ramdisk (initrd)." + +#. Type: boolean +#. Description +#: ../mdadm.templates:8001 +msgid "" +"This warning is only relevant if you need arrays to be started from the " +"initial ramdisk to be able to boot. If you use kernel autostarting, or do " +"not need any arrays to be started as early as the initial ramdisk is loaded, " +"you can simply continue. Alternatively, choose not to continue and enter " +"'none' when prompted which arrays to start from the initial ramdisk." +msgstr "" +"Toto varovanie je dôležité, len ak potrebujete aby boli polia spúšťané z " +"poÄiatoÄného ramdisku, aby boli dostupné poÄas zavádzania. Ak použÃvate " +"automatické spúšťanie polà priamo z jadra, alebo ak nepotrebujte aby boli " +"polia spúšťané tak skoro (z poÄiatoÄného ramdisku), môžete prosto " +"pokraÄovaÅ¥. Alebo môžete zvoliÅ¥ nepokraÄovaÅ¥ a odpovedaÅ¥ „none†na otázku, " +"ktoré polia majú byÅ¥ spúšťané z poÄiatoÄného ramdisku." + +#. Type: boolean +#. Description +#: ../mdadm.templates:10001 +msgid "Should mdadm run monthly redundancy checks of the MD arrays?" +msgstr "Má mdadm spúšťaÅ¥ mesaÄnú kontrolu redundancie polà MD?" + +#. Type: boolean +#. Description +#: ../mdadm.templates:10001 +msgid "" +"If the kernel supports it (versions greater than 2.6.14), mdadm can " +"periodically check the redundancy of MD arrays (RAIDs). This may be a " +"resource-intensive process, depending on the local setup, but it could help " +"prevent rare cases of data loss. Note that this is a read-only check unless " +"errors are found; if errors are found, mdadm will try to correct them, which " +"may result in write access to the media." +msgstr "" +"Ak to jadro podporuje (verzie novÅ¡ie ako 2.6.14), mdadm môže periodicky " +"kontrolovaÅ¥ redundanciu polà MD (RAIDov). Tento proces môže byÅ¥ (v " +"závislosti od lokálneho nastavenia) nároÄný na zdroje systému, ale môže " +"pomôcÅ¥ pri predchádzanà vzácnym prÃpadom straty dát. Pamätajte, že, pokiaľ " +"nie sú nájdené chyby, je to kontrola read-only, až keÄ sú nájdené chyby, " +"pokúsi sa ich mdadm opraviÅ¥, Äo môže maÅ¥ za následok zápis na médium." + +#. Type: boolean +#. Description +#: ../mdadm.templates:10001 +msgid "" +"The default, if turned on, is to check on the first Sunday of every month at " +"01:06." +msgstr "" +"Predvolene je vypnuté, ak túto možnosÅ¥ zapnete, bude kontrola vykonávaná " +"každú prvú nedeľu mesiaca o 01:06." + +#. Type: boolean +#. Description +#: ../mdadm.templates:11001 +msgid "Do you want to start the MD monitoring daemon?" +msgstr "Chcete spustiÅ¥ démona monitorovania MD?" + +#. Type: boolean +#. Description +#: ../mdadm.templates:11001 +msgid "" +"The MD (RAID) monitor daemon sends email notifications in response to " +"important MD events (such as a disk failure)." +msgstr "" +"Monitorovacà démon MD (RAID) posiela upozornenia emailom, ako reakcie na " +"dôležité udalosti MD (napr. zlyhanie disku)." + +#. Type: boolean +#. Description +#: ../mdadm.templates:11001 +msgid "Enabling this option is recommended." +msgstr "Povolenie tejto možnosti je odporúÄané." + +#. Type: string +#. Description +#: ../mdadm.templates:12001 +msgid "Recipient for email notifications:" +msgstr "PrÃjemca emailových upozornenÃ:" + +#. Type: string +#. Description +#: ../mdadm.templates:12001 +msgid "" +"Please enter the email address of the user who should get the email " +"notifications for important MD events." +msgstr "" +"ProsÃm, zadajte emailovú adresu použÃvateľa, ktorý má dostávaÅ¥ emailové " +"upozornenia na dôležité udalosti MD." diff --git a/debian/po/sv.po b/debian/po/sv.po new file mode 100644 index 00000000..8257c8dd --- /dev/null +++ b/debian/po/sv.po @@ -0,0 +1,219 @@ +# translation of mdadm_2.6.7-3_sv.po to Swedish +# Translators, if you are not familiar with the PO format, gettext +# documentation is worth reading, especially sections dedicated to +# this format, e.g. by running: +# info -n '(gettext)PO Files' +# info -n '(gettext)Header Entry' +# Some information specific to po-debconf are available at +# /usr/share/doc/po-debconf/README-trans +# or http://www.debian.org/intl/l10n/po-debconf/README-trans +# Developers do not need to manually edit POT or PO files. +# +# Martin Ågren <martin.agren@gmail.com>, 2008. +msgid "" +msgstr "" +"Project-Id-Version: mdadm_2.6.7-3_sv\n" +"Report-Msgid-Bugs-To: mdadm@packages.debian.org\n" +"POT-Creation-Date: 2012-05-22 00:50+0100\n" +"PO-Revision-Date: 2008-07-23 18:34+0200\n" +"Last-Translator: Martin Ågren <martin.agren@gmail.com>\n" +"Language-Team: Swedish <debian-l10n-swedish@lists.debian.org>\n" +"Language: sv\n" +"MIME-Version: 1.0\n" +"Content-Type: text/plain; charset=ISO-8859-1\n" +"Content-Transfer-Encoding: 8bit\n" +"X-Generator: KBabel 1.11.4\n" +"Plural-Forms: nplurals=2; plural=(n != 1);\n" + +#. Type: string +#. Description +#: ../mdadm.templates:2001 +msgid "MD arrays needed for the root file system:" +msgstr "MD-kedjor som behövs för rotfilsystemet:" + +#. Type: string +#. Description +#: ../mdadm.templates:2001 +msgid "" +"Please enter 'all', 'none', or a space-separated list of devices such as " +"'md0 md1' or 'md/1 md/d0' (the leading '/dev/' can be omitted)." +msgstr "" +"Ange \"all\", \"none\" eller en blankstegsseparerad lista på enheter, såsom " +"\"md0 md1\" eller \"md/1 md/0\" (det inledande \"/dev\" kan uteslutas)." + +#. Type: text +#. Description +#: ../mdadm.templates:3001 +msgid "for internal use - only the long description is needed." +msgstr "för intern användning - endast den långa beskrivningen behövs." + +#. Type: text +#. Description +#: ../mdadm.templates:3001 +msgid "" +"If the system's root file system is located on an MD array (RAID), it needs " +"to be started early during the boot sequence. If it is located on a logical " +"volume (LVM), which is on MD, all constituent arrays need to be started." +msgstr "" +"Om ditt system har sitt rotfilsystem på en MD-kedja (RAID) behöver den " +"startas upp tidigt under uppstartssekvensen. Om ditt rotfilsystem finns på " +"en logisk volym (LVM), vilket är på MD, behöver alla bestående kedjor " +"startas." + +#. Type: text +#. Description +#: ../mdadm.templates:3001 +msgid "" +"If you know exactly which arrays are needed to bring up the root file " +"system, and you want to postpone starting all other arrays to a later point " +"in the boot sequence, enter the arrays to start here. Alternatively, enter " +"'all' to simply start all available arrays." +msgstr "" +"Om du vet exakt vilka kedjor som behövs för att ta upp rotfilsystemet, och " +"du vill skjuta upp uppstarten för alla andra kedjor till en senare tidspunkt " +"i uppstartssekvensen, ange vilka kedjor som ska starta här. Alternativt, " +"ange \"all\" för att helt enkelt starta alla tillgängliga kedjor." + +#. Type: text +#. Description +#: ../mdadm.templates:3001 +msgid "" +"If you do not need or want to start any arrays for the root file system, " +"leave the answer blank (or enter 'none'). This may be the case if you are " +"using kernel autostart or do not need any arrays to boot." +msgstr "" +"Om du inte behöver eller vill starta några kedjor för rotfilsystemet, lämna " +"svaret blankt (eller ange \"none\"). Detta kan vara fallet om du använder " +"kärnans automatstart eller inte behöver några kedjor för att starta upp." + +#. Type: text +#. Description +#: ../mdadm.templates:4001 +msgid "An error occurred: device node does not exist" +msgstr "Ett fel inträffade: enhetsnoden finns inte" + +#. Type: text +#. Description +#: ../mdadm.templates:5001 +msgid "An error occurred: not a block device" +msgstr "Ett fel inträffade: inte en blockenhet" + +#. Type: text +#. Description +#: ../mdadm.templates:6001 +msgid "An error occurred: not an MD array" +msgstr "Ett fel inträffade: inte en MD-kedja" + +#. Type: text +#. Description +#: ../mdadm.templates:7001 +msgid "An error occurred: array not listed in mdadm.conf file" +msgstr "Ett fel inträffade: kedjan är inte listad i filen mdadm.conf" + +#. Type: boolean +#. Description +#: ../mdadm.templates:8001 +msgid "Start arrays not listed in mdadm.conf?" +msgstr "Starta kedjor som inte är listade i mdadm.conf?" + +#. Type: boolean +#. Description +#: ../mdadm.templates:8001 +msgid "" +"The specified array (${array}) is not listed in the configuration file " +"(${config}). Therefore, it cannot be started during boot, unless you correct " +"the configuration file and recreate the initial ramdisk." +msgstr "" +"Kedjan du har angivit (${array}) är inte listad i konfigurationsfilen " +"(${config}). Därför kan den inte startas under systemets uppstart, såvida du " +"inte rättar till konfigurationsfilen och återskapar den initiala ramdisken." + +#. Type: boolean +#. Description +#: ../mdadm.templates:8001 +msgid "" +"This warning is only relevant if you need arrays to be started from the " +"initial ramdisk to be able to boot. If you use kernel autostarting, or do " +"not need any arrays to be started as early as the initial ramdisk is loaded, " +"you can simply continue. Alternatively, choose not to continue and enter " +"'none' when prompted which arrays to start from the initial ramdisk." +msgstr "" +"Den här varningen är endast relevant om du behöver kedjor som ska startas " +"från den initiala ramdisken för att kunna starta upp systemet. Om du " +"använder kärnans automatstart, eller inte behöver starta några kedjor så " +"tidigt som när de initiala ramdisken läses in, kan du helt enkelt fortsätta. " +"Alternativt, välj att inte fortsätta och ange \"none\" när frågan om vilka " +"kedjor som ska startas från den initiala ramdisken ställs." + +#. Type: boolean +#. Description +#: ../mdadm.templates:10001 +msgid "Should mdadm run monthly redundancy checks of the MD arrays?" +msgstr "Ska mdadm köra månatliga redundanskontroller av MD-kedjorna?" + +#. Type: boolean +#. Description +#: ../mdadm.templates:10001 +msgid "" +"If the kernel supports it (versions greater than 2.6.14), mdadm can " +"periodically check the redundancy of MD arrays (RAIDs). This may be a " +"resource-intensive process, depending on the local setup, but it could help " +"prevent rare cases of data loss. Note that this is a read-only check unless " +"errors are found; if errors are found, mdadm will try to correct them, which " +"may result in write access to the media." +msgstr "" +"Om din kärna har stöd för det (versioner senare än 2.6.14), kan mdadm " +"periodvis kontrollera redundansen för dina MD-kedjor (RAID). Det här kan " +"vara en resurskrävande process, beroende på din konfiguration, men den kan " +"hjälpa till att förhindra ovanliga fall av dataförluster. Observera att det " +"är en skrivskyddad kontroll såvida inte fel påträffas; om fel hittas kommer " +"mdadm försöka att rätta till dem, vilket kan leda till skrivåtkomst till " +"mediet." + +#. Type: boolean +#. Description +#: ../mdadm.templates:10001 +msgid "" +"The default, if turned on, is to check on the first Sunday of every month at " +"01:06." +msgstr "" +"Standardvärdet, om påslagen, är att kontrollera på den första söndagen i " +"varje månad klockan 01.06." + +#. Type: boolean +#. Description +#: ../mdadm.templates:11001 +msgid "Do you want to start the MD monitoring daemon?" +msgstr "Vill du starta MD-övervakningsdemonen?" + +#. Type: boolean +#. Description +#: ../mdadm.templates:11001 +msgid "" +"The MD (RAID) monitor daemon sends email notifications in response to " +"important MD events (such as a disk failure)." +msgstr "" +"MD-övervakningsdemonen (RAID) skickar e-postnotifieringar för viktiga MD-" +"händelser (såsom ett diskfel)." + +#. Type: boolean +#. Description +#: ../mdadm.templates:11001 +msgid "Enabling this option is recommended." +msgstr "Att aktivera denna funktion rekommenderas." + +#. Type: string +#. Description +#: ../mdadm.templates:12001 +msgid "Recipient for email notifications:" +msgstr "Mottagare av e-postnotifieringar:" + +#. Type: string +#. Description +#: ../mdadm.templates:12001 +msgid "" +"Please enter the email address of the user who should get the email " +"notifications for important MD events." +msgstr "" +"Ange e-postadressen till den användare som ska ta emot e-postnotifieringar " +"för dessa viktiga MD-händelser." diff --git a/debian/po/templates.pot b/debian/po/templates.pot new file mode 100644 index 00000000..beb000aa --- /dev/null +++ b/debian/po/templates.pot @@ -0,0 +1,176 @@ +# SOME DESCRIPTIVE TITLE. +# Copyright (C) YEAR THE PACKAGE'S COPYRIGHT HOLDER +# This file is distributed under the same license as the PACKAGE package. +# FIRST AUTHOR <EMAIL@ADDRESS>, YEAR. +# +#, fuzzy +msgid "" +msgstr "" +"Project-Id-Version: mdadm\n" +"Report-Msgid-Bugs-To: mdadm@packages.debian.org\n" +"POT-Creation-Date: 2012-05-22 00:50+0100\n" +"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" +"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n" +"Language-Team: LANGUAGE <LL@li.org>\n" +"Language: \n" +"MIME-Version: 1.0\n" +"Content-Type: text/plain; charset=CHARSET\n" +"Content-Transfer-Encoding: 8bit\n" + +#. Type: string +#. Description +#: ../mdadm.templates:2001 +msgid "MD arrays needed for the root file system:" +msgstr "" + +#. Type: string +#. Description +#: ../mdadm.templates:2001 +msgid "" +"Please enter 'all', 'none', or a space-separated list of devices such as " +"'md0 md1' or 'md/1 md/d0' (the leading '/dev/' can be omitted)." +msgstr "" + +#. Type: text +#. Description +#: ../mdadm.templates:3001 +msgid "for internal use - only the long description is needed." +msgstr "" + +#. Type: text +#. Description +#: ../mdadm.templates:3001 +msgid "" +"If the system's root file system is located on an MD array (RAID), it needs " +"to be started early during the boot sequence. If it is located on a logical " +"volume (LVM), which is on MD, all constituent arrays need to be started." +msgstr "" + +#. Type: text +#. Description +#: ../mdadm.templates:3001 +msgid "" +"If you know exactly which arrays are needed to bring up the root file " +"system, and you want to postpone starting all other arrays to a later point " +"in the boot sequence, enter the arrays to start here. Alternatively, enter " +"'all' to simply start all available arrays." +msgstr "" + +#. Type: text +#. Description +#: ../mdadm.templates:3001 +msgid "" +"If you do not need or want to start any arrays for the root file system, " +"leave the answer blank (or enter 'none'). This may be the case if you are " +"using kernel autostart or do not need any arrays to boot." +msgstr "" + +#. Type: text +#. Description +#: ../mdadm.templates:4001 +msgid "An error occurred: device node does not exist" +msgstr "" + +#. Type: text +#. Description +#: ../mdadm.templates:5001 +msgid "An error occurred: not a block device" +msgstr "" + +#. Type: text +#. Description +#: ../mdadm.templates:6001 +msgid "An error occurred: not an MD array" +msgstr "" + +#. Type: text +#. Description +#: ../mdadm.templates:7001 +msgid "An error occurred: array not listed in mdadm.conf file" +msgstr "" + +#. Type: boolean +#. Description +#: ../mdadm.templates:8001 +msgid "Start arrays not listed in mdadm.conf?" +msgstr "" + +#. Type: boolean +#. Description +#: ../mdadm.templates:8001 +msgid "" +"The specified array (${array}) is not listed in the configuration file " +"(${config}). Therefore, it cannot be started during boot, unless you correct " +"the configuration file and recreate the initial ramdisk." +msgstr "" + +#. Type: boolean +#. Description +#: ../mdadm.templates:8001 +msgid "" +"This warning is only relevant if you need arrays to be started from the " +"initial ramdisk to be able to boot. If you use kernel autostarting, or do " +"not need any arrays to be started as early as the initial ramdisk is loaded, " +"you can simply continue. Alternatively, choose not to continue and enter " +"'none' when prompted which arrays to start from the initial ramdisk." +msgstr "" + +#. Type: boolean +#. Description +#: ../mdadm.templates:10001 +msgid "Should mdadm run monthly redundancy checks of the MD arrays?" +msgstr "" + +#. Type: boolean +#. Description +#: ../mdadm.templates:10001 +msgid "" +"If the kernel supports it (versions greater than 2.6.14), mdadm can " +"periodically check the redundancy of MD arrays (RAIDs). This may be a " +"resource-intensive process, depending on the local setup, but it could help " +"prevent rare cases of data loss. Note that this is a read-only check unless " +"errors are found; if errors are found, mdadm will try to correct them, which " +"may result in write access to the media." +msgstr "" + +#. Type: boolean +#. Description +#: ../mdadm.templates:10001 +msgid "" +"The default, if turned on, is to check on the first Sunday of every month at " +"01:06." +msgstr "" + +#. Type: boolean +#. Description +#: ../mdadm.templates:11001 +msgid "Do you want to start the MD monitoring daemon?" +msgstr "" + +#. Type: boolean +#. Description +#: ../mdadm.templates:11001 +msgid "" +"The MD (RAID) monitor daemon sends email notifications in response to " +"important MD events (such as a disk failure)." +msgstr "" + +#. Type: boolean +#. Description +#: ../mdadm.templates:11001 +msgid "Enabling this option is recommended." +msgstr "" + +#. Type: string +#. Description +#: ../mdadm.templates:12001 +msgid "Recipient for email notifications:" +msgstr "" + +#. Type: string +#. Description +#: ../mdadm.templates:12001 +msgid "" +"Please enter the email address of the user who should get the email " +"notifications for important MD events." +msgstr "" diff --git a/debian/po/vi.po b/debian/po/vi.po new file mode 100644 index 00000000..0e4933ca --- /dev/null +++ b/debian/po/vi.po @@ -0,0 +1,215 @@ +# Vietnamese Translation for mdadm. +# Copyright © 2008 Free Software Foundation, Inc. +# Clytie Siddall <clytie@riverland.net.au>, 2005-2008. +# +msgid "" +msgstr "" +"Project-Id-Version: mdadm 2.6.3+200709292116+4450e59-4\n" +"Report-Msgid-Bugs-To: mdadm@packages.debian.org\n" +"POT-Creation-Date: 2012-05-22 00:50+0100\n" +"PO-Revision-Date: 2008-02-23 17:40+1030\n" +"Last-Translator: Clytie Siddall <clytie@riverland.net.au>\n" +"Language-Team: Vietnamese <vi-VN@googlegroups.com>\n" +"Language: vi\n" +"MIME-Version: 1.0\n" +"Content-Type: text/plain; charset=utf-8\n" +"Content-Transfer-Encoding: 8bit\n" +"Plural-Forms: nplurals=1; plural=0;\n" +"X-Generator: LocFactoryEditor 1.7b3\n" + +#. Type: string +#. Description +#: ../mdadm.templates:2001 +msgid "MD arrays needed for the root file system:" +msgstr "Các mảng MD cần thiết cho hệ thống táºp tin gốc:" + +#. Type: string +#. Description +#: ../mdadm.templates:2001 +msgid "" +"Please enter 'all', 'none', or a space-separated list of devices such as " +"'md0 md1' or 'md/1 md/d0' (the leading '/dev/' can be omitted)." +msgstr "" +"Hãy nháºp « all » (tất cả), « none » (không có), hoặc má»™t danh sách các thiết " +"bị định giá»›i bằng dấu cách nhÆ° « md0 md1 » hoặc « md/1 md/d0 » (có thể bá» " +"sót phần « /dev/ » Ä‘i trÆ°á»›c)." + +#. Type: text +#. Description +#: ../mdadm.templates:3001 +msgid "for internal use - only the long description is needed." +msgstr "để sá» dụng ná»™i bá»™ — chỉ cần thiết mô tả dà i." + +#. Type: text +#. Description +#: ../mdadm.templates:3001 +msgid "" +"If the system's root file system is located on an MD array (RAID), it needs " +"to be started early during the boot sequence. If it is located on a logical " +"volume (LVM), which is on MD, all constituent arrays need to be started." +msgstr "" +"Nếu hệ thống táºp tin gốc của hệ thống nằm trên má»™t mảng MD (RAID) thì cần " +"phải khởi chạy nó sá»›m trong tiến trình khởi Ä‘á»™ng. Nếu nó nằm trên má»™t khối " +"tin hợp lý (LVM) mà lần lượt nằm trên má»™t MD thì cần phải khởi chạy tất cả " +"các mảng thà nh phần." + +#. Type: text +#. Description +#: ../mdadm.templates:3001 +msgid "" +"If you know exactly which arrays are needed to bring up the root file " +"system, and you want to postpone starting all other arrays to a later point " +"in the boot sequence, enter the arrays to start here. Alternatively, enter " +"'all' to simply start all available arrays." +msgstr "" +"Nếu bạn biết chÃnh xác những mảng nà o cần thiết để kÃch hoạt hệ thống táºp " +"tin gốc, và bạn muốn hoãn việc khởi chạy các mảng khác tá»›i má»™t Ä‘iểm sau " +"trong dãy khởi Ä‘á»™ng, hãy nháºp và o đây các mảng cần khởi chạy. Hoặc nháºp « " +"all » (tất cả) để khởi chạy Ä‘Æ¡n giản tất cả các mảng sẵn sà ng. " + +#. Type: text +#. Description +#: ../mdadm.templates:3001 +msgid "" +"If you do not need or want to start any arrays for the root file system, " +"leave the answer blank (or enter 'none'). This may be the case if you are " +"using kernel autostart or do not need any arrays to boot." +msgstr "" +"Nếu bạn không cần hoặc muốn khởi chạy mảng nà o cho hệ thống táºp tin gốc, hãy " +"bá» trống câu trả lá»i nà y (hoặc nháºp « none » [không có]). TrÆ°á»ng hợp nà y có " +"thể xảy ra nếu bạn sá» dụng khả năng tá»± Ä‘á»™ng khởi Ä‘á»™ng hạt nhân (kernel " +"autostart), hoặc không cần mảng nà o để khởi Ä‘á»™ng máy tÃnh." + +#. Type: text +#. Description +#: ../mdadm.templates:4001 +msgid "An error occurred: device node does not exist" +msgstr "Gặp lá»—i: nút thiết bị không tồn tại" + +#. Type: text +#. Description +#: ../mdadm.templates:5001 +msgid "An error occurred: not a block device" +msgstr "Gặp lá»—i: không phải là má»™t thiết bị khối" + +#. Type: text +#. Description +#: ../mdadm.templates:6001 +msgid "An error occurred: not an MD array" +msgstr "Gặp lá»—i: không phải là má»™t mảng MD" + +#. Type: text +#. Description +#: ../mdadm.templates:7001 +msgid "An error occurred: array not listed in mdadm.conf file" +msgstr "Gặp lá»—i: mảng không được liệt kê trong táºp tin cấu hình « mdadm.conf »" + +#. Type: boolean +#. Description +#: ../mdadm.templates:8001 +msgid "Start arrays not listed in mdadm.conf?" +msgstr "" +"Khởi chạy các mảng không được liệt kê trong táºp tin cấu hình « mdadm.conf » " +"không?" + +#. Type: boolean +#. Description +#: ../mdadm.templates:8001 +msgid "" +"The specified array (${array}) is not listed in the configuration file " +"(${config}). Therefore, it cannot be started during boot, unless you correct " +"the configuration file and recreate the initial ramdisk." +msgstr "" +"Mảng bạn đã xác định (${array}) không được liệt kê trong táºp tin cấu hình " +"${config}. Vì váºy nó không thể được khởi chạy trong khi khởi Ä‘á»™ng, nếu bạn " +"không sá»a táºp tin cấu hình và tạo lại Ä‘Ä©a RAM đầu tiên." + +#. Type: boolean +#. Description +#: ../mdadm.templates:8001 +msgid "" +"This warning is only relevant if you need arrays to be started from the " +"initial ramdisk to be able to boot. If you use kernel autostarting, or do " +"not need any arrays to be started as early as the initial ramdisk is loaded, " +"you can simply continue. Alternatively, choose not to continue and enter " +"'none' when prompted which arrays to start from the initial ramdisk." +msgstr "" +"Cảnh báo nà y chỉ là thÃch hợp nếu bạn cần thiết mảng được khởi chạy từ Ä‘Ä©a " +"RAM đầu tiên, để có thể khởi Ä‘á»™ng được. Nếu bạn sá» dụng khả năng tá»± Ä‘á»™ng " +"khởi chạy hạt nhân (kernel autostart), hoặc không cần mảng nà o được khởi " +"chạy má»™t khi nạp Ä‘Ä©a RAM đầu tiên, bạn Ä‘Æ¡n giản có thể tiếp tục lại. Hoặc " +"chá»n không tiếp tục, và nháºp « none » (không có) khi được nhắc nháºp những " +"mảng nà o cần khởi chạy từ Ä‘Ä©a RAM đầu tiên." + +#. Type: boolean +#. Description +#: ../mdadm.templates:10001 +msgid "Should mdadm run monthly redundancy checks of the MD arrays?" +msgstr "" +"mdadm có nên chạy việc kiểm tra thừa hà ng tháng trên những mảng MD không?" + +#. Type: boolean +#. Description +#: ../mdadm.templates:10001 +msgid "" +"If the kernel supports it (versions greater than 2.6.14), mdadm can " +"periodically check the redundancy of MD arrays (RAIDs). This may be a " +"resource-intensive process, depending on the local setup, but it could help " +"prevent rare cases of data loss. Note that this is a read-only check unless " +"errors are found; if errors are found, mdadm will try to correct them, which " +"may result in write access to the media." +msgstr "" +"Nếu hạt nhân có phải há»— trợ (các phiên bản sau 2.6.14) thì mdadm có thể kiểm " +"tra theo chu kỳ tình thừa của các mảng MD (RAID). Tiến trình nà y có thể " +"chiếm nhiá»u tà i nguyên hệ thống, phụ thuá»™c và o thiết láºp cục bá»™, nhÆ°ng nó có " +"thể giúp ngăn cản trÆ°á»ng hợp mất dữ liệu (Ãt có). Ghi chú rằng việc kiểm tra " +"nà y là chỉ Ä‘á»c: gặp lá»—i thì mdadm sẽ thá» sá»a chữa, mà có thể gây ra truy cáºp " +"ghi và o váºt chứa." + +#. Type: boolean +#. Description +#: ../mdadm.templates:10001 +msgid "" +"The default, if turned on, is to check on the first Sunday of every month at " +"01:06." +msgstr "" +"Giá trị mặc định, nếu được báºt, là chạy những việc kiểm tra và o ngà y hôm Chủ " +"Nháºt thứ nhất của má»—i tháng, và o lúc 01:06 giá» (giá» ti)." + +#. Type: boolean +#. Description +#: ../mdadm.templates:11001 +msgid "Do you want to start the MD monitoring daemon?" +msgstr "Bạn có muốn khởi chạy trình ná»n theo dõi MD không?" + +#. Type: boolean +#. Description +#: ../mdadm.templates:11001 +msgid "" +"The MD (RAID) monitor daemon sends email notifications in response to " +"important MD events (such as a disk failure)." +msgstr "" +"Trình ná»n theo dõi MD (RAID) gá»i thÆ° thông báo hưởng ứng dữ kiện MD quan " +"trá»ng (v.d. Ä‘Ä©a bị há»ng)." + +#. Type: boolean +#. Description +#: ../mdadm.templates:11001 +msgid "Enabling this option is recommended." +msgstr "Khuyên bạn hiệu lá»±c tùy chá»n nà y." + +#. Type: string +#. Description +#: ../mdadm.templates:12001 +msgid "Recipient for email notifications:" +msgstr "NgÆ°á»i nháºn thÆ° thông báo :" + +#. Type: string +#. Description +#: ../mdadm.templates:12001 +msgid "" +"Please enter the email address of the user who should get the email " +"notifications for important MD events." +msgstr "" +"Hãy nháºp địa chỉ thÆ° của ngÆ°á»i dùng nên nháºn thÆ° thông báo vá» dữ kiện MD " +"quan trá»ng." diff --git a/debian/presubj b/debian/presubj new file mode 100644 index 00000000..103208df --- /dev/null +++ b/debian/presubj @@ -0,0 +1,32 @@ +Reporting bugs against mdadm +============================ +Before reporting bugs against mdadm, please read the README documents, as well +as the FAQ in /usr/share/doc/mdadm . Most issues that are reported against the +mdadm package are adequately answered therein. + +In particular, please do not file bugs about mdadm assembling arrays too +early, e.g. when the driver is not yet ready and device nodes do not exist. +Check FAQ item 27 about use of the rootdelay parameter instead. + +Also, please check out http://bugs.debian.org/mdadm and make sure that the +issue you are facing has not already been reported. + +If you are not sure that the answer you are seeking is in those files, or you +are not sure that you are facing a genuine bug, please approach +debian-user@lists.debian.org or linux-raid@vger.kernel.org with your +question(s). + +Gathering information relevant to mdadm as root +=============================================== +If you are not reporting bugs as root (which you should not), you will be +prompted to give permission to run a script to collect relevant information +from your system as the root user. Only the root user has access to some +information that might be relevant to the bug report you are about to file. + +** Please give permission to run the script as root when asked momentarily. + +If you would rather obtain the same information manually, you can run + /usr/share/bug/mdadm/script 3>&1 +as root and include or attach the output. + + -- martin f. krafft <madduck@debian.org> Mon, 20 Jul 2009 15:02:48 +0200 diff --git a/debian/rules b/debian/rules new file mode 100755 index 00000000..0adcc1bd --- /dev/null +++ b/debian/rules @@ -0,0 +1,102 @@ +#!/usr/bin/make -f +# Copyright © 2001-2005 Mario Jou/3en <joussen@debian.org> +# Copyright © 2005-2008 Martin F. Krafft <madduck@debian.org> +# Distributable under the terms of the GNU GPL version 2. +# + +#export DH_VERBOSE=1 + +export CROSS_COMPILE=$(shell dpkg-architecture -qDEB_HOST_GNU_TYPE)- +LDFLAGS = $(shell dpkg-buildflags --get LDFLAGS) +CXFLAGS = $(shell dpkg-buildflags --get CFLAGS) \ + $(shell dpkg-buildflags --get CPPFLAGS) +BUILDFLAGS = CXFLAGS="$(CXFLAGS)" LDFLAGS="$(LDFLAGS)" DEBIAN=yes +DESTDIR = $(CURDIR)/debian/mdadm +DESTDIR_UDEB = $(DESTDIR)-udeb + +build-arch: build-stamp + +build-stamp: + dh_testdir + $(MAKE) all $(BUILDFLAGS) CONFFILE=/etc/mdadm/mdadm.conf CONFFILE2=/etc/mdadm.conf + touch $@ + +# udeb rules should go, the only diff is the conffile location, +# and d-i specifies path explicitly when needed + +udeb/dir-stamp: + rm -rf udeb + mkdir udeb + ln *.[ch] Makefile udeb/ + touch $@ + +build-arch: udeb/build-stamp + +udeb/build-stamp: udeb/dir-stamp + dh_testdir + $(MAKE) -C udeb mdadm mdmon $(BUILDFLAGS) CONFFILE=/tmp/mdadm.conf + touch $@ + +clean: + dh_testdir + rm -f build-stamp + $(MAKE) clean + rm -rf udeb + dh_clean + +install-arch: build-arch + dh_testdir + dh_prep + dh_installdirs + + $(MAKE) install install-systemd DESTDIR=$(DESTDIR) + + mkdir -p $(DESTDIR)/etc/mdadm + install -Dm0755 debian/initramfs/hook \ + $(DESTDIR)/usr/share/initramfs-tools/hooks/mdadm + install -Dm0755 debian/initramfs/script.local-top \ + $(DESTDIR)/usr/share/initramfs-tools/scripts/local-top/mdadm + install -Dm0644 debian/mdadm.modules \ + $(DESTDIR)/etc/modprobe.d/mdadm.conf + + install -Dm0755 debian/mkconf $(DESTDIR)/usr/share/mdadm/mkconf + install -Dm0755 debian/checkarray $(DESTDIR)/usr/share/mdadm/checkarray + install -Dm0755 debian/bugscript $(DESTDIR)/usr/share/bug/mdadm/script + install -Dm0644 debian/presubj $(DESTDIR)/usr/share/bug/mdadm/presubj + + install -Dm0755 udeb/mdadm $(DESTDIR_UDEB)/sbin/mdadm + install -Dm0755 udeb/mdmon $(DESTDIR_UDEB)/sbin/mdmon + install -Dm0644 udev-md-raid-arrays.rules $(DESTDIR_UDEB)/lib/udev/rules.d/63-md-raid-arrays.rules + +binary-arch: install-arch + dh_testdir + dh_testroot + dh_installdebconf + dh_installdocs + dh_installexamples -pmdadm mdadm.conf-example misc/syslog-events + dh_installinit --init-script=mdadm-raid --no-restart-on-upgrade -- start 25 S . start 60 0 6 . + dh_installinit --init-script=mdadm-waitidle --no-start -- stop 98 0 6 . + dh_link -pmdadm /dev/null /lib/systemd/system/mdadm-waitidle.service + dh_installinit -- defaults 25 + dh_link -pmdadm /dev/null /lib/systemd/system/mdadm.service + dh_installman + dh_installcron + dh_installchangelogs ChangeLog + dh_installlogcheck + dh_link + dh_strip + dh_compress + dh_fixperms + dh_installdeb + dh_shlibdeps + dh_gencontrol + dh_md5sums + dh_builddeb + +build: build-arch +install: install-arch +binary: binary-arch +build-indep: +install-indep: +binary-indep: +.PHONY: clean build build-indep build-arch binary binary-indep binary-arch install install-indep install-arch diff --git a/debian/source/format b/debian/source/format new file mode 100644 index 00000000..163aaf8d --- /dev/null +++ b/debian/source/format @@ -0,0 +1 @@ +3.0 (quilt) diff --git a/debian/watch b/debian/watch new file mode 100644 index 00000000..1b1172a0 --- /dev/null +++ b/debian/watch @@ -0,0 +1,2 @@ +version=3 +http://www.kernel.org/pub/linux/utils/raid/mdadm/mdadm-([.[:digit:]]+).tar.gz diff --git a/dlink.c b/dlink.c new file mode 100644 index 00000000..3efa94b7 --- /dev/null +++ b/dlink.c @@ -0,0 +1,74 @@ + +/* doubly linked lists */ +/* This is free software. No strings attached. No copyright claimed */ + +#include <unistd.h> +#include <stdlib.h> +#include <string.h> +#ifdef __dietlibc__ +char *strncpy(char *dest, const char *src, size_t n) __THROW; +#endif +void *xcalloc(size_t num, size_t size); +#include "dlink.h" + +void *dl_head() +{ + void *h; + h = dl_alloc(0); + dl_next(h) = h; + dl_prev(h) = h; + return h; +} + +void dl_free(void *v) +{ + struct __dl_head *vv = v; + free(vv-1); +} + +void dl_init(void *v) +{ + dl_next(v) = v; + dl_prev(v) = v; +} + +void dl_insert(void *head, void *val) +{ + dl_next(val) = dl_next(head); + dl_prev(val) = head; + dl_next(dl_prev(val)) = val; + dl_prev(dl_next(val)) = val; +} + +void dl_add(void *head, void *val) +{ + dl_prev(val) = dl_prev(head); + dl_next(val) = head; + dl_next(dl_prev(val)) = val; + dl_prev(dl_next(val)) = val; +} + +void dl_del(void *val) +{ + if (dl_prev(val) == 0 || dl_next(val) == 0) + return; + dl_prev(dl_next(val)) = dl_prev(val); + dl_next(dl_prev(val)) = dl_next(val); + dl_prev(val) = dl_next(val) = 0; +} + +char *dl_strndup(char *s, int l) +{ + char *n; + if (s == NULL) + return NULL; + n = dl_newv(char, l+1); + strncpy(n, s, l); + n[l] = 0; + return n; +} + +char *dl_strdup(char *s) +{ + return dl_strndup(s, (int)strlen(s)); +} diff --git a/dlink.h b/dlink.h new file mode 100644 index 00000000..ab2a9459 --- /dev/null +++ b/dlink.h @@ -0,0 +1,25 @@ + +/* doubley linked lists */ +/* This is free software. No strings attached. No copyright claimed */ + +struct __dl_head +{ + void * dh_prev; + void * dh_next; +}; + +#define dl_alloc(size) ((void*)(((char*)xcalloc(1,(size)+sizeof(struct __dl_head)))+sizeof(struct __dl_head))) +#define dl_new(t) ((t*)dl_alloc(sizeof(t))) +#define dl_newv(t,n) ((t*)dl_alloc(sizeof(t)*n)) + +#define dl_next(p) *(&(((struct __dl_head*)(p))[-1].dh_next)) +#define dl_prev(p) *(&(((struct __dl_head*)(p))[-1].dh_prev)) + +void *dl_head(void); +char *dl_strdup(char *); +char *dl_strndup(char *, int); +void dl_insert(void*, void*); +void dl_add(void*, void*); +void dl_del(void*); +void dl_free(void*); +void dl_init(void*); diff --git a/external-reshape-design.txt b/external-reshape-design.txt new file mode 100644 index 00000000..10c57ccb --- /dev/null +++ b/external-reshape-design.txt @@ -0,0 +1,280 @@ +External Reshape + +1 Problem statement + +External (third-party metadata) reshape differs from native-metadata +reshape in three key ways: + +1.1 Format specific constraints + +In the native case reshape is limited by what is implemented in the +generic reshape routine (Grow_reshape()) and what is supported by the +kernel. There are exceptional cases where Grow_reshape() may block +operations when it knows that the kernel implementation is broken, but +otherwise the kernel is relied upon to be the final arbiter of what +reshape operations are supported. + +In the external case the kernel, and the generic checks in +Grow_reshape(), become the super-set of what reshapes are possible. The +metadata format may not support, or have yet to implement a given +reshape type. The implication for Grow_reshape() is that it must query +the metadata handler and effect changes in the metadata before the new +geometry is posted to the kernel. The ->reshape_super method allows +Grow_reshape() to validate the requested operation and post the metadata +update. + +1.2 Scope of reshape + +Native metadata reshape is always performed at the array scope (no +metadata relationship with sibling arrays on the same disks). External +reshape, depending on the format, may not allow the number of member +disks to be changed in a subarray unless the change is simultaneously +applied to all subarrays in the container. For example the imsm format +requires all member disks to be a member of all subarrays, so a 4-disk +raid5 in a container that also houses a 4-disk raid10 array could not be +reshaped to 5 disks as the imsm format does not support a 5-disk raid10 +representation. This requires the ->reshape_super method to check the +contents of the array and ask the user to run the reshape at container +scope (if all subarrays are agreeable to the change), or report an +error in the case where one subarray cannot support the change. + +1.3 Monitoring / checkpointing + +Reshape, unlike rebuild/resync, requires strict checkpointing to survive +interrupted reshape operations. For example when expanding a raid5 +array the first few stripes of the array will be overwritten in a +destructive manner. When restarting the reshape process we need to know +the exact location of the last successfully written stripe, and we need +to restore the data in any partially overwritten stripe. Native +metadata stores this backup data in the unused portion of spares that +are being promoted to array members, or in an external backup file +(located on a non-involved block device). + +The kernel is in charge of recording checkpoints of reshape progress, +but mdadm is delegated the task of managing the backup space which +involves: +1/ Identifying what data will be overwritten in the next unit of reshape + operation +2/ Suspending access to that region so that a snapshot of the data can + be transferred to the backup space. +3/ Allowing the kernel to reshape the saved region and setting the + boundary for the next backup. + +In the external reshape case we want to preserve this mdadm +'reshape-manager' arrangement, but have a third actor, mdmon, to +consider. It is tempting to give the role of managing reshape to mdmon, +but that is counter to its role as a monitor, and conflicts with the +existing capabilities and role of mdadm to manage the progress of +reshape. For clarity the external reshape implementation maintains the +role of mdmon as a (mostly) passive recorder of raid events, and mdadm +treats it as it would the kernel in the native reshape case (modulo +needing to send explicit metadata update messages and checking that +mdmon took the expected action). + +External reshape can use the generic md backup file as a fallback, but in the +optimal/firmware-compatible case the reshape-manager will use the metadata +specific areas for managing reshape. The implementation also needs to spawn a +reshape-manager per subarray when the reshape is being carried out at the +container level. For these two reasons the ->manage_reshape() method is +introduced. This method in addition to base tasks mentioned above: +1/ Processed each subarray one at a time in series - where appropriate. +2/ Uses either generic routines in Grow.c for md-style backup file + support, or uses the metadata-format specific location for storing + recovery data. +This aims to avoid a "midlayer mistake"[1] and lets the metadata handler +optionally take advantage of generic infrastructure in Grow.c + +2 Details for specific reshape requests + +There are quite a few moving pieces spread out across md, mdadm, and mdmon for +the support of external reshape, and there are several different types of +reshape that need to be comprehended by the implementation. A rundown of +these details follows. + +2.0 General provisions: + +Obtain an exclusive open on the container to make sure we are not +running concurrently with a Create() event. + +2.1 Freezing sync_action + + Before making any attempt at a reshape we 'freeze' every array in + the container to ensure no spare assignment or recovery happens. + This involves writing 'frozen' to sync_action and changing the '/' + after 'external:' in metadata_version to a '-'. mdmon knows that + this means not to perform any management. + + Before doing this we check that all sync_actions are 'idle', which + is racy but still useful. + Afterwards we check that all member arrays have no spares + or partial spares (recovery_start != 'none') which would indicate a + race. If they do, we unfreeze again. + + Once this completes we know all the arrays are stable. They may + still have failed devices as devices can fail at any time. However + we treat those like failures that happen during the reshape. + +2.2 Reshape size + + 1/ mdadm::Grow_reshape(): checks if mdmon is running and optionally + initializes st->update_tail + 2/ mdadm::Grow_reshape() calls ->reshape_super() to check that the size change + is allowed (being performed at subarray scope / enough room) prepares a + metadata update + 3/ mdadm::Grow_reshape(): flushes the metadata update (via + flush_metadata_update(), or ->sync_metadata()) + 4/ mdadm::Grow_reshape(): post the new size to the kernel + + +2.3 Reshape level (simple-takeover) + +"simple-takeover" implies the level change can be satisfied without touching +sync_action + + 1/ mdadm::Grow_reshape(): checks if mdmon is running and optionally + initializes st->update_tail + 2/ mdadm::Grow_reshape() calls ->reshape_super() to check that the level change + is allowed (being performed at subarray scope) prepares a + metadata update + 2a/ raid10 --> raid0: degrade all mirror legs prior to calling + ->reshape_super + 3/ mdadm::Grow_reshape(): flushes the metadata update (via + flush_metadata_update(), or ->sync_metadata()) + 4/ mdadm::Grow_reshape(): post the new level to the kernel + +2.4 Reshape chunk, layout + +2.5 Reshape raid disks (grow) + + 1/ mdadm::Grow_reshape(): unconditionally initializes st->update_tail + because only redundant raid levels can modify the number of raid disks + 2/ mdadm::Grow_reshape(): calls ->reshape_super() to check that the level + change is allowed (being performed at proper scope / permissible + geometry / proper spares available in the container), chooses + the spares to use, and prepares a metadata update. + 3/ mdadm::Grow_reshape(): Converts each subarray in the container to the + raid level that can perform the reshape and starts mdmon. + 4/ mdadm::Grow_reshape(): Pushes the update to mdmon. + 5/ mdadm::Grow_reshape(): uses container_content to find details of + the spares and passes them to the kernel. + 6/ mdadm::Grow_reshape(): gives raid_disks update to the kernel, + sets sync_max, sync_min, suspend_lo, suspend_hi all to zero, + and starts the reshape by writing 'reshape' to sync_action. + 7/ mdmon::monitor notices the sync_action change and tells + managemon to check for new devices. managemon notices the new + devices, opens relevant sysfs file, and passes them all to + monitor. + 8/ mdadm::Grow_reshape() calls ->manage_reshape to oversee the + rest of the reshape. + + 9/ mdadm::<format>->manage_reshape(): saves data that will be overwritten by + the kernel to either the backup file or the metadata specific location, + advances sync_max, waits for reshape, ping mdmon, repeat. + Meanwhile mdmon::read_and_act(): records checkpoints. + Specifically. + + 9a/ if the 'next' stripe to be reshaped will over-write + itself during reshape then: + 9a.1/ increase suspend_hi to cover a suitable number of + stripes. + 9a.2/ backup those stripes safely. + 9a.3/ advance sync_max to allow those stripes to be backed up + 9a.4/ when sync_completed indicates that those stripes have + been reshaped, manage_reshape must ping_manager + 9a.5/ when mdmon notices that sync_completed has been updated, + it records the new checkpoint in the metadata + 9a.6/ after the ping_manager, manage_reshape will increase + suspend_lo to allow access to those stripes again + + 9b/ if the 'next' stripe to be reshaped will over-write unused + space during reshape then we apply same process as above, + except that there is no need to back anything up. + Note that we *do* need to keep suspend_hi progressing as + it is not safe to write to the area-under-reshape. For + kernel-managed-metadata this protection is provided by + ->reshape_safe, but that does not protect us in the case + of user-space-managed-metadata. + + 10/ mdadm::<format>->manage_reshape(): Once reshape completes changes the raid + level back to the nominal raid level (if necessary) + + FIXME: native metadata does not have the capability to record the original + raid level in reshape-restart case because the kernel always records current + raid level to the metadata, whereas external metadata can masquerade at an + alternate level based on the reshape state. + +2.6 Reshape raid disks (shrink) + +3 Interaction with metadata handle. + + The following calls are made into the metadata handler to assist + with initiating and monitoring a 'reshape'. + + 1/ ->reshape_super is called quite early (after only minimial + checks) to make sure that the metadata can record the new shape + and any necessary transitions. It may be passed a 'container' + or an individual array within a container, and it should notice + the difference and act accordingly. + When a reshape is requested against a container it is expected + that it should be applied to every array in the container, + however it is up to the metadata handler to determine final + policy. + + If the reshape is supportable, the internal copy of the metadata + should be updated, and a metadata update suitable for sending + to mdmon should be queued. + + If the reshape will involve converting spares into array members, + this must be recorded in the metadata too. + + 2/ ->container_content will be called to find out the new state + of all the array, or all arrays in the container. Any newly + added devices (with state==0 and raid_disk >= 0) will be added + to the array as spares with the relevant slot number. + + It is likely that the info returned by ->container_content will + have ->reshape_active set, ->reshape_progress set to e.g. 0, and + new_* set appropriately. mdadm will use this information to + cause the correct reshape to start at an appropriate time. + + 3/ ->set_array_state will be called by mdmon when reshape has + started and again periodically as it progresses. This should + record the ->last_checkpoint as the point where reshape has + progressed to. When the reshape finished this will be called + again and it should notice that ->curr_action is no longer + 'reshape' and so should record that the reshape has finished + providing 'last_checkpoint' has progressed suitably. + + 4/ ->manage_reshape will be called once the reshape has been set + up in the kernel but before sync_max has been moved from 0, so + no actual reshape will have happened. + + ->manage_reshape should call progress_reshape() to allow the + reshape to progress, and should back-up any data as indicated + by the return value. See the documentation of that function + for more details. + ->manage_reshape will be called multiple times when a + container is being reshaped, once for each member array in + the container. + + + The progress of the metadata is as follows: + 1/ mdadm sends a metadata update to mdmon which marks the array + as undergoing a reshape. This is set up by + ->reshape_super and applied by ->process_update + For container-wide reshape, this happens once for the whole + container. + 2/ mdmon notices progress via the sysfs files and calls + ->set_array_state to update the state periodically + For container-wide reshape, this happens repeatedly for + one array, then repeatedly for the next, etc. + 3/ mdmon notices when reshape has finished and call + ->set_array_state to record the the reshape is complete. + For container-wide reshape, this happens once for each + member array. + + + +... + +[1]: Linux kernel design patterns - part 3, Neil Brown http://lwn.net/Articles/336262/ diff --git a/inventory b/inventory new file mode 100755 index 00000000..a9fc3c01 --- /dev/null +++ b/inventory @@ -0,0 +1,252 @@ + +.gitignore +ANNOUNCE-3.0 +ANNOUNCE-3.0.1 +ANNOUNCE-3.0.2 +ANNOUNCE-3.0.3 +ANNOUNCE-3.1 +ANNOUNCE-3.1.1 +ANNOUNCE-3.1.2 +ANNOUNCE-3.1.3 +ANNOUNCE-3.1.4 +ANNOUNCE-3.1.5 +ANNOUNCE-3.2 +ANNOUNCE-3.2.1 +ANNOUNCE-3.2.2 +ANNOUNCE-3.2.3 +ANNOUNCE-3.2.4 +ANNOUNCE-3.2.5 +ANNOUNCE-3.2.6 +ANNOUNCE-3.3 +ANNOUNCE-3.3.1 +ANNOUNCE-3.3.2 +ANNOUNCE-3.3.3 +ANNOUNCE-3.3.4 +Assemble.c +Build.c +COPYING +ChangeLog +Create.c +Detail.c +Dump.c +Examine.c +Grow.c +INSTALL +Incremental.c +Kill.c +Makefile +Manage.c +Monitor.c +Query.c +README.initramfs +ReadMe.c +TODO +bitmap.c +bitmap.h +config.c +crc32.c +crc32.h +dlink.c +dlink.h +external-reshape-design.txt +inventory +kernel-patch-2.6.18 +kernel-patch-2.6.18.6 +kernel-patch-2.6.19 +kernel-patch-2.6.25 +kernel-patch-2.6.27 +lib.c +makedist +managemon.c +mapfile.c +maps.c +md.4 +md5.h +md_p.h +md_u.h +mdadm.8.in +mdadm.c +mdadm.conf-example +mdadm.conf.5 +mdadm.h +mdadm.spec +mdassemble.8 +mdassemble.c +mdmon-design.txt +mdmon.8 +mdmon.c +mdmon.h +mdopen.c +mdstat.c +misc/ +misc/mdcheck +misc/syslog-events +mkinitramfs +monitor.c +msg.c +msg.h +part.h +platform-intel.c +platform-intel.h +policy.c +probe_roms.c +probe_roms.h +pwgr.c +raid5extend.c +raid6check.8 +raid6check.c +restripe.c +sg_io.c +sha1.c +sha1.h +super-ddf.c +super-gpt.c +super-intel.c +super-mbr.c +super0.c +super1.c +swap_super.c +sysfs.c +systemd/ +systemd/SUSE-mdadm_env.sh +systemd/mdadm-grow-continue@.service +systemd/mdadm-last-resort@.service +systemd/mdadm-last-resort@.timer +systemd/mdadm.shutdown +systemd/mdmon@.service +systemd/mdmonitor.service +test +tests/ +tests/00linear +tests/00multipath +tests/00names +tests/00raid0 +tests/00raid1 +tests/00raid10 +tests/00raid4 +tests/00raid5 +tests/00raid6 +tests/01r1fail +tests/01r5fail +tests/01r5integ +tests/01raid6integ +tests/01replace +tests/02lineargrow +tests/02r1add +tests/02r1grow +tests/02r5grow +tests/02r6grow +tests/03assem-incr +tests/03r0assem +tests/03r5assem +tests/03r5assem-failed +tests/03r5assemV1 +tests/04r0update +tests/04r1update +tests/04r5swap +tests/04update-metadata +tests/04update-uuid +tests/05r1-add-internalbitmap +tests/05r1-add-internalbitmap-v1a +tests/05r1-add-internalbitmap-v1b +tests/05r1-add-internalbitmap-v1c +tests/05r1-bitmapfile +tests/05r1-grow-external +tests/05r1-grow-internal +tests/05r1-grow-internal-1 +tests/05r1-internalbitmap +tests/05r1-internalbitmap-v1a +tests/05r1-internalbitmap-v1b +tests/05r1-internalbitmap-v1c +tests/05r1-n3-bitmapfile +tests/05r1-re-add +tests/05r1-re-add-nosuper +tests/05r1-remove-internalbitmap +tests/05r1-remove-internalbitmap-v1a +tests/05r1-remove-internalbitmap-v1b +tests/05r1-remove-internalbitmap-v1c +tests/05r5-bitmapfile +tests/05r5-internalbitmap +tests/05r6-bitmapfile +tests/05r6tor0 +tests/06name +tests/06sysfs +tests/06wrmostly +tests/07autoassemble +tests/07autodetect +tests/07changelevelintr +tests/07changelevels +tests/07layouts +tests/07reshape5intr +tests/07revert-grow +tests/07revert-inplace +tests/07revert-shrink +tests/07testreshape5 +tests/09imsm-assemble +tests/09imsm-create-fail-rebuild +tests/09imsm-overlap +tests/10ddf-assemble-missing +tests/10ddf-create +tests/10ddf-create-fail-rebuild +tests/10ddf-fail-create-race +tests/10ddf-fail-readd +tests/10ddf-fail-readd-readonly +tests/10ddf-fail-spare +tests/10ddf-fail-stop-readd +tests/10ddf-fail-twice +tests/10ddf-fail-two-spares +tests/10ddf-geometry +tests/10ddf-incremental-wrong-order +tests/10ddf-sudden-degraded +tests/11spare-migration +tests/12imsm-r0_2d-grow-r0_3d +tests/12imsm-r0_2d-grow-r0_4d +tests/12imsm-r0_2d-grow-r0_5d +tests/12imsm-r0_3d-grow-r0_4d +tests/12imsm-r5_3d-grow-r5_4d +tests/12imsm-r5_3d-grow-r5_5d +tests/13imsm-r0_r0_2d-grow-r0_r0_4d +tests/13imsm-r0_r0_2d-grow-r0_r0_5d +tests/13imsm-r0_r0_3d-grow-r0_r0_4d +tests/13imsm-r0_r5_3d-grow-r0_r5_4d +tests/13imsm-r0_r5_3d-grow-r0_r5_5d +tests/13imsm-r5_r0_3d-grow-r5_r0_4d +tests/13imsm-r5_r0_3d-grow-r5_r0_5d +tests/14imsm-r0_3d-r5_3d-migrate-r5_4d-r5_4d +tests/14imsm-r0_3d_no_spares-migrate-r5_3d +tests/14imsm-r0_r0_2d-takeover-r10_4d +tests/14imsm-r10_4d-grow-r10_5d +tests/14imsm-r10_r5_4d-takeover-r0_2d +tests/14imsm-r1_2d-grow-r1_3d +tests/14imsm-r1_2d-takeover-r0_2d +tests/14imsm-r5_3d-grow-r5_5d-no-spares +tests/14imsm-r5_3d-migrate-r4_3d +tests/15imsm-r0_3d_64k-migrate-r0_3d_256k +tests/15imsm-r5_3d_4k-migrate-r5_3d_256k +tests/15imsm-r5_3d_64k-migrate-r5_3d_256k +tests/15imsm-r5_6d_4k-migrate-r5_6d_256k +tests/15imsm-r5_r0_3d_64k-migrate-r5_r0_3d_256k +tests/16imsm-r0_3d-migrate-r5_4d +tests/16imsm-r0_5d-migrate-r5_6d +tests/16imsm-r5_3d-migrate-r0_3d +tests/16imsm-r5_5d-migrate-r0_5d +tests/18imsm-1d-takeover-r0_1d +tests/18imsm-1d-takeover-r1_2d +tests/18imsm-r0_2d-takeover-r10_4d +tests/18imsm-r10_4d-takeover-r0_2d +tests/18imsm-r1_2d-takeover-r0_1d +tests/19raid6auto-repair +tests/19raid6check +tests/19raid6repair +tests/19repair-does-not-destroy +tests/ToTest +tests/check +tests/env-ddf-template +tests/env-imsm-template +tests/imsm-grow-template +tests/testdev +tests/utils +udev-md-raid-arrays.rules +udev-md-raid-assembly.rules +util.c +xmalloc.c diff --git a/kernel-patch-2.6.18 b/kernel-patch-2.6.18 new file mode 100644 index 00000000..87496ea2 --- /dev/null +++ b/kernel-patch-2.6.18 @@ -0,0 +1,35 @@ + +### Diffstat output + ./drivers/md/md.c | 6 +++++- + 1 file changed, 5 insertions(+), 1 deletion(-) + +diff .prev/drivers/md/md.c ./drivers/md/md.c +--- .prev/drivers/md/md.c 2006-10-23 10:26:37.000000000 +1000 ++++ ./drivers/md/md.c 2006-12-21 16:28:29.000000000 +1100 +@@ -1783,7 +1783,8 @@ state_store(mdk_rdev_t *rdev, const char + else { + mddev_t *mddev = rdev->mddev; + kick_rdev_from_array(rdev); +- md_update_sb(mddev); ++ if (mddev->pers) ++ md_update_sb(mddev); + md_new_event(mddev); + err = 0; + } +@@ -1994,6 +1995,8 @@ static mdk_rdev_t *md_import_device(dev_ + kobject_init(&rdev->kobj); + + rdev->desc_nr = -1; ++ rdev->saved_raid_disk = -1; ++ rdev->raid_disk = -1; + rdev->flags = 0; + rdev->data_offset = 0; + rdev->sb_events = 0; +@@ -3991,6 +3994,7 @@ static int set_array_info(mddev_t * mdde + mddev->major_version = info->major_version; + mddev->minor_version = info->minor_version; + mddev->patch_version = info->patch_version; ++ mddev->persistent = ! info->not_persistent; + return 0; + } + mddev->major_version = MD_MAJOR_VERSION; diff --git a/kernel-patch-2.6.18.6 b/kernel-patch-2.6.18.6 new file mode 100644 index 00000000..e702e14a --- /dev/null +++ b/kernel-patch-2.6.18.6 @@ -0,0 +1,35 @@ +Signed-off-by: Neil Brown <neilb@suse.de> + +### Diffstat output + ./drivers/md/md.c | 5 ++++- + 1 file changed, 4 insertions(+), 1 deletion(-) + +diff .prev/drivers/md/md.c ./drivers/md/md.c +--- .prev/drivers/md/md.c 2006-12-21 17:08:23.000000000 +1100 ++++ ./drivers/md/md.c 2006-12-21 17:08:26.000000000 +1100 +@@ -1783,7 +1783,8 @@ state_store(mdk_rdev_t *rdev, const char + else { + mddev_t *mddev = rdev->mddev; + kick_rdev_from_array(rdev); +- md_update_sb(mddev); ++ if (mddev->pers) ++ md_update_sb(mddev); + md_new_event(mddev); + err = 0; + } +@@ -1995,6 +1996,7 @@ static mdk_rdev_t *md_import_device(dev_ + + rdev->desc_nr = -1; + rdev->saved_raid_disk = -1; ++ rdev->raid_disk = -1; + rdev->flags = 0; + rdev->data_offset = 0; + rdev->sb_events = 0; +@@ -3993,6 +3995,7 @@ static int set_array_info(mddev_t * mdde + mddev->major_version = info->major_version; + mddev->minor_version = info->minor_version; + mddev->patch_version = info->patch_version; ++ mddev->persistent = ! info->not_persistent; + return 0; + } + mddev->major_version = MD_MAJOR_VERSION; diff --git a/kernel-patch-2.6.19 b/kernel-patch-2.6.19 new file mode 100644 index 00000000..22a67a39 --- /dev/null +++ b/kernel-patch-2.6.19 @@ -0,0 +1,34 @@ + +### Diffstat output + ./drivers/md/md.c | 5 ++++- + 1 file changed, 4 insertions(+), 1 deletion(-) + +diff .prev/drivers/md/md.c ./drivers/md/md.c +--- .prev/drivers/md/md.c 2006-12-21 15:55:01.000000000 +1100 ++++ ./drivers/md/md.c 2006-12-21 16:28:09.000000000 +1100 +@@ -1792,7 +1792,8 @@ state_store(mdk_rdev_t *rdev, const char + else { + mddev_t *mddev = rdev->mddev; + kick_rdev_from_array(rdev); +- md_update_sb(mddev, 1); ++ if (mddev->pers) ++ md_update_sb(mddev, 1); + md_new_event(mddev); + err = 0; + } +@@ -2004,6 +2005,7 @@ static mdk_rdev_t *md_import_device(dev_ + + rdev->desc_nr = -1; + rdev->saved_raid_disk = -1; ++ rdev->raid_disk = -1; + rdev->flags = 0; + rdev->data_offset = 0; + rdev->sb_events = 0; +@@ -3977,6 +3979,7 @@ static int set_array_info(mddev_t * mdde + mddev->major_version = info->major_version; + mddev->minor_version = info->minor_version; + mddev->patch_version = info->patch_version; ++ mddev->persistent = ! info->not_persistent; + return 0; + } + mddev->major_version = MD_MAJOR_VERSION; diff --git a/kernel-patch-2.6.25 b/kernel-patch-2.6.25 new file mode 100644 index 00000000..23290078 --- /dev/null +++ b/kernel-patch-2.6.25 @@ -0,0 +1,199 @@ +Status: ok + +Support adding a spare to a live md array with external metadata. + +i.e. extend the 'md/dev-XXX/slot' attribute so that you can +tell a device to fill an vacant slot in an and md array. + + +Signed-off-by: Neil Brown <neilb@suse.de> + +### Diffstat output + ./drivers/md/md.c | 44 ++++++++++++++++++++++++++++++++++++++++---- + ./drivers/md/multipath.c | 7 ++++++- + ./drivers/md/raid1.c | 7 ++++++- + ./drivers/md/raid10.c | 10 ++++++++-- + ./drivers/md/raid5.c | 10 ++++++++-- + 5 files changed, 68 insertions(+), 10 deletions(-) + +diff .prev/drivers/md/md.c ./drivers/md/md.c +--- .prev/drivers/md/md.c 2008-06-05 09:19:56.000000000 +1000 ++++ ./drivers/md/md.c 2008-06-10 10:41:21.000000000 +1000 +@@ -1932,7 +1932,7 @@ slot_store(mdk_rdev_t *rdev, const char + slot = -1; + else if (e==buf || (*e && *e!= '\n')) + return -EINVAL; +- if (rdev->mddev->pers) { ++ if (rdev->mddev->pers && slot == -1) { + /* Setting 'slot' on an active array requires also + * updating the 'rd%d' link, and communicating + * with the personality with ->hot_*_disk. +@@ -1940,8 +1940,6 @@ slot_store(mdk_rdev_t *rdev, const char + * failed/spare devices. This normally happens automatically, + * but not when the metadata is externally managed. + */ +- if (slot != -1) +- return -EBUSY; + if (rdev->raid_disk == -1) + return -EEXIST; + /* personality does all needed checks */ +@@ -1955,6 +1953,44 @@ slot_store(mdk_rdev_t *rdev, const char + sysfs_remove_link(&rdev->mddev->kobj, nm); + set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); + md_wakeup_thread(rdev->mddev->thread); ++ } else if (rdev->mddev->pers) { ++ mdk_rdev_t *rdev2; ++ struct list_head *tmp; ++ /* Activating a spare .. or possibly reactivating ++ * if we every get bitmaps working here. ++ */ ++ ++ if (rdev->raid_disk != -1) ++ return -EBUSY; ++ ++ if (rdev->mddev->pers->hot_add_disk == NULL) ++ return -EINVAL; ++ ++ rdev_for_each(rdev2, tmp, rdev->mddev) ++ if (rdev2->raid_disk == slot) ++ return -EEXIST; ++ ++ rdev->raid_disk = slot; ++ if (test_bit(In_sync, &rdev->flags)) ++ rdev->saved_raid_disk = slot; ++ else ++ rdev->saved_raid_disk = -1; ++ err = rdev->mddev->pers-> ++ hot_add_disk(rdev->mddev, rdev); ++ if (err != 1) { ++ rdev->raid_disk = -1; ++ if (err == 0) ++ return -EEXIST; ++ return err; ++ } ++ sprintf(nm, "rd%d", rdev->raid_disk); ++ if (sysfs_create_link(&rdev->mddev->kobj, &rdev->kobj, nm)) ++ printk(KERN_WARNING ++ "md: cannot register " ++ "%s for %s\n", ++ nm, mdname(rdev->mddev)); ++ ++ /* don't wakeup anyone, leave that to userspace. */ + } else { + if (slot >= rdev->mddev->raid_disks) + return -ENOSPC; +@@ -4205,7 +4241,7 @@ static int add_new_disk(mddev_t * mddev, + super_types[mddev->major_version]. + validate_super(mddev, rdev); + err = mddev->pers->hot_add_disk(mddev, rdev); +- if (err) ++ if (err < 0) + unbind_rdev_from_array(rdev); + } + if (err) + +diff .prev/drivers/md/multipath.c ./drivers/md/multipath.c +--- .prev/drivers/md/multipath.c 2008-05-30 14:49:31.000000000 +1000 ++++ ./drivers/md/multipath.c 2008-06-10 10:35:03.000000000 +1000 +@@ -284,10 +284,15 @@ static int multipath_add_disk(mddev_t *m + int found = 0; + int path; + struct multipath_info *p; ++ int first = 0; ++ int last = mddev->raid_disks - 1; ++ ++ if (rdev->raid_disk >= 0) ++ first = last = rdev->raid_disk; + + print_multipath_conf(conf); + +- for (path=0; path<mddev->raid_disks; path++) ++ for (path = first; path <= last; path++) + if ((p=conf->multipaths+path)->rdev == NULL) { + q = rdev->bdev->bd_disk->queue; + blk_queue_stack_limits(mddev->queue, q); + +diff .prev/drivers/md/raid10.c ./drivers/md/raid10.c +--- .prev/drivers/md/raid10.c 2008-05-30 14:49:31.000000000 +1000 ++++ ./drivers/md/raid10.c 2008-06-10 10:28:53.000000000 +1000 +@@ -1116,6 +1116,8 @@ static int raid10_add_disk(mddev_t *mdde + int found = 0; + int mirror; + mirror_info_t *p; ++ int first = 0; ++ int last = mddev->raid_disks - 1; + + if (mddev->recovery_cp < MaxSector) + /* only hot-add to in-sync arrays, as recovery is +@@ -1125,12 +1127,16 @@ static int raid10_add_disk(mddev_t *mdde + if (!enough(conf)) + return 0; + ++ if (rdev->raid_disk) ++ first = last = rdev->raid_disk; ++ + if (rdev->saved_raid_disk >= 0 && ++ rdev->saved_raid_disk >= first && + conf->mirrors[rdev->saved_raid_disk].rdev == NULL) + mirror = rdev->saved_raid_disk; + else +- mirror = 0; +- for ( ; mirror < mddev->raid_disks; mirror++) ++ mirror = first; ++ for ( ; mirror <= last ; mirror++) + if ( !(p=conf->mirrors+mirror)->rdev) { + + blk_queue_stack_limits(mddev->queue, + +diff .prev/drivers/md/raid1.c ./drivers/md/raid1.c +--- .prev/drivers/md/raid1.c 2008-05-30 14:49:31.000000000 +1000 ++++ ./drivers/md/raid1.c 2008-06-10 10:41:00.000000000 +1000 +@@ -1103,8 +1103,13 @@ static int raid1_add_disk(mddev_t *mddev + int found = 0; + int mirror = 0; + mirror_info_t *p; ++ int first = 0; ++ int last = mddev->raid_disks - 1; + +- for (mirror=0; mirror < mddev->raid_disks; mirror++) ++ if (rdev->raid_disk >= 0) ++ first = last = rdev->raid_disk; ++ ++ for (mirror = first; mirror <= last; mirror++) + if ( !(p=conf->mirrors+mirror)->rdev) { + + blk_queue_stack_limits(mddev->queue, + +diff .prev/drivers/md/raid5.c ./drivers/md/raid5.c +--- .prev/drivers/md/raid5.c 2008-05-30 14:49:35.000000000 +1000 ++++ ./drivers/md/raid5.c 2008-06-10 10:27:51.000000000 +1000 +@@ -4399,21 +4399,27 @@ static int raid5_add_disk(mddev_t *mddev + int found = 0; + int disk; + struct disk_info *p; ++ int first = 0; ++ int last = conf->raid_disks - 1; + + if (mddev->degraded > conf->max_degraded) + /* no point adding a device */ + return 0; + ++ if (rdev->raid_disk >= 0) ++ first = last = rdev->raid_disk; ++ + /* + * find the disk ... but prefer rdev->saved_raid_disk + * if possible. + */ + if (rdev->saved_raid_disk >= 0 && ++ rdev->saved_raid_disk >= first && + conf->disks[rdev->saved_raid_disk].rdev == NULL) + disk = rdev->saved_raid_disk; + else +- disk = 0; +- for ( ; disk < conf->raid_disks; disk++) ++ disk = first; ++ for ( ; disk <= last ; disk++) + if ((p=conf->disks + disk)->rdev == NULL) { + clear_bit(In_sync, &rdev->flags); + rdev->raid_disk = disk; diff --git a/kernel-patch-2.6.27 b/kernel-patch-2.6.27 new file mode 100644 index 00000000..8d0785d8 --- /dev/null +++ b/kernel-patch-2.6.27 @@ -0,0 +1,36 @@ +touch_mnt_namespace when the mount flags change + +From: Dan Williams <dan.j.williams@intel.com> + +Daemons that need to be launched while the rootfs is read-only can now +poll /proc/mounts to be notified when their O_RDWR requests may no +longer end in EROFS. + +Cc: Kay Sievers <kay.sievers@vrfy.org> +Cc: Neil Brown <neilb@suse.de> +Signed-off-by: Dan Williams <dan.j.williams@intel.com> +--- + + fs/namespace.c | 7 ++++++- + 1 files changed, 6 insertions(+), 1 deletions(-) + + +diff --git a/fs/namespace.c b/fs/namespace.c +index 6e283c9..1bd5ba2 100644 +--- a/fs/namespace.c ++++ b/fs/namespace.c +@@ -1553,8 +1553,13 @@ static noinline int do_remount(struct nameidata *nd, int flags, int mnt_flags, + if (!err) + nd->path.mnt->mnt_flags = mnt_flags; + up_write(&sb->s_umount); +- if (!err) ++ if (!err) { + security_sb_post_remount(nd->path.mnt, flags, data); ++ ++ spin_lock(&vfsmount_lock); ++ touch_mnt_namespace(nd->path.mnt->mnt_ns); ++ spin_unlock(&vfsmount_lock); ++ } + return err; + } + @@ -0,0 +1,475 @@ +/* + * mdadm - manage Linux "md" devices aka RAID arrays. + * + * Copyright (C) 2011 Neil Brown <neilb@suse.de> + * + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Author: Neil Brown + * Email: <neilb@suse.de> + */ + +#include "mdadm.h" +#include "dlink.h" +#include <ctype.h> + +/* This fill contains various 'library' style function. They + * have no dependency on anything outside this file. + */ + +int get_mdp_major(void) +{ +static int mdp_major = -1; + FILE *fl; + char *w; + int have_block = 0; + int have_devices = 0; + int last_num = -1; + + if (mdp_major != -1) + return mdp_major; + fl = fopen("/proc/devices", "r"); + if (!fl) + return -1; + while ((w = conf_word(fl, 1))) { + if (have_block && strcmp(w, "devices:")==0) + have_devices = 1; + have_block = (strcmp(w, "Block")==0); + if (isdigit(w[0])) + last_num = atoi(w); + if (have_devices && strcmp(w, "mdp")==0) + mdp_major = last_num; + free(w); + } + fclose(fl); + return mdp_major; +} + +char *devid2kname(int devid) +{ + char path[30]; + char link[200]; + static char devnm[32]; + char *cp; + int n; + + /* Look at the + * /sys/dev/block/%d:%d link which must look like + * and take the last component. + */ + sprintf(path, "/sys/dev/block/%d:%d", major(devid), + minor(devid)); + n = readlink(path, link, sizeof(link)-1); + if (n > 0) { + link[n] = 0; + cp = strrchr(link, '/'); + if (cp) { + strcpy(devnm, cp+1); + return devnm; + } + } + return NULL; +} + +char *devid2devnm(int devid) +{ + char path[30]; + char link[200]; + static char devnm[32]; + char *cp, *ep; + int n; + + /* Might be an extended-minor partition or a + * named md device. Look at the + * /sys/dev/block/%d:%d link which must look like + * ../../block/mdXXX/mdXXXpYY + * or + * ...../block/md_FOO + */ + sprintf(path, "/sys/dev/block/%d:%d", major(devid), + minor(devid)); + n = readlink(path, link, sizeof(link)-1); + if (n > 0) { + link[n] = 0; + cp = strstr(link, "/block/"); + if (cp) { + cp += 7; + ep = strchr(cp, '/'); + if (ep) + *ep = 0; + strcpy(devnm, cp); + return devnm; + } + } + if (major(devid) == MD_MAJOR) + sprintf(devnm,"md%d", minor(devid)); + else if (major(devid) == (unsigned)get_mdp_major()) + sprintf(devnm,"md_d%d", + (minor(devid)>>MdpMinorShift)); + else + return NULL; + return devnm; +} + +char *stat2devnm(struct stat *st) +{ + if ((S_IFMT & st->st_mode) != S_IFBLK) + return NULL; + return devid2devnm(st->st_rdev); +} + +char *fd2devnm(int fd) +{ + struct stat stb; + if (fstat(fd, &stb) == 0) + return stat2devnm(&stb); + return NULL; +} + +/* + * convert a major/minor pair for a block device into a name in /dev, if possible. + * On the first call, walk /dev collecting name. + * Put them in a simple linked listfor now. + */ +struct devmap { + int major, minor; + char *name; + struct devmap *next; +} *devlist = NULL; +int devlist_ready = 0; + +int add_dev(const char *name, const struct stat *stb, int flag, struct FTW *s) +{ + struct stat st; + + if (S_ISLNK(stb->st_mode)) { + if (stat(name, &st) != 0) + return 0; + stb = &st; + } + + if ((stb->st_mode&S_IFMT)== S_IFBLK) { + char *n = xstrdup(name); + struct devmap *dm = xmalloc(sizeof(*dm)); + if (strncmp(n, "/dev/./", 7)==0) + strcpy(n+4, name+6); + if (dm) { + dm->major = major(stb->st_rdev); + dm->minor = minor(stb->st_rdev); + dm->name = n; + dm->next = devlist; + devlist = dm; + } + } + return 0; +} + +#ifndef HAVE_NFTW +#ifdef HAVE_FTW +int add_dev_1(const char *name, const struct stat *stb, int flag) +{ + return add_dev(name, stb, flag, NULL); +} +int nftw(const char *path, int (*han)(const char *name, const struct stat *stb, int flag, struct FTW *s), int nopenfd, int flags) +{ + return ftw(path, add_dev_1, nopenfd); +} +#else +int nftw(const char *path, int (*han)(const char *name, const struct stat *stb, int flag, struct FTW *s), int nopenfd, int flags) +{ + return 0; +} +#endif /* HAVE_FTW */ +#endif /* HAVE_NFTW */ + +/* + * Find a block device with the right major/minor number. + * If we find multiple names, choose the shortest. + * If we find a name in /dev/md/, we prefer that. + * This applies only to names for MD devices. + * If 'prefer' is set (normally to e.g. /by-path/) + * then we prefer a name which contains that string. + */ +char *map_dev_preferred(int major, int minor, int create, + char *prefer) +{ + struct devmap *p; + char *regular = NULL, *preferred=NULL; + int did_check = 0; + + if (major == 0 && minor == 0) + return NULL; + + retry: + if (!devlist_ready) { + char *dev = "/dev"; + struct stat stb; + while(devlist) { + struct devmap *d = devlist; + devlist = d->next; + free(d->name); + free(d); + } + if (lstat(dev, &stb)==0 && + S_ISLNK(stb.st_mode)) + dev = "/dev/."; + nftw(dev, add_dev, 10, FTW_PHYS); + devlist_ready=1; + did_check = 1; + } + + for (p=devlist; p; p=p->next) + if (p->major == major && + p->minor == minor) { + if (strncmp(p->name, "/dev/md/",8) == 0 + || (prefer && strstr(p->name, prefer))) { + if (preferred == NULL || + strlen(p->name) < strlen(preferred)) + preferred = p->name; + } else { + if (regular == NULL || + strlen(p->name) < strlen(regular)) + regular = p->name; + } + } + if (!regular && !preferred && !did_check) { + devlist_ready = 0; + goto retry; + } + if (create && !regular && !preferred) { + static char buf[30]; + snprintf(buf, sizeof(buf), "%d:%d", major, minor); + regular = buf; + } + + return preferred ? preferred : regular; +} + +/* conf_word gets one word from the conf file. + * if "allow_key", then accept words at the start of a line, + * otherwise stop when such a word is found. + * We assume that the file pointer is at the end of a word, so the + * next character is a space, or a newline. If not, it is the start of a line. + */ + +char *conf_word(FILE *file, int allow_key) +{ + int wsize = 100; + int len = 0; + int c; + int quote; + int wordfound = 0; + char *word = xmalloc(wsize); + + while (wordfound==0) { + /* at the end of a word.. */ + c = getc(file); + if (c == '#') + while (c != EOF && c != '\n') + c = getc(file); + if (c == EOF) break; + if (c == '\n') continue; + + if (c != ' ' && c != '\t' && ! allow_key) { + ungetc(c, file); + break; + } + /* looks like it is safe to get a word here, if there is one */ + quote = 0; + /* first, skip any spaces */ + while (c == ' ' || c == '\t') + c = getc(file); + if (c != EOF && c != '\n' && c != '#') { + /* we really have a character of a word, so start saving it */ + while (c != EOF && c != '\n' && (quote || (c!=' ' && c != '\t'))) { + wordfound = 1; + if (quote && c == quote) quote = 0; + else if (quote == 0 && (c == '\'' || c == '"')) + quote = c; + else { + if (len == wsize-1) { + wsize += 100; + word = xrealloc(word, wsize); + } + word[len++] = c; + } + c = getc(file); + /* Hack for broken kernels (2.6.14-.24) that put + * "active(auto-read-only)" + * in /proc/mdstat instead of + * "active (auto-read-only)" + */ + if (c == '(' && len >= 6 + && strncmp(word+len-6, "active", 6) == 0) + c = ' '; + } + } + if (c != EOF) ungetc(c, file); + } + word[len] = 0; + + /* Further HACK for broken kernels.. 2.6.14-2.6.24 */ + if (strcmp(word, "auto-read-only)") == 0) + strcpy(word, "(auto-read-only)"); + +/* printf("word is <%s>\n", word); */ + if (!wordfound) { + free(word); + word = NULL; + } + return word; +} + +void print_quoted(char *str) +{ + /* Printf the string with surrounding quotes + * iff needed. + * If no space, tab, or quote - leave unchanged. + * Else print surrounded by " or ', swapping quotes + * when we find one that will cause confusion. + */ + + char first_quote = 0, q; + char *c; + + for (c = str; *c; c++) { + switch(*c) { + case '\'': + case '"': + first_quote = *c; + break; + case ' ': + case '\t': + first_quote = *c; + continue; + default: + continue; + } + break; + } + if (!first_quote) { + printf("%s", str); + return; + } + + if (first_quote == '"') + q = '\''; + else + q = '"'; + putchar(q); + for (c = str; *c; c++) { + if (*c == q) { + putchar(q); + q ^= '"' ^ '\''; + putchar(q); + } + putchar(*c); + } + putchar(q); +} + +void print_escape(char *str) +{ + /* print str, but change space and tab to '_' + * as is suitable for device names + */ + for (; *str ; str++) { + switch (*str) { + case ' ': + case '\t': + putchar('_'); + break; + case '/': + putchar('-'); + break; + default: + putchar(*str); + } + } +} + +int check_env(char *name) +{ + char *val = getenv(name); + + if (val && atoi(val) == 1) + return 1; + + return 0; +} + +int use_udev(void) +{ + static int use = -1; + struct stat stb; + + if (use < 0) { + use = ((stat("/dev/.udev", &stb) == 0 + || stat("/run/udev", &stb) == 0) + && check_env("MDADM_NO_UDEV") == 0); + } + return use; +} + +unsigned long GCD(unsigned long a, unsigned long b) +{ + while (a != b) { + if (a < b) + b -= a; + if (b < a) + a -= b; + } + return a; +} + +/* + * conf_line reads one logical line from the conffile or mdstat. + * It skips comments and continues until it finds a line that starts + * with a non blank/comment. This character is pushed back for the next call + * A doubly linked list of words is returned. + * the first word will be a keyword. Other words will have had quotes removed. + */ + +char *conf_line(FILE *file) +{ + char *w; + char *list; + + w = conf_word(file, 1); + if (w == NULL) return NULL; + + list = dl_strdup(w); + free(w); + dl_init(list); + + while ((w = conf_word(file,0))){ + char *w2 = dl_strdup(w); + free(w); + dl_add(list, w2); + } +/* printf("got a line\n");*/ + return list; +} + +void free_line(char *line) +{ + char *w; + for (w=dl_next(line); w != line; w=dl_next(line)) { + dl_del(w); + dl_free(w); + } + dl_free(line); +} diff --git a/makedist b/makedist new file mode 100755 index 00000000..e4f20acf --- /dev/null +++ b/makedist @@ -0,0 +1,96 @@ +#!/bin/sh +# avoid silly sorting +export LANG=C +arg=$1 +target=~/public_html/source/mdadm +if [ " $arg" = " test" ] +then + target=/tmp/mdadm-test + rm -rf $target + mkdir -p $target +fi +if [ -d $target ] +then : +else echo $target is not a directory + exit 2 +fi +set `grep '^#define VERSION' ReadMe.c ` +version=`echo $3 | sed -e 's/"//g'` +grep "^.TH MDADM 8 .. v$version" mdadm.8.in > /dev/null 2>&1 || + { + echo mdadm.8.in does not mention version $version. + exit 1 + } +grep "^.TH MDMON 8 .. v$version" mdmon.8 > /dev/null 2>&1 || + { + echo mdmon.8 does not mention version $version. + exit 1 + } +rpmv=`echo $version | tr - _` +grep "^Version: *$rpmv$" mdadm.spec > /dev/null 2>&1 || + { + echo mdadm.spec does not mention version $version. + exit 1 + } +if [ -f ANNOUNCE-$version ] +then : +else + echo ANNOUNCE-$version does not exist + exit 1 +fi +if grep "^ANNOUNCE-$version\$" inventory +then : +else { cat inventory ; echo ANNOUNCE-$version ; } | sort -o inventory +fi + +echo version = $version +base=mdadm-$version.tar.gz +if [ " $arg" != " diff" ] +then + if [ -f $target/$base ] + then + echo $target/$base exists. + exit 1 + fi + trap "rm $target/$base; exit" 1 2 3 + git archive --prefix=mdadm-$version/ HEAD | gzip --best > $target/$base + chmod a+r $target/$base + ls -l $target/$base + if tar tzf $target/$base | sed 's,[^/]*/,,' | sort | diff -u inventory - + then : correct files found + else echo "Extra files, or inventory is out-of-date" + rm $target/$base + exit 1 + fi + rpmbuild -ta $target/$base || exit 1 + find /home/neilb/src/RPM -name "*mdadm-$version-*" \ + -exec cp {} $target/RPM \; + cp ANNOUNCE-$version $target/ANNOUNCE + cp ChangeLog $target/ChangeLog + if [ " $arg" != " test" ] + then + echo -n "Confirm signing this release? " + read a + if [ " $a" != " y" ]; then echo OK - bye. ; exit 1; fi + if zcat $target/$base | gpg -ba > $target/$base.sign && gpg -ba $target/ANNOUNCE + then + kup put $target/$base $target/$base.sign \ + /pub/linux/utils/raid/mdadm/mdadm-$version.tar.gz + kup put $target/ANNOUNCE $target/ANNOUNCE.asc /pub/linux/utils/raid/mdadm/ANNOUNCE + else + echo signing failed + exit 1 + fi + fi +else + if [ ! -f $target/$base ] + then + echo $target/$base does not exist. + exit 1 + fi + ( cd .. ; ln -s mdadm.v2 mdadm-$version ; tar chf - --exclude=.git --exclude="TAGS" --exclude='*,v' --exclude='*~' --exclude='*.o' --exclude mdadm --exclude=mdadm'.[^ch0-9]' --exclude=RCS mdadm-$version ; rm mdadm-$version ) | gzip --best > /var/tmp/mdadm-new.tgz + mkdir /var/tmp/mdadm-old ; zcat $target/$base | ( cd /var/tmp/mdadm-old ; tar xf - ) + mkdir /var/tmp/mdadm-new ; zcat /var/tmp/mdadm-new.tgz | ( cd /var/tmp/mdadm-new ; tar xf - ) + diff -ru /var/tmp/mdadm-old /var/tmp/mdadm-new + rm -rf /var/tmp/mdadm-old /var/tmp/mdadm-new /var/tmp/mdadm-new.tgz +fi diff --git a/managemon.c b/managemon.c new file mode 100644 index 00000000..6d1b3d85 --- /dev/null +++ b/managemon.c @@ -0,0 +1,926 @@ +/* + * mdmon - monitor external metadata arrays + * + * Copyright (C) 2007-2009 Neil Brown <neilb@suse.de> + * Copyright (C) 2007-2009 Intel Corporation + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. + */ + +/* + * The management thread for monitoring active md arrays. + * This thread does things which might block such as memory + * allocation. + * In particular: + * + * - Find out about new arrays in this container. + * Allocate the data structures and open the files. + * + * For this we watch /proc/mdstat and find new arrays with + * metadata type that confirms sharing. e.g. "md4" + * When we find a new array we slip it into the list of + * arrays and signal 'monitor' by writing to a pipe. + * + * - Respond to reshape requests by allocating new data structures + * and opening new files. + * + * These come as a change to raid_disks. We allocate a new + * version of the data structures and slip it into the list. + * 'monitor' will notice and release the old version. + * Changes to level, chunksize, layout.. do not need re-allocation. + * Reductions in raid_disks don't really either, but we handle + * them the same way for consistency. + * + * - When a device is added to the container, we add it to the metadata + * as a spare. + * + * - Deal with degraded array + * We only do this when first noticing the array is degraded. + * This can be when we first see the array, when sync completes or + * when recovery completes. + * + * Check if number of failed devices suggests recovery is needed, and + * skip if not. + * Ask metadata to allocate a spare device + * Add device as not in_sync and give a role + * Update metadata. + * Open sysfs files and pass to monitor. + * Make sure that monitor Starts recovery.... + * + * - Pass on metadata updates from external programs such as + * mdadm creating a new array. + * + * This is most-messy. + * It might involve adding a new array or changing the status of + * a spare, or any reconfig that the kernel doesn't get involved in. + * + * The required updates are received via a named pipe. There will + * be one named pipe for each container. Each message contains a + * sync marker: 0x5a5aa5a5, A byte count, and the message. This is + * passed to the metadata handler which will interpret and process it. + * For 'DDF' messages are internal data blocks with the leading + * 'magic number' signifying what sort of data it is. + * + */ + +/* + * We select on /proc/mdstat and the named pipe. + * We create new arrays or updated version of arrays and slip + * them into the head of the list, then signal 'monitor' via a pipe write. + * 'monitor' will notice and place the old array on a return list. + * Metadata updates are placed on a queue just like they arrive + * from the named pipe. + * + * When new arrays are found based on correct metadata string, we + * need to identify them with an entry in the metadata. Maybe we require + * the metadata to be mdX/NN when NN is the index into an appropriate table. + * + */ + +/* + * List of tasks: + * - Watch for spares to be added to the container, and write updated + * metadata to them. + * - Watch for new arrays using this container, confirm they match metadata + * and if so, start monitoring them + * - Watch for spares being added to monitored arrays. This shouldn't + * happen, as we should do all the adding. Just remove them. + * - Watch for change in raid-disks, chunk-size, etc. Update metadata and + * start a reshape. + */ +#ifndef _GNU_SOURCE +#define _GNU_SOURCE +#endif +#include "mdadm.h" +#include "mdmon.h" +#include <sys/syscall.h> +#include <sys/socket.h> +#include <signal.h> + +static void close_aa(struct active_array *aa) +{ + struct mdinfo *d; + + for (d = aa->info.devs; d; d = d->next) { + close(d->recovery_fd); + close(d->state_fd); + } + + if (aa->action_fd >= 0) + close(aa->action_fd); + if (aa->info.state_fd >= 0) + close(aa->info.state_fd); + if (aa->resync_start_fd >= 0) + close(aa->resync_start_fd); + if (aa->metadata_fd >= 0) + close(aa->metadata_fd); + if (aa->sync_completed_fd >= 0) + close(aa->sync_completed_fd); +} + +static void free_aa(struct active_array *aa) +{ + /* Note that this doesn't close fds if they are being used + * by a clone. ->container will be set for a clone + */ + dprintf("sys_name: %s\n", aa->info.sys_name); + if (!aa->container) + close_aa(aa); + while (aa->info.devs) { + struct mdinfo *d = aa->info.devs; + aa->info.devs = d->next; + free(d); + } + free(aa); +} + +static struct active_array *duplicate_aa(struct active_array *aa) +{ + struct active_array *newa = xmalloc(sizeof(*newa)); + struct mdinfo **dp1, **dp2; + + *newa = *aa; + newa->next = NULL; + newa->replaces = NULL; + newa->info.next = NULL; + + dp2 = &newa->info.devs; + + for (dp1 = &aa->info.devs; *dp1; dp1 = &(*dp1)->next) { + struct mdinfo *d; + if ((*dp1)->state_fd < 0) + continue; + + d = xmalloc(sizeof(*d)); + *d = **dp1; + *dp2 = d; + dp2 = & d->next; + } + *dp2 = NULL; + + return newa; +} + +static void wakeup_monitor(void) +{ + /* tgkill(getpid(), mon_tid, SIGUSR1); */ + int pid = getpid(); + syscall(SYS_tgkill, pid, mon_tid, SIGUSR1); +} + +static void remove_old(void) +{ + if (discard_this) { + discard_this->next = NULL; + free_aa(discard_this); + if (pending_discard == discard_this) + pending_discard = NULL; + discard_this = NULL; + wakeup_monitor(); + } +} + +static void replace_array(struct supertype *container, + struct active_array *old, + struct active_array *new) +{ + /* To replace an array, we add it to the top of the list + * marked with ->replaces to point to the original. + * 'monitor' will take the original out of the list + * and put it on 'discard_this'. We take it from there + * and discard it. + */ + remove_old(); + while (pending_discard) { + while (discard_this == NULL) + sleep(1); + remove_old(); + } + pending_discard = old; + new->replaces = old; + new->next = container->arrays; + container->arrays = new; + wakeup_monitor(); +} + +struct metadata_update *update_queue = NULL; +struct metadata_update *update_queue_handled = NULL; +struct metadata_update *update_queue_pending = NULL; + +static void free_updates(struct metadata_update **update) +{ + while (*update) { + struct metadata_update *this = *update; + void **space_list = this->space_list; + + *update = this->next; + free(this->buf); + free(this->space); + while (space_list) { + void *space = space_list; + space_list = *space_list; + free(space); + } + free(this); + } +} + +void check_update_queue(struct supertype *container) +{ + free_updates(&update_queue_handled); + + if (update_queue == NULL && + update_queue_pending) { + update_queue = update_queue_pending; + update_queue_pending = NULL; + wakeup_monitor(); + } +} + +static void queue_metadata_update(struct metadata_update *mu) +{ + struct metadata_update **qp; + + qp = &update_queue_pending; + while (*qp) + qp = & ((*qp)->next); + *qp = mu; +} + +static void add_disk_to_container(struct supertype *st, struct mdinfo *sd) +{ + int dfd; + char nm[20]; + struct supertype *st2; + struct metadata_update *update = NULL; + struct mdinfo info; + mdu_disk_info_t dk = { + .number = -1, + .major = sd->disk.major, + .minor = sd->disk.minor, + .raid_disk = -1, + .state = 0, + }; + + dprintf("add %d:%d to container\n", sd->disk.major, sd->disk.minor); + + sd->next = st->devs; + st->devs = sd; + + sprintf(nm, "%d:%d", sd->disk.major, sd->disk.minor); + dfd = dev_open(nm, O_RDWR); + if (dfd < 0) + return; + + /* Check the metadata and see if it is already part of this + * array + */ + st2 = dup_super(st); + if (st2->ss->load_super(st2, dfd, NULL) == 0) { + st2->ss->getinfo_super(st2, &info, NULL); + if (st->ss->compare_super(st, st2) == 0 && + info.disk.raid_disk >= 0) { + /* Looks like a good member of array. + * Just accept it. + * mdadm will incorporate any parts into + * active arrays. + */ + st2->ss->free_super(st2); + return; + } + } + st2->ss->free_super(st2); + + st->update_tail = &update; + st->ss->add_to_super(st, &dk, dfd, NULL, INVALID_SECTORS); + st->ss->write_init_super(st); + queue_metadata_update(update); + st->update_tail = NULL; +} + +/* + * Create and queue update structure about the removed disks. + * The update is prepared by super type handler and passed to the monitor + * thread. + */ +static void remove_disk_from_container(struct supertype *st, struct mdinfo *sd) +{ + struct metadata_update *update = NULL; + mdu_disk_info_t dk = { + .number = -1, + .major = sd->disk.major, + .minor = sd->disk.minor, + .raid_disk = -1, + .state = 0, + }; + dprintf("remove %d:%d from container\n", + sd->disk.major, sd->disk.minor); + + st->update_tail = &update; + st->ss->remove_from_super(st, &dk); + /* FIXME this write_init_super shouldn't be here. + * We have it after add_to_super to write to new device, + * but with 'remove' we don't ant to write to that device! + */ + st->ss->write_init_super(st); + queue_metadata_update(update); + st->update_tail = NULL; +} + +static void manage_container(struct mdstat_ent *mdstat, + struct supertype *container) +{ + /* Of interest here are: + * - if a new device has been added to the container, we + * add it to the array ignoring any metadata on it. + * - if a device has been removed from the container, we + * remove it from the device list and update the metadata. + * FIXME should we look for compatible metadata and take hints + * about spare assignment.... probably not. + */ + if (mdstat->devcnt != container->devcnt) { + struct mdinfo **cdp, *cd, *di, *mdi; + int found; + + /* read /sys/block/NAME/md/dev-??/block/dev to find out + * what is there, and compare with container->info.devs + * To see what is removed and what is added. + * These need to be remove from, or added to, the array + */ + mdi = sysfs_read(-1, mdstat->devnm, GET_DEVS); + if (!mdi) { + /* invalidate the current count so we can try again */ + container->devcnt = -1; + return; + } + + /* check for removals */ + for (cdp = &container->devs; *cdp; ) { + found = 0; + for (di = mdi->devs; di; di = di->next) + if (di->disk.major == (*cdp)->disk.major && + di->disk.minor == (*cdp)->disk.minor) { + found = 1; + break; + } + if (!found) { + cd = *cdp; + *cdp = (*cdp)->next; + remove_disk_from_container(container, cd); + free(cd); + } else + cdp = &(*cdp)->next; + } + + /* check for additions */ + for (di = mdi->devs; di; di = di->next) { + for (cd = container->devs; cd; cd = cd->next) + if (di->disk.major == cd->disk.major && + di->disk.minor == cd->disk.minor) + break; + if (!cd) { + struct mdinfo *newd = xmalloc(sizeof(*newd)); + + *newd = *di; + add_disk_to_container(container, newd); + } + } + sysfs_free(mdi); + container->devcnt = mdstat->devcnt; + } +} + +static int sysfs_open2(char *devnum, char *name, char *attr) +{ + int fd = sysfs_open(devnum, name, attr); + if (fd >= 0) { + /* seq_file in the kernel allocates buffer space + * on the first read. Do that now so 'monitor' + * never needs too. + */ + char buf[200]; + if (read(fd, buf, sizeof(buf)) < 0) + /* pretend not to ignore return value */ + return fd; + } + return fd; +} + +static int disk_init_and_add(struct mdinfo *disk, struct mdinfo *clone, + struct active_array *aa) +{ + if (!disk || !clone) + return -1; + + *disk = *clone; + disk->recovery_fd = sysfs_open2(aa->info.sys_name, disk->sys_name, + "recovery_start"); + if (disk->recovery_fd < 0) + return -1; + disk->state_fd = sysfs_open2(aa->info.sys_name, disk->sys_name, "state"); + if (disk->state_fd < 0) { + close(disk->recovery_fd); + return -1; + } + disk->prev_state = read_dev_state(disk->state_fd); + disk->curr_state = disk->prev_state; + disk->next = aa->info.devs; + aa->info.devs = disk; + + return 0; +} + +static void manage_member(struct mdstat_ent *mdstat, + struct active_array *a) +{ + /* Compare mdstat info with known state of member array. + * We do not need to look for device state changes here, that + * is dealt with by the monitor. + * + * If a reshape is being requested, monitor will have noticed + * that sync_action changed and will have set check_reshape. + * We just need to see if new devices have appeared. All metadata + * updates will already have been processed. + * + * We also want to handle degraded arrays here by + * trying to find and assign a spare. + * We do that whenever the monitor tells us too. + */ + char buf[64]; + int frozen; + struct supertype *container = a->container; + unsigned long long int component_size = 0; + + if (container == NULL) + /* Raced with something */ + return; + + if (mdstat->active) { + // FIXME + a->info.array.raid_disks = mdstat->raid_disks; + // MORE + } + + if (sysfs_get_ll(&a->info, NULL, "component_size", &component_size) >= 0) + a->info.component_size = component_size << 1; + + /* honor 'frozen' */ + if (sysfs_get_str(&a->info, NULL, "metadata_version", buf, sizeof(buf)) > 0) + frozen = buf[9] == '-'; + else + frozen = 1; /* can't read metadata_version assume the worst */ + + /* If sync_action is not 'idle' then don't try recovery now */ + if (!frozen + && sysfs_get_str(&a->info, NULL, "sync_action", buf, sizeof(buf)) > 0 + && strncmp(buf, "idle", 4) != 0) + frozen = 1; + + if (mdstat->level) { + int level = map_name(pers, mdstat->level); + if (level == 0 || level == LEVEL_LINEAR) { + a->to_remove = 1; + wakeup_monitor(); + return; + } + else if (a->info.array.level != level && level > 0) { + struct active_array *newa = duplicate_aa(a); + if (newa) { + newa->info.array.level = level; + replace_array(container, a, newa); + a = newa; + } + } + } + + /* we are after monitor kick, + * so container field can be cleared - check it again + */ + if (a->container == NULL) + return; + + if (sigterm && a->info.safe_mode_delay != 1) { + sysfs_set_safemode(&a->info, 1); + a->info.safe_mode_delay = 1; + } + + /* We don't check the array while any update is pending, as it + * might container a change (such as a spare assignment) which + * could affect our decisions. + */ + if (a->check_degraded && !frozen && + update_queue == NULL && update_queue_pending == NULL) { + struct metadata_update *updates = NULL; + struct mdinfo *newdev = NULL; + struct active_array *newa; + struct mdinfo *d; + + a->check_degraded = 0; + + /* The array may not be degraded, this is just a good time + * to check. + */ + newdev = container->ss->activate_spare(a, &updates); + if (!newdev) + return; + + newa = duplicate_aa(a); + if (!newa) + goto out; + /* prevent the kernel from activating the disk(s) before we + * finish adding them + */ + dprintf("freezing %s\n", a->info.sys_name); + sysfs_set_str(&a->info, NULL, "sync_action", "frozen"); + + /* Add device to array and set offset/size/slot. + * and open files for each newdev */ + for (d = newdev; d ; d = d->next) { + struct mdinfo *newd; + + newd = xmalloc(sizeof(*newd)); + if (sysfs_add_disk(&newa->info, d, 0) < 0) { + free(newd); + continue; + } + disk_init_and_add(newd, d, newa); + } + queue_metadata_update(updates); + updates = NULL; + while (update_queue_pending || update_queue) { + check_update_queue(container); + usleep(15*1000); + } + replace_array(container, a, newa); + if (sysfs_set_str(&a->info, NULL, "sync_action", "recover") + == 0) + newa->prev_action = recover; + dprintf("recovery started on %s\n", a->info.sys_name); + out: + while (newdev) { + d = newdev->next; + free(newdev); + newdev = d; + } + free_updates(&updates); + } + + if (a->check_reshape) { + /* mdadm might have added some devices to the array. + * We want to disk_init_and_add any such device to a + * duplicate_aa and replace a with that. + * mdstat doesn't have enough info so we sysfs_read + * and look for new stuff. + */ + struct mdinfo *info, *d, *d2, *newd; + unsigned long long array_size; + struct active_array *newa = NULL; + a->check_reshape = 0; + info = sysfs_read(-1, mdstat->devnm, + GET_DEVS|GET_OFFSET|GET_SIZE|GET_STATE); + if (!info) + goto out2; + for (d = info->devs; d; d = d->next) { + if (d->disk.raid_disk < 0) + continue; + for (d2 = a->info.devs; d2; d2 = d2->next) + if (d2->disk.raid_disk == + d->disk.raid_disk) + break; + if (d2) + /* already have this one */ + continue; + if (!newa) { + newa = duplicate_aa(a); + if (!newa) + break; + } + newd = xmalloc(sizeof(*newd)); + disk_init_and_add(newd, d, newa); + } + if (sysfs_get_ll(info, NULL, "array_size", &array_size) == 0 + && a->info.custom_array_size > array_size*2) { + sysfs_set_num(info, NULL, "array_size", + a->info.custom_array_size/2); + } + out2: + sysfs_free(info); + if (newa) + replace_array(container, a, newa); + } +} + +static int aa_ready(struct active_array *aa) +{ + struct mdinfo *d; + int level = aa->info.array.level; + + for (d = aa->info.devs; d; d = d->next) + if (d->state_fd < 0) + return 0; + + if (aa->info.state_fd < 0) + return 0; + + if (level > 0 && (aa->action_fd < 0 || aa->resync_start_fd < 0)) + return 0; + + if (!aa->container) + return 0; + + return 1; +} + +static void manage_new(struct mdstat_ent *mdstat, + struct supertype *container, + struct active_array *victim) +{ + /* A new array has appeared in this container. + * Hopefully it is already recorded in the metadata. + * Check, then create the new array to report it to + * the monitor. + */ + + struct active_array *new; + struct mdinfo *mdi, *di; + char *inst; + int i; + int failed = 0; + char buf[40]; + + /* check if array is ready to be monitored */ + if (!mdstat->active || !mdstat->level) + return; + if (strcmp(mdstat->level, "raid0") == 0 || + strcmp(mdstat->level, "linear") == 0) + return; + + mdi = sysfs_read(-1, mdstat->devnm, + GET_LEVEL|GET_CHUNK|GET_DISKS|GET_COMPONENT| + GET_DEGRADED|GET_SAFEMODE| + GET_DEVS|GET_OFFSET|GET_SIZE|GET_STATE|GET_LAYOUT); + + if (!mdi) + return; + new = xcalloc(1, sizeof(*new)); + + strcpy(new->info.sys_name, mdstat->devnm); + + new->prev_state = new->curr_state = new->next_state = inactive; + new->prev_action= new->curr_action= new->next_action= idle; + + new->container = container; + + inst = to_subarray(mdstat, container->devnm); + + new->info.array = mdi->array; + new->info.component_size = mdi->component_size; + + for (i = 0; i < new->info.array.raid_disks; i++) { + struct mdinfo *newd = xmalloc(sizeof(*newd)); + + for (di = mdi->devs; di; di = di->next) + if (i == di->disk.raid_disk) + break; + + if (disk_init_and_add(newd, di, new) != 0) { + if (newd) + free(newd); + + failed++; + if (failed > new->info.array.failed_disks) { + /* we cannot properly monitor without all working disks */ + new->container = NULL; + break; + } + } + } + + new->action_fd = sysfs_open2(new->info.sys_name, NULL, "sync_action"); + new->info.state_fd = sysfs_open2(new->info.sys_name, NULL, "array_state"); + new->resync_start_fd = sysfs_open2(new->info.sys_name, NULL, "resync_start"); + new->metadata_fd = sysfs_open2(new->info.sys_name, NULL, "metadata_version"); + new->sync_completed_fd = sysfs_open2(new->info.sys_name, NULL, "sync_completed"); + + dprintf("inst: %s action: %d state: %d\n", inst, + new->action_fd, new->info.state_fd); + + if (sigterm) + new->info.safe_mode_delay = 1; + else if (mdi->safe_mode_delay >= 50) + /* Normal start, mdadm set this. */ + new->info.safe_mode_delay = mdi->safe_mode_delay; + else + /* Restart, just pick a number */ + new->info.safe_mode_delay = 5000; + sysfs_set_safemode(&new->info, new->info.safe_mode_delay); + + /* reshape_position is set by mdadm in sysfs + * read this information for new arrays only (empty victim) + */ + if ((victim == NULL) && + (sysfs_get_str(mdi, NULL, "sync_action", buf, 40) > 0) && + (strncmp(buf, "reshape", 7) == 0)) { + if (sysfs_get_ll(mdi, NULL, "reshape_position", + &new->last_checkpoint) != 0) + new->last_checkpoint = 0; + else { + int data_disks = mdi->array.raid_disks; + if (mdi->array.level == 4 || mdi->array.level == 5) + data_disks--; + if (mdi->array.level == 6) + data_disks -= 2; + + new->last_checkpoint /= data_disks; + } + dprintf("mdmon: New monitored array is under reshape.\n" + " Last checkpoint is: %llu\n", + new->last_checkpoint); + } + + sysfs_free(mdi); + + /* if everything checks out tell the metadata handler we want to + * manage this instance + */ + if (!aa_ready(new) || container->ss->open_new(container, new, inst) < 0) { + pr_err("failed to monitor %s\n", + mdstat->metadata_version); + new->container = NULL; + free_aa(new); + } else { + replace_array(container, victim, new); + if (failed) { + new->check_degraded = 1; + manage_member(mdstat, new); + } + } +} + +void manage(struct mdstat_ent *mdstat, struct supertype *container) +{ + /* We have just read mdstat and need to compare it with + * the known active arrays. + * Arrays with the wrong metadata are ignored. + */ + + for ( ; mdstat ; mdstat = mdstat->next) { + struct active_array *a; + if (strcmp(mdstat->devnm, container->devnm) == 0) { + manage_container(mdstat, container); + continue; + } + if (!is_container_member(mdstat, container->devnm)) + /* Not for this array */ + continue; + /* Looks like a member of this container */ + for (a = container->arrays; a; a = a->next) { + if (strcmp(mdstat->devnm, a->info.sys_name) == 0) { + if (a->container && a->to_remove == 0) + manage_member(mdstat, a); + break; + } + } + if (a == NULL || !a->container) + manage_new(mdstat, container, a); + } +} + +static void handle_message(struct supertype *container, struct metadata_update *msg) +{ + /* queue this metadata update through to the monitor */ + + struct metadata_update *mu; + + if (msg->len <= 0) + while (update_queue_pending || update_queue) { + check_update_queue(container); + usleep(15*1000); + } + + if (msg->len == 0) { /* ping_monitor */ + int cnt; + + cnt = monitor_loop_cnt; + if (cnt & 1) + cnt += 2; /* wait until next pselect */ + else + cnt += 3; /* wait for 2 pselects */ + wakeup_monitor(); + + while (monitor_loop_cnt - cnt < 0) + usleep(10 * 1000); + } else if (msg->len == -1) { /* ping_manager */ + struct mdstat_ent *mdstat = mdstat_read(1, 0); + + manage(mdstat, container); + free_mdstat(mdstat); + } else if (!sigterm) { + mu = xmalloc(sizeof(*mu)); + mu->len = msg->len; + mu->buf = msg->buf; + msg->buf = NULL; + mu->space = NULL; + mu->space_list = NULL; + mu->next = NULL; + if (container->ss->prepare_update) + if (!container->ss->prepare_update(container, mu)) + free_updates(&mu); + queue_metadata_update(mu); + } +} + +void read_sock(struct supertype *container) +{ + int fd; + struct metadata_update msg; + int terminate = 0; + long fl; + int tmo = 3; /* 3 second timeout before hanging up the socket */ + + fd = accept(container->sock, NULL, NULL); + if (fd < 0) + return; + + fl = fcntl(fd, F_GETFL, 0); + fl |= O_NONBLOCK; + fcntl(fd, F_SETFL, fl); + + do { + msg.buf = NULL; + + /* read and validate the message */ + if (receive_message(fd, &msg, tmo) == 0) { + handle_message(container, &msg); + if (msg.len == 0) { + /* ping reply with version */ + msg.buf = Version; + msg.len = strlen(Version) + 1; + if (send_message(fd, &msg, tmo) < 0) + terminate = 1; + } else if (ack(fd, tmo) < 0) + terminate = 1; + } else + terminate = 1; + + } while (!terminate); + + close(fd); +} + +int exit_now = 0; +int manager_ready = 0; +void do_manager(struct supertype *container) +{ + struct mdstat_ent *mdstat; + sigset_t set; + + sigprocmask(SIG_UNBLOCK, NULL, &set); + sigdelset(&set, SIGUSR1); + sigdelset(&set, SIGTERM); + + do { + + if (exit_now) + exit(0); + + /* Can only 'manage' things if 'monitor' is not making + * structural changes to metadata, so need to check + * update_queue + */ + if (update_queue == NULL) { + mdstat = mdstat_read(1, 0); + + manage(mdstat, container); + + read_sock(container); + + free_mdstat(mdstat); + } + remove_old(); + + check_update_queue(container); + + manager_ready = 1; + + if (sigterm) + wakeup_monitor(); + + if (update_queue == NULL) + mdstat_wait_fd(container->sock, &set); + else + /* If an update is happening, just wait for signal */ + pselect(0, NULL, NULL, NULL, NULL, &set); + } while(1); +} diff --git a/mapfile.c b/mapfile.c new file mode 100644 index 00000000..41599df0 --- /dev/null +++ b/mapfile.c @@ -0,0 +1,508 @@ +/* + * mapfile - keep track of uuid <-> array mapping. Part of: + * mdadm - manage Linux "md" devices aka RAID arrays. + * + * Copyright (C) 2006-2010 Neil Brown <neilb@suse.de> + * + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Author: Neil Brown + * Email: <neilb@suse.de> + * Paper: Neil Brown + * Novell Inc + * GPO Box Q1283 + * QVB Post Office, NSW 1230 + * Australia + */ + +/* The mapfile is used to track arrays being created in --incremental + * mode. It particularly allows lookup from UUID to array device, but + * also allows the array device name to be easily found. + * + * The map file is line based with space separated fields. The fields are: + * Device id - mdX or mdpX where X is a number. + * metadata - 0.90 1.0 1.1 1.2 ddf ... + * UUID - uuid of the array + * path - path where device created: /dev/md/home + * + * The best place for the mapfile is /run/mdadm/map. Distros and users + * which have not switched to /run yet can choose a different location + * at compile time via MAP_DIR and MAP_FILE. + */ +#include "mdadm.h" +#include <sys/file.h> +#include <ctype.h> + +#define MAP_READ 0 +#define MAP_NEW 1 +#define MAP_LOCK 2 +#define MAP_DIRNAME 3 + +char *mapname[4] = { + MAP_DIR "/" MAP_FILE, + MAP_DIR "/" MAP_FILE ".new", + MAP_DIR "/" MAP_FILE ".lock", + MAP_DIR +}; + +int mapmode[3] = { O_RDONLY, O_RDWR|O_CREAT, O_RDWR|O_CREAT|O_TRUNC }; +char *mapsmode[3] = { "r", "w", "w"}; + +FILE *open_map(int modenum) +{ + int fd; + if ((mapmode[modenum] & O_CREAT)) + /* Attempt to create directory, don't worry about + * failure. + */ + (void)mkdir(mapname[MAP_DIRNAME], 0755); + fd = open(mapname[modenum], mapmode[modenum], 0600); + if (fd >= 0) + return fdopen(fd, mapsmode[modenum]); + return NULL; +} + +int map_write(struct map_ent *mel) +{ + FILE *f; + int err; + + f = open_map(MAP_NEW); + + if (!f) + return 0; + for (; mel; mel = mel->next) { + if (mel->bad) + continue; + fprintf(f, "%s ", mel->devnm); + fprintf(f, "%s ", mel->metadata); + fprintf(f, "%08x:%08x:%08x:%08x ", mel->uuid[0], + mel->uuid[1], mel->uuid[2], mel->uuid[3]); + fprintf(f, "%s\n", mel->path?:""); + } + fflush(f); + err = ferror(f); + fclose(f); + if (err) { + unlink(mapname[1]); + return 0; + } + return rename(mapname[1], + mapname[0]) == 0; +} + +static FILE *lf = NULL; +int map_lock(struct map_ent **melp) +{ + while (lf == NULL) { + struct stat buf; + lf = open_map(MAP_LOCK); + if (lf == NULL) + return -1; + if (flock(fileno(lf), LOCK_EX) != 0) { + fclose(lf); + lf = NULL; + return -1; + } + if (fstat(fileno(lf), &buf) != 0 || + buf.st_nlink == 0) { + /* The owner of the lock unlinked it, + * so we have a lock on a stale file, + * try again + */ + fclose(lf); + lf = NULL; + } + } + if (*melp) + map_free(*melp); + map_read(melp); + return 0; +} + +void map_unlock(struct map_ent **melp) +{ + if (lf) { + /* must unlink before closing the file, + * as only the owner of the lock may + * unlink the file + */ + unlink(mapname[2]); + fclose(lf); + } + lf = NULL; +} + +void map_fork(void) +{ + /* We are forking, so must close the lock file. + * Don't risk flushing anything though. + */ + if (lf) { + close(fileno(lf)); + fclose(lf); + lf = NULL; + } +} + +void map_add(struct map_ent **melp, + char * devnm, char *metadata, int uuid[4], char *path) +{ + struct map_ent *me = xmalloc(sizeof(*me)); + + strcpy(me->devnm, devnm); + strcpy(me->metadata, metadata); + memcpy(me->uuid, uuid, 16); + me->path = path ? xstrdup(path) : NULL; + me->next = *melp; + me->bad = 0; + *melp = me; +} + +void map_read(struct map_ent **melp) +{ + FILE *f; + char buf[8192]; + char path[200]; + int uuid[4]; + char devnm[32]; + char metadata[30]; + + *melp = NULL; + + f = open_map(MAP_READ); + if (!f) { + RebuildMap(); + f = open_map(MAP_READ); + } + if (!f) + return; + + while (fgets(buf, sizeof(buf), f)) { + path[0] = 0; + if (sscanf(buf, " %s %s %x:%x:%x:%x %200s", + devnm, metadata, uuid, uuid+1, + uuid+2, uuid+3, path) >= 7) { + map_add(melp, devnm, metadata, uuid, path); + } + } + fclose(f); +} + +void map_free(struct map_ent *map) +{ + while (map) { + struct map_ent *mp = map; + map = mp->next; + free(mp->path); + free(mp); + } +} + +int map_update(struct map_ent **mpp, char *devnm, char *metadata, + int *uuid, char *path) +{ + struct map_ent *map, *mp; + int rv; + + if (mpp && *mpp) + map = *mpp; + else + map_read(&map); + + for (mp = map ; mp ; mp=mp->next) + if (strcmp(mp->devnm, devnm) == 0) { + strcpy(mp->metadata, metadata); + memcpy(mp->uuid, uuid, 16); + free(mp->path); + mp->path = path ? xstrdup(path) : NULL; + mp->bad = 0; + break; + } + if (!mp) + map_add(&map, devnm, metadata, uuid, path); + if (mpp) + *mpp = NULL; + rv = map_write(map); + map_free(map); + return rv; +} + +void map_delete(struct map_ent **mapp, char *devnm) +{ + struct map_ent *mp; + + if (*mapp == NULL) + map_read(mapp); + + for (mp = *mapp; mp; mp = *mapp) { + if (strcmp(mp->devnm, devnm) == 0) { + *mapp = mp->next; + free(mp->path); + free(mp); + } else + mapp = & mp->next; + } +} + +void map_remove(struct map_ent **mapp, char *devnm) +{ + if (devnm[0] == 0) + return; + + map_delete(mapp, devnm); + map_write(*mapp); + map_free(*mapp); +} + +struct map_ent *map_by_uuid(struct map_ent **map, int uuid[4]) +{ + struct map_ent *mp; + if (!*map) + map_read(map); + + for (mp = *map ; mp ; mp = mp->next) { + if (memcmp(uuid, mp->uuid, 16) != 0) + continue; + if (!mddev_busy(mp->devnm)) { + mp->bad = 1; + continue; + } + return mp; + } + return NULL; +} + +struct map_ent *map_by_devnm(struct map_ent **map, char *devnm) +{ + struct map_ent *mp; + if (!*map) + map_read(map); + + for (mp = *map ; mp ; mp = mp->next) { + if (strcmp(mp->devnm, devnm) != 0) + continue; + if (!mddev_busy(mp->devnm)) { + mp->bad = 1; + continue; + } + return mp; + } + return NULL; +} + +struct map_ent *map_by_name(struct map_ent **map, char *name) +{ + struct map_ent *mp; + if (!*map) + map_read(map); + + for (mp = *map ; mp ; mp = mp->next) { + if (!mp->path) + continue; + if (strncmp(mp->path, "/dev/md/", 8) != 0) + continue; + if (strcmp(mp->path+8, name) != 0) + continue; + if (!mddev_busy(mp->devnm)) { + mp->bad = 1; + continue; + } + return mp; + } + return NULL; +} + +/* sets the proper subarray and container_dev according to the metadata + * version super_by_fd does this automatically, this routine is meant as + * a supplement for guess_super() + */ +static char *get_member_info(struct mdstat_ent *ent) +{ + + if (ent->metadata_version == NULL || + strncmp(ent->metadata_version, "external:", 9) != 0) + return NULL; + + if (is_subarray(&ent->metadata_version[9])) { + char *subarray; + + subarray = strrchr(ent->metadata_version, '/'); + return subarray + 1; + } + return NULL; +} + +void RebuildMap(void) +{ + struct mdstat_ent *mdstat = mdstat_read(0, 0); + struct mdstat_ent *md; + struct map_ent *map = NULL; + int require_homehost; + char sys_hostname[256]; + char *homehost = conf_get_homehost(&require_homehost); + + if (homehost == NULL || strcmp(homehost, "<system>")==0) { + if (gethostname(sys_hostname, sizeof(sys_hostname)) == 0) { + sys_hostname[sizeof(sys_hostname)-1] = 0; + homehost = sys_hostname; + } + } + + for (md = mdstat ; md ; md = md->next) { + struct mdinfo *sra = sysfs_read(-1, md->devnm, GET_DEVS); + struct mdinfo *sd; + + if (!sra) + continue; + + for (sd = sra->devs ; sd ; sd = sd->next) { + char namebuf[100]; + char dn[30]; + int dfd; + int ok; + int devid; + struct supertype *st; + char *subarray = NULL; + char *path; + struct mdinfo *info; + + sprintf(dn, "%d:%d", sd->disk.major, sd->disk.minor); + dfd = dev_open(dn, O_RDONLY); + if (dfd < 0) + continue; + st = guess_super(dfd); + if ( st == NULL) + ok = -1; + else { + subarray = get_member_info(md); + ok = st->ss->load_super(st, dfd, NULL); + } + close(dfd); + if (ok != 0) + continue; + if (subarray) + info = st->ss->container_content(st, subarray); + else { + info = xmalloc(sizeof(*info)); + st->ss->getinfo_super(st, info, NULL); + } + if (!info) + continue; + + devid = devnm2devid(md->devnm); + path = map_dev(major(devid), minor(devid), 0); + if (path == NULL || + strncmp(path, "/dev/md/", 8) != 0) { + /* We would really like a name that provides + * an MD_DEVNAME for udev. + * The name needs to be unique both in /dev/md/ + * and in this mapfile. + * It needs to match what -I or -As would come + * up with. + * That means: + * Check if array is in mdadm.conf + * - if so use that. + * determine trustworthy from homehost etc + * find a unique name based on metadata name. + * + */ + struct mddev_ident *match = conf_match(st, info, + NULL, 0, + NULL); + struct stat stb; + if (match && match->devname && match->devname[0] == '/') { + path = match->devname; + if (path[0] != '/') { + strcpy(namebuf, "/dev/md/"); + strcat(namebuf, path); + path = namebuf; + } + } else { + int unum = 0; + char *sep = "_"; + const char *name; + int conflict = 1; + if ((homehost == NULL || + st->ss->match_home(st, homehost) != 1) && + st->ss->match_home(st, "any") != 1 && + (require_homehost + || ! conf_name_is_free(info->name))) + /* require a numeric suffix */ + unum = 0; + else + /* allow name to be used as-is if no conflict */ + unum = -1; + name = info->name; + if (!*name) { + name = st->ss->name; + if (!isdigit(name[strlen(name)-1]) && + unum == -1) { + unum = 0; + sep = ""; + } + } + if (strchr(name, ':')) { + /* Probably a uniquifying + * hostname prefix. Allow + * without a suffix, and strip + * hostname if it is us. + */ + if (homehost && unum == -1 && + strncmp(name, homehost, + strlen(homehost)) == 0 && + name[strlen(homehost)] == ':') + name += strlen(homehost)+1; + unum = -1; + } + + while (conflict) { + if (unum >= 0) + sprintf(namebuf, "/dev/md/%s%s%d", + name, sep, unum); + else + sprintf(namebuf, "/dev/md/%s", + name); + unum++; + if (lstat(namebuf, &stb) != 0 && + (map == NULL || + !map_by_name(&map, namebuf+8))) + conflict = 0; + } + path = namebuf; + } + } + map_add(&map, md->devnm, + info->text_version, + info->uuid, path); + st->ss->free_super(st); + free(info); + break; + } + sysfs_free(sra); + } + /* Only trigger a change if we wrote a new map file */ + if (map_write(map)) + for (md = mdstat ; md ; md = md->next) { + struct mdinfo *sra = sysfs_read(-1, md->devnm, + GET_VERSION); + if (sra) + sysfs_uevent(sra, "change"); + sysfs_free(sra); + } + map_free(map); + free_mdstat(mdstat); +} @@ -0,0 +1,150 @@ +/* + * mdadm - manage Linux "md" devices aka RAID arrays. + * + * Copyright (C) 2011 Neil Brown <neilb@suse.de> + * + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Author: Neil Brown + * Email: <neilb@suse.de> + */ + +#include "mdadm.h" + +/* name/number mappings */ + +mapping_t r5layout[] = { + { "left-asymmetric", ALGORITHM_LEFT_ASYMMETRIC}, + { "right-asymmetric", ALGORITHM_RIGHT_ASYMMETRIC}, + { "left-symmetric", ALGORITHM_LEFT_SYMMETRIC}, + { "right-symmetric", ALGORITHM_RIGHT_SYMMETRIC}, + + { "default", ALGORITHM_LEFT_SYMMETRIC}, + { "la", ALGORITHM_LEFT_ASYMMETRIC}, + { "ra", ALGORITHM_RIGHT_ASYMMETRIC}, + { "ls", ALGORITHM_LEFT_SYMMETRIC}, + { "rs", ALGORITHM_RIGHT_SYMMETRIC}, + + { "parity-first", ALGORITHM_PARITY_0}, + { "parity-last", ALGORITHM_PARITY_N}, + { "ddf-zero-restart", ALGORITHM_RIGHT_ASYMMETRIC}, + { "ddf-N-restart", ALGORITHM_LEFT_ASYMMETRIC}, + { "ddf-N-continue", ALGORITHM_LEFT_SYMMETRIC}, + + { NULL, 0} +}; +mapping_t r6layout[] = { + { "left-asymmetric", ALGORITHM_LEFT_ASYMMETRIC}, + { "right-asymmetric", ALGORITHM_RIGHT_ASYMMETRIC}, + { "left-symmetric", ALGORITHM_LEFT_SYMMETRIC}, + { "right-symmetric", ALGORITHM_RIGHT_SYMMETRIC}, + + { "default", ALGORITHM_LEFT_SYMMETRIC}, + { "la", ALGORITHM_LEFT_ASYMMETRIC}, + { "ra", ALGORITHM_RIGHT_ASYMMETRIC}, + { "ls", ALGORITHM_LEFT_SYMMETRIC}, + { "rs", ALGORITHM_RIGHT_SYMMETRIC}, + + { "parity-first", ALGORITHM_PARITY_0}, + { "parity-last", ALGORITHM_PARITY_N}, + { "ddf-zero-restart", ALGORITHM_ROTATING_ZERO_RESTART}, + { "ddf-N-restart", ALGORITHM_ROTATING_N_RESTART}, + { "ddf-N-continue", ALGORITHM_ROTATING_N_CONTINUE}, + + { "left-asymmetric-6", ALGORITHM_LEFT_ASYMMETRIC_6}, + { "right-asymmetric-6", ALGORITHM_RIGHT_ASYMMETRIC_6}, + { "left-symmetric-6", ALGORITHM_LEFT_SYMMETRIC_6}, + { "right-symmetric-6", ALGORITHM_RIGHT_SYMMETRIC_6}, + { "parity-first-6", ALGORITHM_PARITY_0_6}, + + { NULL, 0} +}; + +mapping_t pers[] = { + { "linear", LEVEL_LINEAR}, + { "raid0", 0}, + { "0", 0}, + { "stripe", 0}, + { "raid1", 1}, + { "1", 1}, + { "mirror", 1}, + { "raid4", 4}, + { "4", 4}, + { "raid5", 5}, + { "5", 5}, + { "multipath", LEVEL_MULTIPATH}, + { "mp", LEVEL_MULTIPATH}, + { "raid6", 6}, + { "6", 6}, + { "raid10", 10}, + { "10", 10}, + { "faulty", LEVEL_FAULTY}, + { "container", LEVEL_CONTAINER}, + { NULL, 0} +}; + +mapping_t modes[] = { + { "assemble", ASSEMBLE}, + { "build", BUILD}, + { "create", CREATE}, + { "manage", MANAGE}, + { "misc", MISC}, + { "monitor", MONITOR}, + { "grow", GROW}, + { "incremental", INCREMENTAL}, + { "auto-detect", AUTODETECT}, +}; + +mapping_t faultylayout[] = { + { "write-transient", WriteTransient }, + { "wt", WriteTransient }, + { "read-transient", ReadTransient }, + { "rt", ReadTransient }, + { "write-persistent", WritePersistent }, + { "wp", WritePersistent }, + { "read-persistent", ReadPersistent }, + { "rp", ReadPersistent }, + { "write-all", WriteAll }, + { "wa", WriteAll }, + { "read-fixable", ReadFixable }, + { "rf", ReadFixable }, + + { "clear", ClearErrors}, + { "flush", ClearFaults}, + { "none", ClearErrors}, + { "default", ClearErrors}, + { NULL, 0} +}; + +char *map_num(mapping_t *map, int num) +{ + while (map->name) { + if (map->num == num) + return map->name; + map++; + } + return NULL; +} + +int map_name(mapping_t *map, char *name) +{ + while (map->name) { + if (strcmp(map->name, name)==0) + return map->num; + map++; + } + return UnSet; +} @@ -0,0 +1,1125 @@ +.\" Copyright Neil Brown and others. +.\" This program is free software; you can redistribute it and/or modify +.\" it under the terms of the GNU General Public License as published by +.\" the Free Software Foundation; either version 2 of the License, or +.\" (at your option) any later version. +.\" See file COPYING in distribution for details. +.TH MD 4 +.SH NAME +md \- Multiple Device driver aka Linux Software RAID +.SH SYNOPSIS +.BI /dev/md n +.br +.BI /dev/md/ n +.br +.BR /dev/md/ name +.SH DESCRIPTION +The +.B md +driver provides virtual devices that are created from one or more +independent underlying devices. This array of devices often contains +redundancy and the devices are often disk drives, hence the acronym RAID +which stands for a Redundant Array of Independent Disks. +.PP +.B md +supports RAID levels +1 (mirroring), +4 (striped array with parity device), +5 (striped array with distributed parity information), +6 (striped array with distributed dual redundancy information), and +10 (striped and mirrored). +If some number of underlying devices fails while using one of these +levels, the array will continue to function; this number is one for +RAID levels 4 and 5, two for RAID level 6, and all but one (N-1) for +RAID level 1, and dependent on configuration for level 10. +.PP +.B md +also supports a number of pseudo RAID (non-redundant) configurations +including RAID0 (striped array), LINEAR (catenated array), +MULTIPATH (a set of different interfaces to the same device), +and FAULTY (a layer over a single device into which errors can be injected). + +.SS MD METADATA +Each device in an array may have some +.I metadata +stored in the device. This metadata is sometimes called a +.BR superblock . +The metadata records information about the structure and state of the array. +This allows the array to be reliably re-assembled after a shutdown. + +From Linux kernel version 2.6.10, +.B md +provides support for two different formats of metadata, and +other formats can be added. Prior to this release, only one format is +supported. + +The common format \(em known as version 0.90 \(em has +a superblock that is 4K long and is written into a 64K aligned block that +starts at least 64K and less than 128K from the end of the device +(i.e. to get the address of the superblock round the size of the +device down to a multiple of 64K and then subtract 64K). +The available size of each device is the amount of space before the +super block, so between 64K and 128K is lost when a device in +incorporated into an MD array. +This superblock stores multi-byte fields in a processor-dependent +manner, so arrays cannot easily be moved between computers with +different processors. + +The new format \(em known as version 1 \(em has a superblock that is +normally 1K long, but can be longer. It is normally stored between 8K +and 12K from the end of the device, on a 4K boundary, though +variations can be stored at the start of the device (version 1.1) or 4K from +the start of the device (version 1.2). +This metadata format stores multibyte data in a +processor-independent format and supports up to hundreds of +component devices (version 0.90 only supports 28). + +The metadata contains, among other things: +.TP +LEVEL +The manner in which the devices are arranged into the array +(LINEAR, RAID0, RAID1, RAID4, RAID5, RAID10, MULTIPATH). +.TP +UUID +a 128 bit Universally Unique Identifier that identifies the array that +contains this device. + +.PP +When a version 0.90 array is being reshaped (e.g. adding extra devices +to a RAID5), the version number is temporarily set to 0.91. This +ensures that if the reshape process is stopped in the middle (e.g. by +a system crash) and the machine boots into an older kernel that does +not support reshaping, then the array will not be assembled (which +would cause data corruption) but will be left untouched until a kernel +that can complete the reshape processes is used. + +.SS ARRAYS WITHOUT METADATA +While it is usually best to create arrays with superblocks so that +they can be assembled reliably, there are some circumstances when an +array without superblocks is preferred. These include: +.TP +LEGACY ARRAYS +Early versions of the +.B md +driver only supported LINEAR and RAID0 configurations and did not use +a superblock (which is less critical with these configurations). +While such arrays should be rebuilt with superblocks if possible, +.B md +continues to support them. +.TP +FAULTY +Being a largely transparent layer over a different device, the FAULTY +personality doesn't gain anything from having a superblock. +.TP +MULTIPATH +It is often possible to detect devices which are different paths to +the same storage directly rather than having a distinctive superblock +written to the device and searched for on all paths. In this case, +a MULTIPATH array with no superblock makes sense. +.TP +RAID1 +In some configurations it might be desired to create a RAID1 +configuration that does not use a superblock, and to maintain the state of +the array elsewhere. While not encouraged for general use, it does +have special-purpose uses and is supported. + +.SS ARRAYS WITH EXTERNAL METADATA + +From release 2.6.28, the +.I md +driver supports arrays with externally managed metadata. That is, +the metadata is not managed by the kernel but rather by a user-space +program which is external to the kernel. This allows support for a +variety of metadata formats without cluttering the kernel with lots of +details. +.PP +.I md +is able to communicate with the user-space program through various +sysfs attributes so that it can make appropriate changes to the +metadata \- for example to mark a device as faulty. When necessary, +.I md +will wait for the program to acknowledge the event by writing to a +sysfs attribute. +The manual page for +.IR mdmon (8) +contains more detail about this interaction. + +.SS CONTAINERS +Many metadata formats use a single block of metadata to describe a +number of different arrays which all use the same set of devices. +In this case it is helpful for the kernel to know about the full set +of devices as a whole. This set is known to md as a +.IR container . +A container is an +.I md +array with externally managed metadata and with device offset and size +so that it just covers the metadata part of the devices. The +remainder of each device is available to be incorporated into various +arrays. + +.SS LINEAR + +A LINEAR array simply catenates the available space on each +drive to form one large virtual drive. + +One advantage of this arrangement over the more common RAID0 +arrangement is that the array may be reconfigured at a later time with +an extra drive, so the array is made bigger without disturbing the +data that is on the array. This can even be done on a live +array. + +If a chunksize is given with a LINEAR array, the usable space on each +device is rounded down to a multiple of this chunksize. + +.SS RAID0 + +A RAID0 array (which has zero redundancy) is also known as a +striped array. +A RAID0 array is configured at creation with a +.B "Chunk Size" +which must be a power of two (prior to Linux 2.6.31), and at least 4 +kibibytes. + +The RAID0 driver assigns the first chunk of the array to the first +device, the second chunk to the second device, and so on until all +drives have been assigned one chunk. This collection of chunks forms a +.BR stripe . +Further chunks are gathered into stripes in the same way, and are +assigned to the remaining space in the drives. + +If devices in the array are not all the same size, then once the +smallest device has been exhausted, the RAID0 driver starts +collecting chunks into smaller stripes that only span the drives which +still have remaining space. + + +.SS RAID1 + +A RAID1 array is also known as a mirrored set (though mirrors tend to +provide reflected images, which RAID1 does not) or a plex. + +Once initialised, each device in a RAID1 array contains exactly the +same data. Changes are written to all devices in parallel. Data is +read from any one device. The driver attempts to distribute read +requests across all devices to maximise performance. + +All devices in a RAID1 array should be the same size. If they are +not, then only the amount of space available on the smallest device is +used (any extra space on other devices is wasted). + +Note that the read balancing done by the driver does not make the RAID1 +performance profile be the same as for RAID0; a single stream of +sequential input will not be accelerated (e.g. a single dd), but +multiple sequential streams or a random workload will use more than one +spindle. In theory, having an N-disk RAID1 will allow N sequential +threads to read from all disks. + +Individual devices in a RAID1 can be marked as "write-mostly". +These drives are excluded from the normal read balancing and will only +be read from when there is no other option. This can be useful for +devices connected over a slow link. + +.SS RAID4 + +A RAID4 array is like a RAID0 array with an extra device for storing +parity. This device is the last of the active devices in the +array. Unlike RAID0, RAID4 also requires that all stripes span all +drives, so extra space on devices that are larger than the smallest is +wasted. + +When any block in a RAID4 array is modified, the parity block for that +stripe (i.e. the block in the parity device at the same device offset +as the stripe) is also modified so that the parity block always +contains the "parity" for the whole stripe. I.e. its content is +equivalent to the result of performing an exclusive-or operation +between all the data blocks in the stripe. + +This allows the array to continue to function if one device fails. +The data that was on that device can be calculated as needed from the +parity block and the other data blocks. + +.SS RAID5 + +RAID5 is very similar to RAID4. The difference is that the parity +blocks for each stripe, instead of being on a single device, are +distributed across all devices. This allows more parallelism when +writing, as two different block updates will quite possibly affect +parity blocks on different devices so there is less contention. + +This also allows more parallelism when reading, as read requests are +distributed over all the devices in the array instead of all but one. + +.SS RAID6 + +RAID6 is similar to RAID5, but can handle the loss of any \fItwo\fP +devices without data loss. Accordingly, it requires N+2 drives to +store N drives worth of data. + +The performance for RAID6 is slightly lower but comparable to RAID5 in +normal mode and single disk failure mode. It is very slow in dual +disk failure mode, however. + +.SS RAID10 + +RAID10 provides a combination of RAID1 and RAID0, and is sometimes known +as RAID1+0. Every datablock is duplicated some number of times, and +the resulting collection of datablocks are distributed over multiple +drives. + +When configuring a RAID10 array, it is necessary to specify the number +of replicas of each data block that are required (this will usually +be\ 2) and whether their layout should be "near", "far" or "offset" +(with "offset" being available since Linux\ 2.6.18). + +.B About the RAID10 Layout Examples: +.br +The examples below visualise the chunk distribution on the underlying +devices for the respective layout. + +For simplicity it is assumed that the size of the chunks equals the +size of the blocks of the underlying devices as well as those of the +RAID10 device exported by the kernel (for example \fB/dev/md/\fPname). +.br +Therefore the chunks\ /\ chunk numbers map directly to the blocks\ /\ +block addresses of the exported RAID10 device. + +Decimal numbers (0,\ 1, 2,\ ...) are the chunks of the RAID10 and due +to the above assumption also the blocks and block addresses of the +exported RAID10 device. +.br +Repeated numbers mean copies of a chunk\ /\ block (obviously on +different underlying devices). +.br +Hexadecimal numbers (0x00,\ 0x01, 0x02,\ ...) are the block addresses +of the underlying devices. + +.TP +\fB "near" Layout\fP +When "near" replicas are chosen, the multiple copies of a given chunk are laid +out consecutively ("as close to each other as possible") across the stripes of +the array. + +With an even number of devices, they will likely (unless some misalignment is +present) lay at the very same offset on the different devices. +.br +This is as the "classic" RAID1+0; that is two groups of mirrored devices (in the +example below the groups Device\ #1\ /\ #2 and Device\ #3\ /\ #4 are each a +RAID1) both in turn forming a striped RAID0. + +.ne 10 +.B Example with 2\ copies per chunk and an even number\ (4) of devices: +.TS +tab(;); + C - - - - + C | C | C | C | C | +| - | - | - | - | - | +| C | C | C | C | C | +| C | C | C | C | C | +| C | C | C | C | C | +| C | C | C | C | C | +| C | C | C | C | C | +| C | C | C | C | C | +| - | - | - | - | - | + C C S C S + C C S C S + C C S S S + C C S S S. +; +;Device #1;Device #2;Device #3;Device #4 +0x00;0;0;1;1 +0x01;2;2;3;3 +\.\.\.;\.\.\.;\.\.\.;\.\.\.;\.\.\. +:;:;:;:;: +\.\.\.;\.\.\.;\.\.\.;\.\.\.;\.\.\. +0x80;254;254;255;255 +;\\---------v---------/;\\---------v---------/ +;RAID1;RAID1 +;\\---------------------v---------------------/ +;RAID0 +.TE + +.ne 10 +.B Example with 2\ copies per chunk and an odd number\ (5) of devices: +.TS +tab(;); + C - - - - - + C | C | C | C | C | C | +| - | - | - | - | - | - | +| C | C | C | C | C | C | +| C | C | C | C | C | C | +| C | C | C | C | C | C | +| C | C | C | C | C | C | +| C | C | C | C | C | C | +| C | C | C | C | C | C | +| - | - | - | - | - | - | +C. +; +;Dev #1;Dev #2;Dev #3;Dev #4;Dev #5 +0x00;0;0;1;1;2 +0x01;2;3;3;4;4 +\.\.\.;\.\.\.;\.\.\.;\.\.\.;\.\.\.;\.\.\. +:;:;:;:;:;: +\.\.\.;\.\.\.;\.\.\.;\.\.\.;\.\.\.;\.\.\. +0x80;317;318;318;319;319 +; +.TE + +.TP +\fB "far" Layout\fP +When "far" replicas are chosen, the multiple copies of a given chunk +are laid out quite distant ("as far as reasonably possible") from each +other. + +First a complete sequence of all data blocks (that is all the data one +sees on the exported RAID10 block device) is striped over the +devices. Then another (though "shifted") complete sequence of all data +blocks; and so on (in the case of more than 2\ copies per chunk). + +The "shift" needed to prevent placing copies of the same chunks on the +same devices is actually a cyclic permutation with offset\ 1 of each +of the stripes within a complete sequence of chunks. +.br +The offset\ 1 is relative to the previous complete sequence of chunks, +so in case of more than 2\ copies per chunk one gets the following +offsets: +.br +1.\ complete sequence of chunks: offset\ =\ \ 0 +.br +2.\ complete sequence of chunks: offset\ =\ \ 1 +.br +3.\ complete sequence of chunks: offset\ =\ \ 2 +.br + : +.br +n.\ complete sequence of chunks: offset\ =\ n-1 + +.ne 10 +.B Example with 2\ copies per chunk and an even number\ (4) of devices: +.TS +tab(;); + C - - - - + C | C | C | C | C | +| - | - | - | - | - | +| C | C | C | C | C | L +| C | C | C | C | C | L +| C | C | C | C | C | L +| C | C | C | C | C | L +| C | C | C | C | C | L +| C | C | C | C | C | L +| C | C | C | C | C | L +| C | C | C | C | C | L +| C | C | C | C | C | L +| C | C | C | C | C | L +| C | C | C | C | C | L +| C | C | C | C | C | L +| - | - | - | - | - | +C. +; +;Device #1;Device #2;Device #3;Device #4 +; +0x00;0;1;2;3;\\ +0x01;4;5;6;7;> [#] +\.\.\.;\.\.\.;\.\.\.;\.\.\.;\.\.\.;: +:;:;:;:;:;: +\.\.\.;\.\.\.;\.\.\.;\.\.\.;\.\.\.;: +0x40;252;253;254;255;/ +0x41;3;0;1;2;\\ +0x42;7;4;5;6;> [#]~ +\.\.\.;\.\.\.;\.\.\.;\.\.\.;\.\.\.;: +:;:;:;:;:;: +\.\.\.;\.\.\.;\.\.\.;\.\.\.;\.\.\.;: +0x80;255;252;253;254;/ +; +.TE + +.ne 10 +.B Example with 2\ copies per chunk and an odd number\ (5) of devices: +.TS +tab(;); + C - - - - - + C | C | C | C | C | C | +| - | - | - | - | - | - | +| C | C | C | C | C | C | L +| C | C | C | C | C | C | L +| C | C | C | C | C | C | L +| C | C | C | C | C | C | L +| C | C | C | C | C | C | L +| C | C | C | C | C | C | L +| C | C | C | C | C | C | L +| C | C | C | C | C | C | L +| C | C | C | C | C | C | L +| C | C | C | C | C | C | L +| C | C | C | C | C | C | L +| C | C | C | C | C | C | L +| - | - | - | - | - | - | +C. +; +;Dev #1;Dev #2;Dev #3;Dev #4;Dev #5 +; +0x00;0;1;2;3;4;\\ +0x01;5;6;7;8;9;> [#] +\.\.\.;\.\.\.;\.\.\.;\.\.\.;\.\.\.;\.\.\.;: +:;:;:;:;:;:;: +\.\.\.;\.\.\.;\.\.\.;\.\.\.;\.\.\.;\.\.\.;: +0x40;315;316;317;318;319;/ +0x41;4;0;1;2;3;\\ +0x42;9;5;6;7;8;> [#]~ +\.\.\.;\.\.\.;\.\.\.;\.\.\.;\.\.\.;\.\.\.;: +:;:;:;:;:;:;: +\.\.\.;\.\.\.;\.\.\.;\.\.\.;\.\.\.;\.\.\.;: +0x80;319;315;316;317;318;/ +; +.TE + +With [#]\ being the complete sequence of chunks and [#]~\ the cyclic permutation +with offset\ 1 thereof (in the case of more than 2 copies per chunk there would +be ([#]~)~,\ (([#]~)~)~,\ ...). + +The advantage of this layout is that MD can easily spread sequential reads over +the devices, making them similar to RAID0 in terms of speed. +.br +The cost is more seeking for writes, making them substantially slower. + +.TP +\fB"offset" Layout\fP +When "offset" replicas are chosen, all the copies of a given chunk are +striped consecutively ("offset by the stripe length after each other") +over the devices. + +Explained in detail, <number of devices> consecutive chunks are +striped over the devices, immediately followed by a "shifted" copy of +these chunks (and by further such "shifted" copies in the case of more +than 2\ copies per chunk). +.br +This pattern repeats for all further consecutive chunks of the +exported RAID10 device (in other words: all further data blocks). + +The "shift" needed to prevent placing copies of the same chunks on the +same devices is actually a cyclic permutation with offset\ 1 of each +of the striped copies of <number of devices> consecutive chunks. +.br +The offset\ 1 is relative to the previous striped copy of <number of +devices> consecutive chunks, so in case of more than 2\ copies per +chunk one gets the following offsets: +.br +1.\ <number of devices> consecutive chunks: offset\ =\ \ 0 +.br +2.\ <number of devices> consecutive chunks: offset\ =\ \ 1 +.br +3.\ <number of devices> consecutive chunks: offset\ =\ \ 2 +.br + : +.br +n.\ <number of devices> consecutive chunks: offset\ =\ n-1 + +.ne 10 +.B Example with 2\ copies per chunk and an even number\ (4) of devices: +.TS +tab(;); + C - - - - + C | C | C | C | C | +| - | - | - | - | - | +| C | C | C | C | C | L +| C | C | C | C | C | L +| C | C | C | C | C | L +| C | C | C | C | C | L +| C | C | C | C | C | L +| C | C | C | C | C | L +| C | C | C | C | C | L +| C | C | C | C | C | L +| C | C | C | C | C | L +| - | - | - | - | - | +C. +; +;Device #1;Device #2;Device #3;Device #4 +; +0x00;0;1;2;3;) AA +0x01;3;0;1;2;) AA~ +0x02;4;5;6;7;) AB +0x03;7;4;5;6;) AB~ +\.\.\.;\.\.\.;\.\.\.;\.\.\.;\.\.\.;) \.\.\. +:;:;:;:;:; : +\.\.\.;\.\.\.;\.\.\.;\.\.\.;\.\.\.;) \.\.\. +0x79;251;252;253;254;) EX +0x80;254;251;252;253;) EX~ +; +.TE + +.ne 10 +.B Example with 2\ copies per chunk and an odd number\ (5) of devices: +.TS +tab(;); + C - - - - - + C | C | C | C | C | C | +| - | - | - | - | - | - | +| C | C | C | C | C | C | L +| C | C | C | C | C | C | L +| C | C | C | C | C | C | L +| C | C | C | C | C | C | L +| C | C | C | C | C | C | L +| C | C | C | C | C | C | L +| C | C | C | C | C | C | L +| C | C | C | C | C | C | L +| C | C | C | C | C | C | L +| - | - | - | - | - | - | +C. +; +;Dev #1;Dev #2;Dev #3;Dev #4;Dev #5 +; +0x00;0;1;2;3;4;) AA +0x01;4;0;1;2;3;) AA~ +0x02;5;6;7;8;9;) AB +0x03;9;5;6;7;8;) AB~ +\.\.\.;\.\.\.;\.\.\.;\.\.\.;\.\.\.;\.\.\.;) \.\.\. +:;:;:;:;:;:; : +\.\.\.;\.\.\.;\.\.\.;\.\.\.;\.\.\.;\.\.\.;) \.\.\. +0x79;314;315;316;317;318;) EX +0x80;318;314;315;316;317;) EX~ +; +.TE + +With AA,\ AB,\ ..., AZ,\ BA,\ ... being the sets of <number of devices> consecutive +chunks and AA~,\ AB~,\ ..., AZ~,\ BA~,\ ... the cyclic permutations with offset\ 1 +thereof (in the case of more than 2 copies per chunk there would be (AA~)~,\ ... +as well as ((AA~)~)~,\ ... and so on). + +This should give similar read characteristics to "far" if a suitably large chunk +size is used, but without as much seeking for writes. +.PP + + +It should be noted that the number of devices in a RAID10 array need +not be a multiple of the number of replica of each data block; however, +there must be at least as many devices as replicas. + +If, for example, an array is created with 5 devices and 2 replicas, +then space equivalent to 2.5 of the devices will be available, and +every block will be stored on two different devices. + +Finally, it is possible to have an array with both "near" and "far" +copies. If an array is configured with 2 near copies and 2 far +copies, then there will be a total of 4 copies of each block, each on +a different drive. This is an artifact of the implementation and is +unlikely to be of real value. + +.SS MULTIPATH + +MULTIPATH is not really a RAID at all as there is only one real device +in a MULTIPATH md array. However there are multiple access points +(paths) to this device, and one of these paths might fail, so there +are some similarities. + +A MULTIPATH array is composed of a number of logically different +devices, often fibre channel interfaces, that all refer the the same +real device. If one of these interfaces fails (e.g. due to cable +problems), the MULTIPATH driver will attempt to redirect requests to +another interface. + +The MULTIPATH drive is not receiving any ongoing development and +should be considered a legacy driver. The device-mapper based +multipath drivers should be preferred for new installations. + +.SS FAULTY +The FAULTY md module is provided for testing purposes. A FAULTY array +has exactly one component device and is normally assembled without a +superblock, so the md array created provides direct access to all of +the data in the component device. + +The FAULTY module may be requested to simulate faults to allow testing +of other md levels or of filesystems. Faults can be chosen to trigger +on read requests or write requests, and can be transient (a subsequent +read/write at the address will probably succeed) or persistent +(subsequent read/write of the same address will fail). Further, read +faults can be "fixable" meaning that they persist until a write +request at the same address. + +Fault types can be requested with a period. In this case, the fault +will recur repeatedly after the given number of requests of the +relevant type. For example if persistent read faults have a period of +100, then every 100th read request would generate a fault, and the +faulty sector would be recorded so that subsequent reads on that +sector would also fail. + +There is a limit to the number of faulty sectors that are remembered. +Faults generated after this limit is exhausted are treated as +transient. + +The list of faulty sectors can be flushed, and the active list of +failure modes can be cleared. + +.SS UNCLEAN SHUTDOWN + +When changes are made to a RAID1, RAID4, RAID5, RAID6, or RAID10 array +there is a possibility of inconsistency for short periods of time as +each update requires at least two block to be written to different +devices, and these writes probably won't happen at exactly the same +time. Thus if a system with one of these arrays is shutdown in the +middle of a write operation (e.g. due to power failure), the array may +not be consistent. + +To handle this situation, the md driver marks an array as "dirty" +before writing any data to it, and marks it as "clean" when the array +is being disabled, e.g. at shutdown. If the md driver finds an array +to be dirty at startup, it proceeds to correct any possibly +inconsistency. For RAID1, this involves copying the contents of the +first drive onto all other drives. For RAID4, RAID5 and RAID6 this +involves recalculating the parity for each stripe and making sure that +the parity block has the correct data. For RAID10 it involves copying +one of the replicas of each block onto all the others. This process, +known as "resynchronising" or "resync" is performed in the background. +The array can still be used, though possibly with reduced performance. + +If a RAID4, RAID5 or RAID6 array is degraded (missing at least one +drive, two for RAID6) when it is restarted after an unclean shutdown, it cannot +recalculate parity, and so it is possible that data might be +undetectably corrupted. The 2.4 md driver +.B does not +alert the operator to this condition. The 2.6 md driver will fail to +start an array in this condition without manual intervention, though +this behaviour can be overridden by a kernel parameter. + +.SS RECOVERY + +If the md driver detects a write error on a device in a RAID1, RAID4, +RAID5, RAID6, or RAID10 array, it immediately disables that device +(marking it as faulty) and continues operation on the remaining +devices. If there are spare drives, the driver will start recreating +on one of the spare drives the data which was on that failed drive, +either by copying a working drive in a RAID1 configuration, or by +doing calculations with the parity block on RAID4, RAID5 or RAID6, or +by finding and copying originals for RAID10. + +In kernels prior to about 2.6.15, a read error would cause the same +effect as a write error. In later kernels, a read-error will instead +cause md to attempt a recovery by overwriting the bad block. i.e. it +will find the correct data from elsewhere, write it over the block +that failed, and then try to read it back again. If either the write +or the re-read fail, md will treat the error the same way that a write +error is treated, and will fail the whole device. + +While this recovery process is happening, the md driver will monitor +accesses to the array and will slow down the rate of recovery if other +activity is happening, so that normal access to the array will not be +unduly affected. When no other activity is happening, the recovery +process proceeds at full speed. The actual speed targets for the two +different situations can be controlled by the +.B speed_limit_min +and +.B speed_limit_max +control files mentioned below. + +.SS SCRUBBING AND MISMATCHES + +As storage devices can develop bad blocks at any time it is valuable +to regularly read all blocks on all devices in an array so as to catch +such bad blocks early. This process is called +.IR scrubbing . + +md arrays can be scrubbed by writing either +.I check +or +.I repair +to the file +.I md/sync_action +in the +.I sysfs +directory for the device. + +Requesting a scrub will cause +.I md +to read every block on every device in the array, and check that the +data is consistent. For RAID1 and RAID10, this means checking that the copies +are identical. For RAID4, RAID5, RAID6 this means checking that the +parity block is (or blocks are) correct. + +If a read error is detected during this process, the normal read-error +handling causes correct data to be found from other devices and to be +written back to the faulty device. In many case this will +effectively +.I fix +the bad block. + +If all blocks read successfully but are found to not be consistent, +then this is regarded as a +.IR mismatch . + +If +.I check +was used, then no action is taken to handle the mismatch, it is simply +recorded. +If +.I repair +was used, then a mismatch will be repaired in the same way that +.I resync +repairs arrays. For RAID5/RAID6 new parity blocks are written. For RAID1/RAID10, +all but one block are overwritten with the content of that one block. + +A count of mismatches is recorded in the +.I sysfs +file +.IR md/mismatch_cnt . +This is set to zero when a +scrub starts and is incremented whenever a sector is +found that is a mismatch. +.I md +normally works in units much larger than a single sector and when it +finds a mismatch, it does not determine exactly how many actual sectors were +affected but simply adds the number of sectors in the IO unit that was +used. So a value of 128 could simply mean that a single 64KB check +found an error (128 x 512bytes = 64KB). + +If an array is created by +.I mdadm +with +.I \-\-assume\-clean +then a subsequent check could be expected to find some mismatches. + +On a truly clean RAID5 or RAID6 array, any mismatches should indicate +a hardware problem at some level - software issues should never cause +such a mismatch. + +However on RAID1 and RAID10 it is possible for software issues to +cause a mismatch to be reported. This does not necessarily mean that +the data on the array is corrupted. It could simply be that the +system does not care what is stored on that part of the array - it is +unused space. + +The most likely cause for an unexpected mismatch on RAID1 or RAID10 +occurs if a swap partition or swap file is stored on the array. + +When the swap subsystem wants to write a page of memory out, it flags +the page as 'clean' in the memory manager and requests the swap device +to write it out. It is quite possible that the memory will be +changed while the write-out is happening. In that case the 'clean' +flag will be found to be clear when the write completes and so the +swap subsystem will simply forget that the swapout had been attempted, +and will possibly choose a different page to write out. + +If the swap device was on RAID1 (or RAID10), then the data is sent +from memory to a device twice (or more depending on the number of +devices in the array). Thus it is possible that the memory gets changed +between the times it is sent, so different data can be written to +the different devices in the array. This will be detected by +.I check +as a mismatch. However it does not reflect any corruption as the +block where this mismatch occurs is being treated by the swap system as +being empty, and the data will never be read from that block. + +It is conceivable for a similar situation to occur on non-swap files, +though it is less likely. + +Thus the +.I mismatch_cnt +value can not be interpreted very reliably on RAID1 or RAID10, +especially when the device is used for swap. + + +.SS BITMAP WRITE-INTENT LOGGING + +From Linux 2.6.13, +.I md +supports a bitmap based write-intent log. If configured, the bitmap +is used to record which blocks of the array may be out of sync. +Before any write request is honoured, md will make sure that the +corresponding bit in the log is set. After a period of time with no +writes to an area of the array, the corresponding bit will be cleared. + +This bitmap is used for two optimisations. + +Firstly, after an unclean shutdown, the resync process will consult +the bitmap and only resync those blocks that correspond to bits in the +bitmap that are set. This can dramatically reduce resync time. + +Secondly, when a drive fails and is removed from the array, md stops +clearing bits in the intent log. If that same drive is re-added to +the array, md will notice and will only recover the sections of the +drive that are covered by bits in the intent log that are set. This +can allow a device to be temporarily removed and reinserted without +causing an enormous recovery cost. + +The intent log can be stored in a file on a separate device, or it can +be stored near the superblocks of an array which has superblocks. + +It is possible to add an intent log to an active array, or remove an +intent log if one is present. + +In 2.6.13, intent bitmaps are only supported with RAID1. Other levels +with redundancy are supported from 2.6.15. + +.SS BAD BLOCK LIST + +From Linux 3.5 each device in an +.I md +array can store a list of known-bad-blocks. This list is 4K in size +and usually positioned at the end of the space between the superblock +and the data. + +When a block cannot be read and cannot be repaired by writing data +recovered from other devices, the address of the block is stored in +the bad block list. Similarly if an attempt to write a block fails, +the address will be recorded as a bad block. If attempting to record +the bad block fails, the whole device will be marked faulty. + +Attempting to read from a known bad block will cause a read error. +Attempting to write to a known bad block will be ignored if any write +errors have been reported by the device. If there have been no write +errors then the data will be written to the known bad block and if +that succeeds, the address will be removed from the list. + +This allows an array to fail more gracefully - a few blocks on +different devices can be faulty without taking the whole array out of +action. + +The list is particularly useful when recovering to a spare. If a few blocks +cannot be read from the other devices, the bulk of the recovery can +complete and those few bad blocks will be recorded in the bad block list. + +.SS WRITE-BEHIND + +From Linux 2.6.14, +.I md +supports WRITE-BEHIND on RAID1 arrays. + +This allows certain devices in the array to be flagged as +.IR write-mostly . +MD will only read from such devices if there is no +other option. + +If a write-intent bitmap is also provided, write requests to +write-mostly devices will be treated as write-behind requests and md +will not wait for writes to those requests to complete before +reporting the write as complete to the filesystem. + +This allows for a RAID1 with WRITE-BEHIND to be used to mirror data +over a slow link to a remote computer (providing the link isn't too +slow). The extra latency of the remote link will not slow down normal +operations, but the remote system will still have a reasonably +up-to-date copy of all data. + +.SS RESTRIPING + +.IR Restriping , +also known as +.IR Reshaping , +is the processes of re-arranging the data stored in each stripe into a +new layout. This might involve changing the number of devices in the +array (so the stripes are wider), changing the chunk size (so stripes +are deeper or shallower), or changing the arrangement of data and +parity (possibly changing the RAID level, e.g. 1 to 5 or 5 to 6). + +As of Linux 2.6.35, md can reshape a RAID4, RAID5, or RAID6 array to +have a different number of devices (more or fewer) and to have a +different layout or chunk size. It can also convert between these +different RAID levels. It can also convert between RAID0 and RAID10, +and between RAID0 and RAID4 or RAID5. +Other possibilities may follow in future kernels. + +During any stripe process there is a 'critical section' during which +live data is being overwritten on disk. For the operation of +increasing the number of drives in a RAID5, this critical section +covers the first few stripes (the number being the product of the old +and new number of devices). After this critical section is passed, +data is only written to areas of the array which no longer hold live +data \(em the live data has already been located away. + +For a reshape which reduces the number of devices, the 'critical +section' is at the end of the reshape process. + +md is not able to ensure data preservation if there is a crash +(e.g. power failure) during the critical section. If md is asked to +start an array which failed during a critical section of restriping, +it will fail to start the array. + +To deal with this possibility, a user-space program must +.IP \(bu 4 +Disable writes to that section of the array (using the +.B sysfs +interface), +.IP \(bu 4 +take a copy of the data somewhere (i.e. make a backup), +.IP \(bu 4 +allow the process to continue and invalidate the backup and restore +write access once the critical section is passed, and +.IP \(bu 4 +provide for restoring the critical data before restarting the array +after a system crash. +.PP + +.B mdadm +versions from 2.4 do this for growing a RAID5 array. + +For operations that do not change the size of the array, like simply +increasing chunk size, or converting RAID5 to RAID6 with one extra +device, the entire process is the critical section. In this case, the +restripe will need to progress in stages, as a section is suspended, +backed up, restriped, and released. + +.SS SYSFS INTERFACE +Each block device appears as a directory in +.I sysfs +(which is usually mounted at +.BR /sys ). +For MD devices, this directory will contain a subdirectory called +.B md +which contains various files for providing access to information about +the array. + +This interface is documented more fully in the file +.B Documentation/md.txt +which is distributed with the kernel sources. That file should be +consulted for full documentation. The following are just a selection +of attribute files that are available. + +.TP +.B md/sync_speed_min +This value, if set, overrides the system-wide setting in +.B /proc/sys/dev/raid/speed_limit_min +for this array only. +Writing the value +.B "system" +to this file will cause the system-wide setting to have effect. + +.TP +.B md/sync_speed_max +This is the partner of +.B md/sync_speed_min +and overrides +.B /proc/sys/dev/raid/speed_limit_max +described below. + +.TP +.B md/sync_action +This can be used to monitor and control the resync/recovery process of +MD. +In particular, writing "check" here will cause the array to read all +data block and check that they are consistent (e.g. parity is correct, +or all mirror replicas are the same). Any discrepancies found are +.B NOT +corrected. + +A count of problems found will be stored in +.BR md/mismatch_count . + +Alternately, "repair" can be written which will cause the same check +to be performed, but any errors will be corrected. + +Finally, "idle" can be written to stop the check/repair process. + +.TP +.B md/stripe_cache_size +This is only available on RAID5 and RAID6. It records the size (in +pages per device) of the stripe cache which is used for synchronising +all write operations to the array and all read operations if the array +is degraded. The default is 256. Valid values are 17 to 32768. +Increasing this number can increase performance in some situations, at +some cost in system memory. Note, setting this value too high can +result in an "out of memory" condition for the system. + +memory_consumed = system_page_size * nr_disks * stripe_cache_size + +.TP +.B md/preread_bypass_threshold +This is only available on RAID5 and RAID6. This variable sets the +number of times MD will service a full-stripe-write before servicing a +stripe that requires some "prereading". For fairness this defaults to +1. Valid values are 0 to stripe_cache_size. Setting this to 0 +maximizes sequential-write throughput at the cost of fairness to threads +doing small or random writes. + +.SS KERNEL PARAMETERS + +The md driver recognised several different kernel parameters. +.TP +.B raid=noautodetect +This will disable the normal detection of md arrays that happens at +boot time. If a drive is partitioned with MS-DOS style partitions, +then if any of the 4 main partitions has a partition type of 0xFD, +then that partition will normally be inspected to see if it is part of +an MD array, and if any full arrays are found, they are started. This +kernel parameter disables this behaviour. + +.TP +.B raid=partitionable +.TP +.B raid=part +These are available in 2.6 and later kernels only. They indicate that +autodetected MD arrays should be created as partitionable arrays, with +a different major device number to the original non-partitionable md +arrays. The device number is listed as +.I mdp +in +.IR /proc/devices . + +.TP +.B md_mod.start_ro=1 +.TP +.B /sys/module/md_mod/parameters/start_ro +This tells md to start all arrays in read-only mode. This is a soft +read-only that will automatically switch to read-write on the first +write request. However until that write request, nothing is written +to any device by md, and in particular, no resync or recovery +operation is started. + +.TP +.B md_mod.start_dirty_degraded=1 +.TP +.B /sys/module/md_mod/parameters/start_dirty_degraded +As mentioned above, md will not normally start a RAID4, RAID5, or +RAID6 that is both dirty and degraded as this situation can imply +hidden data loss. This can be awkward if the root filesystem is +affected. Using this module parameter allows such arrays to be started +at boot time. It should be understood that there is a real (though +small) risk of data corruption in this situation. + +.TP +.BI md= n , dev , dev ,... +.TP +.BI md=d n , dev , dev ,... +This tells the md driver to assemble +.B /dev/md n +from the listed devices. It is only necessary to start the device +holding the root filesystem this way. Other arrays are best started +once the system is booted. + +In 2.6 kernels, the +.B d +immediately after the +.B = +indicates that a partitionable device (e.g. +.BR /dev/md/d0 ) +should be created rather than the original non-partitionable device. + +.TP +.BI md= n , l , c , i , dev... +This tells the md driver to assemble a legacy RAID0 or LINEAR array +without a superblock. +.I n +gives the md device number, +.I l +gives the level, 0 for RAID0 or \-1 for LINEAR, +.I c +gives the chunk size as a base-2 logarithm offset by twelve, so 0 +means 4K, 1 means 8K. +.I i +is ignored (legacy support). + +.SH FILES +.TP +.B /proc/mdstat +Contains information about the status of currently running array. +.TP +.B /proc/sys/dev/raid/speed_limit_min +A readable and writable file that reflects the current "goal" rebuild +speed for times when non-rebuild activity is current on an array. +The speed is in Kibibytes per second, and is a per-device rate, not a +per-array rate (which means that an array with more disks will shuffle +more data for a given speed). The default is 1000. + +.TP +.B /proc/sys/dev/raid/speed_limit_max +A readable and writable file that reflects the current "goal" rebuild +speed for times when no non-rebuild activity is current on an array. +The default is 200,000. + +.SH SEE ALSO +.BR mdadm (8), @@ -0,0 +1,136 @@ +/* Declaration of functions and data types used for MD5 sum computing + library functions. + Copyright (C) 1995-1997,1999-2005 Free Software Foundation, Inc. + + NOTE: The canonical source of this file is maintained with the GNU C + Library. Bugs can be reported to bug-glibc@prep.ai.mit.edu. + + This program is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published by the + Free Software Foundation; either version 2, or (at your option) any + later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software Foundation, + Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */ + +#ifndef _MD5_H +#define _MD5_H 1 + +#include <stdio.h> + +#if HAVE_INTTYPES_H +# include <inttypes.h> +#endif +#if HAVE_STDINT_H || _LIBC || defined __UCLIBC__ +# include <stdint.h> +#endif + +#ifndef __GNUC_PREREQ +# if defined __GNUC__ && defined __GNUC_MINOR__ +# define __GNUC_PREREQ(maj, min) \ + ((__GNUC__ << 16) + __GNUC_MINOR__ >= ((maj) << 16) + (min)) +# else +# define __GNUC_PREREQ(maj, min) 0 +# endif +#endif + +#ifndef __THROW +# if defined __cplusplus && __GNUC_PREREQ (2,8) +# define __THROW throw () +# else +# define __THROW +# endif +#endif + +#ifndef __attribute__ +# if ! __GNUC_PREREQ (2,8) || __STRICT_ANSI__ +# define __attribute__(x) +# endif +#endif + +#ifndef _LIBC +# define __md5_buffer md5_buffer +# define __md5_finish_ctx md5_finish_ctx +# define __md5_init_ctx md5_init_ctx +# define __md5_process_block md5_process_block +# define __md5_process_bytes md5_process_bytes +# define __md5_read_ctx md5_read_ctx +# define __md5_stream md5_stream +#endif + +typedef uint32_t md5_uint32; + +/* Structure to save state of computation between the single steps. */ +struct md5_ctx +{ + md5_uint32 A; + md5_uint32 B; + md5_uint32 C; + md5_uint32 D; + + md5_uint32 total[2]; + md5_uint32 buflen; + char buffer[128] __attribute__ ((__aligned__ (__alignof__ (md5_uint32)))); +}; + +/* + * The following three functions are build up the low level used in + * the functions `md5_stream' and `md5_buffer'. + */ + +/* Initialize structure containing state of computation. + (RFC 1321, 3.3: Step 3) */ +extern void __md5_init_ctx (struct md5_ctx *ctx) __THROW; + +/* Starting with the result of former calls of this function (or the + initialization function update the context for the next LEN bytes + starting at BUFFER. + It is necessary that LEN is a multiple of 64!!! */ +extern void __md5_process_block (const void *buffer, size_t len, + struct md5_ctx *ctx) __THROW; + +/* Starting with the result of former calls of this function (or the + initialization function update the context for the next LEN bytes + starting at BUFFER. + It is NOT required that LEN is a multiple of 64. */ +extern void __md5_process_bytes (const void *buffer, size_t len, + struct md5_ctx *ctx) __THROW; + +/* Process the remaining bytes in the buffer and put result from CTX + in first 16 bytes following RESBUF. The result is always in little + endian byte order, so that a byte-wise output yields to the wanted + ASCII representation of the message digest. + + IMPORTANT: On some systems it is required that RESBUF be correctly + aligned for a 32 bits value. */ +extern void *__md5_finish_ctx (struct md5_ctx *ctx, void *resbuf) __THROW; + + +/* Put result from CTX in first 16 bytes following RESBUF. The result is + always in little endian byte order, so that a byte-wise output yields + to the wanted ASCII representation of the message digest. + + IMPORTANT: On some systems it is required that RESBUF is correctly + aligned for a 32 bits value. */ +extern void *__md5_read_ctx (const struct md5_ctx *ctx, void *resbuf) __THROW; + + +/* Compute MD5 message digest for bytes read from STREAM. The + resulting message digest number will be written into the 16 bytes + beginning at RESBLOCK. */ +extern int __md5_stream (FILE *stream, void *resblock) __THROW; + +/* Compute MD5 message digest for LEN bytes beginning at BUFFER. The + result is always in little endian byte order, so that a byte-wise + output yields to the wanted ASCII representation of the message + digest. */ +extern void *__md5_buffer (const char *buffer, size_t len, + void *resblock) __THROW; + +#endif /* md5.h */ @@ -0,0 +1,198 @@ +/* + md_p.h : physical layout of Linux RAID devices + Copyright (C) 1996-98 Ingo Molnar, Gadi Oxman + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + You should have received a copy of the GNU General Public License + (for example /usr/src/linux/COPYING); if not, write to the Free + Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. +*/ + +#ifndef _MD_P_H +#define _MD_P_H + +/* + * RAID superblock. + * + * The RAID superblock maintains some statistics on each RAID configuration. + * Each real device in the RAID set contains it near the end of the device. + * Some of the ideas are copied from the ext2fs implementation. + * + * We currently use 4096 bytes as follows: + * + * word offset function + * + * 0 - 31 Constant generic RAID device information. + * 32 - 63 Generic state information. + * 64 - 127 Personality specific information. + * 128 - 511 12 32-words descriptors of the disks in the raid set. + * 512 - 911 Reserved. + * 912 - 1023 Disk specific descriptor. + */ + +/* + * If x is the real device size in bytes, we return an apparent size of: + * + * y = (x & ~(MD_RESERVED_BYTES - 1)) - MD_RESERVED_BYTES + * + * and place the 4kB superblock at offset y. + */ +#define MD_RESERVED_BYTES (64 * 1024) +#define MD_RESERVED_SECTORS (MD_RESERVED_BYTES / 512) +#define MD_RESERVED_BLOCKS (MD_RESERVED_BYTES / BLOCK_SIZE) + +#define MD_NEW_SIZE_SECTORS(x) ((x & ~(MD_RESERVED_SECTORS - 1)) - MD_RESERVED_SECTORS) +#define MD_NEW_SIZE_BLOCKS(x) ((x & ~(MD_RESERVED_BLOCKS - 1)) - MD_RESERVED_BLOCKS) + +#define MD_SB_BYTES 4096 +#define MD_SB_WORDS (MD_SB_BYTES / 4) +#define MD_SB_BLOCKS (MD_SB_BYTES / BLOCK_SIZE) +#define MD_SB_SECTORS (MD_SB_BYTES / 512) + +/* + * The following are counted in 32-bit words + */ +#define MD_SB_GENERIC_OFFSET 0 +#define MD_SB_PERSONALITY_OFFSET 64 +#define MD_SB_DISKS_OFFSET 128 +#define MD_SB_DESCRIPTOR_OFFSET 992 + +#define MD_SB_GENERIC_CONSTANT_WORDS 32 +#define MD_SB_GENERIC_STATE_WORDS 32 +#define MD_SB_GENERIC_WORDS (MD_SB_GENERIC_CONSTANT_WORDS + MD_SB_GENERIC_STATE_WORDS) +#define MD_SB_PERSONALITY_WORDS 64 +#define MD_SB_DESCRIPTOR_WORDS 32 +#define MD_SB_DISKS 27 +#define MD_SB_DISKS_WORDS (MD_SB_DISKS*MD_SB_DESCRIPTOR_WORDS) +#define MD_SB_RESERVED_WORDS (1024 - MD_SB_GENERIC_WORDS - MD_SB_PERSONALITY_WORDS - MD_SB_DISKS_WORDS - MD_SB_DESCRIPTOR_WORDS) +#define MD_SB_EQUAL_WORDS (MD_SB_GENERIC_WORDS + MD_SB_PERSONALITY_WORDS + MD_SB_DISKS_WORDS) + +/* + * Device "operational" state bits + */ +#define MD_DISK_FAULTY 0 /* disk is faulty / operational */ +#define MD_DISK_ACTIVE 1 /* disk is running but may not be in sync */ +#define MD_DISK_SYNC 2 /* disk is in sync with the raid set */ +#define MD_DISK_REMOVED 3 /* disk is in sync with the raid set */ + +#define MD_DISK_WRITEMOSTLY 9 /* disk is "write-mostly" is RAID1 config. + * read requests will only be sent here in + * dire need + */ + +#define MD_DISK_REPLACEMENT 17 + +typedef struct mdp_device_descriptor_s { + __u32 number; /* 0 Device number in the entire set */ + __u32 major; /* 1 Device major number */ + __u32 minor; /* 2 Device minor number */ + __u32 raid_disk; /* 3 The role of the device in the raid set */ + __u32 state; /* 4 Operational state */ + __u32 reserved[MD_SB_DESCRIPTOR_WORDS - 5]; +} mdp_disk_t; + +#define MD_SB_MAGIC 0xa92b4efc + +/* + * Superblock state bits + */ +#define MD_SB_CLEAN 0 +#define MD_SB_ERRORS 1 +#define MD_SB_BBM_ERRORS 2 +#define MD_SB_BLOCK_CONTAINER_RESHAPE 3 /* block container wide reshapes */ +#define MD_SB_BLOCK_VOLUME 4 /* block activation of array, other arrays + * in container can be activated */ +#define MD_SB_BITMAP_PRESENT 8 /* bitmap may be present nearby */ + +typedef struct mdp_superblock_s { + /* + * Constant generic information + */ + __u32 md_magic; /* 0 MD identifier */ + __u32 major_version; /* 1 major version to which the set conforms */ + __u32 minor_version; /* 2 minor version ... */ + __u32 patch_version; /* 3 patchlevel version ... */ + __u32 gvalid_words; /* 4 Number of used words in this section */ + __u32 set_uuid0; /* 5 Raid set identifier */ + __u32 ctime; /* 6 Creation time */ + __u32 level; /* 7 Raid personality */ + __u32 size; /* 8 Apparent size of each individual disk */ + __u32 nr_disks; /* 9 total disks in the raid set */ + __u32 raid_disks; /* 10 disks in a fully functional raid set */ + __u32 md_minor; /* 11 preferred MD minor device number */ + __u32 not_persistent; /* 12 does it have a persistent superblock */ + __u32 set_uuid1; /* 13 Raid set identifier #2 */ + __u32 set_uuid2; /* 14 Raid set identifier #3 */ + __u32 set_uuid3; /* 15 Raid set identifier #4 */ + __u32 gstate_creserved[MD_SB_GENERIC_CONSTANT_WORDS - 16]; + + /* + * Generic state information + */ + __u32 utime; /* 0 Superblock update time */ + __u32 state; /* 1 State bits (clean, ...) */ + __u32 active_disks; /* 2 Number of currently active disks */ + __u32 working_disks; /* 3 Number of working disks */ + __u32 failed_disks; /* 4 Number of failed disks */ + __u32 spare_disks; /* 5 Number of spare disks */ + __u32 sb_csum; /* 6 checksum of the whole superblock */ +#if __BYTE_ORDER == __BIG_ENDIAN + __u32 events_hi; /* 7 high-order of superblock update count */ + __u32 events_lo; /* 8 low-order of superblock update count */ + __u32 cp_events_hi; /* 9 high-order of checkpoint update count */ + __u32 cp_events_lo; /* 10 low-order of checkpoint update count */ +#else + __u32 events_lo; /* 7 low-order of superblock update count */ + __u32 events_hi; /* 8 high-order of superblock update count */ + __u32 cp_events_lo; /* 9 low-order of checkpoint update count */ + __u32 cp_events_hi; /* 10 high-order of checkpoint update count */ +#endif + __u32 recovery_cp; /* 11 recovery checkpoint sector count */ + /* There are only valid for minor_version > 90 */ + __u64 reshape_position; /* 12,13 next address in array-space for reshape */ + __u32 new_level; /* 14 new level we are reshaping to */ + __u32 delta_disks; /* 15 change in number of raid_disks */ + __u32 new_layout; /* 16 new layout */ + __u32 new_chunk; /* 17 new chunk size (bytes) */ + __u32 gstate_sreserved[MD_SB_GENERIC_STATE_WORDS - 18]; + + /* + * Personality information + */ + __u32 layout; /* 0 the array's physical layout */ + __u32 chunk_size; /* 1 chunk size in bytes */ + __u32 root_pv; /* 2 LV root PV */ + __u32 root_block; /* 3 LV root block */ + __u32 pstate_reserved[MD_SB_PERSONALITY_WORDS - 4]; + + /* + * Disks information + */ + mdp_disk_t disks[MD_SB_DISKS]; + + /* + * Reserved + */ + __u32 reserved[MD_SB_RESERVED_WORDS]; + + /* + * Active descriptor + */ + mdp_disk_t this_disk; + +} mdp_super_t; + +#ifdef __TINYC__ +typedef unsigned long long __u64; +#endif + +static inline __u64 md_event(mdp_super_t *sb) { + __u64 ev = sb->events_hi; + return (ev<<32)| sb->events_lo; +} + +#endif @@ -0,0 +1,122 @@ +/* + md_u.h : user <=> kernel API between Linux raidtools and RAID drivers + Copyright (C) 1998 Ingo Molnar + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + You should have received a copy of the GNU General Public License + (for example /usr/src/linux/COPYING); if not, write to the Free + Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. +*/ + +#ifndef _MD_U_H +#define _MD_U_H + +/* ioctls */ + +/* status */ +#define RAID_VERSION _IOR (MD_MAJOR, 0x10, mdu_version_t) +#define GET_ARRAY_INFO _IOR (MD_MAJOR, 0x11, mdu_array_info_t) +#define GET_DISK_INFO _IOR (MD_MAJOR, 0x12, mdu_disk_info_t) +#define PRINT_RAID_DEBUG _IO (MD_MAJOR, 0x13) +#define RAID_AUTORUN _IO (MD_MAJOR, 0x14) +#define GET_BITMAP_FILE _IOR (MD_MAJOR, 0x15, mdu_bitmap_file_t) + +/* configuration */ +#define CLEAR_ARRAY _IO (MD_MAJOR, 0x20) +#define ADD_NEW_DISK _IOW (MD_MAJOR, 0x21, mdu_disk_info_t) +#define HOT_REMOVE_DISK _IO (MD_MAJOR, 0x22) +#define SET_ARRAY_INFO _IOW (MD_MAJOR, 0x23, mdu_array_info_t) +#define SET_DISK_INFO _IO (MD_MAJOR, 0x24) +#define WRITE_RAID_INFO _IO (MD_MAJOR, 0x25) +#define UNPROTECT_ARRAY _IO (MD_MAJOR, 0x26) +#define PROTECT_ARRAY _IO (MD_MAJOR, 0x27) +#define HOT_ADD_DISK _IO (MD_MAJOR, 0x28) +#define SET_DISK_FAULTY _IO (MD_MAJOR, 0x29) +#define SET_BITMAP_FILE _IOW (MD_MAJOR, 0x2b, int) + +/* usage */ +#define RUN_ARRAY _IOW (MD_MAJOR, 0x30, mdu_param_t) +#define START_ARRAY _IO (MD_MAJOR, 0x31) +#define STOP_ARRAY _IO (MD_MAJOR, 0x32) +#define STOP_ARRAY_RO _IO (MD_MAJOR, 0x33) +#define RESTART_ARRAY_RW _IO (MD_MAJOR, 0x34) + +typedef struct mdu_version_s { + int major; + int minor; + int patchlevel; +} mdu_version_t; + +typedef struct mdu_array_info_s { + /* + * Generic constant information + */ + int major_version; + int minor_version; + int patch_version; + int ctime; + int level; + int size; + int nr_disks; + int raid_disks; + int md_minor; + int not_persistent; + + /* + * Generic state information + */ + int utime; /* 0 Superblock update time */ + int state; /* 1 State bits (clean, ...) */ + int active_disks; /* 2 Number of currently active disks */ + int working_disks; /* 3 Number of working disks */ + int failed_disks; /* 4 Number of failed disks */ + int spare_disks; /* 5 Number of spare disks */ + + /* + * Personality information + */ + int layout; /* 0 the array's physical layout */ + int chunk_size; /* 1 chunk size in bytes */ + +} mdu_array_info_t; + +typedef struct mdu_disk_info_s { + /* + * configuration/status of one particular disk + */ + int number; + int major; + int minor; + int raid_disk; + int state; + +} mdu_disk_info_t; + +typedef struct mdu_start_info_s { + /* + * configuration/status of one particular disk + */ + int major; + int minor; + int raid_disk; + int state; + +} mdu_start_info_t; + +typedef struct mdu_bitmap_file_s +{ + char pathname[4096]; +} mdu_bitmap_file_t; + +typedef struct mdu_param_s +{ + int personality; /* 1,2,3,4 */ + int chunk_size; /* in bytes */ + int max_fault; /* unused for now */ +} mdu_param_t; + +#endif diff --git a/mdadm.8.in b/mdadm.8.in new file mode 100644 index 00000000..14bd8b99 --- /dev/null +++ b/mdadm.8.in @@ -0,0 +1,3212 @@ +.\" -*- nroff -*- +.\" Copyright Neil Brown and others. +.\" This program is free software; you can redistribute it and/or modify +.\" it under the terms of the GNU General Public License as published by +.\" the Free Software Foundation; either version 2 of the License, or +.\" (at your option) any later version. +.\" See file COPYING in distribution for details. +.TH MDADM 8 "" v3.3.4 +.SH NAME +mdadm \- manage MD devices +.I aka +Linux Software RAID + +.SH SYNOPSIS + +.BI mdadm " [mode] <raiddevice> [options] <component-devices>" + +.SH DESCRIPTION +RAID devices are virtual devices created from two or more +real block devices. This allows multiple devices (typically disk +drives or partitions thereof) to be combined into a single device to +hold (for example) a single filesystem. +Some RAID levels include redundancy and so can survive some degree of +device failure. + +Linux Software RAID devices are implemented through the md (Multiple +Devices) device driver. + +Currently, Linux supports +.B LINEAR +md devices, +.B RAID0 +(striping), +.B RAID1 +(mirroring), +.BR RAID4 , +.BR RAID5 , +.BR RAID6 , +.BR RAID10 , +.BR MULTIPATH , +.BR FAULTY , +and +.BR CONTAINER . + +.B MULTIPATH +is not a Software RAID mechanism, but does involve +multiple devices: +each device is a path to one common physical storage device. +New installations should not use md/multipath as it is not well +supported and has no ongoing development. Use the Device Mapper based +multipath-tools instead. + +.B FAULTY +is also not true RAID, and it only involves one device. It +provides a layer over a true device that can be used to inject faults. + +.B CONTAINER +is different again. A +.B CONTAINER +is a collection of devices that are +managed as a set. This is similar to the set of devices connected to +a hardware RAID controller. The set of devices may contain a number +of different RAID arrays each utilising some (or all) of the blocks from a +number of the devices in the set. For example, two devices in a 5-device set +might form a RAID1 using the whole devices. The remaining three might +have a RAID5 over the first half of each device, and a RAID0 over the +second half. + +With a +.BR CONTAINER , +there is one set of metadata that describes all of +the arrays in the container. So when +.I mdadm +creates a +.B CONTAINER +device, the device just represents the metadata. Other normal arrays (RAID1 +etc) can be created inside the container. + +.SH MODES +mdadm has several major modes of operation: +.TP +.B Assemble +Assemble the components of a previously created +array into an active array. Components can be explicitly given +or can be searched for. +.I mdadm +checks that the components +do form a bona fide array, and can, on request, fiddle superblock +information so as to assemble a faulty array. + +.TP +.B Build +Build an array that doesn't have per-device metadata (superblocks). For these +sorts of arrays, +.I mdadm +cannot differentiate between initial creation and subsequent assembly +of an array. It also cannot perform any checks that appropriate +components have been requested. Because of this, the +.B Build +mode should only be used together with a complete understanding of +what you are doing. + +.TP +.B Create +Create a new array with per-device metadata (superblocks). +Appropriate metadata is written to each device, and then the array +comprising those devices is activated. A 'resync' process is started +to make sure that the array is consistent (e.g. both sides of a mirror +contain the same data) but the content of the device is left otherwise +untouched. +The array can be used as soon as it has been created. There is no +need to wait for the initial resync to finish. + +.TP +.B "Follow or Monitor" +Monitor one or more md devices and act on any state changes. This is +only meaningful for RAID1, 4, 5, 6, 10 or multipath arrays, as +only these have interesting state. RAID0 or Linear never have +missing, spare, or failed drives, so there is nothing to monitor. + +.TP +.B "Grow" +Grow (or shrink) an array, or otherwise reshape it in some way. +Currently supported growth options including changing the active size +of component devices and changing the number of active devices in +Linear and RAID levels 0/1/4/5/6, +changing the RAID level between 0, 1, 5, and 6, and between 0 and 10, +changing the chunk size and layout for RAID 0,4,5,6,10 as well as adding or +removing a write-intent bitmap. + +.TP +.B "Incremental Assembly" +Add a single device to an appropriate array. If the addition of the +device makes the array runnable, the array will be started. +This provides a convenient interface to a +.I hot-plug +system. As each device is detected, +.I mdadm +has a chance to include it in some array as appropriate. +Optionally, when the +.I \-\-fail +flag is passed in we will remove the device from any active array +instead of adding it. + +If a +.B CONTAINER +is passed to +.I mdadm +in this mode, then any arrays within that container will be assembled +and started. + +.TP +.B Manage +This is for doing things to specific components of an array such as +adding new spares and removing faulty devices. + +.TP +.B Misc +This is an 'everything else' mode that supports operations on active +arrays, operations on component devices such as erasing old superblocks, and +information gathering operations. +.\"This mode allows operations on independent devices such as examine MD +.\"superblocks, erasing old superblocks and stopping active arrays. + +.TP +.B Auto-detect +This mode does not act on a specific device or array, but rather it +requests the Linux Kernel to activate any auto-detected arrays. +.SH OPTIONS + +.SH Options for selecting a mode are: + +.TP +.BR \-A ", " \-\-assemble +Assemble a pre-existing array. + +.TP +.BR \-B ", " \-\-build +Build a legacy array without superblocks. + +.TP +.BR \-C ", " \-\-create +Create a new array. + +.TP +.BR \-F ", " \-\-follow ", " \-\-monitor +Select +.B Monitor +mode. + +.TP +.BR \-G ", " \-\-grow +Change the size or shape of an active array. + +.TP +.BR \-I ", " \-\-incremental +Add/remove a single device to/from an appropriate array, and possibly start the array. + +.TP +.B \-\-auto-detect +Request that the kernel starts any auto-detected arrays. This can only +work if +.I md +is compiled into the kernel \(em not if it is a module. +Arrays can be auto-detected by the kernel if all the components are in +primary MS-DOS partitions with partition type +.BR FD , +and all use v0.90 metadata. +In-kernel autodetect is not recommended for new installations. Using +.I mdadm +to detect and assemble arrays \(em possibly in an +.I initrd +\(em is substantially more flexible and should be preferred. + +.P +If a device is given before any options, or if the first option is +one of +.BR \-\-add , +.BR \-\-re\-add , +.BR \-\-add\-spare , +.BR \-\-fail , +.BR \-\-remove , +or +.BR \-\-replace , +then the MANAGE mode is assumed. +Anything other than these will cause the +.B Misc +mode to be assumed. + +.SH Options that are not mode-specific are: + +.TP +.BR \-h ", " \-\-help +Display general help message or, after one of the above options, a +mode-specific help message. + +.TP +.B \-\-help\-options +Display more detailed help about command line parsing and some commonly +used options. + +.TP +.BR \-V ", " \-\-version +Print version information for mdadm. + +.TP +.BR \-v ", " \-\-verbose +Be more verbose about what is happening. This can be used twice to be +extra-verbose. +The extra verbosity currently only affects +.B \-\-detail \-\-scan +and +.BR "\-\-examine \-\-scan" . + +.TP +.BR \-q ", " \-\-quiet +Avoid printing purely informative messages. With this, +.I mdadm +will be silent unless there is something really important to report. + + +.TP +.BR \-f ", " \-\-force +Be more forceful about certain operations. See the various modes for +the exact meaning of this option in different contexts. + +.TP +.BR \-c ", " \-\-config= +Specify the config file or directory. Default is to use +.B /etc/mdadm/mdadm.conf +and +.BR /etc/mdadm/mdadm.conf.d , +or if those are missing then +.B /etc/mdadm.conf +and +.BR /etc/mdadm.conf.d . +If the config file given is +.B "partitions" +then nothing will be read, but +.I mdadm +will act as though the config file contained exactly +.br +.B " DEVICE partitions containers" +.br +and will read +.B /proc/partitions +to find a list of devices to scan, and +.B /proc/mdstat +to find a list of containers to examine. +If the word +.B "none" +is given for the config file, then +.I mdadm +will act as though the config file were empty. + +If the name given is of a directory, then +.I mdadm +will collect all the files contained in the directory with a name ending +in +.BR .conf , +sort them lexically, and process all of those files as config files. + +.TP +.BR \-s ", " \-\-scan +Scan config file or +.B /proc/mdstat +for missing information. +In general, this option gives +.I mdadm +permission to get any missing information (like component devices, +array devices, array identities, and alert destination) from the +configuration file (see previous option); +one exception is MISC mode when using +.B \-\-detail +or +.B \-\-stop, +in which case +.B \-\-scan +says to get a list of array devices from +.BR /proc/mdstat . + +.TP +.BR \-e ", " \-\-metadata= +Declare the style of RAID metadata (superblock) to be used. The +default is {DEFAULT_METADATA} for +.BR \-\-create , +and to guess for other operations. +The default can be overridden by setting the +.B metadata +value for the +.B CREATE +keyword in +.BR mdadm.conf . + +Options are: +.RS +.ie '{DEFAULT_METADATA}'0.90' +.IP "0, 0.90, default" +.el +.IP "0, 0.90" +Use the original 0.90 format superblock. This format limits arrays to +28 component devices and limits component devices of levels 1 and +greater to 2 terabytes. It is also possible for there to be confusion +about whether the superblock applies to a whole device or just the +last partition, if that partition starts on a 64K boundary. +.ie '{DEFAULT_METADATA}'0.90' +.IP "1, 1.0, 1.1, 1.2" +.el +.IP "1, 1.0, 1.1, 1.2 default" +Use the new version-1 format superblock. This has fewer restrictions. +It can easily be moved between hosts with different endian-ness, and a +recovery operation can be checkpointed and restarted. The different +sub-versions store the superblock at different locations on the +device, either at the end (for 1.0), at the start (for 1.1) or 4K from +the start (for 1.2). "1" is equivalent to "1.2" (the commonly +preferred 1.x format). +'if '{DEFAULT_METADATA}'1.2' "default" is equivalent to "1.2". +.IP ddf +Use the "Industry Standard" DDF (Disk Data Format) format defined by +SNIA. +When creating a DDF array a +.B CONTAINER +will be created, and normal arrays can be created in that container. +.IP imsm +Use the Intel(R) Matrix Storage Manager metadata format. This creates a +.B CONTAINER +which is managed in a similar manner to DDF, and is supported by an +option-rom on some platforms: +.IP +.B http://www.intel.com/design/chipsets/matrixstorage_sb.htm +.PP +.RE + +.TP +.B \-\-homehost= +This will override any +.B HOMEHOST +setting in the config file and provides the identity of the host which +should be considered the home for any arrays. + +When creating an array, the +.B homehost +will be recorded in the metadata. For version-1 superblocks, it will +be prefixed to the array name. For version-0.90 superblocks, part of +the SHA1 hash of the hostname will be stored in the later half of the +UUID. + +When reporting information about an array, any array which is tagged +for the given homehost will be reported as such. + +When using Auto-Assemble, only arrays tagged for the given homehost +will be allowed to use 'local' names (i.e. not ending in '_' followed +by a digit string). See below under +.BR "Auto Assembly" . + +The special name "\fBany\fP" can be used as a wild card. If an array +is created with +.B --homehost=any +then the name "\fBany\fP" will be stored in the array and it can be +assembled in the same way on any host. If an array is assembled with +this option, then the homehost recorded on the array will be ignored. + +.TP +.B \-\-prefer= +When +.I mdadm +needs to print the name for a device it normally finds the name in +.B /dev +which refers to the device and is shortest. When a path component is +given with +.B \-\-prefer +.I mdadm +will prefer a longer name if it contains that component. For example +.B \-\-prefer=by-uuid +will prefer a name in a subdirectory of +.B /dev +called +.BR by-uuid . + +This functionality is currently only provided by +.B \-\-detail +and +.BR \-\-monitor . + +.SH For create, build, or grow: + +.TP +.BR \-n ", " \-\-raid\-devices= +Specify the number of active devices in the array. This, plus the +number of spare devices (see below) must equal the number of +.I component-devices +(including "\fBmissing\fP" devices) +that are listed on the command line for +.BR \-\-create . +Setting a value of 1 is probably +a mistake and so requires that +.B \-\-force +be specified first. A value of 1 will then be allowed for linear, +multipath, RAID0 and RAID1. It is never allowed for RAID4, RAID5 or RAID6. +.br +This number can only be changed using +.B \-\-grow +for RAID1, RAID4, RAID5 and RAID6 arrays, and only on kernels which provide +the necessary support. + +.TP +.BR \-x ", " \-\-spare\-devices= +Specify the number of spare (eXtra) devices in the initial array. +Spares can also be added +and removed later. The number of component devices listed +on the command line must equal the number of RAID devices plus the +number of spare devices. + +.TP +.BR \-z ", " \-\-size= +Amount (in Kibibytes) of space to use from each drive in RAID levels 1/4/5/6. +This must be a multiple of the chunk size, and must leave about 128Kb +of space at the end of the drive for the RAID superblock. +If this is not specified +(as it normally is not) the smallest drive (or partition) sets the +size, though if there is a variance among the drives of greater than 1%, a warning is +issued. + +A suffix of 'M' or 'G' can be given to indicate Megabytes or +Gigabytes respectively. + +Sometimes a replacement drive can be a little smaller than the +original drives though this should be minimised by IDEMA standards. +Such a replacement drive will be rejected by +.IR md . +To guard against this it can be useful to set the initial size +slightly smaller than the smaller device with the aim that it will +still be larger than any replacement. + +This value can be set with +.B \-\-grow +for RAID level 1/4/5/6 though +.B CONTAINER +based arrays such as those with IMSM metadata may not be able to +support this. +If the array was created with a size smaller than the currently +active drives, the extra space can be accessed using +.BR \-\-grow . +The size can be given as +.B max +which means to choose the largest size that fits on all current drives. + +Before reducing the size of the array (with +.BR "\-\-grow \-\-size=" ) +you should make sure that space isn't needed. If the device holds a +filesystem, you would need to resize the filesystem to use less space. + +After reducing the array size you should check that the data stored in +the device is still available. If the device holds a filesystem, then +an 'fsck' of the filesystem is a minimum requirement. If there are +problems the array can be made bigger again with no loss with another +.B "\-\-grow \-\-size=" +command. + +This value cannot be used when creating a +.B CONTAINER +such as with DDF and IMSM metadata, though it perfectly valid when +creating an array inside a container. + +.TP +.BR \-Z ", " \-\-array\-size= +This is only meaningful with +.B \-\-grow +and its effect is not persistent: when the array is stopped and +restarted the default array size will be restored. + +Setting the array-size causes the array to appear smaller to programs +that access the data. This is particularly needed before reshaping an +array so that it will be smaller. As the reshape is not reversible, +but setting the size with +.B \-\-array-size +is, it is required that the array size is reduced as appropriate +before the number of devices in the array is reduced. + +Before reducing the size of the array you should make sure that space +isn't needed. If the device holds a filesystem, you would need to +resize the filesystem to use less space. + +After reducing the array size you should check that the data stored in +the device is still available. If the device holds a filesystem, then +an 'fsck' of the filesystem is a minimum requirement. If there are +problems the array can be made bigger again with no loss with another +.B "\-\-grow \-\-array\-size=" +command. + +A suffix of 'M' or 'G' can be given to indicate Megabytes or +Gigabytes respectively. +A value of +.B max +restores the apparent size of the array to be whatever the real +amount of available space is. + +.TP +.BR \-c ", " \-\-chunk= +Specify chunk size of kibibytes. The default when creating an +array is 512KB. To ensure compatibility with earlier versions, the +default when building an array with no persistent metadata is 64KB. +This is only meaningful for RAID0, RAID4, RAID5, RAID6, and RAID10. + +RAID4, RAID5, RAID6, and RAID10 require the chunk size to be a power +of 2. In any case it must be a multiple of 4KB. + +A suffix of 'M' or 'G' can be given to indicate Megabytes or +Gigabytes respectively. + +.TP +.BR \-\-rounding= +Specify rounding factor for a Linear array. The size of each +component will be rounded down to a multiple of this size. +This is a synonym for +.B \-\-chunk +but highlights the different meaning for Linear as compared to other +RAID levels. The default is 64K if a kernel earlier than 2.6.16 is in +use, and is 0K (i.e. no rounding) in later kernels. + +.TP +.BR \-l ", " \-\-level= +Set RAID level. When used with +.BR \-\-create , +options are: linear, raid0, 0, stripe, raid1, 1, mirror, raid4, 4, +raid5, 5, raid6, 6, raid10, 10, multipath, mp, faulty, container. +Obviously some of these are synonymous. + +When a +.B CONTAINER +metadata type is requested, only the +.B container +level is permitted, and it does not need to be explicitly given. + +When used with +.BR \-\-build , +only linear, stripe, raid0, 0, raid1, multipath, mp, and faulty are valid. + +Can be used with +.B \-\-grow +to change the RAID level in some cases. See LEVEL CHANGES below. + +.TP +.BR \-p ", " \-\-layout= +This option configures the fine details of data layout for RAID5, RAID6, +and RAID10 arrays, and controls the failure modes for +.IR faulty . + +The layout of the RAID5 parity block can be one of +.BR left\-asymmetric , +.BR left\-symmetric , +.BR right\-asymmetric , +.BR right\-symmetric , +.BR la ", " ra ", " ls ", " rs . +The default is +.BR left\-symmetric . + +It is also possible to cause RAID5 to use a RAID4-like layout by +choosing +.BR parity\-first , +or +.BR parity\-last . + +Finally for RAID5 there are DDF\-compatible layouts, +.BR ddf\-zero\-restart , +.BR ddf\-N\-restart , +and +.BR ddf\-N\-continue . + +These same layouts are available for RAID6. There are also 4 layouts +that will provide an intermediate stage for converting between RAID5 +and RAID6. These provide a layout which is identical to the +corresponding RAID5 layout on the first N\-1 devices, and has the 'Q' +syndrome (the second 'parity' block used by RAID6) on the last device. +These layouts are: +.BR left\-symmetric\-6 , +.BR right\-symmetric\-6 , +.BR left\-asymmetric\-6 , +.BR right\-asymmetric\-6 , +and +.BR parity\-first\-6 . + +When setting the failure mode for level +.I faulty, +the options are: +.BR write\-transient ", " wt , +.BR read\-transient ", " rt , +.BR write\-persistent ", " wp , +.BR read\-persistent ", " rp , +.BR write\-all , +.BR read\-fixable ", " rf , +.BR clear ", " flush ", " none . + +Each failure mode can be followed by a number, which is used as a period +between fault generation. Without a number, the fault is generated +once on the first relevant request. With a number, the fault will be +generated after that many requests, and will continue to be generated +every time the period elapses. + +Multiple failure modes can be current simultaneously by using the +.B \-\-grow +option to set subsequent failure modes. + +"clear" or "none" will remove any pending or periodic failure modes, +and "flush" will clear any persistent faults. + +Finally, the layout options for RAID10 are one of 'n', 'o' or 'f' followed +by a small number. The default is 'n2'. The supported options are: + +.I 'n' +signals 'near' copies. Multiple copies of one data block are at +similar offsets in different devices. + +.I 'o' +signals 'offset' copies. Rather than the chunks being duplicated +within a stripe, whole stripes are duplicated but are rotated by one +device so duplicate blocks are on different devices. Thus subsequent +copies of a block are in the next drive, and are one chunk further +down. + +.I 'f' +signals 'far' copies +(multiple copies have very different offsets). +See md(4) for more detail about 'near', 'offset', and 'far'. + +The number is the number of copies of each datablock. 2 is normal, 3 +can be useful. This number can be at most equal to the number of +devices in the array. It does not need to divide evenly into that +number (e.g. it is perfectly legal to have an 'n2' layout for an array +with an odd number of devices). + +When an array is converted between RAID5 and RAID6 an intermediate +RAID6 layout is used in which the second parity block (Q) is always on +the last device. To convert a RAID5 to RAID6 and leave it in this new +layout (which does not require re-striping) use +.BR \-\-layout=preserve . +This will try to avoid any restriping. + +The converse of this is +.B \-\-layout=normalise +which will change a non-standard RAID6 layout into a more standard +arrangement. + +.TP +.BR \-\-parity= +same as +.B \-\-layout +(thus explaining the p of +.BR \-p ). + +.TP +.BR \-b ", " \-\-bitmap= +Specify a file to store a write-intent bitmap in. The file should not +exist unless +.B \-\-force +is also given. The same file should be provided +when assembling the array. If the word +.B "internal" +is given, then the bitmap is stored with the metadata on the array, +and so is replicated on all devices. If the word +.B "none" +is given with +.B \-\-grow +mode, then any bitmap that is present is removed. + +To help catch typing errors, the filename must contain at least one +slash ('/') if it is a real file (not 'internal' or 'none'). + +Note: external bitmaps are only known to work on ext2 and ext3. +Storing bitmap files on other filesystems may result in serious problems. + +When creating an array on devices which are 100G or larger, +.I mdadm +automatically adds an internal bitmap as it will usually be +beneficial. This can be suppressed with +.B "\-\-bitmap=none". + +.TP +.BR \-\-bitmap\-chunk= +Set the chunksize of the bitmap. Each bit corresponds to that many +Kilobytes of storage. +When using a file based bitmap, the default is to use the smallest +size that is at-least 4 and requires no more than 2^21 chunks. +When using an +.B internal +bitmap, the chunksize defaults to 64Meg, or larger if necessary to +fit the bitmap into the available space. + +A suffix of 'M' or 'G' can be given to indicate Megabytes or +Gigabytes respectively. + +.TP +.BR \-W ", " \-\-write\-mostly +subsequent devices listed in a +.BR \-\-build , +.BR \-\-create , +or +.B \-\-add +command will be flagged as 'write-mostly'. This is valid for RAID1 +only and means that the 'md' driver will avoid reading from these +devices if at all possible. This can be useful if mirroring over a +slow link. + +.TP +.BR \-\-write\-behind= +Specify that write-behind mode should be enabled (valid for RAID1 +only). If an argument is specified, it will set the maximum number +of outstanding writes allowed. The default value is 256. +A write-intent bitmap is required in order to use write-behind +mode, and write-behind is only attempted on drives marked as +.IR write-mostly . + +.TP +.BR \-\-assume\-clean +Tell +.I mdadm +that the array pre-existed and is known to be clean. It can be useful +when trying to recover from a major failure as you can be sure that no +data will be affected unless you actually write to the array. It can +also be used when creating a RAID1 or RAID10 if you want to avoid the +initial resync, however this practice \(em while normally safe \(em is not +recommended. Use this only if you really know what you are doing. +.IP +When the devices that will be part of a new array were filled +with zeros before creation the operator knows the array is +actually clean. If that is the case, such as after running +badblocks, this argument can be used to tell mdadm the +facts the operator knows. +.IP +When an array is resized to a larger size with +.B "\-\-grow \-\-size=" +the new space is normally resynced in that same way that the whole +array is resynced at creation. From Linux version 3.0, +.B \-\-assume\-clean +can be used with that command to avoid the automatic resync. + +.TP +.BR \-\-backup\-file= +This is needed when +.B \-\-grow +is used to increase the number of raid-devices in a RAID5 or RAID6 if +there are no spare devices available, or to shrink, change RAID level +or layout. See the GROW MODE section below on RAID\-DEVICES CHANGES. +The file must be stored on a separate device, not on the RAID array +being reshaped. + +.TP +.B \-\-data\-offset= +Arrays with 1.x metadata can leave a gap between the start of the +device and the start of array data. This gap can be used for various +metadata. The start of data is known as the +.IR data\-offset . +Normally an appropriate data offset is computed automatically. +However it can be useful to set it explicitly such as when re-creating +an array which was originally created using a different version of +.I mdadm +which computed a different offset. + +Setting the offset explicitly over-rides the default. The value given +is in Kilobytes unless an 'M' or 'G' suffix is given. + +Since Linux 3.4, +.B \-\-data\-offset +can also be used with +.B --grow +for some RAID levels (initially on RAID10). This allows the +data\-offset to be changed as part of the reshape process. When the +data offset is changed, no backup file is required as the difference +in offsets is used to provide the same functionality. + +When the new offset is earlier than the old offset, the number of +devices in the array cannot shrink. When it is after the old offset, +the number of devices in the array cannot increase. + +When creating an array, +.B \-\-data\-offset +can be specified as +.BR variable . +In the case each member device is expected to have a offset appended +to the name, separated by a colon. This makes it possible to recreate +exactly an array which has varying data offsets (as can happen when +different versions of +.I mdadm +are used to add different devices). + +.TP +.BR \-\-continue +This option is complementary to the +.B \-\-freeze-reshape +option for assembly. It is needed when +.B \-\-grow +operation is interrupted and it is not restarted automatically due to +.B \-\-freeze-reshape +usage during array assembly. This option is used together with +.BR \-G +, ( +.BR \-\-grow +) command and device for a pending reshape to be continued. +All parameters required for reshape continuation will be read from array metadata. +If initial +.BR \-\-grow +command had required +.BR \-\-backup\-file= +option to be set, continuation option will require to have exactly the same +backup file given as well. +.IP +Any other parameter passed together with +.BR \-\-continue +option will be ignored. + +.TP +.BR \-N ", " \-\-name= +Set a +.B name +for the array. This is currently only effective when creating an +array with a version-1 superblock, or an array in a DDF container. +The name is a simple textual string that can be used to identify array +components when assembling. If name is needed but not specified, it +is taken from the basename of the device that is being created. +e.g. when creating +.I /dev/md/home +the +.B name +will default to +.IR home . + +.TP +.BR \-R ", " \-\-run +Insist that +.I mdadm +run the array, even if some of the components +appear to be active in another array or filesystem. Normally +.I mdadm +will ask for confirmation before including such components in an +array. This option causes that question to be suppressed. + +.TP +.BR \-f ", " \-\-force +Insist that +.I mdadm +accept the geometry and layout specified without question. Normally +.I mdadm +will not allow creation of an array with only one device, and will try +to create a RAID5 array with one missing drive (as this makes the +initial resync work faster). With +.BR \-\-force , +.I mdadm +will not try to be so clever. + +.TP +.BR \-o ", " \-\-readonly +Start the array +.B read only +rather than read-write as normal. No writes will be allowed to the +array, and no resync, recovery, or reshape will be started. + +.TP +.BR \-a ", " "\-\-auto{=yes,md,mdp,part,p}{NN}" +Instruct mdadm how to create the device file if needed, possibly allocating +an unused minor number. "md" causes a non-partitionable array +to be used (though since Linux 2.6.28, these array devices are in fact +partitionable). "mdp", "part" or "p" causes a partitionable array (2.6 and +later) to be used. "yes" requires the named md device to have +a 'standard' format, and the type and minor number will be determined +from this. With mdadm 3.0, device creation is normally left up to +.I udev +so this option is unlikely to be needed. +See DEVICE NAMES below. + +The argument can also come immediately after +"\-a". e.g. "\-ap". + +If +.B \-\-auto +is not given on the command line or in the config file, then +the default will be +.BR \-\-auto=yes . + +If +.B \-\-scan +is also given, then any +.I auto= +entries in the config file will override the +.B \-\-auto +instruction given on the command line. + +For partitionable arrays, +.I mdadm +will create the device file for the whole array and for the first 4 +partitions. A different number of partitions can be specified at the +end of this option (e.g. +.BR \-\-auto=p7 ). +If the device name ends with a digit, the partition names add a 'p', +and a number, e.g. +.IR /dev/md/home1p3 . +If there is no trailing digit, then the partition names just have a +number added, e.g. +.IR /dev/md/scratch3 . + +If the md device name is in a 'standard' format as described in DEVICE +NAMES, then it will be created, if necessary, with the appropriate +device number based on that name. If the device name is not in one of these +formats, then a unused device number will be allocated. The device +number will be considered unused if there is no active array for that +number, and there is no entry in /dev for that number and with a +non-standard name. Names that are not in 'standard' format are only +allowed in "/dev/md/". + +This is meaningful with +.B \-\-create +or +.BR \-\-build . + +.TP +.BR \-a ", " "\-\-add" +This option can be used in Grow mode in two cases. + +If the target array is a Linear array, then +.B \-\-add +can be used to add one or more devices to the array. They +are simply catenated on to the end of the array. Once added, the +devices cannot be removed. + +If the +.B \-\-raid\-disks +option is being used to increase the number of devices in an array, +then +.B \-\-add +can be used to add some extra devices to be included in the array. +In most cases this is not needed as the extra devices can be added as +spares first, and then the number of raid-disks can be changed. +However for RAID0, it is not possible to add spares. So to increase +the number of devices in a RAID0, it is necessary to set the new +number of devices, and to add the new devices, in the same command. + +.SH For assemble: + +.TP +.BR \-u ", " \-\-uuid= +uuid of array to assemble. Devices which don't have this uuid are +excluded + +.TP +.BR \-m ", " \-\-super\-minor= +Minor number of device that array was created for. Devices which +don't have this minor number are excluded. If you create an array as +/dev/md1, then all superblocks will contain the minor number 1, even if +the array is later assembled as /dev/md2. + +Giving the literal word "dev" for +.B \-\-super\-minor +will cause +.I mdadm +to use the minor number of the md device that is being assembled. +e.g. when assembling +.BR /dev/md0 , +.B \-\-super\-minor=dev +will look for super blocks with a minor number of 0. + +.B \-\-super\-minor +is only relevant for v0.90 metadata, and should not normally be used. +Using +.B \-\-uuid +is much safer. + +.TP +.BR \-N ", " \-\-name= +Specify the name of the array to assemble. This must be the name +that was specified when creating the array. It must either match +the name stored in the superblock exactly, or it must match +with the current +.I homehost +prefixed to the start of the given name. + +.TP +.BR \-f ", " \-\-force +Assemble the array even if the metadata on some devices appears to be +out-of-date. If +.I mdadm +cannot find enough working devices to start the array, but can find +some devices that are recorded as having failed, then it will mark +those devices as working so that the array can be started. +An array which requires +.B \-\-force +to be started may contain data corruption. Use it carefully. + +.TP +.BR \-R ", " \-\-run +Attempt to start the array even if fewer drives were given than were +present last time the array was active. Normally if not all the +expected drives are found and +.B \-\-scan +is not used, then the array will be assembled but not started. +With +.B \-\-run +an attempt will be made to start it anyway. + +.TP +.B \-\-no\-degraded +This is the reverse of +.B \-\-run +in that it inhibits the startup of array unless all expected drives +are present. This is only needed with +.B \-\-scan, +and can be used if the physical connections to devices are +not as reliable as you would like. + +.TP +.BR \-a ", " "\-\-auto{=no,yes,md,mdp,part}" +See this option under Create and Build options. + +.TP +.BR \-b ", " \-\-bitmap= +Specify the bitmap file that was given when the array was created. If +an array has an +.B internal +bitmap, there is no need to specify this when assembling the array. + +.TP +.BR \-\-backup\-file= +If +.B \-\-backup\-file +was used while reshaping an array (e.g. changing number of devices or +chunk size) and the system crashed during the critical section, then the same +.B \-\-backup\-file +must be presented to +.B \-\-assemble +to allow possibly corrupted data to be restored, and the reshape +to be completed. + +.TP +.BR \-\-invalid\-backup +If the file needed for the above option is not available for any +reason an empty file can be given together with this option to +indicate that the backup file is invalid. In this case the data that +was being rearranged at the time of the crash could be irrecoverably +lost, but the rest of the array may still be recoverable. This option +should only be used as a last resort if there is no way to recover the +backup file. + + +.TP +.BR \-U ", " \-\-update= +Update the superblock on each device while assembling the array. The +argument given to this flag can be one of +.BR sparc2.2 , +.BR summaries , +.BR uuid , +.BR name , +.BR homehost , +.BR resync , +.BR byteorder , +.BR devicesize , +.BR no\-bitmap , +.BR bbl , +.BR no\-bbl , +.BR metadata , +or +.BR super\-minor . + +The +.B sparc2.2 +option will adjust the superblock of an array what was created on a Sparc +machine running a patched 2.2 Linux kernel. This kernel got the +alignment of part of the superblock wrong. You can use the +.B "\-\-examine \-\-sparc2.2" +option to +.I mdadm +to see what effect this would have. + +The +.B super\-minor +option will update the +.B "preferred minor" +field on each superblock to match the minor number of the array being +assembled. +This can be useful if +.B \-\-examine +reports a different "Preferred Minor" to +.BR \-\-detail . +In some cases this update will be performed automatically +by the kernel driver. In particular the update happens automatically +at the first write to an array with redundancy (RAID level 1 or +greater) on a 2.6 (or later) kernel. + +The +.B uuid +option will change the uuid of the array. If a UUID is given with the +.B \-\-uuid +option that UUID will be used as a new UUID and will +.B NOT +be used to help identify the devices in the array. +If no +.B \-\-uuid +is given, a random UUID is chosen. + +The +.B name +option will change the +.I name +of the array as stored in the superblock. This is only supported for +version-1 superblocks. + +The +.B homehost +option will change the +.I homehost +as recorded in the superblock. For version-0 superblocks, this is the +same as updating the UUID. +For version-1 superblocks, this involves updating the name. + +The +.B resync +option will cause the array to be marked +.I dirty +meaning that any redundancy in the array (e.g. parity for RAID5, +copies for RAID1) may be incorrect. This will cause the RAID system +to perform a "resync" pass to make sure that all redundant information +is correct. + +The +.B byteorder +option allows arrays to be moved between machines with different +byte-order. +When assembling such an array for the first time after a move, giving +.B "\-\-update=byteorder" +will cause +.I mdadm +to expect superblocks to have their byteorder reversed, and will +correct that order before assembling the array. This is only valid +with original (Version 0.90) superblocks. + +The +.B summaries +option will correct the summaries in the superblock. That is the +counts of total, working, active, failed, and spare devices. + +The +.B devicesize +option will rarely be of use. It applies to version 1.1 and 1.2 metadata +only (where the metadata is at the start of the device) and is only +useful when the component device has changed size (typically become +larger). The version 1 metadata records the amount of the device that +can be used to store data, so if a device in a version 1.1 or 1.2 +array becomes larger, the metadata will still be visible, but the +extra space will not. In this case it might be useful to assemble the +array with +.BR \-\-update=devicesize . +This will cause +.I mdadm +to determine the maximum usable amount of space on each device and +update the relevant field in the metadata. + +The +.B metadata +option only works on v0.90 metadata arrays and will convert them to +v1.0 metadata. The array must not be dirty (i.e. it must not need a +sync) and it must not have a write-intent bitmap. + +The old metadata will remain on the devices, but will appear older +than the new metadata and so will usually be ignored. The old metadata +(or indeed the new metadata) can be removed by giving the appropriate +.B \-\-metadata= +option to +.BR \-\-zero\-superblock . + +The +.B no\-bitmap +option can be used when an array has an internal bitmap which is +corrupt in some way so that assembling the array normally fails. It +will cause any internal bitmap to be ignored. + +The +.B bbl +option will reserve space in each device for a bad block list. This +will be 4K in size and positioned near the end of any free space +between the superblock and the data. + +The +.B no\-bbl +option will cause any reservation of space for a bad block list to be +removed. If the bad block list contains entries, this will fail, as +removing the list could cause data corruption. + +.TP +.BR \-\-freeze\-reshape +Option is intended to be used in start-up scripts during initrd boot phase. +When array under reshape is assembled during initrd phase, this option +stops reshape after reshape critical section is being restored. This happens +before file system pivot operation and avoids loss of file system context. +Losing file system context would cause reshape to be broken. + +Reshape can be continued later using the +.B \-\-continue +option for the grow command. + +.SH For Manage mode: + +.TP +.BR \-t ", " \-\-test +Unless a more serious error occurred, +.I mdadm +will exit with a status of 2 if no changes were made to the array and +0 if at least one change was made. +This can be useful when an indirect specifier such as +.BR missing , +.B detached +or +.B faulty +is used in requesting an operation on the array. +.B \-\-test +will report failure if these specifiers didn't find any match. + +.TP +.BR \-a ", " \-\-add +hot-add listed devices. +If a device appears to have recently been part of the array +(possibly it failed or was removed) the device is re\-added as described +in the next point. +If that fails or the device was never part of the array, the device is +added as a hot-spare. +If the array is degraded, it will immediately start to rebuild data +onto that spare. + +Note that this and the following options are only meaningful on array +with redundancy. They don't apply to RAID0 or Linear. + +.TP +.BR \-\-re\-add +re\-add a device that was previously removed from an array. +If the metadata on the device reports that it is a member of the +array, and the slot that it used is still vacant, then the device will +be added back to the array in the same position. This will normally +cause the data for that device to be recovered. However based on the +event count on the device, the recovery may only require sections that +are flagged a write-intent bitmap to be recovered or may not require +any recovery at all. + +When used on an array that has no metadata (i.e. it was built with +.BR \-\-build) +it will be assumed that bitmap-based recovery is enough to make the +device fully consistent with the array. + +When used with v1.x metadata, +.B \-\-re\-add +can be accompanied by +.BR \-\-update=devicesize , +.BR \-\-update=bbl ", or" +.BR \-\-update=no\-bbl . +See the description of these option when used in Assemble mode for an +explanation of their use. + +If the device name given is +.B missing +then +.I mdadm +will try to find any device that looks like it should be +part of the array but isn't and will try to re\-add all such devices. + +If the device name given is +.B faulty +then +.I mdadm +will find all devices in the array that are marked +.BR faulty , +remove them and attempt to immediately re\-add them. This can be +useful if you are certain that the reason for failure has been +resolved. + +.TP +.B \-\-add\-spare +Add a device as a spare. This is similar to +.B \-\-add +except that it does not attempt +.B \-\-re\-add +first. The device will be added as a spare even if it looks like it +could be an recent member of the array. + +.TP +.BR \-r ", " \-\-remove +remove listed devices. They must not be active. i.e. they should +be failed or spare devices. + +As well as the name of a device file +(e.g. +.BR /dev/sda1 ) +the words +.BR failed , +.B detached +and names like +.B set-A +can be given to +.BR \-\-remove . +The first causes all failed device to be removed. The second causes +any device which is no longer connected to the system (i.e an 'open' +returns +.BR ENXIO ) +to be removed. +The third will remove a set as describe below under +.BR \-\-fail . + +.TP +.BR \-f ", " \-\-fail +Mark listed devices as faulty. +As well as the name of a device file, the word +.B detached +or a set name like +.B set\-A +can be given. The former will cause any device that has been detached from +the system to be marked as failed. It can then be removed. + +For RAID10 arrays where the number of copies evenly divides the number +of devices, the devices can be conceptually divided into sets where +each set contains a single complete copy of the data on the array. +Sometimes a RAID10 array will be configured so that these sets are on +separate controllers. In this case all the devices in one set can be +failed by giving a name like +.B set\-A +or +.B set\-B +to +.BR \-\-fail . +The appropriate set names are reported by +.BR \-\-detail . + +.TP +.BR \-\-set\-faulty +same as +.BR \-\-fail . + +.TP +.B \-\-replace +Mark listed devices as requiring replacement. As soon as a spare is +available, it will be rebuilt and will replace the marked device. +This is similar to marking a device as faulty, but the device remains +in service during the recovery process to increase resilience against +multiple failures. When the replacement process finishes, the +replaced device will be marked as faulty. + +.TP +.B \-\-with +This can follow a list of +.B \-\-replace +devices. The devices listed after +.B \-\-with +will be preferentially used to replace the devices listed after +.BR \-\-replace . +These device must already be spare devices in the array. + +.TP +.BR \-\-write\-mostly +Subsequent devices that are added or re\-added will have the 'write-mostly' +flag set. This is only valid for RAID1 and means that the 'md' driver +will avoid reading from these devices if possible. +.TP +.BR \-\-readwrite +Subsequent devices that are added or re\-added will have the 'write-mostly' +flag cleared. + +.P +Each of these options requires that the first device listed is the array +to be acted upon, and the remainder are component devices to be added, +removed, marked as faulty, etc. Several different operations can be +specified for different devices, e.g. +.in +5 +mdadm /dev/md0 \-\-add /dev/sda1 \-\-fail /dev/sdb1 \-\-remove /dev/sdb1 +.in -5 +Each operation applies to all devices listed until the next +operation. + +If an array is using a write-intent bitmap, then devices which have +been removed can be re\-added in a way that avoids a full +reconstruction but instead just updates the blocks that have changed +since the device was removed. For arrays with persistent metadata +(superblocks) this is done automatically. For arrays created with +.B \-\-build +mdadm needs to be told that this device we removed recently with +.BR \-\-re\-add . + +Devices can only be removed from an array if they are not in active +use, i.e. that must be spares or failed devices. To remove an active +device, it must first be marked as +.B faulty. + +.SH For Misc mode: + +.TP +.BR \-Q ", " \-\-query +Examine a device to see +(1) if it is an md device and (2) if it is a component of an md +array. +Information about what is discovered is presented. + +.TP +.BR \-D ", " \-\-detail +Print details of one or more md devices. + +.TP +.BR \-\-detail\-platform +Print details of the platform's RAID capabilities (firmware / hardware +topology) for a given metadata format. If used without argument, mdadm +will scan all controllers looking for their capabilities. Otherwise, mdadm +will only look at the controller specified by the argument in form of an +absolute filepath or a link, e.g. +.IR /sys/devices/pci0000:00/0000:00:1f.2 . + +.TP +.BR \-Y ", " \-\-export +When used with +.BR \-\-detail , +.BR \-\-detail-platform , +.BR \-\-examine , +or +.B \-\-incremental +output will be formatted as +.B key=value +pairs for easy import into the environment. + +With +.B \-\-incremental +The value +.B MD_STARTED +indicates whether an array was started +.RB ( yes ) +or not, which may include a reason +.RB ( unsafe ", " nothing ", " no ). +Also the value +.B MD_FOREIGN +indicates if the array is expected on this host +.RB ( no ), +or seems to be from elsewhere +.RB ( yes ). + +.TP +.BR \-E ", " \-\-examine +Print contents of the metadata stored on the named device(s). +Note the contrast between +.B \-\-examine +and +.BR \-\-detail . +.B \-\-examine +applies to devices which are components of an array, while +.B \-\-detail +applies to a whole array which is currently active. +.TP +.B \-\-sparc2.2 +If an array was created on a SPARC machine with a 2.2 Linux kernel +patched with RAID support, the superblock will have been created +incorrectly, or at least incompatibly with 2.4 and later kernels. +Using the +.B \-\-sparc2.2 +flag with +.B \-\-examine +will fix the superblock before displaying it. If this appears to do +the right thing, then the array can be successfully assembled using +.BR "\-\-assemble \-\-update=sparc2.2" . + +.TP +.BR \-X ", " \-\-examine\-bitmap +Report information about a bitmap file. +The argument is either an external bitmap file or an array component +in case of an internal bitmap. Note that running this on an array +device (e.g. +.BR /dev/md0 ) +does not report the bitmap for that array. + +.TP +.B \-\-examine\-badblocks +List the bad-blocks recorded for the device, if a bad-blocks list has +been configured. Currently only +.B 1.x +metadata supports bad-blocks lists. + +.TP +.BI \-\-dump= directory +.TP +.BI \-\-restore= directory +Save metadata from lists devices, or restore metadata to listed devices. + +.TP +.BR \-R ", " \-\-run +start a partially assembled array. If +.B \-\-assemble +did not find enough devices to fully start the array, it might leaving +it partially assembled. If you wish, you can then use +.B \-\-run +to start the array in degraded mode. + +.TP +.BR \-S ", " \-\-stop +deactivate array, releasing all resources. + +.TP +.BR \-o ", " \-\-readonly +mark array as readonly. + +.TP +.BR \-w ", " \-\-readwrite +mark array as readwrite. + +.TP +.B \-\-zero\-superblock +If the device contains a valid md superblock, the block is +overwritten with zeros. With +.B \-\-force +the block where the superblock would be is overwritten even if it +doesn't appear to be valid. + +.TP +.B \-\-kill\-subarray= +If the device is a container and the argument to \-\-kill\-subarray +specifies an inactive subarray in the container, then the subarray is +deleted. Deleting all subarrays will leave an 'empty-container' or +spare superblock on the drives. See +.B \-\-zero\-superblock +for completely +removing a superblock. Note that some formats depend on the subarray +index for generating a UUID, this command will fail if it would change +the UUID of an active subarray. + +.TP +.B \-\-update\-subarray= +If the device is a container and the argument to \-\-update\-subarray +specifies a subarray in the container, then attempt to update the given +superblock field in the subarray. See below in +.B MISC MODE +for details. + +.TP +.BR \-t ", " \-\-test +When used with +.BR \-\-detail , +the exit status of +.I mdadm +is set to reflect the status of the device. See below in +.B MISC MODE +for details. + +.TP +.BR \-W ", " \-\-wait +For each md device given, wait for any resync, recovery, or reshape +activity to finish before returning. +.I mdadm +will return with success if it actually waited for every device +listed, otherwise it will return failure. + +.TP +.BR \-\-wait\-clean +For each md device given, or each device in /proc/mdstat if +.B \-\-scan +is given, arrange for the array to be marked clean as soon as possible. +.I mdadm +will return with success if the array uses external metadata and we +successfully waited. For native arrays this returns immediately as the +kernel handles dirty-clean transitions at shutdown. No action is taken +if safe-mode handling is disabled. + +.TP +.B \-\-action= +Set the "sync_action" for all md devices given to one of +.BR idle , +.BR frozen , +.BR check , +.BR repair . +Setting to +.B idle +will abort any currently running action though some actions will +automatically restart. +Setting to +.B frozen +will abort any current action and ensure no other action starts +automatically. + +Details of +.B check +and +.B repair +can be found it +.IR md (4) +under +.BR "SCRUBBING AND MISMATCHES" . + +.SH For Incremental Assembly mode: +.TP +.BR \-\-rebuild\-map ", " \-r +Rebuild the map file +.RB ( {MAP_PATH} ) +that +.I mdadm +uses to help track which arrays are currently being assembled. + +.TP +.BR \-\-run ", " \-R +Run any array assembled as soon as a minimal number of devices are +available, rather than waiting until all expected devices are present. + +.TP +.BR \-\-scan ", " \-s +Only meaningful with +.B \-R +this will scan the +.B map +file for arrays that are being incrementally assembled and will try to +start any that are not already started. If any such array is listed +in +.B mdadm.conf +as requiring an external bitmap, that bitmap will be attached first. + +.TP +.BR \-\-fail ", " \-f +This allows the hot-plug system to remove devices that have fully disappeared +from the kernel. It will first fail and then remove the device from any +array it belongs to. +The device name given should be a kernel device name such as "sda", +not a name in +.IR /dev . + +.TP +.BR \-\-path= +Only used with \-\-fail. The 'path' given will be recorded so that if +a new device appears at the same location it can be automatically +added to the same array. This allows the failed device to be +automatically replaced by a new device without metadata if it appears +at specified path. This option is normally only set by a +.I udev +script. + +.SH For Monitor mode: +.TP +.BR \-m ", " \-\-mail +Give a mail address to send alerts to. + +.TP +.BR \-p ", " \-\-program ", " \-\-alert +Give a program to be run whenever an event is detected. + +.TP +.BR \-y ", " \-\-syslog +Cause all events to be reported through 'syslog'. The messages have +facility of 'daemon' and varying priorities. + +.TP +.BR \-d ", " \-\-delay +Give a delay in seconds. +.I mdadm +polls the md arrays and then waits this many seconds before polling +again. The default is 60 seconds. Since 2.6.16, there is no need to +reduce this as the kernel alerts +.I mdadm +immediately when there is any change. + +.TP +.BR \-r ", " \-\-increment +Give a percentage increment. +.I mdadm +will generate RebuildNN events with the given percentage increment. + +.TP +.BR \-f ", " \-\-daemonise +Tell +.I mdadm +to run as a background daemon if it decides to monitor anything. This +causes it to fork and run in the child, and to disconnect from the +terminal. The process id of the child is written to stdout. +This is useful with +.B \-\-scan +which will only continue monitoring if a mail address or alert program +is found in the config file. + +.TP +.BR \-i ", " \-\-pid\-file +When +.I mdadm +is running in daemon mode, write the pid of the daemon process to +the specified file, instead of printing it on standard output. + +.TP +.BR \-1 ", " \-\-oneshot +Check arrays only once. This will generate +.B NewArray +events and more significantly +.B DegradedArray +and +.B SparesMissing +events. Running +.in +5 +.B " mdadm \-\-monitor \-\-scan \-1" +.in -5 +from a cron script will ensure regular notification of any degraded arrays. + +.TP +.BR \-t ", " \-\-test +Generate a +.B TestMessage +alert for every array found at startup. This alert gets mailed and +passed to the alert program. This can be used for testing that alert +message do get through successfully. + +.TP +.BR \-\-no\-sharing +This inhibits the functionality for moving spares between arrays. +Only one monitoring process started with +.B \-\-scan +but without this flag is allowed, otherwise the two could interfere +with each other. + +.SH ASSEMBLE MODE + +.HP 12 +Usage: +.B mdadm \-\-assemble +.I md-device options-and-component-devices... +.HP 12 +Usage: +.B mdadm \-\-assemble \-\-scan +.I md-devices-and-options... +.HP 12 +Usage: +.B mdadm \-\-assemble \-\-scan +.I options... + +.PP +This usage assembles one or more RAID arrays from pre-existing components. +For each array, mdadm needs to know the md device, the identity of the +array, and a number of component-devices. These can be found in a number of ways. + +In the first usage example (without the +.BR \-\-scan ) +the first device given is the md device. +In the second usage example, all devices listed are treated as md +devices and assembly is attempted. +In the third (where no devices are listed) all md devices that are +listed in the configuration file are assembled. If no arrays are +described by the configuration file, then any arrays that +can be found on unused devices will be assembled. + +If precisely one device is listed, but +.B \-\-scan +is not given, then +.I mdadm +acts as though +.B \-\-scan +was given and identity information is extracted from the configuration file. + +The identity can be given with the +.B \-\-uuid +option, the +.B \-\-name +option, or the +.B \-\-super\-minor +option, will be taken from the md-device record in the config file, or +will be taken from the super block of the first component-device +listed on the command line. + +Devices can be given on the +.B \-\-assemble +command line or in the config file. Only devices which have an md +superblock which contains the right identity will be considered for +any array. + +The config file is only used if explicitly named with +.B \-\-config +or requested with (a possibly implicit) +.BR \-\-scan . +In the later case, +.B /etc/mdadm/mdadm.conf +or +.B /etc/mdadm.conf +is used. + +If +.B \-\-scan +is not given, then the config file will only be used to find the +identity of md arrays. + +Normally the array will be started after it is assembled. However if +.B \-\-scan +is not given and not all expected drives were listed, then the array +is not started (to guard against usage errors). To insist that the +array be started in this case (as may work for RAID1, 4, 5, 6, or 10), +give the +.B \-\-run +flag. + +If +.I udev +is active, +.I mdadm +does not create any entries in +.B /dev +but leaves that to +.IR udev . +It does record information in +.B {MAP_PATH} +which will allow +.I udev +to choose the correct name. + +If +.I mdadm +detects that udev is not configured, it will create the devices in +.B /dev +itself. + +In Linux kernels prior to version 2.6.28 there were two distinctly +different types of md devices that could be created: one that could be +partitioned using standard partitioning tools and one that could not. +Since 2.6.28 that distinction is no longer relevant as both type of +devices can be partitioned. +.I mdadm +will normally create the type that originally could not be partitioned +as it has a well defined major number (9). + +Prior to 2.6.28, it is important that mdadm chooses the correct type +of array device to use. This can be controlled with the +.B \-\-auto +option. In particular, a value of "mdp" or "part" or "p" tells mdadm +to use a partitionable device rather than the default. + +In the no-udev case, the value given to +.B \-\-auto +can be suffixed by a number. This tells +.I mdadm +to create that number of partition devices rather than the default of 4. + +The value given to +.B \-\-auto +can also be given in the configuration file as a word starting +.B auto= +on the ARRAY line for the relevant array. + +.SS Auto Assembly +When +.B \-\-assemble +is used with +.B \-\-scan +and no devices are listed, +.I mdadm +will first attempt to assemble all the arrays listed in the config +file. + +If no arrays are listed in the config (other than those marked +.BR <ignore> ) +it will look through the available devices for possible arrays and +will try to assemble anything that it finds. Arrays which are tagged +as belonging to the given homehost will be assembled and started +normally. Arrays which do not obviously belong to this host are given +names that are expected not to conflict with anything local, and are +started "read-auto" so that nothing is written to any device until the +array is written to. i.e. automatic resync etc is delayed. + +If +.I mdadm +finds a consistent set of devices that look like they should comprise +an array, and if the superblock is tagged as belonging to the given +home host, it will automatically choose a device name and try to +assemble the array. If the array uses version-0.90 metadata, then the +.B minor +number as recorded in the superblock is used to create a name in +.B /dev/md/ +so for example +.BR /dev/md/3 . +If the array uses version-1 metadata, then the +.B name +from the superblock is used to similarly create a name in +.B /dev/md/ +(the name will have any 'host' prefix stripped first). + +This behaviour can be modified by the +.I AUTO +line in the +.I mdadm.conf +configuration file. This line can indicate that specific metadata +type should, or should not, be automatically assembled. If an array +is found which is not listed in +.I mdadm.conf +and has a metadata format that is denied by the +.I AUTO +line, then it will not be assembled. +The +.I AUTO +line can also request that all arrays identified as being for this +homehost should be assembled regardless of their metadata type. +See +.IR mdadm.conf (5) +for further details. + +Note: Auto assembly cannot be used for assembling and activating some +arrays which are undergoing reshape. In particular as the +.B backup\-file +cannot be given, any reshape which requires a backup-file to continue +cannot be started by auto assembly. An array which is growing to more +devices and has passed the critical section can be assembled using +auto-assembly. + +.SH BUILD MODE + +.HP 12 +Usage: +.B mdadm \-\-build +.I md-device +.BI \-\-chunk= X +.BI \-\-level= Y +.BI \-\-raid\-devices= Z +.I devices + +.PP +This usage is similar to +.BR \-\-create . +The difference is that it creates an array without a superblock. With +these arrays there is no difference between initially creating the array and +subsequently assembling the array, except that hopefully there is useful +data there in the second case. + +The level may raid0, linear, raid1, raid10, multipath, or faulty, or +one of their synonyms. All devices must be listed and the array will +be started once complete. It will often be appropriate to use +.B \-\-assume\-clean +with levels raid1 or raid10. + +.SH CREATE MODE + +.HP 12 +Usage: +.B mdadm \-\-create +.I md-device +.BI \-\-chunk= X +.BI \-\-level= Y +.br +.BI \-\-raid\-devices= Z +.I devices + +.PP +This usage will initialise a new md array, associate some devices with +it, and activate the array. + +The named device will normally not exist when +.I "mdadm \-\-create" +is run, but will be created by +.I udev +once the array becomes active. + +As devices are added, they are checked to see if they contain RAID +superblocks or filesystems. They are also checked to see if the variance in +device size exceeds 1%. + +If any discrepancy is found, the array will not automatically be run, though +the presence of a +.B \-\-run +can override this caution. + +To create a "degraded" array in which some devices are missing, simply +give the word "\fBmissing\fP" +in place of a device name. This will cause +.I mdadm +to leave the corresponding slot in the array empty. +For a RAID4 or RAID5 array at most one slot can be +"\fBmissing\fP"; for a RAID6 array at most two slots. +For a RAID1 array, only one real device needs to be given. All of the +others can be +"\fBmissing\fP". + +When creating a RAID5 array, +.I mdadm +will automatically create a degraded array with an extra spare drive. +This is because building the spare into a degraded array is in general +faster than resyncing the parity on a non-degraded, but not clean, +array. This feature can be overridden with the +.B \-\-force +option. + +When creating an array with version-1 metadata a name for the array is +required. +If this is not given with the +.B \-\-name +option, +.I mdadm +will choose a name based on the last component of the name of the +device being created. So if +.B /dev/md3 +is being created, then the name +.B 3 +will be chosen. +If +.B /dev/md/home +is being created, then the name +.B home +will be used. + +When creating a partition based array, using +.I mdadm +with version-1.x metadata, the partition type should be set to +.B 0xDA +(non fs-data). This type selection allows for greater precision since +using any other [RAID auto-detect (0xFD) or a GNU/Linux partition (0x83)], +might create problems in the event of array recovery through a live cdrom. + +A new array will normally get a randomly assigned 128bit UUID which is +very likely to be unique. If you have a specific need, you can choose +a UUID for the array by giving the +.B \-\-uuid= +option. Be warned that creating two arrays with the same UUID is a +recipe for disaster. Also, using +.B \-\-uuid= +when creating a v0.90 array will silently override any +.B \-\-homehost= +setting. +.\"If the +.\".B \-\-size +.\"option is given, it is not necessary to list any component-devices in this command. +.\"They can be added later, before a +.\".B \-\-run. +.\"If no +.\".B \-\-size +.\"is given, the apparent size of the smallest drive given is used. + +If the array type supports a write-intent bitmap, and if the devices +in the array exceed 100G is size, an internal write-intent bitmap +will automatically be added unless some other option is explicitly +requested with the +.B \-\-bitmap +option. In any case space for a bitmap will be reserved so that one +can be added layer with +.BR "\-\-grow \-\-bitmap=internal" . + +If the metadata type supports it (currently only 1.x metadata), space +will be allocated to store a bad block list. This allows a modest +number of bad blocks to be recorded, allowing the drive to remain in +service while only partially functional. + +When creating an array within a +.B CONTAINER +.I mdadm +can be given either the list of devices to use, or simply the name of +the container. The former case gives control over which devices in +the container will be used for the array. The latter case allows +.I mdadm +to automatically choose which devices to use based on how much spare +space is available. + +The General Management options that are valid with +.B \-\-create +are: +.TP +.B \-\-run +insist on running the array even if some devices look like they might +be in use. + +.TP +.B \-\-readonly +start the array readonly \(em not supported yet. + +.SH MANAGE MODE +.HP 12 +Usage: +.B mdadm +.I device +.I options... devices... +.PP + +This usage will allow individual devices in an array to be failed, +removed or added. It is possible to perform multiple operations with +on command. For example: +.br +.B " mdadm /dev/md0 \-f /dev/hda1 \-r /dev/hda1 \-a /dev/hda1" +.br +will firstly mark +.B /dev/hda1 +as faulty in +.B /dev/md0 +and will then remove it from the array and finally add it back +in as a spare. However only one md array can be affected by a single +command. + +When a device is added to an active array, mdadm checks to see if it +has metadata on it which suggests that it was recently a member of the +array. If it does, it tries to "re\-add" the device. If there have +been no changes since the device was removed, or if the array has a +write-intent bitmap which has recorded whatever changes there were, +then the device will immediately become a full member of the array and +those differences recorded in the bitmap will be resolved. + +.SH MISC MODE +.HP 12 +Usage: +.B mdadm +.I options ... +.I devices ... +.PP + +MISC mode includes a number of distinct operations that +operate on distinct devices. The operations are: +.TP +.B \-\-query +The device is examined to see if it is +(1) an active md array, or +(2) a component of an md array. +The information discovered is reported. + +.TP +.B \-\-detail +The device should be an active md device. +.B mdadm +will display a detailed description of the array. +.B \-\-brief +or +.B \-\-scan +will cause the output to be less detailed and the format to be +suitable for inclusion in +.BR mdadm.conf . +The exit status of +.I mdadm +will normally be 0 unless +.I mdadm +failed to get useful information about the device(s); however, if the +.B \-\-test +option is given, then the exit status will be: +.RS +.TP +0 +The array is functioning normally. +.TP +1 +The array has at least one failed device. +.TP +2 +The array has multiple failed devices such that it is unusable. +.TP +4 +There was an error while trying to get information about the device. +.RE + +.TP +.B \-\-detail\-platform +Print detail of the platform's RAID capabilities (firmware / hardware +topology). If the metadata is specified with +.B \-e +or +.B \-\-metadata= +then the return status will be: +.RS +.TP +0 +metadata successfully enumerated its platform components on this system +.TP +1 +metadata is platform independent +.TP +2 +metadata failed to find its platform components on this system +.RE + +.TP +.B \-\-update\-subarray= +If the device is a container and the argument to \-\-update\-subarray +specifies a subarray in the container, then attempt to update the given +superblock field in the subarray. Similar to updating an array in +"assemble" mode, the field to update is selected by +.B \-U +or +.B \-\-update= +option. Currently only +.B name +is supported. + +The +.B name +option updates the subarray name in the metadata, it may not affect the +device node name or the device node symlink until the subarray is +re\-assembled. If updating +.B name +would change the UUID of an active subarray this operation is blocked, +and the command will end in an error. + +.TP +.B \-\-examine +The device should be a component of an md array. +.I mdadm +will read the md superblock of the device and display the contents. +If +.B \-\-brief +or +.B \-\-scan +is given, then multiple devices that are components of the one array +are grouped together and reported in a single entry suitable +for inclusion in +.BR mdadm.conf . + +Having +.B \-\-scan +without listing any devices will cause all devices listed in the +config file to be examined. + +.TP +.BI \-\-dump= directory +If the device contains RAID metadata, a file will be created in the +.I directory +and the metadata will be written to it. The file will be the same +size as the device and have the metadata written in the file at the +same locate that it exists in the device. However the file will be "sparse" so +that only those blocks containing metadata will be allocated. The +total space used will be small. + +The file name used in the +.I directory +will be the base name of the device. Further if any links appear in +.I /dev/disk/by-id +which point to the device, then hard links to the file will be created +in +.I directory +based on these +.I by-id +names. + +Multiple devices can be listed and their metadata will all be stored +in the one directory. + +.TP +.BI \-\-restore= directory +This is the reverse of +.BR \-\-dump . +.I mdadm +will locate a file in the directory that has a name appropriate for +the given device and will restore metadata from it. Names that match +.I /dev/disk/by-id +names are preferred, however if two of those refer to different files, +.I mdadm +will not choose between them but will abort the operation. + +If a file name is given instead of a +.I directory +then +.I mdadm +will restore from that file to a single device, always provided the +size of the file matches that of the device, and the file contains +valid metadata. +.TP +.B \-\-stop +The devices should be active md arrays which will be deactivated, as +long as they are not currently in use. + +.TP +.B \-\-run +This will fully activate a partially assembled md array. + +.TP +.B \-\-readonly +This will mark an active array as read-only, providing that it is +not currently being used. + +.TP +.B \-\-readwrite +This will change a +.B readonly +array back to being read/write. + +.TP +.B \-\-scan +For all operations except +.BR \-\-examine , +.B \-\-scan +will cause the operation to be applied to all arrays listed in +.BR /proc/mdstat . +For +.BR \-\-examine, +.B \-\-scan +causes all devices listed in the config file to be examined. + +.TP +.BR \-b ", " \-\-brief +Be less verbose. This is used with +.B \-\-detail +and +.BR \-\-examine . +Using +.B \-\-brief +with +.B \-\-verbose +gives an intermediate level of verbosity. + +.SH MONITOR MODE + +.HP 12 +Usage: +.B mdadm \-\-monitor +.I options... devices... + +.PP +This usage causes +.I mdadm +to periodically poll a number of md arrays and to report on any events +noticed. +.I mdadm +will never exit once it decides that there are arrays to be checked, +so it should normally be run in the background. + +As well as reporting events, +.I mdadm +may move a spare drive from one array to another if they are in the +same +.B spare-group +or +.B domain +and if the destination array has a failed drive but no spares. + +If any devices are listed on the command line, +.I mdadm +will only monitor those devices. Otherwise all arrays listed in the +configuration file will be monitored. Further, if +.B \-\-scan +is given, then any other md devices that appear in +.B /proc/mdstat +will also be monitored. + +The result of monitoring the arrays is the generation of events. +These events are passed to a separate program (if specified) and may +be mailed to a given E-mail address. + +When passing events to a program, the program is run once for each event, +and is given 2 or 3 command-line arguments: the first is the +name of the event (see below), the second is the name of the +md device which is affected, and the third is the name of a related +device if relevant (such as a component device that has failed). + +If +.B \-\-scan +is given, then a program or an E-mail address must be specified on the +command line or in the config file. If neither are available, then +.I mdadm +will not monitor anything. +Without +.B \-\-scan, +.I mdadm +will continue monitoring as long as something was found to monitor. If +no program or email is given, then each event is reported to +.BR stdout . + +The different events are: + +.RS 4 +.TP +.B DeviceDisappeared +An md array which previously was configured appears to no longer be +configured. (syslog priority: Critical) + +If +.I mdadm +was told to monitor an array which is RAID0 or Linear, then it will +report +.B DeviceDisappeared +with the extra information +.BR Wrong-Level . +This is because RAID0 and Linear do not support the device-failed, +hot-spare and resync operations which are monitored. + +.TP +.B RebuildStarted +An md array started reconstruction (e.g. recovery, resync, reshape, +check, repair). (syslog priority: Warning) + +.TP +.BI Rebuild NN +Where +.I NN +is a two-digit number (ie. 05, 48). This indicates that rebuild +has passed that many percent of the total. The events are generated +with fixed increment since 0. Increment size may be specified with +a commandline option (default is 20). (syslog priority: Warning) + +.TP +.B RebuildFinished +An md array that was rebuilding, isn't any more, either because it +finished normally or was aborted. (syslog priority: Warning) + +.TP +.B Fail +An active component device of an array has been marked as +faulty. (syslog priority: Critical) + +.TP +.B FailSpare +A spare component device which was being rebuilt to replace a faulty +device has failed. (syslog priority: Critical) + +.TP +.B SpareActive +A spare component device which was being rebuilt to replace a faulty +device has been successfully rebuilt and has been made active. +(syslog priority: Info) + +.TP +.B NewArray +A new md array has been detected in the +.B /proc/mdstat +file. (syslog priority: Info) + +.TP +.B DegradedArray +A newly noticed array appears to be degraded. This message is not +generated when +.I mdadm +notices a drive failure which causes degradation, but only when +.I mdadm +notices that an array is degraded when it first sees the array. +(syslog priority: Critical) + +.TP +.B MoveSpare +A spare drive has been moved from one array in a +.B spare-group +or +.B domain +to another to allow a failed drive to be replaced. +(syslog priority: Info) + +.TP +.B SparesMissing +If +.I mdadm +has been told, via the config file, that an array should have a certain +number of spare devices, and +.I mdadm +detects that it has fewer than this number when it first sees the +array, it will report a +.B SparesMissing +message. +(syslog priority: Warning) + +.TP +.B TestMessage +An array was found at startup, and the +.B \-\-test +flag was given. +(syslog priority: Info) +.RE + +Only +.B Fail, +.B FailSpare, +.B DegradedArray, +.B SparesMissing +and +.B TestMessage +cause Email to be sent. All events cause the program to be run. +The program is run with two or three arguments: the event +name, the array device and possibly a second device. + +Each event has an associated array device (e.g. +.BR /dev/md1 ) +and possibly a second device. For +.BR Fail , +.BR FailSpare , +and +.B SpareActive +the second device is the relevant component device. +For +.B MoveSpare +the second device is the array that the spare was moved from. + +For +.I mdadm +to move spares from one array to another, the different arrays need to +be labeled with the same +.B spare-group +or the spares must be allowed to migrate through matching POLICY domains +in the configuration file. The +.B spare-group +name can be any string; it is only necessary that different spare +groups use different names. + +When +.I mdadm +detects that an array in a spare group has fewer active +devices than necessary for the complete array, and has no spare +devices, it will look for another array in the same spare group that +has a full complement of working drive and a spare. It will then +attempt to remove the spare from the second drive and add it to the +first. +If the removal succeeds but the adding fails, then it is added back to +the original array. + +If the spare group for a degraded array is not defined, +.I mdadm +will look at the rules of spare migration specified by POLICY lines in +.B mdadm.conf +and then follow similar steps as above if a matching spare is found. + +.SH GROW MODE +The GROW mode is used for changing the size or shape of an active +array. +For this to work, the kernel must support the necessary change. +Various types of growth are being added during 2.6 development. + +Currently the supported changes include +.IP \(bu 4 +change the "size" attribute for RAID1, RAID4, RAID5 and RAID6. +.IP \(bu 4 +increase or decrease the "raid\-devices" attribute of RAID0, RAID1, RAID4, +RAID5, and RAID6. +.IP \(bu 4 +change the chunk-size and layout of RAID0, RAID4, RAID5, RAID6 and RAID10. +.IP \(bu 4 +convert between RAID1 and RAID5, between RAID5 and RAID6, between +RAID0, RAID4, and RAID5, and between RAID0 and RAID10 (in the near-2 mode). +.IP \(bu 4 +add a write-intent bitmap to any array which supports these bitmaps, or +remove a write-intent bitmap from such an array. +.PP + +Using GROW on containers is currently supported only for Intel's IMSM +container format. The number of devices in a container can be +increased - which affects all arrays in the container - or an array +in a container can be converted between levels where those levels are +supported by the container, and the conversion is on of those listed +above. Resizing arrays in an IMSM container with +.B "--grow --size" +is not yet supported. + +Grow functionality (e.g. expand a number of raid devices) for Intel's +IMSM container format has an experimental status. It is guarded by the +.B MDADM_EXPERIMENTAL +environment variable which must be set to '1' for a GROW command to +succeed. +This is for the following reasons: + +.IP 1. +Intel's native IMSM check-pointing is not fully tested yet. +This can causes IMSM incompatibility during the grow process: an array +which is growing cannot roam between Microsoft Windows(R) and Linux +systems. + +.IP 2. +Interrupting a grow operation is not recommended, because it +has not been fully tested for Intel's IMSM container format yet. + +.PP +Note: Intel's native checkpointing doesn't use +.B --backup-file +option and it is transparent for assembly feature. + +.SS SIZE CHANGES +Normally when an array is built the "size" is taken from the smallest +of the drives. If all the small drives in an arrays are, one at a +time, removed and replaced with larger drives, then you could have an +array of large drives with only a small amount used. In this +situation, changing the "size" with "GROW" mode will allow the extra +space to start being used. If the size is increased in this way, a +"resync" process will start to make sure the new parts of the array +are synchronised. + +Note that when an array changes size, any filesystem that may be +stored in the array will not automatically grow or shrink to use or +vacate the space. The +filesystem will need to be explicitly told to use the extra space +after growing, or to reduce its size +.B prior +to shrinking the array. + +Also the size of an array cannot be changed while it has an active +bitmap. If an array has a bitmap, it must be removed before the size +can be changed. Once the change is complete a new bitmap can be created. + +.SS RAID\-DEVICES CHANGES + +A RAID1 array can work with any number of devices from 1 upwards +(though 1 is not very useful). There may be times which you want to +increase or decrease the number of active devices. Note that this is +different to hot-add or hot-remove which changes the number of +inactive devices. + +When reducing the number of devices in a RAID1 array, the slots which +are to be removed from the array must already be vacant. That is, the +devices which were in those slots must be failed and removed. + +When the number of devices is increased, any hot spares that are +present will be activated immediately. + +Changing the number of active devices in a RAID5 or RAID6 is much more +effort. Every block in the array will need to be read and written +back to a new location. From 2.6.17, the Linux Kernel is able to +increase the number of devices in a RAID5 safely, including restarting +an interrupted "reshape". From 2.6.31, the Linux Kernel is able to +increase or decrease the number of devices in a RAID5 or RAID6. + +From 2.6.35, the Linux Kernel is able to convert a RAID0 in to a RAID4 +or RAID5. +.I mdadm +uses this functionality and the ability to add +devices to a RAID4 to allow devices to be added to a RAID0. When +requested to do this, +.I mdadm +will convert the RAID0 to a RAID4, add the necessary disks and make +the reshape happen, and then convert the RAID4 back to RAID0. + +When decreasing the number of devices, the size of the array will also +decrease. If there was data in the array, it could get destroyed and +this is not reversible, so you should firstly shrink the filesystem on +the array to fit within the new size. To help prevent accidents, +.I mdadm +requires that the size of the array be decreased first with +.BR "mdadm --grow --array-size" . +This is a reversible change which simply makes the end of the array +inaccessible. The integrity of any data can then be checked before +the non-reversible reduction in the number of devices is request. + +When relocating the first few stripes on a RAID5 or RAID6, it is not +possible to keep the data on disk completely consistent and +crash-proof. To provide the required safety, mdadm disables writes to +the array while this "critical section" is reshaped, and takes a +backup of the data that is in that section. For grows, this backup may be +stored in any spare devices that the array has, however it can also be +stored in a separate file specified with the +.B \-\-backup\-file +option, and is required to be specified for shrinks, RAID level +changes and layout changes. If this option is used, and the system +does crash during the critical period, the same file must be passed to +.B \-\-assemble +to restore the backup and reassemble the array. When shrinking rather +than growing the array, the reshape is done from the end towards the +beginning, so the "critical section" is at the end of the reshape. + +.SS LEVEL CHANGES + +Changing the RAID level of any array happens instantaneously. However +in the RAID5 to RAID6 case this requires a non-standard layout of the +RAID6 data, and in the RAID6 to RAID5 case that non-standard layout is +required before the change can be accomplished. So while the level +change is instant, the accompanying layout change can take quite a +long time. A +.B \-\-backup\-file +is required. If the array is not simultaneously being grown or +shrunk, so that the array size will remain the same - for example, +reshaping a 3-drive RAID5 into a 4-drive RAID6 - the backup file will +be used not just for a "cricital section" but throughout the reshape +operation, as described below under LAYOUT CHANGES. + +.SS CHUNK-SIZE AND LAYOUT CHANGES + +Changing the chunk-size of layout without also changing the number of +devices as the same time will involve re-writing all blocks in-place. +To ensure against data loss in the case of a crash, a +.B --backup-file +must be provided for these changes. Small sections of the array will +be copied to the backup file while they are being rearranged. This +means that all the data is copied twice, once to the backup and once +to the new layout on the array, so this type of reshape will go very +slowly. + +If the reshape is interrupted for any reason, this backup file must be +made available to +.B "mdadm --assemble" +so the array can be reassembled. Consequently the file cannot be +stored on the device being reshaped. + + +.SS BITMAP CHANGES + +A write-intent bitmap can be added to, or removed from, an active +array. Either internal bitmaps, or bitmaps stored in a separate file, +can be added. Note that if you add a bitmap stored in a file which is +in a filesystem that is on the RAID array being affected, the system +will deadlock. The bitmap must be on a separate filesystem. + +.SH INCREMENTAL MODE + +.HP 12 +Usage: +.B mdadm \-\-incremental +.RB [ \-\-run ] +.RB [ \-\-quiet ] +.I component-device +.RI [ optional-aliases-for-device ] +.HP 12 +Usage: +.B mdadm \-\-incremental \-\-fail +.I component-device +.HP 12 +Usage: +.B mdadm \-\-incremental \-\-rebuild\-map +.HP 12 +Usage: +.B mdadm \-\-incremental \-\-run \-\-scan + +.PP +This mode is designed to be used in conjunction with a device +discovery system. As devices are found in a system, they can be +passed to +.B "mdadm \-\-incremental" +to be conditionally added to an appropriate array. + +Conversely, it can also be used with the +.B \-\-fail +flag to do just the opposite and find whatever array a particular device +is part of and remove the device from that array. + +If the device passed is a +.B CONTAINER +device created by a previous call to +.IR mdadm , +then rather than trying to add that device to an array, all the arrays +described by the metadata of the container will be started. + +.I mdadm +performs a number of tests to determine if the device is part of an +array, and which array it should be part of. If an appropriate array +is found, or can be created, +.I mdadm +adds the device to the array and conditionally starts the array. + +Note that +.I mdadm +will normally only add devices to an array which were previously working +(active or spare) parts of that array. The support for automatic +inclusion of a new drive as a spare in some array requires +a configuration through POLICY in config file. + +The tests that +.I mdadm +makes are as follow: +.IP + +Is the device permitted by +.BR mdadm.conf ? +That is, is it listed in a +.B DEVICES +line in that file. If +.B DEVICES +is absent then the default it to allow any device. Similarly if +.B DEVICES +contains the special word +.B partitions +then any device is allowed. Otherwise the device name given to +.IR mdadm , +or one of the aliases given, or an alias found in the filesystem, +must match one of the names or patterns in a +.B DEVICES +line. + +This is the only context where the aliases are used. They are +usually provided by a +.I udev +rules mentioning +.BR ${DEVLINKS} . + +.IP + +Does the device have a valid md superblock? If a specific metadata +version is requested with +.B \-\-metadata +or +.B \-e +then only that style of metadata is accepted, otherwise +.I mdadm +finds any known version of metadata. If no +.I md +metadata is found, the device may be still added to an array +as a spare if POLICY allows. + +.ig +.IP + +Does the metadata match an expected array? +The metadata can match in two ways. Either there is an array listed +in +.B mdadm.conf +which identifies the array (either by UUID, by name, by device list, +or by minor-number), or the array was created with a +.B homehost +specified and that +.B homehost +matches the one in +.B mdadm.conf +or on the command line. +If +.I mdadm +is not able to positively identify the array as belonging to the +current host, the device will be rejected. +.. + +.PP +.I mdadm +keeps a list of arrays that it has partially assembled in +.BR {MAP_PATH} . +If no array exists which matches +the metadata on the new device, +.I mdadm +must choose a device name and unit number. It does this based on any +name given in +.B mdadm.conf +or any name information stored in the metadata. If this name +suggests a unit number, that number will be used, otherwise a free +unit number will be chosen. Normally +.I mdadm +will prefer to create a partitionable array, however if the +.B CREATE +line in +.B mdadm.conf +suggests that a non-partitionable array is preferred, that will be +honoured. + +If the array is not found in the config file and its metadata does not +identify it as belonging to the "homehost", then +.I mdadm +will choose a name for the array which is certain not to conflict with +any array which does belong to this host. It does this be adding an +underscore and a small number to the name preferred by the metadata. + +Once an appropriate array is found or created and the device is added, +.I mdadm +must decide if the array is ready to be started. It will +normally compare the number of available (non-spare) devices to the +number of devices that the metadata suggests need to be active. If +there are at least that many, the array will be started. This means +that if any devices are missing the array will not be restarted. + +As an alternative, +.B \-\-run +may be passed to +.I mdadm +in which case the array will be run as soon as there are enough +devices present for the data to be accessible. For a RAID1, that +means one device will start the array. For a clean RAID5, the array +will be started as soon as all but one drive is present. + +Note that neither of these approaches is really ideal. If it can +be known that all device discovery has completed, then +.br +.B " mdadm \-IRs" +.br +can be run which will try to start all arrays that are being +incrementally assembled. They are started in "read-auto" mode in +which they are read-only until the first write request. This means +that no metadata updates are made and no attempt at resync or recovery +happens. Further devices that are found before the first write can +still be added safely. + +.SH ENVIRONMENT +This section describes environment variables that affect how mdadm +operates. + +.TP +.B MDADM_NO_MDMON +Setting this value to 1 will prevent mdadm from automatically launching +mdmon. This variable is intended primarily for debugging mdadm/mdmon. + +.TP +.B MDADM_NO_UDEV +Normally, +.I mdadm +does not create any device nodes in /dev, but leaves that task to +.IR udev . +If +.I udev +appears not to be configured, or if this environment variable is set +to '1', the +.I mdadm +will create and devices that are needed. + +.TP +.B MDADM_NO_SYSTEMCTL +If +.I mdadm +detects that +.I systemd +is in use it will normally request +.I systemd +to start various background tasks (particularly +.IR mdmon ) +rather than forking and running them in the background. This can be +suppressed by setting +.BR MDADM_NO_SYSTEMCTL=1 . + +.TP +.B IMSM_NO_PLATFORM +A key value of IMSM metadata is that it allows interoperability with +boot ROMs on Intel platforms, and with other major operating systems. +Consequently, +.I mdadm +will only allow an IMSM array to be created or modified if detects +that it is running on an Intel platform which supports IMSM, and +supports the particular configuration of IMSM that is being requested +(some functionality requires newer OROM support). + +These checks can be suppressed by setting IMSM_NO_PLATFORM=1 in the +environment. This can be useful for testing or for disaster +recovery. You should be aware that interoperability may be +compromised by setting this value. + +.TP +.B MDADM_GROW_ALLOW_OLD +If an array is stopped while it is performing a reshape and that +reshape was making use of a backup file, then when the array is +re-assembled +.I mdadm +will sometimes complain that the backup file is too old. If this +happens and you are certain it is the right backup file, you can +over-ride this check by setting +.B MDADM_GROW_ALLOW_OLD=1 +in the environment. + +.TP +.B MDADM_CONF_AUTO +Any string given in this variable is added to the start of the +.B AUTO +line in the config file, or treated as the whole +.B AUTO +line if none is given. It can be used to disable certain metadata +types when +.I mdadm +is called from a boot script. For example +.br +.B " export MDADM_CONF_AUTO='-ddf -imsm' +.br +will make sure that +.I mdadm +does not automatically assemble any DDF or +IMSM arrays that are found. This can be useful on systems configured +to manage such arrays with +.BR dmraid . + + +.SH EXAMPLES + +.B " mdadm \-\-query /dev/name-of-device" +.br +This will find out if a given device is a RAID array, or is part of +one, and will provide brief information about the device. + +.B " mdadm \-\-assemble \-\-scan" +.br +This will assemble and start all arrays listed in the standard config +file. This command will typically go in a system startup file. + +.B " mdadm \-\-stop \-\-scan" +.br +This will shut down all arrays that can be shut down (i.e. are not +currently in use). This will typically go in a system shutdown script. + +.B " mdadm \-\-follow \-\-scan \-\-delay=120" +.br +If (and only if) there is an Email address or program given in the +standard config file, then +monitor the status of all arrays listed in that file by +polling them ever 2 minutes. + +.B " mdadm \-\-create /dev/md0 \-\-level=1 \-\-raid\-devices=2 /dev/hd[ac]1" +.br +Create /dev/md0 as a RAID1 array consisting of /dev/hda1 and /dev/hdc1. + +.br +.B " echo 'DEVICE /dev/hd*[0\-9] /dev/sd*[0\-9]' > mdadm.conf" +.br +.B " mdadm \-\-detail \-\-scan >> mdadm.conf" +.br +This will create a prototype config file that describes currently +active arrays that are known to be made from partitions of IDE or SCSI drives. +This file should be reviewed before being used as it may +contain unwanted detail. + +.B " echo 'DEVICE /dev/hd[a\-z] /dev/sd*[a\-z]' > mdadm.conf" +.br +.B " mdadm \-\-examine \-\-scan \-\-config=mdadm.conf >> mdadm.conf" +.br +This will find arrays which could be assembled from existing IDE and +SCSI whole drives (not partitions), and store the information in the +format of a config file. +This file is very likely to contain unwanted detail, particularly +the +.B devices= +entries. It should be reviewed and edited before being used as an +actual config file. + +.B " mdadm \-\-examine \-\-brief \-\-scan \-\-config=partitions" +.br +.B " mdadm \-Ebsc partitions" +.br +Create a list of devices by reading +.BR /proc/partitions , +scan these for RAID superblocks, and printout a brief listing of all +that were found. + +.B " mdadm \-Ac partitions \-m 0 /dev/md0" +.br +Scan all partitions and devices listed in +.BR /proc/partitions +and assemble +.B /dev/md0 +out of all such devices with a RAID superblock with a minor number of 0. + +.B " mdadm \-\-monitor \-\-scan \-\-daemonise > /run/mdadm/mon.pid" +.br +If config file contains a mail address or alert program, run mdadm in +the background in monitor mode monitoring all md devices. Also write +pid of mdadm daemon to +.BR /run/mdadm/mon.pid . + +.B " mdadm \-Iq /dev/somedevice" +.br +Try to incorporate newly discovered device into some array as +appropriate. + +.B " mdadm \-\-incremental \-\-rebuild\-map \-\-run \-\-scan" +.br +Rebuild the array map from any current arrays, and then start any that +can be started. + +.B " mdadm /dev/md4 --fail detached --remove detached" +.br +Any devices which are components of /dev/md4 will be marked as faulty +and then remove from the array. + +.B " mdadm --grow /dev/md4 --level=6 --backup-file=/root/backup-md4" +.br +The array +.B /dev/md4 +which is currently a RAID5 array will be converted to RAID6. There +should normally already be a spare drive attached to the array as a +RAID6 needs one more drive than a matching RAID5. + +.B " mdadm --create /dev/md/ddf --metadata=ddf --raid-disks 6 /dev/sd[a-f]" +.br +Create a DDF array over 6 devices. + +.B " mdadm --create /dev/md/home -n3 -l5 -z 30000000 /dev/md/ddf" +.br +Create a RAID5 array over any 3 devices in the given DDF set. Use +only 30 gigabytes of each device. + +.B " mdadm -A /dev/md/ddf1 /dev/sd[a-f]" +.br +Assemble a pre-exist ddf array. + +.B " mdadm -I /dev/md/ddf1" +.br +Assemble all arrays contained in the ddf array, assigning names as +appropriate. + +.B " mdadm \-\-create \-\-help" +.br +Provide help about the Create mode. + +.B " mdadm \-\-config \-\-help" +.br +Provide help about the format of the config file. + +.B " mdadm \-\-help" +.br +Provide general help. + +.SH FILES + +.SS /proc/mdstat + +If you're using the +.B /proc +filesystem, +.B /proc/mdstat +lists all active md devices with information about them. +.I mdadm +uses this to find arrays when +.B \-\-scan +is given in Misc mode, and to monitor array reconstruction +on Monitor mode. + +.SS /etc/mdadm/mdadm.conf (or /etc/mdadm.conf) + +The config file lists which devices may be scanned to see if +they contain MD super block, and gives identifying information +(e.g. UUID) about known MD arrays. See +.BR mdadm.conf (5) +for more details. + +.SS /etc/mdadm/mdadm.conf.d (or /etc/mdadm.conf.d) + +A directory containing configuration files which are read in lexical +order. + +.SS {MAP_PATH} +When +.B \-\-incremental +mode is used, this file gets a list of arrays currently being created. + +.SH DEVICE NAMES + +.I mdadm +understand two sorts of names for array devices. + +The first is the so-called 'standard' format name, which matches the +names used by the kernel and which appear in +.IR /proc/mdstat . + +The second sort can be freely chosen, but must reside in +.IR /dev/md/ . +When giving a device name to +.I mdadm +to create or assemble an array, either full path name such as +.I /dev/md0 +or +.I /dev/md/home +can be given, or just the suffix of the second sort of name, such as +.I home +can be given. + +When +.I mdadm +chooses device names during auto-assembly or incremental assembly, it +will sometimes add a small sequence number to the end of the name to +avoid conflicted between multiple arrays that have the same name. If +.I mdadm +can reasonably determine that the array really is meant for this host, +either by a hostname in the metadata, or by the presence of the array +in +.BR mdadm.conf , +then it will leave off the suffix if possible. +Also if the homehost is specified as +.B <ignore> +.I mdadm +will only use a suffix if a different array of the same name already +exists or is listed in the config file. + +The standard names for non-partitioned arrays (the only sort of md +array available in 2.4 and earlier) are of the form +.IP +.RB /dev/md NN +.PP +where NN is a number. +The standard names for partitionable arrays (as available from 2.6 +onwards) are of the form: +.IP +.RB /dev/md_d NN +.PP +Partition numbers should be indicated by adding "pMM" to these, thus "/dev/md/d1p2". +.PP +From kernel version 2.6.28 the "non-partitioned array" can actually +be partitioned. So the "md_d\fBNN\fP" +names are no longer needed, and +partitions such as "/dev/md\fBNN\fPp\fBXX\fP" +are possible. +.PP +From kernel version 2.6.29 standard names can be non-numeric following +the form: +.IP +.RB /dev/md_ XXX +.PP +where +.B XXX +is any string. These names are supported by +.I mdadm +since version 3.3 provided they are enabled in +.IR mdadm.conf . + +.SH NOTE +.I mdadm +was previously known as +.IR mdctl . + +.SH SEE ALSO +For further information on mdadm usage, MD and the various levels of +RAID, see: +.IP +.B http://raid.wiki.kernel.org/ +.PP +(based upon Jakob \(/Ostergaard's Software\-RAID.HOWTO) +.PP +The latest version of +.I mdadm +should always be available from +.IP +.B http://www.kernel.org/pub/linux/utils/raid/mdadm/ +.PP +Related man pages: +.PP +.IR mdmon (8), +.IR mdadm.conf (5), +.IR md (4). diff --git a/mdadm.c b/mdadm.c new file mode 100644 index 00000000..93732a8f --- /dev/null +++ b/mdadm.c @@ -0,0 +1,1842 @@ +/* + * mdadm - manage Linux "md" devices aka RAID arrays. + * + * Copyright (C) 2001-2013 Neil Brown <neilb@suse.de> + * + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Author: Neil Brown + * Email: <neilb@suse.de> + * + * Additions for bitmap and write-behind RAID options, Copyright (C) 2003-2004, + * Paul Clements, SteelEye Technology, Inc. + */ + +#include "mdadm.h" +#include "md_p.h" +#include <ctype.h> + +static int scan_assemble(struct supertype *ss, + struct context *c, + struct mddev_ident *ident); +static int misc_scan(char devmode, struct context *c); +static int stop_scan(int verbose); +static int misc_list(struct mddev_dev *devlist, + struct mddev_ident *ident, + char *dump_directory, + struct supertype *ss, struct context *c); +const char Name[] = "mdadm"; + +int main(int argc, char *argv[]) +{ + int mode = 0; + int opt; + int option_index; + int rv; + int i; + + unsigned long long array_size = 0; + unsigned long long data_offset = INVALID_SECTORS; + struct mddev_ident ident; + char *configfile = NULL; + int devmode = 0; + int bitmap_fd = -1; + struct mddev_dev *devlist = NULL; + struct mddev_dev **devlistend = & devlist; + struct mddev_dev *dv; + int devs_found = 0; + char *symlinks = NULL; + int grow_continue = 0; + /* autof indicates whether and how to create device node. + * bottom 3 bits are style. Rest (when shifted) are number of parts + * 0 - unset + * 1 - don't create (no) + * 2 - if is_standard, then create (yes) + * 3 - create as 'md' - reject is_standard mdp (md) + * 4 - create as 'mdp' - reject is_standard md (mdp) + * 5 - default to md if not is_standard (md in config file) + * 6 - default to mdp if not is_standard (part, or mdp in config file) + */ + struct context c = { + .require_homehost = 1, + }; + struct shape s = { + .level = UnSet, + .layout = UnSet, + .bitmap_chunk = UnSet, + }; + + char sys_hostname[256]; + char *mailaddr = NULL; + char *program = NULL; + int increments = 20; + int daemonise = 0; + char *pidfile = NULL; + int oneshot = 0; + int spare_sharing = 1; + struct supertype *ss = NULL; + int writemostly = 0; + char *shortopt = short_options; + int dosyslog = 0; + int rebuild_map = 0; + char *remove_path = NULL; + char *udev_filename = NULL; + char *dump_directory = NULL; + + int print_help = 0; + FILE *outf; + + int mdfd = -1; + + srandom(time(0) ^ getpid()); + + ident.uuid_set=0; + ident.level = UnSet; + ident.raid_disks = UnSet; + ident.super_minor= UnSet; + ident.devices=0; + ident.spare_group = NULL; + ident.autof = 0; + ident.st = NULL; + ident.bitmap_fd = -1; + ident.bitmap_file = NULL; + ident.name[0] = 0; + ident.container = NULL; + ident.member = NULL; + + while ((option_index = -1) , + (opt=getopt_long(argc, argv, + shortopt, long_options, + &option_index)) != -1) { + int newmode = mode; + /* firstly, some mode-independent options */ + switch(opt) { + case HelpOptions: + print_help = 2; + continue; + case 'h': + print_help = 1; + continue; + + case 'V': + fputs(Version, stderr); + exit(0); + + case 'v': c.verbose++; + continue; + + case 'q': c.verbose--; + continue; + + case 'b': + if (mode == ASSEMBLE || mode == BUILD || mode == CREATE + || mode == GROW || mode == INCREMENTAL + || mode == MANAGE) + break; /* b means bitmap */ + case Brief: + c.brief = 1; + continue; + + case 'Y': c.export++; + continue; + + case HomeHost: + if (strcasecmp(optarg, "<ignore>") == 0) + c.require_homehost = 0; + else + c.homehost = optarg; + continue; + + case OffRootOpt: + /* Silently ignore old option */ + continue; + + case Prefer: + if (c.prefer) + free(c.prefer); + if (asprintf(&c.prefer, "/%s/", optarg) <= 0) + c.prefer = NULL; + continue; + + case ':': + case '?': + fputs(Usage, stderr); + exit(2); + } + /* second, figure out the mode. + * Some options force the mode. Others + * set the mode if it isn't already + */ + + switch(opt) { + case ManageOpt: + newmode = MANAGE; + shortopt = short_bitmap_options; + break; + case 'a': + case Add: + case AddSpare: + case 'r': + case Remove: + case Replace: + case With: + case 'f': + case Fail: + case ReAdd: /* re-add */ + if (!mode) { + newmode = MANAGE; + shortopt = short_bitmap_options; + } + break; + + case 'A': newmode = ASSEMBLE; + shortopt = short_bitmap_auto_options; + break; + case 'B': newmode = BUILD; + shortopt = short_bitmap_auto_options; + break; + case 'C': newmode = CREATE; + shortopt = short_bitmap_auto_options; + break; + case 'F': newmode = MONITOR; + break; + case 'G': newmode = GROW; + shortopt = short_bitmap_options; + break; + case 'I': newmode = INCREMENTAL; + shortopt = short_bitmap_auto_options; + break; + case AutoDetect: + newmode = AUTODETECT; + break; + + case MiscOpt: + case 'D': + case 'E': + case 'X': + case 'Q': + case ExamineBB: + case Dump: + case Restore: + case Action: + newmode = MISC; + break; + + case 'R': + case 'S': + case 'o': + case 'w': + case 'W': + case WaitOpt: + case Waitclean: + case DetailPlatform: + case KillSubarray: + case UpdateSubarray: + case UdevRules: + case KillOpt: + if (!mode) + newmode = MISC; + break; + + case NoSharing: + newmode = MONITOR; + break; + } + if (mode && newmode == mode) { + /* everybody happy ! */ + } else if (mode && newmode != mode) { + /* not allowed.. */ + pr_err(""); + if (option_index >= 0) + fprintf(stderr, "--%s", long_options[option_index].name); + else + fprintf(stderr, "-%c", opt); + fprintf(stderr, " would set mdadm mode to \"%s\", but it is already set to \"%s\".\n", + map_num(modes, newmode), + map_num(modes, mode)); + exit(2); + } else if (!mode && newmode) { + mode = newmode; + if (mode == MISC && devs_found) { + pr_err("No action given for %s in --misc mode\n", + devlist->devname); + cont_err("Action options must come before device names\n"); + exit(2); + } + } else { + /* special case of -c --help */ + if ((opt == 'c' || opt == ConfigFile) && + ( strncmp(optarg, "--h", 3)==0 || + strncmp(optarg, "-h", 2)==0)) { + fputs(Help_config, stdout); + exit(0); + } + + /* If first option is a device, don't force the mode yet */ + if (opt == 1) { + if (devs_found == 0) { + dv = xmalloc(sizeof(*dv)); + dv->devname = optarg; + dv->disposition = devmode; + dv->writemostly = writemostly; + dv->used = 0; + dv->next = NULL; + *devlistend = dv; + devlistend = &dv->next; + + devs_found++; + continue; + } + /* No mode yet, and this is the second device ... */ + pr_err("An option must be given to set the mode before a second device\n" + " (%s) is listed\n", optarg); + exit(2); + } + if (option_index >= 0) + pr_err("--%s", long_options[option_index].name); + else + pr_err("-%c", opt); + fprintf(stderr, " does not set the mode, and so cannot be the first option.\n"); + exit(2); + } + + /* if we just set the mode, then done */ + switch(opt) { + case ManageOpt: + case MiscOpt: + case 'A': + case 'B': + case 'C': + case 'F': + case 'G': + case 'I': + case AutoDetect: + continue; + } + if (opt == 1) { + /* an undecorated option - must be a device name. + */ + + if (devs_found > 0 && devmode == DetailPlatform) { + pr_err("controller may only be specified once. %s ignored\n", + optarg); + continue; + } + + if (devs_found > 0 && mode == MANAGE && !devmode) { + pr_err("Must give one of -a/-r/-f for subsequent devices at %s\n", optarg); + exit(2); + } + if (devs_found > 0 && mode == GROW && !devmode) { + pr_err("Must give -a/--add for devices to add: %s\n", optarg); + exit(2); + } + dv = xmalloc(sizeof(*dv)); + dv->devname = optarg; + dv->disposition = devmode; + dv->writemostly = writemostly; + dv->used = 0; + dv->next = NULL; + *devlistend = dv; + devlistend = &dv->next; + + devs_found++; + continue; + } + + /* We've got a mode, and opt is now something else which + * could depend on the mode */ +#define O(a,b) ((a<<16)|b) + switch (O(mode,opt)) { + case O(GROW,'c'): + case O(GROW,ChunkSize): + case O(CREATE,'c'): + case O(CREATE,ChunkSize): + case O(BUILD,'c'): /* chunk or rounding */ + case O(BUILD,ChunkSize): /* chunk or rounding */ + if (s.chunk) { + pr_err("chunk/rounding may only be specified once. Second value is %s.\n", optarg); + exit(2); + } + s.chunk = parse_size(optarg); + if (s.chunk == INVALID_SECTORS || + s.chunk < 8 || (s.chunk&1)) { + pr_err("invalid chunk/rounding value: %s\n", + optarg); + exit(2); + } + /* Convert sectors to K */ + s.chunk /= 2; + continue; + + case O(INCREMENTAL, 'e'): + case O(CREATE,'e'): + case O(ASSEMBLE,'e'): + case O(MISC,'e'): /* set metadata (superblock) information */ + if (ss) { + pr_err("metadata information already given\n"); + exit(2); + } + for(i=0; !ss && superlist[i]; i++) + ss = superlist[i]->match_metadata_desc(optarg); + + if (!ss) { + pr_err("unrecognised metadata identifier: %s\n", optarg); + exit(2); + } + continue; + + case O(MANAGE,'W'): + case O(MANAGE,WriteMostly): + case O(BUILD,'W'): + case O(BUILD,WriteMostly): + case O(CREATE,'W'): + case O(CREATE,WriteMostly): + /* set write-mostly for following devices */ + writemostly = 1; + continue; + + case O(MANAGE,'w'): + /* clear write-mostly for following devices */ + writemostly = 2; + continue; + + case O(GROW,'z'): + case O(CREATE,'z'): + case O(BUILD,'z'): /* size */ + if (s.size > 0) { + pr_err("size may only be specified once. Second value is %s.\n", optarg); + exit(2); + } + if (strcmp(optarg, "max")==0) + s.size = MAX_SIZE; + else { + s.size = parse_size(optarg); + if (s.size == INVALID_SECTORS || + s.size < 8) { + pr_err("invalid size: %s\n", + optarg); + exit(2); + } + /* convert sectors to K */ + s.size /= 2; + } + continue; + + case O(GROW,'Z'): /* array size */ + if (array_size > 0) { + pr_err("array-size may only be specified once. Second value is %s.\n", optarg); + exit(2); + } + if (strcmp(optarg, "max") == 0) + array_size = MAX_SIZE; + else { + array_size = parse_size(optarg); + if (array_size == 0 || + array_size == INVALID_SECTORS) { + pr_err("invalid array size: %s\n", + optarg); + exit(2); + } + } + continue; + + case O(CREATE,DataOffset): + case O(GROW,DataOffset): + if (data_offset != INVALID_SECTORS) { + pr_err("data-offset may only be specified one. Second value is %s.\n", optarg); + exit(2); + } + if (mode == CREATE && + strcmp(optarg, "variable") == 0) + data_offset = VARIABLE_OFFSET; + else + data_offset = parse_size(optarg); + if (data_offset == INVALID_SECTORS) { + pr_err("invalid data-offset: %s\n", + optarg); + exit(2); + } + continue; + + case O(GROW,'l'): + case O(CREATE,'l'): + case O(BUILD,'l'): /* set raid level*/ + if (s.level != UnSet) { + pr_err("raid level may only be set once. Second value is %s.\n", optarg); + exit(2); + } + s.level = map_name(pers, optarg); + if (s.level == UnSet) { + pr_err("invalid raid level: %s\n", + optarg); + exit(2); + } + if (s.level != 0 && s.level != LEVEL_LINEAR && s.level != 1 && + s.level != LEVEL_MULTIPATH && s.level != LEVEL_FAULTY && + s.level != 10 && + mode == BUILD) { + pr_err("Raid level %s not permitted with --build.\n", + optarg); + exit(2); + } + if (s.sparedisks > 0 && s.level < 1 && s.level >= -1) { + pr_err("raid level %s is incompatible with spare-devices setting.\n", + optarg); + exit(2); + } + ident.level = s.level; + continue; + + case O(GROW, 'p'): /* new layout */ + case O(GROW, Layout): + if (s.layout_str) { + pr_err("layout may only be sent once. Second value was %s\n", optarg); + exit(2); + } + s.layout_str = optarg; + /* 'Grow' will parse the value */ + continue; + + case O(CREATE,'p'): /* raid5 layout */ + case O(CREATE,Layout): + case O(BUILD,'p'): /* faulty layout */ + case O(BUILD,Layout): + if (s.layout != UnSet) { + pr_err("layout may only be sent once. Second value was %s\n", optarg); + exit(2); + } + switch(s.level) { + default: + pr_err("layout not meaningful for %s arrays.\n", + map_num(pers, s.level)); + exit(2); + case UnSet: + pr_err("raid level must be given before layout.\n"); + exit(2); + + case 5: + s.layout = map_name(r5layout, optarg); + if (s.layout==UnSet) { + pr_err("layout %s not understood for raid5.\n", + optarg); + exit(2); + } + break; + case 6: + s.layout = map_name(r6layout, optarg); + if (s.layout==UnSet) { + pr_err("layout %s not understood for raid6.\n", + optarg); + exit(2); + } + break; + + case 10: + s.layout = parse_layout_10(optarg); + if (s.layout < 0) { + pr_err("layout for raid10 must be 'nNN', 'oNN' or 'fNN' where NN is a number, not %s\n", optarg); + exit(2); + } + break; + case LEVEL_FAULTY: + /* Faulty + * modeNNN + */ + s.layout = parse_layout_faulty(optarg); + if (s.layout == -1) { + pr_err("layout %s not understood for faulty.\n", + optarg); + exit(2); + } + break; + } + continue; + + case O(CREATE,AssumeClean): + case O(BUILD,AssumeClean): /* assume clean */ + case O(GROW,AssumeClean): + s.assume_clean = 1; + continue; + + case O(GROW,'n'): + case O(CREATE,'n'): + case O(BUILD,'n'): /* number of raid disks */ + if (s.raiddisks) { + pr_err("raid-devices set twice: %d and %s\n", + s.raiddisks, optarg); + exit(2); + } + s.raiddisks = parse_num(optarg); + if (s.raiddisks <= 0) { + pr_err("invalid number of raid devices: %s\n", + optarg); + exit(2); + } + ident.raid_disks = s.raiddisks; + continue; + + case O(CREATE,'x'): /* number of spare (eXtra) disks */ + if (s.sparedisks) { + pr_err("spare-devices set twice: %d and %s\n", + s.sparedisks, optarg); + exit(2); + } + if (s.level != UnSet && s.level <= 0 && s.level >= -1) { + pr_err("spare-devices setting is incompatible with raid level %d\n", + s.level); + exit(2); + } + s.sparedisks = parse_num(optarg); + if (s.sparedisks < 0) { + pr_err("invalid number of spare-devices: %s\n", + optarg); + exit(2); + } + continue; + + case O(CREATE,'a'): + case O(CREATE,Auto): + case O(BUILD,'a'): + case O(BUILD,Auto): + case O(INCREMENTAL,'a'): + case O(INCREMENTAL,Auto): + case O(ASSEMBLE,'a'): + case O(ASSEMBLE,Auto): /* auto-creation of device node */ + c.autof = parse_auto(optarg, "--auto flag", 0); + continue; + + case O(CREATE,Symlinks): + case O(BUILD,Symlinks): + case O(ASSEMBLE,Symlinks): /* auto creation of symlinks in /dev to /dev/md */ + symlinks = optarg; + continue; + + case O(BUILD,'f'): /* force honouring '-n 1' */ + case O(BUILD,Force): /* force honouring '-n 1' */ + case O(GROW,'f'): /* ditto */ + case O(GROW,Force): /* ditto */ + case O(CREATE,'f'): /* force honouring of device list */ + case O(CREATE,Force): /* force honouring of device list */ + case O(ASSEMBLE,'f'): /* force assembly */ + case O(ASSEMBLE,Force): /* force assembly */ + case O(MISC,'f'): /* force zero */ + case O(MISC,Force): /* force zero */ + case O(MANAGE,Force): /* add device which is too large */ + c.force=1; + continue; + /* now for the Assemble options */ + case O(ASSEMBLE, FreezeReshape): /* Freeze reshape during + * initrd phase */ + case O(INCREMENTAL, FreezeReshape): + c.freeze_reshape = 1; + continue; + case O(CREATE,'u'): /* uuid of array */ + case O(ASSEMBLE,'u'): /* uuid of array */ + if (ident.uuid_set) { + pr_err("uuid cannot be set twice. Second value %s.\n", optarg); + exit(2); + } + if (parse_uuid(optarg, ident.uuid)) + ident.uuid_set = 1; + else { + pr_err("Bad uuid: %s\n", optarg); + exit(2); + } + continue; + + case O(CREATE,'N'): + case O(ASSEMBLE,'N'): + case O(MISC,'N'): + if (ident.name[0]) { + pr_err("name cannot be set twice. Second value %s.\n", optarg); + exit(2); + } + if (mode == MISC && !c.subarray) { + pr_err("-N/--name only valid with --update-subarray in misc mode\n"); + exit(2); + } + if (strlen(optarg) > 32) { + pr_err("name '%s' is too long, 32 chars max.\n", + optarg); + exit(2); + } + strcpy(ident.name, optarg); + continue; + + case O(ASSEMBLE,'m'): /* super-minor for array */ + case O(ASSEMBLE,SuperMinor): + if (ident.super_minor != UnSet) { + pr_err("super-minor cannot be set twice. Second value: %s.\n", optarg); + exit(2); + } + if (strcmp(optarg, "dev")==0) + ident.super_minor = -2; + else { + ident.super_minor = parse_num(optarg); + if (ident.super_minor < 0) { + pr_err("Bad super-minor number: %s.\n", optarg); + exit(2); + } + } + continue; + + case O(ASSEMBLE,'o'): + case O(MANAGE,'o'): + case O(CREATE,'o'): + c.readonly = 1; + continue; + + case O(ASSEMBLE,'U'): /* update the superblock */ + case O(MISC,'U'): + if (c.update) { + pr_err("Can only update one aspect of superblock, both %s and %s given.\n", + c.update, optarg); + exit(2); + } + if (mode == MISC && !c.subarray) { + pr_err("Only subarrays can be updated in misc mode\n"); + exit(2); + } + c.update = optarg; + if (strcmp(c.update, "sparc2.2")==0) + continue; + if (strcmp(c.update, "super-minor") == 0) + continue; + if (strcmp(c.update, "summaries")==0) + continue; + if (strcmp(c.update, "resync")==0) + continue; + if (strcmp(c.update, "uuid")==0) + continue; + if (strcmp(c.update, "name")==0) + continue; + if (strcmp(c.update, "homehost")==0) + continue; + if (strcmp(c.update, "devicesize")==0) + continue; + if (strcmp(c.update, "no-bitmap")==0) + continue; + if (strcmp(c.update, "bbl") == 0) + continue; + if (strcmp(c.update, "no-bbl") == 0) + continue; + if (strcmp(c.update, "metadata") == 0) + continue; + if (strcmp(c.update, "revert-reshape") == 0) + continue; + if (strcmp(c.update, "byteorder")==0) { + if (ss) { + pr_err("must not set metadata type with --update=byteorder.\n"); + exit(2); + } + for(i=0; !ss && superlist[i]; i++) + ss = superlist[i]->match_metadata_desc( + "0.swap"); + if (!ss) { + pr_err("INTERNAL ERROR cannot find 0.swap\n"); + exit(2); + } + + continue; + } + if (strcmp(c.update,"?") == 0 || + strcmp(c.update, "help") == 0) { + outf = stdout; + fprintf(outf, "%s: ", Name); + } else { + outf = stderr; + fprintf(outf, + "%s: '--update=%s' is invalid. ", + Name, c.update); + } + fprintf(outf, "Valid --update options are:\n" + " 'sparc2.2', 'super-minor', 'uuid', 'name', 'resync',\n" + " 'summaries', 'homehost', 'byteorder', 'devicesize',\n" + " 'no-bitmap', 'metadata', 'revert-reshape'\n" + " 'bbl', 'no-bbl'\n" + ); + exit(outf == stdout ? 0 : 2); + + case O(MANAGE,'U'): + /* update=devicesize is allowed with --re-add */ + if (devmode != 'A') { + pr_err("--update in Manage mode only allowed with --re-add.\n"); + exit(1); + } + if (c.update) { + pr_err("Can only update one aspect of superblock, both %s and %s given.\n", + c.update, optarg); + exit(2); + } + c.update = optarg; + if (strcmp(c.update, "devicesize") != 0 && + strcmp(c.update, "bbl") != 0 && + strcmp(c.update, "no-bbl") != 0) { + pr_err("only 'devicesize', 'bbl' and 'no-bbl' can be updated with --re-add\n"); + exit(2); + } + continue; + + case O(INCREMENTAL,NoDegraded): + pr_err("--no-degraded is deprecated in Incremental mode\n"); + case O(ASSEMBLE,NoDegraded): /* --no-degraded */ + c.runstop = -1; /* --stop isn't allowed for --assemble, + * so we overload slightly */ + continue; + + case O(ASSEMBLE,'c'): + case O(ASSEMBLE,ConfigFile): + case O(INCREMENTAL, 'c'): + case O(INCREMENTAL, ConfigFile): + case O(MISC, 'c'): + case O(MISC, ConfigFile): + case O(MONITOR,'c'): + case O(MONITOR,ConfigFile): + case O(CREATE,ConfigFile): + if (configfile) { + pr_err("configfile cannot be set twice. Second value is %s.\n", optarg); + exit(2); + } + configfile = optarg; + set_conffile(configfile); + /* FIXME possibly check that config file exists. Even parse it */ + continue; + case O(ASSEMBLE,'s'): /* scan */ + case O(MISC,'s'): + case O(MONITOR,'s'): + case O(INCREMENTAL,'s'): + c.scan = 1; + continue; + + case O(MONITOR,'m'): /* mail address */ + case O(MONITOR,EMail): + if (mailaddr) + pr_err("only specify one mailaddress. %s ignored.\n", + optarg); + else + mailaddr = optarg; + continue; + + case O(MONITOR,'p'): /* alert program */ + case O(MONITOR,ProgramOpt): /* alert program */ + if (program) + pr_err("only specify one alter program. %s ignored.\n", + optarg); + else + program = optarg; + continue; + + case O(MONITOR,'r'): /* rebuild increments */ + case O(MONITOR,Increment): + increments = atoi(optarg); + if (increments > 99 || increments < 1) { + pr_err("please specify positive integer between 1 and 99 as rebuild increments.\n"); + exit(2); + } + continue; + + case O(MONITOR,'d'): /* delay in seconds */ + case O(GROW, 'd'): + case O(BUILD,'d'): /* delay for bitmap updates */ + case O(CREATE,'d'): + if (c.delay) + pr_err("only specify delay once. %s ignored.\n", + optarg); + else { + c.delay = parse_num(optarg); + if (c.delay < 1) { + pr_err("invalid delay: %s\n", + optarg); + exit(2); + } + } + continue; + case O(MONITOR,'f'): /* daemonise */ + case O(MONITOR,Fork): + daemonise = 1; + continue; + case O(MONITOR,'i'): /* pid */ + if (pidfile) + pr_err("only specify one pid file. %s ignored.\n", + optarg); + else + pidfile = optarg; + continue; + case O(MONITOR,'1'): /* oneshot */ + oneshot = 1; + spare_sharing = 0; + continue; + case O(MONITOR,'t'): /* test */ + c.test = 1; + continue; + case O(MONITOR,'y'): /* log messages to syslog */ + openlog("mdadm", LOG_PID, SYSLOG_FACILITY); + dosyslog = 1; + continue; + case O(MONITOR, NoSharing): + spare_sharing = 0; + continue; + + /* now the general management options. Some are applicable + * to other modes. None have arguments. + */ + case O(GROW,'a'): + case O(GROW,Add): + case O(MANAGE,'a'): + case O(MANAGE,Add): /* add a drive */ + devmode = 'a'; + continue; + case O(MANAGE,AddSpare): /* add drive - never re-add */ + devmode = 'S'; + continue; + case O(MANAGE,ReAdd): + devmode = 'A'; + continue; + case O(MANAGE,'r'): /* remove a drive */ + case O(MANAGE,Remove): + devmode = 'r'; + continue; + case O(MANAGE,'f'): /* set faulty */ + case O(MANAGE,Fail): + case O(INCREMENTAL,'f'): + case O(INCREMENTAL,Remove): + case O(INCREMENTAL,Fail): /* r for incremental is taken, use f + * even though we will both fail and + * remove the device */ + devmode = 'f'; + continue; + case O(MANAGE,Replace): + /* Mark these devices for replacement */ + devmode = 'R'; + continue; + case O(MANAGE,With): + /* These are the replacements to use */ + if (devmode != 'R') { + pr_err("--with must follow --replace\n"); + exit(2); + } + devmode = 'W'; + continue; + case O(INCREMENTAL,'R'): + case O(MANAGE,'R'): + case O(ASSEMBLE,'R'): + case O(BUILD,'R'): + case O(CREATE,'R'): /* Run the array */ + if (c.runstop < 0) { + pr_err("Cannot both Stop and Run an array\n"); + exit(2); + } + c.runstop = 1; + continue; + case O(MANAGE,'S'): + if (c.runstop > 0) { + pr_err("Cannot both Run and Stop an array\n"); + exit(2); + } + c.runstop = -1; + continue; + case O(MANAGE,'t'): + c.test = 1; + continue; + + case O(MISC,'Q'): + case O(MISC,'D'): + case O(MISC,'E'): + case O(MISC,KillOpt): + case O(MISC,'R'): + case O(MISC,'S'): + case O(MISC,'X'): + case O(MISC, ExamineBB): + case O(MISC,'o'): + case O(MISC,'w'): + case O(MISC,'W'): + case O(MISC, WaitOpt): + case O(MISC, Waitclean): + case O(MISC, DetailPlatform): + case O(MISC, KillSubarray): + case O(MISC, UpdateSubarray): + case O(MISC, Dump): + case O(MISC, Restore): + case O(MISC ,Action): + if (opt == KillSubarray || opt == UpdateSubarray) { + if (c.subarray) { + pr_err("subarray can only be specified once\n"); + exit(2); + } + c.subarray = optarg; + } + if (opt == Action) { + if (c.action) { + pr_err("Only one --action can be specified\n"); + exit(2); + } + if (strcmp(optarg, "idle") == 0 || + strcmp(optarg, "frozen") == 0 || + strcmp(optarg, "check") == 0 || + strcmp(optarg, "repair") == 0) + c.action = optarg; + else { + pr_err("action must be one of idle, frozen, check, repair\n"); + exit(2); + } + } + if (devmode && devmode != opt && + (devmode == 'E' || (opt == 'E' && devmode != 'Q'))) { + pr_err("--examine/-E cannot be given with "); + if (devmode == 'E') { + if (option_index >= 0) + fprintf(stderr, "--%s\n", + long_options[option_index].name); + else + fprintf(stderr, "-%c\n", opt); + } else if (isalpha(devmode)) + fprintf(stderr, "-%c\n", devmode); + else + fprintf(stderr, "previous option\n"); + exit(2); + } + devmode = opt; + if (opt == Dump || opt == Restore) { + if (dump_directory != NULL) { + pr_err("dump/restore directory specified twice: %s and %s\n", + dump_directory, optarg); + exit(2); + } + dump_directory = optarg; + } + continue; + case O(MISC, UdevRules): + if (devmode && devmode != opt) { + pr_err("--udev-rules must be the only option.\n"); + } else { + if (udev_filename) + pr_err("only specify one udev rule filename. %s ignored.\n", + optarg); + else + udev_filename = optarg; + } + devmode = opt; + continue; + case O(MISC,'t'): + c.test = 1; + continue; + + case O(MISC, Sparc22): + if (devmode != 'E') { + pr_err("--sparc2.2 only allowed with --examine\n"); + exit(2); + } + c.SparcAdjust = 1; + continue; + + case O(ASSEMBLE,'b'): /* here we simply set the bitmap file */ + case O(ASSEMBLE,Bitmap): + if (!optarg) { + pr_err("bitmap file needed with -b in --assemble mode\n"); + exit(2); + } + if (strcmp(optarg, "internal")==0) { + pr_err("there is no need to specify --bitmap when assembling arrays with internal bitmaps\n"); + continue; + } + bitmap_fd = open(optarg, O_RDWR); + if (!*optarg || bitmap_fd < 0) { + pr_err("cannot open bitmap file %s: %s\n", optarg, strerror(errno)); + exit(2); + } + ident.bitmap_fd = bitmap_fd; /* for Assemble */ + continue; + + case O(ASSEMBLE, BackupFile): + case O(GROW, BackupFile): + /* Specify a file into which grow might place a backup, + * or from which assemble might recover a backup + */ + if (c.backup_file) { + pr_err("backup file already specified, rejecting %s\n", optarg); + exit(2); + } + c.backup_file = optarg; + continue; + + case O(GROW, Continue): + /* Continue interrupted grow + */ + grow_continue = 1; + continue; + case O(ASSEMBLE, InvalidBackup): + /* Acknowledge that the backupfile is invalid, but ask + * to continue anyway + */ + c.invalid_backup = 1; + continue; + + case O(BUILD,'b'): + case O(BUILD,Bitmap): + case O(CREATE,'b'): + case O(CREATE,Bitmap): /* here we create the bitmap */ + case O(GROW,'b'): + case O(GROW,Bitmap): + if (strcmp(optarg, "internal")== 0 || + strcmp(optarg, "none")== 0 || + strchr(optarg, '/') != NULL) { + s.bitmap_file = optarg; + continue; + } + /* probable typo */ + pr_err("bitmap file must contain a '/', or be 'internal', or 'none'\n" + " not '%s'\n", optarg); + exit(2); + + case O(GROW,BitmapChunk): + case O(BUILD,BitmapChunk): + case O(CREATE,BitmapChunk): /* bitmap chunksize */ + s.bitmap_chunk = parse_size(optarg); + if (s.bitmap_chunk == 0 || + s.bitmap_chunk == INVALID_SECTORS || + s.bitmap_chunk & (s.bitmap_chunk - 1)) { + pr_err("invalid bitmap chunksize: %s\n", + optarg); + exit(2); + } + s.bitmap_chunk = s.bitmap_chunk * 512; + continue; + + case O(GROW, WriteBehind): + case O(BUILD, WriteBehind): + case O(CREATE, WriteBehind): /* write-behind mode */ + s.write_behind = DEFAULT_MAX_WRITE_BEHIND; + if (optarg) { + s.write_behind = parse_num(optarg); + if (s.write_behind < 0 || + s.write_behind > 16383) { + pr_err("Invalid value for maximum outstanding write-behind writes: %s.\n\tMust be between 0 and 16383.\n", optarg); + exit(2); + } + } + continue; + + case O(INCREMENTAL, 'r'): + case O(INCREMENTAL, RebuildMapOpt): + rebuild_map = 1; + continue; + case O(INCREMENTAL, IncrementalPath): + remove_path = optarg; + continue; + } + /* We have now processed all the valid options. Anything else is + * an error + */ + if (option_index > 0) + pr_err(":option --%s not valid in %s mode\n", + long_options[option_index].name, + map_num(modes, mode)); + else + pr_err("option -%c not valid in %s mode\n", + opt, map_num(modes, mode)); + exit(2); + + } + + if (print_help) { + char *help_text; + if (print_help == 2) + help_text = OptionHelp; + else + help_text = mode_help[mode]; + if (help_text == NULL) + help_text = Help; + fputs(help_text,stdout); + exit(0); + } + + if (!mode && devs_found) { + mode = MISC; + devmode = 'Q'; + if (devlist->disposition == 0) + devlist->disposition = devmode; + } + if (!mode) { + fputs(Usage, stderr); + exit(2); + } + + if (symlinks) { + struct createinfo *ci = conf_get_create_info(); + + if (strcasecmp(symlinks, "yes") == 0) + ci->symlinks = 1; + else if (strcasecmp(symlinks, "no") == 0) + ci->symlinks = 0; + else { + pr_err("option --symlinks must be 'no' or 'yes'\n"); + exit(2); + } + } + /* Ok, got the option parsing out of the way + * hopefully it's mostly right but there might be some stuff + * missing + * + * That is mosty checked in the per-mode stuff but... + * + * For @,B,C and A without -s, the first device listed must be + * an md device. We check that here and open it. + */ + + if (mode == MANAGE || mode == BUILD || mode == CREATE + || mode == GROW + || (mode == ASSEMBLE && ! c.scan)) { + if (devs_found < 1) { + pr_err("an md device must be given in this mode\n"); + exit(2); + } + if ((int)ident.super_minor == -2 && c.autof) { + pr_err("--super-minor=dev is incompatible with --auto\n"); + exit(2); + } + if (mode == MANAGE || mode == GROW) { + mdfd = open_mddev(devlist->devname, 1); + if (mdfd < 0) + exit(1); + } else + /* non-existent device is OK */ + mdfd = open_mddev(devlist->devname, 0); + if (mdfd == -2) { + pr_err("device %s exists but is not an md array.\n", devlist->devname); + exit(1); + } + if ((int)ident.super_minor == -2) { + struct stat stb; + if (mdfd < 0) { + pr_err("--super-minor=dev given, and listed device %s doesn't exist.\n", + devlist->devname); + exit(1); + } + fstat(mdfd, &stb); + ident.super_minor = minor(stb.st_rdev); + } + if (mdfd >= 0 && mode != MANAGE && mode != GROW) { + /* We don't really want this open yet, we just might + * have wanted to check some things + */ + close(mdfd); + mdfd = -1; + } + } + + if (s.raiddisks) { + if (s.raiddisks == 1 && !c.force && s.level != LEVEL_FAULTY) { + pr_err("'1' is an unusual number of drives for an array, so it is probably\n" + " a mistake. If you really mean it you will need to specify --force before\n" + " setting the number of drives.\n"); + exit(2); + } + } + + if (c.homehost == NULL && c.require_homehost) + c.homehost = conf_get_homehost(&c.require_homehost); + if (c.homehost == NULL || strcasecmp(c.homehost, "<system>")==0) { + if (gethostname(sys_hostname, sizeof(sys_hostname)) == 0) { + sys_hostname[sizeof(sys_hostname)-1] = 0; + c.homehost = sys_hostname; + } + } + if (c.homehost && (!c.homehost[0] || strcasecmp(c.homehost, "<none>") == 0)) { + c.homehost = NULL; + c.require_homehost = 0; + } + + if (c.backup_file && data_offset != INVALID_SECTORS) { + pr_err("--backup-file and --data-offset are incompatible\n"); + exit(2); + } + + if ((mode == MISC && devmode == 'E') + || (mode == MONITOR && spare_sharing == 0)) + /* Anyone may try this */; + else if (geteuid() != 0) { + pr_err("must be super-user to perform this action\n"); + exit(1); + } + + ident.autof = c.autof; + + if (c.scan && c.verbose < 2) + /* --scan implied --brief unless -vv */ + c.brief = 1; + + rv = 0; + switch(mode) { + case MANAGE: + /* readonly, add/remove, readwrite, runstop */ + if (c.readonly > 0) + rv = Manage_ro(devlist->devname, mdfd, c.readonly); + if (!rv && devs_found>1) + rv = Manage_subdevs(devlist->devname, mdfd, + devlist->next, c.verbose, c.test, + c.update, c.force); + if (!rv && c.readonly < 0) + rv = Manage_ro(devlist->devname, mdfd, c.readonly); + if (!rv && c.runstop > 0) + rv = Manage_run(devlist->devname, mdfd, &c); + if (!rv && c.runstop < 0) + rv = Manage_stop(devlist->devname, mdfd, c.verbose, 0); + break; + case ASSEMBLE: + if (devs_found == 1 && ident.uuid_set == 0 && + ident.super_minor == UnSet && ident.name[0] == 0 && !c.scan ) { + /* Only a device has been given, so get details from config file */ + struct mddev_ident *array_ident = conf_get_ident(devlist->devname); + if (array_ident == NULL) { + pr_err("%s not identified in config file.\n", + devlist->devname); + rv |= 1; + if (mdfd >= 0) + close(mdfd); + } else { + if (array_ident->autof == 0) + array_ident->autof = c.autof; + rv |= Assemble(ss, devlist->devname, array_ident, + NULL, &c); + } + } else if (!c.scan) + rv = Assemble(ss, devlist->devname, &ident, + devlist->next, &c); + else if (devs_found > 0) { + if (c.update && devs_found > 1) { + pr_err("can only update a single array at a time\n"); + exit(1); + } + if (c.backup_file && devs_found > 1) { + pr_err("can only assemble a single array when providing a backup file.\n"); + exit(1); + } + for (dv = devlist ; dv ; dv=dv->next) { + struct mddev_ident *array_ident = conf_get_ident(dv->devname); + if (array_ident == NULL) { + pr_err("%s not identified in config file.\n", + dv->devname); + rv |= 1; + continue; + } + if (array_ident->autof == 0) + array_ident->autof = c.autof; + rv |= Assemble(ss, dv->devname, array_ident, + NULL, &c); + } + } else { + if (c.update) { + pr_err("--update not meaningful with a --scan assembly.\n"); + exit(1); + } + if (c.backup_file) { + pr_err("--backup_file not meaningful with a --scan assembly.\n"); + exit(1); + } + rv = scan_assemble(ss, &c, &ident); + } + + break; + case BUILD: + if (c.delay == 0) + c.delay = DEFAULT_BITMAP_DELAY; + if (s.write_behind && !s.bitmap_file) { + pr_err("write-behind mode requires a bitmap.\n"); + rv = 1; + break; + } + if (s.raiddisks == 0) { + pr_err("no raid-devices specified.\n"); + rv = 1; + break; + } + + if (s.bitmap_file) { + if (strcmp(s.bitmap_file, "internal")==0) { + pr_err("'internal' bitmaps not supported with --build\n"); + rv |= 1; + break; + } + } + rv = Build(devlist->devname, devlist->next, &s, &c); + break; + case CREATE: + if (c.delay == 0) + c.delay = DEFAULT_BITMAP_DELAY; + if (s.write_behind && !s.bitmap_file) { + pr_err("write-behind mode requires a bitmap.\n"); + rv = 1; + break; + } + if (s.raiddisks == 0) { + pr_err("no raid-devices specified.\n"); + rv = 1; + break; + } + + rv = Create(ss, devlist->devname, + ident.name, ident.uuid_set ? ident.uuid : NULL, + devs_found-1, devlist->next, + &s, &c, data_offset); + break; + case MISC: + if (devmode == 'E') { + if (devlist == NULL && !c.scan) { + pr_err("No devices to examine\n"); + exit(2); + } + if (devlist == NULL) + devlist = conf_get_devs(); + if (devlist == NULL) { + pr_err("No devices listed in %s\n", configfile?configfile:DefaultConfFile); + exit(1); + } + rv = Examine(devlist, &c, ss); + } else if (devmode == DetailPlatform) { + rv = Detail_Platform(ss ? ss->ss : NULL, ss ? c.scan : 1, + c.verbose, c.export, + devlist ? devlist->devname : NULL); + } else if (devlist == NULL) { + if (devmode == 'S' && c.scan) + rv = stop_scan(c.verbose); + else if ((devmode == 'D' || devmode == Waitclean) && c.scan) + rv = misc_scan(devmode, &c); + else if (devmode == UdevRules) + rv = Write_rules(udev_filename); + else { + pr_err("No devices given.\n"); + exit(2); + } + } else + rv = misc_list(devlist, &ident, dump_directory, ss, &c); + break; + case MONITOR: + if (!devlist && !c.scan) { + pr_err("Cannot monitor: need --scan or at least one device\n"); + rv = 1; + break; + } + if (pidfile && !daemonise) { + pr_err("Cannot write a pid file when not in daemon mode\n"); + rv = 1; + break; + } + if (c.delay == 0) { + if (get_linux_version() > 2006016) + /* mdstat responds to poll */ + c.delay = 1000; + else + c.delay = 60; + } + if (c.delay == 0) + c.delay = 60; + rv= Monitor(devlist, mailaddr, program, + &c, daemonise, oneshot, + dosyslog, pidfile, increments, + spare_sharing); + break; + + case GROW: + if (array_size > 0) { + /* alway impose array size first, independent of + * anything else + * Do not allow level or raid_disks changes at the + * same time as that can be irreversibly destructive. + */ + struct mdinfo sra; + int err; + if (s.raiddisks || s.level != UnSet) { + pr_err("cannot change array size in same operation as changing raiddisks or level.\n" + " Change size first, then check that data is still intact.\n"); + rv = 1; + break; + } + sysfs_init(&sra, mdfd, NULL); + if (array_size == MAX_SIZE) + err = sysfs_set_str(&sra, NULL, "array_size", "default"); + else + err = sysfs_set_num(&sra, NULL, "array_size", array_size / 2); + if (err < 0) { + if (errno == E2BIG) + pr_err("--array-size setting is too large.\n"); + else + pr_err("current kernel does not support setting --array-size\n"); + rv = 1; + break; + } + } + if (devs_found > 1 && s.raiddisks == 0 && s.level == UnSet) { + /* must be '-a'. */ + if (s.size > 0 || s.chunk || s.layout_str != NULL || s.bitmap_file) { + pr_err("--add cannot be used with other geometry changes in --grow mode\n"); + rv = 1; + break; + } + for (dv=devlist->next; dv ; dv=dv->next) { + rv = Grow_Add_device(devlist->devname, mdfd, + dv->devname); + if (rv) + break; + } + } else if (s.bitmap_file) { + if (s.size > 0 || s.raiddisks || s.chunk || + s.layout_str != NULL || devs_found > 1) { + pr_err("--bitmap changes cannot be used with other geometry changes in --grow mode\n"); + rv = 1; + break; + } + if (c.delay == 0) + c.delay = DEFAULT_BITMAP_DELAY; + rv = Grow_addbitmap(devlist->devname, mdfd, &c, &s); + } else if (grow_continue) + rv = Grow_continue_command(devlist->devname, + mdfd, c.backup_file, + c.verbose); + else if (s.size > 0 || s.raiddisks || s.layout_str != NULL + || s.chunk != 0 || s.level != UnSet + || data_offset != INVALID_SECTORS) { + rv = Grow_reshape(devlist->devname, mdfd, + devlist->next, + data_offset, &c, &s); + } else if (array_size == 0) + pr_err("no changes to --grow\n"); + break; + case INCREMENTAL: + if (rebuild_map) { + RebuildMap(); + } + if (c.scan) { + rv = 1; + if (devlist) { + pr_err("In --incremental mode, a device cannot be given with --scan.\n"); + break; + } + if (c.runstop <= 0) { + pr_err("--incremental --scan meaningless without --run.\n"); + break; + } + if (devmode == 'f') { + pr_err("--incremental --scan --fail not supported.\n"); + break; + } + rv = IncrementalScan(&c, NULL); + } + if (!devlist) { + if (!rebuild_map && !c.scan) { + pr_err("--incremental requires a device.\n"); + rv = 1; + } + break; + } + if (devmode == 'f') { + if (devlist->next) { + pr_err("'--incremental --fail' can only handle one device.\n"); + rv = 1; + break; + } + rv = IncrementalRemove(devlist->devname, remove_path, + c.verbose); + } else + rv = Incremental(devlist, &c, ss); + break; + case AUTODETECT: + autodetect(); + break; + } + exit(rv); +} + +static int scan_assemble(struct supertype *ss, + struct context *c, + struct mddev_ident *ident) +{ + struct mddev_ident *a, *array_list = conf_get_ident(NULL); + struct mddev_dev *devlist = conf_get_devs(); + struct map_ent *map = NULL; + int cnt = 0; + int rv = 0; + int failures, successes; + + if (conf_verify_devnames(array_list)) { + pr_err("Duplicate MD device names in conf file were found.\n"); + return 1; + } + if (devlist == NULL) { + pr_err("No devices listed in conf file were found.\n"); + return 1; + } + for (a = array_list; a ; a = a->next) { + a->assembled = 0; + if (a->autof == 0) + a->autof = c->autof; + } + if (map_lock(&map)) + pr_err("failed to get exclusive lock on mapfile\n"); + do { + failures = 0; + successes = 0; + rv = 0; + for (a = array_list; a ; a = a->next) { + int r; + if (a->assembled) + continue; + if (a->devname && + strcasecmp(a->devname, "<ignore>") == 0) + continue; + + r = Assemble(ss, a->devname, + a, NULL, c); + if (r == 0) { + a->assembled = 1; + successes++; + } else + failures++; + rv |= r; + cnt++; + } + } while (failures && successes); + if (c->homehost && cnt == 0) { + /* Maybe we can auto-assemble something. + * Repeatedly call Assemble in auto-assemble mode + * until it fails + */ + int rv2; + int acnt; + ident->autof = c->autof; + do { + struct mddev_dev *devlist = conf_get_devs(); + acnt = 0; + do { + rv2 = Assemble(ss, NULL, + ident, + devlist, c); + if (rv2==0) { + cnt++; + acnt++; + } + } while (rv2!=2); + /* Incase there are stacked devices, we need to go around again */ + } while (acnt); + if (cnt == 0 && rv == 0) { + pr_err("No arrays found in config file or automatically\n"); + rv = 1; + } else if (cnt) + rv = 0; + } else if (cnt == 0 && rv == 0) { + pr_err("No arrays found in config file\n"); + rv = 1; + } + map_unlock(&map); + return rv; +} + +static int misc_scan(char devmode, struct context *c) +{ + /* apply --detail or --wait-clean to + * all devices in /proc/mdstat + */ + struct mdstat_ent *ms = mdstat_read(0, 1); + struct mdstat_ent *e; + struct map_ent *map = NULL; + int members; + int rv = 0; + + for (members = 0; members <= 1; members++) { + for (e=ms ; e ; e=e->next) { + char *name = NULL; + struct map_ent *me; + struct stat stb; + int member = e->metadata_version && + strncmp(e->metadata_version, + "external:/", 10) == 0; + if (members != member) + continue; + me = map_by_devnm(&map, e->devnm); + if (me && me->path + && strcmp(me->path, "/unknown") != 0) + name = me->path; + if (name == NULL || + stat(name, &stb) != 0) + name = get_md_name(e->devnm); + + if (!name) { + pr_err("cannot find device file for %s\n", + e->devnm); + continue; + } + if (devmode == 'D') + rv |= Detail(name, c); + else + rv |= WaitClean(name, -1, c->verbose); + put_md_name(name); + } + } + free_mdstat(ms); + return rv; +} + +static int stop_scan(int verbose) +{ + /* apply --stop to all devices in /proc/mdstat */ + /* Due to possible stacking of devices, repeat until + * nothing more can be stopped + */ + int progress=1, err; + int last = 0; + int rv = 0; + do { + struct mdstat_ent *ms = mdstat_read(0, 0); + struct mdstat_ent *e; + + if (!progress) last = 1; + progress = 0; err = 0; + for (e=ms ; e ; e=e->next) { + char *name = get_md_name(e->devnm); + int mdfd; + + if (!name) { + pr_err("cannot find device file for %s\n", + e->devnm); + continue; + } + mdfd = open_mddev(name, 1); + if (mdfd >= 0) { + if (Manage_stop(name, mdfd, verbose, !last)) + err = 1; + else + progress = 1; + close(mdfd); + } + + put_md_name(name); + } + free_mdstat(ms); + } while (!last && err); + if (err) + rv |= 1; + return rv; +} + +static int misc_list(struct mddev_dev *devlist, + struct mddev_ident *ident, + char *dump_directory, + struct supertype *ss, struct context *c) +{ + struct mddev_dev *dv; + int rv = 0; + + for (dv=devlist ; dv; dv=(rv & 16) ? NULL : dv->next) { + int mdfd; + + switch(dv->disposition) { + case 'D': + rv |= Detail(dv->devname, c); + continue; + case KillOpt: /* Zero superblock */ + if (ss) + rv |= Kill(dv->devname, ss, c->force, c->verbose,0); + else { + int v = c->verbose; + do { + rv |= Kill(dv->devname, NULL, c->force, v, 0); + v = -1; + } while (rv == 0); + rv &= ~2; + } + continue; + case 'Q': + rv |= Query(dv->devname); continue; + case 'X': + rv |= ExamineBitmap(dv->devname, c->brief, ss); continue; + case ExamineBB: + rv |= ExamineBadblocks(dv->devname, c->brief, ss); continue; + case 'W': + case WaitOpt: + rv |= Wait(dv->devname); continue; + case Waitclean: + rv |= WaitClean(dv->devname, -1, c->verbose); continue; + case KillSubarray: + rv |= Kill_subarray(dv->devname, c->subarray, c->verbose); + continue; + case UpdateSubarray: + if (c->update == NULL) { + pr_err("-U/--update must be specified with --update-subarray\n"); + rv |= 1; + continue; + } + rv |= Update_subarray(dv->devname, c->subarray, + c->update, ident, c->verbose); + continue; + case Dump: + rv |= Dump_metadata(dv->devname, dump_directory, c, ss); + continue; + case Restore: + rv |= Restore_metadata(dv->devname, dump_directory, c, ss, + (dv == devlist && dv->next == NULL)); + continue; + case Action: + rv |= SetAction(dv->devname, c->action); + continue; + } + if (dv->devname[0] == '/') + mdfd = open_mddev(dv->devname, 1); + else { + mdfd = open_dev(dv->devname); + if (mdfd < 0) + pr_err("Cannot open %s\n", dv->devname); + } + if (mdfd>=0) { + switch(dv->disposition) { + case 'R': + c->runstop = 1; + rv |= Manage_run(dv->devname, mdfd, c); break; + case 'S': + rv |= Manage_stop(dv->devname, mdfd, c->verbose, 0); break; + case 'o': + rv |= Manage_ro(dv->devname, mdfd, 1); break; + case 'w': + rv |= Manage_ro(dv->devname, mdfd, -1); break; + } + close(mdfd); + } else + rv |= 1; + } + return rv; +} + +int SetAction(char *dev, char *action) +{ + int fd = open(dev, O_RDONLY); + struct mdinfo mdi; + if (fd < 0) { + pr_err("Couldn't open %s: %s\n", dev, strerror(errno)); + return 1; + } + sysfs_init(&mdi, fd, NULL); + close(fd); + if (!mdi.sys_name[0]) { + pr_err("%s is no an md array\n", dev); + return 1; + } + + if (sysfs_set_str(&mdi, NULL, "sync_action", action) < 0) { + pr_err("Count not set action for %s to %s: %s\n", + dev, action, strerror(errno)); + return 1; + } + return 0; +} diff --git a/mdadm.conf-example b/mdadm.conf-example new file mode 100644 index 00000000..35a75d12 --- /dev/null +++ b/mdadm.conf-example @@ -0,0 +1,65 @@ +# mdadm configuration file +# +# mdadm will function properly without the use of a configuration file, +# but this file is useful for keeping track of arrays and member disks. +# In general, a mdadm.conf file is created, and updated, after arrays +# are created. This is the opposite behavior of /etc/raidtab which is +# created prior to array construction. +# +# +# the config file takes two types of lines: +# +# DEVICE lines specify a list of devices of where to look for +# potential member disks +# +# ARRAY lines specify information about how to identify arrays so +# so that they can be activated +# +# You can have more than one device line and use wild cards. The first +# example includes SCSI the first partition of SCSI disks /dev/sdb, +# /dev/sdc, /dev/sdd, /dev/sdj, /dev/sdk, and /dev/sdl. The second +# line looks for array slices on IDE disks. +# +#DEVICE /dev/sd[bcdjkl]1 +#DEVICE /dev/hda1 /dev/hdb1 +# +# If you mount devfs on /dev, then a suitable way to list all devices is: +#DEVICE /dev/discs/*/* +# +# +# The AUTO line can control which arrays get assembled by auto-assembly, +# meaing either "mdadm -As" when there are no 'ARRAY' lines in this file, +# or "mdadm --incremental" when the array found is not listed in this file. +# By default, all arrays that are found are assembled. +# If you want to ignore all DDF arrays (maybe they are managed by dmraid), +# and only assemble 1.x arrays if which are marked for 'this' homehost, +# but assemble all others, then use +#AUTO -ddf homehost -1.x +all +# +# ARRAY lines specify an array to assemble and a method of identification. +# Arrays can currently be identified by using a UUID, superblock minor number, +# or a listing of devices. +# +# super-minor is usually the minor number of the metadevice +# UUID is the Universally Unique Identifier for the array +# Each can be obtained using +# +# mdadm -D <md> +# +#ARRAY /dev/md0 UUID=3aaa0122:29827cfa:5331ad66:ca767371 +#ARRAY /dev/md1 super-minor=1 +#ARRAY /dev/md2 devices=/dev/hda1,/dev/hdb1 +# +# ARRAY lines can also specify a "spare-group" for each array. mdadm --monitor +# will then move a spare between arrays in a spare-group if one array has a failed +# drive but no spare +#ARRAY /dev/md4 uuid=b23f3c6d:aec43a9f:fd65db85:369432df spare-group=group1 +#ARRAY /dev/md5 uuid=19464854:03f71b1b:e0df2edd:246cc977 spare-group=group1 +# +# When used in --follow (aka --monitor) mode, mdadm needs a +# mail address and/or a program. This can be given with "mailaddr" +# and "program" lines to that monitoring can be started using +# mdadm --follow --scan & echo $! > /run/mdadm/mon.pid +# If the lines are not found, mdadm will exit quietly +#MAILADDR root@mydomain.tld +#PROGRAM /usr/sbin/handle-mdadm-events diff --git a/mdadm.conf.5 b/mdadm.conf.5 new file mode 100644 index 00000000..542e2635 --- /dev/null +++ b/mdadm.conf.5 @@ -0,0 +1,641 @@ +.\" Copyright Neil Brown and others. +.\" This program is free software; you can redistribute it and/or modify +.\" it under the terms of the GNU General Public License as published by +.\" the Free Software Foundation; either version 2 of the License, or +.\" (at your option) any later version. +.\" See file COPYING in distribution for details. +.TH MDADM.CONF 5 +.SH NAME +mdadm.conf \- configuration for management of Software RAID with mdadm +.SH SYNOPSIS +/etc/mdadm/mdadm.conf +.SH DESCRIPTION +.PP +.I mdadm +is a tool for creating, managing, and monitoring RAID devices using the +.B md +driver in Linux. +.PP +Some common tasks, such as assembling all arrays, can be simplified +by describing the devices and arrays in this configuration file. + +.SS SYNTAX +The file should be seen as a collection of words separated by white +space (space, tab, or newline). +Any word that beings with a hash sign (#) starts a comment and that +word together with the remainder of the line is ignored. + +Spaces can be included in a word using quotation characters. Either +single quotes +.RB ( ' ) +or double quotes (\fB"\fP) +may be used. All the characters from one quotation character to +next identical character are protected and will not be used to +separate words to start new quoted strings. To include a single quote +it must be between double quotes. To include a double quote it must +be between single quotes. + +Any line that starts with white space (space or tab) is treated as +though it were a continuation of the previous line. + +Empty lines are ignored, but otherwise each (non continuation) line +must start with a keyword as listed below. The keywords are case +insensitive and can be abbreviated to 3 characters. + +The keywords are: +.TP +.B DEVICE +A +.B device +line lists the devices (whole devices or partitions) that might contain +a component of an MD array. When looking for the components of an +array, +.I mdadm +will scan these devices (or any devices listed on the command line). + +The +.B device +line may contain a number of different devices (separated by spaces) +and each device name can contain wild cards as defined by +.BR glob (7). + +Also, there may be several device lines present in the file. + +Alternatively, a +.B device +line can contain either or both of the words +.B containers +and +.BR partitions . +The word +.B containers +will cause +.I mdadm +to look for assembled CONTAINER arrays and included them as a source +for assembling further arrays. + +The word +.I partitions +will cause +.I mdadm +to read +.I /proc/partitions +and include all devices and partitions found therein. +.I mdadm +does not use the names from +.I /proc/partitions +but only the major and minor device numbers. It scans +.I /dev +to find the name that matches the numbers. + +If no DEVICE line is present, then "DEVICE partitions containers" is assumed. + +For example: +.IP +DEVICE /dev/hda* /dev/hdc* +.br +DEV /dev/sd* +.br +DEVICE /dev/disk/by-path/pci* +.br +DEVICE partitions + +.TP +.B ARRAY +The ARRAY lines identify actual arrays. The second word on the line +may be the name of the device where the array is normally +assembled, such as +.B /dev/md1 +or +.BR /dev/md/backup . +If the name does not start with a slash +.RB (' / '), +it is treated as being in +.BR /dev/md/ . +Alternately the word +.B <ignore> +(complete with angle brackets) can be given in which case any array +which matches the rest of the line will never be automatically assembled. +If no device name is given, +.I mdadm +will use various heuristics to determine an appropriate name. + +Subsequent words identify the array, or identify the array as a member +of a group. If multiple identities are given, +then a component device must match ALL identities to be considered a +match. Each identity word has a tag, and equals sign, and some value. +The tags are: +.RS 4 +.TP +.B uuid= +The value should be a 128 bit uuid in hexadecimal, with punctuation +interspersed if desired. This must match the uuid stored in the +superblock. +.TP +.B name= +The value should be a simple textual name as was given to +.I mdadm +when the array was created. This must match the name stored in the +superblock on a device for that device to be included in the array. +Not all superblock formats support names. +.TP +.B super\-minor= +The value is an integer which indicates the minor number that was +stored in the superblock when the array was created. When an array is +created as /dev/mdX, then the minor number X is stored. +.TP +.B devices= +The value is a comma separated list of device names or device name +patterns. +Only devices with names which match one entry in the list will be used +to assemble the array. Note that the devices +listed there must also be listed on a DEVICE line. +.TP +.B level= +The value is a RAID level. This is not normally used to +identify an array, but is supported so that the output of + +.B "mdadm \-\-examine \-\-scan" + +can be use directly in the configuration file. +.TP +.B num\-devices= +The value is the number of devices in a complete active array. As with +.B level= +this is mainly for compatibility with the output of + +.BR "mdadm \-\-examine \-\-scan" . + +.TP +.B spares= +The value is a number of spare devices to expect the array to have. +The sole use of this keyword and value is as follows: +.B mdadm \-\-monitor +will report an array if it is found to have fewer than this number of +spares when +.B \-\-monitor +starts or when +.B \-\-oneshot +is used. + +.TP +.B spare\-group= +The value is a textual name for a group of arrays. All arrays with +the same +.B spare\-group +name are considered to be part of the same group. The significance of +a group of arrays is that +.I mdadm +will, when monitoring the arrays, move a spare drive from one array in +a group to another array in that group if the first array had a failed +or missing drive but no spare. + +.TP +.B auto= +This option is rarely needed with mdadm-3.0, particularly if use with +the Linux kernel v2.6.28 or later. +It tells +.I mdadm +whether to use partitionable array or non-partitionable arrays and, +in the absence of +.IR udev , +how many partition devices to create. From 2.6.28 all md array +devices are partitionable, hence this option is not needed. + +The value of this option can be "yes" or "md" to indicate that a +traditional, non-partitionable md array should be created, or "mdp", +"part" or "partition" to indicate that a partitionable md array (only +available in linux 2.6 and later) should be used. This later set can +also have a number appended to indicate how many partitions to create +device files for, e.g. +.BR auto=mdp5 . +The default is 4. + +.TP +.B bitmap= +The option specifies a file in which a write-intent bitmap should be +found. When assembling the array, +.I mdadm +will provide this file to the +.B md +driver as the bitmap file. This has the same function as the +.B \-\-bitmap\-file +option to +.BR \-\-assemble . + +.TP +.B metadata= +Specify the metadata format that the array has. This is mainly +recognised for comparability with the output of +.BR "mdadm \-Es" . + +.TP +.B container= +Specify that this array is a member array of some container. The +value given can be either a path name in /dev, or a UUID of the +container array. + +.TP +.B member= +Specify that this array is a member array of some container. Each +type of container has some way to enumerate member arrays, often a +simple sequence number. The value identifies which member of a +container the array is. It will usually accompany a "container=" word. +.RE + +.TP +.B MAILADDR +The +.B mailaddr +line gives an E-mail address that alerts should be +sent to when +.I mdadm +is running in +.B \-\-monitor +mode (and was given the +.B \-\-scan +option). There should only be one +.B MAILADDR +line and it should have only one address. Any subsequent addresses +are silently ignored. + +.TP +.B MAILFROM +The +.B mailfrom +line (which can only be abbreviated to at least 5 characters) gives an +address to appear in the "From" address for alert mails. This can be +useful if you want to explicitly set a domain, as the default from +address is "root" with no domain. All words on this line are +catenated with spaces to form the address. + +Note that this value cannot be set via the +.I mdadm +commandline. It is only settable via the config file. + +.TP +.B PROGRAM +The +.B program +line gives the name of a program to be run when +.B "mdadm \-\-monitor" +detects potentially interesting events on any of the arrays that it +is monitoring. This program gets run with two or three arguments, they +being the Event, the md device, and possibly the related component +device. + +There should only be one +.B program +line and it should be give only one program. + + +.TP +.B CREATE +The +.B create +line gives default values to be used when creating arrays, new members +of arrays, and device entries for arrays. +These include: + +.RS 4 +.TP +.B owner= +.TP +.B group= +These can give user/group ids or names to use instead of system +defaults (root/wheel or root/disk). +.TP +.B mode= +An octal file mode such as 0660 can be given to override the default +of 0600. +.TP +.B auto= +This corresponds to the +.B \-\-auto +flag to mdadm. Give +.BR yes , +.BR md , +.BR mdp , +.B part +\(em possibly followed by a number of partitions \(em to indicate how +missing device entries should be created. + +.TP +.B metadata= +The name of the metadata format to use if none is explicitly given. +This can be useful to impose a system-wide default of version-1 superblocks. + +.TP +.B symlinks=no +Normally when creating devices in +.B /dev/md/ +.I mdadm +will create a matching symlink from +.B /dev/ +with a name starting +.B md +or +.BR md_ . +Give +.B symlinks=no +to suppress this symlink creation. + +.TP +.B names=yes +Since Linux 2.6.29 it has been possible to create +.B md +devices with a name like +.B md_home +rather than just a number, like +.BR md3 . +.I mdadm +will use the numeric alternative by default as other tools that interact +with md arrays may expect only numbers. +If +.B names=yes +is given in +.I mdadm.conf +then +.I mdadm +will use a name when appropriate. +If +.B names=no +is given, then non-numeric +.I md +device names will not be used even if the default changes in a future +release of +.IR mdadm . + +.TP +.B bbl=no +By default, +.I mdadm +will reserve space for a bad block list (bbl) on all devices +included in or added to any array that supports them. Setting +.B bbl=no +will prevent this, so newly added devices will not have a bad +block log. +.RE + +.TP +.B HOMEHOST +The +.B homehost +line gives a default value for the +.B \-\-homehost= +option to mdadm. There should normally be only one other word on the line. +It should either be a host name, or one of the special words +.BR <system>, +.B <none> +and +.BR <ignore> . +If +.B <system> +is given, then the +.BR gethostname ( 2 ) +systemcall is used to get the host name. This is the default. + +If +.B <ignore> +is given, then a flag is set so that when arrays are being +auto-assembled the checking of the recorded +.I homehost +is disabled. +If +.B <ignore> +is given it is also possible to give an explicit name which will be +used when creating arrays. This is the only case when there can be +more that one other word on the +.B HOMEHOST +line. If there are other words, or other +.B HOMEHOST +lines, they are silently ignored. + +If +.B <none> +is given, then the default of using +.BR gethostname ( 2 ) +is over-ridden and no homehost name is assumed. + +When arrays are created, this host name will be stored in the +metadata. When arrays are assembled using auto-assembly, arrays which +do not record the correct homehost name in their metadata will be +assembled using a "foreign" name. A "foreign" name alway ends with a +digit string preceded by an underscore to differentiate it +from any possible local name. e.g. +.B /dev/md/1_1 +or +.BR /dev/md/home_0 . +.TP +.B AUTO +A list of names of metadata format can be given, each preceded by a +plus or minus sign. Also the word +.I homehost +is allowed as is +.I all +preceded by plus or minus sign. +.I all +is usually last. + +When +.I mdadm +is auto-assembling an array, either via +.I \-\-assemble +or +.I \-\-incremental +and it finds metadata of a given type, it checks that metadata type +against those listed in this line. The first match wins, where +.I all +matches anything. +If a match is found that was preceded by a plus sign, the auto +assembly is allowed. If the match was preceded by a minus sign, the +auto assembly is disallowed. If no match is found, the auto assembly +is allowed. + +If the metadata indicates that the array was created for +.I this +host, and the word +.I homehost +appears before any other match, then the array is treated as a valid +candidate for auto-assembly. + +This can be used to disable all auto-assembly (so that only arrays +explicitly listed in mdadm.conf or on the command line are assembled), +or to disable assembly of certain metadata types which might be +handled by other software. It can also be used to disable assembly of +all foreign arrays - normally such arrays are assembled but given a +non-deterministic name in +.BR /dev/md/ . + +The known metadata types are +.BR 0.90 , +.BR 1.x , +.BR ddf , +.BR imsm . + +.B AUTO +should be given at most once. Subsequent lines are silently ignored. +Thus an earlier config file in a config directory will over-ride +the setting in a later config file. + +.TP +.B POLICY +This is used to specify what automatic behavior is allowed on devices +newly appearing in the system and provides a way of marking spares that can +be moved to other arrays as well as the migration domains. +.I Domain +can be defined through +.I policy +line by specifying a domain name for a number of paths from +.BR /dev/disk/by-path/ . +A device may belong to several domains. The domain of an array is a union +of domains of all devices in that array. A spare can be automatically +moved from one array to another if the set of the destination array's +.I domains +contains all the +.I domains +of the new disk or if both arrays have the same +.IR spare-group . + +To update hot plug configuration it is necessary to execute +.B mdadm \-\-udev\-rules +command after changing the config file + +Key words used in the +.I POLICY +line and supported values are: + +.RS 7 +.TP +.B domain= +any arbitrary string +.TP +.B metadata= +0.9 1.x ddf or imsm +.TP +.B path= +file glob matching anything from +.B /dev/disk/by-path +.TP +.B type= +either +.B disk +or +.BR part . +.TP +.B action= +include, re-add, spare, spare-same-slot, or force-spare +.TP +.B auto= +yes, no, or homehost. + +.P +The +.I action +item determines the automatic behavior allowed for devices matching the +.I path +and +.I type +in the same line. If a device matches several lines with different +.I actions +then the most permissive will apply. The ordering of policy lines +is irrelevant to the end result. +.TP +.B include +allows adding a disk to an array if metadata on that disk matches that array +.TP +.B re\-add +will include the device in the array if it appears to be a current member +or a member that was recently removed and the array has a +write-intent-bitmap to allow the +.B re\-add +functionality. +.TP +.B spare +as above and additionally: if the device is bare it can +become a spare if there is any array that it is a candidate for based +on domains and metadata. +.TP +.B spare\-same\-slot +as above and additionally if given slot was used by an array that went +degraded recently and the device plugged in has no metadata then it will +be automatically added to that array (or it's container) +.TP +.B force\-spare +as above and the disk will become a spare in remaining cases +.RE + +.SH EXAMPLE +DEVICE /dev/sd[bcdjkl]1 +.br +DEVICE /dev/hda1 /dev/hdb1 + +# /dev/md0 is known by its UUID. +.br +ARRAY /dev/md0 UUID=3aaa0122:29827cfa:5331ad66:ca767371 +.br +# /dev/md1 contains all devices with a minor number of +.br +# 1 in the superblock. +.br +ARRAY /dev/md1 superminor=1 +.br +# /dev/md2 is made from precisely these two devices +.br +ARRAY /dev/md2 devices=/dev/hda1,/dev/hdb1 + +# /dev/md4 and /dev/md5 are a spare-group and spares +.br +# can be moved between them +.br +ARRAY /dev/md4 uuid=b23f3c6d:aec43a9f:fd65db85:369432df +.br + spare\-group=group1 +.br +ARRAY /dev/md5 uuid=19464854:03f71b1b:e0df2edd:246cc977 +.br + spare\-group=group1 +.br +# /dev/md/home is created if need to be a partitionable md array +.br +# any spare device number is allocated. +.br +ARRAY /dev/md/home UUID=9187a482:5dde19d9:eea3cc4a:d646ab8b +.br + auto=part +.br +# The name of this array contains a space. +.br +ARRAY /dev/md9 name='Data Storage' +.sp +POLICY domain=domain1 metadata=imsm path=pci-0000:00:1f.2-scsi-* +.br + action=spare +.br +POLICY domain=domain1 metadata=imsm path=pci-0000:04:00.0-scsi-[01]* +.br + action=include +.br +# One domain comprising of devices attached to specified paths is defined. +.br +# Bare device matching first path will be made an imsm spare on hot plug. +.br +# If more than one array is created on devices belonging to domain1 and +.br +# one of them becomes degraded, then any imsm spare matching any path for +.br +# given domain name can be migrated. +.br +MAILADDR root@mydomain.tld +.br +PROGRAM /usr/sbin/handle\-mdadm\-events +.br +CREATE group=system mode=0640 auto=part\-8 +.br +HOMEHOST <system> +.br +AUTO +1.x homehost \-all + +.SH SEE ALSO +.BR mdadm (8), +.BR md (4). diff --git a/mdadm.h b/mdadm.h new file mode 100644 index 00000000..b5976582 --- /dev/null +++ b/mdadm.h @@ -0,0 +1,1574 @@ +/* + * mdadm - manage Linux "md" devices aka RAID arrays. + * + * Copyright (C) 2001-2009 Neil Brown <neilb@suse.de> + * + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Author: Neil Brown + * Email: <neilb@suse.de> + */ + +#define _GNU_SOURCE +#define _FILE_OFFSET_BITS 64 +#include <unistd.h> +#ifdef __GLIBC__ +extern __off64_t lseek64 __P ((int __fd, __off64_t __offset, int __whence)); +#elif !defined(lseek64) +# if defined(__NO_STAT64) || __WORDSIZE != 32 +# define lseek64 lseek +# endif +#endif + +#include <sys/types.h> +#include <sys/stat.h> +#include <stdlib.h> +#include <time.h> +#include <sys/time.h> +#include <getopt.h> +#include <fcntl.h> +#include <stdio.h> +#include <errno.h> +#include <string.h> +#include <syslog.h> +#ifdef __dietlibc__ +#include <strings.h> +/* dietlibc has deprecated random and srandom!! */ +#define random rand +#define srandom srand +#endif + +#include <linux/kdev_t.h> +/*#include <linux/fs.h> */ +#include <sys/mount.h> +#include <asm/types.h> +#include <sys/ioctl.h> +#define MD_MAJOR 9 +#define MdpMinorShift 6 + +#ifndef BLKGETSIZE64 +#define BLKGETSIZE64 _IOR(0x12,114,size_t) /* return device size in bytes (u64 *arg) */ +#endif + +#define DEFAULT_CHUNK 512 +#define DEFAULT_BITMAP_CHUNK 4096 +#define DEFAULT_BITMAP_DELAY 5 +#define DEFAULT_MAX_WRITE_BEHIND 256 + +/* MAP_DIR should be somewhere that persists across the pivotroot + * from early boot to late boot. + * /run seems to have emerged as the best standard. + */ +#ifndef MAP_DIR +#define MAP_DIR "/run/mdadm" +#endif /* MAP_DIR */ +/* MAP_FILE is what we name the map file we put in MAP_DIR, in case you + * want something other than the default of "map" + */ +#ifndef MAP_FILE +#define MAP_FILE "map" +#endif /* MAP_FILE */ +/* MDMON_DIR is where pid and socket files used for communicating + * with mdmon normally live. Best is /var/run/mdadm as + * mdmon is needed at early boot then it needs to write there prior + * to /var/run being mounted read/write, and it also then needs to + * persist beyond when /var/run is mounter read-only. So, to be + * safe, the default is somewhere that is read/write early in the + * boot process and stays up as long as possible during shutdown. + */ +#ifndef MDMON_DIR +#define MDMON_DIR "/run/mdadm" +#endif /* MDMON_DIR */ + +/* FAILED_SLOTS is where to save files storing recent removal of array + * member in order to allow future reuse of disk inserted in the same + * slot for array recovery + */ +#ifndef FAILED_SLOTS_DIR +#define FAILED_SLOTS_DIR "/run/mdadm/failed-slots" +#endif /* FAILED_SLOTS */ + +#include "md_u.h" +#include "md_p.h" +#include "bitmap.h" +#include "msg.h" + +#include <endian.h> +/* Redhat don't like to #include <asm/byteorder.h>, and + * some time include <linux/byteorder/xxx_endian.h> isn't enough, + * and there is no standard conversion function so... */ +/* And dietlibc doesn't think byteswap is ok, so.. */ +/* #include <byteswap.h> */ +#define bswap_16(x) (((x) & 0x00ffU) << 8 | \ + ((x) & 0xff00U) >> 8) +#define bswap_32(x) (((x) & 0x000000ffU) << 24 | \ + ((x) & 0xff000000U) >> 24 | \ + ((x) & 0x0000ff00U) << 8 | \ + ((x) & 0x00ff0000U) >> 8) +#define bswap_64(x) (((x) & 0x00000000000000ffULL) << 56 | \ + ((x) & 0xff00000000000000ULL) >> 56 | \ + ((x) & 0x000000000000ff00ULL) << 40 | \ + ((x) & 0x00ff000000000000ULL) >> 40 | \ + ((x) & 0x0000000000ff0000ULL) << 24 | \ + ((x) & 0x0000ff0000000000ULL) >> 24 | \ + ((x) & 0x00000000ff000000ULL) << 8 | \ + ((x) & 0x000000ff00000000ULL) >> 8) + +#if !defined(__KLIBC__) +#if BYTE_ORDER == LITTLE_ENDIAN +#define __cpu_to_le16(_x) (unsigned int)(_x) +#define __cpu_to_le32(_x) (unsigned int)(_x) +#define __cpu_to_le64(_x) (unsigned long long)(_x) +#define __le16_to_cpu(_x) (unsigned int)(_x) +#define __le32_to_cpu(_x) (unsigned int)(_x) +#define __le64_to_cpu(_x) (unsigned long long)(_x) + +#define __cpu_to_be16(_x) bswap_16(_x) +#define __cpu_to_be32(_x) bswap_32(_x) +#define __cpu_to_be64(_x) bswap_64(_x) +#define __be16_to_cpu(_x) bswap_16(_x) +#define __be32_to_cpu(_x) bswap_32(_x) +#define __be64_to_cpu(_x) bswap_64(_x) +#elif BYTE_ORDER == BIG_ENDIAN +#define __cpu_to_le16(_x) bswap_16(_x) +#define __cpu_to_le32(_x) bswap_32(_x) +#define __cpu_to_le64(_x) bswap_64(_x) +#define __le16_to_cpu(_x) bswap_16(_x) +#define __le32_to_cpu(_x) bswap_32(_x) +#define __le64_to_cpu(_x) bswap_64(_x) + +#define __cpu_to_be16(_x) (unsigned int)(_x) +#define __cpu_to_be32(_x) (unsigned int)(_x) +#define __cpu_to_be64(_x) (unsigned long long)(_x) +#define __be16_to_cpu(_x) (unsigned int)(_x) +#define __be32_to_cpu(_x) (unsigned int)(_x) +#define __be64_to_cpu(_x) (unsigned long long)(_x) +#else +# error "unknown endianness." +#endif +#endif /* __KLIBC__ */ + +/* + * min()/max()/clamp() macros that also do + * strict type-checking.. See the + * "unnecessary" pointer comparison. + */ +#define min(x, y) ({ \ + typeof(x) _min1 = (x); \ + typeof(y) _min2 = (y); \ + (void) (&_min1 == &_min2); \ + _min1 < _min2 ? _min1 : _min2; }) + +#define max(x, y) ({ \ + typeof(x) _max1 = (x); \ + typeof(y) _max2 = (y); \ + (void) (&_max1 == &_max2); \ + _max1 > _max2 ? _max1 : _max2; }) + +#define ARRAY_SIZE(x) (sizeof(x)/sizeof(x[0])) + +extern const char Name[]; + +/* general information that might be extracted from a superblock */ +struct mdinfo { + mdu_array_info_t array; + mdu_disk_info_t disk; + __u64 events; + int uuid[4]; + char name[33]; + unsigned long long data_offset; + unsigned long long new_data_offset; + unsigned long long component_size; /* same as array.size, except in + * sectors and up to 64bits. + */ + unsigned long long custom_array_size; /* size for non-default sized + * arrays (in sectors) + */ +#define NO_RESHAPE 0 +#define VOLUME_RESHAPE 1 +#define CONTAINER_RESHAPE 2 +#define RESHAPE_NO_BACKUP 16 /* Mask 'or'ed in */ + int reshape_active; + unsigned long long reshape_progress; + int recovery_blocked; /* for external metadata it + * indicates that there is + * reshape in progress in + * container, + * for native metadata it is + * reshape_active field mirror + */ + /* During reshape we can sometimes change the data_offset to avoid + * over-writing still-valid data. We need to know if there is space. + * So getinfo_super will fill in space_before and space_after in sectors. + * data_offset can be increased or decreased by this amount. + */ + unsigned long long space_before, space_after; + union { + unsigned long long resync_start; /* per-array resync position */ + unsigned long long recovery_start; /* per-device rebuild position */ + #define MaxSector (~0ULL) /* resync/recovery complete position */ + }; + long bitmap_offset; /* 0 == none, 1 == a file */ + unsigned long safe_mode_delay; /* ms delay to mark clean */ + int new_level, delta_disks, new_layout, new_chunk; + int errors; + unsigned long cache_size; /* size of raid456 stripe cache*/ + int mismatch_cnt; + char text_version[50]; + + int container_member; /* for assembling external-metatdata arrays + * This is to be used internally by metadata + * handler only */ + int container_enough; /* flag external handlers can set to + * indicate that subarrays have not enough (-1), + * enough to start (0), or all expected disks (1) */ + char sys_name[20]; + struct mdinfo *devs; + struct mdinfo *next; + + /* Device info for mdmon: */ + int recovery_fd; + int state_fd; + #define DS_FAULTY 1 + #define DS_INSYNC 2 + #define DS_WRITE_MOSTLY 4 + #define DS_SPARE 8 + #define DS_BLOCKED 16 + #define DS_REMOVE 1024 + #define DS_UNBLOCK 2048 + int prev_state, curr_state, next_state; + +}; + +struct createinfo { + int uid; + int gid; + int autof; + int mode; + int symlinks; + int names; + int bblist; + struct supertype *supertype; +}; + +enum mode { + ASSEMBLE=1, + BUILD, + CREATE, + MANAGE, + MISC, + MONITOR, + GROW, + INCREMENTAL, + AUTODETECT, + mode_count +}; + +extern char short_options[]; +extern char short_bitmap_options[]; +extern char short_bitmap_auto_options[]; +extern struct option long_options[]; +extern char Version[], Usage[], Help[], OptionHelp[], + *mode_help[], + Help_create[], Help_build[], Help_assemble[], Help_grow[], + Help_incr[], + Help_manage[], Help_misc[], Help_monitor[], Help_config[]; + +/* for option that don't have short equivilents, we assign arbitrary + * numbers later than any 'short' character option. + */ +enum special_options { + AssumeClean = 300, + BitmapChunk, + WriteBehind, + ReAdd, + NoDegraded, + Sparc22, + BackupFile, + HomeHost, + AutoHomeHost, + Symlinks, + AutoDetect, + Waitclean, + DetailPlatform, + KillSubarray, + UpdateSubarray, + IncrementalPath, + NoSharing, + HelpOptions, + Brief, + ManageOpt, + Add, + AddSpare, + Remove, + Fail, + Replace, + With, + MiscOpt, + WaitOpt, + ConfigFile, + ChunkSize, + WriteMostly, + Layout, + Auto, + Force, + SuperMinor, + EMail, + ProgramOpt, + Increment, + Fork, + Bitmap, + RebuildMapOpt, + InvalidBackup, + UdevRules, + FreezeReshape, + Continue, + OffRootOpt, + Prefer, + KillOpt, + DataOffset, + ExamineBB, + Dump, + Restore, + Action, +}; + +enum prefix_standard { + JEDEC, + IEC +}; + +/* structures read from config file */ +/* List of mddevice names and identifiers + * Identifiers can be: + * uuid=128-hex-uuid + * super-minor=decimal-minor-number-from-superblock + * devices=comma,separated,list,of,device,names,with,wildcards + * + * If multiple fields are present, the intersection of all matching + * devices is considered + */ +#define UnSet (0xfffe) +struct mddev_ident { + char *devname; + + int uuid_set; + int uuid[4]; + char name[33]; + + int super_minor; + + char *devices; /* comma separated list of device + * names with wild cards + */ + int level; + int raid_disks; + int spare_disks; + struct supertype *st; + int autof; /* 1 for normal, 2 for partitioned */ + char *spare_group; + char *bitmap_file; + int bitmap_fd; + + char *container; /* /dev/whatever name of container, or + * uuid of container. You would expect + * this to be the 'devname' or UUID + * of some other entry. + */ + char *member; /* subarray within a container */ + + struct mddev_ident *next; + union { + /* fields needed by different users of this structure */ + int assembled; /* set when assembly succeeds */ + }; +}; + +struct context { + int readonly; + int runstop; + int verbose; + int brief; + int force; + char *homehost; + int require_homehost; + char *prefer; + int export; + int test; + char *subarray; + char *update; + int scan; + int SparcAdjust; + int autof; + int delay; + int freeze_reshape; + char *backup_file; + int invalid_backup; + char *action; +}; + +struct shape { + int raiddisks; + int sparedisks; + int level; + int layout; + char *layout_str; + int chunk; + int bitmap_chunk; + char *bitmap_file; + int assume_clean; + int write_behind; + unsigned long long size; +}; + +/* List of device names - wildcards expanded */ +struct mddev_dev { + char *devname; + int disposition; /* 'a' for add, 'r' for remove, 'f' for fail, + * 'A' for re_add. + * Not set for names read from .config + */ + char writemostly; /* 1 for 'set writemostly', 2 for 'clear writemostly' */ + int used; /* set when used */ + long long data_offset; + struct mddev_dev *next; +}; + +typedef struct mapping { + char *name; + int num; +} mapping_t; + +struct mdstat_ent { + char devnm[32]; + int active; + char *level; + char *pattern; /* U or up, _ for down */ + int percent; /* -1 if no resync */ + int resync; /* 3 if check, 2 if reshape, 1 if resync, 0 if recovery */ + int devcnt; + int raid_disks; + char * metadata_version; + struct dev_member { + char *name; + struct dev_member *next; + } *members; + struct mdstat_ent *next; +}; + +extern struct mdstat_ent *mdstat_read(int hold, int start); +extern void mdstat_close(void); +extern void free_mdstat(struct mdstat_ent *ms); +extern void mdstat_wait(int seconds); +extern void mdstat_wait_fd(int fd, const sigset_t *sigmask); +extern int mddev_busy(char *devnm); +extern struct mdstat_ent *mdstat_by_component(char *name); +extern struct mdstat_ent *mdstat_by_subdev(char *subdev, char *container); + +struct map_ent { + struct map_ent *next; + char devnm[32]; + char metadata[20]; + int uuid[4]; + int bad; + char *path; +}; +extern int map_update(struct map_ent **mpp, char *devnm, char *metadata, + int uuid[4], char *path); +extern void map_remove(struct map_ent **map, char *devnm); +extern struct map_ent *map_by_uuid(struct map_ent **map, int uuid[4]); +extern struct map_ent *map_by_devnm(struct map_ent **map, char *devnm); +extern void map_free(struct map_ent *map); +extern struct map_ent *map_by_name(struct map_ent **map, char *name); +extern void map_read(struct map_ent **melp); +extern int map_write(struct map_ent *mel); +extern void map_delete(struct map_ent **mapp, char *devnm); +extern void map_add(struct map_ent **melp, + char *devnm, char *metadata, int uuid[4], char *path); +extern int map_lock(struct map_ent **melp); +extern void map_unlock(struct map_ent **melp); +extern void map_fork(void); + +/* various details can be requested */ +enum sysfs_read_flags { + GET_LEVEL = (1 << 0), + GET_LAYOUT = (1 << 1), + GET_COMPONENT = (1 << 2), + GET_CHUNK = (1 << 3), + GET_CACHE = (1 << 4), + GET_MISMATCH = (1 << 5), + GET_VERSION = (1 << 6), + GET_DISKS = (1 << 7), + GET_DEGRADED = (1 << 8), + GET_SAFEMODE = (1 << 9), + GET_BITMAP_LOCATION = (1 << 10), + + GET_DEVS = (1 << 20), /* gets role, major, minor */ + GET_OFFSET = (1 << 21), + GET_SIZE = (1 << 22), + GET_STATE = (1 << 23), + GET_ERROR = (1 << 24), +}; + +/* If fd >= 0, get the array it is open on, + * else use devnm. + */ +extern int sysfs_open(char *devnm, char *devname, char *attr); +extern void sysfs_init(struct mdinfo *mdi, int fd, char *devnm); +extern void sysfs_free(struct mdinfo *sra); +extern struct mdinfo *sysfs_read(int fd, char *devnm, unsigned long options); +extern int sysfs_attr_match(const char *attr, const char *str); +extern int sysfs_match_word(const char *word, char **list); +extern int sysfs_set_str(struct mdinfo *sra, struct mdinfo *dev, + char *name, char *val); +extern int sysfs_set_num(struct mdinfo *sra, struct mdinfo *dev, + char *name, unsigned long long val); +extern int sysfs_set_num_signed(struct mdinfo *sra, struct mdinfo *dev, + char *name, long long val); +extern int sysfs_uevent(struct mdinfo *sra, char *event); +extern int sysfs_get_fd(struct mdinfo *sra, struct mdinfo *dev, + char *name); +extern int sysfs_fd_get_ll(int fd, unsigned long long *val); +extern int sysfs_get_ll(struct mdinfo *sra, struct mdinfo *dev, + char *name, unsigned long long *val); +extern int sysfs_fd_get_two(int fd, unsigned long long *v1, unsigned long long *v2); +extern int sysfs_get_two(struct mdinfo *sra, struct mdinfo *dev, + char *name, unsigned long long *v1, unsigned long long *v2); +extern int sysfs_fd_get_str(int fd, char *val, int size); +extern int sysfs_attribute_available(struct mdinfo *sra, struct mdinfo *dev, + char *name); +extern int sysfs_get_str(struct mdinfo *sra, struct mdinfo *dev, + char *name, char *val, int size); +extern int sysfs_set_safemode(struct mdinfo *sra, unsigned long ms); +extern int sysfs_set_array(struct mdinfo *info, int vers); +extern int sysfs_add_disk(struct mdinfo *sra, struct mdinfo *sd, int resume); +extern int sysfs_disk_to_scsi_id(int fd, __u32 *id); +extern int sysfs_unique_holder(char *devnm, long rdev); +extern int sysfs_freeze_array(struct mdinfo *sra); +extern int sysfs_wait(int fd, int *msec); +extern int load_sys(char *path, char *buf); +extern int reshape_prepare_fdlist(char *devname, + struct mdinfo *sra, + int raid_disks, + int nrdisks, + unsigned long blocks, + char *backup_file, + int *fdlist, + unsigned long long *offsets); +extern void reshape_free_fdlist(int *fdlist, + unsigned long long *offsets, + int size); +extern int reshape_open_backup_file(char *backup, + int fd, + char *devname, + long blocks, + int *fdlist, + unsigned long long *offsets, + char *sysfs_name, + int restart); +extern unsigned long compute_backup_blocks(int nchunk, int ochunk, + unsigned int ndata, unsigned int odata); +extern char *locate_backup(char *name); +extern char *make_backup(char *name); + +extern int save_stripes(int *source, unsigned long long *offsets, + int raid_disks, int chunk_size, int level, int layout, + int nwrites, int *dest, + unsigned long long start, unsigned long long length, + char *buf); +extern int restore_stripes(int *dest, unsigned long long *offsets, + int raid_disks, int chunk_size, int level, int layout, + int source, unsigned long long read_offset, + unsigned long long start, unsigned long long length, + char *src_buf); + +#ifndef Sendmail +#define Sendmail "/usr/lib/sendmail -t" +#endif + +#define SYSLOG_FACILITY LOG_DAEMON + +extern char *map_num(mapping_t *map, int num); +extern int map_name(mapping_t *map, char *name); +extern mapping_t r5layout[], r6layout[], pers[], modes[], faultylayout[]; + +extern char *map_dev_preferred(int major, int minor, int create, + char *prefer); +static inline char *map_dev(int major, int minor, int create) +{ + return map_dev_preferred(major, minor, create, NULL); +} + +struct active_array; +struct metadata_update; + +/* 'struct reshape' records the intermediate states of + * a general reshape. + * The starting geometry is converted to the 'before' geometry + * by at most an atomic level change. They could be the same. + * Similarly the 'after' geometry is converted to the final + * geometry by at most a level change. + * Note that 'before' and 'after' must have the same level. + * 'blocks' is the minimum number of sectors for a reshape unit. + * This will be a multiple of the stripe size in each of the + * 'before' and 'after' geometries. + * If 'blocks' is 0, no restriping is necessary. + * 'min_offset_change' is the minimum change to data_offset to + * allow the reshape to happen. It is at least the larger of + * the old and new chunk sizes, and typically the same as 'blocks' + * divided by number of data disks. + */ +struct reshape { + int level; + int parity; /* number of parity blocks/devices */ + struct { + int layout; + int data_disks; + } before, after; + unsigned long long backup_blocks; + unsigned long long min_offset_change; + unsigned long long stripes; /* number of old stripes that comprise 'blocks'*/ + unsigned long long new_size; /* New size of array in sectors */ +}; + +/* A superswitch provides entry point the a metadata handler. + * + * The superswitch primarily operates on some "metadata" that + * is accessed via the 'supertype'. + * This metadata has one of three possible sources. + * 1/ It is read from a single device. In this case it may not completely + * describe the array or arrays as some information might be on other + * devices. + * 2/ It is read from all devices in a container. In this case all + * information is present. + * 3/ It is created by ->init_super / ->add_to_super. In this case it will + * be complete once enough ->add_to_super calls have completed. + * + * When creating an array inside a container, the metadata will be + * formed by a combination of 2 and 3. The metadata or the array is read, + * then new information is added. + * + * The metadata must sometimes have a concept of a 'current' array + * and a 'current' device. + * The 'current' array is set by init_super to be the newly created array, + * or is set by super_by_fd when it finds it is looking at an array inside + * a container. + * + * The 'current' device is either the device that the metadata was read from + * in case 1, or the last device added by add_to_super in case 3. + * Case 2 does not identify a 'current' device. + */ +extern struct superswitch { + + /* Used to report details of metadata read from a component + * device. ->load_super has been called. + */ + void (*examine_super)(struct supertype *st, char *homehost); + void (*brief_examine_super)(struct supertype *st, int verbose); + void (*brief_examine_subarrays)(struct supertype *st, int verbose); + void (*export_examine_super)(struct supertype *st); + int (*examine_badblocks)(struct supertype *st, int fd, char *devname); + int (*copy_metadata)(struct supertype *st, int from, int to); + + /* Used to report details of an active array. + * ->load_super was possibly given a 'component' string. + */ + void (*detail_super)(struct supertype *st, char *homehost); + void (*brief_detail_super)(struct supertype *st); + void (*export_detail_super)(struct supertype *st); + + /* Optional: platform hardware / firmware details */ + int (*detail_platform)(int verbose, int enumerate_only, char *controller_path); + int (*export_detail_platform)(int verbose, char *controller_path); + + /* Used: + * to get uuid to storing in bitmap metadata + * and 'reshape' backup-data metadata + * To see if a device is being re-added to an array it was part of. + */ + void (*uuid_from_super)(struct supertype *st, int uuid[4]); + + /* Extract generic details from metadata. This could be details about + * the container, or about an individual array within the container. + * The determination is made either by: + * load_super being given a 'component' string. + * validate_geometry determining what to create. + * The info includes both array information and device information. + * The particular device should be: + * The last device added by add_to_super + * The device the metadata was loaded from by load_super + * If 'map' is present, then it is an array raid_disks long + * (raid_disk must already be set and correct) and it is filled + * with 1 for slots that are thought to be active and 0 for slots which + * appear to be failed/missing. + * *info is zeroed out before data is added. + */ + void (*getinfo_super)(struct supertype *st, struct mdinfo *info, char *map); + struct mdinfo *(*getinfo_super_disks)(struct supertype *st); + /* Check if the given metadata is flagged as belonging to "this" + * host. 0 for 'no', 1 for 'yes', -1 for "Don't record homehost" + */ + int (*match_home)(struct supertype *st, char *homehost); + + /* Make one of several generic modifications to metadata + * prior to assembly (or other times). + * sparc2.2 - first bug in early 0.90 metadata + * super-minor - change name of 0.90 metadata + * summaries - 'correct' any redundant data + * resync - mark array as dirty to trigger a resync. + * uuid - set new uuid - only 0.90 or 1.x + * name - change the name of the array (where supported) + * homehost - change which host this array is tied to. + * devicesize - If metadata is at start of device, change recorded + * device size to match actual device size + * byteorder - swap bytes for 0.90 metadata + * + * force-one - mark that device as uptodate, not old or failed. + * force-array - mark array as clean if it would not otherwise + * assemble + * assemble - not sure how this is different from force-one... + * linear-grow-new - add a new device to a linear array, but don't + * change the size: so superblock still matches + * linear-grow-update - now change the size of the array. + * writemostly - set the WriteMostly1 bit in the superblock devflags + * readwrite - clear the WriteMostly1 bit in the superblock devflags + * no-bitmap - clear any record that a bitmap is present. + * bbl - add a bad-block-log if possible + * no-bbl - remove and bad-block-log is it is empty. + * revert-reshape - If a reshape is in progress, modify metadata so + * it will resume going in the opposite direction. + */ + int (*update_super)(struct supertype *st, struct mdinfo *info, + char *update, + char *devname, int verbose, + int uuid_set, char *homehost); + + /* Create new metadata for new array as described. This could + * be a new container, or an array in a pre-existing container. + * Also used to zero metadata prior to writing it to invalidate old + * metadata. + */ + int (*init_super)(struct supertype *st, mdu_array_info_t *info, + unsigned long long size, char *name, + char *homehost, int *uuid, + unsigned long long data_offset); + + /* update the metadata to include new device, either at create or + * when hot-adding a spare. + */ + int (*add_to_super)(struct supertype *st, mdu_disk_info_t *dinfo, + int fd, char *devname, + unsigned long long data_offset); + /* update the metadata to delete a device, + * when hot-removing. + */ + int (*remove_from_super)(struct supertype *st, mdu_disk_info_t *dinfo); + + /* Write metadata to one device when fixing problems or adding + * a new device. + */ + int (*store_super)(struct supertype *st, int fd); + + /* Write all metadata for this array. + */ + int (*write_init_super)(struct supertype *st); + /* Check if metadata read from one device is compatible with an array, + * used when assembling an array, or pseudo-assembling was with + * "--examine --brief" + * If "st" has not yet been loaded the superblock from, "tst" is + * moved in, otherwise the superblock in 'st' is compared with + * 'tst'. + */ + int (*compare_super)(struct supertype *st, struct supertype *tst); + /* Load metadata from a single device. If 'devname' is not NULL + * print error messages as appropriate */ + int (*load_super)(struct supertype *st, int fd, char *devname); + /* 'fd' is a 'container' md array - load array metadata from the + * whole container. + */ + int (*load_container)(struct supertype *st, int fd, char *devname); + /* If 'arg' is a valid name of this metadata type, allocate and + * return a 'supertype' for the particular minor version */ + struct supertype * (*match_metadata_desc)(char *arg); + /* If a device has the given size, and the data_offset has been + * requested - work out how much space is available for data. + * This involves adjusting for reserved space (e.g. bitmaps) + * and for any rounding. + * 'mdadm' only calls this for existing arrays where a possible + * spare is being added. However some super-handlers call it + * internally from validate_geometry when creating an array. + */ + __u64 (*avail_size)(struct supertype *st, __u64 size, + unsigned long long data_offset); + /* This is similar to 'avail_size' in purpose, but is used for + * containers for which there is no 'component size' to compare. + * This reports that whole-device size which is a minimum + */ + unsigned long long (*min_acceptable_spare_size)(struct supertype *st); + /* Find somewhere to put a bitmap - possibly auto-size it - and + * update the metadata to record this. The array may be newly + * created, in which case data_size may be updated, or it might + * already exist. Metadata handler can know if init_super + * has been called, but not write_init_super. + */ + int (*add_internal_bitmap)(struct supertype *st, int *chunkp, + int delay, int write_behind, + unsigned long long size, int may_change, int major); + /* Seek 'fd' to start of write-intent-bitmap. Must be an + * md-native format bitmap + */ + void (*locate_bitmap)(struct supertype *st, int fd); + /* if add_internal_bitmap succeeded for existing array, this + * writes it out. + */ + int (*write_bitmap)(struct supertype *st, int fd); + /* Free the superblock and any other allocated data */ + void (*free_super)(struct supertype *st); + + /* validate_geometry is called with an st returned by + * match_metadata_desc. + * It should check that the geometry described is compatible with + * the metadata type. It will be called repeatedly as devices + * added to validate changing size and new devices. If there are + * inter-device dependencies, it should record sufficient details + * so these can be validated. + * Both 'size' and '*freesize' are in sectors. chunk is KiB. + * Return value is: + * 1: everything is OK + * 0: not OK for some reason - if 'verbose', then error was reported. + * -1: st->sb was NULL, 'subdev' is a member of a container of this + * type, but array is not acceptable for some reason + * message was reported even if verbose is 0. + */ + int (*validate_geometry)(struct supertype *st, int level, int layout, + int raiddisks, + int *chunk, unsigned long long size, + unsigned long long data_offset, + char *subdev, unsigned long long *freesize, + int verbose); + + /* Return a linked list of 'mdinfo' structures for all arrays + * in the container. For non-containers, it is like + * getinfo_super with an allocated mdinfo.*/ + struct mdinfo *(*container_content)(struct supertype *st, char *subarray); + /* query the supertype for default geometry */ + void (*default_geometry)(struct supertype *st, int *level, int *layout, int *chunk); /* optional */ + /* Permit subarray's to be deleted from inactive containers */ + int (*kill_subarray)(struct supertype *st); /* optional */ + /* Permit subarray's to be modified */ + int (*update_subarray)(struct supertype *st, char *subarray, + char *update, struct mddev_ident *ident); /* optional */ + /* Check if reshape is supported for this external format. + * st is obtained from super_by_fd() where st->subarray[0] is + * initialized to indicate if reshape is being performed at the + * container or subarray level + */ +#define APPLY_METADATA_CHANGES 1 +#define ROLLBACK_METADATA_CHANGES 0 + + int (*reshape_super)(struct supertype *st, + unsigned long long size, int level, + int layout, int chunksize, int raid_disks, + int delta_disks, char *backup, char *dev, + int direction, + int verbose); /* optional */ + int (*manage_reshape)( /* optional */ + int afd, struct mdinfo *sra, struct reshape *reshape, + struct supertype *st, unsigned long blocks, + int *fds, unsigned long long *offsets, + int dests, int *destfd, unsigned long long *destoffsets); + +/* for mdmon */ + int (*open_new)(struct supertype *c, struct active_array *a, + char *inst); + + /* Tell the metadata handler the current state of the array. + * This covers whether it is known to be consistent (no pending writes) + * and how far along a resync is known to have progressed + * (in a->resync_start). + * resync status is really irrelevant if the array is not consistent, + * but some metadata (DDF!) have a place to record the distinction. + * If 'consistent' is '2', then the array can mark it dirty if a + * resync/recovery/whatever is required, or leave it clean if not. + * Return value is 0 dirty (not consistent) and 1 if clean. + * it is only really important if consistent is passed in as '2'. + */ + int (*set_array_state)(struct active_array *a, int consistent); + + /* When the state of a device might have changed, we call set_disk to + * tell the metadata what the current state is. + * Typically this happens on spare->in_sync and (spare|in_sync)->faulty + * transitions. + * set_disk might be called when the state of the particular disk has + * not in fact changed. + */ + void (*set_disk)(struct active_array *a, int n, int state); + void (*sync_metadata)(struct supertype *st); + void (*process_update)(struct supertype *st, + struct metadata_update *update); + /* Prepare updates allocates extra memory that might be + * needed. If the update cannot be understood, return 0. + */ + int (*prepare_update)(struct supertype *st, + struct metadata_update *update); + + /* activate_spare will check if the array is degraded and, if it + * is, try to find some spare space in the container. + * On success, it add appropriate updates (For process_update) to + * to the 'updates' list and returns a list of 'mdinfo' identifying + * the device, or devices as there might be multiple missing + * devices and multiple spares available. + */ + struct mdinfo *(*activate_spare)(struct active_array *a, + struct metadata_update **updates); + /* + * Return statically allocated string that represents metadata specific + * controller domain of the disk. The domain is used in disk domain + * matching functions. Disks belong to the same domain if the they have + * the same domain from mdadm.conf and belong the same metadata domain. + * Returning NULL or not providing this handler means that metadata + * does not distinguish the differences between disks that belong to + * different controllers. They are in the domain specified by + * configuration file (mdadm.conf). + * In case when the metadata has the notion of domains based on disk + * it shall return NULL for disks that do not belong to the controller + * the supported domains. Such disks will form another domain and won't + * be mixed with supported ones. + */ + const char *(*get_disk_controller_domain)(const char *path); + + /* for external backup area */ + int (*recover_backup)(struct supertype *st, struct mdinfo *info); + + /* validate container after assemble */ + int (*validate_container)(struct mdinfo *info); + + int swapuuid; /* true if uuid is bigending rather than hostendian */ + int external; + const char *name; /* canonical metadata name */ +} *superlist[]; + +extern struct superswitch super0, super1; +extern struct superswitch super_imsm, super_ddf; +extern struct superswitch mbr, gpt; + +struct metadata_update { + int len; + char *buf; + void *space; /* allocated space that monitor will use */ + void **space_list; /* list of allocated spaces that monitor can + * use or that it returned. + */ + struct metadata_update *next; +}; + +/* A supertype holds a particular collection of metadata. + * It identifies the metadata type by the superswitch, and the particular + * sub-version of that metadata type. + * metadata read in or created is stored in 'sb' and 'info'. + * There are also fields used by mdmon to track containers. + * + * A supertype may refer to: + * Just an array, possibly in a container + * A container, not identifying any particular array + * Info read from just one device, not yet fully describing the array/container. + * + * + * A supertype is created by: + * super_by_fd + * guess_super + * dup_super + */ +struct supertype { + struct superswitch *ss; + int minor_version; + int max_devs; + char container_devnm[32]; /* devnm of container */ + void *sb; + void *info; + void *other; /* Hack used to convert v0.90 to v1.0 */ + unsigned long long devsize; + unsigned long long data_offset; /* used by v1.x only */ + int ignore_hw_compat; /* used to inform metadata handlers that it should ignore + HW/firmware related incompatability to load metadata. + Used when examining metadata to display content of disk + when user has no hw/firmare compatible system. + */ + struct metadata_update *updates; + struct metadata_update **update_tail; + + /* extra stuff used by mdmon */ + struct active_array *arrays; + int sock; /* listen to external programs */ + char devnm[32]; /* e.g. md0. This appears in metadata_version: + * external:/md0/12 + */ + int devcnt; + int retry_soon; + + struct mdinfo *devs; + +}; + +extern struct supertype *super_by_fd(int fd, char **subarray); +enum guess_types { guess_any, guess_array, guess_partitions }; +extern struct supertype *guess_super_type(int fd, enum guess_types guess_type); +static inline struct supertype *guess_super(int fd) { + return guess_super_type(fd, guess_any); +} +extern struct supertype *dup_super(struct supertype *st); +extern int get_dev_size(int fd, char *dname, unsigned long long *sizep); +extern int must_be_container(int fd); +extern int dev_size_from_id(dev_t id, unsigned long long *size); +void wait_for(char *dev, int fd); + +/* + * Data structures for policy management. + * Each device can have a policy structure that lists + * various name/value pairs each possibly with a metadata associated. + * The policy list is sorted by name/value/metadata + */ +struct dev_policy { + struct dev_policy *next; + char *name; /* None of these strings are allocated. They are + * all just references to strings which are known + * to exist elsewhere. + * name and metadata can be compared by address equality. + */ + const char *metadata; + const char *value; +}; + +extern char pol_act[], pol_domain[], pol_metadata[], pol_auto[]; + +/* iterate over the sublist starting at list, having the same + * 'name' as 'list', and matching the given metadata (Where + * NULL matches anything + */ +#define pol_for_each(item, list, _metadata) \ + for (item = list; \ + item && item->name == list->name; \ + item = item->next) \ + if (!(!_metadata || !item->metadata || _metadata == item->metadata)) \ + ; else + +/* + * policy records read from mdadm are largely just name-value pairs. + * The names are constants, not strdupped + */ +struct pol_rule { + struct pol_rule *next; + char *type; /* rule_policy or rule_part */ + struct rule { + struct rule *next; + char *name; + char *value; + char *dups; /* duplicates of 'value' with a partNN appended */ + } *rule; +}; + +extern char rule_policy[], rule_part[]; +extern char rule_path[], rule_type[]; +extern char type_part[], type_disk[]; + +extern void policyline(char *line, char *type); +extern void policy_add(char *type, ...); +extern void policy_free(void); + +extern struct dev_policy *path_policy(char *path, char *type); +extern struct dev_policy *disk_policy(struct mdinfo *disk); +extern struct dev_policy *devid_policy(int devid); +extern void dev_policy_free(struct dev_policy *p); + +//extern void pol_new(struct dev_policy **pol, char *name, char *val, char *metadata); +extern void pol_add(struct dev_policy **pol, char *name, char *val, char *metadata); +extern struct dev_policy *pol_find(struct dev_policy *pol, char *name); + +enum policy_action { + act_default, + act_include, + act_re_add, + act_spare, /* This only applies to bare devices */ + act_spare_same_slot, /* this allows non-bare devices, + * but only if recent removal */ + act_force_spare, /* this allow non-bare devices in any case */ + act_err +}; + +extern int policy_action_allows(struct dev_policy *plist, const char *metadata, + enum policy_action want); +extern int disk_action_allows(struct mdinfo *disk, const char *metadata, + enum policy_action want); + +struct domainlist { + struct domainlist *next; + const char *dom; +}; + +extern int domain_test(struct domainlist *dom, struct dev_policy *pol, + const char *metadata); +extern struct domainlist *domain_from_array(struct mdinfo *mdi, + const char *metadata); +extern void domainlist_add_dev(struct domainlist **dom, int devid, + const char *metadata); +extern void domain_free(struct domainlist *dl); +extern void domain_merge(struct domainlist **domp, struct dev_policy *pol, + const char *metadata); +void domain_add(struct domainlist **domp, char *domain); + +extern void policy_save_path(char *id_path, struct map_ent *array); +extern int policy_check_path(struct mdinfo *disk, struct map_ent *array); + +#if __GNUC__ < 3 +struct stat64; +#endif + +#define HAVE_NFTW we assume +#define HAVE_FTW + +#ifdef __UCLIBC__ +# include <features.h> +# ifndef __UCLIBC_HAS_LFS__ +# define lseek64 lseek +# endif +# ifndef __UCLIBC_HAS_FTW__ +# undef HAVE_FTW +# undef HAVE_NFTW +# endif +#endif + +#ifdef __dietlibc__ +# undef HAVE_NFTW +#endif + +#if defined(__KLIBC__) +# undef HAVE_NFTW +# undef HAVE_FTW +#endif + +#ifndef HAVE_NFTW +# define FTW_PHYS 1 +# ifndef HAVE_FTW + struct FTW {}; +# endif +#endif + +#ifdef HAVE_FTW +# include <ftw.h> +#endif + +extern int add_dev(const char *name, const struct stat *stb, int flag, struct FTW *s); + +extern int Manage_ro(char *devname, int fd, int readonly); +extern int Manage_run(char *devname, int fd, struct context *c); +extern int Manage_stop(char *devname, int fd, int quiet, + int will_retry); +extern int Manage_subdevs(char *devname, int fd, + struct mddev_dev *devlist, int verbose, int test, + char *update, int force); +extern int autodetect(void); +extern int Grow_Add_device(char *devname, int fd, char *newdev); +extern int Grow_addbitmap(char *devname, int fd, + struct context *c, struct shape *s); +extern int Grow_reshape(char *devname, int fd, + struct mddev_dev *devlist, + unsigned long long data_offset, + struct context *c, struct shape *s); +extern int Grow_restart(struct supertype *st, struct mdinfo *info, + int *fdlist, int cnt, char *backup_file, int verbose); +extern int Grow_continue(int mdfd, struct supertype *st, + struct mdinfo *info, char *backup_file, + int forked, int freeze_reshape); + +extern int restore_backup(struct supertype *st, + struct mdinfo *content, + int working_disks, + int spares, + char **backup_filep, + int verbose); +extern int Grow_continue_command(char *devname, int fd, + char *backup_file, int verbose); + +extern int Assemble(struct supertype *st, char *mddev, + struct mddev_ident *ident, + struct mddev_dev *devlist, + struct context *c); + +extern int Build(char *mddev, struct mddev_dev *devlist, + struct shape *s, struct context *c); + +extern int Create(struct supertype *st, char *mddev, + char *name, int *uuid, + int subdevs, struct mddev_dev *devlist, + struct shape *s, + struct context *c, + unsigned long long data_offset); + +extern int Detail(char *dev, struct context *c); +extern int Detail_Platform(struct superswitch *ss, int scan, int verbose, int export, char *controller_path); +extern int Query(char *dev); +extern int ExamineBadblocks(char *devname, int brief, struct supertype *forcest); +extern int Examine(struct mddev_dev *devlist, struct context *c, + struct supertype *forcest); +extern int Monitor(struct mddev_dev *devlist, + char *mailaddr, char *alert_cmd, + struct context *c, + int daemonise, int oneshot, + int dosyslog, char *pidfile, int increments, + int share); + +extern int Kill(char *dev, struct supertype *st, int force, int verbose, int noexcl); +extern int Kill_subarray(char *dev, char *subarray, int verbose); +extern int Update_subarray(char *dev, char *subarray, char *update, struct mddev_ident *ident, int quiet); +extern int Wait(char *dev); +extern int WaitClean(char *dev, int sock, int verbose); +extern int SetAction(char *dev, char *action); + +extern int Incremental(struct mddev_dev *devlist, struct context *c, + struct supertype *st); +extern void RebuildMap(void); +extern int IncrementalScan(struct context *c, char *devnm); +extern int IncrementalRemove(char *devname, char *path, int verbose); +extern int CreateBitmap(char *filename, int force, char uuid[16], + unsigned long chunksize, unsigned long daemon_sleep, + unsigned long write_behind, + unsigned long long array_size, + int major); +extern int ExamineBitmap(char *filename, int brief, struct supertype *st); +extern int Write_rules(char *rule_name); +extern int bitmap_update_uuid(int fd, int *uuid, int swap); +extern unsigned long bitmap_sectors(struct bitmap_super_s *bsb); +extern int Dump_metadata(char *dev, char *dir, struct context *c, + struct supertype *st); +extern int Restore_metadata(char *dev, char *dir, struct context *c, + struct supertype *st, int only); + +extern int md_get_version(int fd); +extern int get_linux_version(void); +extern int mdadm_version(char *version); +extern unsigned long long parse_size(char *size); +extern int parse_uuid(char *str, int uuid[4]); +extern int parse_layout_10(char *layout); +extern int parse_layout_faulty(char *layout); +extern long parse_num(char *num); +extern int check_ext2(int fd, char *name); +extern int check_reiser(int fd, char *name); +extern int check_raid(int fd, char *name); +extern int check_partitions(int fd, char *dname, + unsigned long long freesize, + unsigned long long size); + +extern int get_mdp_major(void); +extern int get_maj_min(char *dev, int *major, int *minor); +extern int dev_open(char *dev, int flags); +extern int open_dev(char *devnm); +extern void reopen_mddev(int mdfd); +extern int open_dev_flags(char *devnm, int flags); +extern int open_dev_excl(char *devnm); +extern int is_standard(char *dev, int *nump); +extern int same_dev(char *one, char *two); +extern int compare_paths (char* path1,char* path2); +extern void enable_fds(int devices); + +extern int parse_auto(char *str, char *msg, int config); +extern struct mddev_ident *conf_get_ident(char *dev); +extern struct mddev_dev *conf_get_devs(void); +extern int conf_test_dev(char *devname); +extern int conf_test_metadata(const char *version, struct dev_policy *pol, int is_homehost); +extern struct createinfo *conf_get_create_info(void); +extern void set_conffile(char *file); +extern char *conf_get_mailaddr(void); +extern char *conf_get_mailfrom(void); +extern char *conf_get_program(void); +extern char *conf_get_homehost(int *require_homehostp); +extern char *conf_line(FILE *file); +extern char *conf_word(FILE *file, int allow_key); +extern void print_quoted(char *str); +extern void print_escape(char *str); +extern int use_udev(void); +extern unsigned long GCD(unsigned long a, unsigned long b); +extern int conf_name_is_free(char *name); +extern int conf_verify_devnames(struct mddev_ident *array_list); +extern int devname_matches(char *name, char *match); +extern struct mddev_ident *conf_match(struct supertype *st, + struct mdinfo *info, + char *devname, + int verbose, int *rvp); +extern int experimental(void); + +extern void free_line(char *line); +extern int match_oneof(char *devices, char *devname); +extern void uuid_from_super(int uuid[4], mdp_super_t *super); +extern const int uuid_zero[4]; +extern int same_uuid(int a[4], int b[4], int swapuuid); +extern void copy_uuid(void *a, int b[4], int swapuuid); +extern char *__fname_from_uuid(int id[4], int swap, char *buf, char sep); +extern char *fname_from_uuid(struct supertype *st, + struct mdinfo *info, char *buf, char sep); +extern unsigned long calc_csum(void *super, int bytes); +extern int enough(int level, int raid_disks, int layout, int clean, + char *avail); +extern int enough_fd(int fd); +extern int ask(char *mesg); +extern unsigned long long get_component_size(int fd); +extern void remove_partitions(int fd); +extern int test_partition(int fd); +extern int test_partition_from_id(dev_t id); +extern int get_data_disks(int level, int layout, int raid_disks); +extern unsigned long long calc_array_size(int level, int raid_disks, int layout, + int chunksize, unsigned long long devsize); +extern int flush_metadata_updates(struct supertype *st); +extern void append_metadata_update(struct supertype *st, void *buf, int len); +extern int assemble_container_content(struct supertype *st, int mdfd, + struct mdinfo *content, + struct context *c, + char *chosen_name, int *result); +#define INCR_NO 1 +#define INCR_UNSAFE 2 +#define INCR_ALREADY 4 +#define INCR_YES 8 +extern struct mdinfo *container_choose_spares(struct supertype *st, + unsigned long long min_size, + struct domainlist *domlist, + char *spare_group, + const char *metadata, int get_one); +extern int move_spare(char *from_devname, char *to_devname, dev_t devid); +extern int add_disk(int mdfd, struct supertype *st, + struct mdinfo *sra, struct mdinfo *info); +extern int remove_disk(int mdfd, struct supertype *st, + struct mdinfo *sra, struct mdinfo *info); +extern int set_array_info(int mdfd, struct supertype *st, struct mdinfo *info); +unsigned long long min_recovery_start(struct mdinfo *array); + +extern char *human_size(long long bytes); +extern char *human_size_brief(long long bytes, int prefix); +extern void print_r10_layout(int layout); + +extern char *find_free_devnm(int use_partitions); + +extern void put_md_name(char *name); +extern char *devid2kname(int devid); +extern char *devid2devnm(int devid); +extern int devnm2devid(char *devnm); +extern char *get_md_name(char *devnm); + +extern char DefaultConfFile[]; + +extern int create_mddev(char *dev, char *name, int autof, int trustworthy, + char *chosen); +/* values for 'trustworthy' */ +#define LOCAL 1 +#define LOCAL_ANY 10 +#define FOREIGN 2 +#define METADATA 3 +extern int open_mddev(char *dev, int report_errors); +extern int open_container(int fd); +extern int metadata_container_matches(char *metadata, char *devnm); +extern int metadata_subdev_matches(char *metadata, char *devnm); +extern int is_container_member(struct mdstat_ent *ent, char *devname); +extern int is_subarray_active(char *subarray, char *devname); +extern int open_subarray(char *dev, char *subarray, struct supertype *st, int quiet); +extern struct superswitch *version_to_superswitch(char *vers); + +extern int mdmon_running(char *devnm); +extern int mdmon_pid(char *devnm); +extern int check_env(char *name); +extern __u32 random32(void); +extern int start_mdmon(char *devnm); + +extern int child_monitor(int afd, struct mdinfo *sra, struct reshape *reshape, + struct supertype *st, unsigned long stripes, + int *fds, unsigned long long *offsets, + int dests, int *destfd, unsigned long long *destoffsets); +void abort_reshape(struct mdinfo *sra); + +void *super1_make_v0(struct supertype *st, struct mdinfo *info, mdp_super_t *sb0); + +extern void fmt_devname(char *name, int num); +extern char *stat2devnm(struct stat *st); +extern char *fd2devnm(int fd); + +extern int in_initrd(void); + +#define _ROUND_UP(val, base) (((val) + (base) - 1) & ~(base - 1)) +#define ROUND_UP(val, base) _ROUND_UP(val, (typeof(val))(base)) +#define ROUND_UP_PTR(ptr, base) ((typeof(ptr)) \ + (ROUND_UP((unsigned long)(ptr), base))) + +static inline int is_subarray(char *vers) +{ + /* The version string for a 'subarray' (an array in a container) + * is + * /containername/componentname for normal read-write arrays + * -containername/componentname for arrays which mdmon must not + * reconfigure. They might be read-only + * or might be undergoing reshape etc. + * containername is e.g. md0, md_d1 + * componentname is dependant on the metadata. e.g. '1' 'S1' ... + */ + return (*vers == '/' || *vers == '-'); +} + +static inline char *to_subarray(struct mdstat_ent *ent, char *container) +{ + return &ent->metadata_version[10+strlen(container)+1]; +} + +#ifdef DEBUG +#define dprintf(fmt, arg...) \ + fprintf(stderr, "%s: %s: "fmt, Name, __func__, ##arg) +#define dprintf_cont(fmt, arg...) \ + fprintf(stderr, fmt, ##arg) +#else +#define dprintf(fmt, arg...) \ + ({ if (0) fprintf(stderr, "%s: %s: " fmt, Name, __func__, ##arg); 0; }) +#define dprintf_cont(fmt, arg...) \ + ({ if (0) fprintf(stderr, fmt, ##arg); 0; }) +#endif +#include <assert.h> +#include <stdarg.h> +static inline int xasprintf(char **strp, const char *fmt, ...) { + va_list ap; + int ret; + va_start(ap, fmt); + ret = vasprintf(strp, fmt, ap); + va_end(ap); + assert(ret >= 0); + return ret; +} + +#ifdef DEBUG +#define pr_err(fmt, args...) fprintf(stderr, "%s: %s: "fmt, Name, __func__, ##args) +#else +#define pr_err(fmt, args...) fprintf(stderr, "%s: "fmt, Name, ##args) +#endif +#define cont_err(fmt ...) fprintf(stderr, " " fmt) + +void *xmalloc(size_t len); +void *xrealloc(void *ptr, size_t len); +void *xcalloc(size_t num, size_t size); +char *xstrdup(const char *str); + +#define LEVEL_MULTIPATH (-4) +#define LEVEL_LINEAR (-1) +#define LEVEL_FAULTY (-5) + +/* kernel module doesn't know about these */ +#define LEVEL_CONTAINER (-100) +#define LEVEL_UNSUPPORTED (-200) + +/* the kernel does know about this one ... */ +#define LEVEL_NONE (-1000000) + +/* faulty stuff */ + +#define WriteTransient 0 +#define ReadTransient 1 +#define WritePersistent 2 +#define ReadPersistent 3 +#define WriteAll 4 /* doesn't go to device */ +#define ReadFixable 5 +#define Modes 6 + +#define ClearErrors 31 +#define ClearFaults 30 + +#define AllPersist 100 /* internal use only */ +#define NoPersist 101 + +#define ModeMask 0x1f +#define ModeShift 5 + +#ifdef __TINYC__ +#undef minor +#undef major +#undef makedev +#define minor(x) ((x)&0xff) +#define major(x) (((x)>>8)&0xff) +#define makedev(M,m) (((M)<<8) | (m)) +#endif + +/* for raid4/5/6 */ +#define ALGORITHM_LEFT_ASYMMETRIC 0 +#define ALGORITHM_RIGHT_ASYMMETRIC 1 +#define ALGORITHM_LEFT_SYMMETRIC 2 +#define ALGORITHM_RIGHT_SYMMETRIC 3 + +/* Define non-rotating (raid4) algorithms. These allow + * conversion of raid4 to raid5. + */ +#define ALGORITHM_PARITY_0 4 /* P or P,Q are initial devices */ +#define ALGORITHM_PARITY_N 5 /* P or P,Q are final devices. */ + +/* DDF RAID6 layouts differ from md/raid6 layouts in two ways. + * Firstly, the exact positioning of the parity block is slightly + * different between the 'LEFT_*' modes of md and the "_N_*" modes + * of DDF. + * Secondly, or order of datablocks over which the Q syndrome is computed + * is different. + * Consequently we have different layouts for DDF/raid6 than md/raid6. + * These layouts are from the DDFv1.2 spec. + * Interestingly DDFv1.2-Errata-A does not specify N_CONTINUE but + * leaves RLQ=3 as 'Vendor Specific' + */ + +#define ALGORITHM_ROTATING_ZERO_RESTART 8 /* DDF PRL=6 RLQ=1 */ +#define ALGORITHM_ROTATING_N_RESTART 9 /* DDF PRL=6 RLQ=2 */ +#define ALGORITHM_ROTATING_N_CONTINUE 10 /*DDF PRL=6 RLQ=3 */ + +/* For every RAID5 algorithm we define a RAID6 algorithm + * with exactly the same layout for data and parity, and + * with the Q block always on the last device (N-1). + * This allows trivial conversion from RAID5 to RAID6 + */ +#define ALGORITHM_LEFT_ASYMMETRIC_6 16 +#define ALGORITHM_RIGHT_ASYMMETRIC_6 17 +#define ALGORITHM_LEFT_SYMMETRIC_6 18 +#define ALGORITHM_RIGHT_SYMMETRIC_6 19 +#define ALGORITHM_PARITY_0_6 20 +#define ALGORITHM_PARITY_N_6 ALGORITHM_PARITY_N + +/* Define PATH_MAX in case we don't use glibc or standard library does + * not have PATH_MAX defined. Assume max path length is 4K characters. + */ +#ifndef PATH_MAX +#define PATH_MAX 4096 +#endif + +#define RESYNC_NONE -1 +#define RESYNC_DELAYED -2 +#define RESYNC_PENDING -3 +#define RESYNC_UNKNOWN -4 + +/* When using "GET_DISK_INFO" it isn't certain how high + * we need to check. So we impose an absolute limit of + * MAX_DISKS. This needs to be much more than the largest + * number of devices any metadata can support. Currently + * v1.x can support 1920 + */ +#define MAX_DISKS 4096 + +/* Sometimes the 'size' value passed needs to mean "Maximum". + * In those cases with use MAX_SIZE + */ +#define MAX_SIZE 1 + +/* We want to use unsigned numbers for sector counts, but need + * a value for 'invalid'. Use '1'. + */ +#define INVALID_SECTORS 1 +/* And another special number needed for --data_offset=variable */ +#define VARIABLE_OFFSET 3 diff --git a/mdadm.spec b/mdadm.spec new file mode 100644 index 00000000..293cb190 --- /dev/null +++ b/mdadm.spec @@ -0,0 +1,45 @@ +Summary: mdadm is used for controlling Linux md devices (aka RAID arrays) +Name: mdadm +Version: 3.3.4 +Release: 1 +Source: http://www.kernel.org/pub/linux/utils/raid/mdadm/mdadm-%{version}.tar.gz +URL: http://neil.brown.name/blog/mdadm +License: GPL +Group: Utilities/System +BuildRoot: %{_tmppath}/%{name}-root +Obsoletes: mdctl + +%description +mdadm is a program that can be used to create, manage, and monitor +Linux MD (Software RAID) devices. + +%prep +%setup -q +# we want to install in /sbin, not /usr/sbin... +%define _exec_prefix %{nil} + +%build +# This is a debatable issue. The author of this RPM spec file feels that +# people who install RPMs (especially given that the default RPM options +# will strip the binary) are not going to be running gdb against the +# program. +make CXFLAGS="$RPM_OPT_FLAGS" SYSCONFDIR="%{_sysconfdir}" + +%install +make DESTDIR=$RPM_BUILD_ROOT MANDIR=%{_mandir} BINDIR=%{_sbindir} install +install -D -m644 mdadm.conf-example $RPM_BUILD_ROOT/%{_sysconfdir}/mdadm.conf + +%clean +rm -rf $RPM_BUILD_ROOT + +%files +%defattr(-,root,root) +%doc TODO ChangeLog mdadm.conf-example COPYING +%{_sbindir}/mdadm +%{_sbindir}/mdmon +/usr/lib/udev/rules.d/63-md-raid-arrays.rules +/usr/lib/udev/rules.d/64-md-raid-assembly.rules +%config(noreplace,missingok)/%{_sysconfdir}/mdadm.conf +%{_mandir}/man*/md* + +%changelog diff --git a/mdassemble.8 b/mdassemble.8 new file mode 100644 index 00000000..601c1d10 --- /dev/null +++ b/mdassemble.8 @@ -0,0 +1,65 @@ +.\" -*- nroff -*- +.TH MDASSEMBLE 8 "" v3.3.4 +.SH NAME +mdassemble \- assemble MD devices +.I aka +Linux Software RAID + +.SH SYNOPSIS + +.BI mdassemble + +.SH DESCRIPTION +.B mdassemble +is a tiny program that can be used to assemble MD devices inside an +initial ramdisk (initrd) or initramfs; it is meant to replace the in-kernel +automatic RAID detection and activation. +It can be built statically and linked against lightweight libc alternatives, like +.B dietlibc, +.B klibc +or +.B uClibc. + +.SH USAGE +Invoking +.B mdassemble +has the same effect as invoking +.B mdadm \-\-assemble \-\-scan. +.PP +Invoking +.B mdassemble +a second time will make all defined arrays readwrite, this is useful if +using the +.B start_ro +module parameter. + +.SH OPTIONS + +There are no options to +.B mdassemble. + +.SH FILES + +.SS /etc/mdadm/mdadm.conf + +The config file lists which devices may be scanned to see if +they contain MD super block, and gives identifying information +(e.g. UUID) about known MD arrays. See +.BR mdadm.conf (5) +for more details. + +.B mdassemble +supports all configuration parameters defined in +.B mdadm.conf +with the exception of +.B auto= +which is supported only if mdadm was built with the +.B \-DMDASSEMBLE_AUTO +define. + +.SH SEE ALSO +.PP +.BR mdadm (8), +.BR mdadm.conf (5), +.BR md (4), +.BR diet (1). diff --git a/mdassemble.c b/mdassemble.c new file mode 100644 index 00000000..78d363a3 --- /dev/null +++ b/mdassemble.c @@ -0,0 +1,80 @@ +/* + * mdassemble - assemble Linux "md" devices aka RAID arrays. + * + * Copyright (C) 2001-2009 Neil Brown <neilb@suse.de> + * Copyright (C) 2003 Luca Berra <bluca@vodka.it> + * + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Author: Neil Brown + * Email: <neilb@suse.de> + */ + +#include "mdadm.h" +#include "md_p.h" + +char const Name[] = "mdassemble"; + +#ifndef MDASSEMBLE_AUTO +/* from mdopen.c */ +int open_mddev(char *dev, int report_errors/*unused*/) +{ + int mdfd = open(dev, O_RDWR); + if (mdfd < 0) + pr_err("error opening %s: %s\n", + dev, strerror(errno)); + else if (md_get_version(mdfd) <= 0) { + pr_err("%s does not appear to be an md device\n", + dev); + close(mdfd); + mdfd = -1; + } + return mdfd; +} +int create_mddev(char *dev, char *name, int autof/*unused*/, int trustworthy, + char *chosen) +{ + return open_mddev(dev, 0); +} +#endif + +int rv; +int mdfd = -1; + +int main(int argc, char *argv[]) +{ + struct mddev_ident *array_list = conf_get_ident(NULL); + struct context c = { .freeze_reshape = 1 }; + if (!array_list) { + pr_err("No arrays found in config file\n"); + rv = 1; + } else + for (; array_list; array_list = array_list->next) { + mdu_array_info_t array; + if (strcasecmp(array_list->devname, "<ignore>") == 0) + continue; + mdfd = open_mddev(array_list->devname, 0); + if (mdfd >= 0 && ioctl(mdfd, GET_ARRAY_INFO, &array) == 0) { + rv |= Manage_ro(array_list->devname, mdfd, -1); /* make it readwrite */ + continue; + } + if (mdfd >= 0) + close(mdfd); + rv |= Assemble(array_list->st, array_list->devname, + array_list, NULL, &c); + } + return rv; +} diff --git a/mdmon-design.txt b/mdmon-design.txt new file mode 100644 index 00000000..f09184a9 --- /dev/null +++ b/mdmon-design.txt @@ -0,0 +1,146 @@ + +When managing a RAID1 array which uses metadata other than the +"native" metadata understood by the kernel, mdadm makes use of a +partner program named 'mdmon' to manage some aspects of updating +that metadata and synchronising the metadata with the array state. + +This document provides some details on how mdmon works. + +Containers +---------- + +As background: mdadm makes a distinction between an 'array' and a +'container'. Other sources sometimes use the term 'volume' or +'device' for an 'array', and may use the term 'array' for a +'container'. + +For our purposes: + - a 'container' is a collection of devices which are described by a + single set of metadata. The metadata may be stored equally + on all devices, or different devices may have quite different + subsets of the total metadata. But there is conceptually one set + of metadata that unifies the devices. + + - an 'array' is a set of datablock from various devices which + together are used to present the abstraction of a single linear + sequence of block, which may provide data redundancy or enhanced + performance. + +So a container has some metadata and provides a number of arrays which +are described by that metadata. + +Sometimes this model doesn't work perfectly. For example, global +spares may have their own metadata which is quite different from the +metadata from any device that participates in one or more arrays. +Such a global spare might still need to belong to some container so +that it is available to be used should a failure arise. In that case +we consider the 'metadata' to be the union of the metadata on the +active devices which describes the arrays, and the metadata on the +global spares which only describes the spares. In this case different +devices in the one container will have quite different metadata. + + +Purpose +------- + +The main purpose of mdmon is to update the metadata in response to +changes to the array which need to be reflected in the metadata before +futures writes to the array can safely be performed. +These include: + - transitions from 'clean' to 'dirty'. + - recording the devices have failed. + - recording the progress of a 'reshape' + +This requires mdmon to be running at any time that the array is +writable (a read-only array does not require mdmon to be running). + +Because mdmon must be able to process these metadata updates at any +time, it must (when running) have exclusive write access to the +metadata. Any other changes (e.g. reconfiguration of the array) must +go through mdmon. + +A secondary role for mdmon is to activate spares when a device fails. +This role is much less time-critical than the other metadata updates, +so it could be performed by a separate process, possibly +"mdadm --monitor" which has a related role of moving devices between +arrays. A main reason for including this functionality in mdmon is +that in the native-metadata case this function is handled in the +kernel, and mdmon's reason for existence to provide functionality +which is otherwise handled by the kernel. + + +Design overview +--------------- + +mdmon is structured as two threads with a common address space and +common data structures. These threads are know as the 'monitor' and +the 'manager'. + +The 'monitor' has the primary role of monitoring the array for +important state changes and updating the metadata accordingly. As +writes to the array can be blocked until 'monitor' completes and +acknowledges the update, it much be very careful not to block itself. +In particular it must not block waiting for any write to complete else +it could deadlock. This means that it must not allocate memory as +doing this can require dirty memory to be written out and if the +system choose to write to the array that mdmon is monitoring, the +memory allocation could deadlock. + +So 'monitor' must never allocate memory and must limit the number of +other system call it performs. It may: + - use select (or poll) to wait for activity on a file descriptor + - read from a sysfs file descriptor + - write to a sysfs file descriptor + - write the metadata out to the block devices using O_DIRECT + - send a signal (kill) to the manager thread + +It must not e.g. open files or do anything similar that might allocate +resources. + +The 'manager' thread does everything else that is needed. If any +files are to be opened (e.g. because a device has been added to the +array), the manager does that. If any memory needs to be allocated +(e.g. to hold data about a new array as can happen when one set of +metadata describes several arrays), the manager performs that +allocation. + +The 'manager' is also responsible for communicating with mdadm and +assigning spares to replace failed devices. + + +Handling metadata updates +------------------------- + +There are a number of cases in which mdadm needs to update the +metdata which mdmon is managing. These include: + - creating a new array in an active container + - adding a device to a container + - reconfiguring an array +etc. + +To complete these updates, mdadm must send a message to mdmon which +will merge the update into the metadata as it is at that moment. + +To achieve this, mdmon creates a Unix Domain Socket which the manager +thread listens on. mdadm sends a message over this socket. The +manager thread examines the message to see if it will require +allocating any memory and allocates it. This is done in the +'prepare_update' metadata method. + +The update message is then queued for handling by the monitor thread +which it will do when convenient. The monitor thread calls +->process_update which should atomically make the required changes to +the metadata, making use of the pre-allocate memory as required. Any +memory the is no-longer needed can be placed back in the request and +the manager thread will free it. + +The exact format of a metadata update is up to the implementer of the +metadata handlers. It will simply describe a change that needs to be +made. It will sometimes contain fragments of the metadata to be +copied in to place. However the ->process_update routine must make +sure not to over-write any field that the monitor thread might have +updated, such as a 'device failed' or 'array is dirty' state. + +When the monitor thread has completed the update and written it to the +devices, an acknowledgement message is sent back over the socket so +that mdadm knows it is complete. diff --git a/mdmon.8 b/mdmon.8 new file mode 100644 index 00000000..beb82e03 --- /dev/null +++ b/mdmon.8 @@ -0,0 +1,257 @@ +.\" See file COPYING in distribution for details. +.TH MDMON 8 "" v3.3.4 +.SH NAME +mdmon \- monitor MD external metadata arrays + +.SH SYNOPSIS + +.BI mdmon " [--all] [--takeover] [--foreground] CONTAINER" + +.SH OVERVIEW +The 2.6.27 kernel brings the ability to support external metadata arrays. +External metadata implies that user space handles all updates to the metadata. +The kernel's responsibility is to notify user space when a "metadata event" +occurs, like disk failures and clean-to-dirty transitions. The kernel, in +important cases, waits for user space to take action on these notifications. + +.SH DESCRIPTION +.SS Metadata updates: +To service metadata update requests a daemon, +.IR mdmon , +is introduced. +.I Mdmon +is tasked with polling the sysfs namespace looking for changes in +.BR array_state , +.BR sync_action , +and per disk +.BR state +attributes. When a change is detected it calls a per metadata type +handler to make modifications to the metadata. The following actions +are taken: +.RS +.TP +.B array_state \- inactive +Clear the dirty bit for the volume and let the array be stopped +.TP +.B array_state \- write pending +Set the dirty bit for the array and then set +.B array_state +to +.BR active . +Writes +are blocked until userspace writes +.BR active. +.TP +.B array_state \- active-idle +The safe mode timer has expired so set array state to clean to block writes to the array +.TP +.B array_state \- clean +Clear the dirty bit for the volume +.TP +.B array_state \- read-only +This is the initial state that all arrays start at. +.I mdmon +takes one of the three actions: +.RS +.TP +1/ +Transition the array to read-auto keeping the dirty bit clear if the metadata +handler determines that the array does not need resyncing or other modification +.TP +2/ +Transition the array to active if the metadata handler determines a resync or +some other manipulation is necessary +.TP +3/ +Leave the array read\-only if the volume is marked to not be monitored; for +example, the metadata version has been set to "external:\-dev/md127" instead of +"external:/dev/md127" +.RE +.TP +.B sync_action \- resync\-to\-idle +Notify the metadata handler that a resync may have completed. If a resync +process is idled before it completes this event allows the metadata handler to +checkpoint resync. +.TP +.B sync_action \- recover\-to\-idle +A spare may have completed rebuilding so tell the metadata handler about the +state of each disk. This is the metadata handler's opportunity to clear +any "out-of-sync" bits and clear the volume's degraded status. If a recovery +process is idled before it completes this event allows the metadata handler to +checkpoint recovery. +.TP +.B <disk>/state \- faulty +A disk failure kicks off a series of events. First, notify the metadata +handler that a disk has failed, and then notify the kernel that it can unblock +writes that were dependent on this disk. After unblocking the kernel this disk +is set to be removed+ from the member array. Finally the disk is marked failed +in all other member arrays in the container. +.IP ++ Note This behavior differs slightly from native MD arrays where +removal is reserved for a +.B mdadm --remove +event. In the external metadata case the container holds the final +reference on a block device and a +.B mdadm --remove <container> <victim> +call is still required. +.RE + +.SS Containers: +.P +External metadata formats, like DDF, differ from the native MD metadata +formats in that they define a set of disks and a series of sub-arrays +within those disks. MD metadata in comparison defines a 1:1 +relationship between a set of block devices and a RAID array. For +example to create 2 arrays at different RAID levels on a single +set of disks, MD metadata requires the disks be partitioned and then +each array can be created with a subset of those partitions. The +supported external formats perform this disk carving internally. +.P +Container devices simply hold references to all member disks and allow +tools like +.I mdmon +to determine which active arrays belong to which +container. Some array management commands like disk removal and disk +add are now only valid at the container level. Attempts to perform +these actions on member arrays are blocked with error messages like: +.IP +"mdadm: Cannot remove disks from a \'member\' array, perform this +operation on the parent container" +.P +Containers are identified in /proc/mdstat with a metadata version string +"external:<metadata name>". Member devices are identified by +"external:/<container device>/<member index>", or "external:-<container +device>/<member index>" if the array is to remain readonly. + +.SH OPTIONS +.TP +CONTAINER +The +.B container +device to monitor. It can be a full path like /dev/md/container, or a +simple md device name like md127. +.TP +.B \-\-foreground +Normally, +.I mdmon +will fork and continue in the background. Adding this option will +skip that step and run +.I mdmon +in the foreground. +.TP +.B \-\-takeover +This instructs +.I mdmon +to replace any active +.I mdmon +which is currently monitoring the array. This is primarily used late +in the boot process to replace any +.I mdmon +which was started from an +.B initramfs +before the root filesystem was mounted. This avoids holding a +reference on that +.B initramfs +indefinitely and ensures that the +.I pid +and +.I sock +files used to communicate with +.I mdmon +are in a standard place. +.TP +.B \-\-all +This tells mdmon to find any active containers and start monitoring +each of them if appropriate. This is normally used with +.B \-\-takeover +late in the boot sequence. +A separate +.I mdmon +process is started for each container as the +.B \-\-all +argument is over-written with the name of the container. To allow for +containers with names longer than 5 characters, this argument can be +arbitrarily extended, e.g. to +.BR \-\-all-active-arrays . +.TP + +.PP +Note that +.I mdmon +is automatically started by +.I mdadm +when needed and so does not need to be considered when working with +RAID arrays. The only times it is run other than by +.I mdadm +is when the boot scripts need to restart it after mounting the new +root filesystem. + +.SH START UP AND SHUTDOWN + +As +.I mdmon +needs to be running whenever any filesystem on the monitored device is +mounted there are special considerations when the root filesystem is +mounted from an +.I mdmon +monitored device. +Note that in general +.I mdmon +is needed even if the filesystem is mounted read-only as some +filesystems can still write to the device in those circumstances, for +example to replay a journal after an unclean shutdown. + +When the array is assembled by the +.B initramfs +code, mdadm will automatically start +.I mdmon +as required. This means that +.I mdmon +must be installed on the +.B initramfs +and there must be a writable filesystem (typically tmpfs) in which +.B mdmon +can create a +.B .pid +and +.B .sock +file. The particular filesystem to use is given to mdmon at compile +time and defaults to +.BR /run/mdadm . + +This filesystem must persist through to shutdown time. + +After the final root filesystem has be instantiated (usually with +.BR pivot_root ) +.I mdmon +should be run with +.I "\-\-all \-\-takeover" +so that the +.I mdmon +running from the +.B initramfs +can be replaced with one running in the main root, and so the +memory used by the initramfs can be released. + +At shutdown time, +.I mdmon +should not be killed along with other processes. Also as it holds a +file (socket actually) open in +.B /dev +(by default) it will not be possible to unmount +.B /dev +if it is a separate filesystem. + +.SH EXAMPLES + +.B " mdmon \-\-all-active-arrays \-\-takeover" +.br +Any +.I mdmon +which is currently running is killed and a new instance is started. +This should be run during in the boot sequence if an initramfs was +used, so that any mdmon running from the initramfs will not hold +the initramfs active. +.SH SEE ALSO +.IR mdadm (8), +.IR md (4). diff --git a/mdmon.c b/mdmon.c new file mode 100644 index 00000000..ee12b7c7 --- /dev/null +++ b/mdmon.c @@ -0,0 +1,602 @@ +/* + * mdmon - monitor external metadata arrays + * + * Copyright (C) 2007-2009 Neil Brown <neilb@suse.de> + * Copyright (C) 2007-2009 Intel Corporation + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. + */ + +/* + * md array manager. + * When md arrays have user-space managed metadata, this is the program + * that does the managing. + * + * Given one argument: the name of the array (e.g. /dev/md0) that is + * the container. + * We fork off a helper that runs high priority and mlocked. It responds to + * device failures and other events that might stop writeout, or that are + * trivial to deal with. + * The main thread then watches for new arrays being created in the container + * and starts monitoring them too ... along with a few other tasks. + * + * The main thread communicates with the priority thread by writing over + * a pipe. + * Separate programs can communicate with the main thread via Unix-domain + * socket. + * The two threads share address space and open file table. + * + */ + +#ifndef _GNU_SOURCE +#define _GNU_SOURCE +#endif + +#include <unistd.h> +#include <stdlib.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <sys/socket.h> +#include <sys/un.h> +#include <sys/mman.h> +#include <sys/syscall.h> +#include <sys/wait.h> +#include <stdio.h> +#include <errno.h> +#include <string.h> +#include <fcntl.h> +#include <signal.h> +#include <dirent.h> +#ifdef USE_PTHREADS +#include <pthread.h> +#else +#include <sched.h> +#endif + +#include "mdadm.h" +#include "mdmon.h" + +char const Name[] = "mdmon"; + +struct active_array *discard_this; +struct active_array *pending_discard; + +int mon_tid, mgr_tid; + +int sigterm; + +#ifdef USE_PTHREADS +static void *run_child(void *v) +{ + struct supertype *c = v; + + mon_tid = syscall(SYS_gettid); + do_monitor(c); + return 0; +} + +static int clone_monitor(struct supertype *container) +{ + pthread_attr_t attr; + pthread_t thread; + int rc; + + mon_tid = -1; + pthread_attr_init(&attr); + pthread_attr_setstacksize(&attr, 4096); + pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED); + rc = pthread_create(&thread, &attr, run_child, container); + if (rc) + return rc; + while (mon_tid == -1) + usleep(10); + pthread_attr_destroy(&attr); + + mgr_tid = syscall(SYS_gettid); + + return mon_tid; +} +#else /* USE_PTHREADS */ +static int run_child(void *v) +{ + struct supertype *c = v; + + do_monitor(c); + return 0; +} + +#ifdef __ia64__ +int __clone2(int (*fn)(void *), + void *child_stack_base, size_t stack_size, + int flags, void *arg, ... + /* pid_t *pid, struct user_desc *tls, pid_t *ctid */ ); +#endif +static int clone_monitor(struct supertype *container) +{ + static char stack[4096]; + +#ifdef __ia64__ + mon_tid = __clone2(run_child, stack, sizeof(stack), + CLONE_FS|CLONE_FILES|CLONE_VM|CLONE_SIGHAND|CLONE_THREAD, + container); +#else + mon_tid = clone(run_child, stack+4096-64, + CLONE_FS|CLONE_FILES|CLONE_VM|CLONE_SIGHAND|CLONE_THREAD, + container); +#endif + + mgr_tid = syscall(SYS_gettid); + + return mon_tid; +} +#endif /* USE_PTHREADS */ + +static int make_pidfile(char *devname) +{ + char path[100]; + char pid[10]; + int fd; + int n; + + if (mkdir(MDMON_DIR, 0755) < 0 && + errno != EEXIST) + return -errno; + sprintf(path, "%s/%s.pid", MDMON_DIR, devname); + + fd = open(path, O_RDWR|O_CREAT|O_EXCL, 0600); + if (fd < 0) + return -errno; + sprintf(pid, "%d\n", getpid()); + n = write(fd, pid, strlen(pid)); + close(fd); + if (n < 0) + return -errno; + return 0; +} + +static void try_kill_monitor(pid_t pid, char *devname, int sock) +{ + char buf[100]; + int fd; + int n; + long fl; + + /* first rule of survival... don't off yourself */ + if (pid == getpid()) + return; + + /* kill this process if it is mdmon */ + sprintf(buf, "/proc/%lu/cmdline", (unsigned long) pid); + fd = open(buf, O_RDONLY); + if (fd < 0) + return; + + n = read(fd, buf, sizeof(buf)-1); + buf[sizeof(buf)-1] = 0; + close(fd); + + if (n < 0 || !(strstr(buf, "mdmon") || + strstr(buf, "@dmon"))) + return; + + kill(pid, SIGTERM); + + if (sock < 0) + return; + + /* Wait for monitor to exit by reading from the socket, after + * clearing the non-blocking flag */ + fl = fcntl(sock, F_GETFL, 0); + fl &= ~O_NONBLOCK; + fcntl(sock, F_SETFL, fl); + n = read(sock, buf, 100); + /* Ignore result, it is just the wait that + * matters + */ +} + +void remove_pidfile(char *devname) +{ + char buf[100]; + + sprintf(buf, "%s/%s.pid", MDMON_DIR, devname); + unlink(buf); + sprintf(buf, "%s/%s.sock", MDMON_DIR, devname); + unlink(buf); +} + +static int make_control_sock(char *devname) +{ + char path[100]; + int sfd; + long fl; + struct sockaddr_un addr; + + if (sigterm) + return -1; + + sprintf(path, "%s/%s.sock", MDMON_DIR, devname); + unlink(path); + sfd = socket(PF_LOCAL, SOCK_STREAM, 0); + if (sfd < 0) + return -1; + + addr.sun_family = PF_LOCAL; + strcpy(addr.sun_path, path); + umask(077); /* ensure no world write access */ + if (bind(sfd, &addr, sizeof(addr)) < 0) { + close(sfd); + return -1; + } + listen(sfd, 10); + fl = fcntl(sfd, F_GETFL, 0); + fl |= O_NONBLOCK; + fcntl(sfd, F_SETFL, fl); + return sfd; +} + +static void term(int sig) +{ + sigterm = 1; +} + +static void wake_me(int sig) +{ + +} + +/* if we are debugging and starting mdmon by hand then don't fork */ +static int do_fork(void) +{ + #ifdef DEBUG + if (check_env("MDADM_NO_MDMON")) + return 0; + #endif + + return 1; +} + +void usage(void) +{ + fprintf(stderr, +"Usage: mdmon [options] CONTAINER\n" +"\n" +"Options are:\n" +" --help -h : This message\n" +" --all -a : All devices\n" +" --foreground -F : Run in foreground (do not fork)\n" +" --takeover -t : Takeover container\n" +); + exit(2); +} + +static int mdmon(char *devnm, int must_fork, int takeover); + +int main(int argc, char *argv[]) +{ + char *container_name = NULL; + char *devnm = NULL; + int status = 0; + int opt; + int all = 0; + int takeover = 0; + int dofork = 1; + static struct option options[] = { + {"all", 0, NULL, 'a'}, + {"takeover", 0, NULL, 't'}, + {"help", 0, NULL, 'h'}, + {"offroot", 0, NULL, OffRootOpt}, + {"foreground", 0, NULL, 'F'}, + {NULL, 0, NULL, 0} + }; + + if (in_initrd()) { + /* + * set first char of argv[0] to @. This is used by + * systemd to signal that the task was launched from + * initrd/initramfs and should be preserved during shutdown + */ + argv[0][0] = '@'; + } + + while ((opt = getopt_long(argc, argv, "thaF", options, NULL)) != -1) { + switch (opt) { + case 'a': + container_name = argv[optind-1]; + all = 1; + break; + case 't': + takeover = 1; + break; + case 'F': + dofork = 0; + break; + case OffRootOpt: + argv[0][0] = '@'; + break; + case 'h': + default: + usage(); + break; + } + } + + if (all == 0 && container_name == NULL) { + if (argv[optind]) + container_name = argv[optind]; + } + + if (container_name == NULL) + usage(); + + if (argc - optind > 1) + usage(); + + if (strcmp(container_name, "/proc/mdstat") == 0) + all = 1; + + if (all) { + struct mdstat_ent *mdstat, *e; + int container_len = strlen(container_name); + + /* launch an mdmon instance for each container found */ + mdstat = mdstat_read(0, 0); + for (e = mdstat; e; e = e->next) { + if (e->metadata_version && + strncmp(e->metadata_version, "external:", 9) == 0 && + !is_subarray(&e->metadata_version[9])) { + /* update cmdline so this mdmon instance can be + * distinguished from others in a call to ps(1) + */ + if (strlen(e->devnm) <= (unsigned)container_len) { + memset(container_name, 0, container_len); + sprintf(container_name, "%s", e->devnm); + } + status |= mdmon(e->devnm, 1, takeover); + } + } + free_mdstat(mdstat); + + return status; + } else if (strncmp(container_name, "md", 2) == 0) { + int id = devnm2devid(container_name); + if (id) + devnm = container_name; + } else { + struct stat st; + + if (stat(container_name, &st) == 0) + devnm = xstrdup(stat2devnm(&st)); + } + + if (!devnm) { + pr_err("%s is not a valid md device name\n", + container_name); + exit(1); + } + return mdmon(devnm, dofork && do_fork(), takeover); +} + +static int mdmon(char *devnm, int must_fork, int takeover) +{ + int mdfd; + struct mdinfo *mdi, *di; + struct supertype *container; + sigset_t set; + struct sigaction act; + int pfd[2]; + int status; + int ignore; + pid_t victim = -1; + int victim_sock = -1; + + dprintf("starting mdmon for %s\n", devnm); + + mdfd = open_dev(devnm); + if (mdfd < 0) { + pr_err("%s: %s\n", devnm, strerror(errno)); + return 1; + } + if (md_get_version(mdfd) < 0) { + pr_err("%s: Not an md device\n", devnm); + return 1; + } + + /* Fork, and have the child tell us when they are ready */ + if (must_fork) { + if (pipe(pfd) != 0) { + pr_err("failed to create pipe\n"); + return 1; + } + switch(fork()) { + case -1: + pr_err("failed to fork: %s\n", strerror(errno)); + return 1; + case 0: /* child */ + close(pfd[0]); + break; + default: /* parent */ + close(pfd[1]); + if (read(pfd[0], &status, sizeof(status)) != sizeof(status)) { + wait(&status); + status = WEXITSTATUS(status); + } + close(pfd[0]); + return status; + } + } else + pfd[0] = pfd[1] = -1; + + container = xcalloc(1, sizeof(*container)); + strcpy(container->devnm, devnm); + container->arrays = NULL; + container->sock = -1; + + mdi = sysfs_read(mdfd, container->devnm, GET_VERSION|GET_LEVEL|GET_DEVS); + + if (!mdi) { + pr_err("failed to load sysfs info for %s\n", container->devnm); + exit(3); + } + if (mdi->array.level != UnSet) { + pr_err("%s is not a container - cannot monitor\n", devnm); + exit(3); + } + if (mdi->array.major_version != -1 || + mdi->array.minor_version != -2) { + pr_err("%s does not use external metadata - cannot monitor\n", + devnm); + exit(3); + } + + container->ss = version_to_superswitch(mdi->text_version); + if (container->ss == NULL) { + pr_err("%s uses unsupported metadata: %s\n", + devnm, mdi->text_version); + exit(3); + } + + container->devs = NULL; + for (di = mdi->devs; di; di = di->next) { + struct mdinfo *cd = xmalloc(sizeof(*cd)); + *cd = *di; + cd->next = container->devs; + container->devs = cd; + } + sysfs_free(mdi); + + /* SIGUSR is sent between parent and child. So both block it + * and enable it only with pselect. + */ + sigemptyset(&set); + sigaddset(&set, SIGUSR1); + sigaddset(&set, SIGTERM); + sigprocmask(SIG_BLOCK, &set, NULL); + act.sa_handler = wake_me; + act.sa_flags = 0; + sigaction(SIGUSR1, &act, NULL); + act.sa_handler = term; + sigaction(SIGTERM, &act, NULL); + act.sa_handler = SIG_IGN; + sigaction(SIGPIPE, &act, NULL); + + victim = mdmon_pid(container->devnm); + if (victim >= 0) + victim_sock = connect_monitor(container->devnm); + + ignore = chdir("/"); + if (!takeover && victim > 0 && victim_sock >= 0) { + if (fping_monitor(victim_sock) == 0) { + pr_err("%s already managed\n", container->devnm); + exit(3); + } + close(victim_sock); + victim_sock = -1; + } + if (container->ss->load_container(container, mdfd, devnm)) { + pr_err("Cannot load metadata for %s\n", devnm); + exit(3); + } + close(mdfd); + + /* Ok, this is close enough. We can say goodbye to our parent now. + */ + if (victim > 0) + remove_pidfile(devnm); + if (make_pidfile(devnm) < 0) { + exit(3); + } + container->sock = make_control_sock(devnm); + + status = 0; + if (pfd[1] >= 0) { + if (write(pfd[1], &status, sizeof(status)) < 0) + pr_err("failed to notify our parent: %d\n", + getppid()); + close(pfd[1]); + } + + mlockall(MCL_CURRENT | MCL_FUTURE); + + if (clone_monitor(container) < 0) { + pr_err("failed to start monitor process: %s\n", + strerror(errno)); + exit(2); + } + + if (victim > 0) { + try_kill_monitor(victim, container->devnm, victim_sock); + if (victim_sock >= 0) + close(victim_sock); + } + + setsid(); + close(0); + open("/dev/null", O_RDWR); + close(1); + ignore = dup(0); +#ifndef DEBUG + close(2); + ignore = dup(0); +#endif + + /* This silliness is to stop the compiler complaining + * that we ignore 'ignore' + */ + if (ignore) + ignore++; + + do_manager(container); + + exit(0); +} + +/* Some stub functions so super-* can link with us */ +int child_monitor(int afd, struct mdinfo *sra, struct reshape *reshape, + struct supertype *st, unsigned long blocks, + int *fds, unsigned long long *offsets, + int dests, int *destfd, unsigned long long *destoffsets) +{ + return 0; +} + +int restore_stripes(int *dest, unsigned long long *offsets, + int raid_disks, int chunk_size, int level, int layout, + int source, unsigned long long read_offset, + unsigned long long start, unsigned long long length, + char *src_buf) +{ + return 1; +} + +void abort_reshape(struct mdinfo *sra) +{ + return; +} + +int save_stripes(int *source, unsigned long long *offsets, + int raid_disks, int chunk_size, int level, int layout, + int nwrites, int *dest, + unsigned long long start, unsigned long long length, + char *buf) +{ + return 0; +} + +struct superswitch super0 = { + .name = "0.90", +}; +struct superswitch super1 = { + .name = "1.x", +}; diff --git a/mdmon.h b/mdmon.h new file mode 100644 index 00000000..aa750c68 --- /dev/null +++ b/mdmon.h @@ -0,0 +1,110 @@ +/* + * mdmon - monitor external metadata arrays + * + * Copyright (C) 2007-2009 Neil Brown <neilb@suse.de> + * Copyright (C) 2007-2009 Intel Corporation + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. + */ + +extern const char Name[]; + +enum array_state { clear, inactive, suspended, readonly, read_auto, + clean, active, write_pending, active_idle, bad_word}; + +enum sync_action { idle, reshape, resync, recover, check, repair, bad_action }; + +struct active_array { + struct mdinfo info; + struct supertype *container; + struct active_array *next, *replaces; + int to_remove; + + int action_fd; + int resync_start_fd; + int metadata_fd; /* for monitoring rw/ro status */ + int sync_completed_fd; /* for checkpoint notification events */ + unsigned long long last_checkpoint; /* sync_completed fires for many + * reasons this field makes sure the + * kernel has made progress before + * moving the checkpoint. It is + * cleared by the metadata handler + * when it determines recovery is + * terminated. + */ + + enum array_state prev_state, curr_state, next_state; + enum sync_action prev_action, curr_action, next_action; + + int check_degraded; /* flag set by mon, read by manage */ + int check_reshape; /* flag set by mon, read by manage */ +}; + +/* + * Metadata updates are handled by the monitor thread, + * as it has exclusive access to the metadata. + * When the manager want to updates metadata, either + * for it's own reason (e.g. committing a spare) or + * on behalf of mdadm, it creates a metadata_update + * structure and queues it to the monitor. + * Updates are created and processed by code under the + * superswitch. All common code sees them as opaque + * blobs. + */ +extern struct metadata_update *update_queue, *update_queue_handled; + +#define MD_MAJOR 9 + +extern struct active_array *container; +extern struct active_array *discard_this; +extern struct active_array *pending_discard; +extern struct md_generic_cmd *active_cmd; + +void remove_pidfile(char *devname); +void do_monitor(struct supertype *container); +void do_manager(struct supertype *container); +extern int sigterm; + +int read_dev_state(int fd); +int is_container_member(struct mdstat_ent *mdstat, char *container); + +struct mdstat_ent *mdstat_read(int hold, int start); + +extern int exit_now, manager_ready; +extern int mon_tid, mgr_tid; +extern int monitor_loop_cnt; + +/* helper routine to determine resync completion since MaxSector is a + * moving target + */ +static inline int is_resync_complete(struct mdinfo *array) +{ + unsigned long long sync_size = 0; + int ncopies, l; + switch(array->array.level) { + case 1: + case 4: + case 5: + case 6: + sync_size = array->component_size; + break; + case 10: + l = array->array.layout; + ncopies = (l & 0xff) * ((l >> 8) && 0xff); + sync_size = array->component_size * array->array.raid_disks; + sync_size /= ncopies; + break; + } + return array->resync_start >= sync_size; +} diff --git a/mdopen.c b/mdopen.c new file mode 100644 index 00000000..28410f46 --- /dev/null +++ b/mdopen.c @@ -0,0 +1,468 @@ +/* + * mdadm - manage Linux "md" devices aka RAID arrays. + * + * Copyright (C) 2001-2013 Neil Brown <neilb@suse.de> + * + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Author: Neil Brown + * Email: <neilb@suse.de> + */ + +#include "mdadm.h" +#include "md_p.h" +#include <ctype.h> + +void make_parts(char *dev, int cnt) +{ + /* make 'cnt' partition devices for 'dev' + * If dev is a device name we use the + * major/minor from dev and add 1..cnt + * If it is a symlink, we make similar symlinks. + * If dev ends with a digit, we add "p%d" else "%d" + * If the name exists, we use it's owner/mode, + * else that of dev + */ + struct stat stb; + int major_num; + int minor_num; + int odig; + int i; + int nlen = strlen(dev) + 20; + char *name; + int dig = isdigit(dev[strlen(dev)-1]); + char orig[1024]; + char sym[1024]; + int err; + + if (cnt == 0) + cnt = 4; + if (lstat(dev, &stb)!= 0) + return; + + if (S_ISBLK(stb.st_mode)) { + major_num = major(stb.st_rdev); + minor_num = minor(stb.st_rdev); + odig = -1; + } else if (S_ISLNK(stb.st_mode)) { + int len = readlink(dev, orig, sizeof(orig)); + if (len < 0 || len > 1000) + return; + orig[len] = 0; + odig = isdigit(orig[len-1]); + major_num = -1; + minor_num = -1; + } else + return; + name = xmalloc(nlen); + for (i = 1; i <= cnt ; i++) { + struct stat stb2; + snprintf(name, nlen, "%s%s%d", dev, dig?"p":"", i); + if (stat(name, &stb2) == 0) { + if (!S_ISBLK(stb2.st_mode) || !S_ISBLK(stb.st_mode)) + continue; + if (stb2.st_rdev == makedev(major_num, minor_num+i)) + continue; + unlink(name); + } else { + stb2 = stb; + } + if (S_ISBLK(stb.st_mode)) { + if (mknod(name, S_IFBLK | 0600, + makedev(major_num, minor_num+i))) + perror("mknod"); + if (chown(name, stb2.st_uid, stb2.st_gid)) + perror("chown"); + if (chmod(name, stb2.st_mode & 07777)) + perror("chmod"); + err = 0; + } else { + snprintf(sym, sizeof(sym), "%s%s%d", orig, odig?"p":"", i); + err = symlink(sym, name); + } + + if (err == 0 && stat(name, &stb2) == 0) + add_dev(name, &stb2, 0, NULL); + } + free(name); +} + +/* + * We need a new md device to assemble/build/create an array. + * 'dev' is a name given us by the user (command line or mdadm.conf) + * It might start with /dev or /dev/md any might end with a digit + * string. + * If it starts with just /dev, it must be /dev/mdX or /dev/md_dX + * If it ends with a digit string, then it must be as above, or + * 'trustworthy' must be 'METADATA' and the 'dev' must be + * /dev/md/'name'NN or 'name'NN + * If it doesn't end with a digit string, it must be /dev/md/'name' + * or 'name' or must be NULL. + * If the digit string is present, it gives the minor number to use + * If not, we choose a high, unused minor number. + * If the 'dev' is a standard name, it devices whether 'md' or 'mdp'. + * else if the name is 'd[0-9]+' then we use mdp + * else if trustworthy is 'METADATA' we use md + * else the choice depends on 'autof'. + * If name is NULL it is assumed to match whatever dev provides. + * If both name and dev are NULL, we choose a name 'mdXX' or 'mdpXX' + * + * If 'name' is given, and 'trustworthy' is 'foreign' and name is not + * supported by 'dev', we add a "_%d" suffix based on the minor number + * use that. + * + * If udev is configured, we create a temporary device, open it, and + * unlink it. + * If not, we create the /dev/mdXX device, and if name is usable, + * /dev/md/name + * In any case we return /dev/md/name or (if that isn't available) + * /dev/mdXX in 'chosen'. + * + * When we create devices, we use uid/gid/umask from config file. + */ + +int create_mddev(char *dev, char *name, int autof, int trustworthy, + char *chosen) +{ + int mdfd; + struct stat stb; + int num = -1; + int use_mdp = -1; + struct createinfo *ci = conf_get_create_info(); + int parts; + char *cname; + char devname[20]; + char devnm[32]; + char cbuf[400]; + if (chosen == NULL) + chosen = cbuf; + + if (autof == 0) + autof = ci->autof; + + parts = autof >> 3; + autof &= 7; + + strcpy(chosen, "/dev/md/"); + cname = chosen + strlen(chosen); + + if (dev) { + if (strncmp(dev, "/dev/md/", 8) == 0) { + strcpy(cname, dev+8); + } else if (strncmp(dev, "/dev/", 5) == 0) { + char *e = dev + strlen(dev); + while (e > dev && isdigit(e[-1])) + e--; + if (e[0]) + num = strtoul(e, NULL, 10); + strcpy(cname, dev+5); + cname[e-(dev+5)] = 0; + /* name *must* be mdXX or md_dXX in this context */ + if (num < 0 || + (strcmp(cname, "md") != 0 && strcmp(cname, "md_d") != 0)) { + pr_err("%s is an invalid name for an md device. Try /dev/md/%s\n", + dev, dev+5); + return -1; + } + if (strcmp(cname, "md") == 0) + use_mdp = 0; + else + use_mdp = 1; + /* recreate name: /dev/md/0 or /dev/md/d0 */ + sprintf(cname, "%s%d", use_mdp?"d":"", num); + } else + strcpy(cname, dev); + + /* 'cname' must not contain a slash, and may not be + * empty. + */ + if (strchr(cname, '/') != NULL) { + pr_err("%s is an invalid name for an md device.\n", dev); + return -1; + } + if (cname[0] == 0) { + pr_err("%s is an invalid name for an md device (empty!).", dev); + return -1; + } + if (num < 0) { + /* If cname is 'N' or 'dN', we get dev number + * from there. + */ + char *sp = cname; + char *ep; + if (cname[0] == 'd') + sp++; + if (isdigit(sp[0])) + num = strtoul(sp, &ep, 10); + else + ep = sp; + if (ep == sp || *ep || num < 0) + num = -1; + else if (cname[0] == 'd') + use_mdp = 1; + else + use_mdp = 0; + } + } + + /* Now determine device number */ + /* named 'METADATA' cannot use 'mdp'. */ + if (name && name[0] == 0) + name = NULL; + if (name && trustworthy == METADATA && use_mdp == 1) { + pr_err("%s is not allowed for a %s container. Consider /dev/md%d.\n", dev, name, num); + return -1; + } + if (name && trustworthy == METADATA) + use_mdp = 0; + if (use_mdp == -1) { + if (autof == 4 || autof == 6) + use_mdp = 1; + else + use_mdp = 0; + } + if (num < 0 && trustworthy == LOCAL && name) { + /* if name is numeric, possibly prefixed by + * 'md' or '/dev/md', use that for num + * if it is not already in use */ + char *ep; + char *n2 = name; + if (strncmp(n2, "/dev/", 5) == 0) + n2 += 5; + if (strncmp(n2, "md", 2) == 0) + n2 += 2; + if (*n2 == '/') + n2++; + num = strtoul(n2, &ep, 10); + if (ep == n2 || *ep) + num = -1; + else { + sprintf(devnm, "md%s%d", use_mdp ? "_d":"", num); + if (mddev_busy(devnm)) + num = -1; + } + } + + if (cname[0] == 0 && name) { + /* Need to find a name if we can + * We don't completely trust 'name'. Truncate to + * reasonable length and remove '/' + */ + char *cp; + struct map_ent *map = NULL; + int conflict = 1; + int unum = 0; + int cnlen; + strncpy(cname, name, 200); + cname[200] = 0; + for (cp = cname; *cp ; cp++) + switch (*cp) { + case '/': + *cp = '-'; + break; + case ' ': + case '\t': + *cp = '_'; + break; + } + + if (trustworthy == LOCAL || + (trustworthy == FOREIGN && strchr(cname, ':') != NULL)) { + /* Only need suffix if there is a conflict */ + if (map_by_name(&map, cname) == NULL) + conflict = 0; + } + cnlen = strlen(cname); + while (conflict) { + if (trustworthy == METADATA && !isdigit(cname[cnlen-1])) + sprintf(cname+cnlen, "%d", unum); + else + /* add _%d to FOREIGN array that don't + * a 'host:' prefix + */ + sprintf(cname+cnlen, "_%d", unum); + unum++; + if (map_by_name(&map, cname) == NULL) + conflict = 0; + } + } + + devnm[0] = 0; + if (num < 0 && cname && ci->names) { + int fd; + int n = -1; + sprintf(devnm, "md_%s", cname); + fd = open("/sys/module/md_mod/parameters/new_array", O_WRONLY); + if (fd >= 0) { + n = write(fd, devnm, strlen(devnm)); + close(fd); + } + if (n < 0) + devnm[0] = 0; + } + if (devnm[0]) + ; + else if (num < 0) { + /* need to choose a free number. */ + char *_devnm = find_free_devnm(use_mdp); + if (devnm == NULL) { + pr_err("No avail md devices - aborting\n"); + return -1; + } + strcpy(devnm, _devnm); + } else { + sprintf(devnm, "%s%d", use_mdp?"md_d":"md", num); + if (mddev_busy(devnm)) { + pr_err("%s is already in use.\n", + dev); + return -1; + } + } + + sprintf(devname, "/dev/%s", devnm); + + if (dev && dev[0] == '/') + strcpy(chosen, dev); + else if (cname[0] == 0) + strcpy(chosen, devname); + + /* We have a device number and name. + * If we cannot detect udev, we need to make + * devices and links ourselves. + */ + if (!use_udev()) { + /* Make sure 'devname' exists and 'chosen' is a symlink to it */ + if (lstat(devname, &stb) == 0) { + /* Must be the correct device, else error */ + if ((stb.st_mode&S_IFMT) != S_IFBLK || + stb.st_rdev != (dev_t)devnm2devid(devnm)) { + pr_err("%s exists but looks wrong, please fix\n", + devname); + return -1; + } + } else { + if (mknod(devname, S_IFBLK|0600, + devnm2devid(devnm)) != 0) { + pr_err("failed to create %s\n", + devname); + return -1; + } + if (chown(devname, ci->uid, ci->gid)) + perror("chown"); + if (chmod(devname, ci->mode)) + perror("chmod"); + stat(devname, &stb); + add_dev(devname, &stb, 0, NULL); + } + if (use_mdp == 1) + make_parts(devname, parts); + + if (strcmp(chosen, devname) != 0) { + if (mkdir("/dev/md",0700) == 0) { + if (chown("/dev/md", ci->uid, ci->gid)) + perror("chown /dev/md"); + if (chmod("/dev/md", ci->mode| ((ci->mode>>2) & 0111))) + perror("chmod /dev/md"); + } + + if (dev && strcmp(chosen, dev) == 0) + /* We know we are allowed to use this name */ + unlink(chosen); + + if (lstat(chosen, &stb) == 0) { + char buf[300]; + ssize_t link_len = readlink(chosen, buf, sizeof(buf)-1); + if (link_len >= 0) + buf[link_len] = '\0'; + + if ((stb.st_mode & S_IFMT) != S_IFLNK || + link_len < 0 || + strcmp(buf, devname) != 0) { + pr_err("%s exists - ignoring\n", + chosen); + strcpy(chosen, devname); + } + } else if (symlink(devname, chosen) != 0) + pr_err("failed to create %s: %s\n", + chosen, strerror(errno)); + if (use_mdp && strcmp(chosen, devname) != 0) + make_parts(chosen, parts); + } + } + mdfd = open_dev_excl(devnm); + if (mdfd < 0) + pr_err("unexpected failure opening %s\n", + devname); + return mdfd; +} + +/* Open this and check that it is an md device. + * On success, return filedescriptor. + * On failure, return -1 if it doesn't exist, + * or -2 if it exists but is not an md device. + */ +int open_mddev(char *dev, int report_errors) +{ + int mdfd = open(dev, O_RDWR); + if (mdfd < 0 && errno == EACCES) + mdfd = open(dev, O_RDONLY); + if (mdfd < 0) { + if (report_errors) + pr_err("error opening %s: %s\n", + dev, strerror(errno)); + return -1; + } + if (md_get_version(mdfd) <= 0) { + close(mdfd); + if (report_errors) + pr_err("%s does not appear to be an md device\n", dev); + return -2; + } + return mdfd; +} + +char *find_free_devnm(int use_partitions) +{ + static char devnm[32]; + int devnum; + for (devnum = 127; devnum != 128; + devnum = devnum ? devnum-1 : (1<<20)-1) { + + if (use_partitions) + sprintf(devnm, "md_d%d", devnum); + else + sprintf(devnm, "md%d", devnum); + if (mddev_busy(devnm)) + continue; + if (!conf_name_is_free(devnm)) + continue; + if (!use_udev()) { + /* make sure it is new to /dev too, at least as a + * non-standard */ + int devid = devnm2devid(devnm); + if (devid) { + char *dn = map_dev(major(devid), + minor(devid), 0); + if (dn && ! is_standard(dn, NULL)) + continue; + } + } + break; + } + if (devnum == 128) + return NULL; + return devnm; +} diff --git a/mdstat.c b/mdstat.c new file mode 100644 index 00000000..2972cdf6 --- /dev/null +++ b/mdstat.c @@ -0,0 +1,414 @@ +/* + * mdstat - parse /proc/mdstat file. Part of: + * mdadm - manage Linux "md" devices aka RAID arrays. + * + * Copyright (C) 2002-2009 Neil Brown <neilb@suse.de> + * + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Author: Neil Brown + * Email: <neilb@suse.de> + */ + +/* + * The /proc/mdstat file comes in at least 3 flavours: + * In an unpatched 2.2 kernel (md 0.36.6): + * Personalities : [n raidx] ... + * read_ahead {not set|%d sectors} + * md0 : {in}active{ raidX /dev/hda... %d blocks{ maxfault=%d}} + * md1 : ..... + * + * Normally only 4 md lines, but all are listed. + * + * In a patched 2.2 kernel (md 0.90.0) + * Personalities : [raidx] ... + * read_ahead {not set|%d sectors} + * mdN : {in}active {(readonly)} raidX dev[%d]{(F)} ... %d blocks STATUS RESYNC + * ... Only initialised arrays listed + * unused devices: {dev dev ... | <none>} + * + * STATUS is personality dependant: + * linear: %dk rounding + * raid0: %dk chunks + * raid1: [%d/%d] [U_U] ( raid/working. operational or not) + * raid5: level 4/5, %dk chunk, algorithm %d [%d/%d] [U_U] + * + * RESYNC is empty or: + * {resync|recovery}=%u%% finish=%u.%umin + * or + * resync=DELAYED + * + * In a 2.4 kernel (md 0.90.0/2.4) + * Personalities : [raidX] ... + * read_ahead {not set|%d sectors} + * mdN : {in}active {(read-only)} raidX dev[%d]{(F)} ... + * %d blocks STATUS + * RESYNC + * unused devices: {dev dev .. | <none>} + * + * STATUS matches 0.90.0/2.2 + * RESYNC includes [===>....], + * adds a space after {resync|recovery} and before and after '=' + * adds a decimal to the recovery percent. + * adds (%d/%d) resync amount and max_blocks, before finish. + * adds speed=%dK/sec after finish + * + * + * + * Out of this we want to extract: + * list of devices, active or not + * pattern of failed drives (so need number of drives) + * percent resync complete + * + * As continuation is indicated by leading space, we use + * conf_line from config.c to read logical lines + * + */ + +#include "mdadm.h" +#include "dlink.h" +#include <sys/select.h> +#include <ctype.h> + +static void free_member_devnames(struct dev_member *m) +{ + while(m) { + struct dev_member *t = m; + + m = m->next; + free(t->name); + free(t); + } +} + +static int add_member_devname(struct dev_member **m, char *name) +{ + struct dev_member *new; + char *t; + + if ((t = strchr(name, '[')) == NULL) + /* not a device */ + return 0; + + new = xmalloc(sizeof(*new)); + new->name = strndup(name, t - name); + new->next = *m; + *m = new; + return 1; +} + +void free_mdstat(struct mdstat_ent *ms) +{ + while (ms) { + struct mdstat_ent *t; + free(ms->level); + free(ms->pattern); + free(ms->metadata_version); + free_member_devnames(ms->members); + t = ms; + ms = ms->next; + free(t); + } +} + +static int mdstat_fd = -1; +struct mdstat_ent *mdstat_read(int hold, int start) +{ + FILE *f; + struct mdstat_ent *all, *rv, **end, **insert_here; + char *line; + int fd; + + if (hold && mdstat_fd != -1) { + lseek(mdstat_fd, 0L, 0); + fd = dup(mdstat_fd); + if (fd >= 0) + f = fdopen(fd, "r"); + else + return NULL; + } else + f = fopen("/proc/mdstat", "r"); + if (f == NULL) + return NULL; + else + fcntl(fileno(f), F_SETFD, FD_CLOEXEC); + + all = NULL; + end = &all; + for (; (line = conf_line(f)) ; free_line(line)) { + struct mdstat_ent *ent; + char *w; + char devnm[32]; + int in_devs = 0; + + if (strcmp(line, "Personalities")==0) + continue; + if (strcmp(line, "read_ahead")==0) + continue; + if (strcmp(line, "unused")==0) + continue; + insert_here = NULL; + /* Better be an md line.. */ + if (strncmp(line, "md", 2)!= 0 || strlen(line) >= 32 + || (line[2] != '_' && !isdigit(line[2]))) + continue; + strcpy(devnm, line); + + ent = xmalloc(sizeof(*ent)); + ent->level = ent->pattern= NULL; + ent->next = NULL; + ent->percent = RESYNC_NONE; + ent->active = -1; + ent->resync = 0; + ent->metadata_version = NULL; + ent->raid_disks = 0; + ent->devcnt = 0; + ent->members = NULL; + + strcpy(ent->devnm, devnm); + + for (w=dl_next(line); w!= line ; w=dl_next(w)) { + int l = strlen(w); + char *eq; + if (strcmp(w, "active")==0) + ent->active = 1; + else if (strcmp(w, "inactive")==0) { + ent->active = 0; + in_devs = 1; + } else if (ent->active > 0 && + ent->level == NULL && + w[0] != '(' /*readonly*/) { + ent->level = xstrdup(w); + in_devs = 1; + } else if (in_devs && strcmp(w, "blocks")==0) + in_devs = 0; + else if (in_devs) { + char *ep = strchr(w, '['); + ent->devcnt += + add_member_devname(&ent->members, w); + if (ep && strncmp(w, "md", 2)==0) { + /* This has an md device as a component. + * If that device is already in the + * list, make sure we insert before + * there. + */ + struct mdstat_ent **ih; + ih = &all; + while (ih != insert_here && *ih && + ((int)strlen((*ih)->devnm) != ep-w + || strncmp((*ih)->devnm, w, ep-w) != 0)) + ih = & (*ih)->next; + insert_here = ih; + } + } else if (strcmp(w, "super") == 0 && + dl_next(w) != line) { + w = dl_next(w); + ent->metadata_version = xstrdup(w); + } else if (w[0] == '[' && isdigit(w[1])) { + ent->raid_disks = atoi(w+1); + } else if (!ent->pattern && + w[0] == '[' && + (w[1] == 'U' || w[1] == '_')) { + ent->pattern = xstrdup(w+1); + if (ent->pattern[l-2]==']') + ent->pattern[l-2] = '\0'; + } else if (ent->percent == RESYNC_NONE && + strncmp(w, "re", 2)== 0 && + w[l-1] == '%' && + (eq=strchr(w, '=')) != NULL ) { + ent->percent = atoi(eq+1); + if (strncmp(w,"resync", 6)==0) + ent->resync = 1; + else if (strncmp(w, "reshape", 7)==0) + ent->resync = 2; + else + ent->resync = 0; + } else if (ent->percent == RESYNC_NONE && + (w[0] == 'r' || w[0] == 'c')) { + if (strncmp(w, "resync", 4)==0) + ent->resync = 1; + if (strncmp(w, "reshape", 7)==0) + ent->resync = 2; + if (strncmp(w, "recovery", 8)==0) + ent->resync = 0; + if (strncmp(w, "check", 5)==0) + ent->resync = 3; + + if (l > 8 && strcmp(w+l-8, "=DELAYED") == 0) + ent->percent = RESYNC_DELAYED; + if (l > 8 && strcmp(w+l-8, "=PENDING") == 0) + ent->percent = RESYNC_PENDING; + } else if (ent->percent == RESYNC_NONE && + w[0] >= '0' && + w[0] <= '9' && + w[l-1] == '%') { + ent->percent = atoi(w); + } + } + if (insert_here && (*insert_here)) { + ent->next = *insert_here; + *insert_here = ent; + } else { + *end = ent; + end = &ent->next; + } + } + if (hold && mdstat_fd == -1) { + mdstat_fd = dup(fileno(f)); + fcntl(mdstat_fd, F_SETFD, FD_CLOEXEC); + } + fclose(f); + + /* If we might want to start array, + * reverse the order, so that components comes before composites + */ + if (start) { + rv = NULL; + while (all) { + struct mdstat_ent *e = all; + all = all->next; + e->next = rv; + rv = e; + } + } else rv = all; + return rv; +} + +void mdstat_close(void) +{ + if (mdstat_fd >= 0) + close(mdstat_fd); + mdstat_fd = -1; +} + +void mdstat_wait(int seconds) +{ + fd_set fds; + struct timeval tm; + int maxfd = 0; + FD_ZERO(&fds); + if (mdstat_fd >= 0) { + FD_SET(mdstat_fd, &fds); + maxfd = mdstat_fd; + } + tm.tv_sec = seconds; + tm.tv_usec = 0; + select(maxfd + 1, NULL, NULL, &fds, &tm); +} + +void mdstat_wait_fd(int fd, const sigset_t *sigmask) +{ + fd_set fds, rfds; + int maxfd = 0; + + FD_ZERO(&fds); + FD_ZERO(&rfds); + if (mdstat_fd >= 0) + FD_SET(mdstat_fd, &fds); + + if (fd >= 0) { + struct stat stb; + fstat(fd, &stb); + if ((stb.st_mode & S_IFMT) == S_IFREG) + /* Must be a /proc or /sys fd, so expect + * POLLPRI + * i.e. an 'exceptional' event. + */ + FD_SET(fd, &fds); + else + FD_SET(fd, &rfds); + + if (fd > maxfd) + maxfd = fd; + + } + if (mdstat_fd > maxfd) + maxfd = mdstat_fd; + + pselect(maxfd + 1, &rfds, NULL, &fds, + NULL, sigmask); +} + +int mddev_busy(char *devnm) +{ + struct mdstat_ent *mdstat = mdstat_read(0, 0); + struct mdstat_ent *me; + + for (me = mdstat ; me ; me = me->next) + if (strcmp(me->devnm, devnm) == 0) + break; + free_mdstat(mdstat); + return me != NULL; +} + +struct mdstat_ent *mdstat_by_component(char *name) +{ + struct mdstat_ent *mdstat = mdstat_read(0, 0); + + while (mdstat) { + struct dev_member *m; + struct mdstat_ent *ent; + if (mdstat->metadata_version && + strncmp(mdstat->metadata_version, "external:", 9) == 0 && + is_subarray(mdstat->metadata_version+9)) + /* don't return subarrays, only containers */ + ; + else for (m = mdstat->members; m; m = m->next) { + if (strcmp(m->name, name) == 0) { + free_mdstat(mdstat->next); + mdstat->next = NULL; + return mdstat; + } + } + ent = mdstat; + mdstat = mdstat->next; + ent->next = NULL; + free_mdstat(ent); + } + return NULL; +} + +struct mdstat_ent *mdstat_by_subdev(char *subdev, char *container) +{ + struct mdstat_ent *mdstat = mdstat_read(0, 0); + struct mdstat_ent *ent = NULL; + + while (mdstat) { + /* metadata version must match: + * external:[/-]%s/%s + * where first %s is 'container' and second %s is 'subdev' + */ + if (ent) + free_mdstat(ent); + ent = mdstat; + mdstat = mdstat->next; + ent->next = NULL; + + if (ent->metadata_version == NULL || + strncmp(ent->metadata_version, "external:", 9) != 0) + continue; + + if (!metadata_container_matches(ent->metadata_version+9, + container) || + !metadata_subdev_matches(ent->metadata_version+9, + subdev)) + continue; + + free_mdstat(mdstat); + return ent; + } + return NULL; +} diff --git a/misc/mdcheck b/misc/mdcheck new file mode 100644 index 00000000..2c8f54d6 --- /dev/null +++ b/misc/mdcheck @@ -0,0 +1,159 @@ +#!/bin/bash + +# Copyright (C) 2014 Neil Brown <neilb@suse.de> +# +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# Author: Neil Brown +# Email: <neilb@suse.de> + +# This script should be run periodically to automatically +# perform a 'check' on any md arrays. +# +# It supports a 'time budget' such that any incomplete 'check' +# will be checkpointed when that time has expired. +# A subsequent invocation can allow the 'check' to continue. +# +# Options are: +# --continue Don't start new checks, only continue old ones. +# --duration This is passed to "date --date=$duration" to find out +# when to finish +# +# To support '--continue', arrays are identified by UUID and the 'sync_completed' +# value is stored in /var/lib/mdcheck/$UUID + +# convert a /dev/md name into /sys/.../md equivalent +sysname() { + set `ls -lLd $1` + maj=${5%,} + min=$6 + readlink -f /sys/dev/block/$maj:$min +} + +args=$(getopt -o hcd: -l help,continue,duration: -n mdcheck -- "$@") +rv=$? +if [ $rv -ne 0 ]; then exit $rv; fi + +eval set -- $args + +cont= +endtime= +while [ " $1" != " --" ] +do + case $1 in + --help ) + echo >&2 'Usage: mdcheck [--continue] [--duration time-offset]' + echo >&2 ' time-offset must be understood by "date --date"' + exit 0 + ;; + --continue ) cont=yes ;; + --duration ) shift; dur=$1 + endtime=$(date --date "$dur" "+%s") + ;; + esac + shift +done +shift + +# We need a temp file occasionally... +tmp=/var/lib/mdcheck/.md-check-$$ +trap 'rm -f "$tmp"' 0 + + +# firstly, clean out really old state files +mkdir -p /var/lib/mdcheck +find /var/lib/mdcheck -name "MD_UUID*" -type f -mtime +180 -exec rm {} \; + +# Now look at each md device. +cnt=0 +for dev in /dev/md?* +do + [ -e "$dev" ] || continue + sys=`sysname $dev` + if [ ! -f "$sys/md/sync_action" ] + then # cannot check this array + continue + fi + if [ "`cat $sys/md/sync_action`" != 'idle' ] + then # This array is busy + continue + fi + + mdadm --detail --export "$dev" | grep '^MD_UUID=' > $tmp || continue + source $tmp + fl="/var/lib/mdcheck/MD_UUID_$MD_UUID" + if [ -z "$cont" ] + then + start=0 + elif [ -z "$MD_UUID" -o ! -f "$fl" ] + then + # Nothing to continue here + continue + else + start=`cat "$fl"` + fi + + cnt=$[cnt+1] + eval MD_${cnt}_fl=\$fl + eval MD_${cnt}_sys=\$sys + echo $start > $fl + echo $start > $sys/md/sync_min + echo check > $sys/md/sync_action +done + +if [ -z "$endtime" ] +then + exit 0 +fi + +while [ `date +%s` -lt $endtime ] +do + any= + for i in `eval echo {1..$cnt}` + do + eval fl=\$MD_${i}_fl + eval sys=\$MD_${i}_sys + + if [ -z "$fl" ]; then continue; fi + + if [ "`cat $sys/md/sync_action`" != 'check' ] + then + eval MD_${i}_fl= + rm -f $fl + continue; + fi + read a rest < $sys/md/sync_completed + echo $a > $fl + any=yes + done + if [ -z "$any" ]; then exit 0; fi + sleep 120 +done + +# We've waited, and there are still checks running. +# Time to stop them. +for i in `eval echo {1..$cnt}` +do + eval fl=\$MD_${i}_fl + eval sys=\$MD_${i}_sys + + if [ -z "$fl" ]; then continue; fi + + if [ "`cat $sys/md/sync_action`" != 'check' ] + then + eval MD_${i}_fl= + rm -f $fl + continue; + fi + echo idle > $sys/md/sync_action + cat $sys/md/sync_min > $fl +done diff --git a/misc/syslog-events b/misc/syslog-events new file mode 100644 index 00000000..fe8c14e4 --- /dev/null +++ b/misc/syslog-events @@ -0,0 +1,27 @@ +#!/bin/sh +# +# sample event handling script for mdadm +# e.g. mdadm --follow --program=/sbin/syslog-events --scan +# +# License: GPL ver.2 +# Copyright (C) 2004 SEKINE Tatsuo <tsekine@sdri.co.jp> + +event="$1" +dev="$2" +disc="$3" + +facility="kern" +tag="mdmonitor" + +case x"${event}" in + xFail*) priority="error" ;; + xTest*) priority="debug" ;; + x*) priority="info" ;; +esac + +msg="${event} event on ${dev}" +if [ x"${disc}" != x ]; then + msg="${msg}, related to disc ${disc}" +fi + +exec logger -t "${tag}" -p "${facility}.${priority}" -- "${msg}" diff --git a/mkinitramfs b/mkinitramfs new file mode 100644 index 00000000..c6275ddb --- /dev/null +++ b/mkinitramfs @@ -0,0 +1,55 @@ +#!/bin/sh + +# make sure we are being run in the right directory... +if [ -f mkinitramfs ] +then : +else + echo >&2 mkinitramfs must be run from the mdadm source directory. + exit 1 +fi +if [ -f /bin/busybox ] +then : good, it exists + case `file /bin/busybox` in + *statically* ) : good ;; + * ) echo >&2 mkinitramfs: /bin/busybox is not statically linked: cannot proceed. + exit 1 + esac +else + echo >&2 "mkinitramfs: /bin/busybox doesn't exist - please install it statically linked." + exit 1 +fi + +rm -rf initramfs +mkdir initramfs +mkdir initramfs/bin +make mdadm.static +cp mdadm.static initramfs/bin/mdadm +cp /bin/busybox initramfs/bin/busybox +ln initramfs/bin/busybox initramfs/bin/sh +cat <<- END > initramfs/init + #!/bin/sh + + echo 'Auto-assembling boot md array' + mkdir /proc + mount -t proc proc /proc + if [ -n "$rootuuid" ] + then arg=--uuid=$rootuuid + elif [ -n "$mdminor" ] + then arg=--super-minor=$mdminor + else arg=--super-minor=0 + fi + echo "Using $arg" + mdadm -Acpartitions $arg --auto=part /dev/mda + cd / + mount /dev/mda1 /root || mount /dev/mda /root + umount /proc + cd /root + exec chroot . /sbin/init < /dev/console > /dev/console 2>&1 +END +chmod +x initramfs/init + +(cd initramfs + find init bin | cpio -o -H newc | gzip --best +) > init.cpio.gz +rm -rf initramfs +ls -l init.cpio.gz diff --git a/monitor.c b/monitor.c new file mode 100644 index 00000000..870cc1a7 --- /dev/null +++ b/monitor.c @@ -0,0 +1,712 @@ +/* + * mdmon - monitor external metadata arrays + * + * Copyright (C) 2007-2009 Neil Brown <neilb@suse.de> + * Copyright (C) 2007-2009 Intel Corporation + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. + */ + +#include "mdadm.h" +#include "mdmon.h" +#include <sys/syscall.h> +#include <sys/select.h> +#include <signal.h> + +static char *array_states[] = { + "clear", "inactive", "suspended", "readonly", "read-auto", + "clean", "active", "write-pending", "active-idle", NULL }; +static char *sync_actions[] = { + "idle", "reshape", "resync", "recover", "check", "repair", NULL +}; + +static int write_attr(char *attr, int fd) +{ + return write(fd, attr, strlen(attr)); +} + +static void add_fd(fd_set *fds, int *maxfd, int fd) +{ + struct stat st; + if (fd < 0) + return; + if (fstat(fd, &st) == -1) { + dprintf("Invalid fd %d\n", fd); + return; + } + if (st.st_nlink == 0) { + dprintf("fd %d was deleted\n", fd); + return; + } + if (fd > *maxfd) + *maxfd = fd; + FD_SET(fd, fds); +} + +static int read_attr(char *buf, int len, int fd) +{ + int n; + + if (fd < 0) { + buf[0] = 0; + return 0; + } + lseek(fd, 0, 0); + n = read(fd, buf, len - 1); + + if (n <= 0) { + buf[0] = 0; + return 0; + } + buf[n] = 0; + if (buf[n-1] == '\n') + buf[n-1] = 0; + return n; +} + +static void read_resync_start(int fd, unsigned long long *v) +{ + char buf[30]; + int n; + + n = read_attr(buf, 30, fd); + if (n <= 0) { + dprintf("Failed to read resync_start (%d)\n", fd); + return; + } + if (strncmp(buf, "none", 4) == 0) + *v = MaxSector; + else + *v = strtoull(buf, NULL, 10); +} + +static unsigned long long read_sync_completed(int fd) +{ + unsigned long long val; + char buf[50]; + int n; + char *ep; + + n = read_attr(buf, 50, fd); + + if (n <= 0) + return 0; + buf[n] = 0; + val = strtoull(buf, &ep, 0); + if (ep == buf || (*ep != 0 && *ep != '\n' && *ep != ' ')) + return 0; + return val; +} + +static enum array_state read_state(int fd) +{ + char buf[20]; + int n = read_attr(buf, 20, fd); + + if (n <= 0) + return bad_word; + return (enum array_state) sysfs_match_word(buf, array_states); +} + +static enum sync_action read_action( int fd) +{ + char buf[20]; + int n = read_attr(buf, 20, fd); + + if (n <= 0) + return bad_action; + return (enum sync_action) sysfs_match_word(buf, sync_actions); +} + +int read_dev_state(int fd) +{ + char buf[60]; + int n = read_attr(buf, 60, fd); + char *cp; + int rv = 0; + + if (n <= 0) + return 0; + + cp = buf; + while (cp) { + if (sysfs_attr_match(cp, "faulty")) + rv |= DS_FAULTY; + if (sysfs_attr_match(cp, "in_sync")) + rv |= DS_INSYNC; + if (sysfs_attr_match(cp, "write_mostly")) + rv |= DS_WRITE_MOSTLY; + if (sysfs_attr_match(cp, "spare")) + rv |= DS_SPARE; + if (sysfs_attr_match(cp, "blocked")) + rv |= DS_BLOCKED; + cp = strchr(cp, ','); + if (cp) + cp++; + } + return rv; +} + +static void signal_manager(void) +{ + /* tgkill(getpid(), mon_tid, SIGUSR1); */ + int pid = getpid(); + syscall(SYS_tgkill, pid, mgr_tid, SIGUSR1); +} + +/* Monitor a set of active md arrays - all of which share the + * same metadata - and respond to events that require + * metadata update. + * + * New arrays are detected by another thread which allocates + * required memory and attaches the data structure to our list. + * + * Events: + * Array stops. + * This is detected by array_state going to 'clear' or 'inactive'. + * while we thought it was active. + * Response is to mark metadata as clean and 'clear' the array(??) + * write-pending + * array_state if 'write-pending' + * We mark metadata as 'dirty' then set array to 'active'. + * active_idle + * Either ignore, or mark clean, then mark metadata as clean. + * + * device fails + * detected by rd-N/state reporting "faulty" + * mark device as 'failed' in metadata, let the kernel release the + * device by writing '-blocked' to rd/state, and finally write 'remove' to + * rd/state. Before a disk can be replaced it must be failed and removed + * from all container members, this will be preemptive for the other + * arrays... safe? + * + * sync completes + * sync_action was 'resync' and becomes 'idle' and resync_start becomes + * MaxSector + * Notify metadata that sync is complete. + * + * recovery completes + * sync_action changes from 'recover' to 'idle' + * Check each device state and mark metadata if 'faulty' or 'in_sync'. + * + * deal with resync + * This only happens on finding a new array... mdadm will have set + * 'resync_start' to the correct value. If 'resync_start' indicates that an + * resync needs to occur set the array to the 'active' state rather than the + * initial read-auto state. + * + * + * + * We wait for a change (poll/select) on array_state, sync_action, and + * each rd-X/state file. + * When we get any change, we check everything. So read each state file, + * then decide what to do. + * + * The core action is to write new metadata to all devices in the array. + * This is done at most once on any wakeup. + * After that we might: + * - update the array_state + * - set the role of some devices. + * - request a sync_action + * + */ + +#define ARRAY_DIRTY 1 +#define ARRAY_BUSY 2 +static int read_and_act(struct active_array *a) +{ + unsigned long long sync_completed; + int check_degraded = 0; + int check_reshape = 0; + int deactivate = 0; + struct mdinfo *mdi; + int ret = 0; + int count = 0; + struct timeval tv; + + a->next_state = bad_word; + a->next_action = bad_action; + + a->curr_state = read_state(a->info.state_fd); + a->curr_action = read_action(a->action_fd); + if (a->curr_state != clear) + /* + * In "clear" state, resync_start may wrongly be set to "0" + * when the kernel called md_clean but didn't remove the + * sysfs attributes yet + */ + read_resync_start(a->resync_start_fd, &a->info.resync_start); + sync_completed = read_sync_completed(a->sync_completed_fd); + for (mdi = a->info.devs; mdi ; mdi = mdi->next) { + mdi->next_state = 0; + mdi->curr_state = 0; + if (mdi->state_fd >= 0) { + read_resync_start(mdi->recovery_fd, + &mdi->recovery_start); + mdi->curr_state = read_dev_state(mdi->state_fd); + } + } + + gettimeofday(&tv, NULL); + dprintf("(%d): %ld.%06ld state:%s prev:%s action:%s prev: %s start:%llu\n", + a->info.container_member, + tv.tv_sec, tv.tv_usec, + array_states[a->curr_state], + array_states[a->prev_state], + sync_actions[a->curr_action], + sync_actions[a->prev_action], + a->info.resync_start + ); + + if ((a->curr_state == bad_word || a->curr_state <= inactive) && + a->prev_state > inactive) { + /* array has been stopped */ + a->container->ss->set_array_state(a, 1); + a->next_state = clear; + deactivate = 1; + } + if (a->curr_state == write_pending) { + a->container->ss->set_array_state(a, 0); + a->next_state = active; + ret |= ARRAY_DIRTY; + } + if (a->curr_state == active_idle) { + /* Set array to 'clean' FIRST, then mark clean + * in the metadata + */ + a->next_state = clean; + ret |= ARRAY_DIRTY; + } + if (a->curr_state == clean) { + a->container->ss->set_array_state(a, 1); + } + if (a->curr_state == active || + a->curr_state == suspended) + ret |= ARRAY_DIRTY; + if (a->curr_state == readonly) { + /* Well, I'm ready to handle things. If readonly + * wasn't requested, transition to read-auto. + */ + char buf[64]; + read_attr(buf, sizeof(buf), a->metadata_fd); + if (strncmp(buf, "external:-", 10) == 0) { + /* explicit request for readonly array. Leave it alone */ + ; + } else { + if (a->container->ss->set_array_state(a, 2)) + a->next_state = read_auto; /* array is clean */ + else { + a->next_state = active; /* Now active for recovery etc */ + ret |= ARRAY_DIRTY; + } + } + } + + if (!deactivate && + a->curr_action == idle && + a->prev_action == resync) { + /* A resync has finished. The endpoint is recorded in + * 'sync_start'. We don't update the metadata + * until the array goes inactive or readonly though. + * Just check if we need to fiddle spares. + */ + a->container->ss->set_array_state(a, a->curr_state <= clean); + check_degraded = 1; + } + + if (!deactivate && + a->curr_action == idle && + a->prev_action == recover) { + /* A recovery has finished. Some disks may be in sync now, + * and the array may no longer be degraded + */ + for (mdi = a->info.devs ; mdi ; mdi = mdi->next) { + a->container->ss->set_disk(a, mdi->disk.raid_disk, + mdi->curr_state); + if (! (mdi->curr_state & DS_INSYNC)) + check_degraded = 1; + count++; + } + if (count != a->info.array.raid_disks) + check_degraded = 1; + } + + if (!deactivate && + a->curr_action == reshape && + a->prev_action != reshape) + /* reshape was requested by mdadm. Need to see if + * new devices have been added. Manager does that + * when it sees check_reshape + */ + check_reshape = 1; + + /* Check for failures and if found: + * 1/ Record the failure in the metadata and unblock the device. + * FIXME update the kernel to stop notifying on failed drives when + * the array is readonly and we have cleared 'blocked' + * 2/ Try to remove the device if the array is writable, or can be + * made writable. + */ + for (mdi = a->info.devs ; mdi ; mdi = mdi->next) { + if (mdi->curr_state & DS_FAULTY) { + a->container->ss->set_disk(a, mdi->disk.raid_disk, + mdi->curr_state); + check_degraded = 1; + if (mdi->curr_state & DS_BLOCKED) + mdi->next_state |= DS_UNBLOCK; + if (a->curr_state == read_auto) { + a->container->ss->set_array_state(a, 0); + a->next_state = active; + } + if (a->curr_state > readonly) + mdi->next_state |= DS_REMOVE; + } + } + + /* Check for recovery checkpoint notifications. We need to be a + * minimum distance away from the last checkpoint to prevent + * over checkpointing. Note reshape checkpointing is handled + * in the second branch. + */ + if (sync_completed > a->last_checkpoint && + sync_completed - a->last_checkpoint > a->info.component_size >> 4 && + a->curr_action > reshape) { + /* A (non-reshape) sync_action has reached a checkpoint. + * Record the updated position in the metadata + */ + a->last_checkpoint = sync_completed; + a->container->ss->set_array_state(a, a->curr_state <= clean); + } else if ((a->curr_action == idle && a->prev_action == reshape) || + (a->curr_action == reshape + && sync_completed > a->last_checkpoint) ) { + /* Reshape has progressed or completed so we need to + * update the array state - and possibly the array size + */ + if (sync_completed != 0) + a->last_checkpoint = sync_completed; + /* We might need to update last_checkpoint depending on + * the reason that reshape finished. + * if array reshape is really finished: + * set check point to the end, this allows + * set_array_state() to finalize reshape in metadata + * if reshape if broken: do not set checkpoint to the end + * this allows for reshape restart from checkpoint + */ + if ((a->curr_action != reshape) && + (a->prev_action == reshape)) { + char buf[40]; + if ((sysfs_get_str(&a->info, NULL, + "reshape_position", + buf, + sizeof(buf)) >= 0) && + strncmp(buf, "none", 4) == 0) + a->last_checkpoint = a->info.component_size; + } + a->container->ss->set_array_state(a, a->curr_state <= clean); + a->last_checkpoint = sync_completed; + } + + if (sync_completed > a->last_checkpoint) + a->last_checkpoint = sync_completed; + + a->container->ss->sync_metadata(a->container); + dprintf("(%d): state:%s action:%s next(", a->info.container_member, + array_states[a->curr_state], sync_actions[a->curr_action]); + + /* Effect state changes in the array */ + if (a->next_state != bad_word) { + dprintf_cont(" state:%s", array_states[a->next_state]); + write_attr(array_states[a->next_state], a->info.state_fd); + } + if (a->next_action != bad_action) { + write_attr(sync_actions[a->next_action], a->action_fd); + dprintf_cont(" action:%s", sync_actions[a->next_action]); + } + for (mdi = a->info.devs; mdi ; mdi = mdi->next) { + if (mdi->next_state & DS_UNBLOCK) { + dprintf_cont(" %d:-blocked", mdi->disk.raid_disk); + write_attr("-blocked", mdi->state_fd); + } + + if ((mdi->next_state & DS_REMOVE) && mdi->state_fd >= 0) { + int remove_result; + + /* The kernel may not be able to immediately remove the + * disk. In that case we wait a little while and + * try again. + */ + remove_result = write_attr("remove", mdi->state_fd); + if (remove_result > 0) { + dprintf_cont(" %d:removed", mdi->disk.raid_disk); + close(mdi->state_fd); + close(mdi->recovery_fd); + mdi->state_fd = -1; + } else + ret |= ARRAY_BUSY; + } + if (mdi->next_state & DS_INSYNC) { + write_attr("+in_sync", mdi->state_fd); + dprintf_cont(" %d:+in_sync", mdi->disk.raid_disk); + } + } + dprintf_cont(" )\n"); + + /* move curr_ to prev_ */ + a->prev_state = a->curr_state; + + a->prev_action = a->curr_action; + + for (mdi = a->info.devs; mdi ; mdi = mdi->next) { + mdi->prev_state = mdi->curr_state; + mdi->next_state = 0; + } + + if (check_degraded || check_reshape) { + /* manager will do the actual check */ + if (check_degraded) + a->check_degraded = 1; + if (check_reshape) + a->check_reshape = 1; + signal_manager(); + } + + if (deactivate) + a->container = NULL; + + return ret; +} + +static struct mdinfo * +find_device(struct active_array *a, int major, int minor) +{ + struct mdinfo *mdi; + + for (mdi = a->info.devs ; mdi ; mdi = mdi->next) + if (mdi->disk.major == major && mdi->disk.minor == minor) + return mdi; + + return NULL; +} + +static void reconcile_failed(struct active_array *aa, struct mdinfo *failed) +{ + struct active_array *a; + struct mdinfo *victim; + + for (a = aa; a; a = a->next) { + if (!a->container || a->to_remove) + continue; + victim = find_device(a, failed->disk.major, failed->disk.minor); + if (!victim) + continue; + + if (!(victim->curr_state & DS_FAULTY)) + write_attr("faulty", victim->state_fd); + } +} + +#ifdef DEBUG +static void dprint_wake_reasons(fd_set *fds) +{ + int i; + char proc_path[256]; + char link[256]; + char *basename; + int rv; + + fprintf(stderr, "monitor: wake ( "); + for (i = 0; i < FD_SETSIZE; i++) { + if (FD_ISSET(i, fds)) { + sprintf(proc_path, "/proc/%d/fd/%d", + (int) getpid(), i); + + rv = readlink(proc_path, link, sizeof(link) - 1); + if (rv < 0) { + fprintf(stderr, "%d:unknown ", i); + continue; + } + link[rv] = '\0'; + basename = strrchr(link, '/'); + fprintf(stderr, "%d:%s ", + i, basename ? ++basename : link); + } + } + fprintf(stderr, ")\n"); +} +#endif + +int monitor_loop_cnt; + +static int wait_and_act(struct supertype *container, int nowait) +{ + fd_set rfds; + int maxfd = 0; + struct active_array **aap = &container->arrays; + struct active_array *a, **ap; + int rv; + struct mdinfo *mdi; + static unsigned int dirty_arrays = ~0; /* start at some non-zero value */ + + FD_ZERO(&rfds); + + for (ap = aap ; *ap ;) { + a = *ap; + /* once an array has been deactivated we want to + * ask the manager to discard it. + */ + if (!a->container || a->to_remove) { + if (discard_this) { + ap = &(*ap)->next; + continue; + } + *ap = a->next; + a->next = NULL; + discard_this = a; + signal_manager(); + continue; + } + + add_fd(&rfds, &maxfd, a->info.state_fd); + add_fd(&rfds, &maxfd, a->action_fd); + add_fd(&rfds, &maxfd, a->sync_completed_fd); + for (mdi = a->info.devs ; mdi ; mdi = mdi->next) + add_fd(&rfds, &maxfd, mdi->state_fd); + + ap = &(*ap)->next; + } + + if (manager_ready && (*aap == NULL || (sigterm && !dirty_arrays))) { + /* No interesting arrays, or we have been told to + * terminate and everything is clean. Lets see about + * exiting. Note that blocking at this point is not a + * problem as there are no active arrays, there is + * nothing that we need to be ready to do. + */ + int fd; + if (sigterm) + fd = open_dev_excl(container->devnm); + else + fd = open_dev_flags(container->devnm, O_RDONLY|O_EXCL); + if (fd >= 0 || errno != EBUSY) { + /* OK, we are safe to leave */ + if (sigterm && !dirty_arrays) + dprintf("caught sigterm, all clean... exiting\n"); + else + dprintf("no arrays to monitor... exiting\n"); + if (!sigterm) + /* On SIGTERM, someone (the take-over mdmon) will + * clean up + */ + remove_pidfile(container->devnm); + exit_now = 1; + signal_manager(); + close(fd); + exit(0); + } + } + + if (!nowait) { + sigset_t set; + struct timespec ts; + ts.tv_sec = 24*3600; + ts.tv_nsec = 0; + if (*aap == NULL || container->retry_soon) { + /* just waiting to get O_EXCL access */ + ts.tv_sec = 0; + ts.tv_nsec = 20000000ULL; + } + sigprocmask(SIG_UNBLOCK, NULL, &set); + sigdelset(&set, SIGUSR1); + monitor_loop_cnt |= 1; + rv = pselect(maxfd+1, NULL, NULL, &rfds, &ts, &set); + monitor_loop_cnt += 1; + if (rv == -1) { + if (errno == EINTR) { + rv = 0; + dprintf("monitor: caught signal\n"); + } else + dprintf("monitor: error %d in pselect\n", + errno); + } + #ifdef DEBUG + else + dprint_wake_reasons(&rfds); + #endif + container->retry_soon = 0; + } + + if (update_queue) { + struct metadata_update *this; + + for (this = update_queue; this ; this = this->next) + container->ss->process_update(container, this); + + update_queue_handled = update_queue; + update_queue = NULL; + signal_manager(); + container->ss->sync_metadata(container); + } + + rv = 0; + dirty_arrays = 0; + for (a = *aap; a ; a = a->next) { + + if (a->replaces && !discard_this) { + struct active_array **ap; + for (ap = &a->next; *ap && *ap != a->replaces; + ap = & (*ap)->next) + ; + if (*ap) + *ap = (*ap)->next; + discard_this = a->replaces; + a->replaces = NULL; + /* FIXME check if device->state_fd need to be cleared?*/ + signal_manager(); + } + if (a->container && !a->to_remove) { + int ret = read_and_act(a); + rv |= 1; + dirty_arrays += !!(ret & ARRAY_DIRTY); + /* when terminating stop manipulating the array after it + * is clean, but make sure read_and_act() is given a + * chance to handle 'active_idle' + */ + if (sigterm && !(ret & ARRAY_DIRTY)) + a->container = NULL; /* stop touching this array */ + if (ret & ARRAY_BUSY) + container->retry_soon = 1; + } + } + + /* propagate failures across container members */ + for (a = *aap; a ; a = a->next) { + if (!a->container || a->to_remove) + continue; + for (mdi = a->info.devs ; mdi ; mdi = mdi->next) + if (mdi->curr_state & DS_FAULTY) + reconcile_failed(*aap, mdi); + } + + return rv; +} + +void do_monitor(struct supertype *container) +{ + int rv; + int first = 1; + do { + rv = wait_and_act(container, first); + first = 0; + } while (rv >= 0); +} @@ -0,0 +1,475 @@ +/* + * Copyright (C) 2008 Intel Corporation + * + * mdmon socket / message handling + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. + */ +#ifndef _GNU_SOURCE +#define _GNU_SOURCE +#endif +#include <unistd.h> +#include <stdio.h> +#include <string.h> +#include <stdlib.h> +#include <errno.h> +#include <fcntl.h> +#include <sys/types.h> +#include <sys/socket.h> +#include <sys/un.h> +#include "mdadm.h" +#include "mdmon.h" + +static const __u32 start_magic = 0x5a5aa5a5; +static const __u32 end_magic = 0xa5a55a5a; + +static int send_buf(int fd, const void* buf, int len, int tmo) +{ + fd_set set; + int rv; + struct timeval timeout = {tmo, 0}; + struct timeval *ptmo = tmo ? &timeout : NULL; + + while (len) { + FD_ZERO(&set); + FD_SET(fd, &set); + rv = select(fd+1, NULL, &set, NULL, ptmo); + if (rv <= 0) + return -1; + rv = write(fd, buf, len); + if (rv <= 0) + return -1; + len -= rv; + buf += rv; + } + return 0; +} + +static int recv_buf(int fd, void* buf, int len, int tmo) +{ + fd_set set; + int rv; + struct timeval timeout = {tmo, 0}; + struct timeval *ptmo = tmo ? &timeout : NULL; + + while (len) { + FD_ZERO(&set); + FD_SET(fd, &set); + rv = select(fd+1, &set, NULL, NULL, ptmo); + if (rv <= 0) + return -1; + rv = read(fd, buf, len); + if (rv <= 0) + return -1; + len -= rv; + buf += rv; + } + return 0; +} + +int send_message(int fd, struct metadata_update *msg, int tmo) +{ + __s32 len = msg->len; + int rv; + + rv = send_buf(fd, &start_magic, 4, tmo); + rv = rv ?: send_buf(fd, &len, 4, tmo); + if (len > 0) + rv = rv ?: send_buf(fd, msg->buf, msg->len, tmo); + rv = send_buf(fd, &end_magic, 4, tmo); + + return rv; +} + +int receive_message(int fd, struct metadata_update *msg, int tmo) +{ + __u32 magic; + __s32 len; + int rv; + + rv = recv_buf(fd, &magic, 4, tmo); + if (rv < 0 || magic != start_magic) + return -1; + rv = recv_buf(fd, &len, 4, tmo); + if (rv < 0 || len > MSG_MAX_LEN) + return -1; + if (len > 0) { + msg->buf = xmalloc(len); + rv = recv_buf(fd, msg->buf, len, tmo); + if (rv < 0) { + free(msg->buf); + return -1; + } + } else + msg->buf = NULL; + rv = recv_buf(fd, &magic, 4, tmo); + if (rv < 0 || magic != end_magic) { + free(msg->buf); + return -1; + } + msg->len = len; + return 0; +} + +int ack(int fd, int tmo) +{ + struct metadata_update msg = { .len = 0 }; + + return send_message(fd, &msg, tmo); +} + +int wait_reply(int fd, int tmo) +{ + struct metadata_update msg; + int err = receive_message(fd, &msg, tmo); + + /* mdmon sent extra data, but caller only cares that we got a + * successful reply + */ + if (err == 0 && msg.len > 0) + free(msg.buf); + + return err; +} + +int connect_monitor(char *devname) +{ + char path[100]; + int sfd; + long fl; + struct sockaddr_un addr; + int pos; + char *c; + + pos = sprintf(path, "%s/", MDMON_DIR); + if (is_subarray(devname)) { + devname++; + c = strchr(devname, '/'); + if (!c) + return -1; + snprintf(&path[pos], c - devname + 1, "%s", devname); + pos += c - devname; + } else + pos += sprintf(&path[pos], "%s", devname); + sprintf(&path[pos], ".sock"); + + sfd = socket(PF_LOCAL, SOCK_STREAM, 0); + if (sfd < 0) + return -1; + + addr.sun_family = PF_LOCAL; + strcpy(addr.sun_path, path); + if (connect(sfd, &addr, sizeof(addr)) < 0) { + close(sfd); + return -1; + } + + fl = fcntl(sfd, F_GETFL, 0); + fl |= O_NONBLOCK; + fcntl(sfd, F_SETFL, fl); + + return sfd; +} + +int fping_monitor(int sfd) +{ + int err = 0; + + if (sfd < 0) + return sfd; + + /* try to ping existing socket */ + if (ack(sfd, 20) != 0) + err = -1; + + /* check the reply */ + if (!err && wait_reply(sfd, 20) != 0) + err = -1; + + return err; +} + +/* give the monitor a chance to update the metadata */ +int ping_monitor(char *devname) +{ + int sfd = connect_monitor(devname); + int err; + + if (sfd >= 0) { + err = fping_monitor(sfd); + close(sfd); + } else + err = -1; + + return err; +} + +static char *ping_monitor_version(char *devname) +{ + int sfd = connect_monitor(devname); + struct metadata_update msg; + int err = 0; + + if (sfd < 0) + return NULL; + + if (ack(sfd, 20) != 0) + err = -1; + + if (!err && receive_message(sfd, &msg, 20) != 0) + err = -1; + + close(sfd); + + if (err || !msg.len || !msg.buf) + return NULL; + return msg.buf; +} + +int unblock_subarray(struct mdinfo *sra, const int unfreeze) +{ + char buf[64]; + int rc = 0; + + if (sra) { + sprintf(buf, "external:%s\n", sra->text_version); + buf[9] = '/'; + } else + buf[9] = '-'; + + if (buf[9] == '-' || + sysfs_set_str(sra, NULL, "metadata_version", buf) || + (unfreeze && + sysfs_attribute_available(sra, NULL, "sync_action") && + sysfs_set_str(sra, NULL, "sync_action", "idle"))) + rc = -1; + return rc; +} + +int block_subarray(struct mdinfo *sra) +{ + char buf[64]; + int rc = 0; + + sprintf(buf, "external:%s\n", sra->text_version); + buf[9] = '-'; + if (sysfs_set_str(sra, NULL, "metadata_version", buf)) + rc = -1; + + return rc; +} + +/* check mdmon version if it supports + * array blocking mechanism + */ +int check_mdmon_version(char *container) +{ + char *version = NULL; + + if (!mdmon_running(container)) { + /* if mdmon is not active we assume that any instance that is + * later started will match the current mdadm version, if this + * assumption is violated we may inadvertantly rebuild an array + * that was meant for reshape, or start rebuild on a spare that + * was to be moved to another container + */ + /* pass */; + } else { + int ver; + + version = ping_monitor_version(container); + ver = version ? mdadm_version(version) : -1; + free(version); + if (ver < 3002000) { + pr_err("mdmon instance for %s cannot be disabled\n", + container); + return -1; + } + } + + return 0; +} + +/** + * block_monitor - prevent mdmon spare assignment + * @container - container to block + * @freeze - flag to additionally freeze sync_action + * + * This is used by the reshape code to freeze the container, and the + * auto-rebuild implementation to atomically move spares. + * In both cases we need to stop mdmon from assigning spares to replace + * failed devices as we might have other plans for the spare. + * For the reshape case we also need to 'freeze' sync_action so that + * no recovery happens until we have fully prepared for the reshape. + * + * We tell mdmon that the array is frozen by marking the 'metadata' name + * with a leading '-'. The previously told mdmon "Don't make this array + * read/write, leave it readonly". Now it means a more general "Don't + * reconfigure this array at all". + * As older versions of mdmon (which might run from initrd) don't understand + * this, we first check that the running mdmon is new enough. + */ +int block_monitor(char *container, const int freeze) +{ + struct mdstat_ent *ent, *e, *e2; + struct mdinfo *sra = NULL; + char buf[64]; + int rv = 0; + + if (check_mdmon_version(container)) + return -1; + + ent = mdstat_read(0, 0); + if (!ent) { + pr_err("failed to read /proc/mdstat while disabling mdmon\n"); + return -1; + } + + /* freeze container contents */ + for (e = ent; e; e = e->next) { + if (!is_container_member(e, container)) + continue; + sysfs_free(sra); + sra = sysfs_read(-1, e->devnm, GET_VERSION); + if (!sra) { + pr_err("failed to read sysfs for subarray%s\n", + to_subarray(e, container)); + break; + } + /* can't reshape an array that we can't monitor */ + if (sra->text_version[0] == '-') + break; + + if (freeze && sysfs_freeze_array(sra) < 1) + break; + /* flag this array to not be modified by mdmon (close race with + * takeover in reshape case and spare reassignment in the + * auto-rebuild case) + */ + if (block_subarray(sra)) + break; + ping_monitor(container); + + /* check that we did not race with recovery */ + if ((freeze && + !sysfs_attribute_available(sra, NULL, "sync_action")) || + (freeze && + sysfs_attribute_available(sra, NULL, "sync_action") && + sysfs_get_str(sra, NULL, "sync_action", buf, 20) > 0 && + strcmp(buf, "frozen\n") == 0)) + /* pass */; + else { + unblock_subarray(sra, 0); + break; + } + /* Double check against races - there should be no spares + * or part-spares + */ + sysfs_free(sra); + sra = sysfs_read(-1, e->devnm, GET_DEVS | GET_STATE); + if (sra && sra->array.spare_disks > 0) { + unblock_subarray(sra, freeze); + break; + } + } + + if (e) { + pr_err("failed to freeze subarray%s\n", + to_subarray(e, container)); + + /* thaw the partially frozen container */ + for (e2 = ent; e2 && e2 != e; e2 = e2->next) { + if (!is_container_member(e2, container)) + continue; + sysfs_free(sra); + sra = sysfs_read(-1, e2->devnm, GET_VERSION); + if (unblock_subarray(sra, freeze)) + pr_err("Failed to unfreeze %s\n", e2->devnm); + } + + ping_monitor(container); /* cleared frozen */ + rv = -1; + } + + sysfs_free(sra); + free_mdstat(ent); + + return rv; +} + +void unblock_monitor(char *container, const int unfreeze) +{ + struct mdstat_ent *ent, *e; + struct mdinfo *sra = NULL; + int to_ping = 0; + + ent = mdstat_read(0, 0); + if (!ent) { + pr_err("failed to read /proc/mdstat while unblocking container\n"); + return; + } + + /* unfreeze container contents */ + for (e = ent; e; e = e->next) { + if (!is_container_member(e, container)) + continue; + sysfs_free(sra); + sra = sysfs_read(-1, e->devnm, GET_VERSION|GET_LEVEL); + if (!sra) + continue; + if (sra->array.level > 0) + to_ping++; + if (unblock_subarray(sra, unfreeze)) + pr_err("Failed to unfreeze %s\n", e->devnm); + } + if (to_ping) + ping_monitor(container); + + sysfs_free(sra); + free_mdstat(ent); +} + +/* give the manager a chance to view the updated container state. This + * would naturally happen due to the manager noticing a change in + * /proc/mdstat; however, pinging encourages this detection to happen + * while an exclusive open() on the container is active + */ +int ping_manager(char *devname) +{ + int sfd = connect_monitor(devname); + struct metadata_update msg = { .len = -1 }; + int err = 0; + + if (sfd < 0) + return sfd; + + err = send_message(sfd, &msg, 20); + + /* check the reply */ + if (!err && wait_reply(sfd, 20) != 0) + err = -1; + + close(sfd); + return err; +} + +/* using takeover operation for grow purposes, mdadm has to be sure + * that mdmon processes all updates, and if necessary it will be closed + * at takeover to raid0 operation + */ +void flush_mdmon(char *container) +{ + ping_manager(container); + ping_monitor(container); +} @@ -0,0 +1,37 @@ +/* + * Copyright (C) 2008 Intel Corporation + * + * mdmon socket / message handling + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. + */ + +struct mdinfo; +struct metadata_update; + +extern int receive_message(int fd, struct metadata_update *msg, int tmo); +extern int send_message(int fd, struct metadata_update *msg, int tmo); +extern int ack(int fd, int tmo); +extern int wait_reply(int fd, int tmo); +extern int connect_monitor(char *devname); +extern int ping_monitor(char *devname); +extern int block_subarray(struct mdinfo *sra); +extern int unblock_subarray(struct mdinfo *sra, const int unfreeze); +extern int block_monitor(char *container, const int freeze); +extern void unblock_monitor(char *container, const int unfreeze); +extern int fping_monitor(int sock); +extern int ping_manager(char *devname); +extern void flush_mdmon(char *container); + +#define MSG_MAX_LEN (4*1024*1024) @@ -0,0 +1,79 @@ +/* + * mdadm - manage Linux "md" devices aka RAID arrays. + * + * Copyright (C) 2010 Neil Brown <neilb@suse.de> + * + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Author: Neil Brown + * Email: <neil@brown.name> + * + */ + +/* Structure definitions ext for MBR and GPT partition tables + */ + +#define MBR_SIGNATURE_MAGIC __cpu_to_le16(0xAA55) +#define MBR_PARTITIONS 4 + +struct MBR_part_record { + __u8 bootable; + __u8 first_head; + __u8 first_sector; + __u8 first_cyl; + __u8 part_type; + __u8 last_head; + __u8 last_sector; + __u8 last_cyl; + __u32 first_sect_lba; + __u32 blocks_num; +}; + +struct MBR { + __u8 pad[446]; + struct MBR_part_record parts[MBR_PARTITIONS]; + __u16 magic; +} __attribute__((packed)); + +#define GPT_SIGNATURE_MAGIC __cpu_to_le64(0x5452415020494645ULL) +#define MBR_GPT_PARTITION_TYPE 0xEE + +struct GPT_part_entry { + unsigned char type_guid[16]; + unsigned char partition_guid[16]; + __u64 starting_lba; + __u64 ending_lba; + unsigned char attr_bits[8]; + unsigned char name[72]; +} __attribute__((packed)); + +struct GPT { + __u64 magic; + __u32 revision; + __u32 header_size; + __u32 crc; + __u32 pad1; + __u64 current_lba; + __u64 backup_lba; + __u64 first_lba; + __u64 last_lba; + __u8 guid[16]; + __u64 part_start; + __u32 part_cnt; + __u32 part_size; + __u32 part_crc; + __u8 pad2[420]; +} __attribute__((packed)); diff --git a/platform-intel.c b/platform-intel.c new file mode 100644 index 00000000..edb86795 --- /dev/null +++ b/platform-intel.c @@ -0,0 +1,669 @@ +/* + * Intel(R) Matrix Storage Manager hardware and firmware support routines + * + * Copyright (C) 2008 Intel Corporation + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. + */ +#include "mdadm.h" +#include "platform-intel.h" +#include "probe_roms.h" +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#include <dirent.h> +#include <fcntl.h> +#include <sys/mman.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <limits.h> + +static int devpath_to_ll(const char *dev_path, const char *entry, + unsigned long long *val); + +static __u16 devpath_to_vendor(const char *dev_path); + +static void free_sys_dev(struct sys_dev **list) +{ + while (*list) { + struct sys_dev *next = (*list)->next; + + if ((*list)->path) + free((*list)->path); + free(*list); + *list = next; + } +} + +struct sys_dev *find_driver_devices(const char *bus, const char *driver) +{ + /* search sysfs for devices driven by 'driver' */ + char path[292]; + char link[256]; + char *c; + DIR *driver_dir; + struct dirent *de; + struct sys_dev *head = NULL; + struct sys_dev *list = NULL; + enum sys_dev_type type; + unsigned long long dev_id; + unsigned long long class; + + if (strcmp(driver, "isci") == 0) + type = SYS_DEV_SAS; + else if (strcmp(driver, "ahci") == 0) + type = SYS_DEV_SATA; + else if (strcmp(driver, "nvme") == 0) + type = SYS_DEV_NVME; + else + type = SYS_DEV_UNKNOWN; + + sprintf(path, "/sys/bus/%s/drivers/%s", bus, driver); + driver_dir = opendir(path); + if (!driver_dir) + return NULL; + for (de = readdir(driver_dir); de; de = readdir(driver_dir)) { + int n; + + /* is 'de' a device? check that the 'subsystem' link exists and + * that its target matches 'bus' + */ + sprintf(path, "/sys/bus/%s/drivers/%s/%s/subsystem", + bus, driver, de->d_name); + n = readlink(path, link, sizeof(link)); + if (n < 0 || n >= (int)sizeof(link)) + continue; + link[n] = '\0'; + c = strrchr(link, '/'); + if (!c) + continue; + if (strncmp(bus, c+1, strlen(bus)) != 0) + continue; + + sprintf(path, "/sys/bus/%s/drivers/%s/%s", + bus, driver, de->d_name); + + /* if it's not Intel device skip it. */ + if (devpath_to_vendor(path) != 0x8086) + continue; + + if (devpath_to_ll(path, "device", &dev_id) != 0) + continue; + + if (devpath_to_ll(path, "class", &class) != 0) + continue; + + /* start / add list entry */ + if (!head) { + head = xmalloc(sizeof(*head)); + list = head; + } else { + list->next = xmalloc(sizeof(*head)); + list = list->next; + } + + if (!list) { + free_sys_dev(&head); + break; + } + + list->dev_id = (__u16) dev_id; + list->class = (__u32) class; + list->type = type; + list->path = realpath(path, NULL); + list->next = NULL; + if ((list->pci_id = strrchr(list->path, '/')) != NULL) + list->pci_id++; + } + closedir(driver_dir); + return head; +} + +static struct sys_dev *intel_devices=NULL; +static time_t valid_time = 0; + +struct sys_dev *device_by_id(__u16 device_id) +{ + struct sys_dev *iter; + + for (iter = intel_devices; iter != NULL; iter = iter->next) + if (iter->dev_id == device_id) + return iter; + return NULL; +} + +static int devpath_to_ll(const char *dev_path, const char *entry, unsigned long long *val) +{ + char path[strlen(dev_path) + strlen(entry) + 2]; + int fd; + int n; + + sprintf(path, "%s/%s", dev_path, entry); + + fd = open(path, O_RDONLY); + if (fd < 0) + return -1; + n = sysfs_fd_get_ll(fd, val); + close(fd); + return n; +} + +static __u16 devpath_to_vendor(const char *dev_path) +{ + char path[strlen(dev_path) + strlen("/vendor") + 1]; + char vendor[7]; + int fd; + __u16 id = 0xffff; + int n; + + sprintf(path, "%s/vendor", dev_path); + + fd = open(path, O_RDONLY); + if (fd < 0) + return 0xffff; + + n = read(fd, vendor, sizeof(vendor)); + if (n == sizeof(vendor)) { + vendor[n - 1] = '\0'; + id = strtoul(vendor, NULL, 16); + } + close(fd); + + return id; +} + +struct sys_dev *find_intel_devices(void) +{ + struct sys_dev *ahci, *isci, *nvme; + + if (valid_time > time(0) - 10) + return intel_devices; + + if (intel_devices) + free_sys_dev(&intel_devices); + + isci = find_driver_devices("pci", "isci"); + ahci = find_driver_devices("pci", "ahci"); + nvme = find_driver_devices("pci", "nvme"); + + if (!isci && !ahci) { + ahci = nvme; + } else if (!ahci) { + ahci = isci; + struct sys_dev *elem = ahci; + while (elem->next) + elem = elem->next; + elem->next = nvme; + } else { + struct sys_dev *elem = ahci; + while (elem->next) + elem = elem->next; + elem->next = isci; + while (elem->next) + elem = elem->next; + elem->next = nvme; + } + intel_devices = ahci; + valid_time = time(0); + return intel_devices; +} + +/* + * PCI Expansion ROM Data Structure Format */ +struct pciExpDataStructFormat { + __u8 ver[4]; + __u16 vendorID; + __u16 deviceID; + __u16 devListOffset; + __u16 pciDataStructLen; + __u8 pciDataStructRev; +} __attribute__ ((packed)); + +struct orom_entry *orom_entries; + +const struct orom_entry *get_orom_entry_by_device_id(__u16 dev_id) +{ + struct orom_entry *entry; + struct devid_list *devid; + + for (entry = orom_entries; entry; entry = entry->next) { + for (devid = entry->devid_list; devid; devid = devid->next) { + if (devid->devid == dev_id) + return entry; + } + } + + return NULL; +} + +const struct imsm_orom *get_orom_by_device_id(__u16 dev_id) +{ + const struct orom_entry *entry = get_orom_entry_by_device_id(dev_id); + + if (entry) + return &entry->orom; + + return NULL; +} + +static struct orom_entry *add_orom(const struct imsm_orom *orom) +{ + struct orom_entry *list; + struct orom_entry *prev = NULL; + + for (list = orom_entries; list; prev = list, list = list->next) + ; + + list = xmalloc(sizeof(struct orom_entry)); + list->orom = *orom; + list->devid_list = NULL; + list->next = NULL; + + if (prev == NULL) + orom_entries = list; + else + prev->next = list; + + return list; +} + +static void add_orom_device_id(struct orom_entry *entry, __u16 dev_id) +{ + struct devid_list *list; + struct devid_list *prev = NULL; + + for (list = entry->devid_list; list; prev = list, list = list->next) { + if (list->devid == dev_id) + return; + } + list = xmalloc(sizeof(struct devid_list)); + list->devid = dev_id; + list->next = NULL; + + if (prev == NULL) + entry->devid_list = list; + else + prev->next = list; +} + +static int scan(const void *start, const void *end, const void *data) +{ + int offset; + const struct imsm_orom *imsm_mem = NULL; + int len = (end - start); + struct pciExpDataStructFormat *ptr= (struct pciExpDataStructFormat *)data; + + if (data + 0x18 > end) { + dprintf("cannot find pciExpDataStruct \n"); + return 0; + } + + dprintf("ptr->vendorID: %lx __le16_to_cpu(ptr->deviceID): %lx \n", + (ulong) __le16_to_cpu(ptr->vendorID), + (ulong) __le16_to_cpu(ptr->deviceID)); + + if (__le16_to_cpu(ptr->vendorID) != 0x8086) + return 0; + + for (offset = 0; offset < len; offset += 4) { + const void *mem = start + offset; + + if ((memcmp(mem, IMSM_OROM_SIGNATURE, 4) == 0)) { + imsm_mem = mem; + break; + } + } + + if (!imsm_mem) + return 0; + + struct orom_entry *orom = add_orom(imsm_mem); + + /* only PciDataStructure with revision 3 and above supports devices list. */ + if (ptr->pciDataStructRev >= 3 && ptr->devListOffset) { + const __u16 *dev_list = (void *)ptr + ptr->devListOffset; + int i; + + for (i = 0; dev_list[i] != 0; i++) + add_orom_device_id(orom, dev_list[i]); + } else { + add_orom_device_id(orom, __le16_to_cpu(ptr->deviceID)); + } + + return 0; +} + +const struct imsm_orom *imsm_platform_test(struct sys_dev *hba) +{ + struct imsm_orom orom = { + .signature = IMSM_OROM_SIGNATURE, + .rlc = IMSM_OROM_RLC_RAID0 | IMSM_OROM_RLC_RAID1 | + IMSM_OROM_RLC_RAID10 | IMSM_OROM_RLC_RAID5, + .sss = IMSM_OROM_SSS_4kB | IMSM_OROM_SSS_8kB | + IMSM_OROM_SSS_16kB | IMSM_OROM_SSS_32kB | + IMSM_OROM_SSS_64kB | IMSM_OROM_SSS_128kB | + IMSM_OROM_SSS_256kB | IMSM_OROM_SSS_512kB | + IMSM_OROM_SSS_1MB | IMSM_OROM_SSS_2MB, + .dpa = IMSM_OROM_DISKS_PER_ARRAY, + .tds = IMSM_OROM_TOTAL_DISKS, + .vpa = IMSM_OROM_VOLUMES_PER_ARRAY, + .vphba = IMSM_OROM_VOLUMES_PER_HBA + }; + orom.attr = orom.rlc | IMSM_OROM_ATTR_ChecksumVerify; + + if (check_env("IMSM_TEST_OROM_NORAID5")) { + orom.rlc = IMSM_OROM_RLC_RAID0 | IMSM_OROM_RLC_RAID1 | + IMSM_OROM_RLC_RAID10; + } + if (check_env("IMSM_TEST_AHCI_EFI_NORAID5") && (hba->type == SYS_DEV_SAS)) { + orom.rlc = IMSM_OROM_RLC_RAID0 | IMSM_OROM_RLC_RAID1 | + IMSM_OROM_RLC_RAID10; + } + if (check_env("IMSM_TEST_SCU_EFI_NORAID5") && (hba->type == SYS_DEV_SATA)) { + orom.rlc = IMSM_OROM_RLC_RAID0 | IMSM_OROM_RLC_RAID1 | + IMSM_OROM_RLC_RAID10; + } + + struct orom_entry *ret = add_orom(&orom); + + add_orom_device_id(ret, hba->dev_id); + + return &ret->orom; +} + +static const struct imsm_orom *find_imsm_hba_orom(struct sys_dev *hba) +{ + unsigned long align; + + if (check_env("IMSM_TEST_OROM")) + return imsm_platform_test(hba); + + /* return empty OROM capabilities in EFI test mode */ + if (check_env("IMSM_TEST_AHCI_EFI") || check_env("IMSM_TEST_SCU_EFI")) + return NULL; + + find_intel_devices(); + + if (intel_devices == NULL) + return NULL; + + /* scan option-rom memory looking for an imsm signature */ + if (check_env("IMSM_SAFE_OROM_SCAN")) + align = 2048; + else + align = 512; + if (probe_roms_init(align) != 0) + return NULL; + probe_roms(); + /* ignore return value - True is returned if both adapater roms are found */ + scan_adapter_roms(scan); + probe_roms_exit(); + + return get_orom_by_device_id(hba->dev_id); +} + +#define GUID_STR_MAX 37 /* according to GUID format: + * xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx" */ + +#define EFI_GUID(a, b, c, d0, d1, d2, d3, d4, d5, d6, d7) \ +((struct efi_guid) \ +{{ (a) & 0xff, ((a) >> 8) & 0xff, ((a) >> 16) & 0xff, ((a) >> 24) & 0xff, \ + (b) & 0xff, ((b) >> 8) & 0xff, \ + (c) & 0xff, ((c) >> 8) & 0xff, \ + (d0), (d1), (d2), (d3), (d4), (d5), (d6), (d7) }}) + +#define SYS_EFI_VAR_PATH "/sys/firmware/efi/vars" +#define SYS_EFIVARS_PATH "/sys/firmware/efi/efivars" +#define SCU_PROP "RstScuV" +#define AHCI_PROP "RstSataV" +#define AHCI_SSATA_PROP "RstsSatV" +#define AHCI_CSATA_PROP "RstCSatV" + +#define VENDOR_GUID \ + EFI_GUID(0x193dfefa, 0xa445, 0x4302, 0x99, 0xd8, 0xef, 0x3a, 0xad, 0x1a, 0x04, 0xc6) + +#define PCI_CLASS_RAID_CNTRL 0x010400 + +static int read_efi_var(void *buffer, ssize_t buf_size, char *variable_name, struct efi_guid guid) +{ + char path[PATH_MAX]; + char buf[GUID_STR_MAX]; + int fd; + ssize_t n; + + snprintf(path, PATH_MAX, "%s/%s-%s", SYS_EFIVARS_PATH, variable_name, guid_str(buf, guid)); + + fd = open(path, O_RDONLY); + if (fd < 0) + return 1; + + /* read the variable attributes and ignore it */ + n = read(fd, buf, sizeof(__u32)); + if (n < 0) { + close(fd); + return 1; + } + + /* read the variable data */ + n = read(fd, buffer, buf_size); + close(fd); + if (n < buf_size) + return 1; + + return 0; +} + +static int read_efi_variable(void *buffer, ssize_t buf_size, char *variable_name, struct efi_guid guid) +{ + char path[PATH_MAX]; + char buf[GUID_STR_MAX]; + int dfd; + ssize_t n, var_data_len; + + /* Try to read the variable using the new efivarfs interface first. + * If that fails, fall back to the old sysfs-efivars interface. */ + if (!read_efi_var(buffer, buf_size, variable_name, guid)) + return 0; + + snprintf(path, PATH_MAX, "%s/%s-%s/size", SYS_EFI_VAR_PATH, variable_name, guid_str(buf, guid)); + + dprintf("EFI VAR: path=%s\n", path); + /* get size of variable data */ + dfd = open(path, O_RDONLY); + if (dfd < 0) + return 1; + + n = read(dfd, &buf, sizeof(buf)); + close(dfd); + if (n < 0) + return 1; + buf[n] = '\0'; + + errno = 0; + var_data_len = strtoul(buf, NULL, 16); + if ((errno == ERANGE && (var_data_len == LONG_MAX)) + || (errno != 0 && var_data_len == 0)) + return 1; + + /* get data */ + snprintf(path, PATH_MAX, "%s/%s-%s/data", SYS_EFI_VAR_PATH, variable_name, guid_str(buf, guid)); + + dprintf("EFI VAR: path=%s\n", path); + dfd = open(path, O_RDONLY); + if (dfd < 0) + return 1; + + n = read(dfd, buffer, buf_size); + close(dfd); + if (n != var_data_len || n < buf_size) { + return 1; + } + + return 0; +} + +const struct imsm_orom *find_imsm_efi(struct sys_dev *hba) +{ + struct imsm_orom orom; + struct orom_entry *ret; + int err; + + if (check_env("IMSM_TEST_AHCI_EFI") || check_env("IMSM_TEST_SCU_EFI")) + return imsm_platform_test(hba); + + /* OROM test is set, return that there is no EFI capabilities */ + if (check_env("IMSM_TEST_OROM")) + return NULL; + + if (hba->type == SYS_DEV_SATA && hba->class != PCI_CLASS_RAID_CNTRL) + return NULL; + + err = read_efi_variable(&orom, sizeof(orom), hba->type == SYS_DEV_SAS ? SCU_PROP : AHCI_PROP, VENDOR_GUID); + + /* try to read variable for second AHCI controller */ + if (err && hba->type == SYS_DEV_SATA) + err = read_efi_variable(&orom, sizeof(orom), AHCI_SSATA_PROP, VENDOR_GUID); + + /* try to read variable for combined AHCI controllers */ + if (err && hba->type == SYS_DEV_SATA) { + static struct orom_entry *csata; + + err = read_efi_variable(&orom, sizeof(orom), AHCI_CSATA_PROP, VENDOR_GUID); + if (!err) { + if (!csata) + csata = add_orom(&orom); + add_orom_device_id(csata, hba->dev_id); + return &csata->orom; + } + } + + if (err) + return NULL; + + ret = add_orom(&orom); + add_orom_device_id(ret, hba->dev_id); + + return &ret->orom; +} + +const struct imsm_orom *find_imsm_nvme(struct sys_dev *hba) +{ + static struct orom_entry *nvme_orom; + + if (hba->type != SYS_DEV_NVME) + return NULL; + + if (!nvme_orom) { + struct imsm_orom nvme_orom_compat = { + .signature = IMSM_NVME_OROM_COMPAT_SIGNATURE, + .rlc = IMSM_OROM_RLC_RAID0 | IMSM_OROM_RLC_RAID1 | + IMSM_OROM_RLC_RAID10 | IMSM_OROM_RLC_RAID5, + .sss = IMSM_OROM_SSS_4kB | IMSM_OROM_SSS_8kB | + IMSM_OROM_SSS_16kB | IMSM_OROM_SSS_32kB | + IMSM_OROM_SSS_64kB | IMSM_OROM_SSS_128kB, + .dpa = IMSM_OROM_DISKS_PER_ARRAY_NVME, + .tds = IMSM_OROM_TOTAL_DISKS_NVME, + .vpa = IMSM_OROM_VOLUMES_PER_ARRAY, + .vphba = IMSM_OROM_TOTAL_DISKS_NVME / 2 * IMSM_OROM_VOLUMES_PER_ARRAY, + .attr = IMSM_OROM_ATTR_2TB | IMSM_OROM_ATTR_2TB_DISK, + .driver_features = IMSM_OROM_CAPABILITIES_EnterpriseSystem + }; + nvme_orom = add_orom(&nvme_orom_compat); + } + add_orom_device_id(nvme_orom, hba->dev_id); + return &nvme_orom->orom; +} + +const struct imsm_orom *find_imsm_capability(struct sys_dev *hba) +{ + const struct imsm_orom *cap = get_orom_by_device_id(hba->dev_id); + + if (cap) + return cap; + + if (hba->type == SYS_DEV_NVME) + return find_imsm_nvme(hba); + if ((cap = find_imsm_efi(hba)) != NULL) + return cap; + if ((cap = find_imsm_hba_orom(hba)) != NULL) + return cap; + + return NULL; +} + +char *devt_to_devpath(dev_t dev) +{ + char device[46]; + + sprintf(device, "/sys/dev/block/%d:%d/device", major(dev), minor(dev)); + return realpath(device, NULL); +} + +char *diskfd_to_devpath(int fd) +{ + /* return the device path for a disk, return NULL on error or fd + * refers to a partition + */ + struct stat st; + + if (fstat(fd, &st) != 0) + return NULL; + if (!S_ISBLK(st.st_mode)) + return NULL; + + return devt_to_devpath(st.st_rdev); +} + +int path_attached_to_hba(const char *disk_path, const char *hba_path) +{ + int rc; + + if (check_env("IMSM_TEST_AHCI_DEV") || + check_env("IMSM_TEST_SCU_DEV")) { + return 1; + } + + if (!disk_path || !hba_path) + return 0; + dprintf("hba: %s - disk: %s\n", hba_path, disk_path); + if (strncmp(disk_path, hba_path, strlen(hba_path)) == 0) + rc = 1; + else + rc = 0; + + return rc; +} + +int devt_attached_to_hba(dev_t dev, const char *hba_path) +{ + char *disk_path = devt_to_devpath(dev); + int rc = path_attached_to_hba(disk_path, hba_path); + + if (disk_path) + free(disk_path); + + return rc; +} + +int disk_attached_to_hba(int fd, const char *hba_path) +{ + char *disk_path = diskfd_to_devpath(fd); + int rc = path_attached_to_hba(disk_path, hba_path); + + if (disk_path) + free(disk_path); + + return rc; +} diff --git a/platform-intel.h b/platform-intel.h new file mode 100644 index 00000000..695d6c66 --- /dev/null +++ b/platform-intel.h @@ -0,0 +1,243 @@ +/* + * Intel(R) Matrix Storage Manager hardware and firmware support routines + * + * Copyright (C) 2008 Intel Corporation + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. + */ +#include <asm/types.h> +#include <strings.h> + +/* The IMSM Capability (IMSM AHCI and ISCU OROM/EFI variable) Version Table definition */ +struct imsm_orom { + __u8 signature[4]; + #define IMSM_OROM_SIGNATURE "$VER" + #define IMSM_NVME_OROM_COMPAT_SIGNATURE "$NVM" + __u8 table_ver_major; /* Currently 2 (can change with future revs) */ + __u8 table_ver_minor; /* Currently 2 (can change with future revs) */ + __u16 major_ver; /* Example: 8 as in 8.6.0.1020 */ + __u16 minor_ver; /* Example: 6 as in 8.6.0.1020 */ + __u16 hotfix_ver; /* Example: 0 as in 8.6.0.1020 */ + __u16 build; /* Example: 1020 as in 8.6.0.1020 */ + __u8 len; /* number of bytes in this entire table */ + __u8 checksum; /* checksum of all the bytes in this table */ + __u16 rlc; /* RAID Level Capability */ + /* we assume the cpu is x86 as the orom should not be found + * anywhere else + */ + #define IMSM_OROM_RLC_RAID0 (1 << 0) + #define IMSM_OROM_RLC_RAID1 (1 << 1) + #define IMSM_OROM_RLC_RAID10 (1 << 2) + #define IMSM_OROM_RLC_RAID1E (1 << 3) + #define IMSM_OROM_RLC_RAID5 (1 << 4) + #define IMSM_OROM_RLC_RAID_CNG (1 << 5) + __u16 sss; /* Strip Size Supported */ + #define IMSM_OROM_SSS_2kB (1 << 0) + #define IMSM_OROM_SSS_4kB (1 << 1) + #define IMSM_OROM_SSS_8kB (1 << 2) + #define IMSM_OROM_SSS_16kB (1 << 3) + #define IMSM_OROM_SSS_32kB (1 << 4) + #define IMSM_OROM_SSS_64kB (1 << 5) + #define IMSM_OROM_SSS_128kB (1 << 6) + #define IMSM_OROM_SSS_256kB (1 << 7) + #define IMSM_OROM_SSS_512kB (1 << 8) + #define IMSM_OROM_SSS_1MB (1 << 9) + #define IMSM_OROM_SSS_2MB (1 << 10) + #define IMSM_OROM_SSS_4MB (1 << 11) + #define IMSM_OROM_SSS_8MB (1 << 12) + #define IMSM_OROM_SSS_16MB (1 << 13) + #define IMSM_OROM_SSS_32MB (1 << 14) + #define IMSM_OROM_SSS_64MB (1 << 15) + __u16 dpa; /* Disks Per Array supported */ + #define IMSM_OROM_DISKS_PER_ARRAY 6 + #define IMSM_OROM_DISKS_PER_ARRAY_NVME 12 + __u16 tds; /* Total Disks Supported */ + #define IMSM_OROM_TOTAL_DISKS 6 + #define IMSM_OROM_TOTAL_DISKS_NVME 12 + __u8 vpa; /* # Volumes Per Array supported */ + #define IMSM_OROM_VOLUMES_PER_ARRAY 2 + __u8 vphba; /* # Volumes Per Host Bus Adapter supported */ + #define IMSM_OROM_VOLUMES_PER_HBA 4 + #define IMSM_OROM_VOLUMES_PER_HBA_NVME 4 + /* Attributes supported. This should map to the + * attributes in the MPB. Also, lower 16 bits + * should match/duplicate RLC bits above. + */ + __u32 attr; + #define IMSM_OROM_ATTR_RAID0 IMSM_OROM_RLC_RAID0 + #define IMSM_OROM_ATTR_RAID1 IMSM_OROM_RLC_RAID1 + #define IMSM_OROM_ATTR_RAID10 IMSM_OROM_RLC_RAID10 + #define IMSM_OROM_ATTR_RAID1E IMSM_OROM_RLC_RAID1E + #define IMSM_OROM_ATTR_RAID5 IMSM_OROM_RLC_RAID5 + #define IMSM_OROM_ATTR_RAID_CNG IMSM_OROM_RLC_RAID_CNG + #define IMSM_OROM_ATTR_2TB_DISK (1 << 26) + #define IMSM_OROM_ATTR_2TB (1 << 29) + #define IMSM_OROM_ATTR_PM (1 << 30) + #define IMSM_OROM_ATTR_ChecksumVerify (1 << 31) + __u32 capabilities; + #define IMSM_OROM_CAPABILITIES_Ext_SATA (1 << 0) + #define IMSM_OROM_CAPABILITIES_TurboMemory (1 << 1) + #define IMSM_OROM_CAPABILITIES_HddPassword (1 << 2) + #define IMSM_OROM_CAPABILITIES_DiskCoercion (1 << 3) + __u32 driver_features; + #define IMSM_OROM_CAPABILITIES_HDDUnlock (1 << 0) + #define IMSM_OROM_CAPABILITIES_LEDLoc (1 << 1) + #define IMSM_OROM_CAPABILITIES_EnterpriseSystem (1 << 2) + #define IMSM_OROM_CAPABILITIES_Zpodd (1 << 3) + #define IMSM_OROM_CAPABILITIES_LargeDramCache (1 << 4) + #define IMSM_OROM_CAPABILITIES_Rohi (1 << 5) + #define IMSM_OROM_CAPABILITIES_ReadPatrol (1 << 6) + #define IMSM_OROM_CAPABILITIES_XorHw (1 << 7) +} __attribute__((packed)); + +static inline int imsm_orom_has_raid0(const struct imsm_orom *orom) +{ + return !!(orom->rlc & IMSM_OROM_RLC_RAID0); +} +static inline int imsm_orom_has_raid1(const struct imsm_orom *orom) +{ + return !!(orom->rlc & IMSM_OROM_RLC_RAID1); +} +static inline int imsm_orom_has_raid1e(const struct imsm_orom *orom) +{ + return !!(orom->rlc & IMSM_OROM_RLC_RAID1E); +} +static inline int imsm_orom_has_raid10(const struct imsm_orom *orom) +{ + return !!(orom->rlc & IMSM_OROM_RLC_RAID10); +} +static inline int imsm_orom_has_raid5(const struct imsm_orom *orom) +{ + return !!(orom->rlc & IMSM_OROM_RLC_RAID5); +} + +/** + * imsm_orom_has_chunk - check if the orom supports the given chunk size + * @orom: orom pointer from find_imsm_orom + * @chunk: chunk size in kibibytes + */ +static inline int imsm_orom_has_chunk(const struct imsm_orom *orom, int chunk) +{ + int fs = ffs(chunk); + if (!fs) + return 0; + fs--; /* bit num to bit index */ + if (chunk & (chunk-1)) + return 0; /* not a power of 2 */ + return !!(orom->sss & (1 << (fs - 1))); +} + +/** + * fls - find last (most-significant) bit set + * @x: the word to search + * The funciton is borrowed from Linux kernel code + * include/asm-generic/bitops/fls.h + */ +static inline int fls(int x) +{ + int r = 32; + + if (!x) + return 0; + if (!(x & 0xffff0000u)) { + x <<= 16; + r -= 16; + } + if (!(x & 0xff000000u)) { + x <<= 8; + r -= 8; + } + if (!(x & 0xf0000000u)) { + x <<= 4; + r -= 4; + } + if (!(x & 0xc0000000u)) { + x <<= 2; + r -= 2; + } + if (!(x & 0x80000000u)) { + x <<= 1; + r -= 1; + } + return r; +} + +static inline int imsm_orom_is_enterprise(const struct imsm_orom *orom) +{ + return !!(orom->driver_features & IMSM_OROM_CAPABILITIES_EnterpriseSystem); +} + +static inline int imsm_orom_is_nvme(const struct imsm_orom *orom) +{ + return memcmp(orom->signature, IMSM_NVME_OROM_COMPAT_SIGNATURE, + sizeof(orom->signature)) == 0; +} + +enum sys_dev_type { + SYS_DEV_UNKNOWN = 0, + SYS_DEV_SAS, + SYS_DEV_SATA, + SYS_DEV_NVME, + SYS_DEV_MAX +}; + +struct sys_dev { + enum sys_dev_type type; + char *path; + char *pci_id; + __u16 dev_id; + __u32 class; + struct sys_dev *next; +}; + +struct efi_guid { + __u8 b[16]; +}; + +struct devid_list { + __u16 devid; + struct devid_list *next; +}; + +struct orom_entry { + struct imsm_orom orom; + struct devid_list *devid_list; + struct orom_entry *next; +}; + +extern struct orom_entry *orom_entries; + +static inline char *guid_str(char *buf, struct efi_guid guid) +{ + sprintf(buf, "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x", + guid.b[3], guid.b[2], guid.b[1], guid.b[0], + guid.b[5], guid.b[4], guid.b[7], guid.b[6], + guid.b[8], guid.b[9], guid.b[10], guid.b[11], + guid.b[12], guid.b[13], guid.b[14], guid.b[15]); + return buf; +} + +char *diskfd_to_devpath(int fd); +struct sys_dev *find_driver_devices(const char *bus, const char *driver); +struct sys_dev *find_intel_devices(void); +const struct imsm_orom *find_imsm_capability(struct sys_dev *hba); +const struct imsm_orom *find_imsm_orom(void); +int disk_attached_to_hba(int fd, const char *hba_path); +int devt_attached_to_hba(dev_t dev, const char *hba_path); +char *devt_to_devpath(dev_t dev); +int path_attached_to_hba(const char *disk_path, const char *hba_path); +const char *get_sys_dev_type(enum sys_dev_type); +const struct orom_entry *get_orom_entry_by_device_id(__u16 dev_id); +const struct imsm_orom *get_orom_by_device_id(__u16 device_id); +struct sys_dev *device_by_id(__u16 device_id); diff --git a/policy.c b/policy.c new file mode 100644 index 00000000..064d3491 --- /dev/null +++ b/policy.c @@ -0,0 +1,911 @@ +/* + * mdadm - manage Linux "md" devices aka RAID arrays. + * + * Copyright (C) 2001-2009 Neil Brown <neilb@suse.de> + * + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Author: Neil Brown + * Email: <neilb@suse.de> + */ + +#include "mdadm.h" +#include <dirent.h> +#include <fnmatch.h> +#include <ctype.h> +#include "dlink.h" +/* + * Policy module for mdadm. + * A policy statement about a device lists a set of values for each + * of a set of names. Each value can have a metadata type as context. + * + * names include: + * action - the actions that can be taken on hot-plug + * domain - the domain(s) that the device is part of + * + * Policy information is extracted from various sources, but + * particularly from a set of policy rules in mdadm.conf + */ + +static void pol_new(struct dev_policy **pol, char *name, const char *val, + const char *metadata) +{ + struct dev_policy *n = xmalloc(sizeof(*n)); + const char *real_metadata = NULL; + int i; + + n->name = name; + n->value = val; + + /* We need to normalise the metadata name */ + if (metadata) { + for (i = 0; superlist[i] ; i++) + if (strcmp(metadata, superlist[i]->name) == 0) { + real_metadata = superlist[i]->name; + break; + } + if (!real_metadata) { + if (strcmp(metadata, "1") == 0 || + strcmp(metadata, "1.0") == 0 || + strcmp(metadata, "1.1") == 0 || + strcmp(metadata, "1.2") == 0) + real_metadata = super1.name; + } + if (!real_metadata) { + static const char *prev = NULL; + if (prev != metadata) { + pr_err("metadata=%s unrecognised - ignoring rule\n", + metadata); + prev = metadata; + } + real_metadata = "unknown"; + } + } + + n->metadata = real_metadata; + n->next = *pol; + *pol = n; +} + +static int pol_lesseq(struct dev_policy *a, struct dev_policy *b) +{ + int cmp; + + if (a->name < b->name) + return 1; + if (a->name > b->name) + return 0; + + cmp = strcmp(a->value, b->value); + if (cmp < 0) + return 1; + if (cmp > 0) + return 0; + + return (a->metadata <= b->metadata); +} + +static void pol_sort(struct dev_policy **pol) +{ + /* sort policy list in *pol by name/metadata/value + * using merge sort + */ + + struct dev_policy *pl[2]; + pl[0] = *pol; + pl[1] = NULL; + + do { + struct dev_policy **plp[2], *p[2]; + int curr = 0; + struct dev_policy nul = { NULL, NULL, NULL, NULL }; + struct dev_policy *prev = &nul; + int next = 0; + + /* p[] are the two lists that we are merging. + * plp[] are the ends of the two lists we create + * from the merge. + * 'curr' is which of plp[] that we are currently + * adding items to. + * 'next' is which if p[] we will take the next + * item from. + * 'prev' is that last value, which was placed in + * plp[curr]. + */ + plp[0] = &pl[0]; + plp[1] = &pl[1]; + p[0] = pl[0]; + p[1] = pl[1]; + + /* take least of p[0] and p[1] + * if it is larger than prev, add to + * plp[curr], else swap curr then add + */ + while (p[0] || p[1]) { + if (p[next] == NULL || + (p[1-next] != NULL && + !(pol_lesseq(prev, p[1-next]) + ^pol_lesseq(prev, p[next]) + ^pol_lesseq(p[next], p[1-next]))) + ) + next = 1 - next; + + if (!pol_lesseq(prev, p[next])) + curr = 1 - curr; + + *plp[curr] = prev = p[next]; + plp[curr] = &p[next]->next; + p[next] = p[next]->next; + } + *plp[0] = NULL; + *plp[1] = NULL; + } while (pl[0] && pl[1]); + if (pl[0]) + *pol = pl[0]; + else + *pol = pl[1]; +} + +static void pol_dedup(struct dev_policy *pol) +{ + /* This is a sorted list - remove duplicates. */ + while (pol && pol->next) { + if (pol_lesseq(pol->next, pol)) { + struct dev_policy *tmp = pol->next; + pol->next = tmp->next; + free(tmp); + } else + pol = pol->next; + } +} + +/* + * pol_find finds the first entry in the policy + * list to match name. + * If it returns non-NULL there is at least one + * value, but how many can only be found by + * iterating through the list. + */ +struct dev_policy *pol_find(struct dev_policy *pol, char *name) +{ + while (pol && pol->name < name) + pol = pol->next; + + if (!pol || pol->name != name) + return NULL; + return pol; +} + +static char *disk_path(struct mdinfo *disk) +{ + struct stat stb; + int prefix_len; + DIR *by_path; + char symlink[PATH_MAX] = "/dev/disk/by-path/"; + char nm[PATH_MAX]; + struct dirent *ent; + int rv; + + by_path = opendir(symlink); + if (by_path) { + prefix_len = strlen(symlink); + while ((ent = readdir(by_path)) != NULL) { + if (ent->d_type != DT_LNK) + continue; + strncpy(symlink + prefix_len, + ent->d_name, + sizeof(symlink) - prefix_len); + if (stat(symlink, &stb) < 0) + continue; + if ((stb.st_mode & S_IFMT) != S_IFBLK) + continue; + if (stb.st_rdev != makedev(disk->disk.major, disk->disk.minor)) + continue; + closedir(by_path); + return xstrdup(ent->d_name); + } + closedir(by_path); + } + /* A NULL path isn't really acceptable - use the devname.. */ + sprintf(symlink, "/sys/dev/block/%d:%d", disk->disk.major, disk->disk.minor); + rv = readlink(symlink, nm, sizeof(nm)-1); + if (rv > 0) { + char *dname; + nm[rv] = 0; + dname = strrchr(nm, '/'); + if (dname) + return xstrdup(dname + 1); + } + return xstrdup("unknown"); +} + +char type_part[] = "part"; +char type_disk[] = "disk"; +static char *disk_type(struct mdinfo *disk) +{ + char buf[30+20+20]; + struct stat stb; + sprintf(buf, "/sys/dev/block/%d:%d/partition", + disk->disk.major, disk->disk.minor); + if (stat(buf, &stb) == 0) + return type_part; + else + return type_disk; +} + +static int pol_match(struct rule *rule, char *path, char *type) +{ + /* check if this rule matches on path and type */ + int pathok = 0; /* 0 == no path, 1 == match, -1 == no match yet */ + int typeok = 0; + + while (rule) { + if (rule->name == rule_path) { + if (pathok == 0) + pathok = -1; + if (path && fnmatch(rule->value, path, 0) == 0) + pathok = 1; + } + if (rule->name == rule_type) { + if (typeok == 0) + typeok = -1; + if (type && strcmp(rule->value, type) == 0) + typeok = 1; + } + rule = rule->next; + } + return pathok >= 0 && typeok >= 0; +} + +static void pol_merge(struct dev_policy **pol, struct rule *rule) +{ + /* copy any name assignments from rule into pol */ + struct rule *r; + char *metadata = NULL; + for (r = rule; r ; r = r->next) + if (r->name == pol_metadata) + metadata = r->value; + + for (r = rule; r ; r = r->next) + if (r->name == pol_act || + r->name == pol_domain || + r->name == pol_auto) + pol_new(pol, r->name, r->value, metadata); +} + +static int path_has_part(char *path, char **part) +{ + /* check if path ends with "-partNN" and + * if it does, place a pointer to "-pathNN" + * in 'part'. + */ + int l; + if (!path) + return 0; + l = strlen(path); + while (l > 1 && isdigit(path[l-1])) + l--; + if (l < 5 || strncmp(path+l-5, "-part", 5) != 0) + return 0; + *part = path+l-4; + return 1; +} + +static void pol_merge_part(struct dev_policy **pol, struct rule *rule, char *part) +{ + /* copy any name assignments from rule into pol, appending + * -part to any domain. The string with -part appended is + * stored with the rule so it has a lifetime to match + * the rule. + */ + struct rule *r; + char *metadata = NULL; + for (r = rule; r ; r = r->next) + if (r->name == pol_metadata) + metadata = r->value; + + for (r = rule; r ; r = r->next) { + if (r->name == pol_act) + pol_new(pol, r->name, r->value, metadata); + else if (r->name == pol_domain) { + char *dom; + int len; + if (r->dups == NULL) + r->dups = dl_head(); + len = strlen(r->value); + for (dom = dl_next(r->dups); dom != r->dups; + dom = dl_next(dom)) + if (strcmp(dom+len+1, part)== 0) + break; + if (dom == r->dups) { + char *newdom = dl_strndup( + r->value, len + 1 + strlen(part)); + strcat(strcat(newdom, "-"), part); + dl_add(r->dups, newdom); + dom = newdom; + } + pol_new(pol, r->name, dom, metadata); + } + } +} + +static struct pol_rule *config_rules = NULL; +static struct pol_rule **config_rules_end = NULL; +static int config_rules_has_path = 0; + +/* + * most policy comes from a set policy rules that are + * read from the config file. + * path_policy() gathers policy information for the + * disk described in the given a 'path' and a 'type'. + */ +struct dev_policy *path_policy(char *path, char *type) +{ + struct pol_rule *rules; + struct dev_policy *pol = NULL; + int i; + + rules = config_rules; + + while (rules) { + char *part; + if (rules->type == rule_policy) + if (pol_match(rules->rule, path, type)) + pol_merge(&pol, rules->rule); + if (rules->type == rule_part && strcmp(type, type_part) == 0) + if (path_has_part(path, &part)) { + *part = 0; + if (pol_match(rules->rule, path, type_disk)) + pol_merge_part(&pol, rules->rule, part+1); + *part = '-'; + } + rules = rules->next; + } + + /* Now add any metadata-specific internal knowledge + * about this path + */ + for (i=0; path && superlist[i]; i++) + if (superlist[i]->get_disk_controller_domain) { + const char *d = + superlist[i]->get_disk_controller_domain(path); + if (d) + pol_new(&pol, pol_domain, d, superlist[i]->name); + } + + pol_sort(&pol); + pol_dedup(pol); + return pol; +} + +void pol_add(struct dev_policy **pol, + char *name, char *val, + char *metadata) +{ + pol_new(pol, name, val, metadata); + pol_sort(pol); + pol_dedup(*pol); +} + +/* + * disk_policy() gathers policy information for the + * disk described in the given mdinfo (disk.{major,minor}). + */ +struct dev_policy *disk_policy(struct mdinfo *disk) +{ + char *path = NULL; + char *type = disk_type(disk); + struct dev_policy *pol = NULL; + + if (config_rules_has_path) + path = disk_path(disk); + + pol = path_policy(path, type); + + free(path); + return pol; +} + +struct dev_policy *devid_policy(int dev) +{ + struct mdinfo disk; + disk.disk.major = major(dev); + disk.disk.minor = minor(dev); + return disk_policy(&disk); +} + +/* + * process policy rules read from config file. + */ + +char rule_path[] = "path"; +char rule_type[] = "type"; + +char rule_policy[] = "policy"; +char rule_part[] = "part-policy"; + +char pol_metadata[] = "metadata"; +char pol_act[] = "action"; +char pol_domain[] = "domain"; +char pol_auto[] = "auto"; + +static int try_rule(char *w, char *name, struct rule **rp) +{ + struct rule *r; + int len = strlen(name); + if (strncmp(w, name, len) != 0 || + w[len] != '=') + return 0; + r = xmalloc(sizeof(*r)); + r->next = *rp; + r->name = name; + r->value = xstrdup(w+len+1); + r->dups = NULL; + *rp = r; + return 1; +} + +void policyline(char *line, char *type) +{ + struct pol_rule *pr; + char *w; + + if (config_rules_end == NULL) + config_rules_end = &config_rules; + + pr = xmalloc(sizeof(*pr)); + pr->type = type; + pr->rule = NULL; + for (w = dl_next(line); w != line ; w = dl_next(w)) { + if (try_rule(w, rule_path, &pr->rule)) + config_rules_has_path = 1; + else if (! try_rule(w, rule_type, &pr->rule) && + ! try_rule(w, pol_metadata, &pr->rule) && + ! try_rule(w, pol_act, &pr->rule) && + ! try_rule(w, pol_domain, &pr->rule) && + ! try_rule(w, pol_auto, &pr->rule)) + pr_err("policy rule %s unrecognised and ignored\n", + w); + } + pr->next = config_rules; + config_rules = pr; +} + +void policy_add(char *type, ...) +{ + va_list ap; + struct pol_rule *pr; + char *name, *val; + + pr = xmalloc(sizeof(*pr)); + pr->type = type; + pr->rule = NULL; + + va_start(ap, type); + while ((name = va_arg(ap, char*)) != NULL) { + struct rule *r; + + val = va_arg(ap, char*); + r = xmalloc(sizeof(*r)); + r->next = pr->rule; + r->name = name; + r->value = xstrdup(val); + r->dups = NULL; + pr->rule = r; + } + pr->next = config_rules; + config_rules = pr; + va_end(ap); +} + +void policy_free(void) +{ + while (config_rules) { + struct pol_rule *pr = config_rules; + struct rule *r; + + config_rules = config_rules->next; + + for (r = pr->rule; r; ) { + struct rule *next = r->next; + free(r->value); + if (r->dups) + free_line(r->dups); + free(r); + r = next; + } + free(pr); + } + config_rules_end = NULL; + config_rules_has_path = 0; +} + +void dev_policy_free(struct dev_policy *p) +{ + struct dev_policy *t; + while (p) { + t = p; + p = p->next; + free(t); + } +} + +static enum policy_action map_act(const char *act) +{ + if (strcmp(act, "include") == 0) + return act_include; + if (strcmp(act, "re-add") == 0) + return act_re_add; + if (strcmp(act, "spare") == 0) + return act_spare; + if (strcmp(act, "spare-same-slot") == 0) + return act_spare_same_slot; + if (strcmp(act, "force-spare") == 0) + return act_force_spare; + return act_err; +} + +static enum policy_action policy_action(struct dev_policy *plist, const char *metadata) +{ + enum policy_action rv = act_default; + struct dev_policy *p; + + plist = pol_find(plist, pol_act); + pol_for_each(p, plist, metadata) { + enum policy_action a = map_act(p->value); + if (a > rv) + rv = a; + } + return rv; +} + +int policy_action_allows(struct dev_policy *plist, const char *metadata, enum policy_action want) +{ + enum policy_action act = policy_action(plist, metadata); + + if (act == act_err) + return 0; + return (act >= want); +} + +int disk_action_allows(struct mdinfo *disk, const char *metadata, enum policy_action want) +{ + struct dev_policy *pol = disk_policy(disk); + int rv = policy_action_allows(pol, metadata, want); + + dev_policy_free(pol); + return rv; +} + +/* Domain policy: + * Any device can have a list of domains asserted by different policy + * statements. + * An array also has a list of domains comprising all the domains of + * all the devices in an array. + * Where an array has a spare-group, that becomes an addition domain for + * every device in the array and thus for the array. + * + * We keep the list of domains in a sorted linked list + * As dev policies are already sorted, this is fairly easy to manage. + */ + +static struct domainlist **domain_merge_one(struct domainlist **domp, + const char *domain) +{ + /* merge a domain name into a sorted list and return the + * location of the insertion or match + */ + struct domainlist *dom = *domp; + + while (dom && strcmp(dom->dom, domain) < 0) { + domp = &dom->next; + dom = *domp; + } + if (dom == NULL || strcmp(dom->dom, domain) != 0) { + dom = xmalloc(sizeof(*dom)); + dom->next = *domp; + dom->dom = domain; + *domp = dom; + } + return domp; +} + +#if (DEBUG) +void dump_policy(struct dev_policy *policy) +{ + while (policy) { + dprintf("policy: %p name: %s value: %s metadata: %s\n", + policy, + policy->name, + policy->value, + policy->metadata); + policy = policy->next; + } +} +#endif + +void domain_merge(struct domainlist **domp, struct dev_policy *pollist, + const char *metadata) +{ + /* Add to 'domp' all the domains in pol that apply to 'metadata' + * which are not already in domp + */ + struct dev_policy *pol; + pollist = pol_find(pollist, pol_domain); + pol_for_each(pol, pollist, metadata) + domain_merge_one(domp, pol->value); +} + +int domain_test(struct domainlist *dom, struct dev_policy *pol, + const char *metadata) +{ + /* Check that all domains in pol (for metadata) are also in + * dom. Both lists are sorted. + * If pol has no domains, we don't really know about this device + * so we allow caller to choose: + * -1: has no domains + * 0: has domains, not all match + * 1: has domains, all match + */ + int found_any = -1; + struct dev_policy *p; + + pol = pol_find(pol, pol_domain); + pol_for_each(p, pol, metadata) { + found_any = 1; + while (dom && strcmp(dom->dom, p->value) < 0) + dom = dom->next; + if (!dom || strcmp(dom->dom, p->value) != 0) + return 0; + } + return found_any; +} + +void domainlist_add_dev(struct domainlist **dom, int devid, const char *metadata) +{ + struct dev_policy *pol = devid_policy(devid); + domain_merge(dom, pol, metadata); + dev_policy_free(pol); +} + +struct domainlist *domain_from_array(struct mdinfo *mdi, const char *metadata) +{ + struct domainlist *domlist = NULL; + + if (!mdi) + return NULL; + for (mdi = mdi->devs ; mdi ; mdi = mdi->next) + domainlist_add_dev(&domlist, makedev(mdi->disk.major, + mdi->disk.minor), + metadata); + + return domlist; +} + +void domain_add(struct domainlist **domp, char *domain) +{ + domain_merge_one(domp, domain); +} + +void domain_free(struct domainlist *dl) +{ + while (dl) { + struct domainlist *head = dl; + dl = dl->next; + free(head); + } +} + +/* + * same-path policy. + * Some policy decisions are guided by knowledge of which + * array previously owned the device at a given physical location (path). + * When removing a device from an array we might record the array against + * the path, and when finding a new device, we might look for which + * array previously used that path. + * + * The 'array' is described by a map_ent, and the path by a the disk in an + * mdinfo, or a string. + */ + +void policy_save_path(char *id_path, struct map_ent *array) +{ + char path[PATH_MAX]; + FILE *f = NULL; + + if (mkdir(FAILED_SLOTS_DIR, S_IRWXU) < 0 && errno != EEXIST) { + pr_err("can't create file to save path to old disk: %s\n", strerror(errno)); + return; + } + + snprintf(path, PATH_MAX, FAILED_SLOTS_DIR "/%s", id_path); + f = fopen(path, "w"); + if (!f) { + pr_err("can't create file to save path to old disk: %s\n", + strerror(errno)); + return; + } + + if (fprintf(f, "%s %08x:%08x:%08x:%08x\n", + array->metadata, + array->uuid[0], array->uuid[1], + array->uuid[2], array->uuid[3]) <= 0) + pr_err("Failed to write to <id_path> cookie\n"); + + fclose(f); +} + +int policy_check_path(struct mdinfo *disk, struct map_ent *array) +{ + char path[PATH_MAX]; + FILE *f = NULL; + char *id_path = disk_path(disk); + int rv; + + if (!id_path) + return 0; + + snprintf(path, PATH_MAX, FAILED_SLOTS_DIR "/%s", id_path); + f = fopen(path, "r"); + if (!f) { + free(id_path); + return 0; + } + + rv = fscanf(f, " %s %x:%x:%x:%x\n", + array->metadata, + array->uuid, + array->uuid+1, + array->uuid+2, + array->uuid+3); + fclose(f); + free(id_path); + return rv == 5; +} + +/* invocation of udev rule file */ +char udev_template_start[] = +"# do not edit this file, it is automatically generated by mdadm\n" +"\n"; + +/* find rule named rule_type and return its value */ +char *find_rule(struct rule *rule, char *rule_type) +{ + while (rule) { + if (rule->name == rule_type) + return rule->value; + + rule = rule->next; + } + return NULL; +} + +#define UDEV_RULE_FORMAT \ +"ACTION==\"add\", SUBSYSTEM==\"block\", " \ +"ENV{DEVTYPE}==\"%s\", ENV{ID_PATH}==\"%s\", " \ +"RUN+=\"" BINDIR "/mdadm --incremental $env{DEVNAME}\"\n" + +#define UDEV_RULE_FORMAT_NOTYPE \ +"ACTION==\"add\", SUBSYSTEM==\"block\", " \ +"ENV{ID_PATH}==\"%s\", " \ +"RUN+=\"" BINDIR "/mdadm --incremental $env{DEVNAME}\"\n" + +/* Write rule in the rule file. Use format from UDEV_RULE_FORMAT */ +int write_rule(struct rule *rule, int fd, int force_part) +{ + char line[1024]; + char *pth = find_rule(rule, rule_path); + char *typ = find_rule(rule, rule_type); + if (!pth) + return -1; + + if (force_part) + typ = type_part; + if (typ) + snprintf(line, sizeof(line) - 1, UDEV_RULE_FORMAT, typ, pth); + else + snprintf(line, sizeof(line) - 1, UDEV_RULE_FORMAT_NOTYPE, pth); + return write(fd, line, strlen(line)) == (int)strlen(line); +} + +/* Generate single entry in udev rule basing on POLICY line found in config + * file. Take only those with paths, only first occurrence if paths are equal + * and if actions supports handling of spares (>=act_spare_same_slot) + */ +int generate_entries(int fd) +{ + struct pol_rule *loop, *dup; + char *loop_value, *dup_value; + int duplicate; + + for (loop = config_rules; loop; loop = loop->next) { + if (loop->type != rule_policy && loop->type != rule_part) + continue; + duplicate = 0; + + /* only policies with paths and with actions supporting + * bare disks are considered */ + loop_value = find_rule(loop->rule, pol_act); + if (!loop_value || map_act(loop_value) < act_spare_same_slot) + continue; + loop_value = find_rule(loop->rule, rule_path); + if (!loop_value) + continue; + for (dup = config_rules; dup != loop; dup = dup->next) { + if (dup->type != rule_policy && loop->type != rule_part) + continue; + dup_value = find_rule(dup->rule, pol_act); + if (!dup_value || map_act(dup_value) < act_spare_same_slot) + continue; + dup_value = find_rule(dup->rule, rule_path); + if (!dup_value) + continue; + if (strcmp(loop_value, dup_value) == 0) { + duplicate = 1; + break; + } + } + + /* not a dup or first occurrence */ + if (!duplicate) + if (!write_rule(loop->rule, fd, loop->type == rule_part) ) + return 0; + } + return 1; +} + +/* Write_rules routine creates dynamic udev rules used to handle + * hot-plug events for bare devices (and making them spares) + */ +int Write_rules(char *rule_name) +{ + int fd; + char udev_rule_file[PATH_MAX]; + + if (rule_name) { + strncpy(udev_rule_file, rule_name, sizeof(udev_rule_file) - 6); + udev_rule_file[sizeof(udev_rule_file) - 6] = '\0'; + strcat(udev_rule_file, ".temp"); + fd = creat(udev_rule_file, + S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH); + if (fd == -1) + return 1; + } else + fd = 1; + + /* write static invocation */ + if (write(fd, udev_template_start, + sizeof(udev_template_start) - 1) + != (int)sizeof(udev_template_start)-1) + goto abort; + + /* iterate, if none created or error occurred, remove file */ + if (generate_entries(fd) < 0) + goto abort; + + fsync(fd); + if (rule_name) { + close(fd); + rename(udev_rule_file, rule_name); + } + return 0; +abort: + if (rule_name) { + close(fd); + unlink(udev_rule_file); + } + return 1; +} diff --git a/probe_roms.c b/probe_roms.c new file mode 100644 index 00000000..b0b08833 --- /dev/null +++ b/probe_roms.c @@ -0,0 +1,317 @@ +/* + * probe_roms - scan for Adapter ROMS + * + * (based on linux-2.6:arch/x86/kernel/probe_roms_32.c) + * + * Copyright (C) 2008 Intel Corporation + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. + */ + +#include "probe_roms.h" +#include "mdadm.h" +#include <unistd.h> +#include <signal.h> +#include <fcntl.h> +#include <sys/mman.h> +#include <sys/stat.h> +#include <sys/types.h> +#include <asm/types.h> + +static void *rom_mem = MAP_FAILED; +static int rom_fd = -1; +static const int rom_len = 0xf0000 - 0xc0000; /* option-rom memory region */ +static int _sigbus; +static unsigned long rom_align; + +static void sigbus(int sig) +{ + _sigbus = 1; +} + +static int probe_address8(const __u8 *ptr, __u8 *val) +{ + int rc = 0; + + *val = *ptr; + if (_sigbus) + rc = -1; + _sigbus = 0; + + return rc; +} + +static int probe_address16(const __u16 *ptr, __u16 *val) +{ + int rc = 0; + + *val = *ptr; + if (_sigbus) + rc = -1; + _sigbus = 0; + + return rc; +} + +void probe_roms_exit(void) +{ + signal(SIGBUS, SIG_DFL); + if (rom_fd >= 0) { + close(rom_fd); + rom_fd = -1; + } + if (rom_mem != MAP_FAILED) { + munmap(rom_mem, rom_len); + rom_mem = MAP_FAILED; + } +} + +int probe_roms_init(unsigned long align) +{ + int fd = -1; + int rc = 0; + + /* valid values are 2048 and 512. 512 is for PCI-3.0 compliant + * systems, or systems that do not have dangerous/legacy ISA + * devices. 2048 should always be safe + */ + if (align == 512 || align == 2048) + rom_align = align; + else + return -1; + + if (signal(SIGBUS, sigbus) == SIG_ERR) + rc = -1; + if (rc == 0) { + fd = open("/dev/mem", O_RDONLY); + if (fd < 0) + rc = -1; + } + if (rc == 0) { + rom_mem = mmap(NULL, rom_len, PROT_READ, MAP_PRIVATE, fd, 0xc0000); + if (rom_mem == MAP_FAILED) + rc = -1; + } + + if (rc == 0) + rom_fd = fd; + else { + if (fd >= 0) + close(fd); + probe_roms_exit(); + } + return rc; +} + +/** + * isa_bus_to_virt - convert physical address to mmap'd region + * @addr - address to convert + * + * Only valid between a successful call to probe_roms_init and the + * corresponding probe_roms_exit + */ +static void *isa_bus_to_virt(unsigned long addr) +{ + return rom_mem + (addr - 0xc0000); +} + +struct resource { + unsigned long start; + unsigned long end; + unsigned long data; + const char *name; +}; + +static struct resource system_rom_resource = { + .name = "System ROM", + .start = 0xf0000, + .data = 0, + .end = 0xfffff, +}; + +static struct resource extension_rom_resource = { + .name = "Extension ROM", + .start = 0xe0000, + .data = 0, + .end = 0xeffff, +}; + +static struct resource adapter_rom_resources[] = { { + .name = "Adapter ROM", + .start = 0xc8000, + .data = 0, + .end = 0, +}, { + .name = "Adapter ROM", + .start = 0, + .data = 0, + .end = 0, +}, { + .name = "Adapter ROM", + .start = 0, + .data = 0, + .end = 0, +}, { + .name = "Adapter ROM", + .start = 0, + .data = 0, + .end = 0, +}, { + .name = "Adapter ROM", + .start = 0, + .data = 0, + .end = 0, +}, { + .name = "Adapter ROM", + .start = 0, + .data = 0, + .end = 0, +} }; + +static struct resource video_rom_resource = { + .name = "Video ROM", + .start = 0xc0000, + .data = 0, + .end = 0xc7fff, +}; + +#define ROMSIGNATURE 0xaa55 + +static int romsignature(const unsigned char *rom) +{ + const unsigned short * const ptr = (const unsigned short *)rom; + unsigned short sig = 0; + + return probe_address16(ptr, &sig) == 0 && sig == ROMSIGNATURE; +} + +static int romchecksum(const unsigned char *rom, unsigned long length) +{ + unsigned char sum, c; + + for (sum = 0; length && probe_address8(rom++, &c) == 0; length--) + sum += c; + return !length && !sum; +} + +int scan_adapter_roms(scan_fn fn) +{ + /* let scan_fn examing each of the adapter roms found by probe_roms */ + unsigned int i; + int found; + + if (rom_fd < 0) + return 0; + + found = 0; + for (i = 0; i < ARRAY_SIZE(adapter_rom_resources); i++) { + struct resource *res = &adapter_rom_resources[i]; + + if (res->start) { + found = fn(isa_bus_to_virt(res->start), + isa_bus_to_virt(res->end), + isa_bus_to_virt(res->data)); + if (found) + break; + } else + break; + } + + return found; +} + +static unsigned long align(unsigned long addr, unsigned long alignment) +{ + return (addr + alignment - 1) & ~(alignment - 1); +} + +void probe_roms(void) +{ + const void *rom; + unsigned long start, length, upper; + unsigned char c; + unsigned int i; + __u16 val=0; + + if (rom_fd < 0) + return; + + /* video rom */ + upper = adapter_rom_resources[0].start; + for (start = video_rom_resource.start; start < upper; start += rom_align) { + rom = isa_bus_to_virt(start); + if (!romsignature(rom)) + continue; + + video_rom_resource.start = start; + + if (probe_address8(rom + 2, &c) != 0) + continue; + + /* 0 < length <= 0x7f * 512, historically */ + length = c * 512; + + /* if checksum okay, trust length byte */ + if (length && romchecksum(rom, length)) + video_rom_resource.end = start + length - 1; + break; + } + + start = align(video_rom_resource.end + 1, rom_align); + if (start < upper) + start = upper; + + /* system rom */ + upper = system_rom_resource.start; + + /* check for extension rom (ignore length byte!) */ + rom = isa_bus_to_virt(extension_rom_resource.start); + if (romsignature(rom)) { + length = extension_rom_resource.end - extension_rom_resource.start + 1; + if (romchecksum(rom, length)) + upper = extension_rom_resource.start; + } + + /* check for adapter roms on 2k boundaries */ + for (i = 0; i < ARRAY_SIZE(adapter_rom_resources) && start < upper; start += rom_align) { + rom = isa_bus_to_virt(start); + if (!romsignature(rom)) + continue; + + if (probe_address8(rom + 2, &c) != 0) + continue; + + /* 0 < length <= 0x7f * 512, historically */ + length = c * 512; + + /* Retrieve 16-bit pointer to PCI Data Structure (offset 18h-19h) + * The data can be within 64KB forward of the first location + * of this code image. The pointer is in little-endian order + */ + + if (probe_address16(rom + 0x18, &val) != 0) + continue; + val = __le16_to_cpu(val); + + /* but accept any length that fits if checksum okay */ + if (!length || start + length > upper || !romchecksum(rom, length)) + continue; + + adapter_rom_resources[i].start = start; + adapter_rom_resources[i].data = start + (unsigned long) val; + adapter_rom_resources[i].end = start + length - 1; + + start = adapter_rom_resources[i++].end & ~(rom_align - 1); + } +} diff --git a/probe_roms.h b/probe_roms.h new file mode 100644 index 00000000..6d70411a --- /dev/null +++ b/probe_roms.h @@ -0,0 +1,24 @@ +/* + * probe_roms - scan for Adapter ROMS + * + * Copyright (C) 2008 Intel Corporation + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. + */ + +void probe_roms_exit(void); +int probe_roms_init(unsigned long align); +typedef int (*scan_fn)(const void *start, const void *end, const void *data); +int scan_adapter_roms(scan_fn fn); +void probe_roms(void); @@ -0,0 +1,17 @@ + +/* + * We cannot link a static binary with passwd/group support, so + * just do without + */ +#include <stdlib.h> +#include <pwd.h> +#include <grp.h> + +struct passwd *getpwnam(const char *name) +{ + return NULL; +} +struct group *getgrnam(const char *name) +{ + return NULL; +} diff --git a/raid5extend.c b/raid5extend.c new file mode 100644 index 00000000..d8e62c2c --- /dev/null +++ b/raid5extend.c @@ -0,0 +1,80 @@ + +int phys2log(int phys, int stripe, int n, int layout) +{ + /* In an 'n' disk array using 'layout', + * in stripe 'stripe', the physical disc 'phys' + * stores what logical chunk? + * -1 mean parity. + * + */ + switch(layout) { + case ALGORITHM_LEFT_ASYMMETRIC: + pd = (n-1) - (stripe % n); + if (phys < pd) + return phys; + else if (phys == pd) + return -1; + else return phys-1; + + case ALGORITHM_RIGHT_ASYMMETRIC: + pd = stripe % n; + if (phys < pd) + return phys; + else if (phys == pd) + return -1; + else return phys-1; + + case ALGORITHM_LEFT_SYMMETRIC: + pd = (n-1) - (stripe %n); + if (phys < pd) + return phys+ n-1-pd; + else if (phys == pd) + return -1; + else return phys-pd-1; + + case ALGORITHM_RIGHT_SYMMETRIC: + pd = stripe % n; + if (phys < pd) + return phys+ n-1-pd; + else if (phys == pd) + return -1; + else return phys-pd-1; + } + return -2; +} + +raid5_extend(unsigned long len, int chunksize, int layout, int n, int m, int rfds[], int wfds[]) +{ + + static char buf[4096]; + + unsigned long blocks = len/4; + unsigned int blocksperchunk= chunksize/4096; + + unsigned long b; + + for (b=0; b<blocks; b++) { + unsigned long stripe = b / blocksperchunk; + unsigned int offset = b - (stripe*blocksperchunk); + unsigned long chunk = stripe * (n-1); + int src; + for (src=0; src<n; src++) { + int dnum, snum; + if (read(rfds[src], buf, sizeof(buf)) != sizeof(buf)) { + error(); + return 0; + } + + snum = phys2log(src, stripe, n, layout); + + if (snum == -1) + continue; + chunk = stripe*(n-1)+snum; + + dstripe = chunk/(m-1); + dnum = log2phys(chunk-(stripe*(m-1)), dstripe, m, layout); + llseek(wfds[dnum], dstripe*chunksize+(offset*4096), 0); + write(wfds[dnum], buf, sizeof(buf)); + } + } +} diff --git a/raid6check.8 b/raid6check.8 new file mode 100644 index 00000000..50033430 --- /dev/null +++ b/raid6check.8 @@ -0,0 +1,96 @@ +.\" -*- nroff -*- +.\" Copyright Piergiorgio Sartor and others. +.\" This program is free software; you can redistribute it and/or modify +.\" it under the terms of the GNU General Public License as published by +.\" the Free Software Foundation; either version 2 of the License, or +.\" (at your option) any later version. +.\" See file COPYING in distribution for details. +.TH RAID6CHECK 8 "" v1.0.0 +.SH NAME +raid6check \- check MD RAID6 device for errors +.I aka +Linux Software RAID + +.SH SYNOPSIS + +.BI raid6check " <raid6 device> <start stripe> <number of stripes>" + +.SH DESCRIPTION +RAID6 devices in which one single component drive has errors can use +the double parity in order to find out which component drive. +The "raid6check" tool checks, for each stripe, the double parity +consistency, reports mismatches and, if possible, which +component drive has the mismatch. +Since it works at stripe level, it can report different drives with +mismatches at different stripes. + +"raid6check" requires a non-degraded RAID6 MD device as first +parameter, a starting stripe (usually 0) and the number of stripes +to be checked. +If this third parameter is also 0, it will check the array up to +the end. + +"raid6check" will start printing information about the RAID6, then +for each stripe, it will report the parity rotation status. +In case of parity mismatches, "raid6check" reports, if possible, +which component drive could be responsible. Otherwise it reports +that it is not possible to find the component drive. + +If the given MD device is not a RAID6, "raid6check" will, of +course, not continue. + +If the RAID6 MD device is degraded, "raid6check" will report +an error and it will not proceed further. + +No write operations are performed on the array or the components. +Furthermore, the checked array can be online and in use during +the operation of "raid6check". + +.SH EXAMPLES + +.B " raid6check /dev/md0 0 0" +.br +This will check /dev/md0 from start to end. + +.B " raid6check /dev/md3 0 1" +.br +This will check the first stripe of /dev/md3. + +.B " raid6check /dev/md1 1000 0" +.br +This will check /dev/md1 from stripe 1000 up to the end. + +.B " raid6check /dev/m127 128 256" +.br +This will check 256 stripes of /dev/md127 starting from stripe 128. + +.B " raid6check /dev/md0 0 0 | grep -i error > md0_err.log" +.br +This will check /dev/md0 completely and create a log file only +with errors, if any. + +.SH FILES + +"raid6check" uses directly the component drives as found in /dev. +Furthermore, the sysfs interface is needed in order to find out +the RAID6 parameters. + +.SH BUGS +Negative parameters can lead to unexpected results. + +It is not clear what will happen if the RAID6 MD device gets +degraded during the check. + +.PP +The latest version of +.I raid6check +should always be available from +.IP +.B http://www.kernel.org/pub/linux/utils/raid/mdadm/ +.PP +Related man pages: +.PP +.IR mdadm (8) +.IR mdmon (8), +.IR mdadm.conf (5), +.IR md (4). diff --git a/raid6check.c b/raid6check.c new file mode 100644 index 00000000..cb8522e5 --- /dev/null +++ b/raid6check.c @@ -0,0 +1,712 @@ +/* + * raid6check - extended consistency check for RAID-6 + * + * Copyright (C) 2011 Piergiorgio Sartor + * + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Author: Piergiorgio Sartor + * Based on "restripe.c" from "mdadm" codebase + */ + +#include "mdadm.h" +#include <stdint.h> +#include <signal.h> +#include <sys/mman.h> + +#define CHECK_PAGE_BITS (12) +#define CHECK_PAGE_SIZE (1 << CHECK_PAGE_BITS) + +char const Name[] = "raid6check"; + +enum repair { + NO_REPAIR = 0, + MANUAL_REPAIR, + AUTO_REPAIR +}; + +int geo_map(int block, unsigned long long stripe, int raid_disks, + int level, int layout); +int is_ddf(int layout); +void qsyndrome(uint8_t *p, uint8_t *q, uint8_t **sources, int disks, int size); +void make_tables(void); +void ensure_zero_has_size(int chunk_size); +void raid6_datap_recov(int disks, size_t bytes, int faila, uint8_t **ptrs, + int neg_offset); +void raid6_2data_recov(int disks, size_t bytes, int faila, int failb, + uint8_t **ptrs, int neg_offset); +void xor_blocks(char *target, char **sources, int disks, int size); + +/* Collect per stripe consistency information */ +void raid6_collect(int chunk_size, uint8_t *p, uint8_t *q, + char *chunkP, char *chunkQ, int *results) +{ + int i; + int data_id; + uint8_t Px, Qx; + extern uint8_t raid6_gflog[]; + + for(i = 0; i < chunk_size; i++) { + Px = (uint8_t)chunkP[i] ^ (uint8_t)p[i]; + Qx = (uint8_t)chunkQ[i] ^ (uint8_t)q[i]; + + if((Px != 0) && (Qx == 0)) + results[i] = -1; + + if((Px == 0) && (Qx != 0)) + results[i] = -2; + + if((Px != 0) && (Qx != 0)) { + data_id = (raid6_gflog[Qx] - raid6_gflog[Px]); + if(data_id < 0) data_id += 255; + results[i] = data_id; + } + + if((Px == 0) && (Qx == 0)) + results[i] = -255; + } +} + +/* Try to find out if a specific disk has problems in a CHECK_PAGE_SIZE page size */ +int raid6_stats_blk(int *results, int raid_disks) +{ + int i; + int curr_broken_disk = -255; + int prev_broken_disk = -255; + int broken_status = 0; + + for(i = 0; i < CHECK_PAGE_SIZE; i++) { + + if(results[i] != -255) + curr_broken_disk = results[i]; + + if(curr_broken_disk >= raid_disks) + broken_status = 2; + + switch(broken_status) { + case 0: + if(curr_broken_disk != -255) { + prev_broken_disk = curr_broken_disk; + broken_status = 1; + } + break; + + case 1: + if(curr_broken_disk != prev_broken_disk) + broken_status = 2; + break; + + case 2: + default: + curr_broken_disk = prev_broken_disk = -65535; + break; + } + } + + return curr_broken_disk; +} + +/* Collect disks status for a strip in CHECK_PAGE_SIZE page size blocks */ +void raid6_stats(int *disk, int *results, int raid_disks, int chunk_size) +{ + int i, j; + + for(i = 0, j = 0; i < chunk_size; i += CHECK_PAGE_SIZE, j++) { + disk[j] = raid6_stats_blk(&results[i], raid_disks); + } +} + +int lock_stripe(struct mdinfo *info, unsigned long long start, + int chunk_size, int data_disks, sighandler_t *sig) { + int rv; + if(mlockall(MCL_CURRENT | MCL_FUTURE) != 0) { + return 2; + } + + sig[0] = signal(SIGTERM, SIG_IGN); + sig[1] = signal(SIGINT, SIG_IGN); + sig[2] = signal(SIGQUIT, SIG_IGN); + + rv = sysfs_set_num(info, NULL, "suspend_lo", start * chunk_size * data_disks); + rv |= sysfs_set_num(info, NULL, "suspend_hi", (start + 1) * chunk_size * data_disks); + return rv * 256; +} + +int unlock_all_stripes(struct mdinfo *info, sighandler_t *sig) { + int rv; + rv = sysfs_set_num(info, NULL, "suspend_lo", 0x7FFFFFFFFFFFFFFFULL); + rv |= sysfs_set_num(info, NULL, "suspend_hi", 0); + rv |= sysfs_set_num(info, NULL, "suspend_lo", 0); + + signal(SIGQUIT, sig[2]); + signal(SIGINT, sig[1]); + signal(SIGTERM, sig[0]); + + if(munlockall() != 0) + return 3; + return rv * 256; +} + +/* Autorepair */ +int autorepair(int *disk, unsigned long long start, int chunk_size, + char *name[], int raid_disks, int syndrome_disks, char **blocks_page, + char **blocks, uint8_t *p, int *block_index_for_slot, + int *source, unsigned long long *offsets) +{ + int i, j; + int pages_to_write_count = 0; + int page_to_write[chunk_size >> CHECK_PAGE_BITS]; + for(j = 0; j < (chunk_size >> CHECK_PAGE_BITS); j++) { + if (disk[j] >= -2 && block_index_for_slot[disk[j]] >= 0) { + int slot = block_index_for_slot[disk[j]]; + printf("Auto-repairing slot %d (%s)\n", slot, name[slot]); + pages_to_write_count++; + page_to_write[j] = 1; + for(i = -2; i < syndrome_disks; i++) { + blocks_page[i] = blocks[i] + j * CHECK_PAGE_SIZE; + } + if (disk[j] == -2) { + qsyndrome(p, (uint8_t*)blocks_page[-2], + (uint8_t**)blocks_page, + syndrome_disks, CHECK_PAGE_SIZE); + } + else { + char *all_but_failed_blocks[syndrome_disks]; + for(i = 0; i < syndrome_disks; i++) { + if (i == disk[j]) + all_but_failed_blocks[i] = blocks_page[-1]; + else + all_but_failed_blocks[i] = blocks_page[i]; + } + xor_blocks(blocks_page[disk[j]], + all_but_failed_blocks, syndrome_disks, + CHECK_PAGE_SIZE); + } + } + else { + page_to_write[j] = 0; + } + } + + if(pages_to_write_count > 0) { + int write_res = 0; + for(j = 0; j < (chunk_size >> CHECK_PAGE_BITS); j++) { + if(page_to_write[j] == 1) { + int slot = block_index_for_slot[disk[j]]; + lseek64(source[slot], offsets[slot] + start * chunk_size + j * CHECK_PAGE_SIZE, SEEK_SET); + write_res += write(source[slot], + blocks[disk[j]] + j * CHECK_PAGE_SIZE, + CHECK_PAGE_SIZE); + } + } + + if (write_res != (CHECK_PAGE_SIZE * pages_to_write_count)) { + fprintf(stderr, "Failed to write a full chunk.\n"); + return -1; + } + } + + return 0; +} + +/* Manual repair */ +int manual_repair(int chunk_size, int syndrome_disks, + int failed_slot1, int failed_slot2, + unsigned long long start, int *block_index_for_slot, + char *name[], char **stripes, char **blocks, uint8_t *p, + int *source, unsigned long long *offsets) +{ + int i; + int fd1 = block_index_for_slot[failed_slot1]; + int fd2 = block_index_for_slot[failed_slot2]; + printf("Repairing stripe %llu\n", start); + printf("Assuming slots %d (%s) and %d (%s) are incorrect\n", + fd1, name[fd1], + fd2, name[fd2]); + + if (failed_slot1 == -2 || failed_slot2 == -2) { + char *all_but_failed_blocks[syndrome_disks]; + int failed_data_or_p; + + if (failed_slot1 == -2) + failed_data_or_p = failed_slot2; + else + failed_data_or_p = failed_slot1; + + printf("Repairing D/P(%d) and Q\n", failed_data_or_p); + + for (i = 0; i < syndrome_disks; i++) { + if (i == failed_data_or_p) + all_but_failed_blocks[i] = blocks[-1]; + else + all_but_failed_blocks[i] = blocks[i]; + } + xor_blocks(blocks[failed_data_or_p], + all_but_failed_blocks, syndrome_disks, chunk_size); + qsyndrome(p, (uint8_t*)blocks[-2], (uint8_t**)blocks, + syndrome_disks, chunk_size); + } else { + ensure_zero_has_size(chunk_size); + if (failed_slot1 == -1 || failed_slot2 == -1) { + int failed_data; + if (failed_slot1 == -1) + failed_data = failed_slot2; + else + failed_data = failed_slot1; + printf("Repairing D(%d) and P\n", failed_data); + raid6_datap_recov(syndrome_disks+2, chunk_size, + failed_data, (uint8_t**)blocks, 1); + } else { + printf("Repairing D and D\n"); + raid6_2data_recov(syndrome_disks+2, chunk_size, + failed_slot1, failed_slot2, + (uint8_t**)blocks, 1); + } + } + + int write_res1, write_res2; + off64_t seek_res; + + seek_res = lseek64(source[fd1], + offsets[fd1] + start * chunk_size, SEEK_SET); + if (seek_res < 0) { + fprintf(stderr, "lseek failed for failed_disk1\n"); + return -1; + } + write_res1 = write(source[fd1], blocks[failed_slot1], chunk_size); + + seek_res = lseek64(source[fd2], + offsets[fd2] + start * chunk_size, SEEK_SET); + if (seek_res < 0) { + fprintf(stderr, "lseek failed for failed_disk2\n"); + return -1; + } + write_res2 = write(source[fd2], blocks[failed_slot2], chunk_size); + + if (write_res1 != chunk_size || write_res2 != chunk_size) { + fprintf(stderr, "Failed to write a complete chunk.\n"); + return -2; + } + + return 0; +} + +int check_stripes(struct mdinfo *info, int *source, unsigned long long *offsets, + int raid_disks, int chunk_size, int level, int layout, + unsigned long long start, unsigned long long length, char *name[], + enum repair repair, int failed_disk1, int failed_disk2) +{ + /* read the data and p and q blocks, and check we got them right */ + int data_disks = raid_disks - 2; + int syndrome_disks = data_disks + is_ddf(layout) * 2; + char *stripe_buf; + + /* stripes[] is indexed by raid_disk and holds chunks from each device */ + char **stripes = xmalloc(raid_disks * sizeof(char*)); + + /* blocks[] is indexed by syndrome number and points to either one of the + * chunks from 'stripes[]', or to a chunk of zeros. -1 and -2 are + * P and Q */ + char **blocks = xmalloc((syndrome_disks + 2) * sizeof(char*)); + + /* blocks_page[] is a temporary index to just one page of the chunks + * that blocks[] points to. */ + char **blocks_page = xmalloc((syndrome_disks + 2) * sizeof(char*)); + + /* block_index_for_slot[] provides the reverse mapping from blocks to stripes. + * The index is a syndrome position, the content is a raid_disk number. + * indicies -1 and -2 work, and are P and Q disks */ + int *block_index_for_slot = xmalloc((syndrome_disks+2) * sizeof(int)); + + /* 'p' and 'q' contain calcualted P and Q, to be compared with + * blocks[-1] and blocks[-2]; + */ + uint8_t *p = xmalloc(chunk_size); + uint8_t *q = xmalloc(chunk_size); + char *zero = xmalloc(chunk_size); + int *results = xmalloc(chunk_size * sizeof(int)); + sighandler_t *sig = xmalloc(3 * sizeof(sighandler_t)); + + int i, j; + int diskP, diskQ, diskD; + int err = 0; + + extern int tables_ready; + + if (!tables_ready) + make_tables(); + + posix_memalign((void**)&stripe_buf, 4096, raid_disks * chunk_size); + block_index_for_slot += 2; + blocks += 2; + blocks_page += 2; + + memset(zero, 0, chunk_size); + for ( i = 0 ; i < raid_disks ; i++) + stripes[i] = stripe_buf + i * chunk_size; + + while (length > 0) { + /* The syndrome number of the broken disk is recorded + * in 'disk[]' which allows a different broken disk for + * each page. + */ + int disk[chunk_size >> CHECK_PAGE_BITS]; + + err = lock_stripe(info, start, chunk_size, data_disks, sig); + if(err != 0) { + if (err != 2) + unlock_all_stripes(info, sig); + goto exitCheck; + } + for (i = 0 ; i < raid_disks ; i++) { + off64_t seek_res = lseek64(source[i], offsets[i] + start * chunk_size, + SEEK_SET); + if (seek_res < 0) { + fprintf(stderr, "lseek to source %d failed\n", i); + unlock_all_stripes(info, sig); + err = -1; + goto exitCheck; + } + int read_res = read(source[i], stripes[i], chunk_size); + if (read_res < chunk_size) { + fprintf(stderr, "Failed to read complete chunk disk %d, aborting\n", i); + unlock_all_stripes(info, sig); + err = -1; + goto exitCheck; + } + } + + diskP = geo_map(-1, start, raid_disks, level, layout); + block_index_for_slot[-1] = diskP; + blocks[-1] = stripes[diskP]; + + diskQ = geo_map(-2, start, raid_disks, level, layout); + block_index_for_slot[-2] = diskQ; + blocks[-2] = stripes[diskQ]; + + if (!is_ddf(layout)) { + /* The syndrome-order of disks starts immediately after 'Q', + * but skips P */ + diskD = diskQ; + for (i = 0 ; i < data_disks ; i++) { + diskD = diskD + 1; + if (diskD >= raid_disks) + diskD = 0; + if (diskD == diskP) + diskD += 1; + if (diskD >= raid_disks) + diskD = 0; + blocks[i] = stripes[diskD]; + block_index_for_slot[i] = diskD; + } + } else { + /* The syndrome-order exactly follows raid-disk + * numbers, with ZERO in place of P and Q + */ + for (i = 0 ; i < raid_disks; i++) { + if (i == diskP || i == diskQ) { + blocks[i] = zero; + block_index_for_slot[i] = -1; + } else { + blocks[i] = stripes[i]; + block_index_for_slot[i] = i; + } + } + } + + qsyndrome(p, q, (uint8_t**)blocks, syndrome_disks, chunk_size); + + raid6_collect(chunk_size, p, q, stripes[diskP], stripes[diskQ], results); + raid6_stats(disk, results, raid_disks, chunk_size); + + for(j = 0; j < (chunk_size >> CHECK_PAGE_BITS); j++) { + int role = disk[j]; + if (role >= -2) { + int slot = block_index_for_slot[role]; + if (slot >= 0) + printf("Error detected at stripe %llu, page %d: possible failed disk slot %d: %d --> %s\n", + start, j, role, slot, name[slot]); + else + printf("Error detected at stripe %llu, page %d: failed slot %d should be zeros\n", + start, j, role); + } else if(disk[j] == -65535) { + printf("Error detected at stripe %llu, page %d: disk slot unknown\n", start, j); + } + } + + if(repair == AUTO_REPAIR) { + err = autorepair(disk, start, chunk_size, + name, raid_disks, syndrome_disks, blocks_page, + blocks, p, block_index_for_slot, + source, offsets); + if(err != 0) { + unlock_all_stripes(info, sig); + goto exitCheck; + } + } + + if(repair == MANUAL_REPAIR) { + int failed_slot1 = -1, failed_slot2 = -1; + for (i = -2; i < syndrome_disks; i++) { + if (block_index_for_slot[i] == failed_disk1) + failed_slot1 = i; + if (block_index_for_slot[i] == failed_disk2) + failed_slot2 = i; + } + err = manual_repair(chunk_size, syndrome_disks, + failed_slot1, failed_slot2, + start, block_index_for_slot, + name, stripes, blocks, p, + source, offsets); + if(err == -1) { + unlock_all_stripes(info, sig); + goto exitCheck; + } + } + + err = unlock_all_stripes(info, sig); + if(err != 0) { + goto exitCheck; + } + + length--; + start++; + } + +exitCheck: + + free(stripe_buf); + free(stripes); + free(blocks-2); + free(blocks_page-2); + free(block_index_for_slot-2); + free(p); + free(q); + free(results); + free(sig); + + return err; +} + +unsigned long long getnum(char *str, char **err) +{ + char *e; + unsigned long long rv = strtoull(str, &e, 10); + if (e==str || *e) { + *err = str; + return 0; + } + return rv; +} + +int main(int argc, char *argv[]) +{ + /* md_device start length */ + int *fds = NULL; + char *buf = NULL; + char **disk_name = NULL; + unsigned long long *offsets = NULL; + int raid_disks = 0; + int active_disks; + int chunk_size = 0; + int layout = -1; + int level = 6; + enum repair repair = NO_REPAIR; + int failed_disk1 = -1; + int failed_disk2 = -1; + unsigned long long start, length; + int i; + int mdfd; + struct mdinfo *info = NULL, *comp = NULL; + char *err = NULL; + int exit_err = 0; + int close_flag = 0; + char *prg = strrchr(argv[0], '/'); + + if (prg == NULL) + prg = argv[0]; + else + prg++; + + if (argc < 4) { + fprintf(stderr, "Usage: %s md_device start_stripe length_stripes [autorepair]\n", prg); + fprintf(stderr, " or: %s md_device repair stripe failed_slot_1 failed_slot_2\n", prg); + exit_err = 1; + goto exitHere; + } + + mdfd = open(argv[1], O_RDONLY); + if(mdfd < 0) { + perror(argv[1]); + fprintf(stderr, "%s: cannot open %s\n", prg, argv[1]); + exit_err = 2; + goto exitHere; + } + + info = sysfs_read(mdfd, NULL, + GET_LEVEL| + GET_LAYOUT| + GET_DISKS| + GET_DEGRADED | + GET_COMPONENT| + GET_CHUNK| + GET_DEVS| + GET_OFFSET| + GET_SIZE); + + if(info == NULL) { + fprintf(stderr, "%s: Error reading sysfs information of %s\n", prg, argv[1]); + exit_err = 9; + goto exitHere; + } + + if(info->array.level != level) { + fprintf(stderr, "%s: %s not a RAID-6\n", prg, argv[1]); + exit_err = 3; + goto exitHere; + } + + if(info->array.failed_disks > 0) { + fprintf(stderr, "%s: %s degraded array\n", prg, argv[1]); + exit_err = 8; + goto exitHere; + } + + printf("layout: %d\n", info->array.layout); + printf("disks: %d\n", info->array.raid_disks); + printf("component size: %llu\n", info->component_size * 512); + printf("total stripes: %llu\n", (info->component_size * 512) / info->array.chunk_size); + printf("chunk size: %d\n", info->array.chunk_size); + printf("\n"); + + comp = info->devs; + for(i = 0, active_disks = 0; active_disks < info->array.raid_disks; i++) { + printf("disk: %d - offset: %llu - size: %llu - name: %s - slot: %d\n", + i, comp->data_offset * 512, comp->component_size * 512, + map_dev(comp->disk.major, comp->disk.minor, 0), + comp->disk.raid_disk); + if(comp->disk.raid_disk >= 0) + active_disks++; + comp = comp->next; + } + printf("\n"); + + close(mdfd); + + raid_disks = info->array.raid_disks; + chunk_size = info->array.chunk_size; + layout = info->array.layout; + if (strcmp(argv[2], "repair")==0) { + if (argc < 6) { + fprintf(stderr, "For repair mode, call %s md_device repair stripe failed_slot_1 failed_slot_2\n", prg); + exit_err = 1; + goto exitHere; + } + repair = MANUAL_REPAIR; + start = getnum(argv[3], &err); + length = 1; + failed_disk1 = getnum(argv[4], &err); + failed_disk2 = getnum(argv[5], &err); + + if(failed_disk1 >= info->array.raid_disks) { + fprintf(stderr, "%s: failed_slot_1 index is higher than number of devices in raid\n", prg); + exit_err = 4; + goto exitHere; + } + if(failed_disk2 >= info->array.raid_disks) { + fprintf(stderr, "%s: failed_slot_2 index is higher than number of devices in raid\n", prg); + exit_err = 4; + goto exitHere; + } + if(failed_disk1 == failed_disk2) { + fprintf(stderr, "%s: failed_slot_1 and failed_slot_2 are the same\n", prg); + exit_err = 4; + goto exitHere; + } + } + else { + start = getnum(argv[2], &err); + length = getnum(argv[3], &err); + if (argc >= 5 && strcmp(argv[4], "autorepair")==0) + repair = AUTO_REPAIR; + } + + if (err) { + fprintf(stderr, "%s: Bad number: %s\n", prg, err); + exit_err = 4; + goto exitHere; + } + + if(start > ((info->component_size * 512) / chunk_size)) { + start = (info->component_size * 512) / chunk_size; + fprintf(stderr, "%s: start beyond disks size\n", prg); + } + + if((length == 0) || + ((length + start) > ((info->component_size * 512) / chunk_size))) { + length = (info->component_size * 512) / chunk_size - start; + } + + disk_name = xmalloc(raid_disks * sizeof(*disk_name)); + fds = xmalloc(raid_disks * sizeof(*fds)); + offsets = xcalloc(raid_disks, sizeof(*offsets)); + buf = xmalloc(raid_disks * chunk_size); + + for(i=0; i<raid_disks; i++) { + fds[i] = -1; + } + close_flag = 1; + + comp = info->devs; + for (i=0, active_disks=0; active_disks<raid_disks; i++) { + int disk_slot = comp->disk.raid_disk; + if(disk_slot >= 0) { + disk_name[disk_slot] = map_dev(comp->disk.major, comp->disk.minor, 0); + offsets[disk_slot] = comp->data_offset * 512; + fds[disk_slot] = open(disk_name[disk_slot], O_RDWR | O_DIRECT); + if (fds[disk_slot] < 0) { + perror(disk_name[disk_slot]); + fprintf(stderr,"%s: cannot open %s\n", prg, disk_name[disk_slot]); + exit_err = 6; + goto exitHere; + } + active_disks++; + } + comp = comp->next; + } + + int rv = check_stripes(info, fds, offsets, + raid_disks, chunk_size, level, layout, + start, length, disk_name, repair, failed_disk1, failed_disk2); + if (rv != 0) { + fprintf(stderr, "%s: check_stripes returned %d\n", prg, rv); + exit_err = 7; + goto exitHere; + } + +exitHere: + + if (close_flag) + for(i = 0; i < raid_disks; i++) + close(fds[i]); + + free(disk_name); + free(fds); + free(offsets); + free(buf); + + exit(exit_err); +} diff --git a/restripe.c b/restripe.c new file mode 100644 index 00000000..4d921904 --- /dev/null +++ b/restripe.c @@ -0,0 +1,1008 @@ +/* + * mdadm - manage Linux "md" devices aka RAID arrays. + * + * Copyright (C) 2006-2009 Neil Brown <neilb@suse.de> + * + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Author: Neil Brown + * Email: <neilb@suse.de> + */ + +#include "mdadm.h" +#include <stdint.h> + +/* To restripe, we read from old geometry to a buffer, and + * read from buffer to new geometry. + * When reading, we might have missing devices and so could need + * to reconstruct. + * When writing, we need to create correct parity and Q. + * + */ + +int geo_map(int block, unsigned long long stripe, int raid_disks, + int level, int layout) +{ + /* On the given stripe, find which disk in the array will have + * block numbered 'block'. + * '-1' means the parity block. + * '-2' means the Q syndrome. + */ + int pd; + + /* layout is not relevant for raid0 and raid4 */ + if ((level == 0) || + (level == 4)) + layout = 0; + + switch(level*100 + layout) { + case 000: + case 400: + case 500 + ALGORITHM_PARITY_N: + /* raid 4 isn't messed around by parity blocks */ + if (block == -1) + return raid_disks-1; /* parity block */ + return block; + case 500 + ALGORITHM_LEFT_ASYMMETRIC: + pd = (raid_disks-1) - stripe % raid_disks; + if (block == -1) return pd; + if (block >= pd) + block++; + return block; + + case 500 + ALGORITHM_RIGHT_ASYMMETRIC: + pd = stripe % raid_disks; + if (block == -1) return pd; + if (block >= pd) + block++; + return block; + + case 500 + ALGORITHM_LEFT_SYMMETRIC: + pd = (raid_disks - 1) - stripe % raid_disks; + if (block == -1) return pd; + return (pd + 1 + block) % raid_disks; + + case 500 + ALGORITHM_RIGHT_SYMMETRIC: + pd = stripe % raid_disks; + if (block == -1) return pd; + return (pd + 1 + block) % raid_disks; + + case 500 + ALGORITHM_PARITY_0: + return block + 1; + + case 600 + ALGORITHM_PARITY_N_6: + if (block == -2) + return raid_disks - 1; + if (block == -1) + return raid_disks - 2; /* parity block */ + return block; + case 600 + ALGORITHM_LEFT_ASYMMETRIC_6: + if (block == -2) + return raid_disks - 1; + raid_disks--; + pd = (raid_disks-1) - stripe % raid_disks; + if (block == -1) return pd; + if (block >= pd) + block++; + return block; + + case 600 + ALGORITHM_RIGHT_ASYMMETRIC_6: + if (block == -2) + return raid_disks - 1; + raid_disks--; + pd = stripe % raid_disks; + if (block == -1) return pd; + if (block >= pd) + block++; + return block; + + case 600 + ALGORITHM_LEFT_SYMMETRIC_6: + if (block == -2) + return raid_disks - 1; + raid_disks--; + pd = (raid_disks - 1) - stripe % raid_disks; + if (block == -1) return pd; + return (pd + 1 + block) % raid_disks; + + case 600 + ALGORITHM_RIGHT_SYMMETRIC_6: + if (block == -2) + return raid_disks - 1; + raid_disks--; + pd = stripe % raid_disks; + if (block == -1) return pd; + return (pd + 1 + block) % raid_disks; + + case 600 + ALGORITHM_PARITY_0_6: + if (block == -2) + return raid_disks - 1; + return block + 1; + + case 600 + ALGORITHM_PARITY_0: + if (block == -1) + return 0; + if (block == -2) + return 1; + return block + 2; + + case 600 + ALGORITHM_LEFT_ASYMMETRIC: + pd = raid_disks - 1 - (stripe % raid_disks); + if (block == -1) return pd; + if (block == -2) return (pd+1) % raid_disks; + if (pd == raid_disks - 1) + return block+1; + if (block >= pd) + return block+2; + return block; + + case 600 + ALGORITHM_ROTATING_ZERO_RESTART: + /* Different order for calculating Q, otherwize same as ... */ + case 600 + ALGORITHM_RIGHT_ASYMMETRIC: + pd = stripe % raid_disks; + if (block == -1) return pd; + if (block == -2) return (pd+1) % raid_disks; + if (pd == raid_disks - 1) + return block+1; + if (block >= pd) + return block+2; + return block; + + case 600 + ALGORITHM_LEFT_SYMMETRIC: + pd = raid_disks - 1 - (stripe % raid_disks); + if (block == -1) return pd; + if (block == -2) return (pd+1) % raid_disks; + return (pd + 2 + block) % raid_disks; + + case 600 + ALGORITHM_RIGHT_SYMMETRIC: + pd = stripe % raid_disks; + if (block == -1) return pd; + if (block == -2) return (pd+1) % raid_disks; + return (pd + 2 + block) % raid_disks; + + case 600 + ALGORITHM_ROTATING_N_RESTART: + /* Same a left_asymmetric, by first stripe is + * D D D P Q rather than + * Q D D D P + */ + pd = raid_disks - 1 - ((stripe + 1) % raid_disks); + if (block == -1) return pd; + if (block == -2) return (pd+1) % raid_disks; + if (pd == raid_disks - 1) + return block+1; + if (block >= pd) + return block+2; + return block; + + case 600 + ALGORITHM_ROTATING_N_CONTINUE: + /* Same as left_symmetric but Q is before P */ + pd = raid_disks - 1 - (stripe % raid_disks); + if (block == -1) return pd; + if (block == -2) return (pd+raid_disks-1) % raid_disks; + return (pd + 1 + block) % raid_disks; + } + return -1; +} + +int is_ddf(int layout) +{ + switch (layout) + { + default: + return 0; + case ALGORITHM_ROTATING_N_CONTINUE: + case ALGORITHM_ROTATING_N_RESTART: + case ALGORITHM_ROTATING_ZERO_RESTART: + return 1; + } +} + +void xor_blocks(char *target, char **sources, int disks, int size) +{ + int i, j; + /* Amazingly inefficient... */ + for (i=0; i<size; i++) { + char c = 0; + for (j=0 ; j<disks; j++) + c ^= sources[j][i]; + target[i] = c; + } +} + +void qsyndrome(uint8_t *p, uint8_t *q, uint8_t **sources, int disks, int size) +{ + int d, z; + uint8_t wq0, wp0, wd0, w10, w20; + for ( d = 0; d < size; d++) { + wq0 = wp0 = sources[disks-1][d]; + for ( z = disks-2 ; z >= 0 ; z-- ) { + wd0 = sources[z][d]; + wp0 ^= wd0; + w20 = (wq0&0x80) ? 0xff : 0x00; + w10 = (wq0 << 1) & 0xff; + w20 &= 0x1d; + w10 ^= w20; + wq0 = w10 ^ wd0; + } + p[d] = wp0; + q[d] = wq0; + } +} + +/* + * The following was taken from linux/drivers/md/mktables.c, and modified + * to create in-memory tables rather than C code + */ +static uint8_t gfmul(uint8_t a, uint8_t b) +{ + uint8_t v = 0; + + while (b) { + if (b & 1) + v ^= a; + a = (a << 1) ^ (a & 0x80 ? 0x1d : 0); + b >>= 1; + } + + return v; +} + +static uint8_t gfpow(uint8_t a, int b) +{ + uint8_t v = 1; + + b %= 255; + if (b < 0) + b += 255; + + while (b) { + if (b & 1) + v = gfmul(v, a); + a = gfmul(a, a); + b >>= 1; + } + + return v; +} + +int tables_ready = 0; +uint8_t raid6_gfmul[256][256]; +uint8_t raid6_gfexp[256]; +uint8_t raid6_gfinv[256]; +uint8_t raid6_gfexi[256]; +uint8_t raid6_gflog[256]; +uint8_t raid6_gfilog[256]; +void make_tables(void) +{ + int i, j; + uint8_t v; + uint32_t b, log; + + /* Compute multiplication table */ + for (i = 0; i < 256; i++) + for (j = 0; j < 256; j++) + raid6_gfmul[i][j] = gfmul(i, j); + + /* Compute power-of-2 table (exponent) */ + v = 1; + for (i = 0; i < 256; i++) { + raid6_gfexp[i] = v; + v = gfmul(v, 2); + if (v == 1) + v = 0; /* For entry 255, not a real entry */ + } + + /* Compute inverse table x^-1 == x^254 */ + for (i = 0; i < 256; i++) + raid6_gfinv[i] = gfpow(i, 254); + + /* Compute inv(2^x + 1) (exponent-xor-inverse) table */ + for (i = 0; i < 256; i ++) + raid6_gfexi[i] = raid6_gfinv[raid6_gfexp[i] ^ 1]; + + /* Compute log and inverse log */ + /* Modified code from: + * http://web.eecs.utk.edu/~plank/plank/papers/CS-96-332.html + */ + b = 1; + raid6_gflog[0] = 0; + raid6_gfilog[255] = 0; + + for (log = 0; log < 255; log++) { + raid6_gflog[b] = (uint8_t) log; + raid6_gfilog[log] = (uint8_t) b; + b = b << 1; + if (b & 256) b = b ^ 0435; + } + + tables_ready = 1; +} + +uint8_t *zero; +int zero_size; + +void ensure_zero_has_size(int chunk_size) +{ + if (zero == NULL || chunk_size > zero_size) { + if (zero) + free(zero); + zero = xcalloc(1, chunk_size); + zero_size = chunk_size; + } +} + +/* Following was taken from linux/drivers/md/raid6recov.c */ + +/* Recover two failed data blocks. */ + +void raid6_2data_recov(int disks, size_t bytes, int faila, int failb, + uint8_t **ptrs, int neg_offset) +{ + uint8_t *p, *q, *dp, *dq; + uint8_t px, qx, db; + const uint8_t *pbmul; /* P multiplier table for B data */ + const uint8_t *qmul; /* Q multiplier table (for both) */ + + if (faila > failb) { + int t = faila; + faila = failb; + failb = t; + } + + if (neg_offset) { + p = ptrs[-1]; + q = ptrs[-2]; + } else { + p = ptrs[disks-2]; + q = ptrs[disks-1]; + } + + /* Compute syndrome with zero for the missing data pages + Use the dead data pages as temporary storage for + delta p and delta q */ + dp = ptrs[faila]; + ptrs[faila] = zero; + dq = ptrs[failb]; + ptrs[failb] = zero; + + qsyndrome(dp, dq, ptrs, disks-2, bytes); + + /* Restore pointer table */ + ptrs[faila] = dp; + ptrs[failb] = dq; + + /* Now, pick the proper data tables */ + pbmul = raid6_gfmul[raid6_gfexi[failb-faila]]; + qmul = raid6_gfmul[raid6_gfinv[raid6_gfexp[faila]^raid6_gfexp[failb]]]; + + /* Now do it... */ + while ( bytes-- ) { + px = *p ^ *dp; + qx = qmul[*q ^ *dq]; + *dq++ = db = pbmul[px] ^ qx; /* Reconstructed B */ + *dp++ = db ^ px; /* Reconstructed A */ + p++; q++; + } +} + +/* Recover failure of one data block plus the P block */ +void raid6_datap_recov(int disks, size_t bytes, int faila, uint8_t **ptrs, + int neg_offset) +{ + uint8_t *p, *q, *dq; + const uint8_t *qmul; /* Q multiplier table */ + + if (neg_offset) { + p = ptrs[-1]; + q = ptrs[-2]; + } else { + p = ptrs[disks-2]; + q = ptrs[disks-1]; + } + + /* Compute syndrome with zero for the missing data page + Use the dead data page as temporary storage for delta q */ + dq = ptrs[faila]; + ptrs[faila] = zero; + + qsyndrome(p, dq, ptrs, disks-2, bytes); + + /* Restore pointer table */ + ptrs[faila] = dq; + + /* Now, pick the proper data tables */ + qmul = raid6_gfmul[raid6_gfinv[raid6_gfexp[faila]]]; + + /* Now do it... */ + while ( bytes-- ) { + *p++ ^= *dq = qmul[*q ^ *dq]; + q++; dq++; + } +} + +/* Try to find out if a specific disk has a problem */ +int raid6_check_disks(int data_disks, int start, int chunk_size, + int level, int layout, int diskP, int diskQ, + char *p, char *q, char **stripes) +{ + int i; + int data_id, diskD; + uint8_t Px, Qx; + int curr_broken_disk = -1; + int prev_broken_disk = -1; + int broken_status = 0; + + for(i = 0; i < chunk_size; i++) { + Px = (uint8_t)stripes[diskP][i] ^ (uint8_t)p[i]; + Qx = (uint8_t)stripes[diskQ][i] ^ (uint8_t)q[i]; + + if((Px != 0) && (Qx == 0)) + curr_broken_disk = diskP; + + if((Px == 0) && (Qx != 0)) + curr_broken_disk = diskQ; + + if((Px != 0) && (Qx != 0)) { + data_id = (raid6_gflog[Qx] - raid6_gflog[Px]); + if(data_id < 0) data_id += 255; + diskD = geo_map(data_id, start/chunk_size, + data_disks + 2, level, layout); + curr_broken_disk = diskD; + } + + if((Px == 0) && (Qx == 0)) + curr_broken_disk = curr_broken_disk; + + if(curr_broken_disk >= data_disks + 2) + broken_status = 2; + + switch(broken_status) { + case 0: + if(curr_broken_disk != -1) { + prev_broken_disk = curr_broken_disk; + broken_status = 1; + } + break; + + case 1: + if(curr_broken_disk != prev_broken_disk) + broken_status = 2; + break; + + case 2: + default: + curr_broken_disk = prev_broken_disk = -2; + break; + } + } + + return curr_broken_disk; +} + +/******************************************************************************* + * Function: save_stripes + * Description: + * Function reads data (only data without P and Q) from array and writes + * it to buf and opcjonaly to backup files + * Parameters: + * source : A list of 'fds' of the active disks. + * Some may be absent + * offsets : A list of offsets on disk belonging + * to the array [bytes] + * raid_disks : geometry: number of disks in the array + * chunk_size : geometry: chunk size [bytes] + * level : geometry: RAID level + * layout : geometry: layout + * nwrites : number of backup files + * dest : A list of 'fds' for mirrored targets + * (e.g. backup files). They are already seeked to right + * (write) location. If NULL, data will be wrote + * to the buf only + * start : start address of data to read (must be stripe-aligned) + * [bytes] + * length - : length of data to read (must be stripe-aligned) + * [bytes] + * buf : buffer for data. It is large enough to hold + * one stripe. It is stripe aligned + * Returns: + * 0 : success + * -1 : fail + ******************************************************************************/ +int save_stripes(int *source, unsigned long long *offsets, + int raid_disks, int chunk_size, int level, int layout, + int nwrites, int *dest, + unsigned long long start, unsigned long long length, + char *buf) +{ + int len; + int data_disks = raid_disks - (level == 0 ? 0 : level <=5 ? 1 : 2); + int disk; + int i; + unsigned long long length_test; + + if (!tables_ready) + make_tables(); + ensure_zero_has_size(chunk_size); + + len = data_disks * chunk_size; + length_test = length / len; + length_test *= len; + + if (length != length_test) { + dprintf("Error: save_stripes(): Data are not alligned. EXIT\n"); + dprintf("\tArea for saving stripes (length) = %llu\n", length); + dprintf("\tWork step (len) = %i\n", len); + dprintf("\tExpected save area (length_test) = %llu\n", + length_test); + abort(); + } + + while (length > 0) { + int failed = 0; + int fdisk[3], fblock[3]; + for (disk = 0; disk < raid_disks ; disk++) { + unsigned long long offset; + int dnum; + + offset = (start/chunk_size/data_disks)*chunk_size; + dnum = geo_map(disk < data_disks ? disk : data_disks - disk - 1, + start/chunk_size/data_disks, + raid_disks, level, layout); + if (dnum < 0) abort(); + if (source[dnum] < 0 || + lseek64(source[dnum], offsets[dnum]+offset, 0) < 0 || + read(source[dnum], buf+disk * chunk_size, chunk_size) + != chunk_size) + if (failed <= 2) { + fdisk[failed] = dnum; + fblock[failed] = disk; + failed++; + } + } + if (failed == 0 || fblock[0] >= data_disks) + /* all data disks are good */ + ; + else if (failed == 1 || fblock[1] >= data_disks+1) { + /* one failed data disk and good parity */ + char *bufs[data_disks]; + for (i=0; i < data_disks; i++) + if (fblock[0] == i) + bufs[i] = buf + data_disks*chunk_size; + else + bufs[i] = buf + i*chunk_size; + + xor_blocks(buf + fblock[0]*chunk_size, + bufs, data_disks, chunk_size); + } else if (failed > 2 || level != 6) + /* too much failure */ + return -1; + else { + /* RAID6 computations needed. */ + uint8_t *bufs[data_disks+4]; + int qdisk; + int syndrome_disks; + disk = geo_map(-1, start/chunk_size/data_disks, + raid_disks, level, layout); + qdisk = geo_map(-2, start/chunk_size/data_disks, + raid_disks, level, layout); + if (is_ddf(layout)) { + /* q over 'raid_disks' blocks, in device order. + * 'p' and 'q' get to be all zero + */ + for (i = 0; i < raid_disks; i++) + bufs[i] = zero; + for (i = 0; i < data_disks; i++) { + int dnum = geo_map(i, + start/chunk_size/data_disks, + raid_disks, level, layout); + int snum; + /* i is the logical block number, so is index to 'buf'. + * dnum is physical disk number + * and thus the syndrome number. + */ + snum = dnum; + bufs[snum] = (uint8_t*)buf + chunk_size * i; + } + syndrome_disks = raid_disks; + } else { + /* for md, q is over 'data_disks' blocks, + * starting immediately after 'q' + * Note that for the '_6' variety, the p block + * makes a hole that we need to be careful of. + */ + int j; + int snum = 0; + for (j = 0; j < raid_disks; j++) { + int dnum = (qdisk + 1 + j) % raid_disks; + if (dnum == disk || dnum == qdisk) + continue; + for (i = 0; i < data_disks; i++) + if (geo_map(i, + start/chunk_size/data_disks, + raid_disks, level, layout) == dnum) + break; + /* i is the logical block number, so is index to 'buf'. + * dnum is physical disk number + * snum is syndrome disk for which 0 is immediately after Q + */ + bufs[snum] = (uint8_t*)buf + chunk_size * i; + + if (fblock[0] == i) + fdisk[0] = snum; + if (fblock[1] == i) + fdisk[1] = snum; + snum++; + } + + syndrome_disks = data_disks; + } + + /* Place P and Q blocks at end of bufs */ + bufs[syndrome_disks] = (uint8_t*)buf + chunk_size * data_disks; + bufs[syndrome_disks+1] = (uint8_t*)buf + chunk_size * (data_disks+1); + + if (fblock[1] == data_disks) + /* One data failed, and parity failed */ + raid6_datap_recov(syndrome_disks+2, chunk_size, + fdisk[0], bufs, 0); + else { + /* Two data blocks failed, P,Q OK */ + raid6_2data_recov(syndrome_disks+2, chunk_size, + fdisk[0], fdisk[1], bufs, 0); + } + } + if (dest) { + for (i = 0; i < nwrites; i++) + if (write(dest[i], buf, len) != len) + return -1; + } else { + /* build next stripe in buffer */ + buf += len; + } + length -= len; + start += len; + } + return 0; +} + +/* Restore data: + * We are given: + * A list of 'fds' of the active disks. Some may be '-1' for not-available. + * A geometry: raid_disks, chunk_size, level, layout + * An 'fd' to read from. It is already seeked to the right (Read) location. + * A start and length. + * The length must be a multiple of the stripe size. + * + * We build a full stripe in memory and then write it out. + * We assume that there are enough working devices. + */ +int restore_stripes(int *dest, unsigned long long *offsets, + int raid_disks, int chunk_size, int level, int layout, + int source, unsigned long long read_offset, + unsigned long long start, unsigned long long length, + char *src_buf) +{ + char *stripe_buf; + char **stripes = xmalloc(raid_disks * sizeof(char*)); + char **blocks = xmalloc(raid_disks * sizeof(char*)); + int i; + int rv; + + int data_disks = raid_disks - (level == 0 ? 0 : level <= 5 ? 1 : 2); + + if (posix_memalign((void**)&stripe_buf, 4096, raid_disks * chunk_size)) + stripe_buf = NULL; + + if (zero == NULL || chunk_size > zero_size) { + if (zero) + free(zero); + zero = xcalloc(1, chunk_size); + zero_size = chunk_size; + } + + if (stripe_buf == NULL || stripes == NULL || blocks == NULL + || zero == NULL) { + rv = -2; + goto abort; + } + for (i = 0; i < raid_disks; i++) + stripes[i] = stripe_buf + i * chunk_size; + while (length > 0) { + unsigned int len = data_disks * chunk_size; + unsigned long long offset; + int disk, qdisk; + int syndrome_disks; + if (length < len) { + rv = -3; + goto abort; + } + for (i = 0; i < data_disks; i++) { + int disk = geo_map(i, start/chunk_size/data_disks, + raid_disks, level, layout); + if (src_buf == NULL) { + /* read from file */ + if (lseek64(source, read_offset, 0) != + (off64_t)read_offset) { + rv = -1; + goto abort; + } + if (read(source, + stripes[disk], + chunk_size) != chunk_size) { + rv = -1; + goto abort; + } + } else { + /* read from input buffer */ + memcpy(stripes[disk], + src_buf + read_offset, + chunk_size); + } + read_offset += chunk_size; + } + /* We have the data, now do the parity */ + offset = (start/chunk_size/data_disks) * chunk_size; + switch (level) { + case 4: + case 5: + disk = geo_map(-1, start/chunk_size/data_disks, + raid_disks, level, layout); + for (i = 0; i < data_disks; i++) + blocks[i] = stripes[(disk+1+i) % raid_disks]; + xor_blocks(stripes[disk], blocks, data_disks, chunk_size); + break; + case 6: + disk = geo_map(-1, start/chunk_size/data_disks, + raid_disks, level, layout); + qdisk = geo_map(-2, start/chunk_size/data_disks, + raid_disks, level, layout); + if (is_ddf(layout)) { + /* q over 'raid_disks' blocks, in device order. + * 'p' and 'q' get to be all zero + */ + for (i = 0; i < raid_disks; i++) + if (i == disk || i == qdisk) + blocks[i] = (char*)zero; + else + blocks[i] = stripes[i]; + syndrome_disks = raid_disks; + } else { + /* for md, q is over 'data_disks' blocks, + * starting immediately after 'q' + */ + for (i = 0; i < data_disks; i++) + blocks[i] = stripes[(qdisk+1+i) % raid_disks]; + + syndrome_disks = data_disks; + } + qsyndrome((uint8_t*)stripes[disk], + (uint8_t*)stripes[qdisk], + (uint8_t**)blocks, + syndrome_disks, chunk_size); + break; + } + for (i=0; i < raid_disks ; i++) + if (dest[i] >= 0) { + if (lseek64(dest[i], + offsets[i]+offset, 0) < 0) { + rv = -1; + goto abort; + } + if (write(dest[i], stripes[i], + chunk_size) != chunk_size) { + rv = -1; + goto abort; + } + } + length -= len; + start += len; + } + rv = 0; + +abort: + free(stripe_buf); + free(stripes); + free(blocks); + return rv; +} + +#ifdef MAIN + +int test_stripes(int *source, unsigned long long *offsets, + int raid_disks, int chunk_size, int level, int layout, + unsigned long long start, unsigned long long length) +{ + /* ready the data and p (and q) blocks, and check we got them right */ + char *stripe_buf = xmalloc(raid_disks * chunk_size); + char **stripes = xmalloc(raid_disks * sizeof(char*)); + char **blocks = xmalloc(raid_disks * sizeof(char*)); + char *p = xmalloc(chunk_size); + char *q = xmalloc(chunk_size); + + int i; + int diskP, diskQ; + int data_disks = raid_disks - (level == 5 ? 1: 2); + + if (!tables_ready) + make_tables(); + + for ( i = 0 ; i < raid_disks ; i++) + stripes[i] = stripe_buf + i * chunk_size; + + while (length > 0) { + int disk; + + for (i = 0 ; i < raid_disks ; i++) { + lseek64(source[i], offsets[i]+start, 0); + read(source[i], stripes[i], chunk_size); + } + for (i = 0 ; i < data_disks ; i++) { + int disk = geo_map(i, start/chunk_size, raid_disks, + level, layout); + blocks[i] = stripes[disk]; + printf("%d->%d\n", i, disk); + } + switch(level) { + case 6: + qsyndrome(p, q, (uint8_t**)blocks, data_disks, chunk_size); + diskP = geo_map(-1, start/chunk_size, raid_disks, + level, layout); + if (memcmp(p, stripes[diskP], chunk_size) != 0) { + printf("P(%d) wrong at %llu\n", diskP, + start / chunk_size); + } + diskQ = geo_map(-2, start/chunk_size, raid_disks, + level, layout); + if (memcmp(q, stripes[diskQ], chunk_size) != 0) { + printf("Q(%d) wrong at %llu\n", diskQ, + start / chunk_size); + } + disk = raid6_check_disks(data_disks, start, chunk_size, + level, layout, diskP, diskQ, + p, q, stripes); + if(disk >= 0) { + printf("Possible failed disk: %d\n", disk); + } + if(disk == -2) { + printf("Failure detected, but disk unknown\n"); + } + break; + } + length -= chunk_size; + start += chunk_size; + } + return 0; +} + +unsigned long long getnum(char *str, char **err) +{ + char *e; + unsigned long long rv = strtoull(str, &e, 10); + if (e==str || *e) { + *err = str; + return 0; + } + return rv; +} + +char const Name[] = "test_restripe"; +int main(int argc, char *argv[]) +{ + /* save/restore file raid_disks chunk_size level layout start length devices... + */ + int save; + int *fds; + char *file; + char *buf; + int storefd; + unsigned long long *offsets; + int raid_disks, chunk_size, level, layout; + unsigned long long start, length; + int i; + + char *err = NULL; + if (argc < 10) { + fprintf(stderr, "Usage: test_stripe save/restore file raid_disks chunk_size level layout start length devices...\n"); + exit(1); + } + if (strcmp(argv[1], "save")==0) + save = 1; + else if (strcmp(argv[1], "restore") == 0) + save = 0; + else if (strcmp(argv[1], "test") == 0) + save = 2; + else { + fprintf(stderr, "test_stripe: must give 'save' or 'restore'.\n"); + exit(2); + } + + file = argv[2]; + raid_disks = getnum(argv[3], &err); + chunk_size = getnum(argv[4], &err); + level = getnum(argv[5], &err); + layout = getnum(argv[6], &err); + start = getnum(argv[7], &err); + length = getnum(argv[8], &err); + if (err) { + fprintf(stderr, "test_stripe: Bad number: %s\n", err); + exit(2); + } + if (argc != raid_disks + 9) { + fprintf(stderr, "test_stripe: wrong number of devices: want %d found %d\n", + raid_disks, argc-9); + exit(2); + } + fds = xmalloc(raid_disks * sizeof(*fds)); + offsets = xcalloc(raid_disks, sizeof(*offsets)); + + storefd = open(file, O_RDWR); + if (storefd < 0) { + perror(file); + fprintf(stderr, "test_stripe: could not open %s.\n", file); + exit(3); + } + for (i=0; i<raid_disks; i++) { + char *p; + p = strchr(argv[9+i], ':'); + + if(p != NULL) { + *p++ = '\0'; + offsets[i] = atoll(p) * 512; + } + + fds[i] = open(argv[9+i], O_RDWR); + if (fds[i] < 0) { + perror(argv[9+i]); + fprintf(stderr,"test_stripe: cannot open %s.\n", argv[9+i]); + exit(3); + } + } + + buf = xmalloc(raid_disks * chunk_size); + + if (save == 1) { + int rv = save_stripes(fds, offsets, + raid_disks, chunk_size, level, layout, + 1, &storefd, + start, length, buf); + if (rv != 0) { + fprintf(stderr, + "test_stripe: save_stripes returned %d\n", rv); + exit(1); + } + } else if (save == 2) { + int rv = test_stripes(fds, offsets, + raid_disks, chunk_size, level, layout, + start, length); + if (rv != 0) { + fprintf(stderr, + "test_stripe: test_stripes returned %d\n", rv); + exit(1); + } + } else { + int rv = restore_stripes(fds, offsets, + raid_disks, chunk_size, level, layout, + storefd, 0ULL, + start, length, NULL); + if (rv != 0) { + fprintf(stderr, + "test_stripe: restore_stripes returned %d\n", + rv); + exit(1); + } + } + exit(0); +} + +#endif /* MAIN */ diff --git a/sg_io.c b/sg_io.c new file mode 100644 index 00000000..50ad180d --- /dev/null +++ b/sg_io.c @@ -0,0 +1,42 @@ +/* + * Copyright (C) 2007-2008 Intel Corporation + * + * Retrieve drive serial numbers for scsi disks + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. + */ +#include <string.h> +#include <scsi/scsi.h> +#include <scsi/sg.h> +#include <sys/ioctl.h> + +int scsi_get_serial(int fd, void *buf, size_t buf_len) +{ + unsigned char inq_cmd[] = {INQUIRY, 1, 0x80, 0, buf_len, 0}; + unsigned char sense[32]; + struct sg_io_hdr io_hdr; + + memset(&io_hdr, 0, sizeof(io_hdr)); + io_hdr.interface_id = 'S'; + io_hdr.cmdp = inq_cmd; + io_hdr.cmd_len = sizeof(inq_cmd); + io_hdr.dxferp = buf; + io_hdr.dxfer_len = buf_len; + io_hdr.dxfer_direction = SG_DXFER_FROM_DEV; + io_hdr.sbp = sense; + io_hdr.mx_sb_len = sizeof(sense); + io_hdr.timeout = 5000; + + return ioctl(fd, SG_IO, &io_hdr); +} @@ -0,0 +1,415 @@ +/* sha1.c - Functions to compute SHA1 message digest of files or + memory blocks according to the NIST specification FIPS-180-1. + + Copyright (C) 2000, 2001, 2003, 2004, 2005, 2006, 2008 Free Software + Foundation, Inc. + + This program is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published by the + Free Software Foundation; either version 2, or (at your option) any + later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software Foundation, + Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */ + +/* Written by Scott G. Miller + Credits: + Robert Klep <robert@ilse.nl> -- Expansion function fix +*/ + +//#include <config.h> + +#include "sha1.h" + +#include <stddef.h> +#include <string.h> + +#if USE_UNLOCKED_IO +# include "unlocked-io.h" +#endif + +#ifdef WORDS_BIGENDIAN +# define SWAP(n) (n) +#else +# define SWAP(n) \ + (((n) << 24) | (((n) & 0xff00) << 8) | (((n) >> 8) & 0xff00) | ((n) >> 24)) +#endif + +#define BLOCKSIZE 4096 +#if BLOCKSIZE % 64 != 0 +# error "invalid BLOCKSIZE" +#endif + +/* This array contains the bytes used to pad the buffer to the next + 64-byte boundary. (RFC 1321, 3.1: Step 1) */ +static const unsigned char fillbuf[64] = { 0x80, 0 /* , 0, 0, ... */ }; + +/* Take a pointer to a 160 bit block of data (five 32 bit ints) and + initialize it to the start constants of the SHA1 algorithm. This + must be called before using hash in the call to sha1_hash. */ +void +sha1_init_ctx (struct sha1_ctx *ctx) +{ + ctx->A = 0x67452301; + ctx->B = 0xefcdab89; + ctx->C = 0x98badcfe; + ctx->D = 0x10325476; + ctx->E = 0xc3d2e1f0; + + ctx->total[0] = ctx->total[1] = 0; + ctx->buflen = 0; +} + +/* Put result from CTX in first 20 bytes following RESBUF. The result + must be in little endian byte order. + + IMPORTANT: On some systems it is required that RESBUF is correctly + aligned for a 32-bit value. */ +void * +sha1_read_ctx (const struct sha1_ctx *ctx, void *resbuf) +{ + ((sha1_uint32 *) resbuf)[0] = SWAP (ctx->A); + ((sha1_uint32 *) resbuf)[1] = SWAP (ctx->B); + ((sha1_uint32 *) resbuf)[2] = SWAP (ctx->C); + ((sha1_uint32 *) resbuf)[3] = SWAP (ctx->D); + ((sha1_uint32 *) resbuf)[4] = SWAP (ctx->E); + + return resbuf; +} + +/* Process the remaining bytes in the internal buffer and the usual + prolog according to the standard and write the result to RESBUF. + + IMPORTANT: On some systems it is required that RESBUF is correctly + aligned for a 32-bit value. */ +void * +sha1_finish_ctx (struct sha1_ctx *ctx, void *resbuf) +{ + /* Take yet unprocessed bytes into account. */ + sha1_uint32 bytes = ctx->buflen; + size_t size = (bytes < 56) ? 64 / 4 : 64 * 2 / 4; + + /* Now count remaining bytes. */ + ctx->total[0] += bytes; + if (ctx->total[0] < bytes) + ++ctx->total[1]; + + /* Put the 64-bit file length in *bits* at the end of the buffer. */ + ctx->buffer[size - 2] = SWAP ((ctx->total[1] << 3) | (ctx->total[0] >> 29)); + ctx->buffer[size - 1] = SWAP (ctx->total[0] << 3); + + memcpy (&((char *) ctx->buffer)[bytes], fillbuf, (size - 2) * 4 - bytes); + + /* Process last bytes. */ + sha1_process_block (ctx->buffer, size * 4, ctx); + + return sha1_read_ctx (ctx, resbuf); +} + +/* Compute SHA1 message digest for bytes read from STREAM. The + resulting message digest number will be written into the 16 bytes + beginning at RESBLOCK. */ +int +sha1_stream (FILE *stream, void *resblock) +{ + struct sha1_ctx ctx; + char buffer[BLOCKSIZE + 72]; + size_t sum; + + /* Initialize the computation context. */ + sha1_init_ctx (&ctx); + + /* Iterate over full file contents. */ + while (1) + { + /* We read the file in blocks of BLOCKSIZE bytes. One call of the + computation function processes the whole buffer so that with the + next round of the loop another block can be read. */ + size_t n; + sum = 0; + + /* Read block. Take care for partial reads. */ + while (1) + { + n = fread (buffer + sum, 1, BLOCKSIZE - sum, stream); + + sum += n; + + if (sum == BLOCKSIZE) + break; + + if (n == 0) + { + /* Check for the error flag IFF N == 0, so that we don't + exit the loop after a partial read due to e.g., EAGAIN + or EWOULDBLOCK. */ + if (ferror (stream)) + return 1; + goto process_partial_block; + } + + /* We've read at least one byte, so ignore errors. But always + check for EOF, since feof may be true even though N > 0. + Otherwise, we could end up calling fread after EOF. */ + if (feof (stream)) + goto process_partial_block; + } + + /* Process buffer with BLOCKSIZE bytes. Note that + BLOCKSIZE % 64 == 0 + */ + sha1_process_block (buffer, BLOCKSIZE, &ctx); + } + + process_partial_block:; + + /* Process any remaining bytes. */ + if (sum > 0) + sha1_process_bytes (buffer, sum, &ctx); + + /* Construct result in desired memory. */ + sha1_finish_ctx (&ctx, resblock); + return 0; +} + +/* Compute SHA1 message digest for LEN bytes beginning at BUFFER. The + result is always in little endian byte order, so that a byte-wise + output yields to the wanted ASCII representation of the message + digest. */ +void * +sha1_buffer (const char *buffer, size_t len, void *resblock) +{ + struct sha1_ctx ctx; + + /* Initialize the computation context. */ + sha1_init_ctx (&ctx); + + /* Process whole buffer but last len % 64 bytes. */ + sha1_process_bytes (buffer, len, &ctx); + + /* Put result in desired memory area. */ + return sha1_finish_ctx (&ctx, resblock); +} + +void +sha1_process_bytes (const void *buffer, size_t len, struct sha1_ctx *ctx) +{ + /* When we already have some bits in our internal buffer concatenate + both inputs first. */ + if (ctx->buflen != 0) + { + size_t left_over = ctx->buflen; + size_t add = 128 - left_over > len ? len : 128 - left_over; + + memcpy (&((char *) ctx->buffer)[left_over], buffer, add); + ctx->buflen += add; + + if (ctx->buflen > 64) + { + sha1_process_block (ctx->buffer, ctx->buflen & ~63, ctx); + + ctx->buflen &= 63; + /* The regions in the following copy operation cannot overlap. */ + memcpy (ctx->buffer, + &((char *) ctx->buffer)[(left_over + add) & ~63], + ctx->buflen); + } + + buffer = (const char *) buffer + add; + len -= add; + } + + /* Process available complete blocks. */ + if (len >= 64) + { +#if !_STRING_ARCH_unaligned +# define alignof(type) offsetof (struct { char c; type x; }, x) +# define UNALIGNED_P(p) (((size_t) p) % alignof (sha1_uint32) != 0) + if (UNALIGNED_P (buffer)) + while (len > 64) + { + sha1_process_block (memcpy (ctx->buffer, buffer, 64), 64, ctx); + buffer = (const char *) buffer + 64; + len -= 64; + } + else +#endif + { + sha1_process_block (buffer, len & ~63, ctx); + buffer = (const char *) buffer + (len & ~63); + len &= 63; + } + } + + /* Move remaining bytes in internal buffer. */ + if (len > 0) + { + size_t left_over = ctx->buflen; + + memcpy (&((char *) ctx->buffer)[left_over], buffer, len); + left_over += len; + if (left_over >= 64) + { + sha1_process_block (ctx->buffer, 64, ctx); + left_over -= 64; + memcpy (ctx->buffer, &ctx->buffer[16], left_over); + } + ctx->buflen = left_over; + } +} + +/* --- Code below is the primary difference between md5.c and sha1.c --- */ + +/* SHA1 round constants */ +#define K1 0x5a827999 +#define K2 0x6ed9eba1 +#define K3 0x8f1bbcdc +#define K4 0xca62c1d6 + +/* Round functions. Note that F2 is the same as F4. */ +#define F1(B,C,D) ( D ^ ( B & ( C ^ D ) ) ) +#define F2(B,C,D) (B ^ C ^ D) +#define F3(B,C,D) ( ( B & C ) | ( D & ( B | C ) ) ) +#define F4(B,C,D) (B ^ C ^ D) + +/* Process LEN bytes of BUFFER, accumulating context into CTX. + It is assumed that LEN % 64 == 0. + Most of this code comes from GnuPG's cipher/sha1.c. */ + +void +sha1_process_block (const void *buffer, size_t len, struct sha1_ctx *ctx) +{ + const sha1_uint32 *words = (const sha1_uint32*) buffer; + size_t nwords = len / sizeof (sha1_uint32); + const sha1_uint32 *endp = words + nwords; + sha1_uint32 x[16]; + sha1_uint32 a = ctx->A; + sha1_uint32 b = ctx->B; + sha1_uint32 c = ctx->C; + sha1_uint32 d = ctx->D; + sha1_uint32 e = ctx->E; + + /* First increment the byte count. RFC 1321 specifies the possible + length of the file up to 2^64 bits. Here we only compute the + number of bytes. Do a double word increment. */ + ctx->total[0] += len; + if (ctx->total[0] < len) + ++ctx->total[1]; + +#define rol(x, n) (((x) << (n)) | ((sha1_uint32) (x) >> (32 - (n)))) + +#define M(I) ( tm = x[I&0x0f] ^ x[(I-14)&0x0f] \ + ^ x[(I-8)&0x0f] ^ x[(I-3)&0x0f] \ + , (x[I&0x0f] = rol(tm, 1)) ) + +#define R(A,B,C,D,E,F,K,M) do { E += rol( A, 5 ) \ + + F( B, C, D ) \ + + K \ + + M; \ + B = rol( B, 30 ); \ + } while(0) + + while (words < endp) + { + sha1_uint32 tm; + int t; + for (t = 0; t < 16; t++) + { + x[t] = SWAP (*words); + words++; + } + + R( a, b, c, d, e, F1, K1, x[ 0] ); + R( e, a, b, c, d, F1, K1, x[ 1] ); + R( d, e, a, b, c, F1, K1, x[ 2] ); + R( c, d, e, a, b, F1, K1, x[ 3] ); + R( b, c, d, e, a, F1, K1, x[ 4] ); + R( a, b, c, d, e, F1, K1, x[ 5] ); + R( e, a, b, c, d, F1, K1, x[ 6] ); + R( d, e, a, b, c, F1, K1, x[ 7] ); + R( c, d, e, a, b, F1, K1, x[ 8] ); + R( b, c, d, e, a, F1, K1, x[ 9] ); + R( a, b, c, d, e, F1, K1, x[10] ); + R( e, a, b, c, d, F1, K1, x[11] ); + R( d, e, a, b, c, F1, K1, x[12] ); + R( c, d, e, a, b, F1, K1, x[13] ); + R( b, c, d, e, a, F1, K1, x[14] ); + R( a, b, c, d, e, F1, K1, x[15] ); + R( e, a, b, c, d, F1, K1, M(16) ); + R( d, e, a, b, c, F1, K1, M(17) ); + R( c, d, e, a, b, F1, K1, M(18) ); + R( b, c, d, e, a, F1, K1, M(19) ); + R( a, b, c, d, e, F2, K2, M(20) ); + R( e, a, b, c, d, F2, K2, M(21) ); + R( d, e, a, b, c, F2, K2, M(22) ); + R( c, d, e, a, b, F2, K2, M(23) ); + R( b, c, d, e, a, F2, K2, M(24) ); + R( a, b, c, d, e, F2, K2, M(25) ); + R( e, a, b, c, d, F2, K2, M(26) ); + R( d, e, a, b, c, F2, K2, M(27) ); + R( c, d, e, a, b, F2, K2, M(28) ); + R( b, c, d, e, a, F2, K2, M(29) ); + R( a, b, c, d, e, F2, K2, M(30) ); + R( e, a, b, c, d, F2, K2, M(31) ); + R( d, e, a, b, c, F2, K2, M(32) ); + R( c, d, e, a, b, F2, K2, M(33) ); + R( b, c, d, e, a, F2, K2, M(34) ); + R( a, b, c, d, e, F2, K2, M(35) ); + R( e, a, b, c, d, F2, K2, M(36) ); + R( d, e, a, b, c, F2, K2, M(37) ); + R( c, d, e, a, b, F2, K2, M(38) ); + R( b, c, d, e, a, F2, K2, M(39) ); + R( a, b, c, d, e, F3, K3, M(40) ); + R( e, a, b, c, d, F3, K3, M(41) ); + R( d, e, a, b, c, F3, K3, M(42) ); + R( c, d, e, a, b, F3, K3, M(43) ); + R( b, c, d, e, a, F3, K3, M(44) ); + R( a, b, c, d, e, F3, K3, M(45) ); + R( e, a, b, c, d, F3, K3, M(46) ); + R( d, e, a, b, c, F3, K3, M(47) ); + R( c, d, e, a, b, F3, K3, M(48) ); + R( b, c, d, e, a, F3, K3, M(49) ); + R( a, b, c, d, e, F3, K3, M(50) ); + R( e, a, b, c, d, F3, K3, M(51) ); + R( d, e, a, b, c, F3, K3, M(52) ); + R( c, d, e, a, b, F3, K3, M(53) ); + R( b, c, d, e, a, F3, K3, M(54) ); + R( a, b, c, d, e, F3, K3, M(55) ); + R( e, a, b, c, d, F3, K3, M(56) ); + R( d, e, a, b, c, F3, K3, M(57) ); + R( c, d, e, a, b, F3, K3, M(58) ); + R( b, c, d, e, a, F3, K3, M(59) ); + R( a, b, c, d, e, F4, K4, M(60) ); + R( e, a, b, c, d, F4, K4, M(61) ); + R( d, e, a, b, c, F4, K4, M(62) ); + R( c, d, e, a, b, F4, K4, M(63) ); + R( b, c, d, e, a, F4, K4, M(64) ); + R( a, b, c, d, e, F4, K4, M(65) ); + R( e, a, b, c, d, F4, K4, M(66) ); + R( d, e, a, b, c, F4, K4, M(67) ); + R( c, d, e, a, b, F4, K4, M(68) ); + R( b, c, d, e, a, F4, K4, M(69) ); + R( a, b, c, d, e, F4, K4, M(70) ); + R( e, a, b, c, d, F4, K4, M(71) ); + R( d, e, a, b, c, F4, K4, M(72) ); + R( c, d, e, a, b, F4, K4, M(73) ); + R( b, c, d, e, a, F4, K4, M(74) ); + R( a, b, c, d, e, F4, K4, M(75) ); + R( e, a, b, c, d, F4, K4, M(76) ); + R( d, e, a, b, c, F4, K4, M(77) ); + R( c, d, e, a, b, F4, K4, M(78) ); + R( b, c, d, e, a, F4, K4, M(79) ); + + a = ctx->A += a; + b = ctx->B += b; + c = ctx->C += c; + d = ctx->D += d; + e = ctx->E += e; + } +} @@ -0,0 +1,136 @@ +/* Declarations of functions and data types used for SHA1 sum + library functions. + Copyright (C) 2000, 2001, 2003, 2005, 2006, 2008 + Free Software Foundation, Inc. + + This program is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published by the + Free Software Foundation; either version 2, or (at your option) any + later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software Foundation, + Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */ + +#ifndef SHA1_H +# define SHA1_H 1 + +#include <stdio.h> + +#if 1 /* defined HAVE_LIMITS_H || _LIBC */ +# include <limits.h> +#endif + +/* The following contortions are an attempt to use the C preprocessor + to determine an unsigned integral type that is 32 bits wide. An + alternative approach is to use autoconf's AC_CHECK_SIZEOF macro, but + doing that would require that the configure script compile and *run* + the resulting executable. Locally running cross-compiled executables + is usually not possible. */ + +#if 1 /* def _LIBC */ +# include <stdint.h> +typedef uint32_t sha1_uint32; +typedef uintptr_t sha1_uintptr; +#else +# define INT_MAX_32_BITS 2147483647 + +/* If UINT_MAX isn't defined, assume it's a 32-bit type. + This should be valid for all systems GNU cares about because + that doesn't include 16-bit systems, and only modern systems + (that certainly have <limits.h>) have 64+-bit integral types. */ + +# ifndef INT_MAX +# define INT_MAX INT_MAX_32_BITS +# endif + +# if INT_MAX == INT_MAX_32_BITS + typedef unsigned int sha1_uint32; +# else +# if SHRT_MAX == INT_MAX_32_BITS + typedef unsigned short sha1_uint32; +# else +# if LONG_MAX == INT_MAX_32_BITS + typedef unsigned long sha1_uint32; +# else + /* The following line is intended to evoke an error. + Using #error is not portable enough. */ + "Cannot determine unsigned 32-bit data type." +# endif +# endif +# endif +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +/* Structure to save state of computation between the single steps. */ +struct sha1_ctx +{ + sha1_uint32 A; + sha1_uint32 B; + sha1_uint32 C; + sha1_uint32 D; + sha1_uint32 E; + + sha1_uint32 total[2]; + sha1_uint32 buflen; + sha1_uint32 buffer[32]; +}; + +/* Initialize structure containing state of computation. */ +extern void sha1_init_ctx (struct sha1_ctx *ctx); + +/* Starting with the result of former calls of this function (or the + initialization function update the context for the next LEN bytes + starting at BUFFER. + It is necessary that LEN is a multiple of 64!!! */ +extern void sha1_process_block (const void *buffer, size_t len, + struct sha1_ctx *ctx); + +/* Starting with the result of former calls of this function (or the + initialization function update the context for the next LEN bytes + starting at BUFFER. + It is NOT required that LEN is a multiple of 64. */ +extern void sha1_process_bytes (const void *buffer, size_t len, + struct sha1_ctx *ctx); + +/* Process the remaining bytes in the buffer and put result from CTX + in first 20 bytes following RESBUF. The result is always in little + endian byte order, so that a byte-wise output yields to the wanted + ASCII representation of the message digest. + + IMPORTANT: On some systems it is required that RESBUF be correctly + aligned for a 32 bits value. */ +extern void *sha1_finish_ctx (struct sha1_ctx *ctx, void *resbuf); + +/* Put result from CTX in first 20 bytes following RESBUF. The result is + always in little endian byte order, so that a byte-wise output yields + to the wanted ASCII representation of the message digest. + + IMPORTANT: On some systems it is required that RESBUF is correctly + aligned for a 32 bits value. */ +extern void *sha1_read_ctx (const struct sha1_ctx *ctx, void *resbuf); + +/* Compute SHA1 message digest for bytes read from STREAM. The + resulting message digest number will be written into the 20 bytes + beginning at RESBLOCK. */ +extern int sha1_stream (FILE *stream, void *resblock); + +/* Compute SHA1 message digest for LEN bytes beginning at BUFFER. The + result is always in little endian byte order, so that a byte-wise + output yields to the wanted ASCII representation of the message + digest. */ +extern void *sha1_buffer (const char *buffer, size_t len, void *resblock); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/super-ddf.c b/super-ddf.c new file mode 100644 index 00000000..faaf0a7c --- /dev/null +++ b/super-ddf.c @@ -0,0 +1,5273 @@ +/* + * mdadm - manage Linux "md" devices aka RAID arrays. + * + * Copyright (C) 2006-2014 Neil Brown <neilb@suse.de> + * + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Author: Neil Brown + * Email: <neil@brown.name> + * + * Specifications for DDF taken from Common RAID DDF Specification Revision 1.2 + * (July 28 2006). Reused by permission of SNIA. + */ + +#define HAVE_STDINT_H 1 +#include "mdadm.h" +#include "mdmon.h" +#include "sha1.h" +#include <values.h> +#include <stddef.h> + +/* a non-official T10 name for creation GUIDs */ +static char T10[] = "Linux-MD"; + +/* DDF timestamps are 1980 based, so we need to add + * second-in-decade-of-seventies to convert to linux timestamps. + * 10 years with 2 leap years. + */ +#define DECADE (3600*24*(365*10+2)) +unsigned long crc32( + unsigned long crc, + const unsigned char *buf, + unsigned len); + +#define DDF_NOTFOUND (~0U) +#define DDF_CONTAINER (DDF_NOTFOUND-1) + +/* Default for safe_mode_delay. Same value as for IMSM. + */ +static const int DDF_SAFE_MODE_DELAY = 4000; + +/* The DDF metadata handling. + * DDF metadata lives at the end of the device. + * The last 512 byte block provides an 'anchor' which is used to locate + * the rest of the metadata which usually lives immediately behind the anchor. + * + * Note: + * - all multibyte numeric fields are bigendian. + * - all strings are space padded. + * + */ + +typedef struct __be16 { + __u16 _v16; +} be16; +#define be16_eq(x, y) ((x)._v16 == (y)._v16) +#define be16_and(x, y) ((x)._v16 & (y)._v16) +#define be16_or(x, y) ((x)._v16 | (y)._v16) +#define be16_clear(x, y) ((x)._v16 &= ~(y)._v16) +#define be16_set(x, y) ((x)._v16 |= (y)._v16) + +typedef struct __be32 { + __u32 _v32; +} be32; +#define be32_eq(x, y) ((x)._v32 == (y)._v32) + +typedef struct __be64 { + __u64 _v64; +} be64; +#define be64_eq(x, y) ((x)._v64 == (y)._v64) + +#define be16_to_cpu(be) __be16_to_cpu((be)._v16) +static inline be16 cpu_to_be16(__u16 x) +{ + be16 be = { ._v16 = __cpu_to_be16(x) }; + return be; +} + +#define be32_to_cpu(be) __be32_to_cpu((be)._v32) +static inline be32 cpu_to_be32(__u32 x) +{ + be32 be = { ._v32 = __cpu_to_be32(x) }; + return be; +} + +#define be64_to_cpu(be) __be64_to_cpu((be)._v64) +static inline be64 cpu_to_be64(__u64 x) +{ + be64 be = { ._v64 = __cpu_to_be64(x) }; + return be; +} + +/* Primary Raid Level (PRL) */ +#define DDF_RAID0 0x00 +#define DDF_RAID1 0x01 +#define DDF_RAID3 0x03 +#define DDF_RAID4 0x04 +#define DDF_RAID5 0x05 +#define DDF_RAID1E 0x11 +#define DDF_JBOD 0x0f +#define DDF_CONCAT 0x1f +#define DDF_RAID5E 0x15 +#define DDF_RAID5EE 0x25 +#define DDF_RAID6 0x06 + +/* Raid Level Qualifier (RLQ) */ +#define DDF_RAID0_SIMPLE 0x00 +#define DDF_RAID1_SIMPLE 0x00 /* just 2 devices in this plex */ +#define DDF_RAID1_MULTI 0x01 /* exactly 3 devices in this plex */ +#define DDF_RAID3_0 0x00 /* parity in first extent */ +#define DDF_RAID3_N 0x01 /* parity in last extent */ +#define DDF_RAID4_0 0x00 /* parity in first extent */ +#define DDF_RAID4_N 0x01 /* parity in last extent */ +/* these apply to raid5e and raid5ee as well */ +#define DDF_RAID5_0_RESTART 0x00 /* same as 'right asymmetric' - layout 1 */ +#define DDF_RAID6_0_RESTART 0x01 /* raid6 different from raid5 here!!! */ +#define DDF_RAID5_N_RESTART 0x02 /* same as 'left asymmetric' - layout 0 */ +#define DDF_RAID5_N_CONTINUE 0x03 /* same as 'left symmetric' - layout 2 */ + +#define DDF_RAID1E_ADJACENT 0x00 /* raid10 nearcopies==2 */ +#define DDF_RAID1E_OFFSET 0x01 /* raid10 offsetcopies==2 */ + +/* Secondary RAID Level (SRL) */ +#define DDF_2STRIPED 0x00 /* This is weirder than RAID0 !! */ +#define DDF_2MIRRORED 0x01 +#define DDF_2CONCAT 0x02 +#define DDF_2SPANNED 0x03 /* This is also weird - be careful */ + +/* Magic numbers */ +#define DDF_HEADER_MAGIC cpu_to_be32(0xDE11DE11) +#define DDF_CONTROLLER_MAGIC cpu_to_be32(0xAD111111) +#define DDF_PHYS_RECORDS_MAGIC cpu_to_be32(0x22222222) +#define DDF_PHYS_DATA_MAGIC cpu_to_be32(0x33333333) +#define DDF_VIRT_RECORDS_MAGIC cpu_to_be32(0xDDDDDDDD) +#define DDF_VD_CONF_MAGIC cpu_to_be32(0xEEEEEEEE) +#define DDF_SPARE_ASSIGN_MAGIC cpu_to_be32(0x55555555) +#define DDF_VU_CONF_MAGIC cpu_to_be32(0x88888888) +#define DDF_VENDOR_LOG_MAGIC cpu_to_be32(0x01dBEEF0) +#define DDF_BBM_LOG_MAGIC cpu_to_be32(0xABADB10C) + +#define DDF_GUID_LEN 24 +#define DDF_REVISION_0 "01.00.00" +#define DDF_REVISION_2 "01.02.00" + +struct ddf_header { + be32 magic; /* DDF_HEADER_MAGIC */ + be32 crc; + char guid[DDF_GUID_LEN]; + char revision[8]; /* 01.02.00 */ + be32 seq; /* starts at '1' */ + be32 timestamp; + __u8 openflag; + __u8 foreignflag; + __u8 enforcegroups; + __u8 pad0; /* 0xff */ + __u8 pad1[12]; /* 12 * 0xff */ + /* 64 bytes so far */ + __u8 header_ext[32]; /* reserved: fill with 0xff */ + be64 primary_lba; + be64 secondary_lba; + __u8 type; + __u8 pad2[3]; /* 0xff */ + be32 workspace_len; /* sectors for vendor space - + * at least 32768(sectors) */ + be64 workspace_lba; + be16 max_pd_entries; /* one of 15, 63, 255, 1023, 4095 */ + be16 max_vd_entries; /* 2^(4,6,8,10,12)-1 : i.e. as above */ + be16 max_partitions; /* i.e. max num of configuration + record entries per disk */ + be16 config_record_len; /* 1 +ROUNDUP(max_primary_element_entries + *12/512) */ + be16 max_primary_element_entries; /* 16, 64, 256, 1024, or 4096 */ + __u8 pad3[54]; /* 0xff */ + /* 192 bytes so far */ + be32 controller_section_offset; + be32 controller_section_length; + be32 phys_section_offset; + be32 phys_section_length; + be32 virt_section_offset; + be32 virt_section_length; + be32 config_section_offset; + be32 config_section_length; + be32 data_section_offset; + be32 data_section_length; + be32 bbm_section_offset; + be32 bbm_section_length; + be32 diag_space_offset; + be32 diag_space_length; + be32 vendor_offset; + be32 vendor_length; + /* 256 bytes so far */ + __u8 pad4[256]; /* 0xff */ +}; + +/* type field */ +#define DDF_HEADER_ANCHOR 0x00 +#define DDF_HEADER_PRIMARY 0x01 +#define DDF_HEADER_SECONDARY 0x02 + +/* The content of the 'controller section' - global scope */ +struct ddf_controller_data { + be32 magic; /* DDF_CONTROLLER_MAGIC */ + be32 crc; + char guid[DDF_GUID_LEN]; + struct controller_type { + be16 vendor_id; + be16 device_id; + be16 sub_vendor_id; + be16 sub_device_id; + } type; + char product_id[16]; + __u8 pad[8]; /* 0xff */ + __u8 vendor_data[448]; +}; + +/* The content of phys_section - global scope */ +struct phys_disk { + be32 magic; /* DDF_PHYS_RECORDS_MAGIC */ + be32 crc; + be16 used_pdes; /* This is a counter, not a max - the list + * of used entries may not be dense */ + be16 max_pdes; + __u8 pad[52]; + struct phys_disk_entry { + char guid[DDF_GUID_LEN]; + be32 refnum; + be16 type; + be16 state; + be64 config_size; /* DDF structures must be after here */ + char path[18]; /* Another horrible structure really + * but is "used for information + * purposes only" */ + __u8 pad[6]; + } entries[0]; +}; + +/* phys_disk_entry.type is a bitmap - bigendian remember */ +#define DDF_Forced_PD_GUID 1 +#define DDF_Active_in_VD 2 +#define DDF_Global_Spare 4 /* VD_CONF records are ignored */ +#define DDF_Spare 8 /* overrides Global_spare */ +#define DDF_Foreign 16 +#define DDF_Legacy 32 /* no DDF on this device */ + +#define DDF_Interface_mask 0xf00 +#define DDF_Interface_SCSI 0x100 +#define DDF_Interface_SAS 0x200 +#define DDF_Interface_SATA 0x300 +#define DDF_Interface_FC 0x400 + +/* phys_disk_entry.state is a bigendian bitmap */ +#define DDF_Online 1 +#define DDF_Failed 2 /* overrides 1,4,8 */ +#define DDF_Rebuilding 4 +#define DDF_Transition 8 +#define DDF_SMART 16 +#define DDF_ReadErrors 32 +#define DDF_Missing 64 + +/* The content of the virt_section global scope */ +struct virtual_disk { + be32 magic; /* DDF_VIRT_RECORDS_MAGIC */ + be32 crc; + be16 populated_vdes; + be16 max_vdes; + __u8 pad[52]; + struct virtual_entry { + char guid[DDF_GUID_LEN]; + be16 unit; + __u16 pad0; /* 0xffff */ + be16 guid_crc; + be16 type; + __u8 state; + __u8 init_state; + __u8 pad1[14]; + char name[16]; + } entries[0]; +}; + +/* virtual_entry.type is a bitmap - bigendian */ +#define DDF_Shared 1 +#define DDF_Enforce_Groups 2 +#define DDF_Unicode 4 +#define DDF_Owner_Valid 8 + +/* virtual_entry.state is a bigendian bitmap */ +#define DDF_state_mask 0x7 +#define DDF_state_optimal 0x0 +#define DDF_state_degraded 0x1 +#define DDF_state_deleted 0x2 +#define DDF_state_missing 0x3 +#define DDF_state_failed 0x4 +#define DDF_state_part_optimal 0x5 + +#define DDF_state_morphing 0x8 +#define DDF_state_inconsistent 0x10 + +/* virtual_entry.init_state is a bigendian bitmap */ +#define DDF_initstate_mask 0x03 +#define DDF_init_not 0x00 +#define DDF_init_quick 0x01 /* initialisation is progress. + * i.e. 'state_inconsistent' */ +#define DDF_init_full 0x02 + +#define DDF_access_mask 0xc0 +#define DDF_access_rw 0x00 +#define DDF_access_ro 0x80 +#define DDF_access_blocked 0xc0 + +/* The content of the config_section - local scope + * It has multiple records each config_record_len sectors + * They can be vd_config or spare_assign + */ + +struct vd_config { + be32 magic; /* DDF_VD_CONF_MAGIC */ + be32 crc; + char guid[DDF_GUID_LEN]; + be32 timestamp; + be32 seqnum; + __u8 pad0[24]; + be16 prim_elmnt_count; + __u8 chunk_shift; /* 0 == 512, 1==1024 etc */ + __u8 prl; + __u8 rlq; + __u8 sec_elmnt_count; + __u8 sec_elmnt_seq; + __u8 srl; + be64 blocks; /* blocks per component could be different + * on different component devices...(only + * for concat I hope) */ + be64 array_blocks; /* blocks in array */ + __u8 pad1[8]; + be32 spare_refs[8]; /* This is used to detect missing spares. + * As we don't have an interface for that + * the values are ignored. + */ + __u8 cache_pol[8]; + __u8 bg_rate; + __u8 pad2[3]; + __u8 pad3[52]; + __u8 pad4[192]; + __u8 v0[32]; /* reserved- 0xff */ + __u8 v1[32]; /* reserved- 0xff */ + __u8 v2[16]; /* reserved- 0xff */ + __u8 v3[16]; /* reserved- 0xff */ + __u8 vendor[32]; + be32 phys_refnum[0]; /* refnum of each disk in sequence */ + /*__u64 lba_offset[0]; LBA offset in each phys. Note extents in a + bvd are always the same size */ +}; +#define LBA_OFFSET(ddf, vd) ((be64 *) &(vd)->phys_refnum[(ddf)->mppe]) + +/* vd_config.cache_pol[7] is a bitmap */ +#define DDF_cache_writeback 1 /* else writethrough */ +#define DDF_cache_wadaptive 2 /* only applies if writeback */ +#define DDF_cache_readahead 4 +#define DDF_cache_radaptive 8 /* only if doing read-ahead */ +#define DDF_cache_ifnobatt 16 /* even to write cache if battery is poor */ +#define DDF_cache_wallowed 32 /* enable write caching */ +#define DDF_cache_rallowed 64 /* enable read caching */ + +struct spare_assign { + be32 magic; /* DDF_SPARE_ASSIGN_MAGIC */ + be32 crc; + be32 timestamp; + __u8 reserved[7]; + __u8 type; + be16 populated; /* SAEs used */ + be16 max; /* max SAEs */ + __u8 pad[8]; + struct spare_assign_entry { + char guid[DDF_GUID_LEN]; + be16 secondary_element; + __u8 pad[6]; + } spare_ents[0]; +}; +/* spare_assign.type is a bitmap */ +#define DDF_spare_dedicated 0x1 /* else global */ +#define DDF_spare_revertible 0x2 /* else committable */ +#define DDF_spare_active 0x4 /* else not active */ +#define DDF_spare_affinity 0x8 /* enclosure affinity */ + +/* The data_section contents - local scope */ +struct disk_data { + be32 magic; /* DDF_PHYS_DATA_MAGIC */ + be32 crc; + char guid[DDF_GUID_LEN]; + be32 refnum; /* crc of some magic drive data ... */ + __u8 forced_ref; /* set when above was not result of magic */ + __u8 forced_guid; /* set if guid was forced rather than magic */ + __u8 vendor[32]; + __u8 pad[442]; +}; + +/* bbm_section content */ +struct bad_block_log { + be32 magic; + be32 crc; + be16 entry_count; + be32 spare_count; + __u8 pad[10]; + be64 first_spare; + struct mapped_block { + be64 defective_start; + be32 replacement_start; + be16 remap_count; + __u8 pad[2]; + } entries[0]; +}; + +/* Struct for internally holding ddf structures */ +/* The DDF structure stored on each device is potentially + * quite different, as some data is global and some is local. + * The global data is: + * - ddf header + * - controller_data + * - Physical disk records + * - Virtual disk records + * The local data is: + * - Configuration records + * - Physical Disk data section + * ( and Bad block and vendor which I don't care about yet). + * + * The local data is parsed into separate lists as it is read + * and reconstructed for writing. This means that we only need + * to make config changes once and they are automatically + * propagated to all devices. + * The global (config and disk data) records are each in a list + * of separate data structures. When writing we find the entry + * or entries applicable to the particular device. + */ +struct ddf_super { + struct ddf_header anchor, primary, secondary; + struct ddf_controller_data controller; + struct ddf_header *active; + struct phys_disk *phys; + struct virtual_disk *virt; + char *conf; + int pdsize, vdsize; + unsigned int max_part, mppe, conf_rec_len; + int currentdev; + int updates_pending; + struct vcl { + union { + char space[512]; + struct { + struct vcl *next; + unsigned int vcnum; /* index into ->virt */ + /* For an array with a secondary level there are + * multiple vd_config structures, all with the same + * guid but with different sec_elmnt_seq. + * One of these structures is in 'conf' below. + * The others are in other_bvds, not in any + * particular order. + */ + struct vd_config **other_bvds; + __u64 *block_sizes; /* NULL if all the same */ + }; + }; + struct vd_config conf; + } *conflist, *currentconf; + struct dl { + union { + char space[512]; + struct { + struct dl *next; + int major, minor; + char *devname; + int fd; + unsigned long long size; /* sectors */ + be64 primary_lba; /* sectors */ + be64 secondary_lba; /* sectors */ + be64 workspace_lba; /* sectors */ + int pdnum; /* index in ->phys */ + struct spare_assign *spare; + void *mdupdate; /* hold metadata update */ + + /* These fields used by auto-layout */ + int raiddisk; /* slot to fill in autolayout */ + __u64 esize; + int displayed; + }; + }; + struct disk_data disk; + struct vcl *vlist[0]; /* max_part in size */ + } *dlist, *add_list; +}; + +#ifndef MDASSEMBLE +static int load_super_ddf_all(struct supertype *st, int fd, + void **sbp, char *devname); +static int get_svd_state(const struct ddf_super *, const struct vcl *); +static int +validate_geometry_ddf_container(struct supertype *st, + int level, int layout, int raiddisks, + int chunk, unsigned long long size, + unsigned long long data_offset, + char *dev, unsigned long long *freesize, + int verbose); + +static int validate_geometry_ddf_bvd(struct supertype *st, + int level, int layout, int raiddisks, + int *chunk, unsigned long long size, + unsigned long long data_offset, + char *dev, unsigned long long *freesize, + int verbose); +#endif + +static void free_super_ddf(struct supertype *st); +static int all_ff(const char *guid); +static unsigned int get_pd_index_from_refnum(const struct vcl *vc, + be32 refnum, unsigned int nmax, + const struct vd_config **bvd, + unsigned int *idx); +static void getinfo_super_ddf(struct supertype *st, struct mdinfo *info, char *map); +static void uuid_from_ddf_guid(const char *guid, int uuid[4]); +static void uuid_from_super_ddf(struct supertype *st, int uuid[4]); +static void _ddf_array_name(char *name, const struct ddf_super *ddf, int i); +static void getinfo_super_ddf_bvd(struct supertype *st, struct mdinfo *info, char *map); +static int init_super_ddf_bvd(struct supertype *st, + mdu_array_info_t *info, + unsigned long long size, + char *name, char *homehost, + int *uuid, unsigned long long data_offset); + +#if DEBUG +static void pr_state(struct ddf_super *ddf, const char *msg) +{ + unsigned int i; + dprintf("%s: ", msg); + for (i = 0; i < be16_to_cpu(ddf->active->max_vd_entries); i++) { + if (all_ff(ddf->virt->entries[i].guid)) + continue; + dprintf_cont("%u(s=%02x i=%02x) ", i, + ddf->virt->entries[i].state, + ddf->virt->entries[i].init_state); + } + dprintf_cont("\n"); +} +#else +static void pr_state(const struct ddf_super *ddf, const char *msg) {} +#endif + +static void _ddf_set_updates_pending(struct ddf_super *ddf, struct vd_config *vc, + const char *func) +{ + if (vc) { + vc->timestamp = cpu_to_be32(time(0)-DECADE); + vc->seqnum = cpu_to_be32(be32_to_cpu(vc->seqnum) + 1); + } + if (ddf->updates_pending) + return; + ddf->updates_pending = 1; + ddf->active->seq = cpu_to_be32((be32_to_cpu(ddf->active->seq)+1)); + pr_state(ddf, func); +} + +#define ddf_set_updates_pending(x,v) _ddf_set_updates_pending((x), (v), __func__) + +static be32 calc_crc(void *buf, int len) +{ + /* crcs are always at the same place as in the ddf_header */ + struct ddf_header *ddf = buf; + be32 oldcrc = ddf->crc; + __u32 newcrc; + ddf->crc = cpu_to_be32(0xffffffff); + + newcrc = crc32(0, buf, len); + ddf->crc = oldcrc; + /* The crc is stored (like everything) bigendian, so convert + * here for simplicity + */ + return cpu_to_be32(newcrc); +} + +#define DDF_INVALID_LEVEL 0xff +#define DDF_NO_SECONDARY 0xff +static int err_bad_md_layout(const mdu_array_info_t *array) +{ + pr_err("RAID%d layout %x with %d disks is unsupported for DDF\n", + array->level, array->layout, array->raid_disks); + return -1; +} + +static int layout_md2ddf(const mdu_array_info_t *array, + struct vd_config *conf) +{ + be16 prim_elmnt_count = cpu_to_be16(array->raid_disks); + __u8 prl = DDF_INVALID_LEVEL, rlq = 0; + __u8 sec_elmnt_count = 1; + __u8 srl = DDF_NO_SECONDARY; + + switch (array->level) { + case LEVEL_LINEAR: + prl = DDF_CONCAT; + break; + case 0: + rlq = DDF_RAID0_SIMPLE; + prl = DDF_RAID0; + break; + case 1: + switch (array->raid_disks) { + case 2: + rlq = DDF_RAID1_SIMPLE; + break; + case 3: + rlq = DDF_RAID1_MULTI; + break; + default: + return err_bad_md_layout(array); + } + prl = DDF_RAID1; + break; + case 4: + if (array->layout != 0) + return err_bad_md_layout(array); + rlq = DDF_RAID4_N; + prl = DDF_RAID4; + break; + case 5: + switch (array->layout) { + case ALGORITHM_LEFT_ASYMMETRIC: + rlq = DDF_RAID5_N_RESTART; + break; + case ALGORITHM_RIGHT_ASYMMETRIC: + rlq = DDF_RAID5_0_RESTART; + break; + case ALGORITHM_LEFT_SYMMETRIC: + rlq = DDF_RAID5_N_CONTINUE; + break; + case ALGORITHM_RIGHT_SYMMETRIC: + /* not mentioned in standard */ + default: + return err_bad_md_layout(array); + } + prl = DDF_RAID5; + break; + case 6: + switch (array->layout) { + case ALGORITHM_ROTATING_N_RESTART: + rlq = DDF_RAID5_N_RESTART; + break; + case ALGORITHM_ROTATING_ZERO_RESTART: + rlq = DDF_RAID6_0_RESTART; + break; + case ALGORITHM_ROTATING_N_CONTINUE: + rlq = DDF_RAID5_N_CONTINUE; + break; + default: + return err_bad_md_layout(array); + } + prl = DDF_RAID6; + break; + case 10: + if (array->raid_disks % 2 == 0 && array->layout == 0x102) { + rlq = DDF_RAID1_SIMPLE; + prim_elmnt_count = cpu_to_be16(2); + sec_elmnt_count = array->raid_disks / 2; + srl = DDF_2SPANNED; + prl = DDF_RAID1; + } else if (array->raid_disks % 3 == 0 + && array->layout == 0x103) { + rlq = DDF_RAID1_MULTI; + prim_elmnt_count = cpu_to_be16(3); + sec_elmnt_count = array->raid_disks / 3; + srl = DDF_2SPANNED; + prl = DDF_RAID1; + } else if (array->layout == 0x201) { + prl = DDF_RAID1E; + rlq = DDF_RAID1E_OFFSET; + } else if (array->layout == 0x102) { + prl = DDF_RAID1E; + rlq = DDF_RAID1E_ADJACENT; + } else + return err_bad_md_layout(array); + break; + default: + return err_bad_md_layout(array); + } + conf->prl = prl; + conf->prim_elmnt_count = prim_elmnt_count; + conf->rlq = rlq; + conf->srl = srl; + conf->sec_elmnt_count = sec_elmnt_count; + return 0; +} + +static int err_bad_ddf_layout(const struct vd_config *conf) +{ + pr_err("DDF RAID %u qualifier %u with %u disks is unsupported\n", + conf->prl, conf->rlq, be16_to_cpu(conf->prim_elmnt_count)); + return -1; +} + +static int layout_ddf2md(const struct vd_config *conf, + mdu_array_info_t *array) +{ + int level = LEVEL_UNSUPPORTED; + int layout = 0; + int raiddisks = be16_to_cpu(conf->prim_elmnt_count); + + if (conf->sec_elmnt_count > 1) { + /* see also check_secondary() */ + if (conf->prl != DDF_RAID1 || + (conf->srl != DDF_2STRIPED && conf->srl != DDF_2SPANNED)) { + pr_err("Unsupported secondary RAID level %u/%u\n", + conf->prl, conf->srl); + return -1; + } + if (raiddisks == 2 && conf->rlq == DDF_RAID1_SIMPLE) + layout = 0x102; + else if (raiddisks == 3 && conf->rlq == DDF_RAID1_MULTI) + layout = 0x103; + else + return err_bad_ddf_layout(conf); + raiddisks *= conf->sec_elmnt_count; + level = 10; + goto good; + } + + switch (conf->prl) { + case DDF_CONCAT: + level = LEVEL_LINEAR; + break; + case DDF_RAID0: + if (conf->rlq != DDF_RAID0_SIMPLE) + return err_bad_ddf_layout(conf); + level = 0; + break; + case DDF_RAID1: + if (!((conf->rlq == DDF_RAID1_SIMPLE && raiddisks == 2) || + (conf->rlq == DDF_RAID1_MULTI && raiddisks == 3))) + return err_bad_ddf_layout(conf); + level = 1; + break; + case DDF_RAID1E: + if (conf->rlq == DDF_RAID1E_ADJACENT) + layout = 0x102; + else if (conf->rlq == DDF_RAID1E_OFFSET) + layout = 0x201; + else + return err_bad_ddf_layout(conf); + level = 10; + break; + case DDF_RAID4: + if (conf->rlq != DDF_RAID4_N) + return err_bad_ddf_layout(conf); + level = 4; + break; + case DDF_RAID5: + switch (conf->rlq) { + case DDF_RAID5_N_RESTART: + layout = ALGORITHM_LEFT_ASYMMETRIC; + break; + case DDF_RAID5_0_RESTART: + layout = ALGORITHM_RIGHT_ASYMMETRIC; + break; + case DDF_RAID5_N_CONTINUE: + layout = ALGORITHM_LEFT_SYMMETRIC; + break; + default: + return err_bad_ddf_layout(conf); + } + level = 5; + break; + case DDF_RAID6: + switch (conf->rlq) { + case DDF_RAID5_N_RESTART: + layout = ALGORITHM_ROTATING_N_RESTART; + break; + case DDF_RAID6_0_RESTART: + layout = ALGORITHM_ROTATING_ZERO_RESTART; + break; + case DDF_RAID5_N_CONTINUE: + layout = ALGORITHM_ROTATING_N_CONTINUE; + break; + default: + return err_bad_ddf_layout(conf); + } + level = 6; + break; + default: + return err_bad_ddf_layout(conf); + }; + +good: + array->level = level; + array->layout = layout; + array->raid_disks = raiddisks; + return 0; +} + +static int load_ddf_header(int fd, unsigned long long lba, + unsigned long long size, + int type, + struct ddf_header *hdr, struct ddf_header *anchor) +{ + /* read a ddf header (primary or secondary) from fd/lba + * and check that it is consistent with anchor + * Need to check: + * magic, crc, guid, rev, and LBA's header_type, and + * everything after header_type must be the same + */ + if (lba >= size-1) + return 0; + + if (lseek64(fd, lba<<9, 0) < 0) + return 0; + + if (read(fd, hdr, 512) != 512) + return 0; + + if (!be32_eq(hdr->magic, DDF_HEADER_MAGIC)) { + pr_err("bad header magic\n"); + return 0; + } + if (!be32_eq(calc_crc(hdr, 512), hdr->crc)) { + pr_err("bad CRC\n"); + return 0; + } + if (memcmp(anchor->guid, hdr->guid, DDF_GUID_LEN) != 0 || + memcmp(anchor->revision, hdr->revision, 8) != 0 || + !be64_eq(anchor->primary_lba, hdr->primary_lba) || + !be64_eq(anchor->secondary_lba, hdr->secondary_lba) || + hdr->type != type || + memcmp(anchor->pad2, hdr->pad2, 512 - + offsetof(struct ddf_header, pad2)) != 0) { + pr_err("header mismatch\n"); + return 0; + } + + /* Looks good enough to me... */ + return 1; +} + +static void *load_section(int fd, struct ddf_super *super, void *buf, + be32 offset_be, be32 len_be, int check) +{ + unsigned long long offset = be32_to_cpu(offset_be); + unsigned long long len = be32_to_cpu(len_be); + int dofree = (buf == NULL); + + if (check) + if (len != 2 && len != 8 && len != 32 + && len != 128 && len != 512) + return NULL; + + if (len > 1024) + return NULL; + if (!buf && posix_memalign(&buf, 512, len<<9) != 0) + buf = NULL; + + if (!buf) + return NULL; + + if (super->active->type == 1) + offset += be64_to_cpu(super->active->primary_lba); + else + offset += be64_to_cpu(super->active->secondary_lba); + + if ((unsigned long long)lseek64(fd, offset<<9, 0) != (offset<<9)) { + if (dofree) + free(buf); + return NULL; + } + if ((unsigned long long)read(fd, buf, len<<9) != (len<<9)) { + if (dofree) + free(buf); + return NULL; + } + return buf; +} + +static int load_ddf_headers(int fd, struct ddf_super *super, char *devname) +{ + unsigned long long dsize; + + get_dev_size(fd, NULL, &dsize); + + if (lseek64(fd, dsize-512, 0) < 0) { + if (devname) + pr_err("Cannot seek to anchor block on %s: %s\n", + devname, strerror(errno)); + return 1; + } + if (read(fd, &super->anchor, 512) != 512) { + if (devname) + pr_err("Cannot read anchor block on %s: %s\n", + devname, strerror(errno)); + return 1; + } + if (!be32_eq(super->anchor.magic, DDF_HEADER_MAGIC)) { + if (devname) + pr_err("no DDF anchor found on %s\n", + devname); + return 2; + } + if (!be32_eq(calc_crc(&super->anchor, 512), super->anchor.crc)) { + if (devname) + pr_err("bad CRC on anchor on %s\n", + devname); + return 2; + } + if (memcmp(super->anchor.revision, DDF_REVISION_0, 8) != 0 && + memcmp(super->anchor.revision, DDF_REVISION_2, 8) != 0) { + if (devname) + pr_err("can only support super revision %.8s and earlier, not %.8s on %s\n", + DDF_REVISION_2, super->anchor.revision,devname); + return 2; + } + super->active = NULL; + if (load_ddf_header(fd, be64_to_cpu(super->anchor.primary_lba), + dsize >> 9, 1, + &super->primary, &super->anchor) == 0) { + if (devname) + pr_err("Failed to load primary DDF header on %s\n", devname); + } else + super->active = &super->primary; + + if (load_ddf_header(fd, be64_to_cpu(super->anchor.secondary_lba), + dsize >> 9, 2, + &super->secondary, &super->anchor)) { + if (super->active == NULL + || (be32_to_cpu(super->primary.seq) + < be32_to_cpu(super->secondary.seq) && + !super->secondary.openflag) + || (be32_to_cpu(super->primary.seq) + == be32_to_cpu(super->secondary.seq) && + super->primary.openflag && !super->secondary.openflag) + ) + super->active = &super->secondary; + } else if (devname && + be64_to_cpu(super->anchor.secondary_lba) != ~(__u64)0) + pr_err("Failed to load secondary DDF header on %s\n", + devname); + if (super->active == NULL) + return 2; + return 0; +} + +static int load_ddf_global(int fd, struct ddf_super *super, char *devname) +{ + void *ok; + ok = load_section(fd, super, &super->controller, + super->active->controller_section_offset, + super->active->controller_section_length, + 0); + super->phys = load_section(fd, super, NULL, + super->active->phys_section_offset, + super->active->phys_section_length, + 1); + super->pdsize = be32_to_cpu(super->active->phys_section_length) * 512; + + super->virt = load_section(fd, super, NULL, + super->active->virt_section_offset, + super->active->virt_section_length, + 1); + super->vdsize = be32_to_cpu(super->active->virt_section_length) * 512; + if (!ok || + !super->phys || + !super->virt) { + free(super->phys); + free(super->virt); + super->phys = NULL; + super->virt = NULL; + return 2; + } + super->conflist = NULL; + super->dlist = NULL; + + super->max_part = be16_to_cpu(super->active->max_partitions); + super->mppe = be16_to_cpu(super->active->max_primary_element_entries); + super->conf_rec_len = be16_to_cpu(super->active->config_record_len); + return 0; +} + +#define DDF_UNUSED_BVD 0xff +static int alloc_other_bvds(const struct ddf_super *ddf, struct vcl *vcl) +{ + unsigned int n_vds = vcl->conf.sec_elmnt_count - 1; + unsigned int i, vdsize; + void *p; + if (n_vds == 0) { + vcl->other_bvds = NULL; + return 0; + } + vdsize = ddf->conf_rec_len * 512; + if (posix_memalign(&p, 512, n_vds * + (vdsize + sizeof(struct vd_config *))) != 0) + return -1; + vcl->other_bvds = (struct vd_config **) (p + n_vds * vdsize); + for (i = 0; i < n_vds; i++) { + vcl->other_bvds[i] = p + i * vdsize; + memset(vcl->other_bvds[i], 0, vdsize); + vcl->other_bvds[i]->sec_elmnt_seq = DDF_UNUSED_BVD; + } + return 0; +} + +static void add_other_bvd(struct vcl *vcl, struct vd_config *vd, + unsigned int len) +{ + int i; + for (i = 0; i < vcl->conf.sec_elmnt_count-1; i++) + if (vcl->other_bvds[i]->sec_elmnt_seq == vd->sec_elmnt_seq) + break; + + if (i < vcl->conf.sec_elmnt_count-1) { + if (be32_to_cpu(vd->seqnum) <= + be32_to_cpu(vcl->other_bvds[i]->seqnum)) + return; + } else { + for (i = 0; i < vcl->conf.sec_elmnt_count-1; i++) + if (vcl->other_bvds[i]->sec_elmnt_seq == DDF_UNUSED_BVD) + break; + if (i == vcl->conf.sec_elmnt_count-1) { + pr_err("no space for sec level config %u, count is %u\n", + vd->sec_elmnt_seq, vcl->conf.sec_elmnt_count); + return; + } + } + memcpy(vcl->other_bvds[i], vd, len); +} + +static int load_ddf_local(int fd, struct ddf_super *super, + char *devname, int keep) +{ + struct dl *dl; + struct stat stb; + char *conf; + unsigned int i; + unsigned int confsec; + int vnum; + unsigned int max_virt_disks = + be16_to_cpu(super->active->max_vd_entries); + unsigned long long dsize; + + /* First the local disk info */ + if (posix_memalign((void**)&dl, 512, + sizeof(*dl) + + (super->max_part) * sizeof(dl->vlist[0])) != 0) { + pr_err("could not allocate disk info buffer\n"); + return 1; + } + + load_section(fd, super, &dl->disk, + super->active->data_section_offset, + super->active->data_section_length, + 0); + dl->devname = devname ? xstrdup(devname) : NULL; + + fstat(fd, &stb); + dl->major = major(stb.st_rdev); + dl->minor = minor(stb.st_rdev); + dl->next = super->dlist; + dl->fd = keep ? fd : -1; + + dl->size = 0; + if (get_dev_size(fd, devname, &dsize)) + dl->size = dsize >> 9; + /* If the disks have different sizes, the LBAs will differ + * between phys disks. + * At this point here, the values in super->active must be valid + * for this phys disk. */ + dl->primary_lba = super->active->primary_lba; + dl->secondary_lba = super->active->secondary_lba; + dl->workspace_lba = super->active->workspace_lba; + dl->spare = NULL; + for (i = 0 ; i < super->max_part ; i++) + dl->vlist[i] = NULL; + super->dlist = dl; + dl->pdnum = -1; + for (i = 0; i < be16_to_cpu(super->active->max_pd_entries); i++) + if (memcmp(super->phys->entries[i].guid, + dl->disk.guid, DDF_GUID_LEN) == 0) + dl->pdnum = i; + + /* Now the config list. */ + /* 'conf' is an array of config entries, some of which are + * probably invalid. Those which are good need to be copied into + * the conflist + */ + + conf = load_section(fd, super, super->conf, + super->active->config_section_offset, + super->active->config_section_length, + 0); + super->conf = conf; + vnum = 0; + for (confsec = 0; + confsec < be32_to_cpu(super->active->config_section_length); + confsec += super->conf_rec_len) { + struct vd_config *vd = + (struct vd_config *)((char*)conf + confsec*512); + struct vcl *vcl; + + if (be32_eq(vd->magic, DDF_SPARE_ASSIGN_MAGIC)) { + if (dl->spare) + continue; + if (posix_memalign((void**)&dl->spare, 512, + super->conf_rec_len*512) != 0) { + pr_err("could not allocate spare info buf\n"); + return 1; + } + + memcpy(dl->spare, vd, super->conf_rec_len*512); + continue; + } + if (!be32_eq(vd->magic, DDF_VD_CONF_MAGIC)) + /* Must be vendor-unique - I cannot handle those */ + continue; + + for (vcl = super->conflist; vcl; vcl = vcl->next) { + if (memcmp(vcl->conf.guid, + vd->guid, DDF_GUID_LEN) == 0) + break; + } + + if (vcl) { + dl->vlist[vnum++] = vcl; + if (vcl->other_bvds != NULL && + vcl->conf.sec_elmnt_seq != vd->sec_elmnt_seq) { + add_other_bvd(vcl, vd, super->conf_rec_len*512); + continue; + } + if (be32_to_cpu(vd->seqnum) <= + be32_to_cpu(vcl->conf.seqnum)) + continue; + } else { + if (posix_memalign((void**)&vcl, 512, + (super->conf_rec_len*512 + + offsetof(struct vcl, conf))) != 0) { + pr_err("could not allocate vcl buf\n"); + return 1; + } + vcl->next = super->conflist; + vcl->block_sizes = NULL; /* FIXME not for CONCAT */ + vcl->conf.sec_elmnt_count = vd->sec_elmnt_count; + if (alloc_other_bvds(super, vcl) != 0) { + pr_err("could not allocate other bvds\n"); + free(vcl); + return 1; + }; + super->conflist = vcl; + dl->vlist[vnum++] = vcl; + } + memcpy(&vcl->conf, vd, super->conf_rec_len*512); + for (i=0; i < max_virt_disks ; i++) + if (memcmp(super->virt->entries[i].guid, + vcl->conf.guid, DDF_GUID_LEN)==0) + break; + if (i < max_virt_disks) + vcl->vcnum = i; + } + + return 0; +} + +static int load_super_ddf(struct supertype *st, int fd, + char *devname) +{ + unsigned long long dsize; + struct ddf_super *super; + int rv; + + if (get_dev_size(fd, devname, &dsize) == 0) + return 1; + + if (test_partition(fd)) + /* DDF is not allowed on partitions */ + return 1; + + /* 32M is a lower bound */ + if (dsize <= 32*1024*1024) { + if (devname) + pr_err("%s is too small for ddf: size is %llu sectors.\n", + devname, dsize>>9); + return 1; + } + if (dsize & 511) { + if (devname) + pr_err("%s is an odd size for ddf: size is %llu bytes.\n", + devname, dsize); + return 1; + } + + free_super_ddf(st); + + if (posix_memalign((void**)&super, 512, sizeof(*super))!= 0) { + pr_err("malloc of %zu failed.\n", + sizeof(*super)); + return 1; + } + memset(super, 0, sizeof(*super)); + + rv = load_ddf_headers(fd, super, devname); + if (rv) { + free(super); + return rv; + } + + /* Have valid headers and have chosen the best. Let's read in the rest*/ + + rv = load_ddf_global(fd, super, devname); + + if (rv) { + if (devname) + pr_err("Failed to load all information sections on %s\n", devname); + free(super); + return rv; + } + + rv = load_ddf_local(fd, super, devname, 0); + + if (rv) { + if (devname) + pr_err("Failed to load all information sections on %s\n", devname); + free(super); + return rv; + } + + /* Should possibly check the sections .... */ + + st->sb = super; + if (st->ss == NULL) { + st->ss = &super_ddf; + st->minor_version = 0; + st->max_devs = 512; + } + return 0; + +} + +static void free_super_ddf(struct supertype *st) +{ + struct ddf_super *ddf = st->sb; + if (ddf == NULL) + return; + free(ddf->phys); + free(ddf->virt); + free(ddf->conf); + while (ddf->conflist) { + struct vcl *v = ddf->conflist; + ddf->conflist = v->next; + if (v->block_sizes) + free(v->block_sizes); + if (v->other_bvds) + /* + v->other_bvds[0] points to beginning of buffer, + see alloc_other_bvds() + */ + free(v->other_bvds[0]); + free(v); + } + while (ddf->dlist) { + struct dl *d = ddf->dlist; + ddf->dlist = d->next; + if (d->fd >= 0) + close(d->fd); + if (d->spare) + free(d->spare); + free(d); + } + while (ddf->add_list) { + struct dl *d = ddf->add_list; + ddf->add_list = d->next; + if (d->fd >= 0) + close(d->fd); + if (d->spare) + free(d->spare); + free(d); + } + free(ddf); + st->sb = NULL; +} + +static struct supertype *match_metadata_desc_ddf(char *arg) +{ + /* 'ddf' only supports containers */ + struct supertype *st; + if (strcmp(arg, "ddf") != 0 && + strcmp(arg, "default") != 0 + ) + return NULL; + + st = xcalloc(1, sizeof(*st)); + st->ss = &super_ddf; + st->max_devs = 512; + st->minor_version = 0; + st->sb = NULL; + return st; +} + +#ifndef MDASSEMBLE + +static mapping_t ddf_state[] = { + { "Optimal", 0}, + { "Degraded", 1}, + { "Deleted", 2}, + { "Missing", 3}, + { "Failed", 4}, + { "Partially Optimal", 5}, + { "-reserved-", 6}, + { "-reserved-", 7}, + { NULL, 0} +}; + +static mapping_t ddf_init_state[] = { + { "Not Initialised", 0}, + { "QuickInit in Progress", 1}, + { "Fully Initialised", 2}, + { "*UNKNOWN*", 3}, + { NULL, 0} +}; +static mapping_t ddf_access[] = { + { "Read/Write", 0}, + { "Reserved", 1}, + { "Read Only", 2}, + { "Blocked (no access)", 3}, + { NULL ,0} +}; + +static mapping_t ddf_level[] = { + { "RAID0", DDF_RAID0}, + { "RAID1", DDF_RAID1}, + { "RAID3", DDF_RAID3}, + { "RAID4", DDF_RAID4}, + { "RAID5", DDF_RAID5}, + { "RAID1E",DDF_RAID1E}, + { "JBOD", DDF_JBOD}, + { "CONCAT",DDF_CONCAT}, + { "RAID5E",DDF_RAID5E}, + { "RAID5EE",DDF_RAID5EE}, + { "RAID6", DDF_RAID6}, + { NULL, 0} +}; +static mapping_t ddf_sec_level[] = { + { "Striped", DDF_2STRIPED}, + { "Mirrored", DDF_2MIRRORED}, + { "Concat", DDF_2CONCAT}, + { "Spanned", DDF_2SPANNED}, + { NULL, 0} +}; +#endif + +static int all_ff(const char *guid) +{ + int i; + for (i = 0; i < DDF_GUID_LEN; i++) + if (guid[i] != (char)0xff) + return 0; + return 1; +} + +static const char *guid_str(const char *guid) +{ + static char buf[DDF_GUID_LEN*2+1]; + int i; + char *p = buf; + for (i = 0; i < DDF_GUID_LEN; i++) { + unsigned char c = guid[i]; + if (c >= 32 && c < 127) + p += sprintf(p, "%c", c); + else + p += sprintf(p, "%02x", c); + } + *p = '\0'; + return (const char *) buf; +} + +#ifndef MDASSEMBLE +static void print_guid(char *guid, int tstamp) +{ + /* A GUIDs are part (or all) ASCII and part binary. + * They tend to be space padded. + * We print the GUID in HEX, then in parentheses add + * any initial ASCII sequence, and a possible + * time stamp from bytes 16-19 + */ + int l = DDF_GUID_LEN; + int i; + + for (i=0 ; i<DDF_GUID_LEN ; i++) { + if ((i&3)==0 && i != 0) printf(":"); + printf("%02X", guid[i]&255); + } + + printf("\n ("); + while (l && guid[l-1] == ' ') + l--; + for (i=0 ; i<l ; i++) { + if (guid[i] >= 0x20 && guid[i] < 0x7f) + fputc(guid[i], stdout); + else + break; + } + if (tstamp) { + time_t then = __be32_to_cpu(*(__u32*)(guid+16)) + DECADE; + char tbuf[100]; + struct tm *tm; + tm = localtime(&then); + strftime(tbuf, 100, " %D %T",tm); + fputs(tbuf, stdout); + } + printf(")"); +} + +static void examine_vd(int n, struct ddf_super *sb, char *guid) +{ + int crl = sb->conf_rec_len; + struct vcl *vcl; + + for (vcl = sb->conflist ; vcl ; vcl = vcl->next) { + unsigned int i; + struct vd_config *vc = &vcl->conf; + + if (!be32_eq(calc_crc(vc, crl*512), vc->crc)) + continue; + if (memcmp(vc->guid, guid, DDF_GUID_LEN) != 0) + continue; + + /* Ok, we know about this VD, let's give more details */ + printf(" Raid Devices[%d] : %d (", n, + be16_to_cpu(vc->prim_elmnt_count)); + for (i = 0; i < be16_to_cpu(vc->prim_elmnt_count); i++) { + int j; + int cnt = be16_to_cpu(sb->phys->max_pdes); + for (j=0; j<cnt; j++) + if (be32_eq(vc->phys_refnum[i], + sb->phys->entries[j].refnum)) + break; + if (i) printf(" "); + if (j < cnt) + printf("%d", j); + else + printf("--"); + printf("@%lluK", (unsigned long long) be64_to_cpu(LBA_OFFSET(sb, vc)[i])/2); + } + printf(")\n"); + if (vc->chunk_shift != 255) + printf(" Chunk Size[%d] : %d sectors\n", n, + 1 << vc->chunk_shift); + printf(" Raid Level[%d] : %s\n", n, + map_num(ddf_level, vc->prl)?:"-unknown-"); + if (vc->sec_elmnt_count != 1) { + printf(" Secondary Position[%d] : %d of %d\n", n, + vc->sec_elmnt_seq, vc->sec_elmnt_count); + printf(" Secondary Level[%d] : %s\n", n, + map_num(ddf_sec_level, vc->srl) ?: "-unknown-"); + } + printf(" Device Size[%d] : %llu\n", n, + be64_to_cpu(vc->blocks)/2); + printf(" Array Size[%d] : %llu\n", n, + be64_to_cpu(vc->array_blocks)/2); + } +} + +static void examine_vds(struct ddf_super *sb) +{ + int cnt = be16_to_cpu(sb->virt->populated_vdes); + unsigned int i; + printf(" Virtual Disks : %d\n", cnt); + + for (i = 0; i < be16_to_cpu(sb->virt->max_vdes); i++) { + struct virtual_entry *ve = &sb->virt->entries[i]; + if (all_ff(ve->guid)) + continue; + printf("\n"); + printf(" VD GUID[%d] : ", i); print_guid(ve->guid, 1); + printf("\n"); + printf(" unit[%d] : %d\n", i, be16_to_cpu(ve->unit)); + printf(" state[%d] : %s, %s%s\n", i, + map_num(ddf_state, ve->state & 7), + (ve->state & DDF_state_morphing) ? "Morphing, ": "", + (ve->state & DDF_state_inconsistent)? "Not Consistent" : "Consistent"); + printf(" init state[%d] : %s\n", i, + map_num(ddf_init_state, ve->init_state&DDF_initstate_mask)); + printf(" access[%d] : %s\n", i, + map_num(ddf_access, (ve->init_state & DDF_access_mask) >> 6)); + printf(" Name[%d] : %.16s\n", i, ve->name); + examine_vd(i, sb, ve->guid); + } + if (cnt) printf("\n"); +} + +static void examine_pds(struct ddf_super *sb) +{ + int cnt = be16_to_cpu(sb->phys->max_pdes); + int i; + struct dl *dl; + int unlisted = 0; + printf(" Physical Disks : %d\n", cnt); + printf(" Number RefNo Size Device Type/State\n"); + + for (dl = sb->dlist; dl; dl = dl->next) + dl->displayed = 0; + + for (i=0 ; i<cnt ; i++) { + struct phys_disk_entry *pd = &sb->phys->entries[i]; + int type = be16_to_cpu(pd->type); + int state = be16_to_cpu(pd->state); + + if (be32_to_cpu(pd->refnum) == 0xffffffff) + /* Not in use */ + continue; + //printf(" PD GUID[%d] : ", i); print_guid(pd->guid, 0); + //printf("\n"); + printf(" %3d %08x ", i, + be32_to_cpu(pd->refnum)); + printf("%8lluK ", + be64_to_cpu(pd->config_size)>>1); + for (dl = sb->dlist; dl ; dl = dl->next) { + if (be32_eq(dl->disk.refnum, pd->refnum)) { + char *dv = map_dev(dl->major, dl->minor, 0); + if (dv) { + printf("%-15s", dv); + break; + } + } + } + if (!dl) + printf("%15s",""); + else + dl->displayed = 1; + printf(" %s%s%s%s%s", + (type&2) ? "active":"", + (type&4) ? "Global-Spare":"", + (type&8) ? "spare" : "", + (type&16)? ", foreign" : "", + (type&32)? "pass-through" : ""); + if (state & DDF_Failed) + /* This over-rides these three */ + state &= ~(DDF_Online|DDF_Rebuilding|DDF_Transition); + printf("/%s%s%s%s%s%s%s", + (state&1)? "Online": "Offline", + (state&2)? ", Failed": "", + (state&4)? ", Rebuilding": "", + (state&8)? ", in-transition": "", + (state&16)? ", SMART-errors": "", + (state&32)? ", Unrecovered-Read-Errors": "", + (state&64)? ", Missing" : ""); + printf("\n"); + } + for (dl = sb->dlist; dl; dl = dl->next) { + char *dv; + if (dl->displayed) + continue; + if (!unlisted) + printf(" Physical disks not in metadata!:\n"); + unlisted = 1; + dv = map_dev(dl->major, dl->minor, 0); + printf(" %08x %s\n", be32_to_cpu(dl->disk.refnum), + dv ? dv : "-unknown-"); + } + if (unlisted) + printf("\n"); +} + +static void examine_super_ddf(struct supertype *st, char *homehost) +{ + struct ddf_super *sb = st->sb; + + printf(" Magic : %08x\n", be32_to_cpu(sb->anchor.magic)); + printf(" Version : %.8s\n", sb->anchor.revision); + printf("Controller GUID : "); print_guid(sb->controller.guid, 0); + printf("\n"); + printf(" Container GUID : "); print_guid(sb->anchor.guid, 1); + printf("\n"); + printf(" Seq : %08x\n", be32_to_cpu(sb->active->seq)); + printf(" Redundant hdr : %s\n", (be32_eq(sb->secondary.magic, + DDF_HEADER_MAGIC) + ?"yes" : "no")); + examine_vds(sb); + examine_pds(sb); +} + +static unsigned int get_vd_num_of_subarray(struct supertype *st) +{ + /* + * Figure out the VD number for this supertype. + * Returns DDF_CONTAINER for the container itself, + * and DDF_NOTFOUND on error. + */ + struct ddf_super *ddf = st->sb; + struct mdinfo *sra; + char *sub, *end; + unsigned int vcnum; + + if (*st->container_devnm == '\0') + return DDF_CONTAINER; + + sra = sysfs_read(-1, st->devnm, GET_VERSION); + if (!sra || sra->array.major_version != -1 || + sra->array.minor_version != -2 || + !is_subarray(sra->text_version)) + return DDF_NOTFOUND; + + sub = strchr(sra->text_version + 1, '/'); + if (sub != NULL) + vcnum = strtoul(sub + 1, &end, 10); + if (sub == NULL || *sub == '\0' || *end != '\0' || + vcnum >= be16_to_cpu(ddf->active->max_vd_entries)) + return DDF_NOTFOUND; + + return vcnum; +} + +static void brief_examine_super_ddf(struct supertype *st, int verbose) +{ + /* We just write a generic DDF ARRAY entry + */ + struct mdinfo info; + char nbuf[64]; + getinfo_super_ddf(st, &info, NULL); + fname_from_uuid(st, &info, nbuf, ':'); + + printf("ARRAY metadata=ddf UUID=%s\n", nbuf + 5); +} + +static void brief_examine_subarrays_ddf(struct supertype *st, int verbose) +{ + /* We write a DDF ARRAY member entry for each vd, identifying container + * by uuid and member by unit number and uuid. + */ + struct ddf_super *ddf = st->sb; + struct mdinfo info; + unsigned int i; + char nbuf[64]; + getinfo_super_ddf(st, &info, NULL); + fname_from_uuid(st, &info, nbuf, ':'); + + for (i = 0; i < be16_to_cpu(ddf->virt->max_vdes); i++) { + struct virtual_entry *ve = &ddf->virt->entries[i]; + struct vcl vcl; + char nbuf1[64]; + char namebuf[17]; + if (all_ff(ve->guid)) + continue; + memcpy(vcl.conf.guid, ve->guid, DDF_GUID_LEN); + ddf->currentconf =&vcl; + vcl.vcnum = i; + uuid_from_super_ddf(st, info.uuid); + fname_from_uuid(st, &info, nbuf1, ':'); + _ddf_array_name(namebuf, ddf, i); + printf("ARRAY%s%s container=%s member=%d UUID=%s\n", + namebuf[0] == '\0' ? "" : " /dev/md/", namebuf, + nbuf+5, i, nbuf1+5); + } +} + +static void export_examine_super_ddf(struct supertype *st) +{ + struct mdinfo info; + char nbuf[64]; + getinfo_super_ddf(st, &info, NULL); + fname_from_uuid(st, &info, nbuf, ':'); + printf("MD_METADATA=ddf\n"); + printf("MD_LEVEL=container\n"); + printf("MD_UUID=%s\n", nbuf+5); + printf("MD_DEVICES=%u\n", + be16_to_cpu(((struct ddf_super *)st->sb)->phys->used_pdes)); +} + +static int copy_metadata_ddf(struct supertype *st, int from, int to) +{ + void *buf; + unsigned long long dsize, offset; + int bytes; + struct ddf_header *ddf; + int written = 0; + + /* The meta consists of an anchor, a primary, and a secondary. + * This all lives at the end of the device. + * So it is easiest to find the earliest of primary and + * secondary, and copy everything from there. + * + * Anchor is 512 from end. It contains primary_lba and secondary_lba + * we choose one of those + */ + + if (posix_memalign(&buf, 4096, 4096) != 0) + return 1; + + if (!get_dev_size(from, NULL, &dsize)) + goto err; + + if (lseek64(from, dsize-512, 0) < 0) + goto err; + if (read(from, buf, 512) != 512) + goto err; + ddf = buf; + if (!be32_eq(ddf->magic, DDF_HEADER_MAGIC) || + !be32_eq(calc_crc(ddf, 512), ddf->crc) || + (memcmp(ddf->revision, DDF_REVISION_0, 8) != 0 && + memcmp(ddf->revision, DDF_REVISION_2, 8) != 0)) + goto err; + + offset = dsize - 512; + if ((be64_to_cpu(ddf->primary_lba) << 9) < offset) + offset = be64_to_cpu(ddf->primary_lba) << 9; + if ((be64_to_cpu(ddf->secondary_lba) << 9) < offset) + offset = be64_to_cpu(ddf->secondary_lba) << 9; + + bytes = dsize - offset; + + if (lseek64(from, offset, 0) < 0 || + lseek64(to, offset, 0) < 0) + goto err; + while (written < bytes) { + int n = bytes - written; + if (n > 4096) + n = 4096; + if (read(from, buf, n) != n) + goto err; + if (write(to, buf, n) != n) + goto err; + written += n; + } + free(buf); + return 0; +err: + free(buf); + return 1; +} + +static void detail_super_ddf(struct supertype *st, char *homehost) +{ + struct ddf_super *sb = st->sb; + int cnt = be16_to_cpu(sb->virt->populated_vdes); + + printf(" Container GUID : "); print_guid(sb->anchor.guid, 1); + printf("\n"); + printf(" Seq : %08x\n", be32_to_cpu(sb->active->seq)); + printf(" Virtual Disks : %d\n", cnt); + printf("\n"); +} +#endif + +static const char *vendors_with_variable_volume_UUID[] = { + "LSI ", +}; + +static int volume_id_is_reliable(const struct ddf_super *ddf) +{ + int n = ARRAY_SIZE(vendors_with_variable_volume_UUID); + int i; + for (i = 0; i < n; i++) + if (!memcmp(ddf->controller.guid, + vendors_with_variable_volume_UUID[i], 8)) + return 0; + return 1; +} + +static void uuid_of_ddf_subarray(const struct ddf_super *ddf, + unsigned int vcnum, int uuid[4]) +{ + char buf[DDF_GUID_LEN+18], sha[20], *p; + struct sha1_ctx ctx; + if (volume_id_is_reliable(ddf)) { + uuid_from_ddf_guid(ddf->virt->entries[vcnum].guid, uuid); + return; + } + /* + * Some fake RAID BIOSes (in particular, LSI ones) change the + * VD GUID at every boot. These GUIDs are not suitable for + * identifying an array. Luckily the header GUID appears to + * remain constant. + * We construct a pseudo-UUID from the header GUID and those + * properties of the subarray that we expect to remain constant. + */ + memset(buf, 0, sizeof(buf)); + p = buf; + memcpy(p, ddf->anchor.guid, DDF_GUID_LEN); + p += DDF_GUID_LEN; + memcpy(p, ddf->virt->entries[vcnum].name, 16); + p += 16; + *((__u16 *) p) = vcnum; + sha1_init_ctx(&ctx); + sha1_process_bytes(buf, sizeof(buf), &ctx); + sha1_finish_ctx(&ctx, sha); + memcpy(uuid, sha, 4*4); +} + +#ifndef MDASSEMBLE +static void brief_detail_super_ddf(struct supertype *st) +{ + struct mdinfo info; + char nbuf[64]; + struct ddf_super *ddf = st->sb; + unsigned int vcnum = get_vd_num_of_subarray(st); + if (vcnum == DDF_CONTAINER) + uuid_from_super_ddf(st, info.uuid); + else if (vcnum == DDF_NOTFOUND) + return; + else + uuid_of_ddf_subarray(ddf, vcnum, info.uuid); + fname_from_uuid(st, &info, nbuf,':'); + printf(" UUID=%s", nbuf + 5); +} +#endif + +static int match_home_ddf(struct supertype *st, char *homehost) +{ + /* It matches 'this' host if the controller is a + * Linux-MD controller with vendor_data matching + * the hostname. It would be nice if we could + * test against controller found in /sys or somewhere... + */ + struct ddf_super *ddf = st->sb; + unsigned int len; + + if (!homehost) + return 0; + len = strlen(homehost); + + return (memcmp(ddf->controller.guid, T10, 8) == 0 && + len < sizeof(ddf->controller.vendor_data) && + memcmp(ddf->controller.vendor_data, homehost,len) == 0 && + ddf->controller.vendor_data[len] == 0); +} + +#ifndef MDASSEMBLE +static int find_index_in_bvd(const struct ddf_super *ddf, + const struct vd_config *conf, unsigned int n, + unsigned int *n_bvd) +{ + /* + * Find the index of the n-th valid physical disk in this BVD. + * Unused entries can be sprinkled in with the used entries, + * but don't count. + */ + unsigned int i, j; + for (i = 0, j = 0; + i < ddf->mppe && j < be16_to_cpu(conf->prim_elmnt_count); + i++) { + if (be32_to_cpu(conf->phys_refnum[i]) != 0xffffffff) { + if (n == j) { + *n_bvd = i; + return 1; + } + j++; + } + } + dprintf("couldn't find BVD member %u (total %u)\n", + n, be16_to_cpu(conf->prim_elmnt_count)); + return 0; +} + +/* Given a member array instance number, and a raid disk within that instance, + * find the vd_config structure. The offset of the given disk in the phys_refnum + * table is returned in n_bvd. + * For two-level members with a secondary raid level the vd_config for + * the appropriate BVD is returned. + * The return value is always &vlc->conf, where vlc is returned in last pointer. + */ +static struct vd_config *find_vdcr(struct ddf_super *ddf, unsigned int inst, + unsigned int n, + unsigned int *n_bvd, struct vcl **vcl) +{ + struct vcl *v; + + for (v = ddf->conflist; v; v = v->next) { + unsigned int nsec, ibvd = 0; + struct vd_config *conf; + if (inst != v->vcnum) + continue; + conf = &v->conf; + if (conf->sec_elmnt_count == 1) { + if (find_index_in_bvd(ddf, conf, n, n_bvd)) { + *vcl = v; + return conf; + } else + goto bad; + } + if (v->other_bvds == NULL) { + pr_err("BUG: other_bvds is NULL, nsec=%u\n", + conf->sec_elmnt_count); + goto bad; + } + nsec = n / be16_to_cpu(conf->prim_elmnt_count); + if (conf->sec_elmnt_seq != nsec) { + for (ibvd = 1; ibvd < conf->sec_elmnt_count; ibvd++) { + if (v->other_bvds[ibvd-1]->sec_elmnt_seq + == nsec) + break; + } + if (ibvd == conf->sec_elmnt_count) + goto bad; + conf = v->other_bvds[ibvd-1]; + } + if (!find_index_in_bvd(ddf, conf, + n - nsec*conf->sec_elmnt_count, n_bvd)) + goto bad; + dprintf("found disk %u as member %u in bvd %d of array %u\n", + n, *n_bvd, ibvd, inst); + *vcl = v; + return conf; + } +bad: + pr_err("Could't find disk %d in array %u\n", n, inst); + return NULL; +} +#endif + +static int find_phys(const struct ddf_super *ddf, be32 phys_refnum) +{ + /* Find the entry in phys_disk which has the given refnum + * and return it's index + */ + unsigned int i; + for (i = 0; i < be16_to_cpu(ddf->phys->max_pdes); i++) + if (be32_eq(ddf->phys->entries[i].refnum, phys_refnum)) + return i; + return -1; +} + +static void uuid_from_ddf_guid(const char *guid, int uuid[4]) +{ + char buf[20]; + struct sha1_ctx ctx; + sha1_init_ctx(&ctx); + sha1_process_bytes(guid, DDF_GUID_LEN, &ctx); + sha1_finish_ctx(&ctx, buf); + memcpy(uuid, buf, 4*4); +} + +static void uuid_from_super_ddf(struct supertype *st, int uuid[4]) +{ + /* The uuid returned here is used for: + * uuid to put into bitmap file (Create, Grow) + * uuid for backup header when saving critical section (Grow) + * comparing uuids when re-adding a device into an array + * In these cases the uuid required is that of the data-array, + * not the device-set. + * uuid to recognise same set when adding a missing device back + * to an array. This is a uuid for the device-set. + * + * For each of these we can make do with a truncated + * or hashed uuid rather than the original, as long as + * everyone agrees. + * In the case of SVD we assume the BVD is of interest, + * though that might be the case if a bitmap were made for + * a mirrored SVD - worry about that later. + * So we need to find the VD configuration record for the + * relevant BVD and extract the GUID and Secondary_Element_Seq. + * The first 16 bytes of the sha1 of these is used. + */ + struct ddf_super *ddf = st->sb; + struct vcl *vcl = ddf->currentconf; + + if (vcl) + uuid_of_ddf_subarray(ddf, vcl->vcnum, uuid); + else + uuid_from_ddf_guid(ddf->anchor.guid, uuid); +} + +static void getinfo_super_ddf(struct supertype *st, struct mdinfo *info, char *map) +{ + struct ddf_super *ddf = st->sb; + int map_disks = info->array.raid_disks; + __u32 *cptr; + + if (ddf->currentconf) { + getinfo_super_ddf_bvd(st, info, map); + return; + } + memset(info, 0, sizeof(*info)); + + info->array.raid_disks = be16_to_cpu(ddf->phys->used_pdes); + info->array.level = LEVEL_CONTAINER; + info->array.layout = 0; + info->array.md_minor = -1; + cptr = (__u32 *)(ddf->anchor.guid + 16); + info->array.ctime = DECADE + __be32_to_cpu(*cptr); + + info->array.chunk_size = 0; + info->container_enough = 1; + + info->disk.major = 0; + info->disk.minor = 0; + if (ddf->dlist) { + struct phys_disk_entry *pde = NULL; + info->disk.number = be32_to_cpu(ddf->dlist->disk.refnum); + info->disk.raid_disk = find_phys(ddf, ddf->dlist->disk.refnum); + + info->data_offset = be64_to_cpu(ddf->phys-> + entries[info->disk.raid_disk]. + config_size); + info->component_size = ddf->dlist->size - info->data_offset; + if (info->disk.raid_disk >= 0) + pde = ddf->phys->entries + info->disk.raid_disk; + if (pde && + !(be16_to_cpu(pde->state) & DDF_Failed) && + !(be16_to_cpu(pde->state) & DDF_Missing)) + info->disk.state = (1 << MD_DISK_SYNC) | (1 << MD_DISK_ACTIVE); + else + info->disk.state = 1 << MD_DISK_FAULTY; + + } else { + /* There should always be a dlist, but just in case...*/ + info->disk.number = -1; + info->disk.raid_disk = -1; + info->disk.state = (1 << MD_DISK_SYNC) | (1 << MD_DISK_ACTIVE); + } + info->events = be32_to_cpu(ddf->active->seq); + info->array.utime = DECADE + be32_to_cpu(ddf->active->timestamp); + + info->recovery_start = MaxSector; + info->reshape_active = 0; + info->recovery_blocked = 0; + info->name[0] = 0; + + info->array.major_version = -1; + info->array.minor_version = -2; + strcpy(info->text_version, "ddf"); + info->safe_mode_delay = 0; + + uuid_from_super_ddf(st, info->uuid); + + if (map) { + int i, e = 0; + int max = be16_to_cpu(ddf->phys->max_pdes); + for (i = e = 0 ; i < map_disks ; i++, e++) { + while (e < max && + be32_to_cpu(ddf->phys->entries[e].refnum) == 0xffffffff) + e++; + if (i < info->array.raid_disks && e < max && + !(be16_to_cpu(ddf->phys->entries[e].state) + & DDF_Failed)) + map[i] = 1; + else + map[i] = 0; + } + } +} + +/* size of name must be at least 17 bytes! */ +static void _ddf_array_name(char *name, const struct ddf_super *ddf, int i) +{ + int j; + memcpy(name, ddf->virt->entries[i].name, 16); + name[16] = 0; + for(j = 0; j < 16; j++) + if (name[j] == ' ') + name[j] = 0; +} + +static void getinfo_super_ddf_bvd(struct supertype *st, struct mdinfo *info, char *map) +{ + struct ddf_super *ddf = st->sb; + struct vcl *vc = ddf->currentconf; + int cd = ddf->currentdev; + int n_prim; + int j; + struct dl *dl = NULL; + int map_disks = info->array.raid_disks; + __u32 *cptr; + struct vd_config *conf; + + memset(info, 0, sizeof(*info)); + if (layout_ddf2md(&vc->conf, &info->array) == -1) + return; + info->array.md_minor = -1; + cptr = (__u32 *)(vc->conf.guid + 16); + info->array.ctime = DECADE + __be32_to_cpu(*cptr); + info->array.utime = DECADE + be32_to_cpu(vc->conf.timestamp); + info->array.chunk_size = 512 << vc->conf.chunk_shift; + info->custom_array_size = be64_to_cpu(vc->conf.array_blocks); + + conf = &vc->conf; + n_prim = be16_to_cpu(conf->prim_elmnt_count); + if (conf->sec_elmnt_count > 1 && cd >= n_prim) { + int ibvd = cd / n_prim - 1; + cd %= n_prim; + conf = vc->other_bvds[ibvd]; + } + + if (cd >= 0 && (unsigned)cd < ddf->mppe) { + info->data_offset = + be64_to_cpu(LBA_OFFSET(ddf, conf)[cd]); + if (vc->block_sizes) + info->component_size = vc->block_sizes[cd]; + else + info->component_size = be64_to_cpu(conf->blocks); + + for (dl = ddf->dlist; dl ; dl = dl->next) + if (be32_eq(dl->disk.refnum, conf->phys_refnum[cd])) + break; + } + + info->disk.major = 0; + info->disk.minor = 0; + info->disk.state = 0; + if (dl && dl->pdnum >= 0) { + info->disk.major = dl->major; + info->disk.minor = dl->minor; + info->disk.raid_disk = cd + conf->sec_elmnt_seq + * be16_to_cpu(conf->prim_elmnt_count); + info->disk.number = dl->pdnum; + info->disk.state = 0; + if (info->disk.number >= 0 && + (be16_to_cpu(ddf->phys->entries[info->disk.number].state) & DDF_Online) && + !(be16_to_cpu(ddf->phys->entries[info->disk.number].state) & DDF_Failed)) + info->disk.state = (1<<MD_DISK_SYNC)|(1<<MD_DISK_ACTIVE); + info->events = be32_to_cpu(ddf->active->seq); + } + + info->container_member = ddf->currentconf->vcnum; + + info->recovery_start = MaxSector; + info->resync_start = 0; + info->reshape_active = 0; + info->recovery_blocked = 0; + if (!(ddf->virt->entries[info->container_member].state + & DDF_state_inconsistent) && + (ddf->virt->entries[info->container_member].init_state + & DDF_initstate_mask) + == DDF_init_full) + info->resync_start = MaxSector; + + uuid_from_super_ddf(st, info->uuid); + + info->array.major_version = -1; + info->array.minor_version = -2; + sprintf(info->text_version, "/%s/%d", + st->container_devnm, + info->container_member); + info->safe_mode_delay = DDF_SAFE_MODE_DELAY; + + _ddf_array_name(info->name, ddf, info->container_member); + + if (map) + for (j = 0; j < map_disks; j++) { + map[j] = 0; + if (j < info->array.raid_disks) { + int i = find_phys(ddf, vc->conf.phys_refnum[j]); + if (i >= 0 && + (be16_to_cpu(ddf->phys->entries[i].state) + & DDF_Online) && + !(be16_to_cpu(ddf->phys->entries[i].state) + & DDF_Failed)) + map[i] = 1; + } + } +} + +static int update_super_ddf(struct supertype *st, struct mdinfo *info, + char *update, + char *devname, int verbose, + int uuid_set, char *homehost) +{ + /* For 'assemble' and 'force' we need to return non-zero if any + * change was made. For others, the return value is ignored. + * Update options are: + * force-one : This device looks a bit old but needs to be included, + * update age info appropriately. + * assemble: clear any 'faulty' flag to allow this device to + * be assembled. + * force-array: Array is degraded but being forced, mark it clean + * if that will be needed to assemble it. + * + * newdev: not used ???? + * grow: Array has gained a new device - this is currently for + * linear only + * resync: mark as dirty so a resync will happen. + * uuid: Change the uuid of the array to match what is given + * homehost: update the recorded homehost + * name: update the name - preserving the homehost + * _reshape_progress: record new reshape_progress position. + * + * Following are not relevant for this version: + * sparc2.2 : update from old dodgey metadata + * super-minor: change the preferred_minor number + * summaries: update redundant counters. + */ + int rv = 0; +// struct ddf_super *ddf = st->sb; +// struct vd_config *vd = find_vdcr(ddf, info->container_member); +// struct virtual_entry *ve = find_ve(ddf); + + /* we don't need to handle "force-*" or "assemble" as + * there is no need to 'trick' the kernel. When the metadata is + * first updated to activate the array, all the implied modifications + * will just happen. + */ + + if (strcmp(update, "grow") == 0) { + /* FIXME */ + } else if (strcmp(update, "resync") == 0) { +// info->resync_checkpoint = 0; + } else if (strcmp(update, "homehost") == 0) { + /* homehost is stored in controller->vendor_data, + * or it is when we are the vendor + */ +// if (info->vendor_is_local) +// strcpy(ddf->controller.vendor_data, homehost); + rv = -1; + } else if (strcmp(update, "name") == 0) { + /* name is stored in virtual_entry->name */ +// memset(ve->name, ' ', 16); +// strncpy(ve->name, info->name, 16); + rv = -1; + } else if (strcmp(update, "_reshape_progress") == 0) { + /* We don't support reshape yet */ + } else if (strcmp(update, "assemble") == 0 ) { + /* Do nothing, just succeed */ + rv = 0; + } else + rv = -1; + +// update_all_csum(ddf); + + return rv; +} + +static void make_header_guid(char *guid) +{ + be32 stamp; + /* Create a DDF Header of Virtual Disk GUID */ + + /* 24 bytes of fiction required. + * first 8 are a 'vendor-id' - "Linux-MD" + * next 8 are controller type.. how about 0X DEAD BEEF 0000 0000 + * Remaining 8 random number plus timestamp + */ + memcpy(guid, T10, sizeof(T10)); + stamp = cpu_to_be32(0xdeadbeef); + memcpy(guid+8, &stamp, 4); + stamp = cpu_to_be32(0); + memcpy(guid+12, &stamp, 4); + stamp = cpu_to_be32(time(0) - DECADE); + memcpy(guid+16, &stamp, 4); + stamp._v32 = random32(); + memcpy(guid+20, &stamp, 4); +} + +static unsigned int find_unused_vde(const struct ddf_super *ddf) +{ + unsigned int i; + for (i = 0; i < be16_to_cpu(ddf->virt->max_vdes); i++) { + if (all_ff(ddf->virt->entries[i].guid)) + return i; + } + return DDF_NOTFOUND; +} + +static unsigned int find_vde_by_name(const struct ddf_super *ddf, + const char *name) +{ + unsigned int i; + if (name == NULL) + return DDF_NOTFOUND; + for (i = 0; i < be16_to_cpu(ddf->virt->max_vdes); i++) { + if (all_ff(ddf->virt->entries[i].guid)) + continue; + if (!strncmp(name, ddf->virt->entries[i].name, + sizeof(ddf->virt->entries[i].name))) + return i; + } + return DDF_NOTFOUND; +} + +#ifndef MDASSEMBLE +static unsigned int find_vde_by_guid(const struct ddf_super *ddf, + const char *guid) +{ + unsigned int i; + if (guid == NULL || all_ff(guid)) + return DDF_NOTFOUND; + for (i = 0; i < be16_to_cpu(ddf->virt->max_vdes); i++) + if (!memcmp(ddf->virt->entries[i].guid, guid, DDF_GUID_LEN)) + return i; + return DDF_NOTFOUND; +} +#endif + +static int init_super_ddf(struct supertype *st, + mdu_array_info_t *info, + unsigned long long size, char *name, char *homehost, + int *uuid, unsigned long long data_offset) +{ + /* This is primarily called by Create when creating a new array. + * We will then get add_to_super called for each component, and then + * write_init_super called to write it out to each device. + * For DDF, Create can create on fresh devices or on a pre-existing + * array. + * To create on a pre-existing array a different method will be called. + * This one is just for fresh drives. + * + * We need to create the entire 'ddf' structure which includes: + * DDF headers - these are easy. + * Controller data - a Sector describing this controller .. not that + * this is a controller exactly. + * Physical Disk Record - one entry per device, so + * leave plenty of space. + * Virtual Disk Records - again, just leave plenty of space. + * This just lists VDs, doesn't give details. + * Config records - describe the VDs that use this disk + * DiskData - describes 'this' device. + * BadBlockManagement - empty + * Diag Space - empty + * Vendor Logs - Could we put bitmaps here? + * + */ + struct ddf_super *ddf; + char hostname[17]; + int hostlen; + int max_phys_disks, max_virt_disks; + unsigned long long sector; + int clen; + int i; + int pdsize, vdsize; + struct phys_disk *pd; + struct virtual_disk *vd; + + if (st->sb) + return init_super_ddf_bvd(st, info, size, name, homehost, uuid, + data_offset); + + if (posix_memalign((void**)&ddf, 512, sizeof(*ddf)) != 0) { + pr_err("could not allocate superblock\n"); + return 0; + } + memset(ddf, 0, sizeof(*ddf)); + st->sb = ddf; + + if (info == NULL) { + /* zeroing superblock */ + return 0; + } + + /* At least 32MB *must* be reserved for the ddf. So let's just + * start 32MB from the end, and put the primary header there. + * Don't do secondary for now. + * We don't know exactly where that will be yet as it could be + * different on each device. So just set up the lengths. + */ + + ddf->anchor.magic = DDF_HEADER_MAGIC; + make_header_guid(ddf->anchor.guid); + + memcpy(ddf->anchor.revision, DDF_REVISION_2, 8); + ddf->anchor.seq = cpu_to_be32(1); + ddf->anchor.timestamp = cpu_to_be32(time(0) - DECADE); + ddf->anchor.openflag = 0xFF; + ddf->anchor.foreignflag = 0; + ddf->anchor.enforcegroups = 0; /* Is this best?? */ + ddf->anchor.pad0 = 0xff; + memset(ddf->anchor.pad1, 0xff, 12); + memset(ddf->anchor.header_ext, 0xff, 32); + ddf->anchor.primary_lba = cpu_to_be64(~(__u64)0); + ddf->anchor.secondary_lba = cpu_to_be64(~(__u64)0); + ddf->anchor.type = DDF_HEADER_ANCHOR; + memset(ddf->anchor.pad2, 0xff, 3); + ddf->anchor.workspace_len = cpu_to_be32(32768); /* Must be reserved */ + /* Put this at bottom of 32M reserved.. */ + ddf->anchor.workspace_lba = cpu_to_be64(~(__u64)0); + max_phys_disks = 1023; /* Should be enough, 4095 is also allowed */ + ddf->anchor.max_pd_entries = cpu_to_be16(max_phys_disks); + max_virt_disks = 255; /* 15, 63, 255, 1024, 4095 are all allowed */ + ddf->anchor.max_vd_entries = cpu_to_be16(max_virt_disks); + ddf->max_part = 64; + ddf->anchor.max_partitions = cpu_to_be16(ddf->max_part); + ddf->mppe = 256; /* 16, 64, 256, 1024, 4096 are all allowed */ + ddf->conf_rec_len = 1 + ROUND_UP(ddf->mppe * (4+8), 512)/512; + ddf->anchor.config_record_len = cpu_to_be16(ddf->conf_rec_len); + ddf->anchor.max_primary_element_entries = cpu_to_be16(ddf->mppe); + memset(ddf->anchor.pad3, 0xff, 54); + /* Controller section is one sector long immediately + * after the ddf header */ + sector = 1; + ddf->anchor.controller_section_offset = cpu_to_be32(sector); + ddf->anchor.controller_section_length = cpu_to_be32(1); + sector += 1; + + /* phys is 8 sectors after that */ + pdsize = ROUND_UP(sizeof(struct phys_disk) + + sizeof(struct phys_disk_entry)*max_phys_disks, + 512); + switch(pdsize/512) { + case 2: case 8: case 32: case 128: case 512: break; + default: abort(); + } + ddf->anchor.phys_section_offset = cpu_to_be32(sector); + ddf->anchor.phys_section_length = + cpu_to_be32(pdsize/512); /* max_primary_element_entries/8 */ + sector += pdsize/512; + + /* virt is another 32 sectors */ + vdsize = ROUND_UP(sizeof(struct virtual_disk) + + sizeof(struct virtual_entry) * max_virt_disks, + 512); + switch(vdsize/512) { + case 2: case 8: case 32: case 128: case 512: break; + default: abort(); + } + ddf->anchor.virt_section_offset = cpu_to_be32(sector); + ddf->anchor.virt_section_length = + cpu_to_be32(vdsize/512); /* max_vd_entries/8 */ + sector += vdsize/512; + + clen = ddf->conf_rec_len * (ddf->max_part+1); + ddf->anchor.config_section_offset = cpu_to_be32(sector); + ddf->anchor.config_section_length = cpu_to_be32(clen); + sector += clen; + + ddf->anchor.data_section_offset = cpu_to_be32(sector); + ddf->anchor.data_section_length = cpu_to_be32(1); + sector += 1; + + ddf->anchor.bbm_section_length = cpu_to_be32(0); + ddf->anchor.bbm_section_offset = cpu_to_be32(0xFFFFFFFF); + ddf->anchor.diag_space_length = cpu_to_be32(0); + ddf->anchor.diag_space_offset = cpu_to_be32(0xFFFFFFFF); + ddf->anchor.vendor_length = cpu_to_be32(0); + ddf->anchor.vendor_offset = cpu_to_be32(0xFFFFFFFF); + + memset(ddf->anchor.pad4, 0xff, 256); + + memcpy(&ddf->primary, &ddf->anchor, 512); + memcpy(&ddf->secondary, &ddf->anchor, 512); + + ddf->primary.openflag = 1; /* I guess.. */ + ddf->primary.type = DDF_HEADER_PRIMARY; + + ddf->secondary.openflag = 1; /* I guess.. */ + ddf->secondary.type = DDF_HEADER_SECONDARY; + + ddf->active = &ddf->primary; + + ddf->controller.magic = DDF_CONTROLLER_MAGIC; + + /* 24 more bytes of fiction required. + * first 8 are a 'vendor-id' - "Linux-MD" + * Remaining 16 are serial number.... maybe a hostname would do? + */ + memcpy(ddf->controller.guid, T10, sizeof(T10)); + gethostname(hostname, sizeof(hostname)); + hostname[sizeof(hostname) - 1] = 0; + hostlen = strlen(hostname); + memcpy(ddf->controller.guid + 24 - hostlen, hostname, hostlen); + for (i = strlen(T10) ; i+hostlen < 24; i++) + ddf->controller.guid[i] = ' '; + + ddf->controller.type.vendor_id = cpu_to_be16(0xDEAD); + ddf->controller.type.device_id = cpu_to_be16(0xBEEF); + ddf->controller.type.sub_vendor_id = cpu_to_be16(0); + ddf->controller.type.sub_device_id = cpu_to_be16(0); + memcpy(ddf->controller.product_id, "What Is My PID??", 16); + memset(ddf->controller.pad, 0xff, 8); + memset(ddf->controller.vendor_data, 0xff, 448); + if (homehost && strlen(homehost) < 440) + strcpy((char*)ddf->controller.vendor_data, homehost); + + if (posix_memalign((void**)&pd, 512, pdsize) != 0) { + pr_err("could not allocate pd\n"); + return 0; + } + ddf->phys = pd; + ddf->pdsize = pdsize; + + memset(pd, 0xff, pdsize); + memset(pd, 0, sizeof(*pd)); + pd->magic = DDF_PHYS_RECORDS_MAGIC; + pd->used_pdes = cpu_to_be16(0); + pd->max_pdes = cpu_to_be16(max_phys_disks); + memset(pd->pad, 0xff, 52); + for (i = 0; i < max_phys_disks; i++) + memset(pd->entries[i].guid, 0xff, DDF_GUID_LEN); + + if (posix_memalign((void**)&vd, 512, vdsize) != 0) { + pr_err("could not allocate vd\n"); + return 0; + } + ddf->virt = vd; + ddf->vdsize = vdsize; + memset(vd, 0, vdsize); + vd->magic = DDF_VIRT_RECORDS_MAGIC; + vd->populated_vdes = cpu_to_be16(0); + vd->max_vdes = cpu_to_be16(max_virt_disks); + memset(vd->pad, 0xff, 52); + + for (i=0; i<max_virt_disks; i++) + memset(&vd->entries[i], 0xff, sizeof(struct virtual_entry)); + + st->sb = ddf; + ddf_set_updates_pending(ddf, NULL); + return 1; +} + +static int chunk_to_shift(int chunksize) +{ + return ffs(chunksize/512)-1; +} + +#ifndef MDASSEMBLE +struct extent { + unsigned long long start, size; +}; +static int cmp_extent(const void *av, const void *bv) +{ + const struct extent *a = av; + const struct extent *b = bv; + if (a->start < b->start) + return -1; + if (a->start > b->start) + return 1; + return 0; +} + +static struct extent *get_extents(struct ddf_super *ddf, struct dl *dl) +{ + /* Find a list of used extents on the given physical device + * (dnum) of the given ddf. + * Return a malloced array of 'struct extent' + */ + struct extent *rv; + int n = 0; + unsigned int i; + __u16 state; + + if (dl->pdnum < 0) + return NULL; + state = be16_to_cpu(ddf->phys->entries[dl->pdnum].state); + + if ((state & (DDF_Online|DDF_Failed|DDF_Missing)) != DDF_Online) + return NULL; + + rv = xmalloc(sizeof(struct extent) * (ddf->max_part + 2)); + + for (i = 0; i < ddf->max_part; i++) { + const struct vd_config *bvd; + unsigned int ibvd; + struct vcl *v = dl->vlist[i]; + if (v == NULL || + get_pd_index_from_refnum(v, dl->disk.refnum, ddf->mppe, + &bvd, &ibvd) == DDF_NOTFOUND) + continue; + rv[n].start = be64_to_cpu(LBA_OFFSET(ddf, bvd)[ibvd]); + rv[n].size = be64_to_cpu(bvd->blocks); + n++; + } + qsort(rv, n, sizeof(*rv), cmp_extent); + + rv[n].start = be64_to_cpu(ddf->phys->entries[dl->pdnum].config_size); + rv[n].size = 0; + return rv; +} + +static unsigned long long find_space( + struct ddf_super *ddf, struct dl *dl, + unsigned long long data_offset, + unsigned long long *size) +{ + /* Find if the requested amount of space is available. + * If it is, return start. + * If not, set *size to largest space. + * If data_offset != INVALID_SECTORS, then the space must start + * at this location. + */ + struct extent *e = get_extents(ddf, dl); + int i = 0; + unsigned long long pos = 0; + unsigned long long max_size = 0; + + if (!e) { + *size = 0; + return INVALID_SECTORS; + } + do { + unsigned long long esize = e[i].start - pos; + if (data_offset != INVALID_SECTORS && + pos <= data_offset && + e[i].start > data_offset) { + pos = data_offset; + esize = e[i].start - pos; + } + if (data_offset != INVALID_SECTORS && + pos != data_offset) { + i++; + continue; + } + if (esize >= *size) { + /* Found! */ + free(e); + return pos; + } + if (esize > max_size) + max_size = esize; + pos = e[i].start + e[i].size; + i++; + } while (e[i-1].size); + *size = max_size; + free(e); + return INVALID_SECTORS; +} +#endif + +static int init_super_ddf_bvd(struct supertype *st, + mdu_array_info_t *info, + unsigned long long size, + char *name, char *homehost, + int *uuid, unsigned long long data_offset) +{ + /* We are creating a BVD inside a pre-existing container. + * so st->sb is already set. + * We need to create a new vd_config and a new virtual_entry + */ + struct ddf_super *ddf = st->sb; + unsigned int venum, i; + struct virtual_entry *ve; + struct vcl *vcl; + struct vd_config *vc; + + if (find_vde_by_name(ddf, name) != DDF_NOTFOUND) { + pr_err("This ddf already has an array called %s\n", name); + return 0; + } + venum = find_unused_vde(ddf); + if (venum == DDF_NOTFOUND) { + pr_err("Cannot find spare slot for virtual disk\n"); + return 0; + } + ve = &ddf->virt->entries[venum]; + + /* A Virtual Disk GUID contains the T10 Vendor ID, controller type, + * timestamp, random number + */ + make_header_guid(ve->guid); + ve->unit = cpu_to_be16(info->md_minor); + ve->pad0 = 0xFFFF; + ve->guid_crc._v16 = crc32(0, (unsigned char *)ddf->anchor.guid, + DDF_GUID_LEN); + ve->type = cpu_to_be16(0); + ve->state = DDF_state_degraded; /* Will be modified as devices are added */ + if (info->state & 1) /* clean */ + ve->init_state = DDF_init_full; + else + ve->init_state = DDF_init_not; + + memset(ve->pad1, 0xff, 14); + memset(ve->name, ' ', 16); + if (name) + strncpy(ve->name, name, 16); + ddf->virt->populated_vdes = + cpu_to_be16(be16_to_cpu(ddf->virt->populated_vdes)+1); + + /* Now create a new vd_config */ + if (posix_memalign((void**)&vcl, 512, + (offsetof(struct vcl, conf) + ddf->conf_rec_len * 512)) != 0) { + pr_err("could not allocate vd_config\n"); + return 0; + } + vcl->vcnum = venum; + vcl->block_sizes = NULL; /* FIXME not for CONCAT */ + vc = &vcl->conf; + + vc->magic = DDF_VD_CONF_MAGIC; + memcpy(vc->guid, ve->guid, DDF_GUID_LEN); + vc->timestamp = cpu_to_be32(time(0)-DECADE); + vc->seqnum = cpu_to_be32(1); + memset(vc->pad0, 0xff, 24); + vc->chunk_shift = chunk_to_shift(info->chunk_size); + if (layout_md2ddf(info, vc) == -1 || + be16_to_cpu(vc->prim_elmnt_count) > ddf->mppe) { + pr_err("unsupported RAID level/layout %d/%d with %d disks\n", + info->level, info->layout, info->raid_disks); + free(vcl); + return 0; + } + vc->sec_elmnt_seq = 0; + if (alloc_other_bvds(ddf, vcl) != 0) { + pr_err("could not allocate other bvds\n"); + free(vcl); + return 0; + } + vc->blocks = cpu_to_be64(info->size * 2); + vc->array_blocks = cpu_to_be64( + calc_array_size(info->level, info->raid_disks, info->layout, + info->chunk_size, info->size*2)); + memset(vc->pad1, 0xff, 8); + vc->spare_refs[0] = cpu_to_be32(0xffffffff); + vc->spare_refs[1] = cpu_to_be32(0xffffffff); + vc->spare_refs[2] = cpu_to_be32(0xffffffff); + vc->spare_refs[3] = cpu_to_be32(0xffffffff); + vc->spare_refs[4] = cpu_to_be32(0xffffffff); + vc->spare_refs[5] = cpu_to_be32(0xffffffff); + vc->spare_refs[6] = cpu_to_be32(0xffffffff); + vc->spare_refs[7] = cpu_to_be32(0xffffffff); + memset(vc->cache_pol, 0, 8); + vc->bg_rate = 0x80; + memset(vc->pad2, 0xff, 3); + memset(vc->pad3, 0xff, 52); + memset(vc->pad4, 0xff, 192); + memset(vc->v0, 0xff, 32); + memset(vc->v1, 0xff, 32); + memset(vc->v2, 0xff, 16); + memset(vc->v3, 0xff, 16); + memset(vc->vendor, 0xff, 32); + + memset(vc->phys_refnum, 0xff, 4*ddf->mppe); + memset(vc->phys_refnum+ddf->mppe, 0x00, 8*ddf->mppe); + + for (i = 1; i < vc->sec_elmnt_count; i++) { + memcpy(vcl->other_bvds[i-1], vc, ddf->conf_rec_len * 512); + vcl->other_bvds[i-1]->sec_elmnt_seq = i; + } + + vcl->next = ddf->conflist; + ddf->conflist = vcl; + ddf->currentconf = vcl; + ddf_set_updates_pending(ddf, NULL); + return 1; +} + +#ifndef MDASSEMBLE +static void add_to_super_ddf_bvd(struct supertype *st, + mdu_disk_info_t *dk, int fd, char *devname, + unsigned long long data_offset) +{ + /* fd and devname identify a device within the ddf container (st). + * dk identifies a location in the new BVD. + * We need to find suitable free space in that device and update + * the phys_refnum and lba_offset for the newly created vd_config. + * We might also want to update the type in the phys_disk + * section. + * + * Alternately: fd == -1 and we have already chosen which device to + * use and recorded in dlist->raid_disk; + */ + struct dl *dl; + struct ddf_super *ddf = st->sb; + struct vd_config *vc; + unsigned int i; + unsigned long long blocks, pos; + unsigned int raid_disk = dk->raid_disk; + + if (fd == -1) { + for (dl = ddf->dlist; dl ; dl = dl->next) + if (dl->raiddisk == dk->raid_disk) + break; + } else { + for (dl = ddf->dlist; dl ; dl = dl->next) + if (dl->major == dk->major && + dl->minor == dk->minor) + break; + } + if (!dl || dl->pdnum < 0 || ! (dk->state & (1<<MD_DISK_SYNC))) + return; + + vc = &ddf->currentconf->conf; + if (vc->sec_elmnt_count > 1) { + unsigned int n = be16_to_cpu(vc->prim_elmnt_count); + if (raid_disk >= n) + vc = ddf->currentconf->other_bvds[raid_disk / n - 1]; + raid_disk %= n; + } + + blocks = be64_to_cpu(vc->blocks); + if (ddf->currentconf->block_sizes) + blocks = ddf->currentconf->block_sizes[dk->raid_disk]; + + pos = find_space(ddf, dl, data_offset, &blocks); + if (pos == INVALID_SECTORS) + return; + + ddf->currentdev = dk->raid_disk; + vc->phys_refnum[raid_disk] = dl->disk.refnum; + LBA_OFFSET(ddf, vc)[raid_disk] = cpu_to_be64(pos); + + for (i = 0; i < ddf->max_part ; i++) + if (dl->vlist[i] == NULL) + break; + if (i == ddf->max_part) + return; + dl->vlist[i] = ddf->currentconf; + + if (fd >= 0) + dl->fd = fd; + if (devname) + dl->devname = devname; + + /* Check if we can mark array as optimal yet */ + i = ddf->currentconf->vcnum; + ddf->virt->entries[i].state = + (ddf->virt->entries[i].state & ~DDF_state_mask) + | get_svd_state(ddf, ddf->currentconf); + be16_clear(ddf->phys->entries[dl->pdnum].type, + cpu_to_be16(DDF_Global_Spare)); + be16_set(ddf->phys->entries[dl->pdnum].type, + cpu_to_be16(DDF_Active_in_VD)); + dprintf("added disk %d/%08x to VD %d/%s as disk %d\n", + dl->pdnum, be32_to_cpu(dl->disk.refnum), + ddf->currentconf->vcnum, guid_str(vc->guid), + dk->raid_disk); + ddf_set_updates_pending(ddf, vc); +} + +static unsigned int find_unused_pde(const struct ddf_super *ddf) +{ + unsigned int i; + for (i = 0; i < be16_to_cpu(ddf->phys->max_pdes); i++) { + if (all_ff(ddf->phys->entries[i].guid)) + return i; + } + return DDF_NOTFOUND; +} + +static void _set_config_size(struct phys_disk_entry *pde, const struct dl *dl) +{ + __u64 cfs, t; + cfs = min(dl->size - 32*1024*2ULL, be64_to_cpu(dl->primary_lba)); + t = be64_to_cpu(dl->secondary_lba); + if (t != ~(__u64)0) + cfs = min(cfs, t); + /* + * Some vendor DDF structures interpret workspace_lba + * very differently than we do: Make a sanity check on the value. + */ + t = be64_to_cpu(dl->workspace_lba); + if (t < cfs) { + __u64 wsp = cfs - t; + if (wsp > 1024*1024*2ULL && wsp > dl->size / 16) { + pr_err("%x:%x: workspace size 0x%llx too big, ignoring\n", + dl->major, dl->minor, (unsigned long long)wsp); + } else + cfs = t; + } + pde->config_size = cpu_to_be64(cfs); + dprintf("%x:%x config_size %llx, DDF structure is %llx blocks\n", + dl->major, dl->minor, + (unsigned long long)cfs, (unsigned long long)(dl->size-cfs)); +} + +/* Add a device to a container, either while creating it or while + * expanding a pre-existing container + */ +static int add_to_super_ddf(struct supertype *st, + mdu_disk_info_t *dk, int fd, char *devname, + unsigned long long data_offset) +{ + struct ddf_super *ddf = st->sb; + struct dl *dd; + time_t now; + struct tm *tm; + unsigned long long size; + struct phys_disk_entry *pde; + unsigned int n, i; + struct stat stb; + __u32 *tptr; + + if (ddf->currentconf) { + add_to_super_ddf_bvd(st, dk, fd, devname, data_offset); + return 0; + } + + /* This is device numbered dk->number. We need to create + * a phys_disk entry and a more detailed disk_data entry. + */ + fstat(fd, &stb); + n = find_unused_pde(ddf); + if (n == DDF_NOTFOUND) { + pr_err("No free slot in array, cannot add disk\n"); + return 1; + } + pde = &ddf->phys->entries[n]; + get_dev_size(fd, NULL, &size); + if (size <= 32*1024*1024) { + pr_err("device size must be at least 32MB\n"); + return 1; + } + size >>= 9; + + if (posix_memalign((void**)&dd, 512, + sizeof(*dd) + sizeof(dd->vlist[0]) * ddf->max_part) != 0) { + pr_err("could allocate buffer for new disk, aborting\n"); + return 1; + } + dd->major = major(stb.st_rdev); + dd->minor = minor(stb.st_rdev); + dd->devname = devname; + dd->fd = fd; + dd->spare = NULL; + + dd->disk.magic = DDF_PHYS_DATA_MAGIC; + now = time(0); + tm = localtime(&now); + sprintf(dd->disk.guid, "%8s%04d%02d%02d", + T10, tm->tm_year+1900, tm->tm_mon+1, tm->tm_mday); + tptr = (__u32 *)(dd->disk.guid + 16); + *tptr++ = random32(); + *tptr = random32(); + + do { + /* Cannot be bothered finding a CRC of some irrelevant details*/ + dd->disk.refnum._v32 = random32(); + for (i = be16_to_cpu(ddf->active->max_pd_entries); + i > 0; i--) + if (be32_eq(ddf->phys->entries[i-1].refnum, + dd->disk.refnum)) + break; + } while (i > 0); + + dd->disk.forced_ref = 1; + dd->disk.forced_guid = 1; + memset(dd->disk.vendor, ' ', 32); + memcpy(dd->disk.vendor, "Linux", 5); + memset(dd->disk.pad, 0xff, 442); + for (i = 0; i < ddf->max_part ; i++) + dd->vlist[i] = NULL; + + dd->pdnum = n; + + if (st->update_tail) { + int len = (sizeof(struct phys_disk) + + sizeof(struct phys_disk_entry)); + struct phys_disk *pd; + + pd = xmalloc(len); + pd->magic = DDF_PHYS_RECORDS_MAGIC; + pd->used_pdes = cpu_to_be16(n); + pde = &pd->entries[0]; + dd->mdupdate = pd; + } else + ddf->phys->used_pdes = cpu_to_be16( + 1 + be16_to_cpu(ddf->phys->used_pdes)); + + memcpy(pde->guid, dd->disk.guid, DDF_GUID_LEN); + pde->refnum = dd->disk.refnum; + pde->type = cpu_to_be16(DDF_Forced_PD_GUID | DDF_Global_Spare); + pde->state = cpu_to_be16(DDF_Online); + dd->size = size; + /* + * If there is already a device in dlist, try to reserve the same + * amount of workspace. Otherwise, use 32MB. + * We checked disk size above already. + */ +#define __calc_lba(new, old, lba, mb) do { \ + unsigned long long dif; \ + if ((old) != NULL) \ + dif = (old)->size - be64_to_cpu((old)->lba); \ + else \ + dif = (new)->size; \ + if ((new)->size > dif) \ + (new)->lba = cpu_to_be64((new)->size - dif); \ + else \ + (new)->lba = cpu_to_be64((new)->size - (mb*1024*2)); \ + } while (0) + __calc_lba(dd, ddf->dlist, workspace_lba, 32); + __calc_lba(dd, ddf->dlist, primary_lba, 16); + if (ddf->dlist == NULL || + be64_to_cpu(ddf->dlist->secondary_lba) != ~(__u64)0) + __calc_lba(dd, ddf->dlist, secondary_lba, 32); + _set_config_size(pde, dd); + + sprintf(pde->path, "%17.17s","Information: nil") ; + memset(pde->pad, 0xff, 6); + + if (st->update_tail) { + dd->next = ddf->add_list; + ddf->add_list = dd; + } else { + dd->next = ddf->dlist; + ddf->dlist = dd; + ddf_set_updates_pending(ddf, NULL); + } + + return 0; +} + +static int remove_from_super_ddf(struct supertype *st, mdu_disk_info_t *dk) +{ + struct ddf_super *ddf = st->sb; + struct dl *dl; + + /* mdmon has noticed that this disk (dk->major/dk->minor) has + * disappeared from the container. + * We need to arrange that it disappears from the metadata and + * internal data structures too. + * Most of the work is done by ddf_process_update which edits + * the metadata and closes the file handle and attaches the memory + * where free_updates will free it. + */ + for (dl = ddf->dlist; dl ; dl = dl->next) + if (dl->major == dk->major && + dl->minor == dk->minor) + break; + if (!dl || dl->pdnum < 0) + return -1; + + if (st->update_tail) { + int len = (sizeof(struct phys_disk) + + sizeof(struct phys_disk_entry)); + struct phys_disk *pd; + + pd = xmalloc(len); + pd->magic = DDF_PHYS_RECORDS_MAGIC; + pd->used_pdes = cpu_to_be16(dl->pdnum); + pd->entries[0].state = cpu_to_be16(DDF_Missing); + append_metadata_update(st, pd, len); + } + return 0; +} +#endif + +/* + * This is the write_init_super method for a ddf container. It is + * called when creating a container or adding another device to a + * container. + */ + +static int __write_ddf_structure(struct dl *d, struct ddf_super *ddf, __u8 type) +{ + unsigned long long sector; + struct ddf_header *header; + int fd, i, n_config, conf_size, buf_size; + int ret = 0; + char *conf; + + fd = d->fd; + + switch (type) { + case DDF_HEADER_PRIMARY: + header = &ddf->primary; + sector = be64_to_cpu(header->primary_lba); + break; + case DDF_HEADER_SECONDARY: + header = &ddf->secondary; + sector = be64_to_cpu(header->secondary_lba); + break; + default: + return 0; + } + if (sector == ~(__u64)0) + return 0; + + header->type = type; + header->openflag = 1; + header->crc = calc_crc(header, 512); + + lseek64(fd, sector<<9, 0); + if (write(fd, header, 512) < 0) + goto out; + + ddf->controller.crc = calc_crc(&ddf->controller, 512); + if (write(fd, &ddf->controller, 512) < 0) + goto out; + + ddf->phys->crc = calc_crc(ddf->phys, ddf->pdsize); + if (write(fd, ddf->phys, ddf->pdsize) < 0) + goto out; + ddf->virt->crc = calc_crc(ddf->virt, ddf->vdsize); + if (write(fd, ddf->virt, ddf->vdsize) < 0) + goto out; + + /* Now write lots of config records. */ + n_config = ddf->max_part; + conf_size = ddf->conf_rec_len * 512; + conf = ddf->conf; + buf_size = conf_size * (n_config + 1); + if (!conf) { + if (posix_memalign((void**)&conf, 512, buf_size) != 0) + goto out; + ddf->conf = conf; + } + for (i = 0 ; i <= n_config ; i++) { + struct vcl *c; + struct vd_config *vdc = NULL; + if (i == n_config) { + c = (struct vcl *)d->spare; + if (c) + vdc = &c->conf; + } else { + unsigned int dummy; + c = d->vlist[i]; + if (c) + get_pd_index_from_refnum( + c, d->disk.refnum, + ddf->mppe, + (const struct vd_config **)&vdc, + &dummy); + } + if (vdc) { + dprintf("writing conf record %i on disk %08x for %s/%u\n", + i, be32_to_cpu(d->disk.refnum), + guid_str(vdc->guid), + vdc->sec_elmnt_seq); + vdc->crc = calc_crc(vdc, conf_size); + memcpy(conf + i*conf_size, vdc, conf_size); + } else + memset(conf + i*conf_size, 0xff, conf_size); + } + if (write(fd, conf, buf_size) != buf_size) + goto out; + + d->disk.crc = calc_crc(&d->disk, 512); + if (write(fd, &d->disk, 512) < 0) + goto out; + + ret = 1; +out: + header->openflag = 0; + header->crc = calc_crc(header, 512); + + lseek64(fd, sector<<9, 0); + if (write(fd, header, 512) < 0) + ret = 0; + + return ret; +} + +static int _write_super_to_disk(struct ddf_super *ddf, struct dl *d) +{ + unsigned long long size; + int fd = d->fd; + if (fd < 0) + return 0; + + /* We need to fill in the primary, (secondary) and workspace + * lba's in the headers, set their checksums, + * Also checksum phys, virt.... + * + * Then write everything out, finally the anchor is written. + */ + get_dev_size(fd, NULL, &size); + size /= 512; + memcpy(&ddf->anchor, ddf->active, 512); + if (be64_to_cpu(d->workspace_lba) != 0ULL) + ddf->anchor.workspace_lba = d->workspace_lba; + else + ddf->anchor.workspace_lba = + cpu_to_be64(size - 32*1024*2); + if (be64_to_cpu(d->primary_lba) != 0ULL) + ddf->anchor.primary_lba = d->primary_lba; + else + ddf->anchor.primary_lba = + cpu_to_be64(size - 16*1024*2); + if (be64_to_cpu(d->secondary_lba) != 0ULL) + ddf->anchor.secondary_lba = d->secondary_lba; + else + ddf->anchor.secondary_lba = + cpu_to_be64(size - 32*1024*2); + ddf->anchor.timestamp = cpu_to_be32(time(0) - DECADE); + memcpy(&ddf->primary, &ddf->anchor, 512); + memcpy(&ddf->secondary, &ddf->anchor, 512); + + ddf->anchor.type = DDF_HEADER_ANCHOR; + ddf->anchor.openflag = 0xFF; /* 'open' means nothing */ + ddf->anchor.seq = cpu_to_be32(0xFFFFFFFF); /* no sequencing in anchor */ + ddf->anchor.crc = calc_crc(&ddf->anchor, 512); + + if (!__write_ddf_structure(d, ddf, DDF_HEADER_PRIMARY)) + return 0; + + if (!__write_ddf_structure(d, ddf, DDF_HEADER_SECONDARY)) + return 0; + + lseek64(fd, (size-1)*512, SEEK_SET); + if (write(fd, &ddf->anchor, 512) < 0) + return 0; + + return 1; +} + +#ifndef MDASSEMBLE +static int __write_init_super_ddf(struct supertype *st) +{ + struct ddf_super *ddf = st->sb; + struct dl *d; + int attempts = 0; + int successes = 0; + + pr_state(ddf, __func__); + + /* try to write updated metadata, + * if we catch a failure move on to the next disk + */ + for (d = ddf->dlist; d; d=d->next) { + attempts++; + successes += _write_super_to_disk(ddf, d); + } + + return attempts != successes; +} + +static int write_init_super_ddf(struct supertype *st) +{ + struct ddf_super *ddf = st->sb; + struct vcl *currentconf = ddf->currentconf; + + /* We are done with currentconf - reset it so st refers to the container */ + ddf->currentconf = NULL; + + if (st->update_tail) { + /* queue the virtual_disk and vd_config as metadata updates */ + struct virtual_disk *vd; + struct vd_config *vc; + int len, tlen; + unsigned int i; + + if (!currentconf) { + /* Must be adding a physical disk to the container */ + int len = (sizeof(struct phys_disk) + + sizeof(struct phys_disk_entry)); + + /* adding a disk to the container. */ + if (!ddf->add_list) + return 0; + + append_metadata_update(st, ddf->add_list->mdupdate, len); + ddf->add_list->mdupdate = NULL; + return 0; + } + + /* Newly created VD */ + + /* First the virtual disk. We have a slightly fake header */ + len = sizeof(struct virtual_disk) + sizeof(struct virtual_entry); + vd = xmalloc(len); + *vd = *ddf->virt; + vd->entries[0] = ddf->virt->entries[currentconf->vcnum]; + vd->populated_vdes = cpu_to_be16(currentconf->vcnum); + append_metadata_update(st, vd, len); + + /* Then the vd_config */ + len = ddf->conf_rec_len * 512; + tlen = len * currentconf->conf.sec_elmnt_count; + vc = xmalloc(tlen); + memcpy(vc, ¤tconf->conf, len); + for (i = 1; i < currentconf->conf.sec_elmnt_count; i++) + memcpy((char *)vc + i*len, currentconf->other_bvds[i-1], + len); + append_metadata_update(st, vc, tlen); + + return 0; + } else { + struct dl *d; + if (!currentconf) + for (d = ddf->dlist; d; d=d->next) + while (Kill(d->devname, NULL, 0, -1, 1) == 0); + /* Note: we don't close the fd's now, but a subsequent + * ->free_super() will + */ + return __write_init_super_ddf(st); + } +} + +#endif + +static __u64 avail_size_ddf(struct supertype *st, __u64 devsize, + unsigned long long data_offset) +{ + /* We must reserve the last 32Meg */ + if (devsize <= 32*1024*2) + return 0; + return devsize - 32*1024*2; +} + +#ifndef MDASSEMBLE + +static int reserve_space(struct supertype *st, int raiddisks, + unsigned long long size, int chunk, + unsigned long long data_offset, + unsigned long long *freesize) +{ + /* Find 'raiddisks' spare extents at least 'size' big (but + * only caring about multiples of 'chunk') and remember + * them. If size==0, find the largest size possible. + * Report available size in *freesize + * If space cannot be found, fail. + */ + struct dl *dl; + struct ddf_super *ddf = st->sb; + int cnt = 0; + + for (dl = ddf->dlist; dl ; dl=dl->next) { + dl->raiddisk = -1; + dl->esize = 0; + } + /* Now find largest extent on each device */ + for (dl = ddf->dlist ; dl ; dl=dl->next) { + unsigned long long minsize = ULLONG_MAX; + + find_space(ddf, dl, data_offset, &minsize); + if (minsize >= size && minsize >= (unsigned)chunk) { + cnt++; + dl->esize = minsize; + } + } + if (cnt < raiddisks) { + pr_err("not enough devices with space to create array.\n"); + return 0; /* No enough free spaces large enough */ + } + if (size == 0) { + /* choose the largest size of which there are at least 'raiddisk' */ + for (dl = ddf->dlist ; dl ; dl=dl->next) { + struct dl *dl2; + if (dl->esize <= size) + continue; + /* This is bigger than 'size', see if there are enough */ + cnt = 0; + for (dl2 = ddf->dlist; dl2 ; dl2=dl2->next) + if (dl2->esize >= dl->esize) + cnt++; + if (cnt >= raiddisks) + size = dl->esize; + } + if (chunk) { + size = size / chunk; + size *= chunk; + } + *freesize = size; + if (size < 32) { + pr_err("not enough spare devices to create array.\n"); + return 0; + } + } + /* We have a 'size' of which there are enough spaces. + * We simply do a first-fit */ + cnt = 0; + for (dl = ddf->dlist ; dl && cnt < raiddisks ; dl=dl->next) { + if (dl->esize < size) + continue; + + dl->raiddisk = cnt; + cnt++; + } + return 1; +} + +static int validate_geometry_ddf(struct supertype *st, + int level, int layout, int raiddisks, + int *chunk, unsigned long long size, + unsigned long long data_offset, + char *dev, unsigned long long *freesize, + int verbose) +{ + int fd; + struct mdinfo *sra; + int cfd; + + /* ddf potentially supports lots of things, but it depends on + * what devices are offered (and maybe kernel version?) + * If given unused devices, we will make a container. + * If given devices in a container, we will make a BVD. + * If given BVDs, we make an SVD, changing all the GUIDs in the process. + */ + + if (*chunk == UnSet) + *chunk = DEFAULT_CHUNK; + + if (level == LEVEL_NONE) + level = LEVEL_CONTAINER; + if (level == LEVEL_CONTAINER) { + /* Must be a fresh device to add to a container */ + return validate_geometry_ddf_container(st, level, layout, + raiddisks, *chunk, + size, data_offset, dev, + freesize, + verbose); + } + + if (!dev) { + mdu_array_info_t array = { + .level = level, + .layout = layout, + .raid_disks = raiddisks + }; + struct vd_config conf; + if (layout_md2ddf(&array, &conf) == -1) { + if (verbose) + pr_err("DDF does not support level %d /layout %d arrays with %d disks\n", + level, layout, raiddisks); + return 0; + } + /* Should check layout? etc */ + + if (st->sb && freesize) { + /* --create was given a container to create in. + * So we need to check that there are enough + * free spaces and return the amount of space. + * We may as well remember which drives were + * chosen so that add_to_super/getinfo_super + * can return them. + */ + return reserve_space(st, raiddisks, size, *chunk, + data_offset, freesize); + } + return 1; + } + + if (st->sb) { + /* A container has already been opened, so we are + * creating in there. Maybe a BVD, maybe an SVD. + * Should make a distinction one day. + */ + return validate_geometry_ddf_bvd(st, level, layout, raiddisks, + chunk, size, data_offset, dev, + freesize, + verbose); + } + /* This is the first device for the array. + * If it is a container, we read it in and do automagic allocations, + * no other devices should be given. + * Otherwise it must be a member device of a container, and we + * do manual allocation. + * Later we should check for a BVD and make an SVD. + */ + fd = open(dev, O_RDONLY|O_EXCL, 0); + if (fd >= 0) { + close(fd); + /* Just a bare device, no good to us */ + if (verbose) + pr_err("ddf: Cannot create this array on device %s - a container is required.\n", + dev); + return 0; + } + if (errno != EBUSY || (fd = open(dev, O_RDONLY, 0)) < 0) { + if (verbose) + pr_err("ddf: Cannot open %s: %s\n", + dev, strerror(errno)); + return 0; + } + /* Well, it is in use by someone, maybe a 'ddf' container. */ + cfd = open_container(fd); + if (cfd < 0) { + close(fd); + if (verbose) + pr_err("ddf: Cannot use %s: %s\n", + dev, strerror(EBUSY)); + return 0; + } + sra = sysfs_read(cfd, NULL, GET_VERSION); + close(fd); + if (sra && sra->array.major_version == -1 && + strcmp(sra->text_version, "ddf") == 0) { + /* This is a member of a ddf container. Load the container + * and try to create a bvd + */ + struct ddf_super *ddf; + if (load_super_ddf_all(st, cfd, (void **)&ddf, NULL) == 0) { + st->sb = ddf; + strcpy(st->container_devnm, fd2devnm(cfd)); + close(cfd); + return validate_geometry_ddf_bvd(st, level, layout, + raiddisks, chunk, size, + data_offset, + dev, freesize, + verbose); + } + close(cfd); + } else /* device may belong to a different container */ + return 0; + + return 1; +} + +static int +validate_geometry_ddf_container(struct supertype *st, + int level, int layout, int raiddisks, + int chunk, unsigned long long size, + unsigned long long data_offset, + char *dev, unsigned long long *freesize, + int verbose) +{ + int fd; + unsigned long long ldsize; + + if (level != LEVEL_CONTAINER) + return 0; + if (!dev) + return 1; + + fd = open(dev, O_RDONLY|O_EXCL, 0); + if (fd < 0) { + if (verbose) + pr_err("ddf: Cannot open %s: %s\n", + dev, strerror(errno)); + return 0; + } + if (!get_dev_size(fd, dev, &ldsize)) { + close(fd); + return 0; + } + close(fd); + + *freesize = avail_size_ddf(st, ldsize >> 9, INVALID_SECTORS); + if (*freesize == 0) + return 0; + + return 1; +} + +static int validate_geometry_ddf_bvd(struct supertype *st, + int level, int layout, int raiddisks, + int *chunk, unsigned long long size, + unsigned long long data_offset, + char *dev, unsigned long long *freesize, + int verbose) +{ + struct stat stb; + struct ddf_super *ddf = st->sb; + struct dl *dl; + unsigned long long maxsize; + /* ddf/bvd supports lots of things, but not containers */ + if (level == LEVEL_CONTAINER) { + if (verbose) + pr_err("DDF cannot create a container within an container\n"); + return 0; + } + /* We must have the container info already read in. */ + if (!ddf) + return 0; + + if (!dev) { + /* General test: make sure there is space for + * 'raiddisks' device extents of size 'size'. + */ + unsigned long long minsize = size; + int dcnt = 0; + if (minsize == 0) + minsize = 8; + for (dl = ddf->dlist; dl ; dl = dl->next) { + if (find_space(ddf, dl, data_offset, &minsize) + != INVALID_SECTORS) + dcnt++; + } + if (dcnt < raiddisks) { + if (verbose) + pr_err("ddf: Not enough devices with space for this array (%d < %d)\n", + dcnt, raiddisks); + return 0; + } + return 1; + } + /* This device must be a member of the set */ + if (stat(dev, &stb) < 0) + return 0; + if ((S_IFMT & stb.st_mode) != S_IFBLK) + return 0; + for (dl = ddf->dlist ; dl ; dl = dl->next) { + if (dl->major == (int)major(stb.st_rdev) && + dl->minor == (int)minor(stb.st_rdev)) + break; + } + if (!dl) { + if (verbose) + pr_err("ddf: %s is not in the same DDF set\n", + dev); + return 0; + } + maxsize = ULLONG_MAX; + find_space(ddf, dl, data_offset, &maxsize); + *freesize = maxsize; + + return 1; +} + +static int load_super_ddf_all(struct supertype *st, int fd, + void **sbp, char *devname) +{ + struct mdinfo *sra; + struct ddf_super *super; + struct mdinfo *sd, *best = NULL; + int bestseq = 0; + int seq; + char nm[20]; + int dfd; + + sra = sysfs_read(fd, 0, GET_LEVEL|GET_VERSION|GET_DEVS|GET_STATE); + if (!sra) + return 1; + if (sra->array.major_version != -1 || + sra->array.minor_version != -2 || + strcmp(sra->text_version, "ddf") != 0) + return 1; + + if (posix_memalign((void**)&super, 512, sizeof(*super)) != 0) + return 1; + memset(super, 0, sizeof(*super)); + + /* first, try each device, and choose the best ddf */ + for (sd = sra->devs ; sd ; sd = sd->next) { + int rv; + sprintf(nm, "%d:%d", sd->disk.major, sd->disk.minor); + dfd = dev_open(nm, O_RDONLY); + if (dfd < 0) + return 2; + rv = load_ddf_headers(dfd, super, NULL); + close(dfd); + if (rv == 0) { + seq = be32_to_cpu(super->active->seq); + if (super->active->openflag) + seq--; + if (!best || seq > bestseq) { + bestseq = seq; + best = sd; + } + } + } + if (!best) + return 1; + /* OK, load this ddf */ + sprintf(nm, "%d:%d", best->disk.major, best->disk.minor); + dfd = dev_open(nm, O_RDONLY); + if (dfd < 0) + return 1; + load_ddf_headers(dfd, super, NULL); + load_ddf_global(dfd, super, NULL); + close(dfd); + /* Now we need the device-local bits */ + for (sd = sra->devs ; sd ; sd = sd->next) { + int rv; + + sprintf(nm, "%d:%d", sd->disk.major, sd->disk.minor); + dfd = dev_open(nm, O_RDWR); + if (dfd < 0) + return 2; + rv = load_ddf_headers(dfd, super, NULL); + if (rv == 0) + rv = load_ddf_local(dfd, super, NULL, 1); + if (rv) + return 1; + } + + *sbp = super; + if (st->ss == NULL) { + st->ss = &super_ddf; + st->minor_version = 0; + st->max_devs = 512; + } + strcpy(st->container_devnm, fd2devnm(fd)); + return 0; +} + +static int load_container_ddf(struct supertype *st, int fd, + char *devname) +{ + return load_super_ddf_all(st, fd, &st->sb, devname); +} + +#endif /* MDASSEMBLE */ + +static int check_secondary(const struct vcl *vc) +{ + const struct vd_config *conf = &vc->conf; + int i; + + /* The only DDF secondary RAID level md can support is + * RAID 10, if the stripe sizes and Basic volume sizes + * are all equal. + * Other configurations could in theory be supported by exposing + * the BVDs to user space and using device mapper for the secondary + * mapping. So far we don't support that. + */ + + __u64 sec_elements[4] = {0, 0, 0, 0}; +#define __set_sec_seen(n) (sec_elements[(n)>>6] |= (1<<((n)&63))) +#define __was_sec_seen(n) ((sec_elements[(n)>>6] & (1<<((n)&63))) != 0) + + if (vc->other_bvds == NULL) { + pr_err("No BVDs for secondary RAID found\n"); + return -1; + } + if (conf->prl != DDF_RAID1) { + pr_err("Secondary RAID level only supported for mirrored BVD\n"); + return -1; + } + if (conf->srl != DDF_2STRIPED && conf->srl != DDF_2SPANNED) { + pr_err("Secondary RAID level %d is unsupported\n", + conf->srl); + return -1; + } + __set_sec_seen(conf->sec_elmnt_seq); + for (i = 0; i < conf->sec_elmnt_count-1; i++) { + const struct vd_config *bvd = vc->other_bvds[i]; + if (bvd->sec_elmnt_seq == DDF_UNUSED_BVD) + continue; + if (bvd->srl != conf->srl) { + pr_err("Inconsistent secondary RAID level across BVDs\n"); + return -1; + } + if (bvd->prl != conf->prl) { + pr_err("Different RAID levels for BVDs are unsupported\n"); + return -1; + } + if (!be16_eq(bvd->prim_elmnt_count, conf->prim_elmnt_count)) { + pr_err("All BVDs must have the same number of primary elements\n"); + return -1; + } + if (bvd->chunk_shift != conf->chunk_shift) { + pr_err("Different strip sizes for BVDs are unsupported\n"); + return -1; + } + if (!be64_eq(bvd->array_blocks, conf->array_blocks)) { + pr_err("Different BVD sizes are unsupported\n"); + return -1; + } + __set_sec_seen(bvd->sec_elmnt_seq); + } + for (i = 0; i < conf->sec_elmnt_count; i++) { + if (!__was_sec_seen(i)) { + /* pr_err("BVD %d is missing\n", i); */ + return -1; + } + } + return 0; +} + +static unsigned int get_pd_index_from_refnum(const struct vcl *vc, + be32 refnum, unsigned int nmax, + const struct vd_config **bvd, + unsigned int *idx) +{ + unsigned int i, j, n, sec, cnt; + + cnt = be16_to_cpu(vc->conf.prim_elmnt_count); + sec = (vc->conf.sec_elmnt_count == 1 ? 0 : vc->conf.sec_elmnt_seq); + + for (i = 0, j = 0 ; i < nmax ; i++) { + /* j counts valid entries for this BVD */ + if (be32_eq(vc->conf.phys_refnum[i], refnum)) { + *bvd = &vc->conf; + *idx = i; + return sec * cnt + j; + } + if (be32_to_cpu(vc->conf.phys_refnum[i]) != 0xffffffff) + j++; + } + if (vc->other_bvds == NULL) + goto bad; + + for (n = 1; n < vc->conf.sec_elmnt_count; n++) { + struct vd_config *vd = vc->other_bvds[n-1]; + sec = vd->sec_elmnt_seq; + if (sec == DDF_UNUSED_BVD) + continue; + for (i = 0, j = 0 ; i < nmax ; i++) { + if (be32_eq(vd->phys_refnum[i], refnum)) { + *bvd = vd; + *idx = i; + return sec * cnt + j; + } + if (be32_to_cpu(vd->phys_refnum[i]) != 0xffffffff) + j++; + } + } +bad: + *bvd = NULL; + return DDF_NOTFOUND; +} + +static struct mdinfo *container_content_ddf(struct supertype *st, char *subarray) +{ + /* Given a container loaded by load_super_ddf_all, + * extract information about all the arrays into + * an mdinfo tree. + * + * For each vcl in conflist: create an mdinfo, fill it in, + * then look for matching devices (phys_refnum) in dlist + * and create appropriate device mdinfo. + */ + struct ddf_super *ddf = st->sb; + struct mdinfo *rest = NULL; + struct vcl *vc; + + for (vc = ddf->conflist ; vc ; vc=vc->next) { + unsigned int i; + struct mdinfo *this; + char *ep; + __u32 *cptr; + unsigned int pd; + + if (subarray && + (strtoul(subarray, &ep, 10) != vc->vcnum || + *ep != '\0')) + continue; + + if (vc->conf.sec_elmnt_count > 1) { + if (check_secondary(vc) != 0) + continue; + } + + this = xcalloc(1, sizeof(*this)); + this->next = rest; + rest = this; + + if (layout_ddf2md(&vc->conf, &this->array)) + continue; + this->array.md_minor = -1; + this->array.major_version = -1; + this->array.minor_version = -2; + this->safe_mode_delay = DDF_SAFE_MODE_DELAY; + cptr = (__u32 *)(vc->conf.guid + 16); + this->array.ctime = DECADE + __be32_to_cpu(*cptr); + this->array.utime = DECADE + + be32_to_cpu(vc->conf.timestamp); + this->array.chunk_size = 512 << vc->conf.chunk_shift; + + i = vc->vcnum; + if ((ddf->virt->entries[i].state & DDF_state_inconsistent) || + (ddf->virt->entries[i].init_state & DDF_initstate_mask) != + DDF_init_full) { + this->array.state = 0; + this->resync_start = 0; + } else { + this->array.state = 1; + this->resync_start = MaxSector; + } + _ddf_array_name(this->name, ddf, i); + memset(this->uuid, 0, sizeof(this->uuid)); + this->component_size = be64_to_cpu(vc->conf.blocks); + this->array.size = this->component_size / 2; + this->container_member = i; + + ddf->currentconf = vc; + uuid_from_super_ddf(st, this->uuid); + if (!subarray) + ddf->currentconf = NULL; + + sprintf(this->text_version, "/%s/%d", + st->container_devnm, this->container_member); + + for (pd = 0; pd < be16_to_cpu(ddf->phys->max_pdes); pd++) { + struct mdinfo *dev; + struct dl *d; + const struct vd_config *bvd; + unsigned int iphys; + int stt; + + if (be32_to_cpu(ddf->phys->entries[pd].refnum) + == 0xFFFFFFFF) + continue; + + stt = be16_to_cpu(ddf->phys->entries[pd].state); + if ((stt & (DDF_Online|DDF_Failed|DDF_Rebuilding)) + != DDF_Online) + continue; + + i = get_pd_index_from_refnum( + vc, ddf->phys->entries[pd].refnum, + ddf->mppe, &bvd, &iphys); + if (i == DDF_NOTFOUND) + continue; + + this->array.working_disks++; + + for (d = ddf->dlist; d ; d=d->next) + if (be32_eq(d->disk.refnum, + ddf->phys->entries[pd].refnum)) + break; + if (d == NULL) + /* Haven't found that one yet, maybe there are others */ + continue; + + dev = xcalloc(1, sizeof(*dev)); + dev->next = this->devs; + this->devs = dev; + + dev->disk.number = be32_to_cpu(d->disk.refnum); + dev->disk.major = d->major; + dev->disk.minor = d->minor; + dev->disk.raid_disk = i; + dev->disk.state = (1<<MD_DISK_SYNC)|(1<<MD_DISK_ACTIVE); + dev->recovery_start = MaxSector; + + dev->events = be32_to_cpu(ddf->active->seq); + dev->data_offset = + be64_to_cpu(LBA_OFFSET(ddf, bvd)[iphys]); + dev->component_size = be64_to_cpu(bvd->blocks); + if (d->devname) + strcpy(dev->name, d->devname); + } + } + return rest; +} + +static int store_super_ddf(struct supertype *st, int fd) +{ + struct ddf_super *ddf = st->sb; + unsigned long long dsize; + void *buf; + int rc; + + if (!ddf) + return 1; + + if (!get_dev_size(fd, NULL, &dsize)) + return 1; + + if (ddf->dlist || ddf->conflist) { + struct stat sta; + struct dl *dl; + int ofd, ret; + + if (fstat(fd, &sta) == -1 || !S_ISBLK(sta.st_mode)) { + pr_err("file descriptor for invalid device\n"); + return 1; + } + for (dl = ddf->dlist; dl; dl = dl->next) + if (dl->major == (int)major(sta.st_rdev) && + dl->minor == (int)minor(sta.st_rdev)) + break; + if (!dl) { + pr_err("couldn't find disk %d/%d\n", + (int)major(sta.st_rdev), + (int)minor(sta.st_rdev)); + return 1; + } + ofd = dl->fd; + dl->fd = fd; + ret = (_write_super_to_disk(ddf, dl) != 1); + dl->fd = ofd; + return ret; + } + + if (posix_memalign(&buf, 512, 512) != 0) + return 1; + memset(buf, 0, 512); + + lseek64(fd, dsize-512, 0); + rc = write(fd, buf, 512); + free(buf); + if (rc < 0) + return 1; + return 0; +} + +static int compare_super_ddf(struct supertype *st, struct supertype *tst) +{ + /* + * return: + * 0 same, or first was empty, and second was copied + * 1 second had wrong magic number - but that isn't possible + * 2 wrong uuid + * 3 wrong other info + */ + struct ddf_super *first = st->sb; + struct ddf_super *second = tst->sb; + struct dl *dl1, *dl2; + struct vcl *vl1, *vl2; + unsigned int max_vds, max_pds, pd, vd; + + if (!first) { + st->sb = tst->sb; + tst->sb = NULL; + return 0; + } + + if (memcmp(first->anchor.guid, second->anchor.guid, DDF_GUID_LEN) != 0) + return 2; + + /* It is only OK to compare info in the anchor. Anything else + * could be changing due to a reconfig so must be ignored. + * guid really should be enough anyway. + */ + + if (!be32_eq(first->active->seq, second->active->seq)) { + dprintf("sequence number mismatch %u<->%u\n", + be32_to_cpu(first->active->seq), + be32_to_cpu(second->active->seq)); + return 0; + } + + /* + * At this point we are fairly sure that the meta data matches. + * But the new disk may contain additional local data. + * Add it to the super block. + */ + max_vds = be16_to_cpu(first->active->max_vd_entries); + max_pds = be16_to_cpu(first->phys->max_pdes); + for (vl2 = second->conflist; vl2; vl2 = vl2->next) { + for (vl1 = first->conflist; vl1; vl1 = vl1->next) + if (!memcmp(vl1->conf.guid, vl2->conf.guid, + DDF_GUID_LEN)) + break; + if (vl1) { + if (vl1->other_bvds != NULL && + vl1->conf.sec_elmnt_seq != + vl2->conf.sec_elmnt_seq) { + dprintf("adding BVD %u\n", + vl2->conf.sec_elmnt_seq); + add_other_bvd(vl1, &vl2->conf, + first->conf_rec_len*512); + } + continue; + } + + if (posix_memalign((void **)&vl1, 512, + (first->conf_rec_len*512 + + offsetof(struct vcl, conf))) != 0) { + pr_err("could not allocate vcl buf\n"); + return 3; + } + + vl1->next = first->conflist; + vl1->block_sizes = NULL; + memcpy(&vl1->conf, &vl2->conf, first->conf_rec_len*512); + if (alloc_other_bvds(first, vl1) != 0) { + pr_err("could not allocate other bvds\n"); + free(vl1); + return 3; + } + for (vd = 0; vd < max_vds; vd++) + if (!memcmp(first->virt->entries[vd].guid, + vl1->conf.guid, DDF_GUID_LEN)) + break; + vl1->vcnum = vd; + dprintf("added config for VD %u\n", vl1->vcnum); + first->conflist = vl1; + } + + for (dl2 = second->dlist; dl2; dl2 = dl2->next) { + for (dl1 = first->dlist; dl1; dl1 = dl1->next) + if (be32_eq(dl1->disk.refnum, dl2->disk.refnum)) + break; + if (dl1) + continue; + + if (posix_memalign((void **)&dl1, 512, + sizeof(*dl1) + (first->max_part) * sizeof(dl1->vlist[0])) + != 0) { + pr_err("could not allocate disk info buffer\n"); + return 3; + } + memcpy(dl1, dl2, sizeof(*dl1)); + dl1->mdupdate = NULL; + dl1->next = first->dlist; + dl1->fd = -1; + for (pd = 0; pd < max_pds; pd++) + if (be32_eq(first->phys->entries[pd].refnum, + dl1->disk.refnum)) + break; + dl1->pdnum = pd < max_pds ? (int)pd : -1; + if (dl2->spare) { + if (posix_memalign((void **)&dl1->spare, 512, + first->conf_rec_len*512) != 0) { + pr_err("could not allocate spare info buf\n"); + return 3; + } + memcpy(dl1->spare, dl2->spare, first->conf_rec_len*512); + } + for (vd = 0 ; vd < first->max_part ; vd++) { + if (!dl2->vlist[vd]) { + dl1->vlist[vd] = NULL; + continue; + } + for (vl1 = first->conflist; vl1; vl1 = vl1->next) { + if (!memcmp(vl1->conf.guid, + dl2->vlist[vd]->conf.guid, + DDF_GUID_LEN)) + break; + dl1->vlist[vd] = vl1; + } + } + first->dlist = dl1; + dprintf("added disk %d: %08x\n", dl1->pdnum, + be32_to_cpu(dl1->disk.refnum)); + } + + return 0; +} + +#ifndef MDASSEMBLE +/* + * A new array 'a' has been started which claims to be instance 'inst' + * within container 'c'. + * We need to confirm that the array matches the metadata in 'c' so + * that we don't corrupt any metadata. + */ +static int ddf_open_new(struct supertype *c, struct active_array *a, char *inst) +{ + struct ddf_super *ddf = c->sb; + int n = atoi(inst); + struct mdinfo *dev; + struct dl *dl; + static const char faulty[] = "faulty"; + + if (all_ff(ddf->virt->entries[n].guid)) { + pr_err("subarray %d doesn't exist\n", n); + return -ENODEV; + } + dprintf("new subarray %d, GUID: %s\n", n, + guid_str(ddf->virt->entries[n].guid)); + for (dev = a->info.devs; dev; dev = dev->next) { + for (dl = ddf->dlist; dl; dl = dl->next) + if (dl->major == dev->disk.major && + dl->minor == dev->disk.minor) + break; + if (!dl || dl->pdnum < 0) { + pr_err("device %d/%d of subarray %d not found in meta data\n", + dev->disk.major, dev->disk.minor, n); + return -1; + } + if ((be16_to_cpu(ddf->phys->entries[dl->pdnum].state) & + (DDF_Online|DDF_Missing|DDF_Failed)) != DDF_Online) { + pr_err("new subarray %d contains broken device %d/%d (%02x)\n", + n, dl->major, dl->minor, + be16_to_cpu(ddf->phys->entries[dl->pdnum].state)); + if (write(dev->state_fd, faulty, sizeof(faulty)-1) != + sizeof(faulty) - 1) + pr_err("Write to state_fd failed\n"); + dev->curr_state = DS_FAULTY; + } + } + a->info.container_member = n; + return 0; +} + +static void handle_missing(struct ddf_super *ddf, struct active_array *a, int inst) +{ + /* This member array is being activated. If any devices + * are missing they must now be marked as failed. + */ + struct vd_config *vc; + unsigned int n_bvd; + struct vcl *vcl; + struct dl *dl; + int pd; + int n; + int state; + + for (n = 0; ; n++) { + vc = find_vdcr(ddf, inst, n, &n_bvd, &vcl); + if (!vc) + break; + for (dl = ddf->dlist; dl; dl = dl->next) + if (be32_eq(dl->disk.refnum, vc->phys_refnum[n_bvd])) + break; + if (dl) + /* Found this disk, so not missing */ + continue; + + /* Mark the device as failed/missing. */ + pd = find_phys(ddf, vc->phys_refnum[n_bvd]); + if (pd >= 0 && be16_and(ddf->phys->entries[pd].state, + cpu_to_be16(DDF_Online))) { + be16_clear(ddf->phys->entries[pd].state, + cpu_to_be16(DDF_Online)); + be16_set(ddf->phys->entries[pd].state, + cpu_to_be16(DDF_Failed|DDF_Missing)); + vc->phys_refnum[n_bvd] = cpu_to_be32(0); + ddf_set_updates_pending(ddf, vc); + } + + /* Mark the array as Degraded */ + state = get_svd_state(ddf, vcl); + if (ddf->virt->entries[inst].state != + ((ddf->virt->entries[inst].state & ~DDF_state_mask) + | state)) { + ddf->virt->entries[inst].state = + (ddf->virt->entries[inst].state & ~DDF_state_mask) + | state; + a->check_degraded = 1; + ddf_set_updates_pending(ddf, vc); + } + } +} + +/* + * The array 'a' is to be marked clean in the metadata. + * If '->resync_start' is not ~(unsigned long long)0, then the array is only + * clean up to the point (in sectors). If that cannot be recorded in the + * metadata, then leave it as dirty. + * + * For DDF, we need to clear the DDF_state_inconsistent bit in the + * !global! virtual_disk.virtual_entry structure. + */ +static int ddf_set_array_state(struct active_array *a, int consistent) +{ + struct ddf_super *ddf = a->container->sb; + int inst = a->info.container_member; + int old = ddf->virt->entries[inst].state; + if (consistent == 2) { + handle_missing(ddf, a, inst); + consistent = 1; + if (!is_resync_complete(&a->info)) + consistent = 0; + } + if (consistent) + ddf->virt->entries[inst].state &= ~DDF_state_inconsistent; + else + ddf->virt->entries[inst].state |= DDF_state_inconsistent; + if (old != ddf->virt->entries[inst].state) + ddf_set_updates_pending(ddf, NULL); + + old = ddf->virt->entries[inst].init_state; + ddf->virt->entries[inst].init_state &= ~DDF_initstate_mask; + if (is_resync_complete(&a->info)) + ddf->virt->entries[inst].init_state |= DDF_init_full; + else if (a->info.resync_start == 0) + ddf->virt->entries[inst].init_state |= DDF_init_not; + else + ddf->virt->entries[inst].init_state |= DDF_init_quick; + if (old != ddf->virt->entries[inst].init_state) + ddf_set_updates_pending(ddf, NULL); + + dprintf("ddf mark %d/%s (%d) %s %llu\n", inst, + guid_str(ddf->virt->entries[inst].guid), a->curr_state, + consistent?"clean":"dirty", + a->info.resync_start); + return consistent; +} + +static int get_bvd_state(const struct ddf_super *ddf, + const struct vd_config *vc) +{ + unsigned int i, n_bvd, working = 0; + unsigned int n_prim = be16_to_cpu(vc->prim_elmnt_count); + int pd, st, state; + char *avail = xcalloc(1, n_prim); + mdu_array_info_t array; + + layout_ddf2md(vc, &array); + + for (i = 0; i < n_prim; i++) { + if (!find_index_in_bvd(ddf, vc, i, &n_bvd)) + continue; + pd = find_phys(ddf, vc->phys_refnum[n_bvd]); + if (pd < 0) + continue; + st = be16_to_cpu(ddf->phys->entries[pd].state); + if ((st & (DDF_Online|DDF_Failed|DDF_Rebuilding)) + == DDF_Online) { + working++; + avail[i] = 1; + } + } + + state = DDF_state_degraded; + if (working == n_prim) + state = DDF_state_optimal; + else + switch (vc->prl) { + case DDF_RAID0: + case DDF_CONCAT: + case DDF_JBOD: + state = DDF_state_failed; + break; + case DDF_RAID1: + if (working == 0) + state = DDF_state_failed; + else if (working >= 2) + state = DDF_state_part_optimal; + break; + case DDF_RAID1E: + if (!enough(10, n_prim, array.layout, 1, avail)) + state = DDF_state_failed; + break; + case DDF_RAID4: + case DDF_RAID5: + if (working < n_prim - 1) + state = DDF_state_failed; + break; + case DDF_RAID6: + if (working < n_prim - 2) + state = DDF_state_failed; + else if (working == n_prim - 1) + state = DDF_state_part_optimal; + break; + } + return state; +} + +static int secondary_state(int state, int other, int seclevel) +{ + if (state == DDF_state_optimal && other == DDF_state_optimal) + return DDF_state_optimal; + if (seclevel == DDF_2MIRRORED) { + if (state == DDF_state_optimal || other == DDF_state_optimal) + return DDF_state_part_optimal; + if (state == DDF_state_failed && other == DDF_state_failed) + return DDF_state_failed; + return DDF_state_degraded; + } else { + if (state == DDF_state_failed || other == DDF_state_failed) + return DDF_state_failed; + if (state == DDF_state_degraded || other == DDF_state_degraded) + return DDF_state_degraded; + return DDF_state_part_optimal; + } +} + +static int get_svd_state(const struct ddf_super *ddf, const struct vcl *vcl) +{ + int state = get_bvd_state(ddf, &vcl->conf); + unsigned int i; + for (i = 1; i < vcl->conf.sec_elmnt_count; i++) { + state = secondary_state( + state, + get_bvd_state(ddf, vcl->other_bvds[i-1]), + vcl->conf.srl); + } + return state; +} + +/* + * The state of each disk is stored in the global phys_disk structure + * in phys_disk.entries[n].state. + * This makes various combinations awkward. + * - When a device fails in any array, it must be failed in all arrays + * that include a part of this device. + * - When a component is rebuilding, we cannot include it officially in the + * array unless this is the only array that uses the device. + * + * So: when transitioning: + * Online -> failed, just set failed flag. monitor will propagate + * spare -> online, the device might need to be added to the array. + * spare -> failed, just set failed. Don't worry if in array or not. + */ +static void ddf_set_disk(struct active_array *a, int n, int state) +{ + struct ddf_super *ddf = a->container->sb; + unsigned int inst = a->info.container_member, n_bvd; + struct vcl *vcl; + struct vd_config *vc = find_vdcr(ddf, inst, (unsigned int)n, + &n_bvd, &vcl); + int pd; + struct mdinfo *mdi; + struct dl *dl; + int update = 0; + + dprintf("%d to %x\n", n, state); + if (vc == NULL) { + dprintf("ddf: cannot find instance %d!!\n", inst); + return; + } + /* Find the matching slot in 'info'. */ + for (mdi = a->info.devs; mdi; mdi = mdi->next) + if (mdi->disk.raid_disk == n) + break; + if (!mdi) { + pr_err("cannot find raid disk %d\n", n); + return; + } + + /* and find the 'dl' entry corresponding to that. */ + for (dl = ddf->dlist; dl; dl = dl->next) + if (mdi->state_fd >= 0 && + mdi->disk.major == dl->major && + mdi->disk.minor == dl->minor) + break; + if (!dl) { + pr_err("cannot find raid disk %d (%d/%d)\n", + n, mdi->disk.major, mdi->disk.minor); + return; + } + + pd = find_phys(ddf, vc->phys_refnum[n_bvd]); + if (pd < 0 || pd != dl->pdnum) { + /* disk doesn't currently exist or has changed. + * If it is now in_sync, insert it. */ + dprintf("phys disk not found for %d: %d/%d ref %08x\n", + dl->pdnum, dl->major, dl->minor, + be32_to_cpu(dl->disk.refnum)); + dprintf("array %u disk %u ref %08x pd %d\n", + inst, n_bvd, + be32_to_cpu(vc->phys_refnum[n_bvd]), pd); + if ((state & DS_INSYNC) && ! (state & DS_FAULTY) && + dl->pdnum >= 0) { + pd = dl->pdnum; + vc->phys_refnum[n_bvd] = dl->disk.refnum; + LBA_OFFSET(ddf, vc)[n_bvd] = + cpu_to_be64(mdi->data_offset); + be16_clear(ddf->phys->entries[pd].type, + cpu_to_be16(DDF_Global_Spare)); + be16_set(ddf->phys->entries[pd].type, + cpu_to_be16(DDF_Active_in_VD)); + update = 1; + } + } else { + be16 old = ddf->phys->entries[pd].state; + if (state & DS_FAULTY) + be16_set(ddf->phys->entries[pd].state, + cpu_to_be16(DDF_Failed)); + if (state & DS_INSYNC) { + be16_set(ddf->phys->entries[pd].state, + cpu_to_be16(DDF_Online)); + be16_clear(ddf->phys->entries[pd].state, + cpu_to_be16(DDF_Rebuilding)); + } + if (!be16_eq(old, ddf->phys->entries[pd].state)) + update = 1; + } + + dprintf("ddf: set_disk %d (%08x) to %x->%02x\n", n, + be32_to_cpu(dl->disk.refnum), state, + be16_to_cpu(ddf->phys->entries[pd].state)); + + /* Now we need to check the state of the array and update + * virtual_disk.entries[n].state. + * It needs to be one of "optimal", "degraded", "failed". + * I don't understand 'deleted' or 'missing'. + */ + state = get_svd_state(ddf, vcl); + + if (ddf->virt->entries[inst].state != + ((ddf->virt->entries[inst].state & ~DDF_state_mask) + | state)) { + ddf->virt->entries[inst].state = + (ddf->virt->entries[inst].state & ~DDF_state_mask) + | state; + update = 1; + } + if (update) + ddf_set_updates_pending(ddf, vc); +} + +static void ddf_sync_metadata(struct supertype *st) +{ + /* + * Write all data to all devices. + * Later, we might be able to track whether only local changes + * have been made, or whether any global data has been changed, + * but ddf is sufficiently weird that it probably always + * changes global data .... + */ + struct ddf_super *ddf = st->sb; + if (!ddf->updates_pending) + return; + ddf->updates_pending = 0; + __write_init_super_ddf(st); + dprintf("ddf: sync_metadata\n"); +} + +static int del_from_conflist(struct vcl **list, const char *guid) +{ + struct vcl **p; + int found = 0; + for (p = list; p && *p; p = &((*p)->next)) + if (!memcmp((*p)->conf.guid, guid, DDF_GUID_LEN)) { + found = 1; + *p = (*p)->next; + } + return found; +} + +static int _kill_subarray_ddf(struct ddf_super *ddf, const char *guid) +{ + struct dl *dl; + unsigned int vdnum, i; + vdnum = find_vde_by_guid(ddf, guid); + if (vdnum == DDF_NOTFOUND) { + pr_err("could not find VD %s\n", guid_str(guid)); + return -1; + } + if (del_from_conflist(&ddf->conflist, guid) == 0) { + pr_err("could not find conf %s\n", guid_str(guid)); + return -1; + } + for (dl = ddf->dlist; dl; dl = dl->next) + for (i = 0; i < ddf->max_part; i++) + if (dl->vlist[i] != NULL && + !memcmp(dl->vlist[i]->conf.guid, guid, + DDF_GUID_LEN)) + dl->vlist[i] = NULL; + memset(ddf->virt->entries[vdnum].guid, 0xff, DDF_GUID_LEN); + dprintf("deleted %s\n", guid_str(guid)); + return 0; +} + +static int kill_subarray_ddf(struct supertype *st) +{ + struct ddf_super *ddf = st->sb; + /* + * currentconf is set in container_content_ddf, + * called with subarray arg + */ + struct vcl *victim = ddf->currentconf; + struct vd_config *conf; + unsigned int vdnum; + + ddf->currentconf = NULL; + if (!victim) { + pr_err("nothing to kill\n"); + return -1; + } + conf = &victim->conf; + vdnum = find_vde_by_guid(ddf, conf->guid); + if (vdnum == DDF_NOTFOUND) { + pr_err("could not find VD %s\n", guid_str(conf->guid)); + return -1; + } + if (st->update_tail) { + struct virtual_disk *vd; + int len = sizeof(struct virtual_disk) + + sizeof(struct virtual_entry); + vd = xmalloc(len); + if (vd == NULL) { + pr_err("failed to allocate %d bytes\n", len); + return -1; + } + memset(vd, 0 , len); + vd->magic = DDF_VIRT_RECORDS_MAGIC; + vd->populated_vdes = cpu_to_be16(0); + memcpy(vd->entries[0].guid, conf->guid, DDF_GUID_LEN); + /* we use DDF_state_deleted as marker */ + vd->entries[0].state = DDF_state_deleted; + append_metadata_update(st, vd, len); + } else { + _kill_subarray_ddf(ddf, conf->guid); + ddf_set_updates_pending(ddf, NULL); + ddf_sync_metadata(st); + } + return 0; +} + +static void copy_matching_bvd(struct ddf_super *ddf, + struct vd_config *conf, + const struct metadata_update *update) +{ + unsigned int mppe = + be16_to_cpu(ddf->anchor.max_primary_element_entries); + unsigned int len = ddf->conf_rec_len * 512; + char *p; + struct vd_config *vc; + for (p = update->buf; p < update->buf + update->len; p += len) { + vc = (struct vd_config *) p; + if (vc->sec_elmnt_seq == conf->sec_elmnt_seq) { + memcpy(conf->phys_refnum, vc->phys_refnum, + mppe * (sizeof(__u32) + sizeof(__u64))); + return; + } + } + pr_err("no match for BVD %d of %s in update\n", + conf->sec_elmnt_seq, guid_str(conf->guid)); +} + +static void ddf_process_phys_update(struct supertype *st, + struct metadata_update *update) +{ + struct ddf_super *ddf = st->sb; + struct phys_disk *pd; + unsigned int ent; + + pd = (struct phys_disk*)update->buf; + ent = be16_to_cpu(pd->used_pdes); + if (ent >= be16_to_cpu(ddf->phys->max_pdes)) + return; + if (be16_and(pd->entries[0].state, cpu_to_be16(DDF_Missing))) { + struct dl **dlp; + /* removing this disk. */ + be16_set(ddf->phys->entries[ent].state, + cpu_to_be16(DDF_Missing)); + for (dlp = &ddf->dlist; *dlp; dlp = &(*dlp)->next) { + struct dl *dl = *dlp; + if (dl->pdnum == (signed)ent) { + close(dl->fd); + dl->fd = -1; + *dlp = dl->next; + update->space = dl->devname; + *(void**)dl = update->space_list; + update->space_list = (void**)dl; + break; + } + } + ddf_set_updates_pending(ddf, NULL); + return; + } + if (!all_ff(ddf->phys->entries[ent].guid)) + return; + ddf->phys->entries[ent] = pd->entries[0]; + ddf->phys->used_pdes = cpu_to_be16 + (1 + be16_to_cpu(ddf->phys->used_pdes)); + ddf_set_updates_pending(ddf, NULL); + if (ddf->add_list) { + struct active_array *a; + struct dl *al = ddf->add_list; + ddf->add_list = al->next; + + al->next = ddf->dlist; + ddf->dlist = al; + + /* As a device has been added, we should check + * for any degraded devices that might make + * use of this spare */ + for (a = st->arrays ; a; a=a->next) + a->check_degraded = 1; + } +} + +static void ddf_process_virt_update(struct supertype *st, + struct metadata_update *update) +{ + struct ddf_super *ddf = st->sb; + struct virtual_disk *vd; + unsigned int ent; + + vd = (struct virtual_disk*)update->buf; + + if (vd->entries[0].state == DDF_state_deleted) { + if (_kill_subarray_ddf(ddf, vd->entries[0].guid)) + return; + } else { + ent = find_vde_by_guid(ddf, vd->entries[0].guid); + if (ent != DDF_NOTFOUND) { + dprintf("VD %s exists already in slot %d\n", + guid_str(vd->entries[0].guid), + ent); + return; + } + ent = find_unused_vde(ddf); + if (ent == DDF_NOTFOUND) + return; + ddf->virt->entries[ent] = vd->entries[0]; + ddf->virt->populated_vdes = + cpu_to_be16( + 1 + be16_to_cpu( + ddf->virt->populated_vdes)); + dprintf("added VD %s in slot %d(s=%02x i=%02x)\n", + guid_str(vd->entries[0].guid), ent, + ddf->virt->entries[ent].state, + ddf->virt->entries[ent].init_state); + } + ddf_set_updates_pending(ddf, NULL); +} + +static void ddf_remove_failed(struct ddf_super *ddf) +{ + /* Now remove any 'Failed' devices that are not part + * of any VD. They will have the Transition flag set. + * Once done, we need to update all dl->pdnum numbers. + */ + unsigned int pdnum; + unsigned int pd2 = 0; + struct dl *dl; + + for (pdnum = 0; pdnum < be16_to_cpu(ddf->phys->max_pdes); + pdnum++) { + if (be32_to_cpu(ddf->phys->entries[pdnum].refnum) == + 0xFFFFFFFF) + continue; + if (be16_and(ddf->phys->entries[pdnum].state, + cpu_to_be16(DDF_Failed)) + && be16_and(ddf->phys->entries[pdnum].state, + cpu_to_be16(DDF_Transition))) { + /* skip this one unless in dlist*/ + for (dl = ddf->dlist; dl; dl = dl->next) + if (dl->pdnum == (int)pdnum) + break; + if (!dl) + continue; + } + if (pdnum == pd2) + pd2++; + else { + ddf->phys->entries[pd2] = + ddf->phys->entries[pdnum]; + for (dl = ddf->dlist; dl; dl = dl->next) + if (dl->pdnum == (int)pdnum) + dl->pdnum = pd2; + pd2++; + } + } + ddf->phys->used_pdes = cpu_to_be16(pd2); + while (pd2 < pdnum) { + memset(ddf->phys->entries[pd2].guid, 0xff, + DDF_GUID_LEN); + pd2++; + } +} + +static void ddf_update_vlist(struct ddf_super *ddf, struct dl *dl) +{ + struct vcl *vcl; + unsigned int vn = 0; + int in_degraded = 0; + + if (dl->pdnum < 0) + return; + for (vcl = ddf->conflist; vcl ; vcl = vcl->next) { + unsigned int dn, ibvd; + const struct vd_config *conf; + int vstate; + dn = get_pd_index_from_refnum(vcl, + dl->disk.refnum, + ddf->mppe, + &conf, &ibvd); + if (dn == DDF_NOTFOUND) + continue; + dprintf("dev %d/%08x has %s (sec=%u) at %d\n", + dl->pdnum, + be32_to_cpu(dl->disk.refnum), + guid_str(conf->guid), + conf->sec_elmnt_seq, vn); + /* Clear the Transition flag */ + if (be16_and + (ddf->phys->entries[dl->pdnum].state, + cpu_to_be16(DDF_Failed))) + be16_clear(ddf->phys + ->entries[dl->pdnum].state, + cpu_to_be16(DDF_Transition)); + dl->vlist[vn++] = vcl; + vstate = ddf->virt->entries[vcl->vcnum].state + & DDF_state_mask; + if (vstate == DDF_state_degraded || + vstate == DDF_state_part_optimal) + in_degraded = 1; + } + while (vn < ddf->max_part) + dl->vlist[vn++] = NULL; + if (dl->vlist[0]) { + be16_clear(ddf->phys->entries[dl->pdnum].type, + cpu_to_be16(DDF_Global_Spare)); + if (!be16_and(ddf->phys + ->entries[dl->pdnum].type, + cpu_to_be16(DDF_Active_in_VD))) { + be16_set(ddf->phys + ->entries[dl->pdnum].type, + cpu_to_be16(DDF_Active_in_VD)); + if (in_degraded) + be16_set(ddf->phys + ->entries[dl->pdnum] + .state, + cpu_to_be16 + (DDF_Rebuilding)); + } + } + if (dl->spare) { + be16_clear(ddf->phys->entries[dl->pdnum].type, + cpu_to_be16(DDF_Global_Spare)); + be16_set(ddf->phys->entries[dl->pdnum].type, + cpu_to_be16(DDF_Spare)); + } + if (!dl->vlist[0] && !dl->spare) { + be16_set(ddf->phys->entries[dl->pdnum].type, + cpu_to_be16(DDF_Global_Spare)); + be16_clear(ddf->phys->entries[dl->pdnum].type, + cpu_to_be16(DDF_Spare)); + be16_clear(ddf->phys->entries[dl->pdnum].type, + cpu_to_be16(DDF_Active_in_VD)); + } +} + +static void ddf_process_conf_update(struct supertype *st, + struct metadata_update *update) +{ + struct ddf_super *ddf = st->sb; + struct vd_config *vc; + struct vcl *vcl; + struct dl *dl; + unsigned int ent; + unsigned int pdnum, len; + + vc = (struct vd_config*)update->buf; + len = ddf->conf_rec_len * 512; + if ((unsigned int)update->len != len * vc->sec_elmnt_count) { + pr_err("%s: insufficient data (%d) for %u BVDs\n", + guid_str(vc->guid), update->len, + vc->sec_elmnt_count); + return; + } + for (vcl = ddf->conflist; vcl ; vcl = vcl->next) + if (memcmp(vcl->conf.guid, vc->guid, DDF_GUID_LEN) == 0) + break; + dprintf("conf update for %s (%s)\n", + guid_str(vc->guid), (vcl ? "old" : "new")); + if (vcl) { + /* An update, just copy the phys_refnum and lba_offset + * fields + */ + unsigned int i; + unsigned int k; + copy_matching_bvd(ddf, &vcl->conf, update); + for (k = 0; k < be16_to_cpu(vc->prim_elmnt_count); k++) + dprintf("BVD %u has %08x at %llu\n", 0, + be32_to_cpu(vcl->conf.phys_refnum[k]), + be64_to_cpu(LBA_OFFSET(ddf, + &vcl->conf)[k])); + for (i = 1; i < vc->sec_elmnt_count; i++) { + copy_matching_bvd(ddf, vcl->other_bvds[i-1], + update); + for (k = 0; k < be16_to_cpu( + vc->prim_elmnt_count); k++) + dprintf("BVD %u has %08x at %llu\n", i, + be32_to_cpu + (vcl->other_bvds[i-1]-> + phys_refnum[k]), + be64_to_cpu + (LBA_OFFSET + (ddf, + vcl->other_bvds[i-1])[k])); + } + } else { + /* A new VD_CONF */ + unsigned int i; + if (!update->space) + return; + vcl = update->space; + update->space = NULL; + vcl->next = ddf->conflist; + memcpy(&vcl->conf, vc, len); + ent = find_vde_by_guid(ddf, vc->guid); + if (ent == DDF_NOTFOUND) + return; + vcl->vcnum = ent; + ddf->conflist = vcl; + for (i = 1; i < vc->sec_elmnt_count; i++) + memcpy(vcl->other_bvds[i-1], + update->buf + len * i, len); + } + /* Set DDF_Transition on all Failed devices - to help + * us detect those that are no longer in use + */ + for (pdnum = 0; pdnum < be16_to_cpu(ddf->phys->max_pdes); + pdnum++) + if (be16_and(ddf->phys->entries[pdnum].state, + cpu_to_be16(DDF_Failed))) + be16_set(ddf->phys->entries[pdnum].state, + cpu_to_be16(DDF_Transition)); + + /* Now make sure vlist is correct for each dl. */ + for (dl = ddf->dlist; dl; dl = dl->next) + ddf_update_vlist(ddf, dl); + ddf_remove_failed(ddf); + + ddf_set_updates_pending(ddf, vc); +} + +static void ddf_process_update(struct supertype *st, + struct metadata_update *update) +{ + /* Apply this update to the metadata. + * The first 4 bytes are a DDF_*_MAGIC which guides + * our actions. + * Possible update are: + * DDF_PHYS_RECORDS_MAGIC + * Add a new physical device or remove an old one. + * Changes to this record only happen implicitly. + * used_pdes is the device number. + * DDF_VIRT_RECORDS_MAGIC + * Add a new VD. Possibly also change the 'access' bits. + * populated_vdes is the entry number. + * DDF_VD_CONF_MAGIC + * New or updated VD. the VIRT_RECORD must already + * exist. For an update, phys_refnum and lba_offset + * (at least) are updated, and the VD_CONF must + * be written to precisely those devices listed with + * a phys_refnum. + * DDF_SPARE_ASSIGN_MAGIC + * replacement Spare Assignment Record... but for which device? + * + * So, e.g.: + * - to create a new array, we send a VIRT_RECORD and + * a VD_CONF. Then assemble and start the array. + * - to activate a spare we send a VD_CONF to add the phys_refnum + * and offset. This will also mark the spare as active with + * a spare-assignment record. + */ + be32 *magic = (be32 *)update->buf; + + dprintf("Process update %x\n", be32_to_cpu(*magic)); + + if (be32_eq(*magic, DDF_PHYS_RECORDS_MAGIC)) { + if (update->len == (sizeof(struct phys_disk) + + sizeof(struct phys_disk_entry))) + ddf_process_phys_update(st, update); + } else if (be32_eq(*magic, DDF_VIRT_RECORDS_MAGIC)) { + if (update->len == (sizeof(struct virtual_disk) + + sizeof(struct virtual_entry))) + ddf_process_virt_update(st, update); + } else if (be32_eq(*magic, DDF_VD_CONF_MAGIC)) { + ddf_process_conf_update(st, update); + } + /* case DDF_SPARE_ASSIGN_MAGIC */ +} + +static int ddf_prepare_update(struct supertype *st, + struct metadata_update *update) +{ + /* This update arrived at managemon. + * We are about to pass it to monitor. + * If a malloc is needed, do it here. + */ + struct ddf_super *ddf = st->sb; + be32 *magic; + if (update->len < 4) + return 0; + magic = (be32 *)update->buf; + if (be32_eq(*magic, DDF_VD_CONF_MAGIC)) { + struct vcl *vcl; + struct vd_config *conf; + if (update->len < (int)sizeof(*conf)) + return 0; + conf = (struct vd_config *) update->buf; + if (posix_memalign(&update->space, 512, + offsetof(struct vcl, conf) + + ddf->conf_rec_len * 512) != 0) { + update->space = NULL; + return 0; + } + vcl = update->space; + vcl->conf.sec_elmnt_count = conf->sec_elmnt_count; + if (alloc_other_bvds(ddf, vcl) != 0) { + free(update->space); + update->space = NULL; + return 0; + } + } + return 1; +} + +/* + * Check degraded state of a RAID10. + * returns 2 for good, 1 for degraded, 0 for failed, and -1 for error + */ +static int raid10_degraded(struct mdinfo *info) +{ + int n_prim, n_bvds; + int i; + struct mdinfo *d; + char *found; + int ret = -1; + + n_prim = info->array.layout & ~0x100; + n_bvds = info->array.raid_disks / n_prim; + found = xmalloc(n_bvds); + if (found == NULL) + return ret; + memset(found, 0, n_bvds); + for (d = info->devs; d; d = d->next) { + i = d->disk.raid_disk / n_prim; + if (i >= n_bvds) { + pr_err("BUG: invalid raid disk\n"); + goto out; + } + if (d->state_fd > 0) + found[i]++; + } + ret = 2; + for (i = 0; i < n_bvds; i++) + if (!found[i]) { + dprintf("BVD %d/%d failed\n", i, n_bvds); + ret = 0; + goto out; + } else if (found[i] < n_prim) { + dprintf("BVD %d/%d degraded\n", i, n_bvds); + ret = 1; + } +out: + free(found); + return ret; +} + +/* + * Check if the array 'a' is degraded but not failed. + * If it is, find as many spares as are available and needed and + * arrange for their inclusion. + * We only choose devices which are not already in the array, + * and prefer those with a spare-assignment to this array. + * Otherwise we choose global spares - assuming always that + * there is enough room. + * For each spare that we assign, we return an 'mdinfo' which + * describes the position for the device in the array. + * We also add to 'updates' a DDF_VD_CONF_MAGIC update with + * the new phys_refnum and lba_offset values. + * + * Only worry about BVDs at the moment. + */ +static struct mdinfo *ddf_activate_spare(struct active_array *a, + struct metadata_update **updates) +{ + int working = 0; + struct mdinfo *d; + struct ddf_super *ddf = a->container->sb; + int global_ok = 0; + struct mdinfo *rv = NULL; + struct mdinfo *di; + struct metadata_update *mu; + struct dl *dl; + int i; + unsigned int j; + struct vcl *vcl; + struct vd_config *vc; + unsigned int n_bvd; + + for (d = a->info.devs ; d ; d = d->next) { + if ((d->curr_state & DS_FAULTY) && + d->state_fd >= 0) + /* wait for Removal to happen */ + return NULL; + if (d->state_fd >= 0) + working ++; + } + + dprintf("working=%d (%d) level=%d\n", working, + a->info.array.raid_disks, + a->info.array.level); + if (working == a->info.array.raid_disks) + return NULL; /* array not degraded */ + switch (a->info.array.level) { + case 1: + if (working == 0) + return NULL; /* failed */ + break; + case 4: + case 5: + if (working < a->info.array.raid_disks - 1) + return NULL; /* failed */ + break; + case 6: + if (working < a->info.array.raid_disks - 2) + return NULL; /* failed */ + break; + case 10: + if (raid10_degraded(&a->info) < 1) + return NULL; + break; + default: /* concat or stripe */ + return NULL; /* failed */ + } + + /* For each slot, if it is not working, find a spare */ + dl = ddf->dlist; + for (i = 0; i < a->info.array.raid_disks; i++) { + for (d = a->info.devs ; d ; d = d->next) + if (d->disk.raid_disk == i) + break; + dprintf("found %d: %p %x\n", i, d, d?d->curr_state:0); + if (d && (d->state_fd >= 0)) + continue; + + /* OK, this device needs recovery. Find a spare */ + again: + for ( ; dl ; dl = dl->next) { + unsigned long long esize; + unsigned long long pos; + struct mdinfo *d2; + int is_global = 0; + int is_dedicated = 0; + be16 state; + + if (dl->pdnum < 0) + continue; + state = ddf->phys->entries[dl->pdnum].state; + if (be16_and(state, + cpu_to_be16(DDF_Failed|DDF_Missing)) || + !be16_and(state, + cpu_to_be16(DDF_Online))) + continue; + + /* If in this array, skip */ + for (d2 = a->info.devs ; d2 ; d2 = d2->next) + if (d2->state_fd >= 0 && + d2->disk.major == dl->major && + d2->disk.minor == dl->minor) { + dprintf("%x:%x (%08x) already in array\n", + dl->major, dl->minor, + be32_to_cpu(dl->disk.refnum)); + break; + } + if (d2) + continue; + if (be16_and(ddf->phys->entries[dl->pdnum].type, + cpu_to_be16(DDF_Spare))) { + /* Check spare assign record */ + if (dl->spare) { + if (dl->spare->type & DDF_spare_dedicated) { + /* check spare_ents for guid */ + unsigned int j; + for (j = 0 ; + j < be16_to_cpu + (dl->spare + ->populated); + j++) { + if (memcmp(dl->spare->spare_ents[j].guid, + ddf->virt->entries[a->info.container_member].guid, + DDF_GUID_LEN) == 0) + is_dedicated = 1; + } + } else + is_global = 1; + } + } else if (be16_and(ddf->phys->entries[dl->pdnum].type, + cpu_to_be16(DDF_Global_Spare))) { + is_global = 1; + } else if (!be16_and(ddf->phys + ->entries[dl->pdnum].state, + cpu_to_be16(DDF_Failed))) { + /* we can possibly use some of this */ + is_global = 1; + } + if ( ! (is_dedicated || + (is_global && global_ok))) { + dprintf("%x:%x not suitable: %d %d\n", dl->major, dl->minor, + is_dedicated, is_global); + continue; + } + + /* We are allowed to use this device - is there space? + * We need a->info.component_size sectors */ + esize = a->info.component_size; + pos = find_space(ddf, dl, INVALID_SECTORS, &esize); + + if (esize < a->info.component_size) { + dprintf("%x:%x has no room: %llu %llu\n", + dl->major, dl->minor, + esize, a->info.component_size); + /* No room */ + continue; + } + + /* Cool, we have a device with some space at pos */ + di = xcalloc(1, sizeof(*di)); + di->disk.number = i; + di->disk.raid_disk = i; + di->disk.major = dl->major; + di->disk.minor = dl->minor; + di->disk.state = 0; + di->recovery_start = 0; + di->data_offset = pos; + di->component_size = a->info.component_size; + di->next = rv; + rv = di; + dprintf("%x:%x (%08x) to be %d at %llu\n", + dl->major, dl->minor, + be32_to_cpu(dl->disk.refnum), i, pos); + + break; + } + if (!dl && ! global_ok) { + /* not enough dedicated spares, try global */ + global_ok = 1; + dl = ddf->dlist; + goto again; + } + } + + if (!rv) + /* No spares found */ + return rv; + /* Now 'rv' has a list of devices to return. + * Create a metadata_update record to update the + * phys_refnum and lba_offset values + */ + vc = find_vdcr(ddf, a->info.container_member, rv->disk.raid_disk, + &n_bvd, &vcl); + if (vc == NULL) + return NULL; + + mu = xmalloc(sizeof(*mu)); + if (posix_memalign(&mu->space, 512, sizeof(struct vcl)) != 0) { + free(mu); + mu = NULL; + } + + mu->len = ddf->conf_rec_len * 512 * vcl->conf.sec_elmnt_count; + mu->buf = xmalloc(mu->len); + mu->space = NULL; + mu->space_list = NULL; + mu->next = *updates; + memcpy(mu->buf, &vcl->conf, ddf->conf_rec_len * 512); + for (j = 1; j < vcl->conf.sec_elmnt_count; j++) + memcpy(mu->buf + j * ddf->conf_rec_len * 512, + vcl->other_bvds[j-1], ddf->conf_rec_len * 512); + + vc = (struct vd_config*)mu->buf; + for (di = rv ; di ; di = di->next) { + unsigned int i_sec, i_prim; + i_sec = di->disk.raid_disk + / be16_to_cpu(vcl->conf.prim_elmnt_count); + i_prim = di->disk.raid_disk + % be16_to_cpu(vcl->conf.prim_elmnt_count); + vc = (struct vd_config *)(mu->buf + + i_sec * ddf->conf_rec_len * 512); + for (dl = ddf->dlist; dl; dl = dl->next) + if (dl->major == di->disk.major + && dl->minor == di->disk.minor) + break; + if (!dl || dl->pdnum < 0) { + pr_err("BUG: can't find disk %d (%d/%d)\n", + di->disk.raid_disk, + di->disk.major, di->disk.minor); + return NULL; + } + vc->phys_refnum[i_prim] = ddf->phys->entries[dl->pdnum].refnum; + LBA_OFFSET(ddf, vc)[i_prim] = cpu_to_be64(di->data_offset); + dprintf("BVD %u gets %u: %08x at %llu\n", i_sec, i_prim, + be32_to_cpu(vc->phys_refnum[i_prim]), + be64_to_cpu(LBA_OFFSET(ddf, vc)[i_prim])); + } + *updates = mu; + return rv; +} +#endif /* MDASSEMBLE */ + +static int ddf_level_to_layout(int level) +{ + switch(level) { + case 0: + case 1: + return 0; + case 5: + return ALGORITHM_LEFT_SYMMETRIC; + case 6: + return ALGORITHM_ROTATING_N_CONTINUE; + case 10: + return 0x102; + default: + return UnSet; + } +} + +static void default_geometry_ddf(struct supertype *st, int *level, int *layout, int *chunk) +{ + if (level && *level == UnSet) + *level = LEVEL_CONTAINER; + + if (level && layout && *layout == UnSet) + *layout = ddf_level_to_layout(*level); +} + +struct superswitch super_ddf = { +#ifndef MDASSEMBLE + .examine_super = examine_super_ddf, + .brief_examine_super = brief_examine_super_ddf, + .brief_examine_subarrays = brief_examine_subarrays_ddf, + .export_examine_super = export_examine_super_ddf, + .detail_super = detail_super_ddf, + .brief_detail_super = brief_detail_super_ddf, + .validate_geometry = validate_geometry_ddf, + .write_init_super = write_init_super_ddf, + .add_to_super = add_to_super_ddf, + .remove_from_super = remove_from_super_ddf, + .load_container = load_container_ddf, + .copy_metadata = copy_metadata_ddf, + .kill_subarray = kill_subarray_ddf, +#endif + .match_home = match_home_ddf, + .uuid_from_super= uuid_from_super_ddf, + .getinfo_super = getinfo_super_ddf, + .update_super = update_super_ddf, + + .avail_size = avail_size_ddf, + + .compare_super = compare_super_ddf, + + .load_super = load_super_ddf, + .init_super = init_super_ddf, + .store_super = store_super_ddf, + .free_super = free_super_ddf, + .match_metadata_desc = match_metadata_desc_ddf, + .container_content = container_content_ddf, + .default_geometry = default_geometry_ddf, + + .external = 1, + +#ifndef MDASSEMBLE +/* for mdmon */ + .open_new = ddf_open_new, + .set_array_state= ddf_set_array_state, + .set_disk = ddf_set_disk, + .sync_metadata = ddf_sync_metadata, + .process_update = ddf_process_update, + .prepare_update = ddf_prepare_update, + .activate_spare = ddf_activate_spare, +#endif + .name = "ddf", +}; diff --git a/super-gpt.c b/super-gpt.c new file mode 100644 index 00000000..1a2adce0 --- /dev/null +++ b/super-gpt.c @@ -0,0 +1,216 @@ +/* + * mdadm - manage Linux "md" devices aka RAID arrays. + * + * Copyright (C) 2010 Neil Brown <neilb@suse.de> + * + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Author: Neil Brown + * Email: <neil@brown.name> + * + */ + +/* + * 'gpt' is a pseudo metadata type for devices which have a + * GPT partition table. + * + * Obviously arrays cannot be created or assembled for this type. + * It is used to allow a new bare device to have an partition table + * added so the member partitions can then be included in other + * arrays as relevant. + * + * The meaning operations are: + * examine_super, but not brief_examine_super or export_examine + * load_super + * store_super + */ + +#include "mdadm.h" +#include "part.h" + +static void free_gpt(struct supertype *st) +{ + free(st->sb); + st->sb = NULL; +} + +#ifndef MDASSEMBLE +static void examine_gpt(struct supertype *st, char *homehost) +{ + struct GPT *gpt = st->sb + 512; + struct GPT_part_entry *gpe = st->sb + 1024; + unsigned int i; + + printf(" GPT Magic : %llx\n", (unsigned long long)__le64_to_cpu(gpt->magic)); + printf(" GPT Revision : %ld\n", (long)__le32_to_cpu(gpt->revision)); + for (i = 0; i < __le32_to_cpu(gpt->part_cnt); i++) { + printf(" Partition[%02d] : %12llu sectors at %12llu\n", + i, + (unsigned long long)__le64_to_cpu(gpe[i].starting_lba), + (unsigned long long)__le64_to_cpu(gpe[i].ending_lba)- + (unsigned long long)__le64_to_cpu(gpe[i].starting_lba) + +1 + ); + } +} +#endif /* MDASSEMBLE */ + +static int load_gpt(struct supertype *st, int fd, char *devname) +{ + struct MBR *super; + struct GPT *gpt_head; + int to_read; + + free_gpt(st); + + if (posix_memalign((void**)&super, 4096, 32*512) != 0) { + pr_err("could not allocate superblock\n"); + return 1; + } + + lseek(fd, 0, 0); + if (read(fd, super, sizeof(*super)) != sizeof(*super)) { + no_read: + if (devname) + pr_err("Cannot read partition table on %s\n", + devname); + free(super); + return 1; + } + + if (super->magic != MBR_SIGNATURE_MAGIC || + super->parts[0].part_type != MBR_GPT_PARTITION_TYPE) { + not_found: + if (devname) + pr_err("No partition table found on %s\n", + devname); + free(super); + return 1; + } + /* Seem to have GPT, load the header */ + gpt_head = (struct GPT*)(super+1); + if (read(fd, gpt_head, sizeof(*gpt_head)) != sizeof(*gpt_head)) + goto no_read; + if (gpt_head->magic != GPT_SIGNATURE_MAGIC) + goto not_found; + if (__le32_to_cpu(gpt_head->part_cnt) >= 128) + goto not_found; + + to_read = __le32_to_cpu(gpt_head->part_cnt) * sizeof(struct GPT_part_entry); + to_read = ((to_read+511)/512) * 512; + if (read(fd, gpt_head+1, to_read) != to_read) + goto no_read; + + st->sb = super; + + if (st->ss == NULL) { + st->ss = &gpt; + st->minor_version = 0; + st->max_devs = 1; + st->info = NULL; + } + return 0; +} + +static int store_gpt(struct supertype *st, int fd) +{ + /* FIXME should I save the boot loader */ + /* need to write two copies! */ + /* FIXME allow for blocks != 512 bytes + *etc + */ + struct MBR *super = st->sb; + struct GPT *gpt; + int to_write; + + gpt = (struct GPT*)(super+1); + + to_write = __le32_to_cpu(gpt->part_cnt) * sizeof(struct GPT_part_entry); + to_write = ((to_write+511)/512) * 512; + + lseek(fd, 0, 0); + if (write(fd, st->sb, to_write) != to_write) + return 4; + + fsync(fd); + ioctl(fd, BLKRRPART, 0); + return 0; +} + +static void getinfo_gpt(struct supertype *st, struct mdinfo *info, char *map) +{ + struct GPT *gpt = st->sb + 512; + struct GPT_part_entry *gpe = st->sb + 1024; + unsigned int i; + + memset(&info->array, 0, sizeof(info->array)); + memset(&info->disk, 0, sizeof(info->disk)); + strcpy(info->text_version, "gpt"); + strcpy(info->name, "gpt"); + info->component_size = 0; + + for (i = 0; i < __le32_to_cpu(gpt->part_cnt); i++) { + unsigned long long last = + (unsigned long long)__le64_to_cpu(gpe[i].ending_lba); + if (last > info->component_size) + info->component_size = last; + } +} + +static struct supertype *match_metadata_desc(char *arg) +{ + struct supertype *st = xmalloc(sizeof(*st)); + + if (!st) + return st; + if (strcmp(arg, "gpt") != 0) { + free(st); + return NULL; + } + + st->ss = &gpt; + st->info = NULL; + st->minor_version = 0; + st->max_devs = 1; + st->sb = NULL; + return st; +} + +#ifndef MDASSEMBLE +static int validate_geometry(struct supertype *st, int level, + int layout, int raiddisks, + int *chunk, unsigned long long size, + unsigned long long data_offset, + char *subdev, unsigned long long *freesize, + int verbose) +{ + pr_err("gpt metadata cannot be used this way\n"); + return 0; +} +#endif + +struct superswitch gpt = { +#ifndef MDASSEMBLE + .examine_super = examine_gpt, + .validate_geometry = validate_geometry, +#endif + .match_metadata_desc = match_metadata_desc, + .load_super = load_gpt, + .store_super = store_gpt, + .getinfo_super = getinfo_gpt, + .free_super = free_gpt, + .name = "gpt", +}; diff --git a/super-intel.c b/super-intel.c new file mode 100644 index 00000000..95a72b6a --- /dev/null +++ b/super-intel.c @@ -0,0 +1,10667 @@ +/* + * mdadm - Intel(R) Matrix Storage Manager Support + * + * Copyright (C) 2002-2008 Intel Corporation + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. + */ + +#define HAVE_STDINT_H 1 +#include "mdadm.h" +#include "mdmon.h" +#include "sha1.h" +#include "platform-intel.h" +#include <values.h> +#include <scsi/sg.h> +#include <ctype.h> +#include <dirent.h> + +/* MPB == Metadata Parameter Block */ +#define MPB_SIGNATURE "Intel Raid ISM Cfg Sig. " +#define MPB_SIG_LEN (strlen(MPB_SIGNATURE)) +#define MPB_VERSION_RAID0 "1.0.00" +#define MPB_VERSION_RAID1 "1.1.00" +#define MPB_VERSION_MANY_VOLUMES_PER_ARRAY "1.2.00" +#define MPB_VERSION_3OR4_DISK_ARRAY "1.2.01" +#define MPB_VERSION_RAID5 "1.2.02" +#define MPB_VERSION_5OR6_DISK_ARRAY "1.2.04" +#define MPB_VERSION_CNG "1.2.06" +#define MPB_VERSION_ATTRIBS "1.3.00" +#define MAX_SIGNATURE_LENGTH 32 +#define MAX_RAID_SERIAL_LEN 16 + +/* supports RAID0 */ +#define MPB_ATTRIB_RAID0 __cpu_to_le32(0x00000001) +/* supports RAID1 */ +#define MPB_ATTRIB_RAID1 __cpu_to_le32(0x00000002) +/* supports RAID10 */ +#define MPB_ATTRIB_RAID10 __cpu_to_le32(0x00000004) +/* supports RAID1E */ +#define MPB_ATTRIB_RAID1E __cpu_to_le32(0x00000008) +/* supports RAID5 */ +#define MPB_ATTRIB_RAID5 __cpu_to_le32(0x00000010) +/* supports RAID CNG */ +#define MPB_ATTRIB_RAIDCNG __cpu_to_le32(0x00000020) +/* supports expanded stripe sizes of 256K, 512K and 1MB */ +#define MPB_ATTRIB_EXP_STRIPE_SIZE __cpu_to_le32(0x00000040) + +/* The OROM Support RST Caching of Volumes */ +#define MPB_ATTRIB_NVM __cpu_to_le32(0x02000000) +/* The OROM supports creating disks greater than 2TB */ +#define MPB_ATTRIB_2TB_DISK __cpu_to_le32(0x04000000) +/* The OROM supports Bad Block Management */ +#define MPB_ATTRIB_BBM __cpu_to_le32(0x08000000) + +/* THe OROM Supports NVM Caching of Volumes */ +#define MPB_ATTRIB_NEVER_USE2 __cpu_to_le32(0x10000000) +/* The OROM supports creating volumes greater than 2TB */ +#define MPB_ATTRIB_2TB __cpu_to_le32(0x20000000) +/* originally for PMP, now it's wasted b/c. Never use this bit! */ +#define MPB_ATTRIB_NEVER_USE __cpu_to_le32(0x40000000) +/* Verify MPB contents against checksum after reading MPB */ +#define MPB_ATTRIB_CHECKSUM_VERIFY __cpu_to_le32(0x80000000) + +/* Define all supported attributes that have to be accepted by mdadm + */ +#define MPB_ATTRIB_SUPPORTED (MPB_ATTRIB_CHECKSUM_VERIFY | \ + MPB_ATTRIB_2TB | \ + MPB_ATTRIB_2TB_DISK | \ + MPB_ATTRIB_RAID0 | \ + MPB_ATTRIB_RAID1 | \ + MPB_ATTRIB_RAID10 | \ + MPB_ATTRIB_RAID5 | \ + MPB_ATTRIB_EXP_STRIPE_SIZE) + +/* Define attributes that are unused but not harmful */ +#define MPB_ATTRIB_IGNORED (MPB_ATTRIB_NEVER_USE) + +#define MPB_SECTOR_CNT 2210 +#define IMSM_RESERVED_SECTORS 4096 +#define NUM_BLOCKS_DIRTY_STRIPE_REGION 2056 +#define SECT_PER_MB_SHIFT 11 + +/* Disk configuration info. */ +#define IMSM_MAX_DEVICES 255 +struct imsm_disk { + __u8 serial[MAX_RAID_SERIAL_LEN];/* 0xD8 - 0xE7 ascii serial number */ + __u32 total_blocks_lo; /* 0xE8 - 0xEB total blocks lo */ + __u32 scsi_id; /* 0xEC - 0xEF scsi ID */ +#define SPARE_DISK __cpu_to_le32(0x01) /* Spare */ +#define CONFIGURED_DISK __cpu_to_le32(0x02) /* Member of some RaidDev */ +#define FAILED_DISK __cpu_to_le32(0x04) /* Permanent failure */ + __u32 status; /* 0xF0 - 0xF3 */ + __u32 owner_cfg_num; /* which config 0,1,2... owns this disk */ + __u32 total_blocks_hi; /* 0xF4 - 0xF5 total blocks hi */ +#define IMSM_DISK_FILLERS 3 + __u32 filler[IMSM_DISK_FILLERS]; /* 0xF5 - 0x107 MPB_DISK_FILLERS for future expansion */ +}; + +/* map selector for map managment + */ +#define MAP_0 0 +#define MAP_1 1 +#define MAP_X -1 + +/* RAID map configuration infos. */ +struct imsm_map { + __u32 pba_of_lba0_lo; /* start address of partition */ + __u32 blocks_per_member_lo;/* blocks per member */ + __u32 num_data_stripes_lo; /* number of data stripes */ + __u16 blocks_per_strip; + __u8 map_state; /* Normal, Uninitialized, Degraded, Failed */ +#define IMSM_T_STATE_NORMAL 0 +#define IMSM_T_STATE_UNINITIALIZED 1 +#define IMSM_T_STATE_DEGRADED 2 +#define IMSM_T_STATE_FAILED 3 + __u8 raid_level; +#define IMSM_T_RAID0 0 +#define IMSM_T_RAID1 1 +#define IMSM_T_RAID5 5 /* since metadata version 1.2.02 ? */ + __u8 num_members; /* number of member disks */ + __u8 num_domains; /* number of parity domains */ + __u8 failed_disk_num; /* valid only when state is degraded */ + __u8 ddf; + __u32 pba_of_lba0_hi; + __u32 blocks_per_member_hi; + __u32 num_data_stripes_hi; + __u32 filler[4]; /* expansion area */ +#define IMSM_ORD_REBUILD (1 << 24) + __u32 disk_ord_tbl[1]; /* disk_ord_tbl[num_members], + * top byte contains some flags + */ +} __attribute__ ((packed)); + +struct imsm_vol { + __u32 curr_migr_unit; + __u32 checkpoint_id; /* id to access curr_migr_unit */ + __u8 migr_state; /* Normal or Migrating */ +#define MIGR_INIT 0 +#define MIGR_REBUILD 1 +#define MIGR_VERIFY 2 /* analagous to echo check > sync_action */ +#define MIGR_GEN_MIGR 3 +#define MIGR_STATE_CHANGE 4 +#define MIGR_REPAIR 5 + __u8 migr_type; /* Initializing, Rebuilding, ... */ + __u8 dirty; + __u8 fs_state; /* fast-sync state for CnG (0xff == disabled) */ + __u16 verify_errors; /* number of mismatches */ + __u16 bad_blocks; /* number of bad blocks during verify */ + __u32 filler[4]; + struct imsm_map map[1]; + /* here comes another one if migr_state */ +} __attribute__ ((packed)); + +struct imsm_dev { + __u8 volume[MAX_RAID_SERIAL_LEN]; + __u32 size_low; + __u32 size_high; +#define DEV_BOOTABLE __cpu_to_le32(0x01) +#define DEV_BOOT_DEVICE __cpu_to_le32(0x02) +#define DEV_READ_COALESCING __cpu_to_le32(0x04) +#define DEV_WRITE_COALESCING __cpu_to_le32(0x08) +#define DEV_LAST_SHUTDOWN_DIRTY __cpu_to_le32(0x10) +#define DEV_HIDDEN_AT_BOOT __cpu_to_le32(0x20) +#define DEV_CURRENTLY_HIDDEN __cpu_to_le32(0x40) +#define DEV_VERIFY_AND_FIX __cpu_to_le32(0x80) +#define DEV_MAP_STATE_UNINIT __cpu_to_le32(0x100) +#define DEV_NO_AUTO_RECOVERY __cpu_to_le32(0x200) +#define DEV_CLONE_N_GO __cpu_to_le32(0x400) +#define DEV_CLONE_MAN_SYNC __cpu_to_le32(0x800) +#define DEV_CNG_MASTER_DISK_NUM __cpu_to_le32(0x1000) + __u32 status; /* Persistent RaidDev status */ + __u32 reserved_blocks; /* Reserved blocks at beginning of volume */ + __u8 migr_priority; + __u8 num_sub_vols; + __u8 tid; + __u8 cng_master_disk; + __u16 cache_policy; + __u8 cng_state; + __u8 cng_sub_state; +#define IMSM_DEV_FILLERS 10 + __u32 filler[IMSM_DEV_FILLERS]; + struct imsm_vol vol; +} __attribute__ ((packed)); + +struct imsm_super { + __u8 sig[MAX_SIGNATURE_LENGTH]; /* 0x00 - 0x1F */ + __u32 check_sum; /* 0x20 - 0x23 MPB Checksum */ + __u32 mpb_size; /* 0x24 - 0x27 Size of MPB */ + __u32 family_num; /* 0x28 - 0x2B Checksum from first time this config was written */ + __u32 generation_num; /* 0x2C - 0x2F Incremented each time this array's MPB is written */ + __u32 error_log_size; /* 0x30 - 0x33 in bytes */ + __u32 attributes; /* 0x34 - 0x37 */ + __u8 num_disks; /* 0x38 Number of configured disks */ + __u8 num_raid_devs; /* 0x39 Number of configured volumes */ + __u8 error_log_pos; /* 0x3A */ + __u8 fill[1]; /* 0x3B */ + __u32 cache_size; /* 0x3c - 0x40 in mb */ + __u32 orig_family_num; /* 0x40 - 0x43 original family num */ + __u32 pwr_cycle_count; /* 0x44 - 0x47 simulated power cycle count for array */ + __u32 bbm_log_size; /* 0x48 - 0x4B - size of bad Block Mgmt Log in bytes */ +#define IMSM_FILLERS 35 + __u32 filler[IMSM_FILLERS]; /* 0x4C - 0xD7 RAID_MPB_FILLERS */ + struct imsm_disk disk[1]; /* 0xD8 diskTbl[numDisks] */ + /* here comes imsm_dev[num_raid_devs] */ + /* here comes BBM logs */ +} __attribute__ ((packed)); + +#define BBM_LOG_MAX_ENTRIES 254 + +struct bbm_log_entry { + __u64 defective_block_start; +#define UNREADABLE 0xFFFFFFFF + __u32 spare_block_offset; + __u16 remapped_marked_count; + __u16 disk_ordinal; +} __attribute__ ((__packed__)); + +struct bbm_log { + __u32 signature; /* 0xABADB10C */ + __u32 entry_count; + __u32 reserved_spare_block_count; /* 0 */ + __u32 reserved; /* 0xFFFF */ + __u64 first_spare_lba; + struct bbm_log_entry mapped_block_entries[BBM_LOG_MAX_ENTRIES]; +} __attribute__ ((__packed__)); + +#ifndef MDASSEMBLE +static char *map_state_str[] = { "normal", "uninitialized", "degraded", "failed" }; +#endif + +#define RAID_DISK_RESERVED_BLOCKS_IMSM_HI 2209 + +#define GEN_MIGR_AREA_SIZE 2048 /* General Migration Copy Area size in blocks */ + +#define MIGR_REC_BUF_SIZE 512 /* size of migr_record i/o buffer */ +#define MIGR_REC_POSITION 512 /* migr_record position offset on disk, + * MIGR_REC_BUF_SIZE <= MIGR_REC_POSITION + */ + +#define UNIT_SRC_NORMAL 0 /* Source data for curr_migr_unit must + * be recovered using srcMap */ +#define UNIT_SRC_IN_CP_AREA 1 /* Source data for curr_migr_unit has + * already been migrated and must + * be recovered from checkpoint area */ +struct migr_record { + __u32 rec_status; /* Status used to determine how to restart + * migration in case it aborts + * in some fashion */ + __u32 curr_migr_unit; /* 0..numMigrUnits-1 */ + __u32 family_num; /* Family number of MPB + * containing the RaidDev + * that is migrating */ + __u32 ascending_migr; /* True if migrating in increasing + * order of lbas */ + __u32 blocks_per_unit; /* Num disk blocks per unit of operation */ + __u32 dest_depth_per_unit; /* Num member blocks each destMap + * member disk + * advances per unit-of-operation */ + __u32 ckpt_area_pba; /* Pba of first block of ckpt copy area */ + __u32 dest_1st_member_lba; /* First member lba on first + * stripe of destination */ + __u32 num_migr_units; /* Total num migration units-of-op */ + __u32 post_migr_vol_cap; /* Size of volume after + * migration completes */ + __u32 post_migr_vol_cap_hi; /* Expansion space for LBA64 */ + __u32 ckpt_read_disk_num; /* Which member disk in destSubMap[0] the + * migration ckpt record was read from + * (for recovered migrations) */ +} __attribute__ ((__packed__)); + +struct md_list { + /* usage marker: + * 1: load metadata + * 2: metadata does not match + * 4: already checked + */ + int used; + char *devname; + int found; + int container; + dev_t st_rdev; + struct md_list *next; +}; + +#define pr_vrb(fmt, arg...) (void) (verbose && pr_err(fmt, ##arg)) + +static __u8 migr_type(struct imsm_dev *dev) +{ + if (dev->vol.migr_type == MIGR_VERIFY && + dev->status & DEV_VERIFY_AND_FIX) + return MIGR_REPAIR; + else + return dev->vol.migr_type; +} + +static void set_migr_type(struct imsm_dev *dev, __u8 migr_type) +{ + /* for compatibility with older oroms convert MIGR_REPAIR, into + * MIGR_VERIFY w/ DEV_VERIFY_AND_FIX status + */ + if (migr_type == MIGR_REPAIR) { + dev->vol.migr_type = MIGR_VERIFY; + dev->status |= DEV_VERIFY_AND_FIX; + } else { + dev->vol.migr_type = migr_type; + dev->status &= ~DEV_VERIFY_AND_FIX; + } +} + +static unsigned int sector_count(__u32 bytes) +{ + return ROUND_UP(bytes, 512) / 512; +} + +static unsigned int mpb_sectors(struct imsm_super *mpb) +{ + return sector_count(__le32_to_cpu(mpb->mpb_size)); +} + +struct intel_dev { + struct imsm_dev *dev; + struct intel_dev *next; + unsigned index; +}; + +struct intel_hba { + enum sys_dev_type type; + char *path; + char *pci_id; + struct intel_hba *next; +}; + +enum action { + DISK_REMOVE = 1, + DISK_ADD +}; +/* internal representation of IMSM metadata */ +struct intel_super { + union { + void *buf; /* O_DIRECT buffer for reading/writing metadata */ + struct imsm_super *anchor; /* immovable parameters */ + }; + union { + void *migr_rec_buf; /* buffer for I/O operations */ + struct migr_record *migr_rec; /* migration record */ + }; + int clean_migration_record_by_mdmon; /* when reshape is switched to next + array, it indicates that mdmon is allowed to clean migration + record */ + size_t len; /* size of the 'buf' allocation */ + void *next_buf; /* for realloc'ing buf from the manager */ + size_t next_len; + int updates_pending; /* count of pending updates for mdmon */ + int current_vol; /* index of raid device undergoing creation */ + unsigned long long create_offset; /* common start for 'current_vol' */ + __u32 random; /* random data for seeding new family numbers */ + struct intel_dev *devlist; + struct dl { + struct dl *next; + int index; + __u8 serial[MAX_RAID_SERIAL_LEN]; + int major, minor; + char *devname; + struct imsm_disk disk; + int fd; + int extent_cnt; + struct extent *e; /* for determining freespace @ create */ + int raiddisk; /* slot to fill in autolayout */ + enum action action; + } *disks, *current_disk; + struct dl *disk_mgmt_list; /* list of disks to add/remove while mdmon + active */ + struct dl *missing; /* disks removed while we weren't looking */ + struct bbm_log *bbm_log; + struct intel_hba *hba; /* device path of the raid controller for this metadata */ + const struct imsm_orom *orom; /* platform firmware support */ + struct intel_super *next; /* (temp) list for disambiguating family_num */ +}; + +struct intel_disk { + struct imsm_disk disk; + #define IMSM_UNKNOWN_OWNER (-1) + int owner; + struct intel_disk *next; +}; + +struct extent { + unsigned long long start, size; +}; + +/* definitions of reshape process types */ +enum imsm_reshape_type { + CH_TAKEOVER, + CH_MIGRATION, + CH_ARRAY_SIZE, +}; + +/* definition of messages passed to imsm_process_update */ +enum imsm_update_type { + update_activate_spare, + update_create_array, + update_kill_array, + update_rename_array, + update_add_remove_disk, + update_reshape_container_disks, + update_reshape_migration, + update_takeover, + update_general_migration_checkpoint, + update_size_change, +}; + +struct imsm_update_activate_spare { + enum imsm_update_type type; + struct dl *dl; + int slot; + int array; + struct imsm_update_activate_spare *next; +}; + +struct geo_params { + char devnm[32]; + char *dev_name; + unsigned long long size; + int level; + int layout; + int chunksize; + int raid_disks; +}; + +enum takeover_direction { + R10_TO_R0, + R0_TO_R10 +}; +struct imsm_update_takeover { + enum imsm_update_type type; + int subarray; + enum takeover_direction direction; +}; + +struct imsm_update_reshape { + enum imsm_update_type type; + int old_raid_disks; + int new_raid_disks; + + int new_disks[1]; /* new_raid_disks - old_raid_disks makedev number */ +}; + +struct imsm_update_reshape_migration { + enum imsm_update_type type; + int old_raid_disks; + int new_raid_disks; + /* fields for array migration changes + */ + int subdev; + int new_level; + int new_layout; + int new_chunksize; + + int new_disks[1]; /* new_raid_disks - old_raid_disks makedev number */ +}; + +struct imsm_update_size_change { + enum imsm_update_type type; + int subdev; + long long new_size; +}; + +struct imsm_update_general_migration_checkpoint { + enum imsm_update_type type; + __u32 curr_migr_unit; +}; + +struct disk_info { + __u8 serial[MAX_RAID_SERIAL_LEN]; +}; + +struct imsm_update_create_array { + enum imsm_update_type type; + int dev_idx; + struct imsm_dev dev; +}; + +struct imsm_update_kill_array { + enum imsm_update_type type; + int dev_idx; +}; + +struct imsm_update_rename_array { + enum imsm_update_type type; + __u8 name[MAX_RAID_SERIAL_LEN]; + int dev_idx; +}; + +struct imsm_update_add_remove_disk { + enum imsm_update_type type; +}; + +static const char *_sys_dev_type[] = { + [SYS_DEV_UNKNOWN] = "Unknown", + [SYS_DEV_SAS] = "SAS", + [SYS_DEV_SATA] = "SATA", + [SYS_DEV_NVME] = "NVMe" +}; + +const char *get_sys_dev_type(enum sys_dev_type type) +{ + if (type >= SYS_DEV_MAX) + type = SYS_DEV_UNKNOWN; + + return _sys_dev_type[type]; +} + +static struct intel_hba * alloc_intel_hba(struct sys_dev *device) +{ + struct intel_hba *result = xmalloc(sizeof(*result)); + + result->type = device->type; + result->path = xstrdup(device->path); + result->next = NULL; + if (result->path && (result->pci_id = strrchr(result->path, '/')) != NULL) + result->pci_id++; + + return result; +} + +static struct intel_hba * find_intel_hba(struct intel_hba *hba, struct sys_dev *device) +{ + struct intel_hba *result=NULL; + for (result = hba; result; result = result->next) { + if (result->type == device->type && strcmp(result->path, device->path) == 0) + break; + } + return result; +} + +static int attach_hba_to_super(struct intel_super *super, struct sys_dev *device) +{ + struct intel_hba *hba; + + /* check if disk attached to Intel HBA */ + hba = find_intel_hba(super->hba, device); + if (hba != NULL) + return 1; + /* Check if HBA is already attached to super */ + if (super->hba == NULL) { + super->hba = alloc_intel_hba(device); + return 1; + } + + hba = super->hba; + /* Intel metadata allows for all disks attached to the same type HBA. + * Do not support HBA types mixing + */ + if (device->type != hba->type) + return 2; + + /* Multiple same type HBAs can be used if they share the same OROM */ + const struct imsm_orom *device_orom = get_orom_by_device_id(device->dev_id); + + if (device_orom != super->orom) + return 2; + + while (hba->next) + hba = hba->next; + + hba->next = alloc_intel_hba(device); + return 1; +} + +static struct sys_dev* find_disk_attached_hba(int fd, const char *devname) +{ + struct sys_dev *list, *elem; + char *disk_path; + + if ((list = find_intel_devices()) == NULL) + return 0; + + if (fd < 0) + disk_path = (char *) devname; + else + disk_path = diskfd_to_devpath(fd); + + if (!disk_path) + return 0; + + for (elem = list; elem; elem = elem->next) + if (path_attached_to_hba(disk_path, elem->path)) + return elem; + + if (disk_path != devname) + free(disk_path); + + return NULL; +} + +static int find_intel_hba_capability(int fd, struct intel_super *super, + char *devname); + +static struct supertype *match_metadata_desc_imsm(char *arg) +{ + struct supertype *st; + + if (strcmp(arg, "imsm") != 0 && + strcmp(arg, "default") != 0 + ) + return NULL; + + st = xcalloc(1, sizeof(*st)); + st->ss = &super_imsm; + st->max_devs = IMSM_MAX_DEVICES; + st->minor_version = 0; + st->sb = NULL; + return st; +} + +#ifndef MDASSEMBLE +static __u8 *get_imsm_version(struct imsm_super *mpb) +{ + return &mpb->sig[MPB_SIG_LEN]; +} +#endif + +/* retrieve a disk directly from the anchor when the anchor is known to be + * up-to-date, currently only at load time + */ +static struct imsm_disk *__get_imsm_disk(struct imsm_super *mpb, __u8 index) +{ + if (index >= mpb->num_disks) + return NULL; + return &mpb->disk[index]; +} + +/* retrieve the disk description based on a index of the disk + * in the sub-array + */ +static struct dl *get_imsm_dl_disk(struct intel_super *super, __u8 index) +{ + struct dl *d; + + for (d = super->disks; d; d = d->next) + if (d->index == index) + return d; + + return NULL; +} +/* retrieve a disk from the parsed metadata */ +static struct imsm_disk *get_imsm_disk(struct intel_super *super, __u8 index) +{ + struct dl *dl; + + dl = get_imsm_dl_disk(super, index); + if (dl) + return &dl->disk; + + return NULL; +} + +/* generate a checksum directly from the anchor when the anchor is known to be + * up-to-date, currently only at load or write_super after coalescing + */ +static __u32 __gen_imsm_checksum(struct imsm_super *mpb) +{ + __u32 end = mpb->mpb_size / sizeof(end); + __u32 *p = (__u32 *) mpb; + __u32 sum = 0; + + while (end--) { + sum += __le32_to_cpu(*p); + p++; + } + + return sum - __le32_to_cpu(mpb->check_sum); +} + +static size_t sizeof_imsm_map(struct imsm_map *map) +{ + return sizeof(struct imsm_map) + sizeof(__u32) * (map->num_members - 1); +} + +struct imsm_map *get_imsm_map(struct imsm_dev *dev, int second_map) +{ + /* A device can have 2 maps if it is in the middle of a migration. + * If second_map is: + * MAP_0 - we return the first map + * MAP_1 - we return the second map if it exists, else NULL + * MAP_X - we return the second map if it exists, else the first + */ + struct imsm_map *map = &dev->vol.map[0]; + struct imsm_map *map2 = NULL; + + if (dev->vol.migr_state) + map2 = (void *)map + sizeof_imsm_map(map); + + switch (second_map) { + case MAP_0: + break; + case MAP_1: + map = map2; + break; + case MAP_X: + if (map2) + map = map2; + break; + default: + map = NULL; + } + return map; + +} + +/* return the size of the device. + * migr_state increases the returned size if map[0] were to be duplicated + */ +static size_t sizeof_imsm_dev(struct imsm_dev *dev, int migr_state) +{ + size_t size = sizeof(*dev) - sizeof(struct imsm_map) + + sizeof_imsm_map(get_imsm_map(dev, MAP_0)); + + /* migrating means an additional map */ + if (dev->vol.migr_state) + size += sizeof_imsm_map(get_imsm_map(dev, MAP_1)); + else if (migr_state) + size += sizeof_imsm_map(get_imsm_map(dev, MAP_0)); + + return size; +} + +#ifndef MDASSEMBLE +/* retrieve disk serial number list from a metadata update */ +static struct disk_info *get_disk_info(struct imsm_update_create_array *update) +{ + void *u = update; + struct disk_info *inf; + + inf = u + sizeof(*update) - sizeof(struct imsm_dev) + + sizeof_imsm_dev(&update->dev, 0); + + return inf; +} +#endif + +static struct imsm_dev *__get_imsm_dev(struct imsm_super *mpb, __u8 index) +{ + int offset; + int i; + void *_mpb = mpb; + + if (index >= mpb->num_raid_devs) + return NULL; + + /* devices start after all disks */ + offset = ((void *) &mpb->disk[mpb->num_disks]) - _mpb; + + for (i = 0; i <= index; i++) + if (i == index) + return _mpb + offset; + else + offset += sizeof_imsm_dev(_mpb + offset, 0); + + return NULL; +} + +static struct imsm_dev *get_imsm_dev(struct intel_super *super, __u8 index) +{ + struct intel_dev *dv; + + if (index >= super->anchor->num_raid_devs) + return NULL; + for (dv = super->devlist; dv; dv = dv->next) + if (dv->index == index) + return dv->dev; + return NULL; +} + +/* + * for second_map: + * == MAP_0 get first map + * == MAP_1 get second map + * == MAP_X than get map according to the current migr_state + */ +static __u32 get_imsm_ord_tbl_ent(struct imsm_dev *dev, + int slot, + int second_map) +{ + struct imsm_map *map; + + map = get_imsm_map(dev, second_map); + + /* top byte identifies disk under rebuild */ + return __le32_to_cpu(map->disk_ord_tbl[slot]); +} + +#define ord_to_idx(ord) (((ord) << 8) >> 8) +static __u32 get_imsm_disk_idx(struct imsm_dev *dev, int slot, int second_map) +{ + __u32 ord = get_imsm_ord_tbl_ent(dev, slot, second_map); + + return ord_to_idx(ord); +} + +static void set_imsm_ord_tbl_ent(struct imsm_map *map, int slot, __u32 ord) +{ + map->disk_ord_tbl[slot] = __cpu_to_le32(ord); +} + +static int get_imsm_disk_slot(struct imsm_map *map, unsigned idx) +{ + int slot; + __u32 ord; + + for (slot = 0; slot < map->num_members; slot++) { + ord = __le32_to_cpu(map->disk_ord_tbl[slot]); + if (ord_to_idx(ord) == idx) + return slot; + } + + return -1; +} + +static int get_imsm_raid_level(struct imsm_map *map) +{ + if (map->raid_level == 1) { + if (map->num_members == 2) + return 1; + else + return 10; + } + + return map->raid_level; +} + +static int cmp_extent(const void *av, const void *bv) +{ + const struct extent *a = av; + const struct extent *b = bv; + if (a->start < b->start) + return -1; + if (a->start > b->start) + return 1; + return 0; +} + +static int count_memberships(struct dl *dl, struct intel_super *super) +{ + int memberships = 0; + int i; + + for (i = 0; i < super->anchor->num_raid_devs; i++) { + struct imsm_dev *dev = get_imsm_dev(super, i); + struct imsm_map *map = get_imsm_map(dev, MAP_0); + + if (get_imsm_disk_slot(map, dl->index) >= 0) + memberships++; + } + + return memberships; +} + +static __u32 imsm_min_reserved_sectors(struct intel_super *super); + +static int split_ull(unsigned long long n, __u32 *lo, __u32 *hi) +{ + if (lo == 0 || hi == 0) + return 1; + *lo = __le32_to_cpu((unsigned)n); + *hi = __le32_to_cpu((unsigned)(n >> 32)); + return 0; +} + +static unsigned long long join_u32(__u32 lo, __u32 hi) +{ + return (unsigned long long)__le32_to_cpu(lo) | + (((unsigned long long)__le32_to_cpu(hi)) << 32); +} + +static unsigned long long total_blocks(struct imsm_disk *disk) +{ + if (disk == NULL) + return 0; + return join_u32(disk->total_blocks_lo, disk->total_blocks_hi); +} + +static unsigned long long pba_of_lba0(struct imsm_map *map) +{ + if (map == NULL) + return 0; + return join_u32(map->pba_of_lba0_lo, map->pba_of_lba0_hi); +} + +static unsigned long long blocks_per_member(struct imsm_map *map) +{ + if (map == NULL) + return 0; + return join_u32(map->blocks_per_member_lo, map->blocks_per_member_hi); +} + +#ifndef MDASSEMBLE +static unsigned long long num_data_stripes(struct imsm_map *map) +{ + if (map == NULL) + return 0; + return join_u32(map->num_data_stripes_lo, map->num_data_stripes_hi); +} + +static void set_total_blocks(struct imsm_disk *disk, unsigned long long n) +{ + split_ull(n, &disk->total_blocks_lo, &disk->total_blocks_hi); +} +#endif + +static void set_pba_of_lba0(struct imsm_map *map, unsigned long long n) +{ + split_ull(n, &map->pba_of_lba0_lo, &map->pba_of_lba0_hi); +} + +static void set_blocks_per_member(struct imsm_map *map, unsigned long long n) +{ + split_ull(n, &map->blocks_per_member_lo, &map->blocks_per_member_hi); +} + +static void set_num_data_stripes(struct imsm_map *map, unsigned long long n) +{ + split_ull(n, &map->num_data_stripes_lo, &map->num_data_stripes_hi); +} + +static struct extent *get_extents(struct intel_super *super, struct dl *dl) +{ + /* find a list of used extents on the given physical device */ + struct extent *rv, *e; + int i; + int memberships = count_memberships(dl, super); + __u32 reservation; + + /* trim the reserved area for spares, so they can join any array + * regardless of whether the OROM has assigned sectors from the + * IMSM_RESERVED_SECTORS region + */ + if (dl->index == -1) + reservation = imsm_min_reserved_sectors(super); + else + reservation = MPB_SECTOR_CNT + IMSM_RESERVED_SECTORS; + + rv = xcalloc(sizeof(struct extent), (memberships + 1)); + e = rv; + + for (i = 0; i < super->anchor->num_raid_devs; i++) { + struct imsm_dev *dev = get_imsm_dev(super, i); + struct imsm_map *map = get_imsm_map(dev, MAP_0); + + if (get_imsm_disk_slot(map, dl->index) >= 0) { + e->start = pba_of_lba0(map); + e->size = blocks_per_member(map); + e++; + } + } + qsort(rv, memberships, sizeof(*rv), cmp_extent); + + /* determine the start of the metadata + * when no raid devices are defined use the default + * ...otherwise allow the metadata to truncate the value + * as is the case with older versions of imsm + */ + if (memberships) { + struct extent *last = &rv[memberships - 1]; + unsigned long long remainder; + + remainder = total_blocks(&dl->disk) - (last->start + last->size); + /* round down to 1k block to satisfy precision of the kernel + * 'size' interface + */ + remainder &= ~1UL; + /* make sure remainder is still sane */ + if (remainder < (unsigned)ROUND_UP(super->len, 512) >> 9) + remainder = ROUND_UP(super->len, 512) >> 9; + if (reservation > remainder) + reservation = remainder; + } + e->start = total_blocks(&dl->disk) - reservation; + e->size = 0; + return rv; +} + +/* try to determine how much space is reserved for metadata from + * the last get_extents() entry, otherwise fallback to the + * default + */ +static __u32 imsm_reserved_sectors(struct intel_super *super, struct dl *dl) +{ + struct extent *e; + int i; + __u32 rv; + + /* for spares just return a minimal reservation which will grow + * once the spare is picked up by an array + */ + if (dl->index == -1) + return MPB_SECTOR_CNT; + + e = get_extents(super, dl); + if (!e) + return MPB_SECTOR_CNT + IMSM_RESERVED_SECTORS; + + /* scroll to last entry */ + for (i = 0; e[i].size; i++) + continue; + + rv = total_blocks(&dl->disk) - e[i].start; + + free(e); + + return rv; +} + +static int is_spare(struct imsm_disk *disk) +{ + return (disk->status & SPARE_DISK) == SPARE_DISK; +} + +static int is_configured(struct imsm_disk *disk) +{ + return (disk->status & CONFIGURED_DISK) == CONFIGURED_DISK; +} + +static int is_failed(struct imsm_disk *disk) +{ + return (disk->status & FAILED_DISK) == FAILED_DISK; +} + +/* try to determine how much space is reserved for metadata from + * the last get_extents() entry on the smallest active disk, + * otherwise fallback to the default + */ +static __u32 imsm_min_reserved_sectors(struct intel_super *super) +{ + struct extent *e; + int i; + unsigned long long min_active; + __u32 remainder; + __u32 rv = MPB_SECTOR_CNT + IMSM_RESERVED_SECTORS; + struct dl *dl, *dl_min = NULL; + + if (!super) + return rv; + + min_active = 0; + for (dl = super->disks; dl; dl = dl->next) { + if (dl->index < 0) + continue; + unsigned long long blocks = total_blocks(&dl->disk); + if (blocks < min_active || min_active == 0) { + dl_min = dl; + min_active = blocks; + } + } + if (!dl_min) + return rv; + + /* find last lba used by subarrays on the smallest active disk */ + e = get_extents(super, dl_min); + if (!e) + return rv; + for (i = 0; e[i].size; i++) + continue; + + remainder = min_active - e[i].start; + free(e); + + /* to give priority to recovery we should not require full + IMSM_RESERVED_SECTORS from the spare */ + rv = MPB_SECTOR_CNT + NUM_BLOCKS_DIRTY_STRIPE_REGION; + + /* if real reservation is smaller use that value */ + return (remainder < rv) ? remainder : rv; +} + +/* Return minimum size of a spare that can be used in this array*/ +static unsigned long long min_acceptable_spare_size_imsm(struct supertype *st) +{ + struct intel_super *super = st->sb; + struct dl *dl; + struct extent *e; + int i; + unsigned long long rv = 0; + + if (!super) + return rv; + /* find first active disk in array */ + dl = super->disks; + while (dl && (is_failed(&dl->disk) || dl->index == -1)) + dl = dl->next; + if (!dl) + return rv; + /* find last lba used by subarrays */ + e = get_extents(super, dl); + if (!e) + return rv; + for (i = 0; e[i].size; i++) + continue; + if (i > 0) + rv = e[i-1].start + e[i-1].size; + free(e); + + /* add the amount of space needed for metadata */ + rv = rv + imsm_min_reserved_sectors(super); + + return rv * 512; +} + +static int is_gen_migration(struct imsm_dev *dev); + +#ifndef MDASSEMBLE +static __u64 blocks_per_migr_unit(struct intel_super *super, + struct imsm_dev *dev); + +static void print_imsm_dev(struct intel_super *super, + struct imsm_dev *dev, + char *uuid, + int disk_idx) +{ + __u64 sz; + int slot, i; + struct imsm_map *map = get_imsm_map(dev, MAP_0); + struct imsm_map *map2 = get_imsm_map(dev, MAP_1); + __u32 ord; + + printf("\n"); + printf("[%.16s]:\n", dev->volume); + printf(" UUID : %s\n", uuid); + printf(" RAID Level : %d", get_imsm_raid_level(map)); + if (map2) + printf(" <-- %d", get_imsm_raid_level(map2)); + printf("\n"); + printf(" Members : %d", map->num_members); + if (map2) + printf(" <-- %d", map2->num_members); + printf("\n"); + printf(" Slots : ["); + for (i = 0; i < map->num_members; i++) { + ord = get_imsm_ord_tbl_ent(dev, i, MAP_0); + printf("%s", ord & IMSM_ORD_REBUILD ? "_" : "U"); + } + printf("]"); + if (map2) { + printf(" <-- ["); + for (i = 0; i < map2->num_members; i++) { + ord = get_imsm_ord_tbl_ent(dev, i, MAP_1); + printf("%s", ord & IMSM_ORD_REBUILD ? "_" : "U"); + } + printf("]"); + } + printf("\n"); + printf(" Failed disk : "); + if (map->failed_disk_num == 0xff) + printf("none"); + else + printf("%i", map->failed_disk_num); + printf("\n"); + slot = get_imsm_disk_slot(map, disk_idx); + if (slot >= 0) { + ord = get_imsm_ord_tbl_ent(dev, slot, MAP_X); + printf(" This Slot : %d%s\n", slot, + ord & IMSM_ORD_REBUILD ? " (out-of-sync)" : ""); + } else + printf(" This Slot : ?\n"); + sz = __le32_to_cpu(dev->size_high); + sz <<= 32; + sz += __le32_to_cpu(dev->size_low); + printf(" Array Size : %llu%s\n", (unsigned long long)sz, + human_size(sz * 512)); + sz = blocks_per_member(map); + printf(" Per Dev Size : %llu%s\n", (unsigned long long)sz, + human_size(sz * 512)); + printf(" Sector Offset : %llu\n", + pba_of_lba0(map)); + printf(" Num Stripes : %llu\n", + num_data_stripes(map)); + printf(" Chunk Size : %u KiB", + __le16_to_cpu(map->blocks_per_strip) / 2); + if (map2) + printf(" <-- %u KiB", + __le16_to_cpu(map2->blocks_per_strip) / 2); + printf("\n"); + printf(" Reserved : %d\n", __le32_to_cpu(dev->reserved_blocks)); + printf(" Migrate State : "); + if (dev->vol.migr_state) { + if (migr_type(dev) == MIGR_INIT) + printf("initialize\n"); + else if (migr_type(dev) == MIGR_REBUILD) + printf("rebuild\n"); + else if (migr_type(dev) == MIGR_VERIFY) + printf("check\n"); + else if (migr_type(dev) == MIGR_GEN_MIGR) + printf("general migration\n"); + else if (migr_type(dev) == MIGR_STATE_CHANGE) + printf("state change\n"); + else if (migr_type(dev) == MIGR_REPAIR) + printf("repair\n"); + else + printf("<unknown:%d>\n", migr_type(dev)); + } else + printf("idle\n"); + printf(" Map State : %s", map_state_str[map->map_state]); + if (dev->vol.migr_state) { + struct imsm_map *map = get_imsm_map(dev, MAP_1); + + printf(" <-- %s", map_state_str[map->map_state]); + printf("\n Checkpoint : %u ", + __le32_to_cpu(dev->vol.curr_migr_unit)); + if ((is_gen_migration(dev)) && ((slot > 1) || (slot < 0))) + printf("(N/A)"); + else + printf("(%llu)", (unsigned long long) + blocks_per_migr_unit(super, dev)); + } + printf("\n"); + printf(" Dirty State : %s\n", dev->vol.dirty ? "dirty" : "clean"); +} + +static void print_imsm_disk(struct imsm_disk *disk, int index, __u32 reserved) +{ + char str[MAX_RAID_SERIAL_LEN + 1]; + __u64 sz; + + if (index < -1 || !disk) + return; + + printf("\n"); + snprintf(str, MAX_RAID_SERIAL_LEN + 1, "%s", disk->serial); + if (index >= 0) + printf(" Disk%02d Serial : %s\n", index, str); + else + printf(" Disk Serial : %s\n", str); + printf(" State :%s%s%s\n", is_spare(disk) ? " spare" : "", + is_configured(disk) ? " active" : "", + is_failed(disk) ? " failed" : ""); + printf(" Id : %08x\n", __le32_to_cpu(disk->scsi_id)); + sz = total_blocks(disk) - reserved; + printf(" Usable Size : %llu%s\n", (unsigned long long)sz, + human_size(sz * 512)); +} + +void examine_migr_rec_imsm(struct intel_super *super) +{ + struct migr_record *migr_rec = super->migr_rec; + struct imsm_super *mpb = super->anchor; + int i; + + for (i = 0; i < mpb->num_raid_devs; i++) { + struct imsm_dev *dev = __get_imsm_dev(mpb, i); + struct imsm_map *map; + int slot = -1; + + if (is_gen_migration(dev) == 0) + continue; + + printf("\nMigration Record Information:"); + + /* first map under migration */ + map = get_imsm_map(dev, MAP_0); + if (map) + slot = get_imsm_disk_slot(map, super->disks->index); + if ((map == NULL) || (slot > 1) || (slot < 0)) { + printf(" Empty\n "); + printf("Examine one of first two disks in array\n"); + break; + } + printf("\n Status : "); + if (__le32_to_cpu(migr_rec->rec_status) == UNIT_SRC_NORMAL) + printf("Normal\n"); + else + printf("Contains Data\n"); + printf(" Current Unit : %u\n", + __le32_to_cpu(migr_rec->curr_migr_unit)); + printf(" Family : %u\n", + __le32_to_cpu(migr_rec->family_num)); + printf(" Ascending : %u\n", + __le32_to_cpu(migr_rec->ascending_migr)); + printf(" Blocks Per Unit : %u\n", + __le32_to_cpu(migr_rec->blocks_per_unit)); + printf(" Dest. Depth Per Unit : %u\n", + __le32_to_cpu(migr_rec->dest_depth_per_unit)); + printf(" Checkpoint Area pba : %u\n", + __le32_to_cpu(migr_rec->ckpt_area_pba)); + printf(" First member lba : %u\n", + __le32_to_cpu(migr_rec->dest_1st_member_lba)); + printf(" Total Number of Units : %u\n", + __le32_to_cpu(migr_rec->num_migr_units)); + printf(" Size of volume : %u\n", + __le32_to_cpu(migr_rec->post_migr_vol_cap)); + printf(" Expansion space for LBA64 : %u\n", + __le32_to_cpu(migr_rec->post_migr_vol_cap_hi)); + printf(" Record was read from : %u\n", + __le32_to_cpu(migr_rec->ckpt_read_disk_num)); + + break; + } +} +#endif /* MDASSEMBLE */ +/******************************************************************************* + * function: imsm_check_attributes + * Description: Function checks if features represented by attributes flags + * are supported by mdadm. + * Parameters: + * attributes - Attributes read from metadata + * Returns: + * 0 - passed attributes contains unsupported features flags + * 1 - all features are supported + ******************************************************************************/ +static int imsm_check_attributes(__u32 attributes) +{ + int ret_val = 1; + __u32 not_supported = MPB_ATTRIB_SUPPORTED^0xffffffff; + + not_supported &= ~MPB_ATTRIB_IGNORED; + + not_supported &= attributes; + if (not_supported) { + pr_err("(IMSM): Unsupported attributes : %x\n", + (unsigned)__le32_to_cpu(not_supported)); + if (not_supported & MPB_ATTRIB_CHECKSUM_VERIFY) { + dprintf("\t\tMPB_ATTRIB_CHECKSUM_VERIFY \n"); + not_supported ^= MPB_ATTRIB_CHECKSUM_VERIFY; + } + if (not_supported & MPB_ATTRIB_2TB) { + dprintf("\t\tMPB_ATTRIB_2TB\n"); + not_supported ^= MPB_ATTRIB_2TB; + } + if (not_supported & MPB_ATTRIB_RAID0) { + dprintf("\t\tMPB_ATTRIB_RAID0\n"); + not_supported ^= MPB_ATTRIB_RAID0; + } + if (not_supported & MPB_ATTRIB_RAID1) { + dprintf("\t\tMPB_ATTRIB_RAID1\n"); + not_supported ^= MPB_ATTRIB_RAID1; + } + if (not_supported & MPB_ATTRIB_RAID10) { + dprintf("\t\tMPB_ATTRIB_RAID10\n"); + not_supported ^= MPB_ATTRIB_RAID10; + } + if (not_supported & MPB_ATTRIB_RAID1E) { + dprintf("\t\tMPB_ATTRIB_RAID1E\n"); + not_supported ^= MPB_ATTRIB_RAID1E; + } + if (not_supported & MPB_ATTRIB_RAID5) { + dprintf("\t\tMPB_ATTRIB_RAID5\n"); + not_supported ^= MPB_ATTRIB_RAID5; + } + if (not_supported & MPB_ATTRIB_RAIDCNG) { + dprintf("\t\tMPB_ATTRIB_RAIDCNG\n"); + not_supported ^= MPB_ATTRIB_RAIDCNG; + } + if (not_supported & MPB_ATTRIB_BBM) { + dprintf("\t\tMPB_ATTRIB_BBM\n"); + not_supported ^= MPB_ATTRIB_BBM; + } + if (not_supported & MPB_ATTRIB_CHECKSUM_VERIFY) { + dprintf("\t\tMPB_ATTRIB_CHECKSUM_VERIFY (== MPB_ATTRIB_LEGACY)\n"); + not_supported ^= MPB_ATTRIB_CHECKSUM_VERIFY; + } + if (not_supported & MPB_ATTRIB_EXP_STRIPE_SIZE) { + dprintf("\t\tMPB_ATTRIB_EXP_STRIP_SIZE\n"); + not_supported ^= MPB_ATTRIB_EXP_STRIPE_SIZE; + } + if (not_supported & MPB_ATTRIB_2TB_DISK) { + dprintf("\t\tMPB_ATTRIB_2TB_DISK\n"); + not_supported ^= MPB_ATTRIB_2TB_DISK; + } + if (not_supported & MPB_ATTRIB_NEVER_USE2) { + dprintf("\t\tMPB_ATTRIB_NEVER_USE2\n"); + not_supported ^= MPB_ATTRIB_NEVER_USE2; + } + if (not_supported & MPB_ATTRIB_NEVER_USE) { + dprintf("\t\tMPB_ATTRIB_NEVER_USE\n"); + not_supported ^= MPB_ATTRIB_NEVER_USE; + } + + if (not_supported) + dprintf("(IMSM): Unknown attributes : %x\n", not_supported); + + ret_val = 0; + } + + return ret_val; +} + +#ifndef MDASSEMBLE +static void getinfo_super_imsm(struct supertype *st, struct mdinfo *info, char *map); + +static void examine_super_imsm(struct supertype *st, char *homehost) +{ + struct intel_super *super = st->sb; + struct imsm_super *mpb = super->anchor; + char str[MAX_SIGNATURE_LENGTH]; + int i; + struct mdinfo info; + char nbuf[64]; + __u32 sum; + __u32 reserved = imsm_reserved_sectors(super, super->disks); + struct dl *dl; + + snprintf(str, MPB_SIG_LEN, "%s", mpb->sig); + printf(" Magic : %s\n", str); + snprintf(str, strlen(MPB_VERSION_RAID0), "%s", get_imsm_version(mpb)); + printf(" Version : %s\n", get_imsm_version(mpb)); + printf(" Orig Family : %08x\n", __le32_to_cpu(mpb->orig_family_num)); + printf(" Family : %08x\n", __le32_to_cpu(mpb->family_num)); + printf(" Generation : %08x\n", __le32_to_cpu(mpb->generation_num)); + printf(" Attributes : "); + if (imsm_check_attributes(mpb->attributes)) + printf("All supported\n"); + else + printf("not supported\n"); + getinfo_super_imsm(st, &info, NULL); + fname_from_uuid(st, &info, nbuf, ':'); + printf(" UUID : %s\n", nbuf + 5); + sum = __le32_to_cpu(mpb->check_sum); + printf(" Checksum : %08x %s\n", sum, + __gen_imsm_checksum(mpb) == sum ? "correct" : "incorrect"); + printf(" MPB Sectors : %d\n", mpb_sectors(mpb)); + printf(" Disks : %d\n", mpb->num_disks); + printf(" RAID Devices : %d\n", mpb->num_raid_devs); + print_imsm_disk(__get_imsm_disk(mpb, super->disks->index), super->disks->index, reserved); + if (super->bbm_log) { + struct bbm_log *log = super->bbm_log; + + printf("\n"); + printf("Bad Block Management Log:\n"); + printf(" Log Size : %d\n", __le32_to_cpu(mpb->bbm_log_size)); + printf(" Signature : %x\n", __le32_to_cpu(log->signature)); + printf(" Entry Count : %d\n", __le32_to_cpu(log->entry_count)); + printf(" Spare Blocks : %d\n", __le32_to_cpu(log->reserved_spare_block_count)); + printf(" First Spare : %llx\n", + (unsigned long long) __le64_to_cpu(log->first_spare_lba)); + } + for (i = 0; i < mpb->num_raid_devs; i++) { + struct mdinfo info; + struct imsm_dev *dev = __get_imsm_dev(mpb, i); + + super->current_vol = i; + getinfo_super_imsm(st, &info, NULL); + fname_from_uuid(st, &info, nbuf, ':'); + print_imsm_dev(super, dev, nbuf + 5, super->disks->index); + } + for (i = 0; i < mpb->num_disks; i++) { + if (i == super->disks->index) + continue; + print_imsm_disk(__get_imsm_disk(mpb, i), i, reserved); + } + + for (dl = super->disks; dl; dl = dl->next) + if (dl->index == -1) + print_imsm_disk(&dl->disk, -1, reserved); + + examine_migr_rec_imsm(super); +} + +static void brief_examine_super_imsm(struct supertype *st, int verbose) +{ + /* We just write a generic IMSM ARRAY entry */ + struct mdinfo info; + char nbuf[64]; + struct intel_super *super = st->sb; + + if (!super->anchor->num_raid_devs) { + printf("ARRAY metadata=imsm\n"); + return; + } + + getinfo_super_imsm(st, &info, NULL); + fname_from_uuid(st, &info, nbuf, ':'); + printf("ARRAY metadata=imsm UUID=%s\n", nbuf + 5); +} + +static void brief_examine_subarrays_imsm(struct supertype *st, int verbose) +{ + /* We just write a generic IMSM ARRAY entry */ + struct mdinfo info; + char nbuf[64]; + char nbuf1[64]; + struct intel_super *super = st->sb; + int i; + + if (!super->anchor->num_raid_devs) + return; + + getinfo_super_imsm(st, &info, NULL); + fname_from_uuid(st, &info, nbuf, ':'); + for (i = 0; i < super->anchor->num_raid_devs; i++) { + struct imsm_dev *dev = get_imsm_dev(super, i); + + super->current_vol = i; + getinfo_super_imsm(st, &info, NULL); + fname_from_uuid(st, &info, nbuf1, ':'); + printf("ARRAY /dev/md/%.16s container=%s member=%d UUID=%s\n", + dev->volume, nbuf + 5, i, nbuf1 + 5); + } +} + +static void export_examine_super_imsm(struct supertype *st) +{ + struct intel_super *super = st->sb; + struct imsm_super *mpb = super->anchor; + struct mdinfo info; + char nbuf[64]; + + getinfo_super_imsm(st, &info, NULL); + fname_from_uuid(st, &info, nbuf, ':'); + printf("MD_METADATA=imsm\n"); + printf("MD_LEVEL=container\n"); + printf("MD_UUID=%s\n", nbuf+5); + printf("MD_DEVICES=%u\n", mpb->num_disks); +} + +static int copy_metadata_imsm(struct supertype *st, int from, int to) +{ + /* The second last 512byte sector of the device contains + * the "struct imsm_super" metadata. + * This contains mpb_size which is the size in bytes of the + * extended metadata. This is located immediately before + * the imsm_super. + * We want to read all that, plus the last sector which + * may contain a migration record, and write it all + * to the target. + */ + void *buf; + unsigned long long dsize, offset; + int sectors; + struct imsm_super *sb; + int written = 0; + + if (posix_memalign(&buf, 4096, 4096) != 0) + return 1; + + if (!get_dev_size(from, NULL, &dsize)) + goto err; + + if (lseek64(from, dsize-1024, 0) < 0) + goto err; + if (read(from, buf, 512) != 512) + goto err; + sb = buf; + if (strncmp((char*)sb->sig, MPB_SIGNATURE, MPB_SIG_LEN) != 0) + goto err; + + sectors = mpb_sectors(sb) + 2; + offset = dsize - sectors * 512; + if (lseek64(from, offset, 0) < 0 || + lseek64(to, offset, 0) < 0) + goto err; + while (written < sectors * 512) { + int n = sectors*512 - written; + if (n > 4096) + n = 4096; + if (read(from, buf, n) != n) + goto err; + if (write(to, buf, n) != n) + goto err; + written += n; + } + free(buf); + return 0; +err: + free(buf); + return 1; +} + +static void detail_super_imsm(struct supertype *st, char *homehost) +{ + struct mdinfo info; + char nbuf[64]; + + getinfo_super_imsm(st, &info, NULL); + fname_from_uuid(st, &info, nbuf, ':'); + printf("\n UUID : %s\n", nbuf + 5); +} + +static void brief_detail_super_imsm(struct supertype *st) +{ + struct mdinfo info; + char nbuf[64]; + getinfo_super_imsm(st, &info, NULL); + fname_from_uuid(st, &info, nbuf, ':'); + printf(" UUID=%s", nbuf + 5); +} + +static int imsm_read_serial(int fd, char *devname, __u8 *serial); +static void fd2devname(int fd, char *name); + +static int ahci_enumerate_ports(const char *hba_path, int port_count, int host_base, int verbose) +{ + /* dump an unsorted list of devices attached to AHCI Intel storage + * controller, as well as non-connected ports + */ + int hba_len = strlen(hba_path) + 1; + struct dirent *ent; + DIR *dir; + char *path = NULL; + int err = 0; + unsigned long port_mask = (1 << port_count) - 1; + + if (port_count > (int)sizeof(port_mask) * 8) { + if (verbose > 0) + pr_err("port_count %d out of range\n", port_count); + return 2; + } + + /* scroll through /sys/dev/block looking for devices attached to + * this hba + */ + dir = opendir("/sys/dev/block"); + for (ent = dir ? readdir(dir) : NULL; ent; ent = readdir(dir)) { + int fd; + char model[64]; + char vendor[64]; + char buf[1024]; + int major, minor; + char *device; + char *c; + int port; + int type; + + if (sscanf(ent->d_name, "%d:%d", &major, &minor) != 2) + continue; + path = devt_to_devpath(makedev(major, minor)); + if (!path) + continue; + if (!path_attached_to_hba(path, hba_path)) { + free(path); + path = NULL; + continue; + } + + /* retrieve the scsi device type */ + if (asprintf(&device, "/sys/dev/block/%d:%d/device/xxxxxxx", major, minor) < 0) { + if (verbose > 0) + pr_err("failed to allocate 'device'\n"); + err = 2; + break; + } + sprintf(device, "/sys/dev/block/%d:%d/device/type", major, minor); + if (load_sys(device, buf) != 0) { + if (verbose > 0) + pr_err("failed to read device type for %s\n", + path); + err = 2; + free(device); + break; + } + type = strtoul(buf, NULL, 10); + + /* if it's not a disk print the vendor and model */ + if (!(type == 0 || type == 7 || type == 14)) { + vendor[0] = '\0'; + model[0] = '\0'; + sprintf(device, "/sys/dev/block/%d:%d/device/vendor", major, minor); + if (load_sys(device, buf) == 0) { + strncpy(vendor, buf, sizeof(vendor)); + vendor[sizeof(vendor) - 1] = '\0'; + c = (char *) &vendor[sizeof(vendor) - 1]; + while (isspace(*c) || *c == '\0') + *c-- = '\0'; + + } + sprintf(device, "/sys/dev/block/%d:%d/device/model", major, minor); + if (load_sys(device, buf) == 0) { + strncpy(model, buf, sizeof(model)); + model[sizeof(model) - 1] = '\0'; + c = (char *) &model[sizeof(model) - 1]; + while (isspace(*c) || *c == '\0') + *c-- = '\0'; + } + + if (vendor[0] && model[0]) + sprintf(buf, "%.64s %.64s", vendor, model); + else + switch (type) { /* numbers from hald/linux/device.c */ + case 1: sprintf(buf, "tape"); break; + case 2: sprintf(buf, "printer"); break; + case 3: sprintf(buf, "processor"); break; + case 4: + case 5: sprintf(buf, "cdrom"); break; + case 6: sprintf(buf, "scanner"); break; + case 8: sprintf(buf, "media_changer"); break; + case 9: sprintf(buf, "comm"); break; + case 12: sprintf(buf, "raid"); break; + default: sprintf(buf, "unknown"); + } + } else + buf[0] = '\0'; + free(device); + + /* chop device path to 'host%d' and calculate the port number */ + c = strchr(&path[hba_len], '/'); + if (!c) { + if (verbose > 0) + pr_err("%s - invalid path name\n", path + hba_len); + err = 2; + break; + } + *c = '\0'; + if ((sscanf(&path[hba_len], "ata%d", &port) == 1) || + ((sscanf(&path[hba_len], "host%d", &port) == 1))) + port -= host_base; + else { + if (verbose > 0) { + *c = '/'; /* repair the full string */ + pr_err("failed to determine port number for %s\n", + path); + } + err = 2; + break; + } + + /* mark this port as used */ + port_mask &= ~(1 << port); + + /* print out the device information */ + if (buf[0]) { + printf(" Port%d : - non-disk device (%s) -\n", port, buf); + continue; + } + + fd = dev_open(ent->d_name, O_RDONLY); + if (fd < 0) + printf(" Port%d : - disk info unavailable -\n", port); + else { + fd2devname(fd, buf); + printf(" Port%d : %s", port, buf); + if (imsm_read_serial(fd, NULL, (__u8 *) buf) == 0) + printf(" (%.*s)\n", MAX_RAID_SERIAL_LEN, buf); + else + printf(" ()\n"); + close(fd); + } + free(path); + path = NULL; + } + if (path) + free(path); + if (dir) + closedir(dir); + if (err == 0) { + int i; + + for (i = 0; i < port_count; i++) + if (port_mask & (1 << i)) + printf(" Port%d : - no device attached -\n", i); + } + + return err; +} + +static void print_found_intel_controllers(struct sys_dev *elem) +{ + for (; elem; elem = elem->next) { + pr_err("found Intel(R) "); + if (elem->type == SYS_DEV_SATA) + fprintf(stderr, "SATA "); + else if (elem->type == SYS_DEV_SAS) + fprintf(stderr, "SAS "); + else if (elem->type == SYS_DEV_NVME) + fprintf(stderr, "NVMe "); + fprintf(stderr, "RAID controller"); + if (elem->pci_id) + fprintf(stderr, " at %s", elem->pci_id); + fprintf(stderr, ".\n"); + } + fflush(stderr); +} + +static int ahci_get_port_count(const char *hba_path, int *port_count) +{ + struct dirent *ent; + DIR *dir; + int host_base = -1; + + *port_count = 0; + if ((dir = opendir(hba_path)) == NULL) + return -1; + + for (ent = readdir(dir); ent; ent = readdir(dir)) { + int host; + + if ((sscanf(ent->d_name, "ata%d", &host) != 1) && + ((sscanf(ent->d_name, "host%d", &host) != 1))) + continue; + if (*port_count == 0) + host_base = host; + else if (host < host_base) + host_base = host; + + if (host + 1 > *port_count + host_base) + *port_count = host + 1 - host_base; + } + closedir(dir); + return host_base; +} + +static void print_imsm_capability(const struct imsm_orom *orom) +{ + printf(" Platform : Intel(R) "); + if (orom->capabilities == 0 && orom->driver_features == 0) + printf("Matrix Storage Manager\n"); + else + printf("Rapid Storage Technology%s\n", + imsm_orom_is_enterprise(orom) ? " enterprise" : ""); + if (orom->major_ver || orom->minor_ver || orom->hotfix_ver || orom->build) + printf(" Version : %d.%d.%d.%d\n", orom->major_ver, + orom->minor_ver, orom->hotfix_ver, orom->build); + printf(" RAID Levels :%s%s%s%s%s\n", + imsm_orom_has_raid0(orom) ? " raid0" : "", + imsm_orom_has_raid1(orom) ? " raid1" : "", + imsm_orom_has_raid1e(orom) ? " raid1e" : "", + imsm_orom_has_raid10(orom) ? " raid10" : "", + imsm_orom_has_raid5(orom) ? " raid5" : ""); + printf(" Chunk Sizes :%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s\n", + imsm_orom_has_chunk(orom, 2) ? " 2k" : "", + imsm_orom_has_chunk(orom, 4) ? " 4k" : "", + imsm_orom_has_chunk(orom, 8) ? " 8k" : "", + imsm_orom_has_chunk(orom, 16) ? " 16k" : "", + imsm_orom_has_chunk(orom, 32) ? " 32k" : "", + imsm_orom_has_chunk(orom, 64) ? " 64k" : "", + imsm_orom_has_chunk(orom, 128) ? " 128k" : "", + imsm_orom_has_chunk(orom, 256) ? " 256k" : "", + imsm_orom_has_chunk(orom, 512) ? " 512k" : "", + imsm_orom_has_chunk(orom, 1024*1) ? " 1M" : "", + imsm_orom_has_chunk(orom, 1024*2) ? " 2M" : "", + imsm_orom_has_chunk(orom, 1024*4) ? " 4M" : "", + imsm_orom_has_chunk(orom, 1024*8) ? " 8M" : "", + imsm_orom_has_chunk(orom, 1024*16) ? " 16M" : "", + imsm_orom_has_chunk(orom, 1024*32) ? " 32M" : "", + imsm_orom_has_chunk(orom, 1024*64) ? " 64M" : ""); + printf(" 2TB volumes :%s supported\n", + (orom->attr & IMSM_OROM_ATTR_2TB)?"":" not"); + printf(" 2TB disks :%s supported\n", + (orom->attr & IMSM_OROM_ATTR_2TB_DISK)?"":" not"); + printf(" Max Disks : %d\n", orom->tds); + printf(" Max Volumes : %d per array, %d per %s\n", + orom->vpa, orom->vphba, + imsm_orom_is_nvme(orom) ? "platform" : "controller"); + return; +} + +static void print_imsm_capability_export(const struct imsm_orom *orom) +{ + printf("MD_FIRMWARE_TYPE=imsm\n"); + if (orom->major_ver || orom->minor_ver || orom->hotfix_ver || orom->build) + printf("IMSM_VERSION=%d.%d.%d.%d\n", orom->major_ver, orom->minor_ver, + orom->hotfix_ver, orom->build); + printf("IMSM_SUPPORTED_RAID_LEVELS=%s%s%s%s%s\n", + imsm_orom_has_raid0(orom) ? "raid0 " : "", + imsm_orom_has_raid1(orom) ? "raid1 " : "", + imsm_orom_has_raid1e(orom) ? "raid1e " : "", + imsm_orom_has_raid5(orom) ? "raid10 " : "", + imsm_orom_has_raid10(orom) ? "raid5 " : ""); + printf("IMSM_SUPPORTED_CHUNK_SIZES=%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s\n", + imsm_orom_has_chunk(orom, 2) ? "2k " : "", + imsm_orom_has_chunk(orom, 4) ? "4k " : "", + imsm_orom_has_chunk(orom, 8) ? "8k " : "", + imsm_orom_has_chunk(orom, 16) ? "16k " : "", + imsm_orom_has_chunk(orom, 32) ? "32k " : "", + imsm_orom_has_chunk(orom, 64) ? "64k " : "", + imsm_orom_has_chunk(orom, 128) ? "128k " : "", + imsm_orom_has_chunk(orom, 256) ? "256k " : "", + imsm_orom_has_chunk(orom, 512) ? "512k " : "", + imsm_orom_has_chunk(orom, 1024*1) ? "1M " : "", + imsm_orom_has_chunk(orom, 1024*2) ? "2M " : "", + imsm_orom_has_chunk(orom, 1024*4) ? "4M " : "", + imsm_orom_has_chunk(orom, 1024*8) ? "8M " : "", + imsm_orom_has_chunk(orom, 1024*16) ? "16M " : "", + imsm_orom_has_chunk(orom, 1024*32) ? "32M " : "", + imsm_orom_has_chunk(orom, 1024*64) ? "64M " : ""); + printf("IMSM_2TB_VOLUMES=%s\n",(orom->attr & IMSM_OROM_ATTR_2TB) ? "yes" : "no"); + printf("IMSM_2TB_DISKS=%s\n",(orom->attr & IMSM_OROM_ATTR_2TB_DISK) ? "yes" : "no"); + printf("IMSM_MAX_DISKS=%d\n",orom->tds); + printf("IMSM_MAX_VOLUMES_PER_ARRAY=%d\n",orom->vpa); + printf("IMSM_MAX_VOLUMES_PER_CONTROLLER=%d\n",orom->vphba); +} + +static int detail_platform_imsm(int verbose, int enumerate_only, char *controller_path) +{ + /* There are two components to imsm platform support, the ahci SATA + * controller and the option-rom. To find the SATA controller we + * simply look in /sys/bus/pci/drivers/ahci to see if an ahci + * controller with the Intel vendor id is present. This approach + * allows mdadm to leverage the kernel's ahci detection logic, with the + * caveat that if ahci.ko is not loaded mdadm will not be able to + * detect platform raid capabilities. The option-rom resides in a + * platform "Adapter ROM". We scan for its signature to retrieve the + * platform capabilities. If raid support is disabled in the BIOS the + * option-rom capability structure will not be available. + */ + struct sys_dev *list, *hba; + int host_base = 0; + int port_count = 0; + int result=1; + + if (enumerate_only) { + if (check_env("IMSM_NO_PLATFORM")) + return 0; + list = find_intel_devices(); + if (!list) + return 2; + for (hba = list; hba; hba = hba->next) { + if (find_imsm_capability(hba)) { + result = 0; + break; + } + else + result = 2; + } + return result; + } + + list = find_intel_devices(); + if (!list) { + if (verbose > 0) + pr_err("no active Intel(R) RAID controller found.\n"); + return 2; + } else if (verbose > 0) + print_found_intel_controllers(list); + + for (hba = list; hba; hba = hba->next) { + if (controller_path && (compare_paths(hba->path, controller_path) != 0)) + continue; + if (!find_imsm_capability(hba)) { + pr_err("imsm capabilities not found for controller: %s (type %s)\n", + hba->path, get_sys_dev_type(hba->type)); + continue; + } + result = 0; + } + + if (controller_path && result == 1) { + pr_err("no active Intel(R) RAID controller found under %s\n", + controller_path); + return result; + } + + const struct orom_entry *entry; + + for (entry = orom_entries; entry; entry = entry->next) { + print_imsm_capability(&entry->orom); + + if (imsm_orom_is_nvme(&entry->orom)) { + for (hba = list; hba; hba = hba->next) { + if (hba->type == SYS_DEV_NVME) + printf(" NVMe Device : %s\n", hba->path); + } + continue; + } + + struct devid_list *devid; + for (devid = entry->devid_list; devid; devid = devid->next) { + hba = device_by_id(devid->devid); + if (!hba) + continue; + + printf(" I/O Controller : %s (%s)\n", + hba->path, get_sys_dev_type(hba->type)); + if (hba->type == SYS_DEV_SATA) { + host_base = ahci_get_port_count(hba->path, &port_count); + if (ahci_enumerate_ports(hba->path, port_count, host_base, verbose)) { + if (verbose > 0) + pr_err("failed to enumerate ports on SATA controller at %s.\n", hba->pci_id); + result |= 2; + } + } + } + printf("\n"); + } + + return result; +} + +static int export_detail_platform_imsm(int verbose, char *controller_path) +{ + struct sys_dev *list, *hba; + int result=1; + + list = find_intel_devices(); + if (!list) { + if (verbose > 0) + pr_err("IMSM_DETAIL_PLATFORM_ERROR=NO_INTEL_DEVICES\n"); + result = 2; + return result; + } + + for (hba = list; hba; hba = hba->next) { + if (controller_path && (compare_paths(hba->path,controller_path) != 0)) + continue; + if (!find_imsm_capability(hba) && verbose > 0) + pr_err("IMSM_DETAIL_PLATFORM_ERROR=NO_IMSM_CAPABLE_DEVICE_UNDER_%s\n", hba->path); + else + result = 0; + } + + const struct orom_entry *entry; + + for (entry = orom_entries; entry; entry = entry->next) + print_imsm_capability_export(&entry->orom); + + return result; +} + +#endif + +static int match_home_imsm(struct supertype *st, char *homehost) +{ + /* the imsm metadata format does not specify any host + * identification information. We return -1 since we can never + * confirm nor deny whether a given array is "meant" for this + * host. We rely on compare_super and the 'family_num' fields to + * exclude member disks that do not belong, and we rely on + * mdadm.conf to specify the arrays that should be assembled. + * Auto-assembly may still pick up "foreign" arrays. + */ + + return -1; +} + +static void uuid_from_super_imsm(struct supertype *st, int uuid[4]) +{ + /* The uuid returned here is used for: + * uuid to put into bitmap file (Create, Grow) + * uuid for backup header when saving critical section (Grow) + * comparing uuids when re-adding a device into an array + * In these cases the uuid required is that of the data-array, + * not the device-set. + * uuid to recognise same set when adding a missing device back + * to an array. This is a uuid for the device-set. + * + * For each of these we can make do with a truncated + * or hashed uuid rather than the original, as long as + * everyone agrees. + * In each case the uuid required is that of the data-array, + * not the device-set. + */ + /* imsm does not track uuid's so we synthesis one using sha1 on + * - The signature (Which is constant for all imsm array, but no matter) + * - the orig_family_num of the container + * - the index number of the volume + * - the 'serial' number of the volume. + * Hopefully these are all constant. + */ + struct intel_super *super = st->sb; + + char buf[20]; + struct sha1_ctx ctx; + struct imsm_dev *dev = NULL; + __u32 family_num; + + /* some mdadm versions failed to set ->orig_family_num, in which + * case fall back to ->family_num. orig_family_num will be + * fixed up with the first metadata update. + */ + family_num = super->anchor->orig_family_num; + if (family_num == 0) + family_num = super->anchor->family_num; + sha1_init_ctx(&ctx); + sha1_process_bytes(super->anchor->sig, MPB_SIG_LEN, &ctx); + sha1_process_bytes(&family_num, sizeof(__u32), &ctx); + if (super->current_vol >= 0) + dev = get_imsm_dev(super, super->current_vol); + if (dev) { + __u32 vol = super->current_vol; + sha1_process_bytes(&vol, sizeof(vol), &ctx); + sha1_process_bytes(dev->volume, MAX_RAID_SERIAL_LEN, &ctx); + } + sha1_finish_ctx(&ctx, buf); + memcpy(uuid, buf, 4*4); +} + +#if 0 +static void +get_imsm_numerical_version(struct imsm_super *mpb, int *m, int *p) +{ + __u8 *v = get_imsm_version(mpb); + __u8 *end = mpb->sig + MAX_SIGNATURE_LENGTH; + char major[] = { 0, 0, 0 }; + char minor[] = { 0 ,0, 0 }; + char patch[] = { 0, 0, 0 }; + char *ver_parse[] = { major, minor, patch }; + int i, j; + + i = j = 0; + while (*v != '\0' && v < end) { + if (*v != '.' && j < 2) + ver_parse[i][j++] = *v; + else { + i++; + j = 0; + } + v++; + } + + *m = strtol(minor, NULL, 0); + *p = strtol(patch, NULL, 0); +} +#endif + +static __u32 migr_strip_blocks_resync(struct imsm_dev *dev) +{ + /* migr_strip_size when repairing or initializing parity */ + struct imsm_map *map = get_imsm_map(dev, MAP_0); + __u32 chunk = __le32_to_cpu(map->blocks_per_strip); + + switch (get_imsm_raid_level(map)) { + case 5: + case 10: + return chunk; + default: + return 128*1024 >> 9; + } +} + +static __u32 migr_strip_blocks_rebuild(struct imsm_dev *dev) +{ + /* migr_strip_size when rebuilding a degraded disk, no idea why + * this is different than migr_strip_size_resync(), but it's good + * to be compatible + */ + struct imsm_map *map = get_imsm_map(dev, MAP_1); + __u32 chunk = __le32_to_cpu(map->blocks_per_strip); + + switch (get_imsm_raid_level(map)) { + case 1: + case 10: + if (map->num_members % map->num_domains == 0) + return 128*1024 >> 9; + else + return chunk; + case 5: + return max((__u32) 64*1024 >> 9, chunk); + default: + return 128*1024 >> 9; + } +} + +static __u32 num_stripes_per_unit_resync(struct imsm_dev *dev) +{ + struct imsm_map *lo = get_imsm_map(dev, MAP_0); + struct imsm_map *hi = get_imsm_map(dev, MAP_1); + __u32 lo_chunk = __le32_to_cpu(lo->blocks_per_strip); + __u32 hi_chunk = __le32_to_cpu(hi->blocks_per_strip); + + return max((__u32) 1, hi_chunk / lo_chunk); +} + +static __u32 num_stripes_per_unit_rebuild(struct imsm_dev *dev) +{ + struct imsm_map *lo = get_imsm_map(dev, MAP_0); + int level = get_imsm_raid_level(lo); + + if (level == 1 || level == 10) { + struct imsm_map *hi = get_imsm_map(dev, MAP_1); + + return hi->num_domains; + } else + return num_stripes_per_unit_resync(dev); +} + +static __u8 imsm_num_data_members(struct imsm_dev *dev, int second_map) +{ + /* named 'imsm_' because raid0, raid1 and raid10 + * counter-intuitively have the same number of data disks + */ + struct imsm_map *map = get_imsm_map(dev, second_map); + + switch (get_imsm_raid_level(map)) { + case 0: + return map->num_members; + break; + case 1: + case 10: + return map->num_members/2; + case 5: + return map->num_members - 1; + default: + dprintf("unsupported raid level\n"); + return 0; + } +} + +static __u32 parity_segment_depth(struct imsm_dev *dev) +{ + struct imsm_map *map = get_imsm_map(dev, MAP_0); + __u32 chunk = __le32_to_cpu(map->blocks_per_strip); + + switch(get_imsm_raid_level(map)) { + case 1: + case 10: + return chunk * map->num_domains; + case 5: + return chunk * map->num_members; + default: + return chunk; + } +} + +static __u32 map_migr_block(struct imsm_dev *dev, __u32 block) +{ + struct imsm_map *map = get_imsm_map(dev, MAP_1); + __u32 chunk = __le32_to_cpu(map->blocks_per_strip); + __u32 strip = block / chunk; + + switch (get_imsm_raid_level(map)) { + case 1: + case 10: { + __u32 vol_strip = (strip * map->num_domains) + 1; + __u32 vol_stripe = vol_strip / map->num_members; + + return vol_stripe * chunk + block % chunk; + } case 5: { + __u32 stripe = strip / (map->num_members - 1); + + return stripe * chunk + block % chunk; + } + default: + return 0; + } +} + +static __u64 blocks_per_migr_unit(struct intel_super *super, + struct imsm_dev *dev) +{ + /* calculate the conversion factor between per member 'blocks' + * (md/{resync,rebuild}_start) and imsm migration units, return + * 0 for the 'not migrating' and 'unsupported migration' cases + */ + if (!dev->vol.migr_state) + return 0; + + switch (migr_type(dev)) { + case MIGR_GEN_MIGR: { + struct migr_record *migr_rec = super->migr_rec; + return __le32_to_cpu(migr_rec->blocks_per_unit); + } + case MIGR_VERIFY: + case MIGR_REPAIR: + case MIGR_INIT: { + struct imsm_map *map = get_imsm_map(dev, MAP_0); + __u32 stripes_per_unit; + __u32 blocks_per_unit; + __u32 parity_depth; + __u32 migr_chunk; + __u32 block_map; + __u32 block_rel; + __u32 segment; + __u32 stripe; + __u8 disks; + + /* yes, this is really the translation of migr_units to + * per-member blocks in the 'resync' case + */ + stripes_per_unit = num_stripes_per_unit_resync(dev); + migr_chunk = migr_strip_blocks_resync(dev); + disks = imsm_num_data_members(dev, MAP_0); + blocks_per_unit = stripes_per_unit * migr_chunk * disks; + stripe = __le16_to_cpu(map->blocks_per_strip) * disks; + segment = blocks_per_unit / stripe; + block_rel = blocks_per_unit - segment * stripe; + parity_depth = parity_segment_depth(dev); + block_map = map_migr_block(dev, block_rel); + return block_map + parity_depth * segment; + } + case MIGR_REBUILD: { + __u32 stripes_per_unit; + __u32 migr_chunk; + + stripes_per_unit = num_stripes_per_unit_rebuild(dev); + migr_chunk = migr_strip_blocks_rebuild(dev); + return migr_chunk * stripes_per_unit; + } + case MIGR_STATE_CHANGE: + default: + return 0; + } +} + +static int imsm_level_to_layout(int level) +{ + switch (level) { + case 0: + case 1: + return 0; + case 5: + case 6: + return ALGORITHM_LEFT_ASYMMETRIC; + case 10: + return 0x102; + } + return UnSet; +} + +/******************************************************************************* + * Function: read_imsm_migr_rec + * Description: Function reads imsm migration record from last sector of disk + * Parameters: + * fd : disk descriptor + * super : metadata info + * Returns: + * 0 : success, + * -1 : fail + ******************************************************************************/ +static int read_imsm_migr_rec(int fd, struct intel_super *super) +{ + int ret_val = -1; + unsigned long long dsize; + + get_dev_size(fd, NULL, &dsize); + if (lseek64(fd, dsize - MIGR_REC_POSITION, SEEK_SET) < 0) { + pr_err("Cannot seek to anchor block: %s\n", + strerror(errno)); + goto out; + } + if (read(fd, super->migr_rec_buf, MIGR_REC_BUF_SIZE) != + MIGR_REC_BUF_SIZE) { + pr_err("Cannot read migr record block: %s\n", + strerror(errno)); + goto out; + } + ret_val = 0; + +out: + return ret_val; +} + +static struct imsm_dev *imsm_get_device_during_migration( + struct intel_super *super) +{ + + struct intel_dev *dv; + + for (dv = super->devlist; dv; dv = dv->next) { + if (is_gen_migration(dv->dev)) + return dv->dev; + } + return NULL; +} + +/******************************************************************************* + * Function: load_imsm_migr_rec + * Description: Function reads imsm migration record (it is stored at the last + * sector of disk) + * Parameters: + * super : imsm internal array info + * info : general array info + * Returns: + * 0 : success + * -1 : fail + * -2 : no migration in progress + ******************************************************************************/ +static int load_imsm_migr_rec(struct intel_super *super, struct mdinfo *info) +{ + struct mdinfo *sd; + struct dl *dl = NULL; + char nm[30]; + int retval = -1; + int fd = -1; + struct imsm_dev *dev; + struct imsm_map *map = NULL; + int slot = -1; + + /* find map under migration */ + dev = imsm_get_device_during_migration(super); + /* nothing to load,no migration in progress? + */ + if (dev == NULL) + return -2; + map = get_imsm_map(dev, MAP_0); + + if (info) { + for (sd = info->devs ; sd ; sd = sd->next) { + /* skip spare and failed disks + */ + if (sd->disk.raid_disk < 0) + continue; + /* read only from one of the first two slots */ + if (map) + slot = get_imsm_disk_slot(map, + sd->disk.raid_disk); + if ((map == NULL) || (slot > 1) || (slot < 0)) + continue; + + sprintf(nm, "%d:%d", sd->disk.major, sd->disk.minor); + fd = dev_open(nm, O_RDONLY); + if (fd >= 0) + break; + } + } + if (fd < 0) { + for (dl = super->disks; dl; dl = dl->next) { + /* skip spare and failed disks + */ + if (dl->index < 0) + continue; + /* read only from one of the first two slots */ + if (map) + slot = get_imsm_disk_slot(map, dl->index); + if ((map == NULL) || (slot > 1) || (slot < 0)) + continue; + sprintf(nm, "%d:%d", dl->major, dl->minor); + fd = dev_open(nm, O_RDONLY); + if (fd >= 0) + break; + } + } + if (fd < 0) + goto out; + retval = read_imsm_migr_rec(fd, super); + +out: + if (fd >= 0) + close(fd); + return retval; +} + +#ifndef MDASSEMBLE +/******************************************************************************* + * function: imsm_create_metadata_checkpoint_update + * Description: It creates update for checkpoint change. + * Parameters: + * super : imsm internal array info + * u : pointer to prepared update + * Returns: + * Uptate length. + * If length is equal to 0, input pointer u contains no update + ******************************************************************************/ +static int imsm_create_metadata_checkpoint_update( + struct intel_super *super, + struct imsm_update_general_migration_checkpoint **u) +{ + + int update_memory_size = 0; + + dprintf("(enter)\n"); + + if (u == NULL) + return 0; + *u = NULL; + + /* size of all update data without anchor */ + update_memory_size = + sizeof(struct imsm_update_general_migration_checkpoint); + + *u = xcalloc(1, update_memory_size); + if (*u == NULL) { + dprintf("error: cannot get memory\n"); + return 0; + } + (*u)->type = update_general_migration_checkpoint; + (*u)->curr_migr_unit = __le32_to_cpu(super->migr_rec->curr_migr_unit); + dprintf("prepared for %u\n", (*u)->curr_migr_unit); + + return update_memory_size; +} + +static void imsm_update_metadata_locally(struct supertype *st, + void *buf, int len); + +/******************************************************************************* + * Function: write_imsm_migr_rec + * Description: Function writes imsm migration record + * (at the last sector of disk) + * Parameters: + * super : imsm internal array info + * Returns: + * 0 : success + * -1 : if fail + ******************************************************************************/ +static int write_imsm_migr_rec(struct supertype *st) +{ + struct intel_super *super = st->sb; + unsigned long long dsize; + char nm[30]; + int fd = -1; + int retval = -1; + struct dl *sd; + int len; + struct imsm_update_general_migration_checkpoint *u; + struct imsm_dev *dev; + struct imsm_map *map = NULL; + + /* find map under migration */ + dev = imsm_get_device_during_migration(super); + /* if no migration, write buffer anyway to clear migr_record + * on disk based on first available device + */ + if (dev == NULL) + dev = get_imsm_dev(super, super->current_vol < 0 ? 0 : + super->current_vol); + + map = get_imsm_map(dev, MAP_0); + + for (sd = super->disks ; sd ; sd = sd->next) { + int slot = -1; + + /* skip failed and spare devices */ + if (sd->index < 0) + continue; + /* write to 2 first slots only */ + if (map) + slot = get_imsm_disk_slot(map, sd->index); + if ((map == NULL) || (slot > 1) || (slot < 0)) + continue; + + sprintf(nm, "%d:%d", sd->major, sd->minor); + fd = dev_open(nm, O_RDWR); + if (fd < 0) + continue; + get_dev_size(fd, NULL, &dsize); + if (lseek64(fd, dsize - MIGR_REC_POSITION, SEEK_SET) < 0) { + pr_err("Cannot seek to anchor block: %s\n", + strerror(errno)); + goto out; + } + if (write(fd, super->migr_rec_buf, MIGR_REC_BUF_SIZE) != + MIGR_REC_BUF_SIZE) { + pr_err("Cannot write migr record block: %s\n", + strerror(errno)); + goto out; + } + close(fd); + fd = -1; + } + /* update checkpoint information in metadata */ + len = imsm_create_metadata_checkpoint_update(super, &u); + + if (len <= 0) { + dprintf("imsm: Cannot prepare update\n"); + goto out; + } + /* update metadata locally */ + imsm_update_metadata_locally(st, u, len); + /* and possibly remotely */ + if (st->update_tail) { + append_metadata_update(st, u, len); + /* during reshape we do all work inside metadata handler + * manage_reshape(), so metadata update has to be triggered + * insida it + */ + flush_metadata_updates(st); + st->update_tail = &st->updates; + } else + free(u); + + retval = 0; + out: + if (fd >= 0) + close(fd); + return retval; +} +#endif /* MDASSEMBLE */ + +/* spare/missing disks activations are not allowe when + * array/container performs reshape operation, because + * all arrays in container works on the same disks set + */ +int imsm_reshape_blocks_arrays_changes(struct intel_super *super) +{ + int rv = 0; + struct intel_dev *i_dev; + struct imsm_dev *dev; + + /* check whole container + */ + for (i_dev = super->devlist; i_dev; i_dev = i_dev->next) { + dev = i_dev->dev; + if (is_gen_migration(dev)) { + /* No repair during any migration in container + */ + rv = 1; + break; + } + } + return rv; +} +static unsigned long long imsm_component_size_aligment_check(int level, + int chunk_size, + unsigned long long component_size) +{ + unsigned int component_size_alligment; + + /* check component size aligment + */ + component_size_alligment = component_size % (chunk_size/512); + + dprintf("(Level: %i, chunk_size = %i, component_size = %llu), component_size_alligment = %u\n", + level, chunk_size, component_size, + component_size_alligment); + + if (component_size_alligment && (level != 1) && (level != UnSet)) { + dprintf("imsm: reported component size alligned from %llu ", + component_size); + component_size -= component_size_alligment; + dprintf_cont("to %llu (%i).\n", + component_size, component_size_alligment); + } + + return component_size; +} + +static void getinfo_super_imsm_volume(struct supertype *st, struct mdinfo *info, char *dmap) +{ + struct intel_super *super = st->sb; + struct migr_record *migr_rec = super->migr_rec; + struct imsm_dev *dev = get_imsm_dev(super, super->current_vol); + struct imsm_map *map = get_imsm_map(dev, MAP_0); + struct imsm_map *prev_map = get_imsm_map(dev, MAP_1); + struct imsm_map *map_to_analyse = map; + struct dl *dl; + int map_disks = info->array.raid_disks; + + memset(info, 0, sizeof(*info)); + if (prev_map) + map_to_analyse = prev_map; + + dl = super->current_disk; + + info->container_member = super->current_vol; + info->array.raid_disks = map->num_members; + info->array.level = get_imsm_raid_level(map_to_analyse); + info->array.layout = imsm_level_to_layout(info->array.level); + info->array.md_minor = -1; + info->array.ctime = 0; + info->array.utime = 0; + info->array.chunk_size = + __le16_to_cpu(map_to_analyse->blocks_per_strip) << 9; + info->array.state = !dev->vol.dirty; + info->custom_array_size = __le32_to_cpu(dev->size_high); + info->custom_array_size <<= 32; + info->custom_array_size |= __le32_to_cpu(dev->size_low); + info->recovery_blocked = imsm_reshape_blocks_arrays_changes(st->sb); + + if (is_gen_migration(dev)) { + info->reshape_active = 1; + info->new_level = get_imsm_raid_level(map); + info->new_layout = imsm_level_to_layout(info->new_level); + info->new_chunk = __le16_to_cpu(map->blocks_per_strip) << 9; + info->delta_disks = map->num_members - prev_map->num_members; + if (info->delta_disks) { + /* this needs to be applied to every array + * in the container. + */ + info->reshape_active = CONTAINER_RESHAPE; + } + /* We shape information that we give to md might have to be + * modify to cope with md's requirement for reshaping arrays. + * For example, when reshaping a RAID0, md requires it to be + * presented as a degraded RAID4. + * Also if a RAID0 is migrating to a RAID5 we need to specify + * the array as already being RAID5, but the 'before' layout + * is a RAID4-like layout. + */ + switch (info->array.level) { + case 0: + switch(info->new_level) { + case 0: + /* conversion is happening as RAID4 */ + info->array.level = 4; + info->array.raid_disks += 1; + break; + case 5: + /* conversion is happening as RAID5 */ + info->array.level = 5; + info->array.layout = ALGORITHM_PARITY_N; + info->delta_disks -= 1; + break; + default: + /* FIXME error message */ + info->array.level = UnSet; + break; + } + break; + } + } else { + info->new_level = UnSet; + info->new_layout = UnSet; + info->new_chunk = info->array.chunk_size; + info->delta_disks = 0; + } + + if (dl) { + info->disk.major = dl->major; + info->disk.minor = dl->minor; + info->disk.number = dl->index; + info->disk.raid_disk = get_imsm_disk_slot(map_to_analyse, + dl->index); + } + + info->data_offset = pba_of_lba0(map_to_analyse); + info->component_size = blocks_per_member(map_to_analyse); + + info->component_size = imsm_component_size_aligment_check( + info->array.level, + info->array.chunk_size, + info->component_size); + + memset(info->uuid, 0, sizeof(info->uuid)); + info->recovery_start = MaxSector; + + info->reshape_progress = 0; + info->resync_start = MaxSector; + if ((map_to_analyse->map_state == IMSM_T_STATE_UNINITIALIZED || + dev->vol.dirty) && + imsm_reshape_blocks_arrays_changes(super) == 0) { + info->resync_start = 0; + } + if (dev->vol.migr_state) { + switch (migr_type(dev)) { + case MIGR_REPAIR: + case MIGR_INIT: { + __u64 blocks_per_unit = blocks_per_migr_unit(super, + dev); + __u64 units = __le32_to_cpu(dev->vol.curr_migr_unit); + + info->resync_start = blocks_per_unit * units; + break; + } + case MIGR_GEN_MIGR: { + __u64 blocks_per_unit = blocks_per_migr_unit(super, + dev); + __u64 units = __le32_to_cpu(migr_rec->curr_migr_unit); + unsigned long long array_blocks; + int used_disks; + + if (__le32_to_cpu(migr_rec->ascending_migr) && + (units < + (__le32_to_cpu(migr_rec->num_migr_units)-1)) && + (super->migr_rec->rec_status == + __cpu_to_le32(UNIT_SRC_IN_CP_AREA))) + units++; + + info->reshape_progress = blocks_per_unit * units; + + dprintf("IMSM: General Migration checkpoint : %llu (%llu) -> read reshape progress : %llu\n", + (unsigned long long)units, + (unsigned long long)blocks_per_unit, + info->reshape_progress); + + used_disks = imsm_num_data_members(dev, MAP_1); + if (used_disks > 0) { + array_blocks = blocks_per_member(map) * + used_disks; + /* round array size down to closest MB + */ + info->custom_array_size = (array_blocks + >> SECT_PER_MB_SHIFT) + << SECT_PER_MB_SHIFT; + } + } + case MIGR_VERIFY: + /* we could emulate the checkpointing of + * 'sync_action=check' migrations, but for now + * we just immediately complete them + */ + case MIGR_REBUILD: + /* this is handled by container_content_imsm() */ + case MIGR_STATE_CHANGE: + /* FIXME handle other migrations */ + default: + /* we are not dirty, so... */ + info->resync_start = MaxSector; + } + } + + strncpy(info->name, (char *) dev->volume, MAX_RAID_SERIAL_LEN); + info->name[MAX_RAID_SERIAL_LEN] = 0; + + info->array.major_version = -1; + info->array.minor_version = -2; + sprintf(info->text_version, "/%s/%d", st->container_devnm, info->container_member); + info->safe_mode_delay = 4000; /* 4 secs like the Matrix driver */ + uuid_from_super_imsm(st, info->uuid); + + if (dmap) { + int i, j; + for (i=0; i<map_disks; i++) { + dmap[i] = 0; + if (i < info->array.raid_disks) { + struct imsm_disk *dsk; + j = get_imsm_disk_idx(dev, i, MAP_X); + dsk = get_imsm_disk(super, j); + if (dsk && (dsk->status & CONFIGURED_DISK)) + dmap[i] = 1; + } + } + } +} + +static __u8 imsm_check_degraded(struct intel_super *super, struct imsm_dev *dev, + int failed, int look_in_map); + +static int imsm_count_failed(struct intel_super *super, struct imsm_dev *dev, + int look_in_map); + +#ifndef MDASSEMBLE +static void manage_second_map(struct intel_super *super, struct imsm_dev *dev) +{ + if (is_gen_migration(dev)) { + int failed; + __u8 map_state; + struct imsm_map *map2 = get_imsm_map(dev, MAP_1); + + failed = imsm_count_failed(super, dev, MAP_1); + map_state = imsm_check_degraded(super, dev, failed, MAP_1); + if (map2->map_state != map_state) { + map2->map_state = map_state; + super->updates_pending++; + } + } +} +#endif + +static struct imsm_disk *get_imsm_missing(struct intel_super *super, __u8 index) +{ + struct dl *d; + + for (d = super->missing; d; d = d->next) + if (d->index == index) + return &d->disk; + return NULL; +} + +static void getinfo_super_imsm(struct supertype *st, struct mdinfo *info, char *map) +{ + struct intel_super *super = st->sb; + struct imsm_disk *disk; + int map_disks = info->array.raid_disks; + int max_enough = -1; + int i; + struct imsm_super *mpb; + + if (super->current_vol >= 0) { + getinfo_super_imsm_volume(st, info, map); + return; + } + memset(info, 0, sizeof(*info)); + + /* Set raid_disks to zero so that Assemble will always pull in valid + * spares + */ + info->array.raid_disks = 0; + info->array.level = LEVEL_CONTAINER; + info->array.layout = 0; + info->array.md_minor = -1; + info->array.ctime = 0; /* N/A for imsm */ + info->array.utime = 0; + info->array.chunk_size = 0; + + info->disk.major = 0; + info->disk.minor = 0; + info->disk.raid_disk = -1; + info->reshape_active = 0; + info->array.major_version = -1; + info->array.minor_version = -2; + strcpy(info->text_version, "imsm"); + info->safe_mode_delay = 0; + info->disk.number = -1; + info->disk.state = 0; + info->name[0] = 0; + info->recovery_start = MaxSector; + info->recovery_blocked = imsm_reshape_blocks_arrays_changes(st->sb); + + /* do we have the all the insync disks that we expect? */ + mpb = super->anchor; + + for (i = 0; i < mpb->num_raid_devs; i++) { + struct imsm_dev *dev = get_imsm_dev(super, i); + int failed, enough, j, missing = 0; + struct imsm_map *map; + __u8 state; + + failed = imsm_count_failed(super, dev, MAP_0); + state = imsm_check_degraded(super, dev, failed, MAP_0); + map = get_imsm_map(dev, MAP_0); + + /* any newly missing disks? + * (catches single-degraded vs double-degraded) + */ + for (j = 0; j < map->num_members; j++) { + __u32 ord = get_imsm_ord_tbl_ent(dev, j, MAP_0); + __u32 idx = ord_to_idx(ord); + + if (!(ord & IMSM_ORD_REBUILD) && + get_imsm_missing(super, idx)) { + missing = 1; + break; + } + } + + if (state == IMSM_T_STATE_FAILED) + enough = -1; + else if (state == IMSM_T_STATE_DEGRADED && + (state != map->map_state || missing)) + enough = 0; + else /* we're normal, or already degraded */ + enough = 1; + if (is_gen_migration(dev) && missing) { + /* during general migration we need all disks + * that process is running on. + * No new missing disk is allowed. + */ + max_enough = -1; + enough = -1; + /* no more checks necessary + */ + break; + } + /* in the missing/failed disk case check to see + * if at least one array is runnable + */ + max_enough = max(max_enough, enough); + } + dprintf("enough: %d\n", max_enough); + info->container_enough = max_enough; + + if (super->disks) { + __u32 reserved = imsm_reserved_sectors(super, super->disks); + + disk = &super->disks->disk; + info->data_offset = total_blocks(&super->disks->disk) - reserved; + info->component_size = reserved; + info->disk.state = is_configured(disk) ? (1 << MD_DISK_ACTIVE) : 0; + /* we don't change info->disk.raid_disk here because + * this state will be finalized in mdmon after we have + * found the 'most fresh' version of the metadata + */ + info->disk.state |= is_failed(disk) ? (1 << MD_DISK_FAULTY) : 0; + info->disk.state |= is_spare(disk) ? 0 : (1 << MD_DISK_SYNC); + } + + /* only call uuid_from_super_imsm when this disk is part of a populated container, + * ->compare_super may have updated the 'num_raid_devs' field for spares + */ + if (info->disk.state & (1 << MD_DISK_SYNC) || super->anchor->num_raid_devs) + uuid_from_super_imsm(st, info->uuid); + else + memcpy(info->uuid, uuid_zero, sizeof(uuid_zero)); + + /* I don't know how to compute 'map' on imsm, so use safe default */ + if (map) { + int i; + for (i = 0; i < map_disks; i++) + map[i] = 1; + } + +} + +/* allocates memory and fills disk in mdinfo structure + * for each disk in array */ +struct mdinfo *getinfo_super_disks_imsm(struct supertype *st) +{ + struct mdinfo *mddev = NULL; + struct intel_super *super = st->sb; + struct imsm_disk *disk; + int count = 0; + struct dl *dl; + if (!super || !super->disks) + return NULL; + dl = super->disks; + mddev = xcalloc(1, sizeof(*mddev)); + while (dl) { + struct mdinfo *tmp; + disk = &dl->disk; + tmp = xcalloc(1, sizeof(*tmp)); + if (mddev->devs) + tmp->next = mddev->devs; + mddev->devs = tmp; + tmp->disk.number = count++; + tmp->disk.major = dl->major; + tmp->disk.minor = dl->minor; + tmp->disk.state = is_configured(disk) ? + (1 << MD_DISK_ACTIVE) : 0; + tmp->disk.state |= is_failed(disk) ? (1 << MD_DISK_FAULTY) : 0; + tmp->disk.state |= is_spare(disk) ? 0 : (1 << MD_DISK_SYNC); + tmp->disk.raid_disk = -1; + dl = dl->next; + } + return mddev; +} + +static int update_super_imsm(struct supertype *st, struct mdinfo *info, + char *update, char *devname, int verbose, + int uuid_set, char *homehost) +{ + /* For 'assemble' and 'force' we need to return non-zero if any + * change was made. For others, the return value is ignored. + * Update options are: + * force-one : This device looks a bit old but needs to be included, + * update age info appropriately. + * assemble: clear any 'faulty' flag to allow this device to + * be assembled. + * force-array: Array is degraded but being forced, mark it clean + * if that will be needed to assemble it. + * + * newdev: not used ???? + * grow: Array has gained a new device - this is currently for + * linear only + * resync: mark as dirty so a resync will happen. + * name: update the name - preserving the homehost + * uuid: Change the uuid of the array to match watch is given + * + * Following are not relevant for this imsm: + * sparc2.2 : update from old dodgey metadata + * super-minor: change the preferred_minor number + * summaries: update redundant counters. + * homehost: update the recorded homehost + * _reshape_progress: record new reshape_progress position. + */ + int rv = 1; + struct intel_super *super = st->sb; + struct imsm_super *mpb; + + /* we can only update container info */ + if (!super || super->current_vol >= 0 || !super->anchor) + return 1; + + mpb = super->anchor; + + if (strcmp(update, "uuid") == 0) { + /* We take this to mean that the family_num should be updated. + * However that is much smaller than the uuid so we cannot really + * allow an explicit uuid to be given. And it is hard to reliably + * know if one was. + * So if !uuid_set we know the current uuid is random and just used + * the first 'int' and copy it to the other 3 positions. + * Otherwise we require the 4 'int's to be the same as would be the + * case if we are using a random uuid. So an explicit uuid will be + * accepted as long as all for ints are the same... which shouldn't hurt + */ + if (!uuid_set) { + info->uuid[1] = info->uuid[2] = info->uuid[3] = info->uuid[0]; + rv = 0; + } else { + if (info->uuid[0] != info->uuid[1] || + info->uuid[1] != info->uuid[2] || + info->uuid[2] != info->uuid[3]) + rv = -1; + else + rv = 0; + } + if (rv == 0) + mpb->orig_family_num = info->uuid[0]; + } else if (strcmp(update, "assemble") == 0) + rv = 0; + else + rv = -1; + + /* successful update? recompute checksum */ + if (rv == 0) + mpb->check_sum = __le32_to_cpu(__gen_imsm_checksum(mpb)); + + return rv; +} + +static size_t disks_to_mpb_size(int disks) +{ + size_t size; + + size = sizeof(struct imsm_super); + size += (disks - 1) * sizeof(struct imsm_disk); + size += 2 * sizeof(struct imsm_dev); + /* up to 2 maps per raid device (-2 for imsm_maps in imsm_dev */ + size += (4 - 2) * sizeof(struct imsm_map); + /* 4 possible disk_ord_tbl's */ + size += 4 * (disks - 1) * sizeof(__u32); + + return size; +} + +static __u64 avail_size_imsm(struct supertype *st, __u64 devsize, + unsigned long long data_offset) +{ + if (devsize < (MPB_SECTOR_CNT + IMSM_RESERVED_SECTORS)) + return 0; + + return devsize - (MPB_SECTOR_CNT + IMSM_RESERVED_SECTORS); +} + +static void free_devlist(struct intel_super *super) +{ + struct intel_dev *dv; + + while (super->devlist) { + dv = super->devlist->next; + free(super->devlist->dev); + free(super->devlist); + super->devlist = dv; + } +} + +static void imsm_copy_dev(struct imsm_dev *dest, struct imsm_dev *src) +{ + memcpy(dest, src, sizeof_imsm_dev(src, 0)); +} + +static int compare_super_imsm(struct supertype *st, struct supertype *tst) +{ + /* + * return: + * 0 same, or first was empty, and second was copied + * 1 second had wrong number + * 2 wrong uuid + * 3 wrong other info + */ + struct intel_super *first = st->sb; + struct intel_super *sec = tst->sb; + + if (!first) { + st->sb = tst->sb; + tst->sb = NULL; + return 0; + } + /* in platform dependent environment test if the disks + * use the same Intel hba + * If not on Intel hba at all, allow anything. + */ + if (!check_env("IMSM_NO_PLATFORM") && first->hba && sec->hba) { + if (first->hba->type != sec->hba->type) { + fprintf(stderr, + "HBAs of devices do not match %s != %s\n", + get_sys_dev_type(first->hba->type), + get_sys_dev_type(sec->hba->type)); + return 3; + } + if (first->orom != sec->orom) { + fprintf(stderr, + "HBAs of devices do not match %s != %s\n", + first->hba->pci_id, sec->hba->pci_id); + return 3; + } + } + + /* if an anchor does not have num_raid_devs set then it is a free + * floating spare + */ + if (first->anchor->num_raid_devs > 0 && + sec->anchor->num_raid_devs > 0) { + /* Determine if these disks might ever have been + * related. Further disambiguation can only take place + * in load_super_imsm_all + */ + __u32 first_family = first->anchor->orig_family_num; + __u32 sec_family = sec->anchor->orig_family_num; + + if (memcmp(first->anchor->sig, sec->anchor->sig, + MAX_SIGNATURE_LENGTH) != 0) + return 3; + + if (first_family == 0) + first_family = first->anchor->family_num; + if (sec_family == 0) + sec_family = sec->anchor->family_num; + + if (first_family != sec_family) + return 3; + + } + + /* if 'first' is a spare promote it to a populated mpb with sec's + * family number + */ + if (first->anchor->num_raid_devs == 0 && + sec->anchor->num_raid_devs > 0) { + int i; + struct intel_dev *dv; + struct imsm_dev *dev; + + /* we need to copy raid device info from sec if an allocation + * fails here we don't associate the spare + */ + for (i = 0; i < sec->anchor->num_raid_devs; i++) { + dv = xmalloc(sizeof(*dv)); + dev = xmalloc(sizeof_imsm_dev(get_imsm_dev(sec, i), 1)); + dv->dev = dev; + dv->index = i; + dv->next = first->devlist; + first->devlist = dv; + } + if (i < sec->anchor->num_raid_devs) { + /* allocation failure */ + free_devlist(first); + pr_err("imsm: failed to associate spare\n"); + return 3; + } + first->anchor->num_raid_devs = sec->anchor->num_raid_devs; + first->anchor->orig_family_num = sec->anchor->orig_family_num; + first->anchor->family_num = sec->anchor->family_num; + memcpy(first->anchor->sig, sec->anchor->sig, MAX_SIGNATURE_LENGTH); + for (i = 0; i < sec->anchor->num_raid_devs; i++) + imsm_copy_dev(get_imsm_dev(first, i), get_imsm_dev(sec, i)); + } + + return 0; +} + +static void fd2devname(int fd, char *name) +{ + struct stat st; + char path[256]; + char dname[PATH_MAX]; + char *nm; + int rv; + + name[0] = '\0'; + if (fstat(fd, &st) != 0) + return; + sprintf(path, "/sys/dev/block/%d:%d", + major(st.st_rdev), minor(st.st_rdev)); + + rv = readlink(path, dname, sizeof(dname)-1); + if (rv <= 0) + return; + + dname[rv] = '\0'; + nm = strrchr(dname, '/'); + if (nm) { + nm++; + snprintf(name, MAX_RAID_SERIAL_LEN, "/dev/%s", nm); + } +} + +extern int scsi_get_serial(int fd, void *buf, size_t buf_len); + +static int imsm_read_serial(int fd, char *devname, + __u8 serial[MAX_RAID_SERIAL_LEN]) +{ + unsigned char scsi_serial[255]; + int rv; + int rsp_len; + int len; + char *dest; + char *src; + char *rsp_buf; + int i; + + memset(scsi_serial, 0, sizeof(scsi_serial)); + + rv = scsi_get_serial(fd, scsi_serial, sizeof(scsi_serial)); + + if (rv && check_env("IMSM_DEVNAME_AS_SERIAL")) { + memset(serial, 0, MAX_RAID_SERIAL_LEN); + fd2devname(fd, (char *) serial); + return 0; + } + + if (rv != 0) { + if (devname) + pr_err("Failed to retrieve serial for %s\n", + devname); + return rv; + } + + rsp_len = scsi_serial[3]; + if (!rsp_len) { + if (devname) + pr_err("Failed to retrieve serial for %s\n", + devname); + return 2; + } + rsp_buf = (char *) &scsi_serial[4]; + + /* trim all whitespace and non-printable characters and convert + * ':' to ';' + */ + for (i = 0, dest = rsp_buf; i < rsp_len; i++) { + src = &rsp_buf[i]; + if (*src > 0x20) { + /* ':' is reserved for use in placeholder serial + * numbers for missing disks + */ + if (*src == ':') + *dest++ = ';'; + else + *dest++ = *src; + } + } + len = dest - rsp_buf; + dest = rsp_buf; + + /* truncate leading characters */ + if (len > MAX_RAID_SERIAL_LEN) { + dest += len - MAX_RAID_SERIAL_LEN; + len = MAX_RAID_SERIAL_LEN; + } + + memset(serial, 0, MAX_RAID_SERIAL_LEN); + memcpy(serial, dest, len); + + return 0; +} + +static int serialcmp(__u8 *s1, __u8 *s2) +{ + return strncmp((char *) s1, (char *) s2, MAX_RAID_SERIAL_LEN); +} + +static void serialcpy(__u8 *dest, __u8 *src) +{ + strncpy((char *) dest, (char *) src, MAX_RAID_SERIAL_LEN); +} + +static struct dl *serial_to_dl(__u8 *serial, struct intel_super *super) +{ + struct dl *dl; + + for (dl = super->disks; dl; dl = dl->next) + if (serialcmp(dl->serial, serial) == 0) + break; + + return dl; +} + +static struct imsm_disk * +__serial_to_disk(__u8 *serial, struct imsm_super *mpb, int *idx) +{ + int i; + + for (i = 0; i < mpb->num_disks; i++) { + struct imsm_disk *disk = __get_imsm_disk(mpb, i); + + if (serialcmp(disk->serial, serial) == 0) { + if (idx) + *idx = i; + return disk; + } + } + + return NULL; +} + +static int +load_imsm_disk(int fd, struct intel_super *super, char *devname, int keep_fd) +{ + struct imsm_disk *disk; + struct dl *dl; + struct stat stb; + int rv; + char name[40]; + __u8 serial[MAX_RAID_SERIAL_LEN]; + + rv = imsm_read_serial(fd, devname, serial); + + if (rv != 0) + return 2; + + dl = xcalloc(1, sizeof(*dl)); + + fstat(fd, &stb); + dl->major = major(stb.st_rdev); + dl->minor = minor(stb.st_rdev); + dl->next = super->disks; + dl->fd = keep_fd ? fd : -1; + assert(super->disks == NULL); + super->disks = dl; + serialcpy(dl->serial, serial); + dl->index = -2; + dl->e = NULL; + fd2devname(fd, name); + if (devname) + dl->devname = xstrdup(devname); + else + dl->devname = xstrdup(name); + + /* look up this disk's index in the current anchor */ + disk = __serial_to_disk(dl->serial, super->anchor, &dl->index); + if (disk) { + dl->disk = *disk; + /* only set index on disks that are a member of a + * populated contianer, i.e. one with raid_devs + */ + if (is_failed(&dl->disk)) + dl->index = -2; + else if (is_spare(&dl->disk)) + dl->index = -1; + } + + return 0; +} + +#ifndef MDASSEMBLE +/* When migrating map0 contains the 'destination' state while map1 + * contains the current state. When not migrating map0 contains the + * current state. This routine assumes that map[0].map_state is set to + * the current array state before being called. + * + * Migration is indicated by one of the following states + * 1/ Idle (migr_state=0 map0state=normal||unitialized||degraded||failed) + * 2/ Initialize (migr_state=1 migr_type=MIGR_INIT map0state=normal + * map1state=unitialized) + * 3/ Repair (Resync) (migr_state=1 migr_type=MIGR_REPAIR map0state=normal + * map1state=normal) + * 4/ Rebuild (migr_state=1 migr_type=MIGR_REBUILD map0state=normal + * map1state=degraded) + * 5/ Migration (mig_state=1 migr_type=MIGR_GEN_MIGR map0state=normal + * map1state=normal) + */ +static void migrate(struct imsm_dev *dev, struct intel_super *super, + __u8 to_state, int migr_type) +{ + struct imsm_map *dest; + struct imsm_map *src = get_imsm_map(dev, MAP_0); + + dev->vol.migr_state = 1; + set_migr_type(dev, migr_type); + dev->vol.curr_migr_unit = 0; + dest = get_imsm_map(dev, MAP_1); + + /* duplicate and then set the target end state in map[0] */ + memcpy(dest, src, sizeof_imsm_map(src)); + if ((migr_type == MIGR_REBUILD) || + (migr_type == MIGR_GEN_MIGR)) { + __u32 ord; + int i; + + for (i = 0; i < src->num_members; i++) { + ord = __le32_to_cpu(src->disk_ord_tbl[i]); + set_imsm_ord_tbl_ent(src, i, ord_to_idx(ord)); + } + } + + if (migr_type == MIGR_GEN_MIGR) + /* Clear migration record */ + memset(super->migr_rec, 0, sizeof(struct migr_record)); + + src->map_state = to_state; +} + +static void end_migration(struct imsm_dev *dev, struct intel_super *super, + __u8 map_state) +{ + struct imsm_map *map = get_imsm_map(dev, MAP_0); + struct imsm_map *prev = get_imsm_map(dev, dev->vol.migr_state == 0 ? + MAP_0 : MAP_1); + int i, j; + + /* merge any IMSM_ORD_REBUILD bits that were not successfully + * completed in the last migration. + * + * FIXME add support for raid-level-migration + */ + if ((map_state != map->map_state) && (is_gen_migration(dev) == 0) && + (prev->map_state != IMSM_T_STATE_UNINITIALIZED)) { + /* when final map state is other than expected + * merge maps (not for migration) + */ + int failed; + + for (i = 0; i < prev->num_members; i++) + for (j = 0; j < map->num_members; j++) + /* during online capacity expansion + * disks position can be changed + * if takeover is used + */ + if (ord_to_idx(map->disk_ord_tbl[j]) == + ord_to_idx(prev->disk_ord_tbl[i])) { + map->disk_ord_tbl[j] |= + prev->disk_ord_tbl[i]; + break; + } + failed = imsm_count_failed(super, dev, MAP_0); + map_state = imsm_check_degraded(super, dev, failed, MAP_0); + } + + dev->vol.migr_state = 0; + set_migr_type(dev, 0); + dev->vol.curr_migr_unit = 0; + map->map_state = map_state; +} +#endif + +static int parse_raid_devices(struct intel_super *super) +{ + int i; + struct imsm_dev *dev_new; + size_t len, len_migr; + size_t max_len = 0; + size_t space_needed = 0; + struct imsm_super *mpb = super->anchor; + + for (i = 0; i < super->anchor->num_raid_devs; i++) { + struct imsm_dev *dev_iter = __get_imsm_dev(super->anchor, i); + struct intel_dev *dv; + + len = sizeof_imsm_dev(dev_iter, 0); + len_migr = sizeof_imsm_dev(dev_iter, 1); + if (len_migr > len) + space_needed += len_migr - len; + + dv = xmalloc(sizeof(*dv)); + if (max_len < len_migr) + max_len = len_migr; + if (max_len > len_migr) + space_needed += max_len - len_migr; + dev_new = xmalloc(max_len); + imsm_copy_dev(dev_new, dev_iter); + dv->dev = dev_new; + dv->index = i; + dv->next = super->devlist; + super->devlist = dv; + } + + /* ensure that super->buf is large enough when all raid devices + * are migrating + */ + if (__le32_to_cpu(mpb->mpb_size) + space_needed > super->len) { + void *buf; + + len = ROUND_UP(__le32_to_cpu(mpb->mpb_size) + space_needed, 512); + if (posix_memalign(&buf, 512, len) != 0) + return 1; + + memcpy(buf, super->buf, super->len); + memset(buf + super->len, 0, len - super->len); + free(super->buf); + super->buf = buf; + super->len = len; + } + + return 0; +} + +/* retrieve a pointer to the bbm log which starts after all raid devices */ +struct bbm_log *__get_imsm_bbm_log(struct imsm_super *mpb) +{ + void *ptr = NULL; + + if (__le32_to_cpu(mpb->bbm_log_size)) { + ptr = mpb; + ptr += mpb->mpb_size - __le32_to_cpu(mpb->bbm_log_size); + } + + return ptr; +} + +/******************************************************************************* + * Function: check_mpb_migr_compatibility + * Description: Function checks for unsupported migration features: + * - migration optimization area (pba_of_lba0) + * - descending reshape (ascending_migr) + * Parameters: + * super : imsm metadata information + * Returns: + * 0 : migration is compatible + * -1 : migration is not compatible + ******************************************************************************/ +int check_mpb_migr_compatibility(struct intel_super *super) +{ + struct imsm_map *map0, *map1; + struct migr_record *migr_rec = super->migr_rec; + int i; + + for (i = 0; i < super->anchor->num_raid_devs; i++) { + struct imsm_dev *dev_iter = __get_imsm_dev(super->anchor, i); + + if (dev_iter && + dev_iter->vol.migr_state == 1 && + dev_iter->vol.migr_type == MIGR_GEN_MIGR) { + /* This device is migrating */ + map0 = get_imsm_map(dev_iter, MAP_0); + map1 = get_imsm_map(dev_iter, MAP_1); + if (pba_of_lba0(map0) != pba_of_lba0(map1)) + /* migration optimization area was used */ + return -1; + if (migr_rec->ascending_migr == 0 + && migr_rec->dest_depth_per_unit > 0) + /* descending reshape not supported yet */ + return -1; + } + } + return 0; +} + +static void __free_imsm(struct intel_super *super, int free_disks); + +/* load_imsm_mpb - read matrix metadata + * allocates super->mpb to be freed by free_imsm + */ +static int load_imsm_mpb(int fd, struct intel_super *super, char *devname) +{ + unsigned long long dsize; + unsigned long long sectors; + struct stat; + struct imsm_super *anchor; + __u32 check_sum; + + get_dev_size(fd, NULL, &dsize); + if (dsize < 1024) { + if (devname) + pr_err("%s: device to small for imsm\n", + devname); + return 1; + } + + if (lseek64(fd, dsize - (512 * 2), SEEK_SET) < 0) { + if (devname) + pr_err("Cannot seek to anchor block on %s: %s\n", + devname, strerror(errno)); + return 1; + } + + if (posix_memalign((void**)&anchor, 512, 512) != 0) { + if (devname) + pr_err("Failed to allocate imsm anchor buffer on %s\n", devname); + return 1; + } + if (read(fd, anchor, 512) != 512) { + if (devname) + pr_err("Cannot read anchor block on %s: %s\n", + devname, strerror(errno)); + free(anchor); + return 1; + } + + if (strncmp((char *) anchor->sig, MPB_SIGNATURE, MPB_SIG_LEN) != 0) { + if (devname) + pr_err("no IMSM anchor on %s\n", devname); + free(anchor); + return 2; + } + + __free_imsm(super, 0); + /* reload capability and hba */ + + /* capability and hba must be updated with new super allocation */ + find_intel_hba_capability(fd, super, devname); + super->len = ROUND_UP(anchor->mpb_size, 512); + if (posix_memalign(&super->buf, 512, super->len) != 0) { + if (devname) + pr_err("unable to allocate %zu byte mpb buffer\n", + super->len); + free(anchor); + return 2; + } + memcpy(super->buf, anchor, 512); + + sectors = mpb_sectors(anchor) - 1; + free(anchor); + + if (posix_memalign(&super->migr_rec_buf, 512, MIGR_REC_BUF_SIZE) != 0) { + pr_err("could not allocate migr_rec buffer\n"); + free(super->buf); + return 2; + } + super->clean_migration_record_by_mdmon = 0; + + if (!sectors) { + check_sum = __gen_imsm_checksum(super->anchor); + if (check_sum != __le32_to_cpu(super->anchor->check_sum)) { + if (devname) + pr_err("IMSM checksum %x != %x on %s\n", + check_sum, + __le32_to_cpu(super->anchor->check_sum), + devname); + return 2; + } + + return 0; + } + + /* read the extended mpb */ + if (lseek64(fd, dsize - (512 * (2 + sectors)), SEEK_SET) < 0) { + if (devname) + pr_err("Cannot seek to extended mpb on %s: %s\n", + devname, strerror(errno)); + return 1; + } + + if ((unsigned)read(fd, super->buf + 512, super->len - 512) != super->len - 512) { + if (devname) + pr_err("Cannot read extended mpb on %s: %s\n", + devname, strerror(errno)); + return 2; + } + + check_sum = __gen_imsm_checksum(super->anchor); + if (check_sum != __le32_to_cpu(super->anchor->check_sum)) { + if (devname) + pr_err("IMSM checksum %x != %x on %s\n", + check_sum, __le32_to_cpu(super->anchor->check_sum), + devname); + return 3; + } + + /* FIXME the BBM log is disk specific so we cannot use this global + * buffer for all disks. Ok for now since we only look at the global + * bbm_log_size parameter to gate assembly + */ + super->bbm_log = __get_imsm_bbm_log(super->anchor); + + return 0; +} + +static int read_imsm_migr_rec(int fd, struct intel_super *super); + +/* clears hi bits in metadata if MPB_ATTRIB_2TB_DISK not set */ +static void clear_hi(struct intel_super *super) +{ + struct imsm_super *mpb = super->anchor; + int i, n; + if (mpb->attributes & MPB_ATTRIB_2TB_DISK) + return; + for (i = 0; i < mpb->num_disks; ++i) { + struct imsm_disk *disk = &mpb->disk[i]; + disk->total_blocks_hi = 0; + } + for (i = 0; i < mpb->num_raid_devs; ++i) { + struct imsm_dev *dev = get_imsm_dev(super, i); + if (!dev) + return; + for (n = 0; n < 2; ++n) { + struct imsm_map *map = get_imsm_map(dev, n); + if (!map) + continue; + map->pba_of_lba0_hi = 0; + map->blocks_per_member_hi = 0; + map->num_data_stripes_hi = 0; + } + } +} + +static int +load_and_parse_mpb(int fd, struct intel_super *super, char *devname, int keep_fd) +{ + int err; + + err = load_imsm_mpb(fd, super, devname); + if (err) + return err; + err = load_imsm_disk(fd, super, devname, keep_fd); + if (err) + return err; + err = parse_raid_devices(super); + clear_hi(super); + return err; +} + +static void __free_imsm_disk(struct dl *d) +{ + if (d->fd >= 0) + close(d->fd); + if (d->devname) + free(d->devname); + if (d->e) + free(d->e); + free(d); + +} + +static void free_imsm_disks(struct intel_super *super) +{ + struct dl *d; + + while (super->disks) { + d = super->disks; + super->disks = d->next; + __free_imsm_disk(d); + } + while (super->disk_mgmt_list) { + d = super->disk_mgmt_list; + super->disk_mgmt_list = d->next; + __free_imsm_disk(d); + } + while (super->missing) { + d = super->missing; + super->missing = d->next; + __free_imsm_disk(d); + } + +} + +/* free all the pieces hanging off of a super pointer */ +static void __free_imsm(struct intel_super *super, int free_disks) +{ + struct intel_hba *elem, *next; + + if (super->buf) { + free(super->buf); + super->buf = NULL; + } + /* unlink capability description */ + super->orom = NULL; + if (super->migr_rec_buf) { + free(super->migr_rec_buf); + super->migr_rec_buf = NULL; + } + if (free_disks) + free_imsm_disks(super); + free_devlist(super); + elem = super->hba; + while (elem) { + if (elem->path) + free((void *)elem->path); + next = elem->next; + free(elem); + elem = next; + } + super->hba = NULL; +} + +static void free_imsm(struct intel_super *super) +{ + __free_imsm(super, 1); + free(super); +} + +static void free_super_imsm(struct supertype *st) +{ + struct intel_super *super = st->sb; + + if (!super) + return; + + free_imsm(super); + st->sb = NULL; +} + +static struct intel_super *alloc_super(void) +{ + struct intel_super *super = xcalloc(1, sizeof(*super)); + + super->current_vol = -1; + super->create_offset = ~((unsigned long long) 0); + return super; +} + +/* + * find and allocate hba and OROM/EFI based on valid fd of RAID component device + */ +static int find_intel_hba_capability(int fd, struct intel_super *super, char *devname) +{ + struct sys_dev *hba_name; + int rv = 0; + + if ((fd < 0) || check_env("IMSM_NO_PLATFORM")) { + super->orom = NULL; + super->hba = NULL; + return 0; + } + hba_name = find_disk_attached_hba(fd, NULL); + if (!hba_name) { + if (devname) + pr_err("%s is not attached to Intel(R) RAID controller.\n", + devname); + return 1; + } + rv = attach_hba_to_super(super, hba_name); + if (rv == 2) { + if (devname) { + struct intel_hba *hba = super->hba; + + pr_err("%s is attached to Intel(R) %s RAID controller (%s),\n" + " but the container is assigned to Intel(R) %s RAID controller (", + devname, + get_sys_dev_type(hba_name->type), + hba_name->pci_id ? : "Err!", + get_sys_dev_type(super->hba->type)); + + while (hba) { + fprintf(stderr, "%s", hba->pci_id ? : "Err!"); + if (hba->next) + fprintf(stderr, ", "); + hba = hba->next; + } + fprintf(stderr, ").\n" + " Mixing devices attached to different controllers is not allowed.\n"); + } + return 2; + } + super->orom = find_imsm_capability(hba_name); + if (!super->orom) + return 3; + + return 0; +} + +/* find_missing - helper routine for load_super_imsm_all that identifies + * disks that have disappeared from the system. This routine relies on + * the mpb being uptodate, which it is at load time. + */ +static int find_missing(struct intel_super *super) +{ + int i; + struct imsm_super *mpb = super->anchor; + struct dl *dl; + struct imsm_disk *disk; + + for (i = 0; i < mpb->num_disks; i++) { + disk = __get_imsm_disk(mpb, i); + dl = serial_to_dl(disk->serial, super); + if (dl) + continue; + + dl = xmalloc(sizeof(*dl)); + dl->major = 0; + dl->minor = 0; + dl->fd = -1; + dl->devname = xstrdup("missing"); + dl->index = i; + serialcpy(dl->serial, disk->serial); + dl->disk = *disk; + dl->e = NULL; + dl->next = super->missing; + super->missing = dl; + } + + return 0; +} + +#ifndef MDASSEMBLE +static struct intel_disk *disk_list_get(__u8 *serial, struct intel_disk *disk_list) +{ + struct intel_disk *idisk = disk_list; + + while (idisk) { + if (serialcmp(idisk->disk.serial, serial) == 0) + break; + idisk = idisk->next; + } + + return idisk; +} + +static int __prep_thunderdome(struct intel_super **table, int tbl_size, + struct intel_super *super, + struct intel_disk **disk_list) +{ + struct imsm_disk *d = &super->disks->disk; + struct imsm_super *mpb = super->anchor; + int i, j; + + for (i = 0; i < tbl_size; i++) { + struct imsm_super *tbl_mpb = table[i]->anchor; + struct imsm_disk *tbl_d = &table[i]->disks->disk; + + if (tbl_mpb->family_num == mpb->family_num) { + if (tbl_mpb->check_sum == mpb->check_sum) { + dprintf("mpb from %d:%d matches %d:%d\n", + super->disks->major, + super->disks->minor, + table[i]->disks->major, + table[i]->disks->minor); + break; + } + + if (((is_configured(d) && !is_configured(tbl_d)) || + is_configured(d) == is_configured(tbl_d)) && + tbl_mpb->generation_num < mpb->generation_num) { + /* current version of the mpb is a + * better candidate than the one in + * super_table, but copy over "cross + * generational" status + */ + struct intel_disk *idisk; + + dprintf("mpb from %d:%d replaces %d:%d\n", + super->disks->major, + super->disks->minor, + table[i]->disks->major, + table[i]->disks->minor); + + idisk = disk_list_get(tbl_d->serial, *disk_list); + if (idisk && is_failed(&idisk->disk)) + tbl_d->status |= FAILED_DISK; + break; + } else { + struct intel_disk *idisk; + struct imsm_disk *disk; + + /* tbl_mpb is more up to date, but copy + * over cross generational status before + * returning + */ + disk = __serial_to_disk(d->serial, mpb, NULL); + if (disk && is_failed(disk)) + d->status |= FAILED_DISK; + + idisk = disk_list_get(d->serial, *disk_list); + if (idisk) { + idisk->owner = i; + if (disk && is_configured(disk)) + idisk->disk.status |= CONFIGURED_DISK; + } + + dprintf("mpb from %d:%d prefer %d:%d\n", + super->disks->major, + super->disks->minor, + table[i]->disks->major, + table[i]->disks->minor); + + return tbl_size; + } + } + } + + if (i >= tbl_size) + table[tbl_size++] = super; + else + table[i] = super; + + /* update/extend the merged list of imsm_disk records */ + for (j = 0; j < mpb->num_disks; j++) { + struct imsm_disk *disk = __get_imsm_disk(mpb, j); + struct intel_disk *idisk; + + idisk = disk_list_get(disk->serial, *disk_list); + if (idisk) { + idisk->disk.status |= disk->status; + if (is_configured(&idisk->disk) || + is_failed(&idisk->disk)) + idisk->disk.status &= ~(SPARE_DISK); + } else { + idisk = xcalloc(1, sizeof(*idisk)); + idisk->owner = IMSM_UNKNOWN_OWNER; + idisk->disk = *disk; + idisk->next = *disk_list; + *disk_list = idisk; + } + + if (serialcmp(idisk->disk.serial, d->serial) == 0) + idisk->owner = i; + } + + return tbl_size; +} + +static struct intel_super * +validate_members(struct intel_super *super, struct intel_disk *disk_list, + const int owner) +{ + struct imsm_super *mpb = super->anchor; + int ok_count = 0; + int i; + + for (i = 0; i < mpb->num_disks; i++) { + struct imsm_disk *disk = __get_imsm_disk(mpb, i); + struct intel_disk *idisk; + + idisk = disk_list_get(disk->serial, disk_list); + if (idisk) { + if (idisk->owner == owner || + idisk->owner == IMSM_UNKNOWN_OWNER) + ok_count++; + else + dprintf("'%.16s' owner %d != %d\n", + disk->serial, idisk->owner, + owner); + } else { + dprintf("unknown disk %x [%d]: %.16s\n", + __le32_to_cpu(mpb->family_num), i, + disk->serial); + break; + } + } + + if (ok_count == mpb->num_disks) + return super; + return NULL; +} + +static void show_conflicts(__u32 family_num, struct intel_super *super_list) +{ + struct intel_super *s; + + for (s = super_list; s; s = s->next) { + if (family_num != s->anchor->family_num) + continue; + pr_err("Conflict, offlining family %#x on '%s'\n", + __le32_to_cpu(family_num), s->disks->devname); + } +} + +static struct intel_super * +imsm_thunderdome(struct intel_super **super_list, int len) +{ + struct intel_super *super_table[len]; + struct intel_disk *disk_list = NULL; + struct intel_super *champion, *spare; + struct intel_super *s, **del; + int tbl_size = 0; + int conflict; + int i; + + memset(super_table, 0, sizeof(super_table)); + for (s = *super_list; s; s = s->next) + tbl_size = __prep_thunderdome(super_table, tbl_size, s, &disk_list); + + for (i = 0; i < tbl_size; i++) { + struct imsm_disk *d; + struct intel_disk *idisk; + struct imsm_super *mpb = super_table[i]->anchor; + + s = super_table[i]; + d = &s->disks->disk; + + /* 'd' must appear in merged disk list for its + * configuration to be valid + */ + idisk = disk_list_get(d->serial, disk_list); + if (idisk && idisk->owner == i) + s = validate_members(s, disk_list, i); + else + s = NULL; + + if (!s) + dprintf("marking family: %#x from %d:%d offline\n", + mpb->family_num, + super_table[i]->disks->major, + super_table[i]->disks->minor); + super_table[i] = s; + } + + /* This is where the mdadm implementation differs from the Windows + * driver which has no strict concept of a container. We can only + * assemble one family from a container, so when returning a prodigal + * array member to this system the code will not be able to disambiguate + * the container contents that should be assembled ("foreign" versus + * "local"). It requires user intervention to set the orig_family_num + * to a new value to establish a new container. The Windows driver in + * this situation fixes up the volume name in place and manages the + * foreign array as an independent entity. + */ + s = NULL; + spare = NULL; + conflict = 0; + for (i = 0; i < tbl_size; i++) { + struct intel_super *tbl_ent = super_table[i]; + int is_spare = 0; + + if (!tbl_ent) + continue; + + if (tbl_ent->anchor->num_raid_devs == 0) { + spare = tbl_ent; + is_spare = 1; + } + + if (s && !is_spare) { + show_conflicts(tbl_ent->anchor->family_num, *super_list); + conflict++; + } else if (!s && !is_spare) + s = tbl_ent; + } + + if (!s) + s = spare; + if (!s) { + champion = NULL; + goto out; + } + champion = s; + + if (conflict) + pr_err("Chose family %#x on '%s', assemble conflicts to new container with '--update=uuid'\n", + __le32_to_cpu(s->anchor->family_num), s->disks->devname); + + /* collect all dl's onto 'champion', and update them to + * champion's version of the status + */ + for (s = *super_list; s; s = s->next) { + struct imsm_super *mpb = champion->anchor; + struct dl *dl = s->disks; + + if (s == champion) + continue; + + mpb->attributes |= s->anchor->attributes & MPB_ATTRIB_2TB_DISK; + + for (i = 0; i < mpb->num_disks; i++) { + struct imsm_disk *disk; + + disk = __serial_to_disk(dl->serial, mpb, &dl->index); + if (disk) { + dl->disk = *disk; + /* only set index on disks that are a member of + * a populated contianer, i.e. one with + * raid_devs + */ + if (is_failed(&dl->disk)) + dl->index = -2; + else if (is_spare(&dl->disk)) + dl->index = -1; + break; + } + } + + if (i >= mpb->num_disks) { + struct intel_disk *idisk; + + idisk = disk_list_get(dl->serial, disk_list); + if (idisk && is_spare(&idisk->disk) && + !is_failed(&idisk->disk) && !is_configured(&idisk->disk)) + dl->index = -1; + else { + dl->index = -2; + continue; + } + } + + dl->next = champion->disks; + champion->disks = dl; + s->disks = NULL; + } + + /* delete 'champion' from super_list */ + for (del = super_list; *del; ) { + if (*del == champion) { + *del = (*del)->next; + break; + } else + del = &(*del)->next; + } + champion->next = NULL; + + out: + while (disk_list) { + struct intel_disk *idisk = disk_list; + + disk_list = disk_list->next; + free(idisk); + } + + return champion; +} + +static int +get_sra_super_block(int fd, struct intel_super **super_list, char *devname, int *max, int keep_fd); +static int get_super_block(struct intel_super **super_list, char *devnm, char *devname, + int major, int minor, int keep_fd); +static int +get_devlist_super_block(struct md_list *devlist, struct intel_super **super_list, + int *max, int keep_fd); + +static int load_super_imsm_all(struct supertype *st, int fd, void **sbp, + char *devname, struct md_list *devlist, + int keep_fd) +{ + struct intel_super *super_list = NULL; + struct intel_super *super = NULL; + int err = 0; + int i = 0; + + if (fd >= 0) + /* 'fd' is an opened container */ + err = get_sra_super_block(fd, &super_list, devname, &i, keep_fd); + else + /* get super block from devlist devices */ + err = get_devlist_super_block(devlist, &super_list, &i, keep_fd); + if (err) + goto error; + /* all mpbs enter, maybe one leaves */ + super = imsm_thunderdome(&super_list, i); + if (!super) { + err = 1; + goto error; + } + + if (find_missing(super) != 0) { + free_imsm(super); + err = 2; + goto error; + } + + /* load migration record */ + err = load_imsm_migr_rec(super, NULL); + if (err == -1) { + /* migration is in progress, + * but migr_rec cannot be loaded, + */ + err = 4; + goto error; + } + + /* Check migration compatibility */ + if ((err == 0) && (check_mpb_migr_compatibility(super) != 0)) { + pr_err("Unsupported migration detected"); + if (devname) + fprintf(stderr, " on %s\n", devname); + else + fprintf(stderr, " (IMSM).\n"); + + err = 5; + goto error; + } + + err = 0; + + error: + while (super_list) { + struct intel_super *s = super_list; + + super_list = super_list->next; + free_imsm(s); + } + + if (err) + return err; + + *sbp = super; + if (fd >= 0) + strcpy(st->container_devnm, fd2devnm(fd)); + else + st->container_devnm[0] = 0; + if (err == 0 && st->ss == NULL) { + st->ss = &super_imsm; + st->minor_version = 0; + st->max_devs = IMSM_MAX_DEVICES; + } + return 0; +} + +static int +get_devlist_super_block(struct md_list *devlist, struct intel_super **super_list, + int *max, int keep_fd) +{ + struct md_list *tmpdev; + int err = 0; + int i = 0; + + for (i = 0, tmpdev = devlist; tmpdev; tmpdev = tmpdev->next) { + if (tmpdev->used != 1) + continue; + if (tmpdev->container == 1) { + int lmax = 0; + int fd = dev_open(tmpdev->devname, O_RDONLY|O_EXCL); + if (fd < 0) { + pr_err("cannot open device %s: %s\n", + tmpdev->devname, strerror(errno)); + err = 8; + goto error; + } + err = get_sra_super_block(fd, super_list, + tmpdev->devname, &lmax, + keep_fd); + i += lmax; + close(fd); + if (err) { + err = 7; + goto error; + } + } else { + int major = major(tmpdev->st_rdev); + int minor = minor(tmpdev->st_rdev); + err = get_super_block(super_list, + NULL, + tmpdev->devname, + major, minor, + keep_fd); + i++; + if (err) { + err = 6; + goto error; + } + } + } + error: + *max = i; + return err; +} + +static int get_super_block(struct intel_super **super_list, char *devnm, char *devname, + int major, int minor, int keep_fd) +{ + struct intel_super*s = NULL; + char nm[32]; + int dfd = -1; + int err = 0; + int retry; + + s = alloc_super(); + if (!s) { + err = 1; + goto error; + } + + sprintf(nm, "%d:%d", major, minor); + dfd = dev_open(nm, O_RDWR); + if (dfd < 0) { + err = 2; + goto error; + } + + find_intel_hba_capability(dfd, s, devname); + err = load_and_parse_mpb(dfd, s, NULL, keep_fd); + + /* retry the load if we might have raced against mdmon */ + if (err == 3 && devnm && mdmon_running(devnm)) + for (retry = 0; retry < 3; retry++) { + usleep(3000); + err = load_and_parse_mpb(dfd, s, NULL, keep_fd); + if (err != 3) + break; + } + error: + if (!err) { + s->next = *super_list; + *super_list = s; + } else { + if (s) + free(s); + if (dfd >= 0) + close(dfd); + } + if ((dfd >= 0) && (!keep_fd)) + close(dfd); + return err; + +} + +static int +get_sra_super_block(int fd, struct intel_super **super_list, char *devname, int *max, int keep_fd) +{ + struct mdinfo *sra; + char *devnm; + struct mdinfo *sd; + int err = 0; + int i = 0; + sra = sysfs_read(fd, NULL, GET_LEVEL|GET_VERSION|GET_DEVS|GET_STATE); + if (!sra) + return 1; + + if (sra->array.major_version != -1 || + sra->array.minor_version != -2 || + strcmp(sra->text_version, "imsm") != 0) { + err = 1; + goto error; + } + /* load all mpbs */ + devnm = fd2devnm(fd); + for (sd = sra->devs, i = 0; sd; sd = sd->next, i++) { + if (get_super_block(super_list, devnm, devname, + sd->disk.major, sd->disk.minor, keep_fd) != 0) { + err = 7; + goto error; + } + } + error: + sysfs_free(sra); + *max = i; + return err; +} + +static int load_container_imsm(struct supertype *st, int fd, char *devname) +{ + return load_super_imsm_all(st, fd, &st->sb, devname, NULL, 1); +} +#endif + +static int load_super_imsm(struct supertype *st, int fd, char *devname) +{ + struct intel_super *super; + int rv; + int retry; + + if (test_partition(fd)) + /* IMSM not allowed on partitions */ + return 1; + + free_super_imsm(st); + + super = alloc_super(); + /* Load hba and capabilities if they exist. + * But do not preclude loading metadata in case capabilities or hba are + * non-compliant and ignore_hw_compat is set. + */ + rv = find_intel_hba_capability(fd, super, devname); + /* no orom/efi or non-intel hba of the disk */ + if ((rv != 0) && (st->ignore_hw_compat == 0)) { + if (devname) + pr_err("No OROM/EFI properties for %s\n", devname); + free_imsm(super); + return 2; + } + rv = load_and_parse_mpb(fd, super, devname, 0); + + /* retry the load if we might have raced against mdmon */ + if (rv == 3) { + struct mdstat_ent *mdstat = mdstat_by_component(fd2devnm(fd)); + + if (mdstat && mdmon_running(mdstat->devnm) && getpid() != mdmon_pid(mdstat->devnm)) { + for (retry = 0; retry < 3; retry++) { + usleep(3000); + rv = load_and_parse_mpb(fd, super, devname, 0); + if (rv != 3) + break; + } + } + + free_mdstat(mdstat); + } + + if (rv) { + if (devname) + pr_err("Failed to load all information sections on %s\n", devname); + free_imsm(super); + return rv; + } + + st->sb = super; + if (st->ss == NULL) { + st->ss = &super_imsm; + st->minor_version = 0; + st->max_devs = IMSM_MAX_DEVICES; + } + + /* load migration record */ + if (load_imsm_migr_rec(super, NULL) == 0) { + /* Check for unsupported migration features */ + if (check_mpb_migr_compatibility(super) != 0) { + pr_err("Unsupported migration detected"); + if (devname) + fprintf(stderr, " on %s\n", devname); + else + fprintf(stderr, " (IMSM).\n"); + return 3; + } + } + + return 0; +} + +static __u16 info_to_blocks_per_strip(mdu_array_info_t *info) +{ + if (info->level == 1) + return 128; + return info->chunk_size >> 9; +} + +static unsigned long long info_to_blocks_per_member(mdu_array_info_t *info, + unsigned long long size) +{ + if (info->level == 1) + return size * 2; + else + return (size * 2) & ~(info_to_blocks_per_strip(info) - 1); +} + +static void imsm_update_version_info(struct intel_super *super) +{ + /* update the version and attributes */ + struct imsm_super *mpb = super->anchor; + char *version; + struct imsm_dev *dev; + struct imsm_map *map; + int i; + + for (i = 0; i < mpb->num_raid_devs; i++) { + dev = get_imsm_dev(super, i); + map = get_imsm_map(dev, MAP_0); + if (__le32_to_cpu(dev->size_high) > 0) + mpb->attributes |= MPB_ATTRIB_2TB; + + /* FIXME detect when an array spans a port multiplier */ + #if 0 + mpb->attributes |= MPB_ATTRIB_PM; + #endif + + if (mpb->num_raid_devs > 1 || + mpb->attributes != MPB_ATTRIB_CHECKSUM_VERIFY) { + version = MPB_VERSION_ATTRIBS; + switch (get_imsm_raid_level(map)) { + case 0: mpb->attributes |= MPB_ATTRIB_RAID0; break; + case 1: mpb->attributes |= MPB_ATTRIB_RAID1; break; + case 10: mpb->attributes |= MPB_ATTRIB_RAID10; break; + case 5: mpb->attributes |= MPB_ATTRIB_RAID5; break; + } + } else { + if (map->num_members >= 5) + version = MPB_VERSION_5OR6_DISK_ARRAY; + else if (dev->status == DEV_CLONE_N_GO) + version = MPB_VERSION_CNG; + else if (get_imsm_raid_level(map) == 5) + version = MPB_VERSION_RAID5; + else if (map->num_members >= 3) + version = MPB_VERSION_3OR4_DISK_ARRAY; + else if (get_imsm_raid_level(map) == 1) + version = MPB_VERSION_RAID1; + else + version = MPB_VERSION_RAID0; + } + strcpy(((char *) mpb->sig) + strlen(MPB_SIGNATURE), version); + } +} + +static int check_name(struct intel_super *super, char *name, int quiet) +{ + struct imsm_super *mpb = super->anchor; + char *reason = NULL; + int i; + + if (strlen(name) > MAX_RAID_SERIAL_LEN) + reason = "must be 16 characters or less"; + + for (i = 0; i < mpb->num_raid_devs; i++) { + struct imsm_dev *dev = get_imsm_dev(super, i); + + if (strncmp((char *) dev->volume, name, MAX_RAID_SERIAL_LEN) == 0) { + reason = "already exists"; + break; + } + } + + if (reason && !quiet) + pr_err("imsm volume name %s\n", reason); + + return !reason; +} + +static int init_super_imsm_volume(struct supertype *st, mdu_array_info_t *info, + unsigned long long size, char *name, + char *homehost, int *uuid, + long long data_offset) +{ + /* We are creating a volume inside a pre-existing container. + * so st->sb is already set. + */ + struct intel_super *super = st->sb; + struct imsm_super *mpb = super->anchor; + struct intel_dev *dv; + struct imsm_dev *dev; + struct imsm_vol *vol; + struct imsm_map *map; + int idx = mpb->num_raid_devs; + int i; + unsigned long long array_blocks; + size_t size_old, size_new; + unsigned long long num_data_stripes; + + if (super->orom && mpb->num_raid_devs >= super->orom->vpa) { + pr_err("This imsm-container already has the maximum of %d volumes\n", super->orom->vpa); + return 0; + } + + /* ensure the mpb is large enough for the new data */ + size_old = __le32_to_cpu(mpb->mpb_size); + size_new = disks_to_mpb_size(info->nr_disks); + if (size_new > size_old) { + void *mpb_new; + size_t size_round = ROUND_UP(size_new, 512); + + if (posix_memalign(&mpb_new, 512, size_round) != 0) { + pr_err("could not allocate new mpb\n"); + return 0; + } + if (posix_memalign(&super->migr_rec_buf, 512, + MIGR_REC_BUF_SIZE) != 0) { + pr_err("could not allocate migr_rec buffer\n"); + free(super->buf); + free(super); + free(mpb_new); + return 0; + } + memcpy(mpb_new, mpb, size_old); + free(mpb); + mpb = mpb_new; + super->anchor = mpb_new; + mpb->mpb_size = __cpu_to_le32(size_new); + memset(mpb_new + size_old, 0, size_round - size_old); + } + super->current_vol = idx; + + /* handle 'failed_disks' by either: + * a) create dummy disk entries in the table if this the first + * volume in the array. We add them here as this is the only + * opportunity to add them. add_to_super_imsm_volume() + * handles the non-failed disks and continues incrementing + * mpb->num_disks. + * b) validate that 'failed_disks' matches the current number + * of missing disks if the container is populated + */ + if (super->current_vol == 0) { + mpb->num_disks = 0; + for (i = 0; i < info->failed_disks; i++) { + struct imsm_disk *disk; + + mpb->num_disks++; + disk = __get_imsm_disk(mpb, i); + disk->status = CONFIGURED_DISK | FAILED_DISK; + disk->scsi_id = __cpu_to_le32(~(__u32)0); + snprintf((char *) disk->serial, MAX_RAID_SERIAL_LEN, + "missing:%d", i); + } + find_missing(super); + } else { + int missing = 0; + struct dl *d; + + for (d = super->missing; d; d = d->next) + missing++; + if (info->failed_disks > missing) { + pr_err("unable to add 'missing' disk to container\n"); + return 0; + } + } + + if (!check_name(super, name, 0)) + return 0; + dv = xmalloc(sizeof(*dv)); + dev = xcalloc(1, sizeof(*dev) + sizeof(__u32) * (info->raid_disks - 1)); + strncpy((char *) dev->volume, name, MAX_RAID_SERIAL_LEN); + array_blocks = calc_array_size(info->level, info->raid_disks, + info->layout, info->chunk_size, + size * 2); + /* round array size down to closest MB */ + array_blocks = (array_blocks >> SECT_PER_MB_SHIFT) << SECT_PER_MB_SHIFT; + + dev->size_low = __cpu_to_le32((__u32) array_blocks); + dev->size_high = __cpu_to_le32((__u32) (array_blocks >> 32)); + dev->status = (DEV_READ_COALESCING | DEV_WRITE_COALESCING); + vol = &dev->vol; + vol->migr_state = 0; + set_migr_type(dev, MIGR_INIT); + vol->dirty = !info->state; + vol->curr_migr_unit = 0; + map = get_imsm_map(dev, MAP_0); + set_pba_of_lba0(map, super->create_offset); + set_blocks_per_member(map, info_to_blocks_per_member(info, size)); + map->blocks_per_strip = __cpu_to_le16(info_to_blocks_per_strip(info)); + map->failed_disk_num = ~0; + if (info->level > 0) + map->map_state = (info->state ? IMSM_T_STATE_NORMAL + : IMSM_T_STATE_UNINITIALIZED); + else + map->map_state = info->failed_disks ? IMSM_T_STATE_FAILED : + IMSM_T_STATE_NORMAL; + map->ddf = 1; + + if (info->level == 1 && info->raid_disks > 2) { + free(dev); + free(dv); + pr_err("imsm does not support more than 2 disksin a raid1 volume\n"); + return 0; + } + + map->raid_level = info->level; + if (info->level == 10) { + map->raid_level = 1; + map->num_domains = info->raid_disks / 2; + } else if (info->level == 1) + map->num_domains = info->raid_disks; + else + map->num_domains = 1; + + /* info->size is only int so use the 'size' parameter instead */ + num_data_stripes = (size * 2) / info_to_blocks_per_strip(info); + num_data_stripes /= map->num_domains; + set_num_data_stripes(map, num_data_stripes); + + map->num_members = info->raid_disks; + for (i = 0; i < map->num_members; i++) { + /* initialized in add_to_super */ + set_imsm_ord_tbl_ent(map, i, IMSM_ORD_REBUILD); + } + mpb->num_raid_devs++; + + dv->dev = dev; + dv->index = super->current_vol; + dv->next = super->devlist; + super->devlist = dv; + + imsm_update_version_info(super); + + return 1; +} + +static int init_super_imsm(struct supertype *st, mdu_array_info_t *info, + unsigned long long size, char *name, + char *homehost, int *uuid, + unsigned long long data_offset) +{ + /* This is primarily called by Create when creating a new array. + * We will then get add_to_super called for each component, and then + * write_init_super called to write it out to each device. + * For IMSM, Create can create on fresh devices or on a pre-existing + * array. + * To create on a pre-existing array a different method will be called. + * This one is just for fresh drives. + */ + struct intel_super *super; + struct imsm_super *mpb; + size_t mpb_size; + char *version; + + if (data_offset != INVALID_SECTORS) { + pr_err("data-offset not supported by imsm\n"); + return 0; + } + + if (st->sb) + return init_super_imsm_volume(st, info, size, name, homehost, uuid, + data_offset); + + if (info) + mpb_size = disks_to_mpb_size(info->nr_disks); + else + mpb_size = 512; + + super = alloc_super(); + if (super && posix_memalign(&super->buf, 512, mpb_size) != 0) { + free(super); + super = NULL; + } + if (!super) { + pr_err("could not allocate superblock\n"); + return 0; + } + if (posix_memalign(&super->migr_rec_buf, 512, MIGR_REC_BUF_SIZE) != 0) { + pr_err("could not allocate migr_rec buffer\n"); + free(super->buf); + free(super); + return 0; + } + memset(super->buf, 0, mpb_size); + mpb = super->buf; + mpb->mpb_size = __cpu_to_le32(mpb_size); + st->sb = super; + + if (info == NULL) { + /* zeroing superblock */ + return 0; + } + + mpb->attributes = MPB_ATTRIB_CHECKSUM_VERIFY; + + version = (char *) mpb->sig; + strcpy(version, MPB_SIGNATURE); + version += strlen(MPB_SIGNATURE); + strcpy(version, MPB_VERSION_RAID0); + + return 1; +} + +#ifndef MDASSEMBLE +static int add_to_super_imsm_volume(struct supertype *st, mdu_disk_info_t *dk, + int fd, char *devname) +{ + struct intel_super *super = st->sb; + struct imsm_super *mpb = super->anchor; + struct imsm_disk *_disk; + struct imsm_dev *dev; + struct imsm_map *map; + struct dl *dl, *df; + int slot; + + dev = get_imsm_dev(super, super->current_vol); + map = get_imsm_map(dev, MAP_0); + + if (! (dk->state & (1<<MD_DISK_SYNC))) { + pr_err("%s: Cannot add spare devices to IMSM volume\n", + devname); + return 1; + } + + if (fd == -1) { + /* we're doing autolayout so grab the pre-marked (in + * validate_geometry) raid_disk + */ + for (dl = super->disks; dl; dl = dl->next) + if (dl->raiddisk == dk->raid_disk) + break; + } else { + for (dl = super->disks; dl ; dl = dl->next) + if (dl->major == dk->major && + dl->minor == dk->minor) + break; + } + + if (!dl) { + pr_err("%s is not a member of the same container\n", devname); + return 1; + } + + /* add a pristine spare to the metadata */ + if (dl->index < 0) { + dl->index = super->anchor->num_disks; + super->anchor->num_disks++; + } + /* Check the device has not already been added */ + slot = get_imsm_disk_slot(map, dl->index); + if (slot >= 0 && + (get_imsm_ord_tbl_ent(dev, slot, MAP_X) & IMSM_ORD_REBUILD) == 0) { + pr_err("%s has been included in this array twice\n", + devname); + return 1; + } + set_imsm_ord_tbl_ent(map, dk->raid_disk, dl->index); + dl->disk.status = CONFIGURED_DISK; + + /* update size of 'missing' disks to be at least as large as the + * largest acitve member (we only have dummy missing disks when + * creating the first volume) + */ + if (super->current_vol == 0) { + for (df = super->missing; df; df = df->next) { + if (total_blocks(&dl->disk) > total_blocks(&df->disk)) + set_total_blocks(&df->disk, total_blocks(&dl->disk)); + _disk = __get_imsm_disk(mpb, df->index); + *_disk = df->disk; + } + } + + /* refresh unset/failed slots to point to valid 'missing' entries */ + for (df = super->missing; df; df = df->next) + for (slot = 0; slot < mpb->num_disks; slot++) { + __u32 ord = get_imsm_ord_tbl_ent(dev, slot, MAP_X); + + if ((ord & IMSM_ORD_REBUILD) == 0) + continue; + set_imsm_ord_tbl_ent(map, slot, df->index | IMSM_ORD_REBUILD); + if (is_gen_migration(dev)) { + struct imsm_map *map2 = get_imsm_map(dev, + MAP_1); + int slot2 = get_imsm_disk_slot(map2, df->index); + if ((slot2 < map2->num_members) && + (slot2 >= 0)) { + __u32 ord2 = get_imsm_ord_tbl_ent(dev, + slot2, + MAP_1); + if ((unsigned)df->index == + ord_to_idx(ord2)) + set_imsm_ord_tbl_ent(map2, + slot2, + df->index | + IMSM_ORD_REBUILD); + } + } + dprintf("set slot:%d to missing disk:%d\n", slot, df->index); + break; + } + + /* if we are creating the first raid device update the family number */ + if (super->current_vol == 0) { + __u32 sum; + struct imsm_dev *_dev = __get_imsm_dev(mpb, 0); + + _disk = __get_imsm_disk(mpb, dl->index); + if (!_dev || !_disk) { + pr_err("BUG mpb setup error\n"); + return 1; + } + *_dev = *dev; + *_disk = dl->disk; + sum = random32(); + sum += __gen_imsm_checksum(mpb); + mpb->family_num = __cpu_to_le32(sum); + mpb->orig_family_num = mpb->family_num; + } + super->current_disk = dl; + return 0; +} + +/* mark_spare() + * Function marks disk as spare and restores disk serial + * in case it was previously marked as failed by takeover operation + * reruns: + * -1 : critical error + * 0 : disk is marked as spare but serial is not set + * 1 : success + */ +int mark_spare(struct dl *disk) +{ + __u8 serial[MAX_RAID_SERIAL_LEN]; + int ret_val = -1; + + if (!disk) + return ret_val; + + ret_val = 0; + if (!imsm_read_serial(disk->fd, NULL, serial)) { + /* Restore disk serial number, because takeover marks disk + * as failed and adds to serial ':0' before it becomes + * a spare disk. + */ + serialcpy(disk->serial, serial); + serialcpy(disk->disk.serial, serial); + ret_val = 1; + } + disk->disk.status = SPARE_DISK; + disk->index = -1; + + return ret_val; +} + +static int add_to_super_imsm(struct supertype *st, mdu_disk_info_t *dk, + int fd, char *devname, + unsigned long long data_offset) +{ + struct intel_super *super = st->sb; + struct dl *dd; + unsigned long long size; + __u32 id; + int rv; + struct stat stb; + + /* If we are on an RAID enabled platform check that the disk is + * attached to the raid controller. + * We do not need to test disks attachment for container based additions, + * they shall be already tested when container was created/assembled. + */ + rv = find_intel_hba_capability(fd, super, devname); + /* no orom/efi or non-intel hba of the disk */ + if (rv != 0) { + dprintf("capability: %p fd: %d ret: %d\n", + super->orom, fd, rv); + return 1; + } + + if (super->current_vol >= 0) + return add_to_super_imsm_volume(st, dk, fd, devname); + + fstat(fd, &stb); + dd = xcalloc(sizeof(*dd), 1); + dd->major = major(stb.st_rdev); + dd->minor = minor(stb.st_rdev); + dd->devname = devname ? xstrdup(devname) : NULL; + dd->fd = fd; + dd->e = NULL; + dd->action = DISK_ADD; + rv = imsm_read_serial(fd, devname, dd->serial); + if (rv) { + pr_err("failed to retrieve scsi serial, aborting\n"); + free(dd); + abort(); + } + + get_dev_size(fd, NULL, &size); + /* clear migr_rec when adding disk to container */ + memset(super->migr_rec_buf, 0, MIGR_REC_BUF_SIZE); + if (lseek64(fd, size - MIGR_REC_POSITION, SEEK_SET) >= 0) { + if (write(fd, super->migr_rec_buf, + MIGR_REC_BUF_SIZE) != MIGR_REC_BUF_SIZE) + perror("Write migr_rec failed"); + } + + size /= 512; + serialcpy(dd->disk.serial, dd->serial); + set_total_blocks(&dd->disk, size); + if (__le32_to_cpu(dd->disk.total_blocks_hi) > 0) { + struct imsm_super *mpb = super->anchor; + mpb->attributes |= MPB_ATTRIB_2TB_DISK; + } + mark_spare(dd); + if (sysfs_disk_to_scsi_id(fd, &id) == 0) + dd->disk.scsi_id = __cpu_to_le32(id); + else + dd->disk.scsi_id = __cpu_to_le32(0); + + if (st->update_tail) { + dd->next = super->disk_mgmt_list; + super->disk_mgmt_list = dd; + } else { + dd->next = super->disks; + super->disks = dd; + super->updates_pending++; + } + + return 0; +} + +static int remove_from_super_imsm(struct supertype *st, mdu_disk_info_t *dk) +{ + struct intel_super *super = st->sb; + struct dl *dd; + + /* remove from super works only in mdmon - for communication + * manager - monitor. Check if communication memory buffer + * is prepared. + */ + if (!st->update_tail) { + pr_err("shall be used in mdmon context only\n"); + return 1; + } + dd = xcalloc(1, sizeof(*dd)); + dd->major = dk->major; + dd->minor = dk->minor; + dd->fd = -1; + mark_spare(dd); + dd->action = DISK_REMOVE; + + dd->next = super->disk_mgmt_list; + super->disk_mgmt_list = dd; + + return 0; +} + +static int store_imsm_mpb(int fd, struct imsm_super *mpb); + +static union { + char buf[512]; + struct imsm_super anchor; +} spare_record __attribute__ ((aligned(512))); + +/* spare records have their own family number and do not have any defined raid + * devices + */ +static int write_super_imsm_spares(struct intel_super *super, int doclose) +{ + struct imsm_super *mpb = super->anchor; + struct imsm_super *spare = &spare_record.anchor; + __u32 sum; + struct dl *d; + + spare->mpb_size = __cpu_to_le32(sizeof(struct imsm_super)); + spare->generation_num = __cpu_to_le32(1UL); + spare->attributes = MPB_ATTRIB_CHECKSUM_VERIFY; + spare->num_disks = 1; + spare->num_raid_devs = 0; + spare->cache_size = mpb->cache_size; + spare->pwr_cycle_count = __cpu_to_le32(1); + + snprintf((char *) spare->sig, MAX_SIGNATURE_LENGTH, + MPB_SIGNATURE MPB_VERSION_RAID0); + + for (d = super->disks; d; d = d->next) { + if (d->index != -1) + continue; + + spare->disk[0] = d->disk; + if (__le32_to_cpu(d->disk.total_blocks_hi) > 0) + spare->attributes |= MPB_ATTRIB_2TB_DISK; + + sum = __gen_imsm_checksum(spare); + spare->family_num = __cpu_to_le32(sum); + spare->orig_family_num = 0; + sum = __gen_imsm_checksum(spare); + spare->check_sum = __cpu_to_le32(sum); + + if (store_imsm_mpb(d->fd, spare)) { + pr_err("failed for device %d:%d %s\n", + d->major, d->minor, strerror(errno)); + return 1; + } + if (doclose) { + close(d->fd); + d->fd = -1; + } + } + + return 0; +} + +static int write_super_imsm(struct supertype *st, int doclose) +{ + struct intel_super *super = st->sb; + struct imsm_super *mpb = super->anchor; + struct dl *d; + __u32 generation; + __u32 sum; + int spares = 0; + int i; + __u32 mpb_size = sizeof(struct imsm_super) - sizeof(struct imsm_disk); + int num_disks = 0; + int clear_migration_record = 1; + + /* 'generation' is incremented everytime the metadata is written */ + generation = __le32_to_cpu(mpb->generation_num); + generation++; + mpb->generation_num = __cpu_to_le32(generation); + + /* fix up cases where previous mdadm releases failed to set + * orig_family_num + */ + if (mpb->orig_family_num == 0) + mpb->orig_family_num = mpb->family_num; + + for (d = super->disks; d; d = d->next) { + if (d->index == -1) + spares++; + else { + mpb->disk[d->index] = d->disk; + num_disks++; + } + } + for (d = super->missing; d; d = d->next) { + mpb->disk[d->index] = d->disk; + num_disks++; + } + mpb->num_disks = num_disks; + mpb_size += sizeof(struct imsm_disk) * mpb->num_disks; + + for (i = 0; i < mpb->num_raid_devs; i++) { + struct imsm_dev *dev = __get_imsm_dev(mpb, i); + struct imsm_dev *dev2 = get_imsm_dev(super, i); + if (dev && dev2) { + imsm_copy_dev(dev, dev2); + mpb_size += sizeof_imsm_dev(dev, 0); + } + if (is_gen_migration(dev2)) + clear_migration_record = 0; + } + mpb_size += __le32_to_cpu(mpb->bbm_log_size); + mpb->mpb_size = __cpu_to_le32(mpb_size); + + /* recalculate checksum */ + sum = __gen_imsm_checksum(mpb); + mpb->check_sum = __cpu_to_le32(sum); + + if (super->clean_migration_record_by_mdmon) { + clear_migration_record = 1; + super->clean_migration_record_by_mdmon = 0; + } + if (clear_migration_record) + memset(super->migr_rec_buf, 0, MIGR_REC_BUF_SIZE); + + /* write the mpb for disks that compose raid devices */ + for (d = super->disks; d ; d = d->next) { + if (d->index < 0 || is_failed(&d->disk)) + continue; + + if (clear_migration_record) { + unsigned long long dsize; + + get_dev_size(d->fd, NULL, &dsize); + if (lseek64(d->fd, dsize - 512, SEEK_SET) >= 0) { + if (write(d->fd, super->migr_rec_buf, + MIGR_REC_BUF_SIZE) != MIGR_REC_BUF_SIZE) + perror("Write migr_rec failed"); + } + } + + if (store_imsm_mpb(d->fd, mpb)) + fprintf(stderr, + "failed for device %d:%d (fd: %d)%s\n", + d->major, d->minor, + d->fd, strerror(errno)); + + if (doclose) { + close(d->fd); + d->fd = -1; + } + } + + if (spares) + return write_super_imsm_spares(super, doclose); + + return 0; +} + +static int create_array(struct supertype *st, int dev_idx) +{ + size_t len; + struct imsm_update_create_array *u; + struct intel_super *super = st->sb; + struct imsm_dev *dev = get_imsm_dev(super, dev_idx); + struct imsm_map *map = get_imsm_map(dev, MAP_0); + struct disk_info *inf; + struct imsm_disk *disk; + int i; + + len = sizeof(*u) - sizeof(*dev) + sizeof_imsm_dev(dev, 0) + + sizeof(*inf) * map->num_members; + u = xmalloc(len); + u->type = update_create_array; + u->dev_idx = dev_idx; + imsm_copy_dev(&u->dev, dev); + inf = get_disk_info(u); + for (i = 0; i < map->num_members; i++) { + int idx = get_imsm_disk_idx(dev, i, MAP_X); + + disk = get_imsm_disk(super, idx); + if (!disk) + disk = get_imsm_missing(super, idx); + serialcpy(inf[i].serial, disk->serial); + } + append_metadata_update(st, u, len); + + return 0; +} + +static int mgmt_disk(struct supertype *st) +{ + struct intel_super *super = st->sb; + size_t len; + struct imsm_update_add_remove_disk *u; + + if (!super->disk_mgmt_list) + return 0; + + len = sizeof(*u); + u = xmalloc(len); + u->type = update_add_remove_disk; + append_metadata_update(st, u, len); + + return 0; +} + +static int write_init_super_imsm(struct supertype *st) +{ + struct intel_super *super = st->sb; + int current_vol = super->current_vol; + + /* we are done with current_vol reset it to point st at the container */ + super->current_vol = -1; + + if (st->update_tail) { + /* queue the recently created array / added disk + * as a metadata update */ + int rv; + + /* determine if we are creating a volume or adding a disk */ + if (current_vol < 0) { + /* in the mgmt (add/remove) disk case we are running + * in mdmon context, so don't close fd's + */ + return mgmt_disk(st); + } else + rv = create_array(st, current_vol); + + return rv; + } else { + struct dl *d; + for (d = super->disks; d; d = d->next) + Kill(d->devname, NULL, 0, -1, 1); + return write_super_imsm(st, 1); + } +} +#endif + +static int store_super_imsm(struct supertype *st, int fd) +{ + struct intel_super *super = st->sb; + struct imsm_super *mpb = super ? super->anchor : NULL; + + if (!mpb) + return 1; + +#ifndef MDASSEMBLE + return store_imsm_mpb(fd, mpb); +#else + return 1; +#endif +} + +static int imsm_bbm_log_size(struct imsm_super *mpb) +{ + return __le32_to_cpu(mpb->bbm_log_size); +} + +#ifndef MDASSEMBLE +static int validate_geometry_imsm_container(struct supertype *st, int level, + int layout, int raiddisks, int chunk, + unsigned long long size, + unsigned long long data_offset, + char *dev, + unsigned long long *freesize, + int verbose) +{ + int fd; + unsigned long long ldsize; + struct intel_super *super=NULL; + int rv = 0; + + if (level != LEVEL_CONTAINER) + return 0; + if (!dev) + return 1; + + fd = open(dev, O_RDONLY|O_EXCL, 0); + if (fd < 0) { + if (verbose > 0) + pr_err("imsm: Cannot open %s: %s\n", + dev, strerror(errno)); + return 0; + } + if (!get_dev_size(fd, dev, &ldsize)) { + close(fd); + return 0; + } + + /* capabilities retrieve could be possible + * note that there is no fd for the disks in array. + */ + super = alloc_super(); + rv = find_intel_hba_capability(fd, super, verbose > 0 ? dev : NULL); + if (rv != 0) { +#if DEBUG + char str[256]; + fd2devname(fd, str); + dprintf("fd: %d %s orom: %p rv: %d raiddisk: %d\n", + fd, str, super->orom, rv, raiddisks); +#endif + /* no orom/efi or non-intel hba of the disk */ + close(fd); + free_imsm(super); + return 0; + } + close(fd); + if (super->orom) { + if (raiddisks > super->orom->tds) { + if (verbose) + pr_err("%d exceeds maximum number of platform supported disks: %d\n", + raiddisks, super->orom->tds); + free_imsm(super); + return 0; + } + if ((super->orom->attr & IMSM_OROM_ATTR_2TB_DISK) == 0 && + (ldsize >> 9) >> 32 > 0) { + if (verbose) + pr_err("%s exceeds maximum platform supported size\n", dev); + free_imsm(super); + return 0; + } + } + + *freesize = avail_size_imsm(st, ldsize >> 9, data_offset); + free_imsm(super); + + return 1; +} + +static unsigned long long find_size(struct extent *e, int *idx, int num_extents) +{ + const unsigned long long base_start = e[*idx].start; + unsigned long long end = base_start + e[*idx].size; + int i; + + if (base_start == end) + return 0; + + *idx = *idx + 1; + for (i = *idx; i < num_extents; i++) { + /* extend overlapping extents */ + if (e[i].start >= base_start && + e[i].start <= end) { + if (e[i].size == 0) + return 0; + if (e[i].start + e[i].size > end) + end = e[i].start + e[i].size; + } else if (e[i].start > end) { + *idx = i; + break; + } + } + + return end - base_start; +} + +static unsigned long long merge_extents(struct intel_super *super, int sum_extents) +{ + /* build a composite disk with all known extents and generate a new + * 'maxsize' given the "all disks in an array must share a common start + * offset" constraint + */ + struct extent *e = xcalloc(sum_extents, sizeof(*e)); + struct dl *dl; + int i, j; + int start_extent; + unsigned long long pos; + unsigned long long start = 0; + unsigned long long maxsize; + unsigned long reserve; + + /* coalesce and sort all extents. also, check to see if we need to + * reserve space between member arrays + */ + j = 0; + for (dl = super->disks; dl; dl = dl->next) { + if (!dl->e) + continue; + for (i = 0; i < dl->extent_cnt; i++) + e[j++] = dl->e[i]; + } + qsort(e, sum_extents, sizeof(*e), cmp_extent); + + /* merge extents */ + i = 0; + j = 0; + while (i < sum_extents) { + e[j].start = e[i].start; + e[j].size = find_size(e, &i, sum_extents); + j++; + if (e[j-1].size == 0) + break; + } + + pos = 0; + maxsize = 0; + start_extent = 0; + i = 0; + do { + unsigned long long esize; + + esize = e[i].start - pos; + if (esize >= maxsize) { + maxsize = esize; + start = pos; + start_extent = i; + } + pos = e[i].start + e[i].size; + i++; + } while (e[i-1].size); + free(e); + + if (maxsize == 0) + return 0; + + /* FIXME assumes volume at offset 0 is the first volume in a + * container + */ + if (start_extent > 0) + reserve = IMSM_RESERVED_SECTORS; /* gap between raid regions */ + else + reserve = 0; + + if (maxsize < reserve) + return 0; + + super->create_offset = ~((unsigned long long) 0); + if (start + reserve > super->create_offset) + return 0; /* start overflows create_offset */ + super->create_offset = start + reserve; + + return maxsize - reserve; +} + +static int is_raid_level_supported(const struct imsm_orom *orom, int level, int raiddisks) +{ + if (level < 0 || level == 6 || level == 4) + return 0; + + /* if we have an orom prevent invalid raid levels */ + if (orom) + switch (level) { + case 0: return imsm_orom_has_raid0(orom); + case 1: + if (raiddisks > 2) + return imsm_orom_has_raid1e(orom); + return imsm_orom_has_raid1(orom) && raiddisks == 2; + case 10: return imsm_orom_has_raid10(orom) && raiddisks == 4; + case 5: return imsm_orom_has_raid5(orom) && raiddisks > 2; + } + else + return 1; /* not on an Intel RAID platform so anything goes */ + + return 0; +} + +static int +active_arrays_by_format(char *name, char* hba, struct md_list **devlist, + int dpa, int verbose) +{ + struct mdstat_ent *mdstat = mdstat_read(0, 0); + struct mdstat_ent *memb = NULL; + int count = 0; + int num = 0; + struct md_list *dv = NULL; + int found; + + for (memb = mdstat ; memb ; memb = memb->next) { + if (memb->metadata_version && + (strncmp(memb->metadata_version, "external:", 9) == 0) && + (strcmp(&memb->metadata_version[9], name) == 0) && + !is_subarray(memb->metadata_version+9) && + memb->members) { + struct dev_member *dev = memb->members; + int fd = -1; + while(dev && (fd < 0)) { + char *path = xmalloc(strlen(dev->name) + strlen("/dev/") + 1); + num = sprintf(path, "%s%s", "/dev/", dev->name); + if (num > 0) + fd = open(path, O_RDONLY, 0); + if ((num <= 0) || (fd < 0)) { + pr_vrb(": Cannot open %s: %s\n", + dev->name, strerror(errno)); + } + free(path); + dev = dev->next; + } + found = 0; + if ((fd >= 0) && disk_attached_to_hba(fd, hba)) { + struct mdstat_ent *vol; + for (vol = mdstat ; vol ; vol = vol->next) { + if ((vol->active > 0) && + vol->metadata_version && + is_container_member(vol, memb->devnm)) { + found++; + count++; + } + } + if (*devlist && (found < dpa)) { + dv = xcalloc(1, sizeof(*dv)); + dv->devname = xmalloc(strlen(memb->devnm) + strlen("/dev/") + 1); + sprintf(dv->devname, "%s%s", "/dev/", memb->devnm); + dv->found = found; + dv->used = 0; + dv->next = *devlist; + *devlist = dv; + } + } + if (fd >= 0) + close(fd); + } + } + free_mdstat(mdstat); + return count; +} + +#ifdef DEBUG_LOOP +static struct md_list* +get_loop_devices(void) +{ + int i; + struct md_list *devlist = NULL; + struct md_list *dv = NULL; + + for(i = 0; i < 12; i++) { + dv = xcalloc(1, sizeof(*dv)); + dv->devname = xmalloc(40); + sprintf(dv->devname, "/dev/loop%d", i); + dv->next = devlist; + devlist = dv; + } + return devlist; +} +#endif + +static struct md_list* +get_devices(const char *hba_path) +{ + struct md_list *devlist = NULL; + struct md_list *dv = NULL; + struct dirent *ent; + DIR *dir; + int err = 0; + +#if DEBUG_LOOP + devlist = get_loop_devices(); + return devlist; +#endif + /* scroll through /sys/dev/block looking for devices attached to + * this hba + */ + dir = opendir("/sys/dev/block"); + for (ent = dir ? readdir(dir) : NULL; ent; ent = readdir(dir)) { + int fd; + char buf[1024]; + int major, minor; + char *path = NULL; + if (sscanf(ent->d_name, "%d:%d", &major, &minor) != 2) + continue; + path = devt_to_devpath(makedev(major, minor)); + if (!path) + continue; + if (!path_attached_to_hba(path, hba_path)) { + free(path); + path = NULL; + continue; + } + free(path); + path = NULL; + fd = dev_open(ent->d_name, O_RDONLY); + if (fd >= 0) { + fd2devname(fd, buf); + close(fd); + } else { + pr_err("cannot open device: %s\n", + ent->d_name); + continue; + } + + dv = xcalloc(1, sizeof(*dv)); + dv->devname = xstrdup(buf); + dv->next = devlist; + devlist = dv; + } + if (err) { + while(devlist) { + dv = devlist; + devlist = devlist->next; + free(dv->devname); + free(dv); + } + } + closedir(dir); + return devlist; +} + +static int +count_volumes_list(struct md_list *devlist, char *homehost, + int verbose, int *found) +{ + struct md_list *tmpdev; + int count = 0; + struct supertype *st = NULL; + + /* first walk the list of devices to find a consistent set + * that match the criterea, if that is possible. + * We flag the ones we like with 'used'. + */ + *found = 0; + st = match_metadata_desc_imsm("imsm"); + if (st == NULL) { + pr_vrb(": cannot allocate memory for imsm supertype\n"); + return 0; + } + + for (tmpdev = devlist; tmpdev; tmpdev = tmpdev->next) { + char *devname = tmpdev->devname; + struct stat stb; + struct supertype *tst; + int dfd; + if (tmpdev->used > 1) + continue; + tst = dup_super(st); + if (tst == NULL) { + pr_vrb(": cannot allocate memory for imsm supertype\n"); + goto err_1; + } + tmpdev->container = 0; + dfd = dev_open(devname, O_RDONLY|O_EXCL); + if (dfd < 0) { + dprintf("cannot open device %s: %s\n", + devname, strerror(errno)); + tmpdev->used = 2; + } else if (fstat(dfd, &stb)< 0) { + /* Impossible! */ + dprintf("fstat failed for %s: %s\n", + devname, strerror(errno)); + tmpdev->used = 2; + } else if ((stb.st_mode & S_IFMT) != S_IFBLK) { + dprintf("%s is not a block device.\n", + devname); + tmpdev->used = 2; + } else if (must_be_container(dfd)) { + struct supertype *cst; + cst = super_by_fd(dfd, NULL); + if (cst == NULL) { + dprintf("cannot recognize container type %s\n", + devname); + tmpdev->used = 2; + } else if (tst->ss != st->ss) { + dprintf("non-imsm container - ignore it: %s\n", + devname); + tmpdev->used = 2; + } else if (!tst->ss->load_container || + tst->ss->load_container(tst, dfd, NULL)) + tmpdev->used = 2; + else { + tmpdev->container = 1; + } + if (cst) + cst->ss->free_super(cst); + } else { + tmpdev->st_rdev = stb.st_rdev; + if (tst->ss->load_super(tst,dfd, NULL)) { + dprintf("no RAID superblock on %s\n", + devname); + tmpdev->used = 2; + } else if (tst->ss->compare_super == NULL) { + dprintf("Cannot assemble %s metadata on %s\n", + tst->ss->name, devname); + tmpdev->used = 2; + } + } + if (dfd >= 0) + close(dfd); + if (tmpdev->used == 2 || tmpdev->used == 4) { + /* Ignore unrecognised devices during auto-assembly */ + goto loop; + } + else { + struct mdinfo info; + tst->ss->getinfo_super(tst, &info, NULL); + + if (st->minor_version == -1) + st->minor_version = tst->minor_version; + + if (memcmp(info.uuid, uuid_zero, + sizeof(int[4])) == 0) { + /* this is a floating spare. It cannot define + * an array unless there are no more arrays of + * this type to be found. It can be included + * in an array of this type though. + */ + tmpdev->used = 3; + goto loop; + } + + if (st->ss != tst->ss || + st->minor_version != tst->minor_version || + st->ss->compare_super(st, tst) != 0) { + /* Some mismatch. If exactly one array matches this host, + * we can resolve on that one. + * Or, if we are auto assembling, we just ignore the second + * for now. + */ + dprintf("superblock on %s doesn't match others - assembly aborted\n", + devname); + goto loop; + } + tmpdev->used = 1; + *found = 1; + dprintf("found: devname: %s\n", devname); + } + loop: + if (tst) + tst->ss->free_super(tst); + } + if (*found != 0) { + int err; + if ((err = load_super_imsm_all(st, -1, &st->sb, NULL, devlist, 0)) == 0) { + struct mdinfo *iter, *head = st->ss->container_content(st, NULL); + for (iter = head; iter; iter = iter->next) { + dprintf("content->text_version: %s vol\n", + iter->text_version); + if (iter->array.state & (1<<MD_SB_BLOCK_VOLUME)) { + /* do not assemble arrays with unsupported + configurations */ + dprintf("Cannot activate member %s.\n", + iter->text_version); + } else + count++; + } + sysfs_free(head); + + } else { + dprintf("No valid super block on device list: err: %d %p\n", + err, st->sb); + } + } else { + dprintf("no more devices to examine\n"); + } + + for (tmpdev = devlist; tmpdev; tmpdev = tmpdev->next) { + if ((tmpdev->used == 1) && (tmpdev->found)) { + if (count) { + if (count < tmpdev->found) + count = 0; + else + count -= tmpdev->found; + } + } + if (tmpdev->used == 1) + tmpdev->used = 4; + } + err_1: + if (st) + st->ss->free_super(st); + return count; +} + +static int +count_volumes(struct intel_hba *hba, int dpa, int verbose) +{ + struct sys_dev *idev, *intel_devices = find_intel_devices(); + int count = 0; + const struct orom_entry *entry; + struct devid_list *dv, *devid_list; + + if (!hba || !hba->path) + return 0; + + for (idev = intel_devices; idev; idev = idev->next) { + if (strstr(idev->path, hba->path)) + break; + } + + if (!idev || !idev->dev_id) + return 0; + + entry = get_orom_entry_by_device_id(idev->dev_id); + + if (!entry || !entry->devid_list) + return 0; + + devid_list = entry->devid_list; + for (dv = devid_list; dv; dv = dv->next) { + + struct md_list *devlist = NULL; + struct sys_dev *device = device_by_id(dv->devid); + char *hba_path; + int found = 0; + + if (device) + hba_path = device->path; + else + return 0; + + devlist = get_devices(hba_path); + /* if no intel devices return zero volumes */ + if (devlist == NULL) + return 0; + + count += active_arrays_by_format("imsm", hba_path, &devlist, dpa, verbose); + dprintf("path: %s active arrays: %d\n", hba_path, count); + if (devlist == NULL) + return 0; + do { + found = 0; + count += count_volumes_list(devlist, + NULL, + verbose, + &found); + dprintf("found %d count: %d\n", found, count); + } while (found); + + dprintf("path: %s total number of volumes: %d\n", hba_path, count); + + while (devlist) { + struct md_list *dv = devlist; + devlist = devlist->next; + free(dv->devname); + free(dv); + } + } + return count; +} + +static int imsm_default_chunk(const struct imsm_orom *orom) +{ + /* up to 512 if the plaform supports it, otherwise the platform max. + * 128 if no platform detected + */ + int fs = max(7, orom ? fls(orom->sss) : 0); + + return min(512, (1 << fs)); +} + +static int +validate_geometry_imsm_orom(struct intel_super *super, int level, int layout, + int raiddisks, int *chunk, unsigned long long size, int verbose) +{ + /* check/set platform and metadata limits/defaults */ + if (super->orom && raiddisks > super->orom->dpa) { + pr_vrb(": platform supports a maximum of %d disks per array\n", + super->orom->dpa); + return 0; + } + + /* capabilities of OROM tested - copied from validate_geometry_imsm_volume */ + if (!is_raid_level_supported(super->orom, level, raiddisks)) { + pr_vrb(": platform does not support raid%d with %d disk%s\n", + level, raiddisks, raiddisks > 1 ? "s" : ""); + return 0; + } + + if (*chunk == 0 || *chunk == UnSet) + *chunk = imsm_default_chunk(super->orom); + + if (super->orom && !imsm_orom_has_chunk(super->orom, *chunk)) { + pr_vrb(": platform does not support a chunk size of: %d\n", *chunk); + return 0; + } + + if (layout != imsm_level_to_layout(level)) { + if (level == 5) + pr_vrb(": imsm raid 5 only supports the left-asymmetric layout\n"); + else if (level == 10) + pr_vrb(": imsm raid 10 only supports the n2 layout\n"); + else + pr_vrb(": imsm unknown layout %#x for this raid level %d\n", + layout, level); + return 0; + } + + if (super->orom && (super->orom->attr & IMSM_OROM_ATTR_2TB) == 0 && + (calc_array_size(level, raiddisks, layout, *chunk, size) >> 32) > 0) { + pr_vrb(": platform does not support a volume size over 2TB\n"); + return 0; + } + + return 1; +} + +/* validate_geometry_imsm_volume - lifted from validate_geometry_ddf_bvd + * FIX ME add ahci details + */ +static int validate_geometry_imsm_volume(struct supertype *st, int level, + int layout, int raiddisks, int *chunk, + unsigned long long size, + unsigned long long data_offset, + char *dev, + unsigned long long *freesize, + int verbose) +{ + struct stat stb; + struct intel_super *super = st->sb; + struct imsm_super *mpb; + struct dl *dl; + unsigned long long pos = 0; + unsigned long long maxsize; + struct extent *e; + int i; + + /* We must have the container info already read in. */ + if (!super) + return 0; + + mpb = super->anchor; + + if (!validate_geometry_imsm_orom(super, level, layout, raiddisks, chunk, size, verbose)) { + pr_err("RAID gemetry validation failed. Cannot proceed with the action(s).\n"); + return 0; + } + if (!dev) { + /* General test: make sure there is space for + * 'raiddisks' device extents of size 'size' at a given + * offset + */ + unsigned long long minsize = size; + unsigned long long start_offset = MaxSector; + int dcnt = 0; + if (minsize == 0) + minsize = MPB_SECTOR_CNT + IMSM_RESERVED_SECTORS; + for (dl = super->disks; dl ; dl = dl->next) { + int found = 0; + + pos = 0; + i = 0; + e = get_extents(super, dl); + if (!e) continue; + do { + unsigned long long esize; + esize = e[i].start - pos; + if (esize >= minsize) + found = 1; + if (found && start_offset == MaxSector) { + start_offset = pos; + break; + } else if (found && pos != start_offset) { + found = 0; + break; + } + pos = e[i].start + e[i].size; + i++; + } while (e[i-1].size); + if (found) + dcnt++; + free(e); + } + if (dcnt < raiddisks) { + if (verbose) + pr_err("imsm: Not enough devices with space for this array (%d < %d)\n", + dcnt, raiddisks); + return 0; + } + return 1; + } + + /* This device must be a member of the set */ + if (stat(dev, &stb) < 0) + return 0; + if ((S_IFMT & stb.st_mode) != S_IFBLK) + return 0; + for (dl = super->disks ; dl ; dl = dl->next) { + if (dl->major == (int)major(stb.st_rdev) && + dl->minor == (int)minor(stb.st_rdev)) + break; + } + if (!dl) { + if (verbose) + pr_err("%s is not in the same imsm set\n", dev); + return 0; + } else if (super->orom && dl->index < 0 && mpb->num_raid_devs) { + /* If a volume is present then the current creation attempt + * cannot incorporate new spares because the orom may not + * understand this configuration (all member disks must be + * members of each array in the container). + */ + pr_err("%s is a spare and a volume is already defined for this container\n", dev); + pr_err("The option-rom requires all member disks to be a member of all volumes\n"); + return 0; + } else if (super->orom && mpb->num_raid_devs > 0 && + mpb->num_disks != raiddisks) { + pr_err("The option-rom requires all member disks to be a member of all volumes\n"); + return 0; + } + + /* retrieve the largest free space block */ + e = get_extents(super, dl); + maxsize = 0; + i = 0; + if (e) { + do { + unsigned long long esize; + + esize = e[i].start - pos; + if (esize >= maxsize) + maxsize = esize; + pos = e[i].start + e[i].size; + i++; + } while (e[i-1].size); + dl->e = e; + dl->extent_cnt = i; + } else { + if (verbose) + pr_err("unable to determine free space for: %s\n", + dev); + return 0; + } + if (maxsize < size) { + if (verbose) + pr_err("%s not enough space (%llu < %llu)\n", + dev, maxsize, size); + return 0; + } + + /* count total number of extents for merge */ + i = 0; + for (dl = super->disks; dl; dl = dl->next) + if (dl->e) + i += dl->extent_cnt; + + maxsize = merge_extents(super, i); + + if (!check_env("IMSM_NO_PLATFORM") && + mpb->num_raid_devs > 0 && size && size != maxsize) { + pr_err("attempting to create a second volume with size less then remaining space. Aborting...\n"); + return 0; + } + + if (maxsize < size || maxsize == 0) { + if (verbose) { + if (maxsize == 0) + pr_err("no free space left on device. Aborting...\n"); + else + pr_err("not enough space to create volume of given size (%llu < %llu). Aborting...\n", + maxsize, size); + } + return 0; + } + + *freesize = maxsize; + + if (super->orom) { + int count = count_volumes(super->hba, + super->orom->dpa, verbose); + if (super->orom->vphba <= count) { + pr_vrb(": platform does not support more than %d raid volumes.\n", + super->orom->vphba); + return 0; + } + } + return 1; +} + +static int imsm_get_free_size(struct supertype *st, int raiddisks, + unsigned long long size, int chunk, + unsigned long long *freesize) +{ + struct intel_super *super = st->sb; + struct imsm_super *mpb = super->anchor; + struct dl *dl; + int i; + int extent_cnt; + struct extent *e; + unsigned long long maxsize; + unsigned long long minsize; + int cnt; + int used; + + /* find the largest common start free region of the possible disks */ + used = 0; + extent_cnt = 0; + cnt = 0; + for (dl = super->disks; dl; dl = dl->next) { + dl->raiddisk = -1; + + if (dl->index >= 0) + used++; + + /* don't activate new spares if we are orom constrained + * and there is already a volume active in the container + */ + if (super->orom && dl->index < 0 && mpb->num_raid_devs) + continue; + + e = get_extents(super, dl); + if (!e) + continue; + for (i = 1; e[i-1].size; i++) + ; + dl->e = e; + dl->extent_cnt = i; + extent_cnt += i; + cnt++; + } + + maxsize = merge_extents(super, extent_cnt); + minsize = size; + if (size == 0) + /* chunk is in K */ + minsize = chunk * 2; + + if (cnt < raiddisks || + (super->orom && used && used != raiddisks) || + maxsize < minsize || + maxsize == 0) { + pr_err("not enough devices with space to create array.\n"); + return 0; /* No enough free spaces large enough */ + } + + if (size == 0) { + size = maxsize; + if (chunk) { + size /= 2 * chunk; + size *= 2 * chunk; + } + maxsize = size; + } + if (!check_env("IMSM_NO_PLATFORM") && + mpb->num_raid_devs > 0 && size && size != maxsize) { + pr_err("attempting to create a second volume with size less then remaining space. Aborting...\n"); + return 0; + } + cnt = 0; + for (dl = super->disks; dl; dl = dl->next) + if (dl->e) + dl->raiddisk = cnt++; + + *freesize = size; + + dprintf("imsm: imsm_get_free_size() returns : %llu\n", size); + + return 1; +} + +static int reserve_space(struct supertype *st, int raiddisks, + unsigned long long size, int chunk, + unsigned long long *freesize) +{ + struct intel_super *super = st->sb; + struct dl *dl; + int cnt; + int rv = 0; + + rv = imsm_get_free_size(st, raiddisks, size, chunk, freesize); + if (rv) { + cnt = 0; + for (dl = super->disks; dl; dl = dl->next) + if (dl->e) + dl->raiddisk = cnt++; + rv = 1; + } + + return rv; +} + +static int validate_geometry_imsm(struct supertype *st, int level, int layout, + int raiddisks, int *chunk, unsigned long long size, + unsigned long long data_offset, + char *dev, unsigned long long *freesize, + int verbose) +{ + int fd, cfd; + struct mdinfo *sra; + int is_member = 0; + + /* load capability + * if given unused devices create a container + * if given given devices in a container create a member volume + */ + if (level == LEVEL_CONTAINER) { + /* Must be a fresh device to add to a container */ + return validate_geometry_imsm_container(st, level, layout, + raiddisks, + *chunk, + size, data_offset, + dev, freesize, + verbose); + } + + if (!dev) { + if (st->sb) { + struct intel_super *super = st->sb; + if (!validate_geometry_imsm_orom(st->sb, level, layout, + raiddisks, chunk, size, + verbose)) + return 0; + /* we are being asked to automatically layout a + * new volume based on the current contents of + * the container. If the the parameters can be + * satisfied reserve_space will record the disks, + * start offset, and size of the volume to be + * created. add_to_super and getinfo_super + * detect when autolayout is in progress. + */ + /* assuming that freesize is always given when array is + created */ + if (super->orom && freesize) { + int count; + count = count_volumes(super->hba, + super->orom->dpa, verbose); + if (super->orom->vphba <= count) { + pr_vrb(": platform does not support more than %d raid volumes.\n", + super->orom->vphba); + return 0; + } + } + if (freesize) + return reserve_space(st, raiddisks, size, + *chunk, freesize); + } + return 1; + } + if (st->sb) { + /* creating in a given container */ + return validate_geometry_imsm_volume(st, level, layout, + raiddisks, chunk, size, + data_offset, + dev, freesize, verbose); + } + + /* This device needs to be a device in an 'imsm' container */ + fd = open(dev, O_RDONLY|O_EXCL, 0); + if (fd >= 0) { + if (verbose) + pr_err("Cannot create this array on device %s\n", + dev); + close(fd); + return 0; + } + if (errno != EBUSY || (fd = open(dev, O_RDONLY, 0)) < 0) { + if (verbose) + pr_err("Cannot open %s: %s\n", + dev, strerror(errno)); + return 0; + } + /* Well, it is in use by someone, maybe an 'imsm' container. */ + cfd = open_container(fd); + close(fd); + if (cfd < 0) { + if (verbose) + pr_err("Cannot use %s: It is busy\n", + dev); + return 0; + } + sra = sysfs_read(cfd, NULL, GET_VERSION); + if (sra && sra->array.major_version == -1 && + strcmp(sra->text_version, "imsm") == 0) + is_member = 1; + sysfs_free(sra); + if (is_member) { + /* This is a member of a imsm container. Load the container + * and try to create a volume + */ + struct intel_super *super; + + if (load_super_imsm_all(st, cfd, (void **) &super, NULL, NULL, 1) == 0) { + st->sb = super; + strcpy(st->container_devnm, fd2devnm(cfd)); + close(cfd); + return validate_geometry_imsm_volume(st, level, layout, + raiddisks, chunk, + size, data_offset, dev, + freesize, 1) + ? 1 : -1; + } + } + + if (verbose) + pr_err("failed container membership check\n"); + + close(cfd); + return 0; +} + +static void default_geometry_imsm(struct supertype *st, int *level, int *layout, int *chunk) +{ + struct intel_super *super = st->sb; + + if (level && *level == UnSet) + *level = LEVEL_CONTAINER; + + if (level && layout && *layout == UnSet) + *layout = imsm_level_to_layout(*level); + + if (chunk && (*chunk == UnSet || *chunk == 0)) + *chunk = imsm_default_chunk(super->orom); +} + +static void handle_missing(struct intel_super *super, struct imsm_dev *dev); + +static int kill_subarray_imsm(struct supertype *st) +{ + /* remove the subarray currently referenced by ->current_vol */ + __u8 i; + struct intel_dev **dp; + struct intel_super *super = st->sb; + __u8 current_vol = super->current_vol; + struct imsm_super *mpb = super->anchor; + + if (super->current_vol < 0) + return 2; + super->current_vol = -1; /* invalidate subarray cursor */ + + /* block deletions that would change the uuid of active subarrays + * + * FIXME when immutable ids are available, but note that we'll + * also need to fixup the invalidated/active subarray indexes in + * mdstat + */ + for (i = 0; i < mpb->num_raid_devs; i++) { + char subarray[4]; + + if (i < current_vol) + continue; + sprintf(subarray, "%u", i); + if (is_subarray_active(subarray, st->devnm)) { + pr_err("deleting subarray-%d would change the UUID of active subarray-%d, aborting\n", + current_vol, i); + + return 2; + } + } + + if (st->update_tail) { + struct imsm_update_kill_array *u = xmalloc(sizeof(*u)); + + u->type = update_kill_array; + u->dev_idx = current_vol; + append_metadata_update(st, u, sizeof(*u)); + + return 0; + } + + for (dp = &super->devlist; *dp;) + if ((*dp)->index == current_vol) { + *dp = (*dp)->next; + } else { + handle_missing(super, (*dp)->dev); + if ((*dp)->index > current_vol) + (*dp)->index--; + dp = &(*dp)->next; + } + + /* no more raid devices, all active components are now spares, + * but of course failed are still failed + */ + if (--mpb->num_raid_devs == 0) { + struct dl *d; + + for (d = super->disks; d; d = d->next) + if (d->index > -2) + mark_spare(d); + } + + super->updates_pending++; + + return 0; +} + +static int update_subarray_imsm(struct supertype *st, char *subarray, + char *update, struct mddev_ident *ident) +{ + /* update the subarray currently referenced by ->current_vol */ + struct intel_super *super = st->sb; + struct imsm_super *mpb = super->anchor; + + if (strcmp(update, "name") == 0) { + char *name = ident->name; + char *ep; + int vol; + + if (is_subarray_active(subarray, st->devnm)) { + pr_err("Unable to update name of active subarray\n"); + return 2; + } + + if (!check_name(super, name, 0)) + return 2; + + vol = strtoul(subarray, &ep, 10); + if (*ep != '\0' || vol >= super->anchor->num_raid_devs) + return 2; + + if (st->update_tail) { + struct imsm_update_rename_array *u = xmalloc(sizeof(*u)); + + u->type = update_rename_array; + u->dev_idx = vol; + snprintf((char *) u->name, MAX_RAID_SERIAL_LEN, "%s", name); + append_metadata_update(st, u, sizeof(*u)); + } else { + struct imsm_dev *dev; + int i; + + dev = get_imsm_dev(super, vol); + snprintf((char *) dev->volume, MAX_RAID_SERIAL_LEN, "%s", name); + for (i = 0; i < mpb->num_raid_devs; i++) { + dev = get_imsm_dev(super, i); + handle_missing(super, dev); + } + super->updates_pending++; + } + } else + return 2; + + return 0; +} +#endif /* MDASSEMBLE */ + +static int is_gen_migration(struct imsm_dev *dev) +{ + if (dev == NULL) + return 0; + + if (!dev->vol.migr_state) + return 0; + + if (migr_type(dev) == MIGR_GEN_MIGR) + return 1; + + return 0; +} + +static int is_rebuilding(struct imsm_dev *dev) +{ + struct imsm_map *migr_map; + + if (!dev->vol.migr_state) + return 0; + + if (migr_type(dev) != MIGR_REBUILD) + return 0; + + migr_map = get_imsm_map(dev, MAP_1); + + if (migr_map->map_state == IMSM_T_STATE_DEGRADED) + return 1; + else + return 0; +} + +#ifndef MDASSEMBLE +static int is_initializing(struct imsm_dev *dev) +{ + struct imsm_map *migr_map; + + if (!dev->vol.migr_state) + return 0; + + if (migr_type(dev) != MIGR_INIT) + return 0; + + migr_map = get_imsm_map(dev, MAP_1); + + if (migr_map->map_state == IMSM_T_STATE_UNINITIALIZED) + return 1; + + return 0; +} +#endif + +static void update_recovery_start(struct intel_super *super, + struct imsm_dev *dev, + struct mdinfo *array) +{ + struct mdinfo *rebuild = NULL; + struct mdinfo *d; + __u32 units; + + if (!is_rebuilding(dev)) + return; + + /* Find the rebuild target, but punt on the dual rebuild case */ + for (d = array->devs; d; d = d->next) + if (d->recovery_start == 0) { + if (rebuild) + return; + rebuild = d; + } + + if (!rebuild) { + /* (?) none of the disks are marked with + * IMSM_ORD_REBUILD, so assume they are missing and the + * disk_ord_tbl was not correctly updated + */ + dprintf("failed to locate out-of-sync disk\n"); + return; + } + + units = __le32_to_cpu(dev->vol.curr_migr_unit); + rebuild->recovery_start = units * blocks_per_migr_unit(super, dev); +} + +#ifndef MDASSEMBLE +static int recover_backup_imsm(struct supertype *st, struct mdinfo *info); +#endif + +static struct mdinfo *container_content_imsm(struct supertype *st, char *subarray) +{ + /* Given a container loaded by load_super_imsm_all, + * extract information about all the arrays into + * an mdinfo tree. + * If 'subarray' is given, just extract info about that array. + * + * For each imsm_dev create an mdinfo, fill it in, + * then look for matching devices in super->disks + * and create appropriate device mdinfo. + */ + struct intel_super *super = st->sb; + struct imsm_super *mpb = super->anchor; + struct mdinfo *rest = NULL; + unsigned int i; + int sb_errors = 0; + struct dl *d; + int spare_disks = 0; + + /* do not assemble arrays when not all attributes are supported */ + if (imsm_check_attributes(mpb->attributes) == 0) { + sb_errors = 1; + pr_err("Unsupported attributes in IMSM metadata.Arrays activation is blocked.\n"); + } + + /* check for bad blocks */ + if (imsm_bbm_log_size(super->anchor)) { + pr_err("BBM log found in IMSM metadata.Arrays activation is blocked.\n"); + sb_errors = 1; + } + + /* count spare devices, not used in maps + */ + for (d = super->disks; d; d = d->next) + if (d->index == -1) + spare_disks++; + + for (i = 0; i < mpb->num_raid_devs; i++) { + struct imsm_dev *dev; + struct imsm_map *map; + struct imsm_map *map2; + struct mdinfo *this; + int slot; +#ifndef MDASSEMBLE + int chunk; +#endif + char *ep; + + if (subarray && + (i != strtoul(subarray, &ep, 10) || *ep != '\0')) + continue; + + dev = get_imsm_dev(super, i); + map = get_imsm_map(dev, MAP_0); + map2 = get_imsm_map(dev, MAP_1); + + /* do not publish arrays that are in the middle of an + * unsupported migration + */ + if (dev->vol.migr_state && + (migr_type(dev) == MIGR_STATE_CHANGE)) { + pr_err("cannot assemble volume '%.16s': unsupported migration in progress\n", + dev->volume); + continue; + } + /* do not publish arrays that are not support by controller's + * OROM/EFI + */ + + this = xmalloc(sizeof(*this)); + + super->current_vol = i; + getinfo_super_imsm_volume(st, this, NULL); + this->next = rest; +#ifndef MDASSEMBLE + chunk = __le16_to_cpu(map->blocks_per_strip) >> 1; + /* mdadm does not support all metadata features- set the bit in all arrays state */ + if (!validate_geometry_imsm_orom(super, + get_imsm_raid_level(map), /* RAID level */ + imsm_level_to_layout(get_imsm_raid_level(map)), + map->num_members, /* raid disks */ + &chunk, join_u32(dev->size_low, dev->size_high), + 1 /* verbose */)) { + pr_err("IMSM RAID geometry validation failed. Array %s activation is blocked.\n", + dev->volume); + this->array.state |= + (1<<MD_SB_BLOCK_CONTAINER_RESHAPE) | + (1<<MD_SB_BLOCK_VOLUME); + } +#endif + + /* if array has bad blocks, set suitable bit in all arrays state */ + if (sb_errors) + this->array.state |= + (1<<MD_SB_BLOCK_CONTAINER_RESHAPE) | + (1<<MD_SB_BLOCK_VOLUME); + + for (slot = 0 ; slot < map->num_members; slot++) { + unsigned long long recovery_start; + struct mdinfo *info_d; + struct dl *d; + int idx; + int skip; + __u32 ord; + + skip = 0; + idx = get_imsm_disk_idx(dev, slot, MAP_0); + ord = get_imsm_ord_tbl_ent(dev, slot, MAP_X); + for (d = super->disks; d ; d = d->next) + if (d->index == idx) + break; + + recovery_start = MaxSector; + if (d == NULL) + skip = 1; + if (d && is_failed(&d->disk)) + skip = 1; + if (ord & IMSM_ORD_REBUILD) + recovery_start = 0; + + /* + * if we skip some disks the array will be assmebled degraded; + * reset resync start to avoid a dirty-degraded + * situation when performing the intial sync + * + * FIXME handle dirty degraded + */ + if ((skip || recovery_start == 0) && !dev->vol.dirty) + this->resync_start = MaxSector; + if (skip) + continue; + + info_d = xcalloc(1, sizeof(*info_d)); + info_d->next = this->devs; + this->devs = info_d; + + info_d->disk.number = d->index; + info_d->disk.major = d->major; + info_d->disk.minor = d->minor; + info_d->disk.raid_disk = slot; + info_d->recovery_start = recovery_start; + if (map2) { + if (slot < map2->num_members) + info_d->disk.state = (1 << MD_DISK_ACTIVE); + else + this->array.spare_disks++; + } else { + if (slot < map->num_members) + info_d->disk.state = (1 << MD_DISK_ACTIVE); + else + this->array.spare_disks++; + } + if (info_d->recovery_start == MaxSector) + this->array.working_disks++; + + info_d->events = __le32_to_cpu(mpb->generation_num); + info_d->data_offset = pba_of_lba0(map); + info_d->component_size = blocks_per_member(map); + } + /* now that the disk list is up-to-date fixup recovery_start */ + update_recovery_start(super, dev, this); + this->array.spare_disks += spare_disks; + +#ifndef MDASSEMBLE + /* check for reshape */ + if (this->reshape_active == 1) + recover_backup_imsm(st, this); +#endif + rest = this; + } + + return rest; +} + +static __u8 imsm_check_degraded(struct intel_super *super, struct imsm_dev *dev, + int failed, int look_in_map) +{ + struct imsm_map *map; + + map = get_imsm_map(dev, look_in_map); + + if (!failed) + return map->map_state == IMSM_T_STATE_UNINITIALIZED ? + IMSM_T_STATE_UNINITIALIZED : IMSM_T_STATE_NORMAL; + + switch (get_imsm_raid_level(map)) { + case 0: + return IMSM_T_STATE_FAILED; + break; + case 1: + if (failed < map->num_members) + return IMSM_T_STATE_DEGRADED; + else + return IMSM_T_STATE_FAILED; + break; + case 10: + { + /** + * check to see if any mirrors have failed, otherwise we + * are degraded. Even numbered slots are mirrored on + * slot+1 + */ + int i; + /* gcc -Os complains that this is unused */ + int insync = insync; + + for (i = 0; i < map->num_members; i++) { + __u32 ord = get_imsm_ord_tbl_ent(dev, i, MAP_X); + int idx = ord_to_idx(ord); + struct imsm_disk *disk; + + /* reset the potential in-sync count on even-numbered + * slots. num_copies is always 2 for imsm raid10 + */ + if ((i & 1) == 0) + insync = 2; + + disk = get_imsm_disk(super, idx); + if (!disk || is_failed(disk) || ord & IMSM_ORD_REBUILD) + insync--; + + /* no in-sync disks left in this mirror the + * array has failed + */ + if (insync == 0) + return IMSM_T_STATE_FAILED; + } + + return IMSM_T_STATE_DEGRADED; + } + case 5: + if (failed < 2) + return IMSM_T_STATE_DEGRADED; + else + return IMSM_T_STATE_FAILED; + break; + default: + break; + } + + return map->map_state; +} + +static int imsm_count_failed(struct intel_super *super, struct imsm_dev *dev, + int look_in_map) +{ + int i; + int failed = 0; + struct imsm_disk *disk; + struct imsm_map *map = get_imsm_map(dev, MAP_0); + struct imsm_map *prev = get_imsm_map(dev, MAP_1); + struct imsm_map *map_for_loop; + __u32 ord; + int idx; + int idx_1; + + /* at the beginning of migration we set IMSM_ORD_REBUILD on + * disks that are being rebuilt. New failures are recorded to + * map[0]. So we look through all the disks we started with and + * see if any failures are still present, or if any new ones + * have arrived + */ + map_for_loop = map; + if (prev && (map->num_members < prev->num_members)) + map_for_loop = prev; + + for (i = 0; i < map_for_loop->num_members; i++) { + idx_1 = -255; + /* when MAP_X is passed both maps failures are counted + */ + if (prev && + ((look_in_map == MAP_1) || (look_in_map == MAP_X)) && + (i < prev->num_members)) { + ord = __le32_to_cpu(prev->disk_ord_tbl[i]); + idx_1 = ord_to_idx(ord); + + disk = get_imsm_disk(super, idx_1); + if (!disk || is_failed(disk) || ord & IMSM_ORD_REBUILD) + failed++; + } + if (((look_in_map == MAP_0) || (look_in_map == MAP_X)) && + (i < map->num_members)) { + ord = __le32_to_cpu(map->disk_ord_tbl[i]); + idx = ord_to_idx(ord); + + if (idx != idx_1) { + disk = get_imsm_disk(super, idx); + if (!disk || is_failed(disk) || + ord & IMSM_ORD_REBUILD) + failed++; + } + } + } + + return failed; +} + +#ifndef MDASSEMBLE +static int imsm_open_new(struct supertype *c, struct active_array *a, + char *inst) +{ + struct intel_super *super = c->sb; + struct imsm_super *mpb = super->anchor; + + if (atoi(inst) >= mpb->num_raid_devs) { + pr_err("subarry index %d, out of range\n", atoi(inst)); + return -ENODEV; + } + + dprintf("imsm: open_new %s\n", inst); + a->info.container_member = atoi(inst); + return 0; +} + +static int is_resyncing(struct imsm_dev *dev) +{ + struct imsm_map *migr_map; + + if (!dev->vol.migr_state) + return 0; + + if (migr_type(dev) == MIGR_INIT || + migr_type(dev) == MIGR_REPAIR) + return 1; + + if (migr_type(dev) == MIGR_GEN_MIGR) + return 0; + + migr_map = get_imsm_map(dev, MAP_1); + + if ((migr_map->map_state == IMSM_T_STATE_NORMAL) && + (dev->vol.migr_type != MIGR_GEN_MIGR)) + return 1; + else + return 0; +} + +/* return true if we recorded new information */ +static int mark_failure(struct imsm_dev *dev, struct imsm_disk *disk, int idx) +{ + __u32 ord; + int slot; + struct imsm_map *map; + char buf[MAX_RAID_SERIAL_LEN+3]; + unsigned int len, shift = 0; + + /* new failures are always set in map[0] */ + map = get_imsm_map(dev, MAP_0); + + slot = get_imsm_disk_slot(map, idx); + if (slot < 0) + return 0; + + ord = __le32_to_cpu(map->disk_ord_tbl[slot]); + if (is_failed(disk) && (ord & IMSM_ORD_REBUILD)) + return 0; + + memcpy(buf, disk->serial, MAX_RAID_SERIAL_LEN); + buf[MAX_RAID_SERIAL_LEN] = '\000'; + strcat(buf, ":0"); + if ((len = strlen(buf)) >= MAX_RAID_SERIAL_LEN) + shift = len - MAX_RAID_SERIAL_LEN + 1; + strncpy((char *)disk->serial, &buf[shift], MAX_RAID_SERIAL_LEN); + + disk->status |= FAILED_DISK; + set_imsm_ord_tbl_ent(map, slot, idx | IMSM_ORD_REBUILD); + /* mark failures in second map if second map exists and this disk + * in this slot. + * This is valid for migration, initialization and rebuild + */ + if (dev->vol.migr_state) { + struct imsm_map *map2 = get_imsm_map(dev, MAP_1); + int slot2 = get_imsm_disk_slot(map2, idx); + + if ((slot2 < map2->num_members) && + (slot2 >= 0)) + set_imsm_ord_tbl_ent(map2, slot2, + idx | IMSM_ORD_REBUILD); + } + if (map->failed_disk_num == 0xff) + map->failed_disk_num = slot; + return 1; +} + +static void mark_missing(struct imsm_dev *dev, struct imsm_disk *disk, int idx) +{ + mark_failure(dev, disk, idx); + + if (disk->scsi_id == __cpu_to_le32(~(__u32)0)) + return; + + disk->scsi_id = __cpu_to_le32(~(__u32)0); + memmove(&disk->serial[0], &disk->serial[1], MAX_RAID_SERIAL_LEN - 1); +} + +static void handle_missing(struct intel_super *super, struct imsm_dev *dev) +{ + struct dl *dl; + + if (!super->missing) + return; + + /* When orom adds replacement for missing disk it does + * not remove entry of missing disk, but just updates map with + * new added disk. So it is not enough just to test if there is + * any missing disk, we have to look if there are any failed disks + * in map to stop migration */ + + dprintf("imsm: mark missing\n"); + /* end process for initialization and rebuild only + */ + if (is_gen_migration(dev) == 0) { + __u8 map_state; + int failed; + + failed = imsm_count_failed(super, dev, MAP_0); + map_state = imsm_check_degraded(super, dev, failed, MAP_0); + + if (failed) + end_migration(dev, super, map_state); + } + for (dl = super->missing; dl; dl = dl->next) + mark_missing(dev, &dl->disk, dl->index); + super->updates_pending++; +} + +static unsigned long long imsm_set_array_size(struct imsm_dev *dev, + long long new_size) +{ + int used_disks = imsm_num_data_members(dev, MAP_0); + unsigned long long array_blocks; + struct imsm_map *map; + + if (used_disks == 0) { + /* when problems occures + * return current array_blocks value + */ + array_blocks = __le32_to_cpu(dev->size_high); + array_blocks = array_blocks << 32; + array_blocks += __le32_to_cpu(dev->size_low); + + return array_blocks; + } + + /* set array size in metadata + */ + if (new_size <= 0) { + /* OLCE size change is caused by added disks + */ + map = get_imsm_map(dev, MAP_0); + array_blocks = blocks_per_member(map) * used_disks; + } else { + /* Online Volume Size Change + * Using available free space + */ + array_blocks = new_size; + } + + /* round array size down to closest MB + */ + array_blocks = (array_blocks >> SECT_PER_MB_SHIFT) << SECT_PER_MB_SHIFT; + dev->size_low = __cpu_to_le32((__u32)array_blocks); + dev->size_high = __cpu_to_le32((__u32)(array_blocks >> 32)); + + return array_blocks; +} + +static void imsm_set_disk(struct active_array *a, int n, int state); + +static void imsm_progress_container_reshape(struct intel_super *super) +{ + /* if no device has a migr_state, but some device has a + * different number of members than the previous device, start + * changing the number of devices in this device to match + * previous. + */ + struct imsm_super *mpb = super->anchor; + int prev_disks = -1; + int i; + int copy_map_size; + + for (i = 0; i < mpb->num_raid_devs; i++) { + struct imsm_dev *dev = get_imsm_dev(super, i); + struct imsm_map *map = get_imsm_map(dev, MAP_0); + struct imsm_map *map2; + int prev_num_members; + + if (dev->vol.migr_state) + return; + + if (prev_disks == -1) + prev_disks = map->num_members; + if (prev_disks == map->num_members) + continue; + + /* OK, this array needs to enter reshape mode. + * i.e it needs a migr_state + */ + + copy_map_size = sizeof_imsm_map(map); + prev_num_members = map->num_members; + map->num_members = prev_disks; + dev->vol.migr_state = 1; + dev->vol.curr_migr_unit = 0; + set_migr_type(dev, MIGR_GEN_MIGR); + for (i = prev_num_members; + i < map->num_members; i++) + set_imsm_ord_tbl_ent(map, i, i); + map2 = get_imsm_map(dev, MAP_1); + /* Copy the current map */ + memcpy(map2, map, copy_map_size); + map2->num_members = prev_num_members; + + imsm_set_array_size(dev, -1); + super->clean_migration_record_by_mdmon = 1; + super->updates_pending++; + } +} + +/* Handle dirty -> clean transititions, resync and reshape. Degraded and rebuild + * states are handled in imsm_set_disk() with one exception, when a + * resync is stopped due to a new failure this routine will set the + * 'degraded' state for the array. + */ +static int imsm_set_array_state(struct active_array *a, int consistent) +{ + int inst = a->info.container_member; + struct intel_super *super = a->container->sb; + struct imsm_dev *dev = get_imsm_dev(super, inst); + struct imsm_map *map = get_imsm_map(dev, MAP_0); + int failed = imsm_count_failed(super, dev, MAP_0); + __u8 map_state = imsm_check_degraded(super, dev, failed, MAP_0); + __u32 blocks_per_unit; + + if (dev->vol.migr_state && + dev->vol.migr_type == MIGR_GEN_MIGR) { + /* array state change is blocked due to reshape action + * We might need to + * - abort the reshape (if last_checkpoint is 0 and action!= reshape) + * - finish the reshape (if last_checkpoint is big and action != reshape) + * - update curr_migr_unit + */ + if (a->curr_action == reshape) { + /* still reshaping, maybe update curr_migr_unit */ + goto mark_checkpoint; + } else { + if (a->last_checkpoint == 0 && a->prev_action == reshape) { + /* for some reason we aborted the reshape. + * + * disable automatic metadata rollback + * user action is required to recover process + */ + if (0) { + struct imsm_map *map2 = + get_imsm_map(dev, MAP_1); + dev->vol.migr_state = 0; + set_migr_type(dev, 0); + dev->vol.curr_migr_unit = 0; + memcpy(map, map2, + sizeof_imsm_map(map2)); + super->updates_pending++; + } + } + if (a->last_checkpoint >= a->info.component_size) { + unsigned long long array_blocks; + int used_disks; + struct mdinfo *mdi; + + used_disks = imsm_num_data_members(dev, MAP_0); + if (used_disks > 0) { + array_blocks = + blocks_per_member(map) * + used_disks; + /* round array size down to closest MB + */ + array_blocks = (array_blocks + >> SECT_PER_MB_SHIFT) + << SECT_PER_MB_SHIFT; + a->info.custom_array_size = array_blocks; + /* encourage manager to update array + * size + */ + + a->check_reshape = 1; + } + /* finalize online capacity expansion/reshape */ + for (mdi = a->info.devs; mdi; mdi = mdi->next) + imsm_set_disk(a, + mdi->disk.raid_disk, + mdi->curr_state); + + imsm_progress_container_reshape(super); + } + } + } + + /* before we activate this array handle any missing disks */ + if (consistent == 2) + handle_missing(super, dev); + + if (consistent == 2 && + (!is_resync_complete(&a->info) || + map_state != IMSM_T_STATE_NORMAL || + dev->vol.migr_state)) + consistent = 0; + + if (is_resync_complete(&a->info)) { + /* complete intialization / resync, + * recovery and interrupted recovery is completed in + * ->set_disk + */ + if (is_resyncing(dev)) { + dprintf("imsm: mark resync done\n"); + end_migration(dev, super, map_state); + super->updates_pending++; + a->last_checkpoint = 0; + } + } else if ((!is_resyncing(dev) && !failed) && + (imsm_reshape_blocks_arrays_changes(super) == 0)) { + /* mark the start of the init process if nothing is failed */ + dprintf("imsm: mark resync start\n"); + if (map->map_state == IMSM_T_STATE_UNINITIALIZED) + migrate(dev, super, IMSM_T_STATE_NORMAL, MIGR_INIT); + else + migrate(dev, super, IMSM_T_STATE_NORMAL, MIGR_REPAIR); + super->updates_pending++; + } + +mark_checkpoint: + /* skip checkpointing for general migration, + * it is controlled in mdadm + */ + if (is_gen_migration(dev)) + goto skip_mark_checkpoint; + + /* check if we can update curr_migr_unit from resync_start, recovery_start */ + blocks_per_unit = blocks_per_migr_unit(super, dev); + if (blocks_per_unit) { + __u32 units32; + __u64 units; + + units = a->last_checkpoint / blocks_per_unit; + units32 = units; + + /* check that we did not overflow 32-bits, and that + * curr_migr_unit needs updating + */ + if (units32 == units && + units32 != 0 && + __le32_to_cpu(dev->vol.curr_migr_unit) != units32) { + dprintf("imsm: mark checkpoint (%u)\n", units32); + dev->vol.curr_migr_unit = __cpu_to_le32(units32); + super->updates_pending++; + } + } + +skip_mark_checkpoint: + /* mark dirty / clean */ + if (dev->vol.dirty != !consistent) { + dprintf("imsm: mark '%s'\n", consistent ? "clean" : "dirty"); + if (consistent) + dev->vol.dirty = 0; + else + dev->vol.dirty = 1; + super->updates_pending++; + } + + return consistent; +} + +static void imsm_set_disk(struct active_array *a, int n, int state) +{ + int inst = a->info.container_member; + struct intel_super *super = a->container->sb; + struct imsm_dev *dev = get_imsm_dev(super, inst); + struct imsm_map *map = get_imsm_map(dev, MAP_0); + struct imsm_disk *disk; + struct mdinfo *mdi; + int recovery_not_finished = 0; + int failed; + __u32 ord; + __u8 map_state; + + if (n > map->num_members) + pr_err("imsm: set_disk %d out of range 0..%d\n", + n, map->num_members - 1); + + if (n < 0) + return; + + dprintf("imsm: set_disk %d:%x\n", n, state); + + ord = get_imsm_ord_tbl_ent(dev, n, MAP_0); + disk = get_imsm_disk(super, ord_to_idx(ord)); + + /* check for new failures */ + if (state & DS_FAULTY) { + if (mark_failure(dev, disk, ord_to_idx(ord))) + super->updates_pending++; + } + + /* check if in_sync */ + if (state & DS_INSYNC && ord & IMSM_ORD_REBUILD && is_rebuilding(dev)) { + struct imsm_map *migr_map = get_imsm_map(dev, MAP_1); + + set_imsm_ord_tbl_ent(migr_map, n, ord_to_idx(ord)); + super->updates_pending++; + } + + failed = imsm_count_failed(super, dev, MAP_0); + map_state = imsm_check_degraded(super, dev, failed, MAP_0); + + /* check if recovery complete, newly degraded, or failed */ + dprintf("imsm: Detected transition to state "); + switch (map_state) { + case IMSM_T_STATE_NORMAL: /* transition to normal state */ + dprintf("normal: "); + if (is_rebuilding(dev)) { + dprintf_cont("while rebuilding"); + /* check if recovery is really finished */ + for (mdi = a->info.devs; mdi ; mdi = mdi->next) + if (mdi->recovery_start != MaxSector) { + recovery_not_finished = 1; + break; + } + if (recovery_not_finished) { + dprintf_cont("\n"); + dprintf("Rebuild has not finished yet, state not changed"); + if (a->last_checkpoint < mdi->recovery_start) { + a->last_checkpoint = mdi->recovery_start; + super->updates_pending++; + } + break; + } + end_migration(dev, super, map_state); + map = get_imsm_map(dev, MAP_0); + map->failed_disk_num = ~0; + super->updates_pending++; + a->last_checkpoint = 0; + break; + } + if (is_gen_migration(dev)) { + dprintf_cont("while general migration"); + if (a->last_checkpoint >= a->info.component_size) + end_migration(dev, super, map_state); + else + map->map_state = map_state; + map = get_imsm_map(dev, MAP_0); + map->failed_disk_num = ~0; + super->updates_pending++; + break; + } + break; + case IMSM_T_STATE_DEGRADED: /* transition to degraded state */ + dprintf_cont("degraded: "); + if ((map->map_state != map_state) && + !dev->vol.migr_state) { + dprintf_cont("mark degraded"); + map->map_state = map_state; + super->updates_pending++; + a->last_checkpoint = 0; + break; + } + if (is_rebuilding(dev)) { + dprintf_cont("while rebuilding."); + if (map->map_state != map_state) { + dprintf_cont(" Map state change"); + end_migration(dev, super, map_state); + super->updates_pending++; + } + break; + } + if (is_gen_migration(dev)) { + dprintf_cont("while general migration"); + if (a->last_checkpoint >= a->info.component_size) + end_migration(dev, super, map_state); + else { + map->map_state = map_state; + manage_second_map(super, dev); + } + super->updates_pending++; + break; + } + if (is_initializing(dev)) { + dprintf_cont("while initialization."); + map->map_state = map_state; + super->updates_pending++; + break; + } + break; + case IMSM_T_STATE_FAILED: /* transition to failed state */ + dprintf_cont("failed: "); + if (is_gen_migration(dev)) { + dprintf_cont("while general migration"); + map->map_state = map_state; + super->updates_pending++; + break; + } + if (map->map_state != map_state) { + dprintf_cont("mark failed"); + end_migration(dev, super, map_state); + super->updates_pending++; + a->last_checkpoint = 0; + break; + } + break; + default: + dprintf_cont("state %i\n", map_state); + } + dprintf_cont("\n"); +} + +static int store_imsm_mpb(int fd, struct imsm_super *mpb) +{ + void *buf = mpb; + __u32 mpb_size = __le32_to_cpu(mpb->mpb_size); + unsigned long long dsize; + unsigned long long sectors; + + get_dev_size(fd, NULL, &dsize); + + if (mpb_size > 512) { + /* -1 to account for anchor */ + sectors = mpb_sectors(mpb) - 1; + + /* write the extended mpb to the sectors preceeding the anchor */ + if (lseek64(fd, dsize - (512 * (2 + sectors)), SEEK_SET) < 0) + return 1; + + if ((unsigned long long)write(fd, buf + 512, 512 * sectors) + != 512 * sectors) + return 1; + } + + /* first block is stored on second to last sector of the disk */ + if (lseek64(fd, dsize - (512 * 2), SEEK_SET) < 0) + return 1; + + if (write(fd, buf, 512) != 512) + return 1; + + return 0; +} + +static void imsm_sync_metadata(struct supertype *container) +{ + struct intel_super *super = container->sb; + + dprintf("sync metadata: %d\n", super->updates_pending); + if (!super->updates_pending) + return; + + write_super_imsm(container, 0); + + super->updates_pending = 0; +} + +static struct dl *imsm_readd(struct intel_super *super, int idx, struct active_array *a) +{ + struct imsm_dev *dev = get_imsm_dev(super, a->info.container_member); + int i = get_imsm_disk_idx(dev, idx, MAP_X); + struct dl *dl; + + for (dl = super->disks; dl; dl = dl->next) + if (dl->index == i) + break; + + if (dl && is_failed(&dl->disk)) + dl = NULL; + + if (dl) + dprintf("found %x:%x\n", dl->major, dl->minor); + + return dl; +} + +static struct dl *imsm_add_spare(struct intel_super *super, int slot, + struct active_array *a, int activate_new, + struct mdinfo *additional_test_list) +{ + struct imsm_dev *dev = get_imsm_dev(super, a->info.container_member); + int idx = get_imsm_disk_idx(dev, slot, MAP_X); + struct imsm_super *mpb = super->anchor; + struct imsm_map *map; + unsigned long long pos; + struct mdinfo *d; + struct extent *ex; + int i, j; + int found; + __u32 array_start = 0; + __u32 array_end = 0; + struct dl *dl; + struct mdinfo *test_list; + + for (dl = super->disks; dl; dl = dl->next) { + /* If in this array, skip */ + for (d = a->info.devs ; d ; d = d->next) + if (d->state_fd >= 0 && + d->disk.major == dl->major && + d->disk.minor == dl->minor) { + dprintf("%x:%x already in array\n", + dl->major, dl->minor); + break; + } + if (d) + continue; + test_list = additional_test_list; + while (test_list) { + if (test_list->disk.major == dl->major && + test_list->disk.minor == dl->minor) { + dprintf("%x:%x already in additional test list\n", + dl->major, dl->minor); + break; + } + test_list = test_list->next; + } + if (test_list) + continue; + + /* skip in use or failed drives */ + if (is_failed(&dl->disk) || idx == dl->index || + dl->index == -2) { + dprintf("%x:%x status (failed: %d index: %d)\n", + dl->major, dl->minor, is_failed(&dl->disk), idx); + continue; + } + + /* skip pure spares when we are looking for partially + * assimilated drives + */ + if (dl->index == -1 && !activate_new) + continue; + + /* Does this unused device have the requisite free space? + * It needs to be able to cover all member volumes + */ + ex = get_extents(super, dl); + if (!ex) { + dprintf("cannot get extents\n"); + continue; + } + for (i = 0; i < mpb->num_raid_devs; i++) { + dev = get_imsm_dev(super, i); + map = get_imsm_map(dev, MAP_0); + + /* check if this disk is already a member of + * this array + */ + if (get_imsm_disk_slot(map, dl->index) >= 0) + continue; + + found = 0; + j = 0; + pos = 0; + array_start = pba_of_lba0(map); + array_end = array_start + + blocks_per_member(map) - 1; + + do { + /* check that we can start at pba_of_lba0 with + * blocks_per_member of space + */ + if (array_start >= pos && array_end < ex[j].start) { + found = 1; + break; + } + pos = ex[j].start + ex[j].size; + j++; + } while (ex[j-1].size); + + if (!found) + break; + } + + free(ex); + if (i < mpb->num_raid_devs) { + dprintf("%x:%x does not have %u to %u available\n", + dl->major, dl->minor, array_start, array_end); + /* No room */ + continue; + } + return dl; + } + + return dl; +} + +static int imsm_rebuild_allowed(struct supertype *cont, int dev_idx, int failed) +{ + struct imsm_dev *dev2; + struct imsm_map *map; + struct dl *idisk; + int slot; + int idx; + __u8 state; + + dev2 = get_imsm_dev(cont->sb, dev_idx); + if (dev2) { + state = imsm_check_degraded(cont->sb, dev2, failed, MAP_0); + if (state == IMSM_T_STATE_FAILED) { + map = get_imsm_map(dev2, MAP_0); + if (!map) + return 1; + for (slot = 0; slot < map->num_members; slot++) { + /* + * Check if failed disks are deleted from intel + * disk list or are marked to be deleted + */ + idx = get_imsm_disk_idx(dev2, slot, MAP_X); + idisk = get_imsm_dl_disk(cont->sb, idx); + /* + * Do not rebuild the array if failed disks + * from failed sub-array are not removed from + * container. + */ + if (idisk && + is_failed(&idisk->disk) && + (idisk->action != DISK_REMOVE)) + return 0; + } + } + } + return 1; +} + +static struct mdinfo *imsm_activate_spare(struct active_array *a, + struct metadata_update **updates) +{ + /** + * Find a device with unused free space and use it to replace a + * failed/vacant region in an array. We replace failed regions one a + * array at a time. The result is that a new spare disk will be added + * to the first failed array and after the monitor has finished + * propagating failures the remainder will be consumed. + * + * FIXME add a capability for mdmon to request spares from another + * container. + */ + + struct intel_super *super = a->container->sb; + int inst = a->info.container_member; + struct imsm_dev *dev = get_imsm_dev(super, inst); + struct imsm_map *map = get_imsm_map(dev, MAP_0); + int failed = a->info.array.raid_disks; + struct mdinfo *rv = NULL; + struct mdinfo *d; + struct mdinfo *di; + struct metadata_update *mu; + struct dl *dl; + struct imsm_update_activate_spare *u; + int num_spares = 0; + int i; + int allowed; + + for (d = a->info.devs ; d ; d = d->next) { + if ((d->curr_state & DS_FAULTY) && + d->state_fd >= 0) + /* wait for Removal to happen */ + return NULL; + if (d->state_fd >= 0) + failed--; + } + + dprintf("imsm: activate spare: inst=%d failed=%d (%d) level=%d\n", + inst, failed, a->info.array.raid_disks, a->info.array.level); + + if (imsm_reshape_blocks_arrays_changes(super)) + return NULL; + + /* Cannot activate another spare if rebuild is in progress already + */ + if (is_rebuilding(dev)) { + dprintf("imsm: No spare activation allowed. Rebuild in progress already.\n"); + return NULL; + } + + if (a->info.array.level == 4) + /* No repair for takeovered array + * imsm doesn't support raid4 + */ + return NULL; + + if (imsm_check_degraded(super, dev, failed, MAP_0) != + IMSM_T_STATE_DEGRADED) + return NULL; + + /* + * If there are any failed disks check state of the other volume. + * Block rebuild if the another one is failed until failed disks + * are removed from container. + */ + if (failed) { + dprintf("found failed disks in %.*s, check if there anotherfailed sub-array.\n", + MAX_RAID_SERIAL_LEN, dev->volume); + /* check if states of the other volumes allow for rebuild */ + for (i = 0; i < super->anchor->num_raid_devs; i++) { + if (i != inst) { + allowed = imsm_rebuild_allowed(a->container, + i, failed); + if (!allowed) + return NULL; + } + } + } + + /* For each slot, if it is not working, find a spare */ + for (i = 0; i < a->info.array.raid_disks; i++) { + for (d = a->info.devs ; d ; d = d->next) + if (d->disk.raid_disk == i) + break; + dprintf("found %d: %p %x\n", i, d, d?d->curr_state:0); + if (d && (d->state_fd >= 0)) + continue; + + /* + * OK, this device needs recovery. Try to re-add the + * previous occupant of this slot, if this fails see if + * we can continue the assimilation of a spare that was + * partially assimilated, finally try to activate a new + * spare. + */ + dl = imsm_readd(super, i, a); + if (!dl) + dl = imsm_add_spare(super, i, a, 0, rv); + if (!dl) + dl = imsm_add_spare(super, i, a, 1, rv); + if (!dl) + continue; + + /* found a usable disk with enough space */ + di = xcalloc(1, sizeof(*di)); + + /* dl->index will be -1 in the case we are activating a + * pristine spare. imsm_process_update() will create a + * new index in this case. Once a disk is found to be + * failed in all member arrays it is kicked from the + * metadata + */ + di->disk.number = dl->index; + + /* (ab)use di->devs to store a pointer to the device + * we chose + */ + di->devs = (struct mdinfo *) dl; + + di->disk.raid_disk = i; + di->disk.major = dl->major; + di->disk.minor = dl->minor; + di->disk.state = 0; + di->recovery_start = 0; + di->data_offset = pba_of_lba0(map); + di->component_size = a->info.component_size; + di->container_member = inst; + super->random = random32(); + di->next = rv; + rv = di; + num_spares++; + dprintf("%x:%x to be %d at %llu\n", dl->major, dl->minor, + i, di->data_offset); + } + + if (!rv) + /* No spares found */ + return rv; + /* Now 'rv' has a list of devices to return. + * Create a metadata_update record to update the + * disk_ord_tbl for the array + */ + mu = xmalloc(sizeof(*mu)); + mu->buf = xcalloc(num_spares, + sizeof(struct imsm_update_activate_spare)); + mu->space = NULL; + mu->space_list = NULL; + mu->len = sizeof(struct imsm_update_activate_spare) * num_spares; + mu->next = *updates; + u = (struct imsm_update_activate_spare *) mu->buf; + + for (di = rv ; di ; di = di->next) { + u->type = update_activate_spare; + u->dl = (struct dl *) di->devs; + di->devs = NULL; + u->slot = di->disk.raid_disk; + u->array = inst; + u->next = u + 1; + u++; + } + (u-1)->next = NULL; + *updates = mu; + + return rv; +} + +static int disks_overlap(struct intel_super *super, int idx, struct imsm_update_create_array *u) +{ + struct imsm_dev *dev = get_imsm_dev(super, idx); + struct imsm_map *map = get_imsm_map(dev, MAP_0); + struct imsm_map *new_map = get_imsm_map(&u->dev, MAP_0); + struct disk_info *inf = get_disk_info(u); + struct imsm_disk *disk; + int i; + int j; + + for (i = 0; i < map->num_members; i++) { + disk = get_imsm_disk(super, get_imsm_disk_idx(dev, i, MAP_X)); + for (j = 0; j < new_map->num_members; j++) + if (serialcmp(disk->serial, inf[j].serial) == 0) + return 1; + } + + return 0; +} + +static struct dl *get_disk_super(struct intel_super *super, int major, int minor) +{ + struct dl *dl = NULL; + for (dl = super->disks; dl; dl = dl->next) + if ((dl->major == major) && (dl->minor == minor)) + return dl; + return NULL; +} + +static int remove_disk_super(struct intel_super *super, int major, int minor) +{ + struct dl *prev = NULL; + struct dl *dl; + + prev = NULL; + for (dl = super->disks; dl; dl = dl->next) { + if ((dl->major == major) && (dl->minor == minor)) { + /* remove */ + if (prev) + prev->next = dl->next; + else + super->disks = dl->next; + dl->next = NULL; + __free_imsm_disk(dl); + dprintf("removed %x:%x\n", major, minor); + break; + } + prev = dl; + } + return 0; +} + +static void imsm_delete(struct intel_super *super, struct dl **dlp, unsigned index); + +static int add_remove_disk_update(struct intel_super *super) +{ + int check_degraded = 0; + struct dl *disk = NULL; + /* add/remove some spares to/from the metadata/contrainer */ + while (super->disk_mgmt_list) { + struct dl *disk_cfg; + + disk_cfg = super->disk_mgmt_list; + super->disk_mgmt_list = disk_cfg->next; + disk_cfg->next = NULL; + + if (disk_cfg->action == DISK_ADD) { + disk_cfg->next = super->disks; + super->disks = disk_cfg; + check_degraded = 1; + dprintf("added %x:%x\n", + disk_cfg->major, disk_cfg->minor); + } else if (disk_cfg->action == DISK_REMOVE) { + dprintf("Disk remove action processed: %x.%x\n", + disk_cfg->major, disk_cfg->minor); + disk = get_disk_super(super, + disk_cfg->major, + disk_cfg->minor); + if (disk) { + /* store action status */ + disk->action = DISK_REMOVE; + /* remove spare disks only */ + if (disk->index == -1) { + remove_disk_super(super, + disk_cfg->major, + disk_cfg->minor); + } + } + /* release allocate disk structure */ + __free_imsm_disk(disk_cfg); + } + } + return check_degraded; +} + +static int apply_reshape_migration_update(struct imsm_update_reshape_migration *u, + struct intel_super *super, + void ***space_list) +{ + struct intel_dev *id; + void **tofree = NULL; + int ret_val = 0; + + dprintf("(enter)\n"); + if ((u->subdev < 0) || + (u->subdev > 1)) { + dprintf("imsm: Error: Wrong subdev: %i\n", u->subdev); + return ret_val; + } + if ((space_list == NULL) || (*space_list == NULL)) { + dprintf("imsm: Error: Memory is not allocated\n"); + return ret_val; + } + + for (id = super->devlist ; id; id = id->next) { + if (id->index == (unsigned)u->subdev) { + struct imsm_dev *dev = get_imsm_dev(super, u->subdev); + struct imsm_map *map; + struct imsm_dev *new_dev = + (struct imsm_dev *)*space_list; + struct imsm_map *migr_map = get_imsm_map(dev, MAP_1); + int to_state; + struct dl *new_disk; + + if (new_dev == NULL) + return ret_val; + *space_list = **space_list; + memcpy(new_dev, dev, sizeof_imsm_dev(dev, 0)); + map = get_imsm_map(new_dev, MAP_0); + if (migr_map) { + dprintf("imsm: Error: migration in progress"); + return ret_val; + } + + to_state = map->map_state; + if ((u->new_level == 5) && (map->raid_level == 0)) { + map->num_members++; + /* this should not happen */ + if (u->new_disks[0] < 0) { + map->failed_disk_num = + map->num_members - 1; + to_state = IMSM_T_STATE_DEGRADED; + } else + to_state = IMSM_T_STATE_NORMAL; + } + migrate(new_dev, super, to_state, MIGR_GEN_MIGR); + if (u->new_level > -1) + map->raid_level = u->new_level; + migr_map = get_imsm_map(new_dev, MAP_1); + if ((u->new_level == 5) && + (migr_map->raid_level == 0)) { + int ord = map->num_members - 1; + migr_map->num_members--; + if (u->new_disks[0] < 0) + ord |= IMSM_ORD_REBUILD; + set_imsm_ord_tbl_ent(map, + map->num_members - 1, + ord); + } + id->dev = new_dev; + tofree = (void **)dev; + + /* update chunk size + */ + if (u->new_chunksize > 0) + map->blocks_per_strip = + __cpu_to_le16(u->new_chunksize * 2); + + /* add disk + */ + if ((u->new_level != 5) || + (migr_map->raid_level != 0) || + (migr_map->raid_level == map->raid_level)) + goto skip_disk_add; + + if (u->new_disks[0] >= 0) { + /* use passes spare + */ + new_disk = get_disk_super(super, + major(u->new_disks[0]), + minor(u->new_disks[0])); + dprintf("imsm: new disk for reshape is: %i:%i (%p, index = %i)\n", + major(u->new_disks[0]), + minor(u->new_disks[0]), + new_disk, new_disk->index); + if (new_disk == NULL) + goto error_disk_add; + + new_disk->index = map->num_members - 1; + /* slot to fill in autolayout + */ + new_disk->raiddisk = new_disk->index; + new_disk->disk.status |= CONFIGURED_DISK; + new_disk->disk.status &= ~SPARE_DISK; + } else + goto error_disk_add; + +skip_disk_add: + *tofree = *space_list; + /* calculate new size + */ + imsm_set_array_size(new_dev, -1); + + ret_val = 1; + } + } + + if (tofree) + *space_list = tofree; + return ret_val; + +error_disk_add: + dprintf("Error: imsm: Cannot find disk.\n"); + return ret_val; +} + +static int apply_size_change_update(struct imsm_update_size_change *u, + struct intel_super *super) +{ + struct intel_dev *id; + int ret_val = 0; + + dprintf("(enter)\n"); + if ((u->subdev < 0) || + (u->subdev > 1)) { + dprintf("imsm: Error: Wrong subdev: %i\n", u->subdev); + return ret_val; + } + + for (id = super->devlist ; id; id = id->next) { + if (id->index == (unsigned)u->subdev) { + struct imsm_dev *dev = get_imsm_dev(super, u->subdev); + struct imsm_map *map = get_imsm_map(dev, MAP_0); + int used_disks = imsm_num_data_members(dev, MAP_0); + unsigned long long blocks_per_member; + + /* calculate new size + */ + blocks_per_member = u->new_size / used_disks; + dprintf("(size: %llu, blocks per member: %llu)\n", + u->new_size, blocks_per_member); + set_blocks_per_member(map, blocks_per_member); + imsm_set_array_size(dev, u->new_size); + + ret_val = 1; + break; + } + } + + return ret_val; +} + +static int apply_update_activate_spare(struct imsm_update_activate_spare *u, + struct intel_super *super, + struct active_array *active_array) +{ + struct imsm_super *mpb = super->anchor; + struct imsm_dev *dev = get_imsm_dev(super, u->array); + struct imsm_map *map = get_imsm_map(dev, MAP_0); + struct imsm_map *migr_map; + struct active_array *a; + struct imsm_disk *disk; + __u8 to_state; + struct dl *dl; + unsigned int found; + int failed; + int victim; + int i; + int second_map_created = 0; + + for (; u; u = u->next) { + victim = get_imsm_disk_idx(dev, u->slot, MAP_X); + + if (victim < 0) + return 0; + + for (dl = super->disks; dl; dl = dl->next) + if (dl == u->dl) + break; + + if (!dl) { + pr_err("error: imsm_activate_spare passed an unknown disk (index: %d)\n", + u->dl->index); + return 0; + } + + /* count failures (excluding rebuilds and the victim) + * to determine map[0] state + */ + failed = 0; + for (i = 0; i < map->num_members; i++) { + if (i == u->slot) + continue; + disk = get_imsm_disk(super, + get_imsm_disk_idx(dev, i, MAP_X)); + if (!disk || is_failed(disk)) + failed++; + } + + /* adding a pristine spare, assign a new index */ + if (dl->index < 0) { + dl->index = super->anchor->num_disks; + super->anchor->num_disks++; + } + disk = &dl->disk; + disk->status |= CONFIGURED_DISK; + disk->status &= ~SPARE_DISK; + + /* mark rebuild */ + to_state = imsm_check_degraded(super, dev, failed, MAP_0); + if (!second_map_created) { + second_map_created = 1; + map->map_state = IMSM_T_STATE_DEGRADED; + migrate(dev, super, to_state, MIGR_REBUILD); + } else + map->map_state = to_state; + migr_map = get_imsm_map(dev, MAP_1); + set_imsm_ord_tbl_ent(map, u->slot, dl->index); + set_imsm_ord_tbl_ent(migr_map, u->slot, + dl->index | IMSM_ORD_REBUILD); + + /* update the family_num to mark a new container + * generation, being careful to record the existing + * family_num in orig_family_num to clean up after + * earlier mdadm versions that neglected to set it. + */ + if (mpb->orig_family_num == 0) + mpb->orig_family_num = mpb->family_num; + mpb->family_num += super->random; + + /* count arrays using the victim in the metadata */ + found = 0; + for (a = active_array; a ; a = a->next) { + dev = get_imsm_dev(super, a->info.container_member); + map = get_imsm_map(dev, MAP_0); + + if (get_imsm_disk_slot(map, victim) >= 0) + found++; + } + + /* delete the victim if it is no longer being + * utilized anywhere + */ + if (!found) { + struct dl **dlp; + + /* We know that 'manager' isn't touching anything, + * so it is safe to delete + */ + for (dlp = &super->disks; *dlp; dlp = &(*dlp)->next) + if ((*dlp)->index == victim) + break; + + /* victim may be on the missing list */ + if (!*dlp) + for (dlp = &super->missing; *dlp; + dlp = &(*dlp)->next) + if ((*dlp)->index == victim) + break; + imsm_delete(super, dlp, victim); + } + } + + return 1; +} + +static int apply_reshape_container_disks_update(struct imsm_update_reshape *u, + struct intel_super *super, + void ***space_list) +{ + struct dl *new_disk; + struct intel_dev *id; + int i; + int delta_disks = u->new_raid_disks - u->old_raid_disks; + int disk_count = u->old_raid_disks; + void **tofree = NULL; + int devices_to_reshape = 1; + struct imsm_super *mpb = super->anchor; + int ret_val = 0; + unsigned int dev_id; + + dprintf("(enter)\n"); + + /* enable spares to use in array */ + for (i = 0; i < delta_disks; i++) { + new_disk = get_disk_super(super, + major(u->new_disks[i]), + minor(u->new_disks[i])); + dprintf("imsm: new disk for reshape is: %i:%i (%p, index = %i)\n", + major(u->new_disks[i]), minor(u->new_disks[i]), + new_disk, new_disk->index); + if ((new_disk == NULL) || + ((new_disk->index >= 0) && + (new_disk->index < u->old_raid_disks))) + goto update_reshape_exit; + new_disk->index = disk_count++; + /* slot to fill in autolayout + */ + new_disk->raiddisk = new_disk->index; + new_disk->disk.status |= + CONFIGURED_DISK; + new_disk->disk.status &= ~SPARE_DISK; + } + + dprintf("imsm: volume set mpb->num_raid_devs = %i\n", + mpb->num_raid_devs); + /* manage changes in volume + */ + for (dev_id = 0; dev_id < mpb->num_raid_devs; dev_id++) { + void **sp = *space_list; + struct imsm_dev *newdev; + struct imsm_map *newmap, *oldmap; + + for (id = super->devlist ; id; id = id->next) { + if (id->index == dev_id) + break; + } + if (id == NULL) + break; + if (!sp) + continue; + *space_list = *sp; + newdev = (void*)sp; + /* Copy the dev, but not (all of) the map */ + memcpy(newdev, id->dev, sizeof(*newdev)); + oldmap = get_imsm_map(id->dev, MAP_0); + newmap = get_imsm_map(newdev, MAP_0); + /* Copy the current map */ + memcpy(newmap, oldmap, sizeof_imsm_map(oldmap)); + /* update one device only + */ + if (devices_to_reshape) { + dprintf("imsm: modifying subdev: %i\n", + id->index); + devices_to_reshape--; + newdev->vol.migr_state = 1; + newdev->vol.curr_migr_unit = 0; + set_migr_type(newdev, MIGR_GEN_MIGR); + newmap->num_members = u->new_raid_disks; + for (i = 0; i < delta_disks; i++) { + set_imsm_ord_tbl_ent(newmap, + u->old_raid_disks + i, + u->old_raid_disks + i); + } + /* New map is correct, now need to save old map + */ + newmap = get_imsm_map(newdev, MAP_1); + memcpy(newmap, oldmap, sizeof_imsm_map(oldmap)); + + imsm_set_array_size(newdev, -1); + } + + sp = (void **)id->dev; + id->dev = newdev; + *sp = tofree; + tofree = sp; + + /* Clear migration record */ + memset(super->migr_rec, 0, sizeof(struct migr_record)); + } + if (tofree) + *space_list = tofree; + ret_val = 1; + +update_reshape_exit: + + return ret_val; +} + +static int apply_takeover_update(struct imsm_update_takeover *u, + struct intel_super *super, + void ***space_list) +{ + struct imsm_dev *dev = NULL; + struct intel_dev *dv; + struct imsm_dev *dev_new; + struct imsm_map *map; + struct dl *dm, *du; + int i; + + for (dv = super->devlist; dv; dv = dv->next) + if (dv->index == (unsigned int)u->subarray) { + dev = dv->dev; + break; + } + + if (dev == NULL) + return 0; + + map = get_imsm_map(dev, MAP_0); + + if (u->direction == R10_TO_R0) { + /* Number of failed disks must be half of initial disk number */ + if (imsm_count_failed(super, dev, MAP_0) != + (map->num_members / 2)) + return 0; + + /* iterate through devices to mark removed disks as spare */ + for (dm = super->disks; dm; dm = dm->next) { + if (dm->disk.status & FAILED_DISK) { + int idx = dm->index; + /* update indexes on the disk list */ +/* FIXME this loop-with-the-loop looks wrong, I'm not convinced + the index values will end up being correct.... NB */ + for (du = super->disks; du; du = du->next) + if (du->index > idx) + du->index--; + /* mark as spare disk */ + mark_spare(dm); + } + } + /* update map */ + map->num_members = map->num_members / 2; + map->map_state = IMSM_T_STATE_NORMAL; + map->num_domains = 1; + map->raid_level = 0; + map->failed_disk_num = -1; + } + + if (u->direction == R0_TO_R10) { + void **space; + /* update slots in current disk list */ + for (dm = super->disks; dm; dm = dm->next) { + if (dm->index >= 0) + dm->index *= 2; + } + /* create new *missing* disks */ + for (i = 0; i < map->num_members; i++) { + space = *space_list; + if (!space) + continue; + *space_list = *space; + du = (void *)space; + memcpy(du, super->disks, sizeof(*du)); + du->fd = -1; + du->minor = 0; + du->major = 0; + du->index = (i * 2) + 1; + sprintf((char *)du->disk.serial, + " MISSING_%d", du->index); + sprintf((char *)du->serial, + "MISSING_%d", du->index); + du->next = super->missing; + super->missing = du; + } + /* create new dev and map */ + space = *space_list; + if (!space) + return 0; + *space_list = *space; + dev_new = (void *)space; + memcpy(dev_new, dev, sizeof(*dev)); + /* update new map */ + map = get_imsm_map(dev_new, MAP_0); + map->num_members = map->num_members * 2; + map->map_state = IMSM_T_STATE_DEGRADED; + map->num_domains = 2; + map->raid_level = 1; + /* replace dev<->dev_new */ + dv->dev = dev_new; + } + /* update disk order table */ + for (du = super->disks; du; du = du->next) + if (du->index >= 0) + set_imsm_ord_tbl_ent(map, du->index, du->index); + for (du = super->missing; du; du = du->next) + if (du->index >= 0) { + set_imsm_ord_tbl_ent(map, du->index, du->index); + mark_missing(dv->dev, &du->disk, du->index); + } + + return 1; +} + +static void imsm_process_update(struct supertype *st, + struct metadata_update *update) +{ + /** + * crack open the metadata_update envelope to find the update record + * update can be one of: + * update_reshape_container_disks - all the arrays in the container + * are being reshaped to have more devices. We need to mark + * the arrays for general migration and convert selected spares + * into active devices. + * update_activate_spare - a spare device has replaced a failed + * device in an array, update the disk_ord_tbl. If this disk is + * present in all member arrays then also clear the SPARE_DISK + * flag + * update_create_array + * update_kill_array + * update_rename_array + * update_add_remove_disk + */ + struct intel_super *super = st->sb; + struct imsm_super *mpb; + enum imsm_update_type type = *(enum imsm_update_type *) update->buf; + + /* update requires a larger buf but the allocation failed */ + if (super->next_len && !super->next_buf) { + super->next_len = 0; + return; + } + + if (super->next_buf) { + memcpy(super->next_buf, super->buf, super->len); + free(super->buf); + super->len = super->next_len; + super->buf = super->next_buf; + + super->next_len = 0; + super->next_buf = NULL; + } + + mpb = super->anchor; + + switch (type) { + case update_general_migration_checkpoint: { + struct intel_dev *id; + struct imsm_update_general_migration_checkpoint *u = + (void *)update->buf; + + dprintf("called for update_general_migration_checkpoint\n"); + + /* find device under general migration */ + for (id = super->devlist ; id; id = id->next) { + if (is_gen_migration(id->dev)) { + id->dev->vol.curr_migr_unit = + __cpu_to_le32(u->curr_migr_unit); + super->updates_pending++; + } + } + break; + } + case update_takeover: { + struct imsm_update_takeover *u = (void *)update->buf; + if (apply_takeover_update(u, super, &update->space_list)) { + imsm_update_version_info(super); + super->updates_pending++; + } + break; + } + + case update_reshape_container_disks: { + struct imsm_update_reshape *u = (void *)update->buf; + if (apply_reshape_container_disks_update( + u, super, &update->space_list)) + super->updates_pending++; + break; + } + case update_reshape_migration: { + struct imsm_update_reshape_migration *u = (void *)update->buf; + if (apply_reshape_migration_update( + u, super, &update->space_list)) + super->updates_pending++; + break; + } + case update_size_change: { + struct imsm_update_size_change *u = (void *)update->buf; + if (apply_size_change_update(u, super)) + super->updates_pending++; + break; + } + case update_activate_spare: { + struct imsm_update_activate_spare *u = (void *) update->buf; + if (apply_update_activate_spare(u, super, st->arrays)) + super->updates_pending++; + break; + } + case update_create_array: { + /* someone wants to create a new array, we need to be aware of + * a few races/collisions: + * 1/ 'Create' called by two separate instances of mdadm + * 2/ 'Create' versus 'activate_spare': mdadm has chosen + * devices that have since been assimilated via + * activate_spare. + * In the event this update can not be carried out mdadm will + * (FIX ME) notice that its update did not take hold. + */ + struct imsm_update_create_array *u = (void *) update->buf; + struct intel_dev *dv; + struct imsm_dev *dev; + struct imsm_map *map, *new_map; + unsigned long long start, end; + unsigned long long new_start, new_end; + int i; + struct disk_info *inf; + struct dl *dl; + + /* handle racing creates: first come first serve */ + if (u->dev_idx < mpb->num_raid_devs) { + dprintf("subarray %d already defined\n", u->dev_idx); + goto create_error; + } + + /* check update is next in sequence */ + if (u->dev_idx != mpb->num_raid_devs) { + dprintf("can not create array %d expected index %d\n", + u->dev_idx, mpb->num_raid_devs); + goto create_error; + } + + new_map = get_imsm_map(&u->dev, MAP_0); + new_start = pba_of_lba0(new_map); + new_end = new_start + blocks_per_member(new_map); + inf = get_disk_info(u); + + /* handle activate_spare versus create race: + * check to make sure that overlapping arrays do not include + * overalpping disks + */ + for (i = 0; i < mpb->num_raid_devs; i++) { + dev = get_imsm_dev(super, i); + map = get_imsm_map(dev, MAP_0); + start = pba_of_lba0(map); + end = start + blocks_per_member(map); + if ((new_start >= start && new_start <= end) || + (start >= new_start && start <= new_end)) + /* overlap */; + else + continue; + + if (disks_overlap(super, i, u)) { + dprintf("arrays overlap\n"); + goto create_error; + } + } + + /* check that prepare update was successful */ + if (!update->space) { + dprintf("prepare update failed\n"); + goto create_error; + } + + /* check that all disks are still active before committing + * changes. FIXME: could we instead handle this by creating a + * degraded array? That's probably not what the user expects, + * so better to drop this update on the floor. + */ + for (i = 0; i < new_map->num_members; i++) { + dl = serial_to_dl(inf[i].serial, super); + if (!dl) { + dprintf("disk disappeared\n"); + goto create_error; + } + } + + super->updates_pending++; + + /* convert spares to members and fixup ord_tbl */ + for (i = 0; i < new_map->num_members; i++) { + dl = serial_to_dl(inf[i].serial, super); + if (dl->index == -1) { + dl->index = mpb->num_disks; + mpb->num_disks++; + dl->disk.status |= CONFIGURED_DISK; + dl->disk.status &= ~SPARE_DISK; + } + set_imsm_ord_tbl_ent(new_map, i, dl->index); + } + + dv = update->space; + dev = dv->dev; + update->space = NULL; + imsm_copy_dev(dev, &u->dev); + dv->index = u->dev_idx; + dv->next = super->devlist; + super->devlist = dv; + mpb->num_raid_devs++; + + imsm_update_version_info(super); + break; + create_error: + /* mdmon knows how to release update->space, but not + * ((struct intel_dev *) update->space)->dev + */ + if (update->space) { + dv = update->space; + free(dv->dev); + } + break; + } + case update_kill_array: { + struct imsm_update_kill_array *u = (void *) update->buf; + int victim = u->dev_idx; + struct active_array *a; + struct intel_dev **dp; + struct imsm_dev *dev; + + /* sanity check that we are not affecting the uuid of + * active arrays, or deleting an active array + * + * FIXME when immutable ids are available, but note that + * we'll also need to fixup the invalidated/active + * subarray indexes in mdstat + */ + for (a = st->arrays; a; a = a->next) + if (a->info.container_member >= victim) + break; + /* by definition if mdmon is running at least one array + * is active in the container, so checking + * mpb->num_raid_devs is just extra paranoia + */ + dev = get_imsm_dev(super, victim); + if (a || !dev || mpb->num_raid_devs == 1) { + dprintf("failed to delete subarray-%d\n", victim); + break; + } + + for (dp = &super->devlist; *dp;) + if ((*dp)->index == (unsigned)super->current_vol) { + *dp = (*dp)->next; + } else { + if ((*dp)->index > (unsigned)victim) + (*dp)->index--; + dp = &(*dp)->next; + } + mpb->num_raid_devs--; + super->updates_pending++; + break; + } + case update_rename_array: { + struct imsm_update_rename_array *u = (void *) update->buf; + char name[MAX_RAID_SERIAL_LEN+1]; + int target = u->dev_idx; + struct active_array *a; + struct imsm_dev *dev; + + /* sanity check that we are not affecting the uuid of + * an active array + */ + snprintf(name, MAX_RAID_SERIAL_LEN, "%s", (char *) u->name); + name[MAX_RAID_SERIAL_LEN] = '\0'; + for (a = st->arrays; a; a = a->next) + if (a->info.container_member == target) + break; + dev = get_imsm_dev(super, u->dev_idx); + if (a || !dev || !check_name(super, name, 1)) { + dprintf("failed to rename subarray-%d\n", target); + break; + } + + snprintf((char *) dev->volume, MAX_RAID_SERIAL_LEN, "%s", name); + super->updates_pending++; + break; + } + case update_add_remove_disk: { + /* we may be able to repair some arrays if disks are + * being added, check the status of add_remove_disk + * if discs has been added. + */ + if (add_remove_disk_update(super)) { + struct active_array *a; + + super->updates_pending++; + for (a = st->arrays; a; a = a->next) + a->check_degraded = 1; + } + break; + } + default: + pr_err("error: unsuported process update type:(type: %d)\n", type); + } +} + +static struct mdinfo *get_spares_for_grow(struct supertype *st); + +static int imsm_prepare_update(struct supertype *st, + struct metadata_update *update) +{ + /** + * Allocate space to hold new disk entries, raid-device entries or a new + * mpb if necessary. The manager synchronously waits for updates to + * complete in the monitor, so new mpb buffers allocated here can be + * integrated by the monitor thread without worrying about live pointers + * in the manager thread. + */ + enum imsm_update_type type; + struct intel_super *super = st->sb; + struct imsm_super *mpb = super->anchor; + size_t buf_len; + size_t len = 0; + + if (update->len < (int)sizeof(type)) + return 0; + + type = *(enum imsm_update_type *) update->buf; + + switch (type) { + case update_general_migration_checkpoint: + if (update->len < (int)sizeof(struct imsm_update_general_migration_checkpoint)) + return 0; + dprintf("called for update_general_migration_checkpoint\n"); + break; + case update_takeover: { + struct imsm_update_takeover *u = (void *)update->buf; + if (update->len < (int)sizeof(*u)) + return 0; + if (u->direction == R0_TO_R10) { + void **tail = (void **)&update->space_list; + struct imsm_dev *dev = get_imsm_dev(super, u->subarray); + struct imsm_map *map = get_imsm_map(dev, MAP_0); + int num_members = map->num_members; + void *space; + int size, i; + /* allocate memory for added disks */ + for (i = 0; i < num_members; i++) { + size = sizeof(struct dl); + space = xmalloc(size); + *tail = space; + tail = space; + *tail = NULL; + } + /* allocate memory for new device */ + size = sizeof_imsm_dev(super->devlist->dev, 0) + + (num_members * sizeof(__u32)); + space = xmalloc(size); + *tail = space; + tail = space; + *tail = NULL; + len = disks_to_mpb_size(num_members * 2); + } + + break; + } + case update_reshape_container_disks: { + /* Every raid device in the container is about to + * gain some more devices, and we will enter a + * reconfiguration. + * So each 'imsm_map' will be bigger, and the imsm_vol + * will now hold 2 of them. + * Thus we need new 'struct imsm_dev' allocations sized + * as sizeof_imsm_dev but with more devices in both maps. + */ + struct imsm_update_reshape *u = (void *)update->buf; + struct intel_dev *dl; + void **space_tail = (void**)&update->space_list; + + if (update->len < (int)sizeof(*u)) + return 0; + + dprintf("for update_reshape\n"); + + for (dl = super->devlist; dl; dl = dl->next) { + int size = sizeof_imsm_dev(dl->dev, 1); + void *s; + if (u->new_raid_disks > u->old_raid_disks) + size += sizeof(__u32)*2* + (u->new_raid_disks - u->old_raid_disks); + s = xmalloc(size); + *space_tail = s; + space_tail = s; + *space_tail = NULL; + } + + len = disks_to_mpb_size(u->new_raid_disks); + dprintf("New anchor length is %llu\n", (unsigned long long)len); + break; + } + case update_reshape_migration: { + /* for migration level 0->5 we need to add disks + * so the same as for container operation we will copy + * device to the bigger location. + * in memory prepared device and new disk area are prepared + * for usage in process update + */ + struct imsm_update_reshape_migration *u = (void *)update->buf; + struct intel_dev *id; + void **space_tail = (void **)&update->space_list; + int size; + void *s; + int current_level = -1; + + if (update->len < (int)sizeof(*u)) + return 0; + + dprintf("for update_reshape\n"); + + /* add space for bigger array in update + */ + for (id = super->devlist; id; id = id->next) { + if (id->index == (unsigned)u->subdev) { + size = sizeof_imsm_dev(id->dev, 1); + if (u->new_raid_disks > u->old_raid_disks) + size += sizeof(__u32)*2* + (u->new_raid_disks - u->old_raid_disks); + s = xmalloc(size); + *space_tail = s; + space_tail = s; + *space_tail = NULL; + break; + } + } + if (update->space_list == NULL) + break; + + /* add space for disk in update + */ + size = sizeof(struct dl); + s = xmalloc(size); + *space_tail = s; + space_tail = s; + *space_tail = NULL; + + /* add spare device to update + */ + for (id = super->devlist ; id; id = id->next) + if (id->index == (unsigned)u->subdev) { + struct imsm_dev *dev; + struct imsm_map *map; + + dev = get_imsm_dev(super, u->subdev); + map = get_imsm_map(dev, MAP_0); + current_level = map->raid_level; + break; + } + if ((u->new_level == 5) && (u->new_level != current_level)) { + struct mdinfo *spares; + + spares = get_spares_for_grow(st); + if (spares) { + struct dl *dl; + struct mdinfo *dev; + + dev = spares->devs; + if (dev) { + u->new_disks[0] = + makedev(dev->disk.major, + dev->disk.minor); + dl = get_disk_super(super, + dev->disk.major, + dev->disk.minor); + dl->index = u->old_raid_disks; + dev = dev->next; + } + sysfs_free(spares); + } + } + len = disks_to_mpb_size(u->new_raid_disks); + dprintf("New anchor length is %llu\n", (unsigned long long)len); + break; + } + case update_size_change: { + if (update->len < (int)sizeof(struct imsm_update_size_change)) + return 0; + break; + } + case update_activate_spare: { + if (update->len < (int)sizeof(struct imsm_update_activate_spare)) + return 0; + break; + } + case update_create_array: { + struct imsm_update_create_array *u = (void *) update->buf; + struct intel_dev *dv; + struct imsm_dev *dev = &u->dev; + struct imsm_map *map = get_imsm_map(dev, MAP_0); + struct dl *dl; + struct disk_info *inf; + int i; + int activate = 0; + + if (update->len < (int)sizeof(*u)) + return 0; + + inf = get_disk_info(u); + len = sizeof_imsm_dev(dev, 1); + /* allocate a new super->devlist entry */ + dv = xmalloc(sizeof(*dv)); + dv->dev = xmalloc(len); + update->space = dv; + + /* count how many spares will be converted to members */ + for (i = 0; i < map->num_members; i++) { + dl = serial_to_dl(inf[i].serial, super); + if (!dl) { + /* hmm maybe it failed?, nothing we can do about + * it here + */ + continue; + } + if (count_memberships(dl, super) == 0) + activate++; + } + len += activate * sizeof(struct imsm_disk); + break; + } + case update_kill_array: { + if (update->len < (int)sizeof(struct imsm_update_kill_array)) + return 0; + break; + } + case update_rename_array: { + if (update->len < (int)sizeof(struct imsm_update_rename_array)) + return 0; + break; + } + case update_add_remove_disk: + /* no update->len needed */ + break; + default: + return 0; + } + + /* check if we need a larger metadata buffer */ + if (super->next_buf) + buf_len = super->next_len; + else + buf_len = super->len; + + if (__le32_to_cpu(mpb->mpb_size) + len > buf_len) { + /* ok we need a larger buf than what is currently allocated + * if this allocation fails process_update will notice that + * ->next_len is set and ->next_buf is NULL + */ + buf_len = ROUND_UP(__le32_to_cpu(mpb->mpb_size) + len, 512); + if (super->next_buf) + free(super->next_buf); + + super->next_len = buf_len; + if (posix_memalign(&super->next_buf, 512, buf_len) == 0) + memset(super->next_buf, 0, buf_len); + else + super->next_buf = NULL; + } + return 1; +} + +/* must be called while manager is quiesced */ +static void imsm_delete(struct intel_super *super, struct dl **dlp, unsigned index) +{ + struct imsm_super *mpb = super->anchor; + struct dl *iter; + struct imsm_dev *dev; + struct imsm_map *map; + int i, j, num_members; + __u32 ord; + + dprintf("deleting device[%d] from imsm_super\n", index); + + /* shift all indexes down one */ + for (iter = super->disks; iter; iter = iter->next) + if (iter->index > (int)index) + iter->index--; + for (iter = super->missing; iter; iter = iter->next) + if (iter->index > (int)index) + iter->index--; + + for (i = 0; i < mpb->num_raid_devs; i++) { + dev = get_imsm_dev(super, i); + map = get_imsm_map(dev, MAP_0); + num_members = map->num_members; + for (j = 0; j < num_members; j++) { + /* update ord entries being careful not to propagate + * ord-flags to the first map + */ + ord = get_imsm_ord_tbl_ent(dev, j, MAP_X); + + if (ord_to_idx(ord) <= index) + continue; + + map = get_imsm_map(dev, MAP_0); + set_imsm_ord_tbl_ent(map, j, ord_to_idx(ord - 1)); + map = get_imsm_map(dev, MAP_1); + if (map) + set_imsm_ord_tbl_ent(map, j, ord - 1); + } + } + + mpb->num_disks--; + super->updates_pending++; + if (*dlp) { + struct dl *dl = *dlp; + + *dlp = (*dlp)->next; + __free_imsm_disk(dl); + } +} +#endif /* MDASSEMBLE */ + +static void close_targets(int *targets, int new_disks) +{ + int i; + + if (!targets) + return; + + for (i = 0; i < new_disks; i++) { + if (targets[i] >= 0) { + close(targets[i]); + targets[i] = -1; + } + } +} + +static int imsm_get_allowed_degradation(int level, int raid_disks, + struct intel_super *super, + struct imsm_dev *dev) +{ + switch (level) { + case 1: + case 10:{ + int ret_val = 0; + struct imsm_map *map; + int i; + + ret_val = raid_disks/2; + /* check map if all disks pairs not failed + * in both maps + */ + map = get_imsm_map(dev, MAP_0); + for (i = 0; i < ret_val; i++) { + int degradation = 0; + if (get_imsm_disk(super, i) == NULL) + degradation++; + if (get_imsm_disk(super, i + 1) == NULL) + degradation++; + if (degradation == 2) + return 0; + } + map = get_imsm_map(dev, MAP_1); + /* if there is no second map + * result can be returned + */ + if (map == NULL) + return ret_val; + /* check degradation in second map + */ + for (i = 0; i < ret_val; i++) { + int degradation = 0; + if (get_imsm_disk(super, i) == NULL) + degradation++; + if (get_imsm_disk(super, i + 1) == NULL) + degradation++; + if (degradation == 2) + return 0; + } + return ret_val; + } + case 5: + return 1; + case 6: + return 2; + default: + return 0; + } +} + +/******************************************************************************* + * Function: open_backup_targets + * Description: Function opens file descriptors for all devices given in + * info->devs + * Parameters: + * info : general array info + * raid_disks : number of disks + * raid_fds : table of device's file descriptors + * super : intel super for raid10 degradation check + * dev : intel device for raid10 degradation check + * Returns: + * 0 : success + * -1 : fail + ******************************************************************************/ +int open_backup_targets(struct mdinfo *info, int raid_disks, int *raid_fds, + struct intel_super *super, struct imsm_dev *dev) +{ + struct mdinfo *sd; + int i; + int opened = 0; + + for (i = 0; i < raid_disks; i++) + raid_fds[i] = -1; + + for (sd = info->devs ; sd ; sd = sd->next) { + char *dn; + + if (sd->disk.state & (1<<MD_DISK_FAULTY)) { + dprintf("disk is faulty!!\n"); + continue; + } + + if ((sd->disk.raid_disk >= raid_disks) || + (sd->disk.raid_disk < 0)) + continue; + + dn = map_dev(sd->disk.major, + sd->disk.minor, 1); + raid_fds[sd->disk.raid_disk] = dev_open(dn, O_RDWR); + if (raid_fds[sd->disk.raid_disk] < 0) { + pr_err("cannot open component\n"); + continue; + } + opened++; + } + /* check if maximum array degradation level is not exceeded + */ + if ((raid_disks - opened) > + imsm_get_allowed_degradation(info->new_level, + raid_disks, + super, dev)) { + pr_err("Not enough disks can be opened.\n"); + close_targets(raid_fds, raid_disks); + return -2; + } + return 0; +} + +/******************************************************************************* + * Function: validate_container_imsm + * Description: This routine validates container after assemble, + * eg. if devices in container are under the same controller. + * + * Parameters: + * info : linked list with info about devices used in array + * Returns: + * 1 : HBA mismatch + * 0 : Success + ******************************************************************************/ +int validate_container_imsm(struct mdinfo *info) +{ + if (check_env("IMSM_NO_PLATFORM")) + return 0; + + struct sys_dev *idev; + struct sys_dev *hba = NULL; + struct sys_dev *intel_devices = find_intel_devices(); + char *dev_path = devt_to_devpath(makedev(info->disk.major, + info->disk.minor)); + + for (idev = intel_devices; idev; idev = idev->next) { + if (dev_path && strstr(dev_path, idev->path)) { + hba = idev; + break; + } + } + if (dev_path) + free(dev_path); + + if (!hba) { + pr_err("WARNING - Cannot detect HBA for device %s!\n", + devid2kname(makedev(info->disk.major, info->disk.minor))); + return 1; + } + + const struct imsm_orom *orom = get_orom_by_device_id(hba->dev_id); + struct mdinfo *dev; + + for (dev = info->next; dev; dev = dev->next) { + dev_path = devt_to_devpath(makedev(dev->disk.major, dev->disk.minor)); + + struct sys_dev *hba2 = NULL; + for (idev = intel_devices; idev; idev = idev->next) { + if (dev_path && strstr(dev_path, idev->path)) { + hba2 = idev; + break; + } + } + if (dev_path) + free(dev_path); + + const struct imsm_orom *orom2 = hba2 == NULL ? NULL : + get_orom_by_device_id(hba2->dev_id); + + if (hba2 && hba->type != hba2->type) { + pr_err("WARNING - HBAs of devices do not match %s != %s\n", + get_sys_dev_type(hba->type), get_sys_dev_type(hba2->type)); + return 1; + } + + if (orom != orom2) { + pr_err("WARNING - IMSM container assembled with disks under different HBAs!\n" + " This operation is not supported and can lead to data loss.\n"); + return 1; + } + + if (!orom) { + pr_err("WARNING - IMSM container assembled with disks under HBAs without IMSM platform support!\n" + " This operation is not supported and can lead to data loss.\n"); + return 1; + } + } + + return 0; +} +#ifndef MDASSEMBLE +/******************************************************************************* + * Function: init_migr_record_imsm + * Description: Function inits imsm migration record + * Parameters: + * super : imsm internal array info + * dev : device under migration + * info : general array info to find the smallest device + * Returns: + * none + ******************************************************************************/ +void init_migr_record_imsm(struct supertype *st, struct imsm_dev *dev, + struct mdinfo *info) +{ + struct intel_super *super = st->sb; + struct migr_record *migr_rec = super->migr_rec; + int new_data_disks; + unsigned long long dsize, dev_sectors; + long long unsigned min_dev_sectors = -1LLU; + struct mdinfo *sd; + char nm[30]; + int fd; + struct imsm_map *map_dest = get_imsm_map(dev, MAP_0); + struct imsm_map *map_src = get_imsm_map(dev, MAP_1); + unsigned long long num_migr_units; + unsigned long long array_blocks; + + memset(migr_rec, 0, sizeof(struct migr_record)); + migr_rec->family_num = __cpu_to_le32(super->anchor->family_num); + + /* only ascending reshape supported now */ + migr_rec->ascending_migr = __cpu_to_le32(1); + + migr_rec->dest_depth_per_unit = GEN_MIGR_AREA_SIZE / + max(map_dest->blocks_per_strip, map_src->blocks_per_strip); + migr_rec->dest_depth_per_unit *= + max(map_dest->blocks_per_strip, map_src->blocks_per_strip); + new_data_disks = imsm_num_data_members(dev, MAP_0); + migr_rec->blocks_per_unit = + __cpu_to_le32(migr_rec->dest_depth_per_unit * new_data_disks); + migr_rec->dest_depth_per_unit = + __cpu_to_le32(migr_rec->dest_depth_per_unit); + array_blocks = info->component_size * new_data_disks; + num_migr_units = + array_blocks / __le32_to_cpu(migr_rec->blocks_per_unit); + + if (array_blocks % __le32_to_cpu(migr_rec->blocks_per_unit)) + num_migr_units++; + migr_rec->num_migr_units = __cpu_to_le32(num_migr_units); + + migr_rec->post_migr_vol_cap = dev->size_low; + migr_rec->post_migr_vol_cap_hi = dev->size_high; + + /* Find the smallest dev */ + for (sd = info->devs ; sd ; sd = sd->next) { + sprintf(nm, "%d:%d", sd->disk.major, sd->disk.minor); + fd = dev_open(nm, O_RDONLY); + if (fd < 0) + continue; + get_dev_size(fd, NULL, &dsize); + dev_sectors = dsize / 512; + if (dev_sectors < min_dev_sectors) + min_dev_sectors = dev_sectors; + close(fd); + } + migr_rec->ckpt_area_pba = __cpu_to_le32(min_dev_sectors - + RAID_DISK_RESERVED_BLOCKS_IMSM_HI); + + write_imsm_migr_rec(st); + + return; +} + +/******************************************************************************* + * Function: save_backup_imsm + * Description: Function saves critical data stripes to Migration Copy Area + * and updates the current migration unit status. + * Use restore_stripes() to form a destination stripe, + * and to write it to the Copy Area. + * Parameters: + * st : supertype information + * dev : imsm device that backup is saved for + * info : general array info + * buf : input buffer + * length : length of data to backup (blocks_per_unit) + * Returns: + * 0 : success + *, -1 : fail + ******************************************************************************/ +int save_backup_imsm(struct supertype *st, + struct imsm_dev *dev, + struct mdinfo *info, + void *buf, + int length) +{ + int rv = -1; + struct intel_super *super = st->sb; + unsigned long long *target_offsets = NULL; + int *targets = NULL; + int i; + struct imsm_map *map_dest = get_imsm_map(dev, MAP_0); + int new_disks = map_dest->num_members; + int dest_layout = 0; + int dest_chunk; + unsigned long long start; + int data_disks = imsm_num_data_members(dev, MAP_0); + + targets = xmalloc(new_disks * sizeof(int)); + + for (i = 0; i < new_disks; i++) + targets[i] = -1; + + target_offsets = xcalloc(new_disks, sizeof(unsigned long long)); + + start = info->reshape_progress * 512; + for (i = 0; i < new_disks; i++) { + target_offsets[i] = (unsigned long long) + __le32_to_cpu(super->migr_rec->ckpt_area_pba) * 512; + /* move back copy area adderss, it will be moved forward + * in restore_stripes() using start input variable + */ + target_offsets[i] -= start/data_disks; + } + + if (open_backup_targets(info, new_disks, targets, + super, dev)) + goto abort; + + dest_layout = imsm_level_to_layout(map_dest->raid_level); + dest_chunk = __le16_to_cpu(map_dest->blocks_per_strip) * 512; + + if (restore_stripes(targets, /* list of dest devices */ + target_offsets, /* migration record offsets */ + new_disks, + dest_chunk, + map_dest->raid_level, + dest_layout, + -1, /* source backup file descriptor */ + 0, /* input buf offset + * always 0 buf is already offseted */ + start, + length, + buf) != 0) { + pr_err("Error restoring stripes\n"); + goto abort; + } + + rv = 0; + +abort: + if (targets) { + close_targets(targets, new_disks); + free(targets); + } + free(target_offsets); + + return rv; +} + +/******************************************************************************* + * Function: save_checkpoint_imsm + * Description: Function called for current unit status update + * in the migration record. It writes it to disk. + * Parameters: + * super : imsm internal array info + * info : general array info + * Returns: + * 0: success + * 1: failure + * 2: failure, means no valid migration record + * / no general migration in progress / + ******************************************************************************/ +int save_checkpoint_imsm(struct supertype *st, struct mdinfo *info, int state) +{ + struct intel_super *super = st->sb; + unsigned long long blocks_per_unit; + unsigned long long curr_migr_unit; + + if (load_imsm_migr_rec(super, info) != 0) { + dprintf("imsm: ERROR: Cannot read migration record for checkpoint save.\n"); + return 1; + } + + blocks_per_unit = __le32_to_cpu(super->migr_rec->blocks_per_unit); + if (blocks_per_unit == 0) { + dprintf("imsm: no migration in progress.\n"); + return 2; + } + curr_migr_unit = info->reshape_progress / blocks_per_unit; + /* check if array is alligned to copy area + * if it is not alligned, add one to current migration unit value + * this can happend on array reshape finish only + */ + if (info->reshape_progress % blocks_per_unit) + curr_migr_unit++; + + super->migr_rec->curr_migr_unit = + __cpu_to_le32(curr_migr_unit); + super->migr_rec->rec_status = __cpu_to_le32(state); + super->migr_rec->dest_1st_member_lba = + __cpu_to_le32(curr_migr_unit * + __le32_to_cpu(super->migr_rec->dest_depth_per_unit)); + if (write_imsm_migr_rec(st) < 0) { + dprintf("imsm: Cannot write migration record outside backup area\n"); + return 1; + } + + return 0; +} + +/******************************************************************************* + * Function: recover_backup_imsm + * Description: Function recovers critical data from the Migration Copy Area + * while assembling an array. + * Parameters: + * super : imsm internal array info + * info : general array info + * Returns: + * 0 : success (or there is no data to recover) + * 1 : fail + ******************************************************************************/ +int recover_backup_imsm(struct supertype *st, struct mdinfo *info) +{ + struct intel_super *super = st->sb; + struct migr_record *migr_rec = super->migr_rec; + struct imsm_map *map_dest = NULL; + struct intel_dev *id = NULL; + unsigned long long read_offset; + unsigned long long write_offset; + unsigned unit_len; + int *targets = NULL; + int new_disks, i, err; + char *buf = NULL; + int retval = 1; + unsigned long curr_migr_unit = __le32_to_cpu(migr_rec->curr_migr_unit); + unsigned long num_migr_units = __le32_to_cpu(migr_rec->num_migr_units); + char buffer[20]; + int skipped_disks = 0; + + err = sysfs_get_str(info, NULL, "array_state", (char *)buffer, 20); + if (err < 1) + return 1; + + /* recover data only during assemblation */ + if (strncmp(buffer, "inactive", 8) != 0) + return 0; + /* no data to recover */ + if (__le32_to_cpu(migr_rec->rec_status) == UNIT_SRC_NORMAL) + return 0; + if (curr_migr_unit >= num_migr_units) + return 1; + + /* find device during reshape */ + for (id = super->devlist; id; id = id->next) + if (is_gen_migration(id->dev)) + break; + if (id == NULL) + return 1; + + map_dest = get_imsm_map(id->dev, MAP_0); + new_disks = map_dest->num_members; + + read_offset = (unsigned long long) + __le32_to_cpu(migr_rec->ckpt_area_pba) * 512; + + write_offset = ((unsigned long long) + __le32_to_cpu(migr_rec->dest_1st_member_lba) + + pba_of_lba0(map_dest)) * 512; + + unit_len = __le32_to_cpu(migr_rec->dest_depth_per_unit) * 512; + if (posix_memalign((void **)&buf, 512, unit_len) != 0) + goto abort; + targets = xcalloc(new_disks, sizeof(int)); + + if (open_backup_targets(info, new_disks, targets, super, id->dev)) { + pr_err("Cannot open some devices belonging to array.\n"); + goto abort; + } + + for (i = 0; i < new_disks; i++) { + if (targets[i] < 0) { + skipped_disks++; + continue; + } + if (lseek64(targets[i], read_offset, SEEK_SET) < 0) { + pr_err("Cannot seek to block: %s\n", + strerror(errno)); + skipped_disks++; + continue; + } + if ((unsigned)read(targets[i], buf, unit_len) != unit_len) { + pr_err("Cannot read copy area block: %s\n", + strerror(errno)); + skipped_disks++; + continue; + } + if (lseek64(targets[i], write_offset, SEEK_SET) < 0) { + pr_err("Cannot seek to block: %s\n", + strerror(errno)); + skipped_disks++; + continue; + } + if ((unsigned)write(targets[i], buf, unit_len) != unit_len) { + pr_err("Cannot restore block: %s\n", + strerror(errno)); + skipped_disks++; + continue; + } + } + + if (skipped_disks > imsm_get_allowed_degradation(info->new_level, + new_disks, + super, + id->dev)) { + pr_err("Cannot restore data from backup. Too many failed disks\n"); + goto abort; + } + + if (save_checkpoint_imsm(st, info, UNIT_SRC_NORMAL)) { + /* ignore error == 2, this can mean end of reshape here + */ + dprintf("imsm: Cannot write checkpoint to migration record (UNIT_SRC_NORMAL) during restart\n"); + } else + retval = 0; + +abort: + if (targets) { + for (i = 0; i < new_disks; i++) + if (targets[i]) + close(targets[i]); + free(targets); + } + free(buf); + return retval; +} + +static char disk_by_path[] = "/dev/disk/by-path/"; + +static const char *imsm_get_disk_controller_domain(const char *path) +{ + char disk_path[PATH_MAX]; + char *drv=NULL; + struct stat st; + + strcpy(disk_path, disk_by_path); + strncat(disk_path, path, PATH_MAX - strlen(disk_path) - 1); + if (stat(disk_path, &st) == 0) { + struct sys_dev* hba; + char *path=NULL; + + path = devt_to_devpath(st.st_rdev); + if (path == NULL) + return "unknown"; + hba = find_disk_attached_hba(-1, path); + if (hba && hba->type == SYS_DEV_SAS) + drv = "isci"; + else if (hba && hba->type == SYS_DEV_SATA) + drv = "ahci"; + else + drv = "unknown"; + dprintf("path: %s hba: %s attached: %s\n", + path, (hba) ? hba->path : "NULL", drv); + free(path); + } + return drv; +} + +static char *imsm_find_array_devnm_by_subdev(int subdev, char *container) +{ + static char devnm[32]; + char subdev_name[20]; + struct mdstat_ent *mdstat; + + sprintf(subdev_name, "%d", subdev); + mdstat = mdstat_by_subdev(subdev_name, container); + if (!mdstat) + return NULL; + + strcpy(devnm, mdstat->devnm); + free_mdstat(mdstat); + return devnm; +} + +static int imsm_reshape_is_allowed_on_container(struct supertype *st, + struct geo_params *geo, + int *old_raid_disks, + int direction) +{ + /* currently we only support increasing the number of devices + * for a container. This increases the number of device for each + * member array. They must all be RAID0 or RAID5. + */ + int ret_val = 0; + struct mdinfo *info, *member; + int devices_that_can_grow = 0; + + dprintf("imsm: imsm_reshape_is_allowed_on_container(ENTER): st->devnm = (%s)\n", st->devnm); + + if (geo->size > 0 || + geo->level != UnSet || + geo->layout != UnSet || + geo->chunksize != 0 || + geo->raid_disks == UnSet) { + dprintf("imsm: Container operation is allowed for raid disks number change only.\n"); + return ret_val; + } + + if (direction == ROLLBACK_METADATA_CHANGES) { + dprintf("imsm: Metadata changes rollback is not supported for container operation.\n"); + return ret_val; + } + + info = container_content_imsm(st, NULL); + for (member = info; member; member = member->next) { + char *result; + + dprintf("imsm: checking device_num: %i\n", + member->container_member); + + if (geo->raid_disks <= member->array.raid_disks) { + /* we work on container for Online Capacity Expansion + * only so raid_disks has to grow + */ + dprintf("imsm: for container operation raid disks increase is required\n"); + break; + } + + if ((info->array.level != 0) && + (info->array.level != 5)) { + /* we cannot use this container with other raid level + */ + dprintf("imsm: for container operation wrong raid level (%i) detected\n", + info->array.level); + break; + } else { + /* check for platform support + * for this raid level configuration + */ + struct intel_super *super = st->sb; + if (!is_raid_level_supported(super->orom, + member->array.level, + geo->raid_disks)) { + dprintf("platform does not support raid%d with %d disk%s\n", + info->array.level, + geo->raid_disks, + geo->raid_disks > 1 ? "s" : ""); + break; + } + /* check if component size is aligned to chunk size + */ + if (info->component_size % + (info->array.chunk_size/512)) { + dprintf("Component size is not aligned to chunk size\n"); + break; + } + } + + if (*old_raid_disks && + info->array.raid_disks != *old_raid_disks) + break; + *old_raid_disks = info->array.raid_disks; + + /* All raid5 and raid0 volumes in container + * have to be ready for Online Capacity Expansion + * so they need to be assembled. We have already + * checked that no recovery etc is happening. + */ + result = imsm_find_array_devnm_by_subdev(member->container_member, + st->container_devnm); + if (result == NULL) { + dprintf("imsm: cannot find array\n"); + break; + } + devices_that_can_grow++; + } + sysfs_free(info); + if (!member && devices_that_can_grow) + ret_val = 1; + + if (ret_val) + dprintf("Container operation allowed\n"); + else + dprintf("Error: %i\n", ret_val); + + return ret_val; +} + +/* Function: get_spares_for_grow + * Description: Allocates memory and creates list of spare devices + * avaliable in container. Checks if spare drive size is acceptable. + * Parameters: Pointer to the supertype structure + * Returns: Pointer to the list of spare devices (mdinfo structure) on success, + * NULL if fail + */ +static struct mdinfo *get_spares_for_grow(struct supertype *st) +{ + unsigned long long min_size = min_acceptable_spare_size_imsm(st); + return container_choose_spares(st, min_size, NULL, NULL, NULL, 0); +} + +/****************************************************************************** + * function: imsm_create_metadata_update_for_reshape + * Function creates update for whole IMSM container. + * + ******************************************************************************/ +static int imsm_create_metadata_update_for_reshape( + struct supertype *st, + struct geo_params *geo, + int old_raid_disks, + struct imsm_update_reshape **updatep) +{ + struct intel_super *super = st->sb; + struct imsm_super *mpb = super->anchor; + int update_memory_size = 0; + struct imsm_update_reshape *u = NULL; + struct mdinfo *spares = NULL; + int i; + int delta_disks = 0; + struct mdinfo *dev; + + dprintf("(enter) raid_disks = %i\n", geo->raid_disks); + + delta_disks = geo->raid_disks - old_raid_disks; + + /* size of all update data without anchor */ + update_memory_size = sizeof(struct imsm_update_reshape); + + /* now add space for spare disks that we need to add. */ + update_memory_size += sizeof(u->new_disks[0]) * (delta_disks - 1); + + u = xcalloc(1, update_memory_size); + u->type = update_reshape_container_disks; + u->old_raid_disks = old_raid_disks; + u->new_raid_disks = geo->raid_disks; + + /* now get spare disks list + */ + spares = get_spares_for_grow(st); + + if (spares == NULL + || delta_disks > spares->array.spare_disks) { + pr_err("imsm: ERROR: Cannot get spare devices for %s.\n", geo->dev_name); + i = -1; + goto abort; + } + + /* we have got spares + * update disk list in imsm_disk list table in anchor + */ + dprintf("imsm: %i spares are available.\n\n", + spares->array.spare_disks); + + dev = spares->devs; + for (i = 0; i < delta_disks; i++) { + struct dl *dl; + + if (dev == NULL) + break; + u->new_disks[i] = makedev(dev->disk.major, + dev->disk.minor); + dl = get_disk_super(super, dev->disk.major, dev->disk.minor); + dl->index = mpb->num_disks; + mpb->num_disks++; + dev = dev->next; + } + +abort: + /* free spares + */ + sysfs_free(spares); + + dprintf("imsm: reshape update preparation :"); + if (i == delta_disks) { + dprintf_cont(" OK\n"); + *updatep = u; + return update_memory_size; + } + free(u); + dprintf_cont(" Error\n"); + + return 0; +} + +/****************************************************************************** + * function: imsm_create_metadata_update_for_size_change() + * Creates update for IMSM array for array size change. + * + ******************************************************************************/ +static int imsm_create_metadata_update_for_size_change( + struct supertype *st, + struct geo_params *geo, + struct imsm_update_size_change **updatep) +{ + struct intel_super *super = st->sb; + int update_memory_size = 0; + struct imsm_update_size_change *u = NULL; + + dprintf("(enter) New size = %llu\n", geo->size); + + /* size of all update data without anchor */ + update_memory_size = sizeof(struct imsm_update_size_change); + + u = xcalloc(1, update_memory_size); + u->type = update_size_change; + u->subdev = super->current_vol; + u->new_size = geo->size; + + dprintf("imsm: reshape update preparation : OK\n"); + *updatep = u; + + return update_memory_size; +} + +/****************************************************************************** + * function: imsm_create_metadata_update_for_migration() + * Creates update for IMSM array. + * + ******************************************************************************/ +static int imsm_create_metadata_update_for_migration( + struct supertype *st, + struct geo_params *geo, + struct imsm_update_reshape_migration **updatep) +{ + struct intel_super *super = st->sb; + int update_memory_size = 0; + struct imsm_update_reshape_migration *u = NULL; + struct imsm_dev *dev; + int previous_level = -1; + + dprintf("(enter) New Level = %i\n", geo->level); + + /* size of all update data without anchor */ + update_memory_size = sizeof(struct imsm_update_reshape_migration); + + u = xcalloc(1, update_memory_size); + u->type = update_reshape_migration; + u->subdev = super->current_vol; + u->new_level = geo->level; + u->new_layout = geo->layout; + u->new_raid_disks = u->old_raid_disks = geo->raid_disks; + u->new_disks[0] = -1; + u->new_chunksize = -1; + + dev = get_imsm_dev(super, u->subdev); + if (dev) { + struct imsm_map *map; + + map = get_imsm_map(dev, MAP_0); + if (map) { + int current_chunk_size = + __le16_to_cpu(map->blocks_per_strip) / 2; + + if (geo->chunksize != current_chunk_size) { + u->new_chunksize = geo->chunksize / 1024; + dprintf("imsm: chunk size change from %i to %i\n", + current_chunk_size, u->new_chunksize); + } + previous_level = map->raid_level; + } + } + if ((geo->level == 5) && (previous_level == 0)) { + struct mdinfo *spares = NULL; + + u->new_raid_disks++; + spares = get_spares_for_grow(st); + if ((spares == NULL) || (spares->array.spare_disks < 1)) { + free(u); + sysfs_free(spares); + update_memory_size = 0; + dprintf("error: cannot get spare device for requested migration"); + return 0; + } + sysfs_free(spares); + } + dprintf("imsm: reshape update preparation : OK\n"); + *updatep = u; + + return update_memory_size; +} + +static void imsm_update_metadata_locally(struct supertype *st, + void *buf, int len) +{ + struct metadata_update mu; + + mu.buf = buf; + mu.len = len; + mu.space = NULL; + mu.space_list = NULL; + mu.next = NULL; + if (imsm_prepare_update(st, &mu)) + imsm_process_update(st, &mu); + + while (mu.space_list) { + void **space = mu.space_list; + mu.space_list = *space; + free(space); + } +} + +/*************************************************************************** +* Function: imsm_analyze_change +* Description: Function analyze change for single volume +* and validate if transition is supported +* Parameters: Geometry parameters, supertype structure, +* metadata change direction (apply/rollback) +* Returns: Operation type code on success, -1 if fail +****************************************************************************/ +enum imsm_reshape_type imsm_analyze_change(struct supertype *st, + struct geo_params *geo, + int direction) +{ + struct mdinfo info; + int change = -1; + int check_devs = 0; + int chunk; + /* number of added/removed disks in operation result */ + int devNumChange = 0; + /* imsm compatible layout value for array geometry verification */ + int imsm_layout = -1; + int data_disks; + struct imsm_dev *dev; + struct intel_super *super; + unsigned long long current_size; + unsigned long long free_size; + unsigned long long max_size; + int rv; + + getinfo_super_imsm_volume(st, &info, NULL); + if ((geo->level != info.array.level) && + (geo->level >= 0) && + (geo->level != UnSet)) { + switch (info.array.level) { + case 0: + if (geo->level == 5) { + change = CH_MIGRATION; + if (geo->layout != ALGORITHM_LEFT_ASYMMETRIC) { + pr_err("Error. Requested Layout not supported (left-asymmetric layout is supported only)!\n"); + change = -1; + goto analyse_change_exit; + } + imsm_layout = geo->layout; + check_devs = 1; + devNumChange = 1; /* parity disk added */ + } else if (geo->level == 10) { + change = CH_TAKEOVER; + check_devs = 1; + devNumChange = 2; /* two mirrors added */ + imsm_layout = 0x102; /* imsm supported layout */ + } + break; + case 1: + case 10: + if (geo->level == 0) { + change = CH_TAKEOVER; + check_devs = 1; + devNumChange = -(geo->raid_disks/2); + imsm_layout = 0; /* imsm raid0 layout */ + } + break; + } + if (change == -1) { + pr_err("Error. Level Migration from %d to %d not supported!\n", + info.array.level, geo->level); + goto analyse_change_exit; + } + } else + geo->level = info.array.level; + + if ((geo->layout != info.array.layout) + && ((geo->layout != UnSet) && (geo->layout != -1))) { + change = CH_MIGRATION; + if ((info.array.layout == 0) + && (info.array.level == 5) + && (geo->layout == 5)) { + /* reshape 5 -> 4 */ + } else if ((info.array.layout == 5) + && (info.array.level == 5) + && (geo->layout == 0)) { + /* reshape 4 -> 5 */ + geo->layout = 0; + geo->level = 5; + } else { + pr_err("Error. Layout Migration from %d to %d not supported!\n", + info.array.layout, geo->layout); + change = -1; + goto analyse_change_exit; + } + } else { + geo->layout = info.array.layout; + if (imsm_layout == -1) + imsm_layout = info.array.layout; + } + + if ((geo->chunksize > 0) && (geo->chunksize != UnSet) + && (geo->chunksize != info.array.chunk_size)) + change = CH_MIGRATION; + else + geo->chunksize = info.array.chunk_size; + + chunk = geo->chunksize / 1024; + + super = st->sb; + dev = get_imsm_dev(super, super->current_vol); + data_disks = imsm_num_data_members(dev , MAP_0); + /* compute current size per disk member + */ + current_size = info.custom_array_size / data_disks; + + if ((geo->size > 0) && (geo->size != MAX_SIZE)) { + /* align component size + */ + geo->size = imsm_component_size_aligment_check( + get_imsm_raid_level(dev->vol.map), + chunk * 1024, + geo->size * 2); + if (geo->size == 0) { + pr_err("Error. Size expansion is supported only (current size is %llu, requested size /rounded/ is 0).\n", + current_size); + goto analyse_change_exit; + } + } + + if ((current_size != geo->size) && (geo->size > 0)) { + if (change != -1) { + pr_err("Error. Size change should be the only one at a time.\n"); + change = -1; + goto analyse_change_exit; + } + if ((super->current_vol + 1) != super->anchor->num_raid_devs) { + pr_err("Error. The last volume in container can be expanded only (%i/%s).\n", + super->current_vol, st->devnm); + goto analyse_change_exit; + } + /* check the maximum available size + */ + rv = imsm_get_free_size(st, dev->vol.map->num_members, + 0, chunk, &free_size); + if (rv == 0) + /* Cannot find maximum available space + */ + max_size = 0; + else { + max_size = free_size + current_size; + /* align component size + */ + max_size = imsm_component_size_aligment_check( + get_imsm_raid_level(dev->vol.map), + chunk * 1024, + max_size); + } + if (geo->size == MAX_SIZE) { + /* requested size change to the maximum available size + */ + if (max_size == 0) { + pr_err("Error. Cannot find maximum available space.\n"); + change = -1; + goto analyse_change_exit; + } else + geo->size = max_size; + } + + if ((direction == ROLLBACK_METADATA_CHANGES)) { + /* accept size for rollback only + */ + } else { + /* round size due to metadata compatibility + */ + geo->size = (geo->size >> SECT_PER_MB_SHIFT) + << SECT_PER_MB_SHIFT; + dprintf("Prepare update for size change to %llu\n", + geo->size ); + if (current_size >= geo->size) { + pr_err("Error. Size expansion is supported only (current size is %llu, requested size /rounded/ is %llu).\n", + current_size, geo->size); + goto analyse_change_exit; + } + if (max_size && geo->size > max_size) { + pr_err("Error. Requested size is larger than maximum available size (maximum available size is %llu, requested size /rounded/ is %llu).\n", + max_size, geo->size); + goto analyse_change_exit; + } + } + geo->size *= data_disks; + geo->raid_disks = dev->vol.map->num_members; + change = CH_ARRAY_SIZE; + } + if (!validate_geometry_imsm(st, + geo->level, + imsm_layout, + geo->raid_disks + devNumChange, + &chunk, + geo->size, INVALID_SECTORS, + 0, 0, 1)) + change = -1; + + if (check_devs) { + struct intel_super *super = st->sb; + struct imsm_super *mpb = super->anchor; + + if (mpb->num_raid_devs > 1) { + pr_err("Error. Cannot perform operation on %s- for this operation it MUST be single array in container\n", + geo->dev_name); + change = -1; + } + } + +analyse_change_exit: + if ((direction == ROLLBACK_METADATA_CHANGES) && + ((change == CH_MIGRATION) || (change == CH_TAKEOVER))) { + dprintf("imsm: Metadata changes rollback is not supported for migration and takeover operations.\n"); + change = -1; + } + return change; +} + +int imsm_takeover(struct supertype *st, struct geo_params *geo) +{ + struct intel_super *super = st->sb; + struct imsm_update_takeover *u; + + u = xmalloc(sizeof(struct imsm_update_takeover)); + + u->type = update_takeover; + u->subarray = super->current_vol; + + /* 10->0 transition */ + if (geo->level == 0) + u->direction = R10_TO_R0; + + /* 0->10 transition */ + if (geo->level == 10) + u->direction = R0_TO_R10; + + /* update metadata locally */ + imsm_update_metadata_locally(st, u, + sizeof(struct imsm_update_takeover)); + /* and possibly remotely */ + if (st->update_tail) + append_metadata_update(st, u, + sizeof(struct imsm_update_takeover)); + else + free(u); + + return 0; +} + +static int imsm_reshape_super(struct supertype *st, unsigned long long size, + int level, + int layout, int chunksize, int raid_disks, + int delta_disks, char *backup, char *dev, + int direction, int verbose) +{ + int ret_val = 1; + struct geo_params geo; + + dprintf("(enter)\n"); + + memset(&geo, 0, sizeof(struct geo_params)); + + geo.dev_name = dev; + strcpy(geo.devnm, st->devnm); + geo.size = size; + geo.level = level; + geo.layout = layout; + geo.chunksize = chunksize; + geo.raid_disks = raid_disks; + if (delta_disks != UnSet) + geo.raid_disks += delta_disks; + + dprintf("for level : %i\n", geo.level); + dprintf("for raid_disks : %i\n", geo.raid_disks); + + if (experimental() == 0) + return ret_val; + + if (strcmp(st->container_devnm, st->devnm) == 0) { + /* On container level we can only increase number of devices. */ + dprintf("imsm: info: Container operation\n"); + int old_raid_disks = 0; + + if (imsm_reshape_is_allowed_on_container( + st, &geo, &old_raid_disks, direction)) { + struct imsm_update_reshape *u = NULL; + int len; + + len = imsm_create_metadata_update_for_reshape( + st, &geo, old_raid_disks, &u); + + if (len <= 0) { + dprintf("imsm: Cannot prepare update\n"); + goto exit_imsm_reshape_super; + } + + ret_val = 0; + /* update metadata locally */ + imsm_update_metadata_locally(st, u, len); + /* and possibly remotely */ + if (st->update_tail) + append_metadata_update(st, u, len); + else + free(u); + + } else { + pr_err("(imsm) Operation is not allowed on this container\n"); + } + } else { + /* On volume level we support following operations + * - takeover: raid10 -> raid0; raid0 -> raid10 + * - chunk size migration + * - migration: raid5 -> raid0; raid0 -> raid5 + */ + struct intel_super *super = st->sb; + struct intel_dev *dev = super->devlist; + int change; + dprintf("imsm: info: Volume operation\n"); + /* find requested device */ + while (dev) { + char *devnm = + imsm_find_array_devnm_by_subdev( + dev->index, st->container_devnm); + if (devnm && strcmp(devnm, geo.devnm) == 0) + break; + dev = dev->next; + } + if (dev == NULL) { + pr_err("Cannot find %s (%s) subarray\n", + geo.dev_name, geo.devnm); + goto exit_imsm_reshape_super; + } + super->current_vol = dev->index; + change = imsm_analyze_change(st, &geo, direction); + switch (change) { + case CH_TAKEOVER: + ret_val = imsm_takeover(st, &geo); + break; + case CH_MIGRATION: { + struct imsm_update_reshape_migration *u = NULL; + int len = + imsm_create_metadata_update_for_migration( + st, &geo, &u); + if (len < 1) { + dprintf("imsm: Cannot prepare update\n"); + break; + } + ret_val = 0; + /* update metadata locally */ + imsm_update_metadata_locally(st, u, len); + /* and possibly remotely */ + if (st->update_tail) + append_metadata_update(st, u, len); + else + free(u); + } + break; + case CH_ARRAY_SIZE: { + struct imsm_update_size_change *u = NULL; + int len = + imsm_create_metadata_update_for_size_change( + st, &geo, &u); + if (len < 1) { + dprintf("imsm: Cannot prepare update\n"); + break; + } + ret_val = 0; + /* update metadata locally */ + imsm_update_metadata_locally(st, u, len); + /* and possibly remotely */ + if (st->update_tail) + append_metadata_update(st, u, len); + else + free(u); + } + break; + default: + ret_val = 1; + } + } + +exit_imsm_reshape_super: + dprintf("imsm: reshape_super Exit code = %i\n", ret_val); + return ret_val; +} + +/******************************************************************************* + * Function: wait_for_reshape_imsm + * Description: Function writes new sync_max value and waits until + * reshape process reach new position + * Parameters: + * sra : general array info + * ndata : number of disks in new array's layout + * Returns: + * 0 : success, + * 1 : there is no reshape in progress, + * -1 : fail + ******************************************************************************/ +int wait_for_reshape_imsm(struct mdinfo *sra, int ndata) +{ + int fd = sysfs_get_fd(sra, NULL, "sync_completed"); + unsigned long long completed; + /* to_complete : new sync_max position */ + unsigned long long to_complete = sra->reshape_progress; + unsigned long long position_to_set = to_complete / ndata; + + if (fd < 0) { + dprintf("cannot open reshape_position\n"); + return 1; + } + + if (sysfs_fd_get_ll(fd, &completed) < 0) { + dprintf("cannot read reshape_position (no reshape in progres)\n"); + close(fd); + return 0; + } + + if (completed > position_to_set) { + dprintf("wrong next position to set %llu (%llu)\n", + to_complete, position_to_set); + close(fd); + return -1; + } + dprintf("Position set: %llu\n", position_to_set); + if (sysfs_set_num(sra, NULL, "sync_max", + position_to_set) != 0) { + dprintf("cannot set reshape position to %llu\n", + position_to_set); + close(fd); + return -1; + } + + do { + char action[20]; + sysfs_wait(fd, NULL); + if (sysfs_get_str(sra, NULL, "sync_action", + action, 20) > 0 && + strncmp(action, "reshape", 7) != 0) + break; + if (sysfs_fd_get_ll(fd, &completed) < 0) { + dprintf("cannot read reshape_position (in loop)\n"); + close(fd); + return 1; + } + } while (completed < position_to_set); + close(fd); + return 0; + +} + +/******************************************************************************* + * Function: check_degradation_change + * Description: Check that array hasn't become failed. + * Parameters: + * info : for sysfs access + * sources : source disks descriptors + * degraded: previous degradation level + * Returns: + * degradation level + ******************************************************************************/ +int check_degradation_change(struct mdinfo *info, + int *sources, + int degraded) +{ + unsigned long long new_degraded; + int rv; + + rv = sysfs_get_ll(info, NULL, "degraded", &new_degraded); + if ((rv == -1) || (new_degraded != (unsigned long long)degraded)) { + /* check each device to ensure it is still working */ + struct mdinfo *sd; + new_degraded = 0; + for (sd = info->devs ; sd ; sd = sd->next) { + if (sd->disk.state & (1<<MD_DISK_FAULTY)) + continue; + if (sd->disk.state & (1<<MD_DISK_SYNC)) { + char sbuf[20]; + if (sysfs_get_str(info, + sd, "state", sbuf, 20) < 0 || + strstr(sbuf, "faulty") || + strstr(sbuf, "in_sync") == NULL) { + /* this device is dead */ + sd->disk.state = (1<<MD_DISK_FAULTY); + if (sd->disk.raid_disk >= 0 && + sources[sd->disk.raid_disk] >= 0) { + close(sources[ + sd->disk.raid_disk]); + sources[sd->disk.raid_disk] = + -1; + } + new_degraded++; + } + } + } + } + + return new_degraded; +} + +/******************************************************************************* + * Function: imsm_manage_reshape + * Description: Function finds array under reshape and it manages reshape + * process. It creates stripes backups (if required) and sets + * checheckpoits. + * Parameters: + * afd : Backup handle (nattive) - not used + * sra : general array info + * reshape : reshape parameters - not used + * st : supertype structure + * blocks : size of critical section [blocks] + * fds : table of source device descriptor + * offsets : start of array (offest per devices) + * dests : not used + * destfd : table of destination device descriptor + * destoffsets : table of destination offsets (per device) + * Returns: + * 1 : success, reshape is done + * 0 : fail + ******************************************************************************/ +static int imsm_manage_reshape( + int afd, struct mdinfo *sra, struct reshape *reshape, + struct supertype *st, unsigned long backup_blocks, + int *fds, unsigned long long *offsets, + int dests, int *destfd, unsigned long long *destoffsets) +{ + int ret_val = 0; + struct intel_super *super = st->sb; + struct intel_dev *dv = NULL; + struct imsm_dev *dev = NULL; + struct imsm_map *map_src; + int migr_vol_qan = 0; + int ndata, odata; /* [bytes] */ + int chunk; /* [bytes] */ + struct migr_record *migr_rec; + char *buf = NULL; + unsigned int buf_size; /* [bytes] */ + unsigned long long max_position; /* array size [bytes] */ + unsigned long long next_step; /* [blocks]/[bytes] */ + unsigned long long old_data_stripe_length; + unsigned long long start_src; /* [bytes] */ + unsigned long long start; /* [bytes] */ + unsigned long long start_buf_shift; /* [bytes] */ + int degraded = 0; + int source_layout = 0; + + if (!fds || !offsets || !sra) + goto abort; + + /* Find volume during the reshape */ + for (dv = super->devlist; dv; dv = dv->next) { + if (dv->dev->vol.migr_type == MIGR_GEN_MIGR + && dv->dev->vol.migr_state == 1) { + dev = dv->dev; + migr_vol_qan++; + } + } + /* Only one volume can migrate at the same time */ + if (migr_vol_qan != 1) { + pr_err(": %s", migr_vol_qan ? + "Number of migrating volumes greater than 1\n" : + "There is no volume during migrationg\n"); + goto abort; + } + + map_src = get_imsm_map(dev, MAP_1); + if (map_src == NULL) + goto abort; + + ndata = imsm_num_data_members(dev, MAP_0); + odata = imsm_num_data_members(dev, MAP_1); + + chunk = __le16_to_cpu(map_src->blocks_per_strip) * 512; + old_data_stripe_length = odata * chunk; + + migr_rec = super->migr_rec; + + /* initialize migration record for start condition */ + if (sra->reshape_progress == 0) + init_migr_record_imsm(st, dev, sra); + else { + if (__le32_to_cpu(migr_rec->rec_status) != UNIT_SRC_NORMAL) { + dprintf("imsm: cannot restart migration when data are present in copy area.\n"); + goto abort; + } + /* Save checkpoint to update migration record for current + * reshape position (in md). It can be farther than current + * reshape position in metadata. + */ + if (save_checkpoint_imsm(st, sra, UNIT_SRC_NORMAL) == 1) { + /* ignore error == 2, this can mean end of reshape here + */ + dprintf("imsm: Cannot write checkpoint to migration record (UNIT_SRC_NORMAL, initial save)\n"); + goto abort; + } + } + + /* size for data */ + buf_size = __le32_to_cpu(migr_rec->blocks_per_unit) * 512; + /* extend buffer size for parity disk */ + buf_size += __le32_to_cpu(migr_rec->dest_depth_per_unit) * 512; + /* add space for stripe aligment */ + buf_size += old_data_stripe_length; + if (posix_memalign((void **)&buf, 4096, buf_size)) { + dprintf("imsm: Cannot allocate checpoint buffer\n"); + goto abort; + } + + max_position = sra->component_size * ndata; + source_layout = imsm_level_to_layout(map_src->raid_level); + + while (__le32_to_cpu(migr_rec->curr_migr_unit) < + __le32_to_cpu(migr_rec->num_migr_units)) { + /* current reshape position [blocks] */ + unsigned long long current_position = + __le32_to_cpu(migr_rec->blocks_per_unit) + * __le32_to_cpu(migr_rec->curr_migr_unit); + unsigned long long border; + + /* Check that array hasn't become failed. + */ + degraded = check_degradation_change(sra, fds, degraded); + if (degraded > 1) { + dprintf("imsm: Abort reshape due to degradation level (%i)\n", degraded); + goto abort; + } + + next_step = __le32_to_cpu(migr_rec->blocks_per_unit); + + if ((current_position + next_step) > max_position) + next_step = max_position - current_position; + + start = current_position * 512; + + /* allign reading start to old geometry */ + start_buf_shift = start % old_data_stripe_length; + start_src = start - start_buf_shift; + + border = (start_src / odata) - (start / ndata); + border /= 512; + if (border <= __le32_to_cpu(migr_rec->dest_depth_per_unit)) { + /* save critical stripes to buf + * start - start address of current unit + * to backup [bytes] + * start_src - start address of current unit + * to backup alligned to source array + * [bytes] + */ + unsigned long long next_step_filler = 0; + unsigned long long copy_length = next_step * 512; + + /* allign copy area length to stripe in old geometry */ + next_step_filler = ((copy_length + start_buf_shift) + % old_data_stripe_length); + if (next_step_filler) + next_step_filler = (old_data_stripe_length + - next_step_filler); + dprintf("save_stripes() parameters: start = %llu,\tstart_src = %llu,\tnext_step*512 = %llu,\tstart_in_buf_shift = %llu,\tnext_step_filler = %llu\n", + start, start_src, copy_length, + start_buf_shift, next_step_filler); + + if (save_stripes(fds, offsets, map_src->num_members, + chunk, map_src->raid_level, + source_layout, 0, NULL, start_src, + copy_length + + next_step_filler + start_buf_shift, + buf)) { + dprintf("imsm: Cannot save stripes to buffer\n"); + goto abort; + } + /* Convert data to destination format and store it + * in backup general migration area + */ + if (save_backup_imsm(st, dev, sra, + buf + start_buf_shift, copy_length)) { + dprintf("imsm: Cannot save stripes to target devices\n"); + goto abort; + } + if (save_checkpoint_imsm(st, sra, + UNIT_SRC_IN_CP_AREA)) { + dprintf("imsm: Cannot write checkpoint to migration record (UNIT_SRC_IN_CP_AREA)\n"); + goto abort; + } + } else { + /* set next step to use whole border area */ + border /= next_step; + if (border > 1) + next_step *= border; + } + /* When data backed up, checkpoint stored, + * kick the kernel to reshape unit of data + */ + next_step = next_step + sra->reshape_progress; + /* limit next step to array max position */ + if (next_step > max_position) + next_step = max_position; + sysfs_set_num(sra, NULL, "suspend_lo", sra->reshape_progress); + sysfs_set_num(sra, NULL, "suspend_hi", next_step); + sra->reshape_progress = next_step; + + /* wait until reshape finish */ + if (wait_for_reshape_imsm(sra, ndata) < 0) { + dprintf("wait_for_reshape_imsm returned error!\n"); + goto abort; + } + if (sigterm) + goto abort; + + if (save_checkpoint_imsm(st, sra, UNIT_SRC_NORMAL) == 1) { + /* ignore error == 2, this can mean end of reshape here + */ + dprintf("imsm: Cannot write checkpoint to migration record (UNIT_SRC_NORMAL)\n"); + goto abort; + } + + } + + /* clear migr_rec on disks after successful migration */ + struct dl *d; + + memset(super->migr_rec_buf, 0, MIGR_REC_BUF_SIZE); + for (d = super->disks; d; d = d->next) { + if (d->index < 0 || is_failed(&d->disk)) + continue; + unsigned long long dsize; + + get_dev_size(d->fd, NULL, &dsize); + if (lseek64(d->fd, dsize - MIGR_REC_POSITION, + SEEK_SET) >= 0) { + if (write(d->fd, super->migr_rec_buf, + MIGR_REC_BUF_SIZE) != MIGR_REC_BUF_SIZE) + perror("Write migr_rec failed"); + } + } + + /* return '1' if done */ + ret_val = 1; +abort: + free(buf); + abort_reshape(sra); + + return ret_val; +} + +#endif /* MDASSEMBLE */ + +struct superswitch super_imsm = { +#ifndef MDASSEMBLE + .examine_super = examine_super_imsm, + .brief_examine_super = brief_examine_super_imsm, + .brief_examine_subarrays = brief_examine_subarrays_imsm, + .export_examine_super = export_examine_super_imsm, + .detail_super = detail_super_imsm, + .brief_detail_super = brief_detail_super_imsm, + .write_init_super = write_init_super_imsm, + .validate_geometry = validate_geometry_imsm, + .add_to_super = add_to_super_imsm, + .remove_from_super = remove_from_super_imsm, + .detail_platform = detail_platform_imsm, + .export_detail_platform = export_detail_platform_imsm, + .kill_subarray = kill_subarray_imsm, + .update_subarray = update_subarray_imsm, + .load_container = load_container_imsm, + .default_geometry = default_geometry_imsm, + .get_disk_controller_domain = imsm_get_disk_controller_domain, + .reshape_super = imsm_reshape_super, + .manage_reshape = imsm_manage_reshape, + .recover_backup = recover_backup_imsm, + .copy_metadata = copy_metadata_imsm, +#endif + .match_home = match_home_imsm, + .uuid_from_super= uuid_from_super_imsm, + .getinfo_super = getinfo_super_imsm, + .getinfo_super_disks = getinfo_super_disks_imsm, + .update_super = update_super_imsm, + + .avail_size = avail_size_imsm, + .min_acceptable_spare_size = min_acceptable_spare_size_imsm, + + .compare_super = compare_super_imsm, + + .load_super = load_super_imsm, + .init_super = init_super_imsm, + .store_super = store_super_imsm, + .free_super = free_super_imsm, + .match_metadata_desc = match_metadata_desc_imsm, + .container_content = container_content_imsm, + .validate_container = validate_container_imsm, + + .external = 1, + .name = "imsm", + +#ifndef MDASSEMBLE +/* for mdmon */ + .open_new = imsm_open_new, + .set_array_state= imsm_set_array_state, + .set_disk = imsm_set_disk, + .sync_metadata = imsm_sync_metadata, + .activate_spare = imsm_activate_spare, + .process_update = imsm_process_update, + .prepare_update = imsm_prepare_update, +#endif /* MDASSEMBLE */ +}; diff --git a/super-mbr.c b/super-mbr.c new file mode 100644 index 00000000..62b3f031 --- /dev/null +++ b/super-mbr.c @@ -0,0 +1,204 @@ +/* + * mdadm - manage Linux "md" devices aka RAID arrays. + * + * Copyright (C) 2010 Neil Brown <neilb@suse.de> + * + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Author: Neil Brown + * Email: <neil@brown.name> + * + */ + +/* + * 'mbr' is a pseudo metadata type for devices which have a + * partition table in the Master Boot Record (mbr) also known + * as a dos partition table. + * + * Obviously arrays cannot be created or assembled for this type. + * It is used to allow a new bare device to have an partition table + * added so the member partitions can then be included in other + * arrays as relevant. + * + * The meaning operations are: + * examine_super, but not brief_examine_super or export_examine + * load_super + * store_super + */ + +#include "mdadm.h" +#include "part.h" + +static void free_mbr(struct supertype *st) +{ + free(st->sb); + st->sb = NULL; +} + +#ifndef MDASSEMBLE + +static void examine_mbr(struct supertype *st, char *homehost) +{ + struct MBR *sb = st->sb; + int i; + + printf(" MBR Magic : %04x\n", sb->magic); + for (i = 0; i < MBR_PARTITIONS; i++) + if (sb->parts[i].blocks_num) + printf("Partition[%d] : %12lu sectors at %12lu (type %02x)\n", + i, + (unsigned long)__le32_to_cpu(sb->parts[i].blocks_num), + (unsigned long)__le32_to_cpu(sb->parts[i].first_sect_lba), + sb->parts[i].part_type); + +} + +#endif /*MDASSEMBLE */ + +static int load_super_mbr(struct supertype *st, int fd, char *devname) +{ + /* try to read an mbr + * Return + * 0 on success + * 1 cannot get record + * 2 record is meaningless + */ + struct MBR *super; + + free_mbr(st); + + if (posix_memalign((void**)&super, 512, 512) != 0) { + pr_err("could not allocate superblock\n"); + return 1; + } + + lseek(fd, 0, 0); + if (read(fd, super, sizeof(*super)) != sizeof(*super)) { + if (devname) + pr_err("Cannot read partition table on %s\n", + devname); + free(super); + return 1; + } + + if (super->magic != MBR_SIGNATURE_MAGIC) { + if (devname) + pr_err("No partition table found on %s\n", + devname); + free(super); + return 1; + } + + st->sb = super; + + if (st->ss == NULL) { + st->ss = &mbr; + st->minor_version = 0; + st->max_devs = 1; + st->info = NULL; + } + return 0; +} + +static int store_mbr(struct supertype *st, int fd) +{ + struct MBR *old, *super; + + if (posix_memalign((void**)&old, 512, 512) != 0) { + pr_err("could not allocate superblock\n"); + return 1; + } + + lseek(fd, 0, 0); + if (read(fd, old, sizeof(*old)) != sizeof(*old)) { + free(old); + return 1; + } + + super = st->sb; + memcpy(super->pad, old->pad, sizeof(super->pad)); + free(old); + lseek(fd, 0, 0); + if (write(fd, super, sizeof(*super)) != sizeof(*super)) + return 4; + fsync(fd); + ioctl(fd, BLKRRPART, 0); + return 0; +} + +static void getinfo_mbr(struct supertype *st, struct mdinfo *info, char *map) +{ + struct MBR *sb = st->sb; + int i; + + memset(&info->array, 0, sizeof(info->array)); + memset(&info->disk, 0, sizeof(info->disk)); + strcpy(info->text_version, "mbr"); + strcpy(info->name, "mbr"); + info->component_size = 0; + + for (i = 0; i < MBR_PARTITIONS ; i++) + if (sb->parts[i].blocks_num) { + unsigned long last = + (unsigned long)__le32_to_cpu(sb->parts[i].blocks_num) + + (unsigned long)__le32_to_cpu(sb->parts[i].first_sect_lba); + if (last > info->component_size) + info->component_size = last; + } + +} + +static struct supertype *match_metadata_desc(char *arg) +{ + struct supertype *st; + + if (strcmp(arg, "mbr") != 0) + return NULL; + + st = xmalloc(sizeof(*st)); + st->ss = &mbr; + st->info = NULL; + st->minor_version = 0; + st->max_devs = 1; + st->sb = NULL; + return st; +} + +#ifndef MDASSEMBLE +static int validate_geometry(struct supertype *st, int level, + int layout, int raiddisks, + int *chunk, unsigned long long size, + unsigned long long data_offset, + char *subdev, unsigned long long *freesize, + int verbose) +{ + pr_err("mbr metadata cannot be used this way\n"); + return 0; +} +#endif + +struct superswitch mbr = { +#ifndef MDASSEMBLE + .examine_super = examine_mbr, + .validate_geometry = validate_geometry, +#endif + .match_metadata_desc = match_metadata_desc, + .load_super = load_super_mbr, + .store_super = store_mbr, + .getinfo_super = getinfo_mbr, + .free_super = free_mbr, + .name = "mbr", +}; diff --git a/super0.c b/super0.c new file mode 100644 index 00000000..deb59994 --- /dev/null +++ b/super0.c @@ -0,0 +1,1330 @@ +/* + * mdadm - manage Linux "md" devices aka RAID arrays. + * + * Copyright (C) 2001-2009 Neil Brown <neilb@suse.de> + * + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Author: Neil Brown + * Email: <neilb@suse.de> + */ + +#define HAVE_STDINT_H 1 +#include "mdadm.h" +#include "sha1.h" +/* + * All handling for the 0.90.0 version superblock is in + * this file. + * This includes: + * - finding, loading, and writing the superblock. + * - initialising a new superblock + * - printing the superblock for --examine + * - printing part of the superblock for --detail + * .. other stuff + */ + +static unsigned long calc_sb0_csum(mdp_super_t *super) +{ + unsigned long csum = super->sb_csum; + unsigned long newcsum; + super->sb_csum= 0 ; + newcsum = calc_csum(super, MD_SB_BYTES); + super->sb_csum = csum; + return newcsum; +} + +static void super0_swap_endian(struct mdp_superblock_s *sb) +{ + /* as super0 superblocks are host-endian, it is sometimes + * useful to be able to swap the endianness + * as (almost) everything is u32's we byte-swap every 4byte + * number. + * We then also have to swap the events_hi and events_lo + */ + char *sbc = (char *)sb; + __u32 t32; + int i; + + for (i=0; i < MD_SB_BYTES ; i+=4) { + char t = sbc[i]; + sbc[i] = sbc[i+3]; + sbc[i+3] = t; + t=sbc[i+1]; + sbc[i+1]=sbc[i+2]; + sbc[i+2]=t; + } + t32 = sb->events_hi; + sb->events_hi = sb->events_lo; + sb->events_lo = t32; + + t32 = sb->cp_events_hi; + sb->cp_events_hi = sb->cp_events_lo; + sb->cp_events_lo = t32; + +} + +#ifndef MDASSEMBLE + +static void examine_super0(struct supertype *st, char *homehost) +{ + mdp_super_t *sb = st->sb; + time_t atime; + int d; + int delta_extra = 0; + char *c; + + printf(" Magic : %08x\n", sb->md_magic); + printf(" Version : %d.%02d.%02d\n", sb->major_version, sb->minor_version, + sb->patch_version); + if (sb->minor_version >= 90) { + printf(" UUID : %08x:%08x:%08x:%08x", sb->set_uuid0, sb->set_uuid1, + sb->set_uuid2, sb->set_uuid3); + if (homehost) { + char buf[20]; + void *hash = sha1_buffer(homehost, + strlen(homehost), + buf); + if (memcmp(&sb->set_uuid2, hash, 8)==0) + printf(" (local to host %s)", homehost); + } + printf("\n"); + } else + printf(" UUID : %08x\n", sb->set_uuid0); + + if (sb->not_persistent) + printf(" Eedk : not persistent\n"); + + atime = sb->ctime; + printf(" Creation Time : %.24s\n", ctime(&atime)); + c=map_num(pers, sb->level); + printf(" Raid Level : %s\n", c?c:"-unknown-"); + if ((int)sb->level > 0) { + int ddsks = 0, ddsks_denom = 1; + printf(" Used Dev Size : %d%s\n", sb->size, + human_size((long long)sb->size<<10)); + switch(sb->level) { + case 1: ddsks=1;break; + case 4: + case 5: ddsks = sb->raid_disks-1; break; + case 6: ddsks = sb->raid_disks-2; break; + case 10: ddsks = sb->raid_disks; + ddsks_denom = (sb->layout&255) * ((sb->layout>>8)&255); + } + if (ddsks) { + long long asize = sb->size; + asize = (asize << 10) * ddsks / ddsks_denom; + printf(" Array Size : %llu%s\n", + asize >> 10, human_size(asize)); + } + } + printf(" Raid Devices : %d\n", sb->raid_disks); + printf(" Total Devices : %d\n", sb->nr_disks); + printf("Preferred Minor : %d\n", sb->md_minor); + printf("\n"); + if (sb->minor_version > 90 && (sb->reshape_position+1) != 0) { + printf(" Reshape pos'n : %llu%s\n", (unsigned long long)sb->reshape_position/2, human_size((long long)sb->reshape_position<<9)); + if (sb->delta_disks) { + printf(" Delta Devices : %d", sb->delta_disks); + printf(" (%d->%d)\n", sb->raid_disks-sb->delta_disks, sb->raid_disks); + if (((int)sb->delta_disks) < 0) + delta_extra = - sb->delta_disks; + } + if (sb->new_level != sb->level) { + c = map_num(pers, sb->new_level); + printf(" New Level : %s\n", c?c:"-unknown-"); + } + if (sb->new_layout != sb->layout) { + if (sb->level == 5) { + c = map_num(r5layout, sb->new_layout); + printf(" New Layout : %s\n", c?c:"-unknown-"); + } + if (sb->level == 6) { + c = map_num(r6layout, sb->new_layout); + printf(" New Layout : %s\n", c?c:"-unknown-"); + } + if (sb->level == 10) { + printf(" New Layout : near=%d, %s=%d\n", + sb->new_layout&255, + (sb->new_layout&0x10000)?"offset":"far", + (sb->new_layout>>8)&255); + } + } + if (sb->new_chunk != sb->chunk_size) + printf(" New Chunksize : %d\n", sb->new_chunk); + printf("\n"); + } + atime = sb->utime; + printf(" Update Time : %.24s\n", ctime(&atime)); + printf(" State : %s\n", + (sb->state&(1<<MD_SB_CLEAN))?"clean":"active"); + if (sb->state & (1<<MD_SB_BITMAP_PRESENT)) + printf("Internal Bitmap : present\n"); + printf(" Active Devices : %d\n", sb->active_disks); + printf("Working Devices : %d\n", sb->working_disks); + printf(" Failed Devices : %d\n", sb->failed_disks); + printf(" Spare Devices : %d\n", sb->spare_disks); + if (calc_sb0_csum(sb) == sb->sb_csum) + printf(" Checksum : %x - correct\n", sb->sb_csum); + else + printf(" Checksum : %x - expected %lx\n", sb->sb_csum, calc_sb0_csum(sb)); + printf(" Events : %llu\n", + ((unsigned long long)sb->events_hi << 32) + + sb->events_lo); + printf("\n"); + if (sb->level == 5) { + c = map_num(r5layout, sb->layout); + printf(" Layout : %s\n", c?c:"-unknown-"); + } + if (sb->level == 6) { + c = map_num(r6layout, sb->layout); + printf(" Layout : %s\n", c?c:"-unknown-"); + } + if (sb->level == 10) { + printf(" Layout :"); + print_r10_layout(sb->layout); + printf("\n"); + } + switch(sb->level) { + case 0: + case 4: + case 5: + case 6: + case 10: + printf(" Chunk Size : %dK\n", sb->chunk_size/1024); + break; + case -1: + printf(" Rounding : %dK\n", sb->chunk_size/1024); + break; + default: break; + } + printf("\n"); + printf(" Number Major Minor RaidDevice State\n"); + for (d= -1; d<(signed int)(sb->raid_disks+delta_extra + sb->spare_disks); d++) { + mdp_disk_t *dp; + char *dv; + char nb[5]; + int wonly; + if (d>=0) dp = &sb->disks[d]; + else dp = &sb->this_disk; + snprintf(nb, sizeof(nb), "%4d", d); + printf("%4s %5d %5d %5d %5d ", d < 0 ? "this" : nb, + dp->number, dp->major, dp->minor, dp->raid_disk); + wonly = dp->state & (1<<MD_DISK_WRITEMOSTLY); + dp->state &= ~(1<<MD_DISK_WRITEMOSTLY); + if (dp->state & (1<<MD_DISK_FAULTY)) printf(" faulty"); + if (dp->state & (1<<MD_DISK_ACTIVE)) printf(" active"); + if (dp->state & (1<<MD_DISK_SYNC)) printf(" sync"); + if (dp->state & (1<<MD_DISK_REMOVED)) printf(" removed"); + if (wonly) printf(" write-mostly"); + if (dp->state == 0) printf(" spare"); + if ((dv=map_dev(dp->major, dp->minor, 0))) + printf(" %s", dv); + printf("\n"); + if (d == -1) printf("\n"); + } +} + +static void brief_examine_super0(struct supertype *st, int verbose) +{ + mdp_super_t *sb = st->sb; + char *c=map_num(pers, sb->level); + char devname[20]; + + sprintf(devname, "/dev/md%d", sb->md_minor); + + if (verbose) { + printf("ARRAY %s level=%s num-devices=%d", + devname, + c?c:"-unknown-", sb->raid_disks); + } else + printf("ARRAY %s", devname); + + if (sb->minor_version >= 90) + printf(" UUID=%08x:%08x:%08x:%08x", sb->set_uuid0, sb->set_uuid1, + sb->set_uuid2, sb->set_uuid3); + else + printf(" UUID=%08x", sb->set_uuid0); + printf("\n"); +} + +static void export_examine_super0(struct supertype *st) +{ + mdp_super_t *sb = st->sb; + + printf("MD_LEVEL=%s\n", map_num(pers, sb->level)); + printf("MD_DEVICES=%d\n", sb->raid_disks); + if (sb->minor_version >= 90) + printf("MD_UUID=%08x:%08x:%08x:%08x\n", + sb->set_uuid0, sb->set_uuid1, + sb->set_uuid2, sb->set_uuid3); + else + printf("MD_UUID=%08x\n", sb->set_uuid0); + printf("MD_UPDATE_TIME=%llu\n", + __le64_to_cpu(sb->ctime) & 0xFFFFFFFFFFULL); + printf("MD_EVENTS=%llu\n", + ((unsigned long long)sb->events_hi << 32) + + sb->events_lo); +} + +static int copy_metadata0(struct supertype *st, int from, int to) +{ + /* Read 64K from the appropriate offset of 'from' + * and if it looks a little like a 0.90 superblock, + * write it to the same offset of 'to' + */ + void *buf; + unsigned long long dsize, offset; + const int bufsize = 64*1024; + mdp_super_t *super; + + if (posix_memalign(&buf, 4096, bufsize) != 0) + return 1; + + if (!get_dev_size(from, NULL, &dsize)) + goto err; + + if (dsize < MD_RESERVED_SECTORS*512) + goto err; + + offset = MD_NEW_SIZE_SECTORS(dsize>>9); + + offset *= 512; + + if (lseek64(from, offset, 0) < 0LL) + goto err; + if (read(from, buf, bufsize) != bufsize) + goto err; + + if (lseek64(to, offset, 0) < 0LL) + goto err; + super = buf; + if (super->md_magic != MD_SB_MAGIC || + super->major_version != 0 || + calc_sb0_csum(super) != super->sb_csum) + goto err; + if (write(to, buf, bufsize) != bufsize) + goto err; + free(buf); + return 0; +err: + free(buf); + return 1; +} + +static void detail_super0(struct supertype *st, char *homehost) +{ + mdp_super_t *sb = st->sb; + printf(" UUID : "); + if (sb->minor_version >= 90) + printf("%08x:%08x:%08x:%08x", sb->set_uuid0, sb->set_uuid1, + sb->set_uuid2, sb->set_uuid3); + else + printf("%08x", sb->set_uuid0); + if (homehost) { + char buf[20]; + void *hash = sha1_buffer(homehost, + strlen(homehost), + buf); + if (memcmp(&sb->set_uuid2, hash, 8)==0) + printf(" (local to host %s)", homehost); + } + printf("\n Events : %d.%d\n\n", sb->events_hi, sb->events_lo); +} + +static void brief_detail_super0(struct supertype *st) +{ + mdp_super_t *sb = st->sb; + printf(" UUID="); + if (sb->minor_version >= 90) + printf("%08x:%08x:%08x:%08x", sb->set_uuid0, sb->set_uuid1, + sb->set_uuid2, sb->set_uuid3); + else + printf("%08x", sb->set_uuid0); +} +#endif + +static int match_home0(struct supertype *st, char *homehost) +{ + mdp_super_t *sb = st->sb; + char buf[20]; + char *hash; + + if (!homehost) + return 0; + hash = sha1_buffer(homehost, + strlen(homehost), + buf); + + return (memcmp(&sb->set_uuid2, hash, 8)==0); +} + +static void uuid_from_super0(struct supertype *st, int uuid[4]) +{ + mdp_super_t *super = st->sb; + uuid[0] = super->set_uuid0; + if (super->minor_version >= 90) { + uuid[1] = super->set_uuid1; + uuid[2] = super->set_uuid2; + uuid[3] = super->set_uuid3; + } else { + uuid[1] = 0; + uuid[2] = 0; + uuid[3] = 0; + } +} + +static void getinfo_super0(struct supertype *st, struct mdinfo *info, char *map) +{ + mdp_super_t *sb = st->sb; + int working = 0; + int i; + int map_disks = info->array.raid_disks; + + memset(info, 0, sizeof(*info)); + info->array.major_version = sb->major_version; + info->array.minor_version = sb->minor_version; + info->array.patch_version = sb->patch_version; + info->array.raid_disks = sb->raid_disks; + info->array.level = sb->level; + info->array.layout = sb->layout; + info->array.md_minor = sb->md_minor; + info->array.ctime = sb->ctime; + info->array.utime = sb->utime; + info->array.chunk_size = sb->chunk_size; + info->array.state = sb->state; + info->component_size = sb->size*2; + + if (sb->state & (1<<MD_SB_BITMAP_PRESENT)) + info->bitmap_offset = 8; + + info->disk.state = sb->this_disk.state; + info->disk.major = sb->this_disk.major; + info->disk.minor = sb->this_disk.minor; + info->disk.raid_disk = sb->this_disk.raid_disk; + info->disk.number = sb->this_disk.number; + + info->events = md_event(sb); + info->data_offset = 0; + + sprintf(info->text_version, "0.%d", sb->minor_version); + info->safe_mode_delay = 200; + + uuid_from_super0(st, info->uuid); + + info->recovery_start = MaxSector; + if (sb->minor_version > 90 && (sb->reshape_position+1) != 0) { + info->reshape_active = 1; + info->reshape_progress = sb->reshape_position; + info->new_level = sb->new_level; + info->delta_disks = sb->delta_disks; + info->new_layout = sb->new_layout; + info->new_chunk = sb->new_chunk; + if (info->delta_disks < 0) + info->array.raid_disks -= info->delta_disks; + } else + info->reshape_active = 0; + + info->recovery_blocked = info->reshape_active; + + sprintf(info->name, "%d", sb->md_minor); + /* work_disks is calculated rather than read directly */ + for (i=0; i < MD_SB_DISKS; i++) + if ((sb->disks[i].state & (1<<MD_DISK_SYNC)) && + (sb->disks[i].raid_disk < (unsigned)info->array.raid_disks) && + (sb->disks[i].state & (1<<MD_DISK_ACTIVE)) && + !(sb->disks[i].state & (1<<MD_DISK_FAULTY))) { + working ++; + if (map && i < map_disks) + map[i] = 1; + } else if (map && i < map_disks) + map[i] = 0; + info->array.working_disks = working; +} + +static struct mdinfo *container_content0(struct supertype *st, char *subarray) +{ + struct mdinfo *info; + + if (subarray) + return NULL; + + info = xmalloc(sizeof(*info)); + getinfo_super0(st, info, NULL); + return info; +} + +static int update_super0(struct supertype *st, struct mdinfo *info, + char *update, + char *devname, int verbose, + int uuid_set, char *homehost) +{ + /* NOTE: for 'assemble' and 'force' we need to return non-zero + * if any change was made. For others, the return value is + * ignored. + */ + int rv = 0; + int uuid[4]; + mdp_super_t *sb = st->sb; + + if (strcmp(update, "homehost") == 0 && + homehost) { + /* note that 'homehost' is special as it is really + * a "uuid" update. + */ + uuid_set = 0; + update = "uuid"; + info->uuid[0] = sb->set_uuid0; + info->uuid[1] = sb->set_uuid1; + } + + if (strcmp(update, "sparc2.2")==0 ) { + /* 2.2 sparc put the events in the wrong place + * So we copy the tail of the superblock + * up 4 bytes before continuing + */ + __u32 *sb32 = (__u32*)sb; + memcpy(sb32+MD_SB_GENERIC_CONSTANT_WORDS+7, + sb32+MD_SB_GENERIC_CONSTANT_WORDS+7+1, + (MD_SB_WORDS - (MD_SB_GENERIC_CONSTANT_WORDS+7+1))*4); + if (verbose >= 0) + pr_err("adjusting superblock of %s for 2.2/sparc compatibility.\n", + devname); + } else if (strcmp(update, "super-minor") ==0) { + sb->md_minor = info->array.md_minor; + if (verbose > 0) + pr_err("updating superblock of %s with minor number %d\n", + devname, info->array.md_minor); + } else if (strcmp(update, "summaries") == 0) { + unsigned int i; + /* set nr_disks, active_disks, working_disks, + * failed_disks, spare_disks based on disks[] + * array in superblock. + * Also make sure extra slots aren't 'failed' + */ + sb->nr_disks = sb->active_disks = + sb->working_disks = sb->failed_disks = + sb->spare_disks = 0; + for (i=0; i < MD_SB_DISKS ; i++) + if (sb->disks[i].major || + sb->disks[i].minor) { + int state = sb->disks[i].state; + if (state & (1<<MD_DISK_REMOVED)) + continue; + sb->nr_disks++; + if (state & (1<<MD_DISK_ACTIVE)) + sb->active_disks++; + if (state & (1<<MD_DISK_FAULTY)) + sb->failed_disks++; + else + sb->working_disks++; + if (state == 0) + sb->spare_disks++; + } else if (i >= sb->raid_disks && sb->disks[i].number == 0) + sb->disks[i].state = 0; + } else if (strcmp(update, "force-one")==0) { + /* Not enough devices for a working array, so + * bring this one up-to-date. + */ + __u32 ehi = sb->events_hi, elo = sb->events_lo; + sb->events_hi = (info->events>>32) & 0xFFFFFFFF; + sb->events_lo = (info->events) & 0xFFFFFFFF; + if (sb->events_hi != ehi || + sb->events_lo != elo) + rv = 1; + } else if (strcmp(update, "force-array")==0) { + /* degraded array and 'force' requested, so + * maybe need to mark it 'clean' + */ + if ((sb->level == 5 || sb->level == 4 || sb->level == 6) && + (sb->state & (1 << MD_SB_CLEAN)) == 0) { + /* need to force clean */ + sb->state |= (1 << MD_SB_CLEAN); + rv = 1; + } + } else if (strcmp(update, "assemble")==0) { + int d = info->disk.number; + int wonly = sb->disks[d].state & (1<<MD_DISK_WRITEMOSTLY); + int mask = (1<<MD_DISK_WRITEMOSTLY); + int add = 0; + if (sb->minor_version >= 91) + /* During reshape we don't insist on everything + * being marked 'sync' + */ + add = (1<<MD_DISK_SYNC); + if (((sb->disks[d].state & ~mask) | add) + != (unsigned)info->disk.state) { + sb->disks[d].state = info->disk.state | wonly; + rv = 1; + } + if (info->reshape_active && + sb->minor_version > 90 && (sb->reshape_position+1) != 0 && + info->delta_disks >= 0 && + info->reshape_progress < sb->reshape_position) { + sb->reshape_position = info->reshape_progress; + rv = 1; + } + if (info->reshape_active && + sb->minor_version > 90 && (sb->reshape_position+1) != 0 && + info->delta_disks < 0 && + info->reshape_progress > sb->reshape_position) { + sb->reshape_position = info->reshape_progress; + rv = 1; + } + } else if (strcmp(update, "linear-grow-new") == 0) { + memset(&sb->disks[info->disk.number], 0, sizeof(sb->disks[0])); + sb->disks[info->disk.number].number = info->disk.number; + sb->disks[info->disk.number].major = info->disk.major; + sb->disks[info->disk.number].minor = info->disk.minor; + sb->disks[info->disk.number].raid_disk = info->disk.raid_disk; + sb->disks[info->disk.number].state = info->disk.state; + sb->this_disk = sb->disks[info->disk.number]; + } else if (strcmp(update, "linear-grow-update") == 0) { + sb->raid_disks = info->array.raid_disks; + sb->nr_disks = info->array.nr_disks; + sb->active_disks = info->array.active_disks; + sb->working_disks = info->array.working_disks; + memset(&sb->disks[info->disk.number], 0, sizeof(sb->disks[0])); + sb->disks[info->disk.number].number = info->disk.number; + sb->disks[info->disk.number].major = info->disk.major; + sb->disks[info->disk.number].minor = info->disk.minor; + sb->disks[info->disk.number].raid_disk = info->disk.raid_disk; + sb->disks[info->disk.number].state = info->disk.state; + } else if (strcmp(update, "resync") == 0) { + /* make sure resync happens */ + sb->state &= ~(1<<MD_SB_CLEAN); + sb->recovery_cp = 0; + } else if (strcmp(update, "uuid") == 0) { + if (!uuid_set && homehost) { + char buf[20]; + char *hash = sha1_buffer(homehost, + strlen(homehost), + buf); + memcpy(info->uuid+2, hash, 8); + } + sb->set_uuid0 = info->uuid[0]; + sb->set_uuid1 = info->uuid[1]; + sb->set_uuid2 = info->uuid[2]; + sb->set_uuid3 = info->uuid[3]; + if (sb->state & (1<<MD_SB_BITMAP_PRESENT)) { + struct bitmap_super_s *bm; + bm = (struct bitmap_super_s*)(sb+1); + uuid_from_super0(st, uuid); + memcpy(bm->uuid, uuid, 16); + } + } else if (strcmp(update, "metadata") == 0) { + /* Create some v1.0 metadata to match ours but make the + * ctime bigger. Also update info->array.*_version. + * We need to arrange that store_super writes out + * the v1.0 metadata. + * Not permitted for unclean array, or array with + * bitmap. + */ + if (info->bitmap_offset) { + pr_err("Cannot update metadata when bitmap is present\n"); + rv = -2; + } else if (info->array.state != 1) { + pr_err("Cannot update metadata on unclean array\n"); + rv = -2; + } else { + info->array.major_version = 1; + info->array.minor_version = 0; + uuid_from_super0(st, info->uuid); + st->other = super1_make_v0(st, info, st->sb); + } + } else if (strcmp(update, "revert-reshape") == 0) { + rv = -2; + if (sb->minor_version <= 90) + pr_err("No active reshape to revert on %s\n", + devname); + else if (sb->delta_disks == 0) + pr_err("%s: Can only revert reshape which changes number of devices\n", + devname); + else { + int tmp; + int parity = sb->level == 6 ? 2 : 1; + rv = 0; + + if (sb->level >= 4 && sb->level <= 6 && + sb->reshape_position % ( + sb->new_chunk/512 * + (sb->raid_disks - sb->delta_disks - parity))) { + pr_err("Reshape position is not suitably aligned.\n"); + pr_err("Try normal assembly and stop again\n"); + return -2; + } + sb->raid_disks -= sb->delta_disks; + sb->delta_disks = -sb->delta_disks; + + tmp = sb->new_layout; + sb->new_layout = sb->layout; + sb->layout = tmp; + + tmp = sb->new_chunk; + sb->new_chunk = sb->chunk_size; + sb->chunk_size = tmp; + } + } else if (strcmp(update, "no-bitmap") == 0) { + sb->state &= ~(1<<MD_SB_BITMAP_PRESENT); + } else if (strcmp(update, "_reshape_progress")==0) + sb->reshape_position = info->reshape_progress; + else if (strcmp(update, "writemostly")==0) + sb->state |= (1<<MD_DISK_WRITEMOSTLY); + else if (strcmp(update, "readwrite")==0) + sb->state &= ~(1<<MD_DISK_WRITEMOSTLY); + else + rv = -1; + + sb->sb_csum = calc_sb0_csum(sb); + return rv; +} + +/* + * For verion-0 superblock, the homehost is 'stored' in the + * uuid. 8 bytes for a hash of the host leaving 8 bytes + * of random material. + * We use the first 8 bytes (64bits) of the sha1 of the + * host name + */ + +static int init_super0(struct supertype *st, mdu_array_info_t *info, + unsigned long long size, char *ignored_name, char *homehost, + int *uuid, unsigned long long data_offset) +{ + mdp_super_t *sb; + int spares; + + if (data_offset != INVALID_SECTORS) { + pr_err("data-offset not support for 0.90\n"); + return 0; + } + + if (posix_memalign((void**)&sb, 4096, + MD_SB_BYTES + ROUND_UP(sizeof(bitmap_super_t), 4096)) != 0) { + pr_err("could not allocate superblock\n"); + return 0; + } + memset(sb, 0, MD_SB_BYTES + sizeof(bitmap_super_t)); + + st->sb = sb; + if (info == NULL) { + /* zeroing the superblock */ + return 0; + } + + spares = info->working_disks - info->active_disks; + if (info->raid_disks + spares > MD_SB_DISKS) { + pr_err("too many devices requested: %d+%d > %d\n", + info->raid_disks , spares, MD_SB_DISKS); + return 0; + } + + sb->md_magic = MD_SB_MAGIC; + sb->major_version = 0; + sb->minor_version = 90; + sb->patch_version = 0; + sb->gvalid_words = 0; /* ignored */ + sb->ctime = time(0); + sb->level = info->level; + sb->size = size; + if (size != (unsigned long long)sb->size) + return 0; + sb->nr_disks = info->nr_disks; + sb->raid_disks = info->raid_disks; + sb->md_minor = info->md_minor; + sb->not_persistent = 0; + if (uuid) { + sb->set_uuid0 = uuid[0]; + sb->set_uuid1 = uuid[1]; + sb->set_uuid2 = uuid[2]; + sb->set_uuid3 = uuid[3]; + } else { + int rfd = open("/dev/urandom", O_RDONLY); + if (rfd < 0 || read(rfd, &sb->set_uuid0, 4) != 4) + sb->set_uuid0 = random(); + if (rfd < 0 || read(rfd, &sb->set_uuid1, 12) != 12) { + sb->set_uuid1 = random(); + sb->set_uuid2 = random(); + sb->set_uuid3 = random(); + } + if (rfd >= 0) + close(rfd); + } + if (homehost && !uuid) { + char buf[20]; + char *hash = sha1_buffer(homehost, + strlen(homehost), + buf); + memcpy(&sb->set_uuid2, hash, 8); + } + + sb->utime = sb->ctime; + sb->state = info->state; + sb->active_disks = info->active_disks; + sb->working_disks = info->working_disks; + sb->failed_disks = info->failed_disks; + sb->spare_disks = info->spare_disks; + sb->events_hi = 0; + sb->events_lo = 1; + + sb->layout = info->layout; + sb->chunk_size = info->chunk_size; + + return 1; +} + +struct devinfo { + int fd; + char *devname; + mdu_disk_info_t disk; + struct devinfo *next; +}; + +#ifndef MDASSEMBLE +/* Add a device to the superblock being created */ +static int add_to_super0(struct supertype *st, mdu_disk_info_t *dinfo, + int fd, char *devname, unsigned long long data_offset) +{ + mdp_super_t *sb = st->sb; + mdp_disk_t *dk = &sb->disks[dinfo->number]; + struct devinfo *di, **dip; + + dk->number = dinfo->number; + dk->major = dinfo->major; + dk->minor = dinfo->minor; + dk->raid_disk = dinfo->raid_disk; + dk->state = dinfo->state & ((1<<MD_DISK_ACTIVE) | + (1<<MD_DISK_SYNC)); + + sb->this_disk = sb->disks[dinfo->number]; + sb->sb_csum = calc_sb0_csum(sb); + + dip = (struct devinfo **)&st->info; + while (*dip) + dip = &(*dip)->next; + di = xmalloc(sizeof(struct devinfo)); + di->fd = fd; + di->devname = devname; + di->disk = *dinfo; + di->next = NULL; + *dip = di; + + return 0; +} +#endif + +static int store_super0(struct supertype *st, int fd) +{ + unsigned long long dsize; + unsigned long long offset; + mdp_super_t *super = st->sb; + + if (!get_dev_size(fd, NULL, &dsize)) + return 1; + + if (dsize < MD_RESERVED_SECTORS*512) + return 2; + + if (st->other) { + /* Writing out v1.0 metadata for --update=metadata */ + int ret = 0; + + offset = dsize/512 - 8*2; + offset &= ~(4*2-1); + offset *= 512; + if (lseek64(fd, offset, 0)< 0LL) + ret = 3; + else if (write(fd, st->other, 1024) != 1024) + ret = 4; + else + fsync(fd); + free(st->other); + st->other = NULL; + return ret; + } + + offset = MD_NEW_SIZE_SECTORS(dsize>>9); + + offset *= 512; + + if (lseek64(fd, offset, 0)< 0LL) + return 3; + + if (write(fd, super, sizeof(*super)) != sizeof(*super)) + return 4; + + if (super->state & (1<<MD_SB_BITMAP_PRESENT)) { + struct bitmap_super_s * bm = (struct bitmap_super_s*)(super+1); + if (__le32_to_cpu(bm->magic) == BITMAP_MAGIC) + if (write(fd, bm, ROUND_UP(sizeof(*bm),4096)) != + ROUND_UP(sizeof(*bm),4096)) + return 5; + } + + fsync(fd); + return 0; +} + +#ifndef MDASSEMBLE +static int write_init_super0(struct supertype *st) +{ + mdp_super_t *sb = st->sb; + int rv = 0; + struct devinfo *di; + + for (di = st->info ; di && ! rv ; di = di->next) { + + if (di->disk.state & (1 << MD_DISK_FAULTY)) + continue; + if (di->fd == -1) + continue; + while (Kill(di->devname, NULL, 0, -1, 1) == 0) + ; + + sb->disks[di->disk.number].state &= ~(1<<MD_DISK_FAULTY); + + sb->this_disk = sb->disks[di->disk.number]; + sb->sb_csum = calc_sb0_csum(sb); + rv = store_super0(st, di->fd); + + if (rv == 0 && (sb->state & (1<<MD_SB_BITMAP_PRESENT))) + rv = st->ss->write_bitmap(st, di->fd); + + if (rv) + pr_err("failed to write superblock to %s\n", + di->devname); + } + return rv; +} +#endif + +static int compare_super0(struct supertype *st, struct supertype *tst) +{ + /* + * return: + * 0 same, or first was empty, and second was copied + * 1 second had wrong number + * 2 wrong uuid + * 3 wrong other info + */ + mdp_super_t *first = st->sb; + mdp_super_t *second = tst->sb; + int uuid1[4], uuid2[4]; + + if (second->md_magic != MD_SB_MAGIC) + return 1; + if (!first) { + if (posix_memalign((void**)&first, 4096, + MD_SB_BYTES + + ROUND_UP(sizeof(struct bitmap_super_s), 4096)) != 0) { + pr_err("could not allocate superblock\n"); + return 1; + } + memcpy(first, second, MD_SB_BYTES + sizeof(struct bitmap_super_s)); + st->sb = first; + return 0; + } + + uuid_from_super0(st, uuid1); + uuid_from_super0(tst, uuid2); + if (!same_uuid(uuid1, uuid2, 0)) + return 2; + if (first->major_version != second->major_version || + first->minor_version != second->minor_version || + first->patch_version != second->patch_version || + first->gvalid_words != second->gvalid_words || + first->ctime != second->ctime || + first->level != second->level || + first->size != second->size || + first->raid_disks != second->raid_disks ) + return 3; + + return 0; +} + +static void free_super0(struct supertype *st); + +static int load_super0(struct supertype *st, int fd, char *devname) +{ + /* try to read in the superblock + * Return: + * 0 on success + * 1 on cannot get superblock + * 2 on superblock meaningless + */ + unsigned long long dsize; + unsigned long long offset; + mdp_super_t *super; + int uuid[4]; + struct bitmap_super_s *bsb; + + free_super0(st); + + if (!get_dev_size(fd, devname, &dsize)) + return 1; + + if (dsize < MD_RESERVED_SECTORS*512) { + if (devname) + pr_err("%s is too small for md: size is %llu sectors.\n", + devname, dsize); + return 1; + } + st->devsize = dsize; + + offset = MD_NEW_SIZE_SECTORS(dsize>>9); + + offset *= 512; + + if (lseek64(fd, offset, 0)< 0LL) { + if (devname) + pr_err("Cannot seek to superblock on %s: %s\n", + devname, strerror(errno)); + return 1; + } + + if (posix_memalign((void**)&super, 4096, + MD_SB_BYTES + + ROUND_UP(sizeof(bitmap_super_t), 4096)) != 0) { + pr_err("could not allocate superblock\n"); + return 1; + } + + if (read(fd, super, sizeof(*super)) != MD_SB_BYTES) { + if (devname) + pr_err("Cannot read superblock on %s\n", + devname); + free(super); + return 1; + } + + if (st->ss && st->minor_version == 9) + super0_swap_endian(super); + + if (super->md_magic != MD_SB_MAGIC) { + if (devname) + pr_err("No super block found on %s (Expected magic %08x, got %08x)\n", + devname, MD_SB_MAGIC, super->md_magic); + free(super); + return 2; + } + + if (super->major_version != 0) { + if (devname) + pr_err("Cannot interpret superblock on %s - version is %d\n", + devname, super->major_version); + free(super); + return 2; + } + st->sb = super; + + if (st->ss == NULL) { + st->ss = &super0; + st->minor_version = super->minor_version; + st->max_devs = MD_SB_DISKS; + st->info = NULL; + } + + /* Now check on the bitmap superblock */ + if ((super->state & (1<<MD_SB_BITMAP_PRESENT)) == 0) + return 0; + /* Read the bitmap superblock and make sure it looks + * valid. If it doesn't clear the bit. An --assemble --force + * should get that written out. + */ + if (read(fd, super+1, ROUND_UP(sizeof(struct bitmap_super_s),4096)) + != ROUND_UP(sizeof(struct bitmap_super_s),4096)) + goto no_bitmap; + + uuid_from_super0(st, uuid); + bsb = (struct bitmap_super_s *)(super+1); + if (__le32_to_cpu(bsb->magic) != BITMAP_MAGIC || + memcmp(bsb->uuid, uuid, 16) != 0) + goto no_bitmap; + return 0; + + no_bitmap: + super->state &= ~(1<<MD_SB_BITMAP_PRESENT); + + return 0; +} + +static struct supertype *match_metadata_desc0(char *arg) +{ + struct supertype *st = xcalloc(1, sizeof(*st)); + + st->container_devnm[0] = 0; + st->ss = &super0; + st->info = NULL; + st->minor_version = 90; + st->max_devs = MD_SB_DISKS; + st->sb = NULL; + /* we sometimes get 00.90 */ + while (arg[0] == '0' && arg[1] == '0') + arg++; + if (strcmp(arg, "0") == 0 || +#ifdef DEFAULT_OLD_METADATA /* ifndef in super1.c */ + strcmp(arg, "default") == 0 || +#endif /* DEFAULT_OLD_METADATA */ + strcmp(arg, "0.90") == 0 || + strcmp(arg, "") == 0 /* no metadata - i.e. non_persistent */ + ) + return st; + + st->minor_version = 91; /* reshape in progress */ + if (strcmp(arg, "0.91") == 0) /* For dup_super support */ + return st; + + st->minor_version = 9; /* flag for 'byte-swapped' */ + if (strcmp(arg, "0.swap")==0 || + strcmp(arg, "0.9") == 0) /* For dup_super support */ + return st; + + free(st); + return NULL; +} + +static __u64 avail_size0(struct supertype *st, __u64 devsize, + unsigned long long data_offset) +{ + if (data_offset != 0 && data_offset != INVALID_SECTORS) + return 0ULL; + if (devsize < MD_RESERVED_SECTORS) + return 0ULL; + return MD_NEW_SIZE_SECTORS(devsize); +} + +static int add_internal_bitmap0(struct supertype *st, int *chunkp, + int delay, int write_behind, + unsigned long long size, int may_change, + int major) +{ + /* + * The bitmap comes immediately after the superblock and must be 60K in size + * at most. The default size is between 30K and 60K + * + * size is in sectors, chunk is in bytes !!! + */ + unsigned long long bits; + unsigned long long max_bits = (60*1024 - sizeof(bitmap_super_t))*8; + unsigned long long min_chunk; + int chunk = *chunkp; + mdp_super_t *sb = st->sb; + bitmap_super_t *bms = (bitmap_super_t*)(((char*)sb) + MD_SB_BYTES); + int uuid[4]; + + min_chunk = 4096; /* sub-page chunks don't work yet.. */ + bits = (size * 512) / min_chunk + 1; + while (bits > max_bits) { + min_chunk *= 2; + bits = (bits+1)/2; + } + if (chunk == UnSet) { + /* A chunk size less than a few Megabytes gives poor + * performance without increasing resync noticeably + */ + chunk = min_chunk; + if (chunk < 64*1024*1024) + chunk = 64*1024*1024; + } else if ((unsigned long long)chunk < min_chunk) + return 0; /* chunk size too small */ + + sb->state |= (1<<MD_SB_BITMAP_PRESENT); + + memset(bms, 0, sizeof(*bms)); + bms->magic = __cpu_to_le32(BITMAP_MAGIC); + bms->version = __cpu_to_le32(major); + uuid_from_super0(st, uuid); + memcpy(bms->uuid, uuid, 16); + bms->chunksize = __cpu_to_le32(chunk); + bms->daemon_sleep = __cpu_to_le32(delay); + bms->sync_size = __cpu_to_le64(size); + bms->write_behind = __cpu_to_le32(write_behind); + *chunkp = chunk; + return 1; +} + +static void locate_bitmap0(struct supertype *st, int fd) +{ + unsigned long long dsize; + unsigned long long offset; + + if (!get_dev_size(fd, NULL, &dsize)) + return; + + if (dsize < MD_RESERVED_SECTORS*512) + return; + + offset = MD_NEW_SIZE_SECTORS(dsize>>9); + + offset *= 512; + + offset += MD_SB_BYTES; + + lseek64(fd, offset, 0); +} + +static int write_bitmap0(struct supertype *st, int fd) +{ + unsigned long long dsize; + unsigned long long offset; + mdp_super_t *sb = st->sb; + + int rv = 0; + + int towrite, n; + void *buf; + + if (!get_dev_size(fd, NULL, &dsize)) + return 1; + + if (dsize < MD_RESERVED_SECTORS*512) + return -1; + + offset = MD_NEW_SIZE_SECTORS(dsize>>9); + + offset *= 512; + + if (lseek64(fd, offset + 4096, 0)< 0LL) + return 3; + + if (posix_memalign(&buf, 4096, 4096)) + return -ENOMEM; + + memset(buf, 0xff, 4096); + memcpy(buf, ((char*)sb)+MD_SB_BYTES, sizeof(bitmap_super_t)); + towrite = 60*1024; + while (towrite > 0) { + n = towrite; + if (n > 4096) + n = 4096; + n = write(fd, buf, n); + if (n > 0) + towrite -= n; + else + break; + memset(buf, 0xff, 4096); + } + fsync(fd); + if (towrite) + rv = -2; + + free(buf); + return rv; +} + +static void free_super0(struct supertype *st) +{ + if (st->sb) + free(st->sb); + while (st->info) { + struct devinfo *di = st->info; + st->info = di->next; + if (di->fd >= 0) + close(di->fd); + free(di); + } + st->sb = NULL; +} + +#ifndef MDASSEMBLE +static int validate_geometry0(struct supertype *st, int level, + int layout, int raiddisks, + int *chunk, unsigned long long size, + unsigned long long data_offset, + char *subdev, unsigned long long *freesize, + int verbose) +{ + unsigned long long ldsize; + int fd; + unsigned int tbmax = 4; + + /* prior to linux 3.1, a but limits usable device size to 2TB. + * It was introduced in 2.6.29, but we won't worry about that detail + */ + if (get_linux_version() < 3001000) + tbmax = 2; + + if (level == LEVEL_CONTAINER) { + if (verbose) + pr_err("0.90 metadata does not support containers\n"); + return 0; + } + if (raiddisks > MD_SB_DISKS) { + if (verbose) + pr_err("0.90 metadata supports at most %d devices per array\n", + MD_SB_DISKS); + return 0; + } + if (size >= tbmax * 2ULL*1024*1024*1024) { + if (verbose) + pr_err("0.90 metadata supports at most %d terabytes per device\n", tbmax); + return 0; + } + if (*chunk == UnSet) + *chunk = DEFAULT_CHUNK; + + if (!subdev) + return 1; + + fd = open(subdev, O_RDONLY|O_EXCL, 0); + if (fd < 0) { + if (verbose) + pr_err("super0.90 cannot open %s: %s\n", + subdev, strerror(errno)); + return 0; + } + + if (!get_dev_size(fd, subdev, &ldsize)) { + close(fd); + return 0; + } + close(fd); + + if (ldsize < MD_RESERVED_SECTORS * 512) + return 0; + *freesize = MD_NEW_SIZE_SECTORS(ldsize >> 9); + return 1; +} +#endif /* MDASSEMBLE */ + +struct superswitch super0 = { +#ifndef MDASSEMBLE + .examine_super = examine_super0, + .brief_examine_super = brief_examine_super0, + .export_examine_super = export_examine_super0, + .detail_super = detail_super0, + .brief_detail_super = brief_detail_super0, + .write_init_super = write_init_super0, + .validate_geometry = validate_geometry0, + .add_to_super = add_to_super0, + .copy_metadata = copy_metadata0, +#endif + .match_home = match_home0, + .uuid_from_super = uuid_from_super0, + .getinfo_super = getinfo_super0, + .container_content = container_content0, + .update_super = update_super0, + .init_super = init_super0, + .store_super = store_super0, + .compare_super = compare_super0, + .load_super = load_super0, + .match_metadata_desc = match_metadata_desc0, + .avail_size = avail_size0, + .add_internal_bitmap = add_internal_bitmap0, + .locate_bitmap = locate_bitmap0, + .write_bitmap = write_bitmap0, + .free_super = free_super0, + .name = "0.90", +}; diff --git a/super1.c b/super1.c new file mode 100644 index 00000000..f0508fe7 --- /dev/null +++ b/super1.c @@ -0,0 +1,2417 @@ +/* + * mdadm - manage Linux "md" devices aka RAID arrays. + * + * Copyright (C) 2001-2009 Neil Brown <neilb@suse.de> + * + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Author: Neil Brown + * Email: <neilb@suse.de> + */ + +#include <stddef.h> +#include "mdadm.h" +/* + * The version-1 superblock : + * All numeric fields are little-endian. + * + * total size: 256 bytes plus 2 per device. + * 1K allows 384 devices. + */ +struct mdp_superblock_1 { + /* constant array information - 128 bytes */ + __u32 magic; /* MD_SB_MAGIC: 0xa92b4efc - little endian */ + __u32 major_version; /* 1 */ + __u32 feature_map; /* 0 for now */ + __u32 pad0; /* always set to 0 when writing */ + + __u8 set_uuid[16]; /* user-space generated. */ + char set_name[32]; /* set and interpreted by user-space */ + + __u64 ctime; /* lo 40 bits are seconds, top 24 are microseconds or 0*/ + __u32 level; /* -4 (multipath), -1 (linear), 0,1,4,5 */ + __u32 layout; /* only for raid5 currently */ + __u64 size; /* used size of component devices, in 512byte sectors */ + + __u32 chunksize; /* in 512byte sectors */ + __u32 raid_disks; + __u32 bitmap_offset; /* sectors after start of superblock that bitmap starts + * NOTE: signed, so bitmap can be before superblock + * only meaningful of feature_map[0] is set. + */ + + /* These are only valid with feature bit '4' */ + __u32 new_level; /* new level we are reshaping to */ + __u64 reshape_position; /* next address in array-space for reshape */ + __u32 delta_disks; /* change in number of raid_disks */ + __u32 new_layout; /* new layout */ + __u32 new_chunk; /* new chunk size (sectors) */ + __u32 new_offset; /* signed number to add to data_offset in new + * layout. 0 == no-change. This can be + * different on each device in the array. + */ + + /* constant this-device information - 64 bytes */ + __u64 data_offset; /* sector start of data, often 0 */ + __u64 data_size; /* sectors in this device that can be used for data */ + __u64 super_offset; /* sector start of this superblock */ + __u64 recovery_offset;/* sectors before this offset (from data_offset) have been recovered */ + __u32 dev_number; /* permanent identifier of this device - not role in raid */ + __u32 cnt_corrected_read; /* number of read errors that were corrected by re-writing */ + __u8 device_uuid[16]; /* user-space setable, ignored by kernel */ + __u8 devflags; /* per-device flags. Only one defined...*/ +#define WriteMostly1 1 /* mask for writemostly flag in above */ + /* bad block log. If there are any bad blocks the feature flag is set. + * if offset and size are non-zero, that space is reserved and available. + */ + __u8 bblog_shift; /* shift from sectors to block size for badblocklist */ + __u16 bblog_size; /* number of sectors reserved for badblocklist */ + __u32 bblog_offset; /* sector offset from superblock to bblog, signed */ + + /* array state information - 64 bytes */ + __u64 utime; /* 40 bits second, 24 btes microseconds */ + __u64 events; /* incremented when superblock updated */ + __u64 resync_offset; /* data before this offset (from data_offset) known to be in sync */ + __u32 sb_csum; /* checksum upto dev_roles[max_dev] */ + __u32 max_dev; /* size of dev_roles[] array to consider */ + __u8 pad3[64-32]; /* set to 0 when writing */ + + /* device state information. Indexed by dev_number. + * 2 bytes per device + * Note there are no per-device state flags. State information is rolled + * into the 'roles' value. If a device is spare or faulty, then it doesn't + * have a meaningful role. + */ + __u16 dev_roles[0]; /* role in array, or 0xffff for a spare, or 0xfffe for faulty */ +}; + +#define MAX_SB_SIZE 4096 +/* bitmap super size is 256, but we round up to a sector for alignment */ +#define BM_SUPER_SIZE 512 +#define MAX_DEVS ((int)(MAX_SB_SIZE - sizeof(struct mdp_superblock_1)) / 2) +#define SUPER1_SIZE (MAX_SB_SIZE + BM_SUPER_SIZE \ + + sizeof(struct misc_dev_info)) + +struct misc_dev_info { + __u64 device_size; +}; + +/* feature_map bits */ +#define MD_FEATURE_BITMAP_OFFSET 1 +#define MD_FEATURE_RECOVERY_OFFSET 2 /* recovery_offset is present and + * must be honoured + */ +#define MD_FEATURE_RESHAPE_ACTIVE 4 +#define MD_FEATURE_BAD_BLOCKS 8 /* badblock list is not empty */ +#define MD_FEATURE_REPLACEMENT 16 /* This device is replacing an + * active device with same 'role'. + * 'recovery_offset' is also set. + */ +#define MD_FEATURE_RESHAPE_BACKWARDS 32 /* Reshape doesn't change number + * of devices, but is going + * backwards anyway. + */ +#define MD_FEATURE_NEW_OFFSET 64 /* new_offset must be honoured */ +#define MD_FEATURE_ALL (MD_FEATURE_BITMAP_OFFSET \ + |MD_FEATURE_RECOVERY_OFFSET \ + |MD_FEATURE_RESHAPE_ACTIVE \ + |MD_FEATURE_BAD_BLOCKS \ + |MD_FEATURE_REPLACEMENT \ + |MD_FEATURE_RESHAPE_BACKWARDS \ + |MD_FEATURE_NEW_OFFSET \ + ) + +static unsigned int calc_sb_1_csum(struct mdp_superblock_1 * sb) +{ + unsigned int disk_csum, csum; + unsigned long long newcsum; + int size = sizeof(*sb) + __le32_to_cpu(sb->max_dev)*2; + unsigned int *isuper = (unsigned int*)sb; + +/* make sure I can count... */ + if (offsetof(struct mdp_superblock_1,data_offset) != 128 || + offsetof(struct mdp_superblock_1, utime) != 192 || + sizeof(struct mdp_superblock_1) != 256) { + fprintf(stderr, "WARNING - superblock isn't sized correctly\n"); + } + + disk_csum = sb->sb_csum; + sb->sb_csum = 0; + newcsum = 0; + for (; size>=4; size -= 4 ) { + newcsum += __le32_to_cpu(*isuper); + isuper++; + } + + if (size == 2) + newcsum += __le16_to_cpu(*(unsigned short*) isuper); + + csum = (newcsum & 0xffffffff) + (newcsum >> 32); + sb->sb_csum = disk_csum; + return __cpu_to_le32(csum); +} + +/* + * Information related to file descriptor used for aligned reads/writes. + * Cache the block size. + */ +struct align_fd { + int fd; + int blk_sz; +}; + +static void init_afd(struct align_fd *afd, int fd) +{ + afd->fd = fd; + + if (ioctl(afd->fd, BLKSSZGET, &afd->blk_sz) != 0) + afd->blk_sz = 512; +} + +static char abuf[4096+4096]; +static int aread(struct align_fd *afd, void *buf, int len) +{ + /* aligned read. + * On devices with a 4K sector size, we need to read + * the full sector and copy relevant bits into + * the buffer + */ + int bsize, iosize; + char *b; + int n; + + bsize = afd->blk_sz; + + if (!bsize || bsize > 4096 || len > 4096) { + if (!bsize) + fprintf(stderr, "WARNING - aread() called with invalid block size\n"); + return -1; + } + b = ROUND_UP_PTR((char *)abuf, 4096); + + for (iosize = 0; iosize < len; iosize += bsize) + ; + n = read(afd->fd, b, iosize); + if (n <= 0) + return n; + lseek(afd->fd, len - n, 1); + if (n > len) + n = len; + memcpy(buf, b, n); + return n; +} + +static int awrite(struct align_fd *afd, void *buf, int len) +{ + /* aligned write. + * On devices with a 4K sector size, we need to write + * the full sector. We pre-read if the sector is larger + * than the write. + * The address must be sector-aligned. + */ + int bsize, iosize; + char *b; + int n; + + bsize = afd->blk_sz; + if (!bsize || bsize > 4096 || len > 4096) { + if (!bsize) + fprintf(stderr, "WARNING - awrite() called with invalid block size\n"); + return -1; + } + b = ROUND_UP_PTR((char *)abuf, 4096); + + for (iosize = 0; iosize < len ; iosize += bsize) + ; + + if (len != iosize) { + n = read(afd->fd, b, iosize); + if (n <= 0) + return n; + lseek(afd->fd, -n, 1); + } + + memcpy(b, buf, len); + n = write(afd->fd, b, iosize); + if (n <= 0) + return n; + lseek(afd->fd, len - n, 1); + return len; +} + +#ifndef MDASSEMBLE +static void examine_super1(struct supertype *st, char *homehost) +{ + struct mdp_superblock_1 *sb = st->sb; + time_t atime; + unsigned int d; + int role; + int delta_extra = 0; + int i; + char *c; + int l = homehost ? strlen(homehost) : 0; + int layout; + unsigned long long sb_offset; + struct mdinfo info; + + printf(" Magic : %08x\n", __le32_to_cpu(sb->magic)); + printf(" Version : 1"); + sb_offset = __le64_to_cpu(sb->super_offset); + if (sb_offset <= 4) + printf(".1\n"); + else if (sb_offset <= 8) + printf(".2\n"); + else + printf(".0\n"); + printf(" Feature Map : 0x%x\n", __le32_to_cpu(sb->feature_map)); + printf(" Array UUID : "); + for (i=0; i<16; i++) { + if ((i&3)==0 && i != 0) printf(":"); + printf("%02x", sb->set_uuid[i]); + } + printf("\n"); + printf(" Name : %.32s", sb->set_name); + if (l > 0 && l < 32 && + sb->set_name[l] == ':' && + strncmp(sb->set_name, homehost, l) == 0) + printf(" (local to host %s)", homehost); + printf("\n"); + atime = __le64_to_cpu(sb->ctime) & 0xFFFFFFFFFFULL; + printf(" Creation Time : %.24s\n", ctime(&atime)); + c=map_num(pers, __le32_to_cpu(sb->level)); + printf(" Raid Level : %s\n", c?c:"-unknown-"); + printf(" Raid Devices : %d\n", __le32_to_cpu(sb->raid_disks)); + printf("\n"); + printf(" Avail Dev Size : %llu%s\n", + (unsigned long long)__le64_to_cpu(sb->data_size), + human_size(__le64_to_cpu(sb->data_size)<<9)); + if (__le32_to_cpu(sb->level) > 0) { + int ddsks = 0, ddsks_denom = 1; + switch(__le32_to_cpu(sb->level)) { + case 1: ddsks=1;break; + case 4: + case 5: ddsks = __le32_to_cpu(sb->raid_disks)-1; break; + case 6: ddsks = __le32_to_cpu(sb->raid_disks)-2; break; + case 10: + layout = __le32_to_cpu(sb->layout); + ddsks = __le32_to_cpu(sb->raid_disks); + ddsks_denom = (layout&255) * ((layout>>8)&255); + } + if (ddsks) { + long long asize = __le64_to_cpu(sb->size); + asize = (asize << 9) * ddsks / ddsks_denom; + printf(" Array Size : %llu%s\n", + asize >> 10, human_size(asize)); + } + if (sb->size != sb->data_size) + printf(" Used Dev Size : %llu%s\n", + (unsigned long long)__le64_to_cpu(sb->size), + human_size(__le64_to_cpu(sb->size)<<9)); + } + if (sb->data_offset) + printf(" Data Offset : %llu sectors\n", + (unsigned long long)__le64_to_cpu(sb->data_offset)); + if (sb->new_offset && + (__le32_to_cpu(sb->feature_map) & MD_FEATURE_NEW_OFFSET)) { + unsigned long long offset = __le64_to_cpu(sb->data_offset); + offset += (signed)(int32_t)__le32_to_cpu(sb->new_offset); + printf(" New Offset : %llu sectors\n", offset); + } + printf(" Super Offset : %llu sectors\n", + (unsigned long long)__le64_to_cpu(sb->super_offset)); + if (__le32_to_cpu(sb->feature_map) & MD_FEATURE_RECOVERY_OFFSET) + printf("Recovery Offset : %llu sectors\n", (unsigned long long)__le64_to_cpu(sb->recovery_offset)); + + st->ss->getinfo_super(st, &info, NULL); + if (info.space_after != 1 && + !(__le32_to_cpu(sb->feature_map) & MD_FEATURE_NEW_OFFSET)) + printf(" Unused Space : before=%llu sectors, after=%llu sectors\n", + info.space_before, info.space_after); + + printf(" State : %s\n", (__le64_to_cpu(sb->resync_offset)+1)? "active":"clean"); + printf(" Device UUID : "); + for (i=0; i<16; i++) { + if ((i&3)==0 && i != 0) printf(":"); + printf("%02x", sb->device_uuid[i]); + } + printf("\n"); + printf("\n"); + if (sb->feature_map & __cpu_to_le32(MD_FEATURE_BITMAP_OFFSET)) { + printf("Internal Bitmap : %ld sectors from superblock\n", + (long)(int32_t)__le32_to_cpu(sb->bitmap_offset)); + } + if (sb->feature_map & __cpu_to_le32(MD_FEATURE_RESHAPE_ACTIVE)) { + printf(" Reshape pos'n : %llu%s\n", (unsigned long long)__le64_to_cpu(sb->reshape_position)/2, + human_size(__le64_to_cpu(sb->reshape_position)<<9)); + if (__le32_to_cpu(sb->delta_disks)) { + printf(" Delta Devices : %d", __le32_to_cpu(sb->delta_disks)); + printf(" (%d->%d)\n", + __le32_to_cpu(sb->raid_disks)-__le32_to_cpu(sb->delta_disks), + __le32_to_cpu(sb->raid_disks)); + if ((int)__le32_to_cpu(sb->delta_disks) < 0) + delta_extra = -__le32_to_cpu(sb->delta_disks); + } + if (__le32_to_cpu(sb->new_level) != __le32_to_cpu(sb->level)) { + c = map_num(pers, __le32_to_cpu(sb->new_level)); + printf(" New Level : %s\n", c?c:"-unknown-"); + } + if (__le32_to_cpu(sb->new_layout) != __le32_to_cpu(sb->layout)) { + if (__le32_to_cpu(sb->level) == 5) { + c = map_num(r5layout, __le32_to_cpu(sb->new_layout)); + printf(" New Layout : %s\n", c?c:"-unknown-"); + } + if (__le32_to_cpu(sb->level) == 6) { + c = map_num(r6layout, __le32_to_cpu(sb->new_layout)); + printf(" New Layout : %s\n", c?c:"-unknown-"); + } + if (__le32_to_cpu(sb->level) == 10) { + printf(" New Layout :"); + print_r10_layout(__le32_to_cpu(sb->new_layout)); + printf("\n"); + } + } + if (__le32_to_cpu(sb->new_chunk) != __le32_to_cpu(sb->chunksize)) + printf(" New Chunksize : %dK\n", __le32_to_cpu(sb->new_chunk)/2); + printf("\n"); + } + if (sb->devflags) { + printf(" Flags :"); + if (sb->devflags & WriteMostly1) + printf(" write-mostly"); + printf("\n"); + } + + atime = __le64_to_cpu(sb->utime) & 0xFFFFFFFFFFULL; + printf(" Update Time : %.24s\n", ctime(&atime)); + + if (sb->bblog_size && sb->bblog_offset) { + printf(" Bad Block Log : %d entries available at offset %ld sectors", + __le16_to_cpu(sb->bblog_size)*512/8, + (long)(int32_t)__le32_to_cpu(sb->bblog_offset)); + if (sb->feature_map & + __cpu_to_le32(MD_FEATURE_BAD_BLOCKS)) + printf(" - bad blocks present."); + printf("\n"); + } + + if (calc_sb_1_csum(sb) == sb->sb_csum) + printf(" Checksum : %x - correct\n", __le32_to_cpu(sb->sb_csum)); + else + printf(" Checksum : %x - expected %x\n", __le32_to_cpu(sb->sb_csum), + __le32_to_cpu(calc_sb_1_csum(sb))); + printf(" Events : %llu\n", (unsigned long long)__le64_to_cpu(sb->events)); + printf("\n"); + if (__le32_to_cpu(sb->level) == 5) { + c = map_num(r5layout, __le32_to_cpu(sb->layout)); + printf(" Layout : %s\n", c?c:"-unknown-"); + } + if (__le32_to_cpu(sb->level) == 6) { + c = map_num(r6layout, __le32_to_cpu(sb->layout)); + printf(" Layout : %s\n", c?c:"-unknown-"); + } + if (__le32_to_cpu(sb->level) == 10) { + int lo = __le32_to_cpu(sb->layout); + printf(" Layout :"); + print_r10_layout(lo); + printf("\n"); + } + switch(__le32_to_cpu(sb->level)) { + case 0: + case 4: + case 5: + case 6: + case 10: + printf(" Chunk Size : %dK\n", __le32_to_cpu(sb->chunksize)/2); + break; + case -1: + printf(" Rounding : %dK\n", __le32_to_cpu(sb->chunksize)/2); + break; + default: break; + } + printf("\n"); +#if 0 + /* This turns out to just be confusing */ + printf(" Array Slot : %d (", __le32_to_cpu(sb->dev_number)); + for (i= __le32_to_cpu(sb->max_dev); i> 0 ; i--) + if (__le16_to_cpu(sb->dev_roles[i-1]) != 0xffff) + break; + for (d=0; d < i; d++) { + int role = __le16_to_cpu(sb->dev_roles[d]); + if (d) printf(", "); + if (role == 0xffff) printf("empty"); + else if(role == 0xfffe) printf("failed"); + else printf("%d", role); + } + printf(")\n"); +#endif + printf(" Device Role : "); + d = __le32_to_cpu(sb->dev_number); + if (d < __le32_to_cpu(sb->max_dev)) + role = __le16_to_cpu(sb->dev_roles[d]); + else + role = 0xFFFF; + if (role >= 0xFFFE) + printf("spare\n"); + else if (sb->feature_map & __cpu_to_le32(MD_FEATURE_REPLACEMENT)) + printf("Replacement device %d\n", role); + else + printf("Active device %d\n", role); + + printf(" Array State : "); + for (d=0; d<__le32_to_cpu(sb->raid_disks) + delta_extra; d++) { + int cnt = 0; + unsigned int i; + for (i=0; i< __le32_to_cpu(sb->max_dev); i++) { + unsigned int role = __le16_to_cpu(sb->dev_roles[i]); + if (role == d) + cnt++; + } + if (cnt == 2) + printf("R"); + else if (cnt == 1) + printf("A"); + else if (cnt == 0) + printf("."); + else + printf("?"); + } +#if 0 + /* This is confusing too */ + faulty = 0; + for (i=0; i< __le32_to_cpu(sb->max_dev); i++) { + int role = __le16_to_cpu(sb->dev_roles[i]); + if (role == 0xFFFE) + faulty++; + } + if (faulty) printf(" %d failed", faulty); +#endif + printf(" ('A' == active, '.' == missing, 'R' == replacing)"); + printf("\n"); +} + +static void brief_examine_super1(struct supertype *st, int verbose) +{ + struct mdp_superblock_1 *sb = st->sb; + int i; + unsigned long long sb_offset; + char *nm; + char *c=map_num(pers, __le32_to_cpu(sb->level)); + + nm = strchr(sb->set_name, ':'); + if (nm) + nm++; + else if (sb->set_name[0]) + nm = sb->set_name; + else + nm = NULL; + + printf("ARRAY "); + if (nm) { + printf("/dev/md/"); + print_escape(nm); + putchar(' '); + } + if (verbose && c) + printf(" level=%s", c); + sb_offset = __le64_to_cpu(sb->super_offset); + if (sb_offset <= 4) + printf(" metadata=1.1 "); + else if (sb_offset <= 8) + printf(" metadata=1.2 "); + else + printf(" metadata=1.0 "); + if (verbose) + printf("num-devices=%d ", __le32_to_cpu(sb->raid_disks)); + printf("UUID="); + for (i=0; i<16; i++) { + if ((i&3)==0 && i != 0) printf(":"); + printf("%02x", sb->set_uuid[i]); + } + if (sb->set_name[0]) { + printf(" name="); + print_quoted(sb->set_name); + } + printf("\n"); +} + +static void export_examine_super1(struct supertype *st) +{ + struct mdp_superblock_1 *sb = st->sb; + int i; + int len = 32; + int layout; + + printf("MD_LEVEL=%s\n", map_num(pers, __le32_to_cpu(sb->level))); + printf("MD_DEVICES=%d\n", __le32_to_cpu(sb->raid_disks)); + for (i=0; i<32; i++) + if (sb->set_name[i] == '\n' || + sb->set_name[i] == '\0') { + len = i; + break; + } + if (len) + printf("MD_NAME=%.*s\n", len, sb->set_name); + if (__le32_to_cpu(sb->level) > 0) { + int ddsks = 0, ddsks_denom = 1; + switch(__le32_to_cpu(sb->level)) { + case 1: ddsks=1;break; + case 4: + case 5: ddsks = __le32_to_cpu(sb->raid_disks)-1; break; + case 6: ddsks = __le32_to_cpu(sb->raid_disks)-2; break; + case 10: + layout = __le32_to_cpu(sb->layout); + ddsks = __le32_to_cpu(sb->raid_disks); + ddsks_denom = (layout&255) * ((layout>>8)&255); + } + if (ddsks) { + long long asize = __le64_to_cpu(sb->size); + asize = (asize << 9) * ddsks / ddsks_denom; + printf("MD_ARRAY_SIZE=%s\n",human_size_brief(asize,JEDEC)); + } + } + printf("MD_UUID="); + for (i=0; i<16; i++) { + if ((i&3)==0 && i != 0) printf(":"); + printf("%02x", sb->set_uuid[i]); + } + printf("\n"); + printf("MD_UPDATE_TIME=%llu\n", + __le64_to_cpu(sb->utime) & 0xFFFFFFFFFFULL); + printf("MD_DEV_UUID="); + for (i=0; i<16; i++) { + if ((i&3)==0 && i != 0) printf(":"); + printf("%02x", sb->device_uuid[i]); + } + printf("\n"); + printf("MD_EVENTS=%llu\n", + (unsigned long long)__le64_to_cpu(sb->events)); +} + +static int copy_metadata1(struct supertype *st, int from, int to) +{ + /* Read superblock. If it looks good, write it out. + * Then if a bitmap is present, copy that. + * And if a bad-block-list is present, copy that too. + */ + void *buf; + unsigned long long dsize, sb_offset; + const int bufsize = 4*1024; + struct mdp_superblock_1 super, *sb; + + if (posix_memalign(&buf, 4096, bufsize) != 0) + return 1; + + if (!get_dev_size(from, NULL, &dsize)) + goto err; + + dsize >>= 9; + if (dsize < 24) + goto err; + switch(st->minor_version) { + case 0: + sb_offset = dsize; + sb_offset -= 8*2; + sb_offset &= ~(4*2-1); + break; + case 1: + sb_offset = 0; + break; + case 2: + sb_offset = 4*2; + break; + default: + goto err; + } + + if (lseek64(from, sb_offset << 9, 0) < 0LL) + goto err; + if (read(from, buf, bufsize) != bufsize) + goto err; + + sb = buf; + super = *sb; // save most of sb for when we reuse buf + + if (__le32_to_cpu(super.magic) != MD_SB_MAGIC || + __le32_to_cpu(super.major_version) != 1 || + __le64_to_cpu(super.super_offset) != sb_offset || + calc_sb_1_csum(sb) != super.sb_csum) + goto err; + + if (lseek64(to, sb_offset << 9, 0) < 0LL) + goto err; + if (write(to, buf, bufsize) != bufsize) + goto err; + + if (super.feature_map & __le32_to_cpu(MD_FEATURE_BITMAP_OFFSET)) { + unsigned long long bitmap_offset = sb_offset; + int bytes = 4096; // just an estimate. + int written = 0; + struct align_fd afrom, ato; + + init_afd(&afrom, from); + init_afd(&ato, to); + + bitmap_offset += (int32_t)__le32_to_cpu(super.bitmap_offset); + + if (lseek64(from, bitmap_offset<<9, 0) < 0) + goto err; + if (lseek64(to, bitmap_offset<<9, 0) < 0) + goto err; + + for (written = 0; written < bytes ; ) { + int n = bytes - written; + if (n > 4096) + n = 4096; + if (aread(&afrom, buf, n) != n) + goto err; + if (written == 0) { + /* have the header, can calculate + * correct bitmap bytes */ + bitmap_super_t *bms; + int bits; + bms = (void*)buf; + bits = __le64_to_cpu(bms->sync_size) / (__le32_to_cpu(bms->chunksize)>>9); + bytes = (bits+7) >> 3; + bytes += sizeof(bitmap_super_t); + bytes = ROUND_UP(bytes, 512); + if (n > bytes) + n = bytes; + } + if (awrite(&ato, buf, n) != n) + goto err; + written += n; + } + } + + if (super.bblog_size != 0 && + __le32_to_cpu(super.bblog_size) <= 100 && + super.bblog_offset != 0 && + (super.feature_map & __le32_to_cpu(MD_FEATURE_BAD_BLOCKS))) { + /* There is a bad block log */ + unsigned long long bb_offset = sb_offset; + int bytes = __le32_to_cpu(super.bblog_size) * 512; + int written = 0; + struct align_fd afrom, ato; + + init_afd(&afrom, from); + init_afd(&ato, to); + + bb_offset += (int32_t)__le32_to_cpu(super.bblog_offset); + + if (lseek64(from, bb_offset<<9, 0) < 0) + goto err; + if (lseek64(to, bb_offset<<9, 0) < 0) + goto err; + + for (written = 0; written < bytes ; ) { + int n = bytes - written; + if (n > 4096) + n = 4096; + if (aread(&afrom, buf, n) != n) + goto err; + + if (awrite(&ato, buf, n) != n) + goto err; + written += n; + } + } + + free(buf); + return 0; + +err: + free(buf); + return 1; +} + +static void detail_super1(struct supertype *st, char *homehost) +{ + struct mdp_superblock_1 *sb = st->sb; + int i; + int l = homehost ? strlen(homehost) : 0; + + printf(" Name : %.32s", sb->set_name); + if (l > 0 && l < 32 && + sb->set_name[l] == ':' && + strncmp(sb->set_name, homehost, l) == 0) + printf(" (local to host %s)", homehost); + printf("\n UUID : "); + for (i=0; i<16; i++) { + if ((i&3)==0 && i != 0) printf(":"); + printf("%02x", sb->set_uuid[i]); + } + printf("\n Events : %llu\n\n", (unsigned long long)__le64_to_cpu(sb->events)); +} + +static void brief_detail_super1(struct supertype *st) +{ + struct mdp_superblock_1 *sb = st->sb; + int i; + + if (sb->set_name[0]) { + printf(" name="); + print_quoted(sb->set_name); + } + printf(" UUID="); + for (i=0; i<16; i++) { + if ((i&3)==0 && i != 0) printf(":"); + printf("%02x", sb->set_uuid[i]); + } +} + +static void export_detail_super1(struct supertype *st) +{ + struct mdp_superblock_1 *sb = st->sb; + int i; + int len = 32; + + for (i=0; i<32; i++) + if (sb->set_name[i] == '\n' || + sb->set_name[i] == '\0') { + len = i; + break; + } + if (len) + printf("MD_NAME=%.*s\n", len, sb->set_name); +} + +static int examine_badblocks_super1(struct supertype *st, int fd, char *devname) +{ + struct mdp_superblock_1 *sb = st->sb; + unsigned long long offset; + int size; + __u64 *bbl, *bbp; + int i; + + if (!sb->bblog_size || __le32_to_cpu(sb->bblog_size) > 100 + || !sb->bblog_offset){ + printf("No bad-blocks list configured on %s\n", devname); + return 0; + } + if ((sb->feature_map & __cpu_to_le32(MD_FEATURE_BAD_BLOCKS)) + == 0) { + printf("Bad-blocks list is empty in %s\n", devname); + return 0; + } + + size = __le32_to_cpu(sb->bblog_size)* 512; + if (posix_memalign((void**)&bbl, 4096, size) != 0) { + pr_err("could not allocate badblocks list\n"); + return 0; + } + offset = __le64_to_cpu(sb->super_offset) + + (int)__le32_to_cpu(sb->bblog_offset); + offset <<= 9; + if (lseek64(fd, offset, 0) < 0) { + pr_err("Cannot seek to bad-blocks list\n"); + return 1; + } + if (read(fd, bbl, size) != size) { + pr_err("Cannot read bad-blocks list\n"); + return 1; + } + /* 64bits per entry. 10 bits is block-count, 54 bits is block + * offset. Blocks are sectors unless bblog->shift makes them bigger + */ + bbp = (__u64*)bbl; + printf("Bad-blocks on %s:\n", devname); + for (i = 0; i < size/8; i++, bbp++) { + __u64 bb = __le64_to_cpu(*bbp); + int count = bb & 0x3ff; + unsigned long long sector = bb >> 10; + + if (bb + 1 == 0) + break; + + sector <<= sb->bblog_shift; + count <<= sb->bblog_shift; + + printf("%20llu for %d sectors\n", sector, count); + } + return 0; +} + +#endif + +static int match_home1(struct supertype *st, char *homehost) +{ + struct mdp_superblock_1 *sb = st->sb; + int l = homehost ? strlen(homehost) : 0; + + return (l > 0 && l < 32 && + sb->set_name[l] == ':' && + strncmp(sb->set_name, homehost, l) == 0); +} + +static void uuid_from_super1(struct supertype *st, int uuid[4]) +{ + struct mdp_superblock_1 *super = st->sb; + char *cuuid = (char*)uuid; + int i; + for (i=0; i<16; i++) + cuuid[i] = super->set_uuid[i]; +} + +static void getinfo_super1(struct supertype *st, struct mdinfo *info, char *map) +{ + struct mdp_superblock_1 *sb = st->sb; + struct bitmap_super_s *bsb = (void*)(((char*)sb)+MAX_SB_SIZE); + struct misc_dev_info *misc = (void*)(((char*)sb)+MAX_SB_SIZE+BM_SUPER_SIZE); + int working = 0; + unsigned int i; + unsigned int role; + unsigned int map_disks = info->array.raid_disks; + unsigned long long super_offset; + unsigned long long data_size; + + memset(info, 0, sizeof(*info)); + info->array.major_version = 1; + info->array.minor_version = st->minor_version; + info->array.patch_version = 0; + info->array.raid_disks = __le32_to_cpu(sb->raid_disks); + info->array.level = __le32_to_cpu(sb->level); + info->array.layout = __le32_to_cpu(sb->layout); + info->array.md_minor = -1; + info->array.ctime = __le64_to_cpu(sb->ctime); + info->array.utime = __le64_to_cpu(sb->utime); + info->array.chunk_size = __le32_to_cpu(sb->chunksize)*512; + info->array.state = + (__le64_to_cpu(sb->resync_offset) == MaxSector) + ? 1 : 0; + + info->data_offset = __le64_to_cpu(sb->data_offset); + info->component_size = __le64_to_cpu(sb->size); + if (sb->feature_map & __le32_to_cpu(MD_FEATURE_BITMAP_OFFSET)) + info->bitmap_offset = (int32_t)__le32_to_cpu(sb->bitmap_offset); + + info->disk.major = 0; + info->disk.minor = 0; + info->disk.number = __le32_to_cpu(sb->dev_number); + if (__le32_to_cpu(sb->dev_number) >= __le32_to_cpu(sb->max_dev) || + __le32_to_cpu(sb->dev_number) >= MAX_DEVS) + role = 0xfffe; + else + role = __le16_to_cpu(sb->dev_roles[__le32_to_cpu(sb->dev_number)]); + + super_offset = __le64_to_cpu(sb->super_offset); + if (info->array.level <= 0) + data_size = __le64_to_cpu(sb->data_size); + else + data_size = __le64_to_cpu(sb->size); + if (info->data_offset < super_offset) { + unsigned long long end; + info->space_before = info->data_offset; + end = super_offset; + + if (sb->bblog_offset && sb->bblog_size) { + unsigned long long bboffset = super_offset; + bboffset += (int32_t)__le32_to_cpu(sb->bblog_offset); + if (bboffset < end) + end = bboffset; + } + + if (super_offset + info->bitmap_offset < end) + end = super_offset + info->bitmap_offset; + + if (info->data_offset + data_size < end) + info->space_after = end - data_size - info->data_offset; + else + info->space_after = 0; + } else { + unsigned long long earliest; + earliest = super_offset + (32+4)*2; /* match kernel */ + if (info->bitmap_offset > 0) { + unsigned long long bmend = info->bitmap_offset; + unsigned long long size = __le64_to_cpu(bsb->sync_size); + size /= __le32_to_cpu(bsb->chunksize) >> 9; + size = (size + 7) >> 3; + size += sizeof(bitmap_super_t); + size = ROUND_UP(size, 4096); + size /= 512; + bmend += size; + if (bmend > earliest) + bmend = earliest; + } + if (sb->bblog_offset && sb->bblog_size) { + unsigned long long bbend = super_offset; + bbend += (int32_t)__le32_to_cpu(sb->bblog_offset); + bbend += __le32_to_cpu(sb->bblog_size); + if (bbend > earliest) + earliest = bbend; + } + if (earliest < info->data_offset) + info->space_before = info->data_offset - earliest; + else + info->space_before = 0; + info->space_after = misc->device_size - data_size - info->data_offset; + } + if (info->space_before == 0 && info->space_after == 0) { + /* It will look like we don't support data_offset changes, + * be we do - it's just that there is no room. + * A change that reduced the number of devices should + * still be allowed, so set the otherwise useless value of '1' + */ + info->space_after = 1; + } + + info->disk.raid_disk = -1; + switch(role) { + case 0xFFFF: + info->disk.state = 0; /* spare: not active, not sync, not faulty */ + break; + case 0xFFFE: + info->disk.state = 1; /* faulty */ + break; + default: + info->disk.state = 6; /* active and in sync */ + info->disk.raid_disk = role; + } + if (sb->devflags & WriteMostly1) + info->disk.state |= (1 << MD_DISK_WRITEMOSTLY); + info->events = __le64_to_cpu(sb->events); + sprintf(info->text_version, "1.%d", st->minor_version); + info->safe_mode_delay = 200; + + memcpy(info->uuid, sb->set_uuid, 16); + + strncpy(info->name, sb->set_name, 32); + info->name[32] = 0; + + if ((__le32_to_cpu(sb->feature_map)&MD_FEATURE_REPLACEMENT)) { + info->disk.state &= ~(1 << MD_DISK_SYNC); + info->disk.state |= 1 << MD_DISK_REPLACEMENT; + } + + if (sb->feature_map & __le32_to_cpu(MD_FEATURE_RECOVERY_OFFSET)) + info->recovery_start = __le32_to_cpu(sb->recovery_offset); + else + info->recovery_start = MaxSector; + + if (sb->feature_map & __le32_to_cpu(MD_FEATURE_RESHAPE_ACTIVE)) { + info->reshape_active = 1; + if ((sb->feature_map & __le32_to_cpu(MD_FEATURE_NEW_OFFSET)) && + sb->new_offset != 0) + info->reshape_active |= RESHAPE_NO_BACKUP; + info->reshape_progress = __le64_to_cpu(sb->reshape_position); + info->new_level = __le32_to_cpu(sb->new_level); + info->delta_disks = __le32_to_cpu(sb->delta_disks); + info->new_layout = __le32_to_cpu(sb->new_layout); + info->new_chunk = __le32_to_cpu(sb->new_chunk)<<9; + if (info->delta_disks < 0) + info->array.raid_disks -= info->delta_disks; + } else + info->reshape_active = 0; + + info->recovery_blocked = info->reshape_active; + + if (map) + for (i=0; i<map_disks; i++) + map[i] = 0; + for (i = 0; i < __le32_to_cpu(sb->max_dev); i++) { + role = __le16_to_cpu(sb->dev_roles[i]); + if (/*role == 0xFFFF || */role < (unsigned) info->array.raid_disks) { + working++; + if (map && role < map_disks) + map[role] = 1; + } + } + + info->array.working_disks = working; +} + +static struct mdinfo *container_content1(struct supertype *st, char *subarray) +{ + struct mdinfo *info; + + if (subarray) + return NULL; + + info = xmalloc(sizeof(*info)); + getinfo_super1(st, info, NULL); + return info; +} + +static int update_super1(struct supertype *st, struct mdinfo *info, + char *update, + char *devname, int verbose, + int uuid_set, char *homehost) +{ + /* NOTE: for 'assemble' and 'force' we need to return non-zero + * if any change was made. For others, the return value is + * ignored. + */ + int rv = 0; + struct mdp_superblock_1 *sb = st->sb; + + if (strcmp(update, "homehost") == 0 && + homehost) { + /* Note that 'homehost' is special as it is really + * a "name" update. + */ + char *c; + update = "name"; + c = strchr(sb->set_name, ':'); + if (c) + strncpy(info->name, c+1, 31 - (c-sb->set_name)); + else + strncpy(info->name, sb->set_name, 32); + info->name[32] = 0; + } + + if (strcmp(update, "force-one")==0) { + /* Not enough devices for a working array, + * so bring this one up-to-date + */ + if (sb->events != __cpu_to_le64(info->events)) + rv = 1; + sb->events = __cpu_to_le64(info->events); + } else if (strcmp(update, "force-array")==0) { + /* Degraded array and 'force' requests to + * maybe need to mark it 'clean'. + */ + switch(__le32_to_cpu(sb->level)) { + case 5: case 4: case 6: + /* need to force clean */ + if (sb->resync_offset != MaxSector) + rv = 1; + sb->resync_offset = MaxSector; + } + } else if (strcmp(update, "assemble")==0) { + int d = info->disk.number; + int want; + if (info->disk.state & (1<<MD_DISK_ACTIVE)) + want = info->disk.raid_disk; + else + want = 0xFFFF; + if (sb->dev_roles[d] != __cpu_to_le16(want)) { + sb->dev_roles[d] = __cpu_to_le16(want); + rv = 1; + } + if (info->reshape_active && + sb->feature_map & __le32_to_cpu(MD_FEATURE_RESHAPE_ACTIVE) && + info->delta_disks >= 0 && + info->reshape_progress < __le64_to_cpu(sb->reshape_position)) { + sb->reshape_position = __cpu_to_le64(info->reshape_progress); + rv = 1; + } + if (info->reshape_active && + sb->feature_map & __le32_to_cpu(MD_FEATURE_RESHAPE_ACTIVE) && + info->delta_disks < 0 && + info->reshape_progress > __le64_to_cpu(sb->reshape_position)) { + sb->reshape_position = __cpu_to_le64(info->reshape_progress); + rv = 1; + } + } else if (strcmp(update, "linear-grow-new") == 0) { + unsigned int i; + int rfd, fd; + unsigned int max = __le32_to_cpu(sb->max_dev); + + for (i=0 ; i < max ; i++) + if (__le16_to_cpu(sb->dev_roles[i]) >= 0xfffe) + break; + sb->dev_number = __cpu_to_le32(i); + info->disk.number = i; + if (max >= __le32_to_cpu(sb->max_dev)) + sb->max_dev = __cpu_to_le32(max+1); + + if ((rfd = open("/dev/urandom", O_RDONLY)) < 0 || + read(rfd, sb->device_uuid, 16) != 16) { + __u32 r[4] = {random(), random(), random(), random()}; + memcpy(sb->device_uuid, r, 16); + } + if (rfd >= 0) + close(rfd); + + sb->dev_roles[i] = + __cpu_to_le16(info->disk.raid_disk); + + fd = open(devname, O_RDONLY); + if (fd >= 0) { + unsigned long long ds; + get_dev_size(fd, devname, &ds); + close(fd); + ds >>= 9; + if (__le64_to_cpu(sb->super_offset) < + __le64_to_cpu(sb->data_offset)) { + sb->data_size = __cpu_to_le64( + ds - __le64_to_cpu(sb->data_offset)); + } else { + ds -= 8*2; + ds &= ~(unsigned long long)(4*2-1); + sb->super_offset = __cpu_to_le64(ds); + sb->data_size = __cpu_to_le64( + ds - __le64_to_cpu(sb->data_offset)); + } + } + } else if (strcmp(update, "linear-grow-update") == 0) { + sb->raid_disks = __cpu_to_le32(info->array.raid_disks); + sb->dev_roles[info->disk.number] = + __cpu_to_le16(info->disk.raid_disk); + } else if (strcmp(update, "resync") == 0) { + /* make sure resync happens */ + sb->resync_offset = 0ULL; + } else if (strcmp(update, "uuid") == 0) { + copy_uuid(sb->set_uuid, info->uuid, super1.swapuuid); + + if (__le32_to_cpu(sb->feature_map)&MD_FEATURE_BITMAP_OFFSET) { + struct bitmap_super_s *bm; + bm = (struct bitmap_super_s*)(st->sb+MAX_SB_SIZE); + memcpy(bm->uuid, sb->set_uuid, 16); + } + } else if (strcmp(update, "no-bitmap") == 0) { + sb->feature_map &= ~__cpu_to_le32(MD_FEATURE_BITMAP_OFFSET); + } else if (strcmp(update, "bbl") == 0) { + /* only possible if there is room after the bitmap, or if + * there is no bitmap + */ + unsigned long long sb_offset = __le64_to_cpu(sb->super_offset); + unsigned long long data_offset = __le64_to_cpu(sb->data_offset); + long bitmap_offset = (long)(int32_t)__le32_to_cpu(sb->bitmap_offset); + long bm_sectors = 0; + long space; + +#ifndef MDASSEMBLE + if (sb->feature_map & __cpu_to_le32(MD_FEATURE_BITMAP_OFFSET)) { + struct bitmap_super_s *bsb; + bsb = (struct bitmap_super_s *)(((char*)sb)+MAX_SB_SIZE); + bm_sectors = bitmap_sectors(bsb); + } +#endif + if (sb_offset < data_offset) { + /* 1.1 or 1.2. Put bbl after bitmap leaving at least 32K + */ + long bb_offset; + bb_offset = sb_offset + 8; + if (bm_sectors && bitmap_offset > 0) + bb_offset = bitmap_offset + bm_sectors; + while (bb_offset < (long)sb_offset + 8 + 32*2 + && bb_offset + 8+8 <= (long)data_offset) + /* too close to bitmap, and room to grow */ + bb_offset += 8; + if (bb_offset + 8 <= (long)data_offset) { + sb->bblog_size = __cpu_to_le16(8); + sb->bblog_offset = __cpu_to_le32(bb_offset); + } + } else { + /* 1.0 - Put bbl just before super block */ + if (bm_sectors && bitmap_offset < 0) + space = -bitmap_offset - bm_sectors; + else + space = sb_offset - data_offset - + __le64_to_cpu(sb->data_size); + if (space >= 8) { + sb->bblog_size = __cpu_to_le16(8); + sb->bblog_offset = __cpu_to_le32((unsigned)-8); + } + } + } else if (strcmp(update, "no-bbl") == 0) { + if (sb->feature_map & __cpu_to_le32(MD_FEATURE_BAD_BLOCKS)) + pr_err("Cannot remove active bbl from %s\n",devname); + else { + sb->bblog_size = 0; + sb->bblog_shift = 0; + sb->bblog_offset = 0; + } + } else if (strcmp(update, "name") == 0) { + if (info->name[0] == 0) + sprintf(info->name, "%d", info->array.md_minor); + memset(sb->set_name, 0, sizeof(sb->set_name)); + if (homehost && + strchr(info->name, ':') == NULL && + strlen(homehost)+1+strlen(info->name) < 32) { + strcpy(sb->set_name, homehost); + strcat(sb->set_name, ":"); + strcat(sb->set_name, info->name); + } else + strcpy(sb->set_name, info->name); + } else if (strcmp(update, "devicesize") == 0 && + __le64_to_cpu(sb->super_offset) < + __le64_to_cpu(sb->data_offset)) { + /* set data_size to device size less data_offset */ + struct misc_dev_info *misc = (struct misc_dev_info*) + (st->sb + MAX_SB_SIZE + BM_SUPER_SIZE); + sb->data_size = __cpu_to_le64( + misc->device_size - __le64_to_cpu(sb->data_offset)); + } else if (strcmp(update, "revert-reshape") == 0) { + rv = -2; + if (!(sb->feature_map & __cpu_to_le32(MD_FEATURE_RESHAPE_ACTIVE))) + pr_err("No active reshape to revert on %s\n", + devname); + else { + __u32 temp; + unsigned long long reshape_sectors; + long reshape_chunk; + rv = 0; + /* reshape_position is a little messy. + * Its value must be a multiple of the larger + * chunk size, and of the "after" data disks. + * So when reverting we need to change it to + * be a multiple of the new "after" data disks, + * which is the old "before". + * If it isn't already a multiple of 'before', + * the only thing we could do would be + * copy some block around on the disks, which + * is easy to get wrong. + * So we reject a revert-reshape unless the + * alignment is good. + */ + if (__le32_to_cpu(sb->level) >= 4 && + __le32_to_cpu(sb->level) <= 6) { + reshape_sectors = __le64_to_cpu(sb->reshape_position); + reshape_chunk = __le32_to_cpu(sb->new_chunk); + reshape_chunk *= __le32_to_cpu(sb->raid_disks) - __le32_to_cpu(sb->delta_disks) - + (__le32_to_cpu(sb->level)==6 ? 2 : 1); + if (reshape_sectors % reshape_chunk) { + pr_err("Reshape position is not suitably aligned.\n"); + pr_err("Try normal assembly and stop again\n"); + return -2; + } + } + sb->raid_disks = __cpu_to_le32(__le32_to_cpu(sb->raid_disks) - + __le32_to_cpu(sb->delta_disks)); + if (sb->delta_disks == 0) + sb->feature_map ^= __cpu_to_le32(MD_FEATURE_RESHAPE_BACKWARDS); + else + sb->delta_disks = __cpu_to_le32(-__le32_to_cpu(sb->delta_disks)); + + temp = sb->new_layout; + sb->new_layout = sb->layout; + sb->layout = temp; + + temp = sb->new_chunk; + sb->new_chunk = sb->chunksize; + sb->chunksize = temp; + + if (sb->feature_map & __cpu_to_le32(MD_FEATURE_NEW_OFFSET)) { + long offset_delta = (int32_t)__le32_to_cpu(sb->new_offset); + sb->data_offset = __cpu_to_le64(__le64_to_cpu(sb->data_offset) + offset_delta); + sb->new_offset = __cpu_to_le32(-offset_delta); + sb->data_size = __cpu_to_le64(__le64_to_cpu(sb->data_size) - offset_delta); + } + } + } else if (strcmp(update, "_reshape_progress")==0) + sb->reshape_position = __cpu_to_le64(info->reshape_progress); + else if (strcmp(update, "writemostly")==0) + sb->devflags |= WriteMostly1; + else if (strcmp(update, "readwrite")==0) + sb->devflags &= ~WriteMostly1; + else + rv = -1; + + sb->sb_csum = calc_sb_1_csum(sb); + return rv; +} + +static int init_super1(struct supertype *st, mdu_array_info_t *info, + unsigned long long size, char *name, char *homehost, + int *uuid, unsigned long long data_offset) +{ + struct mdp_superblock_1 *sb; + int spares; + int rfd; + char defname[10]; + int sbsize; + + if (posix_memalign((void**)&sb, 4096, SUPER1_SIZE) != 0) { + pr_err("could not allocate superblock\n"); + return 0; + } + memset(sb, 0, SUPER1_SIZE); + + st->sb = sb; + if (info == NULL) { + /* zeroing superblock */ + return 0; + } + + spares = info->working_disks - info->active_disks; + if (info->raid_disks + spares > MAX_DEVS) { + pr_err("too many devices requested: %d+%d > %d\n", + info->raid_disks , spares, MAX_DEVS); + return 0; + } + + sb->magic = __cpu_to_le32(MD_SB_MAGIC); + sb->major_version = __cpu_to_le32(1); + sb->feature_map = 0; + sb->pad0 = 0; + + if (uuid) + copy_uuid(sb->set_uuid, uuid, super1.swapuuid); + else { + if ((rfd = open("/dev/urandom", O_RDONLY)) < 0 || + read(rfd, sb->set_uuid, 16) != 16) { + __u32 r[4] = {random(), random(), random(), random()}; + memcpy(sb->set_uuid, r, 16); + } + if (rfd >= 0) close(rfd); + } + + if (name == NULL || *name == 0) { + sprintf(defname, "%d", info->md_minor); + name = defname; + } + if (homehost && + strchr(name, ':')== NULL && + strlen(homehost)+1+strlen(name) < 32) { + strcpy(sb->set_name, homehost); + strcat(sb->set_name, ":"); + strcat(sb->set_name, name); + } else + strcpy(sb->set_name, name); + + sb->ctime = __cpu_to_le64((unsigned long long)time(0)); + sb->level = __cpu_to_le32(info->level); + sb->layout = __cpu_to_le32(info->layout); + sb->size = __cpu_to_le64(size*2ULL); + sb->chunksize = __cpu_to_le32(info->chunk_size>>9); + sb->raid_disks = __cpu_to_le32(info->raid_disks); + + sb->data_offset = __cpu_to_le64(data_offset); + sb->data_size = __cpu_to_le64(0); + sb->super_offset = __cpu_to_le64(0); + sb->recovery_offset = __cpu_to_le64(0); + + sb->utime = sb->ctime; + sb->events = __cpu_to_le64(1); + if (info->state & (1<<MD_SB_CLEAN)) + sb->resync_offset = MaxSector; + else + sb->resync_offset = 0; + sbsize = sizeof(struct mdp_superblock_1) + 2 * (info->raid_disks + spares); + sbsize = ROUND_UP(sbsize, 512); + sb->max_dev = __cpu_to_le32((sbsize - sizeof(struct mdp_superblock_1)) / 2); + + memset(sb->dev_roles, 0xff, MAX_SB_SIZE - sizeof(struct mdp_superblock_1)); + + return 1; +} + +struct devinfo { + int fd; + char *devname; + long long data_offset; + mdu_disk_info_t disk; + struct devinfo *next; +}; +#ifndef MDASSEMBLE +/* Add a device to the superblock being created */ +static int add_to_super1(struct supertype *st, mdu_disk_info_t *dk, + int fd, char *devname, unsigned long long data_offset) +{ + struct mdp_superblock_1 *sb = st->sb; + __u16 *rp = sb->dev_roles + dk->number; + struct devinfo *di, **dip; + + if ((dk->state & 6) == 6) /* active, sync */ + *rp = __cpu_to_le16(dk->raid_disk); + else if ((dk->state & ~2) == 0) /* active or idle -> spare */ + *rp = 0xffff; + else + *rp = 0xfffe; + + if (dk->number >= (int)__le32_to_cpu(sb->max_dev) && + __le32_to_cpu(sb->max_dev) < MAX_DEVS) + sb->max_dev = __cpu_to_le32(dk->number+1); + + sb->dev_number = __cpu_to_le32(dk->number); + sb->devflags = 0; /* don't copy another disks flags */ + sb->sb_csum = calc_sb_1_csum(sb); + + dip = (struct devinfo **)&st->info; + while (*dip) + dip = &(*dip)->next; + di = xmalloc(sizeof(struct devinfo)); + di->fd = fd; + di->devname = devname; + di->disk = *dk; + di->data_offset = data_offset; + di->next = NULL; + *dip = di; + + return 0; +} +#endif + +static void locate_bitmap1(struct supertype *st, int fd); + +static int store_super1(struct supertype *st, int fd) +{ + struct mdp_superblock_1 *sb = st->sb; + unsigned long long sb_offset; + struct align_fd afd; + int sbsize; + unsigned long long dsize; + + if (!get_dev_size(fd, NULL, &dsize)) + return 1; + + dsize >>= 9; + + if (dsize < 24) + return 2; + + init_afd(&afd, fd); + + /* + * Calculate the position of the superblock. + * It is always aligned to a 4K boundary and + * depending on minor_version, it can be: + * 0: At least 8K, but less than 12K, from end of device + * 1: At start of device + * 2: 4K from start of device. + */ + switch(st->minor_version) { + case 0: + sb_offset = dsize; + sb_offset -= 8*2; + sb_offset &= ~(4*2-1); + break; + case 1: + sb_offset = 0; + break; + case 2: + sb_offset = 4*2; + break; + default: + return -EINVAL; + } + + if (sb_offset != __le64_to_cpu(sb->super_offset) && + 0 != __le64_to_cpu(sb->super_offset) + ) { + pr_err("internal error - sb_offset is wrong\n"); + abort(); + } + + if (lseek64(fd, sb_offset << 9, 0)< 0LL) + return 3; + + sbsize = ROUND_UP(sizeof(*sb) + 2 * __le32_to_cpu(sb->max_dev), 512); + + if (awrite(&afd, sb, sbsize) != sbsize) + return 4; + + if (sb->feature_map & __cpu_to_le32(MD_FEATURE_BITMAP_OFFSET)) { + struct bitmap_super_s *bm = (struct bitmap_super_s*) + (((char*)sb)+MAX_SB_SIZE); + if (__le32_to_cpu(bm->magic) == BITMAP_MAGIC) { + locate_bitmap1(st, fd); + if (awrite(&afd, bm, sizeof(*bm)) != sizeof(*bm)) + return 5; + } + } + fsync(fd); + return 0; +} + +static int load_super1(struct supertype *st, int fd, char *devname); + +static unsigned long choose_bm_space(unsigned long devsize) +{ + /* if the device is bigger than 8Gig, save 64k for bitmap usage, + * if bigger than 200Gig, save 128k + * NOTE: result must be multiple of 4K else bad things happen + * on 4K-sector devices. + */ + if (devsize < 64*2) return 0; + if (devsize - 64*2 >= 200*1024*1024*2) + return 128*2; + if (devsize - 4*2 > 8*1024*1024*2) + return 64*2; + return 4*2; +} + +static void free_super1(struct supertype *st); + +#ifndef MDASSEMBLE +static int write_init_super1(struct supertype *st) +{ + struct mdp_superblock_1 *sb = st->sb; + struct supertype *refst; + int rfd; + int rv = 0; + unsigned long long bm_space; + struct devinfo *di; + unsigned long long dsize, array_size; + unsigned long long sb_offset; + unsigned long long data_offset; + + for (di = st->info; di; di = di->next) { + if (di->disk.state & (1 << MD_DISK_FAULTY)) + continue; + if (di->fd < 0) + continue; + + while (Kill(di->devname, NULL, 0, -1, 1) == 0) + ; + + sb->dev_number = __cpu_to_le32(di->disk.number); + if (di->disk.state & (1<<MD_DISK_WRITEMOSTLY)) + sb->devflags |= WriteMostly1; + else + sb->devflags &= ~WriteMostly1; + + if ((rfd = open("/dev/urandom", O_RDONLY)) < 0 || + read(rfd, sb->device_uuid, 16) != 16) { + __u32 r[4] = {random(), random(), random(), random()}; + memcpy(sb->device_uuid, r, 16); + } + if (rfd >= 0) + close(rfd); + + sb->events = 0; + + refst = dup_super(st); + if (load_super1(refst, di->fd, NULL)==0) { + struct mdp_superblock_1 *refsb = refst->sb; + + memcpy(sb->device_uuid, refsb->device_uuid, 16); + if (memcmp(sb->set_uuid, refsb->set_uuid, 16)==0) { + /* same array, so preserve events and + * dev_number */ + sb->events = refsb->events; + /* bugs in 2.6.17 and earlier mean the + * dev_number chosen in Manage must be preserved + */ + if (get_linux_version() >= 2006018) + sb->dev_number = refsb->dev_number; + } + free_super1(refst); + } + free(refst); + + if (!get_dev_size(di->fd, NULL, &dsize)) { + rv = 1; + goto error_out; + } + dsize >>= 9; + + if (dsize < 24) { + close(di->fd); + rv = 2; + goto error_out; + } + + /* + * Calculate the position of the superblock. + * It is always aligned to a 4K boundary and + * depending on minor_version, it can be: + * 0: At least 8K, but less than 12K, from end of device + * 1: At start of device + * 2: 4K from start of device. + * data_offset has already been set. + */ + array_size = __le64_to_cpu(sb->size); + /* work out how much space we left for a bitmap, + * Add 8 sectors for bad block log */ + bm_space = choose_bm_space(array_size) + 8; + + data_offset = di->data_offset; + if (data_offset == INVALID_SECTORS) + data_offset = st->data_offset; + switch(st->minor_version) { + case 0: + if (data_offset == INVALID_SECTORS) + data_offset = 0; + sb_offset = dsize; + sb_offset -= 8*2; + sb_offset &= ~(4*2-1); + sb->data_offset = __cpu_to_le64(data_offset); + sb->super_offset = __cpu_to_le64(sb_offset); + if (sb_offset < array_size + bm_space) + bm_space = sb_offset - array_size; + sb->data_size = __cpu_to_le64(sb_offset - bm_space); + if (bm_space >= 8) { + sb->bblog_size = __cpu_to_le16(8); + sb->bblog_offset = __cpu_to_le32((unsigned)-8); + } + break; + case 1: + sb->super_offset = __cpu_to_le64(0); + if (data_offset == INVALID_SECTORS) + data_offset = 16; + + sb->data_offset = __cpu_to_le64(data_offset); + sb->data_size = __cpu_to_le64(dsize - data_offset); + if (data_offset >= 8 + 32*2 + 8) { + sb->bblog_size = __cpu_to_le16(8); + sb->bblog_offset = __cpu_to_le32(8 + 32*2); + } else if (data_offset >= 16) { + sb->bblog_size = __cpu_to_le16(8); + sb->bblog_offset = __cpu_to_le32(data_offset-8); + } + break; + case 2: + sb_offset = 4*2; + sb->super_offset = __cpu_to_le64(sb_offset); + if (data_offset == INVALID_SECTORS) + data_offset = 24; + + sb->data_offset = __cpu_to_le64(data_offset); + sb->data_size = __cpu_to_le64(dsize - data_offset); + if (data_offset >= 16 + 32*2 + 8) { + sb->bblog_size = __cpu_to_le16(8); + sb->bblog_offset = __cpu_to_le32(8 + 32*2); + } else if (data_offset >= 16+16) { + sb->bblog_size = __cpu_to_le16(8); + /* '8' sectors for the bblog, and another '8' + * because we want offset from superblock, not + * start of device. + */ + sb->bblog_offset = __cpu_to_le32(data_offset-8-8); + } + break; + default: + pr_err("Failed to write invalid metadata format 1.%i to %s\n", + st->minor_version, di->devname); + rv = -EINVAL; + goto out; + } + if (conf_get_create_info()->bblist == 0) { + sb->bblog_size = 0; + sb->bblog_offset = 0; + } + + sb->sb_csum = calc_sb_1_csum(sb); + rv = store_super1(st, di->fd); + if (rv == 0 && (__le32_to_cpu(sb->feature_map) & 1)) + rv = st->ss->write_bitmap(st, di->fd); + close(di->fd); + di->fd = -1; + if (rv) + goto error_out; + } +error_out: + if (rv) + pr_err("Failed to write metadata to %s\n", + di->devname); +out: + return rv; +} +#endif + +static int compare_super1(struct supertype *st, struct supertype *tst) +{ + /* + * return: + * 0 same, or first was empty, and second was copied + * 1 second had wrong number + * 2 wrong uuid + * 3 wrong other info + */ + struct mdp_superblock_1 *first = st->sb; + struct mdp_superblock_1 *second = tst->sb; + + if (second->magic != __cpu_to_le32(MD_SB_MAGIC)) + return 1; + if (second->major_version != __cpu_to_le32(1)) + return 1; + + if (!first) { + if (posix_memalign((void**)&first, 4096, SUPER1_SIZE) != 0) { + pr_err("could not allocate superblock\n"); + return 1; + } + memcpy(first, second, SUPER1_SIZE); + st->sb = first; + return 0; + } + if (memcmp(first->set_uuid, second->set_uuid, 16)!= 0) + return 2; + + if (first->ctime != second->ctime || + first->level != second->level || + first->layout != second->layout || + first->size != second->size || + first->chunksize != second->chunksize || + first->raid_disks != second->raid_disks) + return 3; + return 0; +} + +static int load_super1(struct supertype *st, int fd, char *devname) +{ + unsigned long long dsize; + unsigned long long sb_offset; + struct mdp_superblock_1 *super; + int uuid[4]; + struct bitmap_super_s *bsb; + struct misc_dev_info *misc; + struct align_fd afd; + + free_super1(st); + + init_afd(&afd, fd); + + if (st->ss == NULL || st->minor_version == -1) { + int bestvers = -1; + struct supertype tst; + __u64 bestctime = 0; + /* guess... choose latest ctime */ + memset(&tst, 0, sizeof(tst)); + tst.ss = &super1; + for (tst.minor_version = 0; tst.minor_version <= 2 ; tst.minor_version++) { + switch(load_super1(&tst, fd, devname)) { + case 0: super = tst.sb; + if (bestvers == -1 || + bestctime < __le64_to_cpu(super->ctime)) { + bestvers = tst.minor_version; + bestctime = __le64_to_cpu(super->ctime); + } + free(super); + tst.sb = NULL; + break; + case 1: return 1; /*bad device */ + case 2: break; /* bad, try next */ + } + } + if (bestvers != -1) { + int rv; + tst.minor_version = bestvers; + tst.ss = &super1; + tst.max_devs = MAX_DEVS; + rv = load_super1(&tst, fd, devname); + if (rv == 0) + *st = tst; + return rv; + } + return 2; + } + if (!get_dev_size(fd, devname, &dsize)) + return 1; + dsize >>= 9; + + if (dsize < 24) { + if (devname) + pr_err("%s is too small for md: size is %llu sectors.\n", + devname, dsize); + return 1; + } + + /* + * Calculate the position of the superblock. + * It is always aligned to a 4K boundary and + * depending on minor_version, it can be: + * 0: At least 8K, but less than 12K, from end of device + * 1: At start of device + * 2: 4K from start of device. + */ + switch(st->minor_version) { + case 0: + sb_offset = dsize; + sb_offset -= 8*2; + sb_offset &= ~(4*2-1); + break; + case 1: + sb_offset = 0; + break; + case 2: + sb_offset = 4*2; + break; + default: + return -EINVAL; + } + + if (lseek64(fd, sb_offset << 9, 0)< 0LL) { + if (devname) + pr_err("Cannot seek to superblock on %s: %s\n", + devname, strerror(errno)); + return 1; + } + + if (posix_memalign((void**)&super, 4096, SUPER1_SIZE) != 0) { + pr_err("could not allocate superblock\n"); + return 1; + } + + if (aread(&afd, super, MAX_SB_SIZE) != MAX_SB_SIZE) { + if (devname) + pr_err("Cannot read superblock on %s\n", + devname); + free(super); + return 1; + } + + if (__le32_to_cpu(super->magic) != MD_SB_MAGIC) { + if (devname) + pr_err("No super block found on %s (Expected magic %08x, got %08x)\n", + devname, MD_SB_MAGIC, __le32_to_cpu(super->magic)); + free(super); + return 2; + } + + if (__le32_to_cpu(super->major_version) != 1) { + if (devname) + pr_err("Cannot interpret superblock on %s - version is %d\n", + devname, __le32_to_cpu(super->major_version)); + free(super); + return 2; + } + if (__le64_to_cpu(super->super_offset) != sb_offset) { + if (devname) + pr_err("No superblock found on %s (super_offset is wrong)\n", + devname); + free(super); + return 2; + } + st->sb = super; + + bsb = (struct bitmap_super_s *)(((char*)super)+MAX_SB_SIZE); + + misc = (struct misc_dev_info*) (((char*)super)+MAX_SB_SIZE+BM_SUPER_SIZE); + misc->device_size = dsize; + if (st->data_offset == INVALID_SECTORS) + st->data_offset = __le64_to_cpu(super->data_offset); + + /* Now check on the bitmap superblock */ + if ((__le32_to_cpu(super->feature_map)&MD_FEATURE_BITMAP_OFFSET) == 0) + return 0; + /* Read the bitmap superblock and make sure it looks + * valid. If it doesn't clear the bit. An --assemble --force + * should get that written out. + */ + locate_bitmap1(st, fd); + if (aread(&afd, bsb, 512) != 512) + goto no_bitmap; + + uuid_from_super1(st, uuid); + if (__le32_to_cpu(bsb->magic) != BITMAP_MAGIC || + memcmp(bsb->uuid, uuid, 16) != 0) + goto no_bitmap; + return 0; + + no_bitmap: + super->feature_map = __cpu_to_le32(__le32_to_cpu(super->feature_map) + & ~MD_FEATURE_BITMAP_OFFSET); + return 0; +} + +static struct supertype *match_metadata_desc1(char *arg) +{ + struct supertype *st = xcalloc(1, sizeof(*st)); + + st->container_devnm[0] = 0; + st->ss = &super1; + st->max_devs = MAX_DEVS; + st->sb = NULL; + st->data_offset = INVALID_SECTORS; + /* leading zeros can be safely ignored. --detail generates them. */ + while (*arg == '0') + arg++; + if (strcmp(arg, "1.0") == 0 || + strcmp(arg, "1.00") == 0) { + st->minor_version = 0; + return st; + } + if (strcmp(arg, "1.1") == 0 || + strcmp(arg, "1.01") == 0 + ) { + st->minor_version = 1; + return st; + } + if (strcmp(arg, "1.2") == 0 || +#ifndef DEFAULT_OLD_METADATA /* ifdef in super0.c */ + strcmp(arg, "default") == 0 || +#endif /* DEFAULT_OLD_METADATA */ + strcmp(arg, "1.02") == 0) { + st->minor_version = 2; + return st; + } + if (strcmp(arg, "1") == 0 || + strcmp(arg, "default") == 0) { + st->minor_version = -1; + return st; + } + + free(st); + return NULL; +} + +/* find available size on device with this devsize, using + * superblock type st, and reserving 'reserve' sectors for + * a possible bitmap + */ +static __u64 avail_size1(struct supertype *st, __u64 devsize, + unsigned long long data_offset) +{ + struct mdp_superblock_1 *super = st->sb; + int bmspace = 0; + int bbspace = 0; + if (devsize < 24) + return 0; + +#ifndef MDASSEMBLE + if (__le32_to_cpu(super->feature_map)&MD_FEATURE_BITMAP_OFFSET) { + /* hot-add. allow for actual size of bitmap */ + struct bitmap_super_s *bsb; + bsb = (struct bitmap_super_s *)(((char*)super)+MAX_SB_SIZE); + bmspace = bitmap_sectors(bsb); + } +#endif + /* Allow space for bad block log */ + if (super->bblog_size) + bbspace = __le16_to_cpu(super->bblog_size); + + if (st->minor_version < 0) + /* not specified, so time to set default */ + st->minor_version = 2; + + if (data_offset == INVALID_SECTORS) + data_offset = st->data_offset; + + if (data_offset != INVALID_SECTORS) + switch(st->minor_version) { + case 0: + return devsize - data_offset - 8*2 - bbspace; + case 1: + case 2: + return devsize - data_offset; + default: + return 0; + } + + devsize -= bmspace; + + switch(st->minor_version) { + case 0: + /* at end */ + return ((devsize - 8*2 - bbspace ) & ~(4*2-1)); + case 1: + /* at start, 4K for superblock and possible bitmap */ + return devsize - 4*2 - bbspace; + case 2: + /* 4k from start, 4K for superblock and possible bitmap */ + return devsize - (4+4)*2 - bbspace; + } + return 0; +} + +static int +add_internal_bitmap1(struct supertype *st, + int *chunkp, int delay, int write_behind, + unsigned long long size, + int may_change, int major) +{ + /* + * If not may_change, then this is a 'Grow' without sysfs support for + * bitmaps, and the bitmap must fit after the superblock at 1K offset. + * If may_change, then this is create or a Grow with sysfs syupport, + * and we can put the bitmap wherever we like. + * + * size is in sectors, chunk is in bytes !!! + */ + + unsigned long long bits; + unsigned long long max_bits; + unsigned long long min_chunk; + long offset; + long bbl_offset, bbl_size; + unsigned long long chunk = *chunkp; + int room = 0; + int creating = 0; + struct mdp_superblock_1 *sb = st->sb; + bitmap_super_t *bms = (bitmap_super_t*)(((char*)sb) + MAX_SB_SIZE); + int uuid[4]; + + if (__le64_to_cpu(sb->data_size) == 0) + /* Must be creating the array, else data_size would be non-zero */ + creating = 1; + switch(st->minor_version) { + case 0: + /* either 3K after the superblock (when hot-add), + * or some amount of space before. + */ + if (creating) { + /* We are creating array, so we *know* how much room has + * been left. + */ + offset = 0; + bbl_size = 8; + room = choose_bm_space(__le64_to_cpu(sb->size)) + bbl_size; + } else { + room = __le64_to_cpu(sb->super_offset) + - __le64_to_cpu(sb->data_offset) + - __le64_to_cpu(sb->data_size); + bbl_size = __le16_to_cpu(sb->bblog_size); + if (bbl_size < 8) + bbl_size = 8; + bbl_offset = (__s32)__le32_to_cpu(sb->bblog_offset); + if (bbl_size < -bbl_offset) + bbl_size = -bbl_offset; + + if (!may_change || (room < 3*2 && + __le32_to_cpu(sb->max_dev) <= 384)) { + room = 3*2; + offset = 1*2; + bbl_size = 0; + } else { + offset = 0; /* means movable offset */ + } + } + break; + case 1: + case 2: /* between superblock and data */ + if (creating) { + offset = 4*2; + bbl_size = 8; + room = choose_bm_space(__le64_to_cpu(sb->size)) + bbl_size; + } else { + room = __le64_to_cpu(sb->data_offset) + - __le64_to_cpu(sb->super_offset); + bbl_size = __le16_to_cpu(sb->bblog_size); + if (bbl_size) + room = __le32_to_cpu(sb->bblog_offset) + bbl_size; + else + bbl_size = 8; + + if (!may_change) { + room -= 2; /* Leave 1K for superblock */ + offset = 2; + bbl_size = 0; + } else { + room -= 4*2; /* leave 4K for superblock */ + offset = 4*2; + } + } + break; + default: + return 0; + } + + room -= bbl_size; + if (chunk == UnSet && room > 128*2) + /* Limit to 128K of bitmap when chunk size not requested */ + room = 128*2; + + if (room <= 1) + /* No room for a bitmap */ + return 0; + + max_bits = (room * 512 - sizeof(bitmap_super_t)) * 8; + + min_chunk = 4096; /* sub-page chunks don't work yet.. */ + bits = (size*512)/min_chunk +1; + while (bits > max_bits) { + min_chunk *= 2; + bits = (bits+1)/2; + } + if (chunk == UnSet) { + /* For practical purpose, 64Meg is a good + * default chunk size for internal bitmaps. + */ + chunk = min_chunk; + if (chunk < 64*1024*1024) + chunk = 64*1024*1024; + } else if (chunk < min_chunk) + return 0; /* chunk size too small */ + if (chunk == 0) /* rounding problem */ + return 0; + + if (offset == 0) { + /* start bitmap on a 4K boundary with enough space for + * the bitmap + */ + bits = (size*512) / chunk + 1; + room = ((bits+7)/8 + sizeof(bitmap_super_t) +4095)/4096; + room *= 8; /* convert 4K blocks to sectors */ + offset = -room - bbl_size; + } + + sb->bitmap_offset = (int32_t)__cpu_to_le32(offset); + + sb->feature_map = __cpu_to_le32(__le32_to_cpu(sb->feature_map) + | MD_FEATURE_BITMAP_OFFSET); + memset(bms, 0, sizeof(*bms)); + bms->magic = __cpu_to_le32(BITMAP_MAGIC); + bms->version = __cpu_to_le32(major); + uuid_from_super1(st, uuid); + memcpy(bms->uuid, uuid, 16); + bms->chunksize = __cpu_to_le32(chunk); + bms->daemon_sleep = __cpu_to_le32(delay); + bms->sync_size = __cpu_to_le64(size); + bms->write_behind = __cpu_to_le32(write_behind); + + *chunkp = chunk; + return 1; +} + +static void locate_bitmap1(struct supertype *st, int fd) +{ + unsigned long long offset; + struct mdp_superblock_1 *sb; + int mustfree = 0; + + if (!st->sb) { + if (st->ss->load_super(st, fd, NULL)) + return; /* no error I hope... */ + mustfree = 1; + } + sb = st->sb; + + offset = __le64_to_cpu(sb->super_offset); + offset += (int32_t) __le32_to_cpu(sb->bitmap_offset); + if (mustfree) + free(sb); + lseek64(fd, offset<<9, 0); +} + +static int write_bitmap1(struct supertype *st, int fd) +{ + struct mdp_superblock_1 *sb = st->sb; + bitmap_super_t *bms = (bitmap_super_t*)(((char*)sb)+MAX_SB_SIZE); + int rv = 0; + void *buf; + int towrite, n; + struct align_fd afd; + + init_afd(&afd, fd); + + locate_bitmap1(st, fd); + + if (posix_memalign(&buf, 4096, 4096)) + return -ENOMEM; + + memset(buf, 0xff, 4096); + memcpy(buf, (char *)bms, sizeof(bitmap_super_t)); + + towrite = __le64_to_cpu(bms->sync_size) / (__le32_to_cpu(bms->chunksize)>>9); + towrite = (towrite+7) >> 3; /* bits to bytes */ + towrite += sizeof(bitmap_super_t); + towrite = ROUND_UP(towrite, 512); + while (towrite > 0) { + n = towrite; + if (n > 4096) + n = 4096; + n = awrite(&afd, buf, n); + if (n > 0) + towrite -= n; + else + break; + memset(buf, 0xff, 4096); + } + fsync(fd); + if (towrite) + rv = -2; + + free(buf); + return rv; +} + +static void free_super1(struct supertype *st) +{ + if (st->sb) + free(st->sb); + while (st->info) { + struct devinfo *di = st->info; + st->info = di->next; + if (di->fd >= 0) + close(di->fd); + free(di); + } + st->sb = NULL; +} + +#ifndef MDASSEMBLE +static int validate_geometry1(struct supertype *st, int level, + int layout, int raiddisks, + int *chunk, unsigned long long size, + unsigned long long data_offset, + char *subdev, unsigned long long *freesize, + int verbose) +{ + unsigned long long ldsize, devsize; + int bmspace; + unsigned long long headroom; + int fd; + + if (level == LEVEL_CONTAINER) { + if (verbose) + pr_err("1.x metadata does not support containers\n"); + return 0; + } + if (*chunk == UnSet) + *chunk = DEFAULT_CHUNK; + + if (!subdev) + return 1; + + if (st->minor_version < 0) + /* not specified, so time to set default */ + st->minor_version = 2; + + fd = open(subdev, O_RDONLY|O_EXCL, 0); + if (fd < 0) { + if (verbose) + pr_err("super1.x cannot open %s: %s\n", + subdev, strerror(errno)); + return 0; + } + + if (!get_dev_size(fd, subdev, &ldsize)) { + close(fd); + return 0; + } + close(fd); + + devsize = ldsize >> 9; + if (devsize < 24) { + *freesize = 0; + return 0; + } + + /* creating: allow suitable space for bitmap */ + bmspace = choose_bm_space(devsize); + + if (data_offset == INVALID_SECTORS) + data_offset = st->data_offset; + if (data_offset == INVALID_SECTORS) + switch (st->minor_version) { + case 0: + data_offset = 0; + break; + case 1: + case 2: + /* Choose data offset appropriate for this device + * and use as default for whole array. + * The data_offset must allow for bitmap space + * and base metadata, should allow for some headroom + * for reshape, and should be rounded to multiple + * of 1M. + * Headroom is limited to 128M, but aim for about 0.1% + */ + headroom = 128*1024*2; + while ((headroom << 10) > devsize && + (*chunk == 0 || + headroom / 2 >= ((unsigned)(*chunk)*2)*2)) + headroom >>= 1; + data_offset = 12*2 + bmspace + headroom; + #define ONE_MEG (2*1024) + if (data_offset > ONE_MEG) + data_offset = (data_offset / ONE_MEG) * ONE_MEG; + break; + } + if (st->data_offset == INVALID_SECTORS) + st->data_offset = data_offset; + switch(st->minor_version) { + case 0: /* metadata at end. Round down and subtract space to reserve */ + devsize = (devsize & ~(4ULL*2-1)); + /* space for metadata, bblog, bitmap */ + devsize -= 8*2 + 8 + bmspace; + break; + case 1: + case 2: + devsize -= data_offset; + break; + } + *freesize = devsize; + return 1; +} +#endif /* MDASSEMBLE */ + +void *super1_make_v0(struct supertype *st, struct mdinfo *info, mdp_super_t *sb0) +{ + /* Create a v1.0 superblock based on 'info'*/ + void *ret; + struct mdp_superblock_1 *sb; + int i; + int rfd; + unsigned long long offset; + + if (posix_memalign(&ret, 4096, 1024) != 0) + return NULL; + sb = ret; + memset(ret, 0, 1024); + sb->magic = __cpu_to_le32(MD_SB_MAGIC); + sb->major_version = __cpu_to_le32(1); + + copy_uuid(sb->set_uuid, info->uuid, super1.swapuuid); + sprintf(sb->set_name, "%d", sb0->md_minor); + sb->ctime = __cpu_to_le32(info->array.ctime+1); + sb->level = __cpu_to_le32(info->array.level); + sb->layout = __cpu_to_le32(info->array.layout); + sb->size = __cpu_to_le64(info->component_size); + sb->chunksize = __cpu_to_le32(info->array.chunk_size/512); + sb->raid_disks = __cpu_to_le32(info->array.raid_disks); + if (info->array.level > 0) + sb->data_size = sb->size; + else + sb->data_size = st->ss->avail_size(st, st->devsize/512, 0); + sb->resync_offset = MaxSector; + sb->max_dev = __cpu_to_le32(MD_SB_DISKS); + sb->dev_number = __cpu_to_le32(info->disk.number); + sb->utime = __cpu_to_le64(info->array.utime); + + offset = st->devsize/512 - 8*2; + offset &= ~(4*2-1); + sb->super_offset = __cpu_to_le64(offset); + //*(__u64*)(st->other + 128 + 8 + 8) = __cpu_to_le64(offset); + + if ((rfd = open("/dev/urandom", O_RDONLY)) < 0 || + read(rfd, sb->device_uuid, 16) != 16) { + __u32 r[4] = {random(), random(), random(), random()}; + memcpy(sb->device_uuid, r, 16); + } + if (rfd >= 0) + close(rfd); + + for (i = 0; i < MD_SB_DISKS; i++) { + int state = sb0->disks[i].state; + sb->dev_roles[i] = 0xFFFF; + if ((state & (1<<MD_DISK_SYNC)) && + !(state & (1<<MD_DISK_FAULTY))) + sb->dev_roles[i] = __cpu_to_le16(sb0->disks[i].raid_disk); + } + sb->sb_csum = calc_sb_1_csum(sb); + return ret; +} + +struct superswitch super1 = { +#ifndef MDASSEMBLE + .examine_super = examine_super1, + .brief_examine_super = brief_examine_super1, + .export_examine_super = export_examine_super1, + .detail_super = detail_super1, + .brief_detail_super = brief_detail_super1, + .export_detail_super = export_detail_super1, + .write_init_super = write_init_super1, + .validate_geometry = validate_geometry1, + .add_to_super = add_to_super1, + .examine_badblocks = examine_badblocks_super1, + .copy_metadata = copy_metadata1, +#endif + .match_home = match_home1, + .uuid_from_super = uuid_from_super1, + .getinfo_super = getinfo_super1, + .container_content = container_content1, + .update_super = update_super1, + .init_super = init_super1, + .store_super = store_super1, + .compare_super = compare_super1, + .load_super = load_super1, + .match_metadata_desc = match_metadata_desc1, + .avail_size = avail_size1, + .add_internal_bitmap = add_internal_bitmap1, + .locate_bitmap = locate_bitmap1, + .write_bitmap = write_bitmap1, + .free_super = free_super1, +#if __BYTE_ORDER == BIG_ENDIAN + .swapuuid = 0, +#else + .swapuuid = 1, +#endif + .name = "1.x", +}; diff --git a/swap_super.c b/swap_super.c new file mode 100644 index 00000000..b6db5743 --- /dev/null +++ b/swap_super.c @@ -0,0 +1,81 @@ +#include <unistd.h> +#include <stdlib.h> +#include <fcntl.h> +#include <stdio.h> +#include <sys/mount.h> +/* + * This is a tiny test program to endian-swap + * the superblock on a given device. + * We simply read 4k from where the superblock should be + * do the swap, and write it back + * Don't use this on a real array, use mdadm. + */ + +#define MD_RESERVED_BYTES (64 * 1024) +#define MD_RESERVED_SECTORS (MD_RESERVED_BYTES / 512) + +#define MD_NEW_SIZE_SECTORS(x) ((x & ~(MD_RESERVED_SECTORS - 1)) - MD_RESERVED_SECTORS) + +extern long long lseek64(int, long long, int); + +int main(int argc, char *argv[]) +{ + int fd, i; + unsigned long size; + unsigned long long offset; + char super[4096]; + if (argc != 2) { + fprintf(stderr, "Usage: swap_super device\n"); + exit(1); + } + fd = open(argv[1], O_RDWR); + if (fd<0) { + perror(argv[1]); + exit(1); + } + if (ioctl(fd, BLKGETSIZE, &size)) { + perror("BLKGETSIZE"); + exit(1); + } + offset = MD_NEW_SIZE_SECTORS(size) * 512LL; + if (lseek64(fd, offset, 0) < 0LL) { + perror("lseek64"); + exit(1); + } + if (read(fd, super, 4096) != 4096) { + perror("read"); + exit(1); + } + + for (i=0; i < 4096 ; i+=4) { + char t = super[i]; + super[i] = super[i+3]; + super[i+3] = t; + t=super[i+1]; + super[i+1]=super[i+2]; + super[i+2]=t; + } + /* swap the u64 events counters */ + for (i=0; i<4; i++) { + /* events_hi and events_lo */ + char t=super[32*4+7*4 +i]; + super[32*4+7*4 +i] = super[32*4+8*4 +i]; + super[32*4+8*4 +i] = t; + + /* cp_events_hi and cp_events_lo */ + t=super[32*4+9*4 +i]; + super[32*4+9*4 +i] = super[32*4+10*4 +i]; + super[32*4+10*4 +i] = t; + } + + if (lseek64(fd, offset, 0) < 0LL) { + perror("lseek64"); + exit(1); + } + if (write(fd, super, 4096) != 4096) { + perror("write"); + exit(1); + } + exit(0); + +} diff --git a/sysfs.c b/sysfs.c new file mode 100644 index 00000000..72684702 --- /dev/null +++ b/sysfs.c @@ -0,0 +1,918 @@ +/* + * sysfs - extract md related information from sysfs. Part of: + * mdadm - manage Linux "md" devices aka RAID arrays. + * + * Copyright (C) 2006-2009 Neil Brown <neilb@suse.de> + * + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Author: Neil Brown + * Email: <neilb@suse.de> + */ + +#include "mdadm.h" +#include <dirent.h> +#include <ctype.h> + +int load_sys(char *path, char *buf) +{ + int fd = open(path, O_RDONLY); + int n; + if (fd < 0) + return -1; + n = read(fd, buf, 1024); + close(fd); + if (n <0 || n >= 1024) + return -1; + buf[n] = 0; + if (n && buf[n-1] == '\n') + buf[n-1] = 0; + return 0; +} + +void sysfs_free(struct mdinfo *sra) +{ + while (sra) { + struct mdinfo *sra2 = sra->next; + while (sra->devs) { + struct mdinfo *d = sra->devs; + sra->devs = d->next; + free(d); + } + free(sra); + sra = sra2; + } +} + +int sysfs_open(char *devnm, char *devname, char *attr) +{ + char fname[50]; + int fd; + + sprintf(fname, "/sys/block/%s/md/", devnm); + if (devname) { + strcat(fname, devname); + strcat(fname, "/"); + } + strcat(fname, attr); + fd = open(fname, O_RDWR); + if (fd < 0 && errno == EACCES) + fd = open(fname, O_RDONLY); + return fd; +} + +void sysfs_init(struct mdinfo *mdi, int fd, char *devnm) +{ + mdi->sys_name[0] = 0; + if (fd >= 0) { + mdu_version_t vers; + if (ioctl(fd, RAID_VERSION, &vers) != 0) + return; + devnm = fd2devnm(fd); + } + if (devnm == NULL) + return; + strcpy(mdi->sys_name, devnm); +} + +struct mdinfo *sysfs_read(int fd, char *devnm, unsigned long options) +{ + char fname[PATH_MAX]; + char buf[PATH_MAX]; + char *base; + char *dbase; + struct mdinfo *sra; + struct mdinfo *dev, **devp; + DIR *dir = NULL; + struct dirent *de; + + sra = xcalloc(1, sizeof(*sra)); + sysfs_init(sra, fd, devnm); + if (sra->sys_name[0] == 0) { + free(sra); + return NULL; + } + + sprintf(fname, "/sys/block/%s/md/", sra->sys_name); + base = fname + strlen(fname); + + sra->devs = NULL; + if (options & GET_VERSION) { + strcpy(base, "metadata_version"); + if (load_sys(fname, buf)) + goto abort; + if (strncmp(buf, "none", 4) == 0) { + sra->array.major_version = + sra->array.minor_version = -1; + strcpy(sra->text_version, ""); + } else if (strncmp(buf, "external:", 9) == 0) { + sra->array.major_version = -1; + sra->array.minor_version = -2; + strcpy(sra->text_version, buf+9); + } else { + sscanf(buf, "%d.%d", + &sra->array.major_version, + &sra->array.minor_version); + strcpy(sra->text_version, buf); + } + } + if (options & GET_LEVEL) { + strcpy(base, "level"); + if (load_sys(fname, buf)) + goto abort; + sra->array.level = map_name(pers, buf); + } + if (options & GET_LAYOUT) { + strcpy(base, "layout"); + if (load_sys(fname, buf)) + goto abort; + sra->array.layout = strtoul(buf, NULL, 0); + } + if (options & GET_DISKS) { + strcpy(base, "raid_disks"); + if (load_sys(fname, buf)) + goto abort; + sra->array.raid_disks = strtoul(buf, NULL, 0); + } + if (options & GET_DEGRADED) { + strcpy(base, "degraded"); + if (load_sys(fname, buf)) + goto abort; + sra->array.failed_disks = strtoul(buf, NULL, 0); + } + if (options & GET_COMPONENT) { + strcpy(base, "component_size"); + if (load_sys(fname, buf)) + goto abort; + sra->component_size = strtoull(buf, NULL, 0); + /* sysfs reports "K", but we want sectors */ + sra->component_size *= 2; + } + if (options & GET_CHUNK) { + strcpy(base, "chunk_size"); + if (load_sys(fname, buf)) + goto abort; + sra->array.chunk_size = strtoul(buf, NULL, 0); + } + if (options & GET_CACHE) { + strcpy(base, "stripe_cache_size"); + if (load_sys(fname, buf)) + /* Probably level doesn't support it */ + sra->cache_size = 0; + else + sra->cache_size = strtoul(buf, NULL, 0); + } + if (options & GET_MISMATCH) { + strcpy(base, "mismatch_cnt"); + if (load_sys(fname, buf)) + goto abort; + sra->mismatch_cnt = strtoul(buf, NULL, 0); + } + if (options & GET_SAFEMODE) { + int scale = 1; + int dot = 0; + unsigned i; + unsigned long msec; + size_t len; + + strcpy(base, "safe_mode_delay"); + if (load_sys(fname, buf)) + goto abort; + + /* remove a period, and count digits after it */ + len = strlen(buf); + for (i = 0; i < len; i++) { + if (dot) { + if (isdigit(buf[i])) { + buf[i-1] = buf[i]; + scale *= 10; + } + buf[i] = 0; + } else if (buf[i] == '.') { + dot=1; + buf[i] = 0; + } + } + msec = strtoul(buf, NULL, 10); + msec = (msec * 1000) / scale; + sra->safe_mode_delay = msec; + } + if (options & GET_BITMAP_LOCATION) { + strcpy(base, "bitmap/location"); + if (load_sys(fname, buf)) + goto abort; + if (strncmp(buf, "file", 4) == 0) + sra->bitmap_offset = 1; + else if (strncmp(buf, "none", 4) == 0) + sra->bitmap_offset = 0; + else if (buf[0] == '+') + sra->bitmap_offset = strtol(buf+1, NULL, 10); + else + goto abort; + } + + if (! (options & GET_DEVS)) + return sra; + + /* Get all the devices as well */ + *base = 0; + dir = opendir(fname); + if (!dir) + goto abort; + sra->array.spare_disks = 0; + + devp = &sra->devs; + sra->devs = NULL; + while ((de = readdir(dir)) != NULL) { + char *ep; + if (de->d_ino == 0 || + strncmp(de->d_name, "dev-", 4) != 0) + continue; + strcpy(base, de->d_name); + dbase = base + strlen(base); + *dbase++ = '/'; + + dev = xmalloc(sizeof(*dev)); + + /* Always get slot, major, minor */ + strcpy(dbase, "slot"); + if (load_sys(fname, buf)) { + /* hmm... unable to read 'slot' maybe the device + * is going away? + */ + strcpy(dbase, "block"); + if (readlink(fname, buf, sizeof(buf)) < 0 && + errno != ENAMETOOLONG) { + /* ...yup device is gone */ + free(dev); + continue; + } else { + /* slot is unreadable but 'block' link + * still intact... something bad is happening + * so abort + */ + free(dev); + goto abort; + } + + } + strcpy(dev->sys_name, de->d_name); + dev->disk.raid_disk = strtoul(buf, &ep, 10); + if (*ep) dev->disk.raid_disk = -1; + + strcpy(dbase, "block/dev"); + if (load_sys(fname, buf)) { + /* assume this is a stale reference to a hot + * removed device + */ + free(dev); + continue; + } + sra->array.nr_disks++; + sscanf(buf, "%d:%d", &dev->disk.major, &dev->disk.minor); + + /* special case check for block devices that can go 'offline' */ + strcpy(dbase, "block/device/state"); + if (load_sys(fname, buf) == 0 && + strncmp(buf, "offline", 7) == 0) { + free(dev); + continue; + } + + /* finally add this disk to the array */ + *devp = dev; + devp = & dev->next; + dev->next = NULL; + + if (options & GET_OFFSET) { + strcpy(dbase, "offset"); + if (load_sys(fname, buf)) + goto abort; + dev->data_offset = strtoull(buf, NULL, 0); + strcpy(dbase, "new_offset"); + if (load_sys(fname, buf) == 0) + dev->new_data_offset = strtoull(buf, NULL, 0); + else + dev->new_data_offset = dev->data_offset; + } + if (options & GET_SIZE) { + strcpy(dbase, "size"); + if (load_sys(fname, buf)) + goto abort; + dev->component_size = strtoull(buf, NULL, 0) * 2; + } + if (options & GET_STATE) { + dev->disk.state = 0; + strcpy(dbase, "state"); + if (load_sys(fname, buf)) + goto abort; + if (strstr(buf, "in_sync")) + dev->disk.state |= (1<<MD_DISK_SYNC); + if (strstr(buf, "faulty")) + dev->disk.state |= (1<<MD_DISK_FAULTY); + if (dev->disk.state == 0) + sra->array.spare_disks++; + } + if (options & GET_ERROR) { + strcpy(buf, "errors"); + if (load_sys(fname, buf)) + goto abort; + dev->errors = strtoul(buf, NULL, 0); + } + } + closedir(dir); + return sra; + + abort: + if (dir) + closedir(dir); + sysfs_free(sra); + return NULL; +} + +int sysfs_attr_match(const char *attr, const char *str) +{ + /* See if attr, read from a sysfs file, matches + * str. They must either be the same, or attr can + * have a trailing newline or comma + */ + while (*attr && *str && *attr == *str) { + attr++; + str++; + } + + if (*str || (*attr && *attr != ',' && *attr != '\n')) + return 0; + return 1; +} + +int sysfs_match_word(const char *word, char **list) +{ + int n; + for (n=0; list[n]; n++) + if (sysfs_attr_match(word, list[n])) + break; + return n; +} + +unsigned long long get_component_size(int fd) +{ + /* Find out the component size of the array. + * We cannot trust GET_ARRAY_INFO ioctl as it's + * size field is only 32bits. + * So look in /sys/block/mdXXX/md/component_size + * + * This returns in units of sectors. + */ + struct stat stb; + char fname[50]; + int n; + if (fstat(fd, &stb)) return 0; + if (major(stb.st_rdev) != (unsigned)get_mdp_major()) + sprintf(fname, "/sys/block/md%d/md/component_size", + (int)minor(stb.st_rdev)); + else + sprintf(fname, "/sys/block/md_d%d/md/component_size", + (int)minor(stb.st_rdev)>>MdpMinorShift); + fd = open(fname, O_RDONLY); + if (fd < 0) + return 0; + n = read(fd, fname, sizeof(fname)); + close(fd); + if (n < 0 || n == sizeof(fname)) + return 0; + fname[n] = 0; + return strtoull(fname, NULL, 10) * 2; +} + +int sysfs_set_str(struct mdinfo *sra, struct mdinfo *dev, + char *name, char *val) +{ + char fname[50]; + unsigned int n; + int fd; + + sprintf(fname, "/sys/block/%s/md/%s/%s", + sra->sys_name, dev?dev->sys_name:"", name); + fd = open(fname, O_WRONLY); + if (fd < 0) + return -1; + n = write(fd, val, strlen(val)); + close(fd); + if (n != strlen(val)) { + dprintf("failed to write '%s' to '%s' (%s)\n", + val, fname, strerror(errno)); + return -1; + } + return 0; +} + +int sysfs_set_num(struct mdinfo *sra, struct mdinfo *dev, + char *name, unsigned long long val) +{ + char valstr[50]; + sprintf(valstr, "%llu", val); + return sysfs_set_str(sra, dev, name, valstr); +} + +int sysfs_set_num_signed(struct mdinfo *sra, struct mdinfo *dev, + char *name, long long val) +{ + char valstr[50]; + sprintf(valstr, "%lli", val); + return sysfs_set_str(sra, dev, name, valstr); +} + +int sysfs_uevent(struct mdinfo *sra, char *event) +{ + char fname[50]; + int n; + int fd; + + sprintf(fname, "/sys/block/%s/uevent", + sra->sys_name); + fd = open(fname, O_WRONLY); + if (fd < 0) + return -1; + n = write(fd, event, strlen(event)); + close(fd); + if (n != (int)strlen(event)) { + dprintf("failed to write '%s' to '%s' (%s)\n", + event, fname, strerror(errno)); + return -1; + } + return 0; +} + +int sysfs_attribute_available(struct mdinfo *sra, struct mdinfo *dev, char *name) +{ + char fname[50]; + struct stat st; + + sprintf(fname, "/sys/block/%s/md/%s/%s", + sra->sys_name, dev?dev->sys_name:"", name); + + return stat(fname, &st) == 0; +} + +int sysfs_get_fd(struct mdinfo *sra, struct mdinfo *dev, + char *name) +{ + char fname[50]; + int fd; + + sprintf(fname, "/sys/block/%s/md/%s/%s", + sra->sys_name, dev?dev->sys_name:"", name); + fd = open(fname, O_RDWR); + if (fd < 0) + fd = open(fname, O_RDONLY); + return fd; +} + +int sysfs_fd_get_ll(int fd, unsigned long long *val) +{ + char buf[50]; + int n; + char *ep; + + lseek(fd, 0, 0); + n = read(fd, buf, sizeof(buf)); + if (n <= 0 || n == sizeof(buf)) + return -2; + buf[n] = 0; + *val = strtoull(buf, &ep, 0); + if (ep == buf || (*ep != 0 && *ep != '\n' && *ep != ' ')) + return -1; + return 0; +} + +int sysfs_get_ll(struct mdinfo *sra, struct mdinfo *dev, + char *name, unsigned long long *val) +{ + int n; + int fd; + + fd = sysfs_get_fd(sra, dev, name); + if (fd < 0) + return -1; + n = sysfs_fd_get_ll(fd, val); + close(fd); + return n; +} + +int sysfs_fd_get_two(int fd, unsigned long long *v1, unsigned long long *v2) +{ + /* two numbers in this sysfs file, either + * NNN (NNN) + * or + * NNN / NNN + */ + char buf[80]; + int n; + char *ep, *ep2; + + lseek(fd, 0, 0); + n = read(fd, buf, sizeof(buf)); + if (n <= 0 || n == sizeof(buf)) + return -2; + buf[n] = 0; + *v1 = strtoull(buf, &ep, 0); + if (ep == buf || (*ep != 0 && *ep != '\n' && *ep != ' ')) + return -1; + while (*ep == ' ' || *ep == '/' || *ep == '(') + ep++; + *v2 = strtoull(ep, &ep2, 0); + if (ep2 == ep || (*ep2 != 0 && *ep2 != '\n' && *ep2 != ' ' && *ep2 != ')')) { + *v2 = *v1; + return 1; + } + return 2; +} + +int sysfs_get_two(struct mdinfo *sra, struct mdinfo *dev, + char *name, unsigned long long *v1, unsigned long long *v2) +{ + int n; + int fd; + + fd = sysfs_get_fd(sra, dev, name); + if (fd < 0) + return -1; + n = sysfs_fd_get_two(fd, v1, v2); + close(fd); + return n; +} + +int sysfs_fd_get_str(int fd, char *val, int size) +{ + int n; + + lseek(fd, 0, 0); + n = read(fd, val, size); + if (n <= 0 || n == size) + return -1; + val[n] = 0; + return n; +} + +int sysfs_get_str(struct mdinfo *sra, struct mdinfo *dev, + char *name, char *val, int size) +{ + int n; + int fd; + + fd = sysfs_get_fd(sra, dev, name); + if (fd < 0) + return -1; + n = sysfs_fd_get_str(fd, val, size); + close(fd); + return n; +} + +int sysfs_set_safemode(struct mdinfo *sra, unsigned long ms) +{ + unsigned long sec; + unsigned long msec; + char delay[30]; + + sec = ms / 1000; + msec = ms % 1000; + + sprintf(delay, "%ld.%03ld\n", sec, msec); + /* this '\n' ^ needed for kernels older than 2.6.28 */ + return sysfs_set_str(sra, NULL, "safe_mode_delay", delay); +} + +int sysfs_set_array(struct mdinfo *info, int vers) +{ + int rv = 0; + char ver[100]; + int raid_disks = info->array.raid_disks; + + ver[0] = 0; + if (info->array.major_version == -1 && + info->array.minor_version == -2) { + char buf[1024]; + + strcat(strcpy(ver, "external:"), info->text_version); + + /* meta version might already be set if we are setting + * new geometry for a reshape. In that case we don't + * want to over-write the 'readonly' flag that is + * stored in the metadata version. So read the current + * version first, and preserve the flag + */ + if (sysfs_get_str(info, NULL, "metadata_version", + buf, 1024) > 0) + if (strlen(buf) >= 9 && buf[9] == '-') + ver[9] = '-'; + + if ((vers % 100) < 2 || + sysfs_set_str(info, NULL, "metadata_version", + ver) < 0) { + pr_err("This kernel does not support external metadata.\n"); + return 1; + } + } + if (info->array.level < 0) + return 0; /* FIXME */ + rv |= sysfs_set_str(info, NULL, "level", + map_num(pers, info->array.level)); + if (info->reshape_active && info->delta_disks != UnSet) + raid_disks -= info->delta_disks; + rv |= sysfs_set_num(info, NULL, "raid_disks", raid_disks); + rv |= sysfs_set_num(info, NULL, "chunk_size", info->array.chunk_size); + rv |= sysfs_set_num(info, NULL, "layout", info->array.layout); + rv |= sysfs_set_num(info, NULL, "component_size", info->component_size/2); + if (info->custom_array_size) { + int rc; + + rc = sysfs_set_num(info, NULL, "array_size", + info->custom_array_size/2); + if (rc && errno == ENOENT) { + pr_err("This kernel does not have the md/array_size attribute, the array may be larger than expected\n"); + rc = 0; + } + rv |= rc; + } + + if (info->array.level > 0) + rv |= sysfs_set_num(info, NULL, "resync_start", info->resync_start); + + if (info->reshape_active) { + rv |= sysfs_set_num(info, NULL, "reshape_position", + info->reshape_progress); + rv |= sysfs_set_num(info, NULL, "chunk_size", info->new_chunk); + rv |= sysfs_set_num(info, NULL, "layout", info->new_layout); + rv |= sysfs_set_num(info, NULL, "raid_disks", + info->array.raid_disks); + /* We don't set 'new_level' here. That can only happen + * once the reshape completes. + */ + } + return rv; +} + +int sysfs_add_disk(struct mdinfo *sra, struct mdinfo *sd, int resume) +{ + char dv[PATH_MAX]; + char nm[PATH_MAX]; + char *dname; + int rv; + + sprintf(dv, "%d:%d", sd->disk.major, sd->disk.minor); + rv = sysfs_set_str(sra, NULL, "new_dev", dv); + if (rv) + return rv; + + memset(nm, 0, sizeof(nm)); + dname = devid2kname(makedev(sd->disk.major, sd->disk.minor)); + strcpy(sd->sys_name, "dev-"); + strcpy(sd->sys_name+4, dname); + + /* test write to see if 'recovery_start' is available */ + if (resume && sd->recovery_start < MaxSector && + sysfs_set_num(sra, sd, "recovery_start", 0)) { + sysfs_set_str(sra, sd, "state", "remove"); + return -1; + } + + rv = sysfs_set_num(sra, sd, "offset", sd->data_offset); + rv |= sysfs_set_num(sra, sd, "size", (sd->component_size+1) / 2); + if (sra->array.level != LEVEL_CONTAINER) { + if (sd->recovery_start == MaxSector) + /* This can correctly fail if array isn't started, + * yet, so just ignore status for now. + */ + sysfs_set_str(sra, sd, "state", "insync"); + if (sd->disk.raid_disk >= 0) + rv |= sysfs_set_num(sra, sd, "slot", sd->disk.raid_disk); + if (resume) + sysfs_set_num(sra, sd, "recovery_start", sd->recovery_start); + } + return rv; +} + +#if 0 +int sysfs_disk_to_sg(int fd) +{ + /* from an open block device, try find and open its corresponding + * scsi_generic interface + */ + struct stat st; + char path[256]; + char sg_path[256]; + char sg_major_minor[10]; + char *c; + DIR *dir; + struct dirent *de; + int major, minor, rv; + + if (fstat(fd, &st)) + return -1; + + snprintf(path, sizeof(path), "/sys/dev/block/%d:%d/device", + major(st.st_rdev), minor(st.st_rdev)); + + dir = opendir(path); + if (!dir) + return -1; + + de = readdir(dir); + while (de) { + if (strncmp("scsi_generic:", de->d_name, + strlen("scsi_generic:")) == 0) + break; + de = readdir(dir); + } + closedir(dir); + + if (!de) + return -1; + + snprintf(sg_path, sizeof(sg_path), "%s/%s/dev", path, de->d_name); + fd = open(sg_path, O_RDONLY); + if (fd < 0) + return fd; + + rv = read(fd, sg_major_minor, sizeof(sg_major_minor)); + close(fd); + if (rv < 0 || rv == sizeof(sg_major_minor)) + return -1; + else + sg_major_minor[rv - 1] = '\0'; + + c = strchr(sg_major_minor, ':'); + *c = '\0'; + c++; + major = strtol(sg_major_minor, NULL, 10); + minor = strtol(c, NULL, 10); + snprintf(path, sizeof(path), "/dev/.tmp.md.%d:%d:%d", + (int) getpid(), major, minor); + if (mknod(path, S_IFCHR|0600, makedev(major, minor))==0) { + fd = open(path, O_RDONLY); + unlink(path); + return fd; + } + + return -1; +} +#endif + +int sysfs_disk_to_scsi_id(int fd, __u32 *id) +{ + /* from an open block device, try to retrieve it scsi_id */ + struct stat st; + char path[256]; + DIR *dir; + struct dirent *de; + int host, bus, target, lun; + + if (fstat(fd, &st)) + return 1; + + snprintf(path, sizeof(path), "/sys/dev/block/%d:%d/device/scsi_device", + major(st.st_rdev), minor(st.st_rdev)); + + dir = opendir(path); + if (!dir) + return 1; + + for (de = readdir(dir); de; de = readdir(dir)) { + int count; + + if (de->d_type != DT_DIR) + continue; + + count = sscanf(de->d_name, "%d:%d:%d:%d", &host, &bus, &target, &lun); + if (count == 4) + break; + } + closedir(dir); + + if (!de) + return 1; + + *id = (host << 24) | (bus << 16) | (target << 8) | (lun << 0); + return 0; +} + +int sysfs_unique_holder(char *devnm, long rdev) +{ + /* Check that devnm is a holder of rdev, + * and is the only holder. + * we should be locked against races by + * an O_EXCL on devnm + * Return values: + * 0 - not unique, not even a holder + * 1 - unique, this is the only holder. + * 2/3 - not unique, there is another holder + * -1 - error, cannot find the holders + */ + DIR *dir; + struct dirent *de; + char dirname[100]; + char l; + int ret = 0; + sprintf(dirname, "/sys/dev/block/%d:%d/holders", + major(rdev), minor(rdev)); + dir = opendir(dirname); + if (!dir) + return -1; + l = strlen(dirname); + while ((de = readdir(dir)) != NULL) { + char buf[100]; + char *sl; + int n; + + if (de->d_ino == 0) + continue; + if (de->d_name[0] == '.') + continue; + strcpy(dirname+l, "/"); + strcat(dirname+l, de->d_name); + n = readlink(dirname, buf, sizeof(buf)-1); + if (n <= 0) + continue; + buf[n] = 0; + sl = strrchr(buf, '/'); + if (!sl) + continue; + sl++; + + if (strcmp(devnm, sl) == 0) + ret |= 1; + else + ret |= 2; + } + closedir(dir); + return ret; +} + +int sysfs_freeze_array(struct mdinfo *sra) +{ + /* Try to freeze resync/rebuild on this array/container. + * Return -1 if the array is busy, + * return 0 if this kernel doesn't support 'frozen' + * return 1 if it worked. + */ + char buf[20]; + + if (!sysfs_attribute_available(sra, NULL, "sync_action")) + return 1; /* no sync_action == frozen */ + if (sysfs_get_str(sra, NULL, "sync_action", buf, 20) <= 0) + return 0; + if (strcmp(buf, "frozen\n") == 0) + /* Already frozen */ + return 0; + if (strcmp(buf, "idle\n") != 0 && strcmp(buf, "recover\n") != 0) + return -1; + if (sysfs_set_str(sra, NULL, "sync_action", "frozen") < 0) + return 0; + return 1; +} + +int sysfs_wait(int fd, int *msec) +{ + /* Wait up to '*msec' for fd to have an exception condition. + * if msec == NULL, wait indefinitely. + */ + fd_set fds; + int n; + FD_ZERO(&fds); + FD_SET(fd, &fds); + if (msec == NULL) + n = select(fd+1, NULL, NULL, &fds, NULL); + else if (*msec < 0) + n = 0; + else { + struct timeval start, end, tv; + gettimeofday(&start, NULL); + if (*msec < 1000) { + tv.tv_sec = 0; + tv.tv_usec = (*msec)*1000; + } else { + tv.tv_sec = (*msec)/1000; + tv.tv_usec = 0; + } + n = select(fd+1, NULL, NULL, &fds, &tv); + gettimeofday(&end, NULL); + end.tv_sec -= start.tv_sec; + *msec -= (end.tv_sec * 1000 + end.tv_usec/1000 + - start.tv_usec/1000) + 1; + } + return n; +} diff --git a/systemd/SUSE-mdadm_env.sh b/systemd/SUSE-mdadm_env.sh new file mode 100644 index 00000000..10b2e749 --- /dev/null +++ b/systemd/SUSE-mdadm_env.sh @@ -0,0 +1,45 @@ +#!/bin/sh + +# extract configuration from /etc/sysconfig/mdadm and write +# environment to /run/sysconfig/mdadm to be used by +# systemd unit files. + +MDADM_SCAN="yes" + +# Following adapted from /etc/init.d/mdadmd on openSUSE + +mdadmd_CONFIG=/etc/sysconfig/mdadm +if test -r $mdadmd_CONFIG; then + . $mdadmd_CONFIG +fi + +if [ x$MDADM_DELAY != x"" ]; then + MDADM_DELAY="-d "$MDADM_DELAY; +fi + +if [ x$MDADM_MAIL != x"" ]; then + MDADM_MAIL="-m \"$MDADM_MAIL\"" +fi + +if [ x$MDADM_PROGRAM != x"" ]; then + MDADM_PROGRAM="-p \"$MDADM_PROGRAM\"" +fi + +if [ x$MDADM_SCAN = x"yes" ]; then + MDADM_SCAN="--scan" +else + MDADM_SCAN="" +fi + +if [ x$MDADM_SEND_MAIL_ON_START = x"yes" ]; then + MDADM_SEND_MAIL="-t" +else + MDADM_SEND_MAIL="" +fi + +if [ x$MDADM_CONFIG != x"" ]; then + MDADM_CONFIG="-c \"$MDADM_CONFIG\"" +fi + +mkdir -p /run/sysconfig +echo "MDADM_MONITOR_ARGS=$MDADM_RAIDDEVICES $MDADM_DELAY $MDADM_MAIL $MDADM_PROGRAM $MDADM_SCAN $MDADM_SEND_MAIL $MDADM_CONFIG" > /run/sysconfig/mdadm diff --git a/systemd/mdadm-grow-continue@.service b/systemd/mdadm-grow-continue@.service new file mode 100644 index 00000000..5c667d2a --- /dev/null +++ b/systemd/mdadm-grow-continue@.service @@ -0,0 +1,17 @@ +# This file is part of mdadm. +# +# mdadm is free software; you can redistribute it and/or modify it +# under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. + +[Unit] +Description=Manage MD Reshape on /dev/%I +DefaultDependencies=no + +[Service] +ExecStart=BINDIR/mdadm --grow --continue /dev/%I +StandardInput=null +StandardOutput=null +StandardError=null +KillMode=none diff --git a/systemd/mdadm-last-resort@.service b/systemd/mdadm-last-resort@.service new file mode 100644 index 00000000..5179f194 --- /dev/null +++ b/systemd/mdadm-last-resort@.service @@ -0,0 +1,7 @@ +[Unit] +Description=Activate md array even though degraded +DefaultDependencies=no + +[Service] +Type=oneshot +ExecStart=BINDIR/mdadm --run /dev/%i diff --git a/systemd/mdadm-last-resort@.timer b/systemd/mdadm-last-resort@.timer new file mode 100644 index 00000000..52b3f227 --- /dev/null +++ b/systemd/mdadm-last-resort@.timer @@ -0,0 +1,7 @@ +[Unit] +Description=Timer to wait for more drives before activating degraded array. +DefaultDependencies=no +Conflicts=sys-devices-virtual-block-%i.device + +[Timer] +OnActiveSec=30 diff --git a/systemd/mdadm.shutdown b/systemd/mdadm.shutdown new file mode 100644 index 00000000..33f27783 --- /dev/null +++ b/systemd/mdadm.shutdown @@ -0,0 +1,4 @@ +#!/bin/sh +# We need to ensure all md arrays with external metadata +# (e.g. IMSM, DDF) are clean before completing the shutdown. +BINDIR/mdadm --wait-clean --scan diff --git a/systemd/mdmon@.service b/systemd/mdmon@.service new file mode 100644 index 00000000..85a3a7c5 --- /dev/null +++ b/systemd/mdmon@.service @@ -0,0 +1,28 @@ +# This file is part of mdadm. +# +# mdadm is free software; you can redistribute it and/or modify it +# under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. + +[Unit] +Description=MD Metadata Monitor on /dev/%I +DefaultDependencies=no +Before=initrd-switch-root.target + +[Service] +# mdmon should never complain due to lack of a platform, +# that is mdadm's job if at all. +Environment=IMSM_NO_PLATFORM=1 +# The mdmon starting in the initramfs (with dracut at least) +# cannot see sysfs after root is mounted, so we will have to +# 'takeover'. As the '--offroot --takeover' don't hurt when +# not necessary, are are useful with root-on-md in dracut, +# have them always present. +ExecStart=BINDIR/mdmon --offroot --takeover %I +Type=forking +# Don't set the PIDFile. It isn't necessary (systemd can work +# it out) and systemd will remove it when transitioning from +# initramfs to rootfs. +#PIDFile=/run/mdadm/%I.pid +KillMode=none diff --git a/systemd/mdmonitor.service b/systemd/mdmonitor.service new file mode 100644 index 00000000..9aff2f56 --- /dev/null +++ b/systemd/mdmonitor.service @@ -0,0 +1,13 @@ +# This file is part of mdadm. +# +# mdadm is free software; you can redistribute it and/or modify it +# under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. + +[Unit] +Description=MD array monitor +DefaultDependencies=no + +[Service] +ExecStart=BINDIR/mdadm --monitor --scan @@ -0,0 +1,431 @@ +#!/bin/bash +# +# run test suite for mdadm +user=`id -un` +if [ " $user" != " root" ] +then echo >&2 "test: testing can only be done as 'root'." + exit 1; +fi + +prefix='[0-9][0-9]' + +dir=`pwd` +mdadm=$dir/mdadm +if [ \! -x $mdadm ] +then + echo >&2 "test: $mdadm isn't usable." +fi + +testdir="tests" +logdir="$testdir/logs" +logsave=0 +exitonerror=1 + +echo "Testing on linux-$(uname -r) kernel" + +# Check whether to run multipath tests +modprobe multipath 2> /dev/null +if grep -s 'Personalities : .*multipath' > /dev/null /proc/mdstat ; then + MULTIPATH="yes" +fi +INTEGRITY=yes +DEVTYPE=loop +LVM_VOLGROUP=mdtest + +# make sure to test local mdmon, not system one +export MDADM_NO_SYSTEMCTL=1 + +# assume md0, md1, md2 exist in /dev +md0=/dev/md0 md1=/dev/md1 md2=/dev/md2 +mdp0=/dev/md_d0 +mdp1=/dev/md_d1 + +# We test mdadm on loop-back block devices. +# dir for storing files should be settable by command line maybe +targetdir=/var/tmp +size=20000 +# super0, round down to multiple of 64 and substract 64 +mdsize0=19904 +# super00 is nested, subtract 128 +mdsize00=19840 +# super1.0 round down to multiple of 2, subtract 8 +mdsize1=19992 +mdsize1a=19988 +mdsize12=19988 +# super1.2 for linear: round to multiple of 2, subtract 4 +mdsize1_l=19996 +mdsize2_l=19996 +# subtract another 4 for bitmaps +mdsize1b=19988 +mdsize11=19992 +mdsize11a=19456 +mdsize12=19988 + +# ddf needs bigger devices as 32Meg is reserved! +ddfsize=65536 + +config=/tmp/mdadm.conf + +cleanup() { + udevadm settle + $mdadm -Ssq 2> /dev/null + case $DEVTYPE in + loop) + for d in 0 1 2 3 4 5 6 7 8 9 10 11 12 13 + do + losetup -d /dev/loop$d ; # rm -f $targetdir/mdtest$d + rm -f /dev/disk/by-path/loop* + done + ;; + lvm) + for d in 0 1 2 3 4 5 6 7 8 9 10 11 12 13 + do + eval "lvremove --quiet -f \$dev$d" + done + ;; + esac +} + +ctrl_c() { + exitonerror=1 +} + +do_setup() { + trap cleanup 0 1 3 15 + trap ctrl_c 2 + + # make sure there are no loop devices remaining. + # udev started things can sometimes prevent them being stopped + # immediately + while grep loop /proc/partitions > /dev/null 2>&1 + do + mdadm -Ss + losetup -d /dev/loop[0-9]* 2> /dev/null + sleep 1 + done + devlist= + for d in 0 1 2 3 4 5 6 7 8 9 10 11 12 13 + do + sz=$size + if [ $d -gt 7 ]; then sz=$ddfsize ; fi + case $DEVTYPE in + loop) + [ -f $targetdir/mdtest$d ] || dd if=/dev/zero of=$targetdir/mdtest$d count=$sz bs=1K > /dev/null 2>&1 + # make sure udev doesn't touch + mdadm --zero $targetdir/mdtest$d 2> /dev/null + [ -b /dev/loop$d ] || mknod /dev/loop$d b 7 $d + if [ $d -eq 7 ] + then + losetup /dev/loop$d $targetdir/mdtest6 # for multipath use + else + losetup /dev/loop$d $targetdir/mdtest$d + fi + eval dev$d=/dev/loop$d + eval file$d=$targetdir/mdtest$d + ;; + lvm) + unset MULTIPATH + eval dev$d=/dev/mapper/${LVM_VOLGROUP}-mdtest$d + if ! lvcreate --quiet -L ${sz}K -n mdtest$d $LVM_VOLGROUP; then + trap '' 0 # make sure lvremove is not called + eval echo error creating \$dev$d + exit 129 + fi + ;; + ram) + unset MULTIPATH + eval dev$d=/dev/ram$d + ;; + esac + eval devlist=\"\$devlist \$dev$d\" + eval devlist$d=\"\$devlist\" + #" <-- add this quote to un-confuse vim syntax highlighting + done + path0=$dev6 + path1=$dev7 + + ulimit -c unlimited + [ -f /proc/mdstat ] || modprobe md_mod + echo 2000 > /proc/sys/dev/raid/speed_limit_max + echo 0 > /sys/module/md_mod/parameters/start_ro +} + +# mdadm always adds --quiet, and we want to see any unexpected messages +mdadm() { + rm -f $targetdir/stderr + case $* in + *-S* ) udevadm settle + p=`cat /proc/sys/dev/raid/speed_limit_max` + echo 20000 > /proc/sys/dev/raid/speed_limit_max + esac + case $* in + *-C* ) $mdadm 2> $targetdir/stderr --quiet "$@" --auto=yes;; + * ) $mdadm 2> $targetdir/stderr --quiet "$@" + esac + rv=$? + case $* in + *-S* ) udevadm settle + echo $p > /proc/sys/dev/raid/speed_limit_max + esac + cat >&2 $targetdir/stderr + return $rv +} + +# check various things +check() { + case $1 in + spares ) + spares=`tr '] ' '\012\012' < /proc/mdstat | grep -c '(S)' || exit 0` + if [ $spares -ne $2 ] + then + echo >&2 "ERROR expected $2 spares, found $spares"; exit 1; + fi + ;; + raid* | linear ) + grep -s "active $1 " /proc/mdstat > /dev/null || { + echo >&2 "ERROR active $1 not found" ; cat /proc/mdstat ; exit 1;} + ;; + algorithm ) + grep -s " algorithm $2 " /proc/mdstat > /dev/null || { + echo >&2 "ERROR algorithm $2 not found"; cat /proc/mdstat; exit 1;} + ;; + resync | recovery | reshape) + cnt=5 + while ! grep -s $1 /proc/mdstat > /dev/null + do + if [ $cnt -gt 0 ] && grep -v idle /sys/block/md*/md/sync_action > /dev/null + then # Something isn't idle - wait a bit + sleep 0.5 + cnt=$[cnt-1] + else + echo >&2 ERROR no $1 happening; cat /proc/mdstat; exit 1 + fi + done + ;; + + nosync ) + sleep 0.5 + # Since 4.2 we delay the close of recovery until there has been a chance for + # spares to be activated. That means that a recovery that finds nothing + # to do can still take a little longer than expected. + # add an extra check: is sync_completed shows the end is reached, assume + # there is no recovery. + if grep -s -E '(resync|recovery|reshape) *=' > /dev/null /proc/mdstat ; then + incomplete=`grep / /sys/block/md*/md/sync_completed 2> /dev/null | sed '/^ *\([0-9]*\) \/ \1/d'` + if [ -n "$incomplete" ]; then + echo >&2 "ERROR resync or recovery is happening!"; cat /proc/mdstat ; exit 1; + fi + fi + ;; + + wait ) + p=`cat /proc/sys/dev/raid/speed_limit_max` + echo 2000000 > /proc/sys/dev/raid/speed_limit_max + sleep 0.1 + while grep -E '(resync|recovery|reshape|check|repair) *=' > /dev/null /proc/mdstat || + grep -v idle > /dev/null /sys/block/md*/md/sync_action + do sleep 0.5; + done + echo $p > /proc/sys/dev/raid/speed_limit_max + ;; + + state ) + grep -s "blocks.*\[$2\]\$" /proc/mdstat > /dev/null || { + echo >&2 "ERROR state $2 not found!"; cat /proc/mdstat ; exit 1; } + sleep 0.5 + ;; + + bitmap ) + grep -s bitmap > /dev/null /proc/mdstat || { + echo >&2 ERROR no bitmap ; cat /proc/mdstat ; exit 1; } + ;; + nobitmap ) + if grep -s "bitmap" > /dev/null /proc/mdstat + then + echo >&2 ERROR bitmap present ; cat /proc/mdstat ; exit 1; + fi + ;; + + * ) echo >&2 ERROR unknown check $1 ; exit 1; + esac +} + +no_errors() { + if [ -s $targetdir/stderr ] + then echo Bad errors from mdadm: ; cat $targetdir/stderr; exit 2; + fi +} +# basic device test + +testdev() { + udevadm settle + dev=$1 + cnt=$2 + dvsize=$3 + chunk=$4 + if [ -z "$5" ]; then + mkfs.ext3 -F -j $dev > /dev/null 2>&1 && fsck -fn $dev >&2 + fi + dsize=$[dvsize/chunk] + dsize=$[dsize*chunk] + rasize=$[dsize*2*cnt] + # rasize is in sectors + if [ -n "$DEV_ROUND_K" ]; then + rasize=$[rasize/DEV_ROUND_K/2] + rasize=$[rasize*DEV_ROUND_K*2] + fi + if [ `/sbin/blockdev --getsize $dev` -eq 0 ]; then sleep 2 ; fi + _sz=`/sbin/blockdev --getsize $dev` + if [ $rasize -lt $_sz -o $[rasize*4/5] -gt $_sz ] + then + echo "ERROR: size is wrong for $dev: $cnt * $dvsize (chunk=$chunk) = $rasize, not $_sz" + exit 1 + fi +} + +fast_sync() { + echo 200000 > /proc/sys/dev/raid/speed_limit_max +} + +rotest() { + dev=$1 + fsck -fn $dev >&2 +} + +do_test() { + _script=$1 + _basename=`basename $_script` + if [ -f "$_script" ] + then + rm -f $targetdir/stderr + # stop all arrays, just incase some script left an array active. + $mdadm -Ssq 2> /dev/null + mdadm --zero $devlist 2> /dev/null + mdadm --zero $devlist 2> /dev/null + # this might have been reset: restore the default. + echo 2000 > /proc/sys/dev/raid/speed_limit_max + # source script in a subshell, so it has access to our + # namespace, but cannot change it. + echo -ne "$_script... " + if ( set -ex ; . $_script ) &> $targetdir/log + then + echo "succeeded" + _fail=0 + else + log=log + cat $targetdir/stderr >> $targetdir/log + echo "=======================dmesg=================" >> $targetdir/log + dmesg | tail -n 200 >> $targetdir/log + if [ $exitonerror == 0 ]; then + log=log-`basename $_script` + mv $targetdir/log $logdir/$log + fi + echo "FAILED - see $logdir/$log for details" + _fail=1 + fi + if [ "$savelogs" == "1" ]; then + cp $targetdir/log $logdir/$_basename.log + fi + if [ "$_fail" == "1" -a "$exitonerror" == "1" ]; then + exit 1 + fi + fi +} + +do_help() { + echo "Usage: $0 [options]" + echo " Options:" + echo " --tests=<test1,test2,..> Comma separated list of tests to run" + echo " --disable-multipath Disable any tests involving multipath" + echo " --disable-integrity Disable slow tests of RAID[56] consistency" + echo " --logdir=<directory> Directory to save logfiles in" + echo " --save-logs Save all logs in <logdir>" + echo " --keep-going Don't stop on error, ie. run all tests" + echo " --dev=[loop|lvm|ram] Use loop devices (default), LVM, or RAM disk" + echo " --volgroup=<name> LVM volume group for LVM test" + echo " setup Setup test environment and exit" + echo " cleanup Cleanup test environment" + echo " <prefix> Run tests with <prefix>" +} + +parse_args() { + for i in $* + do + case $i in + [0-9]*) + prefix=$i + ;; + setup) + echo "mdadm test environment setup" + do_setup + trap 0; exit 0 + ;; + cleanup) + cleanup + exit 0 + ;; + --tests=*) + TESTLIST=`expr "x$i" : 'x[^=]*=\(.*\)' | sed -e 's/,/ /g'` + ;; + --logdir=*) + logdir=`expr "x$i" : 'x[^=]*=\(.*\)'` + ;; + --save-logs) + savelogs=1 + ;; + --keep-going | --no-error) + exitonerror=0 + ;; + --disable-multipath) + unset MULTIPATH + ;; + --disable-integrity) + unset INTEGRITY + ;; + --dev=loop) + DEVTYPE=loop + ;; + --dev=lvm) + DEVTYPE=lvm + ;; + --dev=ram) + DEVTYPE=ram + ;; + --volgroup=*) + LVM_VOLGROUP=`expr "x$i" : 'x[^=]*=\(.*\)'` + ;; + --help) + do_help + exit 0; + ;; + -*) + echo " $0: Unknown argument: $i" + do_help + exit 0; + ;; + esac +done +} + +logdir=$targetdir +parse_args $@ + +do_setup +mkdir -p $logdir + +if [ "$savelogs" == "1" ]; then + echo "Saving logs to $logdir" +fi + +if [ "x$TESTLIST" != "x" ]; then + for script in $TESTLIST + do + do_test $testdir/$script + done +else + for script in $testdir/$prefix $testdir/$prefix*[^~] + do + do_test $script + done +fi +exit 0 diff --git a/tests/00linear b/tests/00linear new file mode 100644 index 00000000..e3ac6555 --- /dev/null +++ b/tests/00linear @@ -0,0 +1,25 @@ + +# create a simple linear + +mdadm -CR $md0 -l linear -n3 $dev0 $dev1 $dev2 +check linear +testdev $md0 3 $mdsize2_l 1 +mdadm -S $md0 + +# now with version-0.90 superblock +mdadm -CR $md0 -e0.90 --level=linear -n4 $dev0 $dev1 $dev2 $dev3 +check linear +testdev $md0 4 $mdsize0 1 +mdadm -S $md0 + +# now with version-1.0 superblock +mdadm -CR $md0 -e1.0 --level=linear -n4 $dev0 $dev1 $dev2 $dev3 +check linear +testdev $md0 4 $mdsize1 1 +mdadm -S $md0 + +# now with no superblock +mdadm -B $md0 -l linear -n5 $dev0 $dev1 $dev2 $dev3 $dev4 +check linear +testdev $md0 5 $size 64 +mdadm -S $md0 diff --git a/tests/00multipath b/tests/00multipath new file mode 100644 index 00000000..84e4d693 --- /dev/null +++ b/tests/00multipath @@ -0,0 +1,29 @@ + +# +# create a multipath, and fail and stuff + +if [ "$MULTIPATH" != "yes" ]; then + echo -ne 'skipping... ' + exit 0 +fi + +mdadm -CR $md1 -l multipath -n2 $path0 $path1 + +testdev $md1 1 $mdsize12 1 + +mdadm $md1 -f $path0 +rotest $md1 +testdev $md1 1 $mdsize12 1 + +mdadm $md1 -r $path0 +mdadm $md1 -a $path0 + +rotest $md1 +testdev $md1 1 $mdsize12 1 + +mdadm $md1 -f $path1 +mdadm $md1 -r $path1 +rotest $md1 +testdev $md1 1 $mdsize12 1 + +mdadm -S $md1 diff --git a/tests/00names b/tests/00names new file mode 100644 index 00000000..7a066d8f --- /dev/null +++ b/tests/00names @@ -0,0 +1,13 @@ +set -x -e + +# create arrays with non-numeric names +conf=$targetdir/mdadm.conf +echo "CREATE names=yes" > $conf + +for i in linear raid0 raid1 raid4 raid5 raid6 +do + mdadm -CR --config $conf /dev/md/$i -l $i -n 4 $dev4 $dev3 $dev2 $dev1 + check $i + [ -d /sys/class/block/md_$i/md ] + mdadm -S md_$i +done diff --git a/tests/00raid0 b/tests/00raid0 new file mode 100644 index 00000000..8bc18985 --- /dev/null +++ b/tests/00raid0 @@ -0,0 +1,43 @@ + +# create a simple raid0 + +mdadm -CR $md0 -l raid0 -n3 $dev0 $dev1 $dev2 +check raid0 +testdev $md0 3 $mdsize2_l 512 +mdadm -S $md0 + +# now with version-0.90 superblock +mdadm -CR $md0 -e0.90 -l0 -n4 $dev0 $dev1 $dev2 $dev3 +check raid0 +testdev $md0 4 $mdsize0 512 +mdadm -S $md0 + +# now with no superblock +mdadm -B $md0 -l0 -n5 $dev0 $dev1 $dev2 $dev3 $dev4 +check raid0 +testdev $md0 5 $size 512 +mdadm -S $md0 + + +# now same again with different chunk size +for chunk in 4 32 256 +do + mdadm -CR $md0 -e0.90 -l raid0 --chunk $chunk -n3 $dev0 $dev1 $dev2 + check raid0 + testdev $md0 3 $mdsize0 $chunk + mdadm -S $md0 + + # now with version-1 superblock + mdadm -CR $md0 -e1.0 -l0 -c $chunk -n4 $dev0 $dev1 $dev2 $dev3 + check raid0 + testdev $md0 4 $mdsize1 $chunk + mdadm -S $md0 + + # now with no superblock + mdadm -B $md0 -l0 -n5 --chun=$chunk $dev0 $dev1 $dev2 $dev3 $dev4 + check raid0 + testdev $md0 5 $size $chunk + mdadm -S $md0 + +done +exit 0 diff --git a/tests/00raid1 b/tests/00raid1 new file mode 100644 index 00000000..c93465d8 --- /dev/null +++ b/tests/00raid1 @@ -0,0 +1,34 @@ + +# create a simple mirror +# test version0, version1, and no super +# test resync and recovery. + +mdadm -CR $md0 -l 1 -n2 $dev0 $dev1 +check resync +check raid1 +testdev $md0 1 $mdsize1a 64 +mdadm -S $md0 + +# now with version-0.90 superblock, spare +mdadm -CR $md0 -e0.90 --level=raid1 -n3 -x2 $dev0 missing missing $dev1 $dev2 +check recovery +check raid1 +testdev $md0 1 $mdsize0 64 +mdadm -S $md0 + +# now with no superblock +mdadm -B $md0 -l mirror -n2 $dev0 $dev1 +check resync +check raid1 +testdev $md0 1 $size 1 +mdadm -S $md0 + +# again, but with no resync +mdadm -B $md0 -l 1 --assume-clean -n2 $dev0 $dev1 +check raid1 +check nosync +testdev $md0 1 $size 1 +mdadm -S $md0 + + +exit 0 diff --git a/tests/00raid10 b/tests/00raid10 new file mode 100644 index 00000000..796b9702 --- /dev/null +++ b/tests/00raid10 @@ -0,0 +1,18 @@ + +# Create some raid10 arrays, all with 6 devices and one spare +devs="$dev0 $dev1 $dev2 $dev3 $dev4 $dev5 $dev6" + +for lo in n2 n3 f2 f3 +do + cm=1 + case $lo in + f2 ) m=3 cm=2;; + f3 ) m=2 cm=3;; + n2 ) m=3;; + n3 ) m=2;; + esac + mdadm --create --run --level=raid10 --layout $lo --raid-disks 6 -x 1 $md0 $devs + check resync ; check raid10 + testdev $md0 $m $mdsize1 $[512*cm] + mdadm -S $md0 +done diff --git a/tests/00raid4 b/tests/00raid4 new file mode 100644 index 00000000..00a14f2f --- /dev/null +++ b/tests/00raid4 @@ -0,0 +1,16 @@ + +# create a simple raid4 set + +mdadm -CfR $md0 -l 4 -n3 $dev0 $dev1 $dev2 +check resync ; check raid[45] +testdev $md0 2 $mdsize1 512 +mdadm -S $md0 + +# now with version-1 superblock +mdadm -CR $md0 -e1 --level=raid4 -n4 $dev0 $dev1 $dev2 $dev3 +check recovery; check raid[45] +testdev $md0 3 $mdsize1 512 +mdadm -S $md0 + + +exit 0 diff --git a/tests/00raid5 b/tests/00raid5 new file mode 100644 index 00000000..b2b7a971 --- /dev/null +++ b/tests/00raid5 @@ -0,0 +1,33 @@ + +# create a simple raid5 set + +mdadm -CfR $md0 -e 0.90 -l 5 -n3 $dev0 $dev1 $dev2 +check resync +testdev $md0 2 $mdsize0 512 +mdadm -S $md0 + +# now with version-1 superblock +mdadm -CR $md0 -e1 --level=raid5 -n4 $dev0 $dev1 $dev2 $dev3 +check recovery +testdev $md0 3 $mdsize1 512 +mdadm -S $md0 + +# now same again with explicit layout + +for lo in la ra left-symmetric right-symmetric +do + + mdadm -CfR $md0 -l 5 -p $lo -n3 $dev0 $dev1 $dev2 + check resync ; check raid5 + testdev $md0 2 $mdsize1 512 + mdadm -S $md0 + + # now with version-1 superblock + mdadm -CR $md0 -e1 --level=raid5 --layout $lo -n4 $dev0 $dev1 $dev2 $dev3 + check recovery ; check raid5 + testdev $md0 3 $mdsize1 512 + mdadm -S $md0 + +done + +exit 0 diff --git a/tests/00raid6 b/tests/00raid6 new file mode 100644 index 00000000..6977af9b --- /dev/null +++ b/tests/00raid6 @@ -0,0 +1,16 @@ + +# create a simple raid6 set + +mdadm -CfR $md0 -e0.90 -l 6 -n4 $dev0 $dev1 $dev2 $dev3 +check resync ; check raid6 +testdev $md0 2 $mdsize0 512 +mdadm -S $md0 + +# now with version-1 superblock +mdadm -CR $md0 -e1 --level=raid6 -n5 $dev0 $dev1 $dev2 $dev3 $dev4 +check resync ; check raid6 +testdev $md0 3 $mdsize1 512 +mdadm -S $md0 + + +exit 0 diff --git a/tests/01r1fail b/tests/01r1fail new file mode 100644 index 00000000..389b813f --- /dev/null +++ b/tests/01r1fail @@ -0,0 +1,29 @@ + +# create a raid1, fail and remove a drive during initial sync +# Add two more, fail and remove one +# wait for sync to complete, fail, remove, re-add + +mdadm -CR $md0 -l1 -n4 $dev0 $dev1 $dev2 missing +check resync +mdadm $md0 --fail $dev2 +check resync +mdadm $md0 --fail $dev1 +sleep 1 +check nosync +check state U___ +mdadm $md0 --add $dev4 $dev3 +check recovery +# there could be two separate recoveries, one for each dev +check wait +check wait +mdadm $md0 --remove $dev2 $dev1 +check nosync +check state UUU_ + +mdadm --zero-superblock $dev2 +mdadm $md0 -a $dev2 +check recovery +check wait +check state UUUU + +mdadm -S $md0 diff --git a/tests/01r5fail b/tests/01r5fail new file mode 100644 index 00000000..873dba58 --- /dev/null +++ b/tests/01r5fail @@ -0,0 +1,27 @@ + + +# create a raid5, fail and remove a drive during initial sync +# Add two more, fail and remove one +# wait for sync to complete, fail, remove, re-add + +mdadm -CR $md0 -l5 -n4 $dev0 $dev1 $dev2 $dev3 +check recovery +mdadm $md0 --fail $dev3 +sleep 1 +check nosync +check state UUU_ + +mdadm $md0 --add $dev4 $dev5 +check recovery +check wait +mdadm $md0 --fail $dev0 +mdadm $md0 --remove $dev3 $dev0 +check recovery +check state _UUU + +mdadm $md0 -a $dev3 +check recovery +check wait +check state UUUU + +mdadm -S $md0
\ No newline at end of file diff --git a/tests/01r5integ b/tests/01r5integ new file mode 100644 index 00000000..48676a22 --- /dev/null +++ b/tests/01r5integ @@ -0,0 +1,33 @@ + +# Check integrity of raid5 in degraded mode +# Create a 4 disk raid5, create a filesystem and +# sha1sum it with each device failed + +if [ "$INTEGRITY" != "yes" ]; then + echo -ne 'skipping... ' + exit 0 +fi + +for layout in ls rs la ra +do + mdadm -CR $md0 -l5 --layout $layout -n4 $dev0 $dev1 $dev2 $dev3 + check wait + tar cf - /etc > $md0 + sum=`sha1sum $md0` + + for i in $dev0 $dev1 $dev2 $dev3 + do + mdadm $md0 -f $i + mdadm $md0 -r $i + blockdev --flushbufs $md0 + sum1=`sha1sum $md0` + if [ "$sum" != "$sum1" ] + then + echo $sum does not match $sum1 with $i missing + exit 1 + fi + mdadm $md0 -a $i + while ! (check state 'U*'); do check wait; sleep 0.2; done + done + mdadm -S $md0 +done diff --git a/tests/01raid6integ b/tests/01raid6integ new file mode 100644 index 00000000..12f4d81b --- /dev/null +++ b/tests/01raid6integ @@ -0,0 +1,57 @@ + +# Check integrity of raid6 in degraded modes +# Create a 5 disk raid6, dump some data to it, then +# sha1sum it with different pairs of devices failed + +if [ "$INTEGRITY" != "yes" ]; then + echo -ne 'skipping... ' + exit 0 +fi + +layouts='ls rs la ra' +lv=`uname -r` +if expr $lv '>=' 2.6.30 > /dev/null +then + layouts="$layouts parity-first ddf-zero-restart ddf-N-restart ddf-N-continue \ + left-asymmetric-6 right-asymmetric-6 left-symmetric-6 right-symmetric-6 parity-first-6" +fi + +for layout in $layouts +do + mdadm -CR $md0 -l6 --layout $layout -n5 $dev0 $dev1 $dev2 $dev3 $dev4 + check wait + tar cf - /etc > $md0 + sum=`sha1sum $md0` + + totest= + for second in $dev0 $dev1 $dev2 $dev3 $dev4 + do + mdadm $md0 -f $second + mdadm $md0 -r $second + blockdev --flushbufs $md0 + sum1=`sha1sum $md0` + if [ "$sum" != "$sum1" ] + then + echo $sum does not match $sum1 with $second missing + exit 1 + fi + for first in $totest + do + mdadm $md0 -f $first + mdadm $md0 -r $first + blockdev --flushbufs $md0 + sum1=`sha1sum $md0` + if [ "$sum" != "$sum1" ] + then + echo $sum does not match $sum1 with $first and $second missing + exit 1 + fi + mdadm $md0 -a $first + while ! (check state 'U*_U*'); do check wait; sleep 0.2; done + done + mdadm $md0 -a $second + while ! (check state 'U*'); do check wait; sleep 0.2; done + totest="$totest $second" + done + mdadm -S $md0 +done diff --git a/tests/01replace b/tests/01replace new file mode 100644 index 00000000..6223a223 --- /dev/null +++ b/tests/01replace @@ -0,0 +1,52 @@ +set -x -e + +## test --replace for raid5 raid6 raid1 and raid10 +#1/ after replace, can remove replaced device +#2/ after --replace-with cannot remove the 'with' device +#3/ preserve integrity with concurrent failure + +for level in 1 5 6 10 +do + dd if=/dev/zero of=$dev4 bs=1M || true + dd if=/dev/zero of=$dev5 bs=1M || true + mdadm -CR $md0 -l $level -n4 -x2 $devlist5 + dd if=/dev/urandom of=$md0 bs=1M || true + sum=`sha1sum < $md0` + check wait + mdadm $md0 --replace $dev1 + check wait + mdadm $md0 --remove $dev1 + mdadm $md0 --remove $dev5 && exit 1 + mdadm -S $md0 + dd if=/dev/zero of=$dev4 bs=1M || true + dd if=/dev/zero of=$dev5 bs=1M || true + mdadm -CR $md0 -l $level -n4 -x2 $devlist5 + check wait + sum1=`sha1sum < $md0` + [ "$sum" == "$sum1" ] + + mdadm $md0 --replace $dev1 --with $dev4 + check wait + mdadm $md0 --remove $dev1 + mdadm $md0 --remove $dev5 + mdadm $md0 --remove $dev4 && exit 1 + + mdadm $md0 --add $dev1 $dev5 + mdadm $md0 --replace $dev0 + sleep 1 + mdadm $md0 --fail $dev2 + check wait + sum2=`sha1sum < $md0` + [ "$sum" == "$sum2" ] + + mdadm $md0 --remove $dev0 $dev2 + mdadm $md0 --add $dev0 $dev2 + mdadm $md0 --replace $dev3 + sleep 1 + mdadm $md0 --fail $dev0 $dev2 + check wait + sum3=`sha1sum < $md0` + [ "$sum" == "$sum3" ] + + mdadm -S $md0 +done diff --git a/tests/02lineargrow b/tests/02lineargrow new file mode 100644 index 00000000..e05c219d --- /dev/null +++ b/tests/02lineargrow @@ -0,0 +1,23 @@ + +# create a liner array, and add more drives to to. + +for e in 0.90 1 1.1 1.2 +do + case $e in + 0.90 ) sz=$mdsize0 ;; + 1 ) sz=$mdsize2_l ;; + 1.0 ) sz=$mdsize1 ;; + 1.1 ) sz=$mdsize1_l ;; + 1.2 ) sz=$mdsize2_l ;; + esac + mdadm -CRf $md0 --level linear -e $e --raid-disks=1 $dev1 + testdev $md0 1 $sz 1 + + mdadm --grow $md0 --add $dev2 + testdev $md0 2 $sz 1 + + mdadm --grow $md0 --add $dev3 + testdev $md0 3 $sz 1 + + mdadm -S $md0 +done diff --git a/tests/02r1add b/tests/02r1add new file mode 100644 index 00000000..757f6965 --- /dev/null +++ b/tests/02r1add @@ -0,0 +1,40 @@ + +# Make a raid1, add a device, then remove it again. + +mdadm -CR $md0 -l1 -n2 -x1 $dev0 $dev1 $dev2 +check resync +check wait +check state UU + +mdadm --grow $md0 -n 3 +check recovery +check wait +check state UUU + +mdadm $md0 --fail $dev0 +check state _UU + +mdadm --grow $md0 -n 2 +check state UU + +mdadm -S $md0 +# same again for version-1 + + +mdadm -CR $md0 -l1 -n2 -e1.2 -x1 $dev0 $dev1 $dev2 +check resync +check wait +check state UU + +mdadm --grow $md0 -n 3 +check recovery +check wait +check state UUU + +mdadm $md0 --fail $dev0 +check state _UU + +mdadm --grow $md0 -n 2 +check state UU + +mdadm -S $md0 diff --git a/tests/02r1grow b/tests/02r1grow new file mode 100644 index 00000000..5754c88b --- /dev/null +++ b/tests/02r1grow @@ -0,0 +1,36 @@ + + +# create a small raid1 array, make it larger. Then make it smaller + +mdadm -CR $md0 -e 0.90 --level raid1 --raid-disks 3 --size $[size/2] $dev1 $dev2 $dev3 +check wait +check state UUU +testdev $md0 1 $[size/2] 1 + +mdadm --grow $md0 --size max +check resync +check wait +testdev $md0 1 $mdsize0 1 + +mdadm --grow $md0 --size $[size/2] +check nosync +testdev $md0 1 $[size/2] 1 + +mdadm -S $md0 + +# same again with version 1.1 superblock +mdadm -CR $md0 --level raid1 --metadata=1.1 --raid-disks 3 --size $[size/2] $dev1 $dev2 $dev3 +check wait +check state UUU +testdev $md0 1 $[size/2] 1 + +mdadm --grow $md0 --size max +check resync +check wait +testdev $md0 1 $mdsize1_l 1 + +mdadm --grow $md0 --size $[size/2] +check nosync +testdev $md0 1 $[size/2] 1 + +mdadm -S $md0 diff --git a/tests/02r5grow b/tests/02r5grow new file mode 100644 index 00000000..386e82ee --- /dev/null +++ b/tests/02r5grow @@ -0,0 +1,36 @@ + + +# create a small raid5 array, make it larger. Then make it smaller + +mdadm -CR $md0 -e0.90 --level raid5 --chunk=64 --raid-disks 3 --size $[size/2] $dev1 $dev2 $dev3 +check wait +check state UUU +testdev $md0 2 $[size/2] 32 + +mdadm --grow $md0 --size max +check resync +check wait +testdev $md0 2 $mdsize0 32 + +mdadm --grow $md0 --size $[size/2] +check nosync +testdev $md0 2 $[size/2] 32 + +mdadm -S $md0 + +# same again with version 1.1 superblock +mdadm -CR $md0 --level raid5 --metadata=1.1 --chunk=128 --raid-disks 4 --size $[size/2] $dev1 $dev2 $dev3 $dev4 +check wait +check state UUUU +testdev $md0 3 $[size/2] 128 + +mdadm --grow $md0 --size max +check resync +check wait +testdev $md0 3 $[mdsize1_l] 128 + +mdadm --grow $md0 --size $[size/2] +check nosync +sh tests/testdev $md0 3 $[size/2] 128 + +mdadm -S $md0 diff --git a/tests/02r6grow b/tests/02r6grow new file mode 100644 index 00000000..759e6275 --- /dev/null +++ b/tests/02r6grow @@ -0,0 +1,36 @@ + + +# create a small raid6 array, make it larger. Then make it smaller + +mdadm -CR $md0 -e 0.90 --level raid6 --chunk=64 --raid-disks 4 --size $[size/2] $dev1 $dev2 $dev3 $dev4 +check wait +check state UUUU +testdev $md0 2 $[size/2] 32 + +mdadm --grow $md0 --size max +check resync +check wait +testdev $md0 2 $mdsize0 32 + +mdadm --grow $md0 --size $[size/2] +check nosync +testdev $md0 2 $[size/2] 32 + +mdadm -S $md0 + +# same again with version 1.1 superblock +mdadm -CR $md0 --level raid6 --metadata=1.1 --chunk=128 --raid-disks 4 --size $[size/2] $dev1 $dev2 $dev3 $dev4 +check wait +check state UUUU +testdev $md0 2 $[size/2] 128 + +mdadm --grow $md0 --size max +check resync +check wait +testdev $md0 2 $[mdsize1_l] 128 + +mdadm --grow $md0 --size $[size/2] +check nosync +testdev $md0 2 $[size/2] 128 + +mdadm -S $md0 diff --git a/tests/03assem-incr b/tests/03assem-incr new file mode 100644 index 00000000..f10a1a48 --- /dev/null +++ b/tests/03assem-incr @@ -0,0 +1,17 @@ +set -x -e + +# Test interaction between -I and -A +# there are locking issue too, but those are hard to test for. +# +# Here just test that a partly "-I" assembled array can +# be completed with "-A" + +for l in 0 1 5 linear +do + mdadm -CR $md0 -l $l -n5 $dev0 $dev1 $dev2 $dev3 $dev4 --assume-clean + mdadm -S md0 + mdadm -I $dev1 + mdadm -I $dev3 + mdadm -A /dev/md0 $dev0 $dev1 $dev2 $dev3 $dev4 + mdadm -S /dev/md0 +done diff --git a/tests/03r0assem b/tests/03r0assem new file mode 100644 index 00000000..6744e322 --- /dev/null +++ b/tests/03r0assem @@ -0,0 +1,137 @@ + +# create a raid0 array from 3 devices, and assemble it in a multitude of ways. +# explicitly list devices +# uuid, md-minor on command line with wildcard devices +# mdadm.conf file + +mdadm -CR $md2 -l0 -n3 $dev0 $dev1 $dev2 +check raid0 +tst="testdev $md2 3 $mdsize1_l 512" +$tst +uuid=`mdadm -Db $md2 | sed 's/.*UUID=//'` +mdadm -S $md2 + +mdadm -A $md2 $dev0 $dev1 $dev2 +$tst +mdadm -S $md2 + +mdadm -A $md2 -u $uuid $devlist +$tst +mdadm -S $md2 + +mdadm --assemble $md2 --name=2 $devlist +$tst +mdadm -S $md2 + +conf=$targetdir/mdadm.conf +{ + echo DEVICE $devlist + echo array $md2 UUID=$uuid +} > $conf + +mdadm -As -c $conf $md2 +$tst +mdadm -S $md2 + +{ + echo DEVICE $devlist + echo array $md2 name=2 +} > $conf + +mdadm -As -c $conf $md2 +$tst +mdadm -S $md2 + + +{ + echo DEVICE $devlist + echo array $md2 devices=$dev0,$dev1,$dev2 +} > $conf + +mdadm -As -c $conf $md2 +$tst + +echo "DEVICE $devlist" > $conf +mdadm -Db $md2 >> $conf +mdadm -S $md2 + +mdadm --assemble --scan --config=$conf $md2 +$tst +mdadm -S $md2 + +echo " metadata=0.90 devices=$dev0,$dev1,$dev2" >> $conf +mdadm --assemble --scan --config=$conf $md2 +$tst +mdadm -S $md2 + + +### Now for version 0... + +mdadm --zero-superblock $dev0 $dev1 $dev2 +mdadm -CR $md2 -l0 --metadata=0.90 -n3 $dev0 $dev1 $dev2 +check raid0 +tst="testdev $md2 3 $mdsize0 512" +$tst + +uuid=`mdadm -Db $md2 | sed 's/.*UUID=//'` +mdadm -S $md2 + +mdadm -A $md2 $dev0 $dev1 $dev2 +$tst +mdadm -S $md2 + +mdadm -A $md2 -u $uuid $devlist +$tst +mdadm -S $md2 + +mdadm --assemble $md2 --super-minor=2 $devlist # +$tst +mdadm -S $md2 + +conf=$targetdir/mdadm.conf +{ + echo DEVICE $devlist + echo array $md2 UUID=$uuid +} > $conf + +mdadm -As -c $conf $md2 +$tst +mdadm -S $md2 + +{ + echo DEVICE $devlist + echo array $md2 super-minor=2 +} > $conf + +mdadm -As -c $conf $md2 +$tst +mdadm -S $md2 + + +{ + echo DEVICE $devlist + echo array $md2 devices=$dev0,$dev1,$dev2 +} > $conf + +mdadm -As -c $conf $md2 +$tst + +echo "DEVICE $devlist" > $conf +mdadm -Db $md2 >> $conf +mdadm -S $md2 + +mdadm --assemble --scan --config=$conf $md2 +$tst +mdadm -S $md2 + +echo " metadata=1 devices=$dev0,$dev1,$dev2" >> $conf +mdadm --assemble --scan --config=$conf $md2 +$tst +mdadm -S $md2 + +# Now use incremental assembly. +mdadm -I --config=$conf $dev0 +mdadm -I --config=$conf $dev1 +mdadm -I --config=$conf $dev2 +$tst +mdadm -S $md2 diff --git a/tests/03r5assem b/tests/03r5assem new file mode 100644 index 00000000..0c7fb8c6 --- /dev/null +++ b/tests/03r5assem @@ -0,0 +1,109 @@ + +# create a raid5 array and assemble it in various ways, +# including with missing devices. + +mdadm -CR -e 0.90 $md1 -l5 -n3 $dev0 $dev1 $dev2 +tst="check raid5 ;testdev $md1 2 $mdsize0 512 ; mdadm -S $md1" +uuid=`mdadm -Db $md1 | sed 's/.*UUID=//'` +check wait +eval $tst + +mdadm -A $md1 $dev0 $dev1 $dev2 +eval $tst + +mdadm -A $md1 -u $uuid $devlist +eval $tst + +mdadm -A $md1 -m 1 $devlist +eval $tst + + +conf=$targetdir/mdadm.conf +{ + echo DEVICE $devlist + echo array $md1 UUID=$uuid +} > $conf + +mdadm -As -c $conf $md1 +eval $tst + +{ + echo DEVICE $devlist + echo array $md1 super-minor=1 +} > $conf + +mdadm -As -c $conf +eval $tst + +{ + echo DEVICE $devlist + echo array $md1 devices=$dev0,$dev1,$dev2 +} > $conf + +mdadm -As -c $conf + +echo "DEVICE $devlist" > $conf +mdadm -Db $md1 >> $conf +eval $tst + +mdadm --assemble --scan --config=$conf $md1 +eval $tst + +echo " metadata=0.90 devices=$dev0,$dev1,$dev2" >> $conf +mdadm --assemble --scan --config=$conf $md1 +eval $tst + +### Now with a missing device + +mdadm -AR $md1 $dev0 $dev2 # +check state U_U +eval $tst + +mdadm -A $md1 -u $uuid $devlist +check state U_U +eval $tst + +mdadm -A $md1 -m 1 $devlist +check state U_U +eval $tst + + +conf=$targetdir/mdadm.conf +{ + echo DEVICE $devlist + echo array $md1 UUID=$uuid +} > $conf + +mdadm -As -c $conf $md1 +check state U_U +eval $tst + +{ + echo DEVICE $devlist + echo array $md1 super-minor=1 +} > $conf + +mdadm -As -c $conf +check state U_U +eval $tst + +{ + echo DEVICE $devlist + echo array $md1 devices=$dev0,$dev1,$dev2 +} > $conf + +mdadm -As -c $conf + +echo "DEVICE $devlist" > $conf +mdadm -Db $md1 >> $conf +check state U_U +eval $tst + +mdadm --assemble --scan --config=$conf $md1 +check state U_U +eval $tst + +echo " metadata=0.90 devices=$dev0,$dev1,$dev2" >> $conf +mdadm --assemble --scan --config=$conf $md1 +check state U_U +eval $tst diff --git a/tests/03r5assem-failed b/tests/03r5assem-failed new file mode 100644 index 00000000..d38241df --- /dev/null +++ b/tests/03r5assem-failed @@ -0,0 +1,12 @@ + +# Create an array, fail one device while array is active, stop array, +# then re-assemble listing the failed device first. + +mdadm -CR $md1 -l5 -n4 $dev0 $dev1 $dev2 $dev3 +check wait + +echo 2000 > /sys/block/md1/md/safe_mode_delay +mkfs $md1 +mdadm $md1 -f $dev0 +mdadm -S $md1 +mdadm -A $md1 $dev0 $dev1 $dev2 $dev3 || exit 1 diff --git a/tests/03r5assemV1 b/tests/03r5assemV1 new file mode 100644 index 00000000..bca0c583 --- /dev/null +++ b/tests/03r5assemV1 @@ -0,0 +1,128 @@ + +# create a v-1 raid5 array and assemble in various ways + +mdadm -CR -e1 --name one $md1 -l5 -n3 -x2 $dev0 $dev1 $dev2 $dev3 $dev4 +tst="check raid5 ;testdev $md1 2 $mdsize1 512 ; mdadm -S $md1" +uuid=`mdadm -Db $md1 | sed 's/.*UUID=//'` +check wait + +eval $tst + +mdadm -A $md1 $dev0 $dev1 $dev2 +mdadm $md1 --add $dev3 $dev4 +check spares 2 +eval $tst + +mdadm -A $md1 -u $uuid $devlist +check spares 2 +eval $tst + +mdadm -A $md1 --name one $devlist +check spares 2 +eval $tst + + +conf=$targetdir/mdadm.conf +{ + echo DEVICE $devlist + echo array $md1 UUID=$uuid +} > $conf + +mdadm -As -c $conf $md1 +eval $tst + +{ + echo DEVICE $devlist + echo array $md1 name=one +} > $conf + +mdadm -As -c $conf +eval $tst + +{ + echo DEVICE $devlist + echo array $md1 devices=$dev0,$dev1,$dev2,$dev3,$dev4 +} > $conf + +mdadm -As -c $conf + +echo "DEVICE $devlist" > $conf +mdadm -Db $md1 >> $conf +eval $tst +mdadm --assemble --scan --config=$conf $md1 +eval $tst +echo PING >&2 + +echo " metadata=1.0 devices=$dev0,$dev1,$dev2,$dev3,$dev4" >> $conf +mdadm --assemble --scan --config=$conf $md1 +eval $tst + +### Now with a missing device +# We don't want the recovery to complete while we are +# messing about here. +echo 100 > /proc/sys/dev/raid/speed_limit_max +echo 100 > /proc/sys/dev/raid/speed_limit_min + +mdadm -AR $md1 $dev0 $dev2 $dev3 $dev4 # +check state U_U +check spares 1 +eval $tst + +mdadm -A $md1 -u $uuid $devlist +check state U_U +eval $tst + +mdadm -A $md1 --name=one $devlist +check state U_U +check spares 1 +eval $tst + + +conf=$targetdir/mdadm.conf +{ + echo DEVICE $devlist + echo array $md1 UUID=$uuid +} > $conf + +mdadm -As -c $conf $md1 +check state U_U +eval $tst + +{ + echo DEVICE $devlist + echo array $md1 name=one +} > $conf + +mdadm -As -c $conf +check state U_U +eval $tst + +{ + echo DEVICE $devlist + echo array $md1 devices=$dev0,$dev1,$dev2 +} > $conf + +mdadm -As -c $conf + +echo "DEVICE $devlist" > $conf +mdadm -Db $md1 >> $conf +check state U_U +eval $tst + +mdadm --assemble --scan --config=$conf $md1 +check state U_U +eval $tst + +echo " metadata=1.0 devices=$dev0,$dev1,$dev2" >> $conf +mdadm --assemble --scan --config=$conf $md1 +check state U_U +eval $tst + +# And now assemble with -I +mdadm -Ss +mdadm -I -c $conf $dev0 +mdadm -I -c $conf $dev1 +mdadm -I -c $conf $dev2 +eval $tst +echo 2000 > /proc/sys/dev/raid/speed_limit_max +echo 1000 > /proc/sys/dev/raid/speed_limit_min diff --git a/tests/04r0update b/tests/04r0update new file mode 100644 index 00000000..73ee3b9f --- /dev/null +++ b/tests/04r0update @@ -0,0 +1,20 @@ + +# create a raid0, re-assemble with a different super-minor +mdadm -CR -e 0.90 $md0 -l0 -n3 $dev0 $dev1 $dev2 +testdev $md0 3 $mdsize0 512 +minor1=`mdadm -E $dev0 | sed -n -e 's/.*Preferred Minor : //p'` +mdadm -S /dev/md0 + +mdadm -A $md1 $dev0 $dev1 $dev2 +minor2=`mdadm -E $dev0 | sed -n -e 's/.*Preferred Minor : //p'` +mdadm -S /dev/md1 + +mdadm -A $md1 --update=super-minor $dev0 $dev1 $dev2 +minor3=`mdadm -E $dev0 | sed -n -e 's/.*Preferred Minor : //p'` +mdadm -S /dev/md1 + +case "$minor1 $minor2 $minor3" in + "0 0 1" ) ;; + * ) echo >&2 "ERROR minors should be '0 0 1' but are '$minor1 $minor2 $minor3'" + exit 1 +esac diff --git a/tests/04r1update b/tests/04r1update new file mode 100644 index 00000000..e22965bc --- /dev/null +++ b/tests/04r1update @@ -0,0 +1,15 @@ +set -i + +# create a raid1 array, let it sync, then re-assemble with a force-sync + +mdadm -CR $md0 -l1 -n2 $dev0 $dev1 +check wait +mdadm -S $md0 + +mdadm -A $md0 $dev0 $dev1 +check nosync +mdadm -S $md0 + +mdadm -A $md0 -U resync $dev0 $dev1 +check resync +mdadm -S $md0 diff --git a/tests/04r5swap b/tests/04r5swap new file mode 100644 index 00000000..5373a607 --- /dev/null +++ b/tests/04r5swap @@ -0,0 +1,18 @@ + +# make a raid5 array, byte swap the superblocks, then assemble... + +mdadm -CR $md0 -e 0.90 -l5 -n4 $dev0 $dev1 $dev2 $dev3 +sleep 4 +mdadm -S $md0 + +mdadm -E --metadata=0 $dev1 > $targetdir/d1 +for d in $dev0 $dev1 $dev2 $dev3 +do $dir/swap_super $d +done +mdadm -E --metadata=0.swap $dev1 > $targetdir/d1s +diff -u $targetdir/d1 $targetdir/d1s + +mdadm --assemble --update=byteorder $md0 $dev0 $dev1 $dev2 $dev3 +sleep 3 +check recovery +mdadm -S $md0 diff --git a/tests/04update-metadata b/tests/04update-metadata new file mode 100644 index 00000000..232fc1ff --- /dev/null +++ b/tests/04update-metadata @@ -0,0 +1,48 @@ +set -xe + +# test converting v0.90 to v1.0 +# check for different levels +# check it fails for non-v0.90 +# check it fails during reshape or recovery +# check it fails when bitmap is present + +dlist="$dev0 $dev1 $dev2 $dev3" + +for ls in raid0/4 linear/4 raid1/1 raid5/3 raid6/2 +do + s=${ls#*/} l=${ls%/*} + mdadm -CR --assume-clean -e 0.90 $md0 --level $l -n 4 -c 64 $dlist + testdev $md0 $s 19904 64 + mdadm -S $md0 + mdadm -A $md0 --update=metadata $dlist + testdev $md0 $s 19904 64 check + mdadm -S $md0 +done + +if mdadm -A $md0 --update=metadata $dlist +then echo >&2 should fail with v1.0 metadata + exit 1 +fi + +mdadm -CR -e 0.90 $md0 --level=6 -n4 -c32 $dlist +mdadm -S $md0 + +if mdadm -A $md0 --update=metadata $dlist +then echo >&2 should fail during resync + exit 1 +fi +mdadm -A $md0 $dlist +mdadm --wait $md0 || true +mdadm -S $md0 + +# should succeed now +mdadm -A $md0 --update=metadata $dlist + +mdadm -S /dev/md0 +mdadm -CR --assume-clean -e 0.90 $md0 --level=6 -n4 -c32 $dlist --bitmap=internal +mdadm -S $md0 + +if mdadm -A $md0 --update=metadata $dlist +then echo >&2 should fail when bitmap present + exit 1 +fi diff --git a/tests/04update-uuid b/tests/04update-uuid new file mode 100644 index 00000000..a4409e78 --- /dev/null +++ b/tests/04update-uuid @@ -0,0 +1,82 @@ +set -x + +# create an array, then change the uuid. + +mdadm -CR --assume-clean $md0 -l5 -n3 $dev0 $dev1 $dev2 +mdadm -S /dev/md0 +mdadm -A /dev/md0 --update=uuid --uuid=0123456789abcdef:fedcba9876543210 $dev0 $dev1 $dev2 +no_errors +mdadm -D /dev/md0 | grep -s > /dev/null 01234567:89abcdef:fedcba98:76543210 || { + echo Wrong uuid; mdadm -D /dev/md0 ; exit 2; +} +mdadm -S /dev/md0 + +# try v1 superblock + +mdadm -CR --assume-clean -e1 $md0 -l5 -n3 $dev0 $dev1 $dev2 +mdadm -S /dev/md0 +mdadm -A /dev/md0 --update=uuid --uuid=0123456789abcdef:fedcba9876543210 $dev0 $dev1 $dev2 +no_errors +mdadm -D /dev/md0 | grep -s > /dev/null 01234567:89abcdef:fedcba98:76543210 || { + echo Wrong uuid; mdadm -D /dev/md0 ; exit 2; +} +mdadm -S /dev/md0 + + +# now if we have a bitmap, that needs updating too. +rm -f $targetdir/bitmap +mdadm -CR --assume-clean -b $targetdir/bitmap $md0 -l5 -n3 $dev0 $dev1 $dev2 +mdadm -S /dev/md0 +mdadm -A /dev/md0 -b $targetdir/bitmap --update=uuid --uuid=0123456789abcdef:fedcba9876543210 $dev0 $dev1 $dev2 +no_errors +mdadm -D /dev/md0 | grep -s > /dev/null 01234567:89abcdef:fedcba98:76543210 || { + echo Wrong uuid; mdadm -D /dev/md0 ; exit 2; +} +if mdadm -X $targetdir/bitmap | grep -s > /dev/null 01234567:89abcdef:fedcba98:76543210 || + mdadm -X $targetdir/bitmap | grep -s > /dev/null 67452301:efcdab89:98badcfe:10325476 +then : ; else + echo Wrong uuid; mdadm -X $targetdir/bitmap ; exit 2; +fi +mdadm -S /dev/md0 + +# and bitmap for version1 +rm -f $targetdir/bitmap +mdadm -CR --assume-clean -e1.1 -b $targetdir/bitmap $md0 -l5 -n3 $dev0 $dev1 $dev2 +mdadm -S /dev/md0 +mdadm -A /dev/md0 -b $targetdir/bitmap --update=uuid --uuid=0123456789abcdef:fedcba9876543210 $dev0 $dev1 $dev2 +no_errors +mdadm -D /dev/md0 | grep -s > /dev/null 01234567:89abcdef:fedcba98:76543210 || { + echo Wrong uuid; mdadm -D /dev/md0 ; exit 2; +} +# -X cannot tell which byteorder to use for the UUID, so allow both. +if mdadm -X $targetdir/bitmap | grep -s > /dev/null 01234567:89abcdef:fedcba98:76543210 || + mdadm -X $targetdir/bitmap | grep -s > /dev/null 67452301:efcdab89:98badcfe:10325476 +then : ; else + echo Wrong uuid; mdadm -X $targetdir/bitmap ; exit 2; +fi +mdadm -S /dev/md0 + +# Internal bitmaps too. +mdadm -CR --assume-clean -b internal --bitmap-chunk 4 $md0 -l5 -n3 $dev0 $dev1 $dev2 +mdadm -S /dev/md0 +mdadm -A /dev/md0 --update=uuid --uuid=0123456789abcdef:fedcba9876543210 $dev0 $dev1 $dev2 +no_errors +mdadm -D /dev/md0 | grep -s > /dev/null 01234567:89abcdef:fedcba98:76543210 || { + echo Wrong uuid; mdadm -D /dev/md0 ; exit 2; +} +mdadm -X $dev0 | grep -s > /dev/null 01234567:89abcdef:fedcba98:76543210 || { + echo Wrong uuid; mdadm -X $dev0; exit 2; +} +mdadm -S /dev/md0 + +mdadm -CR --assume-clean -e1.2 -b internal --bitmap-chunk=4 $md0 -l5 -n3 $dev0 $dev1 $dev2 +mdadm -S /dev/md0 +mdadm -A /dev/md0 --update=uuid --uuid=0123456789abcdef:fedcba9876543210 $dev0 $dev1 $dev2 +no_errors +mdadm -D /dev/md0 | grep -s > /dev/null 01234567:89abcdef:fedcba98:76543210 || { + echo Wrong uuid; mdadm -D /dev/md0 ; exit 2; +} +mdadm -X $dev0 | grep -s > /dev/null 01234567:89abcdef:fedcba98:76543210 || { + echo Wrong uuid; mdadm -X $dev0; exit 2; +} +mdadm -S /dev/md0 diff --git a/tests/05r1-add-internalbitmap b/tests/05r1-add-internalbitmap new file mode 100644 index 00000000..4e203052 --- /dev/null +++ b/tests/05r1-add-internalbitmap @@ -0,0 +1,20 @@ +# +# create a raid1 without any bitmap, add the bitmap and then write to +# the device. This should catch the case where the bitmap is created +# but not reloaded correctly, such as the case fixed by +# 4474ca42e2577563a919fd3ed782e2ec55bf11a2 +# +mdadm --create --run $md0 --metadata=0.9 --level=1 -n2 --delay=1 $dev1 $dev2 +check wait +check nobitmap +testdev $md0 1 $mdsize1b 64 +mdadm -Gb internal --bitmap-chunk=4 $md0 +check bitmap +testdev $md0 1 $mdsize1b 64 +mdadm -S $md0 + +# Re-assemble the array and verify the bitmap is still present +mdadm --assemble $md0 $dev1 $dev2 +check bitmap +testdev $md0 1 $mdsize1b 64 +mdadm -S $md0 diff --git a/tests/05r1-add-internalbitmap-v1a b/tests/05r1-add-internalbitmap-v1a new file mode 100644 index 00000000..721a41c1 --- /dev/null +++ b/tests/05r1-add-internalbitmap-v1a @@ -0,0 +1,20 @@ +# +# create a raid1 without any bitmap, add the bitmap and then write to +# the device. This should catch the case where the bitmap is created +# but not reloaded correctly, such as the case fixed by +# 4474ca42e2577563a919fd3ed782e2ec55bf11a2 +# +mdadm --create --run $md0 --metadata=1.0 --level=1 -n2 --delay=1 $dev1 $dev2 +check wait +check nobitmap +testdev $md0 1 $mdsize1b 64 +mdadm -Gb internal --bitmap-chunk=4 $md0 +check bitmap +testdev $md0 1 $mdsize1b 64 +mdadm -S $md0 + +# Re-assemble the array and verify the bitmap is still present +mdadm --assemble $md0 $dev1 $dev2 +check bitmap +testdev $md0 1 $mdsize1b 64 +mdadm -S $md0 diff --git a/tests/05r1-add-internalbitmap-v1b b/tests/05r1-add-internalbitmap-v1b new file mode 100644 index 00000000..da78fd61 --- /dev/null +++ b/tests/05r1-add-internalbitmap-v1b @@ -0,0 +1,20 @@ +# +# create a raid1 without any bitmap, add the bitmap and then write to +# the device. This should catch the case where the bitmap is created +# but not reloaded correctly, such as the case fixed by +# 4474ca42e2577563a919fd3ed782e2ec55bf11a2 +# +mdadm --create --run $md0 --metadata=1.1 --level=1 -n2 --delay=1 $dev1 $dev2 +check wait +check nobitmap +testdev $md0 1 $mdsize1b 64 +mdadm -Gb internal --bitmap-chunk=4 $md0 +check bitmap +testdev $md0 1 $mdsize1b 64 +mdadm -S $md0 + +# Re-assemble the array and verify the bitmap is still present +mdadm --assemble $md0 $dev1 $dev2 +check bitmap +testdev $md0 1 $mdsize1b 64 +mdadm -S $md0 diff --git a/tests/05r1-add-internalbitmap-v1c b/tests/05r1-add-internalbitmap-v1c new file mode 100644 index 00000000..9f2f128b --- /dev/null +++ b/tests/05r1-add-internalbitmap-v1c @@ -0,0 +1,20 @@ +# +# create a raid1 without any bitmap, add the bitmap and then write to +# the device. This should catch the case where the bitmap is created +# but not reloaded correctly, such as the case fixed by +# 4474ca42e2577563a919fd3ed782e2ec55bf11a2 +# +mdadm --create --run $md0 --metadata=1.2 --level=1 -n2 --delay=1 $dev1 $dev2 +check wait +check nobitmap +testdev $md0 1 $mdsize1b 64 +mdadm -Gb internal --bitmap-chunk=4 $md0 +check bitmap +testdev $md0 1 $mdsize1b 64 +mdadm -S $md0 + +# Re-assemble the array and verify the bitmap is still present +mdadm --assemble $md0 $dev1 $dev2 +check bitmap +testdev $md0 1 $mdsize1b 64 +mdadm -S $md0 diff --git a/tests/05r1-bitmapfile b/tests/05r1-bitmapfile new file mode 100644 index 00000000..f384f0ea --- /dev/null +++ b/tests/05r1-bitmapfile @@ -0,0 +1,49 @@ + +# +# create a raid1 with a bitmap file +# +bmf=$targetdir/bitmap +rm -f $bmf +mdadm --create --run $md0 --level=1 -n2 --delay=1 --bitmap $bmf $dev1 $dev2 +check wait +testdev $md0 1 $mdsize1a 64 +mdadm -S $md0 + +mdadm --assemble $md0 --bitmap=$bmf $dev1 $dev2 +testdev $md0 1 $mdsize1a 64 +dirty1=`mdadm -X $bmf | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'` +sleep 4 +dirty2=`mdadm -X $bmf | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'` + +if [ $dirty1 -lt 400 -o $dirty2 -ne 0 ] +then echo >&2 "ERROR bad 'dirty' counts: $dirty1 and $dirty2" + exit 1 +fi +mdadm $md0 -f $dev1 +testdev $md0 1 $mdsize1a 64 +sleep 4 +dirty3=`mdadm -X $bmf | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'` +if [ $dirty3 -lt 400 ] +then + echo >&2 "ERROR dirty count $dirty3 is too small" + exit 2 +fi + +mdadm -S $md0 + +mdadm --assemble -R $md0 --bitmap=$bmf $dev2 +dirty4=`mdadm -X $bmf | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'` +mdadm --zero $dev1 # force --add, not --re-add +mdadm $md0 --add $dev1 +#it is too fast# check recovery + +check wait +sleep 4 +dirty5=`mdadm -X $bmf | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'` + +if [ $dirty4 -lt 400 -o $dirty5 -ne 0 ] +then echo echo >&2 "ERROR bad 'dirty' counts at end: $dirty4 $dirty5" + exit 1 +fi + +mdadm -S $md0 diff --git a/tests/05r1-grow-external b/tests/05r1-grow-external new file mode 100644 index 00000000..69da3e90 --- /dev/null +++ b/tests/05r1-grow-external @@ -0,0 +1,33 @@ + +# +# create a raid1 array, add an external bitmap +# +mdadm --create --run $md0 -l 1 -n 2 $dev1 $dev2 +check wait +testdev $md0 1 $mdsize1a 64 + +bmf=$targetdir/bm +rm -f $bmf +#mdadm -E $dev1 +mdadm --grow $md0 --bitmap=$bmf --delay=1 || { mdadm -X $bmf ; exit 1; } +dirty1=`mdadm -X $bmf | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'` +sleep 4 +dirty2=`mdadm -X $bmf | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'` + +testdev $md0 1 $mdsize1a 64 +dirty3=`mdadm -X $bmf | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'` +sleep 4 +dirty4=`mdadm -X $bmf | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'` + +#echo $dirty1 $dirty2 $dirty3 $dirty4 +if [ $dirty2 -ne 0 -o $dirty4 -ne 0 -o $dirty3 -lt 400 ] +then + echo bad dirty counts + exit 1 +fi + +# now to remove the bitmap +check bitmap +mdadm --grow $md0 --bitmap=none +check nobitmap +mdadm -S $md0 diff --git a/tests/05r1-grow-internal b/tests/05r1-grow-internal new file mode 100644 index 00000000..24b3aece --- /dev/null +++ b/tests/05r1-grow-internal @@ -0,0 +1,31 @@ + +# +# create a raid1 array, add an internal bitmap +# +mdadm --create --run $md0 -l 1 -n 2 $dev1 $dev2 +check wait +testdev $md0 1 $mdsize1a 64 + +#mdadm -E $dev1 +mdadm --grow $md0 --bitmap=internal --bitmap-chunk=4 --delay=1 || { mdadm -X $dev2 ; exit 1; } +dirty1=`mdadm -X $dev2 | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'` +sleep 4 +dirty2=`mdadm -X $dev2 | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'` + +testdev $md0 1 $mdsize1a 64 +dirty3=`mdadm -X $dev2 | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'` +sleep 4 +dirty4=`mdadm -X $dev2 | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'` + +#echo $dirty1 $dirty2 $dirty3 $dirty4 +if [ $dirty2 -ne 0 -o $dirty4 -ne 0 -o $dirty3 -lt 400 ] +then + echo bad dirty counts + exit 1 +fi + +# now to remove the bitmap +check bitmap +mdadm --grow $md0 --bitmap=none +check nobitmap +mdadm -S $md0 diff --git a/tests/05r1-grow-internal-1 b/tests/05r1-grow-internal-1 new file mode 100644 index 00000000..2f0d8237 --- /dev/null +++ b/tests/05r1-grow-internal-1 @@ -0,0 +1,31 @@ + +# +# create a raid1 array, version 1 superblock, add an internal bitmap +# +mdadm --create --run $md0 -e1 -l 1 -n 2 $dev1 $dev2 +check wait +testdev $md0 1 $mdsize1b 64 + +#mdadm -E $dev1 +mdadm --grow $md0 --bitmap=internal --bitmap-chunk=4 --delay=1 +dirty1=`mdadm -X $dev2 | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'` +sleep 4 +dirty2=`mdadm -X $dev2 | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'` + +testdev $md0 1 $mdsize1b 64 +dirty3=`mdadm -X $dev2 | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'` +sleep 4 +dirty4=`mdadm -X $dev2 | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'` + +#echo $dirty1 $dirty2 $dirty3 $dirty4 +if [ $dirty2 -ne 0 -o $dirty4 -ne 0 -o $dirty3 -lt 400 ] +then + echo bad dirty counts + exit 1 +fi + +# now to remove the bitmap +check bitmap +mdadm --grow $md0 --bitmap=none +check nobitmap +mdadm -S $md0 diff --git a/tests/05r1-internalbitmap b/tests/05r1-internalbitmap new file mode 100644 index 00000000..dd7232a7 --- /dev/null +++ b/tests/05r1-internalbitmap @@ -0,0 +1,47 @@ + +# +# create a raid1 with an internal bitmap +# +mdadm --create -e0.90 --run $md0 --level=1 -n2 --delay=1 --bitmap internal --bitmap-chunk=4 $dev1 $dev2 +check wait +testdev $md0 1 $mdsize0 64 +mdadm -S $md0 + +mdadm --assemble $md0 $dev1 $dev2 +testdev $md0 1 $mdsize0 64 +dirty1=`mdadm -X $dev2 | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'` +sleep 4 +dirty2=`mdadm -X $dev2 | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'` + +if [ $dirty1 -lt 400 -o $dirty2 -ne 0 ] +then echo >&2 "ERROR bad 'dirty' counts: $dirty1 and $dirty2" + exit 1 +fi +mdadm $md0 -f $dev1 +testdev $md0 1 $mdsize0 64 +sleep 4 +dirty3=`mdadm -X $dev2 | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'` +if [ $dirty3 -lt 400 ] +then + echo >&2 "ERROR dirty count $dirty3 is too small" + exit 2 +fi + +mdadm -S $md0 + +mdadm --assemble -R $md0 $dev2 +mdadm --zero-superblock $dev1 +mdadm $md0 --add $dev1 +check recovery + +dirty4=`mdadm -X $dev2 | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'` +check wait +sleep 4 +dirty5=`mdadm -X $dev2 | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'` + +if [ $dirty4 -lt 400 -o $dirty5 -ne 0 ] +then echo echo >&2 "ERROR bad 'dirty' counts at end: $dirty4 $dirty5" + exit 1 +fi + +mdadm -S $md0 diff --git a/tests/05r1-internalbitmap-v1a b/tests/05r1-internalbitmap-v1a new file mode 100644 index 00000000..3ddc082f --- /dev/null +++ b/tests/05r1-internalbitmap-v1a @@ -0,0 +1,48 @@ + +# +# create a raid1 with an internal bitmap +# +mdadm --create --run $md0 --metadata=1.0 --level=1 -n2 --delay=1 --bitmap internal --bitmap-chunk=4 $dev1 $dev2 +check wait +check bitmap +testdev $md0 1 $mdsize1b 64 +mdadm -S $md0 + +mdadm --assemble $md0 $dev1 $dev2 +testdev $md0 1 $mdsize1b 64 +dirty1=`mdadm -X $dev2 | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'` +sleep 4 +dirty2=`mdadm -X $dev2 | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'` + +if [ $dirty1 -lt 400 -o $dirty2 -ne 0 ] +then echo >&2 "ERROR bad 'dirty' counts: $dirty1 and $dirty2" + exit 1 +fi +mdadm $md0 -f $dev1 +testdev $md0 1 $mdsize1b 64 +sleep 4 +dirty3=`mdadm -X $dev2 | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'` +if [ $dirty3 -lt 400 ] +then + echo >&2 "ERROR dirty count $dirty3 is too small" + exit 2 +fi + +mdadm -S $md0 + +mdadm --zero-superblock $dev1 +mdadm --assemble -R $md0 $dev2 +mdadm $md0 --add $dev1 +check recovery + +dirty4=`mdadm -X $dev2 | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'` +check wait +sleep 4 +dirty5=`mdadm -X $dev2 | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'` + +if [ $dirty4 -lt 400 -o $dirty5 -ne 0 ] +then echo echo >&2 "ERROR bad 'dirty' counts at end: $dirty4 $dirty5" + exit 1 +fi + +mdadm -S $md0 diff --git a/tests/05r1-internalbitmap-v1b b/tests/05r1-internalbitmap-v1b new file mode 100644 index 00000000..40f7abea --- /dev/null +++ b/tests/05r1-internalbitmap-v1b @@ -0,0 +1,49 @@ + +# +# create a raid1 with an internal bitmap +# +mdadm --create --run $md0 --metadata=1.1 --level=1 -n2 --delay=1 --bitmap internal --bitmap-chunk=4 $dev1 $dev2 +check wait +check bitmap +testdev $md0 1 $mdsize11 64 +mdadm -S $md0 + +mdadm --assemble $md0 $dev1 $dev2 +check bitmap +testdev $md0 1 $mdsize11 64 +dirty1=`mdadm -X $dev2 | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'` +sleep 4 +dirty2=`mdadm -X $dev2 | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'` + +if [ $dirty1 -lt 400 -o $dirty2 -ne 0 ] +then echo >&2 "ERROR bad 'dirty' counts: $dirty1 and $dirty2" + exit 1 +fi +mdadm $md0 -f $dev1 +testdev $md0 1 $mdsize11 64 +sleep 4 +dirty3=`mdadm -X $dev2 | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'` +if [ $dirty3 -lt 400 ] +then + echo >&2 "ERROR dirty count $dirty3 is too small" + exit 2 +fi + +mdadm -S $md0 + +mdadm --zero-superblock $dev1 +mdadm --assemble -R $md0 $dev2 +mdadm $md0 --add $dev1 +check recovery + +dirty4=`mdadm -X $dev2 | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'` +check wait +sleep 4 +dirty5=`mdadm -X $dev2 | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'` + +if [ $dirty4 -lt 400 -o $dirty5 -ne 0 ] +then echo echo >&2 "ERROR bad 'dirty' counts at end: $dirty4 $dirty5" + exit 1 +fi + +mdadm -S $md0 diff --git a/tests/05r1-internalbitmap-v1c b/tests/05r1-internalbitmap-v1c new file mode 100644 index 00000000..2eaea59b --- /dev/null +++ b/tests/05r1-internalbitmap-v1c @@ -0,0 +1,48 @@ + +# +# create a raid1 with an internal bitmap +# +mdadm --create --run $md0 --metadata=1.2 --level=1 -n2 --delay=1 --bitmap internal --bitmap-chunk 4 $dev1 $dev2 +check wait +check bitmap +testdev $md0 1 $mdsize12 64 +mdadm -S $md0 + +mdadm --assemble $md0 $dev1 $dev2 +testdev $md0 1 $mdsize12 64 +dirty1=`mdadm -X $dev2 | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'` +sleep 4 +dirty2=`mdadm -X $dev2 | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'` + +if [ $dirty1 -lt 400 -o $dirty2 -ne 0 ] +then echo >&2 "ERROR bad 'dirty' counts: $dirty1 and $dirty2" + exit 1 +fi +mdadm $md0 -f $dev1 +testdev $md0 1 $mdsize12 64 +sleep 4 +dirty3=`mdadm -X $dev2 | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'` +if [ $dirty3 -lt 400 ] +then + echo >&2 "ERROR dirty count $dirty3 is too small" + exit 2 +fi + +mdadm -S $md0 + +mdadm --zero-superblock $dev1 +mdadm --assemble -R $md0 $dev2 +mdadm $md0 --add $dev1 +check recovery + +dirty4=`mdadm -X $dev2 | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'` +check wait +sleep 4 +dirty5=`mdadm -X $dev2 | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'` + +if [ $dirty4 -lt 400 -o $dirty5 -ne 0 ] +then echo echo >&2 "ERROR bad 'dirty' counts at end: $dirty4 $dirty5" + exit 1 +fi + +mdadm -S $md0 diff --git a/tests/05r1-n3-bitmapfile b/tests/05r1-n3-bitmapfile new file mode 100644 index 00000000..f1c3f1ee --- /dev/null +++ b/tests/05r1-n3-bitmapfile @@ -0,0 +1,53 @@ + +# +# create a raid1 with 3 devices and a bitmap file +# make sure resync does right thing. +# +# +bmf=$targetdir/bitmap +rm -f $bmf +mdadm --create -e0.90 --run $md0 --level=1 -n3 --delay=1 --bitmap $bmf $dev1 $dev2 $dev3 +check wait +testdev $md0 1 $mdsize0 64 +mdadm -S $md0 + +mdadm --assemble $md0 --bitmap=$bmf $dev1 $dev2 $dev3 +testdev $md0 1 $mdsize0 64 +dirty1=`mdadm -X $bmf | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'` +sleep 4 +dirty2=`mdadm -X $bmf | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'` + +if [ $dirty1 -lt 400 -o $dirty2 -ne 0 ] +then echo >&2 "ERROR bad 'dirty' counts: $dirty1 and $dirty2" + exit 1 +fi +mdadm $md0 -f $dev2 +testdev $md0 1 $mdsize0 64 +sleep 4 +dirty3=`mdadm -X $bmf | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'` +if [ $dirty3 -lt 400 ] +then + echo >&2 "ERROR dirty count $dirty3 is too small" + exit 2 +fi + +mdadm -S $md0 + +mdadm --assemble -R $md0 --bitmap=$bmf $dev1 $dev3 +check nosync +mdadm --zero-superblock $dev2 +mdadm $md0 --add $dev2 +check recovery + +dirty4=`mdadm -X $bmf | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'` +check wait +sleep 4 +dirty5=`mdadm -X $bmf | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'` + +if [ $dirty4 -lt 400 -o $dirty5 -ne 0 ] +then echo echo >&2 "ERROR bad 'dirty' counts at end: $dirty4 $dirty5" + exit 1 +fi + +mdadm -S $md0 +exit 0 diff --git a/tests/05r1-re-add b/tests/05r1-re-add new file mode 100644 index 00000000..fa6bbcb4 --- /dev/null +++ b/tests/05r1-re-add @@ -0,0 +1,39 @@ + +# +# create a raid1, remove a drive, and readd it. +# resync should be instant. +# Then do some IO first. Resync should still be very fast +# + +mdadm -CR $md0 -l1 -n2 -binternal --bitmap-chunk=4 -d1 $dev1 $dev2 +check resync +check wait +testdev $md0 1 $mdsize1a 64 +sleep 4 + +mdadm $md0 -f $dev2 +sleep 1 +mdadm $md0 -r $dev2 +mdadm $md0 -a $dev2 +#cat /proc/mdstat +check nosync + +mdadm $md0 -f $dev2 +sleep 1 +mdadm $md0 -r $dev2 +testdev $md0 1 $mdsize1a 64 +mdadm $md0 -a $dev2 +check wait +blockdev --flushbufs $dev1 $dev2 +cmp --ignore-initial=$[64*512] --bytes=$[$mdsize0*1024] $dev1 $dev2 + +mdadm $md0 -f $dev2; sleep 1 +mdadm $md0 -r $dev2 +if dd if=/dev/zero of=$md0 ; then : ; fi +blockdev --flushbufs $md0 # ensure writes have been sent. +mdadm $md0 -a $dev2 +check recovery +check wait +blockdev --flushbufs $dev1 $dev2 +cmp --ignore-initial=$[64*512] --bytes=$[$mdsize0*1024] $dev1 $dev2 +mdadm -S $md0 diff --git a/tests/05r1-re-add-nosuper b/tests/05r1-re-add-nosuper new file mode 100644 index 00000000..058d602d --- /dev/null +++ b/tests/05r1-re-add-nosuper @@ -0,0 +1,38 @@ + +# +# create a raid1, remove a drive, and readd it. +# resync should be instant. +# Then do some IO first. Resync should still be very fast +# +bmf=$targetdir/bitmap2 +rm -f $bmf +mdadm -B $md0 -l1 -n2 -b$bmf -d1 $dev1 $dev2 +check resync +check wait +testdev $md0 1 $size 1 +sleep 4 + +mdadm $md0 -f $dev2 +sleep 1 +mdadm $md0 -r $dev2 +mdadm $md0 --re-add $dev2 +check nosync + +mdadm $md0 -f $dev2 +sleep 1 +mdadm $md0 -r $dev2 +testdev $md0 1 $size 1 +mdadm $md0 --re-add $dev2 +check wait +cmp --bytes=$[$mdsize0*1024] $dev1 $dev2 + +mdadm $md0 -f $dev2; sleep 1 +mdadm $md0 -r $dev2 +if dd if=/dev/zero of=$md0 ; then : ; fi +blockdev --flushbufs $md0 # make sure writes have been sent +mdadm $md0 --re-add $dev2 +check recovery +check wait +# should BLKFLSBUF and then read $dev1/$dev2... +cmp --bytes=$[$mdsize0*1024] $file1 $file2 +mdadm -S $md0 diff --git a/tests/05r1-remove-internalbitmap b/tests/05r1-remove-internalbitmap new file mode 100644 index 00000000..712fd56f --- /dev/null +++ b/tests/05r1-remove-internalbitmap @@ -0,0 +1,18 @@ +# +# create a raid1 with bitmap, remove the bitmap and verify it is still +# gone when re-assembling the array +# +mdadm --create --run $md0 --metadata=0.9 --level=1 -n2 --bitmap internal --bitmap-chunk=4 --delay=1 $dev1 $dev2 +check wait +check bitmap +testdev $md0 1 $mdsize1b 64 +mdadm -Gb none $md0 +check nobitmap +testdev $md0 1 $mdsize1b 64 +mdadm -S $md0 + +# Re-assemble the array and verify the bitmap is still present +mdadm --assemble $md0 $dev1 $dev2 +check nobitmap +testdev $md0 1 $mdsize1b 64 +mdadm -S $md0 diff --git a/tests/05r1-remove-internalbitmap-v1a b/tests/05r1-remove-internalbitmap-v1a new file mode 100644 index 00000000..a4a9aaf1 --- /dev/null +++ b/tests/05r1-remove-internalbitmap-v1a @@ -0,0 +1,18 @@ +# +# create a raid1 with bitmap, remove the bitmap and verify it is still +# gone when re-assembling the array +# +mdadm --create --run $md0 --metadata=1.0 --level=1 -n2 --bitmap internal --bitmap-chunk=4 --delay=1 $dev1 $dev2 +check wait +check bitmap +testdev $md0 1 $mdsize1b 64 +mdadm -Gb none $md0 +check nobitmap +testdev $md0 1 $mdsize1b 64 +mdadm -S $md0 + +# Re-assemble the array and verify the bitmap is still present +mdadm --assemble $md0 $dev1 $dev2 +check nobitmap +testdev $md0 1 $mdsize1b 64 +mdadm -S $md0 diff --git a/tests/05r1-remove-internalbitmap-v1b b/tests/05r1-remove-internalbitmap-v1b new file mode 100644 index 00000000..c0918eb6 --- /dev/null +++ b/tests/05r1-remove-internalbitmap-v1b @@ -0,0 +1,18 @@ +# +# create a raid1 with bitmap, remove the bitmap and verify it is still +# gone when re-assembling the array +# +mdadm --create --run $md0 --metadata=1.1 --level=1 -n2 --bitmap internal --bitmap-chunk=4 --delay=1 $dev1 $dev2 +check wait +check bitmap +testdev $md0 1 $mdsize1b 64 +mdadm -Gb none $md0 +check nobitmap +testdev $md0 1 $mdsize1b 64 +mdadm -S $md0 + +# Re-assemble the array and verify the bitmap is still present +mdadm --assemble $md0 $dev1 $dev2 +check nobitmap +testdev $md0 1 $mdsize1b 64 +mdadm -S $md0 diff --git a/tests/05r1-remove-internalbitmap-v1c b/tests/05r1-remove-internalbitmap-v1c new file mode 100644 index 00000000..15f1fbb0 --- /dev/null +++ b/tests/05r1-remove-internalbitmap-v1c @@ -0,0 +1,18 @@ +# +# create a raid1 with bitmap, remove the bitmap and verify it is still +# gone when re-assembling the array +# +mdadm --create --run $md0 --metadata=1.2 --level=1 -n2 --bitmap internal --bitmap-chunk=4 --delay=1 $dev1 $dev2 +check wait +check bitmap +testdev $md0 1 $mdsize1b 64 +mdadm -Gb none $md0 +check nobitmap +testdev $md0 1 $mdsize1b 64 +mdadm -S $md0 + +# Re-assemble the array and verify the bitmap is still present +mdadm --assemble $md0 $dev1 $dev2 +check nobitmap +testdev $md0 1 $mdsize1b 64 +mdadm -S $md0 diff --git a/tests/05r5-bitmapfile b/tests/05r5-bitmapfile new file mode 100644 index 00000000..6d173d88 --- /dev/null +++ b/tests/05r5-bitmapfile @@ -0,0 +1,49 @@ + +# +# create a raid1 with a bitmap file +# +bmf=$targetdir/bitmap +rm -f $bmf +mdadm --create --run $md0 --level=5 -n3 --delay=1 --bitmap $bmf $dev1 $dev2 $dev3 +check wait +testdev $md0 2 $mdsize1 512 +mdadm -S $md0 + +mdadm --assemble $md0 --bitmap=$bmf $dev1 $dev2 $dev3 +testdev $md0 2 $mdsize1 512 +dirty1=`mdadm -X $bmf | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'` +sleep 4 +dirty2=`mdadm -X $bmf | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'` + +if [ $dirty1 -lt 400 -o $dirty2 -ne 0 ] +then echo >&2 "ERROR bad 'dirty' counts: $dirty1 and $dirty2" + exit 1 +fi +mdadm $md0 -f $dev1 +testdev $md0 2 $mdsize1 512 +sleep 4 +dirty3=`mdadm -X $bmf | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'` +if [ $dirty3 -lt 400 ] +then + echo >&2 "ERROR dirty count $dirty3 is too small" + exit 2 +fi + +mdadm -S $md0 + +mdadm --assemble -R $md0 --bitmap=$bmf $dev2 $dev3 +mdadm --zero $dev1 # force add, not re-add +mdadm $md0 --add $dev1 +check recovery + +dirty4=`mdadm -X $bmf | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'` +check wait +sleep 4 +dirty5=`mdadm -X $bmf | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'` + +if [ $dirty4 -lt 400 -o $dirty5 -ne 0 ] +then echo echo >&2 "ERROR bad 'dirty' counts at end: $dirty4 $dirty5" + exit 1 +fi + +mdadm -S $md0 diff --git a/tests/05r5-internalbitmap b/tests/05r5-internalbitmap new file mode 100644 index 00000000..13dc5921 --- /dev/null +++ b/tests/05r5-internalbitmap @@ -0,0 +1,47 @@ + +# +# create a raid1 with an internal bitmap +# +mdadm --create --run $md0 --level=5 -n3 --delay=1 --bitmap internal --bitmap-chunk=4 $dev1 $dev2 $dev3 +check wait +testdev $md0 2 $mdsize1 512 +mdadm -S $md0 + +mdadm --assemble $md0 $dev1 $dev2 $dev3 +testdev $md0 2 $mdsize1 512 +dirty1=`mdadm -X $dev2 | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'` +sleep 4 +dirty2=`mdadm -X $dev2 | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'` + +if [ $dirty1 -lt 400 -o $dirty2 -ne 0 ] +then echo >&2 "ERROR bad 'dirty' counts: $dirty1 and $dirty2" + exit 1 +fi +mdadm $md0 -f $dev1 +testdev $md0 2 $mdsize1 512 +sleep 4 +dirty3=`mdadm -X $dev2 | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'` +if [ $dirty3 -lt 400 ] +then + echo >&2 "ERROR dirty count $dirty3 is too small" + exit 2 +fi + +mdadm -S $md0 + +mdadm --assemble -R $md0 $dev2 $dev3 +mdadm --zero $dev1 # force --add, not --re-add +mdadm $md0 --add $dev1 +check recovery + +dirty4=`mdadm -X $dev2 | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'` +check wait +sleep 4 +dirty5=`mdadm -X $dev2 | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'` + +if [ $dirty4 -lt 400 -o $dirty5 -ne 0 ] +then echo echo >&2 "ERROR bad 'dirty' counts at end: $dirty4 $dirty5" + exit 1 +fi + +mdadm -S $md0 diff --git a/tests/05r6-bitmapfile b/tests/05r6-bitmapfile new file mode 100644 index 00000000..d11896db --- /dev/null +++ b/tests/05r6-bitmapfile @@ -0,0 +1,49 @@ + +# +# create a raid1 with a bitmap file +# +bmf=$targetdir/bitmap +rm -f $bmf +mdadm --create --run $md0 --level=6 -n4 --delay=1 --bitmap $bmf $dev1 $dev2 $dev3 $dev4 +check wait +testdev $md0 2 $mdsize1 512 +mdadm -S $md0 + +mdadm --assemble $md0 --bitmap=$bmf $dev1 $dev2 $dev3 $dev4 +testdev $md0 2 $mdsize1 512 +dirty1=`mdadm -X $bmf | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'` +sleep 4 +dirty2=`mdadm -X $bmf | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'` + +if [ $dirty1 -lt 400 -o $dirty2 -ne 0 ] +then echo >&2 "ERROR bad 'dirty' counts: $dirty1 and $dirty2" + exit 1 +fi +mdadm $md0 -f $dev3 +testdev $md0 2 $mdsize1 512 +sleep 4 +dirty3=`mdadm -X $bmf | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'` +if [ $dirty3 -lt 400 ] +then + echo >&2 "ERROR dirty count $dirty3 is too small" + exit 2 +fi + +mdadm -S $md0 + +mdadm --assemble -R $md0 --bitmap=$bmf $dev1 $dev2 $dev4 +mdadm --zero $dev3 # force --add, not --re-add +mdadm $md0 --add $dev3 +check recovery + +dirty4=`mdadm -X $bmf | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'` +check wait +sleep 4 +dirty5=`mdadm -X $bmf | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'` + +if [ $dirty4 -lt 400 -o $dirty5 -ne 0 ] +then echo echo >&2 "ERROR bad 'dirty' counts at end: $dirty4 $dirty5" + exit 1 +fi + +mdadm -S $md0 diff --git a/tests/05r6tor0 b/tests/05r6tor0 new file mode 100644 index 00000000..2fd51f2e --- /dev/null +++ b/tests/05r6tor0 @@ -0,0 +1,27 @@ +set -x -e + +# reshape a RAID6 to RAID5 and then RAID0. +# then reshape back up to RAID5 and RAID5 + +mdadm -CR $md0 -l6 -n5 $dev0 $dev1 $dev2 $dev3 $dev4 +check wait; sleep 1 +check raid6 +testdev $md0 3 19456 512 +mdadm -G $md0 -l5 +check wait; sleep 1 +check raid5 +testdev $md0 3 19456 512 +mdadm -G $md0 -l0 +check wait; sleep 1 +check raid0 +testdev $md0 3 19456 512 +mdadm -G $md0 -l5 --add $dev3 $dev4 +check wait; sleep 1 +check raid5 +check algorithm 2 +testdev $md0 3 19456 512 +mdadm -G $md0 -l 6 +check wait; sleep 1 +check raid6 +check algorithm 2 +testdev $md0 3 19456 512 diff --git a/tests/06name b/tests/06name new file mode 100644 index 00000000..4d5e824d --- /dev/null +++ b/tests/06name @@ -0,0 +1,12 @@ +set -x + +# create an array with a name + +mdadm -CR $md0 -l0 -n2 --metadata=1 --name="Fred" $dev0 $dev1 +mdadm -E $dev0 | grep 'Name : [^:]*:Fred ' > /dev/null || exit 1 +mdadm -D $md0 | grep 'Name : [^:]*:Fred ' > /dev/null || exit 1 +mdadm -S $md0 + +mdadm -A $md0 --name="Fred" $devlist +#mdadm -Db $md0 +mdadm -S $md0 diff --git a/tests/06sysfs b/tests/06sysfs new file mode 100644 index 00000000..af63ef45 --- /dev/null +++ b/tests/06sysfs @@ -0,0 +1,11 @@ +exit 0 +mdadm -CR $md0 -l1 -n3 $dev1 $dev2 $dev3 + +ls -Rl /sys/block/md0 + +cat /sys/block/md0/md/level +cat /sys/block/md0/md/raid_disks + +mdadm -S $md0 + +exit 1 diff --git a/tests/06wrmostly b/tests/06wrmostly new file mode 100644 index 00000000..968c1974 --- /dev/null +++ b/tests/06wrmostly @@ -0,0 +1,13 @@ + +# create a raid1 array with a wrmostly device + +mdadm -CR $md0 -l1 -n3 $dev0 $dev1 --write-mostly $dev2 +testdev $md0 1 $mdsize1a 64 + +# unfortunately, we cannot measure if any read requests are going to $dev2 + +mdadm -S $md0 + +mdadm -CR $md0 -l1 -n3 --write-behind --bitmap=internal --bitmap-chunk=4 $dev0 $dev1 --write-mostly $dev2 +testdev $md0 1 $mdsize1a 64 +mdadm -S $md0 diff --git a/tests/07autoassemble b/tests/07autoassemble new file mode 100644 index 00000000..e689be7c --- /dev/null +++ b/tests/07autoassemble @@ -0,0 +1,24 @@ + +# create two raid1s, build a raid0 on top, then +# tear it down and get auto-assemble to rebuild it. + +mdadm -CR $md1 -l1 -n2 $dev0 $dev1 --homehost=testing +mdadm -CR $md2 -l1 -n2 $dev2 $dev3 --homehost=testing +mdadm -CR $md0 -l0 -n2 $md1 $md2 --homehost=testing + +mdadm -Ss +mdadm -As -c /dev/null --homehost=testing -vvv +testdev $md1 1 $mdsize1a 64 +testdev $md2 1 $mdsize1a 64 +testdev $md0 2 $mdsize11a 512 +mdadm -Ss + +mdadm --zero-superblock $dev0 $dev1 $dev2 $dev3 +## Now the raid0 uses one stacked and one not +mdadm -CR $md1 -l1 -n2 $dev0 $dev1 --homehost=testing +mdadm -CR $md0 -l0 -n2 $md1 $dev2 --homehost=testing +mdadm -Ss +mdadm -As -c /dev/null --homehost=testing -vvv +testdev $md1 1 $mdsize1a 64 +testdev $md0 1 $[mdsize1a+mdsize11a] 512 +mdadm -Ss diff --git a/tests/07autodetect b/tests/07autodetect new file mode 100644 index 00000000..917e0d66 --- /dev/null +++ b/tests/07autodetect @@ -0,0 +1,34 @@ + +# +# Test in-kernel autodetect. +# Create a partitionable array on each of two devices, +# put a partition on each, create an array, and see if we can +# use autodetect to restart the array. + +if lsmod | grep md_mod > /dev/null 2>&1 +then + echo md is a module - cannot test autodetect + exit 0 +fi + + +mdadm -CR -e 0 $mdp0 -l0 -f -n1 $dev0 +mdadm -CR -e 0 $mdp1 -l0 -f -n1 $dev1 +udevadm settle +sfdisk $mdp0 >&2 << END +,,FD +END +sfdisk $mdp1 >&2 << END +,,FD +END +udevadm settle +mdadm -CR -e 0 $md0 -l1 -n2 ${mdp0}p1 ${mdp1}p1 +check resync +check raid1 +check wait +mdadm -S $md0 +mdadm --auto-detect +check raid1 + +mdadm -Ss +exit 0 diff --git a/tests/07changelevelintr b/tests/07changelevelintr new file mode 100644 index 00000000..18c63092 --- /dev/null +++ b/tests/07changelevelintr @@ -0,0 +1,61 @@ + +# +# test that we can stop and restart a level change. +# just test a few in-place changes, and a few +# size-reducing changes. + + +checkgeo() { + # check the geometry of an array + # level raid_disks chunk_size layout + dev=$1 + shift + sleep 0.5 + check wait + sleep 1 + for attr in level raid_disks chunk_size layout + do + if [ $# -gt 0 ] ; then + val=$1 + shift + if [ " `cat /sys/block/$dev/md/$attr`" != " $val" ] + then echo "$attr doesn't match for $dev" + exit 1 + fi + fi + done +} + +restart() { + sleep 0.5 + check reshape + mdadm -S $md0 + mdadm -A $md0 $devs --backup-file=$bu + sleep 0.5 + check reshape +} + +bu=/tmp/md-backup +rm -f $bu +devs="$dev0 $dev1 $dev2 $dev3 $dev4" +mdadm -CR $md0 -l5 -n5 -c 256 $devs +checkgeo md0 raid5 5 $[256*1024] 2 + +mdadm -G $md0 -c 128 --backup-file=$bu +restart +checkgeo md0 raid5 5 $[128*1024] 2 + +mdadm -G $md0 --layout rs --backup-file=$bu +restart +checkgeo md0 raid5 5 $[128*1024] 3 + +mdadm -G $md0 --array-size 58368 +mdadm -G $md0 --raid-disks 4 -c 64 --backup-file=$bu +restart +checkgeo md0 raid5 4 $[64*1024] 3 + +devs="$dev0 $dev1 $dev2 $dev3" +mdadm -G $md0 --array-size 19456 +mdadm -G $md0 -n 2 -c 256 --backup-file=$bu +restart +checkgeo md0 raid5 2 $[256*1024] 3 diff --git a/tests/07changelevels b/tests/07changelevels new file mode 100644 index 00000000..a328874a --- /dev/null +++ b/tests/07changelevels @@ -0,0 +1,114 @@ + +# Test changing of level, chunksize etc. +# Create a RAID1, convert to RAID5, add a disk, add another disk +# convert to RAID6, back to RAID5 and ultimately to RAID1 + +testK=$[64*3*6] +dd if=/dev/urandom of=/tmp/RandFile bs=1024 count=$testK +export MDADM_GROW_VERIFY=1 + +dotest() { + sleep 2 + check wait + testdev $md0 $1 19968 64 nd + blockdev --flushbufs $md0 + cmp -s -n $[textK*1024] $md0 /tmp/RandFile || { echo cmp failed; exit 2; } + # write something new - shift chars 4 space + tr ' -~' '$-~ -#' < /tmp/RandFile > /tmp/RandFile2 + mv /tmp/RandFile2 /tmp/RandFile + dd if=/tmp/RandFile of=$md0 +} + +checkgeo() { + # check the geometry of an array + # level raid_disks chunk_size layout + dev=$1 + shift + sleep 0.5 + check wait + sleep 1 + for attr in level raid_disks chunk_size layout + do + if [ $# -gt 0 ] ; then + val=$1 + shift + if [ " `cat /sys/block/$dev/md/$attr`" != " $val" ] + then echo "$attr doesn't match for $dev" + exit 1 + fi + fi + done +} + + +bu=/tmp/md-test-backup +rm -f $bu +mdadm -CR $md0 -l1 -n2 -x1 $dev0 $dev1 $dev2 -z 19968 +testdev $md0 1 $mdsize1a 64 +dd if=/tmp/RandFile of=$md0 +dotest 1 + +mdadm --grow $md0 -l5 -n3 --chunk 64 +dotest 2 + +mdadm $md0 --add $dev3 $dev4 +mdadm --grow $md0 -n4 --chunk 32 +dotest 3 + +mdadm -G $md0 -l6 --backup-file $bu +dotest 3 + +mdadm -G /dev/md0 --array-size 39936 +mdadm -G $md0 -n4 --backup-file $bu +checkgeo md0 raid6 4 $[32*1024] +dotest 2 + +mdadm -G $md0 -l5 --backup-file $bu +checkgeo md0 raid5 3 $[32*1024] +dotest 2 + +mdadm -G /dev/md0 --array-size 19968 +mdadm -G $md0 -n2 --backup-file $bu +checkgeo md0 raid5 2 $[32*1024] +dotest 1 + +mdadm -G --level=1 $md0 +dotest 1 + +# now repeat that last few steps only with a degraded array. +mdadm -S $md0 +mdadm -CR $md0 -l6 -n5 $dev0 $dev1 $dev2 $dev3 $dev4 +dd if=/tmp/RandFile of=$md0 +dotest 3 + +mdadm $md0 --fail $dev0 + +mdadm -G /dev/md0 --array-size 37888 +mdadm -G $md0 -n4 --backup-file $bu +dotest 2 +checkgeo md0 raid6 4 $[512*1024] +mdadm $md0 --fail $dev4 + +mdadm $md0 --fail $dev3 +# now double-degraded. +# switch layout to a DDF layout and back to make sure that works. + +mdadm -G /dev/md0 --layout=ddf-N-continue --backup-file $bu +checkgeo md0 raid6 4 $[512*1024] 10 +dotest 2 +mdadm -G /dev/md0 --layout=ra --backup-file $bu +checkgeo md0 raid6 4 $[512*1024] 1 +dotest 2 + +mdadm -G $md0 -l5 --backup-file $bu +dotest 2 + +mdadm -G /dev/md0 --array-size 18944 +mdadm -G $md0 -n2 --backup-file $bu +dotest 1 +checkgeo md0 raid5 2 $[512*1024] +mdadm $md0 --fail $dev2 + +mdadm -G --level=1 $md0 +dotest 1 +checkgeo md0 raid1 2 diff --git a/tests/07layouts b/tests/07layouts new file mode 100644 index 00000000..acd1a800 --- /dev/null +++ b/tests/07layouts @@ -0,0 +1,91 @@ + +# check that kernel an restripe interpret all the different layouts +# the same +# This involves changing the layout to each different possibility +# while MDADM_GROW_VERIFY is set. + +testK=$[64*3*6] +dd if=/dev/urandom of=/tmp/RandFile bs=1024 count=$testK +export MDADM_GROW_VERITY=1 + + +dotest() { + sleep 0.5 + check wait + testdev $md0 $1 $mdsize1 512 nd + blockdev --flushbufs $md0 + cmp -s -n $[textK*1024] $md0 /tmp/RandFile || { echo cmp failed; exit 2; } + # write something new - shift chars 4 space + tr ' -~' '$-~ -#' < /tmp/RandFile > /tmp/RandFile2 + mv /tmp/RandFile2 /tmp/RandFile + dd if=/tmp/RandFile of=$md0 +} + +checkgeo() { + # check the geometry of an array + # level raid_disks chunk_size layout + dev=$1 + shift + sleep 0.5 + check wait + for attr in level raid_disks chunk_size layout + do + if [ $# -gt 0 ] ; then + val=$1 + shift + if [ " `sed 's/ .*//' /sys/block/$dev/md/$attr`" != " $val" ] + then echo "$attr doesn't match for $dev" + exit 1 + fi + fi + done +} + + +bu=/tmp/md-test-backup +rm -f $bu + +# first a degraded 5 device raid5 +mdadm -CR $md0 -l5 -n5 $dev0 $dev1 missing $dev2 $dev3 +dd if=/tmp/RandFile of=$md0 +dotest 4 + +l5[0]=la +l5[1]=ra +l5[2]=ls +l5[3]=rs +l5[4]=parity-first +l5[5]=parity-last +for layout in 0 1 2 3 4 5 0 +do + mdadm -G $md0 --layout=${l5[$layout]} --backup-file $bu + checkgeo md0 raid5 5 $[512*1024] $layout + dotest 4 +done + +mdadm -S $md0 +# now a doubly degraded raid6 +mdadm -CR $md0 -l6 -n5 $dev0 missing $dev2 missing $dev4 +dd if=/tmp/RandFile of=$md0 +dotest 3 + +l6[0]=la +l6[1]=ra +l6[2]=ls +l6[3]=rs +l6[4]=parity-first +l6[5]=parity-last +l6[8]=ddf-zero-restart +l6[9]=ddf-N-restart +l6[10]=ddf-N-continue +l6[16]=left-asymmetric-6 +l6[17]=right-asymmetric-6 +l6[18]=left-symmetric-6 +l6[19]=right-symmetric-6 +l6[20]=parity-first-6 +for layout in 0 1 2 3 4 5 8 9 10 16 17 18 19 20 0 +do + mdadm -G $md0 --layout=${l6[$layout]} --backup-file $bu + checkgeo md0 raid6 5 $[512*1024] $layout + dotest 3 +done diff --git a/tests/07reshape5intr b/tests/07reshape5intr new file mode 100644 index 00000000..0f4803ac --- /dev/null +++ b/tests/07reshape5intr @@ -0,0 +1,41 @@ + +# +# test interrupting and restarting raid5 reshape. +set -x +devs="$dev1" +st=UU +for disks in 2 3 4 5 +do + eval devs=\"$devs \$dev$disks\" + st=U$st + for d in $devs + do dd if=/dev/urandom of=$d bs=1024 || true + done + + case $disks in + 2 | 3) chunk=1024;; + 4 ) chunk=512;; + 5 ) chunk=256;; + esac + + mdadm -CR $md0 -amd -l5 -c $chunk -n$disks --assume-clean $devs + mdadm $md0 --add $dev6 + echo 20 > /proc/sys/dev/raid/speed_limit_min + echo 20 > /proc/sys/dev/raid/speed_limit_max + mdadm --grow $md0 -n $[disks+1] + check reshape + check state $st + mdadm --stop $md0 + mdadm --assemble $md0 $devs $dev6 + check reshape + echo 1000 > /proc/sys/dev/raid/speed_limit_min + echo 2000 > /proc/sys/dev/raid/speed_limit_max + check wait + while ! echo check > /sys/block/md0/md/sync_action; do sleep 0.1; done + check wait + mm=`cat /sys/block/md0/md/mismatch_cnt` + if [ $mm -gt 0 ] + then echo >&2 "ERROR mismatch_cnt non-zero : $mm" ; exit 1 + fi + mdadm -S $md0 +done diff --git a/tests/07revert-grow b/tests/07revert-grow new file mode 100644 index 00000000..c8c4e855 --- /dev/null +++ b/tests/07revert-grow @@ -0,0 +1,52 @@ +set -e -x + +# revert a reshape that is increasing the number of devices, +# raid5, raid6, and raid10 + +# metadate 0.90 cannot handle RAID10 growth +# metadata 1.0 doesn't get a default headspace, is don't try it either. + +for metadata in 0.90 1.1 1.2 +do +# RAID5 +mdadm -CR --assume-clean $md0 -l5 -n4 -x1 $devlist4 --metadata=$metadata +check raid5 +testdev $md0 3 $mdsize1 512 +mdadm -G $md0 -n 5 +sleep 3 +mdadm -S $md0 +mdadm -A $md0 --update=revert-reshape $devlist4 --backup-file=/tmp/md-backup +check wait +check raid5 +testdev $md0 3 $mdsize1 512 +mdadm -S $md0 + +# RAID6 +mdadm -CR --assume-clean $md0 -l6 -n4 -x1 $devlist4 --metadata=$metadata +check raid6 +testdev $md0 2 $mdsize1 512 +mdadm -G $md0 -n 5 +sleep 3 +mdadm -S $md0 +mdadm -A $md0 --update=revert-reshape $devlist4 --backup-file=/tmp/md-backup +check wait +check raid6 +testdev $md0 2 $mdsize1 512 +mdadm -S $md0 + +if [ $metadata = 0.90 ]; then continue; fi + +# RAID10 +mdadm -CR --assume-clean $md0 -l10 -n4 -x1 $devlist4 --metadata=$metadata +check raid10 +testdev $md0 2 $mdsize1 512 +mdadm -G $md0 -n 5 +sleep 3 +mdadm -S $md0 +strace -o /tmp/str ./mdadm -A $md0 --update=revert-reshape $devlist4 +check wait +check raid10 +testdev $md0 2 $mdsize1 512 +mdadm -S $md0 + +done diff --git a/tests/07revert-inplace b/tests/07revert-inplace new file mode 100644 index 00000000..a73eb977 --- /dev/null +++ b/tests/07revert-inplace @@ -0,0 +1,44 @@ +set -e -x + +# revert a reshape that is not changing the number of data devices, +# raid5, raid6, and raid10 + +# RAID5 -> RAID6 +mdadm -CR --assume-clean $md0 -l5 -n4 -x1 $devlist4 +check raid5 +testdev $md0 3 $mdsize1 512 +mdadm -G $md0 -l 6 +sleep 2 +mdadm -S $md0 +mdadm -A $md0 --update=revert-reshape $devlist4 --backup-file=/tmp/md-backup +check wait +check raid6 +check algorithm 18 +testdev $md0 3 $mdsize1 512 +mdadm -S $md0 + +# RAID6 -> RAID5 +mdadm -CR --assume-clean $md0 -l6 -n5 $devlist4 +check raid6 +testdev $md0 3 $mdsize1 512 +mdadm -G $md0 -l 5 +sleep 2 +mdadm -S $md0 +mdadm -A $md0 --update=revert-reshape $devlist4 --backup-file=/tmp/md-backup +check wait +check raid6 +testdev $md0 3 $mdsize1 512 +mdadm -S $md0 + +# RAID10 - decrease chunk size +mdadm -CR --assume-clean $md0 -l10 -n6 -c 64 $devlist5 +check raid10 +testdev $md0 3 $mdsize1 64 +mdadm -G $md0 -c 32 +sleep 2 +mdadm -S $md0 +strace -o /tmp/str ./mdadm -A $md0 --update=revert-reshape $devlist5 +check wait +check raid10 +testdev $md0 3 $mdsize1 64 +mdadm -S $md0 diff --git a/tests/07revert-shrink b/tests/07revert-shrink new file mode 100644 index 00000000..62b5ae02 --- /dev/null +++ b/tests/07revert-shrink @@ -0,0 +1,56 @@ +set -e -x + +# revert a reshape that is decreasing the number of devices, +# raid5, raid6, and raid10 + +bu=$targetdir/md-backup +rm -f $bu +# RAID5 +mdadm -CR --assume-clean $md0 -l5 -n5 $devlist4 +check raid5 +testdev $md0 4 $mdsize1 512 +mdadm --grow $md0 --array-size 56832 +testdev $md0 3 $mdsize1 512 +mdadm -G $md0 -n 4 --backup=$bu +sleep 3 +mdadm -S $md0 +mdadm -A $md0 --update=revert-reshape $devlist4 --backup-file=$bu +check wait +check raid5 +fsck -f -n $md0 +testdev $md0 4 $mdsize1 512 +mdadm -S $md0 + +#FIXME +rm -f $bu +# RAID6 +mdadm -CR --assume-clean $md0 -l6 -n5 $devlist4 +check raid6 +testdev $md0 3 $mdsize1 512 +mdadm --grow $md0 --array-size 37888 +testdev $md0 2 $mdsize1 512 +mdadm -G $md0 -n 4 --backup=$bu +sleep 2 +mdadm -S $md0 +mdadm -A $md0 --update=revert-reshape $devlist4 --backup-file=$bu +check wait +check raid6 +fsck -f -n $md0 +testdev $md0 3 $mdsize1 512 +mdadm -S $md0 + +# RAID10 +mdadm -CR --assume-clean $md0 -l10 -n6 $devlist5 +check raid10 +testdev $md0 3 $mdsize1 512 +mdadm --grow $md0 --array-size 36864 +testdev $md0 2 $mdsize1 512 +mdadm -G $md0 -n 4 +sleep 3 +mdadm -S $md0 +mdadm -A $md0 --update=revert-reshape $devlist5 +check wait +check raid10 +fsck -f -n $md0 +testdev $md0 3 $mdsize1 512 +mdadm -S $md0 diff --git a/tests/07testreshape5 b/tests/07testreshape5 new file mode 100644 index 00000000..0e1f25f9 --- /dev/null +++ b/tests/07testreshape5 @@ -0,0 +1,45 @@ + +# +# test the reshape code by using test_reshape and the +# kernel md code to move data into and out of variously +# shaped md arrays. +set -x +layouts=(la ra ls rs) +for level in 5 6 +do +for chunk in 4 8 16 32 64 128 +do + devs="$dev1" + for disks in 2 3 4 5 6 + do + eval devs=\"$devs \$dev$disks\" + if [ " $level $disks" = " 6 3" -o " $level $disks" = " 6 2" ] + then continue + fi + for nlayout in 0 1 2 3 + do + layout=${layouts[$nlayout]} + + size=$[chunk*(disks-(level-4))*disks] + + # test restore: make a raid5 from a file, then do a compare + dd if=/dev/urandom of=/tmp/RandFile bs=1024 count=$size + $dir/test_stripe restore /tmp/RandFile $disks $[chunk*1024] $level $nlayout 0 $[size*1024] $devs + mdadm -CR -e 1.0 $md0 -amd -l$level -n$disks --assume-clean -c $chunk -p $layout $devs + cmp -s -n $[size*1024] $md0 /tmp/RandFile || { echo cmp failed ; exit 2; } + + # FIXME check parity + + # test save + dd if=/dev/urandom of=$md0 bs=1024 count=$size + blockdev --flushbufs $md0 $devs; sync + > /tmp/NewRand + $dir/test_stripe save /tmp/NewRand $disks $[chunk*1024] $level $nlayout 0 $[size*1024] $devs + cmp -s -n $[size*1024] $md0 /tmp/NewRand || { echo cmp failed ; exit 2; } + mdadm -S $md0 + udevadm settle + done + done +done +done +exit 0 diff --git a/tests/09imsm-assemble b/tests/09imsm-assemble new file mode 100644 index 00000000..d7028c62 --- /dev/null +++ b/tests/09imsm-assemble @@ -0,0 +1,73 @@ +# validate the prodigal member disk scenario i.e. a former container +# member is returned after having been rebuilt on another system + + +imsm_check_hold() { + if mdadm --remove $1 $2; then + echo "$2 removal from $1 should have been blocked" >&2 + cat /proc/mdstat >&2 + mdadm -E $2 + exit 1 + fi +} + +imsm_check_removal() { + if ! mdadm --remove $1 $2 ; then + echo "$2 removal from $1 should have succeeded" >&2 + cat /proc/mdstat >&2 + mdadm -E $2 + exit 1 + fi +} + +export IMSM_DEVNAME_AS_SERIAL=1 +export IMSM_TEST_OROM=1 +export IMSM_NO_PLATFORM=1 +container=/dev/md/container +member=/dev/md/vol0 + + +num_disks=4 +size=$((10*1024)) +mdadm -CR $container -e imsm -n $num_disks $dev0 $dev1 $dev2 $dev3 +mdadm -CR $member $dev0 $dev2 -n 2 -l 1 -z $size +mdadm --wait $member || true +mdadm -Ss + +# make dev0 and dev1 a new rebuild family +mdadm -A $container $dev0 $dev1 +mdadm -IR $container +mdadm --wait ${member}_0 || true +mdadm -Ss + +# make dev2 and dev3 a new rebuild family +mdadm -A $container $dev2 $dev3 +mdadm -IR $container +mdadm --wait ${member}_0 || true +mdadm -Ss + +# reassemble and make sure one of the families falls out +mdadm -A $container $dev0 $dev1 $dev2 $dev3 +mdadm -IR $container +testdev ${member}_0 1 $size 64 +if mdadm --remove $container $dev0 ; then + # the dev[23] family won + imsm_check_removal $container $dev1 + imsm_check_hold $container $dev2 + imsm_check_hold $container $dev3 +else + # the dev[01] family won + imsm_check_hold $container $dev1 + imsm_check_removal $container $dev2 + imsm_check_removal $container $dev3 +fi +mdadm -Ss + +# reassemble with a new id for the dev[23] family +mdadm -A $container $dev0 $dev1 +mdadm -IR $container +mdadm -A ${container}2 $dev2 $dev3 --update=uuid +mdadm -IR ${container}2 + +testdev ${member}_0 1 $size 64 +testdev ${member}_1 1 $size 64 diff --git a/tests/09imsm-create-fail-rebuild b/tests/09imsm-create-fail-rebuild new file mode 100644 index 00000000..de17f321 --- /dev/null +++ b/tests/09imsm-create-fail-rebuild @@ -0,0 +1,78 @@ +# sanity check array creation + +imsm_check_hold() { + if mdadm --remove $1 $2; then + echo "$2 removal from $1 should have been blocked" >&2 + cat /proc/mdstat >&2 + mdadm -E $2 + exit 1 + fi +} + +imsm_check_removal() { + if ! mdadm --remove $1 $2 ; then + echo "$2 removal from $1 should have succeeded" >&2 + cat /proc/mdstat >&2 + mdadm -E $2 + exit 1 + fi +} + +. tests/env-imsm-template + +# IMSM rounds to multiples of one mebibyte - 1024K +DEV_ROUND_K=1024 + +num_disks=2 +mdadm -CR $container -e imsm -n $num_disks $dev0 $dev1 +imsm_check container $num_disks + +# RAID0 + RAID1 +size=9000 +level=0 +chunk=64 +offset=0 +mdadm -CR $member0 $dev0 $dev1 -n $num_disks -l $level -z $size -c $chunk +imsm_check member $member0 $num_disks $level $size $((size*2)) $offset $chunk +testdev $member0 $num_disks $size $chunk + +offset=$(((size & ~(chunk - 1)) + 2048)) +size=4000 +level=1 +chunk=0 +mdadm -CR $member1 $dev0 $dev1 -n $num_disks -l $level -z $size +imsm_check member $member1 $num_disks $level $size $size $offset $chunk +testdev $member1 1 $size 64 +check wait + +mdadm -Ss + +# RAID10 + RAID5 +num_disks=4 +mdadm -CR $container -e imsm -n $num_disks $dev0 $dev1 $dev2 $dev3 +imsm_check container $num_disks + +size=9000 +level=10 +chunk=64 +offset=0 +mdadm -CR $member0 $dev0 $dev1 $dev2 $dev3 -n $num_disks -l $level -z $size -c $chunk +imsm_check member $member0 $num_disks $level $size $((size*2)) $offset $chunk +testdev $member0 $((num_disks-2)) $size $chunk + +offset=$(((size & ~(chunk - 1)) + 2048)) +size=4000 +level=5 +mdadm -CR $member1 $dev0 $dev1 $dev2 $dev3 -n $num_disks -l $level -z $size -c $chunk +imsm_check member $member1 $num_disks $level $size $((size*3)) $offset $chunk +testdev $member1 $((num_disks-1)) $size $chunk +check wait + +# FAIL / REBUILD +imsm_check_hold $container $dev0 +mdadm --fail $member0 $dev0 +mdadm --wait-clean --scan || true +imsm_check_removal $container $dev0 +mdadm --add $container $dev4 +check wait +imsm_check_hold $container $dev4 diff --git a/tests/09imsm-overlap b/tests/09imsm-overlap new file mode 100644 index 00000000..e832257c --- /dev/null +++ b/tests/09imsm-overlap @@ -0,0 +1,30 @@ + +. tests/env-imsm-template + +# create raid arrays with varying degress of overlap +mdadm -CR $container -e imsm -n 6 $dev0 $dev1 $dev2 $dev3 $dev4 $dev5 +imsm_check container 6 + +size=1910 +level=1 +num_disks=2 +mdadm -CR $member0 $dev0 $dev1 -n $num_disks -l $level -z $size +mdadm -CR $member1 $dev1 $dev2 -n $num_disks -l $level -z $size +mdadm -CR $member2 $dev2 $dev3 -n $num_disks -l $level -z $size +mdadm -CR $member3 $dev3 $dev4 -n $num_disks -l $level -z $size +mdadm -CR $member4 $dev4 $dev5 -n $num_disks -l $level -z $size + +udevadm settle + +offset=0 +imsm_check member $member0 $num_disks $level $size 1024 $offset +offset=$((offset+size+2048)) +imsm_check member $member1 $num_disks $level $size 1024 $offset +offset=$((offset+size+2048)) +imsm_check member $member2 $num_disks $level $size 1024 $offset +# at this point there should be more freespace at the start of the disk +# than the end +offset=0 +imsm_check member $member3 $num_disks $level $size 1024 $offset +offset=$((offset+size+2048)) +imsm_check member $member4 $num_disks $level $size 1024 $offset diff --git a/tests/10ddf-assemble-missing b/tests/10ddf-assemble-missing new file mode 100644 index 00000000..4bf21b25 --- /dev/null +++ b/tests/10ddf-assemble-missing @@ -0,0 +1,61 @@ +# An array is assembled incompletely. +# Re missing disks get marked as missing and are not allowed back in + +. tests/env-ddf-template +tmp=$(mktemp /tmp/mdtest-XXXXXX) +rm -f $tmp /var/tmp/mdmon.log +ret=0 + +mdadm -CR $container -e ddf -n 4 $dev8 $dev9 $dev10 $dev11 +ddf_check container 4 + +mdadm -CR $member1 -n 4 -l 10 $dev8 $dev10 $dev9 $dev11 -z 10000 +mdadm -CR $member0 -n 2 -l 1 $dev8 $dev9 -z 10000 + +mdadm --wait $member0 || true +mdadm --wait $member1 || true + +mdadm -Ss +sleep 1 + +# Add all devices except those for $member0 +mdadm -I $dev10 +mdadm -I $dev11 + +# Start runnable members +mdadm -IRs || true +mdadm -Ss + +#[ -f /var/tmp/mdmon.log ] && cat /var/tmp/mdmon.log + +# Now reassemble +# This should work because BVDs weren't written to +for d in $dev8 $dev9 $dev10 $dev11; do + mdadm -I $d +done +mdadm -Ss + +# Expect consistent state +for d in $dev10 $dev11; do + mdadm -E $d>$tmp + egrep 'state\[0\] : Degraded, Consistent' $tmp || { + ret=1 + echo ERROR: $member0 has unexpected state on $d + } + egrep 'state\[1\] : Optimal, Consistent' $tmp || { + ret=1 + echo ERROR: $member1 has unexpected state on $d + } + + if [ x$(egrep -c 'active/Online$' $tmp) != x2 ]; then + ret=1 + echo ERROR: unexpected number of online disks on $d + fi +done + +if [ $ret -ne 0 ]; then + mdadm -E $dev10 + mdadm -E $dev8 +fi +rm -f $tmp /var/tmp/mdmon.log +[ $ret -eq 0 ] diff --git a/tests/10ddf-create b/tests/10ddf-create new file mode 100644 index 00000000..44e95441 --- /dev/null +++ b/tests/10ddf-create @@ -0,0 +1,89 @@ +# +# Test basic DDF functionality. +# +# Create a container with 5 drives +# create a small raid0 across them all, +# then a small raid10 using 4 drives, then a 2disk raid1 +# and a 3disk raid5 using the remaining space +# +# add some data, tear down the array, reassemble +# and make sure it is still there. +set -e +. tests/env-ddf-template +sda=$(get_rootdev) || exit 1 + +mdadm -CR /dev/md/ddf0 -e ddf -n 5 $dev8 $dev9 $dev10 $dev11 $dev12 +mdadm -CR r5 -l5 -n5 /dev/md/ddf0 -z 5000 +if mdadm -CR r5 -l1 -n2 /dev/md/ddf0 -z 5000 +then echo >&2 create with same name should fail ; exit 1 +fi +mdadm -CR r10 -l10 -n4 -pn2 /dev/md/ddf0 -z 5000 +mdadm -CR r1 -l1 -n2 /dev/md/ddf0 +mdadm -CR r0 -l0 -n3 /dev/md/ddf0 +testdev /dev/md/r5 4 5000 512 +testdev /dev/md/r10 2 5000 512 +# r0/r10 will use 4608 due to chunk size, so that leaves 23552 for the rest +testdev /dev/md/r1 1 23552 64 +testdev /dev/md/r0 3 23552 512 +dd if=$sda of=/dev/md/r0 || true +dd if=$sda of=/dev/md/r10 || true +dd if=$sda of=/dev/md/r1 || true +dd if=$sda of=/dev/md/r5 || true + +s0=`sha1sum /dev/md/r0` +s10=`sha1sum /dev/md/r10` +s1=`sha1sum /dev/md/r1` +s5=`sha1sum /dev/md/r5` + + +mdadm -Ss +mdadm -A /dev/md/ddf0 $dev8 $dev9 $dev10 $dev11 $dev12 +mdadm -I /dev/md/ddf0 + +udevadm settle +s0a=`sha1sum /dev/md/r0` +s10a=`sha1sum /dev/md/r10` +s1a=`sha1sum /dev/md/r1` +s5a=`sha1sum /dev/md/r5` + +if [ "$s0" != "$s0a" ]; then + echo r0 did not match ; exit 1; +fi +if [ "$s10" != "$s10a" ]; then + echo r10 did not match ; exit 1; +fi +if [ "$s1" != "$s1a" ]; then + echo r1 did not match ; exit 1; +fi +if [ "$s5" != "$s5a" ]; then + echo r5 did not match ; exit 1; +fi + +# failure status just means it has completed already, so ignore it. +mdadm --wait /dev/md/r1 || true +mdadm --wait /dev/md/r10 || true +mdadm --wait /dev/md/r5 || true + +mdadm -Dbs > /var/tmp/mdadm.conf + +mdadm -Ss + +# Now try to assemble using mdadm.conf +mdadm -Asc /var/tmp/mdadm.conf +check nosync # This failed once. The raid5 was resyncing. +udevadm settle +mdadm -Dbs | sort > /tmp/mdadm.conf +sort /var/tmp/mdadm.conf | diff /tmp/mdadm.conf - +mdadm -Ss + +# and now assemble fully incrementally. +for i in $dev8 $dev9 $dev10 $dev11 $dev12 +do + mdadm -I $i -c /var/tmp/mdadm.conf +done +check nosync +udevadm settle +mdadm -Dbs | sort > /tmp/mdadm.conf +sort /var/tmp/mdadm.conf | diff /tmp/mdadm.conf - +mdadm -Ss +rm /tmp/mdadm.conf /var/tmp/mdadm.conf diff --git a/tests/10ddf-create-fail-rebuild b/tests/10ddf-create-fail-rebuild new file mode 100644 index 00000000..a8e8ced9 --- /dev/null +++ b/tests/10ddf-create-fail-rebuild @@ -0,0 +1,77 @@ +# sanity check array creation + +ddf_check_hold() { + if mdadm --remove $1 $2; then + echo "$2 removal from $1 should have been blocked" >&2 + cat /proc/mdstat >&2 + mdadm -E $2 + exit 1 + fi +} + +ddf_check_removal() { + if ! mdadm --remove $1 $2 ; then + echo "$2 removal from $1 should have succeeded" >&2 + cat /proc/mdstat >&2 + mdadm -E $2 + exit 1 + fi +} + +. tests/env-ddf-template + +num_disks=2 +mdadm -CR $container -e ddf -n $num_disks $dev8 $dev9 +ddf_check container $num_disks + +# RAID0 + RAID1 +size=9000 +level=0 +chunk=64 +offset=0 +layout=0 +mdadm -CR $member0 $dev8 $dev9 -n $num_disks -l $level -z $size -c $chunk +ddf_check member $member0 $num_disks $level $size $((size*2)) $offset $chunk $layout +testdev $member0 $num_disks $size $chunk + +offset=$(((size & ~(chunk - 1)))) +size=4000 +level=1 +chunk=0 +mdadm -CR $member1 $dev8 $dev9 -n $num_disks -l $level -z $size +ddf_check member $member1 $num_disks $level $size $size $offset $chunk $layout +testdev $member1 1 $size 1 +check wait + +mdadm -Ss + +# RAID10 + RAID5 +num_disks=4 +mdadm -CR $container -e ddf -n $num_disks $dev8 $dev9 $dev10 $dev11 +ddf_check container $num_disks + +size=9000 +level=10 +chunk=64 +offset=0 +layout=2 +mdadm -CR $member0 $dev8 $dev9 $dev10 $dev11 -n $num_disks -l $level -z $size -c $chunk +ddf_check member $member0 $num_disks $level $size $((size*2)) $offset $chunk $layout +testdev $member0 $((num_disks-2)) $size $chunk + +offset=$(((size & ~(chunk - 1)))) +size=4000 +level=5 +mdadm -CR $member1 $dev8 $dev9 $dev10 $dev11 -n $num_disks -l $level -z $size -c $chunk +ddf_check member $member1 $num_disks $level $size $((size*3)) $offset $chunk $layout +testdev $member1 $((num_disks-1)) $size $chunk +check wait + +# FAIL / REBUILD +ddf_check_hold $container $dev8 +mdadm --fail $member0 $dev8 +mdadm --wait-clean --scan || true +ddf_check_removal $container $dev8 +mdadm --add $container $dev12 +check wait +ddf_check_hold $container $dev12 diff --git a/tests/10ddf-fail-create-race b/tests/10ddf-fail-create-race new file mode 100644 index 00000000..bd5dfb51 --- /dev/null +++ b/tests/10ddf-fail-create-race @@ -0,0 +1,66 @@ +# This test creates a RAID1, fails a disk, and immediately +# (simultaneously) creates a new array. This tests for a possible +# race where the meta data reflecting the disk failure may not +# be written when the 2nd array is created. +. tests/env-ddf-template + +mdadm --zero-superblock $dev8 $dev9 $dev10 $dev11 $dev12 $dev13 + +mdadm -CR $container -e ddf -l container -n 2 $dev11 $dev12 +#$dir/mdadm -CR $member0 -l raid1 -n 2 $container -z 10000 >/tmp/mdmon.txt 2>&1 +mdadm -CR $member0 -l raid1 -n 2 $container -z 10000 +check wait +fail0=$dev11 +mdadm --fail $member0 $fail0 & + +# The test can succeed two ways: +# 1) mdadm -C member1 fails - in this case the meta data +# was already on disk when the create attempt was made +# 2) mdadm -C succeeds in the first place (meta data not on disk yet), +# but mdmon detects the problem and sets the disk faulty. + +if mdadm -CR $member1 -l raid1 -n 2 $container; then + + echo create should have failed / race condition? + + check wait + set -- $(get_raiddisks $member0) + d0=$1 + ret=0 + if [ $1 = $fail0 -o $2 = $fail0 ]; then + ret=1 + else + set -- $(get_raiddisks $member1) + if [ $1 = $fail0 -o $2 = $fail0 ]; then + ret=1 + fi + fi + if [ $ret -eq 1 ]; then + echo ERROR: failed disk $fail0 is still a RAID member + echo $member0: $(get_raiddisks $member0) + echo $member1: $(get_raiddisks $member1) + fi + tmp=$(mktemp /tmp/mdest-XXXXXX) + mdadm -E $d0 >$tmp + if [ x$(grep -c 'state\[[01]\] : Degraded' $tmp) != x2 ]; then + echo ERROR: non-degraded array found + mdadm -E $d0 + ret=1 + fi + if ! grep -q '^ *0 *[0-9a-f]\{8\} .*Offline, Failed' $tmp; then + echo ERROR: disk 0 not marked as failed in meta data + mdadm -E $d0 + ret=1 + fi + rm -f $tmp +else + ret=0 +fi + +[ -f /tmp/mdmon.txt ] && { + cat /tmp/mdmon.txt + rm -f /tmp/mdmon.txt +} + +[ $ret -eq 0 ] + diff --git a/tests/10ddf-fail-readd b/tests/10ddf-fail-readd new file mode 100644 index 00000000..9cd78937 --- /dev/null +++ b/tests/10ddf-fail-readd @@ -0,0 +1,55 @@ +# Simple fail / re-add test +. tests/env-ddf-template + +tmp=$(mktemp /tmp/mdtest-XXXXXX) +rm -f $tmp + +mdadm --zero-superblock $dev8 $dev9 +mdadm -CR $container -e ddf -l container -n 2 $dev8 $dev9 + +mdadm -CR $member0 -l raid1 -n 2 $container +#$dir/mdadm -CR $member0 -l raid1 -n 2 $container >/tmp/mdmon.txt 2>&1 + +mke2fs -F $member0 +check wait + +set -- $(get_raiddisks $member0) +fail0=$1 +mdadm $member0 --fail $fail0 + +sleep 1 +mdadm $container --remove $fail0 + +set -- $(get_raiddisks $member0) +case $1 in MISSING) shift;; esac +good0=$1 + +# We re-add the disk now +mdadm $container --add $fail0 + +sleep 1 +mdadm --wait $member0 || true + +ret=0 +set -- $(get_raiddisks $member0) +case $1:$2 in + $dev8:$dev9|$dev9:$dev8);; + *) echo ERROR: bad raid disks "$@"; ret=1;; +esac + +mdadm -Ss +for x in $@; do + mdadm -E $x >$tmp + if ! grep -q 'state\[0\] : Optimal, Consistent' $tmp; then + echo ERROR: member 0 should be optimal in meta data on $x + ret=1 + fi +done + +rm -f $tmp +if [ $ret -ne 0 ]; then + mdadm -E $dev8 + mdadm -E $dev9 +fi + +[ $ret -eq 0 ] diff --git a/tests/10ddf-fail-readd-readonly b/tests/10ddf-fail-readd-readonly new file mode 100644 index 00000000..6a74d9c8 --- /dev/null +++ b/tests/10ddf-fail-readd-readonly @@ -0,0 +1,71 @@ +# Simple fail / re-add test +. tests/env-ddf-template + +tmp=$(mktemp /tmp/mdtest-XXXXXX) +rm -f $tmp + +mdadm --zero-superblock $dev8 $dev9 +mdadm -CR $container -e ddf -l container -n 2 $dev8 $dev9 + +mdadm -CR $member0 -l raid1 -n 2 $container +#$dir/mdadm -CR $member0 -l raid1 -n 2 $container >/tmp/mdmon.txt 2>&1 + +check wait + +set -- $(get_raiddisks $member0) +fail0=$1 +mdadm $member0 --fail $fail0 + +sleep 1 +set -- $(get_raiddisks $member0) +case $1 in MISSING) shift;; esac +good0=$1 + +# Check that the meta data now show one disk as failed +ret=0 +for x in $@; do + mdadm -E $x >$tmp + if ! grep -q 'state\[0\] : Degraded, Consistent' $tmp; then + echo ERROR: member 0 should be degraded in meta data on $x + ret=1 + fi + phys=$(grep $x $tmp) + case $x:$phys in + $fail0:*active/Offline,\ Failed);; + $good0:*active/Online);; + *) echo ERROR: wrong phys disk state for $x + ret=1 + ;; + esac +done + +mdadm $container --remove $fail0 + +# We re-add the disk now +mdadm $container --add $fail0 + +sleep 1 +mdadm --wait $member0 || true + +set -- $(get_raiddisks $member0) +case $1:$2 in + $dev8:$dev9|$dev9:$dev8);; + *) echo ERROR: bad raid disks "$@"; ret=1;; +esac + +mdadm -Ss +for x in $@; do + mdadm -E $x >$tmp + if ! grep -q 'state\[0\] : Optimal, Consistent' $tmp; then + echo ERROR: member 0 should be optimal in meta data on $x + ret=1 + fi +done + +rm -f $tmp +if [ $ret -ne 0 ]; then + mdadm -E $dev8 + mdadm -E $dev9 +fi + +[ $ret -eq 0 ] diff --git a/tests/10ddf-fail-spare b/tests/10ddf-fail-spare new file mode 100644 index 00000000..ab737ca4 --- /dev/null +++ b/tests/10ddf-fail-spare @@ -0,0 +1,86 @@ +# Test suggested by Albert Pauw: Create, fail one disk, have mdmon +# activate the spare, +# then run create again. Shouldn't use the failed disk for Create, +. tests/env-ddf-template + +tmp=$(mktemp /tmp/mdtest-XXXXXX) +rm -f $tmp + +mdadm --zero-superblock $dev8 $dev9 $dev10 $dev11 $dev12 $dev13 +mdadm -CR $container -e ddf -l container -n 5 $dev8 $dev9 $dev10 $dev11 $dev12 + +mdadm -CR $member0 -l raid1 -n 2 $container +#$dir/mdadm -CR $member0 -l raid1 -n 2 $container >/tmp/mdmon.txt 2>&1 + +check wait + +set -- $(get_raiddisks $member0) +fail0=$1 +mdadm --fail $member0 $fail0 + +# To make sure the spare is activated, we may have to sleep +# 2s has always been enough for me +sleep 2 +check wait + +# This test can succeed both ways - if spare was activated +# before new array was created, we see only member 0. +# otherwise, we see both, adn member0 is degraded because the +# new array grabbed the spare +# which case occurs depends on the sleep time above. +ret=0 +if mdadm -CR $member1 -l raid5 -n 3 $container; then + # Creation successful - must have been quicker than spare activation + + check wait + set -- $(get_raiddisks $member1) + if [ $1 = $fail0 -o $2 = $fail0 -o $3 = $fail0 ]; then + echo ERROR: $member1 must not contain $fail0: $@ + ret=1 + fi + d1=$1 + mdadm -E $d1 >$tmp + if ! grep -q 'state\[1\] : Optimal, Consistent' $tmp; then + echo ERROR: member 1 should be optimal in meta data + ret=1 + fi + state0=Degraded +else + # Creation unsuccessful - spare was used for member 0 + state0=Optimal +fi + +# need to delay a little bit, sometimes the meta data aren't +# up-to-date yet +sleep 0.5 +set -- $(get_raiddisks $member0) +if [ $1 = $fail0 -o $2 = $fail0 ]; then + echo ERROR: $member0 must not contain $fail0: $@ + ret=1 +fi +d0=$1 + +[ -f $tmp ] || mdadm -E $d0 >$tmp + +if ! grep -q 'state\[0\] : '$state0', Consistent' $tmp; then + echo ERROR: member 0 should be $state0 in meta data + ret=1 +fi +if ! grep -q 'Offline, Failed' $tmp; then + echo ERROR: Failed disk expected in meta data + ret=1 +fi +if [ $ret -eq 1 ]; then + cat /proc/mdstat + mdadm -E $d0 + mdadm -E $d1 + mdadm -E $fail0 +fi + +[ -f /tmp/mdmon.txt ] && { + cat /tmp/mdmon.txt + rm -f /tmp/mdmon.txt +} + +rm -f $tmp +[ $ret -eq 0 ] diff --git a/tests/10ddf-fail-stop-readd b/tests/10ddf-fail-stop-readd new file mode 100644 index 00000000..f8ebe176 --- /dev/null +++ b/tests/10ddf-fail-stop-readd @@ -0,0 +1,66 @@ +# Simple fail / re-add test +. tests/env-ddf-template + +tmp=$(mktemp /tmp/mdtest-XXXXXX) +rm -f $tmp + +mdadm --zero-superblock $dev8 $dev9 +mdadm -CR $container -e ddf -l container -n 2 $dev8 $dev9 + +mdadm -CR $member0 -l raid1 -n 2 $container +#$dir/mdadm -CR $member0 -l raid1 -n 2 $container >/tmp/mdmon.txt 2>&1 + +# Write to the array +mke2fs -F $member0 +check wait + +set -- $(get_raiddisks $member0) +fail0=$1 +mdadm $member0 --fail $fail0 + +sleep 1 +mdadm $container --remove $fail0 + +set -- $(get_raiddisks $member0) +case $1 in MISSING) shift;; esac +good0=$1 + +mdadm -Ss + +sleep 1 +# Now simulate incremental assembly +mdadm -I $good0 +mdadm -IRs || true + +# Write to the array +mke2fs -F $member0 + +# We re-add the disk now +mdadm $container --add $fail0 + +sleep 1 +mdadm --wait $member0 || true + +ret=0 +set -- $(get_raiddisks $member0) +case $1:$2 in + $dev8:$dev9|$dev9:$dev8);; + *) echo ERROR: bad raid disks "$@"; ret=1;; +esac + +mdadm -Ss +for x in $@; do + mdadm -E $x >$tmp + if ! grep -q 'state\[0\] : Optimal, Consistent' $tmp; then + echo ERROR: member 0 should be optimal in meta data on $x + ret=1 + fi +done + +rm -f $tmp +if [ $ret -ne 0 ]; then + mdadm -E $dev8 + mdadm -E $dev9 +fi + +[ $ret -eq 0 ] diff --git a/tests/10ddf-fail-twice b/tests/10ddf-fail-twice new file mode 100644 index 00000000..6af19434 --- /dev/null +++ b/tests/10ddf-fail-twice @@ -0,0 +1,59 @@ +. tests/env-ddf-template + +num_disks=5 +mdadm -CR $container -e ddf -n $num_disks $dev8 $dev9 $dev10 $dev11 $dev12 +ddf_check container $num_disks + +mdadm -CR $member0 -n 2 -l 1 $container +mdadm -CR $member1 -n 3 -l 5 $container + +mdadm --wait $member1 $member0 || mdadm --wait $member1 $member0 || true + +set -- $(get_raiddisks $member0) +fail0=$1 +mdadm $member0 --fail $fail0 +set -- $(get_raiddisks $member1) +fail1=$1 +mdadm $member1 --fail $fail1 + +mdadm $container --add $dev13 + +mdadm --wait $member1 $member0 || mdadm --wait $member1 $member0 || true + + +devs0="$(get_raiddisks $member0)" +devs1="$(get_raiddisks $member1)" + +present=$(($(get_present $member0) + $(get_present $member1))) +[ $present -eq 4 ] || { + echo expected 4 present disks, got $present + devices for $member0: $devs0 + devices for $member1: $devs1 + exit 1 +} + +if echo "$devs0" | grep -q MISSING; then + good=1 + bad=0 +else + good=0 + bad=1 +fi + +# find a good device +eval "set -- \$devs$good" +check=$1 + +tmp=$(mktemp /tmp/mdtest-XXXXXX) +mdadm -E $check >$tmp + +{ grep -q 'state\['$bad'\] : Degraded, Consistent' $tmp && + grep -q 'state\['$good'\] : Optimal, Consistent' $tmp; } || { + echo unexpected meta data state on $check + mdadm -E $check + rm -f $tmp + exit 1 +} + +rm -f $tmp +exit 0 diff --git a/tests/10ddf-fail-two-spares b/tests/10ddf-fail-two-spares new file mode 100644 index 00000000..e00810d8 --- /dev/null +++ b/tests/10ddf-fail-two-spares @@ -0,0 +1,86 @@ +# Simulate two disks failing shorty after each other +. tests/env-ddf-template +sda=$(get_rootdev) || exit 1 +tmp=$(mktemp /tmp/mdtest-XXXXXX) + +mdadm --zero-superblock $dev8 $dev9 $dev10 $dev11 $dev12 $dev13 +mdadm -CR $container -e ddf -l container -n 6 \ + $dev8 $dev9 $dev10 $dev11 $dev12 $dev13 + +#fast_sync + +mdadm -CR $member0 -l raid6 -n 4 $dev10 $dev11 $dev12 $dev13 -z 16384 +#$dir/mdadm -CR $member0 -l raid6 -n 4 $dev10 $dev11 $dev12 $dev13 -z 16384 \ +# >/tmp/mdmon.txt 2>&1 +mdadm -CR $member1 -l raid10 -n 4 $dev10 $dev11 $dev12 $dev13 -z 16384 + +dd if=$sda of=$member0 bs=1M count=32 +dd if=$sda of=$member1 bs=1M skip=16 count=16 + +check wait + +sum0=$(sha1sum $member0) +sum1=$(sha1sum $member1) + +mdadm --fail $member1 $dev11 +sleep 1 +mdadm --fail $member1 $dev12 + +# We will have 4 resync procedures, 2 spares for 2 arrays. +mdadm --wait $member1 $member0 || true +mdadm --wait $member1 $member0 || true + +devs0="$(get_raiddisks $member0)" +devs1="$(get_raiddisks $member1)" +expected="$dev10 +$dev13 +$dev8 +$dev9" + +ret=0 +if [ "$(echo "$devs0" | sort)" != "$expected" \ + -o "$(echo "$devs1" | sort)" != "$expected" ]; then + echo ERROR: unexpected members + echo $member0: $devs0 + echo $member1: $devs1 + ret=1 +fi + +mdadm -E $dev10 >$tmp +if ! grep -q 'state\[0\] : Optimal, Consistent' $tmp; then + echo ERROR: $member0 should be optimal in meta data + ret=1 +fi +if ! grep -q 'state\[1\] : Optimal, Consistent' $tmp; then + echo ERROR: $member1 should be optimal in meta data + ret=1 +fi +if [ x"$(grep -c active/Online $tmp)" != x4 ]; then + echo ERROR: expected 4 online disks + ret=1 +fi +if [ x"$(grep -c "Offline, Failed" $tmp)" != x2 ]; then + echo ERROR: expected 2 failed disks + ret=1 +fi + +sum0a=$(sha1sum $member0) +sum1a=$(sha1sum $member1) + +if [ "$sum0" != "$sum0a" -o "$sum1" != "$sum1a" ]; then + echo ERROR: checksum mismatch + ret=1 +fi + +if [ $ret -eq 1 ]; then + cat /proc/mdstat + cat $tmp +fi + +[ -f /tmp/mdmon.txt ] && { + cat /tmp/mdmon.txt + rm -f /tmp/mdmon.txt +} +rm -f $tmp + +[ $ret -eq 0 ] diff --git a/tests/10ddf-geometry b/tests/10ddf-geometry new file mode 100644 index 00000000..b0cce2f6 --- /dev/null +++ b/tests/10ddf-geometry @@ -0,0 +1,82 @@ +# +# Test various RAID geometries, creation and deletion of subarrays +# + +assert_fail() { + if mdadm "$@"; then + echo mdadm "$@" must fail + return 1 + else + return 0 + fi +} + +assert_kill() { + local dev=$1 n=$2 + mdadm -S $dev + mdadm --kill-subarray=$n /dev/md/ddf0 + if mdadm -Dbs | grep -q $dev; then + echo >&2 $dev should be deleted + return 1 + fi + return 0 +} + +set -e +mdadm -CR /dev/md/ddf0 -e ddf -n 6 $dev8 $dev9 $dev10 $dev11 $dev12 $dev13 + +# RAID1 geometries +# Use different sizes to make offset calculation harder +mdadm -CR l1s -l1 -n2 /dev/md/ddf0 -z 8000 +mdadm -CR l1m -l1 -n3 $dev8 $dev9 $dev10 -z 10000 +assert_fail -CR badl1 -l1 -n4 /dev/md/ddf0 + +# RAID10 geometries +mdadm -CR l10_0 -l10 -n3 /dev/md/ddf0 -z 1000 +mdadm -CR l10_1 -l10 -n5 /dev/md/ddf0 -z 1000 +assert_fail mdadm -CR badl10 -l10 -n4 -pn3 /dev/md/ddf0 +mdadm -CR l10_2 -l10 -n6 -pn2 /dev/md/ddf0 -z 4000 +mdadm -CR l10_3 -l10 -n6 -pn3 /dev/md/ddf0 -z 4000 + +assert_fail -CR l10_2 -l10 -n6 -pn2 /dev/md/ddf0 -z 5000 +assert_kill /dev/md/l10_2 4 +# gone now, must be able to create it again +mdadm -CR l10_2 -l10 -n6 -pn2 /dev/md/ddf0 -z 5000 + +# Now stop and reassemble +mdadm -Ss +mdadm -A /dev/md/ddf0 $dev8 $dev9 $dev10 $dev11 $dev12 $dev13 + +# Same as above, on inactive container +assert_fail -CR l10_3 -l10 -n6 -pn2 /dev/md/ddf0 -z 5000 +# Kill subarray without having started anything (no mdmon) +mdadm --kill-subarray=5 /dev/md/ddf0 +mdadm -I /dev/md/ddf0 +mdadm -CR l10_3 -l10 -n6 -pn3 /dev/md/ddf0 -z 5000 + +assert_kill /dev/md/l10_2 4 +assert_kill /dev/md/l10_3 5 + +# RAID5 geometries +mdadm -CR l5la -l5 -n3 --layout=ddf-N-restart /dev/md/ddf0 -z 5000 +mdadm -CR l5ra -l5 -n3 --layout=ddf-zero-restart /dev/md/ddf0 -z 5000 +mdadm -CR l5ls -l5 -n3 --layout=ddf-N-continue /dev/md/ddf0 -z 5000 +assert_fail -CR l5rs -l5 -n3 -prs /dev/md/ddf0 -z 5000 + +# Stop and reassemble +mdadm -Ss +mdadm -A /dev/md/ddf0 $dev8 $dev9 $dev10 $dev11 $dev12 $dev13 +mdadm -I /dev/md/ddf0 + +assert_kill /dev/md/l5la 4 +assert_kill /dev/md/l5ls 6 +assert_kill /dev/md/l5ra 5 + +# RAID6 geometries +assert_fail -CR l6la -l6 -n3 -pla /dev/md/ddf0 -z 5000 +assert_fail -CR l6rs -l5 -n4 -prs /dev/md/ddf0 -z 5000 +mdadm -CR l6la -l6 -n4 --layout=ddf-N-restart /dev/md/ddf0 -z 5000 +mdadm -CR l6ra -l6 -n4 --layout=ddf-zero-restart $dev8 $dev9 $dev10 $dev11 -z 5000 +mdadm -CR l6ls -l6 -n4 --layout=ddf-N-continue $dev13 $dev8 $dev9 $dev12 -z 5000 + +mdadm -Ss diff --git a/tests/10ddf-incremental-wrong-order b/tests/10ddf-incremental-wrong-order new file mode 100644 index 00000000..9ecf6bc2 --- /dev/null +++ b/tests/10ddf-incremental-wrong-order @@ -0,0 +1,131 @@ +# An array is assembled incompletely. Some disks will +# have later metadata than others. +# The array is then reassembled in the "wrong" order - +# older meta data first. +# This FAILS with mdadm 3.3 +. tests/env-ddf-template +tmp=$(mktemp /tmp/mdtest-XXXXXX) +rm -f $tmp /var/tmp/mdmon.log +ret=0 + +mdadm -CR $container -e ddf -n 4 $dev8 $dev9 $dev10 $dev11 +ddf_check container 4 + +mdadm -CR $member1 -n 4 -l 10 $dev8 $dev10 $dev9 $dev11 -z 10000 +mdadm -CR $member0 -n 2 -l 1 $dev8 $dev9 -z 10000 + +mdadm --wait $member0 || true +mdadm --wait $member1 || true + +mke2fs -F $member0 +mke2fs -F $member1 +sha_0a=$(sha1_sum $member0) +sha_1a=$(sha1_sum $member1) + +mdadm -Ss +sleep 1 + +# Add all devices except those for $member0 +mdadm -I $dev10 +mdadm -I $dev11 + +# Start runnable members ($member1) and write +mdadm -IRs || true +e2fsck -fy $member1 +sha_1b=$(sha1_sum $member1) + +mdadm -Ss +sleep 1 + +# Seq number should be different now +seq8a=$(mdadm -E $dev8 | sed -n 's/^ *Seq : //p') +seq10a=$(mdadm -E $dev10 | sed -n 's/^ *Seq : //p') + +if [ $seq8a -ge $seq10a ]; then + ret=1 + echo ERROR: sequential number of $dev10 not bigger than $dev8 +fi +if [ x$sha_1a = x$sha_1b ]; then + ret=1 + echo ERROR: sha1sums equal after write +fi + +#[ -f /var/tmp/mdmon.log ] && cat /var/tmp/mdmon.log + +# Now reassemble +# Note that we add the previously missing disks first. +# $dev10 should have a higher seq number than $dev8 +for d in $dev8 $dev9 $dev10 $dev11; do + mdadm -I $d +done + +mdadm -IRs || true +sha_0c=$(sha1_sum $member0) +sha_1c=$(sha1_sum $member1) + +mdadm -Ss +sleep 1 + +seq8c=$(mdadm -E $dev8 | sed -n 's/^ *Seq : //p') +seq10c=$(mdadm -E $dev10 | sed -n 's/^ *Seq : //p') + +if [ x$sha_0a != x$sha_0c ]; then + ret=1 + echo ERROR: sha1sum of $member0 has changed +fi +if [ x$sha_1b != x$sha_1c ]; then + ret=1 + echo ERROR: sha1sum of $member1 has changed +fi +if [ \( $seq10a -ge $seq10c \) -o \( $seq8c -ne $seq10c \) ]; then + ret=1 + echo ERROR: sequential numbers are wrong +fi + +# Expect consistent state +for d in $dev10 $dev8; do + mdadm -E $d>$tmp + for x in 0 1; do + egrep 'state\['$x'\] : Optimal, Consistent' $tmp || { + ret=1 + echo ERROR: $member0 has unexpected state on $d + } + done + if [ x$(egrep -c 'active/Online$' $tmp) != x4 ]; then + ret=1 + echo ERROR: unexpected number of online disks on $d + fi +done + +# Now try assembly +if mdadm -A $container $dev8 $dev9 $dev10 $dev11; then + mdadm -IR $container + sha_0d=$(sha1_sum $member0) + sha_1d=$(sha1_sum $member1) + mdadm -Ss + sleep 1 + seq8d=$(mdadm -E $dev8 | sed -n 's/^ *Seq : //p') + seq10d=$(mdadm -E $dev10 | sed -n 's/^ *Seq : //p') + if [ x$sha_0a != x$sha_0d ]; then + ret=1 + echo ERROR: sha1sum of $member0 has changed + fi + if [ x$sha_1b != x$sha_1d ]; then + ret=1 + echo ERROR: sha1sum of $member1 has changed + fi + if [ \( $seq10a -ge $seq10d \) -o \( $seq8d -ne $seq10d \) ]; then + ret=1 + echo ERROR: sequential numbers are wrong + fi +else + ret=1 + echo ERROR: assembly failed +fi + +if [ $ret -ne 0 ]; then + mdadm -E $dev10 + mdadm -E $dev8 +fi +rm -f $tmp /var/tmp/mdmon.log +[ $ret -eq 0 ] diff --git a/tests/10ddf-sudden-degraded b/tests/10ddf-sudden-degraded new file mode 100644 index 00000000..dc692aea --- /dev/null +++ b/tests/10ddf-sudden-degraded @@ -0,0 +1,18 @@ +# +# An array is assembled with one device missing. +# The other device must be marked as Failed in metadata + +. tests/env-ddf-template + +mdadm -CR $container -e ddf -n 2 $dev8 $dev9 +ddf_check container 2 + +mdadm -CR $member1 -n 2 -l1 $dev8 $dev9 +mdadm --wait $member1 || true +mdadm -Ss + +mdadm -I $dev8 +mdadm -R $container +mkfs $member1 +# There must be a missing device recorded +mdadm --examine $dev8 | grep 'Raid Devices.*--' || exit 1 diff --git a/tests/11spare-migration b/tests/11spare-migration new file mode 100644 index 00000000..24b6ec69 --- /dev/null +++ b/tests/11spare-migration @@ -0,0 +1,454 @@ +# Set of tests for autorebuild functionality using mdadm -F +# To be able to test ddf one must have all loop devices of bigger size, with the ones +# above number 7 bigger again by any amount (this is not changed for now as it +# could affect other tests) + +export IMSM_DEVNAME_AS_SERIAL=1 +export IMSM_TEST_OROM=1 +export IMSM_NO_PLATFORM=1 + +. tests/utils +set -ex +verbose="yes" +sleeptime=10 + +# if listfailed=yes then don't exit if test failed due to wrong +# spare-migration and just print a list at the end. Other errors still +# stop the test. +# if listfailed=no then exit on first failure +listfailed="yes" + +# start Monitor, set monitorpid +# uses global scan variable +# all parameters are numbers of devices to be monitored. only used when $scan="no" +# eg. monitor 0 1 will start monitoring of containers c0, c1 and subarrays v0, v1 +monitor(){ + [ -z $monitorpid ] || return + if [ "$scan" == "yes" ]; then + $mdadm -F -d 1 --scan --mail root@localhost -c $config & + monitorpid=$! + return + fi + unset mddevs + while [ -n "$1" ] + do + eval container=\$c$1 + eval volumes=\$v$1 + mddevs="$mddevs /dev/$container" + if [ "$container" != "$volumes" ]; then + for vol in $volumes; do + mddevs="$mddevs /dev/$vol" + done + fi + shift + done + if [ -n "$mddevs" ]; then + if [ "$verbose" != "yes" ]; then + $mdadm -F -d 1 $mddevs -c $config >&2 & + monitorpid=$! + else + $mdadm -F -t -d 1 $mddevs -c $config & + monitorpid=$! + fi + fi + [ "$verbose" != "yes" ] || echo $mddevs $monitorpid +} + +test0() +{ +dsc "Test 0: No config file, no spare should be moved" +> $config +setupdevs 0 0 1 $platform +setupdevs 1 3 4 $platform +monitor 0 1 +mdadm -a /dev/$c1 $dev2 +mdadm --fail /dev/$v0 $dev0 +# check that spare loop2 was not moved from container c1 to container c0 +chksparemoved $c1 $c0 $dev2 n +tidyup +} + +test0a() +{ +dsc "Test 0a: No domains in config file, no spare should be moved" +setupdevs 0 0 1 $platform +setupdevs 1 3 4 $platform +createconfig a +monitor 0 1 +mdadm -a /dev/$c1 $dev2 +mdadm --fail /dev/$v0 $dev0 +# check that spare loop2 was not moved from container c1 to container c0 +chksparemoved $c1 $c0 $dev2 n +tidyup +} + +test1() +{ +dsc "Test 1: Common domain, add disk to one container and fail first one in another container, spare should be moved" +setupdevs 0 0 1 $platform +setupdevs 1 3 4 $platform +# create config file with arrays and common domain +createconfig a +createconfig domain-$platform $platform spare 0 1 2 3 4 +monitor 0 1 +mdadm -a /dev/$c1 $dev2 +mdadm --fail /dev/$v0 $dev0 +# check that spare loop2 was moved from container c1 to container c0 +chksparemoved $c1 $c0 $dev2 +tidyup +} + +test1a() +{ +dsc "Test 1a: Common domain, add disk to one container and fail second one in another container, spare should be moved" +setupdevs 0 0 1 $platform +setupdevs 1 3 4 $platform +createconfig a +createconfig domain-$platform $platform spare 0 1 2 3 4 +monitor 0 1 +mdadm -a /dev/$c1 $dev2 +mdadm --fail /dev/$v0 $dev1 +# check that spare loop2 was moved from container c1 to container c0 +chksparemoved $c1 $c0 $dev2 +tidyup +} + +test2() +{ +dsc "Test 2: Common domain, fail disk in one container and add one to another container, spare should be moved" +setupdevs 0 0 1 $platform +setupdevs 1 3 4 $platform +createconfig a +createconfig domain-$platform $platform spare 0 1 2 3 4 +monitor 0 1 +mdadm --fail /dev/$v0 $dev1 +mdadm -a /dev/$c1 $dev2 +chksparemoved $c1 $c0 $dev2 +tidyup +} + +test3() +{ +dsc "Test 3: Two domains, fail a disk in one domain, add a disk to another domain, the spare should not be moved" +setupdevs 0 0 1 $platform +setupdevs 1 3 4 $platform +# create config file with 2 domains +createconfig a +createconfig domain-$platform"1" $platform spare 0 1 2 +createconfig domain-$platform"2" $platform spare 3 4 5 +monitor 0 1 +mdadm --fail /dev/$v0 $dev1 +mdadm -a /dev/$c1 $dev5 +chksparemoved $c1 $c0 $dev5 n +tidyup +} + +test4() +{ +dsc "Test 4: One domain holds one container, fail a disk in domain, and add disk to a container not described by domain, move if metadata allows" +setupdevs 0 0 1 $platform +setupdevs 1 3 4 $platform +createconfig a +createconfig domain-$platform $platform spare 0 1 2 +monitor 0 1 +mdadm --fail /dev/$v0 $dev1 +mdadm -a /dev/$c1 $dev5 +unset shouldmove +[ "$platform" == "imsm" ] || shouldmove="n" +chksparemoved $c1 $c0 $dev5 $shouldmove +tidyup +} + +test5() +{ +dsc "Test 5: Two domains, two containers in each domain" +setupdevs 0 0 1 $platform +setupdevs 1 3 4 $platform +setupdevs 2 5 6 $platform +setupdevs 3 8 10 $platform +# 2 and 9 for spares +createconfig a +createconfig domain-$platform"1" $platform spare 0 1 2 3 4 +createconfig domain-$platform"2" $platform spare 5 6 8 9 10 +monitor 0 1 2 3 +test5a +test5b +test5c +tidyup +} + +test5a() +{ +dsc "Test 5a: Two containers in each domain, add spare loop2 to domain1 and fail disk in the other domain, the spare should not be moved" +mdadm -a /dev/$c0 $dev2 +mdadm --fail /dev/$v2 $dev5 +chksparemoved $c0 $c2 $dev2 n +} + +test5b() +{ +dsc "Test 5b: Fail disk in the same domain but different container, spare loop2 should be moved" +mdadm --fail /dev/$v1 $dev3 +chksparemoved $c0 $c1 $dev2 +} + +test5c() +{ +dsc "Test 5c: Add spare loop9 to different container in domain with degraded array, spare should be moved" +mdadm -a /dev/$c3 $dev9 +chksparemoved $c3 $c2 $dev9 +} + +test6() +{ +dsc "Test 6: One domain has two containers, fail a disk in one container, there is a spare in other container too small to use for rebuild" +setupdevs 0 0 1 $platform +setupdevs 1 8 9 $platform +# all devices in one domain +createconfig a +createconfig domain-$platform $platform spare 0 1 2 8 9 +monitor 0 1 +mdadm -a /dev/$c0 $dev2 +mdadm --fail /dev/$v1 $dev8 +chksparemoved $c0 $c1 $dev2 n +tidyup +} + +test7() +{ +dsc "Test 7: One domain, add small spare to container, fail disk in array, spare not used, add suitable spare to other container, spare should be moved" +setupdevs 0 0 1 $platform +setupdevs 1 8 9 $platform +createconfig a +createconfig domain-$platform $platform spare 0 1 2 8 9 10 +monitor 0 1 +mdadm -a /dev/$c1 $dev2 +mdadm --fail /dev/$v1 $dev8 +mdadm -a /dev/$c0 $dev10 +chksparemoved $c0 $c1 $dev10 +tidyup +} + + +test7a() +{ +dsc "Test 7a: Small spare in parent, suitable one in other container, $dev2 in $c1 is not in common domain" +setupdevs 0 0 1 $platform +setupdevs 1 8 9 $platform +#all $platform devices in one domain +createconfig a +createconfig domain-$platform"1" $platform spare 0 1 8 9 10 +createconfig domain-$platform"2" $platform spare 2 +monitor 0 1 +mdadm -a /dev/$c1 $dev2 +chkspare $c1 $dev2 +mdadm --fail /dev/$v1 $dev8 +mdadm -a /dev/$c0 $dev10 +chksparemoved $c0 $c1 $dev10 +tidyup +} + +test8() +{ +# ddf does not have getinfo_super_disks implemented so skip this test +return +dsc "Test 8: imsm and ddf - spare should not be migrated" +setupdevs 0 10 11 imsm +setupdevs 1 8 9 ddf +createconfig a +createconfig domain0 noplatform spare 8 9 10 11 12 +monitor 0 1 +mdadm -a /dev/$c1 $dev12 +mdadm --fail /dev/$v0 $dev10 +chksparemoved $c1 $c0 $dev12 n +tidyup +} + +test9() +{ +dsc "Test 9: imsm and native 1.2 - one domain, no metadata specified, spare should be moved" +setupdevs 0 10 11 imsm +setupdevs 1 8 9 1.2 +createconfig a +createconfig domain0 noplatform spare 8 9 10 11 12 +monitor 0 1 +mdadm -a /dev/$c1 $dev12 +mdadm --fail /dev/$v0 $dev10 +chksparemoved $c1 $c0 $dev12 +tidyup +} + +test9a() +{ +dsc "Test 9a: imsm and native 1.2 - spare in global domain, should be moved" +setupdevs 0 10 11 imsm +setupdevs 1 8 9 1.2 +createconfig a +createconfig domain-global noplatform spare 8 9 10 11 12 +createconfig domain-1.2 1.2 spare 8 9 +createconfig domain-imsm imsm spare 10 11 +monitor 0 1 +mdadm -a /dev/$c1 $dev12 +mdadm --fail /dev/$v0 $dev10 +chksparemoved $c1 $c0 $dev12 +tidyup +} + +test10() +{ +dsc "Test 10: Two arrays on the same devices in container" +setupdevs 0 0 1 $platform 10000 +setupdevs 1 3 4 $platform +createconfig a +createconfig domain-$platform $platform spare 0 1 2 3 4 5 +monitor 0 1 +mdadm -a /dev/$c1 $dev2 +mdadm --fail /dev/md/sub0_ $dev0 +chksparemoved $c1 $c0 $dev2 +if [ $failed -eq 0 ]; then +# now fail the spare and see if we get another one + mdadm --fail /dev/md/sub0_ $dev2 + mdadm -a /dev/$c1 $dev5 + chksparemoved $c1 $c0 $dev5 +fi +tidyup +} + +test11() +{ +dsc "Test 11: Failed spare from other container should not be used" +setupdevs 0 0 1 $platform +setupdevs 1 3 4 $platform +createconfig a +createconfig domain-$platform $platform spare 0 1 2 3 4 +monitor 0 1 +mdadm -a /dev/$c1 $dev2 +mdadm --fail /dev/$v1 $dev3 +#wait until recovery finishes so no degraded array in c1 +check wait +mdadm --fail /dev/$v0 $dev0 +chksparemoved $c1 $c0 $dev3 n +tidyup +} + +test12() +{ +dsc "Test 12: Only one spare should be taken for rebuild, second not needed" +setupdevs 0 0 1 $platform +setupdevs 1 3 4 $platform +createconfig a +createconfig domain-$platform $platform spare 0 1 2 3 4 5 +monitor 0 1 +mdadm -a /dev/$c1 $dev2 +mdadm -a /dev/$c1 $dev5 +mdadm --fail /dev/$v0 $dev0 +sleep $sleeptime +chkarray $dev2 n +sc1=$c +chkarray $dev5 n +sc2=$c +[ "$sc1" != "$sc2" ] || err "both spares in the same container $sc1" +tidyup +} + +test13() +{ +dsc "Test 13: Common domain, two containers, fail a disk in container, action is below spare, the spare should be moved regadless of action" +setupdevs 0 0 1 $platform +setupdevs 1 4 5 $platform +# same domain but different action on 4 5 6 +createconfig a +createconfig domain-$platform $platform spare 0 1 +createconfig domain-$platform $platform include 4 5 6 +monitor 0 1 +mdadm -a /dev/$c1 $dev6 +mdadm --fail /dev/$v0 $dev0 +chksparemoved $c1 $c0 $d6 +tidyup +} + +test14() +{ +dsc "Test 14: One domain, small array on big disks, check if small spare is accepted" +setupdevs 0 8 9 $platform 10000 1 +setupdevs 1 0 1 $platform +createconfig a +createconfig domain-$platform $platform spare 0 1 2 8 9 +monitor 0 1 +mdadm -a /dev/$c1 $dev2 +mdadm --fail /dev/$v0 $dev9 +chksparemoved $c1 $c0 $d2 +tidyup +} + +test15() +{ +dsc "Test 15: spare in global domain for $platform metadata, should be moved" +# this is like 9a but only one metadata used +setupdevs 0 10 11 $platform +setupdevs 1 8 9 $platform +createconfig a +createconfig domain-global $platform spare 8 9 10 11 12 +createconfig domain-1 $platform spare 8 9 +createconfig domain-2 $platform spare 10 11 +monitor 0 1 +mdadm -a /dev/$c1 $dev12 +mdadm --fail /dev/$v0 $dev10 +chksparemoved $c1 $c0 $dev12 +tidyup +} + +try() +{ +test0 +test0a +test1 +test1a +test2 +test3 +test4 +test5 +test6 +if [ "$platform" != "1.2" ]; then +# this is because we can't have a small spare added to native array + test7 + test7a +fi +test8 +test9 +test9a +if [ "$platform" != "1.2" ]; then +# we can't create two subarrays on the same devices for native (without +# partitions) + test10 +fi +test11 +test12 +test13 +test14 +test15 +} + +try_failed() +{ +platform="1.2" +scan="no" +test5 +test9 +test13 +scan="yes" +test9 +} + +#try_failed + +for scan in no yes; do + for platform in 1.2 imsm; do + try + done +done + +[ $listfailed == "no" ] || [ -z $flist ] || echo -e "\n FAILED TESTS: $flist" + +#cat $targetdir/log +rm -f /dev/disk/by-path/loop* diff --git a/tests/12imsm-r0_2d-grow-r0_3d b/tests/12imsm-r0_2d-grow-r0_3d new file mode 100644 index 00000000..3c6cf743 --- /dev/null +++ b/tests/12imsm-r0_2d-grow-r0_3d @@ -0,0 +1,20 @@ +. tests/env-imsm-template + +# RAID 0 volume, 2 disks grow to RAID 0 volume, 3 disks +# POSITIVE test + +num_disks=2 +device_list="$dev0 $dev1" +spare_list="$dev2" + +# Before: RAID 0 volume, 2 disks, 64k chunk size +vol0_level=0 +vol0_comp_size=$((5 * 1024)) +vol0_chunk=64 +vol0_num_comps=$num_disks +vol0_offset=0 + +# After: RAID 0 volume, 3 disks, 64k chunk size +vol0_new_num_comps=$((num_disks + 1)) + +. tests/imsm-grow-template 0 0 diff --git a/tests/12imsm-r0_2d-grow-r0_4d b/tests/12imsm-r0_2d-grow-r0_4d new file mode 100644 index 00000000..e4fccda5 --- /dev/null +++ b/tests/12imsm-r0_2d-grow-r0_4d @@ -0,0 +1,20 @@ +. tests/env-imsm-template + +# RAID 0 volume, 2 disks grow to RAID 0 volume, 4 disks +# POSITIVE test + +num_disks=2 +device_list="$dev0 $dev1" +spare_list="$dev2 $dev3" + +# Before: RAID 0 volume, 2 disks, 64k chunk size +vol0_level=0 +vol0_comp_size=$((5 * 1024)) +vol0_chunk=64 +vol0_num_comps=$num_disks +vol0_offset=0 + +# After: RAID 0 volume, 4 disks, 64k chunk size +vol0_new_num_comps=$((num_disks + 2)) + +. tests/imsm-grow-template 0 0 diff --git a/tests/12imsm-r0_2d-grow-r0_5d b/tests/12imsm-r0_2d-grow-r0_5d new file mode 100644 index 00000000..388a5bbd --- /dev/null +++ b/tests/12imsm-r0_2d-grow-r0_5d @@ -0,0 +1,20 @@ +. tests/env-imsm-template + +# RAID 0 volume, 2 disks grow to RAID 0 volume, 5 disks +# POSITIVE test + +num_disks=2 +device_list="$dev0 $dev1" +spare_list="$dev2 $dev3 $dev4" + +# Before: RAID 0 volume, 2 disks, 64k chunk size +vol0_level=0 +vol0_comp_size=$((5 * 1024)) +vol0_chunk=64 +vol0_num_comps=$num_disks +vol0_offset=0 + +# After: RAID 0 volume, 5 disks, 64k chunk size +vol0_new_num_comps=$((num_disks + 3)) + +. tests/imsm-grow-template 0 0 diff --git a/tests/12imsm-r0_3d-grow-r0_4d b/tests/12imsm-r0_3d-grow-r0_4d new file mode 100644 index 00000000..7065f07b --- /dev/null +++ b/tests/12imsm-r0_3d-grow-r0_4d @@ -0,0 +1,20 @@ +. tests/env-imsm-template + +# RAID 0 volume, 3 disks grow to RAID 0 volume, 4 disks +# POSITIVE test + +num_disks=3 +device_list="$dev0 $dev1 $dev2" +spare_list="$dev3" + +# Before: RAID 0 volume, 3 disks, 64k chunk size +vol0_level=0 +vol0_comp_size=$((5 * 1024)) +vol0_chunk=64 +vol0_num_comps=$num_disks +vol0_offset=0 + +# After: RAID 0 volume, 4 disks, 64k chunk size +vol0_new_num_comps=$((num_disks + 1)) + +. tests/imsm-grow-template 0 0 diff --git a/tests/12imsm-r5_3d-grow-r5_4d b/tests/12imsm-r5_3d-grow-r5_4d new file mode 100644 index 00000000..097da0a7 --- /dev/null +++ b/tests/12imsm-r5_3d-grow-r5_4d @@ -0,0 +1,20 @@ +. tests/env-imsm-template + +# RAID 5 volume, 3 disks grow to RAID 5 volume, 4 disks +# POSITIVE test + +num_disks=3 +device_list="$dev0 $dev1 $dev2" +spare_list="$dev3" + +# Before: RAID 5 volume, 3 disks, 64k chunk size +vol0_level=5 +vol0_comp_size=$((5 * 1024)) +vol0_chunk=64 +vol0_num_comps=$((num_disks - 1)) +vol0_offset=0 + +# After: RAID 5 volume, 4 disks, 64k chunk size +vol0_new_num_comps=$num_disks + +. tests/imsm-grow-template 0 0 diff --git a/tests/12imsm-r5_3d-grow-r5_5d b/tests/12imsm-r5_3d-grow-r5_5d new file mode 100644 index 00000000..2e5c7d25 --- /dev/null +++ b/tests/12imsm-r5_3d-grow-r5_5d @@ -0,0 +1,20 @@ +. tests/env-imsm-template + +# RAID 5 volume, 3 disks grow to RAID 5 volume, 5 disks +# POSITIVE test + +num_disks=3 +device_list="$dev0 $dev1 $dev2" +spare_list="$dev3 $dev4" + +# Before: RAID 5 volume, 3 disks, 64k chunk size +vol0_level=5 +vol0_comp_size=$((5 * 1024)) +vol0_chunk=64 +vol0_num_comps=$((num_disks - 1)) +vol0_offset=0 + +# After: RAID 5 volume, 5 disks, 64k chunk size +vol0_new_num_comps=$((num_disks + 1)) + +. tests/imsm-grow-template 0 0 diff --git a/tests/13imsm-r0_r0_2d-grow-r0_r0_4d b/tests/13imsm-r0_r0_2d-grow-r0_r0_4d new file mode 100644 index 00000000..f85efa5d --- /dev/null +++ b/tests/13imsm-r0_r0_2d-grow-r0_r0_4d @@ -0,0 +1,29 @@ +. tests/env-imsm-template + +# Grow the container (arrays inside) from 2 disks to 4 disks +# POSITIVE test + +num_disks=2 +device_list="$dev0 $dev1" +spare_list="$dev2 $dev3" + +# Before: RAID 0 volume in slot #0, 2 disks, 128k chunk size +# RAID 0 volume in slot #1, 2 disks, 64k chunk size +vol0_level=0 +vol0_comp_size=$((5 * 1024)) +vol0_chunk=128 +vol0_num_comps=$num_disks +vol0_offset=0 + +vol1_level=0 +vol1_comp_size=$((5 * 1024)) +vol1_chunk=64 +vol1_num_comps=$num_disks +vol1_offset=$((vol0_comp_size + 2048)) + +# After: RAID 0 volume in slot #0, 4 disks, 128k chunk size +# RAID 0 volume in slot #1, 4 disks, 64k chunk size +vol0_new_num_comps=$((num_disks + 2)) +vol1_new_num_comps=$vol0_new_num_comps + +. tests/imsm-grow-template 0 0 diff --git a/tests/13imsm-r0_r0_2d-grow-r0_r0_5d b/tests/13imsm-r0_r0_2d-grow-r0_r0_5d new file mode 100644 index 00000000..1b851a9b --- /dev/null +++ b/tests/13imsm-r0_r0_2d-grow-r0_r0_5d @@ -0,0 +1,29 @@ +. tests/env-imsm-template + +# Grow both members from 2 disks to 5 disks +# POSITIVE test + +num_disks=2 +device_list="$dev0 $dev1" +spare_list="$dev2 $dev3 $dev4" + +# Before: RAID 0 volume in slot #0, 2 disks, 64k chunk size +# RAID 0 volume in slot #1, 2 disks, 256k chunk size +vol0_level=0 +vol0_comp_size=$((4 * 1024)) +vol0_chunk=64 +vol0_num_comps=$num_disks +vol0_offset=0 + +vol1_level=0 +vol1_comp_size=$((6 * 1024)) +vol1_chunk=256 +vol1_num_comps=$num_disks +vol1_offset=$((vol0_comp_size + 2048)) + +# After: RAID 0 volume in slot #0, 5 disks, 64k chunk size +# RAID 0 volume in slot #1, 5 disks, 256k chunk size +vol0_new_num_comps=$((num_disks + 3)) +vol1_new_num_comps=$vol0_new_num_comps + +. tests/imsm-grow-template 0 0 diff --git a/tests/13imsm-r0_r0_3d-grow-r0_r0_4d b/tests/13imsm-r0_r0_3d-grow-r0_r0_4d new file mode 100644 index 00000000..27ba83b3 --- /dev/null +++ b/tests/13imsm-r0_r0_3d-grow-r0_r0_4d @@ -0,0 +1,29 @@ +. tests/env-imsm-template + +# Grow a container (arrays inside) from 3 disks to 4 disks +# POSITIVE test + +num_disks=3 +device_list="$dev0 $dev1 $dev2" +spare_list="$dev3" + +# Before: RAID 0 volume in slot #0, 3 disks, 128k chunk size +# RAID 0 volume in slot #1, 3 disks, 512k chunk size +vol0_level=0 +vol0_comp_size=$((5 * 1024)) +vol0_chunk=128 +vol0_num_comps=$num_disks +vol0_offset=0 + +vol1_level=0 +vol1_comp_size=$((5 * 1024)) +vol1_chunk=128 +vol1_num_comps=$num_disks +vol1_offset=$((vol0_comp_size + 2048)) + +# After: RAID0 volume in slot #0, 4 disks, 128k chunk size +# RAID0 volume in slot #1, 4 disks, 512k chunk size +vol0_new_num_comps=$((num_disks + 1)) +vol1_new_num_comps=$vol0_new_num_comps + +. tests/imsm-grow-template 0 0 diff --git a/tests/13imsm-r0_r5_3d-grow-r0_r5_4d b/tests/13imsm-r0_r5_3d-grow-r0_r5_4d new file mode 100644 index 00000000..b4bde449 --- /dev/null +++ b/tests/13imsm-r0_r5_3d-grow-r0_r5_4d @@ -0,0 +1,29 @@ +. tests/env-imsm-template + +# Grow the container (arrays inside) from 3 disks to 4 disks +# POSITIVE test + +num_disks=3 +device_list="$dev0 $dev1 $dev2" +spare_list="$dev3" + +# Before: RAID 0 volume in slot #0, 3 disks, 64k chunk size +# RAID 5 volume in slot #1, 3 disks, 128k chunk size +vol0_level=0 +vol0_comp_size=$((5 * 1024)) +vol0_chunk=64 +vol0_num_comps=$num_disks +vol0_offset=0 + +vol1_level=5 +vol1_comp_size=$((5 * 1024)) +vol1_chunk=128 +vol1_num_comps=$((num_disks - 1)) +vol1_offset=$((vol0_comp_size + 2048)) + +# After: RAID 0 volume in slot #0, 4 disks, 64k chunk size +# RAID 5 volume in slot #1, 4 disks, 128k chunk size +vol1_new_num_comps=$num_disks +vol0_new_num_comps=$((num_disks + 1)) + +. tests/imsm-grow-template 0 0 diff --git a/tests/13imsm-r0_r5_3d-grow-r0_r5_5d b/tests/13imsm-r0_r5_3d-grow-r0_r5_5d new file mode 100644 index 00000000..d0db9aeb --- /dev/null +++ b/tests/13imsm-r0_r5_3d-grow-r0_r5_5d @@ -0,0 +1,29 @@ +. tests/env-imsm-template + +# Grow the container (arrays inside) from 3 disks to 5 disks +# POSITIVE test + +num_disks=3 +device_list="$dev0 $dev1 $dev2" +spare_list="$dev3 $dev4" + +# Before: RAID 0 volume in slot #0, 3 disks, 256k chunk size +# RAID 5 volume in slot #1, 3 disks, 512k chunk size +vol0_level=0 +vol0_comp_size=$((5 * 1024)) +vol0_chunk=128 +vol0_num_comps=$num_disks +vol0_offset=0 + +vol1_level=5 +vol1_comp_size=$((5 * 1024)) +vol1_chunk=128 +vol1_num_comps=$((num_disks - 1)) +vol1_offset=$((vol0_comp_size + 2048)) + +# After: RAID 0 volume in slot #0, 5 disks, 256k chunk size +# RAID 5 volume in slot #1, 5 disks, 512k chunk size +vol0_new_num_comps=$((num_disks + 2)) +vol1_new_num_comps=$((num_disks + 1)) + +. tests/imsm-grow-template 0 0 diff --git a/tests/13imsm-r5_r0_3d-grow-r5_r0_4d b/tests/13imsm-r5_r0_3d-grow-r5_r0_4d new file mode 100644 index 00000000..32ebc924 --- /dev/null +++ b/tests/13imsm-r5_r0_3d-grow-r5_r0_4d @@ -0,0 +1,29 @@ +. tests/env-imsm-template + +# Grow the container (arrays inside) from 3 disks to 4 disks +# POSITIVE test + +num_disks=3 +device_list="$dev0 $dev1 $dev2" +spare_list="$dev3" + +# Before: RAID 5 volume in slot #0, 3 disks, 64k chunk size +# RAID 0 volume in slot #1, 3 disks, 64k chunk size +vol0_level=5 +vol0_comp_size=$((5 * 1024)) +vol0_chunk=64 +vol0_num_comps=$((num_disks - 1)) +vol0_offset=0 + +vol1_level=0 +vol1_comp_size=$((5 * 1024)) +vol1_chunk=64 +vol1_offset=$((vol0_comp_size + 2048)) +vol1_num_comps=$num_disks + +# After: RAID 5 volume in slot #0, 4 disks, 64k chunk size +# RAID 0 volume in slot #1, 4 disks, 64k chunk size +vol0_new_num_comps=$num_disks +vol1_new_num_comps=$((num_disks + 1)) + +. tests/imsm-grow-template 0 0 diff --git a/tests/13imsm-r5_r0_3d-grow-r5_r0_5d b/tests/13imsm-r5_r0_3d-grow-r5_r0_5d new file mode 100644 index 00000000..a97002d0 --- /dev/null +++ b/tests/13imsm-r5_r0_3d-grow-r5_r0_5d @@ -0,0 +1,29 @@ +. tests/env-imsm-template + +# Grow the container (arrays inside) from 3 disks to 5 disks +# POSITIVE test + +num_disks=3 +device_list="$dev0 $dev1 $dev2" +spare_list="$dev3 $dev4" + +# Before: RAID 5 volume in slot #0, 3 disks, 128k chunk size +# RAID 0 volume in slot #1, 3 disks, 256k chunk size +vol0_level=5 +vol0_comp_size=$((5 * 1024)) +vol0_chunk=64 +vol0_num_comps=$((num_disks - 1)) +vol0_offset=0 + +vol1_level=0 +vol1_comp_size=$((5 * 1024)) +vol1_chunk=64 +vol1_offset=$((vol0_comp_size + 2048)) +vol1_num_comps=$num_disks + +# After: RAID 5 volume in slot #0, 5 disks, 128k chunk size +# RAID 0 volume in slot #1, 5 disks, 256k chunk size +vol0_new_num_comps=$((num_disks + 1)) +vol1_new_num_comps=$((num_disks + 2)) + +. tests/imsm-grow-template 0 0 diff --git a/tests/14imsm-r0_3d-r5_3d-migrate-r5_4d-r5_4d b/tests/14imsm-r0_3d-r5_3d-migrate-r5_4d-r5_4d new file mode 100644 index 00000000..386abeee --- /dev/null +++ b/tests/14imsm-r0_3d-r5_3d-migrate-r5_4d-r5_4d @@ -0,0 +1,29 @@ +. tests/env-imsm-template + +# RAID 0 and RAID 5 volumes (3 disks) migrate to RAID 5 and RAID 5 volumes (4 disks) +# NEGATIVE test - migration is not allowed if there is more then one array in a container + +num_disks=3 +device_list="$dev0 $dev1 $dev2" +spare_list="$dev3" + +# Before: RAID 0 volume, 3 disks, 64k chunk size, as member #0 +vol0_level=0 +vol0_comp_size=$((5 * 1024)) +vol0_chunk=64 +vol0_num_comps=$num_disks +vol0_offset=0 + +# Extra: RAID 5 volume, 3 disks, 64k chunk size, as member #1 +vol1_level=5 +vol1_comp_size=$((5 * 1024)) +vol1_chunk=64 +vol1_num_comps=$((num_disks - 1)) +vol1_offset=$((vol0_comp_size + 2048)) + +# After: RAID 5 volume, 4 disks, 64k chunk size (only member #0) +vol0_new_level=5 +vol0_new_num_comps=$num_disks +vol0_new_chunk=64 + +. tests/imsm-grow-template 1 1 diff --git a/tests/14imsm-r0_3d_no_spares-migrate-r5_3d b/tests/14imsm-r0_3d_no_spares-migrate-r5_3d new file mode 100644 index 00000000..10bbab6d --- /dev/null +++ b/tests/14imsm-r0_3d_no_spares-migrate-r5_3d @@ -0,0 +1,21 @@ +. tests/env-imsm-template + +# RAID 0 volume (3 disks, no spares) migrate to RAID 5 volume (3 disks) +# NEGATIVE test + +num_disks=3 +device_list="$dev0 $dev1 $dev2" + +# Before: RAID 0 volume, 3 disks, 64k chunk size +vol0_level=0 +vol0_comp_size=$((5 * 1024)) +vol0_chunk=64 +vol0_num_comps=$num_disks +vol0_offset=0 + +# After: RAID 5, 3 disks, 64k chunk size +vol0_new_level=5 +vol0_new_num_comps=$((num_disks - 1)) +vol0_new_chunk=64 + +. tests/imsm-grow-template 1 diff --git a/tests/14imsm-r0_r0_2d-takeover-r10_4d b/tests/14imsm-r0_r0_2d-takeover-r10_4d new file mode 100644 index 00000000..df5b0ce3 --- /dev/null +++ b/tests/14imsm-r0_r0_2d-takeover-r10_4d @@ -0,0 +1,30 @@ +. tests/env-imsm-template + + +# Two RAID 0 volumes (2 disks) migrate to RAID 10 volume (4 disks) +# NEGATIVE test + +num_disks=2 +device_list="$dev0 $dev1" + +# Before: RAID 0 volume in slot #0, 2 disks, 64k chunk size +# RAID 0 volume in slot #1, 2 disks, 64k chunk size +vol0_level=0 +vol0_comp_size=$((5 * 1024)) +vol0_chunk=64 +vol0_num_comps=$num_disks +vol0_offset=0 + +# Before: RAID 0 volume, disks, 64k chunk size +vol1_level=0 +vol1_comp_size=$((5 * 1024)) +vol1_chunk=64 +vol1_num_comps=num_disks +vol1_offset=$(( $vol0_comp_size + 2048 )) + +# After: RAID 10, 4 disks, 64k chunk size +vol0_new_level=10 +vol0_new_num_comps=$((num_disks - 1)) +vol0_new_chunk=64 + +. tests/imsm-grow-template 1 1 diff --git a/tests/14imsm-r10_4d-grow-r10_5d b/tests/14imsm-r10_4d-grow-r10_5d new file mode 100644 index 00000000..bcbe1476 --- /dev/null +++ b/tests/14imsm-r10_4d-grow-r10_5d @@ -0,0 +1,20 @@ +. tests/env-imsm-template + +# RAID 10 volume, 4 disks grow to RAID 10 volume, 5 disks +# NEGATIVE test + +num_disks=4 +device_list="$dev0 $dev1 $dev2 $dev3" +spare_list="$dev4" + +# Before: RAID 10 volume, 4 disks, 128k chunk size +vol0_level=10 +vol0_comp_size=$((5 * 1024)) +vol0_chunk=128 +vol0_num_comps=$((num_disks - 2)) +vol0_offset=0 + +# After: RAID 10 volume, 5 disks, 128k chunks size (test should fail) +vol0_new_num_comps=$((num_disks + 1)) + +. tests/imsm-grow-template 1 0 diff --git a/tests/14imsm-r10_r5_4d-takeover-r0_2d b/tests/14imsm-r10_r5_4d-takeover-r0_2d new file mode 100644 index 00000000..9e5205e2 --- /dev/null +++ b/tests/14imsm-r10_r5_4d-takeover-r0_2d @@ -0,0 +1,30 @@ +. tests/env-imsm-template + + +# Two RAID volumes: RAID10 and RAID5 (4 disks) migrate to RAID 0 volume (2 disks) +# NEGATIVE test + +num_disks=4 +device_list="$dev0 $dev1 $dev2 $dev3" + +# Before: RAID 10 volume in slot #0, 4 disks, 64k chunk size +# RAID 5 volume in slot #1, 4 disks, 64k chunk size +vol0_level=10 +vol0_comp_size=$((5 * 1024)) +vol0_chunk=64 +vol0_num_comps=$(( $num_disks - 2 )) +vol0_offset=0 + +# Before: RAID 0 volume, disks, 64k chunk size +vol1_level=5 +vol1_comp_size=$((5 * 1024)) +vol1_chunk=64 +vol1_num_comps=$(( $num_disks - 1 )) +vol1_offset=$(( $vol0_comp_size + 2048 )) + +# After: RAID 10, 4 disks, 64k chunk size +vol0_new_level=0 +vol0_new_num_comps=2 +vol0_new_chunk=64 + +. tests/imsm-grow-template 1 1 diff --git a/tests/14imsm-r1_2d-grow-r1_3d b/tests/14imsm-r1_2d-grow-r1_3d new file mode 100644 index 00000000..1edd50e4 --- /dev/null +++ b/tests/14imsm-r1_2d-grow-r1_3d @@ -0,0 +1,20 @@ +. tests/env-imsm-template + +# RAID 1 volume, 2 disks grow to RAID 1 volume, 3 disks +# NEGATIVE test + +num_disks=2 +device_list="$dev0 $dev1" +spare_list="$dev4" + +# Before: RAID 1 volume, 2 disks, 64k chunk size +vol0_level=1 +vol0_comp_size=$((5 * 1024)) +vol0_chunk=64 +vol0_num_comps=$((num_disks - 1)) +vol0_offset=0 + +# After: RAID 1 volume, 3 disks, 64k chunks size (test should fail) +vol0_new_num_comps=$num_disks + +. tests/imsm-grow-template 1 0 diff --git a/tests/14imsm-r1_2d-takeover-r0_2d b/tests/14imsm-r1_2d-takeover-r0_2d new file mode 100644 index 00000000..d8296815 --- /dev/null +++ b/tests/14imsm-r1_2d-takeover-r0_2d @@ -0,0 +1,22 @@ +. tests/env-imsm-template + +# RAID 1 volume, 2 disks change to RAID 0 volume, 2 disks +# +#NEGATIVE test + +num_disks=2 +device_list="$dev0 $dev1" + +# Before: RAID 1 volume, 2 disks, 64k chunk size +vol0_level=1 +vol0_comp_size=$((5 * 1024)) +vol0_chunk=64 +vol0_num_comps=$((num_disks - 1)) +vol0_offset=0 + +# After: RAID 0 volume, 2 disks, 64k chunk size +vol0_new_level=0 +vol0_new_num_comps=$num_disks +vol0_new_chunk=64 + +. tests/imsm-grow-template 1 diff --git a/tests/14imsm-r5_3d-grow-r5_5d-no-spares b/tests/14imsm-r5_3d-grow-r5_5d-no-spares new file mode 100644 index 00000000..ed18e72b --- /dev/null +++ b/tests/14imsm-r5_3d-grow-r5_5d-no-spares @@ -0,0 +1,20 @@ +. tests/env-imsm-template + +# RAID 5 volume, 3 disks grow to RAID 5 volume, 4 disks +# NEGATIVE test + +num_disks=3 +device_list="$dev0 $dev1 $dev2" + +# Before: RAID 5 volume, 3 disks, 64k chunk size +vol0_level=5 +vol0_comp_size=$((5 * 1024)) +vol0_chunk=64 +vol0_num_comps=$((num_disks - 1)) +vol0_offset=0 + +# After: RAID 5 volume, 4 disks, 64k chunks size +add_to_num_disks=2 +vol0_new_num_comps=$((num_disks + 2)) + +. tests/imsm-grow-template 1 0 diff --git a/tests/14imsm-r5_3d-migrate-r4_3d b/tests/14imsm-r5_3d-migrate-r4_3d new file mode 100644 index 00000000..e3b971cc --- /dev/null +++ b/tests/14imsm-r5_3d-migrate-r4_3d @@ -0,0 +1,21 @@ +. tests/env-imsm-template + +# RAID 5 volume (3 disks) migrate to RAID 4 volume (3 disks) +# NEGATIVE test + +num_disks=3 +device_list="$dev0 $dev1 $dev2" + +# Before: RAID 5 volume, 3 disks, 64k chunk size +vol0_level=5 +vol0_comp_size=$((5 * 1024)) +vol0_chunk=64 +vol0_num_comps=$((num_disks - 1)) +vol0_offset=0 + +# After: RAID 4, 3 disks, 64k chunk size +vol0_new_level=4 +vol0_new_num_comps=$((num_disks - 1)) +vol0_new_chunk=64 + +. tests/imsm-grow-template 1 diff --git a/tests/15imsm-r0_3d_64k-migrate-r0_3d_256k b/tests/15imsm-r0_3d_64k-migrate-r0_3d_256k new file mode 100644 index 00000000..4fe3807e --- /dev/null +++ b/tests/15imsm-r0_3d_64k-migrate-r0_3d_256k @@ -0,0 +1,21 @@ +. tests/env-imsm-template + +# RAID 0 volume, Migration from 64k to 256k chunk size. +# POSITIVE test + +num_disks=2 +device_list="$dev0 $dev1" + +# RAID 0, 2 disks, 64k chunk size +vol0_level=0 +vol0_comp_size=$((5 * 1024)) +vol0_chunk=64 +vol0_num_comps=$num_disks +vol0_offset=0 + +# RAID 0, 2 disks, 256k chunk size +vol0_new_level=0 +vol0_new_num_comps=$vol0_num_comps +vol0_new_chunk=256 + +. tests/imsm-grow-template 0 1 diff --git a/tests/15imsm-r5_3d_4k-migrate-r5_3d_256k b/tests/15imsm-r5_3d_4k-migrate-r5_3d_256k new file mode 100644 index 00000000..025e9efb --- /dev/null +++ b/tests/15imsm-r5_3d_4k-migrate-r5_3d_256k @@ -0,0 +1,21 @@ +. tests/env-imsm-template + +# RAID 5 volume, Migration from 4k to 256 chunk size. +# POSITIVE test + +num_disks=3 +device_list="$dev0 $dev1 $dev2" + +# RAID 5, 3 disks, 4k chunk size +vol0_level=5 +vol0_comp_size=$((5 * 1024)) +vol0_chunk=4 +vol0_num_comps=$((num_disks - 1)) +vol0_offset=0 + +# RAID 5, 3 disks, 256k chunk size +vol0_new_level=5 +vol0_new_num_comps=$vol0_num_comps +vol0_new_chunk=256 + +. tests/imsm-grow-template 0 1 diff --git a/tests/15imsm-r5_3d_64k-migrate-r5_3d_256k b/tests/15imsm-r5_3d_64k-migrate-r5_3d_256k new file mode 100644 index 00000000..37547b74 --- /dev/null +++ b/tests/15imsm-r5_3d_64k-migrate-r5_3d_256k @@ -0,0 +1,21 @@ +. tests/env-imsm-template + +# RAID 5 volume, Migration from 64k to 256k chunk size. +# POSITIVE test + +num_disks=3 +device_list="$dev0 $dev1 $dev2" + +# RAID 5, 3 disks, 64k chunk size +vol0_level=5 +vol0_comp_size=$((5 * 1024)) +vol0_chunk=64 +vol0_num_comps=$((num_disks - 1)) +vol0_offset=0 + +# RAID 5, 3 disks, 256k chunk size +vol0_new_level=5 +vol0_new_num_comps=$vol0_num_comps +vol0_new_chunk=256 + +. tests/imsm-grow-template 0 1 diff --git a/tests/15imsm-r5_6d_4k-migrate-r5_6d_256k b/tests/15imsm-r5_6d_4k-migrate-r5_6d_256k new file mode 100644 index 00000000..d2f6c707 --- /dev/null +++ b/tests/15imsm-r5_6d_4k-migrate-r5_6d_256k @@ -0,0 +1,21 @@ +. tests/env-imsm-template + +# RAID 5 volume, Migration from 4k to 256k chunk size. +# POSITIVE test + +num_disks=6 +device_list="$dev0 $dev1 $dev2 $dev3 $dev4 $dev5" + +# RAID 5, 6 disks, 4k chunk size +vol0_level=5 +vol0_comp_size=$((5 * 1024)) +vol0_chunk=4 +vol0_num_comps=$((num_disks - 1)) +vol0_offset=0 + +# RAID 5, 6 disks, 256k chunk size +vol0_new_level=5 +vol0_new_num_comps=$vol0_num_comps +vol0_new_chunk=256 + +. tests/imsm-grow-template 0 1 diff --git a/tests/15imsm-r5_r0_3d_64k-migrate-r5_r0_3d_256k b/tests/15imsm-r5_r0_3d_64k-migrate-r5_r0_3d_256k new file mode 100644 index 00000000..da218efa --- /dev/null +++ b/tests/15imsm-r5_r0_3d_64k-migrate-r5_r0_3d_256k @@ -0,0 +1,34 @@ +. tests/env-imsm-template + +# Member 0: RAID 5 volume, Member 1: RAID 0 volume +# Migration from 64k to 256k chunk size (both members) +# POSITIVE test + +num_disks=3 +device_list="$dev0 $dev1 $dev2" + +# RAID 5, 3 disks, 64k chunk size +vol0_level=5 +vol0_comp_size=$((5 * 1024)) +vol0_chunk=64 +vol0_num_comps=$((num_disks - 1)) +vol0_offset=0 + +# After migration parameters +vol0_new_level=5 +vol0_new_num_comps=$vol0_num_comps +vol0_new_chunk=256 + +# RAID 0, 3 disks, 64k chunk size +vol1_level=0 +vol1_comp_size=$((5 * 1024)) +vol1_chunk=64 +vol1_num_comps=$num_disks +vol1_offset=$((vol0_comp_size + 2048)) + +# After migration paramters +vol1_new_level=0 +vol1_new_num_comps=$vol1_num_comps +vol1_new_chunk=256 + +. tests/imsm-grow-template 0 1 diff --git a/tests/16imsm-r0_3d-migrate-r5_4d b/tests/16imsm-r0_3d-migrate-r5_4d new file mode 100644 index 00000000..4f45479a --- /dev/null +++ b/tests/16imsm-r0_3d-migrate-r5_4d @@ -0,0 +1,21 @@ +. tests/env-imsm-template + +# RAID 0 volume (3 disks) migrate to RAID 5 volume (4 disks) +# POSITIVE test + +num_disks=3 +device_list="$dev0 $dev1 $dev2" + +# Before: RAID 0, 3 disks, 64k chunk size +vol0_level=0 +vol0_comp_size=$((5 * 1024)) +vol0_chunk=64 +vol0_num_comps=$num_disks +vol0_offset=0 + +# After: RAID 5, 4 disks, 64k chunk size +vol0_new_level=5 +vol0_new_num_comps=$num_disks +vol0_new_chunk=64 + +. tests/imsm-grow-template 0 1 diff --git a/tests/16imsm-r0_5d-migrate-r5_6d b/tests/16imsm-r0_5d-migrate-r5_6d new file mode 100644 index 00000000..bee505bf --- /dev/null +++ b/tests/16imsm-r0_5d-migrate-r5_6d @@ -0,0 +1,21 @@ +. tests/env-imsm-template + +# RAID 0 volume (5 disks) migrate to RAID 5 volume (6 disks) +# POSITIVE test + +num_disks=5 +device_list="$dev0 $dev1 $dev2 $dev3 $dev4" + +# Before: RAID 0, 5 disks, 64k chunk size +vol0_level=0 +vol0_comp_size=$((5 * 1024)) +vol0_chunk=64 +vol0_num_comps=$num_disks +vol0_offset=0 + +# After: RAID 5, 6 disks, 64k chunk size +vol0_new_level=5 +vol0_new_num_comps=$num_disks +vol0_new_chunk=64 + +. tests/imsm-grow-template 0 1 diff --git a/tests/16imsm-r5_3d-migrate-r0_3d b/tests/16imsm-r5_3d-migrate-r0_3d new file mode 100644 index 00000000..b1459cc1 --- /dev/null +++ b/tests/16imsm-r5_3d-migrate-r0_3d @@ -0,0 +1,21 @@ +. tests/env-imsm-template + +# RAID 5 volume (3 disks) migrate to RAID 0 volume (2 disks) +# POSITIVE test + +num_disks=3 +device_list="$dev0 $dev1 $dev2" + +# Before: RAID 5, 3 disks, 64k chunk size +vol0_level=5 +vol0_comp_size=$((5 * 1024)) +vol0_chunk=64 +vol0_num_comps=$((num_disks - 1)) +vol0_offset=0 + +# After: RAID 0, 3 disks, 64k chunk size +vol0_new_level=0 +vol0_new_num_comps=$((num_disks-1)) +vol0_new_chunk=64 + +. tests/imsm-grow-template 0 1 diff --git a/tests/16imsm-r5_5d-migrate-r0_5d b/tests/16imsm-r5_5d-migrate-r0_5d new file mode 100644 index 00000000..323ca52e --- /dev/null +++ b/tests/16imsm-r5_5d-migrate-r0_5d @@ -0,0 +1,21 @@ +. tests/env-imsm-template + +# RAID 5 volume (5 disks) migration to RAID 0 volume (4 disks) +# POSITIVE test + +num_disks=5 +device_list="$dev0 $dev1 $dev2 $dev3 $dev4" + +# Before: RAID 5 volume, 5 disks, 64k chunk size +vol0_level=5 +vol0_comp_size=$((5 * 1024)) +vol0_chunk=64 +vol0_num_comps=$((num_disks - 1)) +vol0_offset=0 + +# After: RAID 0 volume, 5 disks, 64k chunk size +vol0_new_level=0 +vol0_new_num_comps=$((num_disks - 1)) +vol0_new_chunk=64 + +. tests/imsm-grow-template 0 1 diff --git a/tests/18imsm-1d-takeover-r0_1d b/tests/18imsm-1d-takeover-r0_1d new file mode 100644 index 00000000..6f5cf5a6 --- /dev/null +++ b/tests/18imsm-1d-takeover-r0_1d @@ -0,0 +1,22 @@ +. tests/env-imsm-template + +# Create RAID 0 from a single disk. +# POSITIVE test + +vol0_num_comps=1 +vol0_comp_size=$((10 * 1024)) + +# Create container +mdadm --create --run $container --auto=md --metadata=imsm --force --raid-disks=$vol0_num_comps $dev0 +check wait +imsm_check container $vol0_num_comps + +# Create RAID 0 volume +mdadm --create --run $member0 --auto=md --level=0 --size=$vol0_comp_size --chunk=64 --force --raid-disks=$vol0_num_comps $dev0 +check wait + +# Test the member +imsm_check member $member0 $vol0_num_comps 0 $vol0_comp_size $((vol0_num_comps * vol0_comp_size)) 0 64 +testdev $member0 $vol0_num_comps $vol0_comp_size 64 + +exit 0 diff --git a/tests/18imsm-1d-takeover-r1_2d b/tests/18imsm-1d-takeover-r1_2d new file mode 100644 index 00000000..72e4173e --- /dev/null +++ b/tests/18imsm-1d-takeover-r1_2d @@ -0,0 +1,20 @@ +. tests/env-imsm-template + +# Create RAID 1 from a single disk +# POSITIVE test + +vol0_num_comps=1 +vol0_comp_size=$((10 * 1024)) + +# Create container +mdadm --create --run $container --auto=md --metadata=imsm --force --raid-disks=$vol0_num_comps $dev0 +check wait +imsm_check container $vol0_num_comps + +# Create RAID 1 volume +mdadm --create --run $member0 --auto=md --level=1 --size=$vol0_comp_size --chunk=64 --raid-disks=$((vol0_num_comps + 1)) $dev0 missing +check wait + +# Test the member0 +imsm_check member $member0 $((vol_num_comps + 1)) 1 $vol0_comp_size $((vol0_num_comps * vol0_comp_size)) 0 64 +testdev $member0 $vol0_num_comps $vol0_comp_size 64 diff --git a/tests/18imsm-r0_2d-takeover-r10_4d b/tests/18imsm-r0_2d-takeover-r10_4d new file mode 100644 index 00000000..0e77e5da --- /dev/null +++ b/tests/18imsm-r0_2d-takeover-r10_4d @@ -0,0 +1,22 @@ +. tests/env-imsm-template + +# RAID 0 volume, 2 disks change to RAID 10 volume, 4 disks +# POSITIVE test + +num_disks=2 +device_list="$dev0 $dev1" +spare_list="$dev2 $dev3" + +# Before: RAID 0 volume, 2 disks, 256k chunk size +vol0_level=0 +vol0_comp_size=$((5 * 1024)) +vol0_chunk=128 +vol0_num_comps=$num_disks +vol0_offset=0 + +# After: RAID 10 volume, 4 disks, 256k chunk size +vol0_new_level=10 +vol0_new_num_comps=$vol0_num_comps +vol0_new_chunk=128 + +. tests/imsm-grow-template 0 1 diff --git a/tests/18imsm-r10_4d-takeover-r0_2d b/tests/18imsm-r10_4d-takeover-r0_2d new file mode 100644 index 00000000..8a9606b4 --- /dev/null +++ b/tests/18imsm-r10_4d-takeover-r0_2d @@ -0,0 +1,22 @@ +. tests/env-imsm-template + +# RAID 10 volume, 4 disks change to RAID 0 volume, 2 disks +# POSITIVE test + +num_disks=4 +device_list="$dev0 $dev1 $dev2 $dev3" + +# Before: RAID 10 volume, 4 disks, 128k chunk size +vol0_level=10 +vol0_comp_size=$((5 * 1024)) +vol0_chunk=128 +vol0_num_comps=$((num_disks - 2)) +vol0_offset=0 + +# After: RAID 0 volume, 2 disks, 128k chunk size +vol0_new_level=0 +vol0_new_num_comps=2 +vol0_new_chunk=128 +new_num_disks=2 + +. tests/imsm-grow-template 0 1 diff --git a/tests/18imsm-r1_2d-takeover-r0_1d b/tests/18imsm-r1_2d-takeover-r0_1d new file mode 100644 index 00000000..cb10ec97 --- /dev/null +++ b/tests/18imsm-r1_2d-takeover-r0_1d @@ -0,0 +1,22 @@ +. tests/env-imsm-template + +# RAID 1 volume, 2 disks change to RAID 0 volume, 1 disks +# POSITIVE test + +num_disks=2 +device_list="$dev0 $dev1" + +# Before: RAID 1 volume, 2 disks +vol0_level=1 +vol0_comp_size=$((5 * 1024)) +vol0_chunk=64 +vol0_num_comps=$(( $num_disks - 1 )) +vol0_offset=0 + +# After: RAID 0 volume, 1 disks, 64k chunk size +vol0_new_level=0 +vol0_new_num_comps=1 +vol0_new_chunk=64 +new_num_disks=0 + +. tests/imsm-grow-template 0 1 diff --git a/tests/19raid6auto-repair b/tests/19raid6auto-repair new file mode 100644 index 00000000..7fb1c72f --- /dev/null +++ b/tests/19raid6auto-repair @@ -0,0 +1,41 @@ +number_of_disks=5 +chunksize_in_kib=512 +chunksize_in_b=$[chunksize_in_kib*1024] +array_data_size_in_kib=$[chunksize_in_kib*(number_of_disks-2)*number_of_disks] +array_data_size_in_b=$[array_data_size_in_kib*1024] +devs="$dev0 $dev1 $dev2 $dev3 $dev4" + +# default 2048 sectors +data_offset_in_kib=$[2048/2] + +# make a raid5 from a file +dd if=/dev/urandom of=/tmp/RandFile bs=1024 count=$array_data_size_in_kib +mdadm -CR $md0 -l6 -n$number_of_disks -c $chunksize_in_kib $devs +dd if=/tmp/RandFile of=$md0 bs=1024 count=$array_data_size_in_kib +blockdev --flushbufs $md0; sync +check wait +blockdev --flushbufs $devs; sync +echo 3 > /proc/sys/vm/drop_caches +cmp -s -n $array_data_size_in_b $md0 /tmp/RandFile || { echo sanity cmp failed ; exit 2; } + +# wipe out 5 chunks on each device +dd if=/dev/urandom of=$dev0 bs=1024 count=$[5*chunksize_in_kib] seek=$[data_offset_in_kib+chunksize_in_kib*0] +dd if=/dev/urandom of=$dev1 bs=1024 count=$[5*chunksize_in_kib] seek=$[data_offset_in_kib+chunksize_in_kib*5] +dd if=/dev/urandom of=$dev2 bs=1024 count=$[5*chunksize_in_kib] seek=$[data_offset_in_kib+chunksize_in_kib*10] +dd if=/dev/urandom of=$dev3 bs=1024 count=$[5*chunksize_in_kib] seek=$[data_offset_in_kib+chunksize_in_kib*15] +dd if=/dev/urandom of=$dev4 bs=1024 count=$[5*chunksize_in_kib] seek=$[data_offset_in_kib+chunksize_in_kib*20] + +blockdev --flushbufs $devs; sync +echo 3 > /proc/sys/vm/drop_caches + +$dir/raid6check $md0 0 0 2>&1 | grep -qs "Error" || { echo should detect errors; exit 2; } + +$dir/raid6check $md0 0 0 autorepair > /dev/null || { echo repair failed; exit 2; } +blockdev --flushbufs $md0 $devs; sync +echo 3 > /proc/sys/vm/drop_caches + +$dir/raid6check $md0 0 0 2>&1 | grep -qs "Error" && { echo errors detected; exit 2; } +cmp -s -n $array_data_size_in_b $md0 /tmp/RandFile || { echo cmp failed ; exit 2; } + +mdadm -S $md0 +udevadm settle diff --git a/tests/19raid6check b/tests/19raid6check new file mode 100644 index 00000000..67958c6a --- /dev/null +++ b/tests/19raid6check @@ -0,0 +1,27 @@ +# +# Confirm that raid6check handles all RAID6 layouts. +# Try both 4 and 5 devices. + +layouts='ls rs la ra' +lv=`uname -r` +if expr $lv '>=' 2.6.30 > /dev/null +then + layouts="$layouts parity-first ddf-zero-restart ddf-N-restart ddf-N-continue \ + left-asymmetric-6 right-asymmetric-6 left-symmetric-6 right-symmetric-6 parity-first-6" +fi + +for layout in $layouts +do + for devs in 4 5 + do + dl="$dev0 $dev1 $dev2 $dev3" + if [ $devs = 5 ]; then dl="$dl $dev4"; fi + + mdadm -CR $md0 -l6 --layout $layout -n$devs $dl + check wait + tar cf - /etc > $md0 + ./raid6check $md0 0 0 | grep 'Error detected' && exit 1 + mdadm -S $md0 + done +done + diff --git a/tests/19raid6repair b/tests/19raid6repair new file mode 100644 index 00000000..1159bd3e --- /dev/null +++ b/tests/19raid6repair @@ -0,0 +1,47 @@ +number_of_disks=4 +chunksize_in_kib=512 +chunksize_in_b=$[chunksize_in_kib*1024] +array_data_size_in_kib=$[chunksize_in_kib*(number_of_disks-2)*number_of_disks] +array_data_size_in_b=$[array_data_size_in_kib*1024] +devs="$dev1 $dev2 $dev3 $dev4" + +# default 2048 sectors +data_offset_in_kib=$[2048/2] + +for failure in "$dev3 3 3 2" "$dev3 3 2 3" "$dev3 3 2 1" "$dev3 3 2 0" "$dev4 3 3 0" "$dev4 3 3 1" "$dev4 3 3 2" \ + "$dev1 3 0 1" "$dev1 3 0 2" "$dev1 3 0 3" "$dev2 3 1 0" "$dev2 3 1 2" "$dev2 3 1 3" ; do + failure_split=( $failure ) + device_with_error=${failure_split[0]} + stripe_with_error=${failure_split[1]} + repair_params="$stripe_with_error ${failure_split[2]} ${failure_split[3]}" + start_of_errors_in_kib=$[data_offset_in_kib+chunksize_in_kib*stripe_with_error] + + # make a raid5 from a file + dd if=/dev/urandom of=/tmp/RandFile bs=1024 count=$array_data_size_in_kib + mdadm -CR $md0 -l6 -n$number_of_disks -c $chunksize_in_kib $devs + dd if=/tmp/RandFile of=$md0 bs=1024 count=$array_data_size_in_kib + blockdev --flushbufs $md0; sync + + check wait + blockdev --flushbufs $devs; sync + echo 3 > /proc/sys/vm/drop_caches + cmp -s -n $array_data_size_in_b $md0 /tmp/RandFile || { echo sanity cmp failed ; exit 2; } + + dd if=/dev/urandom of=$device_with_error bs=1024 count=$chunksize_in_kib seek=$start_of_errors_in_kib + blockdev --flushbufs $device_with_error; sync + echo 3 > /proc/sys/vm/drop_caches + + $dir/raid6check $md0 0 0 2>&1 | grep -qs "Error" || { echo should detect errors; exit 2; } + + $dir/raid6check $md0 repair $repair_params > /dev/null || { echo repair failed; exit 2; } + blockdev --flushbufs $md0 $devs; sync + echo 3 > /proc/sys/vm/drop_caches + + $dir/raid6check $md0 0 0 2>&1 | grep -qs "Error" && { echo errors detected; exit 2; } + cmp -s -n $array_data_size_in_b $md0 /tmp/RandFile || { echo cmp failed ; exit 2; } + + mdadm -S $md0 + udevadm settle + sync + echo 3 > /proc/sys/vm/drop_caches +done diff --git a/tests/19repair-does-not-destroy b/tests/19repair-does-not-destroy new file mode 100644 index 00000000..a92883fd --- /dev/null +++ b/tests/19repair-does-not-destroy @@ -0,0 +1,28 @@ +number_of_disks=7 +chunksize_in_kib=512 +array_data_size_in_kib=$[chunksize_in_kib*(number_of_disks-2)*number_of_disks] +array_data_size_in_b=$[array_data_size_in_kib*1024] +devs="$dev0 $dev1 $dev2 $dev3 $dev4 $dev5 $dev6" + +dd if=/dev/urandom of=/tmp/RandFile bs=1024 count=$array_data_size_in_kib +mdadm -CR $md0 -l6 -n$number_of_disks -c $chunksize_in_kib $devs +dd if=/tmp/RandFile of=$md0 bs=1024 count=$array_data_size_in_kib +blockdev --flushbufs $md0; sync +check wait +blockdev --flushbufs $devs; sync +echo 3 > /proc/sys/vm/drop_caches +$dir/raid6check $md0 repair 1 2 3 > /dev/null # D D +$dir/raid6check $md0 repair 8 2 5 > /dev/null # D P +$dir/raid6check $md0 repair 15 4 6 > /dev/null # D Q +$dir/raid6check $md0 repair 22 5 6 > /dev/null # P Q +$dir/raid6check $md0 repair 3 4 0 > /dev/null # Q D +$dir/raid6check $md0 repair 3 3 1 > /dev/null # P D +$dir/raid6check $md0 repair 6 4 5 > /dev/null # D<D +$dir/raid6check $md0 repair 13 5 4 > /dev/null # D>D +blockdev --flushbufs $devs; sync +echo 3 > /proc/sys/vm/drop_caches +$dir/raid6check $md0 0 0 2>&1 | grep -qs "Error" && { echo errors detected; exit 2; } +cmp -s -n $array_data_size_in_b $md0 /tmp/RandFile || { echo should not mess up correct stripe ; exit 2; } + +mdadm -S $md0 +udevadm settle diff --git a/tests/ToTest b/tests/ToTest new file mode 100644 index 00000000..b98e266d --- /dev/null +++ b/tests/ToTest @@ -0,0 +1,44 @@ + +multipath!! + +add/remove/fail + raid1 DONE + raid5 DONE + raid6/10 needed?? + +assemble + by devices DONE + by uuid DONE + by superminor DONE + by config file DONE + + various --updates DONE (not sparc2.2 or summaries) + +stop + --scan + +readonly/readwrite + +bitmap + separate file + internal + filename in config file + +examine + --scan + --brief + +detail + +grow: + size + raid1/5/6 DONE + devices + raid1 add DONE + raid1 shrink DONE + +'--quiet' option, and remove "" +'--name' option fo v1, and configfile etc... + +faulty + errors in raid1/5/6 diff --git a/tests/check b/tests/check new file mode 100644 index 00000000..f4ed6d5b --- /dev/null +++ b/tests/check @@ -0,0 +1,35 @@ + +case $1 in + raid* | linear ) + grep -s "active $1 " /proc/mdstat > /dev/null || { + echo >&2 "ERROR active $1 not found" ; cat /proc/mdstat ; exit 1;} + ;; + resync | recovery ) + sleep 0.1 + grep -s $1 /proc/mdstat > /dev/null || { + echo >&2 ERROR no $1 happening; cat /proc/mdstat; exit 1; } + ;; + + nosync ) + sleep 0.5 + grep -s 're[synccovery]* =' > /dev/null /proc/mdstat && { + echo >&2 "ERROR resync or recovery is happening!"; cat /proc/mdstat ; exit 1; } + ;; + + wait ) + sleep 0.1 + while grep 're[synccovery]* =' > /dev/null /proc/mdstat + do sleep 2; + done + ;; + + state ) + grep -s "blocks.*\[$2\]\$" /proc/mdstat > /dev/null || { + echo >&2 "ERROR state $2 not found!"; cat /proc/mdstat ; exit 1; } + sleep 0.5 + ;; + + * ) echo >&2 ERROR unknown check $1 ; exit 1; +esac + +exit 0 diff --git a/tests/env-ddf-template b/tests/env-ddf-template new file mode 100644 index 00000000..90d7272f --- /dev/null +++ b/tests/env-ddf-template @@ -0,0 +1,113 @@ +sha1_sum() { + sha1sum "$1" | cut -c 1-40 +} + +get_rootdev() { + local dev=$(stat -c %D /) + local maj=$(expr $dev : '\(..*\)..') + local min=${dev#$maj} + local bd=/dev/$(basename $(readlink /sys/dev/block/$((0x$maj)):$((0x$min)))) + [ -b $bd ] || exit 1 + echo $bd +} + +get_sysdir() { + local mddev=$1 + [ -L $mddev ] && mddev=$(readlink -f $mddev) + echo "/sys/class/block/$(basename $mddev)/md" +} + +get_raiddisks() { + sysdir=$(get_sysdir "$1") + for i in $(seq 0 $(($(cat $sysdir/raid_disks)-1))); do + if [ -d $sysdir/rd$i ]; then + readlink -f /dev/block/$(cat $sysdir/rd$i/block/dev) + else + echo MISSING + fi + done +} + +get_present() { + get_raiddisks $1 | grep -vc MISSING +} + +ddf_check() { + udevadm settle + case $1 in + container ) + grep -s "blocks super external:ddf" /proc/mdstat > /dev/null || { + echo >&2 "**Fatal** Correctly formed container not found"; cat /proc/mdstat; exit 1; } + ;; + member ) + t_member=$2 + t_num_disks=$3 + t_level=$4 + t_rd_size=$5 + t_size=$6 + t_offset=$7 + t_chunk=$8 + t_layout=$9 + + if [ $t_chunk -ne 0 ]; then + t_rd_size=$((t_rd_size & ~(t_chunk - 1))) + fi + case $t_level in + 0) t_size=$((t_num_disks*$t_rd_size));; + 1) t_size=$t_rd_size;; + 4|5) t_size=$(((t_num_disks-1)*$t_rd_size));; + 6) t_size=$(((t_num_disks-2)*$t_rd_size));; + 10) t_size=$((t_num_disks*$t_rd_size/t_layout));; + esac + + err=0 + + eval `stat -L -c "let major=0x%t; let minor=0x%T;" $t_member` + sysfs=/sys/dev/block/${major}:${minor} + if [ ! -f ${sysfs}/md/array_state ]; then + echo "**Fatal**: Array member $t_member not found" >&2; cat /proc/mdstat >&2; exit 1 + fi + _chunk=`cat ${sysfs}/md/chunk_size` + if [ $t_chunk -ne $((_chunk/1024)) ]; then + echo "**Error**: Chunk size mismatch - expected $t_chunk, actual $_chunk" >&2 + err=$((err + 1)) + fi + for i in `seq 0 $((t_num_disks - 1))`; do + _offset=`cat ${sysfs}/md/rd${i}/offset` + if [ $t_offset -ne $((_offset / 2)) ]; then + echo "**Error**: Offset mismatch - expected $t_offset, actual $((_offset/2))" >&2 + err=$((err + 1)) + fi + _rd_size=`cat ${sysfs}/md/rd${i}/size` + if [ $t_rd_size -ne $_rd_size ]; then + echo "**Error**: Component size mismatch - expected $t_rd_size, actual $_rd_size" >&2 + err=$((err + 1)) + fi + done + _size=`cat ${sysfs}/md/array_size` + [ o$_size = odefault ] && _size=$(($(cat ${sysfs}/size)/2)) + if [ $t_size -ne $_size ]; then + echo "**Error**: Array size mismatch - expected $t_size, actual $_size" >&2 + err=$((err + 1)) + fi + if [ $err -gt 0 ]; then + echo "$t_member failed check" >&2 + cat /proc/mdstat >&2 + mdadm -E /dev/loop8 >&2 + exit 1 + fi + ;; + * ) + echo >&2 "**Error** unknown check $1"; exit 1; + esac +} + +container=/dev/md/ddf0 +member0=/dev/md/vol0 +member1=/dev/md/vol1 +member2=/dev/md/vol2 +member3=/dev/md/vol3 +member4=/dev/md/vol4 + +# We don't want systemd to start system mdmon; start our own +export MDADM_NO_SYSTEMCTL=1 diff --git a/tests/env-imsm-template b/tests/env-imsm-template new file mode 100644 index 00000000..bc5f5852 --- /dev/null +++ b/tests/env-imsm-template @@ -0,0 +1,74 @@ +imsm_check() { + udevadm settle + case $1 in + container ) + grep -s "blocks super external:imsm" /proc/mdstat > /dev/null || { + echo >&2 "**Fatal** Correctly formed container not found"; cat /proc/mdstat; exit 1; } + ;; + member ) + t_member=$2 + t_num_disks=$3 + t_level=$4 + t_rd_size=$5 + t_size=$6 + t_offset=$7 + t_chunk=$8 + + if [ $t_level -ne 1 ]; then + t_rd_size=$((t_rd_size & ~(t_chunk - 1))) + else + t_chunk=64 + fi + t_size=$((t_size/1024)) + t_size=$((t_size*1024)) + err=0 + + eval `stat -L -c "let major=0x%t; let minor=0x%T;" $t_member` + sysfs=/sys/dev/block/${major}:${minor} + if [ ! -f ${sysfs}/md/array_state ]; then + echo "**Fatal**: Array member $t_member not found" >&2; cat /proc/mdstat >&2; exit 1 + fi + _chunk=`cat ${sysfs}/md/chunk_size` + if [ $t_chunk -ne $((_chunk/1024)) ]; then + echo "**Error**: Chunk size mismatch - expected $t_chunk, actual $_chunk" >&2 + err=$((err + 1)) + fi + for i in `seq 0 $((t_num_disks - 1))`; do + _offset=`cat ${sysfs}/md/rd${i}/offset` + if [ $t_offset -ne $((_offset / 2)) ]; then + echo "**Error**: Offset mismatch - expected $t_offset, actual $_offset" >&2 + err=$((err + 1)) + fi + _rd_size=`cat ${sysfs}/md/rd${i}/size` + if [ $t_rd_size -ne $_rd_size ]; then + echo "**Error**: Component size mismatch - expected $t_rd_size, actual $_rd_size" >&2 + err=$((err + 1)) + fi + done + _size=`cat ${sysfs}/md/array_size` + if [ $t_size -ne $_size ]; then + echo "**Error**: Array size mismatch - expected $t_size, actual $_size" >&2 + err=$((err + 1)) + fi + if [ $err -gt 0 ]; then + echo "$t_member failed check" >&2 + cat /proc/mdstat >&2 + mdadm -E /dev/loop0 >&2 + exit 1 + fi + ;; + * ) + echo >&2 "**Error** unknown check $1"; exit 1; + esac +} + +export IMSM_NO_PLATFORM=1 +export IMSM_DEVNAME_AS_SERIAL=1 +export IMSM_TEST_OROM=1 +export MDADM_EXPERIMENTAL=1 +container=/dev/md/container +member0=/dev/md/vol0 +member1=/dev/md/vol1 +member2=/dev/md/vol2 +member3=/dev/md/vol3 +member4=/dev/md/vol4 diff --git a/tests/imsm-grow-template b/tests/imsm-grow-template new file mode 100644 index 00000000..71a0bbb1 --- /dev/null +++ b/tests/imsm-grow-template @@ -0,0 +1,106 @@ + +# 0 - POSITIVE test, otherwise NEGATIVE test +negative_test=$1 + +# 0 - On-line Capacity Expansion test, otherwise LEVEL migration or CHUNK size migration test +migration_test=$2 + +function grow_member() { + local member=$1 + local disks=$2 + local comps=$3 + local level=$4 + local size=$5 + local offset=$6 + local chunk=$7 + local array_size=$((comps * size)) + + rm -f $backup_imsm + ( set -ex; mdadm --grow $member --chunk=$chunk --level=$level ) + local status=$? + if [ $negative_test -ne 0 ]; then + if [ $status -eq 0 ]; then + echo >&2 "**Error**: $member: --grow should failed, but it completed successfuly" + exit 1 + fi + return + fi + check wait + sleep 5 + imsm_check member $member $disks $level $size $array_size $offset $chunk + testdev $member $comps $size $chunk +} + +# Create container +mdadm --create --run $container --auto=md --metadata=imsm --raid-disks=$num_disks $device_list +check wait +imsm_check container $num_disks + +# Create first volume inside the container +mdadm --create --run $member0 --auto=md --level=$vol0_level --size=$vol0_comp_size --chunk=$vol0_chunk --raid-disks=$num_disks $device_list +check wait + +# Create second volume inside the container (if defined) +if [ ! -z $vol1_chunk ]; then + mdadm --create --run $member1 --auto=md --level=$vol1_level --size=$vol1_comp_size --chunk=$vol1_chunk --raid-disks=$num_disks $device_list + check wait +fi + +# Wait for any RESYNC to complete +check wait + +# Test first volume +imsm_check member $member0 $num_disks $vol0_level $vol0_comp_size $((vol0_comp_size * vol0_num_comps)) $vol0_offset $vol0_chunk +testdev $member0 $vol0_num_comps $vol0_comp_size $vol0_chunk + +# Test second volume (if defined) +if [ ! -z $vol1_chunk ]; then + imsm_check member $member1 $num_disks $vol1_level $vol1_comp_size $((vol1_comp_size * vol1_num_comps)) $vol1_offset $vol1_chunk + testdev $member1 $vol1_num_comps $vol1_comp_size $vol1_chunk +fi + +# Add extra disks to container if operation requires spares in container. +for i in $spare_list +do + mdadm --add $container $i + check wait + num_disks=$((num_disks + 1)) +done + +imsm_check container $num_disks +num_disks=$((num_disks + add_to_num_disks)) +backup_imsm=/tmp/backup_imsm + +# Grow each member or a container depending on the type of an operation +if [ $migration_test -ne 0 ]; then + if [ -z $new_num_disks ]; then + new_num_disks=$num_disks + fi + grow_member $member0 $new_num_disks $vol0_new_num_comps $vol0_new_level $vol0_comp_size $vol0_offset $vol0_new_chunk + if [[ $vol1_new_chunk -ne 0 ]] ; then + grow_member $member1 $new_num_disks $vol1_new_num_comps $vol1_new_level $vol1_comp_size $vol1_offset $vol1_new_chunk + fi +else + rm -f $backup_imsm + ( set -x; mdadm --grow $container --raid-disks=$num_disks ) + grow_status=$? + if [ $negative_test -ne 0 ]; then + if [ $grow_status -eq 0 ]; then + echo >&2 "**Error**: $container: --grow should failed, but it completed successfuly" + exit 1 + fi + else + sleep 5 + check wait + sleep 5 + check wait + imsm_check member $member0 $num_disks $vol0_level $vol0_comp_size $((vol0_comp_size * vol0_new_num_comps)) $vol0_offset $vol0_chunk + testdev $member0 $vol0_new_num_comps $vol0_comp_size $vol0_chunk + if [ $vol1_new_num_comps -ne 0 ]; then + imsm_check member $member1 $num_disks $vol1_level $vol1_comp_size $((vol1_comp_size * vol1_new_num_comps)) $vol1_offset $vol1_chunk + testdev $member1 $vol1_new_num_comps $vol1_comp_size $vol1_chunk + fi + fi +fi + +exit 0 diff --git a/tests/testdev b/tests/testdev new file mode 100644 index 00000000..8b6e6f06 --- /dev/null +++ b/tests/testdev @@ -0,0 +1,13 @@ +dev=$1 +cnt=$2 +size=$3 +chunk=$4 +mkfs -j $dev > /dev/null 2>&1 && fsck -fn $dev >&2 +dsize=$[size/chunk] +dsize=$[dsize*chunk] +rasize=$[dsize*2*cnt] +if [ $rasize -ne `/sbin/blockdev --getsize $dev` ] +then + echo "ERROR: size is wrong for $dev: $cnt * $size (chunk=$chunk) = $rasize, not `/sbin/blockdev --getsize $dev`" + exit 1; +fi diff --git a/tests/utils b/tests/utils new file mode 100644 index 00000000..3acebd77 --- /dev/null +++ b/tests/utils @@ -0,0 +1,191 @@ +# set of functions used to test policy framework with assemble, incremental and Monitor + +set +e +#create links to be able to use domains +for d in 0 1 2 3 4 5 6 7 8 9 10 11 12 +do + eval ln -s \$dev$d /dev/disk/by-path/loop$d + eval d$d="loop$d" + eval mdadm --zero-superblock \$dev$d +done + +devices="/dev/loop[0-9] /dev/loop10 /dev/loop11 /dev/loop12" + +# on failure print out few things before exit +# uses testdsc and platform global variables +err(){ + echo >&2 "ERROR: $*" + cat $config >&2 || true + cat /proc/mdstat >&2 + [ -z "$testdsc" ] || { echo >&2 $platform: $testdsc "- failed"; } + ps -e | grep mdadm >&2 || true + if [ $listfailed == "yes" ]; then + [ "$verbose" != "yes" ] || echo ---FAILED--- + flist="$flist \n $platform $testdsc" + failed=1 + else + exit 1 + fi +} + +# set test description +dsc(){ + failed=0 + testdsc="$*" + [ "$verbose" != "yes" ] || echo $testdsc +} + +killmonitor(){ + [ -z "$monitorpid" ] || { kill -9 $monitorpid; unset monitorpid; } +} + +tidyup(){ + killmonitor + mdadm -Ss || true + mdadm -Ss + mdadm --zero-superblock $devices || true + udevadm settle + rm -f $config +} + +trap tidyup 0 1 2 3 15 + +# create a RAID 1 array or container and subarray(s) on 2 disks +# if platform not specified imsm is used +# if subsize is given, first subarray is created with given size and second one on remaining space +ccv(){ + # mddevno used to name created array + local mddevno="$1" + # numbers of devices to be used in array + local devno1="$2" + local devno2="$3" + local platform="$4" + local subsize="$5" + local onearray="$6" + [ -n "$platform" ] || platform="imsm" + if [ "$platform" == "imsm" ] || [ "$platform" == "ddf" ]; then + eval mdadm -CR /dev/md/con$mddevno -e $platform -n 2 \$dev$devno1 \$dev$devno2 + udevadm settle + [ -z "$subsize" ] || eval mdadm -CR sub$mddevno"_" -l 1 -n 2 /dev/md/con$mddevno -z $subsize + [ -n "$onearray" ] || eval mdadm -CR sub$mddevno -l 1 -n 2 /dev/md/con$mddevno + else + [ -z "$subsize" ] || sizepar="-z $subsize" + eval mdadm -CR arr$mddevno -e $platform -l 1 -n 2 \$dev$devno1 \$dev$devno2 $sizepar + unset sizepar + fi +} + +# get container and subarray using given device from mdstat +# sets global variables c and v +getarray(){ + local devname=`basename $1` + local platformtype=`grep -A 1 $devname /proc/mdstat | awk '/active/ {getline; print $4 }' | awk -F ":" 'END {print $1}'` + c=`grep "inactive.*$devname" /proc/mdstat | awk -F " " '{print $1}'` + v=`grep " active.*$devname" /proc/mdstat | awk -F " " '{print $1}'` + [ "$platformtype" == "external" ] || c=$v +} + +# check if given device belongs to any container and subarray +# if $2 given then only container checked +chkarray(){ + local devname="$1" + local subcheck="$2" + getarray $devname + [ -n "$c" ] || err "$devname not in any container" + [ -n "$subcheck" ] || [ -n "$v" ] || err " $devname not in subarray" +} + +# test if two devices in the same container/subarray +# $1 $2 - devices +# $3 don't check subarrays, only containers +tst(){ + local device1=`basename $1` + local device2=`basename $2` + local subcheck="$3" + chkarray $device1 $subcheck + local x="$c" + local y="$v" + chkarray $device2 $subcheck + [ "$c" == "$x" ] || err "$device1 and $device2 not in the same container" + [ -n "$subcheck" ] || [ "$v" == "$y" ] || err "$device1 and $device2 not in the same subarray" +} + +# same as tst, just use numbers of devices instead of names as parameters +dtst(){ + local devno1="$1" + local devno2="$2" + local subcheck="$3" + eval tst \$dev$devno1 \$dev$devno2 $subcheck +} + +# create containers/subarrays, check if created properly, +# set global variables c$mddevno v$mddevno, usually c0=md127, v0=md126 , etc. +setupdevs(){ + local mddevno="$1" + local devno1="$2" + local devno2="$3" + local p="$4" + local subsize="$5" + local onearray="$6" + [ -n "$p" ] || p=$platform + ccv $mddevno $devno1 $devno2 $p $subsize $onearray + dtst $devno1 $devno2 + eval c$mddevno=\"$c\" + eval v$mddevno=\"$v\" +} + +# check if given spare in container +# usage: chkspare container spare [n] (n if spare shouldn't be in container) +chkspare(){ + local container=`basename $1` + local spare=$2 + local expected=$3 + getarray $spare + [ -n "$expected" ] || expected="y" + if [ "$expected" == "y" ]; then + [ "$c" == "$container" ] || err "$spare not in container $container" + else + [ "$c" != "$container" ] || err "$spare in container $container" + fi +} + +#check if spare was moved from one container to another +# args: from_container to_container spare [yn] +# n when spare should remain in original container +chksparemoved(){ + sleep $sleeptime + from_container="$1" + to_container="$2" + spare="$3" + expected="$4" + [ -n "$expected" ] || expected="y" + notexpected="n"; [ "$expected" == "y" ] || notexpected="y" + chkspare $from_container $spare $notexpected + [ $failed -eq 1 ] || chkspare $to_container $spare $expected +} + + +# for domains defined through policy +createconfig(){ +if [ "$1" != "a" ]; then +{ + domain=$1 + metadata=$2 + action=$3 + while [ -n "$4" ]; do + echo="policy domain=$domain" + [ "$metadata" == "noplatform" ] || echo="$echo metadata=$metadata" + echo="$echo path=loop$4" + echo="$echo action=$action" + echo "$echo" + shift + done +} >> $config +else +{ + echo "DEVICES $devlist /dev/md1*" + mdadm -Ebs +} > $config +fi +#[ "$verbose" != "yes" ] || cat $config | grep policy || true +} diff --git a/udev-md-raid-arrays.rules b/udev-md-raid-arrays.rules new file mode 100644 index 00000000..a32b6d2d --- /dev/null +++ b/udev-md-raid-arrays.rules @@ -0,0 +1,39 @@ +# do not edit this file, it will be overwritten on update + +SUBSYSTEM!="block", GOTO="md_end" + +# handle md arrays +ACTION!="add|change", GOTO="md_end" +KERNEL!="md*", GOTO="md_end" + +# partitions have no md/{array_state,metadata_version}, but should not +# for that reason be ignored. +ENV{DEVTYPE}=="partition", GOTO="md_ignore_state" + +# container devices have a metadata version of e.g. 'external:ddf' and +# never leave state 'inactive' +ATTR{md/metadata_version}=="external:[A-Za-z]*", ATTR{md/array_state}=="inactive", GOTO="md_ignore_state" +TEST!="md/array_state", ENV{SYSTEMD_READY}="0", GOTO="md_end" +ATTR{md/array_state}=="|clear|inactive", ENV{SYSTEMD_READY}="0", GOTO="md_end" +LABEL="md_ignore_state" + +IMPORT{program}="BINDIR/mdadm --detail --export $tempnode" +ENV{DEVTYPE}=="disk", ENV{MD_NAME}=="?*", SYMLINK+="disk/by-id/md-name-$env{MD_NAME}", OPTIONS+="string_escape=replace" +ENV{DEVTYPE}=="disk", ENV{MD_UUID}=="?*", SYMLINK+="disk/by-id/md-uuid-$env{MD_UUID}" +ENV{DEVTYPE}=="disk", ENV{MD_DEVNAME}=="?*", SYMLINK+="md/$env{MD_DEVNAME}" +ENV{DEVTYPE}=="partition", ENV{MD_NAME}=="?*", SYMLINK+="disk/by-id/md-name-$env{MD_NAME}-part%n", OPTIONS+="string_escape=replace" +ENV{DEVTYPE}=="partition", ENV{MD_UUID}=="?*", SYMLINK+="disk/by-id/md-uuid-$env{MD_UUID}-part%n" +ENV{DEVTYPE}=="partition", ENV{MD_DEVNAME}=="*[^0-9]", SYMLINK+="md/$env{MD_DEVNAME}%n" +ENV{DEVTYPE}=="partition", ENV{MD_DEVNAME}=="*[0-9]", SYMLINK+="md/$env{MD_DEVNAME}p%n" + +IMPORT{program}="/sbin/blkid -o udev -p -u noraid $tempnode" +ENV{ID_FS_USAGE}=="filesystem|other|crypto", ENV{ID_FS_UUID_ENC}=="?*", SYMLINK+="disk/by-uuid/$env{ID_FS_UUID_ENC}" +ENV{ID_FS_USAGE}=="filesystem|other", ENV{ID_FS_LABEL_ENC}=="?*", SYMLINK+="disk/by-label/$env{ID_FS_LABEL_ENC}" + +ENV{MD_LEVEL}=="raid[1-9]*", ENV{SYSTEMD_WANTS}+="mdmonitor.service" + +# Tell systemd to run mdmon for our container, if we need it. +ENV{MD_LEVEL}=="raid[1-9]*", ENV{MD_CONTAINER}=="?*", PROGRAM="/bin/readlink $env{MD_CONTAINER}", ENV{MD_MON_THIS}="%c" +ENV{MD_MON_THIS}=="?*", PROGRAM="/usr/bin/basename $env{MD_MON_THIS}", ENV{SYSTEMD_WANTS}+="mdmon@%c.service" + +LABEL="md_end" diff --git a/udev-md-raid-assembly.rules b/udev-md-raid-assembly.rules new file mode 100644 index 00000000..5bde607f --- /dev/null +++ b/udev-md-raid-assembly.rules @@ -0,0 +1,38 @@ +# do not edit this file, it will be overwritten on update + +# Don't process any events if anaconda is running as anaconda brings up +# raid devices manually +ENV{ANACONDA}=="?*", GOTO="md_inc_end" +# assemble md arrays + +SUBSYSTEM!="block", GOTO="md_inc_end" + +# handle potential components of arrays (the ones supported by md) +ENV{ID_FS_TYPE}=="linux_raid_member", GOTO="md_inc" + +# "noiswmd" on kernel command line stops mdadm from handling +# "isw" (aka IMSM - Intel RAID). +# "nodmraid" on kernel command line stops mdadm from handling +# "isw" or "ddf". +IMPORT{cmdline}="noiswmd" +IMPORT{cmdline}="nodmraid" + +ENV{nodmraid}=="?*", GOTO="md_inc_end" +ENV{ID_FS_TYPE}=="ddf_raid_member", GOTO="md_inc" +ENV{noiswmd}=="?*", GOTO="md_inc_end" +ENV{ID_FS_TYPE}=="isw_raid_member", GOTO="md_inc" +GOTO="md_inc_end" + +LABEL="md_inc" + +# Disable incremental assembly to fix Debian bug #784070 +GOTO="md_inc_end" + +# remember you can limit what gets auto/incrementally assembled by +# mdadm.conf(5)'s 'AUTO' and selectively whitelist using 'ARRAY' +ACTION=="add|change", IMPORT{program}="BINDIR/mdadm --incremental --export $tempnode --offroot ${DEVLINKS}" +ACTION=="add|change", ENV{MD_STARTED}=="*unsafe*", ENV{MD_FOREIGN}=="no", ENV{SYSTEMD_WANTS}+="mdadm-last-resort@$env{MD_DEVICE}.timer" +ACTION=="remove", ENV{ID_PATH}=="?*", RUN+="BINDIR/mdadm -If $name --path $env{ID_PATH}" +ACTION=="remove", ENV{ID_PATH}!="?*", RUN+="BINDIR/mdadm -If $name" + +LABEL="md_inc_end" @@ -0,0 +1,1978 @@ +/* + * mdadm - manage Linux "md" devices aka RAID arrays. + * + * Copyright (C) 2001-2013 Neil Brown <neilb@suse.de> + * + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Author: Neil Brown + * Email: <neilb@suse.de> + */ + +#include "mdadm.h" +#include "md_p.h" +#include <sys/socket.h> +#include <sys/utsname.h> +#include <sys/wait.h> +#include <sys/un.h> +#include <sys/resource.h> +#include <sys/vfs.h> +#include <linux/magic.h> +#include <ctype.h> +#include <dirent.h> +#include <signal.h> + +/* + * following taken from linux/blkpg.h because they aren't + * anywhere else and it isn't safe to #include linux/ * stuff. + */ + +#define BLKPG _IO(0x12,105) + +/* The argument structure */ +struct blkpg_ioctl_arg { + int op; + int flags; + int datalen; + void *data; +}; + +/* The subfunctions (for the op field) */ +#define BLKPG_ADD_PARTITION 1 +#define BLKPG_DEL_PARTITION 2 + +/* Sizes of name fields. Unused at present. */ +#define BLKPG_DEVNAMELTH 64 +#define BLKPG_VOLNAMELTH 64 + +/* The data structure for ADD_PARTITION and DEL_PARTITION */ +struct blkpg_partition { + long long start; /* starting offset in bytes */ + long long length; /* length in bytes */ + int pno; /* partition number */ + char devname[BLKPG_DEVNAMELTH]; /* partition name, like sda5 or c0d1p2, + to be used in kernel messages */ + char volname[BLKPG_VOLNAMELTH]; /* volume label */ +}; + +#include "part.h" + +/* Force a compilation error if condition is true */ +#define BUILD_BUG_ON(condition) ((void)BUILD_BUG_ON_ZERO(condition)) + +/* Force a compilation error if condition is true, but also produce a + result (of value 0 and type size_t), so the expression can be used + e.g. in a structure initializer (or where-ever else comma expressions + aren't permitted). */ +#define BUILD_BUG_ON_ZERO(e) (sizeof(struct { int:-!!(e); })) + +/* + * Parse a 128 bit uuid in 4 integers + * format is 32 hexx nibbles with options :.<space> separator + * If not exactly 32 hex digits are found, return 0 + * else return 1 + */ +int parse_uuid(char *str, int uuid[4]) +{ + int hit = 0; /* number of Hex digIT */ + int i; + char c; + for (i = 0; i < 4; i++) + uuid[i] = 0; + + while ((c = *str++) != 0) { + int n; + if (c >= '0' && c <= '9') + n = c-'0'; + else if (c >= 'a' && c <= 'f') + n = 10 + c - 'a'; + else if (c >= 'A' && c <= 'F') + n = 10 + c - 'A'; + else if (strchr(":. -", c)) + continue; + else return 0; + + if (hit<32) { + uuid[hit/8] <<= 4; + uuid[hit/8] += n; + } + hit++; + } + if (hit == 32) + return 1; + return 0; +} + +/* + * Get the md version number. + * We use the RAID_VERSION ioctl if it is supported + * If not, but we have a block device with major '9', we assume + * 0.36.0 + * + * Return version number as 24 but number - assume version parts + * always < 255 + */ + +int md_get_version(int fd) +{ + struct stat stb; + mdu_version_t vers; + + if (fstat(fd, &stb)<0) + return -1; + if ((S_IFMT&stb.st_mode) != S_IFBLK) + return -1; + + if (ioctl(fd, RAID_VERSION, &vers) == 0) + return (vers.major*10000) + (vers.minor*100) + vers.patchlevel; + if (errno == EACCES) + return -1; + if (major(stb.st_rdev) == MD_MAJOR) + return (3600); + return -1; +} + +int get_linux_version() +{ + struct utsname name; + char *cp; + int a = 0, b = 0,c = 0; + if (uname(&name) <0) + return -1; + + cp = name.release; + a = strtoul(cp, &cp, 10); + if (*cp == '.') + b = strtoul(cp+1, &cp, 10); + if (*cp == '.') + c = strtoul(cp+1, &cp, 10); + + return (a*1000000)+(b*1000)+c; +} + +#ifndef MDASSEMBLE +int mdadm_version(char *version) +{ + int a, b, c; + char *cp; + + if (!version) + version = Version; + + cp = strchr(version, '-'); + if (!cp || *(cp+1) != ' ' || *(cp+2) != 'v') + return -1; + cp += 3; + a = strtoul(cp, &cp, 10); + if (*cp != '.') + return -1; + b = strtoul(cp+1, &cp, 10); + if (*cp == '.') + c = strtoul(cp+1, &cp, 10); + else + c = 0; + if (*cp != ' ' && *cp != '-') + return -1; + return (a*1000000)+(b*1000)+c; +} + +unsigned long long parse_size(char *size) +{ + /* parse 'size' which should be a number optionally + * followed by 'K', 'M', or 'G'. + * Without a suffix, K is assumed. + * Number returned is in sectors (half-K) + * INVALID_SECTORS returned on error. + */ + char *c; + long long s = strtoll(size, &c, 10); + if (s > 0) { + switch (*c) { + case 'K': + c++; + default: + s *= 2; + break; + case 'M': + c++; + s *= 1024 * 2; + break; + case 'G': + c++; + s *= 1024 * 1024 * 2; + break; + case 's': /* sectors */ + c++; + break; + } + } else + s = INVALID_SECTORS; + if (*c) + s = INVALID_SECTORS; + return s; +} + +int parse_layout_10(char *layout) +{ + int copies, rv; + char *cp; + /* Parse the layout string for raid10 */ + /* 'f', 'o' or 'n' followed by a number <= raid_disks */ + if ((layout[0] != 'n' && layout[0] != 'f' && layout[0] != 'o') || + (copies = strtoul(layout+1, &cp, 10)) < 1 || + copies > 200 || + *cp) + return -1; + if (layout[0] == 'n') + rv = 256 + copies; + else if (layout[0] == 'o') + rv = 0x10000 + (copies<<8) + 1; + else + rv = 1 + (copies<<8); + return rv; +} + +int parse_layout_faulty(char *layout) +{ + /* Parse the layout string for 'faulty' */ + int ln = strcspn(layout, "0123456789"); + char *m = xstrdup(layout); + int mode; + m[ln] = 0; + mode = map_name(faultylayout, m); + if (mode == UnSet) + return -1; + + return mode | (atoi(layout+ln)<< ModeShift); +} + +long parse_num(char *num) +{ + /* Either return a valid number, or -1 */ + char *c; + long rv = strtol(num, &c, 10); + if (rv < 0 || *c || !num[0]) + return -1; + else + return rv; +} +#endif + +void remove_partitions(int fd) +{ + /* remove partitions from this block devices. + * This is used for components added to an array + */ +#ifdef BLKPG_DEL_PARTITION + struct blkpg_ioctl_arg a; + struct blkpg_partition p; + + a.op = BLKPG_DEL_PARTITION; + a.data = (void*)&p; + a.datalen = sizeof(p); + a.flags = 0; + memset(a.data, 0, a.datalen); + for (p.pno = 0; p.pno < 16; p.pno++) + ioctl(fd, BLKPG, &a); +#endif +} + +int test_partition(int fd) +{ + /* Check if fd is a whole-disk or a partition. + * BLKPG will return EINVAL on a partition, and BLKPG_DEL_PARTITION + * will return ENXIO on an invalid partition number. + */ + struct blkpg_ioctl_arg a; + struct blkpg_partition p; + a.op = BLKPG_DEL_PARTITION; + a.data = (void*)&p; + a.datalen = sizeof(p); + a.flags = 0; + memset(a.data, 0, a.datalen); + p.pno = 1<<30; + if (ioctl(fd, BLKPG, &a) == 0) + /* Very unlikely, but not a partition */ + return 0; + if (errno == ENXIO || errno == ENOTTY) + /* not a partition */ + return 0; + + return 1; +} + +int test_partition_from_id(dev_t id) +{ + char buf[20]; + int fd, rv; + + sprintf(buf, "%d:%d", major(id), minor(id)); + fd = dev_open(buf, O_RDONLY); + if (fd < 0) + return -1; + rv = test_partition(fd); + close(fd); + return rv; +} + +int enough(int level, int raid_disks, int layout, int clean, char *avail) +{ + int copies, first; + int i; + int avail_disks = 0; + + for (i = 0; i < raid_disks; i++) + avail_disks += !!avail[i]; + + switch (level) { + case 10: + /* This is the tricky one - we need to check + * which actual disks are present. + */ + copies = (layout&255)* ((layout>>8) & 255); + first = 0; + do { + /* there must be one of the 'copies' form 'first' */ + int n = copies; + int cnt = 0; + int this = first; + while (n--) { + if (avail[this]) + cnt++; + this = (this+1) % raid_disks; + } + if (cnt == 0) + return 0; + first = (first+(layout&255)) % raid_disks; + } while (first != 0); + return 1; + + case LEVEL_MULTIPATH: + return avail_disks>= 1; + case LEVEL_LINEAR: + case 0: + return avail_disks == raid_disks; + case 1: + return avail_disks >= 1; + case 4: + if (avail_disks == raid_disks - 1 && + !avail[raid_disks - 1]) + /* If just the parity device is missing, then we + * have enough, even if not clean + */ + return 1; + /* FALL THROUGH */ + case 5: + if (clean) + return avail_disks >= raid_disks-1; + else + return avail_disks >= raid_disks; + case 6: + if (clean) + return avail_disks >= raid_disks-2; + else + return avail_disks >= raid_disks; + default: + return 0; + } +} + +int enough_fd(int fd) +{ + struct mdu_array_info_s array; + struct mdu_disk_info_s disk; + int i, rv; + char *avail; + + if (ioctl(fd, GET_ARRAY_INFO, &array) != 0 || + array.raid_disks <= 0) + return 0; + avail = xcalloc(array.raid_disks, 1); + for (i = 0; i < MAX_DISKS && array.nr_disks > 0; i++) { + disk.number = i; + if (ioctl(fd, GET_DISK_INFO, &disk) != 0) + continue; + if (disk.major == 0 && disk.minor == 0) + continue; + array.nr_disks--; + + if (! (disk.state & (1<<MD_DISK_SYNC))) + continue; + if (disk.raid_disk < 0 || disk.raid_disk >= array.raid_disks) + continue; + avail[disk.raid_disk] = 1; + } + /* This is used on an active array, so assume it is clean */ + rv = enough(array.level, array.raid_disks, array.layout, + 1, avail); + free(avail); + return rv; +} + +const int uuid_zero[4] = { 0, 0, 0, 0 }; + +int same_uuid(int a[4], int b[4], int swapuuid) +{ + if (swapuuid) { + /* parse uuids are hostendian. + * uuid's from some superblocks are big-ending + * if there is a difference, we need to swap.. + */ + unsigned char *ac = (unsigned char *)a; + unsigned char *bc = (unsigned char *)b; + int i; + for (i = 0; i < 16; i += 4) { + if (ac[i+0] != bc[i+3] || + ac[i+1] != bc[i+2] || + ac[i+2] != bc[i+1] || + ac[i+3] != bc[i+0]) + return 0; + } + return 1; + } else { + if (a[0]==b[0] && + a[1]==b[1] && + a[2]==b[2] && + a[3]==b[3]) + return 1; + return 0; + } +} + +void copy_uuid(void *a, int b[4], int swapuuid) +{ + if (swapuuid) { + /* parse uuids are hostendian. + * uuid's from some superblocks are big-ending + * if there is a difference, we need to swap.. + */ + unsigned char *ac = (unsigned char *)a; + unsigned char *bc = (unsigned char *)b; + int i; + for (i = 0; i < 16; i += 4) { + ac[i+0] = bc[i+3]; + ac[i+1] = bc[i+2]; + ac[i+2] = bc[i+1]; + ac[i+3] = bc[i+0]; + } + } else + memcpy(a, b, 16); +} + +char *__fname_from_uuid(int id[4], int swap, char *buf, char sep) +{ + int i, j; + char uuid[16]; + char *c = buf; + strcpy(c, "UUID-"); + c += strlen(c); + copy_uuid(uuid, id, swap); + for (i = 0; i < 4; i++) { + if (i) + *c++ = sep; + for (j = 3; j >= 0; j--) { + sprintf(c,"%02x", (unsigned char) uuid[j+4*i]); + c+= 2; + } + } + return buf; + +} + +char *fname_from_uuid(struct supertype *st, struct mdinfo *info, char *buf, char sep) +{ + // dirty hack to work around an issue with super1 superblocks... + // super1 superblocks need swapuuid set in order for assembly to + // work, but can't have it set if we want this printout to match + // all the other uuid printouts in super1.c, so we force swapuuid + // to 1 to make our printout match the rest of super1 + return __fname_from_uuid(info->uuid, (st->ss == &super1) ? 1 : st->ss->swapuuid, buf, sep); +} + +#ifndef MDASSEMBLE +int check_ext2(int fd, char *name) +{ + /* + * Check for an ext2fs file system. + * Superblock is always 1K at 1K offset + * + * s_magic is le16 at 56 == 0xEF53 + * report mtime - le32 at 44 + * blocks - le32 at 4 + * logblksize - le32 at 24 + */ + unsigned char sb[1024]; + time_t mtime; + unsigned long long size; + int bsize; + if (lseek(fd, 1024,0)!= 1024) + return 0; + if (read(fd, sb, 1024)!= 1024) + return 0; + if (sb[56] != 0x53 || sb[57] != 0xef) + return 0; + + mtime = sb[44]|(sb[45]|(sb[46]|sb[47]<<8)<<8)<<8; + bsize = sb[24]|(sb[25]|(sb[26]|sb[27]<<8)<<8)<<8; + size = sb[4]|(sb[5]|(sb[6]|sb[7]<<8)<<8)<<8; + size <<= bsize; + pr_err("%s appears to contain an ext2fs file system\n", + name); + cont_err("size=%lluK mtime=%s", size, ctime(&mtime)); + return 1; +} + +int check_reiser(int fd, char *name) +{ + /* + * superblock is at 64K + * size is 1024; + * Magic string "ReIsErFs" or "ReIsEr2Fs" at 52 + * + */ + unsigned char sb[1024]; + unsigned long long size; + if (lseek(fd, 64*1024, 0) != 64*1024) + return 0; + if (read(fd, sb, 1024) != 1024) + return 0; + if (strncmp((char*)sb+52, "ReIsErFs",8) != 0 && + strncmp((char*)sb+52, "ReIsEr2Fs",9) != 0) + return 0; + pr_err("%s appears to contain a reiserfs file system\n",name); + size = sb[0]|(sb[1]|(sb[2]|sb[3]<<8)<<8)<<8; + cont_err("size = %lluK\n", size*4); + + return 1; +} + +int check_raid(int fd, char *name) +{ + struct mdinfo info; + time_t crtime; + char *level; + struct supertype *st = guess_super(fd); + + if (!st) + return 0; + st->ss->load_super(st, fd, name); + /* Looks like a raid array .. */ + pr_err("%s appears to be part of a raid array:\n", + name); + st->ss->getinfo_super(st, &info, NULL); + st->ss->free_super(st); + crtime = info.array.ctime; + level = map_num(pers, info.array.level); + if (!level) level = "-unknown-"; + cont_err("level=%s devices=%d ctime=%s", + level, info.array.raid_disks, ctime(&crtime)); + return 1; +} + +int ask(char *mesg) +{ + char *add = ""; + int i; + for (i = 0; i < 5; i++) { + char buf[100]; + fprintf(stderr, "%s%s", mesg, add); + fflush(stderr); + if (fgets(buf, 100, stdin)==NULL) + return 0; + if (buf[0]=='y' || buf[0]=='Y') + return 1; + if (buf[0]=='n' || buf[0]=='N') + return 0; + add = "(y/n) "; + } + pr_err("assuming 'no'\n"); + return 0; +} +#endif /* MDASSEMBLE */ + +int is_standard(char *dev, int *nump) +{ + /* tests if dev is a "standard" md dev name. + * i.e if the last component is "/dNN" or "/mdNN", + * where NN is a string of digits + * Returns 1 if a partitionable standard, + * -1 if non-partitonable, + * 0 if not a standard name. + */ + char *d = strrchr(dev, '/'); + int type = 0; + int num; + if (!d) + return 0; + if (strncmp(d, "/d",2) == 0) + d += 2, type = 1; /* /dev/md/dN{pM} */ + else if (strncmp(d, "/md_d", 5) == 0) + d += 5, type = 1; /* /dev/md_dN{pM} */ + else if (strncmp(d, "/md", 3) == 0) + d += 3, type = -1; /* /dev/mdN */ + else if (d-dev > 3 && strncmp(d-2, "md/", 3) == 0) + d += 1, type = -1; /* /dev/md/N */ + else + return 0; + if (!*d) + return 0; + num = atoi(d); + while (isdigit(*d)) + d++; + if (*d) + return 0; + if (nump) *nump = num; + + return type; +} + +unsigned long calc_csum(void *super, int bytes) +{ + unsigned long long newcsum = 0; + int i; + unsigned int csum; + unsigned int *superc = (unsigned int*) super; + + for(i = 0; i < bytes/4; i++) + newcsum += superc[i]; + csum = (newcsum& 0xffffffff) + (newcsum>>32); +#ifdef __alpha__ +/* The in-kernel checksum calculation is always 16bit on + * the alpha, though it is 32 bit on i386... + * I wonder what it is elsewhere... (it uses an API in + * a way that it shouldn't). + */ + csum = (csum & 0xffff) + (csum >> 16); + csum = (csum & 0xffff) + (csum >> 16); +#endif + return csum; +} + +#ifndef MDASSEMBLE +char *human_size(long long bytes) +{ + static char buf[30]; + + /* We convert bytes to either centi-M{ega,ibi}bytes or + * centi-G{igi,ibi}bytes, with appropriate rounding, + * and then print 1/100th of those as a decimal. + * We allow upto 2048Megabytes before converting to + * gigabytes, as that shows more precision and isn't + * too large a number. + * Terabytes are not yet handled. + */ + + if (bytes < 5000*1024) + buf[0] = 0; + else if (bytes < 2*1024LL*1024LL*1024LL) { + long cMiB = (bytes * 200LL / (1LL<<20) + 1) / 2; + long cMB = (bytes / ( 1000000LL / 200LL ) +1) /2; + snprintf(buf, sizeof(buf), " (%ld.%02ld MiB %ld.%02ld MB)", + cMiB/100 , cMiB % 100, + cMB/100, cMB % 100); + } else { + long cGiB = (bytes * 200LL / (1LL<<30) +1) / 2; + long cGB = (bytes / (1000000000LL/200LL ) +1) /2; + snprintf(buf, sizeof(buf), " (%ld.%02ld GiB %ld.%02ld GB)", + cGiB/100 , cGiB % 100, + cGB/100, cGB % 100); + } + return buf; +} + +char *human_size_brief(long long bytes, int prefix) +{ + static char buf[30]; + + /* We convert bytes to either centi-M{ega,ibi}bytes or + * centi-G{igi,ibi}bytes, with appropriate rounding, + * and then print 1/100th of those as a decimal. + * We allow upto 2048Megabytes before converting to + * gigabytes, as that shows more precision and isn't + * too large a number. + * Terabytes are not yet handled. + * + * If prefix == IEC, we mean prefixes like kibi,mebi,gibi etc. + * If prefix == JEDEC, we mean prefixes like kilo,mega,giga etc. + */ + + if (bytes < 5000*1024) + buf[0] = 0; + else if (prefix == IEC) { + if (bytes < 2*1024LL*1024LL*1024LL) { + long cMiB = (bytes * 200LL / (1LL<<20) +1) /2; + snprintf(buf, sizeof(buf), "%ld.%02ldMiB", + cMiB/100 , cMiB % 100); + } else { + long cGiB = (bytes * 200LL / (1LL<<30) +1) /2; + snprintf(buf, sizeof(buf), "%ld.%02ldGiB", + cGiB/100 , cGiB % 100); + } + } + else if (prefix == JEDEC) { + if (bytes < 2*1024LL*1024LL*1024LL) { + long cMB = (bytes / ( 1000000LL / 200LL ) +1) /2; + snprintf(buf, sizeof(buf), "%ld.%02ldMB", + cMB/100, cMB % 100); + } else { + long cGB = (bytes / (1000000000LL/200LL ) +1) /2; + snprintf(buf, sizeof(buf), "%ld.%02ldGB", + cGB/100 , cGB % 100); + } + } + else + buf[0] = 0; + + return buf; +} + +void print_r10_layout(int layout) +{ + int near = layout & 255; + int far = (layout >> 8) & 255; + int offset = (layout&0x10000); + char *sep = ""; + + if (near != 1) { + printf("%s near=%d", sep, near); + sep = ","; + } + if (far != 1) + printf("%s %s=%d", sep, offset?"offset":"far", far); + if (near*far == 1) + printf("NO REDUNDANCY"); +} +#endif + +unsigned long long calc_array_size(int level, int raid_disks, int layout, + int chunksize, unsigned long long devsize) +{ + if (level == 1) + return devsize; + devsize &= ~(unsigned long long)((chunksize>>9)-1); + return get_data_disks(level, layout, raid_disks) * devsize; +} + +int get_data_disks(int level, int layout, int raid_disks) +{ + int data_disks = 0; + switch (level) { + case 0: data_disks = raid_disks; + break; + case 1: data_disks = 1; + break; + case 4: + case 5: data_disks = raid_disks - 1; + break; + case 6: data_disks = raid_disks - 2; + break; + case 10: data_disks = raid_disks / (layout & 255) / ((layout>>8)&255); + break; + } + + return data_disks; +} + +int devnm2devid(char *devnm) +{ + /* First look in /sys/block/$DEVNM/dev for %d:%d + * If that fails, try parsing out a number + */ + char path[100]; + char *ep; + int fd; + int mjr,mnr; + + sprintf(path, "/sys/block/%s/dev", devnm); + fd = open(path, O_RDONLY); + if (fd >= 0) { + char buf[20]; + int n = read(fd, buf, sizeof(buf)); + close(fd); + if (n > 0) + buf[n] = 0; + if (n > 0 && sscanf(buf, "%d:%d\n", &mjr, &mnr) == 2) + return makedev(mjr, mnr); + } + if (strncmp(devnm, "md_d", 4) == 0 && + isdigit(devnm[4]) && + (mnr = strtoul(devnm+4, &ep, 10)) >= 0 && + ep > devnm && *ep == 0) + return makedev(get_mdp_major(), mnr << MdpMinorShift); + + if (strncmp(devnm, "md", 2) == 0 && + isdigit(devnm[2]) && + (mnr = strtoul(devnm+2, &ep, 10)) >= 0 && + ep > devnm && *ep == 0) + return makedev(MD_MAJOR, mnr); + + return 0; +} + +#if !defined(MDASSEMBLE) || defined(MDASSEMBLE) && defined(MDASSEMBLE_AUTO) +char *get_md_name(char *devnm) +{ + /* find /dev/md%d or /dev/md/%d or make a device /dev/.tmp.md%d */ + /* if dev < 0, want /dev/md/d%d or find mdp in /proc/devices ... */ + + static char devname[50]; + struct stat stb; + dev_t rdev = devnm2devid(devnm); + char *dn; + + if (rdev == 0) + return 0; + if (strncmp(devnm, "md_", 3) == 0) { + snprintf(devname, sizeof(devname), "/dev/md/%s", + devnm + 3); + if (stat(devname, &stb) == 0 + && (S_IFMT&stb.st_mode) == S_IFBLK + && (stb.st_rdev == rdev)) + return devname; + } + snprintf(devname, sizeof(devname), "/dev/%s", devnm); + if (stat(devname, &stb) == 0 + && (S_IFMT&stb.st_mode) == S_IFBLK + && (stb.st_rdev == rdev)) + return devname; + + snprintf(devname, sizeof(devname), "/dev/md/%s", devnm+2); + if (stat(devname, &stb) == 0 + && (S_IFMT&stb.st_mode) == S_IFBLK + && (stb.st_rdev == rdev)) + return devname; + + dn = map_dev(major(rdev), minor(rdev), 0); + if (dn) + return dn; + snprintf(devname, sizeof(devname), "/dev/.tmp.%s", devnm); + if (mknod(devname, S_IFBLK | 0600, rdev) == -1) + if (errno != EEXIST) + return NULL; + + if (stat(devname, &stb) == 0 + && (S_IFMT&stb.st_mode) == S_IFBLK + && (stb.st_rdev == rdev)) + return devname; + unlink(devname); + return NULL; +} + +void put_md_name(char *name) +{ + if (strncmp(name, "/dev/.tmp.md", 12) == 0) + unlink(name); +} +#endif /* !defined(MDASSEMBLE) || defined(MDASSEMBLE) && defined(MDASSEMBLE_AUTO) */ + +int get_maj_min(char *dev, int *major, int *minor) +{ + char *e; + *major = strtoul(dev, &e, 0); + return (e > dev && *e == ':' && e[1] && + (*minor = strtoul(e+1, &e, 0)) >= 0 && + *e == 0); +} + +int dev_open(char *dev, int flags) +{ + /* like 'open', but if 'dev' matches %d:%d, create a temp + * block device and open that + */ + int fd = -1; + char devname[32]; + int major; + int minor; + + if (!dev) return -1; + flags |= O_DIRECT; + + if (get_maj_min(dev, &major, &minor)) { + snprintf(devname, sizeof(devname), "/dev/.tmp.md.%d:%d:%d", + (int)getpid(), major, minor); + if (mknod(devname, S_IFBLK|0600, makedev(major, minor)) == 0) { + fd = open(devname, flags); + unlink(devname); + } + if (fd < 0) { + /* Try /tmp as /dev appear to be read-only */ + snprintf(devname, sizeof(devname), "/tmp/.tmp.md.%d:%d:%d", + (int)getpid(), major, minor); + if (mknod(devname, S_IFBLK|0600, makedev(major, minor)) == 0) { + fd = open(devname, flags); + unlink(devname); + } + } + } else + fd = open(dev, flags); + return fd; +} + +int open_dev_flags(char *devnm, int flags) +{ + int devid; + char buf[20]; + + devid = devnm2devid(devnm); + sprintf(buf, "%d:%d", major(devid), minor(devid)); + return dev_open(buf, flags); +} + +int open_dev(char *devnm) +{ + return open_dev_flags(devnm, O_RDONLY); +} + +int open_dev_excl(char *devnm) +{ + char buf[20]; + int i; + int flags = O_RDWR; + int devid = devnm2devid(devnm); + long delay = 1000; + + sprintf(buf, "%d:%d", major(devid), minor(devid)); + for (i = 0 ; i < 25 ; i++) { + int fd = dev_open(buf, flags|O_EXCL); + if (fd >= 0) + return fd; + if (errno == EACCES && flags == O_RDWR) { + flags = O_RDONLY; + continue; + } + if (errno != EBUSY) + return fd; + usleep(delay); + if (delay < 200000) + delay *= 2; + } + return -1; +} + +int same_dev(char *one, char *two) +{ + struct stat st1, st2; + if (stat(one, &st1) != 0) + return 0; + if (stat(two, &st2) != 0) + return 0; + if ((st1.st_mode & S_IFMT) != S_IFBLK) + return 0; + if ((st2.st_mode & S_IFMT) != S_IFBLK) + return 0; + return st1.st_rdev == st2.st_rdev; +} + +void wait_for(char *dev, int fd) +{ + int i; + struct stat stb_want; + long delay = 1000; + + if (fstat(fd, &stb_want) != 0 || + (stb_want.st_mode & S_IFMT) != S_IFBLK) + return; + + for (i = 0 ; i < 25 ; i++) { + struct stat stb; + if (stat(dev, &stb) == 0 && + (stb.st_mode & S_IFMT) == S_IFBLK && + (stb.st_rdev == stb_want.st_rdev)) + return; + usleep(delay); + if (delay < 200000) + delay *= 2; + } + if (i == 25) + dprintf("timeout waiting for %s\n", dev); +} + +struct superswitch *superlist[] = +{ + &super0, &super1, + &super_ddf, &super_imsm, + &mbr, &gpt, + NULL }; + +#if !defined(MDASSEMBLE) || defined(MDASSEMBLE) && defined(MDASSEMBLE_AUTO) + +struct supertype *super_by_fd(int fd, char **subarrayp) +{ + mdu_array_info_t array; + int vers; + int minor; + struct supertype *st = NULL; + struct mdinfo *sra; + char *verstr; + char version[20]; + int i; + char *subarray = NULL; + char container[32] = ""; + + sra = sysfs_read(fd, NULL, GET_VERSION); + + if (sra) { + vers = sra->array.major_version; + minor = sra->array.minor_version; + verstr = sra->text_version; + } else { + if (ioctl(fd, GET_ARRAY_INFO, &array)) + array.major_version = array.minor_version = 0; + vers = array.major_version; + minor = array.minor_version; + verstr = ""; + } + + if (vers != -1) { + sprintf(version, "%d.%d", vers, minor); + verstr = version; + } + if (minor == -2 && is_subarray(verstr)) { + char *dev = verstr+1; + + subarray = strchr(dev, '/'); + if (subarray) { + *subarray++ = '\0'; + subarray = xstrdup(subarray); + } + strcpy(container, dev); + if (sra) + sysfs_free(sra); + sra = sysfs_read(-1, container, GET_VERSION); + if (sra && sra->text_version[0]) + verstr = sra->text_version; + else + verstr = "-no-metadata-"; + } + + for (i = 0; st == NULL && superlist[i] ; i++) + st = superlist[i]->match_metadata_desc(verstr); + + if (sra) + sysfs_free(sra); + if (st) { + st->sb = NULL; + if (subarrayp) + *subarrayp = subarray; + strcpy(st->container_devnm, container); + strcpy(st->devnm, fd2devnm(fd)); + } else + free(subarray); + + return st; +} +#endif /* !defined(MDASSEMBLE) || defined(MDASSEMBLE) && defined(MDASSEMBLE_AUTO) */ + +int dev_size_from_id(dev_t id, unsigned long long *size) +{ + char buf[20]; + int fd; + + sprintf(buf, "%d:%d", major(id), minor(id)); + fd = dev_open(buf, O_RDONLY); + if (fd < 0) + return 0; + if (get_dev_size(fd, NULL, size)) { + close(fd); + return 1; + } + close(fd); + return 0; +} + +struct supertype *dup_super(struct supertype *orig) +{ + struct supertype *st; + + if (!orig) + return orig; + st = xcalloc(1, sizeof(*st)); + st->ss = orig->ss; + st->max_devs = orig->max_devs; + st->minor_version = orig->minor_version; + st->ignore_hw_compat = orig->ignore_hw_compat; + st->data_offset = orig->data_offset; + st->sb = NULL; + st->info = NULL; + return st; +} + +struct supertype *guess_super_type(int fd, enum guess_types guess_type) +{ + /* try each load_super to find the best match, + * and return the best superswitch + */ + struct superswitch *ss; + struct supertype *st; + time_t besttime = 0; + int bestsuper = -1; + int i; + + st = xcalloc(1, sizeof(*st)); + st->container_devnm[0] = 0; + + for (i = 0 ; superlist[i]; i++) { + int rv; + ss = superlist[i]; + if (guess_type == guess_array && ss->add_to_super == NULL) + continue; + if (guess_type == guess_partitions && ss->add_to_super != NULL) + continue; + memset(st, 0, sizeof(*st)); + st->ignore_hw_compat = 1; + rv = ss->load_super(st, fd, NULL); + if (rv == 0) { + struct mdinfo info; + st->ss->getinfo_super(st, &info, NULL); + if (bestsuper == -1 || + besttime < info.array.ctime) { + bestsuper = i; + besttime = info.array.ctime; + } + ss->free_super(st); + } + } + if (bestsuper != -1) { + int rv; + memset(st, 0, sizeof(*st)); + st->ignore_hw_compat = 1; + rv = superlist[bestsuper]->load_super(st, fd, NULL); + if (rv == 0) { + superlist[bestsuper]->free_super(st); + return st; + } + } + free(st); + return NULL; +} + +/* Return size of device in bytes */ +int get_dev_size(int fd, char *dname, unsigned long long *sizep) +{ + unsigned long long ldsize; + struct stat st; + + if (fstat(fd, &st) != -1 && S_ISREG(st.st_mode)) + ldsize = (unsigned long long)st.st_size; + else +#ifdef BLKGETSIZE64 + if (ioctl(fd, BLKGETSIZE64, &ldsize) != 0) +#endif + { + unsigned long dsize; + if (ioctl(fd, BLKGETSIZE, &dsize) == 0) { + ldsize = dsize; + ldsize <<= 9; + } else { + if (dname) + pr_err("Cannot get size of %s: %s\b", + dname, strerror(errno)); + return 0; + } + } + *sizep = ldsize; + return 1; +} + +/* Return true if this can only be a container, not a member device. + * i.e. is and md device and size is zero + */ +int must_be_container(int fd) +{ + unsigned long long size; + if (md_get_version(fd) < 0) + return 0; + if (get_dev_size(fd, NULL, &size) == 0) + return 1; + if (size == 0) + return 1; + return 0; +} + +/* Sets endofpart parameter to the last block used by the last GPT partition on the device. + * Returns: 1 if successful + * -1 for unknown partition type + * 0 for other errors + */ +static int get_gpt_last_partition_end(int fd, unsigned long long *endofpart) +{ + struct GPT gpt; + unsigned char empty_gpt_entry[16]= {0}; + struct GPT_part_entry *part; + char buf[512]; + unsigned long long curr_part_end; + unsigned all_partitions, entry_size; + unsigned part_nr; + + *endofpart = 0; + + BUILD_BUG_ON(sizeof(gpt) != 512); + /* skip protective MBR */ + lseek(fd, 512, SEEK_SET); + /* read GPT header */ + if (read(fd, &gpt, 512) != 512) + return 0; + + /* get the number of partition entries and the entry size */ + all_partitions = __le32_to_cpu(gpt.part_cnt); + entry_size = __le32_to_cpu(gpt.part_size); + + /* Check GPT signature*/ + if (gpt.magic != GPT_SIGNATURE_MAGIC) + return -1; + + /* sanity checks */ + if (all_partitions > 1024 || + entry_size > sizeof(buf)) + return -1; + + part = (struct GPT_part_entry *)buf; + + for (part_nr = 0; part_nr < all_partitions; part_nr++) { + /* read partition entry */ + if (read(fd, buf, entry_size) != (ssize_t)entry_size) + return 0; + + /* is this valid partition? */ + if (memcmp(part->type_guid, empty_gpt_entry, 16) != 0) { + /* check the last lba for the current partition */ + curr_part_end = __le64_to_cpu(part->ending_lba); + if (curr_part_end > *endofpart) + *endofpart = curr_part_end; + } + + } + return 1; +} + +/* Sets endofpart parameter to the last block used by the last partition on the device. + * Returns: 1 if successful + * -1 for unknown partition type + * 0 for other errors + */ +static int get_last_partition_end(int fd, unsigned long long *endofpart) +{ + struct MBR boot_sect; + struct MBR_part_record *part; + unsigned long long curr_part_end; + unsigned part_nr; + int retval = 0; + + *endofpart = 0; + + BUILD_BUG_ON(sizeof(boot_sect) != 512); + /* read MBR */ + lseek(fd, 0, 0); + if (read(fd, &boot_sect, 512) != 512) + goto abort; + + /* check MBP signature */ + if (boot_sect.magic == MBR_SIGNATURE_MAGIC) { + retval = 1; + /* found the correct signature */ + part = boot_sect.parts; + + for (part_nr = 0; part_nr < MBR_PARTITIONS; part_nr++) { + /* check for GPT type */ + if (part->part_type == MBR_GPT_PARTITION_TYPE) { + retval = get_gpt_last_partition_end(fd, endofpart); + break; + } + /* check the last used lba for the current partition */ + curr_part_end = __le32_to_cpu(part->first_sect_lba) + + __le32_to_cpu(part->blocks_num); + if (curr_part_end > *endofpart) + *endofpart = curr_part_end; + + part++; + } + } else { + /* Unknown partition table */ + retval = -1; + } + abort: + return retval; +} + +int check_partitions(int fd, char *dname, unsigned long long freesize, + unsigned long long size) +{ + /* + * Check where the last partition ends + */ + unsigned long long endofpart; + int ret; + + if ((ret = get_last_partition_end(fd, &endofpart)) > 0) { + /* There appears to be a partition table here */ + if (freesize == 0) { + /* partitions will not be visible in new device */ + pr_err("partition table exists on %s but will be lost or\n" + " meaningless after creating array\n", + dname); + return 1; + } else if (endofpart > freesize) { + /* last partition overlaps metadata */ + pr_err("metadata will over-write last partition on %s.\n", + dname); + return 1; + } else if (size && endofpart > size) { + /* partitions will be truncated in new device */ + pr_err("array size is too small to cover all partitions on %s.\n", + dname); + return 1; + } + } + return 0; +} + +int open_container(int fd) +{ + /* 'fd' is a block device. Find out if it is in use + * by a container, and return an open fd on that container. + */ + char path[256]; + char *e; + DIR *dir; + struct dirent *de; + int dfd, n; + char buf[200]; + int major, minor; + struct stat st; + + if (fstat(fd, &st) != 0) + return -1; + sprintf(path, "/sys/dev/block/%d:%d/holders", + (int)major(st.st_rdev), (int)minor(st.st_rdev)); + e = path + strlen(path); + + dir = opendir(path); + if (!dir) + return -1; + while ((de = readdir(dir))) { + if (de->d_ino == 0) + continue; + if (de->d_name[0] == '.') + continue; + /* Need to make sure it is a container and not a volume */ + sprintf(e, "/%s/md/metadata_version", de->d_name); + dfd = open(path, O_RDONLY); + if (dfd < 0) + continue; + n = read(dfd, buf, sizeof(buf)); + close(dfd); + if (n <= 0 || (unsigned)n >= sizeof(buf)) + continue; + buf[n] = 0; + if (strncmp(buf, "external", 8) != 0 || + n < 10 || + buf[9] == '/') + continue; + sprintf(e, "/%s/dev", de->d_name); + dfd = open(path, O_RDONLY); + if (dfd < 0) + continue; + n = read(dfd, buf, sizeof(buf)); + close(dfd); + if (n <= 0 || (unsigned)n >= sizeof(buf)) + continue; + buf[n] = 0; + if (sscanf(buf, "%d:%d", &major, &minor) != 2) + continue; + sprintf(buf, "%d:%d", major, minor); + dfd = dev_open(buf, O_RDONLY); + if (dfd >= 0) { + closedir(dir); + return dfd; + } + } + closedir(dir); + return -1; +} + +struct superswitch *version_to_superswitch(char *vers) +{ + int i; + + for (i = 0; superlist[i]; i++) { + struct superswitch *ss = superlist[i]; + + if (strcmp(vers, ss->name) == 0) + return ss; + } + + return NULL; +} + +int metadata_container_matches(char *metadata, char *devnm) +{ + /* Check if 'devnm' is the container named in 'metadata' + * which is + * /containername/componentname or + * -containername/componentname + */ + int l; + if (*metadata != '/' && *metadata != '-') + return 0; + l = strlen(devnm); + if (strncmp(metadata+1, devnm, l) != 0) + return 0; + if (metadata[l+1] != '/') + return 0; + return 1; +} + +int metadata_subdev_matches(char *metadata, char *devnm) +{ + /* Check if 'devnm' is the subdev named in 'metadata' + * which is + * /containername/subdev or + * -containername/subdev + */ + char *sl; + if (*metadata != '/' && *metadata != '-') + return 0; + sl = strchr(metadata+1, '/'); + if (!sl) + return 0; + if (strcmp(sl+1, devnm) == 0) + return 1; + return 0; +} + +int is_container_member(struct mdstat_ent *mdstat, char *container) +{ + if (mdstat->metadata_version == NULL || + strncmp(mdstat->metadata_version, "external:", 9) != 0 || + !metadata_container_matches(mdstat->metadata_version+9, container)) + return 0; + + return 1; +} + +int is_subarray_active(char *subarray, char *container) +{ + struct mdstat_ent *mdstat = mdstat_read(0, 0); + struct mdstat_ent *ent; + + for (ent = mdstat; ent; ent = ent->next) + if (is_container_member(ent, container)) + if (strcmp(to_subarray(ent, container), subarray) == 0) + break; + + free_mdstat(mdstat); + + return ent != NULL; +} + +/* open_subarray - opens a subarray in a container + * @dev: container device name + * @st: empty supertype + * @quiet: block reporting errors flag + * + * On success returns an fd to a container and fills in *st + */ +int open_subarray(char *dev, char *subarray, struct supertype *st, int quiet) +{ + struct mdinfo *mdi; + struct mdinfo *info; + int fd, err = 1; + char *_devnm; + + fd = open(dev, O_RDWR|O_EXCL); + if (fd < 0) { + if (!quiet) + pr_err("Couldn't open %s, aborting\n", + dev); + return -1; + } + + _devnm = fd2devnm(fd); + if (_devnm == NULL) { + if (!quiet) + pr_err("Failed to determine device number for %s\n", + dev); + goto close_fd; + } + strcpy(st->devnm, _devnm); + + mdi = sysfs_read(fd, st->devnm, GET_VERSION|GET_LEVEL); + if (!mdi) { + if (!quiet) + pr_err("Failed to read sysfs for %s\n", + dev); + goto close_fd; + } + + if (mdi->array.level != UnSet) { + if (!quiet) + pr_err("%s is not a container\n", dev); + goto free_sysfs; + } + + st->ss = version_to_superswitch(mdi->text_version); + if (!st->ss) { + if (!quiet) + pr_err("Operation not supported for %s metadata\n", + mdi->text_version); + goto free_sysfs; + } + + if (st->devnm[0] == 0) { + if (!quiet) + pr_err("Failed to allocate device name\n"); + goto free_sysfs; + } + + if (!st->ss->load_container) { + if (!quiet) + pr_err("%s is not a container\n", dev); + goto free_sysfs; + } + + if (st->ss->load_container(st, fd, NULL)) { + if (!quiet) + pr_err("Failed to load metadata for %s\n", + dev); + goto free_sysfs; + } + + info = st->ss->container_content(st, subarray); + if (!info) { + if (!quiet) + pr_err("Failed to find subarray-%s in %s\n", + subarray, dev); + goto free_super; + } + free(info); + + err = 0; + + free_super: + if (err) + st->ss->free_super(st); + free_sysfs: + sysfs_free(mdi); + close_fd: + if (err) + close(fd); + + if (err) + return -1; + else + return fd; +} + +int add_disk(int mdfd, struct supertype *st, + struct mdinfo *sra, struct mdinfo *info) +{ + /* Add a device to an array, in one of 2 ways. */ + int rv; +#ifndef MDASSEMBLE + if (st->ss->external) { + if (info->disk.state & (1<<MD_DISK_SYNC)) + info->recovery_start = MaxSector; + else + info->recovery_start = 0; + rv = sysfs_add_disk(sra, info, 0); + if (! rv) { + struct mdinfo *sd2; + for (sd2 = sra->devs; sd2; sd2=sd2->next) + if (sd2 == info) + break; + if (sd2 == NULL) { + sd2 = xmalloc(sizeof(*sd2)); + *sd2 = *info; + sd2->next = sra->devs; + sra->devs = sd2; + } + } + } else +#endif + rv = ioctl(mdfd, ADD_NEW_DISK, &info->disk); + return rv; +} + +int remove_disk(int mdfd, struct supertype *st, + struct mdinfo *sra, struct mdinfo *info) +{ + int rv; + /* Remove the disk given by 'info' from the array */ +#ifndef MDASSEMBLE + if (st->ss->external) + rv = sysfs_set_str(sra, info, "slot", "none"); + else +#endif + rv = ioctl(mdfd, HOT_REMOVE_DISK, makedev(info->disk.major, + info->disk.minor)); + return rv; +} + +int set_array_info(int mdfd, struct supertype *st, struct mdinfo *info) +{ + /* Initialise kernel's knowledge of array. + * This varies between externally managed arrays + * and older kernels + */ + int vers = md_get_version(mdfd); + int rv; + +#ifndef MDASSEMBLE + if (st->ss->external) + rv = sysfs_set_array(info, vers); + else +#endif + if ((vers % 100) >= 1) { /* can use different versions */ + mdu_array_info_t inf; + memset(&inf, 0, sizeof(inf)); + inf.major_version = info->array.major_version; + inf.minor_version = info->array.minor_version; + rv = ioctl(mdfd, SET_ARRAY_INFO, &inf); + } else + rv = ioctl(mdfd, SET_ARRAY_INFO, NULL); + return rv; +} + +unsigned long long min_recovery_start(struct mdinfo *array) +{ + /* find the minimum recovery_start in an array for metadata + * formats that only record per-array recovery progress instead + * of per-device + */ + unsigned long long recovery_start = MaxSector; + struct mdinfo *d; + + for (d = array->devs; d; d = d->next) + recovery_start = min(recovery_start, d->recovery_start); + + return recovery_start; +} + +int mdmon_pid(char *devnm) +{ + char path[100]; + char pid[10]; + int fd; + int n; + + sprintf(path, "%s/%s.pid", MDMON_DIR, devnm); + + fd = open(path, O_RDONLY | O_NOATIME, 0); + + if (fd < 0) + return -1; + n = read(fd, pid, 9); + close(fd); + if (n <= 0) + return -1; + return atoi(pid); +} + +int mdmon_running(char *devnm) +{ + int pid = mdmon_pid(devnm); + if (pid <= 0) + return 0; + if (kill(pid, 0) == 0) + return 1; + return 0; +} + +int start_mdmon(char *devnm) +{ + int i, skipped; + int len; + pid_t pid; + int status; + char pathbuf[1024]; + char *paths[4] = { + pathbuf, + BINDIR "/mdmon", + "./mdmon", + NULL + }; + + if (check_env("MDADM_NO_MDMON")) + return 0; + + len = readlink("/proc/self/exe", pathbuf, sizeof(pathbuf)-1); + if (len > 0) { + char *sl; + pathbuf[len] = 0; + sl = strrchr(pathbuf, '/'); + if (sl) + sl++; + else + sl = pathbuf; + strcpy(sl, "mdmon"); + } else + pathbuf[0] = '\0'; + + /* First try to run systemctl */ + if (!check_env("MDADM_NO_SYSTEMCTL")) + switch(fork()) { + case 0: + /* FIXME yuk. CLOSE_EXEC?? */ + skipped = 0; + for (i = 3; skipped < 20; i++) + if (close(i) < 0) + skipped++; + else + skipped = 0; + + /* Don't want to see error messages from + * systemctl. If the service doesn't exist, + * we start mdmon ourselves. + */ + close(2); + open("/dev/null", O_WRONLY); + snprintf(pathbuf, sizeof(pathbuf), "mdmon@%s.service", + devnm); + status = execl("/usr/bin/systemctl", "systemctl", + "start", + pathbuf, NULL); + status = execl("/bin/systemctl", "systemctl", "start", + pathbuf, NULL); + exit(1); + case -1: pr_err("cannot run mdmon. Array remains readonly\n"); + return -1; + default: /* parent - good */ + pid = wait(&status); + if (pid >= 0 && status == 0) + return 0; + } + + /* That failed, try running mdmon directly */ + switch(fork()) { + case 0: + /* FIXME yuk. CLOSE_EXEC?? */ + skipped = 0; + for (i = 3; skipped < 20; i++) + if (close(i) < 0) + skipped++; + else + skipped = 0; + + for (i = 0; paths[i]; i++) + if (paths[i][0]) { + execl(paths[i], paths[i], + devnm, NULL); + } + exit(1); + case -1: pr_err("cannot run mdmon. Array remains readonly\n"); + return -1; + default: /* parent - good */ + pid = wait(&status); + if (pid < 0 || status != 0) { + pr_err("failed to launch mdmon. Array remains readonly\n"); + return -1; + } + } + return 0; +} + +__u32 random32(void) +{ + __u32 rv; + int rfd = open("/dev/urandom", O_RDONLY); + if (rfd < 0 || read(rfd, &rv, 4) != 4) + rv = random(); + if (rfd >= 0) + close(rfd); + return rv; +} + +#ifndef MDASSEMBLE +int flush_metadata_updates(struct supertype *st) +{ + int sfd; + if (!st->updates) { + st->update_tail = NULL; + return -1; + } + + sfd = connect_monitor(st->container_devnm); + if (sfd < 0) + return -1; + + while (st->updates) { + struct metadata_update *mu = st->updates; + st->updates = mu->next; + + send_message(sfd, mu, 0); + wait_reply(sfd, 0); + free(mu->buf); + free(mu); + } + ack(sfd, 0); + wait_reply(sfd, 0); + close(sfd); + st->update_tail = NULL; + return 0; +} + +void append_metadata_update(struct supertype *st, void *buf, int len) +{ + + struct metadata_update *mu = xmalloc(sizeof(*mu)); + + mu->buf = buf; + mu->len = len; + mu->space = NULL; + mu->space_list = NULL; + mu->next = NULL; + *st->update_tail = mu; + st->update_tail = &mu->next; +} +#endif /* MDASSEMBLE */ + +#ifdef __TINYC__ +/* tinyc doesn't optimize this check in ioctl.h out ... */ +unsigned int __invalid_size_argument_for_IOC = 0; +#endif + +int experimental(void) +{ + if (check_env("MDADM_EXPERIMENTAL")) + return 1; + else { + pr_err("To use this feature MDADM_EXPERIMENTAL environment variable has to be defined.\n"); + return 0; + } +} + +/* Pick all spares matching given criteria from a container + * if min_size == 0 do not check size + * if domlist == NULL do not check domains + * if spare_group given add it to domains of each spare + * metadata allows to test domains using metadata of destination array */ +struct mdinfo *container_choose_spares(struct supertype *st, + unsigned long long min_size, + struct domainlist *domlist, + char *spare_group, + const char *metadata, int get_one) +{ + struct mdinfo *d, **dp, *disks = NULL; + + /* get list of all disks in container */ + if (st->ss->getinfo_super_disks) + disks = st->ss->getinfo_super_disks(st); + + if (!disks) + return disks; + /* find spare devices on the list */ + dp = &disks->devs; + disks->array.spare_disks = 0; + while (*dp) { + int found = 0; + d = *dp; + if (d->disk.state == 0) { + /* check if size is acceptable */ + unsigned long long dev_size; + dev_t dev = makedev(d->disk.major,d->disk.minor); + + if (!min_size || + (dev_size_from_id(dev, &dev_size) && + dev_size >= min_size)) + found = 1; + /* check if domain matches */ + if (found && domlist) { + struct dev_policy *pol = devid_policy(dev); + if (spare_group) + pol_add(&pol, pol_domain, + spare_group, NULL); + if (domain_test(domlist, pol, metadata) != 1) + found = 0; + dev_policy_free(pol); + } + } + if (found) { + dp = &d->next; + disks->array.spare_disks++; + if (get_one) { + sysfs_free(*dp); + d->next = NULL; + } + } else { + *dp = d->next; + d->next = NULL; + sysfs_free(d); + } + } + return disks; +} + +/* Checks if paths point to the same device + * Returns 0 if they do. + * Returns 1 if they don't. + * Returns -1 if something went wrong, + * e.g. paths are empty or the files + * they point to don't exist */ +int compare_paths (char* path1, char* path2) +{ + struct stat st1,st2; + + if (path1 == NULL || path2 == NULL) + return -1; + if (stat(path1,&st1) != 0) + return -1; + if (stat(path2,&st2) != 0) + return -1; + if ((st1.st_ino == st2.st_ino) && (st1.st_dev == st2.st_dev)) + return 0; + return 1; +} + +/* Make sure we can open as many devices as needed */ +void enable_fds(int devices) +{ + unsigned int fds = 20 + devices; + struct rlimit lim; + if (getrlimit(RLIMIT_NOFILE, &lim) != 0 + || lim.rlim_cur >= fds) + return; + if (lim.rlim_max < fds) + lim.rlim_max = fds; + lim.rlim_cur = fds; + setrlimit(RLIMIT_NOFILE, &lim); +} + +int in_initrd(void) +{ + /* This is based on similar function in systemd. */ + struct statfs s; + /* statfs.f_type is signed long on s390x and MIPS, causing all + sorts of sign extension problems with RAMFS_MAGIC being + defined as 0x858458f6 */ + return statfs("/", &s) >= 0 && + ((unsigned long)s.f_type == TMPFS_MAGIC || + ((unsigned long)s.f_type & 0xFFFFFFFFUL) == + ((unsigned long)RAMFS_MAGIC & 0xFFFFFFFFUL)); +} + +void reopen_mddev(int mdfd) +{ + /* Re-open without any O_EXCL, but keep + * the same fd + */ + char *devnm; + int fd; + devnm = fd2devnm(mdfd); + close(mdfd); + fd = open_dev(devnm); + if (fd >= 0 && fd != mdfd) + dup2(fd, mdfd); +} diff --git a/xmalloc.c b/xmalloc.c new file mode 100644 index 00000000..8b3f78a6 --- /dev/null +++ b/xmalloc.c @@ -0,0 +1,84 @@ +/* mdadm - manage Linux "md" devices aka RAID arrays. + * + * Copyright (C) 2001-2009 Neil Brown <neilb@suse.de> + * + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Author: Neil Brown + * Email: <neilb@suse.de> + */ + +#include "mdadm.h" +/*#include <sys/socket.h> +#include <sys/utsname.h> +#include <sys/wait.h> +#include <sys/un.h> +#include <ctype.h> +#include <dirent.h> +#include <signal.h> +*/ + +void *xmalloc(size_t len) +{ + void *rv = malloc(len); + char *msg; + int n; + if (rv) + return rv; + msg = ": memory allocation failure - aborting\n"; + n = write(2, Name, strlen(Name)); + n += write(2, msg, strlen(msg)); + exit(4+!!n); +} + +void *xrealloc(void *ptr, size_t len) +{ + void *rv = realloc(ptr, len); + char *msg; + int n; + if (rv) + return rv; + msg = ": memory allocation failure - aborting\n"; + n = write(2, Name, strlen(Name)); + n += write(2, msg, strlen(msg)); + exit(4+!!n); +} + +void *xcalloc(size_t num, size_t size) +{ + void *rv = calloc(num, size); + char *msg; + int n; + if (rv) + return rv; + msg = ": memory allocation failure - aborting\n"; + n = write(2, Name, strlen(Name)); + n += write(2, msg, strlen(msg)); + exit(4+!!n); +} + +char *xstrdup(const char *str) +{ + char *rv = strdup(str); + char *msg; + int n; + if (rv) + return rv; + msg = ": memory allocation failure - aborting\n"; + n = write(2, Name, strlen(Name)); + n += write(2, msg, strlen(msg)); + exit(4+!!n); +} |