From b83f8fcaffa542498c5698a3a161b9967ac1d3d6 Mon Sep 17 00:00:00 2001 From: Dimitri John Ledkov Date: Sat, 2 Jul 2016 19:16:01 +0100 Subject: mdadm (3.4-2) unstable; urgency=low * Re-enable incremental assembly * Rely on udev to assemble incremental arrays * In environments with systemd rely on mdadm-last-resort@.timer|.service units to activate degraded raids * In environments with initramfs-tools initrd (no systemd) add local-block script to do the same after 2/3rds of root delay iteration * Drop local-top initramfs script * Drop dependency on initscripts package * Drop INITRDSTART support * Drop mdadm-raid init script * Drop ancient preinst * In mdadm.init remove dependency on mdadm-raid * In mdadm.init check, and bail out if running in a container * In mdadm.config drop mdadm/autostart logic * Drop CREATE stanzas from mkconf and don't include them in the initramfs. The generated defaults are the compiled-in defaults. And the current one generates warnings when running mdadm in the initramfs, as there are no passwd|group files to resolve root/disk uid/gid. Closes: 717609 * Adapt changes and formatting of initramfs hook from Ubuntu * Bump standards version to 3.9.7, no changes required * Fix copyright-refers-to-symlink-license * Closes: #781172, #796624, #769201, #813335, #632401, #804973, #714155, #770002, #737132, #675452, #726390, #813637, #814036.
# imported from the archive --- .gitignore | 18 + ANNOUNCE-3.0 | 98 + ANNOUNCE-3.0.1 | 22 + ANNOUNCE-3.0.2 | 21 + ANNOUNCE-3.0.3 | 29 + ANNOUNCE-3.1 | 33 + ANNOUNCE-3.1.1 | 39 + ANNOUNCE-3.1.2 | 46 + ANNOUNCE-3.1.3 | 46 + ANNOUNCE-3.1.4 | 37 + ANNOUNCE-3.1.5 | 42 + ANNOUNCE-3.2 | 77 + ANNOUNCE-3.2.1 | 75 + ANNOUNCE-3.2.2 | 36 + ANNOUNCE-3.2.3 | 24 + ANNOUNCE-3.2.4 | 144 + ANNOUNCE-3.2.5 | 31 + ANNOUNCE-3.2.6 | 57 + ANNOUNCE-3.3 | 63 + ANNOUNCE-3.3.1 | 23 + ANNOUNCE-3.3.2 | 16 + ANNOUNCE-3.3.3 | 18 + ANNOUNCE-3.3.4 | 37 + ANNOUNCE-3.4 | 24 + Assemble.c | 2070 +++++ Build.c | 292 + COPYING | 339 + ChangeLog | 306 + Create.c | 1071 +++ Detail.c | 768 ++ Dump.c | 311 + Examine.c | 225 + Grow.c | 4985 ++++++++++ INSTALL | 13 + Incremental.c | 1808 ++++ Kill.c | 146 + Makefile | 345 + Manage.c | 1786 ++++ Monitor.c | 1143 +++ Query.c | 126 + README.initramfs | 123 + ReadMe.c | 642 ++ TODO | 213 + bitmap.c | 485 + bitmap.h | 291 + config.c | 1206 +++ crc32.c | 360 + crc32.h | 441 + crc32c.c | 104 + debian/FAQ | 581 ++ debian/NEWS | 107 + debian/README.Debian | 148 + debian/README.checkarray | 33 + debian/README.recipes | 149 + debian/TODO | 29 + debian/bugscript | 219 + debian/changelog | 1889 ++++ debian/checkarray | 219 + debian/compat | 1 + debian/control | 32 + debian/copyright | 21 + debian/initramfs/hook | 110 + debian/initramfs/script.local-block | 44 + debian/initramfs/script.local-bottom | 3 + debian/mdadm-waitidle | 56 + debian/mdadm.config | 43 + debian/mdadm.cron.d | 12 + debian/mdadm.cron.daily | 18 + debian/mdadm.doc-base.faq | 9 + debian/mdadm.doc-base.recipes | 9 + debian/mdadm.docs | 7 + debian/mdadm.init | 100 + debian/mdadm.logcheck.ignore.server | 23 + debian/mdadm.logcheck.violations | 3 + debian/mdadm.maintscript | 1 + debian/mdadm.modules | 8 + debian/mdadm.postinst | 107 + debian/mdadm.postrm | 25 + debian/mdadm.templates | 38 + debian/mkconf | 97 + debian/patches/debian-conffile-location.diff | 115 + debian/patches/debian-no-Werror.diff | 24 + 
debian/patches/mdmonitor-service-simplify.diff | 20 + debian/patches/readlink-path.patch | 15 + debian/patches/series | 5 + debian/patches/sha1-includes.diff | 40 + debian/po/POTFILES.in | 1 + debian/po/ca.po | 184 + debian/po/cs.po | 228 + debian/po/da.po | 175 + debian/po/de.po | 249 + debian/po/es.po | 218 + debian/po/eu.po | 176 + debian/po/fi.po | 173 + debian/po/fr.po | 186 + debian/po/gl.po | 177 + debian/po/it.po | 177 + debian/po/ja.po | 233 + debian/po/nl.po | 188 + debian/po/pt.po | 179 + debian/po/pt_BR.po | 304 + debian/po/ru.po | 189 + debian/po/sk.po | 176 + debian/po/sv.po | 186 + debian/po/templates.pot | 78 + debian/po/vi.po | 179 + debian/presubj | 32 + debian/rules | 103 + debian/source/format | 1 + debian/watch | 2 + dlink.c | 74 + dlink.h | 25 + external-reshape-design.txt | 280 + inventory | 255 + kernel-patch-2.6.18 | 35 + kernel-patch-2.6.18.6 | 35 + kernel-patch-2.6.19 | 34 + kernel-patch-2.6.25 | 199 + kernel-patch-2.6.27 | 36 + lib.c | 475 + makedist | 96 + managemon.c | 926 ++ mapfile.c | 508 + maps.c | 150 + md.4 | 1145 +++ md5.h | 136 + md_p.h | 269 + md_u.h | 123 + mdadm.8.in | 3258 +++++++ mdadm.c | 1936 ++++ mdadm.conf-example | 65 + mdadm.conf.5 | 641 ++ mdadm.h | 1691 ++++ mdadm.spec | 45 + mdassemble.8 | 65 + mdassemble.c | 80 + mdmon-design.txt | 146 + mdmon.8 | 257 + mdmon.c | 602 ++ mdmon.h | 110 + mdopen.c | 468 + mdstat.c | 414 + misc/mdcheck | 159 + misc/syslog-events | 27 + mkinitramfs | 55 + monitor.c | 712 ++ msg.c | 475 + msg.h | 37 + part.h | 79 + platform-intel.c | 741 ++ platform-intel.h | 247 + policy.c | 911 ++ probe_roms.c | 317 + probe_roms.h | 24 + pwgr.c | 17 + raid5extend.c | 80 + raid6check.8 | 96 + raid6check.c | 713 ++ restripe.c | 1008 ++ sg_io.c | 42 + sha1.c | 415 + sha1.h | 136 + super-ddf.c | 5273 +++++++++++ super-gpt.c | 216 + super-intel.c | 10765 ++++++++++++++++++++++ super-mbr.c | 204 + super0.c | 1332 +++ super1.c | 2656 ++++++ swap_super.c | 81 + sysfs.c | 931 ++ systemd/SUSE-mdadm_env.sh | 45 
+ systemd/mdadm-grow-continue@.service | 17 + systemd/mdadm-last-resort@.service | 8 + systemd/mdadm-last-resort@.timer | 7 + systemd/mdadm.shutdown | 4 + systemd/mdmon@.service | 28 + systemd/mdmonitor.service | 13 + test | 440 + tests/00linear | 25 + tests/00multipath | 29 + tests/00names | 13 + tests/00raid0 | 43 + tests/00raid1 | 34 + tests/00raid10 | 18 + tests/00raid4 | 16 + tests/00raid5 | 33 + tests/00raid6 | 16 + tests/01r1fail | 29 + tests/01r5fail | 27 + tests/01r5integ | 33 + tests/01raid6integ | 57 + tests/01replace | 52 + tests/02lineargrow | 23 + tests/02r1add | 40 + tests/02r1grow | 36 + tests/02r5grow | 36 + tests/02r6grow | 36 + tests/03assem-incr | 17 + tests/03r0assem | 137 + tests/03r5assem | 109 + tests/03r5assem-failed | 12 + tests/03r5assemV1 | 128 + tests/04r0update | 20 + tests/04r1update | 15 + tests/04r5swap | 18 + tests/04update-metadata | 48 + tests/04update-uuid | 82 + tests/05r1-add-internalbitmap | 20 + tests/05r1-add-internalbitmap-v1a | 20 + tests/05r1-add-internalbitmap-v1b | 20 + tests/05r1-add-internalbitmap-v1c | 20 + tests/05r1-bitmapfile | 49 + tests/05r1-grow-external | 33 + tests/05r1-grow-internal | 31 + tests/05r1-grow-internal-1 | 31 + tests/05r1-internalbitmap | 47 + tests/05r1-internalbitmap-v1a | 48 + tests/05r1-internalbitmap-v1b | 49 + tests/05r1-internalbitmap-v1c | 48 + tests/05r1-n3-bitmapfile | 53 + tests/05r1-re-add | 39 + tests/05r1-re-add-nosuper | 38 + tests/05r1-remove-internalbitmap | 18 + tests/05r1-remove-internalbitmap-v1a | 18 + tests/05r1-remove-internalbitmap-v1b | 18 + tests/05r1-remove-internalbitmap-v1c | 18 + tests/05r5-bitmapfile | 49 + tests/05r5-internalbitmap | 47 + tests/05r6-bitmapfile | 49 + tests/05r6tor0 | 27 + tests/06name | 12 + tests/06sysfs | 11 + tests/06wrmostly | 13 + tests/07autoassemble | 24 + tests/07autodetect | 34 + tests/07changelevelintr | 61 + tests/07changelevels | 114 + tests/07layouts | 91 + tests/07reshape5intr | 41 + tests/07revert-grow | 52 + tests/07revert-inplace 
| 44 + tests/07revert-shrink | 56 + tests/07testreshape5 | 45 + tests/09imsm-assemble | 73 + tests/09imsm-create-fail-rebuild | 78 + tests/09imsm-overlap | 30 + tests/10ddf-assemble-missing | 61 + tests/10ddf-create | 89 + tests/10ddf-create-fail-rebuild | 77 + tests/10ddf-fail-create-race | 66 + tests/10ddf-fail-readd | 55 + tests/10ddf-fail-readd-readonly | 71 + tests/10ddf-fail-spare | 86 + tests/10ddf-fail-stop-readd | 66 + tests/10ddf-fail-twice | 59 + tests/10ddf-fail-two-spares | 86 + tests/10ddf-geometry | 82 + tests/10ddf-incremental-wrong-order | 131 + tests/10ddf-sudden-degraded | 18 + tests/11spare-migration | 454 + tests/12imsm-r0_2d-grow-r0_3d | 20 + tests/12imsm-r0_2d-grow-r0_4d | 20 + tests/12imsm-r0_2d-grow-r0_5d | 20 + tests/12imsm-r0_3d-grow-r0_4d | 20 + tests/12imsm-r5_3d-grow-r5_4d | 20 + tests/12imsm-r5_3d-grow-r5_5d | 20 + tests/13imsm-r0_r0_2d-grow-r0_r0_4d | 29 + tests/13imsm-r0_r0_2d-grow-r0_r0_5d | 29 + tests/13imsm-r0_r0_3d-grow-r0_r0_4d | 29 + tests/13imsm-r0_r5_3d-grow-r0_r5_4d | 29 + tests/13imsm-r0_r5_3d-grow-r0_r5_5d | 29 + tests/13imsm-r5_r0_3d-grow-r5_r0_4d | 29 + tests/13imsm-r5_r0_3d-grow-r5_r0_5d | 29 + tests/14imsm-r0_3d-r5_3d-migrate-r5_4d-r5_4d | 29 + tests/14imsm-r0_3d_no_spares-migrate-r5_3d | 21 + tests/14imsm-r0_r0_2d-takeover-r10_4d | 30 + tests/14imsm-r10_4d-grow-r10_5d | 20 + tests/14imsm-r10_r5_4d-takeover-r0_2d | 30 + tests/14imsm-r1_2d-grow-r1_3d | 20 + tests/14imsm-r1_2d-takeover-r0_2d | 22 + tests/14imsm-r5_3d-grow-r5_5d-no-spares | 20 + tests/14imsm-r5_3d-migrate-r4_3d | 21 + tests/15imsm-r0_3d_64k-migrate-r0_3d_256k | 21 + tests/15imsm-r5_3d_4k-migrate-r5_3d_256k | 21 + tests/15imsm-r5_3d_64k-migrate-r5_3d_256k | 21 + tests/15imsm-r5_6d_4k-migrate-r5_6d_256k | 21 + tests/15imsm-r5_r0_3d_64k-migrate-r5_r0_3d_256k | 34 + tests/16imsm-r0_3d-migrate-r5_4d | 21 + tests/16imsm-r0_5d-migrate-r5_6d | 21 + tests/16imsm-r5_3d-migrate-r0_3d | 21 + tests/16imsm-r5_5d-migrate-r0_5d | 21 + tests/18imsm-1d-takeover-r0_1d | 22 
+ tests/18imsm-1d-takeover-r1_2d | 20 + tests/18imsm-r0_2d-takeover-r10_4d | 22 + tests/18imsm-r10_4d-takeover-r0_2d | 22 + tests/18imsm-r1_2d-takeover-r0_1d | 22 + tests/19raid6auto-repair | 49 + tests/19raid6check | 27 + tests/19raid6repair | 56 + tests/19repair-does-not-destroy | 28 + tests/20raid5journal | 64 + tests/ToTest | 44 + tests/check | 35 + tests/env-ddf-template | 113 + tests/env-imsm-template | 74 + tests/imsm-grow-template | 106 + tests/testdev | 13 + tests/utils | 191 + udev-md-raid-arrays.rules | 41 + udev-md-raid-assembly.rules | 35 + util.c | 2205 +++++ xmalloc.c | 84 + 312 files changed, 79862 insertions(+) create mode 100644 .gitignore create mode 100644 ANNOUNCE-3.0 create mode 100644 ANNOUNCE-3.0.1 create mode 100644 ANNOUNCE-3.0.2 create mode 100644 ANNOUNCE-3.0.3 create mode 100644 ANNOUNCE-3.1 create mode 100644 ANNOUNCE-3.1.1 create mode 100644 ANNOUNCE-3.1.2 create mode 100644 ANNOUNCE-3.1.3 create mode 100644 ANNOUNCE-3.1.4 create mode 100644 ANNOUNCE-3.1.5 create mode 100644 ANNOUNCE-3.2 create mode 100644 ANNOUNCE-3.2.1 create mode 100644 ANNOUNCE-3.2.2 create mode 100644 ANNOUNCE-3.2.3 create mode 100644 ANNOUNCE-3.2.4 create mode 100644 ANNOUNCE-3.2.5 create mode 100644 ANNOUNCE-3.2.6 create mode 100644 ANNOUNCE-3.3 create mode 100644 ANNOUNCE-3.3.1 create mode 100644 ANNOUNCE-3.3.2 create mode 100644 ANNOUNCE-3.3.3 create mode 100644 ANNOUNCE-3.3.4 create mode 100644 ANNOUNCE-3.4 create mode 100644 Assemble.c create mode 100644 Build.c create mode 100644 COPYING create mode 100644 ChangeLog create mode 100644 Create.c create mode 100644 Detail.c create mode 100644 Dump.c create mode 100644 Examine.c create mode 100755 Grow.c create mode 100644 INSTALL create mode 100644 Incremental.c create mode 100644 Kill.c create mode 100644 Makefile create mode 100644 Manage.c create mode 100644 Monitor.c create mode 100644 Query.c create mode 100644 README.initramfs create mode 100644 ReadMe.c create mode 100644 TODO create mode 100644 
bitmap.c create mode 100644 bitmap.h create mode 100644 config.c create mode 100644 crc32.c create mode 100644 crc32.h create mode 100644 crc32c.c create mode 100644 debian/FAQ create mode 100644 debian/NEWS create mode 100644 debian/README.Debian create mode 100644 debian/README.checkarray create mode 100644 debian/README.recipes create mode 100644 debian/TODO create mode 100755 debian/bugscript create mode 100644 debian/changelog create mode 100644 debian/checkarray create mode 100644 debian/compat create mode 100644 debian/control create mode 100644 debian/copyright create mode 100644 debian/initramfs/hook create mode 100644 debian/initramfs/script.local-block create mode 100644 debian/initramfs/script.local-bottom create mode 100644 debian/mdadm-waitidle create mode 100644 debian/mdadm.config create mode 100644 debian/mdadm.cron.d create mode 100644 debian/mdadm.cron.daily create mode 100644 debian/mdadm.doc-base.faq create mode 100644 debian/mdadm.doc-base.recipes create mode 100644 debian/mdadm.docs create mode 100644 debian/mdadm.init create mode 100644 debian/mdadm.logcheck.ignore.server create mode 100644 debian/mdadm.logcheck.violations create mode 100644 debian/mdadm.maintscript create mode 100644 debian/mdadm.modules create mode 100644 debian/mdadm.postinst create mode 100644 debian/mdadm.postrm create mode 100644 debian/mdadm.templates create mode 100644 debian/mkconf create mode 100644 debian/patches/debian-conffile-location.diff create mode 100644 debian/patches/debian-no-Werror.diff create mode 100644 debian/patches/mdmonitor-service-simplify.diff create mode 100644 debian/patches/readlink-path.patch create mode 100644 debian/patches/series create mode 100644 debian/patches/sha1-includes.diff create mode 100644 debian/po/POTFILES.in create mode 100644 debian/po/ca.po create mode 100644 debian/po/cs.po create mode 100644 debian/po/da.po create mode 100644 debian/po/de.po create mode 100644 debian/po/es.po create mode 100644 debian/po/eu.po create 
mode 100644 debian/po/fi.po create mode 100644 debian/po/fr.po create mode 100644 debian/po/gl.po create mode 100644 debian/po/it.po create mode 100644 debian/po/ja.po create mode 100644 debian/po/nl.po create mode 100644 debian/po/pt.po create mode 100644 debian/po/pt_BR.po create mode 100644 debian/po/ru.po create mode 100644 debian/po/sk.po create mode 100644 debian/po/sv.po create mode 100644 debian/po/templates.pot create mode 100644 debian/po/vi.po create mode 100644 debian/presubj create mode 100755 debian/rules create mode 100644 debian/source/format create mode 100644 debian/watch create mode 100644 dlink.c create mode 100644 dlink.h create mode 100644 external-reshape-design.txt create mode 100755 inventory create mode 100644 kernel-patch-2.6.18 create mode 100644 kernel-patch-2.6.18.6 create mode 100644 kernel-patch-2.6.19 create mode 100644 kernel-patch-2.6.25 create mode 100644 kernel-patch-2.6.27 create mode 100644 lib.c create mode 100755 makedist create mode 100644 managemon.c create mode 100644 mapfile.c create mode 100644 maps.c create mode 100644 md.4 create mode 100644 md5.h create mode 100644 md_p.h create mode 100644 md_u.h create mode 100644 mdadm.8.in create mode 100644 mdadm.c create mode 100644 mdadm.conf-example create mode 100644 mdadm.conf.5 create mode 100755 mdadm.h create mode 100644 mdadm.spec create mode 100644 mdassemble.8 create mode 100644 mdassemble.c create mode 100644 mdmon-design.txt create mode 100644 mdmon.8 create mode 100644 mdmon.c create mode 100644 mdmon.h create mode 100644 mdopen.c create mode 100644 mdstat.c create mode 100644 misc/mdcheck create mode 100644 misc/syslog-events create mode 100644 mkinitramfs create mode 100644 monitor.c create mode 100644 msg.c create mode 100644 msg.h create mode 100644 part.h create mode 100644 platform-intel.c create mode 100644 platform-intel.h create mode 100644 policy.c create mode 100644 probe_roms.c create mode 100644 probe_roms.h create mode 100644 pwgr.c create mode 100644 
raid5extend.c create mode 100644 raid6check.8 create mode 100644 raid6check.c create mode 100644 restripe.c create mode 100644 sg_io.c create mode 100644 sha1.c create mode 100644 sha1.h create mode 100644 super-ddf.c create mode 100644 super-gpt.c create mode 100644 super-intel.c create mode 100644 super-mbr.c create mode 100644 super0.c create mode 100644 super1.c create mode 100644 swap_super.c create mode 100644 sysfs.c create mode 100644 systemd/SUSE-mdadm_env.sh create mode 100644 systemd/mdadm-grow-continue@.service create mode 100644 systemd/mdadm-last-resort@.service create mode 100644 systemd/mdadm-last-resort@.timer create mode 100644 systemd/mdadm.shutdown create mode 100644 systemd/mdmon@.service create mode 100644 systemd/mdmonitor.service create mode 100755 test create mode 100644 tests/00linear create mode 100644 tests/00multipath create mode 100644 tests/00names create mode 100644 tests/00raid0 create mode 100644 tests/00raid1 create mode 100644 tests/00raid10 create mode 100644 tests/00raid4 create mode 100644 tests/00raid5 create mode 100644 tests/00raid6 create mode 100644 tests/01r1fail create mode 100644 tests/01r5fail create mode 100644 tests/01r5integ create mode 100644 tests/01raid6integ create mode 100644 tests/01replace create mode 100644 tests/02lineargrow create mode 100644 tests/02r1add create mode 100644 tests/02r1grow create mode 100644 tests/02r5grow create mode 100644 tests/02r6grow create mode 100644 tests/03assem-incr create mode 100644 tests/03r0assem create mode 100644 tests/03r5assem create mode 100644 tests/03r5assem-failed create mode 100644 tests/03r5assemV1 create mode 100644 tests/04r0update create mode 100644 tests/04r1update create mode 100644 tests/04r5swap create mode 100644 tests/04update-metadata create mode 100644 tests/04update-uuid create mode 100644 tests/05r1-add-internalbitmap create mode 100644 tests/05r1-add-internalbitmap-v1a create mode 100644 tests/05r1-add-internalbitmap-v1b create mode 100644 
tests/05r1-add-internalbitmap-v1c create mode 100644 tests/05r1-bitmapfile create mode 100644 tests/05r1-grow-external create mode 100644 tests/05r1-grow-internal create mode 100644 tests/05r1-grow-internal-1 create mode 100644 tests/05r1-internalbitmap create mode 100644 tests/05r1-internalbitmap-v1a create mode 100644 tests/05r1-internalbitmap-v1b create mode 100644 tests/05r1-internalbitmap-v1c create mode 100644 tests/05r1-n3-bitmapfile create mode 100644 tests/05r1-re-add create mode 100644 tests/05r1-re-add-nosuper create mode 100644 tests/05r1-remove-internalbitmap create mode 100644 tests/05r1-remove-internalbitmap-v1a create mode 100644 tests/05r1-remove-internalbitmap-v1b create mode 100644 tests/05r1-remove-internalbitmap-v1c create mode 100644 tests/05r5-bitmapfile create mode 100644 tests/05r5-internalbitmap create mode 100644 tests/05r6-bitmapfile create mode 100644 tests/05r6tor0 create mode 100644 tests/06name create mode 100644 tests/06sysfs create mode 100644 tests/06wrmostly create mode 100644 tests/07autoassemble create mode 100644 tests/07autodetect create mode 100644 tests/07changelevelintr create mode 100644 tests/07changelevels create mode 100644 tests/07layouts create mode 100644 tests/07reshape5intr create mode 100644 tests/07revert-grow create mode 100644 tests/07revert-inplace create mode 100644 tests/07revert-shrink create mode 100644 tests/07testreshape5 create mode 100644 tests/09imsm-assemble create mode 100644 tests/09imsm-create-fail-rebuild create mode 100644 tests/09imsm-overlap create mode 100644 tests/10ddf-assemble-missing create mode 100644 tests/10ddf-create create mode 100644 tests/10ddf-create-fail-rebuild create mode 100644 tests/10ddf-fail-create-race create mode 100644 tests/10ddf-fail-readd create mode 100644 tests/10ddf-fail-readd-readonly create mode 100644 tests/10ddf-fail-spare create mode 100644 tests/10ddf-fail-stop-readd create mode 100644 tests/10ddf-fail-twice create mode 100644 tests/10ddf-fail-two-spares 
create mode 100644 tests/10ddf-geometry create mode 100644 tests/10ddf-incremental-wrong-order create mode 100644 tests/10ddf-sudden-degraded create mode 100644 tests/11spare-migration create mode 100644 tests/12imsm-r0_2d-grow-r0_3d create mode 100644 tests/12imsm-r0_2d-grow-r0_4d create mode 100644 tests/12imsm-r0_2d-grow-r0_5d create mode 100644 tests/12imsm-r0_3d-grow-r0_4d create mode 100644 tests/12imsm-r5_3d-grow-r5_4d create mode 100644 tests/12imsm-r5_3d-grow-r5_5d create mode 100644 tests/13imsm-r0_r0_2d-grow-r0_r0_4d create mode 100644 tests/13imsm-r0_r0_2d-grow-r0_r0_5d create mode 100644 tests/13imsm-r0_r0_3d-grow-r0_r0_4d create mode 100644 tests/13imsm-r0_r5_3d-grow-r0_r5_4d create mode 100644 tests/13imsm-r0_r5_3d-grow-r0_r5_5d create mode 100644 tests/13imsm-r5_r0_3d-grow-r5_r0_4d create mode 100644 tests/13imsm-r5_r0_3d-grow-r5_r0_5d create mode 100644 tests/14imsm-r0_3d-r5_3d-migrate-r5_4d-r5_4d create mode 100644 tests/14imsm-r0_3d_no_spares-migrate-r5_3d create mode 100644 tests/14imsm-r0_r0_2d-takeover-r10_4d create mode 100644 tests/14imsm-r10_4d-grow-r10_5d create mode 100644 tests/14imsm-r10_r5_4d-takeover-r0_2d create mode 100644 tests/14imsm-r1_2d-grow-r1_3d create mode 100644 tests/14imsm-r1_2d-takeover-r0_2d create mode 100644 tests/14imsm-r5_3d-grow-r5_5d-no-spares create mode 100644 tests/14imsm-r5_3d-migrate-r4_3d create mode 100644 tests/15imsm-r0_3d_64k-migrate-r0_3d_256k create mode 100644 tests/15imsm-r5_3d_4k-migrate-r5_3d_256k create mode 100644 tests/15imsm-r5_3d_64k-migrate-r5_3d_256k create mode 100644 tests/15imsm-r5_6d_4k-migrate-r5_6d_256k create mode 100644 tests/15imsm-r5_r0_3d_64k-migrate-r5_r0_3d_256k create mode 100644 tests/16imsm-r0_3d-migrate-r5_4d create mode 100644 tests/16imsm-r0_5d-migrate-r5_6d create mode 100644 tests/16imsm-r5_3d-migrate-r0_3d create mode 100644 tests/16imsm-r5_5d-migrate-r0_5d create mode 100644 tests/18imsm-1d-takeover-r0_1d create mode 100644 tests/18imsm-1d-takeover-r1_2d create mode 
100644 tests/18imsm-r0_2d-takeover-r10_4d create mode 100644 tests/18imsm-r10_4d-takeover-r0_2d create mode 100644 tests/18imsm-r1_2d-takeover-r0_1d create mode 100644 tests/19raid6auto-repair create mode 100644 tests/19raid6check create mode 100644 tests/19raid6repair create mode 100644 tests/19repair-does-not-destroy create mode 100644 tests/20raid5journal create mode 100644 tests/ToTest create mode 100644 tests/check create mode 100644 tests/env-ddf-template create mode 100644 tests/env-imsm-template create mode 100644 tests/imsm-grow-template create mode 100644 tests/testdev create mode 100644 tests/utils create mode 100644 udev-md-raid-arrays.rules create mode 100644 udev-md-raid-assembly.rules create mode 100644 util.c create mode 100644 xmalloc.c diff --git a/.gitignore b/.gitignore new file mode 100644 index 00000000..217fe76d --- /dev/null +++ b/.gitignore @@ -0,0 +1,18 @@ +/*.o +/*.man +/*-stamp +/mdadm +/mdadm.8 +/mdadm.udeb +/mdassemble +/mdmon +/swap_super +/test_stripe +/TAGS +/mdadm.O2 +/mdadm.Os +/mdadm.static +/mdassemble.auto +/mdassemble.static +/mdmon.O2 +/raid6check diff --git a/ANNOUNCE-3.0 b/ANNOUNCE-3.0 new file mode 100644 index 00000000..f2d4f847 --- /dev/null +++ b/ANNOUNCE-3.0 @@ -0,0 +1,98 @@ +Subject: ANNOUNCE: mdadm 3.0 - A tool for managing Soft RAID under Linux + +I am pleased to (finally) announce the availability of + mdadm version 3.0 + +It is available at the usual places: + countrycode=xx. + http://www.${countrycode}kernel.org/pub/linux/utils/raid/mdadm/ +and via git at + git://neil.brown.name/mdadm + http://neil.brown.name/git?p=mdadm + + +This is a major new version and as such should be treated with some +caution. However it has seen substantial testing and is considerred +to be ready for wide use. + + +The significant change which justifies the new major version number is +that mdadm can now handle metadata updates entirely in userspace. +This allows mdadm to support metadata formats that the kernel knows +nothing about. 
+ +Currently two such metadata formats are supported: + - DDF - The SNIA standard format + - Intel Matrix - The metadata used by recent Intel ICH controllers. + +Also the approach to device names has changed significantly. + +If udev is installed on the system, mdadm will not create any devices +in /dev. Rather it allows udev to manage those devices. For this to work +as expected, the included udev rules file should be installed. + +If udev is not installed, mdadm will still create devices and symlinks +as required, and will also remove them when the array is stopped. + +mdadm now requires all devices which do not have a standard name (mdX +or md_dX) to live in the directory /dev/md/. Names in this directory +will always be created as symlinks back to the standard name in /dev. + +The man pages contain some information about the new externally managed +metadata. However see below for a more condensed overview. + +Externally managed metadata introduces the concept of a 'container'. +A container is a collection of (normally) physical devices which have +a common set of metadata. A container is assembled as an md array, but +is left 'inactive'. + +A container can contain one or more data arrays. These are composed from +slices (partitions?) of various devices in the container. + +For example, a 5-device DDF set can contain a RAID1 using the first +half of two devices, a RAID0 using the first half of the remaining 3 devices, +and a RAID5 over the second half of all 5 devices. + +A container can be created with + + mdadm --create /dev/md0 -e ddf -n5 /dev/sd[abcde] + +or "-e imsm" to use the Intel Matrix Storage Manager.
+ +An array can be created within a container either by giving the +container name as the only member: + + mdadm -C /dev/md1 --level raid1 -n 2 /dev/md0 + +or by listing the component devices + + mdadm -C /dev/md2 --level raid0 -n 3 /dev/sd[cde] + +To assemble a container, it is easiest just to pass each device in turn to +mdadm -I + + for i in /dev/sd[abcde] + do mdadm -I $i + done + +This will assemble the container and the components. + +Alternately the container can be assembled explicitly + + mdadm -A /dev/md0 /dev/sd[abcde] + +Then the components can all be assembled with + + mdadm -I /dev/md0 + +For each container, mdadm will start a program called "mdmon" which will +monitor the array and effect any metadata updates needed. The array is +initially assembled readonly. It is up to "mdmon" to mark the metadata +as 'dirty' and switch the array to 'read-write'. + +The version 0.90 and 1.x metadata formats supported by previous +versions of mdadm are still supported and the kernel still performs +the same updates it used to. The new 'mdmon' approach is only used for +newly introduced metadata types. + +NeilBrown 2nd June 2009 diff --git a/ANNOUNCE-3.0.1 b/ANNOUNCE-3.0.1 new file mode 100644 index 00000000..91b44284 --- /dev/null +++ b/ANNOUNCE-3.0.1 @@ -0,0 +1,22 @@ +Subject: ANNOUNCE: mdadm 3.0.1 - A tool for managing Soft RAID under Linux + +I am pleased to announce the availability of + mdadm version 3.0.1 + +It is available at the usual places: + countrycode=xx. + http://www.${countrycode}kernel.org/pub/linux/utils/raid/mdadm/ +and via git at + git://neil.brown.name/mdadm + http://neil.brown.name/git?p=mdadm + + +This contains only minor bug fixes over 3.0. If you are using +3.0, you could consider upgrading. + +The brief change log is: + - Fix various segfaults + - Fixes for --examine with containers + - Lots of other little fixes.
+ +NeilBrown 25th September 2009 diff --git a/ANNOUNCE-3.0.2 b/ANNOUNCE-3.0.2 new file mode 100644 index 00000000..93643d17 --- /dev/null +++ b/ANNOUNCE-3.0.2 @@ -0,0 +1,21 @@ +Subject: ANNOUNCE: mdadm 3.0.2 - A tool for managing Soft RAID under Linux + +I am pleased to announce the availability of + mdadm version 3.0.2 + +It is available at the usual places: + countrycode=xx. + http://www.${countrycode}kernel.org/pub/linux/utils/raid/mdadm/ +and via git at + git://neil.brown.name/mdadm + http://neil.brown.name/git?p=mdadm + + +This just contains one bugfix over 3.0.1 - I was obviously a bit hasty +in releasing that one. + +The brief change log is: + - Fix crash when homehost is not set, as often happens in + early boot. + +NeilBrown 25th September 2009 diff --git a/ANNOUNCE-3.0.3 b/ANNOUNCE-3.0.3 new file mode 100644 index 00000000..d6117a1d --- /dev/null +++ b/ANNOUNCE-3.0.3 @@ -0,0 +1,29 @@ +Subject: ANNOUNCE: mdadm 3.0.3 - A tool for managing Soft RAID under Linux + +I am pleased to announce the availability of + mdadm version 3.0.3 + +It is available at the usual places: + countrycode=xx. + http://www.${countrycode}kernel.org/pub/linux/utils/raid/mdadm/ +and via git at + git://neil.brown.name/mdadm + http://neil.brown.name/git?p=mdadm + + +This contains a collection of bug fixes and minor enhancements over +3.0.1. + +The brief change log is: + - Improvements for creating arrays giving just a name, like 'foo', + rather than the full '/dev/md/foo'. + - Improvements for assembling member arrays of containers. + - Improvements to test suite + - Add option to change increment for RebuildNN messages reported + by "mdadm --monitor" + - Improvements to mdmon 'hand-over' from initrd to final root. + - Handle merging of devices that have left an IMSM array and are + being re-incorporated. + - Add missing space in "--detail --brief" output.
+ +NeilBrown 22nd October 2009 diff --git a/ANNOUNCE-3.1 b/ANNOUNCE-3.1 new file mode 100644 index 00000000..343b85da --- /dev/null +++ b/ANNOUNCE-3.1 @@ -0,0 +1,33 @@ +Subject: ANNOUNCE: mdadm 3.1 - A tool for managing Soft RAID under Linux + +Hot on the heels of 3.0.3 I am pleased to announce the availability of + mdadm version 3.1 + +It is available at the usual places: + countrycode=xx. + http://www.${countrycode}kernel.org/pub/linux/utils/raid/mdadm/ +and via git at + git://neil.brown.name/mdadm + http://neil.brown.name/git?p=mdadm + + +It contains significant feature enhancements over 3.0.x + +The brief change log is: + - Support --grow to change the layout of RAID4/5/6 + - Support --grow to change the chunksize of raid 4/5/6 + - Support --grow to change level from RAID1 -> RAID5 -> RAID6 and + back. + - Support --grow to reduce the number of devices in RAID4/5/6. + - Support restart of these grow options when assembling an array + which is partially grown. + - Assorted tests of this code, and of different RAID6 layouts. + +Note that a 2.6.31 or later kernel is needed to have access to these. +Reducing devices in a RAID4/5/6 requires 2.6.32. +Changing RAID5 to RAID1 requires 2.6.33. + +You should only upgrade if you need to use, or wish to test, these +features. + +NeilBrown 22nd October 2009 diff --git a/ANNOUNCE-3.1.1 b/ANNOUNCE-3.1.1 new file mode 100644 index 00000000..9e480dc0 --- /dev/null +++ b/ANNOUNCE-3.1.1 @@ -0,0 +1,39 @@ +Subject: ANNOUNCE: mdadm 3.1.1 - A tool for managing Soft RAID under Linux + +I am pleased to announce the availability of + mdadm version 3.1.1 + +It is available at the usual places: + countrycode=xx. + http://www.${countrycode}kernel.org/pub/linux/utils/raid/mdadm/ +and via git at + git://neil.brown.name/mdadm + http://neil.brown.name/git?p=mdadm + +This is a bugfix release over 3.1, which was withdrawn due to serious +bugs.
So it might be best to ignore 3.1 and say that this is a significant +feature release over 3.0.x + +Significant changes are: + - RAID level conversions between RAID1, RAID5, and RAID6 are + possible where the kernel supports it (2.6.32 at least) + - online chunksize and layout changing for RAID5 and RAID6 + where the kernel supports it. + - reduce the number of devices in a RAID4/5/6 array. + + - The default metadata is now v1.1. This metadata is stored at the + start of the device so is safer in many ways but could interfere with + boot loaders. The old default (0.90) is still available and fully + supported. + + - The default chunksize is now 512K rather than 64K. This seems more + appropriate for modern devices. + + - The default bitmap chunksize for internal bitmaps is now at least + 64Meg as fine grained bitmaps tend to impact performance more for + little extra gain. + +This release is believed to be stable and you should feel free to +upgrade to 3.1.1. + +NeilBrown 19th November 2009 diff --git a/ANNOUNCE-3.1.2 b/ANNOUNCE-3.1.2 new file mode 100644 index 00000000..321b8bef --- /dev/null +++ b/ANNOUNCE-3.1.2 @@ -0,0 +1,46 @@ +Subject: ANNOUNCE: mdadm 3.1.2 - A tool for managing Soft RAID under Linux + +I am pleased to announce the availability of + mdadm version 3.1.2 + +It is available at the usual places: + countrycode=xx. + http://www.${countrycode}kernel.org/pub/linux/utils/raid/mdadm/ +and via git at + git://neil.brown.name/mdadm + http://neil.brown.name/git?p=mdadm + +This is a bugfix/stability release over 3.1.1. + +Significant changes are: + - The default metadata has changed again (sorry about that). + It is now v1.2 and will hopefully stay that way. It turned + out there were boot-block issues with v1.1 which make it + unsuitable for a default, though in many cases it is still + suitable to use. + - Stopping a container is not permitted when members are still + active + - Add 'homehost' to the valid words for the "AUTO" config file + line.
When followed by "-all", this causes mdadm to + auto-assemble any array belonging to this host, but not + auto-assemble anything else. + - Fix some bugs with "--grow --chunksize=" for changing chunksize. + - VAR_RUN can be easily changed at compile time just like ALT_RUN. + This gives distros more flexibility in how to manage the + pid and sock files that mdmon needs. + - Various mdmon fixes + - Always make bitmap 4K-aligned if at all possible. + - If mdadm.conf lists arrays which have inter-dependencies, + they previously had to be listed in the "right" order. Now + any order should work. + - Fix --force assembly of v1.x arrays which are in the process + of recovering. + - Add section on 'scrubbing' to 'md' man page. + - Various command-line-option parsing improvements. + - ... and lots of other bug fixes. + + +This release is believed to be stable and you should feel free to +upgrade to 3.1.2 + +NeilBrown 10th March 2010 diff --git a/ANNOUNCE-3.1.3 b/ANNOUNCE-3.1.3 new file mode 100644 index 00000000..95b2b6c1 --- /dev/null +++ b/ANNOUNCE-3.1.3 @@ -0,0 +1,46 @@ +Subject: ANNOUNCE: mdadm 3.1.3 - A tool for managing Soft RAID under Linux + +I am pleased to announce the availability of + mdadm version 3.1.3 + +It is available at the usual places: + countrycode=xx. + http://www.${countrycode}kernel.org/pub/linux/utils/raid/mdadm/ +and via git at + git://neil.brown.name/mdadm + http://neil.brown.name/git?p=mdadm + +This is a bugfix/stability release over 3.1.2 + +Significant changes are: + - mapfile now lives in a fixed location which defaults to + /dev/.mdadm/map but can be changed at compile time. This + location is chosen as most distros provide it during early + boot and preserve it through. As long as /dev exists and is + writable, /dev/.mdadm will be created. + Other files for communication with mdmon live here too. + This fixes a bug reported by Debian and Gentoo users where + udev would spin in early-boot.
+ - IMSM and DDF metadata will not be recognised on partitions + as they should only be used on whole-disks. + - Various overflows caused by 2G drives have been addressed. + - A subarray of an IMSM container can now be killed with + --kill-subarray. Also subarrays can be renamed with + --update-subarray + - -If (or --incremental --fail) can be used from udev to + fail and remove from all arrays a device which has been + unplugged from the system. i.e. hot-unplug-support. + - "mdadm /dev/mdX --re-add missing" will look for any device + that looks like it should be a member of /dev/mdX but isn't + and will automatically --re-add it + - Now compile with -Wextra to get extra warnings. + - Lots of minor bug fixes, documentation improvements, etc. + +This release is believed to be stable and you should feel free to +upgrade to 3.1.3 + +It is expected that the next release will be 3.2 with a number of new +features. 3.1.4 will only happen if important bugs show up before 3.2 +is stable. + +NeilBrown 6th August 2010 diff --git a/ANNOUNCE-3.1.4 b/ANNOUNCE-3.1.4 new file mode 100644 index 00000000..c157a36a --- /dev/null +++ b/ANNOUNCE-3.1.4 @@ -0,0 +1,37 @@ +Subject: ANNOUNCE: mdadm 3.1.4 - A tool for managing Soft RAID under Linux + +I am pleased to announce the availability of + mdadm version 3.1.4 + +It is available at the usual places: + countrycode=xx. + http://www.${countrycode}kernel.org/pub/linux/utils/raid/mdadm/ +and via git at + git://neil.brown.name/mdadm + http://neil.brown.name/git?p=mdadm + +This is a bugfix/stability release over 3.1.3. +3.1.3 had a couple of embarrassing regressions and a couple of other +issues surfaced which had easy fixes so I decided to make a 3.1.4 +release after all.
+ +Two fixes related to configs that aren't using udev: + - Don't remove md devices with 'standard' names on --stop + - Allow dev_open to work on read-only /dev +And fixed regressions: + - Allow --incremental to add spares to an array + - Accept --no-degraded as a deprecated option rather than + throwing an error + - Return correct success status when --incremental assembling + a container which does not yet have enough devices. + - Don't link mdadm with pthreads, only mdmon needs it. + - Fix compiler warning due to bad use of snprintf + - Fix spare migration + +This release is believed to be stable and you should feel free to +upgrade to 3.1.4 + +It is expected that the next release will be 3.2 with a number of new +features. + +NeilBrown 31st August 2010 diff --git a/ANNOUNCE-3.1.5 b/ANNOUNCE-3.1.5 new file mode 100644 index 00000000..baa1f921 --- /dev/null +++ b/ANNOUNCE-3.1.5 @@ -0,0 +1,42 @@ +Subject: ANNOUNCE: mdadm 3.1.5 - A tool for managing Soft RAID under Linux + +I am pleased to announce the availability of + mdadm version 3.1.5 + +It is available at the usual places: + countrycode=xx. + http://www.${countrycode}kernel.org/pub/linux/utils/raid/mdadm/ +and via git at + git://neil.brown.name/mdadm + http://neil.brown.name/git?p=mdadm + +This is a bugfix/stability release over 3.1.4. It contains all the +important bugfixes found while working on 3.2 and 3.2.1. It will be +the last 3.1.x release - 3.2.1 is expected to be released in a few days. + +Changes include: + - Fixes for v1.x metadata on big-endian machines. + - man page improvements + - Improve '--detail --export' when run on partitions of an md array. + - Fix regression with removing 'failed' or 'detached' devices. + - Fixes for "--assemble --force" in various unusual cases. + - Allow '-Y' to mean --export. This was documented but not implemented. + - Various fixes for handling 'ddf' metadata. This is now more reliable + but could benefit from more interoperability testing.
+ - Correctly list subarrays of a container in "--detail" output. + - Improve checks on whether the requested number of devices is supported + by the metadata - both for --create and --grow. + - Don't remove partitions from a device that is being included in an + array until we are fully committed to including it. + - Allow "--assemble --update=no-bitmap" so an array with a corrupt + bitmap can still be assembled. + - Don't allow --add to succeed if it looks like a "--re-add" is probably + wanted, but cannot succeed. This avoids inadvertently turning + devices into spares when an array is failed. + +This release is believed to be stable and you should feel free to +upgrade to 3.1.5 + + +NeilBrown 23rd March 2011 + diff --git a/ANNOUNCE-3.2 b/ANNOUNCE-3.2 new file mode 100644 index 00000000..9e282bc6 --- /dev/null +++ b/ANNOUNCE-3.2 @@ -0,0 +1,77 @@ +Subject: ANNOUNCE: mdadm 3.2 - A tool for managing Soft RAID under Linux (DEVEL ONLY) + +I am pleased to announce the availability of + mdadm version 3.2 + +It is available at the usual places: + countrycode=xx. + http://www.${countrycode}kernel.org/pub/linux/utils/raid/mdadm/ +and via git at + git://neil.brown.name/mdadm devel-3.2 + http://neil.brown.name/git?p=mdadm + +This is a "Developers only" release. Please don't consider using it +or making it available to others without reading the following. + + +By far the most significant change in this release related to the +management of reshaping arrays. This code has been substantially +re-written so that it can work with 'externally managed metadata' - +Intel's IMSM in particular. We now support level migration and +OnLine Capacity Expansion on these arrays. + +However, while the code largely works it has not been tested +exhaustively so there are likely to be problems. As the reshape code +for native metadata arrays was changed as part of this rewrite these +problems could also result in regressions for reshape of native +metadata. 
+ +It is partly to encourage greater testing that this release is being +made. Any reports of problems - particularly reproducible recipes for +triggering the problems - will be gratefully received. + +It is hoped that a "3.2.1" release will be available in early March +which will be a bugfix release over this and can be considered +suitable for general use. + +Other changes of note: + + - Policy framework. + Various policy statements can be made in the mdadm.conf to guide + the behaviour of mdadm, particularly with regard to how new devices + are treated by "mdadm -I". + Depending on the 'action' associated with a device (identified by + its 'path') such new devices can be automatically re-added to an + existing array that they previously fell out of, or automatically + added as a spare if they appear to contain no data. + + - mdadm now has a limited understanding of partition tables. This + allows the policy framework to make decisions about partitioned + devices as well. + + - --incremental --remove can be told what --path the device was on, + and this info will be recorded so that another device appearing at + the same physical location can be preferentially added to the same + array (provided the spare-same-slot action policy is applied to the + path). + + - A new "--invalid-backup" flag is available in --assemble + mode. This can be used to re-assemble an array which was stopped + in the middle of a reshape, and for which the 'backup file' is no + longer available or is corrupted. The array may have some + corruption in it at the point where reshape was up to, but at least + the rest of the array will become available. + + + - Various internal restructuring - more is needed. + + +Any feedback and bug reports are always welcomed at: + linux-raid@vger.kernel.org + +And please: don't use this in production - particularly not the +--grow functionality.
+ +NeilBrown 1st February 2011 + + diff --git a/ANNOUNCE-3.2.1 b/ANNOUNCE-3.2.1 new file mode 100644 index 00000000..0e7826ca --- /dev/null +++ b/ANNOUNCE-3.2.1 @@ -0,0 +1,75 @@ + + +I am pleased to announce the availability of + mdadm version 3.2.1 + +It is available at the usual places: + countrycode=xx. + http://www.${countrycode}kernel.org/pub/linux/utils/raid/mdadm/ +and via git at + git://neil.brown.name/mdadm + http://neil.brown.name/git/mdadm + +Many of the changes in this release are of internal interest only, +restructuring and refactoring code and so forth. + +Most of the bugs found and fixed during development for 3.2.1 have been +back-ported for the recently-released 3.1.5 so this release primarily +provides a few new features over 3.1.5. + +They include: + - policy framework + Policy can be expressed for moving spare devices between arrays, and + for how to handle hot-plugged devices. This policy can be different + for devices plugged in to different controllers etc. + This, for example, allows a configuration where when a device is plugged + in it is immediately included in an md array as a hot spare and + possibly starts recovery immediately if an array is degraded. + + - some understanding of mbr and gpt partition tables + This is primarily to support the new hot-plug support. If a + device is plugged in and policy suggests it should have a partition table, + the partition table will be copied from a suitably similar device, and + then the partitions will hot-plug and can then be added to md arrays. + + - "--incremental --remove" can remember where a device was removed from + so if a device gets plugged back in the same place, special policy applies + to it, allowing it to be included in an array even if a general hotplug + will not be included. + + - enhanced reshape options, including growing a RAID0 by converting to RAID4, + restriping, and converting back.
Also conversions between RAID0 and + RAID10 and between RAID1 and RAID10 are possible (with a suitably recent + kernel). + + - spare migration for IMSM arrays. + Spare migration can now work across 'containers' using non-native metadata + and specifically Intel's IMSM arrays support spare migrations. + + - OLCE and level migration for Intel IMSM arrays. + OnLine Capacity Expansion and level migration (e.g. RAID0 -> RAID5) is + supported for Intel Matrix Storage Manager arrays. + This support is currently 'experimental' for technical reasons. It can + be enabled with "export MDADM_EXPERIMENTAL=1" + + - avoid including wayward devices + If you split a RAID1, mount the two halves as two separate degraded RAID1s, + and then later bring the two back together, it is possible that the md + metadata won't properly show that one must over-ride the other. + mdadm now does extra checking to detect this possibility and avoid + potentially corrupting data. + + - remove any possible confusion between similar options. + e.g. --brief and --bitmap were both mapped to 'b' and mdadm wouldn't + notice if one was used where the other was expected. + + - allow K,M,G suffixes on chunk sizes + + +While mdadm-3.2.1 is considered to be reasonably stable, you should +only use it if you want to try out the new features, or if you +generally like to be on the bleeding edge. If the new features are not +important to you, then 3.1.5 is probably the appropriate version to be using +until 3.2.2 comes out. + +NeilBrown 28th March 2011 diff --git a/ANNOUNCE-3.2.2 b/ANNOUNCE-3.2.2 new file mode 100644 index 00000000..b70d18b9 --- /dev/null +++ b/ANNOUNCE-3.2.2 @@ -0,0 +1,36 @@ +Subject: ANNOUNCE: mdadm 3.2.2 - A tool for managing Soft RAID under Linux + +I am pleased to announce the availability of + mdadm version 3.2.2 + +It is available at the usual places: + countrycode=xx.
+ http://www.${countrycode}kernel.org/pub/linux/utils/raid/mdadm/ +and via git at + git://neil.brown.name/mdadm + http://neil.brown.name/git/mdadm + +This release is largely a stabilising release for the 3.2 series. +Many of the changes just fix bugs introduced in 3.2 or 3.2.1. + +There are some new features. They are: + - reshaping IMSM (Intel metadata) arrays is no longer 'experimental', + it should work properly and be largely compatible with IMSM drivers in + other platforms. + - --assume-clean can be used with --grow --size to avoid resyncing the + new part of the array. This is only supported with very new kernels. + - RAID0 arrays can have chunksize which is not a power of 2. This has been + supported in the kernel for a while but is only now supported by + mdadm. + + - A new tool 'raid6check' is available which can check a RAID6 array, + or part of it, and report which device is most inconsistent with the + others if any stripe is inconsistent. This is still under development + and does not have a man page yet. If anyone tries it out and has any + questions or experience to report, they would be most welcome on + linux-raid@vger.kernel.org. + +Future releases in the 3.2 series will only be made if bugfixes are needed. +The next release to add features is expected to be 3.3. + +NeilBrown 17th June 2011 diff --git a/ANNOUNCE-3.2.3 b/ANNOUNCE-3.2.3 new file mode 100644 index 00000000..8a8dba46 --- /dev/null +++ b/ANNOUNCE-3.2.3 @@ -0,0 +1,24 @@ +Subject: ANNOUNCE: mdadm 3.2.3 - A tool for managing Soft RAID under Linux + +I am pleased to announce the availability of + mdadm version 3.2.3 + +It is available at the usual places: + countrycode=xx. + http://www.${countrycode}kernel.org/pub/linux/utils/raid/mdadm/ +and via git at + git://neil.brown.name/mdadm + http://neil.brown.name/git/mdadm + +This release is largely a bugfix release for the 3.2 series with many +minor fixes with little or no impact.
+ +The largest single area of change is support for reshape of Intel +IMSM arrays (OnLine Capacity Expansion and Level Migration). +Among other fixes, this now has a better chance of surviving if a +device fails during reshape. + +Upgrading is recommended - particularly if you use mdadm for IMSM +arrays - but not essential. + +NeilBrown 23rd December 2011 diff --git a/ANNOUNCE-3.2.4 b/ANNOUNCE-3.2.4 new file mode 100644 index 00000000..e3216786 --- /dev/null +++ b/ANNOUNCE-3.2.4 @@ -0,0 +1,144 @@ +Subject: ANNOUNCE: mdadm 3.2.4 - A tool for managing Soft RAID under Linux + +I am pleased to announce the availability of + mdadm version 3.2.4 + +It is available at the usual places, now including github: + countrycode=xx. + http://www.${countrycode}kernel.org/pub/linux/utils/raid/mdadm/ +and via git at + git://github.com/neilbrown/mdadm + git://neil.brown.name/mdadm + http://neil.brown.name/git/mdadm + +This release is largely a bugfix release for the 3.2 series with many +minor fixes with little or no impact. + +"--oneline" log of changes is below. Some notable ones are: + + - --offroot argument to improve interactions between mdmon and initrd + - --prefer argument to select which /dev names to display in some + circumstances. + - relax restrictions on when "--add" will be allowed + - Fix bug with adding write-intent-bitmap to active array + - Now defaults to "/run/mdadm" for storing run-time files. + +Upgrading is encouraged. + +The next mdadm release is expected to be 3.3 with a number of new +features. + +NeilBrown 9th May 2012 + +77b3ac8 monitor: make return from read_and_act more symbolic. +68226a8 monitor: ensure we retry soon when 'remove' fails. +8453f8d fix: Monitor sometimes crashes +90fa1a2 Work around gcc-4.7's strict aliasing checks +0c4304c fix: container creation with --incremental used.
+5d1c7cd FIX: External metadata sometimes is not updated +3c20f98 FIX: mdmon check in reshape_container() can cause a problem +59ab9f5 FIX: Typo error in fprint command +9587c37 imsm: load_super_imsm_all function refactoring +ec50f7b imsm: load_imsm_super_all supports loading metadata from the device list +ca9de18 imsm: validate the number of imsm volumes per controller +30602f5 imsm: display fd in error trace when when store_imsm_mpb failes +eb155f6 mdmon: Use getopt_long() to parse command line options +08ca2ad Add --offroot argument to mdadm +da82751 Add --offroot argument to mdmon +a0963a8 Spawn mdmon with --offroot if mdadm was launched with --offroot +f878b24 imsm: fix, the second array need to have the whole available space on devices +d597705 getinfo_super1: Use MaxSector in place of sb->size +6ef8905 super1: make aread/awrite always use an aligned buffer. +de5a472 Remove avail_disks arg from 'enough'. +da8fe5a Assemble: fix --force assemble during reshape. +b10c663 config: fix handing of 'homehost' in AUTO line. +92d49ec FIX: NULL pointer to strdup() can be passed +d2bde6d imsm: FIX: No new missing disks are allowed during general migration +111e9fd FIX: Array is not run when expansion disks are added +bf5cf7c imsm: FIX: imsm_get_allowed_degradation() doesn't count degradation for raid1 +50927b1 Fix: Sometimes mdmon throws core dump during reshape +78340e2 Flush mdmon before next reshape step during container operation +e174219 imsm: FIX: Chunk size migration problem +f93346e FIX: use md position to reshape restart +6a75c8c imsm: FIX: use md position to reshape restart +51d83f5 imsm: FIX: Clear migration record when migration switches to next volume. +e1dd332 FIX: restart reshape when reshape process is stopped just between 2 reshapes +1ca90aa FIX: Do not try to (continue) reshape using inactive array +9f1b0f0 config: conf_match should ignore devname when not set. 
+d669228 Use posix_memalign() for memory used to write bitmaps +178950e FIX: Changes in '0' case for reshape position verification +9200d41 avoid double-free upon "old buggy kernel" sysfs_read failure +4011421 Print error message if failing to write super for 1.x metadata +0011874 Use MDMON_DIR for pid files created in Monitor.c +56d1885 Assemble: don't use O_EXCL until we have checked device content. +b720636 Assemble: support assembling of a RAID0 being reshaped. +c69ffac Manage: allow --re-add to failed array. +52f07f5 Reset bad flag on map update +911cead super1: support superblocks up to 4K. +ad6db3c Create: reduce the verbosity of 'default_layout'. +b2bfdfa super1.c don't keep recalculating bitmap pointer +4122675 Define and use SUPER1_SIZE for allocations +1afa930 init_super1() memset full buffer allocated for superblock +2de0b8a match_metadata_desc1(): Use calloc instead of malloc+memset +3c0bcd4 Use 4K buffer alignment for superblock allocations +308340a Use struct align_fd to cache fd's block size for aligned reads/writes +65ed615 match_metadata_desc0(): Use calloc instead of malloc+memset +de89706 Generalize ROUND_UP() macro and introduce matching ROUND_UP_PTR() +0a2f189 super1.c: use ROUND_UP/ROUND_UP_PTR +654a381 super-intel.c: Use ROUND_UP() instead of manually coding it +42d5dfd __write_init_super_ddf(): Use posix_memalign() instead of static aligned buffer +d4633e0 Examine: fix array size calculation for RAID10. +e62b778 Assemble: improve verbose logging when including old devices. +0073a6e Remove possible crash during RAID6 -> RAID5 reshape. +69fe207 Incremental: fix adding devices with --incremental +bcbb311 Manage: replace 'return 1' with 'goto abort'. +9f58469 Manage: freeze recovery while adding multiple devices. +ae6c05a Create: round off size for RAID1 arrays. +5ca3a90 Grow: print useful error when converting RAID1->RAID5 will fail. +c07d640 Fix tests/05r1-re-add-nosupper +2d762ad Fix the new ROUND_UP macro. 
+fd324b0 sysfs: fixed sysfs_freeze_array array to work properly with Manage_subdevs. +5551b11 imsm: avoid overflows for disks over 1TB +97f81ee clear hi bits if not used after loading metadata from disk +e03640b simplify calculating array_blocks +29cd082 show 2TB volumes/disks support in --detail-platform +2cc699a check volume size in validate_geometry_imsm_orom +9126b9a check that no disk over 2TB is used to create container when no support +027c374 imsm: set 2tb disk attribute for spare +3556c2f Fix typo: wan -> want +15632a9 parse_size: distinguish between 0 and error. +fbdef49 Bitmap_offset is a signed number +508a7f1 super1: leave more space in front of data by default. +40110b9 Fix two typos in fprintf messages +342460c mdadm man page: fix typo +0e7f69a imsm: display maximum volumes per controller and array +36fd8cc imsm: FIX: Update function imsm_num_data_members() for Raid1/10 +7abc987 imsm: FIX: Add volume size expand support to imsm_analyze_change() +f3871fd imsm: Add new metadata update for volume size expansion +54397ed imsm: Execute size change for external metatdata +016e00f FIX: Support metadata changes rollback +fbf3d20 imsm: FIX: Support metadata changes rollback +44f6f18 FIX: Extend size of raid0 array +7e7e9a4 FIX: Respect metadata size limitations +65a9798 FIX: Detect error and rollback metadata +13bcac9 imsm: Add function imsm_get_free_size() +b130333 imsm: Support setting max size for size change operation +c41e00b imsm: FIX: Component size alignment check +58d26a2 FIX: Size change is possible as standalone change only +4aecb54 FIX: Assembled second array is in read only state during reshape +ae2416e FIX: resolve make everything compilation error +480f356 Raid limit of 1024 when scanning for devices. +c2ecf5f Add --prefer option for --detail and --monitor +0a99975 Relax restrictions on when --add is permitted. 
+7ce0570 imsm: fix: rebuild does not continue after reboot +b51702b fix: correct extending size of raid0 array +34a1395 Fix sign extension of bitmap_offset in super1.c +012a864 Introduce sysfs_set_num_signed() and use it to set bitmap/offset +5d7b407 imsm: fix: thunderdome may drop 2tb attribute +5ffdc2d Update test for "is udev active". +96fd06e Adjust to new standard of /run +974e039 test: don't worry too much about array size. +b0a658f Grow: failing the set the per-device size is not an error. +36614e9 super-intel.c: Don't try to close negative fd +562aa10 super-intel.c: Fix resource leak from opendir() + diff --git a/ANNOUNCE-3.2.5 b/ANNOUNCE-3.2.5 new file mode 100644 index 00000000..396da12a --- /dev/null +++ b/ANNOUNCE-3.2.5 @@ -0,0 +1,31 @@ +Subject: ANNOUNCE: mdadm 3.2.5 - A tool for managing Soft RAID under Linux + +I am somewhat disappointed to have to announce the availability of + mdadm version 3.2.5 + +It is available at the usual places, now including github: + countrycode=xx. + http://www.${countrycode}kernel.org/pub/linux/utils/raid/mdadm/ +and via git at + git://github.com/neilbrown/mdadm + git://neil.brown.name/mdadm + http://neil.brown.name/git/mdadm + +This release primarily fixes a serious regression in 3.2.4. +This regression does *not* cause any risk to data. It simply +means that adding a device with "--add" would sometime fail +when it should not. + +The fix also includes a couple of minor fixes such as making +the "--layout=preserve" option to "--grow" work again. + +A reminder that the default location for runtime files is now +"/run/mdadm". If you compile this for a distro that does not +have "/run", you will need to compile with an alternate setting for +MAP_DIR. e.g. 
+ make MAP_DIR=/var/run/mdadm +or + make MAP_DIR=/dev/.mdadm + +NeilBrown 18th May 2012 + diff --git a/ANNOUNCE-3.2.6 b/ANNOUNCE-3.2.6 new file mode 100644 index 00000000..f5cfd492 --- /dev/null +++ b/ANNOUNCE-3.2.6 @@ -0,0 +1,57 @@ +Subject: ANNOUNCE: mdadm 3.2.6 - A tool for managing Soft RAID under Linux + +I am pleased to announce the availability of + mdadm version 3.2.6 + +It is available at the usual places, now including github: + countrycode=xx. + http://www.${countrycode}kernel.org/pub/linux/utils/raid/mdadm/ +and via git at + git://github.com/neilbrown/mdadm + git://neil.brown.name/mdadm + http://neil.brown.name/git/mdadm + +This is a stability release which adds a number of bugfixes to 3.2.5. +There are no real stand-out fixes, just lots of little bits and pieces. + +Below is the "git log --oneline --reverse" list of changes since +3.2.5. + +NeilBrown 25th October 2012 + +b7e05d2 udev-rules: prevent systemd from mount devices before they are ready. +0d478e2 mdadm: Fix Segmentation fault. +42f0ca1 imsm: fix: correct checking volume's degradation +fcf2195 Monitor: fix inconsistencies in values for ->percent +5f862fb Monitor: Report NewArray when an array the disappeared, reappears. +6f51b1c Monitor: fix reporting for Fail vs FailSpare etc. +68ad53b mdmon: fix arg parsing. +517f135 Assemble: don't leak memory with fdlist. +090900c udev-rules: prevent systemd from mount devices before they are ready. +446e000 sha1.h: remove ansidecl.h header inclusion +ec894f5 Manage: zero metadata before adding to 'external' array. +3a84db5 ddf: allow a non-spare to be used to recovery a missing device. +c5d61ca ddf: hack to fix container recognition. +23084aa mdmon: fix arg processing for -a +c4e96a3 mdmon: allow --takeover when original was started with --offroot +80841df find_free_devnum: avoid auto-using names in /etc/mdadm.conf +c5c56d6 mapfile: fix mapfile rebuild for containers +aec89f6 fix segfaults in Detail() +2117ad1 Fix 'enough' function for RAID10.
+0bc300d Use --offroot flag when assembling md arrays via --incrmental +ac78f24 Grow: make warning about old metadata more explicit. +14026ab Replace sha1.h with slightly older version. +6f6809f Add zlib license to crc32.c +5267ba0 Handles spaces in array names better. +c51f288 imsm: allow --assume-clean to work. +acf7076 Grow: allow --grow --continue to work for native metadata. +335d2a6 Grow: fix a couple of typos with --assume-clean usage +9ff1427 Fix open_container +3713633 mdadm: super0: do not override uuid with homehost +31bff58 Trivial bugfix and spelling fixes. +e1e539f Detail: don't report a faulty device as 'spare' or 'rebuilding'. +22a6461 super0: allow creation of array on 2TB+ devices. +a5d47a2 Create new md devices consistently +eb48676 Monitor: don't complain about non-monitorable arrays in mdadm.conf +ecdf2d7 Query: don't be confused by partition tables. +f7b75c1 Query: allow member of non-0.90 arrays to be better reported. diff --git a/ANNOUNCE-3.3 b/ANNOUNCE-3.3 new file mode 100644 index 00000000..f770aa13 --- /dev/null +++ b/ANNOUNCE-3.3 @@ -0,0 +1,63 @@ +Subject: ANNOUNCE: mdadm 3.3 - A tools for managing md Soft RAID under Linux + +I am pleased to announce the availability of + mdadm version 3.3 + +It is available at the usual places: + http://www.kernel.org/pub/linux/utils/raid/mdadm/ +and via git at + git://github.com/neilbrown/mdadm + git://neil.brown.name/mdadm + http://git.neil.brown.name/git/mdadm + +This is a major new release so don't be too surprised if there are a +few issues. If I hear about them they will be fixed in 3.3.1. +git log reports nearly 500 changes since 3.2.6 so I won't list them +all. + +Some highlights are: + +- Some array reshapes can proceed without needing backup file. + This is done by changing the 'data_offset' so we never need to write + any data back over where it was before. If there is no "head space" + or "tail space" to allow data_offset to change, the old mechanism + with a backup file can still be used. 
+- RAID10 arrays can be reshaped to change the number of devices, + change the chunk size, or change the layout between 'near' + and 'offset'. + This will always change data_offset, and will fail if there is no + room for data_offset to be moved. +- "--assemble --update=metadata" can convert a 0.90 array to a 1.0 array. +- bad-block-logs are supported (but not heavily tested yet) +- "--assemble --update=revert-reshape" can be used to undo a reshape + that has just been started but isn't really wanted. This is very + new and while it passes basic tests it cannot be guaranteed. +- improved locking between --incremental and --assemble +- uses systemd to run "mdmon" if systemd is configured to do that. +- kernel names of md devices can be non-numeric. e.g. "md_home" rather than + "md0". This will probably confuse lots of other tools, so you need to + echo CREATE names=yes >> /etc/mdadm.conf + or the feature will not be used. (you also need a reasonably new kernel). +- "--stop" can be given a kernel name instead of a device name. i.e. + mdadm --stop md4 + will work even if /dev/md4 doesn't exist. +- "--detail --export" has some information about the devices in the array +- --dump and --restore can be used to backup and restore the metadata on an + array. +- Hot-replace is supported with + mdadm /dev/mdX --replace /dev/foo + and + mdadm /dev/mdX --replace /dev/foo --with /dev/bar +- Config file can be a directory in which case all "*.conf" files are + read in lexical order. + Default is to read /etc/mdadm.conf and then /etc/mdadm.conf.d + Thus + echo CREATE names=yes > /etc/mdadm.conf.d/names.conf + will also enable the use of named md devices. + +- Lots of improvements to DDF support including adding support for + RAID10 (thanks Martin Wilck). + +and lots of bugfixes and other little changes.
+ +NeilBrown 3rd September 2013 diff --git a/ANNOUNCE-3.3.1 b/ANNOUNCE-3.3.1 new file mode 100644 index 00000000..7d5e666e --- /dev/null +++ b/ANNOUNCE-3.3.1 @@ -0,0 +1,23 @@ +Subject: ANNOUNCE: mdadm 3.3.1 - A tool for managing md Soft RAID under Linux + +I am pleased to announce the availability of + mdadm version 3.3.1 + +It is available at the usual places: + http://www.kernel.org/pub/linux/utils/raid/mdadm/ +and via git at + git://github.com/neilbrown/mdadm + git://neil.brown.name/mdadm + http://git.neil.brown.name/git/mdadm.git + +The main changes are: + - lots of work on "DDF" support. Hopefully it will be more stable + now. Bug reports are always welcome. + - improved interactions with 'systemd'. Where possible, background + tasks are run from systemd (if it is present) rather than forking and + disassociating from the session. This is important because udev + doesn't really let you disassociate. + +though there are a number of other little bug fixes too. + +NeilBrown 5th June 2014 diff --git a/ANNOUNCE-3.3.2 b/ANNOUNCE-3.3.2 new file mode 100644 index 00000000..6b549611 --- /dev/null +++ b/ANNOUNCE-3.3.2 @@ -0,0 +1,16 @@ +Subject: ANNOUNCE: mdadm 3.3.2 - A tool for managing md Soft RAID under Linux + +I am pleased to announce the availability of + mdadm version 3.3.2 + +It is available at the usual places: + http://www.kernel.org/pub/linux/utils/raid/mdadm/ +and via git at + git://github.com/neilbrown/mdadm + git://neil.brown.name/mdadm + http://git.neil.brown.name/git/mdadm.git + +Changes since 3.3.1 are mostly little bugfixes and some man-page +updates.
+ +NeilBrown 21st August 2014 diff --git a/ANNOUNCE-3.3.3 b/ANNOUNCE-3.3.3 new file mode 100644 index 00000000..ac1b2173 --- /dev/null +++ b/ANNOUNCE-3.3.3 @@ -0,0 +1,18 @@ +Subject: ANNOUNCE: mdadm 3.3.3 - A tool for managing md Soft RAID under Linux + +I am pleased to announce the availability of + mdadm version 3.3.3 + +It is available at the usual places: + http://www.kernel.org/pub/linux/utils/raid/mdadm/ +and via git at + git://github.com/neilbrown/mdadm + git://neil.brown.name/mdadm + http://git.neil.brown.name/git/mdadm.git + +The 100 changes since 3.3.2 are mostly little bugfixes and some improvements +to the selftests. +raid6check now handles all RAID6 layouts including DDF correctly. +See git log for the rest. + +NeilBrown 24th July 2015 diff --git a/ANNOUNCE-3.3.4 b/ANNOUNCE-3.3.4 new file mode 100644 index 00000000..52b94562 --- /dev/null +++ b/ANNOUNCE-3.3.4 @@ -0,0 +1,37 @@ +Subject: ANNOUNCE: mdadm 3.3.4 - A tool for managing md Soft RAID under Linux + +I am somewhat disappointed to have to announce the availability of + mdadm version 3.3.4 + +It is available at the usual places: + http://www.kernel.org/pub/linux/utils/raid/mdadm/ +and via git at + git://github.com/neilbrown/mdadm + git://neil.brown.name/mdadm + http://git.neil.brown.name/git/mdadm.git + +In mdadm-3.3 a change was made to how IMSM (Intel Matrix Storage +Manager) metadata was handled. Previously an IMSM array would only +be assembled if it was attached to an IMSM controller. + +In 3.3 this was relaxed as there are circumstances where the +controller is not properly detected. Unfortunately this has negative +consequences which have only just come to light. + +If you have an IMSM RAID1 configured and then disable RAID in the +BIOS, the metadata will remain on the devices.
If you then install +some other OS on one device and then install Linux on the other, Linux +might eventually start noticing the IMSM metadata (depending a bit on whether +mdadm is included in the initramfs) and might start up the RAID1. This could +copy one device over the other, thus trashing one of the installations. + +Not good. + +So with this release IMSM arrays will only be assembled if attached to +an IMSM controller, or if "--force" is given to --assemble, or if the +environment variable IMSM_NO_PLATFORM is set (used primarily for +testing). + +I strongly recommend upgrading to 3.3.4 if you are using 3.3 or later. + +NeilBrown 3rd August 2015. diff --git a/ANNOUNCE-3.4 b/ANNOUNCE-3.4 new file mode 100644 index 00000000..2689732d --- /dev/null +++ b/ANNOUNCE-3.4 @@ -0,0 +1,24 @@ +Subject: ANNOUNCE: mdadm 3.4 - A tool for managing md Soft RAID under Linux + +I am pleased to announce the availability of + mdadm version 3.4 + +It is available at the usual places: + http://www.kernel.org/pub/linux/utils/raid/mdadm/ +and via git at + git://github.com/neilbrown/mdadm + git://neil.brown.name/mdadm + http://git.neil.brown.name/git/mdadm + +The new second-level version number reflects significant new +functionality, in particular support for journalled RAID5/6 and clustered +RAID1. This new support is probably still buggy. Please report bugs. + +There are also a number of fixes for Intel's IMSM metadata support, +and an assortment of minor bug fixes. + +I plan for this to be the last release of mdadm that I provide as I am +retiring from MD and mdadm maintenance. Jes Sorensen has volunteered +to oversee mdadm for the next while. Thanks Jes! + +NeilBrown 28th January 2016 diff --git a/Assemble.c b/Assemble.c new file mode 100644 index 00000000..d199afc9 --- /dev/null +++ b/Assemble.c @@ -0,0 +1,2070 @@ +/* + * mdadm - manage Linux "md" devices aka RAID arrays. 
+ * + * Copyright (C) 2001-2016 Neil Brown + * + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Author: Neil Brown + * Email: + */ + +#include "mdadm.h" +#include + +static int name_matches(char *found, char *required, char *homehost, int require_homehost) +{ + /* See if the name found matches the required name, possibly + * prefixed with 'homehost' + */ + char *sep; + unsigned int l; + + if (strcmp(found, required)==0) + return 1; + sep = strchr(found, ':'); + if (!sep) + return 0; + l = sep - found; + if (strncmp(found, "any:", 4) == 0 || + (homehost && strcmp(homehost, "any") == 0) || + !require_homehost || + (homehost && strlen(homehost) == l && + strncmp(found, homehost, l) == 0)) { + /* matching homehost */ + if (strcmp(sep+1, required) == 0) + return 1; + } + return 0; +} + +static int is_member_busy(char *metadata_version) +{ + /* check if the given member array is active */ + struct mdstat_ent *mdstat = mdstat_read(0, 0); + struct mdstat_ent *ent; + int busy = 0; + + for (ent = mdstat; ent; ent = ent->next) { + if (ent->metadata_version == NULL) + continue; + if (strncmp(ent->metadata_version, "external:", 9) != 0) + continue; + if (!is_subarray(&ent->metadata_version[9])) + continue; + /* Skip first char - it can be '/' or '-' */ + if (strcmp(&ent->metadata_version[10], 
metadata_version+1) == 0) { + busy = 1; + break; + } + } + free_mdstat(mdstat); + + return busy; +} + +static int ident_matches(struct mddev_ident *ident, + struct mdinfo *content, + struct supertype *tst, + char *homehost, int require_homehost, + char *update, char *devname) +{ + + if (ident->uuid_set && (!update || strcmp(update, "uuid")!= 0) && + same_uuid(content->uuid, ident->uuid, tst->ss->swapuuid)==0 && + memcmp(content->uuid, uuid_zero, sizeof(int[4])) != 0) { + if (devname) + pr_err("%s has wrong uuid.\n", devname); + return 0; + } + if (ident->name[0] && (!update || strcmp(update, "name")!= 0) && + name_matches(content->name, ident->name, homehost, require_homehost)==0) { + if (devname) + pr_err("%s has wrong name.\n", devname); + return 0; + } + if (ident->super_minor != UnSet && + ident->super_minor != content->array.md_minor) { + if (devname) + pr_err("%s has wrong super-minor.\n", + devname); + return 0; + } + if (ident->level != UnSet && + ident->level != content->array.level) { + if (devname) + pr_err("%s has wrong raid level.\n", + devname); + return 0; + } + if (ident->raid_disks != UnSet && + content->array.raid_disks != 0 && /* metadata doesn't know how many to expect */ + ident->raid_disks!= content->array.raid_disks) { + if (devname) + pr_err("%s requires wrong number of drives.\n", + devname); + return 0; + } + if (ident->member && ident->member[0]) { + /* content->text_version must match */ + char *s = strchr(content->text_version+1, '/'); + if (s == NULL) { + if (devname) + pr_err("%s is not a container and one is required.\n", + devname); + return 0; + } else if (strcmp(ident->member, s+1) != 0) { + if (devname) + pr_err("skipping wrong member %s is %s\n", + content->text_version, devname); + return 0; + } + } + return 1; +} + +static int select_devices(struct mddev_dev *devlist, + struct mddev_ident *ident, + struct supertype **stp, + struct mdinfo **contentp, + struct context *c, + int inargv, int auto_assem) +{ + struct mddev_dev 
*tmpdev; + int num_devs; + struct supertype *st = *stp; + struct mdinfo *content = NULL; + int report_mismatch = ((inargv && c->verbose >= 0) || c->verbose > 0); + struct domainlist *domains = NULL; + + tmpdev = devlist; num_devs = 0; + while (tmpdev) { + if (tmpdev->used) + tmpdev->used = 2; + else + num_devs++; + tmpdev->disposition = 0; + tmpdev = tmpdev->next; + } + + /* first walk the list of devices to find a consistent set + * that match the criterea, if that is possible. + * We flag the ones we like with 'used'. + */ + for (tmpdev = devlist; + tmpdev; + tmpdev = tmpdev ? tmpdev->next : NULL) { + char *devname = tmpdev->devname; + int dfd; + struct stat stb; + struct supertype *tst; + struct dev_policy *pol = NULL; + int found_container = 0; + + if (tmpdev->used > 1) + continue; + + if (ident->container) { + if (ident->container[0] == '/' && + !same_dev(ident->container, devname)) { + if (report_mismatch) + pr_err("%s is not the container required (%s)\n", + devname, ident->container); + continue; + } + } else if (ident->devices && + !match_oneof(ident->devices, devname)) { + /* Note that we ignore the "device=" identifier if a + * "container=" is given. Checking both is unnecessarily + * complicated. + */ + if (report_mismatch) + pr_err("%s is not one of %s\n", devname, ident->devices); + continue; + } + + tst = dup_super(st); + + dfd = dev_open(devname, O_RDONLY); + if (dfd < 0) { + if (report_mismatch) + pr_err("cannot open device %s: %s\n", + devname, strerror(errno)); + tmpdev->used = 2; + } else if (fstat(dfd, &stb)< 0) { + /* Impossible! */ + pr_err("fstat failed for %s: %s\n", + devname, strerror(errno)); + tmpdev->used = 2; + } else if ((stb.st_mode & S_IFMT) != S_IFBLK) { + pr_err("%s is not a block device.\n", + devname); + tmpdev->used = 2; + } else if (must_be_container(dfd)) { + if (st) { + /* already found some components, this cannot + * be another one. 
+ */ + if (report_mismatch) + pr_err("%s is a container, but we are looking for components\n", + devname); + tmpdev->used = 2; +#if !defined(MDASSEMBLE) || defined(MDASSEMBLE) && defined(MDASSEMBLE_AUTO) + } if (!tst && (tst = super_by_fd(dfd, NULL)) == NULL) { + if (report_mismatch) + pr_err("not a recognisable container: %s\n", + devname); + tmpdev->used = 2; +#endif + } else if (!tst->ss->load_container + || tst->ss->load_container(tst, dfd, NULL)) { + if (report_mismatch) + pr_err("no correct container type: %s\n", + devname); + tmpdev->used = 2; + } else if (auto_assem && + !conf_test_metadata(tst->ss->name, (pol = devid_policy(stb.st_rdev)), + tst->ss->match_home(tst, c->homehost) == 1)) { + if (report_mismatch) + pr_err("%s has metadata type %s for which auto-assembly is disabled\n", + devname, tst->ss->name); + tmpdev->used = 2; + } else + found_container = 1; + } else { + if (!tst && (tst = guess_super(dfd)) == NULL) { + if (report_mismatch) + pr_err("no recogniseable superblock on %s\n", + devname); + tmpdev->used = 2; + } else if ((tst->ignore_hw_compat = 0), + tst->ss->load_super(tst, dfd, + report_mismatch ? 
devname : NULL)) { + if (report_mismatch) + pr_err("no RAID superblock on %s\n", + devname); + tmpdev->used = 2; + } else if (tst->ss->compare_super == NULL) { + if (report_mismatch) + pr_err("Cannot assemble %s metadata on %s\n", + tst->ss->name, devname); + tmpdev->used = 2; + } else if (auto_assem && st == NULL && + !conf_test_metadata(tst->ss->name, (pol = devid_policy(stb.st_rdev)), + tst->ss->match_home(tst, c->homehost) == 1)) { + if (report_mismatch) + pr_err("%s has metadata type %s for which auto-assembly is disabled\n", + devname, tst->ss->name); + tmpdev->used = 2; + } + } + if (dfd >= 0) close(dfd); + if (tmpdev->used == 2) { + if (auto_assem || !inargv) + /* Ignore unrecognised devices during auto-assembly */ + goto loop; + if (ident->uuid_set || ident->name[0] || + ident->super_minor != UnSet) + /* Ignore unrecognised device if looking for + * specific array */ + goto loop; + + pr_err("%s has no superblock - assembly aborted\n", + devname); + if (st) + st->ss->free_super(st); + dev_policy_free(pol); + domain_free(domains); + return -1; + } + + if (found_container) { + /* tmpdev is a container. We need to be either + * looking for a member, or auto-assembling + */ + /* should be safe to try an exclusive open now, we + * have rejected anything that some other mdadm might + * be looking at + */ + dfd = dev_open(devname, O_RDONLY | O_EXCL); + if (dfd < 0) { + if (report_mismatch) + pr_err("%s is busy - skipping\n", devname); + goto loop; + } + close(dfd); + + if (ident->container && ident->container[0] != '/') { + /* we have a uuid */ + int uuid[4]; + + content = *contentp; + tst->ss->getinfo_super(tst, content, NULL); + + if (!parse_uuid(ident->container, uuid) || + !same_uuid(content->uuid, uuid, tst->ss->swapuuid)) { + if (report_mismatch) + pr_err("%s has wrong UUID to be required container\n", + devname); + goto loop; + } + } + /* It is worth looking inside this container. 
+ */ + if (c->verbose > 0) + pr_err("looking in container %s\n", + devname); + + for (content = tst->ss->container_content(tst, NULL); + content; + content = content->next) { + + if (!ident_matches(ident, content, tst, + c->homehost, c->require_homehost, + c->update, + report_mismatch ? devname : NULL)) + /* message already printed */; + else if (is_member_busy(content->text_version)) { + if (report_mismatch) + pr_err("member %s in %s is already assembled\n", + content->text_version, + devname); + } else if (content->array.state & (1<text_version, + devname); + } else + break; + } + if (!content) { + tmpdev->used = 2; + goto loop; /* empty container */ + } + + st = tst; tst = NULL; + if (!auto_assem && inargv && tmpdev->next != NULL) { + pr_err("%s is a container, but is not only device given: confused and aborting\n", + devname); + st->ss->free_super(st); + dev_policy_free(pol); + domain_free(domains); + return -1; + } + if (c->verbose > 0) + pr_err("found match on member %s in %s\n", + content->text_version, devname); + + /* make sure we finished the loop */ + tmpdev = NULL; + goto loop; + } else { + content = *contentp; + tst->ss->getinfo_super(tst, content, NULL); + + if (!ident_matches(ident, content, tst, + c->homehost, c->require_homehost, + c->update, + report_mismatch ? devname : NULL)) + goto loop; + + if (auto_assem) { + /* Never auto-assemble things that conflict + * with mdadm.conf in some way + */ + struct mddev_ident *match; + int rv = 0; + + match = conf_match(tst, content, devname, + report_mismatch ? c->verbose : -1, + &rv); + if (!match && rv == 2) + goto loop; + if (match && match->devname && + strcasecmp(match->devname, "") == 0) { + if (report_mismatch) + pr_err("%s is a member of an explicitly ignored array\n", + devname); + goto loop; + } + if (match && !ident_matches(match, content, tst, + c->homehost, c->require_homehost, + c->update, + report_mismatch ? 
devname : NULL)) + /* Array exists in mdadm.conf but some + * details don't match, so reject it + */ + goto loop; + } + + /* should be safe to try an exclusive open now, we + * have rejected anything that some other mdadm might + * be looking at + */ + dfd = dev_open(devname, O_RDONLY | O_EXCL); + if (dfd < 0) { + if (report_mismatch) + pr_err("%s is busy - skipping\n", devname); + goto loop; + } + close(dfd); + + if (st == NULL) + st = dup_super(tst); + if (st->minor_version == -1) + st->minor_version = tst->minor_version; + + if (memcmp(content->uuid, uuid_zero, + sizeof(int[4])) == 0) { + /* this is a floating spare. It cannot define + * an array unless there are no more arrays of + * this type to be found. It can be included + * in an array of this type though. + */ + tmpdev->used = 3; + goto loop; + } + + if (st->ss != tst->ss || + st->minor_version != tst->minor_version || + st->ss->compare_super(st, tst) != 0) { + /* Some mismatch. If exactly one array matches this host, + * we can resolve on that one. + * Or, if we are auto assembling, we just ignore the second + * for now. 
+ */ + if (auto_assem) + goto loop; + if (c->homehost) { + int first = st->ss->match_home(st, c->homehost); + int last = tst->ss->match_home(tst, c->homehost); + if (first != last && + (first == 1 || last == 1)) { + /* We can do something */ + if (first) {/* just ignore this one */ + if (report_mismatch) + pr_err("%s misses out due to wrong homehost\n", + devname); + goto loop; + } else { /* reject all those sofar */ + struct mddev_dev *td; + if (report_mismatch) + pr_err("%s overrides previous devices due to good homehost\n", + devname); + for (td=devlist; td != tmpdev; td=td->next) + if (td->used == 1) + td->used = 0; + tmpdev->used = 1; + goto loop; + } + } + } + pr_err("superblock on %s doesn't match others - assembly aborted\n", + devname); + tst->ss->free_super(tst); + st->ss->free_super(st); + dev_policy_free(pol); + domain_free(domains); + return -1; + } + tmpdev->used = 1; + } + loop: + /* Collect domain information from members only */ + if (tmpdev && tmpdev->used == 1) { + if (!pol) + pol = devid_policy(stb.st_rdev); + domain_merge(&domains, pol, tst?tst->ss->name:NULL); + } + dev_policy_free(pol); + pol = NULL; + if (tst) + tst->ss->free_super(tst); + } + + /* Check if we found some imsm spares but no members */ + if ((auto_assem || + (ident->uuid_set && + memcmp(uuid_zero, ident->uuid,sizeof(uuid_zero)) == 0)) && + (!st || !st->sb)) + for (tmpdev = devlist; tmpdev; tmpdev = tmpdev->next) { + if (tmpdev->used != 3) + continue; + tmpdev->used = 1; + content = *contentp; + + if (!st->sb) { + /* we need sb from one of the spares */ + int dfd = dev_open(tmpdev->devname, O_RDONLY); + if (dfd < 0 || + st->ss->load_super(st, dfd, NULL)) + tmpdev->used = 2; + if (dfd > 0) + close(dfd); + } + } + + /* Now reject spares that don't match domains of identified members */ + for (tmpdev = devlist; tmpdev; tmpdev = tmpdev->next) { + struct stat stb; + if (tmpdev->used != 3) + continue; + if (stat(tmpdev->devname, &stb)< 0) { + pr_err("fstat failed for %s: %s\n", + 
tmpdev->devname, strerror(errno)); + tmpdev->used = 2; + } else { + struct dev_policy *pol = devid_policy(stb.st_rdev); + int dt = domain_test(domains, pol, NULL); + if (inargv && dt != 0) + /* take this spare as domains match + * if there are any */ + tmpdev->used = 1; + else if (!inargv && dt == 1) + /* device wasn't explicitly listed, so need + * explicit domain match - which we have */ + tmpdev->used = 1; + else + /* if domains don't match mark as unused */ + tmpdev->used = 0; + dev_policy_free(pol); + } + } + domain_free(domains); + *stp = st; + if (st && st->sb && content == *contentp) + st->ss->getinfo_super(st, content, NULL); + *contentp = content; + + return num_devs; +} + +struct devs { + char *devname; + int uptodate; /* set once we decide that this device is as + * recent as everything else in the array. + */ + int included; /* set if the device is already in the array + * due to a previous '-I' + */ + struct mdinfo i; +}; + +static int load_devices(struct devs *devices, char *devmap, + struct mddev_ident *ident, struct supertype **stp, + struct mddev_dev *devlist, struct context *c, + struct mdinfo *content, + int mdfd, char *mddev, + int *most_recentp, int *bestcntp, int **bestp, + int inargv) +{ + struct mddev_dev *tmpdev; + int devcnt = 0; + int nextspare = 0; +#ifndef MDASSEMBLE + int bitmap_done = 0; +#endif + int most_recent = -1; + int bestcnt = 0; + int *best = *bestp; + struct supertype *st = *stp; + + for (tmpdev = devlist; tmpdev; tmpdev=tmpdev->next) { + char *devname = tmpdev->devname; + struct stat stb; + struct supertype *tst; + int i; + int dfd; + + if (tmpdev->used != 1) + continue; + /* looks like a good enough match to update the super block if needed */ +#ifndef MDASSEMBLE + if (c->update) { + /* prepare useful information in info structures */ + struct stat stb2; + int err; + fstat(mdfd, &stb2); + + if (strcmp(c->update, "uuid")==0 && + !ident->uuid_set) { + int rfd; + if ((rfd = open("/dev/urandom", O_RDONLY)) < 0 || + read(rfd, 
ident->uuid, 16) != 16) { + *(__u32*)(ident->uuid) = random(); + *(__u32*)(ident->uuid+1) = random(); + *(__u32*)(ident->uuid+2) = random(); + *(__u32*)(ident->uuid+3) = random(); + } + if (rfd >= 0) close(rfd); + } + dfd = dev_open(devname, + tmpdev->disposition == 'I' + ? O_RDWR : (O_RDWR|O_EXCL)); + + tst = dup_super(st); + if (dfd < 0 || tst->ss->load_super(tst, dfd, NULL) != 0) { + pr_err("cannot re-read metadata from %s - aborting\n", + devname); + if (dfd >= 0) + close(dfd); + close(mdfd); + free(devices); + free(devmap); + tst->ss->free_super(tst); + free(tst); + *stp = st; + return -1; + } + tst->ss->getinfo_super(tst, content, devmap + devcnt * content->array.raid_disks); + + memcpy(content->uuid, ident->uuid, 16); + strcpy(content->name, ident->name); + content->array.md_minor = minor(stb2.st_rdev); + + if (strcmp(c->update, "byteorder") == 0) + err = 0; + else if (strcmp(c->update, "home-cluster") == 0) { + tst->cluster_name = c->homecluster; + err = tst->ss->write_bitmap(tst, dfd, NameUpdate); + } else if (strcmp(c->update, "nodes") == 0) { + tst->nodes = c->nodes; + err = tst->ss->write_bitmap(tst, dfd, NodeNumUpdate); + } else if (strcmp(c->update, "revert-reshape") == 0 && + c->invalid_backup) + err = tst->ss->update_super(tst, content, + "revert-reshape-nobackup", + devname, c->verbose, + ident->uuid_set, + c->homehost); + else + err = tst->ss->update_super(tst, content, c->update, + devname, c->verbose, + ident->uuid_set, + c->homehost); + if (err < 0) { + if (err == -1) + pr_err("--update=%s not understood for %s metadata\n", + c->update, tst->ss->name); + tst->ss->free_super(tst); + free(tst); + close(mdfd); + close(dfd); + free(devices); + free(devmap); + *stp = st; + return -1; + } + if (strcmp(c->update, "uuid")==0 && + !ident->uuid_set) { + ident->uuid_set = 1; + memcpy(ident->uuid, content->uuid, 16); + } + if (tst->ss->store_super(tst, dfd)) + pr_err("Could not re-write superblock on %s.\n", + devname); + + if (strcmp(c->update, "uuid")==0 
&& + ident->bitmap_fd >= 0 && !bitmap_done) { + if (bitmap_update_uuid(ident->bitmap_fd, + content->uuid, + tst->ss->swapuuid) != 0) + pr_err("Could not update uuid on external bitmap.\n"); + else + bitmap_done = 1; + } + } else +#endif + { + dfd = dev_open(devname, + tmpdev->disposition == 'I' + ? O_RDWR : (O_RDWR|O_EXCL)); + tst = dup_super(st); + + if (dfd < 0 || tst->ss->load_super(tst, dfd, NULL) != 0) { + pr_err("cannot re-read metadata from %s - aborting\n", + devname); + if (dfd >= 0) + close(dfd); + close(mdfd); + free(devices); + free(devmap); + tst->ss->free_super(tst); + free(tst); + *stp = st; + return -1; + } + tst->ss->getinfo_super(tst, content, devmap + devcnt * content->array.raid_disks); + } + + fstat(dfd, &stb); + close(dfd); + + if (c->verbose > 0) + pr_err("%s is identified as a member of %s, slot %d%s.\n", + devname, mddev, content->disk.raid_disk, + (content->disk.state & (1<disposition == 'I'); + devices[devcnt].i = *content; + devices[devcnt].i.disk.major = major(stb.st_rdev); + devices[devcnt].i.disk.minor = minor(stb.st_rdev); + + if (devices[devcnt].i.disk.state == 6) { + if (most_recent < 0 || + devices[devcnt].i.events + > devices[most_recent].i.events) { + struct supertype *tmp = tst; + tst = st; + st = tmp; + most_recent = devcnt; + } + } + tst->ss->free_super(tst); + free(tst); + + if (content->array.level == LEVEL_MULTIPATH) + /* with multipath, the raid_disk from the superblock is meaningless */ + i = devcnt; + else + i = devices[devcnt].i.disk.raid_disk; + if (i+1 == 0 || i == MD_DISK_ROLE_JOURNAL) { + if (nextspare < content->array.raid_disks*2) + nextspare = content->array.raid_disks*2; + i = nextspare++; + } else { + /* i is raid_disk - double it so there is room for + * replacements */ + i *= 2; + if (devices[devcnt].i.disk.state & (1<= content->array.raid_disks*2 && + i >= nextspare) + nextspare = i+1; + } + if (i < 10000) { + if (i >= bestcnt) { + int newbestcnt = i+10; + int *newbest = xmalloc(sizeof(int)*newbestcnt); + 
int c; + for (c=0; c < newbestcnt; c++) + if (c < bestcnt) + newbest[c] = best[c]; + else + newbest[c] = -1; + if (best)free(best); + best = newbest; + bestcnt = newbestcnt; + } + if (best[i] >=0 && + devices[best[i]].i.events + == devices[devcnt].i.events + && (devices[best[i]].i.disk.minor + != devices[devcnt].i.disk.minor) + && st->ss == &super0 + && content->array.level != LEVEL_MULTIPATH) { + /* two different devices with identical superblock. + * Could be a mis-detection caused by overlapping + * partitions. fail-safe. + */ + pr_err("WARNING %s and %s appear to have very similar superblocks.\n" + " If they are really different, please --zero the superblock on one\n" + " If they are the same or overlap, please remove one from %s.\n", + devices[best[i]].devname, devname, + inargv ? "the list" : + "the\n DEVICE list in mdadm.conf" + ); + close(mdfd); + free(devices); + free(devmap); + *stp = st; + return -1; + } + if (best[i] == -1 + || (devices[best[i]].i.events + < devices[devcnt].i.events)) + best[i] = devcnt; + } + devcnt++; + } + if (most_recent >= 0) + *most_recentp = most_recent; + *bestcntp = bestcnt; + *bestp = best; + *stp = st; + return devcnt; +} + +static int force_array(struct mdinfo *content, + struct devs *devices, + int *best, int bestcnt, char *avail, + int most_recent, + struct supertype *st, + struct context *c) +{ + int okcnt = 0; + while (!enough(content->array.level, content->array.raid_disks, + content->array.layout, 1, + avail) + || + (content->reshape_active && content->delta_disks > 0 && + !enough(content->array.level, (content->array.raid_disks + - content->delta_disks), + content->new_layout, 1, + avail) + )) { + /* Choose the newest best drive which is + * not up-to-date, update the superblock + * and add it. 
+ */ + int fd; + struct supertype *tst; + unsigned long long current_events; + int chosen_drive = -1; + int i; + + for (i = 0; + i < content->array.raid_disks * 2 && i < bestcnt; + i += 2) { + int j = best[i]; + if (j < 0) + continue; + if (devices[j].uptodate) + continue; + if (devices[j].i.recovery_start != MaxSector) { + int delta; + if (!devices[j].i.reshape_active || + devices[j].i.delta_disks <= 0) + continue; + /* When increasing number of devices, an + * added device also appears to be + * recovering. It is safe to include it + * as long as it won't be a source of + * data. + * For now, just allow for last data + * devices in RAID4 or last devices in RAID4/5/6. + */ + delta = devices[j].i.delta_disks; + if (devices[j].i.array.level >= 4 && + devices[j].i.array.level <= 6 && + i/2 >= content->array.raid_disks - delta) + /* OK */; + else if (devices[j].i.array.level == 4 && + i/2 >= content->array.raid_disks - delta - 1) + /* OK */; + else + continue; + } + if (chosen_drive < 0 || + devices[j].i.events + > devices[chosen_drive].i.events) + chosen_drive = j; + } + if (chosen_drive < 0) + break; + current_events = devices[chosen_drive].i.events; + add_another: + if (c->verbose >= 0) + pr_err("forcing event count in %s(%d) from %d upto %d\n", + devices[chosen_drive].devname, + devices[chosen_drive].i.disk.raid_disk, + (int)(devices[chosen_drive].i.events), + (int)(devices[most_recent].i.events)); + fd = dev_open(devices[chosen_drive].devname, + devices[chosen_drive].included ? 
O_RDWR + : (O_RDWR|O_EXCL)); + if (fd < 0) { + pr_err("Couldn't open %s for write - not updating\n", + devices[chosen_drive].devname); + devices[chosen_drive].i.events = 0; + continue; + } + tst = dup_super(st); + if (tst->ss->load_super(tst,fd, NULL)) { + close(fd); + pr_err("RAID superblock disappeared from %s - not updating.\n", + devices[chosen_drive].devname); + devices[chosen_drive].i.events = 0; + continue; + } + content->events = devices[most_recent].i.events; + tst->ss->update_super(tst, content, "force-one", + devices[chosen_drive].devname, c->verbose, + 0, NULL); + + if (tst->ss->store_super(tst, fd)) { + close(fd); + pr_err("Could not re-write superblock on %s\n", + devices[chosen_drive].devname); + devices[chosen_drive].i.events = 0; + tst->ss->free_super(tst); + continue; + } + close(fd); + devices[chosen_drive].i.events = devices[most_recent].i.events; + devices[chosen_drive].uptodate = 1; + avail[chosen_drive] = 1; + okcnt++; + tst->ss->free_super(tst); + /* If there are any other drives of the same vintage, + * add them in as well. 
We can't lose and we might gain + */ + for (i = 0; + i < content->array.raid_disks * 2 && i < bestcnt ; + i += 2) { + int j = best[i]; + if (j >= 0 && + !devices[j].uptodate && + devices[j].i.recovery_start == MaxSector && + devices[j].i.events == current_events) { + chosen_drive = j; + goto add_another; + } + } + } + return okcnt; +} + +static int start_array(int mdfd, + char *mddev, + struct mdinfo *content, + struct supertype *st, + struct mddev_ident *ident, + int *best, int bestcnt, + int chosen_drive, + struct devs *devices, + unsigned int okcnt, + unsigned int sparecnt, + unsigned int rebuilding_cnt, + unsigned int journalcnt, + struct context *c, + int clean, char *avail, + int start_partial_ok, + int err_ok, + int was_forced + ) +{ + int rv; + int i; + unsigned int req_cnt; + + if (content->journal_device_required && (content->journal_clean == 0)) { + if (!c->force) { + pr_err("Not safe to assemble with missing or stale journal device, consider --force.\n"); + return 1; + } + pr_err("Journal is missing or stale, starting array read only.\n"); + c->readonly = 1; + } + + rv = set_array_info(mdfd, st, content); + if (rv && !err_ok) { + pr_err("failed to set array info for %s: %s\n", + mddev, strerror(errno)); + return 1; + } + if (ident->bitmap_fd >= 0) { + if (ioctl(mdfd, SET_BITMAP_FILE, ident->bitmap_fd) != 0) { + pr_err("SET_BITMAP_FILE failed.\n"); + return 1; + } + } else if (ident->bitmap_file) { + /* From config file */ + int bmfd = open(ident->bitmap_file, O_RDWR); + if (bmfd < 0) { + pr_err("Could not open bitmap file %s\n", + ident->bitmap_file); + return 1; + } + if (ioctl(mdfd, SET_BITMAP_FILE, bmfd) != 0) { + pr_err("Failed to set bitmapfile for %s\n", mddev); + close(bmfd); + return 1; + } + close(bmfd); + } + + /* First, add the raid disks, but add the chosen one last */ + for (i=0; i<= bestcnt; i++) { + int j; + if (i < bestcnt) { + j = best[i]; + if (j == chosen_drive) + continue; + } else + j = chosen_drive; + + if (j >= 0 && 
!devices[j].included) { + int dfd = dev_open(devices[j].devname, + O_RDWR|O_EXCL); + if (dfd >= 0) { + remove_partitions(dfd); + close(dfd); + } + rv = add_disk(mdfd, st, content, &devices[j].i); + + if (rv) { + pr_err("failed to add %s to %s: %s\n", + devices[j].devname, + mddev, + strerror(errno)); + if (i < content->array.raid_disks * 2 + || i == bestcnt) + okcnt--; + else + sparecnt--; + } else if (c->verbose > 0) + pr_err("added %s to %s as %d%s%s\n", + devices[j].devname, mddev, + devices[j].i.disk.raid_disk, + devices[j].uptodate?"": + " (possibly out of date)", + (devices[j].i.disk.state & (1<= 0) { + if (c->verbose > 0) + pr_err("%s is already in %s as %d\n", + devices[j].devname, mddev, + devices[j].i.disk.raid_disk); + } else if (c->verbose > 0 && i < content->array.raid_disks*2 + && (i&1) == 0) + pr_err("no uptodate device for slot %d of %s\n", + i/2, mddev); + } + + if (content->array.level == LEVEL_CONTAINER) { + if (c->verbose >= 0) { + pr_err("Container %s has been assembled with %d drive%s", + mddev, okcnt+sparecnt+journalcnt, + okcnt+sparecnt+journalcnt==1?"":"s"); + if (okcnt < (unsigned)content->array.raid_disks) + fprintf(stderr, " (out of %d)", + content->array.raid_disks); + fprintf(stderr, "\n"); + } + + if (st->ss->validate_container) { + struct mdinfo *devices_list; + struct mdinfo *info_devices = xmalloc(sizeof(struct mdinfo)*(okcnt+sparecnt)); + unsigned int count; + devices_list = NULL; + for (count = 0; count < okcnt+sparecnt; count++) { + info_devices[count] = devices[count].i; + info_devices[count].next = devices_list; + devices_list = &info_devices[count]; + } + if (st->ss->validate_container(devices_list)) + pr_err("Mismatch detected!\n"); + free(info_devices); + } + + st->ss->free_super(st); + sysfs_uevent(content, "change"); + if (err_ok && okcnt < (unsigned)content->array.raid_disks) + /* Was partial, is still partial, so signal an error + * to ensure we don't retry */ + return 1; + return 0; + } + + /* Get number of in-sync 
devices according to the superblock. + * We must have this number to start the array without -s or -R + */ + req_cnt = content->array.working_disks; + + if (c->runstop == 1 || + (c->runstop <= 0 && + ( enough(content->array.level, content->array.raid_disks, + content->array.layout, clean, avail) && + (okcnt + rebuilding_cnt >= req_cnt || start_partial_ok) + ))) { + /* This array is good-to-go. + * If a reshape is in progress then we might need to + * continue monitoring it. In that case we start + * it read-only and let the grow code make it writable. + */ + int rv; +#ifndef MDASSEMBLE + if (content->reshape_active && + !(content->reshape_active & RESHAPE_NO_BACKUP) && + content->delta_disks <= 0) { + if (!c->backup_file) { + pr_err("%s: Need a backup file to complete reshape of this array.\n", + mddev); + pr_err("Please provided one with \"--backup-file=...\"\n"); + if (c->update && + strcmp(c->update, "revert-reshape") == 0) + pr_err("(Don't specify --update=revert-reshape again, that part succeeded.)\n"); + return 1; + } + rv = sysfs_set_str(content, NULL, + "array_state", "readonly"); + if (rv == 0) + rv = Grow_continue(mdfd, st, content, + c->backup_file, 0, + c->freeze_reshape); + } else if (c->readonly && + sysfs_attribute_available( + content, NULL, "array_state")) { + rv = sysfs_set_str(content, NULL, + "array_state", "readonly"); + } else +#endif + rv = ioctl(mdfd, RUN_ARRAY, NULL); + reopen_mddev(mdfd); /* drop O_EXCL */ + if (rv == 0) { + if (c->verbose >= 0) { + pr_err("%s has been started with %d drive%s", + mddev, okcnt, okcnt==1?"":"s"); + if (okcnt < (unsigned)content->array.raid_disks) + fprintf(stderr, " (out of %d)", content->array.raid_disks); + if (rebuilding_cnt) + fprintf(stderr, "%s %d rebuilding", sparecnt?",":" and", rebuilding_cnt); + if (sparecnt) + fprintf(stderr, " and %d spare%s", sparecnt, sparecnt==1?"":"s"); + if (content->journal_clean) + fprintf(stderr, " and %d journal", journalcnt); + fprintf(stderr, ".\n"); + } + if 
(content->reshape_active && + content->array.level >= 4 && + content->array.level <= 6) { + /* might need to increase the size + * of the stripe cache - default is 256 + */ + int chunk_size = content->array.chunk_size; + if (content->reshape_active && + content->new_chunk > chunk_size) + chunk_size = content->new_chunk; + if (256 < 4 * ((chunk_size+4065)/4096)) { + struct mdinfo *sra = sysfs_read(mdfd, NULL, 0); + if (sra) + sysfs_set_num(sra, NULL, + "stripe_cache_size", + (4 * chunk_size / 4096) + 1); + sysfs_free(sra); + } + } + if (okcnt < (unsigned)content->array.raid_disks) { + /* If any devices did not get added + * because the kernel rejected them based + * on event count, try adding them + * again providing the action policy is + * 're-add' or greater. The bitmap + * might allow them to be included, or + * they will become spares. + */ + for (i = 0; i < bestcnt; i++) { + int j = best[i]; + if (j >= 0 && !devices[j].uptodate) { + if (!disk_action_allows(&devices[j].i, st->ss->name, act_re_add)) + continue; + rv = add_disk(mdfd, st, content, + &devices[j].i); + if (rv == 0 && c->verbose >= 0) + pr_err("%s has been re-added.\n", + devices[j].devname); + } + } + } + if (content->array.level == 6 && + okcnt + 1 == (unsigned)content->array.raid_disks && + was_forced) { + struct mdinfo *sra = sysfs_read(mdfd, NULL, 0); + if (sra) + sysfs_set_str(sra, NULL, + "sync_action", "repair"); + sysfs_free(sra); + } + return 0; + } + pr_err("failed to RUN_ARRAY %s: %s\n", + mddev, strerror(errno)); + + if (!enough(content->array.level, content->array.raid_disks, + content->array.layout, 1, avail)) + pr_err("Not enough devices to start the array.\n"); + else if (!enough(content->array.level, + content->array.raid_disks, + content->array.layout, clean, + avail)) + pr_err("Not enough devices to start the array while not clean - consider --force.\n"); + + return 1; + } + if (c->runstop == -1) { + pr_err("%s assembled from %d drive%s", + mddev, okcnt, okcnt==1?"":"s"); + if 
(okcnt != (unsigned)content->array.raid_disks) + fprintf(stderr, " (out of %d)", content->array.raid_disks); + fprintf(stderr, ", but not started.\n"); + return 2; + } + if (c->verbose >= -1) { + pr_err("%s assembled from %d drive%s", mddev, okcnt, okcnt==1?"":"s"); + if (rebuilding_cnt) + fprintf(stderr, "%s %d rebuilding", sparecnt?",":" and", rebuilding_cnt); + if (sparecnt) + fprintf(stderr, " and %d spare%s", sparecnt, sparecnt==1?"":"s"); + if (!enough(content->array.level, content->array.raid_disks, + content->array.layout, 1, avail)) + fprintf(stderr, " - not enough to start the array.\n"); + else if (!enough(content->array.level, + content->array.raid_disks, + content->array.layout, clean, + avail)) + fprintf(stderr, " - not enough to start the array while not clean - consider --force.\n"); + else { + if (req_cnt == (unsigned)content->array.raid_disks) + fprintf(stderr, " - need all %d to start it", req_cnt); + else + fprintf(stderr, " - need %d to start", req_cnt); + fprintf(stderr, " (use --run to insist).\n"); + } + } + return 1; +} + +/* Assemble(): gather the devices whose superblocks describe one array, hand them to the md driver and try to start the array. mddev may be NULL (auto-assembly: the name is then chosen by create_mddev() and stored in chosen_name); devlist may be NULL, in which case conf_get_devs() supplies the candidates. Returns 1 on the error paths, 2 when select_devices() leaves no usable st/st->sb/content, otherwise the result of start_array() with 2 ("assembled but not started") mapped to 0. */int Assemble(struct supertype *st, char *mddev, + struct mddev_ident *ident, + struct mddev_dev *devlist, + struct context *c) +{ + /* + * The task of Assemble is to find a collection of + * devices that should (according to their superblocks) + * form an array, and to give this collection to the MD driver. + * In Linux-2.4 and later, this involves submitting a + * SET_ARRAY_INFO ioctl with no arg - to prepare + * the array - and then submit a number of + * ADD_NEW_DISK ioctls to add disks into + * the array. Finally RUN_ARRAY might + * be submitted to start the array. + * + * Much of the work of Assemble is in finding and/or + * checking the disks to make sure they look right. 
+ * + * If mddev is not set, then scan must be set and we + * read through the config file for dev+uuid mapping + * We recurse, setting mddev, for each device that + * - isn't running + * - has a valid uuid (or any uuid if !uuidset) + * + * If mddev is set, we try to determine state of md. + * check version - must be at least 0.90.0 + * check kernel version. must be at least 2.4. + * If not, we can possibly fall back on START_ARRAY + * Try to GET_ARRAY_INFO. + * If possible, give up + * If not, try to STOP_ARRAY just to make sure + * + * If !uuidset and scan, look in conf-file for uuid + * If not found, give up + * If !devlist and scan and uuidset, get list of devs from conf-file + * + * For each device: + * Check superblock - discard if bad + * Check uuid (set if we don't have one) - discard if no match + * Check superblock similarity if we have a superblock - discard if different + * Record events, devicenum + * This should give us a list of devices for the array + * We should collect the most recent event number + * + * Count disks with recent enough event count + * While force && !enough disks + * Choose newest rejected disks, update event count + * mark clean and rewrite superblock + * If recent kernel: + * SET_ARRAY_INFO + * foreach device with recent events : ADD_NEW_DISK + * if runstop == 1 || "enough" disks and runstop==0 -> RUN_ARRAY + * If old kernel: + * Check the device numbers in superblock are right + * update superblock if any changes + * START_ARRAY + * + */ + int rv; + int mdfd; + int clean; + int auto_assem = (mddev == NULL && !ident->uuid_set && + ident->super_minor == UnSet && ident->name[0] == 0 + && (ident->container == NULL || ident->member == NULL)); + struct devs *devices; + char *devmap; + int *best = NULL; /* indexed by raid_disk */ + int bestcnt = 0; + int devcnt; + unsigned int okcnt, sparecnt, rebuilding_cnt, replcnt, journalcnt; + int journal_clean = 0; + int i; + int was_forced = 0; + int most_recent = 0; + int chosen_drive; + int 
change = 0; + int inargv = 0; + int start_partial_ok = (c->runstop >= 0) && + (c->force || devlist==NULL || auto_assem); + int num_devs; + struct mddev_dev *tmpdev; + struct mdinfo info; + struct mdinfo *content = NULL; + struct mdinfo *pre_exist = NULL; + char *avail; + char *name = NULL; + char chosen_name[1024]; + struct map_ent *map = NULL; + struct map_ent *mp; + + /* + * If any subdevs are listed, then any that don't + * match ident are discarded. Remainder must all match and + * become the array. + * If no subdevs, then we scan all devices in the config file, but + * there must be something in the identity + */ + + if (!devlist && + ident->uuid_set == 0 && + (ident->super_minor < 0 || ident->super_minor == UnSet) && + ident->name[0] == 0 && + (ident->container == NULL || ident->member == NULL) && + ident->devices == NULL) { + pr_err("No identity information available for %s - cannot assemble.\n", + mddev ? mddev : "further assembly"); + return 1; + } + + if (devlist == NULL) + devlist = conf_get_devs(); + else if (mddev) + inargv = 1; + +try_again: + /* We come back here when doing auto-assembly and attempting some + * set of devices failed. Those are now marked as ->used==2 and + * we ignore them and try again + */ + if (!st && ident->st) + st = ident->st; + if (c->verbose>0) + pr_err("looking for devices for %s\n", + mddev ? mddev : "further assembly"); + + content = &info; + if (st && c->force) + st->ignore_hw_compat = 1; + num_devs = select_devices(devlist, ident, &st, &content, c, + inargv, auto_assem); + if (num_devs < 0) + return 1; + + if (!st || !st->sb || !content) + return 2; + + /* We have a full set of devices - we now need to find the + * array device. + * However there is a risk that we are racing with "mdadm -I" + * and the array is already partially assembled - we will have + * rejected any devices already in this address. + * So we take a lock on the map file - to prevent further races - + * and look for the uuid in there. 
If found and the array is + * active, we abort. If found and the array is not active + * we commit to that md device and add all the contained devices + * to our list. We flag them so that we don't try to re-add, + * but can remove if they turn out to not be wanted. + */ + if (map_lock(&map)) + pr_err("failed to get exclusive lock on mapfile - continue anyway...\n"); + if (c->update && strcmp(c->update,"uuid") == 0) + mp = NULL; + else + mp = map_by_uuid(&map, content->uuid); + if (mp) { + struct mdinfo *dv; + /* array already exists. */ + pre_exist = sysfs_read(-1, mp->devnm, GET_LEVEL|GET_DEVS);/* NOTE(review): pre_exist is dereferenced just below without a NULL check - confirm sysfs_read() cannot fail for a mapped array */ + if (pre_exist->array.level != UnSet) { + pr_err("Found some drive for an array that is already active: %s\n", + mp->path); + pr_err("giving up.\n");/* NOTE(review): this and the later early returns leave with the mapfile lock taken by map_lock() above; map_unlock() is only reached after start_array() */ + return 1; + } + for (dv = pre_exist->devs; dv; dv = dv->next) { + /* We want to add this device to our list, + * but it could already be there if "mdadm -I" + * started *after* we checked for O_EXCL. + * If we add it to the top of the list + * it will be preferred over later copies. 
+ */ + struct mddev_dev *newdev; + char *devname = map_dev(dv->disk.major, + dv->disk.minor, + 0); + if (!devname) + continue; + newdev = xmalloc(sizeof(*newdev)); + newdev->devname = devname; + newdev->disposition = 'I'; + newdev->used = 1; + newdev->next = devlist; + devlist = newdev; + num_devs++; + } + strcpy(chosen_name, mp->path);/* NOTE(review): unbounded copy into chosen_name[1024]; presumably mp->path is always shorter - verify */ + if (c->verbose > 0 || mddev == NULL || + strcmp(mddev, chosen_name) != 0) + pr_err("Merging with already-assembled %s\n", + chosen_name); + mdfd = open_dev_excl(mp->devnm); + } else { + int trustworthy = FOREIGN; + name = content->name; + switch (st->ss->match_home(st, c->homehost) + ?: st->ss->match_home(st, "any")) { + case 1: + trustworthy = LOCAL; + name = strchr(content->name, ':'); + if (name) + name++; + else + name = content->name; + break; + } + if (!auto_assem) + /* If the array is listed in mdadm.conf or on + * command line, then we trust the name + * even if the array doesn't look local + */ + trustworthy = LOCAL; + + if (name[0] == 0 && + content->array.level == LEVEL_CONTAINER) { + name = content->text_version; + trustworthy = METADATA; + } + + if (name[0] && trustworthy != LOCAL && + ! 
c->require_homehost && + conf_name_is_free(name)) + trustworthy = LOCAL; + + if (trustworthy == LOCAL && + strchr(name, ':')) + /* Ignore 'host:' prefix of name */ + name = strchr(name, ':')+1; + + mdfd = create_mddev(mddev, name, ident->autof, trustworthy, + chosen_name); + } + if (mdfd < 0) { + st->ss->free_super(st); + if (auto_assem) + goto try_again; + return 1; + } + mddev = chosen_name; + if (get_linux_version() < 2004000 || + md_get_version(mdfd) < 9000) { + pr_err("Assemble requires Linux 2.4 or later, and\n" + " md driver version 0.90.0 or later.\n" + " Upgrade your kernel or try --build\n"); + close(mdfd); + return 1; + } + if (pre_exist == NULL) { + if (mddev_busy(fd2devnm(mdfd))) { + pr_err("%s already active, cannot restart it!\n", + mddev); + for (tmpdev = devlist ; + tmpdev && tmpdev->used != 1; + tmpdev = tmpdev->next) + ; + if (tmpdev && auto_assem) + pr_err("%s needed for %s...\n", + mddev, tmpdev->devname); + close(mdfd); + mdfd = -3; + st->ss->free_super(st); + if (auto_assem) + goto try_again; + return 1; + } + /* just in case it was started but has no content */ + ioctl(mdfd, STOP_ARRAY, NULL); + } + +#ifndef MDASSEMBLE + if (content != &info) { + /* This is a member of a container. Try starting the array. 
*/ + int err; + err = assemble_container_content(st, mdfd, content, c, + chosen_name, NULL); + close(mdfd); + return err; + } +#endif + /* Ok, no bad inconsistency, we can try updating etc */ + devices = xcalloc(num_devs, sizeof(*devices)); + devmap = xcalloc(num_devs, content->array.raid_disks); + devcnt = load_devices(devices, devmap, ident, &st, devlist, + c, content, mdfd, mddev, + &most_recent, &bestcnt, &best, inargv); + if (devcnt < 0)/* NOTE(review): unlike the devcnt == 0 path below, nothing here frees devices/devmap or closes mdfd - confirm whether load_devices() releases them on failure */ + return 1; + + if (devcnt == 0) { + pr_err("no devices found for %s\n", + mddev); + if (st) + st->ss->free_super(st); + close(mdfd); + free(devices); + free(devmap); + return 1; + } + + if (c->update && strcmp(c->update, "byteorder")==0) + st->minor_version = 90; + + st->ss->getinfo_super(st, content, NULL); + clean = content->array.state & 1; + + /* now we have some devices that might be suitable. + * I wonder how many + */ + avail = xcalloc(content->array.raid_disks, 1); + okcnt = 0; + replcnt = 0; + sparecnt=0; + journalcnt=0; + rebuilding_cnt=0; + for (i=0; i< bestcnt; i++) { + int j = best[i]; + int event_margin = 1; /* always allow a difference of '1' + * like the kernel does + */ + if (j < 0) continue; + /* note: we ignore error flags in multipath arrays + * as they don't make sense + */ + if (content->array.level != LEVEL_MULTIPATH) { + if (devices[j].i.disk.state & (1<journal_device_required) + journalcnt++; + else /* unexpected journal, mark as faulty */ + devices[j].i.disk.state |= (1<force && + content->array.raid_disks > 0 && + devices[most_recent].i.disk.raid_disk >= 0 && + devmap[j * content->array.raid_disks + devices[most_recent].i.disk.raid_disk] == 0) { + if (c->verbose > -1) + pr_err("ignoring %s as it reports %s as failed\n", + devices[j].devname, devices[most_recent].devname); + best[i] = -1; + continue; + } + /* Require event counter to be same as, or just less than, + * most recent. If it is bigger, it must be a stray spare and + * should be ignored. 
+ */ + if (devices[j].i.events+event_margin >= + devices[most_recent].i.events && + devices[j].i.events <= + devices[most_recent].i.events + ) { + devices[j].uptodate = 1; + if (devices[j].i.disk.state & (1<array.raid_disks * 2) { + if (devices[j].i.recovery_start == MaxSector || + (content->reshape_active && + i >= content->array.raid_disks - content->delta_disks)) { + if (!avail[i/2]) { + okcnt++; + avail[i/2]=1; + } else + replcnt++; + } else + rebuilding_cnt++; + } else if (devices[j].i.disk.raid_disk != MD_DISK_ROLE_JOURNAL) + sparecnt++; + } + } + free(devmap); + if (c->force) { + int force_ok = force_array(content, devices, best, bestcnt, + avail, most_recent, st, c); + okcnt += force_ok; + if (force_ok) + was_forced = 1; + } + /* Now we want to look at the superblock which the kernel will base things on + * and compare the devices that we think are working with the devices that the + * superblock thinks are working. + * If there are differences and --force is given, then update this chosen + * superblock. 
+ */ + chosen_drive = -1; + st->ss->free_super(st); + for (i=0; chosen_drive < 0 && iss->load_super(st,fd, NULL)) { + close(fd); + pr_err("RAID superblock has disappeared from %s\n", + devices[j].devname); + close(mdfd); + free(devices); + return 1; + } + close(fd); + } + if (st->sb == NULL) { + pr_err("No suitable drives found for %s\n", mddev); + close(mdfd); + free(devices); + return 1; + } + st->ss->getinfo_super(st, content, NULL); +#ifndef MDASSEMBLE + sysfs_init(content, mdfd, NULL); +#endif + /* after reload context, store journal_clean in context */ + content->journal_clean = journal_clean; + for (i=0; i= content->array.raid_disks * 2) + desired_state = 0; + else if (i & 1) + desired_state = (1<ss->update_super(st, &devices[j].i, "assemble", NULL, + c->verbose, 0, NULL)) { + if (c->force) { + if (c->verbose >= 0) + pr_err("clearing FAULTY flag for device %d in %s for %s\n", + j, mddev, devices[j].devname); + change = 1; + } else { + if (c->verbose >= -1) + pr_err("device %d in %s has wrong state in superblock, but %s seems ok\n", + i, mddev, devices[j].devname); + } + } +#if 0 + if (!(super.disks[i].i.disk.state & (1 << MD_DISK_FAULTY))) { + pr_err("devices %d of %s is not marked FAULTY in superblock, but cannot be found\n", + i, mddev); + } +#endif + } + if (c->force && !clean && + !enough(content->array.level, content->array.raid_disks, + content->array.layout, clean, + avail)) { + change += st->ss->update_super(st, content, "force-array", + devices[chosen_drive].devname, c->verbose, + 0, NULL); + was_forced = 1; + clean = 1; + } + + if (change) { + int fd; + fd = dev_open(devices[chosen_drive].devname, + devices[chosen_drive].included ? 
+ O_RDWR : (O_RDWR|O_EXCL)); + if (fd < 0) { + pr_err("Could not open %s for write - cannot Assemble array.\n", + devices[chosen_drive].devname); + close(mdfd); + free(devices); + return 1; + } + if (st->ss->store_super(st, fd)) { + close(fd); + pr_err("Could not re-write superblock on %s\n", + devices[chosen_drive].devname); + close(mdfd); + free(devices); + return 1; + } + if (c->verbose >= 0) + pr_err("Marking array %s as 'clean'\n", + mddev); + close(fd); + } + + /* If we are in the middle of a reshape we may need to restore saved data + * that was moved aside due to the reshape overwriting live data + * The code of doing this lives in Grow.c + */ +#ifndef MDASSEMBLE + if (content->reshape_active && + !(content->reshape_active & RESHAPE_NO_BACKUP)) { + int err = 0; + int *fdlist = xmalloc(sizeof(int)* bestcnt); + if (c->verbose > 0) + pr_err("%s has an active reshape - checking if critical section needs to be restored\n", + chosen_name); + if (!c->backup_file) + c->backup_file = locate_backup(content->sys_name); + enable_fds(bestcnt/2); + for (i = 0; i < bestcnt/2; i++) { + int j = best[i*2]; + if (j >= 0) { + fdlist[i] = dev_open(devices[j].devname, + devices[j].included + ? 
O_RDWR : (O_RDWR|O_EXCL)); + if (fdlist[i] < 0) { + pr_err("Could not open %s for write - cannot Assemble array.\n", + devices[j].devname); + err = 1; + break; + } + } else + fdlist[i] = -1; + } + if (!err) { + if (st->ss->external && st->ss->recover_backup) + err = st->ss->recover_backup(st, content); + else + err = Grow_restart(st, content, fdlist, bestcnt/2, + c->backup_file, c->verbose > 0); + if (err && c->invalid_backup) { + if (c->verbose > 0) + pr_err("continuing without restoring backup\n"); + err = 0; + } + } + while (i>0) { + i--; + if (fdlist[i]>=0) close(fdlist[i]); + } + free(fdlist); + if (err) { + pr_err("Failed to restore critical section for reshape, sorry.\n"); + if (c->backup_file == NULL) + cont_err("Possibly you needed to specify the --backup-file\n"); + close(mdfd); + free(devices); + return err; + } + } +#endif + + /* Almost ready to actually *do* something */ + /* First, fill in the map, so that udev can find our name + * as soon as we become active. + */ + if (c->update && strcmp(c->update, "metadata")==0) { + content->array.major_version = 1; + content->array.minor_version = 0; + strcpy(content->text_version, "1.0"); + } + + map_update(&map, fd2devnm(mdfd), content->text_version, + content->uuid, chosen_name); + + rv = start_array(mdfd, mddev, content, + st, ident, best, bestcnt, + chosen_drive, devices, okcnt, sparecnt, + rebuilding_cnt, journalcnt, + c, + clean, avail, start_partial_ok, + pre_exist != NULL, + was_forced); + if (rv == 1 && !pre_exist) + ioctl(mdfd, STOP_ARRAY, NULL); + free(devices); + map_unlock(&map); + if (rv == 0) { + wait_for(chosen_name, mdfd); + close(mdfd); + if (auto_assem) { + int usecs = 1; + /* There is a nasty race with 'mdadm --monitor'. + * If it opens this device before we close it, + * it gets an incomplete open on which IO + * doesn't work and the capacity is + * wrong. + * If we reopen (to check for layered devices) + * before --monitor closes, we loose. 
+ * + * So: wait upto 1 second for there to be + * a non-zero capacity. + */ + while (usecs < 1000) { + mdfd = open(mddev, O_RDONLY); + if (mdfd >= 0) { + unsigned long long size; + if (get_dev_size(mdfd, NULL, &size) && + size > 0) + break; + close(mdfd); + } + usleep(usecs); + usecs <<= 1; + } + } + } else + close(mdfd); + + /* '2' means 'OK, but not started yet' */ + return rv == 2 ? 0 : rv; +} + +#ifndef MDASSEMBLE +int assemble_container_content(struct supertype *st, int mdfd, + struct mdinfo *content, struct context *c, + char *chosen_name, int *result) +{ + struct mdinfo *dev, *sra, *dev2; + int working = 0, preexist = 0; + int expansion = 0; + struct map_ent *map = NULL; + int old_raid_disks; + int start_reshape; + char *avail = NULL; + int err; + + sysfs_init(content, mdfd, NULL); + + sra = sysfs_read(mdfd, NULL, GET_VERSION|GET_DEVS); + if (sra == NULL || strcmp(sra->text_version, content->text_version) != 0) { + if (content->array.major_version == -1 && + content->array.minor_version == -2 && + c->readonly && + content->text_version[0] == '/') + content->text_version[0] = '-'; + if (sysfs_set_array(content, md_get_version(mdfd)) != 0) { + if (sra) + sysfs_free(sra); + return 1; + } + } + + /* There are two types of reshape: container wide or sub-array specific + * Check if metadata requests blocking container wide reshapes + */ + start_reshape = (content->reshape_active && + !((content->reshape_active == CONTAINER_RESHAPE) && + (content->array.state & (1<ss->external && content->recovery_blocked && start_reshape) + block_subarray(content); + + for (dev2 = sra->devs; dev2; dev2 = dev2->next) { + for (dev = content->devs; dev; dev = dev->next) + if (dev2->disk.major == dev->disk.major && + dev2->disk.minor == dev->disk.minor) + break; + if (dev) + continue; + /* Don't want this one any more */ + if (sysfs_set_str(sra, dev2, "slot", "none") < 0 && + errno == EBUSY) { + pr_err("Cannot remove old device %s: not updating %s\n", dev2->sys_name, sra->sys_name); 
+ sysfs_free(sra); + return 1; + } + sysfs_set_str(sra, dev2, "state", "remove"); + } + old_raid_disks = content->array.raid_disks - content->delta_disks; + avail = xcalloc(content->array.raid_disks, 1); + for (dev = content->devs; dev; dev = dev->next) { + if (dev->disk.raid_disk >= 0) + avail[dev->disk.raid_disk] = 1; + if (sysfs_add_disk(content, dev, 1) == 0) { + if (dev->disk.raid_disk >= old_raid_disks && + content->reshape_active) + expansion++; + else + working++; + } else if (errno == EEXIST) + preexist++; + } + sysfs_free(sra); + if (working + expansion == 0 && c->runstop <= 0) { + free(avail); + return 1;/* Nothing new, don't try to start */ + } + map_update(&map, fd2devnm(mdfd), + content->text_version, + content->uuid, chosen_name); + + + if (enough(content->array.level, content->array.raid_disks, + content->array.layout, content->array.state & 1, avail) == 0) { + if (c->export && result) + *result |= INCR_NO; + else if (c->verbose >= 0) { + pr_err("%s assembled with %d device%s", + chosen_name, preexist + working, + preexist + working == 1 ? "":"s"); + if (preexist) + fprintf(stderr, " (%d new)", working); + fprintf(stderr, " but not started\n"); + } + free(avail); + return 1; + } + free(avail); + + if (c->runstop <= 0 && + (working + preexist + expansion) < + content->array.working_disks) { + if (c->export && result) + *result |= INCR_UNSAFE; + else if (c->verbose >= 0) { + pr_err("%s assembled with %d device%s", + chosen_name, preexist + working, + preexist + working == 1 ? 
"":"s"); + if (preexist) + fprintf(stderr, " (%d new)", working); + fprintf(stderr, " but not safe to start\n"); + } + return 1; + } + + + if (start_reshape) { + int spare = content->array.raid_disks + expansion; + if (restore_backup(st, content, + working, + spare, &c->backup_file, c->verbose) == 1) + return 1; + + err = sysfs_set_str(content, NULL, + "array_state", "readonly"); + if (err) + return 1; + + if (st->ss->external) { + if (!mdmon_running(st->container_devnm)) + start_mdmon(st->container_devnm); + ping_monitor(st->container_devnm); + if (mdmon_running(st->container_devnm) && + st->update_tail == NULL) + st->update_tail = &st->updates; + } + + err = Grow_continue(mdfd, st, content, c->backup_file, + 0, c->freeze_reshape); + } else switch(content->array.level) { + case LEVEL_LINEAR: + case LEVEL_MULTIPATH: + case 0: + err = sysfs_set_str(content, NULL, "array_state", + c->readonly ? "readonly" : "active"); + break; + default: + err = sysfs_set_str(content, NULL, "array_state", + "readonly"); + /* start mdmon if needed. */ + if (!err) { + if (!mdmon_running(st->container_devnm)) + start_mdmon(st->container_devnm); + ping_monitor(st->container_devnm); + } + break; + } + if (!err) + sysfs_set_safemode(content, content->safe_mode_delay); + + /* Block subarray here if it is not reshaped now + * It has be blocked a little later to allow mdmon to switch in + * in to R/W state + */ + if (st->ss->external && content->recovery_blocked && + !start_reshape) + block_subarray(content); + + if (c->export && result) { + if (err) + *result |= INCR_NO; + else + *result |= INCR_YES; + } else if (c->verbose >= 0) { + if (err) + pr_err("array %s now has %d device%s", + chosen_name, working + preexist, + working + preexist == 1 ? "":"s"); + else + pr_err("Started %s with %d device%s", + chosen_name, working + preexist, + working + preexist == 1 ? 
"":"s"); + if (preexist) + fprintf(stderr, " (%d new)", working); + if (expansion) + fprintf(stderr, " ( + %d for expansion)", + expansion); + fprintf(stderr, "\n"); + } + if (!err) + wait_for(chosen_name, mdfd); + return err; + /* FIXME should have an O_EXCL and wait for read-auto */ +} +#endif diff --git a/Build.c b/Build.c new file mode 100644 index 00000000..8603c710 --- /dev/null +++ b/Build.c @@ -0,0 +1,292 @@ +/* + * mdadm - manage Linux "md" devices aka RAID arrays. + * + * Copyright (C) 2001-2009 Neil Brown + * + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Author: Neil Brown + * Email: + */ + +#include "mdadm.h" + +#define REGISTER_DEV _IO (MD_MAJOR, 1) +#define START_MD _IO (MD_MAJOR, 2) +#define STOP_MD _IO (MD_MAJOR, 3) + +int Build(char *mddev, struct mddev_dev *devlist, + struct shape *s, struct context *c) +{ + /* Build a linear or raid0 arrays without superblocks + * We cannot really do any checks, we just do it. + * For md_version < 0.90.0, we call REGISTER_DEV + * with the device numbers, and then + * START_MD giving the "geometry" + * geometry is 0xpp00cc + * where pp is personality: 1==linear, 2=raid0 + * cc = chunk size factor: 0==4k, 1==8k etc. 
+ * + * For md_version >= 0.90.0 we call + * SET_ARRAY_INFO, ADD_NEW_DISK, RUN_ARRAY + * + */ + int i; + int vers; + struct stat stb; + int subdevs = 0, missing_disks = 0; + struct mddev_dev *dv; + int bitmap_fd; + unsigned long long bitmapsize; + int mdfd; + char chosen_name[1024]; + int uuid[4] = {0,0,0,0}; + struct map_ent *map = NULL; + + /* scan all devices, make sure they really are block devices */ + for (dv = devlist; dv; dv=dv->next) { + subdevs++; + if (strcmp("missing", dv->devname) == 0) { + missing_disks++; + continue; + } + if (stat(dv->devname, &stb)) { + pr_err("Cannot find %s: %s\n", + dv->devname, strerror(errno)); + return 1; + } + if ((stb.st_mode & S_IFMT) != S_IFBLK) { + pr_err("%s is not a block device.\n", + dv->devname); + return 1; + } + } + + if (s->raiddisks != subdevs) { + pr_err("requested %d devices in array but listed %d\n", + s->raiddisks, subdevs); + return 1; + } + + if (s->layout == UnSet) + switch(s->level) { + default: /* no layout */ + s->layout = 0; + break; + case 10: + s->layout = 0x102; /* near=2, far=1 */ + if (c->verbose > 0) + pr_err("layout defaults to n1\n"); + break; + case 5: + case 6: + s->layout = map_name(r5layout, "default"); + if (c->verbose > 0) + pr_err("layout defaults to %s\n", map_num(r5layout, s->layout)); + break; + case LEVEL_FAULTY: + s->layout = map_name(faultylayout, "default"); + + if (c->verbose > 0) + pr_err("layout defaults to %s\n", map_num(faultylayout, s->layout)); + break; + } + + /* We need to create the device. It can have no name. 
*/ + map_lock(&map); + mdfd = create_mddev(mddev, NULL, c->autof, LOCAL, + chosen_name); + if (mdfd < 0) { + map_unlock(&map); + return 1; + } + mddev = chosen_name; + + map_update(&map, fd2devnm(mdfd), "none", uuid, chosen_name); + map_unlock(&map); + + vers = md_get_version(mdfd); + + /* looks Ok, go for it */ + if (vers >= 9000) { + mdu_array_info_t array; + array.level = s->level; + if (s->size == MAX_SIZE) + s->size = 0; + array.size = s->size; + array.nr_disks = s->raiddisks; + array.raid_disks = s->raiddisks; + array.md_minor = 0; + if (fstat(mdfd, &stb)==0) + array.md_minor = minor(stb.st_rdev); + array.not_persistent = 1; + array.state = 0; /* not clean, but no errors */ + if (s->assume_clean) + array.state |= 1; + array.active_disks = s->raiddisks - missing_disks; + array.working_disks = s->raiddisks - missing_disks; + array.spare_disks = 0; + array.failed_disks = missing_disks; + if (s->chunk == 0 && (s->level==0 || s->level==LEVEL_LINEAR)) + s->chunk = 64; + array.chunk_size = s->chunk*1024; + array.layout = s->layout; + if (ioctl(mdfd, SET_ARRAY_INFO, &array)) { + pr_err("SET_ARRAY_INFO failed for %s: %s\n", + mddev, strerror(errno)); + goto abort; + } + } else if (s->bitmap_file) { + pr_err("bitmaps not supported with this kernel\n"); + goto abort; + } + + if (s->bitmap_file && strcmp(s->bitmap_file, "none") == 0) + s->bitmap_file = NULL; + if (s->bitmap_file && s->level <= 0) { + pr_err("bitmaps not meaningful with level %s\n", + map_num(pers, s->level)?:"given"); + goto abort; + } + /* now add the devices */ + for ((i=0), (dv = devlist) ; dv ; i++, dv=dv->next) { + unsigned long long dsize; + int fd; + if (strcmp("missing", dv->devname) == 0) + continue; + if (stat(dv->devname, &stb)) { + pr_err("Weird: %s has disappeared.\n", + dv->devname); + goto abort; + } + if ((stb.st_mode & S_IFMT)!= S_IFBLK) { + pr_err("Weird: %s is no longer a block device.\n", + dv->devname); + goto abort; + } + fd = open(dv->devname, O_RDONLY|O_EXCL); + if (fd < 0) { + 
pr_err("Cannot open %s: %s\n", + dv->devname, strerror(errno)); + goto abort; + } + if (get_dev_size(fd, NULL, &dsize) && + (s->size == 0 || s->size == MAX_SIZE || dsize < s->size)) + s->size = dsize; + close(fd); + if (vers >= 9000) { + mdu_disk_info_t disk; + disk.number = i; + disk.raid_disk = i; + disk.state = (1<writemostly == 1) + disk.state |= 1<devname, strerror(errno)); + goto abort; + } + } else { + if (ioctl(mdfd, REGISTER_DEV, &stb.st_rdev)) { + pr_err("REGISTER_DEV failed for %s: %s.\n", + dv->devname, strerror(errno)); + goto abort; + } + } + } + /* now to start it */ + if (vers >= 9000) { + mdu_param_t param; /* not used by syscall */ + if (s->bitmap_file) { + bitmap_fd = open(s->bitmap_file, O_RDWR); + if (bitmap_fd < 0) { + int major = BITMAP_MAJOR_HI; +#if 0 + if (s->bitmap_chunk == UnSet) { + pr_err("%s cannot be openned.", + s->bitmap_file); + goto abort; + } +#endif + if (vers < 9003) { + major = BITMAP_MAJOR_HOSTENDIAN; +#ifdef __BIG_ENDIAN + pr_err("Warning - bitmaps created on this kernel are not portable\n" + " between different architectures. 
Consider upgrading the Linux kernel.\n"); +#endif + } + bitmapsize = s->size>>9; /* FIXME wrong for RAID10 */ + if (CreateBitmap(s->bitmap_file, 1, NULL, s->bitmap_chunk, + c->delay, s->write_behind, bitmapsize, major)) { + goto abort; + } + bitmap_fd = open(s->bitmap_file, O_RDWR); + if (bitmap_fd < 0) { + pr_err("%s cannot be openned.", + s->bitmap_file); + goto abort; + } + } + if (bitmap_fd >= 0) { + if (ioctl(mdfd, SET_BITMAP_FILE, bitmap_fd) < 0) { + pr_err("Cannot set bitmap file for %s: %s\n", + mddev, strerror(errno)); + goto abort; + } + } + } + if (ioctl(mdfd, RUN_ARRAY, ¶m)) { + pr_err("RUN_ARRAY failed: %s\n", + strerror(errno)); + if (s->chunk & (s->chunk-1)) { + cont_err("Problem may be that chunk size is not a power of 2\n"); + } + goto abort; + } + } else { + unsigned long arg; + arg=0; + while (s->chunk > 4096) { + arg++; + s->chunk >>= 1; + } + if (s->level == 0) + arg |= 0x20000; + else + arg |= 0x10000; + if (ioctl(mdfd, START_MD, arg)) { + pr_err("START_MD failed: %s\n", + strerror(errno)); + goto abort; + } + } + if (c->verbose >= 0) + pr_err("array %s built and started.\n", + mddev); + wait_for(mddev, mdfd); + close(mdfd); + return 0; + + abort: + if (vers >= 9000) + ioctl(mdfd, STOP_ARRAY, 0); + else + ioctl(mdfd, STOP_MD, 0); + close(mdfd); + return 1; +} diff --git a/COPYING b/COPYING new file mode 100644 index 00000000..d159169d --- /dev/null +++ b/COPYING @@ -0,0 +1,339 @@ + GNU GENERAL PUBLIC LICENSE + Version 2, June 1991 + + Copyright (C) 1989, 1991 Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The licenses for most software are designed to take away your +freedom to share and change it. 
By contrast, the GNU General Public +License is intended to guarantee your freedom to share and change free +software--to make sure the software is free for all its users. This +General Public License applies to most of the Free Software +Foundation's software and to any other program whose authors commit to +using it. (Some other Free Software Foundation software is covered by +the GNU Lesser General Public License instead.) You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +this service if you wish), that you receive source code or can get it +if you want it, that you can change the software or use pieces of it +in new free programs; and that you know you can do these things. + + To protect your rights, we need to make restrictions that forbid +anyone to deny you these rights or to ask you to surrender the rights. +These restrictions translate to certain responsibilities for you if you +distribute copies of the software, or if you modify it. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must give the recipients all the rights that +you have. You must make sure that they, too, receive or can get the +source code. And you must show them these terms so they know their +rights. + + We protect your rights with two steps: (1) copyright the software, and +(2) offer you this license which gives you legal permission to copy, +distribute and/or modify the software. + + Also, for each author's protection and ours, we want to make certain +that everyone understands that there is no warranty for this free +software. 
If the software is modified by someone else and passed on, we +want its recipients to know that what they have is not the original, so +that any problems introduced by others will not reflect on the original +authors' reputations. + + Finally, any free program is threatened constantly by software +patents. We wish to avoid the danger that redistributors of a free +program will individually obtain patent licenses, in effect making the +program proprietary. To prevent this, we have made it clear that any +patent must be licensed for everyone's free use or not licensed at all. + + The precise terms and conditions for copying, distribution and +modification follow. + + GNU GENERAL PUBLIC LICENSE + TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION + + 0. This License applies to any program or other work which contains +a notice placed by the copyright holder saying it may be distributed +under the terms of this General Public License. The "Program", below, +refers to any such program or work, and a "work based on the Program" +means either the Program or any derivative work under copyright law: +that is to say, a work containing the Program or a portion of it, +either verbatim or with modifications and/or translated into another +language. (Hereinafter, translation is included without limitation in +the term "modification".) Each licensee is addressed as "you". + +Activities other than copying, distribution and modification are not +covered by this License; they are outside its scope. The act of +running the Program is not restricted, and the output from the Program +is covered only if its contents constitute a work based on the +Program (independent of having been made by running the Program). +Whether that is true depends on what the Program does. + + 1. 
You may copy and distribute verbatim copies of the Program's +source code as you receive it, in any medium, provided that you +conspicuously and appropriately publish on each copy an appropriate +copyright notice and disclaimer of warranty; keep intact all the +notices that refer to this License and to the absence of any warranty; +and give any other recipients of the Program a copy of this License +along with the Program. + +You may charge a fee for the physical act of transferring a copy, and +you may at your option offer warranty protection in exchange for a fee. + + 2. You may modify your copy or copies of the Program or any portion +of it, thus forming a work based on the Program, and copy and +distribute such modifications or work under the terms of Section 1 +above, provided that you also meet all of these conditions: + + a) You must cause the modified files to carry prominent notices + stating that you changed the files and the date of any change. + + b) You must cause any work that you distribute or publish, that in + whole or in part contains or is derived from the Program or any + part thereof, to be licensed as a whole at no charge to all third + parties under the terms of this License. + + c) If the modified program normally reads commands interactively + when run, you must cause it, when started running for such + interactive use in the most ordinary way, to print or display an + announcement including an appropriate copyright notice and a + notice that there is no warranty (or else, saying that you provide + a warranty) and that users may redistribute the program under + these conditions, and telling the user how to view a copy of this + License. (Exception: if the Program itself is interactive but + does not normally print such an announcement, your work based on + the Program is not required to print an announcement.) + +These requirements apply to the modified work as a whole. 
If +identifiable sections of that work are not derived from the Program, +and can be reasonably considered independent and separate works in +themselves, then this License, and its terms, do not apply to those +sections when you distribute them as separate works. But when you +distribute the same sections as part of a whole which is a work based +on the Program, the distribution of the whole must be on the terms of +this License, whose permissions for other licensees extend to the +entire whole, and thus to each and every part regardless of who wrote it. + +Thus, it is not the intent of this section to claim rights or contest +your rights to work written entirely by you; rather, the intent is to +exercise the right to control the distribution of derivative or +collective works based on the Program. + +In addition, mere aggregation of another work not based on the Program +with the Program (or with a work based on the Program) on a volume of +a storage or distribution medium does not bring the other work under +the scope of this License. + + 3. You may copy and distribute the Program (or a work based on it, +under Section 2) in object code or executable form under the terms of +Sections 1 and 2 above provided that you also do one of the following: + + a) Accompany it with the complete corresponding machine-readable + source code, which must be distributed under the terms of Sections + 1 and 2 above on a medium customarily used for software interchange; or, + + b) Accompany it with a written offer, valid for at least three + years, to give any third party, for a charge no more than your + cost of physically performing source distribution, a complete + machine-readable copy of the corresponding source code, to be + distributed under the terms of Sections 1 and 2 above on a medium + customarily used for software interchange; or, + + c) Accompany it with the information you received as to the offer + to distribute corresponding source code. 
(This alternative is + allowed only for noncommercial distribution and only if you + received the program in object code or executable form with such + an offer, in accord with Subsection b above.) + +The source code for a work means the preferred form of the work for +making modifications to it. For an executable work, complete source +code means all the source code for all modules it contains, plus any +associated interface definition files, plus the scripts used to +control compilation and installation of the executable. However, as a +special exception, the source code distributed need not include +anything that is normally distributed (in either source or binary +form) with the major components (compiler, kernel, and so on) of the +operating system on which the executable runs, unless that component +itself accompanies the executable. + +If distribution of executable or object code is made by offering +access to copy from a designated place, then offering equivalent +access to copy the source code from the same place counts as +distribution of the source code, even though third parties are not +compelled to copy the source along with the object code. + + 4. You may not copy, modify, sublicense, or distribute the Program +except as expressly provided under this License. Any attempt +otherwise to copy, modify, sublicense or distribute the Program is +void, and will automatically terminate your rights under this License. +However, parties who have received copies, or rights, from you under +this License will not have their licenses terminated so long as such +parties remain in full compliance. + + 5. You are not required to accept this License, since you have not +signed it. However, nothing else grants you permission to modify or +distribute the Program or its derivative works. These actions are +prohibited by law if you do not accept this License. 
Therefore, by +modifying or distributing the Program (or any work based on the +Program), you indicate your acceptance of this License to do so, and +all its terms and conditions for copying, distributing or modifying +the Program or works based on it. + + 6. Each time you redistribute the Program (or any work based on the +Program), the recipient automatically receives a license from the +original licensor to copy, distribute or modify the Program subject to +these terms and conditions. You may not impose any further +restrictions on the recipients' exercise of the rights granted herein. +You are not responsible for enforcing compliance by third parties to +this License. + + 7. If, as a consequence of a court judgment or allegation of patent +infringement or for any other reason (not limited to patent issues), +conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot +distribute so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you +may not distribute the Program at all. For example, if a patent +license would not permit royalty-free redistribution of the Program by +all those who receive copies directly or indirectly through you, then +the only way you could satisfy both it and this License would be to +refrain entirely from distribution of the Program. + +If any portion of this section is held invalid or unenforceable under +any particular circumstance, the balance of the section is intended to +apply and the section as a whole is intended to apply in other +circumstances. 
+ +It is not the purpose of this section to induce you to infringe any +patents or other property right claims or to contest validity of any +such claims; this section has the sole purpose of protecting the +integrity of the free software distribution system, which is +implemented by public license practices. Many people have made +generous contributions to the wide range of software distributed +through that system in reliance on consistent application of that +system; it is up to the author/donor to decide if he or she is willing +to distribute software through any other system and a licensee cannot +impose that choice. + +This section is intended to make thoroughly clear what is believed to +be a consequence of the rest of this License. + + 8. If the distribution and/or use of the Program is restricted in +certain countries either by patents or by copyrighted interfaces, the +original copyright holder who places the Program under this License +may add an explicit geographical distribution limitation excluding +those countries, so that distribution is permitted only in or among +countries not thus excluded. In such case, this License incorporates +the limitation as if written in the body of this License. + + 9. The Free Software Foundation may publish revised and/or new versions +of the General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + +Each version is given a distinguishing version number. If the Program +specifies a version number of this License which applies to it and "any +later version", you have the option of following the terms and conditions +either of that version or of any later version published by the Free +Software Foundation. If the Program does not specify a version number of +this License, you may choose any version ever published by the Free Software +Foundation. + + 10. 
If you wish to incorporate parts of the Program into other free +programs whose distribution conditions are different, write to the author +to ask for permission. For software which is copyrighted by the Free +Software Foundation, write to the Free Software Foundation; we sometimes +make exceptions for this. Our decision will be guided by the two goals +of preserving the free status of all derivatives of our free software and +of promoting the sharing and reuse of software generally. + + NO WARRANTY + + 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY +FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN +OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES +PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED +OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS +TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE +PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, +REPAIR OR CORRECTION. + + 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR +REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, +INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING +OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED +TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY +YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER +PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE +POSSIBILITY OF SUCH DAMAGES. 
+ + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +convey the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. + + + Copyright (C) + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + +Also add information on how to contact you by electronic and paper mail. + +If the program is interactive, make it output a short notice like this +when it starts in an interactive mode: + + Gnomovision version 69, Copyright (C) year name of author + Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. + This is free software, and you are welcome to redistribute it + under certain conditions; type `show c' for details. + +The hypothetical commands `show w' and `show c' should show the appropriate +parts of the General Public License. 
Of course, the commands you use may +be called something other than `show w' and `show c'; they could even be +mouse-clicks or menu items--whatever suits your program. + +You should also get your employer (if you work as a programmer) or your +school, if any, to sign a "copyright disclaimer" for the program, if +necessary. Here is a sample; alter the names: + + Yoyodyne, Inc., hereby disclaims all copyright interest in the program + `Gnomovision' (which makes passes at compilers) written by James Hacker. + + , 1 April 1989 + Ty Coon, President of Vice + +This General Public License does not permit incorporating your program into +proprietary programs. If your program is a subroutine library, you may +consider it more useful to permit linking proprietary applications with the +library. If this is what you want to do, use the GNU Lesser General +Public License instead of this License. diff --git a/ChangeLog b/ChangeLog new file mode 100644 index 00000000..a3bf7007 --- /dev/null +++ b/ChangeLog @@ -0,0 +1,306 @@ +Please see git logs for detailed change log. +This file just contains highlight. + +Changes Prior to release 3.3 +- Some array reshapes can proceed without needing backup file. + This is done by changing the 'data_offset' so we never need to write + any data back over where it was before. If there is no "head space" + or "tail space" to allow data_offset to change, the old mechanism + with a backup file can still be used. +- RAID10 arrays can be reshaped to change the number of devices, + change the chunk size, or change the layout between 'near' + and 'offset'. + This will always change data_offset, and will fail if there is no + room for data_offset to be moved. +- "--assemble --update=metadata" can convert a 0.90 array to a 1.0 array. +- bad-block-logs are supported (but not heavily tested yet) +- "--assemble --update=revert-reshape" can be used to undo a reshape + that has just been started but isn't really wanted. 
This is very + new and while it passes basic tests it cannot be guaranteed. +- improved locking between --incremental and --assemble +- uses systemd to run "mdmon" if systemd is configured to do that. +- kernel names of md devices can be non-numeric. e.g. "md_home" rather than + "md0". This will probably confuse lots of other tools, so you need to + echo CREATE names=yes >> /etc/mdadm.conf + or the feature will not be used. (you also need a reasonably new kernel). +- "--stop" can be given a kernel name instead of a device name. i.e. + mdadm --stop md4 + will work even if /dev/md4 doesn't exist. +- "--detail --export" has some information about the devices in the array +- --dump and --restore can be used to back up and restore the metadata on an + array. +- Hot-replace is supported with + mdadm /dev/mdX --replace /dev/foo + and + mdadm /dev/mdX --replace /dev/foo --with /dev/bar +- Config file can be a directory in which case all "*.conf" files are + read in lexical order. + Default is to read /etc/mdadm.conf and then /etc/mdadm.conf.d + Thus + echo CREATE names=yes > /etc/mdadm.conf.d/names.conf + will also enable the use of named md devices. + +- Lots of improvements to DDF support including adding support for + RAID10 (thanks Martin Wilck). + +Changes Prior to release 3.2.6 + - There are no real stand-out fixes, just lots of little bits and pieces. + +Changes Prior to release 3.2.5 + - This release primarily fixes a serious regression in 3.2.4. + This regression does *not* cause any risk to data. It simply + means that adding a device with "--add" would sometimes fail + when it should not. + + - The fix also includes a couple of minor fixes such as making + the "--layout=preserve" option to "--grow" work again. + + +Changes Prior to release 3.2.4 +"--oneline" log of changes is below. Some notable ones are: + + - --offroot argument to improve interactions between mdmon and initrd + - --prefer argument to select which /dev names to display in some + circumstances.
+ - relax restrictions on when "--add" will be allowed + - Fix bug with adding write-intent-bitmap to active array + - Now defaults to "/run/mdadm" for storing run-time files. + +Changes Prior to release 3.2.3 + - The largest single area of change is support for reshape of Intel + IMSM arrays (OnLine Capacity Expansion and Level Migration). + - Among other fixes, this now has a better chance of surviving if a + device fails during reshape. + +Changes Prior to release 3.2.2 + - reshaping IMSM (Intel metadata) arrays is no longer 'experimental', + it should work properly and be largely compatible with IMSM drivers in + other platforms. + - --assume-clean can be used with --grow --size to avoid resyncing the + new part of the array. This is only supported with very new kernels. + - RAID0 arrays can have chunksize which is not a power of 2. This has been + supported in the kernel for a while but is only now supported by + mdadm. + + - A new tool 'raid6check' is available which can check a RAID6 array, + or part of it, and report which device is most inconsistent with the + others if any stripe is inconsistent. This is still under development + and does not have a man page yet. If anyone tries it out and has any + questions or experience to report, they would be most welcome on + linux-raid@vger.kernel.org. + +Changes Prior to release 3.2.1 + - policy framework + Policy can be expressed for moving spare devices between arrays, and + for how to handle hot-plugged devices. This policy can be different + for devices plugged in to different controllers etc. + This, for example, allows a configuration where when a device is plugged + in it is immediately included in an md array as a hot spare and + possibly starts recovery immediately if an array is degraded. + + - some understanding of mbr and gpt partition tables + This is primarily to support the new hot-plug support.
If a + device is plugged in and policy suggests it should have a partition table, + the partition table will be copied from a suitably similar device, and + then the partitions will hot-plug and can then be added to md arrays. + + - "--incremental --remove" can remember where a device was removed from + so if a device gets plugged back in the same place, special policy applies + to it, allowing it to be included in an array even if a general hotplug + will not be included. + + - enhanced reshape options, including growing a RAID0 by converting to RAID4, + restriping, and converting back. Also conversions between RAID0 and + RAID10 and between RAID1 and RAID10 are possible (with a suitably recent + kernel). + + - spare migration for IMSM arrays. + Spare migration can now work across 'containers' using non-native metadata + and specifically Intel's IMSM arrays support spare migrations. + + - OLCE and level migration for Intel IMSM arrays. + OnLine Capacity Expansion and level migration (e.g. RAID0 -> RAID5) is + supported for Intel Matrix Storage Manager arrays. + This support is currently 'experimental' for technical reasons. It can + be enabled with "export MDADM_EXPERIMENTAL=1" + + - avoid including wayward devices + If you split a RAID1, mount the two halves as two separate degraded RAID1s, + and then later bring the two back together, it is possible that the md + metadata won't properly show that one must over-ride the other. + mdadm now does extra checking to detect this possibility and avoid + potentially corrupting data. + + - remove any possible confusion between similar options. + e.g. --brief and --bitmap were mapped to 'b' and mdadm wouldn't + notice if one was used where the other was expected. + + - allow K,M,G suffixes on chunk sizes + +Changes Prior to release 3.2 + - By far the most significant change in this release relates to the + management of reshaping arrays.
This code has been substantially + re-written so that it can work with 'externally managed metadata' - + Intel's IMSM in particular. We now support level migration and + OnLine Capacity Expansion on these arrays. + - Policy framework. + Various policy statements can be made in the mdadm.conf to guide + the behaviour of mdadm, particularly with regard to how new devices + are treated by "mdadm -I". + Depending on the 'action' associated with a device (identified by + its 'path') such new devices can be automatically re-added to an + existing array that they previously fell out of, or automatically + added as a spare if they appear to contain no data. + + - mdadm now has a limited understanding of partition tables. This + allows the policy framework to make decisions about partitioned + devices as well. + + - --incremental --remove can be told what --path the device was on, + and this info will be recorded so that another device appearing at + the same physical location can be preferentially added to the same + array (provided the spare-same-slot action policy applies to the + path). + + - A new "--invalid-backup" flag is available in --assemble + mode. This can be used to re-assemble an array which was stopped + in the middle of a reshape, and for which the 'backup file' is no + longer available or is corrupted. The array may have some + corruption in it at the point where reshape was up to, but at least + the rest of the array will become available. + + + - Various internal restructuring - more is needed. + +Changes Prior to release 3.1.5 + - Fixes for v1.x metadata on big-endian machines. + - man page improvements + - Improve '--detail --export' when run on partitions of an md array. + - Fix regression with removing 'failed' or 'detached' devices. + - Fixes for "--assemble --force" in various unusual cases. + - Allow '-Y' to mean --export. This was documented but not implemented. + - Various fixes for handling 'ddf' metadata.
This is now more reliable + but could benefit from more interoperability testing. + - Correctly list subarrays of a container in "--detail" output. + - Improve checks on whether the requested number of devices is supported + by the metadata - both for --create and --grow. + - Don't remove partitions from a device that is being included in an + array until we are fully committed to including it. + - Allow "--assemble --update=no-bitmap" so an array with a corrupt + bitmap can still be assembled. + - Don't allow --add to succeed if it looks like a "--re-add" is probably + wanted, but cannot succeed. This avoids inadvertently turning + devices into spares when an array is failed. + +Changes Prior to release 3.1.4 + Two fixes related to configs that aren't using udev: + - Don't remove md devices with 'standard' names on --stop + - Allow dev_open to work on read-only /dev + And fixed regressions: + - Allow --incremental to add spares to an array + - Accept --no-degraded as a deprecated option rather than + throwing an error + - Return correct success status when --incremental assembling + a container which does not yet have enough devices. + - Don't link mdadm with pthreads, only mdmon needs it. + - Fix compiler warning due to bad use of snprintf + +Changes Prior to release 3.1.3 + - mapfile now lives in a fixed location which defaults to + /dev/.mdadm/map but can be changed at compile time. This + location is chosen as most distros provide it during early + boot and preserve it through. As long as /dev exists and is + writable, /dev/.mdadm will be created. + Other files for communication with mdmon live here too. + This fixes a bug reported by Debian and Gentoo users where + udev would spin in early-boot. + - IMSM and DDF metadata will not be recognised on partitions + as they should only be used on whole-disks. + - Various overflows caused by 2G drives have been addressed. + - A subarray of an IMSM container can now be killed with + --kill-subarray.
Also subarrays can be renamed with + --update-subarray + - -If (or --incremental --fail) can be used from udev to + fail and remove from all arrays a device which has been + unplugged from the system. i.e. hot-unplug-support. + - "mdadm /dev/mdX --re-add missing" will look for any device + that looks like it should be a member of /dev/mdX but isn't + and will automatically --re-add it + - Now compile with -Wextra to get extra warnings. + - Lots of minor bug fixes, documentation improvements, etc. + +Changes Prior to release 3.1.2 + - The default metadata has changed again (sorry about that). + It is now v1.2 and will hopefully stay that way. It turned + out there were boot-block issues with v1.1 which make it + unsuitable for a default, though in many cases it is still + suitable to use. + - Stopping a container is not permitted when members are still + active + - Add 'homehost' to the valid words for the "AUTO" config file + line. When followed by "-all", this causes mdadm to + auto-assemble any array belonging to this host, but not + auto-assemble anything else. + - Fix some bugs with "--grow --chunksize=" for changing chunksize. + - VAR_RUN can be easily changed at compile time just like ALT_RUN. + This gives distros more flexibility in how to manage the + pid and sock files that mdmon needs. + - Various mdmon fixes + - Always make bitmap 4K-aligned if at all possible. + - If mdadm.conf lists arrays which have inter-dependencies, + they previously had to be listed in the "right" order. Now + any order should work. + - Fix --force assembly of v1.x arrays which are in the process + of recovering. + - Add section on 'scrubbing' to 'md' man page. + - Various command-line-option parsing improvements. + - ... and lots of other bug fixes. + +Changes Prior to release 3.1.1 + - Multiple fixes for new --grow levels including fixes for + serious data corruption problems.
+ - Change default metadata to v1.1 + - Change default chunk size to 512K + - Change default bitmap chunk size to 64Meg + - When --re-add is used, don't fall back to + --add if --re-add fails as this can destroy data. + +Changes Prior to release 3.1 + - Support --grow to change the layout of RAID4/5/6 + - Support --grow to change the chunksize of raid 4/5/6 + - Support --grow to change level from RAID1 -> RAID5 -> RAID6 and + back. + - Support --grow to reduce the number of devices in RAID4/5/6. + - Support restart of these grow options when assembling an array + which is partially grown. + - Assorted tests of this code, and of different RAID6 layouts. + +Changes Prior to release 3.0.3 + - Improvements for creating arrays giving just a name, like 'foo', + rather than the full '/dev/md/foo'. + - Improvements for assembling member arrays of containers. + - Improvements to test suite + - Add option to change increment for RebuildNN messages reported + by "mdadm --monitor" + - Improvements to mdmon 'hand-over' from initrd to final root. + - Handle merging of devices that have left an IMSM array and are + being re-incorporated. + - Add missing space in "--detail --brief" output. + +Changes Prior to release 3.0.2 + - Fix crash when homehost is not set, as often happens in + early boot. + +Changes Prior to release 3.0.1 + - Fix various segfaults + - Fixes for --examine with containers + - Lots of other little fixes. + +Changes Prior to release 3.0 + - Support for externally managed metadata, specifically DDF and IMSM. + - Depend on udev to create entries in /dev, rather than creating them + ourselves. + - remove --auto-update-home-hosts + - new config file line "auto" + - new "<ignore>" and "any" options for "homehost" + - numerous bug fixes and minor enhancements. diff --git a/Create.c b/Create.c new file mode 100644 index 00000000..1e4a6ee0 --- /dev/null +++ b/Create.c @@ -0,0 +1,1071 @@ +/* + * mdadm - manage Linux "md" devices aka RAID arrays.
+ * + * Copyright (C) 2001-2013 Neil Brown + * + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Author: Neil Brown + * Email: + */ + +#include "mdadm.h" +#include "md_u.h" +#include "md_p.h" +#include /* NOTE(review): the header name is missing from this include - it appears to have been lost when the patch text was extracted; isdigit() is called further down in Create(), so presumably ctype.h - confirm against upstream */ +/* + * default_layout - choose the layout for an array when none was requested. + * + * st: metadata handler, may be NULL. When it provides a default_geometry + * hook, that hook is asked first. + * level: RAID level the layout is wanted for (a local copy; the hook is + * handed its address and may change it). + * verbose: when greater than 0, the fallback that was chosen is reported + * with pr_err(). + * + * If the hook leaves the layout UnSet, the fallback depends on level: + * 10 gives 0x102, 5 and 6 give the r5layout entry named default, + * LEVEL_FAULTY gives the faultylayout entry named default, and every + * other level gives 0. + * + * Returns the selected layout value. + */ +static int default_layout(struct supertype *st, int level, int verbose) +{ + int layout = UnSet; + + if (st && st->ss->default_geometry) + st->ss->default_geometry(st, &level, &layout, NULL); /* hook receives pointers to both level and layout */ + + if (layout == UnSet) + switch(level) { + default: /* no layout */ + layout = 0; + break; + case 10: + layout = 0x102; /* near=2, far=1 */ + if (verbose > 0) + pr_err("layout defaults to n2\n"); + break; + case 5: + case 6: + layout = map_name(r5layout, "default"); + if (verbose > 0) + pr_err("layout defaults to %s\n", map_num(r5layout, layout)); + break; + case LEVEL_FAULTY: + layout = map_name(faultylayout, "default"); + + if (verbose > 0) + pr_err("layout defaults to %s\n", map_num(faultylayout, layout)); + break; + } + + return layout; +} +/* + * Create - create a new md array on mddev from the devices in devlist. + * + * st: metadata handler; when NULL one is taken from + * conf_get_create_info() or found by trying each superlist entry. + * mddev, name: handed to create_mddev(); when name is NULL or empty an + * array name is derived from the chosen device path. + * uuid: passed through to the init_super hook. + * subdevs: number of entries in devlist (entries named missing count). + * devlist: the component devices, in slot order. + * s: requested shape (level, layout, chunk, size, raiddisks, + * sparedisks, bitmap_file, ...); unset fields get defaults here. + * c: run-time context (verbose, runstop, force, readonly, ...). + * data_offset: offset applied to every device, or VARIABLE_OFFSET to parse + * a per-device offset from a colon suffix on the device name. + * + * Returns 0 on success and 1 on failure; several device-open and + * geometry failures call exit(2) instead of returning. + */ +int Create(struct supertype *st, char *mddev, + char *name, int *uuid, + int subdevs, struct mddev_dev *devlist, + struct shape *s, + struct context *c, unsigned long long data_offset) +{ + /* + * Create a new raid array. + * + * First check that necessary details are available + * (i.e.
level, raid-disks) + * + * Then check each disk to see what might be on it + * and report anything interesting. + * + * If anything looks odd, and runstop not set, + * abort. + * + * SET_ARRAY_INFO and ADD_NEW_DISK, and + * if runstop==run, or raiddisks disks were used, + * RUN_ARRAY + */ + int mdfd; + unsigned long long minsize=0, maxsize=0; + char *mindisc = NULL; + char *maxdisc = NULL; + int dnum, raid_disk_num; + struct mddev_dev *dv; + int fail=0, warn=0; + struct stat stb; + int first_missing = subdevs * 2; + int second_missing = subdevs * 2; + int missing_disks = 0; + int insert_point = subdevs * 2; /* where to insert a missing drive */ + int total_slots; + int pass; + int vers; + int rv; + int bitmap_fd; + int have_container = 0; + int container_fd = -1; + int need_mdmon = 0; + unsigned long long bitmapsize; + struct mdinfo info, *infos; + int did_default = 0; + int do_default_layout = 0; + int do_default_chunk = 0; + unsigned long safe_mode_delay = 0; + char chosen_name[1024]; + struct map_ent *map = NULL; + unsigned long long newsize; + + int major_num = BITMAP_MAJOR_HI; + if (s->bitmap_file && strcmp(s->bitmap_file, "clustered") == 0) + major_num = BITMAP_MAJOR_CLUSTERED; + + memset(&info, 0, sizeof(info)); + if (s->level == UnSet && st && st->ss->default_geometry) + st->ss->default_geometry(st, &s->level, NULL, NULL); + if (s->level == UnSet) { + pr_err("a RAID level is needed to create an array.\n"); + return 1; + } + if (s->raiddisks < 4 && s->level == 6) { + pr_err("at least 4 raid-devices needed for level 6\n"); + return 1; + } + if (s->raiddisks > 256 && s->level == 6) { + pr_err("no more than 256 raid-devices supported for level 6\n"); + return 1; + } + if (s->raiddisks < 2 && s->level >= 4) { + pr_err("at least 2 raid-devices needed for level 4 or 5\n"); + return 1; + } + if (s->level <= 0 && s->sparedisks) { + pr_err("This level does not support spare devices\n"); + return 1; + } + + if (subdevs == 1 && strcmp(devlist->devname, "missing") != 0) 
{ + /* If given a single device, it might be a container, and we can + * extract a device list from there + */ + mdu_array_info_t inf; + int fd; + + memset(&inf, 0, sizeof(inf)); + fd = open(devlist->devname, O_RDONLY); + if (fd >= 0 && + ioctl(fd, GET_ARRAY_INFO, &inf) == 0 && + inf.raid_disks == 0) { + /* yep, looks like a container */ + if (st) { + rv = st->ss->load_container(st, fd, + devlist->devname); + if (rv == 0) + have_container = 1; + } else { + st = super_by_fd(fd, NULL); + if (st && !(rv = st->ss-> + load_container(st, fd, + devlist->devname))) + have_container = 1; + else + st = NULL; + } + if (have_container) { + subdevs = s->raiddisks; + first_missing = subdevs * 2; + second_missing = subdevs * 2; + insert_point = subdevs * 2; + } + } + if (fd >= 0) + close(fd); + } + if (st && st->ss->external && s->sparedisks) { + pr_err("This metadata type does not support spare disks at create time\n"); + return 1; + } + if (subdevs > s->raiddisks+s->sparedisks+s->journaldisks) { + pr_err("You have listed more devices (%d) than are in the array(%d)!\n", subdevs, s->raiddisks+s->sparedisks); + return 1; + } + if (!have_container && subdevs < s->raiddisks+s->sparedisks+s->journaldisks) { + pr_err("You haven't given enough devices (real or missing) to create this array\n"); + return 1; + } + if (s->bitmap_file && s->level <= 0) { + pr_err("bitmaps not meaningful with level %s\n", + map_num(pers, s->level)?:"given"); + return 1; + } + + /* now set some defaults */ + + if (s->layout == UnSet) { + do_default_layout = 1; + s->layout = default_layout(st, s->level, c->verbose); + } + + if (s->level == 10) + /* check layout fits in array*/ + if ((s->layout&255) * ((s->layout>>8)&255) > s->raiddisks) { + pr_err("that layout requires at least %d devices\n", + (s->layout&255) * ((s->layout>>8)&255)); + return 1; + } + + switch(s->level) { + case 4: + case 5: + case 10: + case 6: + case 0: + if (s->chunk == 0 || s->chunk == UnSet) { + s->chunk = UnSet; + do_default_chunk = 1; 
+ /* chunk will be set later */ + } + break; + case LEVEL_LINEAR: + /* a chunksize of zero 0s perfectly valid (and preferred) since 2.6.16 */ + if (get_linux_version() < 2006016 && s->chunk == 0) { + s->chunk = 64; + if (c->verbose > 0) + pr_err("chunk size defaults to 64K\n"); + } + break; + case 1: + case LEVEL_FAULTY: + case LEVEL_MULTIPATH: + case LEVEL_CONTAINER: + if (s->chunk) { + s->chunk = 0; + if (c->verbose > 0) + pr_err("chunk size ignored for this level\n"); + } + break; + default: + pr_err("unknown level %d\n", s->level); + return 1; + } + if (s->size == MAX_SIZE) + /* use '0' to mean 'max' now... */ + s->size = 0; + if (s->size && s->chunk && s->chunk != UnSet) + s->size &= ~(unsigned long long)(s->chunk - 1); + newsize = s->size * 2; + if (st && ! st->ss->validate_geometry(st, s->level, s->layout, s->raiddisks, + &s->chunk, s->size*2, + data_offset, NULL, + &newsize, c->verbose>=0)) + return 1; + + if (s->chunk && s->chunk != UnSet) { + newsize &= ~(unsigned long long)(s->chunk*2 - 1); + if (do_default_chunk) { + /* default chunk was just set */ + if (c->verbose > 0) + pr_err("chunk size defaults to %dK\n", s->chunk); + s->size &= ~(unsigned long long)(s->chunk - 1); + do_default_chunk = 0; + } + } + + if (s->size == 0) { + s->size = newsize / 2; + if (s->level == 1) + /* If this is ever reshaped to RAID5, we will + * need a chunksize. 
So round it off a bit + * now just to be safe + */ + s->size &= ~(64ULL-1); + + if (s->size && c->verbose > 0) + pr_err("setting size to %lluK\n", s->size); + } + + /* now look at the subdevs */ + info.array.active_disks = 0; + info.array.working_disks = 0; + dnum = 0; + for (dv = devlist; dv ; dv = dv->next) + if (data_offset == VARIABLE_OFFSET) + dv->data_offset = INVALID_SECTORS; + else + dv->data_offset = data_offset; + + for (dv=devlist; dv && !have_container; dv=dv->next, dnum++) { + char *dname = dv->devname; + unsigned long long freesize; + int dfd; + char *doff; + + if (strcasecmp(dname, "missing")==0) { + if (first_missing > dnum) + first_missing = dnum; + if (second_missing > dnum && dnum > first_missing) + second_missing = dnum; + missing_disks ++; + continue; + } + if (data_offset == VARIABLE_OFFSET) { + doff = strchr(dname, ':'); + if (doff) { + *doff++ = 0; + dv->data_offset = parse_size(doff); + } else + dv->data_offset = INVALID_SECTORS; + } else + dv->data_offset = data_offset; + + dfd = open(dname, O_RDONLY); + if (dfd < 0) { + pr_err("cannot open %s: %s\n", + dname, strerror(errno)); + exit(2); + } + if (fstat(dfd, &stb) != 0 || + (stb.st_mode & S_IFMT) != S_IFBLK) { + close(dfd); + pr_err("%s is not a block device\n", + dname); + exit(2); + } + close(dfd); + info.array.working_disks++; + if (dnum < s->raiddisks && dv->disposition != 'j') + info.array.active_disks++; + if (st == NULL) { + struct createinfo *ci = conf_get_create_info(); + if (ci) + st = ci->supertype; + } + if (st == NULL) { + /* Need to choose a default metadata, which is different + * depending on geometry of array. 
+ */ + int i; + char *name = "default"; + for(i=0; !st && superlist[i]; i++) { + st = superlist[i]->match_metadata_desc(name); + if (!st) + continue; + if (do_default_layout) + s->layout = default_layout(st, s->level, c->verbose); + switch (st->ss->validate_geometry( + st, s->level, s->layout, s->raiddisks, + &s->chunk, s->size*2, + dv->data_offset, dname, + &freesize, c->verbose > 0)) { + case -1: /* Not valid, message printed, and not + * worth checking any further */ + exit(2); + break; + case 0: /* Geometry not valid */ + free(st); + st = NULL; + s->chunk = do_default_chunk ? UnSet : s->chunk; + break; + case 1: /* All happy */ + break; + } + } + + if (!st) { + int dfd = open(dname, O_RDONLY|O_EXCL); + if (dfd < 0) { + pr_err("cannot open %s: %s\n", + dname, strerror(errno)); + exit(2); + } + pr_err("device %s not suitable for any style of array\n", + dname); + exit(2); + } + if (st->ss != &super0 || + st->minor_version != 90) + did_default = 1; + } else { + if (do_default_layout) + s->layout = default_layout(st, s->level, 0); + if (!st->ss->validate_geometry(st, s->level, s->layout, + s->raiddisks, + &s->chunk, s->size*2, + dv->data_offset, + dname, &freesize, + c->verbose >= 0)) { + + pr_err("%s is not suitable for this array.\n", + dname); + fail = 1; + continue; + } + } + + if (dv->disposition == 'j') + goto skip_size_check; /* skip write journal for size check */ + + freesize /= 2; /* convert to K */ + if (s->chunk && s->chunk != UnSet) { + /* round to chunk size */ + freesize = freesize & ~(s->chunk-1); + if (do_default_chunk) { + /* default chunk was just set */ + if (c->verbose > 0) + pr_err("chunk size defaults to %dK\n", s->chunk); + s->size &= ~(unsigned long long)(s->chunk - 1); + do_default_chunk = 0; + } + } + if (!freesize) { + pr_err("no free space left on %s\n", dname); + fail = 1; + continue; + } + + if (s->size && freesize < s->size) { + pr_err("%s is smaller than given size. 
%lluK < %lluK + metadata\n", + dname, freesize, s->size); + fail = 1; + continue; + } + if (maxdisc == NULL || (maxdisc && freesize > maxsize)) { + maxdisc = dname; + maxsize = freesize; + } + if (mindisc ==NULL || (mindisc && freesize < minsize)) { + mindisc = dname; + minsize = freesize; + } + skip_size_check: + if (c->runstop != 1 || c->verbose >= 0) { + int fd = open(dname, O_RDONLY); + if (fd <0 ) { + pr_err("Cannot open %s: %s\n", + dname, strerror(errno)); + fail=1; + continue; + } + warn |= check_ext2(fd, dname); + warn |= check_reiser(fd, dname); + warn |= check_raid(fd, dname); + if (strcmp(st->ss->name, "1.x") == 0 && + st->minor_version >= 1) + /* metadata at front */ + warn |= check_partitions(fd, dname, 0, 0); + else if (s->level == 1 || s->level == LEVEL_CONTAINER + || (s->level == 0 && s->raiddisks == 1)) + /* partitions could be meaningful */ + warn |= check_partitions(fd, dname, freesize*2, s->size*2); + else + /* partitions cannot be meaningful */ + warn |= check_partitions(fd, dname, 0, 0); + if (strcmp(st->ss->name, "1.x") == 0 && + st->minor_version >= 1 && + did_default && + s->level == 1 && + (warn & 1024) == 0) { + warn |= 1024; + pr_err("Note: this array has metadata at the start and\n" + " may not be suitable as a boot device. 
If you plan to\n" + " store '/boot' on this device please ensure that\n" + " your boot-loader understands md/v1.x metadata, or use\n" + " --metadata=0.90\n"); + } + close(fd); + } + } + if (s->raiddisks + s->sparedisks > st->max_devs) { + pr_err("Too many devices: %s metadata only supports %d\n", + st->ss->name, st->max_devs); + return 1; + } + if (have_container) + info.array.working_disks = s->raiddisks; + if (fail) { + pr_err("create aborted\n"); + return 1; + } + if (s->size == 0) { + if (mindisc == NULL && !have_container) { + pr_err("no size and no drives given - aborting create.\n"); + return 1; + } + if (s->level > 0 || s->level == LEVEL_MULTIPATH + || s->level == LEVEL_FAULTY + || st->ss->external ) { + /* size is meaningful */ + if (!st->ss->validate_geometry(st, s->level, s->layout, + s->raiddisks, + &s->chunk, minsize*2, + data_offset, + NULL, NULL, 0)) { + pr_err("devices too large for RAID level %d\n", s->level); + return 1; + } + s->size = minsize; + if (s->level == 1) + /* If this is ever reshaped to RAID5, we will + * need a chunksize. 
So round it off a bit + * now just to be safe + */ + s->size &= ~(64ULL-1); + if (c->verbose > 0) + pr_err("size set to %lluK\n", s->size); + } + } + + if (!s->bitmap_file && + s->level >= 1 && + st->ss->add_internal_bitmap && + (s->write_behind || s->size > 100*1024*1024ULL)) { + if (c->verbose > 0) + pr_err("automatically enabling write-intent bitmap on large array\n"); + s->bitmap_file = "internal"; + } + if (s->bitmap_file && strcmp(s->bitmap_file, "none") == 0) + s->bitmap_file = NULL; + + if (!have_container && s->level > 0 && ((maxsize-s->size)*100 > maxsize)) { + if (c->runstop != 1 || c->verbose >= 0) + pr_err("largest drive (%s) exceeds size (%lluK) by more than 1%%\n", + maxdisc, s->size); + warn = 1; + } + + if (st->ss->detail_platform && st->ss->detail_platform(0, 1, NULL) != 0) { + if (c->runstop != 1 || c->verbose >= 0) + pr_err("%s unable to enumerate platform support\n" + " array may not be compatible with hardware/firmware\n", + st->ss->name); + warn = 1; + } + st->nodes = c->nodes; + st->cluster_name = c->homecluster; + + if (warn) { + if (c->runstop!= 1) { + if (!ask("Continue creating array? ")) { + pr_err("create aborted.\n"); + return 1; + } + } else { + if (c->verbose > 0) + pr_err("creation continuing despite oddities due to --run\n"); + } + } + + /* If this is raid4/5, we want to configure the last active slot + * as missing, so that a reconstruct happens (faster than re-parity) + * FIX: Can we do this for raid6 as well? 
+ */ + if (st->ss->external == 0 && + s->assume_clean==0 && c->force == 0 && first_missing >= s->raiddisks) { + switch ( s->level ) { + case 4: + case 5: + insert_point = s->raiddisks-1; + s->sparedisks++; + info.array.active_disks--; + missing_disks++; + break; + default: + break; + } + } + /* For raid6, if creating with 1 missing drive, make a good drive + * into a spare, else the create will fail + */ + if (s->assume_clean == 0 && c->force == 0 && first_missing < s->raiddisks && + st->ss->external == 0 && + second_missing >= s->raiddisks && s->level == 6) { + insert_point = s->raiddisks - 1; + if (insert_point == first_missing) + insert_point--; + s->sparedisks ++; + info.array.active_disks--; + missing_disks++; + } + + if (s->level <= 0 && first_missing < subdevs * 2) { + pr_err("This level does not support missing devices\n"); + return 1; + } + + /* We need to create the device */ + map_lock(&map); + mdfd = create_mddev(mddev, name, c->autof, LOCAL, chosen_name); + if (mdfd < 0) { + map_unlock(&map); + return 1; + } + /* verify if chosen_name is not in use, + * it could be in conflict with already existing device + * e.g. 
container, array + */ + if (strncmp(chosen_name, "/dev/md/", 8) == 0 + && map_by_name(&map, chosen_name+8) != NULL) { + pr_err("Array name %s is in use already.\n", + chosen_name); + close(mdfd); + map_unlock(&map); + return 1; + } + mddev = chosen_name; + + vers = md_get_version(mdfd); + if (vers < 9000) { + pr_err("Create requires md driver version 0.90.0 or later\n"); + goto abort_locked; + } else { + mdu_array_info_t inf; + memset(&inf, 0, sizeof(inf)); + ioctl(mdfd, GET_ARRAY_INFO, &inf); + if (inf.working_disks != 0) { + pr_err("another array by this name is already running.\n"); + goto abort_locked; + } + } + + /* Ok, lets try some ioctls */ + + info.array.level = s->level; + info.array.size = s->size; + info.array.raid_disks = s->raiddisks; + /* The kernel should *know* what md_minor we are dealing + * with, but it chooses to trust me instead. Sigh + */ + info.array.md_minor = 0; + if (fstat(mdfd, &stb)==0) + info.array.md_minor = minor(stb.st_rdev); + info.array.not_persistent = 0; + + if ( ( (s->level == 4 || s->level == 5) && + (insert_point < s->raiddisks || first_missing < s->raiddisks) ) + || + ( s->level == 6 && (insert_point < s->raiddisks + || second_missing < s->raiddisks)) + || + ( s->level <= 0 ) + || + s->assume_clean + ) { + info.array.state = 1; /* clean, but one+ drive will be missing*/ + info.resync_start = MaxSector; + } else { + info.array.state = 0; /* not clean, but no errors */ + info.resync_start = 0; + } + if (s->level == 10) { + /* for raid10, the bitmap size is the capacity of the array, + * which is array.size * raid_disks / ncopies; + * .. but convert to sectors. 
+ */ + int ncopies = ((s->layout>>8) & 255) * (s->layout & 255); + bitmapsize = s->size * s->raiddisks / ncopies * 2; +/* printf("bms=%llu as=%d rd=%d nc=%d\n", bitmapsize, s->size, s->raiddisks, ncopies);*/ + } else + bitmapsize = s->size * 2; + + /* There is lots of redundancy in these disk counts, + * raid_disks is the most meaningful value + * it describes the geometry of the array + * it is constant + * nr_disks is total number of used slots. + * it should be raid_disks+spare_disks + * spare_disks is the number of extra disks present + * see above + * active_disks is the number of working disks in + * active slots. (With raid_disks) + * working_disks is the total number of working disks, + * including spares + * failed_disks is the number of disks marked failed + * + * Ideally, the kernel would keep these (except raid_disks) + * up-to-date as we ADD_NEW_DISK, but it doesn't (yet). + * So for now, we assume that all raid and spare + * devices will be given. + */ + info.array.spare_disks=s->sparedisks; + info.array.failed_disks=missing_disks; + info.array.nr_disks = info.array.working_disks + + info.array.failed_disks; + info.array.layout = s->layout; + info.array.chunk_size = s->chunk*1024; + + if (name == NULL || *name == 0) { + /* base name on mddev */ + /* /dev/md0 -> 0 + * /dev/md_d0 -> d0 + * /dev/md_foo -> foo + * /dev/md/1 -> 1 + * /dev/md/d1 -> d1 + * /dev/md/home -> home + * /dev/mdhome -> home + */ + /* FIXME compare this with rules in create_mddev */ + name = strrchr(mddev, '/'); + if (name) { + name++; + if (strncmp(name, "md_", 3)==0 && + strlen(name) > 3 && + (name-mddev) == 5 /* /dev/ */) + name += 3; + else if (strncmp(name, "md", 2)==0 && + strlen(name) > 2 && + isdigit(name[2]) && + (name-mddev) == 5 /* /dev/ */) + name += 2; + } + } + if (!st->ss->init_super(st, &info.array, s->size, name, c->homehost, uuid, + data_offset)) + goto abort_locked; + + total_slots = info.array.nr_disks; + st->ss->getinfo_super(st, &info, NULL); + 
sysfs_init(&info, mdfd, NULL); + + if (did_default && c->verbose >= 0) { + if (is_subarray(info.text_version)) { + char devnm[32]; + char *ep; + struct mdinfo *mdi; + + strncpy(devnm, info.text_version+1, 32); + devnm[31] = 0; + ep = strchr(devnm, '/'); + if (ep) + *ep = 0; + + mdi = sysfs_read(-1, devnm, GET_VERSION); + + pr_err("Creating array inside %s container %s\n", + mdi?mdi->text_version:"managed", devnm); + sysfs_free(mdi); + } else + pr_err("Defaulting to version %s metadata\n", info.text_version); + } + + map_update(&map, fd2devnm(mdfd), info.text_version, + info.uuid, chosen_name); + /* Keep map locked until devices have been added to array + * to stop another mdadm from finding and using those devices. + */ + + if (s->bitmap_file && vers < 9003) { + major_num = BITMAP_MAJOR_HOSTENDIAN; +#ifdef __BIG_ENDIAN + pr_err("Warning - bitmaps created on this kernel are not portable\n" + " between different architectured. Consider upgrading the Linux kernel.\n"); +#endif + } + + if (s->bitmap_file && (strcmp(s->bitmap_file, "internal")==0 || + strcmp(s->bitmap_file, "clustered")==0)) { + if ((vers%100) < 2) { + pr_err("internal bitmaps not supported by this kernel.\n"); + goto abort_locked; + } + if (!st->ss->add_internal_bitmap) { + pr_err("internal bitmaps not supported with %s metadata\n", + st->ss->name); + goto abort_locked; + } + if (!st->ss->add_internal_bitmap(st, &s->bitmap_chunk, + c->delay, s->write_behind, + bitmapsize, 1, major_num)) { + pr_err("Given bitmap chunk size not supported.\n"); + goto abort_locked; + } + s->bitmap_file = NULL; + } + + sysfs_init(&info, mdfd, NULL); + + if (st->ss->external && st->container_devnm[0]) { + /* member */ + + /* When creating a member, we need to be careful + * to negotiate with mdmon properly. + * If it is already running, we cannot write to + * the devices and must ask it to do that part. + * If it isn't running, we write to the devices, + * and then start it. 
+ * We hold an exclusive open on the container + * device to make sure mdmon doesn't exit after + * we checked that it is running. + * + * For now, fail if it is already running. + */ + container_fd = open_dev_excl(st->container_devnm); + if (container_fd < 0) { + pr_err("Cannot get exclusive open on container - weird.\n"); + goto abort_locked; + } + if (mdmon_running(st->container_devnm)) { + if (c->verbose) + pr_err("reusing mdmon for %s.\n", + st->container_devnm); + st->update_tail = &st->updates; + } else + need_mdmon = 1; + } + rv = set_array_info(mdfd, st, &info); + if (rv) { + pr_err("failed to set array info for %s: %s\n", + mddev, strerror(errno)); + goto abort_locked; + } + + if (s->bitmap_file) { + int uuid[4]; + + st->ss->uuid_from_super(st, uuid); + if (CreateBitmap(s->bitmap_file, c->force, (char*)uuid, s->bitmap_chunk, + c->delay, s->write_behind, + bitmapsize, + major_num)) { + goto abort_locked; + } + bitmap_fd = open(s->bitmap_file, O_RDWR); + if (bitmap_fd < 0) { + pr_err("weird: %s cannot be openned\n", + s->bitmap_file); + goto abort_locked; + } + if (ioctl(mdfd, SET_BITMAP_FILE, bitmap_fd) < 0) { + pr_err("Cannot set bitmap file for %s: %s\n", + mddev, strerror(errno)); + goto abort_locked; + } + } + + infos = xmalloc(sizeof(*infos) * total_slots); + enable_fds(total_slots); + for (pass=1; pass <=2 ; pass++) { + struct mddev_dev *moved_disk = NULL; /* the disk that was moved out of the insert point */ + + for (dnum=0, raid_disk_num=0, dv = devlist ; dv ; + dv=(dv->next)?(dv->next):moved_disk, dnum++) { + int fd; + struct stat stb; + struct mdinfo *inf = &infos[dnum]; + + if (dnum >= total_slots) + abort(); + if (dnum == insert_point) { + raid_disk_num += 1; + moved_disk = dv; + continue; + } + if (strcasecmp(dv->devname, "missing")==0) { + raid_disk_num += 1; + continue; + } + if (have_container) + moved_disk = NULL; + if (have_container && dnum < info.array.raid_disks - 1) + /* repeatedly use the container */ + moved_disk = dv; + + 
switch(pass) { + case 1: + *inf = info; + + inf->disk.number = dnum; + inf->disk.raid_disk = raid_disk_num++; + + if (dv->disposition == 'j') { + inf->disk.raid_disk = MD_DISK_ROLE_JOURNAL; + inf->disk.state = (1<disk.raid_disk < s->raiddisks) + inf->disk.state = (1<disk.state = 0; + + if (dv->writemostly == 1) + inf->disk.state |= (1<ss->external && + st->container_devnm[0]) + fd = open(dv->devname, O_RDWR); + else + fd = open(dv->devname, O_RDWR|O_EXCL); + + if (fd < 0) { + pr_err("failed to open %s after earlier success - aborting\n", + dv->devname); + goto abort_locked; + } + fstat(fd, &stb); + inf->disk.major = major(stb.st_rdev); + inf->disk.minor = minor(stb.st_rdev); + } + if (fd >= 0) + remove_partitions(fd); + if (st->ss->add_to_super(st, &inf->disk, + fd, dv->devname, + dv->data_offset)) { + ioctl(mdfd, STOP_ARRAY, NULL); + goto abort_locked; + } + st->ss->getinfo_super(st, inf, NULL); + safe_mode_delay = inf->safe_mode_delay; + + if (have_container && c->verbose > 0) + pr_err("Using %s for device %d\n", + map_dev(inf->disk.major, + inf->disk.minor, + 0), dnum); + + if (!have_container) { + /* getinfo_super might have lost these ... */ + inf->disk.major = major(stb.st_rdev); + inf->disk.minor = minor(stb.st_rdev); + } + break; + case 2: + inf->errors = 0; + + rv = add_disk(mdfd, st, &info, inf); + + if (rv) { + pr_err("ADD_NEW_DISK for %s failed: %s\n", + dv->devname, strerror(errno)); + goto abort_locked; + } + break; + } + if (!have_container && + dv == moved_disk && dnum != insert_point) break; + } + if (pass == 1) { + struct mdinfo info_new; + struct map_ent *me = NULL; + + /* check to see if the uuid has changed due to these + * metadata changes, and if so update the member array + * and container uuid. Note ->write_init_super clears + * the subarray cursor such that ->getinfo_super once + * again returns container info. 
+ */ + st->ss->getinfo_super(st, &info_new, NULL); + if (st->ss->external && s->level != LEVEL_CONTAINER && + !same_uuid(info_new.uuid, info.uuid, 0)) { + map_update(&map, fd2devnm(mdfd), + info_new.text_version, + info_new.uuid, chosen_name); + me = map_by_devnm(&map, st->container_devnm); + } + + if (st->ss->write_init_super(st)) { + st->ss->free_super(st); + goto abort_locked; + } + + /* update parent container uuid */ + if (me) { + char *path = xstrdup(me->path); + + st->ss->getinfo_super(st, &info_new, NULL); + map_update(&map, st->container_devnm, + info_new.text_version, + info_new.uuid, path); + free(path); + } + + flush_metadata_updates(st); + st->ss->free_super(st); + } + } + map_unlock(&map); + free(infos); + + if (s->level == LEVEL_CONTAINER) { + /* No need to start. But we should signal udev to + * create links */ + sysfs_uevent(&info, "change"); + if (c->verbose >= 0) + pr_err("container %s prepared.\n", mddev); + wait_for(chosen_name, mdfd); + } else if (c->runstop == 1 || subdevs >= s->raiddisks) { + if (st->ss->external) { + int err; + switch(s->level) { + case LEVEL_LINEAR: + case LEVEL_MULTIPATH: + case 0: + err = sysfs_set_str(&info, NULL, "array_state", + c->readonly + ? 
"readonly" + : "active"); + need_mdmon = 0; + break; + default: + err = sysfs_set_str(&info, NULL, "array_state", + "readonly"); + break; + } + sysfs_set_safemode(&info, safe_mode_delay); + if (err) { + pr_err("failed to activate array.\n"); + ioctl(mdfd, STOP_ARRAY, NULL); + goto abort; + } + } else if (c->readonly && + sysfs_attribute_available( + &info, NULL, "array_state")) { + if (sysfs_set_str(&info, NULL, + "array_state", "readonly") < 0) { + pr_err("Failed to start array: %s\n", + strerror(errno)); + ioctl(mdfd, STOP_ARRAY, NULL); + goto abort; + } + } else { + /* param is not actually used */ + mdu_param_t param; + if (ioctl(mdfd, RUN_ARRAY, &param)) { + pr_err("RUN_ARRAY failed: %s\n", + strerror(errno)); + if (info.array.chunk_size & (info.array.chunk_size-1)) { + cont_err("Problem may be that chunk size is not a power of 2\n"); + } + ioctl(mdfd, STOP_ARRAY, NULL); + goto abort; + } + /* if start_ro module parameter is set, array is + * auto-read-only, which is bad as the resync won't + * start. So lets make it read-write now. + */ + ioctl(mdfd, RESTART_ARRAY_RW, NULL); + } + if (c->verbose >= 0) + pr_err("array %s started.\n", mddev); + if (st->ss->external && st->container_devnm[0]) { + if (need_mdmon) + start_mdmon(st->container_devnm); + + ping_monitor(st->container_devnm); + close(container_fd); + } + wait_for(chosen_name, mdfd); + } else { + pr_err("not starting array - not enough devices.\n"); + } + close(mdfd); + return 0; + + abort: + map_lock(&map); + abort_locked: + map_remove(&map, fd2devnm(mdfd)); + map_unlock(&map); + + if (mdfd >= 0) + close(mdfd); + return 1; +} diff --git a/Detail.c b/Detail.c new file mode 100644 index 00000000..0cfccadb --- /dev/null +++ b/Detail.c @@ -0,0 +1,768 @@ +/* + * mdadm - manage Linux "md" devices aka RAID arrays.
+ * + * Copyright (C) 2001-2013 Neil Brown + * + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Author: Neil Brown + * Email: + */ + +#include "mdadm.h" +#include "md_p.h" +#include "md_u.h" +#include /* NOTE(review): the header name is missing from this include - it appears to have been lost when the patch text was extracted; confirm against upstream */ +/* + * cmpstringp - compare two strings through one extra level of indirection. + * + * p1 and p2 each point to a pointer-to-char element; the two strings they + * lead to are ordered with strcmp() and its result is returned unchanged. + * NOTE(review): the signature matches a qsort() comparator and the caller + * is outside this chunk - presumably used to sort the device name list; + * confirm. + */ +static int cmpstringp(const void *p1, const void *p2) +{ + return strcmp(* (char * const *) p1, * (char * const *) p2); +} +/* + * add_device - append a copy of dev to a growable array of strings. + * + * dev: name to store; it is duplicated with xstrdup(). + * p_devices: address of the array pointer; updated when the array grows. + * p_max_devices: address of the allocated slot count; raised by 16 each + * time the array grows. + * n_devices: number of entries currently in use; the copy is stored at + * this index. + * + * The array is grown as soon as n_devices + 1 reaches *p_max_devices. + * Returns the new entry count, n_devices + 1. If xrealloc() yields NULL, + * *p_max_devices is reset to 0 and 0 is returned. + * NOTE(review): if xrealloc() aborts on allocation failure, as x-prefixed + * allocators commonly do, that NULL branch can never run - confirm. + */ +static int add_device(const char *dev, char ***p_devices, + int *p_max_devices, int n_devices) +{ + if (n_devices + 1 >= *p_max_devices) { + *p_max_devices += 16; + *p_devices = xrealloc(*p_devices, *p_max_devices * + sizeof(**p_devices)); + if (!*p_devices) { + *p_max_devices = 0; + return 0; + } + }; /* the semicolon after the brace is an empty statement with no effect */ + (*p_devices)[n_devices] = xstrdup(dev); + return n_devices + 1; +} + +int Detail(char *dev, struct context *c) +{ + /* + * Print out details for an md array by using + * GET_ARRAY_INFO and GET_DISK_INFO ioctl calls + */ + + int fd = open(dev, O_RDONLY); + int vers; + mdu_array_info_t array; + mdu_disk_info_t *disks; + int next; + int d; + time_t atime; + char *str; + char **devices = NULL; + int max_devices = 0, n_devices = 0; + int spares = 0; + struct stat stb; + int is_26 = get_linux_version() >= 2006000; + int is_rebuilding = 0; + int failed = 0; + struct supertype *st; + char *subarray = NULL; + int max_disks = MD_SB_DISKS; /* just a default */ + struct mdinfo
*info = NULL; + struct mdinfo *sra; + struct mdinfo *subdev; + char *member = NULL; + char *container = NULL; + + int rv = c->test ? 4 : 1; + int avail_disks = 0; + char *avail = NULL; + int external; + int inactive; + + if (fd < 0) { + pr_err("cannot open %s: %s\n", + dev, strerror(errno)); + return rv; + } + vers = md_get_version(fd); + if (vers < 0) { + pr_err("%s does not appear to be an md device\n", + dev); + close(fd); + return rv; + } + if (vers < 9000) { + pr_err("cannot get detail for md device %s: driver version too old.\n", + dev); + close(fd); + return rv; + } + sra = sysfs_read(fd, NULL, GET_VERSION|GET_DEVS); + external = (sra != NULL && sra->array.major_version == -1 + && sra->array.minor_version == -2); + st = super_by_fd(fd, &subarray); + if (ioctl(fd, GET_ARRAY_INFO, &array) == 0) { + inactive = 0; + } else if (errno == ENODEV && sra) { + array = sra->array; + inactive = 1; + } else { + pr_err("cannot get array detail for %s: %s\n", + dev, strerror(errno)); + close(fd); + return rv; + } + + if (fstat(fd, &stb) != 0 && !S_ISBLK(stb.st_mode)) + stb.st_rdev = 0; + rv = 0; + + if (st) + max_disks = st->max_devs; + + if (subarray) { + /* This is a subarray of some container. + * We want the name of the container, and the member + */ + int devid = devnm2devid(st->container_devnm); + int cfd, err; + + member = subarray; + container = map_dev_preferred(major(devid), minor(devid), + 1, c->prefer); + cfd = open_dev(st->container_devnm); + if (cfd >= 0) { + err = st->ss->load_container(st, cfd, NULL); + close(cfd); + if (err == 0) + info = st->ss->container_content(st, subarray); + } + } + + /* try to load a superblock. Try sra->devs first, then try ioctl */ + if (st && !info) for (d = 0, subdev = sra ? sra->devs : NULL; + d < max_disks || subdev; + subdev ? 
(void)(subdev = subdev->next) : (void)(d++)){ + mdu_disk_info_t disk; + char *dv; + int fd2; + int err; + if (subdev) + disk = subdev->disk; + else { + disk.number = d; + if (ioctl(fd, GET_DISK_INFO, &disk) < 0) + continue; + if (d >= array.raid_disks && + disk.major == 0 && + disk.minor == 0) + continue; + } + + if (array.raid_disks > 0 && + (disk.state & (1 << MD_DISK_ACTIVE)) == 0) + continue; + + dv = map_dev(disk.major, disk.minor, 1); + if (!dv) + continue; + + fd2 = dev_open(dv, O_RDONLY); + if (fd2 < 0) + continue; + + if (st->sb) + st->ss->free_super(st); + + err = st->ss->load_super(st, fd2, NULL); + close(fd2); + if (err) + continue; + if (info) + free(info); + if (subarray) + info = st->ss->container_content(st, subarray); + else { + info = xmalloc(sizeof(*info)); + st->ss->getinfo_super(st, info, NULL); + } + if (!info) + continue; + + if (array.raid_disks != 0 && /* container */ + (info->array.ctime != array.ctime || + info->array.level != array.level)) { + st->ss->free_super(st); + continue; + } + /* some formats (imsm) have free-floating-spares + * with a uuid of uuid_zero, they don't + * have very good info about the rest of the + * container, so keep searching when + * encountering such a device. Otherwise, stop + * after the first successful call to + * ->load_super. + */ + if (memcmp(uuid_zero, + info->uuid, + sizeof(uuid_zero)) == 0) { + st->ss->free_super(st); + continue; + } + break; + } + + /* Ok, we have some info to print... 
*/ + str = map_num(pers, array.level); + + if (c->export) { + if (array.raid_disks) { + if (str) + printf("MD_LEVEL=%s\n", str); + printf("MD_DEVICES=%d\n", array.raid_disks); + } else { + if (!inactive) + printf("MD_LEVEL=container\n"); + printf("MD_DEVICES=%d\n", array.nr_disks); + } + if (container) { + printf("MD_CONTAINER=%s\n", container); + printf("MD_MEMBER=%s\n", member); + } else { + if (sra && sra->array.major_version < 0) + printf("MD_METADATA=%s\n", sra->text_version); + else + printf("MD_METADATA=%d.%d\n", + array.major_version, array.minor_version); + } + + if (st && st->sb && info) { + char nbuf[64]; + struct map_ent *mp, *map = NULL; + + fname_from_uuid(st, info, nbuf, ':'); + printf("MD_UUID=%s\n", nbuf+5); + mp = map_by_uuid(&map, info->uuid); + if (mp && mp->path && + strncmp(mp->path, "/dev/md/", 8) == 0) { + printf("MD_DEVNAME="); + print_escape(mp->path+8); + putchar('\n'); + } + + if (st->ss->export_detail_super) + st->ss->export_detail_super(st); + } else { + struct map_ent *mp, *map = NULL; + char nbuf[64]; + mp = map_by_devnm(&map, fd2devnm(fd)); + if (mp) { + __fname_from_uuid(mp->uuid, 0, nbuf, ':'); + printf("MD_UUID=%s\n", nbuf+5); + } + if (mp && mp->path && + strncmp(mp->path, "/dev/md/", 8) == 0) { + printf("MD_DEVNAME="); + print_escape(mp->path+8); + putchar('\n'); + } + } + if (sra) { + struct mdinfo *mdi; + for (mdi = sra->devs; mdi; mdi = mdi->next) { + char *path = + map_dev(mdi->disk.major, + mdi->disk.minor, 0); + + if (mdi->disk.raid_disk >= 0) + printf("MD_DEVICE_%s_ROLE=%d\n", + mdi->sys_name+4, + mdi->disk.raid_disk); + else + printf("MD_DEVICE_%s_ROLE=spare\n", + mdi->sys_name+4); + if (path) + printf("MD_DEVICE_%s_DEV=%s\n", + mdi->sys_name+4, path); + } + } + goto out; + } + + disks = xmalloc(max_disks * 2 * sizeof(mdu_disk_info_t)); + for (d = 0; d < max_disks * 2; d++) { + disks[d].state = (1<devs; mdi; mdi = mdi->next) { + disks[next++] = mdi->disk; + disks[next-1].number = -1; + } + } else for (d = 0; d < 
max_disks; d++) { + mdu_disk_info_t disk; + disk.number = d; + if (ioctl(fd, GET_DISK_INFO, &disk) < 0) { + if (d < array.raid_disks) + pr_err("cannot get device detail for device %d: %s\n", + d, strerror(errno)); + continue; + } + if (disk.major == 0 && disk.minor == 0) + continue; + if (disk.raid_disk >= 0 && disk.raid_disk < array.raid_disks + && disks[disk.raid_disk*2].state == (1<= 0 && disk.raid_disk < array.raid_disks + && disks[disk.raid_disk*2+1].state == (1<test; + } + + if (c->brief) { + mdu_bitmap_file_t bmf; + printf("%sARRAY %s", inactive ? "INACTIVE-":"", dev); + if (c->verbose > 0) { + if (array.raid_disks) + printf(" level=%s num-devices=%d", + str?str:"-unknown-", + array.raid_disks ); + else if (!inactive) + printf(" level=container num-devices=%d", + array.nr_disks); + else + printf(" num-devices=%d", array.nr_disks); + } + if (container) { + printf(" container=%s", container); + printf(" member=%s", member); + } else { + if (sra && sra->array.major_version < 0) + printf(" metadata=%s", sra->text_version); + else + printf(" metadata=%d.%d", + array.major_version, array.minor_version); + } + + /* Only try GET_BITMAP_FILE for 0.90.01 and later */ + if (vers >= 9001 && + ioctl(fd, GET_BITMAP_FILE, &bmf) == 0 && + bmf.pathname[0]) { + printf(" bitmap=%s", bmf.pathname); + } + } else { + mdu_bitmap_file_t bmf; + unsigned long long larray_size; + struct mdstat_ent *ms = mdstat_read(0, 0); + struct mdstat_ent *e; + char *devnm; + + devnm = stat2devnm(&stb); + for (e=ms; e; e=e->next) + if (strcmp(e->devnm, devnm) == 0) + break; + if (!get_dev_size(fd, NULL, &larray_size)) + larray_size = 0; + + printf("%s:\n", dev); + + if (container) + printf(" Container : %s, member %s\n", container, member); + else { + if (sra && sra->array.major_version < 0) + printf(" Version : %s\n", sra->text_version); + else + printf(" Version : %d.%d\n", + array.major_version, array.minor_version); + } + + atime = array.ctime; + if (atime) + printf(" Creation Time : %.24s\n", 
ctime(&atime)); + if (array.raid_disks == 0 && external) + str = "container"; + if (str) + printf(" Raid Level : %s\n", str); + if (larray_size) + printf(" Array Size : %llu%s\n", (larray_size>>10), + human_size(larray_size)); + if (array.level >= 1) { + if (sra) + array.major_version = sra->array.major_version; + if (array.major_version != 0 && + (larray_size >= 0xFFFFFFFFULL|| array.size == 0)) { + unsigned long long dsize = get_component_size(fd); + if (dsize > 0) + printf(" Used Dev Size : %llu%s\n", + dsize/2, + human_size((long long)dsize<<9)); + else + printf(" Used Dev Size : unknown\n"); + } else + printf(" Used Dev Size : %lu%s\n", + (unsigned long)array.size, + human_size((unsigned long long)array.size<<10)); + } + if (array.raid_disks) + printf(" Raid Devices : %d\n", array.raid_disks); + printf(" Total Devices : %d\n", array.nr_disks); + if (!container && + ((sra == NULL && array.major_version == 0) || + (sra && sra->array.major_version == 0))) + printf("Preferred Minor : %d\n", array.md_minor); + if (sra == NULL || sra->array.major_version >= 0) + printf(" Persistence : Superblock is %spersistent\n", + array.not_persistent?"not ":""); + printf("\n"); + /* Only try GET_BITMAP_FILE for 0.90.01 and later */ + if (vers >= 9001 && + ioctl(fd, GET_BITMAP_FILE, &bmf) == 0 && + bmf.pathname[0]) { + printf(" Intent Bitmap : %s\n", bmf.pathname); + printf("\n"); + } else if (array.state & (1<percent < 0 && e->percent != RESYNC_PENDING && + e->percent != RESYNC_DELAYED)) ? "" : sync_action[e->resync], + larray_size ? "": ", Not Started", + (e && e->percent == RESYNC_DELAYED) ? " (DELAYED)": "", + (e && e->percent == RESYNC_PENDING) ? 
" (PENDING)": ""); + } else if (inactive) { + printf(" State : inactive\n"); + } + if (array.raid_disks) + printf(" Active Devices : %d\n", array.active_disks); + if (array.working_disks > 0) + printf("Working Devices : %d\n", array.working_disks); + if (array.raid_disks) { + printf(" Failed Devices : %d\n", array.failed_disks); + printf(" Spare Devices : %d\n", array.spare_disks); + } + printf("\n"); + if (array.level == 5) { + str = map_num(r5layout, array.layout); + printf(" Layout : %s\n", str?str:"-unknown-"); + } + if (array.level == 6) { + str = map_num(r6layout, array.layout); + printf(" Layout : %s\n", str?str:"-unknown-"); + } + if (array.level == 10) { + printf(" Layout :"); + print_r10_layout(array.layout); + printf("\n"); + } + switch (array.level) { + case 0: + case 4: + case 5: + case 10: + case 6: + if (array.chunk_size) + printf(" Chunk Size : %dK\n\n", + array.chunk_size/1024); + break; + case -1: + printf(" Rounding : %dK\n\n", array.chunk_size/1024); + break; + default: break; + } + + if (e && e->percent >= 0) { + static char *sync_action[] = { + "Rebuild", "Resync", + "Reshape", "Check"}; + printf(" %7s Status : %d%% complete\n", sync_action[e->resync], e->percent); + is_rebuilding = 1; + } + free_mdstat(ms); + + if ((st && st->sb) && (info && info->reshape_active)) { +#if 0 +This is pretty boring + printf(" Reshape pos'n : %llu%s\n", (unsigned long long) info->reshape_progress<<9, + human_size((unsigned long long)info->reshape_progress<<9)); +#endif + if (info->delta_disks != 0) + printf(" Delta Devices : %d, (%d->%d)\n", + info->delta_disks, + array.raid_disks - info->delta_disks, + array.raid_disks); + if (info->new_level != array.level) { + str = map_num(pers, info->new_level); + printf(" New Level : %s\n", str?str:"-unknown-"); + } + if (info->new_level != array.level || + info->new_layout != array.layout) { + if (info->new_level == 5) { + str = map_num(r5layout, info->new_layout); + printf(" New Layout : %s\n", + str?str:"-unknown-"); + } 
+ if (info->new_level == 6) { + str = map_num(r6layout, info->new_layout); + printf(" New Layout : %s\n", + str?str:"-unknown-"); + } + if (info->new_level == 10) { + printf(" New Layout : near=%d, %s=%d\n", + info->new_layout&255, + (info->new_layout&0x10000)?"offset":"far", + (info->new_layout>>8)&255); + } + } + if (info->new_chunk != array.chunk_size) + printf(" New Chunksize : %dK\n", info->new_chunk/1024); + printf("\n"); + } else if (e && e->percent >= 0) + printf("\n"); + if (st && st->sb) + st->ss->detail_super(st, c->homehost); + + if (array.raid_disks == 0 && sra && sra->array.major_version == -1 + && sra->array.minor_version == -2 && sra->text_version[0] != '/') { + /* This looks like a container. Find any active arrays + * That claim to be a member. + */ + DIR *dir = opendir("/sys/block"); + struct dirent *de; + + printf(" Member Arrays :"); + + while (dir && (de = readdir(dir)) != NULL) { + char path[200]; + char vbuf[1024]; + int nlen = strlen(sra->sys_name); + int devid; + if (de->d_name[0] == '.') + continue; + sprintf(path, "/sys/block/%s/md/metadata_version", + de->d_name); + if (load_sys(path, vbuf) < 0) + continue; + if (strncmp(vbuf, "external:", 9) != 0 || + !is_subarray(vbuf+9) || + strncmp(vbuf+10, sra->sys_name, nlen) != 0 || + vbuf[10+nlen] != '/') + continue; + devid = devnm2devid(de->d_name); + printf(" %s", map_dev_preferred( + major(devid), + minor(devid), 1, c->prefer)); + } + if (dir) + closedir(dir); + printf("\n\n"); + } + + if (array.raid_disks) + printf(" Number Major Minor RaidDevice State\n"); + else + printf(" Number Major Minor RaidDevice\n"); + } + free(info); + + for (d= 0; d < max_disks * 2; d++) { + char *dv; + mdu_disk_info_t disk = disks[d]; + + if (d >= array.raid_disks*2 && + disk.major == 0 && + disk.minor == 0) + continue; + if ((d & 1) && + disk.major == 0 && + disk.minor == 0) + continue; + if (!c->brief) { + if (d == array.raid_disks*2) printf("\n"); + if (disk.number < 0 && disk.raid_disk < 0) + printf(" - %5d 
%5d - ", + disk.major, disk.minor); + else if (disk.raid_disk < 0 || disk.state & (1<brief && array.raid_disks) { + + if (disk.state & (1<= 0) + failed++; + } + if (disk.state & (1<> 8) & 0xff; + int copies = nc*fc; + if (fc == 1 && array.raid_disks % copies == 0 && copies <= 26) { + /* We can divide the devices into 'sets' */ + int set = disk.raid_disk % copies; + printf(" set-%c", set + 'A'); + } + } + } + if (disk.state & (1<= 0) + printf(" rebuilding"); + } else if (is_rebuilding && failed) { + /* Taking a bit of a risk here, we remove the + * device from the array, and then put it back. + * If this fails, we are rebuilding + */ + int err = ioctl(fd, HOT_REMOVE_DISK, makedev(disk.major, disk.minor)); + if (err == 0) ioctl(fd, HOT_ADD_DISK, makedev(disk.major, disk.minor)); + if (err && errno == EBUSY) + printf(" rebuilding"); + } + } + } + if (disk.state == 0) spares++; + dv=map_dev_preferred(disk.major, disk.minor, 0, c->prefer); + if (dv != NULL) { + if (c->brief) + n_devices = add_device(dv, &devices, + &max_devices, + n_devices); + else + printf(" %s", dv); + } + if (!c->brief) printf("\n"); + } + if (spares && c->brief && array.raid_disks) printf(" spares=%d", spares); + if (c->brief && st && st->sb) + st->ss->brief_detail_super(st); + if (st) + st->ss->free_super(st); + + if (c->brief && c->verbose > 0 && devices) { + qsort(devices, n_devices, sizeof(*devices), cmpstringp); + printf("\n devices=%s", devices[0]); + for (d = 1; d < n_devices; d++) + printf(",%s", devices[d]); + } + if (c->brief) + printf("\n"); + if (c->test && + !enough(array.level, array.raid_disks, array.layout, + 1, avail)) + rv = 2; + + free(disks); +out: + close(fd); + free(subarray); + free(avail); + for (d = 0; d < n_devices; d++) + free(devices[d]); + free(devices); + sysfs_free(sra); + return rv; +} + +int Detail_Platform(struct superswitch *ss, int scan, int verbose, int export, char *controller_path) +{ + /* display platform capabilities for the given metadata format + * 'scan' 
in this context means iterate over all metadata types + */ + int i; + int err = 1; + + if (ss && export && ss->export_detail_platform) + err = ss->export_detail_platform(verbose, controller_path); + else if (ss && ss->detail_platform) + err = ss->detail_platform(verbose, 0, controller_path); + else if (ss) { + if (verbose > 0) + pr_err("%s metadata is platform independent\n", + ss->name ? : "[no name]"); + } else if (!scan) { + if (verbose > 0) + pr_err("specify a metadata type or --scan\n"); + } + + if (!scan) + return err; + + err = 0; + for (i = 0; superlist[i]; i++) { + struct superswitch *meta = superlist[i]; + + if (meta == ss) + continue; + if (verbose > 0) + pr_err("checking metadata %s\n", + meta->name ? : "[no name]"); + if (!meta->detail_platform) { + if (verbose > 0) + pr_err("%s metadata is platform independent\n", + meta->name ? : "[no name]"); + } else if (export && meta->export_detail_platform) { + err |= meta->export_detail_platform(verbose, controller_path); + } else + err |= meta->detail_platform(verbose, 0, controller_path); + } + + return err; +} diff --git a/Dump.c b/Dump.c new file mode 100644 index 00000000..7bdbf6f7 --- /dev/null +++ b/Dump.c @@ -0,0 +1,311 @@ +/* + * mdadm - manage Linux "md" devices aka RAID arrays. + * + * Copyright (C) 2013 Neil Brown + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+
+ * Author: Neil Brown
+ * Email:
+ */
+
+#include "mdadm.h"
+#include <dirent.h>
+
+/*
+ * Dump_metadata - save the RAID metadata found on 'dev' into 'dir'.
+ *
+ * Creates a new file in 'dir' named for the basename of 'dev',
+ * truncates it to the same size as 'dev' and asks the metadata
+ * handler to copy the metadata there.
+ * For every name in /dev/disk/by-id that points to this device,
+ * a hardlink to the dump file is created in 'dir'; a link that
+ * cannot be created is reported but does not fail the dump.
+ *
+ * If 'st' is NULL the metadata type is guessed from the device.
+ *
+ * Returns 16 if 'dir' is not an existing directory, 1 on any other
+ * failure and 0 once the dump file has been written.
+ */
+int Dump_metadata(char *dev, char *dir, struct context *c,
+		  struct supertype *st)
+{
+	int fd, fl;
+	int is_blk;
+	struct stat stb, dstb;
+	char *base;
+	char *fname = NULL;
+	unsigned long long size;
+	DIR *dirp;
+	struct dirent *de;
+
+	if (stat(dir, &stb) != 0 ||
+	    (S_IFMT & stb.st_mode) != S_IFDIR) {
+		pr_err("--dump requires an existing directory, not: %s\n",
+		       dir);
+		return 16;
+	}
+
+	fd = dev_open(dev, O_RDONLY);
+	if (fd < 0) {
+		pr_err("Cannot open %s to dump metadata: %s\n",
+		       dev, strerror(errno));
+		return 1;
+	}
+	if (!get_dev_size(fd, dev, &size)) {
+		close(fd);
+		return 1;
+	}
+
+	if (st == NULL)
+		st = guess_super_type(fd, guess_array);
+	if (!st) {
+		pr_err("Cannot find RAID metadata on %s\n", dev);
+		close(fd);
+		return 1;
+	}
+
+	st->ignore_hw_compat = 1;
+	if (st->ss->load_super(st, fd, NULL) != 0) {
+		pr_err("No %s metadata found on %s\n",
+		       st->ss->name, dev);
+		close(fd);
+		return 1;
+	}
+	if (st->ss->copy_metadata == NULL) {
+		pr_err("%s metadata on %s cannot be copied\n",
+		       st->ss->name, dev);
+		close(fd);
+		return 1;
+	}
+
+	base = strrchr(dev, '/');
+	if (base)
+		base++;
+	else
+		base = dev;
+	xasprintf(&fname, "%s/%s", dir, base);
+	/* O_EXCL: never overwrite an earlier dump of the same name */
+	fl = open(fname, O_RDWR|O_CREAT|O_EXCL, 0666);
+	if (fl < 0) {
+		pr_err("Cannot create dump file %s: %s\n",
+		       fname, strerror(errno));
+		close(fd);
+		free(fname);
+		return 1;
+	}
+	if (ftruncate(fl, size) < 0) {
+		pr_err("failed to set size of dump file: %s\n",
+		       strerror(errno));
+		close(fd);
+		close(fl);
+		/* Remove the useless empty file, as the copy failure
+		 * path below does; otherwise O_EXCL blocks a retry.
+		 */
+		unlink(fname);
+		free(fname);
+		return 1;
+	}
+
+	if (st->ss->copy_metadata(st, fd, fl) != 0) {
+		pr_err("Failed to copy metadata from %s to %s\n",
+		       dev, fname);
+		close(fd);
+		close(fl);
+		unlink(fname);
+		free(fname);
+		return 1;
+	}
+	if (c->verbose >= 0)
+		printf("%s saved as %s.\n", dev, fname);
+	/* dstb is only meaningful when fstat() succeeded */
+	is_blk = (fstat(fd, &dstb) == 0 &&
+		  (dstb.st_mode & S_IFMT) == S_IFBLK);
+	close(fd);
+	close(fl);
+	if (!is_blk) {
+		/* Not a block device, so cannot create links */
+		free(fname);
+		return 0;
+	}
+	/* mostly done: just want to find some other names */
+	dirp = opendir("/dev/disk/by-id");
+	if (!dirp) {
+		free(fname);
+		return 0;
+	}
+	while ((de = readdir(dirp)) != NULL) {
+		char *p = NULL;
+		if (de->d_name[0] == '.')
+			continue;
+		xasprintf(&p, "/dev/disk/by-id/%s", de->d_name);
+		if (stat(p, &stb) != 0 ||
+		    (stb.st_mode & S_IFMT) != S_IFBLK ||
+		    stb.st_rdev != dstb.st_rdev) {
+			/* Not this one */
+			free(p);
+			continue;
+		}
+		free(p);
+		xasprintf(&p, "%s/%s", dir, de->d_name);
+		if (link(fname, p) == 0) {
+			if (c->verbose >= 0)
+				printf("%s also saved as %s.\n",
+				       dev, p);
+		} else {
+			pr_err("Could not save %s as %s!!\n",
+			       dev, p);
+		}
+		free(p);
+	}
+	closedir(dirp);
+	free(fname);
+	return 0;
+}
+
+/*
+ * Restore_metadata - copy previously dumped metadata back onto 'dev'.
+ *
+ * If 'dir' really is a directory we choose a name
+ * from it that matches a suitable name in /dev/disk/by-id,
+ * and copy metadata from the file to the device.
+ * If two names from by-id match and aren't both the same
+ * inode, we fail.  If none match and basename of 'dev'
+ * can be found in dir, use that.
+ * If 'dir' is really a file then it is only permitted if
+ * 'only' is set (meaning there was only one device given)
+ * and the metadata is restored irrespective of file names.
+ *
+ * If 'st' is NULL the metadata type is guessed from the dump file.
+ *
+ * Returns 16 if 'dir' is unusable, 1 on any other failure, 0 on
+ * success.  Every descriptor opened here is closed before returning.
+ */
+int Restore_metadata(char *dev, char *dir, struct context *c,
+		     struct supertype *st, int only)
+{
+	int fd;
+	int fl = -1;
+	int rv = 1;
+	struct stat stb, dstb;
+	char *fname = NULL;
+	unsigned long long size;
+
+	if (stat(dir, &stb) != 0) {
+		pr_err("%s does not exist: cannot restore from there.\n",
+		       dir);
+		return 16;
+	} else if ((S_IFMT & stb.st_mode) != S_IFDIR && !only) {
+		pr_err("--restore requires a directory when multiple devices given\n");
+		return 16;
+	}
+
+	fd = dev_open(dev, O_RDWR);
+	if (fd < 0) {
+		pr_err("Cannot open %s to restore metadata: %s\n",
+		       dev, strerror(errno));
+		return 1;
+	}
+	if (!get_dev_size(fd, dev, &size))
+		goto out;
+
+	if ((S_IFMT & stb.st_mode) == S_IFDIR) {
+		/* choose one name from the directory. */
+		DIR *d = opendir(dir);
+		struct dirent *de;
+		char *chosen = NULL;
+		ino_t chosen_inode = 0;
+		/* Without the device number nothing in by-id can be
+		 * matched, so only the basename fallback remains.
+		 */
+		int have_rdev = (fstat(fd, &dstb) == 0);
+
+		while (d && have_rdev && (de = readdir(d)) != NULL) {
+			char *p = NULL;
+
+			if (de->d_name[0] == '.')
+				continue;
+			xasprintf(&p, "/dev/disk/by-id/%s", de->d_name);
+			if (stat(p, &stb) != 0 ||
+			    (S_IFMT & stb.st_mode) != S_IFBLK ||
+			    stb.st_rdev != dstb.st_rdev) {
+				free(p);
+				continue;
+			}
+			free(p);
+			/* This file is a good match for our device. */
+			xasprintf(&p, "%s/%s", dir, de->d_name);
+			if (stat(p, &stb) != 0) {
+				/* Weird! */
+				free(p);
+				continue;
+			}
+			if (chosen == NULL) {
+				chosen = p;
+				chosen_inode = stb.st_ino;
+				continue;
+			}
+			if (chosen_inode == stb.st_ino) {
+				/* same, no need to change */
+				free(p);
+				continue;
+			}
+			/* Oh dear, two names both match.  Must give up. */
+			pr_err("Both %s and %s seem suitable for %s. Please choose one.\n",
+			       chosen, p, dev);
+			free(p);
+			free(chosen);
+			closedir(d);
+			goto out;
+		}
+		if (d)
+			closedir(d);
+		if (!chosen) {
+			/* One last chance: try basename of device */
+			char *base = strrchr(dev, '/');
+			if (base)
+				base++;
+			else
+				base = dev;
+			xasprintf(&chosen, "%s/%s", dir, base);
+			if (stat(chosen, &stb) != 0) {
+				free(chosen);
+				chosen = NULL;
+			}
+		}
+		fname = chosen;
+	} else
+		fname = strdup(dir);
+
+	if (!fname) {
+		pr_err("Cannot find suitable file in %s for %s\n",
+		       dir, dev);
+		goto out;
+	}
+
+	/* open() reports failure with -1; 0 is a valid descriptor */
+	fl = open(fname, O_RDONLY);
+	if (fl < 0) {
+		pr_err("Could not open %s for --restore.\n",
+		       fname);
+		goto out;
+	}
+	/* Size the file that was actually opened: 'stb' may have been
+	 * overwritten by later directory entries after the choice was made.
+	 */
+	if (fstat(fl, &stb) != 0 ||
+	    ((unsigned long long)stb.st_size) != size) {
+		pr_err("%s is not the same size as %s - cannot restore.\n",
+		       fname, dev);
+		goto out;
+	}
+	if (st == NULL)
+		st = guess_super_type(fl, guess_array);
+	if (!st) {
+		pr_err("Cannot find metadata on %s\n", fname);
+		goto out;
+	}
+	st->ignore_hw_compat = 1;
+	if (st->ss->load_super(st, fl, NULL) != 0) {
+		pr_err("No %s metadata found on %s\n",
+		       st->ss->name, fname);
+		goto out;
+	}
+	if (st->ss->copy_metadata == NULL) {
+		pr_err("%s metadata on %s cannot be copied\n",
+		       st->ss->name, dev);
+		goto out;
+	}
+	if (st->ss->copy_metadata(st, fl, fd) != 0) {
+		pr_err("Failed to copy metadata from %s to %s\n",
+		       fname, dev);
+		goto out;
+	}
+	if (c->verbose >= 0)
+		printf("%s restored from %s.\n", dev, fname);
+	rv = 0;
+
+out:
+	close(fd);
+	if (fl >= 0)
+		close(fl);
+	free(fname);
+	return rv;
+}
diff --git a/Examine.c b/Examine.c
new file mode 100644
index 00000000..953b8eee
--- /dev/null
+++ b/Examine.c
@@ -0,0 +1,225 @@
+/*
+ * mdadm - manage Linux "md" devices aka RAID arrays.
+ *
+ * Copyright (C) 2001-2013 Neil Brown
+ *
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * Author: Neil Brown
+ * Email:
+ */
+
+#include "mdadm.h"
+#include "dlink.h"
+
+#if ! defined(__BIG_ENDIAN) && ! defined(__LITTLE_ENDIAN)
+#error no endian defined
+#endif
+#include "md_u.h"
+#include "md_p.h"
+
+/* Release a supertype this file obtained for itself (via dup_super,
+ * super_by_fd or guess_super): drop any loaded superblock, then the
+ * handle.  NULL is accepted.
+ */
+static void examine_put_super(struct supertype *st)
+{
+	if (!st)
+		return;
+	st->ss->free_super(st);
+	free(st);
+}
+
+/*
+ * Examine - read the raid superblock from each device in 'devlist'
+ * and display important content.
+ *
+ * If cannot be found, print reason: too small, bad magic
+ *
+ * Print:
+ * version, ctime, level, size, raid+spare+
+ * prefered minor
+ * uuid
+ *
+ * utime, state etc
+ *
+ * If (brief) gather devices for same array and just print a mdadm.conf
+ * line including devices=
+ * if devlist==NULL, use conf_get_devs()
+ *
+ * 'forcest', when given, selects the metadata type; it is duplicated
+ * per device and never freed here.
+ *
+ * Returns 1 if a device could not be opened (outside --scan) or had no
+ * superblock (outside --brief), otherwise 0.
+ */
+int Examine(struct mddev_dev *devlist,
+	    struct context *c,
+	    struct supertype *forcest)
+{
+	int fd;
+	int rv = 0;
+	int err = 0;
+
+	struct array {
+		struct supertype *st;
+		struct mdinfo info;
+		void *devs;
+		struct array *next;
+		int spares;
+	} *arrays = NULL;
+
+	for (; devlist ; devlist = devlist->next) {
+		struct supertype *st = NULL;
+		int have_container = 0;
+
+		fd = dev_open(devlist->devname, O_RDONLY);
+		if (fd < 0) {
+			if (!c->scan) {
+				pr_err("cannot open %s: %s\n",
+				       devlist->devname, strerror(errno));
+				rv = 1;
+			}
+			err = 1;
+		}
+		else {
+			int container = 0;
+			if (forcest)
+				st = dup_super(forcest);
+			else if (must_be_container(fd)) {
+				/* might be a container */
+				st = super_by_fd(fd, NULL);
+				container = 1;
+			} else
+				st = guess_super(fd);
+			if (st) {
+				err = 1;
+				st->ignore_hw_compat = 1;
+				if (!container)
+					err = st->ss->load_super(st, fd,
+								 (c->brief||c->scan) ? NULL
+								 :devlist->devname);
+				if (err && st->ss->load_container) {
+					err = st->ss->load_container(st, fd,
+								     (c->brief||c->scan) ? NULL
+								     :devlist->devname);
+					if (!err)
+						have_container = 1;
+				}
+				st->ignore_hw_compat = 0;
+			} else {
+				if (!c->brief) {
+					pr_err("No md superblock detected on %s.\n", devlist->devname);
+					rv = 1;
+				}
+				err = 1;
+			}
+			close(fd);
+		}
+		if (err) {
+			/* a handle may exist even though loading failed */
+			examine_put_super(st);
+			continue;
+		}
+
+		if (c->SparcAdjust)
+			st->ss->update_super(st, NULL, "sparc2.2",
+					     devlist->devname, 0, 0, NULL);
+		/* Ok, its good enough to try, though the checksum could be wrong */
+
+		if (c->brief && st->ss->brief_examine_super == NULL) {
+			if (!c->scan)
+				pr_err("No brief listing for %s on %s\n",
+				       st->ss->name, devlist->devname);
+			examine_put_super(st);
+		} else if (c->brief) {
+			struct array *ap;
+			char *d;
+			for (ap = arrays; ap; ap = ap->next) {
+				if (st->ss == ap->st->ss &&
+				    st->ss->compare_super(ap->st, st) == 0)
+					break;
+			}
+			if (!ap) {
+				/* first device of a new array: the list
+				 * entry takes ownership of 'st'
+				 */
+				ap = xmalloc(sizeof(*ap));
+				ap->devs = dl_head();
+				ap->next = arrays;
+				ap->spares = 0;
+				ap->st = st;
+				arrays = ap;
+			}
+			st->ss->getinfo_super(st, &ap->info, NULL);
+			if (!have_container &&
+			    !(ap->info.disk.state & (1 << MD_DISK_SYNC)))
+				ap->spares++;
+			d = dl_strdup(devlist->devname);
+			dl_add(ap->devs, d);
+			/* only the first handle per array is kept */
+			if (ap->st != st)
+				examine_put_super(st);
+		} else if (c->export) {
+			if (st->ss->export_examine_super)
+				st->ss->export_examine_super(st);
+			examine_put_super(st);
+		} else {
+			printf("%s:\n",devlist->devname);
+			st->ss->examine_super(st, c->homehost);
+			examine_put_super(st);
+		}
+	}
+	if (c->brief) {
+		struct array *ap = arrays;
+		while (ap) {
+			struct array *next = ap->next;
+			char sep='=';
+			char *d;
+			int newline = 0;
+
+			ap->st->ss->brief_examine_super(ap->st, c->verbose > 0);
+			if (ap->spares)
+				newline += printf(" spares=%d", ap->spares);
+			if (c->verbose > 0) {
+				newline += printf(" devices");
+				for (d = dl_next(ap->devs);
+				     d != ap->devs;
+				     d=dl_next(d)) {
+					printf("%c%s", sep, d);
+					sep=',';
+				}
+			}
+			if (ap->st->ss->brief_examine_subarrays) {
+				if (newline)
+					printf("\n");
+				ap->st->ss->brief_examine_subarrays(ap->st, c->verbose);
+			}
+			if (ap->spares || c->verbose > 0)
+				printf("\n");
+			/* The device-name list in ap->devs is left to
+			 * process exit; the rest of the entry is released.
+			 */
+			examine_put_super(ap->st);
+			free(ap);
+			ap = next;
+		}
+	}
+	return rv;
+}
+
+/*
+ * ExamineBadblocks - print the bad block list recorded in the
+ * metadata of 'devname'.
+ *
+ * 'forcest', when given, selects the metadata type.  It stays owned by
+ * the caller: a private copy is made, as Examine() does, so the same
+ * 'forcest' can be passed again for the next device.
+ *
+ * Returns 0 on success and non-zero on failure.
+ */
+int ExamineBadblocks(char *devname, int brief, struct supertype *forcest)
+{
+	int fd = dev_open(devname, O_RDONLY);
+	struct supertype *st;
+	int err = 1;
+
+	if (fd < 0) {
+		pr_err("cannot open %s: %s\n", devname, strerror(errno));
+		return 1;
+	}
+	if (forcest)
+		st = dup_super(forcest);
+	else
+		st = guess_super(fd);
+	if (!st) {
+		if (!brief)
+			pr_err("No md superblock detected on %s\n", devname);
+		goto out;
+	}
+	if (!st->ss->examine_badblocks) {
+		pr_err("%s metadata does not support badblocks\n", st->ss->name);
+		goto out;
+	}
+	err = st->ss->load_super(st, fd, brief ? NULL : devname);
+	if (err)
+		goto out;
+	err = st->ss->examine_badblocks(st, fd, devname);
+
+out:
+	close(fd);
+	examine_put_super(st);
+	return err;
+}
diff --git a/Grow.c b/Grow.c
new file mode 100755
index 00000000..bbdd46c0
--- /dev/null
+++ b/Grow.c
@@ -0,0 +1,4985 @@
+/*
+ * mdadm - manage Linux "md" devices aka RAID arrays.
+ *
+ * Copyright (C) 2001-2013 Neil Brown
+ *
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * Author: Neil Brown
+ * Email:
+ */
+#include "mdadm.h"
+#include "dlink.h"
+#include
+#include
+#include
+#include
+#include
+
+#if ! defined(__BIG_ENDIAN) && !
defined(__LITTLE_ENDIAN)
+#error no endian defined
+#endif
+#include "md_u.h"
+#include "md_p.h"
+
+/*
+ * restore_backup - replay the critical section of an interrupted reshape.
+ *
+ * Opens every device listed in 'content': devices with a raid_disk
+ * role go into that slot of the descriptor table, the others are
+ * appended from slot 'next_spare' onwards.  The table has room for
+ * next_spare + working_disks entries.  The restore itself is done by
+ * the metadata handler's recover_backup (external metadata) or by
+ * Grow_restart().
+ *
+ * If *backup_filep is NULL it is set from locate_backup().
+ *
+ * Returns 0 on success, 1 if the critical section could not be restored.
+ */
+int restore_backup(struct supertype *st,
+		   struct mdinfo *content,
+		   int working_disks,
+		   int next_spare,
+		   char **backup_filep,
+		   int verbose)
+{
+	int i;
+	int *fdlist;
+	struct mdinfo *dev;
+	int err;
+	int disk_count = next_spare + working_disks;
+	char *backup_file = *backup_filep;
+
+	dprintf("Called restore_backup()\n");
+	fdlist = xmalloc(sizeof(int) * disk_count);
+
+	enable_fds(next_spare);
+	/* mark every slot unused, including those spares may fill later */
+	for (i = 0; i < disk_count; i++)
+		fdlist[i] = -1;
+	for (dev = content->devs; dev; dev = dev->next) {
+		char buf[2 * 11 + 2];	/* two "%d" fields, ':' and NUL */
+		int fd;
+		int slot;
+
+		snprintf(buf, sizeof(buf), "%d:%d",
+			 dev->disk.major,
+			 dev->disk.minor);
+		fd = dev_open(buf, O_RDWR);
+
+		if (dev->disk.raid_disk >= 0)
+			slot = dev->disk.raid_disk;
+		else
+			slot = next_spare;
+		if (slot >= disk_count) {
+			/* no room in fdlist: never write past its end */
+			if (fd >= 0)
+				close(fd);
+			continue;
+		}
+		fdlist[slot] = fd;
+		if (dev->disk.raid_disk < 0)
+			next_spare++;
+	}
+
+	if (!backup_file) {
+		backup_file = locate_backup(content->sys_name);
+		*backup_filep = backup_file;
+	}
+
+	if (st->ss->external && st->ss->recover_backup)
+		err = st->ss->recover_backup(st, content);
+	else
+		err = Grow_restart(st, content, fdlist, next_spare,
+				   backup_file, verbose > 0);
+
+	while (next_spare > 0) {
+		next_spare--;
+		if (fdlist[next_spare] >= 0)
+			close(fdlist[next_spare]);
+	}
+	free(fdlist);
+	if (err) {
+		pr_err("Failed to restore critical section for reshape - sorry.\n");
+		if (!backup_file)
+			pr_err("Possibly you need to specify a --backup-file\n");
+		return 1;
+	}
+
+	dprintf("restore_backup() returns status OK.\n");
+	return 0;
+}
+
+/*
+ * Grow_Add_device - add a device to an active array.
+ *
+ * Currently, just extend a linear array.
+ * This requires writing a new superblock on the
+ * new device, calling the kernel to add the device,
+ * and if that succeeds, update the superblock on
+ * all other devices.
+ * This means that we need to *find* all other devices.
+ *
+ * 'fd' is the open array and stays owned by the caller; it is never
+ * closed here.  Returns 0 on success, 1 on failure.
+ */
+int Grow_Add_device(char *devname, int fd, char *newdev)
+{
+	struct mdinfo info;
+
+	struct stat stb;
+	int nfd = -1, fd2 = -1;
+	int d, nd;
+	int rv = 1;
+	struct supertype *st = NULL;
+	char *subarray = NULL;
+
+	if (ioctl(fd, GET_ARRAY_INFO, &info.array) < 0) {
+		pr_err("cannot get array info for %s\n", devname);
+		return 1;
+	}
+
+	if (info.array.level != -1) {
+		pr_err("can only add devices to linear arrays\n");
+		return 1;
+	}
+
+	st = super_by_fd(fd, &subarray);
+	if (!st) {
+		pr_err("cannot handle arrays with superblock version %d\n",
+		       info.array.major_version);
+		return 1;
+	}
+
+	if (subarray) {
+		pr_err("Cannot grow linear sub-arrays yet\n");
+		goto out;
+	}
+
+	nfd = open(newdev, O_RDWR|O_EXCL|O_DIRECT);
+	if (nfd < 0) {
+		pr_err("cannot open %s\n", newdev);
+		goto out;
+	}
+	if (fstat(nfd, &stb) != 0 ||
+	    (stb.st_mode & S_IFMT) != S_IFBLK) {
+		pr_err("%s is not a block device!\n", newdev);
+		goto out;
+	}
+	/* now check out all the devices and make sure we can read the
+	 * superblock */
+	for (d=0 ; d < info.array.raid_disks ; d++) {
+		mdu_disk_info_t disk;
+		char *dv;
+
+		st->ss->free_super(st);
+
+		disk.number = d;
+		if (ioctl(fd, GET_DISK_INFO, &disk) < 0) {
+			pr_err("cannot get device detail for device %d\n",
+			       d);
+			goto out;
+		}
+		dv = map_dev(disk.major, disk.minor, 1);
+		if (!dv) {
+			pr_err("cannot find device file for device %d\n",
+			       d);
+			goto out;
+		}
+		fd2 = dev_open(dv, O_RDWR);
+		if (fd2 < 0) {
+			pr_err("cannot open device file %s\n", dv);
+			goto out;
+		}
+
+		if (st->ss->load_super(st, fd2, NULL)) {
+			pr_err("cannot find super block on %s\n", dv);
+			goto out;
+		}
+		close(fd2);
+		fd2 = -1;
+	}
+	/* Ok, looks good. Lets update the superblock and write it out to
+	 * newdev.
+	 */
+
+	info.disk.number = d;
+	info.disk.major = major(stb.st_rdev);
+	info.disk.minor = minor(stb.st_rdev);
+	info.disk.raid_disk = d;
+	info.disk.state = (1 << MD_DISK_SYNC) | (1 << MD_DISK_ACTIVE);
+	st->ss->update_super(st, &info, "linear-grow-new", newdev,
+			     0, 0, NULL);
+
+	if (st->ss->store_super(st, nfd)) {
+		pr_err("Cannot store new superblock on %s\n",
+		       newdev);
+		goto out;
+	}
+	/* release the O_EXCL open before handing the disk to the kernel */
+	close(nfd);
+	nfd = -1;
+
+	if (ioctl(fd, ADD_NEW_DISK, &info.disk) != 0) {
+		pr_err("Cannot add new disk to this array\n");
+		goto out;
+	}
+	/* Well, that seems to have worked.
+	 * Now go through and update all superblocks
+	 */
+
+	if (ioctl(fd, GET_ARRAY_INFO, &info.array) < 0) {
+		pr_err("cannot get array info for %s\n", devname);
+		goto out;
+	}
+
+	nd = d;
+	for (d=0 ; d < info.array.raid_disks ; d++) {
+		mdu_disk_info_t disk;
+		char *dv;
+
+		disk.number = d;
+		if (ioctl(fd, GET_DISK_INFO, &disk) < 0) {
+			pr_err("cannot get device detail for device %d\n",
+			       d);
+			goto out;
+		}
+		dv = map_dev(disk.major, disk.minor, 1);
+		if (!dv) {
+			pr_err("cannot find device file for device %d\n",
+			       d);
+			goto out;
+		}
+		fd2 = dev_open(dv, O_RDWR);
+		if (fd2 < 0) {
+			pr_err("cannot open device file %s\n", dv);
+			goto out;
+		}
+		if (st->ss->load_super(st, fd2, NULL)) {
+			pr_err("cannot find super block on %s\n", dv);
+			/* 'out' closes fd2, the member just opened; the
+			 * array descriptor 'fd' belongs to the caller.
+			 */
+			goto out;
+		}
+		info.array.raid_disks = nd+1;
+		info.array.nr_disks = nd+1;
+		info.array.active_disks = nd+1;
+		info.array.working_disks = nd+1;
+
+		st->ss->update_super(st, &info, "linear-grow-update", dv,
+				     0, 0, NULL);
+
+		if (st->ss->store_super(st, fd2)) {
+			pr_err("Cannot store new superblock on %s\n", dv);
+			goto out;
+		}
+		close(fd2);
+		fd2 = -1;
+	}
+
+	rv = 0;
+out:
+	if (fd2 >= 0)
+		close(fd2);
+	if (nfd >= 0)
+		close(nfd);
+	free(subarray);
+	st->ss->free_super(st);
+	free(st);
+	return rv;
+}
+
+int Grow_addbitmap(char *devname, int fd, struct context *c, struct shape *s)
+{
+	/*
+	 * First check that array doesn't have a bitmap
+	 * Then create the bitmap
+	 * Then add it
+	 *
+	 * For internal bitmaps, we need to check the version,
+	 * find all the active devices, and write
the bitmap block + * to all devices + */ + mdu_bitmap_file_t bmf; + mdu_array_info_t array; + struct supertype *st; + char *subarray = NULL; + int major = BITMAP_MAJOR_HI; + int vers = md_get_version(fd); + unsigned long long bitmapsize, array_size; + + if (vers < 9003) { + major = BITMAP_MAJOR_HOSTENDIAN; + pr_err("Warning - bitmaps created on this kernel are not portable\n" + " between different architectures. Consider upgrading the Linux kernel.\n"); + } + + if (s->bitmap_file && strcmp(s->bitmap_file, "clustered") == 0) + major = BITMAP_MAJOR_CLUSTERED; + + if (ioctl(fd, GET_BITMAP_FILE, &bmf) != 0) { + if (errno == ENOMEM) + pr_err("Memory allocation failure.\n"); + else + pr_err("bitmaps not supported by this kernel.\n"); + return 1; + } + if (bmf.pathname[0]) { + if (strcmp(s->bitmap_file,"none")==0) { + if (ioctl(fd, SET_BITMAP_FILE, -1)!= 0) { + pr_err("failed to remove bitmap %s\n", + bmf.pathname); + return 1; + } + return 0; + } + pr_err("%s already has a bitmap (%s)\n", + devname, bmf.pathname); + return 1; + } + if (ioctl(fd, GET_ARRAY_INFO, &array) != 0) { + pr_err("cannot get array status for %s\n", devname); + return 1; + } + if (array.state & (1<bitmap_file, "none")==0) { + array.state &= ~(1<bitmap_file, "none") == 0) { + pr_err("no bitmap found on %s\n", devname); + return 1; + } + if (array.level <= 0) { + pr_err("Bitmaps not meaningful with level %s\n", + map_num(pers, array.level)?:"of this array"); + return 1; + } + bitmapsize = array.size; + bitmapsize <<= 1; + if (get_dev_size(fd, NULL, &array_size) && + array_size > (0x7fffffffULL<<9)) { + /* Array is big enough that we cannot trust array.size + * try other approaches + */ + bitmapsize = get_component_size(fd); + } + if (bitmapsize == 0) { + pr_err("Cannot reliably determine size of array to create bitmap - sorry.\n"); + return 1; + } + + if (array.level == 10) { + int ncopies = (array.layout&255)*((array.layout>>8)&255); + bitmapsize = bitmapsize * array.raid_disks / ncopies; + } + + st 
= super_by_fd(fd, &subarray); + if (!st) { + pr_err("Cannot understand version %d.%d\n", + array.major_version, array.minor_version); + return 1; + } + if (subarray) { + pr_err("Cannot add bitmaps to sub-arrays yet\n"); + free(subarray); + free(st); + return 1; + } + if (strcmp(s->bitmap_file, "internal") == 0 || + strcmp(s->bitmap_file, "clustered") == 0) { + int rv; + int d; + int offset_setable = 0; + struct mdinfo *mdi; + if (st->ss->add_internal_bitmap == NULL) { + pr_err("Internal bitmaps not supported with %s metadata\n", st->ss->name); + return 1; + } + st->nodes = c->nodes; + st->cluster_name = c->homecluster; + mdi = sysfs_read(fd, NULL, GET_BITMAP_LOCATION); + if (mdi) + offset_setable = 1; + for (d=0; d< st->max_devs; d++) { + mdu_disk_info_t disk; + char *dv; + disk.number = d; + if (ioctl(fd, GET_DISK_INFO, &disk) < 0) + continue; + if (disk.major == 0 && + disk.minor == 0) + continue; + if ((disk.state & (1<ss->load_super(st, fd2, NULL)==0) { + if (st->ss->add_internal_bitmap( + st, + &s->bitmap_chunk, c->delay, s->write_behind, + bitmapsize, offset_setable, + major) + ) + st->ss->write_bitmap(st, fd2, NoUpdate); + else { + pr_err("failed to create internal bitmap - chunksize problem.\n"); + close(fd2); + return 1; + } + } + close(fd2); + } + } + if (offset_setable) { + st->ss->getinfo_super(st, mdi, NULL); + sysfs_init(mdi, fd, NULL); + rv = sysfs_set_num_signed(mdi, NULL, "bitmap/location", + mdi->bitmap_offset); + } else { + if (strcmp(s->bitmap_file, "clustered") == 0) + array.state |= (1<max_devs; + + /* try to load a superblock */ + for (d = 0; d < max_devs; d++) { + mdu_disk_info_t disk; + char *dv; + int fd2; + disk.number = d; + if (ioctl(fd, GET_DISK_INFO, &disk) < 0) + continue; + if ((disk.major==0 && disk.minor==0) || + (disk.state & (1<= 0) { + if (st->ss->load_super(st, fd2, NULL) == 0) { + close(fd2); + st->ss->uuid_from_super(st, uuid); + break; + } + close(fd2); + } + } + if (d == max_devs) { + pr_err("cannot find UUID for 
array!\n"); + return 1; + } + if (CreateBitmap(s->bitmap_file, c->force, (char*)uuid, s->bitmap_chunk, + c->delay, s->write_behind, bitmapsize, major)) { + return 1; + } + bitmap_fd = open(s->bitmap_file, O_RDWR); + if (bitmap_fd < 0) { + pr_err("weird: %s cannot be opened\n", + s->bitmap_file); + return 1; + } + if (ioctl(fd, SET_BITMAP_FILE, bitmap_fd) < 0) { + int err = errno; + if (errno == EBUSY) + pr_err("Cannot add bitmap while array is resyncing or reshaping etc.\n"); + pr_err("Cannot set bitmap file for %s: %s\n", + devname, strerror(err)); + return 1; + } + } + + return 0; +} + +/* + * When reshaping an array we might need to backup some data. + * This is written to all spares with a 'super_block' describing it. + * The superblock goes 4K from the end of the used space on the + * device. + * It if written after the backup is complete. + * It has the following structure. + */ + +static struct mdp_backup_super { + char magic[16]; /* md_backup_data-1 or -2 */ + __u8 set_uuid[16]; + __u64 mtime; + /* start/sizes in 512byte sectors */ + __u64 devstart; /* address on backup device/file of data */ + __u64 arraystart; + __u64 length; + __u32 sb_csum; /* csum of preceeding bytes. */ + __u32 pad1; + __u64 devstart2; /* offset in to data of second section */ + __u64 arraystart2; + __u64 length2; + __u32 sb_csum2; /* csum of preceeding bytes. */ + __u8 pad[512-68-32]; +} __attribute__((aligned(512))) bsb, bsb2; + +static __u32 bsb_csum(char *buf, int len) +{ + int i; + int csum = 0; + for (i = 0; i < len; i++) + csum = (csum<<3) + buf[0]; + return __cpu_to_le32(csum); +} + +static int check_idle(struct supertype *st) +{ + /* Check that all member arrays for this container, or the + * container of this array, are idle + */ + char *container = (st->container_devnm[0] + ? 
st->container_devnm : st->devnm); + struct mdstat_ent *ent, *e; + int is_idle = 1; + + ent = mdstat_read(0, 0); + for (e = ent ; e; e = e->next) { + if (!is_container_member(e, container)) + continue; + if (e->percent >= 0) { + is_idle = 0; + break; + } + } + free_mdstat(ent); + return is_idle; +} + +static int freeze_container(struct supertype *st) +{ + char *container = (st->container_devnm[0] + ? st->container_devnm : st->devnm); + + if (!check_idle(st)) + return -1; + + if (block_monitor(container, 1)) { + pr_err("failed to freeze container\n"); + return -2; + } + + return 1; +} + +static void unfreeze_container(struct supertype *st) +{ + char *container = (st->container_devnm[0] + ? st->container_devnm : st->devnm); + + unblock_monitor(container, 1); +} + +static int freeze(struct supertype *st) +{ + /* Try to freeze resync/rebuild on this array/container. + * Return -1 if the array is busy, + * return -2 container cannot be frozen, + * return 0 if this kernel doesn't support 'frozen' + * return 1 if it worked. 
+	 */
+	if (st->ss->external)
+		return freeze_container(st);
+	else {
+		struct mdinfo *sra = sysfs_read(-1, st->devnm, GET_VERSION);
+		int err;
+		char buf[20];
+
+		if (!sra)
+			return -1;
+		/* Need to clear any 'read-auto' status */
+		if (sysfs_get_str(sra, NULL, "array_state", buf, 20) > 0 &&
+		    strncmp(buf, "read-auto", 9) == 0)
+			sysfs_set_str(sra, NULL, "array_state", "clean");
+
+		err = sysfs_freeze_array(sra);
+		sysfs_free(sra);
+		return err;
+	}
+}
+
+static void unfreeze(struct supertype *st)
+{
+	/* Undo freeze().  External metadata goes through
+	 * unfreeze_container(); otherwise sync_action is set back to
+	 * "idle", but only if it currently reads "frozen".
+	 */
+	struct mdinfo *sra;
+	char action[20];
+
+	if (st->ss->external) {
+		unfreeze_container(st);
+		return;
+	}
+
+	sra = sysfs_read(-1, st->devnm, GET_VERSION);
+	if (sra) {
+		/* the compared value includes a trailing newline */
+		if (sysfs_get_str(sra, NULL, "sync_action", action, 20) > 0 &&
+		    strcmp(action, "frozen\n") == 0)
+			sysfs_set_str(sra, NULL, "sync_action", "idle");
+	}
+	sysfs_free(sra);
+}
+
+static void wait_reshape(struct mdinfo *sra)
+{
+	/* Keep calling sysfs_wait() on sync_action for as long as the
+	 * attribute can be read and its value starts with "reshape".
+	 * Returns immediately if the attribute cannot be opened.
+	 */
+	char state[20];
+	int fd;
+
+	fd = sysfs_get_fd(sra, NULL, "sync_action");
+	if (fd < 0)
+		return;
+
+	for (;;) {
+		if (sysfs_fd_get_str(fd, state, 20) <= 0)
+			break;
+		if (strncmp(state, "reshape", 7) != 0)
+			break;
+		sysfs_wait(fd, NULL);
+	}
+	close(fd);
+}
+
+static int reshape_super(struct supertype *st, unsigned long long size,
+			 int level, int layout, int chunksize, int raid_disks,
+			 int delta_disks, char *backup_file, char *dev,
+			 int direction, int verbose)
+{
+	/* Pass the requested change to the metadata handler.
+	 * Native metadata has nothing extra to check, so 0 is returned.
+	 * External metadata must provide both ->reshape_super and
+	 * ->manage_reshape; if either is missing an error is printed and
+	 * 1 is returned, otherwise the handler's own result is returned.
+	 */
+	if (st->ss->external) {
+		if (st->ss->reshape_super && st->ss->manage_reshape)
+			return st->ss->reshape_super(st, size, level, layout,
+						     chunksize, raid_disks,
+						     delta_disks, backup_file,
+						     dev, direction, verbose);
+
+		pr_err("%s metadata does not support reshape\n",
+			st->ss->name);
+		return 1;
+	}
+
+	return 0;
+}
+
+static void sync_metadata(struct supertype *st)
+{
+	/* Only external metadata needs anything done here.  When
+	 * updates are being queued (update_tail is set) they are flushed
+	 * and the tail is pointed back at the head of the list;
+	 * otherwise the handler's ->sync_metadata is called.
+	 */
+	if (!st->ss->external)
+		return;
+
+	if (!st->update_tail) {
+		st->ss->sync_metadata(st);
+		return;
+	}
+
+	flush_metadata_updates(st);
+	st->update_tail = &st->updates;
+}
+
+static int subarray_set_num(char *container, struct mdinfo *sra, char *name, int n)
+{
+	/* when dealing with external metadata subarrays we need to be
+	 * prepared to handle EAGAIN.  The kernel may need to wait for
+	 * mdmon to mark the array active so the kernel can handle
+	 * allocations/writeback when preparing the reshape action
+	 * (md_allow_write()).  We temporarily disable safe_mode_delay
+	 * to close a race with the array_state going clean before the
+	 * next write to raid_disks / stripe_cache_size
+	 *
+	 * Returns the result of the final sysfs_set_num(), or -1 if the
+	 * current safe_mode_delay could not be read.
+	 */
+	char saved_delay[50];
+	int ret;
+
+	/* only 'raid_disks' and 'stripe_cache_size' trigger md_allow_write */
+	if (container == NULL)
+		return sysfs_set_num(sra, NULL, name, n);
+	if (strcmp(name, "raid_disks") != 0 &&
+	    strcmp(name, "stripe_cache_size") != 0)
+		return sysfs_set_num(sra, NULL, name, n);
+
+	if (sysfs_get_str(sra, NULL, "safe_mode_delay",
+			  saved_delay, sizeof(saved_delay)) <= 0)
+		return -1;
+
+	sysfs_set_num(sra, NULL, "safe_mode_delay", 0);
+	ret = sysfs_set_num(sra, NULL, name, n);
+	if (ret < 0 && errno == EAGAIN) {
+		ping_monitor(container);
+		/* if we get EAGAIN here then the monitor is not active
+		 * so stop trying
+		 */
+		ret = sysfs_set_num(sra, NULL, name, n);
+	}
+	/* put back the delay that was in force on entry */
+	sysfs_set_str(sra, NULL, "safe_mode_delay", saved_delay);
+
+	return ret;
+}
+
+int start_reshape(struct mdinfo *sra, int already_running,
+		  int before_data_disks, int data_disks)
+{
+	int err;
+	unsigned long long sync_max_to_set;
+
+	sysfs_set_num(sra, NULL, "suspend_lo", 0x7FFFFFFFFFFFFFFFULL);
+	err = sysfs_set_num(sra, NULL, "suspend_hi", sra->reshape_progress);
+	err = err ?: sysfs_set_num(sra, NULL, "suspend_lo",
+				   sra->reshape_progress);
+	if (before_data_disks <= data_disks)
+		sync_max_to_set = sra->reshape_progress / data_disks;
+	else
+		sync_max_to_set = (sra->component_size * data_disks
+				   - sra->reshape_progress) / data_disks;
+	if (!already_running)
+		sysfs_set_num(sra, NULL, "sync_min", sync_max_to_set);
+	err = err ?: sysfs_set_num(sra, NULL, "sync_max", sync_max_to_set);
+	if (!already_running && err == 0) {
+		int cnt = 5;
+		do {
+			err = sysfs_set_str(sra, NULL, "sync_action", "reshape");
+			if (err)
+				sleep(1);
+		}
while (err && errno == EBUSY && cnt-- > 0); + } + return err; +} + +void abort_reshape(struct mdinfo *sra) +{ + sysfs_set_str(sra, NULL, "sync_action", "idle"); + sysfs_set_num(sra, NULL, "suspend_lo", 0x7FFFFFFFFFFFFFFFULL); + sysfs_set_num(sra, NULL, "suspend_hi", 0); + sysfs_set_num(sra, NULL, "suspend_lo", 0); + sysfs_set_num(sra, NULL, "sync_min", 0); + // It isn't safe to reset sync_max as we aren't monitoring. + // Array really should be stopped at this point. +} + +int remove_disks_for_takeover(struct supertype *st, + struct mdinfo *sra, + int layout) +{ + int nr_of_copies; + struct mdinfo *remaining; + int slot; + + if (sra->array.level == 10) + nr_of_copies = layout & 0xff; + else if (sra->array.level == 1) + nr_of_copies = sra->array.raid_disks; + else + return 1; + + remaining = sra->devs; + sra->devs = NULL; + /* for each 'copy', select one device and remove from the list. */ + for (slot = 0; slot < sra->array.raid_disks; slot += nr_of_copies) { + struct mdinfo **diskp; + int found = 0; + + /* Find a working device to keep */ + for (diskp = &remaining; *diskp ; diskp = &(*diskp)->next) { + struct mdinfo *disk = *diskp; + + if (disk->disk.raid_disk < slot) + continue; + if (disk->disk.raid_disk >= slot + nr_of_copies) + continue; + if (disk->disk.state & (1<disk.state & (1<disk.state & (1<next; + disk->next = sra->devs; + sra->devs = disk; + found = 1; + break; + } + if (!found) + break; + } + + if (slot < sra->array.raid_disks) { + /* didn't find all slots */ + struct mdinfo **e; + e = &remaining; + while (*e) + e = &(*e)->next; + *e = sra->devs; + sra->devs = remaining; + return 1; + } + + /* Remove all 'remaining' devices from the array */ + while (remaining) { + struct mdinfo *sd = remaining; + remaining = sd->next; + + sysfs_set_str(sra, sd, "state", "faulty"); + sysfs_set_str(sra, sd, "slot", "none"); + /* for external metadata disks should be removed in mdmon */ + if (!st->ss->external) + sysfs_set_str(sra, sd, "state", "remove"); + 
sd->disk.state |= (1<disk.state &= ~(1<next = sra->devs; + sra->devs = sd; + } + return 0; +} + +void reshape_free_fdlist(int *fdlist, + unsigned long long *offsets, + int size) +{ + int i; + + for (i = 0; i < size; i++) + if (fdlist[i] >= 0) + close(fdlist[i]); + + free(fdlist); + free(offsets); +} + +int reshape_prepare_fdlist(char *devname, + struct mdinfo *sra, + int raid_disks, + int nrdisks, + unsigned long blocks, + char *backup_file, + int *fdlist, + unsigned long long *offsets) +{ + int d = 0; + struct mdinfo *sd; + + enable_fds(nrdisks); + for (d = 0; d <= nrdisks; d++) + fdlist[d] = -1; + d = raid_disks; + for (sd = sra->devs; sd; sd = sd->next) { + if (sd->disk.state & (1<disk.state & (1<disk.raid_disk < raid_disks) { + char *dn = map_dev(sd->disk.major, + sd->disk.minor, 1); + fdlist[sd->disk.raid_disk] + = dev_open(dn, O_RDONLY); + offsets[sd->disk.raid_disk] = sd->data_offset*512; + if (fdlist[sd->disk.raid_disk] < 0) { + pr_err("%s: cannot open component %s\n", + devname, dn ? dn : "-unknown-"); + d = -1; + goto release; + } + } else if (backup_file == NULL) { + /* spare */ + char *dn = map_dev(sd->disk.major, + sd->disk.minor, 1); + fdlist[d] = dev_open(dn, O_RDWR); + offsets[d] = (sd->data_offset + sra->component_size - blocks - 8)*512; + if (fdlist[d] < 0) { + pr_err("%s: cannot open component %s\n", + devname, dn ? dn : "-unknown-"); + d = -1; + goto release; + } + d++; + } + } +release: + return d; +} + +int reshape_open_backup_file(char *backup_file, + int fd, + char *devname, + long blocks, + int *fdlist, + unsigned long long *offsets, + char *sys_name, + int restart) +{ + /* Return 1 on success, 0 on any form of failure */ + /* need to check backup file is large enough */ + char buf[512]; + struct stat stb; + unsigned int dev; + int i; + + *fdlist = open(backup_file, O_RDWR|O_CREAT|(restart ? 
O_TRUNC : O_EXCL), + S_IRUSR | S_IWUSR); + *offsets = 8 * 512; + if (*fdlist < 0) { + pr_err("%s: cannot create backup file %s: %s\n", + devname, backup_file, strerror(errno)); + return 0; + } + /* Guard against backup file being on array device. + * If array is partitioned or if LVM etc is in the + * way this will not notice, but it is better than + * nothing. + */ + fstat(*fdlist, &stb); + dev = stb.st_dev; + fstat(fd, &stb); + if (stb.st_rdev == dev) { + pr_err("backup file must NOT be on the array being reshaped.\n"); + close(*fdlist); + return 0; + } + + memset(buf, 0, 512); + for (i=0; i < blocks + 8 ; i++) { + if (write(*fdlist, buf, 512) != 512) { + pr_err("%s: cannot create backup file %s: %s\n", + devname, backup_file, strerror(errno)); + return 0; + } + } + if (fsync(*fdlist) != 0) { + pr_err("%s: cannot create backup file %s: %s\n", + devname, backup_file, strerror(errno)); + return 0; + } + + if (!restart && strncmp(backup_file, MAP_DIR, strlen(MAP_DIR)) != 0) { + char *bu = make_backup(sys_name); + if (symlink(backup_file, bu)) + pr_err("Recording backup file in " MAP_DIR " failed: %s\n", + strerror(errno)); + free(bu); + } + + return 1; +} + +unsigned long compute_backup_blocks(int nchunk, int ochunk, + unsigned int ndata, unsigned int odata) +{ + unsigned long a, b, blocks; + /* So how much do we need to backup. + * We need an amount of data which is both a whole number of + * old stripes and a whole number of new stripes. + * So LCM for (chunksize*datadisks). 
+ */ + a = (ochunk/512) * odata; + b = (nchunk/512) * ndata; + /* Find GCD */ + a = GCD(a, b); + /* LCM == product / GCD */ + blocks = (ochunk/512) * (nchunk/512) * odata * ndata / a; + + return blocks; +} + +char *analyse_change(char *devname, struct mdinfo *info, struct reshape *re) +{ + /* Based on the current array state in info->array and + * the changes in info->new_* etc, determine: + * - whether the change is possible + * - Intermediate level/raid_disks/layout + * - whether a restriping reshape is needed + * - number of sectors in minimum change unit. This + * will cover a whole number of stripes in 'before' and + * 'after'. + * + * Return message if the change should be rejected + * NULL if the change can be achieved + * + * This can be called as part of starting a reshape, or + * when assembling an array that is undergoing reshape. + */ + int near, far, offset, copies; + int new_disks; + int old_chunk, new_chunk; + /* delta_parity records change in number of devices + * caused by level change + */ + int delta_parity = 0; + + memset(re, 0, sizeof(*re)); + + /* If a new level not explicitly given, we assume no-change */ + if (info->new_level == UnSet) + info->new_level = info->array.level; + + if (info->new_chunk) + switch (info->new_level) { + case 0: + case 4: + case 5: + case 6: + case 10: + /* chunk size is meaningful, must divide component_size + * evenly + */ + if (info->component_size % (info->new_chunk/512)) { + unsigned long long shrink = info->component_size; + shrink &= ~(unsigned long long)(info->new_chunk/512-1); + pr_err("New chunk size (%dK) does not evenly divide device size (%lluk)\n", + info->new_chunk/1024, info->component_size/2); + pr_err("After shrinking any filesystem, \"mdadm --grow %s --size %llu\"\n", + devname, shrink/2); + pr_err("will shrink the array so the given chunk size would work.\n"); + return ""; + } + break; + default: + return "chunk size not meaningful for this level"; + } + else + info->new_chunk = 
info->array.chunk_size; + + switch (info->array.level) { + default: + return "No reshape is possibly for this RAID level"; + case LEVEL_LINEAR: + if (info->delta_disks != UnSet) + return "Only --add is supported for LINEAR, setting --raid-disks is not needed"; + else + return "Only --add is supported for LINEAR, other --grow options are not meaningful"; + case 1: + /* RAID1 can convert to RAID1 with different disks, or + * raid5 with 2 disks, or + * raid0 with 1 disk + */ + if (info->new_level > 1 && + (info->component_size & 7)) + return "Cannot convert RAID1 of this size - reduce size to multiple of 4K first."; + if (info->new_level == 0) { + if (info->delta_disks != UnSet && + info->delta_disks != 0) + return "Cannot change number of disks with RAID1->RAID0 conversion"; + re->level = 0; + re->before.data_disks = 1; + re->after.data_disks = 1; + return NULL; + } + if (info->new_level == 1) { + if (info->delta_disks == UnSet) + /* Don't know what to do */ + return "no change requested for Growing RAID1"; + re->level = 1; + return NULL; + } + if (info->array.raid_disks == 2 && + info->new_level == 5) { + + re->level = 5; + re->before.data_disks = 1; + if (info->delta_disks != UnSet && + info->delta_disks != 0) + re->after.data_disks = 1 + info->delta_disks; + else + re->after.data_disks = 1; + if (re->after.data_disks < 1) + return "Number of disks too small for RAID5"; + + re->before.layout = ALGORITHM_LEFT_SYMMETRIC; + info->array.chunk_size = 65536; + break; + } + /* Could do some multi-stage conversions, but leave that to + * later. + */ + return "Impossibly level change request for RAID1"; + + case 10: + /* RAID10 can be converted from near mode to + * RAID0 by removing some devices. + * It can also be reshaped if the kernel supports + * new_data_offset. 
+ */ + switch (info->new_level) { + case 0: + if ((info->array.layout & ~0xff) != 0x100) + return "Cannot Grow RAID10 with far/offset layout"; + /* number of devices must be multiple of number of copies */ + if (info->array.raid_disks % (info->array.layout & 0xff)) + return "RAID10 layout too complex for Grow operation"; + + new_disks = (info->array.raid_disks + / (info->array.layout & 0xff)); + if (info->delta_disks == UnSet) + info->delta_disks = (new_disks + - info->array.raid_disks); + + if (info->delta_disks != new_disks - info->array.raid_disks) + return "New number of raid-devices impossible for RAID10"; + if (info->new_chunk && + info->new_chunk != info->array.chunk_size) + return "Cannot change chunk-size with RAID10 Grow"; + + /* looks good */ + re->level = 0; + re->before.data_disks = new_disks; + re->after.data_disks = re->before.data_disks; + return NULL; + + case 10: + near = info->array.layout & 0xff; + far = (info->array.layout >> 8) & 0xff; + offset = info->array.layout & 0x10000; + if (far > 1 && !offset) + return "Cannot reshape RAID10 in far-mode"; + copies = near * far; + + old_chunk = info->array.chunk_size * far; + + if (info->new_layout == UnSet) + info->new_layout = info->array.layout; + else { + near = info->new_layout & 0xff; + far = (info->new_layout >> 8) & 0xff; + offset = info->new_layout & 0x10000; + if (far > 1 && !offset) + return "Cannot reshape RAID10 to far-mode"; + if (near * far != copies) + return "Cannot change number of copies when reshaping RAID10"; + } + if (info->delta_disks == UnSet) + info->delta_disks = 0; + new_disks = (info->array.raid_disks + + info->delta_disks); + + new_chunk = info->new_chunk * far; + + re->level = 10; + re->before.layout = info->array.layout; + re->before.data_disks = info->array.raid_disks; + re->after.layout = info->new_layout; + re->after.data_disks = new_disks; + /* For RAID10 we don't do backup but do allow reshape, + * so set backup_blocks to INVALID_SECTORS rather than + * zero. 
+ * And there is no need to synchronise stripes on both + * 'old' and 'new'. So the important + * number is the minimum data_offset difference + * which is the larger of (offset copies * chunk). + */ + re->backup_blocks = INVALID_SECTORS; + re->min_offset_change = max(old_chunk, new_chunk) / 512; + if (new_disks < re->before.data_disks && + info->space_after < re->min_offset_change) + /* Reduce component size by one chunk */ + re->new_size = (info->component_size - + re->min_offset_change); + else + re->new_size = info->component_size; + re->new_size = re->new_size * new_disks / copies; + return NULL; + + default: + return "RAID10 can only be changed to RAID0"; + } + case 0: + /* RAID0 can be converted to RAID10, or to RAID456 */ + if (info->new_level == 10) { + if (info->new_layout == UnSet && info->delta_disks == UnSet) { + /* Assume near=2 layout */ + info->new_layout = 0x102; + info->delta_disks = info->array.raid_disks; + } + if (info->new_layout == UnSet) { + int copies = 1 + (info->delta_disks + / info->array.raid_disks); + if (info->array.raid_disks * (copies-1) + != info->delta_disks) + return "Impossible number of devices for RAID0->RAID10"; + info->new_layout = 0x100 + copies; + } + if (info->delta_disks == UnSet) { + int copies = info->new_layout & 0xff; + if (info->new_layout != 0x100 + copies) + return "New layout impossible for RAID0->RAID10";; + info->delta_disks = (copies - 1) * + info->array.raid_disks; + } + if (info->new_chunk && + info->new_chunk != info->array.chunk_size) + return "Cannot change chunk-size with RAID0->RAID10"; + /* looks good */ + re->level = 10; + re->before.data_disks = (info->array.raid_disks + + info->delta_disks); + re->after.data_disks = re->before.data_disks; + re->before.layout = info->new_layout; + return NULL; + } + + /* RAID0 can also covert to RAID0/4/5/6 by first converting to + * a raid4 style layout of the final level. 
+ */ + switch (info->new_level) { + case 4: + delta_parity = 1; + case 0: + re->level = 4; + re->before.layout = 0; + break; + case 5: + delta_parity = 1; + re->level = 5; + re->before.layout = ALGORITHM_PARITY_N; + if (info->new_layout == UnSet) + info->new_layout = map_name(r5layout, "default"); + break; + case 6: + delta_parity = 2; + re->level = 6; + re->before.layout = ALGORITHM_PARITY_N; + if (info->new_layout == UnSet) + info->new_layout = map_name(r6layout, "default"); + break; + default: + return "Impossible level change requested"; + } + re->before.data_disks = info->array.raid_disks; + /* determining 'after' layout happens outside this 'switch' */ + break; + + case 4: + info->array.layout = ALGORITHM_PARITY_N; + case 5: + switch (info->new_level) { + case 0: + delta_parity = -1; + case 4: + re->level = info->array.level; + re->before.data_disks = info->array.raid_disks - 1; + re->before.layout = info->array.layout; + break; + case 5: + re->level = 5; + re->before.data_disks = info->array.raid_disks - 1; + re->before.layout = info->array.layout; + break; + case 6: + delta_parity = 1; + re->level = 6; + re->before.data_disks = info->array.raid_disks - 1; + switch (info->array.layout) { + case ALGORITHM_LEFT_ASYMMETRIC: + re->before.layout = ALGORITHM_LEFT_ASYMMETRIC_6; + break; + case ALGORITHM_RIGHT_ASYMMETRIC: + re->before.layout = ALGORITHM_RIGHT_ASYMMETRIC_6; + break; + case ALGORITHM_LEFT_SYMMETRIC: + re->before.layout = ALGORITHM_LEFT_SYMMETRIC_6; + break; + case ALGORITHM_RIGHT_SYMMETRIC: + re->before.layout = ALGORITHM_RIGHT_SYMMETRIC_6; + break; + case ALGORITHM_PARITY_0: + re->before.layout = ALGORITHM_PARITY_0_6; + break; + case ALGORITHM_PARITY_N: + re->before.layout = ALGORITHM_PARITY_N_6; + break; + default: + return "Cannot convert an array with this layout"; + } + break; + case 1: + if (info->array.raid_disks != 2) + return "Can only convert a 2-device array to RAID1"; + if (info->delta_disks != UnSet && + info->delta_disks != 0) + return 
"Cannot set raid_disk when converting RAID5->RAID1"; + re->level = 1; + info->new_chunk = 0; + return NULL; + default: + return "Impossible level change requested"; + } + break; + case 6: + switch (info->new_level) { + case 4: + case 5: + delta_parity = -1; + case 6: + re->level = 6; + re->before.data_disks = info->array.raid_disks - 2; + re->before.layout = info->array.layout; + break; + default: + return "Impossible level change requested"; + } + break; + } + + /* If we reached here then it looks like a re-stripe is + * happening. We have determined the intermediate level + * and initial raid_disks/layout and stored these in 're'. + * + * We need to deduce the final layout that can be atomically + * converted to the end state. + */ + switch (info->new_level) { + case 0: + /* We can only get to RAID0 from RAID4 or RAID5 + * with appropriate layout and one extra device + */ + if (re->level != 4 && re->level != 5) + return "Cannot covert to RAID0 from this level"; + + switch (re->level) { + case 4: + re->before.layout = 0; + re->after.layout = 0; + break; + case 5: + re->after.layout = ALGORITHM_PARITY_N; + break; + } + break; + + case 4: + /* We can only get to RAID4 from RAID5 */ + if (re->level != 4 && re->level != 5) + return "Cannot convert to RAID4 from this level"; + + switch (re->level) { + case 4: + re->after.layout = 0; + break; + case 5: + re->after.layout = ALGORITHM_PARITY_N; + break; + } + break; + + case 5: + /* We get to RAID5 from RAID5 or RAID6 */ + if (re->level != 5 && re->level != 6) + return "Cannot convert to RAID5 from this level"; + + switch (re->level) { + case 5: + if (info->new_layout == UnSet) + re->after.layout = re->before.layout; + else + re->after.layout = info->new_layout; + break; + case 6: + if (info->new_layout == UnSet) + info->new_layout = re->before.layout; + + /* after.layout needs to be raid6 version of new_layout */ + if (info->new_layout == ALGORITHM_PARITY_N) + re->after.layout = ALGORITHM_PARITY_N; + else { + char 
layout[40]; + char *ls = map_num(r5layout, info->new_layout); + int l; + if (ls) { + /* Current RAID6 layout has a RAID5 + * equivalent - good + */ + strcat(strcpy(layout, ls), "-6"); + l = map_name(r6layout, layout); + if (l == UnSet) + return "Cannot find RAID6 layout to convert to"; + } else { + /* Current RAID6 has no equivalent. + * If it is already a '-6' layout we + * can leave it unchanged, else we must + * fail + */ + ls = map_num(r6layout, info->new_layout); + if (!ls || + strcmp(ls+strlen(ls)-2, "-6") != 0) + return "Please specify new layout"; + l = info->new_layout; + } + re->after.layout = l; + } + } + break; + + case 6: + /* We must already be at level 6 */ + if (re->level != 6) + return "Impossible level change"; + if (info->new_layout == UnSet) + re->after.layout = info->array.layout; + else + re->after.layout = info->new_layout; + break; + default: + return "Impossible level change requested"; + } + if (info->delta_disks == UnSet) + info->delta_disks = delta_parity; + + re->after.data_disks = (re->before.data_disks + + info->delta_disks + - delta_parity); + switch (re->level) { + case 6: re->parity = 2; + break; + case 4: + case 5: re->parity = 1; + break; + default: re->parity = 0; + break; + } + /* So we have a restripe operation, we need to calculate the number + * of blocks per reshape operation. + */ + re->new_size = info->component_size * re->before.data_disks; + if (info->new_chunk == 0) + info->new_chunk = info->array.chunk_size; + if (re->after.data_disks == re->before.data_disks && + re->after.layout == re->before.layout && + info->new_chunk == info->array.chunk_size) { + /* Nothing to change, can change level immediately. 
*/
+		re->level = info->new_level;
+		re->backup_blocks = 0;
+		return NULL;
+	}
+	if (re->after.data_disks == 1 && re->before.data_disks == 1) {
+		/* chunk and layout changes make no difference */
+		re->level = info->new_level;
+		re->backup_blocks = 0;
+		return NULL;
+	}
+
+	if (re->after.data_disks == re->before.data_disks &&
+	    get_linux_version() < 2006032)
+		return "in-place reshape is not safe before 2.6.32 - sorry.";
+
+	if (re->after.data_disks < re->before.data_disks &&
+	    get_linux_version() < 2006030)
+		return "reshape to fewer devices is not supported before 2.6.30 - sorry.";
+
+	re->backup_blocks = compute_backup_blocks(
+		info->new_chunk, info->array.chunk_size,
+		re->after.data_disks,
+		re->before.data_disks);
+	re->min_offset_change = re->backup_blocks / re->before.data_disks;
+
+	re->new_size = info->component_size * re->after.data_disks;
+	return NULL;
+}
+
+static int set_array_size(struct supertype *st, struct mdinfo *sra,
+			  char *text_version)
+{
+	/* Raise the "array_size" sysfs attribute of 'sra' to the size the
+	 * metadata reports for the subarray named in 'text_version'
+	 * (sra->text_version is used when that argument is NULL).
+	 * The attribute is only written when the new value is larger
+	 * than the one currently there.
+	 *
+	 * Returns 0 if the attribute was updated, -1 in every other case
+	 * (missing arguments, no subarray in the version string, no
+	 * content from the metadata handler, size not larger, or the
+	 * write failed).
+	 */
+	struct mdinfo *info;
+	char *sep;
+	int ret_val = -1;
+
+	if ((st == NULL) || (sra == NULL))
+		return ret_val;
+
+	if (text_version == NULL)
+		text_version = sra->text_version;
+
+	/* The subarray name is whatever follows the first '/' found after
+	 * the leading character.  Without one there is nothing to look
+	 * up, and strchr() + 1 would not be a usable pointer.
+	 */
+	sep = strchr(text_version+1, '/');
+	if (sep == NULL) {
+		dprintf("Error: set_array_size(): no subarray in version string\n");
+		return ret_val;
+	}
+
+	info = st->ss->container_content(st, sep + 1);
+	if (info) {
+		unsigned long long current_size = 0;
+		/* NOTE(review): the halving presumably converts sectors to
+		 * the unit array_size uses - confirm.
+		 */
+		unsigned long long new_size =
+			info->custom_array_size/2;
+
+		if (sysfs_get_ll(sra, NULL, "array_size", &current_size) == 0 &&
+		    new_size > current_size) {
+			if (sysfs_set_num(sra, NULL, "array_size", new_size)
+					< 0)
+				dprintf("Error: Cannot set array size");
+			else {
+				ret_val = 0;
+				dprintf("Array size changed");
+			}
+			dprintf_cont(" from %llu to %llu.\n",
+				     current_size, new_size);
+		}
+		sysfs_free(info);
+	} else
+		dprintf("Error: set_array_size(): info pointer in NULL\n");
+
+	return ret_val;
+}
+
+static int reshape_array(char *container, int fd, char *devname,
+			 struct supertype *st, struct mdinfo *info,
+			 int force, struct mddev_dev *devlist,
+			 unsigned long long data_offset,
+ char *backup_file, int verbose, int forked, + int restart, int freeze_reshape); +static int reshape_container(char *container, char *devname, + int mdfd, + struct supertype *st, + struct mdinfo *info, + int force, + char *backup_file, int verbose, + int forked, int restart, int freeze_reshape); + +int Grow_reshape(char *devname, int fd, + struct mddev_dev *devlist, + unsigned long long data_offset, + struct context *c, struct shape *s) +{ + /* Make some changes in the shape of an array. + * The kernel must support the change. + * + * There are three different changes. Each can trigger + * a resync or recovery so we freeze that until we have + * requested everything (if kernel supports freezing - 2.6.30). + * The steps are: + * - change size (i.e. component_size) + * - change level + * - change layout/chunksize/ndisks + * + * The last can require a reshape. It is different on different + * levels so we need to check the level before actioning it. + * Some times the level change needs to be requested after the + * reshape (e.g. 
raid6->raid5, raid5->raid0) + * + */ + struct mdu_array_info_s array; + int rv = 0; + struct supertype *st; + char *subarray = NULL; + + int frozen; + int changed = 0; + char *container = NULL; + int cfd = -1; + + struct mddev_dev *dv; + int added_disks; + + struct mdinfo info; + struct mdinfo *sra; + + if (ioctl(fd, GET_ARRAY_INFO, &array) < 0) { + pr_err("%s is not an active md array - aborting\n", + devname); + return 1; + } + if (data_offset != INVALID_SECTORS && array.level != 10 + && (array.level < 4 || array.level > 6)) { + pr_err("--grow --data-offset not yet supported\n"); + return 1; + } + + if (s->size > 0 && + (s->chunk || s->level!= UnSet || s->layout_str || s->raiddisks)) { + pr_err("cannot change component size at the same time as other changes.\n" + " Change size first, then check data is intact before making other changes.\n"); + return 1; + } + + if (s->raiddisks && s->raiddisks < array.raid_disks && array.level > 1 && + get_linux_version() < 2006032 && + !check_env("MDADM_FORCE_FEWER")) { + pr_err("reducing the number of devices is not safe before Linux 2.6.32\n" + " Please use a newer kernel\n"); + return 1; + } + + st = super_by_fd(fd, &subarray); + if (!st) { + pr_err("Unable to determine metadata format for %s\n", devname); + return 1; + } + if (s->raiddisks > st->max_devs) { + pr_err("Cannot increase raid-disks on this array beyond %d\n", st->max_devs); + return 1; + } + if (s->level == 0 && + (array.state & (1<ss->external) { + int rv; + + if (subarray) { + container = st->container_devnm; + cfd = open_dev_excl(st->container_devnm); + } else { + container = st->devnm; + close(fd); + cfd = open_dev_excl(st->devnm); + fd = cfd; + } + if (cfd < 0) { + pr_err("Unable to open container for %s\n", + devname); + free(subarray); + return 1; + } + + rv = st->ss->load_container(st, cfd, NULL); + + if (rv) { + pr_err("Cannot read superblock for %s\n", + devname); + free(subarray); + return 1; + } + + /* check if operation is supported for metadata 
handler */ + if (st->ss->container_content) { + struct mdinfo *cc = NULL; + struct mdinfo *content = NULL; + + cc = st->ss->container_content(st, subarray); + for (content = cc; content ; content = content->next) { + int allow_reshape = 1; + + /* check if reshape is allowed based on metadata + * indications stored in content.array.status + */ + if (content->array.state & (1<array.state + & (1<update_tail = &st->updates; + } + + added_disks = 0; + for (dv = devlist; dv; dv = dv->next) + added_disks++; + if (s->raiddisks > array.raid_disks && + array.spare_disks +added_disks < (s->raiddisks - array.raid_disks) && + !c->force) { + pr_err("Need %d spare%s to avoid degraded array, and only have %d.\n" + " Use --force to over-ride this check.\n", + s->raiddisks - array.raid_disks, + s->raiddisks - array.raid_disks == 1 ? "" : "s", + array.spare_disks + added_disks); + return 1; + } + + sra = sysfs_read(fd, NULL, GET_LEVEL | GET_DISKS | GET_DEVS + | GET_STATE | GET_VERSION); + if (sra) { + if (st->ss->external && subarray == NULL) { + array.level = LEVEL_CONTAINER; + sra->array.level = LEVEL_CONTAINER; + } + } else { + pr_err("failed to read sysfs parameters for %s\n", + devname); + return 1; + } + frozen = freeze(st); + if (frozen < -1) { + /* freeze() already spewed the reason */ + sysfs_free(sra); + return 1; + } else if (frozen < 0) { + pr_err("%s is performing resync/recovery and cannot be reshaped\n", devname); + sysfs_free(sra); + return 1; + } + + /* ========= set size =============== */ + if (s->size > 0 && (s->size == MAX_SIZE || s->size != (unsigned)array.size)) { + unsigned long long orig_size = get_component_size(fd)/2; + unsigned long long min_csize; + struct mdinfo *mdi; + int raid0_takeover = 0; + + if (orig_size == 0) + orig_size = (unsigned) array.size; + + if (orig_size == 0) { + pr_err("Cannot set device size in this type of array.\n"); + rv = 1; + goto release; + } + + if (reshape_super(st, s->size, UnSet, UnSet, 0, 0, UnSet, NULL, + devname, 
APPLY_METADATA_CHANGES, c->verbose > 0)) { + rv = 1; + goto release; + } + sync_metadata(st); + if (st->ss->external) { + /* metadata can have size limitation + * update size value according to metadata information + */ + struct mdinfo *sizeinfo = + st->ss->container_content(st, subarray); + if (sizeinfo) { + unsigned long long new_size = + sizeinfo->custom_array_size/2; + int data_disks = get_data_disks( + sizeinfo->array.level, + sizeinfo->array.layout, + sizeinfo->array.raid_disks); + new_size /= data_disks; + dprintf("Metadata size correction from %llu to %llu (%llu)\n", orig_size, new_size, + new_size * data_disks); + s->size = new_size; + sysfs_free(sizeinfo); + } + } + + /* Update the size of each member device in case + * they have been resized. This will never reduce + * below the current used-size. The "size" attribute + * understands '0' to mean 'max'. + */ + min_csize = 0; + rv = 0; + for (mdi = sra->devs; mdi; mdi = mdi->next) { + if (sysfs_set_num(sra, mdi, "size", + s->size == MAX_SIZE ? 0 : s->size) < 0) { + /* Probably kernel refusing to let us + * reduce the size - not an error. 
+ */ + break; + } + if (array.not_persistent == 0 && + array.major_version == 0 && + get_linux_version() < 3001000) { + /* Dangerous to allow size to exceed 2TB */ + unsigned long long csize; + if (sysfs_get_ll(sra, mdi, "size", &csize) == 0) { + if (csize >= 2ULL*1024*1024*1024) + csize = 2ULL*1024*1024*1024; + if ((min_csize == 0 || (min_csize + > csize))) + min_csize = csize; + } + } + } + if (rv) { + pr_err("Cannot set size on array members.\n"); + goto size_change_error; + } + if (min_csize && s->size > min_csize) { + pr_err("Cannot safely make this array use more than 2TB per device on this kernel.\n"); + rv = 1; + goto size_change_error; + } + if (min_csize && s->size == MAX_SIZE) { + /* Don't let the kernel choose a size - it will get + * it wrong + */ + pr_err("Limited v0.90 array to 2TB per device\n"); + s->size = min_csize; + } + if (st->ss->external) { + if (sra->array.level == 0) { + rv = sysfs_set_str(sra, NULL, "level", + "raid5"); + if (!rv) { + raid0_takeover = 1; + /* get array parameters after takeover + * to change one parameter at time only + */ + rv = ioctl(fd, GET_ARRAY_INFO, &array); + } + } + /* make sure mdmon is + * aware of the new level */ + if (!mdmon_running(st->container_devnm)) + start_mdmon(st->container_devnm); + ping_monitor(container); + if (mdmon_running(st->container_devnm) && + st->update_tail == NULL) + st->update_tail = &st->updates; + } + + if (s->size == MAX_SIZE) + s->size = 0; + array.size = s->size; + if (s->size & ~INT32_MAX) { + /* got truncated to 32bit, write to + * component_size instead + */ + if (sra) + rv = sysfs_set_num(sra, NULL, + "component_size", s->size); + else + rv = -1; + } else { + rv = ioctl(fd, SET_ARRAY_INFO, &array); + + /* manage array size when it is managed externally + */ + if ((rv == 0) && st->ss->external) + rv = set_array_size(st, sra, sra->text_version); + } + + if (raid0_takeover) { + /* do not recync non-existing parity, + * we will drop it anyway + */ + sysfs_set_str(sra, NULL, 
"sync_action", "frozen"); + /* go back to raid0, drop parity disk + */ + sysfs_set_str(sra, NULL, "level", "raid0"); + ioctl(fd, GET_ARRAY_INFO, &array); + } + +size_change_error: + if (rv != 0) { + int err = errno; + + /* restore metadata */ + if (reshape_super(st, orig_size, UnSet, UnSet, 0, 0, + UnSet, NULL, devname, + ROLLBACK_METADATA_CHANGES, + c->verbose) == 0) + sync_metadata(st); + pr_err("Cannot set device size for %s: %s\n", + devname, strerror(err)); + if (err == EBUSY && + (array.state & (1<assume_clean) { + /* This will fail on kernels older than 3.0 unless + * a backport has been arranged. + */ + if (sra == NULL || + sysfs_set_str(sra, NULL, "resync_start", "none") < 0) + pr_err("--assume-clean not supported with --grow on this kernel\n"); + } + ioctl(fd, GET_ARRAY_INFO, &array); + s->size = get_component_size(fd)/2; + if (s->size == 0) + s->size = array.size; + if (c->verbose >= 0) { + if (s->size == orig_size) + pr_err("component size of %s unchanged at %lluK\n", + devname, s->size); + else + pr_err("component size of %s has been set to %lluK\n", + devname, s->size); + } + changed = 1; + } else if (array.level != LEVEL_CONTAINER) { + s->size = get_component_size(fd)/2; + if (s->size == 0) + s->size = array.size; + } + + /* See if there is anything else to do */ + if ((s->level == UnSet || s->level == array.level) && + (s->layout_str == NULL) && + (s->chunk == 0 || s->chunk == array.chunk_size) && + data_offset == INVALID_SECTORS && + (s->raiddisks == 0 || s->raiddisks == array.raid_disks)) { + /* Nothing more to do */ + if (!changed && c->verbose >= 0) + pr_err("%s: no change requested\n", + devname); + goto release; + } + + /* ========= check for Raid10/Raid1 -> Raid0 conversion =============== + * current implementation assumes that following conditions must be met: + * - RAID10: + * - far_copies == 1 + * - near_copies == 2 + */ + if ((s->level == 0 && array.level == 10 && sra && + array.layout == ((1 << 8) + 2) && !(array.raid_disks & 1)) || + 
(s->level == 0 && array.level == 1 && sra)) { + int err; + err = remove_disks_for_takeover(st, sra, array.layout); + if (err) { + dprintf("Array cannot be reshaped\n"); + if (cfd > -1) + close(cfd); + rv = 1; + goto release; + } + /* Make sure mdmon has seen the device removal + * and updated metadata before we continue with + * level change + */ + if (container) + ping_monitor(container); + } + + memset(&info, 0, sizeof(info)); + info.array = array; + sysfs_init(&info, fd, NULL); + strcpy(info.text_version, sra->text_version); + info.component_size = s->size*2; + info.new_level = s->level; + info.new_chunk = s->chunk * 1024; + if (info.array.level == LEVEL_CONTAINER) { + info.delta_disks = UnSet; + info.array.raid_disks = s->raiddisks; + } else if (s->raiddisks) + info.delta_disks = s->raiddisks - info.array.raid_disks; + else + info.delta_disks = UnSet; + if (s->layout_str == NULL) { + info.new_layout = UnSet; + if (info.array.level == 6 && + (info.new_level == 6 || info.new_level == UnSet) && + info.array.layout >= 16) { + pr_err("%s has a non-standard layout. If you wish to preserve this\n", devname); + cont_err("during the reshape, please specify --layout=preserve\n"); + cont_err("If you want to change it, specify a layout or use --layout=normalise\n"); + rv = 1; + goto release; + } + } else if (strcmp(s->layout_str, "normalise") == 0 || + strcmp(s->layout_str, "normalize") == 0) { + /* If we have a -6 RAID6 layout, remove the '-6'. */ + info.new_layout = UnSet; + if (info.array.level == 6 && info.new_level == UnSet) { + char l[40], *h; + strcpy(l, map_num(r6layout, info.array.layout)); + h = strrchr(l, '-'); + if (h && strcmp(h, "-6") == 0) { + *h = 0; + info.new_layout = map_name(r6layout, l); + } + } else { + pr_err("%s is only meaningful when reshaping a RAID6 array.\n", s->layout_str); + rv = 1; + goto release; + } + } else if (strcmp(s->layout_str, "preserve") == 0) { + /* This means that a non-standard RAID6 layout + * is OK. 
+ * In particular: + * - When reshape a RAID6 (e.g. adding a device) + * which is in a non-standard layout, it is OK + * to preserve that layout. + * - When converting a RAID5 to RAID6, leave it in + * the XXX-6 layout, don't re-layout. + */ + if (info.array.level == 6 && info.new_level == UnSet) + info.new_layout = info.array.layout; + else if (info.array.level == 5 && info.new_level == 6) { + char l[40]; + strcpy(l, map_num(r5layout, info.array.layout)); + strcat(l, "-6"); + info.new_layout = map_name(r6layout, l); + } else { + pr_err("%s in only meaningful when reshaping to RAID6\n", s->layout_str); + rv = 1; + goto release; + } + } else { + int l = info.new_level; + if (l == UnSet) + l = info.array.level; + switch (l) { + case 5: + info.new_layout = map_name(r5layout, s->layout_str); + break; + case 6: + info.new_layout = map_name(r6layout, s->layout_str); + break; + case 10: + info.new_layout = parse_layout_10(s->layout_str); + break; + case LEVEL_FAULTY: + info.new_layout = parse_layout_faulty(s->layout_str); + break; + default: + pr_err("layout not meaningful with this level\n"); + rv = 1; + goto release; + } + if (info.new_layout == UnSet) { + pr_err("layout %s not understood for this level\n", + s->layout_str); + rv = 1; + goto release; + } + } + + if (array.level == LEVEL_FAULTY) { + if (s->level != UnSet && s->level != array.level) { + pr_err("cannot change level of Faulty device\n"); + rv =1 ; + } + if (s->chunk) { + pr_err("cannot set chunksize of Faulty device\n"); + rv =1 ; + } + if (s->raiddisks && s->raiddisks != 1) { + pr_err("cannot set raid_disks of Faulty device\n"); + rv =1 ; + } + if (s->layout_str) { + if (ioctl(fd, GET_ARRAY_INFO, &array) != 0) { + dprintf("Cannot get array information.\n"); + goto release; + } + array.layout = info.new_layout; + if (ioctl(fd, SET_ARRAY_INFO, &array) != 0) { + pr_err("failed to set new layout\n"); + rv = 1; + } else if (c->verbose >= 0) + printf("layout for %s set to %d\n", + devname, array.layout); + } + } 
else if (array.level == LEVEL_CONTAINER) { + /* This change is to be applied to every array in the + * container. This is only needed when the metadata imposes + * restraints of the various arrays in the container. + * Currently we only know that IMSM requires all arrays + * to have the same number of devices so changing the + * number of devices (On-Line Capacity Expansion) must be + * performed at the level of the container + */ + rv = reshape_container(container, devname, -1, st, &info, + c->force, c->backup_file, c->verbose, 0, 0, 0); + frozen = 0; + } else { + /* get spare devices from external metadata + */ + if (st->ss->external) { + struct mdinfo *info2; + + info2 = st->ss->container_content(st, subarray); + if (info2) { + info.array.spare_disks = + info2->array.spare_disks; + sysfs_free(info2); + } + } + + /* Impose these changes on a single array. First + * check that the metadata is OK with the change. */ + + if (reshape_super(st, 0, info.new_level, + info.new_layout, info.new_chunk, + info.array.raid_disks, info.delta_disks, + c->backup_file, devname, APPLY_METADATA_CHANGES, + c->verbose)) { + rv = 1; + goto release; + } + sync_metadata(st); + rv = reshape_array(container, fd, devname, st, &info, c->force, + devlist, data_offset, c->backup_file, c->verbose, + 0, 0, 0); + frozen = 0; + } +release: + sysfs_free(sra); + if (frozen > 0) + unfreeze(st); + return rv; +} + +/* verify_reshape_position() + * Function checks if reshape position in metadata is not farther + * than position in md. 
+ * Return value: + * 0 : not valid sysfs entry + * it can be caused by not started reshape, it should be started + * by reshape array or raid0 array is before takeover + * -1 : error, reshape position is obviously wrong + * 1 : success, reshape progress correct or updated +*/ +static int verify_reshape_position(struct mdinfo *info, int level) +{ + int ret_val = 0; + char buf[40]; + int rv; + + /* read sync_max, failure can mean raid0 array */ + rv = sysfs_get_str(info, NULL, "sync_max", buf, 40); + + if (rv > 0) { + char *ep; + unsigned long long position = strtoull(buf, &ep, 0); + + dprintf("Read sync_max sysfs entry is: %s\n", buf); + if (!(ep == buf || (*ep != 0 && *ep != '\n' && *ep != ' '))) { + position *= get_data_disks(level, + info->new_layout, + info->array.raid_disks); + if (info->reshape_progress < position) { + dprintf("Corrected reshape progress (%llu) to md position (%llu)\n", + info->reshape_progress, position); + info->reshape_progress = position; + ret_val = 1; + } else if (info->reshape_progress > position) { + pr_err("Fatal error: array reshape was not properly frozen (expected reshape position is %llu, but reshape progress is %llu.\n", + position, info->reshape_progress); + ret_val = -1; + } else { + dprintf("Reshape position in md and metadata are the same;"); + ret_val = 1; + } + } + } else if (rv == 0) { + /* for valid sysfs entry, 0-length content + * should be indicated as error + */ + ret_val = -1; + } + + return ret_val; +} + +static unsigned long long choose_offset(unsigned long long lo, + unsigned long long hi, + unsigned long long min, + unsigned long long max) +{ + /* Choose a new offset between hi and lo. + * It must be between min and max, but + * we would prefer something near the middle of hi/lo, and also + * prefer to be aligned to a big power of 2. 
+ * + * So we start with the middle, then for each bit, + * starting at '1' and increasing, if it is set, we either + * add it or subtract it if possible, preferring the option + * which is furthest from the boundary. + * + * We stop once we get a 1MB alignment. As units are in sectors, + * 1MB = 2*1024 sectors. + */ + unsigned long long choice = (lo + hi) / 2; + unsigned long long bit = 1; + + for (bit = 1; bit < 2*1024; bit = bit << 1) { + unsigned long long bigger, smaller; + if (! (bit & choice)) + continue; + bigger = choice + bit; + smaller = choice - bit; + if (bigger > max && smaller < min) + break; + if (bigger > max) + choice = smaller; + else if (smaller < min) + choice = bigger; + else if (hi - bigger > smaller - lo) + choice = bigger; + else + choice = smaller; + } + return choice; +} + +static int set_new_data_offset(struct mdinfo *sra, struct supertype *st, + char *devname, int delta_disks, + unsigned long long data_offset, + unsigned long long min, + int can_fallback) +{ + struct mdinfo *sd; + int dir = 0; + int err = 0; + unsigned long long before, after; + + /* Need to find min space before and after so same is used + * on all devices + */ + before = UINT64_MAX; + after = UINT64_MAX; + for (sd = sra->devs; sd; sd = sd->next) { + char *dn; + int dfd; + int rv; + struct supertype *st2; + struct mdinfo info2; + + if (sd->disk.state & (1<disk.major, sd->disk.minor, 0); + dfd = dev_open(dn, O_RDONLY); + if (dfd < 0) { + pr_err("%s: cannot open component %s\n", + devname, dn ? 
dn : "-unknown-"); + goto release; + } + st2 = dup_super(st); + rv = st2->ss->load_super(st2,dfd, NULL); + close(dfd); + if (rv) { + free(st2); + pr_err("%s: cannot get superblock from %s\n", + devname, dn); + goto release; + } + st2->ss->getinfo_super(st2, &info2, NULL); + st2->ss->free_super(st2); + free(st2); + if (info2.space_before == 0 && + info2.space_after == 0) { + /* Metadata doesn't support data_offset changes */ + if (!can_fallback) + pr_err("%s: Metadata version doesn't support data_offset changes\n", + devname); + goto fallback; + } + if (before > info2.space_before) + before = info2.space_before; + if (after > info2.space_after) + after = info2.space_after; + + if (data_offset != INVALID_SECTORS) { + if (dir == 0) { + if (info2.data_offset == data_offset) { + pr_err("%s: already has that data_offset\n", + dn); + goto release; + } + if (data_offset < info2.data_offset) + dir = -1; + else + dir = 1; + } else if ((data_offset <= info2.data_offset && dir == 1) || + (data_offset >= info2.data_offset && dir == -1)) { + pr_err("%s: differing data offsets on devices make this --data-offset setting impossible\n", + dn); + goto release; + } + } + } + if (before == UINT64_MAX) + /* impossible really, there must be no devices */ + return 1; + + for (sd = sra->devs; sd; sd = sd->next) { + char *dn = map_dev(sd->disk.major, sd->disk.minor, 0); + unsigned long long new_data_offset; + + if (sd->disk.state & (1<data_offset + min; + else { + if (data_offset < sd->data_offset + min) { + pr_err("--data-offset too small for %s\n", + dn); + goto release; + } + new_data_offset = data_offset; + } + } else if (delta_disks > 0) { + /* need space before */ + if (before < min) { + if (can_fallback) + goto fallback; + pr_err("Insufficient head-space for reshape on %s\n", + dn); + goto release; + } + if (data_offset == INVALID_SECTORS) + new_data_offset = sd->data_offset - min; + else { + if (data_offset > sd->data_offset - min) { + pr_err("--data-offset too large for %s\n", + 
dn); + goto release; + } + new_data_offset = data_offset; + } + } else { + if (dir == 0) { + /* can move up or down. If 'data_offset' + * was set we would have already decided, + * so just choose direction with most space. + */ + if (before > after) + dir = -1; + else + dir = 1; + } + sysfs_set_str(sra, NULL, "reshape_direction", + dir == 1 ? "backwards" : "forwards"); + if (dir > 0) { + /* Increase data offset */ + if (after < min) { + if (can_fallback) + goto fallback; + pr_err("Insufficient tail-space for reshape on %s\n", + dn); + goto release; + } + if (data_offset != INVALID_SECTORS && + data_offset < sd->data_offset + min) { + pr_err("--data-offset too small on %s\n", + dn); + goto release; + } + if (data_offset != INVALID_SECTORS) + new_data_offset = data_offset; + else + new_data_offset = choose_offset(sd->data_offset, + sd->data_offset + after, + sd->data_offset + min, + sd->data_offset + after); + } else { + /* Decrease data offset */ + if (before < min) { + if (can_fallback) + goto fallback; + pr_err("insufficient head-room on %s\n", + dn); + goto release; + } + if (data_offset != INVALID_SECTORS && + data_offset < sd->data_offset - min) { + pr_err("--data-offset too small on %s\n", + dn); + goto release; + } + if (data_offset != INVALID_SECTORS) + new_data_offset = data_offset; + else + new_data_offset = choose_offset(sd->data_offset - before, + sd->data_offset, + sd->data_offset - before, + sd->data_offset - min); + } + } + err = sysfs_set_num(sra, sd, "new_offset", new_data_offset); + if (err < 0 && errno == E2BIG) { + /* try again after increasing data size to max */ + err = sysfs_set_num(sra, sd, "size", 0); + if (err < 0 && errno == EINVAL && + !(sd->disk.state & (1<component_size + after)/2); + } + err = sysfs_set_num(sra, sd, "new_offset", + new_data_offset); + } + if (err < 0) { + if (errno == E2BIG && data_offset != INVALID_SECTORS) { + pr_err("data-offset is too big for %s\n", + dn); + goto release; + } + if (sd == sra->devs && + (errno == 
ENOENT || errno == E2BIG)) + /* Early kernel, no 'new_offset' file, + * or kernel doesn't like us. + * For RAID5/6 this is not fatal + */ + return 1; + pr_err("Cannot set new_offset for %s\n", + dn); + break; + } + } + return err; +release: + return -1; +fallback: + /* Just use a backup file */ + return 1; +} + +static int raid10_reshape(char *container, int fd, char *devname, + struct supertype *st, struct mdinfo *info, + struct reshape *reshape, + unsigned long long data_offset, + int force, int verbose) +{ + /* Changing raid_disks, layout, chunksize or possibly + * just data_offset for a RAID10. + * We must always change data_offset. We change by at least + * ->min_offset_change which is the largest of the old and new + * chunk sizes. + * If raid_disks is increasing, then data_offset must decrease + * by at least this copy size. + * If raid_disks is unchanged, data_offset must increase or + * decrease by at least min_offset_change but preferably by much more. + * We choose half of the available space. + * If raid_disks is decreasing, data_offset must increase by + * at least min_offset_change. To allow of this, component_size + * must be decreased by the same amount. + * + * So we calculate the required minimum and direction, possibly + * reduce the component_size, then iterate through the devices + * and set the new_data_offset. + * If that all works, we set chunk_size, layout, raid_disks, and start + * 'reshape' + */ + struct mdinfo *sra; + unsigned long long min; + int err = 0; + + sra = sysfs_read(fd, NULL, + GET_COMPONENT|GET_DEVS|GET_OFFSET|GET_STATE|GET_CHUNK + ); + if (!sra) { + pr_err("%s: Cannot get array details from sysfs\n", + devname); + goto release; + } + min = reshape->min_offset_change; + + if (info->delta_disks) + sysfs_set_str(sra, NULL, "reshape_direction", + info->delta_disks < 0 ? 
"backwards" : "forwards"); + if (info->delta_disks < 0 && + info->space_after < min) { + int rv = sysfs_set_num(sra, NULL, "component_size", + (sra->component_size - + min)/2); + if (rv) { + pr_err("cannot reduce component size\n"); + goto release; + } + } + err = set_new_data_offset(sra, st, devname, info->delta_disks, data_offset, + min, 0); + if (err == 1) { + pr_err("Cannot set new_data_offset: RAID10 reshape not\n"); + cont_err("supported on this kernel\n"); + err = -1; + } + if (err < 0) + goto release; + + if (!err && sysfs_set_num(sra, NULL, "chunk_size", info->new_chunk) < 0) + err = errno; + if (!err && sysfs_set_num(sra, NULL, "layout", reshape->after.layout) < 0) + err = errno; + if (!err && sysfs_set_num(sra, NULL, "raid_disks", + info->array.raid_disks + info->delta_disks) < 0) + err = errno; + if (!err && sysfs_set_str(sra, NULL, "sync_action", "reshape") < 0) + err = errno; + if (err) { + pr_err("Cannot set array shape for %s\n", + devname); + if (err == EBUSY && + (info->array.state & (1<devs; sd; sd = sd->next) { + char *dn; + int dfd; + struct supertype *st2; + struct mdinfo info2; + + if (sd->disk.state & (1<disk.major, sd->disk.minor, 0); + dfd = dev_open(dn, O_RDONLY); + if (dfd < 0) + break; + st2 = dup_super(st); + if (st2->ss->load_super(st2,dfd, NULL)) { + close(dfd); + free(st2); + break; + } + close(dfd); + st2->ss->getinfo_super(st2, &info2, NULL); + st2->ss->free_super(st2); + free(st2); + if (first || + min_space_before > info2.space_before) + min_space_before = info2.space_before; + if (first || + min_space_after > info2.space_after) + min_space_after = info2.space_after; + first = 0; + } + if (sd == NULL && !first) { + info->space_after = min_space_after; + info->space_before = min_space_before; + } + sysfs_free(sra); +} + +static void update_cache_size(char *container, struct mdinfo *sra, + struct mdinfo *info, + int disks, unsigned long long blocks) +{ + /* Check that the internal stripe cache is + * large enough, or it won't 
work. + * It must hold at least 4 stripes of the larger + * chunk size + */ + unsigned long cache; + cache = max(info->array.chunk_size, info->new_chunk); + cache *= 4; /* 4 stripes minimum */ + cache /= 512; /* convert to sectors */ + /* make sure there is room for 'blocks' with a bit to spare */ + if (cache < 16 + blocks / disks) + cache = 16 + blocks / disks; + cache /= (4096/512); /* Convert from sectors to pages */ + + if (sra->cache_size < cache) + subarray_set_num(container, sra, "stripe_cache_size", + cache+1); +} + +static int impose_reshape(struct mdinfo *sra, + struct mdinfo *info, + struct supertype *st, + int fd, + int restart, + char *devname, char *container, + struct reshape *reshape) +{ + struct mdu_array_info_s array; + + sra->new_chunk = info->new_chunk; + + if (restart) { + /* for external metadata checkpoint saved by mdmon can be lost + * or missed /due to e.g. crash/. Check if md is not during + * restart farther than metadata points to. + * If so, this means metadata information is obsolete. 
+ */ + if (st->ss->external) + verify_reshape_position(info, reshape->level); + sra->reshape_progress = info->reshape_progress; + } else { + sra->reshape_progress = 0; + if (reshape->after.data_disks < reshape->before.data_disks) + /* start from the end of the new array */ + sra->reshape_progress = (sra->component_size + * reshape->after.data_disks); + } + + ioctl(fd, GET_ARRAY_INFO, &array); + if (info->array.chunk_size == info->new_chunk && + reshape->before.layout == reshape->after.layout && + st->ss->external == 0) { + /* use SET_ARRAY_INFO but only if reshape hasn't started */ + array.raid_disks = reshape->after.data_disks + reshape->parity; + if (!restart && + ioctl(fd, SET_ARRAY_INFO, &array) != 0) { + int err = errno; + + pr_err("Cannot set device shape for %s: %s\n", + devname, strerror(errno)); + + if (err == EBUSY && + (array.state & (1<new_chunk) < 0) + err = errno; + if (!err && sysfs_set_num(sra, NULL, "layout", + reshape->after.layout) < 0) + err = errno; + if (!err && subarray_set_num(container, sra, "raid_disks", + reshape->after.data_disks + + reshape->parity) < 0) + err = errno; + if (err) { + pr_err("Cannot set device shape for %s\n", + devname); + + if (err == EBUSY && + (array.state & (1<= 4 && array.level <= 6)) { + /* To convert to RAID0 we need to fail and + * remove any non-data devices. 
*/ + int found = 0; + int d; + int data_disks = array.raid_disks - 1; + if (array.level == 6) + data_disks -= 1; + if (array.level == 5 && + array.layout != ALGORITHM_PARITY_N) + return -1; + if (array.level == 6 && + array.layout != ALGORITHM_PARITY_N_6) + return -1; + sysfs_set_str(&info, NULL,"sync_action", "idle"); + /* First remove any spares so no recovery starts */ + for (d = 0, found = 0; + d < MAX_DISKS && found < array.nr_disks; + d++) { + mdu_disk_info_t disk; + disk.number = d; + if (ioctl(fd, GET_DISK_INFO, &disk) < 0) + continue; + if (disk.major == 0 && disk.minor == 0) + continue; + found++; + if ((disk.state & (1 << MD_DISK_ACTIVE)) + && disk.raid_disk < data_disks) + /* keep this */ + continue; + ioctl(fd, HOT_REMOVE_DISK, + makedev(disk.major, disk.minor)); + } + /* Now fail anything left */ + ioctl(fd, GET_ARRAY_INFO, &array); + for (d = 0, found = 0; + d < MAX_DISKS && found < array.nr_disks; + d++) { + int cnt; + mdu_disk_info_t disk; + disk.number = d; + if (ioctl(fd, GET_DISK_INFO, &disk) < 0) + continue; + if (disk.major == 0 && disk.minor == 0) + continue; + found++; + if ((disk.state & (1 << MD_DISK_ACTIVE)) + && disk.raid_disk < data_disks) + /* keep this */ + continue; + ioctl(fd, SET_DISK_FAULTY, + makedev(disk.major, disk.minor)); + cnt = 5; + while (ioctl(fd, HOT_REMOVE_DISK, + makedev(disk.major, disk.minor)) < 0 + && errno == EBUSY + && cnt--) { + usleep(10000); + } + } + } + c = map_num(pers, level); + if (c) { + int err = sysfs_set_str(&info, NULL, "level", c); + if (err) { + err = errno; + pr_err("%s: could not set level to %s\n", + devname, c); + if (err == EBUSY && + (array.state & (1<= 0) + pr_err("level of %s changed to %s\n", + devname, c); + } + return 0; +} + +int sigterm = 0; +static void catch_term(int sig) +{ + sigterm = 1; +} + +static int continue_via_systemd(char *devnm) +{ + int skipped, i, pid, status; + char pathbuf[1024]; + /* In a systemd/udev world, it is best to get systemd to + * run "mdadm --grow 
--continue" rather than running in the + * background. + */ + switch(fork()) { + case 0: + /* FIXME yuk. CLOSE_EXEC?? */ + skipped = 0; + for (i = 3; skipped < 20; i++) + if (close(i) < 0) + skipped++; + else + skipped = 0; + + /* Don't want to see error messages from + * systemctl. If the service doesn't exist, + * we fork ourselves. + */ + close(2); + open("/dev/null", O_WRONLY); + snprintf(pathbuf, sizeof(pathbuf), "mdadm-grow-continue@%s.service", + devnm); + status = execl("/usr/bin/systemctl", "systemctl", + "start", + pathbuf, NULL); + status = execl("/bin/systemctl", "systemctl", "start", + pathbuf, NULL); + exit(1); + case -1: /* Just do it ourselves. */ + break; + default: /* parent - good */ + pid = wait(&status); + if (pid >= 0 && status == 0) + return 1; + } + return 0; +} + +static int reshape_array(char *container, int fd, char *devname, + struct supertype *st, struct mdinfo *info, + int force, struct mddev_dev *devlist, + unsigned long long data_offset, + char *backup_file, int verbose, int forked, + int restart, int freeze_reshape) +{ + struct reshape reshape; + int spares_needed; + char *msg; + int orig_level = UnSet; + int odisks; + int delayed; + + struct mdu_array_info_s array; + char *c; + + struct mddev_dev *dv; + int added_disks; + + int *fdlist = NULL; + unsigned long long *offsets = NULL; + int d; + int nrdisks; + int err; + unsigned long blocks; + unsigned long long array_size; + int done; + struct mdinfo *sra = NULL; + char buf[20]; + + /* when reshaping a RAID0, the component_size might be zero. + * So try to fix that up. 
+ */ + if (ioctl(fd, GET_ARRAY_INFO, &array) != 0) { + dprintf("Cannot get array information.\n"); + goto release; + } + if (array.level == 0 && info->component_size == 0) { + get_dev_size(fd, NULL, &array_size); + info->component_size = array_size / array.raid_disks; + } + + if (array.level == 10) + /* Need space_after info */ + get_space_after(fd, st, info); + + if (info->reshape_active) { + int new_level = info->new_level; + info->new_level = UnSet; + if (info->delta_disks > 0) + info->array.raid_disks -= info->delta_disks; + msg = analyse_change(devname, info, &reshape); + info->new_level = new_level; + if (info->delta_disks > 0) + info->array.raid_disks += info->delta_disks; + if (!restart) + /* Make sure the array isn't read-only */ + ioctl(fd, RESTART_ARRAY_RW, 0); + } else + msg = analyse_change(devname, info, &reshape); + if (msg) { + /* if msg == "", error has already been printed */ + if (msg[0]) + pr_err("%s\n", msg); + goto release; + } + if (restart && + (reshape.level != info->array.level || + reshape.before.layout != info->array.layout || + reshape.before.data_disks + reshape.parity + != info->array.raid_disks - max(0, info->delta_disks))) { + pr_err("reshape info is not in native format - cannot continue.\n"); + goto release; + } + + if (st->ss->external && restart && (info->reshape_progress == 0) && + !((sysfs_get_str(info, NULL, "sync_action", buf, sizeof(buf)) > 0) && + (strncmp(buf, "reshape", 7) == 0))) { + /* When reshape is restarted from '0', very begin of array + * it is possible that for external metadata reshape and array + * configuration doesn't happen. + * Check if md has the same opinion, and reshape is restarted + * from 0. If so, this is regular reshape start after reshape + * switch in metadata to next array only. + */ + if ((verify_reshape_position(info, reshape.level) >= 0) && + (info->reshape_progress == 0)) + restart = 0; + } + if (restart) { + /* reshape already started. 
just skip to monitoring the reshape */ + if (reshape.backup_blocks == 0) + return 0; + if (restart & RESHAPE_NO_BACKUP) + return 0; + + /* Need 'sra' down at 'started:' */ + sra = sysfs_read(fd, NULL, + GET_COMPONENT|GET_DEVS|GET_OFFSET|GET_STATE|GET_CHUNK| + GET_CACHE); + if (!sra) { + pr_err("%s: Cannot get array details from sysfs\n", + devname); + goto release; + } + + if (!backup_file) + backup_file = locate_backup(sra->sys_name); + + goto started; + } + /* The container is frozen but the array may not be. + * So freeze the array so spares don't get put to the wrong use + * FIXME there should probably be a cleaner separation between + * freeze_array and freeze_container. + */ + sysfs_freeze_array(info); + /* Check we have enough spares to not be degraded */ + added_disks = 0; + for (dv = devlist; dv ; dv=dv->next) + added_disks++; + spares_needed = max(reshape.before.data_disks, + reshape.after.data_disks) + + reshape.parity - array.raid_disks; + + if (!force && + info->new_level > 1 && info->array.level > 1 && + spares_needed > info->array.spare_disks + added_disks) { + pr_err("Need %d spare%s to avoid degraded array, and only have %d.\n" + " Use --force to over-ride this check.\n", + spares_needed, + spares_needed == 1 ? "" : "s", + info->array.spare_disks + added_disks); + goto release; + } + /* Check we have enough spares to not fail */ + spares_needed = max(reshape.before.data_disks, + reshape.after.data_disks) + - array.raid_disks; + if ((info->new_level > 1 || info->new_level == 0) && + spares_needed > info->array.spare_disks +added_disks) { + pr_err("Need %d spare%s to create working array, and only have %d.\n", + spares_needed, + spares_needed == 1 ? 
"" : "s", + info->array.spare_disks + added_disks); + goto release; + } + + if (reshape.level != array.level) { + int err = impose_level(fd, reshape.level, devname, verbose); + if (err) + goto release; + info->new_layout = UnSet; /* after level change, + * layout is meaningless */ + orig_level = array.level; + sysfs_freeze_array(info); + + if (reshape.level > 0 && st->ss->external) { + /* make sure mdmon is aware of the new level */ + if (mdmon_running(container)) + flush_mdmon(container); + + if (!mdmon_running(container)) + start_mdmon(container); + ping_monitor(container); + if (mdmon_running(container) && + st->update_tail == NULL) + st->update_tail = &st->updates; + } + } + /* ->reshape_super might have chosen some spares from the + * container that it wants to be part of the new array. + * We can collect them with ->container_content and give + * them to the kernel. + */ + if (st->ss->reshape_super && st->ss->container_content) { + char *subarray = strchr(info->text_version+1, '/')+1; + struct mdinfo *info2 = + st->ss->container_content(st, subarray); + struct mdinfo *d; + + if (info2) { + sysfs_init(info2, fd, st->devnm); + /* When increasing number of devices, we need to set + * new raid_disks before adding these, or they might + * be rejected. + */ + if (reshape.backup_blocks && + reshape.after.data_disks > reshape.before.data_disks) + subarray_set_num(container, info2, "raid_disks", + reshape.after.data_disks + + reshape.parity); + for (d = info2->devs; d; d = d->next) { + if (d->disk.state == 0 && + d->disk.raid_disk >= 0) { + /* This is a spare that wants to + * be part of the array. + */ + add_disk(fd, st, info2, d); + } + } + sysfs_free(info2); + } + } + /* We might have been given some devices to add to the + * array. Now that the array has been changed to the right + * level and frozen, we can safely add them. 
+ */ + if (devlist) + Manage_subdevs(devname, fd, devlist, verbose, + 0,NULL, 0); + + if (reshape.backup_blocks == 0 && data_offset != INVALID_SECTORS) + reshape.backup_blocks = reshape.before.data_disks * info->array.chunk_size/512; + if (reshape.backup_blocks == 0) { + /* No restriping needed, but we might need to impose + * some more changes: layout, raid_disks, chunk_size + */ + /* read current array info */ + if (ioctl(fd, GET_ARRAY_INFO, &array) != 0) { + dprintf("Cannot get array information.\n"); + goto release; + } + /* compare current array info with new values and if + * it is different update them to new */ + if (info->new_layout != UnSet && + info->new_layout != array.layout) { + array.layout = info->new_layout; + if (ioctl(fd, SET_ARRAY_INFO, &array) != 0) { + pr_err("failed to set new layout\n"); + goto release; + } else if (verbose >= 0) + printf("layout for %s set to %d\n", + devname, array.layout); + } + if (info->delta_disks != UnSet && + info->delta_disks != 0 && + array.raid_disks != (info->array.raid_disks + info->delta_disks)) { + array.raid_disks += info->delta_disks; + if (ioctl(fd, SET_ARRAY_INFO, &array) != 0) { + pr_err("failed to set raid disks\n"); + goto release; + } else if (verbose >= 0) { + printf("raid_disks for %s set to %d\n", + devname, array.raid_disks); + } + } + if (info->new_chunk != 0 && + info->new_chunk != array.chunk_size) { + if (sysfs_set_num(info, NULL, + "chunk_size", info->new_chunk) != 0) { + pr_err("failed to set chunk size\n"); + goto release; + } else if (verbose >= 0) + /* Report the value just written: 'array' was + * read before the change and still holds the + * old chunk size. + */ + printf("chunk size for %s set to %d\n", + devname, info->new_chunk); + } + unfreeze(st); + return 0; + } + + /* + * There are three possibilities. + * 1/ The array will shrink. + * We need to ensure the reshape will pause before reaching + * the 'critical section'. We also need to fork and wait for + * that to happen. When it does we + * suspend/backup/complete/unfreeze + * + * 2/ The array will not change size.
+ * This requires that we keep a backup of a sliding window + * so that we can restore data after a crash. So we need + * to fork and monitor progress. + * In future we will allow the data_offset to change, so + * a sliding backup becomes unnecessary. + * + * 3/ The array will grow. This is relatively easy. + * However the kernel's restripe routines will cheerfully + * overwrite some early data before it is safe. So we + * need to make a backup of the early parts of the array + * and be ready to restore it if rebuild aborts very early. + * For externally managed metadata, we still need a forked + * child to monitor the reshape and suspend IO over the region + * that is being reshaped. + * + * We backup data by writing it to one spare, or to a + * file which was given on command line. + * + * In each case, we first make sure that storage is available + * for the required backup. + * Then we: + * - request the shape change. + * - fork to handle backup etc. + */ + /* Check that we can hold all the data */ + get_dev_size(fd, NULL, &array_size); + if (reshape.new_size < (array_size/512)) { + pr_err("this change will reduce the size of the array.\n" + " use --grow --array-size first to truncate array.\n" + " e.g. mdadm --grow %s --array-size %llu\n", + devname, reshape.new_size/2); + goto release; + } + + if (array.level == 10) { + /* Reshaping RAID10 does not require any data backup by + * user-space. Instead it requires that the data_offset + * is changed to avoid the need for backup. + * So this is handled very separately + */ + if (restart) + /* Nothing to do. 
*/ + return 0; + return raid10_reshape(container, fd, devname, st, info, + &reshape, data_offset, + force, verbose); + } + sra = sysfs_read(fd, NULL, + GET_COMPONENT|GET_DEVS|GET_OFFSET|GET_STATE|GET_CHUNK| + GET_CACHE); + if (!sra) { + pr_err("%s: Cannot get array details from sysfs\n", + devname); + goto release; + } + + if (!backup_file) + switch(set_new_data_offset(sra, st, devname, + reshape.after.data_disks - reshape.before.data_disks, + data_offset, + reshape.min_offset_change, 1)) { + case -1: + goto release; + case 0: + /* Updated data_offset, so it's easy now */ + update_cache_size(container, sra, info, + min(reshape.before.data_disks, + reshape.after.data_disks), + reshape.backup_blocks); + + /* Right, everything seems fine. Let's kick things off. + */ + sync_metadata(st); + + if (impose_reshape(sra, info, st, fd, restart, + devname, container, &reshape) < 0) + goto release; + if (sysfs_set_str(sra, NULL, "sync_action", "reshape") < 0) { + struct mdinfo *sd; + if (errno != EINVAL) { + pr_err("Failed to initiate reshape!\n"); + goto release; + } + /* revert data_offset and try the old way */ + for (sd = sra->devs; sd; sd = sd->next) { + sysfs_set_num(sra, sd, "new_offset", + sd->data_offset); + sysfs_set_str(sra, NULL, "reshape_direction", + "forwards"); + } + break; + } + if (info->new_level == reshape.level) + return 0; + /* need to adjust level when reshape completes */ + switch(fork()) { + case -1: /* ignore error, but don't wait */ + return 0; + default: /* parent */ + return 0; + case 0: + map_fork(); + break; + } + close(fd); + wait_reshape(sra); + fd = open_dev(sra->sys_name); + if (fd >= 0) + impose_level(fd, info->new_level, devname, verbose); + return 0; + case 1: /* Couldn't set data_offset, try the old way */ + if (data_offset != INVALID_SECTORS) { + pr_err("Cannot update data_offset on this array\n"); + goto release; + } + break; + } + +started: + /* Decide how many blocks (sectors) for a reshape + * unit. 
The number we have so far is just a minimum + */ + blocks = reshape.backup_blocks; + if (reshape.before.data_disks == + reshape.after.data_disks) { + /* Make 'blocks' bigger for better throughput, but + * not so big that we reject it below. + * Try for 16 megabytes + */ + while (blocks * 32 < sra->component_size && + blocks < 16*1024*2) + blocks *= 2; + } else + pr_err("Need to backup %luK of critical section..\n", blocks/2); + + if (blocks >= sra->component_size/2) { + pr_err("%s: Something wrong - reshape aborted\n", + devname); + goto release; + } + + /* Now we need to open all these devices so we can read/write. + */ + nrdisks = max(reshape.before.data_disks, + reshape.after.data_disks) + reshape.parity + + sra->array.spare_disks; + fdlist = xcalloc((1+nrdisks), sizeof(int)); + offsets = xcalloc((1+nrdisks), sizeof(offsets[0])); + + odisks = reshape.before.data_disks + reshape.parity; + d = reshape_prepare_fdlist(devname, sra, odisks, + nrdisks, blocks, backup_file, + fdlist, offsets); + if (d < odisks) { + goto release; + } + if ((st->ss->manage_reshape == NULL) || + (st->ss->recover_backup == NULL)) { + if (backup_file == NULL) { + if (reshape.after.data_disks <= + reshape.before.data_disks) { + pr_err("%s: Cannot grow - need backup-file\n", + devname); + pr_err(" Please provide one with \"--backup=...\"\n"); + goto release; + } else if (d == odisks) { + pr_err("%s: Cannot grow - need a spare or backup-file to backup critical section\n", devname); + goto release; + } + } else { + if (!reshape_open_backup_file(backup_file, fd, devname, + (signed)blocks, + fdlist+d, offsets+d, + sra->sys_name, + restart)) { + goto release; + } + d++; + } + } + + update_cache_size(container, sra, info, + min(reshape.before.data_disks, reshape.after.data_disks), + blocks); + + /* Right, everything seems fine. Let's kick things off. + * If only changing raid_disks, use ioctl, else use + * sysfs. 
+ */ + sync_metadata(st); + + if (impose_reshape(sra, info, st, fd, restart, + devname, container, &reshape) < 0) + goto release; + + err = start_reshape(sra, restart, reshape.before.data_disks, + reshape.after.data_disks); + if (err) { + pr_err("Cannot %s reshape for %s\n", + restart ? "continue" : "start", + devname); + goto release; + } + if (restart) + sysfs_set_str(sra, NULL, "array_state", "active"); + if (freeze_reshape) { + free(fdlist); + free(offsets); + /* Print the message while 'sra' is still valid; it is + * handed to sysfs_free() only after its last use. + */ + pr_err("Reshape has to be continued from location %llu when root filesystem has been mounted.\n", + sra->reshape_progress); + sysfs_free(sra); + return 1; + } + + if (!forked && !check_env("MDADM_NO_SYSTEMCTL")) + if (continue_via_systemd(container ?: sra->sys_name)) { + free(fdlist); + free(offsets); + sysfs_free(sra); + return 0; + } + + /* Now we just need to kick off the reshape and watch, while + * handling backups of the data... + * This is all done by a forked background process. + */ + switch(forked ? 0 : fork()) { + case -1: + pr_err("Cannot run child to monitor reshape: %s\n", + strerror(errno)); + abort_reshape(sra); + goto release; + default: + free(fdlist); + free(offsets); + sysfs_free(sra); + return 0; + case 0: + map_fork(); + break; + } + + /* If another array on the same devices is busy, the + * reshape will wait for them. This would mean that + * the first section that we suspend will stay suspended + * for a long time.
So check on that possibility + * by looking for "DELAYED" in /proc/mdstat, and if found, + * wait a while + */ + do { + struct mdstat_ent *mds, *m; + delayed = 0; + mds = mdstat_read(1, 0); + for (m = mds; m; m = m->next) + if (strcmp(m->devnm, sra->sys_name) == 0) { + if (m->resync && + m->percent == RESYNC_DELAYED) + delayed = 1; + if (m->resync == 0) + /* Haven't started the reshape thread + * yet, wait a bit + */ + delayed = 2; + break; + } + free_mdstat(mds); + if (delayed == 1 && get_linux_version() < 3007000) { + pr_err("Reshape is delayed, but cannot wait carefully with this kernel.\n" + " You might experience problems until other reshapes complete.\n"); + delayed = 0; + } + if (delayed) + mdstat_wait(30 - (delayed-1) * 25); + } while (delayed); + mdstat_close(); + close(fd); + if (check_env("MDADM_GROW_VERIFY")) + fd = open(devname, O_RDONLY | O_DIRECT); + else + fd = -1; + mlockall(MCL_FUTURE); + + signal(SIGTERM, catch_term); + + if (st->ss->external) { + /* metadata handler takes it from here */ + done = st->ss->manage_reshape( + fd, sra, &reshape, st, blocks, + fdlist, offsets, + d - odisks, fdlist+odisks, + offsets+odisks); + } else + done = child_monitor( + fd, sra, &reshape, st, blocks, + fdlist, offsets, + d - odisks, fdlist+odisks, + offsets+odisks); + + free(fdlist); + free(offsets); + + if (backup_file && done) { + char *bul; + bul = make_backup(sra->sys_name); + if (bul) { + char buf[1024]; + int l = readlink(bul, buf, sizeof(buf) - 1); + if (l > 0) { + buf[l]=0; + unlink(buf); + } + unlink(bul); + free(bul); + } + unlink(backup_file); + } + if (!done) { + abort_reshape(sra); + goto out; + } + + if (!st->ss->external && + !(reshape.before.data_disks != reshape.after.data_disks + && info->custom_array_size) && + info->new_level == reshape.level && + !forked) { + /* no need to wait for the reshape to finish as + * there is nothing more to do. 
+ */ + sysfs_free(sra); + exit(0); + } + wait_reshape(sra); + + if (st->ss->external) { + /* Re-load the metadata as much could have changed */ + int cfd = open_dev(st->container_devnm); + if (cfd >= 0) { + flush_mdmon(container); + st->ss->free_super(st); + st->ss->load_container(st, cfd, container); + close(cfd); + } + } + + /* set new array size if required customer_array_size is used + * by this metadata. + */ + if (reshape.before.data_disks != + reshape.after.data_disks && + info->custom_array_size) + set_array_size(st, info, info->text_version); + + if (info->new_level != reshape.level) { + if (fd < 0) + fd = open(devname, O_RDONLY); + impose_level(fd, info->new_level, devname, verbose); + close(fd); + if (info->new_level == 0) + st->update_tail = NULL; + } +out: + sysfs_free(sra); + if (forked) + return 0; + unfreeze(st); + exit(0); + +release: + free(fdlist); + free(offsets); + if (orig_level != UnSet && sra) { + c = map_num(pers, orig_level); + if (c && sysfs_set_str(sra, NULL, "level", c) == 0) + pr_err("aborting level change\n"); + } + sysfs_free(sra); + if (!forked) + unfreeze(st); + return 1; +} + +/* mdfd handle is passed to be closed in child process (after fork). 
+ */ +int reshape_container(char *container, char *devname, + int mdfd, + struct supertype *st, + struct mdinfo *info, + int force, + char *backup_file, int verbose, + int forked, int restart, int freeze_reshape) +{ + struct mdinfo *cc = NULL; + int rv = restart; + char last_devnm[32] = ""; + + /* component_size is not meaningful for a container, + * so pass '0' meaning 'no change' + */ + if (!restart && + reshape_super(st, 0, info->new_level, + info->new_layout, info->new_chunk, + info->array.raid_disks, info->delta_disks, + backup_file, devname, APPLY_METADATA_CHANGES, + verbose)) { + unfreeze(st); + return 1; + } + + sync_metadata(st); + + /* ping monitor to be sure that update is on disk + */ + ping_monitor(container); + + if (!forked && !freeze_reshape && !check_env("MDADM_NO_SYSTEMCTL")) + if (continue_via_systemd(container)) + return 0; + + switch (forked ? 0 : fork()) { + case -1: /* error */ + perror("Cannot fork to complete reshape\n"); + unfreeze(st); + return 1; + default: /* parent */ + if (!freeze_reshape) + printf("%s: multi-array reshape continues in background\n", Name); + return 0; + case 0: /* child */ + map_fork(); + break; + } + + /* close unused handle in child process + */ + if (mdfd > -1) + close(mdfd); + + while(1) { + /* For each member array with reshape_active, + * we need to perform the reshape. + * We pick the first array that needs reshaping and + * reshape it. reshape_array() will re-read the metadata + * so the next time through a different array should be + * ready for reshape. + * It is possible that the 'different' array will not + * be assembled yet. In that case we simple exit. + * When it is assembled, the mdadm which assembles it + * will take over the reshape. 
+ */ + struct mdinfo *content; + int fd; + struct mdstat_ent *mdstat; + char *adev; + int devid; + + sysfs_free(cc); + + cc = st->ss->container_content(st, NULL); + + for (content = cc; content ; content = content->next) { + char *subarray; + if (!content->reshape_active) + continue; + + subarray = strchr(content->text_version+1, '/')+1; + mdstat = mdstat_by_subdev(subarray, container); + if (!mdstat) + continue; + if (mdstat->active == 0) { + pr_err("Skipping inactive array %s.\n", + mdstat->devnm); + free_mdstat(mdstat); + mdstat = NULL; + continue; + } + break; + } + if (!content) + break; + + devid = devnm2devid(mdstat->devnm); + adev = map_dev(major(devid), minor(devid), 0); + if (!adev) + adev = content->text_version; + + fd = open_dev(mdstat->devnm); + if (fd < 0) { + pr_err("Device %s cannot be opened for reshape.\n", adev); + break; + } + + if (strcmp(last_devnm, mdstat->devnm) == 0) { + /* Do not allow for multiple reshape_array() calls for + * the same array. + * It can happen when reshape_array() returns without + * error, when reshape is not finished (wrong reshape + * starting/continuation conditions). Mdmon doesn't + * switch to next array in container and reentry + * conditions for the same array occur. + * This is possibly interim until the behaviour of + * reshape_array is resolved(). 
+ */ + printf("%s: Multiple reshape execution detected for device %s.\n", Name, adev); + close(fd); + break; + } + strcpy(last_devnm, mdstat->devnm); + + sysfs_init(content, fd, mdstat->devnm); + + if (mdmon_running(container)) + flush_mdmon(container); + + rv = reshape_array(container, fd, adev, st, + content, force, NULL, INVALID_SECTORS, + backup_file, verbose, 1, restart, + freeze_reshape); + close(fd); + + if (freeze_reshape) { + sysfs_free(cc); + exit(0); + } + + restart = 0; + if (rv) + break; + + if (mdmon_running(container)) + flush_mdmon(container); + } + if (!rv) + unfreeze(st); + sysfs_free(cc); + exit(0); +} + +/* + * We run a child process in the background which performs the following + * steps: + * - wait for resync to reach a certain point + * - suspend io to the following section + * - backup that section + * - allow resync to proceed further + * - resume io + * - discard the backup. + * + * When are combined in slightly different ways in the three cases. + * Grow: + * - suspend/backup/allow/wait/resume/discard + * Shrink: + * - allow/wait/suspend/backup/allow/wait/resume/discard + * same-size: + * - wait/resume/discard/suspend/backup/allow + * + * suspend/backup/allow always come together + * wait/resume/discard do too. + * For the same-size case we have two backups to improve flow. + * + */ + +int progress_reshape(struct mdinfo *info, struct reshape *reshape, + unsigned long long backup_point, + unsigned long long wait_point, + unsigned long long *suspend_point, + unsigned long long *reshape_completed, int *frozen) +{ + /* This function is called repeatedly by the reshape manager. + * It determines how much progress can safely be made and allows + * that progress. + * - 'info' identifies the array and particularly records in + * ->reshape_progress the metadata's knowledge of progress + * This is a sector offset from the start of the array + * of the next array block to be relocated. 
This number + * may increase from 0 or decrease from array_size, depending + * on the type of reshape that is happening. + * Note that in contrast, 'sync_completed' is a block count of the + * reshape so far. It gives the distance between the start point + * (head or tail of device) and the next place that data will be + * written. It always increases. + * - 'reshape' is the structure created by analyse_change + * - 'backup_point' shows how much the metadata manager has backed-up + * data. For reshapes with increasing progress, it is the next address + * to be backed up, previous addresses have been backed-up. For + * decreasing progress, it is the earliest address that has been + * backed up - later address are also backed up. + * So addresses between reshape_progress and backup_point are + * backed up providing those are in the 'correct' order. + * - 'wait_point' is an array address. When reshape_completed + * passes this point, progress_reshape should return. It might + * return earlier if it determines that ->reshape_progress needs + * to be updated or further backup is needed. + * - suspend_point is maintained by progress_reshape and the caller + * should not touch it except to initialise to zero. + * It is an array address and it only increases in 2.6.37 and earlier. + * This makes it difficult to handle reducing reshapes with + * external metadata. + * However: it is similar to backup_point in that it records the + * other end of a suspended region from reshape_progress. + * it is moved to extend the region that is safe to backup and/or + * reshape + * - reshape_completed is read from sysfs and returned. The caller + * should copy this into ->reshape_progress when it has reason to + * believe that the metadata knows this, and any backup outside this + * has been erased. 
+ * + * Return value is: + * 1 if more data from backup_point - but only as far as suspend_point, + * should be backed up + * 0 if things are progressing smoothly + * -1 if the reshape is finished because it is all done, + * -2 if the reshape is finished due to an error. + */ + + int advancing = (reshape->after.data_disks + >= reshape->before.data_disks); + unsigned long long need_backup; /* All data between start of array and + * here will at some point need to + * be backed up. + */ + unsigned long long read_offset, write_offset; + unsigned long long write_range; + unsigned long long max_progress, target, completed; + unsigned long long array_size = (info->component_size + * reshape->before.data_disks); + int fd; + char buf[20]; + + /* First, we unsuspend any region that is now known to be safe. + * If suspend_point is on the 'wrong' side of reshape_progress, then + * we don't have or need suspension at the moment. This is true for + * native metadata when we don't need to back-up. + */ + if (advancing) { + if (info->reshape_progress <= *suspend_point) + sysfs_set_num(info, NULL, "suspend_lo", + info->reshape_progress); + } else { + /* Note: this won't work in 2.6.37 and before. + * Something somewhere should make sure we don't need it! + */ + if (info->reshape_progress >= *suspend_point) + sysfs_set_num(info, NULL, "suspend_hi", + info->reshape_progress); + } + + /* Now work out how far it is safe to progress. + * If the read_offset for ->reshape_progress is less than + * 'blocks' beyond the write_offset, we can only progress as far + * as a backup. + * Otherwise we can progress until the write_offset for the new location + * reaches (within 'blocks' of) the read_offset at the current location. + * However that region must be suspended unless we are using native + * metadata. + * If we need to suspend more, we limit it to 128M per device, which is + * rather arbitrary and should be some time-based calculation. 
+ */ + read_offset = info->reshape_progress / reshape->before.data_disks; + write_offset = info->reshape_progress / reshape->after.data_disks; + write_range = info->new_chunk/512; + if (reshape->before.data_disks == reshape->after.data_disks) + need_backup = array_size; + else + need_backup = reshape->backup_blocks; + if (advancing) { + if (read_offset < write_offset + write_range) + max_progress = backup_point; + else + max_progress = + read_offset * + reshape->after.data_disks; + } else { + if (read_offset > write_offset - write_range) + /* Can only progress as far as has been backed up, + * which must be suspended */ + max_progress = backup_point; + else if (info->reshape_progress <= need_backup) + max_progress = backup_point; + else { + if (info->array.major_version >= 0) + /* Can progress until backup is needed */ + max_progress = need_backup; + else { + /* Can progress until metadata update is required */ + max_progress = + read_offset * + reshape->after.data_disks; + /* but data must be suspended */ + if (max_progress < *suspend_point) + max_progress = *suspend_point; + } + } + } + + /* We know it is safe to progress to 'max_progress' providing + * it is suspended or we are using native metadata. + * Consider extending suspend_point 128M per device if it + * is less than 64M per device beyond reshape_progress. + * But always do a multiple of 'blocks' + * FIXME this is too big - it takes to long to complete + * this much. + */ + target = 64*1024*2 * min(reshape->before.data_disks, + reshape->after.data_disks); + target /= reshape->backup_blocks; + if (target < 2) + target = 2; + target *= reshape->backup_blocks; + + /* For externally managed metadata we always need to suspend IO to + * the area being reshaped so we regularly push suspend_point forward. + * For native metadata we only need the suspend if we are going to do + * a backup. 
+ */ + if (advancing) { + if ((need_backup > info->reshape_progress + || info->array.major_version < 0) && + *suspend_point < info->reshape_progress + target) { + if (need_backup < *suspend_point + 2 * target) + *suspend_point = need_backup; + else if (*suspend_point + 2 * target < array_size) + *suspend_point += 2 * target; + else + *suspend_point = array_size; + sysfs_set_num(info, NULL, "suspend_hi", *suspend_point); + if (max_progress > *suspend_point) + max_progress = *suspend_point; + } + } else { + if (info->array.major_version >= 0) { + /* Only need to suspend when about to backup */ + if (info->reshape_progress < need_backup * 2 && + *suspend_point > 0) { + *suspend_point = 0; + sysfs_set_num(info, NULL, "suspend_lo", 0); + sysfs_set_num(info, NULL, "suspend_hi", need_backup); + } + } else { + /* Need to suspend continually */ + if (info->reshape_progress < *suspend_point) + *suspend_point = info->reshape_progress; + if (*suspend_point + target < info->reshape_progress) + /* No need to move suspend region yet */; + else { + if (*suspend_point >= 2 * target) + *suspend_point -= 2 * target; + else + *suspend_point = 0; + sysfs_set_num(info, NULL, "suspend_lo", + *suspend_point); + } + if (max_progress < *suspend_point) + max_progress = *suspend_point; + } + } + + /* now set sync_max to allow that progress. sync_max, like + * sync_completed is a count of sectors written per device, so + * we find the difference between max_progress and the start point, + * and divide that by after.data_disks to get a sync_max + * number. + * At the same time we convert wait_point to a similar number + * for comparing against sync_completed. 
+ */ + /* scale down max_progress to per_disk */ + max_progress /= reshape->after.data_disks; + /* Round to chunk size as some kernels give an erroneously high number */ + max_progress /= info->new_chunk/512; + max_progress *= info->new_chunk/512; + /* And round to old chunk size as the kernel wants that */ + max_progress /= info->array.chunk_size/512; + max_progress *= info->array.chunk_size/512; + /* Limit progress to the whole device */ + if (max_progress > info->component_size) + max_progress = info->component_size; + wait_point /= reshape->after.data_disks; + if (!advancing) { + /* switch from 'device offset' to 'processed block count' */ + max_progress = info->component_size - max_progress; + wait_point = info->component_size - wait_point; + } + + if (!*frozen) + sysfs_set_num(info, NULL, "sync_max", max_progress); + + /* Now wait. If we have already reached the point that we were + * asked to wait to, don't wait at all, else wait for any change. + * We need to select on 'sync_completed' as that is the place that + * notifications happen, but we are really interested in + * 'reshape_position' + */ + fd = sysfs_get_fd(info, NULL, "sync_completed"); + if (fd < 0) + goto check_progress; + + if (sysfs_fd_get_ll(fd, &completed) < 0) + goto check_progress; + + while (completed < max_progress && completed < wait_point) { + /* Check that sync_action is still 'reshape' to avoid + * waiting forever on a dead array + */ + char action[20]; + if (sysfs_get_str(info, NULL, "sync_action", + action, 20) <= 0 || + strncmp(action, "reshape", 7) != 0) + break; + /* Some kernels reset 'sync_completed' to zero + * before setting 'sync_action' to 'idle'. + * So we need these extra tests. 
+ */ + /* NOTE(review): the loop has already broken out above unless + * sync_action starts with "reshape", so the two "idle" tests + * below appear to be unreachable - confirm. */ if (completed == 0 && advancing + && strncmp(action, "idle", 4) == 0 + && info->reshape_progress > 0) + break; + if (completed == 0 && !advancing + && strncmp(action, "idle", 4) == 0 + && info->reshape_progress < (info->component_size + * reshape->after.data_disks)) + break; + sysfs_wait(fd, NULL); + if (sysfs_fd_get_ll(fd, &completed) < 0) + goto check_progress; + } + /* Some kernels reset 'sync_completed' to zero, but + * we need the real point that md has reached. + * So in that case, read 'reshape_position' from sysfs. + */ + if (completed == 0) { + unsigned long long reshapep; + char action[20]; + if (sysfs_get_str(info, NULL, "sync_action", + action, 20) > 0 && + strncmp(action, "idle", 4) == 0 && + sysfs_get_ll(info, NULL, + "reshape_position", &reshapep) == 0) + *reshape_completed = reshapep; + } else { + /* some kernels can give an incorrectly high + * 'completed' number, so round down to a multiple + * of the new chunk size (in sectors) */ + completed /= (info->new_chunk/512); + completed *= (info->new_chunk/512); + /* Convert 'completed' back into a 'progress' number */ + completed *= reshape->after.data_disks; + if (!advancing) + completed = (info->component_size + * reshape->after.data_disks + - completed); + *reshape_completed = completed; + } + + close(fd); + + /* We return the need_backup flag. Caller will decide + * how much - a multiple of ->backup_blocks up to *suspend_point + */ + if (advancing) + return need_backup > info->reshape_progress; + else + return need_backup >= info->reshape_progress; + +check_progress: + /* if we couldn't read a number from sync_completed, then + * either the reshape did complete, or it aborted. + * We can tell which by checking for 'none' in reshape_position. + * If it did abort, then it might immediately restart if + * it was just a device failure that leaves us degraded but + * functioning. + */ + if (sysfs_get_str(info, NULL, "reshape_position", buf, sizeof(buf)) < 0 + || strncmp(buf, "none", 4) != 0) { + /* The abort might only be temporary.
Wait up to 10 + * seconds for fd to contain a valid number again. + */ + int wait = 10000; + int rv = -2; + unsigned long long new_sync_max; + while (fd >= 0 && rv < 0 && wait > 0) { + if (sysfs_wait(fd, &wait) != 1) + break; + switch (sysfs_fd_get_ll(fd, &completed)) { + case 0: + /* all good again */ + rv = 1; + /* If "sync_max" is no longer max_progress + * we need to freeze things + */ + sysfs_get_ll(info, NULL, "sync_max", &new_sync_max); + *frozen = (new_sync_max != max_progress); + break; + case -2: /* read error - abort */ + wait = 0; + break; + } + } + if (fd >= 0) + close(fd); + return rv; /* abort */ + } else { + /* Maybe racing with array shutdown - check state */ + if (fd >= 0) + close(fd); + if (sysfs_get_str(info, NULL, "array_state", buf, sizeof(buf)) < 0 + || strncmp(buf, "inactive", 8) == 0 + || strncmp(buf, "clear",5) == 0) + return -2; /* abort */ + return -1; /* complete */ + } +} + +/* FIXME return status is never checked */ +static int grow_backup(struct mdinfo *sra, + unsigned long long offset, /* per device */ + unsigned long stripes, /* per device, in old chunks */ + int *sources, unsigned long long *offsets, + int disks, int chunk, int level, int layout, + int dests, int *destfd, unsigned long long *destoffsets, + int part, int *degraded, + char *buf) +{ + /* Backup 'blocks' sectors at 'offset' on each device of the array, + * to storage 'destfd' (offset 'destoffsets'), after first + * suspending IO. Then allow resync to continue + * over the suspended section. + * Use part 'part' of the backup-super-block. 
+ */ + int odata = disks; + int rv = 0; + int i; + unsigned long long ll; + int new_degraded; + //printf("offset %llu\n", offset); + if (level >= 4) + odata--; + if (level == 6) + odata--; + + /* Check that array hasn't become degraded, else we might backup the wrong data */ + if (sysfs_get_ll(sra, NULL, "degraded", &ll) < 0) + return -1; /* FIXME this error is ignored */ + new_degraded = (int)ll; + if (new_degraded != *degraded) { + /* check each device to ensure it is still working */ + struct mdinfo *sd; + for (sd = sra->devs ; sd ; sd = sd->next) { + if (sd->disk.state & (1<disk.state & (1<disk.state = (1<disk.raid_disk >= 0 && + sources[sd->disk.raid_disk] >= 0) { + close(sources[sd->disk.raid_disk]); + sources[sd->disk.raid_disk] = -1; + } + } + } + } + *degraded = new_degraded; + } + if (part) { + bsb.arraystart2 = __cpu_to_le64(offset * odata); + bsb.length2 = __cpu_to_le64(stripes * (chunk/512) * odata); + } else { + bsb.arraystart = __cpu_to_le64(offset * odata); + bsb.length = __cpu_to_le64(stripes * (chunk/512) * odata); + } + if (part) + bsb.magic[15] = '2'; + for (i = 0; i < dests; i++) + if (part) + lseek64(destfd[i], destoffsets[i] + __le64_to_cpu(bsb.devstart2)*512, 0); + else + lseek64(destfd[i], destoffsets[i], 0); + + rv = save_stripes(sources, offsets, + disks, chunk, level, layout, + dests, destfd, + offset*512*odata, stripes * chunk * odata, + buf); + + if (rv) + return rv; + bsb.mtime = __cpu_to_le64(time(0)); + for (i = 0; i < dests; i++) { + bsb.devstart = __cpu_to_le64(destoffsets[i]/512); + + bsb.sb_csum = bsb_csum((char*)&bsb, ((char*)&bsb.sb_csum)-((char*)&bsb)); + if (memcmp(bsb.magic, "md_backup_data-2", 16) == 0) + bsb.sb_csum2 = bsb_csum((char*)&bsb, + ((char*)&bsb.sb_csum2)-((char*)&bsb)); + + rv = -1; + if ((unsigned long long)lseek64(destfd[i], destoffsets[i] - 4096, 0) + != destoffsets[i] - 4096) + break; + if (write(destfd[i], &bsb, 512) != 512) + break; + if (destoffsets[i] > 4096) { + if ((unsigned long 
long)lseek64(destfd[i], destoffsets[i]+stripes*chunk*odata, 0) != + destoffsets[i]+stripes*chunk*odata) + break; + if (write(destfd[i], &bsb, 512) != 512) + break; + } + fsync(destfd[i]); + rv = 0; + } + + return rv; +} + +/* in 2.6.30, the value reported by sync_completed can be + * less than it should be by one stripe. + * This only happens when reshape hits sync_max and pauses. + * So allow wait_backup to either extend sync_max further + * than strictly necessary, or return before the + * sync has got quite as far as we would really like. + * This is what 'blocks2' is for. + * The various callers give appropriate values so that + * everything works. NOTE(review): neither wait_backup nor 'blocks2' appears next to this comment; it may be stale - confirm. + */ +/* FIXME return value is often ignored */ +static int forget_backup(int dests, int *destfd, + unsigned long long *destoffsets, + int part) +{ + /* + * Erase backup 'part' (which is 0 or 1): zero that part's arraystart/length in bsb, then rewrite bsb 4096 bytes before destoffsets[i] on each of the 'dests' destinations. Returns 0, or -1 once any seek or write has failed. + */ + int i; + int rv; + + if (part) { + bsb.arraystart2 = __cpu_to_le64(0); + bsb.length2 = __cpu_to_le64(0); + } else { + bsb.arraystart = __cpu_to_le64(0); + bsb.length = __cpu_to_le64(0); + } + bsb.mtime = __cpu_to_le64(time(0)); + rv = 0; + for (i = 0; i < dests; i++) { + bsb.devstart = __cpu_to_le64(destoffsets[i]/512); + bsb.sb_csum = bsb_csum((char*)&bsb, ((char*)&bsb.sb_csum)-((char*)&bsb)); + if (memcmp(bsb.magic, "md_backup_data-2", 16) == 0) + bsb.sb_csum2 = bsb_csum((char*)&bsb, + ((char*)&bsb.sb_csum2)-((char*)&bsb)); + if ((unsigned long long)lseek64(destfd[i], destoffsets[i]-4096, 0) != + destoffsets[i]-4096) + rv = -1; + if (rv == 0 && + write(destfd[i], &bsb, 512) != 512) + rv = -1; + fsync(destfd[i]); + } + return rv; +} + +static void fail(char *msg) +{ + int rv; + rv = (write(2, msg, strlen(msg)) != (int)strlen(msg)); + rv |= (write(2, "\n", 1) != 1); + exit(rv ? 1 : 2); /* exit status is 1 if writing the message to fd 2 failed, otherwise 2 */ +} + +static char *abuf, *bbuf; +static unsigned long long abuflen; +static void validate(int afd, int bfd, unsigned long long offset) +{ + /* check that the data in the backup against the array. 
+ * This is only used for regression testing and should not + * be used while the array is active + */ + if (afd < 0) + return; + lseek64(bfd, offset - 4096, 0); + if (read(bfd, &bsb2, 512) != 512) + fail("cannot read bsb"); + if (bsb2.sb_csum != bsb_csum((char*)&bsb2, + ((char*)&bsb2.sb_csum)-((char*)&bsb2))) + fail("first csum bad"); + if (memcmp(bsb2.magic, "md_backup_data", 14) != 0) + fail("magic is bad"); + if (memcmp(bsb2.magic, "md_backup_data-2", 16) == 0 && + bsb2.sb_csum2 != bsb_csum((char*)&bsb2, + ((char*)&bsb2.sb_csum2)-((char*)&bsb2))) + fail("second csum bad"); + + if (__le64_to_cpu(bsb2.devstart)*512 != offset) + fail("devstart is wrong"); + + if (bsb2.length) { + unsigned long long len = __le64_to_cpu(bsb2.length)*512; + + if (abuflen < len) { + free(abuf); + free(bbuf); + abuflen = len; + if (posix_memalign((void**)&abuf, 4096, abuflen) || + posix_memalign((void**)&bbuf, 4096, abuflen)) { + abuflen = 0; + /* just stop validating on mem-alloc failure */ + return; + } + } + + lseek64(bfd, offset, 0); + if ((unsigned long long)read(bfd, bbuf, len) != len) { + //printf("len %llu\n", len); + fail("read first backup failed"); + } + lseek64(afd, __le64_to_cpu(bsb2.arraystart)*512, 0); + if ((unsigned long long)read(afd, abuf, len) != len) + fail("read first from array failed"); + if (memcmp(bbuf, abuf, len) != 0) { +#if 0 + int i; + printf("offset=%llu len=%llu\n", + (unsigned long long)__le64_to_cpu(bsb2.arraystart)*512, len); + for (i=0; iafter.data_disks >= reshape->before.data_disks; + int part = 0; /* The next part of the backup area to fill. It may already + * be full, so we need to check */ + int level = reshape->level; + int layout = reshape->before.layout; + int data = reshape->before.data_disks; + int disks = reshape->before.data_disks + reshape->parity; + int chunk = sra->array.chunk_size; + struct mdinfo *sd; + unsigned long stripes; + int uuid[4]; + int frozen = 0; + + /* set up the backup-super-block. 
This requires the + * uuid from the array. + */ + /* Find a superblock */ + for (sd = sra->devs; sd; sd = sd->next) { + char *dn; + int devfd; + int ok; + if (sd->disk.state & (1<disk.major, sd->disk.minor, 1); + devfd = dev_open(dn, O_RDONLY); + if (devfd < 0) + continue; + ok = st->ss->load_super(st, devfd, NULL); + close(devfd); + if (ok == 0) + break; + } + if (!sd) { + pr_err("Cannot find a superblock\n"); + return 0; + } + + memset(&bsb, 0, 512); + memcpy(bsb.magic, "md_backup_data-1", 16); + st->ss->uuid_from_super(st, uuid); + memcpy(bsb.set_uuid, uuid, 16); + bsb.mtime = __cpu_to_le64(time(0)); + bsb.devstart2 = blocks; + + stripes = blocks / (sra->array.chunk_size/512) / + reshape->before.data_disks; + + if (posix_memalign((void**)&buf, 4096, disks * chunk)) + /* Don't start the 'reshape' */ + return 0; + if (reshape->before.data_disks == reshape->after.data_disks) { + sysfs_get_ll(sra, NULL, "sync_speed_min", &speed); + sysfs_set_num(sra, NULL, "sync_speed_min", 200000); + } + + if (increasing) { + array_size = sra->component_size * reshape->after.data_disks; + backup_point = sra->reshape_progress; + suspend_point = 0; + } else { + array_size = sra->component_size * reshape->before.data_disks; + backup_point = reshape->backup_blocks; + suspend_point = array_size; + } + + while (!done) { + int rv; + + /* Want to return as soon the oldest backup slot can + * be released as that allows us to start backing up + * some more, providing suspend_point has been + * advanced, which it should have. 
+ */ + if (increasing) { + wait_point = array_size; + if (part == 0 && __le64_to_cpu(bsb.length) > 0) + wait_point = (__le64_to_cpu(bsb.arraystart) + + __le64_to_cpu(bsb.length)); + if (part == 1 && __le64_to_cpu(bsb.length2) > 0) + wait_point = (__le64_to_cpu(bsb.arraystart2) + + __le64_to_cpu(bsb.length2)); + } else { + wait_point = 0; + if (part == 0 && __le64_to_cpu(bsb.length) > 0) + wait_point = __le64_to_cpu(bsb.arraystart); + if (part == 1 && __le64_to_cpu(bsb.length2) > 0) + wait_point = __le64_to_cpu(bsb.arraystart2); + } + + reshape_completed = sra->reshape_progress; + rv = progress_reshape(sra, reshape, + backup_point, wait_point, + &suspend_point, &reshape_completed, + &frozen); + /* external metadata would need to ping_monitor here */ + sra->reshape_progress = reshape_completed; + + /* Clear any backup region that is before 'here' */ + if (increasing) { + if (__le64_to_cpu(bsb.length) > 0 && + reshape_completed >= (__le64_to_cpu(bsb.arraystart) + + __le64_to_cpu(bsb.length))) + forget_backup(dests, destfd, + destoffsets, 0); + if (__le64_to_cpu(bsb.length2) > 0 && + reshape_completed >= (__le64_to_cpu(bsb.arraystart2) + + __le64_to_cpu(bsb.length2))) + forget_backup(dests, destfd, + destoffsets, 1); + } else { + if (__le64_to_cpu(bsb.length) > 0 && + reshape_completed <= (__le64_to_cpu(bsb.arraystart))) + forget_backup(dests, destfd, + destoffsets, 0); + if (__le64_to_cpu(bsb.length2) > 0 && + reshape_completed <= (__le64_to_cpu(bsb.arraystart2))) + forget_backup(dests, destfd, + destoffsets, 1); + } + if (sigterm) + rv = -2; + if (rv < 0) { + if (rv == -1) + done = 1; + break; + } + if (rv == 0 && increasing && !st->ss->external) { + /* No longer need to monitor this reshape */ + sysfs_set_str(sra, NULL, "sync_max", "max"); + done = 1; + break; + } + + while (rv) { + unsigned long long offset; + unsigned long actual_stripes; + /* Need to backup some data. 
+ * If 'part' is not used and the desired + * backup size is suspended, do a backup, + * then consider the next part. + */ + /* Check that 'part' is unused */ + if (part == 0 && __le64_to_cpu(bsb.length) != 0) + break; + if (part == 1 && __le64_to_cpu(bsb.length2) != 0) + break; + + offset = backup_point / data; + actual_stripes = stripes; + if (increasing) { + if (offset + actual_stripes * (chunk/512) > + sra->component_size) + actual_stripes = ((sra->component_size - offset) + / (chunk/512)); + if (offset + actual_stripes * (chunk/512) > + suspend_point/data) + break; + } else { + if (offset < actual_stripes * (chunk/512)) + actual_stripes = offset / (chunk/512); + offset -= actual_stripes * (chunk/512); + if (offset < suspend_point/data) + break; + } + if (actual_stripes == 0) + break; + grow_backup(sra, offset, actual_stripes, + fds, offsets, + disks, chunk, level, layout, + dests, destfd, destoffsets, + part, °raded, buf); + validate(afd, destfd[0], destoffsets[0]); + /* switch to the other backup slot and move backup_point past the range just handed to grow_backup (whose return value is ignored here) */ + part = !part; + if (increasing) + backup_point += actual_stripes * (chunk/512) * data; + else + backup_point -= actual_stripes * (chunk/512) * data; + } + } + + /* FIXME maybe call progress_reshape one more time instead */ + /* remove any remaining suspension */ + sysfs_set_num(sra, NULL, "suspend_lo", 0x7FFFFFFFFFFFFFFFULL); + sysfs_set_num(sra, NULL, "suspend_hi", 0); + sysfs_set_num(sra, NULL, "suspend_lo", 0); + sysfs_set_num(sra, NULL, "sync_min", 0); + + if (reshape->before.data_disks == reshape->after.data_disks) + sysfs_set_num(sra, NULL, "sync_speed_min", speed); + free(buf); + return done; +} + +/* + * If any spare contains md_backup_data-1 or md_backup_data-2 backup metadata which is recent wrt mtime, + * write that data into the array and update the super blocks with + * the new reshape_progress. Returns 0 on success or when no backup was needed, 1 on failure. + */ +int Grow_restart(struct supertype *st, struct mdinfo *info, int *fdlist, int cnt, + char *backup_file, int verbose) +{ + int i, j; + int old_disks; + unsigned long long *offsets; + 
unsigned long long nstripe, ostripe; + int ndata, odata; + + odata = info->array.raid_disks - info->delta_disks - 1; + if (info->array.level == 6) odata--; /* number of data disks */ + ndata = info->array.raid_disks - 1; + if (info->new_level == 6) ndata--; + + old_disks = info->array.raid_disks - info->delta_disks; + + if (info->delta_disks <= 0) + /* Didn't grow, so the backup file must have + * been used + */ + old_disks = cnt; + for (i=old_disks-(backup_file?1:0); iss->load_super(st, fd, NULL)) + continue; + + st->ss->getinfo_super(st, &dinfo, NULL); + st->ss->free_super(st); + + if (lseek64(fd, + (dinfo.data_offset + dinfo.component_size - 8) <<9, + 0) < 0) { + pr_err("Cannot seek on device %d\n", i); + continue; /* Cannot seek */ + } + sprintf(namebuf, "device-%d", i); + devname = namebuf; + } + if (read(fd, &bsb, sizeof(bsb)) != sizeof(bsb)) { + if (verbose) + pr_err("Cannot read from %s\n", devname); + continue; /* Cannot read */ + } + if (memcmp(bsb.magic, "md_backup_data-1", 16) != 0 && + memcmp(bsb.magic, "md_backup_data-2", 16) != 0) { + if (verbose) + pr_err("No backup metadata on %s\n", devname); + continue; + } + if (bsb.sb_csum != bsb_csum((char*)&bsb, ((char*)&bsb.sb_csum)-((char*)&bsb))) { + if (verbose) + pr_err("Bad backup-metadata checksum on %s\n", devname); + continue; /* bad checksum */ + } + if (memcmp(bsb.magic, "md_backup_data-2", 16) == 0 && + bsb.sb_csum2 != bsb_csum((char*)&bsb, ((char*)&bsb.sb_csum2)-((char*)&bsb))) { + if (verbose) + pr_err("Bad backup-metadata checksum2 on %s\n", devname); + continue; /* Bad second checksum */ + } + if (memcmp(bsb.set_uuid,info->uuid, 16) != 0) { + if (verbose) + pr_err("Wrong uuid on backup-metadata on %s\n", devname); + continue; /* Wrong uuid */ + } + + /* array utime and backup-mtime should be updated at much the same time, but it seems that + * sometimes they aren't... So allow considerable flexability in matching, and allow + * this test to be overridden by an environment variable. 
+ */ + if(time_after(info->array.utime, (unsigned int)__le64_to_cpu(bsb.mtime) + 2*60*60) || + time_before(info->array.utime, (unsigned int)__le64_to_cpu(bsb.mtime) - 10*60)) { + if (check_env("MDADM_GROW_ALLOW_OLD")) { + pr_err("accepting backup with timestamp %lu for array with timestamp %lu\n", + (unsigned long)__le64_to_cpu(bsb.mtime), + (unsigned long)info->array.utime); + } else { + pr_err("too-old timestamp on backup-metadata on %s\n", devname); + pr_err("If you think it is should be safe, try 'export MDADM_GROW_ALLOW_OLD=1'\n"); + continue; /* time stamp is too bad */ + } + } + + if (bsb.magic[15] == '1') { + if (bsb.length == 0) + continue; + if (info->delta_disks >= 0) { + /* reshape_progress is increasing */ + if (__le64_to_cpu(bsb.arraystart) + + __le64_to_cpu(bsb.length) + < info->reshape_progress) { + nonew: + if (verbose) + pr_err("backup-metadata found on %s but is not needed\n", devname); + continue; /* No new data here */ + } + } else { + /* reshape_progress is decreasing */ + if (__le64_to_cpu(bsb.arraystart) >= + info->reshape_progress) + goto nonew; /* No new data here */ + } + } else { + if (bsb.length == 0 && bsb.length2 == 0) + continue; + if (info->delta_disks >= 0) { + /* reshape_progress is increasing */ + if ((__le64_to_cpu(bsb.arraystart) + + __le64_to_cpu(bsb.length) + < info->reshape_progress) + && + (__le64_to_cpu(bsb.arraystart2) + + __le64_to_cpu(bsb.length2) + < info->reshape_progress)) + goto nonew; /* No new data here */ + } else { + /* reshape_progress is decreasing */ + if (__le64_to_cpu(bsb.arraystart) >= + info->reshape_progress && + __le64_to_cpu(bsb.arraystart2) >= + info->reshape_progress) + goto nonew; /* No new data here */ + } + } + if (lseek64(fd, __le64_to_cpu(bsb.devstart)*512, 0)< 0) { + second_fail: + if (verbose) + pr_err("Failed to verify secondary backup-metadata block on %s\n", + devname); + continue; /* Cannot seek */ + } + /* There should be a duplicate backup superblock 4k before here */ + if (lseek64(fd, 
-4096, 1) < 0 || + read(fd, &bsb2, sizeof(bsb2)) != sizeof(bsb2)) + goto second_fail; /* Cannot find leading superblock */ + if (bsb.magic[15] == '1') + bsbsize = offsetof(struct mdp_backup_super, pad1); + else + bsbsize = offsetof(struct mdp_backup_super, pad); + if (memcmp(&bsb2, &bsb, bsbsize) != 0) + goto second_fail; /* Cannot find leading superblock */ + + /* Now need the data offsets for all devices. */ + offsets = xmalloc(sizeof(*offsets)*info->array.raid_disks); + for(j=0; jarray.raid_disks; j++) { + if (fdlist[j] < 0) + continue; + if (st->ss->load_super(st, fdlist[j], NULL)) + /* FIXME should be this be an error */ + continue; + st->ss->getinfo_super(st, &dinfo, NULL); + st->ss->free_super(st); + offsets[j] = dinfo.data_offset * 512; + } + printf("%s: restoring critical section\n", Name); + + if (restore_stripes(fdlist, offsets, + info->array.raid_disks, + info->new_chunk, + info->new_level, + info->new_layout, + fd, __le64_to_cpu(bsb.devstart)*512, + __le64_to_cpu(bsb.arraystart)*512, + __le64_to_cpu(bsb.length)*512, NULL)) { + /* didn't succeed, so giveup */ + if (verbose) + pr_err("Error restoring backup from %s\n", + devname); + free(offsets); + return 1; + } + + if (bsb.magic[15] == '2' && + restore_stripes(fdlist, offsets, + info->array.raid_disks, + info->new_chunk, + info->new_level, + info->new_layout, + fd, __le64_to_cpu(bsb.devstart)*512 + + __le64_to_cpu(bsb.devstart2)*512, + __le64_to_cpu(bsb.arraystart2)*512, + __le64_to_cpu(bsb.length2)*512, NULL)) { + /* didn't succeed, so giveup */ + if (verbose) + pr_err("Error restoring second backup from %s\n", + devname); + free(offsets); + return 1; + } + + free(offsets); + + /* Ok, so the data is restored. Let's update those superblocks. 
*/ + + lo = hi = 0; + if (bsb.length) { + lo = __le64_to_cpu(bsb.arraystart); + hi = lo + __le64_to_cpu(bsb.length); + } + if (bsb.magic[15] == '2' && bsb.length2) { + unsigned long long lo1, hi1; + lo1 = __le64_to_cpu(bsb.arraystart2); + hi1 = lo1 + __le64_to_cpu(bsb.length2); + if (lo == hi) { + lo = lo1; + hi = hi1; + } else if (lo < lo1) + hi = hi1; + else + lo = lo1; + } + if (lo < hi && + (info->reshape_progress < lo || + info->reshape_progress > hi)) + /* backup does not affect reshape_progress*/ ; + else if (info->delta_disks >= 0) { + info->reshape_progress = __le64_to_cpu(bsb.arraystart) + + __le64_to_cpu(bsb.length); + if (bsb.magic[15] == '2') { + unsigned long long p2 = __le64_to_cpu(bsb.arraystart2) + + __le64_to_cpu(bsb.length2); + if (p2 > info->reshape_progress) + info->reshape_progress = p2; + } + } else { + info->reshape_progress = __le64_to_cpu(bsb.arraystart); + if (bsb.magic[15] == '2') { + unsigned long long p2 = __le64_to_cpu(bsb.arraystart2); + if (p2 < info->reshape_progress) + info->reshape_progress = p2; + } + } + for (j=0; jarray.raid_disks; j++) { + if (fdlist[j] < 0) + continue; + if (st->ss->load_super(st, fdlist[j], NULL)) + continue; + st->ss->getinfo_super(st, &dinfo, NULL); + dinfo.reshape_progress = info->reshape_progress; + st->ss->update_super(st, &dinfo, + "_reshape_progress", + NULL,0, 0, NULL); + st->ss->store_super(st, fdlist[j]); + st->ss->free_super(st); + } + return 0; + } + /* Didn't find any backup data, try to see if any + * was needed. + */ + if (info->delta_disks < 0) { + /* When shrinking, the critical section is at the end. + * So see if we are before the critical section. 
+ */ + unsigned long long first_block; + nstripe = ostripe = 0; + first_block = 0; + while (ostripe >= nstripe) { + ostripe += info->array.chunk_size / 512; + first_block = ostripe * odata; + nstripe = first_block / ndata / (info->new_chunk/512) * + (info->new_chunk/512); + } + + if (info->reshape_progress >= first_block) + return 0; + } + if (info->delta_disks > 0) { + /* See if we are beyond the critical section. */ + unsigned long long last_block; + nstripe = ostripe = 0; + last_block = 0; + while (nstripe >= ostripe) { + nstripe += info->new_chunk / 512; + last_block = nstripe * ndata; + ostripe = last_block / odata / (info->array.chunk_size/512) * + (info->array.chunk_size/512); + } + + if (info->reshape_progress >= last_block) + return 0; + } + /* needed to recover critical section! */ + if (verbose) + pr_err("Failed to find backup of critical section\n"); + return 1; +} + +int Grow_continue_command(char *devname, int fd, + char *backup_file, int verbose) +{ + int ret_val = 0; + struct supertype *st = NULL; + struct mdinfo *content = NULL; + struct mdinfo array; + char *subarray = NULL; + struct mdinfo *cc = NULL; + struct mdstat_ent *mdstat = NULL; + int cfd = -1; + int fd2 = -1; + + dprintf("Grow continue from command line called for %s\n", + devname); + + st = super_by_fd(fd, &subarray); + if (!st || !st->ss) { + pr_err("Unable to determine metadata format for %s\n", + devname); + return 1; + } + dprintf("Grow continue is run for "); + if (st->ss->external == 0) { + int d; + dprintf_cont("native array (%s)\n", devname); + if (ioctl(fd, GET_ARRAY_INFO, &array.array) < 0) { + pr_err("%s is not an active md array - aborting\n", devname); + ret_val = 1; + goto Grow_continue_command_exit; + } + content = &array; + /* Need to load a superblock. 
+ * FIXME we should really get what we need from + * sysfs + */ + for (d = 0; d < MAX_DISKS; d++) { + mdu_disk_info_t disk; + char *dv; + int err; + disk.number = d; + if (ioctl(fd, GET_DISK_INFO, &disk) < 0) + continue; + if (disk.major == 0 && disk.minor == 0) + continue; + if ((disk.state & (1 << MD_DISK_ACTIVE)) == 0) + continue; + dv = map_dev(disk.major, disk.minor, 1); + if (!dv) + continue; + fd2 = dev_open(dv, O_RDONLY); + if (fd2 < 0) + continue; + err = st->ss->load_super(st, fd2, NULL); + close(fd2); + /* invalidate fd2 to avoid possible double close() */ + fd2 = -1; + if (err) + continue; + break; + } + if (d == MAX_DISKS) { + pr_err("Unable to load metadata for %s\n", + devname); + ret_val = 1; + goto Grow_continue_command_exit; + } + st->ss->getinfo_super(st, content, NULL); + } else { + char *container; + + if (subarray) { + dprintf_cont("subarray (%s)\n", subarray); + container = st->container_devnm; + cfd = open_dev_excl(st->container_devnm); + } else { + container = st->devnm; + close(fd); + cfd = open_dev_excl(st->devnm); + dprintf_cont("container (%s)\n", container); + fd = cfd; + } + if (cfd < 0) { + pr_err("Unable to open container for %s\n", devname); + ret_val = 1; + goto Grow_continue_command_exit; + } + + /* find in container array under reshape + */ + ret_val = st->ss->load_container(st, cfd, NULL); + if (ret_val) { + pr_err("Cannot read superblock for %s\n", + devname); + ret_val = 1; + goto Grow_continue_command_exit; + } + + cc = st->ss->container_content(st, subarray); + for (content = cc; content ; content = content->next) { + char *array; + int allow_reshape = 1; + + if (content->reshape_active == 0) + continue; + /* The decision about array or container wide + * reshape is taken in Grow_continue based + * content->reshape_active state, therefore we + * need to check_reshape based on + * reshape_active and subarray name + */ + if (content->array.state & (1<reshape_active == CONTAINER_RESHAPE && + (content->array.state + & 
(1<text_version+1, '/')+1; + mdstat = mdstat_by_subdev(array, container); + if (!mdstat) + continue; + if (mdstat->active == 0) { + pr_err("Skipping inactive array %s.\n", + mdstat->devnm); + free_mdstat(mdstat); + mdstat = NULL; + continue; + } + break; + } + if (!content) { + pr_err("Unable to determine reshaped array for %s\n", devname); + ret_val = 1; + goto Grow_continue_command_exit; + } + fd2 = open_dev(mdstat->devnm); + if (fd2 < 0) { + pr_err("cannot open (%s)\n", mdstat->devnm); + ret_val = 1; + goto Grow_continue_command_exit; + } + + sysfs_init(content, fd2, mdstat->devnm); + + close(fd2); + fd2 = -1; + + /* start mdmon in case it is not running + */ + if (!mdmon_running(container)) + start_mdmon(container); + ping_monitor(container); + + if (mdmon_running(container)) + st->update_tail = &st->updates; + else { + pr_err("No mdmon found. Grow cannot continue.\n"); + ret_val = 1; + goto Grow_continue_command_exit; + } + } + + /* verify that array under reshape is started from + * correct position + */ + if (verify_reshape_position(content, content->array.level) < 0) { + ret_val = 1; + goto Grow_continue_command_exit; + } + + /* continue reshape + */ + ret_val = Grow_continue(fd, st, content, backup_file, 1, 0); + +Grow_continue_command_exit: + if (fd2 > -1) + close(fd2); + if (cfd > -1) + close(cfd); + st->ss->free_super(st); + free_mdstat(mdstat); + sysfs_free(cc); + free(subarray); + + return ret_val; +} + +int Grow_continue(int mdfd, struct supertype *st, struct mdinfo *info, + char *backup_file, int forked, int freeze_reshape) +{ + int ret_val = 2; + + if (!info->reshape_active) + return ret_val; + + if (st->ss->external) { + int cfd = open_dev(st->container_devnm); + + if (cfd < 0) + return 1; + + st->ss->load_container(st, cfd, st->container_devnm); + close(cfd); + ret_val = reshape_container(st->container_devnm, NULL, mdfd, + st, info, 0, backup_file, + 0, forked, + 1 | info->reshape_active, + freeze_reshape); + } else + ret_val = 
reshape_array(NULL, mdfd, "array", st, info, 1, + NULL, INVALID_SECTORS, + backup_file, 0, forked, + 1 | info->reshape_active, + freeze_reshape); + + return ret_val; +} + +char *make_backup(char *name) +{ + char *base = "backup_file-"; + int len; + char *fname; + + len = strlen(MAP_DIR) + 1 + strlen(base) + strlen(name)+1; + fname = xmalloc(len); + sprintf(fname, "%s/%s%s", MAP_DIR, base, name); + return fname; +} + +char *locate_backup(char *name) +{ + char *fl = make_backup(name); + struct stat stb; + + if (stat(fl, &stb) == 0 && + S_ISREG(stb.st_mode)) + return fl; + + free(fl); + return NULL; +} diff --git a/INSTALL b/INSTALL new file mode 100644 index 00000000..f7bcc3e6 --- /dev/null +++ b/INSTALL @@ -0,0 +1,13 @@ + +To build mdadm, simply run: + + make + +to install, run + + make install + +as root. + + +No configuration is necessary. diff --git a/Incremental.c b/Incremental.c new file mode 100644 index 00000000..24fd8276 --- /dev/null +++ b/Incremental.c @@ -0,0 +1,1808 @@ +/* + * Incremental.c - support --incremental. Part of: + * mdadm - manage Linux "md" devices aka RAID arrays. + * + * Copyright (C) 2006-2013 Neil Brown + * + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Author: Neil Brown + * Email: + * Paper: Neil Brown + * Novell Inc + * GPO Box Q1283 + * QVB Post Office, NSW 1230 + * Australia + */ + +#include "mdadm.h" +#include +#include +#include + +static int count_active(struct supertype *st, struct mdinfo *sra, + int mdfd, char **availp, + struct mdinfo *info); +static void find_reject(int mdfd, struct supertype *st, struct mdinfo *sra, + int number, __u64 events, int verbose, + char *array_name); +static int try_spare(char *devname, int *dfdp, struct dev_policy *pol, + struct map_ent *target, + struct supertype *st, int verbose); + +static int Incremental_container(struct supertype *st, char *devname, + struct context *c, char *only); + +int Incremental(struct mddev_dev *devlist, struct context *c, + struct supertype *st) +{ + /* Add this device to an array, creating the array if necessary + * and starting the array if sensible or - if runstop>0 - if possible. + * + * This has several steps: + * + * 1/ Check if device is permitted by mdadm.conf, reject if not. + * 2/ Find metadata, reject if none appropriate (check + * version/name from args) + * 3/ Check if there is a match in mdadm.conf + * 3a/ if not, check for homehost match. If no match, assemble as + * a 'foreign' array. + * 4/ Determine device number. + * - If in mdadm.conf with std name, use that + * - UUID in /var/run/mdadm.map use that + * - If name is suggestive, use that. unless in use with different uuid. + * - Choose a free, high number. + * - Use a partitioned device unless strong suggestion not to. + * e.g. auto=md + * Don't choose partitioned for containers. + * 5/ Find out if array already exists + * 5a/ if it does not + * - choose a name, from mdadm.conf or 'name' field in array. 
+ * - create the array + * - add the device + * 5b/ if it does + * - check one drive in array to make sure metadata is a reasonably + * close match. Reject if not (e.g. different type) + * - add the device + * 6/ Make sure /var/run/mdadm.map contains this array. + * 7/ Is there enough devices to possibly start the array? + * For a container, this means running Incremental_container. + * 7a/ if not, finish with success. + * 7b/ if yes, + * - read all metadata and arrange devices like -A does + * - if number of OK devices match expected, or -R and there are enough, + * start the array (auto-readonly). + */ + struct stat stb; + struct mdinfo info, dinfo; + struct mdinfo *sra = NULL, *d; + struct mddev_ident *match; + char chosen_name[1024]; + char *md_devname; + int rv = 1; + struct map_ent *mp, *map = NULL; + int dfd = -1, mdfd = -1; + char *avail = NULL; + int active_disks; + int trustworthy; + char *name_to_use; + mdu_array_info_t ainf; + struct dev_policy *policy = NULL; + struct map_ent target_array; + int have_target; + char *devname = devlist->devname; + int journal_device_missing = 0; + + struct createinfo *ci = conf_get_create_info(); + + if (stat(devname, &stb) < 0) { + if (c->verbose >= 0) + pr_err("stat failed for %s: %s.\n", + devname, strerror(errno)); + return rv; + } + if ((stb.st_mode & S_IFMT) != S_IFBLK) { + if (c->verbose >= 0) + pr_err("%s is not a block device.\n", + devname); + return rv; + } + dfd = dev_open(devname, O_RDONLY); + if (dfd < 0) { + if (c->verbose >= 0) + pr_err("cannot open %s: %s.\n", + devname, strerror(errno)); + return rv; + } + /* If the device is a container, we do something very different */ + if (must_be_container(dfd)) { + if (!st) + st = super_by_fd(dfd, NULL); + if (st && st->ss->load_container) + rv = st->ss->load_container(st, dfd, NULL); + + close(dfd); + if (!rv && st->ss->container_content) { + if (map_lock(&map)) + pr_err("failed to get exclusive lock on mapfile\n"); + if (c->export) + printf("MD_DEVNAME=%s\n", 
devname); + rv = Incremental_container(st, devname, c, NULL); + map_unlock(&map); + return rv; + } + + pr_err("%s is not part of an md array.\n", + devname); + return rv; + } + + /* 1/ Check if device is permitted by mdadm.conf */ + + for (;devlist; devlist = devlist->next) + if (conf_test_dev(devlist->devname)) + break; + if (!devlist) { + devlist = conf_get_devs(); + for (;devlist; devlist = devlist->next) { + struct stat st2; + if (stat(devlist->devname, &st2) == 0 && + (st2.st_mode & S_IFMT) == S_IFBLK && + st2.st_rdev == stb.st_rdev) + break; + } + } + if (!devlist) { + if (c->verbose >= 0) + pr_err("%s not permitted by mdadm.conf.\n", + devname); + goto out; + } + + /* 2/ Find metadata, reject if none appropriate (check + * version/name from args) */ + + if (fstat(dfd, &stb) < 0) { + if (c->verbose >= 0) + pr_err("fstat failed for %s: %s.\n", + devname, strerror(errno)); + goto out; + } + if ((stb.st_mode & S_IFMT) != S_IFBLK) { + if (c->verbose >= 0) + pr_err("%s is not a block device.\n", + devname); + goto out; + } + + dinfo.disk.major = major(stb.st_rdev); + dinfo.disk.minor = minor(stb.st_rdev); + + policy = disk_policy(&dinfo); + have_target = policy_check_path(&dinfo, &target_array); + + if (st == NULL && (st = guess_super_type(dfd, guess_array)) == NULL) { + if (c->verbose >= 0) + pr_err("no recognisable superblock on %s.\n", + devname); + rv = try_spare(devname, &dfd, policy, + have_target ? &target_array : NULL, + NULL, c->verbose); + goto out; + } + st->ignore_hw_compat = 0; + + if (st->ss->compare_super == NULL || + st->ss->load_super(st, dfd, c->verbose >= 0 ? devname : NULL)) { + if (c->verbose >= 0) + pr_err("no RAID superblock on %s.\n", + devname); + rv = try_spare(devname, &dfd, policy, + have_target ? 
&target_array : NULL, + st, c->verbose); + free(st); + goto out; + } + close (dfd); dfd = -1; + + st->ss->getinfo_super(st, &info, NULL); + + /* 3/ Check if there is a match in mdadm.conf */ + match = conf_match(st, &info, devname, c->verbose, &rv); + if (!match && rv == 2) + goto out; + + if (match && match->devname + && strcasecmp(match->devname, "") == 0) { + if (c->verbose >= 0) + pr_err("array containing %s is explicitly ignored by mdadm.conf\n", + devname); + goto out; + } + + /* 3a/ if not, check for homehost match. If no match, continue + * but don't trust the 'name' in the array. Thus a 'random' minor + * number will be assigned, and the device name will be based + * on that. */ + if (match) + trustworthy = LOCAL; + else if (st->ss->match_home(st, c->homehost) == 1) + trustworthy = LOCAL; + else if (st->ss->match_home(st, "any") == 1) + trustworthy = LOCAL_ANY; + else + trustworthy = FOREIGN; + + if (!match && !conf_test_metadata(st->ss->name, policy, + (trustworthy == LOCAL))) { + if (c->verbose >= 1) + pr_err("%s has metadata type %s for which auto-assembly is disabled\n", + devname, st->ss->name); + goto out; + } + if (trustworthy == LOCAL_ANY) + trustworthy = LOCAL; + + /* There are three possible sources for 'autof': command line, + * ARRAY line in mdadm.conf, or CREATE line in mdadm.conf. + * ARRAY takes precedence, then command line, then + * CREATE. + */ + if (match && match->autof) + c->autof = match->autof; + if (c->autof == 0) + c->autof = ci->autof; + + name_to_use = info.name; + if (name_to_use[0] == 0 && + info.array.level == LEVEL_CONTAINER) { + name_to_use = info.text_version; + trustworthy = METADATA; + } + if (name_to_use[0] && trustworthy != LOCAL && + ! 
c->require_homehost && + conf_name_is_free(name_to_use)) + trustworthy = LOCAL; + + /* strip "hostname:" prefix from name if we have decided + * to treat it as LOCAL + */ + if (trustworthy == LOCAL && strchr(name_to_use, ':') != NULL) + name_to_use = strchr(name_to_use, ':')+1; + + /* 4/ Check if array exists. + */ + if (map_lock(&map)) + pr_err("failed to get exclusive lock on mapfile\n"); + /* Now check we can get O_EXCL. If not, probably "mdadm -A" has + * taken over + */ + dfd = dev_open(devname, O_RDONLY|O_EXCL); + if (dfd < 0) { + if (c->verbose >= 0) + pr_err("cannot reopen %s: %s.\n", + devname, strerror(errno)); + goto out_unlock; + } + /* Cannot hold it open while we add the device to the array, + * so we must release the O_EXCL and depend on the map_lock() + * So now is the best time to remove any partitions. + */ + remove_partitions(dfd); + close(dfd); + dfd = -1; + + mp = map_by_uuid(&map, info.uuid); + if (mp) + mdfd = open_dev(mp->devnm); + else + mdfd = -1; + + if (mdfd < 0) { + + /* Skip the clustered ones. This should be started by + * clustering resource agents + */ + if (info.array.state & (1 << MD_SB_CLUSTERED)) + goto out; + + /* Couldn't find an existing array, maybe make a new one */ + mdfd = create_mddev(match ? 
match->devname : NULL, + name_to_use, c->autof, trustworthy, chosen_name); + + if (mdfd < 0) + goto out_unlock; + + sysfs_init(&info, mdfd, NULL); + + if (set_array_info(mdfd, st, &info) != 0) { + pr_err("failed to set array info for %s: %s\n", + chosen_name, strerror(errno)); + rv = 2; + goto out_unlock; + } + + dinfo = info; + dinfo.disk.major = major(stb.st_rdev); + dinfo.disk.minor = minor(stb.st_rdev); + if (add_disk(mdfd, st, &info, &dinfo) != 0) { + pr_err("failed to add %s to new array %s: %s.\n", + devname, chosen_name, strerror(errno)); + ioctl(mdfd, STOP_ARRAY, 0); + rv = 2; + goto out_unlock; + } + sra = sysfs_read(mdfd, NULL, (GET_DEVS | GET_STATE | + GET_OFFSET | GET_SIZE)); + + if (!sra || !sra->devs || sra->devs->disk.raid_disk >= 0) { + /* It really should be 'none' - must be old buggy + * kernel, and mdadm -I may not be able to complete. + * So reject it. + */ + ioctl(mdfd, STOP_ARRAY, NULL); + pr_err("You have an old buggy kernel which cannot support\n --incremental reliably. Aborting.\n"); + rv = 2; + goto out_unlock; + } + info.array.working_disks = 1; + /* 6/ Make sure /var/run/mdadm.map contains this array. */ + map_update(&map, fd2devnm(mdfd), + info.text_version, + info.uuid, chosen_name); + } else { + /* 5b/ if it does */ + /* - check one drive in array to make sure metadata is a reasonably */ + /* close match. Reject if not (e.g. different type) */ + /* - add the device */ + char dn[20]; + int dfd2; + int err; + struct supertype *st2; + struct mdinfo info2, *d; + + sra = sysfs_read(mdfd, NULL, (GET_DEVS | GET_STATE | + GET_OFFSET | GET_SIZE)); + + if (mp->path) + strcpy(chosen_name, mp->path); + else + strcpy(chosen_name, mp->devnm); + + /* It is generally not OK to add non-spare drives to a + * running array as they are probably missing because + * they failed. However if runstop is 1, then the + * array was possibly started early and our best bet is + * to add this anyway. 
+ * Also if action policy is re-add or better we allow + * re-add. + * This doesn't apply to containers as the 'non-spare' + * flag has a different meaning. The test has to happen + * at the device level there + */ + if (!st->ss->external + && (info.disk.state & (1<ss->name, + act_re_add) + && c->runstop < 1) { + if (ioctl(mdfd, GET_ARRAY_INFO, &ainf) == 0) { + pr_err("not adding %s to active array (without --run) %s\n", + devname, chosen_name); + rv = 2; + goto out_unlock; + } + } + if (!sra) { + rv = 2; + goto out_unlock; + } + if (sra->devs) { + sprintf(dn, "%d:%d", sra->devs->disk.major, + sra->devs->disk.minor); + dfd2 = dev_open(dn, O_RDONLY); + if (dfd2 < 0) { + pr_err("unable to open %s\n", devname); + rv = 2; + goto out_unlock; + } + st2 = dup_super(st); + if (st2->ss->load_super(st2, dfd2, NULL) || + st->ss->compare_super(st, st2) != 0) { + pr_err("metadata mismatch between %s and chosen array %s\n", + devname, chosen_name); + close(dfd2); + rv = 2; + goto out_unlock; + } + close(dfd2); + st2->ss->getinfo_super(st2, &info2, NULL); + st2->ss->free_super(st2); + if (info.array.level != info2.array.level || + memcmp(info.uuid, info2.uuid, 16) != 0 || + info.array.raid_disks != info2.array.raid_disks) { + pr_err("unexpected difference between %s and %s.\n", + chosen_name, devname); + rv = 2; + goto out_unlock; + } + } + info.disk.major = major(stb.st_rdev); + info.disk.minor = minor(stb.st_rdev); + /* add disk needs to know about containers */ + if (st->ss->external) + sra->array.level = LEVEL_CONTAINER; + + if (info.array.state & (1 << MD_SB_CLUSTERED)) + info.disk.state |= (1 << MD_DISK_CLUSTER_ADD); + + err = add_disk(mdfd, st, sra, &info); + if (err < 0 && errno == EBUSY) { + /* could be another device present with the same + * disk.number. 
Find and reject any such + */ + find_reject(mdfd, st, sra, info.disk.number, + info.events, c->verbose, chosen_name); + err = add_disk(mdfd, st, sra, &info); + } + if (err < 0 && errno == EINVAL && + info.disk.state & (1<ss->name, + act_force_spare)) { + info.disk.state &= ~(1<verbose >= 0) + pr_err("can only add %s to %s as a spare, and force-spare is not set.\n", + devname, chosen_name); + } + if (err < 0) { + pr_err("failed to add %s to existing array %s: %s.\n", + devname, chosen_name, strerror(errno)); + rv = 2; + goto out_unlock; + } + info.array.working_disks = 0; + for (d = sra->devs; d; d=d->next) + info.array.working_disks ++; + + } + if (strncmp(chosen_name, "/dev/md/", 8) == 0) + md_devname = chosen_name+8; + else + md_devname = chosen_name; + if (c->export) { + printf("MD_DEVICE=%s\n", fd2devnm(mdfd)); + printf("MD_DEVNAME=%s\n", md_devname); + printf("MD_FOREIGN=%s\n", trustworthy == FOREIGN ? "yes" : "no"); + } + + /* 7/ Is there enough devices to possibly start the array? */ + /* 7a/ if not, finish with success. */ + if (info.array.level == LEVEL_CONTAINER) { + char devnm[32]; + /* Try to assemble within the container */ + sysfs_uevent(sra, "change"); + if (!c->export && c->verbose >= 0) + pr_err("container %s now has %d device%s\n", + chosen_name, info.array.working_disks, + info.array.working_disks == 1?"":"s"); + wait_for(chosen_name, mdfd); + if (st->ss->external) + strcpy(devnm, fd2devnm(mdfd)); + if (st->ss->load_container) + rv = st->ss->load_container(st, mdfd, NULL); + close(mdfd); + sysfs_free(sra); + if (!rv) + rv = Incremental_container(st, chosen_name, c, NULL); + map_unlock(&map); + /* after spare is added, ping monitor for external metadata + * so that it can eg. try to rebuild degraded array */ + if (st->ss->external) + ping_monitor(devnm); + return rv; + } + + /* We have added something to the array, so need to re-read the + * state. Eventually this state should be kept up-to-date as + * things change. 
+ */ + sysfs_free(sra); + sra = sysfs_read(mdfd, NULL, (GET_DEVS | GET_STATE | + GET_OFFSET | GET_SIZE)); + active_disks = count_active(st, sra, mdfd, &avail, &info); + + journal_device_missing = (info.journal_device_required) && (info.journal_clean == 0); + + if (enough(info.array.level, info.array.raid_disks, + info.array.layout, info.array.state & 1, + avail) == 0) { + if (c->export) { + printf("MD_STARTED=no\n"); + } else if (c->verbose >= 0) + pr_err("%s attached to %s, not enough to start (%d).\n", + devname, chosen_name, active_disks); + rv = 0; + goto out_unlock; + } + + /* 7b/ if yes, */ + /* - if number of OK devices match expected, or -R and there */ + /* are enough, */ + /* + add any bitmap file */ + /* + start the array (auto-readonly). */ + + if (ioctl(mdfd, GET_ARRAY_INFO, &ainf) == 0) { + if (c->export) { + printf("MD_STARTED=already\n"); + } else if (c->verbose >= 0) + pr_err("%s attached to %s which is already active.\n", + devname, chosen_name); + rv = 0; + goto out_unlock; + } + + map_unlock(&map); + if (c->runstop > 0 || (!journal_device_missing && active_disks >= info.array.working_disks)) { + struct mdinfo *dsk; + /* Let's try to start it */ + + if (journal_device_missing) + pr_err("Trying to run with missing journal device\n"); + if (info.reshape_active && !(info.reshape_active & RESHAPE_NO_BACKUP)) { + pr_err("%s: This array is being reshaped and cannot be started\n", + chosen_name); + cont_err("by --incremental. Please use --assemble\n"); + goto out; + } + if (match && match->bitmap_file) { + int bmfd = open(match->bitmap_file, O_RDWR); + if (bmfd < 0) { + pr_err("Could not open bitmap file %s.\n", + match->bitmap_file); + goto out; + } + if (ioctl(mdfd, SET_BITMAP_FILE, bmfd) != 0) { + close(bmfd); + pr_err("Failed to set bitmapfile for %s.\n", + chosen_name); + goto out; + } + close(bmfd); + } + /* Need to remove from the array any devices which + * 'count_active' discerned were too old or inappropriate + */ + for (d = sra ? 
sra->devs : NULL ; d ; d = d->next) + if (d->disk.state & (1<= info.array.working_disks) + && trustworthy != FOREIGN) + rv = ioctl(mdfd, RUN_ARRAY, NULL); + else + rv = sysfs_set_str(sra, NULL, + "array_state", "read-auto"); + /* Array might be O_EXCL which will interfere with + * fsck and mount. So re-open without O_EXCL. + */ + reopen_mddev(mdfd); + if (rv == 0) { + if (c->export) { + printf("MD_STARTED=yes\n"); + } else if (c->verbose >= 0) + pr_err("%s attached to %s, which has been started.\n", + devname, chosen_name); + rv = 0; + wait_for(chosen_name, mdfd); + /* We just started the array, so some devices + * might have been evicted from the array + * because their event counts were too old. + * If the action=re-add policy is in-force for + * those devices we should re-add them now. + */ + for (dsk = sra->devs; dsk ; dsk = dsk->next) { + if (disk_action_allows(dsk, st->ss->name, act_re_add) && + add_disk(mdfd, st, sra, dsk) == 0) + pr_err("%s re-added to %s\n", + dsk->sys_name, chosen_name); + } + } else { + pr_err("%s attached to %s, but failed to start: %s.\n", + devname, chosen_name, strerror(errno)); + rv = 1; + } + } else { + if (c->export) { + printf("MD_STARTED=unsafe\n"); + } else if (journal_device_missing) { + pr_err("Journal device is missing, not safe to start yet.\n"); + } else if (c->verbose >= 0) + pr_err("%s attached to %s, not enough to start safely.\n", + devname, chosen_name); + rv = 0; + } +out: + free(avail); + if (dfd >= 0) + close(dfd); + if (mdfd >= 0) + close(mdfd); + if (policy) + dev_policy_free(policy); + if (sra) + sysfs_free(sra); + return rv; +out_unlock: + map_unlock(&map); + goto out; +} + +static void find_reject(int mdfd, struct supertype *st, struct mdinfo *sra, + int number, __u64 events, int verbose, + char *array_name) +{ + /* Find a device attached to this array with a disk.number of number + * and events less than the passed events, and remove the device. 
+ */ + struct mdinfo *d; + mdu_array_info_t ra; + + if (ioctl(mdfd, GET_ARRAY_INFO, &ra) == 0) + return; /* not safe to remove from active arrays + * without thinking more */ + + for (d = sra->devs; d ; d = d->next) { + char dn[24]; // 2*11 bytes for ints (including sign) + colon + null byte + int dfd; + struct mdinfo info; + sprintf(dn, "%d:%d", d->disk.major, d->disk.minor); + dfd = dev_open(dn, O_RDONLY); + if (dfd < 0) + continue; + if (st->ss->load_super(st, dfd, NULL)) { + close(dfd); + continue; + } + st->ss->getinfo_super(st, &info, NULL); + st->ss->free_super(st); + close(dfd); + + if (info.disk.number != number || + info.events >= events) + continue; + + if (d->disk.raid_disk > -1) + sysfs_set_str(sra, d, "slot", "none"); + if (sysfs_set_str(sra, d, "state", "remove") == 0) + if (verbose >= 0) + pr_err("removing old device %s from %s\n", + d->sys_name+4, array_name); + } +} + +static int count_active(struct supertype *st, struct mdinfo *sra, + int mdfd, char **availp, + struct mdinfo *bestinfo) +{ + /* count how many devices in sra think they are active */ + struct mdinfo *d; + int cnt = 0; + int replcnt = 0; + __u64 max_events = 0; + char *avail = NULL; + int *best = NULL; + char *devmap = NULL; + int numdevs = 0; + int devnum; + int b, i; + int raid_disks = 0; + + if (!sra) + return 0; + + for (d = sra->devs ; d ; d = d->next) + numdevs++; + for (d = sra->devs, devnum = 0 ; d ; d = d->next, devnum++) { + char dn[30]; + int dfd; + int ok; + struct mdinfo info; + + sprintf(dn, "%d:%d", d->disk.major, d->disk.minor); + dfd = dev_open(dn, O_RDONLY); + if (dfd < 0) + continue; + ok = st->ss->load_super(st, dfd, NULL); + close(dfd); + if (ok != 0) + continue; + + info.array.raid_disks = raid_disks; + st->ss->getinfo_super(st, &info, devmap + raid_disks * devnum); + if (info.disk.raid_disk == MD_DISK_ROLE_JOURNAL) + bestinfo->journal_clean = 1; + if (!avail) { + raid_disks = info.array.raid_disks; + avail = xcalloc(raid_disks, 1); + *availp = avail; + + best = 
xcalloc(raid_disks, sizeof(int)); + devmap = xcalloc(raid_disks, numdevs); + + st->ss->getinfo_super(st, &info, devmap); + } + + if (info.disk.state & (1<ss->getinfo_super(st, bestinfo, NULL); + } else if (info.events == max_events) { + avail[info.disk.raid_disk] = 2; + best[info.disk.raid_disk] = devnum; + } else if (info.events == max_events-1) { + if (avail[info.disk.raid_disk] == 0) { + avail[info.disk.raid_disk] = 1; + best[info.disk.raid_disk] = devnum; + } + } else if (info.events < max_events - 1) + ; + else if (info.events == max_events+1) { + int i; + max_events = info.events; + for (i = 0; i < raid_disks; i++) + if (avail[i]) + avail[i]--; + avail[info.disk.raid_disk] = 2; + best[info.disk.raid_disk] = devnum; + st->ss->getinfo_super(st, bestinfo, NULL); + } else { /* info.events much bigger */ + memset(avail, 0, raid_disks); + max_events = info.events; + avail[info.disk.raid_disk] = 2; + best[info.disk.raid_disk] = devnum; + st->ss->getinfo_super(st, bestinfo, NULL); + } + } else if (info.disk.state & (1<ss->free_super(st); + } + + if (!avail) + return 0; + /* We need to reject any device that thinks the best device is + * failed or missing */ + for (b = 0; b < raid_disks; b++) + if (avail[b] == 2) + break; + cnt = 0; + for (i = 0 ; i < raid_disks ; i++) { + if (i != b && avail[i]) + if (devmap[raid_disks * best[i] + b] == 0) { + /* This device thinks 'b' is failed - + * don't use it */ + devnum = best[i]; + for (d=sra->devs ; devnum; d = d->next) + devnum--; + d->disk.state |= (1 << MD_DISK_REMOVED); + avail[i] = 0; + } + if (avail[i]) + cnt++; + } + /* Also need to reject any spare device with an event count that + * is too high + */ + for (d = sra->devs; d; d = d->next) { + if (!(d->disk.state & (1<events > max_events) + d->disk.state |= (1 << MD_DISK_REMOVED); + } + free(best); + free(devmap); + return cnt + replcnt; +} + +/* test if container has degraded member(s) */ +static int container_members_max_degradation(struct map_ent *map, struct map_ent 
*me) +{ + mdu_array_info_t array; + int afd; + int max_degraded = 0; + + for(; map; map = map->next) { + if (!metadata_container_matches(map->metadata, me->devnm)) + continue; + afd = open_dev(map->devnm); + if (afd < 0) + continue; + /* most accurate information regarding array degradation */ + if (ioctl(afd, GET_ARRAY_INFO, &array) >= 0) { + int degraded = array.raid_disks - array.active_disks - + array.spare_disks; + if (degraded > max_degraded) + max_degraded = degraded; + } + close(afd); + } + return (max_degraded); +} + +static int array_try_spare(char *devname, int *dfdp, struct dev_policy *pol, + struct map_ent *target, int bare, + struct supertype *st, int verbose) +{ + /* This device doesn't have any md metadata + * The device policy allows 'spare' and if !bare, it allows spare-same-slot. + * If 'st' is not set, then we only know that some metadata allows this, + * others possibly don't. + * So look for a container or array to attach the device to. + * Prefer 'target' if that is set and the array is found. + * + * If st is set, then only arrays of that type are considered + * Return 0 on success, or some exit code on failure, probably 1. + */ + int rv = 1; + struct stat stb; + struct map_ent *mp, *map = NULL; + struct mdinfo *chosen = NULL; + int dfd = *dfdp; + + if (fstat(dfd, &stb) != 0) + return 1; + + /* + * Now we need to find a suitable array to add this to. 
+ * We only accept arrays that: + * - match 'st' + * - are in the same domains as the device + * - are of an size for which the device will be useful + * and we choose the one that is the most degraded + */ + + if (map_lock(&map)) { + pr_err("failed to get exclusive lock on mapfile\n"); + return 1; + } + for (mp = map ; mp ; mp = mp->next) { + struct supertype *st2; + struct domainlist *dl = NULL; + struct mdinfo *sra; + unsigned long long devsize; + unsigned long long component_size = 0; + + if (is_subarray(mp->metadata)) + continue; + if (st) { + st2 = st->ss->match_metadata_desc(mp->metadata); + if (!st2 || + (st->minor_version >= 0 && + st->minor_version != st2->minor_version)) { + if (verbose > 1) + pr_err("not adding %s to %s as metadata type doesn't match\n", + devname, mp->path); + free(st2); + continue; + } + free(st2); + } + sra = sysfs_read(-1, mp->devnm, + GET_DEVS|GET_OFFSET|GET_SIZE|GET_STATE| + GET_DEGRADED|GET_COMPONENT|GET_VERSION); + if (!sra) { + /* Probably a container - no degraded info */ + sra = sysfs_read(-1, mp->devnm, + GET_DEVS|GET_OFFSET|GET_SIZE|GET_STATE| + GET_COMPONENT|GET_VERSION); + if (sra) + sra->array.failed_disks = -1; + } + if (!sra) + continue; + if (st == NULL) { + int i; + st2 = NULL; + for(i = 0; !st2 && superlist[i]; i++) + st2 = superlist[i]->match_metadata_desc( + sra->text_version); + if (!st2) { + if (verbose > 1) + pr_err("not adding %s to %s as metadata not recognised.\n", + devname, mp->path); + goto next; + } + /* Need to double check the 'act_spare' permissions applies + * to this metadata. 
+ */ + if (!policy_action_allows(pol, st2->ss->name, act_spare)) + goto next; + if (!bare && !policy_action_allows(pol, st2->ss->name, + act_spare_same_slot)) + goto next; + } else + st2 = st; + /* update number of failed disks for mostly degraded + * container member */ + if (sra->array.failed_disks == -1) + sra->array.failed_disks = container_members_max_degradation(map, mp); + + get_dev_size(dfd, NULL, &devsize); + if (sra->component_size == 0) { + /* true for containers, here we must read superblock + * to obtain minimum spare size */ + struct supertype *st3 = dup_super(st2); + int mdfd = open_dev(mp->devnm); + if (mdfd < 0) { + free(st3); + goto next; + } + if (st3->ss->load_container && + !st3->ss->load_container(st3, mdfd, mp->path)) { + component_size = st3->ss->min_acceptable_spare_size(st3); + st3->ss->free_super(st3); + } + free(st3); + close(mdfd); + } + if ((sra->component_size > 0 && + st2->ss->avail_size(st2, devsize, + sra->devs + ? sra->devs->data_offset + : INVALID_SECTORS) + < sra->component_size) + || + (sra->component_size == 0 && devsize < component_size)) { + if (verbose > 1) + pr_err("not adding %s to %s as it is too small\n", + devname, mp->path); + goto next; + } + /* test against target. + * If 'target' is set and 'bare' is false, we only accept + * arrays/containers that match 'target'. + * If 'target' is set and 'bare' is true, we prefer the + * array which matches 'target'. + * target is considered only if we deal with degraded array + */ + if (target && policy_action_allows(pol, st2->ss->name, + act_spare_same_slot)) { + if (strcmp(target->metadata, mp->metadata) == 0 && + memcmp(target->uuid, mp->uuid, + sizeof(target->uuid)) == 0 && + sra->array.failed_disks > 0) { + /* This is our target!! 
*/
+				if (chosen)
+					sysfs_free(chosen);
+				chosen = sra;
+				sra = NULL;
+				/* skip to end so we don't check any more */
+				while (mp->next)
+					mp = mp->next;
+				goto next;
+			}
+			/* not our target */
+			if (!bare)
+				goto next;
+		}
+
+		dl = domain_from_array(sra, st2->ss->name);
+		if (domain_test(dl, pol, st2->ss->name) != 1) {
+			/* domain test fails */
+			if (verbose > 1)
+				pr_err("not adding %s to %s as it is not in a compatible domain\n",
+				       devname, mp->path);
+
+			goto next;
+		}
+		/* all tests passed, OK to add to this array */
+		if (!chosen) {
+			chosen = sra;
+			sra = NULL;
+		} else if (chosen->array.failed_disks < sra->array.failed_disks) {
+			sysfs_free(chosen);
+			chosen = sra;
+			sra = NULL;
+		}
+	next:
+		if (sra)
+			sysfs_free(sra);
+		if (st != st2)
+			free(st2);
+		if (dl)
+			domain_free(dl);
+	}
+	if (chosen) {
+		/* add current device to chosen array as a spare */
+		int mdfd = open_dev(chosen->sys_name);
+		if (mdfd >= 0) {
+			struct mddev_dev devlist;
+			char chosen_devname[24]; // 2*11 for int (including signs) + colon + null
+			devlist.next = NULL;
+			devlist.used = 0;
+			devlist.writemostly = 0;
+			devlist.devname = chosen_devname;
+			sprintf(chosen_devname, "%d:%d", major(stb.st_rdev),
+				minor(stb.st_rdev));
+			devlist.disposition = 'a';
+			close(dfd);
+			*dfdp = -1;
+			rv = Manage_subdevs(chosen->sys_name, mdfd, &devlist,
+					    -1, 0, NULL, 0);
+			close(mdfd);
+		}
+		if (verbose > 0) {
+			if (rv == 0)
+				pr_err("added %s as spare for %s\n",
+					devname, chosen->sys_name);
+			else
+				pr_err("failed to add %s as spare for %s\n",
+					devname, chosen->sys_name);
+		}
+		sysfs_free(chosen);
+	}
+	map_unlock(&map);
+	return rv;
+}
+
+static int partition_try_spare(char *devname, int *dfdp, struct dev_policy *pol,
+			       struct supertype *st, int verbose)
+{
+	/* Copy partition metadata onto 'devname' from a compatible disk.
+	 *
+	 * We know that at least one partition virtual-metadata is
+	 * allowed to incorporate spares like this device.  We need to
+	 * find a suitable device to copy partition information from.
+	 *
+	 * Getting a list of all disk (not partition) devices is
+	 * slightly non-trivial.  We could look at /sys/block, but
+	 * that is theoretically due to be removed.  Maybe best to use
+	 * /dev/disk/by-path/?* and ignore names ending '-partNN' as
+	 * we depend on this directory of 'path' info.  But that fails
+	 * to find loop devices and probably others.  Maybe don't
+	 * worry about that, they aren't the real target.
+	 *
+	 * So: check things in /dev/disk/by-path to see if they are in
+	 * a compatible domain, then load the partition table and see
+	 * if it is OK for the new device, and choose the largest
+	 * partition table that fits.
+	 *
+	 * devname: device that should receive the metadata.
+	 * pol:     policy of that device, tested against each candidate.
+	 * st:      if set, only this metadata type is considered;
+	 *          otherwise the type is guessed per candidate.
+	 * dfdp and verbose are accepted for symmetry with
+	 * array_try_spare() and are not used here.
+	 *
+	 * Returns 0 once metadata has been handed to store_super(),
+	 * 1 if no candidate was found or 'devname' cannot be opened
+	 * for writing.
+	 */
+	DIR *dir;
+	struct dirent *de;
+	char *chosen = NULL;
+	unsigned long long chosen_size = 0;
+	struct supertype *chosen_st = NULL;
+	int rv = 1;
+	int fd;
+
+	dir = opendir("/dev/disk/by-path");
+	if (!dir)
+		return 1;
+	while ((de = readdir(dir)) != NULL) {
+		char *ep;
+		struct dev_policy *pol2 = NULL;
+		struct domainlist *domlist = NULL;
+		int cfd = -1;
+		struct mdinfo info;
+		struct supertype *st2 = NULL;
+		char *candidate = NULL;
+		unsigned long long devsectors;
+
+		if (de->d_ino == 0 ||
+		    de->d_name[0] == '.' ||
+		    (de->d_type != DT_LNK && de->d_type != DT_UNKNOWN))
+			goto next;
+
+		/* Strip trailing digits, then look for "-part" before them */
+		ep = de->d_name + strlen(de->d_name);
+		while (ep > de->d_name &&
+		       isdigit((unsigned char)ep[-1]))
+			ep--;
+		if (ep > de->d_name + 5 &&
+		    strncmp(ep-5, "-part", 5) == 0)
+			/* This is a partition - skip it */
+			goto next;
+
+		pol2 = path_policy(de->d_name, type_disk);
+
+		domain_merge(&domlist, pol2, st ? st->ss->name : NULL);
+		if (domain_test(domlist, pol, st ? st->ss->name : NULL) != 1)
+			/* new device is incompatible with this device. */
+			goto next;
+
+		domain_free(domlist);
+		domlist = NULL;
+
+		/* asprintf() returns the number of bytes printed, or -1 on
+		 * failure, in which case the pointer must not be trusted.
+		 */
+		if (asprintf(&candidate, "/dev/disk/by-path/%s", de->d_name) < 0) {
+			candidate = NULL;
+			goto next;
+		}
+		cfd = open(candidate, O_RDONLY);
+		if (cfd < 0)
+			goto next;
+		if (get_dev_size(cfd, candidate, &devsectors) == 0)
+			goto next;
+		/* bytes -> 512-byte sectors */
+		devsectors >>= 9;
+
+		if (st)
+			st2 = dup_super(st);
+		else
+			st2 = guess_super_type(cfd, guess_partitions);
+		/* Any non-zero result from load_super() is a failure, as at
+		 * the other call sites in this file.
+		 */
+		if (st2 == NULL ||
+		    st2->ss->load_super(st2, cfd, NULL) != 0)
+			goto next;
+		st2->ignore_hw_compat = 0;
+
+		if (!st) {
+			/* Check domain policy again, this time referring to metadata */
+			domain_merge(&domlist, pol2, st2->ss->name);
+			if (domain_test(domlist, pol, st2->ss->name) != 1)
+				/* Incompatible devices for this metadata type */
+				goto next;
+			if (!policy_action_allows(pol, st2->ss->name, act_spare))
+				/* Some partition types allow sparing, but not
+				 * this one.
+				 */
+				goto next;
+		}
+
+		st2->ss->getinfo_super(st2, &info, NULL);
+		/* NOTE(review): devsectors is the size of the candidate disk
+		 * itself, not of 'devname' - confirm this is the intended
+		 * "fits" test.
+		 */
+		if (info.component_size > devsectors)
+			/* This partitioning doesn't fit in the device */
+			goto next;
+
+		/* This is an acceptable device to copy partition
+		 * metadata from.  We could just stop here, but I
+		 * think I want to keep looking incase a larger
+		 * metadata which makes better use of the device can
+		 * be found.
+		 */
+		if (chosen == NULL ||
+		    chosen_size < info.component_size) {
+			chosen_size = info.component_size;
+			free(chosen);
+			chosen = candidate;
+			candidate = NULL;
+			if (chosen_st) {
+				chosen_st->ss->free_super(chosen_st);
+				free(chosen_st);
+			}
+			/* ownership of st2 moves to chosen_st */
+			chosen_st = st2;
+			st2 = NULL;
+		}
+
+	next:
+		free(candidate);
+		domain_free(domlist);
+		dev_policy_free(pol2);
+		if (st2)
+			st2->ss->free_super(st2);
+		free(st2);
+
+		if (cfd >= 0)
+			close(cfd);
+	}
+
+	closedir(dir);
+
+	if (!chosen)
+		return 1;
+
+	/* 'chosen' is the best device we can find.  Let's write its
+	 * metadata to devname.  dfd is read-only so don't use that.
+	 */
+	fd = open(devname, O_RDWR);
+	if (fd >= 0) {
+		/* NOTE(review): the result of store_super() is not checked,
+		 * as before; its return convention is not visible here.
+		 */
+		chosen_st->ss->store_super(chosen_st, fd);
+		close(fd);
+		rv = 0;
+	}
+	free(chosen);
+	chosen_st->ss->free_super(chosen_st);
+	free(chosen_st);
+	return rv;
+}
+
+/* Return 1 if the 4096 bytes at 'buf' all hold the same fill byte and
+ * that byte is 0x00, 0x5a or 0xff; otherwise return 0.
+ */
+static int is_bare_block(const char *buf)
+{
+	if (buf[0] != '\0' && buf[0] != '\x5a' && buf[0] != '\xff')
+		return 0;
+	return memcmp(buf, buf+1, 4095) == 0;
+}
+
+/* Return 1 if both the first and the last 4096 bytes of the device open
+ * on 'dfd' look blank (see is_bare_block()), 0 otherwise, including when
+ * the device cannot be read or sized or is smaller than 4096 bytes.
+ * The file offset of 'dfd' is moved.
+ */
+static int is_bare(int dfd)
+{
+	unsigned long long size = 0;
+	char bufpad[4096 + 4096];
+	/* 4096-aligned window inside bufpad */
+	char *buf = (char*)(((long)bufpad + 4096) & ~4095);
+
+	if (lseek(dfd, 0, SEEK_SET) != 0 ||
+	    read(dfd, buf, 4096) != 4096)
+		return 0;
+	if (!is_bare_block(buf))
+		return 0;
+
+	/* OK, first 4K appear blank, try the end. */
+	if (get_dev_size(dfd, NULL, &size) == 0 || size < 4096)
+		return 0;
+	if (lseek(dfd, size-4096, SEEK_SET) < 0 ||
+	    read(dfd, buf, 4096) != 4096)
+		return 0;
+
+	return is_bare_block(buf);
+}
+
+/* adding a spare to a regular array is quite different from adding one to
+ * a set-of-partitions virtual array.
+ * This function determines which is worth trying and tries as appropriate.
+ * Arrays are given priority over partitions.
+ */
+static int try_spare(char *devname, int *dfdp, struct dev_policy *pol,
+		     struct map_ent *target,
+		     struct supertype *st, int verbose)
+{
+	int i;
+	int rv;
+	int arrays_ok = 0;
+	int partitions_ok = 0;
+	int dfd = *dfdp;
+	int bare;
+
+	/* Can only add a spare if device has at least one domain */
+	if (pol_find(pol, pol_domain) == NULL)
+		return 1;
+	/* And only if some action allows spares */
+	if (!policy_action_allows(pol, st?st->ss->name:NULL, act_spare))
+		return 1;
+
+	/* Now check if the device is bare.
+	 * bare devices can always be added as a spare
+	 * non-bare devices can only be added if spare-same-slot is permitted,
+	 * and this device is replacing a previous device - in which case 'target'
+	 * will be set.
+	 */
+	if (!is_bare(dfd)) {
+		/* Must have a target and allow same_slot */
+		/* Later - may allow force_spare without target */
+		if (!target ||
+		    !policy_action_allows(pol, st?st->ss->name:NULL,
+					  act_spare_same_slot)) {
+			if (verbose > 1)
+				pr_err("%s is not bare, so not considering as a spare\n",
+				       devname);
+			return 1;
+		}
+		bare = 0;
+	} else
+		bare = 1;
+
+	/* It might be OK to add this device to an array - need to see
+	 * what arrays might be candidates.
+	 */
+	if (st) {
+		/* just try try 'array' or 'partition' based on this metadata */
+		if (st->ss->add_to_super)
+			return array_try_spare(devname, dfdp, pol, target, bare,
+					       st, verbose);
+		else
+			return partition_try_spare(devname, dfdp, pol,
+						   st, verbose);
+	}
+	/* No metadata was specified or found so options are open.
+	 * Check for whether any array metadata, or any partition metadata
+	 * might allow adding the spare.  This check is just help to avoid
+	 * a more costly scan of all arrays when we can be sure that will
+	 * fail.
+	 */
+	for (i = 0; (!arrays_ok || !partitions_ok) && superlist[i] ; i++) {
+		if (superlist[i]->add_to_super && !arrays_ok &&
+		    policy_action_allows(pol, superlist[i]->name, act_spare))
+			arrays_ok = 1;
+		if (superlist[i]->add_to_super == NULL && !partitions_ok &&
+		    policy_action_allows(pol, superlist[i]->name, act_spare))
+			partitions_ok = 1;
+	}
+	rv = 1;
+	if (arrays_ok)
+		rv = array_try_spare(devname, dfdp, pol, target, bare,
+				     st, verbose);
+	if (rv != 0 && partitions_ok)
+		rv = partition_try_spare(devname, dfdp, pol, st, verbose);
+	return rv;
+}
+
+int IncrementalScan(struct context *c, char *devnm)
+{
+	/* look at every device listed in the 'map' file.
+	 * If one is found that is not running then:
+	 *   look in mdadm.conf for bitmap file.
+	 *   if one exists, try to add it (EEXIST is not reported).
+	 *   try to start array in auto-readonly mode
+	 *
+	 * If 'devnm' is set only the map entry with that name is
+	 * handled.  When that entry is a member array (its metadata
+	 * string starts with '/'), the scan restarts on the container
+	 * named in that string and the member name is passed on to
+	 * Incremental_container() as 'only'.
+	 *
+	 * Returns 0, or 1 if a container could not be processed or an
+	 * array could not be switched to "read-auto".
+	 */
+	struct map_ent *mapl = NULL;
+	struct map_ent *me;
+	struct mddev_ident *devs, *mddev;
+	int rv = 0;
+	char container[32];
+	char *only = NULL;
+
+	map_read(&mapl);
+	devs = conf_get_ident(NULL);
+
+restart:
+	for (me = mapl ; me ; me = me->next) {
+		mdu_array_info_t array;
+		struct mdinfo *sra;
+		int mdfd;
+
+		if (devnm && strcmp(devnm, me->devnm) != 0)
+			continue;
+		if (devnm && me->metadata[0] == '/') {
+			char *sl;
+			/* member array, need to work on container */
+			strncpy(container, me->metadata+1, 32);
+			container[31] = 0;
+			sl = strchr(container, '/');
+			if (sl)
+				*sl = 0;
+			only = devnm;
+			devnm = container;
+			goto restart;
+		}
+		mdfd = open_dev(me->devnm);
+
+		if (mdfd < 0)
+			continue;
+		if (!isdigit((unsigned char)me->metadata[0])) {
+			/* must be a container */
+			struct supertype *st = super_by_fd(mdfd, NULL);
+			int ret = 0;
+			struct map_ent *map = NULL;
+
+			if (st && st->ss->load_container)
+				ret = st->ss->load_container(st, mdfd, NULL);
+			close(mdfd);
+			if (!ret && st && st->ss->container_content) {
+				if (map_lock(&map))
+					pr_err("failed to get exclusive lock on mapfile\n");
+				ret = Incremental_container(st, me->path, c, only);
+				map_unlock(&map);
+			}
+			if (ret)
+				rv = 1;
+			continue;
+		}
+		/* Only arrays for which GET_ARRAY_INFO fails with ENODEV
+		 * are candidates for starting.
+		 */
+		if (ioctl(mdfd, GET_ARRAY_INFO, &array) == 0 ||
+		    errno != ENODEV) {
+			close(mdfd);
+			continue;
+		}
+		/* Ok, we can try this one.  Maybe it needs a bitmap */
+		for (mddev = devs ; mddev ; mddev = mddev->next)
+			if (mddev->devname && me->path
+			    && devname_matches(mddev->devname, me->path))
+				break;
+		if (mddev && mddev->bitmap_file) {
+			/* GET_ARRAY_INFO has just failed with ENODEV, so it
+			 * is not asked again here.  Adding the bitmap is
+			 * best effort: a failure is reported but the array
+			 * is still started below.
+			 */
+			int added = -1;
+			int err;
+			int bmfd = open(mddev->bitmap_file, O_RDWR);
+
+			/* keep the errno of the call that failed; close()
+			 * may overwrite it
+			 */
+			err = errno;
+			if (bmfd >= 0) {
+				added = ioctl(mdfd, SET_BITMAP_FILE,
+					      bmfd);
+				err = errno;
+				close(bmfd);
+			}
+			if (c->verbose >= 0) {
+				if (added == 0)
+					pr_err("Added bitmap %s to %s\n",
+					       mddev->bitmap_file, me->path);
+				else if (err != EEXIST)
+					pr_err("Failed to add bitmap to %s: %s\n",
+					       me->path, strerror(err));
+			}
+		}
+		/* FIXME check for reshape_active and consider not
+		 * starting array.
+		 */
+		sra = sysfs_read(mdfd, NULL, 0);
+		if (sra) {
+			if (sysfs_set_str(sra, NULL,
+					  "array_state", "read-auto") == 0) {
+				if (c->verbose >= 0)
+					pr_err("started array %s\n",
+					       me->path ?: me->devnm);
+			} else {
+				pr_err("failed to start array %s: %s\n",
+				       me->path ?: me->devnm,
+				       strerror(errno));
+				rv = 1;
+			}
+			sysfs_free(sra);
+		}
+		/* every other exit from this iteration closes mdfd too */
+		close(mdfd);
+	}
+	map_free(mapl);
+	return rv;
+}
+
+static char *container2devname(char *devname)
+{
+	/* Resolve a container reference to an md device name.
+	 * 'devname' is either a path (leading '/'), which is opened and
+	 * passed to fd2devnm(), or a UUID string, which is looked up in
+	 * the map file.
+	 * Returns a copy made with xstrdup(), or NULL if the path cannot
+	 * be opened, the UUID does not parse, or the UUID is not in the
+	 * map.
+	 */
+	char *mdname = NULL;
+
+	if (devname[0] == '/') {
+		int fd = open(devname, O_RDONLY);
+		if (fd >= 0) {
+			mdname = xstrdup(fd2devnm(fd));
+			close(fd);
+		}
+	} else {
+		int uuid[4];
+		struct map_ent *mp, *map = NULL;
+
+		if (!parse_uuid(devname, uuid))
+			return mdname;
+		mp = map_by_uuid(&map, uuid);
+		if (mp)
+			mdname = xstrdup(mp->devnm);
+		map_free(map);
+	}
+
+	return mdname;
+}
+
+static int Incremental_container(struct supertype *st, char *devname,
+				 struct context *c, char *only)
+{
+	/* Collect the contents of this container and for each
+	 * array, choose a device name and assemble the array.
+ */ + + struct mdinfo *list; + struct mdinfo *ra; + struct map_ent *map = NULL; + struct mdinfo info; + int trustworthy; + struct mddev_ident *match; + int rv = 0; + struct domainlist *domains; + struct map_ent *smp; + int suuid[4]; + int sfd; + int ra_blocked = 0; + int ra_all = 0; + int result = 0; + + st->ss->getinfo_super(st, &info, NULL); + + if ((c->runstop > 0 && info.container_enough >= 0) || + info.container_enough > 0) + /* pass */; + else { + if (c->export) { + printf("MD_STARTED=no\n"); + } else if (c->verbose) + pr_err("not enough devices to start the container\n"); + return 0; + } + + match = conf_match(st, &info, devname, c->verbose, &rv); + if (match == NULL && rv == 2) + return rv; + + /* Need to compute 'trustworthy' */ + if (match) + trustworthy = LOCAL; + else if (st->ss->match_home(st, c->homehost) == 1) + trustworthy = LOCAL; + else if (st->ss->match_home(st, "any") == 1) + trustworthy = LOCAL; + else + trustworthy = FOREIGN; + + list = st->ss->container_content(st, NULL); + /* when nothing to activate - quit */ + if (list == NULL) { + if (c->export) { + printf("MD_STARTED=nothing\n"); + } + return 0; + } + for (ra = list ; ra ; ra = ra->next) { + int mdfd; + char chosen_name[1024]; + struct map_ent *mp; + struct mddev_ident *match = NULL; + + ra_all++; + /* do not activate arrays blocked by metadata handler */ + if (ra->array.state & (1 << MD_SB_BLOCK_VOLUME)) { + pr_err("Cannot activate array %s in %s.\n", + ra->text_version, devname); + ra_blocked++; + continue; + } + mp = map_by_uuid(&map, ra->uuid); + + if (mp) { + mdfd = open_dev(mp->devnm); + if (mp->path) + strcpy(chosen_name, mp->path); + else + strcpy(chosen_name, mp->devnm); + } else if (!only) { + + /* Check in mdadm.conf for container == devname and + * member == ra->text_version after second slash. 
+ */ + char *sub = strchr(ra->text_version+1, '/'); + struct mddev_ident *array_list; + if (sub) { + sub++; + array_list = conf_get_ident(NULL); + } else + array_list = NULL; + for(; array_list ; array_list = array_list->next) { + char *dn; + if (array_list->member == NULL || + array_list->container == NULL) + continue; + if (strcmp(array_list->member, sub) != 0) + continue; + if (array_list->uuid_set && + !same_uuid(ra->uuid, array_list->uuid, st->ss->swapuuid)) + continue; + dn = container2devname(array_list->container); + if (dn == NULL) + continue; + if (strncmp(dn, ra->text_version+1, + strlen(dn)) != 0 || + ra->text_version[strlen(dn)+1] != '/') { + free(dn); + continue; + } + free(dn); + /* we have a match */ + match = array_list; + if (c->verbose>0) + pr_err("match found for member %s\n", + array_list->member); + break; + } + + if (match && match->devname && + strcasecmp(match->devname, "") == 0) { + if (c->verbose > 0) + pr_err("array %s/%s is explicitly ignored by mdadm.conf\n", + match->container, match->member); + continue; + } + if (match) + trustworthy = LOCAL; + + mdfd = create_mddev(match ? 
match->devname : NULL, + ra->name, + c->autof, + trustworthy, + chosen_name); + } + if (only && (!mp || strcmp(mp->devnm, only) != 0)) + continue; + + if (mdfd < 0) { + pr_err("failed to open %s: %s.\n", + chosen_name, strerror(errno)); + return 2; + } + + assemble_container_content(st, mdfd, ra, c, + chosen_name, &result); + close(mdfd); + } + if (c->export && result) { + char sep = '='; + printf("MD_STARTED"); + if (result & INCR_NO) { + printf("%cno", sep); + sep = ','; + } + if (result & INCR_UNSAFE) { + printf("%cunsafe", sep); + sep = ','; + } + if (result & INCR_ALREADY) { + printf("%calready", sep); + sep = ','; + } + if (result & INCR_YES) { + printf("%cyes", sep); + sep = ','; + } + printf("\n"); + } + + /* don't move spares to container with volume being activated + when all volumes are blocked */ + if (ra_all == ra_blocked) + return 0; + + /* Now move all suitable spares from spare container */ + domains = domain_from_array(list, st->ss->name); + memcpy(suuid, uuid_zero, sizeof(int[4])); + if (domains && + (smp = map_by_uuid(&map, suuid)) != NULL && + (sfd = open(smp->path, O_RDONLY)) >= 0) { + /* spare container found */ + struct supertype *sst = + super_imsm.match_metadata_desc("imsm"); + struct mdinfo *sinfo; + unsigned long long min_size = 0; + if (st->ss->min_acceptable_spare_size) + min_size = st->ss->min_acceptable_spare_size(st); + if (!sst->ss->load_container(sst, sfd, NULL)) { + close(sfd); + sinfo = container_choose_spares(sst, min_size, + domains, NULL, + st->ss->name, 0); + sst->ss->free_super(sst); + if (sinfo){ + int count = 0; + struct mdinfo *disks = sinfo->devs; + while (disks) { + /* move spare from spare + * container to currently + * assembled one + */ + if (move_spare( + smp->path, + devname, + makedev(disks->disk.major, + disks->disk.minor))) + count++; + disks = disks->next; + } + if (count) + pr_err("Added %d spare%s to %s\n", + count, count>1?"s":"", devname); + } + sysfs_free(sinfo); + } else + close(sfd); + } + 
domain_free(domains); + return 0; +} + +static void run_udisks(char *arg1, char *arg2) +{ + int pid = fork(); + int status; + if (pid == 0) { + execl("/usr/bin/udisks", "udisks", arg1, arg2, NULL); + execl("/bin/udisks", "udisks", arg1, arg2, NULL); + exit(1); + } + while (pid > 0 && wait(&status) != pid) + ; +} + +/* + * IncrementalRemove - Attempt to see if the passed in device belongs to any + * raid arrays, and if so first fail (if needed) and then remove the device. + * + * @devname - The device we want to remove + * @id_path - name as found in /dev/disk/by-path for this device + * + * Note: the device name must be a kernel name like "sda", so + * that we can find it in /proc/mdstat + */ +int IncrementalRemove(char *devname, char *id_path, int verbose) +{ + int mdfd; + int rv = 0; + struct mdstat_ent *ent; + struct mddev_dev devlist; + struct mdinfo mdi; + char buf[32]; + + if (!id_path) + dprintf("incremental removal without --path lacks the possibility to re-add new device in this port\n"); + + if (strchr(devname, '/')) { + pr_err("incremental removal requires a kernel device name, not a file: %s\n", devname); + return 1; + } + ent = mdstat_by_component(devname); + if (!ent) { + if (verbose >= 0) + pr_err("%s does not appear to be a component of any array\n", devname); + return 1; + } + sysfs_init(&mdi, -1, ent->devnm); + mdfd = open_dev_excl(ent->devnm); + if (mdfd > 0) { + close(mdfd); + if (sysfs_get_str(&mdi, NULL, "array_state", + buf, sizeof(buf)) > 0) { + if (strncmp(buf, "active", 6) == 0 || + strncmp(buf, "clean", 5) == 0) + sysfs_set_str(&mdi, NULL, + "array_state", "read-auto"); + } + } + mdfd = open_dev(ent->devnm); + if (mdfd < 0) { + if (verbose >= 0) + pr_err("Cannot open array %s!!\n", ent->devnm); + free_mdstat(ent); + return 1; + } + + if (id_path) { + struct map_ent *map = NULL, *me; + me = map_by_devnm(&map, ent->devnm); + if (me) + policy_save_path(id_path, me); + map_free(map); + } + + memset(&devlist, 0, sizeof(devlist)); + 
devlist.devname = devname; + devlist.disposition = 'f'; + /* for a container, we must fail each member array */ + if (ent->metadata_version && + strncmp(ent->metadata_version, "external:", 9) == 0) { + struct mdstat_ent *mdstat = mdstat_read(0, 0); + struct mdstat_ent *memb; + for (memb = mdstat ; memb ; memb = memb->next) + if (is_container_member(memb, ent->devnm)) { + int subfd = open_dev(memb->devnm); + if (subfd >= 0) { + rv |= Manage_subdevs( + memb->devnm, subfd, + &devlist, verbose, 0, + NULL, 0); + close(subfd); + } + } + free_mdstat(mdstat); + } else + rv |= Manage_subdevs(ent->devnm, mdfd, &devlist, + verbose, 0, NULL, 0); + if (rv & 2) { + /* Failed due to EBUSY, try to stop the array. + * Give udisks a chance to unmount it first. + */ + int devid = devnm2devid(ent->devnm); + run_udisks("--unmount", map_dev(major(devid),minor(devid), 0)); + rv = Manage_stop(ent->devnm, mdfd, verbose, 1); + if (rv) + /* At least we can try to trigger a 'remove' */ + sysfs_uevent(&mdi, "remove"); + if (verbose) { + if (rv) + pr_err("Fail to stop %s too.\n", ent->devnm); + } + } else { + devlist.disposition = 'r'; + rv = Manage_subdevs(ent->devnm, mdfd, &devlist, + verbose, 0, NULL, 0); + } + close(mdfd); + free_mdstat(ent); + return rv; +} diff --git a/Kill.c b/Kill.c new file mode 100644 index 00000000..f2fdb856 --- /dev/null +++ b/Kill.c @@ -0,0 +1,146 @@ +/* + * mdadm - manage Linux "md" devices aka RAID arrays. + * + * Copyright (C) 2001-2009 Neil Brown + * + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * Author: Neil Brown
+ * Email:
+ *
+ * Added by Dale Stephenson
+ * steph@snapserver.com
+ */
+
+#include "mdadm.h"
+#include "md_u.h"
+#include "md_p.h"
+
+int Kill(char *dev, struct supertype *st, int force, int verbose, int noexcl)
+{
+	/*
+	 * Nothing fancy about Kill. It just zeroes out a superblock
+	 * Definitely not safe.
+	 *
+	 * dev     - path of the component device; opened O_RDWR
+	 * st      - metadata handler to use; when NULL it is guessed
+	 *           from the device with guess_super()
+	 * force   - implies 'noexcl', and also writes the fresh
+	 *           superblock when load_super() returned >= 2
+	 * verbose - messages are printed when this is >= 0
+	 * noexcl  - when non-zero the device is opened without O_EXCL
+	 *
+	 * Returns:
+	 * 0 - a zero superblock was successfully written out
+	 * 1 - failed to write the zero superblock
+	 * 2 - failed to open the device or find a superblock.
+	 * When load_super() fails and nothing is written, its non-zero
+	 * result is returned unchanged.
+	 */
+
+	int fd, rv = 0;
+
+	if (force)
+		noexcl = 1;
+	fd = open(dev, O_RDWR|(noexcl ? 0 : O_EXCL));
+	if (fd < 0) {
+		if (verbose >= 0)
+			pr_err("Couldn't open %s for write - not zeroing\n",
+				dev);
+		return 2;
+	}
+	if (st == NULL)
+		st = guess_super(fd);
+	if (st == NULL || st->ss->init_super == NULL) { /* no handler, or one
+							  * that cannot build
+							  * a superblock */
+		if (verbose >= 0)
+			pr_err("Unrecognised md component device - %s\n", dev);
+		close(fd);
+		return 2;
+	}
+	st->ignore_hw_compat = 1;
+	rv = st->ss->load_super(st, fd, dev);
+	if (rv == 0 || (force && rv >= 2)) {
+		/* discard whatever was loaded, build a fresh superblock
+		 * with init_super() and store that on the device
+		 */
+		st->ss->free_super(st);
+		st->ss->init_super(st, NULL, 0, "", NULL, NULL,
+				   INVALID_SECTORS);
+		if (st->ss->store_super(st, fd)) {
+			if (verbose >= 0)
+				pr_err("Could not zero superblock on %s\n",
+					dev);
+			rv = 1;
+		} else if (rv) { /* forced: load failed but the write worked */
+			if (verbose >= 0)
+				pr_err("superblock zeroed anyway\n");
+			rv = 0;
+		}
+	}
+	close(fd);
+	return rv;
+}
+
+int Kill_subarray(char *dev, char *subarray, int verbose)
+{
+	/* Delete a subarray out of a container, the subarray must be
+	 * inactive. The subarray string must be a subarray index
+	 * number.
+	 *
+	 * dev      - the container device, passed to open_subarray()
+	 * subarray - index of the subarray to delete
+	 * verbose  - messages are printed when this is >= 0
+	 *
+	 * 0 = successfully deleted subarray from all container members
+	 * 1 = failed to sync metadata to one or more devices
+	 * 2 = failed to find the container, subarray, or other resource
+	 * issue
+	 * A non-zero result of the handler's kill_subarray() is returned
+	 * as-is.
+	 */
+	struct supertype supertype, *st = &supertype;
+	int fd, rv = 2;
+
+	memset(st, 0, sizeof(*st));
+
+	fd = open_subarray(dev, subarray, st, verbose < 0); /* last argument
+							      * presumably means
+							      * "quiet" - TODO
+							      * confirm */
+	if (fd < 0)
+		return 2;
+
+	if (!st->ss->kill_subarray) {
+		if (verbose >= 0)
+			pr_err("Operation not supported for %s metadata\n",
+			       st->ss->name);
+		goto free_super;
+	}
+
+	if (is_subarray_active(subarray, st->devnm)) {
+		if (verbose >= 0)
+			pr_err("Subarray-%s still active, aborting\n",
+			       subarray);
+		goto free_super;
+	}
+
+	if (mdmon_running(st->devnm))
+		st->update_tail = &st->updates; /* selects the
+						 * flush_metadata_updates()
+						 * path below instead of
+						 * sync_metadata() */
+
+	/* ok we've found our victim, drop the axe */
+	rv = st->ss->kill_subarray(st);
+	if (rv) {
+		if (verbose >= 0)
+			pr_err("Failed to delete subarray-%s from %s\n",
+			       subarray, dev);
+		goto free_super;
+	}
+
+	/* FIXME these routines do not report success/failure */
+	if (st->update_tail)
+		flush_metadata_updates(st);
+	else
+		st->ss->sync_metadata(st);
+
+	if (verbose >= 0)
+		pr_err("Deleted subarray-%s from %s, UUIDs may have changed\n",
+		       subarray, dev);
+
+	rv = 0;
+
+ free_super:
+	st->ss->free_super(st);
+	close(fd);
+
+	return rv;
+}
diff --git a/Makefile b/Makefile
new file mode 100644
index 00000000..664c79ff
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,345 @@
+#
+# mdadm - manage Linux "md" devices aka RAID arrays.
+#
+# Copyright (C) 2001-2002 Neil Brown
+# Copyright (C) 2013 Neil Brown
+#
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +# +# Author: Neil Brown +# Email: +# Paper: Neil Brown +# School of Computer Science and Engineering +# The University of New South Wales +# Sydney, 2052 +# Australia +# + +# define "CXFLAGS" to give extra flags to CC. +# e.g. make CXFLAGS=-O to optimise +TCC = tcc +UCLIBC_GCC = $(shell for nm in i386-uclibc-linux-gcc i386-uclibc-gcc; do which $$nm > /dev/null && { echo $$nm ; exit; } ; done; echo false No uclibc found ) +#DIET_GCC = diet gcc +# sorry, but diet-libc doesn't know about posix_memalign, +# so we cannot use it any more. +DIET_GCC = gcc -DHAVE_STDINT_H + +KLIBC=/home/src/klibc/klibc-0.77 + +KLIBC_GCC = gcc -nostdinc -iwithprefix include -I$(KLIBC)/klibc/include -I$(KLIBC)/linux/include -I$(KLIBC)/klibc/arch/i386/include -I$(KLIBC)/klibc/include/bits32 + +CC = $(CROSS_COMPILE)gcc +CXFLAGS ?= -ggdb +CWFLAGS = -Wall -Wstrict-prototypes -Wextra -Wno-unused-parameter +ifdef WARN_UNUSED +CWFLAGS += -Wp,-D_FORTIFY_SOURCE=2 -O3 +endif + +ifdef DEBIAN +CPPFLAGS += -DDEBIAN +endif +ifdef DEFAULT_OLD_METADATA + CPPFLAGS += -DDEFAULT_OLD_METADATA + DEFAULT_METADATA=0.90 +else + DEFAULT_METADATA=1.2 +endif +CPPFLAGS += -DBINDIR=\"$(BINDIR)\" + +PKG_CONFIG ?= pkg-config + +SYSCONFDIR = /etc +CONFFILE = $(SYSCONFDIR)/mdadm/mdadm.conf +CONFFILE2 = $(SYSCONFDIR)/mdadm.conf +MAILCMD =/usr/sbin/sendmail -t +CONFFILEFLAGS = -DCONFFILE=\"$(CONFFILE)\" -DCONFFILE2=\"$(CONFFILE2)\" +# Both MAP_DIR and MDMON_DIR should be somewhere that persists across the +# pivotroot from early boot to late boot. 
+# /run is best, but for distros that don't support that. +# /dev can work, in which case you probably want /dev/.mdadm +RUN_DIR=/run/mdadm +CHECK_RUN_DIR=1 +MAP_DIR=$(RUN_DIR) +MAP_FILE = map +MAP_PATH = $(MAP_DIR)/$(MAP_FILE) +MDMON_DIR = $(RUN_DIR) +# place for autoreplace cookies +FAILED_SLOTS_DIR = $(RUN_DIR)/failed-slots +SYSTEMD_DIR=/lib/systemd/system + +COROSYNC:=$(shell [ -d /usr/include/corosync ] || echo -DNO_COROSYNC) +DLM:=$(shell [ -f /usr/include/libdlm.h ] || echo -DNO_DLM) + +DIRFLAGS = -DMAP_DIR=\"$(MAP_DIR)\" -DMAP_FILE=\"$(MAP_FILE)\" +DIRFLAGS += -DMDMON_DIR=\"$(MDMON_DIR)\" +DIRFLAGS += -DFAILED_SLOTS_DIR=\"$(FAILED_SLOTS_DIR)\" +CFLAGS = $(CWFLAGS) $(CXFLAGS) -DSendmail=\""$(MAILCMD)"\" $(CONFFILEFLAGS) $(DIRFLAGS) $(COROSYNC) $(DLM) + +VERSION = $(shell [ -d .git ] && git describe HEAD | sed 's/mdadm-//') +VERS_DATE = $(shell [ -d .git ] && date --date="`git log -n1 --format=format:%cd --date=short`" '+%0dth %B %Y' | sed -e 's/1th/1st/' -e 's/2th/2nd/' -e 's/11st/11th/' -e 's/12nd/12th/') +DVERS = $(if $(VERSION),-DVERSION=\"$(VERSION)\",) +DDATE = $(if $(VERS_DATE),-DVERS_DATE="\"$(VERS_DATE)\"",) +CFLAGS += $(DVERS) $(DDATE) + +# The glibc TLS ABI requires applications that call clone(2) to set up +# TLS data structures, use pthreads until mdmon implements this support +USE_PTHREADS = 1 +ifdef USE_PTHREADS +CFLAGS += -DUSE_PTHREADS +MON_LDFLAGS += -pthread +endif + +# If you want a static binary, you might uncomment these +# LDFLAGS = -static +# STRIP = -s +LDLIBS=-ldl + +INSTALL = /usr/bin/install +DESTDIR = +BINDIR = /sbin +MANDIR = /usr/share/man +MAN4DIR = $(MANDIR)/man4 +MAN5DIR = $(MANDIR)/man5 +MAN8DIR = $(MANDIR)/man8 + +UDEVDIR := $(shell $(PKG_CONFIG) --variable=udevdir udev 2>/dev/null) +ifndef UDEVDIR + UDEVDIR = /lib/udev +endif + +ifeq (,$(findstring s,$(MAKEFLAGS))) + ECHO=echo +else + ECHO=: +endif + +OBJS = mdadm.o config.o policy.o mdstat.o ReadMe.o util.o maps.o lib.o \ + Manage.o Assemble.o Build.o \ + Create.o 
Detail.o Examine.o Grow.o Monitor.o dlink.o Kill.o Query.o \ + Incremental.o Dump.o \ + mdopen.o super0.o super1.o super-ddf.o super-intel.o bitmap.o \ + super-mbr.o super-gpt.o \ + restripe.o sysfs.o sha1.o mapfile.o crc32.o sg_io.o msg.o xmalloc.o \ + platform-intel.o probe_roms.o crc32c.o + +CHECK_OBJS = restripe.o sysfs.o maps.o lib.o xmalloc.o dlink.o + +SRCS = $(patsubst %.o,%.c,$(OBJS)) + +INCL = mdadm.h part.h bitmap.h + +MON_OBJS = mdmon.o monitor.o managemon.o util.o maps.o mdstat.o sysfs.o \ + policy.o lib.o \ + Kill.o sg_io.o dlink.o ReadMe.o super-intel.o \ + super-mbr.o super-gpt.o \ + super-ddf.o sha1.o crc32.o msg.o bitmap.o xmalloc.o \ + platform-intel.o probe_roms.o + +MON_SRCS = $(patsubst %.o,%.c,$(MON_OBJS)) + +STATICSRC = pwgr.c +STATICOBJS = pwgr.o + +ASSEMBLE_SRCS := mdassemble.c Assemble.c Manage.c config.c policy.c dlink.c util.c \ + maps.c lib.c xmalloc.c \ + super0.c super1.c super-ddf.c super-intel.c sha1.c crc32.c sg_io.c mdstat.c \ + platform-intel.c probe_roms.c sysfs.c super-mbr.c super-gpt.c mapfile.c +ASSEMBLE_AUTO_SRCS := mdopen.c +ASSEMBLE_FLAGS:= $(CFLAGS) -DMDASSEMBLE +ifdef MDASSEMBLE_AUTO +ASSEMBLE_SRCS += $(ASSEMBLE_AUTO_SRCS) +ASSEMBLE_FLAGS += -DMDASSEMBLE_AUTO +endif + +all : mdadm mdmon +man : mdadm.man md.man mdadm.conf.man mdmon.man raid6check.man + +check_rundir: + @if [ ! -d "$(dir $(RUN_DIR))" -a "$(CHECK_RUN_DIR)" = 1 ]; then \ + echo "***** Parent of $(RUN_DIR) does not exist. Maybe set different RUN_DIR="; \ + echo "***** e.g. make RUN_DIR=/dev/.mdadm" ; \ + echo "***** or set CHECK_RUN_DIR=0"; exit 1; \ + fi + +everything: all mdadm.static swap_super test_stripe raid6check \ + mdassemble mdassemble.auto mdassemble.static mdassemble.man \ + mdadm.Os mdadm.O2 man +everything-test: all mdadm.static swap_super test_stripe \ + mdassemble.auto mdassemble.static mdassemble.man \ + mdadm.Os mdadm.O2 man +# mdadm.uclibc and mdassemble.uclibc don't work on x86-64 +# mdadm.tcc doesn't work.. 
+ +mdadm : $(OBJS) | check_rundir + $(CC) $(CFLAGS) $(LDFLAGS) -o mdadm $(OBJS) $(LDLIBS) + +mdadm.static : $(OBJS) $(STATICOBJS) + $(CC) $(CFLAGS) $(LDFLAGS) -static -o mdadm.static $(OBJS) $(STATICOBJS) $(LDLIBS) + +mdadm.tcc : $(SRCS) $(INCL) + $(TCC) -o mdadm.tcc $(SRCS) + +mdadm.klibc : $(SRCS) $(INCL) + rm -f $(OBJS) + $(CC) -nostdinc -iwithprefix include -I$(KLIBC)/klibc/include -I$(KLIBC)/linux/include -I$(KLIBC)/klibc/arch/i386/include -I$(KLIBC)/klibc/include/bits32 $(CFLAGS) $(SRCS) + +mdadm.Os : $(SRCS) $(INCL) + $(CC) -o mdadm.Os $(CFLAGS) $(CPPFLAGS) $(LDFLAGS) -DHAVE_STDINT_H -Os $(SRCS) $(LDLIBS) + +mdadm.O2 : $(SRCS) $(INCL) mdmon.O2 + $(CC) -o mdadm.O2 $(CFLAGS) $(CPPFLAGS) $(LDFLAGS) -DHAVE_STDINT_H -O2 -D_FORTIFY_SOURCE=2 $(SRCS) $(LDLIBS) + +mdmon.O2 : $(MON_SRCS) $(INCL) mdmon.h + $(CC) -o mdmon.O2 $(CFLAGS) $(CPPFLAGS) $(LDFLAGS) $(MON_LDFLAGS) -DHAVE_STDINT_H -O2 -D_FORTIFY_SOURCE=2 $(MON_SRCS) $(LDLIBS) + +# use '-z now' to guarantee no dynamic linker interactions with the monitor thread +mdmon : $(MON_OBJS) | check_rundir + $(CC) $(CFLAGS) $(LDFLAGS) $(MON_LDFLAGS) -Wl,-z,now -o mdmon $(MON_OBJS) $(LDLIBS) +msg.o: msg.c msg.h + +test_stripe : restripe.c xmalloc.o mdadm.h + $(CC) $(CFLAGS) $(CXFLAGS) $(LDFLAGS) -o test_stripe xmalloc.o -DMAIN restripe.c + +raid6check : raid6check.o mdadm.h $(CHECK_OBJS) + $(CC) $(CXFLAGS) $(LDFLAGS) -o raid6check raid6check.o $(CHECK_OBJS) + +mdassemble : $(ASSEMBLE_SRCS) $(INCL) + $(CC) $(CFLAGS) $(CPPFLAGS) $(LDFLAGS) $(ASSEMBLE_FLAGS) -o mdassemble $(ASSEMBLE_SRCS) $(STATICSRC) + +mdassemble.diet : $(ASSEMBLE_SRCS) $(INCL) + rm -f $(OBJS) + $(DIET_GCC) $(ASSEMBLE_FLAGS) -o mdassemble $(ASSEMBLE_SRCS) $(STATICSRC) + +mdassemble.static : $(ASSEMBLE_SRCS) $(INCL) + rm -f $(OBJS) + $(CC) $(LDFLAGS) $(CPPFLAGS) $(ASSEMBLE_FLAGS) -static -DHAVE_STDINT_H -o mdassemble.static $(ASSEMBLE_SRCS) $(STATICSRC) + +mdassemble.auto : $(ASSEMBLE_SRCS) $(INCL) $(ASSEMBLE_AUTO_SRCS) + rm -f mdassemble.static + $(MAKE) 
MDASSEMBLE_AUTO=1 mdassemble.static
+	mv mdassemble.static mdassemble.auto
+
+# Remove the regular objects first, as the other mdassemble variants do.
+# This used to say $(OJS), an undefined variable that expands to nothing,
+# so nothing was removed.
+mdassemble.uclibc : $(ASSEMBLE_SRCS) $(INCL)
+	rm -f $(OBJS)
+	$(UCLIBC_GCC) $(ASSEMBLE_FLAGS) -DUCLIBC -DHAVE_STDINT_H -static -o mdassemble.uclibc $(ASSEMBLE_SRCS) $(STATICSRC)
+
+# This doesn't work
+mdassemble.klibc : $(ASSEMBLE_SRCS) $(INCL)
+	rm -f $(OBJS)
+	$(KLIBC_GCC) $(ASSEMBLE_FLAGS) -o mdassemble $(ASSEMBLE_SRCS)
+
+mdadm.8 : mdadm.8.in
+	sed -e 's/{DEFAULT_METADATA}/$(DEFAULT_METADATA)/g' \
+	    -e 's,{MAP_PATH},$(MAP_PATH),g' mdadm.8.in > mdadm.8
+
+mdadm.man : mdadm.8
+	man -l mdadm.8 > mdadm.man
+
+mdmon.man : mdmon.8
+	man -l mdmon.8 > mdmon.man
+
+md.man : md.4
+	man -l md.4 > md.man
+
+mdadm.conf.man : mdadm.conf.5
+	man -l mdadm.conf.5 > mdadm.conf.man
+
+mdassemble.man : mdassemble.8
+	man -l mdassemble.8 > mdassemble.man
+
+raid6check.man : raid6check.8
+	man -l raid6check.8 > raid6check.man
+
+$(OBJS) : $(INCL) mdmon.h
+$(MON_OBJS) : $(INCL) mdmon.h
+
+sha1.o : sha1.c sha1.h md5.h
+	$(CC) $(CFLAGS) -DHAVE_STDINT_H -o sha1.o -c sha1.c
+
+install : mdadm mdmon install-man install-udev
+	$(INSTALL) -D $(STRIP) -m 755 mdadm $(DESTDIR)$(BINDIR)/mdadm
+	$(INSTALL) -D $(STRIP) -m 755 mdmon $(DESTDIR)$(BINDIR)/mdmon
+
+install-static : mdadm.static install-man
+	$(INSTALL) -D $(STRIP) -m 755 mdadm.static $(DESTDIR)$(BINDIR)/mdadm
+
+install-tcc : mdadm.tcc install-man
+	$(INSTALL) -D $(STRIP) -m 755 mdadm.tcc $(DESTDIR)$(BINDIR)/mdadm
+
+install-uclibc : mdadm.uclibc install-man
+	$(INSTALL) -D $(STRIP) -m 755 mdadm.uclibc $(DESTDIR)$(BINDIR)/mdadm
+
+install-klibc : mdadm.klibc install-man
+	$(INSTALL) -D $(STRIP) -m 755 mdadm.klibc $(DESTDIR)$(BINDIR)/mdadm
+
+install-man: mdadm.8 md.4 mdadm.conf.5 mdmon.8
+	$(INSTALL) -D -m 644 mdadm.8 $(DESTDIR)$(MAN8DIR)/mdadm.8
+	$(INSTALL) -D -m 644 mdmon.8 $(DESTDIR)$(MAN8DIR)/mdmon.8
+	$(INSTALL) -D -m 644 md.4 $(DESTDIR)$(MAN4DIR)/md.4
+	$(INSTALL) -D -m 644 mdadm.conf.5 $(DESTDIR)$(MAN5DIR)/mdadm.conf.5
+
+install-udev:
udev-md-raid-arrays.rules udev-md-raid-assembly.rules + @for file in 63-md-raid-arrays.rules 64-md-raid-assembly.rules ; \ + do sed -e 's,BINDIR,$(BINDIR),g' udev-$${file#??-} > .install.tmp.1 && \ + $(ECHO) $(INSTALL) -D -m 644 udev-$${file#??-} $(DESTDIR)$(UDEVDIR)/rules.d/$$file ; \ + $(INSTALL) -D -m 644 .install.tmp.1 $(DESTDIR)$(UDEVDIR)/rules.d/$$file ; \ + rm -f .install.tmp.1; \ + done + +install-systemd: systemd/mdmon@.service + @for file in mdmon@.service mdmonitor.service mdadm-last-resort@.timer \ + mdadm-last-resort@.service mdadm-grow-continue@.service; \ + do sed -e 's,BINDIR,$(BINDIR),g' systemd/$$file > .install.tmp.2 && \ + $(ECHO) $(INSTALL) -D -m 644 systemd/$$file $(DESTDIR)$(SYSTEMD_DIR)/$$file ; \ + $(INSTALL) -D -m 644 .install.tmp.2 $(DESTDIR)$(SYSTEMD_DIR)/$$file ; \ + rm -f .install.tmp.2; \ + done + @for file in mdadm.shutdown ; \ + do sed -e 's,BINDIR,$(BINDIR),g' systemd/$$file > .install.tmp.3 && \ + $(ECHO) $(INSTALL) -D -m 755 systemd/$$file $(DESTDIR)$(SYSTEMD_DIR)-shutdown/$$file ; \ + $(INSTALL) -D -m 755 .install.tmp.3 $(DESTDIR)$(SYSTEMD_DIR)-shutdown/$$file ; \ + rm -f .install.tmp.3; \ + done + if [ -f /etc/SuSE-release -o -n "$(SUSE)" ] ;then $(INSTALL) -D -m 755 systemd/SUSE-mdadm_env.sh $(DESTDIR)$(SYSTEMD_DIR)/../scripts/mdadm_env.sh ;fi + +uninstall: + rm -f $(DESTDIR)$(MAN8DIR)/mdadm.8 $(DESTDIR)$(MAN8DIR)/mdmon.8 $(DESTDIR)$(MAN4DIR)/md.4 $(DESTDIR)$(MAN5DIR)/mdadm.conf.5 $(DESTDIR)$(BINDIR)/mdadm + +test: mdadm mdmon test_stripe swap_super raid6check + @echo "Please run './test' as root" + +clean : + rm -f mdadm mdmon $(OBJS) $(MON_OBJS) $(STATICOBJS) core *.man \ + mdadm.tcc mdadm.uclibc mdadm.static *.orig *.porig *.rej *.alt .merge_file_* \ + mdadm.Os mdadm.O2 mdmon.O2 \ + mdassemble mdassemble.static mdassemble.auto mdassemble.uclibc \ + mdassemble.klibc swap_super \ + init.cpio.gz mdadm.uclibc.static test_stripe raid6check raid6check.o mdmon \ + mdadm.8 + +dist : clean + ./makedist + +testdist : everything-test 
clean + ./makedist test + +TAGS : + etags *.h *.c + +DISTRO_MAKEFILE := $(wildcard distropkg/Makefile) +ifdef DISTRO_MAKEFILE +include $(DISTRO_MAKEFILE) +endif diff --git a/Manage.c b/Manage.c new file mode 100644 index 00000000..7e1b94be --- /dev/null +++ b/Manage.c @@ -0,0 +1,1786 @@ +/* + * mdadm - manage Linux "md" devices aka RAID arrays. + * + * Copyright (C) 2001-2013 Neil Brown + * + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Author: Neil Brown + * Email: + */ + +#include "mdadm.h" +#include "md_u.h" +#include "md_p.h" +#include + +#define REGISTER_DEV _IO (MD_MAJOR, 1) +#define START_MD _IO (MD_MAJOR, 2) +#define STOP_MD _IO (MD_MAJOR, 3) + +int Manage_ro(char *devname, int fd, int readonly) +{ + /* switch to readonly or rw + * + * requires >= 0.90.0 + * first check that array is runing + * use RESTART_ARRAY_RW or STOP_ARRAY_RO + * + */ + mdu_array_info_t array; +#ifndef MDASSEMBLE + struct mdinfo *mdi; +#endif + int rv = 0; + + if (md_get_version(fd) < 9000) { + pr_err("need md driver version 0.90.0 or later\n"); + return 1; + } +#ifndef MDASSEMBLE + /* If this is an externally-managed array, we need to modify the + * metadata_version so that mdmon doesn't undo our change. 
+ */ + mdi = sysfs_read(fd, NULL, GET_LEVEL|GET_VERSION); + if (mdi && + mdi->array.major_version == -1 && + is_subarray(mdi->text_version)) { + char vers[64]; + strcpy(vers, "external:"); + strcat(vers, mdi->text_version); + if (readonly > 0) { + int rv; + /* We set readonly ourselves. */ + vers[9] = '-'; + sysfs_set_str(mdi, NULL, "metadata_version", vers); + + close(fd); + rv = sysfs_set_str(mdi, NULL, "array_state", "readonly"); + + if (rv < 0) { + pr_err("failed to set readonly for %s: %s\n", + devname, strerror(errno)); + + vers[9] = mdi->text_version[0]; + sysfs_set_str(mdi, NULL, "metadata_version", vers); + rv = 1; + goto out; + } + } else { + char *cp; + /* We cannot set read/write - must signal mdmon */ + vers[9] = '/'; + sysfs_set_str(mdi, NULL, "metadata_version", vers); + + cp = strchr(vers+10, '/'); + if (cp) + *cp = 0; + ping_monitor(vers+10); + if (mdi->array.level <= 0) + sysfs_set_str(mdi, NULL, "array_state", "active"); + } + goto out; + } +#endif + if (ioctl(fd, GET_ARRAY_INFO, &array)) { + pr_err("%s does not appear to be active.\n", + devname); + rv = 1; + goto out; + } + + if (readonly > 0) { + if (ioctl(fd, STOP_ARRAY_RO, NULL)) { + pr_err("failed to set readonly for %s: %s\n", + devname, strerror(errno)); + rv = 1; + goto out; + } + } else if (readonly < 0) { + if (ioctl(fd, RESTART_ARRAY_RW, NULL)) { + pr_err("failed to set writable for %s: %s\n", + devname, strerror(errno)); + rv = 1; + goto out; + } + } +out: +#ifndef MDASSEMBLE + if (mdi) + sysfs_free(mdi); +#endif + return rv; +} + +#ifndef MDASSEMBLE + +static void remove_devices(char *devnm, char *path) +{ + /* + * Remove names at 'path' - possibly with + * partition suffixes - which link to the 'standard' + * name for devnm. These were probably created + * by mdadm when the array was assembled. 
+	 */
+	char base[40];
+	char *path2;
+	char link[1024];
+	int n;
+	int part;
+	char *be;
+	char *pe;
+
+	if (!path)
+		return;
+
+	sprintf(base, "/dev/%s", devnm); /* the link target to look for */
+	be = base + strlen(base); /* where a suffix is appended to base */
+
+	path2 = xmalloc(strlen(path)+20); /* room for 'path' plus a suffix */
+	strcpy(path2, path);
+	pe = path2 + strlen(path2); /* where a suffix is appended to path2 */
+
+	for (part = 0; part < 16; part++) { /* part 0 is the bare name,
+					     * 1..15 carry a suffix */
+		if (part) {
+			sprintf(be, "p%d", part);
+
+			if (isdigit(pe[-1])) /* name ends in a digit: "pN",
+					      * otherwise just "N" */
+				sprintf(pe, "p%d", part);
+			else
+				sprintf(pe, "%d", part);
+		}
+		n = readlink(path2, link, sizeof(link));
+		if (n > 0 && (int)strlen(base) == n &&
+		    strncmp(link, base, n) == 0) /* link text is exactly base */
+			unlink(path2);
+	}
+	free(path2);
+}
+
+int Manage_run(char *devname, int fd, struct context *c)
+{
+	/* Run the array. Array must already be configured
+	 * Requires >= 0.90.0
+	 *
+	 * devname - name of the array, used in messages only
+	 * fd      - open descriptor on the array
+	 * c       - run-time options, handed on to IncrementalScan()
+	 *
+	 * Returns 1 when the driver is too old or no device name can
+	 * be found for 'fd', otherwise the result of IncrementalScan().
+	 */
+	char nm[32], *nmp;
+
+	if (md_get_version(fd) < 9000) {
+		pr_err("need md driver version 0.90.0 or later\n");
+		return 1;
+	}
+	nmp = fd2devnm(fd);
+	if (!nmp) {
+		pr_err("Cannot find %s in sysfs!!\n", devname);
+		return 1;
+	}
+	strcpy(nm, nmp); /* take a private copy of the name */
+	return IncrementalScan(c, nm);
+}
+
+int Manage_stop(char *devname, int fd, int verbose, int will_retry)
+{
+	/* Stop the array. Array must already be configured
+	 * 'will_retry' means that error messages are not wanted.
+	 */
+	int rv = 0;
+	struct map_ent *map = NULL;
+	struct mdinfo *mdi;
+	char devnm[32];
+	char container[32];
+	int err;
+	int count;
+	char buf[32];
+	unsigned long long rd1, rd2;
+
+	if (will_retry && verbose == 0)
+		verbose = -1;
+
+	if (md_get_version(fd) < 9000) {
+		if (ioctl(fd, STOP_MD, 0) == 0)
+			return 0;
+		pr_err("stopping device %s failed: %s\n",
+		       devname, strerror(errno));
+		return 1;
+	}
+
+	strcpy(devnm, fd2devnm(fd));
+	/* Get EXCL access first. If this fails, then attempting
+	 * to stop is probably a bad idea.
+ */ + mdi = sysfs_read(fd, NULL, GET_LEVEL|GET_COMPONENT|GET_VERSION); + if (mdi && is_subarray(mdi->text_version)) { + char *sl; + strncpy(container, mdi->text_version+1, sizeof(container)); + container[sizeof(container)-1] = 0; + sl = strchr(container, '/'); + if (sl) + *sl = 0; + } else + container[0] = 0; + close(fd); + count = 5; + while (((fd = ((devname[0] == '/') + ?open(devname, O_RDONLY|O_EXCL) + :open_dev_flags(devnm, O_RDONLY|O_EXCL))) < 0 + || strcmp(fd2devnm(fd), devnm) != 0) + && container[0] + && mdmon_running(container) + && count) { + /* Can't open, so something might be wrong. However it + * is a container, so we might be racing with mdmon, so + * retry for a bit. + */ + if (fd >= 0) + close(fd); + flush_mdmon(container); + count--; + } + if (fd < 0 || strcmp(fd2devnm(fd), devnm) != 0) { + if (fd >= 0) + close(fd); + if (verbose >= 0) + pr_err("Cannot get exclusive access to %s:Perhaps a running process, mounted filesystem or active volume group?\n", + devname); + return 1; + } + /* If this is an mdmon managed array, just write 'inactive' + * to the array state and let mdmon clear up. + */ + if (mdi && + mdi->array.level > 0 && + is_subarray(mdi->text_version)) { + int err; + /* This is mdmon managed. */ + close(fd); + + /* As we had an O_EXCL open, any use of the device + * which blocks STOP_ARRAY is probably a transient use, + * so it is reasonable to retry for a while - 5 seconds. 
+ */ + count = 25; + while (count && + (err = sysfs_set_str(mdi, NULL, + "array_state", + "inactive")) < 0 + && errno == EBUSY) { + usleep(200000); + count--; + } + if (err) { + if (verbose >= 0) + pr_err("failed to stop array %s: %s\n", + devname, strerror(errno)); + rv = 1; + goto out; + } + + /* Give monitor a chance to act */ + ping_monitor(mdi->text_version); + + fd = open_dev_excl(devnm); + if (fd < 0) { + if (verbose >= 0) + pr_err("failed to completely stop %s: Device is busy\n", + devname); + rv = 1; + goto out; + } + } else if (mdi && + mdi->array.major_version == -1 && + mdi->array.minor_version == -2 && + !is_subarray(mdi->text_version)) { + struct mdstat_ent *mds, *m; + /* container, possibly mdmon-managed. + * Make sure mdmon isn't opening it, which + * would interfere with the 'stop' + */ + ping_monitor(mdi->sys_name); + + /* now check that there are no existing arrays + * which are members of this array + */ + mds = mdstat_read(0, 0); + for (m = mds; m; m = m->next) + if (m->metadata_version && + strncmp(m->metadata_version, "external:", 9)==0 && + metadata_container_matches(m->metadata_version+9, + devnm)) { + if (verbose >= 0) + pr_err("Cannot stop container %s: member %s still active\n", + devname, m->devnm); + free_mdstat(mds); + rv = 1; + goto out; + } + } + + /* If the array is undergoing a reshape which changes the number + * of devices, then it would be nice to stop it at a point where + * it has completed a full number of stripes in both old and + * new layouts as this will allow the reshape to be reverted. + * So if 'sync_action' is "reshape" and 'raid_disks' shows two + * different numbers, then + * - freeze reshape + * - set sync_max to next multiple of both data_disks and + * chunk sizes (or next but one) + * - unfreeze reshape + * - wait on 'sync_completed' for that point to be reached. 
+ */ + if (mdi && (mdi->array.level >= 4 && mdi->array.level <= 6) && + sysfs_attribute_available(mdi, NULL, "sync_action") && + sysfs_attribute_available(mdi, NULL, "reshape_direction") && + sysfs_get_str(mdi, NULL, "sync_action", buf, 20) > 0 && + strcmp(buf, "reshape\n") == 0 && + sysfs_get_two(mdi, NULL, "raid_disks", &rd1, &rd2) == 2) { + unsigned long long position, curr; + unsigned long long chunk1, chunk2; + unsigned long long rddiv, chunkdiv; + unsigned long long sectors; + unsigned long long sync_max, old_sync_max; + unsigned long long completed; + int backwards = 0; + int delay; + int scfd; + + delay = 40; + while (rd1 > rd2 && delay > 0 && + sysfs_get_ll(mdi, NULL, "sync_max", &old_sync_max) == 0) { + /* must be in the critical section - wait a bit */ + delay -= 1; + usleep(100000); + } + + if (sysfs_set_str(mdi, NULL, "sync_action", "frozen") != 0) + goto done; + /* Array is frozen */ + + rd1 -= mdi->array.level == 6 ? 2 : 1; + rd2 -= mdi->array.level == 6 ? 2 : 1; + sysfs_get_str(mdi, NULL, "reshape_direction", buf, sizeof(buf)); + if (strncmp(buf, "back", 4) == 0) + backwards = 1; + if (sysfs_get_ll(mdi, NULL, "reshape_position", &position) != 0) { + /* reshape must have finished now */ + sysfs_set_str(mdi, NULL, "sync_action", "idle"); + goto done; + } + sysfs_get_two(mdi, NULL, "chunk_size", &chunk1, &chunk2); + chunk1 /= 512; + chunk2 /= 512; + rddiv = GCD(rd1, rd2); + chunkdiv = GCD(chunk1, chunk2); + sectors = (chunk1/chunkdiv) * chunk2 * (rd1/rddiv) * rd2; + + if (backwards) { + /* Need to subtract 'reshape_position' from + * array size to get equivalent of sync_max. + * Size calculation based on raid5_size in kernel. 
+ */ + unsigned long long size = mdi->component_size; + size &= ~(chunk1-1); + size &= ~(chunk2-1); + /* rd1 must be smaller */ + /* Reshape may have progressed further backwards than + * recorded, so target even further back (hence "-1") + */ + position = (position / sectors - 1) * sectors; + /* rd1 is always the conversion factor between 'sync' + * position and 'reshape' position. + * We read 1 "new" stripe worth of data from where-ever, + * and when write out that full stripe. + */ + sync_max = size - position/rd1; + } else { + /* Reshape will very likely be beyond position, and it may + * be too late to stop at '+1', so aim for '+2' + */ + position = (position / sectors + 2) * sectors; + sync_max = position/rd1; + } + if (sysfs_get_ll(mdi, NULL, "sync_max", &old_sync_max) < 0) + old_sync_max = mdi->component_size; + /* Must not advance sync_max as that could confuse + * the reshape monitor */ + if (sync_max < old_sync_max) + sysfs_set_num(mdi, NULL, "sync_max", sync_max); + sysfs_set_str(mdi, NULL, "sync_action", "idle"); + + /* That should have set things going again. Now we + * wait a little while (3 second max) for sync_completed + * to reach the target. + * The reshape process can block for 500msec if + * the sync speed limit is hit, so we need to wait + * a lot longer than that. 1 second is usually + * enough. 3 is safe. + */ + delay = 3000; + scfd = sysfs_open(mdi->sys_name, NULL, "sync_completed"); + while (scfd >= 0 && delay > 0 && old_sync_max > 0) { + unsigned long long max_completed; + sysfs_get_ll(mdi, NULL, "reshape_position", &curr); + sysfs_fd_get_str(scfd, buf, sizeof(buf)); + if (strncmp(buf, "none", 4) == 0) { + /* Either reshape has aborted, or hasn't + * quite started yet. Wait a bit and + * check 'sync_action' to see. 
+ */ + usleep(10000); + sysfs_get_str(mdi, NULL, "sync_action", buf, sizeof(buf)); + if (strncmp(buf, "reshape", 7) != 0) + break; + } + + if (sysfs_fd_get_two(scfd, &completed, + &max_completed) == 2 && + /* 'completed' sometimes reads as max-uulong */ + completed < max_completed && + (completed > sync_max || + (completed == sync_max && curr != position))) { + while (completed > sync_max) { + sync_max += sectors / rd1; + if (backwards) + position -= sectors; + else + position += sectors; + } + if (sync_max < old_sync_max) + sysfs_set_num(mdi, NULL, "sync_max", sync_max); + } + + if (!backwards && curr >= position) + break; + if (backwards && curr <= position) + break; + sysfs_wait(scfd, &delay); + } + if (scfd >= 0) + close(scfd); + + } +done: + + /* As we have an O_EXCL open, any use of the device + * which blocks STOP_ARRAY is probably a transient use, + * so it is reasonable to retry for a while - 5 seconds. + */ + count = 25; err = 0; + while (count && fd >= 0 + && (err = ioctl(fd, STOP_ARRAY, NULL)) < 0 + && errno == EBUSY) { + usleep(200000); + count --; + } + if (fd >= 0 && err) { + if (verbose >= 0) { + pr_err("failed to stop array %s: %s\n", + devname, strerror(errno)); + if (errno == EBUSY) + cont_err("Perhaps a running process, mounted filesystem or active volume group?\n"); + } + rv = 1; + goto out; + } + /* prior to 2.6.28, KOBJ_CHANGE was not sent when an md array + * was stopped, so We'll do it here just to be sure. Drop any + * partitions as well... + */ + if (fd >= 0) + ioctl(fd, BLKRRPART, 0); + if (mdi) + sysfs_uevent(mdi, "change"); + + if (devnm[0] && use_udev()) { + struct map_ent *mp = map_by_devnm(&map, devnm); + remove_devices(devnm, mp ? 
mp->path : NULL);
+	}
+
+	if (verbose >= 0)
+		pr_err("stopped %s\n", devname);
+	map_lock(&map);
+	map_remove(&map, devnm);
+	map_unlock(&map);
+out:
+	if (mdi)
+		sysfs_free(mdi);
+
+	return rv;
+}
+
+/* Create a zero-filled list entry for device 'name' carrying disposition
+ * 'disp', splice it into the list directly after 'dv', and return it so
+ * that callers can keep appending after the entry they just added.
+ */
+static struct mddev_dev *add_one(struct mddev_dev *dv, char *name, char disp)
+{
+	struct mddev_dev *entry = xmalloc(sizeof(*entry));
+
+	memset(entry, 0, sizeof(*entry));
+	entry->disposition = disp;
+	entry->devname = xstrdup(name);
+	entry->next = dv->next;
+	dv->next = entry;
+	return entry;
+}
+
+/* Walk the disk slots of the array open on 'fd' and, for every populated
+ * slot whose state has bit 0 set (faulty), queue a "major:minor" entry
+ * with disposition 'disp' after 'dv'.
+ * Does nothing if GET_ARRAY_INFO fails.
+ */
+static void add_faulty(struct mddev_dev *dv, int fd, char disp)
+{
+	mdu_array_info_t array;
+	mdu_disk_info_t disk;
+	int pending;
+	int slot;
+
+	if (ioctl(fd, GET_ARRAY_INFO, &array) != 0)
+		return;
+
+	/* Stop scanning as soon as every disk the array reports was seen */
+	pending = array.nr_disks;
+	for (slot = 0; slot < MAX_DISKS && pending > 0; slot++) {
+		char id[40];
+
+		disk.number = slot;
+		if (ioctl(fd, GET_DISK_INFO, &disk) != 0)
+			continue;
+		if (disk.major == 0 && disk.minor == 0)
+			continue; /* empty slot */
+		pending--;
+		if ((disk.state & 1) != 0) { /* faulty */
+			sprintf(id, "%d:%d", disk.major, disk.minor);
+			dv = add_one(dv, id, disp);
+		}
+	}
+}
+
+/* Walk the disk slots of the array open on 'fd' and queue, after 'dv',
+ * a "major:minor" entry with disposition 'disp' for every populated slot
+ * whose device can no longer be opened because it is gone (ENXIO).
+ * When 'disp' is 'f', slots that are already faulty are skipped.
+ * Does nothing if GET_ARRAY_INFO fails.
+ */
+static void add_detached(struct mddev_dev *dv, int fd, char disp)
+{
+	mdu_array_info_t array;
+	mdu_disk_info_t disk;
+	int pending;
+	int slot;
+
+	if (ioctl(fd, GET_ARRAY_INFO, &array) != 0)
+		return;
+
+	pending = array.nr_disks;
+	for (slot = 0; slot < MAX_DISKS && pending > 0; slot++) {
+		char id[40];
+		int probe;
+
+		disk.number = slot;
+		if (ioctl(fd, GET_DISK_INFO, &disk) != 0)
+			continue;
+		if (disk.major == 0 && disk.minor == 0)
+			continue; /* empty slot */
+		pending--;
+		if (disp == 'f' && (disk.state & 1) != 0)
+			continue; /* already faulty */
+
+		sprintf(id, "%d:%d", disk.major, disk.minor);
+		probe = dev_open(id, O_RDONLY);
+		if (probe >= 0)
+			/* Still reachable, so not detached */
+			close(probe);
+		else if (errno == ENXIO)
+			dv = add_one(dv, id, disp);
+		/* any other open error: probably not detached */
+	}
+}
+
+static void add_set(struct mddev_dev *dv, int fd, char set_char)
+{
+	
mdu_array_info_t array; + mdu_disk_info_t disk; + int remaining_disks; + int copies, set; + int i; + + if (ioctl(fd, GET_ARRAY_INFO, &array) != 0) + return; + if (array.level != 10) + return; + copies = ((array.layout & 0xff) * + ((array.layout >> 8) & 0xff)); + if (array.raid_disks % copies) + return; + + remaining_disks = array.nr_disks; + for (i = 0; i < MAX_DISKS && remaining_disks > 0; i++) { + char buf[40]; + disk.number = i; + if (ioctl(fd, GET_DISK_INFO, &disk) != 0) + continue; + if (disk.major == 0 && disk.minor == 0) + continue; + remaining_disks--; + set = disk.raid_disk % copies; + if (set_char != set + 'A') + continue; + sprintf(buf, "%d:%d", disk.major, disk.minor); + dv = add_one(dv, buf, dv->disposition); + } +} + +int attempt_re_add(int fd, int tfd, struct mddev_dev *dv, + struct supertype *dev_st, struct supertype *tst, + unsigned long rdev, + char *update, char *devname, int verbose, + mdu_array_info_t *array) +{ + struct mdinfo mdi; + int duuid[4]; + int ouuid[4]; + + dev_st->ss->getinfo_super(dev_st, &mdi, NULL); + dev_st->ss->uuid_from_super(dev_st, ouuid); + if (tst->sb) + tst->ss->uuid_from_super(tst, duuid); + else + /* Assume uuid matches: kernel will check */ + memcpy(duuid, ouuid, sizeof(ouuid)); + if ((mdi.disk.state & (1<major_version == 1 && + get_linux_version() <= 2006018) + goto skip_re_add; + disc.number = mdi.disk.number; + if (ioctl(fd, GET_DISK_INFO, &disc) != 0 + || disc.major != 0 || disc.minor != 0 + ) + goto skip_re_add; + disc.major = major(rdev); + disc.minor = minor(rdev); + disc.number = mdi.disk.number; + disc.raid_disk = mdi.disk.raid_disk; + disc.state = mdi.disk.state; + if (array->state & (1 << MD_SB_CLUSTERED)) { + /* extra flags are needed when adding to a cluster as + * there are two cases to distinguish + */ + if (dv->disposition == 'c') + disc.state |= (1 << MD_DISK_CANDIDATE); + else + disc.state |= (1 << MD_DISK_CLUSTER_ADD); + } + if (dv->writemostly == 1) + disc.state |= 1 << MD_DISK_WRITEMOSTLY; + if 
(dv->writemostly == 2) + disc.state &= ~(1 << MD_DISK_WRITEMOSTLY); + remove_partitions(tfd); + if (update || dv->writemostly > 0) { + int rv = -1; + tfd = dev_open(dv->devname, O_RDWR); + if (tfd < 0) { + pr_err("failed to open %s for superblock update during re-add\n", dv->devname); + return -1; + } + + if (dv->writemostly == 1) + rv = dev_st->ss->update_super( + dev_st, NULL, "writemostly", + devname, verbose, 0, NULL); + if (dv->writemostly == 2) + rv = dev_st->ss->update_super( + dev_st, NULL, "readwrite", + devname, verbose, 0, NULL); + if (update) + rv = dev_st->ss->update_super( + dev_st, NULL, update, + devname, verbose, 0, NULL); + if (rv == 0) + rv = dev_st->ss->store_super(dev_st, tfd); + close(tfd); + if (rv != 0) { + pr_err("failed to update superblock during re-add\n"); + return -1; + } + } + /* don't even try if disk is marked as faulty */ + errno = 0; + if (ioctl(fd, ADD_NEW_DISK, &disc) == 0) { + if (verbose >= 0) + pr_err("re-added %s\n", dv->devname); + return 1; + } + if (errno == ENOMEM || errno == EROFS) { + pr_err("add new device failed for %s: %s\n", + dv->devname, strerror(errno)); + if (dv->disposition == 'M') + return 0; + return -1; + } + } +skip_re_add: + return 0; +} + +int Manage_add(int fd, int tfd, struct mddev_dev *dv, + struct supertype *tst, mdu_array_info_t *array, + int force, int verbose, char *devname, + char *update, unsigned long rdev, unsigned long long array_size, + int raid_slot) +{ + unsigned long long ldsize; + struct supertype *dev_st = NULL; + int j; + mdu_disk_info_t disc; + + if (!get_dev_size(tfd, dv->devname, &ldsize)) { + if (dv->disposition == 'M') + return 0; + else + return -1; + } + + if (tst->ss == &super0 && ldsize > 4ULL*1024*1024*1024*1024) { + /* More than 4TB is wasted on v0.90 */ + if (!force) { + pr_err("%s is larger than %s can effectively use.\n" + " Add --force is you really want to add this device.\n", + dv->devname, devname); + return -1; + } + pr_err("%s is larger than %s can effectively 
use.\n" + " Adding anyway as --force was given.\n", + dv->devname, devname); + } + if (!tst->ss->external && + array->major_version == 0 && + md_get_version(fd)%100 < 2) { + if (ioctl(fd, HOT_ADD_DISK, rdev)==0) { + if (verbose >= 0) + pr_err("hot added %s\n", + dv->devname); + return 1; + } + + pr_err("hot add failed for %s: %s\n", + dv->devname, strerror(errno)); + return -1; + } + + if (array->not_persistent == 0 || tst->ss->external) { + + /* need to find a sample superblock to copy, and + * a spare slot to use. + * For 'external' array (well, container based), + * We can just load the metadata for the array-> + */ + int array_failed; + if (tst->sb) + /* already loaded */; + else if (tst->ss->external) { + tst->ss->load_container(tst, fd, NULL); + } else for (j = 0; j < tst->max_devs; j++) { + char *dev; + int dfd; + disc.number = j; + if (ioctl(fd, GET_DISK_INFO, &disc)) + continue; + if (disc.major==0 && disc.minor==0) + continue; + if ((disc.state & 4)==0) /* sync */ + continue; + /* Looks like a good device to try */ + dev = map_dev(disc.major, disc.minor, 1); + if (!dev) + continue; + dfd = dev_open(dev, O_RDONLY); + if (dfd < 0) + continue; + if (tst->ss->load_super(tst, dfd, + NULL)) { + close(dfd); + continue; + } + close(dfd); + break; + } + /* FIXME this is a bad test to be using */ + if (!tst->sb && (dv->disposition != 'a' + && dv->disposition != 'S')) { + /* we are re-adding a device to a + * completely dead array - have to depend + * on kernel to check + */ + } else if (!tst->sb) { + pr_err("cannot load array metadata from %s\n", devname); + return -1; + } + + /* Make sure device is large enough */ + if (dv->disposition != 'j' && /* skip size check for Journal */ + tst->sb && + tst->ss->avail_size(tst, ldsize/512, INVALID_SECTORS) < + array_size) { + if (dv->disposition == 'M') + return 0; + pr_err("%s not large enough to join array\n", + dv->devname); + return -1; + } + + /* Possibly this device was recently part of + * the array and was 
temporarily removed, and + * is now being re-added. If so, we can + * simply re-add it. + */ + + if (array->not_persistent==0) { + dev_st = dup_super(tst); + dev_st->ss->load_super(dev_st, tfd, NULL); + } + if (dev_st && dev_st->sb && dv->disposition != 'S') { + int rv = attempt_re_add(fd, tfd, dv, + dev_st, tst, + rdev, + update, devname, + verbose, + array); + dev_st->ss->free_super(dev_st); + if (rv) + return rv; + } + if (dv->disposition == 'M') { + if (verbose > 0) + pr_err("--re-add for %s to %s is not possible\n", + dv->devname, devname); + return 0; + } + if (dv->disposition == 'A') { + pr_err("--re-add for %s to %s is not possible\n", + dv->devname, devname); + return -1; + } + if (array->active_disks < array->raid_disks) { + char *avail = xcalloc(array->raid_disks, 1); + int d; + int found = 0; + + for (d = 0; d < MAX_DISKS && found < array->nr_disks; d++) { + disc.number = d; + if (ioctl(fd, GET_DISK_INFO, &disc)) + continue; + if (disc.major == 0 && disc.minor == 0) + continue; + found++; + if (!(disc.state & (1<level, array->raid_disks, + array->layout, 1, avail); + free(avail); + } else + array_failed = 0; + if (array_failed) { + pr_err("%s has failed so using --add cannot work and might destroy\n", + devname); + pr_err("data on %s. You should stop the array and re-assemble it.\n", + dv->devname); + return -1; + } + } else { + /* non-persistent. Must ensure that new drive + * is at least array->size big. + */ + if (ldsize/512 < array_size) { + pr_err("%s not large enough to join array\n", + dv->devname); + return -1; + } + } + /* committed to really trying this device now*/ + remove_partitions(tfd); + + /* in 2.6.17 and earlier, version-1 superblocks won't + * use the number we write, but will choose a free number. 
+ * we must choose the same free number, which requires + * starting at 'raid_disks' and counting up + */ + for (j = array->raid_disks; j < tst->max_devs; j++) { + disc.number = j; + if (ioctl(fd, GET_DISK_INFO, &disc)) + break; + if (disc.major==0 && disc.minor==0) + break; + if (disc.state & 8) /* removed */ + break; + } + disc.major = major(rdev); + disc.minor = minor(rdev); + if (raid_slot < 0) + disc.number = j; + else + disc.number = raid_slot; + disc.state = 0; + + /* only add journal to array that supports journaling */ + if (dv->disposition == 'j') { + struct mdinfo mdi; + struct mdinfo *mdp; + + mdp = sysfs_read(fd, NULL, GET_ARRAY_STATE); + + if (strncmp(mdp->sysfs_array_state, "readonly", 8) != 0) { + pr_err("%s is not readonly, cannot add journal.\n", devname); + return -1; + } + + tst->ss->getinfo_super(tst, &mdi, NULL); + if (mdi.journal_device_required == 0) { + pr_err("%s does not support journal device.\n", devname); + return -1; + } + disc.raid_disk = 0; + } + + if (array->not_persistent==0) { + int dfd; + if (dv->disposition == 'j') + disc.state |= (1 << MD_DISK_JOURNAL) | (1 << MD_DISK_SYNC); + if (dv->writemostly == 1) + disc.state |= 1 << MD_DISK_WRITEMOSTLY; + dfd = dev_open(dv->devname, O_RDWR | O_EXCL|O_DIRECT); + if (tst->ss->add_to_super(tst, &disc, dfd, + dv->devname, INVALID_SECTORS)) + return -1; + if (tst->ss->write_init_super(tst)) + return -1; + } else if (dv->disposition == 'A') { + /* this had better be raid1. + * As we are "--re-add"ing we must find a spare slot + * to fill. 
+ */ + char *used = xcalloc(array->raid_disks, 1); + for (j = 0; j < tst->max_devs; j++) { + mdu_disk_info_t disc2; + disc2.number = j; + if (ioctl(fd, GET_DISK_INFO, &disc2)) + continue; + if (disc2.major==0 && disc2.minor==0) + continue; + if (disc2.state & 8) /* removed */ + continue; + if (disc2.raid_disk < 0) + continue; + if (disc2.raid_disk > array->raid_disks) + continue; + used[disc2.raid_disk] = 1; + } + for (j = 0 ; j < array->raid_disks; j++) + if (!used[j]) { + disc.raid_disk = j; + disc.state |= (1<state & (1 << MD_SB_CLUSTERED)) { + if (dv->disposition == 'c') + disc.state |= (1 << MD_DISK_CANDIDATE); + else + disc.state |= (1 << MD_DISK_CLUSTER_ADD); + } + + if (dv->writemostly == 1) + disc.state |= (1 << MD_DISK_WRITEMOSTLY); + if (tst->ss->external) { + /* add a disk + * to an external metadata container */ + struct mdinfo new_mdi; + struct mdinfo *sra; + int container_fd; + char devnm[32]; + int dfd; + + strcpy(devnm, fd2devnm(fd)); + + container_fd = open_dev_excl(devnm); + if (container_fd < 0) { + pr_err("add failed for %s: could not get exclusive access to container\n", + dv->devname); + tst->ss->free_super(tst); + return -1; + } + + Kill(dv->devname, NULL, 0, -1, 0); + dfd = dev_open(dv->devname, O_RDWR | O_EXCL|O_DIRECT); + if (mdmon_running(tst->container_devnm)) + tst->update_tail = &tst->updates; + if (tst->ss->add_to_super(tst, &disc, dfd, + dv->devname, INVALID_SECTORS)) { + close(dfd); + close(container_fd); + return -1; + } + if (tst->update_tail) + flush_metadata_updates(tst); + else + tst->ss->sync_metadata(tst); + + sra = sysfs_read(container_fd, NULL, 0); + if (!sra) { + pr_err("add failed for %s: sysfs_read failed\n", + dv->devname); + close(container_fd); + tst->ss->free_super(tst); + return -1; + } + sra->array.level = LEVEL_CONTAINER; + /* Need to set data_offset and component_size */ + tst->ss->getinfo_super(tst, &new_mdi, NULL); + new_mdi.disk.major = disc.major; + new_mdi.disk.minor = disc.minor; + new_mdi.recovery_start = 
0; + /* Make sure fds are closed as they are O_EXCL which + * would block add_disk */ + tst->ss->free_super(tst); + if (sysfs_add_disk(sra, &new_mdi, 0) != 0) { + pr_err("add new device to external metadata failed for %s\n", dv->devname); + close(container_fd); + sysfs_free(sra); + return -1; + } + ping_monitor(devnm); + sysfs_free(sra); + close(container_fd); + } else { + tst->ss->free_super(tst); + if (ioctl(fd, ADD_NEW_DISK, &disc)) { + if (dv->disposition == 'j') + pr_err("Failed to hot add %s as journal, " + "please try restart %s.\n", dv->devname, devname); + else + pr_err("add new device failed for %s as %d: %s\n", + dv->devname, j, strerror(errno)); + return -1; + } + if (dv->disposition == 'j') { + pr_err("Journal added successfully, making %s read-write\n", devname); + if (Manage_ro(devname, fd, -1)) + pr_err("Failed to make %s read-write\n", devname); + } + + } + if (verbose >= 0) + pr_err("added %s\n", dv->devname); + return 1; +} + +int Manage_remove(struct supertype *tst, int fd, struct mddev_dev *dv, + int sysfd, unsigned long rdev, int verbose, char *devname) +{ + int lfd = -1; + int err; + + if (tst->ss->external) { + /* To remove a device from a container, we must + * check that it isn't in use in an array. + * This involves looking in the 'holders' + * directory - there must be just one entry, + * the container. + * To ensure that it doesn't get used as a + * hot spare while we are checking, we + * get an O_EXCL open on the container + */ + int ret; + char devnm[32]; + strcpy(devnm, fd2devnm(fd)); + lfd = open_dev_excl(devnm); + if (lfd < 0) { + pr_err("Cannot get exclusive access to container - odd\n"); + return -1; + } + /* We may not be able to check on holders in + * sysfs, either because we don't have the dev num + * (rdev == 0) or because the device has been detached + * and the 'holders' directory no longer exists + * (ret == -1). In that case, assume it is OK to + * remove. 
+ */ + if (rdev == 0) + ret = -1; + else + ret = sysfs_unique_holder(devnm, rdev); + if (ret == 0) { + pr_err("%s is not a member, cannot remove.\n", + dv->devname); + close(lfd); + return -1; + } + if (ret >= 2) { + pr_err("%s is still in use, cannot remove.\n", + dv->devname); + close(lfd); + return -1; + } + } + /* FIXME check that it is a current member */ + if (sysfd >= 0) { + /* device has been removed and we don't know + * the major:minor number + */ + int n = write(sysfd, "remove", 6); + if (n != 6) + err = -1; + else + err = 0; + } else { + err = ioctl(fd, HOT_REMOVE_DISK, rdev); + if (err && errno == ENODEV) { + /* Old kernels rejected this if no personality + * is registered */ + struct mdinfo *sra = sysfs_read(fd, NULL, GET_DEVS); + struct mdinfo *dv = NULL; + if (sra) + dv = sra->devs; + for ( ; dv ; dv=dv->next) + if (dv->disk.major == (int)major(rdev) && + dv->disk.minor == (int)minor(rdev)) + break; + if (dv) + err = sysfs_set_str(sra, dv, + "state", "remove"); + else + err = -1; + if (sra) + sysfs_free(sra); + } + } + if (err) { + pr_err("hot remove failed for %s: %s\n", dv->devname, + strerror(errno)); + if (lfd >= 0) + close(lfd); + return -1; + } + if (tst->ss->external) { + /* + * Before dropping our exclusive open we make an + * attempt at preventing mdmon from seeing an + * 'add' event before reconciling this 'remove' + * event. 
+		 */
+		char *devnm = fd2devnm(fd);
+
+		if (!devnm) {
+			pr_err("unable to get container name\n");
+			/* Do not leak the exclusive open of the container */
+			if (lfd >= 0)
+				close(lfd);
+			return -1;
+		}
+
+		ping_manager(devnm);
+	}
+	if (lfd >= 0)
+		close(lfd);
+	if (verbose >= 0)
+		pr_err("hot removed %s from %s\n",
+		       dv->devname, devname);
+	return 1;
+}
+
+/* Mark the member 'rdev' of the array open on 'fd' as wanting replacement
+ * by writing "want_replacement" to its sysfs 'state'.
+ *
+ * tst     - metadata handler of the array; external metadata is rejected.
+ * dv      - list entry naming the device (used for messages).  The first
+ *           entry at or after it with disposition 'W' (--with) is changed
+ *           to 'w' and its 'used' field records the raid disk number, so
+ *           that Manage_with() knows which slot to target.
+ * devname - name of the array, used for messages only.
+ *
+ * Returns 1 when the device was marked, -1 on any failure.
+ */
+int Manage_replace(struct supertype *tst, int fd, struct mddev_dev *dv,
+		   unsigned long rdev, int verbose, char *devname)
+{
+	struct mdinfo *mdi, *di;
+	if (tst->ss->external) {
+		pr_err("--replace only supported for native metadata (0.90 or 1.x)\n");
+		return -1;
+	}
+	/* Need to find the device in sysfs and add 'want_replacement' to the
+	 * status.
+	 */
+	mdi = sysfs_read(fd, NULL, GET_DEVS);
+	if (!mdi || !mdi->devs) {
+		pr_err("Cannot find status of %s to enable replacement - strange\n",
+		       devname);
+		/* sysfs_read may have succeeded but found no devices */
+		if (mdi)
+			sysfs_free(mdi);
+		return -1;
+	}
+	for (di = mdi->devs; di; di = di->next)
+		if (di->disk.major == (int)major(rdev) &&
+		    di->disk.minor == (int)minor(rdev))
+			break;
+	if (di) {
+		int rv;
+		if (di->disk.raid_disk < 0) {
+			pr_err("%s is not active and so cannot be replaced.\n",
+			       dv->devname);
+			sysfs_free(mdi);
+			return -1;
+		}
+		rv = sysfs_set_str(mdi, di,
+				   "state", "want_replacement");
+		if (rv) {
+			sysfs_free(mdi);
+			pr_err("Failed to request replacement for %s\n",
+			       dv->devname);
+			return -1;
+		}
+		if (verbose >= 0)
+			pr_err("Marked %s (device %d in %s) for replacement\n",
+			       dv->devname, di->disk.raid_disk, devname);
+		/* If there is a matching 'with', we need to tell it which
+		 * raid disk
+		 */
+		while (dv && dv->disposition != 'W')
+			dv = dv->next;
+		if (dv) {
+			dv->disposition = 'w';
+			dv->used = di->disk.raid_disk;
+		}
+		/* 'di' points into 'mdi', so only free once it has been read */
+		sysfs_free(mdi);
+		return 1;
+	}
+	sysfs_free(mdi);
+	pr_err("%s not found in %s so cannot --replace it\n",
+	       dv->devname, devname);
+	return -1;
+}
+
+int Manage_with(struct supertype *tst, int fd, struct mddev_dev *dv,
+		unsigned long rdev, int verbose, char *devname)
+{
+	struct mdinfo *mdi, *di;
+	/* try to set 'slot' for 'rdev' in 'fd' to 'dv->used' */
+	mdi = sysfs_read(fd, NULL, GET_DEVS|GET_STATE);
+ if (!mdi || !mdi->devs) { + pr_err("Cannot find status of %s to enable replacement - strange\n", + devname); + return -1; + } + for (di = mdi->devs; di; di = di->next) + if (di->disk.major == (int)major(rdev) && + di->disk.minor == (int)minor(rdev)) + break; + if (di) { + int rv; + if (di->disk.state & (1<devname); + sysfs_free(mdi); + return -1; + } + if (di->disk.raid_disk >= 0) { + pr_err("%s is active and cannot be a replacement\n", + dv->devname); + sysfs_free(mdi); + return -1; + } + rv = sysfs_set_num(mdi, di, + "slot", dv->used); + if (rv) { + sysfs_free(mdi); + pr_err("Failed to set %s as preferred replacement.\n", + dv->devname); + return -1; + } + if (verbose >= 0) + pr_err("Marked %s in %s as replacement for device %d\n", + dv->devname, devname, dv->used); + return 1; + } + sysfs_free(mdi); + pr_err("%s not found in %s so cannot make it preferred replacement\n", + dv->devname, devname); + return -1; +} + +int Manage_subdevs(char *devname, int fd, + struct mddev_dev *devlist, int verbose, int test, + char *update, int force) +{ + /* Do something to each dev. + * devmode can be + * 'a' - add the device + * try HOT_ADD_DISK + * If that fails EINVAL, try ADD_NEW_DISK + * 'S' - add the device as a spare - don't try re-add + * 'j' - add the device as a journal device + * 'A' - re-add the device + * 'r' - remove the device: HOT_REMOVE_DISK + * device can be 'faulty' or 'detached' in which case all + * matching devices are removed. + * 'f' - set the device faulty SET_DISK_FAULTY + * device can be 'detached' in which case any device that + * is inaccessible will be marked faulty. + * 'R' - mark this device as wanting replacement. + * 'W' - this device is added if necessary and activated as + * a replacement for a previous 'R' device. + * ----- + * 'w' - 'W' will be changed to 'w' when it is paired with + * a 'R' device. If a 'W' is found while walking the list + * it must be unpaired, and is an error. + * 'M' - this is created by a 'missing' target. 
It is a slight + * variant on 'A' + * 'F' - Another variant of 'A', where the device was faulty + * so must be removed from the array first. + * 'c' - confirm the device as found (for clustered environments) + * + * For 'f' and 'r', the device can also be a kernel-internal + * name such as 'sdb'. + */ + mdu_array_info_t array; + unsigned long long array_size; + struct mddev_dev *dv; + int tfd = -1; + struct supertype *tst; + char *subarray = NULL; + int sysfd = -1; + int count = 0; /* number of actions taken */ + struct mdinfo info; + struct mdinfo devinfo; + int frozen = 0; + int busy = 0; + int raid_slot = -1; + + if (ioctl(fd, GET_ARRAY_INFO, &array)) { + pr_err("Cannot get array info for %s\n", + devname); + goto abort; + } + sysfs_init(&info, fd, NULL); + + /* array.size is only 32 bits and may be truncated. + * So read from sysfs if possible, and record number of sectors + */ + + array_size = get_component_size(fd); + if (array_size <= 0) + array_size = array.size * 2; + + tst = super_by_fd(fd, &subarray); + if (!tst) { + pr_err("unsupport array - version %d.%d\n", + array.major_version, array.minor_version); + goto abort; + } + + for (dv = devlist; dv; dv = dv->next) { + unsigned long rdev = 0; /* device to add/remove etc */ + int rv; + int mj,mn; + + raid_slot = -1; + if (dv->disposition == 'c') { + rv = parse_cluster_confirm_arg(dv->devname, + &dv->devname, + &raid_slot); + if (rv) { + pr_err("Could not get the devname of cluster\n"); + goto abort; + } + } + + if (strcmp(dv->devname, "failed") == 0 || + strcmp(dv->devname, "faulty") == 0) { + if (dv->disposition != 'A' + && dv->disposition != 'r') { + pr_err("%s only meaningful with -r or --re-add, not -%c\n", + dv->devname, dv->disposition); + goto abort; + } + add_faulty(dv, fd, (dv->disposition == 'A' + ? 
'F' : 'r')); + continue; + } + if (strcmp(dv->devname, "detached") == 0) { + if (dv->disposition != 'r' && dv->disposition != 'f') { + pr_err("%s only meaningful with -r of -f, not -%c\n", + dv->devname, dv->disposition); + goto abort; + } + add_detached(dv, fd, dv->disposition); + continue; + } + + if (strcmp(dv->devname, "missing") == 0) { + struct mddev_dev *add_devlist = NULL; + struct mddev_dev **dp; + if (dv->disposition == 'c') { + rv = ioctl(fd, CLUSTERED_DISK_NACK, NULL); + break; + } + + if (dv->disposition != 'A') { + pr_err("'missing' only meaningful with --re-add\n"); + goto abort; + } + add_devlist = conf_get_devs(); + if (add_devlist == NULL) { + pr_err("no devices to scan for missing members."); + continue; + } + for (dp = &add_devlist; *dp; dp = & (*dp)->next) + /* 'M' (for 'missing') is like 'A' without errors */ + (*dp)->disposition = 'M'; + *dp = dv->next; + dv->next = add_devlist; + continue; + } + + if (strncmp(dv->devname, "set-", 4) == 0 && + strlen(dv->devname) == 5) { + int copies; + + if (dv->disposition != 'r' && + dv->disposition != 'f') { + pr_err("'%s' only meaningful with -r or -f\n", + dv->devname); + goto abort; + } + if (array.level != 10) { + pr_err("'%s' only meaningful with RAID10 arrays\n", + dv->devname); + goto abort; + } + copies = ((array.layout & 0xff) * + ((array.layout >> 8) & 0xff)); + if (array.raid_disks % copies != 0 || + dv->devname[4] < 'A' || + dv->devname[4] >= 'A' + copies || + copies > 26) { + pr_err("'%s' not meaningful with this array\n", + dv->devname); + goto abort; + } + add_set(dv, fd, dv->devname[4]); + continue; + } + + if (strchr(dv->devname, '/') == NULL && + strchr(dv->devname, ':') == NULL && + strlen(dv->devname) < 50) { + /* Assume this is a kernel-internal name like 'sda1' */ + int found = 0; + char dname[55]; + if (dv->disposition != 'r' && dv->disposition != 'f') { + pr_err("%s only meaningful with -r or -f, not -%c\n", + dv->devname, dv->disposition); + goto abort; + } + + sprintf(dname, 
"dev-%s", dv->devname); + sysfd = sysfs_open(fd2devnm(fd), dname, "block/dev"); + if (sysfd >= 0) { + char dn[20]; + if (sysfs_fd_get_str(sysfd, dn, 20) > 0 && + sscanf(dn, "%d:%d", &mj,&mn) == 2) { + rdev = makedev(mj,mn); + found = 1; + } + close(sysfd); + sysfd = -1; + } + if (!found) { + sysfd = sysfs_open(fd2devnm(fd), dname, "state"); + if (sysfd < 0) { + pr_err("%s does not appear to be a component of %s\n", + dv->devname, devname); + goto abort; + } + } + } else if ((dv->disposition == 'r' || dv->disposition == 'f') + && get_maj_min(dv->devname, &mj, &mn)) { + /* for 'fail' and 'remove', the device might + * not exist. + */ + rdev = makedev(mj, mn); + } else { + struct stat stb; + tfd = dev_open(dv->devname, O_RDONLY); + if (tfd >= 0) + fstat(tfd, &stb); + else { + int open_err = errno; + if (stat(dv->devname, &stb) != 0) { + pr_err("Cannot find %s: %s\n", + dv->devname, strerror(errno)); + goto abort; + } + if ((stb.st_mode & S_IFMT) != S_IFBLK) { + if (dv->disposition == 'M') + /* non-fatal. 
Also improbable */ + continue; + pr_err("%s is not a block device.\n", + dv->devname); + goto abort; + } + if (dv->disposition == 'r') + /* Be happy, the stat worked, that is + * enough for --remove + */ + ; + else { + if (dv->disposition == 'M') + /* non-fatal */ + continue; + pr_err("Cannot open %s: %s\n", + dv->devname, strerror(open_err)); + goto abort; + } + } + rdev = stb.st_rdev; + } + switch(dv->disposition){ + default: + pr_err("internal error - devmode[%s]=%d\n", + dv->devname, dv->disposition); + goto abort; + case 'a': + case 'S': /* --add-spare */ + case 'j': /* --add-journal */ + case 'A': + case 'M': /* --re-add missing */ + case 'F': /* --re-add faulty */ + case 'c': /* --cluster-confirm */ + /* add the device */ + if (subarray) { + pr_err("Cannot add disks to a \'member\' array, perform this operation on the parent container\n"); + goto abort; + } + + /* Let's first try to write re-add to sysfs */ + if (rdev != 0 && + (dv->disposition == 'A' || dv->disposition == 'F')) { + sysfs_init_dev(&devinfo, rdev); + if (sysfs_set_str(&info, &devinfo, "state", "re-add") == 0) { + pr_err("re-add %s to %s succeed\n", + dv->devname, info.sys_name); + break; + } + } + + if (dv->disposition == 'F') + /* Need to remove first */ + ioctl(fd, HOT_REMOVE_DISK, rdev); + /* Make sure it isn't in use (in 2.6 or later) */ + tfd = dev_open(dv->devname, O_RDONLY|O_EXCL); + if (tfd >= 0) { + /* We know no-one else is using it. We'll + * need non-exclusive access to add it, so + * do that now. 
+ */ + close(tfd); + tfd = dev_open(dv->devname, O_RDONLY); + } + if (tfd < 0) { + if (dv->disposition == 'M') + continue; + pr_err("Cannot open %s: %s\n", + dv->devname, strerror(errno)); + goto abort; + } + if (!frozen) { + if (sysfs_freeze_array(&info) == 1) + frozen = 1; + else + frozen = -1; + } + rv = Manage_add(fd, tfd, dv, tst, &array, + force, verbose, devname, update, + rdev, array_size, raid_slot); + close(tfd); + tfd = -1; + if (rv < 0) + goto abort; + if (rv > 0) + count++; + break; + + case 'r': + /* hot remove */ + if (subarray) { + pr_err("Cannot remove disks from a \'member\' array, perform this operation on the parent container\n"); + rv = -1; + } else + rv = Manage_remove(tst, fd, dv, sysfd, + rdev, verbose, + devname); + if (sysfd >= 0) + close(sysfd); + sysfd = -1; + if (rv < 0) + goto abort; + if (rv > 0) + count++; + break; + + case 'f': /* set faulty */ + /* FIXME check current member */ + if ((sysfd >= 0 && write(sysfd, "faulty", 6) != 6) || + (sysfd < 0 && ioctl(fd, SET_DISK_FAULTY, + rdev))) { + if (errno == EBUSY) + busy = 1; + pr_err("set device faulty failed for %s: %s\n", + dv->devname, strerror(errno)); + if (sysfd >= 0) + close(sysfd); + goto abort; + } + if (sysfd >= 0) + close(sysfd); + sysfd = -1; + count++; + if (verbose >= 0) + pr_err("set %s faulty in %s\n", + dv->devname, devname); + break; + case 'R': /* Mark as replaceable */ + if (subarray) { + pr_err("Cannot replace disks in a \'member\' array, perform this operation on the parent container\n"); + rv = -1; + } else { + if (!frozen) { + if (sysfs_freeze_array(&info) == 1) + frozen = 1; + else + frozen = -1; + } + rv = Manage_replace(tst, fd, dv, + rdev, verbose, + devname); + } + if (rv < 0) + goto abort; + if (rv > 0) + count++; + break; + case 'W': /* --with device that doesn't match */ + pr_err("No matching --replace device for --with %s\n", + dv->devname); + goto abort; + case 'w': /* --with device which was matched */ + rv = Manage_with(tst, fd, dv, + rdev, verbose, 
devname); + if (rv < 0) + goto abort; + break; + } + } + if (frozen > 0) + sysfs_set_str(&info, NULL, "sync_action","idle"); + if (test && count == 0) + return 2; + return 0; + +abort: + if (frozen > 0) + sysfs_set_str(&info, NULL, "sync_action","idle"); + return !test && busy ? 2 : 1; +} + +int autodetect(void) +{ + /* Open any md device, and issue the RAID_AUTORUN ioctl */ + int rv = 1; + int fd = dev_open("9:0", O_RDONLY); + if (fd >= 0) { + if (ioctl(fd, RAID_AUTORUN, 0) == 0) + rv = 0; + close(fd); + } + return rv; +} + +int Update_subarray(char *dev, char *subarray, char *update, struct mddev_ident *ident, int verbose) +{ + struct supertype supertype, *st = &supertype; + int fd, rv = 2; + + memset(st, 0, sizeof(*st)); + + fd = open_subarray(dev, subarray, st, verbose < 0); + if (fd < 0) + return 2; + + if (!st->ss->update_subarray) { + if (verbose >= 0) + pr_err("Operation not supported for %s metadata\n", + st->ss->name); + goto free_super; + } + + if (mdmon_running(st->devnm)) + st->update_tail = &st->updates; + + rv = st->ss->update_subarray(st, subarray, update, ident); + + if (rv) { + if (verbose >= 0) + pr_err("Failed to update %s of subarray-%s in %s\n", + update, subarray, dev); + } else if (st->update_tail) + flush_metadata_updates(st); + else + st->ss->sync_metadata(st); + + if (rv == 0 && strcmp(update, "name") == 0 && verbose >= 0) + pr_err("Updated subarray-%s name from %s, UUIDs may have changed\n", + subarray, dev); + + free_super: + st->ss->free_super(st); + close(fd); + + return rv; +} + +/* Move spare from one array to another If adding to destination array fails + * add back to original array. 
+ * Returns 1 on success, 0 on failure */ +int move_spare(char *from_devname, char *to_devname, dev_t devid) +{ + struct mddev_dev devlist; + char devname[20]; + + /* try to remove and add */ + int fd1 = open(to_devname, O_RDONLY); + int fd2 = open(from_devname, O_RDONLY); + + if (fd1 < 0 || fd2 < 0) { + if (fd1>=0) close(fd1); + if (fd2>=0) close(fd2); + return 0; + } + + devlist.next = NULL; + devlist.used = 0; + devlist.writemostly = 0; + devlist.devname = devname; + sprintf(devname, "%d:%d", major(devid), minor(devid)); + + devlist.disposition = 'r'; + if (Manage_subdevs(from_devname, fd2, &devlist, -1, 0, NULL, 0) == 0) { + devlist.disposition = 'a'; + if (Manage_subdevs(to_devname, fd1, &devlist, -1, 0, NULL, 0) == 0) { + /* make sure manager is aware of changes */ + ping_manager(to_devname); + ping_manager(from_devname); + close(fd1); + close(fd2); + return 1; + } + else Manage_subdevs(from_devname, fd2, &devlist, -1, 0, NULL, 0); + } + close(fd1); + close(fd2); + return 0; +} +#endif diff --git a/Monitor.c b/Monitor.c new file mode 100644 index 00000000..f19c2e58 --- /dev/null +++ b/Monitor.c @@ -0,0 +1,1143 @@ +/* + * mdadm - manage Linux "md" devices aka RAID arrays. + * + * Copyright (C) 2001-2009 Neil Brown + * + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Author: Neil Brown + * Email: + */ + +#include "mdadm.h" +#include "md_p.h" +#include "md_u.h" +#include +#include +#include +#include + +struct state { + char *devname; + char devnm[32]; /* to sync with mdstat info */ + long utime; + int err; + char *spare_group; + int active, working, failed, spare, raid; + int from_config; + int from_auto; + int expected_spares; + int devstate[MAX_DISKS]; + dev_t devid[MAX_DISKS]; + int percent; + char parent_devnm[32]; /* For subarray, devnm of parent. + * For others, "" + */ + struct supertype *metadata; + struct state *subarray;/* for a container it is a link to first subarray + * for a subarray it is a link to next subarray + * in the same container */ + struct state *parent; /* for a subarray it is a link to its container + */ + struct state *next; +}; + +struct alert_info { + char *mailaddr; + char *mailfrom; + char *alert_cmd; + int dosyslog; +}; +static int make_daemon(char *pidfile); +static int check_one_sharer(int scan); +static void alert(char *event, char *dev, char *disc, struct alert_info *info); +static int check_array(struct state *st, struct mdstat_ent *mdstat, + int test, struct alert_info *info, + int increments, char *prefer); +static int add_new_arrays(struct mdstat_ent *mdstat, struct state **statelist, + int test, struct alert_info *info); +static void try_spare_migration(struct state *statelist, struct alert_info *info); +static void link_containers_with_subarrays(struct state *list); + +int Monitor(struct mddev_dev *devlist, + char *mailaddr, char *alert_cmd, + struct context *c, + int daemonise, int oneshot, + int dosyslog, char *pidfile, int increments, + int share) +{ + /* + * Every few seconds, scan every md device looking for changes + * When a change is found, log it, possibly 
run the alert command, + * and possibly send Email + * + * For each array, we record: + * Update time + * active/working/failed/spare drives + * State of each device. + * %rebuilt if rebuilding + * + * If the update time changes, check out all the data again + * It is possible that we cannot get the state of each device + * due to bugs in the md kernel module. + * We also read /proc/mdstat to get rebuild percent, + * and to get state on all active devices incase of kernel bug. + * + * Events are: + * Fail + * An active device had Faulty set or Active/Sync removed + * FailSpare + * A spare device had Faulty set + * SpareActive + * An active device had a reverse transition + * RebuildStarted + * percent went from -1 to +ve + * RebuildNN + * percent went from below to not-below NN% + * DeviceDisappeared + * Couldn't access a device which was previously visible + * + * if we detect an array with active0, + * and if we can get_disk_info and find a name + * Then we hot-remove and hot-add to the other array + * + * If devlist is NULL, then we can monitor everything because --scan + * was given. We get an initial list from config file and add anything + * that appears in /proc/mdstat + */ + + struct state *statelist = NULL; + struct state *st2; + int finished = 0; + struct mdstat_ent *mdstat = NULL; + char *mailfrom = NULL; + struct alert_info info; + + if (!mailaddr) { + mailaddr = conf_get_mailaddr(); + if (mailaddr && ! c->scan) + pr_err("Monitor using email address \"%s\" from config file\n", + mailaddr); + } + mailfrom = conf_get_mailfrom(); + + if (!alert_cmd) { + alert_cmd = conf_get_program(); + if (alert_cmd && ! 
c->scan) + pr_err("Monitor using program \"%s\" from config file\n", + alert_cmd); + } + if (c->scan && !mailaddr && !alert_cmd && !dosyslog) { + pr_err("No mail address or alert command - not monitoring.\n"); + return 1; + } + info.alert_cmd = alert_cmd; + info.mailaddr = mailaddr; + info.mailfrom = mailfrom; + info.dosyslog = dosyslog; + + if (daemonise) { + int rv = make_daemon(pidfile); + if (rv >= 0) + return rv; + } + + if (share) + if (check_one_sharer(c->scan)) + return 1; + + if (devlist == NULL) { + struct mddev_ident *mdlist = conf_get_ident(NULL); + for (; mdlist; mdlist=mdlist->next) { + struct state *st; + if (mdlist->devname == NULL) + continue; + if (strcasecmp(mdlist->devname, "") == 0) + continue; + st = xcalloc(1, sizeof *st); + if (mdlist->devname[0] == '/') + st->devname = xstrdup(mdlist->devname); + else { + st->devname = xmalloc(8+strlen(mdlist->devname)+1); + strcpy(strcpy(st->devname, "/dev/md/"), + mdlist->devname); + } + st->next = statelist; + st->devnm[0] = 0; + st->percent = RESYNC_UNKNOWN; + st->from_config = 1; + st->expected_spares = mdlist->spare_disks; + if (mdlist->spare_group) + st->spare_group = xstrdup(mdlist->spare_group); + statelist = st; + } + } else { + struct mddev_dev *dv; + for (dv=devlist ; dv; dv=dv->next) { + struct mddev_ident *mdlist = conf_get_ident(dv->devname); + struct state *st = xcalloc(1, sizeof *st); + st->devname = xstrdup(dv->devname); + st->next = statelist; + st->devnm[0] = 0; + st->percent = RESYNC_UNKNOWN; + st->expected_spares = -1; + if (mdlist) { + st->expected_spares = mdlist->spare_disks; + if (mdlist->spare_group) + st->spare_group = xstrdup(mdlist->spare_group); + } + statelist = st; + } + } + + while (! 
finished) { + int new_found = 0; + struct state *st, **stp; + int anydegraded = 0; + + if (mdstat) + free_mdstat(mdstat); + mdstat = mdstat_read(oneshot?0:1, 0); + + for (st=statelist; st; st=st->next) + if (check_array(st, mdstat, c->test, &info, + increments, c->prefer)) + anydegraded = 1; + + /* now check if there are any new devices found in mdstat */ + if (c->scan) + new_found = add_new_arrays(mdstat, &statelist, c->test, + &info); + + /* If an array has active < raid && spare == 0 && spare_group != NULL + * Look for another array with spare > 0 and active == raid and same spare_group + * if found, choose a device and hotremove/hotadd + */ + if (share && anydegraded) + try_spare_migration(statelist, &info); + if (!new_found) { + if (oneshot) + break; + else + mdstat_wait(c->delay); + } + c->test = 0; + + for (stp = &statelist; (st = *stp) != NULL; ) { + if (st->from_auto && st->err > 5) { + *stp = st->next; + free(st->devname); + free(st->spare_group); + free(st); + } else + stp = &st->next; + } + } + for (st2 = statelist; st2; st2 = statelist) { + statelist = st2->next; + free(st2); + } + + if (pidfile) + unlink(pidfile); + return 0; +} + +static int make_daemon(char *pidfile) +{ + /* Return: + * -1 in the forked daemon + * 0 in the parent + * 1 on error + * so a none-negative becomes the exit code. 
+ */ + int pid = fork(); + if (pid > 0) { + if (!pidfile) + printf("%d\n", pid); + else { + FILE *pid_file; + pid_file=fopen(pidfile, "w"); + if (!pid_file) + perror("cannot create pid file"); + else { + fprintf(pid_file,"%d\n", pid); + fclose(pid_file); + } + } + return 0; + } + if (pid < 0) { + perror("daemonise"); + return 1; + } + close(0); + open("/dev/null", O_RDWR); + dup2(0,1); + dup2(0,2); + setsid(); + return -1; +} + +static int check_one_sharer(int scan) +{ + int pid, rv; + FILE *fp; + char dir[20]; + char path[100]; + struct stat buf; + sprintf(path, "%s/autorebuild.pid", MDMON_DIR); + fp = fopen(path, "r"); + if (fp) { + if (fscanf(fp, "%d", &pid) != 1) + pid = -1; + sprintf(dir, "/proc/%d", pid); + rv = stat(dir, &buf); + if (rv != -1) { + if (scan) { + pr_err("Only one autorebuild process allowed in scan mode, aborting\n"); + fclose(fp); + return 1; + } else { + pr_err("Warning: One autorebuild process already running.\n"); + } + } + fclose(fp); + } + if (scan) { + if (mkdir(MDMON_DIR, S_IRWXU) < 0 && + errno != EEXIST) { + pr_err("Can't create autorebuild.pid file\n"); + } else { + fp = fopen(path, "w"); + if (!fp) + pr_err("Cannot create autorebuild.pidfile\n"); + else { + pid = getpid(); + fprintf(fp, "%d\n", pid); + fclose(fp); + } + } + } + return 0; +} + +static void alert(char *event, char *dev, char *disc, struct alert_info *info) +{ + int priority; + + if (!info->alert_cmd && !info->mailaddr && !info->dosyslog) { + time_t now = time(0); + + printf("%1.15s: %s on %s %s\n", ctime(&now)+4, event, dev, disc?disc:"unknown device"); + } + if (info->alert_cmd) { + int pid = fork(); + switch(pid) { + default: + waitpid(pid, NULL, 0); + break; + case -1: + break; + case 0: + execl(info->alert_cmd, info->alert_cmd, + event, dev, disc, NULL); + exit(2); + } + } + if (info->mailaddr && + (strncmp(event, "Fail", 4)==0 || + strncmp(event, "Test", 4)==0 || + strncmp(event, "Spares", 6)==0 || + strncmp(event, "Degrade", 7)==0)) { + FILE *mp = 
popen(Sendmail, "w"); + if (mp) { + FILE *mdstat; + char hname[256]; + gethostname(hname, sizeof(hname)); + signal(SIGPIPE, SIG_IGN); + if (info->mailfrom) + fprintf(mp, "From: %s\n", info->mailfrom); + else + fprintf(mp, "From: %s monitoring \n", Name); + fprintf(mp, "To: %s\n", info->mailaddr); + fprintf(mp, "Subject: %s event on %s:%s\n\n", + event, dev, hname); + + fprintf(mp, + "This is an automatically generated mail message from %s\n", Name); + fprintf(mp, "running on %s\n\n", hname); + + fprintf(mp, + "A %s event had been detected on md device %s.\n\n", event, dev); + + if (disc && disc[0] != ' ') + fprintf(mp, + "It could be related to component device %s.\n\n", disc); + if (disc && disc[0] == ' ') + fprintf(mp, "Extra information:%s.\n\n", disc); + + fprintf(mp, "Faithfully yours, etc.\n"); + + mdstat = fopen("/proc/mdstat", "r"); + if (mdstat) { + char buf[8192]; + int n; + fprintf(mp, + "\nP.S. The /proc/mdstat file currently contains the following:\n\n"); + while ( (n=fread(buf, 1, sizeof(buf), mdstat)) > 0) + n=fwrite(buf, 1, n, mp); + fclose(mdstat); + } + pclose(mp); + } + } + + /* log the event to syslog maybe */ + if (info->dosyslog) { + /* Log at a different severity depending on the event. 
+ * + * These are the critical events: */ + if (strncmp(event, "Fail", 4)==0 || + strncmp(event, "Degrade", 7)==0 || + strncmp(event, "DeviceDisappeared", 17)==0) + priority = LOG_CRIT; + /* Good to know about, but are not failures: */ + else if (strncmp(event, "Rebuild", 7)==0 || + strncmp(event, "MoveSpare", 9)==0 || + strncmp(event, "Spares", 6) != 0) + priority = LOG_WARNING; + /* Everything else: */ + else + priority = LOG_INFO; + + if (disc && disc[0] != ' ') + syslog(priority, + "%s event detected on md device %s, component device %s", event, dev, disc); + else if (disc) + syslog(priority, + "%s event detected on md device %s: %s", + event, dev, disc); + else + syslog(priority, + "%s event detected on md device %s", + event, dev); + } +} + +static int check_array(struct state *st, struct mdstat_ent *mdstat, + int test, struct alert_info *ainfo, + int increments, char *prefer) +{ + /* Update the state 'st' to reflect any changes shown in mdstat, + * or found by directly examining the array, and return + * '1' if the array is degraded, or '0' if it is optimal (or dead). + */ + struct { int state, major, minor; } info[MAX_DISKS]; + mdu_array_info_t array; + struct mdstat_ent *mse = NULL, *mse2; + char *dev = st->devname; + int fd = -1; + int i; + int remaining_disks; + int last_disk; + int new_array = 0; + + if (test) + alert("TestMessage", dev, NULL, ainfo); + if (st->devnm[0]) + fd = open("/sys/block", O_RDONLY|O_DIRECTORY); + if (fd >= 0) { + /* Don't open the device unless it is present and + * active in sysfs. 
+ */ + char buf[10]; + close(fd); + fd = sysfs_open(st->devnm, NULL, "array_state"); + if (fd < 0 || + read(fd, buf, 10) < 5 || + strncmp(buf,"clear",5) == 0 || + strncmp(buf,"inact",5) == 0) { + if (fd >= 0) + close(fd); + fd = sysfs_open(st->devnm, NULL, "level"); + if (fd < 0 || read(fd, buf, 10) != 0) { + if (fd >= 0) + close(fd); + if (!st->err) + alert("DeviceDisappeared", dev, NULL, ainfo); + st->err++; + return 0; + } + } + close(fd); + } + fd = open(dev, O_RDONLY); + if (fd < 0) { + if (!st->err) + alert("DeviceDisappeared", dev, NULL, ainfo); + st->err++; + return 0; + } + fcntl(fd, F_SETFD, FD_CLOEXEC); + if (ioctl(fd, GET_ARRAY_INFO, &array)<0) { + if (!st->err) + alert("DeviceDisappeared", dev, NULL, ainfo); + st->err++; + close(fd); + return 0; + } + /* It's much easier to list what array levels can't + * have a device disappear than all of them that can + */ + if (array.level == 0 || array.level == -1) { + if (!st->err && !st->from_config) + alert("DeviceDisappeared", dev, " Wrong-Level", ainfo); + st->err++; + close(fd); + return 0; + } + if (st->devnm[0] == 0) + strcpy(st->devnm, fd2devnm(fd)); + + for (mse2 = mdstat ; mse2 ; mse2=mse2->next) + if (strcmp(mse2->devnm, st->devnm) == 0) { + mse2->devnm[0] = 0; /* flag it as "used" */ + mse = mse2; + } + + if (!mse) { + /* duplicated array in statelist + * or re-created after reading mdstat*/ + st->err++; + close(fd); + return 0; + } + /* this array is in /proc/mdstat */ + if (array.utime == 0) + /* external arrays don't update utime, so + * just make sure it is always different. 
*/ + array.utime = st->utime + 1;; + + if (st->err) { + /* New array appeared where previously had an error */ + st->err = 0; + st->percent = RESYNC_NONE; + new_array = 1; + alert("NewArray", st->devname, NULL, ainfo); + } + + if (st->utime == array.utime && + st->failed == array.failed_disks && + st->working == array.working_disks && + st->spare == array.spare_disks && + (mse == NULL || ( + mse->percent == st->percent + ))) { + close(fd); + if ((st->active < st->raid) && st->spare == 0) + return 1; + else + return 0; + } + if (st->utime == 0 && /* new array */ + mse->pattern && strchr(mse->pattern, '_') /* degraded */ + ) + alert("DegradedArray", dev, NULL, ainfo); + + if (st->utime == 0 && /* new array */ + st->expected_spares > 0 && + array.spare_disks < st->expected_spares) + alert("SparesMissing", dev, NULL, ainfo); + if (st->percent < 0 && st->percent != RESYNC_UNKNOWN && + mse->percent >= 0) + alert("RebuildStarted", dev, NULL, ainfo); + if (st->percent >= 0 && + mse->percent >= 0 && + (mse->percent / increments) > (st->percent / increments)) { + char percentalert[15]; // "RebuildNN" (10 chars) or "RebuildStarted" (15 chars) + + if((mse->percent / increments) == 0) + snprintf(percentalert, sizeof(percentalert), "RebuildStarted"); + else + snprintf(percentalert, sizeof(percentalert), "Rebuild%02d", mse->percent); + + alert(percentalert, dev, NULL, ainfo); + } + + if (mse->percent == RESYNC_NONE && + st->percent >= 0) { + /* Rebuild/sync/whatever just finished. + * If there is a number in /mismatch_cnt, + * we should report that. 
+ */ + struct mdinfo *sra = + sysfs_read(-1, st->devnm, GET_MISMATCH); + if (sra && sra->mismatch_cnt > 0) { + char cnt[80]; + snprintf(cnt, sizeof(cnt), + " mismatches found: %d (on raid level %d)", + sra->mismatch_cnt, array.level); + alert("RebuildFinished", dev, cnt, ainfo); + } else + alert("RebuildFinished", dev, NULL, ainfo); + if (sra) + free(sra); + } + st->percent = mse->percent; + + remaining_disks = array.nr_disks; + for (i=0; i 0; + i++) { + mdu_disk_info_t disc; + disc.number = i; + if (ioctl(fd, GET_DISK_INFO, &disc) >= 0) { + info[i].state = disc.state; + info[i].major = disc.major; + info[i].minor = disc.minor; + if (disc.major || disc.minor) + remaining_disks --; + } else + info[i].major = info[i].minor = 0; + } + last_disk = i; + + if (mse->metadata_version && + strncmp(mse->metadata_version, "external:", 9) == 0 && + is_subarray(mse->metadata_version+9)) { + char *sl; + strcpy(st->parent_devnm, + mse->metadata_version+10); + sl = strchr(st->parent_devnm, '/'); + if (sl) + *sl = 0; + } else + st->parent_devnm[0] = 0; + if (st->metadata == NULL && + st->parent_devnm[0] == 0) + st->metadata = super_by_fd(fd, NULL); + + close(fd); + + for (i=0; idevid[i]) + dv = map_dev_preferred( + major(st->devid[i]), + minor(st->devid[i]), 1, prefer); + change = newstate ^ st->devstate[i]; + if (st->utime && change && !st->err && !new_array) { + if ((st->devstate[i]&change)&(1<devid[i] == makedev(disc.major, disc.minor)) + alert("FailSpare", dev, dv, ainfo); + else if ((newstate&change)&(1<devstate[i] = newstate; + st->devid[i] = makedev(disc.major, disc.minor); + } + st->active = array.active_disks; + st->working = array.working_disks; + st->spare = array.spare_disks; + st->failed = array.failed_disks; + st->utime = array.utime; + st->raid = array.raid_disks; + st->err = 0; + if ((st->active < st->raid) && st->spare == 0) + return 1; + return 0; +} + +static int add_new_arrays(struct mdstat_ent *mdstat, struct state **statelist, + int test, struct alert_info 
*info) +{ + struct mdstat_ent *mse; + int new_found = 0; + char *name; + + for (mse=mdstat; mse; mse=mse->next) + if (mse->devnm[0] && + (!mse->level || /* retrieve containers */ + (strcmp(mse->level, "raid0") != 0 && + strcmp(mse->level, "linear") != 0)) + ) { + struct state *st = xcalloc(1, sizeof *st); + mdu_array_info_t array; + int fd; + + name = get_md_name(mse->devnm); + if (!name) { + free(st); + continue; + } + + st->devname = xstrdup(name); + if ((fd = open(st->devname, O_RDONLY)) < 0 || + ioctl(fd, GET_ARRAY_INFO, &array)< 0) { + /* no such array */ + if (fd >=0) close(fd); + put_md_name(st->devname); + free(st->devname); + if (st->metadata) { + st->metadata->ss->free_super(st->metadata); + free(st->metadata); + } + free(st); + continue; + } + close(fd); + st->next = *statelist; + st->err = 1; + st->from_auto = 1; + strcpy(st->devnm, mse->devnm); + st->percent = RESYNC_UNKNOWN; + st->expected_spares = -1; + if (mse->metadata_version && + strncmp(mse->metadata_version, "external:", 9) == 0 && + is_subarray(mse->metadata_version+9)) { + char *sl; + strcpy(st->parent_devnm, + mse->metadata_version+10); + sl = strchr(st->parent_devnm, '/'); + *sl = 0; + } else + st->parent_devnm[0] = 0; + *statelist = st; + if (test) + alert("TestMessage", st->devname, NULL, info); + new_found = 1; + } + return new_found; +} + +static int get_min_spare_size_required(struct state *st, unsigned long long *sizep) +{ + int fd; + + if (!st->metadata || + !st->metadata->ss->min_acceptable_spare_size) { + *sizep = 0; + return 0; + } + + fd = open(st->devname, O_RDONLY); + if (fd < 0) + return 1; + if (st->metadata->ss->external) + st->metadata->ss->load_container(st->metadata, fd, st->devname); + else + st->metadata->ss->load_super(st->metadata, fd, st->devname); + close(fd); + if (!st->metadata->sb) + return 1; + *sizep = st->metadata->ss->min_acceptable_spare_size(st->metadata); + st->metadata->ss->free_super(st->metadata); + + return 0; +} + +static int check_donor(struct state 
*from, struct state *to) +{ + struct state *sub; + + if (from == to) + return 0; + if (from->parent) + /* Cannot move from a member */ + return 0; + if (from->err) + return 0; + for (sub = from->subarray; sub; sub = sub->subarray) + /* If source array has degraded subarrays, don't + * remove anything + */ + if (sub->active < sub->raid) + return 0; + if (from->metadata->ss->external == 0) + if (from->active < from->raid) + return 0; + if (from->spare <= 0) + return 0; + return 1; +} + +static dev_t choose_spare(struct state *from, struct state *to, + struct domainlist *domlist, unsigned long long min_size) +{ + int d; + dev_t dev = 0; + + for (d = from->raid; !dev && d < MAX_DISKS; d++) { + if (from->devid[d] > 0 && + from->devstate[d] == 0) { + struct dev_policy *pol; + unsigned long long dev_size; + + if (to->metadata->ss->external && + test_partition_from_id(from->devid[d])) + continue; + + if (min_size && + dev_size_from_id(from->devid[d], &dev_size) && + dev_size < min_size) + continue; + + pol = devid_policy(from->devid[d]); + if (from->spare_group) + pol_add(&pol, pol_domain, + from->spare_group, NULL); + if (domain_test(domlist, pol, to->metadata->ss->name) == 1) + dev = from->devid[d]; + dev_policy_free(pol); + } + } + return dev; +} + +static dev_t container_choose_spare(struct state *from, struct state *to, + struct domainlist *domlist, + unsigned long long min_size, int active) +{ + /* This is similar to choose_spare, but we cannot trust devstate, + * so we need to read the metadata instead + */ + struct mdinfo *list; + struct supertype *st = from->metadata; + int fd = open(from->devname, O_RDONLY); + int err; + dev_t dev = 0; + + if (fd < 0) + return 0; + if (!st->ss->getinfo_super_disks) { + close(fd); + return 0; + } + + err = st->ss->load_container(st, fd, NULL); + close(fd); + if (err) + return 0; + + if (from == to) { + /* We must check if number of active disks has not increased + * since ioctl in main loop. 
mdmon may have added spare + * to subarray. If so we do not need to look for more spares + * so return non zero value */ + int active_cnt = 0; + struct mdinfo *dp; + list = st->ss->getinfo_super_disks(st); + if (!list) { + st->ss->free_super(st); + return 1; + } + dp = list->devs; + while (dp) { + if (dp->disk.state & (1<disk.state & (1<next; + } + sysfs_free(list); + if (active < active_cnt) { + /* Spare just activated.*/ + st->ss->free_super(st); + return 1; + } + } + + /* We only need one spare so full list not needed */ + list = container_choose_spares(st, min_size, domlist, from->spare_group, + to->metadata->ss->name, 1); + if (list) { + struct mdinfo *disks = list->devs; + if (disks) + dev = makedev(disks->disk.major, disks->disk.minor); + sysfs_free(list); + } + st->ss->free_super(st); + return dev; +} + +static void try_spare_migration(struct state *statelist, struct alert_info *info) +{ + struct state *from; + struct state *st; + + link_containers_with_subarrays(statelist); + for (st = statelist; st; st = st->next) + if (st->active < st->raid && + st->spare == 0 && !st->err) { + struct domainlist *domlist = NULL; + int d; + struct state *to = st; + unsigned long long min_size; + + if (to->parent_devnm[0] && !to->parent) + /* subarray monitored without parent container + * we can't move spares here */ + continue; + + if (to->parent) + /* member of a container */ + to = to->parent; + + if (get_min_spare_size_required(to, &min_size)) + continue; + if (to->metadata->ss->external) { + /* We must make sure there is + * no suitable spare in container already. + * If there is we don't add more */ + dev_t devid = container_choose_spare( + to, to, NULL, min_size, st->active); + if (devid > 0) + continue; + } + for (d = 0; d < MAX_DISKS; d++) + if (to->devid[d]) + domainlist_add_dev(&domlist, + to->devid[d], + to->metadata->ss->name); + if (to->spare_group) + domain_add(&domlist, to->spare_group); + /* + * No spare migration if the destination + * has no domain. 
Skip this array. + */ + if (!domlist) + continue; + for (from=statelist ; from ; from=from->next) { + dev_t devid; + if (!check_donor(from, to)) + continue; + if (from->metadata->ss->external) + devid = container_choose_spare( + from, to, domlist, min_size, 0); + else + devid = choose_spare(from, to, domlist, + min_size); + if (devid > 0 + && move_spare(from->devname, to->devname, devid)) { + alert("MoveSpare", to->devname, from->devname, info); + break; + } + } + domain_free(domlist); + } +} + +/* search the statelist to connect external + * metadata subarrays with their containers + * We always completely rebuild the tree from scratch as + * that is safest considering the possibility of entries + * disappearing or changing. + */ +static void link_containers_with_subarrays(struct state *list) +{ + struct state *st; + struct state *cont; + for (st = list; st; st = st->next) { + st->parent = NULL; + st->subarray = NULL; + } + for (st = list; st; st = st->next) + if (st->parent_devnm[0]) + for (cont = list; cont; cont = cont->next) + if (!cont->err && + cont->parent_devnm[0] == 0 && + strcmp(cont->devnm, st->parent_devnm) == 0) { + st->parent = cont; + st->subarray = cont->subarray; + cont->subarray = st; + break; + } +} + +/* Not really Monitor but ... */ +int Wait(char *dev) +{ + struct stat stb; + char devnm[32]; + int rv = 1; + int frozen_remaining = 3; + + if (stat(dev, &stb) != 0) { + pr_err("Cannot find %s: %s\n", dev, + strerror(errno)); + return 2; + } + strcpy(devnm, stat2devnm(&stb)); + + while(1) { + struct mdstat_ent *ms = mdstat_read(1, 0); + struct mdstat_ent *e; + + for (e=ms ; e; e=e->next) + if (strcmp(e->devnm, devnm) == 0) + break; + + if (e && e->percent == RESYNC_NONE) { + /* We could be in the brief pause before something + * starts. /proc/mdstat doesn't show that, but + * sync_action does. 
+ */ + struct mdinfo mdi; + char buf[21]; + sysfs_init(&mdi, -1, devnm); + if (sysfs_get_str(&mdi, NULL, "sync_action", + buf, 20) > 0 && + strcmp(buf,"idle\n") != 0) { + e->percent = RESYNC_UNKNOWN; + if (strcmp(buf, "frozen\n") == 0) { + if (frozen_remaining == 0) + e->percent = RESYNC_NONE; + else + frozen_remaining -= 1; + } + } + } + if (!e || e->percent == RESYNC_NONE) { + if (e && e->metadata_version && + strncmp(e->metadata_version, "external:", 9) == 0) { + if (is_subarray(&e->metadata_version[9])) + ping_monitor(&e->metadata_version[9]); + else + ping_monitor(devnm); + } + free_mdstat(ms); + return rv; + } + free_mdstat(ms); + rv = 0; + mdstat_wait(5); + } +} + +#ifndef MDASSEMBLE + +static char *clean_states[] = { + "clear", "inactive", "readonly", "read-auto", "clean", NULL }; + +int WaitClean(char *dev, int sock, int verbose) +{ + int fd; + struct mdinfo *mdi; + int rv = 1; + char devnm[32]; + + fd = open(dev, O_RDONLY); + if (fd < 0) { + if (verbose) + pr_err("Couldn't open %s: %s\n", dev, strerror(errno)); + return 1; + } + + strcpy(devnm, fd2devnm(fd)); + mdi = sysfs_read(fd, devnm, GET_VERSION|GET_LEVEL|GET_SAFEMODE); + if (!mdi) { + if (verbose) + pr_err("Failed to read sysfs attributes for %s\n", dev); + close(fd); + return 0; + } + + switch(mdi->array.level) { + case LEVEL_LINEAR: + case LEVEL_MULTIPATH: + case 0: + /* safemode delay is irrelevant for these levels */ + rv = 0; + } + + /* for internal metadata the kernel handles the final clean + * transition, containers can never be dirty + */ + if (!is_subarray(mdi->text_version)) + rv = 0; + + /* safemode disabled ? 
*/ + if (mdi->safe_mode_delay == 0) + rv = 0; + + if (rv) { + int state_fd = sysfs_open(fd2devnm(fd), NULL, "array_state"); + char buf[20]; + int delay = 5000; + + /* minimize the safe_mode_delay and prepare to wait up to 5s + * for writes to quiesce + */ + sysfs_set_safemode(mdi, 1); + + /* wait for array_state to be clean */ + while (1) { + rv = read(state_fd, buf, sizeof(buf)); + if (rv < 0) + break; + if (sysfs_match_word(buf, clean_states) <= 4) + break; + rv = sysfs_wait(state_fd, &delay); + if (rv < 0 && errno != EINTR) + break; + lseek(state_fd, 0, SEEK_SET); + } + if (rv < 0) + rv = 1; + else if (fping_monitor(sock) == 0 || + ping_monitor(mdi->text_version) == 0) { + /* we need to ping to close the window between array + * state transitioning to clean and the metadata being + * marked clean + */ + rv = 0; + } else + rv = 1; + if (rv && verbose) + pr_err("Error waiting for %s to be clean\n", + dev); + + /* restore the original safe_mode_delay */ + sysfs_set_safemode(mdi, mdi->safe_mode_delay); + close(state_fd); + } + + sysfs_free(mdi); + close(fd); + + return rv; +} +#endif /* MDASSEMBLE */ diff --git a/Query.c b/Query.c new file mode 100644 index 00000000..fbc1d103 --- /dev/null +++ b/Query.c @@ -0,0 +1,126 @@ +/* + * mdadm - manage Linux "md" devices aka RAID arrays. + * + * Copyright (C) 2002-2009 Neil Brown + * + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Author: Neil Brown + * Email: + */ + +#include "mdadm.h" +#include "md_p.h" +#include "md_u.h" + +int Query(char *dev) +{ + /* Print a brief description of 'dev': whether it is an md array + * and whether it carries a superblock, i.e. is a member of an array. + * Returns 1 if 'dev' cannot be opened, 0 otherwise. + */ + int fd = open(dev, O_RDONLY); + int vers; + int ioctlerr; + int superror; + struct mdinfo info; + mdu_array_info_t array; + struct supertype *st = NULL; + + unsigned long long larray_size; + struct stat stb; + char *mddev; + mdu_disk_info_t disc; + char *activity; + + if (fd < 0){ + pr_err("cannot open %s: %s\n", + dev, strerror(errno)); + return 1; + } + + vers = md_get_version(fd); + if (ioctl(fd, GET_ARRAY_INFO, &array)<0) + ioctlerr = errno; + else ioctlerr = 0; + + fstat(fd, &stb); /* result unchecked; st_rdev is compared with the array's view of this disk below */ + + if (vers>=9000 && !ioctlerr) { + if (!get_dev_size(fd, NULL, &larray_size)) + larray_size = 0; + } + + if (vers < 0) + printf("%s: is not an md array\n", dev); + else if (vers < 9000) + printf("%s: is an md device, but kernel cannot provide details\n", dev); + else if (ioctlerr == ENODEV) + printf("%s: is an md device which is not active\n", dev); + else if (ioctlerr) + printf("%s: is an md device, but gives \"%s\" when queried\n", + dev, strerror(ioctlerr)); + else { + printf("%s: %s %s %d devices, %d spare%s. Use mdadm --detail for more detail.\n", + dev, + human_size_brief(larray_size,IEC), + map_num(pers, array.level), + array.raid_disks, + array.spare_disks, array.spare_disks==1?"":"s"); + } + st = guess_super(fd); + if (st && st->ss->compare_super != NULL) + superror = st->ss->load_super(st, fd, dev); + else + superror = -1; + close(fd); + if (superror == 0) { + /* array might be active...
*/ + int uuid[4]; + struct map_ent *me, *map = NULL; + st->ss->getinfo_super(st, &info, NULL); + st->ss->uuid_from_super(st, uuid); + me = map_by_uuid(&map, uuid); + if (me) { + mddev = me->path; /* may be NULL: checked before open() and before use below */ + disc.number = info.disk.number; + activity = "undetected"; + if (mddev && (fd = open(mddev, O_RDONLY))>=0) { + if (md_get_version(fd) >= 9000 && + ioctl(fd, GET_ARRAY_INFO, &array)>= 0) { + if (ioctl(fd, GET_DISK_INFO, &disc) >= 0 && + makedev((unsigned)disc.major,(unsigned)disc.minor) == stb.st_rdev) + activity = "active"; + else + activity = "mismatch"; + } + close(fd); + } + } else { + activity = "inactive"; + mddev = "array"; + } + printf("%s: device %d in %d device %s %s %s. Use mdadm --examine for more detail.\n", + dev, + info.disk.number, info.array.raid_disks, + activity, + map_num(pers, info.array.level), + mddev ? mddev : "array"); /* never hand NULL to %s */ + if (mddev && st->ss == &super0) + put_md_name(mddev); + } + return 0; +} diff --git a/README.initramfs b/README.initramfs new file mode 100644 index 00000000..8f9b8ddf --- /dev/null +++ b/README.initramfs @@ -0,0 +1,123 @@ +Assembling md arrays at boot time. +--------------------------------- +December 2005 + +These notes apply to 2.6 kernels only and, in some cases, +to 2.6.15 or later. + +Md arrays can be assembled at boot time using the 'autodetect' functionality +which is triggered by storing components of an array in partitions of type +'fd' - Linux Raid Autodetect. +They can also be assembled by specifying the component devices in a +kernel parameter such as + md=0,/dev/sda,/dev/sdb +In this case, /dev/md0 will be assembled (because of the 0) from the listed +devices. + +These mechanisms, while useful, do not provide complete functionality +and are unlikely to be extended. The preferred way to assemble md +arrays at boot time is using 'mdadm' or 'mdassemble' (which is a +trimmed-down mdadm). To assemble an array which contains the root +filesystem, mdadm needs to be run before that filesystem is mounted, +and so needs to be run from an initial-ram-fs.
It is how this can +work that is the primary focus of this document. + +It should be noted up front that only the array containing the root +filesystem should be assembled from the initramfs. Any other arrays +should be assembled under the control of files on the main filesystem +as this enhances flexibility and maintainability. + +A minimal initramfs for assembling md arrays can be created using 3 +files and one directory. These are: + +/bin Directory +/bin/mdadm statically linked mdadm binary +/bin/busybox statically linked busybox binary +/bin/sh hard link to /bin/busybox +/init a shell script which calls mdadm appropriately. + +An example init script is: + +============================================== +#!/bin/sh + +echo 'Auto-assembling boot md array' +mkdir /proc +mount -t proc proc /proc +if [ -n "$rootuuid" ] +then arg=--uuid=$rootuuid +elif [ -n "$mdminor" ] +then arg=--super-minor=$mdminor +else arg=--super-minor=0 +fi +echo "Using $arg" +mdadm -Acpartitions $arg --auto=part /dev/mda +cd / +mount /dev/mda1 /root || mount /dev/mda /root +umount /proc +cd /root +exec chroot . /sbin/init < /dev/console > /dev/console 2>&1 +============================================= + +This could certainly be extended, or merged into a larger init script. +Though tested and in production use, it is not presented here as +"The Right Way" to do it, but as a useful example. +Some key points are: + + /proc needs to be mounted so that /proc/partitions can be accessed + by mdadm, and so that /proc/filesystems can be accessed by mount. + + The uuid of the array can be passed in as a kernel parameter + (rootuuid). As the kernel doesn't use this value, it is made available + in the environment for /init. + + If no uuid is given, we default to md0, (--super-minor=0) which is + commonly used to store the root filesystem. This may not work in + all situations. + + We assemble the array as a partitionable array (/dev/mda) even if we + end up using the whole array.
There is no cost in using the partitionable + interface, and in this context it is simpler. + + We try mounting both /dev/mda1 and /dev/mda as they are the most likely + parts of the array to contain the root filesystem. + + The --auto flag is given to mdadm so that it will create /dev/md* + files automatically. This is needed as /dev will not contain + any md files, and udev will not create them (as udev only creates device + files after the device exists, and mdadm needs the device file to create + the device). Note that the created md files may not exist in /dev + of the mounted root filesystem. This needs to be dealt with separately + from mdadm - possibly using udev. + + We do not need to create device files for the components which will + be assembled into /dev/mda. mdadm finds the major/minor numbers from + /proc/partitions and creates a temporary /dev file if one doesn't already + exist. + +The script "mkinitramfs" which is included with the mdadm distribution +can be used to create a minimal initramfs. It creates a file called +'init.cpio.gz' which can be specified as an 'initrd' to lilo or grub +(or whatever boot loader is being used). + + + + +Resume from an md array +----------------------- + +If you want to make use of the suspend-to-disk/resume functionality in Linux, +and want to have swap on an md array, you will need to assemble the array +before resume is possible. +However, because the array is active in the resumed image, you do not want +anything written to any drives during the resume process, such as superblock +updates or array resync. + +This can be achieved in 2.6.15-rc1 and later kernels using the +'start_ro' module parameter. +Simply include the command + echo 1 > /sys/module/md_mod/parameters/start_ro +before assembling the array with 'mdadm'. +You can then echo + 9:0 +or whatever is appropriate to /sys/power/resume to trigger the resume.
diff --git a/ReadMe.c b/ReadMe.c new file mode 100644 index 00000000..d40310a9 --- /dev/null +++ b/ReadMe.c @@ -0,0 +1,642 @@ +/* + * mdadm - manage Linux "md" devices aka RAID arrays. + * + * Copyright (C) 2001-2016 Neil Brown + * + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Author: Neil Brown + * Email: + */ + +#include "mdadm.h" + +#ifndef VERSION +#define VERSION "3.4" +#endif +#ifndef VERS_DATE +#define VERS_DATE "28th January 2016" +#endif +char Version[] = "mdadm - v" VERSION " - " VERS_DATE "\n"; /* banner built from the two macros above, which are only defaults (#ifndef) */ + +/* + * File: ReadMe.c + * + * This file contains general comments about the implementation + * and the various usage messages that can be displayed by mdadm + * + */ + +/* + * mdadm has 8 major modes of operation: + * 1/ Create + * This mode is used to create a new array with a superblock + * 2/ Assemble + * This mode is used to assemble the parts of a previously created + * array into an active array. Components can be explicitly given + * or can be searched for. mdadm (optionally) checks that the components + * do form a bona-fide array, and can, on request, fiddle superblock + * version numbers so as to assemble a faulty array.
+ * 3/ Build + * This is for building legacy arrays without superblocks + * 4/ Manage + * This is for doing something to one or more devices + * in an array, such as add,remove,fail. + * run/stop/readonly/readwrite are also available + * 5/ Misc + * This is for doing things to individual devices. + * They might be parts of an array so + * zero-superblock, examine might be appropriate + * They might be md arrays so + * run,stop,rw,ro,detail might be appropriate + * Also query will treat it as either + * 6/ Monitor + * This mode never exits but just monitors arrays and reports changes. + * 7/ Grow + * This mode allows for changing of key attributes of a raid array, such + * as size, number of devices, and possibly even layout. + * 8/ Incremental + * It assembles an array incrementally instead of all at once. + * As devices are discovered they can be passed to "mdadm --incremental" + * which will collect them. When enough devices to form an array are + * found, it is started. + */ + +char short_options[]="-ABCDEFGIQhVXYWZ:vqbc:i:l:p:m:n:x:u:c:d:z:U:N:sarfRSow1tye:"; /* here -b and -a take no argument */ +char short_bitmap_options[]= /* as short_options, but -b takes an argument */ + "-ABCDEFGIQhVXYWZ:vqb:c:i:l:p:m:n:x:u:c:d:z:U:N:sarfRSow1tye:"; +char short_bitmap_auto_options[]= /* both -b and -a take an argument */ + "-ABCDEFGIQhVXYWZ:vqb:c:i:l:p:m:n:x:u:c:d:z:U:N:sa:rfRSow1tye:"; + +struct option long_options[] = { /* { name, has_arg (0 none, 1 required, 2 optional), flag, val } */ + {"manage", 0, 0, ManageOpt}, + {"misc", 0, 0, MiscOpt}, + {"assemble", 0, 0, 'A'}, + {"build", 0, 0, 'B'}, + {"create", 0, 0, 'C'}, + {"detail", 0, 0, 'D'}, + {"examine", 0, 0, 'E'}, + {"follow", 0, 0, 'F'}, + {"grow", 0, 0, 'G'}, + {"incremental",0,0, 'I'}, + {"zero-superblock", 0, 0, KillOpt}, /* deliberately not a short_option */ + {"query", 0, 0, 'Q'}, + {"examine-bitmap", 0, 0, 'X'}, + {"auto-detect", 0, 0, AutoDetect}, + {"detail-platform", 0, 0, DetailPlatform}, + {"kill-subarray", 1, 0, KillSubarray}, + {"update-subarray", 1, 0, UpdateSubarray}, + {"udev-rules", 2, 0, UdevRules}, + {"offroot", 0, 0, OffRootOpt}, + {"examine-badblocks", 0, 0, ExamineBB}, + + {"dump",
1, 0, Dump}, + {"restore", 1, 0, Restore}, + + /* synonyms */ + {"monitor", 0, 0, 'F'}, + + /* after those will normally come the name of the md device */ + + {"help", 0, 0, 'h'}, + {"help-options",0,0, HelpOptions}, + {"version", 0, 0, 'V'}, + {"verbose", 0, 0, 'v'}, + {"quiet", 0, 0, 'q'}, + + /* For create or build: */ + {"chunk", 1, 0, ChunkSize}, + {"rounding", 1, 0, ChunkSize}, /* for linear, chunk is really a + * rounding number */ + {"level", 1, 0, 'l'}, /* 0,1,4,5,6,linear */ + {"parity", 1, 0, Layout}, /* {left,right}-{a,}symmetric */ + {"layout", 1, 0, Layout}, + {"raid-disks",1, 0, 'n'}, + {"raid-devices",1, 0, 'n'}, + {"spare-disks",1,0, 'x'}, + {"spare-devices",1,0, 'x'}, + {"size", 1, 0, 'z'}, + {"auto", 1, 0, Auto}, /* also for --assemble */ + {"assume-clean",0,0, AssumeClean }, + {"metadata", 1, 0, 'e'}, /* superblock format */ + {"bitmap", 1, 0, Bitmap}, + {"bitmap-chunk", 1, 0, BitmapChunk}, + {"write-behind", 2, 0, WriteBehind}, + {"write-mostly",0, 0, WriteMostly}, + {"re-add", 0, 0, ReAdd}, + {"homehost", 1, 0, HomeHost}, + {"symlinks", 1, 0, Symlinks}, + {"data-offset",1, 0, DataOffset}, + {"nodes",1, 0, Nodes}, /* also for --assemble */ + {"home-cluster",1, 0, ClusterName}, + {"write-journal",1, 0, WriteJournal}, + + /* For assemble */ + {"uuid", 1, 0, 'u'}, + {"super-minor",1,0, SuperMinor}, + {"name", 1, 0, 'N'}, + {"config", 1, 0, ConfigFile}, + {"scan", 0, 0, 's'}, + {"force", 0, 0, Force}, + {"update", 1, 0, 'U'}, + {"freeze-reshape", 0, 0, FreezeReshape}, + + /* Management */ + {"add", 0, 0, Add}, + {"add-spare", 0, 0, AddSpare}, + {"add-journal", 0, 0, AddJournal}, + {"remove", 0, 0, Remove}, + {"fail", 0, 0, Fail}, + {"set-faulty",0, 0, Fail}, + {"replace", 0, 0, Replace}, + {"with", 0, 0, With}, + {"run", 0, 0, 'R'}, + {"stop", 0, 0, 'S'}, + {"readonly", 0, 0, 'o'}, + {"readwrite", 0, 0, 'w'}, + {"no-degraded",0,0, NoDegraded }, + {"wait", 0, 0, WaitOpt}, + {"wait-clean", 0, 0, Waitclean }, + {"action", 1, 0, Action }, +
{"cluster-confirm", 0, 0, ClusterConfirm}, + + /* For Detail/Examine */ + {"brief", 0, 0, Brief}, + {"export", 0, 0, 'Y'}, + {"sparc2.2", 0, 0, Sparc22}, + {"test", 0, 0, 't'}, + {"prefer", 1, 0, Prefer}, + + /* For Follow/monitor */ + {"mail", 1, 0, EMail}, + {"program", 1, 0, ProgramOpt}, + {"alert", 1, 0, ProgramOpt}, + {"increment", 1, 0, Increment}, + {"delay", 1, 0, 'd'}, + {"daemonise", 0, 0, Fork}, + {"daemonize", 0, 0, Fork}, + {"oneshot", 0, 0, '1'}, + {"pid-file", 1, 0, 'i'}, + {"syslog", 0, 0, 'y'}, + {"no-sharing", 0, 0, NoSharing}, + + /* For Grow */ + {"backup-file", 1,0, BackupFile}, + {"invalid-backup",0,0,InvalidBackup}, + {"array-size", 1, 0, 'Z'}, + {"continue", 0, 0, Continue}, + + /* For Incremental */ + {"rebuild-map", 0, 0, RebuildMapOpt}, + {"path", 1, 0, IncrementalPath}, + + {0, 0, 0, 0} +}; + +char Usage[] = /* one-line pointer to --help */ +"Usage: mdadm --help\n" +" for help\n" +; + +char Help[] = /* general help; also mode_help[0] */ +"mdadm is used for building, managing, and monitoring\n" +"Linux md devices (aka RAID arrays)\n" +"Usage: mdadm --create device options...\n" +" Create a new array from unused devices.\n" +" mdadm --assemble device options...\n" +" Assemble a previously created array.\n" +" mdadm --build device options...\n" +" Create or assemble an array without metadata.\n" +" mdadm --manage device options...\n" +" make changes to an existing array.\n" +" mdadm --misc options... devices\n" +" report on or modify various md related devices.\n" +" mdadm --grow options device\n" +" resize/reshape an active array\n" +" mdadm --incremental device\n" +" add/remove a device to/from an array as appropriate\n" +" mdadm --monitor options...\n" +" Monitor one or more arrays for significant changes.\n" +" mdadm device options...\n" +" Shorthand for --manage.\n" +"Any parameter that does not start with '-' is treated as a device name\n" +"or, for --examine-bitmap, a file name.\n" +"The first such name is often the name of an md device.
Subsequent\n" +"names are often names of component devices.\n" +"\n" +" For detailed help on the above major modes use --help after the mode\n" +" e.g.\n" +" mdadm --assemble --help\n" +" For general help on options use\n" +" mdadm --help-options\n" +; + +char OptionHelp[] = /* the text that describes itself below as --help-options output */ +"Any parameter that does not start with '-' is treated as a device name\n" +"or, for --examine-bitmap, a file name.\n" +"The first such name is often the name of an md device. Subsequent\n" +"names are often names of component devices.\n" +"\n" +"Some common options are:\n" +" --help -h : General help message or, after above option,\n" +" mode specific help message\n" +" --help-options : This help message\n" +" --version -V : Print version information for mdadm\n" +" --verbose -v : Be more verbose about what is happening\n" +" --quiet -q : Don't print un-necessary messages\n" +" --brief -b : Be less verbose, more brief\n" +" --export -Y : With --detail, --detail-platform or --examine use\n" +" key=value format for easy import into environment\n" +" --force -f : Override normal checks and be more forceful\n" +"\n" +" --assemble -A : Assemble an array\n" +" --build -B : Build an array without metadata\n" +" --create -C : Create a new array\n" +" --detail -D : Display details of an array\n" +" --examine -E : Examine superblock on an array component\n" +" --examine-bitmap -X: Display the detail of a bitmap file\n" +" --examine-badblocks: Display list of known bad blocks on device\n" +" --monitor -F : monitor (follow) some arrays\n" +" --grow -G : resize/reshape an array\n" +" --incremental -I : add/remove a single device to/from an array as appropriate\n" +" --query -Q : Display general information about how a\n" +" device relates to the md driver\n" +" --auto-detect : Start arrays auto-detected by the kernel\n" +; +/* +"\n" +" For create or build:\n" +" --bitmap= -b : File to store bitmap in - may pre-exist for --build\n" +" --chunk= -c : chunk size of kibibytes\n" +" --rounding= : rounding
factor for linear array (==chunk size)\n" +" --level= -l : raid level: 0,1,4,5,6,10,linear, or mp for create.\n" +" : 0,1,10,mp,faulty or linear for build.\n" +" --parity= -p : raid5/6 parity algorithm: {left,right}-{,a}symmetric\n" +" --layout= : same as --parity, for RAID10: [fno]NN \n" +" --raid-devices= -n : number of active devices in array\n" +" --spare-devices= -x: number of spare (eXtra) devices in initial array\n" +" --size= -z : Size (in K) of each drive in RAID1/4/5/6/10 - optional\n" +" --force -f : Honour devices as listed on command line. Don't\n" +" : insert a missing drive for RAID5.\n" +" --assume-clean : Assume the array is already in-sync. This is dangerous for RAID5.\n" +" --bitmap-chunk= : chunksize of bitmap in bitmap file (Kilobytes)\n" +" --delay= -d : seconds between bitmap updates\n" +" --write-behind= : number of simultaneous write-behind requests to allow (requires bitmap)\n" +" --name= -N : Textual name for array - max 32 characters\n" +"\n" +" For assemble:\n" +" --bitmap= -b : File to find bitmap information in\n" +" --uuid= -u : uuid of array to assemble. Devices which don't\n" +" have this uuid are excluded\n" +" --super-minor= -m : minor number to look for in super-block when\n" +" choosing devices to use.\n" +" --name= -N : Array name to look for in super-block.\n" +" --config= -c : config file\n" +" --scan -s : scan config file for missing information\n" +" --force -f : Assemble the array even if some superblocks appear out-of-date\n" +" --update= -U : Update superblock: try '-A --update=?' for list of options.\n" +" --no-degraded : Do not start any degraded arrays - default unless --scan.\n" +"\n" +" For detail or examine:\n" +" --brief -b : Just print device name and UUID\n" +"\n" +" For follow/monitor:\n" +" --mail= -m : Address to mail alerts of failure to\n" +" --program= -p : Program to run when an event is detected\n" +" --alert= : same as --program\n" +" --delay= -d : seconds of delay between polling state. 
default=60\n" +"\n" +" General management:\n" +" --add -a : add, or hotadd subsequent devices\n" +" --re-add : re-add a recently removed device\n" +" --remove -r : remove subsequent devices\n" +" --fail -f : mark subsequent devices as faulty\n" +" --set-faulty : same as --fail\n" +" --replace : mark a device for replacement\n" +" --run -R : start a partially built array\n" +" --stop -S : deactivate array, releasing all resources\n" +" --readonly -o : mark array as readonly\n" +" --readwrite -w : mark array as readwrite\n" +" --zero-superblock : erase the MD superblock from a device.\n" +" --wait -W : wait for recovery/resync/reshape to finish.\n" +; +*/ + +char Help_create[] = /* mode_help[CREATE] */ +"Usage: mdadm --create device --chunk=X --level=Y --raid-devices=Z devices\n" +"\n" +" This usage will initialise a new md array, associate some\n" +" devices with it, and activate the array. In order to create an\n" +" array with some devices missing, use the special word 'missing' in\n" +" place of the relevant device name.\n" +"\n" +" Before devices are added, they are checked to see if they already contain\n" +" raid superblocks or filesystems. They are also checked to see if\n" +" the variance in device size exceeds 1%.\n" +" If any discrepancy is found, the user will be prompted for confirmation\n" +" before the array is created.
The presence of a '--run' can override this\n" +" caution.\n" +"\n" +" If the --size option is given then only that many kilobytes of each\n" +" device is used, no matter how big each device is.\n" +" If no --size is given, the apparent size of the smallest drive given\n" +" is used for raid level 1 and greater, and the full device is used for\n" +" other levels.\n" +"\n" +" Options that are valid with --create (-C) are:\n" +" --bitmap= : Create a bitmap for the array with the given filename\n" +" : or an internal bitmap if 'internal' is given\n" +" --chunk= -c : chunk size in kibibytes\n" +" --rounding= : rounding factor for linear array (==chunk size)\n" +" --level= -l : raid level: 0,1,4,5,6,10,linear,multipath and synonyms\n" +" --parity= -p : raid5/6 parity algorithm: {left,right}-{,a}symmetric\n" +" --layout= : same as --parity, for RAID10: [fno]NN \n" +" --raid-devices= -n : number of active devices in array\n" +" --spare-devices= -x: number of spare (eXtra) devices in initial array\n" +" --size= -z : Size (in K) of each drive in RAID1/4/5/6/10 - optional\n" +" --data-offset= : Space to leave between start of device and start\n" +" : of array data.\n" +" --force -f : Honour devices as listed on command line. Don't\n" +" : insert a missing drive for RAID5.\n" +" --run -R : insist on running the array even if not all\n" +" : devices are present or some look odd.\n" +" --readonly -o : start the array readonly - not supported yet.\n" +" --name= -N : Textual name for array - max 32 characters\n" +" --bitmap-chunk= : bitmap chunksize in Kilobytes.\n" +" --delay= -d : bitmap update delay in seconds.\n" +" --write-journal= : Specify journal device for RAID-4/5/6 array\n" +"\n" +; + +char Help_build[] = /* mode_help[BUILD] */ +"Usage: mdadm --build device --chunk=X --level=Y --raid-devices=Z devices\n" +"\n" +" This usage is similar to --create. The difference is that it creates\n" +" a legacy array without a superblock.
With these arrays there is no\n" +" difference between initially creating the array and subsequently\n" +" assembling the array, except that hopefully there is useful data\n" +" there in the second case.\n" +"\n" +" The level may only be 0, 1, 10, linear, multipath, or faulty.\n" +" All devices must be listed and the array will be started once complete.\n" +" Options that are valid with --build (-B) are:\n" +" --bitmap= : file to store/find bitmap information in.\n" +" --chunk= -c : chunk size in kibibytes\n" +" --rounding= : rounding factor for linear array (==chunk size)\n" +" --level= -l : 0, 1, 10, linear, multipath, faulty\n" +" --raid-devices= -n : number of active devices in array\n" +" --bitmap-chunk= : bitmap chunksize in Kilobytes.\n" +" --delay= -d : bitmap update delay in seconds.\n" +; + +char Help_assemble[] = /* mode_help[ASSEMBLE] */ +"Usage: mdadm --assemble device options...\n" +" mdadm --assemble --scan options...\n" +"\n" +"This usage assembles one or more raid arrays from pre-existing\n" +"components.\n" +"For each array, mdadm needs to know the md device, the identity of\n" +"the array, and a number of sub devices.
These can be found in a number\n" +"of ways.\n" +"\n" +"The md device is given on the command line, is found listed in the\n" +"config file, or can be deduced from the array identity.\n" +"The array identity is determined either from the --uuid, --name, or\n" +"--super-minor commandline arguments, from the config file,\n" +"or from the first component device on the command line.\n" +"\n" +"The different combinations of these are as follows:\n" +" If the --scan option is not given, then only devices and identities\n" +" listed on the command line are considered.\n" +" The first device will be the array device, and the remainder will be\n" +" examined when looking for components.\n" +" If an explicit identity is given with --uuid or --super-minor, then\n" +" only devices with a superblock which matches that identity are considered,\n" +" otherwise every device listed is considered.\n" +"\n" +" If the --scan option is given, and no devices are listed, then\n" +" every array listed in the config file is considered for assembly.\n" +" The identity of candidate devices is determined from the config file.\n" +" After these arrays are assembled, mdadm will look for other devices\n" +" that could form further arrays and tries to assemble them. This can\n" +" be disabled using the 'AUTO' option in the config file.\n" +"\n" +" If the --scan option is given as well as one or more devices, then\n" +" Those devices are md devices that are to be assembled. Their identity\n" +" and components are determined from the config file.\n" +"\n" +" If mdadm can not find all of the components for an array, it will assemble\n" +" it but not activate it unless --run or --scan is given. To preserve this\n" +" behaviour even with --scan, add --no-degraded. Note that \"all of the\n" +" components\" means as many as were present the last time the array was running\n" +" as recorded in the superblock.
If the array was already degraded, and\n" +" the missing device is not a new problem, it will still be assembled. It\n" +" is only newly missing devices that cause the array not to be started.\n" +"\n" +"Options that are valid with --assemble (-A) are:\n" +" --bitmap= : bitmap file to use with the array\n" +" --uuid= -u : uuid of array to assemble. Devices which don't\n" +" have this uuid are excluded\n" +" --super-minor= -m : minor number to look for in super-block when\n" +" choosing devices to use.\n" +" --name= -N : Array name to look for in super-block.\n" +" --config= -c : config file\n" +" --scan -s : scan config file for missing information\n" +" --run -R : Try to start the array even if not enough devices\n" +" for a full array are present\n" +" --force -f : Assemble the array even if some superblocks appear\n" +" : out-of-date. This involves modifying the superblocks.\n" +" --update= -U : Update superblock: try '-A --update=?' for option list.\n" +" --no-degraded : Assemble but do not start degraded arrays.\n" +" --readonly -o : Mark the array as read-only. No resync will start.\n" +; + +char Help_manage[] = /* mode_help[MANAGE] */ +"Usage: mdadm arraydevice options component devices...\n" +"\n" +"This usage is for managing the component devices within an array.\n" +"The --manage option is not needed and is assumed if the first argument\n" +"is a device name or a management option.\n" +"The first device listed will be taken to be an md array device, any\n" +"subsequent devices are (potential) components of that array.\n" +"\n" +"Options that are valid with management mode are:\n" +" --add -a : hotadd subsequent devices to the array\n" +" --re-add : subsequent devices are re-added if they were\n" +" : recent members of the array\n" +" --remove -r : remove subsequent devices, which must not be active\n" +" --fail -f : mark subsequent devices as faulty\n" +" --set-faulty : same as --fail\n" +" --replace : mark device(s) to be replaced by spares.
Once\n" +" : replacement completes, device will be marked faulty\n" +" --with : Indicate which spare a previous '--replace' should\n" +" : prefer to use\n" +" --run -R : start a partially built array\n" +" --stop -S : deactivate array, releasing all resources\n" +" --readonly -o : mark array as readonly\n" +" --readwrite -w : mark array as readwrite\n" +; + +char Help_misc[] = /* mode_help[MISC] */ +"Usage: mdadm misc_option devices...\n" +"\n" +"This usage is for performing some task on one or more devices, which\n" +"may be arrays or components, depending on the task.\n" +"The --misc option is not needed (though it is allowed) and is assumed\n" +"if the first argument is a misc option.\n" +"\n" +"Options that are valid with the miscellaneous mode are:\n" +" --query -Q : Display general information about how a\n" +" device relates to the md driver\n" +" --detail -D : Display details of an array\n" +" --detail-platform : Display hardware/firmware details\n" +" --examine -E : Examine superblock on an array component\n" +" --examine-bitmap -X: Display contents of a bitmap file\n" +" --examine-badblocks: Display list of known bad blocks on device\n" +" --zero-superblock : erase the MD superblock from a device.\n" +" --run -R : start a partially built array\n" +" --stop -S : deactivate array, releasing all resources\n" +" --readonly -o : mark array as readonly\n" +" --readwrite -w : mark array as readwrite\n" +" --test -t : exit status 0 if ok, 1 if degraded, 2 if dead, 4 if missing\n" +" --wait -W : wait for resync/rebuild/recovery to finish\n" +" --action= : initiate or abort ('idle' or 'frozen') a 'check' or 'repair'.\n" +; + +char Help_monitor[] = /* mode_help[MONITOR] */ +"Usage: mdadm --monitor options devices\n" +"\n" +"This usage causes mdadm to monitor a number of md arrays by periodically\n" +"polling their status and acting on any changes.\n" +"If any devices are listed then those devices are monitored, otherwise\n" +"all devices listed in the config file are monitored.\n" +"The address for mailing
advisories to, and the program to handle\n" +"each change can be specified in the config file or on the command line.\n" +"There must be at least one destination for advisories, whether\n" +"an email address, a program, or --syslog\n" +"\n" +"Options that are valid with the monitor (-F --follow) mode are:\n" +" --mail= -m : Address to mail alerts of failure to\n" +" --program= -p : Program to run when an event is detected\n" +" --alert= : same as --program\n" +" --syslog -y : Report alerts via syslog\n" +" --increment= -r : Report RebuildNN events in the given increment. default=20\n" +" --delay= -d : seconds of delay between polling state. default=60\n" +" --config= -c : specify a different config file\n" +" --scan -s : find mail-address/program in config file\n" +" --daemonise -f : Fork and continue in child, parent exits\n" +" --pid-file= -i : In daemon mode write pid to specified file instead of stdout\n" +" --oneshot -1 : Check for degraded arrays, then exit\n" +" --test -t : Generate a TestMessage event against each array at startup\n" +; + +char Help_grow[] = /* mode_help[GROW] */ +"Usage: mdadm --grow device options\n" +"\n" +"This usage causes mdadm to attempt to reconfigure a running array.\n" +"This is only possible if the kernel being used supports a particular\n" +"reconfiguration.\n" +"\n" +"Options that are valid with the grow (-G --grow) mode are:\n" +" --level= -l : Tell mdadm what level to convert the array to.\n" +" --layout= -p : For a FAULTY array, set/change the error mode.\n" +" : for other arrays, update the layout\n" +" --size= -z : Change the active size of devices in an array.\n" +" : This is useful if all devices have been replaced\n" +" : with larger devices.
Value is in Kilobytes, or\n" +" : the special word 'max' meaning 'as large as possible'.\n" +" --assume-clean : When increasing the --size, this flag will avoid\n" +" : a resync of the new space\n" +" --chunk= -c : Change the chunksize of the array\n" +" --raid-devices= -n : Change the number of active devices in an array.\n" +" --add= -a : Add listed devices as part of reshape. This is\n" +" : needed for resizing a RAID0 which cannot have\n" +" : spares already present.\n" +" --bitmap= -b : Add or remove a write-intent bitmap.\n" +" --backup-file= file : A file on a different device to store data for a\n" +" : short time while increasing raid-devices on a\n" +" : RAID4/5/6 array. Also needed throughout a reshape\n" +" : when changing parameters other than raid-devices\n" +" --array-size= -Z : Change visible size of array. This does not change\n" +" : any data on the device, and is not stable across restarts.\n" +" --data-offset= : Location on device to move start of data to.\n" +; + +char Help_incr[] = /* mode_help[INCREMENTAL] */ +"Usage: mdadm --incremental [-Rqrsf] device\n" +"\n" +"This usage allows for incremental assembly of md arrays. Devices can be\n" +"added one at a time as they are discovered.
Once an array has all expected\n" +"devices, it will be started.\n" +"\n" +"Optionally, the process can be reversed by using the fail option.\n" +"When fail mode is invoked, mdadm will see if the device belongs to an array\n" +"and then both fail (if needed) and remove the device from that array.\n" +"\n" +"Options that are valid with incremental assembly (-I --incremental) are:\n" +" --run -R : Run arrays as soon as a minimal number of devices are\n" +" : present rather than waiting for all expected.\n" +" --quiet -q : Don't print any information messages, just errors.\n" +" --rebuild-map -r : Rebuild the 'map' file that mdadm uses for tracking\n" +" : partial arrays.\n" +" --scan -s : Use with -R to start any arrays that have the minimal\n" +" : required number of devices, but are not yet started.\n" +" --fail -f : First fail (if needed) and then remove device from\n" +" : any array that it is a member of.\n" +; + +char Help_config[] = /* config-file help; not an entry of mode_help[] */ +"The /etc/mdadm/mdadm.conf config file:\n\n" +" The config file contains, apart from blank lines and comment lines that\n" +" start with a hash(#), array lines, device lines, and various\n" +" configuration lines.\n" +" Each line is constructed of a number of space separated words, and can\n" +" be continued on subsequent physical lines by indenting those lines.\n" +"\n" +" A device line starts with the word 'device' and then has a number of words\n" +" which identify devices. These words should be names of devices in the\n" +" filesystem, and can contain wildcards. There can be multiple words on each\n" +" device line, and multiple device lines. All devices so listed are checked\n" +" for relevant super blocks when assembling arrays.\n" +"\n" +" An array line starts with the word 'array'. This is followed by the name of\n" +" the array device in the filesystem, e.g. '/dev/md2'. Subsequent words\n" +" describe the identity of the array, used to recognise devices to include in the\n" +" array.
The identity can be given as a UUID with a word starting 'uuid=', or\n" +" as a minor-number stored in the superblock using 'super-minor=', or as a list\n" +" of devices. This is given as a comma separated list of names, possibly\n" +" containing wildcards, preceded by 'devices='. If multiple criteria are given,\n" +" then a device must match all of them to be considered.\n" +"\n" +" Other configuration lines include:\n" +" mailaddr, mailfrom, program used for --monitor mode\n" +" create, auto used when creating device names in /dev\n" +" homehost, policy, part-policy used to guide policy in various\n" +" situations\n" +"\n" +; + +char *mode_help[mode_count] = { /* help text per mode constant; [0] is the general help */ + [0] = Help, + [ASSEMBLE] = Help_assemble, + [BUILD] = Help_build, + [CREATE] = Help_create, + [MANAGE] = Help_manage, + [MISC] = Help_misc, + [MONITOR] = Help_monitor, + [GROW] = Help_grow, + [INCREMENTAL] = Help_incr, +}; diff --git a/TODO b/TODO new file mode 100644 index 00000000..279d20db --- /dev/null +++ b/TODO @@ -0,0 +1,213 @@ + - add 'name' field to metadata type and use it. + - use validate_geometry more + - metadata should be able to check/reject bitmap stuff. + +DDF: + Three new metadata types: + ddf - used only to create a container. + ddf-bvd - used to create an array in a container + ddf-svd - used to create a secondary array from bvds. + + Usage: + mdadm -C /dev/ddf1 /dev/sd[abcdef] + mdadm -C /dev/md1 -e ddf /dev/sd[a-f] + mdadm -C /dev/md1 -l container /dev/sd[a-f] + + Each of these create a new ddf container using all those + devices. The name 'ddf*' signals that ddf metadata should be used. + '-e ddf' only supports one level - 'container'. 'container' is only + supported by ddf. + + mdadm -C /dev/md1 -l0 -n4 /dev/ddf1 # or maybe not ??? + mdadm -C /dev/md1 -l1 -n2 /dev/sda /dev/sdb + If exactly one device is given, and it is a container, we select + devices from that container.
+ If devices are given that are already in use, they must be in use by + a container, and the array is created in the container. + If devices given are bvds, we slip under the hood to make + the svd arrays. + + mdadm -A /dev/ddf ...... + base drives make a container. Anything in that container is started + auto-read-only. + if /dev/ddf is already assembled, we assemble bvds and svds inside it. + + +2005-dec-20 + Want an incremental assembly mode to work nicely with udev. + Core usage would be something like + mdadm --incr-assemble /dev/newdevice + This would + - examine the device to determine uuid etc. + - look for a match in /etc/mdadm.conf, abort if not found + - find that device and collect current contents + - perform an 'assemble' analysis to make sure we have the best set of devices. + - remove or add devices as appropriate + - possibly start the array if it was complete + + Other usages could involve + - specify which array to auto-add to. + This requires an existing array for uuid matching... is there any point? + + - + + +2004-june-02 + * Don't print 'errors' flag, it is meaningless. DONE + * Handle new superblock format + * create device file on demand, particularly partitionable devices. DONE + BUT figure a way to create the partition devices. + auto=partN + * Use Event: interface to listen for events. DONE, untested + * Make sure mdadm -As can assemble multi-level RAIDs ok. + * --build to build raid1 or multipath arrays + clean or not ??? + +---------------------------------------------------------------------------- +* mdadm --monitor to monitor failed multipath paths and re-instate them. + +* Maybe make "--help" fit in 80x24 and have a --long-help with more info. DONE + + +* maybe "missing" instead of missing in doco DONE +* possibly wait for resync to start, or even finish while assembling.- NO + +* -Db should have a devices= entry if possible. - DONE +* when assembling multipath arrays, ignore any error indicators. 
- DONE +* rationalise --monitor usage: + mdadm --monitor + doesn't do as expected. DONE + +* --assemble could have a --update option. - DONE + following word can be: + sparc2.2 + super-minor + +* mdadm /dev/md11, where md11 is raid0 can segfault, particularly when looking in the + [UU_UUU] string ... which doesn't exist ! +It should be more sensible. DONE + +Example: + +from Raimund Sacherer + +mke2fs -m0 -q /dev/ram1 300 +mount -n -t ext2 /dev/ram1 /tmp +echo DEVICE /dev/[sh]* >> /tmp/mdadm.conf +mdadm -Esb /dev/[sh]* 2>/dev/null >> /tmp/mdadm.conf +mdadm -ARsc /tmp/mdadm.conf +umount /tmp + + +?? Allow -S /dev/md? - current complains subsequent not a/d/r - DONE + +* new "Query" mode to subsume --detail and --examine. + --query or -Q, takes a device and tells if it is an MD device, + and also tells if a raid superblock is found. + DONE + +* write mdstat.c to parse /proc/mdstat file + Build list of arrays: name, rebuild-percent + DONE + +* parse /proc/partitions and map major/minor into /dev/* names, + and use that for default DEVICE list ???? + +* --detail --scan to read /proc/mdstat, and then iterate over these, + but assume --brief. --verbose can override + check each subdevice to see if it is in conf_get_devs. + Warn if not. + DONE, but don't warn yet... + +* Support multipath ... maybe... + maybe DONE + +* --follow to syslog + +* --follow to move spares around DONE + +* --follow to notice other events: DONE + rebuild started + spare activated + spare removed + spare added + +------------------------------------ +- --examine --scan scans all drives and builds an mdadm.conf file DONE + +- check superblock checksum in examine DONE +- report "chunk" or "rounding" depending on raid level DONE +- report "linear" instead of "-1" for raid level DONE +- decode layout depending on raid level DONE +- --verbose and --force flags.
DONE + +- set md_minor, *_disks for Create - DONE +- for create raid5, how to choose between + all working, but not insync + one missing, one spare, insync DONE (--force) +- and for raid1 - some failed drives... (missing) + +- when RUN_ARRAY, make sure *_disks counts are right + +- get --detail to extract extra stuff from superblock, + like uuid DONE +- --detail --brief to give a config file line DONE +- parse config file. DONE +- test... + +- when --assemble --scan, if an underlying device is an md device, + then try to assemble that device first. + + +- mdadm -S /dev/md0 /dev/md1 gives internal error FIXED + +- mdadm --detail --scan print summary of what it can find? DONE + + +--------- +Assemble doesn't add spares. - DONE +Create to allow "missing" name for devices. +Create to accept "--force" for do exactly what is requested +- get Assemble to upgrade devices if force flag. +ARRAY lines in config file to have super_minor=n +ARRAY lines in config file to have device=pattern, and only accept + those devices + If UUID given, insist on that + If not, but super_minor given, require all found with that minor + to have same uuid + If only device given, all valid supers on those devices must have + same uuid +allow /dev/mdX as first argument before any options +Possible --dry-run option for create and assemble--force + +Assemble to check that all devices mentioned in superblock + are present. + +New mode: --Monitor (or --Follow) + Periodically check status of all arrays (listed in config file). + Log every event and apparent cause - or differences + Email and alert - or run a program - for important events + Move spares around if necessary. + + An Array line can have a spare-group= field that indicates that + the array shares spares with other arrays with the same + spare-group name. + If an array has a failed and no spares, then check all other + arrays in the spare group. If one has no failures and a spare, + then consider that spare. 
+ Choose the smallest considered spare that is large enough. + If there is one, then hot-remove it from it's home, and + hot-add it to the array in question. + + --mail-to address + --alert-handler program + + Will also extract information from /proc/mdstat if present, + and consider 20% marks in rebuild as events. + + Events are: + drive fails - causes mail to be sent + rebuild started + spare activated + spare removed + spare added diff --git a/bitmap.c b/bitmap.c new file mode 100644 index 00000000..dab674b4 --- /dev/null +++ b/bitmap.c @@ -0,0 +1,485 @@ +/* + * mdadm - manage Linux "md" devices aka RAID arrays. + * + * Copyright (C) 2004 Paul Clements, SteelEye Technology, Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
/*
 * Format a byte count as a short human-readable string such as "4 KB".
 *
 * The value is divided by 1024 (truncating) until it drops below 1024 or
 * the largest known unit is reached, then printed with that unit's suffix.
 * Stopping at the last suffix matters on hosts where unsigned long is 64
 * bits wide: without the bound, values of 2^50 and above would index the
 * suffix table past its final entry.
 *
 * Returns a pointer to a static buffer, so the result is overwritten by
 * the next call and the function is not reentrant.
 */
const char *human_chunksize(unsigned long bytes)
{
	static char buf[16];
	static const char *const suffixes[] = { "B", "KB", "MB", "GB", "TB" };
	const int last = (int)(sizeof(suffixes) / sizeof(suffixes[0])) - 1;
	int i = 0;

	/* never step beyond the largest unit we have a name for */
	while (i < last && (bytes >> 10)) {
		bytes >>= 10;
		i++;
	}

	snprintf(buf, sizeof(buf), "%lu %s", bytes, suffixes[i]);

	return buf;
}
*/ + case 8: if (byte & 128) num++; + case 7: if (byte & 64) num++; + case 6: if (byte & 32) num++; + case 5: if (byte & 16) num++; + case 4: if (byte & 8) num++; + case 3: if (byte & 4) num++; + case 2: if (byte & 2) num++; + case 1: if (byte & 1) num++; + default: break; + } + + return num; +} + +int count_dirty_bits(char *buf, int num_bits) +{ + int i, num = 0; + + for (i = 0; i < num_bits / 8; i++) + num += count_dirty_bits_byte(buf[i], 8); + + if (num_bits % 8) /* not an even byte boundary */ + num += count_dirty_bits_byte(buf[i], num_bits % 8); + + return num; +} + +/* calculate the size of the bitmap given the array size and bitmap chunksize */ +unsigned long long bitmap_bits(unsigned long long array_size, + unsigned long chunksize) +{ + return (array_size * 512 + chunksize - 1) / chunksize; +} + +unsigned long bitmap_sectors(struct bitmap_super_s *bsb) +{ + unsigned long long bits = bitmap_bits(__le64_to_cpu(bsb->sync_size), + __le32_to_cpu(bsb->chunksize)); + int bits_per_sector = 8*512; + return (bits + bits_per_sector - 1) / bits_per_sector; +} + +bitmap_info_t *bitmap_fd_read(int fd, int brief) +{ + /* Note: fd might be open O_DIRECT, so we must be + * careful to align reads properly + */ + unsigned long long total_bits = 0, read_bits = 0, dirty_bits = 0; + bitmap_info_t *info; + void *buf; + unsigned int n, skip; + + if (posix_memalign(&buf, 4096, 8192) != 0) { + pr_err("failed to allocate 8192 bytes\n"); + return NULL; + } + n = read(fd, buf, 8192); + + info = xmalloc(sizeof(*info)); + + if (n < sizeof(info->sb)) { + pr_err("failed to read superblock of bitmap file: %s\n", strerror(errno)); + free(info); + free(buf); + return NULL; + } + memcpy(&info->sb, buf, sizeof(info->sb)); + skip = sizeof(info->sb); + + sb_le_to_cpu(&info->sb); /* convert superblock to CPU byte ordering */ + + if (brief || info->sb.sync_size == 0 || info->sb.chunksize == 0) + goto out; + + /* read the rest of the file counting total bits and dirty bits -- + * we stop when 
either: + * 1) we hit EOF, in which case we assume the rest of the bits (if any) + * are dirty + * 2) we've read the full bitmap, in which case we ignore any trailing + * data in the file + */ + total_bits = bitmap_bits(info->sb.sync_size, info->sb.chunksize); + + while(read_bits < total_bits) { + unsigned long long remaining = total_bits - read_bits; + + if (n == 0) { + n = read(fd, buf, 8192); + skip = 0; + if (n <= 0) + break; + } + if (remaining > (n-skip) * 8) /* we want the full buffer */ + remaining = (n-skip) * 8; + + dirty_bits += count_dirty_bits(buf+skip, remaining); + + read_bits += remaining; + n = 0; + } + + if (read_bits < total_bits) { /* file truncated... */ + pr_err("WARNING: bitmap file is not large enough for array size %llu!\n\n", + (unsigned long long)info->sb.sync_size); + total_bits = read_bits; + } +out: + free(buf); + info->total_bits = total_bits; + info->dirty_bits = dirty_bits; + return info; +} + +int bitmap_file_open(char *filename, struct supertype **stp) +{ + int fd; + struct stat stb; + struct supertype *st = *stp; + + if (stat(filename, &stb) < 0) { + pr_err("failed to find file %s: %s\n", + filename, strerror(errno)); + return -1; + } + if ((S_IFMT & stb.st_mode) == S_IFBLK) { + fd = open(filename, O_RDONLY|O_DIRECT); + if (fd < 0) { + pr_err("failed to open bitmap file %s: %s\n", + filename, strerror(errno)); + return -1; + } + /* block device, so we are probably after an internal bitmap */ + if (!st) st = guess_super(fd); + if (!st) { + /* just look at device... 
*/ + lseek(fd, 0, 0); + } else if (!st->ss->locate_bitmap) { + pr_err("No bitmap possible with %s metadata\n", + st->ss->name); + return -1; + } else { + if (st->ss->locate_bitmap(st, fd)) { + pr_err("%s doesn't have bitmap\n", filename); + fd = -1; + } + } + + *stp = st; + } else { + fd = open(filename, O_RDONLY|O_DIRECT); + if (fd < 0) { + pr_err("failed to open bitmap file %s: %s\n", + filename, strerror(errno)); + return -1; + } + } + + return fd; +} + +__u32 swapl(__u32 l) +{ + char *c = (char*)&l; + char t= c[0]; + c[0] = c[3]; + c[3] = t; + + t = c[1]; + c[1] = c[2]; + c[2] = t; + return l; +} +int ExamineBitmap(char *filename, int brief, struct supertype *st) +{ + /* + * Read the bitmap file and display its contents + */ + + bitmap_super_t *sb; + bitmap_info_t *info; + int rv = 1; + char buf[64]; + int swap; + int fd, i; + __u32 uuid32[4]; + + fd = bitmap_file_open(filename, &st); + if (fd < 0) + return rv; + + info = bitmap_fd_read(fd, brief); + if (!info) + return rv; + sb = &info->sb; + if (sb->magic != BITMAP_MAGIC && md_get_version(fd) > 0) { + pr_err("This is an md array. 
To view a bitmap you need to examine\n"); + pr_err("a member device, not the array.\n"); + pr_err("Reporting bitmap that would be used if this array were used\n"); + pr_err("as a member of some other array\n"); + } + close(fd); + printf(" Filename : %s\n", filename); + printf(" Magic : %08x\n", sb->magic); + if (sb->magic != BITMAP_MAGIC) { + pr_err("invalid bitmap magic 0x%x, the bitmap file appears\n", + sb->magic); + pr_err("to be corrupted or missing.\n"); + } + printf(" Version : %d\n", sb->version); + if (sb->version < BITMAP_MAJOR_LO || + sb->version > BITMAP_MAJOR_CLUSTERED) { + pr_err("unknown bitmap version %d, either the bitmap file\n", + sb->version); + pr_err("is corrupted or you need to upgrade your tools\n"); + goto free_info; + } + + rv = 0; + if (st) + swap = st->ss->swapuuid; + else +#if __BYTE_ORDER == BIG_ENDIAN + swap = 0; +#else + swap = 1; +#endif + memcpy(uuid32, sb->uuid, 16); + if (swap) + printf(" UUID : %08x:%08x:%08x:%08x\n", + swapl(uuid32[0]), + swapl(uuid32[1]), + swapl(uuid32[2]), + swapl(uuid32[3])); + else + printf(" UUID : %08x:%08x:%08x:%08x\n", + uuid32[0], + uuid32[1], + uuid32[2], + uuid32[3]); + + if (sb->nodes == 0) { + printf(" Events : %llu\n", (unsigned long long)sb->events); + printf(" Events Cleared : %llu\n", (unsigned long long)sb->events_cleared); + printf(" State : %s\n", bitmap_state(sb->state)); + + } + + printf(" Chunksize : %s\n", human_chunksize(sb->chunksize)); + printf(" Daemon : %ds flush period\n", sb->daemon_sleep); + if (sb->write_behind) + sprintf(buf, "Allow write behind, max %d", sb->write_behind); + else + sprintf(buf, "Normal"); + printf(" Write Mode : %s\n", buf); + printf(" Sync Size : %llu%s\n", (unsigned long long)sb->sync_size/2, + human_size(sb->sync_size * 512)); + + if (sb->nodes == 0) { + if (brief) + goto free_info; + printf(" Bitmap : %llu bits (chunks), %llu dirty (%2.1f%%)\n", + info->total_bits, info->dirty_bits, + 100.0 * info->dirty_bits / (info->total_bits?:1)); + } else { + 
printf(" Cluster nodes : %d\n", sb->nodes); + printf(" Cluster name : %-64s\n", sb->cluster_name); + for (i = 0; i < (int)sb->nodes; i++) { + if (i) { + free(info); + info = bitmap_fd_read(fd, brief); + sb = &info->sb; + } + if (sb->magic != BITMAP_MAGIC) + pr_err("invalid bitmap magic 0x%x, the bitmap file appears to be corrupted\n", sb->magic); + + printf(" Node Slot : %d\n", i); + printf(" Events : %llu\n", + (unsigned long long)sb->events); + printf(" Events Cleared : %llu\n", + (unsigned long long)sb->events_cleared); + printf(" State : %s\n", bitmap_state(sb->state)); + if (brief) + continue; + printf(" Bitmap : %llu bits (chunks), %llu dirty (%2.1f%%)\n", + info->total_bits, info->dirty_bits, + 100.0 * info->dirty_bits / (info->total_bits?:1)); + + } + } + +free_info: + free(info); + return rv; +} + +int CreateBitmap(char *filename, int force, char uuid[16], + unsigned long chunksize, unsigned long daemon_sleep, + unsigned long write_behind, + unsigned long long array_size /* sectors */, + int major) +{ + /* + * Create a bitmap file with a superblock and (optionally) a full bitmap + */ + + FILE *fp; + int rv = 1; + char block[512]; + bitmap_super_t sb; + long long bytes, filesize; + + if (!force && access(filename, F_OK) == 0) { + pr_err("bitmap file %s already exists, use --force to overwrite\n", filename); + return rv; + } + + fp = fopen(filename, "w"); + if (fp == NULL) { + pr_err("failed to open bitmap file %s: %s\n", + filename, strerror(errno)); + return rv; + } + + if (chunksize == UnSet) { + /* We don't want more than 2^21 chunks, as 2^11 fill up one + * 4K page (2 bytes per chunk), and 2^10 address of those + * fill up a 4K indexing page. 2^20 might be safer, especially + * on 64bit hosts, so use that. 
+ */ + chunksize = DEFAULT_BITMAP_CHUNK; + /* <<20 for 2^20 chunks, >>9 to convert bytes to sectors */ + while (array_size > ((unsigned long long)chunksize << (20-9))) + chunksize <<= 1; + } + + memset(&sb, 0, sizeof(sb)); + sb.magic = BITMAP_MAGIC; + sb.version = major; + if (uuid != NULL) + memcpy(sb.uuid, uuid, 16); + sb.chunksize = chunksize; + sb.daemon_sleep = daemon_sleep; + sb.write_behind = write_behind; + sb.sync_size = array_size; + + sb_cpu_to_le(&sb); /* convert to on-disk byte ordering */ + + if (fwrite(&sb, sizeof(sb), 1, fp) != 1) { + pr_err("failed to write superblock to bitmap file %s: %s\n", filename, strerror(errno)); + goto out; + } + + /* calculate the size of the bitmap and write it to disk */ + bytes = (bitmap_bits(array_size, chunksize) + 7) / 8; + if (!bytes) { + rv = 0; + goto out; + } + + filesize = bytes + sizeof(sb); + + memset(block, 0xff, sizeof(block)); + + while (bytes > 0) { + if (fwrite(block, sizeof(block), 1, fp) != 1) { + pr_err("failed to write bitmap file %s: %s\n", filename, strerror(errno)); + goto out; + } + bytes -= sizeof(block); + } + + rv = 0; + fflush(fp); + /* make the file be the right size (well, to the nearest byte) */ + if (ftruncate(fileno(fp), filesize)) + perror("ftrunace"); +out: + fclose(fp); + if (rv) + unlink(filename); /* possibly corrupted, better get rid of it */ + return rv; +} + +int bitmap_update_uuid(int fd, int *uuid, int swap) +{ + struct bitmap_super_s bm; + if (lseek(fd, 0, 0) != 0) + return 1; + if (read(fd, &bm, sizeof(bm)) != sizeof(bm)) + return 1; + if (bm.magic != __cpu_to_le32(BITMAP_MAGIC)) + return 1; + copy_uuid(bm.uuid, uuid, swap); + if (lseek(fd, 0, 0) != 0) + return 2; + if (write(fd, &bm, sizeof(bm)) != sizeof(bm)) { + lseek(fd, 0, 0); + return 2; + } + lseek(fd, 0, 0); + return 0; +} diff --git a/bitmap.h b/bitmap.h new file mode 100644 index 00000000..b8fb0714 --- /dev/null +++ b/bitmap.h @@ -0,0 +1,291 @@ +/* + * bitmap.h: Copyright (C) Peter T. 
Breuer (ptb@ot.uc3m.es) 2003 + * + * additions: Copyright (C) 2003-2004, Paul Clements, SteelEye Technology, Inc. + */ +#ifndef BITMAP_H +#define BITMAP_H 1 + +#define BITMAP_MAJOR_LO 3 +/* version 4 insists the bitmap is in little-endian order + * with version 3, it is host-endian which is non-portable + */ +#define BITMAP_MAJOR_HI 4 +#define BITMAP_MAJOR_HOSTENDIAN 3 +#define BITMAP_MAJOR_CLUSTERED 5 + +#define BITMAP_MINOR 39 + +/* + * in-memory bitmap: + * + * Use 16 bit block counters to track pending writes to each "chunk". + * The 2 high order bits are special-purpose, the first is a flag indicating + * whether a resync is needed. The second is a flag indicating whether a + * resync is active. + * This means that the counter is actually 14 bits: + * + * +--------+--------+------------------------------------------------+ + * | resync | resync | counter | + * | needed | active | | + * | (0-1) | (0-1) | (0-16383) | + * +--------+--------+------------------------------------------------+ + * + * The "resync needed" bit is set when: + * a '1' bit is read from storage at startup. + * a write request fails on some drives + * a resync is aborted on a chunk with 'resync active' set + * It is cleared (and resync-active set) when a resync starts across all drives + * of the chunk. + * + * + * The "resync active" bit is set when: + * a resync is started on all drives, and resync_needed is set. + * resync_needed will be cleared (as long as resync_active wasn't already set). + * It is cleared when a resync completes. + * + * The counter counts pending write requests, plus the on-disk bit. + * When the counter is '1' and the resync bits are clear, the on-disk + * bit can be cleared aswell, thus setting the counter to 0. + * When we set a bit, or in the counter (to start a write), if the fields is + * 0, we first set the disk bit and set the counter to 1. 
+ * + * If the counter is 0, the on-disk bit is clear and the stipe is clean + * Anything that dirties the stipe pushes the counter to 2 (at least) + * and sets the on-disk bit (lazily). + * If a periodic sweep find the counter at 2, it is decremented to 1. + * If the sweep find the counter at 1, the on-disk bit is cleared and the + * counter goes to zero. + * + * Also, we'll hijack the "map" pointer itself and use it as two 16 bit block + * counters as a fallback when "page" memory cannot be allocated: + * + * Normal case (page memory allocated): + * + * page pointer (32-bit) + * + * [ ] ------+ + * | + * +-------> [ ][ ]..[ ] (4096 byte page == 2048 counters) + * c1 c2 c2048 + * + * Hijacked case (page memory allocation failed): + * + * hijacked page pointer (32-bit) + * + * [ ][ ] (no page memory allocated) + * counter #1 (16-bit) counter #2 (16-bit) + * + */ + +#ifdef __KERNEL__ + +#define PAGE_BITS (PAGE_SIZE << 3) +#define PAGE_BIT_SHIFT (PAGE_SHIFT + 3) + +typedef __u16 bitmap_counter_t; +#define COUNTER_BITS 16 +#define COUNTER_BIT_SHIFT 4 +#define COUNTER_BYTE_RATIO (COUNTER_BITS / 8) +#define COUNTER_BYTE_SHIFT (COUNTER_BIT_SHIFT - 3) + +#define NEEDED_MASK ((bitmap_counter_t) (1 << (COUNTER_BITS - 1))) +#define RESYNC_MASK ((bitmap_counter_t) (1 << (COUNTER_BITS - 2))) +#define COUNTER_MAX ((bitmap_counter_t) RESYNC_MASK - 1) +#define NEEDED(x) (((bitmap_counter_t) x) & NEEDED_MASK) +#define RESYNC(x) (((bitmap_counter_t) x) & RESYNC_MASK) +#define COUNTER(x) (((bitmap_counter_t) x) & COUNTER_MAX) + +/* how many counters per page? */ +#define PAGE_COUNTER_RATIO (PAGE_BITS / COUNTER_BITS) +/* same, except a shift value for more efficient bitops */ +#define PAGE_COUNTER_SHIFT (PAGE_BIT_SHIFT - COUNTER_BIT_SHIFT) +/* same, except a mask value for more efficient bitops */ +#define PAGE_COUNTER_MASK (PAGE_COUNTER_RATIO - 1) + +#define BITMAP_BLOCK_SIZE 512 +#define BITMAP_BLOCK_SHIFT 9 + +/* how many blocks per chunk? 
(this is variable) */ +#define CHUNK_BLOCK_RATIO(bitmap) ((bitmap)->chunksize >> BITMAP_BLOCK_SHIFT) +#define CHUNK_BLOCK_SHIFT(bitmap) ((bitmap)->chunkshift - BITMAP_BLOCK_SHIFT) +#define CHUNK_BLOCK_MASK(bitmap) (CHUNK_BLOCK_RATIO(bitmap) - 1) + +/* when hijacked, the counters and bits represent even larger "chunks" */ +/* there will be 1024 chunks represented by each counter in the page pointers */ +#define PAGEPTR_BLOCK_RATIO(bitmap) \ + (CHUNK_BLOCK_RATIO(bitmap) << PAGE_COUNTER_SHIFT >> 1) +#define PAGEPTR_BLOCK_SHIFT(bitmap) \ + (CHUNK_BLOCK_SHIFT(bitmap) + PAGE_COUNTER_SHIFT - 1) +#define PAGEPTR_BLOCK_MASK(bitmap) (PAGEPTR_BLOCK_RATIO(bitmap) - 1) + +/* + * on-disk bitmap: + * + * Use one bit per "chunk" (block set). We do the disk I/O on the bitmap + * file a page at a time. There's a superblock at the start of the file. + */ + +/* map chunks (bits) to file pages - offset by the size of the superblock */ +#define CHUNK_BIT_OFFSET(chunk) ((chunk) + (sizeof(bitmap_super_t) << 3)) + +#endif + +/* + * bitmap structures: + */ + +#define BITMAP_MAGIC 0x6d746962 + +/* use these for bitmap->flags and bitmap->sb->state bit-fields */ +enum bitmap_state { + BITMAP_ACTIVE = 0x001, /* the bitmap is in use */ + BITMAP_STALE = 0x002 /* the bitmap file is out of date or had -EIO */ +}; + +/* the superblock at the front of the bitmap file -- little endian */ +typedef struct bitmap_super_s { + __u32 magic; /* 0 BITMAP_MAGIC */ + __u32 version; /* 4 the bitmap major for now, could change... 
*/ + __u8 uuid[16]; /* 8 128 bit uuid - must match md device uuid */ + __u64 events; /* 24 event counter for the bitmap (1)*/ + __u64 events_cleared;/*32 event counter when last bit cleared (2) */ + __u64 sync_size; /* 40 the size of the md device's sync range(3) */ + __u32 state; /* 48 bitmap state information */ + __u32 chunksize; /* 52 the bitmap chunk size in bytes */ + __u32 daemon_sleep; /* 56 seconds between disk flushes */ + __u32 write_behind; /* 60 number of outstanding write-behind writes */ + __u32 sectors_reserved; /* 64 number of 512-byte sectors that are + * reserved for the bitmap. */ + __u32 nodes; /* 68 the maximum number of nodes in cluster. */ + __u8 cluster_name[64]; /* 72 cluster name to which this md belongs */ + __u8 pad[256 - 136]; /* set to zero */ +} bitmap_super_t; + +/* notes: + * (1) This event counter is updated before the eventcounter in the md superblock + * When a bitmap is loaded, it is only accepted if this event counter is equal + * to, or one greater than, the event counter in the superblock. + * (2) This event counter is updated when the other one is *if*and*only*if* the + * array is not degraded. As bits are not cleared when the array is degraded, + * this represents the last time that any bits were cleared. + * If a device is being added that has an event count with this value or + * higher, it is accepted as conforming to the bitmap. + * (3)This is the number of sectors represented by the bitmap, and is the range that + * resync happens across. For raid1 and raid5/6 it is the size of individual + * devices. For raid10 it is the size of the array. 
+ */ + +#ifdef __KERNEL__ + +/* the in-memory bitmap is represented by bitmap_pages */ +struct bitmap_page { + /* + * map points to the actual memory page + */ + char *map; + /* + * in emergencies (when map cannot be alloced), hijack the map + * pointer and use it as two counters itself + */ + unsigned int hijacked; + /* + * count of dirty bits on the page + */ + int count; +}; + +/* keep track of bitmap file pages that have pending writes on them */ +struct page_list { + struct list_head list; + struct page *page; +}; + +/* the main bitmap structure - one per mddev */ +struct bitmap { + struct bitmap_page *bp; + unsigned long pages; /* total number of pages in the bitmap */ + unsigned long missing_pages; /* number of pages not yet allocated */ + + mddev_t *mddev; /* the md device that the bitmap is for */ + + int counter_bits; /* how many bits per block counter */ + + /* bitmap chunksize -- how much data does each bit represent? */ + unsigned long chunksize; + unsigned long chunkshift; /* chunksize = 2^chunkshift (for bitops) */ + unsigned long chunks; /* total number of data chunks for the array */ + + /* We hold a count on the chunk currently being synced, and drop + * it when the last block is started. If the resync is aborted + * midway, we need to be able to drop that count, so we remember + * the counted chunk.. 
+ */ + unsigned long syncchunk; + + __u64 events_cleared; + + /* bitmap spinlock */ + spinlock_t lock; + + struct file *file; /* backing disk file */ + struct page *sb_page; /* cached copy of the bitmap file superblock */ + struct page **filemap; /* list of cache pages for the file */ + unsigned long *filemap_attr; /* attributes associated w/ filemap pages */ + unsigned long file_pages; /* number of pages in the file */ + + unsigned long flags; + + /* + * the bitmap daemon - periodically wakes up and sweeps the bitmap + * file, cleaning up bits and flushing out pages to disk as necessary + */ + mdk_thread_t *daemon; + unsigned long daemon_sleep; /* how many seconds between updates? */ + + /* + * bitmap write daemon - this daemon performs writes to the bitmap file + * this thread is only needed because of a limitation in ext3 (jbd) + * that does not allow a task to have two journal transactions ongoing + * simultaneously (even if the transactions are for two different + * filesystems) -- in the case of bitmap, that would be the filesystem + * that the bitmap file resides on and the filesystem that is mounted + * on the md device -- see current->journal_info in jbd/transaction.c + */ + mdk_thread_t *write_daemon; + mdk_thread_t *writeback_daemon; + spinlock_t write_lock; + struct semaphore write_ready; + struct semaphore write_done; + unsigned long writes_pending; + wait_queue_head_t write_wait; + struct list_head write_pages; + struct list_head complete_pages; + mempool_t *write_pool; +}; + +/* the bitmap API */ + +/* these are used only by md/bitmap */ +int bitmap_create(mddev_t *mddev); +void bitmap_destroy(mddev_t *mddev); +int bitmap_active(struct bitmap *bitmap); + +char *file_path(struct file *file, char *buf, int count); +void bitmap_print_sb(struct bitmap *bitmap); +int bitmap_update_sb(struct bitmap *bitmap); + +int bitmap_setallbits(struct bitmap *bitmap); + +/* these are exported */ +void bitmap_startwrite(struct bitmap *bitmap, sector_t offset, unsigned 
long sectors); +void bitmap_endwrite(struct bitmap *bitmap, sector_t offset, unsigned long sectors, + int success); +int bitmap_start_sync(struct bitmap *bitmap, sector_t offset, int *blocks); +void bitmap_end_sync(struct bitmap *bitmap, sector_t offset, int *blocks, int aborted); +void bitmap_close_sync(struct bitmap *bitmap); + +int bitmap_unplug(struct bitmap *bitmap); +#endif + +#endif diff --git a/config.c b/config.c new file mode 100644 index 00000000..b308b6cc --- /dev/null +++ b/config.c @@ -0,0 +1,1206 @@ +/* + * mdadm - manage Linux "md" devices aka RAID arrays. + * + * Copyright (C) 2001-2009 Neil Brown + * + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Author: Neil Brown + * Email: + */ + +#include "mdadm.h" +#include "dlink.h" +#include +#include +#include +#include +#include +#include + +/* + * Read the config file + * + * conf_get_uuids gets a list of devicename+uuid pairs + * conf_get_devs gets device names after expanding wildcards + * + * Each keeps the returned list and frees it when asked to make + * a new list. + * + * The format of the config file needs to be fairly extensible. + * Now, arrays only have names and uuids and devices merely are. + * But later arrays might want names, and devices might want superblock + * versions, and who knows what else. 
enum linetype { Devices, Array, Mailaddr, Mailfrom, Program, CreateDev,
		Homehost, HomeCluster, AutoMode, Policy, PartPolicy, LTEnd };
char *keywords[] = {
	[Devices]  = "devices",
	[Array]    = "array",
	[Mailaddr] = "mailaddr",
	[Mailfrom] = "mailfrom",
	[Program]  = "program",
	[CreateDev]= "create",
	[Homehost] = "homehost",
	[HomeCluster] = "homecluster",
	[AutoMode] = "auto",
	[Policy]   = "policy",
	[PartPolicy]="part-policy",
	[LTEnd]    = NULL
};

/*
 * match_keyword - look a config-file word up in the keywords table.
 *
 * The comparison ignores case and accepts any abbreviation of a keyword
 * that is at least three characters long.  The table is scanned in order,
 * so an abbreviation shared by two keywords resolves to the earlier one.
 *
 * Returns the index of the matching entry (an enum linetype value),
 * or -1 when the word is too short or matches nothing.
 */
int match_keyword(char *word)
{
	size_t given = strlen(word);
	char **kw;

	/* Abbreviations shorter than three characters are never accepted. */
	if (given < 3)
		return -1;

	for (kw = keywords; *kw != NULL; kw++)
		if (strncasecmp(word, *kw, given) == 0)
			return (int)(kw - keywords);

	return -1;
}
+struct conf_dev { + struct conf_dev *next; + char *name; +} *cdevlist = NULL; + +struct mddev_dev *load_partitions(void) +{ + FILE *f = fopen("/proc/partitions", "r"); + char buf[1024]; + struct mddev_dev *rv = NULL; + if (f == NULL) { + pr_err("cannot open /proc/partitions\n"); + return NULL; + } + while (fgets(buf, 1024, f)) { + int major, minor; + char *name, *mp; + struct mddev_dev *d; + + buf[1023] = '\0'; + if (buf[0] != ' ') + continue; + major = strtoul(buf, &mp, 10); + if (mp == buf || *mp != ' ') + continue; + minor = strtoul(mp, NULL, 10); + + name = map_dev(major, minor, 1); + if (!name) + continue; + d = xmalloc(sizeof(*d)); + memset(d, 0, sizeof(*d)); + d->devname = xstrdup(name); + d->next = rv; + rv = d; + } + fclose(f); + return rv; +} + +struct mddev_dev *load_containers(void) +{ + struct mdstat_ent *mdstat = mdstat_read(0, 0); + struct mdstat_ent *ent; + struct mddev_dev *d; + struct mddev_dev *rv = NULL; + struct map_ent *map = NULL, *me; + + if (!mdstat) + return NULL; + + for (ent = mdstat; ent; ent = ent->next) + if (ent->metadata_version && + strncmp(ent->metadata_version, "external:", 9) == 0 && + !is_subarray(&ent->metadata_version[9])) { + d = xmalloc(sizeof(*d)); + memset(d, 0, sizeof(*d)); + me = map_by_devnm(&map, ent->devnm); + if (me) + d->devname = xstrdup(me->path); + else if (asprintf(&d->devname, "/dev/%s", ent->devnm) < 0) { + free(d); + continue; + } + d->next = rv; + rv = d; + } + free_mdstat(mdstat); + map_free(map); + + return rv; +} + +struct createinfo createinfo = { + .autof = 2, /* by default, create devices with standard names */ + .symlinks = 1, + .names = 0, /* By default, stick with numbered md devices. 
/*
 * parse_auto - translate the argument of an "auto=" setting into an
 * autof code.
 *
 * str:    text following "auto=" (may be NULL or empty)
 * msg:    label used in the error message when str is not recognised
 * config: non-zero selects codes 5/6 instead of 3/4 for "md"/"mdp"
 *
 * Returns 2 for NULL, "" or "yes", 1 for "no", and 3 or 5 for a bare "md".
 * Every other accepted spelling (md, yes, mdp, p, part..., optionally
 * followed by a '-' and/or a number) returns the base code with
 * (num << 3) or-ed in.  num is the trailing number; it defaults to 4 and
 * is forced to 1 when the digits do not give a positive value.
 * An unrecognised argument prints an error and exits with status 2.
 */
int parse_auto(char *str, char *msg, int config)
{
	int autof;
	if (str == NULL || *str == 0)
		autof = 2;
	else if (strcasecmp(str,"no")==0)
		autof = 1;
	else if (strcasecmp(str,"yes")==0)
		autof = 2;
	else if (strcasecmp(str,"md")==0)
		autof = config?5:3;
	else {
		/* There might be digits, and maybe a hyphen, at the end */
		char *e = str + strlen(str);
		int num = 4;
		int len;
		/* <ctype.h> functions are only defined for unsigned char
		 * values (or EOF), so never hand them a plain char.
		 */
		while (e > str && isdigit((unsigned char)e[-1]))
			e--;
		if (*e) {
			num = atoi(e);
			if (num <= 0) num = 1;
		}
		if (e > str && e[-1] == '-')
			e--;
		len = e - str;
		if ((len == 2 && strncasecmp(str,"md",2)==0)) {
			autof = config ? 5 : 3;
		} else if ((len == 3 && strncasecmp(str,"yes",3)==0)) {
			autof = 2;
		} else if ((len == 3 && strncasecmp(str,"mdp",3)==0)) {
			autof = config ? 6 : 4;
		} else if ((len == 1 && strncasecmp(str,"p",1)==0) ||
			   (len >= 4 && strncasecmp(str,"part",4)==0)) {
			autof = 6;
		} else {
			pr_err("%s arg of \"%s\" unrecognised: use no,yes,md,mdp,part\n"
				"        optionally followed by a number.\n",
				msg, str);
			exit(2);
		}
		autof |= num << 3;
	}
	return autof;
}
/*
 * is_number - check that a string is made of one or more decimal digits
 * and nothing else.
 *
 * Returns 1 when w is a non-empty run of digits, 0 otherwise (empty
 * string, sign, or any trailing non-digit).
 */
static int is_number(char *w)
{
	int digits = 0;

	/* isdigit() is only defined for unsigned char values (or EOF);
	 * a plain char holding a byte >= 0x80 may be negative.
	 */
	while (*w && isdigit((unsigned char)*w)) {
		digits++;
		w++;
	}
	return (digits && ! *w);
}
else + strcpy(mis.name, w+5); + + } else if (strncasecmp(w, "bitmap=", 7) == 0) { + if (mis.bitmap_file) + pr_err("only specify bitmap file once. %s ignored\n", + w); + else + mis.bitmap_file = xstrdup(w+7); + + } else if (strncasecmp(w, "devices=", 8 ) == 0 ) { + if (mis.devices) + pr_err("only specify devices once (use a comma separated list). %s ignored\n", + w); + else + mis.devices = xstrdup(w+8); + } else if (strncasecmp(w, "spare-group=", 12) == 0 ) { + if (mis.spare_group) + pr_err("only specify one spare group per array. %s ignored.\n", + w); + else + mis.spare_group = xstrdup(w+12); + } else if (strncasecmp(w, "level=", 6) == 0 ) { + /* this is mainly for compatability with --brief output */ + mis.level = map_name(pers, w+6); + } else if (strncasecmp(w, "disks=", 6) == 0 ) { + /* again, for compat */ + mis.raid_disks = atoi(w+6); + } else if (strncasecmp(w, "num-devices=", 12) == 0 ) { + /* again, for compat */ + mis.raid_disks = atoi(w+12); + } else if (strncasecmp(w, "spares=", 7) == 0 ) { + /* for warning if not all spares present */ + mis.spare_disks = atoi(w+7); + } else if (strncasecmp(w, "metadata=", 9) == 0) { + /* style of metadata on the devices. */ + int i; + + for(i=0; superlist[i] && !mis.st; i++) + mis.st = superlist[i]->match_metadata_desc(w+9); + + if (!mis.st) + pr_err("metadata format %s unknown, ignored.\n", w+9); + } else if (strncasecmp(w, "auto=", 5) == 0 ) { + /* whether to create device special files as needed */ + mis.autof = parse_auto(w+5, "auto type", 0); + } else if (strncasecmp(w, "member=", 7) == 0) { + /* subarray within a container */ + mis.member = xstrdup(w+7); + } else if (strncasecmp(w, "container=", 10) == 0) { + /* the container holding this subarray. 
static char *home_host = NULL;
static int require_homehost = 1;

/*
 * homehostline - process the words of a HOMEHOST config line.
 *
 * "<ignore>" (any case) clears require_homehost.  The first other word
 * seen while home_host is still unset becomes the home host; the word
 * "<none>" is stored as an empty string.  Once home_host is set, further
 * words other than "<ignore>" have no effect.
 */
void homehostline(char *line)
{
	char *w;

	for (w = dl_next(line); w != line; w = dl_next(w)) {
		if (strcasecmp(w, "<ignore>") == 0)
			require_homehost = 0;
		else if (home_host == NULL) {
			if (strcasecmp(w, "<none>") == 0)
				home_host = xstrdup("");
			else
				home_host = xstrdup(w);
		}
	}
}
int auto_seen = 0; +void autoline(char *line) +{ + char *w; + char *seen; + int super_cnt; + char *dflt = auto_yes; + int homehost = 0; + int i; + + if (auto_seen) + return; + auto_seen = 1; + + /* Parse the 'auto' line creating policy statements for the 'auto' policy. + * + * The default is 'yes' but the 'auto' line might over-ride that. + * Words in the line are processed in order with the first + * match winning. + * word can be: + * +version - that version can be assembled + * -version - that version cannot be auto-assembled + * yes or +all - any other version can be assembled + * no or -all - no other version can be assembled. + * homehost - any array associated by 'homehost' to this + * host can be assembled. + * + * Thus: + * +ddf -0.90 homehost -all + * will auto-assemble any ddf array, no 0.90 array, and + * any other array (imsm, 1.x) if and only if it is identified + * as belonging to this host. + * + * We translate that to policy by creating 'auto=yes' when we see + * a '+version' line, 'auto=no' if we see '-version' before 'homehost', + * or 'auto=homehost' if we see '-version' after 'homehost'. + * When we see yes, no, +all or -all we stop and any version that hasn't + * been seen gets an appropriate auto= entry. + */ + + /* If environment variable MDADM_CONF_AUTO is defined, then + * it is prepended to the auto line. This allow a script + * to easily disable some metadata types. 
+ */ + w = getenv("MDADM_CONF_AUTO"); + if (w && *w) { + char *l = xstrdup(w); + char *head = line; + w = strtok(l, " \t"); + while (w) { + char *nw = dl_strdup(w); + dl_insert(head, nw); + head = nw; + w = strtok(NULL, " \t"); + } + free(l); + } + + for (super_cnt = 0; superlist[super_cnt]; super_cnt++) + ; + seen = xcalloc(super_cnt, 1); + + for (w = dl_next(line); w != line ; w = dl_next(w)) { + char *val; + + if (strcasecmp(w, "yes") == 0) { + dflt = auto_yes; + break; + } + if (strcasecmp(w, "no") == 0) { + if (homehost) + dflt = auto_homehost; + else + dflt = auto_no; + break; + } + if (strcasecmp(w, "homehost") == 0) { + homehost = 1; + continue; + } + if (w[0] == '+') + val = auto_yes; + else if (w[0] == '-') { + if (homehost) + val = auto_homehost; + else + val = auto_no; + } else + continue; + + if (strcasecmp(w+1, "all") == 0) { + dflt = val; + break; + } + for (i = 0; superlist[i]; i++) { + const char *version = superlist[i]->name; + if (strcasecmp(w+1, version) == 0) + break; + /* 1 matches 1.x, 0 matches 0.90 */ + if (version[1] == '.' 
struct fname {
	struct fname *next;
	char name[];
};

/*
 * conf_file_or_dir - parse an opened config path that may be either a
 * regular file or a directory.
 *
 * A regular file is handed straight to conf_file().  For a directory,
 * every entry whose name does not start with '.' and ends in ".conf"
 * (with at least one character before the suffix) is collected, sorted
 * with strcmp(), then opened relative to the directory and parsed with
 * conf_file().  Anything else, or a failing fstat(), is silently ignored.
 *
 * The stream f remains owned by the caller, which is expected to
 * fclose() it.
 */
void conf_file_or_dir(FILE *f)
{
	struct stat st;
	DIR *dir;
	struct dirent *dp;
	struct fname *list = NULL;
	int dfd;

	/* st.st_mode must not be inspected unless fstat() succeeded */
	if (fstat(fileno(f), &st) != 0)
		return;
	if (S_ISREG(st.st_mode)) {
		conf_file(f);
		return;
	}
	if (!S_ISDIR(st.st_mode))
		return;
#if _XOPEN_SOURCE >= 700 || _POSIX_C_SOURCE >= 200809L
	/* fdopendir() takes over the descriptor it is given and closedir()
	 * closes it.  Use a duplicate so that the descriptor inside 'f' is
	 * closed exactly once, by the caller's fclose().
	 */
	dfd = dup(fileno(f));
	if (dfd < 0)
		return;
	dir = fdopendir(dfd);
	if (!dir) {
		close(dfd);
		return;
	}
	while ((dp = readdir(dir)) != NULL) {
		int l;
		struct fname *fn, **p;
		if (dp->d_ino == 0)
			continue;
		if (dp->d_name[0] == '.')
			continue;
		l = strlen(dp->d_name);
		if (l < 6 || strcmp(dp->d_name+l-5, ".conf") != 0)
			continue;
		fn = xmalloc(sizeof(*fn)+l+1);
		strcpy(fn->name, dp->d_name);
		/* sorted insert: files are parsed in strcmp() order */
		for (p = &list;
		     *p && strcmp((*p)->name, fn->name) < 0;
		     p = & (*p)->next)
			;
		fn->next = *p;
		*p = fn;
	}
	while (list) {
		int fd;
		FILE *f2;
		struct fname *fn = list;
		list = list->next;
		fd = openat(fileno(f), fn->name, O_RDONLY);
		free(fn);
		if (fd < 0)
			continue;
		f2 = fdopen(fd, "r");
		if (!f2) {
			close(fd);
			continue;
		}
		conf_file(f2);
		fclose(f2);
	}
	closedir(dir);
#endif
}
+ */ + head = dl_strdup("AUTO"); + dl_init(head); + autoline(head); + free_line(head); + + loaded = 1; +} + +char *conf_get_mailaddr(void) +{ + load_conffile(); + return alert_email; +} + +char *conf_get_mailfrom(void) +{ + load_conffile(); + return alert_mail_from; +} + +char *conf_get_program(void) +{ + load_conffile(); + return alert_program; +} + +char *conf_get_homehost(int *require_homehostp) +{ + load_conffile(); + if (require_homehostp) + *require_homehostp = require_homehost; + return home_host; +} + +char *conf_get_homecluster(void) +{ + load_conffile(); + return home_cluster; +} + +struct createinfo *conf_get_create_info(void) +{ + load_conffile(); + return &createinfo; +} + +struct mddev_ident *conf_get_ident(char *dev) +{ + struct mddev_ident *rv; + load_conffile(); + rv = mddevlist; + while (dev && rv && (rv->devname == NULL + || !devname_matches(dev, rv->devname))) + rv = rv->next; + return rv; +} + +static void append_dlist(struct mddev_dev **dlp, struct mddev_dev *list) +{ + while (*dlp) + dlp = &(*dlp)->next; + *dlp = list; +} + +struct mddev_dev *conf_get_devs() +{ + glob_t globbuf; + struct conf_dev *cd; + int flags = 0; + static struct mddev_dev *dlist = NULL; + unsigned int i; + + while (dlist) { + struct mddev_dev *t = dlist; + dlist = dlist->next; + free(t->devname); + free(t); + } + + load_conffile(); + + if (cdevlist == NULL) { + /* default to 'partitions' and 'containers' */ + dlist = load_partitions(); + append_dlist(&dlist, load_containers()); + } + + for (cd=cdevlist; cd; cd=cd->next) { + if (strcasecmp(cd->name, "partitions")==0) + append_dlist(&dlist, load_partitions()); + else if (strcasecmp(cd->name, "containers")==0) + append_dlist(&dlist, load_containers()); + else { + glob(cd->name, flags, NULL, &globbuf); + flags |= GLOB_APPEND; + } + } + if (flags & GLOB_APPEND) { + for (i=0; idevname = xstrdup(globbuf.gl_pathv[i]); + t->next = dlist; + dlist = t; +/* printf("one dev is %s\n", t->devname);*/ + } + globfree(&globbuf); + } + + 
return dlist; +} + +int conf_test_dev(char *devname) +{ + struct conf_dev *cd; + if (cdevlist == NULL) + /* allow anything by default */ + return 1; + for (cd = cdevlist ; cd ; cd = cd->next) { + if (strcasecmp(cd->name, "partitions") == 0) + return 1; + if (fnmatch(cd->name, devname, FNM_PATHNAME) == 0) + return 1; + } + return 0; +} + +int conf_test_metadata(const char *version, struct dev_policy *pol, int is_homehost) +{ + /* If anyone said 'yes', that sticks. + * else if homehost applies, use that + * else if there is a 'no', say 'no'. + * else 'yes'. + */ + struct dev_policy *p; + int no=0, found_homehost=0; + load_conffile(); + + pol = pol_find(pol, pol_auto); + pol_for_each(p, pol, version) { + if (strcmp(p->value, "yes") == 0) + return 1; + if (strcmp(p->value, "homehost") == 0) + found_homehost = 1; + if (strcmp(p->value, "no") == 0) + no = 1; + } + if (is_homehost && found_homehost) + return 1; + if (no) + return 0; + return 1; +} + +int match_oneof(char *devices, char *devname) +{ + /* check if one of the comma separated patterns in devices + * matches devname + */ + + while (devices && *devices) { + char patn[1024]; + char *p = devices; + devices = strchr(devices, ','); + if (!devices) + devices = p + strlen(p); + if (devices-p < 1024) { + strncpy(patn, p, devices-p); + patn[devices-p] = 0; + if (fnmatch(patn, devname, FNM_PATHNAME)==0) + return 1; + } + if (*devices == ',') + devices++; + } + return 0; +} + +int devname_matches(char *name, char *match) +{ + /* See if the given array name matches the + * given match from config file. 
+ * + * First strip and /dev/md/ or /dev/, then + * see if there might be a numeric match of + * mdNN with NN + * then just strcmp + */ + if (strncmp(name, "/dev/md/", 8) == 0) + name += 8; + else if (strncmp(name, "/dev/", 5) == 0) + name += 5; + + if (strncmp(match, "/dev/md/", 8) == 0) + match += 8; + else if (strncmp(match, "/dev/", 5) == 0) + match += 5; + + if (strncmp(name, "md", 2) == 0 && + isdigit(name[2])) + name += 2; + if (strncmp(match, "md", 2) == 0 && + isdigit(match[2])) + match += 2; + + return (strcmp(name, match) == 0); +} + +int conf_name_is_free(char *name) +{ + /* Check if this name is already taken by an ARRAY entry in + * the config file. + * It can be taken either by a match on devname, name, or + * even super-minor. + */ + struct mddev_ident *dev; + + load_conffile(); + for (dev = mddevlist; dev; dev = dev->next) { + char nbuf[100]; + if (dev->devname && devname_matches(name, dev->devname)) + return 0; + if (dev->name[0] && devname_matches(name, dev->name)) + return 0; + sprintf(nbuf, "%d", dev->super_minor); + if (dev->super_minor != UnSet && + devname_matches(name, nbuf)) + return 0; + } + return 1; +} + +struct mddev_ident *conf_match(struct supertype *st, + struct mdinfo *info, + char *devname, + int verbose, int *rvp) +{ + struct mddev_ident *array_list, *match; + array_list = conf_get_ident(NULL); + match = NULL; + for (; array_list; array_list = array_list->next) { + if (array_list->uuid_set && + same_uuid(array_list->uuid, info->uuid, st->ss->swapuuid) + == 0) { + if (verbose >= 2 && array_list->devname) + pr_err("UUID differs from %s.\n", + array_list->devname); + continue; + } + if (array_list->name[0] && + strcasecmp(array_list->name, info->name) != 0) { + if (verbose >= 2 && array_list->devname) + pr_err("Name differs from %s.\n", + array_list->devname); + continue; + } + if (array_list->devices && devname && + !match_oneof(array_list->devices, devname)) { + if (verbose >= 2 && array_list->devname) + pr_err("Not a listed 
device for %s.\n", + array_list->devname); + continue; + } + if (array_list->super_minor != UnSet && + array_list->super_minor != info->array.md_minor) { + if (verbose >= 2 && array_list->devname) + pr_err("Different super-minor to %s.\n", + array_list->devname); + continue; + } + if (!array_list->uuid_set && + !array_list->name[0] && + !array_list->devices && + array_list->super_minor == UnSet) { + if (verbose >= 2 && array_list->devname) + pr_err("%s doesn't have any identifying information.\n", + array_list->devname); + continue; + } + /* FIXME, should I check raid_disks and level too?? */ + + if (match) { + if (verbose >= 0) { + if (match->devname && array_list->devname) + pr_err("we match both %s and %s - cannot decide which to use.\n", + match->devname, + array_list->devname); + else + pr_err("multiple lines in mdadm.conf match\n"); + } + if (rvp) + *rvp = 2; + match = NULL; + break; + } + match = array_list; + } + return match; +} + +int conf_verify_devnames(struct mddev_ident *array_list) +{ + struct mddev_ident *a1, *a2; + + for (a1 = array_list; a1; a1 = a1->next) { + if (!a1->devname) + continue; + if (strcmp(a1->devname, "") == 0) + continue; + for (a2 = a1->next; a2; a2 = a2->next) { + if (!a2->devname) + continue; + if (strcmp(a1->devname, a2->devname) != 0) + continue; + + if (a1->uuid_set && a2->uuid_set) { + char nbuf[64]; + __fname_from_uuid(a1->uuid, 0, nbuf, ':'); + pr_err("Devices %s and ", + nbuf); + __fname_from_uuid(a2->uuid, 0, nbuf, ':'); + fprintf(stderr, + "%s have the same name: %s\n", + nbuf, a1->devname); + } else + pr_err("Device %s given twice in config file\n", a1->devname); + return 1; + } + } + + return 0; +} diff --git a/crc32.c b/crc32.c new file mode 100644 index 00000000..94fda06a --- /dev/null +++ b/crc32.c @@ -0,0 +1,360 @@ +/* crc32.c -- compute the CRC-32 of a data stream + * Copyright (C) 1995-2003 Mark Adler + * For conditions of distribution and use, see copyright notice in zlib.h + * + * Note: zlib license from from 
zlib.h added explicitly as mdadm does + * not include zlib.h. License from v1.2.2 of zlib: + * + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the authors be held liable for any damages + * arising from the use of this software. + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgment in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source distribution. + * + * + * Thanks to Rodney Brown for his contribution of faster + * CRC methods: exclusive-oring 32 bits of data at a time, and pre-computing + * tables for updating the shift register in one step with three exclusive-ors + * instead of four steps with four exclusive-ors. This results about a factor + * of two increase in speed on a Power PC G4 (PPC7455) using gcc -O3. + */ + +/* @(#) $Id$ */ + +/* + Note on the use of DYNAMIC_CRC_TABLE: there is no mutex or semaphore + protection on the static variables used to control the first-use generation + of the crc tables. Therefore, if you #define DYNAMIC_CRC_TABLE, you should + first call get_crc_table() to initialize the tables before allowing more than + one thread to use crc32(). 
+ */ + +#ifdef MAKECRCH +# include +# ifndef DYNAMIC_CRC_TABLE +# define DYNAMIC_CRC_TABLE +# endif /* !DYNAMIC_CRC_TABLE */ +#endif /* MAKECRCH */ + +/* #include "zutil.h" / * for STDC and FAR definitions */ +#define STDC +#define FAR +#define Z_NULL ((void*)0) +#define OF(X) X +#define ZEXPORT +typedef long ptrdiff_t; +#define NOBYFOUR + +#define local static + +/* Find a four-byte integer type for crc32_little() and crc32_big(). */ +#ifndef NOBYFOUR +# ifdef STDC /* need ANSI C limits.h to determine sizes */ +# include +# define BYFOUR +# if (UINT_MAX == 0xffffffffUL) + typedef unsigned int u4; +# else +# if (ULONG_MAX == 0xffffffffUL) + typedef unsigned long u4; +# else +# if (USHRT_MAX == 0xffffffffUL) + typedef unsigned short u4; +# else +# undef BYFOUR /* can't find a four-byte integer type! */ +# endif +# endif +# endif +# endif /* STDC */ +#endif /* !NOBYFOUR */ + +/* Definitions for doing the crc four data bytes at a time. */ +#ifdef BYFOUR +# define REV(w) (((w)>>24)+(((w)>>8)&0xff00)+ \ + (((w)&0xff00)<<8)+(((w)&0xff)<<24)) + local unsigned long crc32_little OF((unsigned long, + const unsigned char FAR *, unsigned)); + local unsigned long crc32_big OF((unsigned long, + const unsigned char FAR *, unsigned)); +# define TBLS 8 +#else +# define TBLS 1 +#endif /* BYFOUR */ + +#ifdef DYNAMIC_CRC_TABLE + +local volatile int crc_table_empty = 1; +local unsigned long FAR crc_table[TBLS][256]; +local void make_crc_table OF((void)); +#ifdef MAKECRCH + local void write_table OF((FILE *, const unsigned long FAR *)); +#endif /* MAKECRCH */ + +/* + Generate tables for a byte-wise 32-bit CRC calculation on the polynomial: + x^32+x^26+x^23+x^22+x^16+x^12+x^11+x^10+x^8+x^7+x^5+x^4+x^2+x+1. + + Polynomials over GF(2) are represented in binary, one bit per coefficient, + with the lowest powers in the most significant bit. Then adding polynomials + is just exclusive-or, and multiplying a polynomial by x is a right shift by + one. 
If we call the above polynomial p, and represent a byte as the + polynomial q, also with the lowest power in the most significant bit (so the + byte 0xb1 is the polynomial x^7+x^3+x+1), then the CRC is (q*x^32) mod p, + where a mod b means the remainder after dividing a by b. + + This calculation is done using the shift-register method of multiplying and + taking the remainder. The register is initialized to zero, and for each + incoming bit, x^32 is added mod p to the register if the bit is a one (where + x^32 mod p is p+x^32 = x^26+...+1), and the register is multiplied mod p by + x (which is shifting right by one and adding x^32 mod p if the bit shifted + out is a one). We start with the highest power (least significant bit) of + q and repeat for all eight bits of q. + + The first table is simply the CRC of all possible eight bit values. This is + all the information needed to generate CRCs on data a byte at a time for all + combinations of CRC register values and incoming bytes. The remaining tables + allow for word-at-a-time CRC calculation for both big-endian and little- + endian machines, where a word is four bytes. 
+*/ +local void make_crc_table() +{ + unsigned long c; + int n, k; + unsigned long poly; /* polynomial exclusive-or pattern */ + /* terms of polynomial defining this crc (except x^32): */ + static volatile int first = 1; /* flag to limit concurrent making */ + static const unsigned char p[] = {0,1,2,4,5,7,8,10,11,12,16,22,23,26}; + + /* See if another task is already doing this (not thread-safe, but better + than nothing -- significantly reduces duration of vulnerability in + case the advice about DYNAMIC_CRC_TABLE is ignored) */ + if (first) { + first = 0; + + /* make exclusive-or pattern from polynomial (0xedb88320UL) */ + poly = 0UL; + for (n = 0; n < sizeof(p)/sizeof(unsigned char); n++) + poly |= 1UL << (31 - p[n]); + + /* generate a crc for every 8-bit value */ + for (n = 0; n < 256; n++) { + c = (unsigned long)n; + for (k = 0; k < 8; k++) + c = c & 1 ? poly ^ (c >> 1) : c >> 1; + crc_table[0][n] = c; + } + +#ifdef BYFOUR + /* generate crc for each value followed by one, two, and three zeros, + and then the byte reversal of those as well as the first table */ + for (n = 0; n < 256; n++) { + c = crc_table[0][n]; + crc_table[4][n] = REV(c); + for (k = 1; k < 4; k++) { + c = crc_table[0][c & 0xff] ^ (c >> 8); + crc_table[k][n] = c; + crc_table[k + 4][n] = REV(c); + } + } +#endif /* BYFOUR */ + + crc_table_empty = 0; + } + else { /* not first */ + /* wait for the other guy to finish (not efficient, but rare) */ + while (crc_table_empty) + ; + } + +#ifdef MAKECRCH + /* write out CRC tables to crc32.h */ + { + FILE *out; + + out = fopen("crc32.h", "w"); + if (out == NULL) return; + fprintf(out, "/* crc32.h -- tables for rapid CRC calculation\n"); + fprintf(out, " * Generated automatically by crc32.c\n */\n\n"); + fprintf(out, "local const unsigned long FAR "); + fprintf(out, "crc_table[TBLS][256] =\n{\n {\n"); + write_table(out, crc_table[0]); +# ifdef BYFOUR + fprintf(out, "#ifdef BYFOUR\n"); + for (k = 1; k < 8; k++) { + fprintf(out, " },\n {\n"); + 
write_table(out, crc_table[k]); + } + fprintf(out, "#endif\n"); +# endif /* BYFOUR */ + fprintf(out, " }\n};\n"); + fclose(out); + } +#endif /* MAKECRCH */ +} + +#ifdef MAKECRCH +local void write_table(out, table) + FILE *out; + const unsigned long FAR *table; +{ + int n; + + for (n = 0; n < 256; n++) + fprintf(out, "%s0x%08lxUL%s", n % 5 ? "" : " ", table[n], + n == 255 ? "\n" : (n % 5 == 4 ? ",\n" : ", ")); +} +#endif /* MAKECRCH */ + +#else /* !DYNAMIC_CRC_TABLE */ +/* ======================================================================== + * Tables of CRC-32s of all single-byte values, made by make_crc_table(). + */ +#include "crc32.h" +#endif /* DYNAMIC_CRC_TABLE */ + +/* ========================================================================= + * This function can be used by asm versions of crc32() + */ +const unsigned long FAR * ZEXPORT get_crc_table(void) +{ +#ifdef DYNAMIC_CRC_TABLE + if (crc_table_empty) + make_crc_table(); +#endif /* DYNAMIC_CRC_TABLE */ + return (const unsigned long FAR *)crc_table; +} + +/* ========================================================================= */ +#define DO1 crc = crc_table[0][((int)crc ^ (*buf++)) & 0xff] ^ (crc >> 8) +#define DO8 DO1; DO1; DO1; DO1; DO1; DO1; DO1; DO1 + +/* ========================================================================= */ +unsigned long ZEXPORT crc32( + unsigned long crc, + const unsigned char FAR *buf, + unsigned len) +{ + if (buf == Z_NULL) return 0UL; + +#ifdef DYNAMIC_CRC_TABLE + if (crc_table_empty) + make_crc_table(); +#endif /* DYNAMIC_CRC_TABLE */ + +#ifdef BYFOUR + if (sizeof(void *) == sizeof(ptrdiff_t)) { + u4 endian; + + endian = 1; + if (*((unsigned char *)(&endian))) + return crc32_little(crc, buf, len); + else + return crc32_big(crc, buf, len); + } +#endif /* BYFOUR */ +/* crc = crc ^ 0xffffffffUL;*/ + while (len >= 8) { + DO8; + len -= 8; + } + if (len) do { + DO1; + } while (--len); + return crc /* ^ 0xffffffffUL*/; +} + +#ifdef BYFOUR + +/* 
========================================================================= */ +#define DOLIT4 c ^= *buf4++; \ + c = crc_table[3][c & 0xff] ^ crc_table[2][(c >> 8) & 0xff] ^ \ + crc_table[1][(c >> 16) & 0xff] ^ crc_table[0][c >> 24] +#define DOLIT32 DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4 + +/* ========================================================================= */ +local unsigned long crc32_little(crc, buf, len) + unsigned long crc; + const unsigned char FAR *buf; + unsigned len; +{ + register u4 c; + register const u4 FAR *buf4; + + c = (u4)crc; + c = ~c; + while (len && ((ptrdiff_t)buf & 3)) { + c = crc_table[0][(c ^ *buf++) & 0xff] ^ (c >> 8); + len--; + } + + buf4 = (const u4 FAR *)buf; + while (len >= 32) { + DOLIT32; + len -= 32; + } + while (len >= 4) { + DOLIT4; + len -= 4; + } + buf = (const unsigned char FAR *)buf4; + + if (len) do { + c = crc_table[0][(c ^ *buf++) & 0xff] ^ (c >> 8); + } while (--len); + c = ~c; + return (unsigned long)c; +} + +/* ========================================================================= */ +#define DOBIG4 c ^= *++buf4; \ + c = crc_table[4][c & 0xff] ^ crc_table[5][(c >> 8) & 0xff] ^ \ + crc_table[6][(c >> 16) & 0xff] ^ crc_table[7][c >> 24] +#define DOBIG32 DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4 + +/* ========================================================================= */ +local unsigned long crc32_big(crc, buf, len) + unsigned long crc; + const unsigned char FAR *buf; + unsigned len; +{ + register u4 c; + register const u4 FAR *buf4; + + c = REV((u4)crc); + c = ~c; + while (len && ((ptrdiff_t)buf & 3)) { + c = crc_table[4][(c >> 24) ^ *buf++] ^ (c << 8); + len--; + } + + buf4 = (const u4 FAR *)buf; + buf4--; + while (len >= 32) { + DOBIG32; + len -= 32; + } + while (len >= 4) { + DOBIG4; + len -= 4; + } + buf4++; + buf = (const unsigned char FAR *)buf4; + + if (len) do { + c = crc_table[4][(c >> 24) ^ *buf++] ^ (c << 8); + } while (--len); + c = ~c; + return 
(unsigned long)(REV(c)); +} + +#endif /* BYFOUR */ diff --git a/crc32.h b/crc32.h new file mode 100644 index 00000000..8053b611 --- /dev/null +++ b/crc32.h @@ -0,0 +1,441 @@ +/* crc32.h -- tables for rapid CRC calculation + * Generated automatically by crc32.c + */ + +local const unsigned long FAR crc_table[TBLS][256] = +{ + { + 0x00000000UL, 0x77073096UL, 0xee0e612cUL, 0x990951baUL, 0x076dc419UL, + 0x706af48fUL, 0xe963a535UL, 0x9e6495a3UL, 0x0edb8832UL, 0x79dcb8a4UL, + 0xe0d5e91eUL, 0x97d2d988UL, 0x09b64c2bUL, 0x7eb17cbdUL, 0xe7b82d07UL, + 0x90bf1d91UL, 0x1db71064UL, 0x6ab020f2UL, 0xf3b97148UL, 0x84be41deUL, + 0x1adad47dUL, 0x6ddde4ebUL, 0xf4d4b551UL, 0x83d385c7UL, 0x136c9856UL, + 0x646ba8c0UL, 0xfd62f97aUL, 0x8a65c9ecUL, 0x14015c4fUL, 0x63066cd9UL, + 0xfa0f3d63UL, 0x8d080df5UL, 0x3b6e20c8UL, 0x4c69105eUL, 0xd56041e4UL, + 0xa2677172UL, 0x3c03e4d1UL, 0x4b04d447UL, 0xd20d85fdUL, 0xa50ab56bUL, + 0x35b5a8faUL, 0x42b2986cUL, 0xdbbbc9d6UL, 0xacbcf940UL, 0x32d86ce3UL, + 0x45df5c75UL, 0xdcd60dcfUL, 0xabd13d59UL, 0x26d930acUL, 0x51de003aUL, + 0xc8d75180UL, 0xbfd06116UL, 0x21b4f4b5UL, 0x56b3c423UL, 0xcfba9599UL, + 0xb8bda50fUL, 0x2802b89eUL, 0x5f058808UL, 0xc60cd9b2UL, 0xb10be924UL, + 0x2f6f7c87UL, 0x58684c11UL, 0xc1611dabUL, 0xb6662d3dUL, 0x76dc4190UL, + 0x01db7106UL, 0x98d220bcUL, 0xefd5102aUL, 0x71b18589UL, 0x06b6b51fUL, + 0x9fbfe4a5UL, 0xe8b8d433UL, 0x7807c9a2UL, 0x0f00f934UL, 0x9609a88eUL, + 0xe10e9818UL, 0x7f6a0dbbUL, 0x086d3d2dUL, 0x91646c97UL, 0xe6635c01UL, + 0x6b6b51f4UL, 0x1c6c6162UL, 0x856530d8UL, 0xf262004eUL, 0x6c0695edUL, + 0x1b01a57bUL, 0x8208f4c1UL, 0xf50fc457UL, 0x65b0d9c6UL, 0x12b7e950UL, + 0x8bbeb8eaUL, 0xfcb9887cUL, 0x62dd1ddfUL, 0x15da2d49UL, 0x8cd37cf3UL, + 0xfbd44c65UL, 0x4db26158UL, 0x3ab551ceUL, 0xa3bc0074UL, 0xd4bb30e2UL, + 0x4adfa541UL, 0x3dd895d7UL, 0xa4d1c46dUL, 0xd3d6f4fbUL, 0x4369e96aUL, + 0x346ed9fcUL, 0xad678846UL, 0xda60b8d0UL, 0x44042d73UL, 0x33031de5UL, + 0xaa0a4c5fUL, 0xdd0d7cc9UL, 0x5005713cUL, 0x270241aaUL, 0xbe0b1010UL, + 
0xc90c2086UL, 0x5768b525UL, 0x206f85b3UL, 0xb966d409UL, 0xce61e49fUL, + 0x5edef90eUL, 0x29d9c998UL, 0xb0d09822UL, 0xc7d7a8b4UL, 0x59b33d17UL, + 0x2eb40d81UL, 0xb7bd5c3bUL, 0xc0ba6cadUL, 0xedb88320UL, 0x9abfb3b6UL, + 0x03b6e20cUL, 0x74b1d29aUL, 0xead54739UL, 0x9dd277afUL, 0x04db2615UL, + 0x73dc1683UL, 0xe3630b12UL, 0x94643b84UL, 0x0d6d6a3eUL, 0x7a6a5aa8UL, + 0xe40ecf0bUL, 0x9309ff9dUL, 0x0a00ae27UL, 0x7d079eb1UL, 0xf00f9344UL, + 0x8708a3d2UL, 0x1e01f268UL, 0x6906c2feUL, 0xf762575dUL, 0x806567cbUL, + 0x196c3671UL, 0x6e6b06e7UL, 0xfed41b76UL, 0x89d32be0UL, 0x10da7a5aUL, + 0x67dd4accUL, 0xf9b9df6fUL, 0x8ebeeff9UL, 0x17b7be43UL, 0x60b08ed5UL, + 0xd6d6a3e8UL, 0xa1d1937eUL, 0x38d8c2c4UL, 0x4fdff252UL, 0xd1bb67f1UL, + 0xa6bc5767UL, 0x3fb506ddUL, 0x48b2364bUL, 0xd80d2bdaUL, 0xaf0a1b4cUL, + 0x36034af6UL, 0x41047a60UL, 0xdf60efc3UL, 0xa867df55UL, 0x316e8eefUL, + 0x4669be79UL, 0xcb61b38cUL, 0xbc66831aUL, 0x256fd2a0UL, 0x5268e236UL, + 0xcc0c7795UL, 0xbb0b4703UL, 0x220216b9UL, 0x5505262fUL, 0xc5ba3bbeUL, + 0xb2bd0b28UL, 0x2bb45a92UL, 0x5cb36a04UL, 0xc2d7ffa7UL, 0xb5d0cf31UL, + 0x2cd99e8bUL, 0x5bdeae1dUL, 0x9b64c2b0UL, 0xec63f226UL, 0x756aa39cUL, + 0x026d930aUL, 0x9c0906a9UL, 0xeb0e363fUL, 0x72076785UL, 0x05005713UL, + 0x95bf4a82UL, 0xe2b87a14UL, 0x7bb12baeUL, 0x0cb61b38UL, 0x92d28e9bUL, + 0xe5d5be0dUL, 0x7cdcefb7UL, 0x0bdbdf21UL, 0x86d3d2d4UL, 0xf1d4e242UL, + 0x68ddb3f8UL, 0x1fda836eUL, 0x81be16cdUL, 0xf6b9265bUL, 0x6fb077e1UL, + 0x18b74777UL, 0x88085ae6UL, 0xff0f6a70UL, 0x66063bcaUL, 0x11010b5cUL, + 0x8f659effUL, 0xf862ae69UL, 0x616bffd3UL, 0x166ccf45UL, 0xa00ae278UL, + 0xd70dd2eeUL, 0x4e048354UL, 0x3903b3c2UL, 0xa7672661UL, 0xd06016f7UL, + 0x4969474dUL, 0x3e6e77dbUL, 0xaed16a4aUL, 0xd9d65adcUL, 0x40df0b66UL, + 0x37d83bf0UL, 0xa9bcae53UL, 0xdebb9ec5UL, 0x47b2cf7fUL, 0x30b5ffe9UL, + 0xbdbdf21cUL, 0xcabac28aUL, 0x53b39330UL, 0x24b4a3a6UL, 0xbad03605UL, + 0xcdd70693UL, 0x54de5729UL, 0x23d967bfUL, 0xb3667a2eUL, 0xc4614ab8UL, + 0x5d681b02UL, 0x2a6f2b94UL, 0xb40bbe37UL, 0xc30c8ea1UL, 
0x5a05df1bUL, + 0x2d02ef8dUL +#ifdef BYFOUR + }, + { + 0x00000000UL, 0x191b3141UL, 0x32366282UL, 0x2b2d53c3UL, 0x646cc504UL, + 0x7d77f445UL, 0x565aa786UL, 0x4f4196c7UL, 0xc8d98a08UL, 0xd1c2bb49UL, + 0xfaefe88aUL, 0xe3f4d9cbUL, 0xacb54f0cUL, 0xb5ae7e4dUL, 0x9e832d8eUL, + 0x87981ccfUL, 0x4ac21251UL, 0x53d92310UL, 0x78f470d3UL, 0x61ef4192UL, + 0x2eaed755UL, 0x37b5e614UL, 0x1c98b5d7UL, 0x05838496UL, 0x821b9859UL, + 0x9b00a918UL, 0xb02dfadbUL, 0xa936cb9aUL, 0xe6775d5dUL, 0xff6c6c1cUL, + 0xd4413fdfUL, 0xcd5a0e9eUL, 0x958424a2UL, 0x8c9f15e3UL, 0xa7b24620UL, + 0xbea97761UL, 0xf1e8e1a6UL, 0xe8f3d0e7UL, 0xc3de8324UL, 0xdac5b265UL, + 0x5d5daeaaUL, 0x44469febUL, 0x6f6bcc28UL, 0x7670fd69UL, 0x39316baeUL, + 0x202a5aefUL, 0x0b07092cUL, 0x121c386dUL, 0xdf4636f3UL, 0xc65d07b2UL, + 0xed705471UL, 0xf46b6530UL, 0xbb2af3f7UL, 0xa231c2b6UL, 0x891c9175UL, + 0x9007a034UL, 0x179fbcfbUL, 0x0e848dbaUL, 0x25a9de79UL, 0x3cb2ef38UL, + 0x73f379ffUL, 0x6ae848beUL, 0x41c51b7dUL, 0x58de2a3cUL, 0xf0794f05UL, + 0xe9627e44UL, 0xc24f2d87UL, 0xdb541cc6UL, 0x94158a01UL, 0x8d0ebb40UL, + 0xa623e883UL, 0xbf38d9c2UL, 0x38a0c50dUL, 0x21bbf44cUL, 0x0a96a78fUL, + 0x138d96ceUL, 0x5ccc0009UL, 0x45d73148UL, 0x6efa628bUL, 0x77e153caUL, + 0xbabb5d54UL, 0xa3a06c15UL, 0x888d3fd6UL, 0x91960e97UL, 0xded79850UL, + 0xc7cca911UL, 0xece1fad2UL, 0xf5facb93UL, 0x7262d75cUL, 0x6b79e61dUL, + 0x4054b5deUL, 0x594f849fUL, 0x160e1258UL, 0x0f152319UL, 0x243870daUL, + 0x3d23419bUL, 0x65fd6ba7UL, 0x7ce65ae6UL, 0x57cb0925UL, 0x4ed03864UL, + 0x0191aea3UL, 0x188a9fe2UL, 0x33a7cc21UL, 0x2abcfd60UL, 0xad24e1afUL, + 0xb43fd0eeUL, 0x9f12832dUL, 0x8609b26cUL, 0xc94824abUL, 0xd05315eaUL, + 0xfb7e4629UL, 0xe2657768UL, 0x2f3f79f6UL, 0x362448b7UL, 0x1d091b74UL, + 0x04122a35UL, 0x4b53bcf2UL, 0x52488db3UL, 0x7965de70UL, 0x607eef31UL, + 0xe7e6f3feUL, 0xfefdc2bfUL, 0xd5d0917cUL, 0xcccba03dUL, 0x838a36faUL, + 0x9a9107bbUL, 0xb1bc5478UL, 0xa8a76539UL, 0x3b83984bUL, 0x2298a90aUL, + 0x09b5fac9UL, 0x10aecb88UL, 0x5fef5d4fUL, 0x46f46c0eUL, 0x6dd93fcdUL, + 
0x74c20e8cUL, 0xf35a1243UL, 0xea412302UL, 0xc16c70c1UL, 0xd8774180UL, + 0x9736d747UL, 0x8e2de606UL, 0xa500b5c5UL, 0xbc1b8484UL, 0x71418a1aUL, + 0x685abb5bUL, 0x4377e898UL, 0x5a6cd9d9UL, 0x152d4f1eUL, 0x0c367e5fUL, + 0x271b2d9cUL, 0x3e001cddUL, 0xb9980012UL, 0xa0833153UL, 0x8bae6290UL, + 0x92b553d1UL, 0xddf4c516UL, 0xc4eff457UL, 0xefc2a794UL, 0xf6d996d5UL, + 0xae07bce9UL, 0xb71c8da8UL, 0x9c31de6bUL, 0x852aef2aUL, 0xca6b79edUL, + 0xd37048acUL, 0xf85d1b6fUL, 0xe1462a2eUL, 0x66de36e1UL, 0x7fc507a0UL, + 0x54e85463UL, 0x4df36522UL, 0x02b2f3e5UL, 0x1ba9c2a4UL, 0x30849167UL, + 0x299fa026UL, 0xe4c5aeb8UL, 0xfdde9ff9UL, 0xd6f3cc3aUL, 0xcfe8fd7bUL, + 0x80a96bbcUL, 0x99b25afdUL, 0xb29f093eUL, 0xab84387fUL, 0x2c1c24b0UL, + 0x350715f1UL, 0x1e2a4632UL, 0x07317773UL, 0x4870e1b4UL, 0x516bd0f5UL, + 0x7a468336UL, 0x635db277UL, 0xcbfad74eUL, 0xd2e1e60fUL, 0xf9ccb5ccUL, + 0xe0d7848dUL, 0xaf96124aUL, 0xb68d230bUL, 0x9da070c8UL, 0x84bb4189UL, + 0x03235d46UL, 0x1a386c07UL, 0x31153fc4UL, 0x280e0e85UL, 0x674f9842UL, + 0x7e54a903UL, 0x5579fac0UL, 0x4c62cb81UL, 0x8138c51fUL, 0x9823f45eUL, + 0xb30ea79dUL, 0xaa1596dcUL, 0xe554001bUL, 0xfc4f315aUL, 0xd7626299UL, + 0xce7953d8UL, 0x49e14f17UL, 0x50fa7e56UL, 0x7bd72d95UL, 0x62cc1cd4UL, + 0x2d8d8a13UL, 0x3496bb52UL, 0x1fbbe891UL, 0x06a0d9d0UL, 0x5e7ef3ecUL, + 0x4765c2adUL, 0x6c48916eUL, 0x7553a02fUL, 0x3a1236e8UL, 0x230907a9UL, + 0x0824546aUL, 0x113f652bUL, 0x96a779e4UL, 0x8fbc48a5UL, 0xa4911b66UL, + 0xbd8a2a27UL, 0xf2cbbce0UL, 0xebd08da1UL, 0xc0fdde62UL, 0xd9e6ef23UL, + 0x14bce1bdUL, 0x0da7d0fcUL, 0x268a833fUL, 0x3f91b27eUL, 0x70d024b9UL, + 0x69cb15f8UL, 0x42e6463bUL, 0x5bfd777aUL, 0xdc656bb5UL, 0xc57e5af4UL, + 0xee530937UL, 0xf7483876UL, 0xb809aeb1UL, 0xa1129ff0UL, 0x8a3fcc33UL, + 0x9324fd72UL + }, + { + 0x00000000UL, 0x01c26a37UL, 0x0384d46eUL, 0x0246be59UL, 0x0709a8dcUL, + 0x06cbc2ebUL, 0x048d7cb2UL, 0x054f1685UL, 0x0e1351b8UL, 0x0fd13b8fUL, + 0x0d9785d6UL, 0x0c55efe1UL, 0x091af964UL, 0x08d89353UL, 0x0a9e2d0aUL, + 0x0b5c473dUL, 0x1c26a370UL, 
0x1de4c947UL, 0x1fa2771eUL, 0x1e601d29UL, + 0x1b2f0bacUL, 0x1aed619bUL, 0x18abdfc2UL, 0x1969b5f5UL, 0x1235f2c8UL, + 0x13f798ffUL, 0x11b126a6UL, 0x10734c91UL, 0x153c5a14UL, 0x14fe3023UL, + 0x16b88e7aUL, 0x177ae44dUL, 0x384d46e0UL, 0x398f2cd7UL, 0x3bc9928eUL, + 0x3a0bf8b9UL, 0x3f44ee3cUL, 0x3e86840bUL, 0x3cc03a52UL, 0x3d025065UL, + 0x365e1758UL, 0x379c7d6fUL, 0x35dac336UL, 0x3418a901UL, 0x3157bf84UL, + 0x3095d5b3UL, 0x32d36beaUL, 0x331101ddUL, 0x246be590UL, 0x25a98fa7UL, + 0x27ef31feUL, 0x262d5bc9UL, 0x23624d4cUL, 0x22a0277bUL, 0x20e69922UL, + 0x2124f315UL, 0x2a78b428UL, 0x2bbade1fUL, 0x29fc6046UL, 0x283e0a71UL, + 0x2d711cf4UL, 0x2cb376c3UL, 0x2ef5c89aUL, 0x2f37a2adUL, 0x709a8dc0UL, + 0x7158e7f7UL, 0x731e59aeUL, 0x72dc3399UL, 0x7793251cUL, 0x76514f2bUL, + 0x7417f172UL, 0x75d59b45UL, 0x7e89dc78UL, 0x7f4bb64fUL, 0x7d0d0816UL, + 0x7ccf6221UL, 0x798074a4UL, 0x78421e93UL, 0x7a04a0caUL, 0x7bc6cafdUL, + 0x6cbc2eb0UL, 0x6d7e4487UL, 0x6f38fadeUL, 0x6efa90e9UL, 0x6bb5866cUL, + 0x6a77ec5bUL, 0x68315202UL, 0x69f33835UL, 0x62af7f08UL, 0x636d153fUL, + 0x612bab66UL, 0x60e9c151UL, 0x65a6d7d4UL, 0x6464bde3UL, 0x662203baUL, + 0x67e0698dUL, 0x48d7cb20UL, 0x4915a117UL, 0x4b531f4eUL, 0x4a917579UL, + 0x4fde63fcUL, 0x4e1c09cbUL, 0x4c5ab792UL, 0x4d98dda5UL, 0x46c49a98UL, + 0x4706f0afUL, 0x45404ef6UL, 0x448224c1UL, 0x41cd3244UL, 0x400f5873UL, + 0x4249e62aUL, 0x438b8c1dUL, 0x54f16850UL, 0x55330267UL, 0x5775bc3eUL, + 0x56b7d609UL, 0x53f8c08cUL, 0x523aaabbUL, 0x507c14e2UL, 0x51be7ed5UL, + 0x5ae239e8UL, 0x5b2053dfUL, 0x5966ed86UL, 0x58a487b1UL, 0x5deb9134UL, + 0x5c29fb03UL, 0x5e6f455aUL, 0x5fad2f6dUL, 0xe1351b80UL, 0xe0f771b7UL, + 0xe2b1cfeeUL, 0xe373a5d9UL, 0xe63cb35cUL, 0xe7fed96bUL, 0xe5b86732UL, + 0xe47a0d05UL, 0xef264a38UL, 0xeee4200fUL, 0xeca29e56UL, 0xed60f461UL, + 0xe82fe2e4UL, 0xe9ed88d3UL, 0xebab368aUL, 0xea695cbdUL, 0xfd13b8f0UL, + 0xfcd1d2c7UL, 0xfe976c9eUL, 0xff5506a9UL, 0xfa1a102cUL, 0xfbd87a1bUL, + 0xf99ec442UL, 0xf85cae75UL, 0xf300e948UL, 0xf2c2837fUL, 0xf0843d26UL, + 
0xf1465711UL, 0xf4094194UL, 0xf5cb2ba3UL, 0xf78d95faUL, 0xf64fffcdUL, + 0xd9785d60UL, 0xd8ba3757UL, 0xdafc890eUL, 0xdb3ee339UL, 0xde71f5bcUL, + 0xdfb39f8bUL, 0xddf521d2UL, 0xdc374be5UL, 0xd76b0cd8UL, 0xd6a966efUL, + 0xd4efd8b6UL, 0xd52db281UL, 0xd062a404UL, 0xd1a0ce33UL, 0xd3e6706aUL, + 0xd2241a5dUL, 0xc55efe10UL, 0xc49c9427UL, 0xc6da2a7eUL, 0xc7184049UL, + 0xc25756ccUL, 0xc3953cfbUL, 0xc1d382a2UL, 0xc011e895UL, 0xcb4dafa8UL, + 0xca8fc59fUL, 0xc8c97bc6UL, 0xc90b11f1UL, 0xcc440774UL, 0xcd866d43UL, + 0xcfc0d31aUL, 0xce02b92dUL, 0x91af9640UL, 0x906dfc77UL, 0x922b422eUL, + 0x93e92819UL, 0x96a63e9cUL, 0x976454abUL, 0x9522eaf2UL, 0x94e080c5UL, + 0x9fbcc7f8UL, 0x9e7eadcfUL, 0x9c381396UL, 0x9dfa79a1UL, 0x98b56f24UL, + 0x99770513UL, 0x9b31bb4aUL, 0x9af3d17dUL, 0x8d893530UL, 0x8c4b5f07UL, + 0x8e0de15eUL, 0x8fcf8b69UL, 0x8a809decUL, 0x8b42f7dbUL, 0x89044982UL, + 0x88c623b5UL, 0x839a6488UL, 0x82580ebfUL, 0x801eb0e6UL, 0x81dcdad1UL, + 0x8493cc54UL, 0x8551a663UL, 0x8717183aUL, 0x86d5720dUL, 0xa9e2d0a0UL, + 0xa820ba97UL, 0xaa6604ceUL, 0xaba46ef9UL, 0xaeeb787cUL, 0xaf29124bUL, + 0xad6fac12UL, 0xacadc625UL, 0xa7f18118UL, 0xa633eb2fUL, 0xa4755576UL, + 0xa5b73f41UL, 0xa0f829c4UL, 0xa13a43f3UL, 0xa37cfdaaUL, 0xa2be979dUL, + 0xb5c473d0UL, 0xb40619e7UL, 0xb640a7beUL, 0xb782cd89UL, 0xb2cddb0cUL, + 0xb30fb13bUL, 0xb1490f62UL, 0xb08b6555UL, 0xbbd72268UL, 0xba15485fUL, + 0xb853f606UL, 0xb9919c31UL, 0xbcde8ab4UL, 0xbd1ce083UL, 0xbf5a5edaUL, + 0xbe9834edUL + }, + { + 0x00000000UL, 0xb8bc6765UL, 0xaa09c88bUL, 0x12b5afeeUL, 0x8f629757UL, + 0x37def032UL, 0x256b5fdcUL, 0x9dd738b9UL, 0xc5b428efUL, 0x7d084f8aUL, + 0x6fbde064UL, 0xd7018701UL, 0x4ad6bfb8UL, 0xf26ad8ddUL, 0xe0df7733UL, + 0x58631056UL, 0x5019579fUL, 0xe8a530faUL, 0xfa109f14UL, 0x42acf871UL, + 0xdf7bc0c8UL, 0x67c7a7adUL, 0x75720843UL, 0xcdce6f26UL, 0x95ad7f70UL, + 0x2d111815UL, 0x3fa4b7fbUL, 0x8718d09eUL, 0x1acfe827UL, 0xa2738f42UL, + 0xb0c620acUL, 0x087a47c9UL, 0xa032af3eUL, 0x188ec85bUL, 0x0a3b67b5UL, + 0xb28700d0UL, 0x2f503869UL, 
0x97ec5f0cUL, 0x8559f0e2UL, 0x3de59787UL, + 0x658687d1UL, 0xdd3ae0b4UL, 0xcf8f4f5aUL, 0x7733283fUL, 0xeae41086UL, + 0x525877e3UL, 0x40edd80dUL, 0xf851bf68UL, 0xf02bf8a1UL, 0x48979fc4UL, + 0x5a22302aUL, 0xe29e574fUL, 0x7f496ff6UL, 0xc7f50893UL, 0xd540a77dUL, + 0x6dfcc018UL, 0x359fd04eUL, 0x8d23b72bUL, 0x9f9618c5UL, 0x272a7fa0UL, + 0xbafd4719UL, 0x0241207cUL, 0x10f48f92UL, 0xa848e8f7UL, 0x9b14583dUL, + 0x23a83f58UL, 0x311d90b6UL, 0x89a1f7d3UL, 0x1476cf6aUL, 0xaccaa80fUL, + 0xbe7f07e1UL, 0x06c36084UL, 0x5ea070d2UL, 0xe61c17b7UL, 0xf4a9b859UL, + 0x4c15df3cUL, 0xd1c2e785UL, 0x697e80e0UL, 0x7bcb2f0eUL, 0xc377486bUL, + 0xcb0d0fa2UL, 0x73b168c7UL, 0x6104c729UL, 0xd9b8a04cUL, 0x446f98f5UL, + 0xfcd3ff90UL, 0xee66507eUL, 0x56da371bUL, 0x0eb9274dUL, 0xb6054028UL, + 0xa4b0efc6UL, 0x1c0c88a3UL, 0x81dbb01aUL, 0x3967d77fUL, 0x2bd27891UL, + 0x936e1ff4UL, 0x3b26f703UL, 0x839a9066UL, 0x912f3f88UL, 0x299358edUL, + 0xb4446054UL, 0x0cf80731UL, 0x1e4da8dfUL, 0xa6f1cfbaUL, 0xfe92dfecUL, + 0x462eb889UL, 0x549b1767UL, 0xec277002UL, 0x71f048bbUL, 0xc94c2fdeUL, + 0xdbf98030UL, 0x6345e755UL, 0x6b3fa09cUL, 0xd383c7f9UL, 0xc1366817UL, + 0x798a0f72UL, 0xe45d37cbUL, 0x5ce150aeUL, 0x4e54ff40UL, 0xf6e89825UL, + 0xae8b8873UL, 0x1637ef16UL, 0x048240f8UL, 0xbc3e279dUL, 0x21e91f24UL, + 0x99557841UL, 0x8be0d7afUL, 0x335cb0caUL, 0xed59b63bUL, 0x55e5d15eUL, + 0x47507eb0UL, 0xffec19d5UL, 0x623b216cUL, 0xda874609UL, 0xc832e9e7UL, + 0x708e8e82UL, 0x28ed9ed4UL, 0x9051f9b1UL, 0x82e4565fUL, 0x3a58313aUL, + 0xa78f0983UL, 0x1f336ee6UL, 0x0d86c108UL, 0xb53aa66dUL, 0xbd40e1a4UL, + 0x05fc86c1UL, 0x1749292fUL, 0xaff54e4aUL, 0x322276f3UL, 0x8a9e1196UL, + 0x982bbe78UL, 0x2097d91dUL, 0x78f4c94bUL, 0xc048ae2eUL, 0xd2fd01c0UL, + 0x6a4166a5UL, 0xf7965e1cUL, 0x4f2a3979UL, 0x5d9f9697UL, 0xe523f1f2UL, + 0x4d6b1905UL, 0xf5d77e60UL, 0xe762d18eUL, 0x5fdeb6ebUL, 0xc2098e52UL, + 0x7ab5e937UL, 0x680046d9UL, 0xd0bc21bcUL, 0x88df31eaUL, 0x3063568fUL, + 0x22d6f961UL, 0x9a6a9e04UL, 0x07bda6bdUL, 0xbf01c1d8UL, 0xadb46e36UL, + 
0x15080953UL, 0x1d724e9aUL, 0xa5ce29ffUL, 0xb77b8611UL, 0x0fc7e174UL, + 0x9210d9cdUL, 0x2aacbea8UL, 0x38191146UL, 0x80a57623UL, 0xd8c66675UL, + 0x607a0110UL, 0x72cfaefeUL, 0xca73c99bUL, 0x57a4f122UL, 0xef189647UL, + 0xfdad39a9UL, 0x45115eccUL, 0x764dee06UL, 0xcef18963UL, 0xdc44268dUL, + 0x64f841e8UL, 0xf92f7951UL, 0x41931e34UL, 0x5326b1daUL, 0xeb9ad6bfUL, + 0xb3f9c6e9UL, 0x0b45a18cUL, 0x19f00e62UL, 0xa14c6907UL, 0x3c9b51beUL, + 0x842736dbUL, 0x96929935UL, 0x2e2efe50UL, 0x2654b999UL, 0x9ee8defcUL, + 0x8c5d7112UL, 0x34e11677UL, 0xa9362eceUL, 0x118a49abUL, 0x033fe645UL, + 0xbb838120UL, 0xe3e09176UL, 0x5b5cf613UL, 0x49e959fdUL, 0xf1553e98UL, + 0x6c820621UL, 0xd43e6144UL, 0xc68bceaaUL, 0x7e37a9cfUL, 0xd67f4138UL, + 0x6ec3265dUL, 0x7c7689b3UL, 0xc4caeed6UL, 0x591dd66fUL, 0xe1a1b10aUL, + 0xf3141ee4UL, 0x4ba87981UL, 0x13cb69d7UL, 0xab770eb2UL, 0xb9c2a15cUL, + 0x017ec639UL, 0x9ca9fe80UL, 0x241599e5UL, 0x36a0360bUL, 0x8e1c516eUL, + 0x866616a7UL, 0x3eda71c2UL, 0x2c6fde2cUL, 0x94d3b949UL, 0x090481f0UL, + 0xb1b8e695UL, 0xa30d497bUL, 0x1bb12e1eUL, 0x43d23e48UL, 0xfb6e592dUL, + 0xe9dbf6c3UL, 0x516791a6UL, 0xccb0a91fUL, 0x740cce7aUL, 0x66b96194UL, + 0xde0506f1UL + }, + { + 0x00000000UL, 0x96300777UL, 0x2c610eeeUL, 0xba510999UL, 0x19c46d07UL, + 0x8ff46a70UL, 0x35a563e9UL, 0xa395649eUL, 0x3288db0eUL, 0xa4b8dc79UL, + 0x1ee9d5e0UL, 0x88d9d297UL, 0x2b4cb609UL, 0xbd7cb17eUL, 0x072db8e7UL, + 0x911dbf90UL, 0x6410b71dUL, 0xf220b06aUL, 0x4871b9f3UL, 0xde41be84UL, + 0x7dd4da1aUL, 0xebe4dd6dUL, 0x51b5d4f4UL, 0xc785d383UL, 0x56986c13UL, + 0xc0a86b64UL, 0x7af962fdUL, 0xecc9658aUL, 0x4f5c0114UL, 0xd96c0663UL, + 0x633d0ffaUL, 0xf50d088dUL, 0xc8206e3bUL, 0x5e10694cUL, 0xe44160d5UL, + 0x727167a2UL, 0xd1e4033cUL, 0x47d4044bUL, 0xfd850dd2UL, 0x6bb50aa5UL, + 0xfaa8b535UL, 0x6c98b242UL, 0xd6c9bbdbUL, 0x40f9bcacUL, 0xe36cd832UL, + 0x755cdf45UL, 0xcf0dd6dcUL, 0x593dd1abUL, 0xac30d926UL, 0x3a00de51UL, + 0x8051d7c8UL, 0x1661d0bfUL, 0xb5f4b421UL, 0x23c4b356UL, 0x9995bacfUL, + 0x0fa5bdb8UL, 0x9eb80228UL, 
0x0888055fUL, 0xb2d90cc6UL, 0x24e90bb1UL, + 0x877c6f2fUL, 0x114c6858UL, 0xab1d61c1UL, 0x3d2d66b6UL, 0x9041dc76UL, + 0x0671db01UL, 0xbc20d298UL, 0x2a10d5efUL, 0x8985b171UL, 0x1fb5b606UL, + 0xa5e4bf9fUL, 0x33d4b8e8UL, 0xa2c90778UL, 0x34f9000fUL, 0x8ea80996UL, + 0x18980ee1UL, 0xbb0d6a7fUL, 0x2d3d6d08UL, 0x976c6491UL, 0x015c63e6UL, + 0xf4516b6bUL, 0x62616c1cUL, 0xd8306585UL, 0x4e0062f2UL, 0xed95066cUL, + 0x7ba5011bUL, 0xc1f40882UL, 0x57c40ff5UL, 0xc6d9b065UL, 0x50e9b712UL, + 0xeab8be8bUL, 0x7c88b9fcUL, 0xdf1ddd62UL, 0x492dda15UL, 0xf37cd38cUL, + 0x654cd4fbUL, 0x5861b24dUL, 0xce51b53aUL, 0x7400bca3UL, 0xe230bbd4UL, + 0x41a5df4aUL, 0xd795d83dUL, 0x6dc4d1a4UL, 0xfbf4d6d3UL, 0x6ae96943UL, + 0xfcd96e34UL, 0x468867adUL, 0xd0b860daUL, 0x732d0444UL, 0xe51d0333UL, + 0x5f4c0aaaUL, 0xc97c0dddUL, 0x3c710550UL, 0xaa410227UL, 0x10100bbeUL, + 0x86200cc9UL, 0x25b56857UL, 0xb3856f20UL, 0x09d466b9UL, 0x9fe461ceUL, + 0x0ef9de5eUL, 0x98c9d929UL, 0x2298d0b0UL, 0xb4a8d7c7UL, 0x173db359UL, + 0x810db42eUL, 0x3b5cbdb7UL, 0xad6cbac0UL, 0x2083b8edUL, 0xb6b3bf9aUL, + 0x0ce2b603UL, 0x9ad2b174UL, 0x3947d5eaUL, 0xaf77d29dUL, 0x1526db04UL, + 0x8316dc73UL, 0x120b63e3UL, 0x843b6494UL, 0x3e6a6d0dUL, 0xa85a6a7aUL, + 0x0bcf0ee4UL, 0x9dff0993UL, 0x27ae000aUL, 0xb19e077dUL, 0x44930ff0UL, + 0xd2a30887UL, 0x68f2011eUL, 0xfec20669UL, 0x5d5762f7UL, 0xcb676580UL, + 0x71366c19UL, 0xe7066b6eUL, 0x761bd4feUL, 0xe02bd389UL, 0x5a7ada10UL, + 0xcc4add67UL, 0x6fdfb9f9UL, 0xf9efbe8eUL, 0x43beb717UL, 0xd58eb060UL, + 0xe8a3d6d6UL, 0x7e93d1a1UL, 0xc4c2d838UL, 0x52f2df4fUL, 0xf167bbd1UL, + 0x6757bca6UL, 0xdd06b53fUL, 0x4b36b248UL, 0xda2b0dd8UL, 0x4c1b0aafUL, + 0xf64a0336UL, 0x607a0441UL, 0xc3ef60dfUL, 0x55df67a8UL, 0xef8e6e31UL, + 0x79be6946UL, 0x8cb361cbUL, 0x1a8366bcUL, 0xa0d26f25UL, 0x36e26852UL, + 0x95770cccUL, 0x03470bbbUL, 0xb9160222UL, 0x2f260555UL, 0xbe3bbac5UL, + 0x280bbdb2UL, 0x925ab42bUL, 0x046ab35cUL, 0xa7ffd7c2UL, 0x31cfd0b5UL, + 0x8b9ed92cUL, 0x1daede5bUL, 0xb0c2649bUL, 0x26f263ecUL, 0x9ca36a75UL, + 
0x0a936d02UL, 0xa906099cUL, 0x3f360eebUL, 0x85670772UL, 0x13570005UL, + 0x824abf95UL, 0x147ab8e2UL, 0xae2bb17bUL, 0x381bb60cUL, 0x9b8ed292UL, + 0x0dbed5e5UL, 0xb7efdc7cUL, 0x21dfdb0bUL, 0xd4d2d386UL, 0x42e2d4f1UL, + 0xf8b3dd68UL, 0x6e83da1fUL, 0xcd16be81UL, 0x5b26b9f6UL, 0xe177b06fUL, + 0x7747b718UL, 0xe65a0888UL, 0x706a0fffUL, 0xca3b0666UL, 0x5c0b0111UL, + 0xff9e658fUL, 0x69ae62f8UL, 0xd3ff6b61UL, 0x45cf6c16UL, 0x78e20aa0UL, + 0xeed20dd7UL, 0x5483044eUL, 0xc2b30339UL, 0x612667a7UL, 0xf71660d0UL, + 0x4d476949UL, 0xdb776e3eUL, 0x4a6ad1aeUL, 0xdc5ad6d9UL, 0x660bdf40UL, + 0xf03bd837UL, 0x53aebca9UL, 0xc59ebbdeUL, 0x7fcfb247UL, 0xe9ffb530UL, + 0x1cf2bdbdUL, 0x8ac2bacaUL, 0x3093b353UL, 0xa6a3b424UL, 0x0536d0baUL, + 0x9306d7cdUL, 0x2957de54UL, 0xbf67d923UL, 0x2e7a66b3UL, 0xb84a61c4UL, + 0x021b685dUL, 0x942b6f2aUL, 0x37be0bb4UL, 0xa18e0cc3UL, 0x1bdf055aUL, + 0x8def022dUL + }, + { + 0x00000000UL, 0x41311b19UL, 0x82623632UL, 0xc3532d2bUL, 0x04c56c64UL, + 0x45f4777dUL, 0x86a75a56UL, 0xc796414fUL, 0x088ad9c8UL, 0x49bbc2d1UL, + 0x8ae8effaUL, 0xcbd9f4e3UL, 0x0c4fb5acUL, 0x4d7eaeb5UL, 0x8e2d839eUL, + 0xcf1c9887UL, 0x5112c24aUL, 0x1023d953UL, 0xd370f478UL, 0x9241ef61UL, + 0x55d7ae2eUL, 0x14e6b537UL, 0xd7b5981cUL, 0x96848305UL, 0x59981b82UL, + 0x18a9009bUL, 0xdbfa2db0UL, 0x9acb36a9UL, 0x5d5d77e6UL, 0x1c6c6cffUL, + 0xdf3f41d4UL, 0x9e0e5acdUL, 0xa2248495UL, 0xe3159f8cUL, 0x2046b2a7UL, + 0x6177a9beUL, 0xa6e1e8f1UL, 0xe7d0f3e8UL, 0x2483dec3UL, 0x65b2c5daUL, + 0xaaae5d5dUL, 0xeb9f4644UL, 0x28cc6b6fUL, 0x69fd7076UL, 0xae6b3139UL, + 0xef5a2a20UL, 0x2c09070bUL, 0x6d381c12UL, 0xf33646dfUL, 0xb2075dc6UL, + 0x715470edUL, 0x30656bf4UL, 0xf7f32abbUL, 0xb6c231a2UL, 0x75911c89UL, + 0x34a00790UL, 0xfbbc9f17UL, 0xba8d840eUL, 0x79dea925UL, 0x38efb23cUL, + 0xff79f373UL, 0xbe48e86aUL, 0x7d1bc541UL, 0x3c2ade58UL, 0x054f79f0UL, + 0x447e62e9UL, 0x872d4fc2UL, 0xc61c54dbUL, 0x018a1594UL, 0x40bb0e8dUL, + 0x83e823a6UL, 0xc2d938bfUL, 0x0dc5a038UL, 0x4cf4bb21UL, 0x8fa7960aUL, + 0xce968d13UL, 0x0900cc5cUL, 
0x4831d745UL, 0x8b62fa6eUL, 0xca53e177UL, + 0x545dbbbaUL, 0x156ca0a3UL, 0xd63f8d88UL, 0x970e9691UL, 0x5098d7deUL, + 0x11a9ccc7UL, 0xd2fae1ecUL, 0x93cbfaf5UL, 0x5cd76272UL, 0x1de6796bUL, + 0xdeb55440UL, 0x9f844f59UL, 0x58120e16UL, 0x1923150fUL, 0xda703824UL, + 0x9b41233dUL, 0xa76bfd65UL, 0xe65ae67cUL, 0x2509cb57UL, 0x6438d04eUL, + 0xa3ae9101UL, 0xe29f8a18UL, 0x21cca733UL, 0x60fdbc2aUL, 0xafe124adUL, + 0xeed03fb4UL, 0x2d83129fUL, 0x6cb20986UL, 0xab2448c9UL, 0xea1553d0UL, + 0x29467efbUL, 0x687765e2UL, 0xf6793f2fUL, 0xb7482436UL, 0x741b091dUL, + 0x352a1204UL, 0xf2bc534bUL, 0xb38d4852UL, 0x70de6579UL, 0x31ef7e60UL, + 0xfef3e6e7UL, 0xbfc2fdfeUL, 0x7c91d0d5UL, 0x3da0cbccUL, 0xfa368a83UL, + 0xbb07919aUL, 0x7854bcb1UL, 0x3965a7a8UL, 0x4b98833bUL, 0x0aa99822UL, + 0xc9fab509UL, 0x88cbae10UL, 0x4f5def5fUL, 0x0e6cf446UL, 0xcd3fd96dUL, + 0x8c0ec274UL, 0x43125af3UL, 0x022341eaUL, 0xc1706cc1UL, 0x804177d8UL, + 0x47d73697UL, 0x06e62d8eUL, 0xc5b500a5UL, 0x84841bbcUL, 0x1a8a4171UL, + 0x5bbb5a68UL, 0x98e87743UL, 0xd9d96c5aUL, 0x1e4f2d15UL, 0x5f7e360cUL, + 0x9c2d1b27UL, 0xdd1c003eUL, 0x120098b9UL, 0x533183a0UL, 0x9062ae8bUL, + 0xd153b592UL, 0x16c5f4ddUL, 0x57f4efc4UL, 0x94a7c2efUL, 0xd596d9f6UL, + 0xe9bc07aeUL, 0xa88d1cb7UL, 0x6bde319cUL, 0x2aef2a85UL, 0xed796bcaUL, + 0xac4870d3UL, 0x6f1b5df8UL, 0x2e2a46e1UL, 0xe136de66UL, 0xa007c57fUL, + 0x6354e854UL, 0x2265f34dUL, 0xe5f3b202UL, 0xa4c2a91bUL, 0x67918430UL, + 0x26a09f29UL, 0xb8aec5e4UL, 0xf99fdefdUL, 0x3accf3d6UL, 0x7bfde8cfUL, + 0xbc6ba980UL, 0xfd5ab299UL, 0x3e099fb2UL, 0x7f3884abUL, 0xb0241c2cUL, + 0xf1150735UL, 0x32462a1eUL, 0x73773107UL, 0xb4e17048UL, 0xf5d06b51UL, + 0x3683467aUL, 0x77b25d63UL, 0x4ed7facbUL, 0x0fe6e1d2UL, 0xccb5ccf9UL, + 0x8d84d7e0UL, 0x4a1296afUL, 0x0b238db6UL, 0xc870a09dUL, 0x8941bb84UL, + 0x465d2303UL, 0x076c381aUL, 0xc43f1531UL, 0x850e0e28UL, 0x42984f67UL, + 0x03a9547eUL, 0xc0fa7955UL, 0x81cb624cUL, 0x1fc53881UL, 0x5ef42398UL, + 0x9da70eb3UL, 0xdc9615aaUL, 0x1b0054e5UL, 0x5a314ffcUL, 0x996262d7UL, + 
0xd85379ceUL, 0x174fe149UL, 0x567efa50UL, 0x952dd77bUL, 0xd41ccc62UL, + 0x138a8d2dUL, 0x52bb9634UL, 0x91e8bb1fUL, 0xd0d9a006UL, 0xecf37e5eUL, + 0xadc26547UL, 0x6e91486cUL, 0x2fa05375UL, 0xe836123aUL, 0xa9070923UL, + 0x6a542408UL, 0x2b653f11UL, 0xe479a796UL, 0xa548bc8fUL, 0x661b91a4UL, + 0x272a8abdUL, 0xe0bccbf2UL, 0xa18dd0ebUL, 0x62defdc0UL, 0x23efe6d9UL, + 0xbde1bc14UL, 0xfcd0a70dUL, 0x3f838a26UL, 0x7eb2913fUL, 0xb924d070UL, + 0xf815cb69UL, 0x3b46e642UL, 0x7a77fd5bUL, 0xb56b65dcUL, 0xf45a7ec5UL, + 0x370953eeUL, 0x763848f7UL, 0xb1ae09b8UL, 0xf09f12a1UL, 0x33cc3f8aUL, + 0x72fd2493UL + }, + { + 0x00000000UL, 0x376ac201UL, 0x6ed48403UL, 0x59be4602UL, 0xdca80907UL, + 0xebc2cb06UL, 0xb27c8d04UL, 0x85164f05UL, 0xb851130eUL, 0x8f3bd10fUL, + 0xd685970dUL, 0xe1ef550cUL, 0x64f91a09UL, 0x5393d808UL, 0x0a2d9e0aUL, + 0x3d475c0bUL, 0x70a3261cUL, 0x47c9e41dUL, 0x1e77a21fUL, 0x291d601eUL, + 0xac0b2f1bUL, 0x9b61ed1aUL, 0xc2dfab18UL, 0xf5b56919UL, 0xc8f23512UL, + 0xff98f713UL, 0xa626b111UL, 0x914c7310UL, 0x145a3c15UL, 0x2330fe14UL, + 0x7a8eb816UL, 0x4de47a17UL, 0xe0464d38UL, 0xd72c8f39UL, 0x8e92c93bUL, + 0xb9f80b3aUL, 0x3cee443fUL, 0x0b84863eUL, 0x523ac03cUL, 0x6550023dUL, + 0x58175e36UL, 0x6f7d9c37UL, 0x36c3da35UL, 0x01a91834UL, 0x84bf5731UL, + 0xb3d59530UL, 0xea6bd332UL, 0xdd011133UL, 0x90e56b24UL, 0xa78fa925UL, + 0xfe31ef27UL, 0xc95b2d26UL, 0x4c4d6223UL, 0x7b27a022UL, 0x2299e620UL, + 0x15f32421UL, 0x28b4782aUL, 0x1fdeba2bUL, 0x4660fc29UL, 0x710a3e28UL, + 0xf41c712dUL, 0xc376b32cUL, 0x9ac8f52eUL, 0xada2372fUL, 0xc08d9a70UL, + 0xf7e75871UL, 0xae591e73UL, 0x9933dc72UL, 0x1c259377UL, 0x2b4f5176UL, + 0x72f11774UL, 0x459bd575UL, 0x78dc897eUL, 0x4fb64b7fUL, 0x16080d7dUL, + 0x2162cf7cUL, 0xa4748079UL, 0x931e4278UL, 0xcaa0047aUL, 0xfdcac67bUL, + 0xb02ebc6cUL, 0x87447e6dUL, 0xdefa386fUL, 0xe990fa6eUL, 0x6c86b56bUL, + 0x5bec776aUL, 0x02523168UL, 0x3538f369UL, 0x087faf62UL, 0x3f156d63UL, + 0x66ab2b61UL, 0x51c1e960UL, 0xd4d7a665UL, 0xe3bd6464UL, 0xba032266UL, + 0x8d69e067UL, 0x20cbd748UL, 
0x17a11549UL, 0x4e1f534bUL, 0x7975914aUL, + 0xfc63de4fUL, 0xcb091c4eUL, 0x92b75a4cUL, 0xa5dd984dUL, 0x989ac446UL, + 0xaff00647UL, 0xf64e4045UL, 0xc1248244UL, 0x4432cd41UL, 0x73580f40UL, + 0x2ae64942UL, 0x1d8c8b43UL, 0x5068f154UL, 0x67023355UL, 0x3ebc7557UL, + 0x09d6b756UL, 0x8cc0f853UL, 0xbbaa3a52UL, 0xe2147c50UL, 0xd57ebe51UL, + 0xe839e25aUL, 0xdf53205bUL, 0x86ed6659UL, 0xb187a458UL, 0x3491eb5dUL, + 0x03fb295cUL, 0x5a456f5eUL, 0x6d2fad5fUL, 0x801b35e1UL, 0xb771f7e0UL, + 0xeecfb1e2UL, 0xd9a573e3UL, 0x5cb33ce6UL, 0x6bd9fee7UL, 0x3267b8e5UL, + 0x050d7ae4UL, 0x384a26efUL, 0x0f20e4eeUL, 0x569ea2ecUL, 0x61f460edUL, + 0xe4e22fe8UL, 0xd388ede9UL, 0x8a36abebUL, 0xbd5c69eaUL, 0xf0b813fdUL, + 0xc7d2d1fcUL, 0x9e6c97feUL, 0xa90655ffUL, 0x2c101afaUL, 0x1b7ad8fbUL, + 0x42c49ef9UL, 0x75ae5cf8UL, 0x48e900f3UL, 0x7f83c2f2UL, 0x263d84f0UL, + 0x115746f1UL, 0x944109f4UL, 0xa32bcbf5UL, 0xfa958df7UL, 0xcdff4ff6UL, + 0x605d78d9UL, 0x5737bad8UL, 0x0e89fcdaUL, 0x39e33edbUL, 0xbcf571deUL, + 0x8b9fb3dfUL, 0xd221f5ddUL, 0xe54b37dcUL, 0xd80c6bd7UL, 0xef66a9d6UL, + 0xb6d8efd4UL, 0x81b22dd5UL, 0x04a462d0UL, 0x33cea0d1UL, 0x6a70e6d3UL, + 0x5d1a24d2UL, 0x10fe5ec5UL, 0x27949cc4UL, 0x7e2adac6UL, 0x494018c7UL, + 0xcc5657c2UL, 0xfb3c95c3UL, 0xa282d3c1UL, 0x95e811c0UL, 0xa8af4dcbUL, + 0x9fc58fcaUL, 0xc67bc9c8UL, 0xf1110bc9UL, 0x740744ccUL, 0x436d86cdUL, + 0x1ad3c0cfUL, 0x2db902ceUL, 0x4096af91UL, 0x77fc6d90UL, 0x2e422b92UL, + 0x1928e993UL, 0x9c3ea696UL, 0xab546497UL, 0xf2ea2295UL, 0xc580e094UL, + 0xf8c7bc9fUL, 0xcfad7e9eUL, 0x9613389cUL, 0xa179fa9dUL, 0x246fb598UL, + 0x13057799UL, 0x4abb319bUL, 0x7dd1f39aUL, 0x3035898dUL, 0x075f4b8cUL, + 0x5ee10d8eUL, 0x698bcf8fUL, 0xec9d808aUL, 0xdbf7428bUL, 0x82490489UL, + 0xb523c688UL, 0x88649a83UL, 0xbf0e5882UL, 0xe6b01e80UL, 0xd1dadc81UL, + 0x54cc9384UL, 0x63a65185UL, 0x3a181787UL, 0x0d72d586UL, 0xa0d0e2a9UL, + 0x97ba20a8UL, 0xce0466aaUL, 0xf96ea4abUL, 0x7c78ebaeUL, 0x4b1229afUL, + 0x12ac6fadUL, 0x25c6adacUL, 0x1881f1a7UL, 0x2feb33a6UL, 0x765575a4UL, + 
0x413fb7a5UL, 0xc429f8a0UL, 0xf3433aa1UL, 0xaafd7ca3UL, 0x9d97bea2UL, + 0xd073c4b5UL, 0xe71906b4UL, 0xbea740b6UL, 0x89cd82b7UL, 0x0cdbcdb2UL, + 0x3bb10fb3UL, 0x620f49b1UL, 0x55658bb0UL, 0x6822d7bbUL, 0x5f4815baUL, + 0x06f653b8UL, 0x319c91b9UL, 0xb48adebcUL, 0x83e01cbdUL, 0xda5e5abfUL, + 0xed3498beUL + }, + { + 0x00000000UL, 0x6567bcb8UL, 0x8bc809aaUL, 0xeeafb512UL, 0x5797628fUL, + 0x32f0de37UL, 0xdc5f6b25UL, 0xb938d79dUL, 0xef28b4c5UL, 0x8a4f087dUL, + 0x64e0bd6fUL, 0x018701d7UL, 0xb8bfd64aUL, 0xddd86af2UL, 0x3377dfe0UL, + 0x56106358UL, 0x9f571950UL, 0xfa30a5e8UL, 0x149f10faUL, 0x71f8ac42UL, + 0xc8c07bdfUL, 0xada7c767UL, 0x43087275UL, 0x266fcecdUL, 0x707fad95UL, + 0x1518112dUL, 0xfbb7a43fUL, 0x9ed01887UL, 0x27e8cf1aUL, 0x428f73a2UL, + 0xac20c6b0UL, 0xc9477a08UL, 0x3eaf32a0UL, 0x5bc88e18UL, 0xb5673b0aUL, + 0xd00087b2UL, 0x6938502fUL, 0x0c5fec97UL, 0xe2f05985UL, 0x8797e53dUL, + 0xd1878665UL, 0xb4e03addUL, 0x5a4f8fcfUL, 0x3f283377UL, 0x8610e4eaUL, + 0xe3775852UL, 0x0dd8ed40UL, 0x68bf51f8UL, 0xa1f82bf0UL, 0xc49f9748UL, + 0x2a30225aUL, 0x4f579ee2UL, 0xf66f497fUL, 0x9308f5c7UL, 0x7da740d5UL, + 0x18c0fc6dUL, 0x4ed09f35UL, 0x2bb7238dUL, 0xc518969fUL, 0xa07f2a27UL, + 0x1947fdbaUL, 0x7c204102UL, 0x928ff410UL, 0xf7e848a8UL, 0x3d58149bUL, + 0x583fa823UL, 0xb6901d31UL, 0xd3f7a189UL, 0x6acf7614UL, 0x0fa8caacUL, + 0xe1077fbeUL, 0x8460c306UL, 0xd270a05eUL, 0xb7171ce6UL, 0x59b8a9f4UL, + 0x3cdf154cUL, 0x85e7c2d1UL, 0xe0807e69UL, 0x0e2fcb7bUL, 0x6b4877c3UL, + 0xa20f0dcbUL, 0xc768b173UL, 0x29c70461UL, 0x4ca0b8d9UL, 0xf5986f44UL, + 0x90ffd3fcUL, 0x7e5066eeUL, 0x1b37da56UL, 0x4d27b90eUL, 0x284005b6UL, + 0xc6efb0a4UL, 0xa3880c1cUL, 0x1ab0db81UL, 0x7fd76739UL, 0x9178d22bUL, + 0xf41f6e93UL, 0x03f7263bUL, 0x66909a83UL, 0x883f2f91UL, 0xed589329UL, + 0x546044b4UL, 0x3107f80cUL, 0xdfa84d1eUL, 0xbacff1a6UL, 0xecdf92feUL, + 0x89b82e46UL, 0x67179b54UL, 0x027027ecUL, 0xbb48f071UL, 0xde2f4cc9UL, + 0x3080f9dbUL, 0x55e74563UL, 0x9ca03f6bUL, 0xf9c783d3UL, 0x176836c1UL, + 0x720f8a79UL, 0xcb375de4UL, 
0xae50e15cUL, 0x40ff544eUL, 0x2598e8f6UL, + 0x73888baeUL, 0x16ef3716UL, 0xf8408204UL, 0x9d273ebcUL, 0x241fe921UL, + 0x41785599UL, 0xafd7e08bUL, 0xcab05c33UL, 0x3bb659edUL, 0x5ed1e555UL, + 0xb07e5047UL, 0xd519ecffUL, 0x6c213b62UL, 0x094687daUL, 0xe7e932c8UL, + 0x828e8e70UL, 0xd49eed28UL, 0xb1f95190UL, 0x5f56e482UL, 0x3a31583aUL, + 0x83098fa7UL, 0xe66e331fUL, 0x08c1860dUL, 0x6da63ab5UL, 0xa4e140bdUL, + 0xc186fc05UL, 0x2f294917UL, 0x4a4ef5afUL, 0xf3762232UL, 0x96119e8aUL, + 0x78be2b98UL, 0x1dd99720UL, 0x4bc9f478UL, 0x2eae48c0UL, 0xc001fdd2UL, + 0xa566416aUL, 0x1c5e96f7UL, 0x79392a4fUL, 0x97969f5dUL, 0xf2f123e5UL, + 0x05196b4dUL, 0x607ed7f5UL, 0x8ed162e7UL, 0xebb6de5fUL, 0x528e09c2UL, + 0x37e9b57aUL, 0xd9460068UL, 0xbc21bcd0UL, 0xea31df88UL, 0x8f566330UL, + 0x61f9d622UL, 0x049e6a9aUL, 0xbda6bd07UL, 0xd8c101bfUL, 0x366eb4adUL, + 0x53090815UL, 0x9a4e721dUL, 0xff29cea5UL, 0x11867bb7UL, 0x74e1c70fUL, + 0xcdd91092UL, 0xa8beac2aUL, 0x46111938UL, 0x2376a580UL, 0x7566c6d8UL, + 0x10017a60UL, 0xfeaecf72UL, 0x9bc973caUL, 0x22f1a457UL, 0x479618efUL, + 0xa939adfdUL, 0xcc5e1145UL, 0x06ee4d76UL, 0x6389f1ceUL, 0x8d2644dcUL, + 0xe841f864UL, 0x51792ff9UL, 0x341e9341UL, 0xdab12653UL, 0xbfd69aebUL, + 0xe9c6f9b3UL, 0x8ca1450bUL, 0x620ef019UL, 0x07694ca1UL, 0xbe519b3cUL, + 0xdb362784UL, 0x35999296UL, 0x50fe2e2eUL, 0x99b95426UL, 0xfcdee89eUL, + 0x12715d8cUL, 0x7716e134UL, 0xce2e36a9UL, 0xab498a11UL, 0x45e63f03UL, + 0x208183bbUL, 0x7691e0e3UL, 0x13f65c5bUL, 0xfd59e949UL, 0x983e55f1UL, + 0x2106826cUL, 0x44613ed4UL, 0xaace8bc6UL, 0xcfa9377eUL, 0x38417fd6UL, + 0x5d26c36eUL, 0xb389767cUL, 0xd6eecac4UL, 0x6fd61d59UL, 0x0ab1a1e1UL, + 0xe41e14f3UL, 0x8179a84bUL, 0xd769cb13UL, 0xb20e77abUL, 0x5ca1c2b9UL, + 0x39c67e01UL, 0x80fea99cUL, 0xe5991524UL, 0x0b36a036UL, 0x6e511c8eUL, + 0xa7166686UL, 0xc271da3eUL, 0x2cde6f2cUL, 0x49b9d394UL, 0xf0810409UL, + 0x95e6b8b1UL, 0x7b490da3UL, 0x1e2eb11bUL, 0x483ed243UL, 0x2d596efbUL, + 0xc3f6dbe9UL, 0xa6916751UL, 0x1fa9b0ccUL, 0x7ace0c74UL, 0x9461b966UL, + 
0xf10605deUL +#endif + } +}; diff --git a/crc32c.c b/crc32c.c new file mode 100644 index 00000000..156cba19 --- /dev/null +++ b/crc32c.c @@ -0,0 +1,104 @@ +/* + * Oct 28, 2015 Song Liu simplified the code and port it to mdadm + * + * Aug 8, 2011 Bob Pearson with help from Joakim Tjernlund and George Spelvin + * cleaned up code to current version of sparse and added the slicing-by-8 + * algorithm to the closely similar existing slicing-by-4 algorithm. + * + * Oct 15, 2000 Matt Domsch + * Nicer crc32 functions/docs submitted by linux@horizon.com. Thanks! + * Code was from the public domain, copyright abandoned. Code was + * subsequently included in the kernel, thus was re-licensed under the + * GNU GPL v2. + * + * Oct 12, 2000 Matt Domsch + * Same crc32 function was used in 5 other places in the kernel. + * I made one version, and deleted the others. + * There are various incantations of crc32(). Some use a seed of 0 or ~0. + * Some xor at the end with ~0. The generic crc32() function takes + * seed as an argument, and doesn't xor at the end. Then individual + * users can do whatever they need. + * drivers/net/smc9194.c uses seed ~0, doesn't xor with ~0. + * fs/jffs2 uses seed 0, doesn't xor with ~0. + * fs/partitions/efi.c uses seed ~0, xor's with ~0. + * + * This source code is licensed under the GNU General Public License, + * Version 2. See the file COPYING for more details. + */ + +#include +#include +#include + +/* + * There are multiple 16-bit CRC polynomials in common use, but this is + * *the* standard CRC-32 polynomial, first popularized by Ethernet. + * x^32+x^26+x^23+x^22+x^16+x^12+x^11+x^10+x^8+x^7+x^5+x^4+x^2+x^1+x^0 + */ +#define CRCPOLY_LE 0xedb88320 +#define CRCPOLY_BE 0x04c11db7 + +/* + * This is the CRC32c polynomial, as outlined by Castagnoli. 
+ * x^32+x^28+x^27+x^26+x^25+x^23+x^22+x^20+x^19+x^18+x^14+x^13+x^11+x^10+x^9+ + * x^8+x^6+x^0 + */ +#define CRC32C_POLY_LE 0x82F63B78 + +/** + * crc32_le_generic() - Calculate bitwise little-endian Ethernet AUTODIN II + * CRC32/CRC32C + * @crc: seed value for computation. ~0 for Ethernet, sometimes 0 for other + * uses, or the previous crc32/crc32c value if computing incrementally. + * @p: pointer to buffer over which CRC32/CRC32C is run + * @len: length of buffer @p + * @polynomial: CRC32/CRC32c LE polynomial + */ +static inline __u32 crc32_le_generic(__u32 crc, unsigned char const *p, + size_t len, __u32 polynomial) +{ + int i; + while (len--) { + crc ^= *p++; + for (i = 0; i < 8; i++) + crc = (crc >> 1) ^ ((crc & 1) ? polynomial : 0); + } + return crc; +} + +__u32 crc32_le(__u32 crc, unsigned char const *p, size_t len) +{ + return crc32_le_generic(crc, p, len, CRCPOLY_LE); +} + +__u32 crc32c_le(__u32 crc, unsigned char const *p, size_t len) +{ + return crc32_le_generic(crc, p, len, CRC32C_POLY_LE); +} + +/** + * crc32_be_generic() - Calculate bitwise big-endian Ethernet AUTODIN II CRC32 + * @crc: seed value for computation. ~0 for Ethernet, sometimes 0 for + * other uses, or the previous crc32 value if computing incrementally. + * @p: pointer to buffer over which CRC32 is run + * @len: length of buffer @p + * @polynomial: CRC32 BE polynomial + */ +static inline __u32 crc32_be_generic(__u32 crc, unsigned char const *p, + size_t len, __u32 polynomial) +{ + int i; + while (len--) { + crc ^= *p++ << 24; + for (i = 0; i < 8; i++) + crc = + (crc << 1) ^ ((crc & 0x80000000) ? 
polynomial : + 0); + } + return crc; +} + +__u32 crc32_be(__u32 crc, unsigned char const *p, size_t len) +{ + return crc32_be_generic(crc, p, len, CRCPOLY_BE); +} diff --git a/debian/FAQ b/debian/FAQ new file mode 100644 index 00000000..325fcb59 --- /dev/null +++ b/debian/FAQ @@ -0,0 +1,581 @@ +Frequently asked questions -- Debian mdadm +========================================== + +Also see /usr/share/doc/mdadm/README.recipes.gz . + +The latest version of this FAQ is available here: + http://git.debian.org/?p=pkg-mdadm/mdadm.git;a=blob;f=debian/FAQ;hb=HEAD + +0. What does MD stand for? +~~~~~~~~~~~~~~~~~~~~~~~~~~ + MD is an abbreviation for "multiple device" (also often called "multi- + disk"). The Linux MD implementation implements various strategies for + combining multiple physical devices into single logical ones. The most + common use case is commonly known as "Software RAID". Linux supports RAID + levels 1, 4, 5, 6, and 10, as well as the "pseudo-redundant" RAID level 0. + In addition, the MD implementation covers linear and multipath + configurations. + + Most people refer to MD as RAID. Since the original name of the RAID + configuration software is "md"adm, I chose to use MD consistently instead. + +1. How do I overwrite ("zero") the superblock? +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + mdadm --zero-superblock /dev/mdX + + Note that this is a destructive operation. It does not actually delete any + data, but the device will have lost its "authority". You cannot assemble the + array with it anymore, and if you add the device to another array, the + synchronisation process *will* *overwrite* all data on the device. + + Nevertheless, sometimes it is necessary to zero the superblock: + + - If you are reusing a disk that has been part of an array with an different + superblock version and/or location. In this case you zero the superblock + before you assemble the array, or add the device to an array. 
+ + - If you are trying to prevent a device from being recognised as part of an + array. Say for instance you are trying to change an array spanning sd[ab]1 + to sd[bc]1 (maybe because sda is failing or too slow), then automatic + (scan) assembly will still recognise sda1 as a valid device. You can limit + the devices to scan with the DEVICE keyword in the configuration file, but + this may not be what you want. Instead, zeroing the superblock will + (permanently) prevent a device from being considered as part of an array. + +2. How do I change the preferred minor of an MD array (RAID)? +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + See item 12 in /usr/share/doc/mdadm/README.recipes.gz and read the mdadm + manpage (search for 'preferred'). + +3. How does mdadm determine which /dev/mdX or /dev/md/X to use? +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + The logic used by mdadm to determine the device node name in the mdadm + --examine output (which is used to generate mdadm.conf) depends on several + factors. Here's how mdadm determines it: + + It first checks the superblock version of a given array (or each array in + turn when iterating all of them). Run + + mdadm --detail /dev/mdX | sed -ne 's,.*Version : ,,p' + + to determine the superblock version of a running array, or + + mdadm --examine /dev/sdXY | sed -ne 's,.*Version : ,,p' + + to determine the superblock version from a component device of an array. + + Version 0 superblocks (00.90.XX) + '''''''''''''''''''''''''''''''' + You need to know the preferred minor number stored in the superblock, + so run either of + + mdadm --detail /dev/mdX | sed -ne 's,.*Preferred Minor : ,,p' + mdadm --examine /dev/sdXY | sed -ne 's,.*Preferred Minor : ,,p' + + Let's call the resulting number MINOR. Also see FAQ 2 further up. + + Given MINOR, mdadm will output /dev/md<MINOR> if the device node + /dev/md<MINOR> exists.
+ Otherwise, it outputs /dev/md/<MINOR> + + Version 1 superblocks (01.XX.XX) + '''''''''''''''''''''''''''''''' + Version 1 superblocks actually seem to ignore preferred minors and instead + use the value of the name field in the superblock. Unless specified + explicitly during creation (-N|--name) the name is determined from the + device name used, using the following regexp: 's,/dev/md/?(.*),$1,', thus: + + /dev/md0 -> 0 + /dev/md/0 -> 0 + /dev/md_d0 -> _d0 (d0 in later versions) + /dev/md/d0 -> d0 + /dev/md/name -> name + (/dev/name does not seem to work) + + mdadm will append the name to '/dev/md/', so it will always output device + names under the /dev/md/ directory. Newer versions can create a symlink + from /dev/mdX. See the symlinks option in mdadm.conf(5) and mdadm(8). + + If you want to change the name, you can do so during assembly: + + mdadm -A -U name -N newname /dev/mdX /dev/sd[abc]X + + I know this all sounds inconsistent and upstream has some work to do. + We're on it. + +4. Which RAID level should I use? +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Many people seem to prefer RAID4/5/6 because it makes more efficient use of + space. For example, if you have disks of size X, then in order to get 2X + storage, you need 3 disks for RAID5, but 4 if you use RAID10 or RAID1+0 (or + RAID6). + + This gain in usable space comes at a price: performance; RAID1/10 can be up + to four times faster than RAID4/5/6. + + At the same time, however, RAID4/5/6 provide somewhat better redundancy in + the event of two failing disks. In a RAID10 configuration, if one disk is + already dead, the RAID can only survive if any of the two disks in the other + RAID1 array fails, but not if the second disk in the degraded RAID1 array + fails (see next item, 4b). A RAID6 across four disks can cope with any two + disks failing. However, RAID6 is noticeably slower than RAID5. RAID5 and + RAID4 do not differ much, but can only handle single-disk failures.
+ + If you can afford the extra disks (storage *is* cheap these days), I suggest + RAID1/10 over RAID4/5/6. If you don't care about performance but need as + much space as possible, go with RAID4/5/6, but make sure to have backups. + Heck, make sure to have backups whatever you do. + + Let it be said, however, that I thoroughly regret putting my primary + workstation on RAID5. Anything disk-intensive brings the system to its + knees; I will have to migrate to RAID10 at one point. + +4b. Can a 4-disk RAID10 survive two disk failures? +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + I am assuming that you are talking about a setup with two copies of each + block, so --layout=near2/far2/offset2: + + In two thirds of the cases, yes[0], and it does not matter which layout you + use. When you assemble 4 disks into a RAID10, you essentially stripe a RAID0 + across two RAID1, so the four disks A,B,C,D become two pairs: A,B and C,D. + If A fails, the RAID10 can only survive if the second failing disk is either + C or D; If B fails, your array is dead. + + Thus, if you see a disk failing, replace it as soon as possible! + + If you need to handle two failing disks out of a set of four, you have to + use RAID6, or store more than two copies of each block (see the --layout + option in the mdadm(8) manpage). + + See also question 18 further down. + + 0. it's actually (n-2)/(n-1), where n is the number of disks. I am not + a mathematician, see http://aput.net/~jheiss/raid10/, which gives the + chance of *failure* as 1/(n-1), so the chance of success is 1-1/(n-1), or + (n-2)/(n-1), or 2/3 in the four disk example. + (Thanks to Per Olofsson for clarifying this in #493577). + +5. How to convert RAID5 to RAID10? +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + To convert RAID5 to RAID10, you need a spare disk (either a spare, fourth + disk in the array, or a new one).
Then you remove the spare and one of the + three disks from the RAID5, create a degraded RAID10 across them, create + the filesystem and copy the data (or do a raw copy), then add the other two + disks to the new RAID10. However, mdadm cannot assemble a RAID10 with 50% + missing devices the way you might like it: + + mdadm --create -l 10 -n4 -pn2 /dev/md1 /dev/sd[cd] missing missing + + For reasons that may be answered by question 20 further down, mdadm actually + cares about the order of devices you give it. If you intersperse the missing + keywords with the physical drives, it should work: + + mdadm --create -l 10 -n4 -pn2 /dev/md1 /dev/sdc missing /dev/sdd missing + + or even + + mdadm --create -l 10 -n4 -pn2 /dev/md1 missing /dev/sd[cd] missing + + Also see item (4b) further up, and this thread: + http://marc.theaimsgroup.com/?l=linux-raid&m=116004333406395&w=2 + +6. What is the difference between RAID1+0 and RAID10? +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + RAID1+0 is a form of RAID in which a RAID0 is striped across two RAID1 + arrays. To assemble it, you create two RAID1 arrays and then create a RAID0 + array with the two md arrays. + + The Linux kernel provides the RAID10 level to do pretty much exactly the + same for you, but with greater flexibility (and somewhat improved + performance). While RAID1+0 makes sense with 4 disks, RAID10 can be + configured to work with only 3 disks. Also, RAID10 has a little less + overhead than RAID1+0, which has data pass the md layer twice. + + I prefer RAID10 over RAID1+0. + +6b. What's the difference between RAID1+0 and RAID0+1? +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + In short: RAID1+0 concatenates two mirrored arrays while RAID0+1 mirrors two + concatenated arrays. However, the two are also often switched. + + The linux MD driver supports RAID10, which is equivalent to the above + RAID1+0 definition. 
+ + RAID1+0/10 has a greater chance to survive two disk failures, its + performance suffers less when in degraded state, and it resyncs faster after + replacing a failed disk. + + See http://aput.net/~jheiss/raid10/ for more details. + +7. Which RAID10 layout scheme should I use +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + RAID10 gives you the choice between three ways of laying out the blocks on + the disk. Assuming a simple 4 drive setup with 2 copies of each block, then + if A,B,C are data blocks, a,b their parts, and 1,2 denote their copies, the + following would be a classic RAID1+0 where 1,2 and 3,4 are RAID0 pairs + combined into a RAID1: + + near=2 would be (this is the classic RAID1+0) + + hdd1 Aa1 Ba1 Ca1 + hdd2 Aa2 Ba2 Ca2 + hdd3 Ab1 Bb1 Cb1 + hdd4 Ab2 Bb2 Cb2 + + offset=2 would be + + hdd1 Aa1 Bb2 Ca1 Db2 + hdd2 Ab1 Aa2 Cb1 Ca2 + hdd3 Ba1 Ab2 Da1 Cb2 + hdd4 Bb1 Ba2 Db1 Da2 + + far=2 would be + + hdd1 Aa1 Ca1 .... Bb2 Db2 + hdd2 Ab1 Cb1 .... Aa2 Ca2 + hdd3 Ba1 Da1 .... Ab2 Cb2 + hdd4 Bb1 Db1 .... Ba2 Da2 + + Where the second set start half-way through the drives. + + The advantage of far= is that you can easily spread a long sequential read + across the drives. The cost is more seeking for writes. offset= can + possibly get similar benefits with large enough chunk size. Neither upstream + nor the package maintainer have tried to understand all the implications of + that layout. It was added simply because it is a supported layout in DDF and + DDF support is a goal. + +8. (One of) my RAID arrays is busy and cannot be stopped. What gives? +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + It is perfectly normal for mdadm to report the array with the root + filesystem to be busy on shutdown. The reason for this is that the root + filesystem must be mounted to be able to stop the array (or otherwise + /sbin/mdadm does not exist), but to stop the array, the root filesystem + cannot be mounted. Catch 22. 
The kernel actually stops the array just before + halting, so it's all well. + + If mdadm cannot stop other arrays on your system, check that these arrays + aren't used anymore. Common causes for busy/locked arrays are: + + * The array contains a mounted filesystem (check the `mount' output) + * The array is used as a swap backend (check /proc/swaps) + * The array is used by the device-mapper (check with `dmsetup') + * LVM + * dm-crypt + * EVMS + * The array contains a swap partition used for suspend-to-ram + (check /etc/initramfs-tools/conf.d/resume) + * The array is used by a process (check with `lsof') + +9. Should I use RAID0 (or linear)? +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + No. Unless you know what you're doing and keep backups, or use it for data + that can be lost. + +9b. Why not? +~~~~~~~~~~~~ + RAID0 has zero redundancy. If you stripe a RAID0 across X disks, you + increase the likelihood of complete loss of the filesystem by a factor of X. + + The same applies to LVM by the way. + + If you want/must use LVM or RAID0, stripe it across RAID1 arrays + (RAID10/RAID1+0, or LVM on RAID1), and keep backups! + +10. Can I cancel a running array check (checkarray)? +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + See the -x option in the `/usr/share/mdadm/checkarray --help` output. + +11. mdadm warns about duplicate/similar superblocks; what gives? +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + In certain configurations, especially if your last partition extends all the + way to the end of the disk, mdadm may display a warning like: + + mdadm: WARNING /dev/hdc3 and /dev/hdc appear to have very similar + superblocks. If they are really different, please --zero the superblock on + one. If they are the same or overlap, please remove one from the DEVICE + list in mdadm.conf.
+ + There are two ways to solve this: + + (a) recreate the arrays with version-1 superblocks, which is not always an + option -- you cannot yet upgrade version-0 to version-1 superblocks for + existing arrays. + + (b) instead of 'DEVICE partitions', list exactly those devices that are + components of MD arrays on your system. So in the above example: + + - DEVICE partitions + + DEVICE /dev/hd[ab]* /dev/hdc[123] + +12. mdadm -E / mkconf report different arrays with the same device + name / minor number. What gives? +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + In almost all cases, mdadm updates the super-minor field in an array's + superblock when assembling the array. It does *not* do this for RAID0 + arrays. Thus, you may end up seeing something like this when you run + mdadm -E or mkconf: + + ARRAY /dev/md0 level=raid0 num-devices=2 UUID=abcd... + ARRAY /dev/md0 level=raid1 num-devices=2 UUID=dcba... + + Note how the two arrays have different UUIDs but both appear as /dev/md0. + + The solution in this case is to explicitly tell mdadm to update the + superblock of the RAID0 array. Assuming that the RAID0 array in the above + example should really be /dev/md1: + + mdadm --stop /dev/md1 + mdadm --assemble --update=super-minor --uuid=abcd... /dev/md1 + + See question 2 of this FAQ, and also http://bugs.debian.org/386315 and + recipe #12 in README.recipes . + +13. Can a MD array be partitioned? +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Since kernel 2.6.28, MD arrays can be partitioned like any other block + device. + + Prior to 2.6.28, for a MD array to be able to hold partitions, it must be + created as a "partitionable array", using the configuration auto=part on the + command line or in the configuration file, or by using the standard naming + scheme (md_d* or md/d*) for partitionable arrays: + + mdadm --create --auto=yes ... /dev/md_d0 ... + # see mdadm(8) manpage about the values of the --auto keyword + +14. When would I partition an array? 
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + This answer by Doug Ledford is shamelessly adapted from [0] (with + permission): + + First, not all MD types make sense to be split up, e.g. multipath. For + those types, when a disk fails, the *entire* disk is considered to have + failed, but with different arrays you won't switch over to the next path + until each MD array has attempted to access the bad path. This can have + obvious bad consequences for certain array types that do automatic + failover from one port to another (you can end up getting the array in + a loop of switching ports repeatedly to satisfy the fact that one array + failed over during a path down, then the path came back up, and another + array stayed on the old path because it didn't send any commands during + the path down time period). + + Second, convenience. Assume you have a 6 disk RAID5 array. If a disk + fails and you are using a partitioned MD array, then all the partitions on + the disk will already be handled without using that disk. No need to + manually fail any still active array members from other arrays. + + Third, safety. Again with the raid5 array. If you use multiple arrays on + a single disk, and that disk fails, but it only failed on one array, then + you now need to manually fail that disk from the other arrays before + shutting down or hot swapping the disk. Generally speaking, that's not + a big deal, but people do occasionally have fat finger syndrome and this + is a good opportunity for someone to accidentally fail the wrong disk, and + when you then go to remove the disk you create a two disk failure instead + of one and now you are in real trouble. + + Fourth, to respond to what you wrote about independent of each other -- + part of the reason why you partition. I would argue that's not true. If + your goal is to salvage as much use from a failing disk as possible, then + OK.
But, generally speaking, people that have something of value on their + disks don't want to salvage any part of a failing disk, they want that + disk gone and replaced immediately. There simply is little to no value in + an already malfunctioning disk. They're too cheap and the data stored on + them too valuable to risk losing something in an effort to further + utilize broken hardware. This of course is written with the understanding + that the latest MD RAID code will do read error rewrites to compensate for + minor disk issues, so anything that will throw a disk out of an array is + more than just a minor sector glitch. + + 0. http://marc.theaimsgroup.com/?l=linux-raid&m=116117813315590&w=2 + +15. How can I start a dirty degraded array? +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + A degraded array (e.g. a RAID5 with only two disks) that has not been + properly stopped cannot be assembled just like that; mdadm will refuse and + complain about a "dirty degraded array", for good reasons. + + The solution might be to force-assemble it, and then to start it. Please see + recipes 4 and 4b of /usr/share/doc/mdadm/README.recipes.gz and make sure you + know what you're doing. + +16. How can I influence the speed with which an array is resynchronised? +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + For each array, the MD subsystem exports parameters governing the + synchronisation speed via sysfs. The values are in kB/sec. + + /sys/block/mdX/md/sync_speed -- the current speed + /sys/block/mdX/md/sync_speed_max -- the maximum speed + /sys/block/mdX/md/sync_speed_min -- the guaranteed minimum speed + +17. When I create a new array, why does it resynchronise at first? +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + See the mdadm(8) manpage: + When creating a RAID5 array, mdadm will automatically create a degraded + array with an extra spare drive.
This is because building the spare into + a degraded array is in general faster than resyncing the parity on + a non-degraded, but not clean, array. This feature can be over-ridden with + the --force option. + + This also applies to RAID levels 4 and 6. + + It does not make much sense for RAID levels 1 and 10 and can thus be + overridden with the --force and --assume-clean options, but it is not + recommended. Read the manpage. + +18. How many failed disks can a RAID10 handle? +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + (see also question 4b) + + The following table shows how many disks you can lose and still have an + operational array. In some cases, you *can* lose more than the given number + of disks, but there is no guarantee that the array survives. Thus, the + following is the guaranteed number of failed disks a RAID10 array survives + and the maximum number of failed disks the array can (but is not guaranteed + to) handle, given the number of disks used and the number of data block + copies. Note that 2 copies means original + 1 copy. Thus, if you only have + one copy (the original), you cannot handle any failures. + + 1 2 3 4 (# of copies) + 1 0/0 0/0 0/0 0/0 + 2 0/0 1/1 1/1 1/1 + 3 0/0 1/1 2/2 2/2 + 4 0/0 1/2 2/2 3/3 + 5 0/0 1/2 2/2 3/3 + 6 0/0 1/3 2/3 3/3 + 7 0/0 1/3 2/3 3/3 + 8 0/0 1/4 2/3 3/4 + (# of disks) + + Note: I have not really verified the above information. Please don't count + on it. If a disk fails, replace it as soon as possible. Corrections welcome. + +19. What should I do if a disk fails? +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Replace it as soon as possible: + + mdadm --remove /dev/md0 /dev/sda1 + halt + + mdadm --add /dev/md0 /dev/sda1 + +20. So how do I find out which other disk(s) can fail without killing the + array? +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Did you read the previous question and its answer? 
+ + For cases when you have two copies of each block, the question is easily + answered by looking at the output of /proc/mdstat. For instance on a four + disk array: + + md3 : active raid10 sdg7[3] sde7[0] sdh7[2] sdf7[1] + + you know that sde7/sdf7 form one pair and sdg7/sdh7 the other. + + If sdh now fails, this will become + + md3 : active raid10 sdg7[3] sde7[0] sdh7[4](F) sdf7[1] + + So now the second pair is broken; the array could take another failure in + the first pair, but if sdg now also fails, you're history. + + Now go and read question 19. + + For cases with more copies per block, it becomes more complicated. Let's + think of a seven disk array with three copies: + + md5 : active raid10 sdg7[6] sde7[4] sdb7[5] sdf7[2] sda7[3] sdc7[1] sdd7[0] + + Each mirror now has 7/3 = 2.33 disks to it, so in order to determine groups, + you need to round up. Note how the disks are arranged in increasing order of + their indices (the number in brackets in /proc/mdstat): + + disk: -sdd7- -sdc7- -sdf7- -sda7- -sde7- -sdb7- -sdg7- + group: [ one ][ two ][ three ] + + Basically this means that after two disks have failed, you need to make sure that + the third failed disk doesn't destroy all copies of any given block. And + that's not always easy as it depends on the layout chosen: whether the + blocks are near (same offset within each group), far (spread apart in a way + to maximise the mean distance), or offset (offset by size/n within each + block). + + I'll leave it up to you to figure things out. Now go read question 19. + +21. Why does the kernel speak of 'resync' when using checkarray? +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Please see README.checkarray and + http://www.mail-archive.com/linux-raid@vger.kernel.org/msg04835.html . + + In short: it's a bug. checkarray is actually not a resync, but the kernel + does not distinguish between them. + +22. Can I prioritise the sync process and sync certain arrays before others?
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Upon start, md will resynchronise any unclean arrays, starting in somewhat + random order. Sometimes it's desirable to sync e.g. /dev/md3 first (because + it's the most important), but while /dev/md1 is synchronising, /dev/md3 will + be DELAYED (see /proc/mdstat), but only if they share the same physical + components. + + It is possible to delay the synchronisation via /sys: + + echo idle >/sys/block/md1/md/sync_action + + This will cause md1 to go idle and md to synchronise md3 (or whatever is + queued next; repeat the above for other devices if necessary). md will also + realise that md1 is still not in sync and queue it for resynchronisation, + so it will sync automatically when its turn has come. + +23. mdadm's init script fails because it cannot find any arrays. What gives? +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + This does not happen anymore, if no arrays present in config file, no arrays + will be started. + +24. What happened to mdrun? How do I replace it? +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + mdrun used to be the sledgehammer approach to assembling arrays. It has + accumulated several problems over the years (e.g. #354705) and thus has been + deprecated and removed with the 2.6.7-2 version of this package. + + If you are still using mdrun, please ensure that you have a valid + /etc/mdadm/mdadm.conf file (run /usr/share/mdadm/mkconf --generate to get + one), and run + + mdadm --assemble --scan --auto=yes + + instead of mdrun. + +25. Why are my arrays marked auto-read-only in /proc/mdstat? +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Arrays are kept read-only until the first write occurs. This allows md to + skip lengthy resynchronisation for arrays that have not been properly shut + down, but which also have not changed. + +26.
Why doesn't mdadm find arrays specified in the config file and causes the + boot to fail? +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + My boot process dies at an early stage and drops me into the busybox shell. + The last relevant output seems to be from mdadm and is something like + + "/dev/md2 does not exist" + + or + + "No devices listed in conf file found" + + Why does mdadm break my system? + + Short answer: It doesn't, the underlying devices aren't available yet + when mdadm runs during the early boot process. + + Long answer: It doesn't, but the drivers of those devices incorrectly + communicate to the kernel that the devices are ready, when in fact they are + not. I consider this a bug in those drivers. Please consider reporting it. + + Workaround: there is nothing mdadm can or will do against this. Fortunately + though, initramfs provides a method, documented at + http://wiki.debian.org/InitramfsDebug. Please append rootdelay=10 to the + kernel command line and try if the boot now works. + + -- martin f. krafft Wed, 13 May 2009 09:59:53 +0200 diff --git a/debian/NEWS b/debian/NEWS new file mode 100644 index 00000000..691f7171 --- /dev/null +++ b/debian/NEWS @@ -0,0 +1,107 @@ +mdadm (3.2.2-1) unstable; urgency=low + + Metadata format change requires recent Grub + The following only applies to users who want to let the grub-pc bootloader + load the kernel directly off a RAID device created with mdadm 3.x and + default values, or when the metadata version is explicitly set using -e. + + Specifically, this includes all arrays created during or after the + installation of Debian squeeze (mdadm-3.1.4+8efb9d1). Arrays created with + older mdadm versions, and RAIDs created with the command-line option + -e 0.9 are not affected. + + Versions of grub-pc older than 1.98+20100720-1 will not be able to boot + directly off a RAID with the 1.x metadata formats (the new default is 1.2).
+ To ensure a bootable system, please make sure to use grub-pc 1.98+20100720-1 + or later, which is provided by Debian squeeze. An unbootable system may be + rescued with Super Grub2 Disk (http://www.supergrubdisk.org/super-grub2-disk/) + or grml (http://grml.org/). + + -- Scott Schaefer Wed, 27 Jul 2011 20:21:50 -0400 + +mdadm (3.1.4-1+8efb9d1) unstable; urgency=low + + Default metadata format for newly created arrays has changed from + 0.90 to 1.2. Location of the superblock is now 4Kb from the start + of the device, instead of at the end of the device for 0.90. + The change from 0.9 to 1.x lifted many restrictions of the old + metadata format, and change in location (from end to 4k after + start for 1.2) reduced chances to confuse a raid array with + filesystem inside it. It is now less easy to mount a component + device as separate filesystem by incident, thus destroying the + array. + + Also, chunk size by default is 512K (was 64K) and bitmap chunk size + is 64Mb. + + -- Michael Tokarev Sat, 10 Sep 2011 13:35:12 +0400 + +mdadm (2.6.7-2) unstable; urgency=low + + /dev/disk symlinks: + mdadm now creates symlinks in /dev/disk/by-id, using the template + md-uuid-* for the array UUIDs and md-name-* for any names assigned to + arrays (version-1 superblocks only). Thanks to Suse for the udev rules + file. + + mdrun removed: + This version also removes mdrun once and for all. If you are still using + mdrun, please ensure that you have a valid /etc/mdadm/mdadm.conf file (run + /usr/share/mdadm/mkconf --generate to get one), and run + + mdadm --assemble --scan --auto=yes + + -- martin f. krafft Wed, 02 Jul 2008 10:57:32 +0200 + +mdadm (2.5.3.git200608201206-1) unstable; urgency=low + + This version makes mdadm.conf mandatory. If you do not have such a file, it + will be created for you. + + You must verify the contents of this file and ensure that it represents your + local configuration. See /usr/share/doc/mdadm/README.upgrading-2.5.3.gz for + more information. 
+ + -- martin f. krafft Sun, 20 Aug 2006 21:58:43 +0100 + +mdadm (2.5-1) unstable; urgency=low + + mdrun has been (finally) obsoleted, and an appropriate warning message is + written to the console if you (or a script) attempt to run it. If you + cannot live without mdrun, you can disable the warning by setting + USE_DEPRECATED_MDRUN=1 in /etc/default/mdadm. Note that mdrun will *not* be + supported. Please also see /usr/share/doc/mdadm/README.mdrun . + + -- martin f. krafft Tue, 30 May 2006 23:25:13 +0200 + +mdadm (2.4.1-5) unstable; urgency=low + + This version drops the automatic generation of the /etc/mdadm/mdadm.conf + file on every boot (if it was missing). This means that you need to ensure + that you have a valid configuration file. If none is present during package + configuration, mdadm *will* try to generate one, but it will only contain + information about arrays that were running at the time of package + configuration. Arrays not listed in the configuration file will *not* be + started automatically after boot (with the exception of the root partition). + + If you want to recreate your configuration file, either figure out what it + should contain from the mdadm.conf(5) manpage, or simply assemble and run + all the arrays the way you like it, then run + /usr/share/mdadm/mkconf force-generate /etc/mdadm/mdadm.conf + + -- martin f. krafft Sat, 03 Jun 2006 17:45:47 +0200 + +mdadm (2.4.1-1) unstable; urgency=low + + As of version 2.3, mdadm uses /etc/mdadm.conf as its main configuration + file, and falls back to /etc/mdadm/mdadm.conf if the former is not found. + Since Debian uses /etc/mdadm/mdadm.conf as the configuration file path, this + order was reversed: Debian's mdadm reads /etc/mdadm/mdadm.conf as its main + file and falls back to /etc/mdadm.conf if the former is not found. + + An incompatible change in the reshaping of RAID 5 arrays was made in this + upstream release.
If you want to reshape a RAID 5 array with a version-1 + superblock, please make sure to use mdadm 2.4.1 and at least a 2.6.17-rc2 + kernel. + + -- martin f. krafft Tue, 16 May 2006 13:07:49 -0500 diff --git a/debian/README.Debian b/debian/README.Debian new file mode 100644 index 00000000..bfca8cb3 --- /dev/null +++ b/debian/README.Debian @@ -0,0 +1,148 @@ +mdadm for Debian +================ + +Please make sure you read /usr/share/doc/mdadm/NEWS.Debian.gz and the +documents listed under "further reading" a little later in this file. + +The latest version of this document is available here: + http://git.debian.org/?p=pkg-mdadm/mdadm.git;a=blob;f=debian/README.Debian;hb=HEAD + +Autostarting devices +~~~~~~~~~~~~~~~~~~~~ +The mdadm.conf file controls which devices are to be started automatically by +mdadm during boot, and various other parameters about how they are to be started. +The file can also contain some control parameters for the mdadm monitor daemon. +See mdadm.conf(5) for more information. + +Note: this only applies to modular kernels. If you use a monolithic kernel, +you can control which devices are started automatically by changing the +partition type: 0xfd for autostart, 0x83 to prevent autostart. mdadm does not +actually care about the partition type, only the kernel does. + +Common recipes +~~~~~~~~~~~~~~ +Check /usr/share/doc/mdadm/README.recipes.gz for some simple examples of how +to do the most common stuff with mdadm. + +To RAID5 or not to RAID5 +~~~~~~~~~~~~~~~~~~~~~~~~ +See http://www.miracleas.com/BAARF/BAARF2.html . The package maintainer could +not possibly come up with so much emotion over such a technical topic. + +Further reading +~~~~~~~~~~~~~~~ +The documentation for the kernel md driver is included in +/usr/share/doc/mdadm/md.txt.gz. In addition, the md(4) manpage provides +valuable information about the applicable concepts. Do read those!
+ +Further documents of interest: + - Linux-RAID reference Wiki: + http://linux-raid.osdl.org + - Linux software RAID HOWTO: + http://tldp.org/HOWTO/Software-RAID-HOWTO.html + - linux-raid mailing list info: + http://vger.kernel.org/vger-lists.html#linux-raid + - linux-raid mailing list FAQ: + http://www.faqs.org/contrib/linux-raid/ + +Upstream +~~~~~~~~ +For completeness: The upstream repository is available from + git clone git://neil.brown.name/mdadm + +You can browse Neil's repository here: + http://neil.brown.name/git?p=mdadm + +You can also clone from Debian's Git repository, where upstream's code is in +the 'upstream' branch: + git://git.debian.org/git/pkg-mdadm/mdadm + +Reporting bugs +~~~~~~~~~~~~~~ +For reporting bugs, please use the reportbug tool, as it collects useful +information about the system where you're experiencing the problem. + +If you are reporting from a different system, please include the output of +/usr/share/bug/mdadm/script with your report. + +If you are turning to the linux-raid@vger.kernel.org mailing list because you +already know that the issue is with the md kernel driver and certainly not +Debian-specific, please also include the output of +/usr/share/bug/mdadm/script. + +In general, report bugs against the mdadm Debian package, using reportbug. +I am happy to route reports to where they belong. + +Debian package maintenance +~~~~~~~~~~~~~~~~~~~~~~~~~~ +The package is maintained with Git and published on git.debian.org. To obtain +the source: + + git clone git://git.debian.org/git/pkg-mdadm/mdadm + +You can browse the repository here: + http://git.debian.org/?p=pkg-mdadm/mdadm + +If you want to join the mdadm effort, please send me an email. I'll be very +glad for any help I get. + +There are things to do listed in debian/TODO.
+ +You might also be interested in the following document, which explains how +package maintenance of mdadm was migrated from SVN to Git: + http://blog.madduck.net/debian/2007.10.07_converting-a-package-to-git + +Patches +~~~~~~~ +The best way to submit patches is with git-format-patch, as outlined in the +following. If this is too complicated for you, please feel free to make +normal diffs, or contact me for assistance if you'd like to learn how to use +Git. + +Please try to follow the guidelines outlined in + http://repo.or.cz/w/git.git?a=blob;f=Documentation/SubmittingPatches;hb=HEAD + +First, the setup, which you only have to do once on each machine you work with: + +# leave out --global if you want to set your identity only for mdadm +git config --global user.name 'your name' +git config --global user.email 'your@email.address' +git clone git://git.debian.org/git/pkg-mdadm/mdadm.git + +To prepare the actual patch, do the following: + +git pull +git checkout -b some-name-identifying-my-work +while not finished: + // if resuming after a while, maybe update your branch: + git rebase master + // edit files + git add files + git commit + ... +end + +After you've brought your change to a state where you want to submit it, please +squash it into logical single commits. If you only made one change, then this +will do: + +git checkout -b temp-squash master +git merge --squash some-name-identifying-my-work +git commit // ... remove the "Squashed commit of the following:" leader +git format-patch -M -s master +// now inspect the files this created in $PWD +// when you're ready to submit, do: +git send-email --to your@email.address +// check that it's okay when it arrives +git send-email --to pkg-mdadm-devel@lists.alioth.debian.org + +For multiple logical changes, cherry-pick or squash-merge every commit +belonging to a change to the integration branch and then commit it. 
+ +Also, read the git-send-email manpage in case you're submitting multiple +logical changes, in case you want to thread them. + +The manpage also includes information about adding a prologue message explaining your patch, or how to insert it into an existing +thread (in-reply-to). + + -- martin f. krafft Tue, 16 Oct 2007 18:12:13 +0100 diff --git a/debian/README.checkarray b/debian/README.checkarray new file mode 100644 index 00000000..8071a4d6 --- /dev/null +++ b/debian/README.checkarray @@ -0,0 +1,33 @@ +checkarray notes +================ + +checkarray will run parity checks across all your redundant arrays. By +default, it is configured to run on the first Sunday of each month, at 01:06 +in the morning. This is realised by asking cron to wake up every Sunday with +/etc/cron.d/mdadm, but then only running the script when the day of the month +is less than or equal to 7. See #380425. + +Cron will try to run the check at "idle I/O priority" (see ionice(1)), so that +the check does not overload the system too much. Note that this will only +work if all the component devices of the array employ the (default) "cfq" I/O +scheduler. See the kernel documentation[0] for information on how to verify +and modify the scheduler. checkarray does not verify this for you. + + 0. http://www.kernel.org/doc/Documentation/block/switching-sched.txt + +If you manually invoke checkarray, it runs with default I/O priority. Should +you need to run a check at a higher (or lower) I/O priority, then have a look +at the --idle, --slow, --fast, and --realtime options. + +'check' is a read-only operation, even though the kernel logs may suggest +otherwise (e.g. /proc/mdstat and several kernel messages will mention +"resync"). Please also see question 21 of the FAQ. 
+ +If, however, while reading, a read error occurs, the check will trigger the +normal response to read errors which is to generate the 'correct' data and try +to write that out - so it is possible that a 'check' will trigger a write. +However in the absence of read errors it is read-only. + +You can cancel a running array check with the -x option to checkarray. + + -- martin f. krafft Thu, 02 Sep 2010 10:27:29 +0200 diff --git a/debian/README.recipes b/debian/README.recipes new file mode 100644 index 00000000..2b1891e0 --- /dev/null +++ b/debian/README.recipes @@ -0,0 +1,149 @@ +mdadm recipes +============= + +The following examples/recipes may help you with your mdadm experience. I'll +leave it as an exercise to use the correct device names and parameters in each +case. You can find pointers to additional documentation in the README.Debian +file. + +Enjoy. Submissions welcome. + +The latest version of this document is available here: + http://git.debian.org/?p=pkg-mdadm/mdadm.git;a=blob;f=debian/README.recipes;hb=HEAD + +0. create a new array +~~~~~~~~~~~~~~~~~~~~~ + mdadm --create -l1 -n2 -x1 /dev/md0 /dev/sd[abc]1 # RAID 1, 1 spare + mdadm --create -l5 -n3 -x1 /dev/md0 /dev/sd[abcd]1 # RAID 5, 1 spare + mdadm --create -l6 -n4 -x1 /dev/md0 /dev/sd[abcde]1 # RAID 6, 1 spare + +1. create a degraded array +~~~~~~~~~~~~~~~~~~~~~~~~~~ + mdadm --create -l5 -n3 /dev/md0 /dev/sda1 missing /dev/sdb1 + mdadm --create -l6 -n4 /dev/md0 /dev/sda1 missing /dev/sdb1 missing + +2. assemble an existing array +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + mdadm --assemble --auto=yes /dev/md0 /dev/sd[abc]1 + + # if the array is degraded, it won't be started. use --run: + mdadm --assemble --auto=yes --run /dev/md0 /dev/sd[ab]1 + + # or start it by hand: + mdadm --run /dev/md0 + +3. assemble all arrays in /etc/mdadm/mdadm.conf +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + mdadm --assemble --auto=yes --scan + +4. 
assemble a dirty degraded array +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + mdadm --assemble --auto=yes --force /dev/md0 /dev/sd[ab]1 + mdadm --run /dev/md0 + +4b. assemble a dirty degraded array at boot-time +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + If the array is started at boot time by the kernel (partition type 0xfd), + you can force-assemble it by passing the kernel boot parameter + + md-mod.start_dirty_degraded=1 + +5. stop arrays +~~~~~~~~~~~~~~ + mdadm --stop /dev/md0 + + # to stop all arrays in /etc/mdadm/mdadm.conf + mdadm --stop --scan + +6. hot-add components +~~~~~~~~~~~~~~~~~~~~~ + # on the running array: + mdadm --add /dev/md0 /dev/sdc1 + # if you add more components than the array was setup with, additional + # components will be spares + +7. hot-remove components +~~~~~~~~~~~~~~~~~~~~~~~~ + # on the running array: + mdadm --fail /dev/md0 /dev/sdb1 + # if you have configured spares, watch /proc/mdstat how it fills in + mdadm --remove /dev/md0 /dev/sdb1 + +8. hot-grow a RAID1 by adding new components +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + # on the running array, in either order: + mdadm --grow -n3 /dev/md0 + mdadm --add /dev/md0 /dev/sdc1 + # note: without growing first, additional devices become spares and are + # *not* synchronised after the add. + +9. hot-shrink a RAID1 by removing components +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + mdadm --fail /dev/md0 /dev/sdc1 + mdadm --remove /dev/md0 /dev/sdc1 + mdadm --grow -n2 /dev/md0 + +10. convert existing filesystem to RAID 1 +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + # The idea is to create a degraded RAID 1 on the second partition, move + # data, then hot add the first. This seems safer to me than simply to + # force-add a superblock to the existing filesystem. + # + # Assume /dev/sda1 holds the data (and let's assume it's mounted on + # /home) and /dev/sdb1 is empty and of the same size... 
+ # + mdadm --create /dev/md0 -l1 -n2 /dev/sdb1 missing + mkfs -t <fstype> /dev/md0 + mount /dev/md0 /mnt + tar -cf- -C /home . | tar -xf- -C /mnt -p + # consider verifying the data + umount /home + umount /mnt + mount /dev/md0 /home # also change /etc/fstab + mdadm --add /dev/md0 /dev/sda1 + + Warren Togami has a document explaining how to convert a filesystem on + a remote system via SSH: http://togami.com/~warren/guides/remoteraidcrazies/ + +10b. convert existing filesystem to RAID 1 in-place +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + In-place conversion of /dev/sda1 to /dev/md0 is effectively + mdadm --create /dev/md0 -l1 -n2 /dev/sda1 missing + however, do NOT do this, as you risk filesystem corruption. + + If you need to do this, first unmount and shrink the filesystem by + a megabyte (if supported). Then run the above command, then (optionally) + again grow the filesystem as much as possible. + + Do make sure you have backups. If you do not yet, consider method (10) + instead (and make backups anyway!). + +11. convert existing filesystem to RAID 5/6 +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + # See (10) for the basics. + mdadm --create /dev/md0 -l5 -n3 /dev/sdb1 /dev/sdc1 missing + #mdadm --create /dev/md0 -l6 -n4 /dev/sdb1 /dev/sdc1 /dev/sdd1 missing + mkfs -t <fstype> /dev/md0 + mount /dev/md0 /mnt + tar -cf- -C /home . | tar -xf- -C /mnt -p + # consider verifying the data + umount /home + umount /mnt + mount /dev/md0 /home # also change /etc/fstab + mdadm --add /dev/md0 /dev/sda1 + +12. change the preferred minor of an MD array (RAID) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + # you need to manually assemble the array to change the preferred minor + # if you manually assemble, the superblock will be updated to reflect + # the preferred minor as you indicate with the assembly. + # for example, to set the preferred minor to 4: + mdadm --assemble /dev/md4 /dev/sd[abc]1 + + # this only works on 2.6 kernels, and only for RAID levels of 1 and above.
+ # for other MD arrays, you need to specify --update explicitly: + mdadm --assemble --update=super-minor /dev/md4 /dev/sd[abc]1 + + # see also item 12 in the FAQ contained with the Debian package. + + -- martin f. krafft Fri, 06 Oct 2006 15:39:58 +0200 diff --git a/debian/TODO b/debian/TODO new file mode 100644 index 00000000..eb3299e0 --- /dev/null +++ b/debian/TODO @@ -0,0 +1,29 @@ +debian mdadm TODO list +====================== + +- version-1 is a nightmare. E.g. on partitionable arrays, with / on + /dev/md_d0p3, mdadm -Es ignores /dev/md_d0 and just uses the name, so + /dev/md/<name>. +- figure out something about device names. +- (better) udev integration + +- check whether mdadm.conf and system are consistent during initramfs creation + and fail otherwise (#381303). +- add code to compare existing and expected configuration, after standardising + the files. In most cases, we'll have to answer DUNNO as to whether the + existing configuration file is okay, but I guess in some cases we can + determine that the configuration is okay. A conservative approach would be + beneficial to the user. Not sure if it's worth the effort though. +- one nice^W important thing would be to check device names and UUIDs at least. + +- verify operation without udev + - udev removed before mdadm installed + - udev removed after mdadm installed + +- more granular handling of init.d starts/stops, don't force all arrays to be + started. +- let user specify when to start/stop which array (#398310). +- also only stop those arrays we started; this can be easily done with + sentinels in $STATEDIR + +- manage DAEMON_OPTIONS with debconf diff --git a/debian/bugscript b/debian/bugscript new file mode 100755 index 00000000..dcb88ebd --- /dev/null +++ b/debian/bugscript @@ -0,0 +1,219 @@ +#!/bin/bash +# +# mdadm bug submission control script +# +# allows Debian's bug tools to include relevant information in bug reports. +# +# Copyright © martin f.
krafft +# distributed under the terms of the Artistic Licence 2.0 +# +# we need /bin/bash for readline and -n capabilities in the prompt(s) +# + +# maximise information output even in the case of errors +set +eu + +if ! command -v yesno >/dev/null; then + if [ -r /usr/share/reportbug/handle_bugscript ]; then + exec /usr/share/reportbug/handle_bugscript ". $0" /dev/stdout + fi + yesno() { + read -n1 -p"$1" REPLY + case "$REPLY" in + [yY]) REPLY=yep;; + [nN]) REPLY=nop;; + ('') REPLY="$2";; + esac + } + exec 3>&1 +fi + +# do not let people ctrl-c out of the bugscript +trap : INT + +if [ $(id -u) != 0 ]; then + if [ -x "$(command -v sudo)" ]; then + yesno "Gather system information as root using sudo? (Y/n) " yep + if [ "$REPLY" = yep ]; then + echo running sudo "$0" "$@"... + sudo "$0" "$@" >&3 && exit 0 + echo "sudo invocation failed, trying /bin/su..." + fi + fi + + yesno "Gather system information as root using su? (Y/n) " yep + if [ "$REPLY" = yep ]; then + ARGS= + for i in "$@"; do ARGS="${ARGS:+$ARGS }'$1'"; shift; done + echo "running su root -s '/bin/sh -c $0${ARGS:+ $ARGS}'..." + su root -s /bin/sh -c "$0 $ARGS" >&3 && exit 0 + unset ARGS + echo "su invocation failed." + fi + + # arrive here only if neither sudo nor su worked: + yesno "Will you provide system information in the bug report yourself? (N/y) " nop + if [ "$REPLY" = yep ]; then + cat <<_eof >&3 + +IMPORTANT: + please do not forget to include all relevant system information with this + bug report. You could run + /usr/share/bug/mdadm/script 3>&1 + as root and attach or include the output. + +_eof + exit 0 + fi + + # try our best + cat <<_eof >&3 + +WARNING: + the following output was not generated by the root user. If you can, please + replace the following up until "-- System Information:" with the output of + /usr/share/bug/mdadm/script 3>&1 + run as root. Thanks! + +_eof +fi + +if [ ! -r /proc/mdstat ]; then + echo "The local system does not have MD (RAID) support: no drivers loaded."
+ echo "Without MD support, I cannot collect as much information as I'd like." + + #yesno "Are you sure you want to report a bug at this time? " yep + yesno "Hit any key to continue..." yep + #[ "$REPLY" = yep ] || exit 1 +fi + +echo "--- mdadm.conf" >&3 +if [ -r /etc/mdadm/mdadm.conf ]; then + grep '^[^#]' /etc/mdadm/mdadm.conf >&3 +elif [ -r /etc/mdadm.conf ]; then + grep '^[^#]' /etc/mdadm.conf >&3 +else + echo no mdadm.conf file. >&3 +fi +echo >&3 + +echo "--- /etc/default/mdadm" >&3 +if [ -r /etc/default/mdadm ]; then + grep '^[^#]' /etc/default/mdadm >&3 +else + echo no /etc/default/mdadm file. >&3 +fi +echo >&3 + +echo "--- /proc/mdstat:" >&3 +cat /proc/mdstat >&3 2>&3 || : +echo >&3 + +echo "--- /proc/partitions:" >&3 +cat /proc/partitions >&3 2>&3 || : +echo >&3 + +echo "--- LVM physical volumes:" >&3 +if [ -x "$(command -v pvs)" ]; then + pvs >&3 +else + echo "LVM does not seem to be used." >&3 +fi + +echo "--- mount output" >&3 +mount >&3 +echo >&3 + +echo "--- initrd.img-$(uname -r):" >&3 +if [ -r /boot/initrd.img-$(uname -r) ]; then + TEMPDIR=$(mktemp -d) + OLDPWD="$PWD" + cd "$TEMPDIR" + zcat /boot/initrd.img-$(uname -r) 2>&3 | cpio -i 2>&3 + find -regex '.*/md[a/].+' -type f -exec md5sum {} \; >&3 + + echo >&3 + echo "--- initrd's /conf/conf.d/md:" >&3 + if [ -r conf/conf.d/md ]; then + cat conf/conf.d/md >&3 + else + echo "no conf/md file." >&3 + fi + + cd "$OLDPWD" + rm -rf "$TEMPDIR" + unset TEMPDIR +else + echo "no initrd.img-$(uname -r) found." >&3 +fi +echo >&3 + +if [ -r /proc/modules ]; then + echo "--- /proc/modules:" >&3 + egrep '(dm_|raid|linear|multipath|faulty)' < /proc/modules >&3 || : + echo >&3 +fi + +if [ -f /var/log/syslog ]; then + if [ -r /var/log/syslog ]; then + echo "--- /var/log/syslog:" >&3 + egrep "^\w{3} [ :[:digit:]]{11} ($(hostname)|localhost) (kernel: md|mdadm): " /var/log/syslog >&3 || : + echo >&3 + else + echo "syslog not readable by user." 
>&3 + fi +fi + +echo "--- volume detail:" >&3 +for dev in /dev/[hsv]d[a-z]*; do + [ ! -r $dev ] && echo "$dev not readable by user." && continue + mdadm -E $dev 2>/dev/null && echo -- || echo "$dev is not recognised by mdadm." +done >&3 +echo >&3 + +if [ -r /proc/cmdline ]; then + echo "--- /proc/cmdline" >&3 + cat /proc/cmdline >&3 + echo >&3 +fi + +if [ -f /boot/grub/grub.cfg ]; then + echo "--- grub2:" >&3 + if [ -r /boot/grub/grub.cfg ]; then + egrep '^[^#].*\<(root=|raid)' /boot/grub/grub.cfg >&3 || : + else + echo grub.cfg file not readable. >&3 + fi + echo >&3 +fi + +if [ -f /boot/grub/menu.lst ]; then + echo "--- grub legacy:" >&3 + if [ -r /boot/grub/menu.lst ]; then + grep '^[^#].*\<root=' /boot/grub/menu.lst >&3 || : + else + echo menu.lst file not readable. >&3 + fi + echo >&3 +fi + +if [ -f /etc/lilo.conf ]; then + echo "--- lilo:" >&3 + if [ -r /etc/lilo.conf ]; then + egrep '^([^#].*)?root=' /etc/lilo.conf >&3 || : + else + echo lilo.conf file not readable. >&3 + fi + echo >&3 +fi + +echo "--- udev:" >&3 +COLUMNS=70 dpkg -l udev | grep '\<udev\>' >&3 +md5sum /etc/udev/rules.d/*md* /lib/udev/rules.d/*md* >&3 2>/dev/null +echo >&3 + +echo "--- /dev:" >&3 +ls -l /dev/md* /dev/disk/by-* >&3 +echo >&3 + +echo "Auto-generated on $(date -R) by mdadm bugscript" >&3 diff --git a/debian/changelog b/debian/changelog new file mode 100644 index 00000000..f843ff8a --- /dev/null +++ b/debian/changelog @@ -0,0 +1,1889 @@ +mdadm (3.4-2) unstable; urgency=low + + * Reneable incremental assembly + * Rely on udev to assemble incremental arrays + * In environments with systemd rely on mdadm-last-resort@.timer|.service + units to activate degrated raids + * In environments initramfs-tools initrd (no systemd) add local-block + script to do the same after 2/3rds of root delay iteration + * Drop local-top initramfs script + * Drop dependency on initscripts package + * Drop INITRDSTART support + * Drop mdadm-raid init script + * Drop ancient preinst + * In
mdadm.init check, and bail out running in a container + * In mdadm.config drop mdadm/autostart logic + * Drop CREATE stanzas from mkconf and don't include them in the + initramfs. The generated defaults, are the compiled-in defaults. And + the current one generates warnings when running mdadm in the + initramfs, as there is no passwd|group files to resolve root/disk + uid/gid. Closes: 717609 + * Adapt changes and formatting of initramfs hook from Ubuntu + * Bump standards version to 3.9.7, no changes required + * Fix copyright-refers-to-symlink-license + * Closes: #781172, #796624, #769201, #813335, #632401, #804973, #714155, + #770002, #737132, #675452, #726390, #813637, #814036. + + -- Dimitri John Ledkov Sat, 02 Jul 2016 19:16:01 +0100 + +mdadm (3.4-1) unstable; urgency=medium + + * New upstream release. + * Drop use-tempnode-not-devnode.patch, not needed anymore. + * Drop use-external-blkid.diff. (Closes: #793631) + * Refresh patches. + + -- Dimitri John Ledkov Fri, 19 Feb 2016 16:18:36 +0000 + +mdadm (3.3.4-1.1) unstable; urgency=medium + + * Non-maintainer upload. + * disable-incremental-assembly.patch: incremental assembly prevents booting + in degraded mode (Closes: #784070) + + -- Yann Soubeyrand Tue, 10 Nov 2015 11:18:39 +0100 + +mdadm (3.3.4-1) unstable; urgency=medium + + [ Dimitri John Ledkov ] + * Adopting the package as per mjt delegation. Thank you for your + service mainting this package over the years, your contributions + are highly appreciated by all of Debian and derivate communities. + * New upstream release. + * Bump standards version to 3.9.6.0, no changes required. + * Use dh_prep instead of dh_clean -k + * Drop cherrypicked patches. + + [ Helmut Grohne ] + * Fix FTCBFS. Export CROSS_COMPILE=- (Closes: #794335) + + [ Cyril B. ] + * Copy AUTO lines from host into initrd, when updating initrd. + (Closes: #785104) + + [ Martin von Wittich ] + * Ignore errors attempting to apply *nice to checkarray, it may be + done already. 
(Closes: #791554) + + -- Dimitri John Ledkov Sun, 08 Nov 2015 11:48:03 +0000 + +mdadm (3.3.2-5) unstable; urgency=medium + + * use-tempnode-not-devnode.patch: change udev rules file to use + $tempnode which works both on wheezy and jessie udev, instead + of $devnode which only works in jessie. At this stage it is + better to make rules file compatible with old version instead + of adding versioned dependency. Should be removed for jessie+1. + (Closes: #770883) + * fix Closes: list in previous entry (Closes: #771852) + + -- Michael Tokarev Sat, 20 Dec 2014 11:48:44 +0300 + +mdadm (3.3.2-4) unstable; urgency=medium + + * really remove /var/lib/mdadm in postinst, fixing a brown-paper bag + bug in previous upload (I fixed it earlier but forgot to commit it + before 3.3.2-3 release). (Closes: #764036, #771852) + * mention closing of #588965 #599352 #694513 by 3.3-1 + + -- Michael Tokarev Fri, 05 Dec 2014 17:29:22 +0300 + +mdadm (3.3.2-3) unstable; urgency=medium + + * remove /var/lib/mdadm dir in postinst to clean up from old pkg, + remove config files on purge (restore extraneous cleanup from + last change) (Closes: #764036) + * remove set -u (error on unset variables) from maintscripts + (Closes: #766308) + * rebuildmap-strip-local-host-name-from-device-name.patch - a patch + from upstream fixing a bug when mdadm have to re-create device + nodes after assembling arrays (eg, when switching from initramfs + without preserving /dev and /run), to choose the same device names + as when doing inital assembly + * readlink-path.patch: readlink is in /bin not /usr/bin on debian + (Closes: #766416) + * mdmonitor-service-simplify.diff: simplify mdmonitor.service + systemd file, do not try to read non-existing files (Closes: #764647) + + -- Michael Tokarev Fri, 28 Nov 2014 09:55:14 +0300 + +mdadm (3.3.2-2) unstable; urgency=medium + + * remove more leftovers from old versions + * do not embed $VERSION to mkconf and bugscript + * removed unneeded lintian-overrides file + * removed 
examples/mdadd.sh + * removed references to MAIL_TO from /etc/default/mdadm (pre-2.x mdadm) + * removed AUTOSTART variable from /etc/default/mdadm + (system will start arrays listed in mdadm.conf) + * simplify d/rules, build udeb in a subdir (for now, to be removed) + * install systemd services and disable some initscripts (mask them) + when systemd is running (Closes: #763959) + * build-sys-no-check_rundir.patch: stop (re)linking executables + at install time + + -- Michael Tokarev Sat, 04 Oct 2014 20:38:36 +0400 + +mdadm (3.3.2-1) unstable; urgency=low + + * new minor/bugfix upstream release (Closes: #731884, #763080) + * removed remove-bashism-from-makefile.patch (applied upstream) + * copy 64-md-raid-assembly.rules to initramfs too, this should + bring us array auto-assemble during initramfs run (Closes: #678691) + * denote inability to initialize md subsystem in local-top script + from fatal error to warning (Closes: #733574) + * create /etc/modprobe.d/mdadm.conf to set start_ro=1 there + instead of doing it in the initramfs + * use install -D in d/rules consistently and remove dirs from d/mdadm.dirs + * remove references to old (2.x and 1.x) mdadm versions (Closes: #728804) + * remove references to very old (pre-wheeze) breaks/replaces/depends + * remove support of makedev + * reformat mdadm-raid initscript a bit + * remove rebuilding-raid.html (had invisible refs) and + RAID5_versus_RAID10.txt docs. Maybe it's a good idea to remove + other docs shipped with the package, since these becoming stale + and these days, better alternatives exist online + * removed debian-specific mdadm-startall script which does nothing more than + "mdadm --assemble --scan". Remove docbook-to-man from build-deps. 
+ * pass dpkg-buildflags to upstream build system and + stop using our own -O0, -Os etc + * remove separate CFLAGS for udeb and simplify *FLAGS handling + * bump Standards-Version to 3.9.5 (no changes needed) + * fix VCS links to point to anonscm.d.o + + -- Michael Tokarev Wed, 01 Oct 2014 15:09:07 +0400 + +mdadm (3.3-2) unstable; urgency=low + + * use 63-md-raid-arrays.rules instead of old 64-md-raid.rules + (Closes: #726237) + * do not use builtin blkid in udev rules, as our udev (at least + on wheezy) does not have it (use-external-blkid.diff) + + -- Michael Tokarev Mon, 14 Oct 2013 15:49:54 +0400 + +mdadm (3.3-1) unstable; urgency=low + + [ Michael Tokarev ] + * new upstream 3.3 release (Closes: #718896, #588965, #599352, #694513) + See ANNOUNCE-3.3 for details. + Patches: + - refreshed debian-conffile-location.diff + (added .conf.d) + - removed debian-disable-udev-incr-assembly.diff + (do not ship udev-md-raid-assembly.rules for now) + - refreshed debian-no-Werror.diff + - refreshed sha1-includes.diff + - removed patches (included upstream)A: + spelling-and-manpages.patch + fix-enough-function-for-RAID10.patch + fix-segfaults-in-detail.patch + super0-do-not-override-uuid-with-homehost.patch + mdmon-allow-takeover-when-original-was-started-with-.patch + mdmon-fix-arg-parsing.patch + mdmon-fix-arg-processing-for-a.path + Install udev-md-raid-arrays.rules instead of udev-md-raid.rules, + don't install new udev-md-raid-assembly.rules for now. + * remove Martin F. Krafft from uploaders per his request. + Thank you for your contributions! + * added remove-bashism-from-makefile.patch patch to work around + newly introduced bashism + * remove debian/source/options, there's no need to set compression + options for debian.tar.gz. 
+ * remove outdated debian/docs/md_superblock_formats.txt and + debian/docs/md.txt (Closes: #714977, #714978) + * ship ANNOUNCE-*, external-reshape-design.txt, mdmon-design.txt + files as documentation (Closes: #715324) + + [ Dmitrijs Ledkovs ] + * Properly remove 65-mdadm.vol_id.rules, instead of trying to remove a + never-existed 65_mdadm.vol_id.rules (note the 65- vs 65_). + + -- Michael Tokarev Fri, 11 Oct 2013 10:12:47 +0400 + +mdadm (3.2.5-6) unstable; urgency=low + + * replace home-grown and not-working-since-etch udevsettle call + in initramfs script with proper wait_for_udev function (from + common initramfs functions). This unbreaks situations when + the underlying device needs some udev magic to happen before + being available, which includes stacked devices (md on lvm) + and other cases. Thanks to Thomas Parmelan and Dave Whitla + for finding the root cause of breakage and for providing + the fix. (Closes: #644876) + + -- Michael Tokarev Tue, 05 Mar 2013 13:32:21 +0400 + +mdadm (3.2.5-5) unstable; urgency=low + + * add (empty) restart and force-reload actions to mdadm-waitidle + script -- this script is used only when the system needs to be + shut down or rebooted, there's nothing to start or restart. + + -- Michael Tokarev Thu, 24 Jan 2013 17:04:49 +0400 + +mdadm (3.2.5-4+mdmon) experimental; urgency=low + + * fix `/etc/init.d/mdadm-raid status' inverse logic (Closes: #686100) + * /etc/init.d/mdadm: change RUNDIR to /run instead of /var/run. + Mdadm itself uses /run internally, we properly depend on initscripts + version which creates /run, and the initscript itself is started + after local_fs is processed, so this is merely a no-op, but let's + do it for consistency. + * Fix 'enough' function for RAID10, to prevent starting of a RAID10 + array which does not have required minimum of component devices. + (Closes: #691668). 
+ * fix segfaults in Detail() - mdadm --detail may segfault if a drive + has been removed from the array (Closes: #691670) + * super0: do not override uuid with homehost. The bug prevented + re-creating an array with v0.90 superblock with the specified uuid + when homehost is also specified. (Closes: #686703) + * several fixes for mdmon argument processing (Closes: #691671): + - allow --takeover when original was started with --offroot + - fix arg parsing. + - fix arg processing for -a + * Changes based on a patch by Miquel van Smoorenburg (Closes: #684708): + - install mdmon in udeb and initramfs, so imsm arrays can work. + mdadm runs mdmon automatically when needed (currently for imsm + arrays), and mdmon is required to make such arrays read-write + (they're read-only by default) so merely presence of mdmon is + enough to be able to use imsm arrays. + - /etc/init.d/mdadm start: if a mdmon pidfile is found in /run/mdadm, + restart mdmon (--takeover --all) + - /etc/init.d/mdadm stop: link pidfiles of mdmon processes into + /run/sendsigs.omit.d, and make sure that happens before sendsigs runs. + - stop mdadm before sendsigs, so that the above code works + - add script mdadm-waitidle that runs just before reboot/halt. + For each array that is still running, it sets sync_action to idle, + and uses mdadm --wait-clean to wait for all arrays to go idle + (it has a short timeout). 
+ + -- Michael Tokarev Sat, 20 Oct 2012 19:20:12 +0400 + +mdadm (3.2.5-3) unstable; urgency=low + + * revert "Drop unused debconf templates" change -- the templates + are actually being (indirectly) used in debian/mdadm.config + * fix initramfs-script config name (slipped in the initramfs fixes) + + -- Michael Tokarev Sat, 25 Aug 2012 23:12:50 +0400 + +mdadm (3.2.5-2) unstable; urgency=low + + [ Sergey B Kirpichev ] + * Fix mdadm.lintian-overrides + * Fix spelling in binaries, fix lintian warnings + manpage-has-errors-from-man and hyphen-used-as-minus-sign + * Drop unused debconf templates + * Implement status option for mdadm-raid init.d script + * Fix lintian info's conflicts-with-version: Conflicts -> Breaks + + [ Michael Tokarev ] + * fix spelling mistakes in previous changelog entry + * some cleanups for checkarray: + - change --help printing and shorten/simplify the text + - make --quiet cumulative and stop documenting --real-quiet + - do not produce help in case of incorrect usage, and exit with 1 + * fixes for initramfs integration (Closes: #644389, #678262, #685161): + - check INITRDSTART=none early + - do not explicitly load raid level modules (modprobe/kmod does this) + - do not collect needed raid levels (we include all modules anyway) + - load md_mod explicitly since we need to change global parameter + + [ Dmitrijs Ledkovs ] + * Use dh_installinit with --no-restart-on-upgrade, which will start + arrays, but will not stop them during upgrade. (Closes: 678971) + * Copy local administrator's modified udev rules as well as the system + one. 
(Closes: #678973) + + -- Michael Tokarev Sat, 25 Aug 2012 16:25:37 +0400 + +mdadm (3.2.5-1) unstable; urgency=low + + [ Michael Tokarev ] + * new upstream (bugfix) release, fixing regression when --add'ing + device to an array, introduced in 3.2.4, plus other minor fixes + (Closes: #673104, #673344) + * new patch: sha1-includes.diff to fix #include mess in new sha1.h + * added a check into debian/checkarray to skip checking arrays created + less than 2 weeks ago (Closes: #624273) + + [ Dmitrijs Ledkovs ] + * Remove obsolete documentation dating back to ~etch release + * Remove reference to obsolete documentation from debconf templates + * Update debconf templates translations + * Remove compatibility with ancient initramfs-tools + * Remove debian-specific mdadm-startall.8 in clean target + + -- Michael Tokarev Fri, 25 May 2012 20:23:52 +0400 + +mdadm (3.2.4-1) unstable; urgency=low + + * new upstream (bugfix) release (Closes: #664088, #661552) + * removed debian-run-udev.diff (applied upstream), and + all RUNDIR handling from debian/rules (it is the default now) + * add build-arch and build-indep targets to debian/rules, and + bump Standards-Version to 3.9.3 + + -- Michael Tokarev Thu, 10 May 2012 17:51:41 +0400 + +mdadm (3.2.3-3) unstable; urgency=low + + * switch from topgit to plain 3.0 (quilt) layout, creating + debian/patches. Don't build-depend on quilt as patching + is done automatically by dpkg-source. + * debian/patches/debian-run-udev.diff by Roger Leigh (Closes: #644319, #627774) + * update debian/mdadm.logcheck.ignore.server to recognize "k" in + addition of "blocks" in kernel messages. Thanks to Frédéric Brière + for the patch (Closes: #656038) + + -- Michael Tokarev Mon, 30 Apr 2012 14:12:38 +0400 + +mdadm (3.2.3-2) unstable; urgency=low + + [ Michael Tokarev ] + * new upstream bugfix/stable version, with lots of fixes all over. 
+ Closes: #641886, #628667, #645563, #651880, #607375, #633880 + * update Neil's email (Closes: #650630) + * update mdadd.sh to version 1.52 (Closes: #655212) + * fixed a typo (RAID6 vs RAID10) in FAQ (Closes: #637068) + * declare ordering dependency for multipath-tools-boot in + mdadm-raid init script (Closes: #641584) + While at it, remove mention of devfsd + * added Slovak (sk.po) po-debconf translation from Slavko + (Closes: #641972) + * set nice value of the check/resync thread too, together with I/O + scheduling class, based on patch by Sergey B Kirpichev (Closes: #652547) + * small changes for debian/checkarray + * (internal) move files from contrib/* topgit branches into debian directory + * remove dh_testroot from clean target + * add myself to uploaders + + [ Peter Eisentraut ] + * Added support for "status" action to mdadm init script (Closes: #651737) + + -- Michael Tokarev Wed, 18 Jan 2012 22:33:01 +0400 + +mdadm (3.2.2-1) unstable; urgency=low + + [ martin f. krafft ] + * New upstream version (closes: #615494), which addresses: + - --grow option aware of size change of underlying device (closes: #618463) + - builds with -Werror on gcc-4.6 (closes: #625392). + - new features/functionality: + + Policy framework. + + Improved management of reshaping arrays. + + Support for Intel Matrix Storage Manager (IMSM) + Thanks to Scott Schaefer for his help in preparing this release! + * Do not print io rescheduling info message when run by cron + (closes: #598957). + * Fix checkarray script so that it does not die after scheduling the first + device when there is no scheduling class specified; thanks to Mario + 'BitKoenig' Holbe (closes: #611627). + * Do not include DEVICE line in generated configs but use the built-in value + by default (closes: #604702). + * Make mdadm-raid init script depend on hostname; thanks to Mario + 'BitKoenig' Holbe (closes: #610421). 
+ * Schedule start/stop of mdadm-raid before/after filesystems are + checked&mounted/unmounted; thanks to Mario 'BitKoenig' Holbe + (closes: #611632). + * Work around a shell coding bug for cases when there are zero active + devices (closes: #618561). + * Add NEWS entry about metadata change and requirement on new Grub version + (closes: #595516). + * Update mdadd.sh script from Arno's webpage. + * Update md.txt from latest kernel source. + * Updated debconf translations: + - Catalan by Innocent De Marchi (closes: #628371) + - Danish by Joe Dalton (closes: #621346) + * Small typo fix in RAID5_versus_RAID10.txt + * Bump Standards-Version to 3.9.2; no changes required. + + [ Michael Tokarev ] + * don't print W: auto-read-only in checkarray in quiet mode, + thanks to Bernd Hanisch for the patch (Closes: #605722) + * move initscript metadata from /lib/init/rw/.mdadm to /run/mdadm, + and depend on initscripts (>= 2.88dsf-13.3) for /run (Closes: #633054) + * move runtime stuff from /dev/.mdadm to /run/mdadm + * document defaults change in debian/NEWS (it was forgotten to be + done for squeeze) - metadata & chunk size (Closes: #595516) + + -- martin f. krafft Mon, 01 Aug 2011 10:41:41 +0200 + +mdadm (3.1.4-1+8efb9d1) unstable; urgency=low + + * Added patch with Makefile fix from upstream (commit 8efb9d1) to fix + compiler/linker problem on non-x86 architectures (closes: #595290). + + -- martin f. krafft Fri, 03 Sep 2010 10:45:01 +0200 + +mdadm (3.1.4-1) unstable; urgency=low + + * New upstream release, which closes:#595039 and addresses the following + issues too: + - reverts move of incremental map (closes: #585015). + - fixes mdadm monitor in the case of an inactive (or start-failed) raid0 + or linear array (closes: #539154). + - prevent --remove faulty from skipping renumbered devices + (closes: #587550). + - fixed overflow when growing a RAID6 (closes: #589493). 
+ * However, disable the incremental assembly upstream turned on in 3.1.3 for + now, this will have to wait until after the squeeze release. + * initramfs/hook: make sure configuration file exists before accessing it; + thanks to Michael Prokop for the fix and NMU (closes: #589836). + * initramfs/hook: Match UUID case-insensitive while checking for running + arrays not listed in mdadm.conf; thanks to Mario 'BitKoenig' Holbe for the + patch (closes: #583545). + * Fix URL in the bug reporting preamble (presubj) (closes: #589833). + * Add I/O rescheduling functionality to the checkarray script and make the + cronjob use the idle priority; this should now minimise the impact of the + monthly re-check on the running system; Florian Heigl had the idea + (closes: #592149, #508123). + + -- martin f. krafft Sun, 29 Aug 2010 13:44:59 +0200 + +mdadm (3.1.2-2) unstable; urgency=low + + * Fix logcheck regexp to cure "egrep: Unmatched [ or [^" message + (closes: #583376). + * Cherry-pick 94fcb80 from upstream to fix compiler error due to argument + type error (at least on ia64, sparc, powerpc) (closes: #583495). + + -- martin f. krafft Fri, 28 May 2010 09:35:42 +0200 + +mdadm (3.1.2-1) unstable; urgency=low + + * New upstream release (closes: #567167). + * Ignore lintian error about not stopping in runlevel 1. + * Include more information about the configuration, initrd, and LVM in + bugscript output. + * Check active devices against configuration file based on UUID, not device + name (closes: #553896). + * When preparing the list of devices, treat /dev/mdX and /dev/md/X equally. + * Bump Standards-Version to 3.8.4 without having to make changes. + * Fix logcheck rule with patch from Frédéric Brière, since Rebuild events + are now arbitrary, no longer multiples of 20 (closes: #570315). + * checkarray: do not exit non-zero when there are no arrays found (see + #582360). + * Do not exit checkarray cronjob non-zero when the tests whether to run + fail (closes: #580825). 
This is related to a recent change in behaviour in + cron 3.0pl1-110 (see e.g. #581612). Thanks to everyone for their + suggestions! + * Call MAKEDEV to create device nodes only if MAKEDEV is installed + (closes: #569360). + + -- martin f. krafft Thu, 27 May 2010 09:34:24 +0200 + +mdadm (3.1.1-1) unstable; urgency=low + + * New upstream release. + * Retire fixed/blkid-dev branch. + * Reword warning about unbootable system when mdadm is purged + (closes: #544558). + * Updated FAQ to include s2ram as one of the reasons that can prevent an + array from being stopped; thanks to Pascal Hambourg for writing in. + * Install udev rules into udeb package (closes: #558823). + * Update mdadd.sh (formerly /usr/share/doc/mdadm/examples/newdisk.gz) from + Arno's website, refactoring the script into its own branch/patch + (closes: #539103). + * Do not single-quote homehost in initramfs script (closes: #549083). + + -- martin f. krafft Wed, 27 Jan 2010 10:14:25 +1300 + +mdadm (3.0.3-2) unstable; urgency=low + + * Bumped Standards-Version to 3.8.3 without having to make changes. + * Fixed init dependencies of mdadm daemon init.d script; thanks Petter + Reinholdtsen (closes: #541396). + * Switched source package to v3-quilt format. + + -- martin f. krafft Fri, 06 Nov 2009 10:06:03 +0100 + +mdadm (3.0.3-1) unstable; urgency=low + + * New upstream release. + * Acknowledge 3.0-3.1 NMU by Christian Kujau (patch by Marco d'Itri) + (closes: #541884), and add util-linux dependency. + * Copy udev rules into initramfs, which udev stopped doing + (closes: #549535, #549083, #538843, #538143). + * Fix the bug script to write debug information to the correct file + descriptor (closes: #537734). + * Switch to TopGit and split Debian diff into topic branches; add + README.source. + * Drop the outdated rootraiddoc.97.html document. + + -- martin f. krafft Tue, 27 Oct 2009 18:06:13 +0100 + +mdadm (3.0-3.1) unstable; urgency=medium + + * Non-maintainer upload. 
+ * use blkid instead of vold_id in udev-md-raid.rules (closes: #541884) + + -- christian kujau Mon, 14 Sep 2009 10:15:21 +0200 + +mdadm (3.0-2) unstable; urgency=low + + * Fixed initramfs script with patch from Steffen Hau: it was still using + --auto-update-homehost, which has been removed and replaced by a better + heuristic: arrays created for a different "homehost" will now be + assembled read-only, rather than shoehorned into the system with + --auto-update-homehost (closes: #537820). + * Add version stamps to bugscript and mkconf scripts to facilitate + debugging. + + -- martin f. krafft Tue, 21 Jul 2009 10:33:30 +0200 + +mdadm (3.0-1) unstable; urgency=low + + * New stable upstream release. + * Add information about udev and device links in /dev to bugscript output. + * Add pointer to FAQ and in particular rootdelay to the bug script + pre-subject file, which is displayed by tools like reportbug and thus + hopefully reduces the numbers of duplicated bugs. + * Patch from Frédéric Brière to make logcheck rules printk_time aware + (closes: #537460). + * Updated German translation due to typos and old spelling rules; thanks to + Helge Kreutzmann for the patch (closes: #534663). + * Bumped Standard-Version to 3.8.2; no changes necessary. + + -- martin f. krafft Mon, 20 Jul 2009 16:12:41 +0200 + +mdadm (3.0~devel3-43-g2800528-1) experimental; urgency=low + + * Merge tip of upstream's devel-3.0 branch at commit 2800528. + * Drop our own udev rules in favour of upstream's. If + /etc/udev/rules.d/65_mdadm.vol_id.rules has not been modified (md5sum + check), it is automatically removed; else, a warning is emitted. + * Add information about udev and device links in /dev to bugscript output. + + -- martin f. krafft Tue, 05 May 2009 15:10:46 +0200 + +mdadm (2.6.9-3) unstable; urgency=low + + * Fix the multipath prereq patch (#516605) and make it exit after printing + the prerequisites (closes: #526793). 
+ * Change my previous recommendation for postfix over to the new virtual + package default-mta (see #522300 and #508644). + * Enhance bugscript, which now asks to run as root (sudo/su) if invoked by + a normal user. + * Include MD5 sums of md-related files in initrd in bug reports. + * Add grub2 information retrieval to bugscript. + * Trap SIGINT and thus prevent ctrl-c from terminating the bugscript + prematurely. + + -- martin f. krafft Tue, 05 May 2009 11:46:22 +0200 + +mdadm (3.0~devel3-1) experimental; urgency=low + + * Initial release of DEVELOPMENT BRANCH 3.0 to experimental. + + -- martin f. krafft Thu, 30 Apr 2009 11:51:39 +0200 + +mdadm (2.6.9-2) unstable; urgency=low + + * Fix the check of whether mdadm.conf defines all devices known to the + system; thanks Cristian Ionescu-Idbohrn (closes: #525655). + * No longer pass -k to modprobe, which has been deprecated for a long time; + thanks to Jan Hudec (closes: #519999). + * Remove Mario Joußen from the uploaders list, since his email started + bouncing. + * Prepare mdadm source to use quilt, with the long-term goal to switch to + TopGit, once I find the time. + * Cherry-pick caa0f6c & 667e66d from Neil into a quilt patch to fix gcc-4.4 + compiler issues (closes: #505375). + + -- martin f. krafft Sun, 26 Apr 2009 16:08:28 +0200 + +mdadm (2.6.9-1) unstable; urgency=low + + * New upstream release. + * Do not set -eu in the bugscript to maximise information output in the case + of errors. + * Make initramfs script depend on multipath to ensure its script is run + before ours (closes: #516605). + * Provide an alternative (postfix) for mail-transport-agent (closes: + #522300). I chose postfix because that's the only one I could recommend, + and since the alternative does not affect people who already have an MTA + installed, or have a preference, it won't affect them. + * Honour debconf pre-selection of mdadm/initrdstart (closes: #516802). 
+ * Incorporate patch from Adrian Bridgett: the initramfs hook now checks to + see if all known arrays are listed in mdadm.conf and issues a warning if + this is not the case (closes: #519328). + * Make checkarray skip over arrays still marked auto-read-only + (closes: #510641). + * Add cron.daily snippet from Paul Slootman to run one-shot scans every day + to ensure that failed arrays don't go unnoticed (closes: #497949). + * Bumped Standards-Version to 3.8.1; no changes necessary. + + -- martin f. krafft Sat, 25 Apr 2009 19:04:47 +0200 + +mdadm (2.6.8-12-gb47dff6-2) unstable; urgency=low + + * Brown paper bag release: I built from the wrong branch which caused some + Debian-specific changes not to get into the package. This build fixes it. + + -- martin f. krafft Mon, 16 Feb 2009 12:15:37 +0100 + +mdadm (2.6.8-12-gb47dff6-1) unstable; urgency=low + + * New upstream release. + - better checks asprintf() return codes, thanks to patch from Dustin + Kirkland (closes: #509167). + * Fix start/stop runlevels in header of mdadm monitor init.d script + (closes: #514923). + * Use modprobe -q instead of --syslog from initramfs (closes: #502988). + + -- martin f. krafft Mon, 16 Feb 2009 11:07:18 +0100 + +mdadm (2.6.7.2-1) unstable; urgency=low + + * New upstream release, created for Debian lenny: + - fixes assembly of arrays that are being reshaped (closes: #512475) + - this bug was also responsible for other assembly problems + (closes: #498505, #499643, #496334) + Again, many thanks to Neil Brown for being such an awesome upstream. + + * Documentation updates: + - Actually install David Pashley's blog post added in 2.6.7.1-1, and + register it with doc-base. + - Update md.txt to version 2.6.26 (the lenny kernel). + - Add a dump of a website detailing md superblock formats. + - Register FAQ, md.txt, RAID5-vs-RAID10, README.recipes with doc-base + - Cherry-picked UID/UUID typo in mdadm.conf(5) manpage fix (commit + 0e69da7) (closes: #506245). 
+ + * Added Italian debconf translation; thanks Luca Monducci (closes: #506572). + + -- martin f. krafft Tue, 03 Feb 2009 21:28:34 +0100 + +mdadm (2.6.7.1-1) unstable; urgency=low + + * New upstream release, specifically created for Debian lenny to fix the RC + bugs, which + - fixes typo in forced assembly code (closes: #496334, #499643, #498505). + - fixes array component size detection (closes: 500309). + Thanks Neil Brown, mdadm upstream: you are spoiling me. :) + + * Minor fixes to documentation: + - Add David Pashley's Rebuilding RAID blog post. + - Add new (bugfix) version 1.40 of the newdisk script + (/usr/share/doc/mdadm/examples/newdisk.gz) (closes: #490955). + - Add link to Warren Togami's writeup about remote RAID-1 conversion to + README.recipes. + - Fix probability of survival in FAQ 4b, since I erroneously labeled the + chance of failure as the chance of survival; thanks to Per Olofssen for + clarification (closes: #493577). + - Cherry-picked 6d6de2e from Neil, which adds HOMEHOST to the manpage and + closes: #489257. + + -- martin f. krafft Wed, 15 Oct 2008 10:27:23 +0200 + +mdadm (2.6.7-3.1) unstable; urgency=low + + * Non-maintainer upload with permission. + * Ask mdadm/initrdstart at medium priority in chrooted environment. + (Closes: #493099) + * Update Swedish debconf translations. Thanks Martin Ågren. + (Closes: #492074) + + -- Jérémy Bobbio Mon, 25 Aug 2008 22:28:53 +0200 + +mdadm (2.6.7-3) unstable; urgency=low + + * Correct credits in the NEWS file: Suse authored the udev rules, not Ubuntu + (who use the same file without credit). + + -- martin f. krafft Sat, 05 Jul 2008 12:22:58 +0200 + +mdadm (2.6.7-2) unstable; urgency=low + + * Remove mdrun completely. + * Import udev vol_id handling from Suse, and thus finally export + /dev/disk/by-id/* and /dev/disk/by-uuid symlinks (closes: #435983). + * Update upstream URLs in copyright and watch file (closes: #488364). + * Bump Standards-Version to 3.8.0; no changes required. 
+ * Add mdadm homepage link to debian/control. + + -- martin f. krafft Wed, 02 Jul 2008 11:13:18 +0200 + +mdadm (2.6.7-1) unstable; urgency=low + + * New upstream version, which + - fixes a segfault when reading /proc/mdstat (closes: #462154). + - fixes a possible bug with v1 bitmap space allocation (closes: #474548). + - supports large files for loop assembly (closes: #463769). + * Moved mdadm-raid init.d script to position S60 (from S50) for runlevels + 0 and 6, so that arrays get stopped after cryptdisks-early; thanks to + J.M.Roth (closes: #486012). + * Fixed a typo in checkarray; thanks to Helmut Grohne (closes: #445540). + * Updated debconf translations: + - Spanish; thanks to Javier Fernández-Sanguino (closes: #477920). + - Basque; thanks to Piarres Beobide (closes: #478676). + * Updated logcheck rule so that it matches mdadm log entries with and without + PID (2.6.5 introduces PIDs in the messages). + + -- martin f. krafft Wed, 25 Jun 2008 17:31:15 +0200 + +mdadm (2.6.4-2) unstable; urgency=low + + * Adjusted logcheck rules to follow kernel changes; thanks to Frédéric + Brière (closes: #462478). + * Debconf templates and debian/control reviewed by the debian-l10n- + english team as part of the Smith review project. Closes: #463673 + * Debconf translation updates (thanks to Christian Perrier for compiling + them): + * Japanese. Closes: #464438 + * Galician. Closes: #464454 + * French. Closes: #465984 + * Czech. Closes: #466306 + * Dutch. Closes: #466543 + * Russian. Closes: #466577 + * Portuguese. Closes: #466794 + * German. Closes: #466989 + * Vietnamese. Closes: #467118 + * New debconf translations: + * Finnish. Closes: #468048 + * Fixed bashism in mdadm-raid init.d script; thanks to Raphael Geisser + (closes: #471874). + * Do not output warning when run from cron and no arrays are found + (closes: #474542). + * Add doc-base registration file; thanks to Roberto C. Sanchez + (closes: #451684). 
+ * Reschedule "mdadm Sunday" to 00:57 instead of 01:06 to prevent double + invocation on DST change (closes: #449244). + * Bump DH compatibility level to 6; no changes required. + + -- martin f. krafft Fri, 11 Apr 2008 10:48:45 +0200 + +mdadm (2.6.4-1) unstable; urgency=low + + * New upstream release. + * Apply patch by Petter Reinholdtsen to fix dependency loop in + init.d script (closes: #460256). + * Bump Standards-Version to 3.7.3; no changes needed. + + -- martin f. krafft Mon, 14 Jan 2008 12:47:14 +0100 + +mdadm (2.6.3+200709292116+4450e59-3) unstable; urgency=low + + * Patch by Jérémy Bobbio which completes the fix for #444682. + + -- martin f. krafft Mon, 01 Oct 2007 16:16:19 +0100 + +mdadm (2.6.3+200709292116+4450e59-2) unstable; urgency=low + + * Patch the routine loading v1 superblocks to fix a segfault on amd64 + (closes: #444682). + + -- martin f. krafft Sun, 30 Sep 2007 14:10:41 +0100 + +mdadm (2.6.3+200709292116+4450e59-1) unstable; urgency=low + + * New merge from upstream @4450e59ffaf75623fa4261e244b0717a7463aa84 + - makes "--write-mostly" effective when re-adding a device to an array. + (closes: #442874). + * Do not call update-initramfs -k all, it can be set via + /etc/initramfs-tools/update-initramfs.conf (closes: #439334). + * Depend on udev|makedev instead of just makedev and invoke /dev/MAKEDEV, + not /sbin/MAKEDEV (closes: #436998). + * De-escalate the initramfs hook warning about an "emergency procedure" and + simply note that this involves initramfs assembling arrays it finds + automatically at boot. Also added a FAQ entry on how to turn off the init + script warning when no arrays are found (closes: #434934). + * Add --scan to the single-device-assembly-codepath in the initramfs + local-top script; thanks to Mario 'BitKoenig' Holbe for the patch + (closes: #440703). + * Use short option in initramfs script mkdir call, so make it klibc-utils + compatible; thanks maximilian attems for the patch (closes: #443436). 
+ * Handle module name synonyms in initramfs hook script (closes: #432585). + + -- martin f. krafft Sat, 29 Sep 2007 21:21:25 +0100 + +mdadm (2.6.2-2) unstable; urgency=low + + * Fix typos in md(4) manpage; thanks Jeroen (closes: #425576). + * Make init script not report failure when there are no arrays defined in + config file. + * Add /usr/share/doc/mdadm/examples/newdisk, a script to integrate + a replacement disk into an existing array with minimal effort; will remain + in examples/ until I had a chance to really test and understand it. Thanks + to Arno van Amersfoort (closes: #427880). + * Does some sanity checking for proper format of level= arguments in + mdadm.conf and bails if an error is found. Since the RAID levels are used + verbatim as module names, a discrepancy might lead to an unusable system. + Instead, we thus use the emergency fallback. Thanks to Andrew + Sackville-West for spotting this and helping me with the fix. + + -- martin f. krafft Tue, 10 Jul 2007 09:59:45 +0200 + +mdadm (2.6.2-1) unstable; urgency=low + + * New upstream release + - new options --fail detach and --remove faulty can be used to fail and + remove devices that are no longer physically present (closes: #416512). + - --help output now goes to stdout (closes: #416653). + - plenty of manpage fixes, thanks Peter Samuelson (closes: #414688). + * Incorporated patch by Mikko Rasa to fix detecting of raid6 (and raid10) + devices with --scan (closes: #421915). + + -- martin f. krafft Mon, 21 May 2007 14:25:43 +0200 + +mdadm (2.6.1-1) unstable; urgency=low + + * Release to unstable. + * Start arrays read-only in initramfs to prevent syncing and hence enable + resuming/freezing. The arrays will automatically sync as soon something + writes to it; thanks to Tim Dijkstra, Neil Brown, and Luis Rodrigo + Gallardo Cruz (closes: #415441). + * mkconf now tries to preserve existing values for DEVICE, CREATE and + HOMEHOST (in addition to MAILADDR, which it preserved previously already). 
+ PROGRAM is preserved but only added to mdadm.conf if it occurred in the + previously existing configuration file. + * startall is now mdadm-startall and lives in /sbin, thanks to Eduard Bloch. + It now can handle existing mdadm.conf files much more gracefully, mostly + thanks to the above mkconf enhancements (closes: #415336). + + -- martin f. krafft Sat, 05 May 2007 16:12:29 +0200 + +mdadm (2.6.1-1~exp.5) experimental; urgency=low + + * Fix mdadm.conf typo; thanks Tim Phipps (closes: #416626). + * Execute udevtrigger after assembly of arrays during initramfs processing; + thanks to Michael Prokop (closes: #416658). + + -- martin f. krafft Sat, 31 Mar 2007 12:12:27 +0200 + +mdadm (2.6.1-1~exp.4) experimental; urgency=low + + * Incorporate patches by Peter Samuelson fixing several typography as well + as typesetting issues in the manpages. Thanks a lot! (closes: #414688) + + -- martin f. krafft Tue, 13 Mar 2007 19:26:37 +0100 + +mdadm (2.6.1-1~exp.3) experimental; urgency=low + + * Patch by Jørn V. Christensen to properly handle multiple email addresses + for the MAILADDR setting (closes: #413330). + + -- martin f. krafft Mon, 5 Mar 2007 11:19:18 +0000 + +mdadm (2.6.1-1~exp.2) experimental; urgency=low + + * Updated debconf translations: + - Galician by Jacobo Tarrio (closes: #412203). + + -- martin f. krafft Sat, 24 Feb 2007 16:41:16 +0100 + +mdadm (2.6.1-1~exp.1) experimental; urgency=low + + * New upstream release, targeted at experimental until etch is out: + - adds --syslog option (closes: #402457). + - now can --wait for sync activity to finish (closes: #328197). + - for other changes, please see /usr/share/doc/mdadm/changelog.gz + * Updated debconf translations: + - Portuguese by Rui Branco (closes: #411745). + + -- martin f. krafft Thu, 22 Feb 2007 16:35:16 +0100 + +mdadm (2.5.6-9) UNRELEASED; urgency=low + + * More logcheck filters to prevent redundant information logged by mdadm + --syslog. 
+ * Improved mdadm-raid init script to correctly output status information for + drives that are initialising or adding spares. + + -- martin f. krafft Thu, 11 Jan 2007 16:05:12 +0100 + +mdadm (2.5.6-8) unstable; urgency=low + + * Hard-code path to /sbin/mdadm binary rather than searching the $PATH. This + closes: #403307 and should be a little more robust in the presence of + installations of mdadm in /usr/local. + * Made the bugscript a little more failure-resilient. + * Added more documentation. + + -- martin f. krafft Mon, 8 Jan 2007 02:04:25 +0100 + +mdadm (2.5.6-7) unstable; urgency=low + + * Only parse ARRAY lines from configuration file when collecting the array + pairs. Thanks to Daniel Dehennin for the bug report and suggested fix + (closes: #402106). + * Prevent modules from being loaded during initramfs time if no arrays are + to be assembled at this stage. + * Export DAEMON_OPTIONS to /etc/default/mdadm, which gets passed to the + mdadm daemon on startup from the init.d script. The value is set to + -y/--syslog by default, and will get incorporated into debconf in a future + version; thanks for the idea by Alex Owen (closes: #401696). + * Incorporate patch by upstream to fix handling of --syslog long option + (closes: #402457). + * Added logcheck filters for new syslog entries by mdadm monitoring daemon. + * Added Spanish debconf translation by Javier Fernández-Sanguino + (closes: #402681). + + -- martin f. krafft Tue, 12 Dec 2006 11:49:52 +0100 + +mdadm (2.5.6-6) unstable; urgency=medium + + * Fixed a typo in the debconf control script which would cause failures with + a dash shell; thanks to Santiago Garcia Mantinan for reporting this, and + Andreas Metzeler for providing a solution (closes: #399315). + + -- martin f. krafft Mon, 20 Nov 2006 15:02:34 +0800 + +mdadm (2.5.6-5) unstable; urgency=low + + * Prevent initramfs hook from exiting prematurely when VERBOSE=false. + * Moved debconf question about arrays to start by initramfs to high + priority. 
+ + -- martin f. krafft Mon, 13 Nov 2006 11:24:21 +0100 + +mdadm (2.5.6-4) unstable; urgency=low + + * Actually remove mdadm.conf on purge; previously, the generation of + a temporary file for initramfs would screw up the purging; thanks to + Fabrice Lorrain for the report (closes: #398088). + + -- martin f. krafft Sat, 11 Nov 2006 20:07:55 +0100 + +mdadm (2.5.6-3) unstable; urgency=low + + * Fix a syntax error in mdadm-raid script. + + -- martin f. krafft Thu, 9 Nov 2006 15:47:51 +0100 + +mdadm (2.5.6-2) unstable; urgency=low + + * Small fixes to mkconf, now returns 255 instead of -1. + * Added /usr/share/mdadm/startall, a helper script to facilitate starting + all arrays when booting from rescue/live media. It overrides AUTOSTART in + /etc/default/mdadm and starts arrays even if the variable is set to + a false value. + + -- martin f. krafft Thu, 9 Nov 2006 14:44:35 +0100 + +mdadm (2.5.6-1) unstable; urgency=low + + * New upstream release: + - added note to mdadm(8)/--metadata about overriding the default in + mdadm.conf (closes: #396914). + - fixed problems that could cause infinite loop with auto assemble. Thanks + to Dan Pascu for pointing this out (closes: #396582). + - fixed problems with bitmap file names lost after reading from + configuration file. + * Merged patch by Dan Pascu to nicely handle situations where a degraded + array only has one drive left nicely by the mdadm-raid script. + * Updated Japanese debconf translation; thanks to Hideki Yamane + (closes: #396400). + + -- martin f. krafft Thu, 9 Nov 2006 00:47:45 +0100 + +mdadm (2.5.5-1) unstable; urgency=low + + * New upstream release: + - fixes the build problems on several architectures (closes: #393314) by + including the contents of linux/blkpg.h literally, not via #include. + - optimises bitmap file use on 64bit systems. + - does not error out anymore when trying to assemble an already assembled + array without a corresponding /dev device node. 
+ - does not report an error if --assemble --scan only finds already running + arrays. + - fixes several bugs related to RAID10 and the new offset layout. + - improves error message when a wrong '--update' option is given. + * Added FAQ entries about partitionable arrays. + * chroot detection now also works for 2.6.18 and beyond (c.f. kernel commit + 778c1144771f0064b6f51bee865cceb0d996f2f9). + * Now recommends module-init-tools. + * Hides ugly errors during configuration in the absence of module-init-tools + or initramfs-tools. + * Send udev events for arrays assembled by the mdadm-raid init.d script. + This does not close #394193 but it's a good addition anyway. I am not + sending these events from the initramfs as well because it would be + non-trivial to ensure that an event doesn't get sent twice for a given + array. + Anyway, this is all a hack until the kernel sends online/offline events to + udev. See #394193. + * Added more RAID10 information to the FAQ. + * Added filters to logcheck for regular events, even by the md driver; also + promoted messages about non-fresh components to security events. + * Hide informational messages unless VERBOSE is set to a true value in + /etc/default/mdadm. + + -- martin f. krafft Thu, 26 Oct 2006 22:35:24 +0200 + +mdadm (2.5.4-1) unstable; urgency=low + + * New upstream release: + - --examine now reports chunk size also for RAID6 and RAID10 + - fix endianness issues with v1 superblocks (closes: #385726) and bitmap + metadata. + - improved message when mdadm detects similar superblocks + (closes: #385951). + - documents that the automatic update of the super-minor field in the + superblock when using a 2.6 kernel only applies to RAID levels 1 and + higher. RAID0 array superblocks must be manually updated + (closes: #386315, #388172). + - removes partition table from any whole device added to an array. 
+ - allow --auto=yes to specify a number; if mdadm determines from the + device name that you want a partitionable array, this number determines + the number of sub-device nodes to create. + * Removed patch previously used to fix #385951 because it's not adequate. + See the bug log for reasons. + + -- martin f. krafft Fri, 13 Oct 2006 08:32:20 +0200 + +mdadm (2.5.3.git200608202239-8) unstable; urgency=low + + * This revision is dedicated to Peter Samuelson for his RAID10 expertise^W + educated guess^W^W pure luck. (: + * Now writes minimal mdadm.conf file even if the MD subsystem has not been + loaded and the scan for arrays thus failed. + * Now tries to ensure that the configuration file used for the initramfs + actually defines arrays. + * Now preserves MAILADDR from an existing mdadm.conf when generating a new + one. + * Documentation updates. + * Updated debconf translations: + - German by Mario Joußen. + - Vietnamese by Clytie Siddall (closes: #390311). + - Dutch by Frans Pop (closes: #390955). + - French by Jean-Luc Coulon (closes: #391215). + + -- martin f. krafft Fri, 6 Oct 2006 15:03:46 +0200 + +mdadm (2.5.3.git200608202239-7) unstable; urgency=medium + + * Fixed a serious bug in the debconf script which would cause the + configuration to exit prematurely in cases when the root could not be + determined. Since this is RC, the urgency is set to medium. + * Updated the documentation a bit. + * Updated debconf translations: + - Swedish by Daniel Nylander (closes: #389040). + - Czech by Miroslav Kure (closes: #389083). + - Russian by Yuri Kozlov (closes: #389086). + - Brazilian Portuguese by Felipe Augusto. + + -- martin f. krafft Fri, 29 Sep 2006 16:31:44 +0200 + +mdadm (2.5.3.git200608202239-6) unstable; urgency=high + + * Bumping urgency to high because previous version has been in unstable for + three days and this one really only fixes a stupid segfault: + * Reworked the parsing of /proc/partitions and spotted a mean segfault + (closes: #388355). 
+ + -- martin f. krafft Thu, 21 Sep 2006 15:25:21 +0200 + +mdadm (2.5.3.git200608202239-5) unstable; urgency=medium + + * Keeping medium urgency due to RC bug. + * Modified the patch responsible for pruning parent devices so that + superblocks at the end of a disk do not get interpreted twice. It now + makes less assumptions about the exact output of /proc/partitions and + should thus be more robust (now closes: #385951). + * Added code that defers mdadm preconfiguration when the debconf backend is + too old (and does not provide debconf-escape). Now configuration is + postponed until the postinst is run in this case. + + -- martin f. krafft Thu, 14 Sep 2006 11:16:39 +0200 + +mdadm (2.5.3.git200608202239-4) unstable; urgency=medium + + * Correct error related to an unbound variable in postinst. + * Keeping medium urgency. + + -- martin f. krafft Wed, 13 Sep 2006 20:49:33 +0200 + +mdadm (2.5.3.git200608202239-3) unstable; urgency=medium + + * Urgency medium because of RC bugs. + * Add versioned dependency to debconf (closes: #385994); temporary fix until + we find a proper fix for #386439. + * Add patch by Steinar H. Gunderson to ensure mdadm does not interpret + a superblock as belonging to a device when it's actually part of + a partition on that device (closes: #385951). + * Do not override the superblock default version in mdadm.conf to prevent + creation of superblocks that the kernel can't handle (closes: #384614). + * Added a note to alert people that the warning about arrays not listed in + the configuration file is only relevant if the arrays are needed to be + brought up by mdadm from initramfs during boot (closes: #385017). + * Added bootloader/cmdline info to bugscript so that future bug reports via + bug/reportbug include information on how the system is booted with respect + to RAID (the root partition). 
+ * If mdadm is being configured in a chroot, it now defaults to starting all + arrays from the initial ramdisk, rather than trying to figure out the root + MD array (we're using /proc information, so it would be the one of the + host, not the one of the chroot) (closes: #386468). + * Added LSB headers to init scripts. + * Reworked the documentation with respect to the use of "MD" and "RAID", and + added a FAQ entry on the meaning of "MD". Thanks to Frans Pop for his + help! + * Updated debconf translations: + - Czech by Miroslav Kure (closes: #384754). + - French by Florentin Duneau (closes: #385690). + - Russian by Yuri Kozlov (closes: #387017). + + -- martin f. krafft Thu, 7 Sep 2006 14:32:04 +0200 + +mdadm (2.5.3.git200608202239-2) unstable; urgency=low + + * Allow ARRAY lines in configuration file to break across lines + (closes: #384222). + * Improved initramfs hook; now does not rely on initramfs to provide RAID + assembly: + - if a checked mdadm.conf file is present, use that. + - if an unchecked mdadm.conf is present, create a temporary one + - if that fails, use the unchecked one iff it contains at least one + ARRAY statement. Otherwise rely on auto-generation from the initramfs + during the boot (and hope for the best). + - if no mdadm.conf is found, create one on the fly + - if that fails, hope that the auto-generation will work during boot + * Improved the messages printed by the initramfs script. + * Do not store the debconf answer for whether arrays not listed in the + configuration file should be used. + * Now asks again for the devices to start (preseeded with 'all') if the user + does not want to proceed with devices not listed in the configuration + file. + + -- martin f. krafft Wed, 30 Aug 2006 16:29:07 +0200 + +mdadm (2.5.3.git200608202239-1) unstable; urgency=low + + * Tracking upstream git releases. + - now the --run switch behaves as stated in the manpage. 
This properly + fixes #287415 + - new version-1 partitionable arrays are now named X instead of _dX (e.g. + 0 instead of _d0) for device names like md_dX. I actually think this is + a bug and hope upstream will use dX (e.g. d0) instead in a future + version. + - we specify --symlink=no and thus disable the new feature to create + /dev/mdX symlinks to /dev/md/X devices until the entire device node mess + is cleared up. No need to introduce yet another complicating factor at + this stage. + * Instead of trying to do a whole lot of magic with respect to detecting + RAID devices to start, mdadm from now on requires a valid mdadm.conf file + to be installed. It still tries to do what it can, but there are no + heuristics anymore. See /usr/share/doc/mdadm/README.upgrading-2.5.3 . + * Removed the zero-superblock warning because we require the user to sign + off the configuration file anyway. + * This also enables us to use mdadm.conf from the initial ramdisk and thus + closes: #381303. + * In case the user chooses to assemble all arrays from the initial ramdisk, + use the new homehost feature of mdadm (closes: #381057). This will start + only those arrays belonging to the local system, unless it is the first + run on a system, in which case it will start all arrays and mark them as + belonging to the local system. + * Improved the debconf control script: integrated error messages into the + frontends, and made it a bit smarter. Error messages can now be + translated, and the script checks whether the user's choice is listed in + the configuration file and only proceeds if it is, or the user chooses to + ignore that it is not. + * mkconf can now take a generate/force-generate parameter to write directly + to /etc/mdadm/mdadm.conf. A second parameter specifies an alternate + filename. + * mkconf now outputs comments for the settings it suggests. + * Removed all udev-related stuff. 
We must coexist with udev because there + are setups that assemble arrays without mdadm, so the device nodes must be + created by udev, if that is used. + (closes: #382263, #382450, #383688, #383891, #383806, #382480) + Staying at low urgency since these (RC) bugs only exist in unstable. + * Now installs MD modules and mdadm/mdrun into initial ramdisk regardless of + whether the hook script thinks there are devices to start. This was done + to enable recovery from the initramfs shell. + * Now uses 'MD' instead of 'RAID' consistently in all messages. + * Now rebuilds initramfs for all installed kernels. + * Now breaks the endless config loop only when the user does not see the + question (see #381284, #381007). + * Don't fail mdadm-raid when /dev is on a read-only filesystem + (closes: #382876). + * Updated debconf translations: + - French by Florentin Duneau (closes: #382389). + + -- martin f. krafft Mon, 21 Aug 2006 00:25:22 +0100 + +mdadm (2.5.2-10) unstable; urgency=low + + * Applied patch by upstream to fix the logic of the --run switch (see + #287415). Thus also reverted the mdadm-raid hack used to fix the bug in + the 2.5.2-9 upload. + * Recognise devfs-style device nodes by fixing a regression bug in the root + RAID autodetection code (closes: #381007), which was introduced as part of + the fix for #380596 in the 2.5.2-9 upload. The bug is RC, but it only + applies to unstable right now, so I am not pumping up the urgency. + The autodetection code now doesn't care about the actual name of the array + device, but instead only insists that it exists, is a block device, and + recognised as an array by mdadm (mdadm --detail). + * Added safety net to prevent endless loops in RAID autodetection. Now just + falls back to starting all arrays from the initramfs if it fails to + determine an acceptable array for the / filesystem in three tries + (closes: #381284, also see #381007). 
+ * Added udev rules file to prevent udev from ever creating md device nodes, + which can get in the way of mdadm (also see next item). + * Added a workaround to the initramfs hook to deal with the problem with + /dev/md/X device nodes when /dev/mdX is also present and version-0 + superblocks are in use (closes: #381181). + * checkarray: correctly recognise when the kernel is too old for parity + checks, or when there are no redundant arrays present (closes: #380746). + * checkarray: now supports -s|--status switch to query parity check status + for given devices. + * checkarray: now supports -x|--cancel switch to cancel running checks. + * mkconf: now also outputs 'MAILADDR root' and 'HOMEHOST '. + * Added README.checkarray with some information about the check process. + * Added /usr/share/doc/mdadm/FAQ to answer some FAQs. + + -- martin f. krafft Thu, 3 Aug 2006 22:54:04 +0100 + +mdadm (2.5.2-9) unstable; urgency=low + + * Added logcheck rules for kernel messages generated by checkarray, using + logcheck server level. + * Added handling of partitionable arrays to root RAID autodetection script + (closes: #380596). + * Forcing RAID assembly to run the arrays, working around an upstream bug + until that's fixed (closes: #287415). + * Updated documentation in README.initramfs-transition to include + information related to #380089. + * Updated debconf translations: + - Vietnamese by Clytie Syddall, thanks! + + -- martin f. krafft Mon, 31 Jul 2006 14:35:38 +0100 + +mdadm (2.5.2-8) unstable; urgency=low + + * Re-added rootraiddoc.97.html which was mysteriously lost (closes: #378678). + * Catching modprobe error in case of absence of the kernel modules, or + a non-modular kernel. Thanks to Holger Levsen. + * Copy raid456 kernel module into initramfs, if present (closes: #380152). + * checkarray: check for presence of active RAID arrays and give an + appropriate error if there are none present (closes: #379019). 
+ * checkarray: skip sync for non-redundant devices (closes: #379352, #380424). + * Fixing cron registration for checkarray. crontab(5) is really stupid and + makes me think that they simply documented a bug instead of fixing it, so + now I have to hack around it. See the cron.d file (closes: #380425). + * Removed the code writing auto-detected devices to /var, which was silly + since /var isn't necessarily mounted yet by the time mdadm-raid is called. + Thanks to Maurizio Avogadro for pointing this out. + * Add reference to BAARF to README.Debian and included the RAID5 vs RAID10 + article from the BAARF website. + * Updated debconf translations: + - Japanese by Hideki Yamane, thanks! + - French by Florentin Duneau, thanks! (closes: #379511) + + -- martin f. krafft Thu, 27 Jul 2006 22:49:32 +0100 + +mdadm (2.5.2-7) unstable; urgency=low + + * Release to unstable. + + -- martin f. krafft Thu, 20 Jul 2006 17:23:23 +0100 + +mdadm (2.5.2-6) experimental; urgency=low + + * Adding mdrun to generated udeb. I will only remove mdrun after etch. + * Updated debconf translations: + - Dutch, thanks to Frans "Franzerl" Pop! (closes: #377412) + - French, thanks to Florentin Duneau! (closes: #377968) + + -- martin f. krafft Thu, 13 Jul 2006 23:11:24 +0200 + +mdadm (2.5.2-5) UNRELEASED; urgency=low + + * Remove the check for the lvm prereq in the initramfs hook, as #369617 is + now fixed. Thus conflicts against initramfs-tools (<< 0.65). + * Updated debconf translations: + - German, thanks to Mario Joußen! + * Added short note about maintenance in SVN to README.Debian. + + -- martin f. krafft Thu, 13 Jul 2006 23:10:36 +0200 + +mdadm (2.5.2-4) UNRELEASED; urgency=low + + * The "it takes two to swing" release. + * Now does not stop arrays on upgrade or remove. Thanks (and sorry) to + Christian Pernegger (and hopefully no others). + * Fixed small problem in debconf configuration script related to unbound + MAIL_TO variable. Thanks to Christian Pernegger. + + -- martin f. 
krafft Fri, 7 Jul 2006 16:59:01 +0200 + +mdadm (2.5.2-3) UNRELEASED; urgency=low + + * The initramfs now gets all RAID modules installed. It's a lot safer to + have them all around, the size difference is negligible, and we still only + load the needed ones at boot time. + * Added /usr/share/mdadm/checkarray, which can be used to check arrays for + parity. Also added a debconf question to let the user choose whether + cron should run these checks (closes: #377071). + * Only shut down arrays automatically when they've been automatically + started (closes: #376009). + * Make sure the user has a chance to choose the autostart feature by + elevating the debconf priority to high (see #376009). The warning about + reuse of RAID components has also been elevated to debconf priority high. + * The MAIL_TO setting from /etc/default/mdadm has been removed. Instead, use + MAILADDR in /etc/mdadm/mdadm.conf. See mdadm.conf(5). Your setting should + be automatically migrated. + * Now rewrites /etc/default/mdadm (but preserves settings) instead of trying + to patch it with changes. + * Added note to README.Debian to ensure users know that only the devices + listed in mdadm.conf will be autostarted (see #376009). + * Now includes latest md.txt from kernel documentation in + /usr/share/doc/mdadm/md.txt.gz. + * Added some more recipes to /usr/share/doc/mdadm/README.recipes.gz. + * Updated debconf translations: + - Swedish, thanks to Daniel Nylander! + - Brazilian Portuguese, thanks to Felipe Augusto van de Wiel! + - Czech, thanks to Miroslav Kure! + - Russian, thanks to Yuri Kozlov (closes: #376181). + * Further updates to the debconf templates; I hope the translators aren't + going to kill me. + + -- martin f. krafft Fri, 7 Jul 2006 15:09:40 +0200 + +mdadm (2.5.2-2) experimental; urgency=low + + * The "if it weren't for Munich's wheat beer, there'd be no" release. 
+ * Removed -fno-strict-aliasing from compiler options, after upstream fixed + the bug that led to its use (see #369779, #356153). Thanks to Elimar + Riesebieter for pointing this out (closes: #375876). + * Moved detection of RAID devices from initramfs hook to debconf control + file, and added a (low-priority) debconf question as to which devices + should be started early in the boot sequence. For the cases where we + failed to auto-detect previously (e.g. root on LVM on RAID), it's paranoid + and suggests to start them all (closes: #375879). Thanks to Alec Berryman + for spotting this. + * Fixed a typo in README.experimental, which could lead to an unbootable + system with initramfs-tools 0.64 or before. Again, thanks to Alec for + spotting this. + * Extended bug script to include --examine output for all components (at + least if called by root, which hopefully should never happen. Err, + wait...) + * Disabled deprecation warning in mdrun until the transition is complete. + * Reworded the debconf templates due to a new question, and also for + readability. + + -- martin f. krafft Thu, 29 Jun 2006 22:54:47 +0200 + +mdadm (2.5.2-1) experimental; urgency=low + + * New upstream release. + * Implemented checks in the initramfs hooks and scripts for compatibility + with initramfs-tools. Now we do not need a conflict anymore because + mdadm's hooks and scripts will simply do nothing while the ones provided + by initramfs-tools are still present. + * Not using /bin/bash for mdrun, which I thought we'd need for read timeout + support (for the deprecation warning). Since the -n and -t flags to the + read shell builtin are non-POSIX, I dropped them, they were merely + cosmetic anyway. + + -- martin f. krafft Tue, 27 Jun 2006 15:06:55 +0200 + +mdadm (2.5.1-2) experimental; urgency=low + + * Updating dependency on initramfs-tools, which has not yet adapted to mdadm + taking over the hooks. + + -- martin f. 
krafft Mon, 26 Jun 2006 22:35:08 +0200 + +mdadm (2.5.1-1) experimental; urgency=low + + * New upstream release: + - Really fixes return status of examine (closes: #367901). + - Fixes a memory leak in monitor mode (closes: #372618). + - Fixes compiler warnings and errors (closes: #373802, #356153, #369779). + - Fix byte swapping issues (closes: #369765). + - Now lists devices it stops (closes: #369850). This also leads to + beautification of the init.d script's stop action. + * Fixed RAID init script to not complain about missing logger command. + + -- martin f. krafft Mon, 26 Jun 2006 00:58:36 +0200 + +mdadm (2.5-4) experimental; urgency=low + + * The "would you like fries with your parasite?" release. + * Now does not require RAID support from the kernel just for package + installation; that was silly of me, sorry (closes: Bug#370115). + * Added version to Replaces: initramfs-tools dependency. + * Further init.d script improvements. + * Recommends mail-transport-agent, or the monitor daemon won't be able to + send anything. + * Ignores failures from modprobe in postinst when RAID modules are not + available (closes: #370582). + + -- martin f. krafft Tue, 6 Jun 2006 12:45:53 +0200 + +mdadm (2.5-3) experimental; urgency=low + + * Added /usr/share/doc/mdadm/README.recipes with some common usage examples. + * Vastly improved the mdadm-raid init.d script output, and removed bashisms. + + -- martin f. krafft Fri, 2 Jun 2006 00:45:06 +0200 + +mdadm (2.5-2) experimental; urgency=low + + * The "on her majesty's secret service" release. + * Enabled -DDEBIAN during build, which will take care of default permissions + on devices created by mdadm. Together with the CREATE configuration + directive in 2.5, this now certainly closes: Bug#310241. + * Added a patch (incorporating lib/mm/xlate.h from lvm2) to prevent direct + access to kernel headers from userspace (closes: Bug#369765). 
+ * Disabled strict aliasing compiler checks until we find a better + implementation for linked lists in C (closes: Bug#369779, Bug#356153). + * Actually decreased the size of the udeb mdadm binary with -Os + -fomit-frame-pointer (as suggested by Joey Hess) (closes: Bug#314370) + * Added Replaces: initramfs-tools to communicate that we're not conflicting + but replacing instead (see Bug#367567) + * Updated conflict with initramfs-tools to (<< 0.63) per suggestion by the + maintainers. + + -- martin f. krafft Thu, 1 Jun 2006 20:15:17 +0200 + +mdadm (2.5-1) experimental; urgency=low + + * The "show me the way to the next whiskey bar" release. + * See /usr/share/doc/mdadm/README.experimental or + http://madduck.net/~madduck/scratch/README.experimental + * New upstream release: + - mails include /proc/mdstat output (closes: Bug#355882) + - allows specification of device permissions in config (closes: Bug#310241) + * /sbin/mdrun has been deprecated and replaced by calls to /sbin/mdadm; + a proper deprecation warning is in place (see NEWS). + * Moved initramfs hook and script into the package, and switched it to mdadm + (from mdrun). Thanks to Stephen Frost for his help (closes: Bug#354144). + This should make sure that the right minor numbers are chosen during boot + (mainly because mdadm takes care of it all) (closes: Bug#361408). + * Removing mdrun from the udeb (d-i patch submitted to debian-boot mailing + list) + * Upstream links against openssl for SHA1 support (homehost feature), which + is a problem. An internal SHA1 implementation is provided, however, so + I just link against that. + * Switched init.d scripts to use LSB-compliant output. + * Enhanced init.d script output. + + -- martin f. krafft Thu, 1 Jun 2006 02:20:22 +0200 + +mdadm (2.4.1-2) unstable; urgency=low + + * The "this took way longer than I thought" release. + * Migrating to unstable. + * If the init.d script creates the mdadm.conf file, it should remove it on + purge. 
To accomplish this, I create a semaphore in /var/lib/mdadm if it + was generated, and only remove the conffile on purge if the semaphore + exists. + * Added a little helper /usr/share/mdadm/mkconf to aid generation of + configuration file. + * Added a bug script to collect some important information when the user + uses Debian bug reporting tools (such as reportbug). + * Added a debian/watch file. + + -- martin f. krafft Wed, 31 May 2006 23:07:48 +0200 + +mdadm (2.4.1-1) experimental; urgency=low + + * The "I'll kill that maintainer... uh, wait, it's me" release. Sorry for + the delay, here's the long awaited new upstream release (closes: + Bug#337903, Bug#363592), which gets rid of a bunch of functionality bugs: + - reiserfs size does not overflow anymore (closes: Bug#318230) + - fixed typos in manpages (closes: Bug#352798) + Oh, and we're moving away from that arch nightmare too. Sorry for the + confusion. + * Experimental release, because I really don't want to be responsible for + data loss. Though I am quite sure that the upgrade is painless, I also + don't have access to 18 drive RAID 10 with multipath on s390 or similar + arrangements. + * We now make the /dev/md* devices in postinst unless /dev/md15 exists (no + longer checking for /dev/md0), or unless devfs is in use. If udev is used, + /dev/md15 will only exist in complex setups, so the devices will be made + in /dev/.static by MAKEDEV, which is not really a concern. I opted against + unconditionally calling MAKEDEV until #367407 is fixed so as to preserve + custom permissions or owner settings. This also acknowledges the NMU + (#299623). + closes: Bug#310247, Bug#299623 + * Patched some of the code to make mdadm honour /etc/mdadm/mdadm.conf over + /etc/mdadm.conf (see NEWS). + * Fixed a couple of typos in the mdadm(8) manpage; thanks to Reuben Thomas. + closes: Bug#345669, Bug#345667 + * Pushed Standards-Version to 3.7.2; no changes required. 
+ * Updated Debconf translations: + - Vietnamese by Clytie Siddall (closes: Bug#323950) + - Czech by Miroslav Kure (closes: Bug#360290) + - Russian by Yuri Kozlov (closes: Bug#361116) + - French by Eric Madesclair (closes: Bug#323988) + * Added new Debconf translations: + - Swedish by Daniel Nylander (closes: Bug#333486) + - Dutch by Frans Pop (closes: Bug#344714) + + -- martin f. krafft Tue, 16 May 2006 18:21:36 -0500 + +mdadm (1.12.0-1) unstable; urgency=low + + * New upstream release. + (obsoletes branches: symlinks) + (reduces branches: gcc4signedness, debian, autoscan) + * Fixed typo in mdadm.conf(5) manpage (closes: Bug#321152). + + -- martin f. krafft Sun, 24 Jul 2005 19:20:01 +0200 + +mdadm (1.9.0-5) unstable; urgency=low + + * martin f. krafft: (the, "look ma', we're maintained in arch now!" release) + (no functional differences except for added/updated translations) + - Acknowledge NMU by Steve Langasek; thanks! (closes: Bug#299623) + - split diff.gz into different arch branches (see debian/arch-branches). + - debian/control: + - Changed maintainer to pkg-mdadm-devel. + - Reworded some of the descriptions (closes: Bug#304170). + - Pushed Standards-Version to 3.6.2.1; no changes needed. + - fixed po-debconf integration + - debian/rules: + - fixed po-debconf integration + - l10n changes: + - Removed ambiguity from debconf template (closes: Bug#312754). + - Added Vietnamese debconf translation; thanks to Clytie Siddall! + (closes: Bug#312753) + - Added Czech debconf translation; thanks to Miroslav Kure! (closes: Bug#319626) + - Updated German debconf translation; thanks to Jens Seidel! (closes: Bug#313981) + - backported upstream's gcc4 signedness fixes from 1.12.0 (gcc4signedness + branch) (closes: Bug#319743). + + -- martin f. krafft Sun, 24 Jul 2005 17:58:46 +0200 + +mdadm (1.9.0-4.1) unstable; urgency=high + + * Non-maintainer upload. 
+ * High-urgency upload for sarge-targetted RC bugfix + * Make sure error output from MAKEDEV is sent to stderr, to avoid + interfering with debconf; this avoids installation problems on + udev-using systems. Thanks to Jonas Smedegaard for the patch. + Closes: #299623. + + -- Steve Langasek Wed, 1 Jun 2005 03:36:42 -0700 + +mdadm (1.9.0-4) unstable; urgency=high + + * High-urgency upload for sarge targeted RC bugfix. + * mdrun: replaced invocation of /usr/bin/seq with hard-coded sequence + (closes: Bug#310671). + + -- martin f. krafft Wed, 25 May 2005 09:51:41 +0200 + +mdadm (1.9.0-3) unstable; urgency=high + + * High-urgency upload for sarge targeted RC bugfix. + * Applied patch by Peter Samuelson , which causes mdadm to + follow symlinks of device nodes (closes: #274859, #310412, #310492). + * Added myself as co-maintainer as per agreement with Mario Joussen. + + -- martin f. krafft Tue, 24 May 2005 00:03:49 +0200 + +mdadm (1.9.0-2.3) unstable; urgency=high + + * Non-maintainer upload. + * Do not prevent postinst node creation when udev is being used; MAKEDEV + puts files into /dev/.static/dev with udev, which is needed so that device + nodes will be there even if udev is removed. Sorry for letting this slip + my mind and thanks to Steve Langasek for spotting this error. + * Leaving urgency at high to make sarge. + + -- martin f. krafft Sun, 22 May 2005 19:35:04 +0200 + +mdadm (1.9.0-2.2) unstable; urgency=high + + * Non-maintainer upload. + * High-urgency upload for sarge targeted RC bugfix. + * Move mdadm-raid back to S25 as it needs to run after modules have been + loaded at S20 (see followups to #294404, #301560). + * Verified that Steve Langasek's patch to config.c (see item 4 of the + 1.9.0-2.1 changelog) is necessary for `mdadm -A -s` to work. + (closes: #301560) + * Integrated patch by Erik van Konijnenburg to fix mdadm's --auto + option in the presence of --scan. + (closes: #294404, #273182, #284028, #310126). 
+ * Modified mdrun to call mdadm with --auto in assembly mode. Removed code + which would auto-create 24 device nodes during system startup when udev + was used. + * Fixed next_free_md function in mdrun to iterate all 24 nodes instead of + using some fragile shell globbing, which did not work anyway. + * Prevent postinst node creation when udev is being used. + * Added a README.udev file to /usr/share/doc/mdadm. + + -- martin f. krafft Sun, 22 May 2005 12:57:56 +0200 + +mdadm (1.9.0-2.1) unstable; urgency=high + + * Non-maintainer upload. + * High-urgency upload for sarge targetted RC bugfix. + * Start mdadm-raid before udev on boot-up, so that mdadm device node + creation is honored, and support changing the init script ordering + on upgrades (closes: #294404). + * Fix mdadm --scan to prefer the values contained in /proc/partitions, + instead of picking up device node names at random from /dev. + * Teach mdrun to look at /dev/.static/dev instead of /.dev for udev + mounts requiring autostart (closes: #301560). + + -- Steve Langasek Sun, 27 Mar 2005 21:59:12 -0800 + +mdadm (1.9.0-2) unstable; urgency=low + + * Patched is_standard() to accept /dev/md/* names as standard. + Thanks to Colin Watson . + (closes: Bug#296794) + * Added another typecast to make it compilable on amd64 with gcc-4.0. + Thanks to Andreas Jochens . + (closes: Bug#294217) + * Removed unnecessary second assignment to $BASE in mdrun. + Thanks to Colin Watson . + (closes: Bug#295433) + + -- Mario Joussen Sun, 6 Mar 2005 14:22:24 +0100 + +mdadm (1.9.0-1) unstable; urgency=high + + * New upstream release. + Solves problems with same UUID for each array. + Again a stable upstream version. + (closes: Bug#292282, Bug#293406, Bug#292784, Bug#290363, Bug#292715) + * Added some typecasts to make it compilable on amd64 with gcc-4.0. + Thanks to Andreas Jochens . + (closes: Bug#287638) + + -- Mario Joussen Sun, 6 Feb 2005 12:25:03 +0100 + +mdadm (1.8.1-1) unstable; urgency=low + + * New upstream release. 
+ Fixed segfault if no config file present and --scan is used. + (closes: Bug#283425, Bug#282604, Bug#284024) + * Fixed typo in detailed help of grow mode. + (closes: Bug#286980) + * Added japanese debconf translation. Thanks to Hideki Yamane + . + (closes: Bug#281073) + * Fixed missing variable initialization causing segfaults. + + -- Mario Joussen Sun, 26 Dec 2004 14:44:31 +0100 + +mdadm (1.7.0-2) unstable; urgency=high + + * Changed debconf script to save the settings from the config file. + Thanks to Fabio Massimo Di Nitto and + Frank Lichtenheld for the patch. + (closes: Bug#274208) + * Moved try to load md module inside the AUTOSTART if branch in + /etc/init.d/mdadm. + * Removed try to load md module from /etc/init.d/mdadm-raid. + + -- Mario Joussen Sun, 24 Oct 2004 19:48:06 +0200 + +mdadm (1.7.0-1) unstable; urgency=low + + * New upstream release. + (closes: Bug#267814) + + -- Mario Joussen Sun, 12 Sep 2004 20:48:33 +0200 + +mdadm (1.6.0-3) unstable; urgency=high + + * Added 'Conflicts: raidtools2 (<< 1.00.3-12.1)' because these packages + contain a mdrun.8 man page also. + (closes: Bug#268634, Bug#266527) + * Updated the french translation. + Thanks to Eric + (closes: Bug#266251) + + -- Mario Joussen Sat, 28 Aug 2004 18:23:17 +0200 + +mdadm (1.6.0-2) unstable; urgency=low + + * Included version 0.97 of "Debian Software Root Raid Documentation". + * Now mdrun is only used if no mdadm.conf is present. + Thanks to Thomas Prokosch <7nrmi1s02@sneakemail.com>. + (closes: Bug#264059) + * Added man page for mdrun. + Thanks to Robert Collins . + (closes: Bug#265480) + * Moved /etc/mdadm/debian.conf to /etc/default/mdadm. + (closes: Bug#254922) + * Added a little workaround to mdrun to interact better with udev. + Thanks to Fabio Massimo Di Nitto . + (closes: Bug#259491) + * Updated Brazilian Portuguese translation. + Thanks to Andre Luis Lopes . 
+ (closes: Bug#264220) + + -- Mario Joussen Mon, 16 Aug 2004 22:10:59 +0200 + +mdadm (1.6.0-1) unstable; urgency=low + + * New upstream release. + Detect degraded arrays in --monitor mode now. + (closes: Bug#257357) + * Changed default to autostart RAID array. + (closes: Bug#250792) + * Fixed mdrun problem with kernel 2.6. + Thanks to Andre Tomt and Fabio Massimo Di Nitto + + (closes: Bug#231823) + * Changed reuse warning to be less misleading. + (closes: Bug#253339) + + -- Mario Joussen Tue, 20 Jul 2004 21:40:33 +0200 + +mdadm (1.5.0-2) unstable; urgency=low + + * Added french debconf template. + Thanks to Eric Madesclair . + (closes: Bug#231968) + + -- Mario Joussen Tue, 4 May 2004 21:29:19 +0200 + +mdadm (1.5.0-1) unstable; urgency=low + + * New upstream release. + * Rewrote debconf templates to avoid referring to debconf interface + widgets. + (closes: Bug#231221) + * Removed manual scan for RAID devices from init script. + (closes: Bug#233122, Bug#236762) + * Added creation of an udeb package. + (closes: Bug#243609) + * Added "Debian Software Root Raid Documentation". + Thanks to Lucas Albers . + + -- Mario Joussen Sun, 25 Apr 2004 16:16:06 +0200 + +mdadm (1.4.0-3) unstable; urgency=low + + * Updated to standards version 3.6.1.0 and debhelper 4. + * Corrected definition of BLKGETSIZE64 macro to compile with the + 2.6 kernel headers. + (closes: Bug#223191) + * Swichted to po-debconf to provide localized debconf templates. + (closes: Bug#225288) + + -- Mario Joussen Sun, 1 Feb 2004 19:30:53 +0100 + +mdadm (1.4.0-2) unstable; urgency=low + + * Corrected human readable size calculation. + (closes: Bug#225041) + * Added a warning about reusing hard disks and using the autostart + feature. + (closes: Bug#223790) + + -- Mario Joussen Thu, 25 Dec 2003 19:52:57 +0100 + +mdadm (1.4.0-1) unstable; urgency=low + + * New upstream release. + + -- Mario Joussen Sun, 7 Dec 2003 19:39:27 +0100 + +mdadm (1.3.0-2) unstable; urgency=low + + * Added upstream changelog to package. 
+ + -- Mario Joussen Tue, 12 Aug 2003 21:51:59 +0200 + +mdadm (1.3.0-1) unstable; urgency=low + + * New upstream release. + (closes: Bug#191561, Bug#200921) + + -- Mario Joussen Thu, 31 Jul 2003 20:59:20 +0200 + +mdadm (1.2.0-1) unstable; urgency=low + + * New upstream release. (closes: Bug#183191) + * New version of mdrun that works properly with devfs and temporary + device directory. + (closes: Bug#182035) + * Added 'Conflicts: raidtools' because of a name clash with mdrun. + (closes: Bug#182960) + + -- Mario Joussen Sun, 16 Mar 2003 13:32:45 +0100 + +mdadm (1.0.1-4) unstable; urgency=low + + * Changed mdrun so that it can deal with partition statistics in + /proc/partitions. + (closes: Bug#174000, Bug#175130) + * Added russian (ru) debconf template translation. Thanks to Sergey + Spiridonov . + + -- Mario Joussen Sun, 5 Jan 2003 13:14:45 +0100 + +mdadm (1.0.1-3) unstable; urgency=low + + * Fixed a bug in mdrun. (closes: Bug#167607) + + -- Mario Joussen Mon, 11 Nov 2002 07:53:23 +0100 + +mdadm (1.0.1-2) unstable; urgency=low + + * Fixed typo in help option. (closes: Bug#151533) + * Added a script that discovers and assembles all arrays automatically. + Thanks to Eduard Bloch . + (closes: Bug#161699) + + -- Mario Joussen Fri, 1 Nov 2002 13:46:47 +0100 + +mdadm (1.0.1-1) unstable; urgency=low + + * New upstream release. + + -- Mario Joussen Thu, 30 May 2002 14:01:22 +0200 + +mdadm (0.8.2-1) unstable; urgency=low + + * New upstream release. + * Splitted up mdadm.templates in one file for each language. + * Added brazilian portuguese (pt_BR) debconf template translation. + Thanks to Andre Luis Lopes . + (closes: Bug#141540) + + -- Mario Joussen Thu, 18 Apr 2002 19:31:00 +0200 + +mdadm (0.7.2-1) unstable; urgency=low + + * New upstream release. + * 'mdctl' was renamed to 'mdadm' upstream. + * Removed question about updating mdctl init script links. 
+ + -- Mario Joussen Sat, 23 Mar 2002 02:50:51 +0100 + +mdctl (0.5-4) unstable; urgency=low + + * Added debconf template to ask the user if the init script links + should be updated. + + -- Mario Joussen Mon, 4 Mar 2002 22:53:37 +0100 + +mdctl (0.5-3) unstable; urgency=low + + * Splitted up init script in two parts. One starts the md array and the + other starts the raid monitor daemon. + (closes: Bug#136184) + + -- Mario Joussen Thu, 28 Feb 2002 22:45:57 +0100 + +mdctl (0.5-2) unstable; urgency=low + + * Included optimization in Makefile + (closes: Bug#127687) + * Removed Conflicts/Replaces/Provides: mdutils + (closes: Bug#127684, Bug#127719) + * Added an init script, which can start md arrays and the raid monitor + daemon + * MD devices are now created under /dev if necessary + * Added a sample configuration file + + -- Mario Joussen Sun, 13 Jan 2002 23:43:40 +0100 + +mdctl (0.5-1) unstable; urgency=low + + * Initial Release. + (closes: Bug#126610) + + -- Mario Joussen Wed, 26 Dec 2001 17:07:09 +0100 diff --git a/debian/checkarray b/debian/checkarray new file mode 100644 index 00000000..1fb97356 --- /dev/null +++ b/debian/checkarray @@ -0,0 +1,219 @@ +#!/bin/sh +# +# checkarray -- initiates a check run of an MD array's redundancy information. +# +# Copyright © martin f. krafft +# distributed under the terms of the Artistic Licence 2.0 +# +set -eu + +PROGNAME=${0##*/} + +about() +{ + echo "\ +$PROGNAME -- MD array (RAID) redundancy checker tool +Copyright © martin f. krafft +Released under the terms of the Artistic Licence 2.0" +} + +usage() +{ + about + echo " +Usage: $PROGNAME [options] [arrays] + +Valid options are: + -a|--all check all assembled arrays (ignores arrays in command line). + -s|--status print redundancy check status of devices. + -x|--cancel queue a request to cancel a running redundancy check. 
+ -i|--idle perform check in a lowest scheduling class (idle) + -l|--slow perform check in a lower-than-standard scheduling class + -f|--fast perform check in higher-than-standard scheduling class + --realtime perform check in real-time scheduling class (DANGEROUS!) + -c|--cron honour AUTOCHECK setting in /etc/default/mdadm. + -q|--quiet suppress informational messages + (use twice to suppress error messages too). + -h|--help show this output. + -V|--version show version information. + +Examples: + $PROGNAME --all --idle + $PROGNAME --quiet /dev/md[123] + $PROGNAME -sa + $PROGNAME -x --all + +Devices can be specified in almost any format. The following are equivalent: + /dev/md0, md0, /dev/md/0, /sys/block/md0 + +You can also control the status of a check with /proc/mdstat file." +} + +SHORTOPTS=achVqQsxilf +LONGOPTS=all,cron,help,version,quiet,real-quiet,status,cancel,idle,slow,fast,realtime + +eval set -- $(getopt -o $SHORTOPTS -l $LONGOPTS -n $PROGNAME -- "$@") + +arrays='' +cron=0 +all=0 +quiet=0 +status=0 +action=check +ionice= + +for opt in $@; do + case "$opt" in + -a|--all) all=1;; + -s|--status) action=status;; + -x|--cancel) action=idle;; + -i|--idle) ionice=idle;; + -l|--slow) ionice=low;; + -f|--fast) ionice=high;; + --realtime) ionice=realtime;; + -c|--cron) cron=1;; + -q|--quiet) quiet=$(($quiet+1));; + -Q|--real-quiet) quiet=$(($quiet+2));; # for compatibility + -h|--help) usage; exit 0;; + -V|--version) about; exit 0;; + /dev/md/*|md/*) arrays="${arrays:+$arrays }md${opt#*md/}";; + /dev/md*|md*) arrays="${arrays:+$arrays }${opt#/dev/}";; + /sys/block/md*) arrays="${arrays:+$arrays }${opt#/sys/block/}";; + --) :;; + *) echo "$PROGNAME: E: invalid option: $opt. Try --help." >&2; exit 1;; + esac +done + +is_true() +{ + case "${1:-}" in + [Yy]es|[Yy]|1|[Tt]rue|[Tt]) return 0;; + *) return 1; + esac +} + +DEBIANCONFIG=/etc/default/mdadm +[ -r $DEBIANCONFIG ] && . $DEBIANCONFIG +if [ $cron = 1 ] && ! 
is_true ${AUTOCHECK:-false}; then + [ $quiet -lt 1 ] && echo "$PROGNAME: I: disabled in $DEBIANCONFIG ." >&2 + exit 0 +fi + +if [ ! -f /proc/mdstat ]; then + [ $quiet -lt 2 ] && echo "$PROGNAME: E: MD subsystem not loaded, or /proc unavailable." >&2 + exit 2 +fi + +if [ ! -d /sys/block ]; then + [ $quiet -lt 2 ] && echo "$PROGNAME: E: /sys filesystem not available." >&2 + exit 7 +fi + +if [ -z "$(ls /sys/block/md* 2>/dev/null)" ]; then + if [ $quiet -lt 2 ] && [ $cron != 1 ]; then + echo "$PROGNAME: W: no active MD arrays found." >&2 + echo "$PROGNAME: W: (maybe uninstall the mdadm package?)" >&2 + fi + exit 0 +fi + +if [ -z "$(ls /sys/block/md*/md/level 2>/dev/null)" ]; then + [ $quiet -lt 2 ] && echo "$PROGNAME: E: kernel too old, no support for redundancy checks." >&2 + exit 6 +fi + +if ! egrep -q '^raid([1456]|10)$' /sys/block/md*/md/level 2>/dev/null; then + [ $quiet -lt 1 ] && echo "$PROGNAME: I: no redundant arrays present; skipping checks..." >&2 + exit 0 +fi + +if [ -z "$(ls /sys/block/md*/md/sync_action 2>/dev/null)" ]; then + [ $quiet -lt 2 ] && echo "$PROGNAME: E: no kernel support for redundancy checks." >&2 + exit 3 +fi + +[ $all = 1 ] && arrays="$(ls -d1 /sys/block/md* | cut -d/ -f4)" + +for array in $arrays; do + MDBASE=/sys/block/$array/md + + if [ ! -e $MDBASE/sync_action ]; then + [ $quiet -lt 1 ] && echo "$PROGNAME: I: skipping non-redundant array $array." >&2 + continue + fi + + read cur_status < $MDBASE/sync_action + + if [ $action = status ]; then + echo "$array: $cur_status" + continue + fi + + if [ ! -w $MDBASE/sync_action ]; then + [ $quiet -lt 2 ] && echo "$PROGNAME: E: $MDBASE/sync_action not writeable." >&2 + exit 4 + fi + + if [ "$(cat $MDBASE/array_state)" = 'read-auto' ]; then + [ $quiet -lt 1 ] && echo "$PROGNAME: W: array $array in auto-read-only state, skipping..." 
>&2 + continue + fi + + case "$action" in + idle) + echo $action > $MDBASE/sync_action + [ $quiet -lt 1 ] && echo "$PROGNAME: I: cancel request queued for array $array." >&2 + ;; + + check) + if [ "$cur_status" != idle ]; then + [ $quiet -lt 2 ] && echo "$PROGNAME: W: array $array not idle, skipping..." >&2 + continue + fi + + # check if the array created recently and skip test if it is + created=$(mdadm --detail /dev/$array 2>/dev/null | + sed -n 's/.*Creation Time *://p' ) + if [ -n "$created" ]; then + created=$(date +%s -d "$created" 2>/dev/null) + fi + if [ -n "$created" ]; then + now=$(date +%s) + if [ "$created" -lt "$now" -a \ + "$created" -gt "$(($now - 14 * 24 * 60 * 60))" ]; then + [ $quiet -lt 2 ] && echo "$PROGNAME: I: array $array created recently, skipping..." >&2 + continue + fi + fi + + # queue request for the array. The kernel will make sure that these requests + # are properly queued so as to not kill one of the array. + echo $action > $MDBASE/sync_action + [ $quiet -lt 1 ] && echo "$PROGNAME: I: check queued for array $array." >&2 + + case "$ionice" in + idle) ioarg='-c3'; renice=15;; + low) ioarg='-c2 -n7'; renice=5;; + high) ioarg='-c2 -n0'; renice=0;; + realtime) ioarg='-c1 -n4'; renice=-5;; + *) continue;; # no I/O class requested: nothing to re-prioritise, go on to the next array (break would abandon the remaining arrays) + esac + + resync_pid= wait=5 + while [ $wait -gt 0 ]; do + wait=$((wait - 1)) + resync_pid=$(ps -ef | awk -v dev=$array 'BEGIN { pattern = "^\\[" dev "_resync]$" } $8 ~ pattern { print $2 }') + if [ -n "$resync_pid" ]; then + [ $quiet -lt 1 ] && echo "$PROGNAME: I: selecting $ionice I/O scheduling class and $renice niceness for resync of $array." 
>&2 + ionice -p "$resync_pid" $ioarg 2>/dev/null || : + renice -n $renice -p "$resync_pid" 1>/dev/null 2>&1 || : + break + fi + sleep 1 + done + ;; + esac + +done + +exit 0 diff --git a/debian/compat b/debian/compat new file mode 100644 index 00000000..1e8b3149 --- /dev/null +++ b/debian/compat @@ -0,0 +1 @@ +6 diff --git a/debian/control b/debian/control new file mode 100644 index 00000000..96491c6d --- /dev/null +++ b/debian/control @@ -0,0 +1,32 @@ +Source: mdadm +Section: admin +Priority: optional +Maintainer: Debian mdadm maintainers +Uploaders: Dimitri John Ledkov +Build-Depends: debhelper (>= 6.0.7~), po-debconf, groff-base +Standards-Version: 3.9.7 +Vcs-Git: git://anonscm.debian.org/pkg-mdadm/mdadm.git +Vcs-Browser: https://anonscm.debian.org/cgit/pkg-mdadm/mdadm.git +Homepage: http://neil.brown.name/blog/mdadm + +Package: mdadm +Architecture: linux-any +Depends: ${shlibs:Depends}, udev, ${misc:Depends}, lsb-base, debconf +Recommends: default-mta | mail-transport-agent, kmod | module-init-tools +Description: tool to administer Linux MD arrays (software RAID) + The mdadm utility can be used to create, manage, and monitor MD + (multi-disk) arrays for software RAID or multipath I/O. + . + This package automatically configures mdadm to assemble arrays during the + system startup process. If not needed, this functionality can be disabled. + +Package: mdadm-udeb +Section: debian-installer +XC-Package-Type: udeb +Architecture: linux-any +Depends: ${shlibs:Depends} +Description: tool to administer Linux MD arrays (software RAID) + The mdadm utility can be used to create, manage, and monitor MD + (multi-disk) arrays for software RAID or multipath I/O. + . + This is a minimal package used by the debian-installer. diff --git a/debian/copyright b/debian/copyright new file mode 100644 index 00000000..39f1435d --- /dev/null +++ b/debian/copyright @@ -0,0 +1,21 @@ +This package was debianized by Mario Jou/3en on +Wed, 26 Dec 2001 17:07:09 +0100. +Martin F. 
Krafft took over on +Tue, 16 May 2006 13:21:06 -0500 + +The mdadm source was downloaded from + http://www.kernel.org/pub/linux/utils/raid/mdadm/ + +Upstream Author: Neil Brown + +Copyright © 2001-2006 Neil Brown +Packaging copyright © 2001-2005 Mario Jou/3en +Packaging copyright © 2005-2008 Martin F. Krafft + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +On Debian GNU/Linux systems, the complete text of the GNU General +Public License can be found in '/usr/share/common-licenses/GPL-2'. diff --git a/debian/initramfs/hook b/debian/initramfs/hook new file mode 100644 index 00000000..ae7d4d18 --- /dev/null +++ b/debian/initramfs/hook @@ -0,0 +1,110 @@ +#!/bin/sh +# +# Copyright © 2006-2008 Martin F. Krafft , +# 2012 Michael Tokarev +# based on the scripts in the initramfs-tools package. +# released under the terms of the Artistic Licence. +# +set -eu + +PREREQ="udev" +prereqs() +{ + echo "$PREREQ" +} + +case "${1:-}" in + prereqs) + prereqs + exit 0 + ;; +esac + +is_true() +{ + case "${1:-}" in + [Yy]es|[Yy]|1|[Tt]rue|[Tt]) return 0;; + *) return 1;; + esac +} + +write() +{ + local PREFIX; PREFIX=$1; shift + echo "${PREFIX}: mdadm: $@" >&2 +} + +info() +{ + is_true ${VERBOSE:-false} && write I "$@" || : +} + +warn() +{ + write W "$@" +} + +. 
/usr/share/initramfs-tools/hook-functions + +# copy the binary as early as possible +copy_exec /sbin/mdadm /sbin +copy_exec /sbin/mdmon /sbin + +# Copy udev rules, which udev no longer does +for UDEV_RULE in 63-md-raid-arrays.rules 64-md-raid-assembly.rules; do +for rules_folder in /lib/udev/rules.d /etc/udev/rules.d; do + if [ -f $rules_folder/$UDEV_RULE ]; then + mkdir -p $DESTDIR$rules_folder + cp $rules_folder/$UDEV_RULE $DESTDIR$rules_folder/$UDEV_RULE + fi +done +done + +# load raid modules in the initramfs +for module in linear multipath raid0 raid1 raid456 raid5 raid6 raid10; do + force_load $module +done + +# copy the mdadm configuration +CONFIG=/etc/mdadm/mdadm.conf +ALTCONFIG=/etc/mdadm.conf +DESTMDADMCONF=$DESTDIR/etc/mdadm/mdadm.conf +[ ! -f $CONFIG ] && [ -f $ALTCONFIG ] && CONFIG=$ALTCONFIG || : +mkdir -p ${DESTDIR}/etc/mdadm + +if [ ! -f $CONFIG ]; then + # there is no configuration file, so let's create one + if /usr/share/mdadm/mkconf generate $CONFIG; then + # all is well + cp -p $CONFIG $DESTMDADMCONF + info "auto-generated the mdadm.conf configuration file." + else + # we failed to auto-generate, so let the emergency procedure take over + warn "failed to auto-generate the mdadm.conf file." + warn "please read /usr/share/doc/mdadm/README.upgrading-2.5.3.gz ." + fi +else + cp -p $CONFIG ${DESTDIR}/etc/mdadm + sed -i '/^CREATE/s/^/#/' $DESTMDADMCONF + if ! grep -q '^ARRAY' $CONFIG; then + tmpfile="${DESTMDADMCONF}.tmp" + warn "$CONFIG defines no arrays." + if /usr/share/mdadm/mkconf > $tmpfile; then + cp -p $tmpfile $DESTMDADMCONF + else + warn "failed to auto-generate temporary mdadm.conf file." + fi + else + # make sure the configuration file knows about all running devices + /sbin/mdadm --detail --scan | while read array device params; do + uuid=${params#*UUID=}; uuid=${uuid%% *} + if ! grep -q "UUID=$uuid" $DESTMDADMCONF; then + warn "the array $device with UUID $uuid" + warn "is currently active, but it is not listed in mdadm.conf. 
if" + warn "it is needed for boot, then YOUR SYSTEM IS NOW UNBOOTABLE!" + warn "please inspect the output of /usr/share/mdadm/mkconf, compare" + warn "it to $CONFIG, and make the necessary changes." + fi + done + fi +fi diff --git a/debian/initramfs/script.local-block b/debian/initramfs/script.local-block new file mode 100644 index 00000000..e9b47c3d --- /dev/null +++ b/debian/initramfs/script.local-block @@ -0,0 +1,44 @@ +#!/bin/sh + +PREREQ="multipath" + +prereqs() +{ + echo "$PREREQ" +} + +case $1 in +# get pre-requisites +prereqs) + prereqs + exit 0 + ;; +esac + +# Poor man's mdadm-last-resort@.timer +# That kicks in 2/3rds into the ROOTDELAY + +if [ ! -f /run/count.mdadm.initrd ] +then + COUNT=0 +else + COUNT=$(cat /run/count.mdadm.initrd) +fi +COUNT=$((COUNT + 1)) + +echo $COUNT > /run/count.mdadm.initrd + +MAX=30 +if [ ${ROOTDELAY:-0} -gt $MAX ]; then + MAX=$ROOTDELAY +fi +MAX=$((MAX*2/3)) + +if [ "$COUNT" = "$MAX" ] +then + # Poor man's mdadm-last-resort@.service + mdadm -q --run /dev/md?* + rm -f /run/count.mdadm.initrd +fi + +exit 0 diff --git a/debian/initramfs/script.local-bottom b/debian/initramfs/script.local-bottom new file mode 100644 index 00000000..eda3b179 --- /dev/null +++ b/debian/initramfs/script.local-bottom @@ -0,0 +1,3 @@ +#!/bin/sh +rm -f /run/count.mdadm.initrd +exit 0 \ No newline at end of file diff --git a/debian/mdadm-waitidle b/debian/mdadm-waitidle new file mode 100644 index 00000000..920272c7 --- /dev/null +++ b/debian/mdadm-waitidle @@ -0,0 +1,56 @@ +#!/bin/sh +# This script is not used when systemd is running +### BEGIN INIT INFO +# Provides: mdadm-waitidle +# Required-Start: +# Required-Stop: +# Should-Stop: halt reboot kexec +# X-Stop-After: umountroot +# Default-Start: +# Default-Stop: 0 6 +# Short-Description: Wait for MD arrays to become idle +# Description: Waits until all MD arrays are in idle and synced state +# before halt/reboot. 
+### END INIT INFO +# +set -eu + +MDADM=/sbin/mdadm +test -x "$MDADM" || exit 0 +test -f /proc/mdstat || exit 0 + +. /lib/lsb/init-functions + +case "${1:-}" in + + start|restart|force-reload) + # nothing, the only reason the script is here is to stop arrays + ;; + + stop) + sync + wait= + for md in /sys/block/md*/md ; do + [ -d "$md" ] || continue + [ "$wait" ] || log_action_begin_msg "Waiting for MD arrays to become idle" + wait=y + [ -w $md/sync_action ] && echo idle > $md/sync_action + done + if [ "$wait" ]; then + # mdadm --wait-clean has a short internal timeout + if $MDADM --wait-clean --scan; then + log_action_end_msg 0 + else + log_action_end_msg 1 + sleep 1 + fi + fi + ;; + + *) + echo "Usage: ${0:-} stop" >&2 + exit 1;; + +esac + +exit 0 diff --git a/debian/mdadm.config b/debian/mdadm.config new file mode 100644 index 00000000..b9d4e246 --- /dev/null +++ b/debian/mdadm.config @@ -0,0 +1,43 @@ +#!/bin/sh +# Copyright © 2001-2004 Mario Jou/3en +# Copyright © martin f. krafft +# Distributable under the terms of the GNU GPL version 2. +# +. /usr/share/debconf/confmodule +# see #369953 for ordering +set -eu + +CONFIG=/etc/mdadm/mdadm.conf +ALTCONFIG=/etc/mdadm.conf +[ ! -f $CONFIG ] && [ -f $ALTCONFIG ] && CONFIG=$ALTCONFIG + +DEBIANCONFIG=/etc/default/mdadm + +if [ -s $DEBIANCONFIG ] ; then + AUTOCHECK=true + START_DAEMON=true + MAILADDR=root + + [ -f $DEBIANCONFIG ] && . 
$DEBIANCONFIG + if [ -f $CONFIG ]; then + MAILADDR=$(sed -rne 's/^MAILADDR[[:space:]]*([^[:space:]]+).*/\1/p' $CONFIG) + fi + + [ -n "$AUTOCHECK" ] && db_set mdadm/autocheck "$AUTOCHECK" + [ -n "$START_DAEMON" ] && db_set mdadm/start_daemon "$START_DAEMON" + [ -n "$MAILADDR" ] && db_set mdadm/mail_to "$MAILADDR" +fi + +db_capb escape + +db_input medium mdadm/autocheck || : +db_go + +db_input medium mdadm/start_daemon || : +db_go + +db_get mdadm/start_daemon || : +if [ "$RET" = true ]; then + db_input medium mdadm/mail_to || : + db_go +fi diff --git a/debian/mdadm.cron.d b/debian/mdadm.cron.d new file mode 100644 index 00000000..309d180e --- /dev/null +++ b/debian/mdadm.cron.d @@ -0,0 +1,12 @@ +# +# cron.d/mdadm -- schedules periodic redundancy checks of MD devices +# +# Copyright © martin f. krafft +# distributed under the terms of the Artistic Licence 2.0 +# + +# By default, run at 00:57 on every Sunday, but do nothing unless the day of +# the month is less than or equal to 7. Thus, only run on the first Sunday of +# each month. crontab(5) sucks, unfortunately, in this regard; therefore this +# hack (see #380425). +57 0 * * 0 root if [ -x /usr/share/mdadm/checkarray ] && [ $(date +\%d) -le 7 ]; then /usr/share/mdadm/checkarray --cron --all --idle --quiet; fi diff --git a/debian/mdadm.cron.daily b/debian/mdadm.cron.daily new file mode 100644 index 00000000..d5ac1ae0 --- /dev/null +++ b/debian/mdadm.cron.daily @@ -0,0 +1,18 @@ +#!/bin/sh +# +# cron.daily/mdadm -- daily check that MD devices are functional +# +# Copyright © 2008 Paul Slootman +# distributed under the terms of the Artistic Licence 2.0 + +# As recommended by the manpage, run +# mdadm --monitor --scan --oneshot +# every day to ensure that any degraded MD devices don't go unnoticed. +# Email will go to the address specified in /etc/mdadm/mdadm.conf . 
+# +set -eu + +MDADM=/sbin/mdadm +[ -x $MDADM ] || exit 0 # package may be removed but not purged + +exec $MDADM --monitor --scan --oneshot diff --git a/debian/mdadm.doc-base.faq b/debian/mdadm.doc-base.faq new file mode 100644 index 00000000..3fff4504 --- /dev/null +++ b/debian/mdadm.doc-base.faq @@ -0,0 +1,9 @@ +Document: mdadm-faq +Title: mdadm Debian FAQ +Author: martin f. krafft +Abstract: The document answers frequently asked questions about Debian's mdadm +Section: System/Administration + +Format: text +Index: /usr/share/doc/mdadm/FAQ.gz +Files: /usr/share/doc/mdadm/FAQ.gz diff --git a/debian/mdadm.doc-base.recipes b/debian/mdadm.doc-base.recipes new file mode 100644 index 00000000..d1069682 --- /dev/null +++ b/debian/mdadm.doc-base.recipes @@ -0,0 +1,9 @@ +Document: mdadm-readme-recipes +Title: mdadm Debian recipes +Author: David Pashley +Abstract: The document contains some common recipes for mdadm usage on Debian +Section: System/Administration + +Format: text +Index: /usr/share/doc/mdadm/README.recipes.gz +Files: /usr/share/doc/mdadm/README.recipes.gz diff --git a/debian/mdadm.docs b/debian/mdadm.docs new file mode 100644 index 00000000..830665f4 --- /dev/null +++ b/debian/mdadm.docs @@ -0,0 +1,7 @@ +TODO +debian/README.recipes +debian/README.checkarray +debian/FAQ +ANNOUNCE-* +external-reshape-design.txt +mdmon-design.txt diff --git a/debian/mdadm.init b/debian/mdadm.init new file mode 100644 index 00000000..0a9004f0 --- /dev/null +++ b/debian/mdadm.init @@ -0,0 +1,100 @@ +#!/bin/sh +# +# Start the MD monitor daemon for all active MD arrays if desired. +# This script is not used under systemd. +# +# Copyright © 2001-2005 Mario Jou/3en +# Copyright © 2005-2009 Martin F. Krafft +# Distributable under the terms of the GNU GPL version 2. 
+# +### BEGIN INIT INFO +# Provides: mdadm +# Required-Start: $local_fs $syslog +# Required-Stop: $local_fs $syslog sendsigs +# Default-Start: 2 3 4 5 +# Default-Stop: 0 1 6 +# Short-Description: MD monitoring daemon +# Description: mdadm provides a monitor mode, in which it will scan for +# problems with the MD devices. If a problem is found, the +# administrator is alerted via email, or a custom script is +# run. +### END INIT INFO +# +set -eu + +MDADM=/sbin/mdadm +MDMON=/sbin/mdmon +RUNDIR=/run/mdadm +PIDFILE=$RUNDIR/monitor.pid +DEBIANCONFIG=/etc/default/mdadm + +test -x "$MDADM" || exit 0 + +test -f /proc/mdstat || exit 0 + +START_DAEMON=true +test -f $DEBIANCONFIG && . $DEBIANCONFIG + +. /lib/lsb/init-functions + +is_true() +{ + case "${1:-}" in + [Yy]es|[Yy]|1|[Tt]|[Tt]rue) return 0;; + *) return 1; + esac +} + +case "${1:-}" in + start) + if [ -x /usr/bin/systemd-detect-virt ] && /usr/bin/systemd-detect-virt --quiet --container; then + log_daemon_msg "Not starting MD monitoring service in container" + log_end_msg 0 + exit 0 + fi + + if is_true $START_DAEMON; then + log_daemon_msg "Starting MD monitoring service" "mdadm --monitor" + mkdir -p $RUNDIR + set +e + start-stop-daemon -S -p $PIDFILE -x $MDADM -- \ + --monitor --pid-file $PIDFILE --daemonise --scan ${DAEMON_OPTIONS:-} + log_end_msg $? + set -e + fi + if [ "$(echo $RUNDIR/md[0-9]*.pid)" != "$RUNDIR/md[0-9]*.pid" ]; then + log_daemon_msg "Restarting MD external metadata monitor" "mdmon --takeover --all" + set +e + $MDMON --takeover --all + log_end_msg $? + set -e + fi + ;; + stop) + if [ -f $PIDFILE ] ; then + log_daemon_msg "Stopping MD monitoring service" "mdadm --monitor" + set +e + start-stop-daemon -K -p $PIDFILE -x $MDADM + log_end_msg $? # must directly follow start-stop-daemon so that its exit status, not rm's, is reported + rm -f $PIDFILE + set -e + fi + for file in $RUNDIR/md[0-9]*.pid ; do + [ ! -f "$file" ] && continue + ln -sf $file /run/sendsigs.omit.d/mdmon-${file##*/} + done + ;; + status) + status_of_proc -p $PIDFILE "$MDADM" "mdadm" && exit 0 || exit $? 
+ ;; + restart|reload|force-reload) + ${0:-} stop + ${0:-} start + ;; + *) + echo "Usage: ${0:-} {start|stop|status|restart|reload|force-reload}" >&2 + exit 1 + ;; +esac + +exit 0 diff --git a/debian/mdadm.logcheck.ignore.server b/debian/mdadm.logcheck.ignore.server new file mode 100644 index 00000000..051c4732 --- /dev/null +++ b/debian/mdadm.logcheck.ignore.server @@ -0,0 +1,23 @@ +^\w{3} [ :0-9]{11} [._[:alnum:]-]+ kernel:( \[ *[[:digit:]]+\.[[:digit:]]+\])? md: md driver [.[:digit:]]+ MAX_MD_DEVS=[[:digit:]]+, MD_SB_DISKS=[[:digit:]]+$ +^\w{3} [ :0-9]{11} [._[:alnum:]-]+ kernel:( \[ *[[:digit:]]+\.[[:digit:]]+\])? md: bitmap version [.[:digit:]]+$ +^\w{3} [ :0-9]{11} [._[:alnum:]-]+ kernel:( \[ *[[:digit:]]+\.[[:digit:]]+\])? md: md[[:digit:]]+ stopped\.$ +^\w{3} [ :0-9]{11} [._[:alnum:]-]+ kernel:( \[ *[[:digit:]]+\.[[:digit:]]+\])? md: md[[:digit:]]+ still in use\.$ +^\w{3} [ :0-9]{11} [._[:alnum:]-]+ kernel:( \[ *[[:digit:]]+\.[[:digit:]]+\])? md: cannot remove active disk [[:alnum:]]+ from md[[:digit:]]+ \.\.\. ?$ +^\w{3} [ :0-9]{11} [._[:alnum:]-]+ kernel:( \[ *[[:digit:]]+\.[[:digit:]]+\])? md: raid([01456]|456|10) personality registered for level ([01456]|10)$ +^\w{3} [ :0-9]{11} [._[:alnum:]-]+ kernel:( \[ *[[:digit:]]+\.[[:digit:]]+\])? md: (data-check|requested-resync|resync|reshape|recovery) of RAID array md[[:digit:]]+$ +^\w{3} [ :0-9]{11} [._[:alnum:]-]+ kernel:( \[ *[[:digit:]]+\.[[:digit:]]+\])? md: resuming (data-check|requested-resync|resync|reshape|recovery) of md[[:digit:]]+ from checkpoint\.$ +^\w{3} [ :0-9]{11} [._[:alnum:]-]+ kernel:( \[ *[[:digit:]]+\.[[:digit:]]+\])? md: md[[:digit:]]+: (data-check|requested-resync|resync|reshape|recovery) done\.$ +^\w{3} [ :0-9]{11} [._[:alnum:]-]+ kernel:( \[ *[[:digit:]]+\.[[:digit:]]+\])? md: minimum _guaranteed_ ?speed: [[:digit:]]+ KB/sec/disk\.$ +^\w{3} [ :0-9]{11} [._[:alnum:]-]+ kernel:( \[ *[[:digit:]]+\.[[:digit:]]+\])? 
md: using maximum available idle IO bandwidth \(but not more than [[:digit:]]+ KB/sec\) for (data-check|requested-resync|resync|reshape|recovery)\.$ +^\w{3} [ :0-9]{11} [._[:alnum:]-]+ kernel:( \[ *[[:digit:]]+\.[[:digit:]]+\])? md: delaying (data-check|requested-resync|resync|reshape|recovery) of md[[:digit:]]+ until md[[:digit:]]+ has finished \(they share one or more physical units\)$ +^\w{3} [ :0-9]{11} [._[:alnum:]-]+ kernel:( \[ *[[:digit:]]+\.[[:digit:]]+\])? md: using [[:digit:]]+k window, over a total of [[:digit:]]+( blocks|k)\.$ +^\w{3} [ :0-9]{11} [._[:alnum:]-]+ kernel:( \[ *[[:digit:]]+\.[[:digit:]]+\])? md: (un)?bind<[^>]+>$ +^\w{3} [ :0-9]{11} [._[:alnum:]-]+ kernel:( \[ *[[:digit:]]+\.[[:digit:]]+\])? md: export_rdev\([^)]+\)$ +^\w{3} [ :0-9]{11} [._[:alnum:]-]+ kernel:( \[ *[[:digit:]]+\.[[:digit:]]+\])? raid[[:digit:]]+: raid set [[:alnum:]]+ active with [[:digit:]]+ out of [[:digit:]]+ mirrors$ +^\w{3} [ :0-9]{11} [._[:alnum:]-]+ kernel:( \[ *[[:digit:]]+\.[[:digit:]]+\])? RAID([01456]|10) conf printout:$ +^\w{3} [ :0-9]{11} [._[:alnum:]-]+ kernel:( \[ *[[:digit:]]+\.[[:digit:]]+\])?[[:space:]]+---( [wrf]d:[[:digit:]]+){2,3}$ +^\w{3} [ :0-9]{11} [._[:alnum:]-]+ kernel:( \[ *[[:digit:]]+\.[[:digit:]]+\])?[[:space:]]+disk [[:digit:]]+,( wo:[[:digit:]]+,)? 
o:[[:digit:]]+, dev:[[:alnum:]]+$ +^\w{3} [ :0-9]{11} [._[:alnum:]-]+ mdadm(\[[[:digit:]]+\])?: Rebuild((Start|Finish)ed|[[:digit:]]+) event detected on md device /dev/[-_./[:alnum:]]+$ +^\w{3} [ :0-9]{11} [._[:alnum:]-]+ mdadm(\[[[:digit:]]+\])?: SpareActive event detected on md device /dev/[-_./[:alnum:]]+, component device /dev/[-_./[:alnum:]]+$ +^\w{3} [ :0-9]{11} [._[:alnum:]-]+ mdadm(\[[[:digit:]]+\])?: (New|Degraded)Array event detected on md device /dev/[-_./[:alnum:]]+$ +^\w{3} [ :0-9]{11} [._[:alnum:]-]+ mdadm(\[[[:digit:]]+\])?: DeviceDisappeared event detected on md device /dev/[-_./[:alnum:]]+$ diff --git a/debian/mdadm.logcheck.violations b/debian/mdadm.logcheck.violations new file mode 100644 index 00000000..ea8cce72 --- /dev/null +++ b/debian/mdadm.logcheck.violations @@ -0,0 +1,3 @@ +^\w{3} [ :0-9]{11} [._[:alnum:]-]+ kernel:( \[ *[[:digit:]]+\.[[:digit:]]+\])? md: kicking non-fresh [[:alnum:]]+ from array!$ +^\w{3} [ :0-9]{11} [._[:alnum:]-]+ kernel:( \[ *[[:digit:]]+\.[[:digit:]]+\])? raid[[:digit:]]+: Disk failure on [[:alnum:]]+, disabling device\.$ +^\w{3} [ :0-9]{11} [._[:alnum:]-]+ kernel:( \[ *[[:digit:]]+\.[[:digit:]]+\])?[[:space:]]+Operation continuing on [[:digit:]]+ devices?$ diff --git a/debian/mdadm.maintscript b/debian/mdadm.maintscript new file mode 100644 index 00000000..17290594 --- /dev/null +++ b/debian/mdadm.maintscript @@ -0,0 +1 @@ +rm_conffile /etc/init.d/mdadm-raid 3.4-2~ diff --git a/debian/mdadm.modules b/debian/mdadm.modules new file mode 100644 index 00000000..5ad12499 --- /dev/null +++ b/debian/mdadm.modules @@ -0,0 +1,8 @@ +# mdadm module configuration file +# set start_ro=1 to make newly assembled arrays read-only initially, +# to prevent metadata writes. This is needed in order to allow +# resume-from-disk to work - new boot should not perform writes +# because it will be done behind the back of the system being +# resumed. See http://bugs.debian.org/415441 for details. 
+ +options md_mod start_ro=1 diff --git a/debian/mdadm.postinst b/debian/mdadm.postinst new file mode 100644 index 00000000..fbe1f362 --- /dev/null +++ b/debian/mdadm.postinst @@ -0,0 +1,107 @@ +#!/bin/sh +# Copyright © 2001-2005 Mario Jou/3en +# Copyright © 2005-2008 Martin F. Krafft +# Distributable under the terms of the GNU GPL version 2. +# +set -e + +. /usr/share/debconf/confmodule + +case "${1:-}" in + configure|reconfigure) + + if [ ! -f /proc/mdstat ] && [ -x $(command -v modprobe 2>/dev/null) ]; then + modprobe md >/dev/null 2>&1 || : + fi + if [ ! -f /proc/mdstat ]; then + echo 'W: mdadm: failed to load MD subsystem.' >&2 + fi + + DEBIANCONFIG=/etc/default/mdadm + CONFIG=/etc/mdadm/mdadm.conf + ALTCONFIG=/etc/mdadm.conf + MDADM=/sbin/mdadm + + # load current settings, most of which will be overwritten. + [ -f $DEBIANCONFIG ] && . $DEBIANCONFIG + + db_get mdadm/mail_to + MAILADDR="${RET:-root}" + + [ ! -f $CONFIG ] && [ -f $ALTCONFIG ] && CONFIG=$ALTCONFIG + if [ ! -f $CONFIG ]; then + echo -n 'Generating mdadm.conf... ' >&2 + # pass the MAILADDR variable into the script + MDADM_MAILADDR__="$MAILADDR"; export MDADM_MAILADDR__ + if /usr/share/mdadm/mkconf generate $CONFIG 2>/dev/null; then + echo done. >&2 + else + echo "done (failed to scan arrays; /proc probably not mounted)." 
>&2 + fi + fi + + if [ -w $CONFIG ] && [ -z "${MDADM_MAILADDR__:-}" ]; then + # if the configuration is writeable but has not been written just + # before, then edit it to reflect the MAILADDR preference + if grep -q '^MAILADDR' $CONFIG; then + sed -i -e "s/^MAILADDR.*/MAILADDR $MAILADDR/" $CONFIG + else + echo "MAILADDR $MAILADDR" >> $CONFIG + fi + fi + unset MDADM_MAILADDR__ + + db_get mdadm/autocheck + AUTOCHECK="${RET:-true}" + db_get mdadm/start_daemon + START_DAEMON="${RET:-true}" + #db_get mdadm/daemon_options + [ -n "${DAEMON_OPTIONS:-}" ] || DAEMON_OPTIONS='--syslog' + + cat <<_eof > $DEBIANCONFIG +# mdadm Debian configuration +# +# You can run 'dpkg-reconfigure mdadm' to modify the values in this file, if +# you want. You can also change the values here and changes will be preserved. +# Do note that only the values are preserved; the rest of the file is +# rewritten. +# + +# AUTOCHECK: +# should mdadm run periodic redundancy checks over your arrays? See +# /etc/cron.d/mdadm. +AUTOCHECK=$AUTOCHECK + +# START_DAEMON: +# should mdadm start the MD monitoring daemon during boot? +START_DAEMON=$START_DAEMON + +# DAEMON_OPTIONS: +# additional options to pass to the daemon. +DAEMON_OPTIONS="$DAEMON_OPTIONS" + +# VERBOSE: +# if this variable is set to true, mdadm will be a little more verbose e.g. +# when creating the initramfs. 
+VERBOSE=${VERBOSE:-false} +_eof + + db_stop + + # Remove old init script + update-rc.d mdadm-raid remove + + command -v update-initramfs >/dev/null 2>&1 && update-initramfs -u + + if dpkg --compare-versions "$2" le 3.3.2-3; then + rm -f /var/lib/mdadm/CONF-UNCHECKED /var/lib/mdadm/mdadm.conf-generated + if [ -d /var/lib/mdadm ]; then + rmdir --ignore-fail-on-non-empty /var/lib/mdadm + fi + fi + ;; +esac + +[ -d /run/systemd/system ] && systemctl --system daemon-reload >/dev/null || : + +#DEBHELPER# diff --git a/debian/mdadm.postrm b/debian/mdadm.postrm new file mode 100644 index 00000000..eaa62ad3 --- /dev/null +++ b/debian/mdadm.postrm @@ -0,0 +1,25 @@ +#! /bin/sh +# Copyright © 2001,2002 Mario Jou/3en +# Copyright © 2006-2008 Martin F. Krafft +# Distributable under the terms of the GNU GPL version 2. +# +set -e + +case "${1:-}" in + remove) + if command -v update-initramfs >/dev/null 2>&1; then + echo "W: mdadm: I'll update the initramfs, but if you need MD to boot" >&2 + echo "W: mdadm: with initramfs, your system may be left unbootable!" >&2 + update-initramfs -u + fi + ;; + + purge) + rm -f /etc/default/mdadm /etc/mdadm.conf /etc/mdadm/mdadm.conf + ;; + +esac + +[ -d /run/systemd/system ] && systemctl --system daemon-reload >/dev/null || : + +#DEBHELPER# diff --git a/debian/mdadm.templates b/debian/mdadm.templates new file mode 100644 index 00000000..5a797c55 --- /dev/null +++ b/debian/mdadm.templates @@ -0,0 +1,38 @@ +# These templates have been reviewed by the debian-l10n-english +# team +# +# If modifications/additions/rewording are needed, please ask +# debian-l10n-english@lists.debian.org for advice. +# +# Even minor modifications require translation updates and such +# changes should be coordinated with translators and reviewers. + +Template: mdadm/autocheck +Type: boolean +Default: true +_Description: Should mdadm run monthly redundancy checks of the MD arrays? 
+ If the kernel supports it (versions greater than 2.6.14), mdadm can periodically check the + redundancy of MD arrays (RAIDs). This may be a resource-intensive process, + depending on the local setup, but it could help prevent rare cases of data loss. + Note that this is a read-only check unless errors are found; if errors are + found, mdadm will try to correct them, which may result in write access to + the media. + . + The default, if turned on, is to check on the first Sunday of every + month at 01:06. + +Template: mdadm/start_daemon +Type: boolean +Default: true +_Description: Do you want to start the MD monitoring daemon? + The MD (RAID) monitor daemon sends email notifications in response to + important MD events (such as a disk failure). + . + Enabling this option is recommended. + +Template: mdadm/mail_to +Type: string +Default: root +_Description: Recipient for email notifications: + Please enter the email address of the user who should get the email + notifications for important MD events. diff --git a/debian/mkconf b/debian/mkconf new file mode 100644 index 00000000..632c9120 --- /dev/null +++ b/debian/mkconf @@ -0,0 +1,97 @@ +#!/bin/sh +# +# mkconf -- outputs valid mdadm.conf contents for the local system +# +# Copyright © martin f. krafft +# distributed under the terms of the Artistic Licence 2.0 +# +set -eu + +ME="${0##*/}" +MDADM=/sbin/mdadm +DEBIANCONFIG=/etc/default/mdadm +CONFIG=/etc/mdadm/mdadm.conf + +# initialise config variables in case the environment leaks +MAILADDR= DEVICE= HOMEHOST= PROGRAM= + +test -r $DEBIANCONFIG && . 
$DEBIANCONFIG + +if [ -n "${MDADM_MAILADDR__:-}" ]; then + # honour MAILADDR from the environment (from postinst) + MAILADDR="$MDADM_MAILADDR__" +else + # preserve existing MAILADDR + MAILADDR="$(sed -ne 's/^MAILADDR //p' $CONFIG 2>/dev/null)" || : +fi + +# save existing values as defaults +if [ -r "$CONFIG" ]; then + DEVICE="$(sed -ne 's/^DEVICE //p' $CONFIG)" + HOMEHOST="$(sed -ne 's/^HOMEHOST //p' $CONFIG)" + PROGRAM="$(sed -ne 's/^PROGRAM //p' $CONFIG)" +fi + +[ "${1:-}" = force-generate ] && rm -f $CONFIG +case "${1:-}" in + generate|force-generate) + [ -n "${2:-}" ] && CONFIG=$2 + # only barf if the config file specifies anything else than MAILADDR + if egrep -qv '^(MAILADDR.*|#.*|)$' $CONFIG 2>/dev/null; then + echo "E: $ME: $CONFIG already exists." >&2 + exit 255 + fi + + mkdir --parent ${CONFIG%/*} + exec >$CONFIG + ;; +esac + +cat <<_eof +# mdadm.conf +# +# Please refer to mdadm.conf(5) for information about this file. +# + +# by default (built-in), scan all partitions (/proc/partitions) and all +# containers for MD superblocks. alternatively, specify devices to scan, using +# wildcards if desired. +#DEVICE ${DEVICE:-partitions containers} + +# automatically tag new arrays as belonging to the local system +HOMEHOST ${HOMEHOST:-} + +# instruct the monitoring daemon where to send mail alerts +MAILADDR ${MAILADDR:-root} + +_eof + +if [ -n "${PROGRAM:-}" ]; then + cat <<-_eof + # program to run when mdadm monitor detects potentially interesting events + PROGRAM ${PROGRAM} + + _eof +fi + +error=0 +if [ ! -r /proc/mdstat ]; then + echo W: $ME: MD subsystem is not loaded, thus I cannot scan for arrays. >&2 + error=1 +elif [ ! -r /proc/partitions ]; then + echo W: $ME: /proc/partitions cannot be read, thus I cannot scan for arrays. >&2 + error=2 +else + echo "# definitions of existing MD arrays" + if ! $MDADM --examine --scan --config=partitions; then + error=$(($? + 128)) + echo W: $ME: failed to scan for partitions. >&2 + echo "### WARNING: scan failed." 
+ else + echo + fi +fi + +echo "# This configuration was auto-generated on $(date -R) by mkconf" + +exit $error diff --git a/debian/patches/debian-conffile-location.diff b/debian/patches/debian-conffile-location.diff new file mode 100644 index 00000000..8acc6077 --- /dev/null +++ b/debian/patches/debian-conffile-location.diff @@ -0,0 +1,115 @@ +From: martin f. krafft +Subject: Set /etc/mdadm/mdadm.conf as primary config file location + +On Debian, the configuration file resides primarily in /etc/mdadm/mdadm.conf, +/etc/mdadm.conf is only used as a backup. + +This is a Debian-specific patch. + +Forwarded: not-needed +Reviewed-by: martin f. krafft + +--- + Makefile | 4 ++-- + ReadMe.c | 2 +- + mdadm.8.in | 14 ++++++-------- + mdadm.conf.5 | 2 +- + mdassemble.8 | 2 +- + 5 files changed, 11 insertions(+), 13 deletions(-) + +--- a/Makefile ++++ b/Makefile +@@ -61,8 +61,8 @@ + PKG_CONFIG ?= pkg-config + + SYSCONFDIR = /etc +-CONFFILE = $(SYSCONFDIR)/mdadm.conf +-CONFFILE2 = $(SYSCONFDIR)/mdadm/mdadm.conf ++CONFFILE = $(SYSCONFDIR)/mdadm/mdadm.conf ++CONFFILE2 = $(SYSCONFDIR)/mdadm.conf + MAILCMD =/usr/sbin/sendmail -t + CONFFILEFLAGS = -DCONFFILE=\"$(CONFFILE)\" -DCONFFILE2=\"$(CONFFILE2)\" + # Both MAP_DIR and MDMON_DIR should be somewhere that persists across the +--- a/ReadMe.c ++++ b/ReadMe.c +@@ -590,7 +590,7 @@ + ; + + char Help_config[] = +-"The /etc/mdadm.conf config file:\n\n" ++"The /etc/mdadm/mdadm.conf config file:\n\n" + " The config file contains, apart from blank lines and comment lines that\n" + " start with a hash(#), array lines, device lines, and various\n" + " configuration lines.\n" +--- a/mdadm.8.in ++++ b/mdadm.8.in +@@ -264,13 +264,13 @@ + .TP + .BR \-c ", " \-\-config= + Specify the config file or directory. 
Default is to use +-.B /etc/mdadm.conf ++.B /etc/mdadm/mdadm.conf + and +-.BR /etc/mdadm.conf.d , ++.BR /etc/mdadm/mdadm.conf.d , + or if those are missing then +-.B /etc/mdadm/mdadm.conf ++.B /etc/mdadm.conf + and +-.BR /etc/mdadm/mdadm.conf.d . ++.BR /etc/mdadm.conf.d . + If the config file given is + .B "partitions" + then nothing will be read, but +@@ -1742,9 +1742,9 @@ + or requested with (a possibly implicit) + .BR \-\-scan . + In the later case, +-.B /etc/mdadm.conf +-or + .B /etc/mdadm/mdadm.conf ++or ++.B /etc/mdadm.conf + is used. + + If +@@ -3003,7 +3003,7 @@ + is given in Misc mode, and to monitor array reconstruction + on Monitor mode. + +-.SS /etc/mdadm.conf ++.SS /etc/mdadm/mdadm.conf (or /etc/mdadm.conf) + + The config file lists which devices may be scanned to see if + they contain MD super block, and gives identifying information +@@ -3011,7 +3011,7 @@ + .BR mdadm.conf (5) + for more details. + +-.SS /etc/mdadm.conf.d ++.SS /etc/mdadm/mdadm.conf.d (or /etc/mdadm.conf.d) + + A directory containing configuration files which are read in lexical + order. +--- a/mdadm.conf.5 ++++ b/mdadm.conf.5 +@@ -8,7 +8,7 @@ + .SH NAME + mdadm.conf \- configuration for management of Software RAID with mdadm + .SH SYNOPSIS +-/etc/mdadm.conf ++/etc/mdadm/mdadm.conf + .SH DESCRIPTION + .PP + .I mdadm +--- a/mdassemble.8 ++++ b/mdassemble.8 +@@ -40,7 +40,7 @@ + + .SH FILES + +-.SS /etc/mdadm.conf ++.SS /etc/mdadm/mdadm.conf + + The config file lists which devices may be scanned to see if + they contain MD super block, and gives identifying information diff --git a/debian/patches/debian-no-Werror.diff b/debian/patches/debian-no-Werror.diff new file mode 100644 index 00000000..0a427f16 --- /dev/null +++ b/debian/patches/debian-no-Werror.diff @@ -0,0 +1,24 @@ +From: martin f. krafft +Subject: Remove -Werror from compiler flags + +-Werror seems like a bad idea on released/packaged code because a toolchain +update (introducing new warnings) could break the build. 
We'll let upstream +use it to beautify the code, but remove it for our builds. + +Signed-off-by: martin f. krafft + +--- + Makefile | 2 +- + 1 files changed, 1 insertions(+), 1 deletions(-) + +--- a/Makefile ++++ b/Makefile +@@ -43,7 +43,7 @@ + + CC = $(CROSS_COMPILE)gcc + CXFLAGS ?= -ggdb +-CWFLAGS = -Wall -Werror -Wstrict-prototypes -Wextra -Wno-unused-parameter ++CWFLAGS = -Wall -Wstrict-prototypes -Wextra -Wno-unused-parameter + ifdef WARN_UNUSED + CWFLAGS += -Wp,-D_FORTIFY_SOURCE=2 -O3 + endif diff --git a/debian/patches/mdmonitor-service-simplify.diff b/debian/patches/mdmonitor-service-simplify.diff new file mode 100644 index 00000000..ea706a68 --- /dev/null +++ b/debian/patches/mdmonitor-service-simplify.diff @@ -0,0 +1,20 @@ +Subject: simplify mdmonitor.service +From: Michael Tokarev +Date: Fri, 14 Nov 2014 19:18:05 +0300 +Bug-Debian: http://bugs.debian.org/764647 +Forwarded: no + +There isn't much for customization for mdadm --monitor. +it'll just do what it's supposed to do, so just run it. 
+ +--- a/systemd/mdmonitor.service ++++ b/systemd/mdmonitor.service +@@ -10,7 +10,4 @@ Description=MD array monitor + DefaultDependencies=no + + [Service] +-Environment= MDADM_MONITOR_ARGS=--scan +-EnvironmentFile=-/run/sysconfig/mdadm +-ExecStartPre=-/usr/lib/systemd/scripts/mdadm_env.sh +-ExecStart=BINDIR/mdadm --monitor $MDADM_MONITOR_ARGS ++ExecStart=BINDIR/mdadm --monitor --scan diff --git a/debian/patches/readlink-path.patch b/debian/patches/readlink-path.patch new file mode 100644 index 00000000..86544592 --- /dev/null +++ b/debian/patches/readlink-path.patch @@ -0,0 +1,15 @@ +From: Michael Tokarev +Subject: readlink is in /bin not /usr/bin on debian +Date: Fri, 14 Nov 2014 19:11:51 +0300 +Bug-Debian: http://bugs.debian.org/766416 +Forwarded: no + +This is a debian-specific change, upstream ships +the rule to use /usr/bin/readlink while on debian +it is /bin/readlink + +--- a/udev-md-raid-arrays.rules ++++ b/udev-md-raid-arrays.rules +@@ -38 +38 @@ ENV{MD_LEVEL}=="raid[1-9]*", ENV{SYSTEMD_WANTS}+="mdmonitor.service" +-ENV{MD_LEVEL}=="raid[1-9]*", ENV{MD_CONTAINER}=="?*", PROGRAM="/usr/bin/readlink $env{MD_CONTAINER}", ENV{MD_MON_THIS}="%c" ++ENV{MD_LEVEL}=="raid[1-9]*", ENV{MD_CONTAINER}=="?*", PROGRAM="/bin/readlink $env{MD_CONTAINER}", ENV{MD_MON_THIS}="%c" diff --git a/debian/patches/series b/debian/patches/series new file mode 100644 index 00000000..728fabf4 --- /dev/null +++ b/debian/patches/series @@ -0,0 +1,5 @@ +debian-conffile-location.diff +debian-no-Werror.diff +sha1-includes.diff +readlink-path.patch +mdmonitor-service-simplify.diff diff --git a/debian/patches/sha1-includes.diff b/debian/patches/sha1-includes.diff new file mode 100644 index 00000000..0dfd7daf --- /dev/null +++ b/debian/patches/sha1-includes.diff @@ -0,0 +1,40 @@ +From: Michael Tokarev +Subject: do not #include ansidecl.h from sha1.h, use system headers + +In 3.2.5 version of mdadm, new sha1 implementation has been included +which tries to include ansidecl.h header which is 
internal to some +other project. But this #include isn't really necessary, since this +implementation does not actually use any defines from ansidecl.h. So +just remove the #include, instead of adding a new external dependency. + +References: http://www.spinics.net/lists/raid/msg38859.html + +While at it, unconditionally include system headers like limits.h and +stdint.h, since on a Linux system these headers are available, and +these contain more definitive information about real system types than +any guesses. + +--- a/sha1.h ++++ b/sha1.h +@@ -22,7 +22,7 @@ + + #include <stdio.h> + +-#if defined HAVE_LIMITS_H || _LIBC ++#if 1 /* defined HAVE_LIMITS_H || _LIBC */ + # include <limits.h> + #endif + +@@ -33,9 +33,9 @@ + the resulting executable. Locally running cross-compiled executables + is usually not possible. */ + +-#ifdef _LIBC +-# include <sys/types.h> +-typedef u_int32_t sha1_uint32; ++#if 1 /* def _LIBC */ ++# include <stdint.h> ++typedef uint32_t sha1_uint32; + typedef uintptr_t sha1_uintptr; + #else + # define INT_MAX_32_BITS 2147483647 diff --git a/debian/po/POTFILES.in b/debian/po/POTFILES.in new file mode 100644 index 00000000..04922385 --- /dev/null +++ b/debian/po/POTFILES.in @@ -0,0 +1 @@ +[type: gettext/rfc822deb] mdadm.templates diff --git a/debian/po/ca.po b/debian/po/ca.po new file mode 100644 index 00000000..a58c3ffd --- /dev/null +++ b/debian/po/ca.po @@ -0,0 +1,184 @@ +# mdadm Catalan translation. +# Copyright (C) 2004-2006 Software in the Public Interest +# This file is distributed under the same license as the squid package. +# Innocent De Marchi , 2011. 
+# +msgid "" +msgstr "" +"Project-Id-Version: 3.1.4-1+8efb9d1\n" +"Report-Msgid-Bugs-To: mdadm@packages.debian.org\n" +"POT-Creation-Date: 2016-07-05 14:12+0200\n" +"PO-Revision-Date: 2011-05-17 16:54+0100\n" +"Last-Translator: Innocent De Marchi \n" +"Language-Team: Catalan \n" +"Language: ca\n" +"MIME-Version: 1.0\n" +"Content-Type: text/plain; charset=utf-8\n" +"Content-Transfer-Encoding: 8bit\n" +"X-Poedit-Language: Catalan\n" +"X-Poedit-Country: SPAIN\n" + +#. Type: boolean +#. Description +#: ../mdadm.templates:2001 +msgid "Should mdadm run monthly redundancy checks of the MD arrays?" +msgstr "" +"Voleu que «mdadm» executi comprovacions de redundància mensuals de les " +"matrius MD?" + +#. Type: boolean +#. Description +#: ../mdadm.templates:2001 +msgid "" +"If the kernel supports it (versions greater than 2.6.14), mdadm can " +"periodically check the redundancy of MD arrays (RAIDs). This may be a " +"resource-intensive process, depending on the local setup, but it could help " +"prevent rare cases of data loss. Note that this is a read-only check unless " +"errors are found; if errors are found, mdadm will try to correct them, which " +"may result in write access to the media." +msgstr "" +"Si el nucli ho accepta (versions superiors a la 2.6.14), «mdadm» pot fer " +"comprovacions periòdiques de la redundància de les matrius MD (RAIDs). Pot " +"ésser que aquest procés consumeixi molts recursos del sistema, depenent de " +"la configuració, però pot ajudar a prevenir casos poc freqüents de pèrdua de " +"dades. Teniu present que aquestes comprovacions es fan en mode lectura " +"llevat que es detectin errors: si hi ha errors, «mdadm» els corregirà i per " +"això, caldrà que tengui accés d'escriptura als mitjans físics. " + +#. Type: boolean +#. Description +#: ../mdadm.templates:2001 +msgid "" +"The default, if turned on, is to check on the first Sunday of every month at " +"01:06." 
+msgstr "" +"El valor predeterminat, si s'activa, es fer la comprovació el primer " +"diumenge de cada mes a les 01:06 am." + +#. Type: boolean +#. Description +#: ../mdadm.templates:3001 +msgid "Do you want to start the MD monitoring daemon?" +msgstr "Desitjau arrencar el dimoni monitor MD?" + +#. Type: boolean +#. Description +#: ../mdadm.templates:3001 +msgid "" +"The MD (RAID) monitor daemon sends email notifications in response to " +"important MD events (such as a disk failure)." +msgstr "" +"El dimoni monitor de MD (RAID) envia notificacions per correu electrònic " +"quan es produeixen esdeveniments importants en els dispositius MD (com un " +"error de disc)." + +#. Type: boolean +#. Description +#: ../mdadm.templates:3001 +msgid "Enabling this option is recommended." +msgstr "Es recomana l'activació d'aquesta opció." + +#. Type: string +#. Description +#: ../mdadm.templates:4001 +msgid "Recipient for email notifications:" +msgstr "Destinatari de les notificacions de correu electrònic:" + +#. Type: string +#. Description +#: ../mdadm.templates:4001 +msgid "" +"Please enter the email address of the user who should get the email " +"notifications for important MD events." +msgstr "" +"Introduïu l'adreça de correu electrònic de l'usuari que ha de rebre les " +"notificacions de correu electrònic per a esdeveniments MD rellevants." + +#~ msgid "MD arrays needed for the root file system:" +#~ msgstr "Les matrius MD necessaris per al sistema de fitxers arrel:" + +#~ msgid "" +#~ "Please enter 'all', 'none', or a space-separated list of devices such as " +#~ "'md0 md1' or 'md/1 md/d0' (the leading '/dev/' can be omitted)." +#~ msgstr "" +#~ "Escriviu «all» (tots), «none» (cap) o una llista separada per espais dels " +#~ "dispositius com «md0 md1» o «md/1 md/d0» (podeu ometre el «/dev/» " +#~ "inicial)." + +#~ msgid "for internal use - only the long description is needed." +#~ msgstr "per a ús intern, només és necessària la descripció llarga. 
" + +#~ msgid "" +#~ "If the system's root file system is located on an MD array (RAID), it " +#~ "needs to be started early during the boot sequence. If it is located on a " +#~ "logical volume (LVM), which is on MD, all constituent arrays need to be " +#~ "started." +#~ msgstr "" +#~ "Si el sistema de fitxers arrel del sistema està en un conjunt MD (RAID), " +#~ "cal que s'iniciï al principi de la seqüència d'arrencada. Si està en un " +#~ "volum lògic (LVM), que està definit sobre un MD, cal iniciar totes les " +#~ "matrius que el constitueixen." + +#~ msgid "" +#~ "If you know exactly which arrays are needed to bring up the root file " +#~ "system, and you want to postpone starting all other arrays to a later " +#~ "point in the boot sequence, enter the arrays to start here. " +#~ "Alternatively, enter 'all' to simply start all available arrays." +#~ msgstr "" +#~ "Si sabeu exactament quines matrius són necessàries per arrencar el " +#~ "sistema de fitxers arrel, i vol ajornar l'arrencada de la resta de " +#~ "conjunts a un punt posterior de la seqüència d'arrencada, Introduïu aquí " +#~ "els conjunts que voleu arrencar. També podeu seleccionar «all» per, " +#~ "simplement, arrencar tots els disponibles." + +#~ msgid "" +#~ "If you do not need or want to start any arrays for the root file system, " +#~ "leave the answer blank (or enter 'none'). This may be the case if you are " +#~ "using kernel autostart or do not need any arrays to boot." +#~ msgstr "" +#~ "Si no necessita o vol arrencar qualsevol matriu per al sistema de fitxers " +#~ "arrel, deixau en blanc la resposta (o escriviu «none»). Pot ésser el seu " +#~ "cas si fa servir l'auto-arrencada del nucli o no necessiteu cap matriu en " +#~ "l'arrencada." + +#~ msgid "An error occurred: device node does not exist" +#~ msgstr "S'ha produït un error: el node de dispositiu no existeix." + +#~ msgid "An error occurred: not a block device" +#~ msgstr "S'ha produït un error: no és un dispositiu de blocs." 
+ +#~ msgid "An error occurred: not an MD array" +#~ msgstr "S'ha produït un error: no és un conjunt («array») MD." + +#~ msgid "An error occurred: array not listed in mdadm.conf file" +#~ msgstr "" +#~ "S'ha produït un error: la matriu («array») no apareix llistada en el " +#~ "fitxer de configuració «mdadm.conf»." + +#~ msgid "Start arrays not listed in mdadm.conf?" +#~ msgstr "Voleu arrencar les matrius no llistats en el fitxer «mdadm.conf»?" + +#~ msgid "" +#~ "The specified array (${array}) is not listed in the configuration file " +#~ "(${config}). Therefore, it cannot be started during boot, unless you " +#~ "correct the configuration file and recreate the initial ramdisk." +#~ msgstr "" +#~ "La matriu («array») especificada (${array}) no apareix llistada en el " +#~ "fitxer de configuració (${config}). Per tant, no es pot iniciar la matriu " +#~ "durant l'arrencada del sistema, llevat que corregeixi el fitxer de " +#~ "configuració i regenereu el disc RAM inicial." + +#~ msgid "" +#~ "This warning is only relevant if you need arrays to be started from the " +#~ "initial ramdisk to be able to boot. If you use kernel autostarting, or do " +#~ "not need any arrays to be started as early as the initial ramdisk is " +#~ "loaded, you can simply continue. Alternatively, choose not to continue " +#~ "and enter 'none' when prompted which arrays to start from the initial " +#~ "ramdisk." +#~ msgstr "" +#~ "Aquest avís només és important si necessiteu que les matrius s'arrenquin " +#~ "en el disc RAM inicial per poder arrencar el sistema. Si feu servir " +#~ "l'arrencada automàtica del nucli o no necessiteu que les matrius " +#~ "estiguin arrencats quan es carregui el disc RAM, podeu continuar. També " +#~ "podeu decidir no continuar i introduir «none» quan se li demani quines " +#~ "matrius cal iniciar del disc RAM inicial." 
diff --git a/debian/po/cs.po b/debian/po/cs.po new file mode 100644 index 00000000..d44601d3 --- /dev/null +++ b/debian/po/cs.po @@ -0,0 +1,228 @@ +# +# Translators, if you are not familiar with the PO format, gettext +# documentation is worth reading, especially sections dedicated to +# this format, e.g. by running: +# info -n '(gettext)PO Files' +# info -n '(gettext)Header Entry' +# +# Some information specific to po-debconf are available at +# /usr/share/doc/po-debconf/README-trans +# or http://www.debian.org/intl/l10n/po-debconf/README-trans +# +# Developers do not need to manually edit POT or PO files. +# +msgid "" +msgstr "" +"Project-Id-Version: mdadm\n" +"Report-Msgid-Bugs-To: mdadm@packages.debian.org\n" +"POT-Creation-Date: 2016-07-05 14:12+0200\n" +"PO-Revision-Date: 2008-02-17 21:10+0100\n" +"Last-Translator: Miroslav Kure \n" +"Language-Team: Czech \n" +"Language: cs\n" +"MIME-Version: 1.0\n" +"Content-Type: text/plain; charset=UTF-8\n" +"Content-Transfer-Encoding: 8bit\n" + +#. Type: boolean +#. Description +#: ../mdadm.templates:2001 +msgid "Should mdadm run monthly redundancy checks of the MD arrays?" +msgstr "Má mdadm spouštět měsíční kontroly redundance MD polí?" + +#. Type: boolean +#. Description +#: ../mdadm.templates:2001 +msgid "" +"If the kernel supports it (versions greater than 2.6.14), mdadm can " +"periodically check the redundancy of MD arrays (RAIDs). This may be a " +"resource-intensive process, depending on the local setup, but it could help " +"prevent rare cases of data loss. Note that this is a read-only check unless " +"errors are found; if errors are found, mdadm will try to correct them, which " +"may result in write access to the media." +msgstr "" +"Pokud to vaše jádro podporuje (verze větší než 2.6.14), může mdadm " +"pravidelně kontrolovat redundanci MD polí (RAIDů). Podle konfigurace " +"počítače to může být proces velmi náročný na prostředky, ovšem může předejít " +"vzácným případům ztráty dat. 
Pokud nejsou nalezeny chyby, používá tato " +"kontrola v zásadě jen čtecí operace. Při nalezení chyb se je mdadm pokusí " +"opravit, což může znamenat zápis na médium." + +#. Type: boolean +#. Description +#: ../mdadm.templates:2001 +msgid "" +"The default, if turned on, is to check on the first Sunday of every month at " +"01:06." +msgstr "" +"Pokud kontrolu povolíte, bude se dle výchozího nastavení spouštět každou " +"první neděli v měsíci v 01:06 ráno." + +#. Type: boolean +#. Description +#: ../mdadm.templates:3001 +msgid "Do you want to start the MD monitoring daemon?" +msgstr "Chcete spustit daemon pro monitorování MD?" + +#. Type: boolean +#. Description +#: ../mdadm.templates:3001 +msgid "" +"The MD (RAID) monitor daemon sends email notifications in response to " +"important MD events (such as a disk failure)." +msgstr "" +"Daemon pro monitorování MD (RAIDu) zasílá emailová upozornění na významné MD " +"události, jako je selhání disku." + +#. Type: boolean +#. Description +#: ../mdadm.templates:3001 +msgid "Enabling this option is recommended." +msgstr "Povolení této možnosti je doporučeno." + +#. Type: string +#. Description +#: ../mdadm.templates:4001 +msgid "Recipient for email notifications:" +msgstr "Příjemce emailových upozornění:" + +#. Type: string +#. Description +#: ../mdadm.templates:4001 +msgid "" +"Please enter the email address of the user who should get the email " +"notifications for important MD events." +msgstr "" +"Zadejte prosím emailovou adresu uživatele, který má dostávat emailová " +"upozornění při výskytu významných MD událostí." + +#~ msgid "MD arrays needed for the root file system:" +#~ msgstr "MD pole vyžadovaná pro kořenový souborový systém:" + +#~ msgid "" +#~ "Please enter 'all', 'none', or a space-separated list of devices such as " +#~ "'md0 md1' or 'md/1 md/d0' (the leading '/dev/' can be omitted)." +#~ msgstr "" +#~ "Zadejte prosím mezerami oddělený seznam zařízení, případně „all“ nebo " +#~ "„none“. 
Počáteční „/dev/“ můžete vynechat a zadat jen např. „md0 md1“ " +#~ "nebo „md/1 md/d0“." + +#~ msgid "for internal use - only the long description is needed." +#~ msgstr "pro vnitřní použití - pouze kvůli zobrazení dlouhého popisu." + +#~ msgid "" +#~ "If the system's root file system is located on an MD array (RAID), it " +#~ "needs to be started early during the boot sequence. If it is located on a " +#~ "logical volume (LVM), which is on MD, all constituent arrays need to be " +#~ "started." +#~ msgstr "" +#~ "Pokud je kořenový souborový systém umístěn na MD (RAID) svazku, musí být " +#~ "tento spuštěn během zavádění systému co nejdříve. Pokud se kořenový " +#~ "souborový systém nachází na logickém svazku LVM, který je vytvořen nad MD " +#~ "polem, musí se spustit všechna související pole." + +#~ msgid "" +#~ "If you know exactly which arrays are needed to bring up the root file " +#~ "system, and you want to postpone starting all other arrays to a later " +#~ "point in the boot sequence, enter the arrays to start here. " +#~ "Alternatively, enter 'all' to simply start all available arrays." +#~ msgstr "" +#~ "Jestliže přesně víte, která pole jsou potřeba pro připojení kořenového " +#~ "souborového systému a zároveň chcete pozdržet spuštění ostatních polí na " +#~ "pozdější dobu, zadejte zde prosím pole, která se mají spustit. Chcete-li " +#~ "spustit všechna dostupná pole, můžete zadat „all“." + +#~ msgid "" +#~ "If you do not need or want to start any arrays for the root file system, " +#~ "leave the answer blank (or enter 'none'). This may be the case if you are " +#~ "using kernel autostart or do not need any arrays to boot." +#~ msgstr "" +#~ "Nepotřebujete-li nebo nechcete-li spouštět pole pro kořenový souborový " +#~ "systém, ponechte odpověď prázdnou, případně zadejte „none“. To může " +#~ "nastat třeba v případě, že používáte automatický start přímo v jádře, " +#~ "nebo pokud k zavedení systému žádná pole nepotřebujete." 
+ +#~ msgid "An error occurred: device node does not exist" +#~ msgstr "Vyskytla se chyba: uzel zařízení neexistuje" + +#~ msgid "An error occurred: not a block device" +#~ msgstr "Vyskytla se chyba: není blokovým zařízením" + +#~ msgid "An error occurred: not an MD array" +#~ msgstr "Vyskytla se chyba: není MD polem" + +#~ msgid "An error occurred: array not listed in mdadm.conf file" +#~ msgstr "Vyskytla se chyba: pole není uvedeno v souboru mdadm.conf" + +#~ msgid "Start arrays not listed in mdadm.conf?" +#~ msgstr "Spustit pole neuvedená v mdadm.conf?" + +#~ msgid "" +#~ "The specified array (${array}) is not listed in the configuration file " +#~ "(${config}). Therefore, it cannot be started during boot, unless you " +#~ "correct the configuration file and recreate the initial ramdisk." +#~ msgstr "" +#~ "Zadané pole (${array}) není uvedeno v konfiguračním souboru ${config} a " +#~ "tím pádem nemůže být spuštěno při zavádění systému. Napravit to můžete " +#~ "opravou konfiguračního souboru a znovuvytvořením počátečního ramdisku." + +#~ msgid "" +#~ "This warning is only relevant if you need arrays to be started from the " +#~ "initial ramdisk to be able to boot. If you use kernel autostarting, or do " +#~ "not need any arrays to be started as early as the initial ramdisk is " +#~ "loaded, you can simply continue. Alternatively, choose not to continue " +#~ "and enter 'none' when prompted which arrays to start from the initial " +#~ "ramdisk." +#~ msgstr "" +#~ "Toto varování je relevantní pouze pokud k zavedení systému potřebujete, " +#~ "aby se pole spustila z počátečního ramdisku. Používáte-li automatické " +#~ "spouštění přímo v jádře, nebo pokud nepotřebujete pouštět žádná pole " +#~ "ještě z počátečního ramdisku, můžete jednoduše pokračovat. Jinou možností " +#~ "je nepokračovat dále a při dotazu na seznam polí, která se mají spouštět " +#~ "z počátečního ramdisku, zadat 'none'." 
+ +#~ msgid "Initialise the superblock if you reuse hard disks" +#~ msgstr "Při znovupoužití starších disků inicializujte superblok" + +#~ msgid "" +#~ "WARNING! If you are using hard disks which have RAID superblocks from " +#~ "earlier installations in different RAID arrays, you MUST zero each " +#~ "superblock *before* activating the autostart feature." +#~ msgstr "" +#~ "VAROVÁNÍ! Používáte-li pevné disky, které obsahují RAID superbloky z " +#~ "dřívější instalace v jiném RAID poli, MUSÍTE všechny superbloky před " +#~ "použitím automatického spouštění vynulovat." + +#~ msgid "" +#~ "To do this, do not start the RAID devices automatically. First, zero the " +#~ "superblock (mdadm --zero-superblock /dev/mdX). Next, use `dpkg-" +#~ "reconfigure mdadm` to reactivate the autostart feature." +#~ msgstr "" +#~ "Chcete-li to provést, nespouštějte RAID zařízení automaticky. Nejprve " +#~ "vynulujte superblok příkazem 'mdadm --zero-superblock /dev/mdX' a teprve " +#~ "poté můžete povolit automatické spouštění RAIDu příkazem 'dpkg-" +#~ "reconfigure mdadm'." + +#~ msgid "" +#~ "You have the option to start all other arrays (those not needed for the " +#~ "root filesystem) later in the boot sequence. Doing so will give you " +#~ "greater control over the arrays with the mdadm configuration file. " +#~ "Starting all arrays at boot-time may be safer though." +#~ msgstr "" +#~ "Všechna ostatní pole (ta, která nejsou potřeba pro kořenový souborový " +#~ "systém) můžete spustit později. Pokud tak učiníte, budete mít v " +#~ "konfiguračním souboru mdadm nad poli větší kontrolu. Na druhou stranu je " +#~ "spouštění všech polí hned na začátku zavádění o něco jistější volbou." + +#~ msgid "" +#~ "If RAID devices are started automatically, all RAID devices are " +#~ "discovered and assembled automatically at system startup. This option " +#~ "should only be used if the md driver is compiled as a module. 
If it is " +#~ "compiled into your kernel, the automatic startup will be performed at " +#~ "boot time by the kernel and therefore you should not choose this option." +#~ msgstr "" +#~ "Jestliže jsou RAID zařízení spouštěna automaticky, jsou všechna RAID " +#~ "zařízení rozpoznána a poskládána automaticky při zavádění systému. Tuto " +#~ "volbu byste měli použít pouze v případě, že jste ovladač md zakompilovali " +#~ "jako modul. Pokud jste jej zakompilovali přímo do jádra, o automatické " +#~ "spuštění se postará samotné jádro a tedy tuto možnost nepotřebujete." diff --git a/debian/po/da.po b/debian/po/da.po new file mode 100644 index 00000000..c6bdffed --- /dev/null +++ b/debian/po/da.po @@ -0,0 +1,175 @@ +# Danish translation mdadm. +# Copyright (C) 2011 mdadm & nedenstående oversættere. +# This file is distributed under the same license as the mdadm package. +# Joe Hansen , 2011. +# +msgid "" +msgstr "" +"Project-Id-Version: mdadm\n" +"Report-Msgid-Bugs-To: mdadm@packages.debian.org\n" +"POT-Creation-Date: 2016-07-05 14:12+0200\n" +"PO-Revision-Date: 2011-04-03 17:30+01:00\n" +"Last-Translator: Joe Hansen \n" +"Language-Team: Danish \n" +"Language: \n" +"MIME-Version: 1.0\n" +"Content-Type: text/plain; charset=UTF-8\n" +"Content-Transfer-Encoding: 8bit\n" + +#. Type: boolean +#. Description +#: ../mdadm.templates:2001 +msgid "Should mdadm run monthly redundancy checks of the MD arrays?" +msgstr "Skal mdadm køre månedlig redundanskontrol af MD arrays?" + +#. Type: boolean +#. Description +#: ../mdadm.templates:2001 +msgid "" +"If the kernel supports it (versions greater than 2.6.14), mdadm can " +"periodically check the redundancy of MD arrays (RAIDs). This may be a " +"resource-intensive process, depending on the local setup, but it could help " +"prevent rare cases of data loss. Note that this is a read-only check unless " +"errors are found; if errors are found, mdadm will try to correct them, which " +"may result in write access to the media." 
+msgstr "" +"Hvis kernen understøtter det (versioner større end 2.6.14), kan mdadm " +"periodisk kontrollere redundansen på MD arrays (RAID'er). Det kan være en " +"ressourcekrævende proces, afhængig af den lokale opsætning, men det kan " +"hjælpe med at forhindre at du i sjældne tilfælde får datatab. Bemærk at " +"dette er en skrivebeskyttet kontrol med mindre at der findes fejl; hvis der " +"registreres fejl vil mdadm forsøge at rette dem, hvilket kan medføre " +"skriveadgang til mediet." + +#. Type: boolean +#. Description +#: ../mdadm.templates:2001 +msgid "" +"The default, if turned on, is to check on the first Sunday of every month at " +"01:06." +msgstr "" +"Standarden - hvis aktiveret - er at kontrollere på den første søndag i hver " +"måned klokken 01:06." + +#. Type: boolean +#. Description +#: ../mdadm.templates:3001 +msgid "Do you want to start the MD monitoring daemon?" +msgstr "Ønsker du at starte MD-overvågningsdæmonen?" + +#. Type: boolean +#. Description +#: ../mdadm.templates:3001 +msgid "" +"The MD (RAID) monitor daemon sends email notifications in response to " +"important MD events (such as a disk failure)." +msgstr "" +"MD-overvågningsdæmonen (RAID) sender e-post-påmindelser udløst af vigtige MD-" +"hændelser (såsom en diskfejl)." + +#. Type: boolean +#. Description +#: ../mdadm.templates:3001 +msgid "Enabling this option is recommended." +msgstr "Aktivering af denne indstilling anbefales." + +#. Type: string +#. Description +#: ../mdadm.templates:4001 +msgid "Recipient for email notifications:" +msgstr "Modtager af e-post-påmindelser:" + +#. Type: string +#. Description +#: ../mdadm.templates:4001 +msgid "" +"Please enter the email address of the user who should get the email " +"notifications for important MD events." +msgstr "" +"Indtast venligst e-post-adressen på brugeren, som skal modtage e-post-" +"påmindelser for vigtige MD-hændelser." 
+ +#~ msgid "MD arrays needed for the root file system:" +#~ msgstr "MD arrays krævet for rodfilsystemet:" + +#~ msgid "" +#~ "Please enter 'all', 'none', or a space-separated list of devices such as " +#~ "'md0 md1' or 'md/1 md/d0' (the leading '/dev/' can be omitted)." +#~ msgstr "" +#~ "Indtast venligst »all«, »none« eller en mellemrumsadskilt liste af " +#~ "enheder såsom »md0 md1« eller »md/1 md/d0« (det foranstillede »/dev/« kan " +#~ "udelades)." + +#~ msgid "for internal use - only the long description is needed." +#~ msgstr "for intern brug - kun den lange beskrivelse er krævet." + +#~ msgid "" +#~ "If the system's root file system is located on an MD array (RAID), it " +#~ "needs to be started early during the boot sequence. If it is located on a " +#~ "logical volume (LVM), which is on MD, all constituent arrays need to be " +#~ "started." +#~ msgstr "" +#~ "Hvis systemets rodfilsystem er placeret på en MD-array (RAID), skal det " +#~ "startes tidligt under opstartssekvensen. Hvis den er placeret på en " +#~ "logisk diskenhed (LVM), som er på MD, skal alle indgående arrays startes." + +#~ msgid "" +#~ "If you know exactly which arrays are needed to bring up the root file " +#~ "system, and you want to postpone starting all other arrays to a later " +#~ "point in the boot sequence, enter the arrays to start here. " +#~ "Alternatively, enter 'all' to simply start all available arrays." +#~ msgstr "" +#~ "Hvis du ved præcis hvilke arrays som er krævet, for at få rodfilsystemet " +#~ "op, og du ønsker at udsætte start af alle andre arrays til et senere " +#~ "tidspunkt i opstartssekvensen, så indtast her de arrays som skal startes. " +#~ "Alternativt kan du indtaste »all« for at starte alle tilgængelige arrays." + +#~ msgid "" +#~ "If you do not need or want to start any arrays for the root file system, " +#~ "leave the answer blank (or enter 'none'). This may be the case if you are " +#~ "using kernel autostart or do not need any arrays to boot." 
+#~ msgstr "" +#~ "Hvis du ikke har brug for eller ønsker at starte nogen arrays for " +#~ "rodfilsystemet, så efterlad svaret tomt (eller indtast »none«). Dette kan " +#~ "være tilfældet, hvis du bruger automatisk start af kernen eller ikke skal " +#~ "bruge arrays til at starte op med." + +#~ msgid "An error occurred: device node does not exist" +#~ msgstr "Der opstod en fejl: Enhedsknude findes ikke" + +#~ msgid "An error occurred: not a block device" +#~ msgstr "Der opstod en fejl: Ikke en blokenhed" + +#~ msgid "An error occurred: not an MD array" +#~ msgstr "Der opstod en fejl: Ikke en MD array" + +#~ msgid "An error occurred: array not listed in mdadm.conf file" +#~ msgstr "Der opstod en fejl: Array er ikke anført i mdadm.conf-filen" + +#~ msgid "Start arrays not listed in mdadm.conf?" +#~ msgstr "Start arrays, som ikke er anført i mdadm.conf?" + +#~ msgid "" +#~ "The specified array (${array}) is not listed in the configuration file " +#~ "(${config}). Therefore, it cannot be started during boot, unless you " +#~ "correct the configuration file and recreate the initial ramdisk." +#~ msgstr "" +#~ "Den angivne array (${array}) er ikke anført i konfigurationsfilen " +#~ "(${config}). Den kan derfor ikke startes under opstarten, med mindre du " +#~ "retter i konfigurationsfilen og gendanner den oprindelige ramdisk." + +#~ msgid "" +#~ "This warning is only relevant if you need arrays to be started from the " +#~ "initial ramdisk to be able to boot. If you use kernel autostarting, or do " +#~ "not need any arrays to be started as early as the initial ramdisk is " +#~ "loaded, you can simply continue. Alternatively, choose not to continue " +#~ "and enter 'none' when prompted which arrays to start from the initial " +#~ "ramdisk." +#~ msgstr "" +#~ "Denne advarsel er kun relevant hvis du skal have arrays til at blive " +#~ "startet fra den oprindelige ramdisk for at kunne starte op. 
Hvis du " +#~ "bruger den automatiske opstart i kernen, eller ikke skal bruge at arrays " +#~ "startes så tidligt som den oprindelige ramdisk indlæses, så kan du bare " +#~ "fortsætte. Alternativt så vælg ikke at fortsætte, og indtast »none« når du " +#~ "bliver spurgt om hvilke arrays, der skal startes fra den oprindelige " +#~ "ramdisk." diff --git a/debian/po/de.po b/debian/po/de.po new file mode 100644 index 00000000..140cc0bd --- /dev/null +++ b/debian/po/de.po @@ -0,0 +1,249 @@ +# Translators, if you are not familiar with the PO format, gettext +# documentation is worth reading, especially sections dedicated to +# this format, e.g. by running: +# info -n '(gettext)PO Files' +# info -n '(gettext)Header Entry' +# Some information specific to po-debconf are available at +# /usr/share/doc/po-debconf/README-trans +# or http://www.debian.org/intl/l10n/po-debconf/README-trans +# Developers do not need to manually edit POT or PO files. +# +# +msgid "" +msgstr "" +"Project-Id-Version: mdadm 2.6.9-3\n" +"Report-Msgid-Bugs-To: mdadm@packages.debian.org\n" +"POT-Creation-Date: 2016-07-05 14:12+0200\n" +"PO-Revision-Date: 2009-06-24 17:35+0200\n" +"Last-Translator: Mario Joussen \n" +"Language-Team: German \n" +"Language: de\n" +"MIME-Version: 1.0\n" +"Content-Type: text/plain; charset=ISO-8859-1\n" +"Content-Transfer-Encoding: 8bit\n" + +#. Type: boolean +#. Description +#: ../mdadm.templates:2001 +msgid "Should mdadm run monthly redundancy checks of the MD arrays?" +msgstr "" +"Soll mdadm monatlich die Redundanzüberprüfung auf den RAID-Verbünden " +"ausführen?" + +#. Type: boolean +#. Description +#: ../mdadm.templates:2001 +msgid "" +"If the kernel supports it (versions greater than 2.6.14), mdadm can " +"periodically check the redundancy of MD arrays (RAIDs). This may be a " +"resource-intensive process, depending on the local setup, but it could help " +"prevent rare cases of data loss. 
Note that this is a read-only check unless " +"errors are found; if errors are found, mdadm will try to correct them, which " +"may result in write access to the media." +msgstr "" +"Falls Ihr Kernel es unterstützt (Versionen größer als 2.6.14) kann mdadm " +"regelmäßig die Redundanz Ihrer MD-Verbünde (RAID) überprüfen. Dies kann " +"abhängig von Ihrer Installation ein resourcenintensiver Vorgang sein, der " +"aber helfen kann, seltene Fälle von Datenverlust zu vermeiden. Bitte " +"beachten Sie, dass diese Überprüfung nur lesend erfolgt, solange keine " +"Fehler gefunden werden. Falls Fehler gefunden werden, wird mdadm versuchen, " +"diese zu beheben, was zu schreibendem Zugriff auf das Medium führen kann." + +#. Type: boolean +#. Description +#: ../mdadm.templates:2001 +msgid "" +"The default, if turned on, is to check on the first Sunday of every month at " +"01:06." +msgstr "" +"Die Voreinstellung ist, falls eingeschaltet, die Überprüfung am ersten " +"Sonntag jedes Monats um 01:06 Uhr durchzuführen." + +#. Type: boolean +#. Description +#: ../mdadm.templates:3001 +msgid "Do you want to start the MD monitoring daemon?" +msgstr "Möchten Sie den RAID-Überwachungsdämon starten?" + +#. Type: boolean +#. Description +#: ../mdadm.templates:3001 +msgid "" +"The MD (RAID) monitor daemon sends email notifications in response to " +"important MD events (such as a disk failure)." +msgstr "" +"Der MD- (RAID-)Überwachungsdämon verschickt Benachrichtigungen als Reaktion " +"auf wichtige RAID-Ereignisse (wie zum Beispiel Festplattenfehler)." + +#. Type: boolean +#. Description +#: ../mdadm.templates:3001 +msgid "Enabling this option is recommended." +msgstr "Das Aktivieren dieser Option ist empfohlen." + +#. Type: string +#. Description +#: ../mdadm.templates:4001 +msgid "Recipient for email notifications:" +msgstr "Empfänger der E-Mail-Benachrichtungen:" + +#. Type: string +#. 
Description +#: ../mdadm.templates:4001 +msgid "" +"Please enter the email address of the user who should get the email " +"notifications for important MD events." +msgstr "" +"Geben Sie bitte die E-Mail-Adresse des Benutzers an, der die E-Mail-" +"Benachrichtigungen für wichtige MD-Ereignisse erhalten soll." + +#~ msgid "MD arrays needed for the root file system:" +#~ msgstr "Für das Wurzeldateisystem benötigte MD-Verbünde:" + +#~ msgid "" +#~ "Please enter 'all', 'none', or a space-separated list of devices such as " +#~ "'md0 md1' or 'md/1 md/d0' (the leading '/dev/' can be omitted)." +#~ msgstr "" +#~ "Bitte geben Sie »all«, »none« oder eine leerzeichenseparierte Geräteliste " +#~ "wie zum Beispiel »md0 md1« oder »md/1 md/d0« ein (das führende »/dev/« " +#~ "kann weggelassen werden)." + +#~ msgid "for internal use - only the long description is needed." +#~ msgstr "" +#~ "für internen Gebrauch - es wird nur die ausführliche Beschreibung " +#~ "benötigt." + +#~ msgid "" +#~ "If the system's root file system is located on an MD array (RAID), it " +#~ "needs to be started early during the boot sequence. If it is located on a " +#~ "logical volume (LVM), which is on MD, all constituent arrays need to be " +#~ "started." +#~ msgstr "" +#~ "Wenn das Wurzeldateisystem Ihres Systems auf einem MD-Verbund (RAID) " +#~ "liegt, muss es frühzeitig während des Bootvorgangs gestartet werden. Wenn " +#~ "sich Ihr Wurzeldateisystem auf einem logischen Laufwerk (LVM) befindet, " +#~ "das sich wiederum auf einem MD-Verbund befindet, müssen alle zugehörigen " +#~ "Verbünde gestartet werden." + +#~ msgid "" +#~ "If you know exactly which arrays are needed to bring up the root file " +#~ "system, and you want to postpone starting all other arrays to a later " +#~ "point in the boot sequence, enter the arrays to start here. " +#~ "Alternatively, enter 'all' to simply start all available arrays." 
+#~ msgstr "" +#~ "Wenn Sie genau wissen, welche Verbünde benötigt werden, um das " +#~ "Wurzeldateisystem zu starten, und Sie den Start der anderen Verbünde auf " +#~ "einen späteren Zeitpunkt in der Bootreihenfolge verschieben wollen, geben " +#~ "Sie die zu startenden Verbünde hier ein. Alternativ geben Sie »all« ein, " +#~ "um alle verfügbaren Verbünde zu starten." + +#~ msgid "" +#~ "If you do not need or want to start any arrays for the root file system, " +#~ "leave the answer blank (or enter 'none'). This may be the case if you are " +#~ "using kernel autostart or do not need any arrays to boot." +#~ msgstr "" +#~ "Falls Sie keine RAID-Verbünde für das Wurzeldateisystem benötigen oder " +#~ "starten wollen, lassen Sie die Antwort leer (oder geben »none« ein). Dies " +#~ "könnte der Fall sein, wenn Sie entweder die Autostartfunktion des Kernels " +#~ "verwenden oder keine Verbünde zum Booten benötigen." + +#~ msgid "An error occurred: device node does not exist" +#~ msgstr "Ein Fehler ist aufgetreten: Geräteknoten existiert nicht" + +#~ msgid "An error occurred: not a block device" +#~ msgstr "Ein Fehler ist aufgetreten: kein Blockgerät" + +#~ msgid "An error occurred: not an MD array" +#~ msgstr "Ein Fehler ist aufgetreten: kein RAID-Verbund" + +#~ msgid "An error occurred: array not listed in mdadm.conf file" +#~ msgstr "" +#~ "Ein Fehler ist aufgetreten: Verbund nicht in der Datei mdadm.conf " +#~ "aufgeführt" + +#~ msgid "Start arrays not listed in mdadm.conf?" +#~ msgstr "Nicht in mdadm.conf aufgeführte Verbünde starten?" + +#~ msgid "" +#~ "The specified array (${array}) is not listed in the configuration file " +#~ "(${config}). Therefore, it cannot be started during boot, unless you " +#~ "correct the configuration file and recreate the initial ramdisk." +#~ msgstr "" +#~ "Der angegebene Verbund (${array}) ist in der Konfigurationsdatei " +#~ "${config} nicht aufgeführt. 
Deshalb kann er während des Bootvorgangs " +#~ "nicht gestartet werden, es sei denn, Sie korrigieren die " +#~ "Konfigurationsdatei und erzeugen die initiale Ramdisk neu." + +#~ msgid "" +#~ "This warning is only relevant if you need arrays to be started from the " +#~ "initial ramdisk to be able to boot. If you use kernel autostarting, or do " +#~ "not need any arrays to be started as early as the initial ramdisk is " +#~ "loaded, you can simply continue. Alternatively, choose not to continue " +#~ "and enter 'none' when prompted which arrays to start from the initial " +#~ "ramdisk." +#~ msgstr "" +#~ "Diese Warnung ist nur von Bedeutung, wenn Sie RAID-Verbünde, die von der " +#~ "initialen Ramdisk gestartet werden, benötigen, um booten zu können. Falls " +#~ "Sie die Autostartfunktion des Kernels verwenden oder kein RAID-Verbund " +#~ "zum frühen Zeitpunkt des Ladens der initialen Ramdisk gestartet werden " +#~ "muss, können Sie einfach fortfahren. Alternativ wählen Sie, nicht " +#~ "fortzufahren und geben »none« ein, wenn Sie gefragt werden, welche RAID-" +#~ "Verbünde von der initialen Ramdisk gestartet werden sollen." + +#~ msgid "Initialise the superblock if you reuse hard disks" +#~ msgstr "" +#~ "Initialisieren Sie den Superblock, wenn Sie Festplatten wieder verwenden." + +#~ msgid "" +#~ "WARNING! If you are using hard disks which have RAID superblocks from " +#~ "earlier installations in different RAID arrays, you MUST zero each " +#~ "superblock *before* activating the autostart feature." +#~ msgstr "" +#~ "WARNUNG! Wenn Sie Festplatten verwenden, die bereits einen md-Superblock " +#~ "von einer vorherigen Installation in einem anderen RAID-Verbund besitzen, " +#~ "so MÜSSEN Sie diesen löschen, *bevor* Sie die Autostart-Funktion " +#~ "aktivieren." + +#~ msgid "" +#~ "To do this, do not start the RAID devices automatically. First, zero the " +#~ "superblock (mdadm --zero-superblock /dev/mdX). 
Next, use `dpkg-" +#~ "reconfigure mdadm` to reactivate the autostart feature." +#~ msgstr "" +#~ "Dazu starten Sie die RAID-Laufwerke nicht automatisch und löschen dann " +#~ "erst den Superblock (mdadm --zero-superblock /dev/mdX). Danach können Sie " +#~ "mit »dpkg-reconfigure mdadm« die Autostart-Funktion aktivieren." + +#~ msgid "" +#~ "You have the option to start all other arrays (those not needed for the " +#~ "root filesystem) later in the boot sequence. Doing so will give you " +#~ "greater control over the arrays with the mdadm configuration file. " +#~ "Starting all arrays at boot-time may be safer though." +#~ msgstr "" +#~ "Sie haben die Option, alle anderen Verbünde (diese die nicht für das " +#~ "Wurzeldateisystem benötigt werden) später während des Bootvorgangs zu " +#~ "starten. Damit haben Sie größere Kontrolle über die Verbünde mit Hilfe " +#~ "der mdadm-Konfigurationsdatei. Es ist jedoch sicherer, alle Verbünde beim " +#~ "Booten zu starten." + +#~ msgid "" +#~ "If RAID devices are started automatically, all RAID devices are " +#~ "discovered and assembled automatically at system startup. This option " +#~ "should only be used if the md driver is compiled as a module. If it is " +#~ "compiled into your kernel, the automatic startup will be performed at " +#~ "boot time by the kernel and therefore you should not choose this option." +#~ msgstr "" +#~ "Wenn die RAID-Laufwerke automatisch gestartet werden, werden alle RAID-" +#~ "Laufwerke beim Systemstart automatisch gefunden und gestartet. Diese " +#~ "Option sollte nur benutzt werden, falls der md-Treiber als Modul " +#~ "kompiliert wurde. Falls er in den Kernel einkompiliert wurde, führt der " +#~ "Kernel den automatischen Start beim Booten durch und Sie sollten diese " +#~ "Option deshalb nicht auswählen." + +#~ msgid "" +#~ "When the RAID monitor daemon runs, email notifications are sent when a " +#~ "disk belonging to a RAID array fails or changes its status for some " +#~ "reason." 
+#~ msgstr "" +#~ "Wird der RAID-Überwachungsdaemon gestartet, so werden E-Mail-" +#~ "Benachrichtigungen verschickt, falls ein zum RAID gehörendes Laufwerk " +#~ "ausfällt oder den Status ändert." diff --git a/debian/po/es.po b/debian/po/es.po new file mode 100644 index 00000000..626b144e --- /dev/null +++ b/debian/po/es.po @@ -0,0 +1,218 @@ +# mdadm po-debconf translation to spanish +# Copyright (C) 2006 Software in the Public Interest, SPI Inc. +# This file is distributed under the same license as the mdadm package. +# +# Changes: +# - Initial translation +# Javier Fernández-Sanguino , 2006 +# - Revision +# Fernando Cerezal +# +# +# Traductores, si no conoce el formato PO, merece la pena leer la +# documentación de gettext, especialmente las secciones dedicadas a este +# formato, por ejemplo ejecutando: +# info -n '(gettext)PO Files' +# info -n '(gettext)Header Entry' +# +# Equipo de traducción al español, por favor lean antes de traducir +# los siguientes documentos: +# +# - El proyecto de traducción de Debian al español +# http://www.debian.org/intl/spanish/ +# especialmente las notas y normas de traducción en +# http://www.debian.org/intl/spanish/notas +# +# - La guía de traducción de po's de debconf: +# /usr/share/doc/po-debconf/README-trans +# o http://www.debian.org/intl/l10n/po-debconf/README-trans +# +# Si tiene dudas o consultas sobre esta traducción consulte con el último +# traductor (campo Last-Translator) y ponga en copia a la lista de +# traducción de Debian al español () +# +# Notas: +# - 'array' no está traducido aán. 
La traducción como 'arreglo' suena +# fatal (y es poco conocida) [ cambiar cuando se cambie en d-i ] +# +msgid "" +msgstr "" +"Project-Id-Version: mdadm 2.5.6-6\n" +"Report-Msgid-Bugs-To: mdadm@packages.debian.org\n" +"POT-Creation-Date: 2016-07-05 14:12+0200\n" +"PO-Revision-Date: 2008-04-25 17:47+0200\n" +"Last-Translator: Javier Fernández-Sanguino \n" +"Language-Team: Debian Spanish \n" +"Language: \n" +"MIME-Version: 1.0\n" +"Content-Type: text/plain; charset=ISO-8859-15\n" +"Content-Transfer-Encoding: 8bit\n" + +#. Type: boolean +#. Description +#: ../mdadm.templates:2001 +msgid "Should mdadm run monthly redundancy checks of the MD arrays?" +msgstr "" +"¿Debería mdadm ejecutar comprobaciones de redundancia mensuales de los " +"arrays MD?" + +#. Type: boolean +#. Description +#: ../mdadm.templates:2001 +msgid "" +"If the kernel supports it (versions greater than 2.6.14), mdadm can " +"periodically check the redundancy of MD arrays (RAIDs). This may be a " +"resource-intensive process, depending on the local setup, but it could help " +"prevent rare cases of data loss. Note that this is a read-only check unless " +"errors are found; if errors are found, mdadm will try to correct them, which " +"may result in write access to the media." +msgstr "" +"Mdadm puede comprobar de forma periódica la redundancia de sus arrays MD " +"(RAIDs) si el núcleo lo soporta (si su versión es superior a la 2.6.14). " +"Esto puede ser un proceso que consuma muchos recursos, dependiendo de su " +"configuración, pero podría ayudar a prevenir casos raros de pérdida de " +"datos. Tenga en cuenta que estas comprobaciones se hacen en modo lectura " +"salvo que se detecten errores, en cuyo caso mdadm necesitará corregirlos, lo " +"que significa que será necesario tener acceso de escritura a los medios " +"físicos." + +#. Type: boolean +#. Description +#: ../mdadm.templates:2001 +msgid "" +"The default, if turned on, is to check on the first Sunday of every month at " +"01:06." 
+msgstr "" +"El valor por omisión, si se activa, es comprobar el primer domingo de cada " +"mes a las 01:06 am." + +#. Type: boolean +#. Description +#: ../mdadm.templates:3001 +msgid "Do you want to start the MD monitoring daemon?" +msgstr "¿Desea arrancar el demonio de monitorización MD?" + +#. Type: boolean +#. Description +#: ../mdadm.templates:3001 +msgid "" +"The MD (RAID) monitor daemon sends email notifications in response to " +"important MD events (such as a disk failure)." +msgstr "" +"El demonio de monitorización MD (RAID) envía notificaciones por correo " +"electrónico cuando se producen eventos importantes en los dispositivos MD " +"(como pueda ser el caso de un fallo de un disco)." + +#. Type: boolean +#. Description +#: ../mdadm.templates:3001 +msgid "Enabling this option is recommended." +msgstr "Se recomienda habilitar esta opción." + +#. Type: string +#. Description +#: ../mdadm.templates:4001 +msgid "Recipient for email notifications:" +msgstr "Destinatario de las notificaciones por correo:" + +#. Type: string +#. Description +#: ../mdadm.templates:4001 +msgid "" +"Please enter the email address of the user who should get the email " +"notifications for important MD events." +msgstr "" +"Introduzca la dirección de correo electrónico del usuario que debería " +"recibir las notificaciones por correo de eventos relevantes en los " +"dispositivos MD." + +#~ msgid "MD arrays needed for the root file system:" +#~ msgstr "Arrays MD necesarios para el sistema de ficheros raíz:" + +# No se traduce «all» y «none» porque no aparecen en la plantilla para traducir los elementos individuales +#~ msgid "" +#~ "Please enter 'all', 'none', or a space-separated list of devices such as " +#~ "'md0 md1' or 'md/1 md/d0' (the leading '/dev/' can be omitted)." 
+#~ msgstr "" +#~ "Introduzca «all» (todos), «none» (ninguno) o una lista de dispositivos " +#~ "separados por espacios como por ejemplo puede sólo introducir «md0 md1» o " +#~ "«md/1 md/d0» (no tiene que preceder los nombres de dispositivos con «/" +#~ "dev»)." + +#~ msgid "for internal use - only the long description is needed." +#~ msgstr "para uso interno. Sólo se utiliza la descripción larga." + +#~ msgid "" +#~ "If the system's root file system is located on an MD array (RAID), it " +#~ "needs to be started early during the boot sequence. If it is located on a " +#~ "logical volume (LVM), which is on MD, all constituent arrays need to be " +#~ "started." +#~ msgstr "" +#~ "Si el sistema de ficheros raíz de su sistema está en un array MD (RAID) " +#~ "tiene que inicializarse antes durante de la secuencia de arranque. Si " +#~ "está en un volumen lógico (LVM), que está definido sobre un MD, todos los " +#~ "arrays que lo forman tienen que haberse inicializado." + +#~ msgid "" +#~ "If you know exactly which arrays are needed to bring up the root file " +#~ "system, and you want to postpone starting all other arrays to a later " +#~ "point in the boot sequence, enter the arrays to start here. " +#~ "Alternatively, enter 'all' to simply start all available arrays." +#~ msgstr "" +#~ "Introduzca los arrays a iniciar aquí, si sabe con exactitud cuáles son " +#~ "necesarios para arrancar el sistema de ficheros raíz y quiere posponer el " +#~ "arranque de todos los demás arrays a un punto posterior de la secuencia " +#~ "de arranque. También puede introducir «all» (todos) para, sencillamente, " +#~ "iniciar todos los arrays disponibles." + +#~ msgid "" +#~ "If you do not need or want to start any arrays for the root file system, " +#~ "leave the answer blank (or enter 'none'). This may be the case if you are " +#~ "using kernel autostart or do not need any arrays to boot." 
+#~ msgstr "" +#~ "Puede dejar la respuesta en blanco (o introducir «none») si no necesita o " +#~ "desea arrancar los arrays para el sistema de ficheros raíz. Este puede " +#~ "ser su caso si está utilizando el autoarranque del núcleo o no necesita " +#~ "ningún array para el arranque." + +#~ msgid "An error occurred: device node does not exist" +#~ msgstr "Se produjo un error: el nodo de dispositivo no existe" + +#~ msgid "An error occurred: not a block device" +#~ msgstr "Se produjo un error: no es un dispositivo de bloques" + +#~ msgid "An error occurred: not an MD array" +#~ msgstr "Se produjo un error: no es un array MD" + +#~ msgid "An error occurred: array not listed in mdadm.conf file" +#~ msgstr "" +#~ "Se produjo un error: el array no está en la lista definida en el archivo " +#~ "mdadm.conf" + +#~ msgid "Start arrays not listed in mdadm.conf?" +#~ msgstr "¿Desea arrancar los arrays no listados en mdadm.conf?" + +#~ msgid "" +#~ "The specified array (${array}) is not listed in the configuration file " +#~ "(${config}). Therefore, it cannot be started during boot, unless you " +#~ "correct the configuration file and recreate the initial ramdisk." +#~ msgstr "" +#~ "El array que ha especificado (${array}) no está listado en el fichero de " +#~ "configuración ${config}. Este array no podrá iniciarse durante el " +#~ "arranque del sistema a no ser que corrija el fichero de configuración y " +#~ "regenere el disco de ram inicial." + +#~ msgid "" +#~ "This warning is only relevant if you need arrays to be started from the " +#~ "initial ramdisk to be able to boot. If you use kernel autostarting, or do " +#~ "not need any arrays to be started as early as the initial ramdisk is " +#~ "loaded, you can simply continue. Alternatively, choose not to continue " +#~ "and enter 'none' when prompted which arrays to start from the initial " +#~ "ramdisk." 
+#~ msgstr "" +#~ "Este aviso sólo es relevante si necesita que los arrays se inicien en el " +#~ "disco de RAM inicial para poder arrancar el sistema. Si utiliza el " +#~ "autoarranque del núcleo o no necesita que los arrays estén arrancados tan " +#~ "pronto como se cargue el disco de RAM, puede continuar simplemente. " +#~ "También puede decidir no continuar e introducir «none» cuando se le " +#~ "pregunte qué arrays deberían arrancarse del disco de RAM inicial." diff --git a/debian/po/eu.po b/debian/po/eu.po new file mode 100644 index 00000000..259dce2b --- /dev/null +++ b/debian/po/eu.po @@ -0,0 +1,176 @@ +# mdadm debconf templates basque translation +# Copyright (C) YEAR THE PACKAGE'S COPYRIGHT HOLDER +# This file is distributed under the same license as the mdadm package. +# Piarres Beobide , 2008. +# +msgid "" +msgstr "" +"Project-Id-Version: mdadm-debconf\n" +"Report-Msgid-Bugs-To: mdadm@packages.debian.org\n" +"POT-Creation-Date: 2016-07-05 14:12+0200\n" +"PO-Revision-Date: 2008-04-30 11:00+0100\n" +"Last-Translator: Piarres Beobide \n" +"Language-Team: Euskara \n" +"Language: \n" +"MIME-Version: 1.0\n" +"Content-Type: text/plain; charset=UTF-8\n" +"Content-Transfer-Encoding: 8bit\n" + +#. Type: boolean +#. Description +#: ../mdadm.templates:2001 +msgid "Should mdadm run monthly redundancy checks of the MD arrays?" +msgstr "" +"Mdadm-ek hilabetero egin behar al du MD array-en erredundantzia egiaztapena?" + +#. Type: boolean +#. Description +#: ../mdadm.templates:2001 +msgid "" +"If the kernel supports it (versions greater than 2.6.14), mdadm can " +"periodically check the redundancy of MD arrays (RAIDs). This may be a " +"resource-intensive process, depending on the local setup, but it could help " +"prevent rare cases of data loss. Note that this is a read-only check unless " +"errors are found; if errors are found, mdadm will try to correct them, which " +"may result in write access to the media." 
+msgstr "" +"Kernelak onartzen badu (2.6.14 baino bertsio berriagoak), mdadm-ek MD " +"array-en (RAID-en) erredundantzia aldiro egiazta dezake. Hau errekurtso-" +"behar handiko prozesu bat izan daiteke, konfigurazio lokalaren arabera, " +"baina datu galera kasuak saihesten lagundu dezake. Kontutan izan errorerik " +"aurkitzen ez bada irakurketa-soileko egiaztapen bat dela; errorerik " +"aurkituko balitz mdadm konpontzen saiatuko da, honetarako euskarrian idaztea " +"beharrezko izan daitekeelarik." + +#. Type: boolean +#. Description +#: ../mdadm.templates:2001 +msgid "" +"The default, if turned on, is to check on the first Sunday of every month at " +"01:06." +msgstr "" +"Lehenespen bezala gaiturik dago, egiaztapena hilabete bakoitzeko lehenengo " +"igandeko 01:06-etan egingo da." + +#. Type: boolean +#. Description +#: ../mdadm.templates:3001 +msgid "Do you want to start the MD monitoring daemon?" +msgstr "MD monitorizazio deabrua abiarazi nahi al duzu?" + +#. Type: boolean +#. Description +#: ../mdadm.templates:3001 +msgid "" +"The MD (RAID) monitor daemon sends email notifications in response to " +"important MD events (such as a disk failure)." +msgstr "" +"MD (RAID) monitore deabruak eposta bidezko berri-emateak bidaltzen ditu " +"gertaera garrantzitsuetan (disko erroreen antzerakoetan)." + +#. Type: boolean +#. Description +#: ../mdadm.templates:3001 +msgid "Enabling this option is recommended." +msgstr "Aukera hau gaitzea gomendagarria da." + +#. Type: string +#. Description +#: ../mdadm.templates:4001 +msgid "Recipient for email notifications:" +msgstr "Eposta berri-emateen hartzailea:" + +#. Type: string +#. Description +#: ../mdadm.templates:4001 +msgid "" +"Please enter the email address of the user who should get the email " +"notifications for important MD events." +msgstr "" +"Mesedez idatzi MD gertaera garrantzitsuen berri emate mezuak jaso behar " +"dituen erabiltzailearen eposta helbidea." 
+ +#~ msgid "MD arrays needed for the root file system:" +#~ msgstr "Erro fitxategi-sistemarentzat beharrezko MD array-ak:" + +#~ msgid "" +#~ "Please enter 'all', 'none', or a space-separated list of devices such as " +#~ "'md0 md1' or 'md/1 md/d0' (the leading '/dev/' can be omitted)." +#~ msgstr "" +#~ "Mesedez idatzi 'denak', 'batez', edo zuriunez bereziriko gailuen " +#~ "zerrenda, adibidez 'md0 md1' edo 'md/1 md/d0' (hasierako '/dev/' baztertu " +#~ "daiteke)." + +#~ msgid "for internal use - only the long description is needed." +#~ msgstr "barne erabilerako - deskribapen luzea bakarrik behar da." + +#~ msgid "" +#~ "If the system's root file system is located on an MD array (RAID), it " +#~ "needs to be started early during the boot sequence. If it is located on a " +#~ "logical volume (LVM), which is on MD, all constituent arrays need to be " +#~ "started." +#~ msgstr "" +#~ "Sistemaren erro fitxategi-sistema MD array (RAID) batetan kokaturik " +#~ "badago, berau abio sekuentziaren hasieran abiarazi behar da. MD batetan " +#~ "kokaturiko bolumen logiko (LVM) batetan badago osatzen duten array " +#~ "guztiak abiarazi behar dira." + +#~ msgid "" +#~ "If you know exactly which arrays are needed to bring up the root file " +#~ "system, and you want to postpone starting all other arrays to a later " +#~ "point in the boot sequence, enter the arrays to start here. " +#~ "Alternatively, enter 'all' to simply start all available arrays." +#~ msgstr "" +#~ "Erro fitxategi-sistema erabiltzeko beharrezkoak diren arrayak zehazki " +#~ "jakin eta beste array-en abiaraztea abioaren beranduagoko puntu batetara " +#~ "atzeratu nahi baduzu, idatzi abiarazi beharreko array-ak hemen. Bestela " +#~ "idatzi 'denak' array erabilgarri guztiak abiarazteko." + +#~ msgid "" +#~ "If you do not need or want to start any arrays for the root file system, " +#~ "leave the answer blank (or enter 'none'). 
This may be the case if you are " +#~ "using kernel autostart or do not need any arrays to boot." +#~ msgstr "" +#~ "Ez baduzu erro fitxategi sistemarako array-rik abiarazi behar, hutsik " +#~ "utzi ezazu (edo 'batez' idatzi). Hau abioan array-rik behar ez duzulako " +#~ "edo kernel auto-abioa erabiltzen duzulako izan daiteke." + +#~ msgid "An error occurred: device node does not exist" +#~ msgstr "Errore bat gertatu da: gailu nodoa ez dago" + +#~ msgid "An error occurred: not a block device" +#~ msgstr "Errore bat gertatu da: ez da bloke gailu bat" + +#~ msgid "An error occurred: not an MD array" +#~ msgstr "Errore bat gertatu da: ez da MD array bat" + +#~ msgid "An error occurred: array not listed in mdadm.conf file" +#~ msgstr "" +#~ "Errore bat gertatu da: array-a ez dago mdadm.conf fitxategian " +#~ "zerrendaturik" + +#~ msgid "Start arrays not listed in mdadm.conf?" +#~ msgstr "Abiarazi mdadm.conf fitxategian ez dauden array-ak?" + +#~ msgid "" +#~ "The specified array (${array}) is not listed in the configuration file " +#~ "(${config}). Therefore, it cannot be started during boot, unless you " +#~ "correct the configuration file and recreate the initial ramdisk." +#~ msgstr "" +#~ "Zehazturiko (${array}) array-a ez dago (${config}) konfigurazio " +#~ "fitxategian zerrendaturik. Horregatik ezin da abioan abiarazi zuk " +#~ "konfigurazio fitxategia konpondu eta abio ramdiskoa bersortu arte." + +#~ msgid "" +#~ "This warning is only relevant if you need arrays to be started from the " +#~ "initial ramdisk to be able to boot. If you use kernel autostarting, or do " +#~ "not need any arrays to be started as early as the initial ramdisk is " +#~ "loaded, you can simply continue. Alternatively, choose not to continue " +#~ "and enter 'none' when prompted which arrays to start from the initial " +#~ "ramdisk." +#~ msgstr "" +#~ "Abisu hau abiarazi ahal izateko ramdisk-etik array-ak abiaraztea behar " +#~ "baduzu bakarrik da garrantzitsua. 
Kernel auto-abioa erabiltzen baduzu edo " +#~ "ez baduzu ramdisk-etik hasieran array-rik kargatzea behar aurrera " +#~ "jarraitu dezakezu. Bestela ez jarraitzea hautatu eta 'batez' idatzi " +#~ "hasierako ramdisk-etik kargatu beharreko array-ez galdetzean." diff --git a/debian/po/fi.po b/debian/po/fi.po new file mode 100644 index 00000000..759d3b37 --- /dev/null +++ b/debian/po/fi.po @@ -0,0 +1,173 @@ +msgid "" +msgstr "" +"Project-Id-Version: mdadm\n" +"Report-Msgid-Bugs-To: mdadm@packages.debian.org\n" +"POT-Creation-Date: 2016-07-05 14:12+0200\n" +"PO-Revision-Date: 2008-02-14 11:24+0200\n" +"Last-Translator: Esko Arajärvi \n" +"Language-Team: Finnish \n" +"Language: fi\n" +"MIME-Version: 1.0\n" +"Content-Type: text/plain; charset=UTF-8\n" +"Content-Transfer-Encoding: 8bit\n" +"X-Poedit-Language: Finnish\n" +"X-Poedit-Country: FINLAND\n" + +#. Type: boolean +#. Description +#: ../mdadm.templates:2001 +msgid "Should mdadm run monthly redundancy checks of the MD arrays?" +msgstr "Tulisiko mdadm:n tarkistaa kuukausittain MD-pakkojen eheys?" + +#. Type: boolean +#. Description +#: ../mdadm.templates:2001 +msgid "" +"If the kernel supports it (versions greater than 2.6.14), mdadm can " +"periodically check the redundancy of MD arrays (RAIDs). This may be a " +"resource-intensive process, depending on the local setup, but it could help " +"prevent rare cases of data loss. Note that this is a read-only check unless " +"errors are found; if errors are found, mdadm will try to correct them, which " +"may result in write access to the media." +msgstr "" +"Ohjelma mdadm voi säännöllisesti tarkistaa MD-pakkojen (RAIDien) tietojen " +"monistuksen, jos ydin tukee tätä (versiosta 2.6.14 eteenpäin). Tämä prosessi " +"voi paikallisesta kokoonpanosta riippuen kuluttaa paljon resursseja, mutta " +"saattaa ehkäistä tietojen menetyksiä tietyissä harvinaisissa tapauksissa. " +"Tarkistus vaatii vain tietojen lukemista, jos virheitä ei löyty. 
Jos " +"virheitä löytyy, mdadm yrittää korjata ne, jolloin levylle saatetaan myös " +"kirjoittaa." + +#. Type: boolean +#. Description +#: ../mdadm.templates:2001 +msgid "" +"The default, if turned on, is to check on the first Sunday of every month at " +"01:06." +msgstr "" +"Oletuksena, jos tarkistus on käytössä, se tehdään kuukauden ensimmäisenä " +"sunnuntaina kello 01.06." + +#. Type: boolean +#. Description +#: ../mdadm.templates:3001 +msgid "Do you want to start the MD monitoring daemon?" +msgstr "Haluatko käynnistää MD-seurannan?" + +#. Type: boolean +#. Description +#: ../mdadm.templates:3001 +msgid "" +"The MD (RAID) monitor daemon sends email notifications in response to " +"important MD events (such as a disk failure)." +msgstr "" +"MD-pakkoja (RAIDeja) seuraava taustaohjelma lähettää tietoja sähköpostiin " +"tärkeiden MD-tapahtumien (kuten levyrikon) sattuessa." + +#. Type: boolean +#. Description +#: ../mdadm.templates:3001 +msgid "Enabling this option is recommended." +msgstr "Tämän valitseminen on suositeltavaa." + +#. Type: string +#. Description +#: ../mdadm.templates:4001 +msgid "Recipient for email notifications:" +msgstr "Sähköpostiviestien vastaanottaja:" + +#. Type: string +#. Description +#: ../mdadm.templates:4001 +msgid "" +"Please enter the email address of the user who should get the email " +"notifications for important MD events." +msgstr "" +"Anna sähköpostiosoite, johon sähköpostitiedotteet tärkeistä MD-tapahtumista " +"lähetetään." + +#~ msgid "MD arrays needed for the root file system:" +#~ msgstr "Juuritiedostojärjestelmän tarvitsemat MD-pakat:" + +#~ msgid "" +#~ "Please enter 'all', 'none', or a space-separated list of devices such as " +#~ "'md0 md1' or 'md/1 md/d0' (the leading '/dev/' can be omitted)." +#~ msgstr "" +#~ "Syötä ”all” (kaikki), ”none” (ei mitään) tai välilyönnein eroteltu lista " +#~ "laitteista, esimerkiksi ”md0 md1” tai ”md/1 md/d0” (edeltävä /dev/ " +#~ "voidaan jättää pois)." 
+ +#~ msgid "for internal use - only the long description is needed." +#~ msgstr "vain sisäiseen käyttöön - vain pitkä kuvaus on tarpeellinen." + +#~ msgid "" +#~ "If the system's root file system is located on an MD array (RAID), it " +#~ "needs to be started early during the boot sequence. If it is located on a " +#~ "logical volume (LVM), which is on MD, all constituent arrays need to be " +#~ "started." +#~ msgstr "" +#~ "Jos järjestelmän juuritiedostojärjestelmä sijaitsee MD-levypakassa " +#~ "(RAID), pakka tulee käynnistää aikaisessa vaiheessa käynnistettäessä " +#~ "järjestelmää. Jos se sijaitsee loogisella taltiolla (LVM), joka on MD-" +#~ "pakassa, kaikki taltioon liittyvät pakat tulee käynnistää." + +#~ msgid "" +#~ "If you know exactly which arrays are needed to bring up the root file " +#~ "system, and you want to postpone starting all other arrays to a later " +#~ "point in the boot sequence, enter the arrays to start here. " +#~ "Alternatively, enter 'all' to simply start all available arrays." +#~ msgstr "" +#~ "Jos tiedät tarkalleen mitä pakkoja tarvitaan juuritiedostojärjestelmän " +#~ "käynnistämiseen ja haluat viivästyttää muiden pakkojen käynnistystä, " +#~ "syötä käynnistettävät pakat tähän. Vaihtoehtoisesti voit käynnistää " +#~ "kaikki pakat syöttämällä ”all”." + +#~ msgid "" +#~ "If you do not need or want to start any arrays for the root file system, " +#~ "leave the answer blank (or enter 'none'). This may be the case if you are " +#~ "using kernel autostart or do not need any arrays to boot." +#~ msgstr "" +#~ "Jos mitään pakkoja ei tarvitse käynnistää juuritiedostojärjestelmän " +#~ "käyttämiseksi, jätä kenttä tyhjäksi (tai syötä ”none”). Tämä voi olla " +#~ "tilanne, jos käytät ytimen autokäynnistystä tai et tarvitse mitään " +#~ "pakkoja käynnistykseen." 
+ +#~ msgid "An error occurred: device node does not exist" +#~ msgstr "Tapahtui virhe: laitetiedostoa ei ole olemassa" + +#~ msgid "An error occurred: not a block device" +#~ msgstr "Tapahtui virhe: ei lohkolaite" + +#~ msgid "An error occurred: not an MD array" +#~ msgstr "Tapahtui virhe: ei MD-pakka" + +#~ msgid "An error occurred: array not listed in mdadm.conf file" +#~ msgstr "Tapahtui virhe: pakkaa ei ole listattu tiedostossa mdadm.conf" + +#~ msgid "Start arrays not listed in mdadm.conf?" +#~ msgstr "" +#~ "Käynnistetäänkö pakat, joita ei ole listattu tiedostossa mdadm.conf?" + +#~ msgid "" +#~ "The specified array (${array}) is not listed in the configuration file " +#~ "(${config}). Therefore, it cannot be started during boot, unless you " +#~ "correct the configuration file and recreate the initial ramdisk." +#~ msgstr "" +#~ "Annettua pakkaa (${array}) ei ole listattu asetustiedostossa (${config}). " +#~ "Niinpä sitä ei voida käynnistää käynnistettäessä järjestelmä, ellei " +#~ "asetustiedostoa korjata ja käynnistysmuistilevyä (initrd) luoda uudelleen." + +#~ msgid "" +#~ "This warning is only relevant if you need arrays to be started from the " +#~ "initial ramdisk to be able to boot. If you use kernel autostarting, or do " +#~ "not need any arrays to be started as early as the initial ramdisk is " +#~ "loaded, you can simply continue. Alternatively, choose not to continue " +#~ "and enter 'none' when prompted which arrays to start from the initial " +#~ "ramdisk." +#~ msgstr "" +#~ "Tämä varoitus on aiheellinen vain, jos järjestelmän käynnistäminen vaatii " +#~ "pakkojen käynnistämistä käynnistysmuistilevyltä. Jos ytimen " +#~ "autokäynnistys on käytössä tai pakkoja ei tarvita siinä vaiheessa, kun " +#~ "käynnistysmuistilevy ladataan, voit jatkaa. Vaihtoehtoisesti voit olla " +#~ "jatkamatta ja syöttää ”none” kysyttäessä käynnistysmuistilevyltä " +#~ "käynnistettäviä pakkoja." 
diff --git a/debian/po/fr.po b/debian/po/fr.po new file mode 100644 index 00000000..df033b3c --- /dev/null +++ b/debian/po/fr.po @@ -0,0 +1,186 @@ +# Translation of mdadm debconf templates to French +# Copyright (C) 2008 Florentin Duneau +# This file is distributed under the same license as the mdadm package. +# +# +# Éric Madesclair , 2005, 2006. +# Jean-Luc Coulon (f5ibh) , 2006. +# Florentin Duneau , 2006, 2007, 2008. +msgid "" +msgstr "" +"Project-Id-Version: mdadm\n" +"Report-Msgid-Bugs-To: mdadm@packages.debian.org\n" +"POT-Creation-Date: 2016-07-05 14:12+0200\n" +"PO-Revision-Date: 2008-02-15 20:42+0100\n" +"Last-Translator: Florentin Duneau \n" +"Language-Team: French \n" +"Language: fr\n" +"MIME-Version: 1.0\n" +"Content-Type: text/plain; charset=UTF-8\n" +"Content-Transfer-Encoding: 8bit\n" +"Plural-Forms: nplurals=2; plural=(n > 1);\n" +"X-Generator: KBabel 1.11.4\n" + +#. Type: boolean +#. Description +#: ../mdadm.templates:2001 +msgid "Should mdadm run monthly redundancy checks of the MD arrays?" +msgstr "Faut-il vérifier chaque mois la redondance des ensembles RAID ?" + +#. Type: boolean +#. Description +#: ../mdadm.templates:2001 +msgid "" +"If the kernel supports it (versions greater than 2.6.14), mdadm can " +"periodically check the redundancy of MD arrays (RAIDs). This may be a " +"resource-intensive process, depending on the local setup, but it could help " +"prevent rare cases of data loss. Note that this is a read-only check unless " +"errors are found; if errors are found, mdadm will try to correct them, which " +"may result in write access to the media." +msgstr "" +"Si le noyau le gère (à partir de la version 2.6.14), mdadm peut vérifier " +"périodiquement la redondance des ensembles RAID. Cette action peut demander " +"beaucoup de ressources selon la configuration, mais cela aide à prévenir les " +"rares cas de pertes de données. Notez que ce test est réalisé en lecture " +"seule à moins que des erreurs ne soient rencontrées. 
Si des erreurs sont " +"détectées, mdadm essayera de les corriger, ce qui entraînera des écritures " +"sur le média." + +#. Type: boolean +#. Description +#: ../mdadm.templates:2001 +msgid "" +"The default, if turned on, is to check on the first Sunday of every month at " +"01:06." +msgstr "" +"Par défaut, la vérification s'effectuera tous les premiers dimanche du mois " +"à 01 h 06." + +#. Type: boolean +#. Description +#: ../mdadm.templates:3001 +msgid "Do you want to start the MD monitoring daemon?" +msgstr "Faut-il démarrer le démon de surveillance MD ?" + +#. Type: boolean +#. Description +#: ../mdadm.templates:3001 +msgid "" +"The MD (RAID) monitor daemon sends email notifications in response to " +"important MD events (such as a disk failure)." +msgstr "" +"Le démon de surveillance MD envoie des notifications par courriel lors " +"d'importants événements MD (comme une panne de disque dur)." + +#. Type: boolean +#. Description +#: ../mdadm.templates:3001 +msgid "Enabling this option is recommended." +msgstr "Il est recommandé d'activer cette option." + +#. Type: string +#. Description +#: ../mdadm.templates:4001 +msgid "Recipient for email notifications:" +msgstr "Destinataire des notifications par courriel :" + +#. Type: string +#. Description +#: ../mdadm.templates:4001 +msgid "" +"Please enter the email address of the user who should get the email " +"notifications for important MD events." +msgstr "" +"Veuillez indiquer l'adresse électronique de l'utilisateur qui doit recevoir " +"les notifications lors d'importants événements MD." + +#~ msgid "MD arrays needed for the root file system:" +#~ msgstr "Ensembles MD requis par le système de fichiers racine :" + +#~ msgid "" +#~ "Please enter 'all', 'none', or a space-separated list of devices such as " +#~ "'md0 md1' or 'md/1 md/d0' (the leading '/dev/' can be omitted)." 
+#~ msgstr "" +#~ "Veuillez indiquer « all », « none » ou une liste de périphériques, " +#~ "séparés par des espaces, par exemple, « md0 md1 » ou « md/1 md/d0 » (vous " +#~ "pouvez omettre « /dev/ »)." + +#~ msgid "for internal use - only the long description is needed." +#~ msgstr "" +#~ "Pour une utilisation interne - seule la description longue est nécessaire" + +#~ msgid "" +#~ "If the system's root file system is located on an MD array (RAID), it " +#~ "needs to be started early during the boot sequence. If it is located on a " +#~ "logical volume (LVM), which is on MD, all constituent arrays need to be " +#~ "started." +#~ msgstr "" +#~ "Si le système de fichiers racine se trouve sur un ensemble MD (RAID), il " +#~ "doit être lancé au début de la procédure de démarrage. Si le système de " +#~ "fichiers racine se trouve sur un volume logique (« LVM »), qui se trouve " +#~ "aussi sur un volume MD, tous les composants de l'ensemble doivent être " +#~ "démarrés." + +#~ msgid "" +#~ "If you know exactly which arrays are needed to bring up the root file " +#~ "system, and you want to postpone starting all other arrays to a later " +#~ "point in the boot sequence, enter the arrays to start here. " +#~ "Alternatively, enter 'all' to simply start all available arrays." +#~ msgstr "" +#~ "Si vous savez exactement quels sont les ensembles RAID nécessaires au " +#~ "démarrage du système de fichiers racine et si vous souhaitez différer le " +#~ "démarrage de tous les autres ensembles, veuillez les indiquer ici. Vous " +#~ "pouvez aussi indiquer « all » pour démarrer tous les ensembles existants." + +#~ msgid "" +#~ "If you do not need or want to start any arrays for the root file system, " +#~ "leave the answer blank (or enter 'none'). This may be the case if you are " +#~ "using kernel autostart or do not need any arrays to boot." 
+#~ msgstr "" +#~ "Si vous n'avez pas besoin ou ne souhaitez pas démarrer d'ensemble RAID " +#~ "pour le système de fichiers racine, veuillez laissez l'entrée vide (ou " +#~ "entrez « none »). Ceci peut être le cas si vous utilisez l'option de " +#~ "démarrage automatique (« autostart ») du noyau ou si vous n'avez besoin " +#~ "d'aucun ensemble pour démarrer." + +#~ msgid "An error occurred: device node does not exist" +#~ msgstr "Erreur : périphérique inconnu" + +#~ msgid "An error occurred: not a block device" +#~ msgstr "Erreur : ce n'est pas un périphérique en mode bloc" + +#~ msgid "An error occurred: not an MD array" +#~ msgstr "Erreur : ce n'est pas un ensemble RAID" + +#~ msgid "An error occurred: array not listed in mdadm.conf file" +#~ msgstr "Erreur : ensemble non mentionné dans le fichier mdadm.conf" + +#~ msgid "Start arrays not listed in mdadm.conf?" +#~ msgstr "" +#~ "Faut-il démarrer les ensembles RAID non mentionnés dans mdadm.conf ?" + +#~ msgid "" +#~ "The specified array (${array}) is not listed in the configuration file " +#~ "(${config}). Therefore, it cannot be started during boot, unless you " +#~ "correct the configuration file and recreate the initial ramdisk." +#~ msgstr "" +#~ "L'ensemble (${array}) que vous avez spécifié n'est pas mentionné dans le " +#~ "fichier de configuration ${config}. Il ne sera donc pas démarré à moins " +#~ "que vous corrigiez le fichier de configuration et que vous génériez de " +#~ "nouveau le disque mémoire initial (« ramdisk »)." + +#~ msgid "" +#~ "This warning is only relevant if you need arrays to be started from the " +#~ "initial ramdisk to be able to boot. If you use kernel autostarting, or do " +#~ "not need any arrays to be started as early as the initial ramdisk is " +#~ "loaded, you can simply continue. Alternatively, choose not to continue " +#~ "and enter 'none' when prompted which arrays to start from the initial " +#~ "ramdisk." 
+#~ msgstr "" +#~ "Cet avertissement n'a de signification que si des ensembles RAID doivent " +#~ "être lancés à partir du disque mémoire initial afin de pouvoir démarrer " +#~ "le système. Si vous utilisez le démarrage automatique par le noyau, ou si " +#~ "vous n'avez pas besoin de lancer d'ensemble RAID depuis le disque mémoire " +#~ "initial, vous pouvez simplement poursuivre. Vous pouvez aussi choisir de " +#~ "ne pas poursuivre et entrer « none » lorsqu'il vous sera demandé le nom " +#~ "des ensembles RAID à démarrer à partir du disque mémoire initial." diff --git a/debian/po/gl.po b/debian/po/gl.po new file mode 100644 index 00000000..85e76e3c --- /dev/null +++ b/debian/po/gl.po @@ -0,0 +1,177 @@ +# Galician translation of mdadm's debconf templates +# This file is distributed under the same license as the mdadm package. +# Jacobo Tarrio , 2007, 2008. +# +msgid "" +msgstr "" +"Project-Id-Version: mdadm\n" +"Report-Msgid-Bugs-To: mdadm@packages.debian.org\n" +"POT-Creation-Date: 2016-07-05 14:12+0200\n" +"PO-Revision-Date: 2008-02-06 23:45+0000\n" +"Last-Translator: Jacobo Tarrio \n" +"Language-Team: Galician \n" +"Language: gl\n" +"MIME-Version: 1.0\n" +"Content-Type: text/plain; charset=UTF-8\n" +"Content-Transfer-Encoding: 8bit\n" + +#. Type: boolean +#. Description +#: ../mdadm.templates:2001 +msgid "Should mdadm run monthly redundancy checks of the MD arrays?" +msgstr "" +"¿Debería mdadm facer comprobacións mensuais de redundancia dos arrays MD?" + +#. Type: boolean +#. Description +#: ../mdadm.templates:2001 +msgid "" +"If the kernel supports it (versions greater than 2.6.14), mdadm can " +"periodically check the redundancy of MD arrays (RAIDs). This may be a " +"resource-intensive process, depending on the local setup, but it could help " +"prevent rare cases of data loss. Note that this is a read-only check unless " +"errors are found; if errors are found, mdadm will try to correct them, which " +"may result in write access to the media." 
+msgstr "" +"Se o núcleo ten soporte para iso (en versións superiores á 2.6.14), mdadm " +"pode facer comprobacións periódicas de redundancia dos arrays MD (RAIDs). " +"Este pode ser un proceso intensivo en recursos, dependendo da configuración " +"local, pero pode axudar a evitar algúns casos raros de perdas de datos. Teña " +"en conta que esta é unha comprobación de só lectura a menos que se atopen " +"erros; se se atopan erros, mdadm ha tratar de os arranxar, o que pode " +"producir accesos de escritura aos soportes." + +#. Type: boolean +#. Description +#: ../mdadm.templates:2001 +msgid "" +"The default, if turned on, is to check on the first Sunday of every month at " +"01:06." +msgstr "" +"A opción por defecto, se se activa, é facer as comprobacións o primeiro " +"domingo de cada mes ás 01:06." + +#. Type: boolean +#. Description +#: ../mdadm.templates:3001 +msgid "Do you want to start the MD monitoring daemon?" +msgstr "¿Quere iniciar o servizo de monitorización de MD?" + +#. Type: boolean +#. Description +#: ../mdadm.templates:3001 +msgid "" +"The MD (RAID) monitor daemon sends email notifications in response to " +"important MD events (such as a disk failure)." +msgstr "" +"O servizo de monitorización de MD (RAID) envía avisos por email en resposta " +"a eventos importantes de MD (coma fallos nos discos)." + +#. Type: boolean +#. Description +#: ../mdadm.templates:3001 +msgid "Enabling this option is recommended." +msgstr "Recoméndase activar esta opción." + +#. Type: string +#. Description +#: ../mdadm.templates:4001 +msgid "Recipient for email notifications:" +msgstr "Destinatario para os avisos por email:" + +#. Type: string +#. Description +#: ../mdadm.templates:4001 +msgid "" +"Please enter the email address of the user who should get the email " +"notifications for important MD events." +msgstr "" +"Introduza o enderezo de email do usuario que debe recibir os avisos por " +"email de eventos importantes de MD." 
+ +#~ msgid "MD arrays needed for the root file system:" +#~ msgstr "Arrays MD necesarios para o sistema de ficheiros raíz:" + +#~ msgid "" +#~ "Please enter 'all', 'none', or a space-separated list of devices such as " +#~ "'md0 md1' or 'md/1 md/d0' (the leading '/dev/' can be omitted)." +#~ msgstr "" +#~ "Introduza \"all\" (todos), \"none\" (ningún) ou unha lista de " +#~ "dispositivos separados por espazos, tales coma \"md0 md1\" ou \"md/1 " +#~ "md/d0\" (pódese omitir o \"/dev/\" do principio)." + +#~ msgid "for internal use - only the long description is needed." +#~ msgstr "para uso interno - só se precisa da descrición longa." + +#~ msgid "" +#~ "If the system's root file system is located on an MD array (RAID), it " +#~ "needs to be started early during the boot sequence. If it is located on a " +#~ "logical volume (LVM), which is on MD, all constituent arrays need to be " +#~ "started." +#~ msgstr "" +#~ "Se o sistema de ficheiros raíz do sistema está ubicado nun array MD " +#~ "(RAID), hai que o iniciar no principio da secuencia de inicio. Se está " +#~ "ubicado nun volume lóxico (LVM) que está nun MD, hai que iniciar os " +#~ "arrays constituíntes." + +#~ msgid "" +#~ "If you know exactly which arrays are needed to bring up the root file " +#~ "system, and you want to postpone starting all other arrays to a later " +#~ "point in the boot sequence, enter the arrays to start here. " +#~ "Alternatively, enter 'all' to simply start all available arrays." +#~ msgstr "" +#~ "Se sabe exactamente que arrays son necesarios para erguer o sistema de " +#~ "ficheiros raíz, e se quere pospor o inicio dos demáis arrays ata un punto " +#~ "posterior da secuencia de inicio, introduza aquí os arrays a iniciar. " +#~ "Alternativamente, introduza \"all\" para iniciar tódolos arrays " +#~ "dispoñibles." + +#~ msgid "" +#~ "If you do not need or want to start any arrays for the root file system, " +#~ "leave the answer blank (or enter 'none'). 
This may be the case if you are " +#~ "using kernel autostart or do not need any arrays to boot." +#~ msgstr "" +#~ "Se non quere ou precisa de iniciar ningún array para o sistema de " +#~ "ficheiros raíz, deixe a resposta en branco (ou introduza \"none\"). Este " +#~ "pode ser o caso se está a empregar o autoinicio do núcleo ou non precisa " +#~ "de ningún array para o inicio." + +#~ msgid "An error occurred: device node does not exist" +#~ msgstr "Houbo un erro: o nodo do dispositivo non existe" + +#~ msgid "An error occurred: not a block device" +#~ msgstr "Houbo un erro: non é un dispositivo de bloques" + +#~ msgid "An error occurred: not an MD array" +#~ msgstr "Houbo un erro: non é un array MD" + +#~ msgid "An error occurred: array not listed in mdadm.conf file" +#~ msgstr "Houbo un erro: o array non figura no ficheiro mdadm.conf" + +#~ msgid "Start arrays not listed in mdadm.conf?" +#~ msgstr "¿Iniciar os arrays que non figuran en mdadm.conf?" + +#~ msgid "" +#~ "The specified array (${array}) is not listed in the configuration file " +#~ "(${config}). Therefore, it cannot be started during boot, unless you " +#~ "correct the configuration file and recreate the initial ramdisk." +#~ msgstr "" +#~ "O array indicado (${array}) non figura no ficheiro de configuración " +#~ "(${config}). Polo tanto, non se pode arrincar no inicio do sistema, a " +#~ "menos que corrixa o ficheiro de configuración e volva crear o disco RAM " +#~ "inicial." + +#~ msgid "" +#~ "This warning is only relevant if you need arrays to be started from the " +#~ "initial ramdisk to be able to boot. If you use kernel autostarting, or do " +#~ "not need any arrays to be started as early as the initial ramdisk is " +#~ "loaded, you can simply continue. Alternatively, choose not to continue " +#~ "and enter 'none' when prompted which arrays to start from the initial " +#~ "ramdisk." 
+#~ msgstr "" +#~ "Este aviso só é relevante se precisa de iniciar arrays desde o disco RAM " +#~ "inicial para poder iniciar o sistema. Se emprega autoinicio do núcleo ou " +#~ "non precisa de iniciar arrays tan pronto como se cargue o disco RAM " +#~ "inicial, pode continuar. De xeito alternativo, escolla non continuar e " +#~ "introduza \"none\" cando se lle pregunte que arrays quere iniciar do " +#~ "disco RAM inicial." diff --git a/debian/po/it.po b/debian/po/it.po new file mode 100644 index 00000000..62d6cf0f --- /dev/null +++ b/debian/po/it.po @@ -0,0 +1,177 @@ +# Italian (it) translation of debconf templates for mdadm +# Copyright (C) 2008 Software in the Public Interest +# This file is distributed under the same license as the mdadm package. +# Luca Monducci , 2008. +# +msgid "" +msgstr "" +"Project-Id-Version: mdadm italian debconf\n" +"Report-Msgid-Bugs-To: mdadm@packages.debian.org\n" +"POT-Creation-Date: 2016-07-05 14:12+0200\n" +"PO-Revision-Date: 2008-11-19 11:02+0100\n" +"Last-Translator: Luca Monducci \n" +"Language-Team: Italian \n" +"Language: it\n" +"MIME-Version: 1.0\n" +"Content-Type: text/plain; charset=UTF-8\n" +"Content-Transfer-Encoding: 8bit\n" + +#. Type: boolean +#. Description +#: ../mdadm.templates:2001 +msgid "Should mdadm run monthly redundancy checks of the MD arrays?" +msgstr "Far eseguire a mdadm i controlli mensili di ridondanza sugli array MD?" + +#. Type: boolean +#. Description +#: ../mdadm.templates:2001 +msgid "" +"If the kernel supports it (versions greater than 2.6.14), mdadm can " +"periodically check the redundancy of MD arrays (RAIDs). This may be a " +"resource-intensive process, depending on the local setup, but it could help " +"prevent rare cases of data loss. Note that this is a read-only check unless " +"errors are found; if errors are found, mdadm will try to correct them, which " +"may result in write access to the media." 
+msgstr "" +"Se il kernel lo supporta (tutte le versioni successive la 2.6.14), mdadm può " +"effettuare delle verifiche periodiche sulla ridondanza degli array MD " +"(RAID). Questo è un processo che potrebbe richiedere molte risorse, in base " +"alle impostazioni locali, ma può prevenire i rari casi di perdita di dati. " +"Notare che questa verifica è di sola-lettura tranne quando riscontra degli " +"errori; quando ci sono errori, mdadm prova a correggerli e potrebbe accedere " +"in scrittura al supporto." + +#. Type: boolean +#. Description +#: ../mdadm.templates:2001 +msgid "" +"The default, if turned on, is to check on the first Sunday of every month at " +"01:06." +msgstr "" +"Se attivo, la configurazione predefinita prevede che il controllo sia " +"eseguito la prima domenica di ogni mese alle 01.06." + +#. Type: boolean +#. Description +#: ../mdadm.templates:3001 +msgid "Do you want to start the MD monitoring daemon?" +msgstr "Avviare il demone di monitoraggio MD?" + +#. Type: boolean +#. Description +#: ../mdadm.templates:3001 +msgid "" +"The MD (RAID) monitor daemon sends email notifications in response to " +"important MD events (such as a disk failure)." +msgstr "" +"Il demone di monitoraggio MD (RAID) invia delle notifiche via email quando " +"si verificano eventi importanti (come la rottura di un disco)." + +#. Type: boolean +#. Description +#: ../mdadm.templates:3001 +msgid "Enabling this option is recommended." +msgstr "Si raccomanda l'attivazione di questa funzione." + +#. Type: string +#. Description +#: ../mdadm.templates:4001 +msgid "Recipient for email notifications:" +msgstr "Destinatario delle email di notifica:" + +#. Type: string +#. Description +#: ../mdadm.templates:4001 +msgid "" +"Please enter the email address of the user who should get the email " +"notifications for important MD events." +msgstr "" +"Inserire l'indirizzo email dell'utente che deve ricevere le notifiche di " +"eventi importanti legati al MD." 
+ +#~ msgid "MD arrays needed for the root file system:" +#~ msgstr "Array MD necessari per il file system di root:" + +#~ msgid "" +#~ "Please enter 'all', 'none', or a space-separated list of devices such as " +#~ "'md0 md1' or 'md/1 md/d0' (the leading '/dev/' can be omitted)." +#~ msgstr "" +#~ "Inserire \"all\", \"none\" oppure un elenco dei device separati da uno " +#~ "spazio, per esempio \"md0 md1\" o \"md/1 md/d0\" (il \"/dev/\" iniziale " +#~ "può essere omesso)." + +#~ msgid "for internal use - only the long description is needed." +#~ msgstr "uso interno - è necessaria solo la descrizione lunga." + +#~ msgid "" +#~ "If the system's root file system is located on an MD array (RAID), it " +#~ "needs to be started early during the boot sequence. If it is located on a " +#~ "logical volume (LVM), which is on MD, all constituent arrays need to be " +#~ "started." +#~ msgstr "" +#~ "Se il file system di root è su un array MD (RAID), è necessario attivare " +#~ "tale array all'inizio della sequenza d'avvio. Se è su un volume logico " +#~ "(LVM), il quale è su un MD, è necessario attivare tutti gli array che " +#~ "costituiscono il volume." + +#~ msgid "" +#~ "If you know exactly which arrays are needed to bring up the root file " +#~ "system, and you want to postpone starting all other arrays to a later " +#~ "point in the boot sequence, enter the arrays to start here. " +#~ "Alternatively, enter 'all' to simply start all available arrays." +#~ msgstr "" +#~ "Se si conoscono esattamente quali sono gli array da attivare per il file " +#~ "system di root e si vuole rimandare l'attivazione di tutti gli altri " +#~ "array a una fase successiva della sequenza d'avvio, inserire adesso gli " +#~ "array da attivare. In alternativa, inserire \"all\" per attivare tutti " +#~ "gli array disponibili." + +#~ msgid "" +#~ "If you do not need or want to start any arrays for the root file system, " +#~ "leave the answer blank (or enter 'none'). 
This may be the case if you are " +#~ "using kernel autostart or do not need any arrays to boot." +#~ msgstr "" +#~ "Se non si ha bisogno o non si vuole attivare nessun array per il file " +#~ "system di root, lasciare la risposta in bianco (oppure inserire \"none" +#~ "\"). Questo potrebbe essere il caso se si utilizza l'attivazione " +#~ "automatica da kernel oppure se non si ha bisogno di alcun array per " +#~ "l'avvio." + +#~ msgid "An error occurred: device node does not exist" +#~ msgstr "Errore: il nodo del device non esiste" + +#~ msgid "An error occurred: not a block device" +#~ msgstr "Errore: non è un device a blocchi" + +#~ msgid "An error occurred: not an MD array" +#~ msgstr "Errore: non è un array MD" + +#~ msgid "An error occurred: array not listed in mdadm.conf file" +#~ msgstr "Errore: array non elencato nel file mdadm.conf" + +#~ msgid "Start arrays not listed in mdadm.conf?" +#~ msgstr "Avviare gli array non elencati in mdadm.conf?" + +#~ msgid "" +#~ "The specified array (${array}) is not listed in the configuration file " +#~ "(${config}). Therefore, it cannot be started during boot, unless you " +#~ "correct the configuration file and recreate the initial ramdisk." +#~ msgstr "" +#~ "L'array specificato (${array}) non è presente nel file di configurazione " +#~ "(${config}): quindi non può essere attivato durante l'avvio senza " +#~ "correggere il file di configurazione e ricreare il ramdisk iniziale." + +#~ msgid "" +#~ "This warning is only relevant if you need arrays to be started from the " +#~ "initial ramdisk to be able to boot. If you use kernel autostarting, or do " +#~ "not need any arrays to be started as early as the initial ramdisk is " +#~ "loaded, you can simply continue. Alternatively, choose not to continue " +#~ "and enter 'none' when prompted which arrays to start from the initial " +#~ "ramdisk." 
+#~ msgstr "" +#~ "Questo avviso è pertinente solo se è necessario attivare gli array dal " +#~ "ramdisk iniziale per permettere l'avvio. Con l'avvio automatico da kernel " +#~ "o se non è necessario attivare gli array così presto come al caricamento " +#~ "del ramdisk iniziale, si può proseguire. In alternativa, scegliere di non " +#~ "continuare e inserire \"none\" quando viene chiesto quali array attivare " +#~ "dal ramdisk iniziale." diff --git a/debian/po/ja.po b/debian/po/ja.po new file mode 100644 index 00000000..7f9c4fb6 --- /dev/null +++ b/debian/po/ja.po @@ -0,0 +1,233 @@ +# +# Translators, if you are not familiar with the PO format, gettext +# documentation is worth reading, especially sections dedicated to +# this format, e.g. by running: +# info -n '(gettext)PO Files' +# info -n '(gettext)Header Entry' +# +# Some information specific to po-debconf are available at +# /usr/share/doc/po-debconf/README-trans +# or http://www.debian.org/intl/l10n/po-debconf/README-trans +# +# Developers do not need to manually edit POT or PO files. +# +# +msgid "" +msgstr "" +"Project-Id-Version: mdadm 2.6.3+200709292116+4450e59-4\n" +"Report-Msgid-Bugs-To: mdadm@packages.debian.org\n" +"POT-Creation-Date: 2016-07-05 14:12+0200\n" +"PO-Revision-Date: 2008-02-07 05:52+0900\n" +"Last-Translator: Hideki Yamane (Debian-JP) \n" +"Language-Team: Japanese \n" +"Language: ja\n" +"MIME-Version: 1.0\n" +"Content-Type: text/plain; charset=UTF-8\n" +"Content-Transfer-Encoding: 8bit\n" + +#. Type: boolean +#. Description +#: ../mdadm.templates:2001 +msgid "Should mdadm run monthly redundancy checks of the MD arrays?" +msgstr "mdadm ã¯ã€æ¯Žæœˆ MD アレイã®å†—長性ãƒã‚§ãƒƒã‚¯ã‚’è¡Œã„ã¾ã™ã‹?" + +#. Type: boolean +#. Description +#: ../mdadm.templates:2001 +msgid "" +"If the kernel supports it (versions greater than 2.6.14), mdadm can " +"periodically check the redundancy of MD arrays (RAIDs). 
This may be a " +"resource-intensive process, depending on the local setup, but it could help " +"prevent rare cases of data loss. Note that this is a read-only check unless " +"errors are found; if errors are found, mdadm will try to correct them, which " +"may result in write access to the media." +msgstr "" +"カーãƒãƒ«ãŒã‚µãƒãƒ¼ãƒˆã—ã¦ã„ã‚‹å ´åˆ (ãƒãƒ¼ã‚¸ãƒ§ãƒ³ 2.6.14 以é™)ã€mdadm ã¯å®šæœŸçš„ã« MD " +"アレイ (RAID) ã®å†—長性ãƒã‚§ãƒƒã‚¯ã‚’ã™ã‚‹ã“ã¨ãŒå¯èƒ½ã§ã™ã€‚ã“ã‚Œã¯ã€è¨­å®šã«ä¾å­˜ã—ã¾ã™" +"ãŒãƒªã‚½ãƒ¼ã‚¹ã‚’集中的ã«ä½¿ç”¨ã™ã‚‹å‹•ä½œã§ã™ã€‚ã—ã‹ã—ã€ç¨€ãªãƒ‡ãƒ¼ã‚¿æ¶ˆå¤±ã‚’ã‚らã‹ã˜ã‚é¿ã‘" +"ã‚‹ã®ã«å½¹ç«‹ã¤ã§ã—ょã†ã€‚ã“ã‚Œã¯ã€ã‚¨ãƒ©ãƒ¼ãŒè¦‹ã¤ã‹ã‚‰ãªã„é™ã‚Šã¯èª­ã¿è¾¼ã¿ãƒã‚§ãƒƒã‚¯ã®ã¿" +"ã§ã‚ã‚‹ã®ã«æ³¨æ„ã—ã¦ãã ã•ã„。エラーãŒç™ºè¦‹ã•ã‚ŒãŸå ´åˆã€mdadm ã¯ä¿®æ­£ã—よã†ã¨ã—" +"ã¦ã€çµæžœçš„ã«ãƒ¡ãƒ‡ã‚£ã‚¢ã¸æ›¸ãè¾¼ã¿ã‚’è¡Œã„ã¾ã™ã€‚" + +#. Type: boolean +#. Description +#: ../mdadm.templates:2001 +msgid "" +"The default, if turned on, is to check on the first Sunday of every month at " +"01:06." +msgstr "" +"有効ã«ã—ãŸå ´åˆã€ãƒ‡ãƒ•ã‚©ãƒ«ãƒˆã§ã¯æ¯Žæœˆç¬¬ä¸€æ—¥æ›œ 01:06 ã«ãƒã‚§ãƒƒã‚¯ãŒå®Ÿè¡Œã•ã‚Œã¾ã™ã€‚" + +#. Type: boolean +#. Description +#: ../mdadm.templates:3001 +msgid "Do you want to start the MD monitoring daemon?" +msgstr "MD 監視デーモンを起動ã—ã¾ã™ã‹?" + +#. Type: boolean +#. Description +#: ../mdadm.templates:3001 +msgid "" +"The MD (RAID) monitor daemon sends email notifications in response to " +"important MD events (such as a disk failure)." +msgstr "" +"MD (RAID) 監視デーモンã¯ã€é‡å¤§ãª MD 関連ã®ã‚¤ãƒ™ãƒ³ãƒˆ (ディスク障害ãªã©) ã«å¯¾ã—" +"ã¦ãƒ¡ãƒ¼ãƒ«ã§é€šçŸ¥ã‚’é€ã‚Šã¾ã™ã€‚" + +#. Type: boolean +#. Description +#: ../mdadm.templates:3001 +msgid "Enabling this option is recommended." +msgstr "ã“ã®æ©Ÿèƒ½ã‚’有効ã«ã™ã‚‹ã®ã‚’ãŠå‹§ã‚ã—ã¾ã™ã€‚" + +#. Type: string +#. Description +#: ../mdadm.templates:4001 +msgid "Recipient for email notifications:" +msgstr "メール通知ã®å®›å…ˆ:" + +#. Type: string +#. 
Description +#: ../mdadm.templates:4001 +msgid "" +"Please enter the email address of the user who should get the email " +"notifications for important MD events." +msgstr "" +"MD 関連ã®é‡å¤§ãªã‚¤ãƒ™ãƒ³ãƒˆãŒç™ºç”Ÿã—ãŸéš›ã€ãƒ¡ãƒ¼ãƒ«ã§ã®é€šçŸ¥ã‚’å—ã‘å–ã‚‹å¿…è¦ãŒã‚るユーザ" +"ã®ãƒ¡ãƒ¼ãƒ«ã‚¢ãƒ‰ãƒ¬ã‚¹ã‚’入力ã—ã¦ãã ã•ã„。" + +#~ msgid "MD arrays needed for the root file system:" +#~ msgstr "ルートファイルシステムã«å¿…è¦ãª MD アレイ:" + +#~ msgid "" +#~ "Please enter 'all', 'none', or a space-separated list of devices such as " +#~ "'md0 md1' or 'md/1 md/d0' (the leading '/dev/' can be omitted)." +#~ msgstr "" +#~ "'all' ã¾ãŸã¯ 'none'ã€ã‚ã‚‹ã„ã¯ãƒ‡ãƒã‚¤ã‚¹ã®ãƒªã‚¹ãƒˆã‚’ 'md0 md1' ã‚„ 'md/1 md/d0' " +#~ "ã®ã‚ˆã†ã«ã‚¹ãƒšãƒ¼ã‚¹ã§åŒºåˆ‡ã£ã¦å…¥åŠ›ã—ã¦ãã ã•ã„ (å‰ã«ä»˜ã '/dev/' ã¯çœç•¥å¯èƒ½ã§" +#~ "ã™)。" + +#~ msgid "for internal use - only the long description is needed." +#~ msgstr "内部ã§ã®åˆ©ç”¨ã«ã¤ã„㦠- ã§ã‚‚ã€é•·ã„説明ãŒå¿…è¦ã§ã™ã€‚" + +#~ msgid "" +#~ "If the system's root file system is located on an MD array (RAID), it " +#~ "needs to be started early during the boot sequence. If it is located on a " +#~ "logical volume (LVM), which is on MD, all constituent arrays need to be " +#~ "started." +#~ msgstr "" +#~ "ã‚ãªãŸã®ã‚·ã‚¹ãƒ†ãƒ ã®ãƒ«ãƒ¼ãƒˆãƒ•ã‚¡ã‚¤ãƒ«ã‚·ã‚¹ãƒ†ãƒ ãŒ MD アレイ (RAID) 上ã«é…ç½®ã•ã‚Œã¦" +#~ "ã„ã‚‹ãªã‚‰ã°ã€ãƒ–ートシーケンスã®åˆæœŸæ®µéšŽã§ MD アレイを開始ã™ã‚‹å¿…è¦ãŒã‚ã‚Šã¾" +#~ "ã™ã€‚ルートファイルシステム㌠MD ã®ã‚ˆã†ãªè«–ç†ãƒœãƒªãƒ¥ãƒ¼ãƒ  (LVM) 上ã«ã‚ã‚‹å ´åˆ" +#~ "ã¯ã€æ§‹æˆã—ã¦ã„るアレイ全ã¦ã®é–‹å§‹ãŒå¿…è¦ã§ã™ã€‚" + +#~ msgid "" +#~ "If you know exactly which arrays are needed to bring up the root file " +#~ "system, and you want to postpone starting all other arrays to a later " +#~ "point in the boot sequence, enter the arrays to start here. " +#~ "Alternatively, enter 'all' to simply start all available arrays." 
+#~ msgstr "" +#~ "ã©ã®ã‚¢ãƒ¬ã‚¤ãŒãƒ«ãƒ¼ãƒˆãƒ•ã‚¡ã‚¤ãƒ«ã‚·ã‚¹ãƒ†ãƒ ã®ç«‹ã¡ä¸Šã’ã«å¿…è¦ã‹ã‚’正確ã«çŸ¥ã£ã¦ãŠã‚Šã€" +#~ "ブートシーケンスã®å¾Œã®æ™‚点ã¾ã§æ„図ã—ã¦ã„ã‚‹ã‚‚ã®ä»¥å¤–å…¨ã¦ã®ã‚¢ãƒ¬ã‚¤èµ·å‹•ã‚’é…らã›" +#~ "ãŸã„å ´åˆã€ã“ã“ã§æœ€åˆã«èµ·å‹•ã™ã‚‹ã‚¢ãƒ¬ã‚¤ã‚’入力ã—ã¦ãã ã•ã„。ãã†ã§ãªã„å ´" +#~ "åˆã€'all' ã¨å…¥åŠ›ã—ã¦å˜ã«å…¨ã¦ã®åˆ©ç”¨å¯èƒ½ãªã‚¢ãƒ¬ã‚¤ã‚’最åˆã«ç«‹ã¡ä¸Šã’ã¦ãã ã•ã„。" + +#~ msgid "" +#~ "If you do not need or want to start any arrays for the root file system, " +#~ "leave the answer blank (or enter 'none'). This may be the case if you are " +#~ "using kernel autostart or do not need any arrays to boot." +#~ msgstr "" +#~ "ルートファイルシステムã®ãŸã‚ã«ã€ã©ã®ã‚¢ãƒ¬ã‚¤ã‚‚å¿…è¦ãªã„ã€ã‚ã‚‹ã„ã¯ã©ã®ã‚¢ãƒ¬ã‚¤ã‚‚" +#~ "èµ·å‹•ã—ãŸãã¯ç„¡ã„ã¨ã„ã†å ´åˆã¯ã€ç©ºç™½ã®ã¾ã¾ã« (ã‚ã‚‹ã„㯠'none' ã¨å…¥åŠ›) ã—ã¦ã" +#~ "ã ã•ã„。ã“ã‚Œã¯ã€ã‚«ãƒ¼ãƒãƒ«ã§è‡ªå‹•çš„ã«èµ·å‹•ã•ã‚Œã‚‹å ´åˆã‚„起動時ã«ã¯ã‚¢ãƒ¬ã‚¤ã¯ä¸è¦ã§" +#~ "ã‚ã‚‹ã¨ã„ã†å ´åˆã§ã™ã€‚" + +#~ msgid "An error occurred: device node does not exist" +#~ msgstr "エラーãŒç™ºç”Ÿã—ã¾ã—ãŸ: デãƒã‚¤ã‚¹ãƒŽãƒ¼ãƒ‰ãŒå­˜åœ¨ã—ã¾ã›ã‚“" + +#~ msgid "An error occurred: not a block device" +#~ msgstr "エラーãŒç™ºç”Ÿã—ã¾ã—ãŸ: ブロックデãƒã‚¤ã‚¹ã§ã¯ã‚ã‚Šã¾ã›ã‚“" + +#~ msgid "An error occurred: not an MD array" +#~ msgstr "エラーãŒç™ºç”Ÿã—ã¾ã—ãŸ: MD アレイã§ã¯ã‚ã‚Šã¾ã›ã‚“" + +#~ msgid "An error occurred: array not listed in mdadm.conf file" +#~ msgstr "" +#~ "エラーãŒç™ºç”Ÿã—ã¾ã—ãŸ: mdadm.conf ファイルã«è¨˜è¿°ã•ã‚Œã¦ã„ãªã„アレイã§ã™" + +#~ msgid "Start arrays not listed in mdadm.conf?" +#~ msgstr "mdadm.conf ã«è¨˜è¿°ã•ã‚Œã¦ã„ãªã„アレイを起動ã—ã¾ã™ã‹?" + +#~ msgid "" +#~ "The specified array (${array}) is not listed in the configuration file " +#~ "(${config}). Therefore, it cannot be started during boot, unless you " +#~ "correct the configuration file and recreate the initial ramdisk." 
+#~ msgstr "" +#~ "指定ã—ãŸã‚¢ãƒ¬ã‚¤ (${array}) ã¯è¨­å®šãƒ•ã‚¡ã‚¤ãƒ« (${config}) ã«è¨˜è¿°ã•ã‚Œã¦ã„ã¾ã›" +#~ "ん。ãã®ãŸã‚ã€è¨­å®šãƒ•ã‚¡ã‚¤ãƒ«ã‚’修正ã—㦠initrd ã‚’å†ç”Ÿæˆã—ãªã‘ã‚Œã°ãƒ–ート時ã«èµ·" +#~ "å‹•ã§ãã¾ã›ã‚“。" + +#~ msgid "" +#~ "This warning is only relevant if you need arrays to be started from the " +#~ "initial ramdisk to be able to boot. If you use kernel autostarting, or do " +#~ "not need any arrays to be started as early as the initial ramdisk is " +#~ "loaded, you can simply continue. Alternatively, choose not to continue " +#~ "and enter 'none' when prompted which arrays to start from the initial " +#~ "ramdisk." +#~ msgstr "" +#~ "ã“ã®è­¦å‘Šã¯ã€ãƒ–ートã§ãるよã†ã«ã‚¢ãƒ¬ã‚¤ã‚’ initrd ã‹ã‚‰èµ·å‹•ã™ã‚‹å¿…è¦ãŒã‚ã‚‹å ´åˆã " +#~ "ã‘関係ã—ã¾ã™ã€‚カーãƒãƒ«ã§è‡ªå‹•çš„ã«ã‚¢ãƒ¬ã‚¤ã‚’èµ·å‹•ã™ã‚‹ã‚ˆã†ã«ã—ã¦ã„ã‚‹å ´åˆã€ã‚ã‚‹ã„" +#~ "㯠initrd ãŒãƒ­ãƒ¼ãƒ‰ã•ã‚Œã‚‹ç¨‹æ—©ã„段階ã§ã©ã®ã‚¢ãƒ¬ã‚¤ã‚‚èµ·å‹•ã—ãŸãã¯ãªã„å ´åˆã¯ãã®" +#~ "ã¾ã¾ç¶šè¡Œã§ãã¾ã™ã€‚ä»–ã®é¸æŠžè‚¢ã¨ã—ã¦ã¯ã€èµ·å‹•ã®ç¶šè¡Œã‚’中止ã—ã€ã©ã®ã‚¢ãƒ¬ã‚¤ã‚’ " +#~ "initrd ã‹ã‚‰èµ·å‹•ã™ã‚‹ã‹ã‚’å°‹ã­ã‚‰ã‚ŒãŸéš›ã« 'none' ã¨å…¥åŠ›ã—ã¾ã™ã€‚" + +#~ msgid "" +#~ "WARNING! If you are using hard disks which have RAID superblocks from " +#~ "earlier installations in different RAID arrays, you MUST zero each " +#~ "superblock *before* activating the autostart feature." +#~ msgstr "" +#~ "警告! 以å‰ã®ã‚¤ãƒ³ã‚¹ãƒˆãƒ¼ãƒ«ã«ã‚ˆã£ã¦ä»–ã® RAID アレイã«å¯¾ã™ã‚‹ RAID superblock " +#~ "ã‚’ä¿æŒã—ã¦ã„ã‚‹ãƒãƒ¼ãƒ‰ãƒ‡ã‚£ã‚¹ã‚¯ã‚’使ã£ã¦ã„ã‚‹å ´åˆã€è‡ªå‹•èµ·å‹•æ©Ÿèƒ½ã‚’有効ã«ã™ã‚‹" +#~ "「å‰ã€ã«ã€ãã® superblock をゼロã§ä¸Šæ›¸ãã™ã‚‹ã“ã¨ãŒã€Œå¿…è¦ã€ã§ã™ã€‚" + +#~ msgid "" +#~ "To do this, do not start the RAID devices automatically. First, zero the " +#~ "superblock (mdadm --zero-superblock /dev/mdX). Next, use `dpkg-" +#~ "reconfigure mdadm` to reactivate the autostart feature." 
+#~ msgstr "" +#~ "ã“れを行ã†ã«ã¯ã€RAID デãƒã‚¤ã‚¹ã‚’自動的ã«èµ·å‹•ã—ã¦ã¯ã„ã‘ã¾ã›ã‚“。ã¾ãšã€ " +#~ "superblock をゼロã§ä¸Šæ›¸ãã—ã¾ã™ (mdadm --zero-superblock /dev/xxx)。 ãã—" +#~ "ã¦ã€è‡ªå‹•èµ·å‹•æ©Ÿèƒ½ã‚’有効ã«ã™ã‚‹ãŸã‚ã€'dpkg-reconfigure mdadm' コマンドを実行" +#~ "ã—ã¾ã™ã€‚" + +#~ msgid "" +#~ "You have the option to start all other arrays (those not needed for the " +#~ "root filesystem) later in the boot sequence. Doing so will give you " +#~ "greater control over the arrays with the mdadm configuration file. " +#~ "Starting all arrays at boot-time may be safer though." +#~ msgstr "" +#~ "ブートシーケンスã®å¾Œã‚ã®æ–¹ã§ (root ファイルシステムã«ã¯å¿…è¦ãªã„) ä»–ã®ã‚¢ãƒ¬" +#~ "イ全ã¦ã‚’èµ·å‹•ã™ã‚‹ã¨ã„ã†é¸æŠžè‚¢ã‚‚ã‚ã‚Šã¾ã™ã€‚ã“れをé¸ã¹ã°ã€mdadm ã®è¨­å®šãƒ•ã‚¡ã‚¤ãƒ«" +#~ "を使ã£ã¦ã€ã‚¢ãƒ¬ã‚¤ã«ã¤ã„ã¦æ§˜ã€…ãªè¨­å®šãŒå‡ºæ¥ã‚‹ã‚ˆã†ã«ãªã‚‹ã§ã—ょã†ã€‚ã‚‚ã£ã¨ã‚‚ã€èµ·" +#~ "動時ã«å…¨ã¦ã®ã‚¢ãƒ¬ã‚¤ã‚’èµ·å‹•ã™ã‚‹ã»ã†ãŒå®‰å…¨ã§ã¯ã‚ã‚Šã¾ã™ã€‚" + +#~ msgid "" +#~ "If RAID devices are started automatically, all RAID devices are " +#~ "discovered and assembled automatically at system startup. This option " +#~ "should only be used if the md driver is compiled as a module. If it is " +#~ "compiled into your kernel, the automatic startup will be performed at " +#~ "boot time by the kernel and therefore you should not choose this option." +#~ msgstr "" +#~ "RAID デãƒã‚¤ã‚¹ãŒè‡ªå‹•çš„ã«èµ·å‹•ã™ã‚‹ã‚ˆã†ã«ã™ã‚‹ã¨ã€ã‚·ã‚¹ãƒ†ãƒ èµ·å‹•æ™‚ã«å…¨ã¦ã® RAID " +#~ "デãƒã‚¤ã‚¹ãŒæ¤œå‡ºã•ã‚Œã€è‡ªå‹•çš„ã«æ§‹æˆã•ã‚Œã¾ã™ã€‚ã“ã®ã‚ªãƒ—ション㯠md ドライãƒãŒãƒ¢" +#~ "ジュールã¨ã—ã¦ã‚³ãƒ³ãƒ‘イルã•ã‚Œã¦ã„ã‚‹å ´åˆã®ã¿ã«åˆ©ç”¨ã—ã¾ã™ã€‚カーãƒãƒ«ã«çµ„ã¿è¾¼ã‚“" +#~ "ã§ã‚³ãƒ³ãƒ‘イルã—ã¦ã„ãŸå ´åˆã€ã‚·ã‚¹ãƒ†ãƒ èµ·å‹•æ™‚ã«ã‚«ãƒ¼ãƒãƒ«ã«ã‚ˆã£ã¦è‡ªå‹•èµ·å‹•ãŒå®Ÿè¡Œã•" +#~ "れるã®ã§ã€ã“ã®ã‚ªãƒ—ションã§ã®é¸æŠžã¯ã§ãã¾ã›ã‚“。" + +#~ msgid "" +#~ "When the RAID monitor daemon runs, email notifications are sent when a " +#~ "disk belonging to a RAID array fails or changes its status for some " +#~ "reason." 
+#~ msgstr "" +#~ "RAID 監視デーモンãŒå‹•ä½œã—ã¦ã„ã‚‹å ´åˆã€RAID アレイã«å±žã—ã¦ã„るディスクãŒæ•…éšœ" +#~ "ã™ã‚‹ã‹ä½•ã‚‰ã‹ã®ç†ç”±ã§å¤‰åŒ–ã—ãŸéš›ã«ãƒ¡ãƒ¼ãƒ«ã§é€šçŸ¥ãŒé€ã‚‰ã‚Œã¾ã™ã€‚" diff --git a/debian/po/nl.po b/debian/po/nl.po new file mode 100644 index 00000000..34f473e6 --- /dev/null +++ b/debian/po/nl.po @@ -0,0 +1,188 @@ +# translation of mdadm_2.6.3+200709292116+4450e59-4.po to Dutch +# +# Translators, if you are not familiar with the PO format, gettext +# documentation is worth reading, especially sections dedicated to +# this format, e.g. by running: +# info -n '(gettext)PO Files' +# info -n '(gettext)Header Entry' +# Some information specific to po-debconf are available at +# /usr/share/doc/po-debconf/README-trans +# or http://www.debian.org/intl/l10n/po-debconf/README-trans# +# Developers do not need to manually edit POT or PO files. +# +# Frans Pop , 2005, 2006. +# Frans Pop , 2008. +msgid "" +msgstr "" +"Project-Id-Version: mdadm_2.6.3+200709292116+4450e59-4\n" +"Report-Msgid-Bugs-To: mdadm@packages.debian.org\n" +"POT-Creation-Date: 2016-07-05 14:12+0200\n" +"PO-Revision-Date: 2008-02-19 14:04+0100\n" +"Last-Translator: Frans Pop \n" +"Language-Team: Dutch \n" +"Language: nl\n" +"MIME-Version: 1.0\n" +"Content-Type: text/plain; charset=UTF-8\n" +"Content-Transfer-Encoding: 8bit\n" +"X-Generator: KBabel 1.11.4\n" + +#. Type: boolean +#. Description +#: ../mdadm.templates:2001 +msgid "Should mdadm run monthly redundancy checks of the MD arrays?" +msgstr "Maandelijkse redundantiecontrole van RAID-reeksen uitvoeren?" + +#. Type: boolean +#. Description +#: ../mdadm.templates:2001 +msgid "" +"If the kernel supports it (versions greater than 2.6.14), mdadm can " +"periodically check the redundancy of MD arrays (RAIDs). This may be a " +"resource-intensive process, depending on the local setup, but it could help " +"prevent rare cases of data loss. 
Note that this is a read-only check unless " +"errors are found; if errors are found, mdadm will try to correct them, which " +"may result in write access to the media." +msgstr "" +"Als uw kernel dit ondersteunt (versies groter dan 2.6.14), kan mdadm " +"periodiek de redundantie van uw RAID reeksen controleren. Afhankelijk van uw " +"configuratie kan dit een intensief proces zijn, maar het kan wel helpen om " +"uitzonderlijke gevallen van gegevensverlies te voorkomen. Zolang geen fouten " +"worden gevonden, zal het proces alleen gegevens lezen; als echter wel fouten " +"worden gevonden zal mdadm deze proberen te corrigeren." + +#. Type: boolean +#. Description +#: ../mdadm.templates:2001 +msgid "" +"The default, if turned on, is to check on the first Sunday of every month at " +"01:06." +msgstr "" +"Standaard wordt de controle, indien geactiveerd, uitgevoerd om 01:06 op elke " +"eerste zondag van de maand." + +#. Type: boolean +#. Description +#: ../mdadm.templates:3001 +msgid "Do you want to start the MD monitoring daemon?" +msgstr "Wilt u de achtergronddienst voor de RAID-monitor starten?" + +#. Type: boolean +#. Description +#: ../mdadm.templates:3001 +msgid "" +"The MD (RAID) monitor daemon sends email notifications in response to " +"important MD events (such as a disk failure)." +msgstr "" +"De achtergronddienst voor de RAID-monitor stuurt per e-mail berichten bij " +"belangrijke gebeurtenissen die betrekking hebben op RAID (zoals een falende " +"harde schijf)." + +#. Type: boolean +#. Description +#: ../mdadm.templates:3001 +msgid "Enabling this option is recommended." +msgstr "Gebruik van deze optie wordt aanbevolen." + +#. Type: string +#. Description +#: ../mdadm.templates:4001 +msgid "Recipient for email notifications:" +msgstr "Adres voor e-mailberichten:" + +#. Type: string +#. Description +#: ../mdadm.templates:4001 +msgid "" +"Please enter the email address of the user who should get the email " +"notifications for important MD events." 
+msgstr "" +"Wat is het e-mailadres van de gebruiker die de e-mailberichten voor " +"belangrijke gebeurtenissen met betrekking tot RAID dient te ontvangen." + +#~ msgid "MD arrays needed for the root file system:" +#~ msgstr "Voor het basisbestandssysteem benodigde RAID-reeksen:" + +#~ msgid "" +#~ "Please enter 'all', 'none', or a space-separated list of devices such as " +#~ "'md0 md1' or 'md/1 md/d0' (the leading '/dev/' can be omitted)." +#~ msgstr "" +#~ "Geef in 'all' (alle), 'none' (geen) of één of meerdere apparaatbestanden " +#~ "(gescheiden door spaties), bijvoorbeeld \"md0 md1\" of \"md/1 md/d0\" (de " +#~ "prefix '/dev/' kan dus worden weggelaten)." + +#~ msgid "for internal use - only the long description is needed." +#~ msgstr "Voor intern gebruik - alleen de lange omschrijving wordt gebruikt." + +#~ msgid "" +#~ "If the system's root file system is located on an MD array (RAID), it " +#~ "needs to be started early during the boot sequence. If it is located on a " +#~ "logical volume (LVM), which is on MD, all constituent arrays need to be " +#~ "started." +#~ msgstr "" +#~ "Als het basisbestandssysteem van uw systeem zich op een RAID-volume " +#~ "bevindt, dient dit vroeg in de opstartcyclus geactiveerd te worden. Als " +#~ "het zich op een logisch volume (LVM) op RAID bevindt, dienen alle " +#~ "betrokken reeksen geactiveerd te worden." + +#~ msgid "" +#~ "If you know exactly which arrays are needed to bring up the root file " +#~ "system, and you want to postpone starting all other arrays to a later " +#~ "point in the boot sequence, enter the arrays to start here. " +#~ "Alternatively, enter 'all' to simply start all available arrays." +#~ msgstr "" +#~ "Als u precies weet welke reeksen benodigd zijn voor het " +#~ "basisbestandssysteem en u het activeren van alle overige reeksen wilt " +#~ "uitstellen tot later in de opstartprocedure, geef dan hier de te " +#~ "activeren reeksen in. 
Anders kunt u 'all' ingeven om alle beschikbare " +#~ "reeksen te activeren." + +#~ msgid "" +#~ "If you do not need or want to start any arrays for the root file system, " +#~ "leave the answer blank (or enter 'none'). This may be the case if you are " +#~ "using kernel autostart or do not need any arrays to boot." +#~ msgstr "" +#~ "Als u geen reeksen hoeft of wenst te activeren voor het " +#~ "basisbestandssysteem, laat dan het antwoord leeg (of geef 'none' in). Dit " +#~ "kan het geval zijn als u \"kernel autostart\" gebruikt of geen reeksen " +#~ "nodig heeft om uw systeem op te starten." + +#~ msgid "An error occurred: device node does not exist" +#~ msgstr "Er is een fout opgetreden: apparaatbestand bestaat niet" + +#~ msgid "An error occurred: not a block device" +#~ msgstr "Er is een fout opgetreden: geen blokapparaat" + +#~ msgid "An error occurred: not an MD array" +#~ msgstr "Er is een fout opgetreden: geen RAID reeks" + +#~ msgid "An error occurred: array not listed in mdadm.conf file" +#~ msgstr "" +#~ "Er is een fout opgetreden: reeks komt niet voor in bestand mdadm.conf" + +#~ msgid "Start arrays not listed in mdadm.conf?" +#~ msgstr "Reeksen activeren die niet in mdadm.conf voorkomen?" + +#~ msgid "" +#~ "The specified array (${array}) is not listed in the configuration file " +#~ "(${config}). Therefore, it cannot be started during boot, unless you " +#~ "correct the configuration file and recreate the initial ramdisk." +#~ msgstr "" +#~ "De reeks die u heeft opgegeven (${array}) komt niet voor in het " +#~ "configuratiebestand (${config}). Tenzij u het configuratiebestand " +#~ "corrigeert en de initiële ramdisk opnieuw aanmaakt, kan deze reeks " +#~ "tijdens het opstarten van het systeem niet worden geactiveerd." + +#~ msgid "" +#~ "This warning is only relevant if you need arrays to be started from the " +#~ "initial ramdisk to be able to boot. 
If you use kernel autostarting, or do " +#~ "not need any arrays to be started as early as the initial ramdisk is " +#~ "loaded, you can simply continue. Alternatively, choose not to continue " +#~ "and enter 'none' when prompted which arrays to start from the initial " +#~ "ramdisk." +#~ msgstr "" +#~ "Deze waarschuwing is alleen relevant als de reeksen geactiveerd moeten " +#~ "worden vanuit een initiële ramdisk om het systeem te kunnen opstarten. " +#~ "Als u \"kernel autostart\" gebruikt of er geen reeksen zijn die vanuit de " +#~ "initiële ramdisk gestart moeten worden, kunt u gewoon doorgaan. Kies " +#~ "anders nu om niet door te gaan en geef 'none' in bij de vraag welke " +#~ "reeksen vanuit de initiële ramdisk gestart moeten worden." diff --git a/debian/po/pt.po b/debian/po/pt.po new file mode 100644 index 00000000..2b549e98 --- /dev/null +++ b/debian/po/pt.po @@ -0,0 +1,179 @@ +# Portuguese translation for mdadm debconf messages. +# Copyright (C) 2008 Pedro Ribeiro +# This file is distributed under the same license as the mdadm package. +# Pedro Ribeiro , 2008 +# +msgid "" +msgstr "" +"Project-Id-Version: mdadm 2.6.3+200709292116+4450e59-4\n" +"Report-Msgid-Bugs-To: mdadm@packages.debian.org\n" +"POT-Creation-Date: 2016-07-05 14:12+0200\n" +"PO-Revision-Date: 2008-02-21 00:15+0000\n" +"Last-Translator: Pedro Ribeiro \n" +"Language-Team: Portuguese \n" +"Language: pt\n" +"MIME-Version: 1.0\n" +"Content-Type: text/plain; charset=utf-8\n" +"Content-Transfer-Encoding: 8bit\n" + +#. Type: boolean +#. Description +#: ../mdadm.templates:2001 +msgid "Should mdadm run monthly redundancy checks of the MD arrays?" +msgstr "" +"O mdadm deve correr verificações de redundância nos grupos MD mensalmente?" + +#. Type: boolean +#. Description +#: ../mdadm.templates:2001 +msgid "" +"If the kernel supports it (versions greater than 2.6.14), mdadm can " +"periodically check the redundancy of MD arrays (RAIDs). 
This may be a " +"resource-intensive process, depending on the local setup, but it could help " +"prevent rare cases of data loss. Note that this is a read-only check unless " +"errors are found; if errors are found, mdadm will try to correct them, which " +"may result in write access to the media." +msgstr "" +"Se o kernel suportar (versões mais recentes que 2.6.14) o mdadm pode " +"verificar periodicamente a redundância dos grupos MD (RAIDs). Isto pode ser " +"um processo que requer muitos recursos, dependendo da sua configuração, mas " +"pode prevenir casos raros de perda de dados. Notar que esta verificação é " +"feita em modo de leitura a não ser que sejam encontrados erros; se forem " +"encontrados erros, o mdadm tenta corrigi-los, o que pode resultar em " +"acessos de escrita aos discos." + +#. Type: boolean +#. Description +#: ../mdadm.templates:2001 +msgid "" +"The default, if turned on, is to check on the first Sunday of every month at " +"01:06." +msgstr "" +"O pré-definido, se ligado, é os testes serem executados no primeiro Domingo " +"de cada mês às 01:06." + +#. Type: boolean +#. Description +#: ../mdadm.templates:3001 +msgid "Do you want to start the MD monitoring daemon?" +msgstr "Quer iniciar o daemon de monitorização do MD?" + +#. Type: boolean +#. Description +#: ../mdadm.templates:3001 +msgid "" +"The MD (RAID) monitor daemon sends email notifications in response to " +"important MD events (such as a disk failure)." +msgstr "" +"O daemon monitor MD(RAID) envia notificações por email no caso de eventos " +"importantes (tais como falha de um disco). Provavelmente quer activar esta " +"opção." + +#. Type: boolean +#. Description +#: ../mdadm.templates:3001 +msgid "Enabling this option is recommended." +msgstr "É recomendado activar esta opção." + +#. Type: string +#. Description +#: ../mdadm.templates:4001 +msgid "Recipient for email notifications:" +msgstr "Destinatário de email para notificações:" + +#. Type: string +#. 
Description +#: ../mdadm.templates:4001 +msgid "" +"Please enter the email address of the user who should get the email " +"notifications for important MD events." +msgstr "" +"Por favor, introduza o endereço de email do utilizador que deve receber as " +"notificações de eventos MD importantes." + +#~ msgid "MD arrays needed for the root file system:" +#~ msgstr "Grupos MD necessários para o sistema de ficheiros raiz:" + +#~ msgid "" +#~ "Please enter 'all', 'none', or a space-separated list of devices such as " +#~ "'md0 md1' or 'md/1 md/d0' (the leading '/dev/' can be omitted)." +#~ msgstr "" +#~ "Por favor, introduza 'all', 'none', ou uma lista de dispositivos " +#~ "separados por espaços, tais como 'md0 md1' ou 'md/1 md/d0' (o '/dev/' " +#~ "inicial pode ser omitido)." + +#~ msgid "for internal use - only the long description is needed." +#~ msgstr "para uso interno - apenas a descrição longa é necessária" + +#~ msgid "" +#~ "If the system's root file system is located on an MD array (RAID), it " +#~ "needs to be started early during the boot sequence. If it is located on a " +#~ "logical volume (LVM), which is on MD, all constituent arrays need to be " +#~ "started." +#~ msgstr "" +#~ "Se o sistema de ficheiros de raiz do sistema estiver num grupo MD (RAID), " +#~ "necessita de ser iniciado mais cedo na sequência de arranque. Se o seu " +#~ "sistema de ficheiros de raiz estiver num volume lógico (LVM) que está no " +#~ "MD, todos os grupos constituintes necessitam de ser iniciados." + +#~ msgid "" +#~ "If you know exactly which arrays are needed to bring up the root file " +#~ "system, and you want to postpone starting all other arrays to a later " +#~ "point in the boot sequence, enter the arrays to start here. " +#~ "Alternatively, enter 'all' to simply start all available arrays." 
+#~ msgstr "" +#~ "Se souber exactamente que grupos são necessários para iniciar o sistema " +#~ "de ficheiros raiz, e quiser adiar o inicio de todos os outros grupos para " +#~ "mais tarde no processo de arranque, introduza os grupos aqui. " +#~ "Alternativamente, introduza 'all' para iniciar todos os grupos " +#~ "disponíveis." + +#~ msgid "" +#~ "If you do not need or want to start any arrays for the root file system, " +#~ "leave the answer blank (or enter 'none'). This may be the case if you are " +#~ "using kernel autostart or do not need any arrays to boot." +#~ msgstr "" +#~ "Se não necessita ou deseja iniciar grupos para o sistema de ficheiros " +#~ "raiz, deixe a resposta em branco (ou introduza 'none'). Isto vale no caso " +#~ "de usar o auto-arranque do kernel ou não necessitar de grupos para o " +#~ "arranque do sistema." + +#~ msgid "An error occurred: device node does not exist" +#~ msgstr "Ocorreu um erro: o nó do dispositivo não existe" + +#~ msgid "An error occurred: not a block device" +#~ msgstr "Ocorreu um erro: não é um dispositivo de bloco" + +#~ msgid "An error occurred: not an MD array" +#~ msgstr "Ocorreu um erro: não é um grupo MD" + +#~ msgid "An error occurred: array not listed in mdadm.conf file" +#~ msgstr "Ocorreu um erro: o grupo não está listado no ficheiro mdadm.conf" + +#~ msgid "Start arrays not listed in mdadm.conf?" +#~ msgstr "Iniciar grupos não listados no mdadm.conf?" + +#~ msgid "" +#~ "The specified array (${array}) is not listed in the configuration file " +#~ "(${config}). Therefore, it cannot be started during boot, unless you " +#~ "correct the configuration file and recreate the initial ramdisk." +#~ msgstr "" +#~ "O grupo especificado (${array}) não está listado no ficheiro de " +#~ "configuração (${config}). Portanto, não pode ser iniciado durante o " +#~ "processo de arranque, a não ser que corrija o ficheiro de configuração e " +#~ "recrie o ramdisk inicial." 
+ +#~ msgid "" +#~ "This warning is only relevant if you need arrays to be started from the " +#~ "initial ramdisk to be able to boot. If you use kernel autostarting, or do " +#~ "not need any arrays to be started as early as the initial ramdisk is " +#~ "loaded, you can simply continue. Alternatively, choose not to continue " +#~ "and enter 'none' when prompted which arrays to start from the initial " +#~ "ramdisk." +#~ msgstr "" +#~ "Este aviso só é relevante se houver necessidade de iniciar grupos a " +#~ "partir do ramdisk durante o arranque do sistema. Se usar o auto-arranque " +#~ "do kernel, ou não necessitar de iniciar os grupos tão cedo no processo de " +#~ "arranque do sistema, pode simplesmente continuar. Em alternativa, escolha " +#~ "não continuar e introduza 'none' quando perguntado sobre quais grupos " +#~ "iniciar a partir do ramdisk inicial." diff --git a/debian/po/pt_BR.po b/debian/po/pt_BR.po new file mode 100644 index 00000000..3f6df2fc --- /dev/null +++ b/debian/po/pt_BR.po @@ -0,0 +1,304 @@ +# +# Translators, if you are not familiar with the PO format, gettext +# documentation is worth reading, especially sections dedicated to +# this format, e.g. by running: +# info -n '(gettext)PO Files' +# info -n '(gettext)Header Entry' +# +# Some information specific to po-debconf are available at +# /usr/share/doc/po-debconf/README-trans +# or http://www.debian.org/intl/l10n/po-debconf/README-trans +# +# Developers do not need to manually edit POT or PO files. +# +msgid "" +msgstr "" +"Project-Id-Version: mdadm\n" +"Report-Msgid-Bugs-To: mdadm@packages.debian.org\n" +"POT-Creation-Date: 2016-07-05 14:12+0200\n" +"PO-Revision-Date: 2006-09-24 19:22-0300\n" +"Last-Translator: Felipe Augusto van de Wiel (faw) \n" +"Language-Team: l10n portuguese \n" +"Language: pt_BR\n" +"MIME-Version: 1.0\n" +"Content-Type: text/plain; charset=UTF-8\n" +"Content-Transfer-Encoding: 8bit\n" +"pt_BR utf-8\n" + +#. Type: boolean +#. 
Description +#: ../mdadm.templates:2001 +msgid "Should mdadm run monthly redundancy checks of the MD arrays?" +msgstr "" +"O mdadm deve, mensalmente, executar checagens de redundância dos " +"dispositivos MD?" + +#. Type: boolean +#. Description +#: ../mdadm.templates:2001 +#, fuzzy +#| msgid "" +#| "If your kernel supports it (>> 2.6.14), mdadm can periodically check the " +#| "redundancy of your MD arrays (RAIDs). This may be a resource-intensive " +#| "process, depending on your setup, but it could help prevent rare cases of " +#| "data loss. Note that this is a read-only check unless errors are found; " +#| "if errors are found, mdadm will try to correct them, which may result in " +#| "write access to the media." +msgid "" +"If the kernel supports it (versions greater than 2.6.14), mdadm can " +"periodically check the redundancy of MD arrays (RAIDs). This may be a " +"resource-intensive process, depending on the local setup, but it could help " +"prevent rare cases of data loss. Note that this is a read-only check unless " +"errors are found; if errors are found, mdadm will try to correct them, which " +"may result in write access to the media." +msgstr "" +"Se o seu kernel suporta isto (>> 2.6.14), mdadm pode periodicamente checar a " +"redundância dos seus dispositivos MD (RAIDs). Isto pode ser um processo com " +"uso intensivo dos recursos, dependendo da sua configuração, mas pode ajudar " +"a prevenir casos raros de perdas de dados. Note que esta é uma checagem " +"somente-leitura a menos que erros sejam encontrados; se erros são " +"encontrados, mdadm tentará corrigi-los, o que poderá resultar em acesso de " +"escrita na mídia." + +#. Type: boolean +#. Description +#: ../mdadm.templates:2001 +#, fuzzy +#| msgid "" +#| "The default, if turned on, is to run the checks on the first Sunday of " +#| "every month at 01:06 o'clock." +msgid "" +"The default, if turned on, is to check on the first Sunday of every month at " +"01:06." 
+msgstr "" +"O padrão, se ativado, é executar checagens no primeiro Domingo de cada mês " +"às 01:06 em ponto." + +#. Type: boolean +#. Description +#: ../mdadm.templates:3001 +msgid "Do you want to start the MD monitoring daemon?" +msgstr "Você deseja iniciar o \"daemon\" de monitoramento MD?" + +#. Type: boolean +#. Description +#: ../mdadm.templates:3001 +#, fuzzy +#| msgid "" +#| "The MD (RAID) monitor daemon sends email notifications in response to " +#| "important MD events (such as a disk failure). You probably want to enable " +#| "it." +msgid "" +"The MD (RAID) monitor daemon sends email notifications in response to " +"important MD events (such as a disk failure)." +msgstr "" +"O \"daemon\" de monitoramento MD (RAID) envia e-mails de notificações em " +"resposta a eventos MD importantes (como uma falha de disco). Você " +"provavelmente quer habilitar esta opção." + +#. Type: boolean +#. Description +#: ../mdadm.templates:3001 +msgid "Enabling this option is recommended." +msgstr "" + +#. Type: string +#. Description +#: ../mdadm.templates:4001 +msgid "Recipient for email notifications:" +msgstr "Destinatário para os e-mails de notificações:" + +#. Type: string +#. Description +#: ../mdadm.templates:4001 +#, fuzzy +#| msgid "" +#| "Please enter the email address of the user who should get the email " +#| "notification for important MD events." +msgid "" +"Please enter the email address of the user who should get the email " +"notifications for important MD events." +msgstr "" +"Por favor, informe o endereço de e-mail do usuário que deverá receber os e-" +"mails de notificações para estes eventos MD importantes." + +#, fuzzy +#~| msgid "MD arrays needed for the root filesystem:" +#~ msgid "MD arrays needed for the root file system:" +#~ msgstr "Dispositivos MD necessários para o sistema de arquivos raiz:" + +#, fuzzy +#~| msgid "" +#~| "Please enter a space-separated list of devices, 'all', or 'none'. 
You " +#~| "may omit the leading '/dev/' and just enter e.g. \"md0 md1\", or \"md/1 " +#~| "md/d0\"." +#~ msgid "" +#~ "Please enter 'all', 'none', or a space-separated list of devices such as " +#~ "'md0 md1' or 'md/1 md/d0' (the leading '/dev/' can be omitted)." +#~ msgstr "" +#~ "Por favor, informe uma lista separada por espaços dos dispositivos, 'all' " +#~ "ou 'none'. Você pode omitir a parte inicial '/dev/' e apenas informar, " +#~ "por exemplo, \"md0 md1\", ou \"md/1 md/d0\"." + +#~ msgid "for internal use - only the long description is needed." +#~ msgstr "para uso interno - apenas a descrição longa é necessária." + +#, fuzzy +#~| msgid "" +#~| "If your system has its root filesystem on an MD array (RAID), it needs " +#~| "to be started early during the boot sequence. If your root filesystem is " +#~| "on a logical volume (LVM), which is on MD, all constituent arrays need " +#~| "to be started." +#~ msgid "" +#~ "If the system's root file system is located on an MD array (RAID), it " +#~ "needs to be started early during the boot sequence. If it is located on a " +#~ "logical volume (LVM), which is on MD, all constituent arrays need to be " +#~ "started." +#~ msgstr "" +#~ "Se o seu sistema tem o sistema de arquivos raiz em um dispositivo MD " +#~ "(RAID), este precisa ser iniciado mais cedo durante a seqüência de " +#~ "inicialização. Se o sistema de arquivos raiz está em um volume lógico " +#~ "(LVM), que está em um MD, todos os dispositivos que o constituem precisam " +#~ "ser iniciados." + +#, fuzzy +#~| msgid "" +#~| "If you know exactly which arrays are needed to bring up the root " +#~| "filesystem, and you want to postpone starting all other arrays to a " +#~| "later point in the boot sequence, enter the arrays to start here. " +#~| "Alternatively, enter 'all' to simply start all available arrays." 
+#~ msgid "" +#~ "If you know exactly which arrays are needed to bring up the root file " +#~ "system, and you want to postpone starting all other arrays to a later " +#~ "point in the boot sequence, enter the arrays to start here. " +#~ "Alternatively, enter 'all' to simply start all available arrays." +#~ msgstr "" +#~ "Se você sabe exatamente quais dispositivos são necessários para ativar o " +#~ "sistema de arquivos raiz, e você deseja adiar o início de todos os outros " +#~ "dispositivos para um ponto posterior na seqüência de inicialização, " +#~ "informe os dispositivos a serem iniciados aqui. Como alternativa, informe " +#~ "'all' para simplesmente iniciar todos os dispositivos disponíveis." + +#, fuzzy +#~| msgid "" +#~| "If you do not need or want to start any arrays for the root filesystem, " +#~| "leave the answer blank (or enter 'none'). This may be the case if you " +#~| "are using kernel autostart or do not need any arrays to boot." +#~ msgid "" +#~ "If you do not need or want to start any arrays for the root file system, " +#~ "leave the answer blank (or enter 'none'). This may be the case if you are " +#~ "using kernel autostart or do not need any arrays to boot." +#~ msgstr "" +#~ "Se você não precisa ou não quer iniciar quaisquer dispositivos para o " +#~ "sistema de arquivos raiz, deixe a resposta em branco (ou informe 'none'). " +#~ "Este pode ser o caso se você está usando \"kernel autostart\" ou não " +#~ "precisa de quaisquer dispositivos para a inicialização." 
+ +#~ msgid "An error occurred: device node does not exist" +#~ msgstr "Um erro ocorreu: o dispositivo (\"device node\") não existe" + +#~ msgid "An error occurred: not a block device" +#~ msgstr "Um erro ocorreu: não é um dispositivo de blocos" + +#~ msgid "An error occurred: not an MD array" +#~ msgstr "Um erro ocorreu: não é um dispositivo MD" + +#~ msgid "An error occurred: array not listed in mdadm.conf file" +#~ msgstr "Um erro ocorreu: dispositivo não listado no arquivo mdadm.conf" + +#, fuzzy +#~| msgid "Proceed with starting arrays not listed in mdadm.conf?" +#~ msgid "Start arrays not listed in mdadm.conf?" +#~ msgstr "Continuar com o início de dispositivos não listados no mdadm.conf?" + +#, fuzzy +#~| msgid "" +#~| "The array you have specified (${array}) is not listed in the " +#~| "configuration file ${config}. Therefore it cannot be started during " +#~| "boot, unless you correct the configuration file and recreate the initial " +#~| "ramdisk." +#~ msgid "" +#~ "The specified array (${array}) is not listed in the configuration file " +#~ "(${config}). Therefore, it cannot be started during boot, unless you " +#~ "correct the configuration file and recreate the initial ramdisk." +#~ msgstr "" +#~ "O dispositivo que você especificou (${array}) não está listado no arquivo " +#~ "de configuração ${config}. Portanto não pode ser iniciado durante a " +#~ "inicialização, a menos que você corrija o arquivo de configuração e " +#~ "recrie o \"ramdisk\" inicial." + +#~ msgid "" +#~ "This warning is only relevant if you need arrays to be started from the " +#~ "initial ramdisk to be able to boot. If you use kernel autostarting, or do " +#~ "not need any arrays to be started as early as the initial ramdisk is " +#~ "loaded, you can simply continue. Alternatively, choose not to continue " +#~ "and enter 'none' when prompted which arrays to start from the initial " +#~ "ramdisk." 
+#~ msgstr "" +#~ "Este aviso só é relevante se você precisa de dispositivos que sejam " +#~ "iniciados a partir do \"ramdisk\" inicial para que seja possível " +#~ "inicializar o computador. Se você usa \"kernel autostarting\", ou não " +#~ "precisa de quaisquer dispositivos sendo iniciados tão logo o \"ramdisk\" " +#~ "inicial seja carregado, você pode simplesmente continuar. " +#~ "Alternativamente, escolha não continuar e informe 'none' quando " +#~ "perguntado quais dispositivos iniciar a partir do \"ramdisk\" inicial." + +#~ msgid "Initialise the superblock if you reuse hard disks" +#~ msgstr "Inicialize o superbloco caso você reutilize discos rígidos" + +#~ msgid "" +#~ "WARNING! If you are using hard disks which have RAID superblocks from " +#~ "earlier installations in different RAID arrays, you MUST zero each " +#~ "superblock *before* activating the autostart feature." +#~ msgstr "" +#~ "AVISO! Se você estiver usando discos rígidos que já contêm superblocos " +#~ "RAID de instalações anteriores em \"arrays\" RAID diferentes, você DEVE " +#~ "zerar o superbloco *antes* de ativar o recurso de \"autostart\"." + +#~ msgid "" +#~ "To do this, do not start the RAID devices automatically. First, zero the " +#~ "superblock (mdadm --zero-superblock /dev/mdX). Next, use `dpkg-" +#~ "reconfigure mdadm` to reactivate the autostart feature." +#~ msgstr "" +#~ "Para fazê-lo, não inicie os dispositivos RAID automaticamente. Primeiro, " +#~ "zere os superblocos (mdadm --zero-superblock /dev/mdX). Em seguida, use " +#~ "`dpkg-reconfigure mdadm` para reativar o recurso de \"autostart\"." + +#~ msgid "" +#~ "You have the option to start all other arrays (those not needed for the " +#~ "root filesystem) later in the boot sequence. Doing so will give you " +#~ "greater control over the arrays with the mdadm configuration file. " +#~ "Starting all arrays at boot-time may be safer though." 
+#~ msgstr "" +#~ "Você tem a opção de iniciar todos os \"arrays\" (aqueles que não são " +#~ "necessários pelo sistema de arquivos raiz) posteriormente na seqüência de " +#~ "inicialização. Fazendo isto, você terá um controle maior sobre os \"arrays" +#~ "\" com o arquivo de configuração mdadm. No entanto, iniciar todos os " +#~ "\"arrays\" durante a inicialização pode ser mais seguro." + +#~ msgid "" +#~ "If RAID devices are started automatically, all RAID devices are " +#~ "discovered and assembled automatically at system startup. This option " +#~ "should only be used if the md driver is compiled as a module. If it is " +#~ "compiled into your kernel, the automatic startup will be performed at " +#~ "boot time by the kernel and therefore you should not choose this option." +#~ msgstr "" +#~ "Caso os dispositivos RAID sejam iniciados automaticamente, todos os " +#~ "dispositivos RAID serão detectados e montados automaticamente na " +#~ "inicialização do sistema operacional. Esta opção deverá ser usada somente " +#~ "caso o driver md esteja compilado como módulo. Caso o mesmo esteja " +#~ "compilado embutido em seu kernel, a inicialização automática será " +#~ "executada em tempo de inicialização pelo próprio kernel e, portanto, você " +#~ "não deverá e nem precisará escolher esta opção." + +#~ msgid "" +#~ "When the RAID monitor daemon runs, email notifications are sent when a " +#~ "disk belonging to a RAID array fails or changes its status for some " +#~ "reason." +#~ msgstr "" +#~ "Quando o daemon monitorador RAID é executado, notificações via e-mail são " +#~ "enviadas quando um disco pertencente a uma array RAID falha ou muda seu " +#~ "status por qualquer razão." + +#~ msgid "Which user should get the email notification?" +#~ msgstr "Qual usuário deve receber o e-mail de notificação ?" 
diff --git a/debian/po/ru.po b/debian/po/ru.po new file mode 100644 index 00000000..8bf8c5c2 --- /dev/null +++ b/debian/po/ru.po @@ -0,0 +1,189 @@ +# translation of ru.po to Russian +# +# Translators, if you are not familiar with the PO format, gettext +# documentation is worth reading, especially sections dedicated to +# this format, e.g. by running: +# info -n '(gettext)PO Files' +# info -n '(gettext)Header Entry' +# Some information specific to po-debconf are available at +# /usr/share/doc/po-debconf/README-trans +# or http://www.debian.org/intl/l10n/po-debconf/README-trans# +# Developers do not need to manually edit POT or PO files. +# +# Yuri Kozlov , 2006, 2008. +msgid "" +msgstr "" +"Project-Id-Version: 2.6.3+200709292116+4450e59-4\n" +"Report-Msgid-Bugs-To: mdadm@packages.debian.org\n" +"POT-Creation-Date: 2016-07-05 14:12+0200\n" +"PO-Revision-Date: 2008-02-07 21:02+0300\n" +"Last-Translator: Yuri Kozlov \n" +"Language-Team: Russian \n" +"Language: ru\n" +"MIME-Version: 1.0\n" +"Content-Type: text/plain; charset=UTF-8\n" +"Content-Transfer-Encoding: 8bit\n" +"X-Generator: KBabel 1.11.4\n" +"Plural-Forms: nplurals=3; plural=(n%10==1 && n%100!=11 ? 0 : n%10>=2 && n" +"%10<=4 && (n%100<10 || n%100>=20) ? 1 : 2);\n" + +#. Type: boolean +#. Description +#: ../mdadm.templates:2001 +msgid "Should mdadm run monthly redundancy checks of the MD arrays?" +msgstr "" +"Должен ли mdadm запускать ежемесячную проверку избыточности на MD-массивах?" + +#. Type: boolean +#. Description +#: ../mdadm.templates:2001 +msgid "" +"If the kernel supports it (versions greater than 2.6.14), mdadm can " +"periodically check the redundancy of MD arrays (RAIDs). This may be a " +"resource-intensive process, depending on the local setup, but it could help " +"prevent rare cases of data loss. Note that this is a read-only check unless " +"errors are found; if errors are found, mdadm will try to correct them, which " +"may result in write access to the media."
+msgstr "" +"Если это поддерживается ядром (>> 2.6.14), mdadm может периодически " +"проверять избыточность MD массивов (RAID-ов). Это может стать ресурсоёмким " +"процессом в зависимости от настройки, но он может помочь предотвратить " +"редкие случаи потери данных. Заметим, что пока не обнаружено ошибок, работа " +"ведётся в режиме только чтение; если обнаруживается ошибка, mdadm попытается " +"исправить её, что может потребовать прав записи на носитель." + +#. Type: boolean +#. Description +#: ../mdadm.templates:2001 +msgid "" +"The default, if turned on, is to check on the first Sunday of every month at " +"01:06." +msgstr "" +"Если ответить утвердительно, то по умолчанию проверка выполняется в первое " +"воскресенье каждого месяца в 01:06." + +#. Type: boolean +#. Description +#: ../mdadm.templates:3001 +msgid "Do you want to start the MD monitoring daemon?" +msgstr "Запускать демон-монитор MD?" + +#. Type: boolean +#. Description +#: ../mdadm.templates:3001 +msgid "" +"The MD (RAID) monitor daemon sends email notifications in response to " +"important MD events (such as a disk failure)." +msgstr "" +"Демон-монитор MD (RAID) посылает почтовые уведомления в случае возникновения " +"важных событий с MD (таких как отказ диска)." + +#. Type: boolean +#. Description +#: ../mdadm.templates:3001 +msgid "Enabling this option is recommended." +msgstr "Рекомендуется ответить утвердительно." + +#. Type: string +#. Description +#: ../mdadm.templates:4001 +msgid "Recipient for email notifications:" +msgstr "Получатель уведомительных писем:" + +#. Type: string +#. Description +#: ../mdadm.templates:4001 +msgid "" +"Please enter the email address of the user who should get the email " +"notifications for important MD events." +msgstr "" +"Введите адрес электронной почты пользователя, который будет получать " +"почтовые уведомления о важных изменениях в состоянии MD."
+ +#~ msgid "MD arrays needed for the root file system:" +#~ msgstr "MD-массивы, необходимые для корневой файловой системы:" + +#~ msgid "" +#~ "Please enter 'all', 'none', or a space-separated list of devices such as " +#~ "'md0 md1' or 'md/1 md/d0' (the leading '/dev/' can be omitted)." +#~ msgstr "" +#~ "Введите список устройств через пробел, слово 'all' или 'none'. Вы можете " +#~ "не указывать начальную часть пути типа '/dev/', а просто вводить имена " +#~ "устройств, например 'md0 md1' или 'md/1 md/d0'." + +#~ msgid "for internal use - only the long description is needed." +#~ msgstr "" +#~ "для внутреннего пользования - нужно использовать только длинное описание." + +#~ msgid "" +#~ "If the system's root file system is located on an MD array (RAID), it " +#~ "needs to be started early during the boot sequence. If it is located on a " +#~ "logical volume (LVM), which is on MD, all constituent arrays need to be " +#~ "started." +#~ msgstr "" +#~ "Если в системе корневая файловая система расположена на MD-массиве " +#~ "(RAID), он должен быть запущен в самом начале процесса загрузки. Если " +#~ "корневая файловая система расположена на логическом томе (LVM), который " +#~ "расположен на MD, то должны быть запущены все составляющие массивы." + +#~ msgid "" +#~ "If you know exactly which arrays are needed to bring up the root file " +#~ "system, and you want to postpone starting all other arrays to a later " +#~ "point in the boot sequence, enter the arrays to start here. " +#~ "Alternatively, enter 'all' to simply start all available arrays." +#~ msgstr "" +#~ "Если вы точно знаете, какие массивы требуются для получения " +#~ "работоспособной корневой файловой системы и хотите отложить запуск " +#~ "остальных массивов на более поздний момент в процессе загрузки, то " +#~ "введите их здесь. Иначе, введите слово 'all', чтобы просто запустить все " +#~ "доступные массивы."
+ +#~ msgid "" +#~ "If you do not need or want to start any arrays for the root file system, " +#~ "leave the answer blank (or enter 'none'). This may be the case if you are " +#~ "using kernel autostart or do not need any arrays to boot." +#~ msgstr "" +#~ "Если вам это не нужно, или вы хотите запускать все массивы для корневой " +#~ "файловой системы, оставьте это поле пустым (или введите слово 'none'). " +#~ "Этот вариант подходит, если вы используете автоматический запуск из ядра " +#~ "или если для загрузки массивы ненужны." + +#~ msgid "An error occurred: device node does not exist" +#~ msgstr "Произошла ошибка: нода устройства не существует" + +#~ msgid "An error occurred: not a block device" +#~ msgstr "Произошла ошибка: устройство не является блочным" + +#~ msgid "An error occurred: not an MD array" +#~ msgstr "Произошла ошибка: это не MD-массив" + +#~ msgid "An error occurred: array not listed in mdadm.conf file" +#~ msgstr "Произошла ошибка: массив не описан в файле mdadm.conf" + +#~ msgid "Start arrays not listed in mdadm.conf?" +#~ msgstr "Запустить массивы, неописанные в mdadm.conf?" + +#~ msgid "" +#~ "The specified array (${array}) is not listed in the configuration file " +#~ "(${config}). Therefore, it cannot be started during boot, unless you " +#~ "correct the configuration file and recreate the initial ramdisk." +#~ msgstr "" +#~ "Указанный вами массив (${array}) не описан в конфигурационном файле " +#~ "(${config}). Поэтому он не может быть запущен при старте машины, пока вы " +#~ "не исправите конфигурационный файл и не пересоздадите первоначальный " +#~ "ramdisk." + +#~ msgid "" +#~ "This warning is only relevant if you need arrays to be started from the " +#~ "initial ramdisk to be able to boot. If you use kernel autostarting, or do " +#~ "not need any arrays to be started as early as the initial ramdisk is " +#~ "loaded, you can simply continue.
Alternatively, choose not to continue " +#~ "and enter 'none' when prompted which arrays to start from the initial " +#~ "ramdisk." +#~ msgstr "" +#~ "Данное предупреждение уместно только, если вам требуется запускать " +#~ "массивы из первоначального ramdisk для загрузки машины. Если вы " +#~ "используете автоматический запуск из ядра или вам не нужны массивы для " +#~ "загрузки на этапе загрузки первоначального ramdisk, вы можете просто " +#~ "продолжить. Иначе, выберите не продолжать и введите 'none', когда вам " +#~ "предложат выбрать массивы для запуска из первоначального ramdisk." diff --git a/debian/po/sk.po b/debian/po/sk.po new file mode 100644 index 00000000..c84247ba --- /dev/null +++ b/debian/po/sk.po @@ -0,0 +1,176 @@ +# Slovak translations for mdadm package +# Slovenské preklady pre balík mdadm. +# Copyright (C) 2011 THE mdadm'S COPYRIGHT HOLDER +# This file is distributed under the same license as the mdadm package. +# Automatically generated, 2011. +# Slavko , 2011. +# +msgid "" +msgstr "" +"Project-Id-Version: mdadm 3.2.2-1\n" +"Report-Msgid-Bugs-To: mdadm@packages.debian.org\n" +"POT-Creation-Date: 2016-07-05 14:12+0200\n" +"PO-Revision-Date: 2011-09-18 11:22+0200\n" +"Last-Translator: Slavko \n" +"Language-Team: Slovak \n" +"Language: sk\n" +"MIME-Version: 1.0\n" +"Content-Type: text/plain; charset=UTF-8\n" +"Content-Transfer-Encoding: 8bit\n" +"Plural-Forms: nplurals=3; plural=(n==1) ? 0 : (n>=2 && n<=4) ? 1 : 2;\n" + +#. Type: boolean +#. Description +#: ../mdadm.templates:2001 +msgid "Should mdadm run monthly redundancy checks of the MD arrays?" +msgstr "Má mdadm spúšťať mesačnú kontrolu redundancie polí MD?" + +#. Type: boolean +#. Description +#: ../mdadm.templates:2001 +msgid "" +"If the kernel supports it (versions greater than 2.6.14), mdadm can " +"periodically check the redundancy of MD arrays (RAIDs).
This may be a " +"resource-intensive process, depending on the local setup, but it could help " +"prevent rare cases of data loss. Note that this is a read-only check unless " +"errors are found; if errors are found, mdadm will try to correct them, which " +"may result in write access to the media." +msgstr "" +"Ak to jadro podporuje (verzie novšie ako 2.6.14), mdadm môže periodicky " +"kontrolovať redundanciu polí MD (RAIDov). Tento proces môže byť (v " +"závislosti od lokálneho nastavenia) náročný na zdroje systému, ale môže " +"pomôcť pri predchádzaní vzácnym prípadom straty dát. Pamätajte, že, pokiaľ " +"nie sú nájdené chyby, je to kontrola read-only, až keď sú nájdené chyby, " +"pokúsi sa ich mdadm opraviť, čo môže mať za následok zápis na médium." + +#. Type: boolean +#. Description +#: ../mdadm.templates:2001 +msgid "" +"The default, if turned on, is to check on the first Sunday of every month at " +"01:06." +msgstr "" +"Predvolene je vypnuté, ak túto možnosť zapnete, bude kontrola vykonávaná " +"každú prvú nedeľu mesiaca o 01:06." + +#. Type: boolean +#. Description +#: ../mdadm.templates:3001 +msgid "Do you want to start the MD monitoring daemon?" +msgstr "Chcete spustiť démona monitorovania MD?" + +#. Type: boolean +#. Description +#: ../mdadm.templates:3001 +msgid "" +"The MD (RAID) monitor daemon sends email notifications in response to " +"important MD events (such as a disk failure)." +msgstr "" +"Monitorovací démon MD (RAID) posiela upozornenia emailom, ako reakcie na " +"dôležité udalosti MD (napr. zlyhanie disku)." + +#. Type: boolean +#. Description +#: ../mdadm.templates:3001 +msgid "Enabling this option is recommended." +msgstr "Povolenie tejto možnosti je odporúčané." + +#. Type: string +#. Description +#: ../mdadm.templates:4001 +msgid "Recipient for email notifications:" +msgstr "Príjemca emailových upozornení:" + +#. Type: string +#.
Description +#: ../mdadm.templates:4001 +msgid "" +"Please enter the email address of the user who should get the email " +"notifications for important MD events." +msgstr "" +"Prosím, zadajte emailovú adresu používateľa, ktorý má dostávať emailové " +"upozornenia na dôležité udalosti MD." + +#~ msgid "MD arrays needed for the root file system:" +#~ msgstr "Polia MD, potrebné pre koreň súborového systému:" + +#~ msgid "" +#~ "Please enter 'all', 'none', or a space-separated list of devices such as " +#~ "'md0 md1' or 'md/1 md/d0' (the leading '/dev/' can be omitted)." +#~ msgstr "" +#~ "Prosím, zadajte „all”, „none” alebo medzerou oddelený zoznam zariadení, " +#~ "napr. „md0 md1” alebo „md/1 md/d0” (počiatočné „/dev/” môže byť " +#~ "vynechané)." + +#~ msgid "for internal use - only the long description is needed." +#~ msgstr "na interné použitie – potrebný je len dlhý popis." + +#~ msgid "" +#~ "If the system's root file system is located on an MD array (RAID), it " +#~ "needs to be started early during the boot sequence. If it is located on a " +#~ "logical volume (LVM), which is on MD, all constituent arrays need to be " +#~ "started." +#~ msgstr "" +#~ "Ak je koreň súborového systému umiestnený na poli MD (RAID), musí byť " +#~ "spustený počas zavádzania systému. Ak je koreň umiestnený na logickom " +#~ "zväzku (LVM), ktorý je na MD, musia byť spustené všetky súvisiace polia." + +#~ msgid "" +#~ "If you know exactly which arrays are needed to bring up the root file " +#~ "system, and you want to postpone starting all other arrays to a later " +#~ "point in the boot sequence, enter the arrays to start here. " +#~ "Alternatively, enter 'all' to simply start all available arrays." +#~ msgstr "" +#~ "Ak viete presne, ktoré polia sú potrebné na pripojenie koreňa súborového " +#~ "systému a chcete odložiť štart všetkých ostatných polí na neskorší okamih " +#~ "zavádzania, zadajte tu polia, ktoré majú byť spustené.
Alebo zadajte " +#~ "„all”, čím budú jednoducho spustené všetky dostupné polia." + +#~ msgid "" +#~ "If you do not need or want to start any arrays for the root file system, " +#~ "leave the answer blank (or enter 'none'). This may be the case if you are " +#~ "using kernel autostart or do not need any arrays to boot." +#~ msgstr "" +#~ "Ak pre koreň súborového systému nepotrebujete alebo nechcete spúšťať " +#~ "žiadne polia, nechajte odpoveď prázdnu (alebo zadajte „none”). Tento " +#~ "prípad môže nastať, ak používate automatický štart polí priamo v jadre " +#~ "alebo nepotrebujete pri zavádzaní žiadne polia." + +#~ msgid "An error occurred: device node does not exist" +#~ msgstr "Nastala chyba: uzol zariadenia neexistuje" + +#~ msgid "An error occurred: not a block device" +#~ msgstr "Nastala chyba: nie je blokové zariadenie" + +#~ msgid "An error occurred: not an MD array" +#~ msgstr "Nastala chyba: nie je pole MD" + +#~ msgid "An error occurred: array not listed in mdadm.conf file" +#~ msgstr "Nastala chyba: pole nie je uvedené v súbore mdadm.conf" + +#~ msgid "Start arrays not listed in mdadm.conf?" +#~ msgstr "Spustiť polia, ktoré nie sú uvedené v mdadm.conf?" + +#~ msgid "" +#~ "The specified array (${array}) is not listed in the configuration file " +#~ "(${config}). Therefore, it cannot be started during boot, unless you " +#~ "correct the configuration file and recreate the initial ramdisk." +#~ msgstr "" +#~ "Zadané pole (${array}) nie je uvedené v konfiguračnom súbore (${config}), " +#~ "a preto nemôže byť spustené počas zavádzania, až kým neopravíte " +#~ "konfiguračný súbor a nevytvoríte nový počiatočný ramdisk (initrd)." + +#~ msgid "" +#~ "This warning is only relevant if you need arrays to be started from the " +#~ "initial ramdisk to be able to boot. If you use kernel autostarting, or do " +#~ "not need any arrays to be started as early as the initial ramdisk is " +#~ "loaded, you can simply continue.
Alternatively, choose not to continue " +#~ "and enter 'none' when prompted which arrays to start from the initial " +#~ "ramdisk." +#~ msgstr "" +#~ "Toto varovanie je dôležité, len ak potrebujete aby boli polia spúšťané z " +#~ "počiatočného ramdisku, aby boli dostupné počas zavádzania. Ak používate " +#~ "automatické spúšťanie polí priamo z jadra, alebo ak nepotrebujte aby boli " +#~ "polia spúšťané tak skoro (z počiatočného ramdisku), môžete prosto " +#~ "pokračovať. Alebo môžete zvoliť nepokračovať a odpovedať „none” na " +#~ "otázku, ktoré polia majú byť spúšťané z počiatočného ramdisku." diff --git a/debian/po/sv.po b/debian/po/sv.po new file mode 100644 index 00000000..ed9f1f21 --- /dev/null +++ b/debian/po/sv.po @@ -0,0 +1,186 @@ +# translation of mdadm_2.6.7-3_sv.po to Swedish +# Translators, if you are not familiar with the PO format, gettext +# documentation is worth reading, especially sections dedicated to +# this format, e.g. by running: +# info -n '(gettext)PO Files' +# info -n '(gettext)Header Entry' +# Some information specific to po-debconf are available at +# /usr/share/doc/po-debconf/README-trans +# or http://www.debian.org/intl/l10n/po-debconf/README-trans +# Developers do not need to manually edit POT or PO files. +# +# Martin Ågren , 2008. +msgid "" +msgstr "" +"Project-Id-Version: mdadm_2.6.7-3_sv\n" +"Report-Msgid-Bugs-To: mdadm@packages.debian.org\n" +"POT-Creation-Date: 2016-07-05 14:12+0200\n" +"PO-Revision-Date: 2008-07-23 18:34+0200\n" +"Last-Translator: Martin Ågren \n" +"Language-Team: Swedish \n" +"Language: sv\n" +"MIME-Version: 1.0\n" +"Content-Type: text/plain; charset=ISO-8859-1\n" +"Content-Transfer-Encoding: 8bit\n" +"X-Generator: KBabel 1.11.4\n" +"Plural-Forms: nplurals=2; plural=(n != 1);\n" + +#. Type: boolean +#. Description +#: ../mdadm.templates:2001 +msgid "Should mdadm run monthly redundancy checks of the MD arrays?" +msgstr "Ska mdadm köra månatliga redundanskontroller av MD-kedjorna?" + +#. Type: boolean +#.
Description +#: ../mdadm.templates:2001 +msgid "" +"If the kernel supports it (versions greater than 2.6.14), mdadm can " +"periodically check the redundancy of MD arrays (RAIDs). This may be a " +"resource-intensive process, depending on the local setup, but it could help " +"prevent rare cases of data loss. Note that this is a read-only check unless " +"errors are found; if errors are found, mdadm will try to correct them, which " +"may result in write access to the media." +msgstr "" +"Om din kärna har stöd för det (versioner senare än 2.6.14), kan mdadm " +"periodvis kontrollera redundansen för dina MD-kedjor (RAID). Det här kan " +"vara en resurskrävande process, beroende på din konfiguration, men den kan " +"hjälpa till att förhindra ovanliga fall av dataförluster. Observera att det " +"är en skrivskyddad kontroll såvida inte fel påträffas; om fel hittas kommer " +"mdadm försöka att rätta till dem, vilket kan leda till skrivåtkomst till " +"mediet." + +#. Type: boolean +#. Description +#: ../mdadm.templates:2001 +msgid "" +"The default, if turned on, is to check on the first Sunday of every month at " +"01:06." +msgstr "" +"Standardvärdet, om påslagen, är att kontrollera på den första söndagen i " +"varje månad klockan 01.06." + +#. Type: boolean +#. Description +#: ../mdadm.templates:3001 +msgid "Do you want to start the MD monitoring daemon?" +msgstr "Vill du starta MD-övervakningsdemonen?" + +#. Type: boolean +#. Description +#: ../mdadm.templates:3001 +msgid "" +"The MD (RAID) monitor daemon sends email notifications in response to " +"important MD events (such as a disk failure)." +msgstr "" +"MD-övervakningsdemonen (RAID) skickar e-postnotifieringar för viktiga MD-" +"händelser (såsom ett diskfel)." + +#. Type: boolean +#. Description +#: ../mdadm.templates:3001 +msgid "Enabling this option is recommended." +msgstr "Att aktivera denna funktion rekommenderas." + +#. Type: string +#. 
Description +#: ../mdadm.templates:4001 +msgid "Recipient for email notifications:" +msgstr "Mottagare av e-postnotifieringar:" + +#. Type: string +#. Description +#: ../mdadm.templates:4001 +msgid "" +"Please enter the email address of the user who should get the email " +"notifications for important MD events." +msgstr "" +"Ange e-postadressen till den användare som ska ta emot e-postnotifieringar " +"för dessa viktiga MD-händelser." + +#~ msgid "MD arrays needed for the root file system:" +#~ msgstr "MD-kedjor som behövs för rotfilsystemet:" + +#~ msgid "" +#~ "Please enter 'all', 'none', or a space-separated list of devices such as " +#~ "'md0 md1' or 'md/1 md/d0' (the leading '/dev/' can be omitted)." +#~ msgstr "" +#~ "Ange \"all\", \"none\" eller en blankstegsseparerad lista på enheter, " +#~ "såsom \"md0 md1\" eller \"md/1 md/d0\" (det inledande \"/dev/\" kan " +#~ "uteslutas)." + +#~ msgid "for internal use - only the long description is needed." +#~ msgstr "för intern användning - endast den långa beskrivningen behövs." + +#~ msgid "" +#~ "If the system's root file system is located on an MD array (RAID), it " +#~ "needs to be started early during the boot sequence. If it is located on a " +#~ "logical volume (LVM), which is on MD, all constituent arrays need to be " +#~ "started." +#~ msgstr "" +#~ "Om ditt system har sitt rotfilsystem på en MD-kedja (RAID) behöver den " +#~ "startas upp tidigt under uppstartssekvensen. Om ditt rotfilsystem finns " +#~ "på en logisk volym (LVM), vilket är på MD, behöver alla bestående kedjor " +#~ "startas." + +#~ msgid "" +#~ "If you know exactly which arrays are needed to bring up the root file " +#~ "system, and you want to postpone starting all other arrays to a later " +#~ "point in the boot sequence, enter the arrays to start here. " +#~ "Alternatively, enter 'all' to simply start all available arrays."
+#~ msgstr "" +#~ "Om du vet exakt vilka kedjor som behövs för att ta upp rotfilsystemet, " +#~ "och du vill skjuta upp uppstarten för alla andra kedjor till en senare " +#~ "tidpunkt i uppstartssekvensen, ange vilka kedjor som ska starta här. " +#~ "Alternativt, ange \"all\" för att helt enkelt starta alla tillgängliga " +#~ "kedjor." + +#~ msgid "" +#~ "If you do not need or want to start any arrays for the root file system, " +#~ "leave the answer blank (or enter 'none'). This may be the case if you are " +#~ "using kernel autostart or do not need any arrays to boot." +#~ msgstr "" +#~ "Om du inte behöver eller vill starta några kedjor för rotfilsystemet, " +#~ "lämna svaret blankt (eller ange \"none\"). Detta kan vara fallet om du " +#~ "använder kärnans automatstart eller inte behöver några kedjor för att " +#~ "starta upp." + +#~ msgid "An error occurred: device node does not exist" +#~ msgstr "Ett fel inträffade: enhetsnoden finns inte" + +#~ msgid "An error occurred: not a block device" +#~ msgstr "Ett fel inträffade: inte en blockenhet" + +#~ msgid "An error occurred: not an MD array" +#~ msgstr "Ett fel inträffade: inte en MD-kedja" + +#~ msgid "An error occurred: array not listed in mdadm.conf file" +#~ msgstr "Ett fel inträffade: kedjan är inte listad i filen mdadm.conf" + +#~ msgid "Start arrays not listed in mdadm.conf?" +#~ msgstr "Starta kedjor som inte är listade i mdadm.conf?" + +#~ msgid "" +#~ "The specified array (${array}) is not listed in the configuration file " +#~ "(${config}). Therefore, it cannot be started during boot, unless you " +#~ "correct the configuration file and recreate the initial ramdisk." +#~ msgstr "" +#~ "Kedjan du har angivit (${array}) är inte listad i konfigurationsfilen " +#~ "(${config}). Därför kan den inte startas under systemets uppstart, såvida " +#~ "du inte rättar till konfigurationsfilen och återskapar den initiala " +#~ "ramdisken."
+ +#~ msgid "" +#~ "This warning is only relevant if you need arrays to be started from the " +#~ "initial ramdisk to be able to boot. If you use kernel autostarting, or do " +#~ "not need any arrays to be started as early as the initial ramdisk is " +#~ "loaded, you can simply continue. Alternatively, choose not to continue " +#~ "and enter 'none' when prompted which arrays to start from the initial " +#~ "ramdisk." +#~ msgstr "" +#~ "Den här varningen är endast relevant om du behöver kedjor som ska startas " +#~ "från den initiala ramdisken för att kunna starta upp systemet. Om du " +#~ "använder kärnans automatstart, eller inte behöver starta några kedjor så " +#~ "tidigt som när de initiala ramdisken läses in, kan du helt enkelt " +#~ "fortsätta. Alternativt, välj att inte fortsätta och ange \"none\" när " +#~ "frågan om vilka kedjor som ska startas från den initiala ramdisken ställs." diff --git a/debian/po/templates.pot b/debian/po/templates.pot new file mode 100644 index 00000000..3860e909 --- /dev/null +++ b/debian/po/templates.pot @@ -0,0 +1,78 @@ +# SOME DESCRIPTIVE TITLE. +# Copyright (C) YEAR THE PACKAGE'S COPYRIGHT HOLDER +# This file is distributed under the same license as the mdadm package. +# FIRST AUTHOR , YEAR. +# +#, fuzzy +msgid "" +msgstr "" +"Project-Id-Version: mdadm\n" +"Report-Msgid-Bugs-To: mdadm@packages.debian.org\n" +"POT-Creation-Date: 2016-07-05 14:12+0200\n" +"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" +"Last-Translator: FULL NAME \n" +"Language-Team: LANGUAGE \n" +"Language: \n" +"MIME-Version: 1.0\n" +"Content-Type: text/plain; charset=CHARSET\n" +"Content-Transfer-Encoding: 8bit\n" + +#. Type: boolean +#. Description +#: ../mdadm.templates:2001 +msgid "Should mdadm run monthly redundancy checks of the MD arrays?" +msgstr "" + +#. Type: boolean +#. Description +#: ../mdadm.templates:2001 +msgid "" +"If the kernel supports it (versions greater than 2.6.14), mdadm can " +"periodically check the redundancy of MD arrays (RAIDs). 
This may be a " +"resource-intensive process, depending on the local setup, but it could help " +"prevent rare cases of data loss. Note that this is a read-only check unless " +"errors are found; if errors are found, mdadm will try to correct them, which " +"may result in write access to the media." +msgstr "" + +#. Type: boolean +#. Description +#: ../mdadm.templates:2001 +msgid "" +"The default, if turned on, is to check on the first Sunday of every month at " +"01:06." +msgstr "" + +#. Type: boolean +#. Description +#: ../mdadm.templates:3001 +msgid "Do you want to start the MD monitoring daemon?" +msgstr "" + +#. Type: boolean +#. Description +#: ../mdadm.templates:3001 +msgid "" +"The MD (RAID) monitor daemon sends email notifications in response to " +"important MD events (such as a disk failure)." +msgstr "" + +#. Type: boolean +#. Description +#: ../mdadm.templates:3001 +msgid "Enabling this option is recommended." +msgstr "" + +#. Type: string +#. Description +#: ../mdadm.templates:4001 +msgid "Recipient for email notifications:" +msgstr "" + +#. Type: string +#. Description +#: ../mdadm.templates:4001 +msgid "" +"Please enter the email address of the user who should get the email " +"notifications for important MD events." +msgstr "" diff --git a/debian/po/vi.po b/debian/po/vi.po new file mode 100644 index 00000000..cf6c4b17 --- /dev/null +++ b/debian/po/vi.po @@ -0,0 +1,179 @@ +# Vietnamese Translation for mdadm. +# Copyright © 2008 Free Software Foundation, Inc. +# Clytie Siddall , 2005-2008. 
+# +msgid "" +msgstr "" +"Project-Id-Version: mdadm 2.6.3+200709292116+4450e59-4\n" +"Report-Msgid-Bugs-To: mdadm@packages.debian.org\n" +"POT-Creation-Date: 2016-07-05 14:12+0200\n" +"PO-Revision-Date: 2008-02-23 17:40+1030\n" +"Last-Translator: Clytie Siddall \n" +"Language-Team: Vietnamese \n" +"Language: vi\n" +"MIME-Version: 1.0\n" +"Content-Type: text/plain; charset=utf-8\n" +"Content-Transfer-Encoding: 8bit\n" +"Plural-Forms: nplurals=1; plural=0;\n" +"X-Generator: LocFactoryEditor 1.7b3\n" + +#. Type: boolean +#. Description +#: ../mdadm.templates:2001 +msgid "Should mdadm run monthly redundancy checks of the MD arrays?" +msgstr "" +"mdadm có nên chạy việc kiểm tra thừa hàng tháng trên những mảng MD không?" + +#. Type: boolean +#. Description +#: ../mdadm.templates:2001 +msgid "" +"If the kernel supports it (versions greater than 2.6.14), mdadm can " +"periodically check the redundancy of MD arrays (RAIDs). This may be a " +"resource-intensive process, depending on the local setup, but it could help " +"prevent rare cases of data loss. Note that this is a read-only check unless " +"errors are found; if errors are found, mdadm will try to correct them, which " +"may result in write access to the media." +msgstr "" +"Nếu hạt nhân có phải há»— trợ (các phiên bản sau 2.6.14) thì mdadm có thể kiểm " +"tra theo chu kỳ tình thừa của các mảng MD (RAID). Tiến trình này có thể " +"chiếm nhiá»u tài nguyên hệ thống, phụ thuá»™c vào thiết lập cục bá»™, nhÆ°ng nó có " +"thể giúp ngăn cản trÆ°á»ng hợp mất dữ liệu (ít có). Ghi chú rằng việc kiểm tra " +"này là chỉ Ä‘á»c: gặp lá»—i thì mdadm sẽ thá»­ sá»­a chữa, mà có thể gây ra truy cập " +"ghi vào vật chứa." + +#. Type: boolean +#. Description +#: ../mdadm.templates:2001 +msgid "" +"The default, if turned on, is to check on the first Sunday of every month at " +"01:06." 
+msgstr "" +"Giá trị mặc định, nếu được bật, là chạy những việc kiểm tra vào ngày hôm Chủ " +"Nhật thứ nhất của má»—i tháng, vào lúc 01:06 giá» (giá» ti)." + +#. Type: boolean +#. Description +#: ../mdadm.templates:3001 +msgid "Do you want to start the MD monitoring daemon?" +msgstr "Bạn có muốn khởi chạy trình ná»n theo dõi MD không?" + +#. Type: boolean +#. Description +#: ../mdadm.templates:3001 +msgid "" +"The MD (RAID) monitor daemon sends email notifications in response to " +"important MD events (such as a disk failure)." +msgstr "" +"Trình ná»n theo dõi MD (RAID) gá»­i thÆ° thông báo hưởng ứng dữ kiện MD quan " +"trá»ng (v.d. Ä‘Ä©a bị há»ng)." + +#. Type: boolean +#. Description +#: ../mdadm.templates:3001 +msgid "Enabling this option is recommended." +msgstr "Khuyên bạn hiệu lá»±c tùy chá»n này." + +#. Type: string +#. Description +#: ../mdadm.templates:4001 +msgid "Recipient for email notifications:" +msgstr "NgÆ°á»i nhận thÆ° thông báo :" + +#. Type: string +#. Description +#: ../mdadm.templates:4001 +msgid "" +"Please enter the email address of the user who should get the email " +"notifications for important MD events." +msgstr "" +"Hãy nhập địa chỉ thÆ° của ngÆ°á»i dùng nên nhận thÆ° thông báo vá» dữ kiện MD " +"quan trá»ng." + +#~ msgid "MD arrays needed for the root file system:" +#~ msgstr "Các mảng MD cần thiết cho hệ thống tập tin gốc:" + +#~ msgid "" +#~ "Please enter 'all', 'none', or a space-separated list of devices such as " +#~ "'md0 md1' or 'md/1 md/d0' (the leading '/dev/' can be omitted)." +#~ msgstr "" +#~ "Hãy nhập « all » (tất cả), « none » (không có), hoặc má»™t danh sách các " +#~ "thiết bị định giá»›i bằng dấu cách nhÆ° « md0 md1 » hoặc « md/1 md/d0 » (có " +#~ "thể bá» sót phần « /dev/ » Ä‘i trÆ°á»›c)." + +#~ msgid "for internal use - only the long description is needed." +#~ msgstr "để sá»­ dụng ná»™i bá»™ — chỉ cần thiết mô tả dài." 
+ +#~ msgid "" +#~ "If the system's root file system is located on an MD array (RAID), it " +#~ "needs to be started early during the boot sequence. If it is located on a " +#~ "logical volume (LVM), which is on MD, all constituent arrays need to be " +#~ "started." +#~ msgstr "" +#~ "Nếu hệ thống tập tin gốc của hệ thống nằm trên má»™t mảng MD (RAID) thì cần " +#~ "phải khởi chạy nó sá»›m trong tiến trình khởi Ä‘á»™ng. Nếu nó nằm trên má»™t " +#~ "khối tin hợp lý (LVM) mà lần lượt nằm trên má»™t MD thì cần phải khởi chạy " +#~ "tất cả các mảng thành phần." + +#~ msgid "" +#~ "If you know exactly which arrays are needed to bring up the root file " +#~ "system, and you want to postpone starting all other arrays to a later " +#~ "point in the boot sequence, enter the arrays to start here. " +#~ "Alternatively, enter 'all' to simply start all available arrays." +#~ msgstr "" +#~ "Nếu bạn biết chính xác những mảng nào cần thiết để kích hoạt hệ thống tập " +#~ "tin gốc, và bạn muốn hoãn việc khởi chạy các mảng khác tá»›i má»™t Ä‘iểm sau " +#~ "trong dãy khởi Ä‘á»™ng, hãy nhập vào đây các mảng cần khởi chạy. Hoặc nhập « " +#~ "all » (tất cả) để khởi chạy Ä‘Æ¡n giản tất cả các mảng sẵn sàng. " + +#~ msgid "" +#~ "If you do not need or want to start any arrays for the root file system, " +#~ "leave the answer blank (or enter 'none'). This may be the case if you are " +#~ "using kernel autostart or do not need any arrays to boot." +#~ msgstr "" +#~ "Nếu bạn không cần hoặc muốn khởi chạy mảng nào cho hệ thống tập tin gốc, " +#~ "hãy bá» trống câu trả lá»i này (hoặc nhập « none » [không có]). TrÆ°á»ng hợp " +#~ "này có thể xảy ra nếu bạn sá»­ dụng khả năng tá»± Ä‘á»™ng khởi Ä‘á»™ng hạt nhân " +#~ "(kernel autostart), hoặc không cần mảng nào để khởi Ä‘á»™ng máy tính." 
+ +#~ msgid "An error occurred: device node does not exist" +#~ msgstr "Gặp lá»—i: nút thiết bị không tồn tại" + +#~ msgid "An error occurred: not a block device" +#~ msgstr "Gặp lá»—i: không phải là má»™t thiết bị khối" + +#~ msgid "An error occurred: not an MD array" +#~ msgstr "Gặp lá»—i: không phải là má»™t mảng MD" + +#~ msgid "An error occurred: array not listed in mdadm.conf file" +#~ msgstr "" +#~ "Gặp lá»—i: mảng không được liệt kê trong tập tin cấu hình « mdadm.conf »" + +#~ msgid "Start arrays not listed in mdadm.conf?" +#~ msgstr "" +#~ "Khởi chạy các mảng không được liệt kê trong tập tin cấu hình « mdadm.conf " +#~ "» không?" + +#~ msgid "" +#~ "The specified array (${array}) is not listed in the configuration file " +#~ "(${config}). Therefore, it cannot be started during boot, unless you " +#~ "correct the configuration file and recreate the initial ramdisk." +#~ msgstr "" +#~ "Mảng bạn đã xác định (${array}) không được liệt kê trong tập tin cấu hình " +#~ "${config}. Vì vậy nó không thể được khởi chạy trong khi khởi Ä‘á»™ng, nếu " +#~ "bạn không sá»­a tập tin cấu hình và tạo lại Ä‘Ä©a RAM đầu tiên." + +#~ msgid "" +#~ "This warning is only relevant if you need arrays to be started from the " +#~ "initial ramdisk to be able to boot. If you use kernel autostarting, or do " +#~ "not need any arrays to be started as early as the initial ramdisk is " +#~ "loaded, you can simply continue. Alternatively, choose not to continue " +#~ "and enter 'none' when prompted which arrays to start from the initial " +#~ "ramdisk." +#~ msgstr "" +#~ "Cảnh báo này chỉ là thích hợp nếu bạn cần thiết mảng được khởi chạy từ " +#~ "Ä‘Ä©a RAM đầu tiên, để có thể khởi Ä‘á»™ng được. Nếu bạn sá»­ dụng khả năng tá»± " +#~ "Ä‘á»™ng khởi chạy hạt nhân (kernel autostart), hoặc không cần mảng nào được " +#~ "khởi chạy má»™t khi nạp Ä‘Ä©a RAM đầu tiên, bạn Ä‘Æ¡n giản có thể tiếp tục lại. 
" +#~ "Hoặc chá»n không tiếp tục, và nhập « none » (không có) khi được nhắc nhập " +#~ "những mảng nào cần khởi chạy từ Ä‘Ä©a RAM đầu tiên." diff --git a/debian/presubj b/debian/presubj new file mode 100644 index 00000000..103208df --- /dev/null +++ b/debian/presubj @@ -0,0 +1,32 @@ +Reporting bugs against mdadm +============================ +Before reporting bugs against mdadm, please read the README documents, as well +as the FAQ in /usr/share/doc/mdadm . Most issues that are reported against the +mdadm package are adequately answered therein. + +In particular, please do not file bugs about mdadm assembling arrays too +early, e.g. when the driver is not yet ready and device nodes do not exist. +Check FAQ item 27 about use of the rootdelay parameter instead. + +Also, please check out http://bugs.debian.org/mdadm and make sure that the +issue you are facing has not already been reported. + +If you are not sure that the answer you are seeking is in those files, or you +are not sure that you are facing a genuine bug, please approach +debian-user@lists.debian.org or linux-raid@vger.kernel.org with your +question(s). + +Gathering information relevant to mdadm as root +=============================================== +If you are not reporting bugs as root (which you should not), you will be +prompted to give permission to run a script to collect relevant information +from your system as the root user. Only the root user has access to some +information that might be relevant to the bug report you are about to file. + +** Please give permission to run the script as root when asked momentarily. + +If you would rather obtain the same information manually, you can run + /usr/share/bug/mdadm/script 3>&1 +as root and include or attach the output. + + -- martin f. 
krafft Mon, 20 Jul 2009 15:02:48 +0200 diff --git a/debian/rules b/debian/rules new file mode 100755 index 00000000..3a1b5a6f --- /dev/null +++ b/debian/rules @@ -0,0 +1,103 @@ +#!/usr/bin/make -f +# Copyright © 2001-2005 Mario Jou/3en +# Copyright © 2005-2008 Martin F. Krafft +# Distributable under the terms of the GNU GPL version 2. +# + +#export DH_VERBOSE=1 + +export CROSS_COMPILE=$(shell dpkg-architecture -qDEB_HOST_GNU_TYPE)- +LDFLAGS = $(shell dpkg-buildflags --get LDFLAGS) +CXFLAGS = $(shell dpkg-buildflags --get CFLAGS) \ + $(shell dpkg-buildflags --get CPPFLAGS) +BUILDFLAGS = CXFLAGS="$(CXFLAGS)" LDFLAGS="$(LDFLAGS)" DEBIAN=yes +DESTDIR = $(CURDIR)/debian/mdadm +DESTDIR_UDEB = $(DESTDIR)-udeb + +build-arch: build-stamp + +build-stamp: + dh_testdir + $(MAKE) all $(BUILDFLAGS) CONFFILE=/etc/mdadm/mdadm.conf CONFFILE2=/etc/mdadm.conf + touch $@ + +# udeb rules should go, the only diff is the conffile location, +# and d-i specifies path explicitly when needed + +udeb/dir-stamp: + rm -rf udeb + mkdir udeb + ln *.[ch] Makefile udeb/ + touch $@ + +build-arch: udeb/build-stamp + +udeb/build-stamp: udeb/dir-stamp + dh_testdir + $(MAKE) -C udeb mdadm mdmon $(BUILDFLAGS) CONFFILE=/tmp/mdadm.conf + touch $@ + +clean: + dh_testdir + rm -f build-stamp + $(MAKE) clean + rm -rf udeb + dh_clean + +install-arch: build-arch + dh_testdir + dh_prep + dh_installdirs + + $(MAKE) install install-systemd DESTDIR=$(DESTDIR) + + mkdir -p $(DESTDIR)/etc/mdadm + install -Dm0755 debian/initramfs/hook \ + $(DESTDIR)/usr/share/initramfs-tools/hooks/mdadm + install -Dm0755 debian/initramfs/script.local-block \ + $(DESTDIR)/usr/share/initramfs-tools/scripts/local-block/mdadm + install -Dm0755 debian/initramfs/script.local-bottom \ + $(DESTDIR)/usr/share/initramfs-tools/scripts/local-bottom/mdadm + install -Dm0644 debian/mdadm.modules \ + $(DESTDIR)/etc/modprobe.d/mdadm.conf + + install -Dm0755 debian/mkconf $(DESTDIR)/usr/share/mdadm/mkconf + install -Dm0755 debian/checkarray 
$(DESTDIR)/usr/share/mdadm/checkarray + install -Dm0755 debian/bugscript $(DESTDIR)/usr/share/bug/mdadm/script + install -Dm0644 debian/presubj $(DESTDIR)/usr/share/bug/mdadm/presubj + + install -Dm0755 udeb/mdadm $(DESTDIR_UDEB)/sbin/mdadm + install -Dm0755 udeb/mdmon $(DESTDIR_UDEB)/sbin/mdmon + install -Dm0644 udev-md-raid-arrays.rules $(DESTDIR_UDEB)/lib/udev/rules.d/63-md-raid-arrays.rules + +binary-arch: install-arch + dh_testdir + dh_testroot + dh_installdebconf + dh_installdocs + dh_installexamples -pmdadm mdadm.conf-example misc/syslog-events + dh_installinit --init-script=mdadm-waitidle --no-start -- stop 98 0 6 . + dh_link -pmdadm /dev/null /lib/systemd/system/mdadm-waitidle.service + dh_installinit -- defaults 25 + dh_link -pmdadm /dev/null /lib/systemd/system/mdadm.service + dh_installman + dh_installcron + dh_installchangelogs ChangeLog + dh_installlogcheck + dh_link + dh_strip + dh_compress + dh_fixperms + dh_installdeb + dh_shlibdeps + dh_gencontrol + dh_md5sums + dh_builddeb + +build: build-arch +install: install-arch +binary: binary-arch +build-indep: +install-indep: +binary-indep: +.PHONY: clean build build-indep build-arch binary binary-indep binary-arch install install-indep install-arch diff --git a/debian/source/format b/debian/source/format new file mode 100644 index 00000000..163aaf8d --- /dev/null +++ b/debian/source/format @@ -0,0 +1 @@ +3.0 (quilt) diff --git a/debian/watch b/debian/watch new file mode 100644 index 00000000..1b1172a0 --- /dev/null +++ b/debian/watch @@ -0,0 +1,2 @@ +version=3 +http://www.kernel.org/pub/linux/utils/raid/mdadm/mdadm-([.[:digit:]]+).tar.gz diff --git a/dlink.c b/dlink.c new file mode 100644 index 00000000..3efa94b7 --- /dev/null +++ b/dlink.c @@ -0,0 +1,74 @@ + +/* doubly linked lists */ +/* This is free software. No strings attached. 
No copyright claimed */ + +#include <unistd.h> +#include <stdlib.h> +#include <string.h> +#ifdef __dietlibc__ +char *strncpy(char *dest, const char *src, size_t n) __THROW; +#endif +void *xcalloc(size_t num, size_t size); +#include "dlink.h" +/* Allocate an empty list: a payload-less head element whose links point at itself. */ +void *dl_head(void) +{ + void *h; + h = dl_alloc(0); + dl_next(h) = h; + dl_prev(h) = h; + return h; +} +/* Release an element; the underlying allocation starts one struct __dl_head before v. */ +void dl_free(void *v) +{ + struct __dl_head *vv = v; + free(vv-1); +} +/* Make v a detached element: both of its links point back at v. */ +void dl_init(void *v) +{ + dl_next(v) = v; + dl_prev(v) = v; +} +/* Link val into the list immediately after head. */ +void dl_insert(void *head, void *val) +{ + dl_next(val) = dl_next(head); + dl_prev(val) = head; + dl_next(dl_prev(val)) = val; + dl_prev(dl_next(val)) = val; +} +/* Link val into the list immediately before head. */ +void dl_add(void *head, void *val) +{ + dl_prev(val) = dl_prev(head); + dl_next(val) = head; + dl_next(dl_prev(val)) = val; + dl_prev(dl_next(val)) = val; +} +/* Unlink val and zero both of its links; does nothing if either link is already 0. */ +void dl_del(void *val) +{ + if (dl_prev(val) == 0 || dl_next(val) == 0) + return; + dl_prev(dl_next(val)) = dl_prev(val); + dl_next(dl_prev(val)) = dl_next(val); + dl_prev(val) = dl_next(val) = 0; +} +/* Copy at most l chars of s into a new NUL-terminated dl_newv() buffer; returns NULL when s is NULL. */ +char *dl_strndup(char *s, int l) +{ + char *n; + if (s == NULL) + return NULL; + n = dl_newv(char, l+1); + strncpy(n, s, l); + n[l] = 0; + return n; +} +/* Duplicate s via dl_strndup(); a NULL s yields NULL instead of being handed to strlen(). */ +char *dl_strdup(char *s) +{ + return s ? dl_strndup(s, (int)strlen(s)) : NULL; +} diff --git a/dlink.h b/dlink.h new file mode 100644 index 00000000..ab2a9459 --- /dev/null +++ b/dlink.h @@ -0,0 +1,25 @@ + +/* doubly linked lists */ +/* This is free software. No strings attached.
No copyright claimed */ + +struct __dl_head +{ + void * dh_prev; + void * dh_next; +}; +/* Elements are obtained from xcalloc() with a struct __dl_head placed just before the payload; dl_alloc() yields the payload address. */ +#define dl_alloc(size) ((void*)(((char*)xcalloc(1,(size)+sizeof(struct __dl_head)))+sizeof(struct __dl_head))) +#define dl_new(t) ((t*)dl_alloc(sizeof(t))) +#define dl_newv(t,n) ((t*)dl_alloc(sizeof(t)*(n))) +/* Lvalue access to the links kept in the header that sits in front of element p. */ +#define dl_next(p) *(&(((struct __dl_head*)(p))[-1].dh_next)) +#define dl_prev(p) *(&(((struct __dl_head*)(p))[-1].dh_prev)) + +void *dl_head(void); +char *dl_strdup(char *); +char *dl_strndup(char *, int); +void dl_insert(void*, void*); +void dl_add(void*, void*); +void dl_del(void*); +void dl_free(void*); +void dl_init(void*); diff --git a/external-reshape-design.txt b/external-reshape-design.txt new file mode 100644 index 00000000..10c57ccb --- /dev/null +++ b/external-reshape-design.txt @@ -0,0 +1,280 @@ +External Reshape + +1 Problem statement + +External (third-party metadata) reshape differs from native-metadata +reshape in three key ways: + +1.1 Format specific constraints + +In the native case reshape is limited by what is implemented in the +generic reshape routine (Grow_reshape()) and what is supported by the +kernel. There are exceptional cases where Grow_reshape() may block +operations when it knows that the kernel implementation is broken, but +otherwise the kernel is relied upon to be the final arbiter of what +reshape operations are supported. + +In the external case the kernel, and the generic checks in +Grow_reshape(), become the super-set of what reshapes are possible. The +metadata format may not support, or have yet to implement a given +reshape type. The implication for Grow_reshape() is that it must query +the metadata handler and effect changes in the metadata before the new +geometry is posted to the kernel. The ->reshape_super method allows +Grow_reshape() to validate the requested operation and post the metadata +update.
+ +1.2 Scope of reshape + +Native metadata reshape is always performed at the array scope (no +metadata relationship with sibling arrays on the same disks). External +reshape, depending on the format, may not allow the number of member +disks to be changed in a subarray unless the change is simultaneously +applied to all subarrays in the container. For example the imsm format +requires all member disks to be a member of all subarrays, so a 4-disk +raid5 in a container that also houses a 4-disk raid10 array could not be +reshaped to 5 disks as the imsm format does not support a 5-disk raid10 +representation. This requires the ->reshape_super method to check the +contents of the array and ask the user to run the reshape at container +scope (if all subarrays are agreeable to the change), or report an +error in the case where one subarray cannot support the change. + +1.3 Monitoring / checkpointing + +Reshape, unlike rebuild/resync, requires strict checkpointing to survive +interrupted reshape operations. For example when expanding a raid5 +array the first few stripes of the array will be overwritten in a +destructive manner. When restarting the reshape process we need to know +the exact location of the last successfully written stripe, and we need +to restore the data in any partially overwritten stripe. Native +metadata stores this backup data in the unused portion of spares that +are being promoted to array members, or in an external backup file +(located on a non-involved block device). + +The kernel is in charge of recording checkpoints of reshape progress, +but mdadm is delegated the task of managing the backup space which +involves: +1/ Identifying what data will be overwritten in the next unit of reshape + operation +2/ Suspending access to that region so that a snapshot of the data can + be transferred to the backup space. +3/ Allowing the kernel to reshape the saved region and setting the + boundary for the next backup. 
+ +In the external reshape case we want to preserve this mdadm +'reshape-manager' arrangement, but have a third actor, mdmon, to +consider. It is tempting to give the role of managing reshape to mdmon, +but that is counter to its role as a monitor, and conflicts with the +existing capabilities and role of mdadm to manage the progress of +reshape. For clarity the external reshape implementation maintains the +role of mdmon as a (mostly) passive recorder of raid events, and mdadm +treats it as it would the kernel in the native reshape case (modulo +needing to send explicit metadata update messages and checking that +mdmon took the expected action). + +External reshape can use the generic md backup file as a fallback, but in the +optimal/firmware-compatible case the reshape-manager will use the metadata +specific areas for managing reshape. The implementation also needs to spawn a +reshape-manager per subarray when the reshape is being carried out at the +container level. For these two reasons the ->manage_reshape() method is +introduced. This method, in addition to the base tasks mentioned above: +1/ Processes each subarray one at a time in series - where appropriate. +2/ Uses either generic routines in Grow.c for md-style backup file + support, or uses the metadata-format specific location for storing + recovery data. +This aims to avoid a "midlayer mistake"[1] and lets the metadata handler +optionally take advantage of generic infrastructure in Grow.c. + +2 Details for specific reshape requests + +There are quite a few moving pieces spread out across md, mdadm, and mdmon for +the support of external reshape, and there are several different types of +reshape that need to be comprehended by the implementation. A rundown of +these details follows. + +2.0 General provisions: + +Obtain an exclusive open on the container to make sure we are not +running concurrently with a Create() event.
+ +2.1 Freezing sync_action + + Before making any attempt at a reshape we 'freeze' every array in + the container to ensure no spare assignment or recovery happens. + This involves writing 'frozen' to sync_action and changing the '/' + after 'external:' in metadata_version to a '-'. mdmon knows that + this means not to perform any management. + + Before doing this we check that all sync_actions are 'idle', which + is racy but still useful. + Afterwards we check that all member arrays have no spares + or partial spares (recovery_start != 'none') which would indicate a + race. If they do, we unfreeze again. + + Once this completes we know all the arrays are stable. They may + still have failed devices as devices can fail at any time. However + we treat those like failures that happen during the reshape. + +2.2 Reshape size + + 1/ mdadm::Grow_reshape(): checks if mdmon is running and optionally + initializes st->update_tail + 2/ mdadm::Grow_reshape() calls ->reshape_super() to check that the size change + is allowed (being performed at subarray scope / enough room) prepares a + metadata update + 3/ mdadm::Grow_reshape(): flushes the metadata update (via + flush_metadata_update(), or ->sync_metadata()) + 4/ mdadm::Grow_reshape(): post the new size to the kernel + + +2.3 Reshape level (simple-takeover) + +"simple-takeover" implies the level change can be satisfied without touching +sync_action + + 1/ mdadm::Grow_reshape(): checks if mdmon is running and optionally + initializes st->update_tail + 2/ mdadm::Grow_reshape() calls ->reshape_super() to check that the level change + is allowed (being performed at subarray scope) prepares a + metadata update + 2a/ raid10 --> raid0: degrade all mirror legs prior to calling + ->reshape_super + 3/ mdadm::Grow_reshape(): flushes the metadata update (via + flush_metadata_update(), or ->sync_metadata()) + 4/ mdadm::Grow_reshape(): post the new level to the kernel + +2.4 Reshape chunk, layout + +2.5 Reshape raid disks (grow) + + 1/ 
mdadm::Grow_reshape(): unconditionally initializes st->update_tail + because only redundant raid levels can modify the number of raid disks + 2/ mdadm::Grow_reshape(): calls ->reshape_super() to check that the level + change is allowed (being performed at proper scope / permissible + geometry / proper spares available in the container), chooses + the spares to use, and prepares a metadata update. + 3/ mdadm::Grow_reshape(): Converts each subarray in the container to the + raid level that can perform the reshape and starts mdmon. + 4/ mdadm::Grow_reshape(): Pushes the update to mdmon. + 5/ mdadm::Grow_reshape(): uses container_content to find details of + the spares and passes them to the kernel. + 6/ mdadm::Grow_reshape(): gives raid_disks update to the kernel, + sets sync_max, sync_min, suspend_lo, suspend_hi all to zero, + and starts the reshape by writing 'reshape' to sync_action. + 7/ mdmon::monitor notices the sync_action change and tells + managemon to check for new devices. managemon notices the new + devices, opens relevant sysfs file, and passes them all to + monitor. + 8/ mdadm::Grow_reshape() calls ->manage_reshape to oversee the + rest of the reshape. + + 9/ mdadm::->manage_reshape(): saves data that will be overwritten by + the kernel to either the backup file or the metadata specific location, + advances sync_max, waits for reshape, ping mdmon, repeat. + Meanwhile mdmon::read_and_act(): records checkpoints. + Specifically. + + 9a/ if the 'next' stripe to be reshaped will over-write + itself during reshape then: + 9a.1/ increase suspend_hi to cover a suitable number of + stripes. + 9a.2/ backup those stripes safely. 
+ 9a.3/ advance sync_max to allow those stripes to be backed up + 9a.4/ when sync_completed indicates that those stripes have + been reshaped, manage_reshape must ping_manager + 9a.5/ when mdmon notices that sync_completed has been updated, + it records the new checkpoint in the metadata + 9a.6/ after the ping_manager, manage_reshape will increase + suspend_lo to allow access to those stripes again + + 9b/ if the 'next' stripe to be reshaped will over-write unused + space during reshape then we apply the same process as above, + except that there is no need to back anything up. + Note that we *do* need to keep suspend_hi progressing as + it is not safe to write to the area-under-reshape. For + kernel-managed-metadata this protection is provided by + ->reshape_safe, but that does not protect us in the case + of user-space-managed-metadata. + + 10/ mdadm::->manage_reshape(): Once reshape completes, changes the raid + level back to the nominal raid level (if necessary) + + FIXME: native metadata does not have the capability to record the original + raid level in reshape-restart case because the kernel always records current + raid level to the metadata, whereas external metadata can masquerade at an + alternate level based on the reshape state. + +2.6 Reshape raid disks (shrink) + +3 Interaction with metadata handler. + + The following calls are made into the metadata handler to assist + with initiating and monitoring a 'reshape'. + + 1/ ->reshape_super is called quite early (after only minimal + checks) to make sure that the metadata can record the new shape + and any necessary transitions. It may be passed a 'container' + or an individual array within a container, and it should notice + the difference and act accordingly. + When a reshape is requested against a container it is expected + that it should be applied to every array in the container, + however it is up to the metadata handler to determine final + policy.
+ + If the reshape is supportable, the internal copy of the metadata + should be updated, and a metadata update suitable for sending + to mdmon should be queued. + + If the reshape will involve converting spares into array members, + this must be recorded in the metadata too. + + 2/ ->container_content will be called to find out the new state + of the array, or of all arrays in the container. Any newly + added devices (with state==0 and raid_disk >= 0) will be added + to the array as spares with the relevant slot number. + + It is likely that the info returned by ->container_content will + have ->reshape_active set, ->reshape_progress set to e.g. 0, and + new_* set appropriately. mdadm will use this information to + cause the correct reshape to start at an appropriate time. + + 3/ ->set_array_state will be called by mdmon when reshape has + started and again periodically as it progresses. This should + record the ->last_checkpoint as the point where reshape has + progressed to. When the reshape finishes this will be called + again and it should notice that ->curr_action is no longer + 'reshape' and so should record that the reshape has finished + provided 'last_checkpoint' has progressed suitably. + + 4/ ->manage_reshape will be called once the reshape has been set + up in the kernel but before sync_max has been moved from 0, so + no actual reshape will have happened. + + ->manage_reshape should call progress_reshape() to allow the + reshape to progress, and should back up any data as indicated + by the return value. See the documentation of that function + for more details. + ->manage_reshape will be called multiple times when a + container is being reshaped, once for each member array in + the container. + + + The progress of the metadata is as follows: + 1/ mdadm sends a metadata update to mdmon which marks the array + as undergoing a reshape.
This is set up by + ->reshape_super and applied by ->process_update + For container-wide reshape, this happens once for the whole + container. + 2/ mdmon notices progress via the sysfs files and calls + ->set_array_state to update the state periodically + For container-wide reshape, this happens repeatedly for + one array, then repeatedly for the next, etc. + 3/ mdmon notices when reshape has finished and calls + ->set_array_state to record that the reshape is complete. + For container-wide reshape, this happens once for each + member array. + + + +... + +[1]: Linux kernel design patterns - part 3, Neil Brown http://lwn.net/Articles/336262/ diff --git a/inventory b/inventory new file mode 100755 index 00000000..ace5df04 --- /dev/null +++ b/inventory @@ -0,0 +1,255 @@ + +.gitignore +ANNOUNCE-3.0 +ANNOUNCE-3.0.1 +ANNOUNCE-3.0.2 +ANNOUNCE-3.0.3 +ANNOUNCE-3.1 +ANNOUNCE-3.1.1 +ANNOUNCE-3.1.2 +ANNOUNCE-3.1.3 +ANNOUNCE-3.1.4 +ANNOUNCE-3.1.5 +ANNOUNCE-3.2 +ANNOUNCE-3.2.1 +ANNOUNCE-3.2.2 +ANNOUNCE-3.2.3 +ANNOUNCE-3.2.4 +ANNOUNCE-3.2.5 +ANNOUNCE-3.2.6 +ANNOUNCE-3.3 +ANNOUNCE-3.3.1 +ANNOUNCE-3.3.2 +ANNOUNCE-3.3.3 +ANNOUNCE-3.3.4 +ANNOUNCE-3.4 +Assemble.c +Build.c +COPYING +ChangeLog +Create.c +Detail.c +Dump.c +Examine.c +Grow.c +INSTALL +Incremental.c +Kill.c +Makefile +Manage.c +Monitor.c +Query.c +README.initramfs +ReadMe.c +TODO +bitmap.c +bitmap.h +config.c +crc32.c +crc32.h +crc32c.c +dlink.c +dlink.h +external-reshape-design.txt +inventory +kernel-patch-2.6.18 +kernel-patch-2.6.18.6 +kernel-patch-2.6.19 +kernel-patch-2.6.25 +kernel-patch-2.6.27 +lib.c +makedist +managemon.c +mapfile.c +maps.c +md.4 +md5.h +md_p.h +md_u.h +mdadm.8.in +mdadm.c +mdadm.conf-example +mdadm.conf.5 +mdadm.h +mdadm.spec +mdassemble.8 +mdassemble.c +mdmon-design.txt +mdmon.8 +mdmon.c +mdmon.h +mdopen.c +mdstat.c +misc/ +misc/mdcheck +misc/syslog-events +mkinitramfs +monitor.c +msg.c +msg.h +part.h +platform-intel.c +platform-intel.h +policy.c +probe_roms.c +probe_roms.h +pwgr.c +raid5extend.c
+raid6check.8 +raid6check.c +restripe.c +sg_io.c +sha1.c +sha1.h +super-ddf.c +super-gpt.c +super-intel.c +super-mbr.c +super0.c +super1.c +swap_super.c +sysfs.c +systemd/ +systemd/SUSE-mdadm_env.sh +systemd/mdadm-grow-continue@.service +systemd/mdadm-last-resort@.service +systemd/mdadm-last-resort@.timer +systemd/mdadm.shutdown +systemd/mdmon@.service +systemd/mdmonitor.service +test +tests/ +tests/00linear +tests/00multipath +tests/00names +tests/00raid0 +tests/00raid1 +tests/00raid10 +tests/00raid4 +tests/00raid5 +tests/00raid6 +tests/01r1fail +tests/01r5fail +tests/01r5integ +tests/01raid6integ +tests/01replace +tests/02lineargrow +tests/02r1add +tests/02r1grow +tests/02r5grow +tests/02r6grow +tests/03assem-incr +tests/03r0assem +tests/03r5assem +tests/03r5assem-failed +tests/03r5assemV1 +tests/04r0update +tests/04r1update +tests/04r5swap +tests/04update-metadata +tests/04update-uuid +tests/05r1-add-internalbitmap +tests/05r1-add-internalbitmap-v1a +tests/05r1-add-internalbitmap-v1b +tests/05r1-add-internalbitmap-v1c +tests/05r1-bitmapfile +tests/05r1-grow-external +tests/05r1-grow-internal +tests/05r1-grow-internal-1 +tests/05r1-internalbitmap +tests/05r1-internalbitmap-v1a +tests/05r1-internalbitmap-v1b +tests/05r1-internalbitmap-v1c +tests/05r1-n3-bitmapfile +tests/05r1-re-add +tests/05r1-re-add-nosuper +tests/05r1-remove-internalbitmap +tests/05r1-remove-internalbitmap-v1a +tests/05r1-remove-internalbitmap-v1b +tests/05r1-remove-internalbitmap-v1c +tests/05r5-bitmapfile +tests/05r5-internalbitmap +tests/05r6-bitmapfile +tests/05r6tor0 +tests/06name +tests/06sysfs +tests/06wrmostly +tests/07autoassemble +tests/07autodetect +tests/07changelevelintr +tests/07changelevels +tests/07layouts +tests/07reshape5intr +tests/07revert-grow +tests/07revert-inplace +tests/07revert-shrink +tests/07testreshape5 +tests/09imsm-assemble +tests/09imsm-create-fail-rebuild +tests/09imsm-overlap +tests/10ddf-assemble-missing +tests/10ddf-create +tests/10ddf-create-fail-rebuild 
+tests/10ddf-fail-create-race +tests/10ddf-fail-readd +tests/10ddf-fail-readd-readonly +tests/10ddf-fail-spare +tests/10ddf-fail-stop-readd +tests/10ddf-fail-twice +tests/10ddf-fail-two-spares +tests/10ddf-geometry +tests/10ddf-incremental-wrong-order +tests/10ddf-sudden-degraded +tests/11spare-migration +tests/12imsm-r0_2d-grow-r0_3d +tests/12imsm-r0_2d-grow-r0_4d +tests/12imsm-r0_2d-grow-r0_5d +tests/12imsm-r0_3d-grow-r0_4d +tests/12imsm-r5_3d-grow-r5_4d +tests/12imsm-r5_3d-grow-r5_5d +tests/13imsm-r0_r0_2d-grow-r0_r0_4d +tests/13imsm-r0_r0_2d-grow-r0_r0_5d +tests/13imsm-r0_r0_3d-grow-r0_r0_4d +tests/13imsm-r0_r5_3d-grow-r0_r5_4d +tests/13imsm-r0_r5_3d-grow-r0_r5_5d +tests/13imsm-r5_r0_3d-grow-r5_r0_4d +tests/13imsm-r5_r0_3d-grow-r5_r0_5d +tests/14imsm-r0_3d-r5_3d-migrate-r5_4d-r5_4d +tests/14imsm-r0_3d_no_spares-migrate-r5_3d +tests/14imsm-r0_r0_2d-takeover-r10_4d +tests/14imsm-r10_4d-grow-r10_5d +tests/14imsm-r10_r5_4d-takeover-r0_2d +tests/14imsm-r1_2d-grow-r1_3d +tests/14imsm-r1_2d-takeover-r0_2d +tests/14imsm-r5_3d-grow-r5_5d-no-spares +tests/14imsm-r5_3d-migrate-r4_3d +tests/15imsm-r0_3d_64k-migrate-r0_3d_256k +tests/15imsm-r5_3d_4k-migrate-r5_3d_256k +tests/15imsm-r5_3d_64k-migrate-r5_3d_256k +tests/15imsm-r5_6d_4k-migrate-r5_6d_256k +tests/15imsm-r5_r0_3d_64k-migrate-r5_r0_3d_256k +tests/16imsm-r0_3d-migrate-r5_4d +tests/16imsm-r0_5d-migrate-r5_6d +tests/16imsm-r5_3d-migrate-r0_3d +tests/16imsm-r5_5d-migrate-r0_5d +tests/18imsm-1d-takeover-r0_1d +tests/18imsm-1d-takeover-r1_2d +tests/18imsm-r0_2d-takeover-r10_4d +tests/18imsm-r10_4d-takeover-r0_2d +tests/18imsm-r1_2d-takeover-r0_1d +tests/19raid6auto-repair +tests/19raid6check +tests/19raid6repair +tests/19repair-does-not-destroy +tests/20raid5journal +tests/ToTest +tests/check +tests/env-ddf-template +tests/env-imsm-template +tests/imsm-grow-template +tests/testdev +tests/utils +udev-md-raid-arrays.rules +udev-md-raid-assembly.rules +util.c +xmalloc.c diff --git a/kernel-patch-2.6.18 
b/kernel-patch-2.6.18 new file mode 100644 index 00000000..87496ea2 --- /dev/null +++ b/kernel-patch-2.6.18 @@ -0,0 +1,35 @@ + +### Diffstat output + ./drivers/md/md.c | 6 +++++- + 1 file changed, 5 insertions(+), 1 deletion(-) + +diff .prev/drivers/md/md.c ./drivers/md/md.c +--- .prev/drivers/md/md.c 2006-10-23 10:26:37.000000000 +1000 ++++ ./drivers/md/md.c 2006-12-21 16:28:29.000000000 +1100 +@@ -1783,7 +1783,8 @@ state_store(mdk_rdev_t *rdev, const char + else { + mddev_t *mddev = rdev->mddev; + kick_rdev_from_array(rdev); +- md_update_sb(mddev); ++ if (mddev->pers) ++ md_update_sb(mddev); + md_new_event(mddev); + err = 0; + } +@@ -1994,6 +1995,8 @@ static mdk_rdev_t *md_import_device(dev_ + kobject_init(&rdev->kobj); + + rdev->desc_nr = -1; ++ rdev->saved_raid_disk = -1; ++ rdev->raid_disk = -1; + rdev->flags = 0; + rdev->data_offset = 0; + rdev->sb_events = 0; +@@ -3991,6 +3994,7 @@ static int set_array_info(mddev_t * mdde + mddev->major_version = info->major_version; + mddev->minor_version = info->minor_version; + mddev->patch_version = info->patch_version; ++ mddev->persistent = ! 
info->not_persistent; + return 0; + } + mddev->major_version = MD_MAJOR_VERSION; diff --git a/kernel-patch-2.6.18.6 b/kernel-patch-2.6.18.6 new file mode 100644 index 00000000..e702e14a --- /dev/null +++ b/kernel-patch-2.6.18.6 @@ -0,0 +1,35 @@ +Signed-off-by: Neil Brown + +### Diffstat output + ./drivers/md/md.c | 5 ++++- + 1 file changed, 4 insertions(+), 1 deletion(-) + +diff .prev/drivers/md/md.c ./drivers/md/md.c +--- .prev/drivers/md/md.c 2006-12-21 17:08:23.000000000 +1100 ++++ ./drivers/md/md.c 2006-12-21 17:08:26.000000000 +1100 +@@ -1783,7 +1783,8 @@ state_store(mdk_rdev_t *rdev, const char + else { + mddev_t *mddev = rdev->mddev; + kick_rdev_from_array(rdev); +- md_update_sb(mddev); ++ if (mddev->pers) ++ md_update_sb(mddev); + md_new_event(mddev); + err = 0; + } +@@ -1995,6 +1996,7 @@ static mdk_rdev_t *md_import_device(dev_ + + rdev->desc_nr = -1; + rdev->saved_raid_disk = -1; ++ rdev->raid_disk = -1; + rdev->flags = 0; + rdev->data_offset = 0; + rdev->sb_events = 0; +@@ -3993,6 +3995,7 @@ static int set_array_info(mddev_t * mdde + mddev->major_version = info->major_version; + mddev->minor_version = info->minor_version; + mddev->patch_version = info->patch_version; ++ mddev->persistent = ! 
info->not_persistent; + return 0; + } + mddev->major_version = MD_MAJOR_VERSION; diff --git a/kernel-patch-2.6.19 b/kernel-patch-2.6.19 new file mode 100644 index 00000000..22a67a39 --- /dev/null +++ b/kernel-patch-2.6.19 @@ -0,0 +1,34 @@ + +### Diffstat output + ./drivers/md/md.c | 5 ++++- + 1 file changed, 4 insertions(+), 1 deletion(-) + +diff .prev/drivers/md/md.c ./drivers/md/md.c +--- .prev/drivers/md/md.c 2006-12-21 15:55:01.000000000 +1100 ++++ ./drivers/md/md.c 2006-12-21 16:28:09.000000000 +1100 +@@ -1792,7 +1792,8 @@ state_store(mdk_rdev_t *rdev, const char + else { + mddev_t *mddev = rdev->mddev; + kick_rdev_from_array(rdev); +- md_update_sb(mddev, 1); ++ if (mddev->pers) ++ md_update_sb(mddev, 1); + md_new_event(mddev); + err = 0; + } +@@ -2004,6 +2005,7 @@ static mdk_rdev_t *md_import_device(dev_ + + rdev->desc_nr = -1; + rdev->saved_raid_disk = -1; ++ rdev->raid_disk = -1; + rdev->flags = 0; + rdev->data_offset = 0; + rdev->sb_events = 0; +@@ -3977,6 +3979,7 @@ static int set_array_info(mddev_t * mdde + mddev->major_version = info->major_version; + mddev->minor_version = info->minor_version; + mddev->patch_version = info->patch_version; ++ mddev->persistent = ! info->not_persistent; + return 0; + } + mddev->major_version = MD_MAJOR_VERSION; diff --git a/kernel-patch-2.6.25 b/kernel-patch-2.6.25 new file mode 100644 index 00000000..23290078 --- /dev/null +++ b/kernel-patch-2.6.25 @@ -0,0 +1,199 @@ +Status: ok + +Support adding a spare to a live md array with external metadata. + +i.e. extend the 'md/dev-XXX/slot' attribute so that you can +tell a device to fill an vacant slot in an and md array. 
+ + +Signed-off-by: Neil Brown + +### Diffstat output + ./drivers/md/md.c | 44 ++++++++++++++++++++++++++++++++++++++++---- + ./drivers/md/multipath.c | 7 ++++++- + ./drivers/md/raid1.c | 7 ++++++- + ./drivers/md/raid10.c | 10 ++++++++-- + ./drivers/md/raid5.c | 10 ++++++++-- + 5 files changed, 68 insertions(+), 10 deletions(-) + +diff .prev/drivers/md/md.c ./drivers/md/md.c +--- .prev/drivers/md/md.c 2008-06-05 09:19:56.000000000 +1000 ++++ ./drivers/md/md.c 2008-06-10 10:41:21.000000000 +1000 +@@ -1932,7 +1932,7 @@ slot_store(mdk_rdev_t *rdev, const char + slot = -1; + else if (e==buf || (*e && *e!= '\n')) + return -EINVAL; +- if (rdev->mddev->pers) { ++ if (rdev->mddev->pers && slot == -1) { + /* Setting 'slot' on an active array requires also + * updating the 'rd%d' link, and communicating + * with the personality with ->hot_*_disk. +@@ -1940,8 +1940,6 @@ slot_store(mdk_rdev_t *rdev, const char + * failed/spare devices. This normally happens automatically, + * but not when the metadata is externally managed. + */ +- if (slot != -1) +- return -EBUSY; + if (rdev->raid_disk == -1) + return -EEXIST; + /* personality does all needed checks */ +@@ -1955,6 +1953,44 @@ slot_store(mdk_rdev_t *rdev, const char + sysfs_remove_link(&rdev->mddev->kobj, nm); + set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); + md_wakeup_thread(rdev->mddev->thread); ++ } else if (rdev->mddev->pers) { ++ mdk_rdev_t *rdev2; ++ struct list_head *tmp; ++ /* Activating a spare .. or possibly reactivating ++ * if we every get bitmaps working here. 
++ */ ++ ++ if (rdev->raid_disk != -1) ++ return -EBUSY; ++ ++ if (rdev->mddev->pers->hot_add_disk == NULL) ++ return -EINVAL; ++ ++ rdev_for_each(rdev2, tmp, rdev->mddev) ++ if (rdev2->raid_disk == slot) ++ return -EEXIST; ++ ++ rdev->raid_disk = slot; ++ if (test_bit(In_sync, &rdev->flags)) ++ rdev->saved_raid_disk = slot; ++ else ++ rdev->saved_raid_disk = -1; ++ err = rdev->mddev->pers-> ++ hot_add_disk(rdev->mddev, rdev); ++ if (err != 1) { ++ rdev->raid_disk = -1; ++ if (err == 0) ++ return -EEXIST; ++ return err; ++ } ++ sprintf(nm, "rd%d", rdev->raid_disk); ++ if (sysfs_create_link(&rdev->mddev->kobj, &rdev->kobj, nm)) ++ printk(KERN_WARNING ++ "md: cannot register " ++ "%s for %s\n", ++ nm, mdname(rdev->mddev)); ++ ++ /* don't wakeup anyone, leave that to userspace. */ + } else { + if (slot >= rdev->mddev->raid_disks) + return -ENOSPC; +@@ -4205,7 +4241,7 @@ static int add_new_disk(mddev_t * mddev, + super_types[mddev->major_version]. + validate_super(mddev, rdev); + err = mddev->pers->hot_add_disk(mddev, rdev); +- if (err) ++ if (err < 0) + unbind_rdev_from_array(rdev); + } + if (err) + +diff .prev/drivers/md/multipath.c ./drivers/md/multipath.c +--- .prev/drivers/md/multipath.c 2008-05-30 14:49:31.000000000 +1000 ++++ ./drivers/md/multipath.c 2008-06-10 10:35:03.000000000 +1000 +@@ -284,10 +284,15 @@ static int multipath_add_disk(mddev_t *m + int found = 0; + int path; + struct multipath_info *p; ++ int first = 0; ++ int last = mddev->raid_disks - 1; ++ ++ if (rdev->raid_disk >= 0) ++ first = last = rdev->raid_disk; + + print_multipath_conf(conf); + +- for (path=0; pathraid_disks; path++) ++ for (path = first; path <= last; path++) + if ((p=conf->multipaths+path)->rdev == NULL) { + q = rdev->bdev->bd_disk->queue; + blk_queue_stack_limits(mddev->queue, q); + +diff .prev/drivers/md/raid10.c ./drivers/md/raid10.c +--- .prev/drivers/md/raid10.c 2008-05-30 14:49:31.000000000 +1000 ++++ ./drivers/md/raid10.c 2008-06-10 10:28:53.000000000 +1000 +@@ -1116,6 
+1116,8 @@ static int raid10_add_disk(mddev_t *mdde + int found = 0; + int mirror; + mirror_info_t *p; ++ int first = 0; ++ int last = mddev->raid_disks - 1; + + if (mddev->recovery_cp < MaxSector) + /* only hot-add to in-sync arrays, as recovery is +@@ -1125,12 +1127,16 @@ static int raid10_add_disk(mddev_t *mdde + if (!enough(conf)) + return 0; + ++ if (rdev->raid_disk) ++ first = last = rdev->raid_disk; ++ + if (rdev->saved_raid_disk >= 0 && ++ rdev->saved_raid_disk >= first && + conf->mirrors[rdev->saved_raid_disk].rdev == NULL) + mirror = rdev->saved_raid_disk; + else +- mirror = 0; +- for ( ; mirror < mddev->raid_disks; mirror++) ++ mirror = first; ++ for ( ; mirror <= last ; mirror++) + if ( !(p=conf->mirrors+mirror)->rdev) { + + blk_queue_stack_limits(mddev->queue, + +diff .prev/drivers/md/raid1.c ./drivers/md/raid1.c +--- .prev/drivers/md/raid1.c 2008-05-30 14:49:31.000000000 +1000 ++++ ./drivers/md/raid1.c 2008-06-10 10:41:00.000000000 +1000 +@@ -1103,8 +1103,13 @@ static int raid1_add_disk(mddev_t *mddev + int found = 0; + int mirror = 0; + mirror_info_t *p; ++ int first = 0; ++ int last = mddev->raid_disks - 1; + +- for (mirror=0; mirror < mddev->raid_disks; mirror++) ++ if (rdev->raid_disk >= 0) ++ first = last = rdev->raid_disk; ++ ++ for (mirror = first; mirror <= last; mirror++) + if ( !(p=conf->mirrors+mirror)->rdev) { + + blk_queue_stack_limits(mddev->queue, + +diff .prev/drivers/md/raid5.c ./drivers/md/raid5.c +--- .prev/drivers/md/raid5.c 2008-05-30 14:49:35.000000000 +1000 ++++ ./drivers/md/raid5.c 2008-06-10 10:27:51.000000000 +1000 +@@ -4399,21 +4399,27 @@ static int raid5_add_disk(mddev_t *mddev + int found = 0; + int disk; + struct disk_info *p; ++ int first = 0; ++ int last = conf->raid_disks - 1; + + if (mddev->degraded > conf->max_degraded) + /* no point adding a device */ + return 0; + ++ if (rdev->raid_disk >= 0) ++ first = last = rdev->raid_disk; ++ + /* + * find the disk ... but prefer rdev->saved_raid_disk + * if possible. 
+ */ + if (rdev->saved_raid_disk >= 0 && ++ rdev->saved_raid_disk >= first && + conf->disks[rdev->saved_raid_disk].rdev == NULL) + disk = rdev->saved_raid_disk; + else +- disk = 0; +- for ( ; disk < conf->raid_disks; disk++) ++ disk = first; ++ for ( ; disk <= last ; disk++) + if ((p=conf->disks + disk)->rdev == NULL) { + clear_bit(In_sync, &rdev->flags); + rdev->raid_disk = disk; diff --git a/kernel-patch-2.6.27 b/kernel-patch-2.6.27 new file mode 100644 index 00000000..8d0785d8 --- /dev/null +++ b/kernel-patch-2.6.27 @@ -0,0 +1,36 @@ +touch_mnt_namespace when the mount flags change + +From: Dan Williams + +Daemons that need to be launched while the rootfs is read-only can now +poll /proc/mounts to be notified when their O_RDWR requests may no +longer end in EROFS. + +Cc: Kay Sievers +Cc: Neil Brown +Signed-off-by: Dan Williams +--- + + fs/namespace.c | 7 ++++++- + 1 files changed, 6 insertions(+), 1 deletions(-) + + +diff --git a/fs/namespace.c b/fs/namespace.c +index 6e283c9..1bd5ba2 100644 +--- a/fs/namespace.c ++++ b/fs/namespace.c +@@ -1553,8 +1553,13 @@ static noinline int do_remount(struct nameidata *nd, int flags, int mnt_flags, + if (!err) + nd->path.mnt->mnt_flags = mnt_flags; + up_write(&sb->s_umount); +- if (!err) ++ if (!err) { + security_sb_post_remount(nd->path.mnt, flags, data); ++ ++ spin_lock(&vfsmount_lock); ++ touch_mnt_namespace(nd->path.mnt->mnt_ns); ++ spin_unlock(&vfsmount_lock); ++ } + return err; + } + diff --git a/lib.c b/lib.c new file mode 100644 index 00000000..6808f62d --- /dev/null +++ b/lib.c @@ -0,0 +1,475 @@ +/* + * mdadm - manage Linux "md" devices aka RAID arrays. + * + * Copyright (C) 2011 Neil Brown + * + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Author: Neil Brown + * Email: + */ + +#include "mdadm.h" +#include "dlink.h" +#include <ctype.h> + +/* This file contains various 'library' style functions. They + * have no dependency on anything outside this file. + */ + +/* Return the major number listed for "mdp" in the block-device section + * of /proc/devices, or -1 if the file cannot be opened or no such + * entry is found. A successful lookup is cached for later calls. + */ +int get_mdp_major(void) +{ +static int mdp_major = -1; + FILE *fl; + char *w; + int have_block = 0; + int have_devices = 0; + int last_num = -1; + + if (mdp_major != -1) + return mdp_major; + fl = fopen("/proc/devices", "r"); + if (!fl) + return -1; + while ((w = conf_word(fl, 1))) { + if (have_block && strcmp(w, "devices:")==0) + have_devices = 1; + have_block = (strcmp(w, "Block")==0); + /* ctype functions need an unsigned char value */ + if (isdigit((unsigned char)w[0])) + last_num = atoi(w); + if (have_devices && strcmp(w, "mdp")==0) + mdp_major = last_num; + free(w); + } + fclose(fl); + return mdp_major; +} + +/* Return the last path component of the /sys/dev/block/MAJOR:MINOR + * link target for 'devid', or NULL if the link cannot be read, has no + * '/', or the component does not fit in the static result buffer. + * The returned string is overwritten by the next call. + */ +char *devid2kname(int devid) +{ + char path[30]; + char link[200]; + static char devnm[32]; + char *cp; + int n; + + /* Look at the + * /sys/dev/block/%d:%d link which must look like + * and take the last component. + */ + sprintf(path, "/sys/dev/block/%d:%d", major(devid), + minor(devid)); + n = readlink(path, link, sizeof(link)-1); + if (n > 0) { + link[n] = 0; + cp = strrchr(link, '/'); + /* devnm is smaller than link: refuse rather than overflow */ + if (cp && strlen(cp+1) < sizeof(devnm)) { + strcpy(devnm, cp+1); + return devnm; + } + } + return NULL; +} + +/* Return the device name for 'devid': the path component that follows + * "/block/" in the /sys/dev/block/MAJOR:MINOR link target when the link + * can be read, otherwise "md%d" or "md_d%d" built from the numbers for + * MD_MAJOR and mdp devices, otherwise NULL. NULL is also returned if + * the sysfs name would not fit in the static result buffer, which is + * overwritten by the next call. + */ +char *devid2devnm(int devid) +{ + char path[30]; + char link[200]; + static char devnm[32]; + char *cp, *ep; + int n; + + /* Might be an extended-minor partition or a + * named md device.
Look at the + * /sys/dev/block/%d:%d link which must look like + * ../../block/mdXXX/mdXXXpYY + * or + * ...../block/md_FOO + */ + sprintf(path, "/sys/dev/block/%d:%d", major(devid), + minor(devid)); + n = readlink(path, link, sizeof(link)-1); + if (n > 0) { + link[n] = 0; + cp = strstr(link, "/block/"); + if (cp) { + cp += 7; + ep = strchr(cp, '/'); + if (ep) + *ep = 0; + /* devnm is smaller than link: refuse rather than overflow */ + if (strlen(cp) >= sizeof(devnm)) + return NULL; + strcpy(devnm, cp); + return devnm; + } + } + if (major(devid) == MD_MAJOR) + sprintf(devnm,"md%d", minor(devid)); + else if (major(devid) == (unsigned)get_mdp_major()) + sprintf(devnm,"md_d%d", + (minor(devid)>>MdpMinorShift)); + else + return NULL; + return devnm; +} + +/* Return devid2devnm() of st->st_rdev, or NULL if 'st' is not a block device. */ +char *stat2devnm(struct stat *st) +{ + if ((S_IFMT & st->st_mode) != S_IFBLK) + return NULL; + return devid2devnm(st->st_rdev); +} + +/* Return stat2devnm() for the open file 'fd', or NULL if fstat() fails. */ +char *fd2devnm(int fd) +{ + struct stat stb; + if (fstat(fd, &stb) == 0) + return stat2devnm(&stb); + return NULL; +} + +/* + * convert a major/minor pair for a block device into a name in /dev, if possible. + * On the first call, walk /dev collecting name. + * Put them in a simple linked list for now.
+ */ +struct devmap { + int major, minor; + char *name; + struct devmap *next; +} *devlist = NULL; +int devlist_ready = 0; + +int add_dev(const char *name, const struct stat *stb, int flag, struct FTW *s) +{ + struct stat st; + + if (S_ISLNK(stb->st_mode)) { + if (stat(name, &st) != 0) + return 0; + stb = &st; + } + + if ((stb->st_mode&S_IFMT)== S_IFBLK) { + char *n = xstrdup(name); + struct devmap *dm = xmalloc(sizeof(*dm)); + if (strncmp(n, "/dev/./", 7)==0) + strcpy(n+4, name+6); + if (dm) { + dm->major = major(stb->st_rdev); + dm->minor = minor(stb->st_rdev); + dm->name = n; + dm->next = devlist; + devlist = dm; + } + } + return 0; +} + +#ifndef HAVE_NFTW +#ifdef HAVE_FTW +int add_dev_1(const char *name, const struct stat *stb, int flag) +{ + return add_dev(name, stb, flag, NULL); +} +int nftw(const char *path, int (*han)(const char *name, const struct stat *stb, int flag, struct FTW *s), int nopenfd, int flags) +{ + return ftw(path, add_dev_1, nopenfd); +} +#else +int nftw(const char *path, int (*han)(const char *name, const struct stat *stb, int flag, struct FTW *s), int nopenfd, int flags) +{ + return 0; +} +#endif /* HAVE_FTW */ +#endif /* HAVE_NFTW */ + +/* + * Find a block device with the right major/minor number. + * If we find multiple names, choose the shortest. + * If we find a name in /dev/md/, we prefer that. + * This applies only to names for MD devices. + * If 'prefer' is set (normally to e.g. /by-path/) + * then we prefer a name which contains that string. 
+ */ +char *map_dev_preferred(int major, int minor, int create, + char *prefer) +{ + struct devmap *p; + char *regular = NULL, *preferred=NULL; + int did_check = 0; + + if (major == 0 && minor == 0) + return NULL; + + retry: + if (!devlist_ready) { + char *dev = "/dev"; + struct stat stb; + while(devlist) { + struct devmap *d = devlist; + devlist = d->next; + free(d->name); + free(d); + } + if (lstat(dev, &stb)==0 && + S_ISLNK(stb.st_mode)) + dev = "/dev/."; + nftw(dev, add_dev, 10, FTW_PHYS); + devlist_ready=1; + did_check = 1; + } + + for (p=devlist; p; p=p->next) + if (p->major == major && + p->minor == minor) { + if (strncmp(p->name, "/dev/md/",8) == 0 + || (prefer && strstr(p->name, prefer))) { + if (preferred == NULL || + strlen(p->name) < strlen(preferred)) + preferred = p->name; + } else { + if (regular == NULL || + strlen(p->name) < strlen(regular)) + regular = p->name; + } + } + if (!regular && !preferred && !did_check) { + devlist_ready = 0; + goto retry; + } + if (create && !regular && !preferred) { + static char buf[30]; + snprintf(buf, sizeof(buf), "%d:%d", major, minor); + regular = buf; + } + + return preferred ? preferred : regular; +} + +/* conf_word gets one word from the conf file. + * if "allow_key", then accept words at the start of a line, + * otherwise stop when such a word is found. + * We assume that the file pointer is at the end of a word, so the + * next character is a space, or a newline. If not, it is the start of a line. + */ + +char *conf_word(FILE *file, int allow_key) +{ + int wsize = 100; + int len = 0; + int c; + int quote; + int wordfound = 0; + char *word = xmalloc(wsize); + + while (wordfound==0) { + /* at the end of a word.. */ + c = getc(file); + if (c == '#') + while (c != EOF && c != '\n') + c = getc(file); + if (c == EOF) break; + if (c == '\n') continue; + + if (c != ' ' && c != '\t' && ! 
allow_key) { + ungetc(c, file); + break; + } + /* looks like it is safe to get a word here, if there is one */ + quote = 0; + /* first, skip any spaces */ + while (c == ' ' || c == '\t') + c = getc(file); + if (c != EOF && c != '\n' && c != '#') { + /* we really have a character of a word, so start saving it */ + while (c != EOF && c != '\n' && (quote || (c!=' ' && c != '\t'))) { + wordfound = 1; + if (quote && c == quote) quote = 0; + else if (quote == 0 && (c == '\'' || c == '"')) + quote = c; + else { + if (len == wsize-1) { + wsize += 100; + word = xrealloc(word, wsize); + } + word[len++] = c; + } + c = getc(file); + /* Hack for broken kernels (2.6.14-.24) that put + * "active(auto-read-only)" + * in /proc/mdstat instead of + * "active (auto-read-only)" + */ + if (c == '(' && len >= 6 + && strncmp(word+len-6, "active", 6) == 0) + c = ' '; + } + } + if (c != EOF) ungetc(c, file); + } + word[len] = 0; + + /* Further HACK for broken kernels.. 2.6.14-2.6.24 */ + if (strcmp(word, "auto-read-only)") == 0) + strcpy(word, "(auto-read-only)"); + +/* printf("word is <%s>\n", word); */ + if (!wordfound) { + free(word); + word = NULL; + } + return word; +} + +void print_quoted(char *str) +{ + /* Printf the string with surrounding quotes + * iff needed. + * If no space, tab, or quote - leave unchanged. + * Else print surrounded by " or ', swapping quotes + * when we find one that will cause confusion. 
+ */ + + char first_quote = 0, q; + char *c; + + for (c = str; *c; c++) { + switch(*c) { + case '\'': + case '"': + first_quote = *c; + break; + case ' ': + case '\t': + first_quote = *c; + continue; + default: + continue; + } + break; + } + if (!first_quote) { + printf("%s", str); + return; + } + + if (first_quote == '"') + q = '\''; + else + q = '"'; + putchar(q); + for (c = str; *c; c++) { + if (*c == q) { + putchar(q); + q ^= '"' ^ '\''; + putchar(q); + } + putchar(*c); + } + putchar(q); +} + +void print_escape(char *str) +{ + /* print str, but change space and tab to '_' and '/' to '-' + * as is suitable for device names + */ + for (; *str ; str++) { + switch (*str) { + case ' ': + case '\t': + putchar('_'); + break; + case '/': + putchar('-'); + break; + default: + putchar(*str); + } + } +} + +/* Return 1 if environment variable 'name' is set and atoi() of its + * value is 1, otherwise 0. + */ +int check_env(char *name) +{ + char *val = getenv(name); + + if (val && atoi(val) == 1) + return 1; + + return 0; +} + +/* Return non-zero if /dev/.udev or /run/udev exists and MDADM_NO_UDEV + * is not set to 1. The answer is computed on the first call and + * cached in a static for later calls. + */ +int use_udev(void) +{ + static int use = -1; + struct stat stb; + + if (use < 0) { + use = ((stat("/dev/.udev", &stb) == 0 + || stat("/run/udev", &stb) == 0) + && check_env("MDADM_NO_UDEV") == 0); + } + return use; +} + +/* Return the greatest common divisor of 'a' and 'b' using Euclid's + * remainder algorithm. GCD(x, 0) and GCD(0, x) are x; the earlier + * repeated-subtraction loop never terminated when exactly one + * argument was 0. + */ +unsigned long GCD(unsigned long a, unsigned long b) +{ + while (b != 0) { + unsigned long rem = a % b; + + a = b; + b = rem; + } + return a; +} + +/* + * conf_line reads one logical line from the conffile or mdstat. + * It skips comments and continues until it finds a line that starts + * with a non blank/comment. This character is pushed back for the next call + * A doubly linked list of words is returned. + * the first word will be a keyword. Other words will have had quotes removed.
+ */ + +char *conf_line(FILE *file) +{ + char *w; + char *list; + + w = conf_word(file, 1); + if (w == NULL) return NULL; + + list = dl_strdup(w); + free(w); + dl_init(list); + + while ((w = conf_word(file,0))){ + char *w2 = dl_strdup(w); + free(w); + dl_add(list, w2); + } +/* printf("got a line\n");*/ + return list; +} + +void free_line(char *line) +{ + char *w; + for (w=dl_next(line); w != line; w=dl_next(line)) { + dl_del(w); + dl_free(w); + } + dl_free(line); +} diff --git a/makedist b/makedist new file mode 100755 index 00000000..e4f20acf --- /dev/null +++ b/makedist @@ -0,0 +1,96 @@ +#!/bin/sh +# avoid silly sorting +export LANG=C +arg=$1 +target=~/public_html/source/mdadm +if [ " $arg" = " test" ] +then + target=/tmp/mdadm-test + rm -rf $target + mkdir -p $target +fi +if [ -d $target ] +then : +else echo $target is not a directory + exit 2 +fi +set `grep '^#define VERSION' ReadMe.c ` +version=`echo $3 | sed -e 's/"//g'` +grep "^.TH MDADM 8 .. v$version" mdadm.8.in > /dev/null 2>&1 || + { + echo mdadm.8.in does not mention version $version. + exit 1 + } +grep "^.TH MDMON 8 .. v$version" mdmon.8 > /dev/null 2>&1 || + { + echo mdmon.8 does not mention version $version. + exit 1 + } +rpmv=`echo $version | tr - _` +grep "^Version: *$rpmv$" mdadm.spec > /dev/null 2>&1 || + { + echo mdadm.spec does not mention version $version. + exit 1 + } +if [ -f ANNOUNCE-$version ] +then : +else + echo ANNOUNCE-$version does not exist + exit 1 +fi +if grep "^ANNOUNCE-$version\$" inventory +then : +else { cat inventory ; echo ANNOUNCE-$version ; } | sort -o inventory +fi + +echo version = $version +base=mdadm-$version.tar.gz +if [ " $arg" != " diff" ] +then + if [ -f $target/$base ] + then + echo $target/$base exists. 
+ exit 1 + fi + trap "rm $target/$base; exit" 1 2 3 + git archive --prefix=mdadm-$version/ HEAD | gzip --best > $target/$base + chmod a+r $target/$base + ls -l $target/$base + if tar tzf $target/$base | sed 's,[^/]*/,,' | sort | diff -u inventory - + then : correct files found + else echo "Extra files, or inventory is out-of-date" + rm $target/$base + exit 1 + fi + rpmbuild -ta $target/$base || exit 1 + find /home/neilb/src/RPM -name "*mdadm-$version-*" \ + -exec cp {} $target/RPM \; + cp ANNOUNCE-$version $target/ANNOUNCE + cp ChangeLog $target/ChangeLog + if [ " $arg" != " test" ] + then + echo -n "Confirm signing this release? " + read a + if [ " $a" != " y" ]; then echo OK - bye. ; exit 1; fi + if zcat $target/$base | gpg -ba > $target/$base.sign && gpg -ba $target/ANNOUNCE + then + kup put $target/$base $target/$base.sign \ + /pub/linux/utils/raid/mdadm/mdadm-$version.tar.gz + kup put $target/ANNOUNCE $target/ANNOUNCE.asc /pub/linux/utils/raid/mdadm/ANNOUNCE + else + echo signing failed + exit 1 + fi + fi +else + if [ ! -f $target/$base ] + then + echo $target/$base does not exist. + exit 1 + fi + ( cd .. 
; ln -s mdadm.v2 mdadm-$version ; tar chf - --exclude=.git --exclude="TAGS" --exclude='*,v' --exclude='*~' --exclude='*.o' --exclude mdadm --exclude=mdadm'.[^ch0-9]' --exclude=RCS mdadm-$version ; rm mdadm-$version ) | gzip --best > /var/tmp/mdadm-new.tgz + mkdir /var/tmp/mdadm-old ; zcat $target/$base | ( cd /var/tmp/mdadm-old ; tar xf - ) + mkdir /var/tmp/mdadm-new ; zcat /var/tmp/mdadm-new.tgz | ( cd /var/tmp/mdadm-new ; tar xf - ) + diff -ru /var/tmp/mdadm-old /var/tmp/mdadm-new + rm -rf /var/tmp/mdadm-old /var/tmp/mdadm-new /var/tmp/mdadm-new.tgz +fi diff --git a/managemon.c b/managemon.c new file mode 100644 index 00000000..6d1b3d85 --- /dev/null +++ b/managemon.c @@ -0,0 +1,926 @@ +/* + * mdmon - monitor external metadata arrays + * + * Copyright (C) 2007-2009 Neil Brown + * Copyright (C) 2007-2009 Intel Corporation + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. + */ + +/* + * The management thread for monitoring active md arrays. + * This thread does things which might block such as memory + * allocation. + * In particular: + * + * - Find out about new arrays in this container. + * Allocate the data structures and open the files. + * + * For this we watch /proc/mdstat and find new arrays with + * metadata type that confirms sharing. e.g. 
"md4" + * When we find a new array we slip it into the list of + * arrays and signal 'monitor' by writing to a pipe. + * + * - Respond to reshape requests by allocating new data structures + * and opening new files. + * + * These come as a change to raid_disks. We allocate a new + * version of the data structures and slip it into the list. + * 'monitor' will notice and release the old version. + * Changes to level, chunksize, layout.. do not need re-allocation. + * Reductions in raid_disks don't really either, but we handle + * them the same way for consistency. + * + * - When a device is added to the container, we add it to the metadata + * as a spare. + * + * - Deal with degraded array + * We only do this when first noticing the array is degraded. + * This can be when we first see the array, when sync completes or + * when recovery completes. + * + * Check if number of failed devices suggests recovery is needed, and + * skip if not. + * Ask metadata to allocate a spare device + * Add device as not in_sync and give a role + * Update metadata. + * Open sysfs files and pass to monitor. + * Make sure that monitor Starts recovery.... + * + * - Pass on metadata updates from external programs such as + * mdadm creating a new array. + * + * This is most-messy. + * It might involve adding a new array or changing the status of + * a spare, or any reconfig that the kernel doesn't get involved in. + * + * The required updates are received via a named pipe. There will + * be one named pipe for each container. Each message contains a + * sync marker: 0x5a5aa5a5, A byte count, and the message. This is + * passed to the metadata handler which will interpret and process it. + * For 'DDF' messages are internal data blocks with the leading + * 'magic number' signifying what sort of data it is. + * + */ + +/* + * We select on /proc/mdstat and the named pipe. 
+ * We create new arrays or updated version of arrays and slip + * them into the head of the list, then signal 'monitor' via a pipe write. + * 'monitor' will notice and place the old array on a return list. + * Metadata updates are placed on a queue just like they arrive + * from the named pipe. + * + * When new arrays are found based on correct metadata string, we + * need to identify them with an entry in the metadata. Maybe we require + * the metadata to be mdX/NN when NN is the index into an appropriate table. + * + */ + +/* + * List of tasks: + * - Watch for spares to be added to the container, and write updated + * metadata to them. + * - Watch for new arrays using this container, confirm they match metadata + * and if so, start monitoring them + * - Watch for spares being added to monitored arrays. This shouldn't + * happen, as we should do all the adding. Just remove them. + * - Watch for change in raid-disks, chunk-size, etc. Update metadata and + * start a reshape. + */ +#ifndef _GNU_SOURCE +#define _GNU_SOURCE +#endif +#include "mdadm.h" +#include "mdmon.h" +#include +#include +#include + +static void close_aa(struct active_array *aa) +{ + struct mdinfo *d; + + for (d = aa->info.devs; d; d = d->next) { + close(d->recovery_fd); + close(d->state_fd); + } + + if (aa->action_fd >= 0) + close(aa->action_fd); + if (aa->info.state_fd >= 0) + close(aa->info.state_fd); + if (aa->resync_start_fd >= 0) + close(aa->resync_start_fd); + if (aa->metadata_fd >= 0) + close(aa->metadata_fd); + if (aa->sync_completed_fd >= 0) + close(aa->sync_completed_fd); +} + +static void free_aa(struct active_array *aa) +{ + /* Note that this doesn't close fds if they are being used + * by a clone. 
->container will be set for a clone + */ + dprintf("sys_name: %s\n", aa->info.sys_name); + if (!aa->container) + close_aa(aa); + while (aa->info.devs) { + struct mdinfo *d = aa->info.devs; + aa->info.devs = d->next; + free(d); + } + free(aa); +} + +static struct active_array *duplicate_aa(struct active_array *aa) +{ + struct active_array *newa = xmalloc(sizeof(*newa)); + struct mdinfo **dp1, **dp2; + + *newa = *aa; + newa->next = NULL; + newa->replaces = NULL; + newa->info.next = NULL; + + dp2 = &newa->info.devs; + + for (dp1 = &aa->info.devs; *dp1; dp1 = &(*dp1)->next) { + struct mdinfo *d; + if ((*dp1)->state_fd < 0) + continue; + + d = xmalloc(sizeof(*d)); + *d = **dp1; + *dp2 = d; + dp2 = & d->next; + } + *dp2 = NULL; + + return newa; +} + +static void wakeup_monitor(void) +{ + /* tgkill(getpid(), mon_tid, SIGUSR1); */ + int pid = getpid(); + syscall(SYS_tgkill, pid, mon_tid, SIGUSR1); +} + +static void remove_old(void) +{ + if (discard_this) { + discard_this->next = NULL; + free_aa(discard_this); + if (pending_discard == discard_this) + pending_discard = NULL; + discard_this = NULL; + wakeup_monitor(); + } +} + +static void replace_array(struct supertype *container, + struct active_array *old, + struct active_array *new) +{ + /* To replace an array, we add it to the top of the list + * marked with ->replaces to point to the original. + * 'monitor' will take the original out of the list + * and put it on 'discard_this'. We take it from there + * and discard it. 
+ */ + remove_old(); + while (pending_discard) { + while (discard_this == NULL) + sleep(1); + remove_old(); + } + pending_discard = old; + new->replaces = old; + new->next = container->arrays; + container->arrays = new; + wakeup_monitor(); +} + +struct metadata_update *update_queue = NULL; +struct metadata_update *update_queue_handled = NULL; +struct metadata_update *update_queue_pending = NULL; + +static void free_updates(struct metadata_update **update) +{ + while (*update) { + struct metadata_update *this = *update; + void **space_list = this->space_list; + + *update = this->next; + free(this->buf); + free(this->space); + while (space_list) { + void *space = space_list; + space_list = *space_list; + free(space); + } + free(this); + } +} + +void check_update_queue(struct supertype *container) +{ + free_updates(&update_queue_handled); + + if (update_queue == NULL && + update_queue_pending) { + update_queue = update_queue_pending; + update_queue_pending = NULL; + wakeup_monitor(); + } +} + +static void queue_metadata_update(struct metadata_update *mu) +{ + struct metadata_update **qp; + + qp = &update_queue_pending; + while (*qp) + qp = & ((*qp)->next); + *qp = mu; +} + +static void add_disk_to_container(struct supertype *st, struct mdinfo *sd) +{ + int dfd; + char nm[20]; + struct supertype *st2; + struct metadata_update *update = NULL; + struct mdinfo info; + mdu_disk_info_t dk = { + .number = -1, + .major = sd->disk.major, + .minor = sd->disk.minor, + .raid_disk = -1, + .state = 0, + }; + + dprintf("add %d:%d to container\n", sd->disk.major, sd->disk.minor); + + sd->next = st->devs; + st->devs = sd; + + sprintf(nm, "%d:%d", sd->disk.major, sd->disk.minor); + dfd = dev_open(nm, O_RDWR); + if (dfd < 0) + return; + + /* Check the metadata and see if it is already part of this + * array + */ + st2 = dup_super(st); + if (st2->ss->load_super(st2, dfd, NULL) == 0) { + st2->ss->getinfo_super(st2, &info, NULL); + if (st->ss->compare_super(st, st2) == 0 && + 
info.disk.raid_disk >= 0) { + /* Looks like a good member of array. + * Just accept it. + * mdadm will incorporate any parts into + * active arrays. + */ + st2->ss->free_super(st2); + return; + } + } + st2->ss->free_super(st2); + + st->update_tail = &update; + st->ss->add_to_super(st, &dk, dfd, NULL, INVALID_SECTORS); + st->ss->write_init_super(st); + queue_metadata_update(update); + st->update_tail = NULL; +} + +/* + * Create and queue update structure about the removed disks. + * The update is prepared by super type handler and passed to the monitor + * thread. + */ +static void remove_disk_from_container(struct supertype *st, struct mdinfo *sd) +{ + struct metadata_update *update = NULL; + mdu_disk_info_t dk = { + .number = -1, + .major = sd->disk.major, + .minor = sd->disk.minor, + .raid_disk = -1, + .state = 0, + }; + dprintf("remove %d:%d from container\n", + sd->disk.major, sd->disk.minor); + + st->update_tail = &update; + st->ss->remove_from_super(st, &dk); + /* FIXME this write_init_super shouldn't be here. + * We have it after add_to_super to write to new device, + * but with 'remove' we don't want to write to that device! + */ + st->ss->write_init_super(st); + queue_metadata_update(update); + st->update_tail = NULL; +} + +static void manage_container(struct mdstat_ent *mdstat, + struct supertype *container) +{ + /* Of interest here are: + * - if a new device has been added to the container, we + * add it to the array ignoring any metadata on it. + * - if a device has been removed from the container, we + * remove it from the device list and update the metadata. + * FIXME should we look for compatible metadata and take hints + * about spare assignment.... probably not. + */ + if (mdstat->devcnt != container->devcnt) { + struct mdinfo **cdp, *cd, *di, *mdi; + int found; + + /* read /sys/block/NAME/md/dev-??/block/dev to find out + * what is there, and compare with container->devs + * To see what is removed and what is added.
+ * These need to be removed from, or added to, the array + */ + mdi = sysfs_read(-1, mdstat->devnm, GET_DEVS); + if (!mdi) { + /* invalidate the current count so we can try again */ + container->devcnt = -1; + return; + } + + /* check for removals */ + for (cdp = &container->devs; *cdp; ) { + found = 0; + for (di = mdi->devs; di; di = di->next) + if (di->disk.major == (*cdp)->disk.major && + di->disk.minor == (*cdp)->disk.minor) { + found = 1; + break; + } + if (!found) { + cd = *cdp; + *cdp = (*cdp)->next; + remove_disk_from_container(container, cd); + free(cd); + } else + cdp = &(*cdp)->next; + } + + /* check for additions */ + for (di = mdi->devs; di; di = di->next) { + for (cd = container->devs; cd; cd = cd->next) + if (di->disk.major == cd->disk.major && + di->disk.minor == cd->disk.minor) + break; + if (!cd) { + struct mdinfo *newd = xmalloc(sizeof(*newd)); + + *newd = *di; + add_disk_to_container(container, newd); + } + } + sysfs_free(mdi); + container->devcnt = mdstat->devcnt; + } +} + +static int sysfs_open2(char *devnum, char *name, char *attr) +{ + int fd = sysfs_open(devnum, name, attr); + if (fd >= 0) { + /* seq_file in the kernel allocates buffer space + * on the first read. Do that now so 'monitor' + * never needs to.
+ */ + char buf[200]; + if (read(fd, buf, sizeof(buf)) < 0) + /* pretend not to ignore return value */ + return fd; + } + return fd; +} + +static int disk_init_and_add(struct mdinfo *disk, struct mdinfo *clone, + struct active_array *aa) +{ + if (!disk || !clone) + return -1; + + *disk = *clone; + disk->recovery_fd = sysfs_open2(aa->info.sys_name, disk->sys_name, + "recovery_start"); + if (disk->recovery_fd < 0) + return -1; + disk->state_fd = sysfs_open2(aa->info.sys_name, disk->sys_name, "state"); + if (disk->state_fd < 0) { + close(disk->recovery_fd); + return -1; + } + disk->prev_state = read_dev_state(disk->state_fd); + disk->curr_state = disk->prev_state; + disk->next = aa->info.devs; + aa->info.devs = disk; + + return 0; +} + +static void manage_member(struct mdstat_ent *mdstat, + struct active_array *a) +{ + /* Compare mdstat info with known state of member array. + * We do not need to look for device state changes here, that + * is dealt with by the monitor. + * + * If a reshape is being requested, monitor will have noticed + * that sync_action changed and will have set check_reshape. + * We just need to see if new devices have appeared. All metadata + * updates will already have been processed. + * + * We also want to handle degraded arrays here by + * trying to find and assign a spare. + * We do that whenever the monitor tells us to.
+ */ + char buf[64]; + int frozen; + struct supertype *container = a->container; + unsigned long long int component_size = 0; + + if (container == NULL) + /* Raced with something */ + return; + + if (mdstat->active) { + // FIXME + a->info.array.raid_disks = mdstat->raid_disks; + // MORE + } + + if (sysfs_get_ll(&a->info, NULL, "component_size", &component_size) >= 0) + a->info.component_size = component_size << 1; + + /* honor 'frozen' */ + if (sysfs_get_str(&a->info, NULL, "metadata_version", buf, sizeof(buf)) > 0) + frozen = buf[9] == '-'; + else + frozen = 1; /* can't read metadata_version, assume the worst */ + + /* If sync_action is not 'idle' then don't try recovery now */ + if (!frozen + && sysfs_get_str(&a->info, NULL, "sync_action", buf, sizeof(buf)) > 0 + && strncmp(buf, "idle", 4) != 0) + frozen = 1; + + if (mdstat->level) { + int level = map_name(pers, mdstat->level); + if (level == 0 || level == LEVEL_LINEAR) { + a->to_remove = 1; + wakeup_monitor(); + return; + } + else if (a->info.array.level != level && level > 0) { + struct active_array *newa = duplicate_aa(a); + if (newa) { + newa->info.array.level = level; + replace_array(container, a, newa); + a = newa; + } + } + } + + /* we are after monitor kick, + * so container field can be cleared - check it again + */ + if (a->container == NULL) + return; + + if (sigterm && a->info.safe_mode_delay != 1) { + sysfs_set_safemode(&a->info, 1); + a->info.safe_mode_delay = 1; + } + + /* We don't check the array while any update is pending, as it + * might contain a change (such as a spare assignment) which + * could affect our decisions. + */ + if (a->check_degraded && !frozen && + update_queue == NULL && update_queue_pending == NULL) { + struct metadata_update *updates = NULL; + struct mdinfo *newdev = NULL; + struct active_array *newa; + struct mdinfo *d; + + a->check_degraded = 0; + + /* The array may not be degraded, this is just a good time + * to check.
+ */ + newdev = container->ss->activate_spare(a, &updates); + if (!newdev) + return; + + newa = duplicate_aa(a); + if (!newa) + goto out; + /* prevent the kernel from activating the disk(s) before we + * finish adding them + */ + dprintf("freezing %s\n", a->info.sys_name); + sysfs_set_str(&a->info, NULL, "sync_action", "frozen"); + + /* Add device to array and set offset/size/slot. + * and open files for each newdev */ + for (d = newdev; d ; d = d->next) { + struct mdinfo *newd; + + newd = xmalloc(sizeof(*newd)); + if (sysfs_add_disk(&newa->info, d, 0) < 0) { + free(newd); + continue; + } + disk_init_and_add(newd, d, newa); + } + queue_metadata_update(updates); + updates = NULL; + while (update_queue_pending || update_queue) { + check_update_queue(container); + usleep(15*1000); + } + replace_array(container, a, newa); + if (sysfs_set_str(&a->info, NULL, "sync_action", "recover") + == 0) + newa->prev_action = recover; + dprintf("recovery started on %s\n", a->info.sys_name); + out: + while (newdev) { + d = newdev->next; + free(newdev); + newdev = d; + } + free_updates(&updates); + } + + if (a->check_reshape) { + /* mdadm might have added some devices to the array. + * We want to disk_init_and_add any such device to a + * duplicate_aa and replace a with that. + * mdstat doesn't have enough info so we sysfs_read + * and look for new stuff. 
+ */ + struct mdinfo *info, *d, *d2, *newd; + unsigned long long array_size; + struct active_array *newa = NULL; + a->check_reshape = 0; + info = sysfs_read(-1, mdstat->devnm, + GET_DEVS|GET_OFFSET|GET_SIZE|GET_STATE); + if (!info) + goto out2; + for (d = info->devs; d; d = d->next) { + if (d->disk.raid_disk < 0) + continue; + for (d2 = a->info.devs; d2; d2 = d2->next) + if (d2->disk.raid_disk == + d->disk.raid_disk) + break; + if (d2) + /* already have this one */ + continue; + if (!newa) { + newa = duplicate_aa(a); + if (!newa) + break; + } + newd = xmalloc(sizeof(*newd)); + disk_init_and_add(newd, d, newa); + } + if (sysfs_get_ll(info, NULL, "array_size", &array_size) == 0 + && a->info.custom_array_size > array_size*2) { + sysfs_set_num(info, NULL, "array_size", + a->info.custom_array_size/2); + } + out2: + sysfs_free(info); + if (newa) + replace_array(container, a, newa); + } +} + +static int aa_ready(struct active_array *aa) +{ + struct mdinfo *d; + int level = aa->info.array.level; + + for (d = aa->info.devs; d; d = d->next) + if (d->state_fd < 0) + return 0; + + if (aa->info.state_fd < 0) + return 0; + + if (level > 0 && (aa->action_fd < 0 || aa->resync_start_fd < 0)) + return 0; + + if (!aa->container) + return 0; + + return 1; +} + +static void manage_new(struct mdstat_ent *mdstat, + struct supertype *container, + struct active_array *victim) +{ + /* A new array has appeared in this container. + * Hopefully it is already recorded in the metadata. + * Check, then create the new array to report it to + * the monitor. 
+ */ + + struct active_array *new; + struct mdinfo *mdi, *di; + char *inst; + int i; + int failed = 0; + char buf[40]; + + /* check if array is ready to be monitored */ + if (!mdstat->active || !mdstat->level) + return; + if (strcmp(mdstat->level, "raid0") == 0 || + strcmp(mdstat->level, "linear") == 0) + return; + + mdi = sysfs_read(-1, mdstat->devnm, + GET_LEVEL|GET_CHUNK|GET_DISKS|GET_COMPONENT| + GET_DEGRADED|GET_SAFEMODE| + GET_DEVS|GET_OFFSET|GET_SIZE|GET_STATE|GET_LAYOUT); + + if (!mdi) + return; + new = xcalloc(1, sizeof(*new)); + + strcpy(new->info.sys_name, mdstat->devnm); + + new->prev_state = new->curr_state = new->next_state = inactive; + new->prev_action= new->curr_action= new->next_action= idle; + + new->container = container; + + inst = to_subarray(mdstat, container->devnm); + + new->info.array = mdi->array; + new->info.component_size = mdi->component_size; + + for (i = 0; i < new->info.array.raid_disks; i++) { + struct mdinfo *newd = xmalloc(sizeof(*newd)); + + for (di = mdi->devs; di; di = di->next) + if (i == di->disk.raid_disk) + break; + + if (disk_init_and_add(newd, di, new) != 0) { + if (newd) + free(newd); + + failed++; + if (failed > new->info.array.failed_disks) { + /* we cannot properly monitor without all working disks */ + new->container = NULL; + break; + } + } + } + + new->action_fd = sysfs_open2(new->info.sys_name, NULL, "sync_action"); + new->info.state_fd = sysfs_open2(new->info.sys_name, NULL, "array_state"); + new->resync_start_fd = sysfs_open2(new->info.sys_name, NULL, "resync_start"); + new->metadata_fd = sysfs_open2(new->info.sys_name, NULL, "metadata_version"); + new->sync_completed_fd = sysfs_open2(new->info.sys_name, NULL, "sync_completed"); + + dprintf("inst: %s action: %d state: %d\n", inst, + new->action_fd, new->info.state_fd); + + if (sigterm) + new->info.safe_mode_delay = 1; + else if (mdi->safe_mode_delay >= 50) + /* Normal start, mdadm set this. 
*/ + new->info.safe_mode_delay = mdi->safe_mode_delay; + else + /* Restart, just pick a number */ + new->info.safe_mode_delay = 5000; + sysfs_set_safemode(&new->info, new->info.safe_mode_delay); + + /* reshape_position is set by mdadm in sysfs + * read this information for new arrays only (empty victim) + */ + if ((victim == NULL) && + (sysfs_get_str(mdi, NULL, "sync_action", buf, 40) > 0) && + (strncmp(buf, "reshape", 7) == 0)) { + if (sysfs_get_ll(mdi, NULL, "reshape_position", + &new->last_checkpoint) != 0) + new->last_checkpoint = 0; + else { + int data_disks = mdi->array.raid_disks; + if (mdi->array.level == 4 || mdi->array.level == 5) + data_disks--; + if (mdi->array.level == 6) + data_disks -= 2; + + new->last_checkpoint /= data_disks; + } + dprintf("mdmon: New monitored array is under reshape.\n" + " Last checkpoint is: %llu\n", + new->last_checkpoint); + } + + sysfs_free(mdi); + + /* if everything checks out tell the metadata handler we want to + * manage this instance + */ + if (!aa_ready(new) || container->ss->open_new(container, new, inst) < 0) { + pr_err("failed to monitor %s\n", + mdstat->metadata_version); + new->container = NULL; + free_aa(new); + } else { + replace_array(container, victim, new); + if (failed) { + new->check_degraded = 1; + manage_member(mdstat, new); + } + } +} + +void manage(struct mdstat_ent *mdstat, struct supertype *container) +{ + /* We have just read mdstat and need to compare it with + * the known active arrays. + * Arrays with the wrong metadata are ignored. 
+ */ + + for ( ; mdstat ; mdstat = mdstat->next) { + struct active_array *a; + if (strcmp(mdstat->devnm, container->devnm) == 0) { + manage_container(mdstat, container); + continue; + } + if (!is_container_member(mdstat, container->devnm)) + /* Not for this array */ + continue; + /* Looks like a member of this container */ + for (a = container->arrays; a; a = a->next) { + if (strcmp(mdstat->devnm, a->info.sys_name) == 0) { + if (a->container && a->to_remove == 0) + manage_member(mdstat, a); + break; + } + } + if (a == NULL || !a->container) + manage_new(mdstat, container, a); + } +} + +static void handle_message(struct supertype *container, struct metadata_update *msg) +{ + /* queue this metadata update through to the monitor */ + + struct metadata_update *mu; + + if (msg->len <= 0) + while (update_queue_pending || update_queue) { + check_update_queue(container); + usleep(15*1000); + } + + if (msg->len == 0) { /* ping_monitor */ + int cnt; + + cnt = monitor_loop_cnt; + if (cnt & 1) + cnt += 2; /* wait until next pselect */ + else + cnt += 3; /* wait for 2 pselects */ + wakeup_monitor(); + + while (monitor_loop_cnt - cnt < 0) + usleep(10 * 1000); + } else if (msg->len == -1) { /* ping_manager */ + struct mdstat_ent *mdstat = mdstat_read(1, 0); + + manage(mdstat, container); + free_mdstat(mdstat); + } else if (!sigterm) { + mu = xmalloc(sizeof(*mu)); + mu->len = msg->len; + mu->buf = msg->buf; + msg->buf = NULL; + mu->space = NULL; + mu->space_list = NULL; + mu->next = NULL; + if (container->ss->prepare_update) + if (!container->ss->prepare_update(container, mu)) + free_updates(&mu); + queue_metadata_update(mu); + } +} + +void read_sock(struct supertype *container) +{ + int fd; + struct metadata_update msg; + int terminate = 0; + long fl; + int tmo = 3; /* 3 second timeout before hanging up the socket */ + + fd = accept(container->sock, NULL, NULL); + if (fd < 0) + return; + + fl = fcntl(fd, F_GETFL, 0); + fl |= O_NONBLOCK; + fcntl(fd, F_SETFL, fl); + + do { + 
msg.buf = NULL; + + /* read and validate the message */ + if (receive_message(fd, &msg, tmo) == 0) { + handle_message(container, &msg); + if (msg.len == 0) { + /* ping reply with version */ + msg.buf = Version; + msg.len = strlen(Version) + 1; + if (send_message(fd, &msg, tmo) < 0) + terminate = 1; + } else if (ack(fd, tmo) < 0) + terminate = 1; + } else + terminate = 1; + + } while (!terminate); + + close(fd); +} + +int exit_now = 0; +int manager_ready = 0; +void do_manager(struct supertype *container) +{ + struct mdstat_ent *mdstat; + sigset_t set; + + sigprocmask(SIG_UNBLOCK, NULL, &set); + sigdelset(&set, SIGUSR1); + sigdelset(&set, SIGTERM); + + do { + + if (exit_now) + exit(0); + + /* Can only 'manage' things if 'monitor' is not making + * structural changes to metadata, so need to check + * update_queue + */ + if (update_queue == NULL) { + mdstat = mdstat_read(1, 0); + + manage(mdstat, container); + + read_sock(container); + + free_mdstat(mdstat); + } + remove_old(); + + check_update_queue(container); + + manager_ready = 1; + + if (sigterm) + wakeup_monitor(); + + if (update_queue == NULL) + mdstat_wait_fd(container->sock, &set); + else + /* If an update is happening, just wait for signal */ + pselect(0, NULL, NULL, NULL, NULL, &set); + } while(1); +} diff --git a/mapfile.c b/mapfile.c new file mode 100644 index 00000000..243ded18 --- /dev/null +++ b/mapfile.c @@ -0,0 +1,508 @@ +/* + * mapfile - keep track of uuid <-> array mapping. Part of: + * mdadm - manage Linux "md" devices aka RAID arrays. + * + * Copyright (C) 2006-2010 Neil Brown + * + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Author: Neil Brown + * Email: + * Paper: Neil Brown + * Novell Inc + * GPO Box Q1283 + * QVB Post Office, NSW 1230 + * Australia + */ + +/* The mapfile is used to track arrays being created in --incremental + * mode. It particularly allows lookup from UUID to array device, but + * also allows the array device name to be easily found. + * + * The map file is line based with space separated fields. The fields are: + * Device id - mdX or mdpX where X is a number. + * metadata - 0.90 1.0 1.1 1.2 ddf ... + * UUID - uuid of the array + * path - path where device created: /dev/md/home + * + * The best place for the mapfile is /run/mdadm/map. Distros and users + * which have not switched to /run yet can choose a different location + * at compile time via MAP_DIR and MAP_FILE. + */ +#include "mdadm.h" +#include +#include + +#define MAP_READ 0 +#define MAP_NEW 1 +#define MAP_LOCK 2 +#define MAP_DIRNAME 3 + +char *mapname[4] = { + MAP_DIR "/" MAP_FILE, + MAP_DIR "/" MAP_FILE ".new", + MAP_DIR "/" MAP_FILE ".lock", + MAP_DIR +}; + +int mapmode[3] = { O_RDONLY, O_RDWR|O_CREAT, O_RDWR|O_CREAT|O_TRUNC }; +char *mapsmode[3] = { "r", "w", "w"}; + +FILE *open_map(int modenum) +{ + int fd; + if ((mapmode[modenum] & O_CREAT)) + /* Attempt to create directory, don't worry about + * failure. 
+ */ + (void)mkdir(mapname[MAP_DIRNAME], 0755); + fd = open(mapname[modenum], mapmode[modenum], 0600); + if (fd >= 0) + return fdopen(fd, mapsmode[modenum]); + return NULL; +} + +int map_write(struct map_ent *mel) +{ + FILE *f; + int err; + + f = open_map(MAP_NEW); + + if (!f) + return 0; + for (; mel; mel = mel->next) { + if (mel->bad) + continue; + fprintf(f, "%s ", mel->devnm); + fprintf(f, "%s ", mel->metadata); + fprintf(f, "%08x:%08x:%08x:%08x ", mel->uuid[0], + mel->uuid[1], mel->uuid[2], mel->uuid[3]); + fprintf(f, "%s\n", mel->path?:""); + } + fflush(f); + err = ferror(f); + fclose(f); + if (err) { + unlink(mapname[1]); + return 0; + } + return rename(mapname[1], + mapname[0]) == 0; +} + +static FILE *lf = NULL; +int map_lock(struct map_ent **melp) +{ + while (lf == NULL) { + struct stat buf; + lf = open_map(MAP_LOCK); + if (lf == NULL) + return -1; + if (flock(fileno(lf), LOCK_EX) != 0) { + fclose(lf); + lf = NULL; + return -1; + } + if (fstat(fileno(lf), &buf) != 0 || + buf.st_nlink == 0) { + /* The owner of the lock unlinked it, + * so we have a lock on a stale file, + * try again + */ + fclose(lf); + lf = NULL; + } + } + if (*melp) + map_free(*melp); + map_read(melp); + return 0; +} + +void map_unlock(struct map_ent **melp) +{ + if (lf) { + /* must unlink before closing the file, + * as only the owner of the lock may + * unlink the file + */ + unlink(mapname[2]); + fclose(lf); + } + lf = NULL; +} + +void map_fork(void) +{ + /* We are forking, so must close the lock file. + * Don't risk flushing anything though. + */ + if (lf) { + close(fileno(lf)); + fclose(lf); + lf = NULL; + } +} + +void map_add(struct map_ent **melp, + char * devnm, char *metadata, int uuid[4], char *path) +{ + struct map_ent *me = xmalloc(sizeof(*me)); + + strcpy(me->devnm, devnm); + strcpy(me->metadata, metadata); + memcpy(me->uuid, uuid, 16); + me->path = path ? 
xstrdup(path) : NULL; + me->next = *melp; + me->bad = 0; + *melp = me; +} + +void map_read(struct map_ent **melp) +{ + FILE *f; + char buf[8192]; + char path[201]; + int uuid[4]; + char devnm[32]; + char metadata[30]; + + *melp = NULL; + + f = open_map(MAP_READ); + if (!f) { + RebuildMap(); + f = open_map(MAP_READ); + } + if (!f) + return; + + while (fgets(buf, sizeof(buf), f)) { + path[0] = 0; /* field widths below bound devnm[32], metadata[30], path[201] */ + if (sscanf(buf, " %31s %29s %x:%x:%x:%x %200s", + devnm, metadata, uuid, uuid+1, + uuid+2, uuid+3, path) >= 7) { + map_add(melp, devnm, metadata, uuid, path); + } + } + fclose(f); +} + +void map_free(struct map_ent *map) +{ + while (map) { + struct map_ent *mp = map; + map = mp->next; + free(mp->path); + free(mp); + } +} + +int map_update(struct map_ent **mpp, char *devnm, char *metadata, + int *uuid, char *path) +{ + struct map_ent *map, *mp; + int rv; + + if (mpp && *mpp) + map = *mpp; + else + map_read(&map); + + for (mp = map ; mp ; mp=mp->next) + if (strcmp(mp->devnm, devnm) == 0) { + strcpy(mp->metadata, metadata); + memcpy(mp->uuid, uuid, 16); + free(mp->path); + mp->path = path ?
xstrdup(path) : NULL; + mp->bad = 0; + break; + } + if (!mp) + map_add(&map, devnm, metadata, uuid, path); + if (mpp) + *mpp = NULL; + rv = map_write(map); + map_free(map); + return rv; +} + +void map_delete(struct map_ent **mapp, char *devnm) +{ + struct map_ent *mp; + + if (*mapp == NULL) + map_read(mapp); + + for (mp = *mapp; mp; mp = *mapp) { + if (strcmp(mp->devnm, devnm) == 0) { + *mapp = mp->next; + free(mp->path); + free(mp); + } else + mapp = & mp->next; + } +} + +void map_remove(struct map_ent **mapp, char *devnm) +{ + if (devnm[0] == 0) + return; + + map_delete(mapp, devnm); + map_write(*mapp); + map_free(*mapp); +} + +struct map_ent *map_by_uuid(struct map_ent **map, int uuid[4]) +{ + struct map_ent *mp; + if (!*map) + map_read(map); + + for (mp = *map ; mp ; mp = mp->next) { + if (memcmp(uuid, mp->uuid, 16) != 0) + continue; + if (!mddev_busy(mp->devnm)) { + mp->bad = 1; + continue; + } + return mp; + } + return NULL; +} + +struct map_ent *map_by_devnm(struct map_ent **map, char *devnm) +{ + struct map_ent *mp; + if (!*map) + map_read(map); + + for (mp = *map ; mp ; mp = mp->next) { + if (strcmp(mp->devnm, devnm) != 0) + continue; + if (!mddev_busy(mp->devnm)) { + mp->bad = 1; + continue; + } + return mp; + } + return NULL; +} + +struct map_ent *map_by_name(struct map_ent **map, char *name) +{ + struct map_ent *mp; + if (!*map) + map_read(map); + + for (mp = *map ; mp ; mp = mp->next) { + if (!mp->path) + continue; + if (strncmp(mp->path, "/dev/md/", 8) != 0) + continue; + if (strcmp(mp->path+8, name) != 0) + continue; + if (!mddev_busy(mp->devnm)) { + mp->bad = 1; + continue; + } + return mp; + } + return NULL; +} + +/* sets the proper subarray and container_dev according to the metadata + * version super_by_fd does this automatically, this routine is meant as + * a supplement for guess_super() + */ +static char *get_member_info(struct mdstat_ent *ent) +{ + + if (ent->metadata_version == NULL || + strncmp(ent->metadata_version, "external:", 9) != 0) + 
return NULL; + + if (is_subarray(&ent->metadata_version[9])) { + char *subarray; + + subarray = strrchr(ent->metadata_version, '/'); + return subarray + 1; + } + return NULL; +} + +void RebuildMap(void) +{ + struct mdstat_ent *mdstat = mdstat_read(0, 0); + struct mdstat_ent *md; + struct map_ent *map = NULL; + int require_homehost; + char sys_hostname[256]; + char *homehost = conf_get_homehost(&require_homehost); + + if (homehost == NULL || strcmp(homehost, "")==0) { + if (gethostname(sys_hostname, sizeof(sys_hostname)) == 0) { + sys_hostname[sizeof(sys_hostname)-1] = 0; + homehost = sys_hostname; + } + } + + for (md = mdstat ; md ; md = md->next) { + struct mdinfo *sra = sysfs_read(-1, md->devnm, GET_DEVS); + struct mdinfo *sd; + + if (!sra) + continue; + + for (sd = sra->devs ; sd ; sd = sd->next) { + char namebuf[100]; + char dn[30]; + int dfd; + int ok; + int devid; + struct supertype *st; + char *subarray = NULL; + char *path; + struct mdinfo *info; + + sprintf(dn, "%d:%d", sd->disk.major, sd->disk.minor); + dfd = dev_open(dn, O_RDONLY); + if (dfd < 0) + continue; + st = guess_super(dfd); + if ( st == NULL) + ok = -1; + else { + subarray = get_member_info(md); + ok = st->ss->load_super(st, dfd, NULL); + } + close(dfd); + if (ok != 0) + continue; + if (subarray) + info = st->ss->container_content(st, subarray); + else { + info = xmalloc(sizeof(*info)); + st->ss->getinfo_super(st, info, NULL); + } + if (!info) + continue; + + devid = devnm2devid(md->devnm); + path = map_dev(major(devid), minor(devid), 0); + if (path == NULL || + strncmp(path, "/dev/md/", 8) != 0) { + /* We would really like a name that provides + * an MD_DEVNAME for udev. + * The name needs to be unique both in /dev/md/ + * and in this mapfile. + * It needs to match what -I or -As would come + * up with. + * That means: + * Check if array is in mdadm.conf + * - if so use that. + * determine trustworthy from homehost etc + * find a unique name based on metadata name. 
+ * + */ + struct mddev_ident *match = conf_match(st, info, + NULL, 0, + NULL); + struct stat stb; + if (match && match->devname && match->devname[0] == '/') { + path = match->devname; + if (path[0] != '/') { + strcpy(namebuf, "/dev/md/"); + strcat(namebuf, path); + path = namebuf; + } + } else { + int unum = 0; + char *sep = "_"; + const char *name; + int conflict = 1; + if ((homehost == NULL || + st->ss->match_home(st, homehost) != 1) && + st->ss->match_home(st, "any") != 1 && + (require_homehost + || ! conf_name_is_free(info->name))) + /* require a numeric suffix */ + unum = 0; + else + /* allow name to be used as-is if no conflict */ + unum = -1; + name = info->name; + if (!*name) { + name = st->ss->name; + if (!isdigit(name[strlen(name)-1]) && + unum == -1) { + unum = 0; + sep = ""; + } + } + if (strchr(name, ':')) { + /* Probably a uniquifying + * hostname prefix. Allow + * without a suffix, and strip + * hostname if it is us. + */ + if (homehost && unum == -1 && + strncmp(name, homehost, + strlen(homehost)) == 0 && + name[strlen(homehost)] == ':') + name += strlen(homehost)+1; + unum = -1; + } + + while (conflict) { + if (unum >= 0) + sprintf(namebuf, "/dev/md/%s%s%d", + name, sep, unum); + else + sprintf(namebuf, "/dev/md/%s", + name); + unum++; + if (lstat(namebuf, &stb) != 0 && + (map == NULL || + !map_by_name(&map, namebuf+8))) + conflict = 0; + } + path = namebuf; + } + } + map_add(&map, md->devnm, + info->text_version, + info->uuid, path); + st->ss->free_super(st); + free(info); + break; + } + sysfs_free(sra); + } + /* Only trigger a change if we wrote a new map file */ + if (map_write(map)) + for (md = mdstat ; md ; md = md->next) { + struct mdinfo *sra = sysfs_read(-1, md->devnm, + GET_VERSION); + if (sra) + sysfs_uevent(sra, "change"); + sysfs_free(sra); + } + map_free(map); + free_mdstat(mdstat); +} diff --git a/maps.c b/maps.c new file mode 100644 index 00000000..64f1df2c --- /dev/null +++ b/maps.c @@ -0,0 +1,150 @@ +/* + * mdadm - manage Linux 
"md" devices aka RAID arrays. + * + * Copyright (C) 2011 Neil Brown + * + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Author: Neil Brown + * Email: + */ + +#include "mdadm.h" + +/* name/number mappings */ + +mapping_t r5layout[] = { + { "left-asymmetric", ALGORITHM_LEFT_ASYMMETRIC}, + { "right-asymmetric", ALGORITHM_RIGHT_ASYMMETRIC}, + { "left-symmetric", ALGORITHM_LEFT_SYMMETRIC}, + { "right-symmetric", ALGORITHM_RIGHT_SYMMETRIC}, + + { "default", ALGORITHM_LEFT_SYMMETRIC}, + { "la", ALGORITHM_LEFT_ASYMMETRIC}, + { "ra", ALGORITHM_RIGHT_ASYMMETRIC}, + { "ls", ALGORITHM_LEFT_SYMMETRIC}, + { "rs", ALGORITHM_RIGHT_SYMMETRIC}, + + { "parity-first", ALGORITHM_PARITY_0}, + { "parity-last", ALGORITHM_PARITY_N}, + { "ddf-zero-restart", ALGORITHM_RIGHT_ASYMMETRIC}, + { "ddf-N-restart", ALGORITHM_LEFT_ASYMMETRIC}, + { "ddf-N-continue", ALGORITHM_LEFT_SYMMETRIC}, + + { NULL, 0} +}; +mapping_t r6layout[] = { + { "left-asymmetric", ALGORITHM_LEFT_ASYMMETRIC}, + { "right-asymmetric", ALGORITHM_RIGHT_ASYMMETRIC}, + { "left-symmetric", ALGORITHM_LEFT_SYMMETRIC}, + { "right-symmetric", ALGORITHM_RIGHT_SYMMETRIC}, + + { "default", ALGORITHM_LEFT_SYMMETRIC}, + { "la", ALGORITHM_LEFT_ASYMMETRIC}, + { "ra", ALGORITHM_RIGHT_ASYMMETRIC}, + { "ls", ALGORITHM_LEFT_SYMMETRIC}, + { "rs", 
ALGORITHM_RIGHT_SYMMETRIC}, + + { "parity-first", ALGORITHM_PARITY_0}, + { "parity-last", ALGORITHM_PARITY_N}, + { "ddf-zero-restart", ALGORITHM_ROTATING_ZERO_RESTART}, + { "ddf-N-restart", ALGORITHM_ROTATING_N_RESTART}, + { "ddf-N-continue", ALGORITHM_ROTATING_N_CONTINUE}, + + { "left-asymmetric-6", ALGORITHM_LEFT_ASYMMETRIC_6}, + { "right-asymmetric-6", ALGORITHM_RIGHT_ASYMMETRIC_6}, + { "left-symmetric-6", ALGORITHM_LEFT_SYMMETRIC_6}, + { "right-symmetric-6", ALGORITHM_RIGHT_SYMMETRIC_6}, + { "parity-first-6", ALGORITHM_PARITY_0_6}, + + { NULL, 0} +}; + +mapping_t pers[] = { + { "linear", LEVEL_LINEAR}, + { "raid0", 0}, + { "0", 0}, + { "stripe", 0}, + { "raid1", 1}, + { "1", 1}, + { "mirror", 1}, + { "raid4", 4}, + { "4", 4}, + { "raid5", 5}, + { "5", 5}, + { "multipath", LEVEL_MULTIPATH}, + { "mp", LEVEL_MULTIPATH}, + { "raid6", 6}, + { "6", 6}, + { "raid10", 10}, + { "10", 10}, + { "faulty", LEVEL_FAULTY}, + { "container", LEVEL_CONTAINER}, + { NULL, 0} +}; + +mapping_t modes[] = { + { "assemble", ASSEMBLE}, + { "build", BUILD}, + { "create", CREATE}, + { "manage", MANAGE}, + { "misc", MISC}, + { "monitor", MONITOR}, + { "grow", GROW}, + { "incremental", INCREMENTAL}, + { "auto-detect", AUTODETECT}, +}; + +mapping_t faultylayout[] = { + { "write-transient", WriteTransient }, + { "wt", WriteTransient }, + { "read-transient", ReadTransient }, + { "rt", ReadTransient }, + { "write-persistent", WritePersistent }, + { "wp", WritePersistent }, + { "read-persistent", ReadPersistent }, + { "rp", ReadPersistent }, + { "write-all", WriteAll }, + { "wa", WriteAll }, + { "read-fixable", ReadFixable }, + { "rf", ReadFixable }, + + { "clear", ClearErrors}, + { "flush", ClearFaults}, + { "none", ClearErrors}, + { "default", ClearErrors}, + { NULL, 0} +}; + +char *map_num(mapping_t *map, int num) +{ + while (map->name) { + if (map->num == num) + return map->name; + map++; + } + return NULL; +} + +int map_name(mapping_t *map, char *name) +{ + while (map->name) { + if 
(strcmp(map->name, name)==0) + return map->num; + map++; + } + return UnSet; +} diff --git a/md.4 b/md.4 new file mode 100644 index 00000000..f1b88ee6 --- /dev/null +++ b/md.4 @@ -0,0 +1,1145 @@ +.\" Copyright Neil Brown and others. +.\" This program is free software; you can redistribute it and/or modify +.\" it under the terms of the GNU General Public License as published by +.\" the Free Software Foundation; either version 2 of the License, or +.\" (at your option) any later version. +.\" See file COPYING in distribution for details. +.TH MD 4 +.SH NAME +md \- Multiple Device driver aka Linux Software RAID +.SH SYNOPSIS +.BI /dev/md n +.br +.BI /dev/md/ n +.br +.BR /dev/md/ name +.SH DESCRIPTION +The +.B md +driver provides virtual devices that are created from one or more +independent underlying devices. This array of devices often contains +redundancy and the devices are often disk drives, hence the acronym RAID +which stands for a Redundant Array of Independent Disks. +.PP +.B md +supports RAID levels +1 (mirroring), +4 (striped array with parity device), +5 (striped array with distributed parity information), +6 (striped array with distributed dual redundancy information), and +10 (striped and mirrored). +If some number of underlying devices fails while using one of these +levels, the array will continue to function; this number is one for +RAID levels 4 and 5, two for RAID level 6, and all but one (N-1) for +RAID level 1, and dependent on configuration for level 10. +.PP +.B md +also supports a number of pseudo RAID (non-redundant) configurations +including RAID0 (striped array), LINEAR (catenated array), +MULTIPATH (a set of different interfaces to the same device), +and FAULTY (a layer over a single device into which errors can be injected). + +.SS MD METADATA +Each device in an array may have some +.I metadata +stored in the device. This metadata is sometimes called a +.BR superblock . 
+The metadata records information about the structure and state of the array. +This allows the array to be reliably re-assembled after a shutdown. + +From Linux kernel version 2.6.10, +.B md +provides support for two different formats of metadata, and +other formats can be added. Prior to this release, only one format is +supported. + +The common format \(em known as version 0.90 \(em has +a superblock that is 4K long and is written into a 64K aligned block that +starts at least 64K and less than 128K from the end of the device +(i.e. to get the address of the superblock round the size of the +device down to a multiple of 64K and then subtract 64K). +The available size of each device is the amount of space before the +super block, so between 64K and 128K is lost when a device is +incorporated into an MD array. +This superblock stores multi-byte fields in a processor-dependent +manner, so arrays cannot easily be moved between computers with +different processors. + +The new format \(em known as version 1 \(em has a superblock that is +normally 1K long, but can be longer. It is normally stored between 8K +and 12K from the end of the device, on a 4K boundary, though +variations can be stored at the start of the device (version 1.1) or 4K from +the start of the device (version 1.2). +This metadata format stores multibyte data in a +processor-independent format and supports up to hundreds of +component devices (version 0.90 only supports 28). + +The metadata contains, among other things: +.TP +LEVEL +The manner in which the devices are arranged into the array +(LINEAR, RAID0, RAID1, RAID4, RAID5, RAID10, MULTIPATH). +.TP +UUID +a 128 bit Universally Unique Identifier that identifies the array that +contains this device. + +.PP +When a version 0.90 array is being reshaped (e.g. adding extra devices +to a RAID5), the version number is temporarily set to 0.91. This +ensures that if the reshape process is stopped in the middle (e.g.
by +a system crash) and the machine boots into an older kernel that does +not support reshaping, then the array will not be assembled (which +would cause data corruption) but will be left untouched until a kernel +that can complete the reshape process is used. + +.SS ARRAYS WITHOUT METADATA +While it is usually best to create arrays with superblocks so that +they can be assembled reliably, there are some circumstances when an +array without superblocks is preferred. These include: +.TP +LEGACY ARRAYS +Early versions of the +.B md +driver only supported LINEAR and RAID0 configurations and did not use +a superblock (which is less critical with these configurations). +While such arrays should be rebuilt with superblocks if possible, +.B md +continues to support them. +.TP +FAULTY +Being a largely transparent layer over a different device, the FAULTY +personality doesn't gain anything from having a superblock. +.TP +MULTIPATH +It is often possible to detect devices which are different paths to +the same storage directly rather than having a distinctive superblock +written to the device and searched for on all paths. In this case, +a MULTIPATH array with no superblock makes sense. +.TP +RAID1 +In some configurations it might be desired to create a RAID1 +configuration that does not use a superblock, and to maintain the state of +the array elsewhere. While not encouraged for general use, it does +have special-purpose uses and is supported. + +.SS ARRAYS WITH EXTERNAL METADATA + +From release 2.6.28, the +.I md +driver supports arrays with externally managed metadata. That is, +the metadata is not managed by the kernel but rather by a user-space +program which is external to the kernel. This allows support for a +variety of metadata formats without cluttering the kernel with lots of +details.
+.PP +.I md +is able to communicate with the user-space program through various +sysfs attributes so that it can make appropriate changes to the +metadata \- for example to mark a device as faulty. When necessary, +.I md +will wait for the program to acknowledge the event by writing to a +sysfs attribute. +The manual page for +.IR mdmon (8) +contains more detail about this interaction. + +.SS CONTAINERS +Many metadata formats use a single block of metadata to describe a +number of different arrays which all use the same set of devices. +In this case it is helpful for the kernel to know about the full set +of devices as a whole. This set is known to md as a +.IR container . +A container is an +.I md +array with externally managed metadata and with device offset and size +so that it just covers the metadata part of the devices. The +remainder of each device is available to be incorporated into various +arrays. + +.SS LINEAR + +A LINEAR array simply catenates the available space on each +drive to form one large virtual drive. + +One advantage of this arrangement over the more common RAID0 +arrangement is that the array may be reconfigured at a later time with +an extra drive, so the array is made bigger without disturbing the +data that is on the array. This can even be done on a live +array. + +If a chunksize is given with a LINEAR array, the usable space on each +device is rounded down to a multiple of this chunksize. + +.SS RAID0 + +A RAID0 array (which has zero redundancy) is also known as a +striped array. +A RAID0 array is configured at creation with a +.B "Chunk Size" +which must be a power of two (prior to Linux 2.6.31), and at least 4 +kibibytes. + +The RAID0 driver assigns the first chunk of the array to the first +device, the second chunk to the second device, and so on until all +drives have been assigned one chunk. This collection of chunks forms a +.BR stripe . 
+Further chunks are gathered into stripes in the same way, and are +assigned to the remaining space in the drives. + +If devices in the array are not all the same size, then once the +smallest device has been exhausted, the RAID0 driver starts +collecting chunks into smaller stripes that only span the drives which +still have remaining space. + + +.SS RAID1 + +A RAID1 array is also known as a mirrored set (though mirrors tend to +provide reflected images, which RAID1 does not) or a plex. + +Once initialised, each device in a RAID1 array contains exactly the +same data. Changes are written to all devices in parallel. Data is +read from any one device. The driver attempts to distribute read +requests across all devices to maximise performance. + +All devices in a RAID1 array should be the same size. If they are +not, then only the amount of space available on the smallest device is +used (any extra space on other devices is wasted). + +Note that the read balancing done by the driver does not make the RAID1 +performance profile be the same as for RAID0; a single stream of +sequential input will not be accelerated (e.g. a single dd), but +multiple sequential streams or a random workload will use more than one +spindle. In theory, having an N-disk RAID1 will allow N sequential +threads to read from all disks. + +Individual devices in a RAID1 can be marked as "write-mostly". +These drives are excluded from the normal read balancing and will only +be read from when there is no other option. This can be useful for +devices connected over a slow link. + +.SS RAID4 + +A RAID4 array is like a RAID0 array with an extra device for storing +parity. This device is the last of the active devices in the +array. Unlike RAID0, RAID4 also requires that all stripes span all +drives, so extra space on devices that are larger than the smallest is +wasted. + +When any block in a RAID4 array is modified, the parity block for that +stripe (i.e. 
the block in the parity device at the same device offset +as the stripe) is also modified so that the parity block always +contains the "parity" for the whole stripe. I.e. its content is +equivalent to the result of performing an exclusive-or operation +between all the data blocks in the stripe. + +This allows the array to continue to function if one device fails. +The data that was on that device can be calculated as needed from the +parity block and the other data blocks. + +.SS RAID5 + +RAID5 is very similar to RAID4. The difference is that the parity +blocks for each stripe, instead of being on a single device, are +distributed across all devices. This allows more parallelism when +writing, as two different block updates will quite possibly affect +parity blocks on different devices so there is less contention. + +This also allows more parallelism when reading, as read requests are +distributed over all the devices in the array instead of all but one. + +.SS RAID6 + +RAID6 is similar to RAID5, but can handle the loss of any \fItwo\fP +devices without data loss. Accordingly, it requires N+2 drives to +store N drives worth of data. + +The performance for RAID6 is slightly lower but comparable to RAID5 in +normal mode and single disk failure mode. It is very slow in dual +disk failure mode, however. + +.SS RAID10 + +RAID10 provides a combination of RAID1 and RAID0, and is sometimes known +as RAID1+0. Every datablock is duplicated some number of times, and +the resulting collection of datablocks are distributed over multiple +drives. + +When configuring a RAID10 array, it is necessary to specify the number +of replicas of each data block that are required (this will usually +be\ 2) and whether their layout should be "near", "far" or "offset" +(with "offset" being available since Linux\ 2.6.18). + +.B About the RAID10 Layout Examples: +.br +The examples below visualise the chunk distribution on the underlying +devices for the respective layout. 
+ +For simplicity it is assumed that the size of the chunks equals the +size of the blocks of the underlying devices as well as those of the +RAID10 device exported by the kernel (for example \fB/dev/md/\fPname). +.br +Therefore the chunks\ /\ chunk numbers map directly to the blocks\ /\ +block addresses of the exported RAID10 device. + +Decimal numbers (0,\ 1, 2,\ ...) are the chunks of the RAID10 and due +to the above assumption also the blocks and block addresses of the +exported RAID10 device. +.br +Repeated numbers mean copies of a chunk\ /\ block (obviously on +different underlying devices). +.br +Hexadecimal numbers (0x00,\ 0x01, 0x02,\ ...) are the block addresses +of the underlying devices. + +.TP +\fB "near" Layout\fP +When "near" replicas are chosen, the multiple copies of a given chunk are laid +out consecutively ("as close to each other as possible") across the stripes of +the array. + +With an even number of devices, they will likely (unless some misalignment is +present) lay at the very same offset on the different devices. +.br +This is as the "classic" RAID1+0; that is two groups of mirrored devices (in the +example below the groups Device\ #1\ /\ #2 and Device\ #3\ /\ #4 are each a +RAID1) both in turn forming a striped RAID0. + +.ne 10 +.B Example with 2\ copies per chunk and an even number\ (4) of devices: +.TS +tab(;); + C - - - - + C | C | C | C | C | +| - | - | - | - | - | +| C | C | C | C | C | +| C | C | C | C | C | +| C | C | C | C | C | +| C | C | C | C | C | +| C | C | C | C | C | +| C | C | C | C | C | +| - | - | - | - | - | + C C S C S + C C S C S + C C S S S + C C S S S. +; +;Device #1;Device #2;Device #3;Device #4 +0x00;0;0;1;1 +0x01;2;2;3;3 +\.\.\.;\.\.\.;\.\.\.;\.\.\.;\.\.\. +:;:;:;:;: +\.\.\.;\.\.\.;\.\.\.;\.\.\.;\.\.\. 
+0x80;254;254;255;255 +;\\---------v---------/;\\---------v---------/ +;RAID1;RAID1 +;\\---------------------v---------------------/ +;RAID0 +.TE + +.ne 10 +.B Example with 2\ copies per chunk and an odd number\ (5) of devices: +.TS +tab(;); + C - - - - - + C | C | C | C | C | C | +| - | - | - | - | - | - | +| C | C | C | C | C | C | +| C | C | C | C | C | C | +| C | C | C | C | C | C | +| C | C | C | C | C | C | +| C | C | C | C | C | C | +| C | C | C | C | C | C | +| - | - | - | - | - | - | +C. +; +;Dev #1;Dev #2;Dev #3;Dev #4;Dev #5 +0x00;0;0;1;1;2 +0x01;2;3;3;4;4 +\.\.\.;\.\.\.;\.\.\.;\.\.\.;\.\.\.;\.\.\. +:;:;:;:;:;: +\.\.\.;\.\.\.;\.\.\.;\.\.\.;\.\.\.;\.\.\. +0x80;317;318;318;319;319 +; +.TE + +.TP +\fB "far" Layout\fP +When "far" replicas are chosen, the multiple copies of a given chunk +are laid out quite distant ("as far as reasonably possible") from each +other. + +First a complete sequence of all data blocks (that is all the data one +sees on the exported RAID10 block device) is striped over the +devices. Then another (though "shifted") complete sequence of all data +blocks; and so on (in the case of more than 2\ copies per chunk). + +The "shift" needed to prevent placing copies of the same chunks on the +same devices is actually a cyclic permutation with offset\ 1 of each +of the stripes within a complete sequence of chunks. 
+.br +The offset\ 1 is relative to the previous complete sequence of chunks, +so in case of more than 2\ copies per chunk one gets the following +offsets: +.br +1.\ complete sequence of chunks: offset\ =\ \ 0 +.br +2.\ complete sequence of chunks: offset\ =\ \ 1 +.br +3.\ complete sequence of chunks: offset\ =\ \ 2 +.br + : +.br +n.\ complete sequence of chunks: offset\ =\ n-1 + +.ne 10 +.B Example with 2\ copies per chunk and an even number\ (4) of devices: +.TS +tab(;); + C - - - - + C | C | C | C | C | +| - | - | - | - | - | +| C | C | C | C | C | L +| C | C | C | C | C | L +| C | C | C | C | C | L +| C | C | C | C | C | L +| C | C | C | C | C | L +| C | C | C | C | C | L +| C | C | C | C | C | L +| C | C | C | C | C | L +| C | C | C | C | C | L +| C | C | C | C | C | L +| C | C | C | C | C | L +| C | C | C | C | C | L +| - | - | - | - | - | +C. +; +;Device #1;Device #2;Device #3;Device #4 +; +0x00;0;1;2;3;\\ +0x01;4;5;6;7;> [#] +\.\.\.;\.\.\.;\.\.\.;\.\.\.;\.\.\.;: +:;:;:;:;:;: +\.\.\.;\.\.\.;\.\.\.;\.\.\.;\.\.\.;: +0x40;252;253;254;255;/ +0x41;3;0;1;2;\\ +0x42;7;4;5;6;> [#]~ +\.\.\.;\.\.\.;\.\.\.;\.\.\.;\.\.\.;: +:;:;:;:;:;: +\.\.\.;\.\.\.;\.\.\.;\.\.\.;\.\.\.;: +0x80;255;252;253;254;/ +; +.TE + +.ne 10 +.B Example with 2\ copies per chunk and an odd number\ (5) of devices: +.TS +tab(;); + C - - - - - + C | C | C | C | C | C | +| - | - | - | - | - | - | +| C | C | C | C | C | C | L +| C | C | C | C | C | C | L +| C | C | C | C | C | C | L +| C | C | C | C | C | C | L +| C | C | C | C | C | C | L +| C | C | C | C | C | C | L +| C | C | C | C | C | C | L +| C | C | C | C | C | C | L +| C | C | C | C | C | C | L +| C | C | C | C | C | C | L +| C | C | C | C | C | C | L +| C | C | C | C | C | C | L +| - | - | - | - | - | - | +C. 
+; +;Dev #1;Dev #2;Dev #3;Dev #4;Dev #5 +; +0x00;0;1;2;3;4;\\ +0x01;5;6;7;8;9;> [#] +\.\.\.;\.\.\.;\.\.\.;\.\.\.;\.\.\.;\.\.\.;: +:;:;:;:;:;:;: +\.\.\.;\.\.\.;\.\.\.;\.\.\.;\.\.\.;\.\.\.;: +0x40;315;316;317;318;319;/ +0x41;4;0;1;2;3;\\ +0x42;9;5;6;7;8;> [#]~ +\.\.\.;\.\.\.;\.\.\.;\.\.\.;\.\.\.;\.\.\.;: +:;:;:;:;:;:;: +\.\.\.;\.\.\.;\.\.\.;\.\.\.;\.\.\.;\.\.\.;: +0x80;319;315;316;317;318;/ +; +.TE + +With [#]\ being the complete sequence of chunks and [#]~\ the cyclic permutation +with offset\ 1 thereof (in the case of more than 2 copies per chunk there would +be ([#]~)~,\ (([#]~)~)~,\ ...). + +The advantage of this layout is that MD can easily spread sequential reads over +the devices, making them similar to RAID0 in terms of speed. +.br +The cost is more seeking for writes, making them substantially slower. + +.TP +\fB"offset" Layout\fP +When "offset" replicas are chosen, all the copies of a given chunk are +striped consecutively ("offset by the stripe length after each other") +over the devices. + +Explained in detail, consecutive chunks are +striped over the devices, immediately followed by a "shifted" copy of +these chunks (and by further such "shifted" copies in the case of more +than 2\ copies per chunk). +.br +This pattern repeats for all further consecutive chunks of the +exported RAID10 device (in other words: all further data blocks). + +The "shift" needed to prevent placing copies of the same chunks on the +same devices is actually a cyclic permutation with offset\ 1 of each +of the striped copies of consecutive chunks. 
+.br +The offset\ 1 is relative to the previous striped copy of consecutive chunks, so in case of more than 2\ copies per +chunk one gets the following offsets: +.br +1.\ consecutive chunks: offset\ =\ \ 0 +.br +2.\ consecutive chunks: offset\ =\ \ 1 +.br +3.\ consecutive chunks: offset\ =\ \ 2 +.br + : +.br +n.\ consecutive chunks: offset\ =\ n-1 + +.ne 10 +.B Example with 2\ copies per chunk and an even number\ (4) of devices: +.TS +tab(;); + C - - - - + C | C | C | C | C | +| - | - | - | - | - | +| C | C | C | C | C | L +| C | C | C | C | C | L +| C | C | C | C | C | L +| C | C | C | C | C | L +| C | C | C | C | C | L +| C | C | C | C | C | L +| C | C | C | C | C | L +| C | C | C | C | C | L +| C | C | C | C | C | L +| - | - | - | - | - | +C. +; +;Device #1;Device #2;Device #3;Device #4 +; +0x00;0;1;2;3;) AA +0x01;3;0;1;2;) AA~ +0x02;4;5;6;7;) AB +0x03;7;4;5;6;) AB~ +\.\.\.;\.\.\.;\.\.\.;\.\.\.;\.\.\.;) \.\.\. +:;:;:;:;:; : +\.\.\.;\.\.\.;\.\.\.;\.\.\.;\.\.\.;) \.\.\. +0x79;251;252;253;254;) EX +0x80;254;251;252;253;) EX~ +; +.TE + +.ne 10 +.B Example with 2\ copies per chunk and an odd number\ (5) of devices: +.TS +tab(;); + C - - - - - + C | C | C | C | C | C | +| - | - | - | - | - | - | +| C | C | C | C | C | C | L +| C | C | C | C | C | C | L +| C | C | C | C | C | C | L +| C | C | C | C | C | C | L +| C | C | C | C | C | C | L +| C | C | C | C | C | C | L +| C | C | C | C | C | C | L +| C | C | C | C | C | C | L +| C | C | C | C | C | C | L +| - | - | - | - | - | - | +C. +; +;Dev #1;Dev #2;Dev #3;Dev #4;Dev #5 +; +0x00;0;1;2;3;4;) AA +0x01;4;0;1;2;3;) AA~ +0x02;5;6;7;8;9;) AB +0x03;9;5;6;7;8;) AB~ +\.\.\.;\.\.\.;\.\.\.;\.\.\.;\.\.\.;\.\.\.;) \.\.\. +:;:;:;:;:;:; : +\.\.\.;\.\.\.;\.\.\.;\.\.\.;\.\.\.;\.\.\.;) \.\.\. +0x79;314;315;316;317;318;) EX +0x80;318;314;315;316;317;) EX~ +; +.TE + +With AA,\ AB,\ ..., AZ,\ BA,\ ... being the sets of consecutive +chunks and AA~,\ AB~,\ ..., AZ~,\ BA~,\ ... 
the cyclic permutations with offset\ 1 +thereof (in the case of more than 2 copies per chunk there would be (AA~)~,\ ... +as well as ((AA~)~)~,\ ... and so on). + +This should give similar read characteristics to "far" if a suitably large chunk +size is used, but without as much seeking for writes. +.PP + + +It should be noted that the number of devices in a RAID10 array need +not be a multiple of the number of replicas of each data block; however, +there must be at least as many devices as replicas. + +If, for example, an array is created with 5 devices and 2 replicas, +then space equivalent to 2.5 of the devices will be available, and +every block will be stored on two different devices. + +Finally, it is possible to have an array with both "near" and "far" +copies. If an array is configured with 2 near copies and 2 far +copies, then there will be a total of 4 copies of each block, each on +a different drive. This is an artifact of the implementation and is +unlikely to be of real value. + +.SS MULTIPATH + +MULTIPATH is not really a RAID at all as there is only one real device +in a MULTIPATH md array. However there are multiple access points +(paths) to this device, and one of these paths might fail, so there +are some similarities. + +A MULTIPATH array is composed of a number of logically different +devices, often fibre channel interfaces, that all refer to the same +real device. If one of these interfaces fails (e.g. due to cable +problems), the MULTIPATH driver will attempt to redirect requests to +another interface. + +The MULTIPATH driver is not receiving any ongoing development and +should be considered a legacy driver. The device-mapper based +multipath drivers should be preferred for new installations. + +.SS FAULTY +The FAULTY md module is provided for testing purposes.
A FAULTY array +has exactly one component device and is normally assembled without a +superblock, so the md array created provides direct access to all of +the data in the component device. + +The FAULTY module may be requested to simulate faults to allow testing +of other md levels or of filesystems. Faults can be chosen to trigger +on read requests or write requests, and can be transient (a subsequent +read/write at the address will probably succeed) or persistent +(subsequent read/write of the same address will fail). Further, read +faults can be "fixable" meaning that they persist until a write +request at the same address. + +Fault types can be requested with a period. In this case, the fault +will recur repeatedly after the given number of requests of the +relevant type. For example if persistent read faults have a period of +100, then every 100th read request would generate a fault, and the +faulty sector would be recorded so that subsequent reads on that +sector would also fail. + +There is a limit to the number of faulty sectors that are remembered. +Faults generated after this limit is exhausted are treated as +transient. + +The list of faulty sectors can be flushed, and the active list of +failure modes can be cleared. + +.SS UNCLEAN SHUTDOWN + +When changes are made to a RAID1, RAID4, RAID5, RAID6, or RAID10 array +there is a possibility of inconsistency for short periods of time as +each update requires at least two blocks to be written to different +devices, and these writes probably won't happen at exactly the same +time. Thus if a system with one of these arrays is shut down in the +middle of a write operation (e.g. due to power failure), the array may +not be consistent. + +To handle this situation, the md driver marks an array as "dirty" +before writing any data to it, and marks it as "clean" when the array +is being disabled, e.g. at shutdown.
If the md driver finds an array +to be dirty at startup, it proceeds to correct any possible +inconsistency. For RAID1, this involves copying the contents of the +first drive onto all other drives. For RAID4, RAID5 and RAID6 this +involves recalculating the parity for each stripe and making sure that +the parity block has the correct data. For RAID10 it involves copying +one of the replicas of each block onto all the others. This process, +known as "resynchronising" or "resync" is performed in the background. +The array can still be used, though possibly with reduced performance. + +If a RAID4, RAID5 or RAID6 array is degraded (missing at least one +drive, two for RAID6) when it is restarted after an unclean shutdown, it cannot +recalculate parity, and so it is possible that data might be +undetectably corrupted. The 2.4 md driver +.B does not +alert the operator to this condition. The 2.6 md driver will fail to +start an array in this condition without manual intervention, though +this behaviour can be overridden by a kernel parameter. + +.SS RECOVERY + +If the md driver detects a write error on a device in a RAID1, RAID4, +RAID5, RAID6, or RAID10 array, it immediately disables that device +(marking it as faulty) and continues operation on the remaining +devices. If there are spare drives, the driver will start recreating +on one of the spare drives the data which was on that failed drive, +either by copying a working drive in a RAID1 configuration, or by +doing calculations with the parity block on RAID4, RAID5 or RAID6, or +by finding and copying originals for RAID10. + +In kernels prior to about 2.6.15, a read error would cause the same +effect as a write error. In later kernels, a read-error will instead +cause md to attempt a recovery by overwriting the bad block, i.e. it +will find the correct data from elsewhere, write it over the block +that failed, and then try to read it back again.
If either the write +or the re-read fails, md will treat the error the same way that a write +error is treated, and will fail the whole device. + +While this recovery process is happening, the md driver will monitor +accesses to the array and will slow down the rate of recovery if other +activity is happening, so that normal access to the array will not be +unduly affected. When no other activity is happening, the recovery +process proceeds at full speed. The actual speed targets for the two +different situations can be controlled by the +.B speed_limit_min +and +.B speed_limit_max +control files mentioned below. + +.SS SCRUBBING AND MISMATCHES + +As storage devices can develop bad blocks at any time it is valuable +to regularly read all blocks on all devices in an array so as to catch +such bad blocks early. This process is called +.IR scrubbing . + +md arrays can be scrubbed by writing either +.I check +or +.I repair +to the file +.I md/sync_action +in the +.I sysfs +directory for the device. + +Requesting a scrub will cause +.I md +to read every block on every device in the array, and check that the +data is consistent. For RAID1 and RAID10, this means checking that the copies +are identical. For RAID4, RAID5, RAID6 this means checking that the +parity block is (or blocks are) correct. + +If a read error is detected during this process, the normal read-error +handling causes correct data to be found from other devices and to be +written back to the faulty device. In many cases this will +effectively +.I fix +the bad block. + +If all blocks read successfully but are found to not be consistent, +then this is regarded as a +.IR mismatch . + +If +.I check +was used, then no action is taken to handle the mismatch, it is simply +recorded. +If +.I repair +was used, then a mismatch will be repaired in the same way that +.I resync +repairs arrays. For RAID5/RAID6 new parity blocks are written.
For RAID1/RAID10, +all but one block are overwritten with the content of that one block. + +A count of mismatches is recorded in the +.I sysfs +file +.IR md/mismatch_cnt . +This is set to zero when a +scrub starts and is incremented whenever a sector is +found that is a mismatch. +.I md +normally works in units much larger than a single sector and when it +finds a mismatch, it does not determine exactly how many actual sectors were +affected but simply adds the number of sectors in the IO unit that was +used. So a value of 128 could simply mean that a single 64KB check +found an error (128 x 512bytes = 64KB). + +If an array is created by +.I mdadm +with +.I \-\-assume\-clean +then a subsequent check could be expected to find some mismatches. + +On a truly clean RAID5 or RAID6 array, any mismatches should indicate +a hardware problem at some level - software issues should never cause +such a mismatch. + +However on RAID1 and RAID10 it is possible for software issues to +cause a mismatch to be reported. This does not necessarily mean that +the data on the array is corrupted. It could simply be that the +system does not care what is stored on that part of the array - it is +unused space. + +The most likely cause for an unexpected mismatch on RAID1 or RAID10 +occurs if a swap partition or swap file is stored on the array. + +When the swap subsystem wants to write a page of memory out, it flags +the page as 'clean' in the memory manager and requests the swap device +to write it out. It is quite possible that the memory will be +changed while the write-out is happening. In that case the 'clean' +flag will be found to be clear when the write completes and so the +swap subsystem will simply forget that the swapout had been attempted, +and will possibly choose a different page to write out. + +If the swap device was on RAID1 (or RAID10), then the data is sent +from memory to a device twice (or more depending on the number of +devices in the array). 
Thus it is possible that the memory gets changed +between the times it is sent, so different data can be written to +the different devices in the array. This will be detected by +.I check +as a mismatch. However it does not reflect any corruption as the +block where this mismatch occurs is being treated by the swap system as +being empty, and the data will never be read from that block. + +It is conceivable for a similar situation to occur on non-swap files, +though it is less likely. + +Thus the +.I mismatch_cnt +value can not be interpreted very reliably on RAID1 or RAID10, +especially when the device is used for swap. + + +.SS BITMAP WRITE-INTENT LOGGING + +From Linux 2.6.13, +.I md +supports a bitmap based write-intent log. If configured, the bitmap +is used to record which blocks of the array may be out of sync. +Before any write request is honoured, md will make sure that the +corresponding bit in the log is set. After a period of time with no +writes to an area of the array, the corresponding bit will be cleared. + +This bitmap is used for two optimisations. + +Firstly, after an unclean shutdown, the resync process will consult +the bitmap and only resync those blocks that correspond to bits in the +bitmap that are set. This can dramatically reduce resync time. + +Secondly, when a drive fails and is removed from the array, md stops +clearing bits in the intent log. If that same drive is re-added to +the array, md will notice and will only recover the sections of the +drive that are covered by bits in the intent log that are set. This +can allow a device to be temporarily removed and reinserted without +causing an enormous recovery cost. + +The intent log can be stored in a file on a separate device, or it can +be stored near the superblocks of an array which has superblocks. + +It is possible to add an intent log to an active array, or remove an +intent log if one is present. + +In 2.6.13, intent bitmaps are only supported with RAID1. 
Other levels +with redundancy are supported from 2.6.15. + +.SS BAD BLOCK LIST + +From Linux 3.5 each device in an +.I md +array can store a list of known-bad-blocks. This list is 4K in size +and usually positioned at the end of the space between the superblock +and the data. + +When a block cannot be read and cannot be repaired by writing data +recovered from other devices, the address of the block is stored in +the bad block list. Similarly if an attempt to write a block fails, +the address will be recorded as a bad block. If attempting to record +the bad block fails, the whole device will be marked faulty. + +Attempting to read from a known bad block will cause a read error. +Attempting to write to a known bad block will be ignored if any write +errors have been reported by the device. If there have been no write +errors then the data will be written to the known bad block and if +that succeeds, the address will be removed from the list. + +This allows an array to fail more gracefully - a few blocks on +different devices can be faulty without taking the whole array out of +action. + +The list is particularly useful when recovering to a spare. If a few blocks +cannot be read from the other devices, the bulk of the recovery can +complete and those few bad blocks will be recorded in the bad block list. + +.SS RAID456 WRITE JOURNAL + +Due to the non-atomic nature of RAID write operations, interruption of +write operations (system crash, etc.) to a RAID456 array can lead to +inconsistent parity and data loss (the so-called RAID-5 write hole). + +To plug the write hole, from Linux 4.4 (to be confirmed), +.I md +supports a write-ahead journal for RAID456. When the array is created, +an additional journal device can be added to the array through the +.IR write-journal +option. The RAID write journal works similarly to file system journals. +Before writing to the data disks, md persists data AND parity of the +stripe to the journal device.
After crashes, md searches the journal +device for incomplete write operations, and replays them to the data +disks. + +When the journal device fails, the RAID array is forced to run in +read-only mode. + +.SS WRITE-BEHIND + +From Linux 2.6.14, +.I md +supports WRITE-BEHIND on RAID1 arrays. + +This allows certain devices in the array to be flagged as +.IR write-mostly . +MD will only read from such devices if there is no +other option. + +If a write-intent bitmap is also provided, write requests to +write-mostly devices will be treated as write-behind requests and md +will not wait for writes to those requests to complete before +reporting the write as complete to the filesystem. + +This allows for a RAID1 with WRITE-BEHIND to be used to mirror data +over a slow link to a remote computer (providing the link isn't too +slow). The extra latency of the remote link will not slow down normal +operations, but the remote system will still have a reasonably +up-to-date copy of all data. + +.SS RESTRIPING + +.IR Restriping , +also known as +.IR Reshaping , +is the process of re-arranging the data stored in each stripe into a +new layout. This might involve changing the number of devices in the +array (so the stripes are wider), changing the chunk size (so stripes +are deeper or shallower), or changing the arrangement of data and +parity (possibly changing the RAID level, e.g. 1 to 5 or 5 to 6). + +As of Linux 2.6.35, md can reshape a RAID4, RAID5, or RAID6 array to +have a different number of devices (more or fewer) and to have a +different layout or chunk size. It can also convert between these +different RAID levels. It can also convert between RAID0 and RAID10, +and between RAID0 and RAID4 or RAID5. +Other possibilities may follow in future kernels. + +During any restripe process there is a 'critical section' during which +live data is being overwritten on disk.
For the operation of +increasing the number of drives in a RAID5, this critical section +covers the first few stripes (the number being the product of the old +and new number of devices). After this critical section is passed, +data is only written to areas of the array which no longer hold live +data \(em the live data has already been located away. + +For a reshape which reduces the number of devices, the 'critical +section' is at the end of the reshape process. + +md is not able to ensure data preservation if there is a crash +(e.g. power failure) during the critical section. If md is asked to +start an array which failed during a critical section of restriping, +it will fail to start the array. + +To deal with this possibility, a user-space program must +.IP \(bu 4 +Disable writes to that section of the array (using the +.B sysfs +interface), +.IP \(bu 4 +take a copy of the data somewhere (i.e. make a backup), +.IP \(bu 4 +allow the process to continue and invalidate the backup and restore +write access once the critical section is passed, and +.IP \(bu 4 +provide for restoring the critical data before restarting the array +after a system crash. +.PP + +.B mdadm +versions from 2.4 do this for growing a RAID5 array. + +For operations that do not change the size of the array, like simply +increasing chunk size, or converting RAID5 to RAID6 with one extra +device, the entire process is the critical section. In this case, the +restripe will need to progress in stages, as a section is suspended, +backed up, restriped, and released. + +.SS SYSFS INTERFACE +Each block device appears as a directory in +.I sysfs +(which is usually mounted at +.BR /sys ). +For MD devices, this directory will contain a subdirectory called +.B md +which contains various files for providing access to information about +the array. + +This interface is documented more fully in the file +.B Documentation/md.txt +which is distributed with the kernel sources. 
That file should be +consulted for full documentation. The following are just a selection +of attribute files that are available. + +.TP +.B md/sync_speed_min +This value, if set, overrides the system-wide setting in +.B /proc/sys/dev/raid/speed_limit_min +for this array only. +Writing the value +.B "system" +to this file will cause the system-wide setting to have effect. + +.TP +.B md/sync_speed_max +This is the partner of +.B md/sync_speed_min +and overrides +.B /proc/sys/dev/raid/speed_limit_max +described below. + +.TP +.B md/sync_action +This can be used to monitor and control the resync/recovery process of +MD. +In particular, writing "check" here will cause the array to read all +data blocks and check that they are consistent (e.g. parity is correct, +or all mirror replicas are the same). Any discrepancies found are +.B NOT +corrected. + +A count of problems found will be stored in +.BR md/mismatch_cnt . + +Alternately, "repair" can be written which will cause the same check +to be performed, but any errors will be corrected. + +Finally, "idle" can be written to stop the check/repair process. + +.TP +.B md/stripe_cache_size +This is only available on RAID5 and RAID6. It records the size (in +pages per device) of the stripe cache which is used for synchronising +all write operations to the array and all read operations if the array +is degraded. The default is 256. Valid values are 17 to 32768. +Increasing this number can increase performance in some situations, at +some cost in system memory. Note, setting this value too high can +result in an "out of memory" condition for the system. + +memory_consumed = system_page_size * nr_disks * stripe_cache_size + +.TP +.B md/preread_bypass_threshold +This is only available on RAID5 and RAID6. This variable sets the +number of times MD will service a full-stripe-write before servicing a +stripe that requires some "prereading". For fairness this defaults to +1. Valid values are 0 to stripe_cache_size.
Setting this to 0 +maximizes sequential-write throughput at the cost of fairness to threads +doing small or random writes. + +.SS KERNEL PARAMETERS + +The md driver recognises several different kernel parameters. +.TP +.B raid=noautodetect +This will disable the normal detection of md arrays that happens at +boot time. If a drive is partitioned with MS-DOS style partitions, +then if any of the 4 main partitions has a partition type of 0xFD, +then that partition will normally be inspected to see if it is part of +an MD array, and if any full arrays are found, they are started. This +kernel parameter disables this behaviour. + +.TP +.B raid=partitionable +.TP +.B raid=part +These are available in 2.6 and later kernels only. They indicate that +autodetected MD arrays should be created as partitionable arrays, with +a different major device number to the original non-partitionable md +arrays. The device number is listed as +.I mdp +in +.IR /proc/devices . + +.TP +.B md_mod.start_ro=1 +.TP +.B /sys/module/md_mod/parameters/start_ro +This tells md to start all arrays in read-only mode. This is a soft +read-only that will automatically switch to read-write on the first +write request. However until that write request, nothing is written +to any device by md, and in particular, no resync or recovery +operation is started. + +.TP +.B md_mod.start_dirty_degraded=1 +.TP +.B /sys/module/md_mod/parameters/start_dirty_degraded +As mentioned above, md will not normally start a RAID4, RAID5, or +RAID6 that is both dirty and degraded as this situation can imply +hidden data loss. This can be awkward if the root filesystem is +affected. Using this module parameter allows such arrays to be started +at boot time. It should be understood that there is a real (though +small) risk of data corruption in this situation. + +.TP +.BI md= n , dev , dev ,... +.TP +.BI md=d n , dev , dev ,... +This tells the md driver to assemble +.B /dev/md n +from the listed devices.
It is only necessary to start the device +holding the root filesystem this way. Other arrays are best started +once the system is booted. + +In 2.6 kernels, the +.B d +immediately after the +.B = +indicates that a partitionable device (e.g. +.BR /dev/md/d0 ) +should be created rather than the original non-partitionable device. + +.TP +.BI md= n , l , c , i , dev... +This tells the md driver to assemble a legacy RAID0 or LINEAR array +without a superblock. +.I n +gives the md device number, +.I l +gives the level, 0 for RAID0 or \-1 for LINEAR, +.I c +gives the chunk size as a base-2 logarithm offset by twelve, so 0 +means 4K, 1 means 8K. +.I i +is ignored (legacy support). + +.SH FILES +.TP +.B /proc/mdstat +Contains information about the status of currently running array. +.TP +.B /proc/sys/dev/raid/speed_limit_min +A readable and writable file that reflects the current "goal" rebuild +speed for times when non-rebuild activity is current on an array. +The speed is in Kibibytes per second, and is a per-device rate, not a +per-array rate (which means that an array with more disks will shuffle +more data for a given speed). The default is 1000. + +.TP +.B /proc/sys/dev/raid/speed_limit_max +A readable and writable file that reflects the current "goal" rebuild +speed for times when no non-rebuild activity is current on an array. +The default is 200,000. + +.SH SEE ALSO +.BR mdadm (8), diff --git a/md5.h b/md5.h new file mode 100644 index 00000000..145970d4 --- /dev/null +++ b/md5.h @@ -0,0 +1,136 @@ +/* Declaration of functions and data types used for MD5 sum computing + library functions. + Copyright (C) 1995-1997,1999-2005 Free Software Foundation, Inc. + + NOTE: The canonical source of this file is maintained with the GNU C + Library. Bugs can be reported to bug-glibc@prep.ai.mit.edu. 
+ + This program is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published by the + Free Software Foundation; either version 2, or (at your option) any + later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software Foundation, + Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */ + +#ifndef _MD5_H +#define _MD5_H 1 + +#include + +#if HAVE_INTTYPES_H +# include +#endif +#if HAVE_STDINT_H || _LIBC || defined __UCLIBC__ +# include +#endif + +#ifndef __GNUC_PREREQ +# if defined __GNUC__ && defined __GNUC_MINOR__ +# define __GNUC_PREREQ(maj, min) \ + ((__GNUC__ << 16) + __GNUC_MINOR__ >= ((maj) << 16) + (min)) +# else +# define __GNUC_PREREQ(maj, min) 0 +# endif +#endif + +#ifndef __THROW +# if defined __cplusplus && __GNUC_PREREQ (2,8) +# define __THROW throw () +# else +# define __THROW +# endif +#endif + +#ifndef __attribute__ +# if ! __GNUC_PREREQ (2,8) || __STRICT_ANSI__ +# define __attribute__(x) +# endif +#endif + +#ifndef _LIBC +# define __md5_buffer md5_buffer +# define __md5_finish_ctx md5_finish_ctx +# define __md5_init_ctx md5_init_ctx +# define __md5_process_block md5_process_block +# define __md5_process_bytes md5_process_bytes +# define __md5_read_ctx md5_read_ctx +# define __md5_stream md5_stream +#endif + +typedef uint32_t md5_uint32; + +/* Structure to save state of computation between the single steps. 
*/ +struct md5_ctx +{ + md5_uint32 A; + md5_uint32 B; + md5_uint32 C; + md5_uint32 D; + + md5_uint32 total[2]; + md5_uint32 buflen; + char buffer[128] __attribute__ ((__aligned__ (__alignof__ (md5_uint32)))); +}; + +/* + * The following three functions build up the low level used in + * the functions `md5_stream' and `md5_buffer'. + */ + +/* Initialize structure containing state of computation. + (RFC 1321, 3.3: Step 3) */ +extern void __md5_init_ctx (struct md5_ctx *ctx) __THROW; + +/* Starting with the result of former calls of this function (or the + initialization function) update the context for the next LEN bytes + starting at BUFFER. + It is necessary that LEN is a multiple of 64!!! */ +extern void __md5_process_block (const void *buffer, size_t len, + struct md5_ctx *ctx) __THROW; + +/* Starting with the result of former calls of this function (or the + initialization function) update the context for the next LEN bytes + starting at BUFFER. + It is NOT required that LEN is a multiple of 64. */ +extern void __md5_process_bytes (const void *buffer, size_t len, + struct md5_ctx *ctx) __THROW; + +/* Process the remaining bytes in the buffer and put result from CTX + in first 16 bytes following RESBUF. The result is always in little + endian byte order, so that a byte-wise output yields to the wanted + ASCII representation of the message digest. + + IMPORTANT: On some systems it is required that RESBUF be correctly + aligned for a 32 bits value. */ +extern void *__md5_finish_ctx (struct md5_ctx *ctx, void *resbuf) __THROW; + + +/* Put result from CTX in first 16 bytes following RESBUF. The result is + always in little endian byte order, so that a byte-wise output yields + to the wanted ASCII representation of the message digest. + + IMPORTANT: On some systems it is required that RESBUF is correctly + aligned for a 32 bits value.
*/ +extern void *__md5_read_ctx (const struct md5_ctx *ctx, void *resbuf) __THROW; + + +/* Compute MD5 message digest for bytes read from STREAM. The + resulting message digest number will be written into the 16 bytes + beginning at RESBLOCK. */ +extern int __md5_stream (FILE *stream, void *resblock) __THROW; + +/* Compute MD5 message digest for LEN bytes beginning at BUFFER. The + result is always in little endian byte order, so that a byte-wise + output yields to the wanted ASCII representation of the message + digest. */ +extern void *__md5_buffer (const char *buffer, size_t len, + void *resblock) __THROW; + +#endif /* md5.h */ diff --git a/md_p.h b/md_p.h new file mode 100644 index 00000000..0d691fbc --- /dev/null +++ b/md_p.h @@ -0,0 +1,269 @@ +/* + md_p.h : physical layout of Linux RAID devices + Copyright (C) 1996-98 Ingo Molnar, Gadi Oxman + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + You should have received a copy of the GNU General Public License + (for example /usr/src/linux/COPYING); if not, write to the Free + Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. +*/ + +#ifndef _MD_P_H +#define _MD_P_H + +/* + * RAID superblock. + * + * The RAID superblock maintains some statistics on each RAID configuration. + * Each real device in the RAID set contains it near the end of the device. + * Some of the ideas are copied from the ext2fs implementation. + * + * We currently use 4096 bytes as follows: + * + * word offset function + * + * 0 - 31 Constant generic RAID device information. + * 32 - 63 Generic state information. + * 64 - 127 Personality specific information. + * 128 - 511 12 32-words descriptors of the disks in the raid set. + * 512 - 911 Reserved. + * 912 - 1023 Disk specific descriptor. 
*/ + +/* + * If x is the real device size in bytes, we return an apparent size of: + * + * y = (x & ~(MD_RESERVED_BYTES - 1)) - MD_RESERVED_BYTES + * + * and place the 4kB superblock at offset y. + */ +#define MD_RESERVED_BYTES (64 * 1024) +#define MD_RESERVED_SECTORS (MD_RESERVED_BYTES / 512) +#define MD_RESERVED_BLOCKS (MD_RESERVED_BYTES / BLOCK_SIZE) +/* NOTE(review): the two macros below expand x without parentheses; pass only simple operands. */ +#define MD_NEW_SIZE_SECTORS(x) ((x & ~(MD_RESERVED_SECTORS - 1)) - MD_RESERVED_SECTORS) +#define MD_NEW_SIZE_BLOCKS(x) ((x & ~(MD_RESERVED_BLOCKS - 1)) - MD_RESERVED_BLOCKS) + +#define MD_SB_BYTES 4096 +#define MD_SB_WORDS (MD_SB_BYTES / 4) +#define MD_SB_BLOCKS (MD_SB_BYTES / BLOCK_SIZE) +#define MD_SB_SECTORS (MD_SB_BYTES / 512) + +/* + * The following are counted in 32-bit words + */ +#define MD_SB_GENERIC_OFFSET 0 +#define MD_SB_PERSONALITY_OFFSET 64 +#define MD_SB_DISKS_OFFSET 128 +#define MD_SB_DESCRIPTOR_OFFSET 992 + +#define MD_SB_GENERIC_CONSTANT_WORDS 32 +#define MD_SB_GENERIC_STATE_WORDS 32 +#define MD_SB_GENERIC_WORDS (MD_SB_GENERIC_CONSTANT_WORDS + MD_SB_GENERIC_STATE_WORDS) +#define MD_SB_PERSONALITY_WORDS 64 +#define MD_SB_DESCRIPTOR_WORDS 32 +#define MD_SB_DISKS 27 +#define MD_SB_DISKS_WORDS (MD_SB_DISKS*MD_SB_DESCRIPTOR_WORDS) +#define MD_SB_RESERVED_WORDS (1024 - MD_SB_GENERIC_WORDS - MD_SB_PERSONALITY_WORDS - MD_SB_DISKS_WORDS - MD_SB_DESCRIPTOR_WORDS) +#define MD_SB_EQUAL_WORDS (MD_SB_GENERIC_WORDS + MD_SB_PERSONALITY_WORDS + MD_SB_DISKS_WORDS) + +/* + * Device "operational" state bits + */ +#define MD_DISK_FAULTY 0 /* disk is faulty / operational */ +#define MD_DISK_ACTIVE 1 /* disk is running but may not be in sync */ +#define MD_DISK_SYNC 2 /* disk is in sync with the raid set */ +#define MD_DISK_REMOVED 3 /* disk has been removed from the raid set */ +#define MD_DISK_CLUSTER_ADD 4 /* Initiate a disk add across the cluster + * For clustered environments only. + */ +#define MD_DISK_CANDIDATE 5 /* disk is added as spare (local) until confirmed + * For clustered environments only.
*/ + +#define MD_DISK_WRITEMOSTLY 9 /* disk is "write-mostly" in RAID1 config. + * read requests will only be sent here in + * dire need + */ + +#define MD_DISK_REPLACEMENT 17 +#define MD_DISK_JOURNAL 18 /* disk is used as the write journal in RAID-5/6 */ + +#define MD_DISK_ROLE_SPARE 0xffff +#define MD_DISK_ROLE_FAULTY 0xfffe +#define MD_DISK_ROLE_JOURNAL 0xfffd +#define MD_DISK_ROLE_MAX 0xff00 /* max value of regular disk role */ + +typedef struct mdp_device_descriptor_s { + __u32 number; /* 0 Device number in the entire set */ + __u32 major; /* 1 Device major number */ + __u32 minor; /* 2 Device minor number */ + __u32 raid_disk; /* 3 The role of the device in the raid set */ + __u32 state; /* 4 Operational state */ + __u32 reserved[MD_SB_DESCRIPTOR_WORDS - 5]; +} mdp_disk_t; + +#define MD_SB_MAGIC 0xa92b4efc + +/* + * Superblock state bits + */ +#define MD_SB_CLEAN 0 +#define MD_SB_ERRORS 1 +#define MD_SB_BBM_ERRORS 2 +#define MD_SB_BLOCK_CONTAINER_RESHAPE 3 /* block container wide reshapes */ +#define MD_SB_BLOCK_VOLUME 4 /* block activation of array, other arrays + * in container can be activated */ +#define MD_SB_CLUSTERED 5 /* MD is clustered */ +#define MD_SB_BITMAP_PRESENT 8 /* bitmap may be present nearby */ + +typedef struct mdp_superblock_s { + /* + * Constant generic information + */ + __u32 md_magic; /* 0 MD identifier */ + __u32 major_version; /* 1 major version to which the set conforms */ + __u32 minor_version; /* 2 minor version ... */ + __u32 patch_version; /* 3 patchlevel version ...
*/ + __u32 gvalid_words; /* 4 Number of used words in this section */ + __u32 set_uuid0; /* 5 Raid set identifier */ + __u32 ctime; /* 6 Creation time */ + __u32 level; /* 7 Raid personality */ + __u32 size; /* 8 Apparent size of each individual disk */ + __u32 nr_disks; /* 9 total disks in the raid set */ + __u32 raid_disks; /* 10 disks in a fully functional raid set */ + __u32 md_minor; /* 11 preferred MD minor device number */ + __u32 not_persistent; /* 12 does it have a persistent superblock */ + __u32 set_uuid1; /* 13 Raid set identifier #2 */ + __u32 set_uuid2; /* 14 Raid set identifier #3 */ + __u32 set_uuid3; /* 15 Raid set identifier #4 */ + __u32 gstate_creserved[MD_SB_GENERIC_CONSTANT_WORDS - 16]; + + /* + * Generic state information + */ + __u32 utime; /* 0 Superblock update time */ + __u32 state; /* 1 State bits (clean, ...) */ + __u32 active_disks; /* 2 Number of currently active disks */ + __u32 working_disks; /* 3 Number of working disks */ + __u32 failed_disks; /* 4 Number of failed disks */ + __u32 spare_disks; /* 5 Number of spare disks */ + __u32 sb_csum; /* 6 checksum of the whole superblock */ +#if __BYTE_ORDER == __BIG_ENDIAN + __u32 events_hi; /* 7 high-order of superblock update count */ + __u32 events_lo; /* 8 low-order of superblock update count */ + __u32 cp_events_hi; /* 9 high-order of checkpoint update count */ + __u32 cp_events_lo; /* 10 low-order of checkpoint update count */ +#else + __u32 events_lo; /* 7 low-order of superblock update count */ + __u32 events_hi; /* 8 high-order of superblock update count */ + __u32 cp_events_lo; /* 9 low-order of checkpoint update count */ + __u32 cp_events_hi; /* 10 high-order of checkpoint update count */ +#endif + __u32 recovery_cp; /* 11 recovery checkpoint sector count */ + /* These are only valid for minor_version > 90 */ + __u64 reshape_position; /* 12,13 next address in array-space for reshape */ + __u32 new_level; /* 14 new level we are reshaping to */ + __u32 delta_disks; /* 15 change in
number of raid_disks */ + __u32 new_layout; /* 16 new layout */ + __u32 new_chunk; /* 17 new chunk size (bytes) */ + __u32 gstate_sreserved[MD_SB_GENERIC_STATE_WORDS - 18]; + + /* + * Personality information + */ + __u32 layout; /* 0 the array's physical layout */ + __u32 chunk_size; /* 1 chunk size in bytes */ + __u32 root_pv; /* 2 LV root PV */ + __u32 root_block; /* 3 LV root block */ + __u32 pstate_reserved[MD_SB_PERSONALITY_WORDS - 4]; + + /* + * Disks information + */ + mdp_disk_t disks[MD_SB_DISKS]; + + /* + * Reserved + */ + __u32 reserved[MD_SB_RESERVED_WORDS]; + + /* + * Active descriptor + */ + mdp_disk_t this_disk; + +} mdp_super_t; + +#ifdef __TINYC__ +typedef unsigned long long __u64; +#endif +/* Return the 64-bit superblock update count: events_hi supplies the upper 32 bits and events_lo the lower 32 bits. */ +static inline __u64 md_event(mdp_super_t *sb) { + __u64 ev = sb->events_hi; + return (ev<<32)| sb->events_lo; +} + +struct r5l_payload_header { + __u16 type; + __u16 flags; +} __attribute__ ((__packed__)); + +enum r5l_payload_type { + R5LOG_PAYLOAD_DATA = 0, + R5LOG_PAYLOAD_PARITY = 1, + R5LOG_PAYLOAD_FLUSH = 2, +}; + +struct r5l_payload_data_parity { + struct r5l_payload_header header; + __u32 size; /* sector. data/parity size. each 4k has a checksum */ + __u64 location; /* sector. For data, it's raid sector. For + parity, it's stripe sector */ + __u32 checksum[]; +} __attribute__ ((__packed__)); + +enum r5l_payload_data_parity_flag { + R5LOG_PAYLOAD_FLAG_DISCARD = 1, /* payload is discard */ + /* + * RESHAPED/RESHAPING is only set when there is reshape activity.
Note, + * both data/parity of a stripe should have the same flag set + * + * RESHAPED: reshape is running, and this stripe finished reshape + * RESHAPING: reshape is running, and this stripe isn't reshaped + * */ + R5LOG_PAYLOAD_FLAG_RESHAPED = 2, + R5LOG_PAYLOAD_FLAG_RESHAPING = 3, +}; + +struct r5l_payload_flush { + struct r5l_payload_header header; + __u32 size; /* flush_stripes size, bytes */ + __u64 flush_stripes[]; +} __attribute__ ((__packed__)); + +enum r5l_payload_flush_flag { + R5LOG_PAYLOAD_FLAG_FLUSH_STRIPE = 1, /* data represents whole stripe */ +}; + +struct r5l_meta_block { + __u32 magic; + __u32 checksum; + __u8 version; + __u8 __zero_pading_1; + __u16 __zero_pading_2; + __u32 meta_size; /* whole size of the block */ + + __u64 seq; + __u64 position; /* sector, start from rdev->data_offset, current position */ + struct r5l_payload_header payloads[]; +} __attribute__ ((__packed__)); + +#define R5LOG_VERSION 0x1 +#define R5LOG_MAGIC 0x6433c509 + +#endif diff --git a/md_u.h b/md_u.h new file mode 100644 index 00000000..f570a346 --- /dev/null +++ b/md_u.h @@ -0,0 +1,123 @@ +/* + md_u.h : user <=> kernel API between Linux raidtools and RAID drivers + Copyright (C) 1998 Ingo Molnar + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + You should have received a copy of the GNU General Public License + (for example /usr/src/linux/COPYING); if not, write to the Free + Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+*/ + +#ifndef _MD_U_H +#define _MD_U_H + +/* ioctls */ + +/* status */ +#define RAID_VERSION _IOR (MD_MAJOR, 0x10, mdu_version_t) +#define GET_ARRAY_INFO _IOR (MD_MAJOR, 0x11, mdu_array_info_t) +#define GET_DISK_INFO _IOR (MD_MAJOR, 0x12, mdu_disk_info_t) +#define PRINT_RAID_DEBUG _IO (MD_MAJOR, 0x13) +#define RAID_AUTORUN _IO (MD_MAJOR, 0x14) +#define GET_BITMAP_FILE _IOR (MD_MAJOR, 0x15, mdu_bitmap_file_t) + +/* configuration */ +#define CLEAR_ARRAY _IO (MD_MAJOR, 0x20) +#define ADD_NEW_DISK _IOW (MD_MAJOR, 0x21, mdu_disk_info_t) +#define HOT_REMOVE_DISK _IO (MD_MAJOR, 0x22) +#define SET_ARRAY_INFO _IOW (MD_MAJOR, 0x23, mdu_array_info_t) +#define SET_DISK_INFO _IO (MD_MAJOR, 0x24) +#define WRITE_RAID_INFO _IO (MD_MAJOR, 0x25) +#define UNPROTECT_ARRAY _IO (MD_MAJOR, 0x26) +#define PROTECT_ARRAY _IO (MD_MAJOR, 0x27) +#define HOT_ADD_DISK _IO (MD_MAJOR, 0x28) +#define SET_DISK_FAULTY _IO (MD_MAJOR, 0x29) +#define SET_BITMAP_FILE _IOW (MD_MAJOR, 0x2b, int) + +/* usage */ +#define RUN_ARRAY _IOW (MD_MAJOR, 0x30, mdu_param_t) +#define START_ARRAY _IO (MD_MAJOR, 0x31) +#define STOP_ARRAY _IO (MD_MAJOR, 0x32) +#define STOP_ARRAY_RO _IO (MD_MAJOR, 0x33) +#define RESTART_ARRAY_RW _IO (MD_MAJOR, 0x34) +#define CLUSTERED_DISK_NACK _IO (MD_MAJOR, 0x35) + +typedef struct mdu_version_s { + int major; + int minor; + int patchlevel; +} mdu_version_t; + +typedef struct mdu_array_info_s { + /* + * Generic constant information + */ + int major_version; + int minor_version; + int patch_version; + unsigned int ctime; + int level; + int size; + int nr_disks; + int raid_disks; + int md_minor; + int not_persistent; + + /* + * Generic state information + */ + unsigned int utime; /* 0 Superblock update time */ + int state; /* 1 State bits (clean, ...) 
*/ + int active_disks; /* 2 Number of currently active disks */ + int working_disks; /* 3 Number of working disks */ + int failed_disks; /* 4 Number of failed disks */ + int spare_disks; /* 5 Number of spare disks */ + + /* + * Personality information + */ + int layout; /* 0 the array's physical layout */ + int chunk_size; /* 1 chunk size in bytes */ + +} mdu_array_info_t; + +typedef struct mdu_disk_info_s { + /* + * configuration/status of one particular disk + */ + int number; + int major; + int minor; + int raid_disk; + int state; + +} mdu_disk_info_t; + +typedef struct mdu_start_info_s { + /* + * configuration/status of one particular disk + */ + int major; + int minor; + int raid_disk; + int state; + +} mdu_start_info_t; + +typedef struct mdu_bitmap_file_s +{ + char pathname[4096]; +} mdu_bitmap_file_t; + +typedef struct mdu_param_s +{ + int personality; /* 1,2,3,4 */ + int chunk_size; /* in bytes */ + int max_fault; /* unused for now */ +} mdu_param_t; + +#endif diff --git a/mdadm.8.in b/mdadm.8.in new file mode 100644 index 00000000..7bae49d8 --- /dev/null +++ b/mdadm.8.in @@ -0,0 +1,3258 @@ +.\" -*- nroff -*- +.\" Copyright Neil Brown and others. +.\" This program is free software; you can redistribute it and/or modify +.\" it under the terms of the GNU General Public License as published by +.\" the Free Software Foundation; either version 2 of the License, or +.\" (at your option) any later version. +.\" See file COPYING in distribution for details. +.TH MDADM 8 "" v3.4 +.SH NAME +mdadm \- manage MD devices +.I aka +Linux Software RAID + +.SH SYNOPSIS + +.BI mdadm " [mode] [options] " + +.SH DESCRIPTION +RAID devices are virtual devices created from two or more +real block devices. This allows multiple devices (typically disk +drives or partitions thereof) to be combined into a single device to +hold (for example) a single filesystem. +Some RAID levels include redundancy and so can survive some degree of +device failure. 
+ +Linux Software RAID devices are implemented through the md (Multiple +Devices) device driver. + +Currently, Linux supports +.B LINEAR +md devices, +.B RAID0 +(striping), +.B RAID1 +(mirroring), +.BR RAID4 , +.BR RAID5 , +.BR RAID6 , +.BR RAID10 , +.BR MULTIPATH , +.BR FAULTY , +and +.BR CONTAINER . + +.B MULTIPATH +is not a Software RAID mechanism, but does involve +multiple devices: +each device is a path to one common physical storage device. +New installations should not use md/multipath as it is not well +supported and has no ongoing development. Use the Device Mapper based +multipath-tools instead. + +.B FAULTY +is also not true RAID, and it only involves one device. It +provides a layer over a true device that can be used to inject faults. + +.B CONTAINER +is different again. A +.B CONTAINER +is a collection of devices that are +managed as a set. This is similar to the set of devices connected to +a hardware RAID controller. The set of devices may contain a number +of different RAID arrays each utilising some (or all) of the blocks from a +number of the devices in the set. For example, two devices in a 5-device set +might form a RAID1 using the whole devices. The remaining three might +have a RAID5 over the first half of each device, and a RAID0 over the +second half. + +With a +.BR CONTAINER , +there is one set of metadata that describes all of +the arrays in the container. So when +.I mdadm +creates a +.B CONTAINER +device, the device just represents the metadata. Other normal arrays (RAID1 +etc) can be created inside the container. + +.SH MODES +mdadm has several major modes of operation: +.TP +.B Assemble +Assemble the components of a previously created +array into an active array. Components can be explicitly given +or can be searched for. +.I mdadm +checks that the components +do form a bona fide array, and can, on request, fiddle superblock +information so as to assemble a faulty array. 
+ +.TP +.B Build +Build an array that doesn't have per-device metadata (superblocks). For these +sorts of arrays, +.I mdadm +cannot differentiate between initial creation and subsequent assembly +of an array. It also cannot perform any checks that appropriate +components have been requested. Because of this, the +.B Build +mode should only be used together with a complete understanding of +what you are doing. + +.TP +.B Create +Create a new array with per-device metadata (superblocks). +Appropriate metadata is written to each device, and then the array +comprising those devices is activated. A 'resync' process is started +to make sure that the array is consistent (e.g. both sides of a mirror +contain the same data) but the content of the device is left otherwise +untouched. +The array can be used as soon as it has been created. There is no +need to wait for the initial resync to finish. + +.TP +.B "Follow or Monitor" +Monitor one or more md devices and act on any state changes. This is +only meaningful for RAID1, 4, 5, 6, 10 or multipath arrays, as +only these have interesting state. RAID0 or Linear never have +missing, spare, or failed drives, so there is nothing to monitor. + +.TP +.B "Grow" +Grow (or shrink) an array, or otherwise reshape it in some way. +Currently supported growth options include changing the active size +of component devices and changing the number of active devices in +Linear and RAID levels 0/1/4/5/6, +changing the RAID level between 0, 1, 5, and 6, and between 0 and 10, +changing the chunk size and layout for RAID 0,4,5,6,10 as well as adding or +removing a write-intent bitmap. + +.TP +.B "Incremental Assembly" +Add a single device to an appropriate array. If the addition of the +device makes the array runnable, the array will be started. +This provides a convenient interface to a +.I hot-plug +system. As each device is detected, +.I mdadm +has a chance to include it in some array as appropriate.
+Optionally, when the +.I \-\-fail +flag is passed in we will remove the device from any active array +instead of adding it. + +If a +.B CONTAINER +is passed to +.I mdadm +in this mode, then any arrays within that container will be assembled +and started. + +.TP +.B Manage +This is for doing things to specific components of an array such as +adding new spares and removing faulty devices. + +.TP +.B Misc +This is an 'everything else' mode that supports operations on active +arrays, operations on component devices such as erasing old superblocks, and +information gathering operations. +.\"This mode allows operations on independent devices such as examine MD +.\"superblocks, erasing old superblocks and stopping active arrays. + +.TP +.B Auto-detect +This mode does not act on a specific device or array, but rather it +requests the Linux Kernel to activate any auto-detected arrays. +.SH OPTIONS + +.SH Options for selecting a mode are: + +.TP +.BR \-A ", " \-\-assemble +Assemble a pre-existing array. + +.TP +.BR \-B ", " \-\-build +Build a legacy array without superblocks. + +.TP +.BR \-C ", " \-\-create +Create a new array. + +.TP +.BR \-F ", " \-\-follow ", " \-\-monitor +Select +.B Monitor +mode. + +.TP +.BR \-G ", " \-\-grow +Change the size or shape of an active array. + +.TP +.BR \-I ", " \-\-incremental +Add/remove a single device to/from an appropriate array, and possibly start the array. + +.TP +.B \-\-auto-detect +Request that the kernel starts any auto-detected arrays. This can only +work if +.I md +is compiled into the kernel \(em not if it is a module. +Arrays can be auto-detected by the kernel if all the components are in +primary MS-DOS partitions with partition type +.BR FD , +and all use v0.90 metadata. +In-kernel autodetect is not recommended for new installations. Using +.I mdadm +to detect and assemble arrays \(em possibly in an +.I initrd +\(em is substantially more flexible and should be preferred. 
+ +.P +If a device is given before any options, or if the first option is +one of +.BR \-\-add , +.BR \-\-re\-add , +.BR \-\-add\-spare , +.BR \-\-fail , +.BR \-\-remove , +or +.BR \-\-replace , +then the MANAGE mode is assumed. +Anything other than these will cause the +.B Misc +mode to be assumed. + +.SH Options that are not mode-specific are: + +.TP +.BR \-h ", " \-\-help +Display general help message or, after one of the above options, a +mode-specific help message. + +.TP +.B \-\-help\-options +Display more detailed help about command line parsing and some commonly +used options. + +.TP +.BR \-V ", " \-\-version +Print version information for mdadm. + +.TP +.BR \-v ", " \-\-verbose +Be more verbose about what is happening. This can be used twice to be +extra-verbose. +The extra verbosity currently only affects +.B \-\-detail \-\-scan +and +.BR "\-\-examine \-\-scan" . + +.TP +.BR \-q ", " \-\-quiet +Avoid printing purely informative messages. With this, +.I mdadm +will be silent unless there is something really important to report. + + +.TP +.BR \-f ", " \-\-force +Be more forceful about certain operations. See the various modes for +the exact meaning of this option in different contexts. + +.TP +.BR \-c ", " \-\-config= +Specify the config file or directory. Default is to use +.B /etc/mdadm/mdadm.conf +and +.BR /etc/mdadm/mdadm.conf.d , +or if those are missing then +.B /etc/mdadm.conf +and +.BR /etc/mdadm.conf.d . +If the config file given is +.B "partitions" +then nothing will be read, but +.I mdadm +will act as though the config file contained exactly +.br +.B " DEVICE partitions containers" +.br +and will read +.B /proc/partitions +to find a list of devices to scan, and +.B /proc/mdstat +to find a list of containers to examine. +If the word +.B "none" +is given for the config file, then +.I mdadm +will act as though the config file were empty. 
+ +If the name given is of a directory, then +.I mdadm +will collect all the files contained in the directory with a name ending +in +.BR .conf , +sort them lexically, and process all of those files as config files. + +.TP +.BR \-s ", " \-\-scan +Scan config file or +.B /proc/mdstat +for missing information. +In general, this option gives +.I mdadm +permission to get any missing information (like component devices, +array devices, array identities, and alert destination) from the +configuration file (see previous option); +one exception is MISC mode when using +.B \-\-detail +or +.B \-\-stop, +in which case +.B \-\-scan +says to get a list of array devices from +.BR /proc/mdstat . + +.TP +.BR \-e ", " \-\-metadata= +Declare the style of RAID metadata (superblock) to be used. The +default is {DEFAULT_METADATA} for +.BR \-\-create , +and to guess for other operations. +The default can be overridden by setting the +.B metadata +value for the +.B CREATE +keyword in +.BR mdadm.conf . + +Options are: +.RS +.ie '{DEFAULT_METADATA}'0.90' +.IP "0, 0.90, default" +.el +.IP "0, 0.90" +Use the original 0.90 format superblock. This format limits arrays to +28 component devices and limits component devices of levels 1 and +greater to 2 terabytes. It is also possible for there to be confusion +about whether the superblock applies to a whole device or just the +last partition, if that partition starts on a 64K boundary. +.ie '{DEFAULT_METADATA}'0.90' +.IP "1, 1.0, 1.1, 1.2" +.el +.IP "1, 1.0, 1.1, 1.2 default" +Use the new version-1 format superblock. This has fewer restrictions. +It can easily be moved between hosts with different endian-ness, and a +recovery operation can be checkpointed and restarted. The different +sub-versions store the superblock at different locations on the +device, either at the end (for 1.0), at the start (for 1.1) or 4K from +the start (for 1.2). "1" is equivalent to "1.2" (the commonly +preferred 1.x format). 
+'if '{DEFAULT_METADATA}'1.2' "default" is equivalent to "1.2". +.IP ddf +Use the "Industry Standard" DDF (Disk Data Format) format defined by +SNIA. +When creating a DDF array a +.B CONTAINER +will be created, and normal arrays can be created in that container. +.IP imsm +Use the Intel(R) Matrix Storage Manager metadata format. This creates a +.B CONTAINER +which is managed in a similar manner to DDF, and is supported by an +option-rom on some platforms: +.IP +.B http://www.intel.com/design/chipsets/matrixstorage_sb.htm +.PP +.RE + +.TP +.B \-\-homehost= +This will override any +.B HOMEHOST +setting in the config file and provides the identity of the host which +should be considered the home for any arrays. + +When creating an array, the +.B homehost +will be recorded in the metadata. For version-1 superblocks, it will +be prefixed to the array name. For version-0.90 superblocks, part of +the SHA1 hash of the hostname will be stored in the latter half of the +UUID. + +When reporting information about an array, any array which is tagged +for the given homehost will be reported as such. + +When using Auto-Assemble, only arrays tagged for the given homehost +will be allowed to use 'local' names (i.e. not ending in '_' followed +by a digit string). See below under +.BR "Auto Assembly" . + +The special name "\fBany\fP" can be used as a wild card. If an array +is created with +.B --homehost=any +then the name "\fBany\fP" will be stored in the array and it can be +assembled in the same way on any host. If an array is assembled with +this option, then the homehost recorded on the array will be ignored. + +.TP +.B \-\-prefer= +When +.I mdadm +needs to print the name for a device it normally finds the name in +.B /dev +which refers to the device and is shortest. When a path component is +given with +.B \-\-prefer +.I mdadm +will prefer a longer name if it contains that component.
For example +.B \-\-prefer=by-uuid +will prefer a name in a subdirectory of +.B /dev +called +.BR by-uuid . + +This functionality is currently only provided by +.B \-\-detail +and +.BR \-\-monitor . + +.TP +.B \-\-home\-cluster= +specifies the cluster name for the md device. The md device can be assembled +only on the cluster which matches the name specified. If this option is not +provided, mdadm tries to detect the cluster name automatically. + +.SH For create, build, or grow: + +.TP +.BR \-n ", " \-\-raid\-devices= +Specify the number of active devices in the array. This, plus the +number of spare devices (see below) must equal the number of +.I component-devices +(including "\fBmissing\fP" devices) +that are listed on the command line for +.BR \-\-create . +Setting a value of 1 is probably +a mistake and so requires that +.B \-\-force +be specified first. A value of 1 will then be allowed for linear, +multipath, RAID0 and RAID1. It is never allowed for RAID4, RAID5 or RAID6. +.br +This number can only be changed using +.B \-\-grow +for RAID1, RAID4, RAID5 and RAID6 arrays, and only on kernels which provide +the necessary support. + +.TP +.BR \-x ", " \-\-spare\-devices= +Specify the number of spare (eXtra) devices in the initial array. +Spares can also be added +and removed later. The number of component devices listed +on the command line must equal the number of RAID devices plus the +number of spare devices. + +.TP +.BR \-z ", " \-\-size= +Amount (in Kibibytes) of space to use from each drive in RAID levels 1/4/5/6. +This must be a multiple of the chunk size, and must leave about 128Kb +of space at the end of the drive for the RAID superblock. +If this is not specified +(as it normally is not) the smallest drive (or partition) sets the +size, though if there is a variance among the drives of greater than 1%, a warning is +issued. + +A suffix of 'M' or 'G' can be given to indicate Megabytes or +Gigabytes respectively. 
+ +Sometimes a replacement drive can be a little smaller than the +original drives though this should be minimised by IDEMA standards. +Such a replacement drive will be rejected by +.IR md . +To guard against this it can be useful to set the initial size +slightly smaller than the smaller device with the aim that it will +still be larger than any replacement. + +This value can be set with +.B \-\-grow +for RAID level 1/4/5/6 though +.B CONTAINER +based arrays such as those with IMSM metadata may not be able to +support this. +If the array was created with a size smaller than the currently +active drives, the extra space can be accessed using +.BR \-\-grow . +The size can be given as +.B max +which means to choose the largest size that fits on all current drives. + +Before reducing the size of the array (with +.BR "\-\-grow \-\-size=" ) +you should make sure that space isn't needed. If the device holds a +filesystem, you would need to resize the filesystem to use less space. + +After reducing the array size you should check that the data stored in +the device is still available. If the device holds a filesystem, then +an 'fsck' of the filesystem is a minimum requirement. If there are +problems the array can be made bigger again with no loss with another +.B "\-\-grow \-\-size=" +command. + +This value cannot be used when creating a +.B CONTAINER +such as with DDF and IMSM metadata, though it is perfectly valid when +creating an array inside a container. + +.TP +.BR \-Z ", " \-\-array\-size= +This is only meaningful with +.B \-\-grow +and its effect is not persistent: when the array is stopped and +restarted the default array size will be restored. + +Setting the array-size causes the array to appear smaller to programs +that access the data. This is particularly needed before reshaping an +array so that it will be smaller.
As the reshape is not reversible, +but setting the size with +.B \-\-array-size +is, it is required that the array size is reduced as appropriate +before the number of devices in the array is reduced. + +Before reducing the size of the array you should make sure that space +isn't needed. If the device holds a filesystem, you would need to +resize the filesystem to use less space. + +After reducing the array size you should check that the data stored in +the device is still available. If the device holds a filesystem, then +an 'fsck' of the filesystem is a minimum requirement. If there are +problems the array can be made bigger again with no loss with another +.B "\-\-grow \-\-array\-size=" +command. + +A suffix of 'M' or 'G' can be given to indicate Megabytes or +Gigabytes respectively. +A value of +.B max +restores the apparent size of the array to be whatever the real +amount of available space is. + +.TP +.BR \-c ", " \-\-chunk= +Specify chunk size in kibibytes. The default when creating an +array is 512KB. To ensure compatibility with earlier versions, the +default when building an array with no persistent metadata is 64KB. +This is only meaningful for RAID0, RAID4, RAID5, RAID6, and RAID10. + +RAID4, RAID5, RAID6, and RAID10 require the chunk size to be a power +of 2. In any case it must be a multiple of 4KB. + +A suffix of 'M' or 'G' can be given to indicate Megabytes or +Gigabytes respectively. + +.TP +.BR \-\-rounding= +Specify rounding factor for a Linear array. The size of each +component will be rounded down to a multiple of this size. +This is a synonym for +.B \-\-chunk +but highlights the different meaning for Linear as compared to other +RAID levels. The default is 64K if a kernel earlier than 2.6.16 is in +use, and is 0K (i.e. no rounding) in later kernels. + +.TP +.BR \-l ", " \-\-level= +Set RAID level.
When used with +.BR \-\-create , +options are: linear, raid0, 0, stripe, raid1, 1, mirror, raid4, 4, +raid5, 5, raid6, 6, raid10, 10, multipath, mp, faulty, container. +Obviously some of these are synonymous. + +When a +.B CONTAINER +metadata type is requested, only the +.B container +level is permitted, and it does not need to be explicitly given. + +When used with +.BR \-\-build , +only linear, stripe, raid0, 0, raid1, multipath, mp, and faulty are valid. + +Can be used with +.B \-\-grow +to change the RAID level in some cases. See LEVEL CHANGES below. + +.TP +.BR \-p ", " \-\-layout= +This option configures the fine details of data layout for RAID5, RAID6, +and RAID10 arrays, and controls the failure modes for +.IR faulty . + +The layout of the RAID5 parity block can be one of +.BR left\-asymmetric , +.BR left\-symmetric , +.BR right\-asymmetric , +.BR right\-symmetric , +.BR la ", " ra ", " ls ", " rs . +The default is +.BR left\-symmetric . + +It is also possible to cause RAID5 to use a RAID4-like layout by +choosing +.BR parity\-first , +or +.BR parity\-last . + +Finally for RAID5 there are DDF\-compatible layouts, +.BR ddf\-zero\-restart , +.BR ddf\-N\-restart , +and +.BR ddf\-N\-continue . + +These same layouts are available for RAID6. There are also 4 layouts +that will provide an intermediate stage for converting between RAID5 +and RAID6. These provide a layout which is identical to the +corresponding RAID5 layout on the first N\-1 devices, and has the 'Q' +syndrome (the second 'parity' block used by RAID6) on the last device. +These layouts are: +.BR left\-symmetric\-6 , +.BR right\-symmetric\-6 , +.BR left\-asymmetric\-6 , +.BR right\-asymmetric\-6 , +and +.BR parity\-first\-6 . 
+ +When setting the failure mode for level +.I faulty, +the options are: +.BR write\-transient ", " wt , +.BR read\-transient ", " rt , +.BR write\-persistent ", " wp , +.BR read\-persistent ", " rp , +.BR write\-all , +.BR read\-fixable ", " rf , +.BR clear ", " flush ", " none . + +Each failure mode can be followed by a number, which is used as a period +between fault generation. Without a number, the fault is generated +once on the first relevant request. With a number, the fault will be +generated after that many requests, and will continue to be generated +every time the period elapses. + +Multiple failure modes can be current simultaneously by using the +.B \-\-grow +option to set subsequent failure modes. + +"clear" or "none" will remove any pending or periodic failure modes, +and "flush" will clear any persistent faults. + +Finally, the layout options for RAID10 are one of 'n', 'o' or 'f' followed +by a small number. The default is 'n2'. The supported options are: + +.I 'n' +signals 'near' copies. Multiple copies of one data block are at +similar offsets in different devices. + +.I 'o' +signals 'offset' copies. Rather than the chunks being duplicated +within a stripe, whole stripes are duplicated but are rotated by one +device so duplicate blocks are on different devices. Thus subsequent +copies of a block are in the next drive, and are one chunk further +down. + +.I 'f' +signals 'far' copies +(multiple copies have very different offsets). +See md(4) for more detail about 'near', 'offset', and 'far'. + +The number is the number of copies of each datablock. 2 is normal, 3 +can be useful. This number can be at most equal to the number of +devices in the array. It does not need to divide evenly into that +number (e.g. it is perfectly legal to have an 'n2' layout for an array +with an odd number of devices). 
+ +When an array is converted between RAID5 and RAID6 an intermediate +RAID6 layout is used in which the second parity block (Q) is always on +the last device. To convert a RAID5 to RAID6 and leave it in this new +layout (which does not require re-striping) use +.BR \-\-layout=preserve . +This will try to avoid any restriping. + +The converse of this is +.B \-\-layout=normalise +which will change a non-standard RAID6 layout into a more standard +arrangement. + +.TP +.BR \-\-parity= +same as +.B \-\-layout +(thus explaining the p of +.BR \-p ). + +.TP +.BR \-b ", " \-\-bitmap= +Specify a file to store a write-intent bitmap in. The file should not +exist unless +.B \-\-force +is also given. The same file should be provided +when assembling the array. If the word +.B "internal" +is given, then the bitmap is stored with the metadata on the array, +and so is replicated on all devices. If the word +.B "none" +is given with +.B \-\-grow +mode, then any bitmap that is present is removed. If the word +.B "clustered" +is given, the array is created for a clustered environment. One bitmap +is created for each node as defined by the +.B \-\-nodes +parameter and these bitmaps are stored internally. + +To help catch typing errors, the filename must contain at least one +slash ('/') if it is a real file (not 'internal' or 'none'). + +Note: external bitmaps are only known to work on ext2 and ext3. +Storing bitmap files on other filesystems may result in serious problems. + +When creating an array on devices which are 100G or larger, +.I mdadm +automatically adds an internal bitmap as it will usually be +beneficial. This can be suppressed with +.B "\-\-bitmap=none". + +.TP +.BR \-\-bitmap\-chunk= +Set the chunksize of the bitmap. Each bit corresponds to that many +Kilobytes of storage. +When using a file based bitmap, the default is to use the smallest +size that is at least 4 and requires no more than 2^21 chunks.
+When using an +.B internal +bitmap, the chunksize defaults to 64Meg, or larger if necessary to +fit the bitmap into the available space. + +A suffix of 'M' or 'G' can be given to indicate Megabytes or +Gigabytes respectively. + +.TP +.BR \-W ", " \-\-write\-mostly +subsequent devices listed in a +.BR \-\-build , +.BR \-\-create , +or +.B \-\-add +command will be flagged as 'write-mostly'. This is valid for RAID1 +only and means that the 'md' driver will avoid reading from these +devices if at all possible. This can be useful if mirroring over a +slow link. + +.TP +.BR \-\-write\-behind= +Specify that write-behind mode should be enabled (valid for RAID1 +only). If an argument is specified, it will set the maximum number +of outstanding writes allowed. The default value is 256. +A write-intent bitmap is required in order to use write-behind +mode, and write-behind is only attempted on drives marked as +.IR write-mostly . + +.TP +.BR \-\-assume\-clean +Tell +.I mdadm +that the array pre-existed and is known to be clean. It can be useful +when trying to recover from a major failure as you can be sure that no +data will be affected unless you actually write to the array. It can +also be used when creating a RAID1 or RAID10 if you want to avoid the +initial resync, however this practice \(em while normally safe \(em is not +recommended. Use this only if you really know what you are doing. +.IP +When the devices that will be part of a new array were filled +with zeros before creation the operator knows the array is +actually clean. If that is the case, such as after running +badblocks, this argument can be used to tell mdadm the +facts the operator knows. +.IP +When an array is resized to a larger size with +.B "\-\-grow \-\-size=" +the new space is normally resynced in the same way that the whole +array is resynced at creation. From Linux version 3.0, +.B \-\-assume\-clean +can be used with that command to avoid the automatic resync.
+ +.TP +.BR \-\-backup\-file= +This is needed when +.B \-\-grow +is used to increase the number of raid-devices in a RAID5 or RAID6 if +there are no spare devices available, or to shrink, change RAID level +or layout. See the GROW MODE section below on RAID\-DEVICES CHANGES. +The file must be stored on a separate device, not on the RAID array +being reshaped. + +.TP +.B \-\-data\-offset= +Arrays with 1.x metadata can leave a gap between the start of the +device and the start of array data. This gap can be used for various +metadata. The start of data is known as the +.IR data\-offset . +Normally an appropriate data offset is computed automatically. +However it can be useful to set it explicitly such as when re-creating +an array which was originally created using a different version of +.I mdadm +which computed a different offset. + +Setting the offset explicitly over-rides the default. The value given +is in Kilobytes unless an 'M' or 'G' suffix is given. + +Since Linux 3.4, +.B \-\-data\-offset +can also be used with +.B --grow +for some RAID levels (initially on RAID10). This allows the +data\-offset to be changed as part of the reshape process. When the +data offset is changed, no backup file is required as the difference +in offsets is used to provide the same functionality. + +When the new offset is earlier than the old offset, the number of +devices in the array cannot shrink. When it is after the old offset, +the number of devices in the array cannot increase. + +When creating an array, +.B \-\-data\-offset +can be specified as +.BR variable . +In this case each member device is expected to have an offset appended +to the name, separated by a colon. This makes it possible to recreate +exactly an array which has varying data offsets (as can happen when +different versions of +.I mdadm +are used to add different devices). + +.TP +.BR \-\-continue +This option is complementary to the +.B \-\-freeze-reshape +option for assembly.
It is needed when +.B \-\-grow +operation is interrupted and it is not restarted automatically due to +.B \-\-freeze-reshape +usage during array assembly. This option is used together with +.BR \-G +, ( +.BR \-\-grow +) command and device for a pending reshape to be continued. +All parameters required for reshape continuation will be read from array metadata. +If initial +.BR \-\-grow +command had required +.BR \-\-backup\-file= +option to be set, continuation option will require to have exactly the same +backup file given as well. +.IP +Any other parameter passed together with +.BR \-\-continue +option will be ignored. + +.TP +.BR \-N ", " \-\-name= +Set a +.B name +for the array. This is currently only effective when creating an +array with a version-1 superblock, or an array in a DDF container. +The name is a simple textual string that can be used to identify array +components when assembling. If name is needed but not specified, it +is taken from the basename of the device that is being created. +e.g. when creating +.I /dev/md/home +the +.B name +will default to +.IR home . + +.TP +.BR \-R ", " \-\-run +Insist that +.I mdadm +run the array, even if some of the components +appear to be active in another array or filesystem. Normally +.I mdadm +will ask for confirmation before including such components in an +array. This option causes that question to be suppressed. + +.TP +.BR \-f ", " \-\-force +Insist that +.I mdadm +accept the geometry and layout specified without question. Normally +.I mdadm +will not allow creation of an array with only one device, and will try +to create a RAID5 array with one missing drive (as this makes the +initial resync work faster). With +.BR \-\-force , +.I mdadm +will not try to be so clever. + +.TP +.BR \-o ", " \-\-readonly +Start the array +.B read only +rather than read-write as normal. No writes will be allowed to the +array, and no resync, recovery, or reshape will be started. 
+ +.TP +.BR \-a ", " "\-\-auto{=yes,md,mdp,part,p}{NN}" +Instruct mdadm how to create the device file if needed, possibly allocating +an unused minor number. "md" causes a non-partitionable array +to be used (though since Linux 2.6.28, these array devices are in fact +partitionable). "mdp", "part" or "p" causes a partitionable array (2.6 and +later) to be used. "yes" requires the named md device to have +a 'standard' format, and the type and minor number will be determined +from this. With mdadm 3.0, device creation is normally left up to +.I udev +so this option is unlikely to be needed. +See DEVICE NAMES below. + +The argument can also come immediately after +"\-a". e.g. "\-ap". + +If +.B \-\-auto +is not given on the command line or in the config file, then +the default will be +.BR \-\-auto=yes . + +If +.B \-\-scan +is also given, then any +.I auto= +entries in the config file will override the +.B \-\-auto +instruction given on the command line. + +For partitionable arrays, +.I mdadm +will create the device file for the whole array and for the first 4 +partitions. A different number of partitions can be specified at the +end of this option (e.g. +.BR \-\-auto=p7 ). +If the device name ends with a digit, the partition names add a 'p', +and a number, e.g. +.IR /dev/md/home1p3 . +If there is no trailing digit, then the partition names just have a +number added, e.g. +.IR /dev/md/scratch3 . + +If the md device name is in a 'standard' format as described in DEVICE +NAMES, then it will be created, if necessary, with the appropriate +device number based on that name. If the device name is not in one of these +formats, then an unused device number will be allocated. The device +number will be considered unused if there is no active array for that +number, and there is no entry in /dev for that number and with a +non-standard name. Names that are not in 'standard' format are only +allowed in "/dev/md/". + +This is meaningful with +.B \-\-create +or +.BR \-\-build .
+ +.TP +.BR \-a ", " "\-\-add" +This option can be used in Grow mode in two cases. + +If the target array is a Linear array, then +.B \-\-add +can be used to add one or more devices to the array. They +are simply catenated on to the end of the array. Once added, the +devices cannot be removed. + +If the +.B \-\-raid\-disks +option is being used to increase the number of devices in an array, +then +.B \-\-add +can be used to add some extra devices to be included in the array. +In most cases this is not needed as the extra devices can be added as +spares first, and then the number of raid-disks can be changed. +However for RAID0, it is not possible to add spares. So to increase +the number of devices in a RAID0, it is necessary to set the new +number of devices, and to add the new devices, in the same command. + +.TP +.BR \-\-nodes +Only works when the array is for a clustered environment. It specifies +the maximum number of nodes in the cluster that will use this device +simultaneously. If not specified, this defaults to 4. + +.TP +.BR \-\-write-journal +Specify journal device for the RAID-4/5/6 array. The journal device +should be an SSD with reasonable lifetime. + + +.SH For assemble: + +.TP +.BR \-u ", " \-\-uuid= +uuid of array to assemble. Devices which don't have this uuid are +excluded. + +.TP +.BR \-m ", " \-\-super\-minor= +Minor number of device that array was created for. Devices which +don't have this minor number are excluded. If you create an array as +/dev/md1, then all superblocks will contain the minor number 1, even if +the array is later assembled as /dev/md2. + +Giving the literal word "dev" for +.B \-\-super\-minor +will cause +.I mdadm +to use the minor number of the md device that is being assembled. +e.g. when assembling +.BR /dev/md0 , +.B \-\-super\-minor=dev +will look for super blocks with a minor number of 0. + +.B \-\-super\-minor +is only relevant for v0.90 metadata, and should not normally be used. +Using +.B \-\-uuid +is much safer.
+ +.TP +.BR \-N ", " \-\-name= +Specify the name of the array to assemble. This must be the name +that was specified when creating the array. It must either match +the name stored in the superblock exactly, or it must match +with the current +.I homehost +prefixed to the start of the given name. + +.TP +.BR \-f ", " \-\-force +Assemble the array even if the metadata on some devices appears to be +out-of-date. If +.I mdadm +cannot find enough working devices to start the array, but can find +some devices that are recorded as having failed, then it will mark +those devices as working so that the array can be started. +An array which requires +.B \-\-force +to be started may contain data corruption. Use it carefully. + +.TP +.BR \-R ", " \-\-run +Attempt to start the array even if fewer drives were given than were +present last time the array was active. Normally if not all the +expected drives are found and +.B \-\-scan +is not used, then the array will be assembled but not started. +With +.B \-\-run +an attempt will be made to start it anyway. + +.TP +.B \-\-no\-degraded +This is the reverse of +.B \-\-run +in that it inhibits the startup of an array unless all expected drives +are present. This is only needed with +.B \-\-scan, +and can be used if the physical connections to devices are +not as reliable as you would like. + +.TP +.BR \-a ", " "\-\-auto{=no,yes,md,mdp,part}" +See this option under Create and Build options. + +.TP +.BR \-b ", " \-\-bitmap= +Specify the bitmap file that was given when the array was created. If +an array has an +.B internal +bitmap, there is no need to specify this when assembling the array. + +.TP +.BR \-\-backup\-file= +If +.B \-\-backup\-file +was used while reshaping an array (e.g. changing number of devices or +chunk size) and the system crashed during the critical section, then the same +.B \-\-backup\-file +must be presented to +.B \-\-assemble +to allow possibly corrupted data to be restored, and the reshape +to be completed.
+ +.TP +.BR \-\-invalid\-backup +If the file needed for the above option is not available for any +reason an empty file can be given together with this option to +indicate that the backup file is invalid. In this case the data that +was being rearranged at the time of the crash could be irrecoverably +lost, but the rest of the array may still be recoverable. This option +should only be used as a last resort if there is no way to recover the +backup file. + + +.TP +.BR \-U ", " \-\-update= +Update the superblock on each device while assembling the array. The +argument given to this flag can be one of +.BR sparc2.2 , +.BR summaries , +.BR uuid , +.BR name , +.BR nodes , +.BR homehost , +.BR home-cluster , +.BR resync , +.BR byteorder , +.BR devicesize , +.BR no\-bitmap , +.BR bbl , +.BR no\-bbl , +.BR metadata , +or +.BR super\-minor . + +The +.B sparc2.2 +option will adjust the superblock of an array that was created on a Sparc +machine running a patched 2.2 Linux kernel. This kernel got the +alignment of part of the superblock wrong. You can use the +.B "\-\-examine \-\-sparc2.2" +option to +.I mdadm +to see what effect this would have. + +The +.B super\-minor +option will update the +.B "preferred minor" +field on each superblock to match the minor number of the array being +assembled. +This can be useful if +.B \-\-examine +reports a different "Preferred Minor" to +.BR \-\-detail . +In some cases this update will be performed automatically +by the kernel driver. In particular the update happens automatically +at the first write to an array with redundancy (RAID level 1 or +greater) on a 2.6 (or later) kernel. + +The +.B uuid +option will change the uuid of the array. If a UUID is given with the +.B \-\-uuid +option that UUID will be used as a new UUID and will +.B NOT +be used to help identify the devices in the array. +If no +.B \-\-uuid +is given, a random UUID is chosen.
+ +The +.B name +option will change the +.I name +of the array as stored in the superblock. This is only supported for +version-1 superblocks. + +The +.B nodes +option will change the +.I nodes +of the array as stored in the bitmap superblock. This option only +works for a clustered environment. + +The +.B homehost +option will change the +.I homehost +as recorded in the superblock. For version-0 superblocks, this is the +same as updating the UUID. +For version-1 superblocks, this involves updating the name. + +The +.B home\-cluster +option will change the cluster name as recorded in the superblock and +bitmap. This option only works for clustered environment. + +The +.B resync +option will cause the array to be marked +.I dirty +meaning that any redundancy in the array (e.g. parity for RAID5, +copies for RAID1) may be incorrect. This will cause the RAID system +to perform a "resync" pass to make sure that all redundant information +is correct. + +The +.B byteorder +option allows arrays to be moved between machines with different +byte-order. +When assembling such an array for the first time after a move, giving +.B "\-\-update=byteorder" +will cause +.I mdadm +to expect superblocks to have their byteorder reversed, and will +correct that order before assembling the array. This is only valid +with original (Version 0.90) superblocks. + +The +.B summaries +option will correct the summaries in the superblock. That is the +counts of total, working, active, failed, and spare devices. + +The +.B devicesize +option will rarely be of use. It applies to version 1.1 and 1.2 metadata +only (where the metadata is at the start of the device) and is only +useful when the component device has changed size (typically become +larger). The version 1 metadata records the amount of the device that +can be used to store data, so if a device in a version 1.1 or 1.2 +array becomes larger, the metadata will still be visible, but the +extra space will not. 
In this case it might be useful to assemble the +array with +.BR \-\-update=devicesize . +This will cause +.I mdadm +to determine the maximum usable amount of space on each device and +update the relevant field in the metadata. + +The +.B metadata +option only works on v0.90 metadata arrays and will convert them to +v1.0 metadata. The array must not be dirty (i.e. it must not need a +sync) and it must not have a write-intent bitmap. + +The old metadata will remain on the devices, but will appear older +than the new metadata and so will usually be ignored. The old metadata +(or indeed the new metadata) can be removed by giving the appropriate +.B \-\-metadata= +option to +.BR \-\-zero\-superblock . + +The +.B no\-bitmap +option can be used when an array has an internal bitmap which is +corrupt in some way so that assembling the array normally fails. It +will cause any internal bitmap to be ignored. + +The +.B bbl +option will reserve space in each device for a bad block list. This +will be 4K in size and positioned near the end of any free space +between the superblock and the data. + +The +.B no\-bbl +option will cause any reservation of space for a bad block list to be +removed. If the bad block list contains entries, this will fail, as +removing the list could cause data corruption. + +.TP +.BR \-\-freeze\-reshape +Option is intended to be used in start-up scripts during initrd boot phase. +When array under reshape is assembled during initrd phase, this option +stops reshape after reshape critical section is being restored. This happens +before file system pivot operation and avoids loss of file system context. +Losing file system context would cause reshape to be broken. + +Reshape can be continued later using the +.B \-\-continue +option for the grow command. 
+ +.SH For Manage mode: + +.TP +.BR \-t ", " \-\-test +Unless a more serious error occurred, +.I mdadm +will exit with a status of 2 if no changes were made to the array and +0 if at least one change was made. +This can be useful when an indirect specifier such as +.BR missing , +.B detached +or +.B faulty +is used in requesting an operation on the array. +.B \-\-test +will report failure if these specifiers didn't find any match. + +.TP +.BR \-a ", " \-\-add +hot-add listed devices. +If a device appears to have recently been part of the array +(possibly it failed or was removed) the device is re\-added as described +in the next point. +If that fails or the device was never part of the array, the device is +added as a hot-spare. +If the array is degraded, it will immediately start to rebuild data +onto that spare. + +Note that this and the following options are only meaningful on arrays +with redundancy. They don't apply to RAID0 or Linear. + +.TP +.BR \-\-re\-add +re\-add a device that was previously removed from an array. +If the metadata on the device reports that it is a member of the +array, and the slot that it used is still vacant, then the device will +be added back to the array in the same position. This will normally +cause the data for that device to be recovered. However based on the +event count on the device, the recovery may only require sections that +are flagged by a write-intent bitmap to be recovered or may not require +any recovery at all. + +When used on an array that has no metadata (i.e. it was built with +.BR \-\-build ) +it will be assumed that bitmap-based recovery is enough to make the +device fully consistent with the array. + +When used with v1.x metadata, +.B \-\-re\-add +can be accompanied by +.BR \-\-update=devicesize , +.BR \-\-update=bbl ", or" +.BR \-\-update=no\-bbl . +See the description of these options when used in Assemble mode for an +explanation of their use.
+ +If the device name given is +.B missing +then +.I mdadm +will try to find any device that looks like it should be +part of the array but isn't and will try to re\-add all such devices. + +If the device name given is +.B faulty +then +.I mdadm +will find all devices in the array that are marked +.BR faulty , +remove them and attempt to immediately re\-add them. This can be +useful if you are certain that the reason for failure has been +resolved. + +.TP +.B \-\-add\-spare +Add a device as a spare. This is similar to +.B \-\-add +except that it does not attempt +.B \-\-re\-add +first. The device will be added as a spare even if it looks like it +could be a recent member of the array. + +.TP +.BR \-r ", " \-\-remove +remove listed devices. They must not be active, i.e. they should +be failed or spare devices. + +As well as the name of a device file +(e.g. +.BR /dev/sda1 ) +the words +.BR failed , +.B detached +and names like +.B set\-A +can be given to +.BR \-\-remove . +The first causes all failed devices to be removed. The second causes +any device which is no longer connected to the system (i.e. an 'open' +returns +.BR ENXIO ) +to be removed. +The third will remove a set as described below under +.BR \-\-fail . + +.TP +.BR \-f ", " \-\-fail +Mark listed devices as faulty. +As well as the name of a device file, the word +.B detached +or a set name like +.B set\-A +can be given. The former will cause any device that has been detached from +the system to be marked as failed. It can then be removed. + +For RAID10 arrays where the number of copies evenly divides the number +of devices, the devices can be conceptually divided into sets where +each set contains a single complete copy of the data on the array. +Sometimes a RAID10 array will be configured so that these sets are on +separate controllers. In this case all the devices in one set can be +failed by giving a name like +.B set\-A +or +.B set\-B +to +.BR \-\-fail .
+The appropriate set names are reported by +.BR \-\-detail . + +.TP +.BR \-\-set\-faulty +same as +.BR \-\-fail . + +.TP +.B \-\-replace +Mark listed devices as requiring replacement. As soon as a spare is +available, it will be rebuilt and will replace the marked device. +This is similar to marking a device as faulty, but the device remains +in service during the recovery process to increase resilience against +multiple failures. When the replacement process finishes, the +replaced device will be marked as faulty. + +.TP +.B \-\-with +This can follow a list of +.B \-\-replace +devices. The devices listed after +.B \-\-with +will be preferentially used to replace the devices listed after +.BR \-\-replace . +These devices must already be spare devices in the array. + +.TP +.BR \-\-write\-mostly +Subsequent devices that are added or re\-added will have the 'write-mostly' +flag set. This is only valid for RAID1 and means that the 'md' driver +will avoid reading from these devices if possible. +.TP +.BR \-\-readwrite +Subsequent devices that are added or re\-added will have the 'write-mostly' +flag cleared. +.TP +.BR \-\-cluster\-confirm +Confirm the existence of the device. This is issued in response to an \-\-add +request by a node in a cluster. When a node adds a device it sends a message +to all nodes in the cluster to look for a device with a UUID. This translates +to a udev notification with the UUID of the device to be added and the slot +number. The receiving node must acknowledge this message +with \-\-cluster\-confirm. Valid arguments are <slot>:<devicename> in case +the device is found or <slot>:missing in case the device is not found. + +.P +Each of these options requires that the first device listed is the array +to be acted upon, and the remainder are component devices to be added, +removed, marked as faulty, etc. Several different operations can be +specified for different devices, e.g.
+.in +5 +mdadm /dev/md0 \-\-add /dev/sda1 \-\-fail /dev/sdb1 \-\-remove /dev/sdb1 +.in -5 +Each operation applies to all devices listed until the next +operation. + +If an array is using a write-intent bitmap, then devices which have +been removed can be re\-added in a way that avoids a full +reconstruction but instead just updates the blocks that have changed +since the device was removed. For arrays with persistent metadata +(superblocks) this is done automatically. For arrays created with +.B \-\-build +mdadm needs to be told that this device was removed recently with +.BR \-\-re\-add . + +Devices can only be removed from an array if they are not in active +use, i.e. they must be spares or failed devices. To remove an active +device, it must first be marked as +.BR faulty . + +.SH For Misc mode: + +.TP +.BR \-Q ", " \-\-query +Examine a device to see +(1) if it is an md device and (2) if it is a component of an md +array. +Information about what is discovered is presented. + +.TP +.BR \-D ", " \-\-detail +Print details of one or more md devices. + +.TP +.BR \-\-detail\-platform +Print details of the platform's RAID capabilities (firmware / hardware +topology) for a given metadata format. If used without argument, mdadm +will scan all controllers looking for their capabilities. Otherwise, mdadm +will only look at the controller specified by the argument in the form of an +absolute filepath or a link, e.g. +.IR /sys/devices/pci0000:00/0000:00:1f.2 . + +.TP +.BR \-Y ", " \-\-export +When used with +.BR \-\-detail , +.BR \-\-detail\-platform , +.BR \-\-examine , +or +.B \-\-incremental +output will be formatted as +.B key=value +pairs for easy import into the environment. + +With +.B \-\-incremental +the value +.B MD_STARTED +indicates whether an array was started +.RB ( yes ) +or not, which may include a reason +.RB ( unsafe ", " nothing ", " no ).
+Also the value +.B MD_FOREIGN +indicates if the array is expected on this host +.RB ( no ), +or seems to be from elsewhere +.RB ( yes ). + +.TP +.BR \-E ", " \-\-examine +Print contents of the metadata stored on the named device(s). +Note the contrast between +.B \-\-examine +and +.BR \-\-detail . +.B \-\-examine +applies to devices which are components of an array, while +.B \-\-detail +applies to a whole array which is currently active. +.TP +.B \-\-sparc2.2 +If an array was created on a SPARC machine with a 2.2 Linux kernel +patched with RAID support, the superblock will have been created +incorrectly, or at least incompatibly with 2.4 and later kernels. +Using the +.B \-\-sparc2.2 +flag with +.B \-\-examine +will fix the superblock before displaying it. If this appears to do +the right thing, then the array can be successfully assembled using +.BR "\-\-assemble \-\-update=sparc2.2" . + +.TP +.BR \-X ", " \-\-examine\-bitmap +Report information about a bitmap file. +The argument is either an external bitmap file or an array component +in case of an internal bitmap. Note that running this on an array +device (e.g. +.BR /dev/md0 ) +does not report the bitmap for that array. + +.TP +.B \-\-examine\-badblocks +List the bad-blocks recorded for the device, if a bad-blocks list has +been configured. Currently only +.B 1.x +metadata supports bad-blocks lists. + +.TP +.BI \-\-dump= directory +.TP +.BI \-\-restore= directory +Save metadata from listed devices, or restore metadata to listed devices. + +.TP +.BR \-R ", " \-\-run +start a partially assembled array. If +.B \-\-assemble +did not find enough devices to fully start the array, it might leave +it partially assembled. If you wish, you can then use +.B \-\-run +to start the array in degraded mode. + +.TP +.BR \-S ", " \-\-stop +deactivate array, releasing all resources. + +.TP +.BR \-o ", " \-\-readonly +mark array as readonly. + +.TP +.BR \-w ", " \-\-readwrite +mark array as readwrite.
+ +.TP +.B \-\-zero\-superblock +If the device contains a valid md superblock, the block is +overwritten with zeros. With +.B \-\-force +the block where the superblock would be is overwritten even if it +doesn't appear to be valid. + +.TP +.B \-\-kill\-subarray= +If the device is a container and the argument to \-\-kill\-subarray +specifies an inactive subarray in the container, then the subarray is +deleted. Deleting all subarrays will leave an 'empty-container' or +spare superblock on the drives. See +.B \-\-zero\-superblock +for completely +removing a superblock. Note that some formats depend on the subarray +index for generating a UUID, this command will fail if it would change +the UUID of an active subarray. + +.TP +.B \-\-update\-subarray= +If the device is a container and the argument to \-\-update\-subarray +specifies a subarray in the container, then attempt to update the given +superblock field in the subarray. See below in +.B MISC MODE +for details. + +.TP +.BR \-t ", " \-\-test +When used with +.BR \-\-detail , +the exit status of +.I mdadm +is set to reflect the status of the device. See below in +.B MISC MODE +for details. + +.TP +.BR \-W ", " \-\-wait +For each md device given, wait for any resync, recovery, or reshape +activity to finish before returning. +.I mdadm +will return with success if it actually waited for every device +listed, otherwise it will return failure. + +.TP +.BR \-\-wait\-clean +For each md device given, or each device in /proc/mdstat if +.B \-\-scan +is given, arrange for the array to be marked clean as soon as possible. +.I mdadm +will return with success if the array uses external metadata and we +successfully waited. For native arrays this returns immediately as the +kernel handles dirty-clean transitions at shutdown. No action is taken +if safe-mode handling is disabled. + +.TP +.B \-\-action= +Set the "sync_action" for all md devices given to one of +.BR idle , +.BR frozen , +.BR check , +.BR repair . 
+Setting to +.B idle +will abort any currently running action though some actions will +automatically restart. +Setting to +.B frozen +will abort any current action and ensure no other action starts +automatically. + +Details of +.B check +and +.B repair +can be found in +.IR md (4) +under +.BR "SCRUBBING AND MISMATCHES" . + +.SH For Incremental Assembly mode: +.TP +.BR \-\-rebuild\-map ", " \-r +Rebuild the map file +.RB ( {MAP_PATH} ) +that +.I mdadm +uses to help track which arrays are currently being assembled. + +.TP +.BR \-\-run ", " \-R +Run any array assembled as soon as a minimal number of devices are +available, rather than waiting until all expected devices are present. + +.TP +.BR \-\-scan ", " \-s +Only meaningful with +.B \-R +this will scan the +.B map +file for arrays that are being incrementally assembled and will try to +start any that are not already started. If any such array is listed +in +.B mdadm.conf +as requiring an external bitmap, that bitmap will be attached first. + +.TP +.BR \-\-fail ", " \-f +This allows the hot-plug system to remove devices that have fully disappeared +from the kernel. It will first fail and then remove the device from any +array it belongs to. +The device name given should be a kernel device name such as "sda", +not a name in +.IR /dev . + +.TP +.BR \-\-path= +Only used with \-\-fail. The 'path' given will be recorded so that if +a new device appears at the same location it can be automatically +added to the same array. This allows the failed device to be +automatically replaced by a new device without metadata if it appears +at the specified path. This option is normally only set by a +.I udev +script. + +.SH For Monitor mode: +.TP +.BR \-m ", " \-\-mail +Give a mail address to send alerts to. + +.TP +.BR \-p ", " \-\-program ", " \-\-alert +Give a program to be run whenever an event is detected. + +.TP +.BR \-y ", " \-\-syslog +Cause all events to be reported through 'syslog'.
The messages have +facility of 'daemon' and varying priorities. + +.TP +.BR \-d ", " \-\-delay +Give a delay in seconds. +.I mdadm +polls the md arrays and then waits this many seconds before polling +again. The default is 60 seconds. Since 2.6.16, there is no need to +reduce this as the kernel alerts +.I mdadm +immediately when there is any change. + +.TP +.BR \-r ", " \-\-increment +Give a percentage increment. +.I mdadm +will generate RebuildNN events with the given percentage increment. + +.TP +.BR \-f ", " \-\-daemonise +Tell +.I mdadm +to run as a background daemon if it decides to monitor anything. This +causes it to fork and run in the child, and to disconnect from the +terminal. The process id of the child is written to stdout. +This is useful with +.B \-\-scan +which will only continue monitoring if a mail address or alert program +is found in the config file. + +.TP +.BR \-i ", " \-\-pid\-file +When +.I mdadm +is running in daemon mode, write the pid of the daemon process to +the specified file, instead of printing it on standard output. + +.TP +.BR \-1 ", " \-\-oneshot +Check arrays only once. This will generate +.B NewArray +events and more significantly +.B DegradedArray +and +.B SparesMissing +events. Running +.in +5 +.B " mdadm \-\-monitor \-\-scan \-1" +.in -5 +from a cron script will ensure regular notification of any degraded arrays. + +.TP +.BR \-t ", " \-\-test +Generate a +.B TestMessage +alert for every array found at startup. This alert gets mailed and +passed to the alert program. This can be used for testing that alert +message do get through successfully. + +.TP +.BR \-\-no\-sharing +This inhibits the functionality for moving spares between arrays. +Only one monitoring process started with +.B \-\-scan +but without this flag is allowed, otherwise the two could interfere +with each other. + +.SH ASSEMBLE MODE + +.HP 12 +Usage: +.B mdadm \-\-assemble +.I md-device options-and-component-devices... 
+.HP 12 +Usage: +.B mdadm \-\-assemble \-\-scan +.I md-devices-and-options... +.HP 12 +Usage: +.B mdadm \-\-assemble \-\-scan +.I options... + +.PP +This usage assembles one or more RAID arrays from pre-existing components. +For each array, mdadm needs to know the md device, the identity of the +array, and a number of component-devices. These can be found in a number of ways. + +In the first usage example (without the +.BR \-\-scan ) +the first device given is the md device. +In the second usage example, all devices listed are treated as md +devices and assembly is attempted. +In the third (where no devices are listed) all md devices that are +listed in the configuration file are assembled. If no arrays are +described by the configuration file, then any arrays that +can be found on unused devices will be assembled. + +If precisely one device is listed, but +.B \-\-scan +is not given, then +.I mdadm +acts as though +.B \-\-scan +was given and identity information is extracted from the configuration file. + +The identity can be given with the +.B \-\-uuid +option, the +.B \-\-name +option, or the +.B \-\-super\-minor +option, will be taken from the md-device record in the config file, or +will be taken from the super block of the first component-device +listed on the command line. + +Devices can be given on the +.B \-\-assemble +command line or in the config file. Only devices which have an md +superblock which contains the right identity will be considered for +any array. + +The config file is only used if explicitly named with +.B \-\-config +or requested with (a possibly implicit) +.BR \-\-scan . +In the latter case, +.B /etc/mdadm/mdadm.conf +or +.B /etc/mdadm.conf +is used. + +If +.B \-\-scan +is not given, then the config file will only be used to find the +identity of md arrays. + +Normally the array will be started after it is assembled.
However if +.B \-\-scan +is not given and not all expected drives were listed, then the array +is not started (to guard against usage errors). To insist that the +array be started in this case (as may work for RAID1, 4, 5, 6, or 10), +give the +.B \-\-run +flag. + +If +.I udev +is active, +.I mdadm +does not create any entries in +.B /dev +but leaves that to +.IR udev . +It does record information in +.B {MAP_PATH} +which will allow +.I udev +to choose the correct name. + +If +.I mdadm +detects that udev is not configured, it will create the devices in +.B /dev +itself. + +In Linux kernels prior to version 2.6.28 there were two distinctly +different types of md devices that could be created: one that could be +partitioned using standard partitioning tools and one that could not. +Since 2.6.28 that distinction is no longer relevant as both types of +devices can be partitioned. +.I mdadm +will normally create the type that originally could not be partitioned +as it has a well defined major number (9). + +Prior to 2.6.28, it is important that mdadm chooses the correct type +of array device to use. This can be controlled with the +.B \-\-auto +option. In particular, a value of "mdp" or "part" or "p" tells mdadm +to use a partitionable device rather than the default. + +In the no-udev case, the value given to +.B \-\-auto +can be suffixed by a number. This tells +.I mdadm +to create that number of partition devices rather than the default of 4. + +The value given to +.B \-\-auto +can also be given in the configuration file as a word starting +.B auto= +on the ARRAY line for the relevant array. + +.SS Auto Assembly +When +.B \-\-assemble +is used with +.B \-\-scan +and no devices are listed, +.I mdadm +will first attempt to assemble all the arrays listed in the config +file. + +If no arrays are listed in the config (other than those marked +.BR <ignore> ) +it will look through the available devices for possible arrays and +will try to assemble anything that it finds.
Arrays which are tagged +as belonging to the given homehost will be assembled and started +normally. Arrays which do not obviously belong to this host are given +names that are expected not to conflict with anything local, and are +started "read-auto" so that nothing is written to any device until the +array is written to. i.e. automatic resync etc is delayed. + +If +.I mdadm +finds a consistent set of devices that look like they should comprise +an array, and if the superblock is tagged as belonging to the given +home host, it will automatically choose a device name and try to +assemble the array. If the array uses version-0.90 metadata, then the +.B minor +number as recorded in the superblock is used to create a name in +.B /dev/md/ +so for example +.BR /dev/md/3 . +If the array uses version-1 metadata, then the +.B name +from the superblock is used to similarly create a name in +.B /dev/md/ +(the name will have any 'host' prefix stripped first). + +This behaviour can be modified by the +.I AUTO +line in the +.I mdadm.conf +configuration file. This line can indicate that specific metadata +type should, or should not, be automatically assembled. If an array +is found which is not listed in +.I mdadm.conf +and has a metadata format that is denied by the +.I AUTO +line, then it will not be assembled. +The +.I AUTO +line can also request that all arrays identified as being for this +homehost should be assembled regardless of their metadata type. +See +.IR mdadm.conf (5) +for further details. + +Note: Auto assembly cannot be used for assembling and activating some +arrays which are undergoing reshape. In particular as the +.B backup\-file +cannot be given, any reshape which requires a backup-file to continue +cannot be started by auto assembly. An array which is growing to more +devices and has passed the critical section can be assembled using +auto-assembly. 
+ +.SH BUILD MODE + +.HP 12 +Usage: +.B mdadm \-\-build +.I md-device +.BI \-\-chunk= X +.BI \-\-level= Y +.BI \-\-raid\-devices= Z +.I devices + +.PP +This usage is similar to +.BR \-\-create . +The difference is that it creates an array without a superblock. With +these arrays there is no difference between initially creating the array and +subsequently assembling the array, except that hopefully there is useful +data there in the second case. + +The level may be raid0, linear, raid1, raid10, multipath, or faulty, or +one of their synonyms. All devices must be listed and the array will +be started once complete. It will often be appropriate to use +.B \-\-assume\-clean +with levels raid1 or raid10. + +.SH CREATE MODE + +.HP 12 +Usage: +.B mdadm \-\-create +.I md-device +.BI \-\-chunk= X +.BI \-\-level= Y +.br +.BI \-\-raid\-devices= Z +.I devices + +.PP +This usage will initialise a new md array, associate some devices with +it, and activate the array. + +The named device will normally not exist when +.I "mdadm \-\-create" +is run, but will be created by +.I udev +once the array becomes active. + +As devices are added, they are checked to see if they contain RAID +superblocks or filesystems. They are also checked to see if the variance in +device size exceeds 1%. + +If any discrepancy is found, the array will not automatically be run, though +the presence of a +.B \-\-run +can override this caution. + +To create a "degraded" array in which some devices are missing, simply +give the word "\fBmissing\fP" +in place of a device name. This will cause +.I mdadm +to leave the corresponding slot in the array empty. +For a RAID4 or RAID5 array at most one slot can be +"\fBmissing\fP"; for a RAID6 array at most two slots. +For a RAID1 array, only one real device needs to be given. All of the +others can be +"\fBmissing\fP". + +When creating a RAID5 array, +.I mdadm +will automatically create a degraded array with an extra spare drive.
+This is because building the spare into a degraded array is in general +faster than resyncing the parity on a non-degraded, but not clean, +array. This feature can be overridden with the +.B \-\-force +option. + +When creating an array with version-1 metadata a name for the array is +required. +If this is not given with the +.B \-\-name +option, +.I mdadm +will choose a name based on the last component of the name of the +device being created. So if +.B /dev/md3 +is being created, then the name +.B 3 +will be chosen. +If +.B /dev/md/home +is being created, then the name +.B home +will be used. + +When creating a partition based array, using +.I mdadm +with version-1.x metadata, the partition type should be set to +.B 0xDA +(non fs-data). This type selection allows for greater precision since +using any other [RAID auto-detect (0xFD) or a GNU/Linux partition (0x83)], +might create problems in the event of array recovery through a live cdrom. + +A new array will normally get a randomly assigned 128bit UUID which is +very likely to be unique. If you have a specific need, you can choose +a UUID for the array by giving the +.B \-\-uuid= +option. Be warned that creating two arrays with the same UUID is a +recipe for disaster. Also, using +.B \-\-uuid= +when creating a v0.90 array will silently override any +.B \-\-homehost= +setting. +.\"If the +.\".B \-\-size +.\"option is given, it is not necessary to list any component-devices in this command. +.\"They can be added later, before a +.\".B \-\-run. +.\"If no +.\".B \-\-size +.\"is given, the apparent size of the smallest drive given is used. + +If the array type supports a write-intent bitmap, and if the devices +in the array exceed 100G in size, an internal write-intent bitmap +will automatically be added unless some other option is explicitly +requested with the +.B \-\-bitmap +option. In any case space for a bitmap will be reserved so that one +can be added later with +.BR "\-\-grow \-\-bitmap=internal" .
+ +If the metadata type supports it (currently only 1.x metadata), space +will be allocated to store a bad block list. This allows a modest +number of bad blocks to be recorded, allowing the drive to remain in +service while only partially functional. + +When creating an array within a +.B CONTAINER +.I mdadm +can be given either the list of devices to use, or simply the name of +the container. The former case gives control over which devices in +the container will be used for the array. The latter case allows +.I mdadm +to automatically choose which devices to use based on how much spare +space is available. + +The General Management options that are valid with +.B \-\-create +are: +.TP +.B \-\-run +insist on running the array even if some devices look like they might +be in use. + +.TP +.B \-\-readonly +start the array readonly \(em not supported yet. + +.SH MANAGE MODE +.HP 12 +Usage: +.B mdadm +.I device +.I options... devices... +.PP + +This usage will allow individual devices in an array to be failed, +removed or added. It is possible to perform multiple operations with +one command. For example: +.br +.B " mdadm /dev/md0 \-f /dev/hda1 \-r /dev/hda1 \-a /dev/hda1" +.br +will firstly mark +.B /dev/hda1 +as faulty in +.B /dev/md0 +and will then remove it from the array and finally add it back +in as a spare. However only one md array can be affected by a single +command. + +When a device is added to an active array, mdadm checks to see if it +has metadata on it which suggests that it was recently a member of the +array. If it does, it tries to "re\-add" the device. If there have +been no changes since the device was removed, or if the array has a +write-intent bitmap which has recorded whatever changes there were, +then the device will immediately become a full member of the array and +those differences recorded in the bitmap will be resolved. + +.SH MISC MODE +.HP 12 +Usage: +.B mdadm +.I options ... +.I devices ...
+.PP + +MISC mode includes a number of distinct operations that +operate on distinct devices. The operations are: +.TP +.B \-\-query +The device is examined to see if it is +(1) an active md array, or +(2) a component of an md array. +The information discovered is reported. + +.TP +.B \-\-detail +The device should be an active md device. +.B mdadm +will display a detailed description of the array. +.B \-\-brief +or +.B \-\-scan +will cause the output to be less detailed and the format to be +suitable for inclusion in +.BR mdadm.conf . +The exit status of +.I mdadm +will normally be 0 unless +.I mdadm +failed to get useful information about the device(s); however, if the +.B \-\-test +option is given, then the exit status will be: +.RS +.TP +0 +The array is functioning normally. +.TP +1 +The array has at least one failed device. +.TP +2 +The array has multiple failed devices such that it is unusable. +.TP +4 +There was an error while trying to get information about the device. +.RE + +.TP +.B \-\-detail\-platform +Print detail of the platform's RAID capabilities (firmware / hardware +topology). If the metadata is specified with +.B \-e +or +.B \-\-metadata= +then the return status will be: +.RS +.TP +0 +metadata successfully enumerated its platform components on this system +.TP +1 +metadata is platform independent +.TP +2 +metadata failed to find its platform components on this system +.RE + +.TP +.B \-\-update\-subarray= +If the device is a container and the argument to \-\-update\-subarray +specifies a subarray in the container, then attempt to update the given +superblock field in the subarray. Similar to updating an array in +"assemble" mode, the field to update is selected by +.B \-U +or +.B \-\-update= +option. Currently only +.B name +is supported. + +The +.B name +option updates the subarray name in the metadata, it may not affect the +device node name or the device node symlink until the subarray is +re\-assembled. 
If updating +.B name +would change the UUID of an active subarray this operation is blocked, +and the command will end in an error. + +.TP +.B \-\-examine +The device should be a component of an md array. +.I mdadm +will read the md superblock of the device and display the contents. +If +.B \-\-brief +or +.B \-\-scan +is given, then multiple devices that are components of the one array +are grouped together and reported in a single entry suitable +for inclusion in +.BR mdadm.conf . + +Having +.B \-\-scan +without listing any devices will cause all devices listed in the +config file to be examined. + +.TP +.BI \-\-dump= directory +If the device contains RAID metadata, a file will be created in the +.I directory +and the metadata will be written to it. The file will be the same +size as the device and have the metadata written in the file at the +same location that it exists in the device. However the file will be "sparse" so +that only those blocks containing metadata will be allocated. The +total space used will be small. + +The file name used in the +.I directory +will be the base name of the device. Further if any links appear in +.I /dev/disk/by-id +which point to the device, then hard links to the file will be created +in +.I directory +based on these +.I by-id +names. + +Multiple devices can be listed and their metadata will all be stored +in the one directory. + +.TP +.BI \-\-restore= directory +This is the reverse of +.BR \-\-dump . +.I mdadm +will locate a file in the directory that has a name appropriate for +the given device and will restore metadata from it. Names that match +.I /dev/disk/by-id +names are preferred, however if two of those refer to different files, +.I mdadm +will not choose between them but will abort the operation. + +If a file name is given instead of a +.I directory +then +.I mdadm +will restore from that file to a single device, always provided the +size of the file matches that of the device, and the file contains +valid metadata.
+.TP +.B \-\-stop +The devices should be active md arrays which will be deactivated, as +long as they are not currently in use. + +.TP +.B \-\-run +This will fully activate a partially assembled md array. + +.TP +.B \-\-readonly +This will mark an active array as read-only, providing that it is +not currently being used. + +.TP +.B \-\-readwrite +This will change a +.B readonly +array back to being read/write. + +.TP +.B \-\-scan +For all operations except +.BR \-\-examine , +.B \-\-scan +will cause the operation to be applied to all arrays listed in +.BR /proc/mdstat . +For +.BR \-\-examine, +.B \-\-scan +causes all devices listed in the config file to be examined. + +.TP +.BR \-b ", " \-\-brief +Be less verbose. This is used with +.B \-\-detail +and +.BR \-\-examine . +Using +.B \-\-brief +with +.B \-\-verbose +gives an intermediate level of verbosity. + +.SH MONITOR MODE + +.HP 12 +Usage: +.B mdadm \-\-monitor +.I options... devices... + +.PP +This usage causes +.I mdadm +to periodically poll a number of md arrays and to report on any events +noticed. +.I mdadm +will never exit once it decides that there are arrays to be checked, +so it should normally be run in the background. + +As well as reporting events, +.I mdadm +may move a spare drive from one array to another if they are in the +same +.B spare-group +or +.B domain +and if the destination array has a failed drive but no spares. + +If any devices are listed on the command line, +.I mdadm +will only monitor those devices. Otherwise all arrays listed in the +configuration file will be monitored. Further, if +.B \-\-scan +is given, then any other md devices that appear in +.B /proc/mdstat +will also be monitored. + +The result of monitoring the arrays is the generation of events. +These events are passed to a separate program (if specified) and may +be mailed to a given E-mail address. 
+ +When passing events to a program, the program is run once for each event, +and is given 2 or 3 command-line arguments: the first is the +name of the event (see below), the second is the name of the +md device which is affected, and the third is the name of a related +device if relevant (such as a component device that has failed). + +If +.B \-\-scan +is given, then a program or an E-mail address must be specified on the +command line or in the config file. If neither are available, then +.I mdadm +will not monitor anything. +Without +.B \-\-scan, +.I mdadm +will continue monitoring as long as something was found to monitor. If +no program or email is given, then each event is reported to +.BR stdout . + +The different events are: + +.RS 4 +.TP +.B DeviceDisappeared +An md array which previously was configured appears to no longer be +configured. (syslog priority: Critical) + +If +.I mdadm +was told to monitor an array which is RAID0 or Linear, then it will +report +.B DeviceDisappeared +with the extra information +.BR Wrong-Level . +This is because RAID0 and Linear do not support the device-failed, +hot-spare and resync operations which are monitored. + +.TP +.B RebuildStarted +An md array started reconstruction (e.g. recovery, resync, reshape, +check, repair). (syslog priority: Warning) + +.TP +.BI Rebuild NN +Where +.I NN +is a two-digit number (ie. 05, 48). This indicates that rebuild +has passed that many percent of the total. The events are generated +with fixed increment since 0. Increment size may be specified with +a commandline option (default is 20). (syslog priority: Warning) + +.TP +.B RebuildFinished +An md array that was rebuilding, isn't any more, either because it +finished normally or was aborted. (syslog priority: Warning) + +.TP +.B Fail +An active component device of an array has been marked as +faulty. (syslog priority: Critical) + +.TP +.B FailSpare +A spare component device which was being rebuilt to replace a faulty +device has failed. 
(syslog priority: Critical) + +.TP +.B SpareActive +A spare component device which was being rebuilt to replace a faulty +device has been successfully rebuilt and has been made active. +(syslog priority: Info) + +.TP +.B NewArray +A new md array has been detected in the +.B /proc/mdstat +file. (syslog priority: Info) + +.TP +.B DegradedArray +A newly noticed array appears to be degraded. This message is not +generated when +.I mdadm +notices a drive failure which causes degradation, but only when +.I mdadm +notices that an array is degraded when it first sees the array. +(syslog priority: Critical) + +.TP +.B MoveSpare +A spare drive has been moved from one array in a +.B spare-group +or +.B domain +to another to allow a failed drive to be replaced. +(syslog priority: Info) + +.TP +.B SparesMissing +If +.I mdadm +has been told, via the config file, that an array should have a certain +number of spare devices, and +.I mdadm +detects that it has fewer than this number when it first sees the +array, it will report a +.B SparesMissing +message. +(syslog priority: Warning) + +.TP +.B TestMessage +An array was found at startup, and the +.B \-\-test +flag was given. +(syslog priority: Info) +.RE + +Only +.B Fail, +.B FailSpare, +.B DegradedArray, +.B SparesMissing +and +.B TestMessage +cause Email to be sent. All events cause the program to be run. +The program is run with two or three arguments: the event +name, the array device and possibly a second device. + +Each event has an associated array device (e.g. +.BR /dev/md1 ) +and possibly a second device. For +.BR Fail , +.BR FailSpare , +and +.B SpareActive +the second device is the relevant component device. +For +.B MoveSpare +the second device is the array that the spare was moved from. + +For +.I mdadm +to move spares from one array to another, the different arrays need to +be labeled with the same +.B spare-group +or the spares must be allowed to migrate through matching POLICY domains +in the configuration file. 
The +.B spare-group +name can be any string; it is only necessary that different spare +groups use different names. + +When +.I mdadm +detects that an array in a spare group has fewer active +devices than necessary for the complete array, and has no spare +devices, it will look for another array in the same spare group that +has a full complement of working drives and a spare. It will then +attempt to remove the spare from the second array and add it to the +first. +If the removal succeeds but the adding fails, then it is added back to +the original array. + +If the spare group for a degraded array is not defined, +.I mdadm +will look at the rules of spare migration specified by POLICY lines in +.B mdadm.conf +and then follow similar steps as above if a matching spare is found. + +.SH GROW MODE +The GROW mode is used for changing the size or shape of an active +array. +For this to work, the kernel must support the necessary change. +Various types of growth are being added during 2.6 development. + +Currently the supported changes include +.IP \(bu 4 +change the "size" attribute for RAID1, RAID4, RAID5 and RAID6. +.IP \(bu 4 +increase or decrease the "raid\-devices" attribute of RAID0, RAID1, RAID4, +RAID5, and RAID6. +.IP \(bu 4 +change the chunk-size and layout of RAID0, RAID4, RAID5, RAID6 and RAID10. +.IP \(bu 4 +convert between RAID1 and RAID5, between RAID5 and RAID6, between +RAID0, RAID4, and RAID5, and between RAID0 and RAID10 (in the near-2 mode). +.IP \(bu 4 +add a write-intent bitmap to any array which supports these bitmaps, or +remove a write-intent bitmap from such an array. +.PP + +Using GROW on containers is currently supported only for Intel's IMSM +container format. The number of devices in a container can be +increased - which affects all arrays in the container - or an array +in a container can be converted between levels where those levels are +supported by the container, and the conversion is one of those listed +above.
Resizing arrays in an IMSM container with +.B "--grow --size" +is not yet supported. + +Grow functionality (e.g. expand a number of raid devices) for Intel's +IMSM container format has an experimental status. It is guarded by the +.B MDADM_EXPERIMENTAL +environment variable which must be set to '1' for a GROW command to +succeed. +This is for the following reasons: + +.IP 1. +Intel's native IMSM check-pointing is not fully tested yet. +This can cause IMSM incompatibility during the grow process: an array +which is growing cannot roam between Microsoft Windows(R) and Linux +systems. + +.IP 2. +Interrupting a grow operation is not recommended, because it +has not been fully tested for Intel's IMSM container format yet. + +.PP +Note: Intel's native checkpointing doesn't use +.B --backup-file +option and it is transparent for assembly feature. + +.SS SIZE CHANGES +Normally when an array is built the "size" is taken from the smallest +of the drives. If all the small drives in an array are, one at a +time, removed and replaced with larger drives, then you could have an +array of large drives with only a small amount used. In this +situation, changing the "size" with "GROW" mode will allow the extra +space to start being used. If the size is increased in this way, a +"resync" process will start to make sure the new parts of the array +are synchronised. + +Note that when an array changes size, any filesystem that may be +stored in the array will not automatically grow or shrink to use or +vacate the space. The +filesystem will need to be explicitly told to use the extra space +after growing, or to reduce its size +.B prior +to shrinking the array. + +Also the size of an array cannot be changed while it has an active +bitmap. If an array has a bitmap, it must be removed before the size +can be changed. Once the change is complete a new bitmap can be created.
+ +.SS RAID\-DEVICES CHANGES + +A RAID1 array can work with any number of devices from 1 upwards +(though 1 is not very useful). There may be times when you want to +increase or decrease the number of active devices. Note that this is +different to hot-add or hot-remove which changes the number of +inactive devices. + +When reducing the number of devices in a RAID1 array, the slots which +are to be removed from the array must already be vacant. That is, the +devices which were in those slots must be failed and removed. + +When the number of devices is increased, any hot spares that are +present will be activated immediately. + +Changing the number of active devices in a RAID5 or RAID6 is much more +effort. Every block in the array will need to be read and written +back to a new location. From 2.6.17, the Linux Kernel is able to +increase the number of devices in a RAID5 safely, including restarting +an interrupted "reshape". From 2.6.31, the Linux Kernel is able to +increase or decrease the number of devices in a RAID5 or RAID6. + +From 2.6.35, the Linux Kernel is able to convert a RAID0 in to a RAID4 +or RAID5. +.I mdadm +uses this functionality and the ability to add +devices to a RAID4 to allow devices to be added to a RAID0. When +requested to do this, +.I mdadm +will convert the RAID0 to a RAID4, add the necessary disks and make +the reshape happen, and then convert the RAID4 back to RAID0. + +When decreasing the number of devices, the size of the array will also +decrease. If there was data in the array, it could get destroyed and +this is not reversible, so you should firstly shrink the filesystem on +the array to fit within the new size. To help prevent accidents, +.I mdadm +requires that the size of the array be decreased first with +.BR "mdadm --grow --array-size" . +This is a reversible change which simply makes the end of the array +inaccessible.
The integrity of any data can then be checked before +the non-reversible reduction in the number of devices is requested. + +When relocating the first few stripes on a RAID5 or RAID6, it is not +possible to keep the data on disk completely consistent and +crash-proof. To provide the required safety, mdadm disables writes to +the array while this "critical section" is reshaped, and takes a +backup of the data that is in that section. For grows, this backup may be +stored in any spare devices that the array has, however it can also be +stored in a separate file specified with the +.B \-\-backup\-file +option, and is required to be specified for shrinks, RAID level +changes and layout changes. If this option is used, and the system +does crash during the critical period, the same file must be passed to +.B \-\-assemble +to restore the backup and reassemble the array. When shrinking rather +than growing the array, the reshape is done from the end towards the +beginning, so the "critical section" is at the end of the reshape. + +.SS LEVEL CHANGES + +Changing the RAID level of any array happens instantaneously. However +in the RAID5 to RAID6 case this requires a non-standard layout of the +RAID6 data, and in the RAID6 to RAID5 case that non-standard layout is +required before the change can be accomplished. So while the level +change is instant, the accompanying layout change can take quite a +long time. A +.B \-\-backup\-file +is required. If the array is not simultaneously being grown or +shrunk, so that the array size will remain the same - for example, +reshaping a 3-drive RAID5 into a 4-drive RAID6 - the backup file will +be used not just for a "critical section" but throughout the reshape +operation, as described below under LAYOUT CHANGES. + +.SS CHUNK-SIZE AND LAYOUT CHANGES + +Changing the chunk-size or layout without also changing the number of +devices at the same time will involve re-writing all blocks in-place.
+To ensure against data loss in the case of a crash, a +.B --backup-file +must be provided for these changes. Small sections of the array will +be copied to the backup file while they are being rearranged. This +means that all the data is copied twice, once to the backup and once +to the new layout on the array, so this type of reshape will go very +slowly. + +If the reshape is interrupted for any reason, this backup file must be +made available to +.B "mdadm --assemble" +so the array can be reassembled. Consequently the file cannot be +stored on the device being reshaped. + + +.SS BITMAP CHANGES + +A write-intent bitmap can be added to, or removed from, an active +array. Either internal bitmaps, or bitmaps stored in a separate file, +can be added. Note that if you add a bitmap stored in a file which is +in a filesystem that is on the RAID array being affected, the system +will deadlock. The bitmap must be on a separate filesystem. + +.SH INCREMENTAL MODE + +.HP 12 +Usage: +.B mdadm \-\-incremental +.RB [ \-\-run ] +.RB [ \-\-quiet ] +.I component-device +.RI [ optional-aliases-for-device ] +.HP 12 +Usage: +.B mdadm \-\-incremental \-\-fail +.I component-device +.HP 12 +Usage: +.B mdadm \-\-incremental \-\-rebuild\-map +.HP 12 +Usage: +.B mdadm \-\-incremental \-\-run \-\-scan + +.PP +This mode is designed to be used in conjunction with a device +discovery system. As devices are found in a system, they can be +passed to +.B "mdadm \-\-incremental" +to be conditionally added to an appropriate array. + +Conversely, it can also be used with the +.B \-\-fail +flag to do just the opposite and find whatever array a particular device +is part of and remove the device from that array. + +If the device passed is a +.B CONTAINER +device created by a previous call to +.IR mdadm , +then rather than trying to add that device to an array, all the arrays +described by the metadata of the container will be started. 
+ +.I mdadm +performs a number of tests to determine if the device is part of an +array, and which array it should be part of. If an appropriate array +is found, or can be created, +.I mdadm +adds the device to the array and conditionally starts the array. + +Note that +.I mdadm +will normally only add devices to an array which were previously working +(active or spare) parts of that array. The support for automatic +inclusion of a new drive as a spare in some array requires +a configuration through POLICY in config file. + +The tests that +.I mdadm +makes are as follows: +.IP + +Is the device permitted by +.BR mdadm.conf ? +That is, is it listed in a +.B DEVICES +line in that file. If +.B DEVICES +is absent then the default is to allow any device. Similarly if +.B DEVICES +contains the special word +.B partitions +then any device is allowed. Otherwise the device name given to +.IR mdadm , +or one of the aliases given, or an alias found in the filesystem, +must match one of the names or patterns in a +.B DEVICES +line. + +This is the only context where the aliases are used. They are +usually provided by a +.I udev +rule mentioning +.BR ${DEVLINKS} . + +.IP + +Does the device have a valid md superblock? If a specific metadata +version is requested with +.B \-\-metadata +or +.B \-e +then only that style of metadata is accepted, otherwise +.I mdadm +finds any known version of metadata. If no +.I md +metadata is found, the device may be still added to an array +as a spare if POLICY allows. + +.ig +.IP + +Does the metadata match an expected array? +The metadata can match in two ways. Either there is an array listed +in +.B mdadm.conf +which identifies the array (either by UUID, by name, by device list, +or by minor-number), or the array was created with a +.B homehost +specified and that +.B homehost +matches the one in +.B mdadm.conf +or on the command line.
+If +.I mdadm +is not able to positively identify the array as belonging to the +current host, the device will be rejected. +.. + +.PP +.I mdadm +keeps a list of arrays that it has partially assembled in +.BR {MAP_PATH} . +If no array exists which matches +the metadata on the new device, +.I mdadm +must choose a device name and unit number. It does this based on any +name given in +.B mdadm.conf +or any name information stored in the metadata. If this name +suggests a unit number, that number will be used, otherwise a free +unit number will be chosen. Normally +.I mdadm +will prefer to create a partitionable array, however if the +.B CREATE +line in +.B mdadm.conf +suggests that a non-partitionable array is preferred, that will be +honoured. + +If the array is not found in the config file and its metadata does not +identify it as belonging to the "homehost", then +.I mdadm +will choose a name for the array which is certain not to conflict with +any array which does belong to this host. It does this by adding an +underscore and a small number to the name preferred by the metadata. + +Once an appropriate array is found or created and the device is added, +.I mdadm +must decide if the array is ready to be started. It will +normally compare the number of available (non-spare) devices to the +number of devices that the metadata suggests need to be active. If +there are at least that many, the array will be started. This means +that if any devices are missing the array will not be restarted. + +As an alternative, +.B \-\-run +may be passed to +.I mdadm +in which case the array will be run as soon as there are enough +devices present for the data to be accessible. For a RAID1, that +means one device will start the array. For a clean RAID5, the array +will be started as soon as all but one drive is present. + +Note that neither of these approaches is really ideal.
If it can +be known that all device discovery has completed, then +.br +.B " mdadm \-IRs" +.br +can be run which will try to start all arrays that are being +incrementally assembled. They are started in "read-auto" mode in +which they are read-only until the first write request. This means +that no metadata updates are made and no attempt at resync or recovery +happens. Further devices that are found before the first write can +still be added safely. + +.SH ENVIRONMENT +This section describes environment variables that affect how mdadm +operates. + +.TP +.B MDADM_NO_MDMON +Setting this value to 1 will prevent mdadm from automatically launching +mdmon. This variable is intended primarily for debugging mdadm/mdmon. + +.TP +.B MDADM_NO_UDEV +Normally, +.I mdadm +does not create any device nodes in /dev, but leaves that task to +.IR udev . +If +.I udev +appears not to be configured, or if this environment variable is set +to '1', then +.I mdadm +will create any devices that are needed. + +.TP +.B MDADM_NO_SYSTEMCTL +If +.I mdadm +detects that +.I systemd +is in use it will normally request +.I systemd +to start various background tasks (particularly +.IR mdmon ) +rather than forking and running them in the background. This can be +suppressed by setting +.BR MDADM_NO_SYSTEMCTL=1 . + +.TP +.B IMSM_NO_PLATFORM +A key value of IMSM metadata is that it allows interoperability with +boot ROMs on Intel platforms, and with other major operating systems. +Consequently, +.I mdadm +will only allow an IMSM array to be created or modified if it detects +that it is running on an Intel platform which supports IMSM, and +supports the particular configuration of IMSM that is being requested +(some functionality requires newer OROM support). + +These checks can be suppressed by setting IMSM_NO_PLATFORM=1 in the +environment. This can be useful for testing or for disaster +recovery. You should be aware that interoperability may be +compromised by setting this value.
+ +.TP +.B MDADM_GROW_ALLOW_OLD +If an array is stopped while it is performing a reshape and that +reshape was making use of a backup file, then when the array is +re-assembled +.I mdadm +will sometimes complain that the backup file is too old. If this +happens and you are certain it is the right backup file, you can +over-ride this check by setting +.B MDADM_GROW_ALLOW_OLD=1 +in the environment. + +.TP +.B MDADM_CONF_AUTO +Any string given in this variable is added to the start of the +.B AUTO +line in the config file, or treated as the whole +.B AUTO +line if none is given. It can be used to disable certain metadata +types when +.I mdadm +is called from a boot script. For example +.br +.B " export MDADM_CONF_AUTO='-ddf -imsm'" +.br +will make sure that +.I mdadm +does not automatically assemble any DDF or +IMSM arrays that are found. This can be useful on systems configured +to manage such arrays with +.BR dmraid . + + +.SH EXAMPLES + +.B " mdadm \-\-query /dev/name-of-device" +.br +This will find out if a given device is a RAID array, or is part of +one, and will provide brief information about the device. + +.B " mdadm \-\-assemble \-\-scan" +.br +This will assemble and start all arrays listed in the standard config +file. This command will typically go in a system startup file. + +.B " mdadm \-\-stop \-\-scan" +.br +This will shut down all arrays that can be shut down (i.e. are not +currently in use). This will typically go in a system shutdown script. + +.B " mdadm \-\-follow \-\-scan \-\-delay=120" +.br +If (and only if) there is an Email address or program given in the +standard config file, then +monitor the status of all arrays listed in that file by +polling them every 2 minutes. + +.B " mdadm \-\-create /dev/md0 \-\-level=1 \-\-raid\-devices=2 /dev/hd[ac]1" +.br +Create /dev/md0 as a RAID1 array consisting of /dev/hda1 and /dev/hdc1.
+ +.br +.B " echo 'DEVICE /dev/hd*[0\-9] /dev/sd*[0\-9]' > mdadm.conf" +.br +.B " mdadm \-\-detail \-\-scan >> mdadm.conf" +.br +This will create a prototype config file that describes currently +active arrays that are known to be made from partitions of IDE or SCSI drives. +This file should be reviewed before being used as it may +contain unwanted detail. + +.B " echo 'DEVICE /dev/hd[a\-z] /dev/sd*[a\-z]' > mdadm.conf" +.br +.B " mdadm \-\-examine \-\-scan \-\-config=mdadm.conf >> mdadm.conf" +.br +This will find arrays which could be assembled from existing IDE and +SCSI whole drives (not partitions), and store the information in the +format of a config file. +This file is very likely to contain unwanted detail, particularly +the +.B devices= +entries. It should be reviewed and edited before being used as an +actual config file. + +.B " mdadm \-\-examine \-\-brief \-\-scan \-\-config=partitions" +.br +.B " mdadm \-Ebsc partitions" +.br +Create a list of devices by reading +.BR /proc/partitions , +scan these for RAID superblocks, and printout a brief listing of all +that were found. + +.B " mdadm \-Ac partitions \-m 0 /dev/md0" +.br +Scan all partitions and devices listed in +.BR /proc/partitions +and assemble +.B /dev/md0 +out of all such devices with a RAID superblock with a minor number of 0. + +.B " mdadm \-\-monitor \-\-scan \-\-daemonise > /run/mdadm/mon.pid" +.br +If config file contains a mail address or alert program, run mdadm in +the background in monitor mode monitoring all md devices. Also write +pid of mdadm daemon to +.BR /run/mdadm/mon.pid . + +.B " mdadm \-Iq /dev/somedevice" +.br +Try to incorporate newly discovered device into some array as +appropriate. + +.B " mdadm \-\-incremental \-\-rebuild\-map \-\-run \-\-scan" +.br +Rebuild the array map from any current arrays, and then start any that +can be started. 
+ +.B " mdadm /dev/md4 --fail detached --remove detached" +.br +Any devices which are components of /dev/md4 will be marked as faulty +and then removed from the array. + +.B " mdadm --grow /dev/md4 --level=6 --backup-file=/root/backup-md4" +.br +The array +.B /dev/md4 +which is currently a RAID5 array will be converted to RAID6. There +should normally already be a spare drive attached to the array as a +RAID6 needs one more drive than a matching RAID5. + +.B " mdadm --create /dev/md/ddf --metadata=ddf --raid-disks 6 /dev/sd[a-f]" +.br +Create a DDF array over 6 devices. + +.B " mdadm --create /dev/md/home -n3 -l5 -z 30000000 /dev/md/ddf" +.br +Create a RAID5 array over any 3 devices in the given DDF set. Use +only 30 gigabytes of each device. + +.B " mdadm -A /dev/md/ddf1 /dev/sd[a-f]" +.br +Assemble a pre-existing ddf array. + +.B " mdadm -I /dev/md/ddf1" +.br +Assemble all arrays contained in the ddf array, assigning names as +appropriate. + +.B " mdadm \-\-create \-\-help" +.br +Provide help about the Create mode. + +.B " mdadm \-\-config \-\-help" +.br +Provide help about the format of the config file. + +.B " mdadm \-\-help" +.br +Provide general help. + +.SH FILES + +.SS /proc/mdstat + +If you're using the +.B /proc +filesystem, +.B /proc/mdstat +lists all active md devices with information about them. +.I mdadm +uses this to find arrays when +.B \-\-scan +is given in Misc mode, and to monitor array reconstruction +on Monitor mode. + +.SS /etc/mdadm/mdadm.conf (or /etc/mdadm.conf) + +The config file lists which devices may be scanned to see if +they contain MD super block, and gives identifying information +(e.g. UUID) about known MD arrays. See +.BR mdadm.conf (5) +for more details. + +.SS /etc/mdadm/mdadm.conf.d (or /etc/mdadm.conf.d) + +A directory containing configuration files which are read in lexical +order. + +.SS {MAP_PATH} +When +.B \-\-incremental +mode is used, this file gets a list of arrays currently being created.
+ +.SH DEVICE NAMES + +.I mdadm +understands two sorts of names for array devices. + +The first is the so-called 'standard' format name, which matches the +names used by the kernel and which appear in +.IR /proc/mdstat . + +The second sort can be freely chosen, but must reside in +.IR /dev/md/ . +When giving a device name to +.I mdadm +to create or assemble an array, either full path name such as +.I /dev/md0 +or +.I /dev/md/home +can be given, or just the suffix of the second sort of name, such as +.I home +can be given. + +When +.I mdadm +chooses device names during auto-assembly or incremental assembly, it +will sometimes add a small sequence number to the end of the name to +avoid conflicts between multiple arrays that have the same name. If +.I mdadm +can reasonably determine that the array really is meant for this host, +either by a hostname in the metadata, or by the presence of the array +in +.BR mdadm.conf , +then it will leave off the suffix if possible. +Also if the homehost is specified as +.B <ignore> +.I mdadm +will only use a suffix if a different array of the same name already +exists or is listed in the config file. + +The standard names for non-partitioned arrays (the only sort of md +array available in 2.4 and earlier) are of the form +.IP +.RB /dev/md NN +.PP +where NN is a number. +The standard names for partitionable arrays (as available from 2.6 +onwards) are of the form: +.IP +.RB /dev/md_d NN +.PP +Partition numbers should be indicated by adding "pMM" to these, thus "/dev/md/d1p2". +.PP +From kernel version 2.6.28 the "non-partitioned array" can actually +be partitioned. So the "md_d\fBNN\fP" +names are no longer needed, and +partitions such as "/dev/md\fBNN\fPp\fBXX\fP" +are possible. +.PP +From kernel version 2.6.29 standard names can be non-numeric following +the form: +.IP +.RB /dev/md_ XXX +.PP +where +.B XXX +is any string. These names are supported by +.I mdadm +since version 3.3 provided they are enabled in +.IR mdadm.conf .
+ +.SH NOTE +.I mdadm +was previously known as +.IR mdctl . + +.SH SEE ALSO +For further information on mdadm usage, MD and the various levels of +RAID, see: +.IP +.B http://raid.wiki.kernel.org/ +.PP +(based upon Jakob \(/Ostergaard's Software\-RAID.HOWTO) +.PP +The latest version of +.I mdadm +should always be available from +.IP +.B http://www.kernel.org/pub/linux/utils/raid/mdadm/ +.PP +Related man pages: +.PP +.IR mdmon (8), +.IR mdadm.conf (5), +.IR md (4). diff --git a/mdadm.c b/mdadm.c new file mode 100644 index 00000000..51e16f3f --- /dev/null +++ b/mdadm.c @@ -0,0 +1,1936 @@ +/* + * mdadm - manage Linux "md" devices aka RAID arrays. + * + * Copyright (C) 2001-2013 Neil Brown + * + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Author: Neil Brown + * Email: + * + * Additions for bitmap and write-behind RAID options, Copyright (C) 2003-2004, + * Paul Clements, SteelEye Technology, Inc. 
+ */ + +#include "mdadm.h" +#include "md_p.h" +#include + +static int scan_assemble(struct supertype *ss, + struct context *c, + struct mddev_ident *ident); +static int misc_scan(char devmode, struct context *c); +static int stop_scan(int verbose); +static int misc_list(struct mddev_dev *devlist, + struct mddev_ident *ident, + char *dump_directory, + struct supertype *ss, struct context *c); +const char Name[] = "mdadm"; + +int main(int argc, char *argv[]) +{ + int mode = 0; + int opt; + int option_index; + int rv; + int i; + + unsigned long long array_size = 0; + unsigned long long data_offset = INVALID_SECTORS; + struct mddev_ident ident; + char *configfile = NULL; + int devmode = 0; + int bitmap_fd = -1; + struct mddev_dev *devlist = NULL; + struct mddev_dev **devlistend = & devlist; + struct mddev_dev *dv; + int devs_found = 0; + char *symlinks = NULL; + int grow_continue = 0; + /* autof indicates whether and how to create device node. + * bottom 3 bits are style. Rest (when shifted) are number of parts + * 0 - unset + * 1 - don't create (no) + * 2 - if is_standard, then create (yes) + * 3 - create as 'md' - reject is_standard mdp (md) + * 4 - create as 'mdp' - reject is_standard md (mdp) + * 5 - default to md if not is_standard (md in config file) + * 6 - default to mdp if not is_standard (part, or mdp in config file) + */ + struct context c = { + .require_homehost = 1, + }; + struct shape s = { + .journaldisks = 0, + .level = UnSet, + .layout = UnSet, + .bitmap_chunk = UnSet, + }; + + char sys_hostname[256]; + char *mailaddr = NULL; + char *program = NULL; + int increments = 20; + int daemonise = 0; + char *pidfile = NULL; + int oneshot = 0; + int spare_sharing = 1; + struct supertype *ss = NULL; + int writemostly = 0; + char *shortopt = short_options; + int dosyslog = 0; + int rebuild_map = 0; + char *remove_path = NULL; + char *udev_filename = NULL; + char *dump_directory = NULL; + + int print_help = 0; + FILE *outf; + + int mdfd = -1; + + srandom(time(0) ^ 
getpid()); + + ident.uuid_set=0; + ident.level = UnSet; + ident.raid_disks = UnSet; + ident.super_minor= UnSet; + ident.devices=0; + ident.spare_group = NULL; + ident.autof = 0; + ident.st = NULL; + ident.bitmap_fd = -1; + ident.bitmap_file = NULL; + ident.name[0] = 0; + ident.container = NULL; + ident.member = NULL; + + while ((option_index = -1) , + (opt=getopt_long(argc, argv, + shortopt, long_options, + &option_index)) != -1) { + int newmode = mode; + /* firstly, some mode-independent options */ + switch(opt) { + case HelpOptions: + print_help = 2; + continue; + case 'h': + print_help = 1; + continue; + + case 'V': + fputs(Version, stderr); + exit(0); + + case 'v': c.verbose++; + continue; + + case 'q': c.verbose--; + continue; + + case 'b': + if (mode == ASSEMBLE || mode == BUILD || mode == CREATE + || mode == GROW || mode == INCREMENTAL + || mode == MANAGE) + break; /* b means bitmap */ + case Brief: + c.brief = 1; + continue; + + case 'Y': c.export++; + continue; + + case HomeHost: + if (strcasecmp(optarg, "") == 0) + c.require_homehost = 0; + else + c.homehost = optarg; + continue; + + case OffRootOpt: + /* Silently ignore old option */ + continue; + + case Prefer: + if (c.prefer) + free(c.prefer); + if (asprintf(&c.prefer, "/%s/", optarg) <= 0) + c.prefer = NULL; + continue; + + case ':': + case '?': + fputs(Usage, stderr); + exit(2); + } + /* second, figure out the mode. + * Some options force the mode. 
Others + * set the mode if it isn't already + */ + + switch(opt) { + case ManageOpt: + newmode = MANAGE; + shortopt = short_bitmap_options; + break; + case 'a': + case Add: + case AddSpare: + case AddJournal: + case 'r': + case Remove: + case Replace: + case With: + case 'f': + case Fail: + case ReAdd: /* re-add */ + case ClusterConfirm: + if (!mode) { + newmode = MANAGE; + shortopt = short_bitmap_options; + } + break; + + case 'A': newmode = ASSEMBLE; + shortopt = short_bitmap_auto_options; + break; + case 'B': newmode = BUILD; + shortopt = short_bitmap_auto_options; + break; + case 'C': newmode = CREATE; + shortopt = short_bitmap_auto_options; + break; + case 'F': newmode = MONITOR; + break; + case 'G': newmode = GROW; + shortopt = short_bitmap_options; + break; + case 'I': newmode = INCREMENTAL; + shortopt = short_bitmap_auto_options; + break; + case AutoDetect: + newmode = AUTODETECT; + break; + + case MiscOpt: + case 'D': + case 'E': + case 'X': + case 'Q': + case ExamineBB: + case Dump: + case Restore: + case Action: + newmode = MISC; + break; + + case 'R': + case 'S': + case 'o': + case 'w': + case 'W': + case WaitOpt: + case Waitclean: + case DetailPlatform: + case KillSubarray: + case UpdateSubarray: + case UdevRules: + case KillOpt: + if (!mode) + newmode = MISC; + break; + + case NoSharing: + newmode = MONITOR; + break; + } + if (mode && newmode == mode) { + /* everybody happy ! */ + } else if (mode && newmode != mode) { + /* not allowed.. 
*/ + pr_err(""); + if (option_index >= 0) + fprintf(stderr, "--%s", long_options[option_index].name); + else + fprintf(stderr, "-%c", opt); + fprintf(stderr, " would set mdadm mode to \"%s\", but it is already set to \"%s\".\n", + map_num(modes, newmode), + map_num(modes, mode)); + exit(2); + } else if (!mode && newmode) { + mode = newmode; + if (mode == MISC && devs_found) { + pr_err("No action given for %s in --misc mode\n", + devlist->devname); + cont_err("Action options must come before device names\n"); + exit(2); + } + } else { + /* special case of -c --help */ + if ((opt == 'c' || opt == ConfigFile) && + ( strncmp(optarg, "--h", 3)==0 || + strncmp(optarg, "-h", 2)==0)) { + fputs(Help_config, stdout); + exit(0); + } + + /* If first option is a device, don't force the mode yet */ + if (opt == 1) { + if (devs_found == 0) { + dv = xmalloc(sizeof(*dv)); + dv->devname = optarg; + dv->disposition = devmode; + dv->writemostly = writemostly; + dv->used = 0; + dv->next = NULL; + *devlistend = dv; + devlistend = &dv->next; + + devs_found++; + continue; + } + /* No mode yet, and this is the second device ... */ + pr_err("An option must be given to set the mode before a second device\n" + " (%s) is listed\n", optarg); + exit(2); + } + if (option_index >= 0) + pr_err("--%s", long_options[option_index].name); + else + pr_err("-%c", opt); + fprintf(stderr, " does not set the mode, and so cannot be the first option.\n"); + exit(2); + } + + /* if we just set the mode, then done */ + switch(opt) { + case ManageOpt: + case MiscOpt: + case 'A': + case 'B': + case 'C': + case 'F': + case 'G': + case 'I': + case AutoDetect: + continue; + } + if (opt == 1) { + /* an undecorated option - must be a device name. + */ + + if (devs_found > 0 && devmode == DetailPlatform) { + pr_err("controller may only be specified once. 
%s ignored\n", + optarg); + continue; + } + + if (devs_found > 0 && mode == MANAGE && !devmode) { + pr_err("Must give one of -a/-r/-f for subsequent devices at %s\n", optarg); + exit(2); + } + if (devs_found > 0 && mode == GROW && !devmode) { + pr_err("Must give -a/--add for devices to add: %s\n", optarg); + exit(2); + } + dv = xmalloc(sizeof(*dv)); + dv->devname = optarg; + dv->disposition = devmode; + dv->writemostly = writemostly; + dv->used = 0; + dv->next = NULL; + *devlistend = dv; + devlistend = &dv->next; + + devs_found++; + continue; + } + + /* We've got a mode, and opt is now something else which + * could depend on the mode */ +#define O(a,b) ((a<<16)|b) + switch (O(mode,opt)) { + case O(GROW,'c'): + case O(GROW,ChunkSize): + case O(CREATE,'c'): + case O(CREATE,ChunkSize): + case O(BUILD,'c'): /* chunk or rounding */ + case O(BUILD,ChunkSize): /* chunk or rounding */ + if (s.chunk) { + pr_err("chunk/rounding may only be specified once. Second value is %s.\n", optarg); + exit(2); + } + s.chunk = parse_size(optarg); + if (s.chunk == INVALID_SECTORS || + s.chunk < 8 || (s.chunk&1)) { + pr_err("invalid chunk/rounding value: %s\n", + optarg); + exit(2); + } + /* Convert sectors to K */ + s.chunk /= 2; + continue; + + case O(INCREMENTAL, 'e'): + case O(CREATE,'e'): + case O(ASSEMBLE,'e'): + case O(MISC,'e'): /* set metadata (superblock) information */ + if (ss) { + pr_err("metadata information already given\n"); + exit(2); + } + for(i=0; !ss && superlist[i]; i++) + ss = superlist[i]->match_metadata_desc(optarg); + + if (!ss) { + pr_err("unrecognised metadata identifier: %s\n", optarg); + exit(2); + } + continue; + + case O(MANAGE,'W'): + case O(MANAGE,WriteMostly): + case O(BUILD,'W'): + case O(BUILD,WriteMostly): + case O(CREATE,'W'): + case O(CREATE,WriteMostly): + /* set write-mostly for following devices */ + writemostly = 1; + continue; + + case O(MANAGE,'w'): + /* clear write-mostly for following devices */ + writemostly = 2; + continue; + + case 
O(GROW,'z'): + case O(CREATE,'z'): + case O(BUILD,'z'): /* size */ + if (s.size > 0) { + pr_err("size may only be specified once. Second value is %s.\n", optarg); + exit(2); + } + if (strcmp(optarg, "max")==0) + s.size = MAX_SIZE; + else { + s.size = parse_size(optarg); + if (s.size == INVALID_SECTORS || + s.size < 8) { + pr_err("invalid size: %s\n", + optarg); + exit(2); + } + /* convert sectors to K */ + s.size /= 2; + } + continue; + + case O(GROW,'Z'): /* array size */ + if (array_size > 0) { + pr_err("array-size may only be specified once. Second value is %s.\n", optarg); + exit(2); + } + if (strcmp(optarg, "max") == 0) + array_size = MAX_SIZE; + else { + array_size = parse_size(optarg); + if (array_size == 0 || + array_size == INVALID_SECTORS) { + pr_err("invalid array size: %s\n", + optarg); + exit(2); + } + } + continue; + + case O(CREATE,DataOffset): + case O(GROW,DataOffset): + if (data_offset != INVALID_SECTORS) { + pr_err("data-offset may only be specified one. Second value is %s.\n", optarg); + exit(2); + } + if (mode == CREATE && + strcmp(optarg, "variable") == 0) + data_offset = VARIABLE_OFFSET; + else + data_offset = parse_size(optarg); + if (data_offset == INVALID_SECTORS) { + pr_err("invalid data-offset: %s\n", + optarg); + exit(2); + } + continue; + + case O(GROW,'l'): + case O(CREATE,'l'): + case O(BUILD,'l'): /* set raid level*/ + if (s.level != UnSet) { + pr_err("raid level may only be set once. 
Second value is %s.\n", optarg); + exit(2); + } + s.level = map_name(pers, optarg); + if (s.level == UnSet) { + pr_err("invalid raid level: %s\n", + optarg); + exit(2); + } + if (s.level != 0 && s.level != LEVEL_LINEAR && s.level != 1 && + s.level != LEVEL_MULTIPATH && s.level != LEVEL_FAULTY && + s.level != 10 && + mode == BUILD) { + pr_err("Raid level %s not permitted with --build.\n", + optarg); + exit(2); + } + if (s.sparedisks > 0 && s.level < 1 && s.level >= -1) { + pr_err("raid level %s is incompatible with spare-devices setting.\n", + optarg); + exit(2); + } + ident.level = s.level; + continue; + + case O(GROW, 'p'): /* new layout */ + case O(GROW, Layout): + if (s.layout_str) { + pr_err("layout may only be sent once. Second value was %s\n", optarg); + exit(2); + } + s.layout_str = optarg; + /* 'Grow' will parse the value */ + continue; + + case O(CREATE,'p'): /* raid5 layout */ + case O(CREATE,Layout): + case O(BUILD,'p'): /* faulty layout */ + case O(BUILD,Layout): + if (s.layout != UnSet) { + pr_err("layout may only be sent once. 
Second value was %s\n", optarg); + exit(2); + } + switch(s.level) { + default: + pr_err("layout not meaningful for %s arrays.\n", + map_num(pers, s.level)); + exit(2); + case UnSet: + pr_err("raid level must be given before layout.\n"); + exit(2); + + case 5: + s.layout = map_name(r5layout, optarg); + if (s.layout==UnSet) { + pr_err("layout %s not understood for raid5.\n", + optarg); + exit(2); + } + break; + case 6: + s.layout = map_name(r6layout, optarg); + if (s.layout==UnSet) { + pr_err("layout %s not understood for raid6.\n", + optarg); + exit(2); + } + break; + + case 10: + s.layout = parse_layout_10(optarg); + if (s.layout < 0) { + pr_err("layout for raid10 must be 'nNN', 'oNN' or 'fNN' where NN is a number, not %s\n", optarg); + exit(2); + } + break; + case LEVEL_FAULTY: + /* Faulty + * modeNNN + */ + s.layout = parse_layout_faulty(optarg); + if (s.layout == -1) { + pr_err("layout %s not understood for faulty.\n", + optarg); + exit(2); + } + break; + } + continue; + + case O(CREATE,AssumeClean): + case O(BUILD,AssumeClean): /* assume clean */ + case O(GROW,AssumeClean): + s.assume_clean = 1; + continue; + + case O(GROW,'n'): + case O(CREATE,'n'): + case O(BUILD,'n'): /* number of raid disks */ + if (s.raiddisks) { + pr_err("raid-devices set twice: %d and %s\n", + s.raiddisks, optarg); + exit(2); + } + s.raiddisks = parse_num(optarg); + if (s.raiddisks <= 0) { + pr_err("invalid number of raid devices: %s\n", + optarg); + exit(2); + } + ident.raid_disks = s.raiddisks; + continue; + case O(ASSEMBLE, Nodes): + case O(CREATE, Nodes): + c.nodes = parse_num(optarg); + if (c.nodes <= 0) { + pr_err("invalid number for the number of cluster nodes: %s\n", + optarg); + exit(2); + } + continue; + case O(CREATE, ClusterName): + case O(ASSEMBLE, ClusterName): + c.homecluster = optarg; + if (strlen(c.homecluster) > 64) { + pr_err("Cluster name too big.\n"); + exit(ERANGE); + } + continue; + case O(CREATE,'x'): /* number of spare (eXtra) disks */ + if (s.sparedisks) { + 
pr_err("spare-devices set twice: %d and %s\n", + s.sparedisks, optarg); + exit(2); + } + if (s.level != UnSet && s.level <= 0 && s.level >= -1) { + pr_err("spare-devices setting is incompatible with raid level %d\n", + s.level); + exit(2); + } + s.sparedisks = parse_num(optarg); + if (s.sparedisks < 0) { + pr_err("invalid number of spare-devices: %s\n", + optarg); + exit(2); + } + continue; + + case O(CREATE,'a'): + case O(CREATE,Auto): + case O(BUILD,'a'): + case O(BUILD,Auto): + case O(INCREMENTAL,'a'): + case O(INCREMENTAL,Auto): + case O(ASSEMBLE,'a'): + case O(ASSEMBLE,Auto): /* auto-creation of device node */ + c.autof = parse_auto(optarg, "--auto flag", 0); + continue; + + case O(CREATE,Symlinks): + case O(BUILD,Symlinks): + case O(ASSEMBLE,Symlinks): /* auto creation of symlinks in /dev to /dev/md */ + symlinks = optarg; + continue; + + case O(BUILD,'f'): /* force honouring '-n 1' */ + case O(BUILD,Force): /* force honouring '-n 1' */ + case O(GROW,'f'): /* ditto */ + case O(GROW,Force): /* ditto */ + case O(CREATE,'f'): /* force honouring of device list */ + case O(CREATE,Force): /* force honouring of device list */ + case O(ASSEMBLE,'f'): /* force assembly */ + case O(ASSEMBLE,Force): /* force assembly */ + case O(MISC,'f'): /* force zero */ + case O(MISC,Force): /* force zero */ + case O(MANAGE,Force): /* add device which is too large */ + c.force=1; + continue; + /* now for the Assemble options */ + case O(ASSEMBLE, FreezeReshape): /* Freeze reshape during + * initrd phase */ + case O(INCREMENTAL, FreezeReshape): + c.freeze_reshape = 1; + continue; + case O(CREATE,'u'): /* uuid of array */ + case O(ASSEMBLE,'u'): /* uuid of array */ + if (ident.uuid_set) { + pr_err("uuid cannot be set twice. 
Second value %s.\n", optarg); + exit(2); + } + if (parse_uuid(optarg, ident.uuid)) + ident.uuid_set = 1; + else { + pr_err("Bad uuid: %s\n", optarg); + exit(2); + } + continue; + + case O(CREATE,'N'): + case O(ASSEMBLE,'N'): + case O(MISC,'N'): + if (ident.name[0]) { + pr_err("name cannot be set twice. Second value %s.\n", optarg); + exit(2); + } + if (mode == MISC && !c.subarray) { + pr_err("-N/--name only valid with --update-subarray in misc mode\n"); + exit(2); + } + if (strlen(optarg) > 32) { + pr_err("name '%s' is too long, 32 chars max.\n", + optarg); + exit(2); + } + strcpy(ident.name, optarg); + continue; + + case O(ASSEMBLE,'m'): /* super-minor for array */ + case O(ASSEMBLE,SuperMinor): + if (ident.super_minor != UnSet) { + pr_err("super-minor cannot be set twice. Second value: %s.\n", optarg); + exit(2); + } + if (strcmp(optarg, "dev")==0) + ident.super_minor = -2; + else { + ident.super_minor = parse_num(optarg); + if (ident.super_minor < 0) { + pr_err("Bad super-minor number: %s.\n", optarg); + exit(2); + } + } + continue; + + case O(ASSEMBLE,'o'): + case O(MANAGE,'o'): + case O(CREATE,'o'): + c.readonly = 1; + continue; + + case O(ASSEMBLE,'U'): /* update the superblock */ + case O(MISC,'U'): + if (c.update) { + pr_err("Can only update one aspect of superblock, both %s and %s given.\n", + c.update, optarg); + exit(2); + } + if (mode == MISC && !c.subarray) { + pr_err("Only subarrays can be updated in misc mode\n"); + exit(2); + } + c.update = optarg; + if (strcmp(c.update, "sparc2.2")==0) + continue; + if (strcmp(c.update, "super-minor") == 0) + continue; + if (strcmp(c.update, "summaries")==0) + continue; + if (strcmp(c.update, "resync")==0) + continue; + if (strcmp(c.update, "uuid")==0) + continue; + if (strcmp(c.update, "name")==0) + continue; + if (strcmp(c.update, "homehost")==0) + continue; + if (strcmp(c.update, "home-cluster")==0) + continue; + if (strcmp(c.update, "nodes")==0) + continue; + if (strcmp(c.update, "devicesize")==0) + continue; + 
if (strcmp(c.update, "no-bitmap")==0) + continue; + if (strcmp(c.update, "bbl") == 0) + continue; + if (strcmp(c.update, "no-bbl") == 0) + continue; + if (strcmp(c.update, "force-no-bbl") == 0) + continue; + if (strcmp(c.update, "metadata") == 0) + continue; + if (strcmp(c.update, "revert-reshape") == 0) + continue; + if (strcmp(c.update, "byteorder")==0) { + if (ss) { + pr_err("must not set metadata type with --update=byteorder.\n"); + exit(2); + } + for(i=0; !ss && superlist[i]; i++) + ss = superlist[i]->match_metadata_desc( + "0.swap"); + if (!ss) { + pr_err("INTERNAL ERROR cannot find 0.swap\n"); + exit(2); + } + + continue; + } + if (strcmp(c.update,"?") == 0 || + strcmp(c.update, "help") == 0) { + outf = stdout; + fprintf(outf, "%s: ", Name); + } else { + outf = stderr; + fprintf(outf, + "%s: '--update=%s' is invalid. ", + Name, c.update); + } + fprintf(outf, "Valid --update options are:\n" + " 'sparc2.2', 'super-minor', 'uuid', 'name', 'nodes', 'resync',\n" + " 'summaries', 'homehost', 'home-cluster', 'byteorder', 'devicesize',\n" + " 'no-bitmap', 'metadata', 'revert-reshape'\n" + " 'bbl', 'no-bbl', 'force-no-bbl'\n" + ); + exit(outf == stdout ? 
0 : 2); + + case O(MANAGE,'U'): + /* update=devicesize is allowed with --re-add */ + if (devmode != 'A') { + pr_err("--update in Manage mode only allowed with --re-add.\n"); + exit(1); + } + if (c.update) { + pr_err("Can only update one aspect of superblock, both %s and %s given.\n", + c.update, optarg); + exit(2); + } + c.update = optarg; + if (strcmp(c.update, "devicesize") != 0 && + strcmp(c.update, "bbl") != 0 && + strcmp(c.update, "force-no-bbl") != 0 && + strcmp(c.update, "no-bbl") != 0) { + pr_err("only 'devicesize', 'bbl', 'no-bbl', and 'force-no-bbl' can be updated with --re-add\n"); + exit(2); + } + continue; + + case O(INCREMENTAL,NoDegraded): + pr_err("--no-degraded is deprecated in Incremental mode\n"); + case O(ASSEMBLE,NoDegraded): /* --no-degraded */ + c.runstop = -1; /* --stop isn't allowed for --assemble, + * so we overload slightly */ + continue; + + case O(ASSEMBLE,'c'): + case O(ASSEMBLE,ConfigFile): + case O(INCREMENTAL, 'c'): + case O(INCREMENTAL, ConfigFile): + case O(MISC, 'c'): + case O(MISC, ConfigFile): + case O(MONITOR,'c'): + case O(MONITOR,ConfigFile): + case O(CREATE,ConfigFile): + if (configfile) { + pr_err("configfile cannot be set twice. Second value is %s.\n", optarg); + exit(2); + } + configfile = optarg; + set_conffile(configfile); + /* FIXME possibly check that config file exists. Even parse it */ + continue; + case O(ASSEMBLE,'s'): /* scan */ + case O(MISC,'s'): + case O(MONITOR,'s'): + case O(INCREMENTAL,'s'): + c.scan = 1; + continue; + + case O(MONITOR,'m'): /* mail address */ + case O(MONITOR,EMail): + if (mailaddr) + pr_err("only specify one mailaddress. %s ignored.\n", + optarg); + else + mailaddr = optarg; + continue; + + case O(MONITOR,'p'): /* alert program */ + case O(MONITOR,ProgramOpt): /* alert program */ + if (program) + pr_err("only specify one alter program. 
%s ignored.\n", + optarg); + else + program = optarg; + continue; + + case O(MONITOR,'r'): /* rebuild increments */ + case O(MONITOR,Increment): + increments = atoi(optarg); + if (increments > 99 || increments < 1) { + pr_err("please specify positive integer between 1 and 99 as rebuild increments.\n"); + exit(2); + } + continue; + + case O(MONITOR,'d'): /* delay in seconds */ + case O(GROW, 'd'): + case O(BUILD,'d'): /* delay for bitmap updates */ + case O(CREATE,'d'): + if (c.delay) + pr_err("only specify delay once. %s ignored.\n", + optarg); + else { + c.delay = parse_num(optarg); + if (c.delay < 1) { + pr_err("invalid delay: %s\n", + optarg); + exit(2); + } + } + continue; + case O(MONITOR,'f'): /* daemonise */ + case O(MONITOR,Fork): + daemonise = 1; + continue; + case O(MONITOR,'i'): /* pid */ + if (pidfile) + pr_err("only specify one pid file. %s ignored.\n", + optarg); + else + pidfile = optarg; + continue; + case O(MONITOR,'1'): /* oneshot */ + oneshot = 1; + spare_sharing = 0; + continue; + case O(MONITOR,'t'): /* test */ + c.test = 1; + continue; + case O(MONITOR,'y'): /* log messages to syslog */ + openlog("mdadm", LOG_PID, SYSLOG_FACILITY); + dosyslog = 1; + continue; + case O(MONITOR, NoSharing): + spare_sharing = 0; + continue; + + /* now the general management options. Some are applicable + * to other modes. None have arguments. 
+ */ + case O(GROW,'a'): + case O(GROW,Add): + case O(MANAGE,'a'): + case O(MANAGE,Add): /* add a drive */ + devmode = 'a'; + continue; + case O(MANAGE,AddSpare): /* add drive - never re-add */ + devmode = 'S'; + continue; + case O(MANAGE,AddJournal): /* add journal */ + if (s.journaldisks && (s.level < 4 || s.level > 6)) { + pr_err("--add-journal is only supported for RAID level 4/5/6.\n"); + exit(2); + } + devmode = 'j'; + continue; + case O(MANAGE,ReAdd): + devmode = 'A'; + continue; + case O(MANAGE,'r'): /* remove a drive */ + case O(MANAGE,Remove): + devmode = 'r'; + continue; + case O(MANAGE,'f'): /* set faulty */ + case O(MANAGE,Fail): + case O(INCREMENTAL,'f'): + case O(INCREMENTAL,Remove): + case O(INCREMENTAL,Fail): /* r for incremental is taken, use f + * even though we will both fail and + * remove the device */ + devmode = 'f'; + continue; + case O(MANAGE, ClusterConfirm): + devmode = 'c'; + continue; + case O(MANAGE,Replace): + /* Mark these devices for replacement */ + devmode = 'R'; + continue; + case O(MANAGE,With): + /* These are the replacements to use */ + if (devmode != 'R') { + pr_err("--with must follow --replace\n"); + exit(2); + } + devmode = 'W'; + continue; + case O(INCREMENTAL,'R'): + case O(MANAGE,'R'): + case O(ASSEMBLE,'R'): + case O(BUILD,'R'): + case O(CREATE,'R'): /* Run the array */ + if (c.runstop < 0) { + pr_err("Cannot both Stop and Run an array\n"); + exit(2); + } + c.runstop = 1; + continue; + case O(MANAGE,'S'): + if (c.runstop > 0) { + pr_err("Cannot both Run and Stop an array\n"); + exit(2); + } + c.runstop = -1; + continue; + case O(MANAGE,'t'): + c.test = 1; + continue; + + case O(MISC,'Q'): + case O(MISC,'D'): + case O(MISC,'E'): + case O(MISC,KillOpt): + case O(MISC,'R'): + case O(MISC,'S'): + case O(MISC,'X'): + case O(MISC, ExamineBB): + case O(MISC,'o'): + case O(MISC,'w'): + case O(MISC,'W'): + case O(MISC, WaitOpt): + case O(MISC, Waitclean): + case O(MISC, DetailPlatform): + case O(MISC, KillSubarray): + case 
O(MISC, UpdateSubarray): + case O(MISC, Dump): + case O(MISC, Restore): + case O(MISC ,Action): + if (opt == KillSubarray || opt == UpdateSubarray) { + if (c.subarray) { + pr_err("subarray can only be specified once\n"); + exit(2); + } + c.subarray = optarg; + } + if (opt == Action) { + if (c.action) { + pr_err("Only one --action can be specified\n"); + exit(2); + } + if (strcmp(optarg, "idle") == 0 || + strcmp(optarg, "frozen") == 0 || + strcmp(optarg, "check") == 0 || + strcmp(optarg, "repair") == 0) + c.action = optarg; + else { + pr_err("action must be one of idle, frozen, check, repair\n"); + exit(2); + } + } + if (devmode && devmode != opt && + (devmode == 'E' || (opt == 'E' && devmode != 'Q'))) { + pr_err("--examine/-E cannot be given with "); + if (devmode == 'E') { + if (option_index >= 0) + fprintf(stderr, "--%s\n", + long_options[option_index].name); + else + fprintf(stderr, "-%c\n", opt); + } else if (isalpha(devmode)) + fprintf(stderr, "-%c\n", devmode); + else + fprintf(stderr, "previous option\n"); + exit(2); + } + devmode = opt; + if (opt == Dump || opt == Restore) { + if (dump_directory != NULL) { + pr_err("dump/restore directory specified twice: %s and %s\n", + dump_directory, optarg); + exit(2); + } + dump_directory = optarg; + } + continue; + case O(MISC, UdevRules): + if (devmode && devmode != opt) { + pr_err("--udev-rules must be the only option.\n"); + } else { + if (udev_filename) + pr_err("only specify one udev rule filename. 
%s ignored.\n", + optarg); + else + udev_filename = optarg; + } + devmode = opt; + continue; + case O(MISC,'t'): + c.test = 1; + continue; + + case O(MISC, Sparc22): + if (devmode != 'E') { + pr_err("--sparc2.2 only allowed with --examine\n"); + exit(2); + } + c.SparcAdjust = 1; + continue; + + case O(ASSEMBLE,'b'): /* here we simply set the bitmap file */ + case O(ASSEMBLE,Bitmap): + if (!optarg) { + pr_err("bitmap file needed with -b in --assemble mode\n"); + exit(2); + } + if (strcmp(optarg, "internal")==0) { + pr_err("there is no need to specify --bitmap when assembling arrays with internal bitmaps\n"); + continue; + } + bitmap_fd = open(optarg, O_RDWR); + if (!*optarg || bitmap_fd < 0) { + pr_err("cannot open bitmap file %s: %s\n", optarg, strerror(errno)); + exit(2); + } + ident.bitmap_fd = bitmap_fd; /* for Assemble */ + continue; + + case O(ASSEMBLE, BackupFile): + case O(GROW, BackupFile): + /* Specify a file into which grow might place a backup, + * or from which assemble might recover a backup + */ + if (c.backup_file) { + pr_err("backup file already specified, rejecting %s\n", optarg); + exit(2); + } + c.backup_file = optarg; + continue; + + case O(GROW, Continue): + /* Continue interrupted grow + */ + grow_continue = 1; + continue; + case O(ASSEMBLE, InvalidBackup): + /* Acknowledge that the backupfile is invalid, but ask + * to continue anyway + */ + c.invalid_backup = 1; + continue; + + case O(BUILD,'b'): + case O(BUILD,Bitmap): + case O(CREATE,'b'): + case O(CREATE,Bitmap): /* here we create the bitmap */ + case O(GROW,'b'): + case O(GROW,Bitmap): + if (strcmp(optarg, "internal")== 0 || + strcmp(optarg, "none")== 0 || + strchr(optarg, '/') != NULL) { + s.bitmap_file = optarg; + continue; + } + if (strcmp(optarg, "clustered")== 0) { + s.bitmap_file = optarg; + /* Set the default number of cluster nodes + * to 4 if not already set by user + */ + if (c.nodes < 1) + c.nodes = 4; + continue; + } + /* probable typo */ + pr_err("bitmap file must contain a 
'/', or be 'internal', or 'none'\n" + " not '%s'\n", optarg); + exit(2); + + case O(GROW,BitmapChunk): + case O(BUILD,BitmapChunk): + case O(CREATE,BitmapChunk): /* bitmap chunksize */ + s.bitmap_chunk = parse_size(optarg); + if (s.bitmap_chunk == 0 || + s.bitmap_chunk == INVALID_SECTORS || + s.bitmap_chunk & (s.bitmap_chunk - 1)) { + pr_err("invalid bitmap chunksize: %s\n", + optarg); + exit(2); + } + s.bitmap_chunk = s.bitmap_chunk * 512; + continue; + + case O(GROW, WriteBehind): + case O(BUILD, WriteBehind): + case O(CREATE, WriteBehind): /* write-behind mode */ + s.write_behind = DEFAULT_MAX_WRITE_BEHIND; + if (optarg) { + s.write_behind = parse_num(optarg); + if (s.write_behind < 0 || + s.write_behind > 16383) { + pr_err("Invalid value for maximum outstanding write-behind writes: %s.\n\tMust be between 0 and 16383.\n", optarg); + exit(2); + } + } + continue; + + case O(INCREMENTAL, 'r'): + case O(INCREMENTAL, RebuildMapOpt): + rebuild_map = 1; + continue; + case O(INCREMENTAL, IncrementalPath): + remove_path = optarg; + continue; + case O(CREATE, WriteJournal): + if (s.journaldisks) { + pr_err("Please specify only one journal device for the array.\n"); + pr_err("Ignoring --write-journal %s...\n", optarg); + continue; + } + dv = xmalloc(sizeof(*dv)); + dv->devname = optarg; + dv->disposition = 'j'; /* WriteJournal */ + dv->used = 0; + dv->next = NULL; + *devlistend = dv; + devlistend = &dv->next; + devs_found++; + + s.journaldisks = 1; + continue; + } + /* We have now processed all the valid options. 
Anything else is + * an error + */ + if (option_index > 0) + pr_err(":option --%s not valid in %s mode\n", + long_options[option_index].name, + map_num(modes, mode)); + else + pr_err("option -%c not valid in %s mode\n", + opt, map_num(modes, mode)); + exit(2); + + } + + if (print_help) { + char *help_text; + if (print_help == 2) + help_text = OptionHelp; + else + help_text = mode_help[mode]; + if (help_text == NULL) + help_text = Help; + fputs(help_text,stdout); + exit(0); + } + + if (s.journaldisks && (s.level < 4 || s.level > 6)) { + pr_err("--write-journal is only supported for RAID level 4/5/6.\n"); + exit(2); + } + + if (!mode && devs_found) { + mode = MISC; + devmode = 'Q'; + if (devlist->disposition == 0) + devlist->disposition = devmode; + } + if (!mode) { + fputs(Usage, stderr); + exit(2); + } + + if (symlinks) { + struct createinfo *ci = conf_get_create_info(); + + if (strcasecmp(symlinks, "yes") == 0) + ci->symlinks = 1; + else if (strcasecmp(symlinks, "no") == 0) + ci->symlinks = 0; + else { + pr_err("option --symlinks must be 'no' or 'yes'\n"); + exit(2); + } + } + /* Ok, got the option parsing out of the way + * hopefully it's mostly right but there might be some stuff + * missing + * + * That is mosty checked in the per-mode stuff but... + * + * For @,B,C and A without -s, the first device listed must be + * an md device. We check that here and open it. + */ + + if (mode == MANAGE || mode == BUILD || mode == CREATE + || mode == GROW + || (mode == ASSEMBLE && ! 
c.scan)) { + if (devs_found < 1) { + pr_err("an md device must be given in this mode\n"); + exit(2); + } + if ((int)ident.super_minor == -2 && c.autof) { + pr_err("--super-minor=dev is incompatible with --auto\n"); + exit(2); + } + if (mode == MANAGE || mode == GROW) { + mdfd = open_mddev(devlist->devname, 1); + if (mdfd < 0) + exit(1); + } else + /* non-existent device is OK */ + mdfd = open_mddev(devlist->devname, 0); + if (mdfd == -2) { + pr_err("device %s exists but is not an md array.\n", devlist->devname); + exit(1); + } + if ((int)ident.super_minor == -2) { + struct stat stb; + if (mdfd < 0) { + pr_err("--super-minor=dev given, and listed device %s doesn't exist.\n", + devlist->devname); + exit(1); + } + fstat(mdfd, &stb); + ident.super_minor = minor(stb.st_rdev); + } + if (mdfd >= 0 && mode != MANAGE && mode != GROW) { + /* We don't really want this open yet, we just might + * have wanted to check some things + */ + close(mdfd); + mdfd = -1; + } + } + + if (s.raiddisks) { + if (s.raiddisks == 1 && !c.force && s.level != LEVEL_FAULTY) { + pr_err("'1' is an unusual number of drives for an array, so it is probably\n" + " a mistake. 
If you really mean it you will need to specify --force before\n" + " setting the number of drives.\n"); + exit(2); + } + } + + if (c.homehost == NULL && c.require_homehost) + c.homehost = conf_get_homehost(&c.require_homehost); + if (c.homehost == NULL || strcasecmp(c.homehost, "")==0) { + if (gethostname(sys_hostname, sizeof(sys_hostname)) == 0) { + sys_hostname[sizeof(sys_hostname)-1] = 0; + c.homehost = sys_hostname; + } + } + if (c.homehost && (!c.homehost[0] || strcasecmp(c.homehost, "") == 0)) { + c.homehost = NULL; + c.require_homehost = 0; + } + + rv = 0; + + set_hooks(); /* set hooks from libs */ + + if (c.homecluster == NULL && (c.nodes > 0)) { + c.homecluster = conf_get_homecluster(); + if (c.homecluster == NULL) + rv = get_cluster_name(&c.homecluster); + if (rv) { + pr_err("The md can't get cluster name\n"); + exit(1); + } + } + + if (c.backup_file && data_offset != INVALID_SECTORS) { + pr_err("--backup-file and --data-offset are incompatible\n"); + exit(2); + } + + if ((mode == MISC && devmode == 'E') + || (mode == MONITOR && spare_sharing == 0)) + /* Anyone may try this */; + else if (geteuid() != 0) { + pr_err("must be super-user to perform this action\n"); + exit(1); + } + + ident.autof = c.autof; + + if (c.scan && c.verbose < 2) + /* --scan implied --brief unless -vv */ + c.brief = 1; + + switch(mode) { + case MANAGE: + /* readonly, add/remove, readwrite, runstop */ + if (c.readonly > 0) + rv = Manage_ro(devlist->devname, mdfd, c.readonly); + if (!rv && devs_found>1) + rv = Manage_subdevs(devlist->devname, mdfd, + devlist->next, c.verbose, c.test, + c.update, c.force); + if (!rv && c.readonly < 0) + rv = Manage_ro(devlist->devname, mdfd, c.readonly); + if (!rv && c.runstop > 0) + rv = Manage_run(devlist->devname, mdfd, &c); + if (!rv && c.runstop < 0) + rv = Manage_stop(devlist->devname, mdfd, c.verbose, 0); + break; + case ASSEMBLE: + if (devs_found == 1 && ident.uuid_set == 0 && + ident.super_minor == UnSet && ident.name[0] == 0 && !c.scan ) { + 
/* Only a device has been given, so get details from config file */ + struct mddev_ident *array_ident = conf_get_ident(devlist->devname); + if (array_ident == NULL) { + pr_err("%s not identified in config file.\n", + devlist->devname); + rv |= 1; + if (mdfd >= 0) + close(mdfd); + } else { + if (array_ident->autof == 0) + array_ident->autof = c.autof; + rv |= Assemble(ss, devlist->devname, array_ident, + NULL, &c); + } + } else if (!c.scan) + rv = Assemble(ss, devlist->devname, &ident, + devlist->next, &c); + else if (devs_found > 0) { + if (c.update && devs_found > 1) { + pr_err("can only update a single array at a time\n"); + exit(1); + } + if (c.backup_file && devs_found > 1) { + pr_err("can only assemble a single array when providing a backup file.\n"); + exit(1); + } + for (dv = devlist ; dv ; dv=dv->next) { + struct mddev_ident *array_ident = conf_get_ident(dv->devname); + if (array_ident == NULL) { + pr_err("%s not identified in config file.\n", + dv->devname); + rv |= 1; + continue; + } + if (array_ident->autof == 0) + array_ident->autof = c.autof; + rv |= Assemble(ss, dv->devname, array_ident, + NULL, &c); + } + } else { + if (c.update) { + pr_err("--update not meaningful with a --scan assembly.\n"); + exit(1); + } + if (c.backup_file) { + pr_err("--backup_file not meaningful with a --scan assembly.\n"); + exit(1); + } + rv = scan_assemble(ss, &c, &ident); + } + + break; + case BUILD: + if (c.delay == 0) + c.delay = DEFAULT_BITMAP_DELAY; + if (s.write_behind && !s.bitmap_file) { + pr_err("write-behind mode requires a bitmap.\n"); + rv = 1; + break; + } + if (s.raiddisks == 0) { + pr_err("no raid-devices specified.\n"); + rv = 1; + break; + } + + if (s.bitmap_file) { + if (strcmp(s.bitmap_file, "internal")==0 || + strcmp(s.bitmap_file, "clustered") == 0) { + pr_err("'internal' and 'clustered' bitmaps not supported with --build\n"); + rv |= 1; + break; + } + } + rv = Build(devlist->devname, devlist->next, &s, &c); + break; + case CREATE: + if (c.delay == 0) + 
c.delay = DEFAULT_BITMAP_DELAY; + + if (c.nodes) { + if (!s.bitmap_file || strcmp(s.bitmap_file, "clustered") != 0) { + pr_err("--nodes argument only compatible with --bitmap=clustered\n"); + rv = 1; + break; + } + + if (s.level != 1) { + pr_err("--bitmap=clustered is currently supported with RAID mirror only\n"); + rv = 1; + break; + } + } + + if (s.write_behind && !s.bitmap_file) { + pr_err("write-behind mode requires a bitmap.\n"); + rv = 1; + break; + } + if (s.raiddisks == 0) { + pr_err("no raid-devices specified.\n"); + rv = 1; + break; + } + + rv = Create(ss, devlist->devname, + ident.name, ident.uuid_set ? ident.uuid : NULL, + devs_found-1, devlist->next, + &s, &c, data_offset); + break; + case MISC: + if (devmode == 'E') { + if (devlist == NULL && !c.scan) { + pr_err("No devices to examine\n"); + exit(2); + } + if (devlist == NULL) + devlist = conf_get_devs(); + if (devlist == NULL) { + pr_err("No devices listed in %s\n", configfile?configfile:DefaultConfFile); + exit(1); + } + rv = Examine(devlist, &c, ss); + } else if (devmode == DetailPlatform) { + rv = Detail_Platform(ss ? ss->ss : NULL, ss ? c.scan : 1, + c.verbose, c.export, + devlist ? 
devlist->devname : NULL); + } else if (devlist == NULL) { + if (devmode == 'S' && c.scan) + rv = stop_scan(c.verbose); + else if ((devmode == 'D' || devmode == Waitclean) && c.scan) + rv = misc_scan(devmode, &c); + else if (devmode == UdevRules) + rv = Write_rules(udev_filename); + else { + pr_err("No devices given.\n"); + exit(2); + } + } else + rv = misc_list(devlist, &ident, dump_directory, ss, &c); + break; + case MONITOR: + if (!devlist && !c.scan) { + pr_err("Cannot monitor: need --scan or at least one device\n"); + rv = 1; + break; + } + if (pidfile && !daemonise) { + pr_err("Cannot write a pid file when not in daemon mode\n"); + rv = 1; + break; + } + if (c.delay == 0) { + if (get_linux_version() > 2006016) + /* mdstat responds to poll */ + c.delay = 1000; + else + c.delay = 60; + } + rv= Monitor(devlist, mailaddr, program, + &c, daemonise, oneshot, + dosyslog, pidfile, increments, + spare_sharing); + break; + + case GROW: + if (array_size > 0) { + /* alway impose array size first, independent of + * anything else + * Do not allow level or raid_disks changes at the + * same time as that can be irreversibly destructive. + */ + struct mdinfo sra; + int err; + if (s.raiddisks || s.level != UnSet) { + pr_err("cannot change array size in same operation as changing raiddisks or level.\n" + " Change size first, then check that data is still intact.\n"); + rv = 1; + break; + } + sysfs_init(&sra, mdfd, NULL); + if (array_size == MAX_SIZE) + err = sysfs_set_str(&sra, NULL, "array_size", "default"); + else + err = sysfs_set_num(&sra, NULL, "array_size", array_size / 2); + if (err < 0) { + if (errno == E2BIG) + pr_err("--array-size setting is too large.\n"); + else + pr_err("current kernel does not support setting --array-size\n"); + rv = 1; + break; + } + } + if (devs_found > 1 && s.raiddisks == 0 && s.level == UnSet) { + /* must be '-a'. 
*/ + if (s.size > 0 || s.chunk || s.layout_str != NULL || s.bitmap_file) { + pr_err("--add cannot be used with other geometry changes in --grow mode\n"); + rv = 1; + break; + } + for (dv=devlist->next; dv ; dv=dv->next) { + rv = Grow_Add_device(devlist->devname, mdfd, + dv->devname); + if (rv) + break; + } + } else if (s.bitmap_file) { + if (s.size > 0 || s.raiddisks || s.chunk || + s.layout_str != NULL || devs_found > 1) { + pr_err("--bitmap changes cannot be used with other geometry changes in --grow mode\n"); + rv = 1; + break; + } + if (c.delay == 0) + c.delay = DEFAULT_BITMAP_DELAY; + rv = Grow_addbitmap(devlist->devname, mdfd, &c, &s); + } else if (grow_continue) + rv = Grow_continue_command(devlist->devname, + mdfd, c.backup_file, + c.verbose); + else if (s.size > 0 || s.raiddisks || s.layout_str != NULL + || s.chunk != 0 || s.level != UnSet + || data_offset != INVALID_SECTORS) { + rv = Grow_reshape(devlist->devname, mdfd, + devlist->next, + data_offset, &c, &s); + } else if (array_size == 0) + pr_err("no changes to --grow\n"); + break; + case INCREMENTAL: + if (rebuild_map) { + RebuildMap(); + } + if (c.scan) { + rv = 1; + if (devlist) { + pr_err("In --incremental mode, a device cannot be given with --scan.\n"); + break; + } + if (c.runstop <= 0) { + pr_err("--incremental --scan meaningless without --run.\n"); + break; + } + if (devmode == 'f') { + pr_err("--incremental --scan --fail not supported.\n"); + break; + } + rv = IncrementalScan(&c, NULL); + } + if (!devlist) { + if (!rebuild_map && !c.scan) { + pr_err("--incremental requires a device.\n"); + rv = 1; + } + break; + } + if (devmode == 'f') { + if (devlist->next) { + pr_err("'--incremental --fail' can only handle one device.\n"); + rv = 1; + break; + } + rv = IncrementalRemove(devlist->devname, remove_path, + c.verbose); + } else + rv = Incremental(devlist, &c, ss); + break; + case AUTODETECT: + autodetect(); + break; + } + exit(rv); +} + +static int scan_assemble(struct supertype *ss, + struct 
context *c, + struct mddev_ident *ident) +{ + struct mddev_ident *a, *array_list = conf_get_ident(NULL); + struct mddev_dev *devlist = conf_get_devs(); + struct map_ent *map = NULL; + int cnt = 0; + int rv = 0; + int failures, successes; + + if (conf_verify_devnames(array_list)) { + pr_err("Duplicate MD device names in conf file were found.\n"); + return 1; + } + if (devlist == NULL) { + pr_err("No devices listed in conf file were found.\n"); + return 1; + } + for (a = array_list; a ; a = a->next) { + a->assembled = 0; + if (a->autof == 0) + a->autof = c->autof; + } + if (map_lock(&map)) + pr_err("failed to get exclusive lock on mapfile\n"); + do { + failures = 0; + successes = 0; + rv = 0; + for (a = array_list; a ; a = a->next) { + int r; + if (a->assembled) + continue; + if (a->devname && + strcasecmp(a->devname, "") == 0) + continue; + + r = Assemble(ss, a->devname, + a, NULL, c); + if (r == 0) { + a->assembled = 1; + successes++; + } else + failures++; + rv |= r; + cnt++; + } + } while (failures && successes); + if (c->homehost && cnt == 0) { + /* Maybe we can auto-assemble something. 
+ * Repeatedly call Assemble in auto-assemble mode + * until it fails + */ + int rv2; + int acnt; + ident->autof = c->autof; + do { + struct mddev_dev *devlist = conf_get_devs(); + acnt = 0; + do { + rv2 = Assemble(ss, NULL, + ident, + devlist, c); + if (rv2==0) { + cnt++; + acnt++; + } + } while (rv2!=2); + /* Incase there are stacked devices, we need to go around again */ + } while (acnt); + if (cnt == 0 && rv == 0) { + pr_err("No arrays found in config file or automatically\n"); + rv = 1; + } else if (cnt) + rv = 0; + } else if (cnt == 0 && rv == 0) { + pr_err("No arrays found in config file\n"); + rv = 1; + } + map_unlock(&map); + return rv; +} + +static int misc_scan(char devmode, struct context *c) +{ + /* apply --detail or --wait-clean to + * all devices in /proc/mdstat + */ + struct mdstat_ent *ms = mdstat_read(0, 1); + struct mdstat_ent *e; + struct map_ent *map = NULL; + int members; + int rv = 0; + + for (members = 0; members <= 1; members++) { + for (e=ms ; e ; e=e->next) { + char *name = NULL; + struct map_ent *me; + struct stat stb; + int member = e->metadata_version && + strncmp(e->metadata_version, + "external:/", 10) == 0; + if (members != member) + continue; + me = map_by_devnm(&map, e->devnm); + if (me && me->path + && strcmp(me->path, "/unknown") != 0) + name = me->path; + if (name == NULL || + stat(name, &stb) != 0) + name = get_md_name(e->devnm); + + if (!name) { + pr_err("cannot find device file for %s\n", + e->devnm); + continue; + } + if (devmode == 'D') + rv |= Detail(name, c); + else + rv |= WaitClean(name, -1, c->verbose); + put_md_name(name); + } + } + free_mdstat(ms); + return rv; +} + +static int stop_scan(int verbose) +{ + /* apply --stop to all devices in /proc/mdstat */ + /* Due to possible stacking of devices, repeat until + * nothing more can be stopped + */ + int progress=1, err; + int last = 0; + int rv = 0; + do { + struct mdstat_ent *ms = mdstat_read(0, 0); + struct mdstat_ent *e; + + if (!progress) last = 1; + progress = 0; 
err = 0; + for (e=ms ; e ; e=e->next) { + char *name = get_md_name(e->devnm); + int mdfd; + + if (!name) { + pr_err("cannot find device file for %s\n", + e->devnm); + continue; + } + mdfd = open_mddev(name, 1); + if (mdfd >= 0) { + if (Manage_stop(name, mdfd, verbose, !last)) + err = 1; + else + progress = 1; + close(mdfd); + } + + put_md_name(name); + } + free_mdstat(ms); + } while (!last && err); + if (err) + rv |= 1; + return rv; +} + +static int misc_list(struct mddev_dev *devlist, + struct mddev_ident *ident, + char *dump_directory, + struct supertype *ss, struct context *c) +{ + struct mddev_dev *dv; + int rv = 0; + + for (dv=devlist ; dv; dv=(rv & 16) ? NULL : dv->next) { + int mdfd; + + switch(dv->disposition) { + case 'D': + rv |= Detail(dv->devname, c); + continue; + case KillOpt: /* Zero superblock */ + if (ss) + rv |= Kill(dv->devname, ss, c->force, c->verbose,0); + else { + int v = c->verbose; + do { + rv |= Kill(dv->devname, NULL, c->force, v, 0); + v = -1; + } while (rv == 0); + rv &= ~2; + } + continue; + case 'Q': + rv |= Query(dv->devname); continue; + case 'X': + rv |= ExamineBitmap(dv->devname, c->brief, ss); continue; + case ExamineBB: + rv |= ExamineBadblocks(dv->devname, c->brief, ss); continue; + case 'W': + case WaitOpt: + rv |= Wait(dv->devname); continue; + case Waitclean: + rv |= WaitClean(dv->devname, -1, c->verbose); continue; + case KillSubarray: + rv |= Kill_subarray(dv->devname, c->subarray, c->verbose); + continue; + case UpdateSubarray: + if (c->update == NULL) { + pr_err("-U/--update must be specified with --update-subarray\n"); + rv |= 1; + continue; + } + rv |= Update_subarray(dv->devname, c->subarray, + c->update, ident, c->verbose); + continue; + case Dump: + rv |= Dump_metadata(dv->devname, dump_directory, c, ss); + continue; + case Restore: + rv |= Restore_metadata(dv->devname, dump_directory, c, ss, + (dv == devlist && dv->next == NULL)); + continue; + case Action: + rv |= SetAction(dv->devname, c->action); + continue; + } 
+ if (dv->devname[0] == '/') + mdfd = open_mddev(dv->devname, 1); + else { + mdfd = open_dev(dv->devname); + if (mdfd < 0) + pr_err("Cannot open %s\n", dv->devname); + } + if (mdfd>=0) { + switch(dv->disposition) { + case 'R': + c->runstop = 1; + rv |= Manage_run(dv->devname, mdfd, c); break; + case 'S': + rv |= Manage_stop(dv->devname, mdfd, c->verbose, 0); break; + case 'o': + rv |= Manage_ro(dv->devname, mdfd, 1); break; + case 'w': + rv |= Manage_ro(dv->devname, mdfd, -1); break; + } + close(mdfd); + } else + rv |= 1; + } + return rv; +} + +int SetAction(char *dev, char *action) +{ + int fd = open(dev, O_RDONLY); + struct mdinfo mdi; + if (fd < 0) { + pr_err("Couldn't open %s: %s\n", dev, strerror(errno)); + return 1; + } + sysfs_init(&mdi, fd, NULL); + close(fd); + if (!mdi.sys_name[0]) { + pr_err("%s is no an md array\n", dev); + return 1; + } + + if (sysfs_set_str(&mdi, NULL, "sync_action", action) < 0) { + pr_err("Count not set action for %s to %s: %s\n", + dev, action, strerror(errno)); + return 1; + } + return 0; +} diff --git a/mdadm.conf-example b/mdadm.conf-example new file mode 100644 index 00000000..35a75d12 --- /dev/null +++ b/mdadm.conf-example @@ -0,0 +1,65 @@ +# mdadm configuration file +# +# mdadm will function properly without the use of a configuration file, +# but this file is useful for keeping track of arrays and member disks. +# In general, a mdadm.conf file is created, and updated, after arrays +# are created. This is the opposite behavior of /etc/raidtab which is +# created prior to array construction. +# +# +# the config file takes two types of lines: +# +# DEVICE lines specify a list of devices of where to look for +# potential member disks +# +# ARRAY lines specify information about how to identify arrays so +# so that they can be activated +# +# You can have more than one device line and use wild cards. 
The first +# example includes the first partition of SCSI disks /dev/sdb, +# /dev/sdc, /dev/sdd, /dev/sdj, /dev/sdk, and /dev/sdl. The second +# line looks for array slices on IDE disks. +# +#DEVICE /dev/sd[bcdjkl]1 +#DEVICE /dev/hda1 /dev/hdb1 +# +# If you mount devfs on /dev, then a suitable way to list all devices is: +#DEVICE /dev/discs/*/* +# +# +# The AUTO line can control which arrays get assembled by auto-assembly, +# meaning either "mdadm -As" when there are no 'ARRAY' lines in this file, +# or "mdadm --incremental" when the array found is not listed in this file. +# By default, all arrays that are found are assembled. +# If you want to ignore all DDF arrays (maybe they are managed by dmraid), +# and only assemble 1.x arrays which are marked for 'this' homehost, +# but assemble all others, then use +#AUTO -ddf homehost -1.x +all +# +# ARRAY lines specify an array to assemble and a method of identification. +# Arrays can currently be identified by using a UUID, superblock minor number, +# or a listing of devices. +# +# super-minor is usually the minor number of the metadevice +# UUID is the Universally Unique Identifier for the array +# Each can be obtained using +# +# mdadm -D <md> +# +#ARRAY /dev/md0 UUID=3aaa0122:29827cfa:5331ad66:ca767371 +#ARRAY /dev/md1 super-minor=1 +#ARRAY /dev/md2 devices=/dev/hda1,/dev/hdb1 +# +# ARRAY lines can also specify a "spare-group" for each array. mdadm --monitor +# will then move a spare between arrays in a spare-group if one array has a failed +# drive but no spare +#ARRAY /dev/md4 uuid=b23f3c6d:aec43a9f:fd65db85:369432df spare-group=group1 +#ARRAY /dev/md5 uuid=19464854:03f71b1b:e0df2edd:246cc977 spare-group=group1 +# +# When used in --follow (aka --monitor) mode, mdadm needs a +# mail address and/or a program. This can be given with "mailaddr" +# and "program" lines so that monitoring can be started using +# mdadm --follow --scan & echo $!
> /run/mdadm/mon.pid +# If the lines are not found, mdadm will exit quietly +#MAILADDR root@mydomain.tld +#PROGRAM /usr/sbin/handle-mdadm-events diff --git a/mdadm.conf.5 b/mdadm.conf.5 new file mode 100644 index 00000000..542e2635 --- /dev/null +++ b/mdadm.conf.5 @@ -0,0 +1,641 @@ +.\" Copyright Neil Brown and others. +.\" This program is free software; you can redistribute it and/or modify +.\" it under the terms of the GNU General Public License as published by +.\" the Free Software Foundation; either version 2 of the License, or +.\" (at your option) any later version. +.\" See file COPYING in distribution for details. +.TH MDADM.CONF 5 +.SH NAME +mdadm.conf \- configuration for management of Software RAID with mdadm +.SH SYNOPSIS +/etc/mdadm/mdadm.conf +.SH DESCRIPTION +.PP +.I mdadm +is a tool for creating, managing, and monitoring RAID devices using the +.B md +driver in Linux. +.PP +Some common tasks, such as assembling all arrays, can be simplified +by describing the devices and arrays in this configuration file. + +.SS SYNTAX +The file should be seen as a collection of words separated by white +space (space, tab, or newline). +Any word that beings with a hash sign (#) starts a comment and that +word together with the remainder of the line is ignored. + +Spaces can be included in a word using quotation characters. Either +single quotes +.RB ( ' ) +or double quotes (\fB"\fP) +may be used. All the characters from one quotation character to +next identical character are protected and will not be used to +separate words to start new quoted strings. To include a single quote +it must be between double quotes. To include a double quote it must +be between single quotes. + +Any line that starts with white space (space or tab) is treated as +though it were a continuation of the previous line. + +Empty lines are ignored, but otherwise each (non continuation) line +must start with a keyword as listed below. 
The keywords are case +insensitive and can be abbreviated to 3 characters. + +The keywords are: +.TP +.B DEVICE +A +.B device +line lists the devices (whole devices or partitions) that might contain +a component of an MD array. When looking for the components of an +array, +.I mdadm +will scan these devices (or any devices listed on the command line). + +The +.B device +line may contain a number of different devices (separated by spaces) +and each device name can contain wild cards as defined by +.BR glob (7). + +Also, there may be several device lines present in the file. + +Alternatively, a +.B device +line can contain either or both of the words +.B containers +and +.BR partitions . +The word +.B containers +will cause +.I mdadm +to look for assembled CONTAINER arrays and include them as a source +for assembling further arrays. + +The word +.I partitions +will cause +.I mdadm +to read +.I /proc/partitions +and include all devices and partitions found therein. +.I mdadm +does not use the names from +.I /proc/partitions +but only the major and minor device numbers. It scans +.I /dev +to find the name that matches the numbers. + +If no DEVICE line is present, then "DEVICE partitions containers" is assumed. + +For example: +.IP +DEVICE /dev/hda* /dev/hdc* +.br +DEV /dev/sd* +.br +DEVICE /dev/disk/by-path/pci* +.br +DEVICE partitions + +.TP +.B ARRAY +The ARRAY lines identify actual arrays. The second word on the line +may be the name of the device where the array is normally +assembled, such as +.B /dev/md1 +or +.BR /dev/md/backup . +If the name does not start with a slash +.RB (' / '), +it is treated as being in +.BR /dev/md/ . +Alternately the word +.B <ignore> +(complete with angle brackets) can be given in which case any array +which matches the rest of the line will never be automatically assembled. +If no device name is given, +.I mdadm +will use various heuristics to determine an appropriate name.
+ +Subsequent words identify the array, or identify the array as a member +of a group. If multiple identities are given, +then a component device must match ALL identities to be considered a +match. Each identity word has a tag, an equals sign, and some value. +The tags are: +.RS 4 +.TP +.B uuid= +The value should be a 128 bit uuid in hexadecimal, with punctuation +interspersed if desired. This must match the uuid stored in the +superblock. +.TP +.B name= +The value should be a simple textual name as was given to +.I mdadm +when the array was created. This must match the name stored in the +superblock on a device for that device to be included in the array. +Not all superblock formats support names. +.TP +.B super\-minor= +The value is an integer which indicates the minor number that was +stored in the superblock when the array was created. When an array is +created as /dev/mdX, then the minor number X is stored. +.TP +.B devices= +The value is a comma separated list of device names or device name +patterns. +Only devices with names which match one entry in the list will be used +to assemble the array. Note that the devices +listed there must also be listed on a DEVICE line. +.TP +.B level= +The value is a RAID level. This is not normally used to +identify an array, but is supported so that the output of + +.B "mdadm \-\-examine \-\-scan" + +can be used directly in the configuration file. +.TP +.B num\-devices= +The value is the number of devices in a complete active array. As with +.B level= +this is mainly for compatibility with the output of + +.BR "mdadm \-\-examine \-\-scan" . + +.TP +.B spares= +The value is a number of spare devices to expect the array to have. +The sole use of this keyword and value is as follows: +.B mdadm \-\-monitor +will report an array if it is found to have fewer than this number of +spares when +.B \-\-monitor +starts or when +.B \-\-oneshot +is used. + +.TP +.B spare\-group= +The value is a textual name for a group of arrays.
All arrays with +the same +.B spare\-group +name are considered to be part of the same group. The significance of +a group of arrays is that +.I mdadm +will, when monitoring the arrays, move a spare drive from one array in +a group to another array in that group if the first array had a failed +or missing drive but no spare. + +.TP +.B auto= +This option is rarely needed with mdadm-3.0, particularly if used with +the Linux kernel v2.6.28 or later. +It tells +.I mdadm +whether to use partitionable arrays or non-partitionable arrays and, +in the absence of +.IR udev , +how many partition devices to create. From 2.6.28 all md array +devices are partitionable, hence this option is not needed. + +The value of this option can be "yes" or "md" to indicate that a +traditional, non-partitionable md array should be created, or "mdp", +"part" or "partition" to indicate that a partitionable md array (only +available in linux 2.6 and later) should be used. This latter set can +also have a number appended to indicate how many partitions to create +device files for, e.g. +.BR auto=mdp5 . +The default is 4. + +.TP +.B bitmap= +The option specifies a file in which a write-intent bitmap should be +found. When assembling the array, +.I mdadm +will provide this file to the +.B md +driver as the bitmap file. This has the same function as the +.B \-\-bitmap\-file +option to +.BR \-\-assemble . + +.TP +.B metadata= +Specify the metadata format that the array has. This is mainly +recognised for compatibility with the output of +.BR "mdadm \-Es" . + +.TP +.B container= +Specify that this array is a member array of some container. The +value given can be either a path name in /dev, or a UUID of the +container array. + +.TP +.B member= +Specify that this array is a member array of some container. Each +type of container has some way to enumerate member arrays, often a +simple sequence number. The value identifies which member of a +container the array is.
It will usually accompany a "container=" word. +.RE + +.TP +.B MAILADDR +The +.B mailaddr +line gives an E-mail address that alerts should be +sent to when +.I mdadm +is running in +.B \-\-monitor +mode (and was given the +.B \-\-scan +option). There should only be one +.B MAILADDR +line and it should have only one address. Any subsequent addresses +are silently ignored. + +.TP +.B MAILFROM +The +.B mailfrom +line (which can only be abbreviated to 5 or more characters) gives an +address to appear in the "From" address for alert mails. This can be +useful if you want to explicitly set a domain, as the default from +address is "root" with no domain. All words on this line are +catenated with spaces to form the address. + +Note that this value cannot be set via the +.I mdadm +commandline. It is only settable via the config file. + +.TP +.B PROGRAM +The +.B program +line gives the name of a program to be run when +.B "mdadm \-\-monitor" +detects potentially interesting events on any of the arrays that it +is monitoring. This program gets run with two or three arguments, they +being the Event, the md device, and possibly the related component +device. + +There should only be one +.B program +line and it should be given only one program. + + +.TP +.B CREATE +The +.B create +line gives default values to be used when creating arrays, new members +of arrays, and device entries for arrays. +These include: + +.RS 4 +.TP +.B owner= +.TP +.B group= +These can give user/group ids or names to use instead of system +defaults (root/wheel or root/disk). +.TP +.B mode= +An octal file mode such as 0660 can be given to override the default +of 0600. +.TP +.B auto= +This corresponds to the +.B \-\-auto +flag to mdadm. Give +.BR yes , +.BR md , +.BR mdp , +.B part +\(em possibly followed by a number of partitions \(em to indicate how +missing device entries should be created. + +.TP +.B metadata= +The name of the metadata format to use if none is explicitly given.
+This can be useful to impose a system-wide default of version-1 superblocks. + +.TP +.B symlinks=no +Normally when creating devices in +.B /dev/md/ +.I mdadm +will create a matching symlink from +.B /dev/ +with a name starting +.B md +or +.BR md_ . +Give +.B symlinks=no +to suppress this symlink creation. + +.TP +.B names=yes +Since Linux 2.6.29 it has been possible to create +.B md +devices with a name like +.B md_home +rather than just a number, like +.BR md3 . +.I mdadm +will use the numeric alternative by default as other tools that interact +with md arrays may expect only numbers. +If +.B names=yes +is given in +.I mdadm.conf +then +.I mdadm +will use a name when appropriate. +If +.B names=no +is given, then non-numeric +.I md +device names will not be used even if the default changes in a future +release of +.IR mdadm . + +.TP +.B bbl=no +By default, +.I mdadm +will reserve space for a bad block list (bbl) on all devices +included in or added to any array that supports them. Setting +.B bbl=no +will prevent this, so newly added devices will not have a bad +block log. +.RE + +.TP +.B HOMEHOST +The +.B homehost +line gives a default value for the +.B \-\-homehost= +option to mdadm. There should normally be only one other word on the line. +It should either be a host name, or one of the special words +.BR <system> , +.B <none> +and +.BR <ignore> . +If +.B <system> +is given, then the +.BR gethostname ( 2 ) +systemcall is used to get the host name. This is the default. + +If +.B <ignore> +is given, then a flag is set so that when arrays are being +auto-assembled the checking of the recorded +.I homehost +is disabled. +If +.B <ignore> +is given it is also possible to give an explicit name which will be +used when creating arrays. This is the only case when there can be +more than one other word on the +.B HOMEHOST +line. If there are other words, or other +.B HOMEHOST +lines, they are silently ignored.
+ +If +.B <none> +is given, then the default of using +.BR gethostname ( 2 ) +is over-ridden and no homehost name is assumed. + +When arrays are created, this host name will be stored in the +metadata. When arrays are assembled using auto-assembly, arrays which +do not record the correct homehost name in their metadata will be +assembled using a "foreign" name. A "foreign" name always ends with a +digit string preceded by an underscore to differentiate it +from any possible local name, e.g. +.B /dev/md/1_1 +or +.BR /dev/md/home_0 . +.TP +.B AUTO +A list of names of metadata formats can be given, each preceded by a +plus or minus sign. Also the word +.I homehost +is allowed as is +.I all +preceded by plus or minus sign. +.I all +is usually last. + +When +.I mdadm +is auto-assembling an array, either via +.I \-\-assemble +or +.I \-\-incremental +and it finds metadata of a given type, it checks that metadata type +against those listed in this line. The first match wins, where +.I all +matches anything. +If a match is found that was preceded by a plus sign, the auto +assembly is allowed. If the match was preceded by a minus sign, the +auto assembly is disallowed. If no match is found, the auto assembly +is allowed. + +If the metadata indicates that the array was created for +.I this +host, and the word +.I homehost +appears before any other match, then the array is treated as a valid +candidate for auto-assembly. + +This can be used to disable all auto-assembly (so that only arrays +explicitly listed in mdadm.conf or on the command line are assembled), +or to disable assembly of certain metadata types which might be +handled by other software. It can also be used to disable assembly of +all foreign arrays - normally such arrays are assembled but given a +non-deterministic name in +.BR /dev/md/ . + +The known metadata types are +.BR 0.90 , +.BR 1.x , +.BR ddf , +.BR imsm . + +.B AUTO +should be given at most once. Subsequent lines are silently ignored.
+Thus an earlier config file in a config directory will over-ride +the setting in a later config file. + +.TP +.B POLICY +This is used to specify what automatic behavior is allowed on devices +newly appearing in the system and provides a way of marking spares that can +be moved to other arrays as well as the migration domains. +.I Domain +can be defined through +.I policy +line by specifying a domain name for a number of paths from +.BR /dev/disk/by-path/ . +A device may belong to several domains. The domain of an array is a union +of domains of all devices in that array. A spare can be automatically +moved from one array to another if the set of the destination array's +.I domains +contains all the +.I domains +of the new disk or if both arrays have the same +.IR spare-group . + +To update hot plug configuration it is necessary to execute +.B mdadm \-\-udev\-rules +command after changing the config file + +Key words used in the +.I POLICY +line and supported values are: + +.RS 7 +.TP +.B domain= +any arbitrary string +.TP +.B metadata= +0.9 1.x ddf or imsm +.TP +.B path= +file glob matching anything from +.B /dev/disk/by-path +.TP +.B type= +either +.B disk +or +.BR part . +.TP +.B action= +include, re-add, spare, spare-same-slot, or force-spare +.TP +.B auto= +yes, no, or homehost. + +.P +The +.I action +item determines the automatic behavior allowed for devices matching the +.I path +and +.I type +in the same line. If a device matches several lines with different +.I actions +then the most permissive will apply. The ordering of policy lines +is irrelevant to the end result. +.TP +.B include +allows adding a disk to an array if metadata on that disk matches that array +.TP +.B re\-add +will include the device in the array if it appears to be a current member +or a member that was recently removed and the array has a +write-intent-bitmap to allow the +.B re\-add +functionality. 
+.TP +.B spare +as above and additionally: if the device is bare it can +become a spare if there is any array that it is a candidate for based +on domains and metadata. +.TP +.B spare\-same\-slot +as above and additionally if given slot was used by an array that went +degraded recently and the device plugged in has no metadata then it will +be automatically added to that array (or its container) +.TP +.B force\-spare +as above and the disk will become a spare in remaining cases +.RE + +.SH EXAMPLE +DEVICE /dev/sd[bcdjkl]1 +.br +DEVICE /dev/hda1 /dev/hdb1 + +# /dev/md0 is known by its UUID. +.br +ARRAY /dev/md0 UUID=3aaa0122:29827cfa:5331ad66:ca767371 +.br +# /dev/md1 contains all devices with a minor number of +.br +# 1 in the superblock. +.br +ARRAY /dev/md1 super\-minor=1 +.br +# /dev/md2 is made from precisely these two devices +.br +ARRAY /dev/md2 devices=/dev/hda1,/dev/hdb1 + +# /dev/md4 and /dev/md5 are a spare-group and spares +.br +# can be moved between them +.br +ARRAY /dev/md4 uuid=b23f3c6d:aec43a9f:fd65db85:369432df +.br + spare\-group=group1 +.br +ARRAY /dev/md5 uuid=19464854:03f71b1b:e0df2edd:246cc977 +.br + spare\-group=group1 +.br +# /dev/md/home is created if needed as a partitionable md array +.br +# any spare device number is allocated. +.br +ARRAY /dev/md/home UUID=9187a482:5dde19d9:eea3cc4a:d646ab8b +.br + auto=part +.br +# The name of this array contains a space. +.br +ARRAY /dev/md9 name='Data Storage' +.sp +POLICY domain=domain1 metadata=imsm path=pci-0000:00:1f.2-scsi-* +.br + action=spare +.br +POLICY domain=domain1 metadata=imsm path=pci-0000:04:00.0-scsi-[01]* +.br + action=include +.br +# One domain comprising devices attached to specified paths is defined. +.br +# Bare device matching first path will be made an imsm spare on hot plug. +.br +# If more than one array is created on devices belonging to domain1 and +.br +# one of them becomes degraded, then any imsm spare matching any path for +.br +# given domain name can be migrated.
+.br +MAILADDR root@mydomain.tld +.br +PROGRAM /usr/sbin/handle\-mdadm\-events +.br +CREATE group=system mode=0640 auto=part\-8 +.br +HOMEHOST +.br +AUTO +1.x homehost \-all + +.SH SEE ALSO +.BR mdadm (8), +.BR md (4). diff --git a/mdadm.h b/mdadm.h new file mode 100755 index 00000000..dd02be71 --- /dev/null +++ b/mdadm.h @@ -0,0 +1,1691 @@ +/* + * mdadm - manage Linux "md" devices aka RAID arrays. + * + * Copyright (C) 2001-2009 Neil Brown + * + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Author: Neil Brown + * Email: + */ + +#define _GNU_SOURCE +#define _FILE_OFFSET_BITS 64 +#include +#ifdef __GLIBC__ +extern __off64_t lseek64 __P ((int __fd, __off64_t __offset, int __whence)); +#elif !defined(lseek64) +# if defined(__NO_STAT64) || __WORDSIZE != 32 +# define lseek64 lseek +# endif +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef __dietlibc__ +#include +/* dietlibc has deprecated random and srandom!! 
*/ +#define random rand +#define srandom srand +#endif + +#ifdef NO_COROSYNC +#define CS_OK 1 +typedef uint64_t cmap_handle_t; +#else +#include +#endif + +#ifndef NO_DLM +#include +#include +#else +#define LKF_NOQUEUE 0x00000001 +#define LKF_CONVERT 0x00000004 +#define LKM_PWMODE 4 +#define EUNLOCK 0x10002 + +typedef void *dlm_lshandle_t; + +struct dlm_lksb { + int sb_status; + uint32_t sb_lkid; + char sb_flags; + char *sb_lvbptr; +}; +#endif + +#include +/*#include */ +#include +#include +#include +#define MD_MAJOR 9 +#define MdpMinorShift 6 + +#ifndef BLKGETSIZE64 +#define BLKGETSIZE64 _IOR(0x12,114,size_t) /* return device size in bytes (u64 *arg) */ +#endif + +#define DEFAULT_CHUNK 512 +#define DEFAULT_BITMAP_CHUNK 4096 +#define DEFAULT_BITMAP_DELAY 5 +#define DEFAULT_MAX_WRITE_BEHIND 256 + +/* MAP_DIR should be somewhere that persists across the pivotroot + * from early boot to late boot. + * /run seems to have emerged as the best standard. + */ +#ifndef MAP_DIR +#define MAP_DIR "/run/mdadm" +#endif /* MAP_DIR */ +/* MAP_FILE is what we name the map file we put in MAP_DIR, in case you + * want something other than the default of "map" + */ +#ifndef MAP_FILE +#define MAP_FILE "map" +#endif /* MAP_FILE */ +/* MDMON_DIR is where pid and socket files used for communicating + * with mdmon normally live. Best is /var/run/mdadm as + * mdmon is needed at early boot then it needs to write there prior + * to /var/run being mounted read/write, and it also then needs to + * persist beyond when /var/run is mounted read-only. So, to be + * safe, the default is somewhere that is read/write early in the + * boot process and stays up as long as possible during shutdown. 
+ */ +#ifndef MDMON_DIR +#define MDMON_DIR "/run/mdadm" +#endif /* MDMON_DIR */ + +/* FAILED_SLOTS is where to save files storing recent removal of array + * member in order to allow future reuse of disk inserted in the same + * slot for array recovery + */ +#ifndef FAILED_SLOTS_DIR +#define FAILED_SLOTS_DIR "/run/mdadm/failed-slots" +#endif /* FAILED_SLOTS */ + +#include "md_u.h" +#include "md_p.h" +#include "bitmap.h" +#include "msg.h" + +#include +/* Redhat don't like to #include , and + * some time include isn't enough, + * and there is no standard conversion function so... */ +/* And dietlibc doesn't think byteswap is ok, so.. */ +/* #include */ +#define bswap_16(x) (((x) & 0x00ffU) << 8 | \ + ((x) & 0xff00U) >> 8) +#define bswap_32(x) (((x) & 0x000000ffU) << 24 | \ + ((x) & 0xff000000U) >> 24 | \ + ((x) & 0x0000ff00U) << 8 | \ + ((x) & 0x00ff0000U) >> 8) +#define bswap_64(x) (((x) & 0x00000000000000ffULL) << 56 | \ + ((x) & 0xff00000000000000ULL) >> 56 | \ + ((x) & 0x000000000000ff00ULL) << 40 | \ + ((x) & 0x00ff000000000000ULL) >> 40 | \ + ((x) & 0x0000000000ff0000ULL) << 24 | \ + ((x) & 0x0000ff0000000000ULL) >> 24 | \ + ((x) & 0x00000000ff000000ULL) << 8 | \ + ((x) & 0x000000ff00000000ULL) >> 8) + +#if !defined(__KLIBC__) +#if BYTE_ORDER == LITTLE_ENDIAN +#define __cpu_to_le16(_x) (unsigned int)(_x) +#define __cpu_to_le32(_x) (unsigned int)(_x) +#define __cpu_to_le64(_x) (unsigned long long)(_x) +#define __le16_to_cpu(_x) (unsigned int)(_x) +#define __le32_to_cpu(_x) (unsigned int)(_x) +#define __le64_to_cpu(_x) (unsigned long long)(_x) + +#define __cpu_to_be16(_x) bswap_16(_x) +#define __cpu_to_be32(_x) bswap_32(_x) +#define __cpu_to_be64(_x) bswap_64(_x) +#define __be16_to_cpu(_x) bswap_16(_x) +#define __be32_to_cpu(_x) bswap_32(_x) +#define __be64_to_cpu(_x) bswap_64(_x) +#elif BYTE_ORDER == BIG_ENDIAN +#define __cpu_to_le16(_x) bswap_16(_x) +#define __cpu_to_le32(_x) bswap_32(_x) +#define __cpu_to_le64(_x) bswap_64(_x) +#define __le16_to_cpu(_x) 
bswap_16(_x) +#define __le32_to_cpu(_x) bswap_32(_x) +#define __le64_to_cpu(_x) bswap_64(_x) + +#define __cpu_to_be16(_x) (unsigned int)(_x) +#define __cpu_to_be32(_x) (unsigned int)(_x) +#define __cpu_to_be64(_x) (unsigned long long)(_x) +#define __be16_to_cpu(_x) (unsigned int)(_x) +#define __be32_to_cpu(_x) (unsigned int)(_x) +#define __be64_to_cpu(_x) (unsigned long long)(_x) +#else +# error "unknown endianness." +#endif +#endif /* __KLIBC__ */ + +/* + * Check at compile time that something is of a particular type. + * Always evaluates to 1 so you may use it easily in comparisons. +*/ + +#define typecheck(type,x) \ +({ type __dummy; \ + typeof(x) __dummy2; \ + (void)(&__dummy == &__dummy2); \ + 1; \ +}) + +/* + * These inlines deal with timer wrapping correctly. + * + * time_after(a,b) returns true if the time a is after time b. +*/ + +#define time_after(a,b) \ + (typecheck(unsigned int, a) && \ + typecheck(unsigned int, b) && \ + ((int)((b) - (a)) < 0)) + +#define time_before(a,b) time_after(b,a) + +/* + * min()/max()/clamp() macros that also do + * strict type-checking.. See the + * "unnecessary" pointer comparison. + */ +#define min(x, y) ({ \ + typeof(x) _min1 = (x); \ + typeof(y) _min2 = (y); \ + (void) (&_min1 == &_min2); \ + _min1 < _min2 ? _min1 : _min2; }) + +#define max(x, y) ({ \ + typeof(x) _max1 = (x); \ + typeof(y) _max2 = (y); \ + (void) (&_max1 == &_max2); \ + _max1 > _max2 ? _max1 : _max2; }) + +#define ARRAY_SIZE(x) (sizeof(x)/sizeof(x[0])) + +extern const char Name[]; + +/* general information that might be extracted from a superblock */ +struct mdinfo { + mdu_array_info_t array; + mdu_disk_info_t disk; + __u64 events; + int uuid[4]; + char name[33]; + unsigned long long data_offset; + unsigned long long new_data_offset; + unsigned long long component_size; /* same as array.size, except in + * sectors and up to 64bits. 
+ */ + unsigned long long custom_array_size; /* size for non-default sized + * arrays (in sectors) + */ +#define NO_RESHAPE 0 +#define VOLUME_RESHAPE 1 +#define CONTAINER_RESHAPE 2 +#define RESHAPE_NO_BACKUP 16 /* Mask 'or'ed in */ + int reshape_active; + unsigned long long reshape_progress; + int recovery_blocked; /* for external metadata it + * indicates that there is + * reshape in progress in + * container, + * for native metadata it is + * reshape_active field mirror + */ + int journal_device_required; + int journal_clean; + + /* During reshape we can sometimes change the data_offset to avoid + * over-writing still-valid data. We need to know if there is space. + * So getinfo_super will fill in space_before and space_after in sectors. + * data_offset can be increased or decreased by this amount. + */ + unsigned long long space_before, space_after; + union { + unsigned long long resync_start; /* per-array resync position */ + unsigned long long recovery_start; /* per-device rebuild position */ + #define MaxSector (~0ULL) /* resync/recovery complete position */ + }; + long bitmap_offset; /* 0 == none, 1 == a file */ + unsigned long safe_mode_delay; /* ms delay to mark clean */ + int new_level, delta_disks, new_layout, new_chunk; + int errors; + unsigned long cache_size; /* size of raid456 stripe cache*/ + int mismatch_cnt; + char text_version[50]; + + int container_member; /* for assembling external-metadata arrays + * This is to be used internally by metadata + * handler only */ + int container_enough; /* flag external handlers can set to + * indicate that subarrays have not enough (-1), + * enough to start (0), or all expected disks (1) */ + char sys_name[20]; + struct mdinfo *devs; + struct mdinfo *next; + + /* Device info for mdmon: */ + int recovery_fd; + int state_fd; + #define DS_FAULTY 1 + #define DS_INSYNC 2 + #define DS_WRITE_MOSTLY 4 + #define DS_SPARE 8 + #define DS_BLOCKED 16 + #define DS_REMOVE 1024 + #define DS_UNBLOCK 2048 + int prev_state, 
curr_state, next_state; + + /* info read from sysfs */ + char sysfs_array_state[20]; +}; + +struct createinfo { + int uid; + int gid; + int autof; + int mode; + int symlinks; + int names; + int bblist; + struct supertype *supertype; +}; + +enum mode { + ASSEMBLE=1, + BUILD, + CREATE, + MANAGE, + MISC, + MONITOR, + GROW, + INCREMENTAL, + AUTODETECT, + mode_count +}; + +extern char short_options[]; +extern char short_bitmap_options[]; +extern char short_bitmap_auto_options[]; +extern struct option long_options[]; +extern char Version[], Usage[], Help[], OptionHelp[], + *mode_help[], + Help_create[], Help_build[], Help_assemble[], Help_grow[], + Help_incr[], + Help_manage[], Help_misc[], Help_monitor[], Help_config[]; + +/* for options that don't have short equivalents, we assign arbitrary + * numbers later than any 'short' character option. + */ +enum special_options { + AssumeClean = 300, + BitmapChunk, + WriteBehind, + ReAdd, + NoDegraded, + Sparc22, + BackupFile, + HomeHost, + AutoHomeHost, + Symlinks, + AutoDetect, + Waitclean, + DetailPlatform, + KillSubarray, + UpdateSubarray, + IncrementalPath, + NoSharing, + HelpOptions, + Brief, + ManageOpt, + Add, + AddSpare, + AddJournal, + Remove, + Fail, + Replace, + With, + MiscOpt, + WaitOpt, + ConfigFile, + ChunkSize, + WriteMostly, + Layout, + Auto, + Force, + SuperMinor, + EMail, + ProgramOpt, + Increment, + Fork, + Bitmap, + RebuildMapOpt, + InvalidBackup, + UdevRules, + FreezeReshape, + Continue, + OffRootOpt, + Prefer, + KillOpt, + DataOffset, + ExamineBB, + Dump, + Restore, + Action, + Nodes, + ClusterName, + ClusterConfirm, + WriteJournal, +}; + +enum prefix_standard { + JEDEC, + IEC +}; + +enum bitmap_update { + NoUpdate, + NameUpdate, + NodeNumUpdate, +}; + +/* structures read from config file */ +/* List of mddevice names and identifiers + * Identifiers can be: + * uuid=128-hex-uuid + * super-minor=decimal-minor-number-from-superblock + * devices=comma,separated,list,of,device,names,with,wildcards + * + * If 
multiple fields are present, the intersection of all matching + * devices is considered + */ +#define UnSet (0xfffe) +struct mddev_ident { + char *devname; + + int uuid_set; + int uuid[4]; + char name[33]; + + int super_minor; + + char *devices; /* comma separated list of device + * names with wild cards + */ + int level; + int raid_disks; + int spare_disks; + struct supertype *st; + int autof; /* 1 for normal, 2 for partitioned */ + char *spare_group; + char *bitmap_file; + int bitmap_fd; + + char *container; /* /dev/whatever name of container, or + * uuid of container. You would expect + * this to be the 'devname' or UUID + * of some other entry. + */ + char *member; /* subarray within a container */ + + struct mddev_ident *next; + union { + /* fields needed by different users of this structure */ + int assembled; /* set when assembly succeeds */ + }; +}; + +struct context { + int readonly; + int runstop; + int verbose; + int brief; + int force; + char *homehost; + int require_homehost; + char *prefer; + int export; + int test; + char *subarray; + char *update; + int scan; + int SparcAdjust; + int autof; + int delay; + int freeze_reshape; + char *backup_file; + int invalid_backup; + char *action; + int nodes; + char *homecluster; +}; + +struct shape { + int raiddisks; + int sparedisks; + int journaldisks; + int level; + int layout; + char *layout_str; + int chunk; + int bitmap_chunk; + char *bitmap_file; + int assume_clean; + int write_behind; + unsigned long long size; +}; + +/* List of device names - wildcards expanded */ +struct mddev_dev { + char *devname; + int disposition; /* 'a' for add, 'r' for remove, 'f' for fail, + * 'A' for re_add. 
+ * Not set for names read from .config + */ + char writemostly; /* 1 for 'set writemostly', 2 for 'clear writemostly' */ + int used; /* set when used */ + long long data_offset; + struct mddev_dev *next; +}; + +typedef struct mapping { + char *name; + int num; +} mapping_t; + +struct mdstat_ent { + char devnm[32]; + int active; + char *level; + char *pattern; /* U or up, _ for down */ + int percent; /* -1 if no resync */ + int resync; /* 3 if check, 2 if reshape, 1 if resync, 0 if recovery */ + int devcnt; + int raid_disks; + char * metadata_version; + struct dev_member { + char *name; + struct dev_member *next; + } *members; + struct mdstat_ent *next; +}; + +extern struct mdstat_ent *mdstat_read(int hold, int start); +extern void mdstat_close(void); +extern void free_mdstat(struct mdstat_ent *ms); +extern void mdstat_wait(int seconds); +extern void mdstat_wait_fd(int fd, const sigset_t *sigmask); +extern int mddev_busy(char *devnm); +extern struct mdstat_ent *mdstat_by_component(char *name); +extern struct mdstat_ent *mdstat_by_subdev(char *subdev, char *container); + +struct map_ent { + struct map_ent *next; + char devnm[32]; + char metadata[20]; + int uuid[4]; + int bad; + char *path; +}; +extern int map_update(struct map_ent **mpp, char *devnm, char *metadata, + int uuid[4], char *path); +extern void map_remove(struct map_ent **map, char *devnm); +extern struct map_ent *map_by_uuid(struct map_ent **map, int uuid[4]); +extern struct map_ent *map_by_devnm(struct map_ent **map, char *devnm); +extern void map_free(struct map_ent *map); +extern struct map_ent *map_by_name(struct map_ent **map, char *name); +extern void map_read(struct map_ent **melp); +extern int map_write(struct map_ent *mel); +extern void map_delete(struct map_ent **mapp, char *devnm); +extern void map_add(struct map_ent **melp, + char *devnm, char *metadata, int uuid[4], char *path); +extern int map_lock(struct map_ent **melp); +extern void map_unlock(struct map_ent **melp); +extern void 
map_fork(void); + +/* various details can be requested */ +enum sysfs_read_flags { + GET_LEVEL = (1 << 0), + GET_LAYOUT = (1 << 1), + GET_COMPONENT = (1 << 2), + GET_CHUNK = (1 << 3), + GET_CACHE = (1 << 4), + GET_MISMATCH = (1 << 5), + GET_VERSION = (1 << 6), + GET_DISKS = (1 << 7), + GET_DEGRADED = (1 << 8), + GET_SAFEMODE = (1 << 9), + GET_BITMAP_LOCATION = (1 << 10), + + GET_DEVS = (1 << 20), /* gets role, major, minor */ + GET_OFFSET = (1 << 21), + GET_SIZE = (1 << 22), + GET_STATE = (1 << 23), + GET_ERROR = (1 << 24), + GET_ARRAY_STATE = (1 << 25), +}; + +/* If fd >= 0, get the array it is open on, + * else use devnm. + */ +extern int sysfs_open(char *devnm, char *devname, char *attr); +extern void sysfs_init(struct mdinfo *mdi, int fd, char *devnm); +extern void sysfs_init_dev(struct mdinfo *mdi, unsigned long devid); +extern void sysfs_free(struct mdinfo *sra); +extern struct mdinfo *sysfs_read(int fd, char *devnm, unsigned long options); +extern int sysfs_attr_match(const char *attr, const char *str); +extern int sysfs_match_word(const char *word, char **list); +extern int sysfs_set_str(struct mdinfo *sra, struct mdinfo *dev, + char *name, char *val); +extern int sysfs_set_num(struct mdinfo *sra, struct mdinfo *dev, + char *name, unsigned long long val); +extern int sysfs_set_num_signed(struct mdinfo *sra, struct mdinfo *dev, + char *name, long long val); +extern int sysfs_uevent(struct mdinfo *sra, char *event); +extern int sysfs_get_fd(struct mdinfo *sra, struct mdinfo *dev, + char *name); +extern int sysfs_fd_get_ll(int fd, unsigned long long *val); +extern int sysfs_get_ll(struct mdinfo *sra, struct mdinfo *dev, + char *name, unsigned long long *val); +extern int sysfs_fd_get_two(int fd, unsigned long long *v1, unsigned long long *v2); +extern int sysfs_get_two(struct mdinfo *sra, struct mdinfo *dev, + char *name, unsigned long long *v1, unsigned long long *v2); +extern int sysfs_fd_get_str(int fd, char *val, int size); +extern int 
sysfs_attribute_available(struct mdinfo *sra, struct mdinfo *dev, + char *name); +extern int sysfs_get_str(struct mdinfo *sra, struct mdinfo *dev, + char *name, char *val, int size); +extern int sysfs_set_safemode(struct mdinfo *sra, unsigned long ms); +extern int sysfs_set_array(struct mdinfo *info, int vers); +extern int sysfs_add_disk(struct mdinfo *sra, struct mdinfo *sd, int resume); +extern int sysfs_disk_to_scsi_id(int fd, __u32 *id); +extern int sysfs_unique_holder(char *devnm, long rdev); +extern int sysfs_freeze_array(struct mdinfo *sra); +extern int sysfs_wait(int fd, int *msec); +extern int load_sys(char *path, char *buf); +extern int reshape_prepare_fdlist(char *devname, + struct mdinfo *sra, + int raid_disks, + int nrdisks, + unsigned long blocks, + char *backup_file, + int *fdlist, + unsigned long long *offsets); +extern void reshape_free_fdlist(int *fdlist, + unsigned long long *offsets, + int size); +extern int reshape_open_backup_file(char *backup, + int fd, + char *devname, + long blocks, + int *fdlist, + unsigned long long *offsets, + char *sysfs_name, + int restart); +extern unsigned long compute_backup_blocks(int nchunk, int ochunk, + unsigned int ndata, unsigned int odata); +extern char *locate_backup(char *name); +extern char *make_backup(char *name); + +extern int save_stripes(int *source, unsigned long long *offsets, + int raid_disks, int chunk_size, int level, int layout, + int nwrites, int *dest, + unsigned long long start, unsigned long long length, + char *buf); +extern int restore_stripes(int *dest, unsigned long long *offsets, + int raid_disks, int chunk_size, int level, int layout, + int source, unsigned long long read_offset, + unsigned long long start, unsigned long long length, + char *src_buf); + +#ifndef Sendmail +#define Sendmail "/usr/lib/sendmail -t" +#endif + +#define SYSLOG_FACILITY LOG_DAEMON + +extern char *map_num(mapping_t *map, int num); +extern int map_name(mapping_t *map, char *name); +extern mapping_t r5layout[], 
r6layout[], pers[], modes[], faultylayout[]; + +extern char *map_dev_preferred(int major, int minor, int create, + char *prefer); +static inline char *map_dev(int major, int minor, int create) +{ + return map_dev_preferred(major, minor, create, NULL); +} + +struct active_array; +struct metadata_update; + +/* 'struct reshape' records the intermediate states of + * a general reshape. + * The starting geometry is converted to the 'before' geometry + * by at most an atomic level change. They could be the same. + * Similarly the 'after' geometry is converted to the final + * geometry by at most a level change. + * Note that 'before' and 'after' must have the same level. + * 'blocks' is the minimum number of sectors for a reshape unit. + * This will be a multiple of the stripe size in each of the + * 'before' and 'after' geometries. + * If 'blocks' is 0, no restriping is necessary. + * 'min_offset_change' is the minimum change to data_offset to + * allow the reshape to happen. It is at least the larger of + * the old and new chunk sizes, and typically the same as 'blocks' + * divided by number of data disks. + */ +struct reshape { + int level; + int parity; /* number of parity blocks/devices */ + struct { + int layout; + int data_disks; + } before, after; + unsigned long long backup_blocks; + unsigned long long min_offset_change; + unsigned long long stripes; /* number of old stripes that comprise 'blocks'*/ + unsigned long long new_size; /* New size of array in sectors */ +}; + +/* A superswitch provides entry point the a metadata handler. + * + * The superswitch primarily operates on some "metadata" that + * is accessed via the 'supertype'. + * This metadata has one of three possible sources. + * 1/ It is read from a single device. In this case it may not completely + * describe the array or arrays as some information might be on other + * devices. + * 2/ It is read from all devices in a container. In this case all + * information is present. 
+ * 3/ It is created by ->init_super / ->add_to_super. In this case it will + * be complete once enough ->add_to_super calls have completed. + * + * When creating an array inside a container, the metadata will be + * formed by a combination of 2 and 3. The metadata or the array is read, + * then new information is added. + * + * The metadata must sometimes have a concept of a 'current' array + * and a 'current' device. + * The 'current' array is set by init_super to be the newly created array, + * or is set by super_by_fd when it finds it is looking at an array inside + * a container. + * + * The 'current' device is either the device that the metadata was read from + * in case 1, or the last device added by add_to_super in case 3. + * Case 2 does not identify a 'current' device. + */ +extern struct superswitch { + + /* Used to report details of metadata read from a component + * device. ->load_super has been called. + */ + void (*examine_super)(struct supertype *st, char *homehost); + void (*brief_examine_super)(struct supertype *st, int verbose); + void (*brief_examine_subarrays)(struct supertype *st, int verbose); + void (*export_examine_super)(struct supertype *st); + int (*examine_badblocks)(struct supertype *st, int fd, char *devname); + int (*copy_metadata)(struct supertype *st, int from, int to); + + /* Used to report details of an active array. + * ->load_super was possibly given a 'component' string. + */ + void (*detail_super)(struct supertype *st, char *homehost); + void (*brief_detail_super)(struct supertype *st); + void (*export_detail_super)(struct supertype *st); + + /* Optional: platform hardware / firmware details */ + int (*detail_platform)(int verbose, int enumerate_only, char *controller_path); + int (*export_detail_platform)(int verbose, char *controller_path); + + /* Used: + * to get uuid to storing in bitmap metadata + * and 'reshape' backup-data metadata + * To see if a device is being re-added to an array it was part of. 
+ */ + void (*uuid_from_super)(struct supertype *st, int uuid[4]); + + /* Extract generic details from metadata. This could be details about + * the container, or about an individual array within the container. + * The determination is made either by: + * load_super being given a 'component' string. + * validate_geometry determining what to create. + * The info includes both array information and device information. + * The particular device should be: + * The last device added by add_to_super + * The device the metadata was loaded from by load_super + * If 'map' is present, then it is an array raid_disks long + * (raid_disk must already be set and correct) and it is filled + * with 1 for slots that are thought to be active and 0 for slots which + * appear to be failed/missing. + * *info is zeroed out before data is added. + */ + void (*getinfo_super)(struct supertype *st, struct mdinfo *info, char *map); + struct mdinfo *(*getinfo_super_disks)(struct supertype *st); + /* Check if the given metadata is flagged as belonging to "this" + * host. 0 for 'no', 1 for 'yes', -1 for "Don't record homehost" + */ + int (*match_home)(struct supertype *st, char *homehost); + + /* Make one of several generic modifications to metadata + * prior to assembly (or other times). + * sparc2.2 - first bug in early 0.90 metadata + * super-minor - change name of 0.90 metadata + * summaries - 'correct' any redundant data + * resync - mark array as dirty to trigger a resync. + * uuid - set new uuid - only 0.90 or 1.x + * name - change the name of the array (where supported) + * homehost - change which host this array is tied to. + * devicesize - If metadata is at start of device, change recorded + * device size to match actual device size + * byteorder - swap bytes for 0.90 metadata + * + * force-one - mark that device as uptodate, not old or failed. + * force-array - mark array as clean if it would not otherwise + * assemble + * assemble - not sure how this is different from force-one... 
+ * linear-grow-new - add a new device to a linear array, but don't + * change the size: so superblock still matches + * linear-grow-update - now change the size of the array. + * writemostly - set the WriteMostly1 bit in the superblock devflags + * readwrite - clear the WriteMostly1 bit in the superblock devflags + * no-bitmap - clear any record that a bitmap is present. + * bbl - add a bad-block-log if possible + * no-bbl - remove any bad-block-log if it is empty. + * force-no-bbl - remove any bad-block-log even if not empty. + * revert-reshape - If a reshape is in progress, modify metadata so + * it will resume going in the opposite direction. + */ + int (*update_super)(struct supertype *st, struct mdinfo *info, + char *update, + char *devname, int verbose, + int uuid_set, char *homehost); + + /* Create new metadata for new array as described. This could + * be a new container, or an array in a pre-existing container. + * Also used to zero metadata prior to writing it to invalidate old + * metadata. + */ + int (*init_super)(struct supertype *st, mdu_array_info_t *info, + unsigned long long size, char *name, + char *homehost, int *uuid, + unsigned long long data_offset); + + /* update the metadata to include new device, either at create or + * when hot-adding a spare. + */ + int (*add_to_super)(struct supertype *st, mdu_disk_info_t *dinfo, + int fd, char *devname, + unsigned long long data_offset); + /* update the metadata to delete a device, + * when hot-removing. + */ + int (*remove_from_super)(struct supertype *st, mdu_disk_info_t *dinfo); + + /* Write metadata to one device when fixing problems or adding + * a new device. + */ + int (*store_super)(struct supertype *st, int fd); + + /* Write all metadata for this array. 
+ */ + int (*write_init_super)(struct supertype *st); + /* Check if metadata read from one device is compatible with an array, + * used when assembling an array, or pseudo-assembling was with + * "--examine --brief" + * If "st" has not yet been loaded the superblock from, "tst" is + * moved in, otherwise the superblock in 'st' is compared with + * 'tst'. + */ + int (*compare_super)(struct supertype *st, struct supertype *tst); + /* Load metadata from a single device. If 'devname' is not NULL + * print error messages as appropriate */ + int (*load_super)(struct supertype *st, int fd, char *devname); + /* 'fd' is a 'container' md array - load array metadata from the + * whole container. + */ + int (*load_container)(struct supertype *st, int fd, char *devname); + /* If 'arg' is a valid name of this metadata type, allocate and + * return a 'supertype' for the particular minor version */ + struct supertype * (*match_metadata_desc)(char *arg); + /* If a device has the given size, and the data_offset has been + * requested - work out how much space is available for data. + * This involves adjusting for reserved space (e.g. bitmaps) + * and for any rounding. + * 'mdadm' only calls this for existing arrays where a possible + * spare is being added. However some super-handlers call it + * internally from validate_geometry when creating an array. + */ + __u64 (*avail_size)(struct supertype *st, __u64 size, + unsigned long long data_offset); + /* This is similar to 'avail_size' in purpose, but is used for + * containers for which there is no 'component size' to compare. + * This reports that whole-device size which is a minimum + */ + unsigned long long (*min_acceptable_spare_size)(struct supertype *st); + /* Find somewhere to put a bitmap - possibly auto-size it - and + * update the metadata to record this. The array may be newly + * created, in which case data_size may be updated, or it might + * already exist. 
Metadata handler can know if init_super + * has been called, but not write_init_super. + */ + int (*add_internal_bitmap)(struct supertype *st, int *chunkp, + int delay, int write_behind, + unsigned long long size, int may_change, int major); + /* Seek 'fd' to start of write-intent-bitmap. Must be an + * md-native format bitmap + */ + int (*locate_bitmap)(struct supertype *st, int fd); + /* if add_internal_bitmap succeeded for existing array, this + * writes it out. + */ + int (*write_bitmap)(struct supertype *st, int fd, enum bitmap_update update); + /* Free the superblock and any other allocated data */ + void (*free_super)(struct supertype *st); + + /* validate_geometry is called with an st returned by + * match_metadata_desc. + * It should check that the geometry described is compatible with + * the metadata type. It will be called repeatedly as devices + * added to validate changing size and new devices. If there are + * inter-device dependencies, it should record sufficient details + * so these can be validated. + * Both 'size' and '*freesize' are in sectors. chunk is KiB. + * Return value is: + * 1: everything is OK + * 0: not OK for some reason - if 'verbose', then error was reported. + * -1: st->sb was NULL, 'subdev' is a member of a container of this + * type, but array is not acceptable for some reason + * message was reported even if verbose is 0. + */ + int (*validate_geometry)(struct supertype *st, int level, int layout, + int raiddisks, + int *chunk, unsigned long long size, + unsigned long long data_offset, + char *subdev, unsigned long long *freesize, + int verbose); + + /* Return a linked list of 'mdinfo' structures for all arrays + * in the container. 
For non-containers, it is like + * getinfo_super with an allocated mdinfo.*/ + struct mdinfo *(*container_content)(struct supertype *st, char *subarray); + /* query the supertype for default geometry */ + void (*default_geometry)(struct supertype *st, int *level, int *layout, int *chunk); /* optional */ + /* Permit subarray's to be deleted from inactive containers */ + int (*kill_subarray)(struct supertype *st); /* optional */ + /* Permit subarray's to be modified */ + int (*update_subarray)(struct supertype *st, char *subarray, + char *update, struct mddev_ident *ident); /* optional */ + /* Check if reshape is supported for this external format. + * st is obtained from super_by_fd() where st->subarray[0] is + * initialized to indicate if reshape is being performed at the + * container or subarray level + */ +#define APPLY_METADATA_CHANGES 1 +#define ROLLBACK_METADATA_CHANGES 0 + + int (*reshape_super)(struct supertype *st, + unsigned long long size, int level, + int layout, int chunksize, int raid_disks, + int delta_disks, char *backup, char *dev, + int direction, + int verbose); /* optional */ + int (*manage_reshape)( /* optional */ + int afd, struct mdinfo *sra, struct reshape *reshape, + struct supertype *st, unsigned long blocks, + int *fds, unsigned long long *offsets, + int dests, int *destfd, unsigned long long *destoffsets); + +/* for mdmon */ + int (*open_new)(struct supertype *c, struct active_array *a, + char *inst); + + /* Tell the metadata handler the current state of the array. + * This covers whether it is known to be consistent (no pending writes) + * and how far along a resync is known to have progressed + * (in a->resync_start). + * resync status is really irrelevant if the array is not consistent, + * but some metadata (DDF!) have a place to record the distinction. + * If 'consistent' is '2', then the array can mark it dirty if a + * resync/recovery/whatever is required, or leave it clean if not. 
+ * Return value is 0 dirty (not consistent) and 1 if clean. + * it is only really important if consistent is passed in as '2'. + */ + int (*set_array_state)(struct active_array *a, int consistent); + + /* When the state of a device might have changed, we call set_disk to + * tell the metadata what the current state is. + * Typically this happens on spare->in_sync and (spare|in_sync)->faulty + * transitions. + * set_disk might be called when the state of the particular disk has + * not in fact changed. + */ + void (*set_disk)(struct active_array *a, int n, int state); + void (*sync_metadata)(struct supertype *st); + void (*process_update)(struct supertype *st, + struct metadata_update *update); + /* Prepare updates allocates extra memory that might be + * needed. If the update cannot be understood, return 0. + */ + int (*prepare_update)(struct supertype *st, + struct metadata_update *update); + + /* activate_spare will check if the array is degraded and, if it + * is, try to find some spare space in the container. + * On success, it add appropriate updates (For process_update) to + * to the 'updates' list and returns a list of 'mdinfo' identifying + * the device, or devices as there might be multiple missing + * devices and multiple spares available. + */ + struct mdinfo *(*activate_spare)(struct active_array *a, + struct metadata_update **updates); + /* + * Return statically allocated string that represents metadata specific + * controller domain of the disk. The domain is used in disk domain + * matching functions. Disks belong to the same domain if the they have + * the same domain from mdadm.conf and belong the same metadata domain. + * Returning NULL or not providing this handler means that metadata + * does not distinguish the differences between disks that belong to + * different controllers. They are in the domain specified by + * configuration file (mdadm.conf). 
+ * In case when the metadata has the notion of domains based on disk + * it shall return NULL for disks that do not belong to the controller + * the supported domains. Such disks will form another domain and won't + * be mixed with supported ones. + */ + const char *(*get_disk_controller_domain)(const char *path); + + /* for external backup area */ + int (*recover_backup)(struct supertype *st, struct mdinfo *info); + + /* validate container after assemble */ + int (*validate_container)(struct mdinfo *info); + + int swapuuid; /* true if uuid is bigending rather than hostendian */ + int external; + const char *name; /* canonical metadata name */ +} *superlist[]; + +extern struct superswitch super0, super1; +extern struct superswitch super_imsm, super_ddf; +extern struct superswitch mbr, gpt; + +struct metadata_update { + int len; + char *buf; + void *space; /* allocated space that monitor will use */ + void **space_list; /* list of allocated spaces that monitor can + * use or that it returned. + */ + struct metadata_update *next; +}; + +/* A supertype holds a particular collection of metadata. + * It identifies the metadata type by the superswitch, and the particular + * sub-version of that metadata type. + * metadata read in or created is stored in 'sb' and 'info'. + * There are also fields used by mdmon to track containers. + * + * A supertype may refer to: + * Just an array, possibly in a container + * A container, not identifying any particular array + * Info read from just one device, not yet fully describing the array/container. 
+ * + * + * A supertype is created by: + * super_by_fd + * guess_super + * dup_super + */ +struct supertype { + struct superswitch *ss; + int minor_version; + int max_devs; + char container_devnm[32]; /* devnm of container */ + void *sb; + void *info; + void *other; /* Hack used to convert v0.90 to v1.0 */ + unsigned long long devsize; + unsigned long long data_offset; /* used by v1.x only */ + int ignore_hw_compat; /* used to inform metadata handlers that it should ignore + HW/firmware related incompatability to load metadata. + Used when examining metadata to display content of disk + when user has no hw/firmare compatible system. + */ + struct metadata_update *updates; + struct metadata_update **update_tail; + + /* extra stuff used by mdmon */ + struct active_array *arrays; + int sock; /* listen to external programs */ + char devnm[32]; /* e.g. md0. This appears in metadata_version: + * external:/md0/12 + */ + int devcnt; + int retry_soon; + int nodes; + char *cluster_name; + + struct mdinfo *devs; + +}; + +extern struct supertype *super_by_fd(int fd, char **subarray); +enum guess_types { guess_any, guess_array, guess_partitions }; +extern struct supertype *guess_super_type(int fd, enum guess_types guess_type); +static inline struct supertype *guess_super(int fd) { + return guess_super_type(fd, guess_any); +} +extern struct supertype *dup_super(struct supertype *st); +extern int get_dev_size(int fd, char *dname, unsigned long long *sizep); +extern int must_be_container(int fd); +extern int dev_size_from_id(dev_t id, unsigned long long *size); +void wait_for(char *dev, int fd); + +/* + * Data structures for policy management. + * Each device can have a policy structure that lists + * various name/value pairs each possibly with a metadata associated. + * The policy list is sorted by name/value/metadata + */ +struct dev_policy { + struct dev_policy *next; + char *name; /* None of these strings are allocated. 
They are + * all just references to strings which are known + * to exist elsewhere. + * name and metadata can be compared by address equality. + */ + const char *metadata; + const char *value; +}; + +extern char pol_act[], pol_domain[], pol_metadata[], pol_auto[]; + +/* iterate over the sublist starting at list, having the same + * 'name' as 'list', and matching the given metadata (Where + * NULL matches anything + */ +#define pol_for_each(item, list, _metadata) \ + for (item = list; \ + item && item->name == list->name; \ + item = item->next) \ + if (!(!_metadata || !item->metadata || _metadata == item->metadata)) \ + ; else + +/* + * policy records read from mdadm are largely just name-value pairs. + * The names are constants, not strdupped + */ +struct pol_rule { + struct pol_rule *next; + char *type; /* rule_policy or rule_part */ + struct rule { + struct rule *next; + char *name; + char *value; + char *dups; /* duplicates of 'value' with a partNN appended */ + } *rule; +}; + +extern char rule_policy[], rule_part[]; +extern char rule_path[], rule_type[]; +extern char type_part[], type_disk[]; + +extern void policyline(char *line, char *type); +extern void policy_add(char *type, ...); +extern void policy_free(void); + +extern struct dev_policy *path_policy(char *path, char *type); +extern struct dev_policy *disk_policy(struct mdinfo *disk); +extern struct dev_policy *devid_policy(int devid); +extern void dev_policy_free(struct dev_policy *p); + +//extern void pol_new(struct dev_policy **pol, char *name, char *val, char *metadata); +extern void pol_add(struct dev_policy **pol, char *name, char *val, char *metadata); +extern struct dev_policy *pol_find(struct dev_policy *pol, char *name); + +enum policy_action { + act_default, + act_include, + act_re_add, + act_spare, /* This only applies to bare devices */ + act_spare_same_slot, /* this allows non-bare devices, + * but only if recent removal */ + act_force_spare, /* this allow non-bare devices in any case */ + 
act_err +}; + +extern int policy_action_allows(struct dev_policy *plist, const char *metadata, + enum policy_action want); +extern int disk_action_allows(struct mdinfo *disk, const char *metadata, + enum policy_action want); + +struct domainlist { + struct domainlist *next; + const char *dom; +}; + +extern int domain_test(struct domainlist *dom, struct dev_policy *pol, + const char *metadata); +extern struct domainlist *domain_from_array(struct mdinfo *mdi, + const char *metadata); +extern void domainlist_add_dev(struct domainlist **dom, int devid, + const char *metadata); +extern void domain_free(struct domainlist *dl); +extern void domain_merge(struct domainlist **domp, struct dev_policy *pol, + const char *metadata); +void domain_add(struct domainlist **domp, char *domain); + +extern void policy_save_path(char *id_path, struct map_ent *array); +extern int policy_check_path(struct mdinfo *disk, struct map_ent *array); + +#if __GNUC__ < 3 +struct stat64; +#endif + +#define HAVE_NFTW we assume +#define HAVE_FTW + +#ifdef __UCLIBC__ +# include +# ifndef __UCLIBC_HAS_LFS__ +# define lseek64 lseek +# endif +# ifndef __UCLIBC_HAS_FTW__ +# undef HAVE_FTW +# undef HAVE_NFTW +# endif +#endif + +#ifdef __dietlibc__ +# undef HAVE_NFTW +#endif + +#if defined(__KLIBC__) +# undef HAVE_NFTW +# undef HAVE_FTW +#endif + +#ifndef HAVE_NFTW +# define FTW_PHYS 1 +# ifndef HAVE_FTW + struct FTW {}; +# endif +#endif + +#ifdef HAVE_FTW +# include +#endif + +extern int add_dev(const char *name, const struct stat *stb, int flag, struct FTW *s); + +extern int Manage_ro(char *devname, int fd, int readonly); +extern int Manage_run(char *devname, int fd, struct context *c); +extern int Manage_stop(char *devname, int fd, int quiet, + int will_retry); +extern int Manage_subdevs(char *devname, int fd, + struct mddev_dev *devlist, int verbose, int test, + char *update, int force); +extern int autodetect(void); +extern int Grow_Add_device(char *devname, int fd, char *newdev); +extern int 
Grow_addbitmap(char *devname, int fd, + struct context *c, struct shape *s); +extern int Grow_reshape(char *devname, int fd, + struct mddev_dev *devlist, + unsigned long long data_offset, + struct context *c, struct shape *s); +extern int Grow_restart(struct supertype *st, struct mdinfo *info, + int *fdlist, int cnt, char *backup_file, int verbose); +extern int Grow_continue(int mdfd, struct supertype *st, + struct mdinfo *info, char *backup_file, + int forked, int freeze_reshape); + +extern int restore_backup(struct supertype *st, + struct mdinfo *content, + int working_disks, + int spares, + char **backup_filep, + int verbose); +extern int Grow_continue_command(char *devname, int fd, + char *backup_file, int verbose); + +extern int Assemble(struct supertype *st, char *mddev, + struct mddev_ident *ident, + struct mddev_dev *devlist, + struct context *c); + +extern int Build(char *mddev, struct mddev_dev *devlist, + struct shape *s, struct context *c); + +extern int Create(struct supertype *st, char *mddev, + char *name, int *uuid, + int subdevs, struct mddev_dev *devlist, + struct shape *s, + struct context *c, + unsigned long long data_offset); + +extern int Detail(char *dev, struct context *c); +extern int Detail_Platform(struct superswitch *ss, int scan, int verbose, int export, char *controller_path); +extern int Query(char *dev); +extern int ExamineBadblocks(char *devname, int brief, struct supertype *forcest); +extern int Examine(struct mddev_dev *devlist, struct context *c, + struct supertype *forcest); +extern int Monitor(struct mddev_dev *devlist, + char *mailaddr, char *alert_cmd, + struct context *c, + int daemonise, int oneshot, + int dosyslog, char *pidfile, int increments, + int share); + +extern int Kill(char *dev, struct supertype *st, int force, int verbose, int noexcl); +extern int Kill_subarray(char *dev, char *subarray, int verbose); +extern int Update_subarray(char *dev, char *subarray, char *update, struct mddev_ident *ident, int quiet); 
+extern int Wait(char *dev); +extern int WaitClean(char *dev, int sock, int verbose); +extern int SetAction(char *dev, char *action); + +extern int Incremental(struct mddev_dev *devlist, struct context *c, + struct supertype *st); +extern void RebuildMap(void); +extern int IncrementalScan(struct context *c, char *devnm); +extern int IncrementalRemove(char *devname, char *path, int verbose); +extern int CreateBitmap(char *filename, int force, char uuid[16], + unsigned long chunksize, unsigned long daemon_sleep, + unsigned long write_behind, + unsigned long long array_size, + int major); +extern int ExamineBitmap(char *filename, int brief, struct supertype *st); +extern int Write_rules(char *rule_name); +extern int bitmap_update_uuid(int fd, int *uuid, int swap); +extern unsigned long bitmap_sectors(struct bitmap_super_s *bsb); +extern int Dump_metadata(char *dev, char *dir, struct context *c, + struct supertype *st); +extern int Restore_metadata(char *dev, char *dir, struct context *c, + struct supertype *st, int only); + +extern int md_get_version(int fd); +extern int get_linux_version(void); +extern int mdadm_version(char *version); +extern unsigned long long parse_size(char *size); +extern int parse_uuid(char *str, int uuid[4]); +extern int parse_layout_10(char *layout); +extern int parse_layout_faulty(char *layout); +extern long parse_num(char *num); +extern int parse_cluster_confirm_arg(char *inp, char **devname, int *slot); +extern int check_ext2(int fd, char *name); +extern int check_reiser(int fd, char *name); +extern int check_raid(int fd, char *name); +extern int check_partitions(int fd, char *dname, + unsigned long long freesize, + unsigned long long size); + +extern int get_mdp_major(void); +extern int get_maj_min(char *dev, int *major, int *minor); +extern int dev_open(char *dev, int flags); +extern int open_dev(char *devnm); +extern void reopen_mddev(int mdfd); +extern int open_dev_flags(char *devnm, int flags); +extern int open_dev_excl(char *devnm); 
+extern int is_standard(char *dev, int *nump); +extern int same_dev(char *one, char *two); +extern int compare_paths (char* path1,char* path2); +extern void enable_fds(int devices); + +extern int parse_auto(char *str, char *msg, int config); +extern struct mddev_ident *conf_get_ident(char *dev); +extern struct mddev_dev *conf_get_devs(void); +extern int conf_test_dev(char *devname); +extern int conf_test_metadata(const char *version, struct dev_policy *pol, int is_homehost); +extern struct createinfo *conf_get_create_info(void); +extern void set_conffile(char *file); +extern char *conf_get_mailaddr(void); +extern char *conf_get_mailfrom(void); +extern char *conf_get_program(void); +extern char *conf_get_homehost(int *require_homehostp); +extern char *conf_get_homecluster(void); +extern char *conf_line(FILE *file); +extern char *conf_word(FILE *file, int allow_key); +extern void print_quoted(char *str); +extern void print_escape(char *str); +extern int use_udev(void); +extern unsigned long GCD(unsigned long a, unsigned long b); +extern int conf_name_is_free(char *name); +extern int conf_verify_devnames(struct mddev_ident *array_list); +extern int devname_matches(char *name, char *match); +extern struct mddev_ident *conf_match(struct supertype *st, + struct mdinfo *info, + char *devname, + int verbose, int *rvp); +extern int experimental(void); + +extern void free_line(char *line); +extern int match_oneof(char *devices, char *devname); +extern void uuid_from_super(int uuid[4], mdp_super_t *super); +extern const int uuid_zero[4]; +extern int same_uuid(int a[4], int b[4], int swapuuid); +extern void copy_uuid(void *a, int b[4], int swapuuid); +extern char *__fname_from_uuid(int id[4], int swap, char *buf, char sep); +extern char *fname_from_uuid(struct supertype *st, + struct mdinfo *info, char *buf, char sep); +extern unsigned long calc_csum(void *super, int bytes); +extern int enough(int level, int raid_disks, int layout, int clean, + char *avail); +extern int 
enough_fd(int fd); +extern int ask(char *mesg); +extern unsigned long long get_component_size(int fd); +extern void remove_partitions(int fd); +extern int test_partition(int fd); +extern int test_partition_from_id(dev_t id); +extern int get_data_disks(int level, int layout, int raid_disks); +extern unsigned long long calc_array_size(int level, int raid_disks, int layout, + int chunksize, unsigned long long devsize); +extern int flush_metadata_updates(struct supertype *st); +extern void append_metadata_update(struct supertype *st, void *buf, int len); +extern int assemble_container_content(struct supertype *st, int mdfd, + struct mdinfo *content, + struct context *c, + char *chosen_name, int *result); +#define INCR_NO 1 +#define INCR_UNSAFE 2 +#define INCR_ALREADY 4 +#define INCR_YES 8 +extern struct mdinfo *container_choose_spares(struct supertype *st, + unsigned long long min_size, + struct domainlist *domlist, + char *spare_group, + const char *metadata, int get_one); +extern int move_spare(char *from_devname, char *to_devname, dev_t devid); +extern int add_disk(int mdfd, struct supertype *st, + struct mdinfo *sra, struct mdinfo *info); +extern int remove_disk(int mdfd, struct supertype *st, + struct mdinfo *sra, struct mdinfo *info); +extern int set_array_info(int mdfd, struct supertype *st, struct mdinfo *info); +unsigned long long min_recovery_start(struct mdinfo *array); + +extern char *human_size(long long bytes); +extern char *human_size_brief(long long bytes, int prefix); +extern void print_r10_layout(int layout); + +extern char *find_free_devnm(int use_partitions); + +extern void put_md_name(char *name); +extern char *devid2kname(int devid); +extern char *devid2devnm(int devid); +extern int devnm2devid(char *devnm); +extern char *get_md_name(char *devnm); + +extern char DefaultConfFile[]; + +extern int create_mddev(char *dev, char *name, int autof, int trustworthy, + char *chosen); +/* values for 'trustworthy' */ +#define LOCAL 1 +#define LOCAL_ANY 10 
+#define FOREIGN 2 +#define METADATA 3 +extern int open_mddev(char *dev, int report_errors); +extern int open_container(int fd); +extern int metadata_container_matches(char *metadata, char *devnm); +extern int metadata_subdev_matches(char *metadata, char *devnm); +extern int is_container_member(struct mdstat_ent *ent, char *devname); +extern int is_subarray_active(char *subarray, char *devname); +extern int open_subarray(char *dev, char *subarray, struct supertype *st, int quiet); +extern struct superswitch *version_to_superswitch(char *vers); + +extern int mdmon_running(char *devnm); +extern int mdmon_pid(char *devnm); +extern int check_env(char *name); +extern __u32 random32(void); +extern int start_mdmon(char *devnm); + +extern int child_monitor(int afd, struct mdinfo *sra, struct reshape *reshape, + struct supertype *st, unsigned long stripes, + int *fds, unsigned long long *offsets, + int dests, int *destfd, unsigned long long *destoffsets); +void abort_reshape(struct mdinfo *sra); + +void *super1_make_v0(struct supertype *st, struct mdinfo *info, mdp_super_t *sb0); + +extern void fmt_devname(char *name, int num); +extern char *stat2devnm(struct stat *st); +extern char *fd2devnm(int fd); + +extern int in_initrd(void); + +struct cmap_hooks { + void *cmap_handle; /* corosync lib related */ + + int (*initialize)(cmap_handle_t *handle); + int (*get_string)(cmap_handle_t handle, + const char *string, + char **name); + int (*finalize)(cmap_handle_t handle); +}; + +extern void set_cmap_hooks(void); +extern void set_hooks(void); + +struct dlm_hooks { + void *dlm_handle; /* dlm lib related */ + + dlm_lshandle_t (*create_lockspace)(const char *name, + unsigned int mode); + int (*release_lockspace)(const char *name, dlm_lshandle_t ls, + int force); + int (*ls_lock)(dlm_lshandle_t lockspace, uint32_t mode, + struct dlm_lksb *lksb, uint32_t flags, + const void *name, unsigned int namelen, + uint32_t parent, void (*astaddr) (void *astarg), + void *astarg, void (*bastaddr) 
(void *astarg), + void *range); + int (*ls_unlock)(dlm_lshandle_t lockspace, uint32_t lkid, + uint32_t flags, struct dlm_lksb *lksb, + void *astarg); + int (*ls_get_fd)(dlm_lshandle_t ls); + int (*dispatch)(int fd); +}; + +extern int get_cluster_name(char **name); +extern int dlm_funs_ready(void); +extern int cluster_get_dlmlock(int *lockid); +extern int cluster_release_dlmlock(int lockid); +extern void set_dlm_hooks(void); + +#define _ROUND_UP(val, base) (((val) + (base) - 1) & ~(base - 1)) +#define ROUND_UP(val, base) _ROUND_UP(val, (typeof(val))(base)) +#define ROUND_UP_PTR(ptr, base) ((typeof(ptr)) \ + (ROUND_UP((unsigned long)(ptr), base))) + +static inline int is_subarray(char *vers) +{ + /* The version string for a 'subarray' (an array in a container) + * is + * /containername/componentname for normal read-write arrays + * -containername/componentname for arrays which mdmon must not + * reconfigure. They might be read-only + * or might be undergoing reshape etc. + * containername is e.g. md0, md_d1 + * componentname is dependant on the metadata. e.g. '1' 'S1' ... + */ + return (*vers == '/' || *vers == '-'); +} + +static inline char *to_subarray(struct mdstat_ent *ent, char *container) +{ + return &ent->metadata_version[10+strlen(container)+1]; +} + +#ifdef DEBUG +#define dprintf(fmt, arg...) \ + fprintf(stderr, "%s: %s: "fmt, Name, __func__, ##arg) +#define dprintf_cont(fmt, arg...) \ + fprintf(stderr, fmt, ##arg) +#else +#define dprintf(fmt, arg...) \ + ({ if (0) fprintf(stderr, "%s: %s: " fmt, Name, __func__, ##arg); 0; }) +#define dprintf_cont(fmt, arg...) \ + ({ if (0) fprintf(stderr, fmt, ##arg); 0; }) +#endif +#include +#include +static inline int xasprintf(char **strp, const char *fmt, ...) { + va_list ap; + int ret; + va_start(ap, fmt); + ret = vasprintf(strp, fmt, ap); + va_end(ap); + assert(ret >= 0); + return ret; +} + +#ifdef DEBUG +#define pr_err(fmt, args...) 
fprintf(stderr, "%s: %s: "fmt, Name, __func__, ##args) +#else +#define pr_err(fmt, args...) fprintf(stderr, "%s: "fmt, Name, ##args) +#endif +#define cont_err(fmt ...) fprintf(stderr, " " fmt) + +void *xmalloc(size_t len); +void *xrealloc(void *ptr, size_t len); +void *xcalloc(size_t num, size_t size); +char *xstrdup(const char *str); + +#define LEVEL_MULTIPATH (-4) +#define LEVEL_LINEAR (-1) +#define LEVEL_FAULTY (-5) + +/* kernel module doesn't know about these */ +#define LEVEL_CONTAINER (-100) +#define LEVEL_UNSUPPORTED (-200) + +/* the kernel does know about this one ... */ +#define LEVEL_NONE (-1000000) + +/* faulty stuff */ + +#define WriteTransient 0 +#define ReadTransient 1 +#define WritePersistent 2 +#define ReadPersistent 3 +#define WriteAll 4 /* doesn't go to device */ +#define ReadFixable 5 +#define Modes 6 + +#define ClearErrors 31 +#define ClearFaults 30 + +#define AllPersist 100 /* internal use only */ +#define NoPersist 101 + +#define ModeMask 0x1f +#define ModeShift 5 + +#ifdef __TINYC__ +#undef minor +#undef major +#undef makedev +#define minor(x) ((x)&0xff) +#define major(x) (((x)>>8)&0xff) +#define makedev(M,m) (((M)<<8) | (m)) +#endif + +/* for raid4/5/6 */ +#define ALGORITHM_LEFT_ASYMMETRIC 0 +#define ALGORITHM_RIGHT_ASYMMETRIC 1 +#define ALGORITHM_LEFT_SYMMETRIC 2 +#define ALGORITHM_RIGHT_SYMMETRIC 3 + +/* Define non-rotating (raid4) algorithms. These allow + * conversion of raid4 to raid5. + */ +#define ALGORITHM_PARITY_0 4 /* P or P,Q are initial devices */ +#define ALGORITHM_PARITY_N 5 /* P or P,Q are final devices. */ + +/* DDF RAID6 layouts differ from md/raid6 layouts in two ways. + * Firstly, the exact positioning of the parity block is slightly + * different between the 'LEFT_*' modes of md and the "_N_*" modes + * of DDF. + * Secondly, or order of datablocks over which the Q syndrome is computed + * is different. + * Consequently we have different layouts for DDF/raid6 than md/raid6. + * These layouts are from the DDFv1.2 spec. 
+ * Interestingly DDFv1.2-Errata-A does not specify N_CONTINUE but + * leaves RLQ=3 as 'Vendor Specific' + */ + +#define ALGORITHM_ROTATING_ZERO_RESTART 8 /* DDF PRL=6 RLQ=1 */ +#define ALGORITHM_ROTATING_N_RESTART 9 /* DDF PRL=6 RLQ=2 */ +#define ALGORITHM_ROTATING_N_CONTINUE 10 /*DDF PRL=6 RLQ=3 */ + +/* For every RAID5 algorithm we define a RAID6 algorithm + * with exactly the same layout for data and parity, and + * with the Q block always on the last device (N-1). + * This allows trivial conversion from RAID5 to RAID6 + */ +#define ALGORITHM_LEFT_ASYMMETRIC_6 16 +#define ALGORITHM_RIGHT_ASYMMETRIC_6 17 +#define ALGORITHM_LEFT_SYMMETRIC_6 18 +#define ALGORITHM_RIGHT_SYMMETRIC_6 19 +#define ALGORITHM_PARITY_0_6 20 +#define ALGORITHM_PARITY_N_6 ALGORITHM_PARITY_N + +/* Define PATH_MAX in case we don't use glibc or standard library does + * not have PATH_MAX defined. Assume max path length is 4K characters. + */ +#ifndef PATH_MAX +#define PATH_MAX 4096 +#endif + +#define RESYNC_NONE -1 +#define RESYNC_DELAYED -2 +#define RESYNC_PENDING -3 +#define RESYNC_UNKNOWN -4 + +/* When using "GET_DISK_INFO" it isn't certain how high + * we need to check. So we impose an absolute limit of + * MAX_DISKS. This needs to be much more than the largest + * number of devices any metadata can support. Currently + * v1.x can support 1920 + */ +#define MAX_DISKS 4096 + +/* Sometimes the 'size' value passed needs to mean "Maximum". + * In those cases with use MAX_SIZE + */ +#define MAX_SIZE 1 + +/* We want to use unsigned numbers for sector counts, but need + * a value for 'invalid'. Use '1'. 
+ */ +#define INVALID_SECTORS 1 +/* And another special number needed for --data_offset=variable */ +#define VARIABLE_OFFSET 3 diff --git a/mdadm.spec b/mdadm.spec new file mode 100644 index 00000000..685a5642 --- /dev/null +++ b/mdadm.spec @@ -0,0 +1,45 @@ +Summary: mdadm is used for controlling Linux md devices (aka RAID arrays) +Name: mdadm +Version: 3.4 +Release: 1 +Source: http://www.kernel.org/pub/linux/utils/raid/mdadm/mdadm-%{version}.tar.gz +URL: http://neil.brown.name/blog/mdadm +License: GPL +Group: Utilities/System +BuildRoot: %{_tmppath}/%{name}-root +Obsoletes: mdctl + +%description +mdadm is a program that can be used to create, manage, and monitor +Linux MD (Software RAID) devices. + +%prep +%setup -q +# we want to install in /sbin, not /usr/sbin... +%define _exec_prefix %{nil} + +%build +# This is a debatable issue. The author of this RPM spec file feels that +# people who install RPMs (especially given that the default RPM options +# will strip the binary) are not going to be running gdb against the +# program. 
+make CXFLAGS="$RPM_OPT_FLAGS" SYSCONFDIR="%{_sysconfdir}" + +%install +make DESTDIR=$RPM_BUILD_ROOT MANDIR=%{_mandir} BINDIR=%{_sbindir} install +install -D -m644 mdadm.conf-example $RPM_BUILD_ROOT/%{_sysconfdir}/mdadm.conf + +%clean +rm -rf $RPM_BUILD_ROOT + +%files +%defattr(-,root,root) +%doc TODO ChangeLog mdadm.conf-example COPYING +%{_sbindir}/mdadm +%{_sbindir}/mdmon +/usr/lib/udev/rules.d/63-md-raid-arrays.rules +/usr/lib/udev/rules.d/64-md-raid-assembly.rules +%config(noreplace,missingok)/%{_sysconfdir}/mdadm.conf +%{_mandir}/man*/md* + +%changelog diff --git a/mdassemble.8 b/mdassemble.8 new file mode 100644 index 00000000..6cb005c5 --- /dev/null +++ b/mdassemble.8 @@ -0,0 +1,65 @@ +.\" -*- nroff -*- +.TH MDASSEMBLE 8 "" v3.4 +.SH NAME +mdassemble \- assemble MD devices +.I aka +Linux Software RAID + +.SH SYNOPSIS + +.BI mdassemble + +.SH DESCRIPTION +.B mdassemble +is a tiny program that can be used to assemble MD devices inside an +initial ramdisk (initrd) or initramfs; it is meant to replace the in-kernel +automatic RAID detection and activation. +It can be built statically and linked against lightweight libc alternatives, like +.B dietlibc, +.B klibc +or +.B uClibc. + +.SH USAGE +Invoking +.B mdassemble +has the same effect as invoking +.B mdadm \-\-assemble \-\-scan. +.PP +Invoking +.B mdassemble +a second time will make all defined arrays readwrite, this is useful if +using the +.B start_ro +module parameter. + +.SH OPTIONS + +There are no options to +.B mdassemble. + +.SH FILES + +.SS /etc/mdadm/mdadm.conf + +The config file lists which devices may be scanned to see if +they contain MD super block, and gives identifying information +(e.g. UUID) about known MD arrays. See +.BR mdadm.conf (5) +for more details. + +.B mdassemble +supports all configuration parameters defined in +.B mdadm.conf +with the exception of +.B auto= +which is supported only if mdadm was built with the +.B \-DMDASSEMBLE_AUTO +define. 
+ +.SH SEE ALSO +.PP +.BR mdadm (8), +.BR mdadm.conf (5), +.BR md (4), +.BR diet (1). diff --git a/mdassemble.c b/mdassemble.c new file mode 100644 index 00000000..78d363a3 --- /dev/null +++ b/mdassemble.c @@ -0,0 +1,80 @@ +/* + * mdassemble - assemble Linux "md" devices aka RAID arrays. + * + * Copyright (C) 2001-2009 Neil Brown + * Copyright (C) 2003 Luca Berra + * + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Author: Neil Brown + * Email: + */ + +#include "mdadm.h" +#include "md_p.h" + +char const Name[] = "mdassemble"; + +#ifndef MDASSEMBLE_AUTO +/* from mdopen.c */ +int open_mddev(char *dev, int report_errors/*unused*/) +{ + int mdfd = open(dev, O_RDWR); + if (mdfd < 0) + pr_err("error opening %s: %s\n", + dev, strerror(errno)); + else if (md_get_version(mdfd) <= 0) { + pr_err("%s does not appear to be an md device\n", + dev); + close(mdfd); + mdfd = -1; + } + return mdfd; +} +int create_mddev(char *dev, char *name, int autof/*unused*/, int trustworthy, + char *chosen) +{ + return open_mddev(dev, 0); +} +#endif + +int rv; +int mdfd = -1; + +int main(int argc, char *argv[]) +{ + struct mddev_ident *array_list = conf_get_ident(NULL); + struct context c = { .freeze_reshape = 1 }; + if (!array_list) { + pr_err("No arrays found in config file\n"); + rv = 1; + } else + for (; array_list; 
array_list = array_list->next) { + mdu_array_info_t array; + if (strcasecmp(array_list->devname, "") == 0) + continue; + mdfd = open_mddev(array_list->devname, 0); + if (mdfd >= 0 && ioctl(mdfd, GET_ARRAY_INFO, &array) == 0) { + rv |= Manage_ro(array_list->devname, mdfd, -1); /* make it readwrite */ + continue; + } + if (mdfd >= 0) + close(mdfd); + rv |= Assemble(array_list->st, array_list->devname, + array_list, NULL, &c); + } + return rv; +} diff --git a/mdmon-design.txt b/mdmon-design.txt new file mode 100644 index 00000000..f09184a9 --- /dev/null +++ b/mdmon-design.txt @@ -0,0 +1,146 @@ + +When managing a RAID1 array which uses metadata other than the +"native" metadata understood by the kernel, mdadm makes use of a +partner program named 'mdmon' to manage some aspects of updating +that metadata and synchronising the metadata with the array state. + +This document provides some details on how mdmon works. + +Containers +---------- + +As background: mdadm makes a distinction between an 'array' and a +'container'. Other sources sometimes use the term 'volume' or +'device' for an 'array', and may use the term 'array' for a +'container'. + +For our purposes: + - a 'container' is a collection of devices which are described by a + single set of metadata. The metadata may be stored equally + on all devices, or different devices may have quite different + subsets of the total metadata. But there is conceptually one set + of metadata that unifies the devices. + + - an 'array' is a set of datablock from various devices which + together are used to present the abstraction of a single linear + sequence of block, which may provide data redundancy or enhanced + performance. + +So a container has some metadata and provides a number of arrays which +are described by that metadata. + +Sometimes this model doesn't work perfectly. 
For example, global +spares may have their own metadata which is quite different from the +metadata from any device that participates in one or more arrays. +Such a global spare might still need to belong to some container so +that it is available to be used should a failure arise. In that case +we consider the 'metadata' to be the union of the metadata on the +active devices which describes the arrays, and the metadata on the +global spares which only describes the spares. In this case different +devices in the one container will have quite different metadata. + + +Purpose +------- + +The main purpose of mdmon is to update the metadata in response to +changes to the array which need to be reflected in the metadata before +future writes to the array can safely be performed. +These include: + - transitions from 'clean' to 'dirty'. + - recording that devices have failed. + - recording the progress of a 'reshape' + +This requires mdmon to be running at any time that the array is +writable (a read-only array does not require mdmon to be running). + +Because mdmon must be able to process these metadata updates at any +time, it must (when running) have exclusive write access to the +metadata. Any other changes (e.g. reconfiguration of the array) must +go through mdmon. + +A secondary role for mdmon is to activate spares when a device fails. +This role is much less time-critical than the other metadata updates, +so it could be performed by a separate process, possibly +"mdadm --monitor" which has a related role of moving devices between +arrays. A main reason for including this functionality in mdmon is +that in the native-metadata case this function is handled in the +kernel, and mdmon's reason for existence is to provide functionality +which is otherwise handled by the kernel. + + +Design overview +--------------- + +mdmon is structured as two threads with a common address space and +common data structures. These threads are known as the 'monitor' and +the 'manager'.
+ +The 'monitor' has the primary role of monitoring the array for +important state changes and updating the metadata accordingly. As +writes to the array can be blocked until 'monitor' completes and +acknowledges the update, it must be very careful not to block itself. +In particular it must not block waiting for any write to complete else +it could deadlock. This means that it must not allocate memory as +doing this can require dirty memory to be written out and if the +system chooses to write to the array that mdmon is monitoring, the +memory allocation could deadlock. + +So 'monitor' must never allocate memory and must limit the number of +other system calls it performs. It may: + - use select (or poll) to wait for activity on a file descriptor + - read from a sysfs file descriptor + - write to a sysfs file descriptor + - write the metadata out to the block devices using O_DIRECT + - send a signal (kill) to the manager thread + +It must not e.g. open files or do anything similar that might allocate +resources. + +The 'manager' thread does everything else that is needed. If any +files are to be opened (e.g. because a device has been added to the +array), the manager does that. If any memory needs to be allocated +(e.g. to hold data about a new array as can happen when one set of +metadata describes several arrays), the manager performs that +allocation. + +The 'manager' is also responsible for communicating with mdadm and +assigning spares to replace failed devices. + + +Handling metadata updates +------------------------- + +There are a number of cases in which mdadm needs to update the +metadata which mdmon is managing. These include: + - creating a new array in an active container + - adding a device to a container + - reconfiguring an array +etc. + +To complete these updates, mdadm must send a message to mdmon which +will merge the update into the metadata as it is at that moment.
+ +To achieve this, mdmon creates a Unix Domain Socket which the manager +thread listens on. mdadm sends a message over this socket. The +manager thread examines the message to see if it will require +allocating any memory and allocates it. This is done in the +'prepare_update' metadata method. + +The update message is then queued for handling by the monitor thread +which it will do when convenient. The monitor thread calls +->process_update which should atomically make the required changes to +the metadata, making use of the pre-allocated memory as required. Any +memory that is no longer needed can be placed back in the request and +the manager thread will free it. + +The exact format of a metadata update is up to the implementer of the +metadata handlers. It will simply describe a change that needs to be +made. It will sometimes contain fragments of the metadata to be +copied in to place. However the ->process_update routine must make +sure not to over-write any field that the monitor thread might have +updated, such as a 'device failed' or 'array is dirty' state. + +When the monitor thread has completed the update and written it to the +devices, an acknowledgement message is sent back over the socket so +that mdadm knows it is complete. diff --git a/mdmon.8 b/mdmon.8 new file mode 100644 index 00000000..cc6add8f --- /dev/null +++ b/mdmon.8 @@ -0,0 +1,257 @@ +.\" See file COPYING in distribution for details. +.TH MDMON 8 "" v3.4 +.SH NAME +mdmon \- monitor MD external metadata arrays + +.SH SYNOPSIS + +.BI mdmon " [--all] [--takeover] [--foreground] CONTAINER" + +.SH OVERVIEW +The 2.6.27 kernel brings the ability to support external metadata arrays. +External metadata implies that user space handles all updates to the metadata. +The kernel's responsibility is to notify user space when a "metadata event" +occurs, like disk failures and clean-to-dirty transitions. The kernel, in +important cases, waits for user space to take action on these notifications.
+ +.SH DESCRIPTION +.SS Metadata updates: +To service metadata update requests a daemon, +.IR mdmon , +is introduced. +.I Mdmon +is tasked with polling the sysfs namespace looking for changes in +.BR array_state , +.BR sync_action , +and per disk +.BR state +attributes. When a change is detected it calls a per metadata type +handler to make modifications to the metadata. The following actions +are taken: +.RS +.TP +.B array_state \- inactive +Clear the dirty bit for the volume and let the array be stopped +.TP +.B array_state \- write pending +Set the dirty bit for the array and then set +.B array_state +to +.BR active . +Writes +are blocked until userspace writes +.BR active. +.TP +.B array_state \- active-idle +The safe mode timer has expired so set array state to clean to block writes to the array +.TP +.B array_state \- clean +Clear the dirty bit for the volume +.TP +.B array_state \- read-only +This is the initial state that all arrays start at. +.I mdmon +takes one of the three actions: +.RS +.TP +1/ +Transition the array to read-auto keeping the dirty bit clear if the metadata +handler determines that the array does not need resyncing or other modification +.TP +2/ +Transition the array to active if the metadata handler determines a resync or +some other manipulation is necessary +.TP +3/ +Leave the array read\-only if the volume is marked to not be monitored; for +example, the metadata version has been set to "external:\-dev/md127" instead of +"external:/dev/md127" +.RE +.TP +.B sync_action \- resync\-to\-idle +Notify the metadata handler that a resync may have completed. If a resync +process is idled before it completes this event allows the metadata handler to +checkpoint resync. +.TP +.B sync_action \- recover\-to\-idle +A spare may have completed rebuilding so tell the metadata handler about the +state of each disk. This is the metadata handler's opportunity to clear +any "out-of-sync" bits and clear the volume's degraded status. 
If a recovery +process is idled before it completes this event allows the metadata handler to +checkpoint recovery. +.TP +.B <disk>/state \- faulty +A disk failure kicks off a series of events. First, notify the metadata +handler that a disk has failed, and then notify the kernel that it can unblock +writes that were dependent on this disk. After unblocking the kernel this disk +is set to be removed+ from the member array. Finally the disk is marked failed +in all other member arrays in the container. +.IP ++ Note This behavior differs slightly from native MD arrays where +removal is reserved for a +.B mdadm --remove +event. In the external metadata case the container holds the final +reference on a block device and a +.B mdadm --remove +call is still required. +.RE + +.SS Containers: +.P +External metadata formats, like DDF, differ from the native MD metadata +formats in that they define a set of disks and a series of sub-arrays +within those disks. MD metadata in comparison defines a 1:1 +relationship between a set of block devices and a RAID array. For +example to create 2 arrays at different RAID levels on a single +set of disks, MD metadata requires the disks be partitioned and then +each array can be created with a subset of those partitions. The +supported external formats perform this disk carving internally. +.P +Container devices simply hold references to all member disks and allow +tools like +.I mdmon +to determine which active arrays belong to which +container. Some array management commands like disk removal and disk +add are now only valid at the container level. Attempts to perform +these actions on member arrays are blocked with error messages like: +.IP +"mdadm: Cannot remove disks from a \'member\' array, perform this +operation on the parent container" +.P +Containers are identified in /proc/mdstat with a metadata version string +"external:<metadata name>". Member devices are identified by +"external:/<container device>/<member index>", or "external:-<container device>/<member index>" if the array is to remain readonly.
+ +.SH OPTIONS +.TP +CONTAINER +The +.B container +device to monitor. It can be a full path like /dev/md/container, or a +simple md device name like md127. +.TP +.B \-\-foreground +Normally, +.I mdmon +will fork and continue in the background. Adding this option will +skip that step and run +.I mdmon +in the foreground. +.TP +.B \-\-takeover +This instructs +.I mdmon +to replace any active +.I mdmon +which is currently monitoring the array. This is primarily used late +in the boot process to replace any +.I mdmon +which was started from an +.B initramfs +before the root filesystem was mounted. This avoids holding a +reference on that +.B initramfs +indefinitely and ensures that the +.I pid +and +.I sock +files used to communicate with +.I mdmon +are in a standard place. +.TP +.B \-\-all +This tells mdmon to find any active containers and start monitoring +each of them if appropriate. This is normally used with +.B \-\-takeover +late in the boot sequence. +A separate +.I mdmon +process is started for each container as the +.B \-\-all +argument is over-written with the name of the container. To allow for +containers with names longer than 5 characters, this argument can be +arbitrarily extended, e.g. to +.BR \-\-all-active-arrays . +.TP + +.PP +Note that +.I mdmon +is automatically started by +.I mdadm +when needed and so does not need to be considered when working with +RAID arrays. The only times it is run other than by +.I mdadm +is when the boot scripts need to restart it after mounting the new +root filesystem. + +.SH START UP AND SHUTDOWN + +As +.I mdmon +needs to be running whenever any filesystem on the monitored device is +mounted there are special considerations when the root filesystem is +mounted from an +.I mdmon +monitored device. +Note that in general +.I mdmon +is needed even if the filesystem is mounted read-only as some +filesystems can still write to the device in those circumstances, for +example to replay a journal after an unclean shutdown. 
+ +When the array is assembled by the +.B initramfs +code, mdadm will automatically start +.I mdmon +as required. This means that +.I mdmon +must be installed on the +.B initramfs +and there must be a writable filesystem (typically tmpfs) in which +.B mdmon +can create a +.B .pid +and +.B .sock +file. The particular filesystem to use is given to mdmon at compile +time and defaults to +.BR /run/mdadm . + +This filesystem must persist through to shutdown time. + +After the final root filesystem has been instantiated (usually with +.BR pivot_root ) +.I mdmon +should be run with +.I "\-\-all \-\-takeover" +so that the +.I mdmon +running from the +.B initramfs +can be replaced with one running in the main root, and so the +memory used by the initramfs can be released. + +At shutdown time, +.I mdmon +should not be killed along with other processes. Also as it holds a +file (socket actually) open in +.B /dev +(by default) it will not be possible to unmount +.B /dev +if it is a separate filesystem. + +.SH EXAMPLES + +.B " mdmon \-\-all-active-arrays \-\-takeover" +.br +Any +.I mdmon +which is currently running is killed and a new instance is started. +This should be run during the boot sequence if an initramfs was +used, so that any mdmon running from the initramfs will not hold +the initramfs active. +.SH SEE ALSO +.IR mdadm (8), +.IR md (4). diff --git a/mdmon.c b/mdmon.c new file mode 100644 index 00000000..e4b73d96 --- /dev/null +++ b/mdmon.c @@ -0,0 +1,602 @@ +/* + * mdmon - monitor external metadata arrays + * + * Copyright (C) 2007-2009 Neil Brown + * Copyright (C) 2007-2009 Intel Corporation + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation.
+ * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. + */ + +/* + * md array manager. + * When md arrays have user-space managed metadata, this is the program + * that does the managing. + * + * Given one argument: the name of the array (e.g. /dev/md0) that is + * the container. + * We fork off a helper that runs high priority and mlocked. It responds to + * device failures and other events that might stop writeout, or that are + * trivial to deal with. + * The main thread then watches for new arrays being created in the container + * and starts monitoring them too ... along with a few other tasks. + * + * The main thread communicates with the priority thread by writing over + * a pipe. + * Separate programs can communicate with the main thread via Unix-domain + * socket. + * The two threads share address space and open file table. 
+ * + */ + +#ifndef _GNU_SOURCE +#define _GNU_SOURCE +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef USE_PTHREADS +#include +#else +#include +#endif + +#include "mdadm.h" +#include "mdmon.h" + +char const Name[] = "mdmon"; + +struct active_array *discard_this; +struct active_array *pending_discard; + +int mon_tid, mgr_tid; + +int sigterm; + +#ifdef USE_PTHREADS +static void *run_child(void *v) +{ + struct supertype *c = v; + + mon_tid = syscall(SYS_gettid); + do_monitor(c); + return 0; +} + +static int clone_monitor(struct supertype *container) +{ + pthread_attr_t attr; + pthread_t thread; + int rc; + + mon_tid = -1; + pthread_attr_init(&attr); + pthread_attr_setstacksize(&attr, 4096); + pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED); + rc = pthread_create(&thread, &attr, run_child, container); + if (rc) + return rc; + while (mon_tid == -1) + usleep(10); + pthread_attr_destroy(&attr); + + mgr_tid = syscall(SYS_gettid); + + return mon_tid; +} +#else /* USE_PTHREADS */ +static int run_child(void *v) +{ + struct supertype *c = v; + + do_monitor(c); + return 0; +} + +#ifdef __ia64__ +int __clone2(int (*fn)(void *), + void *child_stack_base, size_t stack_size, + int flags, void *arg, ... 
+ /* pid_t *pid, struct user_desc *tls, pid_t *ctid */ ); +#endif +static int clone_monitor(struct supertype *container) +{ + static char stack[4096]; + +#ifdef __ia64__ + mon_tid = __clone2(run_child, stack, sizeof(stack), + CLONE_FS|CLONE_FILES|CLONE_VM|CLONE_SIGHAND|CLONE_THREAD, + container); +#else + mon_tid = clone(run_child, stack+4096-64, + CLONE_FS|CLONE_FILES|CLONE_VM|CLONE_SIGHAND|CLONE_THREAD, + container); +#endif + + mgr_tid = syscall(SYS_gettid); + + return mon_tid; +} +#endif /* USE_PTHREADS */ + +static int make_pidfile(char *devname) +{ + char path[100]; + char pid[10]; + int fd; + int n; + + if (mkdir(MDMON_DIR, 0755) < 0 && + errno != EEXIST) + return -errno; + sprintf(path, "%s/%s.pid", MDMON_DIR, devname); + + fd = open(path, O_RDWR|O_CREAT|O_EXCL, 0600); + if (fd < 0) + return -errno; + sprintf(pid, "%d\n", getpid()); + n = write(fd, pid, strlen(pid)); + close(fd); + if (n < 0) + return -errno; + return 0; +} + +static void try_kill_monitor(pid_t pid, char *devname, int sock) +{ + char buf[100]; + int fd; + int n; + long fl; + + /* first rule of survival... 
don't off yourself */ + if (pid == getpid()) + return; + + /* kill this process if it is mdmon */ + sprintf(buf, "/proc/%lu/cmdline", (unsigned long) pid); + fd = open(buf, O_RDONLY); + if (fd < 0) + return; + + n = read(fd, buf, sizeof(buf)-1); + buf[sizeof(buf)-1] = 0; + close(fd); + + if (n < 0 || !(strstr(buf, "mdmon") || + strstr(buf, "@dmon"))) + return; + + kill(pid, SIGTERM); + + if (sock < 0) + return; + + /* Wait for monitor to exit by reading from the socket, after + * clearing the non-blocking flag */ + fl = fcntl(sock, F_GETFL, 0); + fl &= ~O_NONBLOCK; + fcntl(sock, F_SETFL, fl); + n = read(sock, buf, 100); + /* Ignore result, it is just the wait that + * matters + */ +} + +void remove_pidfile(char *devname) +{ + char buf[100]; + + sprintf(buf, "%s/%s.pid", MDMON_DIR, devname); + unlink(buf); + sprintf(buf, "%s/%s.sock", MDMON_DIR, devname); + unlink(buf); +} + +static int make_control_sock(char *devname) +{ + char path[100]; + int sfd; + long fl; + struct sockaddr_un addr; + + if (sigterm) + return -1; + + sprintf(path, "%s/%s.sock", MDMON_DIR, devname); + unlink(path); + sfd = socket(PF_LOCAL, SOCK_STREAM, 0); + if (sfd < 0) + return -1; + + addr.sun_family = PF_LOCAL; + strcpy(addr.sun_path, path); + umask(077); /* ensure no world write access */ + if (bind(sfd, (struct sockaddr*)&addr, sizeof(addr)) < 0) { + close(sfd); + return -1; + } + listen(sfd, 10); + fl = fcntl(sfd, F_GETFL, 0); + fl |= O_NONBLOCK; + fcntl(sfd, F_SETFL, fl); + return sfd; +} + +static void term(int sig) +{ + sigterm = 1; +} + +static void wake_me(int sig) +{ + +} + +/* if we are debugging and starting mdmon by hand then don't fork */ +static int do_fork(void) +{ + #ifdef DEBUG + if (check_env("MDADM_NO_MDMON")) + return 0; + #endif + + return 1; +} + +void usage(void) +{ + fprintf(stderr, +"Usage: mdmon [options] CONTAINER\n" +"\n" +"Options are:\n" +" --help -h : This message\n" +" --all -a : All devices\n" +" --foreground -F : Run in foreground (do not fork)\n" +" 
--takeover -t : Takeover container\n" +); + exit(2); +} + +static int mdmon(char *devnm, int must_fork, int takeover); + +int main(int argc, char *argv[]) +{ + char *container_name = NULL; + char *devnm = NULL; + int status = 0; + int opt; + int all = 0; + int takeover = 0; + int dofork = 1; + static struct option options[] = { + {"all", 0, NULL, 'a'}, + {"takeover", 0, NULL, 't'}, + {"help", 0, NULL, 'h'}, + {"offroot", 0, NULL, OffRootOpt}, + {"foreground", 0, NULL, 'F'}, + {NULL, 0, NULL, 0} + }; + + if (in_initrd()) { + /* + * set first char of argv[0] to @. This is used by + * systemd to signal that the task was launched from + * initrd/initramfs and should be preserved during shutdown + */ + argv[0][0] = '@'; + } + + while ((opt = getopt_long(argc, argv, "thaF", options, NULL)) != -1) { + switch (opt) { + case 'a': + container_name = argv[optind-1]; + all = 1; + break; + case 't': + takeover = 1; + break; + case 'F': + dofork = 0; + break; + case OffRootOpt: + argv[0][0] = '@'; + break; + case 'h': + default: + usage(); + break; + } + } + + if (all == 0 && container_name == NULL) { + if (argv[optind]) + container_name = argv[optind]; + } + + if (container_name == NULL) + usage(); + + if (argc - optind > 1) + usage(); + + if (strcmp(container_name, "/proc/mdstat") == 0) + all = 1; + + if (all) { + struct mdstat_ent *mdstat, *e; + int container_len = strlen(container_name); + + /* launch an mdmon instance for each container found */ + mdstat = mdstat_read(0, 0); + for (e = mdstat; e; e = e->next) { + if (e->metadata_version && + strncmp(e->metadata_version, "external:", 9) == 0 && + !is_subarray(&e->metadata_version[9])) { + /* update cmdline so this mdmon instance can be + * distinguished from others in a call to ps(1) + */ + if (strlen(e->devnm) <= (unsigned)container_len) { + memset(container_name, 0, container_len); + sprintf(container_name, "%s", e->devnm); + } + status |= mdmon(e->devnm, 1, takeover); + } + } + free_mdstat(mdstat); + + return status; + } 
else if (strncmp(container_name, "md", 2) == 0) { + int id = devnm2devid(container_name); + if (id) + devnm = container_name; + } else { + struct stat st; + + if (stat(container_name, &st) == 0) + devnm = xstrdup(stat2devnm(&st)); + } + + if (!devnm) { + pr_err("%s is not a valid md device name\n", + container_name); + exit(1); + } + return mdmon(devnm, dofork && do_fork(), takeover); +} + +static int mdmon(char *devnm, int must_fork, int takeover) +{ + int mdfd; + struct mdinfo *mdi, *di; + struct supertype *container; + sigset_t set; + struct sigaction act; + int pfd[2]; + int status; + int ignore; + pid_t victim = -1; + int victim_sock = -1; + + dprintf("starting mdmon for %s\n", devnm); + + mdfd = open_dev(devnm); + if (mdfd < 0) { + pr_err("%s: %s\n", devnm, strerror(errno)); + return 1; + } + if (md_get_version(mdfd) < 0) { + pr_err("%s: Not an md device\n", devnm); + return 1; + } + + /* Fork, and have the child tell us when they are ready */ + if (must_fork) { + if (pipe(pfd) != 0) { + pr_err("failed to create pipe\n"); + return 1; + } + switch(fork()) { + case -1: + pr_err("failed to fork: %s\n", strerror(errno)); + return 1; + case 0: /* child */ + close(pfd[0]); + break; + default: /* parent */ + close(pfd[1]); + if (read(pfd[0], &status, sizeof(status)) != sizeof(status)) { + wait(&status); + status = WEXITSTATUS(status); + } + close(pfd[0]); + return status; + } + } else + pfd[0] = pfd[1] = -1; + + container = xcalloc(1, sizeof(*container)); + strcpy(container->devnm, devnm); + container->arrays = NULL; + container->sock = -1; + + mdi = sysfs_read(mdfd, container->devnm, GET_VERSION|GET_LEVEL|GET_DEVS); + + if (!mdi) { + pr_err("failed to load sysfs info for %s\n", container->devnm); + exit(3); + } + if (mdi->array.level != UnSet) { + pr_err("%s is not a container - cannot monitor\n", devnm); + exit(3); + } + if (mdi->array.major_version != -1 || + mdi->array.minor_version != -2) { + pr_err("%s does not use external metadata - cannot monitor\n", + 
devnm); + exit(3); + } + + container->ss = version_to_superswitch(mdi->text_version); + if (container->ss == NULL) { + pr_err("%s uses unsupported metadata: %s\n", + devnm, mdi->text_version); + exit(3); + } + + container->devs = NULL; + for (di = mdi->devs; di; di = di->next) { + struct mdinfo *cd = xmalloc(sizeof(*cd)); + *cd = *di; + cd->next = container->devs; + container->devs = cd; + } + sysfs_free(mdi); + + /* SIGUSR is sent between parent and child. So both block it + * and enable it only with pselect. + */ + sigemptyset(&set); + sigaddset(&set, SIGUSR1); + sigaddset(&set, SIGTERM); + sigprocmask(SIG_BLOCK, &set, NULL); + act.sa_handler = wake_me; + act.sa_flags = 0; + sigaction(SIGUSR1, &act, NULL); + act.sa_handler = term; + sigaction(SIGTERM, &act, NULL); + act.sa_handler = SIG_IGN; + sigaction(SIGPIPE, &act, NULL); + + victim = mdmon_pid(container->devnm); + if (victim >= 0) + victim_sock = connect_monitor(container->devnm); + + ignore = chdir("/"); + if (!takeover && victim > 0 && victim_sock >= 0) { + if (fping_monitor(victim_sock) == 0) { + pr_err("%s already managed\n", container->devnm); + exit(3); + } + close(victim_sock); + victim_sock = -1; + } + if (container->ss->load_container(container, mdfd, devnm)) { + pr_err("Cannot load metadata for %s\n", devnm); + exit(3); + } + close(mdfd); + + /* Ok, this is close enough. We can say goodbye to our parent now. 
+ */ + if (victim > 0) + remove_pidfile(devnm); + if (make_pidfile(devnm) < 0) { + exit(3); + } + container->sock = make_control_sock(devnm); + + status = 0; + if (pfd[1] >= 0) { + if (write(pfd[1], &status, sizeof(status)) < 0) + pr_err("failed to notify our parent: %d\n", + getppid()); + close(pfd[1]); + } + + mlockall(MCL_CURRENT | MCL_FUTURE); + + if (clone_monitor(container) < 0) { + pr_err("failed to start monitor process: %s\n", + strerror(errno)); + exit(2); + } + + if (victim > 0) { + try_kill_monitor(victim, container->devnm, victim_sock); + if (victim_sock >= 0) + close(victim_sock); + } + + setsid(); + close(0); + open("/dev/null", O_RDWR); + close(1); + ignore = dup(0); +#ifndef DEBUG + close(2); + ignore = dup(0); +#endif + + /* This silliness is to stop the compiler complaining + * that we ignore 'ignore' + */ + if (ignore) + ignore++; + + do_manager(container); + + exit(0); +} + +/* Some stub functions so super-* can link with us */ +int child_monitor(int afd, struct mdinfo *sra, struct reshape *reshape, + struct supertype *st, unsigned long blocks, + int *fds, unsigned long long *offsets, + int dests, int *destfd, unsigned long long *destoffsets) +{ + return 0; +} + +int restore_stripes(int *dest, unsigned long long *offsets, + int raid_disks, int chunk_size, int level, int layout, + int source, unsigned long long read_offset, + unsigned long long start, unsigned long long length, + char *src_buf) +{ + return 1; +} + +void abort_reshape(struct mdinfo *sra) +{ + return; +} + +int save_stripes(int *source, unsigned long long *offsets, + int raid_disks, int chunk_size, int level, int layout, + int nwrites, int *dest, + unsigned long long start, unsigned long long length, + char *buf) +{ + return 0; +} + +struct superswitch super0 = { + .name = "0.90", +}; +struct superswitch super1 = { + .name = "1.x", +}; diff --git a/mdmon.h b/mdmon.h new file mode 100644 index 00000000..aa750c68 --- /dev/null +++ b/mdmon.h @@ -0,0 +1,110 @@ +/* + * mdmon - monitor 
external metadata arrays + * + * Copyright (C) 2007-2009 Neil Brown + * Copyright (C) 2007-2009 Intel Corporation + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. + */ + +extern const char Name[]; + +enum array_state { clear, inactive, suspended, readonly, read_auto, + clean, active, write_pending, active_idle, bad_word}; + +enum sync_action { idle, reshape, resync, recover, check, repair, bad_action }; + +struct active_array { + struct mdinfo info; + struct supertype *container; + struct active_array *next, *replaces; + int to_remove; + + int action_fd; + int resync_start_fd; + int metadata_fd; /* for monitoring rw/ro status */ + int sync_completed_fd; /* for checkpoint notification events */ + unsigned long long last_checkpoint; /* sync_completed fires for many + * reasons this field makes sure the + * kernel has made progress before + * moving the checkpoint. It is + * cleared by the metadata handler + * when it determines recovery is + * terminated. + */ + + enum array_state prev_state, curr_state, next_state; + enum sync_action prev_action, curr_action, next_action; + + int check_degraded; /* flag set by mon, read by manage */ + int check_reshape; /* flag set by mon, read by manage */ +}; + +/* + * Metadata updates are handled by the monitor thread, + * as it has exclusive access to the metadata. 
+ * When the manager want to updates metadata, either + * for it's own reason (e.g. committing a spare) or + * on behalf of mdadm, it creates a metadata_update + * structure and queues it to the monitor. + * Updates are created and processed by code under the + * superswitch. All common code sees them as opaque + * blobs. + */ +extern struct metadata_update *update_queue, *update_queue_handled; + +#define MD_MAJOR 9 + +extern struct active_array *container; +extern struct active_array *discard_this; +extern struct active_array *pending_discard; +extern struct md_generic_cmd *active_cmd; + +void remove_pidfile(char *devname); +void do_monitor(struct supertype *container); +void do_manager(struct supertype *container); +extern int sigterm; + +int read_dev_state(int fd); +int is_container_member(struct mdstat_ent *mdstat, char *container); + +struct mdstat_ent *mdstat_read(int hold, int start); + +extern int exit_now, manager_ready; +extern int mon_tid, mgr_tid; +extern int monitor_loop_cnt; + +/* helper routine to determine resync completion since MaxSector is a + * moving target + */ +static inline int is_resync_complete(struct mdinfo *array) +{ + unsigned long long sync_size = 0; + int ncopies, l; + switch(array->array.level) { + case 1: + case 4: + case 5: + case 6: + sync_size = array->component_size; + break; + case 10: + l = array->array.layout; + ncopies = (l & 0xff) * ((l >> 8) && 0xff); + sync_size = array->component_size * array->array.raid_disks; + sync_size /= ncopies; + break; + } + return array->resync_start >= sync_size; +} diff --git a/mdopen.c b/mdopen.c new file mode 100644 index 00000000..28410f46 --- /dev/null +++ b/mdopen.c @@ -0,0 +1,468 @@ +/* + * mdadm - manage Linux "md" devices aka RAID arrays. 
+ * + * Copyright (C) 2001-2013 Neil Brown + * + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Author: Neil Brown + * Email: + */ + +#include "mdadm.h" +#include "md_p.h" +#include + +void make_parts(char *dev, int cnt) +{ + /* make 'cnt' partition devices for 'dev' + * If dev is a device name we use the + * major/minor from dev and add 1..cnt + * If it is a symlink, we make similar symlinks.
+ * If dev ends with a digit, we add "p%d" else "%d" + * If the name exists, we use its owner/mode, + * else that of dev + */ + struct stat stb; + int major_num; + int minor_num; + int odig; + int i; + int nlen = strlen(dev) + 20; + char *name; + int dig = isdigit(dev[strlen(dev)-1]); + char orig[1024]; + char sym[1024]; + int err; + + if (cnt == 0) + cnt = 4; /* caller passed 0: fall back to 4 partitions */ + if (lstat(dev, &stb)!= 0) + return; + + if (S_ISBLK(stb.st_mode)) { + major_num = major(stb.st_rdev); + minor_num = minor(stb.st_rdev); + odig = -1; /* odig is only consulted on the symlink path below */ + } else if (S_ISLNK(stb.st_mode)) { + int len = readlink(dev, orig, sizeof(orig)); + if (len < 0 || len > 1000) + return; + orig[len] = 0; + odig = isdigit(orig[len-1]); + major_num = -1; + minor_num = -1; + } else + return; + name = xmalloc(nlen); + for (i = 1; i <= cnt ; i++) { + struct stat stb2; + snprintf(name, nlen, "%s%s%d", dev, dig?"p":"", i); + if (stat(name, &stb2) == 0) { + if (!S_ISBLK(stb2.st_mode) || !S_ISBLK(stb.st_mode)) + continue; + if (stb2.st_rdev == makedev(major_num, minor_num+i)) + continue; + unlink(name); /* block node with the wrong device number: recreate it */ + } else { + stb2 = stb; /* no existing node: take owner/mode from dev */ + } + if (S_ISBLK(stb.st_mode)) { + if (mknod(name, S_IFBLK | 0600, + makedev(major_num, minor_num+i))) + perror("mknod"); + if (chown(name, stb2.st_uid, stb2.st_gid)) + perror("chown"); + if (chmod(name, stb2.st_mode & 07777)) + perror("chmod"); + err = 0; + } else { + snprintf(sym, sizeof(sym), "%s%s%d", orig, odig?"p":"", i); + err = symlink(sym, name); + } + + if (err == 0 && stat(name, &stb2) == 0) + add_dev(name, &stb2, 0, NULL); + } + free(name); +} + +/* + * We need a new md device to assemble/build/create an array. + * 'dev' is a name given us by the user (command line or mdadm.conf) + * It might start with /dev or /dev/md any might end with a digit + * string.
+ * If it starts with just /dev, it must be /dev/mdX or /dev/md_dX + * If it ends with a digit string, then it must be as above, or + * 'trustworthy' must be 'METADATA' and the 'dev' must be + * /dev/md/'name'NN or 'name'NN + * If it doesn't end with a digit string, it must be /dev/md/'name' + * or 'name' or must be NULL. + * If the digit string is present, it gives the minor number to use + * If not, we choose a high, unused minor number. + * If the 'dev' is a standard name, it decides whether 'md' or 'mdp'. + * else if the name is 'd[0-9]+' then we use mdp + * else if trustworthy is 'METADATA' we use md + * else the choice depends on 'autof'. + * If name is NULL it is assumed to match whatever dev provides. + * If both name and dev are NULL, we choose a name 'mdXX' or 'mdpXX' + * + * If 'name' is given, and 'trustworthy' is 'foreign' and name is not + * supported by 'dev', we add a "_%d" suffix based on the minor number + * and use that. + * + * If udev is configured, we create a temporary device, open it, and + * unlink it. + * If not, we create the /dev/mdXX device, and if name is usable, + * /dev/md/name + * In any case we return /dev/md/name or (if that isn't available) + * /dev/mdXX in 'chosen'. + * + * When we create devices, we use uid/gid/umask from config file.
+ */ + +int create_mddev(char *dev, char *name, int autof, int trustworthy, + char *chosen) +{ + int mdfd; + struct stat stb; + int num = -1; + int use_mdp = -1; + struct createinfo *ci = conf_get_create_info(); + int parts; + char *cname; + char devname[20]; + char devnm[32]; + char cbuf[400]; + if (chosen == NULL) + chosen = cbuf; + + if (autof == 0) + autof = ci->autof; + + parts = autof >> 3; + autof &= 7; + + strcpy(chosen, "/dev/md/"); + cname = chosen + strlen(chosen); + + if (dev) { + if (strncmp(dev, "/dev/md/", 8) == 0) { + strcpy(cname, dev+8); + } else if (strncmp(dev, "/dev/", 5) == 0) { + char *e = dev + strlen(dev); + while (e > dev && isdigit(e[-1])) + e--; + if (e[0]) + num = strtoul(e, NULL, 10); + strcpy(cname, dev+5); + cname[e-(dev+5)] = 0; + /* name *must* be mdXX or md_dXX in this context */ + if (num < 0 || + (strcmp(cname, "md") != 0 && strcmp(cname, "md_d") != 0)) { + pr_err("%s is an invalid name for an md device. Try /dev/md/%s\n", + dev, dev+5); + return -1; + } + if (strcmp(cname, "md") == 0) + use_mdp = 0; + else + use_mdp = 1; + /* recreate name: /dev/md/0 or /dev/md/d0 */ + sprintf(cname, "%s%d", use_mdp?"d":"", num); + } else + strcpy(cname, dev); + + /* 'cname' must not contain a slash, and may not be + * empty. + */ + if (strchr(cname, '/') != NULL) { + pr_err("%s is an invalid name for an md device.\n", dev); + return -1; + } + if (cname[0] == 0) { + pr_err("%s is an invalid name for an md device (empty!).", dev); + return -1; + } + if (num < 0) { + /* If cname is 'N' or 'dN', we get dev number + * from there. + */ + char *sp = cname; + char *ep; + if (cname[0] == 'd') + sp++; + if (isdigit(sp[0])) + num = strtoul(sp, &ep, 10); + else + ep = sp; + if (ep == sp || *ep || num < 0) + num = -1; + else if (cname[0] == 'd') + use_mdp = 1; + else + use_mdp = 0; + } + } + + /* Now determine device number */ + /* named 'METADATA' cannot use 'mdp'. 
*/ + if (name && name[0] == 0) + name = NULL; + if (name && trustworthy == METADATA && use_mdp == 1) { + pr_err("%s is not allowed for a %s container. Consider /dev/md%d.\n", dev, name, num); + return -1; + } + if (name && trustworthy == METADATA) + use_mdp = 0; + if (use_mdp == -1) { + if (autof == 4 || autof == 6) + use_mdp = 1; + else + use_mdp = 0; + } + if (num < 0 && trustworthy == LOCAL && name) { + /* if name is numeric, possibly prefixed by + * 'md' or '/dev/md', use that for num + * if it is not already in use */ + char *ep; + char *n2 = name; + if (strncmp(n2, "/dev/", 5) == 0) + n2 += 5; + if (strncmp(n2, "md", 2) == 0) + n2 += 2; + if (*n2 == '/') + n2++; + num = strtoul(n2, &ep, 10); + if (ep == n2 || *ep) + num = -1; + else { + sprintf(devnm, "md%s%d", use_mdp ? "_d":"", num); + if (mddev_busy(devnm)) + num = -1; + } + } + + if (cname[0] == 0 && name) { + /* Need to find a name if we can + * We don't completely trust 'name'. Truncate to + * reasonable length and remove '/' + */ + char *cp; + struct map_ent *map = NULL; + int conflict = 1; + int unum = 0; + int cnlen; + strncpy(cname, name, 200); + cname[200] = 0; + for (cp = cname; *cp ; cp++) + switch (*cp) { + case '/': + *cp = '-'; + break; + case ' ': + case '\t': + *cp = '_'; + break; + } + + if (trustworthy == LOCAL || + (trustworthy == FOREIGN && strchr(cname, ':') != NULL)) { + /* Only need suffix if there is a conflict */ + if (map_by_name(&map, cname) == NULL) + conflict = 0; + } + cnlen = strlen(cname); + while (conflict) { + if (trustworthy == METADATA && !isdigit(cname[cnlen-1])) + sprintf(cname+cnlen, "%d", unum); + else + /* add _%d to FOREIGN array that don't + * a 'host:' prefix + */ + sprintf(cname+cnlen, "_%d", unum); + unum++; + if (map_by_name(&map, cname) == NULL) + conflict = 0; + } + } + + devnm[0] = 0; + if (num < 0 && cname && ci->names) { + int fd; + int n = -1; + sprintf(devnm, "md_%s", cname); + fd = open("/sys/module/md_mod/parameters/new_array", O_WRONLY); + if (fd >= 
0) { + n = write(fd, devnm, strlen(devnm)); + close(fd); + } + if (n < 0) + devnm[0] = 0; + } + if (devnm[0]) + ; + else if (num < 0) { + /* need to choose a free number. */ + char *_devnm = find_free_devnm(use_mdp); + if (devnm == NULL) { + pr_err("No avail md devices - aborting\n"); + return -1; + } + strcpy(devnm, _devnm); + } else { + sprintf(devnm, "%s%d", use_mdp?"md_d":"md", num); + if (mddev_busy(devnm)) { + pr_err("%s is already in use.\n", + dev); + return -1; + } + } + + sprintf(devname, "/dev/%s", devnm); + + if (dev && dev[0] == '/') + strcpy(chosen, dev); + else if (cname[0] == 0) + strcpy(chosen, devname); + + /* We have a device number and name. + * If we cannot detect udev, we need to make + * devices and links ourselves. + */ + if (!use_udev()) { + /* Make sure 'devname' exists and 'chosen' is a symlink to it */ + if (lstat(devname, &stb) == 0) { + /* Must be the correct device, else error */ + if ((stb.st_mode&S_IFMT) != S_IFBLK || + stb.st_rdev != (dev_t)devnm2devid(devnm)) { + pr_err("%s exists but looks wrong, please fix\n", + devname); + return -1; + } + } else { + if (mknod(devname, S_IFBLK|0600, + devnm2devid(devnm)) != 0) { + pr_err("failed to create %s\n", + devname); + return -1; + } + if (chown(devname, ci->uid, ci->gid)) + perror("chown"); + if (chmod(devname, ci->mode)) + perror("chmod"); + stat(devname, &stb); + add_dev(devname, &stb, 0, NULL); + } + if (use_mdp == 1) + make_parts(devname, parts); + + if (strcmp(chosen, devname) != 0) { + if (mkdir("/dev/md",0700) == 0) { + if (chown("/dev/md", ci->uid, ci->gid)) + perror("chown /dev/md"); + if (chmod("/dev/md", ci->mode| ((ci->mode>>2) & 0111))) + perror("chmod /dev/md"); + } + + if (dev && strcmp(chosen, dev) == 0) + /* We know we are allowed to use this name */ + unlink(chosen); + + if (lstat(chosen, &stb) == 0) { + char buf[300]; + ssize_t link_len = readlink(chosen, buf, sizeof(buf)-1); + if (link_len >= 0) + buf[link_len] = '\0'; + + if ((stb.st_mode & S_IFMT) != S_IFLNK || + 
link_len < 0 || + strcmp(buf, devname) != 0) { + pr_err("%s exists - ignoring\n", + chosen); + strcpy(chosen, devname); + } + } else if (symlink(devname, chosen) != 0) + pr_err("failed to create %s: %s\n", + chosen, strerror(errno)); + if (use_mdp && strcmp(chosen, devname) != 0) + make_parts(chosen, parts); + } + } + mdfd = open_dev_excl(devnm); + if (mdfd < 0) + pr_err("unexpected failure opening %s\n", + devname); + return mdfd; +} + +/* Open this and check that it is an md device. + * On success, return filedescriptor. + * On failure, return -1 if it doesn't exist, + * or -2 if it exists but is not an md device. + */ +int open_mddev(char *dev, int report_errors) +{ + int mdfd = open(dev, O_RDWR); + if (mdfd < 0 && errno == EACCES) + mdfd = open(dev, O_RDONLY); + if (mdfd < 0) { + if (report_errors) + pr_err("error opening %s: %s\n", + dev, strerror(errno)); + return -1; + } + if (md_get_version(mdfd) <= 0) { + close(mdfd); + if (report_errors) + pr_err("%s does not appear to be an md device\n", dev); + return -2; + } + return mdfd; +} + +char *find_free_devnm(int use_partitions) +{ + static char devnm[32]; + int devnum; + for (devnum = 127; devnum != 128; + devnum = devnum ? devnum-1 : (1<<20)-1) { + + if (use_partitions) + sprintf(devnm, "md_d%d", devnum); + else + sprintf(devnm, "md%d", devnum); + if (mddev_busy(devnm)) + continue; + if (!conf_name_is_free(devnm)) + continue; + if (!use_udev()) { + /* make sure it is new to /dev too, at least as a + * non-standard */ + int devid = devnm2devid(devnm); + if (devid) { + char *dn = map_dev(major(devid), + minor(devid), 0); + if (dn && ! is_standard(dn, NULL)) + continue; + } + } + break; + } + if (devnum == 128) + return NULL; + return devnm; +} diff --git a/mdstat.c b/mdstat.c new file mode 100644 index 00000000..2972cdf6 --- /dev/null +++ b/mdstat.c @@ -0,0 +1,414 @@ +/* + * mdstat - parse /proc/mdstat file. Part of: + * mdadm - manage Linux "md" devices aka RAID arrays. 
+ * + * Copyright (C) 2002-2009 Neil Brown + * + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Author: Neil Brown + * Email: + */ + +/* + * The /proc/mdstat file comes in at least 3 flavours: + * In an unpatched 2.2 kernel (md 0.36.6): + * Personalities : [n raidx] ... + * read_ahead {not set|%d sectors} + * md0 : {in}active{ raidX /dev/hda... %d blocks{ maxfault=%d}} + * md1 : ..... + * + * Normally only 4 md lines, but all are listed. + * + * In a patched 2.2 kernel (md 0.90.0) + * Personalities : [raidx] ... + * read_ahead {not set|%d sectors} + * mdN : {in}active {(readonly)} raidX dev[%d]{(F)} ... %d blocks STATUS RESYNC + * ... Only initialised arrays listed + * unused devices: {dev dev ... | } + * + * STATUS is personality dependant: + * linear: %dk rounding + * raid0: %dk chunks + * raid1: [%d/%d] [U_U] ( raid/working. operational or not) + * raid5: level 4/5, %dk chunk, algorithm %d [%d/%d] [U_U] + * + * RESYNC is empty or: + * {resync|recovery}=%u%% finish=%u.%umin + * or + * resync=DELAYED + * + * In a 2.4 kernel (md 0.90.0/2.4) + * Personalities : [raidX] ... + * read_ahead {not set|%d sectors} + * mdN : {in}active {(read-only)} raidX dev[%d]{(F)} ... + * %d blocks STATUS + * RESYNC + * unused devices: {dev dev .. 
| } + * + * STATUS matches 0.90.0/2.2 + * RESYNC includes [===>....], + * adds a space after {resync|recovery} and before and after '=' + * adds a decimal to the recovery percent. + * adds (%d/%d) resync amount and max_blocks, before finish. + * adds speed=%dK/sec after finish + * + * + * + * Out of this we want to extract: + * list of devices, active or not + * pattern of failed drives (so need number of drives) + * percent resync complete + * + * As continuation is indicated by leading space, we use + * conf_line from config.c to read logical lines + * + */ + +#include "mdadm.h" +#include "dlink.h" +#include +#include + +static void free_member_devnames(struct dev_member *m) +{ + while(m) { + struct dev_member *t = m; + + m = m->next; + free(t->name); + free(t); + } +} + +static int add_member_devname(struct dev_member **m, char *name) +{ + struct dev_member *new; + char *t; + + if ((t = strchr(name, '[')) == NULL) + /* not a device */ + return 0; + + new = xmalloc(sizeof(*new)); + new->name = strndup(name, t - name); + new->next = *m; + *m = new; + return 1; +} + +void free_mdstat(struct mdstat_ent *ms) +{ + while (ms) { + struct mdstat_ent *t; + free(ms->level); + free(ms->pattern); + free(ms->metadata_version); + free_member_devnames(ms->members); + t = ms; + ms = ms->next; + free(t); + } +} + +static int mdstat_fd = -1; +struct mdstat_ent *mdstat_read(int hold, int start) +{ + FILE *f; + struct mdstat_ent *all, *rv, **end, **insert_here; + char *line; + int fd; + + if (hold && mdstat_fd != -1) { + lseek(mdstat_fd, 0L, 0); + fd = dup(mdstat_fd); + if (fd >= 0) + f = fdopen(fd, "r"); + else + return NULL; + } else + f = fopen("/proc/mdstat", "r"); + if (f == NULL) + return NULL; + else + fcntl(fileno(f), F_SETFD, FD_CLOEXEC); + + all = NULL; + end = &all; + for (; (line = conf_line(f)) ; free_line(line)) { + struct mdstat_ent *ent; + char *w; + char devnm[32]; + int in_devs = 0; + + if (strcmp(line, "Personalities")==0) + continue; + if (strcmp(line, 
"read_ahead")==0) + continue; + if (strcmp(line, "unused")==0) + continue; + insert_here = NULL; + /* Better be an md line.. */ + if (strncmp(line, "md", 2)!= 0 || strlen(line) >= 32 + || (line[2] != '_' && !isdigit(line[2]))) + continue; + strcpy(devnm, line); + + ent = xmalloc(sizeof(*ent)); + ent->level = ent->pattern= NULL; + ent->next = NULL; + ent->percent = RESYNC_NONE; + ent->active = -1; + ent->resync = 0; + ent->metadata_version = NULL; + ent->raid_disks = 0; + ent->devcnt = 0; + ent->members = NULL; + + strcpy(ent->devnm, devnm); + + for (w=dl_next(line); w!= line ; w=dl_next(w)) { + int l = strlen(w); + char *eq; + if (strcmp(w, "active")==0) + ent->active = 1; + else if (strcmp(w, "inactive")==0) { + ent->active = 0; + in_devs = 1; + } else if (ent->active > 0 && + ent->level == NULL && + w[0] != '(' /*readonly*/) { + ent->level = xstrdup(w); + in_devs = 1; + } else if (in_devs && strcmp(w, "blocks")==0) + in_devs = 0; + else if (in_devs) { + char *ep = strchr(w, '['); + ent->devcnt += + add_member_devname(&ent->members, w); + if (ep && strncmp(w, "md", 2)==0) { + /* This has an md device as a component. + * If that device is already in the + * list, make sure we insert before + * there. 
+ */ + struct mdstat_ent **ih; + ih = &all; + while (ih != insert_here && *ih && + ((int)strlen((*ih)->devnm) != ep-w + || strncmp((*ih)->devnm, w, ep-w) != 0)) + ih = & (*ih)->next; + insert_here = ih; + } + } else if (strcmp(w, "super") == 0 && + dl_next(w) != line) { + w = dl_next(w); + ent->metadata_version = xstrdup(w); + } else if (w[0] == '[' && isdigit(w[1])) { + ent->raid_disks = atoi(w+1); + } else if (!ent->pattern && + w[0] == '[' && + (w[1] == 'U' || w[1] == '_')) { + ent->pattern = xstrdup(w+1); + if (ent->pattern[l-2]==']') + ent->pattern[l-2] = '\0'; + } else if (ent->percent == RESYNC_NONE && + strncmp(w, "re", 2)== 0 && + w[l-1] == '%' && + (eq=strchr(w, '=')) != NULL ) { + ent->percent = atoi(eq+1); + if (strncmp(w,"resync", 6)==0) + ent->resync = 1; + else if (strncmp(w, "reshape", 7)==0) + ent->resync = 2; + else + ent->resync = 0; + } else if (ent->percent == RESYNC_NONE && + (w[0] == 'r' || w[0] == 'c')) { + if (strncmp(w, "resync", 4)==0) + ent->resync = 1; + if (strncmp(w, "reshape", 7)==0) + ent->resync = 2; + if (strncmp(w, "recovery", 8)==0) + ent->resync = 0; + if (strncmp(w, "check", 5)==0) + ent->resync = 3; + + if (l > 8 && strcmp(w+l-8, "=DELAYED") == 0) + ent->percent = RESYNC_DELAYED; + if (l > 8 && strcmp(w+l-8, "=PENDING") == 0) + ent->percent = RESYNC_PENDING; + } else if (ent->percent == RESYNC_NONE && + w[0] >= '0' && + w[0] <= '9' && + w[l-1] == '%') { + ent->percent = atoi(w); + } + } + if (insert_here && (*insert_here)) { + ent->next = *insert_here; + *insert_here = ent; + } else { + *end = ent; + end = &ent->next; + } + } + if (hold && mdstat_fd == -1) { + mdstat_fd = dup(fileno(f)); + fcntl(mdstat_fd, F_SETFD, FD_CLOEXEC); + } + fclose(f); + + /* If we might want to start array, + * reverse the order, so that components comes before composites + */ + if (start) { + rv = NULL; + while (all) { + struct mdstat_ent *e = all; + all = all->next; + e->next = rv; + rv = e; + } + } else rv = all; + return rv; +} + +void 
mdstat_close(void) +{ + if (mdstat_fd >= 0) + close(mdstat_fd); + mdstat_fd = -1; +} + +void mdstat_wait(int seconds) +{ + fd_set fds; + struct timeval tm; + int maxfd = 0; + FD_ZERO(&fds); + if (mdstat_fd >= 0) { + FD_SET(mdstat_fd, &fds); + maxfd = mdstat_fd; + } + tm.tv_sec = seconds; + tm.tv_usec = 0; + select(maxfd + 1, NULL, NULL, &fds, &tm); +} + +void mdstat_wait_fd(int fd, const sigset_t *sigmask) +{ + fd_set fds, rfds; + int maxfd = 0; + + FD_ZERO(&fds); + FD_ZERO(&rfds); + if (mdstat_fd >= 0) + FD_SET(mdstat_fd, &fds); + + if (fd >= 0) { + struct stat stb; + fstat(fd, &stb); + if ((stb.st_mode & S_IFMT) == S_IFREG) + /* Must be a /proc or /sys fd, so expect + * POLLPRI + * i.e. an 'exceptional' event. + */ + FD_SET(fd, &fds); + else + FD_SET(fd, &rfds); + + if (fd > maxfd) + maxfd = fd; + + } + if (mdstat_fd > maxfd) + maxfd = mdstat_fd; + + pselect(maxfd + 1, &rfds, NULL, &fds, + NULL, sigmask); +} + +int mddev_busy(char *devnm) +{ + struct mdstat_ent *mdstat = mdstat_read(0, 0); + struct mdstat_ent *me; + + for (me = mdstat ; me ; me = me->next) + if (strcmp(me->devnm, devnm) == 0) + break; + free_mdstat(mdstat); + return me != NULL; +} + +struct mdstat_ent *mdstat_by_component(char *name) +{ + struct mdstat_ent *mdstat = mdstat_read(0, 0); + + while (mdstat) { + struct dev_member *m; + struct mdstat_ent *ent; + if (mdstat->metadata_version && + strncmp(mdstat->metadata_version, "external:", 9) == 0 && + is_subarray(mdstat->metadata_version+9)) + /* don't return subarrays, only containers */ + ; + else for (m = mdstat->members; m; m = m->next) { + if (strcmp(m->name, name) == 0) { + free_mdstat(mdstat->next); + mdstat->next = NULL; + return mdstat; + } + } + ent = mdstat; + mdstat = mdstat->next; + ent->next = NULL; + free_mdstat(ent); + } + return NULL; +} + +struct mdstat_ent *mdstat_by_subdev(char *subdev, char *container) +{ + struct mdstat_ent *mdstat = mdstat_read(0, 0); + struct mdstat_ent *ent = NULL; + + while (mdstat) { + /* metadata version 
must match: + * external:[/-]%s/%s + * where first %s is 'container' and second %s is 'subdev' + */ + if (ent) + free_mdstat(ent); + ent = mdstat; + mdstat = mdstat->next; + ent->next = NULL; + + if (ent->metadata_version == NULL || + strncmp(ent->metadata_version, "external:", 9) != 0) + continue; + + if (!metadata_container_matches(ent->metadata_version+9, + container) || + !metadata_subdev_matches(ent->metadata_version+9, + subdev)) + continue; + + free_mdstat(mdstat); + return ent; + } + return NULL; +} diff --git a/misc/mdcheck b/misc/mdcheck new file mode 100644 index 00000000..2c8f54d6 --- /dev/null +++ b/misc/mdcheck @@ -0,0 +1,159 @@ +#!/bin/bash + +# Copyright (C) 2014 Neil Brown +# +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# Author: Neil Brown +# Email: + +# This script should be run periodically to automatically +# perform a 'check' on any md arrays. +# +# It supports a 'time budget' such that any incomplete 'check' +# will be checkpointed when that time has expired. +# A subsequent invocation can allow the 'check' to continue. +# +# Options are: +# --continue Don't start new checks, only continue old ones. 
+# --duration This is passed to "date --date=$duration" to find out +# when to finish +# +# To support '--continue', arrays are identified by UUID and the 'sync_completed' +# value is stored in /var/lib/mdcheck/$UUID + +# convert a /dev/md name into /sys/.../md equivalent +sysname() { + set `ls -lLd $1` + maj=${5%,} + min=$6 + readlink -f /sys/dev/block/$maj:$min +} + +args=$(getopt -o hcd: -l help,continue,duration: -n mdcheck -- "$@") +rv=$? +if [ $rv -ne 0 ]; then exit $rv; fi + +eval set -- $args + +cont= +endtime= +while [ " $1" != " --" ] +do + case $1 in + --help ) + echo >&2 'Usage: mdcheck [--continue] [--duration time-offset]' + echo >&2 ' time-offset must be understood by "date --date"' + exit 0 + ;; + --continue ) cont=yes ;; + --duration ) shift; dur=$1 + endtime=$(date --date "$dur" "+%s") + ;; + esac + shift +done +shift + +# We need a temp file occasionally... +tmp=/var/lib/mdcheck/.md-check-$$ +trap 'rm -f "$tmp"' 0 + + +# firstly, clean out really old state files +mkdir -p /var/lib/mdcheck +find /var/lib/mdcheck -name "MD_UUID*" -type f -mtime +180 -exec rm {} \; + +# Now look at each md device. +cnt=0 +for dev in /dev/md?* +do + [ -e "$dev" ] || continue + sys=`sysname $dev` + if [ ! -f "$sys/md/sync_action" ] + then # cannot check this array + continue + fi + if [ "`cat $sys/md/sync_action`" != 'idle' ] + then # This array is busy + continue + fi + + mdadm --detail --export "$dev" | grep '^MD_UUID=' > $tmp || continue + source $tmp + fl="/var/lib/mdcheck/MD_UUID_$MD_UUID" + if [ -z "$cont" ] + then + start=0 + elif [ -z "$MD_UUID" -o ! 
-f "$fl" ] + then + # Nothing to continue here + continue + else + start=`cat "$fl"` + fi + + cnt=$[cnt+1] + eval MD_${cnt}_fl=\$fl + eval MD_${cnt}_sys=\$sys + echo $start > $fl + echo $start > $sys/md/sync_min + echo check > $sys/md/sync_action +done + +if [ -z "$endtime" ] +then + exit 0 +fi + +while [ `date +%s` -lt $endtime ] +do + any= + for i in `eval echo {1..$cnt}` + do + eval fl=\$MD_${i}_fl + eval sys=\$MD_${i}_sys + + if [ -z "$fl" ]; then continue; fi + + if [ "`cat $sys/md/sync_action`" != 'check' ] + then + eval MD_${i}_fl= + rm -f $fl + continue; + fi + read a rest < $sys/md/sync_completed + echo $a > $fl + any=yes + done + if [ -z "$any" ]; then exit 0; fi + sleep 120 +done + +# We've waited, and there are still checks running. +# Time to stop them. +for i in `eval echo {1..$cnt}` +do + eval fl=\$MD_${i}_fl + eval sys=\$MD_${i}_sys + + if [ -z "$fl" ]; then continue; fi + + if [ "`cat $sys/md/sync_action`" != 'check' ] + then + eval MD_${i}_fl= + rm -f $fl + continue; + fi + echo idle > $sys/md/sync_action + cat $sys/md/sync_min > $fl +done diff --git a/misc/syslog-events b/misc/syslog-events new file mode 100644 index 00000000..fe8c14e4 --- /dev/null +++ b/misc/syslog-events @@ -0,0 +1,27 @@ +#!/bin/sh +# +# sample event handling script for mdadm +# e.g. mdadm --follow --program=/sbin/syslog-events --scan +# +# License: GPL ver.2 +# Copyright (C) 2004 SEKINE Tatsuo + +event="$1" +dev="$2" +disc="$3" + +facility="kern" +tag="mdmonitor" + +case x"${event}" in + xFail*) priority="error" ;; + xTest*) priority="debug" ;; + x*) priority="info" ;; +esac + +msg="${event} event on ${dev}" +if [ x"${disc}" != x ]; then + msg="${msg}, related to disc ${disc}" +fi + +exec logger -t "${tag}" -p "${facility}.${priority}" -- "${msg}" diff --git a/mkinitramfs b/mkinitramfs new file mode 100644 index 00000000..c6275ddb --- /dev/null +++ b/mkinitramfs @@ -0,0 +1,55 @@ +#!/bin/sh + +# make sure we are being run in the right directory... 
+if [ -f mkinitramfs ] +then : +else + echo >&2 mkinitramfs must be run from the mdadm source directory. + exit 1 +fi +if [ -f /bin/busybox ] +then : good, it exists + case `file /bin/busybox` in + *statically* ) : good ;; + * ) echo >&2 mkinitramfs: /bin/busybox is not statically linked: cannot proceed. + exit 1 + esac +else + echo >&2 "mkinitramfs: /bin/busybox doesn't exist - please install it statically linked." + exit 1 +fi + +rm -rf initramfs +mkdir initramfs +mkdir initramfs/bin +make mdadm.static +cp mdadm.static initramfs/bin/mdadm +cp /bin/busybox initramfs/bin/busybox +ln initramfs/bin/busybox initramfs/bin/sh +cat <<- END > initramfs/init + #!/bin/sh + + echo 'Auto-assembling boot md array' + mkdir /proc + mount -t proc proc /proc + if [ -n "$rootuuid" ] + then arg=--uuid=$rootuuid + elif [ -n "$mdminor" ] + then arg=--super-minor=$mdminor + else arg=--super-minor=0 + fi + echo "Using $arg" + mdadm -Acpartitions $arg --auto=part /dev/mda + cd / + mount /dev/mda1 /root || mount /dev/mda /root + umount /proc + cd /root + exec chroot . /sbin/init < /dev/console > /dev/console 2>&1 +END +chmod +x initramfs/init + +(cd initramfs + find init bin | cpio -o -H newc | gzip --best +) > init.cpio.gz +rm -rf initramfs +ls -l init.cpio.gz diff --git a/monitor.c b/monitor.c new file mode 100644 index 00000000..870cc1a7 --- /dev/null +++ b/monitor.c @@ -0,0 +1,712 @@ +/* + * mdmon - monitor external metadata arrays + * + * Copyright (C) 2007-2009 Neil Brown + * Copyright (C) 2007-2009 Intel Corporation + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. 
+ * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. + */ + +#include "mdadm.h" +#include "mdmon.h" +#include +#include +#include + +static char *array_states[] = { + "clear", "inactive", "suspended", "readonly", "read-auto", + "clean", "active", "write-pending", "active-idle", NULL }; +static char *sync_actions[] = { + "idle", "reshape", "resync", "recover", "check", "repair", NULL +}; + +static int write_attr(char *attr, int fd) +{ + return write(fd, attr, strlen(attr)); +} + +static void add_fd(fd_set *fds, int *maxfd, int fd) +{ + struct stat st; + if (fd < 0) + return; + if (fstat(fd, &st) == -1) { + dprintf("Invalid fd %d\n", fd); + return; + } + if (st.st_nlink == 0) { + dprintf("fd %d was deleted\n", fd); + return; + } + if (fd > *maxfd) + *maxfd = fd; + FD_SET(fd, fds); +} + +static int read_attr(char *buf, int len, int fd) +{ + int n; + + if (fd < 0) { + buf[0] = 0; + return 0; + } + lseek(fd, 0, 0); + n = read(fd, buf, len - 1); + + if (n <= 0) { + buf[0] = 0; + return 0; + } + buf[n] = 0; + if (buf[n-1] == '\n') + buf[n-1] = 0; + return n; +} + +static void read_resync_start(int fd, unsigned long long *v) +{ + char buf[30]; + int n; + + n = read_attr(buf, 30, fd); + if (n <= 0) { + dprintf("Failed to read resync_start (%d)\n", fd); + return; + } + if (strncmp(buf, "none", 4) == 0) + *v = MaxSector; + else + *v = strtoull(buf, NULL, 10); +} + +static unsigned long long read_sync_completed(int fd) +{ + unsigned long long val; + char buf[50]; + int n; + char *ep; + + n = read_attr(buf, 50, fd); + + if (n <= 0) + return 0; + buf[n] = 0; + val = strtoull(buf, &ep, 0); + if (ep == buf || (*ep != 0 && *ep != '\n' && *ep != ' ')) + return 0; + return val; +} + +static enum array_state read_state(int fd) +{ + char buf[20]; + int n = read_attr(buf, 20, fd); + + if (n <= 0) + return bad_word; + return 
(enum array_state) sysfs_match_word(buf, array_states); +} + +static enum sync_action read_action( int fd) +{ + char buf[20]; + int n = read_attr(buf, 20, fd); + + if (n <= 0) + return bad_action; + return (enum sync_action) sysfs_match_word(buf, sync_actions); +} + +int read_dev_state(int fd) +{ + char buf[60]; + int n = read_attr(buf, 60, fd); + char *cp; + int rv = 0; + + if (n <= 0) + return 0; + + cp = buf; + while (cp) { + if (sysfs_attr_match(cp, "faulty")) + rv |= DS_FAULTY; + if (sysfs_attr_match(cp, "in_sync")) + rv |= DS_INSYNC; + if (sysfs_attr_match(cp, "write_mostly")) + rv |= DS_WRITE_MOSTLY; + if (sysfs_attr_match(cp, "spare")) + rv |= DS_SPARE; + if (sysfs_attr_match(cp, "blocked")) + rv |= DS_BLOCKED; + cp = strchr(cp, ','); + if (cp) + cp++; + } + return rv; +} + +static void signal_manager(void) +{ + /* tgkill(getpid(), mon_tid, SIGUSR1); */ + int pid = getpid(); + syscall(SYS_tgkill, pid, mgr_tid, SIGUSR1); +} + +/* Monitor a set of active md arrays - all of which share the + * same metadata - and respond to events that require + * metadata update. + * + * New arrays are detected by another thread which allocates + * required memory and attaches the data structure to our list. + * + * Events: + * Array stops. + * This is detected by array_state going to 'clear' or 'inactive'. + * while we thought it was active. + * Response is to mark metadata as clean and 'clear' the array(??) + * write-pending + * array_state if 'write-pending' + * We mark metadata as 'dirty' then set array to 'active'. + * active_idle + * Either ignore, or mark clean, then mark metadata as clean. + * + * device fails + * detected by rd-N/state reporting "faulty" + * mark device as 'failed' in metadata, let the kernel release the + * device by writing '-blocked' to rd/state, and finally write 'remove' to + * rd/state. Before a disk can be replaced it must be failed and removed + * from all container members, this will be preemptive for the other + * arrays... safe? 
+ *
+ *  sync completes
+ *    sync_action was 'resync' and becomes 'idle' and resync_start becomes
+ *    MaxSector
+ *    Notify metadata that sync is complete.
+ *
+ *  recovery completes
+ *    sync_action changes from 'recover' to 'idle'
+ *    Check each device state and mark metadata if 'faulty' or 'in_sync'.
+ *
+ *  deal with resync
+ *    This only happens on finding a new array...  mdadm will have set
+ *    'resync_start' to the correct value.  If 'resync_start' indicates that a
+ *    resync needs to occur set the array to the 'active' state rather than the
+ *    initial read-auto state.
+ *
+ *
+ *
+ * We wait for a change (poll/select) on array_state, sync_action, and
+ * each rd-X/state file.
+ * When we get any change, we check everything.  So read each state file,
+ * then decide what to do.
+ *
+ * The core action is to write new metadata to all devices in the array.
+ * This is done at most once on any wakeup.
+ * After that we might:
+ *   - update the array_state
+ *   - set the role of some devices.
+ * - request a sync_action + * + */ + +#define ARRAY_DIRTY 1 +#define ARRAY_BUSY 2 +static int read_and_act(struct active_array *a) +{ + unsigned long long sync_completed; + int check_degraded = 0; + int check_reshape = 0; + int deactivate = 0; + struct mdinfo *mdi; + int ret = 0; + int count = 0; + struct timeval tv; + + a->next_state = bad_word; + a->next_action = bad_action; + + a->curr_state = read_state(a->info.state_fd); + a->curr_action = read_action(a->action_fd); + if (a->curr_state != clear) + /* + * In "clear" state, resync_start may wrongly be set to "0" + * when the kernel called md_clean but didn't remove the + * sysfs attributes yet + */ + read_resync_start(a->resync_start_fd, &a->info.resync_start); + sync_completed = read_sync_completed(a->sync_completed_fd); + for (mdi = a->info.devs; mdi ; mdi = mdi->next) { + mdi->next_state = 0; + mdi->curr_state = 0; + if (mdi->state_fd >= 0) { + read_resync_start(mdi->recovery_fd, + &mdi->recovery_start); + mdi->curr_state = read_dev_state(mdi->state_fd); + } + } + + gettimeofday(&tv, NULL); + dprintf("(%d): %ld.%06ld state:%s prev:%s action:%s prev: %s start:%llu\n", + a->info.container_member, + tv.tv_sec, tv.tv_usec, + array_states[a->curr_state], + array_states[a->prev_state], + sync_actions[a->curr_action], + sync_actions[a->prev_action], + a->info.resync_start + ); + + if ((a->curr_state == bad_word || a->curr_state <= inactive) && + a->prev_state > inactive) { + /* array has been stopped */ + a->container->ss->set_array_state(a, 1); + a->next_state = clear; + deactivate = 1; + } + if (a->curr_state == write_pending) { + a->container->ss->set_array_state(a, 0); + a->next_state = active; + ret |= ARRAY_DIRTY; + } + if (a->curr_state == active_idle) { + /* Set array to 'clean' FIRST, then mark clean + * in the metadata + */ + a->next_state = clean; + ret |= ARRAY_DIRTY; + } + if (a->curr_state == clean) { + a->container->ss->set_array_state(a, 1); + } + if (a->curr_state == active || + a->curr_state == 
suspended) + ret |= ARRAY_DIRTY; + if (a->curr_state == readonly) { + /* Well, I'm ready to handle things. If readonly + * wasn't requested, transition to read-auto. + */ + char buf[64]; + read_attr(buf, sizeof(buf), a->metadata_fd); + if (strncmp(buf, "external:-", 10) == 0) { + /* explicit request for readonly array. Leave it alone */ + ; + } else { + if (a->container->ss->set_array_state(a, 2)) + a->next_state = read_auto; /* array is clean */ + else { + a->next_state = active; /* Now active for recovery etc */ + ret |= ARRAY_DIRTY; + } + } + } + + if (!deactivate && + a->curr_action == idle && + a->prev_action == resync) { + /* A resync has finished. The endpoint is recorded in + * 'sync_start'. We don't update the metadata + * until the array goes inactive or readonly though. + * Just check if we need to fiddle spares. + */ + a->container->ss->set_array_state(a, a->curr_state <= clean); + check_degraded = 1; + } + + if (!deactivate && + a->curr_action == idle && + a->prev_action == recover) { + /* A recovery has finished. Some disks may be in sync now, + * and the array may no longer be degraded + */ + for (mdi = a->info.devs ; mdi ; mdi = mdi->next) { + a->container->ss->set_disk(a, mdi->disk.raid_disk, + mdi->curr_state); + if (! (mdi->curr_state & DS_INSYNC)) + check_degraded = 1; + count++; + } + if (count != a->info.array.raid_disks) + check_degraded = 1; + } + + if (!deactivate && + a->curr_action == reshape && + a->prev_action != reshape) + /* reshape was requested by mdadm. Need to see if + * new devices have been added. Manager does that + * when it sees check_reshape + */ + check_reshape = 1; + + /* Check for failures and if found: + * 1/ Record the failure in the metadata and unblock the device. + * FIXME update the kernel to stop notifying on failed drives when + * the array is readonly and we have cleared 'blocked' + * 2/ Try to remove the device if the array is writable, or can be + * made writable. 
+ */ + for (mdi = a->info.devs ; mdi ; mdi = mdi->next) { + if (mdi->curr_state & DS_FAULTY) { + a->container->ss->set_disk(a, mdi->disk.raid_disk, + mdi->curr_state); + check_degraded = 1; + if (mdi->curr_state & DS_BLOCKED) + mdi->next_state |= DS_UNBLOCK; + if (a->curr_state == read_auto) { + a->container->ss->set_array_state(a, 0); + a->next_state = active; + } + if (a->curr_state > readonly) + mdi->next_state |= DS_REMOVE; + } + } + + /* Check for recovery checkpoint notifications. We need to be a + * minimum distance away from the last checkpoint to prevent + * over checkpointing. Note reshape checkpointing is handled + * in the second branch. + */ + if (sync_completed > a->last_checkpoint && + sync_completed - a->last_checkpoint > a->info.component_size >> 4 && + a->curr_action > reshape) { + /* A (non-reshape) sync_action has reached a checkpoint. + * Record the updated position in the metadata + */ + a->last_checkpoint = sync_completed; + a->container->ss->set_array_state(a, a->curr_state <= clean); + } else if ((a->curr_action == idle && a->prev_action == reshape) || + (a->curr_action == reshape + && sync_completed > a->last_checkpoint) ) { + /* Reshape has progressed or completed so we need to + * update the array state - and possibly the array size + */ + if (sync_completed != 0) + a->last_checkpoint = sync_completed; + /* We might need to update last_checkpoint depending on + * the reason that reshape finished. 
+ * if array reshape is really finished: + * set check point to the end, this allows + * set_array_state() to finalize reshape in metadata + * if reshape if broken: do not set checkpoint to the end + * this allows for reshape restart from checkpoint + */ + if ((a->curr_action != reshape) && + (a->prev_action == reshape)) { + char buf[40]; + if ((sysfs_get_str(&a->info, NULL, + "reshape_position", + buf, + sizeof(buf)) >= 0) && + strncmp(buf, "none", 4) == 0) + a->last_checkpoint = a->info.component_size; + } + a->container->ss->set_array_state(a, a->curr_state <= clean); + a->last_checkpoint = sync_completed; + } + + if (sync_completed > a->last_checkpoint) + a->last_checkpoint = sync_completed; + + a->container->ss->sync_metadata(a->container); + dprintf("(%d): state:%s action:%s next(", a->info.container_member, + array_states[a->curr_state], sync_actions[a->curr_action]); + + /* Effect state changes in the array */ + if (a->next_state != bad_word) { + dprintf_cont(" state:%s", array_states[a->next_state]); + write_attr(array_states[a->next_state], a->info.state_fd); + } + if (a->next_action != bad_action) { + write_attr(sync_actions[a->next_action], a->action_fd); + dprintf_cont(" action:%s", sync_actions[a->next_action]); + } + for (mdi = a->info.devs; mdi ; mdi = mdi->next) { + if (mdi->next_state & DS_UNBLOCK) { + dprintf_cont(" %d:-blocked", mdi->disk.raid_disk); + write_attr("-blocked", mdi->state_fd); + } + + if ((mdi->next_state & DS_REMOVE) && mdi->state_fd >= 0) { + int remove_result; + + /* The kernel may not be able to immediately remove the + * disk. In that case we wait a little while and + * try again. 
+ */ + remove_result = write_attr("remove", mdi->state_fd); + if (remove_result > 0) { + dprintf_cont(" %d:removed", mdi->disk.raid_disk); + close(mdi->state_fd); + close(mdi->recovery_fd); + mdi->state_fd = -1; + } else + ret |= ARRAY_BUSY; + } + if (mdi->next_state & DS_INSYNC) { + write_attr("+in_sync", mdi->state_fd); + dprintf_cont(" %d:+in_sync", mdi->disk.raid_disk); + } + } + dprintf_cont(" )\n"); + + /* move curr_ to prev_ */ + a->prev_state = a->curr_state; + + a->prev_action = a->curr_action; + + for (mdi = a->info.devs; mdi ; mdi = mdi->next) { + mdi->prev_state = mdi->curr_state; + mdi->next_state = 0; + } + + if (check_degraded || check_reshape) { + /* manager will do the actual check */ + if (check_degraded) + a->check_degraded = 1; + if (check_reshape) + a->check_reshape = 1; + signal_manager(); + } + + if (deactivate) + a->container = NULL; + + return ret; +} + +static struct mdinfo * +find_device(struct active_array *a, int major, int minor) +{ + struct mdinfo *mdi; + + for (mdi = a->info.devs ; mdi ; mdi = mdi->next) + if (mdi->disk.major == major && mdi->disk.minor == minor) + return mdi; + + return NULL; +} + +static void reconcile_failed(struct active_array *aa, struct mdinfo *failed) +{ + struct active_array *a; + struct mdinfo *victim; + + for (a = aa; a; a = a->next) { + if (!a->container || a->to_remove) + continue; + victim = find_device(a, failed->disk.major, failed->disk.minor); + if (!victim) + continue; + + if (!(victim->curr_state & DS_FAULTY)) + write_attr("faulty", victim->state_fd); + } +} + +#ifdef DEBUG +static void dprint_wake_reasons(fd_set *fds) +{ + int i; + char proc_path[256]; + char link[256]; + char *basename; + int rv; + + fprintf(stderr, "monitor: wake ( "); + for (i = 0; i < FD_SETSIZE; i++) { + if (FD_ISSET(i, fds)) { + sprintf(proc_path, "/proc/%d/fd/%d", + (int) getpid(), i); + + rv = readlink(proc_path, link, sizeof(link) - 1); + if (rv < 0) { + fprintf(stderr, "%d:unknown ", i); + continue; + } + link[rv] = '\0'; 
+ basename = strrchr(link, '/'); + fprintf(stderr, "%d:%s ", + i, basename ? ++basename : link); + } + } + fprintf(stderr, ")\n"); +} +#endif + +int monitor_loop_cnt; + +static int wait_and_act(struct supertype *container, int nowait) +{ + fd_set rfds; + int maxfd = 0; + struct active_array **aap = &container->arrays; + struct active_array *a, **ap; + int rv; + struct mdinfo *mdi; + static unsigned int dirty_arrays = ~0; /* start at some non-zero value */ + + FD_ZERO(&rfds); + + for (ap = aap ; *ap ;) { + a = *ap; + /* once an array has been deactivated we want to + * ask the manager to discard it. + */ + if (!a->container || a->to_remove) { + if (discard_this) { + ap = &(*ap)->next; + continue; + } + *ap = a->next; + a->next = NULL; + discard_this = a; + signal_manager(); + continue; + } + + add_fd(&rfds, &maxfd, a->info.state_fd); + add_fd(&rfds, &maxfd, a->action_fd); + add_fd(&rfds, &maxfd, a->sync_completed_fd); + for (mdi = a->info.devs ; mdi ; mdi = mdi->next) + add_fd(&rfds, &maxfd, mdi->state_fd); + + ap = &(*ap)->next; + } + + if (manager_ready && (*aap == NULL || (sigterm && !dirty_arrays))) { + /* No interesting arrays, or we have been told to + * terminate and everything is clean. Lets see about + * exiting. Note that blocking at this point is not a + * problem as there are no active arrays, there is + * nothing that we need to be ready to do. + */ + int fd; + if (sigterm) + fd = open_dev_excl(container->devnm); + else + fd = open_dev_flags(container->devnm, O_RDONLY|O_EXCL); + if (fd >= 0 || errno != EBUSY) { + /* OK, we are safe to leave */ + if (sigterm && !dirty_arrays) + dprintf("caught sigterm, all clean... exiting\n"); + else + dprintf("no arrays to monitor... 
exiting\n"); + if (!sigterm) + /* On SIGTERM, someone (the take-over mdmon) will + * clean up + */ + remove_pidfile(container->devnm); + exit_now = 1; + signal_manager(); + close(fd); + exit(0); + } + } + + if (!nowait) { + sigset_t set; + struct timespec ts; + ts.tv_sec = 24*3600; + ts.tv_nsec = 0; + if (*aap == NULL || container->retry_soon) { + /* just waiting to get O_EXCL access */ + ts.tv_sec = 0; + ts.tv_nsec = 20000000ULL; + } + sigprocmask(SIG_UNBLOCK, NULL, &set); + sigdelset(&set, SIGUSR1); + monitor_loop_cnt |= 1; + rv = pselect(maxfd+1, NULL, NULL, &rfds, &ts, &set); + monitor_loop_cnt += 1; + if (rv == -1) { + if (errno == EINTR) { + rv = 0; + dprintf("monitor: caught signal\n"); + } else + dprintf("monitor: error %d in pselect\n", + errno); + } + #ifdef DEBUG + else + dprint_wake_reasons(&rfds); + #endif + container->retry_soon = 0; + } + + if (update_queue) { + struct metadata_update *this; + + for (this = update_queue; this ; this = this->next) + container->ss->process_update(container, this); + + update_queue_handled = update_queue; + update_queue = NULL; + signal_manager(); + container->ss->sync_metadata(container); + } + + rv = 0; + dirty_arrays = 0; + for (a = *aap; a ; a = a->next) { + + if (a->replaces && !discard_this) { + struct active_array **ap; + for (ap = &a->next; *ap && *ap != a->replaces; + ap = & (*ap)->next) + ; + if (*ap) + *ap = (*ap)->next; + discard_this = a->replaces; + a->replaces = NULL; + /* FIXME check if device->state_fd need to be cleared?*/ + signal_manager(); + } + if (a->container && !a->to_remove) { + int ret = read_and_act(a); + rv |= 1; + dirty_arrays += !!(ret & ARRAY_DIRTY); + /* when terminating stop manipulating the array after it + * is clean, but make sure read_and_act() is given a + * chance to handle 'active_idle' + */ + if (sigterm && !(ret & ARRAY_DIRTY)) + a->container = NULL; /* stop touching this array */ + if (ret & ARRAY_BUSY) + container->retry_soon = 1; + } + } + + /* propagate failures across 
container members */ + for (a = *aap; a ; a = a->next) { + if (!a->container || a->to_remove) + continue; + for (mdi = a->info.devs ; mdi ; mdi = mdi->next) + if (mdi->curr_state & DS_FAULTY) + reconcile_failed(*aap, mdi); + } + + return rv; +} + +void do_monitor(struct supertype *container) +{ + int rv; + int first = 1; + do { + rv = wait_and_act(container, first); + first = 0; + } while (rv >= 0); +} diff --git a/msg.c b/msg.c new file mode 100644 index 00000000..45cd4504 --- /dev/null +++ b/msg.c @@ -0,0 +1,475 @@ +/* + * Copyright (C) 2008 Intel Corporation + * + * mdmon socket / message handling + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. + */ +#ifndef _GNU_SOURCE +#define _GNU_SOURCE +#endif +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "mdadm.h" +#include "mdmon.h" + +static const __u32 start_magic = 0x5a5aa5a5; +static const __u32 end_magic = 0xa5a55a5a; + +static int send_buf(int fd, const void* buf, int len, int tmo) +{ + fd_set set; + int rv; + struct timeval timeout = {tmo, 0}; + struct timeval *ptmo = tmo ? 
&timeout : NULL;
+	/* keep going until all 'len' bytes are out; a select()/write() error or timeout gives -1 */
+	while (len) {
+		FD_ZERO(&set);
+		FD_SET(fd, &set);
+		rv = select(fd+1, NULL, &set, NULL, ptmo);
+		if (rv <= 0)
+			return -1;
+		rv = write(fd, buf, len);
+		if (rv <= 0)
+			return -1;
+		len -= rv;
+		buf += rv;
+	}
+	return 0;
+}
+
+static int recv_buf(int fd, void* buf, int len, int tmo)
+{
+	fd_set set;
+	int rv;
+	struct timeval timeout = {tmo, 0};
+	struct timeval *ptmo = tmo ? &timeout : NULL;
+	/* read exactly 'len' bytes; with tmo == 0 select() is called without a timeout */
+	while (len) {
+		FD_ZERO(&set);
+		FD_SET(fd, &set);
+		rv = select(fd+1, &set, NULL, NULL, ptmo);
+		if (rv <= 0)
+			return -1;
+		rv = read(fd, buf, len);
+		if (rv <= 0)
+			return -1;
+		len -= rv;
+		buf += rv;
+	}
+	return 0;
+}
+
+int send_message(int fd, struct metadata_update *msg, int tmo)
+{
+	__s32 len = msg->len;
+	int rv;
+	/* frame: start_magic, 32-bit len, payload (only when len > 0), end_magic; 0 on success */
+	rv = send_buf(fd, &start_magic, 4, tmo);
+	rv = rv ?: send_buf(fd, &len, 4, tmo);
+	if (len > 0)
+		rv = rv ?: send_buf(fd, msg->buf, msg->len, tmo);
+	rv = rv ?: send_buf(fd, &end_magic, 4, tmo);
+	/* rv is the first failure: a trailer that goes out fine must not hide an earlier error */
+	return rv;
+}
+
+int receive_message(int fd, struct metadata_update *msg, int tmo)
+{
+	__u32 magic;
+	__s32 len;
+	int rv;
+	/* counterpart of send_message(); on success msg->buf is xmalloc'd, or NULL when len <= 0 */
+	rv = recv_buf(fd, &magic, 4, tmo);
+	if (rv < 0 || magic != start_magic)
+		return -1;
+	rv = recv_buf(fd, &len, 4, tmo);
+	if (rv < 0 || len > MSG_MAX_LEN)
+		return -1;
+	if (len > 0) {
+		msg->buf = xmalloc(len);
+		rv = recv_buf(fd, msg->buf, len, tmo);
+		if (rv < 0) {
+			free(msg->buf);
+			return -1;
+		}
+	} else
+		msg->buf = NULL;
+	rv = recv_buf(fd, &magic, 4, tmo);
+	if (rv < 0 || magic != end_magic) {
+		free(msg->buf);
+		return -1;
+	}
+	msg->len = len;
+	return 0;
+}
+
+int ack(int fd, int tmo)
+{
+	struct metadata_update msg = { .len = 0 };
+	/* an acknowledgement is a message with a zero-length payload */
+	return send_message(fd, &msg, tmo);
+}
+
+int wait_reply(int fd, int tmo)
+{
+	struct metadata_update msg;
+	int err = receive_message(fd, &msg, tmo);
+
+	/* mdmon sent extra data, but caller only cares that we got a
+	 * successful reply
+	 */
+	if (err == 0 && msg.len > 0)
+		free(msg.buf);
+
+	return err;
+}
+
+int connect_monitor(char *devname)
+{
+	char path[100];
+	int sfd;
+	long fl;
+	struct sockaddr_un addr;
+	int pos;
+	char *c;
+	/* build "<MDMON_DIR>/<name>.sock"; for a subarray <name> is the text between the first char and the next '/' */
+	pos = sprintf(path, "%s/", MDMON_DIR);
+	if (is_subarray(devname)) {
+		devname++;
+		c = strchr(devname, '/');
+		if (!c)
+			return -1;
+		snprintf(&path[pos], c - devname + 1, "%s", devname);
+		pos += c - devname;
+	} else
+		pos += sprintf(&path[pos], "%s", devname);
+	sprintf(&path[pos], ".sock");
+
+	sfd = socket(PF_LOCAL, SOCK_STREAM, 0);
+	if (sfd < 0)
+		return -1;
+
+	addr.sun_family = PF_LOCAL;
+	strcpy(addr.sun_path, path);
+	if (connect(sfd, (struct sockaddr*)&addr, sizeof(addr)) < 0) {
+		close(sfd);
+		return -1;
+	}
+	/* once connected, switch the socket to non-blocking mode */
+	fl = fcntl(sfd, F_GETFL, 0);
+	fl |= O_NONBLOCK;
+	fcntl(sfd, F_SETFL, fl);
+
+	return sfd;
+}
+
+int fping_monitor(int sfd)
+{
+	int err = 0;
+	/* ping over an already-open socket; a negative descriptor is handed straight back */
+	if (sfd < 0)
+		return sfd;
+
+	/* try to ping existing socket */
+	if (ack(sfd, 20) != 0)
+		err = -1;
+
+	/* check the reply */
+	if (!err && wait_reply(sfd, 20) != 0)
+		err = -1;
+
+	return err;
+}
+
+/* give the monitor a chance to update the metadata */
+int ping_monitor(char *devname)
+{
+	int sfd = connect_monitor(devname);
+	int err;
+
+	if (sfd >= 0) {
+		err = fping_monitor(sfd);
+		close(sfd);
+	} else
+		err = -1;
+
+	return err;
+}
+
+static char *ping_monitor_version(char *devname)
+{
+	int sfd = connect_monitor(devname);
+	struct metadata_update msg;
+	int err = 0;
+	/* like ping_monitor(), but return the reply payload (msg.buf); NULL on failure or empty reply */
+	if (sfd < 0)
+		return NULL;
+
+	if (ack(sfd, 20) != 0)
+		err = -1;
+
+	if (!err && receive_message(sfd, &msg, 20) != 0)
+		err = -1;
+
+	close(sfd);
+	/* msg is only inspected when receive_message() succeeded (err is tested first) */
+	if (err || !msg.len || !msg.buf)
+		return NULL;
+	return msg.buf;
+}
+
+int unblock_subarray(struct mdinfo *sra, const int unfreeze)
+{
+	char buf[64];
+	int rc = 0;
+	/* write "external:" + text_version with buf[9] forced to '/'; a NULL sra always yields -1 */
+	if (sra) {
+		sprintf(buf, "external:%s\n", sra->text_version);
+		buf[9] = '/';
+	} else
+		buf[9] = '-';
+	/* with 'unfreeze' set, also write "idle" to sync_action when that attribute is available */
+	if (buf[9] == '-' ||
+	    sysfs_set_str(sra, NULL, "metadata_version", buf) ||
+	    (unfreeze &&
+	     sysfs_attribute_available(sra, NULL, "sync_action") &&
+	     sysfs_set_str(sra, NULL, "sync_action", "idle")))
+		rc = -1;
+	return rc;
+}
+
+int block_subarray(struct mdinfo *sra)
+{
+	char buf[64];
+	int rc = 0;
+	/* write "external:" + text_version to metadata_version with buf[9] forced to '-' */
+	sprintf(buf, "external:%s\n", sra->text_version);
+	buf[9] = '-';
+	if (sysfs_set_str(sra, NULL, "metadata_version", buf))
+		rc = -1;
+
+	return rc;
+}
+
+/* check that the running mdmon, if any, is new enough to support the
+ * array blocking mechanism: 0 if so (or if no mdmon is running), else -1
+ */
+int check_mdmon_version(char *container)
+{
+	char *version = NULL;
+
+	if (!mdmon_running(container)) {
+		/* if mdmon is not active we assume that any instance that is
+		 * later started will match the current mdadm version, if this
+		 * assumption is violated we may inadvertently rebuild an array
+		 * that was meant for reshape, or start rebuild on a spare that
+		 * was to be moved to another container
+		 */
+		/* pass */;
+	} else {
+		int ver;
+		/* no reply (NULL version) counts as "too old": ver becomes -1 */
+		version = ping_monitor_version(container);
+		ver = version ? mdadm_version(version) : -1;
+		free(version);
+		if (ver < 3002000) {
+			pr_err("mdmon instance for %s cannot be disabled\n",
+			       container);
+			return -1;
+		}
+	}
+
+	return 0;
+}
+
+/**
+ * block_monitor - prevent mdmon spare assignment
+ * @container - container to block
+ * @freeze - flag to additionally freeze sync_action
+ *
+ * This is used by the reshape code to freeze the container, and the
+ * auto-rebuild implementation to atomically move spares.
+ * In both cases we need to stop mdmon from assigning spares to replace
+ * failed devices as we might have other plans for the spare.
+ * For the reshape case we also need to 'freeze' sync_action so that
+ * no recovery happens until we have fully prepared for the reshape.
+ *
+ * We tell mdmon that the array is frozen by marking the 'metadata' name
+ * with a leading '-'.  This previously told mdmon "Don't make this array
+ * read/write, leave it readonly".  Now it means a more general "Don't
+ * reconfigure this array at all".
+ * As older versions of mdmon (which might run from initrd) don't understand
+ * this, we first check that the running mdmon is new enough.
+ */ +int block_monitor(char *container, const int freeze) +{ + struct mdstat_ent *ent, *e, *e2; + struct mdinfo *sra = NULL; + char buf[64]; + int rv = 0; + + if (check_mdmon_version(container)) + return -1; + + ent = mdstat_read(0, 0); + if (!ent) { + pr_err("failed to read /proc/mdstat while disabling mdmon\n"); + return -1; + } + + /* freeze container contents */ + for (e = ent; e; e = e->next) { + if (!is_container_member(e, container)) + continue; + sysfs_free(sra); + sra = sysfs_read(-1, e->devnm, GET_VERSION); + if (!sra) { + pr_err("failed to read sysfs for subarray%s\n", + to_subarray(e, container)); + break; + } + /* can't reshape an array that we can't monitor */ + if (sra->text_version[0] == '-') + break; + + if (freeze && sysfs_freeze_array(sra) < 1) + break; + /* flag this array to not be modified by mdmon (close race with + * takeover in reshape case and spare reassignment in the + * auto-rebuild case) + */ + if (block_subarray(sra)) + break; + ping_monitor(container); + + /* check that we did not race with recovery */ + if ((freeze && + !sysfs_attribute_available(sra, NULL, "sync_action")) || + (freeze && + sysfs_attribute_available(sra, NULL, "sync_action") && + sysfs_get_str(sra, NULL, "sync_action", buf, 20) > 0 && + strcmp(buf, "frozen\n") == 0)) + /* pass */; + else { + unblock_subarray(sra, 0); + break; + } + /* Double check against races - there should be no spares + * or part-spares + */ + sysfs_free(sra); + sra = sysfs_read(-1, e->devnm, GET_DEVS | GET_STATE); + if (sra && sra->array.spare_disks > 0) { + unblock_subarray(sra, freeze); + break; + } + } + + if (e) { + pr_err("failed to freeze subarray%s\n", + to_subarray(e, container)); + + /* thaw the partially frozen container */ + for (e2 = ent; e2 && e2 != e; e2 = e2->next) { + if (!is_container_member(e2, container)) + continue; + sysfs_free(sra); + sra = sysfs_read(-1, e2->devnm, GET_VERSION); + if (unblock_subarray(sra, freeze)) + pr_err("Failed to unfreeze %s\n", e2->devnm); + } + + 
ping_monitor(container); /* cleared frozen */ + rv = -1; + } + + sysfs_free(sra); + free_mdstat(ent); + + return rv; +} + +void unblock_monitor(char *container, const int unfreeze) +{ + struct mdstat_ent *ent, *e; + struct mdinfo *sra = NULL; + int to_ping = 0; + + ent = mdstat_read(0, 0); + if (!ent) { + pr_err("failed to read /proc/mdstat while unblocking container\n"); + return; + } + + /* unfreeze container contents */ + for (e = ent; e; e = e->next) { + if (!is_container_member(e, container)) + continue; + sysfs_free(sra); + sra = sysfs_read(-1, e->devnm, GET_VERSION|GET_LEVEL); + if (!sra) + continue; + if (sra->array.level > 0) + to_ping++; + if (unblock_subarray(sra, unfreeze)) + pr_err("Failed to unfreeze %s\n", e->devnm); + } + if (to_ping) + ping_monitor(container); + + sysfs_free(sra); + free_mdstat(ent); +} + +/* give the manager a chance to view the updated container state. This + * would naturally happen due to the manager noticing a change in + * /proc/mdstat; however, pinging encourages this detection to happen + * while an exclusive open() on the container is active + */ +int ping_manager(char *devname) +{ + int sfd = connect_monitor(devname); + struct metadata_update msg = { .len = -1 }; + int err = 0; + + if (sfd < 0) + return sfd; + + err = send_message(sfd, &msg, 20); + + /* check the reply */ + if (!err && wait_reply(sfd, 20) != 0) + err = -1; + + close(sfd); + return err; +} + +/* using takeover operation for grow purposes, mdadm has to be sure + * that mdmon processes all updates, and if necessary it will be closed + * at takeover to raid0 operation + */ +void flush_mdmon(char *container) +{ + ping_manager(container); + ping_monitor(container); +} diff --git a/msg.h b/msg.h new file mode 100644 index 00000000..016612cd --- /dev/null +++ b/msg.h @@ -0,0 +1,37 @@ +/* + * Copyright (C) 2008 Intel Corporation + * + * mdmon socket / message handling + * + * This program is free software; you can redistribute it and/or modify it + * under the 
terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. + */ + +struct mdinfo; +struct metadata_update; + +extern int receive_message(int fd, struct metadata_update *msg, int tmo); +extern int send_message(int fd, struct metadata_update *msg, int tmo); +extern int ack(int fd, int tmo); +extern int wait_reply(int fd, int tmo); +extern int connect_monitor(char *devname); +extern int ping_monitor(char *devname); +extern int block_subarray(struct mdinfo *sra); +extern int unblock_subarray(struct mdinfo *sra, const int unfreeze); +extern int block_monitor(char *container, const int freeze); +extern void unblock_monitor(char *container, const int unfreeze); +extern int fping_monitor(int sock); +extern int ping_manager(char *devname); +extern void flush_mdmon(char *container); + +#define MSG_MAX_LEN (4*1024*1024) diff --git a/part.h b/part.h new file mode 100644 index 00000000..862a14c3 --- /dev/null +++ b/part.h @@ -0,0 +1,79 @@ +/* + * mdadm - manage Linux "md" devices aka RAID arrays. + * + * Copyright (C) 2010 Neil Brown + * + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Author: Neil Brown + * Email: + * + */ + +/* Structure definitions ext for MBR and GPT partition tables + */ + +#define MBR_SIGNATURE_MAGIC __cpu_to_le16(0xAA55) +#define MBR_PARTITIONS 4 + +struct MBR_part_record { + __u8 bootable; + __u8 first_head; + __u8 first_sector; + __u8 first_cyl; + __u8 part_type; + __u8 last_head; + __u8 last_sector; + __u8 last_cyl; + __u32 first_sect_lba; + __u32 blocks_num; +}; + +struct MBR { + __u8 pad[446]; + struct MBR_part_record parts[MBR_PARTITIONS]; + __u16 magic; +} __attribute__((packed)); + +#define GPT_SIGNATURE_MAGIC __cpu_to_le64(0x5452415020494645ULL) +#define MBR_GPT_PARTITION_TYPE 0xEE + +struct GPT_part_entry { + unsigned char type_guid[16]; + unsigned char partition_guid[16]; + __u64 starting_lba; + __u64 ending_lba; + unsigned char attr_bits[8]; + unsigned char name[72]; +} __attribute__((packed)); + +struct GPT { + __u64 magic; + __u32 revision; + __u32 header_size; + __u32 crc; + __u32 pad1; + __u64 current_lba; + __u64 backup_lba; + __u64 first_lba; + __u64 last_lba; + __u8 guid[16]; + __u64 part_start; + __u32 part_cnt; + __u32 part_size; + __u32 part_crc; + __u8 pad2[420]; +} __attribute__((packed)); diff --git a/platform-intel.c b/platform-intel.c new file mode 100644 index 00000000..88818f34 --- /dev/null +++ b/platform-intel.c @@ -0,0 +1,741 @@ +/* + * Intel(R) Matrix Storage Manager hardware and firmware support routines + * + * Copyright (C) 2008 Intel Corporation + * + * This program is free software; you can redistribute it and/or modify 
it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. + */ +#include "mdadm.h" +#include "platform-intel.h" +#include "probe_roms.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static int devpath_to_ll(const char *dev_path, const char *entry, + unsigned long long *val); + +static void free_sys_dev(struct sys_dev **list) +{ + while (*list) { + struct sys_dev *next = (*list)->next; + + if ((*list)->path) + free((*list)->path); + free(*list); + *list = next; + } +} + +struct sys_dev *find_driver_devices(const char *bus, const char *driver) +{ + /* search sysfs for devices driven by 'driver' */ + char path[292]; + char link[256]; + char *c; + DIR *driver_dir; + struct dirent *de; + struct sys_dev *head = NULL; + struct sys_dev *list = NULL; + struct sys_dev *vmd = NULL; + enum sys_dev_type type; + unsigned long long dev_id; + unsigned long long class; + + if (strcmp(driver, "isci") == 0) + type = SYS_DEV_SAS; + else if (strcmp(driver, "ahci") == 0) + type = SYS_DEV_SATA; + else if (strcmp(driver, "nvme") == 0) { + /* if looking for nvme devs, first look for vmd */ + vmd = find_driver_devices("pci", "vmd"); + type = SYS_DEV_NVME; + } else if (strcmp(driver, "vmd") == 0) + type = SYS_DEV_VMD; + else + type = SYS_DEV_UNKNOWN; + + sprintf(path, "/sys/bus/%s/drivers/%s", bus, driver); + driver_dir = opendir(path); + if (!driver_dir) { + if (vmd) + free_sys_dev(&vmd); + return NULL; 
+ } + for (de = readdir(driver_dir); de; de = readdir(driver_dir)) { + int n; + int skip = 0; + + /* is 'de' a device? check that the 'subsystem' link exists and + * that its target matches 'bus' + */ + sprintf(path, "/sys/bus/%s/drivers/%s/%s/subsystem", + bus, driver, de->d_name); + n = readlink(path, link, sizeof(link)); + if (n < 0 || n >= (int)sizeof(link)) + continue; + link[n] = '\0'; + c = strrchr(link, '/'); + if (!c) + continue; + if (strncmp(bus, c+1, strlen(bus)) != 0) + continue; + + sprintf(path, "/sys/bus/%s/drivers/%s/%s", + bus, driver, de->d_name); + + /* if searching for nvme - skip vmd connected one */ + if (type == SYS_DEV_NVME) { + struct sys_dev *dev; + char *rp = realpath(path, NULL); + for (dev = vmd; dev; dev = dev->next) { + if ((strncmp(dev->path, rp, strlen(dev->path)) == 0)) + skip = 1; + } + free(rp); + } + + /* if it's not Intel device or mark as VMD connected - skip it. */ + if (devpath_to_vendor(path) != 0x8086 || skip == 1) + continue; + + if (devpath_to_ll(path, "device", &dev_id) != 0) + continue; + + if (devpath_to_ll(path, "class", &class) != 0) + continue; + + /* start / add list entry */ + if (!head) { + head = xmalloc(sizeof(*head)); + list = head; + } else { + list->next = xmalloc(sizeof(*head)); + list = list->next; + } + + if (!list) { + free_sys_dev(&head); + break; + } + + list->dev_id = (__u16) dev_id; + list->class = (__u32) class; + list->type = type; + /* Each VMD device (domain) adds separate PCI bus, it is better to + * store path as a path to that bus (easier further determination which + * NVMe dev is connected to this particular VMD domain). 
+ */ + if (type == SYS_DEV_VMD) { + sprintf(path, "/sys/bus/%s/drivers/%s/%s/domain/device", + bus, driver, de->d_name); + } + list->path = realpath(path, NULL); + list->next = NULL; + if ((list->pci_id = strrchr(list->path, '/')) != NULL) + list->pci_id++; + } + closedir(driver_dir); + + if (vmd) { + if (list) + list->next = vmd; + else + head = vmd; + } + + return head; +} + +static struct sys_dev *intel_devices=NULL; +static time_t valid_time = 0; + +struct sys_dev *device_by_id(__u16 device_id) +{ + struct sys_dev *iter; + + for (iter = intel_devices; iter != NULL; iter = iter->next) + if (iter->dev_id == device_id) + return iter; + return NULL; +} + +static int devpath_to_ll(const char *dev_path, const char *entry, unsigned long long *val) +{ + char path[strlen(dev_path) + strlen(entry) + 2]; + int fd; + int n; + + sprintf(path, "%s/%s", dev_path, entry); + + fd = open(path, O_RDONLY); + if (fd < 0) + return -1; + n = sysfs_fd_get_ll(fd, val); + close(fd); + return n; +} + +__u16 devpath_to_vendor(const char *dev_path) +{ + char path[strlen(dev_path) + strlen("/vendor") + 1]; + char vendor[7]; + int fd; + __u16 id = 0xffff; + int n; + + sprintf(path, "%s/vendor", dev_path); + + fd = open(path, O_RDONLY); + if (fd < 0) + return 0xffff; + + n = read(fd, vendor, sizeof(vendor)); + if (n == sizeof(vendor)) { + vendor[n - 1] = '\0'; + id = strtoul(vendor, NULL, 16); + } + close(fd); + + return id; +} + +struct sys_dev *find_intel_devices(void) +{ + struct sys_dev *ahci, *isci, *nvme; + + if (valid_time > time(0) - 10) + return intel_devices; + + if (intel_devices) + free_sys_dev(&intel_devices); + + isci = find_driver_devices("pci", "isci"); + ahci = find_driver_devices("pci", "ahci"); + /* Searching for NVMe will return list of NVMe and VMD controllers */ + nvme = find_driver_devices("pci", "nvme"); + + if (!isci && !ahci) { + ahci = nvme; + } else if (!ahci) { + ahci = isci; + struct sys_dev *elem = ahci; + while (elem->next) + elem = elem->next; + elem->next = 
nvme; + } else { + struct sys_dev *elem = ahci; + while (elem->next) + elem = elem->next; + elem->next = isci; + while (elem->next) + elem = elem->next; + elem->next = nvme; + } + intel_devices = ahci; + valid_time = time(0); + return intel_devices; +} + +/* + * PCI Expansion ROM Data Structure Format */ +struct pciExpDataStructFormat { + __u8 ver[4]; + __u16 vendorID; + __u16 deviceID; + __u16 devListOffset; + __u16 pciDataStructLen; + __u8 pciDataStructRev; +} __attribute__ ((packed)); + +struct orom_entry *orom_entries; + +const struct orom_entry *get_orom_entry_by_device_id(__u16 dev_id) +{ + struct orom_entry *entry; + struct devid_list *devid; + + for (entry = orom_entries; entry; entry = entry->next) { + for (devid = entry->devid_list; devid; devid = devid->next) { + if (devid->devid == dev_id) + return entry; + } + } + + return NULL; +} + +const struct imsm_orom *get_orom_by_device_id(__u16 dev_id) +{ + const struct orom_entry *entry = get_orom_entry_by_device_id(dev_id); + + if (entry) + return &entry->orom; + + return NULL; +} + +static struct orom_entry *add_orom(const struct imsm_orom *orom) +{ + struct orom_entry *list; + struct orom_entry *prev = NULL; + + for (list = orom_entries; list; prev = list, list = list->next) + ; + + list = xmalloc(sizeof(struct orom_entry)); + list->orom = *orom; + list->devid_list = NULL; + list->next = NULL; + + if (prev == NULL) + orom_entries = list; + else + prev->next = list; + + return list; +} + +static void add_orom_device_id(struct orom_entry *entry, __u16 dev_id) +{ + struct devid_list *list; + struct devid_list *prev = NULL; + + for (list = entry->devid_list; list; prev = list, list = list->next) { + if (list->devid == dev_id) + return; + } + list = xmalloc(sizeof(struct devid_list)); + list->devid = dev_id; + list->next = NULL; + + if (prev == NULL) + entry->devid_list = list; + else + prev->next = list; +} + +static int scan(const void *start, const void *end, const void *data) +{ + int offset; + const struct 
imsm_orom *imsm_mem = NULL; + int len = (end - start); + struct pciExpDataStructFormat *ptr= (struct pciExpDataStructFormat *)data; + + if (data + 0x18 > end) { + dprintf("cannot find pciExpDataStruct \n"); + return 0; + } + + dprintf("ptr->vendorID: %lx __le16_to_cpu(ptr->deviceID): %lx \n", + (ulong) __le16_to_cpu(ptr->vendorID), + (ulong) __le16_to_cpu(ptr->deviceID)); + + if (__le16_to_cpu(ptr->vendorID) != 0x8086) + return 0; + + for (offset = 0; offset < len; offset += 4) { + const void *mem = start + offset; + + if ((memcmp(mem, IMSM_OROM_SIGNATURE, 4) == 0)) { + imsm_mem = mem; + break; + } + } + + if (!imsm_mem) + return 0; + + struct orom_entry *orom = add_orom(imsm_mem); + + /* only PciDataStructure with revision 3 and above supports devices list. */ + if (ptr->pciDataStructRev >= 3 && ptr->devListOffset) { + const __u16 *dev_list = (void *)ptr + ptr->devListOffset; + int i; + + for (i = 0; dev_list[i] != 0; i++) + add_orom_device_id(orom, dev_list[i]); + } else { + add_orom_device_id(orom, __le16_to_cpu(ptr->deviceID)); + } + + return 0; +} + +const struct imsm_orom *imsm_platform_test(struct sys_dev *hba) +{ + struct imsm_orom orom = { + .signature = IMSM_OROM_SIGNATURE, + .rlc = IMSM_OROM_RLC_RAID0 | IMSM_OROM_RLC_RAID1 | + IMSM_OROM_RLC_RAID10 | IMSM_OROM_RLC_RAID5, + .sss = IMSM_OROM_SSS_4kB | IMSM_OROM_SSS_8kB | + IMSM_OROM_SSS_16kB | IMSM_OROM_SSS_32kB | + IMSM_OROM_SSS_64kB | IMSM_OROM_SSS_128kB | + IMSM_OROM_SSS_256kB | IMSM_OROM_SSS_512kB | + IMSM_OROM_SSS_1MB | IMSM_OROM_SSS_2MB, + .dpa = IMSM_OROM_DISKS_PER_ARRAY, + .tds = IMSM_OROM_TOTAL_DISKS, + .vpa = IMSM_OROM_VOLUMES_PER_ARRAY, + .vphba = IMSM_OROM_VOLUMES_PER_HBA + }; + orom.attr = orom.rlc | IMSM_OROM_ATTR_ChecksumVerify; + + if (check_env("IMSM_TEST_OROM_NORAID5")) { + orom.rlc = IMSM_OROM_RLC_RAID0 | IMSM_OROM_RLC_RAID1 | + IMSM_OROM_RLC_RAID10; + } + if (check_env("IMSM_TEST_AHCI_EFI_NORAID5") && (hba->type == SYS_DEV_SAS)) { + orom.rlc = IMSM_OROM_RLC_RAID0 | IMSM_OROM_RLC_RAID1 | 
+ IMSM_OROM_RLC_RAID10; + } + if (check_env("IMSM_TEST_SCU_EFI_NORAID5") && (hba->type == SYS_DEV_SATA)) { + orom.rlc = IMSM_OROM_RLC_RAID0 | IMSM_OROM_RLC_RAID1 | + IMSM_OROM_RLC_RAID10; + } + + struct orom_entry *ret = add_orom(&orom); + + add_orom_device_id(ret, hba->dev_id); + + return &ret->orom; +} + +static const struct imsm_orom *find_imsm_hba_orom(struct sys_dev *hba) +{ + unsigned long align; + + if (check_env("IMSM_TEST_OROM")) + return imsm_platform_test(hba); + + /* return empty OROM capabilities in EFI test mode */ + if (check_env("IMSM_TEST_AHCI_EFI") || check_env("IMSM_TEST_SCU_EFI")) + return NULL; + + find_intel_devices(); + + if (intel_devices == NULL) + return NULL; + + /* scan option-rom memory looking for an imsm signature */ + if (check_env("IMSM_SAFE_OROM_SCAN")) + align = 2048; + else + align = 512; + if (probe_roms_init(align) != 0) + return NULL; + probe_roms(); + /* ignore return value - True is returned if both adapater roms are found */ + scan_adapter_roms(scan); + probe_roms_exit(); + + return get_orom_by_device_id(hba->dev_id); +} + +#define GUID_STR_MAX 37 /* according to GUID format: + * xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx" */ + +#define EFI_GUID(a, b, c, d0, d1, d2, d3, d4, d5, d6, d7) \ +((struct efi_guid) \ +{{ (a) & 0xff, ((a) >> 8) & 0xff, ((a) >> 16) & 0xff, ((a) >> 24) & 0xff, \ + (b) & 0xff, ((b) >> 8) & 0xff, \ + (c) & 0xff, ((c) >> 8) & 0xff, \ + (d0), (d1), (d2), (d3), (d4), (d5), (d6), (d7) }}) + +#define SYS_EFI_VAR_PATH "/sys/firmware/efi/vars" +#define SYS_EFIVARS_PATH "/sys/firmware/efi/efivars" +#define SCU_PROP "RstScuV" +#define AHCI_PROP "RstSataV" +#define AHCI_SSATA_PROP "RstsSatV" +#define AHCI_CSATA_PROP "RstCSatV" +#define VMD_PROP "RstUefiV" + +#define VENDOR_GUID \ + EFI_GUID(0x193dfefa, 0xa445, 0x4302, 0x99, 0xd8, 0xef, 0x3a, 0xad, 0x1a, 0x04, 0xc6) + +#define PCI_CLASS_RAID_CNTRL 0x010400 + +static int read_efi_var(void *buffer, ssize_t buf_size, char *variable_name, struct efi_guid guid) +{ + char 
path[PATH_MAX]; + char buf[GUID_STR_MAX]; + int fd; + ssize_t n; + + snprintf(path, PATH_MAX, "%s/%s-%s", SYS_EFIVARS_PATH, variable_name, guid_str(buf, guid)); + + fd = open(path, O_RDONLY); + if (fd < 0) + return 1; + + /* read the variable attributes and ignore it */ + n = read(fd, buf, sizeof(__u32)); + if (n < 0) { + close(fd); + return 1; + } + + /* read the variable data */ + n = read(fd, buffer, buf_size); + close(fd); + if (n < buf_size) + return 1; + + return 0; +} + +static int read_efi_variable(void *buffer, ssize_t buf_size, char *variable_name, struct efi_guid guid) +{ + char path[PATH_MAX]; + char buf[GUID_STR_MAX]; + int dfd; + ssize_t n, var_data_len; + + /* Try to read the variable using the new efivarfs interface first. + * If that fails, fall back to the old sysfs-efivars interface. */ + if (!read_efi_var(buffer, buf_size, variable_name, guid)) + return 0; + + snprintf(path, PATH_MAX, "%s/%s-%s/size", SYS_EFI_VAR_PATH, variable_name, guid_str(buf, guid)); + + dprintf("EFI VAR: path=%s\n", path); + /* get size of variable data */ + dfd = open(path, O_RDONLY); + if (dfd < 0) + return 1; + + n = read(dfd, &buf, sizeof(buf)); + close(dfd); + if (n < 0) + return 1; + buf[n] = '\0'; + + errno = 0; + var_data_len = strtoul(buf, NULL, 16); + if ((errno == ERANGE && (var_data_len == LONG_MAX)) + || (errno != 0 && var_data_len == 0)) + return 1; + + /* get data */ + snprintf(path, PATH_MAX, "%s/%s-%s/data", SYS_EFI_VAR_PATH, variable_name, guid_str(buf, guid)); + + dprintf("EFI VAR: path=%s\n", path); + dfd = open(path, O_RDONLY); + if (dfd < 0) + return 1; + + n = read(dfd, buffer, buf_size); + close(dfd); + if (n != var_data_len || n < buf_size) { + return 1; + } + + return 0; +} + +const struct imsm_orom *find_imsm_efi(struct sys_dev *hba) +{ + struct imsm_orom orom; + struct orom_entry *ret; + int err; + + if (check_env("IMSM_TEST_AHCI_EFI") || check_env("IMSM_TEST_SCU_EFI")) + return imsm_platform_test(hba); + + /* OROM test is set, return that there 
is no EFI capabilities */ + if (check_env("IMSM_TEST_OROM")) + return NULL; + + if (hba->type == SYS_DEV_SATA && hba->class != PCI_CLASS_RAID_CNTRL) + return NULL; + + err = read_efi_variable(&orom, sizeof(orom), hba->type == SYS_DEV_SAS ? SCU_PROP : AHCI_PROP, VENDOR_GUID); + + /* try to read variable for second AHCI controller */ + if (err && hba->type == SYS_DEV_SATA) + err = read_efi_variable(&orom, sizeof(orom), AHCI_SSATA_PROP, VENDOR_GUID); + + /* try to read variable for combined AHCI controllers */ + if (err && hba->type == SYS_DEV_SATA) { + static struct orom_entry *csata; + + err = read_efi_variable(&orom, sizeof(orom), AHCI_CSATA_PROP, VENDOR_GUID); + if (!err) { + if (!csata) + csata = add_orom(&orom); + add_orom_device_id(csata, hba->dev_id); + csata->type = hba->type; + return &csata->orom; + } + } + + if (hba->type == SYS_DEV_VMD) { + err = read_efi_variable(&orom, sizeof(orom), VMD_PROP, VENDOR_GUID); + } + + if (err) + return NULL; + + ret = add_orom(&orom); + add_orom_device_id(ret, hba->dev_id); + ret->type = hba->type; + + return &ret->orom; +} + +const struct imsm_orom *find_imsm_nvme(struct sys_dev *hba) +{ + static struct orom_entry *nvme_orom; + + if (hba->type != SYS_DEV_NVME) + return NULL; + + if (!nvme_orom) { + struct imsm_orom nvme_orom_compat = { + .signature = IMSM_NVME_OROM_COMPAT_SIGNATURE, + .rlc = IMSM_OROM_RLC_RAID0 | IMSM_OROM_RLC_RAID1 | + IMSM_OROM_RLC_RAID10 | IMSM_OROM_RLC_RAID5, + .sss = IMSM_OROM_SSS_4kB | IMSM_OROM_SSS_8kB | + IMSM_OROM_SSS_16kB | IMSM_OROM_SSS_32kB | + IMSM_OROM_SSS_64kB | IMSM_OROM_SSS_128kB, + .dpa = IMSM_OROM_DISKS_PER_ARRAY_NVME, + .tds = IMSM_OROM_TOTAL_DISKS_NVME, + .vpa = IMSM_OROM_VOLUMES_PER_ARRAY, + .vphba = IMSM_OROM_TOTAL_DISKS_NVME / 2 * IMSM_OROM_VOLUMES_PER_ARRAY, + .attr = IMSM_OROM_ATTR_2TB | IMSM_OROM_ATTR_2TB_DISK, + .driver_features = IMSM_OROM_CAPABILITIES_EnterpriseSystem + }; + nvme_orom = add_orom(&nvme_orom_compat); + } + add_orom_device_id(nvme_orom, hba->dev_id); + 
nvme_orom->type = SYS_DEV_NVME; + return &nvme_orom->orom; +} + +const struct imsm_orom *find_imsm_capability(struct sys_dev *hba) +{ + const struct imsm_orom *cap = get_orom_by_device_id(hba->dev_id); + + if (cap) + return cap; + + if (hba->type == SYS_DEV_NVME) + return find_imsm_nvme(hba); + if ((cap = find_imsm_efi(hba)) != NULL) + return cap; + if ((cap = find_imsm_hba_orom(hba)) != NULL) + return cap; + + return NULL; +} + +char *devt_to_devpath(dev_t dev) +{ + char device[46]; + + sprintf(device, "/sys/dev/block/%d:%d/device", major(dev), minor(dev)); + return realpath(device, NULL); +} + +char *diskfd_to_devpath(int fd) +{ + /* return the device path for a disk, return NULL on error or fd + * refers to a partition + */ + struct stat st; + + if (fstat(fd, &st) != 0) + return NULL; + if (!S_ISBLK(st.st_mode)) + return NULL; + + return devt_to_devpath(st.st_rdev); +} + +int path_attached_to_hba(const char *disk_path, const char *hba_path) +{ + int rc; + + if (check_env("IMSM_TEST_AHCI_DEV") || + check_env("IMSM_TEST_SCU_DEV")) { + return 1; + } + + if (!disk_path || !hba_path) + return 0; + dprintf("hba: %s - disk: %s\n", hba_path, disk_path); + if (strncmp(disk_path, hba_path, strlen(hba_path)) == 0) + rc = 1; + else + rc = 0; + + return rc; +} + +int devt_attached_to_hba(dev_t dev, const char *hba_path) +{ + char *disk_path = devt_to_devpath(dev); + int rc = path_attached_to_hba(disk_path, hba_path); + + if (disk_path) + free(disk_path); + + return rc; +} + +int disk_attached_to_hba(int fd, const char *hba_path) +{ + char *disk_path = diskfd_to_devpath(fd); + int rc = path_attached_to_hba(disk_path, hba_path); + + if (disk_path) + free(disk_path); + + return rc; +} + +char *vmd_domain_to_controller(struct sys_dev *hba, char *buf) +{ + struct dirent *ent; + DIR *dir; + char path[PATH_MAX]; + + if (!hba) + return NULL; + + if (hba->type != SYS_DEV_VMD) + return NULL; + + dir = opendir("/sys/bus/pci/drivers/vmd"); + + for (ent = dir ? 
readdir(dir) : NULL; ent; ent = readdir(dir)) { + sprintf(path, "/sys/bus/pci/drivers/vmd/%s/domain/device", + ent->d_name); + + if (!realpath(path, buf)) + continue; + + if (strncmp(buf, hba->path, strlen(buf)) == 0) { + sprintf(path, "/sys/bus/pci/drivers/vmd/%s", ent->d_name); + return realpath(path, buf); + } + } + return NULL; +} diff --git a/platform-intel.h b/platform-intel.h new file mode 100644 index 00000000..a8ae85f4 --- /dev/null +++ b/platform-intel.h @@ -0,0 +1,247 @@ +/* + * Intel(R) Matrix Storage Manager hardware and firmware support routines + * + * Copyright (C) 2008 Intel Corporation + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. 
+ */ +#include +#include + +/* The IMSM Capability (IMSM AHCI and ISCU OROM/EFI variable) Version Table definition */ +struct imsm_orom { + __u8 signature[4]; + #define IMSM_OROM_SIGNATURE "$VER" + #define IMSM_NVME_OROM_COMPAT_SIGNATURE "$NVM" + __u8 table_ver_major; /* Currently 2 (can change with future revs) */ + __u8 table_ver_minor; /* Currently 2 (can change with future revs) */ + __u16 major_ver; /* Example: 8 as in 8.6.0.1020 */ + __u16 minor_ver; /* Example: 6 as in 8.6.0.1020 */ + __u16 hotfix_ver; /* Example: 0 as in 8.6.0.1020 */ + __u16 build; /* Example: 1020 as in 8.6.0.1020 */ + __u8 len; /* number of bytes in this entire table */ + __u8 checksum; /* checksum of all the bytes in this table */ + __u16 rlc; /* RAID Level Capability */ + /* we assume the cpu is x86 as the orom should not be found + * anywhere else + */ + #define IMSM_OROM_RLC_RAID0 (1 << 0) + #define IMSM_OROM_RLC_RAID1 (1 << 1) + #define IMSM_OROM_RLC_RAID10 (1 << 2) + #define IMSM_OROM_RLC_RAID1E (1 << 3) + #define IMSM_OROM_RLC_RAID5 (1 << 4) + #define IMSM_OROM_RLC_RAID_CNG (1 << 5) + __u16 sss; /* Strip Size Supported */ + #define IMSM_OROM_SSS_2kB (1 << 0) + #define IMSM_OROM_SSS_4kB (1 << 1) + #define IMSM_OROM_SSS_8kB (1 << 2) + #define IMSM_OROM_SSS_16kB (1 << 3) + #define IMSM_OROM_SSS_32kB (1 << 4) + #define IMSM_OROM_SSS_64kB (1 << 5) + #define IMSM_OROM_SSS_128kB (1 << 6) + #define IMSM_OROM_SSS_256kB (1 << 7) + #define IMSM_OROM_SSS_512kB (1 << 8) + #define IMSM_OROM_SSS_1MB (1 << 9) + #define IMSM_OROM_SSS_2MB (1 << 10) + #define IMSM_OROM_SSS_4MB (1 << 11) + #define IMSM_OROM_SSS_8MB (1 << 12) + #define IMSM_OROM_SSS_16MB (1 << 13) + #define IMSM_OROM_SSS_32MB (1 << 14) + #define IMSM_OROM_SSS_64MB (1 << 15) + __u16 dpa; /* Disks Per Array supported */ + #define IMSM_OROM_DISKS_PER_ARRAY 6 + #define IMSM_OROM_DISKS_PER_ARRAY_NVME 12 + __u16 tds; /* Total Disks Supported */ + #define IMSM_OROM_TOTAL_DISKS 6 + #define IMSM_OROM_TOTAL_DISKS_NVME 12 + __u8 vpa; /* # 
Volumes Per Array supported */ + #define IMSM_OROM_VOLUMES_PER_ARRAY 2 + __u8 vphba; /* # Volumes Per Host Bus Adapter supported */ + #define IMSM_OROM_VOLUMES_PER_HBA 4 + #define IMSM_OROM_VOLUMES_PER_HBA_NVME 4 + /* Attributes supported. This should map to the + * attributes in the MPB. Also, lower 16 bits + * should match/duplicate RLC bits above. + */ + __u32 attr; + #define IMSM_OROM_ATTR_RAID0 IMSM_OROM_RLC_RAID0 + #define IMSM_OROM_ATTR_RAID1 IMSM_OROM_RLC_RAID1 + #define IMSM_OROM_ATTR_RAID10 IMSM_OROM_RLC_RAID10 + #define IMSM_OROM_ATTR_RAID1E IMSM_OROM_RLC_RAID1E + #define IMSM_OROM_ATTR_RAID5 IMSM_OROM_RLC_RAID5 + #define IMSM_OROM_ATTR_RAID_CNG IMSM_OROM_RLC_RAID_CNG + #define IMSM_OROM_ATTR_2TB_DISK (1 << 26) + #define IMSM_OROM_ATTR_2TB (1 << 29) + #define IMSM_OROM_ATTR_PM (1 << 30) + #define IMSM_OROM_ATTR_ChecksumVerify (1 << 31) + __u32 capabilities; + #define IMSM_OROM_CAPABILITIES_Ext_SATA (1 << 0) + #define IMSM_OROM_CAPABILITIES_TurboMemory (1 << 1) + #define IMSM_OROM_CAPABILITIES_HddPassword (1 << 2) + #define IMSM_OROM_CAPABILITIES_DiskCoercion (1 << 3) + __u32 driver_features; + #define IMSM_OROM_CAPABILITIES_HDDUnlock (1 << 0) + #define IMSM_OROM_CAPABILITIES_LEDLoc (1 << 1) + #define IMSM_OROM_CAPABILITIES_EnterpriseSystem (1 << 2) + #define IMSM_OROM_CAPABILITIES_Zpodd (1 << 3) + #define IMSM_OROM_CAPABILITIES_LargeDramCache (1 << 4) + #define IMSM_OROM_CAPABILITIES_Rohi (1 << 5) + #define IMSM_OROM_CAPABILITIES_ReadPatrol (1 << 6) + #define IMSM_OROM_CAPABILITIES_XorHw (1 << 7) +} __attribute__((packed)); + +static inline int imsm_orom_has_raid0(const struct imsm_orom *orom) +{ + return !!(orom->rlc & IMSM_OROM_RLC_RAID0); +} +static inline int imsm_orom_has_raid1(const struct imsm_orom *orom) +{ + return !!(orom->rlc & IMSM_OROM_RLC_RAID1); +} +static inline int imsm_orom_has_raid1e(const struct imsm_orom *orom) +{ + return !!(orom->rlc & IMSM_OROM_RLC_RAID1E); +} +static inline int imsm_orom_has_raid10(const struct imsm_orom *orom) 
+{ + return !!(orom->rlc & IMSM_OROM_RLC_RAID10); +} +static inline int imsm_orom_has_raid5(const struct imsm_orom *orom) +{ + return !!(orom->rlc & IMSM_OROM_RLC_RAID5); +} + +/** + * imsm_orom_has_chunk - check if the orom supports the given chunk size + * @orom: orom pointer from find_imsm_orom + * @chunk: chunk size in kibibytes + */ +static inline int imsm_orom_has_chunk(const struct imsm_orom *orom, int chunk) +{ + int fs = ffs(chunk); + if (!fs) + return 0; + fs--; /* bit num to bit index */ + if (chunk & (chunk-1)) + return 0; /* not a power of 2 */ + return !!(orom->sss & (1 << (fs - 1))); +} + +/** + * fls - find last (most-significant) bit set + * @x: the word to search + * The funciton is borrowed from Linux kernel code + * include/asm-generic/bitops/fls.h + */ +static inline int fls(int x) +{ + int r = 32; + + if (!x) + return 0; + if (!(x & 0xffff0000u)) { + x <<= 16; + r -= 16; + } + if (!(x & 0xff000000u)) { + x <<= 8; + r -= 8; + } + if (!(x & 0xf0000000u)) { + x <<= 4; + r -= 4; + } + if (!(x & 0xc0000000u)) { + x <<= 2; + r -= 2; + } + if (!(x & 0x80000000u)) { + x <<= 1; + r -= 1; + } + return r; +} + +static inline int imsm_orom_is_enterprise(const struct imsm_orom *orom) +{ + return !!(orom->driver_features & IMSM_OROM_CAPABILITIES_EnterpriseSystem); +} + +static inline int imsm_orom_is_nvme(const struct imsm_orom *orom) +{ + return memcmp(orom->signature, IMSM_NVME_OROM_COMPAT_SIGNATURE, + sizeof(orom->signature)) == 0; +} + +enum sys_dev_type { + SYS_DEV_UNKNOWN = 0, + SYS_DEV_SAS, + SYS_DEV_SATA, + SYS_DEV_NVME, + SYS_DEV_VMD, + SYS_DEV_MAX +}; + +struct sys_dev { + enum sys_dev_type type; + char *path; + char *pci_id; + __u16 dev_id; + __u32 class; + struct sys_dev *next; +}; + +struct efi_guid { + __u8 b[16]; +}; + +struct devid_list { + __u16 devid; + struct devid_list *next; +}; + +struct orom_entry { + struct imsm_orom orom; + struct devid_list *devid_list; + enum sys_dev_type type; + struct orom_entry *next; +}; + +extern struct 
orom_entry *orom_entries; + +static inline char *guid_str(char *buf, struct efi_guid guid) +{ + sprintf(buf, "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x", + guid.b[3], guid.b[2], guid.b[1], guid.b[0], + guid.b[5], guid.b[4], guid.b[7], guid.b[6], + guid.b[8], guid.b[9], guid.b[10], guid.b[11], + guid.b[12], guid.b[13], guid.b[14], guid.b[15]); + return buf; +} + +char *diskfd_to_devpath(int fd); +__u16 devpath_to_vendor(const char *dev_path); +struct sys_dev *find_driver_devices(const char *bus, const char *driver); +struct sys_dev *find_intel_devices(void); +const struct imsm_orom *find_imsm_capability(struct sys_dev *hba); +const struct imsm_orom *find_imsm_orom(void); +int disk_attached_to_hba(int fd, const char *hba_path); +int devt_attached_to_hba(dev_t dev, const char *hba_path); +char *devt_to_devpath(dev_t dev); +int path_attached_to_hba(const char *disk_path, const char *hba_path); +const char *get_sys_dev_type(enum sys_dev_type); +const struct orom_entry *get_orom_entry_by_device_id(__u16 dev_id); +const struct imsm_orom *get_orom_by_device_id(__u16 device_id); +struct sys_dev *device_by_id(__u16 device_id); +char *vmd_domain_to_controller(struct sys_dev *hba, char *buf); diff --git a/policy.c b/policy.c new file mode 100644 index 00000000..064d3491 --- /dev/null +++ b/policy.c @@ -0,0 +1,911 @@ +/* + * mdadm - manage Linux "md" devices aka RAID arrays. + * + * Copyright (C) 2001-2009 Neil Brown + * + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Author: Neil Brown + * Email: + */ + +#include "mdadm.h" +#include +#include +#include +#include "dlink.h" +/* + * Policy module for mdadm. + * A policy statement about a device lists a set of values for each + * of a set of names. Each value can have a metadata type as context. + * + * names include: + * action - the actions that can be taken on hot-plug + * domain - the domain(s) that the device is part of + * + * Policy information is extracted from various sources, but + * particularly from a set of policy rules in mdadm.conf + */ + +static void pol_new(struct dev_policy **pol, char *name, const char *val, + const char *metadata) +{ + struct dev_policy *n = xmalloc(sizeof(*n)); + const char *real_metadata = NULL; + int i; + + n->name = name; + n->value = val; + + /* We need to normalise the metadata name */ + if (metadata) { + for (i = 0; superlist[i] ; i++) + if (strcmp(metadata, superlist[i]->name) == 0) { + real_metadata = superlist[i]->name; + break; + } + if (!real_metadata) { + if (strcmp(metadata, "1") == 0 || + strcmp(metadata, "1.0") == 0 || + strcmp(metadata, "1.1") == 0 || + strcmp(metadata, "1.2") == 0) + real_metadata = super1.name; + } + if (!real_metadata) { + static const char *prev = NULL; + if (prev != metadata) { + pr_err("metadata=%s unrecognised - ignoring rule\n", + metadata); + prev = metadata; + } + real_metadata = "unknown"; + } + } + + n->metadata = real_metadata; + n->next = *pol; + *pol = n; +} + +static int pol_lesseq(struct dev_policy *a, struct dev_policy *b) +{ + int cmp; + + if (a->name < b->name) + return 1; + if (a->name > b->name) + return 0; + + cmp = strcmp(a->value, b->value); + if (cmp < 0) + return 1; + if (cmp > 0) + return 0; + + return (a->metadata <= b->metadata); +} + +static void pol_sort(struct 
dev_policy **pol) +{ + /* sort policy list in *pol by name/metadata/value + * using merge sort + */ + + struct dev_policy *pl[2]; + pl[0] = *pol; + pl[1] = NULL; + + do { + struct dev_policy **plp[2], *p[2]; + int curr = 0; + struct dev_policy nul = { NULL, NULL, NULL, NULL }; + struct dev_policy *prev = &nul; + int next = 0; + + /* p[] are the two lists that we are merging. + * plp[] are the ends of the two lists we create + * from the merge. + * 'curr' is which of plp[] that we are currently + * adding items to. + * 'next' is which if p[] we will take the next + * item from. + * 'prev' is that last value, which was placed in + * plp[curr]. + */ + plp[0] = &pl[0]; + plp[1] = &pl[1]; + p[0] = pl[0]; + p[1] = pl[1]; + + /* take least of p[0] and p[1] + * if it is larger than prev, add to + * plp[curr], else swap curr then add + */ + while (p[0] || p[1]) { + if (p[next] == NULL || + (p[1-next] != NULL && + !(pol_lesseq(prev, p[1-next]) + ^pol_lesseq(prev, p[next]) + ^pol_lesseq(p[next], p[1-next]))) + ) + next = 1 - next; + + if (!pol_lesseq(prev, p[next])) + curr = 1 - curr; + + *plp[curr] = prev = p[next]; + plp[curr] = &p[next]->next; + p[next] = p[next]->next; + } + *plp[0] = NULL; + *plp[1] = NULL; + } while (pl[0] && pl[1]); + if (pl[0]) + *pol = pl[0]; + else + *pol = pl[1]; +} + +static void pol_dedup(struct dev_policy *pol) +{ + /* This is a sorted list - remove duplicates. */ + while (pol && pol->next) { + if (pol_lesseq(pol->next, pol)) { + struct dev_policy *tmp = pol->next; + pol->next = tmp->next; + free(tmp); + } else + pol = pol->next; + } +} + +/* + * pol_find finds the first entry in the policy + * list to match name. + * If it returns non-NULL there is at least one + * value, but how many can only be found by + * iterating through the list. 
+ */ +struct dev_policy *pol_find(struct dev_policy *pol, char *name) +{ + while (pol && pol->name < name) + pol = pol->next; + + if (!pol || pol->name != name) + return NULL; + return pol; +} + +static char *disk_path(struct mdinfo *disk) +{ + struct stat stb; + int prefix_len; + DIR *by_path; + char symlink[PATH_MAX] = "/dev/disk/by-path/"; + char nm[PATH_MAX]; + struct dirent *ent; + int rv; + + by_path = opendir(symlink); + if (by_path) { + prefix_len = strlen(symlink); + while ((ent = readdir(by_path)) != NULL) { + if (ent->d_type != DT_LNK) + continue; + strncpy(symlink + prefix_len, + ent->d_name, + sizeof(symlink) - prefix_len); + if (stat(symlink, &stb) < 0) + continue; + if ((stb.st_mode & S_IFMT) != S_IFBLK) + continue; + if (stb.st_rdev != makedev(disk->disk.major, disk->disk.minor)) + continue; + closedir(by_path); + return xstrdup(ent->d_name); + } + closedir(by_path); + } + /* A NULL path isn't really acceptable - use the devname.. */ + sprintf(symlink, "/sys/dev/block/%d:%d", disk->disk.major, disk->disk.minor); + rv = readlink(symlink, nm, sizeof(nm)-1); + if (rv > 0) { + char *dname; + nm[rv] = 0; + dname = strrchr(nm, '/'); + if (dname) + return xstrdup(dname + 1); + } + return xstrdup("unknown"); +} + +char type_part[] = "part"; +char type_disk[] = "disk"; +static char *disk_type(struct mdinfo *disk) +{ + char buf[30+20+20]; + struct stat stb; + sprintf(buf, "/sys/dev/block/%d:%d/partition", + disk->disk.major, disk->disk.minor); + if (stat(buf, &stb) == 0) + return type_part; + else + return type_disk; +} + +static int pol_match(struct rule *rule, char *path, char *type) +{ + /* check if this rule matches on path and type */ + int pathok = 0; /* 0 == no path, 1 == match, -1 == no match yet */ + int typeok = 0; + + while (rule) { + if (rule->name == rule_path) { + if (pathok == 0) + pathok = -1; + if (path && fnmatch(rule->value, path, 0) == 0) + pathok = 1; + } + if (rule->name == rule_type) { + if (typeok == 0) + typeok = -1; + if (type && 
strcmp(rule->value, type) == 0) + typeok = 1; + } + rule = rule->next; + } + return pathok >= 0 && typeok >= 0; +} + +static void pol_merge(struct dev_policy **pol, struct rule *rule) +{ + /* copy any name assignments from rule into pol */ + struct rule *r; + char *metadata = NULL; + for (r = rule; r ; r = r->next) + if (r->name == pol_metadata) + metadata = r->value; + + for (r = rule; r ; r = r->next) + if (r->name == pol_act || + r->name == pol_domain || + r->name == pol_auto) + pol_new(pol, r->name, r->value, metadata); +} + +static int path_has_part(char *path, char **part) +{ + /* check if path ends with "-partNN" and + * if it does, place a pointer to "-pathNN" + * in 'part'. + */ + int l; + if (!path) + return 0; + l = strlen(path); + while (l > 1 && isdigit(path[l-1])) + l--; + if (l < 5 || strncmp(path+l-5, "-part", 5) != 0) + return 0; + *part = path+l-4; + return 1; +} + +static void pol_merge_part(struct dev_policy **pol, struct rule *rule, char *part) +{ + /* copy any name assignments from rule into pol, appending + * -part to any domain. The string with -part appended is + * stored with the rule so it has a lifetime to match + * the rule. 
+ */ + struct rule *r; + char *metadata = NULL; + for (r = rule; r ; r = r->next) + if (r->name == pol_metadata) + metadata = r->value; + + for (r = rule; r ; r = r->next) { + if (r->name == pol_act) + pol_new(pol, r->name, r->value, metadata); + else if (r->name == pol_domain) { + char *dom; + int len; + if (r->dups == NULL) + r->dups = dl_head(); + len = strlen(r->value); + for (dom = dl_next(r->dups); dom != r->dups; + dom = dl_next(dom)) + if (strcmp(dom+len+1, part)== 0) + break; + if (dom == r->dups) { + char *newdom = dl_strndup( + r->value, len + 1 + strlen(part)); + strcat(strcat(newdom, "-"), part); + dl_add(r->dups, newdom); + dom = newdom; + } + pol_new(pol, r->name, dom, metadata); + } + } +} + +static struct pol_rule *config_rules = NULL; +static struct pol_rule **config_rules_end = NULL; +static int config_rules_has_path = 0; + +/* + * most policy comes from a set policy rules that are + * read from the config file. + * path_policy() gathers policy information for the + * disk described in the given a 'path' and a 'type'. 
+ */ +struct dev_policy *path_policy(char *path, char *type) +{ + struct pol_rule *rules; + struct dev_policy *pol = NULL; + int i; + + rules = config_rules; + + while (rules) { + char *part; + if (rules->type == rule_policy) + if (pol_match(rules->rule, path, type)) + pol_merge(&pol, rules->rule); + if (rules->type == rule_part && strcmp(type, type_part) == 0) + if (path_has_part(path, &part)) { + *part = 0; + if (pol_match(rules->rule, path, type_disk)) + pol_merge_part(&pol, rules->rule, part+1); + *part = '-'; + } + rules = rules->next; + } + + /* Now add any metadata-specific internal knowledge + * about this path + */ + for (i=0; path && superlist[i]; i++) + if (superlist[i]->get_disk_controller_domain) { + const char *d = + superlist[i]->get_disk_controller_domain(path); + if (d) + pol_new(&pol, pol_domain, d, superlist[i]->name); + } + + pol_sort(&pol); + pol_dedup(pol); + return pol; +} + +void pol_add(struct dev_policy **pol, + char *name, char *val, + char *metadata) +{ + pol_new(pol, name, val, metadata); + pol_sort(pol); + pol_dedup(*pol); +} + +/* + * disk_policy() gathers policy information for the + * disk described in the given mdinfo (disk.{major,minor}). + */ +struct dev_policy *disk_policy(struct mdinfo *disk) +{ + char *path = NULL; + char *type = disk_type(disk); + struct dev_policy *pol = NULL; + + if (config_rules_has_path) + path = disk_path(disk); + + pol = path_policy(path, type); + + free(path); + return pol; +} + +struct dev_policy *devid_policy(int dev) +{ + struct mdinfo disk; + disk.disk.major = major(dev); + disk.disk.minor = minor(dev); + return disk_policy(&disk); +} + +/* + * process policy rules read from config file. 
+ */ + +char rule_path[] = "path"; +char rule_type[] = "type"; + +char rule_policy[] = "policy"; +char rule_part[] = "part-policy"; + +char pol_metadata[] = "metadata"; +char pol_act[] = "action"; +char pol_domain[] = "domain"; +char pol_auto[] = "auto"; + +static int try_rule(char *w, char *name, struct rule **rp) +{ + struct rule *r; + int len = strlen(name); + if (strncmp(w, name, len) != 0 || + w[len] != '=') + return 0; + r = xmalloc(sizeof(*r)); + r->next = *rp; + r->name = name; + r->value = xstrdup(w+len+1); + r->dups = NULL; + *rp = r; + return 1; +} + +void policyline(char *line, char *type) +{ + struct pol_rule *pr; + char *w; + + if (config_rules_end == NULL) + config_rules_end = &config_rules; + + pr = xmalloc(sizeof(*pr)); + pr->type = type; + pr->rule = NULL; + for (w = dl_next(line); w != line ; w = dl_next(w)) { + if (try_rule(w, rule_path, &pr->rule)) + config_rules_has_path = 1; + else if (! try_rule(w, rule_type, &pr->rule) && + ! try_rule(w, pol_metadata, &pr->rule) && + ! try_rule(w, pol_act, &pr->rule) && + ! try_rule(w, pol_domain, &pr->rule) && + ! try_rule(w, pol_auto, &pr->rule)) + pr_err("policy rule %s unrecognised and ignored\n", + w); + } + pr->next = config_rules; + config_rules = pr; +} + +void policy_add(char *type, ...) 
+{ + va_list ap; + struct pol_rule *pr; + char *name, *val; + + pr = xmalloc(sizeof(*pr)); + pr->type = type; + pr->rule = NULL; + + va_start(ap, type); + while ((name = va_arg(ap, char*)) != NULL) { + struct rule *r; + + val = va_arg(ap, char*); + r = xmalloc(sizeof(*r)); + r->next = pr->rule; + r->name = name; + r->value = xstrdup(val); + r->dups = NULL; + pr->rule = r; + } + pr->next = config_rules; + config_rules = pr; + va_end(ap); +} + +void policy_free(void) +{ + while (config_rules) { + struct pol_rule *pr = config_rules; + struct rule *r; + + config_rules = config_rules->next; + + for (r = pr->rule; r; ) { + struct rule *next = r->next; + free(r->value); + if (r->dups) + free_line(r->dups); + free(r); + r = next; + } + free(pr); + } + config_rules_end = NULL; + config_rules_has_path = 0; +} + +void dev_policy_free(struct dev_policy *p) +{ + struct dev_policy *t; + while (p) { + t = p; + p = p->next; + free(t); + } +} + +static enum policy_action map_act(const char *act) +{ + if (strcmp(act, "include") == 0) + return act_include; + if (strcmp(act, "re-add") == 0) + return act_re_add; + if (strcmp(act, "spare") == 0) + return act_spare; + if (strcmp(act, "spare-same-slot") == 0) + return act_spare_same_slot; + if (strcmp(act, "force-spare") == 0) + return act_force_spare; + return act_err; +} + +static enum policy_action policy_action(struct dev_policy *plist, const char *metadata) +{ + enum policy_action rv = act_default; + struct dev_policy *p; + + plist = pol_find(plist, pol_act); + pol_for_each(p, plist, metadata) { + enum policy_action a = map_act(p->value); + if (a > rv) + rv = a; + } + return rv; +} + +int policy_action_allows(struct dev_policy *plist, const char *metadata, enum policy_action want) +{ + enum policy_action act = policy_action(plist, metadata); + + if (act == act_err) + return 0; + return (act >= want); +} + +int disk_action_allows(struct mdinfo *disk, const char *metadata, enum policy_action want) +{ + struct dev_policy *pol = 
disk_policy(disk); + int rv = policy_action_allows(pol, metadata, want); + + dev_policy_free(pol); + return rv; +} + +/* Domain policy: + * Any device can have a list of domains asserted by different policy + * statements. + * An array also has a list of domains comprising all the domains of + * all the devices in an array. + * Where an array has a spare-group, that becomes an addition domain for + * every device in the array and thus for the array. + * + * We keep the list of domains in a sorted linked list + * As dev policies are already sorted, this is fairly easy to manage. + */ + +static struct domainlist **domain_merge_one(struct domainlist **domp, + const char *domain) +{ + /* merge a domain name into a sorted list and return the + * location of the insertion or match + */ + struct domainlist *dom = *domp; + + while (dom && strcmp(dom->dom, domain) < 0) { + domp = &dom->next; + dom = *domp; + } + if (dom == NULL || strcmp(dom->dom, domain) != 0) { + dom = xmalloc(sizeof(*dom)); + dom->next = *domp; + dom->dom = domain; + *domp = dom; + } + return domp; +} + +#if (DEBUG) +void dump_policy(struct dev_policy *policy) +{ + while (policy) { + dprintf("policy: %p name: %s value: %s metadata: %s\n", + policy, + policy->name, + policy->value, + policy->metadata); + policy = policy->next; + } +} +#endif + +void domain_merge(struct domainlist **domp, struct dev_policy *pollist, + const char *metadata) +{ + /* Add to 'domp' all the domains in pol that apply to 'metadata' + * which are not already in domp + */ + struct dev_policy *pol; + pollist = pol_find(pollist, pol_domain); + pol_for_each(pol, pollist, metadata) + domain_merge_one(domp, pol->value); +} + +int domain_test(struct domainlist *dom, struct dev_policy *pol, + const char *metadata) +{ + /* Check that all domains in pol (for metadata) are also in + * dom. Both lists are sorted. 
+ * If pol has no domains, we don't really know about this device + * so we allow caller to choose: + * -1: has no domains + * 0: has domains, not all match + * 1: has domains, all match + */ + int found_any = -1; + struct dev_policy *p; + + pol = pol_find(pol, pol_domain); + pol_for_each(p, pol, metadata) { + found_any = 1; + while (dom && strcmp(dom->dom, p->value) < 0) + dom = dom->next; + if (!dom || strcmp(dom->dom, p->value) != 0) + return 0; + } + return found_any; +} + +void domainlist_add_dev(struct domainlist **dom, int devid, const char *metadata) +{ + struct dev_policy *pol = devid_policy(devid); + domain_merge(dom, pol, metadata); + dev_policy_free(pol); +} + +struct domainlist *domain_from_array(struct mdinfo *mdi, const char *metadata) +{ + struct domainlist *domlist = NULL; + + if (!mdi) + return NULL; + for (mdi = mdi->devs ; mdi ; mdi = mdi->next) + domainlist_add_dev(&domlist, makedev(mdi->disk.major, + mdi->disk.minor), + metadata); + + return domlist; +} + +void domain_add(struct domainlist **domp, char *domain) +{ + domain_merge_one(domp, domain); +} + +void domain_free(struct domainlist *dl) +{ + while (dl) { + struct domainlist *head = dl; + dl = dl->next; + free(head); + } +} + +/* + * same-path policy. + * Some policy decisions are guided by knowledge of which + * array previously owned the device at a given physical location (path). + * When removing a device from an array we might record the array against + * the path, and when finding a new device, we might look for which + * array previously used that path. + * + * The 'array' is described by a map_ent, and the path by a the disk in an + * mdinfo, or a string. 
+ */ + +void policy_save_path(char *id_path, struct map_ent *array) +{ + char path[PATH_MAX]; + FILE *f = NULL; + + if (mkdir(FAILED_SLOTS_DIR, S_IRWXU) < 0 && errno != EEXIST) { + pr_err("can't create file to save path to old disk: %s\n", strerror(errno)); + return; + } + + snprintf(path, PATH_MAX, FAILED_SLOTS_DIR "/%s", id_path); + f = fopen(path, "w"); + if (!f) { + pr_err("can't create file to save path to old disk: %s\n", + strerror(errno)); + return; + } + + if (fprintf(f, "%s %08x:%08x:%08x:%08x\n", + array->metadata, + array->uuid[0], array->uuid[1], + array->uuid[2], array->uuid[3]) <= 0) + pr_err("Failed to write to cookie\n"); + + fclose(f); +} + +int policy_check_path(struct mdinfo *disk, struct map_ent *array) +{ + char path[PATH_MAX]; + FILE *f = NULL; + char *id_path = disk_path(disk); + int rv; + + if (!id_path) + return 0; + + snprintf(path, PATH_MAX, FAILED_SLOTS_DIR "/%s", id_path); + f = fopen(path, "r"); + if (!f) { + free(id_path); + return 0; + } + + rv = fscanf(f, " %s %x:%x:%x:%x\n", + array->metadata, + array->uuid, + array->uuid+1, + array->uuid+2, + array->uuid+3); + fclose(f); + free(id_path); + return rv == 5; +} + +/* invocation of udev rule file */ +char udev_template_start[] = +"# do not edit this file, it is automatically generated by mdadm\n" +"\n"; + +/* find rule named rule_type and return its value */ +char *find_rule(struct rule *rule, char *rule_type) +{ + while (rule) { + if (rule->name == rule_type) + return rule->value; + + rule = rule->next; + } + return NULL; +} + +#define UDEV_RULE_FORMAT \ +"ACTION==\"add\", SUBSYSTEM==\"block\", " \ +"ENV{DEVTYPE}==\"%s\", ENV{ID_PATH}==\"%s\", " \ +"RUN+=\"" BINDIR "/mdadm --incremental $env{DEVNAME}\"\n" + +#define UDEV_RULE_FORMAT_NOTYPE \ +"ACTION==\"add\", SUBSYSTEM==\"block\", " \ +"ENV{ID_PATH}==\"%s\", " \ +"RUN+=\"" BINDIR "/mdadm --incremental $env{DEVNAME}\"\n" + +/* Write rule in the rule file. 
Use format from UDEV_RULE_FORMAT */ +int write_rule(struct rule *rule, int fd, int force_part) +{ + char line[1024]; + char *pth = find_rule(rule, rule_path); + char *typ = find_rule(rule, rule_type); + if (!pth) + return -1; + + if (force_part) + typ = type_part; + if (typ) + snprintf(line, sizeof(line) - 1, UDEV_RULE_FORMAT, typ, pth); + else + snprintf(line, sizeof(line) - 1, UDEV_RULE_FORMAT_NOTYPE, pth); + return write(fd, line, strlen(line)) == (int)strlen(line); +} + +/* Generate single entry in udev rule basing on POLICY line found in config + * file. Take only those with paths, only first occurrence if paths are equal + * and if actions supports handling of spares (>=act_spare_same_slot) + */ +int generate_entries(int fd) +{ + struct pol_rule *loop, *dup; + char *loop_value, *dup_value; + int duplicate; + + for (loop = config_rules; loop; loop = loop->next) { + if (loop->type != rule_policy && loop->type != rule_part) + continue; + duplicate = 0; + + /* only policies with paths and with actions supporting + * bare disks are considered */ + loop_value = find_rule(loop->rule, pol_act); + if (!loop_value || map_act(loop_value) < act_spare_same_slot) + continue; + loop_value = find_rule(loop->rule, rule_path); + if (!loop_value) + continue; + for (dup = config_rules; dup != loop; dup = dup->next) { + if (dup->type != rule_policy && loop->type != rule_part) + continue; + dup_value = find_rule(dup->rule, pol_act); + if (!dup_value || map_act(dup_value) < act_spare_same_slot) + continue; + dup_value = find_rule(dup->rule, rule_path); + if (!dup_value) + continue; + if (strcmp(loop_value, dup_value) == 0) { + duplicate = 1; + break; + } + } + + /* not a dup or first occurrence */ + if (!duplicate) + if (!write_rule(loop->rule, fd, loop->type == rule_part) ) + return 0; + } + return 1; +} + +/* Write_rules routine creates dynamic udev rules used to handle + * hot-plug events for bare devices (and making them spares) + */ +int Write_rules(char *rule_name) +{ + int 
fd; + char udev_rule_file[PATH_MAX]; + + if (rule_name) { + strncpy(udev_rule_file, rule_name, sizeof(udev_rule_file) - 6); + udev_rule_file[sizeof(udev_rule_file) - 6] = '\0'; + strcat(udev_rule_file, ".temp"); + fd = creat(udev_rule_file, + S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH); + if (fd == -1) + return 1; + } else + fd = 1; + + /* write static invocation */ + if (write(fd, udev_template_start, + sizeof(udev_template_start) - 1) + != (int)sizeof(udev_template_start)-1) + goto abort; + + /* iterate, if none created or error occurred, remove file */ + if (generate_entries(fd) < 0) + goto abort; + + fsync(fd); + if (rule_name) { + close(fd); + rename(udev_rule_file, rule_name); + } + return 0; +abort: + if (rule_name) { + close(fd); + unlink(udev_rule_file); + } + return 1; +} diff --git a/probe_roms.c b/probe_roms.c new file mode 100644 index 00000000..b0b08833 --- /dev/null +++ b/probe_roms.c @@ -0,0 +1,317 @@ +/* + * probe_roms - scan for Adapter ROMS + * + * (based on linux-2.6:arch/x86/kernel/probe_roms_32.c) + * + * Copyright (C) 2008 Intel Corporation + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. 
+ */ + +#include "probe_roms.h" +#include "mdadm.h" +#include +#include +#include +#include +#include +#include +#include + +static void *rom_mem = MAP_FAILED; +static int rom_fd = -1; +static const int rom_len = 0xf0000 - 0xc0000; /* option-rom memory region */ +static int _sigbus; +static unsigned long rom_align; + +static void sigbus(int sig) +{ + _sigbus = 1; +} + +static int probe_address8(const __u8 *ptr, __u8 *val) +{ + int rc = 0; + + *val = *ptr; + if (_sigbus) + rc = -1; + _sigbus = 0; + + return rc; +} + +static int probe_address16(const __u16 *ptr, __u16 *val) +{ + int rc = 0; + + *val = *ptr; + if (_sigbus) + rc = -1; + _sigbus = 0; + + return rc; +} + +void probe_roms_exit(void) +{ + signal(SIGBUS, SIG_DFL); + if (rom_fd >= 0) { + close(rom_fd); + rom_fd = -1; + } + if (rom_mem != MAP_FAILED) { + munmap(rom_mem, rom_len); + rom_mem = MAP_FAILED; + } +} + +int probe_roms_init(unsigned long align) +{ + int fd = -1; + int rc = 0; + + /* valid values are 2048 and 512. 512 is for PCI-3.0 compliant + * systems, or systems that do not have dangerous/legacy ISA + * devices. 
2048 should always be safe + */ + if (align == 512 || align == 2048) + rom_align = align; + else + return -1; + + if (signal(SIGBUS, sigbus) == SIG_ERR) + rc = -1; + if (rc == 0) { + fd = open("/dev/mem", O_RDONLY); + if (fd < 0) + rc = -1; + } + if (rc == 0) { + rom_mem = mmap(NULL, rom_len, PROT_READ, MAP_PRIVATE, fd, 0xc0000); + if (rom_mem == MAP_FAILED) + rc = -1; + } + + if (rc == 0) + rom_fd = fd; + else { + if (fd >= 0) + close(fd); + probe_roms_exit(); + } + return rc; +} + +/** + * isa_bus_to_virt - convert physical address to mmap'd region + * @addr - address to convert + * + * Only valid between a successful call to probe_roms_init and the + * corresponding probe_roms_exit + */ +static void *isa_bus_to_virt(unsigned long addr) +{ + return rom_mem + (addr - 0xc0000); +} + +struct resource { + unsigned long start; + unsigned long end; + unsigned long data; + const char *name; +}; + +static struct resource system_rom_resource = { + .name = "System ROM", + .start = 0xf0000, + .data = 0, + .end = 0xfffff, +}; + +static struct resource extension_rom_resource = { + .name = "Extension ROM", + .start = 0xe0000, + .data = 0, + .end = 0xeffff, +}; + +static struct resource adapter_rom_resources[] = { { + .name = "Adapter ROM", + .start = 0xc8000, + .data = 0, + .end = 0, +}, { + .name = "Adapter ROM", + .start = 0, + .data = 0, + .end = 0, +}, { + .name = "Adapter ROM", + .start = 0, + .data = 0, + .end = 0, +}, { + .name = "Adapter ROM", + .start = 0, + .data = 0, + .end = 0, +}, { + .name = "Adapter ROM", + .start = 0, + .data = 0, + .end = 0, +}, { + .name = "Adapter ROM", + .start = 0, + .data = 0, + .end = 0, +} }; + +static struct resource video_rom_resource = { + .name = "Video ROM", + .start = 0xc0000, + .data = 0, + .end = 0xc7fff, +}; + +#define ROMSIGNATURE 0xaa55 + +static int romsignature(const unsigned char *rom) +{ + const unsigned short * const ptr = (const unsigned short *)rom; + unsigned short sig = 0; + + return probe_address16(ptr, &sig) == 0 
&& sig == ROMSIGNATURE; +} + +static int romchecksum(const unsigned char *rom, unsigned long length) +{ + unsigned char sum, c; + + for (sum = 0; length && probe_address8(rom++, &c) == 0; length--) + sum += c; + return !length && !sum; +} + +int scan_adapter_roms(scan_fn fn) +{ + /* let scan_fn examing each of the adapter roms found by probe_roms */ + unsigned int i; + int found; + + if (rom_fd < 0) + return 0; + + found = 0; + for (i = 0; i < ARRAY_SIZE(adapter_rom_resources); i++) { + struct resource *res = &adapter_rom_resources[i]; + + if (res->start) { + found = fn(isa_bus_to_virt(res->start), + isa_bus_to_virt(res->end), + isa_bus_to_virt(res->data)); + if (found) + break; + } else + break; + } + + return found; +} + +static unsigned long align(unsigned long addr, unsigned long alignment) +{ + return (addr + alignment - 1) & ~(alignment - 1); +} + +void probe_roms(void) +{ + const void *rom; + unsigned long start, length, upper; + unsigned char c; + unsigned int i; + __u16 val=0; + + if (rom_fd < 0) + return; + + /* video rom */ + upper = adapter_rom_resources[0].start; + for (start = video_rom_resource.start; start < upper; start += rom_align) { + rom = isa_bus_to_virt(start); + if (!romsignature(rom)) + continue; + + video_rom_resource.start = start; + + if (probe_address8(rom + 2, &c) != 0) + continue; + + /* 0 < length <= 0x7f * 512, historically */ + length = c * 512; + + /* if checksum okay, trust length byte */ + if (length && romchecksum(rom, length)) + video_rom_resource.end = start + length - 1; + break; + } + + start = align(video_rom_resource.end + 1, rom_align); + if (start < upper) + start = upper; + + /* system rom */ + upper = system_rom_resource.start; + + /* check for extension rom (ignore length byte!) 
*/ + rom = isa_bus_to_virt(extension_rom_resource.start); + if (romsignature(rom)) { + length = extension_rom_resource.end - extension_rom_resource.start + 1; + if (romchecksum(rom, length)) + upper = extension_rom_resource.start; + } + + /* check for adapter roms on 2k boundaries */ + for (i = 0; i < ARRAY_SIZE(adapter_rom_resources) && start < upper; start += rom_align) { + rom = isa_bus_to_virt(start); + if (!romsignature(rom)) + continue; + + if (probe_address8(rom + 2, &c) != 0) + continue; + + /* 0 < length <= 0x7f * 512, historically */ + length = c * 512; + + /* Retrieve 16-bit pointer to PCI Data Structure (offset 18h-19h) + * The data can be within 64KB forward of the first location + * of this code image. The pointer is in little-endian order + */ + + if (probe_address16(rom + 0x18, &val) != 0) + continue; + val = __le16_to_cpu(val); + + /* but accept any length that fits if checksum okay */ + if (!length || start + length > upper || !romchecksum(rom, length)) + continue; + + adapter_rom_resources[i].start = start; + adapter_rom_resources[i].data = start + (unsigned long) val; + adapter_rom_resources[i].end = start + length - 1; + + start = adapter_rom_resources[i++].end & ~(rom_align - 1); + } +} diff --git a/probe_roms.h b/probe_roms.h new file mode 100644 index 00000000..6d70411a --- /dev/null +++ b/probe_roms.h @@ -0,0 +1,24 @@ +/* + * probe_roms - scan for Adapter ROMS + * + * Copyright (C) 2008 Intel Corporation + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. 
+ * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. + */ + +void probe_roms_exit(void); +int probe_roms_init(unsigned long align); +typedef int (*scan_fn)(const void *start, const void *end, const void *data); +int scan_adapter_roms(scan_fn fn); +void probe_roms(void); diff --git a/pwgr.c b/pwgr.c new file mode 100644 index 00000000..a07de336 --- /dev/null +++ b/pwgr.c @@ -0,0 +1,17 @@ + +/* + * We cannot link a static binary with passwd/group support, so + * just do without + */ +#include +#include +#include + +struct passwd *getpwnam(const char *name) +{ + return NULL; +} +struct group *getgrnam(const char *name) +{ + return NULL; +} diff --git a/raid5extend.c b/raid5extend.c new file mode 100644 index 00000000..d8e62c2c --- /dev/null +++ b/raid5extend.c @@ -0,0 +1,80 @@ + +int phys2log(int phys, int stripe, int n, int layout) +{ + /* In an 'n' disk array using 'layout', + * in stripe 'stripe', the physical disc 'phys' + * stores what logical chunk? + * -1 mean parity. 
+ * + */ + switch(layout) { + case ALGORITHM_LEFT_ASYMMETRIC: + pd = (n-1) - (stripe % n); + if (phys < pd) + return phys; + else if (phys == pd) + return -1; + else return phys-1; + + case ALGORITHM_RIGHT_ASYMMETRIC: + pd = stripe % n; + if (phys < pd) + return phys; + else if (phys == pd) + return -1; + else return phys-1; + + case ALGORITHM_LEFT_SYMMETRIC: + pd = (n-1) - (stripe %n); + if (phys < pd) + return phys+ n-1-pd; + else if (phys == pd) + return -1; + else return phys-pd-1; + + case ALGORITHM_RIGHT_SYMMETRIC: + pd = stripe % n; + if (phys < pd) + return phys+ n-1-pd; + else if (phys == pd) + return -1; + else return phys-pd-1; + } + return -2; +} + +raid5_extend(unsigned long len, int chunksize, int layout, int n, int m, int rfds[], int wfds[]) +{ + + static char buf[4096]; + + unsigned long blocks = len/4; + unsigned int blocksperchunk= chunksize/4096; + + unsigned long b; + + for (b=0; b " + +.SH DESCRIPTION +RAID6 devices in which one single component drive has errors can use +the double parity in order to find out which component drive. +The "raid6check" tool checks, for each stripe, the double parity +consistency, reports mismatches and, if possible, which +component drive has the mismatch. +Since it works at stripe level, it can report different drives with +mismatches at different stripes. + +"raid6check" requires a non-degraded RAID6 MD device as first +parameter, a starting stripe (usually 0) and the number of stripes +to be checked. +If this third parameter is also 0, it will check the array up to +the end. + +"raid6check" will start printing information about the RAID6, then +for each stripe, it will report the parity rotation status. +In case of parity mismatches, "raid6check" reports, if possible, +which component drive could be responsible. Otherwise it reports +that it is not possible to find the component drive. + +If the given MD device is not a RAID6, "raid6check" will, of +course, not continue. 
+ +If the RAID6 MD device is degraded, "raid6check" will report +an error and it will not proceed further. + +No write operations are performed on the array or the components. +Furthermore, the checked array can be online and in use during +the operation of "raid6check". + +.SH EXAMPLES + +.B " raid6check /dev/md0 0 0" +.br +This will check /dev/md0 from start to end. + +.B " raid6check /dev/md3 0 1" +.br +This will check the first stripe of /dev/md3. + +.B " raid6check /dev/md1 1000 0" +.br +This will check /dev/md1 from stripe 1000 up to the end. + +.B " raid6check /dev/md127 128 256" +.br +This will check 256 stripes of /dev/md127 starting from stripe 128. + +.B " raid6check /dev/md0 0 0 | grep -i error > md0_err.log" +.br +This will check /dev/md0 completely and create a log file only +with errors, if any. + +.SH FILES + +"raid6check" directly uses the component drives as found in /dev. +Furthermore, the sysfs interface is needed in order to find out +the RAID6 parameters. + +.SH BUGS +Negative parameters can lead to unexpected results. + +It is not clear what will happen if the RAID6 MD device gets +degraded during the check. + +.PP +The latest version of +.I raid6check +should always be available from +.IP +.B http://www.kernel.org/pub/linux/utils/raid/mdadm/ +.PP +Related man pages: +.PP +.IR mdadm (8), +.IR mdmon (8), +.IR mdadm.conf (5), +.IR md (4). diff --git a/raid6check.c b/raid6check.c new file mode 100644 index 00000000..ad7ffe7e --- /dev/null +++ b/raid6check.c @@ -0,0 +1,713 @@ +/* + * raid6check - extended consistency check for RAID-6 + * + * Copyright (C) 2011 Piergiorgio Sartor + * + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version.
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Author: Piergiorgio Sartor + * Based on "restripe.c" from "mdadm" codebase + */ + +#include "mdadm.h" +#include +#include +#include + +#define CHECK_PAGE_BITS (12) +#define CHECK_PAGE_SIZE (1 << CHECK_PAGE_BITS) + +char const Name[] = "raid6check"; + +enum repair { + NO_REPAIR = 0, + MANUAL_REPAIR, + AUTO_REPAIR +}; + +int geo_map(int block, unsigned long long stripe, int raid_disks, + int level, int layout); +int is_ddf(int layout); +void qsyndrome(uint8_t *p, uint8_t *q, uint8_t **sources, int disks, int size); +void make_tables(void); +void ensure_zero_has_size(int chunk_size); +void raid6_datap_recov(int disks, size_t bytes, int faila, uint8_t **ptrs, + int neg_offset); +void raid6_2data_recov(int disks, size_t bytes, int faila, int failb, + uint8_t **ptrs, int neg_offset); +void xor_blocks(char *target, char **sources, int disks, int size); + +/* Collect per stripe consistency information */ +void raid6_collect(int chunk_size, uint8_t *p, uint8_t *q, + char *chunkP, char *chunkQ, int *results) +{ + int i; + int data_id; + uint8_t Px, Qx; + extern uint8_t raid6_gflog[]; + + for(i = 0; i < chunk_size; i++) { + Px = (uint8_t)chunkP[i] ^ (uint8_t)p[i]; + Qx = (uint8_t)chunkQ[i] ^ (uint8_t)q[i]; + + if((Px != 0) && (Qx == 0)) + results[i] = -1; + + if((Px == 0) && (Qx != 0)) + results[i] = -2; + + if((Px != 0) && (Qx != 0)) { + data_id = (raid6_gflog[Qx] - raid6_gflog[Px]); + if(data_id < 0) data_id += 255; + results[i] = data_id; + } + + if((Px == 0) && (Qx == 0)) + results[i] = -255; + } +} + +/* 
Try to find out if a specific disk has problems in a CHECK_PAGE_SIZE page size */ +int raid6_stats_blk(int *results, int raid_disks) +{ + int i; + int curr_broken_disk = -255; + int prev_broken_disk = -255; + int broken_status = 0; + + for(i = 0; i < CHECK_PAGE_SIZE; i++) { + + if(results[i] != -255) + curr_broken_disk = results[i]; + + if(curr_broken_disk >= raid_disks) + broken_status = 2; + + switch(broken_status) { + case 0: + if(curr_broken_disk != -255) { + prev_broken_disk = curr_broken_disk; + broken_status = 1; + } + break; + + case 1: + if(curr_broken_disk != prev_broken_disk) + broken_status = 2; + break; + + case 2: + default: + curr_broken_disk = prev_broken_disk = -65535; + break; + } + } + + return curr_broken_disk; +} + +/* Collect disks status for a strip in CHECK_PAGE_SIZE page size blocks */ +void raid6_stats(int *disk, int *results, int raid_disks, int chunk_size) +{ + int i, j; + + for(i = 0, j = 0; i < chunk_size; i += CHECK_PAGE_SIZE, j++) { + disk[j] = raid6_stats_blk(&results[i], raid_disks); + } +} + +int lock_stripe(struct mdinfo *info, unsigned long long start, + int chunk_size, int data_disks, sighandler_t *sig) { + int rv; + if(mlockall(MCL_CURRENT | MCL_FUTURE) != 0) { + return 2; + } + + sig[0] = signal(SIGTERM, SIG_IGN); + sig[1] = signal(SIGINT, SIG_IGN); + sig[2] = signal(SIGQUIT, SIG_IGN); + + rv = sysfs_set_num(info, NULL, "suspend_lo", start * chunk_size * data_disks); + rv |= sysfs_set_num(info, NULL, "suspend_hi", (start + 1) * chunk_size * data_disks); + return rv * 256; +} + +int unlock_all_stripes(struct mdinfo *info, sighandler_t *sig) { + int rv; + rv = sysfs_set_num(info, NULL, "suspend_lo", 0x7FFFFFFFFFFFFFFFULL); + rv |= sysfs_set_num(info, NULL, "suspend_hi", 0); + rv |= sysfs_set_num(info, NULL, "suspend_lo", 0); + + signal(SIGQUIT, sig[2]); + signal(SIGINT, sig[1]); + signal(SIGTERM, sig[0]); + + if(munlockall() != 0) + return 3; + return rv * 256; +} + +/* Autorepair */ +int autorepair(int *disk, unsigned long 
long start, int chunk_size, + char *name[], int raid_disks, int syndrome_disks, char **blocks_page, + char **blocks, uint8_t *p, int *block_index_for_slot, + int *source, unsigned long long *offsets) +{ + int i, j; + int pages_to_write_count = 0; + int page_to_write[chunk_size >> CHECK_PAGE_BITS]; + for(j = 0; j < (chunk_size >> CHECK_PAGE_BITS); j++) { + if (disk[j] >= -2 && block_index_for_slot[disk[j]] >= 0) { + int slot = block_index_for_slot[disk[j]]; + printf("Auto-repairing slot %d (%s)\n", slot, name[slot]); + pages_to_write_count++; + page_to_write[j] = 1; + for(i = -2; i < syndrome_disks; i++) { + blocks_page[i] = blocks[i] + j * CHECK_PAGE_SIZE; + } + if (disk[j] == -2) { + qsyndrome(p, (uint8_t*)blocks_page[-2], + (uint8_t**)blocks_page, + syndrome_disks, CHECK_PAGE_SIZE); + } + else { + char *all_but_failed_blocks[syndrome_disks]; + for(i = 0; i < syndrome_disks; i++) { + if (i == disk[j]) + all_but_failed_blocks[i] = blocks_page[-1]; + else + all_but_failed_blocks[i] = blocks_page[i]; + } + xor_blocks(blocks_page[disk[j]], + all_but_failed_blocks, syndrome_disks, + CHECK_PAGE_SIZE); + } + } + else { + page_to_write[j] = 0; + } + } + + if(pages_to_write_count > 0) { + int write_res = 0; + for(j = 0; j < (chunk_size >> CHECK_PAGE_BITS); j++) { + if(page_to_write[j] == 1) { + int slot = block_index_for_slot[disk[j]]; + lseek64(source[slot], offsets[slot] + start * chunk_size + j * CHECK_PAGE_SIZE, SEEK_SET); + write_res += write(source[slot], + blocks[disk[j]] + j * CHECK_PAGE_SIZE, + CHECK_PAGE_SIZE); + } + } + + if (write_res != (CHECK_PAGE_SIZE * pages_to_write_count)) { + fprintf(stderr, "Failed to write a full chunk.\n"); + return -1; + } + } + + return 0; +} + +/* Manual repair */ +int manual_repair(int chunk_size, int syndrome_disks, + int failed_slot1, int failed_slot2, + unsigned long long start, int *block_index_for_slot, + char *name[], char **stripes, char **blocks, uint8_t *p, + int *source, unsigned long long *offsets) +{ + int i; + int fd1 
= block_index_for_slot[failed_slot1]; + int fd2 = block_index_for_slot[failed_slot2]; + printf("Repairing stripe %llu\n", start); + printf("Assuming slots %d (%s) and %d (%s) are incorrect\n", + fd1, name[fd1], + fd2, name[fd2]); + + if (failed_slot1 == -2 || failed_slot2 == -2) { + char *all_but_failed_blocks[syndrome_disks]; + int failed_data_or_p; + + if (failed_slot1 == -2) + failed_data_or_p = failed_slot2; + else + failed_data_or_p = failed_slot1; + + printf("Repairing D/P(%d) and Q\n", failed_data_or_p); + + for (i = 0; i < syndrome_disks; i++) { + if (i == failed_data_or_p) + all_but_failed_blocks[i] = blocks[-1]; + else + all_but_failed_blocks[i] = blocks[i]; + } + xor_blocks(blocks[failed_data_or_p], + all_but_failed_blocks, syndrome_disks, chunk_size); + qsyndrome(p, (uint8_t*)blocks[-2], (uint8_t**)blocks, + syndrome_disks, chunk_size); + } else { + ensure_zero_has_size(chunk_size); + if (failed_slot1 == -1 || failed_slot2 == -1) { + int failed_data; + if (failed_slot1 == -1) + failed_data = failed_slot2; + else + failed_data = failed_slot1; + printf("Repairing D(%d) and P\n", failed_data); + raid6_datap_recov(syndrome_disks+2, chunk_size, + failed_data, (uint8_t**)blocks, 1); + } else { + printf("Repairing D and D\n"); + raid6_2data_recov(syndrome_disks+2, chunk_size, + failed_slot1, failed_slot2, + (uint8_t**)blocks, 1); + } + } + + int write_res1, write_res2; + off64_t seek_res; + + seek_res = lseek64(source[fd1], + offsets[fd1] + start * chunk_size, SEEK_SET); + if (seek_res < 0) { + fprintf(stderr, "lseek failed for failed_disk1\n"); + return -1; + } + write_res1 = write(source[fd1], blocks[failed_slot1], chunk_size); + + seek_res = lseek64(source[fd2], + offsets[fd2] + start * chunk_size, SEEK_SET); + if (seek_res < 0) { + fprintf(stderr, "lseek failed for failed_disk2\n"); + return -1; + } + write_res2 = write(source[fd2], blocks[failed_slot2], chunk_size); + + if (write_res1 != chunk_size || write_res2 != chunk_size) { + fprintf(stderr, "Failed 
to write a complete chunk.\n"); + return -2; + } + + return 0; +} + +int check_stripes(struct mdinfo *info, int *source, unsigned long long *offsets, + int raid_disks, int chunk_size, int level, int layout, + unsigned long long start, unsigned long long length, char *name[], + enum repair repair, int failed_disk1, int failed_disk2) +{ + /* read the data and p and q blocks, and check we got them right */ + int data_disks = raid_disks - 2; + int syndrome_disks = data_disks + is_ddf(layout) * 2; + char *stripe_buf; + + /* stripes[] is indexed by raid_disk and holds chunks from each device */ + char **stripes = xmalloc(raid_disks * sizeof(char*)); + + /* blocks[] is indexed by syndrome number and points to either one of the + * chunks from 'stripes[]', or to a chunk of zeros. -1 and -2 are + * P and Q */ + char **blocks = xmalloc((syndrome_disks + 2) * sizeof(char*)); + + /* blocks_page[] is a temporary index to just one page of the chunks + * that blocks[] points to. */ + char **blocks_page = xmalloc((syndrome_disks + 2) * sizeof(char*)); + + /* block_index_for_slot[] provides the reverse mapping from blocks to stripes. + * The index is a syndrome position, the content is a raid_disk number. 
+ * indicies -1 and -2 work, and are P and Q disks */ + int *block_index_for_slot = xmalloc((syndrome_disks+2) * sizeof(int)); + + /* 'p' and 'q' contain calcualted P and Q, to be compared with + * blocks[-1] and blocks[-2]; + */ + uint8_t *p = xmalloc(chunk_size); + uint8_t *q = xmalloc(chunk_size); + char *zero = xmalloc(chunk_size); + int *results = xmalloc(chunk_size * sizeof(int)); + sighandler_t *sig = xmalloc(3 * sizeof(sighandler_t)); + + int i, j; + int diskP, diskQ, diskD; + int err = 0; + + extern int tables_ready; + + if (!tables_ready) + make_tables(); + + if (posix_memalign((void**)&stripe_buf, 4096, raid_disks * chunk_size) != 0) + exit(4); + block_index_for_slot += 2; + blocks += 2; + blocks_page += 2; + + memset(zero, 0, chunk_size); + for ( i = 0 ; i < raid_disks ; i++) + stripes[i] = stripe_buf + i * chunk_size; + + while (length > 0) { + /* The syndrome number of the broken disk is recorded + * in 'disk[]' which allows a different broken disk for + * each page. + */ + int disk[chunk_size >> CHECK_PAGE_BITS]; + + err = lock_stripe(info, start, chunk_size, data_disks, sig); + if(err != 0) { + if (err != 2) + unlock_all_stripes(info, sig); + goto exitCheck; + } + for (i = 0 ; i < raid_disks ; i++) { + off64_t seek_res = lseek64(source[i], offsets[i] + start * chunk_size, + SEEK_SET); + if (seek_res < 0) { + fprintf(stderr, "lseek to source %d failed\n", i); + unlock_all_stripes(info, sig); + err = -1; + goto exitCheck; + } + int read_res = read(source[i], stripes[i], chunk_size); + if (read_res < chunk_size) { + fprintf(stderr, "Failed to read complete chunk disk %d, aborting\n", i); + unlock_all_stripes(info, sig); + err = -1; + goto exitCheck; + } + } + + diskP = geo_map(-1, start, raid_disks, level, layout); + block_index_for_slot[-1] = diskP; + blocks[-1] = stripes[diskP]; + + diskQ = geo_map(-2, start, raid_disks, level, layout); + block_index_for_slot[-2] = diskQ; + blocks[-2] = stripes[diskQ]; + + if (!is_ddf(layout)) { + /* The 
syndrome-order of disks starts immediately after 'Q', + * but skips P */ + diskD = diskQ; + for (i = 0 ; i < data_disks ; i++) { + diskD = diskD + 1; + if (diskD >= raid_disks) + diskD = 0; + if (diskD == diskP) + diskD += 1; + if (diskD >= raid_disks) + diskD = 0; + blocks[i] = stripes[diskD]; + block_index_for_slot[i] = diskD; + } + } else { + /* The syndrome-order exactly follows raid-disk + * numbers, with ZERO in place of P and Q + */ + for (i = 0 ; i < raid_disks; i++) { + if (i == diskP || i == diskQ) { + blocks[i] = zero; + block_index_for_slot[i] = -1; + } else { + blocks[i] = stripes[i]; + block_index_for_slot[i] = i; + } + } + } + + qsyndrome(p, q, (uint8_t**)blocks, syndrome_disks, chunk_size); + + raid6_collect(chunk_size, p, q, stripes[diskP], stripes[diskQ], results); + raid6_stats(disk, results, raid_disks, chunk_size); + + for(j = 0; j < (chunk_size >> CHECK_PAGE_BITS); j++) { + int role = disk[j]; + if (role >= -2) { + int slot = block_index_for_slot[role]; + if (slot >= 0) + printf("Error detected at stripe %llu, page %d: possible failed disk slot %d: %d --> %s\n", + start, j, role, slot, name[slot]); + else + printf("Error detected at stripe %llu, page %d: failed slot %d should be zeros\n", + start, j, role); + } else if(disk[j] == -65535) { + printf("Error detected at stripe %llu, page %d: disk slot unknown\n", start, j); + } + } + + if(repair == AUTO_REPAIR) { + err = autorepair(disk, start, chunk_size, + name, raid_disks, syndrome_disks, blocks_page, + blocks, p, block_index_for_slot, + source, offsets); + if(err != 0) { + unlock_all_stripes(info, sig); + goto exitCheck; + } + } + + if(repair == MANUAL_REPAIR) { + int failed_slot1 = -1, failed_slot2 = -1; + for (i = -2; i < syndrome_disks; i++) { + if (block_index_for_slot[i] == failed_disk1) + failed_slot1 = i; + if (block_index_for_slot[i] == failed_disk2) + failed_slot2 = i; + } + err = manual_repair(chunk_size, syndrome_disks, + failed_slot1, failed_slot2, + start, block_index_for_slot, + 
name, stripes, blocks, p, + source, offsets); + if(err == -1) { + unlock_all_stripes(info, sig); + goto exitCheck; + } + } + + err = unlock_all_stripes(info, sig); + if(err != 0) { + goto exitCheck; + } + + length--; + start++; + } + +exitCheck: + + free(stripe_buf); + free(stripes); + free(blocks-2); + free(blocks_page-2); + free(block_index_for_slot-2); + free(p); + free(q); + free(results); + free(sig); + + return err; +} + +unsigned long long getnum(char *str, char **err) +{ + char *e; + unsigned long long rv = strtoull(str, &e, 10); + if (e==str || *e) { + *err = str; + return 0; + } + return rv; +} + +int main(int argc, char *argv[]) +{ + /* md_device start length */ + int *fds = NULL; + char *buf = NULL; + char **disk_name = NULL; + unsigned long long *offsets = NULL; + int raid_disks = 0; + int active_disks; + int chunk_size = 0; + int layout = -1; + int level = 6; + enum repair repair = NO_REPAIR; + int failed_disk1 = -1; + int failed_disk2 = -1; + unsigned long long start, length; + int i; + int mdfd; + struct mdinfo *info = NULL, *comp = NULL; + char *err = NULL; + int exit_err = 0; + int close_flag = 0; + char *prg = strrchr(argv[0], '/'); + + if (prg == NULL) + prg = argv[0]; + else + prg++; + + if (argc < 4) { + fprintf(stderr, "Usage: %s md_device start_stripe length_stripes [autorepair]\n", prg); + fprintf(stderr, " or: %s md_device repair stripe failed_slot_1 failed_slot_2\n", prg); + exit_err = 1; + goto exitHere; + } + + mdfd = open(argv[1], O_RDONLY); + if(mdfd < 0) { + perror(argv[1]); + fprintf(stderr, "%s: cannot open %s\n", prg, argv[1]); + exit_err = 2; + goto exitHere; + } + + info = sysfs_read(mdfd, NULL, + GET_LEVEL| + GET_LAYOUT| + GET_DISKS| + GET_DEGRADED | + GET_COMPONENT| + GET_CHUNK| + GET_DEVS| + GET_OFFSET| + GET_SIZE); + + if(info == NULL) { + fprintf(stderr, "%s: Error reading sysfs information of %s\n", prg, argv[1]); + exit_err = 9; + goto exitHere; + } + + if(info->array.level != level) { + fprintf(stderr, "%s: %s not a 
RAID-6\n", prg, argv[1]); + exit_err = 3; + goto exitHere; + } + + if(info->array.failed_disks > 0) { + fprintf(stderr, "%s: %s degraded array\n", prg, argv[1]); + exit_err = 8; + goto exitHere; + } + + printf("layout: %d\n", info->array.layout); + printf("disks: %d\n", info->array.raid_disks); + printf("component size: %llu\n", info->component_size * 512); + printf("total stripes: %llu\n", (info->component_size * 512) / info->array.chunk_size); + printf("chunk size: %d\n", info->array.chunk_size); + printf("\n"); + + comp = info->devs; + for(i = 0, active_disks = 0; active_disks < info->array.raid_disks; i++) { + printf("disk: %d - offset: %llu - size: %llu - name: %s - slot: %d\n", + i, comp->data_offset * 512, comp->component_size * 512, + map_dev(comp->disk.major, comp->disk.minor, 0), + comp->disk.raid_disk); + if(comp->disk.raid_disk >= 0) + active_disks++; + comp = comp->next; + } + printf("\n"); + + close(mdfd); + + raid_disks = info->array.raid_disks; + chunk_size = info->array.chunk_size; + layout = info->array.layout; + if (strcmp(argv[2], "repair")==0) { + if (argc < 6) { + fprintf(stderr, "For repair mode, call %s md_device repair stripe failed_slot_1 failed_slot_2\n", prg); + exit_err = 1; + goto exitHere; + } + repair = MANUAL_REPAIR; + start = getnum(argv[3], &err); + length = 1; + failed_disk1 = getnum(argv[4], &err); + failed_disk2 = getnum(argv[5], &err); + + if(failed_disk1 >= info->array.raid_disks) { + fprintf(stderr, "%s: failed_slot_1 index is higher than number of devices in raid\n", prg); + exit_err = 4; + goto exitHere; + } + if(failed_disk2 >= info->array.raid_disks) { + fprintf(stderr, "%s: failed_slot_2 index is higher than number of devices in raid\n", prg); + exit_err = 4; + goto exitHere; + } + if(failed_disk1 == failed_disk2) { + fprintf(stderr, "%s: failed_slot_1 and failed_slot_2 are the same\n", prg); + exit_err = 4; + goto exitHere; + } + } + else { + start = getnum(argv[2], &err); + length = getnum(argv[3], &err); + if (argc 
>= 5 && strcmp(argv[4], "autorepair")==0) + repair = AUTO_REPAIR; + } + + if (err) { + fprintf(stderr, "%s: Bad number: %s\n", prg, err); + exit_err = 4; + goto exitHere; + } + + if(start > ((info->component_size * 512) / chunk_size)) { + start = (info->component_size * 512) / chunk_size; + fprintf(stderr, "%s: start beyond disks size\n", prg); + } + + if((length == 0) || + ((length + start) > ((info->component_size * 512) / chunk_size))) { + length = (info->component_size * 512) / chunk_size - start; + } + + disk_name = xmalloc(raid_disks * sizeof(*disk_name)); + fds = xmalloc(raid_disks * sizeof(*fds)); + offsets = xcalloc(raid_disks, sizeof(*offsets)); + buf = xmalloc(raid_disks * chunk_size); + + for(i=0; idevs; + for (i=0, active_disks=0; active_disksdisk.raid_disk; + if(disk_slot >= 0) { + disk_name[disk_slot] = map_dev(comp->disk.major, comp->disk.minor, 0); + offsets[disk_slot] = comp->data_offset * 512; + fds[disk_slot] = open(disk_name[disk_slot], O_RDWR | O_DIRECT); + if (fds[disk_slot] < 0) { + perror(disk_name[disk_slot]); + fprintf(stderr,"%s: cannot open %s\n", prg, disk_name[disk_slot]); + exit_err = 6; + goto exitHere; + } + active_disks++; + } + comp = comp->next; + } + + int rv = check_stripes(info, fds, offsets, + raid_disks, chunk_size, level, layout, + start, length, disk_name, repair, failed_disk1, failed_disk2); + if (rv != 0) { + fprintf(stderr, "%s: check_stripes returned %d\n", prg, rv); + exit_err = 7; + goto exitHere; + } + +exitHere: + + if (close_flag) + for(i = 0; i < raid_disks; i++) + close(fds[i]); + + free(disk_name); + free(fds); + free(offsets); + free(buf); + + exit(exit_err); +} diff --git a/restripe.c b/restripe.c new file mode 100644 index 00000000..56dca73e --- /dev/null +++ b/restripe.c @@ -0,0 +1,1008 @@ +/* + * mdadm - manage Linux "md" devices aka RAID arrays. 
+ * + * Copyright (C) 2006-2009 Neil Brown + * + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Author: Neil Brown + * Email: + */ + +#include "mdadm.h" +#include + +/* To restripe, we read from old geometry to a buffer, and + * read from buffer to new geometry. + * When reading, we might have missing devices and so could need + * to reconstruct. + * When writing, we need to create correct parity and Q. + * + */ + +int geo_map(int block, unsigned long long stripe, int raid_disks, + int level, int layout) +{ + /* On the given stripe, find which disk in the array will have + * block numbered 'block'. + * '-1' means the parity block. + * '-2' means the Q syndrome. 
+ */ + int pd; + + /* layout is not relevant for raid0 and raid4 */ + if ((level == 0) || + (level == 4)) + layout = 0; + + switch(level*100 + layout) { + case 000: + case 400: + case 500 + ALGORITHM_PARITY_N: + /* raid 4 isn't messed around by parity blocks */ + if (block == -1) + return raid_disks-1; /* parity block */ + return block; + case 500 + ALGORITHM_LEFT_ASYMMETRIC: + pd = (raid_disks-1) - stripe % raid_disks; + if (block == -1) return pd; + if (block >= pd) + block++; + return block; + + case 500 + ALGORITHM_RIGHT_ASYMMETRIC: + pd = stripe % raid_disks; + if (block == -1) return pd; + if (block >= pd) + block++; + return block; + + case 500 + ALGORITHM_LEFT_SYMMETRIC: + pd = (raid_disks - 1) - stripe % raid_disks; + if (block == -1) return pd; + return (pd + 1 + block) % raid_disks; + + case 500 + ALGORITHM_RIGHT_SYMMETRIC: + pd = stripe % raid_disks; + if (block == -1) return pd; + return (pd + 1 + block) % raid_disks; + + case 500 + ALGORITHM_PARITY_0: + return block + 1; + + case 600 + ALGORITHM_PARITY_N_6: + if (block == -2) + return raid_disks - 1; + if (block == -1) + return raid_disks - 2; /* parity block */ + return block; + case 600 + ALGORITHM_LEFT_ASYMMETRIC_6: + if (block == -2) + return raid_disks - 1; + raid_disks--; + pd = (raid_disks-1) - stripe % raid_disks; + if (block == -1) return pd; + if (block >= pd) + block++; + return block; + + case 600 + ALGORITHM_RIGHT_ASYMMETRIC_6: + if (block == -2) + return raid_disks - 1; + raid_disks--; + pd = stripe % raid_disks; + if (block == -1) return pd; + if (block >= pd) + block++; + return block; + + case 600 + ALGORITHM_LEFT_SYMMETRIC_6: + if (block == -2) + return raid_disks - 1; + raid_disks--; + pd = (raid_disks - 1) - stripe % raid_disks; + if (block == -1) return pd; + return (pd + 1 + block) % raid_disks; + + case 600 + ALGORITHM_RIGHT_SYMMETRIC_6: + if (block == -2) + return raid_disks - 1; + raid_disks--; + pd = stripe % raid_disks; + if (block == -1) return pd; + return (pd + 1 + block) 
% raid_disks; + + case 600 + ALGORITHM_PARITY_0_6: + if (block == -2) + return raid_disks - 1; + return block + 1; + + case 600 + ALGORITHM_PARITY_0: + if (block == -1) + return 0; + if (block == -2) + return 1; + return block + 2; + + case 600 + ALGORITHM_LEFT_ASYMMETRIC: + pd = raid_disks - 1 - (stripe % raid_disks); + if (block == -1) return pd; + if (block == -2) return (pd+1) % raid_disks; + if (pd == raid_disks - 1) + return block+1; + if (block >= pd) + return block+2; + return block; + + case 600 + ALGORITHM_ROTATING_ZERO_RESTART: + /* Different order for calculating Q, otherwize same as ... */ + case 600 + ALGORITHM_RIGHT_ASYMMETRIC: + pd = stripe % raid_disks; + if (block == -1) return pd; + if (block == -2) return (pd+1) % raid_disks; + if (pd == raid_disks - 1) + return block+1; + if (block >= pd) + return block+2; + return block; + + case 600 + ALGORITHM_LEFT_SYMMETRIC: + pd = raid_disks - 1 - (stripe % raid_disks); + if (block == -1) return pd; + if (block == -2) return (pd+1) % raid_disks; + return (pd + 2 + block) % raid_disks; + + case 600 + ALGORITHM_RIGHT_SYMMETRIC: + pd = stripe % raid_disks; + if (block == -1) return pd; + if (block == -2) return (pd+1) % raid_disks; + return (pd + 2 + block) % raid_disks; + + case 600 + ALGORITHM_ROTATING_N_RESTART: + /* Same a left_asymmetric, by first stripe is + * D D D P Q rather than + * Q D D D P + */ + pd = raid_disks - 1 - ((stripe + 1) % raid_disks); + if (block == -1) return pd; + if (block == -2) return (pd+1) % raid_disks; + if (pd == raid_disks - 1) + return block+1; + if (block >= pd) + return block+2; + return block; + + case 600 + ALGORITHM_ROTATING_N_CONTINUE: + /* Same as left_symmetric but Q is before P */ + pd = raid_disks - 1 - (stripe % raid_disks); + if (block == -1) return pd; + if (block == -2) return (pd+raid_disks-1) % raid_disks; + return (pd + 1 + block) % raid_disks; + } + return -1; +} + +int is_ddf(int layout) +{ + switch (layout) + { + default: + return 0; + case 
ALGORITHM_ROTATING_N_CONTINUE: + case ALGORITHM_ROTATING_N_RESTART: + case ALGORITHM_ROTATING_ZERO_RESTART: + return 1; + } +} + +void xor_blocks(char *target, char **sources, int disks, int size) +{ + int i, j; + /* Amazingly inefficient... */ + for (i=0; i= 0 ; z-- ) { + wd0 = sources[z][d]; + wp0 ^= wd0; + w20 = (wq0&0x80) ? 0xff : 0x00; + w10 = (wq0 << 1) & 0xff; + w20 &= 0x1d; + w10 ^= w20; + wq0 = w10 ^ wd0; + } + p[d] = wp0; + q[d] = wq0; + } +} + +/* + * The following was taken from linux/drivers/md/mktables.c, and modified + * to create in-memory tables rather than C code + */ +static uint8_t gfmul(uint8_t a, uint8_t b) +{ + uint8_t v = 0; + + while (b) { + if (b & 1) + v ^= a; + a = (a << 1) ^ (a & 0x80 ? 0x1d : 0); + b >>= 1; + } + + return v; +} + +static uint8_t gfpow(uint8_t a, int b) +{ + uint8_t v = 1; + + b %= 255; + if (b < 0) + b += 255; + + while (b) { + if (b & 1) + v = gfmul(v, a); + a = gfmul(a, a); + b >>= 1; + } + + return v; +} + +int tables_ready = 0; +uint8_t raid6_gfmul[256][256]; +uint8_t raid6_gfexp[256]; +uint8_t raid6_gfinv[256]; +uint8_t raid6_gfexi[256]; +uint8_t raid6_gflog[256]; +uint8_t raid6_gfilog[256]; +void make_tables(void) +{ + int i, j; + uint8_t v; + uint32_t b, log; + + /* Compute multiplication table */ + for (i = 0; i < 256; i++) + for (j = 0; j < 256; j++) + raid6_gfmul[i][j] = gfmul(i, j); + + /* Compute power-of-2 table (exponent) */ + v = 1; + for (i = 0; i < 256; i++) { + raid6_gfexp[i] = v; + v = gfmul(v, 2); + if (v == 1) + v = 0; /* For entry 255, not a real entry */ + } + + /* Compute inverse table x^-1 == x^254 */ + for (i = 0; i < 256; i++) + raid6_gfinv[i] = gfpow(i, 254); + + /* Compute inv(2^x + 1) (exponent-xor-inverse) table */ + for (i = 0; i < 256; i ++) + raid6_gfexi[i] = raid6_gfinv[raid6_gfexp[i] ^ 1]; + + /* Compute log and inverse log */ + /* Modified code from: + * http://web.eecs.utk.edu/~plank/plank/papers/CS-96-332.html + */ + b = 1; + raid6_gflog[0] = 0; + raid6_gfilog[255] = 0; + + for 
(log = 0; log < 255; log++) { + raid6_gflog[b] = (uint8_t) log; + raid6_gfilog[log] = (uint8_t) b; + b = b << 1; + if (b & 256) b = b ^ 0435; + } + + tables_ready = 1; +} + +uint8_t *zero; +int zero_size; + +void ensure_zero_has_size(int chunk_size) +{ + if (zero == NULL || chunk_size > zero_size) { + if (zero) + free(zero); + zero = xcalloc(1, chunk_size); + zero_size = chunk_size; + } +} + +/* Following was taken from linux/drivers/md/raid6recov.c */ + +/* Recover two failed data blocks. */ + +void raid6_2data_recov(int disks, size_t bytes, int faila, int failb, + uint8_t **ptrs, int neg_offset) +{ + uint8_t *p, *q, *dp, *dq; + uint8_t px, qx, db; + const uint8_t *pbmul; /* P multiplier table for B data */ + const uint8_t *qmul; /* Q multiplier table (for both) */ + + if (faila > failb) { + int t = faila; + faila = failb; + failb = t; + } + + if (neg_offset) { + p = ptrs[-1]; + q = ptrs[-2]; + } else { + p = ptrs[disks-2]; + q = ptrs[disks-1]; + } + + /* Compute syndrome with zero for the missing data pages + Use the dead data pages as temporary storage for + delta p and delta q */ + dp = ptrs[faila]; + ptrs[faila] = zero; + dq = ptrs[failb]; + ptrs[failb] = zero; + + qsyndrome(dp, dq, ptrs, disks-2, bytes); + + /* Restore pointer table */ + ptrs[faila] = dp; + ptrs[failb] = dq; + + /* Now, pick the proper data tables */ + pbmul = raid6_gfmul[raid6_gfexi[failb-faila]]; + qmul = raid6_gfmul[raid6_gfinv[raid6_gfexp[faila]^raid6_gfexp[failb]]]; + + /* Now do it... 
*/ + while ( bytes-- ) { + px = *p ^ *dp; + qx = qmul[*q ^ *dq]; + *dq++ = db = pbmul[px] ^ qx; /* Reconstructed B */ + *dp++ = db ^ px; /* Reconstructed A */ + p++; q++; + } +} + +/* Recover failure of one data block plus the P block */ +void raid6_datap_recov(int disks, size_t bytes, int faila, uint8_t **ptrs, + int neg_offset) +{ + uint8_t *p, *q, *dq; + const uint8_t *qmul; /* Q multiplier table */ + + if (neg_offset) { + p = ptrs[-1]; + q = ptrs[-2]; + } else { + p = ptrs[disks-2]; + q = ptrs[disks-1]; + } + + /* Compute syndrome with zero for the missing data page + Use the dead data page as temporary storage for delta q */ + dq = ptrs[faila]; + ptrs[faila] = zero; + + qsyndrome(p, dq, ptrs, disks-2, bytes); + + /* Restore pointer table */ + ptrs[faila] = dq; + + /* Now, pick the proper data tables */ + qmul = raid6_gfmul[raid6_gfinv[raid6_gfexp[faila]]]; + + /* Now do it... */ + while ( bytes-- ) { + *p++ ^= *dq = qmul[*q ^ *dq]; + q++; dq++; + } +} + +/* Try to find out if a specific disk has a problem */ +int raid6_check_disks(int data_disks, int start, int chunk_size, + int level, int layout, int diskP, int diskQ, + uint8_t *p, uint8_t *q, char **stripes) +{ + int i; + int data_id, diskD; + uint8_t Px, Qx; + int curr_broken_disk = -1; + int prev_broken_disk = -1; + int broken_status = 0; + + for(i = 0; i < chunk_size; i++) { + Px = (uint8_t)stripes[diskP][i] ^ (uint8_t)p[i]; + Qx = (uint8_t)stripes[diskQ][i] ^ (uint8_t)q[i]; + + if((Px != 0) && (Qx == 0)) + curr_broken_disk = diskP; + + if((Px == 0) && (Qx != 0)) + curr_broken_disk = diskQ; + + if((Px != 0) && (Qx != 0)) { + data_id = (raid6_gflog[Qx] - raid6_gflog[Px]); + if(data_id < 0) data_id += 255; + diskD = geo_map(data_id, start/chunk_size, + data_disks + 2, level, layout); + curr_broken_disk = diskD; + } + + if((Px == 0) && (Qx == 0)) + curr_broken_disk = curr_broken_disk; + + if(curr_broken_disk >= data_disks + 2) + broken_status = 2; + + switch(broken_status) { + case 0: + if(curr_broken_disk 
!= -1) { + prev_broken_disk = curr_broken_disk; + broken_status = 1; + } + break; + + case 1: + if(curr_broken_disk != prev_broken_disk) + broken_status = 2; + break; + + case 2: + default: + curr_broken_disk = prev_broken_disk = -2; + break; + } + } + + return curr_broken_disk; +} + +/******************************************************************************* + * Function: save_stripes + * Description: + * Function reads data (only data without P and Q) from array and writes + * it to buf and opcjonaly to backup files + * Parameters: + * source : A list of 'fds' of the active disks. + * Some may be absent + * offsets : A list of offsets on disk belonging + * to the array [bytes] + * raid_disks : geometry: number of disks in the array + * chunk_size : geometry: chunk size [bytes] + * level : geometry: RAID level + * layout : geometry: layout + * nwrites : number of backup files + * dest : A list of 'fds' for mirrored targets + * (e.g. backup files). They are already seeked to right + * (write) location. If NULL, data will be wrote + * to the buf only + * start : start address of data to read (must be stripe-aligned) + * [bytes] + * length - : length of data to read (must be stripe-aligned) + * [bytes] + * buf : buffer for data. It is large enough to hold + * one stripe. It is stripe aligned + * Returns: + * 0 : success + * -1 : fail + ******************************************************************************/ +int save_stripes(int *source, unsigned long long *offsets, + int raid_disks, int chunk_size, int level, int layout, + int nwrites, int *dest, + unsigned long long start, unsigned long long length, + char *buf) +{ + int len; + int data_disks = raid_disks - (level == 0 ? 0 : level <=5 ? 
1 : 2); + int disk; + int i; + unsigned long long length_test; + + if (!tables_ready) + make_tables(); + ensure_zero_has_size(chunk_size); + + len = data_disks * chunk_size; + length_test = length / len; + length_test *= len; + + if (length != length_test) { + dprintf("Error: save_stripes(): Data are not alligned. EXIT\n"); + dprintf("\tArea for saving stripes (length) = %llu\n", length); + dprintf("\tWork step (len) = %i\n", len); + dprintf("\tExpected save area (length_test) = %llu\n", + length_test); + abort(); + } + + while (length > 0) { + int failed = 0; + int fdisk[3], fblock[3]; + for (disk = 0; disk < raid_disks ; disk++) { + unsigned long long offset; + int dnum; + + offset = (start/chunk_size/data_disks)*chunk_size; + dnum = geo_map(disk < data_disks ? disk : data_disks - disk - 1, + start/chunk_size/data_disks, + raid_disks, level, layout); + if (dnum < 0) abort(); + if (source[dnum] < 0 || + lseek64(source[dnum], offsets[dnum]+offset, 0) < 0 || + read(source[dnum], buf+disk * chunk_size, chunk_size) + != chunk_size) + if (failed <= 2) { + fdisk[failed] = dnum; + fblock[failed] = disk; + failed++; + } + } + if (failed == 0 || fblock[0] >= data_disks) + /* all data disks are good */ + ; + else if (failed == 1 || fblock[1] >= data_disks+1) { + /* one failed data disk and good parity */ + char *bufs[data_disks]; + for (i=0; i < data_disks; i++) + if (fblock[0] == i) + bufs[i] = buf + data_disks*chunk_size; + else + bufs[i] = buf + i*chunk_size; + + xor_blocks(buf + fblock[0]*chunk_size, + bufs, data_disks, chunk_size); + } else if (failed > 2 || level != 6) + /* too much failure */ + return -1; + else { + /* RAID6 computations needed. */ + uint8_t *bufs[data_disks+4]; + int qdisk; + int syndrome_disks; + disk = geo_map(-1, start/chunk_size/data_disks, + raid_disks, level, layout); + qdisk = geo_map(-2, start/chunk_size/data_disks, + raid_disks, level, layout); + if (is_ddf(layout)) { + /* q over 'raid_disks' blocks, in device order. 
+ * 'p' and 'q' get to be all zero + */ + for (i = 0; i < raid_disks; i++) + bufs[i] = zero; + for (i = 0; i < data_disks; i++) { + int dnum = geo_map(i, + start/chunk_size/data_disks, + raid_disks, level, layout); + int snum; + /* i is the logical block number, so is index to 'buf'. + * dnum is physical disk number + * and thus the syndrome number. + */ + snum = dnum; + bufs[snum] = (uint8_t*)buf + chunk_size * i; + } + syndrome_disks = raid_disks; + } else { + /* for md, q is over 'data_disks' blocks, + * starting immediately after 'q' + * Note that for the '_6' variety, the p block + * makes a hole that we need to be careful of. + */ + int j; + int snum = 0; + for (j = 0; j < raid_disks; j++) { + int dnum = (qdisk + 1 + j) % raid_disks; + if (dnum == disk || dnum == qdisk) + continue; + for (i = 0; i < data_disks; i++) + if (geo_map(i, + start/chunk_size/data_disks, + raid_disks, level, layout) == dnum) + break; + /* i is the logical block number, so is index to 'buf'. + * dnum is physical disk number + * snum is syndrome disk for which 0 is immediately after Q + */ + bufs[snum] = (uint8_t*)buf + chunk_size * i; + + if (fblock[0] == i) + fdisk[0] = snum; + if (fblock[1] == i) + fdisk[1] = snum; + snum++; + } + + syndrome_disks = data_disks; + } + + /* Place P and Q blocks at end of bufs */ + bufs[syndrome_disks] = (uint8_t*)buf + chunk_size * data_disks; + bufs[syndrome_disks+1] = (uint8_t*)buf + chunk_size * (data_disks+1); + + if (fblock[1] == data_disks) + /* One data failed, and parity failed */ + raid6_datap_recov(syndrome_disks+2, chunk_size, + fdisk[0], bufs, 0); + else { + /* Two data blocks failed, P,Q OK */ + raid6_2data_recov(syndrome_disks+2, chunk_size, + fdisk[0], fdisk[1], bufs, 0); + } + } + if (dest) { + for (i = 0; i < nwrites; i++) + if (write(dest[i], buf, len) != len) + return -1; + } else { + /* build next stripe in buffer */ + buf += len; + } + length -= len; + start += len; + } + return 0; +} + +/* Restore data: + * We are given: + * A 
list of 'fds' of the active disks. Some may be '-1' for not-available. + * A geometry: raid_disks, chunk_size, level, layout + * An 'fd' to read from. It is already seeked to the right (Read) location. + * A start and length. + * The length must be a multiple of the stripe size. + * + * We build a full stripe in memory and then write it out. + * We assume that there are enough working devices. + */ +int restore_stripes(int *dest, unsigned long long *offsets, + int raid_disks, int chunk_size, int level, int layout, + int source, unsigned long long read_offset, + unsigned long long start, unsigned long long length, + char *src_buf) +{ + char *stripe_buf; + char **stripes = xmalloc(raid_disks * sizeof(char*)); + char **blocks = xmalloc(raid_disks * sizeof(char*)); + int i; + int rv; + + int data_disks = raid_disks - (level == 0 ? 0 : level <= 5 ? 1 : 2); + + if (posix_memalign((void**)&stripe_buf, 4096, raid_disks * chunk_size)) + stripe_buf = NULL; + + if (zero == NULL || chunk_size > zero_size) { + if (zero) + free(zero); + zero = xcalloc(1, chunk_size); + zero_size = chunk_size; + } + + if (stripe_buf == NULL || stripes == NULL || blocks == NULL + || zero == NULL) { + rv = -2; + goto abort; + } + for (i = 0; i < raid_disks; i++) + stripes[i] = stripe_buf + i * chunk_size; + while (length > 0) { + unsigned int len = data_disks * chunk_size; + unsigned long long offset; + int disk, qdisk; + int syndrome_disks; + if (length < len) { + rv = -3; + goto abort; + } + for (i = 0; i < data_disks; i++) { + int disk = geo_map(i, start/chunk_size/data_disks, + raid_disks, level, layout); + if (src_buf == NULL) { + /* read from file */ + if (lseek64(source, read_offset, 0) != + (off64_t)read_offset) { + rv = -1; + goto abort; + } + if (read(source, + stripes[disk], + chunk_size) != chunk_size) { + rv = -1; + goto abort; + } + } else { + /* read from input buffer */ + memcpy(stripes[disk], + src_buf + read_offset, + chunk_size); + } + read_offset += chunk_size; + } + /* We have 
the data, now do the parity */ + offset = (start/chunk_size/data_disks) * chunk_size; + switch (level) { + case 4: + case 5: + disk = geo_map(-1, start/chunk_size/data_disks, + raid_disks, level, layout); + for (i = 0; i < data_disks; i++) + blocks[i] = stripes[(disk+1+i) % raid_disks]; + xor_blocks(stripes[disk], blocks, data_disks, chunk_size); + break; + case 6: + disk = geo_map(-1, start/chunk_size/data_disks, + raid_disks, level, layout); + qdisk = geo_map(-2, start/chunk_size/data_disks, + raid_disks, level, layout); + if (is_ddf(layout)) { + /* q over 'raid_disks' blocks, in device order. + * 'p' and 'q' get to be all zero + */ + for (i = 0; i < raid_disks; i++) + if (i == disk || i == qdisk) + blocks[i] = (char*)zero; + else + blocks[i] = stripes[i]; + syndrome_disks = raid_disks; + } else { + /* for md, q is over 'data_disks' blocks, + * starting immediately after 'q' + */ + for (i = 0; i < data_disks; i++) + blocks[i] = stripes[(qdisk+1+i) % raid_disks]; + + syndrome_disks = data_disks; + } + qsyndrome((uint8_t*)stripes[disk], + (uint8_t*)stripes[qdisk], + (uint8_t**)blocks, + syndrome_disks, chunk_size); + break; + } + for (i=0; i < raid_disks ; i++) + if (dest[i] >= 0) { + if (lseek64(dest[i], + offsets[i]+offset, 0) < 0) { + rv = -1; + goto abort; + } + if (write(dest[i], stripes[i], + chunk_size) != chunk_size) { + rv = -1; + goto abort; + } + } + length -= len; + start += len; + } + rv = 0; + +abort: + free(stripe_buf); + free(stripes); + free(blocks); + return rv; +} + +#ifdef MAIN + +int test_stripes(int *source, unsigned long long *offsets, + int raid_disks, int chunk_size, int level, int layout, + unsigned long long start, unsigned long long length) +{ + /* ready the data and p (and q) blocks, and check we got them right */ + char *stripe_buf = xmalloc(raid_disks * chunk_size); + char **stripes = xmalloc(raid_disks * sizeof(char*)); + char **blocks = xmalloc(raid_disks * sizeof(char*)); + uint8_t *p = xmalloc(chunk_size); + uint8_t *q = 
xmalloc(chunk_size); + + int i; + int diskP, diskQ; + int data_disks = raid_disks - (level == 5 ? 1: 2); + + if (!tables_ready) + make_tables(); + + for ( i = 0 ; i < raid_disks ; i++) + stripes[i] = stripe_buf + i * chunk_size; + + while (length > 0) { + int disk; + + for (i = 0 ; i < raid_disks ; i++) { + lseek64(source[i], offsets[i]+start, 0); + read(source[i], stripes[i], chunk_size); + } + for (i = 0 ; i < data_disks ; i++) { + int disk = geo_map(i, start/chunk_size, raid_disks, + level, layout); + blocks[i] = stripes[disk]; + printf("%d->%d\n", i, disk); + } + switch(level) { + case 6: + qsyndrome(p, q, (uint8_t**)blocks, data_disks, chunk_size); + diskP = geo_map(-1, start/chunk_size, raid_disks, + level, layout); + if (memcmp(p, stripes[diskP], chunk_size) != 0) { + printf("P(%d) wrong at %llu\n", diskP, + start / chunk_size); + } + diskQ = geo_map(-2, start/chunk_size, raid_disks, + level, layout); + if (memcmp(q, stripes[diskQ], chunk_size) != 0) { + printf("Q(%d) wrong at %llu\n", diskQ, + start / chunk_size); + } + disk = raid6_check_disks(data_disks, start, chunk_size, + level, layout, diskP, diskQ, + p, q, stripes); + if(disk >= 0) { + printf("Possible failed disk: %d\n", disk); + } + if(disk == -2) { + printf("Failure detected, but disk unknown\n"); + } + break; + } + length -= chunk_size; + start += chunk_size; + } + return 0; +} + +unsigned long long getnum(char *str, char **err) +{ + char *e; + unsigned long long rv = strtoull(str, &e, 10); + if (e==str || *e) { + *err = str; + return 0; + } + return rv; +} + +char const Name[] = "test_restripe"; +int main(int argc, char *argv[]) +{ + /* save/restore file raid_disks chunk_size level layout start length devices... 
+ */ + int save; + int *fds; + char *file; + char *buf; + int storefd; + unsigned long long *offsets; + int raid_disks, chunk_size, level, layout; + unsigned long long start, length; + int i; + + char *err = NULL; + if (argc < 10) { + fprintf(stderr, "Usage: test_stripe save/restore file raid_disks chunk_size level layout start length devices...\n"); + exit(1); + } + if (strcmp(argv[1], "save")==0) + save = 1; + else if (strcmp(argv[1], "restore") == 0) + save = 0; + else if (strcmp(argv[1], "test") == 0) + save = 2; + else { + fprintf(stderr, "test_stripe: must give 'save' or 'restore'.\n"); + exit(2); + } + + file = argv[2]; + raid_disks = getnum(argv[3], &err); + chunk_size = getnum(argv[4], &err); + level = getnum(argv[5], &err); + layout = getnum(argv[6], &err); + start = getnum(argv[7], &err); + length = getnum(argv[8], &err); + if (err) { + fprintf(stderr, "test_stripe: Bad number: %s\n", err); + exit(2); + } + if (argc != raid_disks + 9) { + fprintf(stderr, "test_stripe: wrong number of devices: want %d found %d\n", + raid_disks, argc-9); + exit(2); + } + fds = xmalloc(raid_disks * sizeof(*fds)); + offsets = xcalloc(raid_disks, sizeof(*offsets)); + + storefd = open(file, O_RDWR); + if (storefd < 0) { + perror(file); + fprintf(stderr, "test_stripe: could not open %s.\n", file); + exit(3); + } + for (i=0; i +#include +#include +#include + +int scsi_get_serial(int fd, void *buf, size_t buf_len) +{ + unsigned char inq_cmd[] = {INQUIRY, 1, 0x80, 0, buf_len, 0}; + unsigned char sense[32]; + struct sg_io_hdr io_hdr; + + memset(&io_hdr, 0, sizeof(io_hdr)); + io_hdr.interface_id = 'S'; + io_hdr.cmdp = inq_cmd; + io_hdr.cmd_len = sizeof(inq_cmd); + io_hdr.dxferp = buf; + io_hdr.dxfer_len = buf_len; + io_hdr.dxfer_direction = SG_DXFER_FROM_DEV; + io_hdr.sbp = sense; + io_hdr.mx_sb_len = sizeof(sense); + io_hdr.timeout = 5000; + + return ioctl(fd, SG_IO, &io_hdr); +} diff --git a/sha1.c b/sha1.c new file mode 100644 index 00000000..11be7045 --- /dev/null +++ b/sha1.c 
@@ -0,0 +1,415 @@
+/* sha1.c - Functions to compute SHA1 message digest of files or
+   memory blocks according to the NIST specification FIPS-180-1.
+
+   Copyright (C) 2000, 2001, 2003, 2004, 2005, 2006, 2008 Free Software
+   Foundation, Inc.
+
+   This program is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published by the
+   Free Software Foundation; either version 2, or (at your option) any
+   later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software Foundation,
+   Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.  */
+
+/* Written by Scott G. Miller
+   Credits:
+      Robert Klep  -- Expansion function fix
+*/
+
+//#include   (disabled; header name missing in this copy -- NOTE(review): presumably <config.h>, confirm)
+
+#include "sha1.h"
+
+/* offsetof (used by the alignment test in sha1_process_bytes).  */
+#include <stddef.h>
+/* memcpy.  */
+#include <string.h>
+
+#if USE_UNLOCKED_IO
+# include "unlocked-io.h"
+#endif
+
+/* SWAP(n) maps a 32-bit word between host byte order and big-endian
+   order: it is the identity when WORDS_BIGENDIAN is defined and a full
+   byte reversal otherwise.  N is evaluated several times and must be an
+   unsigned 32-bit value for the shifts to behave as intended.  */
+#ifdef WORDS_BIGENDIAN
+# define SWAP(n) (n)
+#else
+# define SWAP(n) \
+    (((n) << 24) | (((n) & 0xff00) << 8) | (((n) >> 8) & 0xff00) | ((n) >> 24))
+#endif
+
+/* Number of bytes sha1_stream reads per iteration; it must be a multiple
+   of the 64-byte block that sha1_process_block consumes.  */
+#define BLOCKSIZE 4096
+#if BLOCKSIZE % 64 != 0
+# error "invalid BLOCKSIZE"
+#endif
+
+/* This array contains the bytes used to pad the buffer to the next
+   64-byte boundary.  (RFC 1321, 3.1: Step 1)  */
+static const unsigned char fillbuf[64] = { 0x80, 0 /* , 0, 0, ...  */ };
+
+/* Take a pointer to a 160 bit block of data (five 32 bit ints) and
+   initialize it to the start constants of the SHA1 algorithm.  This
+   must be called before the context is passed to sha1_process_block
+   or sha1_process_bytes.  
*/
+void
+sha1_init_ctx (struct sha1_ctx *ctx)
+{
+  /* Nothing has been hashed or buffered yet.  */
+  ctx->buflen = 0;
+  ctx->total[1] = 0;
+  ctx->total[0] = 0;
+
+  /* Starting values of the five chaining words.  */
+  ctx->E = 0xc3d2e1f0;
+  ctx->D = 0x10325476;
+  ctx->C = 0x98badcfe;
+  ctx->B = 0xefcdab89;
+  ctx->A = 0x67452301;
+}
+
+/* Store the five chaining words A..E of CTX, each passed through SWAP,
+   into the first 20 bytes of RESBUF and return RESBUF.
+
+   IMPORTANT: RESBUF is written through a sha1_uint32 pointer, so on
+   some systems it must be correctly aligned for a 32-bit value.  */
+void *
+sha1_read_ctx (const struct sha1_ctx *ctx, void *resbuf)
+{
+  sha1_uint32 *digest = (sha1_uint32 *) resbuf;
+
+  digest[0] = SWAP (ctx->A);
+  digest[1] = SWAP (ctx->B);
+  digest[2] = SWAP (ctx->C);
+  digest[3] = SWAP (ctx->D);
+  digest[4] = SWAP (ctx->E);
+
+  return resbuf;
+}
+
+/* Pad the bytes still held in CTX's internal buffer, append the total
+   message length in bits, hash the resulting final block(s) and write
+   the digest to RESBUF through sha1_read_ctx.
+
+   IMPORTANT: On some systems it is required that RESBUF is correctly
+   aligned for a 32-bit value.  */
+void *
+sha1_finish_ctx (struct sha1_ctx *ctx, void *resbuf)
+{
+  /* Bytes that were buffered but not hashed yet.  */
+  sha1_uint32 pending = ctx->buflen;
+  char *tail = (char *) ctx->buffer;
+  /* Length of the padded tail in 32-bit words: one 64-byte block when
+     fewer than 56 bytes are pending, two blocks otherwise.  */
+  size_t nwords;
+
+  if (pending < 56)
+    nwords = 64 / 4;
+  else
+    nwords = 64 * 2 / 4;
+
+  /* Add the pending bytes to the two-word byte counter, with carry.  */
+  ctx->total[0] += pending;
+  if (ctx->total[0] < pending)
+    ++ctx->total[1];
+
+  /* The last two words of the tail hold the message length in *bits*.  */
+  ctx->buffer[nwords - 2] = SWAP ((ctx->total[1] << 3) | (ctx->total[0] >> 29));
+  ctx->buffer[nwords - 1] = SWAP (ctx->total[0] << 3);
+
+  /* Everything between the data and the length is 0x80, 0, 0, ...  */
+  memcpy (tail + pending, fillbuf, (nwords - 2) * 4 - pending);
+
+  /* Hash the padded tail.  */
+  sha1_process_block (ctx->buffer, nwords * 4, ctx);
+
+  return sha1_read_ctx (ctx, resbuf);
+}
+
+/* Compute SHA1 message digest for bytes read from STREAM.  The
+   resulting message digest number will be written into the 20 bytes
+   beginning at RESBLOCK.  
*/
+int
+sha1_stream (FILE *stream, void *resblock)
+{
+  /* Returns 0 after writing the digest to RESBLOCK, or 1 when fread
+     delivered nothing and ferror reports a stream error.  */
+  struct sha1_ctx ctx;
+  /* NOTE(review): the fread calls below never write past
+     buffer[BLOCKSIZE - 1]; the purpose of the extra 72 bytes is not
+     visible here -- confirm before shrinking.  */
+  char buffer[BLOCKSIZE + 72];
+  size_t sum;
+
+  /* Initialize the computation context.  */
+  sha1_init_ctx (&ctx);
+
+  /* Iterate over full file contents.  */
+  while (1)
+    {
+      /* We read the file in blocks of BLOCKSIZE bytes.  One call of the
+	 computation function processes the whole buffer so that with the
+	 next round of the loop another block can be read.  */
+      size_t n;
+      sum = 0;
+
+      /* Read block.  Take care for partial reads.  */
+      while (1)
+	{
+	  n = fread (buffer + sum, 1, BLOCKSIZE - sum, stream);
+
+	  sum += n;
+
+	  if (sum == BLOCKSIZE)
+	    break;
+
+	  if (n == 0)
+	    {
+	      /* Check for the error flag IFF N == 0, so that we don't
+		 exit the loop after a partial read due to e.g., EAGAIN
+		 or EWOULDBLOCK.  */
+	      if (ferror (stream))
+		return 1;
+	      goto process_partial_block;
+	    }
+
+	  /* We've read at least one byte, so ignore errors.  But always
+	     check for EOF, since feof may be true even though N > 0.
+	     Otherwise, we could end up calling fread after EOF.  */
+	  if (feof (stream))
+	    goto process_partial_block;
+	}
+
+      /* Process buffer with BLOCKSIZE bytes.  Note that
+			BLOCKSIZE % 64 == 0
+       */
+      sha1_process_block (buffer, BLOCKSIZE, &ctx);
+    }
+
+ process_partial_block:;
+
+  /* Process any remaining bytes.  SUM is the size of the last, short
+     read block (possibly 0).  */
+  if (sum > 0)
+    sha1_process_bytes (buffer, sum, &ctx);
+
+  /* Construct result in desired memory.  */
+  sha1_finish_ctx (&ctx, resblock);
+  return 0;
+}
+
+/* Compute SHA1 message digest for LEN bytes beginning at BUFFER.  The
+   result is always in little endian byte order, so that a byte-wise
+   output yields to the wanted ASCII representation of the message
+   digest.  Returns RESBLOCK (the value returned by sha1_finish_ctx).  */
+void *
+sha1_buffer (const char *buffer, size_t len, void *resblock)
+{
+  struct sha1_ctx ctx;
+
+  /* Initialize the computation context.  */
+  sha1_init_ctx (&ctx);
+
+  /* Feed the whole buffer; sha1_process_bytes hashes the complete
+     64-byte blocks and keeps any trailing partial block in ctx.  */
+  sha1_process_bytes (buffer, len, &ctx);
+
+  /* Put result in desired memory area.  */
+  return sha1_finish_ctx (&ctx, resblock);
+}
+
+/* Add LEN bytes starting at BUFFER to the hash state in CTX.  LEN need
+   not be a multiple of 64: complete 64-byte blocks are handed to
+   sha1_process_block, and whatever is left over is kept in ctx->buffer
+   with its size in ctx->buflen until more data arrives or
+   sha1_finish_ctx is called.  */
+void
+sha1_process_bytes (const void *buffer, size_t len, struct sha1_ctx *ctx)
+{
+  /* When we already have some bits in our internal buffer concatenate
+     both inputs first.  */
+  if (ctx->buflen != 0)
+    {
+      size_t left_over = ctx->buflen;
+      /* Take as much new data as fits into the 128-byte staging area.  */
+      size_t add = 128 - left_over > len ? len : 128 - left_over;
+
+      memcpy (&((char *) ctx->buffer)[left_over], buffer, add);
+      ctx->buflen += add;
+
+      if (ctx->buflen > 64)
+	{
+	  /* Hash the whole 64-byte blocks now staged ...  */
+	  sha1_process_block (ctx->buffer, ctx->buflen & ~63, ctx);
+
+	  /* ... and move the remainder to the front of the buffer.  */
+	  ctx->buflen &= 63;
+	  /* The regions in the following copy operation cannot overlap.  */
+	  memcpy (ctx->buffer,
+		  &((char *) ctx->buffer)[(left_over + add) & ~63],
+		  ctx->buflen);
+	}
+
+      buffer = (const char *) buffer + add;
+      len -= add;
+    }
+
+  /* Process available complete blocks.  */
+  if (len >= 64)
+    {
+#if !_STRING_ARCH_unaligned
+# define alignof(type) offsetof (struct { char c; type x; }, x)
+# define UNALIGNED_P(p) (((size_t) p) % alignof (sha1_uint32) != 0)
+      /* A misaligned BUFFER is copied into ctx->buffer one block at a
+	 time.  The test is a strict '>', so a final run of exactly 64
+	 bytes is left for the tail code below, which copies and hashes
+	 it there.  */
+      if (UNALIGNED_P (buffer))
+	while (len > 64)
+	  {
+	    sha1_process_block (memcpy (ctx->buffer, buffer, 64), 64, ctx);
+	    buffer = (const char *) buffer + 64;
+	    len -= 64;
+	  }
+      else
+#endif
+	{
+	  /* Hash all complete blocks directly from the caller's memory.  */
+	  sha1_process_block (buffer, len & ~63, ctx);
+	  buffer = (const char *) buffer + (len & ~63);
+	  len &= 63;
+	}
+    }
+
+  /* Move remaining bytes in internal buffer.  */
+  if (len > 0)
+    {
+      size_t left_over = ctx->buflen;
+
+      memcpy (&((char *) ctx->buffer)[left_over], buffer, len);
+      left_over += len;
+      if (left_over >= 64)
+	{
+	  sha1_process_block (ctx->buffer, 64, ctx);
+	  left_over -= 64;
+	  /* buffer[] holds 32-bit words, so &ctx->buffer[16] is byte
+	     offset 64, i.e. the start of the second staged block.  */
+	  memcpy (ctx->buffer, &ctx->buffer[16], left_over);
+	}
+      ctx->buflen = left_over;
+    }
+}
+
+/* --- Code below is the primary difference between md5.c and sha1.c --- */
+
+/* SHA1 round constants */
+#define K1 0x5a827999
+#define K2 0x6ed9eba1
+#define K3 0x8f1bbcdc
+#define K4 0xca62c1d6
+
+/* Round functions.  Note that F2 is the same as F4.  
*/
+#define F1(B,C,D) ( D ^ ( B & ( C ^ D ) ) )
+#define F2(B,C,D) (B ^ C ^ D)
+#define F3(B,C,D) ( ( B & C ) | ( D & ( B | C ) ) )
+#define F4(B,C,D) (B ^ C ^ D)
+
+/* Process LEN bytes of BUFFER, accumulating context into CTX.
+   It is assumed that LEN % 64 == 0.
+   Most of this code comes from GnuPG's cipher/sha1.c.
+
+   BUFFER is read through a sha1_uint32 pointer.  For every 64-byte
+   block the 16 input words are passed through SWAP into x[], 80 rounds
+   are run on local copies of the chaining words, and the result is
+   added back into ctx->A .. ctx->E.  */
+
+void
+sha1_process_block (const void *buffer, size_t len, struct sha1_ctx *ctx)
+{
+  const sha1_uint32 *words = (const sha1_uint32*) buffer;
+  size_t nwords = len / sizeof (sha1_uint32);
+  const sha1_uint32 *endp = words + nwords;
+  /* 16-word circular window over the message schedule (see M below).  */
+  sha1_uint32 x[16];
+  sha1_uint32 a = ctx->A;
+  sha1_uint32 b = ctx->B;
+  sha1_uint32 c = ctx->C;
+  sha1_uint32 d = ctx->D;
+  sha1_uint32 e = ctx->E;
+
+  /* First increment the byte count.  RFC 1321 specifies the possible
+     length of the file up to 2^64 bits.  Here we only compute the
+     number of bytes.  Do a double word increment.
+     NOTE(review): total[] holds 32-bit words while LEN is a size_t;
+     the single-carry test presumably relies on LEN < 2^32 -- confirm.  */
+  ctx->total[0] += len;
+  if (ctx->total[0] < len)
+    ++ctx->total[1];
+
+/* Rotate the 32-bit value X left by N bits.  */
+#define rol(x, n) (((x) << (n)) | ((sha1_uint32) (x) >> (32 - (n))))
+
+/* Schedule word I (16 <= I < 80): xor the words 3, 8, 14 and 16
+   positions back, rotate left by one, store the result in slot I & 15
+   of x[] and yield it.  Uses the local 'tm' as scratch.  */
+#define M(I) ( tm =   x[I&0x0f] ^ x[(I-14)&0x0f] \
+		    ^ x[(I-8)&0x0f] ^ x[(I-3)&0x0f] \
+	       , (x[I&0x0f] = rol(tm, 1)) )
+
+/* One round: E += rol(A,5) + F(B,C,D) + K + M, then B = rol(B,30).
+   The call sites permute a..e instead of shuffling the variables.  */
+#define R(A,B,C,D,E,F,K,M)  do { E += rol( A, 5 )     \
+				      + F( B, C, D )  \
+				      + K	      \
+				      + M;	      \
+				 B = rol( B, 30 );    \
+			       } while(0)
+
+  while (words < endp)
+    {
+      sha1_uint32 tm;
+      int t;
+      /* Load the next 16 input words.  */
+      for (t = 0; t < 16; t++)
+	{
+	  x[t] = SWAP (*words);
+	  words++;
+	}
+
+      /* Rounds 0-19: F1, K1.  */
+      R( a, b, c, d, e, F1, K1, x[ 0] );
+      R( e, a, b, c, d, F1, K1, x[ 1] );
+      R( d, e, a, b, c, F1, K1, x[ 2] );
+      R( c, d, e, a, b, F1, K1, x[ 3] );
+      R( b, c, d, e, a, F1, K1, x[ 4] );
+      R( a, b, c, d, e, F1, K1, x[ 5] );
+      R( e, a, b, c, d, F1, K1, x[ 6] );
+      R( d, e, a, b, c, F1, K1, x[ 7] );
+      R( c, d, e, a, b, F1, K1, x[ 8] );
+      R( b, c, d, e, a, F1, K1, x[ 9] );
+      R( a, b, c, d, e, F1, K1, x[10] );
+      R( e, a, b, c, d, F1, K1, x[11] );
+      R( d, e, a, b, c, F1, K1, x[12] );
+      R( c, d, e, a, b, F1, K1, x[13] );
+      R( b, c, d, e, a, F1, K1, x[14] );
+      R( a, b, c, d, e, F1, K1, x[15] );
+      R( e, a, b, c, d, F1, K1, M(16) );
+      R( d, e, a, b, c, F1, K1, M(17) );
+      R( c, d, e, a, b, F1, K1, M(18) );
+      R( b, c, d, e, a, F1, K1, M(19) );
+      /* Rounds 20-39: F2, K2.  */
+      R( a, b, c, d, e, F2, K2, M(20) );
+      R( e, a, b, c, d, F2, K2, M(21) );
+      R( d, e, a, b, c, F2, K2, M(22) );
+      R( c, d, e, a, b, F2, K2, M(23) );
+      R( b, c, d, e, a, F2, K2, M(24) );
+      R( a, b, c, d, e, F2, K2, M(25) );
+      R( e, a, b, c, d, F2, K2, M(26) );
+      R( d, e, a, b, c, F2, K2, M(27) );
+      R( c, d, e, a, b, F2, K2, M(28) );
+      R( b, c, d, e, a, F2, K2, M(29) );
+      R( a, b, c, d, e, F2, K2, M(30) );
+      R( e, a, b, c, d, F2, K2, M(31) );
+      R( d, e, a, b, c, F2, K2, M(32) );
+      R( c, d, e, a, b, F2, K2, M(33) );
+      R( b, c, d, e, a, F2, K2, M(34) );
+      R( a, b, c, d, e, F2, K2, M(35) );
+      R( e, a, b, c, d, F2, K2, M(36) );
+      R( d, e, a, b, c, F2, K2, M(37) );
+      R( c, d, e, a, b, F2, K2, M(38) );
+      R( b, c, d, e, a, F2, K2, M(39) );
+      /* Rounds 40-59: F3, K3.  */
+      R( a, b, c, d, e, F3, K3, M(40) );
+      R( e, a, b, c, d, F3, K3, M(41) );
+      R( d, e, a, b, c, F3, K3, M(42) );
+      R( c, d, e, a, b, F3, K3, M(43) );
+      R( b, c, d, e, a, F3, K3, M(44) );
+      R( a, b, c, d, e, F3, K3, M(45) );
+      R( e, a, b, c, d, F3, K3, M(46) );
+      R( d, e, a, b, c, F3, K3, M(47) );
+      R( c, d, e, a, b, F3, K3, M(48) );
+      R( b, c, d, e, a, F3, K3, M(49) );
+      R( a, b, c, d, e, F3, K3, M(50) );
+      R( e, a, b, c, d, F3, K3, M(51) );
+      R( d, e, a, b, c, F3, K3, M(52) );
+      R( c, d, e, a, b, F3, K3, M(53) );
+      R( b, c, d, e, a, F3, K3, M(54) );
+      R( a, b, c, d, e, F3, K3, M(55) );
+      R( e, a, b, c, d, F3, K3, M(56) );
+      R( d, e, a, b, c, F3, K3, M(57) );
+      R( c, d, e, a, b, F3, K3, M(58) );
+      R( b, c, d, e, a, F3, K3, M(59) );
+      /* Rounds 60-79: F4, K4.  */
+      R( a, b, c, d, e, F4, K4, M(60) );
+      R( e, a, b, c, d, F4, K4, M(61) );
+      R( d, e, a, b, c, F4, K4, M(62) );
+      R( c, d, e, a, b, F4, K4, M(63) );
+      R( b, c, d, e, a, F4, K4, M(64) );
+      R( a, b, c, d, e, F4, K4, M(65) );
+      R( e, a, b, c, d, F4, K4, M(66) );
+      R( d, e, a, b, c, F4, K4, M(67) );
+      R( c, d, e, a, b, F4, K4, M(68) );
+      R( b, c, d, e, a, F4, K4, M(69) );
+      R( a, b, c, d, e, F4, K4, M(70) );
+      R( e, a, b, c, d, F4, K4, M(71) );
+      R( d, e, a, b, c, F4, K4, M(72) );
+      R( c, d, e, a, b, F4, K4, M(73) );
+      R( b, c, d, e, a, F4, K4, M(74) );
+      R( a, b, c, d, e, F4, K4, M(75) );
+      R( e, a, b, c, d, F4, K4, M(76) );
+      R( d, e, a, b, c, F4, K4, M(77) );
+      R( c, d, e, a, b, F4, K4, M(78) );
+      R( b, c, d, e, a, F4, K4, M(79) );
+
+      /* Fold this block into the context; the locals continue from
+	 the updated values for the next block.  */
+      a = ctx->A += a;
+      b = ctx->B += b;
+      c = ctx->C += c;
+      d = ctx->D += d;
+      e = ctx->E += e;
+    }
+}
diff --git a/sha1.h b/sha1.h
new file mode 100644
index 00000000..0f986585
--- /dev/null
+++ b/sha1.h
@@ -0,0 +1,136 @@
+/* Declarations of functions and data types used for SHA1 sum
+   library functions.
+   Copyright (C) 2000, 2001, 2003, 2005, 2006, 2008
+   Free Software Foundation, Inc.
+
+   This program is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published by the
+   Free Software Foundation; either version 2, or (at your option) any
+   later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software Foundation,
+   Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.  */
+
+#ifndef SHA1_H
+# define SHA1_H 1
+
+/* FILE, used by the sha1_stream prototype.  */
+#include <stdio.h>
+
+#if 1 /* defined HAVE_LIMITS_H || _LIBC */
+# include <limits.h>
+#endif
+
+/* The following contortions are an attempt to use the C preprocessor
+   to determine an unsigned integral type that is 32 bits wide.  An
+   alternative approach is to use autoconf's AC_CHECK_SIZEOF macro, but
+   doing that would require that the configure script compile and *run*
+   the resulting executable.  Locally running cross-compiled executables
+   is usually not possible.  
*/
+
+#if 1 /* def _LIBC */
+/* uint32_t and uintptr_t for the two typedefs below.  */
+# include <stdint.h>
+typedef uint32_t sha1_uint32;
+typedef uintptr_t sha1_uintptr;
+#else
+# define INT_MAX_32_BITS 2147483647
+
+/* If UINT_MAX isn't defined, assume it's a 32-bit type.
+   This should be valid for all systems GNU cares about because
+   that doesn't include 16-bit systems, and only modern systems
+   (that certainly have <limits.h>) have 64+-bit integral types.  */
+
+# ifndef INT_MAX
+#  define INT_MAX INT_MAX_32_BITS
+# endif
+
+# if INT_MAX == INT_MAX_32_BITS
+   typedef unsigned int sha1_uint32;
+# else
+#  if SHRT_MAX == INT_MAX_32_BITS
+    typedef unsigned short sha1_uint32;
+#  else
+#   if LONG_MAX == INT_MAX_32_BITS
+     typedef unsigned long sha1_uint32;
+#   else
+     /* The following line is intended to evoke an error.
+        Using #error is not portable enough.  */
+     "Cannot determine unsigned 32-bit data type."
+#   endif
+#  endif
+# endif
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Structure to save state of computation between the single steps.  */
+struct sha1_ctx
+{
+  /* The five 32-bit chaining words of the digest.  */
+  sha1_uint32 A;
+  sha1_uint32 B;
+  sha1_uint32 C;
+  sha1_uint32 D;
+  sha1_uint32 E;
+
+  /* Number of bytes hashed so far: total[0] is the low word and
+     total[1] receives the carry.  */
+  sha1_uint32 total[2];
+  /* Number of bytes currently waiting in buffer[].  */
+  sha1_uint32 buflen;
+  /* Staging area of 32 words = 128 bytes, i.e. two 64-byte blocks.  */
+  sha1_uint32 buffer[32];
+};
+
+/* Initialize structure containing state of computation. */
+extern void sha1_init_ctx (struct sha1_ctx *ctx);
+
+/* Starting with the result of former calls of this function (or the
+   initialization function) update the context for the next LEN bytes
+   starting at BUFFER.
+   It is necessary that LEN is a multiple of 64!!! */
+extern void sha1_process_block (const void *buffer, size_t len,
+				struct sha1_ctx *ctx);
+
+/* Starting with the result of former calls of this function (or the
+   initialization function) update the context for the next LEN bytes
+   starting at BUFFER.
+   It is NOT required that LEN is a multiple of 64.  */
+extern void sha1_process_bytes (const void *buffer, size_t len,
+				struct sha1_ctx *ctx);
+
+/* Process the remaining bytes in the buffer and put result from CTX
+   in first 20 bytes following RESBUF.  The result is always in little
+   endian byte order, so that a byte-wise output yields to the wanted
+   ASCII representation of the message digest.
+
+   IMPORTANT: On some systems it is required that RESBUF be correctly
+   aligned for a 32 bits value.  */
+extern void *sha1_finish_ctx (struct sha1_ctx *ctx, void *resbuf);
+
+/* Put result from CTX in first 20 bytes following RESBUF.  The result is
+   always in little endian byte order, so that a byte-wise output yields
+   to the wanted ASCII representation of the message digest.
+
+   IMPORTANT: On some systems it is required that RESBUF is correctly
+   aligned for a 32 bits value.  */
+extern void *sha1_read_ctx (const struct sha1_ctx *ctx, void *resbuf);
+
+/* Compute SHA1 message digest for bytes read from STREAM.  The
+   resulting message digest number will be written into the 20 bytes
+   beginning at RESBLOCK.  Returns 0 on success and 1 on a read error.  */
+extern int sha1_stream (FILE *stream, void *resblock);
+
+/* Compute SHA1 message digest for LEN bytes beginning at BUFFER.  The
+   result is always in little endian byte order, so that a byte-wise
+   output yields to the wanted ASCII representation of the message
+   digest.  */
+extern void *sha1_buffer (const char *buffer, size_t len, void *resblock);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/super-ddf.c b/super-ddf.c
new file mode 100644
index 00000000..faaf0a7c
--- /dev/null
+++ b/super-ddf.c
@@ -0,0 +1,5273 @@
+/*
+ * mdadm - manage Linux "md" devices aka RAID arrays.
+ *
+ * Copyright (C) 2006-2014 Neil Brown
+ *
+ *
+ *    This program is free software; you can redistribute it and/or modify
+ *    it under the terms of the GNU General Public License as published by
+ *    the Free Software Foundation; either version 2 of the License, or
+ *    (at your option) any later version.  
+ *
+ *    This program is distributed in the hope that it will be useful,
+ *    but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *    GNU General Public License for more details.
+ *
+ *    You should have received a copy of the GNU General Public License
+ *    along with this program; if not, write to the Free Software
+ *    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *
+ *    Author: Neil Brown
+ *    Email:
+ *
+ * Specifications for DDF taken from Common RAID DDF Specification Revision 1.2
+ * (July 28 2006).  Reused by permission of SNIA.
+ */
+
+#define HAVE_STDINT_H 1
+#include "mdadm.h"
+#include "mdmon.h"
+#include "sha1.h"
+/* NOTE(review): the two #include lines below carry no header name in
+ * this copy of the patch; restore them from the upstream file.
+ */
+#include
+#include
+
+/* a non-official T10 name for creation GUIDs */
+static char T10[] = "Linux-MD";
+
+/* DDF timestamps are 1980 based, so we need to add
+ * seconds-in-decade-of-seventies to convert to linux timestamps.
+ * 10 years with 2 leap years.
+ */
+#define DECADE (3600*24*(365*10+2))
+/* Only declared here; the definition is not part of this file section. */
+unsigned long crc32(
+	unsigned long crc,
+	const unsigned char *buf,
+	unsigned len);
+
+#define DDF_NOTFOUND (~0U)
+#define DDF_CONTAINER (DDF_NOTFOUND-1)
+
+/* Default for safe_mode_delay. Same value as for IMSM.
+ */
+static const int DDF_SAFE_MODE_DELAY = 4000;
+
+/* The DDF metadata handling.
+ * DDF metadata lives at the end of the device.
+ * The last 512 byte block provides an 'anchor' which is used to locate
+ * the rest of the metadata which usually lives immediately behind the anchor.
+ *
+ * Note:
+ *  - all multibyte numeric fields are bigendian.
+ *  - all strings are space padded.
+ *
+ */
+
+/* Big-endian field types.  Each is a one-member struct around the raw
+ * __u16/__u32/__u64, so a value has to go through cpu_to_beNN() /
+ * beNN_to_cpu() (below) to be used as a host integer.  The beNN_eq,
+ * be16_and/or/clear/set macros operate on the raw stored value without
+ * converting it.
+ */
+typedef struct __be16 {
+	__u16 _v16;
+} be16;
+#define be16_eq(x, y) ((x)._v16 == (y)._v16)
+#define be16_and(x, y) ((x)._v16 & (y)._v16)
+#define be16_or(x, y) ((x)._v16 | (y)._v16)
+#define be16_clear(x, y) ((x)._v16 &= ~(y)._v16)
+#define be16_set(x, y) ((x)._v16 |= (y)._v16)
+
+typedef struct __be32 {
+	__u32 _v32;
+} be32;
+#define be32_eq(x, y) ((x)._v32 == (y)._v32)
+
+typedef struct __be64 {
+	__u64 _v64;
+} be64;
+#define be64_eq(x, y) ((x)._v64 == (y)._v64)
+
+/* Conversions between host integers and the wrapper types above. */
+#define be16_to_cpu(be) __be16_to_cpu((be)._v16)
+static inline be16 cpu_to_be16(__u16 x)
+{
+	be16 be = { ._v16 = __cpu_to_be16(x) };
+	return be;
+}
+
+#define be32_to_cpu(be) __be32_to_cpu((be)._v32)
+static inline be32 cpu_to_be32(__u32 x)
+{
+	be32 be = { ._v32 = __cpu_to_be32(x) };
+	return be;
+}
+
+#define be64_to_cpu(be) __be64_to_cpu((be)._v64)
+static inline be64 cpu_to_be64(__u64 x)
+{
+	be64 be = { ._v64 = __cpu_to_be64(x) };
+	return be;
+}
+
+/* Primary Raid Level (PRL) */
+#define	DDF_RAID0	0x00
+#define	DDF_RAID1	0x01
+#define	DDF_RAID3	0x03
+#define	DDF_RAID4	0x04
+#define	DDF_RAID5	0x05
+#define	DDF_RAID1E	0x11
+#define	DDF_JBOD	0x0f
+#define	DDF_CONCAT	0x1f
+#define	DDF_RAID5E	0x15
+#define	DDF_RAID5EE	0x25
+#define	DDF_RAID6	0x06
+
+/* Raid Level Qualifier (RLQ) */
+#define	DDF_RAID0_SIMPLE	0x00
+#define	DDF_RAID1_SIMPLE	0x00 /* just 2 devices in this plex */
+#define	DDF_RAID1_MULTI		0x01 /* exactly 3 devices in this plex */
+#define	DDF_RAID3_0		0x00 /* parity in first extent */
+#define	DDF_RAID3_N		0x01 /* parity in last extent */
+#define	DDF_RAID4_0		0x00 /* parity in first extent */
+#define	DDF_RAID4_N		0x01 /* parity in last extent */
+/* these apply to raid5e and raid5ee as well */
+#define	DDF_RAID5_0_RESTART	0x00 /* same as 'right asymmetric' - layout 1 */
+#define	DDF_RAID6_0_RESTART	0x01 /* raid6 different from raid5 here!!! 
*/
+#define	DDF_RAID5_N_RESTART	0x02 /* same as 'left asymmetric' - layout 0 */
+#define	DDF_RAID5_N_CONTINUE	0x03 /* same as 'left symmetric' - layout 2 */
+
+#define DDF_RAID1E_ADJACENT	0x00 /* raid10 nearcopies==2 */
+#define DDF_RAID1E_OFFSET	0x01 /* raid10 offsetcopies==2 */
+
+/* Secondary RAID Level (SRL) */
+#define	DDF_2STRIPED	0x00	/* This is weirder than RAID0 !! */
+#define	DDF_2MIRRORED	0x01
+#define	DDF_2CONCAT	0x02
+#define	DDF_2SPANNED	0x03	/* This is also weird - be careful */
+
+/* Magic numbers */
+/* Each expands to a cpu_to_be32() call yielding a be32 value, so these
+ * are compared with be32_eq() rather than '=='.
+ */
+#define	DDF_HEADER_MAGIC	cpu_to_be32(0xDE11DE11)
+#define	DDF_CONTROLLER_MAGIC	cpu_to_be32(0xAD111111)
+#define	DDF_PHYS_RECORDS_MAGIC	cpu_to_be32(0x22222222)
+#define	DDF_PHYS_DATA_MAGIC	cpu_to_be32(0x33333333)
+#define	DDF_VIRT_RECORDS_MAGIC	cpu_to_be32(0xDDDDDDDD)
+#define	DDF_VD_CONF_MAGIC	cpu_to_be32(0xEEEEEEEE)
+#define	DDF_SPARE_ASSIGN_MAGIC	cpu_to_be32(0x55555555)
+#define	DDF_VU_CONF_MAGIC	cpu_to_be32(0x88888888)
+#define	DDF_VENDOR_LOG_MAGIC	cpu_to_be32(0x01dBEEF0)
+#define	DDF_BBM_LOG_MAGIC	cpu_to_be32(0xABADB10C)
+
+#define	DDF_GUID_LEN	24
+#define DDF_REVISION_0	"01.00.00"
+#define DDF_REVISION_2	"01.02.00"
+
+/* The DDF header.  The 'type' field (DDF_HEADER_* below) tells whether
+ * a given copy is the anchor, the primary or the secondary header.
+ * The members add up to 512 bytes (256 + the 256-byte pad4).
+ */
+struct ddf_header {
+	be32	magic;		/* DDF_HEADER_MAGIC */
+	be32	crc;
+	char	guid[DDF_GUID_LEN];
+	char	revision[8];	/* 01.02.00 */
+	be32	seq;		/* starts at '1' */
+	be32	timestamp;
+	__u8	openflag;
+	__u8	foreignflag;
+	__u8	enforcegroups;
+	__u8	pad0;		/* 0xff */
+	__u8	pad1[12];	/* 12 * 0xff */
+	/* 64 bytes so far */
+	__u8	header_ext[32];	/* reserved: fill with 0xff */
+	be64	primary_lba;
+	be64	secondary_lba;
+	__u8	type;
+	__u8	pad2[3];	/* 0xff */
+	be32	workspace_len;	/* sectors for vendor space -
+				 * at least 32768(sectors) */
+	be64	workspace_lba;
+	be16	max_pd_entries;	/* one of 15, 63, 255, 1023, 4095 */
+	be16	max_vd_entries; /* 2^(4,6,8,10,12)-1 : i.e. as above */
+	be16	max_partitions; /* i.e. max num of configuration
+				   record entries per disk */
+	be16	config_record_len; /* 1 +ROUNDUP(max_primary_element_entries
+				                 *12/512) */
+	be16	max_primary_element_entries; /* 16, 64, 256, 1024, or 4096 */
+	__u8	pad3[54];	/* 0xff */
+	/* 192 bytes so far */
+	be32	controller_section_offset;
+	be32	controller_section_length;
+	be32	phys_section_offset;
+	be32	phys_section_length;
+	be32	virt_section_offset;
+	be32	virt_section_length;
+	be32	config_section_offset;
+	be32	config_section_length;
+	be32	data_section_offset;
+	be32	data_section_length;
+	be32	bbm_section_offset;
+	be32	bbm_section_length;
+	be32	diag_space_offset;
+	be32	diag_space_length;
+	be32	vendor_offset;
+	be32	vendor_length;
+	/* 256 bytes so far */
+	__u8	pad4[256];	/* 0xff */
+};
+
+/* type field */
+#define	DDF_HEADER_ANCHOR	0x00
+#define	DDF_HEADER_PRIMARY	0x01
+#define	DDF_HEADER_SECONDARY	0x02
+
+/* The content of the 'controller section' - global scope */
+/* Members add up to 512 bytes. */
+struct ddf_controller_data {
+	be32	magic;			/* DDF_CONTROLLER_MAGIC */
+	be32	crc;
+	char	guid[DDF_GUID_LEN];
+	struct controller_type {
+		be16 vendor_id;
+		be16 device_id;
+		be16 sub_vendor_id;
+		be16 sub_device_id;
+	} type;
+	char	product_id[16];
+	__u8	pad[8];	/* 0xff */
+	__u8	vendor_data[448];
+};
+
+/* The content of phys_section - global scope */
+/* A 64-byte header followed by a variable number of 64-byte
+ * phys_disk_entry records (entries[] is a zero-length trailing array).
+ */
+struct phys_disk {
+	be32	magic;		/* DDF_PHYS_RECORDS_MAGIC */
+	be32	crc;
+	be16	used_pdes;	/* This is a counter, not a max - the list
+				 * of used entries may not be dense */
+	be16	max_pdes;
+	__u8	pad[52];
+	struct phys_disk_entry {
+		char	guid[DDF_GUID_LEN];
+		be32	refnum;
+		be16	type;
+		be16	state;
+		be64	config_size;	/* DDF structures must be after here */
+		char	path[18];	/* Another horrible structure really
+					 * but is "used for information
+					 * purposes only" */
+		__u8	pad[6];
+	} entries[0];
+};
+
+/* phys_disk_entry.type is a bitmap - bigendian remember */
+#define	DDF_Forced_PD_GUID		1
+#define	DDF_Active_in_VD		2
+#define	DDF_Global_Spare		4 /* VD_CONF records are ignored */
+#define	DDF_Spare			8 /* overrides Global_spare */
+#define	DDF_Foreign			16
+#define	DDF_Legacy			32 /* no DDF on this device */
+
+#define	DDF_Interface_mask		0xf00
+#define	DDF_Interface_SCSI		0x100
+#define	DDF_Interface_SAS		0x200
+#define	DDF_Interface_SATA		0x300
+#define	DDF_Interface_FC		0x400
+
+/* phys_disk_entry.state is a bigendian bitmap */
+#define	DDF_Online			1
+#define	DDF_Failed			2 /* overrides 1,4,8 */
+#define	DDF_Rebuilding			4
+#define	DDF_Transition			8
+#define	DDF_SMART			16
+#define	DDF_ReadErrors			32
+#define	DDF_Missing			64
+
+/* The content of the virt_section global scope */
+/* A 64-byte header followed by a variable number of 64-byte
+ * virtual_entry records (entries[] is a zero-length trailing array).
+ */
+struct virtual_disk {
+	be32	magic;		/* DDF_VIRT_RECORDS_MAGIC */
+	be32	crc;
+	be16	populated_vdes;
+	be16	max_vdes;
+	__u8	pad[52];
+	struct virtual_entry {
+		char	guid[DDF_GUID_LEN];
+		be16	unit;
+		__u16	pad0;	/* 0xffff */
+		be16	guid_crc;
+		be16	type;
+		__u8	state;
+		__u8	init_state;
+		__u8	pad1[14];
+		char	name[16];
+	} entries[0];
+};
+
+/* virtual_entry.type is a bitmap - bigendian */
+#define	DDF_Shared		1
+#define	DDF_Enforce_Groups	2
+#define	DDF_Unicode		4
+#define	DDF_Owner_Valid		8
+
+/* virtual_entry.state is a bigendian bitmap */
+/* (state is a single byte, so byte order does not come into play.  The
+ * low three bits selected by DDF_state_mask hold one of the values
+ * 0x0-0x5 below; morphing and inconsistent are separate flag bits.)
+ */
+#define	DDF_state_mask		0x7
+#define	DDF_state_optimal	0x0
+#define	DDF_state_degraded	0x1
+#define	DDF_state_deleted	0x2
+#define	DDF_state_missing	0x3
+#define	DDF_state_failed	0x4
+#define	DDF_state_part_optimal	0x5
+
+#define	DDF_state_morphing	0x8
+#define	DDF_state_inconsistent	0x10
+
+/* virtual_entry.init_state is a bigendian bitmap */
+#define	DDF_initstate_mask	0x03
+#define	DDF_init_not		0x00
+#define	DDF_init_quick		0x01 /* initialisation is in progress.
+				      * i.e. 
'state_inconsistent' */
+#define	DDF_init_full		0x02
+
+#define	DDF_access_mask		0xc0
+#define	DDF_access_rw		0x00
+#define	DDF_access_ro		0x80
+#define	DDF_access_blocked	0xc0
+
+/* The content of the config_section - local scope
+ * It has multiple records each config_record_len sectors
+ * They can be vd_config or spare_assign
+ */
+
+/* The fixed members up to and including vendor[] add up to 512 bytes.
+ * They are followed by the phys_refnum[] array and then by the be64
+ * LBA offsets, which LBA_OFFSET() below locates at index 'mppe' of
+ * phys_refnum[].
+ */
+struct vd_config {
+	be32	magic;		/* DDF_VD_CONF_MAGIC */
+	be32	crc;
+	char	guid[DDF_GUID_LEN];
+	be32	timestamp;
+	be32	seqnum;
+	__u8	pad0[24];
+	be16	prim_elmnt_count;
+	__u8	chunk_shift;	/* 0 == 512, 1==1024 etc */
+	__u8	prl;
+	__u8	rlq;
+	__u8	sec_elmnt_count;
+	__u8	sec_elmnt_seq;
+	__u8	srl;
+	be64	blocks;		/* blocks per component could be different
+				 * on different component devices...(only
+				 * for concat I hope) */
+	be64	array_blocks;	/* blocks in array */
+	__u8	pad1[8];
+	be32	spare_refs[8];	/* This is used to detect missing spares.
+				 * As we don't have an interface for that
+				 * the values are ignored.
+				 */
+	__u8	cache_pol[8];
+	__u8	bg_rate;
+	__u8	pad2[3];
+	__u8	pad3[52];
+	__u8	pad4[192];
+	__u8	v0[32];	/* reserved- 0xff */
+	__u8	v1[32];	/* reserved- 0xff */
+	__u8	v2[16];	/* reserved- 0xff */
+	__u8	v3[16];	/* reserved- 0xff */
+	__u8	vendor[32];
+	be32	phys_refnum[0];	/* refnum of each disk in sequence */
+      /*__u64	lba_offset[0];  LBA offset in each phys.  Note extents in a
+				bvd are always the same size */
+};
+/* Address of the first LBA offset of 'vd': the be64 array that starts
+ * right after (ddf)->mppe phys_refnum entries.
+ */
+#define LBA_OFFSET(ddf, vd) ((be64 *) &(vd)->phys_refnum[(ddf)->mppe])
+
+/* vd_config.cache_pol[7] is a bitmap */
+#define	DDF_cache_writeback	1	/* else writethrough */
+#define	DDF_cache_wadaptive	2	/* only applies if writeback */
+#define	DDF_cache_readahead	4
+#define	DDF_cache_radaptive	8	/* only if doing read-ahead */
+#define	DDF_cache_ifnobatt	16	/* even to write cache if battery is poor */
+#define	DDF_cache_wallowed	32	/* enable write caching */
+#define	DDF_cache_rallowed	64	/* enable read caching */
+
+/* A 32-byte header followed by a variable number of 32-byte
+ * spare_assign_entry records.
+ */
+struct spare_assign {
+	be32	magic;		/* DDF_SPARE_ASSIGN_MAGIC */
+	be32	crc;
+	be32	timestamp;
+	__u8	reserved[7];
+	__u8	type;
+	be16	populated;	/* SAEs used */
+	be16	max;		/* max SAEs */
+	__u8	pad[8];
+	struct spare_assign_entry {
+		char	guid[DDF_GUID_LEN];
+		be16	secondary_element;
+		__u8	pad[6];
+	} spare_ents[0];
+};
+/* spare_assign.type is a bitmap */
+#define	DDF_spare_dedicated	0x1	/* else global */
+#define	DDF_spare_revertible	0x2	/* else committable */
+#define	DDF_spare_active	0x4	/* else not active */
+#define	DDF_spare_affinity	0x8	/* enclosure affinity */
+
+/* The data_section contents - local scope */
+/* Members add up to 512 bytes. */
+struct disk_data {
+	be32	magic;		/* DDF_PHYS_DATA_MAGIC */
+	be32	crc;
+	char	guid[DDF_GUID_LEN];
+	be32	refnum;		/* crc of some magic drive data ... */
+	__u8	forced_ref;	/* set when above was not result of magic */
+	__u8	forced_guid;	/* set if guid was forced rather than magic */
+	__u8	vendor[32];
+	__u8	pad[442];
+};
+
+/* bbm_section content */
+/* NOTE(review): the 2-byte entry_count is directly followed by the
+ * 4-byte spare_count and no packing attribute is visible here, so the
+ * compiler may insert padding between them -- confirm before mapping
+ * this struct onto on-disk data.
+ */
+struct bad_block_log {
+	be32	magic;
+	be32	crc;
+	be16	entry_count;
+	be32	spare_count;
+	__u8	pad[10];
+	be64	first_spare;
+	struct mapped_block {
+		be64	defective_start;
+		be32	replacement_start;
+		be16	remap_count;
+		__u8	pad[2];
+	} entries[0];
+};
+
+/* Struct for internally holding ddf structures */
+/* The DDF structure stored on each device is potentially
+ * quite different, as some data is global and some is local.  
+ * The global data is: + * - ddf header + * - controller_data + * - Physical disk records + * - Virtual disk records + * The local data is: + * - Configuration records + * - Physical Disk data section + * ( and Bad block and vendor which I don't care about yet). + * + * The local data is parsed into separate lists as it is read + * and reconstructed for writing. This means that we only need + * to make config changes once and they are automatically + * propagated to all devices. + * The global (config and disk data) records are each in a list + * of separate data structures. When writing we find the entry + * or entries applicable to the particular device. + */ +struct ddf_super { + struct ddf_header anchor, primary, secondary; + struct ddf_controller_data controller; + struct ddf_header *active; + struct phys_disk *phys; + struct virtual_disk *virt; + char *conf; + int pdsize, vdsize; + unsigned int max_part, mppe, conf_rec_len; + int currentdev; + int updates_pending; + struct vcl { + union { + char space[512]; + struct { + struct vcl *next; + unsigned int vcnum; /* index into ->virt */ + /* For an array with a secondary level there are + * multiple vd_config structures, all with the same + * guid but with different sec_elmnt_seq. + * One of these structures is in 'conf' below. + * The others are in other_bvds, not in any + * particular order. 
+ */ + struct vd_config **other_bvds; + __u64 *block_sizes; /* NULL if all the same */ + }; + }; + struct vd_config conf; + } *conflist, *currentconf; + struct dl { + union { + char space[512]; + struct { + struct dl *next; + int major, minor; + char *devname; + int fd; + unsigned long long size; /* sectors */ + be64 primary_lba; /* sectors */ + be64 secondary_lba; /* sectors */ + be64 workspace_lba; /* sectors */ + int pdnum; /* index in ->phys */ + struct spare_assign *spare; + void *mdupdate; /* hold metadata update */ + + /* These fields used by auto-layout */ + int raiddisk; /* slot to fill in autolayout */ + __u64 esize; + int displayed; + }; + }; + struct disk_data disk; + struct vcl *vlist[0]; /* max_part in size */ + } *dlist, *add_list; +}; + +#ifndef MDASSEMBLE +static int load_super_ddf_all(struct supertype *st, int fd, + void **sbp, char *devname); +static int get_svd_state(const struct ddf_super *, const struct vcl *); +static int +validate_geometry_ddf_container(struct supertype *st, + int level, int layout, int raiddisks, + int chunk, unsigned long long size, + unsigned long long data_offset, + char *dev, unsigned long long *freesize, + int verbose); + +static int validate_geometry_ddf_bvd(struct supertype *st, + int level, int layout, int raiddisks, + int *chunk, unsigned long long size, + unsigned long long data_offset, + char *dev, unsigned long long *freesize, + int verbose); +#endif + +static void free_super_ddf(struct supertype *st); +static int all_ff(const char *guid); +static unsigned int get_pd_index_from_refnum(const struct vcl *vc, + be32 refnum, unsigned int nmax, + const struct vd_config **bvd, + unsigned int *idx); +static void getinfo_super_ddf(struct supertype *st, struct mdinfo *info, char *map); +static void uuid_from_ddf_guid(const char *guid, int uuid[4]); +static void uuid_from_super_ddf(struct supertype *st, int uuid[4]); +static void _ddf_array_name(char *name, const struct ddf_super *ddf, int i); +static void 
getinfo_super_ddf_bvd(struct supertype *st, struct mdinfo *info, char *map); +static int init_super_ddf_bvd(struct supertype *st, + mdu_array_info_t *info, + unsigned long long size, + char *name, char *homehost, + int *uuid, unsigned long long data_offset); + +#if DEBUG +static void pr_state(struct ddf_super *ddf, const char *msg) +{ + unsigned int i; + dprintf("%s: ", msg); + for (i = 0; i < be16_to_cpu(ddf->active->max_vd_entries); i++) { + if (all_ff(ddf->virt->entries[i].guid)) + continue; + dprintf_cont("%u(s=%02x i=%02x) ", i, + ddf->virt->entries[i].state, + ddf->virt->entries[i].init_state); + } + dprintf_cont("\n"); +} +#else +static void pr_state(const struct ddf_super *ddf, const char *msg) {} +#endif + +static void _ddf_set_updates_pending(struct ddf_super *ddf, struct vd_config *vc, + const char *func) +{ + if (vc) { + vc->timestamp = cpu_to_be32(time(0)-DECADE); + vc->seqnum = cpu_to_be32(be32_to_cpu(vc->seqnum) + 1); + } + if (ddf->updates_pending) + return; + ddf->updates_pending = 1; + ddf->active->seq = cpu_to_be32((be32_to_cpu(ddf->active->seq)+1)); + pr_state(ddf, func); +} + +#define ddf_set_updates_pending(x,v) _ddf_set_updates_pending((x), (v), __func__) + +static be32 calc_crc(void *buf, int len) +{ + /* crcs are always at the same place as in the ddf_header */ + struct ddf_header *ddf = buf; + be32 oldcrc = ddf->crc; + __u32 newcrc; + ddf->crc = cpu_to_be32(0xffffffff); + + newcrc = crc32(0, buf, len); + ddf->crc = oldcrc; + /* The crc is stored (like everything) bigendian, so convert + * here for simplicity + */ + return cpu_to_be32(newcrc); +} + +#define DDF_INVALID_LEVEL 0xff +#define DDF_NO_SECONDARY 0xff +static int err_bad_md_layout(const mdu_array_info_t *array) +{ + pr_err("RAID%d layout %x with %d disks is unsupported for DDF\n", + array->level, array->layout, array->raid_disks); + return -1; +} + +static int layout_md2ddf(const mdu_array_info_t *array, + struct vd_config *conf) +{ + be16 prim_elmnt_count = 
cpu_to_be16(array->raid_disks); + __u8 prl = DDF_INVALID_LEVEL, rlq = 0; + __u8 sec_elmnt_count = 1; + __u8 srl = DDF_NO_SECONDARY; + + switch (array->level) { + case LEVEL_LINEAR: + prl = DDF_CONCAT; + break; + case 0: + rlq = DDF_RAID0_SIMPLE; + prl = DDF_RAID0; + break; + case 1: + switch (array->raid_disks) { + case 2: + rlq = DDF_RAID1_SIMPLE; + break; + case 3: + rlq = DDF_RAID1_MULTI; + break; + default: + return err_bad_md_layout(array); + } + prl = DDF_RAID1; + break; + case 4: + if (array->layout != 0) + return err_bad_md_layout(array); + rlq = DDF_RAID4_N; + prl = DDF_RAID4; + break; + case 5: + switch (array->layout) { + case ALGORITHM_LEFT_ASYMMETRIC: + rlq = DDF_RAID5_N_RESTART; + break; + case ALGORITHM_RIGHT_ASYMMETRIC: + rlq = DDF_RAID5_0_RESTART; + break; + case ALGORITHM_LEFT_SYMMETRIC: + rlq = DDF_RAID5_N_CONTINUE; + break; + case ALGORITHM_RIGHT_SYMMETRIC: + /* not mentioned in standard */ + default: + return err_bad_md_layout(array); + } + prl = DDF_RAID5; + break; + case 6: + switch (array->layout) { + case ALGORITHM_ROTATING_N_RESTART: + rlq = DDF_RAID5_N_RESTART; + break; + case ALGORITHM_ROTATING_ZERO_RESTART: + rlq = DDF_RAID6_0_RESTART; + break; + case ALGORITHM_ROTATING_N_CONTINUE: + rlq = DDF_RAID5_N_CONTINUE; + break; + default: + return err_bad_md_layout(array); + } + prl = DDF_RAID6; + break; + case 10: + if (array->raid_disks % 2 == 0 && array->layout == 0x102) { + rlq = DDF_RAID1_SIMPLE; + prim_elmnt_count = cpu_to_be16(2); + sec_elmnt_count = array->raid_disks / 2; + srl = DDF_2SPANNED; + prl = DDF_RAID1; + } else if (array->raid_disks % 3 == 0 + && array->layout == 0x103) { + rlq = DDF_RAID1_MULTI; + prim_elmnt_count = cpu_to_be16(3); + sec_elmnt_count = array->raid_disks / 3; + srl = DDF_2SPANNED; + prl = DDF_RAID1; + } else if (array->layout == 0x201) { + prl = DDF_RAID1E; + rlq = DDF_RAID1E_OFFSET; + } else if (array->layout == 0x102) { + prl = DDF_RAID1E; + rlq = DDF_RAID1E_ADJACENT; + } else + return 
err_bad_md_layout(array); + break; + default: + return err_bad_md_layout(array); + } + conf->prl = prl; + conf->prim_elmnt_count = prim_elmnt_count; + conf->rlq = rlq; + conf->srl = srl; + conf->sec_elmnt_count = sec_elmnt_count; + return 0; +} + +static int err_bad_ddf_layout(const struct vd_config *conf) +{ + pr_err("DDF RAID %u qualifier %u with %u disks is unsupported\n", + conf->prl, conf->rlq, be16_to_cpu(conf->prim_elmnt_count)); + return -1; +} + +static int layout_ddf2md(const struct vd_config *conf, + mdu_array_info_t *array) +{ + int level = LEVEL_UNSUPPORTED; + int layout = 0; + int raiddisks = be16_to_cpu(conf->prim_elmnt_count); + + if (conf->sec_elmnt_count > 1) { + /* see also check_secondary() */ + if (conf->prl != DDF_RAID1 || + (conf->srl != DDF_2STRIPED && conf->srl != DDF_2SPANNED)) { + pr_err("Unsupported secondary RAID level %u/%u\n", + conf->prl, conf->srl); + return -1; + } + if (raiddisks == 2 && conf->rlq == DDF_RAID1_SIMPLE) + layout = 0x102; + else if (raiddisks == 3 && conf->rlq == DDF_RAID1_MULTI) + layout = 0x103; + else + return err_bad_ddf_layout(conf); + raiddisks *= conf->sec_elmnt_count; + level = 10; + goto good; + } + + switch (conf->prl) { + case DDF_CONCAT: + level = LEVEL_LINEAR; + break; + case DDF_RAID0: + if (conf->rlq != DDF_RAID0_SIMPLE) + return err_bad_ddf_layout(conf); + level = 0; + break; + case DDF_RAID1: + if (!((conf->rlq == DDF_RAID1_SIMPLE && raiddisks == 2) || + (conf->rlq == DDF_RAID1_MULTI && raiddisks == 3))) + return err_bad_ddf_layout(conf); + level = 1; + break; + case DDF_RAID1E: + if (conf->rlq == DDF_RAID1E_ADJACENT) + layout = 0x102; + else if (conf->rlq == DDF_RAID1E_OFFSET) + layout = 0x201; + else + return err_bad_ddf_layout(conf); + level = 10; + break; + case DDF_RAID4: + if (conf->rlq != DDF_RAID4_N) + return err_bad_ddf_layout(conf); + level = 4; + break; + case DDF_RAID5: + switch (conf->rlq) { + case DDF_RAID5_N_RESTART: + layout = ALGORITHM_LEFT_ASYMMETRIC; + break; + case 
DDF_RAID5_0_RESTART: + layout = ALGORITHM_RIGHT_ASYMMETRIC; + break; + case DDF_RAID5_N_CONTINUE: + layout = ALGORITHM_LEFT_SYMMETRIC; + break; + default: + return err_bad_ddf_layout(conf); + } + level = 5; + break; + case DDF_RAID6: + switch (conf->rlq) { + case DDF_RAID5_N_RESTART: + layout = ALGORITHM_ROTATING_N_RESTART; + break; + case DDF_RAID6_0_RESTART: + layout = ALGORITHM_ROTATING_ZERO_RESTART; + break; + case DDF_RAID5_N_CONTINUE: + layout = ALGORITHM_ROTATING_N_CONTINUE; + break; + default: + return err_bad_ddf_layout(conf); + } + level = 6; + break; + default: + return err_bad_ddf_layout(conf); + }; + +good: + array->level = level; + array->layout = layout; + array->raid_disks = raiddisks; + return 0; +} + +static int load_ddf_header(int fd, unsigned long long lba, + unsigned long long size, + int type, + struct ddf_header *hdr, struct ddf_header *anchor) +{ + /* read a ddf header (primary or secondary) from fd/lba + * and check that it is consistent with anchor + * Need to check: + * magic, crc, guid, rev, and LBA's header_type, and + * everything after header_type must be the same + */ + if (lba >= size-1) + return 0; + + if (lseek64(fd, lba<<9, 0) < 0) + return 0; + + if (read(fd, hdr, 512) != 512) + return 0; + + if (!be32_eq(hdr->magic, DDF_HEADER_MAGIC)) { + pr_err("bad header magic\n"); + return 0; + } + if (!be32_eq(calc_crc(hdr, 512), hdr->crc)) { + pr_err("bad CRC\n"); + return 0; + } + if (memcmp(anchor->guid, hdr->guid, DDF_GUID_LEN) != 0 || + memcmp(anchor->revision, hdr->revision, 8) != 0 || + !be64_eq(anchor->primary_lba, hdr->primary_lba) || + !be64_eq(anchor->secondary_lba, hdr->secondary_lba) || + hdr->type != type || + memcmp(anchor->pad2, hdr->pad2, 512 - + offsetof(struct ddf_header, pad2)) != 0) { + pr_err("header mismatch\n"); + return 0; + } + + /* Looks good enough to me... 
*/ + return 1; +} + +static void *load_section(int fd, struct ddf_super *super, void *buf, + be32 offset_be, be32 len_be, int check) +{ + unsigned long long offset = be32_to_cpu(offset_be); + unsigned long long len = be32_to_cpu(len_be); + int dofree = (buf == NULL); + + if (check) + if (len != 2 && len != 8 && len != 32 + && len != 128 && len != 512) + return NULL; + + if (len > 1024) + return NULL; + if (!buf && posix_memalign(&buf, 512, len<<9) != 0) + buf = NULL; + + if (!buf) + return NULL; + + if (super->active->type == 1) + offset += be64_to_cpu(super->active->primary_lba); + else + offset += be64_to_cpu(super->active->secondary_lba); + + if ((unsigned long long)lseek64(fd, offset<<9, 0) != (offset<<9)) { + if (dofree) + free(buf); + return NULL; + } + if ((unsigned long long)read(fd, buf, len<<9) != (len<<9)) { + if (dofree) + free(buf); + return NULL; + } + return buf; +} + +static int load_ddf_headers(int fd, struct ddf_super *super, char *devname) +{ + unsigned long long dsize; + + get_dev_size(fd, NULL, &dsize); + + if (lseek64(fd, dsize-512, 0) < 0) { + if (devname) + pr_err("Cannot seek to anchor block on %s: %s\n", + devname, strerror(errno)); + return 1; + } + if (read(fd, &super->anchor, 512) != 512) { + if (devname) + pr_err("Cannot read anchor block on %s: %s\n", + devname, strerror(errno)); + return 1; + } + if (!be32_eq(super->anchor.magic, DDF_HEADER_MAGIC)) { + if (devname) + pr_err("no DDF anchor found on %s\n", + devname); + return 2; + } + if (!be32_eq(calc_crc(&super->anchor, 512), super->anchor.crc)) { + if (devname) + pr_err("bad CRC on anchor on %s\n", + devname); + return 2; + } + if (memcmp(super->anchor.revision, DDF_REVISION_0, 8) != 0 && + memcmp(super->anchor.revision, DDF_REVISION_2, 8) != 0) { + if (devname) + pr_err("can only support super revision %.8s and earlier, not %.8s on %s\n", + DDF_REVISION_2, super->anchor.revision,devname); + return 2; + } + super->active = NULL; + if (load_ddf_header(fd, 
be64_to_cpu(super->anchor.primary_lba), + dsize >> 9, 1, + &super->primary, &super->anchor) == 0) { + if (devname) + pr_err("Failed to load primary DDF header on %s\n", devname); + } else + super->active = &super->primary; + + if (load_ddf_header(fd, be64_to_cpu(super->anchor.secondary_lba), + dsize >> 9, 2, + &super->secondary, &super->anchor)) { + if (super->active == NULL + || (be32_to_cpu(super->primary.seq) + < be32_to_cpu(super->secondary.seq) && + !super->secondary.openflag) + || (be32_to_cpu(super->primary.seq) + == be32_to_cpu(super->secondary.seq) && + super->primary.openflag && !super->secondary.openflag) + ) + super->active = &super->secondary; + } else if (devname && + be64_to_cpu(super->anchor.secondary_lba) != ~(__u64)0) + pr_err("Failed to load secondary DDF header on %s\n", + devname); + if (super->active == NULL) + return 2; + return 0; +} + +static int load_ddf_global(int fd, struct ddf_super *super, char *devname) +{ + void *ok; + ok = load_section(fd, super, &super->controller, + super->active->controller_section_offset, + super->active->controller_section_length, + 0); + super->phys = load_section(fd, super, NULL, + super->active->phys_section_offset, + super->active->phys_section_length, + 1); + super->pdsize = be32_to_cpu(super->active->phys_section_length) * 512; + + super->virt = load_section(fd, super, NULL, + super->active->virt_section_offset, + super->active->virt_section_length, + 1); + super->vdsize = be32_to_cpu(super->active->virt_section_length) * 512; + if (!ok || + !super->phys || + !super->virt) { + free(super->phys); + free(super->virt); + super->phys = NULL; + super->virt = NULL; + return 2; + } + super->conflist = NULL; + super->dlist = NULL; + + super->max_part = be16_to_cpu(super->active->max_partitions); + super->mppe = be16_to_cpu(super->active->max_primary_element_entries); + super->conf_rec_len = be16_to_cpu(super->active->config_record_len); + return 0; +} + +#define DDF_UNUSED_BVD 0xff +static int 
alloc_other_bvds(const struct ddf_super *ddf, struct vcl *vcl) +{ + unsigned int n_vds = vcl->conf.sec_elmnt_count - 1; + unsigned int i, vdsize; + void *p; + if (n_vds == 0) { + vcl->other_bvds = NULL; + return 0; + } + vdsize = ddf->conf_rec_len * 512; + if (posix_memalign(&p, 512, n_vds * + (vdsize + sizeof(struct vd_config *))) != 0) + return -1; + vcl->other_bvds = (struct vd_config **) (p + n_vds * vdsize); + for (i = 0; i < n_vds; i++) { + vcl->other_bvds[i] = p + i * vdsize; + memset(vcl->other_bvds[i], 0, vdsize); + vcl->other_bvds[i]->sec_elmnt_seq = DDF_UNUSED_BVD; + } + return 0; +} + +static void add_other_bvd(struct vcl *vcl, struct vd_config *vd, + unsigned int len) +{ + int i; + for (i = 0; i < vcl->conf.sec_elmnt_count-1; i++) + if (vcl->other_bvds[i]->sec_elmnt_seq == vd->sec_elmnt_seq) + break; + + if (i < vcl->conf.sec_elmnt_count-1) { + if (be32_to_cpu(vd->seqnum) <= + be32_to_cpu(vcl->other_bvds[i]->seqnum)) + return; + } else { + for (i = 0; i < vcl->conf.sec_elmnt_count-1; i++) + if (vcl->other_bvds[i]->sec_elmnt_seq == DDF_UNUSED_BVD) + break; + if (i == vcl->conf.sec_elmnt_count-1) { + pr_err("no space for sec level config %u, count is %u\n", + vd->sec_elmnt_seq, vcl->conf.sec_elmnt_count); + return; + } + } + memcpy(vcl->other_bvds[i], vd, len); +} + +static int load_ddf_local(int fd, struct ddf_super *super, + char *devname, int keep) +{ + struct dl *dl; + struct stat stb; + char *conf; + unsigned int i; + unsigned int confsec; + int vnum; + unsigned int max_virt_disks = + be16_to_cpu(super->active->max_vd_entries); + unsigned long long dsize; + + /* First the local disk info */ + if (posix_memalign((void**)&dl, 512, + sizeof(*dl) + + (super->max_part) * sizeof(dl->vlist[0])) != 0) { + pr_err("could not allocate disk info buffer\n"); + return 1; + } + + load_section(fd, super, &dl->disk, + super->active->data_section_offset, + super->active->data_section_length, + 0); + dl->devname = devname ? 
xstrdup(devname) : NULL; + + fstat(fd, &stb); + dl->major = major(stb.st_rdev); + dl->minor = minor(stb.st_rdev); + dl->next = super->dlist; + dl->fd = keep ? fd : -1; + + dl->size = 0; + if (get_dev_size(fd, devname, &dsize)) + dl->size = dsize >> 9; + /* If the disks have different sizes, the LBAs will differ + * between phys disks. + * At this point here, the values in super->active must be valid + * for this phys disk. */ + dl->primary_lba = super->active->primary_lba; + dl->secondary_lba = super->active->secondary_lba; + dl->workspace_lba = super->active->workspace_lba; + dl->spare = NULL; + for (i = 0 ; i < super->max_part ; i++) + dl->vlist[i] = NULL; + super->dlist = dl; + dl->pdnum = -1; + for (i = 0; i < be16_to_cpu(super->active->max_pd_entries); i++) + if (memcmp(super->phys->entries[i].guid, + dl->disk.guid, DDF_GUID_LEN) == 0) + dl->pdnum = i; + + /* Now the config list. */ + /* 'conf' is an array of config entries, some of which are + * probably invalid. Those which are good need to be copied into + * the conflist + */ + + conf = load_section(fd, super, super->conf, + super->active->config_section_offset, + super->active->config_section_length, + 0); + super->conf = conf; + vnum = 0; + for (confsec = 0; + confsec < be32_to_cpu(super->active->config_section_length); + confsec += super->conf_rec_len) { + struct vd_config *vd = + (struct vd_config *)((char*)conf + confsec*512); + struct vcl *vcl; + + if (be32_eq(vd->magic, DDF_SPARE_ASSIGN_MAGIC)) { + if (dl->spare) + continue; + if (posix_memalign((void**)&dl->spare, 512, + super->conf_rec_len*512) != 0) { + pr_err("could not allocate spare info buf\n"); + return 1; + } + + memcpy(dl->spare, vd, super->conf_rec_len*512); + continue; + } + if (!be32_eq(vd->magic, DDF_VD_CONF_MAGIC)) + /* Must be vendor-unique - I cannot handle those */ + continue; + + for (vcl = super->conflist; vcl; vcl = vcl->next) { + if (memcmp(vcl->conf.guid, + vd->guid, DDF_GUID_LEN) == 0) + break; + } + + if (vcl) { + 
dl->vlist[vnum++] = vcl; + if (vcl->other_bvds != NULL && + vcl->conf.sec_elmnt_seq != vd->sec_elmnt_seq) { + add_other_bvd(vcl, vd, super->conf_rec_len*512); + continue; + } + if (be32_to_cpu(vd->seqnum) <= + be32_to_cpu(vcl->conf.seqnum)) + continue; + } else { + if (posix_memalign((void**)&vcl, 512, + (super->conf_rec_len*512 + + offsetof(struct vcl, conf))) != 0) { + pr_err("could not allocate vcl buf\n"); + return 1; + } + vcl->next = super->conflist; + vcl->block_sizes = NULL; /* FIXME not for CONCAT */ + vcl->conf.sec_elmnt_count = vd->sec_elmnt_count; + if (alloc_other_bvds(super, vcl) != 0) { + pr_err("could not allocate other bvds\n"); + free(vcl); + return 1; + }; + super->conflist = vcl; + dl->vlist[vnum++] = vcl; + } + memcpy(&vcl->conf, vd, super->conf_rec_len*512); + for (i=0; i < max_virt_disks ; i++) + if (memcmp(super->virt->entries[i].guid, + vcl->conf.guid, DDF_GUID_LEN)==0) + break; + if (i < max_virt_disks) + vcl->vcnum = i; + } + + return 0; +} + +static int load_super_ddf(struct supertype *st, int fd, + char *devname) +{ + unsigned long long dsize; + struct ddf_super *super; + int rv; + + if (get_dev_size(fd, devname, &dsize) == 0) + return 1; + + if (test_partition(fd)) + /* DDF is not allowed on partitions */ + return 1; + + /* 32M is a lower bound */ + if (dsize <= 32*1024*1024) { + if (devname) + pr_err("%s is too small for ddf: size is %llu sectors.\n", + devname, dsize>>9); + return 1; + } + if (dsize & 511) { + if (devname) + pr_err("%s is an odd size for ddf: size is %llu bytes.\n", + devname, dsize); + return 1; + } + + free_super_ddf(st); + + if (posix_memalign((void**)&super, 512, sizeof(*super))!= 0) { + pr_err("malloc of %zu failed.\n", + sizeof(*super)); + return 1; + } + memset(super, 0, sizeof(*super)); + + rv = load_ddf_headers(fd, super, devname); + if (rv) { + free(super); + return rv; + } + + /* Have valid headers and have chosen the best. 
Let's read in the rest*/ + + rv = load_ddf_global(fd, super, devname); + + if (rv) { + if (devname) + pr_err("Failed to load all information sections on %s\n", devname); + free(super); + return rv; + } + + rv = load_ddf_local(fd, super, devname, 0); + + if (rv) { + if (devname) + pr_err("Failed to load all information sections on %s\n", devname); + free(super); + return rv; + } + + /* Should possibly check the sections .... */ + + st->sb = super; + if (st->ss == NULL) { + st->ss = &super_ddf; + st->minor_version = 0; + st->max_devs = 512; + } + return 0; + +} + +static void free_super_ddf(struct supertype *st) +{ + struct ddf_super *ddf = st->sb; + if (ddf == NULL) + return; + free(ddf->phys); + free(ddf->virt); + free(ddf->conf); + while (ddf->conflist) { + struct vcl *v = ddf->conflist; + ddf->conflist = v->next; + if (v->block_sizes) + free(v->block_sizes); + if (v->other_bvds) + /* + v->other_bvds[0] points to beginning of buffer, + see alloc_other_bvds() + */ + free(v->other_bvds[0]); + free(v); + } + while (ddf->dlist) { + struct dl *d = ddf->dlist; + ddf->dlist = d->next; + if (d->fd >= 0) + close(d->fd); + if (d->spare) + free(d->spare); + free(d); + } + while (ddf->add_list) { + struct dl *d = ddf->add_list; + ddf->add_list = d->next; + if (d->fd >= 0) + close(d->fd); + if (d->spare) + free(d->spare); + free(d); + } + free(ddf); + st->sb = NULL; +} + +static struct supertype *match_metadata_desc_ddf(char *arg) +{ + /* 'ddf' only supports containers */ + struct supertype *st; + if (strcmp(arg, "ddf") != 0 && + strcmp(arg, "default") != 0 + ) + return NULL; + + st = xcalloc(1, sizeof(*st)); + st->ss = &super_ddf; + st->max_devs = 512; + st->minor_version = 0; + st->sb = NULL; + return st; +} + +#ifndef MDASSEMBLE + +static mapping_t ddf_state[] = { + { "Optimal", 0}, + { "Degraded", 1}, + { "Deleted", 2}, + { "Missing", 3}, + { "Failed", 4}, + { "Partially Optimal", 5}, + { "-reserved-", 6}, + { "-reserved-", 7}, + { NULL, 0} +}; + +static mapping_t 
ddf_init_state[] = { + { "Not Initialised", 0}, + { "QuickInit in Progress", 1}, + { "Fully Initialised", 2}, + { "*UNKNOWN*", 3}, + { NULL, 0} +}; +static mapping_t ddf_access[] = { + { "Read/Write", 0}, + { "Reserved", 1}, + { "Read Only", 2}, + { "Blocked (no access)", 3}, + { NULL ,0} +}; + +static mapping_t ddf_level[] = { + { "RAID0", DDF_RAID0}, + { "RAID1", DDF_RAID1}, + { "RAID3", DDF_RAID3}, + { "RAID4", DDF_RAID4}, + { "RAID5", DDF_RAID5}, + { "RAID1E",DDF_RAID1E}, + { "JBOD", DDF_JBOD}, + { "CONCAT",DDF_CONCAT}, + { "RAID5E",DDF_RAID5E}, + { "RAID5EE",DDF_RAID5EE}, + { "RAID6", DDF_RAID6}, + { NULL, 0} +}; +static mapping_t ddf_sec_level[] = { + { "Striped", DDF_2STRIPED}, + { "Mirrored", DDF_2MIRRORED}, + { "Concat", DDF_2CONCAT}, + { "Spanned", DDF_2SPANNED}, + { NULL, 0} +}; +#endif + +static int all_ff(const char *guid) +{ + int i; + for (i = 0; i < DDF_GUID_LEN; i++) + if (guid[i] != (char)0xff) + return 0; + return 1; +} + +static const char *guid_str(const char *guid) +{ + static char buf[DDF_GUID_LEN*2+1]; + int i; + char *p = buf; + for (i = 0; i < DDF_GUID_LEN; i++) { + unsigned char c = guid[i]; + if (c >= 32 && c < 127) + p += sprintf(p, "%c", c); + else + p += sprintf(p, "%02x", c); + } + *p = '\0'; + return (const char *) buf; +} + +#ifndef MDASSEMBLE +static void print_guid(char *guid, int tstamp) +{ + /* A GUIDs are part (or all) ASCII and part binary. + * They tend to be space padded. 
+ * We print the GUID in HEX, then in parentheses add + * any initial ASCII sequence, and a possible + * time stamp from bytes 16-19 + */ + int l = DDF_GUID_LEN; + int i; + + for (i=0 ; i= 0x20 && guid[i] < 0x7f) + fputc(guid[i], stdout); + else + break; + } + if (tstamp) { + time_t then = __be32_to_cpu(*(__u32*)(guid+16)) + DECADE; + char tbuf[100]; + struct tm *tm; + tm = localtime(&then); + strftime(tbuf, 100, " %D %T",tm); + fputs(tbuf, stdout); + } + printf(")"); +} + +static void examine_vd(int n, struct ddf_super *sb, char *guid) +{ + int crl = sb->conf_rec_len; + struct vcl *vcl; + + for (vcl = sb->conflist ; vcl ; vcl = vcl->next) { + unsigned int i; + struct vd_config *vc = &vcl->conf; + + if (!be32_eq(calc_crc(vc, crl*512), vc->crc)) + continue; + if (memcmp(vc->guid, guid, DDF_GUID_LEN) != 0) + continue; + + /* Ok, we know about this VD, let's give more details */ + printf(" Raid Devices[%d] : %d (", n, + be16_to_cpu(vc->prim_elmnt_count)); + for (i = 0; i < be16_to_cpu(vc->prim_elmnt_count); i++) { + int j; + int cnt = be16_to_cpu(sb->phys->max_pdes); + for (j=0; jphys_refnum[i], + sb->phys->entries[j].refnum)) + break; + if (i) printf(" "); + if (j < cnt) + printf("%d", j); + else + printf("--"); + printf("@%lluK", (unsigned long long) be64_to_cpu(LBA_OFFSET(sb, vc)[i])/2); + } + printf(")\n"); + if (vc->chunk_shift != 255) + printf(" Chunk Size[%d] : %d sectors\n", n, + 1 << vc->chunk_shift); + printf(" Raid Level[%d] : %s\n", n, + map_num(ddf_level, vc->prl)?:"-unknown-"); + if (vc->sec_elmnt_count != 1) { + printf(" Secondary Position[%d] : %d of %d\n", n, + vc->sec_elmnt_seq, vc->sec_elmnt_count); + printf(" Secondary Level[%d] : %s\n", n, + map_num(ddf_sec_level, vc->srl) ?: "-unknown-"); + } + printf(" Device Size[%d] : %llu\n", n, + be64_to_cpu(vc->blocks)/2); + printf(" Array Size[%d] : %llu\n", n, + be64_to_cpu(vc->array_blocks)/2); + } +} + +static void examine_vds(struct ddf_super *sb) +{ + int cnt = be16_to_cpu(sb->virt->populated_vdes); + 
unsigned int i; + printf(" Virtual Disks : %d\n", cnt); + + for (i = 0; i < be16_to_cpu(sb->virt->max_vdes); i++) { + struct virtual_entry *ve = &sb->virt->entries[i]; + if (all_ff(ve->guid)) + continue; + printf("\n"); + printf(" VD GUID[%d] : ", i); print_guid(ve->guid, 1); + printf("\n"); + printf(" unit[%d] : %d\n", i, be16_to_cpu(ve->unit)); + printf(" state[%d] : %s, %s%s\n", i, + map_num(ddf_state, ve->state & 7), + (ve->state & DDF_state_morphing) ? "Morphing, ": "", + (ve->state & DDF_state_inconsistent)? "Not Consistent" : "Consistent"); + printf(" init state[%d] : %s\n", i, + map_num(ddf_init_state, ve->init_state&DDF_initstate_mask)); + printf(" access[%d] : %s\n", i, + map_num(ddf_access, (ve->init_state & DDF_access_mask) >> 6)); + printf(" Name[%d] : %.16s\n", i, ve->name); + examine_vd(i, sb, ve->guid); + } + if (cnt) printf("\n"); +} + +static void examine_pds(struct ddf_super *sb) +{ + int cnt = be16_to_cpu(sb->phys->max_pdes); + int i; + struct dl *dl; + int unlisted = 0; + printf(" Physical Disks : %d\n", cnt); + printf(" Number RefNo Size Device Type/State\n"); + + for (dl = sb->dlist; dl; dl = dl->next) + dl->displayed = 0; + + for (i=0 ; iphys->entries[i]; + int type = be16_to_cpu(pd->type); + int state = be16_to_cpu(pd->state); + + if (be32_to_cpu(pd->refnum) == 0xffffffff) + /* Not in use */ + continue; + //printf(" PD GUID[%d] : ", i); print_guid(pd->guid, 0); + //printf("\n"); + printf(" %3d %08x ", i, + be32_to_cpu(pd->refnum)); + printf("%8lluK ", + be64_to_cpu(pd->config_size)>>1); + for (dl = sb->dlist; dl ; dl = dl->next) { + if (be32_eq(dl->disk.refnum, pd->refnum)) { + char *dv = map_dev(dl->major, dl->minor, 0); + if (dv) { + printf("%-15s", dv); + break; + } + } + } + if (!dl) + printf("%15s",""); + else + dl->displayed = 1; + printf(" %s%s%s%s%s", + (type&2) ? "active":"", + (type&4) ? "Global-Spare":"", + (type&8) ? "spare" : "", + (type&16)? ", foreign" : "", + (type&32)? 
"pass-through" : ""); + if (state & DDF_Failed) + /* This over-rides these three */ + state &= ~(DDF_Online|DDF_Rebuilding|DDF_Transition); + printf("/%s%s%s%s%s%s%s", + (state&1)? "Online": "Offline", + (state&2)? ", Failed": "", + (state&4)? ", Rebuilding": "", + (state&8)? ", in-transition": "", + (state&16)? ", SMART-errors": "", + (state&32)? ", Unrecovered-Read-Errors": "", + (state&64)? ", Missing" : ""); + printf("\n"); + } + for (dl = sb->dlist; dl; dl = dl->next) { + char *dv; + if (dl->displayed) + continue; + if (!unlisted) + printf(" Physical disks not in metadata!:\n"); + unlisted = 1; + dv = map_dev(dl->major, dl->minor, 0); + printf(" %08x %s\n", be32_to_cpu(dl->disk.refnum), + dv ? dv : "-unknown-"); + } + if (unlisted) + printf("\n"); +} + +static void examine_super_ddf(struct supertype *st, char *homehost) +{ + struct ddf_super *sb = st->sb; + + printf(" Magic : %08x\n", be32_to_cpu(sb->anchor.magic)); + printf(" Version : %.8s\n", sb->anchor.revision); + printf("Controller GUID : "); print_guid(sb->controller.guid, 0); + printf("\n"); + printf(" Container GUID : "); print_guid(sb->anchor.guid, 1); + printf("\n"); + printf(" Seq : %08x\n", be32_to_cpu(sb->active->seq)); + printf(" Redundant hdr : %s\n", (be32_eq(sb->secondary.magic, + DDF_HEADER_MAGIC) + ?"yes" : "no")); + examine_vds(sb); + examine_pds(sb); +} + +static unsigned int get_vd_num_of_subarray(struct supertype *st) +{ + /* + * Figure out the VD number for this supertype. + * Returns DDF_CONTAINER for the container itself, + * and DDF_NOTFOUND on error. 
+ */ + struct ddf_super *ddf = st->sb; + struct mdinfo *sra; + char *sub, *end; + unsigned int vcnum; + + if (*st->container_devnm == '\0') + return DDF_CONTAINER; + + sra = sysfs_read(-1, st->devnm, GET_VERSION); + if (!sra || sra->array.major_version != -1 || + sra->array.minor_version != -2 || + !is_subarray(sra->text_version)) + return DDF_NOTFOUND; + + sub = strchr(sra->text_version + 1, '/'); + if (sub != NULL) + vcnum = strtoul(sub + 1, &end, 10); + if (sub == NULL || *sub == '\0' || *end != '\0' || + vcnum >= be16_to_cpu(ddf->active->max_vd_entries)) + return DDF_NOTFOUND; + + return vcnum; +} + +static void brief_examine_super_ddf(struct supertype *st, int verbose) +{ + /* We just write a generic DDF ARRAY entry + */ + struct mdinfo info; + char nbuf[64]; + getinfo_super_ddf(st, &info, NULL); + fname_from_uuid(st, &info, nbuf, ':'); + + printf("ARRAY metadata=ddf UUID=%s\n", nbuf + 5); +} + +static void brief_examine_subarrays_ddf(struct supertype *st, int verbose) +{ + /* We write a DDF ARRAY member entry for each vd, identifying container + * by uuid and member by unit number and uuid. + */ + struct ddf_super *ddf = st->sb; + struct mdinfo info; + unsigned int i; + char nbuf[64]; + getinfo_super_ddf(st, &info, NULL); + fname_from_uuid(st, &info, nbuf, ':'); + + for (i = 0; i < be16_to_cpu(ddf->virt->max_vdes); i++) { + struct virtual_entry *ve = &ddf->virt->entries[i]; + struct vcl vcl; + char nbuf1[64]; + char namebuf[17]; + if (all_ff(ve->guid)) + continue; + memcpy(vcl.conf.guid, ve->guid, DDF_GUID_LEN); + ddf->currentconf =&vcl; + vcl.vcnum = i; + uuid_from_super_ddf(st, info.uuid); + fname_from_uuid(st, &info, nbuf1, ':'); + _ddf_array_name(namebuf, ddf, i); + printf("ARRAY%s%s container=%s member=%d UUID=%s\n", + namebuf[0] == '\0' ? 
"" : " /dev/md/", namebuf, + nbuf+5, i, nbuf1+5); + } +} + +static void export_examine_super_ddf(struct supertype *st) +{ + struct mdinfo info; + char nbuf[64]; + getinfo_super_ddf(st, &info, NULL); + fname_from_uuid(st, &info, nbuf, ':'); + printf("MD_METADATA=ddf\n"); + printf("MD_LEVEL=container\n"); + printf("MD_UUID=%s\n", nbuf+5); + printf("MD_DEVICES=%u\n", + be16_to_cpu(((struct ddf_super *)st->sb)->phys->used_pdes)); +} + +static int copy_metadata_ddf(struct supertype *st, int from, int to) +{ + void *buf; + unsigned long long dsize, offset; + int bytes; + struct ddf_header *ddf; + int written = 0; + + /* The meta consists of an anchor, a primary, and a secondary. + * This all lives at the end of the device. + * So it is easiest to find the earliest of primary and + * secondary, and copy everything from there. + * + * Anchor is 512 from end. It contains primary_lba and secondary_lba + * we choose one of those + */ + + if (posix_memalign(&buf, 4096, 4096) != 0) + return 1; + + if (!get_dev_size(from, NULL, &dsize)) + goto err; + + if (lseek64(from, dsize-512, 0) < 0) + goto err; + if (read(from, buf, 512) != 512) + goto err; + ddf = buf; + if (!be32_eq(ddf->magic, DDF_HEADER_MAGIC) || + !be32_eq(calc_crc(ddf, 512), ddf->crc) || + (memcmp(ddf->revision, DDF_REVISION_0, 8) != 0 && + memcmp(ddf->revision, DDF_REVISION_2, 8) != 0)) + goto err; + + offset = dsize - 512; + if ((be64_to_cpu(ddf->primary_lba) << 9) < offset) + offset = be64_to_cpu(ddf->primary_lba) << 9; + if ((be64_to_cpu(ddf->secondary_lba) << 9) < offset) + offset = be64_to_cpu(ddf->secondary_lba) << 9; + + bytes = dsize - offset; + + if (lseek64(from, offset, 0) < 0 || + lseek64(to, offset, 0) < 0) + goto err; + while (written < bytes) { + int n = bytes - written; + if (n > 4096) + n = 4096; + if (read(from, buf, n) != n) + goto err; + if (write(to, buf, n) != n) + goto err; + written += n; + } + free(buf); + return 0; +err: + free(buf); + return 1; +} + +static void detail_super_ddf(struct 
supertype *st, char *homehost) +{ + struct ddf_super *sb = st->sb; + int cnt = be16_to_cpu(sb->virt->populated_vdes); + + printf(" Container GUID : "); print_guid(sb->anchor.guid, 1); + printf("\n"); + printf(" Seq : %08x\n", be32_to_cpu(sb->active->seq)); + printf(" Virtual Disks : %d\n", cnt); + printf("\n"); +} +#endif + +static const char *vendors_with_variable_volume_UUID[] = { + "LSI ", +}; + +static int volume_id_is_reliable(const struct ddf_super *ddf) +{ + int n = ARRAY_SIZE(vendors_with_variable_volume_UUID); + int i; + for (i = 0; i < n; i++) + if (!memcmp(ddf->controller.guid, + vendors_with_variable_volume_UUID[i], 8)) + return 0; + return 1; +} + +static void uuid_of_ddf_subarray(const struct ddf_super *ddf, + unsigned int vcnum, int uuid[4]) +{ + char buf[DDF_GUID_LEN+18], sha[20], *p; + struct sha1_ctx ctx; + if (volume_id_is_reliable(ddf)) { + uuid_from_ddf_guid(ddf->virt->entries[vcnum].guid, uuid); + return; + } + /* + * Some fake RAID BIOSes (in particular, LSI ones) change the + * VD GUID at every boot. These GUIDs are not suitable for + * identifying an array. Luckily the header GUID appears to + * remain constant. + * We construct a pseudo-UUID from the header GUID and those + * properties of the subarray that we expect to remain constant. 
+ */ + memset(buf, 0, sizeof(buf)); + p = buf; + memcpy(p, ddf->anchor.guid, DDF_GUID_LEN); + p += DDF_GUID_LEN; + memcpy(p, ddf->virt->entries[vcnum].name, 16); + p += 16; + *((__u16 *) p) = vcnum; + sha1_init_ctx(&ctx); + sha1_process_bytes(buf, sizeof(buf), &ctx); + sha1_finish_ctx(&ctx, sha); + memcpy(uuid, sha, 4*4); +} + +#ifndef MDASSEMBLE +static void brief_detail_super_ddf(struct supertype *st) +{ + struct mdinfo info; + char nbuf[64]; + struct ddf_super *ddf = st->sb; + unsigned int vcnum = get_vd_num_of_subarray(st); + if (vcnum == DDF_CONTAINER) + uuid_from_super_ddf(st, info.uuid); + else if (vcnum == DDF_NOTFOUND) + return; + else + uuid_of_ddf_subarray(ddf, vcnum, info.uuid); + fname_from_uuid(st, &info, nbuf,':'); + printf(" UUID=%s", nbuf + 5); +} +#endif + +static int match_home_ddf(struct supertype *st, char *homehost) +{ + /* It matches 'this' host if the controller is a + * Linux-MD controller with vendor_data matching + * the hostname. It would be nice if we could + * test against controller found in /sys or somewhere... + */ + struct ddf_super *ddf = st->sb; + unsigned int len; + + if (!homehost) + return 0; + len = strlen(homehost); + + return (memcmp(ddf->controller.guid, T10, 8) == 0 && + len < sizeof(ddf->controller.vendor_data) && + memcmp(ddf->controller.vendor_data, homehost,len) == 0 && + ddf->controller.vendor_data[len] == 0); +} + +#ifndef MDASSEMBLE +static int find_index_in_bvd(const struct ddf_super *ddf, + const struct vd_config *conf, unsigned int n, + unsigned int *n_bvd) +{ + /* + * Find the index of the n-th valid physical disk in this BVD. + * Unused entries can be sprinkled in with the used entries, + * but don't count. 
+ */ + unsigned int i, j; + for (i = 0, j = 0; + i < ddf->mppe && j < be16_to_cpu(conf->prim_elmnt_count); + i++) { + if (be32_to_cpu(conf->phys_refnum[i]) != 0xffffffff) { + if (n == j) { + *n_bvd = i; + return 1; + } + j++; + } + } + dprintf("couldn't find BVD member %u (total %u)\n", + n, be16_to_cpu(conf->prim_elmnt_count)); + return 0; +} + +/* Given a member array instance number, and a raid disk within that instance, + * find the vd_config structure. The offset of the given disk in the phys_refnum + * table is returned in n_bvd. + * For two-level members with a secondary raid level the vd_config for + * the appropriate BVD is returned: 'n' is split into a BVD sequence number + * (n / prim_elmnt_count) and an index inside that BVD (n % prim_elmnt_count). + * On success the matching vcl is stored in *vcl and the vd_config of the + * selected BVD is returned (&vcl->conf, or one of vcl->other_bvds[]). + * Returns NULL, leaving *vcl untouched, if the disk cannot be found. + */ +static struct vd_config *find_vdcr(struct ddf_super *ddf, unsigned int inst, + unsigned int n, + unsigned int *n_bvd, struct vcl **vcl) +{ + struct vcl *v; + + for (v = ddf->conflist; v; v = v->next) { + unsigned int nsec, ibvd = 0; + struct vd_config *conf; + if (inst != v->vcnum) + continue; + conf = &v->conf; + if (conf->sec_elmnt_count == 1) { + if (find_index_in_bvd(ddf, conf, n, n_bvd)) { + *vcl = v; + return conf; + } else + goto bad; + } + if (v->other_bvds == NULL) { + pr_err("BUG: other_bvds is NULL, nsec=%u\n", + conf->sec_elmnt_count); + goto bad; + } + nsec = n / be16_to_cpu(conf->prim_elmnt_count); + if (conf->sec_elmnt_seq != nsec) { + for (ibvd = 1; ibvd < conf->sec_elmnt_count; ibvd++) { + if (v->other_bvds[ibvd-1]->sec_elmnt_seq + == nsec) + break; + } + if (ibvd == conf->sec_elmnt_count) + goto bad; + conf = v->other_bvds[ibvd-1]; + } + /* Index within the selected BVD: skip the members of the + * 'nsec' preceding BVDs, each holding prim_elmnt_count disks. + */ + if (!find_index_in_bvd(ddf, conf, + n - nsec*be16_to_cpu(conf->prim_elmnt_count), + n_bvd)) + goto bad; + dprintf("found disk %u as member %u in bvd %d of array %u\n", + n, *n_bvd, ibvd, inst); + *vcl = v; + return conf; + } +bad: + pr_err("Couldn't find disk %u in array %u\n", n, inst); + return NULL; +} +#endif + +static int find_phys(const struct ddf_super *ddf, be32 phys_refnum) +{ + /* Find the entry 
in phys_disk which has the given refnum + * and return it's index + */ + unsigned int i; + for (i = 0; i < be16_to_cpu(ddf->phys->max_pdes); i++) + if (be32_eq(ddf->phys->entries[i].refnum, phys_refnum)) + return i; + return -1; +} + +static void uuid_from_ddf_guid(const char *guid, int uuid[4]) +{ + char buf[20]; + struct sha1_ctx ctx; + sha1_init_ctx(&ctx); + sha1_process_bytes(guid, DDF_GUID_LEN, &ctx); + sha1_finish_ctx(&ctx, buf); + memcpy(uuid, buf, 4*4); +} + +static void uuid_from_super_ddf(struct supertype *st, int uuid[4]) +{ + /* The uuid returned here is used for: + * uuid to put into bitmap file (Create, Grow) + * uuid for backup header when saving critical section (Grow) + * comparing uuids when re-adding a device into an array + * In these cases the uuid required is that of the data-array, + * not the device-set. + * uuid to recognise same set when adding a missing device back + * to an array. This is a uuid for the device-set. + * + * For each of these we can make do with a truncated + * or hashed uuid rather than the original, as long as + * everyone agrees. + * In the case of SVD we assume the BVD is of interest, + * though that might be the case if a bitmap were made for + * a mirrored SVD - worry about that later. + * So we need to find the VD configuration record for the + * relevant BVD and extract the GUID and Secondary_Element_Seq. + * The first 16 bytes of the sha1 of these is used. 
+ */ + struct ddf_super *ddf = st->sb; + struct vcl *vcl = ddf->currentconf; + + if (vcl) + uuid_of_ddf_subarray(ddf, vcl->vcnum, uuid); + else + uuid_from_ddf_guid(ddf->anchor.guid, uuid); +} + +/* Fill 'info' for the container described by st->sb, or delegate to + * getinfo_super_ddf_bvd() when a member array (currentconf) is selected. + * If 'map' is non-NULL, map[i] is set for each of the first + * info->array.raid_disks (as passed in) slots whose physical disk entry + * is populated and not marked DDF_Failed. + */ +static void getinfo_super_ddf(struct supertype *st, struct mdinfo *info, char *map) +{ + struct ddf_super *ddf = st->sb; + int map_disks = info->array.raid_disks; + __u32 *cptr; + + if (ddf->currentconf) { + getinfo_super_ddf_bvd(st, info, map); + return; + } + memset(info, 0, sizeof(*info)); + + info->array.raid_disks = be16_to_cpu(ddf->phys->used_pdes); + info->array.level = LEVEL_CONTAINER; + info->array.layout = 0; + info->array.md_minor = -1; + cptr = (__u32 *)(ddf->anchor.guid + 16); + info->array.ctime = DECADE + __be32_to_cpu(*cptr); + + info->array.chunk_size = 0; + info->container_enough = 1; + + info->disk.major = 0; + info->disk.minor = 0; + if (ddf->dlist) { + struct phys_disk_entry *pde = NULL; + info->disk.number = be32_to_cpu(ddf->dlist->disk.refnum); + info->disk.raid_disk = find_phys(ddf, ddf->dlist->disk.refnum); + + /* find_phys() returns -1 when the refnum is unknown; only + * index the phys table with a valid slot. data_offset stays + * 0 (from the memset above) otherwise. + */ + if (info->disk.raid_disk >= 0) { + pde = ddf->phys->entries + info->disk.raid_disk; + info->data_offset = be64_to_cpu(pde->config_size); + } 
+ info->component_size = ddf->dlist->size - info->data_offset; + if (pde && + !(be16_to_cpu(pde->state) & DDF_Failed) && + !(be16_to_cpu(pde->state) & DDF_Missing)) + info->disk.state = (1 << MD_DISK_SYNC) | (1 << MD_DISK_ACTIVE); + else + info->disk.state = 1 << MD_DISK_FAULTY; + + } else { + /* There should always be a dlist, but just in case...*/ + info->disk.number = -1; + info->disk.raid_disk = -1; + info->disk.state = (1 << MD_DISK_SYNC) | (1 << MD_DISK_ACTIVE); + } + info->events = be32_to_cpu(ddf->active->seq); + info->array.utime = DECADE + be32_to_cpu(ddf->active->timestamp); + + info->recovery_start = MaxSector; + info->reshape_active = 0; + info->recovery_blocked = 0; + info->name[0] = 0; + + info->array.major_version = -1; + info->array.minor_version = -2; + strcpy(info->text_version, "ddf"); + info->safe_mode_delay = 0; + + uuid_from_super_ddf(st, info->uuid); + + if (map) { + int i, e = 0; + int max = be16_to_cpu(ddf->phys->max_pdes); + for (i = e = 0 ; i < map_disks ; i++, e++) { + while (e < max && + be32_to_cpu(ddf->phys->entries[e].refnum) == 0xffffffff) + e++; + if (i < info->array.raid_disks && e < max && + !(be16_to_cpu(ddf->phys->entries[e].state) + & DDF_Failed)) + map[i] = 1; + else + map[i] = 0; + } + } +} + +/* size of name must be at least 17 bytes! 
*/ +static void _ddf_array_name(char *name, const struct ddf_super *ddf, int i) +{ + int j; + memcpy(name, ddf->virt->entries[i].name, 16); + name[16] = 0; + for(j = 0; j < 16; j++) + if (name[j] == ' ') + name[j] = 0; +} + +static void getinfo_super_ddf_bvd(struct supertype *st, struct mdinfo *info, char *map) +{ + struct ddf_super *ddf = st->sb; + struct vcl *vc = ddf->currentconf; + int cd = ddf->currentdev; + int n_prim; + int j; + struct dl *dl = NULL; + int map_disks = info->array.raid_disks; + __u32 *cptr; + struct vd_config *conf; + + memset(info, 0, sizeof(*info)); + if (layout_ddf2md(&vc->conf, &info->array) == -1) + return; + info->array.md_minor = -1; + cptr = (__u32 *)(vc->conf.guid + 16); + info->array.ctime = DECADE + __be32_to_cpu(*cptr); + info->array.utime = DECADE + be32_to_cpu(vc->conf.timestamp); + info->array.chunk_size = 512 << vc->conf.chunk_shift; + info->custom_array_size = be64_to_cpu(vc->conf.array_blocks); + + conf = &vc->conf; + n_prim = be16_to_cpu(conf->prim_elmnt_count); + if (conf->sec_elmnt_count > 1 && cd >= n_prim) { + int ibvd = cd / n_prim - 1; + cd %= n_prim; + conf = vc->other_bvds[ibvd]; + } + + if (cd >= 0 && (unsigned)cd < ddf->mppe) { + info->data_offset = + be64_to_cpu(LBA_OFFSET(ddf, conf)[cd]); + if (vc->block_sizes) + info->component_size = vc->block_sizes[cd]; + else + info->component_size = be64_to_cpu(conf->blocks); + + for (dl = ddf->dlist; dl ; dl = dl->next) + if (be32_eq(dl->disk.refnum, conf->phys_refnum[cd])) + break; + } + + info->disk.major = 0; + info->disk.minor = 0; + info->disk.state = 0; + if (dl && dl->pdnum >= 0) { + info->disk.major = dl->major; + info->disk.minor = dl->minor; + info->disk.raid_disk = cd + conf->sec_elmnt_seq + * be16_to_cpu(conf->prim_elmnt_count); + info->disk.number = dl->pdnum; + info->disk.state = 0; + if (info->disk.number >= 0 && + (be16_to_cpu(ddf->phys->entries[info->disk.number].state) & DDF_Online) && + !(be16_to_cpu(ddf->phys->entries[info->disk.number].state) & 
DDF_Failed)) + info->disk.state = (1<<MD_DISK_SYNC) | (1<<MD_DISK_ACTIVE); + info->events = be32_to_cpu(ddf->active->seq); + } + + info->container_member = ddf->currentconf->vcnum; + + info->recovery_start = MaxSector; + info->resync_start = 0; + info->reshape_active = 0; + info->recovery_blocked = 0; + if (!(ddf->virt->entries[info->container_member].state + & DDF_state_inconsistent) && + (ddf->virt->entries[info->container_member].init_state + & DDF_initstate_mask) + == DDF_init_full) + info->resync_start = MaxSector; + + uuid_from_super_ddf(st, info->uuid); + + info->array.major_version = -1; + info->array.minor_version = -2; + sprintf(info->text_version, "/%s/%d", + st->container_devnm, + info->container_member); + info->safe_mode_delay = DDF_SAFE_MODE_DELAY; + + _ddf_array_name(info->name, ddf, info->container_member); + + /* map[] has map_disks slots, one per raid disk 'j'; 'i' is only + * the index of that disk in the phys table and must not be used + * to index map[]. + */ + if (map) + for (j = 0; j < map_disks; j++) { + map[j] = 0; + if (j < info->array.raid_disks) { + int i = find_phys(ddf, vc->conf.phys_refnum[j]); + if (i >= 0 && + (be16_to_cpu(ddf->phys->entries[i].state) + & DDF_Online) && + !(be16_to_cpu(ddf->phys->entries[i].state) + & DDF_Failed)) + map[j] = 1; + } + } +} + +static int update_super_ddf(struct supertype *st, struct mdinfo *info, + char *update, + char *devname, int verbose, + int uuid_set, char *homehost) +{ + /* For 'assemble' and 'force' we need to return non-zero if any + * change was made. For others, the return value is ignored. + * Update options are: + * force-one : This device looks a bit old but needs to be included, + * update age info appropriately. + * assemble: clear any 'faulty' flag to allow this device to + * be assembled. + * force-array: Array is degraded but being forced, mark it clean + * if that will be needed to assemble it. + * + * newdev: not used ???? + * grow: Array has gained a new device - this is currently for + * linear only + * resync: mark as dirty so a resync will happen. 
+ * uuid: Change the uuid of the array to match what is given + * homehost: update the recorded homehost + * name: update the name - preserving the homehost + * _reshape_progress: record new reshape_progress position. + * + * Following are not relevant for this version: + * sparc2.2 : update from old dodgey metadata + * super-minor: change the preferred_minor number + * summaries: update redundant counters. + */ + int rv = 0; +// struct ddf_super *ddf = st->sb; +// struct vd_config *vd = find_vdcr(ddf, info->container_member); +// struct virtual_entry *ve = find_ve(ddf); + + /* we don't need to handle "force-*" or "assemble" as + * there is no need to 'trick' the kernel. When the metadata is + * first updated to activate the array, all the implied modifications + * will just happen. + */ + + if (strcmp(update, "grow") == 0) { + /* FIXME */ + } else if (strcmp(update, "resync") == 0) { +// info->resync_checkpoint = 0; + } else if (strcmp(update, "homehost") == 0) { + /* homehost is stored in controller->vendor_data, + * or it is when we are the vendor + */ +// if (info->vendor_is_local) +// strcpy(ddf->controller.vendor_data, homehost); + rv = -1; + } else if (strcmp(update, "name") == 0) { + /* name is stored in virtual_entry->name */ +// memset(ve->name, ' ', 16); +// strncpy(ve->name, info->name, 16); + rv = -1; + } else if (strcmp(update, "_reshape_progress") == 0) { + /* We don't support reshape yet */ + } else if (strcmp(update, "assemble") == 0 ) { + /* Do nothing, just succeed */ + rv = 0; + } else + rv = -1; + +// update_all_csum(ddf); + + return rv; +} + +static void make_header_guid(char *guid) +{ + be32 stamp; + /* Create a DDF Header of Virtual Disk GUID */ + + /* 24 bytes of fiction required. + * first 8 are a 'vendor-id' - "Linux-MD" + * next 8 are controller type.. 
how about 0X DEAD BEEF 0000 0000 + * Remaining 8 random number plus timestamp + */ + memcpy(guid, T10, sizeof(T10)); + stamp = cpu_to_be32(0xdeadbeef); + memcpy(guid+8, &stamp, 4); + stamp = cpu_to_be32(0); + memcpy(guid+12, &stamp, 4); + stamp = cpu_to_be32(time(0) - DECADE); + memcpy(guid+16, &stamp, 4); + stamp._v32 = random32(); + memcpy(guid+20, &stamp, 4); +} + +static unsigned int find_unused_vde(const struct ddf_super *ddf) +{ + unsigned int i; + for (i = 0; i < be16_to_cpu(ddf->virt->max_vdes); i++) { + if (all_ff(ddf->virt->entries[i].guid)) + return i; + } + return DDF_NOTFOUND; +} + +static unsigned int find_vde_by_name(const struct ddf_super *ddf, + const char *name) +{ + unsigned int i; + if (name == NULL) + return DDF_NOTFOUND; + for (i = 0; i < be16_to_cpu(ddf->virt->max_vdes); i++) { + if (all_ff(ddf->virt->entries[i].guid)) + continue; + if (!strncmp(name, ddf->virt->entries[i].name, + sizeof(ddf->virt->entries[i].name))) + return i; + } + return DDF_NOTFOUND; +} + +#ifndef MDASSEMBLE +static unsigned int find_vde_by_guid(const struct ddf_super *ddf, + const char *guid) +{ + unsigned int i; + if (guid == NULL || all_ff(guid)) + return DDF_NOTFOUND; + for (i = 0; i < be16_to_cpu(ddf->virt->max_vdes); i++) + if (!memcmp(ddf->virt->entries[i].guid, guid, DDF_GUID_LEN)) + return i; + return DDF_NOTFOUND; +} +#endif + +static int init_super_ddf(struct supertype *st, + mdu_array_info_t *info, + unsigned long long size, char *name, char *homehost, + int *uuid, unsigned long long data_offset) +{ + /* This is primarily called by Create when creating a new array. + * We will then get add_to_super called for each component, and then + * write_init_super called to write it out to each device. + * For DDF, Create can create on fresh devices or on a pre-existing + * array. + * To create on a pre-existing array a different method will be called. + * This one is just for fresh drives. 
+ * + * We need to create the entire 'ddf' structure which includes: + * DDF headers - these are easy. + * Controller data - a Sector describing this controller .. not that + * this is a controller exactly. + * Physical Disk Record - one entry per device, so + * leave plenty of space. + * Virtual Disk Records - again, just leave plenty of space. + * This just lists VDs, doesn't give details. + * Config records - describe the VDs that use this disk + * DiskData - describes 'this' device. + * BadBlockManagement - empty + * Diag Space - empty + * Vendor Logs - Could we put bitmaps here? + * + */ + struct ddf_super *ddf; + char hostname[17]; + int hostlen; + int max_phys_disks, max_virt_disks; + unsigned long long sector; + int clen; + int i; + int pdsize, vdsize; + struct phys_disk *pd; + struct virtual_disk *vd; + + if (st->sb) + return init_super_ddf_bvd(st, info, size, name, homehost, uuid, + data_offset); + + if (posix_memalign((void**)&ddf, 512, sizeof(*ddf)) != 0) { + pr_err("could not allocate superblock\n"); + return 0; + } + memset(ddf, 0, sizeof(*ddf)); + st->sb = ddf; + + if (info == NULL) { + /* zeroing superblock */ + return 0; + } + + /* At least 32MB *must* be reserved for the ddf. So let's just + * start 32MB from the end, and put the primary header there. + * Don't do secondary for now. + * We don't know exactly where that will be yet as it could be + * different on each device. So just set up the lengths. + */ + + ddf->anchor.magic = DDF_HEADER_MAGIC; + make_header_guid(ddf->anchor.guid); + + memcpy(ddf->anchor.revision, DDF_REVISION_2, 8); + ddf->anchor.seq = cpu_to_be32(1); + ddf->anchor.timestamp = cpu_to_be32(time(0) - DECADE); + ddf->anchor.openflag = 0xFF; + ddf->anchor.foreignflag = 0; + ddf->anchor.enforcegroups = 0; /* Is this best?? 
*/ + ddf->anchor.pad0 = 0xff; + memset(ddf->anchor.pad1, 0xff, 12); + memset(ddf->anchor.header_ext, 0xff, 32); + ddf->anchor.primary_lba = cpu_to_be64(~(__u64)0); + ddf->anchor.secondary_lba = cpu_to_be64(~(__u64)0); + ddf->anchor.type = DDF_HEADER_ANCHOR; + memset(ddf->anchor.pad2, 0xff, 3); + ddf->anchor.workspace_len = cpu_to_be32(32768); /* Must be reserved */ + /* Put this at bottom of 32M reserved.. */ + ddf->anchor.workspace_lba = cpu_to_be64(~(__u64)0); + max_phys_disks = 1023; /* Should be enough, 4095 is also allowed */ + ddf->anchor.max_pd_entries = cpu_to_be16(max_phys_disks); + max_virt_disks = 255; /* 15, 63, 255, 1024, 4095 are all allowed */ + ddf->anchor.max_vd_entries = cpu_to_be16(max_virt_disks); + ddf->max_part = 64; + ddf->anchor.max_partitions = cpu_to_be16(ddf->max_part); + ddf->mppe = 256; /* 16, 64, 256, 1024, 4096 are all allowed */ + ddf->conf_rec_len = 1 + ROUND_UP(ddf->mppe * (4+8), 512)/512; + ddf->anchor.config_record_len = cpu_to_be16(ddf->conf_rec_len); + ddf->anchor.max_primary_element_entries = cpu_to_be16(ddf->mppe); + memset(ddf->anchor.pad3, 0xff, 54); + /* Controller section is one sector long immediately + * after the ddf header */ + sector = 1; + ddf->anchor.controller_section_offset = cpu_to_be32(sector); + ddf->anchor.controller_section_length = cpu_to_be32(1); + sector += 1; + + /* phys is 8 sectors after that */ + pdsize = ROUND_UP(sizeof(struct phys_disk) + + sizeof(struct phys_disk_entry)*max_phys_disks, + 512); + switch(pdsize/512) { + case 2: case 8: case 32: case 128: case 512: break; + default: abort(); + } + ddf->anchor.phys_section_offset = cpu_to_be32(sector); + ddf->anchor.phys_section_length = + cpu_to_be32(pdsize/512); /* max_primary_element_entries/8 */ + sector += pdsize/512; + + /* virt is another 32 sectors */ + vdsize = ROUND_UP(sizeof(struct virtual_disk) + + sizeof(struct virtual_entry) * max_virt_disks, + 512); + switch(vdsize/512) { + case 2: case 8: case 32: case 128: case 512: break; + default: 
abort(); + } + ddf->anchor.virt_section_offset = cpu_to_be32(sector); + ddf->anchor.virt_section_length = + cpu_to_be32(vdsize/512); /* max_vd_entries/8 */ + sector += vdsize/512; + + clen = ddf->conf_rec_len * (ddf->max_part+1); + ddf->anchor.config_section_offset = cpu_to_be32(sector); + ddf->anchor.config_section_length = cpu_to_be32(clen); + sector += clen; + + ddf->anchor.data_section_offset = cpu_to_be32(sector); + ddf->anchor.data_section_length = cpu_to_be32(1); + sector += 1; + + ddf->anchor.bbm_section_length = cpu_to_be32(0); + ddf->anchor.bbm_section_offset = cpu_to_be32(0xFFFFFFFF); + ddf->anchor.diag_space_length = cpu_to_be32(0); + ddf->anchor.diag_space_offset = cpu_to_be32(0xFFFFFFFF); + ddf->anchor.vendor_length = cpu_to_be32(0); + ddf->anchor.vendor_offset = cpu_to_be32(0xFFFFFFFF); + + memset(ddf->anchor.pad4, 0xff, 256); + + memcpy(&ddf->primary, &ddf->anchor, 512); + memcpy(&ddf->secondary, &ddf->anchor, 512); + + ddf->primary.openflag = 1; /* I guess.. */ + ddf->primary.type = DDF_HEADER_PRIMARY; + + ddf->secondary.openflag = 1; /* I guess.. */ + ddf->secondary.type = DDF_HEADER_SECONDARY; + + ddf->active = &ddf->primary; + + ddf->controller.magic = DDF_CONTROLLER_MAGIC; + + /* 24 more bytes of fiction required. + * first 8 are a 'vendor-id' - "Linux-MD" + * Remaining 16 are serial number.... maybe a hostname would do? 
+ */ + memcpy(ddf->controller.guid, T10, sizeof(T10)); + /* If the hostname cannot be read, use an empty one rather than + * measuring an unset buffer. + */ + if (gethostname(hostname, sizeof(hostname)) != 0) + hostname[0] = 0; + hostname[sizeof(hostname) - 1] = 0; + hostlen = strlen(hostname); + memcpy(ddf->controller.guid + 24 - hostlen, hostname, hostlen); + for (i = strlen(T10) ; i+hostlen < 24; i++) + ddf->controller.guid[i] = ' '; + + ddf->controller.type.vendor_id = cpu_to_be16(0xDEAD); + ddf->controller.type.device_id = cpu_to_be16(0xBEEF); + ddf->controller.type.sub_vendor_id = cpu_to_be16(0); + ddf->controller.type.sub_device_id = cpu_to_be16(0); + memcpy(ddf->controller.product_id, "What Is My PID??", 16); + memset(ddf->controller.pad, 0xff, 8); + memset(ddf->controller.vendor_data, 0xff, 448); + if (homehost && strlen(homehost) < 440) + strcpy((char*)ddf->controller.vendor_data, homehost); + + if (posix_memalign((void**)&pd, 512, pdsize) != 0) { + pr_err("could not allocate pd\n"); + return 0; + } + ddf->phys = pd; + ddf->pdsize = pdsize; + + memset(pd, 0xff, pdsize); + memset(pd, 0, sizeof(*pd)); + pd->magic = DDF_PHYS_RECORDS_MAGIC; + pd->used_pdes = cpu_to_be16(0); + pd->max_pdes = cpu_to_be16(max_phys_disks); + memset(pd->pad, 0xff, 52); + for (i = 0; i < max_phys_disks; i++) + memset(pd->entries[i].guid, 0xff, DDF_GUID_LEN); + + if (posix_memalign((void**)&vd, 512, vdsize) != 0) { + pr_err("could not allocate vd\n"); + return 0; + } + ddf->virt = vd; + ddf->vdsize = vdsize; + memset(vd, 0, vdsize); + vd->magic = DDF_VIRT_RECORDS_MAGIC; + vd->populated_vdes = cpu_to_be16(0); + vd->max_vdes = cpu_to_be16(max_virt_disks); + memset(vd->pad, 0xff, 52); + + /* Mark every virtual disk entry as unused (all 0xff) */ + for (i=0; i<max_virt_disks; i++) + memset(&vd->entries[i], 0xff, sizeof(struct virtual_entry)); + + st->sb = ddf; + ddf_set_updates_pending(ddf, NULL); + return 1; +} + +static int chunk_to_shift(int chunksize) +{ + return ffs(chunksize/512)-1; +} + +#ifndef MDASSEMBLE +struct extent { + unsigned long long start, size; +}; +static int cmp_extent(const void *av, const void *bv) +{ + const struct extent *a = av; + const struct extent *b = bv; + if (a->start < 
b->start) + return -1; + if (a->start > b->start) + return 1; + return 0; +} + +static struct extent *get_extents(struct ddf_super *ddf, struct dl *dl) +{ + /* Find a list of used extents on the given physical device + * (dnum) of the given ddf. + * Return a malloced array of 'struct extent' + */ + struct extent *rv; + int n = 0; + unsigned int i; + __u16 state; + + if (dl->pdnum < 0) + return NULL; + state = be16_to_cpu(ddf->phys->entries[dl->pdnum].state); + + if ((state & (DDF_Online|DDF_Failed|DDF_Missing)) != DDF_Online) + return NULL; + + rv = xmalloc(sizeof(struct extent) * (ddf->max_part + 2)); + + for (i = 0; i < ddf->max_part; i++) { + const struct vd_config *bvd; + unsigned int ibvd; + struct vcl *v = dl->vlist[i]; + if (v == NULL || + get_pd_index_from_refnum(v, dl->disk.refnum, ddf->mppe, + &bvd, &ibvd) == DDF_NOTFOUND) + continue; + rv[n].start = be64_to_cpu(LBA_OFFSET(ddf, bvd)[ibvd]); + rv[n].size = be64_to_cpu(bvd->blocks); + n++; + } + qsort(rv, n, sizeof(*rv), cmp_extent); + + rv[n].start = be64_to_cpu(ddf->phys->entries[dl->pdnum].config_size); + rv[n].size = 0; + return rv; +} + +static unsigned long long find_space( + struct ddf_super *ddf, struct dl *dl, + unsigned long long data_offset, + unsigned long long *size) +{ + /* Find if the requested amount of space is available. + * If it is, return start. + * If not, set *size to largest space. + * If data_offset != INVALID_SECTORS, then the space must start + * at this location. + */ + struct extent *e = get_extents(ddf, dl); + int i = 0; + unsigned long long pos = 0; + unsigned long long max_size = 0; + + if (!e) { + *size = 0; + return INVALID_SECTORS; + } + do { + unsigned long long esize = e[i].start - pos; + if (data_offset != INVALID_SECTORS && + pos <= data_offset && + e[i].start > data_offset) { + pos = data_offset; + esize = e[i].start - pos; + } + if (data_offset != INVALID_SECTORS && + pos != data_offset) { + i++; + continue; + } + if (esize >= *size) { + /* Found! 
*/ + free(e); + return pos; + } + if (esize > max_size) + max_size = esize; + pos = e[i].start + e[i].size; + i++; + } while (e[i-1].size); + *size = max_size; + free(e); + return INVALID_SECTORS; +} +#endif + +static int init_super_ddf_bvd(struct supertype *st, + mdu_array_info_t *info, + unsigned long long size, + char *name, char *homehost, + int *uuid, unsigned long long data_offset) +{ + /* We are creating a BVD inside a pre-existing container. + * so st->sb is already set. + * We need to create a new vd_config and a new virtual_entry + */ + struct ddf_super *ddf = st->sb; + unsigned int venum, i; + struct virtual_entry *ve; + struct vcl *vcl; + struct vd_config *vc; + + if (find_vde_by_name(ddf, name) != DDF_NOTFOUND) { + pr_err("This ddf already has an array called %s\n", name); + return 0; + } + venum = find_unused_vde(ddf); + if (venum == DDF_NOTFOUND) { + pr_err("Cannot find spare slot for virtual disk\n"); + return 0; + } + ve = &ddf->virt->entries[venum]; + + /* A Virtual Disk GUID contains the T10 Vendor ID, controller type, + * timestamp, random number + */ + make_header_guid(ve->guid); + ve->unit = cpu_to_be16(info->md_minor); + ve->pad0 = 0xFFFF; + ve->guid_crc._v16 = crc32(0, (unsigned char *)ddf->anchor.guid, + DDF_GUID_LEN); + ve->type = cpu_to_be16(0); + ve->state = DDF_state_degraded; /* Will be modified as devices are added */ + if (info->state & 1) /* clean */ + ve->init_state = DDF_init_full; + else + ve->init_state = DDF_init_not; + + memset(ve->pad1, 0xff, 14); + memset(ve->name, ' ', 16); + if (name) + strncpy(ve->name, name, 16); + ddf->virt->populated_vdes = + cpu_to_be16(be16_to_cpu(ddf->virt->populated_vdes)+1); + + /* Now create a new vd_config */ + if (posix_memalign((void**)&vcl, 512, + (offsetof(struct vcl, conf) + ddf->conf_rec_len * 512)) != 0) { + pr_err("could not allocate vd_config\n"); + return 0; + } + vcl->vcnum = venum; + vcl->block_sizes = NULL; /* FIXME not for CONCAT */ + vc = &vcl->conf; + + vc->magic = 
DDF_VD_CONF_MAGIC; + memcpy(vc->guid, ve->guid, DDF_GUID_LEN); + vc->timestamp = cpu_to_be32(time(0)-DECADE); + vc->seqnum = cpu_to_be32(1); + memset(vc->pad0, 0xff, 24); + vc->chunk_shift = chunk_to_shift(info->chunk_size); + if (layout_md2ddf(info, vc) == -1 || + be16_to_cpu(vc->prim_elmnt_count) > ddf->mppe) { + pr_err("unsupported RAID level/layout %d/%d with %d disks\n", + info->level, info->layout, info->raid_disks); + free(vcl); + return 0; + } + vc->sec_elmnt_seq = 0; + if (alloc_other_bvds(ddf, vcl) != 0) { + pr_err("could not allocate other bvds\n"); + free(vcl); + return 0; + } + vc->blocks = cpu_to_be64(info->size * 2); + vc->array_blocks = cpu_to_be64( + calc_array_size(info->level, info->raid_disks, info->layout, + info->chunk_size, info->size*2)); + memset(vc->pad1, 0xff, 8); + vc->spare_refs[0] = cpu_to_be32(0xffffffff); + vc->spare_refs[1] = cpu_to_be32(0xffffffff); + vc->spare_refs[2] = cpu_to_be32(0xffffffff); + vc->spare_refs[3] = cpu_to_be32(0xffffffff); + vc->spare_refs[4] = cpu_to_be32(0xffffffff); + vc->spare_refs[5] = cpu_to_be32(0xffffffff); + vc->spare_refs[6] = cpu_to_be32(0xffffffff); + vc->spare_refs[7] = cpu_to_be32(0xffffffff); + memset(vc->cache_pol, 0, 8); + vc->bg_rate = 0x80; + memset(vc->pad2, 0xff, 3); + memset(vc->pad3, 0xff, 52); + memset(vc->pad4, 0xff, 192); + memset(vc->v0, 0xff, 32); + memset(vc->v1, 0xff, 32); + memset(vc->v2, 0xff, 16); + memset(vc->v3, 0xff, 16); + memset(vc->vendor, 0xff, 32); + + memset(vc->phys_refnum, 0xff, 4*ddf->mppe); + memset(vc->phys_refnum+ddf->mppe, 0x00, 8*ddf->mppe); + + for (i = 1; i < vc->sec_elmnt_count; i++) { + memcpy(vcl->other_bvds[i-1], vc, ddf->conf_rec_len * 512); + vcl->other_bvds[i-1]->sec_elmnt_seq = i; + } + + vcl->next = ddf->conflist; + ddf->conflist = vcl; + ddf->currentconf = vcl; + ddf_set_updates_pending(ddf, NULL); + return 1; +} + +#ifndef MDASSEMBLE +static void add_to_super_ddf_bvd(struct supertype *st, + mdu_disk_info_t *dk, int fd, char *devname, + unsigned long 
long data_offset) +{ + /* fd and devname identify a device within the ddf container (st). + * dk identifies a location in the new BVD. + * We need to find suitable free space in that device and update + * the phys_refnum and lba_offset for the newly created vd_config. + * We might also want to update the type in the phys_disk + * section. + * + * Alternately: fd == -1 and we have already chosen which device to + * use and recorded in dl->raiddisk; + * + * Nothing is changed if the device is not found, has no phys + * entry, is not flagged MD_DISK_SYNC in dk->state, or has no + * suitable free space. + */ + struct dl *dl; + struct ddf_super *ddf = st->sb; + struct vd_config *vc; + unsigned int i; + unsigned long long blocks, pos; + unsigned int raid_disk = dk->raid_disk; + + if (fd == -1) { + for (dl = ddf->dlist; dl ; dl = dl->next) + if (dl->raiddisk == dk->raid_disk) + break; + } else { + for (dl = ddf->dlist; dl ; dl = dl->next) + if (dl->major == dk->major && + dl->minor == dk->minor) + break; + } + if (!dl || dl->pdnum < 0 || ! (dk->state & (1<<MD_DISK_SYNC))) + return; + + vc = &ddf->currentconf->conf; + if (vc->sec_elmnt_count > 1) { + unsigned int n = be16_to_cpu(vc->prim_elmnt_count); + if (raid_disk >= n) + vc = ddf->currentconf->other_bvds[raid_disk / n - 1]; + raid_disk %= n; + } + + blocks = be64_to_cpu(vc->blocks); + if (ddf->currentconf->block_sizes) + blocks = ddf->currentconf->block_sizes[dk->raid_disk]; + + pos = find_space(ddf, dl, data_offset, &blocks); + if (pos == INVALID_SECTORS) + return; + + ddf->currentdev = dk->raid_disk; + vc->phys_refnum[raid_disk] = dl->disk.refnum; + LBA_OFFSET(ddf, vc)[raid_disk] = cpu_to_be64(pos); + + for (i = 0; i < ddf->max_part ; i++) + if (dl->vlist[i] == NULL) + break; + if (i == ddf->max_part) + return; + dl->vlist[i] = ddf->currentconf; + + if (fd >= 0) + dl->fd = fd; + if (devname) + dl->devname = devname; + + /* Check if we can mark array as optimal yet */ + i = ddf->currentconf->vcnum; + ddf->virt->entries[i].state = + (ddf->virt->entries[i].state & ~DDF_state_mask) + | get_svd_state(ddf, ddf->currentconf); + be16_clear(ddf->phys->entries[dl->pdnum].type, + cpu_to_be16(DDF_Global_Spare)); + 
be16_set(ddf->phys->entries[dl->pdnum].type, + cpu_to_be16(DDF_Active_in_VD)); + dprintf("added disk %d/%08x to VD %d/%s as disk %d\n", + dl->pdnum, be32_to_cpu(dl->disk.refnum), + ddf->currentconf->vcnum, guid_str(vc->guid), + dk->raid_disk); + ddf_set_updates_pending(ddf, vc); +} + +static unsigned int find_unused_pde(const struct ddf_super *ddf) +{ + unsigned int i; + for (i = 0; i < be16_to_cpu(ddf->phys->max_pdes); i++) { + if (all_ff(ddf->phys->entries[i].guid)) + return i; + } + return DDF_NOTFOUND; +} + +static void _set_config_size(struct phys_disk_entry *pde, const struct dl *dl) +{ + __u64 cfs, t; + cfs = min(dl->size - 32*1024*2ULL, be64_to_cpu(dl->primary_lba)); + t = be64_to_cpu(dl->secondary_lba); + if (t != ~(__u64)0) + cfs = min(cfs, t); + /* + * Some vendor DDF structures interpret workspace_lba + * very differently than we do: Make a sanity check on the value. + */ + t = be64_to_cpu(dl->workspace_lba); + if (t < cfs) { + __u64 wsp = cfs - t; + if (wsp > 1024*1024*2ULL && wsp > dl->size / 16) { + pr_err("%x:%x: workspace size 0x%llx too big, ignoring\n", + dl->major, dl->minor, (unsigned long long)wsp); + } else + cfs = t; + } + pde->config_size = cpu_to_be64(cfs); + dprintf("%x:%x config_size %llx, DDF structure is %llx blocks\n", + dl->major, dl->minor, + (unsigned long long)cfs, (unsigned long long)(dl->size-cfs)); +} + +/* Add a device to a container, either while creating it or while + * expanding a pre-existing container + */ +static int add_to_super_ddf(struct supertype *st, + mdu_disk_info_t *dk, int fd, char *devname, + unsigned long long data_offset) +{ + struct ddf_super *ddf = st->sb; + struct dl *dd; + time_t now; + struct tm *tm; + unsigned long long size; + struct phys_disk_entry *pde; + unsigned int n, i; + struct stat stb; + __u32 *tptr; + + if (ddf->currentconf) { + add_to_super_ddf_bvd(st, dk, fd, devname, data_offset); + return 0; + } + + /* This is device numbered dk->number. 
We need to create + * a phys_disk entry and a more detailed disk_data entry. + */ + if (fstat(fd, &stb) != 0) { + /* stb feeds dd->major/dd->minor below; don't use it unset */ + pr_err("cannot stat device, cannot add disk\n"); + return 1; + } + n = find_unused_pde(ddf); + if (n == DDF_NOTFOUND) { + pr_err("No free slot in array, cannot add disk\n"); + return 1; + } + pde = &ddf->phys->entries[n]; + /* Start from 0 so the size check below rejects the device + * should get_dev_size() fail without storing a size. + */ + size = 0; + get_dev_size(fd, NULL, &size); + if (size <= 32*1024*1024) { + pr_err("device size must be at least 32MB\n"); + return 1; + } + size >>= 9; + + if (posix_memalign((void**)&dd, 512, + sizeof(*dd) + sizeof(dd->vlist[0]) * ddf->max_part) != 0) { + pr_err("could not allocate buffer for new disk, aborting\n"); + return 1; + } + dd->major = major(stb.st_rdev); + dd->minor = minor(stb.st_rdev); + dd->devname = devname; + dd->fd = fd; + dd->spare = NULL; + + dd->disk.magic = DDF_PHYS_DATA_MAGIC; + now = time(0); + tm = localtime(&now); + sprintf(dd->disk.guid, "%8s%04d%02d%02d", + T10, tm->tm_year+1900, tm->tm_mon+1, tm->tm_mday); + tptr = (__u32 *)(dd->disk.guid + 16); + *tptr++ = random32(); + *tptr = random32(); + + do { + /* Cannot be bothered finding a CRC of some irrelevant details*/ + dd->disk.refnum._v32 = random32(); + for (i = be16_to_cpu(ddf->active->max_pd_entries); + i > 0; i--) + if (be32_eq(ddf->phys->entries[i-1].refnum, + dd->disk.refnum)) + break; + } while (i > 0); + + dd->disk.forced_ref = 1; + dd->disk.forced_guid = 1; + memset(dd->disk.vendor, ' ', 32); + memcpy(dd->disk.vendor, "Linux", 5); + memset(dd->disk.pad, 0xff, 442); + for (i = 0; i < ddf->max_part ; i++) + dd->vlist[i] = NULL; + + dd->pdnum = n; + + if (st->update_tail) { + int len = (sizeof(struct phys_disk) + + sizeof(struct phys_disk_entry)); + struct phys_disk *pd; + + pd = xmalloc(len); + pd->magic = DDF_PHYS_RECORDS_MAGIC; + pd->used_pdes = cpu_to_be16(n); + pde = &pd->entries[0]; + dd->mdupdate = pd; + } else + ddf->phys->used_pdes = cpu_to_be16( + 1 + be16_to_cpu(ddf->phys->used_pdes)); + + memcpy(pde->guid, dd->disk.guid, DDF_GUID_LEN); + pde->refnum = dd->disk.refnum; + pde->type = cpu_to_be16(DDF_Forced_PD_GUID | 
DDF_Global_Spare); + pde->state = cpu_to_be16(DDF_Online); + dd->size = size; + /* + * If there is already a device in dlist, try to reserve the same + * amount of workspace. Otherwise, use 32MB. + * We checked disk size above already. + */ +#define __calc_lba(new, old, lba, mb) do { \ + unsigned long long dif; \ + if ((old) != NULL) \ + dif = (old)->size - be64_to_cpu((old)->lba); \ + else \ + dif = (new)->size; \ + if ((new)->size > dif) \ + (new)->lba = cpu_to_be64((new)->size - dif); \ + else \ + (new)->lba = cpu_to_be64((new)->size - (mb*1024*2)); \ + } while (0) + __calc_lba(dd, ddf->dlist, workspace_lba, 32); + __calc_lba(dd, ddf->dlist, primary_lba, 16); + if (ddf->dlist == NULL || + be64_to_cpu(ddf->dlist->secondary_lba) != ~(__u64)0) + __calc_lba(dd, ddf->dlist, secondary_lba, 32); + _set_config_size(pde, dd); + + sprintf(pde->path, "%17.17s","Information: nil") ; + memset(pde->pad, 0xff, 6); + + if (st->update_tail) { + dd->next = ddf->add_list; + ddf->add_list = dd; + } else { + dd->next = ddf->dlist; + ddf->dlist = dd; + ddf_set_updates_pending(ddf, NULL); + } + + return 0; +} + +static int remove_from_super_ddf(struct supertype *st, mdu_disk_info_t *dk) +{ + struct ddf_super *ddf = st->sb; + struct dl *dl; + + /* mdmon has noticed that this disk (dk->major/dk->minor) has + * disappeared from the container. + * We need to arrange that it disappears from the metadata and + * internal data structures too. + * Most of the work is done by ddf_process_update which edits + * the metadata and closes the file handle and attaches the memory + * where free_updates will free it. 
+ */ + for (dl = ddf->dlist; dl ; dl = dl->next) + if (dl->major == dk->major && + dl->minor == dk->minor) + break; + if (!dl || dl->pdnum < 0) + return -1; + + if (st->update_tail) { + int len = (sizeof(struct phys_disk) + + sizeof(struct phys_disk_entry)); + struct phys_disk *pd; + + pd = xmalloc(len); + pd->magic = DDF_PHYS_RECORDS_MAGIC; + pd->used_pdes = cpu_to_be16(dl->pdnum); + pd->entries[0].state = cpu_to_be16(DDF_Missing); + append_metadata_update(st, pd, len); + } + return 0; +} +#endif + +/* + * This is the write_init_super method for a ddf container. It is + * called when creating a container or adding another device to a + * container. + */ + +static int __write_ddf_structure(struct dl *d, struct ddf_super *ddf, __u8 type) +{ + unsigned long long sector; + struct ddf_header *header; + int fd, i, n_config, conf_size, buf_size; + int ret = 0; + char *conf; + + fd = d->fd; + + switch (type) { + case DDF_HEADER_PRIMARY: + header = &ddf->primary; + sector = be64_to_cpu(header->primary_lba); + break; + case DDF_HEADER_SECONDARY: + header = &ddf->secondary; + sector = be64_to_cpu(header->secondary_lba); + break; + default: + return 0; + } + if (sector == ~(__u64)0) + return 0; + + header->type = type; + header->openflag = 1; + header->crc = calc_crc(header, 512); + + lseek64(fd, sector<<9, 0); + if (write(fd, header, 512) < 0) + goto out; + + ddf->controller.crc = calc_crc(&ddf->controller, 512); + if (write(fd, &ddf->controller, 512) < 0) + goto out; + + ddf->phys->crc = calc_crc(ddf->phys, ddf->pdsize); + if (write(fd, ddf->phys, ddf->pdsize) < 0) + goto out; + ddf->virt->crc = calc_crc(ddf->virt, ddf->vdsize); + if (write(fd, ddf->virt, ddf->vdsize) < 0) + goto out; + + /* Now write lots of config records. 
*/ + n_config = ddf->max_part; + conf_size = ddf->conf_rec_len * 512; + conf = ddf->conf; + buf_size = conf_size * (n_config + 1); + if (!conf) { + if (posix_memalign((void**)&conf, 512, buf_size) != 0) + goto out; + ddf->conf = conf; + } + for (i = 0 ; i <= n_config ; i++) { + struct vcl *c; + struct vd_config *vdc = NULL; + if (i == n_config) { + c = (struct vcl *)d->spare; + if (c) + vdc = &c->conf; + } else { + unsigned int dummy; + c = d->vlist[i]; + if (c) + get_pd_index_from_refnum( + c, d->disk.refnum, + ddf->mppe, + (const struct vd_config **)&vdc, + &dummy); + } + if (vdc) { + dprintf("writing conf record %i on disk %08x for %s/%u\n", + i, be32_to_cpu(d->disk.refnum), + guid_str(vdc->guid), + vdc->sec_elmnt_seq); + vdc->crc = calc_crc(vdc, conf_size); + memcpy(conf + i*conf_size, vdc, conf_size); + } else + memset(conf + i*conf_size, 0xff, conf_size); + } + if (write(fd, conf, buf_size) != buf_size) + goto out; + + d->disk.crc = calc_crc(&d->disk, 512); + if (write(fd, &d->disk, 512) < 0) + goto out; + + ret = 1; +out: + header->openflag = 0; + header->crc = calc_crc(header, 512); + + lseek64(fd, sector<<9, 0); + if (write(fd, header, 512) < 0) + ret = 0; + + return ret; +} + +static int _write_super_to_disk(struct ddf_super *ddf, struct dl *d) +{ + unsigned long long size; + int fd = d->fd; + if (fd < 0) + return 0; + + /* We need to fill in the primary, (secondary) and workspace + * lba's in the headers, set their checksums, + * Also checksum phys, virt.... + * + * Then write everything out, finally the anchor is written. 
+ */ + get_dev_size(fd, NULL, &size); + size /= 512; + memcpy(&ddf->anchor, ddf->active, 512); + if (be64_to_cpu(d->workspace_lba) != 0ULL) + ddf->anchor.workspace_lba = d->workspace_lba; + else + ddf->anchor.workspace_lba = + cpu_to_be64(size - 32*1024*2); + if (be64_to_cpu(d->primary_lba) != 0ULL) + ddf->anchor.primary_lba = d->primary_lba; + else + ddf->anchor.primary_lba = + cpu_to_be64(size - 16*1024*2); + if (be64_to_cpu(d->secondary_lba) != 0ULL) + ddf->anchor.secondary_lba = d->secondary_lba; + else + ddf->anchor.secondary_lba = + cpu_to_be64(size - 32*1024*2); + ddf->anchor.timestamp = cpu_to_be32(time(0) - DECADE); + memcpy(&ddf->primary, &ddf->anchor, 512); + memcpy(&ddf->secondary, &ddf->anchor, 512); + + ddf->anchor.type = DDF_HEADER_ANCHOR; + ddf->anchor.openflag = 0xFF; /* 'open' means nothing */ + ddf->anchor.seq = cpu_to_be32(0xFFFFFFFF); /* no sequencing in anchor */ + ddf->anchor.crc = calc_crc(&ddf->anchor, 512); + + if (!__write_ddf_structure(d, ddf, DDF_HEADER_PRIMARY)) + return 0; + + if (!__write_ddf_structure(d, ddf, DDF_HEADER_SECONDARY)) + return 0; + + lseek64(fd, (size-1)*512, SEEK_SET); + if (write(fd, &ddf->anchor, 512) < 0) + return 0; + + return 1; +} + +#ifndef MDASSEMBLE +static int __write_init_super_ddf(struct supertype *st) +{ + struct ddf_super *ddf = st->sb; + struct dl *d; + int attempts = 0; + int successes = 0; + + pr_state(ddf, __func__); + + /* try to write updated metadata, + * if we catch a failure move on to the next disk + */ + for (d = ddf->dlist; d; d=d->next) { + attempts++; + successes += _write_super_to_disk(ddf, d); + } + + return attempts != successes; +} + +static int write_init_super_ddf(struct supertype *st) +{ + struct ddf_super *ddf = st->sb; + struct vcl *currentconf = ddf->currentconf; + + /* We are done with currentconf - reset it so st refers to the container */ + ddf->currentconf = NULL; + + if (st->update_tail) { + /* queue the virtual_disk and vd_config as metadata updates */ + struct virtual_disk 
*vd; + struct vd_config *vc; + int len, tlen; + unsigned int i; + + if (!currentconf) { + /* Must be adding a physical disk to the container */ + int len = (sizeof(struct phys_disk) + + sizeof(struct phys_disk_entry)); + + /* adding a disk to the container. */ + if (!ddf->add_list) + return 0; + + append_metadata_update(st, ddf->add_list->mdupdate, len); + ddf->add_list->mdupdate = NULL; + return 0; + } + + /* Newly created VD */ + + /* First the virtual disk. We have a slightly fake header */ + len = sizeof(struct virtual_disk) + sizeof(struct virtual_entry); + vd = xmalloc(len); + *vd = *ddf->virt; + vd->entries[0] = ddf->virt->entries[currentconf->vcnum]; + vd->populated_vdes = cpu_to_be16(currentconf->vcnum); + append_metadata_update(st, vd, len); + + /* Then the vd_config */ + len = ddf->conf_rec_len * 512; + tlen = len * currentconf->conf.sec_elmnt_count; + vc = xmalloc(tlen); + memcpy(vc, ¤tconf->conf, len); + for (i = 1; i < currentconf->conf.sec_elmnt_count; i++) + memcpy((char *)vc + i*len, currentconf->other_bvds[i-1], + len); + append_metadata_update(st, vc, tlen); + + return 0; + } else { + struct dl *d; + if (!currentconf) + for (d = ddf->dlist; d; d=d->next) + while (Kill(d->devname, NULL, 0, -1, 1) == 0); + /* Note: we don't close the fd's now, but a subsequent + * ->free_super() will + */ + return __write_init_super_ddf(st); + } +} + +#endif + +static __u64 avail_size_ddf(struct supertype *st, __u64 devsize, + unsigned long long data_offset) +{ + /* We must reserve the last 32Meg */ + if (devsize <= 32*1024*2) + return 0; + return devsize - 32*1024*2; +} + +#ifndef MDASSEMBLE + +static int reserve_space(struct supertype *st, int raiddisks, + unsigned long long size, int chunk, + unsigned long long data_offset, + unsigned long long *freesize) +{ + /* Find 'raiddisks' spare extents at least 'size' big (but + * only caring about multiples of 'chunk') and remember + * them. If size==0, find the largest size possible. 
+ * Report available size in *freesize + * If space cannot be found, fail. + */ + struct dl *dl; + struct ddf_super *ddf = st->sb; + int cnt = 0; + + for (dl = ddf->dlist; dl ; dl=dl->next) { + dl->raiddisk = -1; + dl->esize = 0; + } + /* Now find largest extent on each device */ + for (dl = ddf->dlist ; dl ; dl=dl->next) { + unsigned long long minsize = ULLONG_MAX; + + find_space(ddf, dl, data_offset, &minsize); + if (minsize >= size && minsize >= (unsigned)chunk) { + cnt++; + dl->esize = minsize; + } + } + if (cnt < raiddisks) { + pr_err("not enough devices with space to create array.\n"); + return 0; /* No enough free spaces large enough */ + } + if (size == 0) { + /* choose the largest size of which there are at least 'raiddisk' */ + for (dl = ddf->dlist ; dl ; dl=dl->next) { + struct dl *dl2; + if (dl->esize <= size) + continue; + /* This is bigger than 'size', see if there are enough */ + cnt = 0; + for (dl2 = ddf->dlist; dl2 ; dl2=dl2->next) + if (dl2->esize >= dl->esize) + cnt++; + if (cnt >= raiddisks) + size = dl->esize; + } + if (chunk) { + size = size / chunk; + size *= chunk; + } + *freesize = size; + if (size < 32) { + pr_err("not enough spare devices to create array.\n"); + return 0; + } + } + /* We have a 'size' of which there are enough spaces. + * We simply do a first-fit */ + cnt = 0; + for (dl = ddf->dlist ; dl && cnt < raiddisks ; dl=dl->next) { + if (dl->esize < size) + continue; + + dl->raiddisk = cnt; + cnt++; + } + return 1; +} + +static int validate_geometry_ddf(struct supertype *st, + int level, int layout, int raiddisks, + int *chunk, unsigned long long size, + unsigned long long data_offset, + char *dev, unsigned long long *freesize, + int verbose) +{ + int fd; + struct mdinfo *sra; + int cfd; + + /* ddf potentially supports lots of things, but it depends on + * what devices are offered (and maybe kernel version?) + * If given unused devices, we will make a container. + * If given devices in a container, we will make a BVD. 
+ * If given BVDs, we make an SVD, changing all the GUIDs in the process. + */ + + if (*chunk == UnSet) + *chunk = DEFAULT_CHUNK; + + if (level == LEVEL_NONE) + level = LEVEL_CONTAINER; + if (level == LEVEL_CONTAINER) { + /* Must be a fresh device to add to a container */ + return validate_geometry_ddf_container(st, level, layout, + raiddisks, *chunk, + size, data_offset, dev, + freesize, + verbose); + } + + if (!dev) { + mdu_array_info_t array = { + .level = level, + .layout = layout, + .raid_disks = raiddisks + }; + struct vd_config conf; + if (layout_md2ddf(&array, &conf) == -1) { + if (verbose) + pr_err("DDF does not support level %d /layout %d arrays with %d disks\n", + level, layout, raiddisks); + return 0; + } + /* Should check layout? etc */ + + if (st->sb && freesize) { + /* --create was given a container to create in. + * So we need to check that there are enough + * free spaces and return the amount of space. + * We may as well remember which drives were + * chosen so that add_to_super/getinfo_super + * can return them. + */ + return reserve_space(st, raiddisks, size, *chunk, + data_offset, freesize); + } + return 1; + } + + if (st->sb) { + /* A container has already been opened, so we are + * creating in there. Maybe a BVD, maybe an SVD. + * Should make a distinction one day. + */ + return validate_geometry_ddf_bvd(st, level, layout, raiddisks, + chunk, size, data_offset, dev, + freesize, + verbose); + } + /* This is the first device for the array. + * If it is a container, we read it in and do automagic allocations, + * no other devices should be given. + * Otherwise it must be a member device of a container, and we + * do manual allocation. + * Later we should check for a BVD and make an SVD. 
+ */ + fd = open(dev, O_RDONLY|O_EXCL, 0); + if (fd >= 0) { + close(fd); + /* Just a bare device, no good to us */ + if (verbose) + pr_err("ddf: Cannot create this array on device %s - a container is required.\n", + dev); + return 0; + } + if (errno != EBUSY || (fd = open(dev, O_RDONLY, 0)) < 0) { + if (verbose) + pr_err("ddf: Cannot open %s: %s\n", + dev, strerror(errno)); + return 0; + } + /* Well, it is in use by someone, maybe a 'ddf' container. */ + cfd = open_container(fd); + if (cfd < 0) { + close(fd); + if (verbose) + pr_err("ddf: Cannot use %s: %s\n", + dev, strerror(EBUSY)); + return 0; + } + sra = sysfs_read(cfd, NULL, GET_VERSION); + close(fd); + if (sra && sra->array.major_version == -1 && + strcmp(sra->text_version, "ddf") == 0) { + /* This is a member of a ddf container. Load the container + * and try to create a bvd + */ + struct ddf_super *ddf; + if (load_super_ddf_all(st, cfd, (void **)&ddf, NULL) == 0) { + st->sb = ddf; + strcpy(st->container_devnm, fd2devnm(cfd)); + close(cfd); + return validate_geometry_ddf_bvd(st, level, layout, + raiddisks, chunk, size, + data_offset, + dev, freesize, + verbose); + } + close(cfd); + } else /* device may belong to a different container */ + return 0; + + return 1; +} + +static int +validate_geometry_ddf_container(struct supertype *st, + int level, int layout, int raiddisks, + int chunk, unsigned long long size, + unsigned long long data_offset, + char *dev, unsigned long long *freesize, + int verbose) +{ + int fd; + unsigned long long ldsize; + + if (level != LEVEL_CONTAINER) + return 0; + if (!dev) + return 1; + + fd = open(dev, O_RDONLY|O_EXCL, 0); + if (fd < 0) { + if (verbose) + pr_err("ddf: Cannot open %s: %s\n", + dev, strerror(errno)); + return 0; + } + if (!get_dev_size(fd, dev, &ldsize)) { + close(fd); + return 0; + } + close(fd); + + *freesize = avail_size_ddf(st, ldsize >> 9, INVALID_SECTORS); + if (*freesize == 0) + return 0; + + return 1; +} + +static int validate_geometry_ddf_bvd(struct supertype 
*st, + int level, int layout, int raiddisks, + int *chunk, unsigned long long size, + unsigned long long data_offset, + char *dev, unsigned long long *freesize, + int verbose) +{ + struct stat stb; + struct ddf_super *ddf = st->sb; + struct dl *dl; + unsigned long long maxsize; + /* ddf/bvd supports lots of things, but not containers */ + if (level == LEVEL_CONTAINER) { + if (verbose) + pr_err("DDF cannot create a container within an container\n"); + return 0; + } + /* We must have the container info already read in. */ + if (!ddf) + return 0; + + if (!dev) { + /* General test: make sure there is space for + * 'raiddisks' device extents of size 'size'. + */ + unsigned long long minsize = size; + int dcnt = 0; + if (minsize == 0) + minsize = 8; + for (dl = ddf->dlist; dl ; dl = dl->next) { + if (find_space(ddf, dl, data_offset, &minsize) + != INVALID_SECTORS) + dcnt++; + } + if (dcnt < raiddisks) { + if (verbose) + pr_err("ddf: Not enough devices with space for this array (%d < %d)\n", + dcnt, raiddisks); + return 0; + } + return 1; + } + /* This device must be a member of the set */ + if (stat(dev, &stb) < 0) + return 0; + if ((S_IFMT & stb.st_mode) != S_IFBLK) + return 0; + for (dl = ddf->dlist ; dl ; dl = dl->next) { + if (dl->major == (int)major(stb.st_rdev) && + dl->minor == (int)minor(stb.st_rdev)) + break; + } + if (!dl) { + if (verbose) + pr_err("ddf: %s is not in the same DDF set\n", + dev); + return 0; + } + maxsize = ULLONG_MAX; + find_space(ddf, dl, data_offset, &maxsize); + *freesize = maxsize; + + return 1; +} + +static int load_super_ddf_all(struct supertype *st, int fd, + void **sbp, char *devname) +{ + struct mdinfo *sra; + struct ddf_super *super; + struct mdinfo *sd, *best = NULL; + int bestseq = 0; + int seq; + char nm[20]; + int dfd; + + sra = sysfs_read(fd, 0, GET_LEVEL|GET_VERSION|GET_DEVS|GET_STATE); + if (!sra) + return 1; + if (sra->array.major_version != -1 || + sra->array.minor_version != -2 || + strcmp(sra->text_version, "ddf") != 0) + 
return 1; + + if (posix_memalign((void**)&super, 512, sizeof(*super)) != 0) + return 1; + memset(super, 0, sizeof(*super)); + + /* first, try each device, and choose the best ddf */ + for (sd = sra->devs ; sd ; sd = sd->next) { + int rv; + sprintf(nm, "%d:%d", sd->disk.major, sd->disk.minor); + dfd = dev_open(nm, O_RDONLY); + if (dfd < 0) + return 2; + rv = load_ddf_headers(dfd, super, NULL); + close(dfd); + if (rv == 0) { + seq = be32_to_cpu(super->active->seq); + if (super->active->openflag) + seq--; + if (!best || seq > bestseq) { + bestseq = seq; + best = sd; + } + } + } + if (!best) + return 1; + /* OK, load this ddf */ + sprintf(nm, "%d:%d", best->disk.major, best->disk.minor); + dfd = dev_open(nm, O_RDONLY); + if (dfd < 0) + return 1; + load_ddf_headers(dfd, super, NULL); + load_ddf_global(dfd, super, NULL); + close(dfd); + /* Now we need the device-local bits */ + for (sd = sra->devs ; sd ; sd = sd->next) { + int rv; + + sprintf(nm, "%d:%d", sd->disk.major, sd->disk.minor); + dfd = dev_open(nm, O_RDWR); + if (dfd < 0) + return 2; + rv = load_ddf_headers(dfd, super, NULL); + if (rv == 0) + rv = load_ddf_local(dfd, super, NULL, 1); + if (rv) + return 1; + } + + *sbp = super; + if (st->ss == NULL) { + st->ss = &super_ddf; + st->minor_version = 0; + st->max_devs = 512; + } + strcpy(st->container_devnm, fd2devnm(fd)); + return 0; +} + +static int load_container_ddf(struct supertype *st, int fd, + char *devname) +{ + return load_super_ddf_all(st, fd, &st->sb, devname); +} + +#endif /* MDASSEMBLE */ + +static int check_secondary(const struct vcl *vc) +{ + const struct vd_config *conf = &vc->conf; + int i; + + /* The only DDF secondary RAID level md can support is + * RAID 10, if the stripe sizes and Basic volume sizes + * are all equal. + * Other configurations could in theory be supported by exposing + * the BVDs to user space and using device mapper for the secondary + * mapping. So far we don't support that. 
+ */ + + __u64 sec_elements[4] = {0, 0, 0, 0}; +#define __set_sec_seen(n) (sec_elements[(n)>>6] |= (1<<((n)&63))) +#define __was_sec_seen(n) ((sec_elements[(n)>>6] & (1<<((n)&63))) != 0) + + if (vc->other_bvds == NULL) { + pr_err("No BVDs for secondary RAID found\n"); + return -1; + } + if (conf->prl != DDF_RAID1) { + pr_err("Secondary RAID level only supported for mirrored BVD\n"); + return -1; + } + if (conf->srl != DDF_2STRIPED && conf->srl != DDF_2SPANNED) { + pr_err("Secondary RAID level %d is unsupported\n", + conf->srl); + return -1; + } + __set_sec_seen(conf->sec_elmnt_seq); + for (i = 0; i < conf->sec_elmnt_count-1; i++) { + const struct vd_config *bvd = vc->other_bvds[i]; + if (bvd->sec_elmnt_seq == DDF_UNUSED_BVD) + continue; + if (bvd->srl != conf->srl) { + pr_err("Inconsistent secondary RAID level across BVDs\n"); + return -1; + } + if (bvd->prl != conf->prl) { + pr_err("Different RAID levels for BVDs are unsupported\n"); + return -1; + } + if (!be16_eq(bvd->prim_elmnt_count, conf->prim_elmnt_count)) { + pr_err("All BVDs must have the same number of primary elements\n"); + return -1; + } + if (bvd->chunk_shift != conf->chunk_shift) { + pr_err("Different strip sizes for BVDs are unsupported\n"); + return -1; + } + if (!be64_eq(bvd->array_blocks, conf->array_blocks)) { + pr_err("Different BVD sizes are unsupported\n"); + return -1; + } + __set_sec_seen(bvd->sec_elmnt_seq); + } + for (i = 0; i < conf->sec_elmnt_count; i++) { + if (!__was_sec_seen(i)) { + /* pr_err("BVD %d is missing\n", i); */ + return -1; + } + } + return 0; +} + +static unsigned int get_pd_index_from_refnum(const struct vcl *vc, + be32 refnum, unsigned int nmax, + const struct vd_config **bvd, + unsigned int *idx) +{ + unsigned int i, j, n, sec, cnt; + + cnt = be16_to_cpu(vc->conf.prim_elmnt_count); + sec = (vc->conf.sec_elmnt_count == 1 ? 
0 : vc->conf.sec_elmnt_seq); + + for (i = 0, j = 0 ; i < nmax ; i++) { + /* j counts valid entries for this BVD */ + if (be32_eq(vc->conf.phys_refnum[i], refnum)) { + *bvd = &vc->conf; + *idx = i; + return sec * cnt + j; + } + if (be32_to_cpu(vc->conf.phys_refnum[i]) != 0xffffffff) + j++; + } + if (vc->other_bvds == NULL) + goto bad; + + for (n = 1; n < vc->conf.sec_elmnt_count; n++) { + struct vd_config *vd = vc->other_bvds[n-1]; + sec = vd->sec_elmnt_seq; + if (sec == DDF_UNUSED_BVD) + continue; + for (i = 0, j = 0 ; i < nmax ; i++) { + if (be32_eq(vd->phys_refnum[i], refnum)) { + *bvd = vd; + *idx = i; + return sec * cnt + j; + } + if (be32_to_cpu(vd->phys_refnum[i]) != 0xffffffff) + j++; + } + } +bad: + *bvd = NULL; + return DDF_NOTFOUND; +} + +static struct mdinfo *container_content_ddf(struct supertype *st, char *subarray) +{ + /* Given a container loaded by load_super_ddf_all, + * extract information about all the arrays into + * an mdinfo tree. + * + * For each vcl in conflist: create an mdinfo, fill it in, + * then look for matching devices (phys_refnum) in dlist + * and create appropriate device mdinfo. 
+ */ + struct ddf_super *ddf = st->sb; + struct mdinfo *rest = NULL; + struct vcl *vc; + + for (vc = ddf->conflist ; vc ; vc=vc->next) { + unsigned int i; + struct mdinfo *this; + char *ep; + __u32 *cptr; + unsigned int pd; + + if (subarray && + (strtoul(subarray, &ep, 10) != vc->vcnum || + *ep != '\0')) + continue; + + if (vc->conf.sec_elmnt_count > 1) { + if (check_secondary(vc) != 0) + continue; + } + + this = xcalloc(1, sizeof(*this)); + this->next = rest; + rest = this; + + if (layout_ddf2md(&vc->conf, &this->array)) + continue; + this->array.md_minor = -1; + this->array.major_version = -1; + this->array.minor_version = -2; + this->safe_mode_delay = DDF_SAFE_MODE_DELAY; + cptr = (__u32 *)(vc->conf.guid + 16); + this->array.ctime = DECADE + __be32_to_cpu(*cptr); + this->array.utime = DECADE + + be32_to_cpu(vc->conf.timestamp); + this->array.chunk_size = 512 << vc->conf.chunk_shift; + + i = vc->vcnum; + if ((ddf->virt->entries[i].state & DDF_state_inconsistent) || + (ddf->virt->entries[i].init_state & DDF_initstate_mask) != + DDF_init_full) { + this->array.state = 0; + this->resync_start = 0; + } else { + this->array.state = 1; + this->resync_start = MaxSector; + } + _ddf_array_name(this->name, ddf, i); + memset(this->uuid, 0, sizeof(this->uuid)); + this->component_size = be64_to_cpu(vc->conf.blocks); + this->array.size = this->component_size / 2; + this->container_member = i; + + ddf->currentconf = vc; + uuid_from_super_ddf(st, this->uuid); + if (!subarray) + ddf->currentconf = NULL; + + sprintf(this->text_version, "/%s/%d", + st->container_devnm, this->container_member); + + for (pd = 0; pd < be16_to_cpu(ddf->phys->max_pdes); pd++) { + struct mdinfo *dev; + struct dl *d; + const struct vd_config *bvd; + unsigned int iphys; + int stt; + + if (be32_to_cpu(ddf->phys->entries[pd].refnum) + == 0xFFFFFFFF) + continue; + + stt = be16_to_cpu(ddf->phys->entries[pd].state); + if ((stt & (DDF_Online|DDF_Failed|DDF_Rebuilding)) + != DDF_Online) + continue; + + i = 
get_pd_index_from_refnum( + vc, ddf->phys->entries[pd].refnum, + ddf->mppe, &bvd, &iphys); + if (i == DDF_NOTFOUND) + continue; + + this->array.working_disks++; + + for (d = ddf->dlist; d ; d=d->next) + if (be32_eq(d->disk.refnum, + ddf->phys->entries[pd].refnum)) + break; + if (d == NULL) + /* Haven't found that one yet, maybe there are others */ + continue; + + dev = xcalloc(1, sizeof(*dev)); + dev->next = this->devs; + this->devs = dev; + + dev->disk.number = be32_to_cpu(d->disk.refnum); + dev->disk.major = d->major; + dev->disk.minor = d->minor; + dev->disk.raid_disk = i; + dev->disk.state = (1<recovery_start = MaxSector; + + dev->events = be32_to_cpu(ddf->active->seq); + dev->data_offset = + be64_to_cpu(LBA_OFFSET(ddf, bvd)[iphys]); + dev->component_size = be64_to_cpu(bvd->blocks); + if (d->devname) + strcpy(dev->name, d->devname); + } + } + return rest; +} + +static int store_super_ddf(struct supertype *st, int fd) +{ + struct ddf_super *ddf = st->sb; + unsigned long long dsize; + void *buf; + int rc; + + if (!ddf) + return 1; + + if (!get_dev_size(fd, NULL, &dsize)) + return 1; + + if (ddf->dlist || ddf->conflist) { + struct stat sta; + struct dl *dl; + int ofd, ret; + + if (fstat(fd, &sta) == -1 || !S_ISBLK(sta.st_mode)) { + pr_err("file descriptor for invalid device\n"); + return 1; + } + for (dl = ddf->dlist; dl; dl = dl->next) + if (dl->major == (int)major(sta.st_rdev) && + dl->minor == (int)minor(sta.st_rdev)) + break; + if (!dl) { + pr_err("couldn't find disk %d/%d\n", + (int)major(sta.st_rdev), + (int)minor(sta.st_rdev)); + return 1; + } + ofd = dl->fd; + dl->fd = fd; + ret = (_write_super_to_disk(ddf, dl) != 1); + dl->fd = ofd; + return ret; + } + + if (posix_memalign(&buf, 512, 512) != 0) + return 1; + memset(buf, 0, 512); + + lseek64(fd, dsize-512, 0); + rc = write(fd, buf, 512); + free(buf); + if (rc < 0) + return 1; + return 0; +} + +static int compare_super_ddf(struct supertype *st, struct supertype *tst) +{ + /* + * return: + * 0 same, or first 
was empty, and second was copied + * 1 second had wrong magic number - but that isn't possible + * 2 wrong uuid + * 3 wrong other info + */ + struct ddf_super *first = st->sb; + struct ddf_super *second = tst->sb; + struct dl *dl1, *dl2; + struct vcl *vl1, *vl2; + unsigned int max_vds, max_pds, pd, vd; + + if (!first) { + st->sb = tst->sb; + tst->sb = NULL; + return 0; + } + + if (memcmp(first->anchor.guid, second->anchor.guid, DDF_GUID_LEN) != 0) + return 2; + + /* It is only OK to compare info in the anchor. Anything else + * could be changing due to a reconfig so must be ignored. + * guid really should be enough anyway. + */ + + if (!be32_eq(first->active->seq, second->active->seq)) { + dprintf("sequence number mismatch %u<->%u\n", + be32_to_cpu(first->active->seq), + be32_to_cpu(second->active->seq)); + return 0; + } + + /* + * At this point we are fairly sure that the meta data matches. + * But the new disk may contain additional local data. + * Add it to the super block. + */ + max_vds = be16_to_cpu(first->active->max_vd_entries); + max_pds = be16_to_cpu(first->phys->max_pdes); + for (vl2 = second->conflist; vl2; vl2 = vl2->next) { + for (vl1 = first->conflist; vl1; vl1 = vl1->next) + if (!memcmp(vl1->conf.guid, vl2->conf.guid, + DDF_GUID_LEN)) + break; + if (vl1) { + if (vl1->other_bvds != NULL && + vl1->conf.sec_elmnt_seq != + vl2->conf.sec_elmnt_seq) { + dprintf("adding BVD %u\n", + vl2->conf.sec_elmnt_seq); + add_other_bvd(vl1, &vl2->conf, + first->conf_rec_len*512); + } + continue; + } + + if (posix_memalign((void **)&vl1, 512, + (first->conf_rec_len*512 + + offsetof(struct vcl, conf))) != 0) { + pr_err("could not allocate vcl buf\n"); + return 3; + } + + vl1->next = first->conflist; + vl1->block_sizes = NULL; + memcpy(&vl1->conf, &vl2->conf, first->conf_rec_len*512); + if (alloc_other_bvds(first, vl1) != 0) { + pr_err("could not allocate other bvds\n"); + free(vl1); + return 3; + } + for (vd = 0; vd < max_vds; vd++) + if 
(!memcmp(first->virt->entries[vd].guid, + vl1->conf.guid, DDF_GUID_LEN)) + break; + vl1->vcnum = vd; + dprintf("added config for VD %u\n", vl1->vcnum); + first->conflist = vl1; + } + + for (dl2 = second->dlist; dl2; dl2 = dl2->next) { + for (dl1 = first->dlist; dl1; dl1 = dl1->next) + if (be32_eq(dl1->disk.refnum, dl2->disk.refnum)) + break; + if (dl1) + continue; + + if (posix_memalign((void **)&dl1, 512, + sizeof(*dl1) + (first->max_part) * sizeof(dl1->vlist[0])) + != 0) { + pr_err("could not allocate disk info buffer\n"); + return 3; + } + memcpy(dl1, dl2, sizeof(*dl1)); + dl1->mdupdate = NULL; + dl1->next = first->dlist; + dl1->fd = -1; + for (pd = 0; pd < max_pds; pd++) + if (be32_eq(first->phys->entries[pd].refnum, + dl1->disk.refnum)) + break; + dl1->pdnum = pd < max_pds ? (int)pd : -1; + if (dl2->spare) { + if (posix_memalign((void **)&dl1->spare, 512, + first->conf_rec_len*512) != 0) { + pr_err("could not allocate spare info buf\n"); + return 3; + } + memcpy(dl1->spare, dl2->spare, first->conf_rec_len*512); + } + for (vd = 0 ; vd < first->max_part ; vd++) { + if (!dl2->vlist[vd]) { + dl1->vlist[vd] = NULL; + continue; + } + for (vl1 = first->conflist; vl1; vl1 = vl1->next) { + if (!memcmp(vl1->conf.guid, + dl2->vlist[vd]->conf.guid, + DDF_GUID_LEN)) + break; + dl1->vlist[vd] = vl1; + } + } + first->dlist = dl1; + dprintf("added disk %d: %08x\n", dl1->pdnum, + be32_to_cpu(dl1->disk.refnum)); + } + + return 0; +} + +#ifndef MDASSEMBLE +/* + * A new array 'a' has been started which claims to be instance 'inst' + * within container 'c'. + * We need to confirm that the array matches the metadata in 'c' so + * that we don't corrupt any metadata. 
+ */ +static int ddf_open_new(struct supertype *c, struct active_array *a, char *inst) +{ + struct ddf_super *ddf = c->sb; + int n = atoi(inst); + struct mdinfo *dev; + struct dl *dl; + static const char faulty[] = "faulty"; + + if (all_ff(ddf->virt->entries[n].guid)) { + pr_err("subarray %d doesn't exist\n", n); + return -ENODEV; + } + dprintf("new subarray %d, GUID: %s\n", n, + guid_str(ddf->virt->entries[n].guid)); + for (dev = a->info.devs; dev; dev = dev->next) { + for (dl = ddf->dlist; dl; dl = dl->next) + if (dl->major == dev->disk.major && + dl->minor == dev->disk.minor) + break; + if (!dl || dl->pdnum < 0) { + pr_err("device %d/%d of subarray %d not found in meta data\n", + dev->disk.major, dev->disk.minor, n); + return -1; + } + if ((be16_to_cpu(ddf->phys->entries[dl->pdnum].state) & + (DDF_Online|DDF_Missing|DDF_Failed)) != DDF_Online) { + pr_err("new subarray %d contains broken device %d/%d (%02x)\n", + n, dl->major, dl->minor, + be16_to_cpu(ddf->phys->entries[dl->pdnum].state)); + if (write(dev->state_fd, faulty, sizeof(faulty)-1) != + sizeof(faulty) - 1) + pr_err("Write to state_fd failed\n"); + dev->curr_state = DS_FAULTY; + } + } + a->info.container_member = n; + return 0; +} + +static void handle_missing(struct ddf_super *ddf, struct active_array *a, int inst) +{ + /* This member array is being activated. If any devices + * are missing they must now be marked as failed. + */ + struct vd_config *vc; + unsigned int n_bvd; + struct vcl *vcl; + struct dl *dl; + int pd; + int n; + int state; + + for (n = 0; ; n++) { + vc = find_vdcr(ddf, inst, n, &n_bvd, &vcl); + if (!vc) + break; + for (dl = ddf->dlist; dl; dl = dl->next) + if (be32_eq(dl->disk.refnum, vc->phys_refnum[n_bvd])) + break; + if (dl) + /* Found this disk, so not missing */ + continue; + + /* Mark the device as failed/missing. 
*/ + pd = find_phys(ddf, vc->phys_refnum[n_bvd]); + if (pd >= 0 && be16_and(ddf->phys->entries[pd].state, + cpu_to_be16(DDF_Online))) { + be16_clear(ddf->phys->entries[pd].state, + cpu_to_be16(DDF_Online)); + be16_set(ddf->phys->entries[pd].state, + cpu_to_be16(DDF_Failed|DDF_Missing)); + vc->phys_refnum[n_bvd] = cpu_to_be32(0); + ddf_set_updates_pending(ddf, vc); + } + + /* Mark the array as Degraded */ + state = get_svd_state(ddf, vcl); + if (ddf->virt->entries[inst].state != + ((ddf->virt->entries[inst].state & ~DDF_state_mask) + | state)) { + ddf->virt->entries[inst].state = + (ddf->virt->entries[inst].state & ~DDF_state_mask) + | state; + a->check_degraded = 1; + ddf_set_updates_pending(ddf, vc); + } + } +} + +/* + * The array 'a' is to be marked clean in the metadata. + * If '->resync_start' is not ~(unsigned long long)0, then the array is only + * clean up to the point (in sectors). If that cannot be recorded in the + * metadata, then leave it as dirty. + * + * For DDF, we need to clear the DDF_state_inconsistent bit in the + * !global! virtual_disk.virtual_entry structure. 
+ */ +static int ddf_set_array_state(struct active_array *a, int consistent) +{ + struct ddf_super *ddf = a->container->sb; + int inst = a->info.container_member; + int old = ddf->virt->entries[inst].state; + if (consistent == 2) { + handle_missing(ddf, a, inst); + consistent = 1; + if (!is_resync_complete(&a->info)) + consistent = 0; + } + if (consistent) + ddf->virt->entries[inst].state &= ~DDF_state_inconsistent; + else + ddf->virt->entries[inst].state |= DDF_state_inconsistent; + if (old != ddf->virt->entries[inst].state) + ddf_set_updates_pending(ddf, NULL); + + old = ddf->virt->entries[inst].init_state; + ddf->virt->entries[inst].init_state &= ~DDF_initstate_mask; + if (is_resync_complete(&a->info)) + ddf->virt->entries[inst].init_state |= DDF_init_full; + else if (a->info.resync_start == 0) + ddf->virt->entries[inst].init_state |= DDF_init_not; + else + ddf->virt->entries[inst].init_state |= DDF_init_quick; + if (old != ddf->virt->entries[inst].init_state) + ddf_set_updates_pending(ddf, NULL); + + dprintf("ddf mark %d/%s (%d) %s %llu\n", inst, + guid_str(ddf->virt->entries[inst].guid), a->curr_state, + consistent?"clean":"dirty", + a->info.resync_start); + return consistent; +} + +static int get_bvd_state(const struct ddf_super *ddf, + const struct vd_config *vc) +{ + unsigned int i, n_bvd, working = 0; + unsigned int n_prim = be16_to_cpu(vc->prim_elmnt_count); + int pd, st, state; + char *avail = xcalloc(1, n_prim); + mdu_array_info_t array; + + layout_ddf2md(vc, &array); + + for (i = 0; i < n_prim; i++) { + if (!find_index_in_bvd(ddf, vc, i, &n_bvd)) + continue; + pd = find_phys(ddf, vc->phys_refnum[n_bvd]); + if (pd < 0) + continue; + st = be16_to_cpu(ddf->phys->entries[pd].state); + if ((st & (DDF_Online|DDF_Failed|DDF_Rebuilding)) + == DDF_Online) { + working++; + avail[i] = 1; + } + } + + state = DDF_state_degraded; + if (working == n_prim) + state = DDF_state_optimal; + else + switch (vc->prl) { + case DDF_RAID0: + case DDF_CONCAT: + case DDF_JBOD: + 
state = DDF_state_failed; + break; + case DDF_RAID1: + if (working == 0) + state = DDF_state_failed; + else if (working >= 2) + state = DDF_state_part_optimal; + break; + case DDF_RAID1E: + if (!enough(10, n_prim, array.layout, 1, avail)) + state = DDF_state_failed; + break; + case DDF_RAID4: + case DDF_RAID5: + if (working < n_prim - 1) + state = DDF_state_failed; + break; + case DDF_RAID6: + if (working < n_prim - 2) + state = DDF_state_failed; + else if (working == n_prim - 1) + state = DDF_state_part_optimal; + break; + } + return state; +} + +static int secondary_state(int state, int other, int seclevel) +{ + if (state == DDF_state_optimal && other == DDF_state_optimal) + return DDF_state_optimal; + if (seclevel == DDF_2MIRRORED) { + if (state == DDF_state_optimal || other == DDF_state_optimal) + return DDF_state_part_optimal; + if (state == DDF_state_failed && other == DDF_state_failed) + return DDF_state_failed; + return DDF_state_degraded; + } else { + if (state == DDF_state_failed || other == DDF_state_failed) + return DDF_state_failed; + if (state == DDF_state_degraded || other == DDF_state_degraded) + return DDF_state_degraded; + return DDF_state_part_optimal; + } +} + +static int get_svd_state(const struct ddf_super *ddf, const struct vcl *vcl) +{ + int state = get_bvd_state(ddf, &vcl->conf); + unsigned int i; + for (i = 1; i < vcl->conf.sec_elmnt_count; i++) { + state = secondary_state( + state, + get_bvd_state(ddf, vcl->other_bvds[i-1]), + vcl->conf.srl); + } + return state; +} + +/* + * The state of each disk is stored in the global phys_disk structure + * in phys_disk.entries[n].state. + * This makes various combinations awkward. + * - When a device fails in any array, it must be failed in all arrays + * that include a part of this device. + * - When a component is rebuilding, we cannot include it officially in the + * array unless this is the only array that uses the device. 
+ * + * So: when transitioning: + * Online -> failed, just set failed flag. monitor will propagate + * spare -> online, the device might need to be added to the array. + * spare -> failed, just set failed. Don't worry if in array or not. + */ +static void ddf_set_disk(struct active_array *a, int n, int state) +{ + struct ddf_super *ddf = a->container->sb; + unsigned int inst = a->info.container_member, n_bvd; + struct vcl *vcl; + struct vd_config *vc = find_vdcr(ddf, inst, (unsigned int)n, + &n_bvd, &vcl); + int pd; + struct mdinfo *mdi; + struct dl *dl; + int update = 0; + + dprintf("%d to %x\n", n, state); + if (vc == NULL) { + dprintf("ddf: cannot find instance %d!!\n", inst); + return; + } + /* Find the matching slot in 'info'. */ + for (mdi = a->info.devs; mdi; mdi = mdi->next) + if (mdi->disk.raid_disk == n) + break; + if (!mdi) { + pr_err("cannot find raid disk %d\n", n); + return; + } + + /* and find the 'dl' entry corresponding to that. */ + for (dl = ddf->dlist; dl; dl = dl->next) + if (mdi->state_fd >= 0 && + mdi->disk.major == dl->major && + mdi->disk.minor == dl->minor) + break; + if (!dl) { + pr_err("cannot find raid disk %d (%d/%d)\n", + n, mdi->disk.major, mdi->disk.minor); + return; + } + + pd = find_phys(ddf, vc->phys_refnum[n_bvd]); + if (pd < 0 || pd != dl->pdnum) { + /* disk doesn't currently exist or has changed. + * If it is now in_sync, insert it. */ + dprintf("phys disk not found for %d: %d/%d ref %08x\n", + dl->pdnum, dl->major, dl->minor, + be32_to_cpu(dl->disk.refnum)); + dprintf("array %u disk %u ref %08x pd %d\n", + inst, n_bvd, + be32_to_cpu(vc->phys_refnum[n_bvd]), pd); + if ((state & DS_INSYNC) && ! 
(state & DS_FAULTY) && + dl->pdnum >= 0) { + pd = dl->pdnum; + vc->phys_refnum[n_bvd] = dl->disk.refnum; + LBA_OFFSET(ddf, vc)[n_bvd] = + cpu_to_be64(mdi->data_offset); + be16_clear(ddf->phys->entries[pd].type, + cpu_to_be16(DDF_Global_Spare)); + be16_set(ddf->phys->entries[pd].type, + cpu_to_be16(DDF_Active_in_VD)); + update = 1; + } + } else { + be16 old = ddf->phys->entries[pd].state; + if (state & DS_FAULTY) + be16_set(ddf->phys->entries[pd].state, + cpu_to_be16(DDF_Failed)); + if (state & DS_INSYNC) { + be16_set(ddf->phys->entries[pd].state, + cpu_to_be16(DDF_Online)); + be16_clear(ddf->phys->entries[pd].state, + cpu_to_be16(DDF_Rebuilding)); + } + if (!be16_eq(old, ddf->phys->entries[pd].state)) + update = 1; + } + + dprintf("ddf: set_disk %d (%08x) to %x->%02x\n", n, + be32_to_cpu(dl->disk.refnum), state, + be16_to_cpu(ddf->phys->entries[pd].state)); + + /* Now we need to check the state of the array and update + * virtual_disk.entries[n].state. + * It needs to be one of "optimal", "degraded", "failed". + * I don't understand 'deleted' or 'missing'. + */ + state = get_svd_state(ddf, vcl); + + if (ddf->virt->entries[inst].state != + ((ddf->virt->entries[inst].state & ~DDF_state_mask) + | state)) { + ddf->virt->entries[inst].state = + (ddf->virt->entries[inst].state & ~DDF_state_mask) + | state; + update = 1; + } + if (update) + ddf_set_updates_pending(ddf, vc); +} + +static void ddf_sync_metadata(struct supertype *st) +{ + /* + * Write all data to all devices. + * Later, we might be able to track whether only local changes + * have been made, or whether any global data has been changed, + * but ddf is sufficiently weird that it probably always + * changes global data .... 
+ */ + struct ddf_super *ddf = st->sb; + if (!ddf->updates_pending) + return; + ddf->updates_pending = 0; + __write_init_super_ddf(st); + dprintf("ddf: sync_metadata\n"); +} + +static int del_from_conflist(struct vcl **list, const char *guid) +{ + struct vcl **p; + int found = 0; + for (p = list; p && *p; p = &((*p)->next)) + if (!memcmp((*p)->conf.guid, guid, DDF_GUID_LEN)) { + found = 1; + *p = (*p)->next; + } + return found; +} + +static int _kill_subarray_ddf(struct ddf_super *ddf, const char *guid) +{ + struct dl *dl; + unsigned int vdnum, i; + vdnum = find_vde_by_guid(ddf, guid); + if (vdnum == DDF_NOTFOUND) { + pr_err("could not find VD %s\n", guid_str(guid)); + return -1; + } + if (del_from_conflist(&ddf->conflist, guid) == 0) { + pr_err("could not find conf %s\n", guid_str(guid)); + return -1; + } + for (dl = ddf->dlist; dl; dl = dl->next) + for (i = 0; i < ddf->max_part; i++) + if (dl->vlist[i] != NULL && + !memcmp(dl->vlist[i]->conf.guid, guid, + DDF_GUID_LEN)) + dl->vlist[i] = NULL; + memset(ddf->virt->entries[vdnum].guid, 0xff, DDF_GUID_LEN); + dprintf("deleted %s\n", guid_str(guid)); + return 0; +} + +static int kill_subarray_ddf(struct supertype *st) +{ + struct ddf_super *ddf = st->sb; + /* + * currentconf is set in container_content_ddf, + * called with subarray arg + */ + struct vcl *victim = ddf->currentconf; + struct vd_config *conf; + unsigned int vdnum; + + ddf->currentconf = NULL; + if (!victim) { + pr_err("nothing to kill\n"); + return -1; + } + conf = &victim->conf; + vdnum = find_vde_by_guid(ddf, conf->guid); + if (vdnum == DDF_NOTFOUND) { + pr_err("could not find VD %s\n", guid_str(conf->guid)); + return -1; + } + if (st->update_tail) { + struct virtual_disk *vd; + int len = sizeof(struct virtual_disk) + + sizeof(struct virtual_entry); + vd = xmalloc(len); + if (vd == NULL) { + pr_err("failed to allocate %d bytes\n", len); + return -1; + } + memset(vd, 0 , len); + vd->magic = DDF_VIRT_RECORDS_MAGIC; + vd->populated_vdes = cpu_to_be16(0); 
+ memcpy(vd->entries[0].guid, conf->guid, DDF_GUID_LEN); + /* we use DDF_state_deleted as marker */ + vd->entries[0].state = DDF_state_deleted; + append_metadata_update(st, vd, len); + } else { + _kill_subarray_ddf(ddf, conf->guid); + ddf_set_updates_pending(ddf, NULL); + ddf_sync_metadata(st); + } + return 0; +} + +static void copy_matching_bvd(struct ddf_super *ddf, + struct vd_config *conf, + const struct metadata_update *update) +{ + unsigned int mppe = + be16_to_cpu(ddf->anchor.max_primary_element_entries); + unsigned int len = ddf->conf_rec_len * 512; + char *p; + struct vd_config *vc; + for (p = update->buf; p < update->buf + update->len; p += len) { + vc = (struct vd_config *) p; + if (vc->sec_elmnt_seq == conf->sec_elmnt_seq) { + memcpy(conf->phys_refnum, vc->phys_refnum, + mppe * (sizeof(__u32) + sizeof(__u64))); + return; + } + } + pr_err("no match for BVD %d of %s in update\n", + conf->sec_elmnt_seq, guid_str(conf->guid)); +} + +static void ddf_process_phys_update(struct supertype *st, + struct metadata_update *update) +{ + struct ddf_super *ddf = st->sb; + struct phys_disk *pd; + unsigned int ent; + + pd = (struct phys_disk*)update->buf; + ent = be16_to_cpu(pd->used_pdes); + if (ent >= be16_to_cpu(ddf->phys->max_pdes)) + return; + if (be16_and(pd->entries[0].state, cpu_to_be16(DDF_Missing))) { + struct dl **dlp; + /* removing this disk. 
*/ + be16_set(ddf->phys->entries[ent].state, + cpu_to_be16(DDF_Missing)); + for (dlp = &ddf->dlist; *dlp; dlp = &(*dlp)->next) { + struct dl *dl = *dlp; + if (dl->pdnum == (signed)ent) { + close(dl->fd); + dl->fd = -1; + *dlp = dl->next; + update->space = dl->devname; + *(void**)dl = update->space_list; + update->space_list = (void**)dl; + break; + } + } + ddf_set_updates_pending(ddf, NULL); + return; + } + if (!all_ff(ddf->phys->entries[ent].guid)) + return; + ddf->phys->entries[ent] = pd->entries[0]; + ddf->phys->used_pdes = cpu_to_be16 + (1 + be16_to_cpu(ddf->phys->used_pdes)); + ddf_set_updates_pending(ddf, NULL); + if (ddf->add_list) { + struct active_array *a; + struct dl *al = ddf->add_list; + ddf->add_list = al->next; + + al->next = ddf->dlist; + ddf->dlist = al; + + /* As a device has been added, we should check + * for any degraded devices that might make + * use of this spare */ + for (a = st->arrays ; a; a=a->next) + a->check_degraded = 1; + } +} + +static void ddf_process_virt_update(struct supertype *st, + struct metadata_update *update) +{ + struct ddf_super *ddf = st->sb; + struct virtual_disk *vd; + unsigned int ent; + + vd = (struct virtual_disk*)update->buf; + + if (vd->entries[0].state == DDF_state_deleted) { + if (_kill_subarray_ddf(ddf, vd->entries[0].guid)) + return; + } else { + ent = find_vde_by_guid(ddf, vd->entries[0].guid); + if (ent != DDF_NOTFOUND) { + dprintf("VD %s exists already in slot %d\n", + guid_str(vd->entries[0].guid), + ent); + return; + } + ent = find_unused_vde(ddf); + if (ent == DDF_NOTFOUND) + return; + ddf->virt->entries[ent] = vd->entries[0]; + ddf->virt->populated_vdes = + cpu_to_be16( + 1 + be16_to_cpu( + ddf->virt->populated_vdes)); + dprintf("added VD %s in slot %d(s=%02x i=%02x)\n", + guid_str(vd->entries[0].guid), ent, + ddf->virt->entries[ent].state, + ddf->virt->entries[ent].init_state); + } + ddf_set_updates_pending(ddf, NULL); +} + +static void ddf_remove_failed(struct ddf_super *ddf) +{ + /* Now remove any 
'Failed' devices that are not part + * of any VD. They will have the Transition flag set. + * Once done, we need to update all dl->pdnum numbers. + */ + unsigned int pdnum; + unsigned int pd2 = 0; + struct dl *dl; + + for (pdnum = 0; pdnum < be16_to_cpu(ddf->phys->max_pdes); + pdnum++) { + if (be32_to_cpu(ddf->phys->entries[pdnum].refnum) == + 0xFFFFFFFF) + continue; + if (be16_and(ddf->phys->entries[pdnum].state, + cpu_to_be16(DDF_Failed)) + && be16_and(ddf->phys->entries[pdnum].state, + cpu_to_be16(DDF_Transition))) { + /* skip this one unless in dlist*/ + for (dl = ddf->dlist; dl; dl = dl->next) + if (dl->pdnum == (int)pdnum) + break; + if (!dl) + continue; + } + if (pdnum == pd2) + pd2++; + else { + ddf->phys->entries[pd2] = + ddf->phys->entries[pdnum]; + for (dl = ddf->dlist; dl; dl = dl->next) + if (dl->pdnum == (int)pdnum) + dl->pdnum = pd2; + pd2++; + } + } + ddf->phys->used_pdes = cpu_to_be16(pd2); + while (pd2 < pdnum) { + memset(ddf->phys->entries[pd2].guid, 0xff, + DDF_GUID_LEN); + pd2++; + } +} + +static void ddf_update_vlist(struct ddf_super *ddf, struct dl *dl) +{ + struct vcl *vcl; + unsigned int vn = 0; + int in_degraded = 0; + + if (dl->pdnum < 0) + return; + for (vcl = ddf->conflist; vcl ; vcl = vcl->next) { + unsigned int dn, ibvd; + const struct vd_config *conf; + int vstate; + dn = get_pd_index_from_refnum(vcl, + dl->disk.refnum, + ddf->mppe, + &conf, &ibvd); + if (dn == DDF_NOTFOUND) + continue; + dprintf("dev %d/%08x has %s (sec=%u) at %d\n", + dl->pdnum, + be32_to_cpu(dl->disk.refnum), + guid_str(conf->guid), + conf->sec_elmnt_seq, vn); + /* Clear the Transition flag */ + if (be16_and + (ddf->phys->entries[dl->pdnum].state, + cpu_to_be16(DDF_Failed))) + be16_clear(ddf->phys + ->entries[dl->pdnum].state, + cpu_to_be16(DDF_Transition)); + dl->vlist[vn++] = vcl; + vstate = ddf->virt->entries[vcl->vcnum].state + & DDF_state_mask; + if (vstate == DDF_state_degraded || + vstate == DDF_state_part_optimal) + in_degraded = 1; + } + while (vn < 
ddf->max_part) + dl->vlist[vn++] = NULL; + if (dl->vlist[0]) { + be16_clear(ddf->phys->entries[dl->pdnum].type, + cpu_to_be16(DDF_Global_Spare)); + if (!be16_and(ddf->phys + ->entries[dl->pdnum].type, + cpu_to_be16(DDF_Active_in_VD))) { + be16_set(ddf->phys + ->entries[dl->pdnum].type, + cpu_to_be16(DDF_Active_in_VD)); + if (in_degraded) + be16_set(ddf->phys + ->entries[dl->pdnum] + .state, + cpu_to_be16 + (DDF_Rebuilding)); + } + } + if (dl->spare) { + be16_clear(ddf->phys->entries[dl->pdnum].type, + cpu_to_be16(DDF_Global_Spare)); + be16_set(ddf->phys->entries[dl->pdnum].type, + cpu_to_be16(DDF_Spare)); + } + if (!dl->vlist[0] && !dl->spare) { + be16_set(ddf->phys->entries[dl->pdnum].type, + cpu_to_be16(DDF_Global_Spare)); + be16_clear(ddf->phys->entries[dl->pdnum].type, + cpu_to_be16(DDF_Spare)); + be16_clear(ddf->phys->entries[dl->pdnum].type, + cpu_to_be16(DDF_Active_in_VD)); + } +} + +static void ddf_process_conf_update(struct supertype *st, + struct metadata_update *update) +{ + struct ddf_super *ddf = st->sb; + struct vd_config *vc; + struct vcl *vcl; + struct dl *dl; + unsigned int ent; + unsigned int pdnum, len; + + vc = (struct vd_config*)update->buf; + len = ddf->conf_rec_len * 512; + if ((unsigned int)update->len != len * vc->sec_elmnt_count) { + pr_err("%s: insufficient data (%d) for %u BVDs\n", + guid_str(vc->guid), update->len, + vc->sec_elmnt_count); + return; + } + for (vcl = ddf->conflist; vcl ; vcl = vcl->next) + if (memcmp(vcl->conf.guid, vc->guid, DDF_GUID_LEN) == 0) + break; + dprintf("conf update for %s (%s)\n", + guid_str(vc->guid), (vcl ? 
"old" : "new")); + if (vcl) { + /* An update, just copy the phys_refnum and lba_offset + * fields + */ + unsigned int i; + unsigned int k; + copy_matching_bvd(ddf, &vcl->conf, update); + for (k = 0; k < be16_to_cpu(vc->prim_elmnt_count); k++) + dprintf("BVD %u has %08x at %llu\n", 0, + be32_to_cpu(vcl->conf.phys_refnum[k]), + be64_to_cpu(LBA_OFFSET(ddf, + &vcl->conf)[k])); + for (i = 1; i < vc->sec_elmnt_count; i++) { + copy_matching_bvd(ddf, vcl->other_bvds[i-1], + update); + for (k = 0; k < be16_to_cpu( + vc->prim_elmnt_count); k++) + dprintf("BVD %u has %08x at %llu\n", i, + be32_to_cpu + (vcl->other_bvds[i-1]-> + phys_refnum[k]), + be64_to_cpu + (LBA_OFFSET + (ddf, + vcl->other_bvds[i-1])[k])); + } + } else { + /* A new VD_CONF */ + unsigned int i; + if (!update->space) + return; + vcl = update->space; + update->space = NULL; + vcl->next = ddf->conflist; + memcpy(&vcl->conf, vc, len); + ent = find_vde_by_guid(ddf, vc->guid); + if (ent == DDF_NOTFOUND) + return; + vcl->vcnum = ent; + ddf->conflist = vcl; + for (i = 1; i < vc->sec_elmnt_count; i++) + memcpy(vcl->other_bvds[i-1], + update->buf + len * i, len); + } + /* Set DDF_Transition on all Failed devices - to help + * us detect those that are no longer in use + */ + for (pdnum = 0; pdnum < be16_to_cpu(ddf->phys->max_pdes); + pdnum++) + if (be16_and(ddf->phys->entries[pdnum].state, + cpu_to_be16(DDF_Failed))) + be16_set(ddf->phys->entries[pdnum].state, + cpu_to_be16(DDF_Transition)); + + /* Now make sure vlist is correct for each dl. */ + for (dl = ddf->dlist; dl; dl = dl->next) + ddf_update_vlist(ddf, dl); + ddf_remove_failed(ddf); + + ddf_set_updates_pending(ddf, vc); +} + +static void ddf_process_update(struct supertype *st, + struct metadata_update *update) +{ + /* Apply this update to the metadata. + * The first 4 bytes are a DDF_*_MAGIC which guides + * our actions. + * Possible update are: + * DDF_PHYS_RECORDS_MAGIC + * Add a new physical device or remove an old one. 
+ * Changes to this record only happen implicitly. + * used_pdes is the device number. + * DDF_VIRT_RECORDS_MAGIC + * Add a new VD. Possibly also change the 'access' bits. + * populated_vdes is the entry number. + * DDF_VD_CONF_MAGIC + * New or updated VD. the VIRT_RECORD must already + * exist. For an update, phys_refnum and lba_offset + * (at least) are updated, and the VD_CONF must + * be written to precisely those devices listed with + * a phys_refnum. + * DDF_SPARE_ASSIGN_MAGIC + * replacement Spare Assignment Record... but for which device? + * + * So, e.g.: + * - to create a new array, we send a VIRT_RECORD and + * a VD_CONF. Then assemble and start the array. + * - to activate a spare we send a VD_CONF to add the phys_refnum + * and offset. This will also mark the spare as active with + * a spare-assignment record. + */ + be32 *magic = (be32 *)update->buf; + + dprintf("Process update %x\n", be32_to_cpu(*magic)); + + if (be32_eq(*magic, DDF_PHYS_RECORDS_MAGIC)) { + if (update->len == (sizeof(struct phys_disk) + + sizeof(struct phys_disk_entry))) + ddf_process_phys_update(st, update); + } else if (be32_eq(*magic, DDF_VIRT_RECORDS_MAGIC)) { + if (update->len == (sizeof(struct virtual_disk) + + sizeof(struct virtual_entry))) + ddf_process_virt_update(st, update); + } else if (be32_eq(*magic, DDF_VD_CONF_MAGIC)) { + ddf_process_conf_update(st, update); + } + /* case DDF_SPARE_ASSIGN_MAGIC */ +} + +static int ddf_prepare_update(struct supertype *st, + struct metadata_update *update) +{ + /* This update arrived at managemon. + * We are about to pass it to monitor. + * If a malloc is needed, do it here. 
+ */ + struct ddf_super *ddf = st->sb; + be32 *magic; + if (update->len < 4) + return 0; + magic = (be32 *)update->buf; + if (be32_eq(*magic, DDF_VD_CONF_MAGIC)) { + struct vcl *vcl; + struct vd_config *conf; + if (update->len < (int)sizeof(*conf)) + return 0; + conf = (struct vd_config *) update->buf; + if (posix_memalign(&update->space, 512, + offsetof(struct vcl, conf) + + ddf->conf_rec_len * 512) != 0) { + update->space = NULL; + return 0; + } + vcl = update->space; + vcl->conf.sec_elmnt_count = conf->sec_elmnt_count; + if (alloc_other_bvds(ddf, vcl) != 0) { + free(update->space); + update->space = NULL; + return 0; + } + } + return 1; +} + +/* + * Check degraded state of a RAID10. + * returns 2 for good, 1 for degraded, 0 for failed, and -1 for error + */ +static int raid10_degraded(struct mdinfo *info) +{ + int n_prim, n_bvds; + int i; + struct mdinfo *d; + char *found; + int ret = -1; + + n_prim = info->array.layout & ~0x100; + n_bvds = info->array.raid_disks / n_prim; + found = xmalloc(n_bvds); + if (found == NULL) + return ret; + memset(found, 0, n_bvds); + for (d = info->devs; d; d = d->next) { + i = d->disk.raid_disk / n_prim; + if (i >= n_bvds) { + pr_err("BUG: invalid raid disk\n"); + goto out; + } + if (d->state_fd > 0) + found[i]++; + } + ret = 2; + for (i = 0; i < n_bvds; i++) + if (!found[i]) { + dprintf("BVD %d/%d failed\n", i, n_bvds); + ret = 0; + goto out; + } else if (found[i] < n_prim) { + dprintf("BVD %d/%d degraded\n", i, n_bvds); + ret = 1; + } +out: + free(found); + return ret; +} + +/* + * Check if the array 'a' is degraded but not failed. + * If it is, find as many spares as are available and needed and + * arrange for their inclusion. + * We only choose devices which are not already in the array, + * and prefer those with a spare-assignment to this array. + * Otherwise we choose global spares - assuming always that + * there is enough room. 
+ * For each spare that we assign, we return an 'mdinfo' which + * describes the position for the device in the array. + * We also add to 'updates' a DDF_VD_CONF_MAGIC update with + * the new phys_refnum and lba_offset values. + * + * Only worry about BVDs at the moment. + */ +static struct mdinfo *ddf_activate_spare(struct active_array *a, + struct metadata_update **updates) +{ + int working = 0; + struct mdinfo *d; + struct ddf_super *ddf = a->container->sb; + int global_ok = 0; + struct mdinfo *rv = NULL; + struct mdinfo *di; + struct metadata_update *mu; + struct dl *dl; + int i; + unsigned int j; + struct vcl *vcl; + struct vd_config *vc; + unsigned int n_bvd; + + for (d = a->info.devs ; d ; d = d->next) { + if ((d->curr_state & DS_FAULTY) && + d->state_fd >= 0) + /* wait for Removal to happen */ + return NULL; + if (d->state_fd >= 0) + working ++; + } + + dprintf("working=%d (%d) level=%d\n", working, + a->info.array.raid_disks, + a->info.array.level); + if (working == a->info.array.raid_disks) + return NULL; /* array not degraded */ + switch (a->info.array.level) { + case 1: + if (working == 0) + return NULL; /* failed */ + break; + case 4: + case 5: + if (working < a->info.array.raid_disks - 1) + return NULL; /* failed */ + break; + case 6: + if (working < a->info.array.raid_disks - 2) + return NULL; /* failed */ + break; + case 10: + if (raid10_degraded(&a->info) < 1) + return NULL; + break; + default: /* concat or stripe */ + return NULL; /* failed */ + } + + /* For each slot, if it is not working, find a spare */ + dl = ddf->dlist; + for (i = 0; i < a->info.array.raid_disks; i++) { + for (d = a->info.devs ; d ; d = d->next) + if (d->disk.raid_disk == i) + break; + dprintf("found %d: %p %x\n", i, d, d?d->curr_state:0); + if (d && (d->state_fd >= 0)) + continue; + + /* OK, this device needs recovery. 
Find a spare */ + again: + for ( ; dl ; dl = dl->next) { + unsigned long long esize; + unsigned long long pos; + struct mdinfo *d2; + int is_global = 0; + int is_dedicated = 0; + be16 state; + + if (dl->pdnum < 0) + continue; + state = ddf->phys->entries[dl->pdnum].state; + if (be16_and(state, + cpu_to_be16(DDF_Failed|DDF_Missing)) || + !be16_and(state, + cpu_to_be16(DDF_Online))) + continue; + + /* If in this array, skip */ + for (d2 = a->info.devs ; d2 ; d2 = d2->next) + if (d2->state_fd >= 0 && + d2->disk.major == dl->major && + d2->disk.minor == dl->minor) { + dprintf("%x:%x (%08x) already in array\n", + dl->major, dl->minor, + be32_to_cpu(dl->disk.refnum)); + break; + } + if (d2) + continue; + if (be16_and(ddf->phys->entries[dl->pdnum].type, + cpu_to_be16(DDF_Spare))) { + /* Check spare assign record */ + if (dl->spare) { + if (dl->spare->type & DDF_spare_dedicated) { + /* check spare_ents for guid */ + unsigned int j; + for (j = 0 ; + j < be16_to_cpu + (dl->spare + ->populated); + j++) { + if (memcmp(dl->spare->spare_ents[j].guid, + ddf->virt->entries[a->info.container_member].guid, + DDF_GUID_LEN) == 0) + is_dedicated = 1; + } + } else + is_global = 1; + } + } else if (be16_and(ddf->phys->entries[dl->pdnum].type, + cpu_to_be16(DDF_Global_Spare))) { + is_global = 1; + } else if (!be16_and(ddf->phys + ->entries[dl->pdnum].state, + cpu_to_be16(DDF_Failed))) { + /* we can possibly use some of this */ + is_global = 1; + } + if ( ! (is_dedicated || + (is_global && global_ok))) { + dprintf("%x:%x not suitable: %d %d\n", dl->major, dl->minor, + is_dedicated, is_global); + continue; + } + + /* We are allowed to use this device - is there space? 
+ * We need a->info.component_size sectors */ + esize = a->info.component_size; + pos = find_space(ddf, dl, INVALID_SECTORS, &esize); + + if (esize < a->info.component_size) { + dprintf("%x:%x has no room: %llu %llu\n", + dl->major, dl->minor, + esize, a->info.component_size); + /* No room */ + continue; + } + + /* Cool, we have a device with some space at pos */ + di = xcalloc(1, sizeof(*di)); + di->disk.number = i; + di->disk.raid_disk = i; + di->disk.major = dl->major; + di->disk.minor = dl->minor; + di->disk.state = 0; + di->recovery_start = 0; + di->data_offset = pos; + di->component_size = a->info.component_size; + di->next = rv; + rv = di; + dprintf("%x:%x (%08x) to be %d at %llu\n", + dl->major, dl->minor, + be32_to_cpu(dl->disk.refnum), i, pos); + + break; + } + if (!dl && ! global_ok) { + /* not enough dedicated spares, try global */ + global_ok = 1; + dl = ddf->dlist; + goto again; + } + } + + if (!rv) + /* No spares found */ + return rv; + /* Now 'rv' has a list of devices to return. 
+ * Create a metadata_update record to update the + * phys_refnum and lba_offset values + */ + vc = find_vdcr(ddf, a->info.container_member, rv->disk.raid_disk, + &n_bvd, &vcl); + if (vc == NULL) + return NULL; + + mu = xmalloc(sizeof(*mu)); + if (posix_memalign(&mu->space, 512, sizeof(struct vcl)) != 0) { + free(mu); + mu = NULL; + } + + mu->len = ddf->conf_rec_len * 512 * vcl->conf.sec_elmnt_count; + mu->buf = xmalloc(mu->len); + mu->space = NULL; + mu->space_list = NULL; + mu->next = *updates; + memcpy(mu->buf, &vcl->conf, ddf->conf_rec_len * 512); + for (j = 1; j < vcl->conf.sec_elmnt_count; j++) + memcpy(mu->buf + j * ddf->conf_rec_len * 512, + vcl->other_bvds[j-1], ddf->conf_rec_len * 512); + + vc = (struct vd_config*)mu->buf; + for (di = rv ; di ; di = di->next) { + unsigned int i_sec, i_prim; + i_sec = di->disk.raid_disk + / be16_to_cpu(vcl->conf.prim_elmnt_count); + i_prim = di->disk.raid_disk + % be16_to_cpu(vcl->conf.prim_elmnt_count); + vc = (struct vd_config *)(mu->buf + + i_sec * ddf->conf_rec_len * 512); + for (dl = ddf->dlist; dl; dl = dl->next) + if (dl->major == di->disk.major + && dl->minor == di->disk.minor) + break; + if (!dl || dl->pdnum < 0) { + pr_err("BUG: can't find disk %d (%d/%d)\n", + di->disk.raid_disk, + di->disk.major, di->disk.minor); + return NULL; + } + vc->phys_refnum[i_prim] = ddf->phys->entries[dl->pdnum].refnum; + LBA_OFFSET(ddf, vc)[i_prim] = cpu_to_be64(di->data_offset); + dprintf("BVD %u gets %u: %08x at %llu\n", i_sec, i_prim, + be32_to_cpu(vc->phys_refnum[i_prim]), + be64_to_cpu(LBA_OFFSET(ddf, vc)[i_prim])); + } + *updates = mu; + return rv; +} +#endif /* MDASSEMBLE */ + +static int ddf_level_to_layout(int level) +{ + switch(level) { + case 0: + case 1: + return 0; + case 5: + return ALGORITHM_LEFT_SYMMETRIC; + case 6: + return ALGORITHM_ROTATING_N_CONTINUE; + case 10: + return 0x102; + default: + return UnSet; + } +} + +static void default_geometry_ddf(struct supertype *st, int *level, int *layout, int *chunk) +{ + if 
(level && *level == UnSet) + *level = LEVEL_CONTAINER; + + if (level && layout && *layout == UnSet) + *layout = ddf_level_to_layout(*level); +} + +struct superswitch super_ddf = { +#ifndef MDASSEMBLE + .examine_super = examine_super_ddf, + .brief_examine_super = brief_examine_super_ddf, + .brief_examine_subarrays = brief_examine_subarrays_ddf, + .export_examine_super = export_examine_super_ddf, + .detail_super = detail_super_ddf, + .brief_detail_super = brief_detail_super_ddf, + .validate_geometry = validate_geometry_ddf, + .write_init_super = write_init_super_ddf, + .add_to_super = add_to_super_ddf, + .remove_from_super = remove_from_super_ddf, + .load_container = load_container_ddf, + .copy_metadata = copy_metadata_ddf, + .kill_subarray = kill_subarray_ddf, +#endif + .match_home = match_home_ddf, + .uuid_from_super= uuid_from_super_ddf, + .getinfo_super = getinfo_super_ddf, + .update_super = update_super_ddf, + + .avail_size = avail_size_ddf, + + .compare_super = compare_super_ddf, + + .load_super = load_super_ddf, + .init_super = init_super_ddf, + .store_super = store_super_ddf, + .free_super = free_super_ddf, + .match_metadata_desc = match_metadata_desc_ddf, + .container_content = container_content_ddf, + .default_geometry = default_geometry_ddf, + + .external = 1, + +#ifndef MDASSEMBLE +/* for mdmon */ + .open_new = ddf_open_new, + .set_array_state= ddf_set_array_state, + .set_disk = ddf_set_disk, + .sync_metadata = ddf_sync_metadata, + .process_update = ddf_process_update, + .prepare_update = ddf_prepare_update, + .activate_spare = ddf_activate_spare, +#endif + .name = "ddf", +}; diff --git a/super-gpt.c b/super-gpt.c new file mode 100644 index 00000000..1a2adce0 --- /dev/null +++ b/super-gpt.c @@ -0,0 +1,216 @@ +/* + * mdadm - manage Linux "md" devices aka RAID arrays. 
+ * + * Copyright (C) 2010 Neil Brown + * + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Author: Neil Brown + * Email: + * + */ + +/* + * 'gpt' is a pseudo metadata type for devices which have a + * GPT partition table. + * + * Obviously arrays cannot be created or assembled for this type. + * It is used to allow a new bare device to have a partition table + * added so the member partitions can then be included in other + * arrays as relevant. 
+ *
+ * The meaningful operations are:
+ *   examine_super, but not brief_examine_super or export_examine
+ *   load_super
+ *   store_super
+ */
+
+#include "mdadm.h"
+#include "part.h"
+
+/* Release the buffer attached to 'st' (if any) and clear the pointer. */
+static void free_gpt(struct supertype *st)
+{
+	free(st->sb);
+	st->sb = NULL;
+}
+
+#ifndef MDASSEMBLE
+/* Print the GPT header fields and one line per partition entry.
+ * The GPT header is taken from st->sb + 512 and the entry table from
+ * st->sb + 1024, i.e. the buffer layout produced by load_gpt().
+ * 'homehost' is not used.
+ */
+static void examine_gpt(struct supertype *st, char *homehost)
+{
+	struct GPT *gpt = st->sb + 512;
+	struct GPT_part_entry *gpe = st->sb + 1024;
+	unsigned int i;
+
+	printf(" GPT Magic : %llx\n", (unsigned long long)__le64_to_cpu(gpt->magic));
+	printf(" GPT Revision : %ld\n", (long)__le32_to_cpu(gpt->revision));
+	for (i = 0; i < __le32_to_cpu(gpt->part_cnt); i++) {
+		unsigned long long start =
+			(unsigned long long)__le64_to_cpu(gpe[i].starting_lba);
+		unsigned long long end =
+			(unsigned long long)__le64_to_cpu(gpe[i].ending_lba);
+
+		/* the format reads "<length> sectors at <start>", so the
+		 * length (ending_lba is inclusive) has to come first
+		 */
+		printf(" Partition[%02u] : %12llu sectors at %12llu\n",
+		       i, end - start + 1, start);
+	}
+}
+#endif /* MDASSEMBLE */
+
+/* Read the protective MBR, the GPT header and the partition entry table
+ * from 'fd' into one freshly allocated buffer and attach it to 'st'.
+ *
+ * Buffer layout: struct MBR, then struct GPT, then the entry table
+ * rounded up to a multiple of 512 bytes.
+ *
+ * Returns 0 on success.  Returns 1 when the buffer cannot be allocated,
+ * when a read comes up short, or when no usable GPT is found; in the last
+ * two cases a message is printed only if 'devname' is non-NULL.
+ */
+static int load_gpt(struct supertype *st, int fd, char *devname)
+{
+	const size_t buf_size = 32*512;
+	struct MBR *super;
+	struct GPT *gpt_head;
+	int to_read;
+
+	free_gpt(st);
+
+	if (posix_memalign((void**)&super, 4096, buf_size) != 0) {
+		pr_err("could not allocate superblock\n");
+		return 1;
+	}
+
+	lseek(fd, 0, 0);
+	if (read(fd, super, sizeof(*super)) != sizeof(*super))
+		goto no_read;
+
+	if (super->magic != MBR_SIGNATURE_MAGIC ||
+	    super->parts[0].part_type != MBR_GPT_PARTITION_TYPE)
+		goto not_found;
+
+	/* Seem to have GPT, load the header */
+	gpt_head = (struct GPT*)(super+1);
+	if (read(fd, gpt_head, sizeof(*gpt_head)) != sizeof(*gpt_head))
+		goto no_read;
+	if (gpt_head->magic != GPT_SIGNATURE_MAGIC)
+		goto not_found;
+	if (__le32_to_cpu(gpt_head->part_cnt) >= 128)
+		goto not_found;
+
+	to_read = __le32_to_cpu(gpt_head->part_cnt) * sizeof(struct GPT_part_entry);
+	to_read = ((to_read+511)/512) * 512;
+	/* the entry table lands behind the MBR and the GPT header, so all
+	 * three together must fit in the allocation or read() would write
+	 * past the end of the buffer
+	 */
+	if (sizeof(*super) + sizeof(*gpt_head) + to_read > buf_size)
+		goto not_found;
+	if (read(fd, gpt_head+1, to_read) != to_read)
+		goto no_read;
+
+	st->sb = super;
+
+	if (st->ss == NULL) {
+		st->ss = &gpt;
+		st->minor_version = 0;
+		st->max_devs = 1;
+		st->info = NULL;
+	}
+	return 0;
+
+no_read:
+	if (devname)
+		pr_err("Cannot read partition table on %s\n",
+		       devname);
+	free(super);
+	return 1;
+
+not_found:
+	if (devname)
+		pr_err("No partition table found on %s\n",
+		       devname);
+	free(super);
+	return 1;
+}
+
+/* Write the buffer held in st->sb (MBR, GPT header, entry table) to the
+ * start of 'fd', then fsync and ask the kernel to re-read the partition
+ * table.  Returns 0 on success, 4 if the write comes up short.
+ */
+static int store_gpt(struct supertype *st, int fd)
+{
+	/* FIXME should I save the boot loader */
+	/* need to write two copies! */
+	/* FIXME allow for blocks != 512 bytes
+	 *etc
+	 */
+	struct MBR *super = st->sb;
+	struct GPT *gpt;
+	int to_write;
+
+	gpt = (struct GPT*)(super+1);
+
+	to_write = __le32_to_cpu(gpt->part_cnt) * sizeof(struct GPT_part_entry);
+	to_write = ((to_write+511)/512) * 512;
+	/* the write starts at the MBR, so the MBR and the GPT header that
+	 * sit in front of the entry table must be counted as well;
+	 * otherwise the tail of the buffer never reaches the disk
+	 */
+	to_write += sizeof(*super) + sizeof(*gpt);
+
+	lseek(fd, 0, 0);
+	if (write(fd, st->sb, to_write) != to_write)
+		return 4;
+
+	fsync(fd);
+	ioctl(fd, BLKRRPART, 0);
+	return 0;
+}
+
+/* Fill 'info' for a GPT pseudo-array: array and disk are zeroed, the
+ * version and name are set to "gpt", and component_size becomes the
+ * largest ending_lba found in the entry table.  'map' is not used.
+ */
+static void getinfo_gpt(struct supertype *st, struct mdinfo *info, char *map)
+{
+	struct GPT *gpt = st->sb + 512;
+	struct GPT_part_entry *gpe = st->sb + 1024;
+	unsigned int i;
+
+	memset(&info->array, 0, sizeof(info->array));
+	memset(&info->disk, 0, sizeof(info->disk));
+	strcpy(info->text_version, "gpt");
+	strcpy(info->name, "gpt");
+	info->component_size = 0;
+
+	for (i = 0; i < __le32_to_cpu(gpt->part_cnt); i++) {
+		unsigned long long last =
+			(unsigned long long)__le64_to_cpu(gpe[i].ending_lba);
+		if (last > info->component_size)
+			info->component_size = last;
+	}
+}
+
+/* Return a new supertype bound to the gpt handler when 'arg' is exactly
+ * "gpt", otherwise NULL.
+ */
+static struct supertype *match_metadata_desc(char *arg)
+{
+	struct supertype *st;
+
+	/* compare first so that nothing is allocated for a non-match */
+	if (strcmp(arg, "gpt") != 0)
+		return NULL;
+
+	st = xmalloc(sizeof(*st));
+	if (!st)
+		return st;
+
+	st->ss = &gpt;
+	st->info = NULL;
+	st->minor_version = 0;
+	st->max_devs = 1;
+	st->sb = NULL;
+	return st;
+}
+
+#ifndef MDASSEMBLE
+static int validate_geometry(struct supertype *st, int level,
+			     int layout, int raiddisks,
+			     int *chunk, unsigned long long size,
+			     unsigned long long data_offset,
+			     char *subdev, unsigned long long *freesize,
+			     int verbose)
+{
+	pr_err("gpt metadata cannot be used this way\n");
+	
return 0; +} +#endif + +struct superswitch gpt = { +#ifndef MDASSEMBLE + .examine_super = examine_gpt, + .validate_geometry = validate_geometry, +#endif + .match_metadata_desc = match_metadata_desc, + .load_super = load_gpt, + .store_super = store_gpt, + .getinfo_super = getinfo_gpt, + .free_super = free_gpt, + .name = "gpt", +}; diff --git a/super-intel.c b/super-intel.c new file mode 100644 index 00000000..90b7b6de --- /dev/null +++ b/super-intel.c @@ -0,0 +1,10765 @@ +/* + * mdadm - Intel(R) Matrix Storage Manager Support + * + * Copyright (C) 2002-2008 Intel Corporation + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. + */ + +#define HAVE_STDINT_H 1 +#include "mdadm.h" +#include "mdmon.h" +#include "sha1.h" +#include "platform-intel.h" +#include +#include +#include +#include + +/* MPB == Metadata Parameter Block */ +#define MPB_SIGNATURE "Intel Raid ISM Cfg Sig. 
" +#define MPB_SIG_LEN (strlen(MPB_SIGNATURE)) +#define MPB_VERSION_RAID0 "1.0.00" +#define MPB_VERSION_RAID1 "1.1.00" +#define MPB_VERSION_MANY_VOLUMES_PER_ARRAY "1.2.00" +#define MPB_VERSION_3OR4_DISK_ARRAY "1.2.01" +#define MPB_VERSION_RAID5 "1.2.02" +#define MPB_VERSION_5OR6_DISK_ARRAY "1.2.04" +#define MPB_VERSION_CNG "1.2.06" +#define MPB_VERSION_ATTRIBS "1.3.00" +#define MAX_SIGNATURE_LENGTH 32 +#define MAX_RAID_SERIAL_LEN 16 + +/* supports RAID0 */ +#define MPB_ATTRIB_RAID0 __cpu_to_le32(0x00000001) +/* supports RAID1 */ +#define MPB_ATTRIB_RAID1 __cpu_to_le32(0x00000002) +/* supports RAID10 */ +#define MPB_ATTRIB_RAID10 __cpu_to_le32(0x00000004) +/* supports RAID1E */ +#define MPB_ATTRIB_RAID1E __cpu_to_le32(0x00000008) +/* supports RAID5 */ +#define MPB_ATTRIB_RAID5 __cpu_to_le32(0x00000010) +/* supports RAID CNG */ +#define MPB_ATTRIB_RAIDCNG __cpu_to_le32(0x00000020) +/* supports expanded stripe sizes of 256K, 512K and 1MB */ +#define MPB_ATTRIB_EXP_STRIPE_SIZE __cpu_to_le32(0x00000040) + +/* The OROM Support RST Caching of Volumes */ +#define MPB_ATTRIB_NVM __cpu_to_le32(0x02000000) +/* The OROM supports creating disks greater than 2TB */ +#define MPB_ATTRIB_2TB_DISK __cpu_to_le32(0x04000000) +/* The OROM supports Bad Block Management */ +#define MPB_ATTRIB_BBM __cpu_to_le32(0x08000000) + +/* THe OROM Supports NVM Caching of Volumes */ +#define MPB_ATTRIB_NEVER_USE2 __cpu_to_le32(0x10000000) +/* The OROM supports creating volumes greater than 2TB */ +#define MPB_ATTRIB_2TB __cpu_to_le32(0x20000000) +/* originally for PMP, now it's wasted b/c. Never use this bit! 
*/ +#define MPB_ATTRIB_NEVER_USE __cpu_to_le32(0x40000000) +/* Verify MPB contents against checksum after reading MPB */ +#define MPB_ATTRIB_CHECKSUM_VERIFY __cpu_to_le32(0x80000000) + +/* Define all supported attributes that have to be accepted by mdadm + */ +#define MPB_ATTRIB_SUPPORTED (MPB_ATTRIB_CHECKSUM_VERIFY | \ + MPB_ATTRIB_2TB | \ + MPB_ATTRIB_2TB_DISK | \ + MPB_ATTRIB_RAID0 | \ + MPB_ATTRIB_RAID1 | \ + MPB_ATTRIB_RAID10 | \ + MPB_ATTRIB_RAID5 | \ + MPB_ATTRIB_EXP_STRIPE_SIZE) + +/* Define attributes that are unused but not harmful */ +#define MPB_ATTRIB_IGNORED (MPB_ATTRIB_NEVER_USE) + +#define MPB_SECTOR_CNT 2210 +#define IMSM_RESERVED_SECTORS 4096 +#define NUM_BLOCKS_DIRTY_STRIPE_REGION 2056 +#define SECT_PER_MB_SHIFT 11 + +/* Disk configuration info. */ +#define IMSM_MAX_DEVICES 255 +struct imsm_disk { + __u8 serial[MAX_RAID_SERIAL_LEN];/* 0xD8 - 0xE7 ascii serial number */ + __u32 total_blocks_lo; /* 0xE8 - 0xEB total blocks lo */ + __u32 scsi_id; /* 0xEC - 0xEF scsi ID */ +#define SPARE_DISK __cpu_to_le32(0x01) /* Spare */ +#define CONFIGURED_DISK __cpu_to_le32(0x02) /* Member of some RaidDev */ +#define FAILED_DISK __cpu_to_le32(0x04) /* Permanent failure */ + __u32 status; /* 0xF0 - 0xF3 */ + __u32 owner_cfg_num; /* which config 0,1,2... owns this disk */ + __u32 total_blocks_hi; /* 0xF4 - 0xF5 total blocks hi */ +#define IMSM_DISK_FILLERS 3 + __u32 filler[IMSM_DISK_FILLERS]; /* 0xF5 - 0x107 MPB_DISK_FILLERS for future expansion */ +}; + +/* map selector for map managment + */ +#define MAP_0 0 +#define MAP_1 1 +#define MAP_X -1 + +/* RAID map configuration infos. 
*/ +struct imsm_map { + __u32 pba_of_lba0_lo; /* start address of partition */ + __u32 blocks_per_member_lo;/* blocks per member */ + __u32 num_data_stripes_lo; /* number of data stripes */ + __u16 blocks_per_strip; + __u8 map_state; /* Normal, Uninitialized, Degraded, Failed */ +#define IMSM_T_STATE_NORMAL 0 +#define IMSM_T_STATE_UNINITIALIZED 1 +#define IMSM_T_STATE_DEGRADED 2 +#define IMSM_T_STATE_FAILED 3 + __u8 raid_level; +#define IMSM_T_RAID0 0 +#define IMSM_T_RAID1 1 +#define IMSM_T_RAID5 5 /* since metadata version 1.2.02 ? */ + __u8 num_members; /* number of member disks */ + __u8 num_domains; /* number of parity domains */ + __u8 failed_disk_num; /* valid only when state is degraded */ + __u8 ddf; + __u32 pba_of_lba0_hi; + __u32 blocks_per_member_hi; + __u32 num_data_stripes_hi; + __u32 filler[4]; /* expansion area */ +#define IMSM_ORD_REBUILD (1 << 24) + __u32 disk_ord_tbl[1]; /* disk_ord_tbl[num_members], + * top byte contains some flags + */ +} __attribute__ ((packed)); + +struct imsm_vol { + __u32 curr_migr_unit; + __u32 checkpoint_id; /* id to access curr_migr_unit */ + __u8 migr_state; /* Normal or Migrating */ +#define MIGR_INIT 0 +#define MIGR_REBUILD 1 +#define MIGR_VERIFY 2 /* analagous to echo check > sync_action */ +#define MIGR_GEN_MIGR 3 +#define MIGR_STATE_CHANGE 4 +#define MIGR_REPAIR 5 + __u8 migr_type; /* Initializing, Rebuilding, ... 
*/ + __u8 dirty; + __u8 fs_state; /* fast-sync state for CnG (0xff == disabled) */ + __u16 verify_errors; /* number of mismatches */ + __u16 bad_blocks; /* number of bad blocks during verify */ + __u32 filler[4]; + struct imsm_map map[1]; + /* here comes another one if migr_state */ +} __attribute__ ((packed)); + +struct imsm_dev { + __u8 volume[MAX_RAID_SERIAL_LEN]; + __u32 size_low; + __u32 size_high; +#define DEV_BOOTABLE __cpu_to_le32(0x01) +#define DEV_BOOT_DEVICE __cpu_to_le32(0x02) +#define DEV_READ_COALESCING __cpu_to_le32(0x04) +#define DEV_WRITE_COALESCING __cpu_to_le32(0x08) +#define DEV_LAST_SHUTDOWN_DIRTY __cpu_to_le32(0x10) +#define DEV_HIDDEN_AT_BOOT __cpu_to_le32(0x20) +#define DEV_CURRENTLY_HIDDEN __cpu_to_le32(0x40) +#define DEV_VERIFY_AND_FIX __cpu_to_le32(0x80) +#define DEV_MAP_STATE_UNINIT __cpu_to_le32(0x100) +#define DEV_NO_AUTO_RECOVERY __cpu_to_le32(0x200) +#define DEV_CLONE_N_GO __cpu_to_le32(0x400) +#define DEV_CLONE_MAN_SYNC __cpu_to_le32(0x800) +#define DEV_CNG_MASTER_DISK_NUM __cpu_to_le32(0x1000) + __u32 status; /* Persistent RaidDev status */ + __u32 reserved_blocks; /* Reserved blocks at beginning of volume */ + __u8 migr_priority; + __u8 num_sub_vols; + __u8 tid; + __u8 cng_master_disk; + __u16 cache_policy; + __u8 cng_state; + __u8 cng_sub_state; +#define IMSM_DEV_FILLERS 10 + __u32 filler[IMSM_DEV_FILLERS]; + struct imsm_vol vol; +} __attribute__ ((packed)); + +struct imsm_super { + __u8 sig[MAX_SIGNATURE_LENGTH]; /* 0x00 - 0x1F */ + __u32 check_sum; /* 0x20 - 0x23 MPB Checksum */ + __u32 mpb_size; /* 0x24 - 0x27 Size of MPB */ + __u32 family_num; /* 0x28 - 0x2B Checksum from first time this config was written */ + __u32 generation_num; /* 0x2C - 0x2F Incremented each time this array's MPB is written */ + __u32 error_log_size; /* 0x30 - 0x33 in bytes */ + __u32 attributes; /* 0x34 - 0x37 */ + __u8 num_disks; /* 0x38 Number of configured disks */ + __u8 num_raid_devs; /* 0x39 Number of configured volumes */ + __u8 error_log_pos; 
/* 0x3A */ + __u8 fill[1]; /* 0x3B */ + __u32 cache_size; /* 0x3c - 0x40 in mb */ + __u32 orig_family_num; /* 0x40 - 0x43 original family num */ + __u32 pwr_cycle_count; /* 0x44 - 0x47 simulated power cycle count for array */ + __u32 bbm_log_size; /* 0x48 - 0x4B - size of bad Block Mgmt Log in bytes */ +#define IMSM_FILLERS 35 + __u32 filler[IMSM_FILLERS]; /* 0x4C - 0xD7 RAID_MPB_FILLERS */ + struct imsm_disk disk[1]; /* 0xD8 diskTbl[numDisks] */ + /* here comes imsm_dev[num_raid_devs] */ + /* here comes BBM logs */ +} __attribute__ ((packed)); + +#define BBM_LOG_MAX_ENTRIES 254 + +struct bbm_log_entry { + __u64 defective_block_start; +#define UNREADABLE 0xFFFFFFFF + __u32 spare_block_offset; + __u16 remapped_marked_count; + __u16 disk_ordinal; +} __attribute__ ((__packed__)); + +struct bbm_log { + __u32 signature; /* 0xABADB10C */ + __u32 entry_count; + __u32 reserved_spare_block_count; /* 0 */ + __u32 reserved; /* 0xFFFF */ + __u64 first_spare_lba; + struct bbm_log_entry mapped_block_entries[BBM_LOG_MAX_ENTRIES]; +} __attribute__ ((__packed__)); + +#ifndef MDASSEMBLE +static char *map_state_str[] = { "normal", "uninitialized", "degraded", "failed" }; +#endif + +#define RAID_DISK_RESERVED_BLOCKS_IMSM_HI 2209 + +#define GEN_MIGR_AREA_SIZE 2048 /* General Migration Copy Area size in blocks */ + +#define MIGR_REC_BUF_SIZE 512 /* size of migr_record i/o buffer */ +#define MIGR_REC_POSITION 512 /* migr_record position offset on disk, + * MIGR_REC_BUF_SIZE <= MIGR_REC_POSITION + */ + +#define UNIT_SRC_NORMAL 0 /* Source data for curr_migr_unit must + * be recovered using srcMap */ +#define UNIT_SRC_IN_CP_AREA 1 /* Source data for curr_migr_unit has + * already been migrated and must + * be recovered from checkpoint area */ +struct migr_record { + __u32 rec_status; /* Status used to determine how to restart + * migration in case it aborts + * in some fashion */ + __u32 curr_migr_unit; /* 0..numMigrUnits-1 */ + __u32 family_num; /* Family number of MPB + * containing the 
RaidDev + * that is migrating */ + __u32 ascending_migr; /* True if migrating in increasing + * order of lbas */ + __u32 blocks_per_unit; /* Num disk blocks per unit of operation */ + __u32 dest_depth_per_unit; /* Num member blocks each destMap + * member disk + * advances per unit-of-operation */ + __u32 ckpt_area_pba; /* Pba of first block of ckpt copy area */ + __u32 dest_1st_member_lba; /* First member lba on first + * stripe of destination */ + __u32 num_migr_units; /* Total num migration units-of-op */ + __u32 post_migr_vol_cap; /* Size of volume after + * migration completes */ + __u32 post_migr_vol_cap_hi; /* Expansion space for LBA64 */ + __u32 ckpt_read_disk_num; /* Which member disk in destSubMap[0] the + * migration ckpt record was read from + * (for recovered migrations) */ +} __attribute__ ((__packed__)); + +struct md_list { + /* usage marker: + * 1: load metadata + * 2: metadata does not match + * 4: already checked + */ + int used; + char *devname; + int found; + int container; + dev_t st_rdev; + struct md_list *next; +}; + +#define pr_vrb(fmt, arg...) 
(void) (verbose && pr_err(fmt, ##arg)) + +static __u8 migr_type(struct imsm_dev *dev) +{ + if (dev->vol.migr_type == MIGR_VERIFY && + dev->status & DEV_VERIFY_AND_FIX) + return MIGR_REPAIR; + else + return dev->vol.migr_type; +} + +static void set_migr_type(struct imsm_dev *dev, __u8 migr_type) +{ + /* for compatibility with older oroms convert MIGR_REPAIR, into + * MIGR_VERIFY w/ DEV_VERIFY_AND_FIX status + */ + if (migr_type == MIGR_REPAIR) { + dev->vol.migr_type = MIGR_VERIFY; + dev->status |= DEV_VERIFY_AND_FIX; + } else { + dev->vol.migr_type = migr_type; + dev->status &= ~DEV_VERIFY_AND_FIX; + } +} + +static unsigned int sector_count(__u32 bytes) +{ + return ROUND_UP(bytes, 512) / 512; +} + +static unsigned int mpb_sectors(struct imsm_super *mpb) +{ + return sector_count(__le32_to_cpu(mpb->mpb_size)); +} + +struct intel_dev { + struct imsm_dev *dev; + struct intel_dev *next; + unsigned index; +}; + +struct intel_hba { + enum sys_dev_type type; + char *path; + char *pci_id; + struct intel_hba *next; +}; + +enum action { + DISK_REMOVE = 1, + DISK_ADD +}; +/* internal representation of IMSM metadata */ +struct intel_super { + union { + void *buf; /* O_DIRECT buffer for reading/writing metadata */ + struct imsm_super *anchor; /* immovable parameters */ + }; + union { + void *migr_rec_buf; /* buffer for I/O operations */ + struct migr_record *migr_rec; /* migration record */ + }; + int clean_migration_record_by_mdmon; /* when reshape is switched to next + array, it indicates that mdmon is allowed to clean migration + record */ + size_t len; /* size of the 'buf' allocation */ + void *next_buf; /* for realloc'ing buf from the manager */ + size_t next_len; + int updates_pending; /* count of pending updates for mdmon */ + int current_vol; /* index of raid device undergoing creation */ + unsigned long long create_offset; /* common start for 'current_vol' */ + __u32 random; /* random data for seeding new family numbers */ + struct intel_dev *devlist; + struct dl { + 
struct dl *next; + int index; + __u8 serial[MAX_RAID_SERIAL_LEN]; + int major, minor; + char *devname; + struct imsm_disk disk; + int fd; + int extent_cnt; + struct extent *e; /* for determining freespace @ create */ + int raiddisk; /* slot to fill in autolayout */ + enum action action; + } *disks, *current_disk; + struct dl *disk_mgmt_list; /* list of disks to add/remove while mdmon + active */ + struct dl *missing; /* disks removed while we weren't looking */ + struct bbm_log *bbm_log; + struct intel_hba *hba; /* device path of the raid controller for this metadata */ + const struct imsm_orom *orom; /* platform firmware support */ + struct intel_super *next; /* (temp) list for disambiguating family_num */ +}; + +struct intel_disk { + struct imsm_disk disk; + #define IMSM_UNKNOWN_OWNER (-1) + int owner; + struct intel_disk *next; +}; + +struct extent { + unsigned long long start, size; +}; + +/* definitions of reshape process types */ +enum imsm_reshape_type { + CH_TAKEOVER, + CH_MIGRATION, + CH_ARRAY_SIZE, +}; + +/* definition of messages passed to imsm_process_update */ +enum imsm_update_type { + update_activate_spare, + update_create_array, + update_kill_array, + update_rename_array, + update_add_remove_disk, + update_reshape_container_disks, + update_reshape_migration, + update_takeover, + update_general_migration_checkpoint, + update_size_change, +}; + +struct imsm_update_activate_spare { + enum imsm_update_type type; + struct dl *dl; + int slot; + int array; + struct imsm_update_activate_spare *next; +}; + +struct geo_params { + char devnm[32]; + char *dev_name; + unsigned long long size; + int level; + int layout; + int chunksize; + int raid_disks; +}; + +enum takeover_direction { + R10_TO_R0, + R0_TO_R10 +}; +struct imsm_update_takeover { + enum imsm_update_type type; + int subarray; + enum takeover_direction direction; +}; + +struct imsm_update_reshape { + enum imsm_update_type type; + int old_raid_disks; + int new_raid_disks; + + int new_disks[1]; /* 
new_raid_disks - old_raid_disks makedev number */ +}; + +struct imsm_update_reshape_migration { + enum imsm_update_type type; + int old_raid_disks; + int new_raid_disks; + /* fields for array migration changes + */ + int subdev; + int new_level; + int new_layout; + int new_chunksize; + + int new_disks[1]; /* new_raid_disks - old_raid_disks makedev number */ +}; + +struct imsm_update_size_change { + enum imsm_update_type type; + int subdev; + long long new_size; +}; + +struct imsm_update_general_migration_checkpoint { + enum imsm_update_type type; + __u32 curr_migr_unit; +}; + +struct disk_info { + __u8 serial[MAX_RAID_SERIAL_LEN]; +}; + +struct imsm_update_create_array { + enum imsm_update_type type; + int dev_idx; + struct imsm_dev dev; +}; + +struct imsm_update_kill_array { + enum imsm_update_type type; + int dev_idx; +}; + +struct imsm_update_rename_array { + enum imsm_update_type type; + __u8 name[MAX_RAID_SERIAL_LEN]; + int dev_idx; +}; + +struct imsm_update_add_remove_disk { + enum imsm_update_type type; +}; + +static const char *_sys_dev_type[] = { + [SYS_DEV_UNKNOWN] = "Unknown", + [SYS_DEV_SAS] = "SAS", + [SYS_DEV_SATA] = "SATA", + [SYS_DEV_NVME] = "NVMe", + [SYS_DEV_VMD] = "VMD" +}; + +const char *get_sys_dev_type(enum sys_dev_type type) +{ + if (type >= SYS_DEV_MAX) + type = SYS_DEV_UNKNOWN; + + return _sys_dev_type[type]; +} + +static struct intel_hba * alloc_intel_hba(struct sys_dev *device) +{ + struct intel_hba *result = xmalloc(sizeof(*result)); + + result->type = device->type; + result->path = xstrdup(device->path); + result->next = NULL; + if (result->path && (result->pci_id = strrchr(result->path, '/')) != NULL) + result->pci_id++; + + return result; +} + +static struct intel_hba * find_intel_hba(struct intel_hba *hba, struct sys_dev *device) +{ + struct intel_hba *result=NULL; + for (result = hba; result; result = result->next) { + if (result->type == device->type && strcmp(result->path, device->path) == 0) + break; + } + return result; +} + 
+/* Record the HBA described by 'device' in super's HBA list.
+ *
+ * Returns 1 when the HBA is already listed or has been added, and 2 when
+ * it is rejected: different HBA type from the first listed one, a VMD
+ * device whose path is not attached to the first listed HBA, or an OROM
+ * that differs from super->orom.
+ */
+static int attach_hba_to_super(struct intel_super *super, struct sys_dev *device)
+{
+	struct intel_hba *hba;
+
+	/* check if disk attached to Intel HBA */
+	hba = find_intel_hba(super->hba, device);
+	if (hba != NULL)
+		return 1;
+	/* Check if HBA is already attached to super */
+	if (super->hba == NULL) {
+		super->hba = alloc_intel_hba(device);
+		return 1;
+	}
+
+	hba = super->hba;
+	/* Intel metadata allows for all disks attached to the same type HBA.
+	 * Do not support HBA types mixing
+	 */
+	if (device->type != hba->type)
+		return 2;
+
+	/* Always forbid spanning between VMD domains (seen as different controllers by mdadm) */
+	if (device->type == SYS_DEV_VMD && !path_attached_to_hba(device->path, hba->path))
+		return 2;
+
+	/* Multiple same type HBAs can be used if they share the same OROM */
+	const struct imsm_orom *device_orom = get_orom_by_device_id(device->dev_id);
+
+	if (device_orom != super->orom)
+		return 2;
+
+	while (hba->next)
+		hba = hba->next;
+
+	hba->next = alloc_intel_hba(device);
+	return 1;
+}
+
+/* Find the Intel device (HBA) the given disk is attached to.
+ *
+ * The disk path is 'devname' when fd < 0, otherwise it is obtained from
+ * diskfd_to_devpath(fd).  Returns the matching entry of the list from
+ * find_intel_devices(), or NULL when that list is empty, the path cannot
+ * be determined, or no entry matches.
+ */
+static struct sys_dev* find_disk_attached_hba(int fd, const char *devname)
+{
+	struct sys_dev *list, *elem;
+	char *disk_path;
+
+	if ((list = find_intel_devices()) == NULL)
+		return NULL;
+
+	if (fd < 0)
+		disk_path = (char *) devname;
+	else
+		disk_path = diskfd_to_devpath(fd);
+
+	if (!disk_path)
+		return NULL;
+
+	for (elem = list; elem; elem = elem->next)
+		if (path_attached_to_hba(disk_path, elem->path))
+			break;
+
+	/* a path obtained from diskfd_to_devpath() is ours to release, on
+	 * the match path as well as on the no-match path
+	 */
+	if (disk_path != devname)
+		free(disk_path);
+
+	/* elem is NULL here when the loop ran off the end of the list */
+	return elem;
+}
+
+static int find_intel_hba_capability(int fd, struct intel_super *super,
+				     char *devname);
+
+/* Return a new zeroed supertype bound to the imsm handler when 'arg' is
+ * "imsm" or "default", otherwise NULL.
+ */
+static struct supertype *match_metadata_desc_imsm(char *arg)
+{
+	struct supertype *st;
+
+	if (strcmp(arg, "imsm") != 0 &&
+	    strcmp(arg, "default") != 0
+		)
+		return NULL;
+
+	st = xcalloc(1, sizeof(*st));
+	st->ss = &super_imsm;
+	st->max_devs = IMSM_MAX_DEVICES;
+	st->minor_version = 0;
+	st->sb = NULL;
+	return st;
+}
+
+#ifndef MDASSEMBLE
+static __u8 
*get_imsm_version(struct imsm_super *mpb)
+{
+	/* the version string follows the fixed signature text in sig[] */
+	return &mpb->sig[MPB_SIG_LEN];
+}
+#endif
+
+/* retrieve a disk directly from the anchor when the anchor is known to be
+ * up-to-date, currently only at load time
+ * Returns NULL when index is not below mpb->num_disks.
+ */
+static struct imsm_disk *__get_imsm_disk(struct imsm_super *mpb, __u8 index)
+{
+	if (index >= mpb->num_disks)
+		return NULL;
+	return &mpb->disk[index];
+}
+
+/* retrieve the disk description based on a index of the disk
+ * in the sub-array
+ * Returns NULL when no entry of super->disks carries that index.
+ */
+static struct dl *get_imsm_dl_disk(struct intel_super *super, __u8 index)
+{
+	struct dl *d;
+
+	for (d = super->disks; d; d = d->next)
+		if (d->index == index)
+			return d;
+
+	return NULL;
+}
+/* retrieve a disk from the parsed metadata */
+static struct imsm_disk *get_imsm_disk(struct intel_super *super, __u8 index)
+{
+	struct dl *dl;
+
+	dl = get_imsm_dl_disk(super, index);
+	if (dl)
+		return &dl->disk;
+
+	return NULL;
+}
+
+/* generate a checksum directly from the anchor when the anchor is known to be
+ * up-to-date, currently only at load or write_super after coalescing
+ *
+ * The result is the sum of all 32-bit words of the MPB minus the stored
+ * check_sum word.
+ */
+static __u32 __gen_imsm_checksum(struct imsm_super *mpb)
+{
+	/* mpb_size is stored little-endian like every other anchor field
+	 * (mpb_sectors() converts it too), so convert before using it as
+	 * a word count
+	 */
+	__u32 end = __le32_to_cpu(mpb->mpb_size) / sizeof(end);
+	__u32 *p = (__u32 *) mpb;
+	__u32 sum = 0;
+
+	while (end--) {
+		sum += __le32_to_cpu(*p);
+		p++;
+	}
+
+	return sum - __le32_to_cpu(mpb->check_sum);
+}
+
+/* Size in bytes of 'map' including its num_members-entry disk_ord_tbl
+ * (struct imsm_map itself declares a single entry).
+ */
+static size_t sizeof_imsm_map(struct imsm_map *map)
+{
+	return sizeof(struct imsm_map) + sizeof(__u32) * (map->num_members - 1);
+}
+
+struct imsm_map *get_imsm_map(struct imsm_dev *dev, int second_map)
+{
+	/* A device can have 2 maps if it is in the middle of a migration.
+ * If second_map is: + * MAP_0 - we return the first map + * MAP_1 - we return the second map if it exists, else NULL + * MAP_X - we return the second map if it exists, else the first + */ + struct imsm_map *map = &dev->vol.map[0]; + struct imsm_map *map2 = NULL; + + if (dev->vol.migr_state) + map2 = (void *)map + sizeof_imsm_map(map); + + switch (second_map) { + case MAP_0: + break; + case MAP_1: + map = map2; + break; + case MAP_X: + if (map2) + map = map2; + break; + default: + map = NULL; + } + return map; + +} + +/* return the size of the device. + * migr_state increases the returned size if map[0] were to be duplicated + */ +static size_t sizeof_imsm_dev(struct imsm_dev *dev, int migr_state) +{ + size_t size = sizeof(*dev) - sizeof(struct imsm_map) + + sizeof_imsm_map(get_imsm_map(dev, MAP_0)); + + /* migrating means an additional map */ + if (dev->vol.migr_state) + size += sizeof_imsm_map(get_imsm_map(dev, MAP_1)); + else if (migr_state) + size += sizeof_imsm_map(get_imsm_map(dev, MAP_0)); + + return size; +} + +#ifndef MDASSEMBLE +/* retrieve disk serial number list from a metadata update */ +static struct disk_info *get_disk_info(struct imsm_update_create_array *update) +{ + void *u = update; + struct disk_info *inf; + + inf = u + sizeof(*update) - sizeof(struct imsm_dev) + + sizeof_imsm_dev(&update->dev, 0); + + return inf; +} +#endif + +static struct imsm_dev *__get_imsm_dev(struct imsm_super *mpb, __u8 index) +{ + int offset; + int i; + void *_mpb = mpb; + + if (index >= mpb->num_raid_devs) + return NULL; + + /* devices start after all disks */ + offset = ((void *) &mpb->disk[mpb->num_disks]) - _mpb; + + for (i = 0; i <= index; i++) + if (i == index) + return _mpb + offset; + else + offset += sizeof_imsm_dev(_mpb + offset, 0); + + return NULL; +} + +static struct imsm_dev *get_imsm_dev(struct intel_super *super, __u8 index) +{ + struct intel_dev *dv; + + if (index >= super->anchor->num_raid_devs) + return NULL; + for (dv = super->devlist; dv; dv = 
dv->next) + if (dv->index == index) + return dv->dev; + return NULL; +} + +/* + * for second_map: + * == MAP_0 get first map + * == MAP_1 get second map + * == MAP_X than get map according to the current migr_state + */ +static __u32 get_imsm_ord_tbl_ent(struct imsm_dev *dev, + int slot, + int second_map) +{ + struct imsm_map *map; + + map = get_imsm_map(dev, second_map); + + /* top byte identifies disk under rebuild */ + return __le32_to_cpu(map->disk_ord_tbl[slot]); +} + +#define ord_to_idx(ord) (((ord) << 8) >> 8) +static __u32 get_imsm_disk_idx(struct imsm_dev *dev, int slot, int second_map) +{ + __u32 ord = get_imsm_ord_tbl_ent(dev, slot, second_map); + + return ord_to_idx(ord); +} + +static void set_imsm_ord_tbl_ent(struct imsm_map *map, int slot, __u32 ord) +{ + map->disk_ord_tbl[slot] = __cpu_to_le32(ord); +} + +static int get_imsm_disk_slot(struct imsm_map *map, unsigned idx) +{ + int slot; + __u32 ord; + + for (slot = 0; slot < map->num_members; slot++) { + ord = __le32_to_cpu(map->disk_ord_tbl[slot]); + if (ord_to_idx(ord) == idx) + return slot; + } + + return -1; +} + +static int get_imsm_raid_level(struct imsm_map *map) +{ + if (map->raid_level == 1) { + if (map->num_members == 2) + return 1; + else + return 10; + } + + return map->raid_level; +} + +static int cmp_extent(const void *av, const void *bv) +{ + const struct extent *a = av; + const struct extent *b = bv; + if (a->start < b->start) + return -1; + if (a->start > b->start) + return 1; + return 0; +} + +static int count_memberships(struct dl *dl, struct intel_super *super) +{ + int memberships = 0; + int i; + + for (i = 0; i < super->anchor->num_raid_devs; i++) { + struct imsm_dev *dev = get_imsm_dev(super, i); + struct imsm_map *map = get_imsm_map(dev, MAP_0); + + if (get_imsm_disk_slot(map, dl->index) >= 0) + memberships++; + } + + return memberships; +} + +static __u32 imsm_min_reserved_sectors(struct intel_super *super); + +static int split_ull(unsigned long long n, __u32 *lo, __u32 *hi) +{ 
+	/* Split n into two 32-bit halves stored in on-disk (little-endian)
+	 * order; returns 1 without storing anything when either output
+	 * pointer is NULL, 0 otherwise.
+	 */
+	if (lo == NULL || hi == NULL)
+		return 1;
+	/* this is a store, so it is the cpu-to-le direction (the inverse
+	 * of what join_u32() does on load)
+	 */
+	*lo = __cpu_to_le32((unsigned)n);
+	*hi = __cpu_to_le32((unsigned)(n >> 32));
+	return 0;
+}
+
+/* Combine two on-disk (little-endian) 32-bit halves into one value. */
+static unsigned long long join_u32(__u32 lo, __u32 hi)
+{
+	return (unsigned long long)__le32_to_cpu(lo) |
+	       (((unsigned long long)__le32_to_cpu(hi)) << 32);
+}
+
+/* The getters below return 0 for a NULL argument. */
+static unsigned long long total_blocks(struct imsm_disk *disk)
+{
+	if (disk == NULL)
+		return 0;
+	return join_u32(disk->total_blocks_lo, disk->total_blocks_hi);
+}
+
+static unsigned long long pba_of_lba0(struct imsm_map *map)
+{
+	if (map == NULL)
+		return 0;
+	return join_u32(map->pba_of_lba0_lo, map->pba_of_lba0_hi);
+}
+
+static unsigned long long blocks_per_member(struct imsm_map *map)
+{
+	if (map == NULL)
+		return 0;
+	return join_u32(map->blocks_per_member_lo, map->blocks_per_member_hi);
+}
+
+#ifndef MDASSEMBLE
+static unsigned long long num_data_stripes(struct imsm_map *map)
+{
+	if (map == NULL)
+		return 0;
+	return join_u32(map->num_data_stripes_lo, map->num_data_stripes_hi);
+}
+
+static void set_total_blocks(struct imsm_disk *disk, unsigned long long n)
+{
+	split_ull(n, &disk->total_blocks_lo, &disk->total_blocks_hi);
+}
+#endif
+
+static void set_pba_of_lba0(struct imsm_map *map, unsigned long long n)
+{
+	split_ull(n, &map->pba_of_lba0_lo, &map->pba_of_lba0_hi);
+}
+
+static void set_blocks_per_member(struct imsm_map *map, unsigned long long n)
+{
+	split_ull(n, &map->blocks_per_member_lo, &map->blocks_per_member_hi);
+}
+
+static void set_num_data_stripes(struct imsm_map *map, unsigned long long n)
+{
+	split_ull(n, &map->num_data_stripes_lo, &map->num_data_stripes_hi);
+}
+
+/* Build the list of extents that raid devices occupy on disk 'dl'.
+ *
+ * The returned array (from xcalloc, to be freed by the caller) holds one
+ * entry per raid device whose first map contains dl, sorted by start,
+ * followed by a terminating entry with size 0 whose start marks the
+ * beginning of the reserved metadata area.
+ */
+static struct extent *get_extents(struct intel_super *super, struct dl *dl)
+{
+	/* find a list of used extents on the given physical device */
+	struct extent *rv, *e;
+	int i;
+	int memberships = count_memberships(dl, super);
+	__u32 reservation;
+
+	/* trim the reserved area for spares, so they can join any array
+	 * regardless of whether the OROM has assigned sectors from the
+	 * IMSM_RESERVED_SECTORS region
+	 */
+	if (dl->index == -1)
+		reservation = imsm_min_reserved_sectors(super);
+	else
+		reservation = MPB_SECTOR_CNT + IMSM_RESERVED_SECTORS;
+
+	rv = xcalloc(sizeof(struct extent), (memberships + 1));
+	e = rv;
+
+	for (i = 0; i < super->anchor->num_raid_devs; i++) {
+		struct imsm_dev *dev = get_imsm_dev(super, i);
+		struct imsm_map *map = get_imsm_map(dev, MAP_0);
+
+		if (get_imsm_disk_slot(map, dl->index) >= 0) {
+			e->start = pba_of_lba0(map);
+			e->size = blocks_per_member(map);
+			e++;
+		}
+	}
+	qsort(rv, memberships, sizeof(*rv), cmp_extent);
+
+	/* determine the start of the metadata
+	 * when no raid devices are defined use the default
+	 * ...otherwise allow the metadata to truncate the value
+	 * as is the case with older versions of imsm
+	 */
+	if (memberships) {
+		struct extent *last = &rv[memberships - 1];
+		unsigned long long remainder;
+
+		remainder = total_blocks(&dl->disk) - (last->start + last->size);
+		/* round down to 1k block to satisfy precision of the kernel
+		 * 'size' interface
+		 *
+		 * The mask must be as wide as remainder: where long is
+		 * 32 bits, ~1UL zero-extends and would also clear the
+		 * upper 32 bits.
+		 */
+		remainder &= ~1ULL;
+		/* make sure remainder is still sane */
+		if (remainder < (unsigned)ROUND_UP(super->len, 512) >> 9)
+			remainder = ROUND_UP(super->len, 512) >> 9;
+		if (reservation > remainder)
+			reservation = remainder;
+	}
+	e->start = total_blocks(&dl->disk) - reservation;
+	e->size = 0;
+	return rv;
+}
+
+/* try to determine how much space is reserved for metadata from
+ * the last get_extents() entry, otherwise fallback to the
+ * default
+ */
+static __u32 imsm_reserved_sectors(struct intel_super *super, struct dl *dl)
+{
+	struct extent *e;
+	int i;
+	__u32 rv;
+
+	/* for spares just return a minimal reservation which will grow
+	 * once the spare is picked up by an array
+	 */
+	if (dl->index == -1)
+		return MPB_SECTOR_CNT;
+
+	e = get_extents(super, dl);
+	if (!e)
+		return MPB_SECTOR_CNT + IMSM_RESERVED_SECTORS;
+
+	/* scroll to last entry */
+	for (i = 0; e[i].size; i++)
+		continue;
+
+	rv = total_blocks(&dl->disk) - 
e[i].start; + + free(e); + + return rv; +} + +static int is_spare(struct imsm_disk *disk) +{ + return (disk->status & SPARE_DISK) == SPARE_DISK; +} + +static int is_configured(struct imsm_disk *disk) +{ + return (disk->status & CONFIGURED_DISK) == CONFIGURED_DISK; +} + +static int is_failed(struct imsm_disk *disk) +{ + return (disk->status & FAILED_DISK) == FAILED_DISK; +} + +/* try to determine how much space is reserved for metadata from + * the last get_extents() entry on the smallest active disk, + * otherwise fallback to the default + */ +static __u32 imsm_min_reserved_sectors(struct intel_super *super) +{ + struct extent *e; + int i; + unsigned long long min_active; + __u32 remainder; + __u32 rv = MPB_SECTOR_CNT + IMSM_RESERVED_SECTORS; + struct dl *dl, *dl_min = NULL; + + if (!super) + return rv; + + min_active = 0; + for (dl = super->disks; dl; dl = dl->next) { + if (dl->index < 0) + continue; + unsigned long long blocks = total_blocks(&dl->disk); + if (blocks < min_active || min_active == 0) { + dl_min = dl; + min_active = blocks; + } + } + if (!dl_min) + return rv; + + /* find last lba used by subarrays on the smallest active disk */ + e = get_extents(super, dl_min); + if (!e) + return rv; + for (i = 0; e[i].size; i++) + continue; + + remainder = min_active - e[i].start; + free(e); + + /* to give priority to recovery we should not require full + IMSM_RESERVED_SECTORS from the spare */ + rv = MPB_SECTOR_CNT + NUM_BLOCKS_DIRTY_STRIPE_REGION; + + /* if real reservation is smaller use that value */ + return (remainder < rv) ? 
remainder : rv; +} + +/* Return minimum size of a spare that can be used in this array*/ +static unsigned long long min_acceptable_spare_size_imsm(struct supertype *st) +{ + struct intel_super *super = st->sb; + struct dl *dl; + struct extent *e; + int i; + unsigned long long rv = 0; + + if (!super) + return rv; + /* find first active disk in array */ + dl = super->disks; + while (dl && (is_failed(&dl->disk) || dl->index == -1)) + dl = dl->next; + if (!dl) + return rv; + /* find last lba used by subarrays */ + e = get_extents(super, dl); + if (!e) + return rv; + for (i = 0; e[i].size; i++) + continue; + if (i > 0) + rv = e[i-1].start + e[i-1].size; + free(e); + + /* add the amount of space needed for metadata */ + rv = rv + imsm_min_reserved_sectors(super); + + return rv * 512; +} + +static int is_gen_migration(struct imsm_dev *dev); + +#ifndef MDASSEMBLE +static __u64 blocks_per_migr_unit(struct intel_super *super, + struct imsm_dev *dev); + +static void print_imsm_dev(struct intel_super *super, + struct imsm_dev *dev, + char *uuid, + int disk_idx) +{ + __u64 sz; + int slot, i; + struct imsm_map *map = get_imsm_map(dev, MAP_0); + struct imsm_map *map2 = get_imsm_map(dev, MAP_1); + __u32 ord; + + printf("\n"); + printf("[%.16s]:\n", dev->volume); + printf(" UUID : %s\n", uuid); + printf(" RAID Level : %d", get_imsm_raid_level(map)); + if (map2) + printf(" <-- %d", get_imsm_raid_level(map2)); + printf("\n"); + printf(" Members : %d", map->num_members); + if (map2) + printf(" <-- %d", map2->num_members); + printf("\n"); + printf(" Slots : ["); + for (i = 0; i < map->num_members; i++) { + ord = get_imsm_ord_tbl_ent(dev, i, MAP_0); + printf("%s", ord & IMSM_ORD_REBUILD ? "_" : "U"); + } + printf("]"); + if (map2) { + printf(" <-- ["); + for (i = 0; i < map2->num_members; i++) { + ord = get_imsm_ord_tbl_ent(dev, i, MAP_1); + printf("%s", ord & IMSM_ORD_REBUILD ? 
"_" : "U"); + } + printf("]"); + } + printf("\n"); + printf(" Failed disk : "); + if (map->failed_disk_num == 0xff) + printf("none"); + else + printf("%i", map->failed_disk_num); + printf("\n"); + slot = get_imsm_disk_slot(map, disk_idx); + if (slot >= 0) { + ord = get_imsm_ord_tbl_ent(dev, slot, MAP_X); + printf(" This Slot : %d%s\n", slot, + ord & IMSM_ORD_REBUILD ? " (out-of-sync)" : ""); + } else + printf(" This Slot : ?\n"); + sz = __le32_to_cpu(dev->size_high); + sz <<= 32; + sz += __le32_to_cpu(dev->size_low); + printf(" Array Size : %llu%s\n", (unsigned long long)sz, + human_size(sz * 512)); + sz = blocks_per_member(map); + printf(" Per Dev Size : %llu%s\n", (unsigned long long)sz, + human_size(sz * 512)); + printf(" Sector Offset : %llu\n", + pba_of_lba0(map)); + printf(" Num Stripes : %llu\n", + num_data_stripes(map)); + printf(" Chunk Size : %u KiB", + __le16_to_cpu(map->blocks_per_strip) / 2); + if (map2) + printf(" <-- %u KiB", + __le16_to_cpu(map2->blocks_per_strip) / 2); + printf("\n"); + printf(" Reserved : %d\n", __le32_to_cpu(dev->reserved_blocks)); + printf(" Migrate State : "); + if (dev->vol.migr_state) { + if (migr_type(dev) == MIGR_INIT) + printf("initialize\n"); + else if (migr_type(dev) == MIGR_REBUILD) + printf("rebuild\n"); + else if (migr_type(dev) == MIGR_VERIFY) + printf("check\n"); + else if (migr_type(dev) == MIGR_GEN_MIGR) + printf("general migration\n"); + else if (migr_type(dev) == MIGR_STATE_CHANGE) + printf("state change\n"); + else if (migr_type(dev) == MIGR_REPAIR) + printf("repair\n"); + else + printf("\n", migr_type(dev)); + } else + printf("idle\n"); + printf(" Map State : %s", map_state_str[map->map_state]); + if (dev->vol.migr_state) { + struct imsm_map *map = get_imsm_map(dev, MAP_1); + + printf(" <-- %s", map_state_str[map->map_state]); + printf("\n Checkpoint : %u ", + __le32_to_cpu(dev->vol.curr_migr_unit)); + if ((is_gen_migration(dev)) && ((slot > 1) || (slot < 0))) + printf("(N/A)"); + else + printf("(%llu)", 
(unsigned long long) + blocks_per_migr_unit(super, dev)); + } + printf("\n"); + printf(" Dirty State : %s\n", dev->vol.dirty ? "dirty" : "clean"); +} + +static void print_imsm_disk(struct imsm_disk *disk, int index, __u32 reserved) +{ + char str[MAX_RAID_SERIAL_LEN + 1]; + __u64 sz; + + if (index < -1 || !disk) + return; + + printf("\n"); + snprintf(str, MAX_RAID_SERIAL_LEN + 1, "%s", disk->serial); + if (index >= 0) + printf(" Disk%02d Serial : %s\n", index, str); + else + printf(" Disk Serial : %s\n", str); + printf(" State :%s%s%s\n", is_spare(disk) ? " spare" : "", + is_configured(disk) ? " active" : "", + is_failed(disk) ? " failed" : ""); + printf(" Id : %08x\n", __le32_to_cpu(disk->scsi_id)); + sz = total_blocks(disk) - reserved; + printf(" Usable Size : %llu%s\n", (unsigned long long)sz, + human_size(sz * 512)); +} + +void examine_migr_rec_imsm(struct intel_super *super) +{ + struct migr_record *migr_rec = super->migr_rec; + struct imsm_super *mpb = super->anchor; + int i; + + for (i = 0; i < mpb->num_raid_devs; i++) { + struct imsm_dev *dev = __get_imsm_dev(mpb, i); + struct imsm_map *map; + int slot = -1; + + if (is_gen_migration(dev) == 0) + continue; + + printf("\nMigration Record Information:"); + + /* first map under migration */ + map = get_imsm_map(dev, MAP_0); + if (map) + slot = get_imsm_disk_slot(map, super->disks->index); + if ((map == NULL) || (slot > 1) || (slot < 0)) { + printf(" Empty\n "); + printf("Examine one of first two disks in array\n"); + break; + } + printf("\n Status : "); + if (__le32_to_cpu(migr_rec->rec_status) == UNIT_SRC_NORMAL) + printf("Normal\n"); + else + printf("Contains Data\n"); + printf(" Current Unit : %u\n", + __le32_to_cpu(migr_rec->curr_migr_unit)); + printf(" Family : %u\n", + __le32_to_cpu(migr_rec->family_num)); + printf(" Ascending : %u\n", + __le32_to_cpu(migr_rec->ascending_migr)); + printf(" Blocks Per Unit : %u\n", + __le32_to_cpu(migr_rec->blocks_per_unit)); + printf(" Dest. 
Depth Per Unit : %u\n", + __le32_to_cpu(migr_rec->dest_depth_per_unit)); + printf(" Checkpoint Area pba : %u\n", + __le32_to_cpu(migr_rec->ckpt_area_pba)); + printf(" First member lba : %u\n", + __le32_to_cpu(migr_rec->dest_1st_member_lba)); + printf(" Total Number of Units : %u\n", + __le32_to_cpu(migr_rec->num_migr_units)); + printf(" Size of volume : %u\n", + __le32_to_cpu(migr_rec->post_migr_vol_cap)); + printf(" Expansion space for LBA64 : %u\n", + __le32_to_cpu(migr_rec->post_migr_vol_cap_hi)); + printf(" Record was read from : %u\n", + __le32_to_cpu(migr_rec->ckpt_read_disk_num)); + + break; + } +} +#endif /* MDASSEMBLE */ +/******************************************************************************* + * function: imsm_check_attributes + * Description: Function checks if features represented by attributes flags + * are supported by mdadm. + * Parameters: + * attributes - Attributes read from metadata + * Returns: + * 0 - passed attributes contains unsupported features flags + * 1 - all features are supported + ******************************************************************************/ +static int imsm_check_attributes(__u32 attributes) +{ + int ret_val = 1; + __u32 not_supported = MPB_ATTRIB_SUPPORTED^0xffffffff; + + not_supported &= ~MPB_ATTRIB_IGNORED; + + not_supported &= attributes; + if (not_supported) { + pr_err("(IMSM): Unsupported attributes : %x\n", + (unsigned)__le32_to_cpu(not_supported)); + if (not_supported & MPB_ATTRIB_CHECKSUM_VERIFY) { + dprintf("\t\tMPB_ATTRIB_CHECKSUM_VERIFY \n"); + not_supported ^= MPB_ATTRIB_CHECKSUM_VERIFY; + } + if (not_supported & MPB_ATTRIB_2TB) { + dprintf("\t\tMPB_ATTRIB_2TB\n"); + not_supported ^= MPB_ATTRIB_2TB; + } + if (not_supported & MPB_ATTRIB_RAID0) { + dprintf("\t\tMPB_ATTRIB_RAID0\n"); + not_supported ^= MPB_ATTRIB_RAID0; + } + if (not_supported & MPB_ATTRIB_RAID1) { + dprintf("\t\tMPB_ATTRIB_RAID1\n"); + not_supported ^= MPB_ATTRIB_RAID1; + } + if (not_supported & MPB_ATTRIB_RAID10) { + 
dprintf("\t\tMPB_ATTRIB_RAID10\n"); + not_supported ^= MPB_ATTRIB_RAID10; + } + if (not_supported & MPB_ATTRIB_RAID1E) { + dprintf("\t\tMPB_ATTRIB_RAID1E\n"); + not_supported ^= MPB_ATTRIB_RAID1E; + } + if (not_supported & MPB_ATTRIB_RAID5) { + dprintf("\t\tMPB_ATTRIB_RAID5\n"); + not_supported ^= MPB_ATTRIB_RAID5; + } + if (not_supported & MPB_ATTRIB_RAIDCNG) { + dprintf("\t\tMPB_ATTRIB_RAIDCNG\n"); + not_supported ^= MPB_ATTRIB_RAIDCNG; + } + if (not_supported & MPB_ATTRIB_BBM) { + dprintf("\t\tMPB_ATTRIB_BBM\n"); + not_supported ^= MPB_ATTRIB_BBM; + } + if (not_supported & MPB_ATTRIB_CHECKSUM_VERIFY) { + dprintf("\t\tMPB_ATTRIB_CHECKSUM_VERIFY (== MPB_ATTRIB_LEGACY)\n"); + not_supported ^= MPB_ATTRIB_CHECKSUM_VERIFY; + } + if (not_supported & MPB_ATTRIB_EXP_STRIPE_SIZE) { + dprintf("\t\tMPB_ATTRIB_EXP_STRIP_SIZE\n"); + not_supported ^= MPB_ATTRIB_EXP_STRIPE_SIZE; + } + if (not_supported & MPB_ATTRIB_2TB_DISK) { + dprintf("\t\tMPB_ATTRIB_2TB_DISK\n"); + not_supported ^= MPB_ATTRIB_2TB_DISK; + } + if (not_supported & MPB_ATTRIB_NEVER_USE2) { + dprintf("\t\tMPB_ATTRIB_NEVER_USE2\n"); + not_supported ^= MPB_ATTRIB_NEVER_USE2; + } + if (not_supported & MPB_ATTRIB_NEVER_USE) { + dprintf("\t\tMPB_ATTRIB_NEVER_USE\n"); + not_supported ^= MPB_ATTRIB_NEVER_USE; + } + + if (not_supported) + dprintf("(IMSM): Unknown attributes : %x\n", not_supported); + + ret_val = 0; + } + + return ret_val; +} + +#ifndef MDASSEMBLE +static void getinfo_super_imsm(struct supertype *st, struct mdinfo *info, char *map); + +static void examine_super_imsm(struct supertype *st, char *homehost) +{ + struct intel_super *super = st->sb; + struct imsm_super *mpb = super->anchor; + char str[MAX_SIGNATURE_LENGTH]; + int i; + struct mdinfo info; + char nbuf[64]; + __u32 sum; + __u32 reserved = imsm_reserved_sectors(super, super->disks); + struct dl *dl; + + snprintf(str, MPB_SIG_LEN, "%s", mpb->sig); + printf(" Magic : %s\n", str); + snprintf(str, strlen(MPB_VERSION_RAID0), "%s", 
get_imsm_version(mpb)); + printf(" Version : %s\n", get_imsm_version(mpb)); + printf(" Orig Family : %08x\n", __le32_to_cpu(mpb->orig_family_num)); + printf(" Family : %08x\n", __le32_to_cpu(mpb->family_num)); + printf(" Generation : %08x\n", __le32_to_cpu(mpb->generation_num)); + printf(" Attributes : "); + if (imsm_check_attributes(mpb->attributes)) + printf("All supported\n"); + else + printf("not supported\n"); + getinfo_super_imsm(st, &info, NULL); + fname_from_uuid(st, &info, nbuf, ':'); + printf(" UUID : %s\n", nbuf + 5); + sum = __le32_to_cpu(mpb->check_sum); + printf(" Checksum : %08x %s\n", sum, + __gen_imsm_checksum(mpb) == sum ? "correct" : "incorrect"); + printf(" MPB Sectors : %d\n", mpb_sectors(mpb)); + printf(" Disks : %d\n", mpb->num_disks); + printf(" RAID Devices : %d\n", mpb->num_raid_devs); + print_imsm_disk(__get_imsm_disk(mpb, super->disks->index), super->disks->index, reserved); + if (super->bbm_log) { + struct bbm_log *log = super->bbm_log; + + printf("\n"); + printf("Bad Block Management Log:\n"); + printf(" Log Size : %d\n", __le32_to_cpu(mpb->bbm_log_size)); + printf(" Signature : %x\n", __le32_to_cpu(log->signature)); + printf(" Entry Count : %d\n", __le32_to_cpu(log->entry_count)); + printf(" Spare Blocks : %d\n", __le32_to_cpu(log->reserved_spare_block_count)); + printf(" First Spare : %llx\n", + (unsigned long long) __le64_to_cpu(log->first_spare_lba)); + } + for (i = 0; i < mpb->num_raid_devs; i++) { + struct mdinfo info; + struct imsm_dev *dev = __get_imsm_dev(mpb, i); + + super->current_vol = i; + getinfo_super_imsm(st, &info, NULL); + fname_from_uuid(st, &info, nbuf, ':'); + print_imsm_dev(super, dev, nbuf + 5, super->disks->index); + } + for (i = 0; i < mpb->num_disks; i++) { + if (i == super->disks->index) + continue; + print_imsm_disk(__get_imsm_disk(mpb, i), i, reserved); + } + + for (dl = super->disks; dl; dl = dl->next) + if (dl->index == -1) + print_imsm_disk(&dl->disk, -1, reserved); + + examine_migr_rec_imsm(super); +} + 
+static void brief_examine_super_imsm(struct supertype *st, int verbose) +{ + /* We just write a generic IMSM ARRAY entry */ + struct mdinfo info; + char nbuf[64]; + struct intel_super *super = st->sb; + + if (!super->anchor->num_raid_devs) { + printf("ARRAY metadata=imsm\n"); + return; + } + + getinfo_super_imsm(st, &info, NULL); + fname_from_uuid(st, &info, nbuf, ':'); + printf("ARRAY metadata=imsm UUID=%s\n", nbuf + 5); +} + +static void brief_examine_subarrays_imsm(struct supertype *st, int verbose) +{ + /* We just write a generic IMSM ARRAY entry */ + struct mdinfo info; + char nbuf[64]; + char nbuf1[64]; + struct intel_super *super = st->sb; + int i; + + if (!super->anchor->num_raid_devs) + return; + + getinfo_super_imsm(st, &info, NULL); + fname_from_uuid(st, &info, nbuf, ':'); + for (i = 0; i < super->anchor->num_raid_devs; i++) { + struct imsm_dev *dev = get_imsm_dev(super, i); + + super->current_vol = i; + getinfo_super_imsm(st, &info, NULL); + fname_from_uuid(st, &info, nbuf1, ':'); + printf("ARRAY /dev/md/%.16s container=%s member=%d UUID=%s\n", + dev->volume, nbuf + 5, i, nbuf1 + 5); + } +} + +static void export_examine_super_imsm(struct supertype *st) +{ + struct intel_super *super = st->sb; + struct imsm_super *mpb = super->anchor; + struct mdinfo info; + char nbuf[64]; + + getinfo_super_imsm(st, &info, NULL); + fname_from_uuid(st, &info, nbuf, ':'); + printf("MD_METADATA=imsm\n"); + printf("MD_LEVEL=container\n"); + printf("MD_UUID=%s\n", nbuf+5); + printf("MD_DEVICES=%u\n", mpb->num_disks); +} + +static int copy_metadata_imsm(struct supertype *st, int from, int to) +{ + /* The second last 512byte sector of the device contains + * the "struct imsm_super" metadata. + * This contains mpb_size which is the size in bytes of the + * extended metadata. This is located immediately before + * the imsm_super. + * We want to read all that, plus the last sector which + * may contain a migration record, and write it all + * to the target. 
+ */ + void *buf; + unsigned long long dsize, offset; + int sectors; + struct imsm_super *sb; + int written = 0; + + if (posix_memalign(&buf, 4096, 4096) != 0) + return 1; + + if (!get_dev_size(from, NULL, &dsize)) + goto err; + + if (lseek64(from, dsize-1024, 0) < 0) + goto err; + if (read(from, buf, 512) != 512) + goto err; + sb = buf; + if (strncmp((char*)sb->sig, MPB_SIGNATURE, MPB_SIG_LEN) != 0) + goto err; + + sectors = mpb_sectors(sb) + 2; + offset = dsize - sectors * 512; + if (lseek64(from, offset, 0) < 0 || + lseek64(to, offset, 0) < 0) + goto err; + while (written < sectors * 512) { + int n = sectors*512 - written; + if (n > 4096) + n = 4096; + if (read(from, buf, n) != n) + goto err; + if (write(to, buf, n) != n) + goto err; + written += n; + } + free(buf); + return 0; +err: + free(buf); + return 1; +} + +static void detail_super_imsm(struct supertype *st, char *homehost) +{ + struct mdinfo info; + char nbuf[64]; + + getinfo_super_imsm(st, &info, NULL); + fname_from_uuid(st, &info, nbuf, ':'); + printf("\n UUID : %s\n", nbuf + 5); +} + +static void brief_detail_super_imsm(struct supertype *st) +{ + struct mdinfo info; + char nbuf[64]; + getinfo_super_imsm(st, &info, NULL); + fname_from_uuid(st, &info, nbuf, ':'); + printf(" UUID=%s", nbuf + 5); +} + +static int imsm_read_serial(int fd, char *devname, __u8 *serial); +static void fd2devname(int fd, char *name); + +static int ahci_enumerate_ports(const char *hba_path, int port_count, int host_base, int verbose) +{ + /* dump an unsorted list of devices attached to AHCI Intel storage + * controller, as well as non-connected ports + */ + int hba_len = strlen(hba_path) + 1; + struct dirent *ent; + DIR *dir; + char *path = NULL; + int err = 0; + unsigned long port_mask = (1 << port_count) - 1; + + if (port_count > (int)sizeof(port_mask) * 8) { + if (verbose > 0) + pr_err("port_count %d out of range\n", port_count); + return 2; + } + + /* scroll through /sys/dev/block looking for devices attached to + * this hba 
+ */ + dir = opendir("/sys/dev/block"); + for (ent = dir ? readdir(dir) : NULL; ent; ent = readdir(dir)) { + int fd; + char model[64]; + char vendor[64]; + char buf[1024]; + int major, minor; + char *device; + char *c; + int port; + int type; + + if (sscanf(ent->d_name, "%d:%d", &major, &minor) != 2) + continue; + path = devt_to_devpath(makedev(major, minor)); + if (!path) + continue; + if (!path_attached_to_hba(path, hba_path)) { + free(path); + path = NULL; + continue; + } + + /* retrieve the scsi device type */ + if (asprintf(&device, "/sys/dev/block/%d:%d/device/xxxxxxx", major, minor) < 0) { + if (verbose > 0) + pr_err("failed to allocate 'device'\n"); + err = 2; + break; + } + sprintf(device, "/sys/dev/block/%d:%d/device/type", major, minor); + if (load_sys(device, buf) != 0) { + if (verbose > 0) + pr_err("failed to read device type for %s\n", + path); + err = 2; + free(device); + break; + } + type = strtoul(buf, NULL, 10); + + /* if it's not a disk print the vendor and model */ + if (!(type == 0 || type == 7 || type == 14)) { + vendor[0] = '\0'; + model[0] = '\0'; + sprintf(device, "/sys/dev/block/%d:%d/device/vendor", major, minor); + if (load_sys(device, buf) == 0) { + strncpy(vendor, buf, sizeof(vendor)); + vendor[sizeof(vendor) - 1] = '\0'; + c = (char *) &vendor[sizeof(vendor) - 1]; + while (isspace(*c) || *c == '\0') + *c-- = '\0'; + + } + sprintf(device, "/sys/dev/block/%d:%d/device/model", major, minor); + if (load_sys(device, buf) == 0) { + strncpy(model, buf, sizeof(model)); + model[sizeof(model) - 1] = '\0'; + c = (char *) &model[sizeof(model) - 1]; + while (isspace(*c) || *c == '\0') + *c-- = '\0'; + } + + if (vendor[0] && model[0]) + sprintf(buf, "%.64s %.64s", vendor, model); + else + switch (type) { /* numbers from hald/linux/device.c */ + case 1: sprintf(buf, "tape"); break; + case 2: sprintf(buf, "printer"); break; + case 3: sprintf(buf, "processor"); break; + case 4: + case 5: sprintf(buf, "cdrom"); break; + case 6: sprintf(buf, "scanner"); 
break; + case 8: sprintf(buf, "media_changer"); break; + case 9: sprintf(buf, "comm"); break; + case 12: sprintf(buf, "raid"); break; + default: sprintf(buf, "unknown"); + } + } else + buf[0] = '\0'; + free(device); + + /* chop device path to 'host%d' and calculate the port number */ + c = strchr(&path[hba_len], '/'); + if (!c) { + if (verbose > 0) + pr_err("%s - invalid path name\n", path + hba_len); + err = 2; + break; + } + *c = '\0'; + if ((sscanf(&path[hba_len], "ata%d", &port) == 1) || + ((sscanf(&path[hba_len], "host%d", &port) == 1))) + port -= host_base; + else { + if (verbose > 0) { + *c = '/'; /* repair the full string */ + pr_err("failed to determine port number for %s\n", + path); + } + err = 2; + break; + } + + /* mark this port as used */ + port_mask &= ~(1 << port); + + /* print out the device information */ + if (buf[0]) { + printf(" Port%d : - non-disk device (%s) -\n", port, buf); + continue; + } + + fd = dev_open(ent->d_name, O_RDONLY); + if (fd < 0) + printf(" Port%d : - disk info unavailable -\n", port); + else { + fd2devname(fd, buf); + printf(" Port%d : %s", port, buf); + if (imsm_read_serial(fd, NULL, (__u8 *) buf) == 0) + printf(" (%.*s)\n", MAX_RAID_SERIAL_LEN, buf); + else + printf(" ()\n"); + close(fd); + } + free(path); + path = NULL; + } + if (path) + free(path); + if (dir) + closedir(dir); + if (err == 0) { + int i; + + for (i = 0; i < port_count; i++) + if (port_mask & (1 << i)) + printf(" Port%d : - no device attached -\n", i); + } + + return err; +} + +static int print_vmd_attached_devs(struct sys_dev *hba) +{ + struct dirent *ent; + DIR *dir; + char path[292]; + char link[256]; + char *c, *rp; + + if (hba->type != SYS_DEV_VMD) + return 1; + + /* scroll through /sys/dev/block looking for devices attached to + * this hba + */ + dir = opendir("/sys/bus/pci/drivers/nvme"); + for (ent = dir ? readdir(dir) : NULL; ent; ent = readdir(dir)) { + int n; + + /* is 'ent' a device? 
check that the 'subsystem' link exists and + * that its target matches 'bus' + */ + sprintf(path, "/sys/bus/pci/drivers/nvme/%s/subsystem", + ent->d_name); + n = readlink(path, link, sizeof(link)); + if (n < 0 || n >= (int)sizeof(link)) + continue; + link[n] = '\0'; + c = strrchr(link, '/'); + if (!c) + continue; + if (strncmp("pci", c+1, strlen("pci")) != 0) + continue; + + sprintf(path, "/sys/bus/pci/drivers/nvme/%s", ent->d_name); + /* if not a intel NVMe - skip it*/ + if (devpath_to_vendor(path) != 0x8086) + continue; + + rp = realpath(path, NULL); + if (!rp) + continue; + + if (path_attached_to_hba(rp, hba->path)) { + printf(" NVMe under VMD : %s\n", rp); + } + free(rp); + } + + return 0; +} + +static void print_found_intel_controllers(struct sys_dev *elem) +{ + for (; elem; elem = elem->next) { + pr_err("found Intel(R) "); + if (elem->type == SYS_DEV_SATA) + fprintf(stderr, "SATA "); + else if (elem->type == SYS_DEV_SAS) + fprintf(stderr, "SAS "); + else if (elem->type == SYS_DEV_NVME) + fprintf(stderr, "NVMe "); + + if (elem->type == SYS_DEV_VMD) + fprintf(stderr, "VMD domain"); + else + fprintf(stderr, "RAID controller"); + + if (elem->pci_id) + fprintf(stderr, " at %s", elem->pci_id); + fprintf(stderr, ".\n"); + } + fflush(stderr); +} + +static int ahci_get_port_count(const char *hba_path, int *port_count) +{ + struct dirent *ent; + DIR *dir; + int host_base = -1; + + *port_count = 0; + if ((dir = opendir(hba_path)) == NULL) + return -1; + + for (ent = readdir(dir); ent; ent = readdir(dir)) { + int host; + + if ((sscanf(ent->d_name, "ata%d", &host) != 1) && + ((sscanf(ent->d_name, "host%d", &host) != 1))) + continue; + if (*port_count == 0) + host_base = host; + else if (host < host_base) + host_base = host; + + if (host + 1 > *port_count + host_base) + *port_count = host + 1 - host_base; + } + closedir(dir); + return host_base; +} + +static void print_imsm_capability(const struct imsm_orom *orom) +{ + printf(" Platform : Intel(R) "); + if 
(orom->capabilities == 0 && orom->driver_features == 0) + printf("Matrix Storage Manager\n"); + else + printf("Rapid Storage Technology%s\n", + imsm_orom_is_enterprise(orom) ? " enterprise" : ""); + if (orom->major_ver || orom->minor_ver || orom->hotfix_ver || orom->build) + printf(" Version : %d.%d.%d.%d\n", orom->major_ver, + orom->minor_ver, orom->hotfix_ver, orom->build); + printf(" RAID Levels :%s%s%s%s%s\n", + imsm_orom_has_raid0(orom) ? " raid0" : "", + imsm_orom_has_raid1(orom) ? " raid1" : "", + imsm_orom_has_raid1e(orom) ? " raid1e" : "", + imsm_orom_has_raid10(orom) ? " raid10" : "", + imsm_orom_has_raid5(orom) ? " raid5" : ""); + printf(" Chunk Sizes :%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s\n", + imsm_orom_has_chunk(orom, 2) ? " 2k" : "", + imsm_orom_has_chunk(orom, 4) ? " 4k" : "", + imsm_orom_has_chunk(orom, 8) ? " 8k" : "", + imsm_orom_has_chunk(orom, 16) ? " 16k" : "", + imsm_orom_has_chunk(orom, 32) ? " 32k" : "", + imsm_orom_has_chunk(orom, 64) ? " 64k" : "", + imsm_orom_has_chunk(orom, 128) ? " 128k" : "", + imsm_orom_has_chunk(orom, 256) ? " 256k" : "", + imsm_orom_has_chunk(orom, 512) ? " 512k" : "", + imsm_orom_has_chunk(orom, 1024*1) ? " 1M" : "", + imsm_orom_has_chunk(orom, 1024*2) ? " 2M" : "", + imsm_orom_has_chunk(orom, 1024*4) ? " 4M" : "", + imsm_orom_has_chunk(orom, 1024*8) ? " 8M" : "", + imsm_orom_has_chunk(orom, 1024*16) ? " 16M" : "", + imsm_orom_has_chunk(orom, 1024*32) ? " 32M" : "", + imsm_orom_has_chunk(orom, 1024*64) ? " 64M" : ""); + printf(" 2TB volumes :%s supported\n", + (orom->attr & IMSM_OROM_ATTR_2TB)?"":" not"); + printf(" 2TB disks :%s supported\n", + (orom->attr & IMSM_OROM_ATTR_2TB_DISK)?"":" not"); + printf(" Max Disks : %d\n", orom->tds); + printf(" Max Volumes : %d per array, %d per %s\n", + orom->vpa, orom->vphba, + imsm_orom_is_nvme(orom) ? 
"platform" : "controller"); + return; +} + +static void print_imsm_capability_export(const struct imsm_orom *orom) +{ + printf("MD_FIRMWARE_TYPE=imsm\n"); + if (orom->major_ver || orom->minor_ver || orom->hotfix_ver || orom->build) + printf("IMSM_VERSION=%d.%d.%d.%d\n", orom->major_ver, orom->minor_ver, + orom->hotfix_ver, orom->build); + printf("IMSM_SUPPORTED_RAID_LEVELS=%s%s%s%s%s\n", + imsm_orom_has_raid0(orom) ? "raid0 " : "", + imsm_orom_has_raid1(orom) ? "raid1 " : "", + imsm_orom_has_raid1e(orom) ? "raid1e " : "", + imsm_orom_has_raid5(orom) ? "raid10 " : "", + imsm_orom_has_raid10(orom) ? "raid5 " : ""); + printf("IMSM_SUPPORTED_CHUNK_SIZES=%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s\n", + imsm_orom_has_chunk(orom, 2) ? "2k " : "", + imsm_orom_has_chunk(orom, 4) ? "4k " : "", + imsm_orom_has_chunk(orom, 8) ? "8k " : "", + imsm_orom_has_chunk(orom, 16) ? "16k " : "", + imsm_orom_has_chunk(orom, 32) ? "32k " : "", + imsm_orom_has_chunk(orom, 64) ? "64k " : "", + imsm_orom_has_chunk(orom, 128) ? "128k " : "", + imsm_orom_has_chunk(orom, 256) ? "256k " : "", + imsm_orom_has_chunk(orom, 512) ? "512k " : "", + imsm_orom_has_chunk(orom, 1024*1) ? "1M " : "", + imsm_orom_has_chunk(orom, 1024*2) ? "2M " : "", + imsm_orom_has_chunk(orom, 1024*4) ? "4M " : "", + imsm_orom_has_chunk(orom, 1024*8) ? "8M " : "", + imsm_orom_has_chunk(orom, 1024*16) ? "16M " : "", + imsm_orom_has_chunk(orom, 1024*32) ? "32M " : "", + imsm_orom_has_chunk(orom, 1024*64) ? "64M " : ""); + printf("IMSM_2TB_VOLUMES=%s\n",(orom->attr & IMSM_OROM_ATTR_2TB) ? "yes" : "no"); + printf("IMSM_2TB_DISKS=%s\n",(orom->attr & IMSM_OROM_ATTR_2TB_DISK) ? 
"yes" : "no"); + printf("IMSM_MAX_DISKS=%d\n",orom->tds); + printf("IMSM_MAX_VOLUMES_PER_ARRAY=%d\n",orom->vpa); + printf("IMSM_MAX_VOLUMES_PER_CONTROLLER=%d\n",orom->vphba); +} + +static int detail_platform_imsm(int verbose, int enumerate_only, char *controller_path) +{ + /* There are two components to imsm platform support, the ahci SATA + * controller and the option-rom. To find the SATA controller we + * simply look in /sys/bus/pci/drivers/ahci to see if an ahci + * controller with the Intel vendor id is present. This approach + * allows mdadm to leverage the kernel's ahci detection logic, with the + * caveat that if ahci.ko is not loaded mdadm will not be able to + * detect platform raid capabilities. The option-rom resides in a + * platform "Adapter ROM". We scan for its signature to retrieve the + * platform capabilities. If raid support is disabled in the BIOS the + * option-rom capability structure will not be available. + */ + struct sys_dev *list, *hba; + int host_base = 0; + int port_count = 0; + int result=1; + + if (enumerate_only) { + if (check_env("IMSM_NO_PLATFORM")) + return 0; + list = find_intel_devices(); + if (!list) + return 2; + for (hba = list; hba; hba = hba->next) { + if (find_imsm_capability(hba)) { + result = 0; + break; + } + else + result = 2; + } + return result; + } + + list = find_intel_devices(); + if (!list) { + if (verbose > 0) + pr_err("no active Intel(R) RAID controller found.\n"); + return 2; + } else if (verbose > 0) + print_found_intel_controllers(list); + + for (hba = list; hba; hba = hba->next) { + if (controller_path && (compare_paths(hba->path, controller_path) != 0)) + continue; + if (!find_imsm_capability(hba)) { + char buf[PATH_MAX]; + pr_err("imsm capabilities not found for controller: %s (type %s)\n", + hba->type == SYS_DEV_VMD ? 
vmd_domain_to_controller(hba, buf) : hba->path, + get_sys_dev_type(hba->type)); + continue; + } + result = 0; + } + + if (controller_path && result == 1) { + pr_err("no active Intel(R) RAID controller found under %s\n", + controller_path); + return result; + } + + const struct orom_entry *entry; + + for (entry = orom_entries; entry; entry = entry->next) { + if (entry->type == SYS_DEV_VMD) { + for (hba = list; hba; hba = hba->next) { + if (hba->type == SYS_DEV_VMD) { + char buf[PATH_MAX]; + print_imsm_capability(&entry->orom); + printf(" I/O Controller : %s (%s)\n", + vmd_domain_to_controller(hba, buf), get_sys_dev_type(hba->type)); + print_vmd_attached_devs(hba); + printf("\n"); + } + } + continue; + } + + print_imsm_capability(&entry->orom); + if (entry->type == SYS_DEV_NVME) { + for (hba = list; hba; hba = hba->next) { + if (hba->type == SYS_DEV_NVME) + printf(" NVMe Device : %s\n", hba->path); + } + printf("\n"); + continue; + } + + struct devid_list *devid; + for (devid = entry->devid_list; devid; devid = devid->next) { + hba = device_by_id(devid->devid); + if (!hba) + continue; + + printf(" I/O Controller : %s (%s)\n", + hba->path, get_sys_dev_type(hba->type)); + if (hba->type == SYS_DEV_SATA) { + host_base = ahci_get_port_count(hba->path, &port_count); + if (ahci_enumerate_ports(hba->path, port_count, host_base, verbose)) { + if (verbose > 0) + pr_err("failed to enumerate ports on SATA controller at %s.\n", hba->pci_id); + result |= 2; + } + } + } + printf("\n"); + } + + return result; +} + +static int export_detail_platform_imsm(int verbose, char *controller_path) +{ + struct sys_dev *list, *hba; + int result=1; + + list = find_intel_devices(); + if (!list) { + if (verbose > 0) + pr_err("IMSM_DETAIL_PLATFORM_ERROR=NO_INTEL_DEVICES\n"); + result = 2; + return result; + } + + for (hba = list; hba; hba = hba->next) { + if (controller_path && (compare_paths(hba->path,controller_path) != 0)) + continue; + if (!find_imsm_capability(hba) && verbose > 0) { + char 
buf[PATH_MAX]; + pr_err("IMSM_DETAIL_PLATFORM_ERROR=NO_IMSM_CAPABLE_DEVICE_UNDER_%s\n", + hba->type == SYS_DEV_VMD ? vmd_domain_to_controller(hba, buf) : hba->path); + } + else + result = 0; + } + + const struct orom_entry *entry; + + for (entry = orom_entries; entry; entry = entry->next) { + if (entry->type == SYS_DEV_VMD) { + for (hba = list; hba; hba = hba->next) + print_imsm_capability_export(&entry->orom); + continue; + } + print_imsm_capability_export(&entry->orom); + } + + return result; +} + +#endif + +static int match_home_imsm(struct supertype *st, char *homehost) +{ + /* the imsm metadata format does not specify any host + * identification information. We return -1 since we can never + * confirm nor deny whether a given array is "meant" for this + * host. We rely on compare_super and the 'family_num' fields to + * exclude member disks that do not belong, and we rely on + * mdadm.conf to specify the arrays that should be assembled. + * Auto-assembly may still pick up "foreign" arrays. + */ + + return -1; +} + +static void uuid_from_super_imsm(struct supertype *st, int uuid[4]) +{ + /* The uuid returned here is used for: + * uuid to put into bitmap file (Create, Grow) + * uuid for backup header when saving critical section (Grow) + * comparing uuids when re-adding a device into an array + * In these cases the uuid required is that of the data-array, + * not the device-set. + * uuid to recognise same set when adding a missing device back + * to an array. This is a uuid for the device-set. + * + * For each of these we can make do with a truncated + * or hashed uuid rather than the original, as long as + * everyone agrees. + * In each case the uuid required is that of the data-array, + * not the device-set. 
+ */ + /* imsm does not track uuid's so we synthesis one using sha1 on + * - The signature (Which is constant for all imsm array, but no matter) + * - the orig_family_num of the container + * - the index number of the volume + * - the 'serial' number of the volume. + * Hopefully these are all constant. + */ + struct intel_super *super = st->sb; + + char buf[20]; + struct sha1_ctx ctx; + struct imsm_dev *dev = NULL; + __u32 family_num; + + /* some mdadm versions failed to set ->orig_family_num, in which + * case fall back to ->family_num. orig_family_num will be + * fixed up with the first metadata update. + */ + family_num = super->anchor->orig_family_num; + if (family_num == 0) + family_num = super->anchor->family_num; + sha1_init_ctx(&ctx); + sha1_process_bytes(super->anchor->sig, MPB_SIG_LEN, &ctx); + sha1_process_bytes(&family_num, sizeof(__u32), &ctx); + if (super->current_vol >= 0) + dev = get_imsm_dev(super, super->current_vol); + if (dev) { + __u32 vol = super->current_vol; + sha1_process_bytes(&vol, sizeof(vol), &ctx); + sha1_process_bytes(dev->volume, MAX_RAID_SERIAL_LEN, &ctx); + } + sha1_finish_ctx(&ctx, buf); + memcpy(uuid, buf, 4*4); +} + +#if 0 +static void +get_imsm_numerical_version(struct imsm_super *mpb, int *m, int *p) +{ + __u8 *v = get_imsm_version(mpb); + __u8 *end = mpb->sig + MAX_SIGNATURE_LENGTH; + char major[] = { 0, 0, 0 }; + char minor[] = { 0 ,0, 0 }; + char patch[] = { 0, 0, 0 }; + char *ver_parse[] = { major, minor, patch }; + int i, j; + + i = j = 0; + while (*v != '\0' && v < end) { + if (*v != '.' 
&& j < 2) + ver_parse[i][j++] = *v; + else { + i++; + j = 0; + } + v++; + } + + *m = strtol(minor, NULL, 0); + *p = strtol(patch, NULL, 0); +} +#endif + +static __u32 migr_strip_blocks_resync(struct imsm_dev *dev) +{ + /* migr_strip_size when repairing or initializing parity */ + struct imsm_map *map = get_imsm_map(dev, MAP_0); + __u32 chunk = __le32_to_cpu(map->blocks_per_strip); + + switch (get_imsm_raid_level(map)) { + case 5: + case 10: + return chunk; + default: + return 128*1024 >> 9; + } +} + +static __u32 migr_strip_blocks_rebuild(struct imsm_dev *dev) +{ + /* migr_strip_size when rebuilding a degraded disk, no idea why + * this is different than migr_strip_size_resync(), but it's good + * to be compatible + */ + struct imsm_map *map = get_imsm_map(dev, MAP_1); + __u32 chunk = __le32_to_cpu(map->blocks_per_strip); + + switch (get_imsm_raid_level(map)) { + case 1: + case 10: + if (map->num_members % map->num_domains == 0) + return 128*1024 >> 9; + else + return chunk; + case 5: + return max((__u32) 64*1024 >> 9, chunk); + default: + return 128*1024 >> 9; + } +} + +static __u32 num_stripes_per_unit_resync(struct imsm_dev *dev) +{ + struct imsm_map *lo = get_imsm_map(dev, MAP_0); + struct imsm_map *hi = get_imsm_map(dev, MAP_1); + __u32 lo_chunk = __le32_to_cpu(lo->blocks_per_strip); + __u32 hi_chunk = __le32_to_cpu(hi->blocks_per_strip); + + return max((__u32) 1, hi_chunk / lo_chunk); +} + +static __u32 num_stripes_per_unit_rebuild(struct imsm_dev *dev) +{ + struct imsm_map *lo = get_imsm_map(dev, MAP_0); + int level = get_imsm_raid_level(lo); + + if (level == 1 || level == 10) { + struct imsm_map *hi = get_imsm_map(dev, MAP_1); + + return hi->num_domains; + } else + return num_stripes_per_unit_resync(dev); +} + +static __u8 imsm_num_data_members(struct imsm_dev *dev, int second_map) +{ + /* named 'imsm_' because raid0, raid1 and raid10 + * counter-intuitively have the same number of data disks + */ + struct imsm_map *map = get_imsm_map(dev, second_map); + + 
switch (get_imsm_raid_level(map)) { + case 0: + return map->num_members; + break; + case 1: + case 10: + return map->num_members/2; + case 5: + return map->num_members - 1; + default: + dprintf("unsupported raid level\n"); + return 0; + } +} + +static __u32 parity_segment_depth(struct imsm_dev *dev) +{ + struct imsm_map *map = get_imsm_map(dev, MAP_0); + __u32 chunk = __le32_to_cpu(map->blocks_per_strip); + + switch(get_imsm_raid_level(map)) { + case 1: + case 10: + return chunk * map->num_domains; + case 5: + return chunk * map->num_members; + default: + return chunk; + } +} + +static __u32 map_migr_block(struct imsm_dev *dev, __u32 block) +{ + struct imsm_map *map = get_imsm_map(dev, MAP_1); + __u32 chunk = __le32_to_cpu(map->blocks_per_strip); + __u32 strip = block / chunk; + + switch (get_imsm_raid_level(map)) { + case 1: + case 10: { + __u32 vol_strip = (strip * map->num_domains) + 1; + __u32 vol_stripe = vol_strip / map->num_members; + + return vol_stripe * chunk + block % chunk; + } case 5: { + __u32 stripe = strip / (map->num_members - 1); + + return stripe * chunk + block % chunk; + } + default: + return 0; + } +} + +static __u64 blocks_per_migr_unit(struct intel_super *super, + struct imsm_dev *dev) +{ + /* calculate the conversion factor between per member 'blocks' + * (md/{resync,rebuild}_start) and imsm migration units, return + * 0 for the 'not migrating' and 'unsupported migration' cases + */ + if (!dev->vol.migr_state) + return 0; + + switch (migr_type(dev)) { + case MIGR_GEN_MIGR: { + struct migr_record *migr_rec = super->migr_rec; + return __le32_to_cpu(migr_rec->blocks_per_unit); + } + case MIGR_VERIFY: + case MIGR_REPAIR: + case MIGR_INIT: { + struct imsm_map *map = get_imsm_map(dev, MAP_0); + __u32 stripes_per_unit; + __u32 blocks_per_unit; + __u32 parity_depth; + __u32 migr_chunk; + __u32 block_map; + __u32 block_rel; + __u32 segment; + __u32 stripe; + __u8 disks; + + /* yes, this is really the translation of migr_units to + * per-member 
blocks in the 'resync' case + */ + stripes_per_unit = num_stripes_per_unit_resync(dev); + migr_chunk = migr_strip_blocks_resync(dev); + disks = imsm_num_data_members(dev, MAP_0); + blocks_per_unit = stripes_per_unit * migr_chunk * disks; + stripe = __le16_to_cpu(map->blocks_per_strip) * disks; + segment = blocks_per_unit / stripe; + block_rel = blocks_per_unit - segment * stripe; + parity_depth = parity_segment_depth(dev); + block_map = map_migr_block(dev, block_rel); + return block_map + parity_depth * segment; + } + case MIGR_REBUILD: { + __u32 stripes_per_unit; + __u32 migr_chunk; + + stripes_per_unit = num_stripes_per_unit_rebuild(dev); + migr_chunk = migr_strip_blocks_rebuild(dev); + return migr_chunk * stripes_per_unit; + } + case MIGR_STATE_CHANGE: + default: + return 0; + } +} + +static int imsm_level_to_layout(int level) +{ + switch (level) { + case 0: + case 1: + return 0; + case 5: + case 6: + return ALGORITHM_LEFT_ASYMMETRIC; + case 10: + return 0x102; + } + return UnSet; +} + +/******************************************************************************* + * Function: read_imsm_migr_rec + * Description: Function reads imsm migration record from last sector of disk + * Parameters: + * fd : disk descriptor + * super : metadata info + * Returns: + * 0 : success, + * -1 : fail + ******************************************************************************/ +static int read_imsm_migr_rec(int fd, struct intel_super *super) +{ + int ret_val = -1; + unsigned long long dsize; + + get_dev_size(fd, NULL, &dsize); + if (lseek64(fd, dsize - MIGR_REC_POSITION, SEEK_SET) < 0) { + pr_err("Cannot seek to anchor block: %s\n", + strerror(errno)); + goto out; + } + if (read(fd, super->migr_rec_buf, MIGR_REC_BUF_SIZE) != + MIGR_REC_BUF_SIZE) { + pr_err("Cannot read migr record block: %s\n", + strerror(errno)); + goto out; + } + ret_val = 0; + +out: + return ret_val; +} + +static struct imsm_dev *imsm_get_device_during_migration( + struct intel_super *super) +{ + + 
struct intel_dev *dv; + + for (dv = super->devlist; dv; dv = dv->next) { + if (is_gen_migration(dv->dev)) + return dv->dev; + } + return NULL; +} + +/******************************************************************************* + * Function: load_imsm_migr_rec + * Description: Function reads imsm migration record (it is stored at the last + * sector of disk) + * Parameters: + * super : imsm internal array info + * info : general array info + * Returns: + * 0 : success + * -1 : fail + * -2 : no migration in progress + ******************************************************************************/ +static int load_imsm_migr_rec(struct intel_super *super, struct mdinfo *info) +{ + struct mdinfo *sd; + struct dl *dl = NULL; + char nm[30]; + int retval = -1; + int fd = -1; + struct imsm_dev *dev; + struct imsm_map *map = NULL; + int slot = -1; + + /* find map under migration */ + dev = imsm_get_device_during_migration(super); + /* nothing to load,no migration in progress? + */ + if (dev == NULL) + return -2; + map = get_imsm_map(dev, MAP_0); + + if (info) { + for (sd = info->devs ; sd ; sd = sd->next) { + /* skip spare and failed disks + */ + if (sd->disk.raid_disk < 0) + continue; + /* read only from one of the first two slots */ + if (map) + slot = get_imsm_disk_slot(map, + sd->disk.raid_disk); + if ((map == NULL) || (slot > 1) || (slot < 0)) + continue; + + sprintf(nm, "%d:%d", sd->disk.major, sd->disk.minor); + fd = dev_open(nm, O_RDONLY); + if (fd >= 0) + break; + } + } + if (fd < 0) { + for (dl = super->disks; dl; dl = dl->next) { + /* skip spare and failed disks + */ + if (dl->index < 0) + continue; + /* read only from one of the first two slots */ + if (map) + slot = get_imsm_disk_slot(map, dl->index); + if ((map == NULL) || (slot > 1) || (slot < 0)) + continue; + sprintf(nm, "%d:%d", dl->major, dl->minor); + fd = dev_open(nm, O_RDONLY); + if (fd >= 0) + break; + } + } + if (fd < 0) + goto out; + retval = read_imsm_migr_rec(fd, super); + +out: + if (fd >= 0) + 
close(fd); + return retval; +} + +#ifndef MDASSEMBLE +/******************************************************************************* + * function: imsm_create_metadata_checkpoint_update + * Description: It creates update for checkpoint change. + * Parameters: + * super : imsm internal array info + * u : pointer to prepared update + * Returns: + * Uptate length. + * If length is equal to 0, input pointer u contains no update + ******************************************************************************/ +static int imsm_create_metadata_checkpoint_update( + struct intel_super *super, + struct imsm_update_general_migration_checkpoint **u) +{ + + int update_memory_size = 0; + + dprintf("(enter)\n"); + + if (u == NULL) + return 0; + *u = NULL; + + /* size of all update data without anchor */ + update_memory_size = + sizeof(struct imsm_update_general_migration_checkpoint); + + *u = xcalloc(1, update_memory_size); + if (*u == NULL) { + dprintf("error: cannot get memory\n"); + return 0; + } + (*u)->type = update_general_migration_checkpoint; + (*u)->curr_migr_unit = __le32_to_cpu(super->migr_rec->curr_migr_unit); + dprintf("prepared for %u\n", (*u)->curr_migr_unit); + + return update_memory_size; +} + +static void imsm_update_metadata_locally(struct supertype *st, + void *buf, int len); + +/******************************************************************************* + * Function: write_imsm_migr_rec + * Description: Function writes imsm migration record + * (at the last sector of disk) + * Parameters: + * super : imsm internal array info + * Returns: + * 0 : success + * -1 : if fail + ******************************************************************************/ +static int write_imsm_migr_rec(struct supertype *st) +{ + struct intel_super *super = st->sb; + unsigned long long dsize; + char nm[30]; + int fd = -1; + int retval = -1; + struct dl *sd; + int len; + struct imsm_update_general_migration_checkpoint *u; + struct imsm_dev *dev; + struct imsm_map *map = NULL; 
+ + /* find map under migration */ + dev = imsm_get_device_during_migration(super); + /* if no migration, write buffer anyway to clear migr_record + * on disk based on first available device + */ + if (dev == NULL) + dev = get_imsm_dev(super, super->current_vol < 0 ? 0 : + super->current_vol); + + map = get_imsm_map(dev, MAP_0); + + for (sd = super->disks ; sd ; sd = sd->next) { + int slot = -1; + + /* skip failed and spare devices */ + if (sd->index < 0) + continue; + /* write to 2 first slots only */ + if (map) + slot = get_imsm_disk_slot(map, sd->index); + if ((map == NULL) || (slot > 1) || (slot < 0)) + continue; + + sprintf(nm, "%d:%d", sd->major, sd->minor); + fd = dev_open(nm, O_RDWR); + if (fd < 0) + continue; + get_dev_size(fd, NULL, &dsize); + if (lseek64(fd, dsize - MIGR_REC_POSITION, SEEK_SET) < 0) { + pr_err("Cannot seek to anchor block: %s\n", + strerror(errno)); + goto out; + } + if (write(fd, super->migr_rec_buf, MIGR_REC_BUF_SIZE) != + MIGR_REC_BUF_SIZE) { + pr_err("Cannot write migr record block: %s\n", + strerror(errno)); + goto out; + } + close(fd); + fd = -1; + } + /* update checkpoint information in metadata */ + len = imsm_create_metadata_checkpoint_update(super, &u); + + if (len <= 0) { + dprintf("imsm: Cannot prepare update\n"); + goto out; + } + /* update metadata locally */ + imsm_update_metadata_locally(st, u, len); + /* and possibly remotely */ + if (st->update_tail) { + append_metadata_update(st, u, len); + /* during reshape we do all work inside metadata handler + * manage_reshape(), so metadata update has to be triggered + * inside it + */ + flush_metadata_updates(st); + st->update_tail = &st->updates; + } else + free(u); + + retval = 0; + out: + if (fd >= 0) + close(fd); + return retval; +} +#endif /* MDASSEMBLE */ + +/* spare/missing disks activations are not allowed when + * array/container performs reshape operation, because + * all arrays in the container work on the same disk set + */ +int imsm_reshape_blocks_arrays_changes(struct 
intel_super *super) +{ + int rv = 0; + struct intel_dev *i_dev; + struct imsm_dev *dev; + + /* check whole container + */ + for (i_dev = super->devlist; i_dev; i_dev = i_dev->next) { + dev = i_dev->dev; + if (is_gen_migration(dev)) { + /* No repair during any migration in container + */ + rv = 1; + break; + } + } + return rv; +} +static unsigned long long imsm_component_size_aligment_check(int level, + int chunk_size, + unsigned long long component_size) +{ + unsigned int component_size_alligment; + + /* check component size aligment + */ + component_size_alligment = component_size % (chunk_size/512); + + dprintf("(Level: %i, chunk_size = %i, component_size = %llu), component_size_alligment = %u\n", + level, chunk_size, component_size, + component_size_alligment); + + if (component_size_alligment && (level != 1) && (level != UnSet)) { + dprintf("imsm: reported component size alligned from %llu ", + component_size); + component_size -= component_size_alligment; + dprintf_cont("to %llu (%i).\n", + component_size, component_size_alligment); + } + + return component_size; +} + +static void getinfo_super_imsm_volume(struct supertype *st, struct mdinfo *info, char *dmap) +{ + struct intel_super *super = st->sb; + struct migr_record *migr_rec = super->migr_rec; + struct imsm_dev *dev = get_imsm_dev(super, super->current_vol); + struct imsm_map *map = get_imsm_map(dev, MAP_0); + struct imsm_map *prev_map = get_imsm_map(dev, MAP_1); + struct imsm_map *map_to_analyse = map; + struct dl *dl; + int map_disks = info->array.raid_disks; + + memset(info, 0, sizeof(*info)); + if (prev_map) + map_to_analyse = prev_map; + + dl = super->current_disk; + + info->container_member = super->current_vol; + info->array.raid_disks = map->num_members; + info->array.level = get_imsm_raid_level(map_to_analyse); + info->array.layout = imsm_level_to_layout(info->array.level); + info->array.md_minor = -1; + info->array.ctime = 0; + info->array.utime = 0; + info->array.chunk_size = + 
__le16_to_cpu(map_to_analyse->blocks_per_strip) << 9; + info->array.state = !dev->vol.dirty; + info->custom_array_size = __le32_to_cpu(dev->size_high); + info->custom_array_size <<= 32; + info->custom_array_size |= __le32_to_cpu(dev->size_low); + info->recovery_blocked = imsm_reshape_blocks_arrays_changes(st->sb); + + if (is_gen_migration(dev)) { + info->reshape_active = 1; + info->new_level = get_imsm_raid_level(map); + info->new_layout = imsm_level_to_layout(info->new_level); + info->new_chunk = __le16_to_cpu(map->blocks_per_strip) << 9; + info->delta_disks = map->num_members - prev_map->num_members; + if (info->delta_disks) { + /* this needs to be applied to every array + * in the container. + */ + info->reshape_active = CONTAINER_RESHAPE; + } + /* The shape information that we give to md might have to be + * modified to cope with md's requirement for reshaping arrays. + * For example, when reshaping a RAID0, md requires it to be + * presented as a degraded RAID4. + * Also if a RAID0 is migrating to a RAID5 we need to specify + * the array as already being RAID5, but the 'before' layout + * is a RAID4-like layout. 
+ */ + switch (info->array.level) { + case 0: + switch(info->new_level) { + case 0: + /* conversion is happening as RAID4 */ + info->array.level = 4; + info->array.raid_disks += 1; + break; + case 5: + /* conversion is happening as RAID5 */ + info->array.level = 5; + info->array.layout = ALGORITHM_PARITY_N; + info->delta_disks -= 1; + break; + default: + /* FIXME error message */ + info->array.level = UnSet; + break; + } + break; + } + } else { + info->new_level = UnSet; + info->new_layout = UnSet; + info->new_chunk = info->array.chunk_size; + info->delta_disks = 0; + } + + if (dl) { + info->disk.major = dl->major; + info->disk.minor = dl->minor; + info->disk.number = dl->index; + info->disk.raid_disk = get_imsm_disk_slot(map_to_analyse, + dl->index); + } + + info->data_offset = pba_of_lba0(map_to_analyse); + info->component_size = blocks_per_member(map_to_analyse); + + info->component_size = imsm_component_size_aligment_check( + info->array.level, + info->array.chunk_size, + info->component_size); + + memset(info->uuid, 0, sizeof(info->uuid)); + info->recovery_start = MaxSector; + + info->reshape_progress = 0; + info->resync_start = MaxSector; + if ((map_to_analyse->map_state == IMSM_T_STATE_UNINITIALIZED || + dev->vol.dirty) && + imsm_reshape_blocks_arrays_changes(super) == 0) { + info->resync_start = 0; + } + if (dev->vol.migr_state) { + switch (migr_type(dev)) { + case MIGR_REPAIR: + case MIGR_INIT: { + __u64 blocks_per_unit = blocks_per_migr_unit(super, + dev); + __u64 units = __le32_to_cpu(dev->vol.curr_migr_unit); + + info->resync_start = blocks_per_unit * units; + break; + } + case MIGR_GEN_MIGR: { + __u64 blocks_per_unit = blocks_per_migr_unit(super, + dev); + __u64 units = __le32_to_cpu(migr_rec->curr_migr_unit); + unsigned long long array_blocks; + int used_disks; + + if (__le32_to_cpu(migr_rec->ascending_migr) && + (units < + (__le32_to_cpu(migr_rec->num_migr_units)-1)) && + (super->migr_rec->rec_status == + __cpu_to_le32(UNIT_SRC_IN_CP_AREA))) + 
units++; + + info->reshape_progress = blocks_per_unit * units; + + dprintf("IMSM: General Migration checkpoint : %llu (%llu) -> read reshape progress : %llu\n", + (unsigned long long)units, + (unsigned long long)blocks_per_unit, + info->reshape_progress); + + used_disks = imsm_num_data_members(dev, MAP_1); + if (used_disks > 0) { + array_blocks = blocks_per_member(map) * + used_disks; + /* round array size down to closest MB + */ + info->custom_array_size = (array_blocks + >> SECT_PER_MB_SHIFT) + << SECT_PER_MB_SHIFT; + } + } + case MIGR_VERIFY: + /* we could emulate the checkpointing of + * 'sync_action=check' migrations, but for now + * we just immediately complete them + */ + case MIGR_REBUILD: + /* this is handled by container_content_imsm() */ + case MIGR_STATE_CHANGE: + /* FIXME handle other migrations */ + default: + /* we are not dirty, so... */ + info->resync_start = MaxSector; + } + } + + strncpy(info->name, (char *) dev->volume, MAX_RAID_SERIAL_LEN); + info->name[MAX_RAID_SERIAL_LEN] = 0; + + info->array.major_version = -1; + info->array.minor_version = -2; + sprintf(info->text_version, "/%s/%d", st->container_devnm, info->container_member); + info->safe_mode_delay = 4000; /* 4 secs like the Matrix driver */ + uuid_from_super_imsm(st, info->uuid); + + if (dmap) { + int i, j; + for (i=0; i<map_disks; i++) { + dmap[i] = 0; + if (i < info->array.raid_disks) { + struct imsm_disk *dsk; + j = get_imsm_disk_idx(dev, i, MAP_X); + dsk = get_imsm_disk(super, j); + if (dsk && (dsk->status & CONFIGURED_DISK)) + dmap[i] = 1; + } + } + } +} + +static __u8 imsm_check_degraded(struct intel_super *super, struct imsm_dev *dev, + int failed, int look_in_map); + +static int imsm_count_failed(struct intel_super *super, struct imsm_dev *dev, + int look_in_map); + +#ifndef MDASSEMBLE +static void manage_second_map(struct intel_super *super, struct imsm_dev *dev) +{ + if (is_gen_migration(dev)) { + int failed; + __u8 map_state; + struct imsm_map *map2 = get_imsm_map(dev, MAP_1); + + failed = imsm_count_failed(super, dev, 
MAP_1); + map_state = imsm_check_degraded(super, dev, failed, MAP_1); + if (map2->map_state != map_state) { + map2->map_state = map_state; + super->updates_pending++; + } + } +} +#endif + +static struct imsm_disk *get_imsm_missing(struct intel_super *super, __u8 index) +{ + struct dl *d; + + for (d = super->missing; d; d = d->next) + if (d->index == index) + return &d->disk; + return NULL; +} + +static void getinfo_super_imsm(struct supertype *st, struct mdinfo *info, char *map) +{ + struct intel_super *super = st->sb; + struct imsm_disk *disk; + int map_disks = info->array.raid_disks; + int max_enough = -1; + int i; + struct imsm_super *mpb; + + if (super->current_vol >= 0) { + getinfo_super_imsm_volume(st, info, map); + return; + } + memset(info, 0, sizeof(*info)); + + /* Set raid_disks to zero so that Assemble will always pull in valid + * spares + */ + info->array.raid_disks = 0; + info->array.level = LEVEL_CONTAINER; + info->array.layout = 0; + info->array.md_minor = -1; + info->array.ctime = 0; /* N/A for imsm */ + info->array.utime = 0; + info->array.chunk_size = 0; + + info->disk.major = 0; + info->disk.minor = 0; + info->disk.raid_disk = -1; + info->reshape_active = 0; + info->array.major_version = -1; + info->array.minor_version = -2; + strcpy(info->text_version, "imsm"); + info->safe_mode_delay = 0; + info->disk.number = -1; + info->disk.state = 0; + info->name[0] = 0; + info->recovery_start = MaxSector; + info->recovery_blocked = imsm_reshape_blocks_arrays_changes(st->sb); + + /* do we have the all the insync disks that we expect? */ + mpb = super->anchor; + + for (i = 0; i < mpb->num_raid_devs; i++) { + struct imsm_dev *dev = get_imsm_dev(super, i); + int failed, enough, j, missing = 0; + struct imsm_map *map; + __u8 state; + + failed = imsm_count_failed(super, dev, MAP_0); + state = imsm_check_degraded(super, dev, failed, MAP_0); + map = get_imsm_map(dev, MAP_0); + + /* any newly missing disks? 
+ * (catches single-degraded vs double-degraded) + */ + for (j = 0; j < map->num_members; j++) { + __u32 ord = get_imsm_ord_tbl_ent(dev, j, MAP_0); + __u32 idx = ord_to_idx(ord); + + if (!(ord & IMSM_ORD_REBUILD) && + get_imsm_missing(super, idx)) { + missing = 1; + break; + } + } + + if (state == IMSM_T_STATE_FAILED) + enough = -1; + else if (state == IMSM_T_STATE_DEGRADED && + (state != map->map_state || missing)) + enough = 0; + else /* we're normal, or already degraded */ + enough = 1; + if (is_gen_migration(dev) && missing) { + /* during general migration we need all disks + * that process is running on. + * No new missing disk is allowed. + */ + max_enough = -1; + enough = -1; + /* no more checks necessary + */ + break; + } + /* in the missing/failed disk case check to see + * if at least one array is runnable + */ + max_enough = max(max_enough, enough); + } + dprintf("enough: %d\n", max_enough); + info->container_enough = max_enough; + + if (super->disks) { + __u32 reserved = imsm_reserved_sectors(super, super->disks); + + disk = &super->disks->disk; + info->data_offset = total_blocks(&super->disks->disk) - reserved; + info->component_size = reserved; + info->disk.state = is_configured(disk) ? (1 << MD_DISK_ACTIVE) : 0; + /* we don't change info->disk.raid_disk here because + * this state will be finalized in mdmon after we have + * found the 'most fresh' version of the metadata + */ + info->disk.state |= is_failed(disk) ? (1 << MD_DISK_FAULTY) : 0; + info->disk.state |= is_spare(disk) ? 
0 : (1 << MD_DISK_SYNC); + } + + /* only call uuid_from_super_imsm when this disk is part of a populated container, + * ->compare_super may have updated the 'num_raid_devs' field for spares + */ + if (info->disk.state & (1 << MD_DISK_SYNC) || super->anchor->num_raid_devs) + uuid_from_super_imsm(st, info->uuid); + else + memcpy(info->uuid, uuid_zero, sizeof(uuid_zero)); + + /* I don't know how to compute 'map' on imsm, so use safe default */ + if (map) { + int i; + for (i = 0; i < map_disks; i++) + map[i] = 1; + } + +} + +/* allocates memory and fills disk in mdinfo structure + * for each disk in array */ +struct mdinfo *getinfo_super_disks_imsm(struct supertype *st) +{ + struct mdinfo *mddev = NULL; + struct intel_super *super = st->sb; + struct imsm_disk *disk; + int count = 0; + struct dl *dl; + if (!super || !super->disks) + return NULL; + dl = super->disks; + mddev = xcalloc(1, sizeof(*mddev)); + while (dl) { + struct mdinfo *tmp; + disk = &dl->disk; + tmp = xcalloc(1, sizeof(*tmp)); + if (mddev->devs) + tmp->next = mddev->devs; + mddev->devs = tmp; + tmp->disk.number = count++; + tmp->disk.major = dl->major; + tmp->disk.minor = dl->minor; + tmp->disk.state = is_configured(disk) ? + (1 << MD_DISK_ACTIVE) : 0; + tmp->disk.state |= is_failed(disk) ? (1 << MD_DISK_FAULTY) : 0; + tmp->disk.state |= is_spare(disk) ? 0 : (1 << MD_DISK_SYNC); + tmp->disk.raid_disk = -1; + dl = dl->next; + } + return mddev; +} + +static int update_super_imsm(struct supertype *st, struct mdinfo *info, + char *update, char *devname, int verbose, + int uuid_set, char *homehost) +{ + /* For 'assemble' and 'force' we need to return non-zero if any + * change was made. For others, the return value is ignored. + * Update options are: + * force-one : This device looks a bit old but needs to be included, + * update age info appropriately. + * assemble: clear any 'faulty' flag to allow this device to + * be assembled. 
+ * force-array: Array is degraded but being forced, mark it clean + * if that will be needed to assemble it. + * + * newdev: not used ???? + * grow: Array has gained a new device - this is currently for + * linear only + * resync: mark as dirty so a resync will happen. + * name: update the name - preserving the homehost + * uuid: Change the uuid of the array to match what is given + * + * Following are not relevant for this imsm: + * sparc2.2 : update from old dodgy metadata + * super-minor: change the preferred_minor number + * summaries: update redundant counters. + * homehost: update the recorded homehost + * _reshape_progress: record new reshape_progress position. + */ + int rv = 1; + struct intel_super *super = st->sb; + struct imsm_super *mpb; + + /* we can only update container info */ + if (!super || super->current_vol >= 0 || !super->anchor) + return 1; + + mpb = super->anchor; + + if (strcmp(update, "uuid") == 0) { + /* We take this to mean that the family_num should be updated. + * However that is much smaller than the uuid so we cannot really + * allow an explicit uuid to be given. And it is hard to reliably + * know if one was. + * So if !uuid_set we know the current uuid is random and just use + * the first 'int' and copy it to the other 3 positions. + * Otherwise we require the 4 'int's to be the same as would be the + * case if we are using a random uuid. So an explicit uuid will be + * accepted as long as all four ints are the same... which shouldn't hurt + */ + if (!uuid_set) { + info->uuid[1] = info->uuid[2] = info->uuid[3] = info->uuid[0]; + rv = 0; + } else { + if (info->uuid[0] != info->uuid[1] || + info->uuid[1] != info->uuid[2] || + info->uuid[2] != info->uuid[3]) + rv = -1; + else + rv = 0; + } + if (rv == 0) + mpb->orig_family_num = info->uuid[0]; + } else if (strcmp(update, "assemble") == 0) + rv = 0; + else + rv = -1; + + /* successful update? 
recompute checksum */ + if (rv == 0) + mpb->check_sum = __le32_to_cpu(__gen_imsm_checksum(mpb)); + + return rv; +} + +static size_t disks_to_mpb_size(int disks) +{ + size_t size; + + size = sizeof(struct imsm_super); + size += (disks - 1) * sizeof(struct imsm_disk); + size += 2 * sizeof(struct imsm_dev); + /* up to 2 maps per raid device (-2 for imsm_maps in imsm_dev */ + size += (4 - 2) * sizeof(struct imsm_map); + /* 4 possible disk_ord_tbl's */ + size += 4 * (disks - 1) * sizeof(__u32); + + return size; +} + +static __u64 avail_size_imsm(struct supertype *st, __u64 devsize, + unsigned long long data_offset) +{ + if (devsize < (MPB_SECTOR_CNT + IMSM_RESERVED_SECTORS)) + return 0; + + return devsize - (MPB_SECTOR_CNT + IMSM_RESERVED_SECTORS); +} + +static void free_devlist(struct intel_super *super) +{ + struct intel_dev *dv; + + while (super->devlist) { + dv = super->devlist->next; + free(super->devlist->dev); + free(super->devlist); + super->devlist = dv; + } +} + +static void imsm_copy_dev(struct imsm_dev *dest, struct imsm_dev *src) +{ + memcpy(dest, src, sizeof_imsm_dev(src, 0)); +} + +static int compare_super_imsm(struct supertype *st, struct supertype *tst) +{ + /* + * return: + * 0 same, or first was empty, and second was copied + * 1 second had wrong number + * 2 wrong uuid + * 3 wrong other info + */ + struct intel_super *first = st->sb; + struct intel_super *sec = tst->sb; + + if (!first) { + st->sb = tst->sb; + tst->sb = NULL; + return 0; + } + /* in platform dependent environment test if the disks + * use the same Intel hba + * If not on Intel hba at all, allow anything. 
+ */ + if (!check_env("IMSM_NO_PLATFORM") && first->hba && sec->hba) { + if (first->hba->type != sec->hba->type) { + fprintf(stderr, + "HBAs of devices do not match %s != %s\n", + get_sys_dev_type(first->hba->type), + get_sys_dev_type(sec->hba->type)); + return 3; + } + if (first->orom != sec->orom) { + fprintf(stderr, + "HBAs of devices do not match %s != %s\n", + first->hba->pci_id, sec->hba->pci_id); + return 3; + } + } + + /* if an anchor does not have num_raid_devs set then it is a free + * floating spare + */ + if (first->anchor->num_raid_devs > 0 && + sec->anchor->num_raid_devs > 0) { + /* Determine if these disks might ever have been + * related. Further disambiguation can only take place + * in load_super_imsm_all + */ + __u32 first_family = first->anchor->orig_family_num; + __u32 sec_family = sec->anchor->orig_family_num; + + if (memcmp(first->anchor->sig, sec->anchor->sig, + MAX_SIGNATURE_LENGTH) != 0) + return 3; + + if (first_family == 0) + first_family = first->anchor->family_num; + if (sec_family == 0) + sec_family = sec->anchor->family_num; + + if (first_family != sec_family) + return 3; + + } + + /* if 'first' is a spare promote it to a populated mpb with sec's + * family number + */ + if (first->anchor->num_raid_devs == 0 && + sec->anchor->num_raid_devs > 0) { + int i; + struct intel_dev *dv; + struct imsm_dev *dev; + + /* we need to copy raid device info from sec if an allocation + * fails here we don't associate the spare + */ + for (i = 0; i < sec->anchor->num_raid_devs; i++) { + dv = xmalloc(sizeof(*dv)); + dev = xmalloc(sizeof_imsm_dev(get_imsm_dev(sec, i), 1)); + dv->dev = dev; + dv->index = i; + dv->next = first->devlist; + first->devlist = dv; + } + if (i < sec->anchor->num_raid_devs) { + /* allocation failure */ + free_devlist(first); + pr_err("imsm: failed to associate spare\n"); + return 3; + } + first->anchor->num_raid_devs = sec->anchor->num_raid_devs; + first->anchor->orig_family_num = sec->anchor->orig_family_num; + 
first->anchor->family_num = sec->anchor->family_num; + memcpy(first->anchor->sig, sec->anchor->sig, MAX_SIGNATURE_LENGTH); + for (i = 0; i < sec->anchor->num_raid_devs; i++) + imsm_copy_dev(get_imsm_dev(first, i), get_imsm_dev(sec, i)); + } + + return 0; +} + +static void fd2devname(int fd, char *name) +{ + struct stat st; + char path[256]; + char dname[PATH_MAX]; + char *nm; + int rv; + + name[0] = '\0'; + if (fstat(fd, &st) != 0) + return; + sprintf(path, "/sys/dev/block/%d:%d", + major(st.st_rdev), minor(st.st_rdev)); + + rv = readlink(path, dname, sizeof(dname)-1); + if (rv <= 0) + return; + + dname[rv] = '\0'; + nm = strrchr(dname, '/'); + if (nm) { + nm++; + snprintf(name, MAX_RAID_SERIAL_LEN, "/dev/%s", nm); + } +} + +extern int scsi_get_serial(int fd, void *buf, size_t buf_len); + +static int imsm_read_serial(int fd, char *devname, + __u8 serial[MAX_RAID_SERIAL_LEN]) +{ + unsigned char scsi_serial[255]; + int rv; + int rsp_len; + int len; + char *dest; + char *src; + char *rsp_buf; + int i; + + memset(scsi_serial, 0, sizeof(scsi_serial)); + + rv = scsi_get_serial(fd, scsi_serial, sizeof(scsi_serial)); + + if (rv && check_env("IMSM_DEVNAME_AS_SERIAL")) { + memset(serial, 0, MAX_RAID_SERIAL_LEN); + fd2devname(fd, (char *) serial); + return 0; + } + + if (rv != 0) { + if (devname) + pr_err("Failed to retrieve serial for %s\n", + devname); + return rv; + } + + rsp_len = scsi_serial[3]; + if (!rsp_len) { + if (devname) + pr_err("Failed to retrieve serial for %s\n", + devname); + return 2; + } + rsp_buf = (char *) &scsi_serial[4]; + + /* trim all whitespace and non-printable characters and convert + * ':' to ';' + */ + for (i = 0, dest = rsp_buf; i < rsp_len; i++) { + src = &rsp_buf[i]; + if (*src > 0x20) { + /* ':' is reserved for use in placeholder serial + * numbers for missing disks + */ + if (*src == ':') + *dest++ = ';'; + else + *dest++ = *src; + } + } + len = dest - rsp_buf; + dest = rsp_buf; + + /* truncate leading characters */ + if (len > 
MAX_RAID_SERIAL_LEN) { + dest += len - MAX_RAID_SERIAL_LEN; + len = MAX_RAID_SERIAL_LEN; + } + + memset(serial, 0, MAX_RAID_SERIAL_LEN); + memcpy(serial, dest, len); + + return 0; +} + +static int serialcmp(__u8 *s1, __u8 *s2) +{ + return strncmp((char *) s1, (char *) s2, MAX_RAID_SERIAL_LEN); +} + +static void serialcpy(__u8 *dest, __u8 *src) +{ + strncpy((char *) dest, (char *) src, MAX_RAID_SERIAL_LEN); +} + +static struct dl *serial_to_dl(__u8 *serial, struct intel_super *super) +{ + struct dl *dl; + + for (dl = super->disks; dl; dl = dl->next) + if (serialcmp(dl->serial, serial) == 0) + break; + + return dl; +} + +static struct imsm_disk * +__serial_to_disk(__u8 *serial, struct imsm_super *mpb, int *idx) +{ + int i; + + for (i = 0; i < mpb->num_disks; i++) { + struct imsm_disk *disk = __get_imsm_disk(mpb, i); + + if (serialcmp(disk->serial, serial) == 0) { + if (idx) + *idx = i; + return disk; + } + } + + return NULL; +} + +static int +load_imsm_disk(int fd, struct intel_super *super, char *devname, int keep_fd) +{ + struct imsm_disk *disk; + struct dl *dl; + struct stat stb; + int rv; + char name[40]; + __u8 serial[MAX_RAID_SERIAL_LEN]; + + rv = imsm_read_serial(fd, devname, serial); + + if (rv != 0) + return 2; + + dl = xcalloc(1, sizeof(*dl)); + + fstat(fd, &stb); + dl->major = major(stb.st_rdev); + dl->minor = minor(stb.st_rdev); + dl->next = super->disks; + dl->fd = keep_fd ? fd : -1; + assert(super->disks == NULL); + super->disks = dl; + serialcpy(dl->serial, serial); + dl->index = -2; + dl->e = NULL; + fd2devname(fd, name); + if (devname) + dl->devname = xstrdup(devname); + else + dl->devname = xstrdup(name); + + /* look up this disk's index in the current anchor */ + disk = __serial_to_disk(dl->serial, super->anchor, &dl->index); + if (disk) { + dl->disk = *disk; + /* only set index on disks that are a member of a + * populated contianer, i.e. 
one with raid_devs + */ + if (is_failed(&dl->disk)) + dl->index = -2; + else if (is_spare(&dl->disk)) + dl->index = -1; + } + + return 0; +} + +#ifndef MDASSEMBLE +/* When migrating map0 contains the 'destination' state while map1 + * contains the current state. When not migrating map0 contains the + * current state. This routine assumes that map[0].map_state is set to + * the current array state before being called. + * + * Migration is indicated by one of the following states + * 1/ Idle (migr_state=0 map0state=normal||unitialized||degraded||failed) + * 2/ Initialize (migr_state=1 migr_type=MIGR_INIT map0state=normal + * map1state=unitialized) + * 3/ Repair (Resync) (migr_state=1 migr_type=MIGR_REPAIR map0state=normal + * map1state=normal) + * 4/ Rebuild (migr_state=1 migr_type=MIGR_REBUILD map0state=normal + * map1state=degraded) + * 5/ Migration (mig_state=1 migr_type=MIGR_GEN_MIGR map0state=normal + * map1state=normal) + */ +static void migrate(struct imsm_dev *dev, struct intel_super *super, + __u8 to_state, int migr_type) +{ + struct imsm_map *dest; + struct imsm_map *src = get_imsm_map(dev, MAP_0); + + dev->vol.migr_state = 1; + set_migr_type(dev, migr_type); + dev->vol.curr_migr_unit = 0; + dest = get_imsm_map(dev, MAP_1); + + /* duplicate and then set the target end state in map[0] */ + memcpy(dest, src, sizeof_imsm_map(src)); + if ((migr_type == MIGR_REBUILD) || + (migr_type == MIGR_GEN_MIGR)) { + __u32 ord; + int i; + + for (i = 0; i < src->num_members; i++) { + ord = __le32_to_cpu(src->disk_ord_tbl[i]); + set_imsm_ord_tbl_ent(src, i, ord_to_idx(ord)); + } + } + + if (migr_type == MIGR_GEN_MIGR) + /* Clear migration record */ + memset(super->migr_rec, 0, sizeof(struct migr_record)); + + src->map_state = to_state; +} + +static void end_migration(struct imsm_dev *dev, struct intel_super *super, + __u8 map_state) +{ + struct imsm_map *map = get_imsm_map(dev, MAP_0); + struct imsm_map *prev = get_imsm_map(dev, dev->vol.migr_state == 0 ? 
+ MAP_0 : MAP_1); + int i, j; + + /* merge any IMSM_ORD_REBUILD bits that were not successfully + * completed in the last migration. + * + * FIXME add support for raid-level-migration + */ + if ((map_state != map->map_state) && (is_gen_migration(dev) == 0) && + (prev->map_state != IMSM_T_STATE_UNINITIALIZED)) { + /* when final map state is other than expected + * merge maps (not for migration) + */ + int failed; + + for (i = 0; i < prev->num_members; i++) + for (j = 0; j < map->num_members; j++) + /* during online capacity expansion + * disks position can be changed + * if takeover is used + */ + if (ord_to_idx(map->disk_ord_tbl[j]) == + ord_to_idx(prev->disk_ord_tbl[i])) { + map->disk_ord_tbl[j] |= + prev->disk_ord_tbl[i]; + break; + } + failed = imsm_count_failed(super, dev, MAP_0); + map_state = imsm_check_degraded(super, dev, failed, MAP_0); + } + + dev->vol.migr_state = 0; + set_migr_type(dev, 0); + dev->vol.curr_migr_unit = 0; + map->map_state = map_state; +} +#endif + +static int parse_raid_devices(struct intel_super *super) +{ + int i; + struct imsm_dev *dev_new; + size_t len, len_migr; + size_t max_len = 0; + size_t space_needed = 0; + struct imsm_super *mpb = super->anchor; + + for (i = 0; i < super->anchor->num_raid_devs; i++) { + struct imsm_dev *dev_iter = __get_imsm_dev(super->anchor, i); + struct intel_dev *dv; + + len = sizeof_imsm_dev(dev_iter, 0); + len_migr = sizeof_imsm_dev(dev_iter, 1); + if (len_migr > len) + space_needed += len_migr - len; + + dv = xmalloc(sizeof(*dv)); + if (max_len < len_migr) + max_len = len_migr; + if (max_len > len_migr) + space_needed += max_len - len_migr; + dev_new = xmalloc(max_len); + imsm_copy_dev(dev_new, dev_iter); + dv->dev = dev_new; + dv->index = i; + dv->next = super->devlist; + super->devlist = dv; + } + + /* ensure that super->buf is large enough when all raid devices + * are migrating + */ + if (__le32_to_cpu(mpb->mpb_size) + space_needed > super->len) { + void *buf; + + len = 
ROUND_UP(__le32_to_cpu(mpb->mpb_size) + space_needed, 512); + if (posix_memalign(&buf, 512, len) != 0) + return 1; + + memcpy(buf, super->buf, super->len); + memset(buf + super->len, 0, len - super->len); + free(super->buf); + super->buf = buf; + super->len = len; + } + + return 0; +} + +/* retrieve a pointer to the bbm log which starts after all raid devices */ +struct bbm_log *__get_imsm_bbm_log(struct imsm_super *mpb) +{ + void *ptr = NULL; + + if (__le32_to_cpu(mpb->bbm_log_size)) { + ptr = mpb; + ptr += mpb->mpb_size - __le32_to_cpu(mpb->bbm_log_size); + } + + return ptr; +} + +/******************************************************************************* + * Function: check_mpb_migr_compatibility + * Description: Function checks for unsupported migration features: + * - migration optimization area (pba_of_lba0) + * - descending reshape (ascending_migr) + * Parameters: + * super : imsm metadata information + * Returns: + * 0 : migration is compatible + * -1 : migration is not compatible + ******************************************************************************/ +int check_mpb_migr_compatibility(struct intel_super *super) +{ + struct imsm_map *map0, *map1; + struct migr_record *migr_rec = super->migr_rec; + int i; + + for (i = 0; i < super->anchor->num_raid_devs; i++) { + struct imsm_dev *dev_iter = __get_imsm_dev(super->anchor, i); + + if (dev_iter && + dev_iter->vol.migr_state == 1 && + dev_iter->vol.migr_type == MIGR_GEN_MIGR) { + /* This device is migrating */ + map0 = get_imsm_map(dev_iter, MAP_0); + map1 = get_imsm_map(dev_iter, MAP_1); + if (pba_of_lba0(map0) != pba_of_lba0(map1)) + /* migration optimization area was used */ + return -1; + if (migr_rec->ascending_migr == 0 + && migr_rec->dest_depth_per_unit > 0) + /* descending reshape not supported yet */ + return -1; + } + } + return 0; +} + +static void __free_imsm(struct intel_super *super, int free_disks); + +/* load_imsm_mpb - read matrix metadata + * allocates super->mpb to be freed by 
free_imsm + */ +static int load_imsm_mpb(int fd, struct intel_super *super, char *devname) +{ + unsigned long long dsize; + unsigned long long sectors; + struct stat; + struct imsm_super *anchor; + __u32 check_sum; + + get_dev_size(fd, NULL, &dsize); + if (dsize < 1024) { + if (devname) + pr_err("%s: device to small for imsm\n", + devname); + return 1; + } + + if (lseek64(fd, dsize - (512 * 2), SEEK_SET) < 0) { + if (devname) + pr_err("Cannot seek to anchor block on %s: %s\n", + devname, strerror(errno)); + return 1; + } + + if (posix_memalign((void**)&anchor, 512, 512) != 0) { + if (devname) + pr_err("Failed to allocate imsm anchor buffer on %s\n", devname); + return 1; + } + if (read(fd, anchor, 512) != 512) { + if (devname) + pr_err("Cannot read anchor block on %s: %s\n", + devname, strerror(errno)); + free(anchor); + return 1; + } + + if (strncmp((char *) anchor->sig, MPB_SIGNATURE, MPB_SIG_LEN) != 0) { + if (devname) + pr_err("no IMSM anchor on %s\n", devname); + free(anchor); + return 2; + } + + __free_imsm(super, 0); + /* reload capability and hba */ + + /* capability and hba must be updated with new super allocation */ + find_intel_hba_capability(fd, super, devname); + super->len = ROUND_UP(anchor->mpb_size, 512); + if (posix_memalign(&super->buf, 512, super->len) != 0) { + if (devname) + pr_err("unable to allocate %zu byte mpb buffer\n", + super->len); + free(anchor); + return 2; + } + memcpy(super->buf, anchor, 512); + + sectors = mpb_sectors(anchor) - 1; + free(anchor); + + if (posix_memalign(&super->migr_rec_buf, 512, MIGR_REC_BUF_SIZE) != 0) { + pr_err("could not allocate migr_rec buffer\n"); + free(super->buf); + return 2; + } + super->clean_migration_record_by_mdmon = 0; + + if (!sectors) { + check_sum = __gen_imsm_checksum(super->anchor); + if (check_sum != __le32_to_cpu(super->anchor->check_sum)) { + if (devname) + pr_err("IMSM checksum %x != %x on %s\n", + check_sum, + __le32_to_cpu(super->anchor->check_sum), + devname); + return 2; + } + + return 
0; + } + + /* read the extended mpb */ + if (lseek64(fd, dsize - (512 * (2 + sectors)), SEEK_SET) < 0) { + if (devname) + pr_err("Cannot seek to extended mpb on %s: %s\n", + devname, strerror(errno)); + return 1; + } + + if ((unsigned)read(fd, super->buf + 512, super->len - 512) != super->len - 512) { + if (devname) + pr_err("Cannot read extended mpb on %s: %s\n", + devname, strerror(errno)); + return 2; + } + + check_sum = __gen_imsm_checksum(super->anchor); + if (check_sum != __le32_to_cpu(super->anchor->check_sum)) { + if (devname) + pr_err("IMSM checksum %x != %x on %s\n", + check_sum, __le32_to_cpu(super->anchor->check_sum), + devname); + return 3; + } + + /* FIXME the BBM log is disk specific so we cannot use this global + * buffer for all disks. Ok for now since we only look at the global + * bbm_log_size parameter to gate assembly + */ + super->bbm_log = __get_imsm_bbm_log(super->anchor); + + return 0; +} + +static int read_imsm_migr_rec(int fd, struct intel_super *super); + +/* clears hi bits in metadata if MPB_ATTRIB_2TB_DISK not set */ +static void clear_hi(struct intel_super *super) +{ + struct imsm_super *mpb = super->anchor; + int i, n; + if (mpb->attributes & MPB_ATTRIB_2TB_DISK) + return; + for (i = 0; i < mpb->num_disks; ++i) { + struct imsm_disk *disk = &mpb->disk[i]; + disk->total_blocks_hi = 0; + } + for (i = 0; i < mpb->num_raid_devs; ++i) { + struct imsm_dev *dev = get_imsm_dev(super, i); + if (!dev) + return; + for (n = 0; n < 2; ++n) { + struct imsm_map *map = get_imsm_map(dev, n); + if (!map) + continue; + map->pba_of_lba0_hi = 0; + map->blocks_per_member_hi = 0; + map->num_data_stripes_hi = 0; + } + } +} + +static int +load_and_parse_mpb(int fd, struct intel_super *super, char *devname, int keep_fd) +{ + int err; + + err = load_imsm_mpb(fd, super, devname); + if (err) + return err; + err = load_imsm_disk(fd, super, devname, keep_fd); + if (err) + return err; + err = parse_raid_devices(super); + clear_hi(super); + return err; +} + +static 
void __free_imsm_disk(struct dl *d) +{ + if (d->fd >= 0) + close(d->fd); + if (d->devname) + free(d->devname); + if (d->e) + free(d->e); + free(d); + +} + +static void free_imsm_disks(struct intel_super *super) +{ + struct dl *d; + + while (super->disks) { + d = super->disks; + super->disks = d->next; + __free_imsm_disk(d); + } + while (super->disk_mgmt_list) { + d = super->disk_mgmt_list; + super->disk_mgmt_list = d->next; + __free_imsm_disk(d); + } + while (super->missing) { + d = super->missing; + super->missing = d->next; + __free_imsm_disk(d); + } + +} + +/* free all the pieces hanging off of a super pointer */ +static void __free_imsm(struct intel_super *super, int free_disks) +{ + struct intel_hba *elem, *next; + + if (super->buf) { + free(super->buf); + super->buf = NULL; + } + /* unlink capability description */ + super->orom = NULL; + if (super->migr_rec_buf) { + free(super->migr_rec_buf); + super->migr_rec_buf = NULL; + } + if (free_disks) + free_imsm_disks(super); + free_devlist(super); + elem = super->hba; + while (elem) { + if (elem->path) + free((void *)elem->path); + next = elem->next; + free(elem); + elem = next; + } + super->hba = NULL; +} + +static void free_imsm(struct intel_super *super) +{ + __free_imsm(super, 1); + free(super); +} + +static void free_super_imsm(struct supertype *st) +{ + struct intel_super *super = st->sb; + + if (!super) + return; + + free_imsm(super); + st->sb = NULL; +} + +static struct intel_super *alloc_super(void) +{ + struct intel_super *super = xcalloc(1, sizeof(*super)); + + super->current_vol = -1; + super->create_offset = ~((unsigned long long) 0); + return super; +} + +/* + * find and allocate hba and OROM/EFI based on valid fd of RAID component device + */ +static int find_intel_hba_capability(int fd, struct intel_super *super, char *devname) +{ + struct sys_dev *hba_name; + int rv = 0; + + if ((fd < 0) || check_env("IMSM_NO_PLATFORM")) { + super->orom = NULL; + super->hba = NULL; + return 0; + } + hba_name = 
find_disk_attached_hba(fd, NULL); + if (!hba_name) { + if (devname) + pr_err("%s is not attached to Intel(R) RAID controller.\n", + devname); + return 1; + } + rv = attach_hba_to_super(super, hba_name); + if (rv == 2) { + if (devname) { + struct intel_hba *hba = super->hba; + + pr_err("%s is attached to Intel(R) %s %s (%s),\n" + " but the container is assigned to Intel(R) %s %s (", + devname, + get_sys_dev_type(hba_name->type), + hba_name->type == SYS_DEV_VMD ? "domain" : "RAID controller", + hba_name->pci_id ? : "Err!", + get_sys_dev_type(super->hba->type), + hba->type == SYS_DEV_VMD ? "domain" : "RAID controller"); + + while (hba) { + fprintf(stderr, "%s", hba->pci_id ? : "Err!"); + if (hba->next) + fprintf(stderr, ", "); + hba = hba->next; + } + fprintf(stderr, ").\n" + " Mixing devices attached to different %s is not allowed.\n", + hba_name->type == SYS_DEV_VMD ? "VMD domains" : "controllers"); + } + return 2; + } + super->orom = find_imsm_capability(hba_name); + if (!super->orom) + return 3; + + return 0; +} + +/* find_missing - helper routine for load_super_imsm_all that identifies + * disks that have disappeared from the system. This routine relies on + * the mpb being uptodate, which it is at load time. 
+ */ +static int find_missing(struct intel_super *super) +{ + int i; + struct imsm_super *mpb = super->anchor; + struct dl *dl; + struct imsm_disk *disk; + + for (i = 0; i < mpb->num_disks; i++) { + disk = __get_imsm_disk(mpb, i); + dl = serial_to_dl(disk->serial, super); + if (dl) + continue; + + dl = xmalloc(sizeof(*dl)); + dl->major = 0; + dl->minor = 0; + dl->fd = -1; + dl->devname = xstrdup("missing"); + dl->index = i; + serialcpy(dl->serial, disk->serial); + dl->disk = *disk; + dl->e = NULL; + dl->next = super->missing; + super->missing = dl; + } + + return 0; +} + +#ifndef MDASSEMBLE +static struct intel_disk *disk_list_get(__u8 *serial, struct intel_disk *disk_list) +{ + struct intel_disk *idisk = disk_list; + + while (idisk) { + if (serialcmp(idisk->disk.serial, serial) == 0) + break; + idisk = idisk->next; + } + + return idisk; +} + +static int __prep_thunderdome(struct intel_super **table, int tbl_size, + struct intel_super *super, + struct intel_disk **disk_list) +{ + struct imsm_disk *d = &super->disks->disk; + struct imsm_super *mpb = super->anchor; + int i, j; + + for (i = 0; i < tbl_size; i++) { + struct imsm_super *tbl_mpb = table[i]->anchor; + struct imsm_disk *tbl_d = &table[i]->disks->disk; + + if (tbl_mpb->family_num == mpb->family_num) { + if (tbl_mpb->check_sum == mpb->check_sum) { + dprintf("mpb from %d:%d matches %d:%d\n", + super->disks->major, + super->disks->minor, + table[i]->disks->major, + table[i]->disks->minor); + break; + } + + if (((is_configured(d) && !is_configured(tbl_d)) || + is_configured(d) == is_configured(tbl_d)) && + tbl_mpb->generation_num < mpb->generation_num) { + /* current version of the mpb is a + * better candidate than the one in + * super_table, but copy over "cross + * generational" status + */ + struct intel_disk *idisk; + + dprintf("mpb from %d:%d replaces %d:%d\n", + super->disks->major, + super->disks->minor, + table[i]->disks->major, + table[i]->disks->minor); + + idisk = disk_list_get(tbl_d->serial, 
*disk_list); + if (idisk && is_failed(&idisk->disk)) + tbl_d->status |= FAILED_DISK; + break; + } else { + struct intel_disk *idisk; + struct imsm_disk *disk; + + /* tbl_mpb is more up to date, but copy + * over cross generational status before + * returning + */ + disk = __serial_to_disk(d->serial, mpb, NULL); + if (disk && is_failed(disk)) + d->status |= FAILED_DISK; + + idisk = disk_list_get(d->serial, *disk_list); + if (idisk) { + idisk->owner = i; + if (disk && is_configured(disk)) + idisk->disk.status |= CONFIGURED_DISK; + } + + dprintf("mpb from %d:%d prefer %d:%d\n", + super->disks->major, + super->disks->minor, + table[i]->disks->major, + table[i]->disks->minor); + + return tbl_size; + } + } + } + + if (i >= tbl_size) + table[tbl_size++] = super; + else + table[i] = super; + + /* update/extend the merged list of imsm_disk records */ + for (j = 0; j < mpb->num_disks; j++) { + struct imsm_disk *disk = __get_imsm_disk(mpb, j); + struct intel_disk *idisk; + + idisk = disk_list_get(disk->serial, *disk_list); + if (idisk) { + idisk->disk.status |= disk->status; + if (is_configured(&idisk->disk) || + is_failed(&idisk->disk)) + idisk->disk.status &= ~(SPARE_DISK); + } else { + idisk = xcalloc(1, sizeof(*idisk)); + idisk->owner = IMSM_UNKNOWN_OWNER; + idisk->disk = *disk; + idisk->next = *disk_list; + *disk_list = idisk; + } + + if (serialcmp(idisk->disk.serial, d->serial) == 0) + idisk->owner = i; + } + + return tbl_size; +} + +static struct intel_super * +validate_members(struct intel_super *super, struct intel_disk *disk_list, + const int owner) +{ + struct imsm_super *mpb = super->anchor; + int ok_count = 0; + int i; + + for (i = 0; i < mpb->num_disks; i++) { + struct imsm_disk *disk = __get_imsm_disk(mpb, i); + struct intel_disk *idisk; + + idisk = disk_list_get(disk->serial, disk_list); + if (idisk) { + if (idisk->owner == owner || + idisk->owner == IMSM_UNKNOWN_OWNER) + ok_count++; + else + dprintf("'%.16s' owner %d != %d\n", + disk->serial, idisk->owner, + 
owner); + } else { + dprintf("unknown disk %x [%d]: %.16s\n", + __le32_to_cpu(mpb->family_num), i, + disk->serial); + break; + } + } + + if (ok_count == mpb->num_disks) + return super; + return NULL; +} + +static void show_conflicts(__u32 family_num, struct intel_super *super_list) +{ + struct intel_super *s; + + for (s = super_list; s; s = s->next) { + if (family_num != s->anchor->family_num) + continue; + pr_err("Conflict, offlining family %#x on '%s'\n", + __le32_to_cpu(family_num), s->disks->devname); + } +} + +static struct intel_super * +imsm_thunderdome(struct intel_super **super_list, int len) +{ + struct intel_super *super_table[len]; + struct intel_disk *disk_list = NULL; + struct intel_super *champion, *spare; + struct intel_super *s, **del; + int tbl_size = 0; + int conflict; + int i; + + memset(super_table, 0, sizeof(super_table)); + for (s = *super_list; s; s = s->next) + tbl_size = __prep_thunderdome(super_table, tbl_size, s, &disk_list); + + for (i = 0; i < tbl_size; i++) { + struct imsm_disk *d; + struct intel_disk *idisk; + struct imsm_super *mpb = super_table[i]->anchor; + + s = super_table[i]; + d = &s->disks->disk; + + /* 'd' must appear in merged disk list for its + * configuration to be valid + */ + idisk = disk_list_get(d->serial, disk_list); + if (idisk && idisk->owner == i) + s = validate_members(s, disk_list, i); + else + s = NULL; + + if (!s) + dprintf("marking family: %#x from %d:%d offline\n", + mpb->family_num, + super_table[i]->disks->major, + super_table[i]->disks->minor); + super_table[i] = s; + } + + /* This is where the mdadm implementation differs from the Windows + * driver which has no strict concept of a container. We can only + * assemble one family from a container, so when returning a prodigal + * array member to this system the code will not be able to disambiguate + * the container contents that should be assembled ("foreign" versus + * "local"). 
It requires user intervention to set the orig_family_num + * to a new value to establish a new container. The Windows driver in + * this situation fixes up the volume name in place and manages the + * foreign array as an independent entity. + */ + s = NULL; + spare = NULL; + conflict = 0; + for (i = 0; i < tbl_size; i++) { + struct intel_super *tbl_ent = super_table[i]; + int is_spare = 0; + + if (!tbl_ent) + continue; + + if (tbl_ent->anchor->num_raid_devs == 0) { + spare = tbl_ent; + is_spare = 1; + } + + if (s && !is_spare) { + show_conflicts(tbl_ent->anchor->family_num, *super_list); + conflict++; + } else if (!s && !is_spare) + s = tbl_ent; + } + + if (!s) + s = spare; + if (!s) { + champion = NULL; + goto out; + } + champion = s; + + if (conflict) + pr_err("Chose family %#x on '%s', assemble conflicts to new container with '--update=uuid'\n", + __le32_to_cpu(s->anchor->family_num), s->disks->devname); + + /* collect all dl's onto 'champion', and update them to + * champion's version of the status + */ + for (s = *super_list; s; s = s->next) { + struct imsm_super *mpb = champion->anchor; + struct dl *dl = s->disks; + + if (s == champion) + continue; + + mpb->attributes |= s->anchor->attributes & MPB_ATTRIB_2TB_DISK; + + for (i = 0; i < mpb->num_disks; i++) { + struct imsm_disk *disk; + + disk = __serial_to_disk(dl->serial, mpb, &dl->index); + if (disk) { + dl->disk = *disk; + /* only set index on disks that are a member of + * a populated contianer, i.e. 
one with + * raid_devs + */ + if (is_failed(&dl->disk)) + dl->index = -2; + else if (is_spare(&dl->disk)) + dl->index = -1; + break; + } + } + + if (i >= mpb->num_disks) { + struct intel_disk *idisk; + + idisk = disk_list_get(dl->serial, disk_list); + if (idisk && is_spare(&idisk->disk) && + !is_failed(&idisk->disk) && !is_configured(&idisk->disk)) + dl->index = -1; + else { + dl->index = -2; + continue; + } + } + + dl->next = champion->disks; + champion->disks = dl; + s->disks = NULL; + } + + /* delete 'champion' from super_list */ + for (del = super_list; *del; ) { + if (*del == champion) { + *del = (*del)->next; + break; + } else + del = &(*del)->next; + } + champion->next = NULL; + + out: + while (disk_list) { + struct intel_disk *idisk = disk_list; + + disk_list = disk_list->next; + free(idisk); + } + + return champion; +} + +static int +get_sra_super_block(int fd, struct intel_super **super_list, char *devname, int *max, int keep_fd); +static int get_super_block(struct intel_super **super_list, char *devnm, char *devname, + int major, int minor, int keep_fd); +static int +get_devlist_super_block(struct md_list *devlist, struct intel_super **super_list, + int *max, int keep_fd); + +static int load_super_imsm_all(struct supertype *st, int fd, void **sbp, + char *devname, struct md_list *devlist, + int keep_fd) +{ + struct intel_super *super_list = NULL; + struct intel_super *super = NULL; + int err = 0; + int i = 0; + + if (fd >= 0) + /* 'fd' is an opened container */ + err = get_sra_super_block(fd, &super_list, devname, &i, keep_fd); + else + /* get super block from devlist devices */ + err = get_devlist_super_block(devlist, &super_list, &i, keep_fd); + if (err) + goto error; + /* all mpbs enter, maybe one leaves */ + super = imsm_thunderdome(&super_list, i); + if (!super) { + err = 1; + goto error; + } + + if (find_missing(super) != 0) { + free_imsm(super); + err = 2; + goto error; + } + + /* load migration record */ + err = load_imsm_migr_rec(super, NULL); + 
if (err == -1) { + /* migration is in progress, + * but migr_rec cannot be loaded, + */ + err = 4; + goto error; + } + + /* Check migration compatibility */ + if ((err == 0) && (check_mpb_migr_compatibility(super) != 0)) { + pr_err("Unsupported migration detected"); + if (devname) + fprintf(stderr, " on %s\n", devname); + else + fprintf(stderr, " (IMSM).\n"); + + err = 5; + goto error; + } + + err = 0; + + error: + while (super_list) { + struct intel_super *s = super_list; + + super_list = super_list->next; + free_imsm(s); + } + + if (err) + return err; + + *sbp = super; + if (fd >= 0) + strcpy(st->container_devnm, fd2devnm(fd)); + else + st->container_devnm[0] = 0; + if (err == 0 && st->ss == NULL) { + st->ss = &super_imsm; + st->minor_version = 0; + st->max_devs = IMSM_MAX_DEVICES; + } + return 0; +} + +static int +get_devlist_super_block(struct md_list *devlist, struct intel_super **super_list, + int *max, int keep_fd) +{ + struct md_list *tmpdev; + int err = 0; + int i = 0; + + for (i = 0, tmpdev = devlist; tmpdev; tmpdev = tmpdev->next) { + if (tmpdev->used != 1) + continue; + if (tmpdev->container == 1) { + int lmax = 0; + int fd = dev_open(tmpdev->devname, O_RDONLY|O_EXCL); + if (fd < 0) { + pr_err("cannot open device %s: %s\n", + tmpdev->devname, strerror(errno)); + err = 8; + goto error; + } + err = get_sra_super_block(fd, super_list, + tmpdev->devname, &lmax, + keep_fd); + i += lmax; + close(fd); + if (err) { + err = 7; + goto error; + } + } else { + int major = major(tmpdev->st_rdev); + int minor = minor(tmpdev->st_rdev); + err = get_super_block(super_list, + NULL, + tmpdev->devname, + major, minor, + keep_fd); + i++; + if (err) { + err = 6; + goto error; + } + } + } + error: + *max = i; + return err; +} + +static int get_super_block(struct intel_super **super_list, char *devnm, char *devname, + int major, int minor, int keep_fd) +{ + struct intel_super*s = NULL; + char nm[32]; + int dfd = -1; + int err = 0; + int retry; + + s = alloc_super(); + if (!s) 
{ + err = 1; + goto error; + } + + sprintf(nm, "%d:%d", major, minor); + dfd = dev_open(nm, O_RDWR); + if (dfd < 0) { + err = 2; + goto error; + } + + find_intel_hba_capability(dfd, s, devname); + err = load_and_parse_mpb(dfd, s, NULL, keep_fd); + + /* retry the load if we might have raced against mdmon */ + if (err == 3 && devnm && mdmon_running(devnm)) + for (retry = 0; retry < 3; retry++) { + usleep(3000); + err = load_and_parse_mpb(dfd, s, NULL, keep_fd); + if (err != 3) + break; + } + error: + if (!err) { + s->next = *super_list; + *super_list = s; + } else { + if (s) + free(s); + if (dfd >= 0) + close(dfd); + } + if ((dfd >= 0) && (!keep_fd)) + close(dfd); + return err; + +} + +static int +get_sra_super_block(int fd, struct intel_super **super_list, char *devname, int *max, int keep_fd) +{ + struct mdinfo *sra; + char *devnm; + struct mdinfo *sd; + int err = 0; + int i = 0; + sra = sysfs_read(fd, NULL, GET_LEVEL|GET_VERSION|GET_DEVS|GET_STATE); + if (!sra) + return 1; + + if (sra->array.major_version != -1 || + sra->array.minor_version != -2 || + strcmp(sra->text_version, "imsm") != 0) { + err = 1; + goto error; + } + /* load all mpbs */ + devnm = fd2devnm(fd); + for (sd = sra->devs, i = 0; sd; sd = sd->next, i++) { + if (get_super_block(super_list, devnm, devname, + sd->disk.major, sd->disk.minor, keep_fd) != 0) { + err = 7; + goto error; + } + } + error: + sysfs_free(sra); + *max = i; + return err; +} + +static int load_container_imsm(struct supertype *st, int fd, char *devname) +{ + return load_super_imsm_all(st, fd, &st->sb, devname, NULL, 1); +} +#endif + +static int load_super_imsm(struct supertype *st, int fd, char *devname) +{ + struct intel_super *super; + int rv; + int retry; + + if (test_partition(fd)) + /* IMSM not allowed on partitions */ + return 1; + + free_super_imsm(st); + + super = alloc_super(); + /* Load hba and capabilities if they exist. 
+ * But do not preclude loading metadata in case capabilities or hba are + * non-compliant and ignore_hw_compat is set. + */ + rv = find_intel_hba_capability(fd, super, devname); + /* no orom/efi or non-intel hba of the disk */ + if ((rv != 0) && (st->ignore_hw_compat == 0)) { + if (devname) + pr_err("No OROM/EFI properties for %s\n", devname); + free_imsm(super); + return 2; + } + rv = load_and_parse_mpb(fd, super, devname, 0); + + /* retry the load if we might have raced against mdmon */ + if (rv == 3) { + struct mdstat_ent *mdstat = mdstat_by_component(fd2devnm(fd)); + + if (mdstat && mdmon_running(mdstat->devnm) && getpid() != mdmon_pid(mdstat->devnm)) { + for (retry = 0; retry < 3; retry++) { + usleep(3000); + rv = load_and_parse_mpb(fd, super, devname, 0); + if (rv != 3) + break; + } + } + + free_mdstat(mdstat); + } + + if (rv) { + if (devname) + pr_err("Failed to load all information sections on %s\n", devname); + free_imsm(super); + return rv; + } + + st->sb = super; + if (st->ss == NULL) { + st->ss = &super_imsm; + st->minor_version = 0; + st->max_devs = IMSM_MAX_DEVICES; + } + + /* load migration record */ + if (load_imsm_migr_rec(super, NULL) == 0) { + /* Check for unsupported migration features */ + if (check_mpb_migr_compatibility(super) != 0) { + pr_err("Unsupported migration detected"); + if (devname) + fprintf(stderr, " on %s\n", devname); + else + fprintf(stderr, " (IMSM).\n"); + return 3; + } + } + + return 0; +} + +static __u16 info_to_blocks_per_strip(mdu_array_info_t *info) +{ + if (info->level == 1) + return 128; + return info->chunk_size >> 9; +} + +static unsigned long long info_to_blocks_per_member(mdu_array_info_t *info, + unsigned long long size) +{ + if (info->level == 1) + return size * 2; + else + return (size * 2) & ~(info_to_blocks_per_strip(info) - 1); +} + +static void imsm_update_version_info(struct intel_super *super) +{ + /* update the version and attributes */ + struct imsm_super *mpb = super->anchor; + char *version; + struct 
imsm_dev *dev; + struct imsm_map *map; + int i; + + for (i = 0; i < mpb->num_raid_devs; i++) { + dev = get_imsm_dev(super, i); + map = get_imsm_map(dev, MAP_0); + if (__le32_to_cpu(dev->size_high) > 0) + mpb->attributes |= MPB_ATTRIB_2TB; + + /* FIXME detect when an array spans a port multiplier */ + #if 0 + mpb->attributes |= MPB_ATTRIB_PM; + #endif + + if (mpb->num_raid_devs > 1 || + mpb->attributes != MPB_ATTRIB_CHECKSUM_VERIFY) { + version = MPB_VERSION_ATTRIBS; + switch (get_imsm_raid_level(map)) { + case 0: mpb->attributes |= MPB_ATTRIB_RAID0; break; + case 1: mpb->attributes |= MPB_ATTRIB_RAID1; break; + case 10: mpb->attributes |= MPB_ATTRIB_RAID10; break; + case 5: mpb->attributes |= MPB_ATTRIB_RAID5; break; + } + } else { + if (map->num_members >= 5) + version = MPB_VERSION_5OR6_DISK_ARRAY; + else if (dev->status == DEV_CLONE_N_GO) + version = MPB_VERSION_CNG; + else if (get_imsm_raid_level(map) == 5) + version = MPB_VERSION_RAID5; + else if (map->num_members >= 3) + version = MPB_VERSION_3OR4_DISK_ARRAY; + else if (get_imsm_raid_level(map) == 1) + version = MPB_VERSION_RAID1; + else + version = MPB_VERSION_RAID0; + } + strcpy(((char *) mpb->sig) + strlen(MPB_SIGNATURE), version); + } +} + +static int check_name(struct intel_super *super, char *name, int quiet) +{ + struct imsm_super *mpb = super->anchor; + char *reason = NULL; + int i; + + if (strlen(name) > MAX_RAID_SERIAL_LEN) + reason = "must be 16 characters or less"; + + for (i = 0; i < mpb->num_raid_devs; i++) { + struct imsm_dev *dev = get_imsm_dev(super, i); + + if (strncmp((char *) dev->volume, name, MAX_RAID_SERIAL_LEN) == 0) { + reason = "already exists"; + break; + } + } + + if (reason && !quiet) + pr_err("imsm volume name %s\n", reason); + + return !reason; +} + +static int init_super_imsm_volume(struct supertype *st, mdu_array_info_t *info, + unsigned long long size, char *name, + char *homehost, int *uuid, + long long data_offset) +{ + /* We are creating a volume inside a pre-existing 
container. + * so st->sb is already set. + */ + struct intel_super *super = st->sb; + struct imsm_super *mpb = super->anchor; + struct intel_dev *dv; + struct imsm_dev *dev; + struct imsm_vol *vol; + struct imsm_map *map; + int idx = mpb->num_raid_devs; + int i; + unsigned long long array_blocks; + size_t size_old, size_new; + unsigned long long num_data_stripes; + + if (super->orom && mpb->num_raid_devs >= super->orom->vpa) { + pr_err("This imsm-container already has the maximum of %d volumes\n", super->orom->vpa); + return 0; + } + + /* ensure the mpb is large enough for the new data */ + size_old = __le32_to_cpu(mpb->mpb_size); + size_new = disks_to_mpb_size(info->nr_disks); + if (size_new > size_old) { + void *mpb_new; + size_t size_round = ROUND_UP(size_new, 512); + + if (posix_memalign(&mpb_new, 512, size_round) != 0) { + pr_err("could not allocate new mpb\n"); + return 0; + } + if (posix_memalign(&super->migr_rec_buf, 512, + MIGR_REC_BUF_SIZE) != 0) { + pr_err("could not allocate migr_rec buffer\n"); + free(super->buf); + free(super); + free(mpb_new); + return 0; + } + memcpy(mpb_new, mpb, size_old); + free(mpb); + mpb = mpb_new; + super->anchor = mpb_new; + mpb->mpb_size = __cpu_to_le32(size_new); + memset(mpb_new + size_old, 0, size_round - size_old); + } + super->current_vol = idx; + + /* handle 'failed_disks' by either: + * a) create dummy disk entries in the table if this the first + * volume in the array. We add them here as this is the only + * opportunity to add them. add_to_super_imsm_volume() + * handles the non-failed disks and continues incrementing + * mpb->num_disks. 
+ * b) validate that 'failed_disks' matches the current number + * of missing disks if the container is populated + */ + if (super->current_vol == 0) { + mpb->num_disks = 0; + for (i = 0; i < info->failed_disks; i++) { + struct imsm_disk *disk; + + mpb->num_disks++; + disk = __get_imsm_disk(mpb, i); + disk->status = CONFIGURED_DISK | FAILED_DISK; + disk->scsi_id = __cpu_to_le32(~(__u32)0); + snprintf((char *) disk->serial, MAX_RAID_SERIAL_LEN, + "missing:%d", i); + } + find_missing(super); + } else { + int missing = 0; + struct dl *d; + + for (d = super->missing; d; d = d->next) + missing++; + if (info->failed_disks > missing) { + pr_err("unable to add 'missing' disk to container\n"); + return 0; + } + } + + if (!check_name(super, name, 0)) + return 0; + dv = xmalloc(sizeof(*dv)); + dev = xcalloc(1, sizeof(*dev) + sizeof(__u32) * (info->raid_disks - 1)); + strncpy((char *) dev->volume, name, MAX_RAID_SERIAL_LEN); + array_blocks = calc_array_size(info->level, info->raid_disks, + info->layout, info->chunk_size, + size * 2); + /* round array size down to closest MB */ + array_blocks = (array_blocks >> SECT_PER_MB_SHIFT) << SECT_PER_MB_SHIFT; + + dev->size_low = __cpu_to_le32((__u32) array_blocks); + dev->size_high = __cpu_to_le32((__u32) (array_blocks >> 32)); + dev->status = (DEV_READ_COALESCING | DEV_WRITE_COALESCING); + vol = &dev->vol; + vol->migr_state = 0; + set_migr_type(dev, MIGR_INIT); + vol->dirty = !info->state; + vol->curr_migr_unit = 0; + map = get_imsm_map(dev, MAP_0); + set_pba_of_lba0(map, super->create_offset); + set_blocks_per_member(map, info_to_blocks_per_member(info, size)); + map->blocks_per_strip = __cpu_to_le16(info_to_blocks_per_strip(info)); + map->failed_disk_num = ~0; + if (info->level > 0) + map->map_state = (info->state ? IMSM_T_STATE_NORMAL + : IMSM_T_STATE_UNINITIALIZED); + else + map->map_state = info->failed_disks ? 
IMSM_T_STATE_FAILED : + IMSM_T_STATE_NORMAL; + map->ddf = 1; + + if (info->level == 1 && info->raid_disks > 2) { + free(dev); + free(dv); + pr_err("imsm does not support more than 2 disksin a raid1 volume\n"); + return 0; + } + + map->raid_level = info->level; + if (info->level == 10) { + map->raid_level = 1; + map->num_domains = info->raid_disks / 2; + } else if (info->level == 1) + map->num_domains = info->raid_disks; + else + map->num_domains = 1; + + /* info->size is only int so use the 'size' parameter instead */ + num_data_stripes = (size * 2) / info_to_blocks_per_strip(info); + num_data_stripes /= map->num_domains; + set_num_data_stripes(map, num_data_stripes); + + map->num_members = info->raid_disks; + for (i = 0; i < map->num_members; i++) { + /* initialized in add_to_super */ + set_imsm_ord_tbl_ent(map, i, IMSM_ORD_REBUILD); + } + mpb->num_raid_devs++; + + dv->dev = dev; + dv->index = super->current_vol; + dv->next = super->devlist; + super->devlist = dv; + + imsm_update_version_info(super); + + return 1; +} + +static int init_super_imsm(struct supertype *st, mdu_array_info_t *info, + unsigned long long size, char *name, + char *homehost, int *uuid, + unsigned long long data_offset) +{ + /* This is primarily called by Create when creating a new array. + * We will then get add_to_super called for each component, and then + * write_init_super called to write it out to each device. + * For IMSM, Create can create on fresh devices or on a pre-existing + * array. + * To create on a pre-existing array a different method will be called. + * This one is just for fresh drives. 
+ */ + struct intel_super *super; + struct imsm_super *mpb; + size_t mpb_size; + char *version; + + if (data_offset != INVALID_SECTORS) { + pr_err("data-offset not supported by imsm\n"); + return 0; + } + + if (st->sb) + return init_super_imsm_volume(st, info, size, name, homehost, uuid, + data_offset); + + if (info) + mpb_size = disks_to_mpb_size(info->nr_disks); + else + mpb_size = 512; + + super = alloc_super(); + if (super && posix_memalign(&super->buf, 512, mpb_size) != 0) { + free(super); + super = NULL; + } + if (!super) { + pr_err("could not allocate superblock\n"); + return 0; + } + if (posix_memalign(&super->migr_rec_buf, 512, MIGR_REC_BUF_SIZE) != 0) { + pr_err("could not allocate migr_rec buffer\n"); + free(super->buf); + free(super); + return 0; + } + memset(super->buf, 0, mpb_size); + mpb = super->buf; + mpb->mpb_size = __cpu_to_le32(mpb_size); + st->sb = super; + + if (info == NULL) { + /* zeroing superblock */ + return 0; + } + + mpb->attributes = MPB_ATTRIB_CHECKSUM_VERIFY; + + version = (char *) mpb->sig; + strcpy(version, MPB_SIGNATURE); + version += strlen(MPB_SIGNATURE); + strcpy(version, MPB_VERSION_RAID0); + + return 1; +} + +#ifndef MDASSEMBLE +static int add_to_super_imsm_volume(struct supertype *st, mdu_disk_info_t *dk, + int fd, char *devname) +{ + struct intel_super *super = st->sb; + struct imsm_super *mpb = super->anchor; + struct imsm_disk *_disk; + struct imsm_dev *dev; + struct imsm_map *map; + struct dl *dl, *df; + int slot; + + dev = get_imsm_dev(super, super->current_vol); + map = get_imsm_map(dev, MAP_0); + + if (! 
(dk->state & (1<disks; dl; dl = dl->next) + if (dl->raiddisk == dk->raid_disk) + break; + } else { + for (dl = super->disks; dl ; dl = dl->next) + if (dl->major == dk->major && + dl->minor == dk->minor) + break; + } + + if (!dl) { + pr_err("%s is not a member of the same container\n", devname); + return 1; + } + + /* add a pristine spare to the metadata */ + if (dl->index < 0) { + dl->index = super->anchor->num_disks; + super->anchor->num_disks++; + } + /* Check the device has not already been added */ + slot = get_imsm_disk_slot(map, dl->index); + if (slot >= 0 && + (get_imsm_ord_tbl_ent(dev, slot, MAP_X) & IMSM_ORD_REBUILD) == 0) { + pr_err("%s has been included in this array twice\n", + devname); + return 1; + } + set_imsm_ord_tbl_ent(map, dk->raid_disk, dl->index); + dl->disk.status = CONFIGURED_DISK; + + /* update size of 'missing' disks to be at least as large as the + * largest acitve member (we only have dummy missing disks when + * creating the first volume) + */ + if (super->current_vol == 0) { + for (df = super->missing; df; df = df->next) { + if (total_blocks(&dl->disk) > total_blocks(&df->disk)) + set_total_blocks(&df->disk, total_blocks(&dl->disk)); + _disk = __get_imsm_disk(mpb, df->index); + *_disk = df->disk; + } + } + + /* refresh unset/failed slots to point to valid 'missing' entries */ + for (df = super->missing; df; df = df->next) + for (slot = 0; slot < mpb->num_disks; slot++) { + __u32 ord = get_imsm_ord_tbl_ent(dev, slot, MAP_X); + + if ((ord & IMSM_ORD_REBUILD) == 0) + continue; + set_imsm_ord_tbl_ent(map, slot, df->index | IMSM_ORD_REBUILD); + if (is_gen_migration(dev)) { + struct imsm_map *map2 = get_imsm_map(dev, + MAP_1); + int slot2 = get_imsm_disk_slot(map2, df->index); + if ((slot2 < map2->num_members) && + (slot2 >= 0)) { + __u32 ord2 = get_imsm_ord_tbl_ent(dev, + slot2, + MAP_1); + if ((unsigned)df->index == + ord_to_idx(ord2)) + set_imsm_ord_tbl_ent(map2, + slot2, + df->index | + IMSM_ORD_REBUILD); + } + } + dprintf("set slot:%d 
to missing disk:%d\n", slot, df->index); + break; + } + + /* if we are creating the first raid device update the family number */ + if (super->current_vol == 0) { + __u32 sum; + struct imsm_dev *_dev = __get_imsm_dev(mpb, 0); + + _disk = __get_imsm_disk(mpb, dl->index); + if (!_dev || !_disk) { + pr_err("BUG mpb setup error\n"); + return 1; + } + *_dev = *dev; + *_disk = dl->disk; + sum = random32(); + sum += __gen_imsm_checksum(mpb); + mpb->family_num = __cpu_to_le32(sum); + mpb->orig_family_num = mpb->family_num; + } + super->current_disk = dl; + return 0; +} + +/* mark_spare() + * Function marks disk as spare and restores disk serial + * in case it was previously marked as failed by takeover operation + * reruns: + * -1 : critical error + * 0 : disk is marked as spare but serial is not set + * 1 : success + */ +int mark_spare(struct dl *disk) +{ + __u8 serial[MAX_RAID_SERIAL_LEN]; + int ret_val = -1; + + if (!disk) + return ret_val; + + ret_val = 0; + if (!imsm_read_serial(disk->fd, NULL, serial)) { + /* Restore disk serial number, because takeover marks disk + * as failed and adds to serial ':0' before it becomes + * a spare disk. + */ + serialcpy(disk->serial, serial); + serialcpy(disk->disk.serial, serial); + ret_val = 1; + } + disk->disk.status = SPARE_DISK; + disk->index = -1; + + return ret_val; +} + +static int add_to_super_imsm(struct supertype *st, mdu_disk_info_t *dk, + int fd, char *devname, + unsigned long long data_offset) +{ + struct intel_super *super = st->sb; + struct dl *dd; + unsigned long long size; + __u32 id; + int rv; + struct stat stb; + + /* If we are on an RAID enabled platform check that the disk is + * attached to the raid controller. + * We do not need to test disks attachment for container based additions, + * they shall be already tested when container was created/assembled. 
+ */ + rv = find_intel_hba_capability(fd, super, devname); + /* no orom/efi or non-intel hba of the disk */ + if (rv != 0) { + dprintf("capability: %p fd: %d ret: %d\n", + super->orom, fd, rv); + return 1; + } + + if (super->current_vol >= 0) + return add_to_super_imsm_volume(st, dk, fd, devname); + + fstat(fd, &stb); + dd = xcalloc(sizeof(*dd), 1); + dd->major = major(stb.st_rdev); + dd->minor = minor(stb.st_rdev); + dd->devname = devname ? xstrdup(devname) : NULL; + dd->fd = fd; + dd->e = NULL; + dd->action = DISK_ADD; + rv = imsm_read_serial(fd, devname, dd->serial); + if (rv) { + pr_err("failed to retrieve scsi serial, aborting\n"); + free(dd); + abort(); + } + + get_dev_size(fd, NULL, &size); + /* clear migr_rec when adding disk to container */ + memset(super->migr_rec_buf, 0, MIGR_REC_BUF_SIZE); + if (lseek64(fd, size - MIGR_REC_POSITION, SEEK_SET) >= 0) { + if (write(fd, super->migr_rec_buf, + MIGR_REC_BUF_SIZE) != MIGR_REC_BUF_SIZE) + perror("Write migr_rec failed"); + } + + size /= 512; + serialcpy(dd->disk.serial, dd->serial); + set_total_blocks(&dd->disk, size); + if (__le32_to_cpu(dd->disk.total_blocks_hi) > 0) { + struct imsm_super *mpb = super->anchor; + mpb->attributes |= MPB_ATTRIB_2TB_DISK; + } + mark_spare(dd); + if (sysfs_disk_to_scsi_id(fd, &id) == 0) + dd->disk.scsi_id = __cpu_to_le32(id); + else + dd->disk.scsi_id = __cpu_to_le32(0); + + if (st->update_tail) { + dd->next = super->disk_mgmt_list; + super->disk_mgmt_list = dd; + } else { + dd->next = super->disks; + super->disks = dd; + super->updates_pending++; + } + + return 0; +} + +static int remove_from_super_imsm(struct supertype *st, mdu_disk_info_t *dk) +{ + struct intel_super *super = st->sb; + struct dl *dd; + + /* remove from super works only in mdmon - for communication + * manager - monitor. Check if communication memory buffer + * is prepared. 
+ */ + if (!st->update_tail) { + pr_err("shall be used in mdmon context only\n"); + return 1; + } + dd = xcalloc(1, sizeof(*dd)); + dd->major = dk->major; + dd->minor = dk->minor; + dd->fd = -1; + mark_spare(dd); + dd->action = DISK_REMOVE; + + dd->next = super->disk_mgmt_list; + super->disk_mgmt_list = dd; + + return 0; +} + +static int store_imsm_mpb(int fd, struct imsm_super *mpb); + +static union { + char buf[512]; + struct imsm_super anchor; +} spare_record __attribute__ ((aligned(512))); + +/* spare records have their own family number and do not have any defined raid + * devices + */ +static int write_super_imsm_spares(struct intel_super *super, int doclose) +{ + struct imsm_super *mpb = super->anchor; + struct imsm_super *spare = &spare_record.anchor; + __u32 sum; + struct dl *d; + + spare->mpb_size = __cpu_to_le32(sizeof(struct imsm_super)); + spare->generation_num = __cpu_to_le32(1UL); + spare->attributes = MPB_ATTRIB_CHECKSUM_VERIFY; + spare->num_disks = 1; + spare->num_raid_devs = 0; + spare->cache_size = mpb->cache_size; + spare->pwr_cycle_count = __cpu_to_le32(1); + + snprintf((char *) spare->sig, MAX_SIGNATURE_LENGTH, + MPB_SIGNATURE MPB_VERSION_RAID0); + + for (d = super->disks; d; d = d->next) { + if (d->index != -1) + continue; + + spare->disk[0] = d->disk; + if (__le32_to_cpu(d->disk.total_blocks_hi) > 0) + spare->attributes |= MPB_ATTRIB_2TB_DISK; + + sum = __gen_imsm_checksum(spare); + spare->family_num = __cpu_to_le32(sum); + spare->orig_family_num = 0; + sum = __gen_imsm_checksum(spare); + spare->check_sum = __cpu_to_le32(sum); + + if (store_imsm_mpb(d->fd, spare)) { + pr_err("failed for device %d:%d %s\n", + d->major, d->minor, strerror(errno)); + return 1; + } + if (doclose) { + close(d->fd); + d->fd = -1; + } + } + + return 0; +} + +static int write_super_imsm(struct supertype *st, int doclose) +{ + struct intel_super *super = st->sb; + struct imsm_super *mpb = super->anchor; + struct dl *d; + __u32 generation; + __u32 sum; + int spares = 
0; + int i; + __u32 mpb_size = sizeof(struct imsm_super) - sizeof(struct imsm_disk); + int num_disks = 0; + int clear_migration_record = 1; + + /* 'generation' is incremented everytime the metadata is written */ + generation = __le32_to_cpu(mpb->generation_num); + generation++; + mpb->generation_num = __cpu_to_le32(generation); + + /* fix up cases where previous mdadm releases failed to set + * orig_family_num + */ + if (mpb->orig_family_num == 0) + mpb->orig_family_num = mpb->family_num; + + for (d = super->disks; d; d = d->next) { + if (d->index == -1) + spares++; + else { + mpb->disk[d->index] = d->disk; + num_disks++; + } + } + for (d = super->missing; d; d = d->next) { + mpb->disk[d->index] = d->disk; + num_disks++; + } + mpb->num_disks = num_disks; + mpb_size += sizeof(struct imsm_disk) * mpb->num_disks; + + for (i = 0; i < mpb->num_raid_devs; i++) { + struct imsm_dev *dev = __get_imsm_dev(mpb, i); + struct imsm_dev *dev2 = get_imsm_dev(super, i); + if (dev && dev2) { + imsm_copy_dev(dev, dev2); + mpb_size += sizeof_imsm_dev(dev, 0); + } + if (is_gen_migration(dev2)) + clear_migration_record = 0; + } + mpb_size += __le32_to_cpu(mpb->bbm_log_size); + mpb->mpb_size = __cpu_to_le32(mpb_size); + + /* recalculate checksum */ + sum = __gen_imsm_checksum(mpb); + mpb->check_sum = __cpu_to_le32(sum); + + if (super->clean_migration_record_by_mdmon) { + clear_migration_record = 1; + super->clean_migration_record_by_mdmon = 0; + } + if (clear_migration_record) + memset(super->migr_rec_buf, 0, MIGR_REC_BUF_SIZE); + + /* write the mpb for disks that compose raid devices */ + for (d = super->disks; d ; d = d->next) { + if (d->index < 0 || is_failed(&d->disk)) + continue; + + if (clear_migration_record) { + unsigned long long dsize; + + get_dev_size(d->fd, NULL, &dsize); + if (lseek64(d->fd, dsize - 512, SEEK_SET) >= 0) { + if (write(d->fd, super->migr_rec_buf, + MIGR_REC_BUF_SIZE) != MIGR_REC_BUF_SIZE) + perror("Write migr_rec failed"); + } + } + + if (store_imsm_mpb(d->fd, 
mpb)) + fprintf(stderr, + "failed for device %d:%d (fd: %d)%s\n", + d->major, d->minor, + d->fd, strerror(errno)); + + if (doclose) { + close(d->fd); + d->fd = -1; + } + } + + if (spares) + return write_super_imsm_spares(super, doclose); + + return 0; +} + +static int create_array(struct supertype *st, int dev_idx) +{ + size_t len; + struct imsm_update_create_array *u; + struct intel_super *super = st->sb; + struct imsm_dev *dev = get_imsm_dev(super, dev_idx); + struct imsm_map *map = get_imsm_map(dev, MAP_0); + struct disk_info *inf; + struct imsm_disk *disk; + int i; + + len = sizeof(*u) - sizeof(*dev) + sizeof_imsm_dev(dev, 0) + + sizeof(*inf) * map->num_members; + u = xmalloc(len); + u->type = update_create_array; + u->dev_idx = dev_idx; + imsm_copy_dev(&u->dev, dev); + inf = get_disk_info(u); + for (i = 0; i < map->num_members; i++) { + int idx = get_imsm_disk_idx(dev, i, MAP_X); + + disk = get_imsm_disk(super, idx); + if (!disk) + disk = get_imsm_missing(super, idx); + serialcpy(inf[i].serial, disk->serial); + } + append_metadata_update(st, u, len); + + return 0; +} + +static int mgmt_disk(struct supertype *st) +{ + struct intel_super *super = st->sb; + size_t len; + struct imsm_update_add_remove_disk *u; + + if (!super->disk_mgmt_list) + return 0; + + len = sizeof(*u); + u = xmalloc(len); + u->type = update_add_remove_disk; + append_metadata_update(st, u, len); + + return 0; +} + +static int write_init_super_imsm(struct supertype *st) +{ + struct intel_super *super = st->sb; + int current_vol = super->current_vol; + + /* we are done with current_vol reset it to point st at the container */ + super->current_vol = -1; + + if (st->update_tail) { + /* queue the recently created array / added disk + * as a metadata update */ + int rv; + + /* determine if we are creating a volume or adding a disk */ + if (current_vol < 0) { + /* in the mgmt (add/remove) disk case we are running + * in mdmon context, so don't close fd's + */ + return mgmt_disk(st); + } else + rv = 
create_array(st, current_vol); + + return rv; + } else { + struct dl *d; + for (d = super->disks; d; d = d->next) + Kill(d->devname, NULL, 0, -1, 1); + return write_super_imsm(st, 1); + } +} +#endif + +static int store_super_imsm(struct supertype *st, int fd) +{ + struct intel_super *super = st->sb; + struct imsm_super *mpb = super ? super->anchor : NULL; + + if (!mpb) + return 1; + +#ifndef MDASSEMBLE + return store_imsm_mpb(fd, mpb); +#else + return 1; +#endif +} + +static int imsm_bbm_log_size(struct imsm_super *mpb) +{ + return __le32_to_cpu(mpb->bbm_log_size); +} + +#ifndef MDASSEMBLE +static int validate_geometry_imsm_container(struct supertype *st, int level, + int layout, int raiddisks, int chunk, + unsigned long long size, + unsigned long long data_offset, + char *dev, + unsigned long long *freesize, + int verbose) +{ + int fd; + unsigned long long ldsize; + struct intel_super *super=NULL; + int rv = 0; + + if (level != LEVEL_CONTAINER) + return 0; + if (!dev) + return 1; + + fd = open(dev, O_RDONLY|O_EXCL, 0); + if (fd < 0) { + if (verbose > 0) + pr_err("imsm: Cannot open %s: %s\n", + dev, strerror(errno)); + return 0; + } + if (!get_dev_size(fd, dev, &ldsize)) { + close(fd); + return 0; + } + + /* capabilities retrieve could be possible + * note that there is no fd for the disks in array. + */ + super = alloc_super(); + rv = find_intel_hba_capability(fd, super, verbose > 0 ? 
dev : NULL); + if (rv != 0) { +#if DEBUG + char str[256]; + fd2devname(fd, str); + dprintf("fd: %d %s orom: %p rv: %d raiddisk: %d\n", + fd, str, super->orom, rv, raiddisks); +#endif + /* no orom/efi or non-intel hba of the disk */ + close(fd); + free_imsm(super); + return 0; + } + close(fd); + if (super->orom) { + if (raiddisks > super->orom->tds) { + if (verbose) + pr_err("%d exceeds maximum number of platform supported disks: %d\n", + raiddisks, super->orom->tds); + free_imsm(super); + return 0; + } + if ((super->orom->attr & IMSM_OROM_ATTR_2TB_DISK) == 0 && + (ldsize >> 9) >> 32 > 0) { + if (verbose) + pr_err("%s exceeds maximum platform supported size\n", dev); + free_imsm(super); + return 0; + } + } + + *freesize = avail_size_imsm(st, ldsize >> 9, data_offset); + free_imsm(super); + + return 1; +} + +static unsigned long long find_size(struct extent *e, int *idx, int num_extents) +{ + const unsigned long long base_start = e[*idx].start; + unsigned long long end = base_start + e[*idx].size; + int i; + + if (base_start == end) + return 0; + + *idx = *idx + 1; + for (i = *idx; i < num_extents; i++) { + /* extend overlapping extents */ + if (e[i].start >= base_start && + e[i].start <= end) { + if (e[i].size == 0) + return 0; + if (e[i].start + e[i].size > end) + end = e[i].start + e[i].size; + } else if (e[i].start > end) { + *idx = i; + break; + } + } + + return end - base_start; +} + +static unsigned long long merge_extents(struct intel_super *super, int sum_extents) +{ + /* build a composite disk with all known extents and generate a new + * 'maxsize' given the "all disks in an array must share a common start + * offset" constraint + */ + struct extent *e = xcalloc(sum_extents, sizeof(*e)); + struct dl *dl; + int i, j; + int start_extent; + unsigned long long pos; + unsigned long long start = 0; + unsigned long long maxsize; + unsigned long reserve; + + /* coalesce and sort all extents. 
also, check to see if we need to + * reserve space between member arrays + */ + j = 0; + for (dl = super->disks; dl; dl = dl->next) { + if (!dl->e) + continue; + for (i = 0; i < dl->extent_cnt; i++) + e[j++] = dl->e[i]; + } + qsort(e, sum_extents, sizeof(*e), cmp_extent); + + /* merge extents */ + i = 0; + j = 0; + while (i < sum_extents) { + e[j].start = e[i].start; + e[j].size = find_size(e, &i, sum_extents); + j++; + if (e[j-1].size == 0) + break; + } + + pos = 0; + maxsize = 0; + start_extent = 0; + i = 0; + do { + unsigned long long esize; + + esize = e[i].start - pos; + if (esize >= maxsize) { + maxsize = esize; + start = pos; + start_extent = i; + } + pos = e[i].start + e[i].size; + i++; + } while (e[i-1].size); + free(e); + + if (maxsize == 0) + return 0; + + /* FIXME assumes volume at offset 0 is the first volume in a + * container + */ + if (start_extent > 0) + reserve = IMSM_RESERVED_SECTORS; /* gap between raid regions */ + else + reserve = 0; + + if (maxsize < reserve) + return 0; + + super->create_offset = ~((unsigned long long) 0); + if (start + reserve > super->create_offset) + return 0; /* start overflows create_offset */ + super->create_offset = start + reserve; + + return maxsize - reserve; +} + +static int is_raid_level_supported(const struct imsm_orom *orom, int level, int raiddisks) +{ + if (level < 0 || level == 6 || level == 4) + return 0; + + /* if we have an orom prevent invalid raid levels */ + if (orom) + switch (level) { + case 0: return imsm_orom_has_raid0(orom); + case 1: + if (raiddisks > 2) + return imsm_orom_has_raid1e(orom); + return imsm_orom_has_raid1(orom) && raiddisks == 2; + case 10: return imsm_orom_has_raid10(orom) && raiddisks == 4; + case 5: return imsm_orom_has_raid5(orom) && raiddisks > 2; + } + else + return 1; /* not on an Intel RAID platform so anything goes */ + + return 0; +} + +static int +active_arrays_by_format(char *name, char* hba, struct md_list **devlist, + int dpa, int verbose) +{ + struct mdstat_ent 
*mdstat = mdstat_read(0, 0); + struct mdstat_ent *memb = NULL; + int count = 0; + int num = 0; + struct md_list *dv = NULL; + int found; + + for (memb = mdstat ; memb ; memb = memb->next) { + if (memb->metadata_version && + (strncmp(memb->metadata_version, "external:", 9) == 0) && + (strcmp(&memb->metadata_version[9], name) == 0) && + !is_subarray(memb->metadata_version+9) && + memb->members) { + struct dev_member *dev = memb->members; + int fd = -1; + while(dev && (fd < 0)) { + char *path = xmalloc(strlen(dev->name) + strlen("/dev/") + 1); + num = sprintf(path, "%s%s", "/dev/", dev->name); + if (num > 0) + fd = open(path, O_RDONLY, 0); + if ((num <= 0) || (fd < 0)) { + pr_vrb(": Cannot open %s: %s\n", + dev->name, strerror(errno)); + } + free(path); + dev = dev->next; + } + found = 0; + if ((fd >= 0) && disk_attached_to_hba(fd, hba)) { + struct mdstat_ent *vol; + for (vol = mdstat ; vol ; vol = vol->next) { + if ((vol->active > 0) && + vol->metadata_version && + is_container_member(vol, memb->devnm)) { + found++; + count++; + } + } + if (*devlist && (found < dpa)) { + dv = xcalloc(1, sizeof(*dv)); + dv->devname = xmalloc(strlen(memb->devnm) + strlen("/dev/") + 1); + sprintf(dv->devname, "%s%s", "/dev/", memb->devnm); + dv->found = found; + dv->used = 0; + dv->next = *devlist; + *devlist = dv; + } + } + if (fd >= 0) + close(fd); + } + } + free_mdstat(mdstat); + return count; +} + +#ifdef DEBUG_LOOP +static struct md_list* +get_loop_devices(void) +{ + int i; + struct md_list *devlist = NULL; + struct md_list *dv = NULL; + + for(i = 0; i < 12; i++) { + dv = xcalloc(1, sizeof(*dv)); + dv->devname = xmalloc(40); + sprintf(dv->devname, "/dev/loop%d", i); + dv->next = devlist; + devlist = dv; + } + return devlist; +} +#endif + +static struct md_list* +get_devices(const char *hba_path) +{ + struct md_list *devlist = NULL; + struct md_list *dv = NULL; + struct dirent *ent; + DIR *dir; + int err = 0; + +#if DEBUG_LOOP + devlist = get_loop_devices(); + return devlist; 
+#endif + /* scroll through /sys/dev/block looking for devices attached to + * this hba + */ + dir = opendir("/sys/dev/block"); + for (ent = dir ? readdir(dir) : NULL; ent; ent = readdir(dir)) { + int fd; + char buf[1024]; + int major, minor; + char *path = NULL; + if (sscanf(ent->d_name, "%d:%d", &major, &minor) != 2) + continue; + path = devt_to_devpath(makedev(major, minor)); + if (!path) + continue; + if (!path_attached_to_hba(path, hba_path)) { + free(path); + path = NULL; + continue; + } + free(path); + path = NULL; + fd = dev_open(ent->d_name, O_RDONLY); + if (fd >= 0) { + fd2devname(fd, buf); + close(fd); + } else { + pr_err("cannot open device: %s\n", + ent->d_name); + continue; + } + + dv = xcalloc(1, sizeof(*dv)); + dv->devname = xstrdup(buf); + dv->next = devlist; + devlist = dv; + } + if (err) { + while(devlist) { + dv = devlist; + devlist = devlist->next; + free(dv->devname); + free(dv); + } + } + closedir(dir); + return devlist; +} + +static int +count_volumes_list(struct md_list *devlist, char *homehost, + int verbose, int *found) +{ + struct md_list *tmpdev; + int count = 0; + struct supertype *st = NULL; + + /* first walk the list of devices to find a consistent set + * that match the criterea, if that is possible. + * We flag the ones we like with 'used'. + */ + *found = 0; + st = match_metadata_desc_imsm("imsm"); + if (st == NULL) { + pr_vrb(": cannot allocate memory for imsm supertype\n"); + return 0; + } + + for (tmpdev = devlist; tmpdev; tmpdev = tmpdev->next) { + char *devname = tmpdev->devname; + struct stat stb; + struct supertype *tst; + int dfd; + if (tmpdev->used > 1) + continue; + tst = dup_super(st); + if (tst == NULL) { + pr_vrb(": cannot allocate memory for imsm supertype\n"); + goto err_1; + } + tmpdev->container = 0; + dfd = dev_open(devname, O_RDONLY|O_EXCL); + if (dfd < 0) { + dprintf("cannot open device %s: %s\n", + devname, strerror(errno)); + tmpdev->used = 2; + } else if (fstat(dfd, &stb)< 0) { + /* Impossible! 
*/ + dprintf("fstat failed for %s: %s\n", + devname, strerror(errno)); + tmpdev->used = 2; + } else if ((stb.st_mode & S_IFMT) != S_IFBLK) { + dprintf("%s is not a block device.\n", + devname); + tmpdev->used = 2; + } else if (must_be_container(dfd)) { + struct supertype *cst; + cst = super_by_fd(dfd, NULL); + if (cst == NULL) { + dprintf("cannot recognize container type %s\n", + devname); + tmpdev->used = 2; + } else if (tst->ss != st->ss) { + dprintf("non-imsm container - ignore it: %s\n", + devname); + tmpdev->used = 2; + } else if (!tst->ss->load_container || + tst->ss->load_container(tst, dfd, NULL)) + tmpdev->used = 2; + else { + tmpdev->container = 1; + } + if (cst) + cst->ss->free_super(cst); + } else { + tmpdev->st_rdev = stb.st_rdev; + if (tst->ss->load_super(tst,dfd, NULL)) { + dprintf("no RAID superblock on %s\n", + devname); + tmpdev->used = 2; + } else if (tst->ss->compare_super == NULL) { + dprintf("Cannot assemble %s metadata on %s\n", + tst->ss->name, devname); + tmpdev->used = 2; + } + } + if (dfd >= 0) + close(dfd); + if (tmpdev->used == 2 || tmpdev->used == 4) { + /* Ignore unrecognised devices during auto-assembly */ + goto loop; + } + else { + struct mdinfo info; + tst->ss->getinfo_super(tst, &info, NULL); + + if (st->minor_version == -1) + st->minor_version = tst->minor_version; + + if (memcmp(info.uuid, uuid_zero, + sizeof(int[4])) == 0) { + /* this is a floating spare. It cannot define + * an array unless there are no more arrays of + * this type to be found. It can be included + * in an array of this type though. + */ + tmpdev->used = 3; + goto loop; + } + + if (st->ss != tst->ss || + st->minor_version != tst->minor_version || + st->ss->compare_super(st, tst) != 0) { + /* Some mismatch. If exactly one array matches this host, + * we can resolve on that one. + * Or, if we are auto assembling, we just ignore the second + * for now. 
+ */ + dprintf("superblock on %s doesn't match others - assembly aborted\n", + devname); + goto loop; + } + tmpdev->used = 1; + *found = 1; + dprintf("found: devname: %s\n", devname); + } + loop: + if (tst) + tst->ss->free_super(tst); + } + if (*found != 0) { + int err; + if ((err = load_super_imsm_all(st, -1, &st->sb, NULL, devlist, 0)) == 0) { + struct mdinfo *iter, *head = st->ss->container_content(st, NULL); + for (iter = head; iter; iter = iter->next) { + dprintf("content->text_version: %s vol\n", + iter->text_version); + if (iter->array.state & (1<text_version); + } else + count++; + } + sysfs_free(head); + + } else { + dprintf("No valid super block on device list: err: %d %p\n", + err, st->sb); + } + } else { + dprintf("no more devices to examine\n"); + } + + for (tmpdev = devlist; tmpdev; tmpdev = tmpdev->next) { + if ((tmpdev->used == 1) && (tmpdev->found)) { + if (count) { + if (count < tmpdev->found) + count = 0; + else + count -= tmpdev->found; + } + } + if (tmpdev->used == 1) + tmpdev->used = 4; + } + err_1: + if (st) + st->ss->free_super(st); + return count; +} + +static int +count_volumes(struct intel_hba *hba, int dpa, int verbose) +{ + struct sys_dev *idev, *intel_devices = find_intel_devices(); + int count = 0; + const struct orom_entry *entry; + struct devid_list *dv, *devid_list; + + if (!hba || !hba->path) + return 0; + + for (idev = intel_devices; idev; idev = idev->next) { + if (strstr(idev->path, hba->path)) + break; + } + + if (!idev || !idev->dev_id) + return 0; + + entry = get_orom_entry_by_device_id(idev->dev_id); + + if (!entry || !entry->devid_list) + return 0; + + devid_list = entry->devid_list; + for (dv = devid_list; dv; dv = dv->next) { + struct md_list *devlist = NULL; + struct sys_dev *device = device_by_id(dv->devid); + char *hba_path; + int found = 0; + + if (device) + hba_path = device->path; + else + return 0; + + /* VMD has one orom entry for all domain, but spanning is not allowed. 
+ * VMD arrays should be counted per domain (controller), so skip + * domains that are not the given one. + */ + if ((hba->type == SYS_DEV_VMD) && + (strncmp(device->path, hba->path, strlen(device->path)) != 0)) + continue; + + devlist = get_devices(hba_path); + /* if no intel devices return zero volumes */ + if (devlist == NULL) + return 0; + + count += active_arrays_by_format("imsm", hba_path, &devlist, dpa, verbose); + dprintf("path: %s active arrays: %d\n", hba_path, count); + if (devlist == NULL) + return 0; + do { + found = 0; + count += count_volumes_list(devlist, + NULL, + verbose, + &found); + dprintf("found %d count: %d\n", found, count); + } while (found); + + dprintf("path: %s total number of volumes: %d\n", hba_path, count); + + while (devlist) { + struct md_list *dv = devlist; + devlist = devlist->next; + free(dv->devname); + free(dv); + } + } + return count; +} + +static int imsm_default_chunk(const struct imsm_orom *orom) +{ + /* up to 512 if the plaform supports it, otherwise the platform max. + * 128 if no platform detected + */ + int fs = max(7, orom ? fls(orom->sss) : 0); + + return min(512, (1 << fs)); +} + +static int +validate_geometry_imsm_orom(struct intel_super *super, int level, int layout, + int raiddisks, int *chunk, unsigned long long size, int verbose) +{ + /* check/set platform and metadata limits/defaults */ + if (super->orom && raiddisks > super->orom->dpa) { + pr_vrb(": platform supports a maximum of %d disks per array\n", + super->orom->dpa); + return 0; + } + + /* capabilities of OROM tested - copied from validate_geometry_imsm_volume */ + if (!is_raid_level_supported(super->orom, level, raiddisks)) { + pr_vrb(": platform does not support raid%d with %d disk%s\n", + level, raiddisks, raiddisks > 1 ? 
"s" : ""); + return 0; + } + + if (*chunk == 0 || *chunk == UnSet) + *chunk = imsm_default_chunk(super->orom); + + if (super->orom && !imsm_orom_has_chunk(super->orom, *chunk)) { + pr_vrb(": platform does not support a chunk size of: %d\n", *chunk); + return 0; + } + + if (layout != imsm_level_to_layout(level)) { + if (level == 5) + pr_vrb(": imsm raid 5 only supports the left-asymmetric layout\n"); + else if (level == 10) + pr_vrb(": imsm raid 10 only supports the n2 layout\n"); + else + pr_vrb(": imsm unknown layout %#x for this raid level %d\n", + layout, level); + return 0; + } + + if (super->orom && (super->orom->attr & IMSM_OROM_ATTR_2TB) == 0 && + (calc_array_size(level, raiddisks, layout, *chunk, size) >> 32) > 0) { + pr_vrb(": platform does not support a volume size over 2TB\n"); + return 0; + } + + return 1; +} + +/* validate_geometry_imsm_volume - lifted from validate_geometry_ddf_bvd + * FIX ME add ahci details + */ +static int validate_geometry_imsm_volume(struct supertype *st, int level, + int layout, int raiddisks, int *chunk, + unsigned long long size, + unsigned long long data_offset, + char *dev, + unsigned long long *freesize, + int verbose) +{ + struct stat stb; + struct intel_super *super = st->sb; + struct imsm_super *mpb; + struct dl *dl; + unsigned long long pos = 0; + unsigned long long maxsize; + struct extent *e; + int i; + + /* We must have the container info already read in. */ + if (!super) + return 0; + + mpb = super->anchor; + + if (!validate_geometry_imsm_orom(super, level, layout, raiddisks, chunk, size, verbose)) { + pr_err("RAID gemetry validation failed. 
Cannot proceed with the action(s).\n"); + return 0; + } + if (!dev) { + /* General test: make sure there is space for + * 'raiddisks' device extents of size 'size' at a given + * offset + */ + unsigned long long minsize = size; + unsigned long long start_offset = MaxSector; + int dcnt = 0; + if (minsize == 0) + minsize = MPB_SECTOR_CNT + IMSM_RESERVED_SECTORS; + for (dl = super->disks; dl ; dl = dl->next) { + int found = 0; + + pos = 0; + i = 0; + e = get_extents(super, dl); + if (!e) continue; + do { + unsigned long long esize; + esize = e[i].start - pos; + if (esize >= minsize) + found = 1; + if (found && start_offset == MaxSector) { + start_offset = pos; + break; + } else if (found && pos != start_offset) { + found = 0; + break; + } + pos = e[i].start + e[i].size; + i++; + } while (e[i-1].size); + if (found) + dcnt++; + free(e); + } + if (dcnt < raiddisks) { + if (verbose) + pr_err("imsm: Not enough devices with space for this array (%d < %d)\n", + dcnt, raiddisks); + return 0; + } + return 1; + } + + /* This device must be a member of the set */ + if (stat(dev, &stb) < 0) + return 0; + if ((S_IFMT & stb.st_mode) != S_IFBLK) + return 0; + for (dl = super->disks ; dl ; dl = dl->next) { + if (dl->major == (int)major(stb.st_rdev) && + dl->minor == (int)minor(stb.st_rdev)) + break; + } + if (!dl) { + if (verbose) + pr_err("%s is not in the same imsm set\n", dev); + return 0; + } else if (super->orom && dl->index < 0 && mpb->num_raid_devs) { + /* If a volume is present then the current creation attempt + * cannot incorporate new spares because the orom may not + * understand this configuration (all member disks must be + * members of each array in the container). 
+ */ + pr_err("%s is a spare and a volume is already defined for this container\n", dev); + pr_err("The option-rom requires all member disks to be a member of all volumes\n"); + return 0; + } else if (super->orom && mpb->num_raid_devs > 0 && + mpb->num_disks != raiddisks) { + pr_err("The option-rom requires all member disks to be a member of all volumes\n"); + return 0; + } + + /* retrieve the largest free space block */ + e = get_extents(super, dl); + maxsize = 0; + i = 0; + if (e) { + do { + unsigned long long esize; + + esize = e[i].start - pos; + if (esize >= maxsize) + maxsize = esize; + pos = e[i].start + e[i].size; + i++; + } while (e[i-1].size); + dl->e = e; + dl->extent_cnt = i; + } else { + if (verbose) + pr_err("unable to determine free space for: %s\n", + dev); + return 0; + } + if (maxsize < size) { + if (verbose) + pr_err("%s not enough space (%llu < %llu)\n", + dev, maxsize, size); + return 0; + } + + /* count total number of extents for merge */ + i = 0; + for (dl = super->disks; dl; dl = dl->next) + if (dl->e) + i += dl->extent_cnt; + + maxsize = merge_extents(super, i); + + if (!check_env("IMSM_NO_PLATFORM") && + mpb->num_raid_devs > 0 && size && size != maxsize) { + pr_err("attempting to create a second volume with size less then remaining space. Aborting...\n"); + return 0; + } + + if (maxsize < size || maxsize == 0) { + if (verbose) { + if (maxsize == 0) + pr_err("no free space left on device. Aborting...\n"); + else + pr_err("not enough space to create volume of given size (%llu < %llu). 
Aborting...\n", + maxsize, size); + } + return 0; + } + + *freesize = maxsize; + + if (super->orom) { + int count = count_volumes(super->hba, + super->orom->dpa, verbose); + if (super->orom->vphba <= count) { + pr_vrb(": platform does not support more than %d raid volumes.\n", + super->orom->vphba); + return 0; + } + } + return 1; +} + +static int imsm_get_free_size(struct supertype *st, int raiddisks, + unsigned long long size, int chunk, + unsigned long long *freesize) +{ + struct intel_super *super = st->sb; + struct imsm_super *mpb = super->anchor; + struct dl *dl; + int i; + int extent_cnt; + struct extent *e; + unsigned long long maxsize; + unsigned long long minsize; + int cnt; + int used; + + /* find the largest common start free region of the possible disks */ + used = 0; + extent_cnt = 0; + cnt = 0; + for (dl = super->disks; dl; dl = dl->next) { + dl->raiddisk = -1; + + if (dl->index >= 0) + used++; + + /* don't activate new spares if we are orom constrained + * and there is already a volume active in the container + */ + if (super->orom && dl->index < 0 && mpb->num_raid_devs) + continue; + + e = get_extents(super, dl); + if (!e) + continue; + for (i = 1; e[i-1].size; i++) + ; + dl->e = e; + dl->extent_cnt = i; + extent_cnt += i; + cnt++; + } + + maxsize = merge_extents(super, extent_cnt); + minsize = size; + if (size == 0) + /* chunk is in K */ + minsize = chunk * 2; + + if (cnt < raiddisks || + (super->orom && used && used != raiddisks) || + maxsize < minsize || + maxsize == 0) { + pr_err("not enough devices with space to create array.\n"); + return 0; /* No enough free spaces large enough */ + } + + if (size == 0) { + size = maxsize; + if (chunk) { + size /= 2 * chunk; + size *= 2 * chunk; + } + maxsize = size; + } + if (!check_env("IMSM_NO_PLATFORM") && + mpb->num_raid_devs > 0 && size && size != maxsize) { + pr_err("attempting to create a second volume with size less then remaining space. 
Aborting...\n"); + return 0; + } + cnt = 0; + for (dl = super->disks; dl; dl = dl->next) + if (dl->e) + dl->raiddisk = cnt++; + + *freesize = size; + + dprintf("imsm: imsm_get_free_size() returns : %llu\n", size); + + return 1; +} + +static int reserve_space(struct supertype *st, int raiddisks, + unsigned long long size, int chunk, + unsigned long long *freesize) +{ + struct intel_super *super = st->sb; + struct dl *dl; + int cnt; + int rv = 0; + + rv = imsm_get_free_size(st, raiddisks, size, chunk, freesize); + if (rv) { + cnt = 0; + for (dl = super->disks; dl; dl = dl->next) + if (dl->e) + dl->raiddisk = cnt++; + rv = 1; + } + + return rv; +} + +static int validate_geometry_imsm(struct supertype *st, int level, int layout, + int raiddisks, int *chunk, unsigned long long size, + unsigned long long data_offset, + char *dev, unsigned long long *freesize, + int verbose) +{ + int fd, cfd; + struct mdinfo *sra; + int is_member = 0; + + /* load capability + * if given unused devices create a container + * if given given devices in a container create a member volume + */ + if (level == LEVEL_CONTAINER) { + /* Must be a fresh device to add to a container */ + return validate_geometry_imsm_container(st, level, layout, + raiddisks, + *chunk, + size, data_offset, + dev, freesize, + verbose); + } + + if (!dev) { + if (st->sb) { + struct intel_super *super = st->sb; + if (!validate_geometry_imsm_orom(st->sb, level, layout, + raiddisks, chunk, size, + verbose)) + return 0; + /* we are being asked to automatically layout a + * new volume based on the current contents of + * the container. If the the parameters can be + * satisfied reserve_space will record the disks, + * start offset, and size of the volume to be + * created. add_to_super and getinfo_super + * detect when autolayout is in progress. 
+ */ + /* assuming that freesize is always given when array is + created */ + if (super->orom && freesize) { + int count; + count = count_volumes(super->hba, + super->orom->dpa, verbose); + if (super->orom->vphba <= count) { + pr_vrb(": platform does not support more than %d raid volumes.\n", + super->orom->vphba); + return 0; + } + } + if (freesize) + return reserve_space(st, raiddisks, size, + *chunk, freesize); + } + return 1; + } + if (st->sb) { + /* creating in a given container */ + return validate_geometry_imsm_volume(st, level, layout, + raiddisks, chunk, size, + data_offset, + dev, freesize, verbose); + } + + /* This device needs to be a device in an 'imsm' container */ + fd = open(dev, O_RDONLY|O_EXCL, 0); + if (fd >= 0) { + if (verbose) + pr_err("Cannot create this array on device %s\n", + dev); + close(fd); + return 0; + } + if (errno != EBUSY || (fd = open(dev, O_RDONLY, 0)) < 0) { + if (verbose) + pr_err("Cannot open %s: %s\n", + dev, strerror(errno)); + return 0; + } + /* Well, it is in use by someone, maybe an 'imsm' container. */ + cfd = open_container(fd); + close(fd); + if (cfd < 0) { + if (verbose) + pr_err("Cannot use %s: It is busy\n", + dev); + return 0; + } + sra = sysfs_read(cfd, NULL, GET_VERSION); + if (sra && sra->array.major_version == -1 && + strcmp(sra->text_version, "imsm") == 0) + is_member = 1; + sysfs_free(sra); + if (is_member) { + /* This is a member of a imsm container. Load the container + * and try to create a volume + */ + struct intel_super *super; + + if (load_super_imsm_all(st, cfd, (void **) &super, NULL, NULL, 1) == 0) { + st->sb = super; + strcpy(st->container_devnm, fd2devnm(cfd)); + close(cfd); + return validate_geometry_imsm_volume(st, level, layout, + raiddisks, chunk, + size, data_offset, dev, + freesize, 1) + ? 
1 : -1; + } + } + + if (verbose) + pr_err("failed container membership check\n"); + + close(cfd); + return 0; +} + +static void default_geometry_imsm(struct supertype *st, int *level, int *layout, int *chunk) +{ + struct intel_super *super = st->sb; + + if (level && *level == UnSet) + *level = LEVEL_CONTAINER; + + if (level && layout && *layout == UnSet) + *layout = imsm_level_to_layout(*level); + + if (chunk && (*chunk == UnSet || *chunk == 0)) + *chunk = imsm_default_chunk(super->orom); +} + +static void handle_missing(struct intel_super *super, struct imsm_dev *dev); + +static int kill_subarray_imsm(struct supertype *st) +{ + /* remove the subarray currently referenced by ->current_vol */ + __u8 i; + struct intel_dev **dp; + struct intel_super *super = st->sb; + __u8 current_vol = super->current_vol; + struct imsm_super *mpb = super->anchor; + + if (super->current_vol < 0) + return 2; + super->current_vol = -1; /* invalidate subarray cursor */ + + /* block deletions that would change the uuid of active subarrays + * + * FIXME when immutable ids are available, but note that we'll + * also need to fixup the invalidated/active subarray indexes in + * mdstat + */ + for (i = 0; i < mpb->num_raid_devs; i++) { + char subarray[4]; + + if (i < current_vol) + continue; + sprintf(subarray, "%u", i); + if (is_subarray_active(subarray, st->devnm)) { + pr_err("deleting subarray-%d would change the UUID of active subarray-%d, aborting\n", + current_vol, i); + + return 2; + } + } + + if (st->update_tail) { + struct imsm_update_kill_array *u = xmalloc(sizeof(*u)); + + u->type = update_kill_array; + u->dev_idx = current_vol; + append_metadata_update(st, u, sizeof(*u)); + + return 0; + } + + for (dp = &super->devlist; *dp;) + if ((*dp)->index == current_vol) { + *dp = (*dp)->next; + } else { + handle_missing(super, (*dp)->dev); + if ((*dp)->index > current_vol) + (*dp)->index--; + dp = &(*dp)->next; + } + + /* no more raid devices, all active components are now spares, + * but of 
course failed are still failed + */ + if (--mpb->num_raid_devs == 0) { + struct dl *d; + + for (d = super->disks; d; d = d->next) + if (d->index > -2) + mark_spare(d); + } + + super->updates_pending++; + + return 0; +} + +static int update_subarray_imsm(struct supertype *st, char *subarray, + char *update, struct mddev_ident *ident) +{ + /* update the subarray currently referenced by ->current_vol */ + struct intel_super *super = st->sb; + struct imsm_super *mpb = super->anchor; + + if (strcmp(update, "name") == 0) { + char *name = ident->name; + char *ep; + int vol; + + if (is_subarray_active(subarray, st->devnm)) { + pr_err("Unable to update name of active subarray\n"); + return 2; + } + + if (!check_name(super, name, 0)) + return 2; + + vol = strtoul(subarray, &ep, 10); + if (*ep != '\0' || vol >= super->anchor->num_raid_devs) + return 2; + + if (st->update_tail) { + struct imsm_update_rename_array *u = xmalloc(sizeof(*u)); + + u->type = update_rename_array; + u->dev_idx = vol; + snprintf((char *) u->name, MAX_RAID_SERIAL_LEN, "%s", name); + append_metadata_update(st, u, sizeof(*u)); + } else { + struct imsm_dev *dev; + int i; + + dev = get_imsm_dev(super, vol); + snprintf((char *) dev->volume, MAX_RAID_SERIAL_LEN, "%s", name); + for (i = 0; i < mpb->num_raid_devs; i++) { + dev = get_imsm_dev(super, i); + handle_missing(super, dev); + } + super->updates_pending++; + } + } else + return 2; + + return 0; +} +#endif /* MDASSEMBLE */ + +static int is_gen_migration(struct imsm_dev *dev) +{ + if (dev == NULL) + return 0; + + if (!dev->vol.migr_state) + return 0; + + if (migr_type(dev) == MIGR_GEN_MIGR) + return 1; + + return 0; +} + +static int is_rebuilding(struct imsm_dev *dev) +{ + struct imsm_map *migr_map; + + if (!dev->vol.migr_state) + return 0; + + if (migr_type(dev) != MIGR_REBUILD) + return 0; + + migr_map = get_imsm_map(dev, MAP_1); + + if (migr_map->map_state == IMSM_T_STATE_DEGRADED) + return 1; + else + return 0; +} + +#ifndef MDASSEMBLE +static int 
is_initializing(struct imsm_dev *dev) +{ + struct imsm_map *migr_map; + + if (!dev->vol.migr_state) + return 0; + + if (migr_type(dev) != MIGR_INIT) + return 0; + + migr_map = get_imsm_map(dev, MAP_1); + + if (migr_map->map_state == IMSM_T_STATE_UNINITIALIZED) + return 1; + + return 0; +} +#endif + +static void update_recovery_start(struct intel_super *super, + struct imsm_dev *dev, + struct mdinfo *array) +{ + struct mdinfo *rebuild = NULL; + struct mdinfo *d; + __u32 units; + + if (!is_rebuilding(dev)) + return; + + /* Find the rebuild target, but punt on the dual rebuild case */ + for (d = array->devs; d; d = d->next) + if (d->recovery_start == 0) { + if (rebuild) + return; + rebuild = d; + } + + if (!rebuild) { + /* (?) none of the disks are marked with + * IMSM_ORD_REBUILD, so assume they are missing and the + * disk_ord_tbl was not correctly updated + */ + dprintf("failed to locate out-of-sync disk\n"); + return; + } + + units = __le32_to_cpu(dev->vol.curr_migr_unit); + rebuild->recovery_start = units * blocks_per_migr_unit(super, dev); +} + +#ifndef MDASSEMBLE +static int recover_backup_imsm(struct supertype *st, struct mdinfo *info); +#endif + +static struct mdinfo *container_content_imsm(struct supertype *st, char *subarray) +{ + /* Given a container loaded by load_super_imsm_all, + * extract information about all the arrays into + * an mdinfo tree. + * If 'subarray' is given, just extract info about that array. + * + * For each imsm_dev create an mdinfo, fill it in, + * then look for matching devices in super->disks + * and create appropriate device mdinfo. 
+ */ + struct intel_super *super = st->sb; + struct imsm_super *mpb = super->anchor; + struct mdinfo *rest = NULL; + unsigned int i; + int sb_errors = 0; + struct dl *d; + int spare_disks = 0; + + /* do not assemble arrays when not all attributes are supported */ + if (imsm_check_attributes(mpb->attributes) == 0) { + sb_errors = 1; + pr_err("Unsupported attributes in IMSM metadata.Arrays activation is blocked.\n"); + } + + /* check for bad blocks */ + if (imsm_bbm_log_size(super->anchor)) { + pr_err("BBM log found in IMSM metadata.Arrays activation is blocked.\n"); + sb_errors = 1; + } + + /* count spare devices, not used in maps + */ + for (d = super->disks; d; d = d->next) + if (d->index == -1) + spare_disks++; + + for (i = 0; i < mpb->num_raid_devs; i++) { + struct imsm_dev *dev; + struct imsm_map *map; + struct imsm_map *map2; + struct mdinfo *this; + int slot; +#ifndef MDASSEMBLE + int chunk; +#endif + char *ep; + + if (subarray && + (i != strtoul(subarray, &ep, 10) || *ep != '\0')) + continue; + + dev = get_imsm_dev(super, i); + map = get_imsm_map(dev, MAP_0); + map2 = get_imsm_map(dev, MAP_1); + + /* do not publish arrays that are in the middle of an + * unsupported migration + */ + if (dev->vol.migr_state && + (migr_type(dev) == MIGR_STATE_CHANGE)) { + pr_err("cannot assemble volume '%.16s': unsupported migration in progress\n", + dev->volume); + continue; + } + /* do not publish arrays that are not support by controller's + * OROM/EFI + */ + + this = xmalloc(sizeof(*this)); + + super->current_vol = i; + getinfo_super_imsm_volume(st, this, NULL); + this->next = rest; +#ifndef MDASSEMBLE + chunk = __le16_to_cpu(map->blocks_per_strip) >> 1; + /* mdadm does not support all metadata features- set the bit in all arrays state */ + if (!validate_geometry_imsm_orom(super, + get_imsm_raid_level(map), /* RAID level */ + imsm_level_to_layout(get_imsm_raid_level(map)), + map->num_members, /* raid disks */ + &chunk, join_u32(dev->size_low, dev->size_high), + 1 /* 
verbose */)) { + pr_err("IMSM RAID geometry validation failed. Array %s activation is blocked.\n", + dev->volume); + this->array.state |= + (1<array.state |= + (1<num_members; slot++) { + unsigned long long recovery_start; + struct mdinfo *info_d; + struct dl *d; + int idx; + int skip; + __u32 ord; + + skip = 0; + idx = get_imsm_disk_idx(dev, slot, MAP_0); + ord = get_imsm_ord_tbl_ent(dev, slot, MAP_X); + for (d = super->disks; d ; d = d->next) + if (d->index == idx) + break; + + recovery_start = MaxSector; + if (d == NULL) + skip = 1; + if (d && is_failed(&d->disk)) + skip = 1; + if (ord & IMSM_ORD_REBUILD) + recovery_start = 0; + + /* + * if we skip some disks the array will be assmebled degraded; + * reset resync start to avoid a dirty-degraded + * situation when performing the intial sync + * + * FIXME handle dirty degraded + */ + if ((skip || recovery_start == 0) && !dev->vol.dirty) + this->resync_start = MaxSector; + if (skip) + continue; + + info_d = xcalloc(1, sizeof(*info_d)); + info_d->next = this->devs; + this->devs = info_d; + + info_d->disk.number = d->index; + info_d->disk.major = d->major; + info_d->disk.minor = d->minor; + info_d->disk.raid_disk = slot; + info_d->recovery_start = recovery_start; + if (map2) { + if (slot < map2->num_members) + info_d->disk.state = (1 << MD_DISK_ACTIVE); + else + this->array.spare_disks++; + } else { + if (slot < map->num_members) + info_d->disk.state = (1 << MD_DISK_ACTIVE); + else + this->array.spare_disks++; + } + if (info_d->recovery_start == MaxSector) + this->array.working_disks++; + + info_d->events = __le32_to_cpu(mpb->generation_num); + info_d->data_offset = pba_of_lba0(map); + info_d->component_size = blocks_per_member(map); + } + /* now that the disk list is up-to-date fixup recovery_start */ + update_recovery_start(super, dev, this); + this->array.spare_disks += spare_disks; + +#ifndef MDASSEMBLE + /* check for reshape */ + if (this->reshape_active == 1) + recover_backup_imsm(st, this); +#endif + rest = 
this; + } + + return rest; +} + +static __u8 imsm_check_degraded(struct intel_super *super, struct imsm_dev *dev, + int failed, int look_in_map) +{ + struct imsm_map *map; + + map = get_imsm_map(dev, look_in_map); + + if (!failed) + return map->map_state == IMSM_T_STATE_UNINITIALIZED ? + IMSM_T_STATE_UNINITIALIZED : IMSM_T_STATE_NORMAL; + + switch (get_imsm_raid_level(map)) { + case 0: + return IMSM_T_STATE_FAILED; + break; + case 1: + if (failed < map->num_members) + return IMSM_T_STATE_DEGRADED; + else + return IMSM_T_STATE_FAILED; + break; + case 10: + { + /** + * check to see if any mirrors have failed, otherwise we + * are degraded. Even numbered slots are mirrored on + * slot+1 + */ + int i; + /* gcc -Os complains that this is unused */ + int insync = insync; + + for (i = 0; i < map->num_members; i++) { + __u32 ord = get_imsm_ord_tbl_ent(dev, i, MAP_X); + int idx = ord_to_idx(ord); + struct imsm_disk *disk; + + /* reset the potential in-sync count on even-numbered + * slots. num_copies is always 2 for imsm raid10 + */ + if ((i & 1) == 0) + insync = 2; + + disk = get_imsm_disk(super, idx); + if (!disk || is_failed(disk) || ord & IMSM_ORD_REBUILD) + insync--; + + /* no in-sync disks left in this mirror the + * array has failed + */ + if (insync == 0) + return IMSM_T_STATE_FAILED; + } + + return IMSM_T_STATE_DEGRADED; + } + case 5: + if (failed < 2) + return IMSM_T_STATE_DEGRADED; + else + return IMSM_T_STATE_FAILED; + break; + default: + break; + } + + return map->map_state; +} + +static int imsm_count_failed(struct intel_super *super, struct imsm_dev *dev, + int look_in_map) +{ + int i; + int failed = 0; + struct imsm_disk *disk; + struct imsm_map *map = get_imsm_map(dev, MAP_0); + struct imsm_map *prev = get_imsm_map(dev, MAP_1); + struct imsm_map *map_for_loop; + __u32 ord; + int idx; + int idx_1; + + /* at the beginning of migration we set IMSM_ORD_REBUILD on + * disks that are being rebuilt. New failures are recorded to + * map[0]. 
So we look through all the disks we started with and + * see if any failures are still present, or if any new ones + * have arrived + */ + map_for_loop = map; + if (prev && (map->num_members < prev->num_members)) + map_for_loop = prev; + + for (i = 0; i < map_for_loop->num_members; i++) { + idx_1 = -255; + /* when MAP_X is passed both maps failures are counted + */ + if (prev && + ((look_in_map == MAP_1) || (look_in_map == MAP_X)) && + (i < prev->num_members)) { + ord = __le32_to_cpu(prev->disk_ord_tbl[i]); + idx_1 = ord_to_idx(ord); + + disk = get_imsm_disk(super, idx_1); + if (!disk || is_failed(disk) || ord & IMSM_ORD_REBUILD) + failed++; + } + if (((look_in_map == MAP_0) || (look_in_map == MAP_X)) && + (i < map->num_members)) { + ord = __le32_to_cpu(map->disk_ord_tbl[i]); + idx = ord_to_idx(ord); + + if (idx != idx_1) { + disk = get_imsm_disk(super, idx); + if (!disk || is_failed(disk) || + ord & IMSM_ORD_REBUILD) + failed++; + } + } + } + + return failed; +} + +#ifndef MDASSEMBLE +static int imsm_open_new(struct supertype *c, struct active_array *a, + char *inst) +{ + struct intel_super *super = c->sb; + struct imsm_super *mpb = super->anchor; + + if (atoi(inst) >= mpb->num_raid_devs) { + pr_err("subarry index %d, out of range\n", atoi(inst)); + return -ENODEV; + } + + dprintf("imsm: open_new %s\n", inst); + a->info.container_member = atoi(inst); + return 0; +} + +static int is_resyncing(struct imsm_dev *dev) +{ + struct imsm_map *migr_map; + + if (!dev->vol.migr_state) + return 0; + + if (migr_type(dev) == MIGR_INIT || + migr_type(dev) == MIGR_REPAIR) + return 1; + + if (migr_type(dev) == MIGR_GEN_MIGR) + return 0; + + migr_map = get_imsm_map(dev, MAP_1); + + if ((migr_map->map_state == IMSM_T_STATE_NORMAL) && + (dev->vol.migr_type != MIGR_GEN_MIGR)) + return 1; + else + return 0; +} + +/* return true if we recorded new information */ +static int mark_failure(struct imsm_dev *dev, struct imsm_disk *disk, int idx) +{ + __u32 ord; + int slot; + struct imsm_map 
*map; + char buf[MAX_RAID_SERIAL_LEN+3]; + unsigned int len, shift = 0; + + /* new failures are always set in map[0] */ + map = get_imsm_map(dev, MAP_0); + + slot = get_imsm_disk_slot(map, idx); + if (slot < 0) + return 0; + + ord = __le32_to_cpu(map->disk_ord_tbl[slot]); + if (is_failed(disk) && (ord & IMSM_ORD_REBUILD)) + return 0; + + memcpy(buf, disk->serial, MAX_RAID_SERIAL_LEN); + buf[MAX_RAID_SERIAL_LEN] = '\000'; + strcat(buf, ":0"); + if ((len = strlen(buf)) >= MAX_RAID_SERIAL_LEN) + shift = len - MAX_RAID_SERIAL_LEN + 1; + strncpy((char *)disk->serial, &buf[shift], MAX_RAID_SERIAL_LEN); + + disk->status |= FAILED_DISK; + set_imsm_ord_tbl_ent(map, slot, idx | IMSM_ORD_REBUILD); + /* mark failures in second map if second map exists and this disk + * in this slot. + * This is valid for migration, initialization and rebuild + */ + if (dev->vol.migr_state) { + struct imsm_map *map2 = get_imsm_map(dev, MAP_1); + int slot2 = get_imsm_disk_slot(map2, idx); + + if ((slot2 < map2->num_members) && + (slot2 >= 0)) + set_imsm_ord_tbl_ent(map2, slot2, + idx | IMSM_ORD_REBUILD); + } + if (map->failed_disk_num == 0xff) + map->failed_disk_num = slot; + return 1; +} + +static void mark_missing(struct imsm_dev *dev, struct imsm_disk *disk, int idx) +{ + mark_failure(dev, disk, idx); + + if (disk->scsi_id == __cpu_to_le32(~(__u32)0)) + return; + + disk->scsi_id = __cpu_to_le32(~(__u32)0); + memmove(&disk->serial[0], &disk->serial[1], MAX_RAID_SERIAL_LEN - 1); +} + +static void handle_missing(struct intel_super *super, struct imsm_dev *dev) +{ + struct dl *dl; + + if (!super->missing) + return; + + /* When orom adds replacement for missing disk it does + * not remove entry of missing disk, but just updates map with + * new added disk. 
So it is not enough just to test if there is + * any missing disk, we have to look if there are any failed disks + * in map to stop migration */ + + dprintf("imsm: mark missing\n"); + /* end process for initialization and rebuild only + */ + if (is_gen_migration(dev) == 0) { + __u8 map_state; + int failed; + + failed = imsm_count_failed(super, dev, MAP_0); + map_state = imsm_check_degraded(super, dev, failed, MAP_0); + + if (failed) + end_migration(dev, super, map_state); + } + for (dl = super->missing; dl; dl = dl->next) + mark_missing(dev, &dl->disk, dl->index); + super->updates_pending++; +} + +static unsigned long long imsm_set_array_size(struct imsm_dev *dev, + long long new_size) +{ + int used_disks = imsm_num_data_members(dev, MAP_0); + unsigned long long array_blocks; + struct imsm_map *map; + + if (used_disks == 0) { + /* when problems occures + * return current array_blocks value + */ + array_blocks = __le32_to_cpu(dev->size_high); + array_blocks = array_blocks << 32; + array_blocks += __le32_to_cpu(dev->size_low); + + return array_blocks; + } + + /* set array size in metadata + */ + if (new_size <= 0) { + /* OLCE size change is caused by added disks + */ + map = get_imsm_map(dev, MAP_0); + array_blocks = blocks_per_member(map) * used_disks; + } else { + /* Online Volume Size Change + * Using available free space + */ + array_blocks = new_size; + } + + /* round array size down to closest MB + */ + array_blocks = (array_blocks >> SECT_PER_MB_SHIFT) << SECT_PER_MB_SHIFT; + dev->size_low = __cpu_to_le32((__u32)array_blocks); + dev->size_high = __cpu_to_le32((__u32)(array_blocks >> 32)); + + return array_blocks; +} + +static void imsm_set_disk(struct active_array *a, int n, int state); + +static void imsm_progress_container_reshape(struct intel_super *super) +{ + /* if no device has a migr_state, but some device has a + * different number of members than the previous device, start + * changing the number of devices in this device to match + * previous. 
+ */ + struct imsm_super *mpb = super->anchor; + int prev_disks = -1; + int i; + int copy_map_size; + + for (i = 0; i < mpb->num_raid_devs; i++) { + struct imsm_dev *dev = get_imsm_dev(super, i); + struct imsm_map *map = get_imsm_map(dev, MAP_0); + struct imsm_map *map2; + int prev_num_members; + + if (dev->vol.migr_state) + return; + + if (prev_disks == -1) + prev_disks = map->num_members; + if (prev_disks == map->num_members) + continue; + + /* OK, this array needs to enter reshape mode. + * i.e it needs a migr_state + */ + + copy_map_size = sizeof_imsm_map(map); + prev_num_members = map->num_members; + map->num_members = prev_disks; + dev->vol.migr_state = 1; + dev->vol.curr_migr_unit = 0; + set_migr_type(dev, MIGR_GEN_MIGR); + for (i = prev_num_members; + i < map->num_members; i++) + set_imsm_ord_tbl_ent(map, i, i); + map2 = get_imsm_map(dev, MAP_1); + /* Copy the current map */ + memcpy(map2, map, copy_map_size); + map2->num_members = prev_num_members; + + imsm_set_array_size(dev, -1); + super->clean_migration_record_by_mdmon = 1; + super->updates_pending++; + } +} + +/* Handle dirty -> clean transititions, resync and reshape. Degraded and rebuild + * states are handled in imsm_set_disk() with one exception, when a + * resync is stopped due to a new failure this routine will set the + * 'degraded' state for the array. 
+ */ +static int imsm_set_array_state(struct active_array *a, int consistent) +{ + int inst = a->info.container_member; + struct intel_super *super = a->container->sb; + struct imsm_dev *dev = get_imsm_dev(super, inst); + struct imsm_map *map = get_imsm_map(dev, MAP_0); + int failed = imsm_count_failed(super, dev, MAP_0); + __u8 map_state = imsm_check_degraded(super, dev, failed, MAP_0); + __u32 blocks_per_unit; + + if (dev->vol.migr_state && + dev->vol.migr_type == MIGR_GEN_MIGR) { + /* array state change is blocked due to reshape action + * We might need to + * - abort the reshape (if last_checkpoint is 0 and action!= reshape) + * - finish the reshape (if last_checkpoint is big and action != reshape) + * - update curr_migr_unit + */ + if (a->curr_action == reshape) { + /* still reshaping, maybe update curr_migr_unit */ + goto mark_checkpoint; + } else { + if (a->last_checkpoint == 0 && a->prev_action == reshape) { + /* for some reason we aborted the reshape. + * + * disable automatic metadata rollback + * user action is required to recover process + */ + if (0) { + struct imsm_map *map2 = + get_imsm_map(dev, MAP_1); + dev->vol.migr_state = 0; + set_migr_type(dev, 0); + dev->vol.curr_migr_unit = 0; + memcpy(map, map2, + sizeof_imsm_map(map2)); + super->updates_pending++; + } + } + if (a->last_checkpoint >= a->info.component_size) { + unsigned long long array_blocks; + int used_disks; + struct mdinfo *mdi; + + used_disks = imsm_num_data_members(dev, MAP_0); + if (used_disks > 0) { + array_blocks = + blocks_per_member(map) * + used_disks; + /* round array size down to closest MB + */ + array_blocks = (array_blocks + >> SECT_PER_MB_SHIFT) + << SECT_PER_MB_SHIFT; + a->info.custom_array_size = array_blocks; + /* encourage manager to update array + * size + */ + + a->check_reshape = 1; + } + /* finalize online capacity expansion/reshape */ + for (mdi = a->info.devs; mdi; mdi = mdi->next) + imsm_set_disk(a, + mdi->disk.raid_disk, + mdi->curr_state); + + 
imsm_progress_container_reshape(super);
+			}
+		}
+	}
+
+	/* before we activate this array handle any missing disks */
+	if (consistent == 2)
+		handle_missing(super, dev);
+
+	if (consistent == 2 &&
+	    (!is_resync_complete(&a->info) ||
+	     map_state != IMSM_T_STATE_NORMAL ||
+	     dev->vol.migr_state))
+		consistent = 0;
+
+	if (is_resync_complete(&a->info)) {
+		/* complete initialization / resync,
+		 * recovery and interrupted recovery is completed in
+		 * ->set_disk
+		 */
+		if (is_resyncing(dev)) {
+			dprintf("imsm: mark resync done\n");
+			end_migration(dev, super, map_state);
+			super->updates_pending++;
+			a->last_checkpoint = 0;
+		}
+	} else if ((!is_resyncing(dev) && !failed) &&
+		   (imsm_reshape_blocks_arrays_changes(super) == 0)) {
+		/* mark the start of the init process if nothing is failed */
+		dprintf("imsm: mark resync start\n");
+		if (map->map_state == IMSM_T_STATE_UNINITIALIZED)
+			migrate(dev, super, IMSM_T_STATE_NORMAL, MIGR_INIT);
+		else
+			migrate(dev, super, IMSM_T_STATE_NORMAL, MIGR_REPAIR);
+		super->updates_pending++;
+	}
+
+mark_checkpoint:
+	/* skip checkpointing for general migration,
+	 * it is controlled in mdadm
+	 */
+	if (is_gen_migration(dev))
+		goto skip_mark_checkpoint;
+
+	/* check if we can update curr_migr_unit from resync_start, recovery_start */
+	blocks_per_unit = blocks_per_migr_unit(super, dev);
+	if (blocks_per_unit) {
+		__u32 units32;
+		__u64 units;
+
+		units = a->last_checkpoint / blocks_per_unit;
+		units32 = units;
+
+		/* check that we did not overflow 32-bits, and that
+		 * curr_migr_unit needs updating
+		 */
+		if (units32 == units &&
+		    units32 != 0 &&
+		    __le32_to_cpu(dev->vol.curr_migr_unit) != units32) {
+			dprintf("imsm: mark checkpoint (%u)\n", units32);
+			dev->vol.curr_migr_unit = __cpu_to_le32(units32);
+			super->updates_pending++;
+		}
+	}
+
+skip_mark_checkpoint:
+	/* mark dirty / clean */
+	if (dev->vol.dirty != !consistent) {
+		dprintf("imsm: mark '%s'\n", consistent ? "clean" : "dirty");
+		if (consistent)
+			dev->vol.dirty = 0;
+		else
+			dev->vol.dirty = 1;
+		super->updates_pending++;
+	}
+
+	return consistent;
+}
+
+/* Record a state change of member slot 'n' of array 'a' in the metadata.
+ *
+ * 'state' is a DS_* bit mask: DS_FAULTY marks the disk failed, DS_INSYNC
+ * on a slot flagged IMSM_ORD_REBUILD clears the rebuild flag in the second
+ * map.  Afterwards the failed-disk count is re-evaluated and the map state
+ * moved to normal / degraded / failed as appropriate.  Every metadata
+ * modification bumps super->updates_pending; nothing is written here.
+ */
+static void imsm_set_disk(struct active_array *a, int n, int state)
+{
+	int inst = a->info.container_member;
+	struct intel_super *super = a->container->sb;
+	struct imsm_dev *dev = get_imsm_dev(super, inst);
+	struct imsm_map *map = get_imsm_map(dev, MAP_0);
+	struct imsm_disk *disk;
+	struct mdinfo *mdi;
+	int recovery_not_finished = 0;
+	int failed;
+	__u32 ord;
+	__u8 map_state;
+
+	/* valid slots are 0..num_members-1 (as the message states), so
+	 * n == num_members is out of range as well.  Do not carry on:
+	 * the ord table lookup below would index past the map.
+	 */
+	if (n >= map->num_members) {
+		pr_err("imsm: set_disk %d out of range 0..%d\n",
+			n, map->num_members - 1);
+		return;
+	}
+
+	if (n < 0)
+		return;
+
+	dprintf("imsm: set_disk %d:%x\n", n, state);
+
+	ord = get_imsm_ord_tbl_ent(dev, n, MAP_0);
+	disk = get_imsm_disk(super, ord_to_idx(ord));
+
+	/* check for new failures */
+	if (state & DS_FAULTY) {
+		if (mark_failure(dev, disk, ord_to_idx(ord)))
+			super->updates_pending++;
+	}
+
+	/* check if in_sync */
+	if (state & DS_INSYNC && ord & IMSM_ORD_REBUILD && is_rebuilding(dev)) {
+		struct imsm_map *migr_map = get_imsm_map(dev, MAP_1);
+
+		set_imsm_ord_tbl_ent(migr_map, n, ord_to_idx(ord));
+		super->updates_pending++;
+	}
+
+	failed = imsm_count_failed(super, dev, MAP_0);
+	map_state = imsm_check_degraded(super, dev, failed, MAP_0);
+
+	/* check if recovery complete, newly degraded, or failed */
+	dprintf("imsm: Detected transition to state ");
+	switch (map_state) {
+	case IMSM_T_STATE_NORMAL: /* transition to normal state */
+		/* continuation of the line started above, like the
+		 * "degraded: " and "failed: " cases
+		 */
+		dprintf_cont("normal: ");
+		if (is_rebuilding(dev)) {
+			dprintf_cont("while rebuilding");
+			/* check if recovery is really finished */
+			for (mdi = a->info.devs; mdi ; mdi = mdi->next)
+				if (mdi->recovery_start != MaxSector) {
+					recovery_not_finished = 1;
+					break;
+				}
+			if (recovery_not_finished) {
+				/* mdi is the first device still recovering */
+				dprintf_cont("\n");
+				dprintf("Rebuild has not finished yet, state not changed");
+				if (a->last_checkpoint < mdi->recovery_start) {
+					a->last_checkpoint = mdi->recovery_start;
+					super->updates_pending++;
+				}
+				break;
+			}
+			end_migration(dev, super, map_state);
+			map = get_imsm_map(dev, MAP_0);
+			map->failed_disk_num = ~0;
+			super->updates_pending++;
+			a->last_checkpoint = 0;
+			break;
+		}
+		if (is_gen_migration(dev)) {
+			dprintf_cont("while general migration");
+			if (a->last_checkpoint >= a->info.component_size)
+				end_migration(dev, super, map_state);
+			else
+				map->map_state = map_state;
+			map = get_imsm_map(dev, MAP_0);
+			map->failed_disk_num = ~0;
+			super->updates_pending++;
+			break;
+		}
+		break;
+	case IMSM_T_STATE_DEGRADED: /* transition to degraded state */
+		dprintf_cont("degraded: ");
+		if ((map->map_state != map_state) &&
+		    !dev->vol.migr_state) {
+			dprintf_cont("mark degraded");
+			map->map_state = map_state;
+			super->updates_pending++;
+			a->last_checkpoint = 0;
+			break;
+		}
+		if (is_rebuilding(dev)) {
+			dprintf_cont("while rebuilding.");
+			if (map->map_state != map_state)  {
+				dprintf_cont(" Map state change");
+				end_migration(dev, super, map_state);
+				super->updates_pending++;
+			}
+			break;
+		}
+		if (is_gen_migration(dev)) {
+			dprintf_cont("while general migration");
+			if (a->last_checkpoint >= a->info.component_size)
+				end_migration(dev, super, map_state);
+			else {
+				map->map_state = map_state;
+				manage_second_map(super, dev);
+			}
+			super->updates_pending++;
+			break;
+		}
+		if (is_initializing(dev)) {
+			dprintf_cont("while initialization.");
+			map->map_state = map_state;
+			super->updates_pending++;
+			break;
+		}
+		break;
+	case IMSM_T_STATE_FAILED: /* transition to failed state */
+		dprintf_cont("failed: ");
+		if (is_gen_migration(dev)) {
+			dprintf_cont("while general migration");
+			map->map_state = map_state;
+			super->updates_pending++;
+			break;
+		}
+		if (map->map_state != map_state) {
+			dprintf_cont("mark failed");
+			end_migration(dev, super, map_state);
+			super->updates_pending++;
+			a->last_checkpoint = 0;
+			break;
+		}
+		break;
+	default:
+		dprintf_cont("state %i\n", map_state);
+	}
+	dprintf_cont("\n");
+}
+
+static int
store_imsm_mpb(int fd, struct imsm_super *mpb)
+{
+	/* Write the mpb to the end of the device open on 'fd': the first
+	 * 512 bytes (anchor) go to the second to last sector, any remaining
+	 * sectors are written immediately in front of it.
+	 * Returns 0 on success, 1 on any failure.
+	 */
+	void *buf = mpb;
+	__u32 mpb_size = __le32_to_cpu(mpb->mpb_size);
+	/* stays 0 if get_dev_size() cannot determine the size */
+	unsigned long long dsize = 0;
+	unsigned long long sectors;
+
+	get_dev_size(fd, NULL, &dsize);
+
+	/* unknown size, or too small to hold even the anchor: the offsets
+	 * computed below would wrap around
+	 */
+	if (dsize < 512 * 2)
+		return 1;
+
+	if (mpb_size > 512) {
+		/* -1 to account for anchor */
+		sectors = mpb_sectors(mpb) - 1;
+
+		if (dsize < 512 * (2 + sectors))
+			return 1;
+
+		/* write the extended mpb to the sectors preceding the anchor */
+		if (lseek64(fd, dsize - (512 * (2 + sectors)), SEEK_SET) < 0)
+			return 1;
+
+		if ((unsigned long long)write(fd, buf + 512, 512 * sectors)
+		    != 512 * sectors)
+			return 1;
+	}
+
+	/* first block is stored on second to last sector of the disk */
+	if (lseek64(fd, dsize - (512 * 2), SEEK_SET) < 0)
+		return 1;
+
+	if (write(fd, buf, 512) != 512)
+		return 1;
+
+	return 0;
+}
+
+/* Flush the container metadata via write_super_imsm() if any update is
+ * pending, then reset the pending counter.
+ */
+static void imsm_sync_metadata(struct supertype *container)
+{
+	struct intel_super *super = container->sb;
+
+	dprintf("sync metadata: %d\n", super->updates_pending);
+	if (!super->updates_pending)
+		return;
+
+	write_super_imsm(container, 0);
+
+	super->updates_pending = 0;
+}
+
+/* Look up the disk the metadata records for slot 'idx' of array 'a'.
+ * Returns its 'struct dl', or NULL when it is not on super->disks or is
+ * marked failed.
+ */
+static struct dl *imsm_readd(struct intel_super *super, int idx, struct active_array *a)
+{
+	struct imsm_dev *dev = get_imsm_dev(super, a->info.container_member);
+	int i = get_imsm_disk_idx(dev, idx, MAP_X);
+	struct dl *dl;
+
+	for (dl = super->disks; dl; dl = dl->next)
+		if (dl->index == i)
+			break;
+
+	if (dl && is_failed(&dl->disk))
+		dl = NULL;
+
+	if (dl)
+		dprintf("found %x:%x\n", dl->major, dl->minor);
+
+	return dl;
+}
+
+/* Find a disk on super->disks that can take over 'slot' of array 'a'.
+ *
+ * Disks already active in 'a', disks listed in 'additional_test_list',
+ * failed disks, the current occupant of the slot and disks with index -2
+ * are skipped.  Pure spares (index -1) are only considered when
+ * 'activate_new' is set.  A candidate must have free space at the member
+ * offset of every volume it is not already part of.
+ * Returns the chosen disk or NULL.
+ */
+static struct dl *imsm_add_spare(struct intel_super *super, int slot,
+				 struct active_array *a, int activate_new,
+				 struct mdinfo *additional_test_list)
+{
+	struct imsm_dev *dev = get_imsm_dev(super, a->info.container_member);
+	int idx = get_imsm_disk_idx(dev, slot, MAP_X);
+	struct imsm_super *mpb = super->anchor;
+	struct imsm_map *map;
+	unsigned long long pos;
+	struct mdinfo *d;
+	struct extent *ex;
+	int i, j;
+	int found;
+	/* 64-bit so that large member offsets are not truncated; other
+	 * callers of pba_of_lba0()/blocks_per_member() in this file store
+	 * the results in unsigned long long as well
+	 */
+	unsigned long long array_start = 0;
+	unsigned long long array_end = 0;
+	struct dl *dl;
+	struct mdinfo *test_list;
+
+	for (dl = super->disks; dl; dl = dl->next) {
+		/* If in this array, skip */
+		for (d = a->info.devs ; d ; d = d->next)
+			if (d->state_fd >= 0 &&
+			    d->disk.major == dl->major &&
+			    d->disk.minor == dl->minor) {
+				dprintf("%x:%x already in array\n",
+					dl->major, dl->minor);
+				break;
+			}
+		if (d)
+			continue;
+		test_list = additional_test_list;
+		while (test_list) {
+			if (test_list->disk.major == dl->major &&
+			    test_list->disk.minor == dl->minor) {
+				dprintf("%x:%x already in additional test list\n",
+					dl->major, dl->minor);
+				break;
+			}
+			test_list = test_list->next;
+		}
+		if (test_list)
+			continue;
+
+		/* skip in use or failed drives */
+		if (is_failed(&dl->disk) || idx == dl->index ||
+		    dl->index == -2) {
+			/* report the index of the disk being skipped */
+			dprintf("%x:%x status (failed: %d index: %d)\n",
+				dl->major, dl->minor, is_failed(&dl->disk),
+				dl->index);
+			continue;
+		}
+
+		/* skip pure spares when we are looking for partially
+		 * assimilated drives
+		 */
+		if (dl->index == -1 && !activate_new)
+			continue;
+
+		/* Does this unused device have the requisite free space?
+		 * It needs to be able to cover all member volumes
+		 */
+		ex = get_extents(super, dl);
+		if (!ex) {
+			dprintf("cannot get extents\n");
+			continue;
+		}
+		for (i = 0; i < mpb->num_raid_devs; i++) {
+			dev = get_imsm_dev(super, i);
+			map = get_imsm_map(dev, MAP_0);
+
+			/* check if this disk is already a member of
+			 * this array
+			 */
+			if (get_imsm_disk_slot(map, dl->index) >= 0)
+				continue;
+
+			found = 0;
+			j = 0;
+			pos = 0;
+			array_start = pba_of_lba0(map);
+			array_end = array_start +
+				    blocks_per_member(map) - 1;
+
+			/* the extent list ends with an entry of size 0 */
+			do {
+				/* check that we can start at pba_of_lba0 with
+				 * blocks_per_member of space
+				 */
+				if (array_start >= pos && array_end < ex[j].start) {
+					found = 1;
+					break;
+				}
+				pos = ex[j].start + ex[j].size;
+				j++;
+			} while (ex[j-1].size);
+
+			if (!found)
+				break;
+		}
+
+		free(ex);
+		if (i < mpb->num_raid_devs) {
+			dprintf("%x:%x does not have %llu to %llu available\n",
+				dl->major, dl->minor, array_start, array_end);
+			/* No room */
+			continue;
+		}
+		return dl;
+	}
+
+	/* list exhausted: dl is NULL here */
+	return dl;
+}
+
+static int imsm_rebuild_allowed(struct supertype *cont, int dev_idx, int failed)
+{
+	struct imsm_dev *dev2;
+	struct imsm_map *map;
+	struct dl *idisk;
+	int slot;
+	int idx;
+	__u8 state;
+
+	dev2 = get_imsm_dev(cont->sb, dev_idx);
+	if (dev2) {
+		state = imsm_check_degraded(cont->sb, dev2, failed, MAP_0);
+		if (state == IMSM_T_STATE_FAILED) {
+			map = get_imsm_map(dev2, MAP_0);
+			if (!map)
+				return 1;
+			for (slot = 0; slot < map->num_members; slot++) {
+				/*
+				 * Check if failed disks are deleted from intel
+				 * disk list or are marked to be deleted
+				 */
+				idx = get_imsm_disk_idx(dev2, slot, MAP_X);
+				idisk = get_imsm_dl_disk(cont->sb, idx);
+				/*
+				 * Do not rebuild the array if failed disks
+				 * from failed sub-array are not removed from
+				 * container.
+				 */
+				if (idisk &&
+				    is_failed(&idisk->disk) &&
+				    (idisk->action != DISK_REMOVE))
+					return 0;
+			}
+		}
+	}
+	return 1;
+}
+
+static struct mdinfo *imsm_activate_spare(struct active_array *a,
+					  struct metadata_update **updates)
+{
+	/**
+	 * Find a device with unused free space and use it to replace a
+	 * failed/vacant region in an array.  We replace failed regions one
+	 * array at a time.  The result is that a new spare disk will be added
+	 * to the first failed array and after the monitor has finished
+	 * propagating failures the remainder will be consumed.
+	 *
+	 * Returns a list of mdinfo records (one per slot that got a disk),
+	 * or NULL when nothing can or may be activated.  When the list is
+	 * not empty a single metadata_update carrying a chain of
+	 * imsm_update_activate_spare records is pushed onto *updates.
+	 *
+	 * FIXME add a capability for mdmon to request spares from another
+	 * container.
+	 */
+
+	struct intel_super *super = a->container->sb;
+	int inst = a->info.container_member;
+	struct imsm_dev *dev = get_imsm_dev(super, inst);
+	struct imsm_map *map = get_imsm_map(dev, MAP_0);
+	/* start from raid_disks and subtract every working device */
+	int failed = a->info.array.raid_disks;
+	struct mdinfo *rv = NULL;
+	struct mdinfo *d;
+	struct mdinfo *di;
+	struct metadata_update *mu;
+	struct dl *dl;
+	struct imsm_update_activate_spare *u;
+	int num_spares = 0;
+	int i;
+	int allowed;
+
+	for (d = a->info.devs ; d ; d = d->next) {
+		if ((d->curr_state & DS_FAULTY) &&
+			d->state_fd >= 0)
+			/* wait for Removal to happen */
+			return NULL;
+		if (d->state_fd >= 0)
+			failed--;
+	}
+
+	dprintf("imsm: activate spare: inst=%d failed=%d (%d) level=%d\n",
+		inst, failed, a->info.array.raid_disks, a->info.array.level);
+
+	if (imsm_reshape_blocks_arrays_changes(super))
+		return NULL;
+
+	/* Cannot activate another spare if rebuild is in progress already
+	 */
+	if (is_rebuilding(dev)) {
+		dprintf("imsm: No spare activation allowed. Rebuild in progress already.\n");
+		return NULL;
+	}
+
+	if (a->info.array.level == 4)
+		/* No repair for takeovered array
+		 * imsm doesn't support raid4
+		 */
+		return NULL;
+
+	if (imsm_check_degraded(super, dev, failed, MAP_0) !=
+			IMSM_T_STATE_DEGRADED)
+		return NULL;
+
+	/*
+	 * If there are any failed disks check state of the other volume.
+	 * Block rebuild if the another one is failed until failed disks
+	 * are removed from container.
+	 */
+	if (failed) {
+		dprintf("found failed disks in %.*s, check if there anotherfailed sub-array.\n",
+			MAX_RAID_SERIAL_LEN, dev->volume);
+		/* check if states of the other volumes allow for rebuild */
+		for (i = 0; i <  super->anchor->num_raid_devs; i++) {
+			if (i != inst) {
+				allowed = imsm_rebuild_allowed(a->container,
+							       i, failed);
+				if (!allowed)
+					return NULL;
+			}
+		}
+	}
+
+	/* For each slot, if it is not working, find a spare */
+	for (i = 0; i < a->info.array.raid_disks; i++) {
+		for (d = a->info.devs ; d ; d = d->next)
+			if (d->disk.raid_disk == i)
+				break;
+		dprintf("found %d: %p %x\n", i, d, d?d->curr_state:0);
+		if (d && (d->state_fd >= 0))
+			continue;
+
+		/*
+		 * OK, this device needs recovery.  Try to re-add the
+		 * previous occupant of this slot, if this fails see if
+		 * we can continue the assimilation of a spare that was
+		 * partially assimilated, finally try to activate a new
+		 * spare.
+		 */
+		dl = imsm_readd(super, i, a);
+		if (!dl)
+			dl = imsm_add_spare(super, i, a, 0, rv);
+		if (!dl)
+			dl = imsm_add_spare(super, i, a, 1, rv);
+		if (!dl)
+			continue;
+
+		/* found a usable disk with enough space */
+		di = xcalloc(1, sizeof(*di));
+
+		/* dl->index will be -1 in the case we are activating a
+		 * pristine spare.  imsm_process_update() will create a
+		 * new index in this case.  Once a disk is found to be
+		 * failed in all member arrays it is kicked from the
+		 * metadata
+		 */
+		di->disk.number = dl->index;
+
+		/* (ab)use di->devs to store a pointer to the device
+		 * we chose
+		 */
+		di->devs = (struct mdinfo *) dl;
+
+		di->disk.raid_disk = i;
+		di->disk.major = dl->major;
+		di->disk.minor = dl->minor;
+		di->disk.state = 0;
+		di->recovery_start = 0;
+		di->data_offset = pba_of_lba0(map);
+		di->component_size = a->info.component_size;
+		di->container_member = inst;
+		super->random = random32();
+		di->next = rv;
+		rv = di;
+		num_spares++;
+		dprintf("%x:%x to be %d at %llu\n", dl->major, dl->minor,
+			i, di->data_offset);
+	}
+
+	if (!rv)
+		/* No spares found */
+		return rv;
+	/* Now 'rv' has a list of devices to return.
+	 * Create a metadata_update record to update the
+	 * disk_ord_tbl for the array
+	 */
+	mu = xmalloc(sizeof(*mu));
+	mu->buf = xcalloc(num_spares,
+			  sizeof(struct imsm_update_activate_spare));
+	mu->space = NULL;
+	mu->space_list = NULL;
+	mu->len = sizeof(struct imsm_update_activate_spare) * num_spares;
+	mu->next = *updates;
+	u = (struct imsm_update_activate_spare *) mu->buf;
+
+	/* the records live back to back in mu->buf and are chained
+	 * through ->next; the last link is cut after the loop
+	 */
+	for (di = rv ; di ; di = di->next) {
+		u->type = update_activate_spare;
+		u->dl = (struct dl *) di->devs;
+		di->devs = NULL;
+		u->slot = di->disk.raid_disk;
+		u->array = inst;
+		u->next = u + 1;
+		u++;
+	}
+	(u-1)->next = NULL;
+	*updates = mu;
+
+	return rv;
+}
+
+/* Return 1 if any member disk of existing volume 'idx' has the same serial
+ * as one of the disks listed in create-array update 'u', 0 otherwise.
+ */
+static int disks_overlap(struct intel_super *super, int idx, struct imsm_update_create_array *u)
+{
+	struct imsm_dev *dev = get_imsm_dev(super, idx);
+	struct imsm_map *map = get_imsm_map(dev, MAP_0);
+	struct imsm_map *new_map = get_imsm_map(&u->dev, MAP_0);
+	struct disk_info *inf = get_disk_info(u);
+	struct imsm_disk *disk;
+	int i;
+	int j;
+
+	for (i = 0; i < map->num_members; i++) {
+		disk = get_imsm_disk(super, get_imsm_disk_idx(dev, i, MAP_X));
+		/* get_imsm_disk() can come back empty (other callers in
+		 * this file test for that); such a member has no serial
+		 * to compare
+		 */
+		if (!disk)
+			continue;
+		for (j = 0; j < new_map->num_members; j++)
+			if (serialcmp(disk->serial, inf[j].serial) == 0)
+				return 1;
+	}
+
+	return 0;
+}
+
+static
struct dl *get_disk_super(struct intel_super *super, int major, int minor)
+{
+	/* Return the entry of super->disks with the given device number,
+	 * or NULL if there is none.
+	 */
+	struct dl *dl;
+
+	for (dl = super->disks; dl; dl = dl->next)
+		if ((dl->major == major) &&  (dl->minor == minor))
+			return dl;
+	return NULL;
+}
+
+/* Unlink the disk with the given device number from super->disks and
+ * release it.  Always returns 0, also when no such disk is listed.
+ */
+static int remove_disk_super(struct intel_super *super, int major, int minor)
+{
+	struct dl *prev = NULL;
+	struct dl *dl;
+
+	for (dl = super->disks; dl; dl = dl->next) {
+		if ((dl->major == major) && (dl->minor == minor)) {
+			/* remove */
+			if (prev)
+				prev->next = dl->next;
+			else
+				super->disks = dl->next;
+			dl->next = NULL;
+			__free_imsm_disk(dl);
+			dprintf("removed %x:%x\n", major, minor);
+			break;
+		}
+		prev = dl;
+	}
+	return 0;
+}
+
+static void imsm_delete(struct intel_super *super, struct dl **dlp, unsigned index);
+
+/* Drain super->disk_mgmt_list.  DISK_ADD entries are moved onto
+ * super->disks; DISK_REMOVE entries flag the matching disk and drop it from
+ * the list if it is a pure spare (index -1).
+ * Returns 1 if at least one disk was added (arrays may now be repairable),
+ * 0 otherwise.
+ */
+static int add_remove_disk_update(struct intel_super *super)
+{
+	int check_degraded = 0;
+	struct dl *disk = NULL;
+	/* add/remove some spares to/from the metadata/container */
+	while (super->disk_mgmt_list) {
+		struct dl *disk_cfg;
+
+		disk_cfg = super->disk_mgmt_list;
+		super->disk_mgmt_list = disk_cfg->next;
+		disk_cfg->next = NULL;
+
+		if (disk_cfg->action == DISK_ADD) {
+			disk_cfg->next = super->disks;
+			super->disks = disk_cfg;
+			check_degraded = 1;
+			dprintf("added %x:%x\n",
+				disk_cfg->major, disk_cfg->minor);
+		} else if (disk_cfg->action == DISK_REMOVE) {
+			dprintf("Disk remove action processed: %x.%x\n",
+				disk_cfg->major, disk_cfg->minor);
+			disk = get_disk_super(super,
+					      disk_cfg->major,
+					      disk_cfg->minor);
+			if (disk) {
+				/* store action status */
+				disk->action = DISK_REMOVE;
+				/* remove spare disks only */
+				if (disk->index == -1) {
+					remove_disk_super(super,
+							  disk_cfg->major,
+							  disk_cfg->minor);
+				}
+			}
+			/* release allocate disk structure */
+			__free_imsm_disk(disk_cfg);
+		}
+	}
+	return check_degraded;
+}
+
+/* Apply a reshape-migration update (level and/or chunk size change) to
+ * volume u->subdev.
+ *
+ * The new imsm_dev is built in the first buffer of *space_list; on success
+ * the old imsm_dev is handed back through *space_list.
+ * Returns 1 when the update was applied, 0 otherwise.
+ */
+static int apply_reshape_migration_update(struct imsm_update_reshape_migration *u,
+						struct intel_super *super,
+						void ***space_list)
+{
+	struct intel_dev *id;
+	void **tofree = NULL;
+	int ret_val = 0;
+
+	dprintf("(enter)\n");
+	if ((u->subdev < 0) ||
+	    (u->subdev > 1)) {
+		dprintf("imsm: Error: Wrong subdev: %i\n", u->subdev);
+		return ret_val;
+	}
+	if ((space_list == NULL) || (*space_list == NULL)) {
+		dprintf("imsm: Error: Memory is not allocated\n");
+		return ret_val;
+	}
+
+	for (id = super->devlist ; id; id = id->next) {
+		if (id->index == (unsigned)u->subdev) {
+			struct imsm_dev *dev = get_imsm_dev(super, u->subdev);
+			struct imsm_map *map;
+			struct imsm_dev *new_dev =
+				(struct imsm_dev *)*space_list;
+			struct imsm_map *migr_map = get_imsm_map(dev, MAP_1);
+			int to_state;
+			struct dl *new_disk;
+
+			if (new_dev == NULL)
+				return ret_val;
+			/* refuse before taking the buffer off the list, so
+			 * that it is not lost when we bail out
+			 */
+			if (migr_map) {
+				dprintf("imsm: Error: migration in progress");
+				return ret_val;
+			}
+			/* pop new_dev: the head's first word links to the
+			 * next buffer
+			 */
+			*space_list = **space_list;
+			memcpy(new_dev, dev, sizeof_imsm_dev(dev, 0));
+			map = get_imsm_map(new_dev, MAP_0);
+
+			to_state = map->map_state;
+			if ((u->new_level == 5) && (map->raid_level == 0)) {
+				map->num_members++;
+				/* this should not happen */
+				if (u->new_disks[0] < 0) {
+					map->failed_disk_num =
+						map->num_members - 1;
+					to_state = IMSM_T_STATE_DEGRADED;
+				} else
+					to_state = IMSM_T_STATE_NORMAL;
+			}
+			migrate(new_dev, super, to_state, MIGR_GEN_MIGR);
+			if (u->new_level > -1)
+				map->raid_level = u->new_level;
+			migr_map = get_imsm_map(new_dev, MAP_1);
+			if ((u->new_level == 5) &&
+			    (migr_map->raid_level == 0)) {
+				int ord = map->num_members - 1;
+				migr_map->num_members--;
+				if (u->new_disks[0] < 0)
+					ord |= IMSM_ORD_REBUILD;
+				set_imsm_ord_tbl_ent(map,
+						     map->num_members - 1,
+						     ord);
+			}
+			id->dev = new_dev;
+			tofree = (void **)dev;
+
+			/* update chunk size
+			 */
+			if (u->new_chunksize > 0)
+				map->blocks_per_strip =
+					__cpu_to_le16(u->new_chunksize * 2);
+
+			/* add disk
+			 */
+			if ((u->new_level != 5) ||
+			    (migr_map->raid_level != 0) ||
+			    (migr_map->raid_level == map->raid_level))
+				goto skip_disk_add;
+
+			if (u->new_disks[0] >= 0) {
+				/* use passes spare
+				 */
+				new_disk = get_disk_super(super,
+							major(u->new_disks[0]),
+							minor(u->new_disks[0]));
+				/* check before the debug print below
+				 * dereferences it
+				 */
+				if (new_disk == NULL)
+					goto error_disk_add;
+				dprintf("imsm: new disk for reshape is: %i:%i (%p, index = %i)\n",
+					major(u->new_disks[0]),
+					minor(u->new_disks[0]),
+					new_disk, new_disk->index);
+
+				new_disk->index = map->num_members - 1;
+				/* slot to fill in autolayout
+				 */
+				new_disk->raiddisk = new_disk->index;
+				new_disk->disk.status |= CONFIGURED_DISK;
+				new_disk->disk.status &= ~SPARE_DISK;
+			} else
+				goto error_disk_add;
+
+skip_disk_add:
+			/* chain the old dev in front of the remaining
+			 * buffers; it is handed back below
+			 */
+			*tofree = *space_list;
+			/* calculate new size
+			 */
+			imsm_set_array_size(new_dev, -1);
+
+			ret_val = 1;
+		}
+	}
+
+	if (tofree)
+		*space_list = tofree;
+	return ret_val;
+
+error_disk_add:
+	dprintf("Error: imsm: Cannot find disk.\n");
+	return ret_val;
+}
+
+/* Apply a size-change update to volume u->subdev: recompute the blocks
+ * per member from u->new_size and set the new array size.
+ * Returns 1 when the update was applied, 0 otherwise.
+ */
+static int apply_size_change_update(struct imsm_update_size_change *u,
+		struct intel_super *super)
+{
+	struct intel_dev *id;
+	int ret_val = 0;
+
+	dprintf("(enter)\n");
+	if ((u->subdev < 0) ||
+	    (u->subdev > 1)) {
+		dprintf("imsm: Error: Wrong subdev: %i\n", u->subdev);
+		return ret_val;
+	}
+
+	for (id = super->devlist ; id; id = id->next) {
+		if (id->index == (unsigned)u->subdev) {
+			struct imsm_dev *dev = get_imsm_dev(super, u->subdev);
+			struct imsm_map *map = get_imsm_map(dev, MAP_0);
+			int used_disks = imsm_num_data_members(dev, MAP_0);
+			unsigned long long blocks_per_member;
+
+			/* used_disks is the divisor below */
+			if (used_disks <= 0) {
+				dprintf("imsm: Error: no data disks in subdev: %i\n",
+					u->subdev);
+				break;
+			}
+
+			/* calculate new size
+			 */
+			blocks_per_member = u->new_size / used_disks;
+			dprintf("(size: %llu, blocks per member: %llu)\n",
+				u->new_size, blocks_per_member);
+			set_blocks_per_member(map, blocks_per_member);
+			imsm_set_array_size(dev, u->new_size);
+
+			ret_val = 1;
+			break;
+		}
+	}
+
+	return ret_val;
+}
+
+static int apply_update_activate_spare(struct imsm_update_activate_spare *u,
+				       struct intel_super *super,
+				       struct active_array *active_array)
+{
+	struct imsm_super *mpb = super->anchor;
+	struct imsm_dev *dev = get_imsm_dev(super, u->array);
+	struct imsm_map *map = get_imsm_map(dev, MAP_0);
+	
struct imsm_map *migr_map;
+	struct active_array *a;
+	struct imsm_disk *disk;
+	__u8 to_state;
+	struct dl *dl;
+	unsigned int found;
+	int failed;
+	int victim;
+	int i;
+	int second_map_created = 0;
+
+	/* 'u' heads a chain of records that all refer to the volume
+	 * looked up into dev/map above
+	 */
+	for (; u; u = u->next) {
+		victim = get_imsm_disk_idx(dev, u->slot, MAP_X);
+
+		if (victim < 0)
+			return 0;
+
+		for (dl = super->disks; dl; dl = dl->next)
+			if (dl == u->dl)
+				break;
+
+		if (!dl) {
+			pr_err("error: imsm_activate_spare passed an unknown disk (index: %d)\n",
+				u->dl->index);
+			return 0;
+		}
+
+		/* count failures (excluding rebuilds and the victim)
+		 * to determine map[0] state
+		 */
+		failed = 0;
+		for (i = 0; i < map->num_members; i++) {
+			if (i == u->slot)
+				continue;
+			disk = get_imsm_disk(super,
+					     get_imsm_disk_idx(dev, i, MAP_X));
+			if (!disk || is_failed(disk))
+				failed++;
+		}
+
+		/* adding a pristine spare, assign a new index */
+		if (dl->index < 0) {
+			dl->index = super->anchor->num_disks;
+			super->anchor->num_disks++;
+		}
+		disk = &dl->disk;
+		disk->status |= CONFIGURED_DISK;
+		disk->status &= ~SPARE_DISK;
+
+		/* mark rebuild */
+		to_state = imsm_check_degraded(super, dev, failed, MAP_0);
+		if (!second_map_created) {
+			second_map_created = 1;
+			map->map_state = IMSM_T_STATE_DEGRADED;
+			migrate(dev, super, to_state, MIGR_REBUILD);
+		} else
+			map->map_state = to_state;
+		migr_map = get_imsm_map(dev, MAP_1);
+		set_imsm_ord_tbl_ent(map, u->slot, dl->index);
+		set_imsm_ord_tbl_ent(migr_map, u->slot,
+				     dl->index | IMSM_ORD_REBUILD);
+
+		/* update the family_num to mark a new container
+		 * generation, being careful to record the existing
+		 * family_num in orig_family_num to clean up after
+		 * earlier mdadm versions that neglected to set it.
+		 */
+		if (mpb->orig_family_num == 0)
+			mpb->orig_family_num = mpb->family_num;
+		mpb->family_num += super->random;
+
+		/* count arrays using the victim in the metadata.
+		 * Use loop-local pointers: dev/map must keep referring
+		 * to the volume being updated for the next record in
+		 * the chain.
+		 */
+		found = 0;
+		for (a = active_array; a ; a = a->next) {
+			struct imsm_dev *a_dev =
+				get_imsm_dev(super, a->info.container_member);
+			struct imsm_map *a_map = get_imsm_map(a_dev, MAP_0);
+
+			if (get_imsm_disk_slot(a_map, victim) >= 0)
+				found++;
+		}
+
+		/* delete the victim if it is no longer being
+		 * utilized anywhere
+		 */
+		if (!found) {
+			struct dl **dlp;
+
+			/* We know that 'manager' isn't touching anything,
+			 * so it is safe to delete
+			 */
+			for (dlp = &super->disks; *dlp; dlp = &(*dlp)->next)
+				if ((*dlp)->index == victim)
+					break;
+
+			/* victim may be on the missing list */
+			if (!*dlp)
+				for (dlp = &super->missing; *dlp;
+				     dlp = &(*dlp)->next)
+					if ((*dlp)->index == victim)
+						break;
+			imsm_delete(super, dlp, victim);
+		}
+	}
+
+	return 1;
+}
+
+/* Apply a container reshape (grow by u->new_raid_disks -
+ * u->old_raid_disks disks).
+ *
+ * The disks named in u->new_disks are turned from spares into members,
+ * then every volume gets a fresh imsm_dev taken from *space_list; only the
+ * first volume is put under general migration.  Replaced imsm_dev buffers
+ * are handed back through *space_list.
+ * Returns 1 on success, 0 if one of the new disks is unknown or already
+ * holds one of the old slots.
+ */
+static int apply_reshape_container_disks_update(struct imsm_update_reshape *u,
+						struct intel_super *super,
+						void ***space_list)
+{
+	struct dl *new_disk;
+	struct intel_dev *id;
+	int i;
+	int delta_disks = u->new_raid_disks - u->old_raid_disks;
+	int disk_count = u->old_raid_disks;
+	void **tofree = NULL;
+	int devices_to_reshape = 1;
+	struct imsm_super *mpb = super->anchor;
+	int ret_val = 0;
+	unsigned int dev_id;
+
+	dprintf("(enter)\n");
+
+	/* enable spares to use in array */
+	for (i = 0; i < delta_disks; i++) {
+		new_disk = get_disk_super(super,
+					  major(u->new_disks[i]),
+					  minor(u->new_disks[i]));
+		/* check before the debug print below dereferences it */
+		if (new_disk == NULL)
+			goto update_reshape_exit;
+		dprintf("imsm: new disk for reshape is: %i:%i (%p, index = %i)\n",
+			major(u->new_disks[i]), minor(u->new_disks[i]),
+			new_disk, new_disk->index);
+		if ((new_disk->index >= 0) &&
+		    (new_disk->index < u->old_raid_disks))
+			goto update_reshape_exit;
+		new_disk->index = disk_count++;
+		/* slot to fill in autolayout
+		 */
+		new_disk->raiddisk = new_disk->index;
+		new_disk->disk.status |=
+			CONFIGURED_DISK;
+		new_disk->disk.status &= ~SPARE_DISK;
+	}
+
+	dprintf("imsm: volume set mpb->num_raid_devs = %i\n",
+		mpb->num_raid_devs);
+	/* manage changes in volume
+	 */
+	for (dev_id = 0; dev_id < mpb->num_raid_devs; dev_id++) {
+		void **sp = *space_list;
+		struct imsm_dev *newdev;
+		struct imsm_map *newmap, *oldmap;
+
+		for (id = super->devlist ; id; id = id->next) {
+			if (id->index == dev_id)
+				break;
+		}
+		if (id == NULL)
+			break;
+		if (!sp)
+			continue;
+		/* pop the next pre-allocated buffer */
+		*space_list = *sp;
+		newdev = (void*)sp;
+		/* Copy the dev, but not (all of) the map */
+		memcpy(newdev, id->dev, sizeof(*newdev));
+		oldmap = get_imsm_map(id->dev, MAP_0);
+		newmap = get_imsm_map(newdev, MAP_0);
+		/* Copy the current map */
+		memcpy(newmap, oldmap, sizeof_imsm_map(oldmap));
+		/* update one device only
+		 */
+		if (devices_to_reshape) {
+			dprintf("imsm: modifying subdev: %i\n",
+				id->index);
+			devices_to_reshape--;
+			newdev->vol.migr_state = 1;
+			newdev->vol.curr_migr_unit = 0;
+			set_migr_type(newdev, MIGR_GEN_MIGR);
+			newmap->num_members = u->new_raid_disks;
+			for (i = 0; i < delta_disks; i++) {
+				set_imsm_ord_tbl_ent(newmap,
+						     u->old_raid_disks + i,
+						     u->old_raid_disks + i);
+			}
+			/* New map is correct, now need to save old map
+			 */
+			newmap = get_imsm_map(newdev, MAP_1);
+			memcpy(newmap, oldmap, sizeof_imsm_map(oldmap));
+
+			imsm_set_array_size(newdev, -1);
+		}
+
+		/* chain the replaced dev onto the list of buffers that
+		 * is handed back to the caller
+		 */
+		sp = (void **)id->dev;
+		id->dev = newdev;
+		*sp = tofree;
+		tofree = sp;
+
+		/* Clear migration record */
+		memset(super->migr_rec, 0, sizeof(struct migr_record));
+	}
+	if (tofree)
+		*space_list = tofree;
+	ret_val = 1;
+
+update_reshape_exit:
+
+	return ret_val;
+}
+
+static int apply_takeover_update(struct imsm_update_takeover *u,
+				 struct intel_super *super,
+				 void ***space_list)
+{
+	struct imsm_dev *dev = NULL;
+	struct intel_dev *dv;
+	struct imsm_dev *dev_new;
+	struct imsm_map *map;
+	struct dl *dm, *du;
+	int i;
+
+	for (dv = super->devlist; dv; dv = dv->next)
+		if (dv->index == (unsigned int)u->subarray) {
+			dev = dv->dev;
+			break;
+		}
+
+	if (dev == NULL)
+		return 0;
+
+	map
= get_imsm_map(dev, MAP_0);
+
+	/* raid10 -> raid0: drop the failed half of the members */
+	if (u->direction == R10_TO_R0) {
+		/* Number of failed disks must be half of initial disk number */
+		if (imsm_count_failed(super, dev, MAP_0) !=
+				(map->num_members / 2))
+			return 0;
+
+		/* iterate through devices to mark removed disks as spare */
+		for (dm = super->disks; dm; dm = dm->next) {
+			if (dm->disk.status & FAILED_DISK) {
+				int idx = dm->index;
+				/* update indexes on the disk list */
+/* FIXME this loop-with-the-loop looks wrong,  I'm not convinced
+   the index values will end up being correct.... NB */
+				for (du = super->disks; du; du = du->next)
+					if (du->index > idx)
+						du->index--;
+				/* mark as spare disk */
+				mark_spare(dm);
+			}
+		}
+		/* update map */
+		map->num_members = map->num_members / 2;
+		map->map_state = IMSM_T_STATE_NORMAL;
+		map->num_domains = 1;
+		map->raid_level = 0;
+		map->failed_disk_num = -1;
+	}
+
+	/* raid0 -> raid10: every existing member moves to an even index and
+	 * a "missing" placeholder disk is created for each odd index
+	 */
+	if (u->direction == R0_TO_R10) {
+		void **space;
+		/* update slots in current disk list */
+		for (dm = super->disks; dm; dm = dm->next) {
+			if (dm->index >= 0)
+				dm->index *= 2;
+		}
+		/* create new *missing* disks */
+		for (i = 0; i < map->num_members; i++) {
+			/* take the next pre-allocated buffer off
+			 * *space_list; skip this member if none is left
+			 */
+			space = *space_list;
+			if (!space)
+				continue;
+			*space_list = *space;
+			du = (void *)space;
+			memcpy(du, super->disks, sizeof(*du));
+			du->fd = -1;
+			du->minor = 0;
+			du->major = 0;
+			du->index = (i * 2) + 1;
+			sprintf((char *)du->disk.serial,
+				" MISSING_%d", du->index);
+			sprintf((char *)du->serial,
+				"MISSING_%d", du->index);
+			du->next = super->missing;
+			super->missing = du;
+		}
+		/* create new dev and map */
+		space = *space_list;
+		if (!space)
+			return 0;
+		*space_list = *space;
+		dev_new = (void *)space;
+		memcpy(dev_new, dev, sizeof(*dev));
+		/* update new map */
+		map = get_imsm_map(dev_new, MAP_0);
+		map->num_members = map->num_members * 2;
+		map->map_state = IMSM_T_STATE_DEGRADED;
+		map->num_domains = 2;
+		map->raid_level = 1;
+		/* replace dev<->dev_new */
+		dv->dev = dev_new;
+	}
+	/* update disk order table */
+	for (du = super->disks; du; du = du->next)
+		if (du->index >= 0)
+			set_imsm_ord_tbl_ent(map, du->index, du->index);
+	for (du = super->missing; du; du = du->next)
+		if (du->index >= 0) {
+			set_imsm_ord_tbl_ent(map, du->index, du->index);
+			mark_missing(dv->dev, &du->disk, du->index);
+		}
+
+	/* 1 tells the caller that the metadata changed */
+	return 1;
+}
+
+static void imsm_process_update(struct supertype *st,
+				struct metadata_update *update)
+{
+	/**
+	 * crack open the metadata_update envelope to find the update record
+	 * update can be one of:
+	 *    update_reshape_container_disks - all the arrays in the container
+	 *      are being reshaped to have more devices.  We need to mark
+	 *      the arrays for general migration and convert selected spares
+	 *      into active devices.
+	 *    update_activate_spare - a spare device has replaced a failed
+	 *      device in an array, update the disk_ord_tbl.  If this disk is
+	 *      present in all member arrays then also clear the SPARE_DISK
+	 *      flag
+	 *    update_create_array
+	 *    update_kill_array
+	 *    update_rename_array
+	 *    update_add_remove_disk
+	 *
+	 * Each case bumps super->updates_pending when it changed the
+	 * metadata; nothing is written to disk here.
+	 */
+	struct intel_super *super = st->sb;
+	struct imsm_super *mpb;
+	/* every update record starts with its type */
+	enum imsm_update_type type = *(enum imsm_update_type *) update->buf;
+
+	/* update requires a larger buf but the allocation failed */
+	if (super->next_len && !super->next_buf) {
+		super->next_len = 0;
+		return;
+	}
+
+	/* switch over to the larger buffer prepared for this update */
+	if (super->next_buf) {
+		memcpy(super->next_buf, super->buf, super->len);
+		free(super->buf);
+		super->len = super->next_len;
+		super->buf = super->next_buf;
+
+		super->next_len = 0;
+		super->next_buf = NULL;
+	}
+
+	/* read the anchor only now, after the possible buffer switch */
+	mpb = super->anchor;
+
+	switch (type) {
+	case update_general_migration_checkpoint: {
+		struct intel_dev *id;
+		struct imsm_update_general_migration_checkpoint *u =
+							(void *)update->buf;
+
+		dprintf("called for update_general_migration_checkpoint\n");
+
+		/* find device under general migration */
+		for (id = super->devlist ; id; id = id->next) {
+			if (is_gen_migration(id->dev)) {
+				id->dev->vol.curr_migr_unit =
+					__cpu_to_le32(u->curr_migr_unit);
+				super->updates_pending++;
+			}
+		}
+		
break;
+	}
+	case update_takeover: {
+		struct imsm_update_takeover *u = (void *)update->buf;
+		if (apply_takeover_update(u, super, &update->space_list)) {
+			imsm_update_version_info(super);
+			super->updates_pending++;
+		}
+		break;
+	}
+
+	case update_reshape_container_disks: {
+		struct imsm_update_reshape *u = (void *)update->buf;
+		if (apply_reshape_container_disks_update(
+			    u, super, &update->space_list))
+			super->updates_pending++;
+		break;
+	}
+	case update_reshape_migration: {
+		struct imsm_update_reshape_migration *u = (void *)update->buf;
+		if (apply_reshape_migration_update(
+			    u, super, &update->space_list))
+			super->updates_pending++;
+		break;
+	}
+	case update_size_change: {
+		struct imsm_update_size_change *u = (void *)update->buf;
+		if (apply_size_change_update(u, super))
+			super->updates_pending++;
+		break;
+	}
+	case update_activate_spare: {
+		struct imsm_update_activate_spare *u = (void *) update->buf;
+		if (apply_update_activate_spare(u, super, st->arrays))
+			super->updates_pending++;
+		break;
+	}
+	case update_create_array: {
+		/* someone wants to create a new array, we need to be aware of
+		 * a few races/collisions:
+		 * 1/ 'Create' called by two separate instances of mdadm
+		 * 2/ 'Create' versus 'activate_spare': mdadm has chosen
+		 *    devices that have since been assimilated via
+		 *    activate_spare.
+		 * In the event this update can not be carried out mdadm will
+		 * (FIX ME) notice that its update did not take hold.
+		 */
+		struct imsm_update_create_array *u = (void *) update->buf;
+		struct intel_dev *dv;
+		struct imsm_dev *dev;
+		struct imsm_map *map, *new_map;
+		unsigned long long start, end;
+		unsigned long long new_start, new_end;
+		int i;
+		struct disk_info *inf;
+		struct dl *dl;
+
+		/* handle racing creates: first come first serve */
+		if (u->dev_idx < mpb->num_raid_devs) {
+			dprintf("subarray %d already defined\n", u->dev_idx);
+			goto create_error;
+		}
+
+		/* check update is next in sequence */
+		if (u->dev_idx != mpb->num_raid_devs) {
+			dprintf("can not create array %d expected index %d\n",
+				u->dev_idx, mpb->num_raid_devs);
+			goto create_error;
+		}
+
+		new_map = get_imsm_map(&u->dev, MAP_0);
+		new_start = pba_of_lba0(new_map);
+		new_end = new_start + blocks_per_member(new_map);
+		inf = get_disk_info(u);
+
+		/* handle activate_spare versus create race:
+		 * check to make sure that overlapping arrays do not include
+		 * overlapping disks
+		 */
+		for (i = 0; i < mpb->num_raid_devs; i++) {
+			dev = get_imsm_dev(super, i);
+			map = get_imsm_map(dev, MAP_0);
+			start = pba_of_lba0(map);
+			end = start + blocks_per_member(map);
+			if ((new_start >= start && new_start <= end) ||
+			    (start >= new_start && start <= new_end))
+				/* overlap */;
+			else
+				continue;
+
+			if (disks_overlap(super, i, u)) {
+				dprintf("arrays overlap\n");
+				goto create_error;
+			}
+		}
+
+		/* check that prepare update was successful */
+		if (!update->space) {
+			dprintf("prepare update failed\n");
+			goto create_error;
+		}
+
+		/* check that all disks are still active before committing
+		 * changes.  FIXME: could we instead handle this by creating a
+		 * degraded array?  That's probably not what the user expects,
+		 * so better to drop this update on the floor.
+		 */
+		for (i = 0; i < new_map->num_members; i++) {
+			dl = serial_to_dl(inf[i].serial, super);
+			if (!dl) {
+				dprintf("disk disappeared\n");
+				goto create_error;
+			}
+		}
+
+		super->updates_pending++;
+
+		/* convert spares to members and fixup ord_tbl */
+		for (i = 0; i < new_map->num_members; i++) {
+			/* cannot be NULL: verified by the loop above */
+			dl = serial_to_dl(inf[i].serial, super);
+			if (dl->index == -1) {
+				dl->index = mpb->num_disks;
+				mpb->num_disks++;
+				dl->disk.status |= CONFIGURED_DISK;
+				dl->disk.status &= ~SPARE_DISK;
+			}
+			set_imsm_ord_tbl_ent(new_map, i, dl->index);
+		}
+
+		/* take ownership of the intel_dev prepared for this update */
+		dv = update->space;
+		dev = dv->dev;
+		update->space = NULL;
+		imsm_copy_dev(dev, &u->dev);
+		dv->index = u->dev_idx;
+		dv->next = super->devlist;
+		super->devlist = dv;
+		mpb->num_raid_devs++;
+
+		imsm_update_version_info(super);
+		break;
+ create_error:
+		/* mdmon knows how to release update->space, but not
+		 * ((struct intel_dev *) update->space)->dev
+		 */
+		if (update->space) {
+			dv = update->space;
+			free(dv->dev);
+		}
+		break;
+	}
+	case update_kill_array: {
+		struct imsm_update_kill_array *u = (void *) update->buf;
+		int victim = u->dev_idx;
+		struct active_array *a;
+		struct intel_dev **dp;
+		struct imsm_dev *dev;
+
+		/* sanity check that we are not affecting the uuid of
+		 * active arrays, or deleting an active array
+		 *
+		 * FIXME when immutable ids are available, but note that
+		 * we'll also need to fixup the invalidated/active
+		 * subarray indexes in mdstat
+		 */
+		for (a = st->arrays; a; a = a->next)
+			if (a->info.container_member >= victim)
+				break;
+		/* by definition if mdmon is running at least one array
+		 * is active in the container, so checking
+		 * mpb->num_raid_devs is just extra paranoia
+		 */
+		dev = get_imsm_dev(super, victim);
+		if (a || !dev || mpb->num_raid_devs == 1) {
+			dprintf("failed to delete subarray-%d\n", victim);
+			break;
+		}
+
+		/* NOTE(review): the entry is unlinked by comparing against
+		 * super->current_vol while the renumbering uses 'victim';
+		 * presumably both name the same volume here - confirm.
+		 */
+		for (dp = &super->devlist; *dp;)
+			if ((*dp)->index == (unsigned)super->current_vol) {
+				*dp = (*dp)->next;
+			} else {
+				if ((*dp)->index > (unsigned)victim)
+					(*dp)->index--;
+				dp = &(*dp)->next;
+			}
+		mpb->num_raid_devs--;
+		super->updates_pending++;
+		break;
+	}
+	case update_rename_array: {
+		struct imsm_update_rename_array *u = (void *) update->buf;
+		char name[MAX_RAID_SERIAL_LEN+1];
+		int target = u->dev_idx;
+		struct active_array *a;
+		struct imsm_dev *dev;
+
+		/* sanity check that we are not affecting the uuid of
+		 * an active array
+		 */
+		/* read at most MAX_RAID_SERIAL_LEN bytes: the name arrives
+		 * in an update buffer and is presumably a fixed-width field
+		 * that need not be NUL terminated - TODO confirm against the
+		 * struct definition
+		 */
+		snprintf(name, sizeof(name), "%.*s",
+			 MAX_RAID_SERIAL_LEN, (char *) u->name);
+		name[MAX_RAID_SERIAL_LEN] = '\0';
+		for (a = st->arrays; a; a = a->next)
+			if (a->info.container_member == target)
+				break;
+		dev = get_imsm_dev(super, u->dev_idx);
+		if (a || !dev || !check_name(super, name, 1)) {
+			dprintf("failed to rename subarray-%d\n", target);
+			break;
+		}
+
+		snprintf((char *) dev->volume, MAX_RAID_SERIAL_LEN, "%s", name);
+		super->updates_pending++;
+		break;
+	}
+	case update_add_remove_disk: {
+		/* we may be able to repair some arrays if disks are
+		 * being added, check the status of add_remove_disk
+		 * if discs has been added.
+		 */
+		if (add_remove_disk_update(super)) {
+			struct active_array *a;
+
+			super->updates_pending++;
+			for (a = st->arrays; a; a = a->next)
+				a->check_degraded = 1;
+		}
+		break;
+	}
+	default:
+		pr_err("error: unsuported process update type:(type: %d)\n",	type);
+	}
+}
+
+static struct mdinfo *get_spares_for_grow(struct supertype *st);
+
+static int imsm_prepare_update(struct supertype *st,
+			       struct metadata_update *update)
+{
+	/**
+	 * Allocate space to hold new disk entries, raid-device entries or a new
+	 * mpb if necessary.  The manager synchronously waits for updates to
+	 * complete in the monitor, so new mpb buffers allocated here can be
+	 * integrated by the monitor thread without worrying about live pointers
+	 * in the manager thread.
+ */ + enum imsm_update_type type; + struct intel_super *super = st->sb; + struct imsm_super *mpb = super->anchor; + size_t buf_len; + size_t len = 0; + + if (update->len < (int)sizeof(type)) + return 0; + + type = *(enum imsm_update_type *) update->buf; + + switch (type) { + case update_general_migration_checkpoint: + if (update->len < (int)sizeof(struct imsm_update_general_migration_checkpoint)) + return 0; + dprintf("called for update_general_migration_checkpoint\n"); + break; + case update_takeover: { + struct imsm_update_takeover *u = (void *)update->buf; + if (update->len < (int)sizeof(*u)) + return 0; + if (u->direction == R0_TO_R10) { + void **tail = (void **)&update->space_list; + struct imsm_dev *dev = get_imsm_dev(super, u->subarray); + struct imsm_map *map = get_imsm_map(dev, MAP_0); + int num_members = map->num_members; + void *space; + int size, i; + /* allocate memory for added disks */ + for (i = 0; i < num_members; i++) { + size = sizeof(struct dl); + space = xmalloc(size); + *tail = space; + tail = space; + *tail = NULL; + } + /* allocate memory for new device */ + size = sizeof_imsm_dev(super->devlist->dev, 0) + + (num_members * sizeof(__u32)); + space = xmalloc(size); + *tail = space; + tail = space; + *tail = NULL; + len = disks_to_mpb_size(num_members * 2); + } + + break; + } + case update_reshape_container_disks: { + /* Every raid device in the container is about to + * gain some more devices, and we will enter a + * reconfiguration. + * So each 'imsm_map' will be bigger, and the imsm_vol + * will now hold 2 of them. + * Thus we need new 'struct imsm_dev' allocations sized + * as sizeof_imsm_dev but with more devices in both maps. 
+ */ + struct imsm_update_reshape *u = (void *)update->buf; + struct intel_dev *dl; + void **space_tail = (void**)&update->space_list; + + if (update->len < (int)sizeof(*u)) + return 0; + + dprintf("for update_reshape\n"); + + for (dl = super->devlist; dl; dl = dl->next) { + int size = sizeof_imsm_dev(dl->dev, 1); + void *s; + if (u->new_raid_disks > u->old_raid_disks) + size += sizeof(__u32)*2* + (u->new_raid_disks - u->old_raid_disks); + s = xmalloc(size); + *space_tail = s; + space_tail = s; + *space_tail = NULL; + } + + len = disks_to_mpb_size(u->new_raid_disks); + dprintf("New anchor length is %llu\n", (unsigned long long)len); + break; + } + case update_reshape_migration: { + /* for migration level 0->5 we need to add disks + * so the same as for container operation we will copy + * device to the bigger location. + * in memory prepared device and new disk area are prepared + * for usage in process update + */ + struct imsm_update_reshape_migration *u = (void *)update->buf; + struct intel_dev *id; + void **space_tail = (void **)&update->space_list; + int size; + void *s; + int current_level = -1; + + if (update->len < (int)sizeof(*u)) + return 0; + + dprintf("for update_reshape\n"); + + /* add space for bigger array in update + */ + for (id = super->devlist; id; id = id->next) { + if (id->index == (unsigned)u->subdev) { + size = sizeof_imsm_dev(id->dev, 1); + if (u->new_raid_disks > u->old_raid_disks) + size += sizeof(__u32)*2* + (u->new_raid_disks - u->old_raid_disks); + s = xmalloc(size); + *space_tail = s; + space_tail = s; + *space_tail = NULL; + break; + } + } + if (update->space_list == NULL) + break; + + /* add space for disk in update + */ + size = sizeof(struct dl); + s = xmalloc(size); + *space_tail = s; + space_tail = s; + *space_tail = NULL; + + /* add spare device to update + */ + for (id = super->devlist ; id; id = id->next) + if (id->index == (unsigned)u->subdev) { + struct imsm_dev *dev; + struct imsm_map *map; + + dev = get_imsm_dev(super, 
u->subdev); + map = get_imsm_map(dev, MAP_0); + current_level = map->raid_level; + break; + } + if ((u->new_level == 5) && (u->new_level != current_level)) { + struct mdinfo *spares; + + spares = get_spares_for_grow(st); + if (spares) { + struct dl *dl; + struct mdinfo *dev; + + dev = spares->devs; + if (dev) { + u->new_disks[0] = + makedev(dev->disk.major, + dev->disk.minor); + dl = get_disk_super(super, + dev->disk.major, + dev->disk.minor); + dl->index = u->old_raid_disks; + dev = dev->next; + } + sysfs_free(spares); + } + } + len = disks_to_mpb_size(u->new_raid_disks); + dprintf("New anchor length is %llu\n", (unsigned long long)len); + break; + } + case update_size_change: { + if (update->len < (int)sizeof(struct imsm_update_size_change)) + return 0; + break; + } + case update_activate_spare: { + if (update->len < (int)sizeof(struct imsm_update_activate_spare)) + return 0; + break; + } + case update_create_array: { + struct imsm_update_create_array *u = (void *) update->buf; + struct intel_dev *dv; + struct imsm_dev *dev = &u->dev; + struct imsm_map *map = get_imsm_map(dev, MAP_0); + struct dl *dl; + struct disk_info *inf; + int i; + int activate = 0; + + if (update->len < (int)sizeof(*u)) + return 0; + + inf = get_disk_info(u); + len = sizeof_imsm_dev(dev, 1); + /* allocate a new super->devlist entry */ + dv = xmalloc(sizeof(*dv)); + dv->dev = xmalloc(len); + update->space = dv; + + /* count how many spares will be converted to members */ + for (i = 0; i < map->num_members; i++) { + dl = serial_to_dl(inf[i].serial, super); + if (!dl) { + /* hmm maybe it failed?, nothing we can do about + * it here + */ + continue; + } + if (count_memberships(dl, super) == 0) + activate++; + } + len += activate * sizeof(struct imsm_disk); + break; + } + case update_kill_array: { + if (update->len < (int)sizeof(struct imsm_update_kill_array)) + return 0; + break; + } + case update_rename_array: { + if (update->len < (int)sizeof(struct imsm_update_rename_array)) + return 0; + 
break; + } + case update_add_remove_disk: + /* no update->len needed */ + break; + default: + return 0; + } + + /* check if we need a larger metadata buffer */ + if (super->next_buf) + buf_len = super->next_len; + else + buf_len = super->len; + + if (__le32_to_cpu(mpb->mpb_size) + len > buf_len) { + /* ok we need a larger buf than what is currently allocated + * if this allocation fails process_update will notice that + * ->next_len is set and ->next_buf is NULL + */ + buf_len = ROUND_UP(__le32_to_cpu(mpb->mpb_size) + len, 512); + if (super->next_buf) + free(super->next_buf); + + super->next_len = buf_len; + if (posix_memalign(&super->next_buf, 512, buf_len) == 0) + memset(super->next_buf, 0, buf_len); + else + super->next_buf = NULL; + } + return 1; +} + +/* must be called while manager is quiesced */ +static void imsm_delete(struct intel_super *super, struct dl **dlp, unsigned index) +{ + struct imsm_super *mpb = super->anchor; + struct dl *iter; + struct imsm_dev *dev; + struct imsm_map *map; + int i, j, num_members; + __u32 ord; + + dprintf("deleting device[%d] from imsm_super\n", index); + + /* shift all indexes down one */ + for (iter = super->disks; iter; iter = iter->next) + if (iter->index > (int)index) + iter->index--; + for (iter = super->missing; iter; iter = iter->next) + if (iter->index > (int)index) + iter->index--; + + for (i = 0; i < mpb->num_raid_devs; i++) { + dev = get_imsm_dev(super, i); + map = get_imsm_map(dev, MAP_0); + num_members = map->num_members; + for (j = 0; j < num_members; j++) { + /* update ord entries being careful not to propagate + * ord-flags to the first map + */ + ord = get_imsm_ord_tbl_ent(dev, j, MAP_X); + + if (ord_to_idx(ord) <= index) + continue; + + map = get_imsm_map(dev, MAP_0); + set_imsm_ord_tbl_ent(map, j, ord_to_idx(ord - 1)); + map = get_imsm_map(dev, MAP_1); + if (map) + set_imsm_ord_tbl_ent(map, j, ord - 1); + } + } + + mpb->num_disks--; + super->updates_pending++; + if (*dlp) { + struct dl *dl = *dlp; + + *dlp 
= (*dlp)->next; + __free_imsm_disk(dl); + } +} +#endif /* MDASSEMBLE */ + +static void close_targets(int *targets, int new_disks) +{ + int i; + + if (!targets) + return; + + for (i = 0; i < new_disks; i++) { + if (targets[i] >= 0) { + close(targets[i]); + targets[i] = -1; + } + } +} + +static int imsm_get_allowed_degradation(int level, int raid_disks, + struct intel_super *super, + struct imsm_dev *dev) +{ + switch (level) { + case 1: + case 10:{ + int ret_val = 0; + struct imsm_map *map; + int i; + + ret_val = raid_disks/2; + /* check map if all disks pairs not failed + * in both maps + */ + map = get_imsm_map(dev, MAP_0); + for (i = 0; i < ret_val; i++) { + int degradation = 0; + if (get_imsm_disk(super, i) == NULL) + degradation++; + if (get_imsm_disk(super, i + 1) == NULL) + degradation++; + if (degradation == 2) + return 0; + } + map = get_imsm_map(dev, MAP_1); + /* if there is no second map + * result can be returned + */ + if (map == NULL) + return ret_val; + /* check degradation in second map + */ + for (i = 0; i < ret_val; i++) { + int degradation = 0; + if (get_imsm_disk(super, i) == NULL) + degradation++; + if (get_imsm_disk(super, i + 1) == NULL) + degradation++; + if (degradation == 2) + return 0; + } + return ret_val; + } + case 5: + return 1; + case 6: + return 2; + default: + return 0; + } +} + +/******************************************************************************* + * Function: open_backup_targets + * Description: Function opens file descriptors for all devices given in + * info->devs + * Parameters: + * info : general array info + * raid_disks : number of disks + * raid_fds : table of device's file descriptors + * super : intel super for raid10 degradation check + * dev : intel device for raid10 degradation check + * Returns: + * 0 : success + * -1 : fail + ******************************************************************************/ +int open_backup_targets(struct mdinfo *info, int raid_disks, int *raid_fds, + struct intel_super 
*super, struct imsm_dev *dev) +{ + struct mdinfo *sd; + int i; + int opened = 0; + + for (i = 0; i < raid_disks; i++) + raid_fds[i] = -1; + + for (sd = info->devs ; sd ; sd = sd->next) { + char *dn; + + if (sd->disk.state & (1<disk.raid_disk >= raid_disks) || + (sd->disk.raid_disk < 0)) + continue; + + dn = map_dev(sd->disk.major, + sd->disk.minor, 1); + raid_fds[sd->disk.raid_disk] = dev_open(dn, O_RDWR); + if (raid_fds[sd->disk.raid_disk] < 0) { + pr_err("cannot open component\n"); + continue; + } + opened++; + } + /* check if maximum array degradation level is not exceeded + */ + if ((raid_disks - opened) > + imsm_get_allowed_degradation(info->new_level, + raid_disks, + super, dev)) { + pr_err("Not enough disks can be opened.\n"); + close_targets(raid_fds, raid_disks); + return -2; + } + return 0; +} + +/******************************************************************************* + * Function: validate_container_imsm + * Description: This routine validates container after assemble, + * eg. if devices in container are under the same controller. 
+ * + * Parameters: + * info : linked list with info about devices used in array + * Returns: + * 1 : HBA mismatch + * 0 : Success + ******************************************************************************/ +int validate_container_imsm(struct mdinfo *info) +{ + if (check_env("IMSM_NO_PLATFORM")) + return 0; + + struct sys_dev *idev; + struct sys_dev *hba = NULL; + struct sys_dev *intel_devices = find_intel_devices(); + char *dev_path = devt_to_devpath(makedev(info->disk.major, + info->disk.minor)); + + for (idev = intel_devices; idev; idev = idev->next) { + if (dev_path && strstr(dev_path, idev->path)) { + hba = idev; + break; + } + } + if (dev_path) + free(dev_path); + + if (!hba) { + pr_err("WARNING - Cannot detect HBA for device %s!\n", + devid2kname(makedev(info->disk.major, info->disk.minor))); + return 1; + } + + const struct imsm_orom *orom = get_orom_by_device_id(hba->dev_id); + struct mdinfo *dev; + + for (dev = info->next; dev; dev = dev->next) { + dev_path = devt_to_devpath(makedev(dev->disk.major, dev->disk.minor)); + + struct sys_dev *hba2 = NULL; + for (idev = intel_devices; idev; idev = idev->next) { + if (dev_path && strstr(dev_path, idev->path)) { + hba2 = idev; + break; + } + } + if (dev_path) + free(dev_path); + + const struct imsm_orom *orom2 = hba2 == NULL ? 
NULL : + get_orom_by_device_id(hba2->dev_id); + + if (hba2 && hba->type != hba2->type) { + pr_err("WARNING - HBAs of devices do not match %s != %s\n", + get_sys_dev_type(hba->type), get_sys_dev_type(hba2->type)); + return 1; + } + + if ((orom != orom2) || ((hba->type == SYS_DEV_VMD) && (hba != hba2))) { + pr_err("WARNING - IMSM container assembled with disks under different HBAs!\n" + " This operation is not supported and can lead to data loss.\n"); + return 1; + } + + if (!orom) { + pr_err("WARNING - IMSM container assembled with disks under HBAs without IMSM platform support!\n" + " This operation is not supported and can lead to data loss.\n"); + return 1; + } + } + + return 0; +} +#ifndef MDASSEMBLE +/******************************************************************************* + * Function: init_migr_record_imsm + * Description: Function inits imsm migration record + * Parameters: + * super : imsm internal array info + * dev : device under migration + * info : general array info to find the smallest device + * Returns: + * none + ******************************************************************************/ +void init_migr_record_imsm(struct supertype *st, struct imsm_dev *dev, + struct mdinfo *info) +{ + struct intel_super *super = st->sb; + struct migr_record *migr_rec = super->migr_rec; + int new_data_disks; + unsigned long long dsize, dev_sectors; + long long unsigned min_dev_sectors = -1LLU; + struct mdinfo *sd; + char nm[30]; + int fd; + struct imsm_map *map_dest = get_imsm_map(dev, MAP_0); + struct imsm_map *map_src = get_imsm_map(dev, MAP_1); + unsigned long long num_migr_units; + unsigned long long array_blocks; + + memset(migr_rec, 0, sizeof(struct migr_record)); + migr_rec->family_num = __cpu_to_le32(super->anchor->family_num); + + /* only ascending reshape supported now */ + migr_rec->ascending_migr = __cpu_to_le32(1); + + migr_rec->dest_depth_per_unit = GEN_MIGR_AREA_SIZE / + max(map_dest->blocks_per_strip, map_src->blocks_per_strip); + 
migr_rec->dest_depth_per_unit *= + max(map_dest->blocks_per_strip, map_src->blocks_per_strip); + new_data_disks = imsm_num_data_members(dev, MAP_0); + migr_rec->blocks_per_unit = + __cpu_to_le32(migr_rec->dest_depth_per_unit * new_data_disks); + migr_rec->dest_depth_per_unit = + __cpu_to_le32(migr_rec->dest_depth_per_unit); + array_blocks = info->component_size * new_data_disks; + num_migr_units = + array_blocks / __le32_to_cpu(migr_rec->blocks_per_unit); + + if (array_blocks % __le32_to_cpu(migr_rec->blocks_per_unit)) + num_migr_units++; + migr_rec->num_migr_units = __cpu_to_le32(num_migr_units); + + migr_rec->post_migr_vol_cap = dev->size_low; + migr_rec->post_migr_vol_cap_hi = dev->size_high; + + /* Find the smallest dev */ + for (sd = info->devs ; sd ; sd = sd->next) { + sprintf(nm, "%d:%d", sd->disk.major, sd->disk.minor); + fd = dev_open(nm, O_RDONLY); + if (fd < 0) + continue; + get_dev_size(fd, NULL, &dsize); + dev_sectors = dsize / 512; + if (dev_sectors < min_dev_sectors) + min_dev_sectors = dev_sectors; + close(fd); + } + migr_rec->ckpt_area_pba = __cpu_to_le32(min_dev_sectors - + RAID_DISK_RESERVED_BLOCKS_IMSM_HI); + + write_imsm_migr_rec(st); + + return; +} + +/******************************************************************************* + * Function: save_backup_imsm + * Description: Function saves critical data stripes to Migration Copy Area + * and updates the current migration unit status. + * Use restore_stripes() to form a destination stripe, + * and to write it to the Copy Area. 
+ * Parameters: + * st : supertype information + * dev : imsm device that backup is saved for + * info : general array info + * buf : input buffer + * length : length of data to backup (blocks_per_unit) + * Returns: + * 0 : success + *, -1 : fail + ******************************************************************************/ +int save_backup_imsm(struct supertype *st, + struct imsm_dev *dev, + struct mdinfo *info, + void *buf, + int length) +{ + int rv = -1; + struct intel_super *super = st->sb; + unsigned long long *target_offsets = NULL; + int *targets = NULL; + int i; + struct imsm_map *map_dest = get_imsm_map(dev, MAP_0); + int new_disks = map_dest->num_members; + int dest_layout = 0; + int dest_chunk; + unsigned long long start; + int data_disks = imsm_num_data_members(dev, MAP_0); + + targets = xmalloc(new_disks * sizeof(int)); + + for (i = 0; i < new_disks; i++) + targets[i] = -1; + + target_offsets = xcalloc(new_disks, sizeof(unsigned long long)); + + start = info->reshape_progress * 512; + for (i = 0; i < new_disks; i++) { + target_offsets[i] = (unsigned long long) + __le32_to_cpu(super->migr_rec->ckpt_area_pba) * 512; + /* move back copy area adderss, it will be moved forward + * in restore_stripes() using start input variable + */ + target_offsets[i] -= start/data_disks; + } + + if (open_backup_targets(info, new_disks, targets, + super, dev)) + goto abort; + + dest_layout = imsm_level_to_layout(map_dest->raid_level); + dest_chunk = __le16_to_cpu(map_dest->blocks_per_strip) * 512; + + if (restore_stripes(targets, /* list of dest devices */ + target_offsets, /* migration record offsets */ + new_disks, + dest_chunk, + map_dest->raid_level, + dest_layout, + -1, /* source backup file descriptor */ + 0, /* input buf offset + * always 0 buf is already offseted */ + start, + length, + buf) != 0) { + pr_err("Error restoring stripes\n"); + goto abort; + } + + rv = 0; + +abort: + if (targets) { + close_targets(targets, new_disks); + free(targets); + } + 
free(target_offsets); + + return rv; +} + +/******************************************************************************* + * Function: save_checkpoint_imsm + * Description: Function called for current unit status update + * in the migration record. It writes it to disk. + * Parameters: + * super : imsm internal array info + * info : general array info + * Returns: + * 0: success + * 1: failure + * 2: failure, means no valid migration record + * / no general migration in progress / + ******************************************************************************/ +int save_checkpoint_imsm(struct supertype *st, struct mdinfo *info, int state) +{ + struct intel_super *super = st->sb; + unsigned long long blocks_per_unit; + unsigned long long curr_migr_unit; + + if (load_imsm_migr_rec(super, info) != 0) { + dprintf("imsm: ERROR: Cannot read migration record for checkpoint save.\n"); + return 1; + } + + blocks_per_unit = __le32_to_cpu(super->migr_rec->blocks_per_unit); + if (blocks_per_unit == 0) { + dprintf("imsm: no migration in progress.\n"); + return 2; + } + curr_migr_unit = info->reshape_progress / blocks_per_unit; + /* check if array is alligned to copy area + * if it is not alligned, add one to current migration unit value + * this can happend on array reshape finish only + */ + if (info->reshape_progress % blocks_per_unit) + curr_migr_unit++; + + super->migr_rec->curr_migr_unit = + __cpu_to_le32(curr_migr_unit); + super->migr_rec->rec_status = __cpu_to_le32(state); + super->migr_rec->dest_1st_member_lba = + __cpu_to_le32(curr_migr_unit * + __le32_to_cpu(super->migr_rec->dest_depth_per_unit)); + if (write_imsm_migr_rec(st) < 0) { + dprintf("imsm: Cannot write migration record outside backup area\n"); + return 1; + } + + return 0; +} + +/******************************************************************************* + * Function: recover_backup_imsm + * Description: Function recovers critical data from the Migration Copy Area + * while assembling an array. 
+ * Parameters: + * super : imsm internal array info + * info : general array info + * Returns: + * 0 : success (or there is no data to recover) + * 1 : fail + ******************************************************************************/ +int recover_backup_imsm(struct supertype *st, struct mdinfo *info) +{ + struct intel_super *super = st->sb; + struct migr_record *migr_rec = super->migr_rec; + struct imsm_map *map_dest = NULL; + struct intel_dev *id = NULL; + unsigned long long read_offset; + unsigned long long write_offset; + unsigned unit_len; + int *targets = NULL; + int new_disks, i, err; + char *buf = NULL; + int retval = 1; + unsigned long curr_migr_unit = __le32_to_cpu(migr_rec->curr_migr_unit); + unsigned long num_migr_units = __le32_to_cpu(migr_rec->num_migr_units); + char buffer[20]; + int skipped_disks = 0; + + err = sysfs_get_str(info, NULL, "array_state", (char *)buffer, 20); + if (err < 1) + return 1; + + /* recover data only during assemblation */ + if (strncmp(buffer, "inactive", 8) != 0) + return 0; + /* no data to recover */ + if (__le32_to_cpu(migr_rec->rec_status) == UNIT_SRC_NORMAL) + return 0; + if (curr_migr_unit >= num_migr_units) + return 1; + + /* find device during reshape */ + for (id = super->devlist; id; id = id->next) + if (is_gen_migration(id->dev)) + break; + if (id == NULL) + return 1; + + map_dest = get_imsm_map(id->dev, MAP_0); + new_disks = map_dest->num_members; + + read_offset = (unsigned long long) + __le32_to_cpu(migr_rec->ckpt_area_pba) * 512; + + write_offset = ((unsigned long long) + __le32_to_cpu(migr_rec->dest_1st_member_lba) + + pba_of_lba0(map_dest)) * 512; + + unit_len = __le32_to_cpu(migr_rec->dest_depth_per_unit) * 512; + if (posix_memalign((void **)&buf, 512, unit_len) != 0) + goto abort; + targets = xcalloc(new_disks, sizeof(int)); + + if (open_backup_targets(info, new_disks, targets, super, id->dev)) { + pr_err("Cannot open some devices belonging to array.\n"); + goto abort; + } + + for (i = 0; i < 
new_disks; i++) { + if (targets[i] < 0) { + skipped_disks++; + continue; + } + if (lseek64(targets[i], read_offset, SEEK_SET) < 0) { + pr_err("Cannot seek to block: %s\n", + strerror(errno)); + skipped_disks++; + continue; + } + if ((unsigned)read(targets[i], buf, unit_len) != unit_len) { + pr_err("Cannot read copy area block: %s\n", + strerror(errno)); + skipped_disks++; + continue; + } + if (lseek64(targets[i], write_offset, SEEK_SET) < 0) { + pr_err("Cannot seek to block: %s\n", + strerror(errno)); + skipped_disks++; + continue; + } + if ((unsigned)write(targets[i], buf, unit_len) != unit_len) { + pr_err("Cannot restore block: %s\n", + strerror(errno)); + skipped_disks++; + continue; + } + } + + if (skipped_disks > imsm_get_allowed_degradation(info->new_level, + new_disks, + super, + id->dev)) { + pr_err("Cannot restore data from backup. Too many failed disks\n"); + goto abort; + } + + if (save_checkpoint_imsm(st, info, UNIT_SRC_NORMAL)) { + /* ignore error == 2, this can mean end of reshape here + */ + dprintf("imsm: Cannot write checkpoint to migration record (UNIT_SRC_NORMAL) during restart\n"); + } else + retval = 0; + +abort: + if (targets) { + for (i = 0; i < new_disks; i++) + if (targets[i]) + close(targets[i]); + free(targets); + } + free(buf); + return retval; +} + +static char disk_by_path[] = "/dev/disk/by-path/"; + +static const char *imsm_get_disk_controller_domain(const char *path) +{ + char disk_path[PATH_MAX]; + char *drv=NULL; + struct stat st; + + strcpy(disk_path, disk_by_path); + strncat(disk_path, path, PATH_MAX - strlen(disk_path) - 1); + if (stat(disk_path, &st) == 0) { + struct sys_dev* hba; + char *path=NULL; + + path = devt_to_devpath(st.st_rdev); + if (path == NULL) + return "unknown"; + hba = find_disk_attached_hba(-1, path); + if (hba && hba->type == SYS_DEV_SAS) + drv = "isci"; + else if (hba && hba->type == SYS_DEV_SATA) + drv = "ahci"; + else + drv = "unknown"; + dprintf("path: %s hba: %s attached: %s\n", + path, (hba) ? 
hba->path : "NULL", drv); + free(path); + } + return drv; +} + +static char *imsm_find_array_devnm_by_subdev(int subdev, char *container) +{ + static char devnm[32]; + char subdev_name[20]; + struct mdstat_ent *mdstat; + + sprintf(subdev_name, "%d", subdev); + mdstat = mdstat_by_subdev(subdev_name, container); + if (!mdstat) + return NULL; + + strcpy(devnm, mdstat->devnm); + free_mdstat(mdstat); + return devnm; +} + +static int imsm_reshape_is_allowed_on_container(struct supertype *st, + struct geo_params *geo, + int *old_raid_disks, + int direction) +{ + /* currently we only support increasing the number of devices + * for a container. This increases the number of device for each + * member array. They must all be RAID0 or RAID5. + */ + int ret_val = 0; + struct mdinfo *info, *member; + int devices_that_can_grow = 0; + + dprintf("imsm: imsm_reshape_is_allowed_on_container(ENTER): st->devnm = (%s)\n", st->devnm); + + if (geo->size > 0 || + geo->level != UnSet || + geo->layout != UnSet || + geo->chunksize != 0 || + geo->raid_disks == UnSet) { + dprintf("imsm: Container operation is allowed for raid disks number change only.\n"); + return ret_val; + } + + if (direction == ROLLBACK_METADATA_CHANGES) { + dprintf("imsm: Metadata changes rollback is not supported for container operation.\n"); + return ret_val; + } + + info = container_content_imsm(st, NULL); + for (member = info; member; member = member->next) { + char *result; + + dprintf("imsm: checking device_num: %i\n", + member->container_member); + + if (geo->raid_disks <= member->array.raid_disks) { + /* we work on container for Online Capacity Expansion + * only so raid_disks has to grow + */ + dprintf("imsm: for container operation raid disks increase is required\n"); + break; + } + + if ((info->array.level != 0) && + (info->array.level != 5)) { + /* we cannot use this container with other raid level + */ + dprintf("imsm: for container operation wrong raid level (%i) detected\n", + info->array.level); + break; + 
} else { + /* check for platform support + * for this raid level configuration + */ + struct intel_super *super = st->sb; + if (!is_raid_level_supported(super->orom, + member->array.level, + geo->raid_disks)) { + dprintf("platform does not support raid%d with %d disk%s\n", + info->array.level, + geo->raid_disks, + geo->raid_disks > 1 ? "s" : ""); + break; + } + /* check if component size is aligned to chunk size + */ + if (info->component_size % + (info->array.chunk_size/512)) { + dprintf("Component size is not aligned to chunk size\n"); + break; + } + } + + if (*old_raid_disks && + info->array.raid_disks != *old_raid_disks) + break; + *old_raid_disks = info->array.raid_disks; + + /* All raid5 and raid0 volumes in container + * have to be ready for Online Capacity Expansion + * so they need to be assembled. We have already + * checked that no recovery etc is happening. + */ + result = imsm_find_array_devnm_by_subdev(member->container_member, + st->container_devnm); + if (result == NULL) { + dprintf("imsm: cannot find array\n"); + break; + } + devices_that_can_grow++; + } + sysfs_free(info); + if (!member && devices_that_can_grow) + ret_val = 1; + + if (ret_val) + dprintf("Container operation allowed\n"); + else + dprintf("Error: %i\n", ret_val); + + return ret_val; +} + +/* Function: get_spares_for_grow + * Description: Allocates memory and creates list of spare devices + * avaliable in container. Checks if spare drive size is acceptable. 
+ * Parameters: Pointer to the supertype structure + * Returns: Pointer to the list of spare devices (mdinfo structure) on success, + * NULL if fail + */ +static struct mdinfo *get_spares_for_grow(struct supertype *st) +{ + unsigned long long min_size = min_acceptable_spare_size_imsm(st); + return container_choose_spares(st, min_size, NULL, NULL, NULL, 0); +} + +/****************************************************************************** + * function: imsm_create_metadata_update_for_reshape + * Function creates update for whole IMSM container. + * + ******************************************************************************/ +static int imsm_create_metadata_update_for_reshape( + struct supertype *st, + struct geo_params *geo, + int old_raid_disks, + struct imsm_update_reshape **updatep) +{ + struct intel_super *super = st->sb; + struct imsm_super *mpb = super->anchor; + int update_memory_size = 0; + struct imsm_update_reshape *u = NULL; + struct mdinfo *spares = NULL; + int i; + int delta_disks = 0; + struct mdinfo *dev; + + dprintf("(enter) raid_disks = %i\n", geo->raid_disks); + + delta_disks = geo->raid_disks - old_raid_disks; + + /* size of all update data without anchor */ + update_memory_size = sizeof(struct imsm_update_reshape); + + /* now add space for spare disks that we need to add. 
*/ + update_memory_size += sizeof(u->new_disks[0]) * (delta_disks - 1); + + u = xcalloc(1, update_memory_size); + u->type = update_reshape_container_disks; + u->old_raid_disks = old_raid_disks; + u->new_raid_disks = geo->raid_disks; + + /* now get spare disks list + */ + spares = get_spares_for_grow(st); + + if (spares == NULL + || delta_disks > spares->array.spare_disks) { + pr_err("imsm: ERROR: Cannot get spare devices for %s.\n", geo->dev_name); + i = -1; + goto abort; + } + + /* we have got spares + * update disk list in imsm_disk list table in anchor + */ + dprintf("imsm: %i spares are available.\n\n", + spares->array.spare_disks); + + dev = spares->devs; + for (i = 0; i < delta_disks; i++) { + struct dl *dl; + + if (dev == NULL) + break; + u->new_disks[i] = makedev(dev->disk.major, + dev->disk.minor); + dl = get_disk_super(super, dev->disk.major, dev->disk.minor); + dl->index = mpb->num_disks; + mpb->num_disks++; + dev = dev->next; + } + +abort: + /* free spares + */ + sysfs_free(spares); + + dprintf("imsm: reshape update preparation :"); + if (i == delta_disks) { + dprintf_cont(" OK\n"); + *updatep = u; + return update_memory_size; + } + free(u); + dprintf_cont(" Error\n"); + + return 0; +} + +/****************************************************************************** + * function: imsm_create_metadata_update_for_size_change() + * Creates update for IMSM array for array size change. 
+ * + ******************************************************************************/ +static int imsm_create_metadata_update_for_size_change( + struct supertype *st, + struct geo_params *geo, + struct imsm_update_size_change **updatep) +{ + struct intel_super *super = st->sb; + int update_memory_size = 0; + struct imsm_update_size_change *u = NULL; + + dprintf("(enter) New size = %llu\n", geo->size); + + /* size of all update data without anchor */ + update_memory_size = sizeof(struct imsm_update_size_change); + + u = xcalloc(1, update_memory_size); + u->type = update_size_change; + u->subdev = super->current_vol; + u->new_size = geo->size; + + dprintf("imsm: reshape update preparation : OK\n"); + *updatep = u; + + return update_memory_size; +} + +/****************************************************************************** + * function: imsm_create_metadata_update_for_migration() + * Creates update for IMSM array. + * + ******************************************************************************/ +static int imsm_create_metadata_update_for_migration( + struct supertype *st, + struct geo_params *geo, + struct imsm_update_reshape_migration **updatep) +{ + struct intel_super *super = st->sb; + int update_memory_size = 0; + struct imsm_update_reshape_migration *u = NULL; + struct imsm_dev *dev; + int previous_level = -1; + + dprintf("(enter) New Level = %i\n", geo->level); + + /* size of all update data without anchor */ + update_memory_size = sizeof(struct imsm_update_reshape_migration); + + u = xcalloc(1, update_memory_size); + u->type = update_reshape_migration; + u->subdev = super->current_vol; + u->new_level = geo->level; + u->new_layout = geo->layout; + u->new_raid_disks = u->old_raid_disks = geo->raid_disks; + u->new_disks[0] = -1; + u->new_chunksize = -1; + + dev = get_imsm_dev(super, u->subdev); + if (dev) { + struct imsm_map *map; + + map = get_imsm_map(dev, MAP_0); + if (map) { + int current_chunk_size = + __le16_to_cpu(map->blocks_per_strip) / 2; + + 
if (geo->chunksize != current_chunk_size) { + u->new_chunksize = geo->chunksize / 1024; + dprintf("imsm: chunk size change from %i to %i\n", + current_chunk_size, u->new_chunksize); + } + previous_level = map->raid_level; + } + } + if ((geo->level == 5) && (previous_level == 0)) { + struct mdinfo *spares = NULL; + + u->new_raid_disks++; + spares = get_spares_for_grow(st); + if ((spares == NULL) || (spares->array.spare_disks < 1)) { + free(u); + sysfs_free(spares); + update_memory_size = 0; + dprintf("error: cannot get spare device for requested migration"); + return 0; + } + sysfs_free(spares); + } + dprintf("imsm: reshape update preparation : OK\n"); + *updatep = u; + + return update_memory_size; +} + +static void imsm_update_metadata_locally(struct supertype *st, + void *buf, int len) +{ + struct metadata_update mu; + + mu.buf = buf; + mu.len = len; + mu.space = NULL; + mu.space_list = NULL; + mu.next = NULL; + if (imsm_prepare_update(st, &mu)) + imsm_process_update(st, &mu); + + while (mu.space_list) { + void **space = mu.space_list; + mu.space_list = *space; + free(space); + } +} + +/*************************************************************************** +* Function: imsm_analyze_change +* Description: Function analyze change for single volume +* and validate if transition is supported +* Parameters: Geometry parameters, supertype structure, +* metadata change direction (apply/rollback) +* Returns: Operation type code on success, -1 if fail +****************************************************************************/ +enum imsm_reshape_type imsm_analyze_change(struct supertype *st, + struct geo_params *geo, + int direction) +{ + struct mdinfo info; + int change = -1; + int check_devs = 0; + int chunk; + /* number of added/removed disks in operation result */ + int devNumChange = 0; + /* imsm compatible layout value for array geometry verification */ + int imsm_layout = -1; + int data_disks; + struct imsm_dev *dev; + struct intel_super *super; + unsigned 
long long current_size; + unsigned long long free_size; + unsigned long long max_size; + int rv; + + getinfo_super_imsm_volume(st, &info, NULL); + if ((geo->level != info.array.level) && + (geo->level >= 0) && + (geo->level != UnSet)) { + switch (info.array.level) { + case 0: + if (geo->level == 5) { + change = CH_MIGRATION; + if (geo->layout != ALGORITHM_LEFT_ASYMMETRIC) { + pr_err("Error. Requested Layout not supported (left-asymmetric layout is supported only)!\n"); + change = -1; + goto analyse_change_exit; + } + imsm_layout = geo->layout; + check_devs = 1; + devNumChange = 1; /* parity disk added */ + } else if (geo->level == 10) { + change = CH_TAKEOVER; + check_devs = 1; + devNumChange = 2; /* two mirrors added */ + imsm_layout = 0x102; /* imsm supported layout */ + } + break; + case 1: + case 10: + if (geo->level == 0) { + change = CH_TAKEOVER; + check_devs = 1; + devNumChange = -(geo->raid_disks/2); + imsm_layout = 0; /* imsm raid0 layout */ + } + break; + } + if (change == -1) { + pr_err("Error. Level Migration from %d to %d not supported!\n", + info.array.level, geo->level); + goto analyse_change_exit; + } + } else + geo->level = info.array.level; + + if ((geo->layout != info.array.layout) + && ((geo->layout != UnSet) && (geo->layout != -1))) { + change = CH_MIGRATION; + if ((info.array.layout == 0) + && (info.array.level == 5) + && (geo->layout == 5)) { + /* reshape 5 -> 4 */ + } else if ((info.array.layout == 5) + && (info.array.level == 5) + && (geo->layout == 0)) { + /* reshape 4 -> 5 */ + geo->layout = 0; + geo->level = 5; + } else { + pr_err("Error. 
Layout Migration from %d to %d not supported!\n", + info.array.layout, geo->layout); + change = -1; + goto analyse_change_exit; + } + } else { + geo->layout = info.array.layout; + if (imsm_layout == -1) + imsm_layout = info.array.layout; + } + + if ((geo->chunksize > 0) && (geo->chunksize != UnSet) + && (geo->chunksize != info.array.chunk_size)) + change = CH_MIGRATION; + else + geo->chunksize = info.array.chunk_size; + + chunk = geo->chunksize / 1024; + + super = st->sb; + dev = get_imsm_dev(super, super->current_vol); + data_disks = imsm_num_data_members(dev , MAP_0); + /* compute current size per disk member + */ + current_size = info.custom_array_size / data_disks; + + if ((geo->size > 0) && (geo->size != MAX_SIZE)) { + /* align component size + */ + geo->size = imsm_component_size_aligment_check( + get_imsm_raid_level(dev->vol.map), + chunk * 1024, + geo->size * 2); + if (geo->size == 0) { + pr_err("Error. Size expansion is supported only (current size is %llu, requested size /rounded/ is 0).\n", + current_size); + goto analyse_change_exit; + } + } + + if ((current_size != geo->size) && (geo->size > 0)) { + if (change != -1) { + pr_err("Error. Size change should be the only one at a time.\n"); + change = -1; + goto analyse_change_exit; + } + if ((super->current_vol + 1) != super->anchor->num_raid_devs) { + pr_err("Error. 
The last volume in container can be expanded only (%i/%s).\n", + super->current_vol, st->devnm); + goto analyse_change_exit; + } + /* check the maximum available size + */ + rv = imsm_get_free_size(st, dev->vol.map->num_members, + 0, chunk, &free_size); + if (rv == 0) + /* Cannot find maximum available space + */ + max_size = 0; + else { + max_size = free_size + current_size; + /* align component size + */ + max_size = imsm_component_size_aligment_check( + get_imsm_raid_level(dev->vol.map), + chunk * 1024, + max_size); + } + if (geo->size == MAX_SIZE) { + /* requested size change to the maximum available size + */ + if (max_size == 0) { + pr_err("Error. Cannot find maximum available space.\n"); + change = -1; + goto analyse_change_exit; + } else + geo->size = max_size; + } + + if ((direction == ROLLBACK_METADATA_CHANGES)) { + /* accept size for rollback only + */ + } else { + /* round size due to metadata compatibility + */ + geo->size = (geo->size >> SECT_PER_MB_SHIFT) + << SECT_PER_MB_SHIFT; + dprintf("Prepare update for size change to %llu\n", + geo->size ); + if (current_size >= geo->size) { + pr_err("Error. Size expansion is supported only (current size is %llu, requested size /rounded/ is %llu).\n", + current_size, geo->size); + goto analyse_change_exit; + } + if (max_size && geo->size > max_size) { + pr_err("Error. Requested size is larger than maximum available size (maximum available size is %llu, requested size /rounded/ is %llu).\n", + max_size, geo->size); + goto analyse_change_exit; + } + } + geo->size *= data_disks; + geo->raid_disks = dev->vol.map->num_members; + change = CH_ARRAY_SIZE; + } + if (!validate_geometry_imsm(st, + geo->level, + imsm_layout, + geo->raid_disks + devNumChange, + &chunk, + geo->size, INVALID_SECTORS, + 0, 0, 1)) + change = -1; + + if (check_devs) { + struct intel_super *super = st->sb; + struct imsm_super *mpb = super->anchor; + + if (mpb->num_raid_devs > 1) { + pr_err("Error. 
Cannot perform operation on %s- for this operation it MUST be single array in container\n", + geo->dev_name); + change = -1; + } + } + +analyse_change_exit: + if ((direction == ROLLBACK_METADATA_CHANGES) && + ((change == CH_MIGRATION) || (change == CH_TAKEOVER))) { + dprintf("imsm: Metadata changes rollback is not supported for migration and takeover operations.\n"); + change = -1; + } + return change; +} + +int imsm_takeover(struct supertype *st, struct geo_params *geo) +{ + struct intel_super *super = st->sb; + struct imsm_update_takeover *u; + + u = xmalloc(sizeof(struct imsm_update_takeover)); + + u->type = update_takeover; + u->subarray = super->current_vol; + + /* 10->0 transition */ + if (geo->level == 0) + u->direction = R10_TO_R0; + + /* 0->10 transition */ + if (geo->level == 10) + u->direction = R0_TO_R10; + + /* update metadata locally */ + imsm_update_metadata_locally(st, u, + sizeof(struct imsm_update_takeover)); + /* and possibly remotely */ + if (st->update_tail) + append_metadata_update(st, u, + sizeof(struct imsm_update_takeover)); + else + free(u); + + return 0; +} + +static int imsm_reshape_super(struct supertype *st, unsigned long long size, + int level, + int layout, int chunksize, int raid_disks, + int delta_disks, char *backup, char *dev, + int direction, int verbose) +{ + int ret_val = 1; + struct geo_params geo; + + dprintf("(enter)\n"); + + memset(&geo, 0, sizeof(struct geo_params)); + + geo.dev_name = dev; + strcpy(geo.devnm, st->devnm); + geo.size = size; + geo.level = level; + geo.layout = layout; + geo.chunksize = chunksize; + geo.raid_disks = raid_disks; + if (delta_disks != UnSet) + geo.raid_disks += delta_disks; + + dprintf("for level : %i\n", geo.level); + dprintf("for raid_disks : %i\n", geo.raid_disks); + + if (experimental() == 0) + return ret_val; + + if (strcmp(st->container_devnm, st->devnm) == 0) { + /* On container level we can only increase number of devices. 
*/ + dprintf("imsm: info: Container operation\n"); + int old_raid_disks = 0; + + if (imsm_reshape_is_allowed_on_container( + st, &geo, &old_raid_disks, direction)) { + struct imsm_update_reshape *u = NULL; + int len; + + len = imsm_create_metadata_update_for_reshape( + st, &geo, old_raid_disks, &u); + + if (len <= 0) { + dprintf("imsm: Cannot prepare update\n"); + goto exit_imsm_reshape_super; + } + + ret_val = 0; + /* update metadata locally */ + imsm_update_metadata_locally(st, u, len); + /* and possibly remotely */ + if (st->update_tail) + append_metadata_update(st, u, len); + else + free(u); + + } else { + pr_err("(imsm) Operation is not allowed on this container\n"); + } + } else { + /* On volume level we support following operations + * - takeover: raid10 -> raid0; raid0 -> raid10 + * - chunk size migration + * - migration: raid5 -> raid0; raid0 -> raid5 + */ + struct intel_super *super = st->sb; + struct intel_dev *dev = super->devlist; + int change; + dprintf("imsm: info: Volume operation\n"); + /* find requested device */ + while (dev) { + char *devnm = + imsm_find_array_devnm_by_subdev( + dev->index, st->container_devnm); + if (devnm && strcmp(devnm, geo.devnm) == 0) + break; + dev = dev->next; + } + if (dev == NULL) { + pr_err("Cannot find %s (%s) subarray\n", + geo.dev_name, geo.devnm); + goto exit_imsm_reshape_super; + } + super->current_vol = dev->index; + change = imsm_analyze_change(st, &geo, direction); + switch (change) { + case CH_TAKEOVER: + ret_val = imsm_takeover(st, &geo); + break; + case CH_MIGRATION: { + struct imsm_update_reshape_migration *u = NULL; + int len = + imsm_create_metadata_update_for_migration( + st, &geo, &u); + if (len < 1) { + dprintf("imsm: Cannot prepare update\n"); + break; + } + ret_val = 0; + /* update metadata locally */ + imsm_update_metadata_locally(st, u, len); + /* and possibly remotely */ + if (st->update_tail) + append_metadata_update(st, u, len); + else + free(u); + } + break; + case CH_ARRAY_SIZE: { + struct 
imsm_update_size_change *u = NULL; + int len = + imsm_create_metadata_update_for_size_change( + st, &geo, &u); + if (len < 1) { + dprintf("imsm: Cannot prepare update\n"); + break; + } + ret_val = 0; + /* update metadata locally */ + imsm_update_metadata_locally(st, u, len); + /* and possibly remotely */ + if (st->update_tail) + append_metadata_update(st, u, len); + else + free(u); + } + break; + default: + ret_val = 1; + } + } + +exit_imsm_reshape_super: + dprintf("imsm: reshape_super Exit code = %i\n", ret_val); + return ret_val; +} + +/******************************************************************************* + * Function: wait_for_reshape_imsm + * Description: Function writes new sync_max value and waits until + * reshape process reach new position + * Parameters: + * sra : general array info + * ndata : number of disks in new array's layout + * Returns: + * 0 : success, + * 1 : there is no reshape in progress, + * -1 : fail + ******************************************************************************/ +int wait_for_reshape_imsm(struct mdinfo *sra, int ndata) +{ + int fd = sysfs_get_fd(sra, NULL, "sync_completed"); + unsigned long long completed; + /* to_complete : new sync_max position */ + unsigned long long to_complete = sra->reshape_progress; + unsigned long long position_to_set = to_complete / ndata; + + if (fd < 0) { + dprintf("cannot open reshape_position\n"); + return 1; + } + + if (sysfs_fd_get_ll(fd, &completed) < 0) { + dprintf("cannot read reshape_position (no reshape in progres)\n"); + close(fd); + return 1; + } + + if (completed > position_to_set) { + dprintf("wrong next position to set %llu (%llu)\n", + to_complete, position_to_set); + close(fd); + return -1; + } + dprintf("Position set: %llu\n", position_to_set); + if (sysfs_set_num(sra, NULL, "sync_max", + position_to_set) != 0) { + dprintf("cannot set reshape position to %llu\n", + position_to_set); + close(fd); + return -1; + } + + do { + char action[20]; + int timeout = 3000; + 
sysfs_wait(fd, &timeout); + if (sysfs_get_str(sra, NULL, "sync_action", + action, 20) > 0 && + strncmp(action, "reshape", 7) != 0) { + close(fd); + return -1; + } + if (sysfs_fd_get_ll(fd, &completed) < 0) { + dprintf("cannot read reshape_position (in loop)\n"); + close(fd); + return 1; + } + } while (completed < position_to_set); + close(fd); + return 0; + +} + +/******************************************************************************* + * Function: check_degradation_change + * Description: Check that array hasn't become failed. + * Parameters: + * info : for sysfs access + * sources : source disks descriptors (closed and set to -1 for dead devices) + * degraded: previous degradation level + * Returns: + * degradation level + ******************************************************************************/ +int check_degradation_change(struct mdinfo *info, + int *sources, + int degraded) +{ + unsigned long long new_degraded; + int rv; + + rv = sysfs_get_ll(info, NULL, "degraded", &new_degraded); + if ((rv == -1) || (new_degraded != (unsigned long long)degraded)) { + /* check each device to ensure it is still working */ + struct mdinfo *sd; + new_degraded = 0; + for (sd = info->devs ; sd ; sd = sd->next) { + if (sd->disk.state & (1<<MD_DISK_FAULTY)) + continue; + if (sd->disk.state & (1<<MD_DISK_SYNC)) { + char sbuf[20]; + if (sysfs_get_str(info, + sd, "state", sbuf, 20) < 0 || + strstr(sbuf, "faulty") || + strstr(sbuf, "in_sync") == NULL) { + /* this device is dead */ + sd->disk.state = (1<<MD_DISK_FAULTY); + if (sd->disk.raid_disk >= 0 && + sources[sd->disk.raid_disk] >= 0) { + close(sources[ + sd->disk.raid_disk]); + sources[sd->disk.raid_disk] = + -1; + } + new_degraded++; + } + } + } + } + + return new_degraded; +} + +/******************************************************************************* + * Function: imsm_manage_reshape + * Description: Function finds array under reshape and it manages reshape + * process. It creates stripes backups (if required) and sets + * checheckpoits. 
+ * Parameters: + * afd : Backup handle (nattive) - not used + * sra : general array info + * reshape : reshape parameters - not used + * st : supertype structure + * blocks : size of critical section [blocks] + * fds : table of source device descriptor + * offsets : start of array (offest per devices) + * dests : not used + * destfd : table of destination device descriptor + * destoffsets : table of destination offsets (per device) + * Returns: + * 1 : success, reshape is done + * 0 : fail + ******************************************************************************/ +static int imsm_manage_reshape( + int afd, struct mdinfo *sra, struct reshape *reshape, + struct supertype *st, unsigned long backup_blocks, + int *fds, unsigned long long *offsets, + int dests, int *destfd, unsigned long long *destoffsets) +{ + int ret_val = 0; + struct intel_super *super = st->sb; + struct intel_dev *dv = NULL; + struct imsm_dev *dev = NULL; + struct imsm_map *map_src; + int migr_vol_qan = 0; + int ndata, odata; /* [bytes] */ + int chunk; /* [bytes] */ + struct migr_record *migr_rec; + char *buf = NULL; + unsigned int buf_size; /* [bytes] */ + unsigned long long max_position; /* array size [bytes] */ + unsigned long long next_step; /* [blocks]/[bytes] */ + unsigned long long old_data_stripe_length; + unsigned long long start_src; /* [bytes] */ + unsigned long long start; /* [bytes] */ + unsigned long long start_buf_shift; /* [bytes] */ + int degraded = 0; + int source_layout = 0; + + if (!fds || !offsets || !sra) + goto abort; + + /* Find volume during the reshape */ + for (dv = super->devlist; dv; dv = dv->next) { + if (dv->dev->vol.migr_type == MIGR_GEN_MIGR + && dv->dev->vol.migr_state == 1) { + dev = dv->dev; + migr_vol_qan++; + } + } + /* Only one volume can migrate at the same time */ + if (migr_vol_qan != 1) { + pr_err(": %s", migr_vol_qan ? 
+ "Number of migrating volumes greater than 1\n" : + "There is no volume during migrationg\n"); + goto abort; + } + + map_src = get_imsm_map(dev, MAP_1); + if (map_src == NULL) + goto abort; + + ndata = imsm_num_data_members(dev, MAP_0); + odata = imsm_num_data_members(dev, MAP_1); + + chunk = __le16_to_cpu(map_src->blocks_per_strip) * 512; + old_data_stripe_length = odata * chunk; + + migr_rec = super->migr_rec; + + /* initialize migration record for start condition */ + if (sra->reshape_progress == 0) + init_migr_record_imsm(st, dev, sra); + else { + if (__le32_to_cpu(migr_rec->rec_status) != UNIT_SRC_NORMAL) { + dprintf("imsm: cannot restart migration when data are present in copy area.\n"); + goto abort; + } + /* Save checkpoint to update migration record for current + * reshape position (in md). It can be farther than current + * reshape position in metadata. + */ + if (save_checkpoint_imsm(st, sra, UNIT_SRC_NORMAL) == 1) { + /* ignore error == 2, this can mean end of reshape here + */ + dprintf("imsm: Cannot write checkpoint to migration record (UNIT_SRC_NORMAL, initial save)\n"); + goto abort; + } + } + + /* size for data */ + buf_size = __le32_to_cpu(migr_rec->blocks_per_unit) * 512; + /* extend buffer size for parity disk */ + buf_size += __le32_to_cpu(migr_rec->dest_depth_per_unit) * 512; + /* add space for stripe aligment */ + buf_size += old_data_stripe_length; + if (posix_memalign((void **)&buf, 4096, buf_size)) { + dprintf("imsm: Cannot allocate checpoint buffer\n"); + goto abort; + } + + max_position = sra->component_size * ndata; + source_layout = imsm_level_to_layout(map_src->raid_level); + + while (__le32_to_cpu(migr_rec->curr_migr_unit) < + __le32_to_cpu(migr_rec->num_migr_units)) { + /* current reshape position [blocks] */ + unsigned long long current_position = + __le32_to_cpu(migr_rec->blocks_per_unit) + * __le32_to_cpu(migr_rec->curr_migr_unit); + unsigned long long border; + + /* Check that array hasn't become failed. 
+ */ + degraded = check_degradation_change(sra, fds, degraded); + if (degraded > 1) { + dprintf("imsm: Abort reshape due to degradation level (%i)\n", degraded); + goto abort; + } + + next_step = __le32_to_cpu(migr_rec->blocks_per_unit); + + if ((current_position + next_step) > max_position) + next_step = max_position - current_position; + + start = current_position * 512; + + /* allign reading start to old geometry */ + start_buf_shift = start % old_data_stripe_length; + start_src = start - start_buf_shift; + + border = (start_src / odata) - (start / ndata); + border /= 512; + if (border <= __le32_to_cpu(migr_rec->dest_depth_per_unit)) { + /* save critical stripes to buf + * start - start address of current unit + * to backup [bytes] + * start_src - start address of current unit + * to backup alligned to source array + * [bytes] + */ + unsigned long long next_step_filler = 0; + unsigned long long copy_length = next_step * 512; + + /* allign copy area length to stripe in old geometry */ + next_step_filler = ((copy_length + start_buf_shift) + % old_data_stripe_length); + if (next_step_filler) + next_step_filler = (old_data_stripe_length + - next_step_filler); + dprintf("save_stripes() parameters: start = %llu,\tstart_src = %llu,\tnext_step*512 = %llu,\tstart_in_buf_shift = %llu,\tnext_step_filler = %llu\n", + start, start_src, copy_length, + start_buf_shift, next_step_filler); + + if (save_stripes(fds, offsets, map_src->num_members, + chunk, map_src->raid_level, + source_layout, 0, NULL, start_src, + copy_length + + next_step_filler + start_buf_shift, + buf)) { + dprintf("imsm: Cannot save stripes to buffer\n"); + goto abort; + } + /* Convert data to destination format and store it + * in backup general migration area + */ + if (save_backup_imsm(st, dev, sra, + buf + start_buf_shift, copy_length)) { + dprintf("imsm: Cannot save stripes to target devices\n"); + goto abort; + } + if (save_checkpoint_imsm(st, sra, + UNIT_SRC_IN_CP_AREA)) { + dprintf("imsm: Cannot write 
checkpoint to migration record (UNIT_SRC_IN_CP_AREA)\n"); + goto abort; + } + } else { + /* set next step to use whole border area */ + border /= next_step; + if (border > 1) + next_step *= border; + } + /* When data backed up, checkpoint stored, + * kick the kernel to reshape unit of data + */ + next_step = next_step + sra->reshape_progress; + /* limit next step to array max position */ + if (next_step > max_position) + next_step = max_position; + sysfs_set_num(sra, NULL, "suspend_lo", sra->reshape_progress); + sysfs_set_num(sra, NULL, "suspend_hi", next_step); + sra->reshape_progress = next_step; + + /* wait until reshape finish */ + if (wait_for_reshape_imsm(sra, ndata)) { + dprintf("wait_for_reshape_imsm returned error!\n"); + goto abort; + } + if (sigterm) + goto abort; + + if (save_checkpoint_imsm(st, sra, UNIT_SRC_NORMAL) == 1) { + /* ignore error == 2, this can mean end of reshape here + */ + dprintf("imsm: Cannot write checkpoint to migration record (UNIT_SRC_NORMAL)\n"); + goto abort; + } + + } + + /* clear migr_rec on disks after successful migration */ + struct dl *d; + + memset(super->migr_rec_buf, 0, MIGR_REC_BUF_SIZE); + for (d = super->disks; d; d = d->next) { + if (d->index < 0 || is_failed(&d->disk)) + continue; + unsigned long long dsize; + + get_dev_size(d->fd, NULL, &dsize); + if (lseek64(d->fd, dsize - MIGR_REC_POSITION, + SEEK_SET) >= 0) { + if (write(d->fd, super->migr_rec_buf, + MIGR_REC_BUF_SIZE) != MIGR_REC_BUF_SIZE) + perror("Write migr_rec failed"); + } + } + + /* return '1' if done */ + ret_val = 1; +abort: + free(buf); + + return ret_val; +} + +#endif /* MDASSEMBLE */ + +struct superswitch super_imsm = { +#ifndef MDASSEMBLE + .examine_super = examine_super_imsm, + .brief_examine_super = brief_examine_super_imsm, + .brief_examine_subarrays = brief_examine_subarrays_imsm, + .export_examine_super = export_examine_super_imsm, + .detail_super = detail_super_imsm, + .brief_detail_super = brief_detail_super_imsm, + .write_init_super = 
write_init_super_imsm, + .validate_geometry = validate_geometry_imsm, + .add_to_super = add_to_super_imsm, + .remove_from_super = remove_from_super_imsm, + .detail_platform = detail_platform_imsm, + .export_detail_platform = export_detail_platform_imsm, + .kill_subarray = kill_subarray_imsm, + .update_subarray = update_subarray_imsm, + .load_container = load_container_imsm, + .default_geometry = default_geometry_imsm, + .get_disk_controller_domain = imsm_get_disk_controller_domain, + .reshape_super = imsm_reshape_super, + .manage_reshape = imsm_manage_reshape, + .recover_backup = recover_backup_imsm, + .copy_metadata = copy_metadata_imsm, +#endif + .match_home = match_home_imsm, + .uuid_from_super= uuid_from_super_imsm, + .getinfo_super = getinfo_super_imsm, + .getinfo_super_disks = getinfo_super_disks_imsm, + .update_super = update_super_imsm, + + .avail_size = avail_size_imsm, + .min_acceptable_spare_size = min_acceptable_spare_size_imsm, + + .compare_super = compare_super_imsm, + + .load_super = load_super_imsm, + .init_super = init_super_imsm, + .store_super = store_super_imsm, + .free_super = free_super_imsm, + .match_metadata_desc = match_metadata_desc_imsm, + .container_content = container_content_imsm, + .validate_container = validate_container_imsm, + + .external = 1, + .name = "imsm", + +#ifndef MDASSEMBLE +/* for mdmon */ + .open_new = imsm_open_new, + .set_array_state= imsm_set_array_state, + .set_disk = imsm_set_disk, + .sync_metadata = imsm_sync_metadata, + .activate_spare = imsm_activate_spare, + .process_update = imsm_process_update, + .prepare_update = imsm_prepare_update, +#endif /* MDASSEMBLE */ +}; diff --git a/super-mbr.c b/super-mbr.c new file mode 100644 index 00000000..62b3f031 --- /dev/null +++ b/super-mbr.c @@ -0,0 +1,204 @@ +/* + * mdadm - manage Linux "md" devices aka RAID arrays. 
+ * + * Copyright (C) 2010 Neil Brown + * + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Author: Neil Brown + * Email: + * + */ + +/* + * 'mbr' is a pseudo metadata type for devices which have a + * partition table in the Master Boot Record (mbr) also known + * as a dos partition table. + * + * Obviously arrays cannot be created or assembled for this type. + * It is used to allow a new bare device to have a partition table + * added so the member partitions can then be included in other + * arrays as relevant. 
+ * + * The meaning operations are: + * examine_super, but not brief_examine_super or export_examine + * load_super + * store_super + */ + +#include "mdadm.h" +#include "part.h" + +static void free_mbr(struct supertype *st) +{ + free(st->sb); + st->sb = NULL; +} + +#ifndef MDASSEMBLE + +static void examine_mbr(struct supertype *st, char *homehost) +{ + struct MBR *sb = st->sb; + int i; + + printf(" MBR Magic : %04x\n", sb->magic); + for (i = 0; i < MBR_PARTITIONS; i++) + if (sb->parts[i].blocks_num) + printf("Partition[%d] : %12lu sectors at %12lu (type %02x)\n", + i, + (unsigned long)__le32_to_cpu(sb->parts[i].blocks_num), + (unsigned long)__le32_to_cpu(sb->parts[i].first_sect_lba), + sb->parts[i].part_type); + +} + +#endif /*MDASSEMBLE */ + +static int load_super_mbr(struct supertype *st, int fd, char *devname) +{ + /* try to read an mbr + * Return + * 0 on success + * 1 cannot get record + * 2 record is meaningless + */ + struct MBR *super; + + free_mbr(st); + + if (posix_memalign((void**)&super, 512, 512) != 0) { + pr_err("could not allocate superblock\n"); + return 1; + } + + lseek(fd, 0, 0); + if (read(fd, super, sizeof(*super)) != sizeof(*super)) { + if (devname) + pr_err("Cannot read partition table on %s\n", + devname); + free(super); + return 1; + } + + if (super->magic != MBR_SIGNATURE_MAGIC) { + if (devname) + pr_err("No partition table found on %s\n", + devname); + free(super); + return 1; + } + + st->sb = super; + + if (st->ss == NULL) { + st->ss = &mbr; + st->minor_version = 0; + st->max_devs = 1; + st->info = NULL; + } + return 0; +} + +static int store_mbr(struct supertype *st, int fd) +{ + struct MBR *old, *super; + + if (posix_memalign((void**)&old, 512, 512) != 0) { + pr_err("could not allocate superblock\n"); + return 1; + } + + lseek(fd, 0, 0); + if (read(fd, old, sizeof(*old)) != sizeof(*old)) { + free(old); + return 1; + } + + super = st->sb; + memcpy(super->pad, old->pad, sizeof(super->pad)); + free(old); + lseek(fd, 0, 0); + if (write(fd, 
super, sizeof(*super)) != sizeof(*super)) + return 4; + fsync(fd); + ioctl(fd, BLKRRPART, 0); + return 0; +} + +static void getinfo_mbr(struct supertype *st, struct mdinfo *info, char *map) +{ + struct MBR *sb = st->sb; + int i; + + memset(&info->array, 0, sizeof(info->array)); + memset(&info->disk, 0, sizeof(info->disk)); + strcpy(info->text_version, "mbr"); + strcpy(info->name, "mbr"); + info->component_size = 0; + + for (i = 0; i < MBR_PARTITIONS ; i++) /* track the highest end sector of any used partition */ + if (sb->parts[i].blocks_num) { + unsigned long long last = /* 64-bit: the sum of two 32-bit values can exceed 32 bits */ + (unsigned long long)__le32_to_cpu(sb->parts[i].blocks_num) + + (unsigned long long)__le32_to_cpu(sb->parts[i].first_sect_lba); + if (last > info->component_size) + info->component_size = last; + } + +} + +static struct supertype *match_metadata_desc(char *arg) +{ + struct supertype *st; + + if (strcmp(arg, "mbr") != 0) + return NULL; + + st = xmalloc(sizeof(*st)); + st->ss = &mbr; + st->info = NULL; + st->minor_version = 0; + st->max_devs = 1; + st->sb = NULL; + return st; +} + +#ifndef MDASSEMBLE +static int validate_geometry(struct supertype *st, int level, + int layout, int raiddisks, + int *chunk, unsigned long long size, + unsigned long long data_offset, + char *subdev, unsigned long long *freesize, + int verbose) +{ + pr_err("mbr metadata cannot be used this way\n"); + return 0; +} +#endif + +struct superswitch mbr = { +#ifndef MDASSEMBLE + .examine_super = examine_mbr, + .validate_geometry = validate_geometry, +#endif + .match_metadata_desc = match_metadata_desc, + .load_super = load_super_mbr, + .store_super = store_mbr, + .getinfo_super = getinfo_mbr, + .free_super = free_mbr, + .name = "mbr", +}; diff --git a/super0.c b/super0.c new file mode 100644 index 00000000..59a6a034 --- /dev/null +++ b/super0.c @@ -0,0 +1,1332 @@ +/* + * mdadm - manage Linux "md" devices aka RAID arrays. 
+ * + * Copyright (C) 2001-2009 Neil Brown + * + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Author: Neil Brown + * Email: + */ + +#define HAVE_STDINT_H 1 +#include "mdadm.h" +#include "sha1.h" +/* + * All handling for the 0.90.0 version superblock is in + * this file. + * This includes: + * - finding, loading, and writing the superblock. + * - initialising a new superblock + * - printing the superblock for --examine + * - printing part of the superblock for --detail + * .. other stuff + */ + +static unsigned long calc_sb0_csum(mdp_super_t *super) +{ + unsigned long csum = super->sb_csum; + unsigned long newcsum; + super->sb_csum= 0 ; + newcsum = calc_csum(super, MD_SB_BYTES); + super->sb_csum = csum; + return newcsum; +} + +static void super0_swap_endian(struct mdp_superblock_s *sb) +{ + /* as super0 superblocks are host-endian, it is sometimes + * useful to be able to swap the endianness + * as (almost) everything is u32's we byte-swap every 4byte + * number. 
+ * We then also have to swap the events_hi and events_lo + */ + char *sbc = (char *)sb; + __u32 t32; + int i; + + for (i=0; i < MD_SB_BYTES ; i+=4) { + char t = sbc[i]; + sbc[i] = sbc[i+3]; + sbc[i+3] = t; + t=sbc[i+1]; + sbc[i+1]=sbc[i+2]; + sbc[i+2]=t; + } + t32 = sb->events_hi; + sb->events_hi = sb->events_lo; + sb->events_lo = t32; + + t32 = sb->cp_events_hi; + sb->cp_events_hi = sb->cp_events_lo; + sb->cp_events_lo = t32; + +} + +#ifndef MDASSEMBLE + +static void examine_super0(struct supertype *st, char *homehost) +{ + mdp_super_t *sb = st->sb; + time_t atime; + int d; + int delta_extra = 0; + char *c; + + printf(" Magic : %08x\n", sb->md_magic); + printf(" Version : %d.%02d.%02d\n", sb->major_version, sb->minor_version, + sb->patch_version); + if (sb->minor_version >= 90) { + printf(" UUID : %08x:%08x:%08x:%08x", sb->set_uuid0, sb->set_uuid1, + sb->set_uuid2, sb->set_uuid3); + if (homehost) { + char buf[20]; + void *hash = sha1_buffer(homehost, + strlen(homehost), + buf); + if (memcmp(&sb->set_uuid2, hash, 8)==0) + printf(" (local to host %s)", homehost); + } + printf("\n"); + } else + printf(" UUID : %08x\n", sb->set_uuid0); + + if (sb->not_persistent) + printf(" Eedk : not persistent\n"); + + atime = sb->ctime; + printf(" Creation Time : %.24s\n", ctime(&atime)); + c=map_num(pers, sb->level); + printf(" Raid Level : %s\n", c?c:"-unknown-"); + if ((int)sb->level > 0) { + int ddsks = 0, ddsks_denom = 1; + printf(" Used Dev Size : %d%s\n", sb->size, + human_size((long long)sb->size<<10)); + switch(sb->level) { + case 1: ddsks=1;break; + case 4: + case 5: ddsks = sb->raid_disks-1; break; + case 6: ddsks = sb->raid_disks-2; break; + case 10: ddsks = sb->raid_disks; + ddsks_denom = (sb->layout&255) * ((sb->layout>>8)&255); + } + if (ddsks) { + long long asize = sb->size; + asize = (asize << 10) * ddsks / ddsks_denom; + printf(" Array Size : %llu%s\n", + asize >> 10, human_size(asize)); + } + } + printf(" Raid Devices : %d\n", sb->raid_disks); + printf(" Total 
Devices : %d\n", sb->nr_disks); + printf("Preferred Minor : %d\n", sb->md_minor); + printf("\n"); + if (sb->minor_version > 90 && (sb->reshape_position+1) != 0) { + printf(" Reshape pos'n : %llu%s\n", (unsigned long long)sb->reshape_position/2, human_size((long long)sb->reshape_position<<9)); + if (sb->delta_disks) { + printf(" Delta Devices : %d", sb->delta_disks); + printf(" (%d->%d)\n", sb->raid_disks-sb->delta_disks, sb->raid_disks); + if (((int)sb->delta_disks) < 0) + delta_extra = - sb->delta_disks; + } + if (sb->new_level != sb->level) { + c = map_num(pers, sb->new_level); + printf(" New Level : %s\n", c?c:"-unknown-"); + } + if (sb->new_layout != sb->layout) { + if (sb->level == 5) { + c = map_num(r5layout, sb->new_layout); + printf(" New Layout : %s\n", c?c:"-unknown-"); + } + if (sb->level == 6) { + c = map_num(r6layout, sb->new_layout); + printf(" New Layout : %s\n", c?c:"-unknown-"); + } + if (sb->level == 10) { + printf(" New Layout : near=%d, %s=%d\n", + sb->new_layout&255, + (sb->new_layout&0x10000)?"offset":"far", + (sb->new_layout>>8)&255); + } + } + if (sb->new_chunk != sb->chunk_size) + printf(" New Chunksize : %d\n", sb->new_chunk); + printf("\n"); + } + atime = sb->utime; + printf(" Update Time : %.24s\n", ctime(&atime)); + printf(" State : %s\n", + (sb->state&(1<state & (1<active_disks); + printf("Working Devices : %d\n", sb->working_disks); + printf(" Failed Devices : %d\n", sb->failed_disks); + printf(" Spare Devices : %d\n", sb->spare_disks); + if (calc_sb0_csum(sb) == sb->sb_csum) + printf(" Checksum : %x - correct\n", sb->sb_csum); + else + printf(" Checksum : %x - expected %lx\n", sb->sb_csum, calc_sb0_csum(sb)); + printf(" Events : %llu\n", + ((unsigned long long)sb->events_hi << 32) + + sb->events_lo); + printf("\n"); + if (sb->level == 5) { + c = map_num(r5layout, sb->layout); + printf(" Layout : %s\n", c?c:"-unknown-"); + } + if (sb->level == 6) { + c = map_num(r6layout, sb->layout); + printf(" Layout : %s\n", c?c:"-unknown-"); + } + 
if (sb->level == 10) { + printf(" Layout :"); + print_r10_layout(sb->layout); + printf("\n"); + } + switch(sb->level) { + case 0: + case 4: + case 5: + case 6: + case 10: + printf(" Chunk Size : %dK\n", sb->chunk_size/1024); + break; + case -1: + printf(" Rounding : %dK\n", sb->chunk_size/1024); + break; + default: break; + } + printf("\n"); + printf(" Number Major Minor RaidDevice State\n"); + for (d= -1; d<(signed int)(sb->raid_disks+delta_extra + sb->spare_disks); d++) { + mdp_disk_t *dp; + char *dv; + char nb[5]; + int wonly; + if (d>=0) dp = &sb->disks[d]; + else dp = &sb->this_disk; + snprintf(nb, sizeof(nb), "%4d", d); + printf("%4s %5d %5d %5d %5d ", d < 0 ? "this" : nb, + dp->number, dp->major, dp->minor, dp->raid_disk); + wonly = dp->state & (1<state &= ~(1<state & (1<state & (1<state & (1<state & (1<state == 0) printf(" spare"); + if ((dv=map_dev(dp->major, dp->minor, 0))) + printf(" %s", dv); + printf("\n"); + if (d == -1) printf("\n"); + } +} + +static void brief_examine_super0(struct supertype *st, int verbose) +{ + mdp_super_t *sb = st->sb; + char *c=map_num(pers, sb->level); + char devname[20]; + + sprintf(devname, "/dev/md%d", sb->md_minor); + + if (verbose) { + printf("ARRAY %s level=%s num-devices=%d", + devname, + c?c:"-unknown-", sb->raid_disks); + } else + printf("ARRAY %s", devname); + + if (sb->minor_version >= 90) + printf(" UUID=%08x:%08x:%08x:%08x", sb->set_uuid0, sb->set_uuid1, + sb->set_uuid2, sb->set_uuid3); + else + printf(" UUID=%08x", sb->set_uuid0); + printf("\n"); +} + +static void export_examine_super0(struct supertype *st) +{ + mdp_super_t *sb = st->sb; + + printf("MD_LEVEL=%s\n", map_num(pers, sb->level)); + printf("MD_DEVICES=%d\n", sb->raid_disks); + if (sb->minor_version >= 90) + printf("MD_UUID=%08x:%08x:%08x:%08x\n", + sb->set_uuid0, sb->set_uuid1, + sb->set_uuid2, sb->set_uuid3); + else + printf("MD_UUID=%08x\n", sb->set_uuid0); + printf("MD_UPDATE_TIME=%llu\n", + __le64_to_cpu(sb->ctime) & 0xFFFFFFFFFFULL); + 
printf("MD_EVENTS=%llu\n", + ((unsigned long long)sb->events_hi << 32) + + sb->events_lo); +} + +static int copy_metadata0(struct supertype *st, int from, int to) +{ + /* Read 64K from the appropriate offset of 'from' + * and if it looks a little like a 0.90 superblock, + * write it to the same offset of 'to' + */ + void *buf; + unsigned long long dsize, offset; + const int bufsize = 64*1024; + mdp_super_t *super; + + if (posix_memalign(&buf, 4096, bufsize) != 0) + return 1; + + if (!get_dev_size(from, NULL, &dsize)) + goto err; + + if (dsize < MD_RESERVED_SECTORS*512) + goto err; + + offset = MD_NEW_SIZE_SECTORS(dsize>>9); + + offset *= 512; + + if (lseek64(from, offset, 0) < 0LL) + goto err; + if (read(from, buf, bufsize) != bufsize) + goto err; + + if (lseek64(to, offset, 0) < 0LL) + goto err; + super = buf; + if (super->md_magic != MD_SB_MAGIC || + super->major_version != 0 || + calc_sb0_csum(super) != super->sb_csum) + goto err; + if (write(to, buf, bufsize) != bufsize) + goto err; + free(buf); + return 0; +err: + free(buf); + return 1; +} + +static void detail_super0(struct supertype *st, char *homehost) +{ + mdp_super_t *sb = st->sb; + printf(" UUID : "); + if (sb->minor_version >= 90) + printf("%08x:%08x:%08x:%08x", sb->set_uuid0, sb->set_uuid1, + sb->set_uuid2, sb->set_uuid3); + else + printf("%08x", sb->set_uuid0); + if (homehost) { + char buf[20]; + void *hash = sha1_buffer(homehost, + strlen(homehost), + buf); + if (memcmp(&sb->set_uuid2, hash, 8)==0) + printf(" (local to host %s)", homehost); + } + printf("\n Events : %d.%d\n\n", sb->events_hi, sb->events_lo); +} + +static void brief_detail_super0(struct supertype *st) +{ + mdp_super_t *sb = st->sb; + printf(" UUID="); + if (sb->minor_version >= 90) + printf("%08x:%08x:%08x:%08x", sb->set_uuid0, sb->set_uuid1, + sb->set_uuid2, sb->set_uuid3); + else + printf("%08x", sb->set_uuid0); +} +#endif + +static int match_home0(struct supertype *st, char *homehost) +{ + mdp_super_t *sb = st->sb; + char buf[20]; 
+ char *hash; + + if (!homehost) + return 0; + hash = sha1_buffer(homehost, + strlen(homehost), + buf); + + return (memcmp(&sb->set_uuid2, hash, 8)==0); +} + +static void uuid_from_super0(struct supertype *st, int uuid[4]) +{ + mdp_super_t *super = st->sb; + uuid[0] = super->set_uuid0; + if (super->minor_version >= 90) { + uuid[1] = super->set_uuid1; + uuid[2] = super->set_uuid2; + uuid[3] = super->set_uuid3; + } else { + uuid[1] = 0; + uuid[2] = 0; + uuid[3] = 0; + } +} + +static void getinfo_super0(struct supertype *st, struct mdinfo *info, char *map) +{ + mdp_super_t *sb = st->sb; + int working = 0; + int i; + int map_disks = info->array.raid_disks; + + memset(info, 0, sizeof(*info)); + info->array.major_version = sb->major_version; + info->array.minor_version = sb->minor_version; + info->array.patch_version = sb->patch_version; + info->array.raid_disks = sb->raid_disks; + info->array.level = sb->level; + info->array.layout = sb->layout; + info->array.md_minor = sb->md_minor; + info->array.ctime = sb->ctime; + info->array.utime = sb->utime; + info->array.chunk_size = sb->chunk_size; + info->array.state = sb->state; + info->component_size = sb->size; + info->component_size *= 2; + + if (sb->state & (1<bitmap_offset = 8; + + info->disk.state = sb->this_disk.state; + info->disk.major = sb->this_disk.major; + info->disk.minor = sb->this_disk.minor; + info->disk.raid_disk = sb->this_disk.raid_disk; + info->disk.number = sb->this_disk.number; + + info->events = md_event(sb); + info->data_offset = 0; + + sprintf(info->text_version, "0.%d", sb->minor_version); + info->safe_mode_delay = 200; + + uuid_from_super0(st, info->uuid); + + info->recovery_start = MaxSector; + if (sb->minor_version > 90 && (sb->reshape_position+1) != 0) { + info->reshape_active = 1; + info->reshape_progress = sb->reshape_position; + info->new_level = sb->new_level; + info->delta_disks = sb->delta_disks; + info->new_layout = sb->new_layout; + info->new_chunk = sb->new_chunk; + if 
(info->delta_disks < 0) + info->array.raid_disks -= info->delta_disks; + } else + info->reshape_active = 0; + + info->recovery_blocked = info->reshape_active; + + sprintf(info->name, "%d", sb->md_minor); + /* work_disks is calculated rather than read directly */ + for (i=0; i < MD_SB_DISKS; i++) + if ((sb->disks[i].state & (1<disks[i].raid_disk < (unsigned)info->array.raid_disks) && + (sb->disks[i].state & (1<disks[i].state & (1<array.working_disks = working; +} + +static struct mdinfo *container_content0(struct supertype *st, char *subarray) +{ + struct mdinfo *info; + + if (subarray) + return NULL; + + info = xmalloc(sizeof(*info)); + getinfo_super0(st, info, NULL); + return info; +} + +static int update_super0(struct supertype *st, struct mdinfo *info, + char *update, + char *devname, int verbose, + int uuid_set, char *homehost) +{ + /* NOTE: for 'assemble' and 'force' we need to return non-zero + * if any change was made. For others, the return value is + * ignored. + */ + int rv = 0; + int uuid[4]; + mdp_super_t *sb = st->sb; + + if (strcmp(update, "homehost") == 0 && + homehost) { + /* note that 'homehost' is special as it is really + * a "uuid" update. 
+ */ + uuid_set = 0; + update = "uuid"; + info->uuid[0] = sb->set_uuid0; + info->uuid[1] = sb->set_uuid1; + } + + if (strcmp(update, "sparc2.2")==0 ) { + /* 2.2 sparc put the events in the wrong place + * So we copy the tail of the superblock + * up 4 bytes before continuing + */ + __u32 *sb32 = (__u32*)sb; + memcpy(sb32+MD_SB_GENERIC_CONSTANT_WORDS+7, + sb32+MD_SB_GENERIC_CONSTANT_WORDS+7+1, + (MD_SB_WORDS - (MD_SB_GENERIC_CONSTANT_WORDS+7+1))*4); + if (verbose >= 0) + pr_err("adjusting superblock of %s for 2.2/sparc compatibility.\n", + devname); + } else if (strcmp(update, "super-minor") ==0) { + sb->md_minor = info->array.md_minor; + if (verbose > 0) + pr_err("updating superblock of %s with minor number %d\n", + devname, info->array.md_minor); + } else if (strcmp(update, "summaries") == 0) { + unsigned int i; + /* set nr_disks, active_disks, working_disks, + * failed_disks, spare_disks based on disks[] + * array in superblock. + * Also make sure extra slots aren't 'failed' + */ + sb->nr_disks = sb->active_disks = + sb->working_disks = sb->failed_disks = + sb->spare_disks = 0; + for (i=0; i < MD_SB_DISKS ; i++) + if (sb->disks[i].major || + sb->disks[i].minor) { + int state = sb->disks[i].state; + if (state & (1<nr_disks++; + if (state & (1<active_disks++; + if (state & (1<failed_disks++; + else + sb->working_disks++; + if (state == 0) + sb->spare_disks++; + } else if (i >= sb->raid_disks && sb->disks[i].number == 0) + sb->disks[i].state = 0; + } else if (strcmp(update, "force-one")==0) { + /* Not enough devices for a working array, so + * bring this one up-to-date. 
+ */ + __u32 ehi = sb->events_hi, elo = sb->events_lo; + sb->events_hi = (info->events>>32) & 0xFFFFFFFF; + sb->events_lo = (info->events) & 0xFFFFFFFF; + if (sb->events_hi != ehi || + sb->events_lo != elo) + rv = 1; + } else if (strcmp(update, "force-array")==0) { + /* degraded array and 'force' requested, so + * maybe need to mark it 'clean' + */ + if ((sb->level == 5 || sb->level == 4 || sb->level == 6) && + (sb->state & (1 << MD_SB_CLEAN)) == 0) { + /* need to force clean */ + sb->state |= (1 << MD_SB_CLEAN); + rv = 1; + } + } else if (strcmp(update, "assemble")==0) { + int d = info->disk.number; + int wonly = sb->disks[d].state & (1<minor_version >= 91) + /* During reshape we don't insist on everything + * being marked 'sync' + */ + add = (1<disks[d].state & ~mask) | add) + != (unsigned)info->disk.state) { + sb->disks[d].state = info->disk.state | wonly; + rv = 1; + } + if (info->reshape_active && + sb->minor_version > 90 && (sb->reshape_position+1) != 0 && + info->delta_disks >= 0 && + info->reshape_progress < sb->reshape_position) { + sb->reshape_position = info->reshape_progress; + rv = 1; + } + if (info->reshape_active && + sb->minor_version > 90 && (sb->reshape_position+1) != 0 && + info->delta_disks < 0 && + info->reshape_progress > sb->reshape_position) { + sb->reshape_position = info->reshape_progress; + rv = 1; + } + } else if (strcmp(update, "linear-grow-new") == 0) { + memset(&sb->disks[info->disk.number], 0, sizeof(sb->disks[0])); + sb->disks[info->disk.number].number = info->disk.number; + sb->disks[info->disk.number].major = info->disk.major; + sb->disks[info->disk.number].minor = info->disk.minor; + sb->disks[info->disk.number].raid_disk = info->disk.raid_disk; + sb->disks[info->disk.number].state = info->disk.state; + sb->this_disk = sb->disks[info->disk.number]; + } else if (strcmp(update, "linear-grow-update") == 0) { + sb->raid_disks = info->array.raid_disks; + sb->nr_disks = info->array.nr_disks; + sb->active_disks = 
info->array.active_disks; + sb->working_disks = info->array.working_disks; + memset(&sb->disks[info->disk.number], 0, sizeof(sb->disks[0])); + sb->disks[info->disk.number].number = info->disk.number; + sb->disks[info->disk.number].major = info->disk.major; + sb->disks[info->disk.number].minor = info->disk.minor; + sb->disks[info->disk.number].raid_disk = info->disk.raid_disk; + sb->disks[info->disk.number].state = info->disk.state; + } else if (strcmp(update, "resync") == 0) { + /* make sure resync happens */ + sb->state &= ~(1<recovery_cp = 0; + } else if (strcmp(update, "uuid") == 0) { + if (!uuid_set && homehost) { + char buf[20]; + char *hash = sha1_buffer(homehost, + strlen(homehost), + buf); + memcpy(info->uuid+2, hash, 8); + } + sb->set_uuid0 = info->uuid[0]; + sb->set_uuid1 = info->uuid[1]; + sb->set_uuid2 = info->uuid[2]; + sb->set_uuid3 = info->uuid[3]; + if (sb->state & (1<uuid, uuid, 16); + } + } else if (strcmp(update, "metadata") == 0) { + /* Create some v1.0 metadata to match ours but make the + * ctime bigger. Also update info->array.*_version. + * We need to arrange that store_super writes out + * the v1.0 metadata. + * Not permitted for unclean array, or array with + * bitmap. + */ + if (info->bitmap_offset) { + pr_err("Cannot update metadata when bitmap is present\n"); + rv = -2; + } else if (info->array.state != 1) { + pr_err("Cannot update metadata on unclean array\n"); + rv = -2; + } else { + info->array.major_version = 1; + info->array.minor_version = 0; + uuid_from_super0(st, info->uuid); + st->other = super1_make_v0(st, info, st->sb); + } + } else if (strcmp(update, "revert-reshape") == 0) { + rv = -2; + if (sb->minor_version <= 90) + pr_err("No active reshape to revert on %s\n", + devname); + else if (sb->delta_disks == 0) + pr_err("%s: Can only revert reshape which changes number of devices\n", + devname); + else { + int tmp; + int parity = sb->level == 6 ? 
2 : 1; + rv = 0; + + if (sb->level >= 4 && sb->level <= 6 && + sb->reshape_position % ( + sb->new_chunk/512 * + (sb->raid_disks - sb->delta_disks - parity))) { + pr_err("Reshape position is not suitably aligned.\n"); + pr_err("Try normal assembly and stop again\n"); + return -2; + } + sb->raid_disks -= sb->delta_disks; + sb->delta_disks = -sb->delta_disks; + + tmp = sb->new_layout; + sb->new_layout = sb->layout; + sb->layout = tmp; + + tmp = sb->new_chunk; + sb->new_chunk = sb->chunk_size; + sb->chunk_size = tmp; + } + } else if (strcmp(update, "no-bitmap") == 0) { + sb->state &= ~(1<reshape_position = info->reshape_progress; + else if (strcmp(update, "writemostly")==0) + sb->state |= (1<state &= ~(1<sb_csum = calc_sb0_csum(sb); + return rv; +} + +/* + * For version-0 superblock, the homehost is 'stored' in the + * uuid. 8 bytes for a hash of the host leaving 8 bytes + * of random material. + * We use the first 8 bytes (64bits) of the sha1 of the + * host name + */ + +static int init_super0(struct supertype *st, mdu_array_info_t *info, + unsigned long long size, char *ignored_name, char *homehost, + int *uuid, unsigned long long data_offset) +{ + mdp_super_t *sb; + int spares; + + if (data_offset != INVALID_SECTORS) { + pr_err("data-offset not support for 0.90\n"); + return 0; + } + + if (posix_memalign((void**)&sb, 4096, + MD_SB_BYTES + ROUND_UP(sizeof(bitmap_super_t), 4096)) != 0) { + pr_err("could not allocate superblock\n"); + return 0; + } + memset(sb, 0, MD_SB_BYTES + sizeof(bitmap_super_t)); + + st->sb = sb; + if (info == NULL) { + /* zeroing the superblock */ + return 0; + } + + spares = info->working_disks - info->active_disks; + if (info->raid_disks + spares > MD_SB_DISKS) { + pr_err("too many devices requested: %d+%d > %d\n", + info->raid_disks , spares, MD_SB_DISKS); + return 0; + } + + sb->md_magic = MD_SB_MAGIC; + sb->major_version = 0; + sb->minor_version = 90; + sb->patch_version = 0; + sb->gvalid_words = 0; /* ignored */ + sb->ctime = time(0); + 
sb->level = info->level; + sb->size = size; + if (size != (unsigned long long)sb->size) + return 0; + sb->nr_disks = info->nr_disks; + sb->raid_disks = info->raid_disks; + sb->md_minor = info->md_minor; + sb->not_persistent = 0; + if (uuid) { + sb->set_uuid0 = uuid[0]; + sb->set_uuid1 = uuid[1]; + sb->set_uuid2 = uuid[2]; + sb->set_uuid3 = uuid[3]; + } else { + int rfd = open("/dev/urandom", O_RDONLY); + if (rfd < 0 || read(rfd, &sb->set_uuid0, 4) != 4) + sb->set_uuid0 = random(); + if (rfd < 0 || read(rfd, &sb->set_uuid1, 12) != 12) { + sb->set_uuid1 = random(); + sb->set_uuid2 = random(); + sb->set_uuid3 = random(); + } + if (rfd >= 0) + close(rfd); + } + if (homehost && !uuid) { + char buf[20]; + char *hash = sha1_buffer(homehost, + strlen(homehost), + buf); + memcpy(&sb->set_uuid2, hash, 8); + } + + sb->utime = sb->ctime; + sb->state = info->state; + sb->active_disks = info->active_disks; + sb->working_disks = info->working_disks; + sb->failed_disks = info->failed_disks; + sb->spare_disks = info->spare_disks; + sb->events_hi = 0; + sb->events_lo = 1; + + sb->layout = info->layout; + sb->chunk_size = info->chunk_size; + + return 1; +} + +struct devinfo { + int fd; + char *devname; + mdu_disk_info_t disk; + struct devinfo *next; +}; + +#ifndef MDASSEMBLE +/* Add a device to the superblock being created */ +static int add_to_super0(struct supertype *st, mdu_disk_info_t *dinfo, + int fd, char *devname, unsigned long long data_offset) +{ + mdp_super_t *sb = st->sb; + mdp_disk_t *dk = &sb->disks[dinfo->number]; + struct devinfo *di, **dip; + + dk->number = dinfo->number; + dk->major = dinfo->major; + dk->minor = dinfo->minor; + dk->raid_disk = dinfo->raid_disk; + dk->state = dinfo->state & ((1<this_disk = sb->disks[dinfo->number]; + sb->sb_csum = calc_sb0_csum(sb); + + dip = (struct devinfo **)&st->info; + while (*dip) + dip = &(*dip)->next; + di = xmalloc(sizeof(struct devinfo)); + di->fd = fd; + di->devname = devname; + di->disk = *dinfo; + di->next = NULL; + *dip 
= di; + + return 0; +} +#endif + +static int store_super0(struct supertype *st, int fd) +{ + unsigned long long dsize; + unsigned long long offset; + mdp_super_t *super = st->sb; + + if (!get_dev_size(fd, NULL, &dsize)) + return 1; + + if (dsize < MD_RESERVED_SECTORS*512) + return 2; + + if (st->other) { + /* Writing out v1.0 metadata for --update=metadata */ + int ret = 0; + + offset = dsize/512 - 8*2; + offset &= ~(4*2-1); + offset *= 512; + if (lseek64(fd, offset, 0)< 0LL) + ret = 3; + else if (write(fd, st->other, 1024) != 1024) + ret = 4; + else + fsync(fd); + free(st->other); + st->other = NULL; + return ret; + } + + offset = MD_NEW_SIZE_SECTORS(dsize>>9); + + offset *= 512; + + if (lseek64(fd, offset, 0)< 0LL) + return 3; + + if (write(fd, super, sizeof(*super)) != sizeof(*super)) + return 4; + + if (super->state & (1<magic) == BITMAP_MAGIC) + if (write(fd, bm, ROUND_UP(sizeof(*bm),4096)) != + ROUND_UP(sizeof(*bm),4096)) + return 5; + } + + fsync(fd); + return 0; +} + +#ifndef MDASSEMBLE +static int write_init_super0(struct supertype *st) +{ + mdp_super_t *sb = st->sb; + int rv = 0; + struct devinfo *di; + + for (di = st->info ; di && ! 
rv ; di = di->next) { + + if (di->disk.state & (1 << MD_DISK_FAULTY)) + continue; + if (di->fd == -1) + continue; + while (Kill(di->devname, NULL, 0, -1, 1) == 0) + ; + + sb->disks[di->disk.number].state &= ~(1<this_disk = sb->disks[di->disk.number]; + sb->sb_csum = calc_sb0_csum(sb); + rv = store_super0(st, di->fd); + + if (rv == 0 && (sb->state & (1<ss->write_bitmap(st, di->fd, NoUpdate); + + if (rv) + pr_err("failed to write superblock to %s\n", + di->devname); + } + return rv; +} +#endif + +static int compare_super0(struct supertype *st, struct supertype *tst) +{ + /* + * return: + * 0 same, or first was empty, and second was copied + * 1 second had wrong number + * 2 wrong uuid + * 3 wrong other info + */ + mdp_super_t *first = st->sb; + mdp_super_t *second = tst->sb; + int uuid1[4], uuid2[4]; + + if (second->md_magic != MD_SB_MAGIC) + return 1; + if (!first) { + if (posix_memalign((void**)&first, 4096, + MD_SB_BYTES + + ROUND_UP(sizeof(struct bitmap_super_s), 4096)) != 0) { + pr_err("could not allocate superblock\n"); + return 1; + } + memcpy(first, second, MD_SB_BYTES + sizeof(struct bitmap_super_s)); + st->sb = first; + return 0; + } + + uuid_from_super0(st, uuid1); + uuid_from_super0(tst, uuid2); + if (!same_uuid(uuid1, uuid2, 0)) + return 2; + if (first->major_version != second->major_version || + first->minor_version != second->minor_version || + first->patch_version != second->patch_version || + first->gvalid_words != second->gvalid_words || + first->ctime != second->ctime || + first->level != second->level || + first->size != second->size || + first->raid_disks != second->raid_disks ) + return 3; + + return 0; +} + +static void free_super0(struct supertype *st); + +static int load_super0(struct supertype *st, int fd, char *devname) +{ + /* try to read in the superblock + * Return: + * 0 on success + * 1 on cannot get superblock + * 2 on superblock meaningless + */ + unsigned long long dsize; + unsigned long long offset; + mdp_super_t *super; + int 
uuid[4]; + struct bitmap_super_s *bsb; + + free_super0(st); + + if (!get_dev_size(fd, devname, &dsize)) + return 1; + + if (dsize < MD_RESERVED_SECTORS*512) { + if (devname) + pr_err("%s is too small for md: size is %llu sectors.\n", + devname, dsize); + return 1; + } + st->devsize = dsize; + + offset = MD_NEW_SIZE_SECTORS(dsize>>9); + + offset *= 512; + + if (lseek64(fd, offset, 0)< 0LL) { + if (devname) + pr_err("Cannot seek to superblock on %s: %s\n", + devname, strerror(errno)); + return 1; + } + + if (posix_memalign((void**)&super, 4096, + MD_SB_BYTES + + ROUND_UP(sizeof(bitmap_super_t), 4096)) != 0) { + pr_err("could not allocate superblock\n"); + return 1; + } + + if (read(fd, super, sizeof(*super)) != MD_SB_BYTES) { + if (devname) + pr_err("Cannot read superblock on %s\n", + devname); + free(super); + return 1; + } + + if (st->ss && st->minor_version == 9) + super0_swap_endian(super); + + if (super->md_magic != MD_SB_MAGIC) { + if (devname) + pr_err("No super block found on %s (Expected magic %08x, got %08x)\n", + devname, MD_SB_MAGIC, super->md_magic); + free(super); + return 2; + } + + if (super->major_version != 0) { + if (devname) + pr_err("Cannot interpret superblock on %s - version is %d\n", + devname, super->major_version); + free(super); + return 2; + } + st->sb = super; + + if (st->ss == NULL) { + st->ss = &super0; + st->minor_version = super->minor_version; + st->max_devs = MD_SB_DISKS; + st->info = NULL; + } + + /* Now check on the bitmap superblock */ + if ((super->state & (1<magic) != BITMAP_MAGIC || + memcmp(bsb->uuid, uuid, 16) != 0) + goto no_bitmap; + return 0; + + no_bitmap: + super->state &= ~(1<container_devnm[0] = 0; + st->ss = &super0; + st->info = NULL; + st->minor_version = 90; + st->max_devs = MD_SB_DISKS; + st->sb = NULL; + /* we sometimes get 00.90 */ + while (arg[0] == '0' && arg[1] == '0') + arg++; + if (strcmp(arg, "0") == 0 || +#ifdef DEFAULT_OLD_METADATA /* ifndef in super1.c */ + strcmp(arg, "default") == 0 || +#endif /* 
DEFAULT_OLD_METADATA */ + strcmp(arg, "0.90") == 0 || + strcmp(arg, "") == 0 /* no metadata - i.e. non_persistent */ + ) + return st; + + st->minor_version = 91; /* reshape in progress */ + if (strcmp(arg, "0.91") == 0) /* For dup_super support */ + return st; + + st->minor_version = 9; /* flag for 'byte-swapped' */ + if (strcmp(arg, "0.swap")==0 || + strcmp(arg, "0.9") == 0) /* For dup_super support */ + return st; + + free(st); + return NULL; +} + +static __u64 avail_size0(struct supertype *st, __u64 devsize, + unsigned long long data_offset) +{ + if (data_offset != 0 && data_offset != INVALID_SECTORS) + return 0ULL; + if (devsize < MD_RESERVED_SECTORS) + return 0ULL; + return MD_NEW_SIZE_SECTORS(devsize); +} + +static int add_internal_bitmap0(struct supertype *st, int *chunkp, + int delay, int write_behind, + unsigned long long size, int may_change, + int major) +{ + /* + * The bitmap comes immediately after the superblock and must be 60K in size + * at most. The default size is between 30K and 60K + * + * size is in sectors, chunk is in bytes !!! + */ + unsigned long long bits; + unsigned long long max_bits = (60*1024 - sizeof(bitmap_super_t))*8; + unsigned long long min_chunk; + int chunk = *chunkp; + mdp_super_t *sb = st->sb; + bitmap_super_t *bms = (bitmap_super_t*)(((char*)sb) + MD_SB_BYTES); + int uuid[4]; + + min_chunk = 4096; /* sub-page chunks don't work yet.. 
*/ + bits = (size * 512) / min_chunk + 1; + while (bits > max_bits) { + min_chunk *= 2; + bits = (bits+1)/2; + } + if (chunk == UnSet) { + /* A chunk size less than a few Megabytes gives poor + * performance without increasing resync noticeably + */ + chunk = min_chunk; + if (chunk < 64*1024*1024) + chunk = 64*1024*1024; + } else if ((unsigned long long)chunk < min_chunk) + return 0; /* chunk size too small */ + + sb->state |= (1<magic = __cpu_to_le32(BITMAP_MAGIC); + bms->version = __cpu_to_le32(major); + uuid_from_super0(st, uuid); + memcpy(bms->uuid, uuid, 16); + bms->chunksize = __cpu_to_le32(chunk); + bms->daemon_sleep = __cpu_to_le32(delay); + bms->sync_size = __cpu_to_le64(size); + bms->write_behind = __cpu_to_le32(write_behind); + *chunkp = chunk; + return 1; +} + +static int locate_bitmap0(struct supertype *st, int fd) +{ + unsigned long long dsize; + unsigned long long offset; + + if (!get_dev_size(fd, NULL, &dsize)) + return -1; + + if (dsize < MD_RESERVED_SECTORS*512) + return -1; + + offset = MD_NEW_SIZE_SECTORS(dsize>>9); + + offset *= 512; + + offset += MD_SB_BYTES; + + lseek64(fd, offset, 0); + return 0; +} + +static int write_bitmap0(struct supertype *st, int fd, enum bitmap_update update) +{ + unsigned long long dsize; + unsigned long long offset; + mdp_super_t *sb = st->sb; + + int rv = 0; + + int towrite, n; + void *buf; + + if (!get_dev_size(fd, NULL, &dsize)) + return 1; + + if (dsize < MD_RESERVED_SECTORS*512) + return -1; + + offset = MD_NEW_SIZE_SECTORS(dsize>>9); + + offset *= 512; + + if (lseek64(fd, offset + 4096, 0)< 0LL) + return 3; + + if (posix_memalign(&buf, 4096, 4096)) + return -ENOMEM; + + memset(buf, 0xff, 4096); + memcpy(buf, ((char*)sb)+MD_SB_BYTES, sizeof(bitmap_super_t)); + towrite = 60*1024; + while (towrite > 0) { + n = towrite; + if (n > 4096) + n = 4096; + n = write(fd, buf, n); + if (n > 0) + towrite -= n; + else + break; + memset(buf, 0xff, 4096); + } + fsync(fd); + if (towrite) + rv = -2; + + free(buf); + return rv; 
+} + +static void free_super0(struct supertype *st) +{ + if (st->sb) + free(st->sb); + while (st->info) { + struct devinfo *di = st->info; + st->info = di->next; + if (di->fd >= 0) + close(di->fd); + free(di); + } + st->sb = NULL; +} + +#ifndef MDASSEMBLE +static int validate_geometry0(struct supertype *st, int level, + int layout, int raiddisks, + int *chunk, unsigned long long size, + unsigned long long data_offset, + char *subdev, unsigned long long *freesize, + int verbose) +{ + unsigned long long ldsize; + int fd; + unsigned int tbmax = 4; + + /* prior to linux 3.1, a bug limits usable device size to 2TB. + * It was introduced in 2.6.29, but we won't worry about that detail + */ + if (get_linux_version() < 3001000) + tbmax = 2; + + if (level == LEVEL_CONTAINER) { + if (verbose) + pr_err("0.90 metadata does not support containers\n"); + return 0; + } + if (raiddisks > MD_SB_DISKS) { + if (verbose) + pr_err("0.90 metadata supports at most %d devices per array\n", + MD_SB_DISKS); + return 0; + } + if (size >= tbmax * 2ULL*1024*1024*1024) { + if (verbose) + pr_err("0.90 metadata supports at most %d terabytes per device\n", tbmax); + return 0; + } + if (*chunk == UnSet) + *chunk = DEFAULT_CHUNK; + + if (!subdev) + return 1; + + fd = open(subdev, O_RDONLY|O_EXCL, 0); + if (fd < 0) { + if (verbose) + pr_err("super0.90 cannot open %s: %s\n", + subdev, strerror(errno)); + return 0; + } + + if (!get_dev_size(fd, subdev, &ldsize)) { + close(fd); + return 0; + } + close(fd); + + if (ldsize < MD_RESERVED_SECTORS * 512) + return 0; + *freesize = MD_NEW_SIZE_SECTORS(ldsize >> 9); + return 1; +} +#endif /* MDASSEMBLE */ + +struct superswitch super0 = { +#ifndef MDASSEMBLE + .examine_super = examine_super0, + .brief_examine_super = brief_examine_super0, + .export_examine_super = export_examine_super0, + .detail_super = detail_super0, + .brief_detail_super = brief_detail_super0, + .write_init_super = write_init_super0, + .validate_geometry = validate_geometry0, + .add_to_super 
= add_to_super0, + .copy_metadata = copy_metadata0, +#endif + .match_home = match_home0, + .uuid_from_super = uuid_from_super0, + .getinfo_super = getinfo_super0, + .container_content = container_content0, + .update_super = update_super0, + .init_super = init_super0, + .store_super = store_super0, + .compare_super = compare_super0, + .load_super = load_super0, + .match_metadata_desc = match_metadata_desc0, + .avail_size = avail_size0, + .add_internal_bitmap = add_internal_bitmap0, + .locate_bitmap = locate_bitmap0, + .write_bitmap = write_bitmap0, + .free_super = free_super0, + .name = "0.90", +}; diff --git a/super1.c b/super1.c new file mode 100644 index 00000000..8bcaa2fe --- /dev/null +++ b/super1.c @@ -0,0 +1,2656 @@ +/* + * mdadm - manage Linux "md" devices aka RAID arrays. + * + * Copyright (C) 2001-2016 Neil Brown + * + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Author: Neil Brown + * Email: + */ + +#include +#include "mdadm.h" +/* + * The version-1 superblock : + * All numeric fields are little-endian. + * + * total size: 256 bytes plus 2 per device. + * 1K allows 384 devices. 
+ */ +struct mdp_superblock_1 { + /* constant array information - 128 bytes */ + __u32 magic; /* MD_SB_MAGIC: 0xa92b4efc - little endian */ + __u32 major_version; /* 1 */ + __u32 feature_map; /* 0 for now */ + __u32 pad0; /* always set to 0 when writing */ + + __u8 set_uuid[16]; /* user-space generated. */ + char set_name[32]; /* set and interpreted by user-space */ + + __u64 ctime; /* lo 40 bits are seconds, top 24 are microseconds or 0*/ + __u32 level; /* -4 (multipath), -1 (linear), 0,1,4,5 */ + __u32 layout; /* only for raid5 currently */ + __u64 size; /* used size of component devices, in 512byte sectors */ + + __u32 chunksize; /* in 512byte sectors */ + __u32 raid_disks; + __u32 bitmap_offset; /* sectors after start of superblock that bitmap starts + * NOTE: signed, so bitmap can be before superblock + * only meaningful if feature_map[0] is set. + */ + + /* These are only valid with feature bit '4' */ + __u32 new_level; /* new level we are reshaping to */ + __u64 reshape_position; /* next address in array-space for reshape */ + __u32 delta_disks; /* change in number of raid_disks */ + __u32 new_layout; /* new layout */ + __u32 new_chunk; /* new chunk size (sectors) */ + __u32 new_offset; /* signed number to add to data_offset in new + * layout. 0 == no-change. This can be + * different on each device in the array. 
+ */ + + /* constant this-device information - 64 bytes */ + __u64 data_offset; /* sector start of data, often 0 */ + __u64 data_size; /* sectors in this device that can be used for data */ + __u64 super_offset; /* sector start of this superblock */ + union { + __u64 recovery_offset;/* sectors before this offset (from data_offset) have been recovered */ + __u64 journal_tail;/* journal tail of journal device (from data_offset) */ + }; + __u32 dev_number; /* permanent identifier of this device - not role in raid */ + __u32 cnt_corrected_read; /* number of read errors that were corrected by re-writing */ + __u8 device_uuid[16]; /* user-space setable, ignored by kernel */ + __u8 devflags; /* per-device flags. Only one defined...*/ +#define WriteMostly1 1 /* mask for writemostly flag in above */ + /* bad block log. If there are any bad blocks the feature flag is set. + * if offset and size are non-zero, that space is reserved and available. + */ + __u8 bblog_shift; /* shift from sectors to block size for badblocklist */ + __u16 bblog_size; /* number of sectors reserved for badblocklist */ + __u32 bblog_offset; /* sector offset from superblock to bblog, signed */ + + /* array state information - 64 bytes */ + __u64 utime; /* 40 bits second, 24 btes microseconds */ + __u64 events; /* incremented when superblock updated */ + __u64 resync_offset; /* data before this offset (from data_offset) known to be in sync */ + __u32 sb_csum; /* checksum upto dev_roles[max_dev] */ + __u32 max_dev; /* size of dev_roles[] array to consider */ + __u8 pad3[64-32]; /* set to 0 when writing */ + + /* device state information. Indexed by dev_number. + * 2 bytes per device + * Note there are no per-device state flags. State information is rolled + * into the 'roles' value. If a device is spare or faulty, then it doesn't + * have a meaningful role. 
+ */ + __u16 dev_roles[0]; /* role in array, or 0xffff for a spare, or 0xfffe for faulty */ +}; + +#define MAX_SB_SIZE 4096 +/* bitmap super size is 256, but we round up to a sector for alignment */ +#define BM_SUPER_SIZE 512 +#define MAX_DEVS ((int)(MAX_SB_SIZE - sizeof(struct mdp_superblock_1)) / 2) +#define SUPER1_SIZE (MAX_SB_SIZE + BM_SUPER_SIZE \ + + sizeof(struct misc_dev_info)) + +struct misc_dev_info { + __u64 device_size; +}; + +/* feature_map bits */ +#define MD_FEATURE_BITMAP_OFFSET 1 +#define MD_FEATURE_RECOVERY_OFFSET 2 /* recovery_offset is present and + * must be honoured + */ +#define MD_FEATURE_RESHAPE_ACTIVE 4 +#define MD_FEATURE_BAD_BLOCKS 8 /* badblock list is not empty */ +#define MD_FEATURE_REPLACEMENT 16 /* This device is replacing an + * active device with same 'role'. + * 'recovery_offset' is also set. + */ +#define MD_FEATURE_RESHAPE_BACKWARDS 32 /* Reshape doesn't change number + * of devices, but is going + * backwards anyway. + */ +#define MD_FEATURE_NEW_OFFSET 64 /* new_offset must be honoured */ +#define MD_FEATURE_BITMAP_VERSIONED 256 /* bitmap version number checked properly */ +#define MD_FEATURE_JOURNAL 512 /* support write journal */ +#define MD_FEATURE_ALL (MD_FEATURE_BITMAP_OFFSET \ + |MD_FEATURE_RECOVERY_OFFSET \ + |MD_FEATURE_RESHAPE_ACTIVE \ + |MD_FEATURE_BAD_BLOCKS \ + |MD_FEATURE_REPLACEMENT \ + |MD_FEATURE_RESHAPE_BACKWARDS \ + |MD_FEATURE_NEW_OFFSET \ + |MD_FEATURE_BITMAP_VERSIONED \ + |MD_FEATURE_JOURNAL \ + ) + +#ifndef MDASSEMBLE +static int role_from_sb(struct mdp_superblock_1 *sb) +{ + unsigned int d; + int role; + + d = __le32_to_cpu(sb->dev_number); + if (d < __le32_to_cpu(sb->max_dev)) + role = __le16_to_cpu(sb->dev_roles[d]); + else + role = MD_DISK_ROLE_SPARE; + return role; +} +#endif + +/* return how many bytes are needed for bitmap, for cluster-md each node + * should have it's own bitmap */ +static unsigned int calc_bitmap_size(bitmap_super_t *bms, unsigned int boundary) +{ + unsigned long long bits, 
bytes; + + bits = __le64_to_cpu(bms->sync_size) / (__le32_to_cpu(bms->chunksize)>>9); + bytes = (bits+7) >> 3; + bytes += sizeof(bitmap_super_t); + bytes = ROUND_UP(bytes, boundary); + + return bytes; +} + +static unsigned int calc_sb_1_csum(struct mdp_superblock_1 * sb) +{ + unsigned int disk_csum, csum; + unsigned long long newcsum; + int size = sizeof(*sb) + __le32_to_cpu(sb->max_dev)*2; + unsigned int *isuper = (unsigned int*)sb; + +/* make sure I can count... */ + if (offsetof(struct mdp_superblock_1,data_offset) != 128 || + offsetof(struct mdp_superblock_1, utime) != 192 || + sizeof(struct mdp_superblock_1) != 256) { + fprintf(stderr, "WARNING - superblock isn't sized correctly\n"); + } + + disk_csum = sb->sb_csum; + sb->sb_csum = 0; + newcsum = 0; + for (; size>=4; size -= 4 ) { + newcsum += __le32_to_cpu(*isuper); + isuper++; + } + + if (size == 2) + newcsum += __le16_to_cpu(*(unsigned short*) isuper); + + csum = (newcsum & 0xffffffff) + (newcsum >> 32); + sb->sb_csum = disk_csum; + return __cpu_to_le32(csum); +} + +/* + * Information related to file descriptor used for aligned reads/writes. + * Cache the block size. + */ +struct align_fd { + int fd; + int blk_sz; +}; + +static void init_afd(struct align_fd *afd, int fd) +{ + afd->fd = fd; + + if (ioctl(afd->fd, BLKSSZGET, &afd->blk_sz) != 0) + afd->blk_sz = 512; +} + +static char abuf[4096+4096]; +static int aread(struct align_fd *afd, void *buf, int len) +{ + /* aligned read. 
+ * On devices with a 4K sector size, we need to read + * the full sector and copy relevant bits into + * the buffer + */ + int bsize, iosize; + char *b; + int n; + + bsize = afd->blk_sz; + + if (!bsize || bsize > 4096 || len > 4096) { + if (!bsize) + fprintf(stderr, "WARNING - aread() called with invalid block size\n"); + return -1; + } + b = ROUND_UP_PTR((char *)abuf, 4096); + + for (iosize = 0; iosize < len; iosize += bsize) + ; + n = read(afd->fd, b, iosize); + if (n <= 0) + return n; + lseek(afd->fd, len - n, 1); + if (n > len) + n = len; + memcpy(buf, b, n); + return n; +} + +static int awrite(struct align_fd *afd, void *buf, int len) +{ + /* aligned write. + * On devices with a 4K sector size, we need to write + * the full sector. We pre-read if the sector is larger + * than the write. + * The address must be sector-aligned. + */ + int bsize, iosize; + char *b; + int n; + + bsize = afd->blk_sz; + if (!bsize || bsize > 4096 || len > 4096) { + if (!bsize) + fprintf(stderr, "WARNING - awrite() called with invalid block size\n"); + return -1; + } + b = ROUND_UP_PTR((char *)abuf, 4096); + + for (iosize = 0; iosize < len ; iosize += bsize) + ; + + if (len != iosize) { + n = read(afd->fd, b, iosize); + if (n <= 0) + return n; + lseek(afd->fd, -n, 1); + } + + memcpy(b, buf, len); + n = write(afd->fd, b, iosize); + if (n <= 0) + return n; + lseek(afd->fd, len - n, 1); + return len; +} + +#ifndef MDASSEMBLE +static void examine_super1(struct supertype *st, char *homehost) +{ + struct mdp_superblock_1 *sb = st->sb; + bitmap_super_t *bms = (bitmap_super_t*)(((char*)sb)+MAX_SB_SIZE); + time_t atime; + unsigned int d; + int role; + int delta_extra = 0; + int i; + char *c; + int l = homehost ? 
strlen(homehost) : 0; + int layout; + unsigned long long sb_offset; + struct mdinfo info; + + printf(" Magic : %08x\n", __le32_to_cpu(sb->magic)); + printf(" Version : 1"); + sb_offset = __le64_to_cpu(sb->super_offset); + if (sb_offset <= 4) + printf(".1\n"); + else if (sb_offset <= 8) + printf(".2\n"); + else + printf(".0\n"); + printf(" Feature Map : 0x%x\n", __le32_to_cpu(sb->feature_map)); + printf(" Array UUID : "); + for (i=0; i<16; i++) { + if ((i&3)==0 && i != 0) printf(":"); + printf("%02x", sb->set_uuid[i]); + } + printf("\n"); + printf(" Name : %.32s", sb->set_name); + if (l > 0 && l < 32 && + sb->set_name[l] == ':' && + strncmp(sb->set_name, homehost, l) == 0) + printf(" (local to host %s)", homehost); + printf("\n"); + if (bms->nodes > 0 && (__le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET)) + printf(" Cluster Name : %-64s\n", bms->cluster_name); + atime = __le64_to_cpu(sb->ctime) & 0xFFFFFFFFFFULL; + printf(" Creation Time : %.24s\n", ctime(&atime)); + c=map_num(pers, __le32_to_cpu(sb->level)); + printf(" Raid Level : %s\n", c?c:"-unknown-"); + printf(" Raid Devices : %d\n", __le32_to_cpu(sb->raid_disks)); + printf("\n"); + printf(" Avail Dev Size : %llu%s\n", + (unsigned long long)__le64_to_cpu(sb->data_size), + human_size(__le64_to_cpu(sb->data_size)<<9)); + if (__le32_to_cpu(sb->level) > 0) { + int ddsks = 0, ddsks_denom = 1; + switch(__le32_to_cpu(sb->level)) { + case 1: ddsks=1;break; + case 4: + case 5: ddsks = __le32_to_cpu(sb->raid_disks)-1; break; + case 6: ddsks = __le32_to_cpu(sb->raid_disks)-2; break; + case 10: + layout = __le32_to_cpu(sb->layout); + ddsks = __le32_to_cpu(sb->raid_disks); + ddsks_denom = (layout&255) * ((layout>>8)&255); + } + if (ddsks) { + long long asize = __le64_to_cpu(sb->size); + asize = (asize << 9) * ddsks / ddsks_denom; + printf(" Array Size : %llu%s\n", + asize >> 10, human_size(asize)); + } + if (sb->size != sb->data_size) + printf(" Used Dev Size : %llu%s\n", + (unsigned long 
long)__le64_to_cpu(sb->size), + human_size(__le64_to_cpu(sb->size)<<9)); + } + if (sb->data_offset) + printf(" Data Offset : %llu sectors\n", + (unsigned long long)__le64_to_cpu(sb->data_offset)); + if (sb->new_offset && + (__le32_to_cpu(sb->feature_map) & MD_FEATURE_NEW_OFFSET)) { + unsigned long long offset = __le64_to_cpu(sb->data_offset); + offset += (signed)(int32_t)__le32_to_cpu(sb->new_offset); + printf(" New Offset : %llu sectors\n", offset); + } + printf(" Super Offset : %llu sectors\n", + (unsigned long long)__le64_to_cpu(sb->super_offset)); + if (__le32_to_cpu(sb->feature_map) & MD_FEATURE_RECOVERY_OFFSET) + printf("Recovery Offset : %llu sectors\n", (unsigned long long)__le64_to_cpu(sb->recovery_offset)); + + st->ss->getinfo_super(st, &info, NULL); + if (info.space_after != 1 && + !(__le32_to_cpu(sb->feature_map) & MD_FEATURE_NEW_OFFSET)) + printf(" Unused Space : before=%llu sectors, after=%llu sectors\n", + info.space_before, info.space_after); + + printf(" State : %s\n", (__le64_to_cpu(sb->resync_offset)+1)? 
"active":"clean"); + printf(" Device UUID : "); + for (i=0; i<16; i++) { + if ((i&3)==0 && i != 0) printf(":"); + printf("%02x", sb->device_uuid[i]); + } + printf("\n"); + printf("\n"); + if (sb->feature_map & __cpu_to_le32(MD_FEATURE_BITMAP_OFFSET)) { + printf("Internal Bitmap : %ld sectors from superblock\n", + (long)(int32_t)__le32_to_cpu(sb->bitmap_offset)); + } + if (sb->feature_map & __cpu_to_le32(MD_FEATURE_RESHAPE_ACTIVE)) { + printf(" Reshape pos'n : %llu%s\n", (unsigned long long)__le64_to_cpu(sb->reshape_position)/2, + human_size(__le64_to_cpu(sb->reshape_position)<<9)); + if (__le32_to_cpu(sb->delta_disks)) { + printf(" Delta Devices : %d", __le32_to_cpu(sb->delta_disks)); + printf(" (%d->%d)\n", + __le32_to_cpu(sb->raid_disks)-__le32_to_cpu(sb->delta_disks), + __le32_to_cpu(sb->raid_disks)); + if ((int)__le32_to_cpu(sb->delta_disks) < 0) + delta_extra = -__le32_to_cpu(sb->delta_disks); + } + if (__le32_to_cpu(sb->new_level) != __le32_to_cpu(sb->level)) { + c = map_num(pers, __le32_to_cpu(sb->new_level)); + printf(" New Level : %s\n", c?c:"-unknown-"); + } + if (__le32_to_cpu(sb->new_layout) != __le32_to_cpu(sb->layout)) { + if (__le32_to_cpu(sb->level) == 5) { + c = map_num(r5layout, __le32_to_cpu(sb->new_layout)); + printf(" New Layout : %s\n", c?c:"-unknown-"); + } + if (__le32_to_cpu(sb->level) == 6) { + c = map_num(r6layout, __le32_to_cpu(sb->new_layout)); + printf(" New Layout : %s\n", c?c:"-unknown-"); + } + if (__le32_to_cpu(sb->level) == 10) { + printf(" New Layout :"); + print_r10_layout(__le32_to_cpu(sb->new_layout)); + printf("\n"); + } + } + if (__le32_to_cpu(sb->new_chunk) != __le32_to_cpu(sb->chunksize)) + printf(" New Chunksize : %dK\n", __le32_to_cpu(sb->new_chunk)/2); + printf("\n"); + } + if (sb->devflags) { + printf(" Flags :"); + if (sb->devflags & WriteMostly1) + printf(" write-mostly"); + printf("\n"); + } + + atime = __le64_to_cpu(sb->utime) & 0xFFFFFFFFFFULL; + printf(" Update Time : %.24s\n", ctime(&atime)); + + if 
(sb->bblog_size && sb->bblog_offset) { + printf(" Bad Block Log : %d entries available at offset %ld sectors", + __le16_to_cpu(sb->bblog_size)*512/8, + (long)(int32_t)__le32_to_cpu(sb->bblog_offset)); + if (sb->feature_map & + __cpu_to_le32(MD_FEATURE_BAD_BLOCKS)) + printf(" - bad blocks present."); + printf("\n"); + } + + if (calc_sb_1_csum(sb) == sb->sb_csum) + printf(" Checksum : %x - correct\n", __le32_to_cpu(sb->sb_csum)); + else + printf(" Checksum : %x - expected %x\n", __le32_to_cpu(sb->sb_csum), + __le32_to_cpu(calc_sb_1_csum(sb))); + printf(" Events : %llu\n", (unsigned long long)__le64_to_cpu(sb->events)); + printf("\n"); + if (__le32_to_cpu(sb->level) == 5) { + c = map_num(r5layout, __le32_to_cpu(sb->layout)); + printf(" Layout : %s\n", c?c:"-unknown-"); + } + if (__le32_to_cpu(sb->level) == 6) { + c = map_num(r6layout, __le32_to_cpu(sb->layout)); + printf(" Layout : %s\n", c?c:"-unknown-"); + } + if (__le32_to_cpu(sb->level) == 10) { + int lo = __le32_to_cpu(sb->layout); + printf(" Layout :"); + print_r10_layout(lo); + printf("\n"); + } + switch(__le32_to_cpu(sb->level)) { + case 0: + case 4: + case 5: + case 6: + case 10: + printf(" Chunk Size : %dK\n", __le32_to_cpu(sb->chunksize)/2); + break; + case -1: + printf(" Rounding : %dK\n", __le32_to_cpu(sb->chunksize)/2); + break; + default: break; + } + printf("\n"); +#if 0 + /* This turns out to just be confusing */ + printf(" Array Slot : %d (", __le32_to_cpu(sb->dev_number)); + for (i= __le32_to_cpu(sb->max_dev); i> 0 ; i--) + if (__le16_to_cpu(sb->dev_roles[i-1]) != MD_DISK_ROLE_SPARE) + break; + for (d=0; d < i; d++) { + int role = __le16_to_cpu(sb->dev_roles[d]); + if (d) printf(", "); + if (role == MD_DISK_ROLE_SPARE) printf("empty"); + else if(role == MD_DISK_ROLE_FAULTY) printf("failed"); + else printf("%d", role); + } + printf(")\n"); +#endif + printf(" Device Role : "); + role = role_from_sb(sb); + if (role >= MD_DISK_ROLE_FAULTY) + printf("spare\n"); + else if (role == MD_DISK_ROLE_JOURNAL) + 
printf("Journal\n"); + else if (sb->feature_map & __cpu_to_le32(MD_FEATURE_REPLACEMENT)) + printf("Replacement device %d\n", role); + else + printf("Active device %d\n", role); + + printf(" Array State : "); + for (d=0; d<__le32_to_cpu(sb->raid_disks) + delta_extra; d++) { + int cnt = 0; + unsigned int i; + for (i=0; i< __le32_to_cpu(sb->max_dev); i++) { + unsigned int role = __le16_to_cpu(sb->dev_roles[i]); + if (role == d) + cnt++; + } + if (cnt == 2) + printf("R"); + else if (cnt == 1) + printf("A"); + else if (cnt == 0) + printf("."); + else + printf("?"); + } +#if 0 + /* This is confusing too */ + faulty = 0; + for (i=0; i< __le32_to_cpu(sb->max_dev); i++) { + int role = __le16_to_cpu(sb->dev_roles[i]); + if (role == MD_DISK_ROLE_FAULTY) + faulty++; + } + if (faulty) printf(" %d failed", faulty); +#endif + printf(" ('A' == active, '.' == missing, 'R' == replacing)"); + printf("\n"); +} + +static void brief_examine_super1(struct supertype *st, int verbose) +{ + struct mdp_superblock_1 *sb = st->sb; + int i; + unsigned long long sb_offset; + char *nm; + char *c=map_num(pers, __le32_to_cpu(sb->level)); + + nm = strchr(sb->set_name, ':'); + if (nm) + nm++; + else if (sb->set_name[0]) + nm = sb->set_name; + else + nm = NULL; + + printf("ARRAY "); + if (nm) { + printf("/dev/md/"); + print_escape(nm); + putchar(' '); + } + if (verbose && c) + printf(" level=%s", c); + sb_offset = __le64_to_cpu(sb->super_offset); + if (sb_offset <= 4) + printf(" metadata=1.1 "); + else if (sb_offset <= 8) + printf(" metadata=1.2 "); + else + printf(" metadata=1.0 "); + if (verbose) + printf("num-devices=%d ", __le32_to_cpu(sb->raid_disks)); + printf("UUID="); + for (i=0; i<16; i++) { + if ((i&3)==0 && i != 0) printf(":"); + printf("%02x", sb->set_uuid[i]); + } + if (sb->set_name[0]) { + printf(" name="); + print_quoted(sb->set_name); + } + printf("\n"); +} + +static void export_examine_super1(struct supertype *st) +{ + struct mdp_superblock_1 *sb = st->sb; + int i; + int len = 32; + 
int layout; + + printf("MD_LEVEL=%s\n", map_num(pers, __le32_to_cpu(sb->level))); + printf("MD_DEVICES=%d\n", __le32_to_cpu(sb->raid_disks)); + for (i=0; i<32; i++) + if (sb->set_name[i] == '\n' || + sb->set_name[i] == '\0') { + len = i; + break; + } + if (len) + printf("MD_NAME=%.*s\n", len, sb->set_name); + if (__le32_to_cpu(sb->level) > 0) { + int ddsks = 0, ddsks_denom = 1; + switch(__le32_to_cpu(sb->level)) { + case 1: ddsks=1;break; + case 4: + case 5: ddsks = __le32_to_cpu(sb->raid_disks)-1; break; + case 6: ddsks = __le32_to_cpu(sb->raid_disks)-2; break; + case 10: + layout = __le32_to_cpu(sb->layout); + ddsks = __le32_to_cpu(sb->raid_disks); + ddsks_denom = (layout&255) * ((layout>>8)&255); + } + if (ddsks) { + long long asize = __le64_to_cpu(sb->size); + asize = (asize << 9) * ddsks / ddsks_denom; + printf("MD_ARRAY_SIZE=%s\n",human_size_brief(asize,JEDEC)); + } + } + printf("MD_UUID="); + for (i=0; i<16; i++) { + if ((i&3)==0 && i != 0) printf(":"); + printf("%02x", sb->set_uuid[i]); + } + printf("\n"); + printf("MD_UPDATE_TIME=%llu\n", + __le64_to_cpu(sb->utime) & 0xFFFFFFFFFFULL); + printf("MD_DEV_UUID="); + for (i=0; i<16; i++) { + if ((i&3)==0 && i != 0) printf(":"); + printf("%02x", sb->device_uuid[i]); + } + printf("\n"); + printf("MD_EVENTS=%llu\n", + (unsigned long long)__le64_to_cpu(sb->events)); +} + +static int copy_metadata1(struct supertype *st, int from, int to) +{ + /* Read superblock. If it looks good, write it out. + * Then if a bitmap is present, copy that. + * And if a bad-block-list is present, copy that too. 
+ */ + void *buf; + unsigned long long dsize, sb_offset; + const int bufsize = 4*1024; + struct mdp_superblock_1 super, *sb; + + if (posix_memalign(&buf, 4096, bufsize) != 0) + return 1; + + if (!get_dev_size(from, NULL, &dsize)) + goto err; + + dsize >>= 9; + if (dsize < 24) + goto err; + switch(st->minor_version) { + case 0: + sb_offset = dsize; + sb_offset -= 8*2; + sb_offset &= ~(4*2-1); + break; + case 1: + sb_offset = 0; + break; + case 2: + sb_offset = 4*2; + break; + default: + goto err; + } + + if (lseek64(from, sb_offset << 9, 0) < 0LL) + goto err; + if (read(from, buf, bufsize) != bufsize) + goto err; + + sb = buf; + super = *sb; // save most of sb for when we reuse buf + + if (__le32_to_cpu(super.magic) != MD_SB_MAGIC || + __le32_to_cpu(super.major_version) != 1 || + __le64_to_cpu(super.super_offset) != sb_offset || + calc_sb_1_csum(sb) != super.sb_csum) + goto err; + + if (lseek64(to, sb_offset << 9, 0) < 0LL) + goto err; + if (write(to, buf, bufsize) != bufsize) + goto err; + + if (super.feature_map & __le32_to_cpu(MD_FEATURE_BITMAP_OFFSET)) { + unsigned long long bitmap_offset = sb_offset; + int bytes = 4096; // just an estimate. 
+ int written = 0; + struct align_fd afrom, ato; + + init_afd(&afrom, from); + init_afd(&ato, to); + + bitmap_offset += (int32_t)__le32_to_cpu(super.bitmap_offset); + + if (lseek64(from, bitmap_offset<<9, 0) < 0) + goto err; + if (lseek64(to, bitmap_offset<<9, 0) < 0) + goto err; + + for (written = 0; written < bytes ; ) { + int n = bytes - written; + if (n > 4096) + n = 4096; + if (aread(&afrom, buf, n) != n) + goto err; + if (written == 0) { + /* have the header, can calculate + * correct bitmap bytes */ + bitmap_super_t *bms; + bms = (void*)buf; + bytes = calc_bitmap_size(bms, 512); + if (n > bytes) + n = bytes; + } + if (awrite(&ato, buf, n) != n) + goto err; + written += n; + } + } + + if (super.bblog_size != 0 && + __le32_to_cpu(super.bblog_size) <= 100 && + super.bblog_offset != 0 && + (super.feature_map & __le32_to_cpu(MD_FEATURE_BAD_BLOCKS))) { + /* There is a bad block log */ + unsigned long long bb_offset = sb_offset; + int bytes = __le32_to_cpu(super.bblog_size) * 512; + int written = 0; + struct align_fd afrom, ato; + + init_afd(&afrom, from); + init_afd(&ato, to); + + bb_offset += (int32_t)__le32_to_cpu(super.bblog_offset); + + if (lseek64(from, bb_offset<<9, 0) < 0) + goto err; + if (lseek64(to, bb_offset<<9, 0) < 0) + goto err; + + for (written = 0; written < bytes ; ) { + int n = bytes - written; + if (n > 4096) + n = 4096; + if (aread(&afrom, buf, n) != n) + goto err; + + if (awrite(&ato, buf, n) != n) + goto err; + written += n; + } + } + + free(buf); + return 0; + +err: + free(buf); + return 1; +} + +static void detail_super1(struct supertype *st, char *homehost) +{ + struct mdp_superblock_1 *sb = st->sb; + bitmap_super_t *bms = (bitmap_super_t*)(((char*)sb) + MAX_SB_SIZE); + int i; + int l = homehost ? 
strlen(homehost) : 0; + + printf(" Name : %.32s", sb->set_name); + if (l > 0 && l < 32 && + sb->set_name[l] == ':' && + strncmp(sb->set_name, homehost, l) == 0) + printf(" (local to host %s)", homehost); + if (bms->nodes > 0 && (__le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET)) + printf("\n Cluster Name : %-64s", bms->cluster_name); + printf("\n UUID : "); + for (i=0; i<16; i++) { + if ((i&3)==0 && i != 0) printf(":"); + printf("%02x", sb->set_uuid[i]); + } + printf("\n Events : %llu\n\n", (unsigned long long)__le64_to_cpu(sb->events)); +} + +static void brief_detail_super1(struct supertype *st) +{ + struct mdp_superblock_1 *sb = st->sb; + int i; + + if (sb->set_name[0]) { + printf(" name="); + print_quoted(sb->set_name); + } + printf(" UUID="); + for (i=0; i<16; i++) { + if ((i&3)==0 && i != 0) printf(":"); + printf("%02x", sb->set_uuid[i]); + } +} + +static void export_detail_super1(struct supertype *st) +{ + struct mdp_superblock_1 *sb = st->sb; + int i; + int len = 32; + + for (i=0; i<32; i++) + if (sb->set_name[i] == '\n' || + sb->set_name[i] == '\0') { + len = i; + break; + } + if (len) + printf("MD_NAME=%.*s\n", len, sb->set_name); +} + +static int examine_badblocks_super1(struct supertype *st, int fd, char *devname) +{ + struct mdp_superblock_1 *sb = st->sb; + unsigned long long offset; + int size; + __u64 *bbl, *bbp; + int i; + + if (!sb->bblog_size || __le32_to_cpu(sb->bblog_size) > 100 + || !sb->bblog_offset){ + printf("No bad-blocks list configured on %s\n", devname); + return 0; + } + if ((sb->feature_map & __cpu_to_le32(MD_FEATURE_BAD_BLOCKS)) + == 0) { + printf("Bad-blocks list is empty in %s\n", devname); + return 0; + } + + size = __le32_to_cpu(sb->bblog_size)* 512; + if (posix_memalign((void**)&bbl, 4096, size) != 0) { + pr_err("could not allocate badblocks list\n"); + return 0; + } + offset = __le64_to_cpu(sb->super_offset) + + (int)__le32_to_cpu(sb->bblog_offset); + offset <<= 9; + if (lseek64(fd, offset, 0) < 0) { + pr_err("Cannot 
seek to bad-blocks list\n"); + return 1; + } + if (read(fd, bbl, size) != size) { + pr_err("Cannot read bad-blocks list\n"); + return 1; + } + /* 64bits per entry. 10 bits is block-count, 54 bits is block + * offset. Blocks are sectors unless bblog->shift makes them bigger + */ + bbp = (__u64*)bbl; + printf("Bad-blocks on %s:\n", devname); + for (i = 0; i < size/8; i++, bbp++) { + __u64 bb = __le64_to_cpu(*bbp); + int count = bb & 0x3ff; + unsigned long long sector = bb >> 10; + + if (bb + 1 == 0) + break; + + sector <<= sb->bblog_shift; + count <<= sb->bblog_shift; + + printf("%20llu for %d sectors\n", sector, count); + } + return 0; +} + +#endif + +static int match_home1(struct supertype *st, char *homehost) +{ + struct mdp_superblock_1 *sb = st->sb; + int l = homehost ? strlen(homehost) : 0; + + return (l > 0 && l < 32 && + sb->set_name[l] == ':' && + strncmp(sb->set_name, homehost, l) == 0); +} + +static void uuid_from_super1(struct supertype *st, int uuid[4]) +{ + struct mdp_superblock_1 *super = st->sb; + char *cuuid = (char*)uuid; + int i; + for (i=0; i<16; i++) + cuuid[i] = super->set_uuid[i]; +} + +static void getinfo_super1(struct supertype *st, struct mdinfo *info, char *map) +{ + struct mdp_superblock_1 *sb = st->sb; + struct bitmap_super_s *bsb = (void*)(((char*)sb)+MAX_SB_SIZE); + struct misc_dev_info *misc = (void*)(((char*)sb)+MAX_SB_SIZE+BM_SUPER_SIZE); + int working = 0; + unsigned int i; + unsigned int role; + unsigned int map_disks = info->array.raid_disks; + unsigned long long super_offset; + unsigned long long data_size; + + memset(info, 0, sizeof(*info)); + info->array.major_version = 1; + info->array.minor_version = st->minor_version; + info->array.patch_version = 0; + info->array.raid_disks = __le32_to_cpu(sb->raid_disks); + info->array.level = __le32_to_cpu(sb->level); + info->array.layout = __le32_to_cpu(sb->layout); + info->array.md_minor = -1; + info->array.ctime = __le64_to_cpu(sb->ctime); + info->array.utime = 
__le64_to_cpu(sb->utime); + info->array.chunk_size = __le32_to_cpu(sb->chunksize)*512; + info->array.state = + (__le64_to_cpu(sb->resync_offset) == MaxSector) + ? 1 : 0; + if (__le32_to_cpu(bsb->nodes) > 1) + info->array.state |= (1 << MD_SB_CLUSTERED); + + info->data_offset = __le64_to_cpu(sb->data_offset); + info->component_size = __le64_to_cpu(sb->size); + if (sb->feature_map & __le32_to_cpu(MD_FEATURE_BITMAP_OFFSET)) + info->bitmap_offset = (int32_t)__le32_to_cpu(sb->bitmap_offset); + + info->disk.major = 0; + info->disk.minor = 0; + info->disk.number = __le32_to_cpu(sb->dev_number); + if (__le32_to_cpu(sb->dev_number) >= __le32_to_cpu(sb->max_dev) || + __le32_to_cpu(sb->dev_number) >= MAX_DEVS) + role = MD_DISK_ROLE_FAULTY; + else + role = __le16_to_cpu(sb->dev_roles[__le32_to_cpu(sb->dev_number)]); + + super_offset = __le64_to_cpu(sb->super_offset); + if (info->array.level <= 0) + data_size = __le64_to_cpu(sb->data_size); + else + data_size = __le64_to_cpu(sb->size); + if (info->data_offset < super_offset) { + unsigned long long end; + info->space_before = info->data_offset; + end = super_offset; + + if (sb->bblog_offset && sb->bblog_size) { + unsigned long long bboffset = super_offset; + bboffset += (int32_t)__le32_to_cpu(sb->bblog_offset); + if (bboffset < end) + end = bboffset; + } + + if (super_offset + info->bitmap_offset < end) + end = super_offset + info->bitmap_offset; + + if (info->data_offset + data_size < end) + info->space_after = end - data_size - info->data_offset; + else + info->space_after = 0; + } else { + unsigned long long earliest; + earliest = super_offset + (32+4)*2; /* match kernel */ + if (info->bitmap_offset > 0) { + unsigned long long bmend = info->bitmap_offset; + unsigned long long size = __le64_to_cpu(bsb->sync_size); + size /= __le32_to_cpu(bsb->chunksize) >> 9; + size = (size + 7) >> 3; + size += sizeof(bitmap_super_t); + size = ROUND_UP(size, 4096); + size /= 512; + bmend += size; + if (bmend > earliest) + earliest = bmend; + } 
+ if (sb->bblog_offset && sb->bblog_size) { + unsigned long long bbend = super_offset; + bbend += (int32_t)__le32_to_cpu(sb->bblog_offset); + bbend += __le32_to_cpu(sb->bblog_size); + if (bbend > earliest) + earliest = bbend; + } + if (earliest < info->data_offset) + info->space_before = info->data_offset - earliest; + else + info->space_before = 0; + info->space_after = misc->device_size - data_size - info->data_offset; + } + if (info->space_before == 0 && info->space_after == 0) { + /* It will look like we don't support data_offset changes, + * be we do - it's just that there is no room. + * A change that reduced the number of devices should + * still be allowed, so set the otherwise useless value of '1' + */ + info->space_after = 1; + } + + info->disk.raid_disk = -1; + switch(role) { + case MD_DISK_ROLE_SPARE: + info->disk.state = 0; /* spare: not active, not sync, not faulty */ + break; + case MD_DISK_ROLE_FAULTY: + info->disk.state = 1; /* faulty */ + break; + case MD_DISK_ROLE_JOURNAL: + info->disk.state = (1 << MD_DISK_JOURNAL); + info->disk.raid_disk = role; + info->space_after = (misc->device_size - info->data_offset) % 8; /* journal uses all 4kB blocks*/ + break; + default: + info->disk.state = 6; /* active and in sync */ + info->disk.raid_disk = role; + } + if (sb->devflags & WriteMostly1) + info->disk.state |= (1 << MD_DISK_WRITEMOSTLY); + info->events = __le64_to_cpu(sb->events); + sprintf(info->text_version, "1.%d", st->minor_version); + info->safe_mode_delay = 200; + + memcpy(info->uuid, sb->set_uuid, 16); + + strncpy(info->name, sb->set_name, 32); + info->name[32] = 0; + + if ((__le32_to_cpu(sb->feature_map)&MD_FEATURE_REPLACEMENT)) { + info->disk.state &= ~(1 << MD_DISK_SYNC); + info->disk.state |= 1 << MD_DISK_REPLACEMENT; + } + + if (sb->feature_map & __le32_to_cpu(MD_FEATURE_RECOVERY_OFFSET)) + info->recovery_start = __le32_to_cpu(sb->recovery_offset); + else + info->recovery_start = MaxSector; + + if (sb->feature_map & 
__le32_to_cpu(MD_FEATURE_RESHAPE_ACTIVE)) { + info->reshape_active = 1; + if ((sb->feature_map & __le32_to_cpu(MD_FEATURE_NEW_OFFSET)) && + sb->new_offset != 0) + info->reshape_active |= RESHAPE_NO_BACKUP; + info->reshape_progress = __le64_to_cpu(sb->reshape_position); + info->new_level = __le32_to_cpu(sb->new_level); + info->delta_disks = __le32_to_cpu(sb->delta_disks); + info->new_layout = __le32_to_cpu(sb->new_layout); + info->new_chunk = __le32_to_cpu(sb->new_chunk)<<9; + if (info->delta_disks < 0) + info->array.raid_disks -= info->delta_disks; + } else + info->reshape_active = 0; + + info->recovery_blocked = info->reshape_active; + + if (map) + for (i=0; imax_dev); i++) { + role = __le16_to_cpu(sb->dev_roles[i]); + if (/*role == MD_DISK_ROLE_SPARE || */role < (unsigned) info->array.raid_disks) { + working++; + if (map && role < map_disks) + map[role] = 1; + } + } + + info->array.working_disks = working; + if (sb->feature_map & __le32_to_cpu(MD_FEATURE_JOURNAL)) + info->journal_device_required = 1; + info->journal_clean = 0; +} + +static struct mdinfo *container_content1(struct supertype *st, char *subarray) +{ + struct mdinfo *info; + + if (subarray) + return NULL; + + info = xmalloc(sizeof(*info)); + getinfo_super1(st, info, NULL); + return info; +} + +static int update_super1(struct supertype *st, struct mdinfo *info, + char *update, + char *devname, int verbose, + int uuid_set, char *homehost) +{ + /* NOTE: for 'assemble' and 'force' we need to return non-zero + * if any change was made. For others, the return value is + * ignored. 
+ */ + int rv = 0; + int lockid; + struct mdp_superblock_1 *sb = st->sb; + bitmap_super_t *bms = (bitmap_super_t*)(((char*)sb) + MAX_SB_SIZE); + + if (bms->version == BITMAP_MAJOR_CLUSTERED && dlm_funs_ready()) { + rv = cluster_get_dlmlock(&lockid); + if (rv) { + pr_err("Cannot get dlmlock in %s return %d\n", __func__, rv); + cluster_release_dlmlock(lockid); + return rv; + } + } + + if (strcmp(update, "homehost") == 0 && + homehost) { + /* Note that 'homehost' is special as it is really + * a "name" update. + */ + char *c; + update = "name"; + c = strchr(sb->set_name, ':'); + if (c) + strncpy(info->name, c+1, 31 - (c-sb->set_name)); + else + strncpy(info->name, sb->set_name, 32); + info->name[32] = 0; + } + + if (strcmp(update, "force-one")==0) { + /* Not enough devices for a working array, + * so bring this one up-to-date + */ + if (sb->events != __cpu_to_le64(info->events)) + rv = 1; + sb->events = __cpu_to_le64(info->events); + } else if (strcmp(update, "force-array")==0) { + /* Degraded array and 'force' requests to + * maybe need to mark it 'clean'. 
+ */ + switch(__le32_to_cpu(sb->level)) { + case 5: case 4: case 6: + /* need to force clean */ + if (sb->resync_offset != MaxSector) + rv = 1; + sb->resync_offset = MaxSector; + } + } else if (strcmp(update, "assemble")==0) { + int d = info->disk.number; + int want; + if (info->disk.state & (1<disk.raid_disk; + else if (info->disk.state & (1<dev_roles[d] != __cpu_to_le16(want)) { + sb->dev_roles[d] = __cpu_to_le16(want); + rv = 1; + } + if (info->reshape_active && + sb->feature_map & __le32_to_cpu(MD_FEATURE_RESHAPE_ACTIVE) && + info->delta_disks >= 0 && + info->reshape_progress < __le64_to_cpu(sb->reshape_position)) { + sb->reshape_position = __cpu_to_le64(info->reshape_progress); + rv = 1; + } + if (info->reshape_active && + sb->feature_map & __le32_to_cpu(MD_FEATURE_RESHAPE_ACTIVE) && + info->delta_disks < 0 && + info->reshape_progress > __le64_to_cpu(sb->reshape_position)) { + sb->reshape_position = __cpu_to_le64(info->reshape_progress); + rv = 1; + } + } else if (strcmp(update, "linear-grow-new") == 0) { + unsigned int i; + int rfd, fd; + unsigned int max = __le32_to_cpu(sb->max_dev); + + for (i=0 ; i < max ; i++) + if (__le16_to_cpu(sb->dev_roles[i]) >= MD_DISK_ROLE_FAULTY) + break; + sb->dev_number = __cpu_to_le32(i); + info->disk.number = i; + if (max >= __le32_to_cpu(sb->max_dev)) + sb->max_dev = __cpu_to_le32(max+1); + + if ((rfd = open("/dev/urandom", O_RDONLY)) < 0 || + read(rfd, sb->device_uuid, 16) != 16) { + __u32 r[4] = {random(), random(), random(), random()}; + memcpy(sb->device_uuid, r, 16); + } + if (rfd >= 0) + close(rfd); + + sb->dev_roles[i] = + __cpu_to_le16(info->disk.raid_disk); + + fd = open(devname, O_RDONLY); + if (fd >= 0) { + unsigned long long ds; + get_dev_size(fd, devname, &ds); + close(fd); + ds >>= 9; + if (__le64_to_cpu(sb->super_offset) < + __le64_to_cpu(sb->data_offset)) { + sb->data_size = __cpu_to_le64( + ds - __le64_to_cpu(sb->data_offset)); + } else { + ds -= 8*2; + ds &= ~(unsigned long long)(4*2-1); + sb->super_offset = 
__cpu_to_le64(ds); + sb->data_size = __cpu_to_le64( + ds - __le64_to_cpu(sb->data_offset)); + } + } + } else if (strcmp(update, "linear-grow-update") == 0) { + sb->raid_disks = __cpu_to_le32(info->array.raid_disks); + sb->dev_roles[info->disk.number] = + __cpu_to_le16(info->disk.raid_disk); + } else if (strcmp(update, "resync") == 0) { + /* make sure resync happens */ + sb->resync_offset = 0ULL; + } else if (strcmp(update, "uuid") == 0) { + copy_uuid(sb->set_uuid, info->uuid, super1.swapuuid); + + if (__le32_to_cpu(sb->feature_map)&MD_FEATURE_BITMAP_OFFSET) { + struct bitmap_super_s *bm; + bm = (struct bitmap_super_s*)(st->sb+MAX_SB_SIZE); + memcpy(bm->uuid, sb->set_uuid, 16); + } + } else if (strcmp(update, "no-bitmap") == 0) { + sb->feature_map &= ~__cpu_to_le32(MD_FEATURE_BITMAP_OFFSET); + } else if (strcmp(update, "bbl") == 0) { + /* only possible if there is room after the bitmap, or if + * there is no bitmap + */ + unsigned long long sb_offset = __le64_to_cpu(sb->super_offset); + unsigned long long data_offset = __le64_to_cpu(sb->data_offset); + long bitmap_offset = (long)(int32_t)__le32_to_cpu(sb->bitmap_offset); + long bm_sectors = 0; + long space; + +#ifndef MDASSEMBLE + if (sb->feature_map & __cpu_to_le32(MD_FEATURE_BITMAP_OFFSET)) { + struct bitmap_super_s *bsb; + bsb = (struct bitmap_super_s *)(((char*)sb)+MAX_SB_SIZE); + bm_sectors = bitmap_sectors(bsb); + } +#endif + if (sb_offset < data_offset) { + /* 1.1 or 1.2. 
Put bbl after bitmap leaving at least 32K + */ + long bb_offset; + bb_offset = sb_offset + 8; + if (bm_sectors && bitmap_offset > 0) + bb_offset = bitmap_offset + bm_sectors; + while (bb_offset < (long)sb_offset + 8 + 32*2 + && bb_offset + 8+8 <= (long)data_offset) + /* too close to bitmap, and room to grow */ + bb_offset += 8; + if (bb_offset + 8 <= (long)data_offset) { + sb->bblog_size = __cpu_to_le16(8); + sb->bblog_offset = __cpu_to_le32(bb_offset); + } + } else { + /* 1.0 - Put bbl just before super block */ + if (bm_sectors && bitmap_offset < 0) + space = -bitmap_offset - bm_sectors; + else + space = sb_offset - data_offset - + __le64_to_cpu(sb->data_size); + if (space >= 8) { + sb->bblog_size = __cpu_to_le16(8); + sb->bblog_offset = __cpu_to_le32((unsigned)-8); + } + } + } else if (strcmp(update, "no-bbl") == 0) { + if (sb->feature_map & __cpu_to_le32(MD_FEATURE_BAD_BLOCKS)) + pr_err("Cannot remove active bbl from %s\n",devname); + else { + sb->bblog_size = 0; + sb->bblog_shift = 0; + sb->bblog_offset = 0; + } + } else if (strcmp(update, "force-no-bbl") == 0) { + sb->feature_map &= ~ __cpu_to_le32(MD_FEATURE_BAD_BLOCKS); + sb->bblog_size = 0; + sb->bblog_shift = 0; + sb->bblog_offset = 0; + } else if (strcmp(update, "name") == 0) { + if (info->name[0] == 0) + sprintf(info->name, "%d", info->array.md_minor); + memset(sb->set_name, 0, sizeof(sb->set_name)); + if (homehost && + strchr(info->name, ':') == NULL && + strlen(homehost)+1+strlen(info->name) < 32) { + strcpy(sb->set_name, homehost); + strcat(sb->set_name, ":"); + strcat(sb->set_name, info->name); + } else + strcpy(sb->set_name, info->name); + } else if (strcmp(update, "devicesize") == 0 && + __le64_to_cpu(sb->super_offset) < + __le64_to_cpu(sb->data_offset)) { + /* set data_size to device size less data_offset */ + struct misc_dev_info *misc = (struct misc_dev_info*) + (st->sb + MAX_SB_SIZE + BM_SUPER_SIZE); + sb->data_size = __cpu_to_le64( + misc->device_size - __le64_to_cpu(sb->data_offset)); + } 
else if (strncmp(update, "revert-reshape", 14) == 0) { + rv = -2; + if (!(sb->feature_map & __cpu_to_le32(MD_FEATURE_RESHAPE_ACTIVE))) + pr_err("No active reshape to revert on %s\n", + devname); + else { + __u32 temp; + unsigned long long reshape_sectors; + long reshape_chunk; + rv = 0; + /* If the reshape hasn't started, just stop it. + * It is conceivable that a stripe was modified but + * the metadata not updated. In that case the backup + * should have been used to get passed the critical stage. + * If that couldn't happen, the "-nobackup" version + * will be used. + */ + if (strcmp(update, "revert-reshape-nobackup") == 0 && + sb->reshape_position == 0 && + (__le32_to_cpu(sb->delta_disks) > 0 || + (__le32_to_cpu(sb->delta_disks) == 0 && + !(sb->feature_map & __cpu_to_le32(MD_FEATURE_RESHAPE_BACKWARDS))))) { + sb->feature_map &= ~__cpu_to_le32(MD_FEATURE_RESHAPE_ACTIVE); + sb->raid_disks = __cpu_to_le32(__le32_to_cpu(sb->raid_disks) - + __le32_to_cpu(sb->delta_disks)); + sb->delta_disks = 0; + goto done; + } + /* reshape_position is a little messy. + * Its value must be a multiple of the larger + * chunk size, and of the "after" data disks. + * So when reverting we need to change it to + * be a multiple of the new "after" data disks, + * which is the old "before". + * If it isn't already a multiple of 'before', + * the only thing we could do would be + * copy some block around on the disks, which + * is easy to get wrong. + * So we reject a revert-reshape unless the + * alignment is good. + */ + if (__le32_to_cpu(sb->level) >= 4 && + __le32_to_cpu(sb->level) <= 6) { + reshape_sectors = __le64_to_cpu(sb->reshape_position); + reshape_chunk = __le32_to_cpu(sb->new_chunk); + reshape_chunk *= __le32_to_cpu(sb->raid_disks) - __le32_to_cpu(sb->delta_disks) - + (__le32_to_cpu(sb->level)==6 ? 
2 : 1); + if (reshape_sectors % reshape_chunk) { + pr_err("Reshape position is not suitably aligned.\n"); + pr_err("Try normal assembly and stop again\n"); + return -2; + } + } + sb->raid_disks = __cpu_to_le32(__le32_to_cpu(sb->raid_disks) - + __le32_to_cpu(sb->delta_disks)); + if (sb->delta_disks == 0) + sb->feature_map ^= __cpu_to_le32(MD_FEATURE_RESHAPE_BACKWARDS); + else + sb->delta_disks = __cpu_to_le32(-__le32_to_cpu(sb->delta_disks)); + + temp = sb->new_layout; + sb->new_layout = sb->layout; + sb->layout = temp; + + temp = sb->new_chunk; + sb->new_chunk = sb->chunksize; + sb->chunksize = temp; + + if (sb->feature_map & __cpu_to_le32(MD_FEATURE_NEW_OFFSET)) { + long offset_delta = (int32_t)__le32_to_cpu(sb->new_offset); + sb->data_offset = __cpu_to_le64(__le64_to_cpu(sb->data_offset) + offset_delta); + sb->new_offset = __cpu_to_le32(-offset_delta); + sb->data_size = __cpu_to_le64(__le64_to_cpu(sb->data_size) - offset_delta); + } + done:; + } + } else if (strcmp(update, "_reshape_progress")==0) + sb->reshape_position = __cpu_to_le64(info->reshape_progress); + else if (strcmp(update, "writemostly")==0) + sb->devflags |= WriteMostly1; + else if (strcmp(update, "readwrite")==0) + sb->devflags &= ~WriteMostly1; + else + rv = -1; + + sb->sb_csum = calc_sb_1_csum(sb); + if (bms->version == BITMAP_MAJOR_CLUSTERED && dlm_funs_ready()) + cluster_release_dlmlock(lockid); + + return rv; +} + +static int init_super1(struct supertype *st, mdu_array_info_t *info, + unsigned long long size, char *name, char *homehost, + int *uuid, unsigned long long data_offset) +{ + struct mdp_superblock_1 *sb; + int spares; + int rfd; + char defname[10]; + int sbsize; + + if (posix_memalign((void**)&sb, 4096, SUPER1_SIZE) != 0) { + pr_err("could not allocate superblock\n"); + return 0; + } + memset(sb, 0, SUPER1_SIZE); + + st->sb = sb; + if (info == NULL) { + /* zeroing superblock */ + return 0; + } + + spares = info->working_disks - info->active_disks; + if (info->raid_disks + spares > 
MAX_DEVS) { + pr_err("too many devices requested: %d+%d > %d\n", + info->raid_disks , spares, MAX_DEVS); + return 0; + } + + sb->magic = __cpu_to_le32(MD_SB_MAGIC); + sb->major_version = __cpu_to_le32(1); + sb->feature_map = 0; + sb->pad0 = 0; + + if (uuid) + copy_uuid(sb->set_uuid, uuid, super1.swapuuid); + else { + if ((rfd = open("/dev/urandom", O_RDONLY)) < 0 || + read(rfd, sb->set_uuid, 16) != 16) { + __u32 r[4] = {random(), random(), random(), random()}; + memcpy(sb->set_uuid, r, 16); + } + if (rfd >= 0) close(rfd); + } + + if (name == NULL || *name == 0) { + sprintf(defname, "%d", info->md_minor); + name = defname; + } + if (homehost && + strchr(name, ':')== NULL && + strlen(homehost)+1+strlen(name) < 32) { + strcpy(sb->set_name, homehost); + strcat(sb->set_name, ":"); + strcat(sb->set_name, name); + } else + strcpy(sb->set_name, name); + + sb->ctime = __cpu_to_le64((unsigned long long)time(0)); + sb->level = __cpu_to_le32(info->level); + sb->layout = __cpu_to_le32(info->layout); + sb->size = __cpu_to_le64(size*2ULL); + sb->chunksize = __cpu_to_le32(info->chunk_size>>9); + sb->raid_disks = __cpu_to_le32(info->raid_disks); + + sb->data_offset = __cpu_to_le64(data_offset); + sb->data_size = __cpu_to_le64(0); + sb->super_offset = __cpu_to_le64(0); + sb->recovery_offset = __cpu_to_le64(0); + + sb->utime = sb->ctime; + sb->events = __cpu_to_le64(1); + if (info->state & (1<resync_offset = MaxSector; + else + sb->resync_offset = 0; + sbsize = sizeof(struct mdp_superblock_1) + 2 * (info->raid_disks + spares); + sbsize = ROUND_UP(sbsize, 512); + sb->max_dev = __cpu_to_le32((sbsize - sizeof(struct mdp_superblock_1)) / 2); + + memset(sb->dev_roles, 0xff, MAX_SB_SIZE - sizeof(struct mdp_superblock_1)); + + return 1; +} + +struct devinfo { + int fd; + char *devname; + long long data_offset; + mdu_disk_info_t disk; + struct devinfo *next; +}; +#ifndef MDASSEMBLE +/* Add a device to the superblock being created */ +static int add_to_super1(struct supertype *st, 
mdu_disk_info_t *dk, + int fd, char *devname, unsigned long long data_offset) +{ + struct mdp_superblock_1 *sb = st->sb; + __u16 *rp = sb->dev_roles + dk->number; + struct devinfo *di, **dip; + bitmap_super_t *bms = (bitmap_super_t*)(((char*)sb) + MAX_SB_SIZE); + int rv, lockid; + + if (bms->version == BITMAP_MAJOR_CLUSTERED && dlm_funs_ready()) { + rv = cluster_get_dlmlock(&lockid); + if (rv) { + pr_err("Cannot get dlmlock in %s return %d\n", __func__, rv); + cluster_release_dlmlock(lockid); + return rv; + } + } + + if ((dk->state & 6) == 6) /* active, sync */ + *rp = __cpu_to_le16(dk->raid_disk); + else if (dk->state & (1<state & ~2) == 0) /* active or idle -> spare */ + *rp = MD_DISK_ROLE_SPARE; + else + *rp = MD_DISK_ROLE_FAULTY; + + if (dk->number >= (int)__le32_to_cpu(sb->max_dev) && + __le32_to_cpu(sb->max_dev) < MAX_DEVS) + sb->max_dev = __cpu_to_le32(dk->number+1); + + sb->dev_number = __cpu_to_le32(dk->number); + sb->devflags = 0; /* don't copy another disks flags */ + sb->sb_csum = calc_sb_1_csum(sb); + + dip = (struct devinfo **)&st->info; + while (*dip) + dip = &(*dip)->next; + di = xmalloc(sizeof(struct devinfo)); + di->fd = fd; + di->devname = devname; + di->disk = *dk; + di->data_offset = data_offset; + di->next = NULL; + *dip = di; + + if (bms->version == BITMAP_MAJOR_CLUSTERED && dlm_funs_ready()) + cluster_release_dlmlock(lockid); + + return 0; +} +#endif + +static int locate_bitmap1(struct supertype *st, int fd); + +static int store_super1(struct supertype *st, int fd) +{ + struct mdp_superblock_1 *sb = st->sb; + unsigned long long sb_offset; + struct align_fd afd; + int sbsize; + unsigned long long dsize; + bitmap_super_t *bms = (bitmap_super_t*)(((char*)sb) + MAX_SB_SIZE); + int rv, lockid; + + if (bms->version == BITMAP_MAJOR_CLUSTERED && dlm_funs_ready()) { + rv = cluster_get_dlmlock(&lockid); + if (rv) { + pr_err("Cannot get dlmlock in %s return %d\n", __func__, rv); + cluster_release_dlmlock(lockid); + return rv; + } + } + + if 
(!get_dev_size(fd, NULL, &dsize)) + return 1; + + dsize >>= 9; + + if (dsize < 24) + return 2; + + init_afd(&afd, fd); + + /* + * Calculate the position of the superblock. + * It is always aligned to a 4K boundary and + * depending on minor_version, it can be: + * 0: At least 8K, but less than 12K, from end of device + * 1: At start of device + * 2: 4K from start of device. + */ + switch(st->minor_version) { + case 0: + sb_offset = dsize; + sb_offset -= 8*2; + sb_offset &= ~(4*2-1); + break; + case 1: + sb_offset = 0; + break; + case 2: + sb_offset = 4*2; + break; + default: + return -EINVAL; + } + + if (sb_offset != __le64_to_cpu(sb->super_offset) && + 0 != __le64_to_cpu(sb->super_offset) + ) { + pr_err("internal error - sb_offset is wrong\n"); + abort(); + } + + if (lseek64(fd, sb_offset << 9, 0)< 0LL) + return 3; + + sbsize = ROUND_UP(sizeof(*sb) + 2 * __le32_to_cpu(sb->max_dev), 512); + + if (awrite(&afd, sb, sbsize) != sbsize) + return 4; + + if (sb->feature_map & __cpu_to_le32(MD_FEATURE_BITMAP_OFFSET)) { + struct bitmap_super_s *bm = (struct bitmap_super_s*) + (((char*)sb)+MAX_SB_SIZE); + if (__le32_to_cpu(bm->magic) == BITMAP_MAGIC) { + locate_bitmap1(st, fd); + if (awrite(&afd, bm, sizeof(*bm)) != sizeof(*bm)) + return 5; + } + } + fsync(fd); + if (bms->version == BITMAP_MAJOR_CLUSTERED && dlm_funs_ready()) + cluster_release_dlmlock(lockid); + + return 0; +} + +static int load_super1(struct supertype *st, int fd, char *devname); + +static unsigned long choose_bm_space(unsigned long devsize) +{ + /* if the device is bigger than 8Gig, save 64k for bitmap usage, + * if bigger than 200Gig, save 128k + * NOTE: result must be multiple of 4K else bad things happen + * on 4K-sector devices. 
+ */ + if (devsize < 64*2) return 0; + if (devsize - 64*2 >= 200*1024*1024*2) + return 128*2; + if (devsize - 4*2 > 8*1024*1024*2) + return 64*2; + return 4*2; +} + +static void free_super1(struct supertype *st); + +#define META_BLOCK_SIZE 4096 +__u32 crc32c_le(__u32 crc, unsigned char const *p, size_t len); + +#ifndef MDASSEMBLE +static int write_empty_r5l_meta_block(struct supertype *st, int fd) +{ + struct r5l_meta_block *mb; + struct mdp_superblock_1 *sb = st->sb; + struct align_fd afd; + __u32 crc; + + init_afd(&afd, fd); + + if (posix_memalign((void**)&mb, 4096, META_BLOCK_SIZE) != 0) { + pr_err("Could not allocate memory for the meta block.\n"); + return 1; + } + + memset(mb, 0, META_BLOCK_SIZE); + + mb->magic = __cpu_to_le32(R5LOG_MAGIC); + mb->version = R5LOG_VERSION; + mb->meta_size = __cpu_to_le32(sizeof(struct r5l_meta_block)); + mb->seq = __cpu_to_le64(random32()); + mb->position = __cpu_to_le64(0); + + crc = crc32c_le(0xffffffff, sb->set_uuid, sizeof(sb->set_uuid)); + crc = crc32c_le(crc, (void *)mb, META_BLOCK_SIZE); + mb->checksum = crc; + + if (lseek64(fd, (sb->data_offset) * 512, 0) < 0LL) { + pr_err("cannot seek to offset of the meta block\n"); + goto fail_to_write; + } + + if (awrite(&afd, mb, META_BLOCK_SIZE) != META_BLOCK_SIZE) { + pr_err("failed to store write the meta block \n"); + goto fail_to_write; + } + fsync(fd); + + free(mb); + return 0; + +fail_to_write: + free(mb); + return 1; +} + +static int write_init_super1(struct supertype *st) +{ + struct mdp_superblock_1 *sb = st->sb; + struct supertype *refst; + int rfd; + int rv = 0; + unsigned long long bm_space; + struct devinfo *di; + unsigned long long dsize, array_size; + unsigned long long sb_offset; + unsigned long long data_offset; + + for (di = st->info; di; di = di->next) { + if (di->disk.state & (1 << MD_DISK_JOURNAL)) + sb->feature_map |= MD_FEATURE_JOURNAL; + } + + for (di = st->info; di; di = di->next) { + if (di->disk.state & (1 << MD_DISK_FAULTY)) + continue; + if (di->fd < 
0) + continue; + + while (Kill(di->devname, NULL, 0, -1, 1) == 0) + ; + + sb->dev_number = __cpu_to_le32(di->disk.number); + if (di->disk.state & (1<devflags |= WriteMostly1; + else + sb->devflags &= ~WriteMostly1; + + if ((rfd = open("/dev/urandom", O_RDONLY)) < 0 || + read(rfd, sb->device_uuid, 16) != 16) { + __u32 r[4] = {random(), random(), random(), random()}; + memcpy(sb->device_uuid, r, 16); + } + if (rfd >= 0) + close(rfd); + + if (!(di->disk.state & (1<events = 0; + + refst = dup_super(st); + if (load_super1(refst, di->fd, NULL)==0) { + struct mdp_superblock_1 *refsb = refst->sb; + + memcpy(sb->device_uuid, refsb->device_uuid, 16); + if (memcmp(sb->set_uuid, refsb->set_uuid, 16)==0) { + /* same array, so preserve events and + * dev_number */ + sb->events = refsb->events; + /* bugs in 2.6.17 and earlier mean the + * dev_number chosen in Manage must be preserved + */ + if (get_linux_version() >= 2006018) + sb->dev_number = refsb->dev_number; + } + free_super1(refst); + } + free(refst); + + if (!get_dev_size(di->fd, NULL, &dsize)) { + rv = 1; + goto error_out; + } + dsize >>= 9; + + if (dsize < 24) { + close(di->fd); + rv = 2; + goto error_out; + } + + /* + * Calculate the position of the superblock. + * It is always aligned to a 4K boundary and + * depending on minor_version, it can be: + * 0: At least 8K, but less than 12K, from end of device + * 1: At start of device + * 2: 4K from start of device. + * data_offset has already been set. 
+ */ + array_size = __le64_to_cpu(sb->size); + /* work out how much space we left for a bitmap, + * Add 8 sectors for bad block log */ + bm_space = choose_bm_space(array_size) + 8; + + data_offset = di->data_offset; + if (data_offset == INVALID_SECTORS) + data_offset = st->data_offset; + switch(st->minor_version) { + case 0: + if (data_offset == INVALID_SECTORS) + data_offset = 0; + sb_offset = dsize; + sb_offset -= 8*2; + sb_offset &= ~(4*2-1); + sb->data_offset = __cpu_to_le64(data_offset); + sb->super_offset = __cpu_to_le64(sb_offset); + if (sb_offset < array_size + bm_space) + bm_space = sb_offset - array_size; + sb->data_size = __cpu_to_le64(sb_offset - bm_space); + if (bm_space >= 8) { + sb->bblog_size = __cpu_to_le16(8); + sb->bblog_offset = __cpu_to_le32((unsigned)-8); + } + break; + case 1: + sb->super_offset = __cpu_to_le64(0); + if (data_offset == INVALID_SECTORS) + data_offset = 16; + + sb->data_offset = __cpu_to_le64(data_offset); + sb->data_size = __cpu_to_le64(dsize - data_offset); + if (data_offset >= 8 + 32*2 + 8) { + sb->bblog_size = __cpu_to_le16(8); + sb->bblog_offset = __cpu_to_le32(8 + 32*2); + } else if (data_offset >= 16) { + sb->bblog_size = __cpu_to_le16(8); + sb->bblog_offset = __cpu_to_le32(data_offset-8); + } + break; + case 2: + sb_offset = 4*2; + sb->super_offset = __cpu_to_le64(sb_offset); + if (data_offset == INVALID_SECTORS) + data_offset = 24; + + sb->data_offset = __cpu_to_le64(data_offset); + sb->data_size = __cpu_to_le64(dsize - data_offset); + if (data_offset >= 16 + 32*2 + 8) { + sb->bblog_size = __cpu_to_le16(8); + sb->bblog_offset = __cpu_to_le32(8 + 32*2); + } else if (data_offset >= 16+16) { + sb->bblog_size = __cpu_to_le16(8); + /* '8' sectors for the bblog, and another '8' + * because we want offset from superblock, not + * start of device. 
+ */ + sb->bblog_offset = __cpu_to_le32(data_offset-8-8); + } + break; + default: + pr_err("Failed to write invalid metadata format 1.%i to %s\n", + st->minor_version, di->devname); + rv = -EINVAL; + goto out; + } + /* Disable badblock log on clusters, or when explicitly requested */ + if (st->nodes > 0 || conf_get_create_info()->bblist == 0) { + sb->bblog_size = 0; + sb->bblog_offset = 0; + } + + sb->sb_csum = calc_sb_1_csum(sb); + rv = store_super1(st, di->fd); + + if (rv == 0 && (di->disk.state & (1 << MD_DISK_JOURNAL))) { + rv = write_empty_r5l_meta_block(st, di->fd); + if (rv) + goto error_out; + } + + if (rv == 0 && (__le32_to_cpu(sb->feature_map) & 1)) + rv = st->ss->write_bitmap(st, di->fd, NoUpdate); + close(di->fd); + di->fd = -1; + if (rv) + goto error_out; + } +error_out: + if (rv) + pr_err("Failed to write metadata to %s\n", + di->devname); +out: + return rv; +} +#endif + +static int compare_super1(struct supertype *st, struct supertype *tst) +{ + /* + * return: + * 0 same, or first was empty, and second was copied + * 1 second had wrong number + * 2 wrong uuid + * 3 wrong other info + */ + struct mdp_superblock_1 *first = st->sb; + struct mdp_superblock_1 *second = tst->sb; + + if (second->magic != __cpu_to_le32(MD_SB_MAGIC)) + return 1; + if (second->major_version != __cpu_to_le32(1)) + return 1; + + if (!first) { + if (posix_memalign((void**)&first, 4096, SUPER1_SIZE) != 0) { + pr_err("could not allocate superblock\n"); + return 1; + } + memcpy(first, second, SUPER1_SIZE); + st->sb = first; + return 0; + } + if (memcmp(first->set_uuid, second->set_uuid, 16)!= 0) + return 2; + + if (first->ctime != second->ctime || + first->level != second->level || + first->layout != second->layout || + first->size != second->size || + first->chunksize != second->chunksize || + first->raid_disks != second->raid_disks) + return 3; + return 0; +} + +static int load_super1(struct supertype *st, int fd, char *devname) +{ + unsigned long long dsize; + unsigned long long 
sb_offset; + struct mdp_superblock_1 *super; + int uuid[4]; + struct bitmap_super_s *bsb; + struct misc_dev_info *misc; + struct align_fd afd; + + free_super1(st); + + init_afd(&afd, fd); + + if (st->ss == NULL || st->minor_version == -1) { + int bestvers = -1; + struct supertype tst; + __u64 bestctime = 0; + /* guess... choose latest ctime */ + memset(&tst, 0, sizeof(tst)); + tst.ss = &super1; + for (tst.minor_version = 0; tst.minor_version <= 2 ; tst.minor_version++) { + switch(load_super1(&tst, fd, devname)) { + case 0: super = tst.sb; + if (bestvers == -1 || + bestctime < __le64_to_cpu(super->ctime)) { + bestvers = tst.minor_version; + bestctime = __le64_to_cpu(super->ctime); + } + free(super); + tst.sb = NULL; + break; + case 1: return 1; /*bad device */ + case 2: break; /* bad, try next */ + } + } + if (bestvers != -1) { + int rv; + tst.minor_version = bestvers; + tst.ss = &super1; + tst.max_devs = MAX_DEVS; + rv = load_super1(&tst, fd, devname); + if (rv == 0) + *st = tst; + return rv; + } + return 2; + } + if (!get_dev_size(fd, devname, &dsize)) + return 1; + dsize >>= 9; + + if (dsize < 24) { + if (devname) + pr_err("%s is too small for md: size is %llu sectors.\n", + devname, dsize); + return 1; + } + + /* + * Calculate the position of the superblock. + * It is always aligned to a 4K boundary and + * depending on minor_version, it can be: + * 0: At least 8K, but less than 12K, from end of device + * 1: At start of device + * 2: 4K from start of device. 
+ */ + switch(st->minor_version) { + case 0: + sb_offset = dsize; + sb_offset -= 8*2; + sb_offset &= ~(4*2-1); + break; + case 1: + sb_offset = 0; + break; + case 2: + sb_offset = 4*2; + break; + default: + return -EINVAL; + } + + if (lseek64(fd, sb_offset << 9, 0)< 0LL) { + if (devname) + pr_err("Cannot seek to superblock on %s: %s\n", + devname, strerror(errno)); + return 1; + } + + if (posix_memalign((void**)&super, 4096, SUPER1_SIZE) != 0) { + pr_err("could not allocate superblock\n"); + return 1; + } + + if (aread(&afd, super, MAX_SB_SIZE) != MAX_SB_SIZE) { + if (devname) + pr_err("Cannot read superblock on %s\n", + devname); + free(super); + return 1; + } + + if (__le32_to_cpu(super->magic) != MD_SB_MAGIC) { + if (devname) + pr_err("No super block found on %s (Expected magic %08x, got %08x)\n", + devname, MD_SB_MAGIC, __le32_to_cpu(super->magic)); + free(super); + return 2; + } + + if (__le32_to_cpu(super->major_version) != 1) { + if (devname) + pr_err("Cannot interpret superblock on %s - version is %d\n", + devname, __le32_to_cpu(super->major_version)); + free(super); + return 2; + } + if (__le64_to_cpu(super->super_offset) != sb_offset) { + if (devname) + pr_err("No superblock found on %s (super_offset is wrong)\n", + devname); + free(super); + return 2; + } + st->sb = super; + + bsb = (struct bitmap_super_s *)(((char*)super)+MAX_SB_SIZE); + + misc = (struct misc_dev_info*) (((char*)super)+MAX_SB_SIZE+BM_SUPER_SIZE); + misc->device_size = dsize; + if (st->data_offset == INVALID_SECTORS) + st->data_offset = __le64_to_cpu(super->data_offset); + + /* Now check on the bitmap superblock */ + if ((__le32_to_cpu(super->feature_map)&MD_FEATURE_BITMAP_OFFSET) == 0) + return 0; + /* Read the bitmap superblock and make sure it looks + * valid. If it doesn't clear the bit. An --assemble --force + * should get that written out. 
+ */ + locate_bitmap1(st, fd); + if (aread(&afd, bsb, 512) != 512) + goto no_bitmap; + + uuid_from_super1(st, uuid); + if (__le32_to_cpu(bsb->magic) != BITMAP_MAGIC || + memcmp(bsb->uuid, uuid, 16) != 0) + goto no_bitmap; + return 0; + + no_bitmap: + super->feature_map = __cpu_to_le32(__le32_to_cpu(super->feature_map) + & ~MD_FEATURE_BITMAP_OFFSET); + return 0; +} + +static struct supertype *match_metadata_desc1(char *arg) +{ + struct supertype *st = xcalloc(1, sizeof(*st)); + + st->container_devnm[0] = 0; + st->ss = &super1; + st->max_devs = MAX_DEVS; + st->sb = NULL; + st->data_offset = INVALID_SECTORS; + /* leading zeros can be safely ignored. --detail generates them. */ + while (*arg == '0') + arg++; + if (strcmp(arg, "1.0") == 0 || + strcmp(arg, "1.00") == 0) { + st->minor_version = 0; + return st; + } + if (strcmp(arg, "1.1") == 0 || + strcmp(arg, "1.01") == 0 + ) { + st->minor_version = 1; + return st; + } + if (strcmp(arg, "1.2") == 0 || +#ifndef DEFAULT_OLD_METADATA /* ifdef in super0.c */ + strcmp(arg, "default") == 0 || +#endif /* DEFAULT_OLD_METADATA */ + strcmp(arg, "1.02") == 0) { + st->minor_version = 2; + return st; + } + if (strcmp(arg, "1") == 0 || + strcmp(arg, "default") == 0) { + st->minor_version = -1; + return st; + } + + free(st); + return NULL; +} + +/* find available size on device with this devsize, using + * superblock type st, and reserving 'reserve' sectors for + * a possible bitmap + */ +static __u64 avail_size1(struct supertype *st, __u64 devsize, + unsigned long long data_offset) +{ + struct mdp_superblock_1 *super = st->sb; + int bmspace = 0; + int bbspace = 0; + if (devsize < 24) + return 0; + +#ifndef MDASSEMBLE + if (__le32_to_cpu(super->feature_map)&MD_FEATURE_BITMAP_OFFSET) { + /* hot-add. 
allow for actual size of bitmap */ + struct bitmap_super_s *bsb; + bsb = (struct bitmap_super_s *)(((char*)super)+MAX_SB_SIZE); + bmspace = bitmap_sectors(bsb); + } +#endif + /* Allow space for bad block log */ + if (super->bblog_size) + bbspace = __le16_to_cpu(super->bblog_size); + + if (st->minor_version < 0) + /* not specified, so time to set default */ + st->minor_version = 2; + + if (data_offset == INVALID_SECTORS) + data_offset = st->data_offset; + + if (data_offset != INVALID_SECTORS) + switch(st->minor_version) { + case 0: + return devsize - data_offset - 8*2 - bbspace; + case 1: + case 2: + return devsize - data_offset; + default: + return 0; + } + + devsize -= bmspace; + + switch(st->minor_version) { + case 0: + /* at end */ + return ((devsize - 8*2 - bbspace ) & ~(4*2-1)); + case 1: + /* at start, 4K for superblock and possible bitmap */ + return devsize - 4*2 - bbspace; + case 2: + /* 4k from start, 4K for superblock and possible bitmap */ + return devsize - (4+4)*2 - bbspace; + } + return 0; +} + +static int +add_internal_bitmap1(struct supertype *st, + int *chunkp, int delay, int write_behind, + unsigned long long size, + int may_change, int major) +{ + /* + * If not may_change, then this is a 'Grow' without sysfs support for + * bitmaps, and the bitmap must fit after the superblock at 1K offset. + * If may_change, then this is create or a Grow with sysfs syupport, + * and we can put the bitmap wherever we like. + * + * size is in sectors, chunk is in bytes !!! 
+ */ + + unsigned long long bits; + unsigned long long max_bits; + unsigned long long min_chunk; + long offset; + long bbl_offset, bbl_size; + unsigned long long chunk = *chunkp; + int room = 0; + int creating = 0; + struct mdp_superblock_1 *sb = st->sb; + bitmap_super_t *bms = (bitmap_super_t*)(((char*)sb) + MAX_SB_SIZE); + int uuid[4]; + + if (__le64_to_cpu(sb->data_size) == 0) + /* Must be creating the array, else data_size would be non-zero */ + creating = 1; + switch(st->minor_version) { + case 0: + /* either 3K after the superblock (when hot-add), + * or some amount of space before. + */ + if (creating) { + /* We are creating array, so we *know* how much room has + * been left. + */ + offset = 0; + bbl_size = 8; + room = choose_bm_space(__le64_to_cpu(sb->size)) + bbl_size; + } else { + room = __le64_to_cpu(sb->super_offset) + - __le64_to_cpu(sb->data_offset) + - __le64_to_cpu(sb->data_size); + bbl_size = __le16_to_cpu(sb->bblog_size); + if (bbl_size < 8) + bbl_size = 8; + bbl_offset = (__s32)__le32_to_cpu(sb->bblog_offset); + if (bbl_size < -bbl_offset) + bbl_size = -bbl_offset; + + if (!may_change || (room < 3*2 && + __le32_to_cpu(sb->max_dev) <= 384)) { + room = 3*2; + offset = 1*2; + bbl_size = 0; + } else { + offset = 0; /* means movable offset */ + } + } + break; + case 1: + case 2: /* between superblock and data */ + if (creating) { + offset = 4*2; + bbl_size = 8; + room = choose_bm_space(__le64_to_cpu(sb->size)) + bbl_size; + } else { + room = __le64_to_cpu(sb->data_offset) + - __le64_to_cpu(sb->super_offset); + bbl_size = __le16_to_cpu(sb->bblog_size); + if (bbl_size) + room = __le32_to_cpu(sb->bblog_offset) + bbl_size; + else + bbl_size = 8; + + if (!may_change) { + room -= 2; /* Leave 1K for superblock */ + offset = 2; + bbl_size = 0; + } else { + room -= 4*2; /* leave 4K for superblock */ + offset = 4*2; + } + } + break; + default: + return 0; + } + + room -= bbl_size; + if (chunk == UnSet && room > 128*2) + /* Limit to 128K of bitmap when chunk 
size not requested */ + room = 128*2; + + if (room <= 1) + /* No room for a bitmap */ + return 0; + + max_bits = (room * 512 - sizeof(bitmap_super_t)) * 8; + + min_chunk = 4096; /* sub-page chunks don't work yet.. */ + bits = (size*512)/min_chunk +1; + while (bits > max_bits) { + min_chunk *= 2; + bits = (bits+1)/2; + } + if (chunk == UnSet) { + /* For practical purpose, 64Meg is a good + * default chunk size for internal bitmaps. + */ + chunk = min_chunk; + if (chunk < 64*1024*1024) + chunk = 64*1024*1024; + } else if (chunk < min_chunk) + return 0; /* chunk size too small */ + if (chunk == 0) /* rounding problem */ + return 0; + + if (offset == 0) { + /* start bitmap on a 4K boundary with enough space for + * the bitmap + */ + bits = (size*512) / chunk + 1; + room = ((bits+7)/8 + sizeof(bitmap_super_t) +4095)/4096; + room *= 8; /* convert 4K blocks to sectors */ + offset = -room - bbl_size; + } + + sb->bitmap_offset = (int32_t)__cpu_to_le32(offset); + + sb->feature_map = __cpu_to_le32(__le32_to_cpu(sb->feature_map) + | MD_FEATURE_BITMAP_OFFSET); + memset(bms, 0, sizeof(*bms)); + bms->magic = __cpu_to_le32(BITMAP_MAGIC); + bms->version = __cpu_to_le32(major); + uuid_from_super1(st, uuid); + memcpy(bms->uuid, uuid, 16); + bms->chunksize = __cpu_to_le32(chunk); + bms->daemon_sleep = __cpu_to_le32(delay); + bms->sync_size = __cpu_to_le64(size); + bms->write_behind = __cpu_to_le32(write_behind); + bms->nodes = __cpu_to_le32(st->nodes); + if (st->nodes) + sb->feature_map = __cpu_to_le32(__le32_to_cpu(sb->feature_map) + | MD_FEATURE_BITMAP_VERSIONED); + if (st->cluster_name) + strncpy((char *)bms->cluster_name, + st->cluster_name, strlen(st->cluster_name)); + + *chunkp = chunk; + return 1; +} + +static int locate_bitmap1(struct supertype *st, int fd) +{ + unsigned long long offset; + struct mdp_superblock_1 *sb; + int mustfree = 0; + int ret; + + if (!st->sb) { + if (st->ss->load_super(st, fd, NULL)) + return -1; /* no error I hope... 
*/ + mustfree = 1; + } + sb = st->sb; + + if ((__le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET)) + ret = 0; + else + ret = -1; + offset = __le64_to_cpu(sb->super_offset); + offset += (int32_t) __le32_to_cpu(sb->bitmap_offset); + if (mustfree) + free(sb); + lseek64(fd, offset<<9, 0); + return ret; +} + +static int write_bitmap1(struct supertype *st, int fd, enum bitmap_update update) +{ + struct mdp_superblock_1 *sb = st->sb; + bitmap_super_t *bms = (bitmap_super_t*)(((char*)sb)+MAX_SB_SIZE); + int rv = 0; + void *buf; + int towrite, n; + struct align_fd afd; + unsigned int i = 0; + unsigned long long total_bm_space, bm_space_per_node; + + switch (update) { + case NameUpdate: + /* update cluster name */ + if (st->cluster_name) { + memset((char *)bms->cluster_name, 0, sizeof(bms->cluster_name)); + strncpy((char *)bms->cluster_name, st->cluster_name, 64); + } + break; + case NodeNumUpdate: + /* cluster md only supports superblock 1.2 now */ + if (st->minor_version != 2) { + pr_err("Warning: cluster md only works with superblock 1.2\n"); + return -EINVAL; + } + + /* Each node has an independent bitmap, it is necessary to calculate the + * space is enough or not, first get how many bytes for the total bitmap */ + bm_space_per_node = calc_bitmap_size(bms, 4096); + + total_bm_space = 512 * (__le64_to_cpu(sb->data_offset) - __le64_to_cpu(sb->super_offset)); + total_bm_space = total_bm_space - 4096; /* leave another 4k for superblock */ + + if (bm_space_per_node * st->nodes > total_bm_space) { + pr_err("Warning: The max num of nodes can't exceed %llu\n", + total_bm_space / bm_space_per_node); + return -ENOMEM; + } + + bms->nodes = __cpu_to_le32(st->nodes); + break; + case NoUpdate: + default: + break; + } + + init_afd(&afd, fd); + + locate_bitmap1(st, fd); + + if (posix_memalign(&buf, 4096, 4096)) + return -ENOMEM; + + do { + /* Only the bitmap[0] should resync + * whole device on initial assembly + */ + if (i) + memset(buf, 0x00, 4096); + else + memset(buf, 0xff, 
4096); + memcpy(buf, (char *)bms, sizeof(bitmap_super_t)); + + towrite = calc_bitmap_size(bms, 4096); + while (towrite > 0) { + n = towrite; + if (n > 4096) + n = 4096; + n = awrite(&afd, buf, n); + if (n > 0) + towrite -= n; + else + break; + if (i) + memset(buf, 0x00, 4096); + else + memset(buf, 0xff, 4096); + } + fsync(fd); + if (towrite) { + rv = -2; + break; + } + } while (++i < __le32_to_cpu(bms->nodes)); + + free(buf); + return rv; +} + +static void free_super1(struct supertype *st) +{ + + if (st->sb) + free(st->sb); + while (st->info) { + struct devinfo *di = st->info; + st->info = di->next; + if (di->fd >= 0) + close(di->fd); + free(di); + } + st->sb = NULL; +} + +#ifndef MDASSEMBLE +static int validate_geometry1(struct supertype *st, int level, + int layout, int raiddisks, + int *chunk, unsigned long long size, + unsigned long long data_offset, + char *subdev, unsigned long long *freesize, + int verbose) +{ + unsigned long long ldsize, devsize; + int bmspace; + unsigned long long headroom; + int fd; + + if (level == LEVEL_CONTAINER) { + if (verbose) + pr_err("1.x metadata does not support containers\n"); + return 0; + } + if (*chunk == UnSet) + *chunk = DEFAULT_CHUNK; + + if (!subdev) + return 1; + + if (st->minor_version < 0) + /* not specified, so time to set default */ + st->minor_version = 2; + + fd = open(subdev, O_RDONLY|O_EXCL, 0); + if (fd < 0) { + if (verbose) + pr_err("super1.x cannot open %s: %s\n", + subdev, strerror(errno)); + return 0; + } + + if (!get_dev_size(fd, subdev, &ldsize)) { + close(fd); + return 0; + } + close(fd); + + devsize = ldsize >> 9; + if (devsize < 24) { + *freesize = 0; + return 0; + } + + /* creating: allow suitable space for bitmap */ + bmspace = choose_bm_space(devsize); + + if (data_offset == INVALID_SECTORS) + data_offset = st->data_offset; + if (data_offset == INVALID_SECTORS) + switch (st->minor_version) { + case 0: + data_offset = 0; + break; + case 1: + case 2: + /* Choose data offset appropriate for this device 
+ * and use as default for whole array. + * The data_offset must allow for bitmap space + * and base metadata, should allow for some headroom + * for reshape, and should be rounded to multiple + * of 1M. + * Headroom is limited to 128M, but aim for about 0.1% + */ + headroom = 128*1024*2; + while ((headroom << 10) > devsize && + (*chunk == 0 || + headroom / 2 >= ((unsigned)(*chunk)*2)*2)) + headroom >>= 1; + data_offset = 12*2 + bmspace + headroom; + #define ONE_MEG (2*1024) + if (data_offset > ONE_MEG) + data_offset = (data_offset / ONE_MEG) * ONE_MEG; + break; + } + if (st->data_offset == INVALID_SECTORS) + st->data_offset = data_offset; + switch(st->minor_version) { + case 0: /* metadata at end. Round down and subtract space to reserve */ + devsize = (devsize & ~(4ULL*2-1)); + /* space for metadata, bblog, bitmap */ + devsize -= 8*2 + 8 + bmspace; + break; + case 1: + case 2: + devsize -= data_offset; + break; + } + *freesize = devsize; + return 1; +} +#endif /* MDASSEMBLE */ + +void *super1_make_v0(struct supertype *st, struct mdinfo *info, mdp_super_t *sb0) +{ + /* Create a v1.0 superblock based on 'info'*/ + void *ret; + struct mdp_superblock_1 *sb; + int i; + int rfd; + unsigned long long offset; + + if (posix_memalign(&ret, 4096, 1024) != 0) + return NULL; + sb = ret; + memset(ret, 0, 1024); + sb->magic = __cpu_to_le32(MD_SB_MAGIC); + sb->major_version = __cpu_to_le32(1); + + copy_uuid(sb->set_uuid, info->uuid, super1.swapuuid); + sprintf(sb->set_name, "%d", sb0->md_minor); + sb->ctime = __cpu_to_le32(info->array.ctime+1); + sb->level = __cpu_to_le32(info->array.level); + sb->layout = __cpu_to_le32(info->array.layout); + sb->size = __cpu_to_le64(info->component_size); + sb->chunksize = __cpu_to_le32(info->array.chunk_size/512); + sb->raid_disks = __cpu_to_le32(info->array.raid_disks); + if (info->array.level > 0) + sb->data_size = sb->size; + else + sb->data_size = st->ss->avail_size(st, st->devsize/512, 0); + sb->resync_offset = MaxSector; + sb->max_dev = 
__cpu_to_le32(MD_SB_DISKS); + sb->dev_number = __cpu_to_le32(info->disk.number); + sb->utime = __cpu_to_le64(info->array.utime); + + offset = st->devsize/512 - 8*2; + offset &= ~(4*2-1); + sb->super_offset = __cpu_to_le64(offset); + //*(__u64*)(st->other + 128 + 8 + 8) = __cpu_to_le64(offset); + + if ((rfd = open("/dev/urandom", O_RDONLY)) < 0 || + read(rfd, sb->device_uuid, 16) != 16) { + __u32 r[4] = {random(), random(), random(), random()}; + memcpy(sb->device_uuid, r, 16); + } + if (rfd >= 0) + close(rfd); + + for (i = 0; i < MD_SB_DISKS; i++) { + int state = sb0->disks[i].state; + sb->dev_roles[i] = MD_DISK_ROLE_SPARE; + if ((state & (1<<MD_DISK_SYNC)) && + !(state & (1<<MD_DISK_FAULTY))) + sb->dev_roles[i] = __cpu_to_le16(sb0->disks[i].raid_disk); + } + sb->sb_csum = calc_sb_1_csum(sb); + return ret; +} + +struct superswitch super1 = { +#ifndef MDASSEMBLE + .examine_super = examine_super1, + .brief_examine_super = brief_examine_super1, + .export_examine_super = export_examine_super1, + .detail_super = detail_super1, + .brief_detail_super = brief_detail_super1, + .export_detail_super = export_detail_super1, + .write_init_super = write_init_super1, + .validate_geometry = validate_geometry1, + .add_to_super = add_to_super1, + .examine_badblocks = examine_badblocks_super1, + .copy_metadata = copy_metadata1, +#endif + .match_home = match_home1, + .uuid_from_super = uuid_from_super1, + .getinfo_super = getinfo_super1, + .container_content = container_content1, + .update_super = update_super1, + .init_super = init_super1, + .store_super = store_super1, + .compare_super = compare_super1, + .load_super = load_super1, + .match_metadata_desc = match_metadata_desc1, + .avail_size = avail_size1, + .add_internal_bitmap = add_internal_bitmap1, + .locate_bitmap = locate_bitmap1, + .write_bitmap = write_bitmap1, + .free_super = free_super1, +#if __BYTE_ORDER == BIG_ENDIAN + .swapuuid = 0, +#else + .swapuuid = 1, +#endif + .name = "1.x", +}; diff --git a/swap_super.c b/swap_super.c new file mode 100644 index 00000000..b6db5743 ---
/dev/null +++ b/swap_super.c @@ -0,0 +1,81 @@ +#include <unistd.h> +#include <stdlib.h> +#include <fcntl.h> +#include <stdio.h> +#include <sys/mount.h> +/* + * This is a tiny test program to endian-swap + * the superblock on a given device. + * We simply read 4k from where the superblock should be + * do the swap, and write it back + * Don't use this on a real array, use mdadm. + */ + +#define MD_RESERVED_BYTES (64 * 1024) +#define MD_RESERVED_SECTORS (MD_RESERVED_BYTES / 512) + +#define MD_NEW_SIZE_SECTORS(x) ((x & ~(MD_RESERVED_SECTORS - 1)) - MD_RESERVED_SECTORS) + +extern long long lseek64(int, long long, int); + +int main(int argc, char *argv[]) +{ + int fd, i; + unsigned long size; + unsigned long long offset; + char super[4096]; + if (argc != 2) { + fprintf(stderr, "Usage: swap_super device\n"); + exit(1); + } + fd = open(argv[1], O_RDWR); + if (fd<0) { + perror(argv[1]); + exit(1); + } + if (ioctl(fd, BLKGETSIZE, &size)) { + perror("BLKGETSIZE"); + exit(1); + } + offset = MD_NEW_SIZE_SECTORS(size) * 512LL; + if (lseek64(fd, offset, 0) < 0LL) { + perror("lseek64"); + exit(1); + } + if (read(fd, super, 4096) != 4096) { + perror("read"); + exit(1); + } + + for (i=0; i < 4096 ; i+=4) { + char t = super[i]; + super[i] = super[i+3]; + super[i+3] = t; + t=super[i+1]; + super[i+1]=super[i+2]; + super[i+2]=t; + } + /* swap the u64 events counters */ + for (i=0; i<4; i++) { + /* events_hi and events_lo */ + char t=super[32*4+7*4 +i]; + super[32*4+7*4 +i] = super[32*4+8*4 +i]; + super[32*4+8*4 +i] = t; + + /* cp_events_hi and cp_events_lo */ + t=super[32*4+9*4 +i]; + super[32*4+9*4 +i] = super[32*4+10*4 +i]; + super[32*4+10*4 +i] = t; + } + + if (lseek64(fd, offset, 0) < 0LL) { + perror("lseek64"); + exit(1); + } + if (write(fd, super, 4096) != 4096) { + perror("write"); + exit(1); + } + exit(0); + +} diff --git a/sysfs.c b/sysfs.c new file mode 100644 index 00000000..26003432 --- /dev/null +++ b/sysfs.c @@ -0,0 +1,931 @@ +/* + * sysfs - extract md related information from sysfs.
Part of: + * mdadm - manage Linux "md" devices aka RAID arrays. + * + * Copyright (C) 2006-2009 Neil Brown <neilb@suse.de> + * + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Author: Neil Brown + * Email: <neilb@suse.de> + */ + +#include "mdadm.h" +#include <dirent.h> +#include <ctype.h> + +int load_sys(char *path, char *buf) +{ + int fd = open(path, O_RDONLY); + int n; + if (fd < 0) + return -1; + n = read(fd, buf, 1024); + close(fd); + if (n <0 || n >= 1024) + return -1; + buf[n] = 0; + if (n && buf[n-1] == '\n') + buf[n-1] = 0; + return 0; +} + +void sysfs_free(struct mdinfo *sra) +{ + while (sra) { + struct mdinfo *sra2 = sra->next; + while (sra->devs) { + struct mdinfo *d = sra->devs; + sra->devs = d->next; + free(d); + } + free(sra); + sra = sra2; + } +} + +int sysfs_open(char *devnm, char *devname, char *attr) +{ + char fname[50]; + int fd; + + sprintf(fname, "/sys/block/%s/md/", devnm); + if (devname) { + strcat(fname, devname); + strcat(fname, "/"); + } + strcat(fname, attr); + fd = open(fname, O_RDWR); + if (fd < 0 && errno == EACCES) + fd = open(fname, O_RDONLY); + return fd; +} + +void sysfs_init_dev(struct mdinfo *mdi, unsigned long devid) +{ + snprintf(mdi->sys_name, + sizeof(mdi->sys_name), "dev-%s", devid2kname(devid)); +} + +void sysfs_init(struct mdinfo *mdi, int fd, char *devnm) +{ + mdi->sys_name[0] = 0; + if (fd >= 0) { +
mdu_version_t vers; + if (ioctl(fd, RAID_VERSION, &vers) != 0) + return; + devnm = fd2devnm(fd); + } + if (devnm == NULL) + return; + strcpy(mdi->sys_name, devnm); +} + +struct mdinfo *sysfs_read(int fd, char *devnm, unsigned long options) +{ + char fname[PATH_MAX]; + char buf[PATH_MAX]; + char *base; + char *dbase; + struct mdinfo *sra; + struct mdinfo *dev, **devp; + DIR *dir = NULL; + struct dirent *de; + + sra = xcalloc(1, sizeof(*sra)); + sysfs_init(sra, fd, devnm); + if (sra->sys_name[0] == 0) { + free(sra); + return NULL; + } + + sprintf(fname, "/sys/block/%s/md/", sra->sys_name); + base = fname + strlen(fname); + + sra->devs = NULL; + if (options & GET_VERSION) { + strcpy(base, "metadata_version"); + if (load_sys(fname, buf)) + goto abort; + if (strncmp(buf, "none", 4) == 0) { + sra->array.major_version = + sra->array.minor_version = -1; + strcpy(sra->text_version, ""); + } else if (strncmp(buf, "external:", 9) == 0) { + sra->array.major_version = -1; + sra->array.minor_version = -2; + strcpy(sra->text_version, buf+9); + } else { + sscanf(buf, "%d.%d", + &sra->array.major_version, + &sra->array.minor_version); + strcpy(sra->text_version, buf); + } + } + if (options & GET_LEVEL) { + strcpy(base, "level"); + if (load_sys(fname, buf)) + goto abort; + sra->array.level = map_name(pers, buf); + } + if (options & GET_LAYOUT) { + strcpy(base, "layout"); + if (load_sys(fname, buf)) + goto abort; + sra->array.layout = strtoul(buf, NULL, 0); + } + if (options & GET_DISKS) { + strcpy(base, "raid_disks"); + if (load_sys(fname, buf)) + goto abort; + sra->array.raid_disks = strtoul(buf, NULL, 0); + } + if (options & GET_DEGRADED) { + strcpy(base, "degraded"); + if (load_sys(fname, buf)) + goto abort; + sra->array.failed_disks = strtoul(buf, NULL, 0); + } + if (options & GET_COMPONENT) { + strcpy(base, "component_size"); + if (load_sys(fname, buf)) + goto abort; + sra->component_size = strtoull(buf, NULL, 0); + /* sysfs reports "K", but we want sectors */ + 
sra->component_size *= 2; + } + if (options & GET_CHUNK) { + strcpy(base, "chunk_size"); + if (load_sys(fname, buf)) + goto abort; + sra->array.chunk_size = strtoul(buf, NULL, 0); + } + if (options & GET_CACHE) { + strcpy(base, "stripe_cache_size"); + if (load_sys(fname, buf)) + /* Probably level doesn't support it */ + sra->cache_size = 0; + else + sra->cache_size = strtoul(buf, NULL, 0); + } + if (options & GET_MISMATCH) { + strcpy(base, "mismatch_cnt"); + if (load_sys(fname, buf)) + goto abort; + sra->mismatch_cnt = strtoul(buf, NULL, 0); + } + if (options & GET_SAFEMODE) { + int scale = 1; + int dot = 0; + unsigned i; + unsigned long msec; + size_t len; + + strcpy(base, "safe_mode_delay"); + if (load_sys(fname, buf)) + goto abort; + + /* remove a period, and count digits after it */ + len = strlen(buf); + for (i = 0; i < len; i++) { + if (dot) { + if (isdigit(buf[i])) { + buf[i-1] = buf[i]; + scale *= 10; + } + buf[i] = 0; + } else if (buf[i] == '.') { + dot=1; + buf[i] = 0; + } + } + msec = strtoul(buf, NULL, 10); + msec = (msec * 1000) / scale; + sra->safe_mode_delay = msec; + } + if (options & GET_BITMAP_LOCATION) { + strcpy(base, "bitmap/location"); + if (load_sys(fname, buf)) + goto abort; + if (strncmp(buf, "file", 4) == 0) + sra->bitmap_offset = 1; + else if (strncmp(buf, "none", 4) == 0) + sra->bitmap_offset = 0; + else if (buf[0] == '+') + sra->bitmap_offset = strtol(buf+1, NULL, 10); + else + goto abort; + } + + if (options & GET_ARRAY_STATE) { + strcpy(base, "array_state"); + if (load_sys(fname, sra->sysfs_array_state)) + goto abort; + } else + sra->sysfs_array_state[0] = 0; + + if (! 
(options & GET_DEVS)) + return sra; + + /* Get all the devices as well */ + *base = 0; + dir = opendir(fname); + if (!dir) + goto abort; + sra->array.spare_disks = 0; + + devp = &sra->devs; + sra->devs = NULL; + while ((de = readdir(dir)) != NULL) { + char *ep; + if (de->d_ino == 0 || + strncmp(de->d_name, "dev-", 4) != 0) + continue; + strcpy(base, de->d_name); + dbase = base + strlen(base); + *dbase++ = '/'; + + dev = xmalloc(sizeof(*dev)); + + /* Always get slot, major, minor */ + strcpy(dbase, "slot"); + if (load_sys(fname, buf)) { + /* hmm... unable to read 'slot' maybe the device + * is going away? + */ + strcpy(dbase, "block"); + if (readlink(fname, buf, sizeof(buf)) < 0 && + errno != ENAMETOOLONG) { + /* ...yup device is gone */ + free(dev); + continue; + } else { + /* slot is unreadable but 'block' link + * still intact... something bad is happening + * so abort + */ + free(dev); + goto abort; + } + + } + strcpy(dev->sys_name, de->d_name); + dev->disk.raid_disk = strtoul(buf, &ep, 10); + if (*ep) dev->disk.raid_disk = -1; + + strcpy(dbase, "block/dev"); + if (load_sys(fname, buf)) { + /* assume this is a stale reference to a hot + * removed device + */ + free(dev); + continue; + } + sra->array.nr_disks++; + sscanf(buf, "%d:%d", &dev->disk.major, &dev->disk.minor); + + /* special case check for block devices that can go 'offline' */ + strcpy(dbase, "block/device/state"); + if (load_sys(fname, buf) == 0 && + strncmp(buf, "offline", 7) == 0) { + free(dev); + continue; + } + + /* finally add this disk to the array */ + *devp = dev; + devp = & dev->next; + dev->next = NULL; + + if (options & GET_OFFSET) { + strcpy(dbase, "offset"); + if (load_sys(fname, buf)) + goto abort; + dev->data_offset = strtoull(buf, NULL, 0); + strcpy(dbase, "new_offset"); + if (load_sys(fname, buf) == 0) + dev->new_data_offset = strtoull(buf, NULL, 0); + else + dev->new_data_offset = dev->data_offset; + } + if (options & GET_SIZE) { + strcpy(dbase, "size"); + if (load_sys(fname, buf)) 
+ goto abort; + dev->component_size = strtoull(buf, NULL, 0) * 2; + } + if (options & GET_STATE) { + dev->disk.state = 0; + strcpy(dbase, "state"); + if (load_sys(fname, buf)) + goto abort; + if (strstr(buf, "in_sync")) + dev->disk.state |= (1<<MD_DISK_SYNC); + if (strstr(buf, "faulty")) + dev->disk.state |= (1<<MD_DISK_FAULTY); + if (dev->disk.state == 0) + sra->array.spare_disks++; + } + if (options & GET_ERROR) { + strcpy(buf, "errors"); + if (load_sys(fname, buf)) + goto abort; + dev->errors = strtoul(buf, NULL, 0); + } + } + closedir(dir); + return sra; + + abort: + if (dir) + closedir(dir); + sysfs_free(sra); + return NULL; +} + +int sysfs_attr_match(const char *attr, const char *str) +{ + /* See if attr, read from a sysfs file, matches + * str. They must either be the same, or attr can + * have a trailing newline or comma + */ + while (*attr && *str && *attr == *str) { + attr++; + str++; + } + + if (*str || (*attr && *attr != ',' && *attr != '\n')) + return 0; + return 1; +} + +int sysfs_match_word(const char *word, char **list) +{ + int n; + for (n=0; list[n]; n++) + if (sysfs_attr_match(word, list[n])) + break; + return n; +} + +unsigned long long get_component_size(int fd) +{ + /* Find out the component size of the array. + * We cannot trust GET_ARRAY_INFO ioctl as it's + * size field is only 32bits. + * So look in /sys/block/mdXXX/md/component_size + * + * This returns in units of sectors.
+ */ + struct stat stb; + char fname[50]; + int n; + if (fstat(fd, &stb)) return 0; + if (major(stb.st_rdev) != (unsigned)get_mdp_major()) + sprintf(fname, "/sys/block/md%d/md/component_size", + (int)minor(stb.st_rdev)); + else + sprintf(fname, "/sys/block/md_d%d/md/component_size", + (int)minor(stb.st_rdev)>>MdpMinorShift); + fd = open(fname, O_RDONLY); + if (fd < 0) + return 0; + n = read(fd, fname, sizeof(fname)); + close(fd); + if (n < 0 || n == sizeof(fname)) + return 0; + fname[n] = 0; + return strtoull(fname, NULL, 10) * 2; +} + +int sysfs_set_str(struct mdinfo *sra, struct mdinfo *dev, + char *name, char *val) +{ + char fname[50]; + unsigned int n; + int fd; + + sprintf(fname, "/sys/block/%s/md/%s/%s", + sra->sys_name, dev?dev->sys_name:"", name); + fd = open(fname, O_WRONLY); + if (fd < 0) + return -1; + n = write(fd, val, strlen(val)); + close(fd); + if (n != strlen(val)) { + dprintf("failed to write '%s' to '%s' (%s)\n", + val, fname, strerror(errno)); + return -1; + } + return 0; +} + +int sysfs_set_num(struct mdinfo *sra, struct mdinfo *dev, + char *name, unsigned long long val) +{ + char valstr[50]; + sprintf(valstr, "%llu", val); + return sysfs_set_str(sra, dev, name, valstr); +} + +int sysfs_set_num_signed(struct mdinfo *sra, struct mdinfo *dev, + char *name, long long val) +{ + char valstr[50]; + sprintf(valstr, "%lli", val); + return sysfs_set_str(sra, dev, name, valstr); +} + +int sysfs_uevent(struct mdinfo *sra, char *event) +{ + char fname[50]; + int n; + int fd; + + sprintf(fname, "/sys/block/%s/uevent", + sra->sys_name); + fd = open(fname, O_WRONLY); + if (fd < 0) + return -1; + n = write(fd, event, strlen(event)); + close(fd); + if (n != (int)strlen(event)) { + dprintf("failed to write '%s' to '%s' (%s)\n", + event, fname, strerror(errno)); + return -1; + } + return 0; +} + +int sysfs_attribute_available(struct mdinfo *sra, struct mdinfo *dev, char *name) +{ + char fname[50]; + struct stat st; + + sprintf(fname, "/sys/block/%s/md/%s/%s", + 
sra->sys_name, dev?dev->sys_name:"", name); + + return stat(fname, &st) == 0; +} + +int sysfs_get_fd(struct mdinfo *sra, struct mdinfo *dev, + char *name) +{ + char fname[50]; + int fd; + + sprintf(fname, "/sys/block/%s/md/%s/%s", + sra->sys_name, dev?dev->sys_name:"", name); + fd = open(fname, O_RDWR); + if (fd < 0) + fd = open(fname, O_RDONLY); + return fd; +} + +int sysfs_fd_get_ll(int fd, unsigned long long *val) +{ + char buf[50]; + int n; + char *ep; + + lseek(fd, 0, 0); + n = read(fd, buf, sizeof(buf)); + if (n <= 0 || n == sizeof(buf)) + return -2; + buf[n] = 0; + *val = strtoull(buf, &ep, 0); + if (ep == buf || (*ep != 0 && *ep != '\n' && *ep != ' ')) + return -1; + return 0; +} + +int sysfs_get_ll(struct mdinfo *sra, struct mdinfo *dev, + char *name, unsigned long long *val) +{ + int n; + int fd; + + fd = sysfs_get_fd(sra, dev, name); + if (fd < 0) + return -1; + n = sysfs_fd_get_ll(fd, val); + close(fd); + return n; +} + +int sysfs_fd_get_two(int fd, unsigned long long *v1, unsigned long long *v2) +{ + /* two numbers in this sysfs file, either + * NNN (NNN) + * or + * NNN / NNN + */ + char buf[80]; + int n; + char *ep, *ep2; + + lseek(fd, 0, 0); + n = read(fd, buf, sizeof(buf)); + if (n <= 0 || n == sizeof(buf)) + return -2; + buf[n] = 0; + *v1 = strtoull(buf, &ep, 0); + if (ep == buf || (*ep != 0 && *ep != '\n' && *ep != ' ')) + return -1; + while (*ep == ' ' || *ep == '/' || *ep == '(') + ep++; + *v2 = strtoull(ep, &ep2, 0); + if (ep2 == ep || (*ep2 != 0 && *ep2 != '\n' && *ep2 != ' ' && *ep2 != ')')) { + *v2 = *v1; + return 1; + } + return 2; +} + +int sysfs_get_two(struct mdinfo *sra, struct mdinfo *dev, + char *name, unsigned long long *v1, unsigned long long *v2) +{ + int n; + int fd; + + fd = sysfs_get_fd(sra, dev, name); + if (fd < 0) + return -1; + n = sysfs_fd_get_two(fd, v1, v2); + close(fd); + return n; +} + +int sysfs_fd_get_str(int fd, char *val, int size) +{ + int n; + + lseek(fd, 0, 0); + n = read(fd, val, size); + if (n <= 0 || n == 
size) + return -1; + val[n] = 0; + return n; +} + +int sysfs_get_str(struct mdinfo *sra, struct mdinfo *dev, + char *name, char *val, int size) +{ + int n; + int fd; + + fd = sysfs_get_fd(sra, dev, name); + if (fd < 0) + return -1; + n = sysfs_fd_get_str(fd, val, size); + close(fd); + return n; +} + +int sysfs_set_safemode(struct mdinfo *sra, unsigned long ms) +{ + unsigned long sec; + unsigned long msec; + char delay[30]; + + sec = ms / 1000; + msec = ms % 1000; + + sprintf(delay, "%ld.%03ld\n", sec, msec); + /* this '\n' ^ needed for kernels older than 2.6.28 */ + return sysfs_set_str(sra, NULL, "safe_mode_delay", delay); +} + +int sysfs_set_array(struct mdinfo *info, int vers) +{ + int rv = 0; + char ver[100]; + int raid_disks = info->array.raid_disks; + + ver[0] = 0; + if (info->array.major_version == -1 && + info->array.minor_version == -2) { + char buf[1024]; + + strcat(strcpy(ver, "external:"), info->text_version); + + /* meta version might already be set if we are setting + * new geometry for a reshape. In that case we don't + * want to over-write the 'readonly' flag that is + * stored in the metadata version. 
So read the current + * version first, and preserve the flag + */ + if (sysfs_get_str(info, NULL, "metadata_version", + buf, 1024) > 0) + if (strlen(buf) >= 9 && buf[9] == '-') + ver[9] = '-'; + + if ((vers % 100) < 2 || + sysfs_set_str(info, NULL, "metadata_version", + ver) < 0) { + pr_err("This kernel does not support external metadata.\n"); + return 1; + } + } + if (info->array.level < 0) + return 0; /* FIXME */ + rv |= sysfs_set_str(info, NULL, "level", + map_num(pers, info->array.level)); + if (info->reshape_active && info->delta_disks != UnSet) + raid_disks -= info->delta_disks; + rv |= sysfs_set_num(info, NULL, "raid_disks", raid_disks); + rv |= sysfs_set_num(info, NULL, "chunk_size", info->array.chunk_size); + rv |= sysfs_set_num(info, NULL, "layout", info->array.layout); + rv |= sysfs_set_num(info, NULL, "component_size", info->component_size/2); + if (info->custom_array_size) { + int rc; + + rc = sysfs_set_num(info, NULL, "array_size", + info->custom_array_size/2); + if (rc && errno == ENOENT) { + pr_err("This kernel does not have the md/array_size attribute, the array may be larger than expected\n"); + rc = 0; + } + rv |= rc; + } + + if (info->array.level > 0) + rv |= sysfs_set_num(info, NULL, "resync_start", info->resync_start); + + if (info->reshape_active) { + rv |= sysfs_set_num(info, NULL, "reshape_position", + info->reshape_progress); + rv |= sysfs_set_num(info, NULL, "chunk_size", info->new_chunk); + rv |= sysfs_set_num(info, NULL, "layout", info->new_layout); + rv |= sysfs_set_num(info, NULL, "raid_disks", + info->array.raid_disks); + /* We don't set 'new_level' here. That can only happen + * once the reshape completes. 
+ */ + } + return rv; +} + +int sysfs_add_disk(struct mdinfo *sra, struct mdinfo *sd, int resume) +{ + char dv[PATH_MAX]; + char nm[PATH_MAX]; + char *dname; + int rv; + + sprintf(dv, "%d:%d", sd->disk.major, sd->disk.minor); + rv = sysfs_set_str(sra, NULL, "new_dev", dv); + if (rv) + return rv; + + memset(nm, 0, sizeof(nm)); + dname = devid2kname(makedev(sd->disk.major, sd->disk.minor)); + strcpy(sd->sys_name, "dev-"); + strcpy(sd->sys_name+4, dname); + + /* test write to see if 'recovery_start' is available */ + if (resume && sd->recovery_start < MaxSector && + sysfs_set_num(sra, sd, "recovery_start", 0)) { + sysfs_set_str(sra, sd, "state", "remove"); + return -1; + } + + rv = sysfs_set_num(sra, sd, "offset", sd->data_offset); + rv |= sysfs_set_num(sra, sd, "size", (sd->component_size+1) / 2); + if (sra->array.level != LEVEL_CONTAINER) { + if (sd->recovery_start == MaxSector) + /* This can correctly fail if array isn't started, + * yet, so just ignore status for now. + */ + sysfs_set_str(sra, sd, "state", "insync"); + if (sd->disk.raid_disk >= 0) + rv |= sysfs_set_num(sra, sd, "slot", sd->disk.raid_disk); + if (resume) + sysfs_set_num(sra, sd, "recovery_start", sd->recovery_start); + } + return rv; +} + +#if 0 +int sysfs_disk_to_sg(int fd) +{ + /* from an open block device, try find and open its corresponding + * scsi_generic interface + */ + struct stat st; + char path[256]; + char sg_path[256]; + char sg_major_minor[10]; + char *c; + DIR *dir; + struct dirent *de; + int major, minor, rv; + + if (fstat(fd, &st)) + return -1; + + snprintf(path, sizeof(path), "/sys/dev/block/%d:%d/device", + major(st.st_rdev), minor(st.st_rdev)); + + dir = opendir(path); + if (!dir) + return -1; + + de = readdir(dir); + while (de) { + if (strncmp("scsi_generic:", de->d_name, + strlen("scsi_generic:")) == 0) + break; + de = readdir(dir); + } + closedir(dir); + + if (!de) + return -1; + + snprintf(sg_path, sizeof(sg_path), "%s/%s/dev", path, de->d_name); + fd = open(sg_path, 
O_RDONLY); + if (fd < 0) + return fd; + + rv = read(fd, sg_major_minor, sizeof(sg_major_minor)); + close(fd); + if (rv < 0 || rv == sizeof(sg_major_minor)) + return -1; + else + sg_major_minor[rv - 1] = '\0'; + + c = strchr(sg_major_minor, ':'); + *c = '\0'; + c++; + major = strtol(sg_major_minor, NULL, 10); + minor = strtol(c, NULL, 10); + snprintf(path, sizeof(path), "/dev/.tmp.md.%d:%d:%d", + (int) getpid(), major, minor); + if (mknod(path, S_IFCHR|0600, makedev(major, minor))==0) { + fd = open(path, O_RDONLY); + unlink(path); + return fd; + } + + return -1; +} +#endif + +int sysfs_disk_to_scsi_id(int fd, __u32 *id) +{ + /* from an open block device, try to retrieve it scsi_id */ + struct stat st; + char path[256]; + DIR *dir; + struct dirent *de; + int host, bus, target, lun; + + if (fstat(fd, &st)) + return 1; + + snprintf(path, sizeof(path), "/sys/dev/block/%d:%d/device/scsi_device", + major(st.st_rdev), minor(st.st_rdev)); + + dir = opendir(path); + if (!dir) + return 1; + + for (de = readdir(dir); de; de = readdir(dir)) { + int count; + + if (de->d_type != DT_DIR) + continue; + + count = sscanf(de->d_name, "%d:%d:%d:%d", &host, &bus, &target, &lun); + if (count == 4) + break; + } + closedir(dir); + + if (!de) + return 1; + + *id = (host << 24) | (bus << 16) | (target << 8) | (lun << 0); + return 0; +} + +int sysfs_unique_holder(char *devnm, long rdev) +{ + /* Check that devnm is a holder of rdev, + * and is the only holder. + * we should be locked against races by + * an O_EXCL on devnm + * Return values: + * 0 - not unique, not even a holder + * 1 - unique, this is the only holder. 
+ * 2/3 - not unique, there is another holder + * -1 - error, cannot find the holders + */ + DIR *dir; + struct dirent *de; + char dirname[100]; + char l; + int ret = 0; + sprintf(dirname, "/sys/dev/block/%d:%d/holders", + major(rdev), minor(rdev)); + dir = opendir(dirname); + if (!dir) + return -1; + l = strlen(dirname); + while ((de = readdir(dir)) != NULL) { + char buf[100]; + char *sl; + int n; + + if (de->d_ino == 0) + continue; + if (de->d_name[0] == '.') + continue; + strcpy(dirname+l, "/"); + strcat(dirname+l, de->d_name); + n = readlink(dirname, buf, sizeof(buf)-1); + if (n <= 0) + continue; + buf[n] = 0; + sl = strrchr(buf, '/'); + if (!sl) + continue; + sl++; + + if (strcmp(devnm, sl) == 0) + ret |= 1; + else + ret |= 2; + } + closedir(dir); + return ret; +} + +int sysfs_freeze_array(struct mdinfo *sra) +{ + /* Try to freeze resync/rebuild on this array/container. + * Return -1 if the array is busy, + * return 0 if this kernel doesn't support 'frozen' + * return 1 if it worked. + */ + char buf[20]; + + if (!sysfs_attribute_available(sra, NULL, "sync_action")) + return 1; /* no sync_action == frozen */ + if (sysfs_get_str(sra, NULL, "sync_action", buf, 20) <= 0) + return 0; + if (strcmp(buf, "frozen\n") == 0) + /* Already frozen */ + return 0; + if (strcmp(buf, "idle\n") != 0 && strcmp(buf, "recover\n") != 0) + return -1; + if (sysfs_set_str(sra, NULL, "sync_action", "frozen") < 0) + return 0; + return 1; +} + +int sysfs_wait(int fd, int *msec) +{ + /* Wait up to '*msec' for fd to have an exception condition. + * if msec == NULL, wait indefinitely. 
+ */ + fd_set fds; + int n; + FD_ZERO(&fds); + FD_SET(fd, &fds); + if (msec == NULL) + n = select(fd+1, NULL, NULL, &fds, NULL); + else if (*msec < 0) + n = 0; + else { + struct timeval start, end, tv; + gettimeofday(&start, NULL); + if (*msec < 1000) { + tv.tv_sec = 0; + tv.tv_usec = (*msec)*1000; + } else { + tv.tv_sec = (*msec)/1000; + tv.tv_usec = 0; + } + n = select(fd+1, NULL, NULL, &fds, &tv); + gettimeofday(&end, NULL); + end.tv_sec -= start.tv_sec; + *msec -= (end.tv_sec * 1000 + end.tv_usec/1000 + - start.tv_usec/1000) + 1; + } + return n; +} diff --git a/systemd/SUSE-mdadm_env.sh b/systemd/SUSE-mdadm_env.sh new file mode 100644 index 00000000..10b2e749 --- /dev/null +++ b/systemd/SUSE-mdadm_env.sh @@ -0,0 +1,45 @@ +#!/bin/sh + +# extract configuration from /etc/sysconfig/mdadm and write +# environment to /run/sysconfig/mdadm to be used by +# systemd unit files. + +MDADM_SCAN="yes" + +# Following adapted from /etc/init.d/mdadmd on openSUSE + +mdadmd_CONFIG=/etc/sysconfig/mdadm +if test -r $mdadmd_CONFIG; then + . $mdadmd_CONFIG +fi + +if [ x$MDADM_DELAY != x"" ]; then + MDADM_DELAY="-d "$MDADM_DELAY; +fi + +if [ x$MDADM_MAIL != x"" ]; then + MDADM_MAIL="-m \"$MDADM_MAIL\"" +fi + +if [ x$MDADM_PROGRAM != x"" ]; then + MDADM_PROGRAM="-p \"$MDADM_PROGRAM\"" +fi + +if [ x$MDADM_SCAN = x"yes" ]; then + MDADM_SCAN="--scan" +else + MDADM_SCAN="" +fi + +if [ x$MDADM_SEND_MAIL_ON_START = x"yes" ]; then + MDADM_SEND_MAIL="-t" +else + MDADM_SEND_MAIL="" +fi + +if [ x$MDADM_CONFIG != x"" ]; then + MDADM_CONFIG="-c \"$MDADM_CONFIG\"" +fi + +mkdir -p /run/sysconfig +echo "MDADM_MONITOR_ARGS=$MDADM_RAIDDEVICES $MDADM_DELAY $MDADM_MAIL $MDADM_PROGRAM $MDADM_SCAN $MDADM_SEND_MAIL $MDADM_CONFIG" > /run/sysconfig/mdadm diff --git a/systemd/mdadm-grow-continue@.service b/systemd/mdadm-grow-continue@.service new file mode 100644 index 00000000..5c667d2a --- /dev/null +++ b/systemd/mdadm-grow-continue@.service @@ -0,0 +1,17 @@ +# This file is part of mdadm. 
+# +# mdadm is free software; you can redistribute it and/or modify it +# under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. + +[Unit] +Description=Manage MD Reshape on /dev/%I +DefaultDependencies=no + +[Service] +ExecStart=BINDIR/mdadm --grow --continue /dev/%I +StandardInput=null +StandardOutput=null +StandardError=null +KillMode=none diff --git a/systemd/mdadm-last-resort@.service b/systemd/mdadm-last-resort@.service new file mode 100644 index 00000000..e93d72b2 --- /dev/null +++ b/systemd/mdadm-last-resort@.service @@ -0,0 +1,8 @@ +[Unit] +Description=Activate md array even though degraded +DefaultDependencies=no +Conflicts=sys-devices-virtual-block-%i.device + +[Service] +Type=oneshot +ExecStart=BINDIR/mdadm --run /dev/%i diff --git a/systemd/mdadm-last-resort@.timer b/systemd/mdadm-last-resort@.timer new file mode 100644 index 00000000..52b3f227 --- /dev/null +++ b/systemd/mdadm-last-resort@.timer @@ -0,0 +1,7 @@ +[Unit] +Description=Timer to wait for more drives before activating degraded array. +DefaultDependencies=no +Conflicts=sys-devices-virtual-block-%i.device + +[Timer] +OnActiveSec=30 diff --git a/systemd/mdadm.shutdown b/systemd/mdadm.shutdown new file mode 100644 index 00000000..33f27783 --- /dev/null +++ b/systemd/mdadm.shutdown @@ -0,0 +1,4 @@ +#!/bin/sh +# We need to ensure all md arrays with external metadata +# (e.g. IMSM, DDF) are clean before completing the shutdown. +BINDIR/mdadm --wait-clean --scan diff --git a/systemd/mdmon@.service b/systemd/mdmon@.service new file mode 100644 index 00000000..85a3a7c5 --- /dev/null +++ b/systemd/mdmon@.service @@ -0,0 +1,28 @@ +# This file is part of mdadm. 
+# +# mdadm is free software; you can redistribute it and/or modify it +# under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. + +[Unit] +Description=MD Metadata Monitor on /dev/%I +DefaultDependencies=no +Before=initrd-switch-root.target + +[Service] +# mdmon should never complain due to lack of a platform, +# that is mdadm's job if at all. +Environment=IMSM_NO_PLATFORM=1 +# The mdmon starting in the initramfs (with dracut at least) +# cannot see sysfs after root is mounted, so we will have to +# 'takeover'. As the '--offroot --takeover' don't hurt when +# not necessary, and are useful with root-on-md in dracut, +# have them always present. +ExecStart=BINDIR/mdmon --offroot --takeover %I +Type=forking +# Don't set the PIDFile. It isn't necessary (systemd can work +# it out) and systemd will remove it when transitioning from +# initramfs to rootfs. +#PIDFile=/run/mdadm/%I.pid +KillMode=none diff --git a/systemd/mdmonitor.service b/systemd/mdmonitor.service new file mode 100644 index 00000000..9aff2f56 --- /dev/null +++ b/systemd/mdmonitor.service @@ -0,0 +1,13 @@ +# This file is part of mdadm. +# +# mdadm is free software; you can redistribute it and/or modify it +# under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. + +[Unit] +Description=MD array monitor +DefaultDependencies=no + +[Service] +ExecStart=BINDIR/mdadm --monitor --scan diff --git a/test b/test new file mode 100755 index 00000000..13f1bda7 --- /dev/null +++ b/test @@ -0,0 +1,440 @@ +#!/bin/bash +# +# run test suite for mdadm +user=`id -un` +if [ " $user" != " root" ] +then echo >&2 "test: testing can only be done as 'root'." + exit 1; +fi + +prefix='[0-9][0-9]' + +dir=`pwd` +mdadm=$dir/mdadm +if [ \! -x $mdadm ] +then + echo >&2 "test: $mdadm isn't usable."
+fi + +testdir="tests" +logdir="$testdir/logs" +logsave=0 +exitonerror=1 + +echo "Testing on linux-$(uname -r) kernel" + +# Check whether to run multipath tests +modprobe multipath 2> /dev/null +if grep -s 'Personalities : .*multipath' > /dev/null /proc/mdstat ; then + MULTIPATH="yes" +fi +INTEGRITY=yes +DEVTYPE=loop +LVM_VOLGROUP=mdtest + +# make sure to test local mdmon, not system one +export MDADM_NO_SYSTEMCTL=1 + +# assume md0, md1, md2 exist in /dev +md0=/dev/md0 md1=/dev/md1 md2=/dev/md2 +mdp0=/dev/md_d0 +mdp1=/dev/md_d1 + +# We test mdadm on loop-back block devices. +# dir for storing files should be settable by command line maybe +targetdir=/var/tmp +size=20000 +# super0, round down to multiple of 64 and subtract 64 +mdsize0=19904 +# super00 is nested, subtract 128 +mdsize00=19840 +# super1.0 round down to multiple of 2, subtract 8 +mdsize1=19992 +mdsize1a=19988 +mdsize12=19988 +# super1.2 for linear: round to multiple of 2, subtract 4 +mdsize1_l=19996 +mdsize2_l=19996 +# subtract another 4 for bitmaps +mdsize1b=19988 +mdsize11=19992 +mdsize11a=19456 +mdsize12=19988 + +# ddf needs bigger devices as 32Meg is reserved! +ddfsize=65536 + +config=/tmp/mdadm.conf + +cleanup() { + udevadm settle + $mdadm -Ssq 2> /dev/null + case $DEVTYPE in + loop) + for d in 0 1 2 3 4 5 6 7 8 9 10 11 12 13 + do + losetup -d /dev/loop$d ; # rm -f $targetdir/mdtest$d + rm -f /dev/disk/by-path/loop* + done + ;; + lvm) + for d in 0 1 2 3 4 5 6 7 8 9 10 11 12 13 + do + eval "lvremove --quiet -f \$dev$d" + done + ;; + esac +} + +ctrl_c() { + exitonerror=1 +} + +do_setup() { + trap cleanup 0 1 3 15 + trap ctrl_c 2 + + # make sure there are no loop devices remaining.
+ # udev started things can sometimes prevent them being stopped + # immediately + while grep loop /proc/partitions > /dev/null 2>&1 + do + mdadm -Ss + losetup -d /dev/loop[0-9]* 2> /dev/null + sleep 1 + done + devlist= + for d in 0 1 2 3 4 5 6 7 8 9 10 11 12 13 + do + sz=$size + if [ $d -gt 7 ]; then sz=$ddfsize ; fi + case $DEVTYPE in + loop) + [ -f $targetdir/mdtest$d ] || dd if=/dev/zero of=$targetdir/mdtest$d count=$sz bs=1K > /dev/null 2>&1 + # make sure udev doesn't touch + mdadm --zero $targetdir/mdtest$d 2> /dev/null + [ -b /dev/loop$d ] || mknod /dev/loop$d b 7 $d + if [ $d -eq 7 ] + then + losetup /dev/loop$d $targetdir/mdtest6 # for multipath use + else + losetup /dev/loop$d $targetdir/mdtest$d + fi + eval dev$d=/dev/loop$d + eval file$d=$targetdir/mdtest$d + ;; + lvm) + unset MULTIPATH + eval dev$d=/dev/mapper/${LVM_VOLGROUP}-mdtest$d + if ! lvcreate --quiet -L ${sz}K -n mdtest$d $LVM_VOLGROUP; then + trap '' 0 # make sure lvremove is not called + eval echo error creating \$dev$d + exit 129 + fi + ;; + ram) + unset MULTIPATH + eval dev$d=/dev/ram$d + ;; + esac + eval devlist=\"\$devlist \$dev$d\" + eval devlist$d=\"\$devlist\" + #" <-- add this quote to un-confuse vim syntax highlighting + done + path0=$dev6 + path1=$dev7 + + ulimit -c unlimited + [ -f /proc/mdstat ] || modprobe md_mod + echo 2000 > /proc/sys/dev/raid/speed_limit_max + echo 0 > /sys/module/md_mod/parameters/start_ro +} + +# mdadm always adds --quiet, and we want to see any unexpected messages +mdadm() { + rm -f $targetdir/stderr + case $* in + *-S* ) udevadm settle + p=`cat /proc/sys/dev/raid/speed_limit_max` + echo 20000 > /proc/sys/dev/raid/speed_limit_max + esac + case $* in + *-C* ) $mdadm 2> $targetdir/stderr --quiet "$@" --auto=yes;; + * ) $mdadm 2> $targetdir/stderr --quiet "$@" + esac + rv=$? 
+ case $* in + *-S* ) udevadm settle + echo $p > /proc/sys/dev/raid/speed_limit_max + esac + cat >&2 $targetdir/stderr + return $rv +} + +# check various things +check() { + case $1 in + spares ) + spares=`tr '] ' '\012\012' < /proc/mdstat | grep -c '(S)' || exit 0` + if [ $spares -ne $2 ] + then + echo >&2 "ERROR expected $2 spares, found $spares"; exit 1; + fi + ;; + raid* | linear ) + grep -s "active $1 " /proc/mdstat > /dev/null || { + echo >&2 "ERROR active $1 not found" ; cat /proc/mdstat ; exit 1;} + ;; + algorithm ) + grep -s " algorithm $2 " /proc/mdstat > /dev/null || { + echo >&2 "ERROR algorithm $2 not found"; cat /proc/mdstat; exit 1;} + ;; + resync | recovery | reshape) + cnt=5 + while ! grep -s $1 /proc/mdstat > /dev/null + do + if [ $cnt -gt 0 ] && grep -v idle /sys/block/md*/md/sync_action > /dev/null + then # Something isn't idle - wait a bit + sleep 0.5 + cnt=$[cnt-1] + else + echo >&2 ERROR no $1 happening; cat /proc/mdstat; exit 1 + fi + done + ;; + + nosync ) + sleep 0.5 + # Since 4.2 we delay the close of recovery until there has been a chance for + # spares to be activated. That means that a recovery that finds nothing + # to do can still take a little longer than expected. + # add an extra check: if sync_completed shows the end is reached, assume + # there is no recovery.
+ if grep -s -E '(resync|recovery|reshape) *=' > /dev/null /proc/mdstat ; then + incomplete=`grep / /sys/block/md*/md/sync_completed 2> /dev/null | sed '/^ *\([0-9]*\) \/ \1/d'` + if [ -n "$incomplete" ]; then + echo >&2 "ERROR resync or recovery is happening!"; cat /proc/mdstat ; exit 1; + fi + fi + ;; + + wait ) + p=`cat /proc/sys/dev/raid/speed_limit_max` + echo 2000000 > /proc/sys/dev/raid/speed_limit_max + sleep 0.1 + while grep -E '(resync|recovery|reshape|check|repair) *=' > /dev/null /proc/mdstat || + grep -v idle > /dev/null /sys/block/md*/md/sync_action + do sleep 0.5; + done + echo $p > /proc/sys/dev/raid/speed_limit_max + ;; + + state ) + grep -s "blocks.*\[$2\]\$" /proc/mdstat > /dev/null || { + echo >&2 "ERROR state $2 not found!"; cat /proc/mdstat ; exit 1; } + sleep 0.5 + ;; + + bitmap ) + grep -s bitmap > /dev/null /proc/mdstat || { + echo >&2 ERROR no bitmap ; cat /proc/mdstat ; exit 1; } + ;; + nobitmap ) + if grep -s "bitmap" > /dev/null /proc/mdstat + then + echo >&2 ERROR bitmap present ; cat /proc/mdstat ; exit 1; + fi + ;; + + readonly ) + grep -s "read-only" > /dev/null /proc/mdstat || { + echo >&2 "ERROR array is not read-only!"; cat /proc/mdstat ; exit 1; } + ;; + + inactive ) + grep -s "inactive" > /dev/null /proc/mdstat || { + echo >&2 "ERROR array is not inactive!"; cat /proc/mdstat ; exit 1; } + ;; + * ) echo >&2 ERROR unknown check $1 ; exit 1; + esac +} + +no_errors() { + if [ -s $targetdir/stderr ] + then echo Bad errors from mdadm: ; cat $targetdir/stderr; exit 2; + fi +} +# basic device test + +testdev() { + udevadm settle + dev=$1 + cnt=$2 + dvsize=$3 + chunk=$4 + if [ -z "$5" ]; then + mkfs.ext3 -F -j $dev > /dev/null 2>&1 && fsck -fn $dev >&2 + fi + dsize=$[dvsize/chunk] + dsize=$[dsize*chunk] + rasize=$[dsize*2*cnt] + # rasize is in sectors + if [ -n "$DEV_ROUND_K" ]; then + rasize=$[rasize/DEV_ROUND_K/2] + rasize=$[rasize*DEV_ROUND_K*2] + fi + if [ `/sbin/blockdev --getsize $dev` -eq 0 ]; then sleep 2 ; fi + 
_sz=`/sbin/blockdev --getsize $dev` + if [ $rasize -lt $_sz -o $[rasize*4/5] -gt $_sz ] + then + echo "ERROR: size is wrong for $dev: $cnt * $dvsize (chunk=$chunk) = $rasize, not $_sz" + exit 1 + fi +} + +fast_sync() { + echo 200000 > /proc/sys/dev/raid/speed_limit_max +} + +rotest() { + dev=$1 + fsck -fn $dev >&2 +} + +do_test() { + _script=$1 + _basename=`basename $_script` + if [ -f "$_script" ] + then + rm -f $targetdir/stderr + # stop all arrays, just in case some script left an array active. + $mdadm -Ssq 2> /dev/null + mdadm --zero $devlist 2> /dev/null + mdadm --zero $devlist 2> /dev/null + # this might have been reset: restore the default. + echo 2000 > /proc/sys/dev/raid/speed_limit_max + # source script in a subshell, so it has access to our + # namespace, but cannot change it. + echo -ne "$_script... " + if ( set -ex ; . $_script ) &> $targetdir/log + then + echo "succeeded" + _fail=0 + else + log=log + cat $targetdir/stderr >> $targetdir/log + echo "=======================dmesg=================" >> $targetdir/log + dmesg | tail -n 200 >> $targetdir/log + if [ $exitonerror == 0 ]; then + log=log-`basename $_script` + mv $targetdir/log $logdir/$log + fi + echo "FAILED - see $logdir/$log for details" + _fail=1 + fi + if [ "$savelogs" == "1" ]; then + cp $targetdir/log $logdir/$_basename.log + fi + if [ "$_fail" == "1" -a "$exitonerror" == "1" ]; then + exit 1 + fi + fi +} + +do_help() { + echo "Usage: $0 [options]" + echo " Options:" + echo " --tests= Comma separated list of tests to run" + echo " --disable-multipath Disable any tests involving multipath" + echo " --disable-integrity Disable slow tests of RAID[56] consistency" + echo " --logdir= Directory to save logfiles in" + echo " --save-logs Save all logs in " + echo " --keep-going Don't stop on error, ie.
run all tests" + echo " --dev=[loop|lvm|ram] Use loop devices (default), LVM, or RAM disk" + echo " --volgroup= LVM volume group for LVM test" + echo " setup Setup test environment and exit" + echo " cleanup Cleanup test environment" + echo " Run tests with " +} + +parse_args() { + for i in $* + do + case $i in + [0-9]*) + prefix=$i + ;; + setup) + echo "mdadm test environment setup" + do_setup + trap 0; exit 0 + ;; + cleanup) + cleanup + exit 0 + ;; + --tests=*) + TESTLIST=`expr "x$i" : 'x[^=]*=\(.*\)' | sed -e 's/,/ /g'` + ;; + --logdir=*) + logdir=`expr "x$i" : 'x[^=]*=\(.*\)'` + ;; + --save-logs) + savelogs=1 + ;; + --keep-going | --no-error) + exitonerror=0 + ;; + --disable-multipath) + unset MULTIPATH + ;; + --disable-integrity) + unset INTEGRITY + ;; + --dev=loop) + DEVTYPE=loop + ;; + --dev=lvm) + DEVTYPE=lvm + ;; + --dev=ram) + DEVTYPE=ram + ;; + --volgroup=*) + LVM_VOLGROUP=`expr "x$i" : 'x[^=]*=\(.*\)'` + ;; + --help) + do_help + exit 0; + ;; + -*) + echo " $0: Unknown argument: $i" + do_help + exit 0; + ;; + esac +done +} + +logdir=$targetdir +parse_args $@ + +do_setup +mkdir -p $logdir + +if [ "$savelogs" == "1" ]; then + echo "Saving logs to $logdir" +fi + +if [ "x$TESTLIST" != "x" ]; then + for script in $TESTLIST + do + do_test $testdir/$script + done +else + for script in $testdir/$prefix $testdir/$prefix*[^~] + do + do_test $script + done +fi +exit 0 diff --git a/tests/00linear b/tests/00linear new file mode 100644 index 00000000..e3ac6555 --- /dev/null +++ b/tests/00linear @@ -0,0 +1,25 @@ + +# create a simple linear + +mdadm -CR $md0 -l linear -n3 $dev0 $dev1 $dev2 +check linear +testdev $md0 3 $mdsize2_l 1 +mdadm -S $md0 + +# now with version-0.90 superblock +mdadm -CR $md0 -e0.90 --level=linear -n4 $dev0 $dev1 $dev2 $dev3 +check linear +testdev $md0 4 $mdsize0 1 +mdadm -S $md0 + +# now with version-1.0 superblock +mdadm -CR $md0 -e1.0 --level=linear -n4 $dev0 $dev1 $dev2 $dev3 +check linear +testdev $md0 4 $mdsize1 1 +mdadm -S $md0 + +# now 
with no superblock +mdadm -B $md0 -l linear -n5 $dev0 $dev1 $dev2 $dev3 $dev4 +check linear +testdev $md0 5 $size 64 +mdadm -S $md0 diff --git a/tests/00multipath b/tests/00multipath new file mode 100644 index 00000000..84e4d693 --- /dev/null +++ b/tests/00multipath @@ -0,0 +1,29 @@ + +# +# create a multipath, and fail and stuff + +if [ "$MULTIPATH" != "yes" ]; then + echo -ne 'skipping... ' + exit 0 +fi + +mdadm -CR $md1 -l multipath -n2 $path0 $path1 + +testdev $md1 1 $mdsize12 1 + +mdadm $md1 -f $path0 +rotest $md1 +testdev $md1 1 $mdsize12 1 + +mdadm $md1 -r $path0 +mdadm $md1 -a $path0 + +rotest $md1 +testdev $md1 1 $mdsize12 1 + +mdadm $md1 -f $path1 +mdadm $md1 -r $path1 +rotest $md1 +testdev $md1 1 $mdsize12 1 + +mdadm -S $md1 diff --git a/tests/00names b/tests/00names new file mode 100644 index 00000000..7a066d8f --- /dev/null +++ b/tests/00names @@ -0,0 +1,13 @@ +set -x -e + +# create arrays with non-numeric names +conf=$targetdir/mdadm.conf +echo "CREATE names=yes" > $conf + +for i in linear raid0 raid1 raid4 raid5 raid6 +do + mdadm -CR --config $conf /dev/md/$i -l $i -n 4 $dev4 $dev3 $dev2 $dev1 + check $i + [ -d /sys/class/block/md_$i/md ] + mdadm -S md_$i +done diff --git a/tests/00raid0 b/tests/00raid0 new file mode 100644 index 00000000..8bc18985 --- /dev/null +++ b/tests/00raid0 @@ -0,0 +1,43 @@ + +# create a simple raid0 + +mdadm -CR $md0 -l raid0 -n3 $dev0 $dev1 $dev2 +check raid0 +testdev $md0 3 $mdsize2_l 512 +mdadm -S $md0 + +# now with version-0.90 superblock +mdadm -CR $md0 -e0.90 -l0 -n4 $dev0 $dev1 $dev2 $dev3 +check raid0 +testdev $md0 4 $mdsize0 512 +mdadm -S $md0 + +# now with no superblock +mdadm -B $md0 -l0 -n5 $dev0 $dev1 $dev2 $dev3 $dev4 +check raid0 +testdev $md0 5 $size 512 +mdadm -S $md0 + + +# now same again with different chunk size +for chunk in 4 32 256 +do + mdadm -CR $md0 -e0.90 -l raid0 --chunk $chunk -n3 $dev0 $dev1 $dev2 + check raid0 + testdev $md0 3 $mdsize0 $chunk + mdadm -S $md0 + + # now with version-1 superblock + 
mdadm -CR $md0 -e1.0 -l0 -c $chunk -n4 $dev0 $dev1 $dev2 $dev3 + check raid0 + testdev $md0 4 $mdsize1 $chunk + mdadm -S $md0 + + # now with no superblock + mdadm -B $md0 -l0 -n5 --chun=$chunk $dev0 $dev1 $dev2 $dev3 $dev4 + check raid0 + testdev $md0 5 $size $chunk + mdadm -S $md0 + +done +exit 0 diff --git a/tests/00raid1 b/tests/00raid1 new file mode 100644 index 00000000..c93465d8 --- /dev/null +++ b/tests/00raid1 @@ -0,0 +1,34 @@ + +# create a simple mirror +# test version0, version1, and no super +# test resync and recovery. + +mdadm -CR $md0 -l 1 -n2 $dev0 $dev1 +check resync +check raid1 +testdev $md0 1 $mdsize1a 64 +mdadm -S $md0 + +# now with version-0.90 superblock, spare +mdadm -CR $md0 -e0.90 --level=raid1 -n3 -x2 $dev0 missing missing $dev1 $dev2 +check recovery +check raid1 +testdev $md0 1 $mdsize0 64 +mdadm -S $md0 + +# now with no superblock +mdadm -B $md0 -l mirror -n2 $dev0 $dev1 +check resync +check raid1 +testdev $md0 1 $size 1 +mdadm -S $md0 + +# again, but with no resync +mdadm -B $md0 -l 1 --assume-clean -n2 $dev0 $dev1 +check raid1 +check nosync +testdev $md0 1 $size 1 +mdadm -S $md0 + + +exit 0 diff --git a/tests/00raid10 b/tests/00raid10 new file mode 100644 index 00000000..796b9702 --- /dev/null +++ b/tests/00raid10 @@ -0,0 +1,18 @@ + +# Create some raid10 arrays, all with 6 devices and one spare +devs="$dev0 $dev1 $dev2 $dev3 $dev4 $dev5 $dev6" + +for lo in n2 n3 f2 f3 +do + cm=1 + case $lo in + f2 ) m=3 cm=2;; + f3 ) m=2 cm=3;; + n2 ) m=3;; + n3 ) m=2;; + esac + mdadm --create --run --level=raid10 --layout $lo --raid-disks 6 -x 1 $md0 $devs + check resync ; check raid10 + testdev $md0 $m $mdsize1 $[512*cm] + mdadm -S $md0 +done diff --git a/tests/00raid4 b/tests/00raid4 new file mode 100644 index 00000000..00a14f2f --- /dev/null +++ b/tests/00raid4 @@ -0,0 +1,16 @@ + +# create a simple raid4 set + +mdadm -CfR $md0 -l 4 -n3 $dev0 $dev1 $dev2 +check resync ; check raid[45] +testdev $md0 2 $mdsize1 512 +mdadm -S $md0 + +# now with 
version-1 superblock +mdadm -CR $md0 -e1 --level=raid4 -n4 $dev0 $dev1 $dev2 $dev3 +check recovery; check raid[45] +testdev $md0 3 $mdsize1 512 +mdadm -S $md0 + + +exit 0 diff --git a/tests/00raid5 b/tests/00raid5 new file mode 100644 index 00000000..b2b7a971 --- /dev/null +++ b/tests/00raid5 @@ -0,0 +1,33 @@ + +# create a simple raid5 set + +mdadm -CfR $md0 -e 0.90 -l 5 -n3 $dev0 $dev1 $dev2 +check resync +testdev $md0 2 $mdsize0 512 +mdadm -S $md0 + +# now with version-1 superblock +mdadm -CR $md0 -e1 --level=raid5 -n4 $dev0 $dev1 $dev2 $dev3 +check recovery +testdev $md0 3 $mdsize1 512 +mdadm -S $md0 + +# now same again with explicit layout + +for lo in la ra left-symmetric right-symmetric +do + + mdadm -CfR $md0 -l 5 -p $lo -n3 $dev0 $dev1 $dev2 + check resync ; check raid5 + testdev $md0 2 $mdsize1 512 + mdadm -S $md0 + + # now with version-1 superblock + mdadm -CR $md0 -e1 --level=raid5 --layout $lo -n4 $dev0 $dev1 $dev2 $dev3 + check recovery ; check raid5 + testdev $md0 3 $mdsize1 512 + mdadm -S $md0 + +done + +exit 0 diff --git a/tests/00raid6 b/tests/00raid6 new file mode 100644 index 00000000..6977af9b --- /dev/null +++ b/tests/00raid6 @@ -0,0 +1,16 @@ + +# create a simple raid6 set + +mdadm -CfR $md0 -e0.90 -l 6 -n4 $dev0 $dev1 $dev2 $dev3 +check resync ; check raid6 +testdev $md0 2 $mdsize0 512 +mdadm -S $md0 + +# now with version-1 superblock +mdadm -CR $md0 -e1 --level=raid6 -n5 $dev0 $dev1 $dev2 $dev3 $dev4 +check resync ; check raid6 +testdev $md0 3 $mdsize1 512 +mdadm -S $md0 + + +exit 0 diff --git a/tests/01r1fail b/tests/01r1fail new file mode 100644 index 00000000..389b813f --- /dev/null +++ b/tests/01r1fail @@ -0,0 +1,29 @@ + +# create a raid1, fail and remove a drive during initial sync +# Add two more, fail and remove one +# wait for sync to complete, fail, remove, re-add + +mdadm -CR $md0 -l1 -n4 $dev0 $dev1 $dev2 missing +check resync +mdadm $md0 --fail $dev2 +check resync +mdadm $md0 --fail $dev1 +sleep 1 +check nosync +check state U___ 
+mdadm $md0 --add $dev4 $dev3 +check recovery +# there could be two separate recoveries, one for each dev +check wait +check wait +mdadm $md0 --remove $dev2 $dev1 +check nosync +check state UUU_ + +mdadm --zero-superblock $dev2 +mdadm $md0 -a $dev2 +check recovery +check wait +check state UUUU + +mdadm -S $md0 diff --git a/tests/01r5fail b/tests/01r5fail new file mode 100644 index 00000000..873dba58 --- /dev/null +++ b/tests/01r5fail @@ -0,0 +1,27 @@ + + +# create a raid5, fail and remove a drive during initial sync +# Add two more, fail and remove one +# wait for sync to complete, fail, remove, re-add + +mdadm -CR $md0 -l5 -n4 $dev0 $dev1 $dev2 $dev3 +check recovery +mdadm $md0 --fail $dev3 +sleep 1 +check nosync +check state UUU_ + +mdadm $md0 --add $dev4 $dev5 +check recovery +check wait +mdadm $md0 --fail $dev0 +mdadm $md0 --remove $dev3 $dev0 +check recovery +check state _UUU + +mdadm $md0 -a $dev3 +check recovery +check wait +check state UUUU + +mdadm -S $md0 \ No newline at end of file diff --git a/tests/01r5integ b/tests/01r5integ new file mode 100644 index 00000000..48676a22 --- /dev/null +++ b/tests/01r5integ @@ -0,0 +1,33 @@ + +# Check integrity of raid5 in degraded mode +# Create a 4 disk raid5, create a filesystem and +# sha1sum it with each device failed + +if [ "$INTEGRITY" != "yes" ]; then + echo -ne 'skipping... ' + exit 0 +fi + +for layout in ls rs la ra +do + mdadm -CR $md0 -l5 --layout $layout -n4 $dev0 $dev1 $dev2 $dev3 + check wait + tar cf - /etc > $md0 + sum=`sha1sum $md0` + + for i in $dev0 $dev1 $dev2 $dev3 + do + mdadm $md0 -f $i + mdadm $md0 -r $i + blockdev --flushbufs $md0 + sum1=`sha1sum $md0` + if [ "$sum" != "$sum1" ] + then + echo $sum does not match $sum1 with $i missing + exit 1 + fi + mdadm $md0 -a $i + while ! 
(check state 'U*'); do check wait; sleep 0.2; done + done + mdadm -S $md0 +done diff --git a/tests/01raid6integ b/tests/01raid6integ new file mode 100644 index 00000000..12f4d81b --- /dev/null +++ b/tests/01raid6integ @@ -0,0 +1,57 @@ + +# Check integrity of raid6 in degraded modes +# Create a 5 disk raid6, dump some data to it, then +# sha1sum it with different pairs of devices failed + +if [ "$INTEGRITY" != "yes" ]; then + echo -ne 'skipping... ' + exit 0 +fi + +layouts='ls rs la ra' +lv=`uname -r` +if expr $lv '>=' 2.6.30 > /dev/null +then + layouts="$layouts parity-first ddf-zero-restart ddf-N-restart ddf-N-continue \ + left-asymmetric-6 right-asymmetric-6 left-symmetric-6 right-symmetric-6 parity-first-6" +fi + +for layout in $layouts +do + mdadm -CR $md0 -l6 --layout $layout -n5 $dev0 $dev1 $dev2 $dev3 $dev4 + check wait + tar cf - /etc > $md0 + sum=`sha1sum $md0` + + totest= + for second in $dev0 $dev1 $dev2 $dev3 $dev4 + do + mdadm $md0 -f $second + mdadm $md0 -r $second + blockdev --flushbufs $md0 + sum1=`sha1sum $md0` + if [ "$sum" != "$sum1" ] + then + echo $sum does not match $sum1 with $second missing + exit 1 + fi + for first in $totest + do + mdadm $md0 -f $first + mdadm $md0 -r $first + blockdev --flushbufs $md0 + sum1=`sha1sum $md0` + if [ "$sum" != "$sum1" ] + then + echo $sum does not match $sum1 with $first and $second missing + exit 1 + fi + mdadm $md0 -a $first + while ! (check state 'U*_U*'); do check wait; sleep 0.2; done + done + mdadm $md0 -a $second + while ! 
(check state 'U*'); do check wait; sleep 0.2; done + totest="$totest $second" + done + mdadm -S $md0 +done diff --git a/tests/01replace b/tests/01replace new file mode 100644 index 00000000..6223a223 --- /dev/null +++ b/tests/01replace @@ -0,0 +1,52 @@ +set -x -e + +## test --replace for raid5 raid6 raid1 and raid10 +#1/ after replace, can remove replaced device +#2/ after --replace-with cannot remove the 'with' device +#3/ preserve integrity with concurrent failure + +for level in 1 5 6 10 +do + dd if=/dev/zero of=$dev4 bs=1M || true + dd if=/dev/zero of=$dev5 bs=1M || true + mdadm -CR $md0 -l $level -n4 -x2 $devlist5 + dd if=/dev/urandom of=$md0 bs=1M || true + sum=`sha1sum < $md0` + check wait + mdadm $md0 --replace $dev1 + check wait + mdadm $md0 --remove $dev1 + mdadm $md0 --remove $dev5 && exit 1 + mdadm -S $md0 + dd if=/dev/zero of=$dev4 bs=1M || true + dd if=/dev/zero of=$dev5 bs=1M || true + mdadm -CR $md0 -l $level -n4 -x2 $devlist5 + check wait + sum1=`sha1sum < $md0` + [ "$sum" == "$sum1" ] + + mdadm $md0 --replace $dev1 --with $dev4 + check wait + mdadm $md0 --remove $dev1 + mdadm $md0 --remove $dev5 + mdadm $md0 --remove $dev4 && exit 1 + + mdadm $md0 --add $dev1 $dev5 + mdadm $md0 --replace $dev0 + sleep 1 + mdadm $md0 --fail $dev2 + check wait + sum2=`sha1sum < $md0` + [ "$sum" == "$sum2" ] + + mdadm $md0 --remove $dev0 $dev2 + mdadm $md0 --add $dev0 $dev2 + mdadm $md0 --replace $dev3 + sleep 1 + mdadm $md0 --fail $dev0 $dev2 + check wait + sum3=`sha1sum < $md0` + [ "$sum" == "$sum3" ] + + mdadm -S $md0 +done diff --git a/tests/02lineargrow b/tests/02lineargrow new file mode 100644 index 00000000..e05c219d --- /dev/null +++ b/tests/02lineargrow @@ -0,0 +1,23 @@ + +# create a linear array, and add more drives to it.
+ +for e in 0.90 1 1.1 1.2 +do + case $e in + 0.90 ) sz=$mdsize0 ;; + 1 ) sz=$mdsize2_l ;; + 1.0 ) sz=$mdsize1 ;; + 1.1 ) sz=$mdsize1_l ;; + 1.2 ) sz=$mdsize2_l ;; + esac + mdadm -CRf $md0 --level linear -e $e --raid-disks=1 $dev1 + testdev $md0 1 $sz 1 + + mdadm --grow $md0 --add $dev2 + testdev $md0 2 $sz 1 + + mdadm --grow $md0 --add $dev3 + testdev $md0 3 $sz 1 + + mdadm -S $md0 +done diff --git a/tests/02r1add b/tests/02r1add new file mode 100644 index 00000000..757f6965 --- /dev/null +++ b/tests/02r1add @@ -0,0 +1,40 @@ + +# Make a raid1, add a device, then remove it again. + +mdadm -CR $md0 -l1 -n2 -x1 $dev0 $dev1 $dev2 +check resync +check wait +check state UU + +mdadm --grow $md0 -n 3 +check recovery +check wait +check state UUU + +mdadm $md0 --fail $dev0 +check state _UU + +mdadm --grow $md0 -n 2 +check state UU + +mdadm -S $md0 +# same again for version-1 + + +mdadm -CR $md0 -l1 -n2 -e1.2 -x1 $dev0 $dev1 $dev2 +check resync +check wait +check state UU + +mdadm --grow $md0 -n 3 +check recovery +check wait +check state UUU + +mdadm $md0 --fail $dev0 +check state _UU + +mdadm --grow $md0 -n 2 +check state UU + +mdadm -S $md0 diff --git a/tests/02r1grow b/tests/02r1grow new file mode 100644 index 00000000..5754c88b --- /dev/null +++ b/tests/02r1grow @@ -0,0 +1,36 @@ + + +# create a small raid1 array, make it larger. 
Then make it smaller + +mdadm -CR $md0 -e 0.90 --level raid1 --raid-disks 3 --size $[size/2] $dev1 $dev2 $dev3 +check wait +check state UUU +testdev $md0 1 $[size/2] 1 + +mdadm --grow $md0 --size max +check resync +check wait +testdev $md0 1 $mdsize0 1 + +mdadm --grow $md0 --size $[size/2] +check nosync +testdev $md0 1 $[size/2] 1 + +mdadm -S $md0 + +# same again with version 1.1 superblock +mdadm -CR $md0 --level raid1 --metadata=1.1 --raid-disks 3 --size $[size/2] $dev1 $dev2 $dev3 +check wait +check state UUU +testdev $md0 1 $[size/2] 1 + +mdadm --grow $md0 --size max +check resync +check wait +testdev $md0 1 $mdsize1_l 1 + +mdadm --grow $md0 --size $[size/2] +check nosync +testdev $md0 1 $[size/2] 1 + +mdadm -S $md0 diff --git a/tests/02r5grow b/tests/02r5grow new file mode 100644 index 00000000..386e82ee --- /dev/null +++ b/tests/02r5grow @@ -0,0 +1,36 @@ + + +# create a small raid5 array, make it larger. Then make it smaller + +mdadm -CR $md0 -e0.90 --level raid5 --chunk=64 --raid-disks 3 --size $[size/2] $dev1 $dev2 $dev3 +check wait +check state UUU +testdev $md0 2 $[size/2] 32 + +mdadm --grow $md0 --size max +check resync +check wait +testdev $md0 2 $mdsize0 32 + +mdadm --grow $md0 --size $[size/2] +check nosync +testdev $md0 2 $[size/2] 32 + +mdadm -S $md0 + +# same again with version 1.1 superblock +mdadm -CR $md0 --level raid5 --metadata=1.1 --chunk=128 --raid-disks 4 --size $[size/2] $dev1 $dev2 $dev3 $dev4 +check wait +check state UUUU +testdev $md0 3 $[size/2] 128 + +mdadm --grow $md0 --size max +check resync +check wait +testdev $md0 3 $[mdsize1_l] 128 + +mdadm --grow $md0 --size $[size/2] +check nosync +sh tests/testdev $md0 3 $[size/2] 128 + +mdadm -S $md0 diff --git a/tests/02r6grow b/tests/02r6grow new file mode 100644 index 00000000..759e6275 --- /dev/null +++ b/tests/02r6grow @@ -0,0 +1,36 @@ + + +# create a small raid6 array, make it larger. 
Then make it smaller + +mdadm -CR $md0 -e 0.90 --level raid6 --chunk=64 --raid-disks 4 --size $[size/2] $dev1 $dev2 $dev3 $dev4 +check wait +check state UUUU +testdev $md0 2 $[size/2] 32 + +mdadm --grow $md0 --size max +check resync +check wait +testdev $md0 2 $mdsize0 32 + +mdadm --grow $md0 --size $[size/2] +check nosync +testdev $md0 2 $[size/2] 32 + +mdadm -S $md0 + +# same again with version 1.1 superblock +mdadm -CR $md0 --level raid6 --metadata=1.1 --chunk=128 --raid-disks 4 --size $[size/2] $dev1 $dev2 $dev3 $dev4 +check wait +check state UUUU +testdev $md0 2 $[size/2] 128 + +mdadm --grow $md0 --size max +check resync +check wait +testdev $md0 2 $[mdsize1_l] 128 + +mdadm --grow $md0 --size $[size/2] +check nosync +testdev $md0 2 $[size/2] 128 + +mdadm -S $md0 diff --git a/tests/03assem-incr b/tests/03assem-incr new file mode 100644 index 00000000..f10a1a48 --- /dev/null +++ b/tests/03assem-incr @@ -0,0 +1,17 @@ +set -x -e + +# Test interaction between -I and -A +# there are locking issues too, but those are hard to test for. +# +# Here just test that a partly "-I" assembled array can +# be completed with "-A" + +for l in 0 1 5 linear +do + mdadm -CR $md0 -l $l -n5 $dev0 $dev1 $dev2 $dev3 $dev4 --assume-clean + mdadm -S md0 + mdadm -I $dev1 + mdadm -I $dev3 + mdadm -A /dev/md0 $dev0 $dev1 $dev2 $dev3 $dev4 + mdadm -S /dev/md0 +done diff --git a/tests/03r0assem b/tests/03r0assem new file mode 100644 index 00000000..6744e322 --- /dev/null +++ b/tests/03r0assem @@ -0,0 +1,137 @@ + +# create a raid0 array from 3 devices, and assemble it in a multitude of ways.
+# explicitly list devices +# uuid, md-minor on command line with wildcard devices +# mdadm.conf file + +mdadm -CR $md2 -l0 -n3 $dev0 $dev1 $dev2 +check raid0 +tst="testdev $md2 3 $mdsize1_l 512" +$tst +uuid=`mdadm -Db $md2 | sed 's/.*UUID=//'` +mdadm -S $md2 + +mdadm -A $md2 $dev0 $dev1 $dev2 +$tst +mdadm -S $md2 + +mdadm -A $md2 -u $uuid $devlist +$tst +mdadm -S $md2 + +mdadm --assemble $md2 --name=2 $devlist +$tst +mdadm -S $md2 + +conf=$targetdir/mdadm.conf +{ + echo DEVICE $devlist + echo array $md2 UUID=$uuid +} > $conf + +mdadm -As -c $conf $md2 +$tst +mdadm -S $md2 + +{ + echo DEVICE $devlist + echo array $md2 name=2 +} > $conf + +mdadm -As -c $conf $md2 +$tst +mdadm -S $md2 + + +{ + echo DEVICE $devlist + echo array $md2 devices=$dev0,$dev1,$dev2 +} > $conf + +mdadm -As -c $conf $md2 +$tst + +echo "DEVICE $devlist" > $conf +mdadm -Db $md2 >> $conf +mdadm -S $md2 + +mdadm --assemble --scan --config=$conf $md2 +$tst +mdadm -S $md2 + +echo " metadata=0.90 devices=$dev0,$dev1,$dev2" >> $conf +mdadm --assemble --scan --config=$conf $md2 +$tst +mdadm -S $md2 + + +### Now for version 0... 
+ +mdadm --zero-superblock $dev0 $dev1 $dev2 +mdadm -CR $md2 -l0 --metadata=0.90 -n3 $dev0 $dev1 $dev2 +check raid0 +tst="testdev $md2 3 $mdsize0 512" +$tst + +uuid=`mdadm -Db $md2 | sed 's/.*UUID=//'` +mdadm -S $md2 + +mdadm -A $md2 $dev0 $dev1 $dev2 +$tst +mdadm -S $md2 + +mdadm -A $md2 -u $uuid $devlist +$tst +mdadm -S $md2 + +mdadm --assemble $md2 --super-minor=2 $devlist # +$tst +mdadm -S $md2 + +conf=$targetdir/mdadm.conf +{ + echo DEVICE $devlist + echo array $md2 UUID=$uuid +} > $conf + +mdadm -As -c $conf $md2 +$tst +mdadm -S $md2 + +{ + echo DEVICE $devlist + echo array $md2 super-minor=2 +} > $conf + +mdadm -As -c $conf $md2 +$tst +mdadm -S $md2 + + +{ + echo DEVICE $devlist + echo array $md2 devices=$dev0,$dev1,$dev2 +} > $conf + +mdadm -As -c $conf $md2 +$tst + +echo "DEVICE $devlist" > $conf +mdadm -Db $md2 >> $conf +mdadm -S $md2 + +mdadm --assemble --scan --config=$conf $md2 +$tst +mdadm -S $md2 + +echo " metadata=1 devices=$dev0,$dev1,$dev2" >> $conf +mdadm --assemble --scan --config=$conf $md2 +$tst +mdadm -S $md2 + +# Now use incremental assembly. +mdadm -I --config=$conf $dev0 +mdadm -I --config=$conf $dev1 +mdadm -I --config=$conf $dev2 +$tst +mdadm -S $md2 diff --git a/tests/03r5assem b/tests/03r5assem new file mode 100644 index 00000000..0c7fb8c6 --- /dev/null +++ b/tests/03r5assem @@ -0,0 +1,109 @@ + +# create a raid5 array and assemble it in various ways, +# including with missing devices. 
+ +mdadm -CR -e 0.90 $md1 -l5 -n3 $dev0 $dev1 $dev2 +tst="check raid5 ;testdev $md1 2 $mdsize0 512 ; mdadm -S $md1" +uuid=`mdadm -Db $md1 | sed 's/.*UUID=//'` +check wait +eval $tst + +mdadm -A $md1 $dev0 $dev1 $dev2 +eval $tst + +mdadm -A $md1 -u $uuid $devlist +eval $tst + +mdadm -A $md1 -m 1 $devlist +eval $tst + + +conf=$targetdir/mdadm.conf +{ + echo DEVICE $devlist + echo array $md1 UUID=$uuid +} > $conf + +mdadm -As -c $conf $md1 +eval $tst + +{ + echo DEVICE $devlist + echo array $md1 super-minor=1 +} > $conf + +mdadm -As -c $conf +eval $tst + +{ + echo DEVICE $devlist + echo array $md1 devices=$dev0,$dev1,$dev2 +} > $conf + +mdadm -As -c $conf + +echo "DEVICE $devlist" > $conf +mdadm -Db $md1 >> $conf +eval $tst + +mdadm --assemble --scan --config=$conf $md1 +eval $tst + +echo " metadata=0.90 devices=$dev0,$dev1,$dev2" >> $conf +mdadm --assemble --scan --config=$conf $md1 +eval $tst + +### Now with a missing device + +mdadm -AR $md1 $dev0 $dev2 # +check state U_U +eval $tst + +mdadm -A $md1 -u $uuid $devlist +check state U_U +eval $tst + +mdadm -A $md1 -m 1 $devlist +check state U_U +eval $tst + + +conf=$targetdir/mdadm.conf +{ + echo DEVICE $devlist + echo array $md1 UUID=$uuid +} > $conf + +mdadm -As -c $conf $md1 +check state U_U +eval $tst + +{ + echo DEVICE $devlist + echo array $md1 super-minor=1 +} > $conf + +mdadm -As -c $conf +check state U_U +eval $tst + +{ + echo DEVICE $devlist + echo array $md1 devices=$dev0,$dev1,$dev2 +} > $conf + +mdadm -As -c $conf + +echo "DEVICE $devlist" > $conf +mdadm -Db $md1 >> $conf +check state U_U +eval $tst + +mdadm --assemble --scan --config=$conf $md1 +check state U_U +eval $tst + +echo " metadata=0.90 devices=$dev0,$dev1,$dev2" >> $conf +mdadm --assemble --scan --config=$conf $md1 +check state U_U +eval $tst diff --git a/tests/03r5assem-failed b/tests/03r5assem-failed new file mode 100644 index 00000000..d38241df --- /dev/null +++ b/tests/03r5assem-failed @@ -0,0 +1,12 @@ + +# Create an array, fail one device 
while array is active, stop array, +# then re-assemble listing the failed device first. + +mdadm -CR $md1 -l5 -n4 $dev0 $dev1 $dev2 $dev3 +check wait + +echo 2000 > /sys/block/md1/md/safe_mode_delay +mkfs $md1 +mdadm $md1 -f $dev0 +mdadm -S $md1 +mdadm -A $md1 $dev0 $dev1 $dev2 $dev3 || exit 1 diff --git a/tests/03r5assemV1 b/tests/03r5assemV1 new file mode 100644 index 00000000..bca0c583 --- /dev/null +++ b/tests/03r5assemV1 @@ -0,0 +1,128 @@ + +# create a v-1 raid5 array and assemble in various ways + +mdadm -CR -e1 --name one $md1 -l5 -n3 -x2 $dev0 $dev1 $dev2 $dev3 $dev4 +tst="check raid5 ;testdev $md1 2 $mdsize1 512 ; mdadm -S $md1" +uuid=`mdadm -Db $md1 | sed 's/.*UUID=//'` +check wait + +eval $tst + +mdadm -A $md1 $dev0 $dev1 $dev2 +mdadm $md1 --add $dev3 $dev4 +check spares 2 +eval $tst + +mdadm -A $md1 -u $uuid $devlist +check spares 2 +eval $tst + +mdadm -A $md1 --name one $devlist +check spares 2 +eval $tst + + +conf=$targetdir/mdadm.conf +{ + echo DEVICE $devlist + echo array $md1 UUID=$uuid +} > $conf + +mdadm -As -c $conf $md1 +eval $tst + +{ + echo DEVICE $devlist + echo array $md1 name=one +} > $conf + +mdadm -As -c $conf +eval $tst + +{ + echo DEVICE $devlist + echo array $md1 devices=$dev0,$dev1,$dev2,$dev3,$dev4 +} > $conf + +mdadm -As -c $conf + +echo "DEVICE $devlist" > $conf +mdadm -Db $md1 >> $conf +eval $tst +mdadm --assemble --scan --config=$conf $md1 +eval $tst +echo PING >&2 + +echo " metadata=1.0 devices=$dev0,$dev1,$dev2,$dev3,$dev4" >> $conf +mdadm --assemble --scan --config=$conf $md1 +eval $tst + +### Now with a missing device +# We don't want the recovery to complete while we are +# messing about here. 
+echo 100 > /proc/sys/dev/raid/speed_limit_max +echo 100 > /proc/sys/dev/raid/speed_limit_min + +mdadm -AR $md1 $dev0 $dev2 $dev3 $dev4 # +check state U_U +check spares 1 +eval $tst + +mdadm -A $md1 -u $uuid $devlist +check state U_U +eval $tst + +mdadm -A $md1 --name=one $devlist +check state U_U +check spares 1 +eval $tst + + +conf=$targetdir/mdadm.conf +{ + echo DEVICE $devlist + echo array $md1 UUID=$uuid +} > $conf + +mdadm -As -c $conf $md1 +check state U_U +eval $tst + +{ + echo DEVICE $devlist + echo array $md1 name=one +} > $conf + +mdadm -As -c $conf +check state U_U +eval $tst + +{ + echo DEVICE $devlist + echo array $md1 devices=$dev0,$dev1,$dev2 +} > $conf + +mdadm -As -c $conf + +echo "DEVICE $devlist" > $conf +mdadm -Db $md1 >> $conf +check state U_U +eval $tst + +mdadm --assemble --scan --config=$conf $md1 +check state U_U +eval $tst + +echo " metadata=1.0 devices=$dev0,$dev1,$dev2" >> $conf +mdadm --assemble --scan --config=$conf $md1 +check state U_U +eval $tst + +# And now assemble with -I +mdadm -Ss +mdadm -I -c $conf $dev0 +mdadm -I -c $conf $dev1 +mdadm -I -c $conf $dev2 +eval $tst +echo 2000 > /proc/sys/dev/raid/speed_limit_max +echo 1000 > /proc/sys/dev/raid/speed_limit_min diff --git a/tests/04r0update b/tests/04r0update new file mode 100644 index 00000000..73ee3b9f --- /dev/null +++ b/tests/04r0update @@ -0,0 +1,20 @@ + +# create a raid0, re-assemble with a different super-minor +mdadm -CR -e 0.90 $md0 -l0 -n3 $dev0 $dev1 $dev2 +testdev $md0 3 $mdsize0 512 +minor1=`mdadm -E $dev0 | sed -n -e 's/.*Preferred Minor : //p'` +mdadm -S /dev/md0 + +mdadm -A $md1 $dev0 $dev1 $dev2 +minor2=`mdadm -E $dev0 | sed -n -e 's/.*Preferred Minor : //p'` +mdadm -S /dev/md1 + +mdadm -A $md1 --update=super-minor $dev0 $dev1 $dev2 +minor3=`mdadm -E $dev0 | sed -n -e 's/.*Preferred Minor : //p'` +mdadm -S /dev/md1 + +case "$minor1 $minor2 $minor3" in + "0 0 1" ) ;; + * ) echo >&2 "ERROR minors should be '0 0 1' but are '$minor1 $minor2 $minor3'" + exit 1 +esac 
diff --git a/tests/04r1update b/tests/04r1update new file mode 100644 index 00000000..e22965bc --- /dev/null +++ b/tests/04r1update @@ -0,0 +1,15 @@ +set -i + +# create a raid1 array, let it sync, then re-assemble with a force-sync + +mdadm -CR $md0 -l1 -n2 $dev0 $dev1 +check wait +mdadm -S $md0 + +mdadm -A $md0 $dev0 $dev1 +check nosync +mdadm -S $md0 + +mdadm -A $md0 -U resync $dev0 $dev1 +check resync +mdadm -S $md0 diff --git a/tests/04r5swap b/tests/04r5swap new file mode 100644 index 00000000..5373a607 --- /dev/null +++ b/tests/04r5swap @@ -0,0 +1,18 @@ + +# make a raid5 array, byte swap the superblocks, then assemble... + +mdadm -CR $md0 -e 0.90 -l5 -n4 $dev0 $dev1 $dev2 $dev3 +sleep 4 +mdadm -S $md0 + +mdadm -E --metadata=0 $dev1 > $targetdir/d1 +for d in $dev0 $dev1 $dev2 $dev3 +do $dir/swap_super $d +done +mdadm -E --metadata=0.swap $dev1 > $targetdir/d1s +diff -u $targetdir/d1 $targetdir/d1s + +mdadm --assemble --update=byteorder $md0 $dev0 $dev1 $dev2 $dev3 +sleep 3 +check recovery +mdadm -S $md0 diff --git a/tests/04update-metadata b/tests/04update-metadata new file mode 100644 index 00000000..232fc1ff --- /dev/null +++ b/tests/04update-metadata @@ -0,0 +1,48 @@ +set -xe + +# test converting v0.90 to v1.0 +# check for different levels +# check it fails for non-v0.90 +# check it fails during reshape or recovery +# check it fails when bitmap is present + +dlist="$dev0 $dev1 $dev2 $dev3" + +for ls in raid0/4 linear/4 raid1/1 raid5/3 raid6/2 +do + s=${ls#*/} l=${ls%/*} + mdadm -CR --assume-clean -e 0.90 $md0 --level $l -n 4 -c 64 $dlist + testdev $md0 $s 19904 64 + mdadm -S $md0 + mdadm -A $md0 --update=metadata $dlist + testdev $md0 $s 19904 64 check + mdadm -S $md0 +done + +if mdadm -A $md0 --update=metadata $dlist +then echo >&2 should fail with v1.0 metadata + exit 1 +fi + +mdadm -CR -e 0.90 $md0 --level=6 -n4 -c32 $dlist +mdadm -S $md0 + +if mdadm -A $md0 --update=metadata $dlist +then echo >&2 should fail during resync + exit 1 +fi +mdadm -A $md0 
$dlist +mdadm --wait $md0 || true +mdadm -S $md0 + +# should succeed now +mdadm -A $md0 --update=metadata $dlist + +mdadm -S /dev/md0 +mdadm -CR --assume-clean -e 0.90 $md0 --level=6 -n4 -c32 $dlist --bitmap=internal +mdadm -S $md0 + +if mdadm -A $md0 --update=metadata $dlist +then echo >&2 should fail when bitmap present + exit 1 +fi diff --git a/tests/04update-uuid b/tests/04update-uuid new file mode 100644 index 00000000..a4409e78 --- /dev/null +++ b/tests/04update-uuid @@ -0,0 +1,82 @@ +set -x + +# create an array, then change the uuid. + +mdadm -CR --assume-clean $md0 -l5 -n3 $dev0 $dev1 $dev2 +mdadm -S /dev/md0 +mdadm -A /dev/md0 --update=uuid --uuid=0123456789abcdef:fedcba9876543210 $dev0 $dev1 $dev2 +no_errors +mdadm -D /dev/md0 | grep -s > /dev/null 01234567:89abcdef:fedcba98:76543210 || { + echo Wrong uuid; mdadm -D /dev/md0 ; exit 2; +} +mdadm -S /dev/md0 + +# try v1 superblock + +mdadm -CR --assume-clean -e1 $md0 -l5 -n3 $dev0 $dev1 $dev2 +mdadm -S /dev/md0 +mdadm -A /dev/md0 --update=uuid --uuid=0123456789abcdef:fedcba9876543210 $dev0 $dev1 $dev2 +no_errors +mdadm -D /dev/md0 | grep -s > /dev/null 01234567:89abcdef:fedcba98:76543210 || { + echo Wrong uuid; mdadm -D /dev/md0 ; exit 2; +} +mdadm -S /dev/md0 + + +# now if we have a bitmap, that needs updating too. 
+rm -f $targetdir/bitmap +mdadm -CR --assume-clean -b $targetdir/bitmap $md0 -l5 -n3 $dev0 $dev1 $dev2 +mdadm -S /dev/md0 +mdadm -A /dev/md0 -b $targetdir/bitmap --update=uuid --uuid=0123456789abcdef:fedcba9876543210 $dev0 $dev1 $dev2 +no_errors +mdadm -D /dev/md0 | grep -s > /dev/null 01234567:89abcdef:fedcba98:76543210 || { + echo Wrong uuid; mdadm -D /dev/md0 ; exit 2; +} +if mdadm -X $targetdir/bitmap | grep -s > /dev/null 01234567:89abcdef:fedcba98:76543210 || + mdadm -X $targetdir/bitmap | grep -s > /dev/null 67452301:efcdab89:98badcfe:10325476 +then : ; else + echo Wrong uuid; mdadm -X $targetdir/bitmap ; exit 2; +fi +mdadm -S /dev/md0 + +# and bitmap for version1 +rm -f $targetdir/bitmap +mdadm -CR --assume-clean -e1.1 -b $targetdir/bitmap $md0 -l5 -n3 $dev0 $dev1 $dev2 +mdadm -S /dev/md0 +mdadm -A /dev/md0 -b $targetdir/bitmap --update=uuid --uuid=0123456789abcdef:fedcba9876543210 $dev0 $dev1 $dev2 +no_errors +mdadm -D /dev/md0 | grep -s > /dev/null 01234567:89abcdef:fedcba98:76543210 || { + echo Wrong uuid; mdadm -D /dev/md0 ; exit 2; +} +# -X cannot tell which byteorder to use for the UUID, so allow both. +if mdadm -X $targetdir/bitmap | grep -s > /dev/null 01234567:89abcdef:fedcba98:76543210 || + mdadm -X $targetdir/bitmap | grep -s > /dev/null 67452301:efcdab89:98badcfe:10325476 +then : ; else + echo Wrong uuid; mdadm -X $targetdir/bitmap ; exit 2; +fi +mdadm -S /dev/md0 + +# Internal bitmaps too. 
+mdadm -CR --assume-clean -b internal --bitmap-chunk 4 $md0 -l5 -n3 $dev0 $dev1 $dev2 +mdadm -S /dev/md0 +mdadm -A /dev/md0 --update=uuid --uuid=0123456789abcdef:fedcba9876543210 $dev0 $dev1 $dev2 +no_errors +mdadm -D /dev/md0 | grep -s > /dev/null 01234567:89abcdef:fedcba98:76543210 || { + echo Wrong uuid; mdadm -D /dev/md0 ; exit 2; +} +mdadm -X $dev0 | grep -s > /dev/null 01234567:89abcdef:fedcba98:76543210 || { + echo Wrong uuid; mdadm -X $dev0; exit 2; +} +mdadm -S /dev/md0 + +mdadm -CR --assume-clean -e1.2 -b internal --bitmap-chunk=4 $md0 -l5 -n3 $dev0 $dev1 $dev2 +mdadm -S /dev/md0 +mdadm -A /dev/md0 --update=uuid --uuid=0123456789abcdef:fedcba9876543210 $dev0 $dev1 $dev2 +no_errors +mdadm -D /dev/md0 | grep -s > /dev/null 01234567:89abcdef:fedcba98:76543210 || { + echo Wrong uuid; mdadm -D /dev/md0 ; exit 2; +} +mdadm -X $dev0 | grep -s > /dev/null 01234567:89abcdef:fedcba98:76543210 || { + echo Wrong uuid; mdadm -X $dev0; exit 2; +} +mdadm -S /dev/md0 diff --git a/tests/05r1-add-internalbitmap b/tests/05r1-add-internalbitmap new file mode 100644 index 00000000..4e203052 --- /dev/null +++ b/tests/05r1-add-internalbitmap @@ -0,0 +1,20 @@ +# +# create a raid1 without any bitmap, add the bitmap and then write to +# the device. 
This should catch the case where the bitmap is created +# but not reloaded correctly, such as the case fixed by +# 4474ca42e2577563a919fd3ed782e2ec55bf11a2 +# +mdadm --create --run $md0 --metadata=0.9 --level=1 -n2 --delay=1 $dev1 $dev2 +check wait +check nobitmap +testdev $md0 1 $mdsize1b 64 +mdadm -Gb internal --bitmap-chunk=4 $md0 +check bitmap +testdev $md0 1 $mdsize1b 64 +mdadm -S $md0 + +# Re-assemble the array and verify the bitmap is still present +mdadm --assemble $md0 $dev1 $dev2 +check bitmap +testdev $md0 1 $mdsize1b 64 +mdadm -S $md0 diff --git a/tests/05r1-add-internalbitmap-v1a b/tests/05r1-add-internalbitmap-v1a new file mode 100644 index 00000000..721a41c1 --- /dev/null +++ b/tests/05r1-add-internalbitmap-v1a @@ -0,0 +1,20 @@ +# +# create a raid1 without any bitmap, add the bitmap and then write to +# the device. This should catch the case where the bitmap is created +# but not reloaded correctly, such as the case fixed by +# 4474ca42e2577563a919fd3ed782e2ec55bf11a2 +# +mdadm --create --run $md0 --metadata=1.0 --level=1 -n2 --delay=1 $dev1 $dev2 +check wait +check nobitmap +testdev $md0 1 $mdsize1b 64 +mdadm -Gb internal --bitmap-chunk=4 $md0 +check bitmap +testdev $md0 1 $mdsize1b 64 +mdadm -S $md0 + +# Re-assemble the array and verify the bitmap is still present +mdadm --assemble $md0 $dev1 $dev2 +check bitmap +testdev $md0 1 $mdsize1b 64 +mdadm -S $md0 diff --git a/tests/05r1-add-internalbitmap-v1b b/tests/05r1-add-internalbitmap-v1b new file mode 100644 index 00000000..da78fd61 --- /dev/null +++ b/tests/05r1-add-internalbitmap-v1b @@ -0,0 +1,20 @@ +# +# create a raid1 without any bitmap, add the bitmap and then write to +# the device. 
This should catch the case where the bitmap is created +# but not reloaded correctly, such as the case fixed by +# 4474ca42e2577563a919fd3ed782e2ec55bf11a2 +# +mdadm --create --run $md0 --metadata=1.1 --level=1 -n2 --delay=1 $dev1 $dev2 +check wait +check nobitmap +testdev $md0 1 $mdsize1b 64 +mdadm -Gb internal --bitmap-chunk=4 $md0 +check bitmap +testdev $md0 1 $mdsize1b 64 +mdadm -S $md0 + +# Re-assemble the array and verify the bitmap is still present +mdadm --assemble $md0 $dev1 $dev2 +check bitmap +testdev $md0 1 $mdsize1b 64 +mdadm -S $md0 diff --git a/tests/05r1-add-internalbitmap-v1c b/tests/05r1-add-internalbitmap-v1c new file mode 100644 index 00000000..9f2f128b --- /dev/null +++ b/tests/05r1-add-internalbitmap-v1c @@ -0,0 +1,20 @@ +# +# create a raid1 without any bitmap, add the bitmap and then write to +# the device. This should catch the case where the bitmap is created +# but not reloaded correctly, such as the case fixed by +# 4474ca42e2577563a919fd3ed782e2ec55bf11a2 +# +mdadm --create --run $md0 --metadata=1.2 --level=1 -n2 --delay=1 $dev1 $dev2 +check wait +check nobitmap +testdev $md0 1 $mdsize1b 64 +mdadm -Gb internal --bitmap-chunk=4 $md0 +check bitmap +testdev $md0 1 $mdsize1b 64 +mdadm -S $md0 + +# Re-assemble the array and verify the bitmap is still present +mdadm --assemble $md0 $dev1 $dev2 +check bitmap +testdev $md0 1 $mdsize1b 64 +mdadm -S $md0 diff --git a/tests/05r1-bitmapfile b/tests/05r1-bitmapfile new file mode 100644 index 00000000..f384f0ea --- /dev/null +++ b/tests/05r1-bitmapfile @@ -0,0 +1,49 @@ + +# +# create a raid1 with a bitmap file +# +bmf=$targetdir/bitmap +rm -f $bmf +mdadm --create --run $md0 --level=1 -n2 --delay=1 --bitmap $bmf $dev1 $dev2 +check wait +testdev $md0 1 $mdsize1a 64 +mdadm -S $md0 + +mdadm --assemble $md0 --bitmap=$bmf $dev1 $dev2 +testdev $md0 1 $mdsize1a 64 +dirty1=`mdadm -X $bmf | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'` +sleep 4 +dirty2=`mdadm -X $bmf | sed -n -e 's/.*Bitmap.* \([0-9]*\) 
dirty.*/\1/p'` + +if [ $dirty1 -lt 400 -o $dirty2 -ne 0 ] +then echo >&2 "ERROR bad 'dirty' counts: $dirty1 and $dirty2" + exit 1 +fi +mdadm $md0 -f $dev1 +testdev $md0 1 $mdsize1a 64 +sleep 4 +dirty3=`mdadm -X $bmf | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'` +if [ $dirty3 -lt 400 ] +then + echo >&2 "ERROR dirty count $dirty3 is too small" + exit 2 +fi + +mdadm -S $md0 + +mdadm --assemble -R $md0 --bitmap=$bmf $dev2 +dirty4=`mdadm -X $bmf | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'` +mdadm --zero $dev1 # force --add, not --re-add +mdadm $md0 --add $dev1 +#it is too fast# check recovery + +check wait +sleep 4 +dirty5=`mdadm -X $bmf | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'` + +if [ $dirty4 -lt 400 -o $dirty5 -ne 0 ] +then echo echo >&2 "ERROR bad 'dirty' counts at end: $dirty4 $dirty5" + exit 1 +fi + +mdadm -S $md0 diff --git a/tests/05r1-grow-external b/tests/05r1-grow-external new file mode 100644 index 00000000..69da3e90 --- /dev/null +++ b/tests/05r1-grow-external @@ -0,0 +1,33 @@ + +# +# create a raid1 array, add an external bitmap +# +mdadm --create --run $md0 -l 1 -n 2 $dev1 $dev2 +check wait +testdev $md0 1 $mdsize1a 64 + +bmf=$targetdir/bm +rm -f $bmf +#mdadm -E $dev1 +mdadm --grow $md0 --bitmap=$bmf --delay=1 || { mdadm -X $bmf ; exit 1; } +dirty1=`mdadm -X $bmf | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'` +sleep 4 +dirty2=`mdadm -X $bmf | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'` + +testdev $md0 1 $mdsize1a 64 +dirty3=`mdadm -X $bmf | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'` +sleep 4 +dirty4=`mdadm -X $bmf | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'` + +#echo $dirty1 $dirty2 $dirty3 $dirty4 +if [ $dirty2 -ne 0 -o $dirty4 -ne 0 -o $dirty3 -lt 400 ] +then + echo bad dirty counts + exit 1 +fi + +# now to remove the bitmap +check bitmap +mdadm --grow $md0 --bitmap=none +check nobitmap +mdadm -S $md0 diff --git a/tests/05r1-grow-internal b/tests/05r1-grow-internal new file mode 100644 index 
00000000..24b3aece --- /dev/null +++ b/tests/05r1-grow-internal @@ -0,0 +1,31 @@ + +# +# create a raid1 array, add an internal bitmap +# +mdadm --create --run $md0 -l 1 -n 2 $dev1 $dev2 +check wait +testdev $md0 1 $mdsize1a 64 + +#mdadm -E $dev1 +mdadm --grow $md0 --bitmap=internal --bitmap-chunk=4 --delay=1 || { mdadm -X $dev2 ; exit 1; } +dirty1=`mdadm -X $dev2 | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'` +sleep 4 +dirty2=`mdadm -X $dev2 | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'` + +testdev $md0 1 $mdsize1a 64 +dirty3=`mdadm -X $dev2 | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'` +sleep 4 +dirty4=`mdadm -X $dev2 | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'` + +#echo $dirty1 $dirty2 $dirty3 $dirty4 +if [ $dirty2 -ne 0 -o $dirty4 -ne 0 -o $dirty3 -lt 400 ] +then + echo bad dirty counts + exit 1 +fi + +# now to remove the bitmap +check bitmap +mdadm --grow $md0 --bitmap=none +check nobitmap +mdadm -S $md0 diff --git a/tests/05r1-grow-internal-1 b/tests/05r1-grow-internal-1 new file mode 100644 index 00000000..2f0d8237 --- /dev/null +++ b/tests/05r1-grow-internal-1 @@ -0,0 +1,31 @@ + +# +# create a raid1 array, version 1 superblock, add an internal bitmap +# +mdadm --create --run $md0 -e1 -l 1 -n 2 $dev1 $dev2 +check wait +testdev $md0 1 $mdsize1b 64 + +#mdadm -E $dev1 +mdadm --grow $md0 --bitmap=internal --bitmap-chunk=4 --delay=1 +dirty1=`mdadm -X $dev2 | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'` +sleep 4 +dirty2=`mdadm -X $dev2 | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'` + +testdev $md0 1 $mdsize1b 64 +dirty3=`mdadm -X $dev2 | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'` +sleep 4 +dirty4=`mdadm -X $dev2 | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'` + +#echo $dirty1 $dirty2 $dirty3 $dirty4 +if [ $dirty2 -ne 0 -o $dirty4 -ne 0 -o $dirty3 -lt 400 ] +then + echo bad dirty counts + exit 1 +fi + +# now to remove the bitmap +check bitmap +mdadm --grow $md0 --bitmap=none +check nobitmap +mdadm -S $md0 diff --git 
a/tests/05r1-internalbitmap b/tests/05r1-internalbitmap new file mode 100644 index 00000000..dd7232a7 --- /dev/null +++ b/tests/05r1-internalbitmap @@ -0,0 +1,47 @@ + +# +# create a raid1 with an internal bitmap +# +mdadm --create -e0.90 --run $md0 --level=1 -n2 --delay=1 --bitmap internal --bitmap-chunk=4 $dev1 $dev2 +check wait +testdev $md0 1 $mdsize0 64 +mdadm -S $md0 + +mdadm --assemble $md0 $dev1 $dev2 +testdev $md0 1 $mdsize0 64 +dirty1=`mdadm -X $dev2 | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'` +sleep 4 +dirty2=`mdadm -X $dev2 | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'` + +if [ $dirty1 -lt 400 -o $dirty2 -ne 0 ] +then echo >&2 "ERROR bad 'dirty' counts: $dirty1 and $dirty2" + exit 1 +fi +mdadm $md0 -f $dev1 +testdev $md0 1 $mdsize0 64 +sleep 4 +dirty3=`mdadm -X $dev2 | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'` +if [ $dirty3 -lt 400 ] +then + echo >&2 "ERROR dirty count $dirty3 is too small" + exit 2 +fi + +mdadm -S $md0 + +mdadm --assemble -R $md0 $dev2 +mdadm --zero-superblock $dev1 +mdadm $md0 --add $dev1 +check recovery + +dirty4=`mdadm -X $dev2 | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'` +check wait +sleep 4 +dirty5=`mdadm -X $dev2 | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'` + +if [ $dirty4 -lt 400 -o $dirty5 -ne 0 ] +then echo echo >&2 "ERROR bad 'dirty' counts at end: $dirty4 $dirty5" + exit 1 +fi + +mdadm -S $md0 diff --git a/tests/05r1-internalbitmap-v1a b/tests/05r1-internalbitmap-v1a new file mode 100644 index 00000000..3ddc082f --- /dev/null +++ b/tests/05r1-internalbitmap-v1a @@ -0,0 +1,48 @@ + +# +# create a raid1 with an internal bitmap +# +mdadm --create --run $md0 --metadata=1.0 --level=1 -n2 --delay=1 --bitmap internal --bitmap-chunk=4 $dev1 $dev2 +check wait +check bitmap +testdev $md0 1 $mdsize1b 64 +mdadm -S $md0 + +mdadm --assemble $md0 $dev1 $dev2 +testdev $md0 1 $mdsize1b 64 +dirty1=`mdadm -X $dev2 | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'` +sleep 4 +dirty2=`mdadm -X $dev2 | sed -n -e 
's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'` + +if [ $dirty1 -lt 400 -o $dirty2 -ne 0 ] +then echo >&2 "ERROR bad 'dirty' counts: $dirty1 and $dirty2" + exit 1 +fi +mdadm $md0 -f $dev1 +testdev $md0 1 $mdsize1b 64 +sleep 4 +dirty3=`mdadm -X $dev2 | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'` +if [ $dirty3 -lt 400 ] +then + echo >&2 "ERROR dirty count $dirty3 is too small" + exit 2 +fi + +mdadm -S $md0 + +mdadm --zero-superblock $dev1 +mdadm --assemble -R $md0 $dev2 +mdadm $md0 --add $dev1 +check recovery + +dirty4=`mdadm -X $dev2 | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'` +check wait +sleep 4 +dirty5=`mdadm -X $dev2 | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'` + +if [ $dirty4 -lt 400 -o $dirty5 -ne 0 ] +then echo echo >&2 "ERROR bad 'dirty' counts at end: $dirty4 $dirty5" + exit 1 +fi + +mdadm -S $md0 diff --git a/tests/05r1-internalbitmap-v1b b/tests/05r1-internalbitmap-v1b new file mode 100644 index 00000000..40f7abea --- /dev/null +++ b/tests/05r1-internalbitmap-v1b @@ -0,0 +1,49 @@ + +# +# create a raid1 with an internal bitmap +# +mdadm --create --run $md0 --metadata=1.1 --level=1 -n2 --delay=1 --bitmap internal --bitmap-chunk=4 $dev1 $dev2 +check wait +check bitmap +testdev $md0 1 $mdsize11 64 +mdadm -S $md0 + +mdadm --assemble $md0 $dev1 $dev2 +check bitmap +testdev $md0 1 $mdsize11 64 +dirty1=`mdadm -X $dev2 | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'` +sleep 4 +dirty2=`mdadm -X $dev2 | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'` + +if [ $dirty1 -lt 400 -o $dirty2 -ne 0 ] +then echo >&2 "ERROR bad 'dirty' counts: $dirty1 and $dirty2" + exit 1 +fi +mdadm $md0 -f $dev1 +testdev $md0 1 $mdsize11 64 +sleep 4 +dirty3=`mdadm -X $dev2 | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'` +if [ $dirty3 -lt 400 ] +then + echo >&2 "ERROR dirty count $dirty3 is too small" + exit 2 +fi + +mdadm -S $md0 + +mdadm --zero-superblock $dev1 +mdadm --assemble -R $md0 $dev2 +mdadm $md0 --add $dev1 +check recovery + +dirty4=`mdadm -X $dev2 | sed -n 
-e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'` +check wait +sleep 4 +dirty5=`mdadm -X $dev2 | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'` + +if [ $dirty4 -lt 400 -o $dirty5 -ne 0 ] +then echo echo >&2 "ERROR bad 'dirty' counts at end: $dirty4 $dirty5" + exit 1 +fi + +mdadm -S $md0 diff --git a/tests/05r1-internalbitmap-v1c b/tests/05r1-internalbitmap-v1c new file mode 100644 index 00000000..2eaea59b --- /dev/null +++ b/tests/05r1-internalbitmap-v1c @@ -0,0 +1,48 @@ + +# +# create a raid1 with an internal bitmap +# +mdadm --create --run $md0 --metadata=1.2 --level=1 -n2 --delay=1 --bitmap internal --bitmap-chunk 4 $dev1 $dev2 +check wait +check bitmap +testdev $md0 1 $mdsize12 64 +mdadm -S $md0 + +mdadm --assemble $md0 $dev1 $dev2 +testdev $md0 1 $mdsize12 64 +dirty1=`mdadm -X $dev2 | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'` +sleep 4 +dirty2=`mdadm -X $dev2 | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'` + +if [ $dirty1 -lt 400 -o $dirty2 -ne 0 ] +then echo >&2 "ERROR bad 'dirty' counts: $dirty1 and $dirty2" + exit 1 +fi +mdadm $md0 -f $dev1 +testdev $md0 1 $mdsize12 64 +sleep 4 +dirty3=`mdadm -X $dev2 | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'` +if [ $dirty3 -lt 400 ] +then + echo >&2 "ERROR dirty count $dirty3 is too small" + exit 2 +fi + +mdadm -S $md0 + +mdadm --zero-superblock $dev1 +mdadm --assemble -R $md0 $dev2 +mdadm $md0 --add $dev1 +check recovery + +dirty4=`mdadm -X $dev2 | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'` +check wait +sleep 4 +dirty5=`mdadm -X $dev2 | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'` + +if [ $dirty4 -lt 400 -o $dirty5 -ne 0 ] +then echo echo >&2 "ERROR bad 'dirty' counts at end: $dirty4 $dirty5" + exit 1 +fi + +mdadm -S $md0 diff --git a/tests/05r1-n3-bitmapfile b/tests/05r1-n3-bitmapfile new file mode 100644 index 00000000..f1c3f1ee --- /dev/null +++ b/tests/05r1-n3-bitmapfile @@ -0,0 +1,53 @@ + +# +# create a raid1 with 3 devices and a bitmap file +# make sure resync does right thing. 
+# +# +bmf=$targetdir/bitmap +rm -f $bmf +mdadm --create -e0.90 --run $md0 --level=1 -n3 --delay=1 --bitmap $bmf $dev1 $dev2 $dev3 +check wait +testdev $md0 1 $mdsize0 64 +mdadm -S $md0 + +mdadm --assemble $md0 --bitmap=$bmf $dev1 $dev2 $dev3 +testdev $md0 1 $mdsize0 64 +dirty1=`mdadm -X $bmf | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'` +sleep 4 +dirty2=`mdadm -X $bmf | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'` + +if [ $dirty1 -lt 400 -o $dirty2 -ne 0 ] +then echo >&2 "ERROR bad 'dirty' counts: $dirty1 and $dirty2" + exit 1 +fi +mdadm $md0 -f $dev2 +testdev $md0 1 $mdsize0 64 +sleep 4 +dirty3=`mdadm -X $bmf | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'` +if [ $dirty3 -lt 400 ] +then + echo >&2 "ERROR dirty count $dirty3 is too small" + exit 2 +fi + +mdadm -S $md0 + +mdadm --assemble -R $md0 --bitmap=$bmf $dev1 $dev3 +check nosync +mdadm --zero-superblock $dev2 +mdadm $md0 --add $dev2 +check recovery + +dirty4=`mdadm -X $bmf | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'` +check wait +sleep 4 +dirty5=`mdadm -X $bmf | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'` + +if [ $dirty4 -lt 400 -o $dirty5 -ne 0 ] +then echo echo >&2 "ERROR bad 'dirty' counts at end: $dirty4 $dirty5" + exit 1 +fi + +mdadm -S $md0 +exit 0 diff --git a/tests/05r1-re-add b/tests/05r1-re-add new file mode 100644 index 00000000..fa6bbcb4 --- /dev/null +++ b/tests/05r1-re-add @@ -0,0 +1,39 @@ + +# +# create a raid1, remove a drive, and readd it. +# resync should be instant. +# Then do some IO first. 
Resync should still be very fast +# + +mdadm -CR $md0 -l1 -n2 -binternal --bitmap-chunk=4 -d1 $dev1 $dev2 +check resync +check wait +testdev $md0 1 $mdsize1a 64 +sleep 4 + +mdadm $md0 -f $dev2 +sleep 1 +mdadm $md0 -r $dev2 +mdadm $md0 -a $dev2 +#cat /proc/mdstat +check nosync + +mdadm $md0 -f $dev2 +sleep 1 +mdadm $md0 -r $dev2 +testdev $md0 1 $mdsize1a 64 +mdadm $md0 -a $dev2 +check wait +blockdev --flushbufs $dev1 $dev2 +cmp --ignore-initial=$[64*512] --bytes=$[$mdsize0*1024] $dev1 $dev2 + +mdadm $md0 -f $dev2; sleep 1 +mdadm $md0 -r $dev2 +if dd if=/dev/zero of=$md0 ; then : ; fi +blockdev --flushbufs $md0 # ensure writes have been sent. +mdadm $md0 -a $dev2 +check recovery +check wait +blockdev --flushbufs $dev1 $dev2 +cmp --ignore-initial=$[64*512] --bytes=$[$mdsize0*1024] $dev1 $dev2 +mdadm -S $md0 diff --git a/tests/05r1-re-add-nosuper b/tests/05r1-re-add-nosuper new file mode 100644 index 00000000..058d602d --- /dev/null +++ b/tests/05r1-re-add-nosuper @@ -0,0 +1,38 @@ + +# +# create a raid1, remove a drive, and readd it. +# resync should be instant. +# Then do some IO first. Resync should still be very fast +# +bmf=$targetdir/bitmap2 +rm -f $bmf +mdadm -B $md0 -l1 -n2 -b$bmf -d1 $dev1 $dev2 +check resync +check wait +testdev $md0 1 $size 1 +sleep 4 + +mdadm $md0 -f $dev2 +sleep 1 +mdadm $md0 -r $dev2 +mdadm $md0 --re-add $dev2 +check nosync + +mdadm $md0 -f $dev2 +sleep 1 +mdadm $md0 -r $dev2 +testdev $md0 1 $size 1 +mdadm $md0 --re-add $dev2 +check wait +cmp --bytes=$[$mdsize0*1024] $dev1 $dev2 + +mdadm $md0 -f $dev2; sleep 1 +mdadm $md0 -r $dev2 +if dd if=/dev/zero of=$md0 ; then : ; fi +blockdev --flushbufs $md0 # make sure writes have been sent +mdadm $md0 --re-add $dev2 +check recovery +check wait +# should BLKFLSBUF and then read $dev1/$dev2... 
+cmp --bytes=$[$mdsize0*1024] $file1 $file2 +mdadm -S $md0 diff --git a/tests/05r1-remove-internalbitmap b/tests/05r1-remove-internalbitmap new file mode 100644 index 00000000..712fd56f --- /dev/null +++ b/tests/05r1-remove-internalbitmap @@ -0,0 +1,18 @@ +# +# create a raid1 with bitmap, remove the bitmap and verify it is still +# gone when re-assembling the array +# +mdadm --create --run $md0 --metadata=0.9 --level=1 -n2 --bitmap internal --bitmap-chunk=4 --delay=1 $dev1 $dev2 +check wait +check bitmap +testdev $md0 1 $mdsize1b 64 +mdadm -Gb none $md0 +check nobitmap +testdev $md0 1 $mdsize1b 64 +mdadm -S $md0 + +# Re-assemble the array and verify the bitmap is still absent +mdadm --assemble $md0 $dev1 $dev2 +check nobitmap +testdev $md0 1 $mdsize1b 64 +mdadm -S $md0 diff --git a/tests/05r1-remove-internalbitmap-v1a b/tests/05r1-remove-internalbitmap-v1a new file mode 100644 index 00000000..a4a9aaf1 --- /dev/null +++ b/tests/05r1-remove-internalbitmap-v1a @@ -0,0 +1,18 @@ +# +# create a raid1 with bitmap, remove the bitmap and verify it is still +# gone when re-assembling the array +# +mdadm --create --run $md0 --metadata=1.0 --level=1 -n2 --bitmap internal --bitmap-chunk=4 --delay=1 $dev1 $dev2 +check wait +check bitmap +testdev $md0 1 $mdsize1b 64 +mdadm -Gb none $md0 +check nobitmap +testdev $md0 1 $mdsize1b 64 +mdadm -S $md0 + +# Re-assemble the array and verify the bitmap is still absent +mdadm --assemble $md0 $dev1 $dev2 +check nobitmap +testdev $md0 1 $mdsize1b 64 +mdadm -S $md0 diff --git a/tests/05r1-remove-internalbitmap-v1b b/tests/05r1-remove-internalbitmap-v1b new file mode 100644 index 00000000..c0918eb6 --- /dev/null +++ b/tests/05r1-remove-internalbitmap-v1b @@ -0,0 +1,18 @@ +# +# create a raid1 with bitmap, remove the bitmap and verify it is still +# gone when re-assembling the array +# +mdadm --create --run $md0 --metadata=1.1 --level=1 -n2 --bitmap internal --bitmap-chunk=4 --delay=1 $dev1 $dev2 +check wait +check bitmap +testdev $md0 1
$mdsize1b 64 +mdadm -Gb none $md0 +check nobitmap +testdev $md0 1 $mdsize1b 64 +mdadm -S $md0 + +# Re-assemble the array and verify the bitmap is still absent +mdadm --assemble $md0 $dev1 $dev2 +check nobitmap +testdev $md0 1 $mdsize1b 64 +mdadm -S $md0 diff --git a/tests/05r1-remove-internalbitmap-v1c b/tests/05r1-remove-internalbitmap-v1c new file mode 100644 index 00000000..15f1fbb0 --- /dev/null +++ b/tests/05r1-remove-internalbitmap-v1c @@ -0,0 +1,18 @@ +# +# create a raid1 with bitmap, remove the bitmap and verify it is still +# gone when re-assembling the array +# +mdadm --create --run $md0 --metadata=1.2 --level=1 -n2 --bitmap internal --bitmap-chunk=4 --delay=1 $dev1 $dev2 +check wait +check bitmap +testdev $md0 1 $mdsize1b 64 +mdadm -Gb none $md0 +check nobitmap +testdev $md0 1 $mdsize1b 64 +mdadm -S $md0 + +# Re-assemble the array and verify the bitmap is still absent +mdadm --assemble $md0 $dev1 $dev2 +check nobitmap +testdev $md0 1 $mdsize1b 64 +mdadm -S $md0 diff --git a/tests/05r5-bitmapfile b/tests/05r5-bitmapfile new file mode 100644 index 00000000..6d173d88 --- /dev/null +++ b/tests/05r5-bitmapfile @@ -0,0 +1,49 @@ + +# +# create a raid5 with a bitmap file +# +bmf=$targetdir/bitmap +rm -f $bmf +mdadm --create --run $md0 --level=5 -n3 --delay=1 --bitmap $bmf $dev1 $dev2 $dev3 +check wait +testdev $md0 2 $mdsize1 512 +mdadm -S $md0 + +mdadm --assemble $md0 --bitmap=$bmf $dev1 $dev2 $dev3 +testdev $md0 2 $mdsize1 512 +dirty1=`mdadm -X $bmf | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'` +sleep 4 +dirty2=`mdadm -X $bmf | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'` + +if [ $dirty1 -lt 400 -o $dirty2 -ne 0 ] +then echo >&2 "ERROR bad 'dirty' counts: $dirty1 and $dirty2" + exit 1 +fi +mdadm $md0 -f $dev1 +testdev $md0 2 $mdsize1 512 +sleep 4 +dirty3=`mdadm -X $bmf | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'` +if [ $dirty3 -lt 400 ] +then + echo >&2 "ERROR dirty count $dirty3 is too small" + exit 2 +fi + +mdadm -S $md0 + +mdadm
--assemble -R $md0 --bitmap=$bmf $dev2 $dev3 +mdadm --zero $dev1 # force add, not re-add +mdadm $md0 --add $dev1 +check recovery + +dirty4=`mdadm -X $bmf | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'` +check wait +sleep 4 +dirty5=`mdadm -X $bmf | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'` + +if [ $dirty4 -lt 400 -o $dirty5 -ne 0 ] +then echo echo >&2 "ERROR bad 'dirty' counts at end: $dirty4 $dirty5" + exit 1 +fi + +mdadm -S $md0 diff --git a/tests/05r5-internalbitmap b/tests/05r5-internalbitmap new file mode 100644 index 00000000..13dc5921 --- /dev/null +++ b/tests/05r5-internalbitmap @@ -0,0 +1,47 @@ + +# +# create a raid1 with an internal bitmap +# +mdadm --create --run $md0 --level=5 -n3 --delay=1 --bitmap internal --bitmap-chunk=4 $dev1 $dev2 $dev3 +check wait +testdev $md0 2 $mdsize1 512 +mdadm -S $md0 + +mdadm --assemble $md0 $dev1 $dev2 $dev3 +testdev $md0 2 $mdsize1 512 +dirty1=`mdadm -X $dev2 | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'` +sleep 4 +dirty2=`mdadm -X $dev2 | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'` + +if [ $dirty1 -lt 400 -o $dirty2 -ne 0 ] +then echo >&2 "ERROR bad 'dirty' counts: $dirty1 and $dirty2" + exit 1 +fi +mdadm $md0 -f $dev1 +testdev $md0 2 $mdsize1 512 +sleep 4 +dirty3=`mdadm -X $dev2 | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'` +if [ $dirty3 -lt 400 ] +then + echo >&2 "ERROR dirty count $dirty3 is too small" + exit 2 +fi + +mdadm -S $md0 + +mdadm --assemble -R $md0 $dev2 $dev3 +mdadm --zero $dev1 # force --add, not --re-add +mdadm $md0 --add $dev1 +check recovery + +dirty4=`mdadm -X $dev2 | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'` +check wait +sleep 4 +dirty5=`mdadm -X $dev2 | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'` + +if [ $dirty4 -lt 400 -o $dirty5 -ne 0 ] +then echo echo >&2 "ERROR bad 'dirty' counts at end: $dirty4 $dirty5" + exit 1 +fi + +mdadm -S $md0 diff --git a/tests/05r6-bitmapfile b/tests/05r6-bitmapfile new file mode 100644 index 00000000..d11896db --- /dev/null 
+++ b/tests/05r6-bitmapfile @@ -0,0 +1,49 @@ + +# +# create a raid6 with a bitmap file +# +bmf=$targetdir/bitmap +rm -f $bmf +mdadm --create --run $md0 --level=6 -n4 --delay=1 --bitmap $bmf $dev1 $dev2 $dev3 $dev4 +check wait +testdev $md0 2 $mdsize1 512 +mdadm -S $md0 + +mdadm --assemble $md0 --bitmap=$bmf $dev1 $dev2 $dev3 $dev4 +testdev $md0 2 $mdsize1 512 +dirty1=`mdadm -X $bmf | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'` +sleep 4 +dirty2=`mdadm -X $bmf | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'` + +if [ $dirty1 -lt 400 -o $dirty2 -ne 0 ] +then echo >&2 "ERROR bad 'dirty' counts: $dirty1 and $dirty2" + exit 1 +fi +mdadm $md0 -f $dev3 +testdev $md0 2 $mdsize1 512 +sleep 4 +dirty3=`mdadm -X $bmf | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'` +if [ $dirty3 -lt 400 ] +then + echo >&2 "ERROR dirty count $dirty3 is too small" + exit 2 +fi + +mdadm -S $md0 + +mdadm --assemble -R $md0 --bitmap=$bmf $dev1 $dev2 $dev4 +mdadm --zero $dev3 # force --add, not --re-add +mdadm $md0 --add $dev3 +check recovery + +dirty4=`mdadm -X $bmf | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'` +check wait +sleep 4 +dirty5=`mdadm -X $bmf | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'` + +if [ $dirty4 -lt 400 -o $dirty5 -ne 0 ] +then echo >&2 "ERROR bad 'dirty' counts at end: $dirty4 $dirty5" + exit 1 +fi + +mdadm -S $md0 diff --git a/tests/05r6tor0 b/tests/05r6tor0 new file mode 100644 index 00000000..2fd51f2e --- /dev/null +++ b/tests/05r6tor0 @@ -0,0 +1,27 @@ +set -x -e + +# reshape a RAID6 to RAID5 and then RAID0. 
+# then reshape back up to RAID5 and RAID6 + +mdadm -CR $md0 -l6 -n5 $dev0 $dev1 $dev2 $dev3 $dev4 +check wait; sleep 1 +check raid6 +testdev $md0 3 19456 512 +mdadm -G $md0 -l5 +check wait; sleep 1 +check raid5 +testdev $md0 3 19456 512 +mdadm -G $md0 -l0 +check wait; sleep 1 +check raid0 +testdev $md0 3 19456 512 +mdadm -G $md0 -l5 --add $dev3 $dev4 +check wait; sleep 1 +check raid5 +check algorithm 2 +testdev $md0 3 19456 512 +mdadm -G $md0 -l 6 +check wait; sleep 1 +check raid6 +check algorithm 2 +testdev $md0 3 19456 512 diff --git a/tests/06name b/tests/06name new file mode 100644 index 00000000..4d5e824d --- /dev/null +++ b/tests/06name @@ -0,0 +1,12 @@ +set -x + +# create an array with a name + +mdadm -CR $md0 -l0 -n2 --metadata=1 --name="Fred" $dev0 $dev1 +mdadm -E $dev0 | grep 'Name : [^:]*:Fred ' > /dev/null || exit 1 +mdadm -D $md0 | grep 'Name : [^:]*:Fred ' > /dev/null || exit 1 +mdadm -S $md0 + +mdadm -A $md0 --name="Fred" $devlist +#mdadm -Db $md0 +mdadm -S $md0 diff --git a/tests/06sysfs b/tests/06sysfs new file mode 100644 index 00000000..af63ef45 --- /dev/null +++ b/tests/06sysfs @@ -0,0 +1,11 @@ +exit 0 +mdadm -CR $md0 -l1 -n3 $dev1 $dev2 $dev3 + +ls -Rl /sys/block/md0 + +cat /sys/block/md0/md/level +cat /sys/block/md0/md/raid_disks + +mdadm -S $md0 + +exit 1 diff --git a/tests/06wrmostly b/tests/06wrmostly new file mode 100644 index 00000000..968c1974 --- /dev/null +++ b/tests/06wrmostly @@ -0,0 +1,13 @@ + +# create a raid1 array with a wrmostly device + +mdadm -CR $md0 -l1 -n3 $dev0 $dev1 --write-mostly $dev2 +testdev $md0 1 $mdsize1a 64 + +# unfortunately, we cannot measure if any read requests are going to $dev2 + +mdadm -S $md0 + +mdadm -CR $md0 -l1 -n3 --write-behind --bitmap=internal --bitmap-chunk=4 $dev0 $dev1 --write-mostly $dev2 +testdev $md0 1 $mdsize1a 64 +mdadm -S $md0 diff --git a/tests/07autoassemble b/tests/07autoassemble new file mode 100644 index 00000000..e689be7c --- /dev/null +++ b/tests/07autoassemble @@ -0,0 +1,24 @@ + +# 
create two raid1s, build a raid0 on top, then +# tear it down and get auto-assemble to rebuild it. + +mdadm -CR $md1 -l1 -n2 $dev0 $dev1 --homehost=testing +mdadm -CR $md2 -l1 -n2 $dev2 $dev3 --homehost=testing +mdadm -CR $md0 -l0 -n2 $md1 $md2 --homehost=testing + +mdadm -Ss +mdadm -As -c /dev/null --homehost=testing -vvv +testdev $md1 1 $mdsize1a 64 +testdev $md2 1 $mdsize1a 64 +testdev $md0 2 $mdsize11a 512 +mdadm -Ss + +mdadm --zero-superblock $dev0 $dev1 $dev2 $dev3 +## Now the raid0 uses one stacked and one not +mdadm -CR $md1 -l1 -n2 $dev0 $dev1 --homehost=testing +mdadm -CR $md0 -l0 -n2 $md1 $dev2 --homehost=testing +mdadm -Ss +mdadm -As -c /dev/null --homehost=testing -vvv +testdev $md1 1 $mdsize1a 64 +testdev $md0 1 $[mdsize1a+mdsize11a] 512 +mdadm -Ss diff --git a/tests/07autodetect b/tests/07autodetect new file mode 100644 index 00000000..917e0d66 --- /dev/null +++ b/tests/07autodetect @@ -0,0 +1,34 @@ + +# +# Test in-kernel autodetect. +# Create a partitionable array on each of two devices, +# put a partition on each, create an array, and see if we can +# use autodetect to restart the array. + +if lsmod | grep md_mod > /dev/null 2>&1 +then + echo md is a module - cannot test autodetect + exit 0 +fi + + +mdadm -CR -e 0 $mdp0 -l0 -f -n1 $dev0 +mdadm -CR -e 0 $mdp1 -l0 -f -n1 $dev1 +udevadm settle +sfdisk $mdp0 >&2 << END +,,FD +END +sfdisk $mdp1 >&2 << END +,,FD +END +udevadm settle +mdadm -CR -e 0 $md0 -l1 -n2 ${mdp0}p1 ${mdp1}p1 +check resync +check raid1 +check wait +mdadm -S $md0 +mdadm --auto-detect +check raid1 + +mdadm -Ss +exit 0 diff --git a/tests/07changelevelintr b/tests/07changelevelintr new file mode 100644 index 00000000..18c63092 --- /dev/null +++ b/tests/07changelevelintr @@ -0,0 +1,61 @@ + +# +# test that we can stop and restart a level change. +# just test a few in-place changes, and a few +# size-reducing changes. 
+ + +checkgeo() { + # check the geometry of an array + # level raid_disks chunk_size layout + dev=$1 + shift + sleep 0.5 + check wait + sleep 1 + for attr in level raid_disks chunk_size layout + do + if [ $# -gt 0 ] ; then + val=$1 + shift + if [ " `cat /sys/block/$dev/md/$attr`" != " $val" ] + then echo "$attr doesn't match for $dev" + exit 1 + fi + fi + done +} + +restart() { + sleep 0.5 + check reshape + mdadm -S $md0 + mdadm -A $md0 $devs --backup-file=$bu + sleep 0.5 + check reshape +} + +bu=/tmp/md-backup +rm -f $bu +devs="$dev0 $dev1 $dev2 $dev3 $dev4" +mdadm -CR $md0 -l5 -n5 -c 256 $devs +checkgeo md0 raid5 5 $[256*1024] 2 + +mdadm -G $md0 -c 128 --backup-file=$bu +restart +checkgeo md0 raid5 5 $[128*1024] 2 + +mdadm -G $md0 --layout rs --backup-file=$bu +restart +checkgeo md0 raid5 5 $[128*1024] 3 + +mdadm -G $md0 --array-size 58368 +mdadm -G $md0 --raid-disks 4 -c 64 --backup-file=$bu +restart +checkgeo md0 raid5 4 $[64*1024] 3 + +devs="$dev0 $dev1 $dev2 $dev3" +mdadm -G $md0 --array-size 19456 +mdadm -G $md0 -n 2 -c 256 --backup-file=$bu +restart +checkgeo md0 raid5 2 $[256*1024] 3 diff --git a/tests/07changelevels b/tests/07changelevels new file mode 100644 index 00000000..a328874a --- /dev/null +++ b/tests/07changelevels @@ -0,0 +1,114 @@ + +# Test changing of level, chunksize etc. 
+# Create a RAID1, convert to RAID5, add a disk, add another disk +# convert to RAID6, back to RAID5 and ultimately to RAID1 + +testK=$[64*3*6] +dd if=/dev/urandom of=/tmp/RandFile bs=1024 count=$testK +export MDADM_GROW_VERIFY=1 + +dotest() { + sleep 2 + check wait + testdev $md0 $1 19968 64 nd + blockdev --flushbufs $md0 + cmp -s -n $[testK*1024] $md0 /tmp/RandFile || { echo cmp failed; exit 2; } + # write something new - shift chars 4 space + tr ' -~' '$-~ -#' < /tmp/RandFile > /tmp/RandFile2 + mv /tmp/RandFile2 /tmp/RandFile + dd if=/tmp/RandFile of=$md0 +} + +checkgeo() { + # check the geometry of an array + # level raid_disks chunk_size layout + dev=$1 + shift + sleep 0.5 + check wait + sleep 1 + for attr in level raid_disks chunk_size layout + do + if [ $# -gt 0 ] ; then + val=$1 + shift + if [ " `cat /sys/block/$dev/md/$attr`" != " $val" ] + then echo "$attr doesn't match for $dev" + exit 1 + fi + fi + done +} + + +bu=/tmp/md-test-backup +rm -f $bu +mdadm -CR $md0 -l1 -n2 -x1 $dev0 $dev1 $dev2 -z 19968 +testdev $md0 1 $mdsize1a 64 +dd if=/tmp/RandFile of=$md0 +dotest 1 + +mdadm --grow $md0 -l5 -n3 --chunk 64 +dotest 2 + +mdadm $md0 --add $dev3 $dev4 +mdadm --grow $md0 -n4 --chunk 32 +dotest 3 + +mdadm -G $md0 -l6 --backup-file $bu +dotest 3 + +mdadm -G /dev/md0 --array-size 39936 +mdadm -G $md0 -n4 --backup-file $bu +checkgeo md0 raid6 4 $[32*1024] +dotest 2 + +mdadm -G $md0 -l5 --backup-file $bu +checkgeo md0 raid5 3 $[32*1024] +dotest 2 + +mdadm -G /dev/md0 --array-size 19968 +mdadm -G $md0 -n2 --backup-file $bu +checkgeo md0 raid5 2 $[32*1024] +dotest 1 + +mdadm -G --level=1 $md0 +dotest 1 + +# now repeat that last few steps only with a degraded array. 
+mdadm -S $md0 +mdadm -CR $md0 -l6 -n5 $dev0 $dev1 $dev2 $dev3 $dev4 +dd if=/tmp/RandFile of=$md0 +dotest 3 + +mdadm $md0 --fail $dev0 + +mdadm -G /dev/md0 --array-size 37888 +mdadm -G $md0 -n4 --backup-file $bu +dotest 2 +checkgeo md0 raid6 4 $[512*1024] +mdadm $md0 --fail $dev4 + +mdadm $md0 --fail $dev3 +# now double-degraded. +# switch layout to a DDF layout and back to make sure that works. + +mdadm -G /dev/md0 --layout=ddf-N-continue --backup-file $bu +checkgeo md0 raid6 4 $[512*1024] 10 +dotest 2 +mdadm -G /dev/md0 --layout=ra --backup-file $bu +checkgeo md0 raid6 4 $[512*1024] 1 +dotest 2 + +mdadm -G $md0 -l5 --backup-file $bu +dotest 2 + +mdadm -G /dev/md0 --array-size 18944 +mdadm -G $md0 -n2 --backup-file $bu +dotest 1 +checkgeo md0 raid5 2 $[512*1024] +mdadm $md0 --fail $dev2 + +mdadm -G --level=1 $md0 +dotest 1 +checkgeo md0 raid1 2 diff --git a/tests/07layouts b/tests/07layouts new file mode 100644 index 00000000..acd1a800 --- /dev/null +++ b/tests/07layouts @@ -0,0 +1,91 @@ + +# check that kernel and restripe interpret all the different layouts +# the same +# This involves changing the layout to each different possibility +# while MDADM_GROW_VERIFY is set. 
+ +testK=$[64*3*6] +dd if=/dev/urandom of=/tmp/RandFile bs=1024 count=$testK +export MDADM_GROW_VERIFY=1 + + +dotest() { + sleep 0.5 + check wait + testdev $md0 $1 $mdsize1 512 nd + blockdev --flushbufs $md0 + cmp -s -n $[testK*1024] $md0 /tmp/RandFile || { echo cmp failed; exit 2; } + # write something new - shift chars 4 space + tr ' -~' '$-~ -#' < /tmp/RandFile > /tmp/RandFile2 + mv /tmp/RandFile2 /tmp/RandFile + dd if=/tmp/RandFile of=$md0 +} + +checkgeo() { + # check the geometry of an array + # level raid_disks chunk_size layout + dev=$1 + shift + sleep 0.5 + check wait + for attr in level raid_disks chunk_size layout + do + if [ $# -gt 0 ] ; then + val=$1 + shift + if [ " `sed 's/ .*//' /sys/block/$dev/md/$attr`" != " $val" ] + then echo "$attr doesn't match for $dev" + exit 1 + fi + fi + done +} + + +bu=/tmp/md-test-backup +rm -f $bu + +# first a degraded 5 device raid5 +mdadm -CR $md0 -l5 -n5 $dev0 $dev1 missing $dev2 $dev3 +dd if=/tmp/RandFile of=$md0 +dotest 4 + +l5[0]=la +l5[1]=ra +l5[2]=ls +l5[3]=rs +l5[4]=parity-first +l5[5]=parity-last +for layout in 0 1 2 3 4 5 0 +do + mdadm -G $md0 --layout=${l5[$layout]} --backup-file $bu + checkgeo md0 raid5 5 $[512*1024] $layout + dotest 4 +done + +mdadm -S $md0 +# now a doubly degraded raid6 +mdadm -CR $md0 -l6 -n5 $dev0 missing $dev2 missing $dev4 +dd if=/tmp/RandFile of=$md0 +dotest 3 + +l6[0]=la +l6[1]=ra +l6[2]=ls +l6[3]=rs +l6[4]=parity-first +l6[5]=parity-last +l6[8]=ddf-zero-restart +l6[9]=ddf-N-restart +l6[10]=ddf-N-continue +l6[16]=left-asymmetric-6 +l6[17]=right-asymmetric-6 +l6[18]=left-symmetric-6 +l6[19]=right-symmetric-6 +l6[20]=parity-first-6 +for layout in 0 1 2 3 4 5 8 9 10 16 17 18 19 20 0 +do + mdadm -G $md0 --layout=${l6[$layout]} --backup-file $bu + checkgeo md0 raid6 5 $[512*1024] $layout + dotest 3 +done diff --git a/tests/07reshape5intr b/tests/07reshape5intr new file mode 100644 index 00000000..0f4803ac --- /dev/null +++ b/tests/07reshape5intr @@ -0,0 +1,41 @@ + +# +# test interrupting 
and restarting raid5 reshape. +set -x +devs="$dev1" +st=UU +for disks in 2 3 4 5 +do + eval devs=\"$devs \$dev$disks\" + st=U$st + for d in $devs + do dd if=/dev/urandom of=$d bs=1024 || true + done + + case $disks in + 2 | 3) chunk=1024;; + 4 ) chunk=512;; + 5 ) chunk=256;; + esac + + mdadm -CR $md0 -amd -l5 -c $chunk -n$disks --assume-clean $devs + mdadm $md0 --add $dev6 + echo 20 > /proc/sys/dev/raid/speed_limit_min + echo 20 > /proc/sys/dev/raid/speed_limit_max + mdadm --grow $md0 -n $[disks+1] + check reshape + check state $st + mdadm --stop $md0 + mdadm --assemble $md0 $devs $dev6 + check reshape + echo 1000 > /proc/sys/dev/raid/speed_limit_min + echo 2000 > /proc/sys/dev/raid/speed_limit_max + check wait + while ! echo check > /sys/block/md0/md/sync_action; do sleep 0.1; done + check wait + mm=`cat /sys/block/md0/md/mismatch_cnt` + if [ $mm -gt 0 ] + then echo >&2 "ERROR mismatch_cnt non-zero : $mm" ; exit 1 + fi + mdadm -S $md0 +done diff --git a/tests/07revert-grow b/tests/07revert-grow new file mode 100644 index 00000000..c8c4e855 --- /dev/null +++ b/tests/07revert-grow @@ -0,0 +1,52 @@ +set -e -x + +# revert a reshape that is increasing the number of devices, +# raid5, raid6, and raid10 + +# metadata 0.90 cannot handle RAID10 growth +# metadata 1.0 doesn't get a default headspace, so don't try it either. 
+ +for metadata in 0.90 1.1 1.2 +do +# RAID5 +mdadm -CR --assume-clean $md0 -l5 -n4 -x1 $devlist4 --metadata=$metadata +check raid5 +testdev $md0 3 $mdsize1 512 +mdadm -G $md0 -n 5 +sleep 3 +mdadm -S $md0 +mdadm -A $md0 --update=revert-reshape $devlist4 --backup-file=/tmp/md-backup +check wait +check raid5 +testdev $md0 3 $mdsize1 512 +mdadm -S $md0 + +# RAID6 +mdadm -CR --assume-clean $md0 -l6 -n4 -x1 $devlist4 --metadata=$metadata +check raid6 +testdev $md0 2 $mdsize1 512 +mdadm -G $md0 -n 5 +sleep 3 +mdadm -S $md0 +mdadm -A $md0 --update=revert-reshape $devlist4 --backup-file=/tmp/md-backup +check wait +check raid6 +testdev $md0 2 $mdsize1 512 +mdadm -S $md0 + +if [ $metadata = 0.90 ]; then continue; fi + +# RAID10 +mdadm -CR --assume-clean $md0 -l10 -n4 -x1 $devlist4 --metadata=$metadata +check raid10 +testdev $md0 2 $mdsize1 512 +mdadm -G $md0 -n 5 +sleep 3 +mdadm -S $md0 +strace -o /tmp/str ./mdadm -A $md0 --update=revert-reshape $devlist4 +check wait +check raid10 +testdev $md0 2 $mdsize1 512 +mdadm -S $md0 + +done diff --git a/tests/07revert-inplace b/tests/07revert-inplace new file mode 100644 index 00000000..a73eb977 --- /dev/null +++ b/tests/07revert-inplace @@ -0,0 +1,44 @@ +set -e -x + +# revert a reshape that is not changing the number of data devices, +# raid5, raid6, and raid10 + +# RAID5 -> RAID6 +mdadm -CR --assume-clean $md0 -l5 -n4 -x1 $devlist4 +check raid5 +testdev $md0 3 $mdsize1 512 +mdadm -G $md0 -l 6 +sleep 2 +mdadm -S $md0 +mdadm -A $md0 --update=revert-reshape $devlist4 --backup-file=/tmp/md-backup +check wait +check raid6 +check algorithm 18 +testdev $md0 3 $mdsize1 512 +mdadm -S $md0 + +# RAID6 -> RAID5 +mdadm -CR --assume-clean $md0 -l6 -n5 $devlist4 +check raid6 +testdev $md0 3 $mdsize1 512 +mdadm -G $md0 -l 5 +sleep 2 +mdadm -S $md0 +mdadm -A $md0 --update=revert-reshape $devlist4 --backup-file=/tmp/md-backup +check wait +check raid6 +testdev $md0 3 $mdsize1 512 +mdadm -S $md0 + +# RAID10 - decrease chunk size +mdadm -CR 
--assume-clean $md0 -l10 -n6 -c 64 $devlist5 +check raid10 +testdev $md0 3 $mdsize1 64 +mdadm -G $md0 -c 32 +sleep 2 +mdadm -S $md0 +strace -o /tmp/str ./mdadm -A $md0 --update=revert-reshape $devlist5 +check wait +check raid10 +testdev $md0 3 $mdsize1 64 +mdadm -S $md0 diff --git a/tests/07revert-shrink b/tests/07revert-shrink new file mode 100644 index 00000000..62b5ae02 --- /dev/null +++ b/tests/07revert-shrink @@ -0,0 +1,56 @@ +set -e -x + +# revert a reshape that is decreasing the number of devices, +# raid5, raid6, and raid10 + +bu=$targetdir/md-backup +rm -f $bu +# RAID5 +mdadm -CR --assume-clean $md0 -l5 -n5 $devlist4 +check raid5 +testdev $md0 4 $mdsize1 512 +mdadm --grow $md0 --array-size 56832 +testdev $md0 3 $mdsize1 512 +mdadm -G $md0 -n 4 --backup=$bu +sleep 3 +mdadm -S $md0 +mdadm -A $md0 --update=revert-reshape $devlist4 --backup-file=$bu +check wait +check raid5 +fsck -f -n $md0 +testdev $md0 4 $mdsize1 512 +mdadm -S $md0 + +#FIXME +rm -f $bu +# RAID6 +mdadm -CR --assume-clean $md0 -l6 -n5 $devlist4 +check raid6 +testdev $md0 3 $mdsize1 512 +mdadm --grow $md0 --array-size 37888 +testdev $md0 2 $mdsize1 512 +mdadm -G $md0 -n 4 --backup=$bu +sleep 2 +mdadm -S $md0 +mdadm -A $md0 --update=revert-reshape $devlist4 --backup-file=$bu +check wait +check raid6 +fsck -f -n $md0 +testdev $md0 3 $mdsize1 512 +mdadm -S $md0 + +# RAID10 +mdadm -CR --assume-clean $md0 -l10 -n6 $devlist5 +check raid10 +testdev $md0 3 $mdsize1 512 +mdadm --grow $md0 --array-size 36864 +testdev $md0 2 $mdsize1 512 +mdadm -G $md0 -n 4 +sleep 3 +mdadm -S $md0 +mdadm -A $md0 --update=revert-reshape $devlist5 +check wait +check raid10 +fsck -f -n $md0 +testdev $md0 3 $mdsize1 512 +mdadm -S $md0 diff --git a/tests/07testreshape5 b/tests/07testreshape5 new file mode 100644 index 00000000..0e1f25f9 --- /dev/null +++ b/tests/07testreshape5 @@ -0,0 +1,45 @@ + +# +# test the reshape code by using test_reshape and the +# kernel md code to move data into and out of variously +# shaped md 
arrays. +set -x +layouts=(la ra ls rs) +for level in 5 6 +do +for chunk in 4 8 16 32 64 128 +do + devs="$dev1" + for disks in 2 3 4 5 6 + do + eval devs=\"$devs \$dev$disks\" + if [ " $level $disks" = " 6 3" -o " $level $disks" = " 6 2" ] + then continue + fi + for nlayout in 0 1 2 3 + do + layout=${layouts[$nlayout]} + + size=$[chunk*(disks-(level-4))*disks] + + # test restore: make a raid5 from a file, then do a compare + dd if=/dev/urandom of=/tmp/RandFile bs=1024 count=$size + $dir/test_stripe restore /tmp/RandFile $disks $[chunk*1024] $level $nlayout 0 $[size*1024] $devs + mdadm -CR -e 1.0 $md0 -amd -l$level -n$disks --assume-clean -c $chunk -p $layout $devs + cmp -s -n $[size*1024] $md0 /tmp/RandFile || { echo cmp failed ; exit 2; } + + # FIXME check parity + + # test save + dd if=/dev/urandom of=$md0 bs=1024 count=$size + blockdev --flushbufs $md0 $devs; sync + > /tmp/NewRand + $dir/test_stripe save /tmp/NewRand $disks $[chunk*1024] $level $nlayout 0 $[size*1024] $devs + cmp -s -n $[size*1024] $md0 /tmp/NewRand || { echo cmp failed ; exit 2; } + mdadm -S $md0 + udevadm settle + done + done +done +done +exit 0 diff --git a/tests/09imsm-assemble b/tests/09imsm-assemble new file mode 100644 index 00000000..d7028c62 --- /dev/null +++ b/tests/09imsm-assemble @@ -0,0 +1,73 @@ +# validate the prodigal member disk scenario i.e. a former container +# member is returned after having been rebuilt on another system + + +imsm_check_hold() { + if mdadm --remove $1 $2; then + echo "$2 removal from $1 should have been blocked" >&2 + cat /proc/mdstat >&2 + mdadm -E $2 + exit 1 + fi +} + +imsm_check_removal() { + if ! 
mdadm --remove $1 $2 ; then + echo "$2 removal from $1 should have succeeded" >&2 + cat /proc/mdstat >&2 + mdadm -E $2 + exit 1 + fi +} + +export IMSM_DEVNAME_AS_SERIAL=1 +export IMSM_TEST_OROM=1 +export IMSM_NO_PLATFORM=1 +container=/dev/md/container +member=/dev/md/vol0 + + +num_disks=4 +size=$((10*1024)) +mdadm -CR $container -e imsm -n $num_disks $dev0 $dev1 $dev2 $dev3 +mdadm -CR $member $dev0 $dev2 -n 2 -l 1 -z $size +mdadm --wait $member || true +mdadm -Ss + +# make dev0 and dev1 a new rebuild family +mdadm -A $container $dev0 $dev1 +mdadm -IR $container +mdadm --wait ${member}_0 || true +mdadm -Ss + +# make dev2 and dev3 a new rebuild family +mdadm -A $container $dev2 $dev3 +mdadm -IR $container +mdadm --wait ${member}_0 || true +mdadm -Ss + +# reassemble and make sure one of the families falls out +mdadm -A $container $dev0 $dev1 $dev2 $dev3 +mdadm -IR $container +testdev ${member}_0 1 $size 64 +if mdadm --remove $container $dev0 ; then + # the dev[23] family won + imsm_check_removal $container $dev1 + imsm_check_hold $container $dev2 + imsm_check_hold $container $dev3 +else + # the dev[01] family won + imsm_check_hold $container $dev1 + imsm_check_removal $container $dev2 + imsm_check_removal $container $dev3 +fi +mdadm -Ss + +# reassemble with a new id for the dev[23] family +mdadm -A $container $dev0 $dev1 +mdadm -IR $container +mdadm -A ${container}2 $dev2 $dev3 --update=uuid +mdadm -IR ${container}2 + +testdev ${member}_0 1 $size 64 +testdev ${member}_1 1 $size 64 diff --git a/tests/09imsm-create-fail-rebuild b/tests/09imsm-create-fail-rebuild new file mode 100644 index 00000000..de17f321 --- /dev/null +++ b/tests/09imsm-create-fail-rebuild @@ -0,0 +1,78 @@ +# sanity check array creation + +imsm_check_hold() { + if mdadm --remove $1 $2; then + echo "$2 removal from $1 should have been blocked" >&2 + cat /proc/mdstat >&2 + mdadm -E $2 + exit 1 + fi +} + +imsm_check_removal() { + if ! 
mdadm --remove $1 $2 ; then + echo "$2 removal from $1 should have succeeded" >&2 + cat /proc/mdstat >&2 + mdadm -E $2 + exit 1 + fi +} + +. tests/env-imsm-template + +# IMSM rounds to multiples of one mebibyte - 1024K +DEV_ROUND_K=1024 + +num_disks=2 +mdadm -CR $container -e imsm -n $num_disks $dev0 $dev1 +imsm_check container $num_disks + +# RAID0 + RAID1 +size=9000 +level=0 +chunk=64 +offset=0 +mdadm -CR $member0 $dev0 $dev1 -n $num_disks -l $level -z $size -c $chunk +imsm_check member $member0 $num_disks $level $size $((size*2)) $offset $chunk +testdev $member0 $num_disks $size $chunk + +offset=$(((size & ~(chunk - 1)) + 2048)) +size=4000 +level=1 +chunk=0 +mdadm -CR $member1 $dev0 $dev1 -n $num_disks -l $level -z $size +imsm_check member $member1 $num_disks $level $size $size $offset $chunk +testdev $member1 1 $size 64 +check wait + +mdadm -Ss + +# RAID10 + RAID5 +num_disks=4 +mdadm -CR $container -e imsm -n $num_disks $dev0 $dev1 $dev2 $dev3 +imsm_check container $num_disks + +size=9000 +level=10 +chunk=64 +offset=0 +mdadm -CR $member0 $dev0 $dev1 $dev2 $dev3 -n $num_disks -l $level -z $size -c $chunk +imsm_check member $member0 $num_disks $level $size $((size*2)) $offset $chunk +testdev $member0 $((num_disks-2)) $size $chunk + +offset=$(((size & ~(chunk - 1)) + 2048)) +size=4000 +level=5 +mdadm -CR $member1 $dev0 $dev1 $dev2 $dev3 -n $num_disks -l $level -z $size -c $chunk +imsm_check member $member1 $num_disks $level $size $((size*3)) $offset $chunk +testdev $member1 $((num_disks-1)) $size $chunk +check wait + +# FAIL / REBUILD +imsm_check_hold $container $dev0 +mdadm --fail $member0 $dev0 +mdadm --wait-clean --scan || true +imsm_check_removal $container $dev0 +mdadm --add $container $dev4 +check wait +imsm_check_hold $container $dev4 diff --git a/tests/09imsm-overlap b/tests/09imsm-overlap new file mode 100644 index 00000000..e832257c --- /dev/null +++ b/tests/09imsm-overlap @@ -0,0 +1,30 @@ + +. 
tests/env-imsm-template + +# create raid arrays with varying degrees of overlap +mdadm -CR $container -e imsm -n 6 $dev0 $dev1 $dev2 $dev3 $dev4 $dev5 +imsm_check container 6 + +size=1910 +level=1 +num_disks=2 +mdadm -CR $member0 $dev0 $dev1 -n $num_disks -l $level -z $size +mdadm -CR $member1 $dev1 $dev2 -n $num_disks -l $level -z $size +mdadm -CR $member2 $dev2 $dev3 -n $num_disks -l $level -z $size +mdadm -CR $member3 $dev3 $dev4 -n $num_disks -l $level -z $size +mdadm -CR $member4 $dev4 $dev5 -n $num_disks -l $level -z $size + +udevadm settle + +offset=0 +imsm_check member $member0 $num_disks $level $size 1024 $offset +offset=$((offset+size+2048)) +imsm_check member $member1 $num_disks $level $size 1024 $offset +offset=$((offset+size+2048)) +imsm_check member $member2 $num_disks $level $size 1024 $offset +# at this point there should be more freespace at the start of the disk +# than the end +offset=0 +imsm_check member $member3 $num_disks $level $size 1024 $offset +offset=$((offset+size+2048)) +imsm_check member $member4 $num_disks $level $size 1024 $offset diff --git a/tests/10ddf-assemble-missing b/tests/10ddf-assemble-missing new file mode 100644 index 00000000..4bf21b25 --- /dev/null +++ b/tests/10ddf-assemble-missing @@ -0,0 +1,61 @@ +# An array is assembled incompletely. +# Re missing disks get marked as missing and are not allowed back in + +. 
tests/env-ddf-template +tmp=$(mktemp /tmp/mdtest-XXXXXX) +rm -f $tmp /var/tmp/mdmon.log +ret=0 + +mdadm -CR $container -e ddf -n 4 $dev8 $dev9 $dev10 $dev11 +ddf_check container 4 + +mdadm -CR $member1 -n 4 -l 10 $dev8 $dev10 $dev9 $dev11 -z 10000 +mdadm -CR $member0 -n 2 -l 1 $dev8 $dev9 -z 10000 + +mdadm --wait $member0 || true +mdadm --wait $member1 || true + +mdadm -Ss +sleep 1 + +# Add all devices except those for $member0 +mdadm -I $dev10 +mdadm -I $dev11 + +# Start runnable members +mdadm -IRs || true +mdadm -Ss + +#[ -f /var/tmp/mdmon.log ] && cat /var/tmp/mdmon.log + +# Now reassemble +# This should work because BVDs weren't written to +for d in $dev8 $dev9 $dev10 $dev11; do + mdadm -I $d +done +mdadm -Ss + +# Expect consistent state +for d in $dev10 $dev11; do + mdadm -E $d>$tmp + egrep 'state\[0\] : Degraded, Consistent' $tmp || { + ret=1 + echo ERROR: $member0 has unexpected state on $d + } + egrep 'state\[1\] : Optimal, Consistent' $tmp || { + ret=1 + echo ERROR: $member1 has unexpected state on $d + } + + if [ x$(egrep -c 'active/Online$' $tmp) != x2 ]; then + ret=1 + echo ERROR: unexpected number of online disks on $d + fi +done + +if [ $ret -ne 0 ]; then + mdadm -E $dev10 + mdadm -E $dev8 +fi +rm -f $tmp /var/tmp/mdmon.log +[ $ret -eq 0 ] diff --git a/tests/10ddf-create b/tests/10ddf-create new file mode 100644 index 00000000..44e95441 --- /dev/null +++ b/tests/10ddf-create @@ -0,0 +1,89 @@ +# +# Test basic DDF functionality. +# +# Create a container with 5 drives +# create a small raid0 across them all, +# then a small raid10 using 4 drives, then a 2disk raid1 +# and a 3disk raid5 using the remaining space +# +# add some data, tear down the array, reassemble +# and make sure it is still there. +set -e +. 
tests/env-ddf-template +sda=$(get_rootdev) || exit 1 + +mdadm -CR /dev/md/ddf0 -e ddf -n 5 $dev8 $dev9 $dev10 $dev11 $dev12 +mdadm -CR r5 -l5 -n5 /dev/md/ddf0 -z 5000 +if mdadm -CR r5 -l1 -n2 /dev/md/ddf0 -z 5000 +then echo >&2 create with same name should fail ; exit 1 +fi +mdadm -CR r10 -l10 -n4 -pn2 /dev/md/ddf0 -z 5000 +mdadm -CR r1 -l1 -n2 /dev/md/ddf0 +mdadm -CR r0 -l0 -n3 /dev/md/ddf0 +testdev /dev/md/r5 4 5000 512 +testdev /dev/md/r10 2 5000 512 +# r0/r10 will use 4608 due to chunk size, so that leaves 23552 for the rest +testdev /dev/md/r1 1 23552 64 +testdev /dev/md/r0 3 23552 512 +dd if=$sda of=/dev/md/r0 || true +dd if=$sda of=/dev/md/r10 || true +dd if=$sda of=/dev/md/r1 || true +dd if=$sda of=/dev/md/r5 || true + +s0=`sha1sum /dev/md/r0` +s10=`sha1sum /dev/md/r10` +s1=`sha1sum /dev/md/r1` +s5=`sha1sum /dev/md/r5` + + +mdadm -Ss +mdadm -A /dev/md/ddf0 $dev8 $dev9 $dev10 $dev11 $dev12 +mdadm -I /dev/md/ddf0 + +udevadm settle +s0a=`sha1sum /dev/md/r0` +s10a=`sha1sum /dev/md/r10` +s1a=`sha1sum /dev/md/r1` +s5a=`sha1sum /dev/md/r5` + +if [ "$s0" != "$s0a" ]; then + echo r0 did not match ; exit 1; +fi +if [ "$s10" != "$s10a" ]; then + echo r10 did not match ; exit 1; +fi +if [ "$s1" != "$s1a" ]; then + echo r1 did not match ; exit 1; +fi +if [ "$s5" != "$s5a" ]; then + echo r5 did not match ; exit 1; +fi + +# failure status just means it has completed already, so ignore it. +mdadm --wait /dev/md/r1 || true +mdadm --wait /dev/md/r10 || true +mdadm --wait /dev/md/r5 || true + +mdadm -Dbs > /var/tmp/mdadm.conf + +mdadm -Ss + +# Now try to assemble using mdadm.conf +mdadm -Asc /var/tmp/mdadm.conf +check nosync # This failed once. The raid5 was resyncing. +udevadm settle +mdadm -Dbs | sort > /tmp/mdadm.conf +sort /var/tmp/mdadm.conf | diff /tmp/mdadm.conf - +mdadm -Ss + +# and now assemble fully incrementally. 
+for i in $dev8 $dev9 $dev10 $dev11 $dev12 +do + mdadm -I $i -c /var/tmp/mdadm.conf +done +check nosync +udevadm settle +mdadm -Dbs | sort > /tmp/mdadm.conf +sort /var/tmp/mdadm.conf | diff /tmp/mdadm.conf - +mdadm -Ss +rm /tmp/mdadm.conf /var/tmp/mdadm.conf diff --git a/tests/10ddf-create-fail-rebuild b/tests/10ddf-create-fail-rebuild new file mode 100644 index 00000000..a8e8ced9 --- /dev/null +++ b/tests/10ddf-create-fail-rebuild @@ -0,0 +1,77 @@ +# sanity check array creation + +ddf_check_hold() { + if mdadm --remove $1 $2; then + echo "$2 removal from $1 should have been blocked" >&2 + cat /proc/mdstat >&2 + mdadm -E $2 + exit 1 + fi +} + +ddf_check_removal() { + if ! mdadm --remove $1 $2 ; then + echo "$2 removal from $1 should have succeeded" >&2 + cat /proc/mdstat >&2 + mdadm -E $2 + exit 1 + fi +} + +. tests/env-ddf-template + +num_disks=2 +mdadm -CR $container -e ddf -n $num_disks $dev8 $dev9 +ddf_check container $num_disks + +# RAID0 + RAID1 +size=9000 +level=0 +chunk=64 +offset=0 +layout=0 +mdadm -CR $member0 $dev8 $dev9 -n $num_disks -l $level -z $size -c $chunk +ddf_check member $member0 $num_disks $level $size $((size*2)) $offset $chunk $layout +testdev $member0 $num_disks $size $chunk + +offset=$(((size & ~(chunk - 1)))) +size=4000 +level=1 +chunk=0 +mdadm -CR $member1 $dev8 $dev9 -n $num_disks -l $level -z $size +ddf_check member $member1 $num_disks $level $size $size $offset $chunk $layout +testdev $member1 1 $size 1 +check wait + +mdadm -Ss + +# RAID10 + RAID5 +num_disks=4 +mdadm -CR $container -e ddf -n $num_disks $dev8 $dev9 $dev10 $dev11 +ddf_check container $num_disks + +size=9000 +level=10 +chunk=64 +offset=0 +layout=2 +mdadm -CR $member0 $dev8 $dev9 $dev10 $dev11 -n $num_disks -l $level -z $size -c $chunk +ddf_check member $member0 $num_disks $level $size $((size*2)) $offset $chunk $layout +testdev $member0 $((num_disks-2)) $size $chunk + +offset=$(((size & ~(chunk - 1)))) +size=4000 +level=5 +mdadm -CR $member1 $dev8 $dev9 $dev10 $dev11 -n 
$num_disks -l $level -z $size -c $chunk +ddf_check member $member1 $num_disks $level $size $((size*3)) $offset $chunk $layout +testdev $member1 $((num_disks-1)) $size $chunk +check wait + +# FAIL / REBUILD +ddf_check_hold $container $dev8 +mdadm --fail $member0 $dev8 +mdadm --wait-clean --scan || true +ddf_check_removal $container $dev8 +mdadm --add $container $dev12 +check wait +ddf_check_hold $container $dev12 diff --git a/tests/10ddf-fail-create-race b/tests/10ddf-fail-create-race new file mode 100644 index 00000000..bd5dfb51 --- /dev/null +++ b/tests/10ddf-fail-create-race @@ -0,0 +1,66 @@ +# This test creates a RAID1, fails a disk, and immediately +# (simultaneously) creates a new array. This tests for a possible +# race where the meta data reflecting the disk failure may not +# be written when the 2nd array is created. +. tests/env-ddf-template + +mdadm --zero-superblock $dev8 $dev9 $dev10 $dev11 $dev12 $dev13 + +mdadm -CR $container -e ddf -l container -n 2 $dev11 $dev12 +#$dir/mdadm -CR $member0 -l raid1 -n 2 $container -z 10000 >/tmp/mdmon.txt 2>&1 +mdadm -CR $member0 -l raid1 -n 2 $container -z 10000 +check wait +fail0=$dev11 +mdadm --fail $member0 $fail0 & + +# The test can succeed two ways: +# 1) mdadm -C member1 fails - in this case the meta data +# was already on disk when the create attempt was made +# 2) mdadm -C succeeds in the first place (meta data not on disk yet), +# but mdmon detects the problem and sets the disk faulty. + +if mdadm -CR $member1 -l raid1 -n 2 $container; then + + echo create should have failed / race condition? 
+ + check wait + set -- $(get_raiddisks $member0) + d0=$1 + ret=0 + if [ $1 = $fail0 -o $2 = $fail0 ]; then + ret=1 + else + set -- $(get_raiddisks $member1) + if [ $1 = $fail0 -o $2 = $fail0 ]; then + ret=1 + fi + fi + if [ $ret -eq 1 ]; then + echo ERROR: failed disk $fail0 is still a RAID member + echo $member0: $(get_raiddisks $member0) + echo $member1: $(get_raiddisks $member1) + fi + tmp=$(mktemp /tmp/mdest-XXXXXX) + mdadm -E $d0 >$tmp + if [ x$(grep -c 'state\[[01]\] : Degraded' $tmp) != x2 ]; then + echo ERROR: non-degraded array found + mdadm -E $d0 + ret=1 + fi + if ! grep -q '^ *0 *[0-9a-f]\{8\} .*Offline, Failed' $tmp; then + echo ERROR: disk 0 not marked as failed in meta data + mdadm -E $d0 + ret=1 + fi + rm -f $tmp +else + ret=0 +fi + +[ -f /tmp/mdmon.txt ] && { + cat /tmp/mdmon.txt + rm -f /tmp/mdmon.txt +} + +[ $ret -eq 0 ] + diff --git a/tests/10ddf-fail-readd b/tests/10ddf-fail-readd new file mode 100644 index 00000000..9cd78937 --- /dev/null +++ b/tests/10ddf-fail-readd @@ -0,0 +1,55 @@ +# Simple fail / re-add test +. tests/env-ddf-template + +tmp=$(mktemp /tmp/mdtest-XXXXXX) +rm -f $tmp + +mdadm --zero-superblock $dev8 $dev9 +mdadm -CR $container -e ddf -l container -n 2 $dev8 $dev9 + +mdadm -CR $member0 -l raid1 -n 2 $container +#$dir/mdadm -CR $member0 -l raid1 -n 2 $container >/tmp/mdmon.txt 2>&1 + +mke2fs -F $member0 +check wait + +set -- $(get_raiddisks $member0) +fail0=$1 +mdadm $member0 --fail $fail0 + +sleep 1 +mdadm $container --remove $fail0 + +set -- $(get_raiddisks $member0) +case $1 in MISSING) shift;; esac +good0=$1 + +# We re-add the disk now +mdadm $container --add $fail0 + +sleep 1 +mdadm --wait $member0 || true + +ret=0 +set -- $(get_raiddisks $member0) +case $1:$2 in + $dev8:$dev9|$dev9:$dev8);; + *) echo ERROR: bad raid disks "$@"; ret=1;; +esac + +mdadm -Ss +for x in $@; do + mdadm -E $x >$tmp + if ! 
grep -q 'state\[0\] : Optimal, Consistent' $tmp; then + echo ERROR: member 0 should be optimal in meta data on $x + ret=1 + fi +done + +rm -f $tmp +if [ $ret -ne 0 ]; then + mdadm -E $dev8 + mdadm -E $dev9 +fi + +[ $ret -eq 0 ] diff --git a/tests/10ddf-fail-readd-readonly b/tests/10ddf-fail-readd-readonly new file mode 100644 index 00000000..6a74d9c8 --- /dev/null +++ b/tests/10ddf-fail-readd-readonly @@ -0,0 +1,71 @@ +# Simple fail / re-add test +. tests/env-ddf-template + +tmp=$(mktemp /tmp/mdtest-XXXXXX) +rm -f $tmp + +mdadm --zero-superblock $dev8 $dev9 +mdadm -CR $container -e ddf -l container -n 2 $dev8 $dev9 + +mdadm -CR $member0 -l raid1 -n 2 $container +#$dir/mdadm -CR $member0 -l raid1 -n 2 $container >/tmp/mdmon.txt 2>&1 + +check wait + +set -- $(get_raiddisks $member0) +fail0=$1 +mdadm $member0 --fail $fail0 + +sleep 1 +set -- $(get_raiddisks $member0) +case $1 in MISSING) shift;; esac +good0=$1 + +# Check that the meta data now show one disk as failed +ret=0 +for x in $@; do + mdadm -E $x >$tmp + if ! grep -q 'state\[0\] : Degraded, Consistent' $tmp; then + echo ERROR: member 0 should be degraded in meta data on $x + ret=1 + fi + phys=$(grep $x $tmp) + case $x:$phys in + $fail0:*active/Offline,\ Failed);; + $good0:*active/Online);; + *) echo ERROR: wrong phys disk state for $x + ret=1 + ;; + esac +done + +mdadm $container --remove $fail0 + +# We re-add the disk now +mdadm $container --add $fail0 + +sleep 1 +mdadm --wait $member0 || true + +set -- $(get_raiddisks $member0) +case $1:$2 in + $dev8:$dev9|$dev9:$dev8);; + *) echo ERROR: bad raid disks "$@"; ret=1;; +esac + +mdadm -Ss +for x in $@; do + mdadm -E $x >$tmp + if ! 
grep -q 'state\[0\] : Optimal, Consistent' $tmp; then + echo ERROR: member 0 should be optimal in meta data on $x + ret=1 + fi +done + +rm -f $tmp +if [ $ret -ne 0 ]; then + mdadm -E $dev8 + mdadm -E $dev9 +fi + +[ $ret -eq 0 ] diff --git a/tests/10ddf-fail-spare b/tests/10ddf-fail-spare new file mode 100644 index 00000000..ab737ca4 --- /dev/null +++ b/tests/10ddf-fail-spare @@ -0,0 +1,86 @@ +# Test suggested by Albert Pauw: Create, fail one disk, have mdmon +# activate the spare, +# then run create again. Shouldn't use the failed disk for Create. +. tests/env-ddf-template + +tmp=$(mktemp /tmp/mdtest-XXXXXX) +rm -f $tmp + +mdadm --zero-superblock $dev8 $dev9 $dev10 $dev11 $dev12 $dev13 +mdadm -CR $container -e ddf -l container -n 5 $dev8 $dev9 $dev10 $dev11 $dev12 + +mdadm -CR $member0 -l raid1 -n 2 $container +#$dir/mdadm -CR $member0 -l raid1 -n 2 $container >/tmp/mdmon.txt 2>&1 + +check wait + +set -- $(get_raiddisks $member0) +fail0=$1 +mdadm --fail $member0 $fail0 + +# To make sure the spare is activated, we may have to sleep +# 2s has always been enough for me +sleep 2 +check wait + +# This test can succeed both ways - if spare was activated +# before new array was created, we see only member 0. +# otherwise, we see both, and member0 is degraded because the +# new array grabbed the spare +# which case occurs depends on the sleep time above. +ret=0 +if mdadm -CR $member1 -l raid5 -n 3 $container; then + # Creation successful - must have been quicker than spare activation + + check wait + set -- $(get_raiddisks $member1) + if [ $1 = $fail0 -o $2 = $fail0 -o $3 = $fail0 ]; then + echo ERROR: $member1 must not contain $fail0: $@ + ret=1 + fi + d1=$1 + mdadm -E $d1 >$tmp + if ! 
grep -q 'state\[1\] : Optimal, Consistent' $tmp; then + echo ERROR: member 1 should be optimal in meta data + ret=1 + fi + state0=Degraded +else + # Creation unsuccessful - spare was used for member 0 + state0=Optimal +fi + +# need to delay a little bit, sometimes the meta data aren't +# up-to-date yet +sleep 0.5 +set -- $(get_raiddisks $member0) +if [ $1 = $fail0 -o $2 = $fail0 ]; then + echo ERROR: $member0 must not contain $fail0: $@ + ret=1 +fi +d0=$1 + +[ -f $tmp ] || mdadm -E $d0 >$tmp + +if ! grep -q 'state\[0\] : '$state0', Consistent' $tmp; then + echo ERROR: member 0 should be $state0 in meta data + ret=1 +fi +if ! grep -q 'Offline, Failed' $tmp; then + echo ERROR: Failed disk expected in meta data + ret=1 +fi +if [ $ret -eq 1 ]; then + cat /proc/mdstat + mdadm -E $d0 + mdadm -E $d1 + mdadm -E $fail0 +fi + +[ -f /tmp/mdmon.txt ] && { + cat /tmp/mdmon.txt + rm -f /tmp/mdmon.txt +} + +rm -f $tmp +[ $ret -eq 0 ] diff --git a/tests/10ddf-fail-stop-readd b/tests/10ddf-fail-stop-readd new file mode 100644 index 00000000..f8ebe176 --- /dev/null +++ b/tests/10ddf-fail-stop-readd @@ -0,0 +1,66 @@ +# Simple fail / re-add test +. 
tests/env-ddf-template + +tmp=$(mktemp /tmp/mdtest-XXXXXX) +rm -f $tmp + +mdadm --zero-superblock $dev8 $dev9 +mdadm -CR $container -e ddf -l container -n 2 $dev8 $dev9 + +mdadm -CR $member0 -l raid1 -n 2 $container +#$dir/mdadm -CR $member0 -l raid1 -n 2 $container >/tmp/mdmon.txt 2>&1 + +# Write to the array +mke2fs -F $member0 +check wait + +set -- $(get_raiddisks $member0) +fail0=$1 +mdadm $member0 --fail $fail0 + +sleep 1 +mdadm $container --remove $fail0 + +set -- $(get_raiddisks $member0) +case $1 in MISSING) shift;; esac +good0=$1 + +mdadm -Ss + +sleep 1 +# Now simulate incremental assembly +mdadm -I $good0 +mdadm -IRs || true + +# Write to the array +mke2fs -F $member0 + +# We re-add the disk now +mdadm $container --add $fail0 + +sleep 1 +mdadm --wait $member0 || true + +ret=0 +set -- $(get_raiddisks $member0) +case $1:$2 in + $dev8:$dev9|$dev9:$dev8);; + *) echo ERROR: bad raid disks "$@"; ret=1;; +esac + +mdadm -Ss +for x in $@; do + mdadm -E $x >$tmp + if ! grep -q 'state\[0\] : Optimal, Consistent' $tmp; then + echo ERROR: member 0 should be optimal in meta data on $x + ret=1 + fi +done + +rm -f $tmp +if [ $ret -ne 0 ]; then + mdadm -E $dev8 + mdadm -E $dev9 +fi + +[ $ret -eq 0 ] diff --git a/tests/10ddf-fail-twice b/tests/10ddf-fail-twice new file mode 100644 index 00000000..6af19434 --- /dev/null +++ b/tests/10ddf-fail-twice @@ -0,0 +1,59 @@ +. 
tests/env-ddf-template + +num_disks=5 +mdadm -CR $container -e ddf -n $num_disks $dev8 $dev9 $dev10 $dev11 $dev12 +ddf_check container $num_disks + +mdadm -CR $member0 -n 2 -l 1 $container +mdadm -CR $member1 -n 3 -l 5 $container + +mdadm --wait $member1 $member0 || mdadm --wait $member1 $member0 || true + +set -- $(get_raiddisks $member0) +fail0=$1 +mdadm $member0 --fail $fail0 +set -- $(get_raiddisks $member1) +fail1=$1 +mdadm $member1 --fail $fail1 + +mdadm $container --add $dev13 + +mdadm --wait $member1 $member0 || mdadm --wait $member1 $member0 || true + + +devs0="$(get_raiddisks $member0)" +devs1="$(get_raiddisks $member1)" + +present=$(($(get_present $member0) + $(get_present $member1))) +[ $present -eq 4 ] || { + echo expected 4 present disks, got $present + echo devices for $member0: $devs0 + echo devices for $member1: $devs1 + exit 1 +} + +if echo "$devs0" | grep -q MISSING; then + good=1 + bad=0 +else + good=0 + bad=1 +fi + +# find a good device +eval "set -- \$devs$good" +check=$1 + +tmp=$(mktemp /tmp/mdtest-XXXXXX) +mdadm -E $check >$tmp + +{ grep -q 'state\['$bad'\] : Degraded, Consistent' $tmp && + grep -q 'state\['$good'\] : Optimal, Consistent' $tmp; } || { + echo unexpected meta data state on $check + mdadm -E $check + rm -f $tmp + exit 1 +} + +rm -f $tmp +exit 0 diff --git a/tests/10ddf-fail-two-spares b/tests/10ddf-fail-two-spares new file mode 100644 index 00000000..e00810d8 --- /dev/null +++ b/tests/10ddf-fail-two-spares @@ -0,0 +1,86 @@ +# Simulate two disks failing shortly after each other +. 
tests/env-ddf-template +sda=$(get_rootdev) || exit 1 +tmp=$(mktemp /tmp/mdtest-XXXXXX) + +mdadm --zero-superblock $dev8 $dev9 $dev10 $dev11 $dev12 $dev13 +mdadm -CR $container -e ddf -l container -n 6 \ + $dev8 $dev9 $dev10 $dev11 $dev12 $dev13 + +#fast_sync + +mdadm -CR $member0 -l raid6 -n 4 $dev10 $dev11 $dev12 $dev13 -z 16384 +#$dir/mdadm -CR $member0 -l raid6 -n 4 $dev10 $dev11 $dev12 $dev13 -z 16384 \ +# >/tmp/mdmon.txt 2>&1 +mdadm -CR $member1 -l raid10 -n 4 $dev10 $dev11 $dev12 $dev13 -z 16384 + +dd if=$sda of=$member0 bs=1M count=32 +dd if=$sda of=$member1 bs=1M skip=16 count=16 + +check wait + +sum0=$(sha1sum $member0) +sum1=$(sha1sum $member1) + +mdadm --fail $member1 $dev11 +sleep 1 +mdadm --fail $member1 $dev12 + +# We will have 4 resync procedures, 2 spares for 2 arrays. +mdadm --wait $member1 $member0 || true +mdadm --wait $member1 $member0 || true + +devs0="$(get_raiddisks $member0)" +devs1="$(get_raiddisks $member1)" +expected="$dev10 +$dev13 +$dev8 +$dev9" + +ret=0 +if [ "$(echo "$devs0" | sort)" != "$expected" \ + -o "$(echo "$devs1" | sort)" != "$expected" ]; then + echo ERROR: unexpected members + echo $member0: $devs0 + echo $member1: $devs1 + ret=1 +fi + +mdadm -E $dev10 >$tmp +if ! grep -q 'state\[0\] : Optimal, Consistent' $tmp; then + echo ERROR: $member0 should be optimal in meta data + ret=1 +fi +if ! 
grep -q 'state\[1\] : Optimal, Consistent' $tmp; then + echo ERROR: $member1 should be optimal in meta data + ret=1 +fi +if [ x"$(grep -c active/Online $tmp)" != x4 ]; then + echo ERROR: expected 4 online disks + ret=1 +fi +if [ x"$(grep -c "Offline, Failed" $tmp)" != x2 ]; then + echo ERROR: expected 2 failed disks + ret=1 +fi + +sum0a=$(sha1sum $member0) +sum1a=$(sha1sum $member1) + +if [ "$sum0" != "$sum0a" -o "$sum1" != "$sum1a" ]; then + echo ERROR: checksum mismatch + ret=1 +fi + +if [ $ret -eq 1 ]; then + cat /proc/mdstat + cat $tmp +fi + +[ -f /tmp/mdmon.txt ] && { + cat /tmp/mdmon.txt + rm -f /tmp/mdmon.txt +} +rm -f $tmp + +[ $ret -eq 0 ] diff --git a/tests/10ddf-geometry b/tests/10ddf-geometry new file mode 100644 index 00000000..b0cce2f6 --- /dev/null +++ b/tests/10ddf-geometry @@ -0,0 +1,82 @@ +# +# Test various RAID geometries, creation and deletion of subarrays +# + +assert_fail() { + if mdadm "$@"; then + echo mdadm "$@" must fail + return 1 + else + return 0 + fi +} + +assert_kill() { + local dev=$1 n=$2 + mdadm -S $dev + mdadm --kill-subarray=$n /dev/md/ddf0 + if mdadm -Dbs | grep -q $dev; then + echo >&2 $dev should be deleted + return 1 + fi + return 0 +} + +set -e +mdadm -CR /dev/md/ddf0 -e ddf -n 6 $dev8 $dev9 $dev10 $dev11 $dev12 $dev13 + +# RAID1 geometries +# Use different sizes to make offset calculation harder +mdadm -CR l1s -l1 -n2 /dev/md/ddf0 -z 8000 +mdadm -CR l1m -l1 -n3 $dev8 $dev9 $dev10 -z 10000 +assert_fail -CR badl1 -l1 -n4 /dev/md/ddf0 + +# RAID10 geometries +mdadm -CR l10_0 -l10 -n3 /dev/md/ddf0 -z 1000 +mdadm -CR l10_1 -l10 -n5 /dev/md/ddf0 -z 1000 +assert_fail mdadm -CR badl10 -l10 -n4 -pn3 /dev/md/ddf0 +mdadm -CR l10_2 -l10 -n6 -pn2 /dev/md/ddf0 -z 4000 +mdadm -CR l10_3 -l10 -n6 -pn3 /dev/md/ddf0 -z 4000 + +assert_fail -CR l10_2 -l10 -n6 -pn2 /dev/md/ddf0 -z 5000 +assert_kill /dev/md/l10_2 4 +# gone now, must be able to create it again +mdadm -CR l10_2 -l10 -n6 -pn2 /dev/md/ddf0 -z 5000 + +# Now stop and reassemble +mdadm 
-Ss +mdadm -A /dev/md/ddf0 $dev8 $dev9 $dev10 $dev11 $dev12 $dev13 + +# Same as above, on inactive container +assert_fail -CR l10_3 -l10 -n6 -pn2 /dev/md/ddf0 -z 5000 +# Kill subarray without having started anything (no mdmon) +mdadm --kill-subarray=5 /dev/md/ddf0 +mdadm -I /dev/md/ddf0 +mdadm -CR l10_3 -l10 -n6 -pn3 /dev/md/ddf0 -z 5000 + +assert_kill /dev/md/l10_2 4 +assert_kill /dev/md/l10_3 5 + +# RAID5 geometries +mdadm -CR l5la -l5 -n3 --layout=ddf-N-restart /dev/md/ddf0 -z 5000 +mdadm -CR l5ra -l5 -n3 --layout=ddf-zero-restart /dev/md/ddf0 -z 5000 +mdadm -CR l5ls -l5 -n3 --layout=ddf-N-continue /dev/md/ddf0 -z 5000 +assert_fail -CR l5rs -l5 -n3 -prs /dev/md/ddf0 -z 5000 + +# Stop and reassemble +mdadm -Ss +mdadm -A /dev/md/ddf0 $dev8 $dev9 $dev10 $dev11 $dev12 $dev13 +mdadm -I /dev/md/ddf0 + +assert_kill /dev/md/l5la 4 +assert_kill /dev/md/l5ls 6 +assert_kill /dev/md/l5ra 5 + +# RAID6 geometries +assert_fail -CR l6la -l6 -n3 -pla /dev/md/ddf0 -z 5000 +assert_fail -CR l6rs -l5 -n4 -prs /dev/md/ddf0 -z 5000 +mdadm -CR l6la -l6 -n4 --layout=ddf-N-restart /dev/md/ddf0 -z 5000 +mdadm -CR l6ra -l6 -n4 --layout=ddf-zero-restart $dev8 $dev9 $dev10 $dev11 -z 5000 +mdadm -CR l6ls -l6 -n4 --layout=ddf-N-continue $dev13 $dev8 $dev9 $dev12 -z 5000 + +mdadm -Ss diff --git a/tests/10ddf-incremental-wrong-order b/tests/10ddf-incremental-wrong-order new file mode 100644 index 00000000..9ecf6bc2 --- /dev/null +++ b/tests/10ddf-incremental-wrong-order @@ -0,0 +1,131 @@ +# An array is assembled incompletely. Some disks will +# have later metadata than others. +# The array is then reassembled in the "wrong" order - +# older meta data first. +# This FAILS with mdadm 3.3 +. 
tests/env-ddf-template +tmp=$(mktemp /tmp/mdtest-XXXXXX) +rm -f $tmp /var/tmp/mdmon.log +ret=0 + +mdadm -CR $container -e ddf -n 4 $dev8 $dev9 $dev10 $dev11 +ddf_check container 4 + +mdadm -CR $member1 -n 4 -l 10 $dev8 $dev10 $dev9 $dev11 -z 10000 +mdadm -CR $member0 -n 2 -l 1 $dev8 $dev9 -z 10000 + +mdadm --wait $member0 || true +mdadm --wait $member1 || true + +mke2fs -F $member0 +mke2fs -F $member1 +sha_0a=$(sha1_sum $member0) +sha_1a=$(sha1_sum $member1) + +mdadm -Ss +sleep 1 + +# Add all devices except those for $member0 +mdadm -I $dev10 +mdadm -I $dev11 + +# Start runnable members ($member1) and write +mdadm -IRs || true +e2fsck -fy $member1 +sha_1b=$(sha1_sum $member1) + +mdadm -Ss +sleep 1 + +# Seq number should be different now +seq8a=$(mdadm -E $dev8 | sed -n 's/^ *Seq : //p') +seq10a=$(mdadm -E $dev10 | sed -n 's/^ *Seq : //p') + +if [ $seq8a -ge $seq10a ]; then + ret=1 + echo ERROR: sequential number of $dev10 not bigger than $dev8 +fi +if [ x$sha_1a = x$sha_1b ]; then + ret=1 + echo ERROR: sha1sums equal after write +fi + +#[ -f /var/tmp/mdmon.log ] && cat /var/tmp/mdmon.log + +# Now reassemble +# Note that we add the previously missing disks first. 
+# $dev10 should have a higher seq number than $dev8 +for d in $dev8 $dev9 $dev10 $dev11; do + mdadm -I $d +done + +mdadm -IRs || true +sha_0c=$(sha1_sum $member0) +sha_1c=$(sha1_sum $member1) + +mdadm -Ss +sleep 1 + +seq8c=$(mdadm -E $dev8 | sed -n 's/^ *Seq : //p') +seq10c=$(mdadm -E $dev10 | sed -n 's/^ *Seq : //p') + +if [ x$sha_0a != x$sha_0c ]; then + ret=1 + echo ERROR: sha1sum of $member0 has changed +fi +if [ x$sha_1b != x$sha_1c ]; then + ret=1 + echo ERROR: sha1sum of $member1 has changed +fi +if [ \( $seq10a -ge $seq10c \) -o \( $seq8c -ne $seq10c \) ]; then + ret=1 + echo ERROR: sequential numbers are wrong +fi + +# Expect consistent state +for d in $dev10 $dev8; do + mdadm -E $d>$tmp + for x in 0 1; do + egrep 'state\['$x'\] : Optimal, Consistent' $tmp || { + ret=1 + echo ERROR: $member0 has unexpected state on $d + } + done + if [ x$(egrep -c 'active/Online$' $tmp) != x4 ]; then + ret=1 + echo ERROR: unexpected number of online disks on $d + fi +done + +# Now try assembly +if mdadm -A $container $dev8 $dev9 $dev10 $dev11; then + mdadm -IR $container + sha_0d=$(sha1_sum $member0) + sha_1d=$(sha1_sum $member1) + mdadm -Ss + sleep 1 + seq8d=$(mdadm -E $dev8 | sed -n 's/^ *Seq : //p') + seq10d=$(mdadm -E $dev10 | sed -n 's/^ *Seq : //p') + if [ x$sha_0a != x$sha_0d ]; then + ret=1 + echo ERROR: sha1sum of $member0 has changed + fi + if [ x$sha_1b != x$sha_1d ]; then + ret=1 + echo ERROR: sha1sum of $member1 has changed + fi + if [ \( $seq10a -ge $seq10d \) -o \( $seq8d -ne $seq10d \) ]; then + ret=1 + echo ERROR: sequential numbers are wrong + fi +else + ret=1 + echo ERROR: assembly failed +fi + +if [ $ret -ne 0 ]; then + mdadm -E $dev10 + mdadm -E $dev8 +fi +rm -f $tmp /var/tmp/mdmon.log +[ $ret -eq 0 ] diff --git a/tests/10ddf-sudden-degraded b/tests/10ddf-sudden-degraded new file mode 100644 index 00000000..dc692aea --- /dev/null +++ b/tests/10ddf-sudden-degraded @@ -0,0 +1,18 @@ +# +# An array is assembled with one device missing. 
+# The other device must be marked as Failed in metadata + +. tests/env-ddf-template + +mdadm -CR $container -e ddf -n 2 $dev8 $dev9 +ddf_check container 2 + +mdadm -CR $member1 -n 2 -l1 $dev8 $dev9 +mdadm --wait $member1 || true +mdadm -Ss + +mdadm -I $dev8 +mdadm -R $container +mkfs $member1 +# There must be a missing device recorded +mdadm --examine $dev8 | grep 'Raid Devices.*--' || exit 1 diff --git a/tests/11spare-migration b/tests/11spare-migration new file mode 100644 index 00000000..24b6ec69 --- /dev/null +++ b/tests/11spare-migration @@ -0,0 +1,454 @@ +# Set of tests for autorebuild functionality using mdadm -F +# To be able to test ddf one must have all loop devices of bigger size, with the ones +# above number 7 bigger again by any amount (this is not changed for now as it +# could affect other tests) + +export IMSM_DEVNAME_AS_SERIAL=1 +export IMSM_TEST_OROM=1 +export IMSM_NO_PLATFORM=1 + +. tests/utils +set -ex +verbose="yes" +sleeptime=10 + +# if listfailed=yes then don't exit if test failed due to wrong +# spare-migration and just print a list at the end. Other errors still +# stop the test. +# if listfailed=no then exit on first failure +listfailed="yes" + +# start Monitor, set monitorpid +# uses global scan variable +# all parameters are numbers of devices to be monitored. only used when $scan="no" +# eg. monitor 0 1 will start monitoring of containers c0, c1 and subarrays v0, v1 +monitor(){ + [ -z $monitorpid ] || return + if [ "$scan" == "yes" ]; then + $mdadm -F -d 1 --scan --mail root@localhost -c $config & + monitorpid=$! + return + fi + unset mddevs + while [ -n "$1" ] + do + eval container=\$c$1 + eval volumes=\$v$1 + mddevs="$mddevs /dev/$container" + if [ "$container" != "$volumes" ]; then + for vol in $volumes; do + mddevs="$mddevs /dev/$vol" + done + fi + shift + done + if [ -n "$mddevs" ]; then + if [ "$verbose" != "yes" ]; then + $mdadm -F -d 1 $mddevs -c $config >&2 & + monitorpid=$! 
+ else + $mdadm -F -t -d 1 $mddevs -c $config & + monitorpid=$! + fi + fi + [ "$verbose" != "yes" ] || echo $mddevs $monitorpid +} + +test0() +{ +dsc "Test 0: No config file, no spare should be moved" +> $config +setupdevs 0 0 1 $platform +setupdevs 1 3 4 $platform +monitor 0 1 +mdadm -a /dev/$c1 $dev2 +mdadm --fail /dev/$v0 $dev0 +# check that spare loop2 was not moved from container c1 to container c0 +chksparemoved $c1 $c0 $dev2 n +tidyup +} + +test0a() +{ +dsc "Test 0a: No domains in config file, no spare should be moved" +setupdevs 0 0 1 $platform +setupdevs 1 3 4 $platform +createconfig a +monitor 0 1 +mdadm -a /dev/$c1 $dev2 +mdadm --fail /dev/$v0 $dev0 +# check that spare loop2 was not moved from container c1 to container c0 +chksparemoved $c1 $c0 $dev2 n +tidyup +} + +test1() +{ +dsc "Test 1: Common domain, add disk to one container and fail first one in another container, spare should be moved" +setupdevs 0 0 1 $platform +setupdevs 1 3 4 $platform +# create config file with arrays and common domain +createconfig a +createconfig domain-$platform $platform spare 0 1 2 3 4 +monitor 0 1 +mdadm -a /dev/$c1 $dev2 +mdadm --fail /dev/$v0 $dev0 +# check that spare loop2 was moved from container c1 to container c0 +chksparemoved $c1 $c0 $dev2 +tidyup +} + +test1a() +{ +dsc "Test 1a: Common domain, add disk to one container and fail second one in another container, spare should be moved" +setupdevs 0 0 1 $platform +setupdevs 1 3 4 $platform +createconfig a +createconfig domain-$platform $platform spare 0 1 2 3 4 +monitor 0 1 +mdadm -a /dev/$c1 $dev2 +mdadm --fail /dev/$v0 $dev1 +# check that spare loop2 was moved from container c1 to container c0 +chksparemoved $c1 $c0 $dev2 +tidyup +} + +test2() +{ +dsc "Test 2: Common domain, fail disk in one container and add one to another container, spare should be moved" +setupdevs 0 0 1 $platform +setupdevs 1 3 4 $platform +createconfig a +createconfig domain-$platform $platform spare 0 1 2 3 4 +monitor 0 1 +mdadm --fail 
/dev/$v0 $dev1 +mdadm -a /dev/$c1 $dev2 +chksparemoved $c1 $c0 $dev2 +tidyup +} + +test3() +{ +dsc "Test 3: Two domains, fail a disk in one domain, add a disk to another domain, the spare should not be moved" +setupdevs 0 0 1 $platform +setupdevs 1 3 4 $platform +# create config file with 2 domains +createconfig a +createconfig domain-$platform"1" $platform spare 0 1 2 +createconfig domain-$platform"2" $platform spare 3 4 5 +monitor 0 1 +mdadm --fail /dev/$v0 $dev1 +mdadm -a /dev/$c1 $dev5 +chksparemoved $c1 $c0 $dev5 n +tidyup +} + +test4() +{ +dsc "Test 4: One domain holds one container, fail a disk in domain, and add disk to a container not described by domain, move if metadata allows" +setupdevs 0 0 1 $platform +setupdevs 1 3 4 $platform +createconfig a +createconfig domain-$platform $platform spare 0 1 2 +monitor 0 1 +mdadm --fail /dev/$v0 $dev1 +mdadm -a /dev/$c1 $dev5 +unset shouldmove +[ "$platform" == "imsm" ] || shouldmove="n" +chksparemoved $c1 $c0 $dev5 $shouldmove +tidyup +} + +test5() +{ +dsc "Test 5: Two domains, two containers in each domain" +setupdevs 0 0 1 $platform +setupdevs 1 3 4 $platform +setupdevs 2 5 6 $platform +setupdevs 3 8 10 $platform +# 2 and 9 for spares +createconfig a +createconfig domain-$platform"1" $platform spare 0 1 2 3 4 +createconfig domain-$platform"2" $platform spare 5 6 8 9 10 +monitor 0 1 2 3 +test5a +test5b +test5c +tidyup +} + +test5a() +{ +dsc "Test 5a: Two containers in each domain, add spare loop2 to domain1 and fail disk in the other domain, the spare should not be moved" +mdadm -a /dev/$c0 $dev2 +mdadm --fail /dev/$v2 $dev5 +chksparemoved $c0 $c2 $dev2 n +} + +test5b() +{ +dsc "Test 5b: Fail disk in the same domain but different container, spare loop2 should be moved" +mdadm --fail /dev/$v1 $dev3 +chksparemoved $c0 $c1 $dev2 +} + +test5c() +{ +dsc "Test 5c: Add spare loop9 to different container in domain with degraded array, spare should be moved" +mdadm -a /dev/$c3 $dev9 +chksparemoved $c3 $c2 $dev9 +} + 
+test6() +{ +dsc "Test 6: One domain has two containers, fail a disk in one container, there is a spare in other container too small to use for rebuild" +setupdevs 0 0 1 $platform +setupdevs 1 8 9 $platform +# all devices in one domain +createconfig a +createconfig domain-$platform $platform spare 0 1 2 8 9 +monitor 0 1 +mdadm -a /dev/$c0 $dev2 +mdadm --fail /dev/$v1 $dev8 +chksparemoved $c0 $c1 $dev2 n +tidyup +} + +test7() +{ +dsc "Test 7: One domain, add small spare to container, fail disk in array, spare not used, add suitable spare to other container, spare should be moved" +setupdevs 0 0 1 $platform +setupdevs 1 8 9 $platform +createconfig a +createconfig domain-$platform $platform spare 0 1 2 8 9 10 +monitor 0 1 +mdadm -a /dev/$c1 $dev2 +mdadm --fail /dev/$v1 $dev8 +mdadm -a /dev/$c0 $dev10 +chksparemoved $c0 $c1 $dev10 +tidyup +} + + +test7a() +{ +dsc "Test 7a: Small spare in parent, suitable one in other container, $dev2 in $c1 is not in common domain" +setupdevs 0 0 1 $platform +setupdevs 1 8 9 $platform +#all $platform devices in one domain +createconfig a +createconfig domain-$platform"1" $platform spare 0 1 8 9 10 +createconfig domain-$platform"2" $platform spare 2 +monitor 0 1 +mdadm -a /dev/$c1 $dev2 +chkspare $c1 $dev2 +mdadm --fail /dev/$v1 $dev8 +mdadm -a /dev/$c0 $dev10 +chksparemoved $c0 $c1 $dev10 +tidyup +} + +test8() +{ +# ddf does not have getinfo_super_disks implemented so skip this test +return +dsc "Test 8: imsm and ddf - spare should not be migrated" +setupdevs 0 10 11 imsm +setupdevs 1 8 9 ddf +createconfig a +createconfig domain0 noplatform spare 8 9 10 11 12 +monitor 0 1 +mdadm -a /dev/$c1 $dev12 +mdadm --fail /dev/$v0 $dev10 +chksparemoved $c1 $c0 $dev12 n +tidyup +} + +test9() +{ +dsc "Test 9: imsm and native 1.2 - one domain, no metadata specified, spare should be moved" +setupdevs 0 10 11 imsm +setupdevs 1 8 9 1.2 +createconfig a +createconfig domain0 noplatform spare 8 9 10 11 12 +monitor 0 1 +mdadm -a /dev/$c1 $dev12 +mdadm 
--fail /dev/$v0 $dev10 +chksparemoved $c1 $c0 $dev12 +tidyup +} + +test9a() +{ +dsc "Test 9a: imsm and native 1.2 - spare in global domain, should be moved" +setupdevs 0 10 11 imsm +setupdevs 1 8 9 1.2 +createconfig a +createconfig domain-global noplatform spare 8 9 10 11 12 +createconfig domain-1.2 1.2 spare 8 9 +createconfig domain-imsm imsm spare 10 11 +monitor 0 1 +mdadm -a /dev/$c1 $dev12 +mdadm --fail /dev/$v0 $dev10 +chksparemoved $c1 $c0 $dev12 +tidyup +} + +test10() +{ +dsc "Test 10: Two arrays on the same devices in container" +setupdevs 0 0 1 $platform 10000 +setupdevs 1 3 4 $platform +createconfig a +createconfig domain-$platform $platform spare 0 1 2 3 4 5 +monitor 0 1 +mdadm -a /dev/$c1 $dev2 +mdadm --fail /dev/md/sub0_ $dev0 +chksparemoved $c1 $c0 $dev2 +if [ $failed -eq 0 ]; then +# now fail the spare and see if we get another one + mdadm --fail /dev/md/sub0_ $dev2 + mdadm -a /dev/$c1 $dev5 + chksparemoved $c1 $c0 $dev5 +fi +tidyup +} + +test11() +{ +dsc "Test 11: Failed spare from other container should not be used" +setupdevs 0 0 1 $platform +setupdevs 1 3 4 $platform +createconfig a +createconfig domain-$platform $platform spare 0 1 2 3 4 +monitor 0 1 +mdadm -a /dev/$c1 $dev2 +mdadm --fail /dev/$v1 $dev3 +#wait until recovery finishes so no degraded array in c1 +check wait +mdadm --fail /dev/$v0 $dev0 +chksparemoved $c1 $c0 $dev3 n +tidyup +} + +test12() +{ +dsc "Test 12: Only one spare should be taken for rebuild, second not needed" +setupdevs 0 0 1 $platform +setupdevs 1 3 4 $platform +createconfig a +createconfig domain-$platform $platform spare 0 1 2 3 4 5 +monitor 0 1 +mdadm -a /dev/$c1 $dev2 +mdadm -a /dev/$c1 $dev5 +mdadm --fail /dev/$v0 $dev0 +sleep $sleeptime +chkarray $dev2 n +sc1=$c +chkarray $dev5 n +sc2=$c +[ "$sc1" != "$sc2" ] || err "both spares in the same container $sc1" +tidyup +} + +test13() +{ +dsc "Test 13: Common domain, two containers, fail a disk in container, action is below spare, the spare should be moved regadless of 
action" +setupdevs 0 0 1 $platform +setupdevs 1 4 5 $platform +# same domain but different action on 4 5 6 +createconfig a +createconfig domain-$platform $platform spare 0 1 +createconfig domain-$platform $platform include 4 5 6 +monitor 0 1 +mdadm -a /dev/$c1 $dev6 +mdadm --fail /dev/$v0 $dev0 +chksparemoved $c1 $c0 $dev6 +tidyup +} + +test14() +{ +dsc "Test 14: One domain, small array on big disks, check if small spare is accepted" +setupdevs 0 8 9 $platform 10000 1 +setupdevs 1 0 1 $platform +createconfig a +createconfig domain-$platform $platform spare 0 1 2 8 9 +monitor 0 1 +mdadm -a /dev/$c1 $dev2 +mdadm --fail /dev/$v0 $dev9 +chksparemoved $c1 $c0 $dev2 +tidyup +} + +test15() +{ +dsc "Test 15: spare in global domain for $platform metadata, should be moved" +# this is like 9a but only one metadata used +setupdevs 0 10 11 $platform +setupdevs 1 8 9 $platform +createconfig a +createconfig domain-global $platform spare 8 9 10 11 12 +createconfig domain-1 $platform spare 8 9 +createconfig domain-2 $platform spare 10 11 +monitor 0 1 +mdadm -a /dev/$c1 $dev12 +mdadm --fail /dev/$v0 $dev10 +chksparemoved $c1 $c0 $dev12 +tidyup +} + +try() +{ +test0 +test0a +test1 +test1a +test2 +test3 +test4 +test5 +test6 +if [ "$platform" != "1.2" ]; then +# this is because we can't have a small spare added to native array + test7 + test7a +fi +test8 +test9 +test9a +if [ "$platform" != "1.2" ]; then +# we can't create two subarrays on the same devices for native (without +# partitions) + test10 +fi +test11 +test12 +test13 +test14 +test15 +} + +try_failed() +{ +platform="1.2" +scan="no" +test5 +test9 +test13 +scan="yes" +test9 +} + +#try_failed + +for scan in no yes; do + for platform in 1.2 imsm; do + try + done +done + +[ $listfailed == "no" ] || [ -z "$flist" ] || echo -e "\n FAILED TESTS: $flist" + +#cat $targetdir/log +rm -f /dev/disk/by-path/loop* diff --git a/tests/12imsm-r0_2d-grow-r0_3d b/tests/12imsm-r0_2d-grow-r0_3d new file mode 100644 index 00000000..3c6cf743 --- /dev/null 
+++ b/tests/12imsm-r0_2d-grow-r0_3d @@ -0,0 +1,20 @@ +. tests/env-imsm-template + +# RAID 0 volume, 2 disks grow to RAID 0 volume, 3 disks +# POSITIVE test + +num_disks=2 +device_list="$dev0 $dev1" +spare_list="$dev2" + +# Before: RAID 0 volume, 2 disks, 64k chunk size +vol0_level=0 +vol0_comp_size=$((5 * 1024)) +vol0_chunk=64 +vol0_num_comps=$num_disks +vol0_offset=0 + +# After: RAID 0 volume, 3 disks, 64k chunk size +vol0_new_num_comps=$((num_disks + 1)) + +. tests/imsm-grow-template 0 0 diff --git a/tests/12imsm-r0_2d-grow-r0_4d b/tests/12imsm-r0_2d-grow-r0_4d new file mode 100644 index 00000000..e4fccda5 --- /dev/null +++ b/tests/12imsm-r0_2d-grow-r0_4d @@ -0,0 +1,20 @@ +. tests/env-imsm-template + +# RAID 0 volume, 2 disks grow to RAID 0 volume, 4 disks +# POSITIVE test + +num_disks=2 +device_list="$dev0 $dev1" +spare_list="$dev2 $dev3" + +# Before: RAID 0 volume, 2 disks, 64k chunk size +vol0_level=0 +vol0_comp_size=$((5 * 1024)) +vol0_chunk=64 +vol0_num_comps=$num_disks +vol0_offset=0 + +# After: RAID 0 volume, 4 disks, 64k chunk size +vol0_new_num_comps=$((num_disks + 2)) + +. tests/imsm-grow-template 0 0 diff --git a/tests/12imsm-r0_2d-grow-r0_5d b/tests/12imsm-r0_2d-grow-r0_5d new file mode 100644 index 00000000..388a5bbd --- /dev/null +++ b/tests/12imsm-r0_2d-grow-r0_5d @@ -0,0 +1,20 @@ +. tests/env-imsm-template + +# RAID 0 volume, 2 disks grow to RAID 0 volume, 5 disks +# POSITIVE test + +num_disks=2 +device_list="$dev0 $dev1" +spare_list="$dev2 $dev3 $dev4" + +# Before: RAID 0 volume, 2 disks, 64k chunk size +vol0_level=0 +vol0_comp_size=$((5 * 1024)) +vol0_chunk=64 +vol0_num_comps=$num_disks +vol0_offset=0 + +# After: RAID 0 volume, 5 disks, 64k chunk size +vol0_new_num_comps=$((num_disks + 3)) + +. tests/imsm-grow-template 0 0 diff --git a/tests/12imsm-r0_3d-grow-r0_4d b/tests/12imsm-r0_3d-grow-r0_4d new file mode 100644 index 00000000..7065f07b --- /dev/null +++ b/tests/12imsm-r0_3d-grow-r0_4d @@ -0,0 +1,20 @@ +. 
tests/env-imsm-template + +# RAID 0 volume, 3 disks grow to RAID 0 volume, 4 disks +# POSITIVE test + +num_disks=3 +device_list="$dev0 $dev1 $dev2" +spare_list="$dev3" + +# Before: RAID 0 volume, 3 disks, 64k chunk size +vol0_level=0 +vol0_comp_size=$((5 * 1024)) +vol0_chunk=64 +vol0_num_comps=$num_disks +vol0_offset=0 + +# After: RAID 0 volume, 4 disks, 64k chunk size +vol0_new_num_comps=$((num_disks + 1)) + +. tests/imsm-grow-template 0 0 diff --git a/tests/12imsm-r5_3d-grow-r5_4d b/tests/12imsm-r5_3d-grow-r5_4d new file mode 100644 index 00000000..097da0a7 --- /dev/null +++ b/tests/12imsm-r5_3d-grow-r5_4d @@ -0,0 +1,20 @@ +. tests/env-imsm-template + +# RAID 5 volume, 3 disks grow to RAID 5 volume, 4 disks +# POSITIVE test + +num_disks=3 +device_list="$dev0 $dev1 $dev2" +spare_list="$dev3" + +# Before: RAID 5 volume, 3 disks, 64k chunk size +vol0_level=5 +vol0_comp_size=$((5 * 1024)) +vol0_chunk=64 +vol0_num_comps=$((num_disks - 1)) +vol0_offset=0 + +# After: RAID 5 volume, 4 disks, 64k chunk size +vol0_new_num_comps=$num_disks + +. tests/imsm-grow-template 0 0 diff --git a/tests/12imsm-r5_3d-grow-r5_5d b/tests/12imsm-r5_3d-grow-r5_5d new file mode 100644 index 00000000..2e5c7d25 --- /dev/null +++ b/tests/12imsm-r5_3d-grow-r5_5d @@ -0,0 +1,20 @@ +. tests/env-imsm-template + +# RAID 5 volume, 3 disks grow to RAID 5 volume, 5 disks +# POSITIVE test + +num_disks=3 +device_list="$dev0 $dev1 $dev2" +spare_list="$dev3 $dev4" + +# Before: RAID 5 volume, 3 disks, 64k chunk size +vol0_level=5 +vol0_comp_size=$((5 * 1024)) +vol0_chunk=64 +vol0_num_comps=$((num_disks - 1)) +vol0_offset=0 + +# After: RAID 5 volume, 5 disks, 64k chunk size +vol0_new_num_comps=$((num_disks + 1)) + +. tests/imsm-grow-template 0 0 diff --git a/tests/13imsm-r0_r0_2d-grow-r0_r0_4d b/tests/13imsm-r0_r0_2d-grow-r0_r0_4d new file mode 100644 index 00000000..f85efa5d --- /dev/null +++ b/tests/13imsm-r0_r0_2d-grow-r0_r0_4d @@ -0,0 +1,29 @@ +. 
tests/env-imsm-template + +# Grow the container (arrays inside) from 2 disks to 4 disks +# POSITIVE test + +num_disks=2 +device_list="$dev0 $dev1" +spare_list="$dev2 $dev3" + +# Before: RAID 0 volume in slot #0, 2 disks, 128k chunk size +# RAID 0 volume in slot #1, 2 disks, 64k chunk size +vol0_level=0 +vol0_comp_size=$((5 * 1024)) +vol0_chunk=128 +vol0_num_comps=$num_disks +vol0_offset=0 + +vol1_level=0 +vol1_comp_size=$((5 * 1024)) +vol1_chunk=64 +vol1_num_comps=$num_disks +vol1_offset=$((vol0_comp_size + 2048)) + +# After: RAID 0 volume in slot #0, 4 disks, 128k chunk size +# RAID 0 volume in slot #1, 4 disks, 64k chunk size +vol0_new_num_comps=$((num_disks + 2)) +vol1_new_num_comps=$vol0_new_num_comps + +. tests/imsm-grow-template 0 0 diff --git a/tests/13imsm-r0_r0_2d-grow-r0_r0_5d b/tests/13imsm-r0_r0_2d-grow-r0_r0_5d new file mode 100644 index 00000000..1b851a9b --- /dev/null +++ b/tests/13imsm-r0_r0_2d-grow-r0_r0_5d @@ -0,0 +1,29 @@ +. tests/env-imsm-template + +# Grow both members from 2 disks to 5 disks +# POSITIVE test + +num_disks=2 +device_list="$dev0 $dev1" +spare_list="$dev2 $dev3 $dev4" + +# Before: RAID 0 volume in slot #0, 2 disks, 64k chunk size +# RAID 0 volume in slot #1, 2 disks, 256k chunk size +vol0_level=0 +vol0_comp_size=$((4 * 1024)) +vol0_chunk=64 +vol0_num_comps=$num_disks +vol0_offset=0 + +vol1_level=0 +vol1_comp_size=$((6 * 1024)) +vol1_chunk=256 +vol1_num_comps=$num_disks +vol1_offset=$((vol0_comp_size + 2048)) + +# After: RAID 0 volume in slot #0, 5 disks, 64k chunk size +# RAID 0 volume in slot #1, 5 disks, 256k chunk size +vol0_new_num_comps=$((num_disks + 3)) +vol1_new_num_comps=$vol0_new_num_comps + +. tests/imsm-grow-template 0 0 diff --git a/tests/13imsm-r0_r0_3d-grow-r0_r0_4d b/tests/13imsm-r0_r0_3d-grow-r0_r0_4d new file mode 100644 index 00000000..27ba83b3 --- /dev/null +++ b/tests/13imsm-r0_r0_3d-grow-r0_r0_4d @@ -0,0 +1,29 @@ +. 
tests/env-imsm-template + +# Grow a container (arrays inside) from 3 disks to 4 disks +# POSITIVE test + +num_disks=3 +device_list="$dev0 $dev1 $dev2" +spare_list="$dev3" + +# Before: RAID 0 volume in slot #0, 3 disks, 128k chunk size +# RAID 0 volume in slot #1, 3 disks, 128k chunk size +vol0_level=0 +vol0_comp_size=$((5 * 1024)) +vol0_chunk=128 +vol0_num_comps=$num_disks +vol0_offset=0 + +vol1_level=0 +vol1_comp_size=$((5 * 1024)) +vol1_chunk=128 +vol1_num_comps=$num_disks +vol1_offset=$((vol0_comp_size + 2048)) + +# After: RAID 0 volume in slot #0, 4 disks, 128k chunk size +# RAID 0 volume in slot #1, 4 disks, 128k chunk size +vol0_new_num_comps=$((num_disks + 1)) +vol1_new_num_comps=$vol0_new_num_comps + +. tests/imsm-grow-template 0 0 diff --git a/tests/13imsm-r0_r5_3d-grow-r0_r5_4d b/tests/13imsm-r0_r5_3d-grow-r0_r5_4d new file mode 100644 index 00000000..b4bde449 --- /dev/null +++ b/tests/13imsm-r0_r5_3d-grow-r0_r5_4d @@ -0,0 +1,29 @@ +. tests/env-imsm-template + +# Grow the container (arrays inside) from 3 disks to 4 disks +# POSITIVE test + +num_disks=3 +device_list="$dev0 $dev1 $dev2" +spare_list="$dev3" + +# Before: RAID 0 volume in slot #0, 3 disks, 64k chunk size +# RAID 5 volume in slot #1, 3 disks, 128k chunk size +vol0_level=0 +vol0_comp_size=$((5 * 1024)) +vol0_chunk=64 +vol0_num_comps=$num_disks +vol0_offset=0 + +vol1_level=5 +vol1_comp_size=$((5 * 1024)) +vol1_chunk=128 +vol1_num_comps=$((num_disks - 1)) +vol1_offset=$((vol0_comp_size + 2048)) + +# After: RAID 0 volume in slot #0, 4 disks, 64k chunk size +# RAID 5 volume in slot #1, 4 disks, 128k chunk size +vol1_new_num_comps=$num_disks +vol0_new_num_comps=$((num_disks + 1)) + +. tests/imsm-grow-template 0 0 diff --git a/tests/13imsm-r0_r5_3d-grow-r0_r5_5d b/tests/13imsm-r0_r5_3d-grow-r0_r5_5d new file mode 100644 index 00000000..d0db9aeb --- /dev/null +++ b/tests/13imsm-r0_r5_3d-grow-r0_r5_5d @@ -0,0 +1,29 @@ +. 
tests/env-imsm-template + +# Grow the container (arrays inside) from 3 disks to 5 disks +# POSITIVE test + +num_disks=3 +device_list="$dev0 $dev1 $dev2" +spare_list="$dev3 $dev4" + +# Before: RAID 0 volume in slot #0, 3 disks, 128k chunk size +# RAID 5 volume in slot #1, 3 disks, 128k chunk size +vol0_level=0 +vol0_comp_size=$((5 * 1024)) +vol0_chunk=128 +vol0_num_comps=$num_disks +vol0_offset=0 + +vol1_level=5 +vol1_comp_size=$((5 * 1024)) +vol1_chunk=128 +vol1_num_comps=$((num_disks - 1)) +vol1_offset=$((vol0_comp_size + 2048)) + +# After: RAID 0 volume in slot #0, 5 disks, 128k chunk size +# RAID 5 volume in slot #1, 5 disks, 128k chunk size +vol0_new_num_comps=$((num_disks + 2)) +vol1_new_num_comps=$((num_disks + 1)) + +. tests/imsm-grow-template 0 0 diff --git a/tests/13imsm-r5_r0_3d-grow-r5_r0_4d b/tests/13imsm-r5_r0_3d-grow-r5_r0_4d new file mode 100644 index 00000000..32ebc924 --- /dev/null +++ b/tests/13imsm-r5_r0_3d-grow-r5_r0_4d @@ -0,0 +1,29 @@ +. tests/env-imsm-template + +# Grow the container (arrays inside) from 3 disks to 4 disks +# POSITIVE test + +num_disks=3 +device_list="$dev0 $dev1 $dev2" +spare_list="$dev3" + +# Before: RAID 5 volume in slot #0, 3 disks, 64k chunk size +# RAID 0 volume in slot #1, 3 disks, 64k chunk size +vol0_level=5 +vol0_comp_size=$((5 * 1024)) +vol0_chunk=64 +vol0_num_comps=$((num_disks - 1)) +vol0_offset=0 + +vol1_level=0 +vol1_comp_size=$((5 * 1024)) +vol1_chunk=64 +vol1_offset=$((vol0_comp_size + 2048)) +vol1_num_comps=$num_disks + +# After: RAID 5 volume in slot #0, 4 disks, 64k chunk size +# RAID 0 volume in slot #1, 4 disks, 64k chunk size +vol0_new_num_comps=$num_disks +vol1_new_num_comps=$((num_disks + 1)) + +. tests/imsm-grow-template 0 0 diff --git a/tests/13imsm-r5_r0_3d-grow-r5_r0_5d b/tests/13imsm-r5_r0_3d-grow-r5_r0_5d new file mode 100644 index 00000000..a97002d0 --- /dev/null +++ b/tests/13imsm-r5_r0_3d-grow-r5_r0_5d @@ -0,0 +1,29 @@ +. 
tests/env-imsm-template + +# Grow the container (arrays inside) from 3 disks to 5 disks +# POSITIVE test + +num_disks=3 +device_list="$dev0 $dev1 $dev2" +spare_list="$dev3 $dev4" + +# Before: RAID 5 volume in slot #0, 3 disks, 64k chunk size +# RAID 0 volume in slot #1, 3 disks, 64k chunk size +vol0_level=5 +vol0_comp_size=$((5 * 1024)) +vol0_chunk=64 +vol0_num_comps=$((num_disks - 1)) +vol0_offset=0 + +vol1_level=0 +vol1_comp_size=$((5 * 1024)) +vol1_chunk=64 +vol1_offset=$((vol0_comp_size + 2048)) +vol1_num_comps=$num_disks + +# After: RAID 5 volume in slot #0, 5 disks, 64k chunk size +# RAID 0 volume in slot #1, 5 disks, 64k chunk size +vol0_new_num_comps=$((num_disks + 1)) +vol1_new_num_comps=$((num_disks + 2)) + +. tests/imsm-grow-template 0 0 diff --git a/tests/14imsm-r0_3d-r5_3d-migrate-r5_4d-r5_4d b/tests/14imsm-r0_3d-r5_3d-migrate-r5_4d-r5_4d new file mode 100644 index 00000000..386abeee --- /dev/null +++ b/tests/14imsm-r0_3d-r5_3d-migrate-r5_4d-r5_4d @@ -0,0 +1,29 @@ +. tests/env-imsm-template + +# RAID 0 and RAID 5 volumes (3 disks) migrate to RAID 5 and RAID 5 volumes (4 disks) +# NEGATIVE test - migration is not allowed if there is more than one array in a container + +num_disks=3 +device_list="$dev0 $dev1 $dev2" +spare_list="$dev3" + +# Before: RAID 0 volume, 3 disks, 64k chunk size, as member #0 +vol0_level=0 +vol0_comp_size=$((5 * 1024)) +vol0_chunk=64 +vol0_num_comps=$num_disks +vol0_offset=0 + +# Extra: RAID 5 volume, 3 disks, 64k chunk size, as member #1 +vol1_level=5 +vol1_comp_size=$((5 * 1024)) +vol1_chunk=64 +vol1_num_comps=$((num_disks - 1)) +vol1_offset=$((vol0_comp_size + 2048)) + +# After: RAID 5 volume, 4 disks, 64k chunk size (only member #0) +vol0_new_level=5 +vol0_new_num_comps=$num_disks +vol0_new_chunk=64 + +. 
tests/imsm-grow-template 1 1 diff --git a/tests/14imsm-r0_3d_no_spares-migrate-r5_3d b/tests/14imsm-r0_3d_no_spares-migrate-r5_3d new file mode 100644 index 00000000..10bbab6d --- /dev/null +++ b/tests/14imsm-r0_3d_no_spares-migrate-r5_3d @@ -0,0 +1,21 @@ +. tests/env-imsm-template + +# RAID 0 volume (3 disks, no spares) migrate to RAID 5 volume (3 disks) +# NEGATIVE test + +num_disks=3 +device_list="$dev0 $dev1 $dev2" + +# Before: RAID 0 volume, 3 disks, 64k chunk size +vol0_level=0 +vol0_comp_size=$((5 * 1024)) +vol0_chunk=64 +vol0_num_comps=$num_disks +vol0_offset=0 + +# After: RAID 5, 3 disks, 64k chunk size +vol0_new_level=5 +vol0_new_num_comps=$((num_disks - 1)) +vol0_new_chunk=64 + +. tests/imsm-grow-template 1 diff --git a/tests/14imsm-r0_r0_2d-takeover-r10_4d b/tests/14imsm-r0_r0_2d-takeover-r10_4d new file mode 100644 index 00000000..df5b0ce3 --- /dev/null +++ b/tests/14imsm-r0_r0_2d-takeover-r10_4d @@ -0,0 +1,30 @@ +. tests/env-imsm-template + + +# Two RAID 0 volumes (2 disks) migrate to RAID 10 volume (4 disks) +# NEGATIVE test + +num_disks=2 +device_list="$dev0 $dev1" + +# Before: RAID 0 volume in slot #0, 2 disks, 64k chunk size +# RAID 0 volume in slot #1, 2 disks, 64k chunk size +vol0_level=0 +vol0_comp_size=$((5 * 1024)) +vol0_chunk=64 +vol0_num_comps=$num_disks +vol0_offset=0 + +# Slot #1: RAID 0 volume, 2 disks, 64k chunk size +vol1_level=0 +vol1_comp_size=$((5 * 1024)) +vol1_chunk=64 +vol1_num_comps=$num_disks +vol1_offset=$(( $vol0_comp_size + 2048 )) + +# After: RAID 10, 4 disks, 64k chunk size +vol0_new_level=10 +vol0_new_num_comps=$((num_disks - 1)) +vol0_new_chunk=64 + +. tests/imsm-grow-template 1 1 diff --git a/tests/14imsm-r10_4d-grow-r10_5d b/tests/14imsm-r10_4d-grow-r10_5d new file mode 100644 index 00000000..bcbe1476 --- /dev/null +++ b/tests/14imsm-r10_4d-grow-r10_5d @@ -0,0 +1,20 @@ +. 
tests/env-imsm-template + +# RAID 10 volume, 4 disks grow to RAID 10 volume, 5 disks +# NEGATIVE test + +num_disks=4 +device_list="$dev0 $dev1 $dev2 $dev3" +spare_list="$dev4" + +# Before: RAID 10 volume, 4 disks, 128k chunk size +vol0_level=10 +vol0_comp_size=$((5 * 1024)) +vol0_chunk=128 +vol0_num_comps=$((num_disks - 2)) +vol0_offset=0 + +# After: RAID 10 volume, 5 disks, 128k chunk size (test should fail) +vol0_new_num_comps=$((num_disks + 1)) + +. tests/imsm-grow-template 1 0 diff --git a/tests/14imsm-r10_r5_4d-takeover-r0_2d b/tests/14imsm-r10_r5_4d-takeover-r0_2d new file mode 100644 index 00000000..9e5205e2 --- /dev/null +++ b/tests/14imsm-r10_r5_4d-takeover-r0_2d @@ -0,0 +1,30 @@ +. tests/env-imsm-template + + +# Two RAID volumes: RAID10 and RAID5 (4 disks) migrate to RAID 0 volume (2 disks) +# NEGATIVE test + +num_disks=4 +device_list="$dev0 $dev1 $dev2 $dev3" + +# Before: RAID 10 volume in slot #0, 4 disks, 64k chunk size +# RAID 5 volume in slot #1, 4 disks, 64k chunk size +vol0_level=10 +vol0_comp_size=$((5 * 1024)) +vol0_chunk=64 +vol0_num_comps=$(( $num_disks - 2 )) +vol0_offset=0 + +# Slot #1: RAID 5 volume, 4 disks, 64k chunk size +vol1_level=5 +vol1_comp_size=$((5 * 1024)) +vol1_chunk=64 +vol1_num_comps=$(( $num_disks - 1 )) +vol1_offset=$(( $vol0_comp_size + 2048 )) + +# After: RAID 0, 2 disks, 64k chunk size +vol0_new_level=0 +vol0_new_num_comps=2 +vol0_new_chunk=64 + +. tests/imsm-grow-template 1 1 diff --git a/tests/14imsm-r1_2d-grow-r1_3d b/tests/14imsm-r1_2d-grow-r1_3d new file mode 100644 index 00000000..1edd50e4 --- /dev/null +++ b/tests/14imsm-r1_2d-grow-r1_3d @@ -0,0 +1,20 @@ +. 
tests/env-imsm-template + +# RAID 1 volume, 2 disks grow to RAID 1 volume, 3 disks +# NEGATIVE test + +num_disks=2 +device_list="$dev0 $dev1" +spare_list="$dev4" + +# Before: RAID 1 volume, 2 disks, 64k chunk size +vol0_level=1 +vol0_comp_size=$((5 * 1024)) +vol0_chunk=64 +vol0_num_comps=$((num_disks - 1)) +vol0_offset=0 + +# After: RAID 1 volume, 3 disks, 64k chunks size (test should fail) +vol0_new_num_comps=$num_disks + +. tests/imsm-grow-template 1 0 diff --git a/tests/14imsm-r1_2d-takeover-r0_2d b/tests/14imsm-r1_2d-takeover-r0_2d new file mode 100644 index 00000000..d8296815 --- /dev/null +++ b/tests/14imsm-r1_2d-takeover-r0_2d @@ -0,0 +1,22 @@ +. tests/env-imsm-template + +# RAID 1 volume, 2 disks change to RAID 0 volume, 2 disks +# +#NEGATIVE test + +num_disks=2 +device_list="$dev0 $dev1" + +# Before: RAID 1 volume, 2 disks, 64k chunk size +vol0_level=1 +vol0_comp_size=$((5 * 1024)) +vol0_chunk=64 +vol0_num_comps=$((num_disks - 1)) +vol0_offset=0 + +# After: RAID 0 volume, 2 disks, 64k chunk size +vol0_new_level=0 +vol0_new_num_comps=$num_disks +vol0_new_chunk=64 + +. tests/imsm-grow-template 1 diff --git a/tests/14imsm-r5_3d-grow-r5_5d-no-spares b/tests/14imsm-r5_3d-grow-r5_5d-no-spares new file mode 100644 index 00000000..ed18e72b --- /dev/null +++ b/tests/14imsm-r5_3d-grow-r5_5d-no-spares @@ -0,0 +1,20 @@ +. tests/env-imsm-template + +# RAID 5 volume, 3 disks grow to RAID 5 volume, 4 disks +# NEGATIVE test + +num_disks=3 +device_list="$dev0 $dev1 $dev2" + +# Before: RAID 5 volume, 3 disks, 64k chunk size +vol0_level=5 +vol0_comp_size=$((5 * 1024)) +vol0_chunk=64 +vol0_num_comps=$((num_disks - 1)) +vol0_offset=0 + +# After: RAID 5 volume, 4 disks, 64k chunks size +add_to_num_disks=2 +vol0_new_num_comps=$((num_disks + 2)) + +. tests/imsm-grow-template 1 0 diff --git a/tests/14imsm-r5_3d-migrate-r4_3d b/tests/14imsm-r5_3d-migrate-r4_3d new file mode 100644 index 00000000..e3b971cc --- /dev/null +++ b/tests/14imsm-r5_3d-migrate-r4_3d @@ -0,0 +1,21 @@ +. 
tests/env-imsm-template + +# RAID 5 volume (3 disks) migrate to RAID 4 volume (3 disks) +# NEGATIVE test + +num_disks=3 +device_list="$dev0 $dev1 $dev2" + +# Before: RAID 5 volume, 3 disks, 64k chunk size +vol0_level=5 +vol0_comp_size=$((5 * 1024)) +vol0_chunk=64 +vol0_num_comps=$((num_disks - 1)) +vol0_offset=0 + +# After: RAID 4, 3 disks, 64k chunk size +vol0_new_level=4 +vol0_new_num_comps=$((num_disks - 1)) +vol0_new_chunk=64 + +. tests/imsm-grow-template 1 diff --git a/tests/15imsm-r0_3d_64k-migrate-r0_3d_256k b/tests/15imsm-r0_3d_64k-migrate-r0_3d_256k new file mode 100644 index 00000000..4fe3807e --- /dev/null +++ b/tests/15imsm-r0_3d_64k-migrate-r0_3d_256k @@ -0,0 +1,21 @@ +. tests/env-imsm-template + +# RAID 0 volume, Migration from 64k to 256k chunk size. +# POSITIVE test + +num_disks=2 +device_list="$dev0 $dev1" + +# RAID 0, 2 disks, 64k chunk size +vol0_level=0 +vol0_comp_size=$((5 * 1024)) +vol0_chunk=64 +vol0_num_comps=$num_disks +vol0_offset=0 + +# RAID 0, 2 disks, 256k chunk size +vol0_new_level=0 +vol0_new_num_comps=$vol0_num_comps +vol0_new_chunk=256 + +. tests/imsm-grow-template 0 1 diff --git a/tests/15imsm-r5_3d_4k-migrate-r5_3d_256k b/tests/15imsm-r5_3d_4k-migrate-r5_3d_256k new file mode 100644 index 00000000..025e9efb --- /dev/null +++ b/tests/15imsm-r5_3d_4k-migrate-r5_3d_256k @@ -0,0 +1,21 @@ +. tests/env-imsm-template + +# RAID 5 volume, Migration from 4k to 256k chunk size. +# POSITIVE test + +num_disks=3 +device_list="$dev0 $dev1 $dev2" + +# RAID 5, 3 disks, 4k chunk size +vol0_level=5 +vol0_comp_size=$((5 * 1024)) +vol0_chunk=4 +vol0_num_comps=$((num_disks - 1)) +vol0_offset=0 + +# RAID 5, 3 disks, 256k chunk size +vol0_new_level=5 +vol0_new_num_comps=$vol0_num_comps +vol0_new_chunk=256 + +. 
tests/imsm-grow-template 0 1 diff --git a/tests/15imsm-r5_3d_64k-migrate-r5_3d_256k b/tests/15imsm-r5_3d_64k-migrate-r5_3d_256k new file mode 100644 index 00000000..37547b74 --- /dev/null +++ b/tests/15imsm-r5_3d_64k-migrate-r5_3d_256k @@ -0,0 +1,21 @@ +. tests/env-imsm-template + +# RAID 5 volume, Migration from 64k to 256k chunk size. +# POSITIVE test + +num_disks=3 +device_list="$dev0 $dev1 $dev2" + +# RAID 5, 3 disks, 64k chunk size +vol0_level=5 +vol0_comp_size=$((5 * 1024)) +vol0_chunk=64 +vol0_num_comps=$((num_disks - 1)) +vol0_offset=0 + +# RAID 5, 3 disks, 256k chunk size +vol0_new_level=5 +vol0_new_num_comps=$vol0_num_comps +vol0_new_chunk=256 + +. tests/imsm-grow-template 0 1 diff --git a/tests/15imsm-r5_6d_4k-migrate-r5_6d_256k b/tests/15imsm-r5_6d_4k-migrate-r5_6d_256k new file mode 100644 index 00000000..d2f6c707 --- /dev/null +++ b/tests/15imsm-r5_6d_4k-migrate-r5_6d_256k @@ -0,0 +1,21 @@ +. tests/env-imsm-template + +# RAID 5 volume, Migration from 4k to 256k chunk size. +# POSITIVE test + +num_disks=6 +device_list="$dev0 $dev1 $dev2 $dev3 $dev4 $dev5" + +# RAID 5, 6 disks, 4k chunk size +vol0_level=5 +vol0_comp_size=$((5 * 1024)) +vol0_chunk=4 +vol0_num_comps=$((num_disks - 1)) +vol0_offset=0 + +# RAID 5, 6 disks, 256k chunk size +vol0_new_level=5 +vol0_new_num_comps=$vol0_num_comps +vol0_new_chunk=256 + +. tests/imsm-grow-template 0 1 diff --git a/tests/15imsm-r5_r0_3d_64k-migrate-r5_r0_3d_256k b/tests/15imsm-r5_r0_3d_64k-migrate-r5_r0_3d_256k new file mode 100644 index 00000000..da218efa --- /dev/null +++ b/tests/15imsm-r5_r0_3d_64k-migrate-r5_r0_3d_256k @@ -0,0 +1,34 @@ +. 
tests/env-imsm-template + +# Member 0: RAID 5 volume, Member 1: RAID 0 volume +# Migration from 64k to 256k chunk size (both members) +# POSITIVE test + +num_disks=3 +device_list="$dev0 $dev1 $dev2" + +# RAID 5, 3 disks, 64k chunk size +vol0_level=5 +vol0_comp_size=$((5 * 1024)) +vol0_chunk=64 +vol0_num_comps=$((num_disks - 1)) +vol0_offset=0 + +# After migration parameters +vol0_new_level=5 +vol0_new_num_comps=$vol0_num_comps +vol0_new_chunk=256 + +# RAID 0, 3 disks, 64k chunk size +vol1_level=0 +vol1_comp_size=$((5 * 1024)) +vol1_chunk=64 +vol1_num_comps=$num_disks +vol1_offset=$((vol0_comp_size + 2048)) + +# After migration parameters +vol1_new_level=0 +vol1_new_num_comps=$vol1_num_comps +vol1_new_chunk=256 + +. tests/imsm-grow-template 0 1 diff --git a/tests/16imsm-r0_3d-migrate-r5_4d b/tests/16imsm-r0_3d-migrate-r5_4d new file mode 100644 index 00000000..4f45479a --- /dev/null +++ b/tests/16imsm-r0_3d-migrate-r5_4d @@ -0,0 +1,21 @@ +. tests/env-imsm-template + +# RAID 0 volume (3 disks) migrate to RAID 5 volume (4 disks) +# POSITIVE test + +num_disks=3 +device_list="$dev0 $dev1 $dev2" + +# Before: RAID 0, 3 disks, 64k chunk size +vol0_level=0 +vol0_comp_size=$((5 * 1024)) +vol0_chunk=64 +vol0_num_comps=$num_disks +vol0_offset=0 + +# After: RAID 5, 4 disks, 64k chunk size +vol0_new_level=5 +vol0_new_num_comps=$num_disks +vol0_new_chunk=64 + +. tests/imsm-grow-template 0 1 diff --git a/tests/16imsm-r0_5d-migrate-r5_6d b/tests/16imsm-r0_5d-migrate-r5_6d new file mode 100644 index 00000000..bee505bf --- /dev/null +++ b/tests/16imsm-r0_5d-migrate-r5_6d @@ -0,0 +1,21 @@ +. 
tests/env-imsm-template + +# RAID 0 volume (5 disks) migrate to RAID 5 volume (6 disks) +# POSITIVE test + +num_disks=5 +device_list="$dev0 $dev1 $dev2 $dev3 $dev4" + +# Before: RAID 0, 5 disks, 64k chunk size +vol0_level=0 +vol0_comp_size=$((5 * 1024)) +vol0_chunk=64 +vol0_num_comps=$num_disks +vol0_offset=0 + +# After: RAID 5, 6 disks, 64k chunk size +vol0_new_level=5 +vol0_new_num_comps=$num_disks +vol0_new_chunk=64 + +. tests/imsm-grow-template 0 1 diff --git a/tests/16imsm-r5_3d-migrate-r0_3d b/tests/16imsm-r5_3d-migrate-r0_3d new file mode 100644 index 00000000..b1459cc1 --- /dev/null +++ b/tests/16imsm-r5_3d-migrate-r0_3d @@ -0,0 +1,21 @@ +. tests/env-imsm-template + +# RAID 5 volume (3 disks) migrate to RAID 0 volume (2 disks) +# POSITIVE test + +num_disks=3 +device_list="$dev0 $dev1 $dev2" + +# Before: RAID 5, 3 disks, 64k chunk size +vol0_level=5 +vol0_comp_size=$((5 * 1024)) +vol0_chunk=64 +vol0_num_comps=$((num_disks - 1)) +vol0_offset=0 + +# After: RAID 0, 3 disks, 64k chunk size +vol0_new_level=0 +vol0_new_num_comps=$((num_disks-1)) +vol0_new_chunk=64 + +. tests/imsm-grow-template 0 1 diff --git a/tests/16imsm-r5_5d-migrate-r0_5d b/tests/16imsm-r5_5d-migrate-r0_5d new file mode 100644 index 00000000..323ca52e --- /dev/null +++ b/tests/16imsm-r5_5d-migrate-r0_5d @@ -0,0 +1,21 @@ +. tests/env-imsm-template + +# RAID 5 volume (5 disks) migration to RAID 0 volume (4 disks) +# POSITIVE test + +num_disks=5 +device_list="$dev0 $dev1 $dev2 $dev3 $dev4" + +# Before: RAID 5 volume, 5 disks, 64k chunk size +vol0_level=5 +vol0_comp_size=$((5 * 1024)) +vol0_chunk=64 +vol0_num_comps=$((num_disks - 1)) +vol0_offset=0 + +# After: RAID 0 volume, 5 disks, 64k chunk size +vol0_new_level=0 +vol0_new_num_comps=$((num_disks - 1)) +vol0_new_chunk=64 + +. tests/imsm-grow-template 0 1 diff --git a/tests/18imsm-1d-takeover-r0_1d b/tests/18imsm-1d-takeover-r0_1d new file mode 100644 index 00000000..6f5cf5a6 --- /dev/null +++ b/tests/18imsm-1d-takeover-r0_1d @@ -0,0 +1,22 @@ +. 
tests/env-imsm-template + +# Create RAID 0 from a single disk. +# POSITIVE test + +vol0_num_comps=1 +vol0_comp_size=$((10 * 1024)) + +# Create container +mdadm --create --run $container --auto=md --metadata=imsm --force --raid-disks=$vol0_num_comps $dev0 +check wait +imsm_check container $vol0_num_comps + +# Create RAID 0 volume +mdadm --create --run $member0 --auto=md --level=0 --size=$vol0_comp_size --chunk=64 --force --raid-disks=$vol0_num_comps $dev0 +check wait + +# Test the member +imsm_check member $member0 $vol0_num_comps 0 $vol0_comp_size $((vol0_num_comps * vol0_comp_size)) 0 64 +testdev $member0 $vol0_num_comps $vol0_comp_size 64 + +exit 0 diff --git a/tests/18imsm-1d-takeover-r1_2d b/tests/18imsm-1d-takeover-r1_2d new file mode 100644 index 00000000..72e4173e --- /dev/null +++ b/tests/18imsm-1d-takeover-r1_2d @@ -0,0 +1,20 @@ +. tests/env-imsm-template + +# Create RAID 1 from a single disk +# POSITIVE test + +vol0_num_comps=1 +vol0_comp_size=$((10 * 1024)) + +# Create container +mdadm --create --run $container --auto=md --metadata=imsm --force --raid-disks=$vol0_num_comps $dev0 +check wait +imsm_check container $vol0_num_comps + +# Create RAID 1 volume +mdadm --create --run $member0 --auto=md --level=1 --size=$vol0_comp_size --chunk=64 --raid-disks=$((vol0_num_comps + 1)) $dev0 missing +check wait + +# Test the member0 +imsm_check member $member0 $((vol_num_comps + 1)) 1 $vol0_comp_size $((vol0_num_comps * vol0_comp_size)) 0 64 +testdev $member0 $vol0_num_comps $vol0_comp_size 64 diff --git a/tests/18imsm-r0_2d-takeover-r10_4d b/tests/18imsm-r0_2d-takeover-r10_4d new file mode 100644 index 00000000..0e77e5da --- /dev/null +++ b/tests/18imsm-r0_2d-takeover-r10_4d @@ -0,0 +1,22 @@ +. 
tests/env-imsm-template + +# RAID 0 volume, 2 disks change to RAID 10 volume, 4 disks +# POSITIVE test + +num_disks=2 +device_list="$dev0 $dev1" +spare_list="$dev2 $dev3" + +# Before: RAID 0 volume, 2 disks, 128k chunk size +vol0_level=0 +vol0_comp_size=$((5 * 1024)) +vol0_chunk=128 +vol0_num_comps=$num_disks +vol0_offset=0 + +# After: RAID 10 volume, 4 disks, 128k chunk size +vol0_new_level=10 +vol0_new_num_comps=$vol0_num_comps +vol0_new_chunk=128 + +. tests/imsm-grow-template 0 1 diff --git a/tests/18imsm-r10_4d-takeover-r0_2d b/tests/18imsm-r10_4d-takeover-r0_2d new file mode 100644 index 00000000..8a9606b4 --- /dev/null +++ b/tests/18imsm-r10_4d-takeover-r0_2d @@ -0,0 +1,22 @@ +. tests/env-imsm-template + +# RAID 10 volume, 4 disks change to RAID 0 volume, 2 disks +# POSITIVE test + +num_disks=4 +device_list="$dev0 $dev1 $dev2 $dev3" + +# Before: RAID 10 volume, 4 disks, 128k chunk size +vol0_level=10 +vol0_comp_size=$((5 * 1024)) +vol0_chunk=128 +vol0_num_comps=$((num_disks - 2)) +vol0_offset=0 + +# After: RAID 0 volume, 2 disks, 128k chunk size +vol0_new_level=0 +vol0_new_num_comps=2 +vol0_new_chunk=128 +new_num_disks=2 + +. tests/imsm-grow-template 0 1 diff --git a/tests/18imsm-r1_2d-takeover-r0_1d b/tests/18imsm-r1_2d-takeover-r0_1d new file mode 100644 index 00000000..cb10ec97 --- /dev/null +++ b/tests/18imsm-r1_2d-takeover-r0_1d @@ -0,0 +1,22 @@ +. tests/env-imsm-template + +# RAID 1 volume, 2 disks change to RAID 0 volume, 1 disk +# POSITIVE test + +num_disks=2 +device_list="$dev0 $dev1" + +# Before: RAID 1 volume, 2 disks +vol0_level=1 +vol0_comp_size=$((5 * 1024)) +vol0_chunk=64 +vol0_num_comps=$(( $num_disks - 1 )) +vol0_offset=0 + +# After: RAID 0 volume, 1 disk, 64k chunk size +vol0_new_level=0 +vol0_new_num_comps=1 +vol0_new_chunk=64 +new_num_disks=0 + +. 
tests/imsm-grow-template 0 1 diff --git a/tests/19raid6auto-repair b/tests/19raid6auto-repair new file mode 100644 index 00000000..ce4a7c08 --- /dev/null +++ b/tests/19raid6auto-repair @@ -0,0 +1,49 @@ +number_of_disks=5 +chunksize_in_kib=512 +chunksize_in_b=$[chunksize_in_kib*1024] +array_data_size_in_kib=$[chunksize_in_kib*(number_of_disks-2)*number_of_disks] +array_data_size_in_b=$[array_data_size_in_kib*1024] +devs="$dev0 $dev1 $dev2 $dev3 $dev4" + +# default 2048 sectors +data_offset_in_kib=$[2048/2] + +# make a raid5 from a file +dd if=/dev/urandom of=/tmp/RandFile bs=1024 count=$array_data_size_in_kib + +# perform test for every layout +layouts="ls rs la ra parity-first ddf-zero-restart ddf-N-restart ddf-N-continue \ + left-asymmetric-6 right-asymmetric-6 left-symmetric-6 \ + right-symmetric-6 parity-first-6" + +for layout in $layouts +do + mdadm -CR $md0 -l6 --layout=$layout -n$number_of_disks -c $chunksize_in_kib $devs + dd if=/tmp/RandFile of=$md0 bs=1024 count=$array_data_size_in_kib + blockdev --flushbufs $md0; sync + check wait + blockdev --flushbufs $devs; sync + echo 3 > /proc/sys/vm/drop_caches + cmp -s -n $array_data_size_in_b $md0 /tmp/RandFile || { echo sanity cmp failed ; exit 2; } + + # wipe out 5 chunks on each device + dd if=/dev/urandom of=$dev0 bs=1024 count=$[5*chunksize_in_kib] seek=$[data_offset_in_kib+chunksize_in_kib*0] + dd if=/dev/urandom of=$dev1 bs=1024 count=$[5*chunksize_in_kib] seek=$[data_offset_in_kib+chunksize_in_kib*5] + dd if=/dev/urandom of=$dev2 bs=1024 count=$[5*chunksize_in_kib] seek=$[data_offset_in_kib+chunksize_in_kib*10] + dd if=/dev/urandom of=$dev3 bs=1024 count=$[5*chunksize_in_kib] seek=$[data_offset_in_kib+chunksize_in_kib*15] + dd if=/dev/urandom of=$dev4 bs=1024 count=$[5*chunksize_in_kib] seek=$[data_offset_in_kib+chunksize_in_kib*20] + + blockdev --flushbufs $devs; sync + echo 3 > /proc/sys/vm/drop_caches + + $dir/raid6check $md0 0 0 2>&1 | grep -qs "Error" || { echo should detect errors; exit 2; } + + 
$dir/raid6check $md0 0 0 autorepair > /dev/null || { echo repair failed; exit 2; } + blockdev --flushbufs $md0 $devs; sync + echo 3 > /proc/sys/vm/drop_caches + + $dir/raid6check $md0 0 0 2>&1 | grep -qs "Error" && { echo errors detected; exit 2; } + cmp -s -n $array_data_size_in_b $md0 /tmp/RandFile || { echo cmp failed ; exit 2; } + + mdadm -S $md0 +done diff --git a/tests/19raid6check b/tests/19raid6check new file mode 100644 index 00000000..67958c6a --- /dev/null +++ b/tests/19raid6check @@ -0,0 +1,27 @@ +# +# Confirm that raid6check handles all RAID6 layouts. +# Try both 4 and 5 devices. + +layouts='ls rs la ra' +lv=`uname -r` +if expr $lv '>=' 2.6.30 > /dev/null +then + layouts="$layouts parity-first ddf-zero-restart ddf-N-restart ddf-N-continue \ + left-asymmetric-6 right-asymmetric-6 left-symmetric-6 right-symmetric-6 parity-first-6" +fi + +for layout in $layouts +do + for devs in 4 5 + do + dl="$dev0 $dev1 $dev2 $dev3" + if [ $devs = 5 ]; then dl="$dl $dev4"; fi + + mdadm -CR $md0 -l6 --layout $layout -n$devs $dl + check wait + tar cf - /etc > $md0 + ./raid6check $md0 0 0 | grep 'Error detected' && exit 1 + mdadm -S $md0 + done +done + diff --git a/tests/19raid6repair b/tests/19raid6repair new file mode 100644 index 00000000..26846cc9 --- /dev/null +++ b/tests/19raid6repair @@ -0,0 +1,56 @@ +number_of_disks=4 +chunksize_in_kib=512 +chunksize_in_b=$[chunksize_in_kib*1024] +array_data_size_in_kib=$[chunksize_in_kib*(number_of_disks-2)*number_of_disks] +array_data_size_in_b=$[array_data_size_in_kib*1024] +devs="$dev1 $dev2 $dev3 $dev4" + +# default 2048 sectors +data_offset_in_kib=$[2048/2] + +layouts="ls rs la ra parity-first ddf-zero-restart ddf-N-restart ddf-N-continue \ + left-asymmetric-6 right-asymmetric-6 left-symmetric-6 \ + right-symmetric-6 parity-first-6" + +for layout in $layouts +do + for failure in "$dev3 3 3 2" "$dev3 3 2 3" "$dev3 3 2 1" "$dev3 3 2 0" \ + "$dev4 3 3 0" "$dev4 3 3 1" "$dev4 3 3 2" \ + "$dev1 3 0 1" "$dev1 3 0 2" "$dev1 3 0 3" \ 
+ "$dev2 3 1 0" "$dev2 3 1 2" "$dev2 3 1 3" ; do + failure_split=( $failure ) + device_with_error=${failure_split[0]} + stripe_with_error=${failure_split[1]} + repair_params="$stripe_with_error ${failure_split[2]} ${failure_split[3]}" + start_of_errors_in_kib=$[data_offset_in_kib+chunksize_in_kib*stripe_with_error] + + # make a raid5 from a file + dd if=/dev/urandom of=/tmp/RandFile bs=1024 count=$array_data_size_in_kib + mdadm -CR $md0 -l6 --layout=$layout -n$number_of_disks -c $chunksize_in_kib $devs + dd if=/tmp/RandFile of=$md0 bs=1024 count=$array_data_size_in_kib + blockdev --flushbufs $md0; sync + + check wait + blockdev --flushbufs $devs; sync + echo 3 > /proc/sys/vm/drop_caches + cmp -s -n $array_data_size_in_b $md0 /tmp/RandFile || { echo sanity cmp failed ; exit 2; } + + dd if=/dev/urandom of=$device_with_error bs=1024 count=$chunksize_in_kib seek=$start_of_errors_in_kib + blockdev --flushbufs $device_with_error; sync + echo 3 > /proc/sys/vm/drop_caches + + $dir/raid6check $md0 0 0 2>&1 | grep -qs "Error" || { echo should detect errors; exit 2; } + + $dir/raid6check $md0 repair $repair_params > /dev/null || { echo repair failed; exit 2; } + blockdev --flushbufs $md0 $devs; sync + echo 3 > /proc/sys/vm/drop_caches + + $dir/raid6check $md0 0 0 2>&1 | grep -qs "Error" && { echo errors detected; exit 2; } + cmp -s -n $array_data_size_in_b $md0 /tmp/RandFile || { echo cmp failed ; exit 2; } + + mdadm -S $md0 + udevadm settle + sync + echo 3 > /proc/sys/vm/drop_caches + done +done diff --git a/tests/19repair-does-not-destroy b/tests/19repair-does-not-destroy new file mode 100644 index 00000000..a92883fd --- /dev/null +++ b/tests/19repair-does-not-destroy @@ -0,0 +1,28 @@ +number_of_disks=7 +chunksize_in_kib=512 +array_data_size_in_kib=$[chunksize_in_kib*(number_of_disks-2)*number_of_disks] +array_data_size_in_b=$[array_data_size_in_kib*1024] +devs="$dev0 $dev1 $dev2 $dev3 $dev4 $dev5 $dev6" + +dd if=/dev/urandom of=/tmp/RandFile bs=1024 
count=$array_data_size_in_kib +mdadm -CR $md0 -l6 -n$number_of_disks -c $chunksize_in_kib $devs +dd if=/tmp/RandFile of=$md0 bs=1024 count=$array_data_size_in_kib +blockdev --flushbufs $md0; sync +check wait +blockdev --flushbufs $devs; sync +echo 3 > /proc/sys/vm/drop_caches +$dir/raid6check $md0 repair 1 2 3 > /dev/null # D D +$dir/raid6check $md0 repair 8 2 5 > /dev/null # D P +$dir/raid6check $md0 repair 15 4 6 > /dev/null # D Q +$dir/raid6check $md0 repair 22 5 6 > /dev/null # P Q +$dir/raid6check $md0 repair 3 4 0 > /dev/null # Q D +$dir/raid6check $md0 repair 3 3 1 > /dev/null # P D +$dir/raid6check $md0 repair 6 4 5 > /dev/null # D<D +$dir/raid6check $md0 repair 13 5 4 > /dev/null # D>D +blockdev --flushbufs $devs; sync +echo 3 > /proc/sys/vm/drop_caches +$dir/raid6check $md0 0 0 2>&1 | grep -qs "Error" && { echo errors detected; exit 2; } +cmp -s -n $array_data_size_in_b $md0 /tmp/RandFile || { echo should not mess up correct stripe ; exit 2; } + +mdadm -S $md0 +udevadm settle diff --git a/tests/20raid5journal b/tests/20raid5journal new file mode 100644 index 00000000..f751aceb --- /dev/null +++ b/tests/20raid5journal @@ -0,0 +1,64 @@ +# check write journal of raid456 + +# test --detail +test_detail_shows_journal() { + mdadm -D $1 | grep journal || { + echo >&2 "ERROR --detail does not show journal device!"; mdadm -D $1 ; exit 1; } +} + +# test --examine +test_examine_shows_journal() { + mdadm -E $1 | grep Journal || { + echo >&2 "ERROR --examine does not show Journal device!"; mdadm -E $1 ; exit 1; } +} + +# test --create +create_with_journal_and_stop() { + mdadm -CR $md0 -l5 -n4 $dev0 $dev1 $dev2 $dev3 --write-journal $dev4 + check wait + tar cf - /etc > $md0 + ./raid6check $md0 0 0 | grep 'Error detected' && exit 1 + test_detail_shows_journal $md0 + test_examine_shows_journal $dev4 + mdadm -S $md0 +} + +# test --assemble +test_assemble() { + create_with_journal_and_stop + if mdadm -A $md0 $dev0 $dev1 $dev2 $dev3 + then + echo >&2 "ERROR should return 1 when journal is missing!"; cat /proc/mdstat ; 
exit 1; + fi + mdadm -S $md0 + + mdadm -A $md0 $dev0 $dev1 $dev2 $dev3 --force + check readonly + mdadm -S $md0 +} + +# test --incremental +test_incremental() { + create_with_journal_and_stop + for d in $dev0 $dev1 $dev2 $dev3 + do + mdadm -I $d + done + check inactive + mdadm -I $dev4 + check raid5 + mdadm -S $md0 + + # test --incremental with journal missing + for d in $dev0 $dev1 $dev2 $dev3 + do + mdadm -I $d + done + mdadm -R $md0 + check readonly + mdadm -S $md0 +} + +create_with_journal_and_stop +test_assemble +test_incremental diff --git a/tests/ToTest b/tests/ToTest new file mode 100644 index 00000000..b98e266d --- /dev/null +++ b/tests/ToTest @@ -0,0 +1,44 @@ + +multipath!! + +add/remove/fail + raid1 DONE + raid5 DONE + raid6/10 needed?? + +assemble + by devices DONE + by uuid DONE + by superminor DONE + by config file DONE + + various --updates DONE (not sparc2.2 or summaries) + +stop + --scan + +readonly/readwrite + +bitmap + separate file + internal + filename in config file + +examine + --scan + --brief + +detail + +grow: + size + raid1/5/6 DONE + devices + raid1 add DONE + raid1 shrink DONE + +'--quiet' option, and remove "" +'--name' option for v1, and configfile etc... 
+ +faulty + errors in raid1/5/6 diff --git a/tests/check b/tests/check new file mode 100644 index 00000000..f4ed6d5b --- /dev/null +++ b/tests/check @@ -0,0 +1,35 @@ + +case $1 in + raid* | linear ) + grep -s "active $1 " /proc/mdstat > /dev/null || { + echo >&2 "ERROR active $1 not found" ; cat /proc/mdstat ; exit 1;} + ;; + resync | recovery ) + sleep 0.1 + grep -s $1 /proc/mdstat > /dev/null || { + echo >&2 ERROR no $1 happening; cat /proc/mdstat; exit 1; } + ;; + + nosync ) + sleep 0.5 + grep -s 're[synccovery]* =' > /dev/null /proc/mdstat && { + echo >&2 "ERROR resync or recovery is happening!"; cat /proc/mdstat ; exit 1; } + ;; + + wait ) + sleep 0.1 + while grep 're[synccovery]* =' > /dev/null /proc/mdstat + do sleep 2; + done + ;; + + state ) + grep -s "blocks.*\[$2\]\$" /proc/mdstat > /dev/null || { + echo >&2 "ERROR state $2 not found!"; cat /proc/mdstat ; exit 1; } + sleep 0.5 + ;; + + * ) echo >&2 ERROR unknown check $1 ; exit 1; +esac + +exit 0 diff --git a/tests/env-ddf-template b/tests/env-ddf-template new file mode 100644 index 00000000..90d7272f --- /dev/null +++ b/tests/env-ddf-template @@ -0,0 +1,113 @@ +sha1_sum() { + sha1sum "$1" | cut -c 1-40 +} + +get_rootdev() { + local dev=$(stat -c %D /) + local maj=$(expr $dev : '\(..*\)..') + local min=${dev#$maj} + local bd=/dev/$(basename $(readlink /sys/dev/block/$((0x$maj)):$((0x$min)))) + [ -b $bd ] || exit 1 + echo $bd +} + +get_sysdir() { + local mddev=$1 + [ -L $mddev ] && mddev=$(readlink -f $mddev) + echo "/sys/class/block/$(basename $mddev)/md" +} + +get_raiddisks() { + sysdir=$(get_sysdir "$1") + for i in $(seq 0 $(($(cat $sysdir/raid_disks)-1))); do + if [ -d $sysdir/rd$i ]; then + readlink -f /dev/block/$(cat $sysdir/rd$i/block/dev) + else + echo MISSING + fi + done +} + +get_present() { + get_raiddisks $1 | grep -vc MISSING +} + +ddf_check() { + udevadm settle + case $1 in + container ) + grep -s "blocks super external:ddf" /proc/mdstat > /dev/null || { + echo >&2 "**Fatal** Correctly 
formed container not found"; cat /proc/mdstat; exit 1; } + ;; + member ) + t_member=$2 + t_num_disks=$3 + t_level=$4 + t_rd_size=$5 + t_size=$6 + t_offset=$7 + t_chunk=$8 + t_layout=$9 + + if [ $t_chunk -ne 0 ]; then + t_rd_size=$((t_rd_size & ~(t_chunk - 1))) + fi + case $t_level in + 0) t_size=$((t_num_disks*$t_rd_size));; + 1) t_size=$t_rd_size;; + 4|5) t_size=$(((t_num_disks-1)*$t_rd_size));; + 6) t_size=$(((t_num_disks-2)*$t_rd_size));; + 10) t_size=$((t_num_disks*$t_rd_size/t_layout));; + esac + + err=0 + + eval `stat -L -c "let major=0x%t; let minor=0x%T;" $t_member` + sysfs=/sys/dev/block/${major}:${minor} + if [ ! -f ${sysfs}/md/array_state ]; then + echo "**Fatal**: Array member $t_member not found" >&2; cat /proc/mdstat >&2; exit 1 + fi + _chunk=`cat ${sysfs}/md/chunk_size` + if [ $t_chunk -ne $((_chunk/1024)) ]; then + echo "**Error**: Chunk size mismatch - expected $t_chunk, actual $_chunk" >&2 + err=$((err + 1)) + fi + for i in `seq 0 $((t_num_disks - 1))`; do + _offset=`cat ${sysfs}/md/rd${i}/offset` + if [ $t_offset -ne $((_offset / 2)) ]; then + echo "**Error**: Offset mismatch - expected $t_offset, actual $((_offset/2))" >&2 + err=$((err + 1)) + fi + _rd_size=`cat ${sysfs}/md/rd${i}/size` + if [ $t_rd_size -ne $_rd_size ]; then + echo "**Error**: Component size mismatch - expected $t_rd_size, actual $_rd_size" >&2 + err=$((err + 1)) + fi + done + _size=`cat ${sysfs}/md/array_size` + [ o$_size = odefault ] && _size=$(($(cat ${sysfs}/size)/2)) + if [ $t_size -ne $_size ]; then + echo "**Error**: Array size mismatch - expected $t_size, actual $_size" >&2 + err=$((err + 1)) + fi + if [ $err -gt 0 ]; then + echo "$t_member failed check" >&2 + cat /proc/mdstat >&2 + mdadm -E /dev/loop8 >&2 + exit 1 + fi + ;; + * ) + echo >&2 "**Error** unknown check $1"; exit 1; + esac +} + +container=/dev/md/ddf0 +member0=/dev/md/vol0 +member1=/dev/md/vol1 +member2=/dev/md/vol2 +member3=/dev/md/vol3 +member4=/dev/md/vol4 + +# We don't want systemd to start system 
mdmon; start our own +export MDADM_NO_SYSTEMCTL=1 diff --git a/tests/env-imsm-template b/tests/env-imsm-template new file mode 100644 index 00000000..bc5f5852 --- /dev/null +++ b/tests/env-imsm-template @@ -0,0 +1,74 @@ +imsm_check() { + udevadm settle + case $1 in + container ) + grep -s "blocks super external:imsm" /proc/mdstat > /dev/null || { + echo >&2 "**Fatal** Correctly formed container not found"; cat /proc/mdstat; exit 1; } + ;; + member ) + t_member=$2 + t_num_disks=$3 + t_level=$4 + t_rd_size=$5 + t_size=$6 + t_offset=$7 + t_chunk=$8 + + if [ $t_level -ne 1 ]; then + t_rd_size=$((t_rd_size & ~(t_chunk - 1))) + else + t_chunk=64 + fi + t_size=$((t_size/1024)) + t_size=$((t_size*1024)) + err=0 + + eval `stat -L -c "let major=0x%t; let minor=0x%T;" $t_member` + sysfs=/sys/dev/block/${major}:${minor} + if [ ! -f ${sysfs}/md/array_state ]; then + echo "**Fatal**: Array member $t_member not found" >&2; cat /proc/mdstat >&2; exit 1 + fi + _chunk=`cat ${sysfs}/md/chunk_size` + if [ $t_chunk -ne $((_chunk/1024)) ]; then + echo "**Error**: Chunk size mismatch - expected $t_chunk, actual $_chunk" >&2 + err=$((err + 1)) + fi + for i in `seq 0 $((t_num_disks - 1))`; do + _offset=`cat ${sysfs}/md/rd${i}/offset` + if [ $t_offset -ne $((_offset / 2)) ]; then + echo "**Error**: Offset mismatch - expected $t_offset, actual $_offset" >&2 + err=$((err + 1)) + fi + _rd_size=`cat ${sysfs}/md/rd${i}/size` + if [ $t_rd_size -ne $_rd_size ]; then + echo "**Error**: Component size mismatch - expected $t_rd_size, actual $_rd_size" >&2 + err=$((err + 1)) + fi + done + _size=`cat ${sysfs}/md/array_size` + if [ $t_size -ne $_size ]; then + echo "**Error**: Array size mismatch - expected $t_size, actual $_size" >&2 + err=$((err + 1)) + fi + if [ $err -gt 0 ]; then + echo "$t_member failed check" >&2 + cat /proc/mdstat >&2 + mdadm -E /dev/loop0 >&2 + exit 1 + fi + ;; + * ) + echo >&2 "**Error** unknown check $1"; exit 1; + esac +} + +export IMSM_NO_PLATFORM=1 +export 
IMSM_DEVNAME_AS_SERIAL=1 +export IMSM_TEST_OROM=1 +export MDADM_EXPERIMENTAL=1 +container=/dev/md/container +member0=/dev/md/vol0 +member1=/dev/md/vol1 +member2=/dev/md/vol2 +member3=/dev/md/vol3 +member4=/dev/md/vol4 diff --git a/tests/imsm-grow-template b/tests/imsm-grow-template new file mode 100644 index 00000000..71a0bbb1 --- /dev/null +++ b/tests/imsm-grow-template @@ -0,0 +1,106 @@ + +# 0 - POSITIVE test, otherwise NEGATIVE test +negative_test=$1 + +# 0 - On-line Capacity Expansion test, otherwise LEVEL migration or CHUNK size migration test +migration_test=$2 + +function grow_member() { + local member=$1 + local disks=$2 + local comps=$3 + local level=$4 + local size=$5 + local offset=$6 + local chunk=$7 + local array_size=$((comps * size)) + + rm -f $backup_imsm + ( set -ex; mdadm --grow $member --chunk=$chunk --level=$level ) + local status=$? + if [ $negative_test -ne 0 ]; then + if [ $status -eq 0 ]; then + echo >&2 "**Error**: $member: --grow should failed, but it completed successfuly" + exit 1 + fi + return + fi + check wait + sleep 5 + imsm_check member $member $disks $level $size $array_size $offset $chunk + testdev $member $comps $size $chunk +} + +# Create container +mdadm --create --run $container --auto=md --metadata=imsm --raid-disks=$num_disks $device_list +check wait +imsm_check container $num_disks + +# Create first volume inside the container +mdadm --create --run $member0 --auto=md --level=$vol0_level --size=$vol0_comp_size --chunk=$vol0_chunk --raid-disks=$num_disks $device_list +check wait + +# Create second volume inside the container (if defined) +if [ ! 
-z $vol1_chunk ]; then + mdadm --create --run $member1 --auto=md --level=$vol1_level --size=$vol1_comp_size --chunk=$vol1_chunk --raid-disks=$num_disks $device_list + check wait +fi + +# Wait for any RESYNC to complete +check wait + +# Test first volume +imsm_check member $member0 $num_disks $vol0_level $vol0_comp_size $((vol0_comp_size * vol0_num_comps)) $vol0_offset $vol0_chunk +testdev $member0 $vol0_num_comps $vol0_comp_size $vol0_chunk + +# Test second volume (if defined) +if [ ! -z $vol1_chunk ]; then + imsm_check member $member1 $num_disks $vol1_level $vol1_comp_size $((vol1_comp_size * vol1_num_comps)) $vol1_offset $vol1_chunk + testdev $member1 $vol1_num_comps $vol1_comp_size $vol1_chunk +fi + +# Add extra disks to container if operation requires spares in container. +for i in $spare_list +do + mdadm --add $container $i + check wait + num_disks=$((num_disks + 1)) +done + +imsm_check container $num_disks +num_disks=$((num_disks + add_to_num_disks)) +backup_imsm=/tmp/backup_imsm + +# Grow each member or a container depending on the type of an operation +if [ $migration_test -ne 0 ]; then + if [ -z $new_num_disks ]; then + new_num_disks=$num_disks + fi + grow_member $member0 $new_num_disks $vol0_new_num_comps $vol0_new_level $vol0_comp_size $vol0_offset $vol0_new_chunk + if [[ $vol1_new_chunk -ne 0 ]] ; then + grow_member $member1 $new_num_disks $vol1_new_num_comps $vol1_new_level $vol1_comp_size $vol1_offset $vol1_new_chunk + fi +else + rm -f $backup_imsm + ( set -x; mdadm --grow $container --raid-disks=$num_disks ) + grow_status=$? 
+ if [ $negative_test -ne 0 ]; then + if [ $grow_status -eq 0 ]; then + echo >&2 "**Error**: $container: --grow should failed, but it completed successfuly" + exit 1 + fi + else + sleep 5 + check wait + sleep 5 + check wait + imsm_check member $member0 $num_disks $vol0_level $vol0_comp_size $((vol0_comp_size * vol0_new_num_comps)) $vol0_offset $vol0_chunk + testdev $member0 $vol0_new_num_comps $vol0_comp_size $vol0_chunk + if [ $vol1_new_num_comps -ne 0 ]; then + imsm_check member $member1 $num_disks $vol1_level $vol1_comp_size $((vol1_comp_size * vol1_new_num_comps)) $vol1_offset $vol1_chunk + testdev $member1 $vol1_new_num_comps $vol1_comp_size $vol1_chunk + fi + fi +fi + +exit 0 diff --git a/tests/testdev b/tests/testdev new file mode 100644 index 00000000..8b6e6f06 --- /dev/null +++ b/tests/testdev @@ -0,0 +1,13 @@ +dev=$1 +cnt=$2 +size=$3 +chunk=$4 +mkfs -j $dev > /dev/null 2>&1 && fsck -fn $dev >&2 +dsize=$[size/chunk] +dsize=$[dsize*chunk] +rasize=$[dsize*2*cnt] +if [ $rasize -ne `/sbin/blockdev --getsize $dev` ] +then + echo "ERROR: size is wrong for $dev: $cnt * $size (chunk=$chunk) = $rasize, not `/sbin/blockdev --getsize $dev`" + exit 1; +fi diff --git a/tests/utils b/tests/utils new file mode 100644 index 00000000..3acebd77 --- /dev/null +++ b/tests/utils @@ -0,0 +1,191 @@ +# set of functions used to test policy framework with assemble, incremental and Monitor + +set +e +#create links to be able to use domains +for d in 0 1 2 3 4 5 6 7 8 9 10 11 12 +do + eval ln -s \$dev$d /dev/disk/by-path/loop$d + eval d$d="loop$d" + eval mdadm --zero-superblock \$dev$d +done + +devices="/dev/loop[0-9] /dev/loop10 /dev/loop11 /dev/loop12" + +# on failure print out few things before exit +# uses testdsc and platform global variables +err(){ + echo >&2 "ERROR: $*" + cat $config >&2 || true + cat /proc/mdstat >&2 + [ -z "$testdsc" ] || { echo >&2 $platform: $testdsc "- failed"; } + ps -e | grep mdadm >&2 || true + if [ $listfailed == "yes" ]; then + [ "$verbose" != "yes" 
] || echo ---FAILED--- + flist="$flist \n $platform $testdsc" + failed=1 + else + exit 1 + fi +} + +# set test description +dsc(){ + failed=0 + testdsc="$*" + [ "$verbose" != "yes" ] || echo $testdsc +} + +killmonitor(){ + [ -z "$monitorpid" ] || { kill -9 $monitorpid; unset monitorpid; } +} + +tidyup(){ + killmonitor + mdadm -Ss || true + mdadm -Ss + mdadm --zero-superblock $devices || true + udevadm settle + rm -f $config +} + +trap tidyup 0 1 2 3 15 + +# create a RAID 1 array or container and subarray(s) on 2 disks +# if platform not specified imsm is used +# if subsize is given, first subarray is created with given size and second one on remaining space +ccv(){ + # mddevno used to name created array + local mddevno="$1" + # numbers of devices to be used in array + local devno1="$2" + local devno2="$3" + local platform="$4" + local subsize="$5" + local onearray="$6" + [ -n "$platform" ] || platform="imsm" + if [ "$platform" == "imsm" ] || [ "$platform" == "ddf" ]; then + eval mdadm -CR /dev/md/con$mddevno -e $platform -n 2 \$dev$devno1 \$dev$devno2 + udevadm settle + [ -z "$subsize" ] || eval mdadm -CR sub$mddevno"_" -l 1 -n 2 /dev/md/con$mddevno -z $subsize + [ -n "$onearray" ] || eval mdadm -CR sub$mddevno -l 1 -n 2 /dev/md/con$mddevno + else + [ -z "$subsize" ] || sizepar="-z $subsize" + eval mdadm -CR arr$mddevno -e $platform -l 1 -n 2 \$dev$devno1 \$dev$devno2 $sizepar + unset sizepar + fi +} + +# get container and subarray using given device from mdstat +# sets global variables c and v +getarray(){ + local devname=`basename $1` + local platformtype=`grep -A 1 $devname /proc/mdstat | awk '/active/ {getline; print $4 }' | awk -F ":" 'END {print $1}'` + c=`grep "inactive.*$devname" /proc/mdstat | awk -F " " '{print $1}'` + v=`grep " active.*$devname" /proc/mdstat | awk -F " " '{print $1}'` + [ "$platformtype" == "external" ] || c=$v +} + +# check if given device belongs to any container and subarray +# if $2 given then only container checked +chkarray(){ + 
local devname="$1" + local subcheck="$2" + getarray $devname + [ -n "$c" ] || err "$devname not in any container" + [ -n "$subcheck" ] || [ -n "$v" ] || err " $devname not in subarray" +} + +# test if two devices in the same container/subarray +# $1 $2 - devices +# $3 don't check subarrays, only containers +tst(){ + local device1=`basename $1` + local device2=`basename $2` + local subcheck="$3" + chkarray $device1 $subcheck + local x="$c" + local y="$v" + chkarray $device2 $subcheck + [ "$c" == "$x" ] || err "$device1 and $device2 not in the same container" + [ -n "$subcheck" ] || [ "$v" == "$y" ] || err "$device1 and $device2 not in the same subarray" +} + +# same as tst, just use numbers of devices instead of names as parameters +dtst(){ + local devno1="$1" + local devno2="$2" + local subcheck="$3" + eval tst \$dev$devno1 \$dev$devno2 $subcheck +} + +# create containers/subarrays, check if created properly, +# set global variables c$mddevno v$mddevno, usually c0=md127, v0=md126 , etc. 
+setupdevs(){ + local mddevno="$1" + local devno1="$2" + local devno2="$3" + local p="$4" + local subsize="$5" + local onearray="$6" + [ -n "$p" ] || p=$platform + ccv $mddevno $devno1 $devno2 $p $subsize $onearray + dtst $devno1 $devno2 + eval c$mddevno=\"$c\" + eval v$mddevno=\"$v\" +} + +# check if given spare in container +# usage: chkspare container spare [n] (n if spare shouldn't be in container) +chkspare(){ + local container=`basename $1` + local spare=$2 + local expected=$3 + getarray $spare + [ -n "$expected" ] || expected="y" + if [ "$expected" == "y" ]; then + [ "$c" == "$container" ] || err "$spare not in container $container" + else + [ "$c" != "$container" ] || err "$spare in container $container" + fi +} + +#check if spare was moved from one container to another +# args: from_container to_container spare [yn] +# n when spare should remain in original container +chksparemoved(){ + sleep $sleeptime + from_container="$1" + to_container="$2" + spare="$3" + expected="$4" + [ -n "$expected" ] || expected="y" + notexpected="n"; [ "$expected" == "y" ] || notexpected="y" + chkspare $from_container $spare $notexpected + [ $failed -eq 1 ] || chkspare $to_container $spare $expected +} + + +# for domains defined through policy +createconfig(){ +if [ "$1" != "a" ]; then +{ + domain=$1 + metadata=$2 + action=$3 + while [ -n "$4" ]; do + echo="policy domain=$domain" + [ "$metadata" == "noplatform" ] || echo="$echo metadata=$metadata" + echo="$echo path=loop$4" + echo="$echo action=$action" + echo "$echo" + shift + done +} >> $config +else +{ + echo "DEVICES $devlist /dev/md1*" + mdadm -Ebs +} > $config +fi +#[ "$verbose" != "yes" ] || cat $config | grep policy || true +} diff --git a/udev-md-raid-arrays.rules b/udev-md-raid-arrays.rules new file mode 100644 index 00000000..440febcb --- /dev/null +++ b/udev-md-raid-arrays.rules @@ -0,0 +1,41 @@ +# do not edit this file, it will be overwritten on update + +SUBSYSTEM!="block", GOTO="md_end" + +# handle md arrays 
+ACTION!="add|change", GOTO="md_end" +KERNEL!="md*", GOTO="md_end" + +# partitions have no md/{array_state,metadata_version}, but should not +# for that reason be ignored. +ENV{DEVTYPE}=="partition", GOTO="md_ignore_state" + +# container devices have a metadata version of e.g. 'external:ddf' and +# never leave state 'inactive' +ATTR{md/metadata_version}=="external:[A-Za-z]*", ATTR{md/array_state}=="inactive", GOTO="md_ignore_state" +TEST!="md/array_state", ENV{SYSTEMD_READY}="0", GOTO="md_end" +ATTR{md/array_state}=="|clear|inactive", ENV{SYSTEMD_READY}="0", GOTO="md_end" +LABEL="md_ignore_state" + +IMPORT{program}="BINDIR/mdadm --detail --export $devnode" +ENV{DEVTYPE}=="disk", ENV{MD_NAME}=="?*", SYMLINK+="disk/by-id/md-name-$env{MD_NAME}", OPTIONS+="string_escape=replace" +ENV{DEVTYPE}=="disk", ENV{MD_UUID}=="?*", SYMLINK+="disk/by-id/md-uuid-$env{MD_UUID}" +ENV{DEVTYPE}=="disk", ENV{MD_DEVNAME}=="?*", SYMLINK+="md/$env{MD_DEVNAME}" +ENV{DEVTYPE}=="partition", ENV{MD_NAME}=="?*", SYMLINK+="disk/by-id/md-name-$env{MD_NAME}-part%n", OPTIONS+="string_escape=replace" +ENV{DEVTYPE}=="partition", ENV{MD_UUID}=="?*", SYMLINK+="disk/by-id/md-uuid-$env{MD_UUID}-part%n" +ENV{DEVTYPE}=="partition", ENV{MD_DEVNAME}=="*[^0-9]", SYMLINK+="md/$env{MD_DEVNAME}%n" +ENV{DEVTYPE}=="partition", ENV{MD_DEVNAME}=="*[0-9]", SYMLINK+="md/$env{MD_DEVNAME}p%n" + +IMPORT{builtin}="blkid" +OPTIONS+="link_priority=100" +OPTIONS+="watch" +ENV{ID_FS_USAGE}=="filesystem|other|crypto", ENV{ID_FS_UUID_ENC}=="?*", SYMLINK+="disk/by-uuid/$env{ID_FS_UUID_ENC}" +ENV{ID_FS_USAGE}=="filesystem|other", ENV{ID_FS_LABEL_ENC}=="?*", SYMLINK+="disk/by-label/$env{ID_FS_LABEL_ENC}" + +ENV{MD_LEVEL}=="raid[1-9]*", ENV{SYSTEMD_WANTS}+="mdmonitor.service" + +# Tell systemd to run mdmon for our container, if we need it. 
+ENV{MD_LEVEL}=="raid[1-9]*", ENV{MD_CONTAINER}=="?*", PROGRAM="/bin/readlink $env{MD_CONTAINER}", ENV{MD_MON_THIS}="%c" +ENV{MD_MON_THIS}=="?*", PROGRAM="/usr/bin/basename $env{MD_MON_THIS}", ENV{SYSTEMD_WANTS}+="mdmon@%c.service" + +LABEL="md_end" diff --git a/udev-md-raid-assembly.rules b/udev-md-raid-assembly.rules new file mode 100644 index 00000000..d0d440a6 --- /dev/null +++ b/udev-md-raid-assembly.rules @@ -0,0 +1,35 @@ +# do not edit this file, it will be overwritten on update + +# Don't process any events if anaconda is running as anaconda brings up +# raid devices manually +ENV{ANACONDA}=="?*", GOTO="md_inc_end" +# assemble md arrays + +SUBSYSTEM!="block", GOTO="md_inc_end" + +# handle potential components of arrays (the ones supported by md) +ENV{ID_FS_TYPE}=="linux_raid_member", GOTO="md_inc" + +# "noiswmd" on kernel command line stops mdadm from handling +# "isw" (aka IMSM - Intel RAID). +# "nodmraid" on kernel command line stops mdadm from handling +# "isw" or "ddf". +IMPORT{cmdline}="noiswmd" +IMPORT{cmdline}="nodmraid" + +ENV{nodmraid}=="?*", GOTO="md_inc_end" +ENV{ID_FS_TYPE}=="ddf_raid_member", GOTO="md_inc" +ENV{noiswmd}=="?*", GOTO="md_inc_end" +ENV{ID_FS_TYPE}=="isw_raid_member", GOTO="md_inc" +GOTO="md_inc_end" + +LABEL="md_inc" + +# remember you can limit what gets auto/incrementally assembled by +# mdadm.conf(5)'s 'AUTO' and selectively whitelist using 'ARRAY' +ACTION=="add|change", IMPORT{program}="BINDIR/mdadm --incremental --export $devnode --offroot ${DEVLINKS}" +ACTION=="add|change", ENV{MD_STARTED}=="*unsafe*", ENV{MD_FOREIGN}=="no", ENV{SYSTEMD_WANTS}+="mdadm-last-resort@$env{MD_DEVICE}.timer" +ACTION=="remove", ENV{ID_PATH}=="?*", RUN+="BINDIR/mdadm -If $name --path $env{ID_PATH}" +ACTION=="remove", ENV{ID_PATH}!="?*", RUN+="BINDIR/mdadm -If $name" + +LABEL="md_inc_end" diff --git a/util.c b/util.c new file mode 100644 index 00000000..970d4847 --- /dev/null +++ b/util.c @@ -0,0 +1,2205 @@ +/* + * mdadm - manage Linux "md" devices 
aka RAID arrays. + * + * Copyright (C) 2001-2013 Neil Brown + * + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Author: Neil Brown + * Email: + */ + +#include "mdadm.h" +#include "md_p.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +/* + * following taken from linux/blkpg.h because they aren't + * anywhere else and it isn't safe to #include linux/ * stuff. + */ + +#define BLKPG _IO(0x12,105) + +/* The argument structure */ +struct blkpg_ioctl_arg { + int op; + int flags; + int datalen; + void *data; +}; + +/* The subfunctions (for the op field) */ +#define BLKPG_ADD_PARTITION 1 +#define BLKPG_DEL_PARTITION 2 + +/* Sizes of name fields. Unused at present. 
*/ +#define BLKPG_DEVNAMELTH 64 +#define BLKPG_VOLNAMELTH 64 + +/* The data structure for ADD_PARTITION and DEL_PARTITION */ +struct blkpg_partition { + long long start; /* starting offset in bytes */ + long long length; /* length in bytes */ + int pno; /* partition number */ + char devname[BLKPG_DEVNAMELTH]; /* partition name, like sda5 or c0d1p2, + to be used in kernel messages */ + char volname[BLKPG_VOLNAMELTH]; /* volume label */ +}; + +#include "part.h" + +/* Force a compilation error if condition is true */ +#define BUILD_BUG_ON(condition) ((void)BUILD_BUG_ON_ZERO(condition)) + +/* Force a compilation error if condition is true, but also produce a + result (of value 0 and type size_t), so the expression can be used + e.g. in a structure initializer (or where-ever else comma expressions + aren't permitted). */ +#define BUILD_BUG_ON_ZERO(e) (sizeof(struct { int:-!!(e); })) + +static int is_dlm_hooks_ready = 0; + +int dlm_funs_ready(void) +{ + return is_dlm_hooks_ready ? 1 : 0; +} + +#ifndef MDASSEMBLE +static struct dlm_hooks *dlm_hooks = NULL; +struct dlm_lock_resource *dlm_lock_res = NULL; +static int ast_called = 0; + +struct dlm_lock_resource { + dlm_lshandle_t *ls; + struct dlm_lksb lksb; +}; + +/* Using poll(2) to wait for and dispatch ASTs */ +static int poll_for_ast(dlm_lshandle_t ls) +{ + struct pollfd pfd; + + pfd.fd = dlm_hooks->ls_get_fd(ls); + pfd.events = POLLIN; + + while (!ast_called) + { + if (poll(&pfd, 1, 0) < 0) + { + perror("poll"); + return -1; + } + dlm_hooks->dispatch(dlm_hooks->ls_get_fd(ls)); + } + ast_called = 0; + + return 0; +} + +static void dlm_ast(void *arg) +{ + ast_called = 1; +} + +static char *cluster_name = NULL; +/* Create the lockspace, take bitmapXXX locks on all the bitmaps. 
*/ +int cluster_get_dlmlock(int *lockid) +{ + int ret = -1; + char str[64]; + int flags = LKF_NOQUEUE; + + ret = get_cluster_name(&cluster_name); + if (ret) { + pr_err("The md can't get cluster name\n"); + return -1; + } + + dlm_lock_res = xmalloc(sizeof(struct dlm_lock_resource)); + dlm_lock_res->ls = dlm_hooks->create_lockspace(cluster_name, O_RDWR); + if (!dlm_lock_res->ls) { + pr_err("%s failed to create lockspace\n", cluster_name); + return -ENOMEM; + } + + /* Conversions need the lockid in the LKSB */ + if (flags & LKF_CONVERT) + dlm_lock_res->lksb.sb_lkid = *lockid; + + snprintf(str, 64, "bitmap%s", cluster_name); + /* if flags with LKF_CONVERT causes below return ENOENT which means + * "No such file or directory" */ + ret = dlm_hooks->ls_lock(dlm_lock_res->ls, LKM_PWMODE, &dlm_lock_res->lksb, + flags, str, strlen(str), 0, dlm_ast, + dlm_lock_res, NULL, NULL); + if (ret) { + pr_err("error %d when get PW mode on lock %s\n", errno, str); + dlm_hooks->release_lockspace(cluster_name, dlm_lock_res->ls, 1); + return ret; + } + + /* Wait for it to complete */ + poll_for_ast(dlm_lock_res->ls); + *lockid = dlm_lock_res->lksb.sb_lkid; + + return dlm_lock_res->lksb.sb_status; +} + +int cluster_release_dlmlock(int lockid) +{ + int ret = -1; + + if (!cluster_name) + return -1; + + /* if flags with LKF_CONVERT causes below return EINVAL which means + * "Invalid argument" */ + ret = dlm_hooks->ls_unlock(dlm_lock_res->ls, lockid, 0, + &dlm_lock_res->lksb, dlm_lock_res); + if (ret) { + pr_err("error %d happened when unlock\n", errno); + /* XXX make sure the lock is unlocked eventually */ + goto out; + } + + /* Wait for it to complete */ + poll_for_ast(dlm_lock_res->ls); + + errno = dlm_lock_res->lksb.sb_status; + if (errno != EUNLOCK) { + pr_err("error %d happened in ast when unlock lockspace\n", errno); + /* XXX make sure the lockspace is unlocked eventually */ + goto out; + } + + ret = dlm_hooks->release_lockspace(cluster_name, dlm_lock_res->ls, 1); + if (ret) { + 
pr_err("error %d happened when release lockspace\n", errno); + /* XXX make sure the lockspace is released eventually */ + goto out; + } + free(dlm_lock_res); + +out: + return ret; +} +#else +int cluster_get_dlmlock(int *lockid) +{ + return -1; +} +int cluster_release_dlmlock(int lockid) +{ + return -1; +} +#endif + +/* + * Parse a 128 bit uuid in 4 integers + * format is 32 hex nibbles with optional ':', '.', ' ' or '-' separators + * If not exactly 32 hex digits are found, return 0 + * else return 1 + */ +int parse_uuid(char *str, int uuid[4]) +{ + int hit = 0; /* number of Hex digIT */ + int i; + char c; + for (i = 0; i < 4; i++) + uuid[i] = 0; + + while ((c = *str++) != 0) { + int n; + if (c >= '0' && c <= '9') + n = c-'0'; + else if (c >= 'a' && c <= 'f') + n = 10 + c - 'a'; + else if (c >= 'A' && c <= 'F') + n = 10 + c - 'A'; + else if (strchr(":. -", c)) + continue; + else return 0; + + if (hit<32) { + uuid[hit/8] <<= 4; + uuid[hit/8] += n; + } + hit++; + } + if (hit == 32) + return 1; + return 0; +} + +/* + * Get the md version number.
+ * We use the RAID_VERSION ioctl if it is supported + * If not, but we have a block device with major '9', we assume + * 0.36.0 + * + * Return version as major*10000 + minor*100 + patchlevel + * (so 0.36.0 is 3600), or -1 on error + */ + +int md_get_version(int fd) +{ + struct stat stb; + mdu_version_t vers; + + if (fstat(fd, &stb)<0) + return -1; + if ((S_IFMT&stb.st_mode) != S_IFBLK) + return -1; + + if (ioctl(fd, RAID_VERSION, &vers) == 0) + return (vers.major*10000) + (vers.minor*100) + vers.patchlevel; + if (errno == EACCES) + return -1; + if (major(stb.st_rdev) == MD_MAJOR) + return (3600); + return -1; +} + +int get_linux_version() +{ + struct utsname name; + char *cp; + int a = 0, b = 0,c = 0; + if (uname(&name) <0) + return -1; + + cp = name.release; + a = strtoul(cp, &cp, 10); + if (*cp == '.') + b = strtoul(cp+1, &cp, 10); + if (*cp == '.') + c = strtoul(cp+1, &cp, 10); + + return (a*1000000)+(b*1000)+c; +} + +#ifndef MDASSEMBLE +int mdadm_version(char *version) +{ + int a, b, c; + char *cp; + + if (!version) + version = Version; + + cp = strchr(version, '-'); + if (!cp || *(cp+1) != ' ' || *(cp+2) != 'v') + return -1; + cp += 3; + a = strtoul(cp, &cp, 10); + if (*cp != '.') + return -1; + b = strtoul(cp+1, &cp, 10); + if (*cp == '.') + c = strtoul(cp+1, &cp, 10); + else + c = 0; + if (*cp != ' ' && *cp != '-') + return -1; + return (a*1000000)+(b*1000)+c; +} + +unsigned long long parse_size(char *size) +{ + /* parse 'size' which should be a number optionally + * followed by 'K', 'M', or 'G'. + * Without a suffix, K is assumed. + * Number returned is in sectors (half-K) + * INVALID_SECTORS returned on error.
+ */ + char *c; + long long s = strtoll(size, &c, 10); + if (s > 0) { + switch (*c) { + case 'K': + c++; + default: + s *= 2; + break; + case 'M': + c++; + s *= 1024 * 2; + break; + case 'G': + c++; + s *= 1024 * 1024 * 2; + break; + case 's': /* sectors */ + c++; + break; + } + } else + s = INVALID_SECTORS; + if (*c) + s = INVALID_SECTORS; + return s; +} + +int parse_layout_10(char *layout) +{ + int copies, rv; + char *cp; + /* Parse the layout string for raid10 */ + /* 'f', 'o' or 'n' followed by a number <= raid_disks */ + if ((layout[0] != 'n' && layout[0] != 'f' && layout[0] != 'o') || + (copies = strtoul(layout+1, &cp, 10)) < 1 || + copies > 200 || + *cp) + return -1; + if (layout[0] == 'n') + rv = 256 + copies; + else if (layout[0] == 'o') + rv = 0x10000 + (copies<<8) + 1; + else + rv = 1 + (copies<<8); + return rv; +} + +int parse_layout_faulty(char *layout) +{ + /* Parse the layout string for 'faulty' */ + int ln = strcspn(layout, "0123456789"); + char *m = xstrdup(layout); + int mode; + m[ln] = 0; + mode = map_name(faultylayout, m); + if (mode == UnSet) + return -1; + + return mode | (atoi(layout+ln)<< ModeShift); +} + +long parse_num(char *num) +{ + /* Either return a valid number, or -1 */ + char *c; + long rv = strtol(num, &c, 10); + if (rv < 0 || *c || !num[0]) + return -1; + else + return rv; +} +#endif + +int parse_cluster_confirm_arg(char *input, char **devname, int *slot) +{ + char *dev; + *slot = strtoul(input, &dev, 10); + if (dev == input || dev[0] != ':') + return -1; + *devname = dev+1; + return 0; +} + +void remove_partitions(int fd) +{ + /* remove partitions from this block devices. 
+ * This is used for components added to an array + */ +#ifdef BLKPG_DEL_PARTITION + struct blkpg_ioctl_arg a; + struct blkpg_partition p; + + a.op = BLKPG_DEL_PARTITION; + a.data = (void*)&p; + a.datalen = sizeof(p); + a.flags = 0; + memset(a.data, 0, a.datalen); + for (p.pno = 0; p.pno < 16; p.pno++) + ioctl(fd, BLKPG, &a); +#endif +} + +int test_partition(int fd) +{ + /* Check if fd is a whole-disk or a partition. + * BLKPG will return EINVAL on a partition, and BLKPG_DEL_PARTITION + * will return ENXIO on an invalid partition number. + */ + struct blkpg_ioctl_arg a; + struct blkpg_partition p; + a.op = BLKPG_DEL_PARTITION; + a.data = (void*)&p; + a.datalen = sizeof(p); + a.flags = 0; + memset(a.data, 0, a.datalen); + p.pno = 1<<30; + if (ioctl(fd, BLKPG, &a) == 0) + /* Very unlikely, but not a partition */ + return 0; + if (errno == ENXIO || errno == ENOTTY) + /* not a partition */ + return 0; + + return 1; +} + +int test_partition_from_id(dev_t id) +{ + char buf[20]; + int fd, rv; + + sprintf(buf, "%d:%d", major(id), minor(id)); + fd = dev_open(buf, O_RDONLY); + if (fd < 0) + return -1; + rv = test_partition(fd); + close(fd); + return rv; +} + +int enough(int level, int raid_disks, int layout, int clean, char *avail) +{ + int copies, first; + int i; + int avail_disks = 0; + + for (i = 0; i < raid_disks; i++) + avail_disks += !!avail[i]; + + switch (level) { + case 10: + /* This is the tricky one - we need to check + * which actual disks are present. 
+ */ + copies = (layout&255)* ((layout>>8) & 255); + first = 0; + do { + /* there must be one of the 'copies' from 'first' */ + int n = copies; + int cnt = 0; + int this = first; + while (n--) { + if (avail[this]) + cnt++; + this = (this+1) % raid_disks; + } + if (cnt == 0) + return 0; + first = (first+(layout&255)) % raid_disks; + } while (first != 0); + return 1; + + case LEVEL_MULTIPATH: + return avail_disks>= 1; + case LEVEL_LINEAR: + case 0: + return avail_disks == raid_disks; + case 1: + return avail_disks >= 1; + case 4: + if (avail_disks == raid_disks - 1 && + !avail[raid_disks - 1]) + /* If just the parity device is missing, then we + * have enough, even if not clean + */ + return 1; + /* FALL THROUGH */ + case 5: + if (clean) + return avail_disks >= raid_disks-1; + else + return avail_disks >= raid_disks; + case 6: + if (clean) + return avail_disks >= raid_disks-2; + else + return avail_disks >= raid_disks; + default: + return 0; + } +} + +int enough_fd(int fd) +{ + struct mdu_array_info_s array; + struct mdu_disk_info_s disk; + int i, rv; + char *avail; + + if (ioctl(fd, GET_ARRAY_INFO, &array) != 0 || + array.raid_disks <= 0) + return 0; + avail = xcalloc(array.raid_disks, 1); + for (i = 0; i < MAX_DISKS && array.nr_disks > 0; i++) { + disk.number = i; + if (ioctl(fd, GET_DISK_INFO, &disk) != 0) + continue; + if (disk.major == 0 && disk.minor == 0) + continue; + array.nr_disks--; + + if (! (disk.state & (1<<MD_DISK_SYNC))) + continue; + if (disk.raid_disk < 0 || disk.raid_disk >= array.raid_disks) + continue; + avail[disk.raid_disk] = 1; + } + /* This is used on an active array, so assume it is clean */ + rv = enough(array.level, array.raid_disks, array.layout, + 1, avail); + free(avail); + return rv; +} + +const int uuid_zero[4] = { 0, 0, 0, 0 }; + +int same_uuid(int a[4], int b[4], int swapuuid) +{ + if (swapuuid) { + /* parse uuids are hostendian. + * uuid's from some superblocks are big-ending + * if there is a difference, we need to swap..
+ */ + unsigned char *ac = (unsigned char *)a; + unsigned char *bc = (unsigned char *)b; + int i; + for (i = 0; i < 16; i += 4) { + if (ac[i+0] != bc[i+3] || + ac[i+1] != bc[i+2] || + ac[i+2] != bc[i+1] || + ac[i+3] != bc[i+0]) + return 0; + } + return 1; + } else { + if (a[0]==b[0] && + a[1]==b[1] && + a[2]==b[2] && + a[3]==b[3]) + return 1; + return 0; + } +} + +void copy_uuid(void *a, int b[4], int swapuuid) +{ + if (swapuuid) { + /* parse uuids are hostendian. + * uuid's from some superblocks are big-ending + * if there is a difference, we need to swap.. + */ + unsigned char *ac = (unsigned char *)a; + unsigned char *bc = (unsigned char *)b; + int i; + for (i = 0; i < 16; i += 4) { + ac[i+0] = bc[i+3]; + ac[i+1] = bc[i+2]; + ac[i+2] = bc[i+1]; + ac[i+3] = bc[i+0]; + } + } else + memcpy(a, b, 16); +} + +char *__fname_from_uuid(int id[4], int swap, char *buf, char sep) +{ + int i, j; + char uuid[16]; + char *c = buf; + strcpy(c, "UUID-"); + c += strlen(c); + copy_uuid(uuid, id, swap); + for (i = 0; i < 4; i++) { + if (i) + *c++ = sep; + for (j = 3; j >= 0; j--) { + sprintf(c,"%02x", (unsigned char) uuid[j+4*i]); + c+= 2; + } + } + return buf; + +} + +char *fname_from_uuid(struct supertype *st, struct mdinfo *info, char *buf, char sep) +{ + // dirty hack to work around an issue with super1 superblocks... + // super1 superblocks need swapuuid set in order for assembly to + // work, but can't have it set if we want this printout to match + // all the other uuid printouts in super1.c, so we force swapuuid + // to 1 to make our printout match the rest of super1 + return __fname_from_uuid(info->uuid, (st->ss == &super1) ? 1 : st->ss->swapuuid, buf, sep); +} + +#ifndef MDASSEMBLE +int check_ext2(int fd, char *name) +{ + /* + * Check for an ext2fs file system. 
+ * Superblock is always 1K at 1K offset + * + * s_magic is le16 at 56 == 0xEF53 + * report mtime - le32 at 44 + * blocks - le32 at 4 + * logblksize - le32 at 24 + */ + unsigned char sb[1024]; + time_t mtime; + unsigned long long size; + int bsize; + if (lseek(fd, 1024,0)!= 1024) + return 0; + if (read(fd, sb, 1024)!= 1024) + return 0; + if (sb[56] != 0x53 || sb[57] != 0xef) + return 0; + + mtime = sb[44]|(sb[45]|(sb[46]|sb[47]<<8)<<8)<<8; + bsize = sb[24]|(sb[25]|(sb[26]|sb[27]<<8)<<8)<<8; + size = sb[4]|(sb[5]|(sb[6]|sb[7]<<8)<<8)<<8; + size <<= bsize; + pr_err("%s appears to contain an ext2fs file system\n", + name); + cont_err("size=%lluK mtime=%s", size, ctime(&mtime)); + return 1; +} + +int check_reiser(int fd, char *name) +{ + /* + * superblock is at 64K + * size is 1024; + * Magic string "ReIsErFs" or "ReIsEr2Fs" at 52 + * + */ + unsigned char sb[1024]; + unsigned long long size; + if (lseek(fd, 64*1024, 0) != 64*1024) + return 0; + if (read(fd, sb, 1024) != 1024) + return 0; + if (strncmp((char*)sb+52, "ReIsErFs",8) != 0 && + strncmp((char*)sb+52, "ReIsEr2Fs",9) != 0) + return 0; + pr_err("%s appears to contain a reiserfs file system\n",name); + size = sb[0]|(sb[1]|(sb[2]|sb[3]<<8)<<8)<<8; + cont_err("size = %lluK\n", size*4); + + return 1; +} + +int check_raid(int fd, char *name) +{ + struct mdinfo info; + time_t crtime; + char *level; + struct supertype *st = guess_super(fd); + + if (!st) + return 0; + st->ss->load_super(st, fd, name); + /* Looks like a raid array .. 
*/ + pr_err("%s appears to be part of a raid array:\n", + name); + st->ss->getinfo_super(st, &info, NULL); + st->ss->free_super(st); + crtime = info.array.ctime; + level = map_num(pers, info.array.level); + if (!level) level = "-unknown-"; + cont_err("level=%s devices=%d ctime=%s", + level, info.array.raid_disks, ctime(&crtime)); + return 1; +} + +int ask(char *mesg) +{ + char *add = ""; + int i; + for (i = 0; i < 5; i++) { + char buf[100]; + fprintf(stderr, "%s%s", mesg, add); + fflush(stderr); + if (fgets(buf, 100, stdin)==NULL) + return 0; + if (buf[0]=='y' || buf[0]=='Y') + return 1; + if (buf[0]=='n' || buf[0]=='N') + return 0; + add = "(y/n) "; + } + pr_err("assuming 'no'\n"); + return 0; +} +#endif /* MDASSEMBLE */ + +int is_standard(char *dev, int *nump) +{ + /* tests if dev is a "standard" md dev name. + * i.e if the last component is "/dNN" or "/mdNN", + * where NN is a string of digits + * Returns 1 if a partitionable standard, + * -1 if non-partitonable, + * 0 if not a standard name. + */ + char *d = strrchr(dev, '/'); + int type = 0; + int num; + if (!d) + return 0; + if (strncmp(d, "/d",2) == 0) + d += 2, type = 1; /* /dev/md/dN{pM} */ + else if (strncmp(d, "/md_d", 5) == 0) + d += 5, type = 1; /* /dev/md_dN{pM} */ + else if (strncmp(d, "/md", 3) == 0) + d += 3, type = -1; /* /dev/mdN */ + else if (d-dev > 3 && strncmp(d-2, "md/", 3) == 0) + d += 1, type = -1; /* /dev/md/N */ + else + return 0; + if (!*d) + return 0; + num = atoi(d); + while (isdigit(*d)) + d++; + if (*d) + return 0; + if (nump) *nump = num; + + return type; +} + +unsigned long calc_csum(void *super, int bytes) +{ + unsigned long long newcsum = 0; + int i; + unsigned int csum; + unsigned int *superc = (unsigned int*) super; + + for(i = 0; i < bytes/4; i++) + newcsum += superc[i]; + csum = (newcsum& 0xffffffff) + (newcsum>>32); +#ifdef __alpha__ +/* The in-kernel checksum calculation is always 16bit on + * the alpha, though it is 32 bit on i386... + * I wonder what it is elsewhere... 
(it uses an API in + * a way that it shouldn't). + */ + csum = (csum & 0xffff) + (csum >> 16); + csum = (csum & 0xffff) + (csum >> 16); +#endif + return csum; +} + +#ifndef MDASSEMBLE +char *human_size(long long bytes) +{ + static char buf[30]; + + /* We convert bytes to either centi-M{ega,ibi}bytes or + * centi-G{igi,ibi}bytes, with appropriate rounding, + * and then print 1/100th of those as a decimal. + * We allow upto 2048Megabytes before converting to + * gigabytes, as that shows more precision and isn't + * too large a number. + * Terabytes are not yet handled. + */ + + if (bytes < 5000*1024) + buf[0] = 0; + else if (bytes < 2*1024LL*1024LL*1024LL) { + long cMiB = (bytes * 200LL / (1LL<<20) + 1) / 2; + long cMB = (bytes / ( 1000000LL / 200LL ) +1) /2; + snprintf(buf, sizeof(buf), " (%ld.%02ld MiB %ld.%02ld MB)", + cMiB/100 , cMiB % 100, + cMB/100, cMB % 100); + } else { + long cGiB = (bytes * 200LL / (1LL<<30) +1) / 2; + long cGB = (bytes / (1000000000LL/200LL ) +1) /2; + snprintf(buf, sizeof(buf), " (%ld.%02ld GiB %ld.%02ld GB)", + cGiB/100 , cGiB % 100, + cGB/100, cGB % 100); + } + return buf; +} + +char *human_size_brief(long long bytes, int prefix) +{ + static char buf[30]; + + /* We convert bytes to either centi-M{ega,ibi}bytes or + * centi-G{igi,ibi}bytes, with appropriate rounding, + * and then print 1/100th of those as a decimal. + * We allow upto 2048Megabytes before converting to + * gigabytes, as that shows more precision and isn't + * too large a number. + * Terabytes are not yet handled. + * + * If prefix == IEC, we mean prefixes like kibi,mebi,gibi etc. + * If prefix == JEDEC, we mean prefixes like kilo,mega,giga etc. 
+ */ + + if (bytes < 5000*1024) + buf[0] = 0; + else if (prefix == IEC) { + if (bytes < 2*1024LL*1024LL*1024LL) { + long cMiB = (bytes * 200LL / (1LL<<20) +1) /2; + snprintf(buf, sizeof(buf), "%ld.%02ldMiB", + cMiB/100 , cMiB % 100); + } else { + long cGiB = (bytes * 200LL / (1LL<<30) +1) /2; + snprintf(buf, sizeof(buf), "%ld.%02ldGiB", + cGiB/100 , cGiB % 100); + } + } + else if (prefix == JEDEC) { + if (bytes < 2*1024LL*1024LL*1024LL) { + long cMB = (bytes / ( 1000000LL / 200LL ) +1) /2; + snprintf(buf, sizeof(buf), "%ld.%02ldMB", + cMB/100, cMB % 100); + } else { + long cGB = (bytes / (1000000000LL/200LL ) +1) /2; + snprintf(buf, sizeof(buf), "%ld.%02ldGB", + cGB/100 , cGB % 100); + } + } + else + buf[0] = 0; + + return buf; +} + +void print_r10_layout(int layout) +{ + int near = layout & 255; + int far = (layout >> 8) & 255; + int offset = (layout&0x10000); + char *sep = ""; + + if (near != 1) { + printf("%s near=%d", sep, near); + sep = ","; + } + if (far != 1) + printf("%s %s=%d", sep, offset?"offset":"far", far); + if (near*far == 1) + printf("NO REDUNDANCY"); +} +#endif + +unsigned long long calc_array_size(int level, int raid_disks, int layout, + int chunksize, unsigned long long devsize) +{ + if (level == 1) + return devsize; + devsize &= ~(unsigned long long)((chunksize>>9)-1); + return get_data_disks(level, layout, raid_disks) * devsize; +} + +int get_data_disks(int level, int layout, int raid_disks) +{ + int data_disks = 0; + switch (level) { + case 0: data_disks = raid_disks; + break; + case 1: data_disks = 1; + break; + case 4: + case 5: data_disks = raid_disks - 1; + break; + case 6: data_disks = raid_disks - 2; + break; + case 10: data_disks = raid_disks / (layout & 255) / ((layout>>8)&255); + break; + } + + return data_disks; +} + +int devnm2devid(char *devnm) +{ + /* First look in /sys/block/$DEVNM/dev for %d:%d + * If that fails, try parsing out a number + */ + char path[100]; + char *ep; + int fd; + int mjr,mnr; + + sprintf(path, 
"/sys/block/%s/dev", devnm); + fd = open(path, O_RDONLY); + if (fd >= 0) { + char buf[20]; + int n = read(fd, buf, sizeof(buf)); + close(fd); + if (n > 0) + buf[n] = 0; + if (n > 0 && sscanf(buf, "%d:%d\n", &mjr, &mnr) == 2) + return makedev(mjr, mnr); + } + if (strncmp(devnm, "md_d", 4) == 0 && + isdigit(devnm[4]) && + (mnr = strtoul(devnm+4, &ep, 10)) >= 0 && + ep > devnm && *ep == 0) + return makedev(get_mdp_major(), mnr << MdpMinorShift); + + if (strncmp(devnm, "md", 2) == 0 && + isdigit(devnm[2]) && + (mnr = strtoul(devnm+2, &ep, 10)) >= 0 && + ep > devnm && *ep == 0) + return makedev(MD_MAJOR, mnr); + + return 0; +} + +#if !defined(MDASSEMBLE) || defined(MDASSEMBLE) && defined(MDASSEMBLE_AUTO) +char *get_md_name(char *devnm) +{ + /* find /dev/md%d or /dev/md/%d or make a device /dev/.tmp.md%d */ + /* if dev < 0, want /dev/md/d%d or find mdp in /proc/devices ... */ + + static char devname[50]; + struct stat stb; + dev_t rdev = devnm2devid(devnm); + char *dn; + + if (rdev == 0) + return 0; + if (strncmp(devnm, "md_", 3) == 0) { + snprintf(devname, sizeof(devname), "/dev/md/%s", + devnm + 3); + if (stat(devname, &stb) == 0 + && (S_IFMT&stb.st_mode) == S_IFBLK + && (stb.st_rdev == rdev)) + return devname; + } + snprintf(devname, sizeof(devname), "/dev/%s", devnm); + if (stat(devname, &stb) == 0 + && (S_IFMT&stb.st_mode) == S_IFBLK + && (stb.st_rdev == rdev)) + return devname; + + snprintf(devname, sizeof(devname), "/dev/md/%s", devnm+2); + if (stat(devname, &stb) == 0 + && (S_IFMT&stb.st_mode) == S_IFBLK + && (stb.st_rdev == rdev)) + return devname; + + dn = map_dev(major(rdev), minor(rdev), 0); + if (dn) + return dn; + snprintf(devname, sizeof(devname), "/dev/.tmp.%s", devnm); + if (mknod(devname, S_IFBLK | 0600, rdev) == -1) + if (errno != EEXIST) + return NULL; + + if (stat(devname, &stb) == 0 + && (S_IFMT&stb.st_mode) == S_IFBLK + && (stb.st_rdev == rdev)) + return devname; + unlink(devname); + return NULL; +} + +void put_md_name(char *name) +{ + if 
(strncmp(name, "/dev/.tmp.md", 12) == 0) + unlink(name); +} +#endif /* !defined(MDASSEMBLE) || defined(MDASSEMBLE) && defined(MDASSEMBLE_AUTO) */ + +int get_maj_min(char *dev, int *major, int *minor) +{ + char *e; + *major = strtoul(dev, &e, 0); + return (e > dev && *e == ':' && e[1] && + (*minor = strtoul(e+1, &e, 0)) >= 0 && + *e == 0); +} + +int dev_open(char *dev, int flags) +{ + /* like 'open', but if 'dev' matches %d:%d, create a temp + * block device and open that + */ + int fd = -1; + char devname[32]; + int major; + int minor; + + if (!dev) return -1; + flags |= O_DIRECT; + + if (get_maj_min(dev, &major, &minor)) { + snprintf(devname, sizeof(devname), "/dev/.tmp.md.%d:%d:%d", + (int)getpid(), major, minor); + if (mknod(devname, S_IFBLK|0600, makedev(major, minor)) == 0) { + fd = open(devname, flags); + unlink(devname); + } + if (fd < 0) { + /* Try /tmp as /dev appear to be read-only */ + snprintf(devname, sizeof(devname), "/tmp/.tmp.md.%d:%d:%d", + (int)getpid(), major, minor); + if (mknod(devname, S_IFBLK|0600, makedev(major, minor)) == 0) { + fd = open(devname, flags); + unlink(devname); + } + } + } else + fd = open(dev, flags); + return fd; +} + +int open_dev_flags(char *devnm, int flags) +{ + int devid; + char buf[20]; + + devid = devnm2devid(devnm); + sprintf(buf, "%d:%d", major(devid), minor(devid)); + return dev_open(buf, flags); +} + +int open_dev(char *devnm) +{ + return open_dev_flags(devnm, O_RDONLY); +} + +int open_dev_excl(char *devnm) +{ + char buf[20]; + int i; + int flags = O_RDWR; + int devid = devnm2devid(devnm); + long delay = 1000; + + sprintf(buf, "%d:%d", major(devid), minor(devid)); + for (i = 0 ; i < 25 ; i++) { + int fd = dev_open(buf, flags|O_EXCL); + if (fd >= 0) + return fd; + if (errno == EACCES && flags == O_RDWR) { + flags = O_RDONLY; + continue; + } + if (errno != EBUSY) + return fd; + usleep(delay); + if (delay < 200000) + delay *= 2; + } + return -1; +} + +int same_dev(char *one, char *two) +{ + struct stat st1, st2; + if 
(stat(one, &st1) != 0) + return 0; + if (stat(two, &st2) != 0) + return 0; + if ((st1.st_mode & S_IFMT) != S_IFBLK) + return 0; + if ((st2.st_mode & S_IFMT) != S_IFBLK) + return 0; + return st1.st_rdev == st2.st_rdev; +} + +void wait_for(char *dev, int fd) +{ + int i; + struct stat stb_want; + long delay = 1000; + + if (fstat(fd, &stb_want) != 0 || + (stb_want.st_mode & S_IFMT) != S_IFBLK) + return; + + for (i = 0 ; i < 25 ; i++) { + struct stat stb; + if (stat(dev, &stb) == 0 && + (stb.st_mode & S_IFMT) == S_IFBLK && + (stb.st_rdev == stb_want.st_rdev)) + return; + usleep(delay); + if (delay < 200000) + delay *= 2; + } + if (i == 25) + dprintf("timeout waiting for %s\n", dev); +} + +struct superswitch *superlist[] = +{ + &super0, &super1, + &super_ddf, &super_imsm, + &mbr, &gpt, + NULL }; + +#if !defined(MDASSEMBLE) || defined(MDASSEMBLE) && defined(MDASSEMBLE_AUTO) + +struct supertype *super_by_fd(int fd, char **subarrayp) +{ + mdu_array_info_t array; + int vers; + int minor; + struct supertype *st = NULL; + struct mdinfo *sra; + char *verstr; + char version[20]; + int i; + char *subarray = NULL; + char container[32] = ""; + + sra = sysfs_read(fd, NULL, GET_VERSION); + + if (sra) { + vers = sra->array.major_version; + minor = sra->array.minor_version; + verstr = sra->text_version; + } else { + if (ioctl(fd, GET_ARRAY_INFO, &array)) + array.major_version = array.minor_version = 0; + vers = array.major_version; + minor = array.minor_version; + verstr = ""; + } + + if (vers != -1) { + sprintf(version, "%d.%d", vers, minor); + verstr = version; + } + if (minor == -2 && is_subarray(verstr)) { + char *dev = verstr+1; + + subarray = strchr(dev, '/'); + if (subarray) { + *subarray++ = '\0'; + subarray = xstrdup(subarray); + } + strcpy(container, dev); + if (sra) + sysfs_free(sra); + sra = sysfs_read(-1, container, GET_VERSION); + if (sra && sra->text_version[0]) + verstr = sra->text_version; + else + verstr = "-no-metadata-"; + } + + for (i = 0; st == NULL && 
superlist[i] ; i++) + st = superlist[i]->match_metadata_desc(verstr); + + if (sra) + sysfs_free(sra); + if (st) { + st->sb = NULL; + if (subarrayp) + *subarrayp = subarray; + strcpy(st->container_devnm, container); + strcpy(st->devnm, fd2devnm(fd)); + } else + free(subarray); + + return st; +} +#endif /* !defined(MDASSEMBLE) || defined(MDASSEMBLE) && defined(MDASSEMBLE_AUTO) */ + +int dev_size_from_id(dev_t id, unsigned long long *size) +{ + char buf[20]; + int fd; + + sprintf(buf, "%d:%d", major(id), minor(id)); + fd = dev_open(buf, O_RDONLY); + if (fd < 0) + return 0; + if (get_dev_size(fd, NULL, size)) { + close(fd); + return 1; + } + close(fd); + return 0; +} + +struct supertype *dup_super(struct supertype *orig) +{ + struct supertype *st; + + if (!orig) + return orig; + st = xcalloc(1, sizeof(*st)); + st->ss = orig->ss; + st->max_devs = orig->max_devs; + st->minor_version = orig->minor_version; + st->ignore_hw_compat = orig->ignore_hw_compat; + st->data_offset = orig->data_offset; + st->sb = NULL; + st->info = NULL; + return st; +} + +struct supertype *guess_super_type(int fd, enum guess_types guess_type) +{ + /* try each load_super to find the best match, + * and return the best superswitch + */ + struct superswitch *ss; + struct supertype *st; + time_t besttime = 0; + int bestsuper = -1; + int i; + + st = xcalloc(1, sizeof(*st)); + st->container_devnm[0] = 0; + + for (i = 0 ; superlist[i]; i++) { + int rv; + ss = superlist[i]; + if (guess_type == guess_array && ss->add_to_super == NULL) + continue; + if (guess_type == guess_partitions && ss->add_to_super != NULL) + continue; + memset(st, 0, sizeof(*st)); + st->ignore_hw_compat = 1; + rv = ss->load_super(st, fd, NULL); + if (rv == 0) { + struct mdinfo info; + st->ss->getinfo_super(st, &info, NULL); + if (bestsuper == -1 || + besttime < info.array.ctime) { + bestsuper = i; + besttime = info.array.ctime; + } + ss->free_super(st); + } + } + if (bestsuper != -1) { + int rv; + memset(st, 0, sizeof(*st)); + 
st->ignore_hw_compat = 1; + rv = superlist[bestsuper]->load_super(st, fd, NULL); + if (rv == 0) { + superlist[bestsuper]->free_super(st); + return st; + } + } + free(st); + return NULL; +} + +/* Return size of device in bytes */ +int get_dev_size(int fd, char *dname, unsigned long long *sizep) +{ + unsigned long long ldsize; + struct stat st; + + if (fstat(fd, &st) != -1 && S_ISREG(st.st_mode)) + ldsize = (unsigned long long)st.st_size; + else +#ifdef BLKGETSIZE64 + if (ioctl(fd, BLKGETSIZE64, &ldsize) != 0) +#endif + { + unsigned long dsize; + if (ioctl(fd, BLKGETSIZE, &dsize) == 0) { + ldsize = dsize; + ldsize <<= 9; + } else { + if (dname) + pr_err("Cannot get size of %s: %s\b", + dname, strerror(errno)); + return 0; + } + } + *sizep = ldsize; + return 1; +} + +/* Return true if this can only be a container, not a member device. + * i.e. is and md device and size is zero + */ +int must_be_container(int fd) +{ + unsigned long long size; + if (md_get_version(fd) < 0) + return 0; + if (get_dev_size(fd, NULL, &size) == 0) + return 1; + if (size == 0) + return 1; + return 0; +} + +/* Sets endofpart parameter to the last block used by the last GPT partition on the device. 
+ * Returns: 1 if successful + * -1 for unknown partition type + * 0 for other errors + */ +static int get_gpt_last_partition_end(int fd, unsigned long long *endofpart) +{ + struct GPT gpt; + unsigned char empty_gpt_entry[16]= {0}; + struct GPT_part_entry *part; + char buf[512]; + unsigned long long curr_part_end; + unsigned all_partitions, entry_size; + unsigned part_nr; + + *endofpart = 0; + + BUILD_BUG_ON(sizeof(gpt) != 512); + /* skip protective MBR */ + lseek(fd, 512, SEEK_SET); + /* read GPT header */ + if (read(fd, &gpt, 512) != 512) + return 0; + + /* get the number of partition entries and the entry size */ + all_partitions = __le32_to_cpu(gpt.part_cnt); + entry_size = __le32_to_cpu(gpt.part_size); + + /* Check GPT signature*/ + if (gpt.magic != GPT_SIGNATURE_MAGIC) + return -1; + + /* sanity checks */ + if (all_partitions > 1024 || + entry_size > sizeof(buf)) + return -1; + + part = (struct GPT_part_entry *)buf; + + for (part_nr = 0; part_nr < all_partitions; part_nr++) { + /* read partition entry */ + if (read(fd, buf, entry_size) != (ssize_t)entry_size) + return 0; + + /* is this valid partition? */ + if (memcmp(part->type_guid, empty_gpt_entry, 16) != 0) { + /* check the last lba for the current partition */ + curr_part_end = __le64_to_cpu(part->ending_lba); + if (curr_part_end > *endofpart) + *endofpart = curr_part_end; + } + + } + return 1; +} + +/* Sets endofpart parameter to the last block used by the last partition on the device. 
+ * Returns: 1 if successful + * -1 for unknown partition type + * 0 for other errors + */ +static int get_last_partition_end(int fd, unsigned long long *endofpart) +{ + struct MBR boot_sect; + struct MBR_part_record *part; + unsigned long long curr_part_end; + unsigned part_nr; + int retval = 0; + + *endofpart = 0; + + BUILD_BUG_ON(sizeof(boot_sect) != 512); + /* read MBR */ + lseek(fd, 0, 0); + if (read(fd, &boot_sect, 512) != 512) + goto abort; + + /* check MBP signature */ + if (boot_sect.magic == MBR_SIGNATURE_MAGIC) { + retval = 1; + /* found the correct signature */ + part = boot_sect.parts; + + for (part_nr = 0; part_nr < MBR_PARTITIONS; part_nr++) { + /* check for GPT type */ + if (part->part_type == MBR_GPT_PARTITION_TYPE) { + retval = get_gpt_last_partition_end(fd, endofpart); + break; + } + /* check the last used lba for the current partition */ + curr_part_end = __le32_to_cpu(part->first_sect_lba) + + __le32_to_cpu(part->blocks_num); + if (curr_part_end > *endofpart) + *endofpart = curr_part_end; + + part++; + } + } else { + /* Unknown partition table */ + retval = -1; + } + abort: + return retval; +} + +int check_partitions(int fd, char *dname, unsigned long long freesize, + unsigned long long size) +{ + /* + * Check where the last partition ends + */ + unsigned long long endofpart; + int ret; + + if ((ret = get_last_partition_end(fd, &endofpart)) > 0) { + /* There appears to be a partition table here */ + if (freesize == 0) { + /* partitions will not be visible in new device */ + pr_err("partition table exists on %s but will be lost or\n" + " meaningless after creating array\n", + dname); + return 1; + } else if (endofpart > freesize) { + /* last partition overlaps metadata */ + pr_err("metadata will over-write last partition on %s.\n", + dname); + return 1; + } else if (size && endofpart > size) { + /* partitions will be truncated in new device */ + pr_err("array size is too small to cover all partitions on %s.\n", + dname); + return 1; + } + } + 
return 0; +} + +int open_container(int fd) +{ + /* 'fd' is a block device. Find out if it is in use + * by a container, and return an open fd on that container. + */ + char path[256]; + char *e; + DIR *dir; + struct dirent *de; + int dfd, n; + char buf[200]; + int major, minor; + struct stat st; + + if (fstat(fd, &st) != 0) + return -1; + sprintf(path, "/sys/dev/block/%d:%d/holders", + (int)major(st.st_rdev), (int)minor(st.st_rdev)); + e = path + strlen(path); + + dir = opendir(path); + if (!dir) + return -1; + while ((de = readdir(dir))) { + if (de->d_ino == 0) + continue; + if (de->d_name[0] == '.') + continue; + /* Need to make sure it is a container and not a volume */ + sprintf(e, "/%s/md/metadata_version", de->d_name); + dfd = open(path, O_RDONLY); + if (dfd < 0) + continue; + n = read(dfd, buf, sizeof(buf)); + close(dfd); + if (n <= 0 || (unsigned)n >= sizeof(buf)) + continue; + buf[n] = 0; + if (strncmp(buf, "external", 8) != 0 || + n < 10 || + buf[9] == '/') + continue; + sprintf(e, "/%s/dev", de->d_name); + dfd = open(path, O_RDONLY); + if (dfd < 0) + continue; + n = read(dfd, buf, sizeof(buf)); + close(dfd); + if (n <= 0 || (unsigned)n >= sizeof(buf)) + continue; + buf[n] = 0; + if (sscanf(buf, "%d:%d", &major, &minor) != 2) + continue; + sprintf(buf, "%d:%d", major, minor); + dfd = dev_open(buf, O_RDONLY); + if (dfd >= 0) { + closedir(dir); + return dfd; + } + } + closedir(dir); + return -1; +} + +struct superswitch *version_to_superswitch(char *vers) +{ + int i; + + for (i = 0; superlist[i]; i++) { + struct superswitch *ss = superlist[i]; + + if (strcmp(vers, ss->name) == 0) + return ss; + } + + return NULL; +} + +int metadata_container_matches(char *metadata, char *devnm) +{ + /* Check if 'devnm' is the container named in 'metadata' + * which is + * /containername/componentname or + * -containername/componentname + */ + int l; + if (*metadata != '/' && *metadata != '-') + return 0; + l = strlen(devnm); + if (strncmp(metadata+1, devnm, l) != 0) + 
return 0; + if (metadata[l+1] != '/') + return 0; + return 1; +} + +int metadata_subdev_matches(char *metadata, char *devnm) +{ + /* Check if 'devnm' is the subdev named in 'metadata' + * which is + * /containername/subdev or + * -containername/subdev + */ + char *sl; + if (*metadata != '/' && *metadata != '-') + return 0; + sl = strchr(metadata+1, '/'); + if (!sl) + return 0; + if (strcmp(sl+1, devnm) == 0) + return 1; + return 0; +} + +int is_container_member(struct mdstat_ent *mdstat, char *container) +{ + if (mdstat->metadata_version == NULL || + strncmp(mdstat->metadata_version, "external:", 9) != 0 || + !metadata_container_matches(mdstat->metadata_version+9, container)) + return 0; + + return 1; +} + +int is_subarray_active(char *subarray, char *container) +{ + struct mdstat_ent *mdstat = mdstat_read(0, 0); + struct mdstat_ent *ent; + + for (ent = mdstat; ent; ent = ent->next) + if (is_container_member(ent, container)) + if (strcmp(to_subarray(ent, container), subarray) == 0) + break; + + free_mdstat(mdstat); + + return ent != NULL; +} + +/* open_subarray - opens a subarray in a container + * @dev: container device name + * @st: empty supertype + * @quiet: block reporting errors flag + * + * On success returns an fd to a container and fills in *st + */ +int open_subarray(char *dev, char *subarray, struct supertype *st, int quiet) +{ + struct mdinfo *mdi; + struct mdinfo *info; + int fd, err = 1; + char *_devnm; + + fd = open(dev, O_RDWR|O_EXCL); + if (fd < 0) { + if (!quiet) + pr_err("Couldn't open %s, aborting\n", + dev); + return -1; + } + + _devnm = fd2devnm(fd); + if (_devnm == NULL) { + if (!quiet) + pr_err("Failed to determine device number for %s\n", + dev); + goto close_fd; + } + strcpy(st->devnm, _devnm); + + mdi = sysfs_read(fd, st->devnm, GET_VERSION|GET_LEVEL); + if (!mdi) { + if (!quiet) + pr_err("Failed to read sysfs for %s\n", + dev); + goto close_fd; + } + + if (mdi->array.level != UnSet) { + if (!quiet) + pr_err("%s is not a container\n", 
dev); + goto free_sysfs; + } + + st->ss = version_to_superswitch(mdi->text_version); + if (!st->ss) { + if (!quiet) + pr_err("Operation not supported for %s metadata\n", + mdi->text_version); + goto free_sysfs; + } + + if (st->devnm[0] == 0) { + if (!quiet) + pr_err("Failed to allocate device name\n"); + goto free_sysfs; + } + + if (!st->ss->load_container) { + if (!quiet) + pr_err("%s is not a container\n", dev); + goto free_sysfs; + } + + if (st->ss->load_container(st, fd, NULL)) { + if (!quiet) + pr_err("Failed to load metadata for %s\n", + dev); + goto free_sysfs; + } + + info = st->ss->container_content(st, subarray); + if (!info) { + if (!quiet) + pr_err("Failed to find subarray-%s in %s\n", + subarray, dev); + goto free_super; + } + free(info); + + err = 0; + + free_super: + if (err) + st->ss->free_super(st); + free_sysfs: + sysfs_free(mdi); + close_fd: + if (err) + close(fd); + + if (err) + return -1; + else + return fd; +} + +int add_disk(int mdfd, struct supertype *st, + struct mdinfo *sra, struct mdinfo *info) +{ + /* Add a device to an array, in one of 2 ways. */ + int rv; +#ifndef MDASSEMBLE + if (st->ss->external) { + if (info->disk.state & (1<recovery_start = MaxSector; + else + info->recovery_start = 0; + rv = sysfs_add_disk(sra, info, 0); + if (! 
rv) { + struct mdinfo *sd2; + for (sd2 = sra->devs; sd2; sd2=sd2->next) + if (sd2 == info) + break; + if (sd2 == NULL) { + sd2 = xmalloc(sizeof(*sd2)); + *sd2 = *info; + sd2->next = sra->devs; + sra->devs = sd2; + } + } + } else +#endif + rv = ioctl(mdfd, ADD_NEW_DISK, &info->disk); + return rv; +} + +int remove_disk(int mdfd, struct supertype *st, + struct mdinfo *sra, struct mdinfo *info) +{ + int rv; + /* Remove the disk given by 'info' from the array */ +#ifndef MDASSEMBLE + if (st->ss->external) + rv = sysfs_set_str(sra, info, "slot", "none"); + else +#endif + rv = ioctl(mdfd, HOT_REMOVE_DISK, makedev(info->disk.major, + info->disk.minor)); + return rv; +} + +int set_array_info(int mdfd, struct supertype *st, struct mdinfo *info) +{ + /* Initialise kernel's knowledge of array. + * This varies between externally managed arrays + * and older kernels + */ + int vers = md_get_version(mdfd); + int rv; + +#ifndef MDASSEMBLE + if (st->ss->external) + rv = sysfs_set_array(info, vers); + else +#endif + if ((vers % 100) >= 1) { /* can use different versions */ + mdu_array_info_t inf; + memset(&inf, 0, sizeof(inf)); + inf.major_version = info->array.major_version; + inf.minor_version = info->array.minor_version; + rv = ioctl(mdfd, SET_ARRAY_INFO, &inf); + } else + rv = ioctl(mdfd, SET_ARRAY_INFO, NULL); + return rv; +} + +unsigned long long min_recovery_start(struct mdinfo *array) +{ + /* find the minimum recovery_start in an array for metadata + * formats that only record per-array recovery progress instead + * of per-device + */ + unsigned long long recovery_start = MaxSector; + struct mdinfo *d; + + for (d = array->devs; d; d = d->next) + recovery_start = min(recovery_start, d->recovery_start); + + return recovery_start; +} + +int mdmon_pid(char *devnm) +{ + char path[100]; + char pid[10]; + int fd; + int n; + + sprintf(path, "%s/%s.pid", MDMON_DIR, devnm); + + fd = open(path, O_RDONLY | O_NOATIME, 0); + + if (fd < 0) + return -1; + n = read(fd, pid, 9); + close(fd); 
+ if (n <= 0) + return -1; + return atoi(pid); +} + +int mdmon_running(char *devnm) +{ + int pid = mdmon_pid(devnm); + if (pid <= 0) + return 0; + if (kill(pid, 0) == 0) + return 1; + return 0; +} + +int start_mdmon(char *devnm) +{ + int i, skipped; + int len; + pid_t pid; + int status; + char pathbuf[1024]; + char *paths[4] = { + pathbuf, + BINDIR "/mdmon", + "./mdmon", + NULL + }; + + if (check_env("MDADM_NO_MDMON")) + return 0; + + len = readlink("/proc/self/exe", pathbuf, sizeof(pathbuf)-1); + if (len > 0) { + char *sl; + pathbuf[len] = 0; + sl = strrchr(pathbuf, '/'); + if (sl) + sl++; + else + sl = pathbuf; + strcpy(sl, "mdmon"); + } else + pathbuf[0] = '\0'; + + /* First try to run systemctl */ + if (!check_env("MDADM_NO_SYSTEMCTL")) + switch(fork()) { + case 0: + /* FIXME yuk. CLOSE_EXEC?? */ + skipped = 0; + for (i = 3; skipped < 20; i++) + if (close(i) < 0) + skipped++; + else + skipped = 0; + + /* Don't want to see error messages from + * systemctl. If the service doesn't exist, + * we start mdmon ourselves. + */ + close(2); + open("/dev/null", O_WRONLY); + snprintf(pathbuf, sizeof(pathbuf), "mdmon@%s.service", + devnm); + status = execl("/usr/bin/systemctl", "systemctl", + "start", + pathbuf, NULL); + status = execl("/bin/systemctl", "systemctl", "start", + pathbuf, NULL); + exit(1); + case -1: pr_err("cannot run mdmon. Array remains readonly\n"); + return -1; + default: /* parent - good */ + pid = wait(&status); + if (pid >= 0 && status == 0) + return 0; + } + + /* That failed, try running mdmon directly */ + switch(fork()) { + case 0: + /* FIXME yuk. CLOSE_EXEC?? */ + skipped = 0; + for (i = 3; skipped < 20; i++) + if (close(i) < 0) + skipped++; + else + skipped = 0; + + for (i = 0; paths[i]; i++) + if (paths[i][0]) { + execl(paths[i], paths[i], + devnm, NULL); + } + exit(1); + case -1: pr_err("cannot run mdmon. 
Array remains readonly\n"); + return -1; + default: /* parent - good */ + pid = wait(&status); + if (pid < 0 || status != 0) { + pr_err("failed to launch mdmon. Array remains readonly\n"); + return -1; + } + } + return 0; +} + +__u32 random32(void) +{ + __u32 rv; + int rfd = open("/dev/urandom", O_RDONLY); + if (rfd < 0 || read(rfd, &rv, 4) != 4) + rv = random(); + if (rfd >= 0) + close(rfd); + return rv; +} + +#ifndef MDASSEMBLE +int flush_metadata_updates(struct supertype *st) +{ + int sfd; + if (!st->updates) { + st->update_tail = NULL; + return -1; + } + + sfd = connect_monitor(st->container_devnm); + if (sfd < 0) + return -1; + + while (st->updates) { + struct metadata_update *mu = st->updates; + st->updates = mu->next; + + send_message(sfd, mu, 0); + wait_reply(sfd, 0); + free(mu->buf); + free(mu); + } + ack(sfd, 0); + wait_reply(sfd, 0); + close(sfd); + st->update_tail = NULL; + return 0; +} + +void append_metadata_update(struct supertype *st, void *buf, int len) +{ + + struct metadata_update *mu = xmalloc(sizeof(*mu)); + + mu->buf = buf; + mu->len = len; + mu->space = NULL; + mu->space_list = NULL; + mu->next = NULL; + *st->update_tail = mu; + st->update_tail = &mu->next; +} +#endif /* MDASSEMBLE */ + +#ifdef __TINYC__ +/* tinyc doesn't optimize this check in ioctl.h out ... 
*/ +unsigned int __invalid_size_argument_for_IOC = 0; +#endif + +int experimental(void) +{ + if (check_env("MDADM_EXPERIMENTAL")) + return 1; + else { + pr_err("To use this feature MDADM_EXPERIMENTAL environment variable has to be defined.\n"); + return 0; + } +} + +/* Pick all spares matching given criteria from a container + * if min_size == 0 do not check size + * if domlist == NULL do not check domains + * if spare_group given add it to domains of each spare + * metadata allows to test domains using metadata of destination array */ +struct mdinfo *container_choose_spares(struct supertype *st, + unsigned long long min_size, + struct domainlist *domlist, + char *spare_group, + const char *metadata, int get_one) +{ + struct mdinfo *d, **dp, *disks = NULL; + + /* get list of all disks in container */ + if (st->ss->getinfo_super_disks) + disks = st->ss->getinfo_super_disks(st); + + if (!disks) + return disks; + /* find spare devices on the list */ + dp = &disks->devs; + disks->array.spare_disks = 0; + while (*dp) { + int found = 0; + d = *dp; + if (d->disk.state == 0) { + /* check if size is acceptable */ + unsigned long long dev_size; + dev_t dev = makedev(d->disk.major,d->disk.minor); + + if (!min_size || + (dev_size_from_id(dev, &dev_size) && + dev_size >= min_size)) + found = 1; + /* check if domain matches */ + if (found && domlist) { + struct dev_policy *pol = devid_policy(dev); + if (spare_group) + pol_add(&pol, pol_domain, + spare_group, NULL); + if (domain_test(domlist, pol, metadata) != 1) + found = 0; + dev_policy_free(pol); + } + } + if (found) { + dp = &d->next; + disks->array.spare_disks++; + if (get_one) { + sysfs_free(*dp); + d->next = NULL; + } + } else { + *dp = d->next; + d->next = NULL; + sysfs_free(d); + } + } + return disks; +} + +/* Checks if paths point to the same device + * Returns 0 if they do. + * Returns 1 if they don't. + * Returns -1 if something went wrong, + * e.g. 
paths are empty or the files
+ * they point to don't exist */
+int compare_paths (char* path1, char* path2)
+{
+	struct stat st1,st2;
+
+	if (path1 == NULL || path2 == NULL)
+		return -1;
+	if (stat(path1,&st1) != 0)
+		return -1;
+	if (stat(path2,&st2) != 0)
+		return -1;
+	/* same inode on the same device means the same file */
+	if ((st1.st_ino == st2.st_ino) && (st1.st_dev == st2.st_dev))
+		return 0;
+	return 1;
+}
+
+/* Make sure we can open as many devices as needed.
+ * The soft RLIMIT_NOFILE limit is raised to 20 + devices when it is
+ * currently lower, lifting the hard limit as well if that is below
+ * the target.  Nothing is done when the limit cannot be read, and the
+ * result of setrlimit() is not checked (best effort).
+ */
+void enable_fds(int devices)
+{
+	unsigned int fds = 20 + devices;
+	struct rlimit lim;
+	if (getrlimit(RLIMIT_NOFILE, &lim) != 0
+	    || lim.rlim_cur >= fds)
+		return;
+	if (lim.rlim_max < fds)
+		lim.rlim_max = fds;
+	lim.rlim_cur = fds;
+	setrlimit(RLIMIT_NOFILE, &lim);
+}
+
+/* Returns non-zero when statfs("/") succeeds and reports a
+ * filesystem type of TMPFS_MAGIC or RAMFS_MAGIC, zero otherwise.
+ */
+int in_initrd(void)
+{
+	/* This is based on similar function in systemd. */
+	struct statfs s;
+	/* statfs.f_type is signed long on s390x and MIPS, causing all
+	   sorts of sign extension problems with RAMFS_MAGIC being
+	   defined as 0x858458f6 */
+	return statfs("/", &s) >= 0 &&
+		((unsigned long)s.f_type == TMPFS_MAGIC ||
+		 ((unsigned long)s.f_type & 0xFFFFFFFFUL) ==
+		 ((unsigned long)RAMFS_MAGIC & 0xFFFFFFFFUL));
+}
+
+/* Close mdfd and open the same device again through open_dev(),
+ * keeping the descriptor number the caller already holds.
+ * If open_dev() fails, mdfd is left closed; there is no return value
+ * to report that.
+ */
+void reopen_mddev(int mdfd)
+{
+	/* Re-open without any O_EXCL, but keep
+	 * the same fd
+	 */
+	char *devnm;
+	int fd;
+	devnm = fd2devnm(mdfd);
+	close(mdfd);
+	fd = open_dev(devnm);
+	if (fd >= 0 && fd != mdfd) {
+		dup2(fd, mdfd);
+		/* mdfd now refers to the device; the temporary
+		 * descriptor would otherwise leak on every call */
+		close(fd);
+	}
+}
+
+#ifndef MDASSEMBLE
+static struct cmap_hooks *cmap_hooks = NULL;
+static int is_cmap_hooks_ready = 0;
+
+/* Load libcmap.so.4 at run time and look up the three entry points
+ * used by get_cluster_name().  is_cmap_hooks_ready is set to 1 only
+ * when the library and all three symbols were found; if a symbol is
+ * missing the library is closed again.
+ */
+void set_cmap_hooks(void)
+{
+	cmap_hooks = xmalloc(sizeof(struct cmap_hooks));
+	cmap_hooks->cmap_handle = dlopen("libcmap.so.4", RTLD_NOW | RTLD_LOCAL);
+	if (!cmap_hooks->cmap_handle)
+		return;
+
+	cmap_hooks->initialize = dlsym(cmap_hooks->cmap_handle, "cmap_initialize");
+	cmap_hooks->get_string = dlsym(cmap_hooks->cmap_handle, "cmap_get_string");
+	cmap_hooks->finalize = dlsym(cmap_hooks->cmap_handle, "cmap_finalize");
+
+	if (!cmap_hooks->initialize || !cmap_hooks->get_string ||
+	    !cmap_hooks->finalize)
+		dlclose(cmap_hooks->cmap_handle);
+	else
+		is_cmap_hooks_ready = 1;
+}
+
+/* Fetch the "totem.cluster_name" string through the cmap hooks and
+ * store it in *cluster_name.
+ *
+ * Returns 0 on success.  Returns -1 when the hooks are not ready or
+ * the lookup fails; in the latter case *cluster_name is freed and
+ * reset to NULL so the caller is not left with a dangling pointer.
+ * If the initialize hook fails, its status code is returned as is.
+ *
+ * NOTE(review): the free() on the failure path assumes *cluster_name
+ * is NULL or heap memory at that point - confirm against the
+ * cmap_get_string contract and the callers.
+ */
+int get_cluster_name(char **cluster_name)
+{
+	int rv = -1;
+	cmap_handle_t handle;
+
+	if (!is_cmap_hooks_ready)
+		return rv;
+
+	rv = cmap_hooks->initialize(&handle);
+	if (rv != CS_OK)
+		goto out;
+
+	rv = cmap_hooks->get_string(handle, "totem.cluster_name", cluster_name);
+	if (rv != CS_OK) {
+		free(*cluster_name);
+		*cluster_name = NULL;
+		rv = -1;
+		goto name_err;
+	}
+
+	rv = 0;
+name_err:
+	cmap_hooks->finalize(handle);
+out:
+	return rv;
+}
+
+/* Load libdlm_lt.so.3 at run time and look up the six lock-manager
+ * entry points stored in dlm_hooks.  is_dlm_hooks_ready is set to 1
+ * only when the library and every symbol were found; if a symbol is
+ * missing the library is closed again.
+ */
+void set_dlm_hooks(void)
+{
+	dlm_hooks = xmalloc(sizeof(struct dlm_hooks));
+	dlm_hooks->dlm_handle = dlopen("libdlm_lt.so.3", RTLD_NOW | RTLD_LOCAL);
+	if (!dlm_hooks->dlm_handle)
+		return;
+
+	dlm_hooks->create_lockspace = dlsym(dlm_hooks->dlm_handle, "dlm_create_lockspace");
+	dlm_hooks->release_lockspace = dlsym(dlm_hooks->dlm_handle, "dlm_release_lockspace");
+	dlm_hooks->ls_lock = dlsym(dlm_hooks->dlm_handle, "dlm_ls_lock");
+	dlm_hooks->ls_unlock = dlsym(dlm_hooks->dlm_handle, "dlm_ls_unlock");
+	dlm_hooks->ls_get_fd = dlsym(dlm_hooks->dlm_handle, "dlm_ls_get_fd");
+	dlm_hooks->dispatch = dlsym(dlm_hooks->dlm_handle, "dlm_dispatch");
+
+	if (!dlm_hooks->create_lockspace || !dlm_hooks->ls_lock ||
+	    !dlm_hooks->ls_unlock || !dlm_hooks->release_lockspace ||
+	    !dlm_hooks->ls_get_fd || !dlm_hooks->dispatch)
+		dlclose(dlm_hooks->dlm_handle);
+	else
+		is_dlm_hooks_ready = 1;
+}
+
+/* Set up both sets of run-time hooks: dlm first, then cmap. */
+void set_hooks(void)
+{
+	set_dlm_hooks();
+	set_cmap_hooks();
+}
+#endif
diff --git a/xmalloc.c b/xmalloc.c
new file mode 100644
index 00000000..8b3f78a6
--- /dev/null
+++ b/xmalloc.c
@@ -0,0 +1,84 @@
+/* mdadm - manage Linux "md" devices aka RAID arrays.
+ *
+ * Copyright (C) 2001-2009 Neil Brown
+ *
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * Author: Neil Brown
+ * Email:
+ */
+
+#include "mdadm.h"
+/*#include
+#include
+#include
+#include
+#include
+#include
+#include
+*/
+
+/* Common tail of the allocation wrappers below.
+ * A non-NULL result is handed straight back.  For NULL, Name followed
+ * by a fixed message is written to file descriptor 2 and the process
+ * exits: status 4 if the two write() results sum to zero, 5 otherwise.
+ */
+static void *alloc_or_die(void *result)
+{
+	static const char failure[] = ": memory allocation failure - aborting\n";
+	int written;
+
+	if (result != NULL)
+		return result;
+
+	written = write(2, Name, strlen(Name));
+	written += write(2, failure, sizeof(failure) - 1);
+	exit(4+!!written);
+}
+
+/* malloc() that never returns NULL: a NULL result terminates the
+ * program through alloc_or_die().
+ */
+void *xmalloc(size_t len)
+{
+	return alloc_or_die(malloc(len));
+}
+
+/* realloc() that never returns NULL: a NULL result terminates the
+ * program through alloc_or_die().
+ */
+void *xrealloc(void *ptr, size_t len)
+{
+	return alloc_or_die(realloc(ptr, len));
+}
+
+/* calloc() that never returns NULL: a NULL result terminates the
+ * program through alloc_or_die().
+ */
+void *xcalloc(size_t num, size_t size)
+{
+	return alloc_or_die(calloc(num, size));
+}
+
+/* strdup() that never returns NULL: a NULL result terminates the
+ * program through alloc_or_die().
+ */
+char *xstrdup(const char *str)
+{
+	return alloc_or_die(strdup(str));
+}
--
cgit v1.2.3