diff options
-rw-r--r-- | ANNOUNCE-3.4 | 24 | ||||
-rw-r--r-- | Assemble.c | 59 | ||||
-rw-r--r-- | Create.c | 35 | ||||
-rw-r--r-- | Detail.c | 27 | ||||
-rwxr-xr-x[-rw-r--r--] | Grow.c | 36 | ||||
-rw-r--r-- | Incremental.c | 32 | ||||
-rw-r--r-- | Makefile | 37 | ||||
-rw-r--r-- | Manage.c | 101 | ||||
-rw-r--r-- | ReadMe.c | 14 | ||||
-rw-r--r-- | bitmap.c | 63 | ||||
-rw-r--r-- | bitmap.h | 8 | ||||
-rw-r--r-- | config.c | 30 | ||||
-rw-r--r-- | crc32c.c | 104 | ||||
-rw-r--r-- | debian/changelog | 6 | ||||
-rwxr-xr-x | inventory | 3 | ||||
-rw-r--r-- | mapfile.c | 2 | ||||
-rw-r--r-- | md.4 | 20 | ||||
-rw-r--r-- | md_p.h | 71 | ||||
-rw-r--r-- | md_u.h | 5 | ||||
-rw-r--r-- | mdadm.8.in | 66 | ||||
-rw-r--r-- | mdadm.c | 114 | ||||
-rw-r--r-- | mdadm.conf.5 | 2 | ||||
-rwxr-xr-x[-rw-r--r--] | mdadm.h | 123 | ||||
-rw-r--r-- | mdadm.spec | 2 | ||||
-rw-r--r-- | mdassemble.8 | 4 | ||||
-rw-r--r-- | mdmon.8 | 2 | ||||
-rw-r--r-- | mdmon.c | 2 | ||||
-rw-r--r-- | msg.c | 2 | ||||
-rw-r--r-- | platform-intel.c | 86 | ||||
-rw-r--r-- | platform-intel.h | 4 | ||||
-rw-r--r-- | raid6check.c | 3 | ||||
-rw-r--r-- | restripe.c | 6 | ||||
-rw-r--r-- | sha1.h | 8 | ||||
-rw-r--r-- | super-intel.c | 138 | ||||
-rw-r--r-- | super0.c | 14 | ||||
-rw-r--r-- | super1.c | 349 | ||||
-rw-r--r-- | sysfs.c | 13 | ||||
-rw-r--r-- | systemd/mdadm-last-resort@.service | 1 | ||||
-rw-r--r-- | systemd/mdmonitor.service | 5 | ||||
-rwxr-xr-x | test | 9 | ||||
-rw-r--r-- | tests/19raid6auto-repair | 66 | ||||
-rw-r--r-- | tests/19raid6repair | 81 | ||||
-rw-r--r-- | tests/20raid5journal | 64 | ||||
-rw-r--r-- | udev-md-raid-arrays.rules | 8 | ||||
-rw-r--r-- | udev-md-raid-assembly.rules | 5 | ||||
-rw-r--r-- | util.c | 227 |
46 files changed, 1799 insertions, 282 deletions
diff --git a/ANNOUNCE-3.4 b/ANNOUNCE-3.4 new file mode 100644 index 00000000..2689732d --- /dev/null +++ b/ANNOUNCE-3.4 @@ -0,0 +1,24 @@ +Subject: ANNOUNCE: mdadm 3.4 - A tool for managing md Soft RAID under Linux + +I am pleased to announce the availability of + mdadm version 3.4 + +It is available at the usual places: + http://www.kernel.org/pub/linux/utils/raid/mdadm/ +and via git at + git://github.com/neilbrown/mdadm + git://neil.brown.name/mdadm + http://git.neil.brown.name/git/mdadm + +The new second-level version number reflects significant new +functionality, in particular support for journalled RAID5/6 and clustered +RAID1. This new support is probably still buggy. Please report bugs. + +There are also a number of fixes for Intel's IMSM metadata support, +and an assortment of minor bug fixes. + +I plan for this to be the last release of mdadm that I provide as I am +retiring from MD and mdadm maintenance. Jes Sorensen has volunteered +to oversee mdadm for the next while. Thanks Jes! + +NeilBrown 28th January 2016 @@ -1,7 +1,7 @@ /* * mdadm - manage Linux "md" devices aka RAID arrays. 
* - * Copyright (C) 2001-2013 Neil Brown <neilb@suse.de> + * Copyright (C) 2001-2016 Neil Brown <neilb@suse.com> * * * This program is free software; you can redistribute it and/or modify @@ -637,6 +637,19 @@ static int load_devices(struct devs *devices, char *devmap, if (strcmp(c->update, "byteorder") == 0) err = 0; + else if (strcmp(c->update, "home-cluster") == 0) { + tst->cluster_name = c->homecluster; + err = tst->ss->write_bitmap(tst, dfd, NameUpdate); + } else if (strcmp(c->update, "nodes") == 0) { + tst->nodes = c->nodes; + err = tst->ss->write_bitmap(tst, dfd, NodeNumUpdate); + } else if (strcmp(c->update, "revert-reshape") == 0 && + c->invalid_backup) + err = tst->ss->update_super(tst, content, + "revert-reshape-nobackup", + devname, c->verbose, + ident->uuid_set, + c->homehost); else err = tst->ss->update_super(tst, content, c->update, devname, c->verbose, @@ -729,7 +742,7 @@ static int load_devices(struct devs *devices, char *devmap, i = devcnt; else i = devices[devcnt].i.disk.raid_disk; - if (i+1 == 0) { + if (i+1 == 0 || i == MD_DISK_ROLE_JOURNAL) { if (nextspare < content->array.raid_disks*2) nextspare = content->array.raid_disks*2; i = nextspare++; @@ -907,7 +920,6 @@ static int force_array(struct mdinfo *content, avail[chosen_drive] = 1; okcnt++; tst->ss->free_super(tst); - /* If there are any other drives of the same vintage, * add them in as well. 
We can't lose and we might gain */ @@ -938,6 +950,7 @@ static int start_array(int mdfd, unsigned int okcnt, unsigned int sparecnt, unsigned int rebuilding_cnt, + unsigned int journalcnt, struct context *c, int clean, char *avail, int start_partial_ok, @@ -949,6 +962,15 @@ static int start_array(int mdfd, int i; unsigned int req_cnt; + if (content->journal_device_required && (content->journal_clean == 0)) { + if (!c->force) { + pr_err("Not safe to assemble with missing or stale journal device, consider --force.\n"); + return 1; + } + pr_err("Journal is missing or stale, starting array read only.\n"); + c->readonly = 1; + } + rv = set_array_info(mdfd, st, content); if (rv && !err_ok) { pr_err("failed to set array info for %s: %s\n", @@ -1026,7 +1048,8 @@ static int start_array(int mdfd, if (content->array.level == LEVEL_CONTAINER) { if (c->verbose >= 0) { pr_err("Container %s has been assembled with %d drive%s", - mddev, okcnt+sparecnt, okcnt+sparecnt==1?"":"s"); + mddev, okcnt+sparecnt+journalcnt, + okcnt+sparecnt+journalcnt==1?"":"s"); if (okcnt < (unsigned)content->array.raid_disks) fprintf(stderr, " (out of %d)", content->array.raid_disks); @@ -1112,6 +1135,8 @@ static int start_array(int mdfd, fprintf(stderr, "%s %d rebuilding", sparecnt?",":" and", rebuilding_cnt); if (sparecnt) fprintf(stderr, " and %d spare%s", sparecnt, sparecnt==1?"":"s"); + if (content->journal_clean) + fprintf(stderr, " and %d journal", journalcnt); fprintf(stderr, ".\n"); } if (content->reshape_active && @@ -1283,7 +1308,8 @@ int Assemble(struct supertype *st, char *mddev, int *best = NULL; /* indexed by raid_disk */ int bestcnt = 0; int devcnt; - unsigned int okcnt, sparecnt, rebuilding_cnt, replcnt; + unsigned int okcnt, sparecnt, rebuilding_cnt, replcnt, journalcnt; + int journal_clean = 0; int i; int was_forced = 0; int most_recent = 0; @@ -1524,6 +1550,7 @@ try_again: okcnt = 0; replcnt = 0; sparecnt=0; + journalcnt=0; rebuilding_cnt=0; for (i=0; i< bestcnt; i++) { int j = best[i]; 
@@ -1534,8 +1561,13 @@ try_again: /* note: we ignore error flags in multipath arrays * as they don't make sense */ - if (content->array.level != LEVEL_MULTIPATH) - if (!(devices[j].i.disk.state & (1<<MD_DISK_ACTIVE))) { + if (content->array.level != LEVEL_MULTIPATH) { + if (devices[j].i.disk.state & (1<<MD_DISK_JOURNAL)) { + if (content->journal_device_required) + journalcnt++; + else /* unexpected journal, mark as faulty */ + devices[j].i.disk.state |= (1<<MD_DISK_FAULTY); + } else if (!(devices[j].i.disk.state & (1<<MD_DISK_ACTIVE))) { if (!(devices[j].i.disk.state & (1<<MD_DISK_FAULTY))) { devices[j].uptodate = 1; @@ -1543,6 +1575,7 @@ try_again: } continue; } + } /* If this device thinks that 'most_recent' has failed, then * we must reject this device. */ @@ -1566,6 +1599,8 @@ try_again: devices[most_recent].i.events ) { devices[j].uptodate = 1; + if (devices[j].i.disk.state & (1<<MD_DISK_JOURNAL)) + journal_clean = 1; if (i < content->array.raid_disks * 2) { if (devices[j].i.recovery_start == MaxSector || (content->reshape_active && @@ -1577,7 +1612,7 @@ try_again: replcnt++; } else rebuilding_cnt++; - } else + } else if (devices[j].i.disk.raid_disk != MD_DISK_ROLE_JOURNAL) sparecnt++; } } @@ -1637,11 +1672,15 @@ try_again: #ifndef MDASSEMBLE sysfs_init(content, mdfd, NULL); #endif + /* after reload context, store journal_clean in context */ + content->journal_clean = journal_clean; for (i=0; i<bestcnt; i++) { int j = best[i]; unsigned int desired_state; - if (i >= content->array.raid_disks * 2) + if (devices[j].i.disk.raid_disk == MD_DISK_ROLE_JOURNAL) + desired_state = (1<<MD_DISK_JOURNAL); + else if (i >= content->array.raid_disks * 2) desired_state = 0; else if (i & 1) desired_state = (1<<MD_DISK_ACTIVE) | (1<<MD_DISK_REPLACEMENT); @@ -1788,7 +1827,7 @@ try_again: rv = start_array(mdfd, mddev, content, st, ident, best, bestcnt, chosen_drive, devices, okcnt, sparecnt, - rebuilding_cnt, + rebuilding_cnt, journalcnt, c, clean, avail, start_partial_ok, 
pre_exist != NULL, @@ -87,7 +87,7 @@ int Create(struct supertype *st, char *mddev, unsigned long long minsize=0, maxsize=0; char *mindisc = NULL; char *maxdisc = NULL; - int dnum; + int dnum, raid_disk_num; struct mddev_dev *dv; int fail=0, warn=0; struct stat stb; @@ -114,6 +114,8 @@ int Create(struct supertype *st, char *mddev, unsigned long long newsize; int major_num = BITMAP_MAJOR_HI; + if (s->bitmap_file && strcmp(s->bitmap_file, "clustered") == 0) + major_num = BITMAP_MAJOR_CLUSTERED; memset(&info, 0, sizeof(info)); if (s->level == UnSet && st && st->ss->default_geometry) @@ -180,11 +182,11 @@ int Create(struct supertype *st, char *mddev, pr_err("This metadata type does not support spare disks at create time\n"); return 1; } - if (subdevs > s->raiddisks+s->sparedisks) { + if (subdevs > s->raiddisks+s->sparedisks+s->journaldisks) { pr_err("You have listed more devices (%d) than are in the array(%d)!\n", subdevs, s->raiddisks+s->sparedisks); return 1; } - if (!have_container && subdevs < s->raiddisks+s->sparedisks) { + if (!have_container && subdevs < s->raiddisks+s->sparedisks+s->journaldisks) { pr_err("You haven't given enough devices (real or missing) to create this array\n"); return 1; } @@ -328,7 +330,7 @@ int Create(struct supertype *st, char *mddev, } close(dfd); info.array.working_disks++; - if (dnum < s->raiddisks) + if (dnum < s->raiddisks && dv->disposition != 'j') info.array.active_disks++; if (st == NULL) { struct createinfo *ci = conf_get_create_info(); @@ -397,6 +399,9 @@ int Create(struct supertype *st, char *mddev, } } + if (dv->disposition == 'j') + goto skip_size_check; /* skip write journal for size check */ + freesize /= 2; /* convert to K */ if (s->chunk && s->chunk != UnSet) { /* round to chunk size */ @@ -429,6 +434,7 @@ int Create(struct supertype *st, char *mddev, mindisc = dname; minsize = freesize; } + skip_size_check: if (c->runstop != 1 || c->verbose >= 0) { int fd = open(dname, O_RDONLY); if (fd <0 ) { @@ -531,6 +537,8 @@ int 
Create(struct supertype *st, char *mddev, st->ss->name); warn = 1; } + st->nodes = c->nodes; + st->cluster_name = c->homecluster; if (warn) { if (c->runstop!= 1) { @@ -750,7 +758,8 @@ int Create(struct supertype *st, char *mddev, #endif } - if (s->bitmap_file && strcmp(s->bitmap_file, "internal")==0) { + if (s->bitmap_file && (strcmp(s->bitmap_file, "internal")==0 || + strcmp(s->bitmap_file, "clustered")==0)) { if ((vers%100) < 2) { pr_err("internal bitmaps not supported by this kernel.\n"); goto abort_locked; @@ -834,7 +843,7 @@ int Create(struct supertype *st, char *mddev, for (pass=1; pass <=2 ; pass++) { struct mddev_dev *moved_disk = NULL; /* the disk that was moved out of the insert point */ - for (dnum=0, dv = devlist ; dv ; + for (dnum=0, raid_disk_num=0, dv = devlist ; dv ; dv=(dv->next)?(dv->next):moved_disk, dnum++) { int fd; struct stat stb; @@ -843,11 +852,14 @@ int Create(struct supertype *st, char *mddev, if (dnum >= total_slots) abort(); if (dnum == insert_point) { + raid_disk_num += 1; moved_disk = dv; continue; } - if (strcasecmp(dv->devname, "missing")==0) + if (strcasecmp(dv->devname, "missing")==0) { + raid_disk_num += 1; continue; + } if (have_container) moved_disk = NULL; if (have_container && dnum < info.array.raid_disks - 1) @@ -859,8 +871,13 @@ int Create(struct supertype *st, char *mddev, *inf = info; inf->disk.number = dnum; - inf->disk.raid_disk = dnum; - if (inf->disk.raid_disk < s->raiddisks) + inf->disk.raid_disk = raid_disk_num++; + + if (dv->disposition == 'j') { + inf->disk.raid_disk = MD_DISK_ROLE_JOURNAL; + inf->disk.state = (1<<MD_DISK_JOURNAL); + raid_disk_num--; + } else if (inf->disk.raid_disk < s->raiddisks) inf->disk.state = (1<<MD_DISK_ACTIVE) | (1<<MD_DISK_SYNC); else @@ -299,7 +299,8 @@ int Detail(char *dev, struct context *c) for (d = 0; d < max_disks * 2; d++) { disks[d].state = (1<<MD_DISK_REMOVED); disks[d].major = disks[d].minor = 0; - disks[d].number = disks[d].raid_disk = d; + disks[d].number = -1; + 
disks[d].raid_disk = d/2; } next = array.raid_disks*2; @@ -325,7 +326,8 @@ int Detail(char *dev, struct context *c) && disks[disk.raid_disk*2].state == (1<<MD_DISK_REMOVED)) disks[disk.raid_disk*2] = disk; else if (disk.raid_disk >= 0 && disk.raid_disk < array.raid_disks - && disks[disk.raid_disk*2+1].state == (1<<MD_DISK_REMOVED)) + && disks[disk.raid_disk*2+1].state == (1<<MD_DISK_REMOVED) + && !(disk.state & (1<<MD_DISK_JOURNAL))) disks[disk.raid_disk*2+1] = disk; else if (next < max_disks*2) disks[next++] = disk; @@ -339,7 +341,8 @@ int Detail(char *dev, struct context *c) (disks[d*2+1].state & (1<<MD_DISK_SYNC))) { avail_disks ++; avail[d] = 1; - } + } else + rv |= !! c->test; } if (c->brief) { @@ -422,8 +425,9 @@ int Detail(char *dev, struct context *c) else printf(" Used Dev Size : unknown\n"); } else - printf(" Used Dev Size : %d%s\n", array.size, - human_size((long long)array.size<<10)); + printf(" Used Dev Size : %lu%s\n", + (unsigned long)array.size, + human_size((unsigned long long)array.size<<10)); } if (array.raid_disks) printf(" Raid Devices : %d\n", array.raid_disks); @@ -616,12 +620,15 @@ This is pretty boring continue; if (!c->brief) { if (d == array.raid_disks*2) printf("\n"); - if (disk.number < 0) + if (disk.number < 0 && disk.raid_disk < 0) printf(" - %5d %5d - ", disk.major, disk.minor); - else if (disk.raid_disk < 0) + else if (disk.raid_disk < 0 || disk.state & (1<<MD_DISK_JOURNAL)) printf(" %5d %5d %5d - ", disk.number, disk.major, disk.minor); + else if (disk.number < 0) + printf(" - %5d %5d %5d ", + disk.major, disk.minor, disk.raid_disk); else printf(" %5d %5d %5d %5d ", disk.number, disk.major, disk.minor, disk.raid_disk); @@ -650,9 +657,10 @@ This is pretty boring } if (disk.state & (1<<MD_DISK_REMOVED)) printf(" removed"); if (disk.state & (1<<MD_DISK_WRITEMOSTLY)) printf(" writemostly"); + if (disk.state & (1<<MD_DISK_JOURNAL)) printf(" journal"); if ((disk.state & ((1<<MD_DISK_ACTIVE)|(1<<MD_DISK_SYNC) - 
|(1<<MD_DISK_REMOVED)|(1<<MD_DISK_FAULTY))) + |(1<<MD_DISK_REMOVED)|(1<<MD_DISK_FAULTY)|(1<<MD_DISK_JOURNAL))) == 0) { printf(" spare"); if (is_26) { @@ -671,9 +679,6 @@ This is pretty boring } } if (disk.state == 0) spares++; - if (c->test && d < array.raid_disks - && !(disk.state & (1<<MD_DISK_SYNC))) - rv |= 1; dv=map_dev_preferred(disk.major, disk.minor, 0, c->prefer); if (dv != NULL) { if (c->brief) @@ -297,6 +297,9 @@ int Grow_addbitmap(char *devname, int fd, struct context *c, struct shape *s) " between different architectures. Consider upgrading the Linux kernel.\n"); } + if (s->bitmap_file && strcmp(s->bitmap_file, "clustered") == 0) + major = BITMAP_MAJOR_CLUSTERED; + if (ioctl(fd, GET_BITMAP_FILE, &bmf) != 0) { if (errno == ENOMEM) pr_err("Memory allocation failure.\n"); @@ -325,13 +328,15 @@ int Grow_addbitmap(char *devname, int fd, struct context *c, struct shape *s) if (strcmp(s->bitmap_file, "none")==0) { array.state &= ~(1<<MD_SB_BITMAP_PRESENT); if (ioctl(fd, SET_ARRAY_INFO, &array)!= 0) { - pr_err("failed to remove internal bitmap.\n"); + if (array.state & (1<<MD_SB_CLUSTERED)) + pr_err("failed to remove clustered bitmap.\n"); + else + pr_err("failed to remove internal bitmap.\n"); return 1; } return 0; } - pr_err("Internal bitmap already present on %s\n", - devname); + pr_err("bitmap already present on %s\n", devname); return 1; } @@ -375,7 +380,8 @@ int Grow_addbitmap(char *devname, int fd, struct context *c, struct shape *s) free(st); return 1; } - if (strcmp(s->bitmap_file, "internal") == 0) { + if (strcmp(s->bitmap_file, "internal") == 0 || + strcmp(s->bitmap_file, "clustered") == 0) { int rv; int d; int offset_setable = 0; @@ -384,6 +390,8 @@ int Grow_addbitmap(char *devname, int fd, struct context *c, struct shape *s) pr_err("Internal bitmaps not supported with %s metadata\n", st->ss->name); return 1; } + st->nodes = c->nodes; + st->cluster_name = c->homecluster; mdi = sysfs_read(fd, NULL, GET_BITMAP_LOCATION); if (mdi) offset_setable = 1; 
@@ -410,7 +418,7 @@ int Grow_addbitmap(char *devname, int fd, struct context *c, struct shape *s) bitmapsize, offset_setable, major) ) - st->ss->write_bitmap(st, fd2); + st->ss->write_bitmap(st, fd2, NoUpdate); else { pr_err("failed to create internal bitmap - chunksize problem.\n"); close(fd2); @@ -426,6 +434,8 @@ int Grow_addbitmap(char *devname, int fd, struct context *c, struct shape *s) rv = sysfs_set_num_signed(mdi, NULL, "bitmap/location", mdi->bitmap_offset); } else { + if (strcmp(s->bitmap_file, "clustered") == 0) + array.state |= (1<<MD_SB_CLUSTERED); array.state |= (1<<MD_SB_BITMAP_PRESENT); rv = ioctl(fd, SET_ARRAY_INFO, &array); } @@ -1580,6 +1590,15 @@ int Grow_reshape(char *devname, int fd, pr_err("Cannot increase raid-disks on this array beyond %d\n", st->max_devs); return 1; } + if (s->level == 0 && + (array.state & (1<<MD_SB_BITMAP_PRESENT)) && + !(array.state & (1<<MD_SB_CLUSTERED))) { + array.state &= ~(1<<MD_SB_BITMAP_PRESENT); + if (ioctl(fd, SET_ARRAY_INFO, &array)!= 0) { + pr_err("failed to remove internal bitmap.\n"); + return 1; + } + } /* in the external case we need to check that the requested reshape is * supported, and perform an initial check that the container holds the @@ -4496,8 +4515,8 @@ int Grow_restart(struct supertype *st, struct mdinfo *info, int *fdlist, int cnt * sometimes they aren't... So allow considerable flexability in matching, and allow * this test to be overridden by an environment variable. 
*/ - if (info->array.utime > (int)__le64_to_cpu(bsb.mtime) + 2*60*60 || - info->array.utime < (int)__le64_to_cpu(bsb.mtime) - 10*60) { + if(time_after(info->array.utime, (unsigned int)__le64_to_cpu(bsb.mtime) + 2*60*60) || + time_before(info->array.utime, (unsigned int)__le64_to_cpu(bsb.mtime) - 10*60)) { if (check_env("MDADM_GROW_ALLOW_OLD")) { pr_err("accepting backup with timestamp %lu for array with timestamp %lu\n", (unsigned long)__le64_to_cpu(bsb.mtime), @@ -4866,6 +4885,9 @@ int Grow_continue_command(char *devname, int fd, sysfs_init(content, fd2, mdstat->devnm); + close(fd2); + fd2 = -1; + /* start mdmon in case it is not running */ if (!mdmon_running(container)) diff --git a/Incremental.c b/Incremental.c index 41876b9e..24fd8276 100644 --- a/Incremental.c +++ b/Incremental.c @@ -104,6 +104,7 @@ int Incremental(struct mddev_dev *devlist, struct context *c, struct map_ent target_array; int have_target; char *devname = devlist->devname; + int journal_device_missing = 0; struct createinfo *ci = conf_get_create_info(); @@ -312,6 +313,12 @@ int Incremental(struct mddev_dev *devlist, struct context *c, if (mdfd < 0) { + /* Skip the clustered ones. This should be started by + * clustering resource agents + */ + if (info.array.state & (1 << MD_SB_CLUSTERED)) + goto out; + /* Couldn't find an existing array, maybe make a new one */ mdfd = create_mddev(match ? 
match->devname : NULL, name_to_use, c->autof, trustworthy, chosen_name); @@ -437,6 +444,10 @@ int Incremental(struct mddev_dev *devlist, struct context *c, /* add disk needs to know about containers */ if (st->ss->external) sra->array.level = LEVEL_CONTAINER; + + if (info.array.state & (1 << MD_SB_CLUSTERED)) + info.disk.state |= (1 << MD_DISK_CLUSTER_ADD); + err = add_disk(mdfd, st, sra, &info); if (err < 0 && errno == EBUSY) { /* could be another device present with the same @@ -514,6 +525,9 @@ int Incremental(struct mddev_dev *devlist, struct context *c, sra = sysfs_read(mdfd, NULL, (GET_DEVS | GET_STATE | GET_OFFSET | GET_SIZE)); active_disks = count_active(st, sra, mdfd, &avail, &info); + + journal_device_missing = (info.journal_device_required) && (info.journal_clean == 0); + if (enough(info.array.level, info.array.raid_disks, info.array.layout, info.array.state & 1, avail) == 0) { @@ -543,10 +557,12 @@ int Incremental(struct mddev_dev *devlist, struct context *c, } map_unlock(&map); - if (c->runstop > 0 || active_disks >= info.array.working_disks) { + if (c->runstop > 0 || (!journal_device_missing && active_disks >= info.array.working_disks)) { struct mdinfo *dsk; /* Let's try to start it */ + if (journal_device_missing) + pr_err("Trying to run with missing journal device\n"); if (info.reshape_active && !(info.reshape_active & RESHAPE_NO_BACKUP)) { pr_err("%s: This array is being reshaped and cannot be started\n", chosen_name); @@ -613,6 +629,8 @@ int Incremental(struct mddev_dev *devlist, struct context *c, } else { if (c->export) { printf("MD_STARTED=unsafe\n"); + } else if (journal_device_missing) { + pr_err("Journal device is missing, not safe to start yet.\n"); } else if (c->verbose >= 0) pr_err("%s attached to %s, not enough to start safely.\n", devname, chosen_name); @@ -649,7 +667,7 @@ static void find_reject(int mdfd, struct supertype *st, struct mdinfo *sra, * without thinking more */ for (d = sra->devs; d ; d = d->next) { - char dn[10]; + char 
dn[24]; // 2*11 bytes for ints (including sign) + colon + null byte int dfd; struct mdinfo info; sprintf(dn, "%d:%d", d->disk.major, d->disk.minor); @@ -713,8 +731,11 @@ static int count_active(struct supertype *st, struct mdinfo *sra, close(dfd); if (ok != 0) continue; + info.array.raid_disks = raid_disks; st->ss->getinfo_super(st, &info, devmap + raid_disks * devnum); + if (info.disk.raid_disk == MD_DISK_ROLE_JOURNAL) + bestinfo->journal_clean = 1; if (!avail) { raid_disks = info.array.raid_disks; avail = xcalloc(raid_disks, 1); @@ -764,6 +785,7 @@ static int count_active(struct supertype *st, struct mdinfo *sra, replcnt++; st->ss->free_super(st); } + if (!avail) return 0; /* We need to reject any device that thinks the best device is @@ -1012,12 +1034,12 @@ static int array_try_spare(char *devname, int *dfdp, struct dev_policy *pol, int mdfd = open_dev(chosen->sys_name); if (mdfd >= 0) { struct mddev_dev devlist; - char devname[20]; + char chosen_devname[24]; // 2*11 for int (including signs) + colon + null devlist.next = NULL; devlist.used = 0; devlist.writemostly = 0; - devlist.devname = devname; - sprintf(devname, "%d:%d", major(stb.st_rdev), + devlist.devname = chosen_devname; + sprintf(chosen_devname, "%d:%d", major(stb.st_rdev), minor(stb.st_rdev)); devlist.disposition = 'a'; close(dfd); @@ -43,7 +43,7 @@ KLIBC_GCC = gcc -nostdinc -iwithprefix include -I$(KLIBC)/klibc/include -I$(KLIB CC = $(CROSS_COMPILE)gcc CXFLAGS ?= -ggdb -CWFLAGS = -Wall -Wstrict-prototypes -Wextra -Wno-unused-parameter +CWFLAGS = -Wall -Werror -Wstrict-prototypes -Wextra -Wno-unused-parameter ifdef WARN_UNUSED CWFLAGS += -Wp,-D_FORTIFY_SOURCE=2 -O3 endif @@ -62,8 +62,8 @@ CPPFLAGS += -DBINDIR=\"$(BINDIR)\" PKG_CONFIG ?= pkg-config SYSCONFDIR = /etc -CONFFILE = $(SYSCONFDIR)/mdadm/mdadm.conf -CONFFILE2 = $(SYSCONFDIR)/mdadm.conf +CONFFILE = $(SYSCONFDIR)/mdadm.conf +CONFFILE2 = $(SYSCONFDIR)/mdadm/mdadm.conf MAILCMD =/usr/sbin/sendmail -t CONFFILEFLAGS = -DCONFFILE=\"$(CONFFILE)\" 
-DCONFFILE2=\"$(CONFFILE2)\" # Both MAP_DIR and MDMON_DIR should be somewhere that persists across the @@ -79,10 +79,14 @@ MDMON_DIR = $(RUN_DIR) # place for autoreplace cookies FAILED_SLOTS_DIR = $(RUN_DIR)/failed-slots SYSTEMD_DIR=/lib/systemd/system + +COROSYNC:=$(shell [ -d /usr/include/corosync ] || echo -DNO_COROSYNC) +DLM:=$(shell [ -f /usr/include/libdlm.h ] || echo -DNO_DLM) + DIRFLAGS = -DMAP_DIR=\"$(MAP_DIR)\" -DMAP_FILE=\"$(MAP_FILE)\" DIRFLAGS += -DMDMON_DIR=\"$(MDMON_DIR)\" DIRFLAGS += -DFAILED_SLOTS_DIR=\"$(FAILED_SLOTS_DIR)\" -CFLAGS = $(CWFLAGS) $(CXFLAGS) -DSendmail=\""$(MAILCMD)"\" $(CONFFILEFLAGS) $(DIRFLAGS) +CFLAGS = $(CWFLAGS) $(CXFLAGS) -DSendmail=\""$(MAILCMD)"\" $(CONFFILEFLAGS) $(DIRFLAGS) $(COROSYNC) $(DLM) VERSION = $(shell [ -d .git ] && git describe HEAD | sed 's/mdadm-//') VERS_DATE = $(shell [ -d .git ] && date --date="`git log -n1 --format=format:%cd --date=short`" '+%0dth %B %Y' | sed -e 's/1th/1st/' -e 's/2th/2nd/' -e 's/11st/11th/' -e 's/12nd/12th/') @@ -101,6 +105,7 @@ endif # If you want a static binary, you might uncomment these # LDFLAGS = -static # STRIP = -s +LDLIBS=-ldl INSTALL = /usr/bin/install DESTDIR = @@ -115,6 +120,12 @@ ifndef UDEVDIR UDEVDIR = /lib/udev endif +ifeq (,$(findstring s,$(MAKEFLAGS))) + ECHO=echo +else + ECHO=: +endif + OBJS = mdadm.o config.o policy.o mdstat.o ReadMe.o util.o maps.o lib.o \ Manage.o Assemble.o Build.o \ Create.o Detail.o Examine.o Grow.o Monitor.o dlink.o Kill.o Query.o \ @@ -122,7 +133,7 @@ OBJS = mdadm.o config.o policy.o mdstat.o ReadMe.o util.o maps.o lib.o \ mdopen.o super0.o super1.o super-ddf.o super-intel.o bitmap.o \ super-mbr.o super-gpt.o \ restripe.o sysfs.o sha1.o mapfile.o crc32.o sg_io.o msg.o xmalloc.o \ - platform-intel.o probe_roms.o + platform-intel.o probe_roms.o crc32c.o CHECK_OBJS = restripe.o sysfs.o maps.o lib.o xmalloc.o dlink.o @@ -176,7 +187,7 @@ mdadm : $(OBJS) | check_rundir $(CC) $(CFLAGS) $(LDFLAGS) -o mdadm $(OBJS) $(LDLIBS) mdadm.static : $(OBJS) 
$(STATICOBJS) - $(CC) $(CFLAGS) $(LDFLAGS) -static -o mdadm.static $(OBJS) $(STATICOBJS) + $(CC) $(CFLAGS) $(LDFLAGS) -static -o mdadm.static $(OBJS) $(STATICOBJS) $(LDLIBS) mdadm.tcc : $(SRCS) $(INCL) $(TCC) -o mdadm.tcc $(SRCS) @@ -186,13 +197,13 @@ mdadm.klibc : $(SRCS) $(INCL) $(CC) -nostdinc -iwithprefix include -I$(KLIBC)/klibc/include -I$(KLIBC)/linux/include -I$(KLIBC)/klibc/arch/i386/include -I$(KLIBC)/klibc/include/bits32 $(CFLAGS) $(SRCS) mdadm.Os : $(SRCS) $(INCL) - $(CC) -o mdadm.Os $(CFLAGS) $(CPPFLAGS) $(LDFLAGS) -DHAVE_STDINT_H -Os $(SRCS) + $(CC) -o mdadm.Os $(CFLAGS) $(CPPFLAGS) $(LDFLAGS) -DHAVE_STDINT_H -Os $(SRCS) $(LDLIBS) mdadm.O2 : $(SRCS) $(INCL) mdmon.O2 - $(CC) -o mdadm.O2 $(CFLAGS) $(CPPFLAGS) $(LDFLAGS) -DHAVE_STDINT_H -O2 -D_FORTIFY_SOURCE=2 $(SRCS) + $(CC) -o mdadm.O2 $(CFLAGS) $(CPPFLAGS) $(LDFLAGS) -DHAVE_STDINT_H -O2 -D_FORTIFY_SOURCE=2 $(SRCS) $(LDLIBS) mdmon.O2 : $(MON_SRCS) $(INCL) mdmon.h - $(CC) -o mdmon.O2 $(CFLAGS) $(CPPFLAGS) $(LDFLAGS) $(MON_LDFLAGS) -DHAVE_STDINT_H -O2 -D_FORTIFY_SOURCE=2 $(MON_SRCS) + $(CC) -o mdmon.O2 $(CFLAGS) $(CPPFLAGS) $(LDFLAGS) $(MON_LDFLAGS) -DHAVE_STDINT_H -O2 -D_FORTIFY_SOURCE=2 $(MON_SRCS) $(LDLIBS) # use '-z now' to guarantee no dynamic linker interactions with the monitor thread mdmon : $(MON_OBJS) | check_rundir @@ -200,7 +211,7 @@ mdmon : $(MON_OBJS) | check_rundir msg.o: msg.c msg.h test_stripe : restripe.c xmalloc.o mdadm.h - $(CC) $(CXFLAGS) $(LDFLAGS) -o test_stripe xmalloc.o -DMAIN restripe.c + $(CC) $(CFLAGS) $(CXFLAGS) $(LDFLAGS) -o test_stripe xmalloc.o -DMAIN restripe.c raid6check : raid6check.o mdadm.h $(CHECK_OBJS) $(CC) $(CXFLAGS) $(LDFLAGS) -o raid6check raid6check.o $(CHECK_OBJS) @@ -283,7 +294,7 @@ install-man: mdadm.8 md.4 mdadm.conf.5 mdmon.8 install-udev: udev-md-raid-arrays.rules udev-md-raid-assembly.rules @for file in 63-md-raid-arrays.rules 64-md-raid-assembly.rules ; \ do sed -e 's,BINDIR,$(BINDIR),g' udev-$${file#??-} > .install.tmp.1 && \ - echo $(INSTALL) -D -m 
644 udev-$${file#??-} $(DESTDIR)$(UDEVDIR)/rules.d/$$file ; \ + $(ECHO) $(INSTALL) -D -m 644 udev-$${file#??-} $(DESTDIR)$(UDEVDIR)/rules.d/$$file ; \ $(INSTALL) -D -m 644 .install.tmp.1 $(DESTDIR)$(UDEVDIR)/rules.d/$$file ; \ rm -f .install.tmp.1; \ done @@ -292,13 +303,13 @@ install-systemd: systemd/mdmon@.service @for file in mdmon@.service mdmonitor.service mdadm-last-resort@.timer \ mdadm-last-resort@.service mdadm-grow-continue@.service; \ do sed -e 's,BINDIR,$(BINDIR),g' systemd/$$file > .install.tmp.2 && \ - echo $(INSTALL) -D -m 644 systemd/$$file $(DESTDIR)$(SYSTEMD_DIR)/$$file ; \ + $(ECHO) $(INSTALL) -D -m 644 systemd/$$file $(DESTDIR)$(SYSTEMD_DIR)/$$file ; \ $(INSTALL) -D -m 644 .install.tmp.2 $(DESTDIR)$(SYSTEMD_DIR)/$$file ; \ rm -f .install.tmp.2; \ done @for file in mdadm.shutdown ; \ do sed -e 's,BINDIR,$(BINDIR),g' systemd/$$file > .install.tmp.3 && \ - echo $(INSTALL) -D -m 755 systemd/$$file $(DESTDIR)$(SYSTEMD_DIR)-shutdown/$$file ; \ + $(ECHO) $(INSTALL) -D -m 755 systemd/$$file $(DESTDIR)$(SYSTEMD_DIR)-shutdown/$$file ; \ $(INSTALL) -D -m 755 .install.tmp.3 $(DESTDIR)$(SYSTEMD_DIR)-shutdown/$$file ; \ rm -f .install.tmp.3; \ done @@ -669,6 +669,15 @@ int attempt_re_add(int fd, int tfd, struct mddev_dev *dv, disc.number = mdi.disk.number; disc.raid_disk = mdi.disk.raid_disk; disc.state = mdi.disk.state; + if (array->state & (1 << MD_SB_CLUSTERED)) { + /* extra flags are needed when adding to a cluster as + * there are two cases to distinguish + */ + if (dv->disposition == 'c') + disc.state |= (1 << MD_DISK_CANDIDATE); + else + disc.state |= (1 << MD_DISK_CLUSTER_ADD); + } if (dv->writemostly == 1) disc.state |= 1 << MD_DISK_WRITEMOSTLY; if (dv->writemostly == 2) @@ -724,7 +733,8 @@ skip_re_add: int Manage_add(int fd, int tfd, struct mddev_dev *dv, struct supertype *tst, mdu_array_info_t *array, int force, int verbose, char *devname, - char *update, unsigned long rdev, unsigned long long array_size) + char *update, unsigned long rdev, 
unsigned long long array_size, + int raid_slot) { unsigned long long ldsize; struct supertype *dev_st = NULL; @@ -815,7 +825,8 @@ int Manage_add(int fd, int tfd, struct mddev_dev *dv, } /* Make sure device is large enough */ - if (tst->sb && + if (dv->disposition != 'j' && /* skip size check for Journal */ + tst->sb && tst->ss->avail_size(tst, ldsize/512, INVALID_SECTORS) < array_size) { if (dv->disposition == 'M') @@ -914,10 +925,36 @@ int Manage_add(int fd, int tfd, struct mddev_dev *dv, } disc.major = major(rdev); disc.minor = minor(rdev); - disc.number =j; + if (raid_slot < 0) + disc.number = j; + else + disc.number = raid_slot; disc.state = 0; + + /* only add journal to array that supports journaling */ + if (dv->disposition == 'j') { + struct mdinfo mdi; + struct mdinfo *mdp; + + mdp = sysfs_read(fd, NULL, GET_ARRAY_STATE); + + if (strncmp(mdp->sysfs_array_state, "readonly", 8) != 0) { + pr_err("%s is not readonly, cannot add journal.\n", devname); + return -1; + } + + tst->ss->getinfo_super(tst, &mdi, NULL); + if (mdi.journal_device_required == 0) { + pr_err("%s does not support journal device.\n", devname); + return -1; + } + disc.raid_disk = 0; + } + if (array->not_persistent==0) { int dfd; + if (dv->disposition == 'j') + disc.state |= (1 << MD_DISK_JOURNAL) | (1 << MD_DISK_SYNC); if (dv->writemostly == 1) disc.state |= 1 << MD_DISK_WRITEMOSTLY; dfd = dev_open(dv->devname, O_RDWR | O_EXCL|O_DIRECT); @@ -955,6 +992,14 @@ int Manage_add(int fd, int tfd, struct mddev_dev *dv, } free(used); } + + if (array->state & (1 << MD_SB_CLUSTERED)) { + if (dv->disposition == 'c') + disc.state |= (1 << MD_DISK_CANDIDATE); + else + disc.state |= (1 << MD_DISK_CLUSTER_ADD); + } + if (dv->writemostly == 1) disc.state |= (1 << MD_DISK_WRITEMOSTLY); if (tst->ss->external) { @@ -1020,10 +1065,20 @@ int Manage_add(int fd, int tfd, struct mddev_dev *dv, } else { tst->ss->free_super(tst); if (ioctl(fd, ADD_NEW_DISK, &disc)) { - pr_err("add new device failed for %s as %d: %s\n", - 
dv->devname, j, strerror(errno)); + if (dv->disposition == 'j') + pr_err("Failed to hot add %s as journal, " + "please try restart %s.\n", dv->devname, devname); + else + pr_err("add new device failed for %s as %d: %s\n", + dv->devname, j, strerror(errno)); return -1; } + if (dv->disposition == 'j') { + pr_err("Journal added successfully, making %s read-write\n", devname); + if (Manage_ro(devname, fd, -1)) + pr_err("Failed to make %s read-write\n", devname); + } + } if (verbose >= 0) pr_err("added %s\n", dv->devname); @@ -1256,6 +1311,7 @@ int Manage_subdevs(char *devname, int fd, * try HOT_ADD_DISK * If that fails EINVAL, try ADD_NEW_DISK * 'S' - add the device as a spare - don't try re-add + * 'j' - add the device as a journal device * 'A' - re-add the device * 'r' - remove the device: HOT_REMOVE_DISK * device can be 'faulty' or 'detached' in which case all @@ -1274,6 +1330,7 @@ int Manage_subdevs(char *devname, int fd, * variant on 'A' * 'F' - Another variant of 'A', where the device was faulty * so must be removed from the array first. + * 'c' - confirm the device as found (for clustered environments) * * For 'f' and 'r', the device can also be a kernel-internal * name such as 'sdb'. 
@@ -1287,8 +1344,10 @@ int Manage_subdevs(char *devname, int fd, int sysfd = -1; int count = 0; /* number of actions taken */ struct mdinfo info; + struct mdinfo devinfo; int frozen = 0; int busy = 0; + int raid_slot = -1; if (ioctl(fd, GET_ARRAY_INFO, &array)) { pr_err("Cannot get array info for %s\n", @@ -1317,6 +1376,17 @@ int Manage_subdevs(char *devname, int fd, int rv; int mj,mn; + raid_slot = -1; + if (dv->disposition == 'c') { + rv = parse_cluster_confirm_arg(dv->devname, + &dv->devname, + &raid_slot); + if (rv) { + pr_err("Could not get the devname of cluster\n"); + goto abort; + } + } + if (strcmp(dv->devname, "failed") == 0 || strcmp(dv->devname, "faulty") == 0) { if (dv->disposition != 'A' @@ -1342,6 +1412,11 @@ int Manage_subdevs(char *devname, int fd, if (strcmp(dv->devname, "missing") == 0) { struct mddev_dev *add_devlist = NULL; struct mddev_dev **dp; + if (dv->disposition == 'c') { + rv = ioctl(fd, CLUSTERED_DISK_NACK, NULL); + break; + } + if (dv->disposition != 'A') { pr_err("'missing' only meaningful with --re-add\n"); goto abort; @@ -1469,14 +1544,28 @@ int Manage_subdevs(char *devname, int fd, goto abort; case 'a': case 'S': /* --add-spare */ + case 'j': /* --add-journal */ case 'A': case 'M': /* --re-add missing */ case 'F': /* --re-add faulty */ + case 'c': /* --cluster-confirm */ /* add the device */ if (subarray) { pr_err("Cannot add disks to a \'member\' array, perform this operation on the parent container\n"); goto abort; } + + /* Let's first try to write re-add to sysfs */ + if (rdev != 0 && + (dv->disposition == 'A' || dv->disposition == 'F')) { + sysfs_init_dev(&devinfo, rdev); + if (sysfs_set_str(&info, &devinfo, "state", "re-add") == 0) { + pr_err("re-add %s to %s succeed\n", + dv->devname, info.sys_name); + break; + } + } + if (dv->disposition == 'F') /* Need to remove first */ ioctl(fd, HOT_REMOVE_DISK, rdev); @@ -1505,7 +1594,7 @@ int Manage_subdevs(char *devname, int fd, } rv = Manage_add(fd, tfd, dv, tst, &array, force, 
verbose, devname, update, - rdev, array_size); + rdev, array_size, raid_slot); close(tfd); tfd = -1; if (rv < 0) @@ -1,7 +1,7 @@ /* * mdadm - manage Linux "md" devices aka RAID arrays. * - * Copyright (C) 2001-2015 Neil Brown <neilb@suse.de> + * Copyright (C) 2001-2016 Neil Brown <neilb@suse.com> * * * This program is free software; you can redistribute it and/or modify @@ -25,10 +25,10 @@ #include "mdadm.h" #ifndef VERSION -#define VERSION "3.3.4" +#define VERSION "3.4" #endif #ifndef VERS_DATE -#define VERS_DATE "3rd August 2015" +#define VERS_DATE "28th January 2016" #endif char Version[] = "mdadm - v" VERSION " - " VERS_DATE "\n"; @@ -140,6 +140,9 @@ struct option long_options[] = { {"homehost", 1, 0, HomeHost}, {"symlinks", 1, 0, Symlinks}, {"data-offset",1, 0, DataOffset}, + {"nodes",1, 0, Nodes}, /* also for --assemble */ + {"home-cluster",1, 0, ClusterName}, + {"write-journal",1, 0, WriteJournal}, /* For assemble */ {"uuid", 1, 0, 'u'}, @@ -154,6 +157,7 @@ struct option long_options[] = { /* Management */ {"add", 0, 0, Add}, {"add-spare", 0, 0, AddSpare}, + {"add-journal", 0, 0, AddJournal}, {"remove", 0, 0, Remove}, {"fail", 0, 0, Fail}, {"set-faulty",0, 0, Fail}, @@ -167,6 +171,7 @@ struct option long_options[] = { {"wait", 0, 0, WaitOpt}, {"wait-clean", 0, 0, Waitclean }, {"action", 1, 0, Action }, + {"cluster-confirm", 0, 0, ClusterConfirm}, /* For Detail/Examine */ {"brief", 0, 0, Brief}, @@ -372,6 +377,7 @@ char Help_create[] = " --name= -N : Textual name for array - max 32 characters\n" " --bitmap-chunk= : bitmap chunksize in Kilobytes.\n" " --delay= -d : bitmap update delay in seconds.\n" +" --write-journal= : Specify journal device for RAID-4/5/6 array\n" "\n" ; @@ -593,7 +599,7 @@ char Help_incr[] = ; char Help_config[] = -"The /etc/mdadm/mdadm.conf config file:\n\n" +"The /etc/mdadm.conf config file:\n\n" " The config file contains, apart from blank lines and comment lines that\n" " start with a hash(#), array lines, device lines, and various\n" 
" configuration lines.\n" @@ -32,6 +32,8 @@ static inline void sb_le_to_cpu(bitmap_super_t *sb) sb->daemon_sleep = __le32_to_cpu(sb->daemon_sleep); sb->sync_size = __le64_to_cpu(sb->sync_size); sb->write_behind = __le32_to_cpu(sb->write_behind); + sb->nodes = __le32_to_cpu(sb->nodes); + sb->sectors_reserved = __le32_to_cpu(sb->sectors_reserved); } static inline void sb_cpu_to_le(bitmap_super_t *sb) @@ -219,8 +221,12 @@ int bitmap_file_open(char *filename, struct supertype **stp) pr_err("No bitmap possible with %s metadata\n", st->ss->name); return -1; - } else - st->ss->locate_bitmap(st, fd); + } else { + if (st->ss->locate_bitmap(st, fd)) { + pr_err("%s doesn't have bitmap\n", filename); + fd = -1; + } + } *stp = st; } else { @@ -258,7 +264,7 @@ int ExamineBitmap(char *filename, int brief, struct supertype *st) int rv = 1; char buf[64]; int swap; - int fd; + int fd, i; __u32 uuid32[4]; fd = bitmap_file_open(filename, &st); @@ -285,7 +291,7 @@ int ExamineBitmap(char *filename, int brief, struct supertype *st) } printf(" Version : %d\n", sb->version); if (sb->version < BITMAP_MAJOR_LO || - sb->version > BITMAP_MAJOR_HI) { + sb->version > BITMAP_MAJOR_CLUSTERED) { pr_err("unknown bitmap version %d, either the bitmap file\n", sb->version); pr_err("is corrupted or you need to upgrade your tools\n"); @@ -315,9 +321,13 @@ int ExamineBitmap(char *filename, int brief, struct supertype *st) uuid32[2], uuid32[3]); - printf(" Events : %llu\n", (unsigned long long)sb->events); - printf(" Events Cleared : %llu\n", (unsigned long long)sb->events_cleared); - printf(" State : %s\n", bitmap_state(sb->state)); + if (sb->nodes == 0) { + printf(" Events : %llu\n", (unsigned long long)sb->events); + printf(" Events Cleared : %llu\n", (unsigned long long)sb->events_cleared); + printf(" State : %s\n", bitmap_state(sb->state)); + + } + printf(" Chunksize : %s\n", human_chunksize(sb->chunksize)); printf(" Daemon : %ds flush period\n", sb->daemon_sleep); if (sb->write_behind) @@ -327,11 
+337,40 @@ int ExamineBitmap(char *filename, int brief, struct supertype *st) printf(" Write Mode : %s\n", buf); printf(" Sync Size : %llu%s\n", (unsigned long long)sb->sync_size/2, human_size(sb->sync_size * 512)); - if (brief) - goto free_info; - printf(" Bitmap : %llu bits (chunks), %llu dirty (%2.1f%%)\n", - info->total_bits, info->dirty_bits, - 100.0 * info->dirty_bits / (info->total_bits?:1)); + + if (sb->nodes == 0) { + if (brief) + goto free_info; + printf(" Bitmap : %llu bits (chunks), %llu dirty (%2.1f%%)\n", + info->total_bits, info->dirty_bits, + 100.0 * info->dirty_bits / (info->total_bits?:1)); + } else { + printf(" Cluster nodes : %d\n", sb->nodes); + printf(" Cluster name : %-64s\n", sb->cluster_name); + for (i = 0; i < (int)sb->nodes; i++) { + if (i) { + free(info); + info = bitmap_fd_read(fd, brief); + sb = &info->sb; + } + if (sb->magic != BITMAP_MAGIC) + pr_err("invalid bitmap magic 0x%x, the bitmap file appears to be corrupted\n", sb->magic); + + printf(" Node Slot : %d\n", i); + printf(" Events : %llu\n", + (unsigned long long)sb->events); + printf(" Events Cleared : %llu\n", + (unsigned long long)sb->events_cleared); + printf(" State : %s\n", bitmap_state(sb->state)); + if (brief) + continue; + printf(" Bitmap : %llu bits (chunks), %llu dirty (%2.1f%%)\n", + info->total_bits, info->dirty_bits, + 100.0 * info->dirty_bits / (info->total_bits?:1)); + + } + } + free_info: free(info); return rv; @@ -12,6 +12,7 @@ */ #define BITMAP_MAJOR_HI 4 #define BITMAP_MAJOR_HOSTENDIAN 3 +#define BITMAP_MAJOR_CLUSTERED 5 #define BITMAP_MINOR 39 @@ -154,8 +155,11 @@ typedef struct bitmap_super_s { __u32 chunksize; /* 52 the bitmap chunk size in bytes */ __u32 daemon_sleep; /* 56 seconds between disk flushes */ __u32 write_behind; /* 60 number of outstanding write-behind writes */ - - __u8 pad[256 - 64]; /* set to zero */ + __u32 sectors_reserved; /* 64 number of 512-byte sectors that are + * reserved for the bitmap. 
*/ + __u32 nodes; /* 68 the maximum number of nodes in cluster. */ + __u8 cluster_name[64]; /* 72 cluster name to which this md belongs */ + __u8 pad[256 - 136]; /* set to zero */ } bitmap_super_t; /* notes: @@ -63,6 +63,9 @@ * but may not wrap over lines * */ +#ifndef _POSIX_C_SOURCE +#define _POSIX_C_SOURCE 200809L +#endif #ifndef CONFFILE #define CONFFILE "/etc/mdadm.conf" @@ -77,7 +80,7 @@ char DefaultAltConfFile[] = CONFFILE2; char DefaultAltConfDir[] = CONFFILE2 ".d"; enum linetype { Devices, Array, Mailaddr, Mailfrom, Program, CreateDev, - Homehost, AutoMode, Policy, PartPolicy, LTEnd }; + Homehost, HomeCluster, AutoMode, Policy, PartPolicy, LTEnd }; char *keywords[] = { [Devices] = "devices", [Array] = "array", @@ -86,6 +89,7 @@ char *keywords[] = { [Program] = "program", [CreateDev]= "create", [Homehost] = "homehost", + [HomeCluster] = "homecluster", [AutoMode] = "auto", [Policy] = "policy", [PartPolicy]="part-policy", @@ -562,6 +566,21 @@ void homehostline(char *line) } } +static char *home_cluster = NULL; +void homeclusterline(char *line) +{ + char *w; + + for (w=dl_next(line); w != line ; w=dl_next(w)) { + if (home_cluster == NULL) { + if (strcasecmp(w, "<none>")==0) + home_cluster = xstrdup(""); + else + home_cluster = xstrdup(w); + } + } +} + char auto_yes[] = "yes"; char auto_no[] = "no"; char auto_homehost[] = "homehost"; @@ -724,6 +743,9 @@ void conf_file(FILE *f) case Homehost: homehostline(line); break; + case HomeCluster: + homeclusterline(line); + break; case AutoMode: autoline(line); break; @@ -884,6 +906,12 @@ char *conf_get_homehost(int *require_homehostp) return home_host; } +char *conf_get_homecluster(void) +{ + load_conffile(); + return home_cluster; +} + struct createinfo *conf_get_create_info(void) { load_conffile(); diff --git a/crc32c.c b/crc32c.c new file mode 100644 index 00000000..156cba19 --- /dev/null +++ b/crc32c.c @@ -0,0 +1,104 @@ +/* + * Oct 28, 2015 Song Liu simplified the code and port it to mdadm + * + * Aug 8, 2011 Bob 
Pearson with help from Joakim Tjernlund and George Spelvin + * cleaned up code to current version of sparse and added the slicing-by-8 + * algorithm to the closely similar existing slicing-by-4 algorithm. + * + * Oct 15, 2000 Matt Domsch <Matt_Domsch@dell.com> + * Nicer crc32 functions/docs submitted by linux@horizon.com. Thanks! + * Code was from the public domain, copyright abandoned. Code was + * subsequently included in the kernel, thus was re-licensed under the + * GNU GPL v2. + * + * Oct 12, 2000 Matt Domsch <Matt_Domsch@dell.com> + * Same crc32 function was used in 5 other places in the kernel. + * I made one version, and deleted the others. + * There are various incantations of crc32(). Some use a seed of 0 or ~0. + * Some xor at the end with ~0. The generic crc32() function takes + * seed as an argument, and doesn't xor at the end. Then individual + * users can do whatever they need. + * drivers/net/smc9194.c uses seed ~0, doesn't xor with ~0. + * fs/jffs2 uses seed 0, doesn't xor with ~0. + * fs/partitions/efi.c uses seed ~0, xor's with ~0. + * + * This source code is licensed under the GNU General Public License, + * Version 2. See the file COPYING for more details. + */ + +#include <sys/types.h> +#include <asm/types.h> +#include <stdlib.h> + +/* + * There are multiple 16-bit CRC polynomials in common use, but this is + * *the* standard CRC-32 polynomial, first popularized by Ethernet. + * x^32+x^26+x^23+x^22+x^16+x^12+x^11+x^10+x^8+x^7+x^5+x^4+x^2+x^1+x^0 + */ +#define CRCPOLY_LE 0xedb88320 +#define CRCPOLY_BE 0x04c11db7 + +/* + * This is the CRC32c polynomial, as outlined by Castagnoli. + * x^32+x^28+x^27+x^26+x^25+x^23+x^22+x^20+x^19+x^18+x^14+x^13+x^11+x^10+x^9+ + * x^8+x^6+x^0 + */ +#define CRC32C_POLY_LE 0x82F63B78 + +/** + * crc32_le_generic() - Calculate bitwise little-endian Ethernet AUTODIN II + * CRC32/CRC32C + * @crc: seed value for computation. 
~0 for Ethernet, sometimes 0 for other + * uses, or the previous crc32/crc32c value if computing incrementally. + * @p: pointer to buffer over which CRC32/CRC32C is run + * @len: length of buffer @p + * @polynomial: CRC32/CRC32c LE polynomial + */ +static inline __u32 crc32_le_generic(__u32 crc, unsigned char const *p, + size_t len, __u32 polynomial) +{ + int i; + while (len--) { + crc ^= *p++; + for (i = 0; i < 8; i++) + crc = (crc >> 1) ^ ((crc & 1) ? polynomial : 0); + } + return crc; +} + +__u32 crc32_le(__u32 crc, unsigned char const *p, size_t len) +{ + return crc32_le_generic(crc, p, len, CRCPOLY_LE); +} + +__u32 crc32c_le(__u32 crc, unsigned char const *p, size_t len) +{ + return crc32_le_generic(crc, p, len, CRC32C_POLY_LE); +} + +/** + * crc32_be_generic() - Calculate bitwise big-endian Ethernet AUTODIN II CRC32 + * @crc: seed value for computation. ~0 for Ethernet, sometimes 0 for + * other uses, or the previous crc32 value if computing incrementally. + * @p: pointer to buffer over which CRC32 is run + * @len: length of buffer @p + * @polynomial: CRC32 BE polynomial + */ +static inline __u32 crc32_be_generic(__u32 crc, unsigned char const *p, + size_t len, __u32 polynomial) +{ + int i; + while (len--) { + crc ^= *p++ << 24; + for (i = 0; i < 8; i++) + crc = + (crc << 1) ^ ((crc & 0x80000000) ? polynomial : + 0); + } + return crc; +} + +__u32 crc32_be(__u32 crc, unsigned char const *p, size_t len) +{ + return crc32_be_generic(crc, p, len, CRCPOLY_BE); +} diff --git a/debian/changelog b/debian/changelog index cd2f9c2b..36762287 100644 --- a/debian/changelog +++ b/debian/changelog @@ -1,3 +1,9 @@ +mdadm (3.4-1) unstable; urgency=medium + + * New upstream release. + + -- Dimitri John Ledkov <xnox@ubuntu.com> Fri, 19 Feb 2016 16:18:36 +0000 + mdadm (3.3.4-1.1) unstable; urgency=medium * Non-maintainer upload. 
@@ -22,6 +22,7 @@ ANNOUNCE-3.3.1 ANNOUNCE-3.3.2 ANNOUNCE-3.3.3 ANNOUNCE-3.3.4 +ANNOUNCE-3.4 Assemble.c Build.c COPYING @@ -46,6 +47,7 @@ bitmap.h config.c crc32.c crc32.h +crc32c.c dlink.c dlink.h external-reshape-design.txt @@ -239,6 +241,7 @@ tests/19raid6auto-repair tests/19raid6check tests/19raid6repair tests/19repair-does-not-destroy +tests/20raid5journal tests/ToTest tests/check tests/env-ddf-template @@ -176,7 +176,7 @@ void map_read(struct map_ent **melp) { FILE *f; char buf[8192]; - char path[200]; + char path[201]; int uuid[4]; char devnm[32]; char metadata[30]; @@ -874,6 +874,26 @@ The list is particularly useful when recovering to a spare. If a few blocks cannot be read from the other devices, the bulk of the recovery can complete and those few bad blocks will be recorded in the bad block list. +.SS RAID456 WRITE JOURNAL + +Due to the non-atomic nature of RAID write operations, interruption of +write operations (system crash, etc.) to a RAID456 array can lead to +inconsistent parity and data loss (the so-called RAID-5 write hole). + +To plug the write hole, from Linux 4.4 (to be confirmed), +.I md +supports a write-ahead journal for RAID456. When the array is created, +an additional journal device can be added to the array through the +.IR write-journal +option. The RAID write journal works similarly to file system journals. +Before writing to the data disks, md persists data AND parity of the +stripe to the journal device. After crashes, md searches the journal +device for incomplete write operations, and replays them to the data +disks. + +When the journal device fails, the RAID array is forced to run in +read-only mode. 
+ .SS WRITE-BEHIND From Linux 2.6.14, @@ -78,6 +78,12 @@ #define MD_DISK_ACTIVE 1 /* disk is running but may not be in sync */ #define MD_DISK_SYNC 2 /* disk is in sync with the raid set */ #define MD_DISK_REMOVED 3 /* disk is in sync with the raid set */ +#define MD_DISK_CLUSTER_ADD 4 /* Initiate a disk add across the cluster + * For clustered enviroments only. + */ +#define MD_DISK_CANDIDATE 5 /* disk is added as spare (local) until confirmed + * For clustered enviroments only. + */ #define MD_DISK_WRITEMOSTLY 9 /* disk is "write-mostly" is RAID1 config. * read requests will only be sent here in @@ -85,6 +91,12 @@ */ #define MD_DISK_REPLACEMENT 17 +#define MD_DISK_JOURNAL 18 /* disk is used as the write journal in RAID-5/6 */ + +#define MD_DISK_ROLE_SPARE 0xffff +#define MD_DISK_ROLE_FAULTY 0xfffe +#define MD_DISK_ROLE_JOURNAL 0xfffd +#define MD_DISK_ROLE_MAX 0xff00 /* max value of regular disk role */ typedef struct mdp_device_descriptor_s { __u32 number; /* 0 Device number in the entire set */ @@ -106,6 +118,7 @@ typedef struct mdp_device_descriptor_s { #define MD_SB_BLOCK_CONTAINER_RESHAPE 3 /* block container wide reshapes */ #define MD_SB_BLOCK_VOLUME 4 /* block activation of array, other arrays * in container can be activated */ +#define MD_SB_CLUSTERED 5 /* MD is clustered */ #define MD_SB_BITMAP_PRESENT 8 /* bitmap may be present nearby */ typedef struct mdp_superblock_s { @@ -195,4 +208,62 @@ static inline __u64 md_event(mdp_super_t *sb) { return (ev<<32)| sb->events_lo; } +struct r5l_payload_header { + __u16 type; + __u16 flags; +} __attribute__ ((__packed__)); + +enum r5l_payload_type { + R5LOG_PAYLOAD_DATA = 0, + R5LOG_PAYLOAD_PARITY = 1, + R5LOG_PAYLOAD_FLUSH = 2, +}; + +struct r5l_payload_data_parity { + struct r5l_payload_header header; + __u32 size; /* sector. data/parity size. each 4k has a checksum */ + __u64 location; /* sector. For data, it's raid sector. 
For + parity, it's stripe sector */ + __u32 checksum[]; +} __attribute__ ((__packed__)); + +enum r5l_payload_data_parity_flag { + R5LOG_PAYLOAD_FLAG_DISCARD = 1, /* payload is discard */ + /* + * RESHAPED/RESHAPING is only set when there is reshape activity. Note, + * both data/parity of a stripe should have the same flag set + * + * RESHAPED: reshape is running, and this stripe finished reshape + * RESHAPING: reshape is running, and this stripe isn't reshaped + * */ + R5LOG_PAYLOAD_FLAG_RESHAPED = 2, + R5LOG_PAYLOAD_FLAG_RESHAPING = 3, +}; + +struct r5l_payload_flush { + struct r5l_payload_header header; + __u32 size; /* flush_stripes size, bytes */ + __u64 flush_stripes[]; +} __attribute__ ((__packed__)); + +enum r5l_payload_flush_flag { + R5LOG_PAYLOAD_FLAG_FLUSH_STRIPE = 1, /* data represents whole stripe */ +}; + +struct r5l_meta_block { + __u32 magic; + __u32 checksum; + __u8 version; + __u8 __zero_pading_1; + __u16 __zero_pading_2; + __u32 meta_size; /* whole size of the block */ + + __u64 seq; + __u64 position; /* sector, start from rdev->data_offset, current position */ + struct r5l_payload_header payloads[]; +} __attribute__ ((__packed__)); + +#define R5LOG_VERSION 0x1 +#define R5LOG_MAGIC 0x6433c509 + #endif @@ -44,6 +44,7 @@ #define STOP_ARRAY _IO (MD_MAJOR, 0x32) #define STOP_ARRAY_RO _IO (MD_MAJOR, 0x33) #define RESTART_ARRAY_RW _IO (MD_MAJOR, 0x34) +#define CLUSTERED_DISK_NACK _IO (MD_MAJOR, 0x35) typedef struct mdu_version_s { int major; @@ -58,7 +59,7 @@ typedef struct mdu_array_info_s { int major_version; int minor_version; int patch_version; - int ctime; + unsigned int ctime; int level; int size; int nr_disks; @@ -69,7 +70,7 @@ typedef struct mdu_array_info_s { /* * Generic state information */ - int utime; /* 0 Superblock update time */ + unsigned int utime; /* 0 Superblock update time */ int state; /* 1 State bits (clean, ...) 
*/ int active_disks; /* 2 Number of currently active disks */ int working_disks; /* 3 Number of working disks */ @@ -5,7 +5,7 @@ .\" the Free Software Foundation; either version 2 of the License, or .\" (at your option) any later version. .\" See file COPYING in distribution for details. -.TH MDADM 8 "" v3.3.4 +.TH MDADM 8 "" v3.4 .SH NAME mdadm \- manage MD devices .I aka @@ -267,13 +267,13 @@ the exact meaning of this option in different contexts. .TP .BR \-c ", " \-\-config= Specify the config file or directory. Default is to use -.B /etc/mdadm/mdadm.conf +.B /etc/mdadm.conf and -.BR /etc/mdadm/mdadm.conf.d , +.BR /etc/mdadm.conf.d , or if those are missing then -.B /etc/mdadm.conf +.B /etc/mdadm/mdadm.conf and -.BR /etc/mdadm.conf.d . +.BR /etc/mdadm/mdadm.conf.d . If the config file given is .B "partitions" then nothing will be read, but @@ -422,6 +422,12 @@ This functionality is currently only provided by and .BR \-\-monitor . +.TP +.B \-\-home\-cluster= +specifies the cluster name for the md device. The md device can be assembled +only on the cluster which matches the name specified. If this option is not +provided, mdadm tries to detect the cluster name automatically. + .SH For create, build, or grow: .TP @@ -701,7 +707,12 @@ and so is replicated on all devices. If the word .B "none" is given with .B \-\-grow -mode, then any bitmap that is present is removed. +mode, then any bitmap that is present is removed. If the word +.B "clustered" +is given, the array is created for a clustered environment. One bitmap +is created for each node as defined by the +.B \-\-nodes +parameter and are stored internally. To help catch typing errors, the filename must contain at least one slash ('/') if it is a real file (not 'internal' or 'none'). @@ -973,6 +984,18 @@ However for RAID0, it is not possible to add spares. So to increase the number of devices in a RAID0, it is necessary to set the new number of devices, and to add the new devices, in the same command. 
+.TP +.BR \-\-nodes +Only works when the array is for clustered environment. It specifies +the maximum number of nodes in the cluster that will use this device +simultaneously. If not specified, this defaults to 4. + +.TP +.BR \-\-write-journal +Specify journal device for the RAID-4/5/6 array. The journal device +should be a SSD with reasonable lifetime. + + .SH For assemble: .TP @@ -1087,7 +1110,9 @@ argument given to this flag can be one of .BR summaries , .BR uuid , .BR name , +.BR nodes , .BR homehost , +.BR home-cluster , .BR resync , .BR byteorder , .BR devicesize , @@ -1142,6 +1167,13 @@ of the array as stored in the superblock. This is only supported for version-1 superblocks. The +.B nodes +option will change the +.I nodes +of the array as stored in the bitmap superblock. This option only +works for a clustered environment. + +The .B homehost option will change the .I homehost @@ -1150,6 +1182,11 @@ same as updating the UUID. For version-1 superblocks, this involves updating the name. The +.B home\-cluster +option will change the cluster name as recorded in the superblock and +bitmap. This option only works for clustered environment. + +The .B resync option will cause the array to be marked .I dirty @@ -1396,6 +1433,15 @@ will avoid reading from these devices if possible. .BR \-\-readwrite Subsequent devices that are added or re\-added will have the 'write-mostly' flag cleared. +.TP +.BR \-\-cluster\-confirm +Confirm the existence of the device. This is issued in response to an \-\-add +request by a node in a cluster. When a node adds a device it sends a message +to all nodes in the cluster to look for a device with a UUID. This translates +to a udev notification with the UUID of the device to be added and the slot +number. The receiving node must acknowledge this message +with \-\-cluster\-confirm. Valid arguments are <slot>:<devicename> in case +the device is found or <slot>:missing in case the device is not found. 
.P Each of these options requires that the first device listed is the array @@ -1803,9 +1849,9 @@ The config file is only used if explicitly named with or requested with (a possibly implicit) .BR \-\-scan . In the later case, -.B /etc/mdadm/mdadm.conf -or .B /etc/mdadm.conf +or +.B /etc/mdadm/mdadm.conf is used. If @@ -3099,7 +3145,7 @@ uses this to find arrays when is given in Misc mode, and to monitor array reconstruction on Monitor mode. -.SS /etc/mdadm/mdadm.conf (or /etc/mdadm.conf) +.SS /etc/mdadm.conf The config file lists which devices may be scanned to see if they contain MD super block, and gives identifying information @@ -3107,7 +3153,7 @@ they contain MD super block, and gives identifying information .BR mdadm.conf (5) for more details. -.SS /etc/mdadm/mdadm.conf.d (or /etc/mdadm.conf.d) +.SS /etc/mdadm.conf.d A directory containing configuration files which are read in lexical order. @@ -74,6 +74,7 @@ int main(int argc, char *argv[]) .require_homehost = 1, }; struct shape s = { + .journaldisks = 0, .level = UnSet, .layout = UnSet, .bitmap_chunk = UnSet, @@ -189,6 +190,7 @@ int main(int argc, char *argv[]) case 'a': case Add: case AddSpare: + case AddJournal: case 'r': case Remove: case Replace: @@ -196,6 +198,7 @@ int main(int argc, char *argv[]) case 'f': case Fail: case ReAdd: /* re-add */ + case ClusterConfirm: if (!mode) { newmode = MANAGE; shortopt = short_bitmap_options; @@ -588,7 +591,23 @@ int main(int argc, char *argv[]) } ident.raid_disks = s.raiddisks; continue; - + case O(ASSEMBLE, Nodes): + case O(CREATE, Nodes): + c.nodes = parse_num(optarg); + if (c.nodes <= 0) { + pr_err("invalid number for the number of cluster nodes: %s\n", + optarg); + exit(2); + } + continue; + case O(CREATE, ClusterName): + case O(ASSEMBLE, ClusterName): + c.homecluster = optarg; + if (strlen(c.homecluster) > 64) { + pr_err("Cluster name too big.\n"); + exit(ERANGE); + } + continue; case O(CREATE,'x'): /* number of spare (eXtra) disks */ if (s.sparedisks) { 
pr_err("spare-devices set twice: %d and %s\n", @@ -726,6 +745,10 @@ int main(int argc, char *argv[]) continue; if (strcmp(c.update, "homehost")==0) continue; + if (strcmp(c.update, "home-cluster")==0) + continue; + if (strcmp(c.update, "nodes")==0) + continue; if (strcmp(c.update, "devicesize")==0) continue; if (strcmp(c.update, "no-bitmap")==0) @@ -734,6 +757,8 @@ int main(int argc, char *argv[]) continue; if (strcmp(c.update, "no-bbl") == 0) continue; + if (strcmp(c.update, "force-no-bbl") == 0) + continue; if (strcmp(c.update, "metadata") == 0) continue; if (strcmp(c.update, "revert-reshape") == 0) @@ -764,10 +789,10 @@ int main(int argc, char *argv[]) Name, c.update); } fprintf(outf, "Valid --update options are:\n" - " 'sparc2.2', 'super-minor', 'uuid', 'name', 'resync',\n" - " 'summaries', 'homehost', 'byteorder', 'devicesize',\n" + " 'sparc2.2', 'super-minor', 'uuid', 'name', 'nodes', 'resync',\n" + " 'summaries', 'homehost', 'home-cluster', 'byteorder', 'devicesize',\n" " 'no-bitmap', 'metadata', 'revert-reshape'\n" - " 'bbl', 'no-bbl'\n" + " 'bbl', 'no-bbl', 'force-no-bbl'\n" ); exit(outf == stdout ? 
0 : 2); @@ -785,8 +810,9 @@ int main(int argc, char *argv[]) c.update = optarg; if (strcmp(c.update, "devicesize") != 0 && strcmp(c.update, "bbl") != 0 && + strcmp(c.update, "force-no-bbl") != 0 && strcmp(c.update, "no-bbl") != 0) { - pr_err("only 'devicesize', 'bbl' and 'no-bbl' can be updated with --re-add\n"); + pr_err("only 'devicesize', 'bbl', 'no-bbl', and 'force-no-bbl' can be updated with --re-add\n"); exit(2); } continue; @@ -903,6 +929,13 @@ int main(int argc, char *argv[]) case O(MANAGE,AddSpare): /* add drive - never re-add */ devmode = 'S'; continue; + case O(MANAGE,AddJournal): /* add journal */ + if (s.journaldisks && (s.level < 4 || s.level > 6)) { + pr_err("--add-journal is only supported for RAID level 4/5/6.\n"); + exit(2); + } + devmode = 'j'; + continue; case O(MANAGE,ReAdd): devmode = 'A'; continue; @@ -919,6 +952,9 @@ int main(int argc, char *argv[]) * remove the device */ devmode = 'f'; continue; + case O(MANAGE, ClusterConfirm): + devmode = 'c'; + continue; case O(MANAGE,Replace): /* Mark these devices for replacement */ devmode = 'R'; @@ -1097,6 +1133,15 @@ int main(int argc, char *argv[]) s.bitmap_file = optarg; continue; } + if (strcmp(optarg, "clustered")== 0) { + s.bitmap_file = optarg; + /* Set the default number of cluster nodes + * to 4 if not already set by user + */ + if (c.nodes < 1) + c.nodes = 4; + continue; + } /* probable typo */ pr_err("bitmap file must contain a '/', or be 'internal', or 'none'\n" " not '%s'\n", optarg); @@ -1137,6 +1182,23 @@ int main(int argc, char *argv[]) case O(INCREMENTAL, IncrementalPath): remove_path = optarg; continue; + case O(CREATE, WriteJournal): + if (s.journaldisks) { + pr_err("Please specify only one journal device for the array.\n"); + pr_err("Ignoring --write-journal %s...\n", optarg); + continue; + } + dv = xmalloc(sizeof(*dv)); + dv->devname = optarg; + dv->disposition = 'j'; /* WriteJournal */ + dv->used = 0; + dv->next = NULL; + *devlistend = dv; + devlistend = &dv->next; + 
devs_found++; + + s.journaldisks = 1; + continue; } /* We have now processed all the valid options. Anything else is * an error @@ -1164,6 +1226,11 @@ int main(int argc, char *argv[]) exit(0); } + if (s.journaldisks && (s.level < 4 || s.level > 6)) { + pr_err("--write-journal is only supported for RAID level 4/5/6.\n"); + exit(2); + } + if (!mode && devs_found) { mode = MISC; devmode = 'Q'; @@ -1260,6 +1327,20 @@ int main(int argc, char *argv[]) c.require_homehost = 0; } + rv = 0; + + set_hooks(); /* set hooks from libs */ + + if (c.homecluster == NULL && (c.nodes > 0)) { + c.homecluster = conf_get_homecluster(); + if (c.homecluster == NULL) + rv = get_cluster_name(&c.homecluster); + if (rv) { + pr_err("The md can't get cluster name\n"); + exit(1); + } + } + if (c.backup_file && data_offset != INVALID_SECTORS) { pr_err("--backup-file and --data-offset are incompatible\n"); exit(2); @@ -1279,7 +1360,6 @@ int main(int argc, char *argv[]) /* --scan implied --brief unless -vv */ c.brief = 1; - rv = 0; switch(mode) { case MANAGE: /* readonly, add/remove, readwrite, runstop */ @@ -1366,8 +1446,9 @@ int main(int argc, char *argv[]) } if (s.bitmap_file) { - if (strcmp(s.bitmap_file, "internal")==0) { - pr_err("'internal' bitmaps not supported with --build\n"); + if (strcmp(s.bitmap_file, "internal")==0 || + strcmp(s.bitmap_file, "clustered") == 0) { + pr_err("'internal' and 'clustered' bitmaps not supported with --build\n"); rv |= 1; break; } @@ -1377,6 +1458,21 @@ int main(int argc, char *argv[]) case CREATE: if (c.delay == 0) c.delay = DEFAULT_BITMAP_DELAY; + + if (c.nodes) { + if (!s.bitmap_file || strcmp(s.bitmap_file, "clustered") != 0) { + pr_err("--nodes argument only compatible with --bitmap=clustered\n"); + rv = 1; + break; + } + + if (s.level != 1) { + pr_err("--bitmap=clustered is currently supported with RAID mirror only\n"); + rv = 1; + break; + } + } + if (s.write_behind && !s.bitmap_file) { pr_err("write-behind mode requires a bitmap.\n"); rv = 1; @@ -1442,8 
+1538,6 @@ int main(int argc, char *argv[]) else c.delay = 60; } - if (c.delay == 0) - c.delay = 60; rv= Monitor(devlist, mailaddr, program, &c, daemonise, oneshot, dosyslog, pidfile, increments, diff --git a/mdadm.conf.5 b/mdadm.conf.5 index 542e2635..18512cb0 100644 --- a/mdadm.conf.5 +++ b/mdadm.conf.5 @@ -8,7 +8,7 @@ .SH NAME mdadm.conf \- configuration for management of Software RAID with mdadm .SH SYNOPSIS -/etc/mdadm/mdadm.conf +/etc/mdadm.conf .SH DESCRIPTION .PP .I mdadm @@ -35,6 +35,7 @@ extern __off64_t lseek64 __P ((int __fd, __off64_t __offset, int __whence)); #include <sys/types.h> #include <sys/stat.h> +#include <stdint.h> #include <stdlib.h> #include <time.h> #include <sys/time.h> @@ -51,6 +52,32 @@ extern __off64_t lseek64 __P ((int __fd, __off64_t __offset, int __whence)); #define srandom srand #endif +#ifdef NO_COROSYNC +#define CS_OK 1 +typedef uint64_t cmap_handle_t; +#else +#include <corosync/cmap.h> +#endif + +#ifndef NO_DLM +#include <libdlm.h> +#include <errno.h> +#else +#define LKF_NOQUEUE 0x00000001 +#define LKF_CONVERT 0x00000004 +#define LKM_PWMODE 4 +#define EUNLOCK 0x10002 + +typedef void *dlm_lshandle_t; + +struct dlm_lksb { + int sb_status; + uint32_t sb_lkid; + char sb_flags; + char *sb_lvbptr; +}; +#endif + #include <linux/kdev_t.h> /*#include <linux/fs.h> */ #include <sys/mount.h> @@ -162,6 +189,31 @@ extern __off64_t lseek64 __P ((int __fd, __off64_t __offset, int __whence)); #endif /* __KLIBC__ */ /* + * Check at compile time that something is of a particular type. + * Always evaluates to 1 so you may use it easily in comparisons. +*/ + +#define typecheck(type,x) \ +({ type __dummy; \ + typeof(x) __dummy2; \ + (void)(&__dummy == &__dummy2); \ + 1; \ +}) + +/* + * These inlines deal with timer wrapping correctly. + * + * time_after(a,b) returns true if the time a is after time b. 
+*/ + +#define time_after(a,b) \ + (typecheck(unsigned int, a) && \ + typecheck(unsigned int, b) && \ + ((int)((b) - (a)) < 0)) + +#define time_before(a,b) time_after(b,a) + +/* * min()/max()/clamp() macros that also do * strict type-checking.. See the * "unnecessary" pointer comparison. @@ -210,6 +262,9 @@ struct mdinfo { * for native metadata it is * reshape_active field mirror */ + int journal_device_required; + int journal_clean; + /* During reshape we can sometimes change the data_offset to avoid * over-writing still-valid data. We need to know if there is space. * So getinfo_super will fill in space_before and space_after in sectors. @@ -251,6 +306,8 @@ struct mdinfo { #define DS_UNBLOCK 2048 int prev_state, curr_state, next_state; + /* info read from sysfs */ + char sysfs_array_state[20]; }; struct createinfo { @@ -313,6 +370,7 @@ enum special_options { ManageOpt, Add, AddSpare, + AddJournal, Remove, Fail, Replace, @@ -344,6 +402,10 @@ enum special_options { Dump, Restore, Action, + Nodes, + ClusterName, + ClusterConfirm, + WriteJournal, }; enum prefix_standard { @@ -351,6 +413,12 @@ enum prefix_standard { IEC }; +enum bitmap_update { + NoUpdate, + NameUpdate, + NodeNumUpdate, +}; + /* structures read from config file */ /* List of mddevice names and identifiers * Identifiers can be: @@ -418,11 +486,14 @@ struct context { char *backup_file; int invalid_backup; char *action; + int nodes; + char *homecluster; }; struct shape { int raiddisks; int sparedisks; + int journaldisks; int level; int layout; char *layout_str; @@ -521,6 +592,7 @@ enum sysfs_read_flags { GET_SIZE = (1 << 22), GET_STATE = (1 << 23), GET_ERROR = (1 << 24), + GET_ARRAY_STATE = (1 << 25), }; /* If fd >= 0, get the array it is open on, @@ -528,6 +600,7 @@ enum sysfs_read_flags { */ extern int sysfs_open(char *devnm, char *devname, char *attr); extern void sysfs_init(struct mdinfo *mdi, int fd, char *devnm); +extern void sysfs_init_dev(struct mdinfo *mdi, unsigned long devid); extern void 
sysfs_free(struct mdinfo *sra); extern struct mdinfo *sysfs_read(int fd, char *devnm, unsigned long options); extern int sysfs_attr_match(const char *attr, const char *str); @@ -747,7 +820,8 @@ extern struct superswitch { * readwrite - clear the WriteMostly1 bit in the superblock devflags * no-bitmap - clear any record that a bitmap is present. * bbl - add a bad-block-log if possible - * no-bbl - remove and bad-block-log is it is empty. + * no-bbl - remove any bad-block-log is it is empty. + * force-no-bbl - remove any bad-block-log even if empty. * revert-reshape - If a reshape is in progress, modify metadata so * it will resume going in the opposite direction. */ @@ -830,11 +904,11 @@ extern struct superswitch { /* Seek 'fd' to start of write-intent-bitmap. Must be an * md-native format bitmap */ - void (*locate_bitmap)(struct supertype *st, int fd); + int (*locate_bitmap)(struct supertype *st, int fd); /* if add_internal_bitmap succeeded for existing array, this * writes it out. */ - int (*write_bitmap)(struct supertype *st, int fd); + int (*write_bitmap)(struct supertype *st, int fd, enum bitmap_update update); /* Free the superblock and any other allocated data */ void (*free_super)(struct supertype *st); @@ -1018,6 +1092,8 @@ struct supertype { */ int devcnt; int retry_soon; + int nodes; + char *cluster_name; struct mdinfo *devs; @@ -1264,6 +1340,7 @@ extern int parse_uuid(char *str, int uuid[4]); extern int parse_layout_10(char *layout); extern int parse_layout_faulty(char *layout); extern long parse_num(char *num); +extern int parse_cluster_confirm_arg(char *inp, char **devname, int *slot); extern int check_ext2(int fd, char *name); extern int check_reiser(int fd, char *name); extern int check_raid(int fd, char *name); @@ -1294,6 +1371,7 @@ extern char *conf_get_mailaddr(void); extern char *conf_get_mailfrom(void); extern char *conf_get_program(void); extern char *conf_get_homehost(int *require_homehostp); +extern char *conf_get_homecluster(void); extern 
char *conf_line(FILE *file); extern char *conf_word(FILE *file, int allow_key); extern void print_quoted(char *str); @@ -1403,6 +1481,45 @@ extern char *fd2devnm(int fd); extern int in_initrd(void); +struct cmap_hooks { + void *cmap_handle; /* corosync lib related */ + + int (*initialize)(cmap_handle_t *handle); + int (*get_string)(cmap_handle_t handle, + const char *string, + char **name); + int (*finalize)(cmap_handle_t handle); +}; + +extern void set_cmap_hooks(void); +extern void set_hooks(void); + +struct dlm_hooks { + void *dlm_handle; /* dlm lib related */ + + dlm_lshandle_t (*create_lockspace)(const char *name, + unsigned int mode); + int (*release_lockspace)(const char *name, dlm_lshandle_t ls, + int force); + int (*ls_lock)(dlm_lshandle_t lockspace, uint32_t mode, + struct dlm_lksb *lksb, uint32_t flags, + const void *name, unsigned int namelen, + uint32_t parent, void (*astaddr) (void *astarg), + void *astarg, void (*bastaddr) (void *astarg), + void *range); + int (*ls_unlock)(dlm_lshandle_t lockspace, uint32_t lkid, + uint32_t flags, struct dlm_lksb *lksb, + void *astarg); + int (*ls_get_fd)(dlm_lshandle_t ls); + int (*dispatch)(int fd); +}; + +extern int get_cluster_name(char **name); +extern int dlm_funs_ready(void); +extern int cluster_get_dlmlock(int *lockid); +extern int cluster_release_dlmlock(int lockid); +extern void set_dlm_hooks(void); + #define _ROUND_UP(val, base) (((val) + (base) - 1) & ~(base - 1)) #define ROUND_UP(val, base) _ROUND_UP(val, (typeof(val))(base)) #define ROUND_UP_PTR(ptr, base) ((typeof(ptr)) \ @@ -1,6 +1,6 @@ Summary: mdadm is used for controlling Linux md devices (aka RAID arrays) Name: mdadm -Version: 3.3.4 +Version: 3.4 Release: 1 Source: http://www.kernel.org/pub/linux/utils/raid/mdadm/mdadm-%{version}.tar.gz URL: http://neil.brown.name/blog/mdadm diff --git a/mdassemble.8 b/mdassemble.8 index 601c1d10..d0c83c39 100644 --- a/mdassemble.8 +++ b/mdassemble.8 @@ -1,5 +1,5 @@ .\" -*- nroff -*- -.TH MDASSEMBLE 8 "" v3.3.4 
+.TH MDASSEMBLE 8 "" v3.4 .SH NAME mdassemble \- assemble MD devices .I aka @@ -40,7 +40,7 @@ There are no options to .SH FILES -.SS /etc/mdadm/mdadm.conf +.SS /etc/mdadm.conf The config file lists which devices may be scanned to see if they contain MD super block, and gives identifying information @@ -1,5 +1,5 @@ .\" See file COPYING in distribution for details. -.TH MDMON 8 "" v3.3.4 +.TH MDMON 8 "" v3.4 .SH NAME mdmon \- monitor MD external metadata arrays @@ -235,7 +235,7 @@ static int make_control_sock(char *devname) addr.sun_family = PF_LOCAL; strcpy(addr.sun_path, path); umask(077); /* ensure no world write access */ - if (bind(sfd, &addr, sizeof(addr)) < 0) { + if (bind(sfd, (struct sockaddr*)&addr, sizeof(addr)) < 0) { close(sfd); return -1; } @@ -170,7 +170,7 @@ int connect_monitor(char *devname) addr.sun_family = PF_LOCAL; strcpy(addr.sun_path, path); - if (connect(sfd, &addr, sizeof(addr)) < 0) { + if (connect(sfd, (struct sockaddr*)&addr, sizeof(addr)) < 0) { close(sfd); return -1; } diff --git a/platform-intel.c b/platform-intel.c index edb86795..88818f34 100644 --- a/platform-intel.c +++ b/platform-intel.c @@ -33,8 +33,6 @@ static int devpath_to_ll(const char *dev_path, const char *entry, unsigned long long *val); -static __u16 devpath_to_vendor(const char *dev_path); - static void free_sys_dev(struct sys_dev **list) { while (*list) { @@ -57,6 +55,7 @@ struct sys_dev *find_driver_devices(const char *bus, const char *driver) struct dirent *de; struct sys_dev *head = NULL; struct sys_dev *list = NULL; + struct sys_dev *vmd = NULL; enum sys_dev_type type; unsigned long long dev_id; unsigned long long class; @@ -65,17 +64,25 @@ struct sys_dev *find_driver_devices(const char *bus, const char *driver) type = SYS_DEV_SAS; else if (strcmp(driver, "ahci") == 0) type = SYS_DEV_SATA; - else if (strcmp(driver, "nvme") == 0) + else if (strcmp(driver, "nvme") == 0) { + /* if looking for nvme devs, first look for vmd */ + vmd = find_driver_devices("pci", "vmd"); 
type = SYS_DEV_NVME; + } else if (strcmp(driver, "vmd") == 0) + type = SYS_DEV_VMD; else type = SYS_DEV_UNKNOWN; sprintf(path, "/sys/bus/%s/drivers/%s", bus, driver); driver_dir = opendir(path); - if (!driver_dir) + if (!driver_dir) { + if (vmd) + free_sys_dev(&vmd); return NULL; + } for (de = readdir(driver_dir); de; de = readdir(driver_dir)) { int n; + int skip = 0; /* is 'de' a device? check that the 'subsystem' link exists and * that its target matches 'bus' @@ -95,8 +102,19 @@ struct sys_dev *find_driver_devices(const char *bus, const char *driver) sprintf(path, "/sys/bus/%s/drivers/%s/%s", bus, driver, de->d_name); - /* if it's not Intel device skip it. */ - if (devpath_to_vendor(path) != 0x8086) + /* if searching for nvme - skip vmd connected one */ + if (type == SYS_DEV_NVME) { + struct sys_dev *dev; + char *rp = realpath(path, NULL); + for (dev = vmd; dev; dev = dev->next) { + if ((strncmp(dev->path, rp, strlen(dev->path)) == 0)) + skip = 1; + } + free(rp); + } + + /* if it's not Intel device or mark as VMD connected - skip it. */ + if (devpath_to_vendor(path) != 0x8086 || skip == 1) continue; if (devpath_to_ll(path, "device", &dev_id) != 0) @@ -122,12 +140,28 @@ struct sys_dev *find_driver_devices(const char *bus, const char *driver) list->dev_id = (__u16) dev_id; list->class = (__u32) class; list->type = type; + /* Each VMD device (domain) adds separate PCI bus, it is better to + * store path as a path to that bus (easier further determination which + * NVMe dev is connected to this particular VMD domain). 
+ */ + if (type == SYS_DEV_VMD) { + sprintf(path, "/sys/bus/%s/drivers/%s/%s/domain/device", + bus, driver, de->d_name); + } list->path = realpath(path, NULL); list->next = NULL; if ((list->pci_id = strrchr(list->path, '/')) != NULL) list->pci_id++; } closedir(driver_dir); + + if (vmd) { + if (list) + list->next = vmd; + else + head = vmd; + } + return head; } @@ -160,7 +194,7 @@ static int devpath_to_ll(const char *dev_path, const char *entry, unsigned long return n; } -static __u16 devpath_to_vendor(const char *dev_path) +__u16 devpath_to_vendor(const char *dev_path) { char path[strlen(dev_path) + strlen("/vendor") + 1]; char vendor[7]; @@ -196,6 +230,7 @@ struct sys_dev *find_intel_devices(void) isci = find_driver_devices("pci", "isci"); ahci = find_driver_devices("pci", "ahci"); + /* Searching for NVMe will return list of NVMe and VMD controllers */ nvme = find_driver_devices("pci", "nvme"); if (!isci && !ahci) { @@ -430,6 +465,7 @@ static const struct imsm_orom *find_imsm_hba_orom(struct sys_dev *hba) #define AHCI_PROP "RstSataV" #define AHCI_SSATA_PROP "RstsSatV" #define AHCI_CSATA_PROP "RstCSatV" +#define VMD_PROP "RstUefiV" #define VENDOR_GUID \ EFI_GUID(0x193dfefa, 0xa445, 0x4302, 0x99, 0xd8, 0xef, 0x3a, 0xad, 0x1a, 0x04, 0xc6) @@ -545,15 +581,21 @@ const struct imsm_orom *find_imsm_efi(struct sys_dev *hba) if (!csata) csata = add_orom(&orom); add_orom_device_id(csata, hba->dev_id); + csata->type = hba->type; return &csata->orom; } } + if (hba->type == SYS_DEV_VMD) { + err = read_efi_variable(&orom, sizeof(orom), VMD_PROP, VENDOR_GUID); + } + if (err) return NULL; ret = add_orom(&orom); add_orom_device_id(ret, hba->dev_id); + ret->type = hba->type; return &ret->orom; } @@ -583,6 +625,7 @@ const struct imsm_orom *find_imsm_nvme(struct sys_dev *hba) nvme_orom = add_orom(&nvme_orom_compat); } add_orom_device_id(nvme_orom, hba->dev_id); + nvme_orom->type = SYS_DEV_NVME; return &nvme_orom->orom; } @@ -667,3 +710,32 @@ int disk_attached_to_hba(int fd, const char 
*hba_path) return rc; } + +char *vmd_domain_to_controller(struct sys_dev *hba, char *buf) +{ + struct dirent *ent; + DIR *dir; + char path[PATH_MAX]; + + if (!hba) + return NULL; + + if (hba->type != SYS_DEV_VMD) + return NULL; + + dir = opendir("/sys/bus/pci/drivers/vmd"); + + for (ent = dir ? readdir(dir) : NULL; ent; ent = readdir(dir)) { + sprintf(path, "/sys/bus/pci/drivers/vmd/%s/domain/device", + ent->d_name); + + if (!realpath(path, buf)) + continue; + + if (strncmp(buf, hba->path, strlen(buf)) == 0) { + sprintf(path, "/sys/bus/pci/drivers/vmd/%s", ent->d_name); + return realpath(path, buf); + } + } + return NULL; +} diff --git a/platform-intel.h b/platform-intel.h index 695d6c66..a8ae85f4 100644 --- a/platform-intel.h +++ b/platform-intel.h @@ -189,6 +189,7 @@ enum sys_dev_type { SYS_DEV_SAS, SYS_DEV_SATA, SYS_DEV_NVME, + SYS_DEV_VMD, SYS_DEV_MAX }; @@ -213,6 +214,7 @@ struct devid_list { struct orom_entry { struct imsm_orom orom; struct devid_list *devid_list; + enum sys_dev_type type; struct orom_entry *next; }; @@ -229,6 +231,7 @@ static inline char *guid_str(char *buf, struct efi_guid guid) } char *diskfd_to_devpath(int fd); +__u16 devpath_to_vendor(const char *dev_path); struct sys_dev *find_driver_devices(const char *bus, const char *driver); struct sys_dev *find_intel_devices(void); const struct imsm_orom *find_imsm_capability(struct sys_dev *hba); @@ -241,3 +244,4 @@ const char *get_sys_dev_type(enum sys_dev_type); const struct orom_entry *get_orom_entry_by_device_id(__u16 dev_id); const struct imsm_orom *get_orom_by_device_id(__u16 device_id); struct sys_dev *device_by_id(__u16 device_id); +char *vmd_domain_to_controller(struct sys_dev *hba, char *buf); diff --git a/raid6check.c b/raid6check.c index cb8522e5..ad7ffe7e 100644 --- a/raid6check.c +++ b/raid6check.c @@ -349,7 +349,8 @@ int check_stripes(struct mdinfo *info, int *source, unsigned long long *offsets, if (!tables_ready) make_tables(); - posix_memalign((void**)&stripe_buf, 4096, 
raid_disks * chunk_size); + if (posix_memalign((void**)&stripe_buf, 4096, raid_disks * chunk_size) != 0) + exit(4); block_index_for_slot += 2; blocks += 2; blocks_page += 2; @@ -434,7 +434,7 @@ void raid6_datap_recov(int disks, size_t bytes, int faila, uint8_t **ptrs, /* Try to find out if a specific disk has a problem */ int raid6_check_disks(int data_disks, int start, int chunk_size, int level, int layout, int diskP, int diskQ, - char *p, char *q, char **stripes) + uint8_t *p, uint8_t *q, char **stripes) { int i; int data_id, diskD; @@ -827,8 +827,8 @@ int test_stripes(int *source, unsigned long long *offsets, char *stripe_buf = xmalloc(raid_disks * chunk_size); char **stripes = xmalloc(raid_disks * sizeof(char*)); char **blocks = xmalloc(raid_disks * sizeof(char*)); - char *p = xmalloc(chunk_size); - char *q = xmalloc(chunk_size); + uint8_t *p = xmalloc(chunk_size); + uint8_t *q = xmalloc(chunk_size); int i; int diskP, diskQ; @@ -22,7 +22,7 @@ #include <stdio.h> -#if 1 /* defined HAVE_LIMITS_H || _LIBC */ +#if defined HAVE_LIMITS_H || _LIBC # include <limits.h> #endif @@ -33,9 +33,9 @@ the resulting executable. Locally running cross-compiled executables is usually not possible. 
*/ -#if 1 /* def _LIBC */ -# include <stdint.h> -typedef uint32_t sha1_uint32; +#ifdef _LIBC +# include <sys/types.h> +typedef u_int32_t sha1_uint32; typedef uintptr_t sha1_uintptr; #else # define INT_MAX_32_BITS 2147483647 diff --git a/super-intel.c b/super-intel.c index 95a72b6a..90b7b6de 100644 --- a/super-intel.c +++ b/super-intel.c @@ -510,7 +510,8 @@ static const char *_sys_dev_type[] = { [SYS_DEV_UNKNOWN] = "Unknown", [SYS_DEV_SAS] = "SAS", [SYS_DEV_SATA] = "SATA", - [SYS_DEV_NVME] = "NVMe" + [SYS_DEV_NVME] = "NVMe", + [SYS_DEV_VMD] = "VMD" }; const char *get_sys_dev_type(enum sys_dev_type type) @@ -565,6 +566,10 @@ static int attach_hba_to_super(struct intel_super *super, struct sys_dev *device if (device->type != hba->type) return 2; + /* Always forbid spanning between VMD domains (seen as different controllers by mdadm) */ + if (device->type == SYS_DEV_VMD && !path_attached_to_hba(device->path, hba->path)) + return 2; + /* Multiple same type HBAs can be used if they share the same OROM */ const struct imsm_orom *device_orom = get_orom_by_device_id(device->dev_id); @@ -1761,6 +1766,57 @@ static int ahci_enumerate_ports(const char *hba_path, int port_count, int host_b return err; } +static int print_vmd_attached_devs(struct sys_dev *hba) +{ + struct dirent *ent; + DIR *dir; + char path[292]; + char link[256]; + char *c, *rp; + + if (hba->type != SYS_DEV_VMD) + return 1; + + /* scroll through /sys/dev/block looking for devices attached to + * this hba + */ + dir = opendir("/sys/bus/pci/drivers/nvme"); + for (ent = dir ? readdir(dir) : NULL; ent; ent = readdir(dir)) { + int n; + + /* is 'ent' a device? 
check that the 'subsystem' link exists and + * that its target matches 'bus' + */ + sprintf(path, "/sys/bus/pci/drivers/nvme/%s/subsystem", + ent->d_name); + n = readlink(path, link, sizeof(link)); + if (n < 0 || n >= (int)sizeof(link)) + continue; + link[n] = '\0'; + c = strrchr(link, '/'); + if (!c) + continue; + if (strncmp("pci", c+1, strlen("pci")) != 0) + continue; + + sprintf(path, "/sys/bus/pci/drivers/nvme/%s", ent->d_name); + /* if not a intel NVMe - skip it*/ + if (devpath_to_vendor(path) != 0x8086) + continue; + + rp = realpath(path, NULL); + if (!rp) + continue; + + if (path_attached_to_hba(rp, hba->path)) { + printf(" NVMe under VMD : %s\n", rp); + } + free(rp); + } + + return 0; +} + static void print_found_intel_controllers(struct sys_dev *elem) { for (; elem; elem = elem->next) { @@ -1771,7 +1827,12 @@ static void print_found_intel_controllers(struct sys_dev *elem) fprintf(stderr, "SAS "); else if (elem->type == SYS_DEV_NVME) fprintf(stderr, "NVMe "); - fprintf(stderr, "RAID controller"); + + if (elem->type == SYS_DEV_VMD) + fprintf(stderr, "VMD domain"); + else + fprintf(stderr, "RAID controller"); + if (elem->pci_id) fprintf(stderr, " at %s", elem->pci_id); fprintf(stderr, ".\n"); @@ -1935,8 +1996,10 @@ static int detail_platform_imsm(int verbose, int enumerate_only, char *controlle if (controller_path && (compare_paths(hba->path, controller_path) != 0)) continue; if (!find_imsm_capability(hba)) { + char buf[PATH_MAX]; pr_err("imsm capabilities not found for controller: %s (type %s)\n", - hba->path, get_sys_dev_type(hba->type)); + hba->type == SYS_DEV_VMD ? 
vmd_domain_to_controller(hba, buf) : hba->path, + get_sys_dev_type(hba->type)); continue; } result = 0; @@ -1951,13 +2014,27 @@ static int detail_platform_imsm(int verbose, int enumerate_only, char *controlle const struct orom_entry *entry; for (entry = orom_entries; entry; entry = entry->next) { - print_imsm_capability(&entry->orom); + if (entry->type == SYS_DEV_VMD) { + for (hba = list; hba; hba = hba->next) { + if (hba->type == SYS_DEV_VMD) { + char buf[PATH_MAX]; + print_imsm_capability(&entry->orom); + printf(" I/O Controller : %s (%s)\n", + vmd_domain_to_controller(hba, buf), get_sys_dev_type(hba->type)); + print_vmd_attached_devs(hba); + printf("\n"); + } + } + continue; + } - if (imsm_orom_is_nvme(&entry->orom)) { + print_imsm_capability(&entry->orom); + if (entry->type == SYS_DEV_NVME) { for (hba = list; hba; hba = hba->next) { if (hba->type == SYS_DEV_NVME) printf(" NVMe Device : %s\n", hba->path); } + printf("\n"); continue; } @@ -2000,16 +2077,25 @@ static int export_detail_platform_imsm(int verbose, char *controller_path) for (hba = list; hba; hba = hba->next) { if (controller_path && (compare_paths(hba->path,controller_path) != 0)) continue; - if (!find_imsm_capability(hba) && verbose > 0) - pr_err("IMSM_DETAIL_PLATFORM_ERROR=NO_IMSM_CAPABLE_DEVICE_UNDER_%s\n", hba->path); + if (!find_imsm_capability(hba) && verbose > 0) { + char buf[PATH_MAX]; + pr_err("IMSM_DETAIL_PLATFORM_ERROR=NO_IMSM_CAPABLE_DEVICE_UNDER_%s\n", + hba->type == SYS_DEV_VMD ? 
vmd_domain_to_controller(hba, buf) : hba->path); + } else result = 0; } const struct orom_entry *entry; - for (entry = orom_entries; entry; entry = entry->next) + for (entry = orom_entries; entry; entry = entry->next) { + if (entry->type == SYS_DEV_VMD) { + for (hba = list; hba; hba = hba->next) + print_imsm_capability_export(&entry->orom); + continue; + } print_imsm_capability_export(&entry->orom); + } return result; } @@ -3862,12 +3948,14 @@ static int find_intel_hba_capability(int fd, struct intel_super *super, char *de if (devname) { struct intel_hba *hba = super->hba; - pr_err("%s is attached to Intel(R) %s RAID controller (%s),\n" - " but the container is assigned to Intel(R) %s RAID controller (", + pr_err("%s is attached to Intel(R) %s %s (%s),\n" + " but the container is assigned to Intel(R) %s %s (", devname, get_sys_dev_type(hba_name->type), + hba_name->type == SYS_DEV_VMD ? "domain" : "RAID controller", hba_name->pci_id ? : "Err!", - get_sys_dev_type(super->hba->type)); + get_sys_dev_type(super->hba->type), + hba->type == SYS_DEV_VMD ? "domain" : "RAID controller"); while (hba) { fprintf(stderr, "%s", hba->pci_id ? : "Err!"); @@ -3876,7 +3964,8 @@ static int find_intel_hba_capability(int fd, struct intel_super *super, char *de hba = hba->next; } fprintf(stderr, ").\n" - " Mixing devices attached to different controllers is not allowed.\n"); + " Mixing devices attached to different %s is not allowed.\n", + hba_name->type == SYS_DEV_VMD ? "VMD domains" : "controllers"); } return 2; } @@ -5878,7 +5967,6 @@ count_volumes(struct intel_hba *hba, int dpa, int verbose) devid_list = entry->devid_list; for (dv = devid_list; dv; dv = dv->next) { - struct md_list *devlist = NULL; struct sys_dev *device = device_by_id(dv->devid); char *hba_path; @@ -5889,6 +5977,14 @@ count_volumes(struct intel_hba *hba, int dpa, int verbose) else return 0; + /* VMD has one orom entry for all domain, but spanning is not allowed. 
+ * VMD arrays should be counted per domain (controller), so skip + * domains that are not the given one. + */ + if ((hba->type == SYS_DEV_VMD) && + (strncmp(device->path, hba->path, strlen(device->path)) != 0)) + continue; + devlist = get_devices(hba_path); /* if no intel devices return zero volumes */ if (devlist == NULL) @@ -9150,7 +9246,7 @@ int validate_container_imsm(struct mdinfo *info) return 1; } - if (orom != orom2) { + if ((orom != orom2) || ((hba->type == SYS_DEV_VMD) && (hba != hba2))) { pr_err("WARNING - IMSM container assembled with disks under different HBAs!\n" " This operation is not supported and can lead to data loss.\n"); return 1; @@ -10277,7 +10373,7 @@ int wait_for_reshape_imsm(struct mdinfo *sra, int ndata) if (sysfs_fd_get_ll(fd, &completed) < 0) { dprintf("cannot read reshape_position (no reshape in progres)\n"); close(fd); - return 0; + return 1; } if (completed > position_to_set) { @@ -10297,11 +10393,14 @@ int wait_for_reshape_imsm(struct mdinfo *sra, int ndata) do { char action[20]; - sysfs_wait(fd, NULL); + int timeout = 3000; + sysfs_wait(fd, &timeout); if (sysfs_get_str(sra, NULL, "sync_action", action, 20) > 0 && - strncmp(action, "reshape", 7) != 0) - break; + strncmp(action, "reshape", 7) != 0) { + close(fd); + return -1; + } if (sysfs_fd_get_ll(fd, &completed) < 0) { dprintf("cannot read reshape_position (in loop)\n"); close(fd); @@ -10563,7 +10662,7 @@ static int imsm_manage_reshape( sra->reshape_progress = next_step; /* wait until reshape finish */ - if (wait_for_reshape_imsm(sra, ndata) < 0) { + if (wait_for_reshape_imsm(sra, ndata)) { dprintf("wait_for_reshape_imsm returned error!\n"); goto abort; } @@ -10601,7 +10700,6 @@ static int imsm_manage_reshape( ret_val = 1; abort: free(buf); - abort_reshape(sra); return ret_val; } @@ -405,7 +405,8 @@ static void getinfo_super0(struct supertype *st, struct mdinfo *info, char *map) info->array.utime = sb->utime; info->array.chunk_size = sb->chunk_size; info->array.state = sb->state; 
- info->component_size = sb->size*2; + info->component_size = sb->size; + info->component_size *= 2; if (sb->state & (1<<MD_SB_BITMAP_PRESENT)) info->bitmap_offset = 8; @@ -900,7 +901,7 @@ static int write_init_super0(struct supertype *st) rv = store_super0(st, di->fd); if (rv == 0 && (sb->state & (1<<MD_SB_BITMAP_PRESENT))) - rv = st->ss->write_bitmap(st, di->fd); + rv = st->ss->write_bitmap(st, di->fd, NoUpdate); if (rv) pr_err("failed to write superblock to %s\n", @@ -1155,16 +1156,16 @@ static int add_internal_bitmap0(struct supertype *st, int *chunkp, return 1; } -static void locate_bitmap0(struct supertype *st, int fd) +static int locate_bitmap0(struct supertype *st, int fd) { unsigned long long dsize; unsigned long long offset; if (!get_dev_size(fd, NULL, &dsize)) - return; + return -1; if (dsize < MD_RESERVED_SECTORS*512) - return; + return -1; offset = MD_NEW_SIZE_SECTORS(dsize>>9); @@ -1173,9 +1174,10 @@ static void locate_bitmap0(struct supertype *st, int fd) offset += MD_SB_BYTES; lseek64(fd, offset, 0); + return 0; } -static int write_bitmap0(struct supertype *st, int fd) +static int write_bitmap0(struct supertype *st, int fd, enum bitmap_update update) { unsigned long long dsize; unsigned long long offset; @@ -1,7 +1,7 @@ /* * mdadm - manage Linux "md" devices aka RAID arrays. 
* - * Copyright (C) 2001-2009 Neil Brown <neilb@suse.de> + * Copyright (C) 2001-2016 Neil Brown <neilb@suse.com> * * * This program is free software; you can redistribute it and/or modify @@ -68,7 +68,10 @@ struct mdp_superblock_1 { __u64 data_offset; /* sector start of data, often 0 */ __u64 data_size; /* sectors in this device that can be used for data */ __u64 super_offset; /* sector start of this superblock */ - __u64 recovery_offset;/* sectors before this offset (from data_offset) have been recovered */ + union { + __u64 recovery_offset;/* sectors before this offset (from data_offset) have been recovered */ + __u64 journal_tail;/* journal tail of journal device (from data_offset) */ + }; __u32 dev_number; /* permanent identifier of this device - not role in raid */ __u32 cnt_corrected_read; /* number of read errors that were corrected by re-writing */ __u8 device_uuid[16]; /* user-space setable, ignored by kernel */ @@ -125,6 +128,8 @@ struct misc_dev_info { * backwards anyway. */ #define MD_FEATURE_NEW_OFFSET 64 /* new_offset must be honoured */ +#define MD_FEATURE_BITMAP_VERSIONED 256 /* bitmap version number checked properly */ +#define MD_FEATURE_JOURNAL 512 /* support write journal */ #define MD_FEATURE_ALL (MD_FEATURE_BITMAP_OFFSET \ |MD_FEATURE_RECOVERY_OFFSET \ |MD_FEATURE_RESHAPE_ACTIVE \ @@ -132,8 +137,39 @@ struct misc_dev_info { |MD_FEATURE_REPLACEMENT \ |MD_FEATURE_RESHAPE_BACKWARDS \ |MD_FEATURE_NEW_OFFSET \ + |MD_FEATURE_BITMAP_VERSIONED \ + |MD_FEATURE_JOURNAL \ ) +#ifndef MDASSEMBLE +static int role_from_sb(struct mdp_superblock_1 *sb) +{ + unsigned int d; + int role; + + d = __le32_to_cpu(sb->dev_number); + if (d < __le32_to_cpu(sb->max_dev)) + role = __le16_to_cpu(sb->dev_roles[d]); + else + role = MD_DISK_ROLE_SPARE; + return role; +} +#endif + +/* return how many bytes are needed for bitmap, for cluster-md each node + * should have it's own bitmap */ +static unsigned int calc_bitmap_size(bitmap_super_t *bms, unsigned int boundary) +{ + 
unsigned long long bits, bytes; + + bits = __le64_to_cpu(bms->sync_size) / (__le32_to_cpu(bms->chunksize)>>9); + bytes = (bits+7) >> 3; + bytes += sizeof(bitmap_super_t); + bytes = ROUND_UP(bytes, boundary); + + return bytes; +} + static unsigned int calc_sb_1_csum(struct mdp_superblock_1 * sb) { unsigned int disk_csum, csum; @@ -256,6 +292,7 @@ static int awrite(struct align_fd *afd, void *buf, int len) static void examine_super1(struct supertype *st, char *homehost) { struct mdp_superblock_1 *sb = st->sb; + bitmap_super_t *bms = (bitmap_super_t*)(((char*)sb)+MAX_SB_SIZE); time_t atime; unsigned int d; int role; @@ -289,6 +326,8 @@ static void examine_super1(struct supertype *st, char *homehost) strncmp(sb->set_name, homehost, l) == 0) printf(" (local to host %s)", homehost); printf("\n"); + if (bms->nodes > 0 && (__le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET)) + printf(" Cluster Name : %-64s\n", bms->cluster_name); atime = __le64_to_cpu(sb->ctime) & 0xFFFFFFFFFFULL; printf(" Creation Time : %.24s\n", ctime(&atime)); c=map_num(pers, __le32_to_cpu(sb->level)); @@ -446,25 +485,23 @@ static void examine_super1(struct supertype *st, char *homehost) /* This turns out to just be confusing */ printf(" Array Slot : %d (", __le32_to_cpu(sb->dev_number)); for (i= __le32_to_cpu(sb->max_dev); i> 0 ; i--) - if (__le16_to_cpu(sb->dev_roles[i-1]) != 0xffff) + if (__le16_to_cpu(sb->dev_roles[i-1]) != MD_DISK_ROLE_SPARE) break; for (d=0; d < i; d++) { int role = __le16_to_cpu(sb->dev_roles[d]); if (d) printf(", "); - if (role == 0xffff) printf("empty"); - else if(role == 0xfffe) printf("failed"); + if (role == MD_DISK_ROLE_SPARE) printf("empty"); + else if(role == MD_DISK_ROLE_FAULTY) printf("failed"); else printf("%d", role); } printf(")\n"); #endif printf(" Device Role : "); - d = __le32_to_cpu(sb->dev_number); - if (d < __le32_to_cpu(sb->max_dev)) - role = __le16_to_cpu(sb->dev_roles[d]); - else - role = 0xFFFF; - if (role >= 0xFFFE) + role = role_from_sb(sb); + if 
(role >= MD_DISK_ROLE_FAULTY) printf("spare\n"); + else if (role == MD_DISK_ROLE_JOURNAL) + printf("Journal\n"); else if (sb->feature_map & __cpu_to_le32(MD_FEATURE_REPLACEMENT)) printf("Replacement device %d\n", role); else @@ -493,7 +530,7 @@ static void examine_super1(struct supertype *st, char *homehost) faulty = 0; for (i=0; i< __le32_to_cpu(sb->max_dev); i++) { int role = __le16_to_cpu(sb->dev_roles[i]); - if (role == 0xFFFE) + if (role == MD_DISK_ROLE_FAULTY) faulty++; } if (faulty) printf(" %d failed", faulty); @@ -681,12 +718,8 @@ static int copy_metadata1(struct supertype *st, int from, int to) /* have the header, can calculate * correct bitmap bytes */ bitmap_super_t *bms; - int bits; bms = (void*)buf; - bits = __le64_to_cpu(bms->sync_size) / (__le32_to_cpu(bms->chunksize)>>9); - bytes = (bits+7) >> 3; - bytes += sizeof(bitmap_super_t); - bytes = ROUND_UP(bytes, 512); + bytes = calc_bitmap_size(bms, 512); if (n > bytes) n = bytes; } @@ -740,6 +773,7 @@ err: static void detail_super1(struct supertype *st, char *homehost) { struct mdp_superblock_1 *sb = st->sb; + bitmap_super_t *bms = (bitmap_super_t*)(((char*)sb) + MAX_SB_SIZE); int i; int l = homehost ? strlen(homehost) : 0; @@ -748,6 +782,8 @@ static void detail_super1(struct supertype *st, char *homehost) sb->set_name[l] == ':' && strncmp(sb->set_name, homehost, l) == 0) printf(" (local to host %s)", homehost); + if (bms->nodes > 0 && (__le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET)) + printf("\n Cluster Name : %-64s", bms->cluster_name); printf("\n UUID : "); for (i=0; i<16; i++) { if ((i&3)==0 && i != 0) printf(":"); @@ -891,6 +927,8 @@ static void getinfo_super1(struct supertype *st, struct mdinfo *info, char *map) info->array.state = (__le64_to_cpu(sb->resync_offset) == MaxSector) ? 
1 : 0; + if (__le32_to_cpu(bsb->nodes) > 1) + info->array.state |= (1 << MD_SB_CLUSTERED); info->data_offset = __le64_to_cpu(sb->data_offset); info->component_size = __le64_to_cpu(sb->size); @@ -902,7 +940,7 @@ static void getinfo_super1(struct supertype *st, struct mdinfo *info, char *map) info->disk.number = __le32_to_cpu(sb->dev_number); if (__le32_to_cpu(sb->dev_number) >= __le32_to_cpu(sb->max_dev) || __le32_to_cpu(sb->dev_number) >= MAX_DEVS) - role = 0xfffe; + role = MD_DISK_ROLE_FAULTY; else role = __le16_to_cpu(sb->dev_roles[__le32_to_cpu(sb->dev_number)]); @@ -943,7 +981,7 @@ static void getinfo_super1(struct supertype *st, struct mdinfo *info, char *map) size /= 512; bmend += size; if (bmend > earliest) - bmend = earliest; + earliest = bmend; } if (sb->bblog_offset && sb->bblog_size) { unsigned long long bbend = super_offset; @@ -969,12 +1007,17 @@ static void getinfo_super1(struct supertype *st, struct mdinfo *info, char *map) info->disk.raid_disk = -1; switch(role) { - case 0xFFFF: + case MD_DISK_ROLE_SPARE: info->disk.state = 0; /* spare: not active, not sync, not faulty */ break; - case 0xFFFE: + case MD_DISK_ROLE_FAULTY: info->disk.state = 1; /* faulty */ break; + case MD_DISK_ROLE_JOURNAL: + info->disk.state = (1 << MD_DISK_JOURNAL); + info->disk.raid_disk = role; + info->space_after = (misc->device_size - info->data_offset) % 8; /* journal uses all 4kB blocks*/ + break; default: info->disk.state = 6; /* active and in sync */ info->disk.raid_disk = role; @@ -1022,7 +1065,7 @@ static void getinfo_super1(struct supertype *st, struct mdinfo *info, char *map) map[i] = 0; for (i = 0; i < __le32_to_cpu(sb->max_dev); i++) { role = __le16_to_cpu(sb->dev_roles[i]); - if (/*role == 0xFFFF || */role < (unsigned) info->array.raid_disks) { + if (/*role == MD_DISK_ROLE_SPARE || */role < (unsigned) info->array.raid_disks) { working++; if (map && role < map_disks) map[role] = 1; @@ -1030,6 +1073,9 @@ static void getinfo_super1(struct supertype *st, struct mdinfo 
*info, char *map) } info->array.working_disks = working; + if (sb->feature_map & __le32_to_cpu(MD_FEATURE_JOURNAL)) + info->journal_device_required = 1; + info->journal_clean = 0; } static struct mdinfo *container_content1(struct supertype *st, char *subarray) @@ -1054,7 +1100,18 @@ static int update_super1(struct supertype *st, struct mdinfo *info, * ignored. */ int rv = 0; + int lockid; struct mdp_superblock_1 *sb = st->sb; + bitmap_super_t *bms = (bitmap_super_t*)(((char*)sb) + MAX_SB_SIZE); + + if (bms->version == BITMAP_MAJOR_CLUSTERED && dlm_funs_ready()) { + rv = cluster_get_dlmlock(&lockid); + if (rv) { + pr_err("Cannot get dlmlock in %s return %d\n", __func__, rv); + cluster_release_dlmlock(lockid); + return rv; + } + } if (strcmp(update, "homehost") == 0 && homehost) { @@ -1094,8 +1151,10 @@ static int update_super1(struct supertype *st, struct mdinfo *info, int want; if (info->disk.state & (1<<MD_DISK_ACTIVE)) want = info->disk.raid_disk; + else if (info->disk.state & (1<<MD_DISK_JOURNAL)) + want = MD_DISK_ROLE_JOURNAL; else - want = 0xFFFF; + want = MD_DISK_ROLE_SPARE; if (sb->dev_roles[d] != __cpu_to_le16(want)) { sb->dev_roles[d] = __cpu_to_le16(want); rv = 1; @@ -1120,7 +1179,7 @@ static int update_super1(struct supertype *st, struct mdinfo *info, unsigned int max = __le32_to_cpu(sb->max_dev); for (i=0 ; i < max ; i++) - if (__le16_to_cpu(sb->dev_roles[i]) >= 0xfffe) + if (__le16_to_cpu(sb->dev_roles[i]) >= MD_DISK_ROLE_FAULTY) break; sb->dev_number = __cpu_to_le32(i); info->disk.number = i; @@ -1225,6 +1284,11 @@ static int update_super1(struct supertype *st, struct mdinfo *info, sb->bblog_shift = 0; sb->bblog_offset = 0; } + } else if (strcmp(update, "force-no-bbl") == 0) { + sb->feature_map &= ~ __cpu_to_le32(MD_FEATURE_BAD_BLOCKS); + sb->bblog_size = 0; + sb->bblog_shift = 0; + sb->bblog_offset = 0; } else if (strcmp(update, "name") == 0) { if (info->name[0] == 0) sprintf(info->name, "%d", info->array.md_minor); @@ -1245,7 +1309,7 @@ static int 
update_super1(struct supertype *st, struct mdinfo *info, (st->sb + MAX_SB_SIZE + BM_SUPER_SIZE); sb->data_size = __cpu_to_le64( misc->device_size - __le64_to_cpu(sb->data_offset)); - } else if (strcmp(update, "revert-reshape") == 0) { + } else if (strncmp(update, "revert-reshape", 14) == 0) { rv = -2; if (!(sb->feature_map & __cpu_to_le32(MD_FEATURE_RESHAPE_ACTIVE))) pr_err("No active reshape to revert on %s\n", @@ -1255,6 +1319,24 @@ static int update_super1(struct supertype *st, struct mdinfo *info, unsigned long long reshape_sectors; long reshape_chunk; rv = 0; + /* If the reshape hasn't started, just stop it. + * It is conceivable that a stripe was modified but + * the metadata not updated. In that case the backup + * should have been used to get passed the critical stage. + * If that couldn't happen, the "-nobackup" version + * will be used. + */ + if (strcmp(update, "revert-reshape-nobackup") == 0 && + sb->reshape_position == 0 && + (__le32_to_cpu(sb->delta_disks) > 0 || + (__le32_to_cpu(sb->delta_disks) == 0 && + !(sb->feature_map & __cpu_to_le32(MD_FEATURE_RESHAPE_BACKWARDS))))) { + sb->feature_map &= ~__cpu_to_le32(MD_FEATURE_RESHAPE_ACTIVE); + sb->raid_disks = __cpu_to_le32(__le32_to_cpu(sb->raid_disks) - + __le32_to_cpu(sb->delta_disks)); + sb->delta_disks = 0; + goto done; + } /* reshape_position is a little messy. * Its value must be a multiple of the larger * chunk size, and of the "after" data disks. 
@@ -1301,6 +1383,7 @@ static int update_super1(struct supertype *st, struct mdinfo *info,
 				sb->new_offset = __cpu_to_le32(-offset_delta);
 				sb->data_size = __cpu_to_le64(__le64_to_cpu(sb->data_size) - offset_delta);
 			}
+		done:;
 		}
 	} else if (strcmp(update, "_reshape_progress")==0)
 		sb->reshape_position = __cpu_to_le64(info->reshape_progress);
@@ -1312,6 +1395,9 @@ static int update_super1(struct supertype *st, struct mdinfo *info,
 		rv = -1;
 
 	sb->sb_csum = calc_sb_1_csum(sb);
+	if (bms->version == BITMAP_MAJOR_CLUSTERED && dlm_funs_ready())
+		cluster_release_dlmlock(lockid);
+
 	return rv;
 }
 
@@ -1415,13 +1501,26 @@ static int add_to_super1(struct supertype *st, mdu_disk_info_t *dk,
 	struct mdp_superblock_1 *sb = st->sb;
 	__u16 *rp = sb->dev_roles + dk->number;
 	struct devinfo *di, **dip;
+	bitmap_super_t *bms = (bitmap_super_t*)(((char*)sb) + MAX_SB_SIZE);
+	int rv, lockid = 0; /* 0: never hand an indeterminate id to the release call */
+
+	if (bms->version == BITMAP_MAJOR_CLUSTERED && dlm_funs_ready()) {
+		rv = cluster_get_dlmlock(&lockid);
+		if (rv) {
+			pr_err("Cannot get dlmlock in %s return %d\n", __func__, rv);
+			cluster_release_dlmlock(lockid);
+			return rv;
+		}
+	}
 
 	if ((dk->state & 6) == 6) /* active, sync */
 		*rp = __cpu_to_le16(dk->raid_disk);
+	else if (dk->state & (1<<MD_DISK_JOURNAL))
+		*rp = __cpu_to_le16(MD_DISK_ROLE_JOURNAL); /* dev_roles[] is little-endian */
 	else if ((dk->state & ~2) == 0) /* active or idle -> spare */
-		*rp = 0xffff;
+		*rp = __cpu_to_le16(MD_DISK_ROLE_SPARE);
 	else
-		*rp = 0xfffe;
+		*rp = __cpu_to_le16(MD_DISK_ROLE_FAULTY);
 
 	if (dk->number >= (int)__le32_to_cpu(sb->max_dev) &&
 	    __le32_to_cpu(sb->max_dev) < MAX_DEVS)
@@ -1442,11 +1541,14 @@ static int add_to_super1(struct supertype *st, mdu_disk_info_t *dk,
 	di->next = NULL;
 	*dip = di;
 
+	if (bms->version == BITMAP_MAJOR_CLUSTERED && dlm_funs_ready())
+		cluster_release_dlmlock(lockid);
+
 	return 0;
 }
 #endif
 
-static void locate_bitmap1(struct supertype *st, int fd);
+static int locate_bitmap1(struct supertype *st, int fd);
 
 static int store_super1(struct supertype *st, int fd)
 {
@@ -1455,6 +1557,17 @@ static int store_super1(struct supertype *st, int fd)
 	struct align_fd afd;
 	int sbsize;
 	unsigned long long dsize;
+	bitmap_super_t *bms = (bitmap_super_t*)(((char*)sb) + MAX_SB_SIZE);
+	int rv, lockid = 0; /* 0: never hand an indeterminate id to the release call */
+
+	if (bms->version == BITMAP_MAJOR_CLUSTERED && dlm_funs_ready()) {
+		rv = cluster_get_dlmlock(&lockid);
+		if (rv) {
+			pr_err("Cannot get dlmlock in %s return %d\n", __func__, rv);
+			cluster_release_dlmlock(lockid);
+			return rv;
+		}
+	}
 
 	if (!get_dev_size(fd, NULL, &dsize))
 		return 1;
@@ -1515,6 +1628,9 @@ static int store_super1(struct supertype *st, int fd)
 		}
 	}
 	fsync(fd);
+	if (bms->version == BITMAP_MAJOR_CLUSTERED && dlm_funs_ready())
+		cluster_release_dlmlock(lockid);
+
 	return 0;
 }
 
@@ -1537,7 +1653,60 @@ static unsigned long choose_bm_space(unsigned long devsize)
 
 static void free_super1(struct supertype *st);
 
+#define META_BLOCK_SIZE 4096
+__u32 crc32c_le(__u32 crc, unsigned char const *p, size_t len);
+
 #ifndef MDASSEMBLE
+/* Write an empty journal (r5l) meta block at the start of the data area
+ * of the device open on fd.  The block's fields, including the checksum,
+ * are stored little-endian, and the little-endian sb->data_offset is
+ * converted to host order before it is used as the seek target.
+ * Returns 0 on success and 1 on failure.
+ */
+static int write_empty_r5l_meta_block(struct supertype *st, int fd)
+{
+	struct r5l_meta_block *mb;
+	struct mdp_superblock_1 *sb = st->sb;
+	struct align_fd afd;
+	__u32 crc;
+
+	init_afd(&afd, fd);
+
+	if (posix_memalign((void**)&mb, 4096, META_BLOCK_SIZE) != 0) {
+		pr_err("Could not allocate memory for the meta block.\n");
+		return 1;
+	}
+
+	memset(mb, 0, META_BLOCK_SIZE);
+
+	mb->magic = __cpu_to_le32(R5LOG_MAGIC);
+	mb->version = R5LOG_VERSION;
+	mb->meta_size = __cpu_to_le32(sizeof(struct r5l_meta_block));
+	mb->seq = __cpu_to_le64(random32());
+	mb->position = __cpu_to_le64(0);
+
+	crc = crc32c_le(0xffffffff, sb->set_uuid, sizeof(sb->set_uuid));
+	crc = crc32c_le(crc, (void *)mb, META_BLOCK_SIZE);
+	mb->checksum = __cpu_to_le32(crc);
+
+	if (lseek64(fd, __le64_to_cpu(sb->data_offset) * 512, 0) < 0LL) {
+		pr_err("cannot seek to offset of the meta block\n");
+		goto fail_to_write;
+	}
+
+	if (awrite(&afd, mb, META_BLOCK_SIZE) != META_BLOCK_SIZE) {
+		pr_err("failed to store write the meta block \n");
+		goto fail_to_write;
+	}
+	fsync(fd);
+
+	free(mb);
+	return 0;
+
+fail_to_write:
+	free(mb);
+	return 1;
+}
+
static int write_init_super1(struct supertype *st) { struct mdp_superblock_1 *sb = st->sb; @@ -1551,6 +1715,11 @@ static int write_init_super1(struct supertype *st) unsigned long long data_offset; for (di = st->info; di; di = di->next) { + if (di->disk.state & (1 << MD_DISK_JOURNAL)) + sb->feature_map |= MD_FEATURE_JOURNAL; + } + + for (di = st->info; di; di = di->next) { if (di->disk.state & (1 << MD_DISK_FAULTY)) continue; if (di->fd < 0) @@ -1573,7 +1742,8 @@ static int write_init_super1(struct supertype *st) if (rfd >= 0) close(rfd); - sb->events = 0; + if (!(di->disk.state & (1<<MD_DISK_JOURNAL))) + sb->events = 0; refst = dup_super(st); if (load_super1(refst, di->fd, NULL)==0) { @@ -1681,15 +1851,23 @@ static int write_init_super1(struct supertype *st) rv = -EINVAL; goto out; } - if (conf_get_create_info()->bblist == 0) { + /* Disable badblock log on clusters, or when explicitly requested */ + if (st->nodes > 0 || conf_get_create_info()->bblist == 0) { sb->bblog_size = 0; sb->bblog_offset = 0; } sb->sb_csum = calc_sb_1_csum(sb); rv = store_super1(st, di->fd); + + if (rv == 0 && (di->disk.state & (1 << MD_DISK_JOURNAL))) { + rv = write_empty_r5l_meta_block(st, di->fd); + if (rv) + goto error_out; + } + if (rv == 0 && (__le32_to_cpu(sb->feature_map) & 1)) - rv = st->ss->write_bitmap(st, di->fd); + rv = st->ss->write_bitmap(st, di->fd, NoUpdate); close(di->fd); di->fd = -1; if (rv) @@ -2054,7 +2232,7 @@ add_internal_bitmap1(struct supertype *st, bbl_size = -bbl_offset; if (!may_change || (room < 3*2 && - __le32_to_cpu(sb->max_dev) <= 384)) { + __le32_to_cpu(sb->max_dev) <= 384)) { room = 3*2; offset = 1*2; bbl_size = 0; @@ -2144,32 +2322,45 @@ add_internal_bitmap1(struct supertype *st, bms->daemon_sleep = __cpu_to_le32(delay); bms->sync_size = __cpu_to_le64(size); bms->write_behind = __cpu_to_le32(write_behind); + bms->nodes = __cpu_to_le32(st->nodes); + if (st->nodes) + sb->feature_map = __cpu_to_le32(__le32_to_cpu(sb->feature_map) + | 
MD_FEATURE_BITMAP_VERSIONED); + if (st->cluster_name) + strncpy((char *)bms->cluster_name, + st->cluster_name, strlen(st->cluster_name)); *chunkp = chunk; return 1; } -static void locate_bitmap1(struct supertype *st, int fd) +static int locate_bitmap1(struct supertype *st, int fd) { unsigned long long offset; struct mdp_superblock_1 *sb; int mustfree = 0; + int ret; if (!st->sb) { if (st->ss->load_super(st, fd, NULL)) - return; /* no error I hope... */ + return -1; /* no error I hope... */ mustfree = 1; } sb = st->sb; + if ((__le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET)) + ret = 0; + else + ret = -1; offset = __le64_to_cpu(sb->super_offset); offset += (int32_t) __le32_to_cpu(sb->bitmap_offset); if (mustfree) free(sb); lseek64(fd, offset<<9, 0); + return ret; } -static int write_bitmap1(struct supertype *st, int fd) +static int write_bitmap1(struct supertype *st, int fd, enum bitmap_update update) { struct mdp_superblock_1 *sb = st->sb; bitmap_super_t *bms = (bitmap_super_t*)(((char*)sb)+MAX_SB_SIZE); @@ -2177,6 +2368,43 @@ static int write_bitmap1(struct supertype *st, int fd) void *buf; int towrite, n; struct align_fd afd; + unsigned int i = 0; + unsigned long long total_bm_space, bm_space_per_node; + + switch (update) { + case NameUpdate: + /* update cluster name */ + if (st->cluster_name) { + memset((char *)bms->cluster_name, 0, sizeof(bms->cluster_name)); + strncpy((char *)bms->cluster_name, st->cluster_name, 64); + } + break; + case NodeNumUpdate: + /* cluster md only supports superblock 1.2 now */ + if (st->minor_version != 2) { + pr_err("Warning: cluster md only works with superblock 1.2\n"); + return -EINVAL; + } + + /* Each node has an independent bitmap, it is necessary to calculate the + * space is enough or not, first get how many bytes for the total bitmap */ + bm_space_per_node = calc_bitmap_size(bms, 4096); + + total_bm_space = 512 * (__le64_to_cpu(sb->data_offset) - __le64_to_cpu(sb->super_offset)); + total_bm_space = total_bm_space - 
4096; /* leave another 4k for superblock */ + + if (bm_space_per_node * st->nodes > total_bm_space) { + pr_err("Warning: The max num of nodes can't exceed %llu\n", + total_bm_space / bm_space_per_node); + return -ENOMEM; + } + + bms->nodes = __cpu_to_le32(st->nodes); + break; + case NoUpdate: + default: + break; + } init_afd(&afd, fd); @@ -2185,27 +2413,37 @@ static int write_bitmap1(struct supertype *st, int fd) if (posix_memalign(&buf, 4096, 4096)) return -ENOMEM; - memset(buf, 0xff, 4096); - memcpy(buf, (char *)bms, sizeof(bitmap_super_t)); - - towrite = __le64_to_cpu(bms->sync_size) / (__le32_to_cpu(bms->chunksize)>>9); - towrite = (towrite+7) >> 3; /* bits to bytes */ - towrite += sizeof(bitmap_super_t); - towrite = ROUND_UP(towrite, 512); - while (towrite > 0) { - n = towrite; - if (n > 4096) - n = 4096; - n = awrite(&afd, buf, n); - if (n > 0) - towrite -= n; + do { + /* Only the bitmap[0] should resync + * whole device on initial assembly + */ + if (i) + memset(buf, 0x00, 4096); else + memset(buf, 0xff, 4096); + memcpy(buf, (char *)bms, sizeof(bitmap_super_t)); + + towrite = calc_bitmap_size(bms, 4096); + while (towrite > 0) { + n = towrite; + if (n > 4096) + n = 4096; + n = awrite(&afd, buf, n); + if (n > 0) + towrite -= n; + else + break; + if (i) + memset(buf, 0x00, 4096); + else + memset(buf, 0xff, 4096); + } + fsync(fd); + if (towrite) { + rv = -2; break; - memset(buf, 0xff, 4096); - } - fsync(fd); - if (towrite) - rv = -2; + } + } while (++i < __le32_to_cpu(bms->nodes)); free(buf); return rv; @@ -2213,6 +2451,7 @@ static int write_bitmap1(struct supertype *st, int fd) static void free_super1(struct supertype *st) { + if (st->sb) free(st->sb); while (st->info) { @@ -2370,7 +2609,7 @@ void *super1_make_v0(struct supertype *st, struct mdinfo *info, mdp_super_t *sb0 for (i = 0; i < MD_SB_DISKS; i++) { int state = sb0->disks[i].state; - sb->dev_roles[i] = 0xFFFF; + sb->dev_roles[i] = MD_DISK_ROLE_SPARE; if ((state & (1<<MD_DISK_SYNC)) && !(state & 
(1<<MD_DISK_FAULTY))) sb->dev_roles[i] = __cpu_to_le16(sb0->disks[i].raid_disk); @@ -74,6 +74,12 @@ int sysfs_open(char *devnm, char *devname, char *attr) return fd; } +void sysfs_init_dev(struct mdinfo *mdi, unsigned long devid) +{ + snprintf(mdi->sys_name, + sizeof(mdi->sys_name), "dev-%s", devid2kname(devid)); +} + void sysfs_init(struct mdinfo *mdi, int fd, char *devnm) { mdi->sys_name[0] = 0; @@ -224,6 +230,13 @@ struct mdinfo *sysfs_read(int fd, char *devnm, unsigned long options) goto abort; } + if (options & GET_ARRAY_STATE) { + strcpy(base, "array_state"); + if (load_sys(fname, sra->sysfs_array_state)) + goto abort; + } else + sra->sysfs_array_state[0] = 0; + if (! (options & GET_DEVS)) return sra; diff --git a/systemd/mdadm-last-resort@.service b/systemd/mdadm-last-resort@.service index 5179f194..e93d72b2 100644 --- a/systemd/mdadm-last-resort@.service +++ b/systemd/mdadm-last-resort@.service @@ -1,6 +1,7 @@ [Unit] Description=Activate md array even though degraded DefaultDependencies=no +Conflicts=sys-devices-virtual-block-%i.device [Service] Type=oneshot diff --git a/systemd/mdmonitor.service b/systemd/mdmonitor.service index 9aff2f56..c7cff3e4 100644 --- a/systemd/mdmonitor.service +++ b/systemd/mdmonitor.service @@ -10,4 +10,7 @@ Description=MD array monitor DefaultDependencies=no [Service] -ExecStart=BINDIR/mdadm --monitor --scan +Environment= MDADM_MONITOR_ARGS=--scan +EnvironmentFile=-/run/sysconfig/mdadm +ExecStartPre=-/usr/lib/systemd/scripts/mdadm_env.sh +ExecStart=BINDIR/mdadm --monitor $MDADM_MONITOR_ARGS @@ -246,6 +246,15 @@ check() { fi ;; + readonly ) + grep -s "read-only" > /dev/null /proc/mdstat || { + echo >&2 "ERROR array is not read-only!"; cat /proc/mdstat ; exit 1; } + ;; + + inactive ) + grep -s "inactive" > /dev/null /proc/mdstat || { + echo >&2 "ERROR array is not inactive!"; cat /proc/mdstat ; exit 1; } + ;; * ) echo >&2 ERROR unknown check $1 ; exit 1; esac } diff --git a/tests/19raid6auto-repair b/tests/19raid6auto-repair index 
7fb1c72f..ce4a7c08 100644 --- a/tests/19raid6auto-repair +++ b/tests/19raid6auto-repair @@ -10,32 +10,40 @@ data_offset_in_kib=$[2048/2] # make a raid5 from a file dd if=/dev/urandom of=/tmp/RandFile bs=1024 count=$array_data_size_in_kib -mdadm -CR $md0 -l6 -n$number_of_disks -c $chunksize_in_kib $devs -dd if=/tmp/RandFile of=$md0 bs=1024 count=$array_data_size_in_kib -blockdev --flushbufs $md0; sync -check wait -blockdev --flushbufs $devs; sync -echo 3 > /proc/sys/vm/drop_caches -cmp -s -n $array_data_size_in_b $md0 /tmp/RandFile || { echo sanity cmp failed ; exit 2; } - -# wipe out 5 chunks on each device -dd if=/dev/urandom of=$dev0 bs=1024 count=$[5*chunksize_in_kib] seek=$[data_offset_in_kib+chunksize_in_kib*0] -dd if=/dev/urandom of=$dev1 bs=1024 count=$[5*chunksize_in_kib] seek=$[data_offset_in_kib+chunksize_in_kib*5] -dd if=/dev/urandom of=$dev2 bs=1024 count=$[5*chunksize_in_kib] seek=$[data_offset_in_kib+chunksize_in_kib*10] -dd if=/dev/urandom of=$dev3 bs=1024 count=$[5*chunksize_in_kib] seek=$[data_offset_in_kib+chunksize_in_kib*15] -dd if=/dev/urandom of=$dev4 bs=1024 count=$[5*chunksize_in_kib] seek=$[data_offset_in_kib+chunksize_in_kib*20] - -blockdev --flushbufs $devs; sync -echo 3 > /proc/sys/vm/drop_caches - -$dir/raid6check $md0 0 0 2>&1 | grep -qs "Error" || { echo should detect errors; exit 2; } - -$dir/raid6check $md0 0 0 autorepair > /dev/null || { echo repair failed; exit 2; } -blockdev --flushbufs $md0 $devs; sync -echo 3 > /proc/sys/vm/drop_caches - -$dir/raid6check $md0 0 0 2>&1 | grep -qs "Error" && { echo errors detected; exit 2; } -cmp -s -n $array_data_size_in_b $md0 /tmp/RandFile || { echo cmp failed ; exit 2; } - -mdadm -S $md0 -udevadm settle + +# perform test for every layout +layouts="ls rs la ra parity-first ddf-zero-restart ddf-N-restart ddf-N-continue \ + left-asymmetric-6 right-asymmetric-6 left-symmetric-6 \ + right-symmetric-6 parity-first-6" + +for layout in $layouts +do + mdadm -CR $md0 -l6 --layout=$layout 
-n$number_of_disks -c $chunksize_in_kib $devs + dd if=/tmp/RandFile of=$md0 bs=1024 count=$array_data_size_in_kib + blockdev --flushbufs $md0; sync + check wait + blockdev --flushbufs $devs; sync + echo 3 > /proc/sys/vm/drop_caches + cmp -s -n $array_data_size_in_b $md0 /tmp/RandFile || { echo sanity cmp failed ; exit 2; } + + # wipe out 5 chunks on each device + dd if=/dev/urandom of=$dev0 bs=1024 count=$[5*chunksize_in_kib] seek=$[data_offset_in_kib+chunksize_in_kib*0] + dd if=/dev/urandom of=$dev1 bs=1024 count=$[5*chunksize_in_kib] seek=$[data_offset_in_kib+chunksize_in_kib*5] + dd if=/dev/urandom of=$dev2 bs=1024 count=$[5*chunksize_in_kib] seek=$[data_offset_in_kib+chunksize_in_kib*10] + dd if=/dev/urandom of=$dev3 bs=1024 count=$[5*chunksize_in_kib] seek=$[data_offset_in_kib+chunksize_in_kib*15] + dd if=/dev/urandom of=$dev4 bs=1024 count=$[5*chunksize_in_kib] seek=$[data_offset_in_kib+chunksize_in_kib*20] + + blockdev --flushbufs $devs; sync + echo 3 > /proc/sys/vm/drop_caches + + $dir/raid6check $md0 0 0 2>&1 | grep -qs "Error" || { echo should detect errors; exit 2; } + + $dir/raid6check $md0 0 0 autorepair > /dev/null || { echo repair failed; exit 2; } + blockdev --flushbufs $md0 $devs; sync + echo 3 > /proc/sys/vm/drop_caches + + $dir/raid6check $md0 0 0 2>&1 | grep -qs "Error" && { echo errors detected; exit 2; } + cmp -s -n $array_data_size_in_b $md0 /tmp/RandFile || { echo cmp failed ; exit 2; } + + mdadm -S $md0 +done diff --git a/tests/19raid6repair b/tests/19raid6repair index 1159bd3e..26846cc9 100644 --- a/tests/19raid6repair +++ b/tests/19raid6repair @@ -8,40 +8,49 @@ devs="$dev1 $dev2 $dev3 $dev4" # default 2048 sectors data_offset_in_kib=$[2048/2] -for failure in "$dev3 3 3 2" "$dev3 3 2 3" "$dev3 3 2 1" "$dev3 3 2 0" "$dev4 3 3 0" "$dev4 3 3 1" "$dev4 3 3 2" \ - "$dev1 3 0 1" "$dev1 3 0 2" "$dev1 3 0 3" "$dev2 3 1 0" "$dev2 3 1 2" "$dev2 3 1 3" ; do - failure_split=( $failure ) - device_with_error=${failure_split[0]} - 
stripe_with_error=${failure_split[1]} - repair_params="$stripe_with_error ${failure_split[2]} ${failure_split[3]}" - start_of_errors_in_kib=$[data_offset_in_kib+chunksize_in_kib*stripe_with_error] - - # make a raid5 from a file - dd if=/dev/urandom of=/tmp/RandFile bs=1024 count=$array_data_size_in_kib - mdadm -CR $md0 -l6 -n$number_of_disks -c $chunksize_in_kib $devs - dd if=/tmp/RandFile of=$md0 bs=1024 count=$array_data_size_in_kib - blockdev --flushbufs $md0; sync - - check wait - blockdev --flushbufs $devs; sync - echo 3 > /proc/sys/vm/drop_caches - cmp -s -n $array_data_size_in_b $md0 /tmp/RandFile || { echo sanity cmp failed ; exit 2; } - - dd if=/dev/urandom of=$device_with_error bs=1024 count=$chunksize_in_kib seek=$start_of_errors_in_kib - blockdev --flushbufs $device_with_error; sync - echo 3 > /proc/sys/vm/drop_caches - - $dir/raid6check $md0 0 0 2>&1 | grep -qs "Error" || { echo should detect errors; exit 2; } - - $dir/raid6check $md0 repair $repair_params > /dev/null || { echo repair failed; exit 2; } - blockdev --flushbufs $md0 $devs; sync - echo 3 > /proc/sys/vm/drop_caches - - $dir/raid6check $md0 0 0 2>&1 | grep -qs "Error" && { echo errors detected; exit 2; } - cmp -s -n $array_data_size_in_b $md0 /tmp/RandFile || { echo cmp failed ; exit 2; } - - mdadm -S $md0 - udevadm settle - sync - echo 3 > /proc/sys/vm/drop_caches +layouts="ls rs la ra parity-first ddf-zero-restart ddf-N-restart ddf-N-continue \ + left-asymmetric-6 right-asymmetric-6 left-symmetric-6 \ + right-symmetric-6 parity-first-6" + +for layout in $layouts +do + for failure in "$dev3 3 3 2" "$dev3 3 2 3" "$dev3 3 2 1" "$dev3 3 2 0" \ + "$dev4 3 3 0" "$dev4 3 3 1" "$dev4 3 3 2" \ + "$dev1 3 0 1" "$dev1 3 0 2" "$dev1 3 0 3" \ + "$dev2 3 1 0" "$dev2 3 1 2" "$dev2 3 1 3" ; do + failure_split=( $failure ) + device_with_error=${failure_split[0]} + stripe_with_error=${failure_split[1]} + repair_params="$stripe_with_error ${failure_split[2]} ${failure_split[3]}" + 
start_of_errors_in_kib=$[data_offset_in_kib+chunksize_in_kib*stripe_with_error] + + # make a raid5 from a file + dd if=/dev/urandom of=/tmp/RandFile bs=1024 count=$array_data_size_in_kib + mdadm -CR $md0 -l6 --layout=$layout -n$number_of_disks -c $chunksize_in_kib $devs + dd if=/tmp/RandFile of=$md0 bs=1024 count=$array_data_size_in_kib + blockdev --flushbufs $md0; sync + + check wait + blockdev --flushbufs $devs; sync + echo 3 > /proc/sys/vm/drop_caches + cmp -s -n $array_data_size_in_b $md0 /tmp/RandFile || { echo sanity cmp failed ; exit 2; } + + dd if=/dev/urandom of=$device_with_error bs=1024 count=$chunksize_in_kib seek=$start_of_errors_in_kib + blockdev --flushbufs $device_with_error; sync + echo 3 > /proc/sys/vm/drop_caches + + $dir/raid6check $md0 0 0 2>&1 | grep -qs "Error" || { echo should detect errors; exit 2; } + + $dir/raid6check $md0 repair $repair_params > /dev/null || { echo repair failed; exit 2; } + blockdev --flushbufs $md0 $devs; sync + echo 3 > /proc/sys/vm/drop_caches + + $dir/raid6check $md0 0 0 2>&1 | grep -qs "Error" && { echo errors detected; exit 2; } + cmp -s -n $array_data_size_in_b $md0 /tmp/RandFile || { echo cmp failed ; exit 2; } + + mdadm -S $md0 + udevadm settle + sync + echo 3 > /proc/sys/vm/drop_caches + done done diff --git a/tests/20raid5journal b/tests/20raid5journal new file mode 100644 index 00000000..f751aceb --- /dev/null +++ b/tests/20raid5journal @@ -0,0 +1,64 @@ +# check write journal of raid456 + +# test --detail +test_detail_shows_journal() { + mdadm -D $1 | grep journal || { + echo >&2 "ERROR --detail does show journal device!"; mdadm -D $1 ; exit 1; } +} + +# test --examine +test_examine_shows_journal() { + mdadm -E $1 | grep Journal || { + echo >&2 "ERROR --examine does show Journal device!"; mdadm -E $1 ; exit 1; } +} + +# test --create +create_with_journal_and_stop() { + mdadm -CR $md0 -l5 -n4 $dev0 $dev1 $dev2 $dev3 --write-journal $dev4 + check wait + tar cf - /etc > $md0 + ./raid6check $md0 0 0 | grep 
'Error detected' && exit 1 + test_detail_shows_journal $md0 + test_examine_shows_journal $dev4 + mdadm -S $md0 +} + +# test --assemble +test_assemble() { + create_with_journal_and_stop + if mdadm -A $md0 $dev0 $dev1 $dev2 $dev3 + then + echo >&2 "ERROR should return 1 when journal is missing!"; cat /proc/mdstat ; exit 1; + fi + mdadm -S $md0 + + mdadm -A $md0 $dev0 $dev1 $dev2 $dev3 --force + check readonly + mdadm -S $md0 +} + +# test --incremental +test_incremental() { + create_with_journal_and_stop + for d in $dev0 $dev1 $dev2 $dev3 + do + mdadm -I $d + done + check inactive + mdadm -I $dev4 + check raid5 + mdadm -S $md0 + + # test --incremental with journal missing + for d in $dev0 $dev1 $dev2 $dev3 + do + mdadm -I $d + done + mdadm -R $md0 + check readonly + mdadm -S $md0 +} + +create_with_journal_and_stop +test_assemble +test_incremental diff --git a/udev-md-raid-arrays.rules b/udev-md-raid-arrays.rules index a32b6d2d..c95ec7b1 100644 --- a/udev-md-raid-arrays.rules +++ b/udev-md-raid-arrays.rules @@ -17,7 +17,7 @@ TEST!="md/array_state", ENV{SYSTEMD_READY}="0", GOTO="md_end" ATTR{md/array_state}=="|clear|inactive", ENV{SYSTEMD_READY}="0", GOTO="md_end" LABEL="md_ignore_state" -IMPORT{program}="BINDIR/mdadm --detail --export $tempnode" +IMPORT{program}="BINDIR/mdadm --detail --export $devnode" ENV{DEVTYPE}=="disk", ENV{MD_NAME}=="?*", SYMLINK+="disk/by-id/md-name-$env{MD_NAME}", OPTIONS+="string_escape=replace" ENV{DEVTYPE}=="disk", ENV{MD_UUID}=="?*", SYMLINK+="disk/by-id/md-uuid-$env{MD_UUID}" ENV{DEVTYPE}=="disk", ENV{MD_DEVNAME}=="?*", SYMLINK+="md/$env{MD_DEVNAME}" @@ -26,14 +26,16 @@ ENV{DEVTYPE}=="partition", ENV{MD_UUID}=="?*", SYMLINK+="disk/by-id/md-uuid-$env ENV{DEVTYPE}=="partition", ENV{MD_DEVNAME}=="*[^0-9]", SYMLINK+="md/$env{MD_DEVNAME}%n" ENV{DEVTYPE}=="partition", ENV{MD_DEVNAME}=="*[0-9]", SYMLINK+="md/$env{MD_DEVNAME}p%n" -IMPORT{program}="/sbin/blkid -o udev -p -u noraid $tempnode" +IMPORT{builtin}="blkid" +OPTIONS+="link_priority=100" 
+OPTIONS+="watch" ENV{ID_FS_USAGE}=="filesystem|other|crypto", ENV{ID_FS_UUID_ENC}=="?*", SYMLINK+="disk/by-uuid/$env{ID_FS_UUID_ENC}" ENV{ID_FS_USAGE}=="filesystem|other", ENV{ID_FS_LABEL_ENC}=="?*", SYMLINK+="disk/by-label/$env{ID_FS_LABEL_ENC}" ENV{MD_LEVEL}=="raid[1-9]*", ENV{SYSTEMD_WANTS}+="mdmonitor.service" # Tell systemd to run mdmon for our container, if we need it. -ENV{MD_LEVEL}=="raid[1-9]*", ENV{MD_CONTAINER}=="?*", PROGRAM="/bin/readlink $env{MD_CONTAINER}", ENV{MD_MON_THIS}="%c" +ENV{MD_LEVEL}=="raid[1-9]*", ENV{MD_CONTAINER}=="?*", PROGRAM="/usr/bin/readlink $env{MD_CONTAINER}", ENV{MD_MON_THIS}="%c" ENV{MD_MON_THIS}=="?*", PROGRAM="/usr/bin/basename $env{MD_MON_THIS}", ENV{SYSTEMD_WANTS}+="mdmon@%c.service" LABEL="md_end" diff --git a/udev-md-raid-assembly.rules b/udev-md-raid-assembly.rules index 5bde607f..d0d440a6 100644 --- a/udev-md-raid-assembly.rules +++ b/udev-md-raid-assembly.rules @@ -25,12 +25,9 @@ GOTO="md_inc_end" LABEL="md_inc" -# Disable incremental assembly to fix Debian bug #784070 -GOTO="md_inc_end" - # remember you can limit what gets auto/incrementally assembled by # mdadm.conf(5)'s 'AUTO' and selectively whitelist using 'ARRAY' -ACTION=="add|change", IMPORT{program}="BINDIR/mdadm --incremental --export $tempnode --offroot ${DEVLINKS}" +ACTION=="add|change", IMPORT{program}="BINDIR/mdadm --incremental --export $devnode --offroot ${DEVLINKS}" ACTION=="add|change", ENV{MD_STARTED}=="*unsafe*", ENV{MD_FOREIGN}=="no", ENV{SYSTEMD_WANTS}+="mdadm-last-resort@$env{MD_DEVICE}.timer" ACTION=="remove", ENV{ID_PATH}=="?*", RUN+="BINDIR/mdadm -If $name --path $env{ID_PATH}" ACTION=="remove", ENV{ID_PATH}!="?*", RUN+="BINDIR/mdadm -If $name" @@ -24,6 +24,7 @@ #include "mdadm.h" #include "md_p.h" +#include <sys/poll.h> #include <sys/socket.h> #include <sys/utsname.h> #include <sys/wait.h> @@ -34,6 +35,8 @@ #include <ctype.h> #include <dirent.h> #include <signal.h> +#include <dlfcn.h> + /* * following taken from linux/blkpg.h because they 
aren't @@ -79,6 +82,143 @@ struct blkpg_partition { aren't permitted). */ #define BUILD_BUG_ON_ZERO(e) (sizeof(struct { int:-!!(e); })) +static int is_dlm_hooks_ready = 0; + +int dlm_funs_ready(void) +{ + return is_dlm_hooks_ready ? 1 : 0; +} + +#ifndef MDASSEMBLE +static struct dlm_hooks *dlm_hooks = NULL; +struct dlm_lock_resource *dlm_lock_res = NULL; +static int ast_called = 0; + +struct dlm_lock_resource { + dlm_lshandle_t *ls; + struct dlm_lksb lksb; +}; + +/* Using poll(2) to wait for and dispatch ASTs */ +static int poll_for_ast(dlm_lshandle_t ls) +{ + struct pollfd pfd; + + pfd.fd = dlm_hooks->ls_get_fd(ls); + pfd.events = POLLIN; + + while (!ast_called) + { + if (poll(&pfd, 1, 0) < 0) + { + perror("poll"); + return -1; + } + dlm_hooks->dispatch(dlm_hooks->ls_get_fd(ls)); + } + ast_called = 0; + + return 0; +} + +static void dlm_ast(void *arg) +{ + ast_called = 1; +} + +static char *cluster_name = NULL; +/* Create the lockspace, take bitmapXXX locks on all the bitmaps. */ +int cluster_get_dlmlock(int *lockid) +{ + int ret = -1; + char str[64]; + int flags = LKF_NOQUEUE; + + ret = get_cluster_name(&cluster_name); + if (ret) { + pr_err("The md can't get cluster name\n"); + return -1; + } + + dlm_lock_res = xmalloc(sizeof(struct dlm_lock_resource)); + dlm_lock_res->ls = dlm_hooks->create_lockspace(cluster_name, O_RDWR); + if (!dlm_lock_res->ls) { + pr_err("%s failed to create lockspace\n", cluster_name); + return -ENOMEM; + } + + /* Conversions need the lockid in the LKSB */ + if (flags & LKF_CONVERT) + dlm_lock_res->lksb.sb_lkid = *lockid; + + snprintf(str, 64, "bitmap%s", cluster_name); + /* if flags with LKF_CONVERT causes below return ENOENT which means + * "No such file or directory" */ + ret = dlm_hooks->ls_lock(dlm_lock_res->ls, LKM_PWMODE, &dlm_lock_res->lksb, + flags, str, strlen(str), 0, dlm_ast, + dlm_lock_res, NULL, NULL); + if (ret) { + pr_err("error %d when get PW mode on lock %s\n", errno, str); + dlm_hooks->release_lockspace(cluster_name, 
dlm_lock_res->ls, 1); + return ret; + } + + /* Wait for it to complete */ + poll_for_ast(dlm_lock_res->ls); + *lockid = dlm_lock_res->lksb.sb_lkid; + + return dlm_lock_res->lksb.sb_status; +} + +int cluster_release_dlmlock(int lockid) +{ + int ret = -1; + + if (!cluster_name) + return -1; + + /* if flags with LKF_CONVERT causes below return EINVAL which means + * "Invalid argument" */ + ret = dlm_hooks->ls_unlock(dlm_lock_res->ls, lockid, 0, + &dlm_lock_res->lksb, dlm_lock_res); + if (ret) { + pr_err("error %d happened when unlock\n", errno); + /* XXX make sure the lock is unlocked eventually */ + goto out; + } + + /* Wait for it to complete */ + poll_for_ast(dlm_lock_res->ls); + + errno = dlm_lock_res->lksb.sb_status; + if (errno != EUNLOCK) { + pr_err("error %d happened in ast when unlock lockspace\n", errno); + /* XXX make sure the lockspace is unlocked eventually */ + goto out; + } + + ret = dlm_hooks->release_lockspace(cluster_name, dlm_lock_res->ls, 1); + if (ret) { + pr_err("error %d happened when release lockspace\n", errno); + /* XXX make sure the lockspace is released eventually */ + goto out; + } + free(dlm_lock_res); + +out: + return ret; +} +#else +int cluster_get_dlmlock(int *lockid) +{ + return -1; +} +int cluster_release_dlmlock(int lockid) +{ + return -1; +} +#endif + /* * Parse a 128 bit uuid in 4 integers * format is 32 hexx nibbles with options :.<space> separator @@ -271,6 +411,16 @@ long parse_num(char *num) } #endif +int parse_cluster_confirm_arg(char *input, char **devname, int *slot) +{ + char *dev; + *slot = strtoul(input, &dev, 10); + if (dev == input || dev[0] != ':') + return -1; + *devname = dev+1; + return 0; +} + void remove_partitions(int fd) { /* remove partitions from this block devices. 
@@ -1976,3 +2126,94 @@ void reopen_mddev(int mdfd)
 	if (fd >= 0 && fd != mdfd)
 		dup2(fd, mdfd);
 }
+
+#ifndef MDASSEMBLE
+static struct cmap_hooks *cmap_hooks = NULL;
+static int is_cmap_hooks_ready = 0;
+
+/* Load libcmap.so.4 at run time and resolve the entry points that
+ * get_cluster_name() calls.  is_cmap_hooks_ready is only set when the
+ * library and all three symbols were found.
+ */
+void set_cmap_hooks(void)
+{
+	cmap_hooks = xmalloc(sizeof(struct cmap_hooks));
+	cmap_hooks->cmap_handle = dlopen("libcmap.so.4", RTLD_NOW | RTLD_LOCAL);
+	if (!cmap_hooks->cmap_handle)
+		return;
+
+	cmap_hooks->initialize = dlsym(cmap_hooks->cmap_handle, "cmap_initialize");
+	cmap_hooks->get_string = dlsym(cmap_hooks->cmap_handle, "cmap_get_string");
+	cmap_hooks->finalize = dlsym(cmap_hooks->cmap_handle, "cmap_finalize");
+
+	if (!cmap_hooks->initialize || !cmap_hooks->get_string ||
+	    !cmap_hooks->finalize)
+		dlclose(cmap_hooks->cmap_handle);
+	else
+		is_cmap_hooks_ready = 1;
+}
+
+/* Fetch "totem.cluster_name" through the cmap hooks into *cluster_name.
+ * Returns 0 on success, -1 when the hooks are not ready or the lookup
+ * fails, and the cmap initialize status when initialisation fails.  On a
+ * failed lookup *cluster_name is freed and reset to NULL so the caller
+ * is not left holding a dangling pointer.
+ */
+int get_cluster_name(char **cluster_name)
+{
+	int rv = -1;
+	cmap_handle_t handle;
+
+	if (!is_cmap_hooks_ready)
+		return rv;
+
+	rv = cmap_hooks->initialize(&handle);
+	if (rv != CS_OK)
+		goto out;
+
+	rv = cmap_hooks->get_string(handle, "totem.cluster_name", cluster_name);
+	if (rv != CS_OK) {
+		free(*cluster_name);
+		*cluster_name = NULL;
+		rv = -1;
+		goto name_err;
+	}
+
+	rv = 0;
+name_err:
+	cmap_hooks->finalize(handle);
+out:
+	return rv;
+}
+
+/* Load libdlm_lt.so.3 at run time and resolve the lockspace and lock
+ * entry points; is_dlm_hooks_ready is only set when all were found.
+ */
+void set_dlm_hooks(void)
+{
+	dlm_hooks = xmalloc(sizeof(struct dlm_hooks));
+	dlm_hooks->dlm_handle = dlopen("libdlm_lt.so.3", RTLD_NOW | RTLD_LOCAL);
+	if (!dlm_hooks->dlm_handle)
+		return;
+
+	dlm_hooks->create_lockspace = dlsym(dlm_hooks->dlm_handle, "dlm_create_lockspace");
+	dlm_hooks->release_lockspace = dlsym(dlm_hooks->dlm_handle, "dlm_release_lockspace");
+	dlm_hooks->ls_lock = dlsym(dlm_hooks->dlm_handle, "dlm_ls_lock");
+	dlm_hooks->ls_unlock = dlsym(dlm_hooks->dlm_handle, "dlm_ls_unlock");
+	dlm_hooks->ls_get_fd = dlsym(dlm_hooks->dlm_handle, "dlm_ls_get_fd");
+	dlm_hooks->dispatch = dlsym(dlm_hooks->dlm_handle, "dlm_dispatch");
+
+	if (!dlm_hooks->create_lockspace || !dlm_hooks->ls_lock ||
+	    !dlm_hooks->ls_unlock || !dlm_hooks->release_lockspace ||
+	    !dlm_hooks->ls_get_fd || !dlm_hooks->dispatch)
+		dlclose(dlm_hooks->dlm_handle);
+	else
+		is_dlm_hooks_ready = 1;
+}
+
+/* Set up both run-time hook tables (dlm first, then cmap). */
+void set_hooks(void)
+{
+	set_dlm_hooks();
+	set_cmap_hooks();
+}
+#endif