diff options
-rw-r--r-- | Assemble.c | 54 | ||||
-rw-r--r-- | Create.c | 362 | ||||
-rw-r--r-- | Grow.c | 2 | ||||
-rw-r--r-- | Incremental.c | 202 | ||||
-rw-r--r-- | Kill.c | 8 | ||||
-rw-r--r-- | Makefile | 27 | ||||
-rw-r--r-- | Manage.c | 143 | ||||
-rw-r--r-- | Query.c | 4 | ||||
-rw-r--r-- | ReadMe.c | 1 | ||||
-rw-r--r-- | TODO | 35 | ||||
-rw-r--r-- | bitmap.c | 6 | ||||
-rw-r--r-- | crc32.c | 340 | ||||
-rw-r--r-- | crc32.h | 441 | ||||
-rw-r--r-- | kernel-patch-2.6.25 | 199 | ||||
-rw-r--r-- | managemon.c | 524 | ||||
-rw-r--r-- | mapfile.c | 27 | ||||
-rw-r--r-- | md.4 | 16 | ||||
-rw-r--r-- | mdadm.8 | 10 | ||||
-rw-r--r-- | mdadm.c | 3 | ||||
-rw-r--r-- | mdadm.h | 298 | ||||
-rw-r--r-- | mdmon.c | 348 | ||||
-rw-r--r-- | mdmon.h | 65 | ||||
-rw-r--r-- | mdstat.c | 58 | ||||
-rw-r--r-- | monitor.c | 527 | ||||
-rw-r--r-- | msg.c | 185 | ||||
-rw-r--r-- | msg.h | 31 | ||||
-rw-r--r-- | sg_io.c | 42 | ||||
-rw-r--r-- | super-ddf.c | 3227 | ||||
-rw-r--r-- | super-intel.c | 2552 | ||||
-rw-r--r-- | super0.c | 160 | ||||
-rw-r--r-- | super1.c | 314 | ||||
-rw-r--r-- | sysfs.c | 276 | ||||
-rw-r--r-- | test | 2 | ||||
-rw-r--r-- | util.c | 357 |
34 files changed, 10441 insertions, 405 deletions
@@ -542,8 +542,8 @@ int Assemble(struct supertype *st, char *mddev, int mdfd, == devices[devcnt].i.events && (devices[best[i]].i.disk.minor != devices[devcnt].i.disk.minor) - && st->ss->major == 0 - && info.array.level != -4) { + && st->ss == &super0 + && info.array.level != LEVEL_MULTIPATH) { /* two different devices with identical superblock. * Could be a mis-detection caused by overlapping * partitions. fail-safe. @@ -845,11 +845,29 @@ int Assemble(struct supertype *st, char *mddev, int mdfd, /* Almost ready to actually *do* something */ if (!old_linux) { int rv; + +#ifndef MDASSEMBLE + struct mdinfo *sra; + if (st->ss->external) { + char ver[100]; + strcat(strcpy(ver, "external:"), info.text_version); + sra = sysfs_read(mdfd, 0, 0); + if ((vers % 100) < 2 || + sra == NULL || + sysfs_set_str(sra, NULL, "metadata_version", + ver) < 0) { + fprintf(stderr, Name ": This kernel does not " + "support external metadata.\n"); + return 1; + } + rv = sysfs_set_array(sra, &info); + } else +#endif if ((vers % 100) >= 1) { /* can use different versions */ mdu_array_info_t inf; memset(&inf, 0, sizeof(inf)); - inf.major_version = st->ss->major; - inf.minor_version = st->minor_version; + inf.major_version = info.array.major_version; + inf.minor_version = info.array.minor_version; rv = ioctl(mdfd, SET_ARRAY_INFO, &inf); } else rv = ioctl(mdfd, SET_ARRAY_INFO, NULL); @@ -895,8 +913,14 @@ int Assemble(struct supertype *st, char *mddev, int mdfd, j = chosen_drive; if (j >= 0 /* && devices[j].uptodate */) { - if (ioctl(mdfd, ADD_NEW_DISK, - &devices[j].i.disk)!=0) { +#ifndef MDASSEMBLE + if (st->ss->external) + rv = sysfs_add_disk(sra, &devices[j].i); + else +#endif + rv = ioctl(mdfd, ADD_NEW_DISK, + &devices[j].i.disk); + if (rv) { fprintf(stderr, Name ": failed to add " "%s to %s: %s\n", devices[j].devname, @@ -918,6 +942,21 @@ int Assemble(struct supertype *st, char *mddev, int mdfd, i, mddev); } + if (info.array.level == LEVEL_CONTAINER) { + if (verbose >= 0) { + fprintf(stderr, 
Name ": Container %s has been " + "assembled with %d drive%s", + mddev, okcnt, okcnt==1?"":"s"); + if (okcnt < info.array.raid_disks) + fprintf(stderr, " (out of %d)", + info.array.raid_disks); + fprintf(stderr, "\n"); + } + if (must_close) + close(mdfd); + return 0; + } + if (runstop == 1 || (runstop <= 0 && ( enough(info.array.level, info.array.raid_disks, @@ -940,7 +979,8 @@ int Assemble(struct supertype *st, char *mddev, int mdfd, /* There is a nasty race with 'mdadm --monitor'. * If it opens this device before we close it, * it gets an incomplete open on which IO - * doesn't work and the capacity if wrong. + * doesn't work and the capacity is + * wrong. * If we reopen (to check for layered devices) * before --monitor closes, we loose. * @@ -66,12 +66,18 @@ int Create(struct supertype *st, char *mddev, int mdfd, int second_missing = subdevs * 2; int missing_disks = 0; int insert_point = subdevs * 2; /* where to insert a missing drive */ + int total_slots; int pass; int vers; int rv; int bitmap_fd; + int have_container = 0; + int container_fd; + int need_mdmon = 0; unsigned long long bitmapsize; - struct mdinfo info; + struct mdinfo *sra; + struct mdinfo info, *infos; + int did_default = 0; int major_num = BITMAP_MAJOR_HI; @@ -92,6 +98,14 @@ int Create(struct supertype *st, char *mddev, int mdfd, } } if (level == UnSet) { + /* "ddf" and "imsm" metadata only supports one level - should possibly + * push this into metadata handler?? 
+ */ + if (st && (st->ss == &super_ddf || st->ss == &super_imsm)) + level = LEVEL_CONTAINER; + } + + if (level == UnSet) { fprintf(stderr, Name ": a RAID level is needed to create an array.\n"); return 1; @@ -116,11 +130,47 @@ int Create(struct supertype *st, char *mddev, int mdfd, Name ": This level does not support spare devices\n"); return 1; } + + if (subdevs == 1 && strcmp(devlist->devname, "missing") != 0) { + /* If given a single device, it might be a container, and we can + * extract a device list from there + */ + mdu_array_info_t inf; + int fd; + + memset(&inf, 0, sizeof(inf)); + fd = open(devlist->devname, O_RDONLY); + if (fd >= 0 && + ioctl(fd, GET_ARRAY_INFO, &inf) == 0 && + inf.raid_disks == 0) { + /* yep, looks like a container */ + if (st) { + rv = st->ss->load_super(st, fd, + devlist->devname); + if (rv == 0) + have_container = 1; + } else { + st = guess_super(fd); + if (st && !(rv = st->ss-> + load_super(st, fd, + devlist->devname))) + have_container = 1; + else + st = NULL; + } + } + if (fd >= 0) + close(fd); + if (have_container) { + subdevs = 0; + devlist = NULL; + } + } if (subdevs > raiddisks+sparedisks) { fprintf(stderr, Name ": You have listed more devices (%d) than are in the array(%d)!\n", subdevs, raiddisks+sparedisks); return 1; } - if (subdevs < raiddisks+sparedisks) { + if (!have_container && subdevs < raiddisks+sparedisks) { fprintf(stderr, Name ": You haven't given enough devices (real or missing) to create this array\n"); return 1; } @@ -182,6 +232,7 @@ int Create(struct supertype *st, char *mddev, int mdfd, case 1: case LEVEL_FAULTY: case LEVEL_MULTIPATH: + case LEVEL_CONTAINER: if (chunk) { chunk = 0; if (verbose > 0) @@ -193,14 +244,17 @@ int Create(struct supertype *st, char *mddev, int mdfd, return 1; } + if (st && ! 
st->ss->validate_geometry(st, level, layout, raiddisks, + chunk, size, NULL, NULL, verbose>=0)) + return 1; + /* now look at the subdevs */ info.array.active_disks = 0; info.array.working_disks = 0; dnum = 0; for (dv=devlist; dv; dv=dv->next, dnum++) { char *dname = dv->devname; - unsigned long long ldsize, freesize; - int fd; + unsigned long long freesize; if (strcasecmp(dname, "missing")==0) { if (first_missing > dnum) first_missing = dnum; @@ -212,18 +266,6 @@ int Create(struct supertype *st, char *mddev, int mdfd, info.array.working_disks++; if (dnum < raiddisks) info.array.active_disks++; - fd = open(dname, O_RDONLY|O_EXCL); - if (fd <0 ) { - fprintf(stderr, Name ": Cannot open %s: %s\n", - dname, strerror(errno)); - fail=1; - continue; - } - if (!get_dev_size(fd, dname, &ldsize)) { - fail = 1; - close(fd); - continue; - } if (st == NULL) { struct createinfo *ci = conf_get_create_info(); if (ci) @@ -231,33 +273,42 @@ int Create(struct supertype *st, char *mddev, int mdfd, } if (st == NULL) { /* Need to choose a default metadata, which is different - * depending on the sizes of devices + * depending on geometry of array. 
*/ int i; char *name = "default"; - if (level >= 1 && ldsize > (0x7fffffffULL<<10)) - name = "default/large"; - for(i=0; !st && superlist[i]; i++) + for(i=0; !st && superlist[i]; i++) { st = superlist[i]->match_metadata_desc(name); + if (st && !st->ss->validate_geometry + (st, level, layout, raiddisks, + chunk, size, dname, &freesize, + verbose > 0)) + st = NULL; + } if (!st) { - fprintf(stderr, Name ": internal error - no default metadata style\n"); + fprintf(stderr, Name ": device %s not suitable " + "for any style of array\n", + dname); exit(2); } - if (st->ss->major != 0 || + if (st->ss != &super0 || st->minor_version != 90) - fprintf(stderr, Name ": Defaulting to version" - " %d.%d metadata\n", - st->ss->major, - st->minor_version); - } - freesize = st->ss->avail_size(st, ldsize >> 9); - if (freesize == 0) { - fprintf(stderr, Name ": %s is too small: %luK\n", - dname, (unsigned long)(ldsize>>10)); - fail = 1; - close(fd); - continue; + did_default = 1; + } else { + if (!st->ss->validate_geometry(st, level, layout, + raiddisks, + chunk, size, dname, + &freesize, + verbose > 0)) { + + fprintf(stderr, + Name ": %s is not suitable for " + "this array.\n", + dname); + fail = 1; + continue; + } } freesize /= 2; /* convert to K */ @@ -268,9 +319,9 @@ int Create(struct supertype *st, char *mddev, int mdfd, if (size && freesize < size) { fprintf(stderr, Name ": %s is smaller that given size." 
- " %lluK < %lluK + superblock\n", dname, freesize, size); + " %lluK < %lluK + metadata\n", + dname, freesize, size); fail = 1; - close(fd); continue; } if (maxdisc == NULL || (maxdisc && freesize > maxsize)) { @@ -282,24 +333,36 @@ int Create(struct supertype *st, char *mddev, int mdfd, minsize = freesize; } if (runstop != 1 || verbose >= 0) { + int fd = open(dname, O_RDONLY); + if (fd <0 ) { + fprintf(stderr, Name ": Cannot open %s: %s\n", + dname, strerror(errno)); + fail=1; + continue; + } warn |= check_ext2(fd, dname); warn |= check_reiser(fd, dname); warn |= check_raid(fd, dname); + close(fd); } - close(fd); } if (fail) { fprintf(stderr, Name ": create aborted\n"); return 1; } if (size == 0) { - if (mindisc == NULL) { + if (mindisc == NULL && !have_container) { fprintf(stderr, Name ": no size and no drives given - aborting create.\n"); return 1; } - if (level > 0 || level == LEVEL_MULTIPATH || level == LEVEL_FAULTY) { + if (level > 0 || level == LEVEL_MULTIPATH + || level == LEVEL_FAULTY + || st->ss->external ) { /* size is meaningful */ - if (minsize > 0x100000000ULL && st->ss->major == 0) { + if (!st->ss->validate_geometry(st, level, layout, + raiddisks, + chunk, minsize, + NULL, NULL, 0)) { fprintf(stderr, Name ": devices too large for RAID level %d\n", level); return 1; } @@ -357,7 +420,7 @@ int Create(struct supertype *st, char *mddev, int mdfd, missing_disks++; } - if (level <= 0 && first_missing != subdevs * 2) { + if (level <= 0 && first_missing < subdevs * 2) { fprintf(stderr, Name ": This level does not support missing devices\n"); return 1; @@ -382,12 +445,16 @@ int Create(struct supertype *st, char *mddev, int mdfd, ( level == 6 && (insert_point < raiddisks || second_missing < raiddisks)) || + ( level <= 0 ) + || assume_clean - ) + ) { info.array.state = 1; /* clean, but one+ drive will be missing*/ - else + info.resync_start = ~0ULL; + } else { info.array.state = 0; /* not clean, but no errors */ - + info.resync_start = 0; + } if (level == 10) { 
/* for raid10, the bitmap size is the capacity of the array, * which is array.size * raid_disks / ncopies; @@ -424,7 +491,6 @@ int Create(struct supertype *st, char *mddev, int mdfd, + info.array.failed_disks; info.array.layout = layout; info.array.chunk_size = chunk*1024; - info.array.major_version = st->ss->major; if (name == NULL || *name == 0) { /* base name on mddev */ @@ -453,6 +519,31 @@ int Create(struct supertype *st, char *mddev, int mdfd, if (!st->ss->init_super(st, &info.array, size, name, homehost, uuid)) return 1; + total_slots = info.array.nr_disks; + st->ss->getinfo_super(st, &info); + + if (did_default && verbose >= 0) { + if (info.text_version[0] == '/') { + int dnum = devname2devnum(info.text_version+1); + char *path; + int mdp = get_mdp_major(); + struct mdinfo *mdi; + if (dnum > 0) + path = map_dev(MD_MAJOR, dnum, 1); + else + path = map_dev(mdp, (-1-dnum)<< 6, 1); + + mdi = sysfs_read(-1, dnum, GET_VERSION); + + fprintf(stderr, Name ": Creating array inside " + "%s container %s\n", + mdi?mdi->text_version:"managed", path); + sysfs_free(mdi); + } else + fprintf(stderr, Name ": Defaulting to version" + " %s metadata\n", info.text_version); + } + if (bitmap_file && vers < 9003) { major_num = BITMAP_MAJOR_HOSTENDIAN; #ifdef __BIG_ENDIAN @@ -476,12 +567,56 @@ int Create(struct supertype *st, char *mddev, int mdfd, } - - if ((vers % 100) >= 1) { /* can use different versions */ + sra = sysfs_read(mdfd, 0, 0); + + if (st->ss->external) { + char ver[100]; + strcat(strcpy(ver, "external:"), + info.text_version); + if (st->ss->external && st->subarray[0]) { + /* member */ + + /* When creating a member, we need to be careful + * to negotiate with mdmon properly. + * If it is already running, we cannot write to + * the devices and must ask it to do that part. + * If it isn't running, we write to the devices, + * and then start it. 
+ * We hold an exclusive open on the container + * device to make sure mdmon doesn't exit after + * we checked that it is running. + * + * For now, fail if it is already running. + */ + container_fd = open_dev_excl(st->container_dev); + if (container_fd < 0) { + fprintf(stderr, Name ": Cannot get exclusive " + "open on container - weird.\n"); + return 1; + } + if (mdmon_running(st->container_dev)) { + if (verbose) + fprintf(stderr, Name ": reusing mdmon " + "for %s.\n", + devnum2devname(st->container_dev)); + st->update_tail = &st->updates; + } else + need_mdmon = 1; + } + if ((vers % 100) < 2 || + sra == NULL || + sysfs_set_str(sra, NULL, "metadata_version", + ver) < 0) { + fprintf(stderr, Name ": This kernel does not " + "support external metadata.\n"); + return 1; + } + rv = sysfs_set_array(sra, &info); + } else if ((vers % 100) >= 1) { /* can use different versions */ mdu_array_info_t inf; memset(&inf, 0, sizeof(inf)); - inf.major_version = st->ss->major; - inf.minor_version = st->minor_version; + inf.major_version = info.array.major_version; + inf.minor_version = info.array.minor_version; rv = ioctl(mdfd, SET_ARRAY_INFO, &inf); } else rv = ioctl(mdfd, SET_ARRAY_INFO, NULL); @@ -514,7 +649,7 @@ int Create(struct supertype *st, char *mddev, int mdfd, } } - + infos = malloc(sizeof(*infos) * total_slots); for (pass=1; pass <=2 ; pass++) { mddev_dev_t moved_disk = NULL; /* the disk that was moved out of the insert point */ @@ -523,74 +658,123 @@ int Create(struct supertype *st, char *mddev, int mdfd, dv=(dv->next)?(dv->next):moved_disk, dnum++) { int fd; struct stat stb; + struct mdinfo *inf = &infos[dnum]; - info.disk.number = dnum; + if (dnum >= total_slots) + abort(); if (dnum == insert_point) { moved_disk = dv; } - info.disk.raid_disk = info.disk.number; - if (info.disk.raid_disk < raiddisks) - info.disk.state = (1<<MD_DISK_ACTIVE) | + if (dnum == insert_point || + strcasecmp(dv->devname, "missing")==0) + continue; + + switch(pass) { + case 1: + *inf = info; + 
+ inf->disk.number = dnum; + inf->disk.raid_disk = dnum; + if (inf->disk.raid_disk < raiddisks) + inf->disk.state = (1<<MD_DISK_ACTIVE) | (1<<MD_DISK_SYNC); - else - info.disk.state = 0; - if (dv->writemostly) - info.disk.state |= (1<<MD_DISK_WRITEMOSTLY); + else + inf->disk.state = 0; + + if (dv->writemostly) + inf->disk.state |= (1<<MD_DISK_WRITEMOSTLY); + + if (st->ss->external && st->subarray[0]) + fd = open(dv->devname, O_RDWR); + else + fd = open(dv->devname, O_RDWR|O_EXCL); - if (dnum == insert_point || - strcasecmp(dv->devname, "missing")==0) { - info.disk.major = 0; - info.disk.minor = 0; - info.disk.state = (1<<MD_DISK_FAULTY); - } else { - fd = open(dv->devname, O_RDONLY|O_EXCL); if (fd < 0) { - fprintf(stderr, Name ": failed to open %s after earlier success - aborting\n", + fprintf(stderr, Name ": failed to open %s " + "after earlier success - aborting\n", dv->devname); return 1; } fstat(fd, &stb); - info.disk.major = major(stb.st_rdev); - info.disk.minor = minor(stb.st_rdev); + inf->disk.major = major(stb.st_rdev); + inf->disk.minor = minor(stb.st_rdev); + remove_partitions(fd); - close(fd); - } - switch(pass){ - case 1: - st->ss->add_to_super(st, &info.disk); + st->ss->add_to_super(st, &inf->disk, + fd, dv->devname); + st->ss->getinfo_super(st, inf); + + /* getinfo_super might have lost these ... 
*/ + inf->disk.major = major(stb.st_rdev); + inf->disk.minor = minor(stb.st_rdev); break; case 2: - if (info.disk.state == 1) break; - Kill(dv->devname, 0, 1); /* Just be sure it is clean */ - Kill(dv->devname, 0, 1); /* and again, there could be two superblocks */ - st->ss->write_init_super(st, &info.disk, - dv->devname); - - if (ioctl(mdfd, ADD_NEW_DISK, &info.disk)) { - fprintf(stderr, Name ": ADD_NEW_DISK for %s failed: %s\n", + inf->errors = 0; + rv = 0; + + if (st->ss->external) + rv = sysfs_add_disk(sra, inf); + else + rv = ioctl(mdfd, ADD_NEW_DISK, + &inf->disk); + + if (rv) { + fprintf(stderr, + Name ": ADD_NEW_DISK for %s " + "failed: %s\n", dv->devname, strerror(errno)); st->ss->free_super(st); return 1; } - break; } if (dv == moved_disk && dnum != insert_point) break; } + if (pass == 1) { + st->ss->write_init_super(st); + flush_metadata_updates(st); + } } + free(infos); st->ss->free_super(st); /* param is not actually used */ - if (runstop == 1 || subdevs >= raiddisks) { - mdu_param_t param; - if (ioctl(mdfd, RUN_ARRAY, ¶m)) { - fprintf(stderr, Name ": RUN_ARRAY failed: %s\n", - strerror(errno)); - Manage_runstop(mddev, mdfd, -1, 0); - return 1; + if (level == LEVEL_CONTAINER) + /* No need to start */ + ; + else if (runstop == 1 || subdevs >= raiddisks) { + if (st->ss->external) { + switch(level) { + case LEVEL_LINEAR: + case LEVEL_MULTIPATH: + case 0: + sysfs_set_str(sra, NULL, "array_state", + "active"); + need_mdmon = 0; + break; + default: + sysfs_set_str(sra, NULL, "array_state", + "readonly"); + break; + } + } else { + mdu_param_t param; + if (ioctl(mdfd, RUN_ARRAY, ¶m)) { + fprintf(stderr, Name ": RUN_ARRAY failed: %s\n", + strerror(errno)); + Manage_runstop(mddev, mdfd, -1, 0); + return 1; + } } if (verbose >= 0) fprintf(stderr, Name ": array %s started.\n", mddev); + if (st->ss->external && st->subarray[0]) { + if (need_mdmon) + start_mdmon(st->container_dev); + + ping_monitor(devnum2devname(st->container_dev)); + close(container_fd); + } } 
else { fprintf(stderr, Name ": not starting array - not enough devices.\n"); } @@ -69,7 +69,7 @@ int Grow_Add_device(char *devname, int fd, char *newdev) return 1; } - nfd = open(newdev, O_RDWR|O_EXCL); + nfd = open(newdev, O_RDWR|O_EXCL|O_DIRECT); if (nfd < 0) { fprintf(stderr, Name ": cannot open %s\n", newdev); return 1; diff --git a/Incremental.c b/Incremental.c index 0fb9afd3..9b222206 100644 --- a/Incremental.c +++ b/Incremental.c @@ -40,7 +40,7 @@ int Incremental(char *devname, int verbose, int runstop, struct supertype *st, char *homehost, int autof) { /* Add this device to an array, creating the array if necessary - * and starting the array if sensibe or - if runstop>0 - if possible. + * and starting the array if sensible or - if runstop>0 - if possible. * * This has several steps: * @@ -140,9 +140,17 @@ int Incremental(char *devname, int verbose, int runstop, close(dfd); return 1; } - st->ss->getinfo_super(st, &info); close (dfd); + if (st->ss->container_content) { + /* This is a pre-built container array, so we do something + * rather different. + */ + return Incremental_container(st, devname, verbose, runstop, + autof); + } + + st->ss->getinfo_super(st, &info); /* 3/ Check if there is a match in mdadm.conf */ array_list = conf_get_ident(NULL); @@ -229,6 +237,7 @@ int Incremental(char *devname, int verbose, int runstop, /* Have to guess a bit. */ int use_partitions = 1; char *np, *ep; + char *nm, nbuf[1024]; if ((autof&7) == 3 || (autof&7) == 5) use_partitions = 0; np = strchr(info.name, ':'); @@ -244,6 +253,24 @@ int Incremental(char *devname, int verbose, int runstop, } else devnum = -1; + if (match) + nm = match->devname; + else { + sprintf(nbuf, "/dev/md/%s", np); + nm = nbuf; + } + if (stat(nm, &stb) == 0 && + S_ISBLK(stb.st_mode) && + major(stb.st_rdev) == (use_partitions ? + get_mdp_major() : MD_MAJOR)) { + if (use_partitions) + devnum = minor(stb.st_rdev) >> MdpMinorShift; + else + devnum = minor(stb.st_rdev); + if (mddev_busy(use_partitions ? 
(-1-devnum) : devnum)) + devnum = -1; + } + if (devnum < 0) { /* Haven't found anything yet, choose something free */ devnum = find_free_devnum(use_partitions); @@ -273,12 +300,11 @@ int Incremental(char *devname, int verbose, int runstop, /* - add the device */ mdu_array_info_t ainf; mdu_disk_info_t disk; - char md[20]; struct mdinfo *sra; memset(&ainf, 0, sizeof(ainf)); - ainf.major_version = st->ss->major; - ainf.minor_version = st->minor_version; + ainf.major_version = info.array.major_version; + ainf.minor_version = info.array.minor_version; if (ioctl(mdfd, SET_ARRAY_INFO, &ainf) != 0) { fprintf(stderr, Name ": SET_ARRAY_INFO failed for %s: %s\b", @@ -286,9 +312,8 @@ int Incremental(char *devname, int verbose, int runstop, close(mdfd); return 2; } - sprintf(md, "%d.%d\n", st->ss->major, st->minor_version); sra = sysfs_read(mdfd, devnum, GET_VERSION); - sysfs_set_str(sra, NULL, "metadata_version", md); + sysfs_set_str(sra, NULL, "metadata_version", info.text_version); memset(&disk, 0, sizeof(disk)); disk.major = major(stb.st_rdev); disk.minor = minor(stb.st_rdev); @@ -325,29 +350,18 @@ int Incremental(char *devname, int verbose, int runstop, int err; struct mdinfo *sra; struct supertype *st2; - sra = sysfs_read(mdfd, devnum, (GET_VERSION | GET_DEVS | - GET_STATE)); + sra = sysfs_read(mdfd, devnum, (GET_DEVS | GET_STATE)); - if (sra->array.major_version != st->ss->major || - sra->array.minor_version != st->minor_version) { - if (verbose >= 0) - fprintf(stderr, Name - ": %s has different metadata to chosen array %s %d.%d %d.%d.\n", - devname, chosen_name, - sra->array.major_version, - sra->array.minor_version, - st->ss->major, st->minor_version); - close(mdfd); - return 1; - } sprintf(dn, "%d:%d", sra->devs->disk.major, sra->devs->disk.minor); dfd2 = dev_open(dn, O_RDONLY); st2 = dup_super(st); - if (st2->ss->load_super(st2, dfd2, NULL)) { + if (st2->ss->load_super(st2, dfd2, NULL) || + st->ss->compare_super(st, st2) != 0) { fprintf(stderr, Name - ": Strange 
error loading metadata for %s.\n", - chosen_name); + ": metadata mismatch between %s and " + "chosen array %s\n", + devname, chosen_name); close(mdfd); close(dfd2); return 2; @@ -385,8 +399,7 @@ int Incremental(char *devname, int verbose, int runstop, } /* 6/ Make sure /var/run/mdadm.map contains this array. */ map_update(&map, devnum, - info.array.major_version, - info.array.minor_version, + info.text_version, info.uuid, chosen_name); /* 7/ Is there enough devices to possibly start the array? */ @@ -620,8 +633,8 @@ void RebuildMap(void) path = map_dev(MD_MAJOR, md->devnum, 0); else path = map_dev(mdp, (-1-md->devnum)<< 6, 0); - map_add(&map, md->devnum, st->ss->major, - st->minor_version, + map_add(&map, md->devnum, + info.text_version, info.uuid, path ? : "/unknown"); st->ss->free_super(st); break; @@ -708,3 +721,136 @@ int IncrementalScan(int verbose) } return rv; } + +static char *container2devname(char *devname) +{ + int fd = open(devname, O_RDONLY); + char *mdname = NULL; + + if (fd >= 0) { + mdname = devnum2devname(fd2devnum(fd)); + close(fd); + } + + return mdname; +} + +int Incremental_container(struct supertype *st, char *devname, int verbose, + int runstop, int autof) +{ + /* Collect the contents of this container and for each + * array, choose a device name and assemble the array. 
+ */ + + struct mdinfo *list = st->ss->container_content(st); + struct mdinfo *ra; + char *mdname = container2devname(devname); + + if (!mdname) { + fprintf(stderr, Name": failed to determine device name\n"); + return 2; + } + + for (ra = list ; ra ; ra = ra->next) { + struct mdinfo *sra; + struct mdinfo *dev; + int devnum = -1; + int mdfd; + char chosen_name[1024]; + int usepart = 1; + char *n; + int working = 0; + char ver[100]; + + if ((autof&7) == 3 || (autof&7) == 5) + usepart = 0; + + n = ra->name; + if (*n == 'd') + n++; + if (*n) { + devnum = strtoul(n, &n, 10); + if (devnum >= 0 && (*n == 0 || *n == ' ')) { + /* Use this devnum */ + usepart = (ra->name[0] == 'd'); + if (mddev_busy(usepart ? (-1-devnum) : devnum)) + devnum = -1; + } else + devnum = -1; + } + + if (devnum < 0) { + char *nm = ra->name; + char nbuf[1024]; + struct stat stb; + if (strchr(nm, ':')) + nm = strchr(nm, ':')+1; + sprintf(nbuf, "/dev/md/%s", nm); + + if (stat(nbuf, &stb) == 0 && + S_ISBLK(stb.st_mode) && + major(stb.st_rdev) == (usepart ? + get_mdp_major() : MD_MAJOR)){ + if (usepart) + devnum = minor(stb.st_rdev) + >> MdpMinorShift; + else + devnum = minor(stb.st_rdev); + if (mddev_busy(usepart ? (-1-devnum) : devnum)) + devnum = -1; + } + } + + if (devnum >= 0) + devnum = usepart ? 
(-1-devnum) : devnum; + else + devnum = find_free_devnum(usepart); + mdfd = open_mddev_devnum(NULL, devnum, ra->name, + chosen_name, autof>>3); + + if (mdfd < 0) { + fprintf(stderr, Name ": failed to open %s: %s.\n", + chosen_name, strerror(errno)); + return 2; + } + + sra = sysfs_read(mdfd, 0, 0); + + sprintf(ver, "external:%s", ra->text_version); + sysfs_set_str(sra, NULL, "metadata_version", ver); + + sysfs_set_array(sra, ra); + for (dev = ra->devs; dev; dev = dev->next) + if (sysfs_add_disk(sra, dev) == 0) + working++; + + if (runstop > 0 || working >= ra->array.working_disks) { + switch(ra->array.level) { + case LEVEL_LINEAR: + case LEVEL_MULTIPATH: + case 0: + sysfs_set_str(sra, NULL, "array_state", + "active"); + break; + default: + sysfs_set_str(sra, NULL, "array_state", + "readonly"); + /* start mdmon if needed. */ + if (!mdmon_running(st->container_dev)) + start_mdmon(st->container_dev); + ping_monitor(devnum2devname(st->container_dev)); + break; + } + if (verbose >= 0) + printf("Started %s with %d devices\n", + chosen_name, working); + /* FIXME should have an O_EXCL and wait for read-auto */ + } else + if (verbose >= 0) + printf("%s assembled with %d devices but " + "not started\n", + chosen_name, working); + close(mdfd); + } + return 0; +} @@ -34,7 +34,7 @@ #include "md_u.h" #include "md_p.h" -int Kill(char *dev, int force, int quiet) +int Kill(char *dev, int force, int quiet, int noexcl) { /* * Nothing fancy about Kill. It just zeroes out a superblock @@ -44,7 +44,7 @@ int Kill(char *dev, int force, int quiet) int fd, rv = 0; struct supertype *st; - fd = open(dev, O_RDWR|O_EXCL); + fd = open(dev, O_DIRECT | (noexcl ? 
O_RDWR : (O_RDWR|O_EXCL))); if (fd < 0) { if (!quiet) fprintf(stderr, Name ": Couldn't open %s for write - not zeroing\n", @@ -63,10 +63,8 @@ int Kill(char *dev, int force, int quiet) if (force && rv >= 2) rv = 0; /* ignore bad data in superblock */ if (rv== 0 || (force && rv >= 2)) { - mdu_array_info_t info; - info.major_version = -1; /* zero superblock */ st->ss->free_super(st); - st->ss->init_super(st, &info, 0, "", NULL, NULL); + st->ss->init_super(st, NULL, 0, "", NULL, NULL); if (st->ss->store_super(st, fd)) { if (!quiet) fprintf(stderr, Name ": Could not zero superblock on %s\n", @@ -69,19 +69,24 @@ MAN8DIR = $(MANDIR)/man8 OBJS = mdadm.o config.o mdstat.o ReadMe.o util.o Manage.o Assemble.o Build.o \ Create.o Detail.o Examine.o Grow.o Monitor.o dlink.o Kill.o Query.o \ Incremental.o \ - mdopen.o super0.o super1.o bitmap.o restripe.o sysfs.o sha1.o \ - mapfile.o + mdopen.o super0.o super1.o super-ddf.o super-intel.o bitmap.o \ + restripe.o sysfs.o sha1.o mapfile.o crc32.o sg_io.o msg.o SRCS = mdadm.c config.c mdstat.c ReadMe.c util.c Manage.c Assemble.c Build.c \ Create.c Detail.c Examine.c Grow.c Monitor.c dlink.c Kill.c Query.c \ Incremental.c \ - mdopen.c super0.c super1.c bitmap.c restripe.c sysfs.c sha1.c \ - mapfile.c + mdopen.c super0.c super1.c super-ddf.c super-intel.c bitmap.c \ + restripe.c sysfs.c sha1.c mapfile.c crc32.c sg_io.c msg.c + +MON_OBJS = mdmon.o monitor.o managemon.o util.o mdstat.o sysfs.o config.o \ + Kill.o sg_io.o dlink.o ReadMe.o super0.o super1.o super-intel.o \ + super-ddf.o sha1.o crc32.o msg.o + STATICSRC = pwgr.c STATICOBJS = pwgr.o ASSEMBLE_SRCS := mdassemble.c Assemble.c Manage.c config.c dlink.c util.c \ - super0.c super1.c sha1.c + super0.c super1.c super-ddf.c super-intel.c sha1.c crc32.c sg_io.c ASSEMBLE_AUTO_SRCS := mdopen.c mdstat.c sysfs.c ASSEMBLE_FLAGS:= $(CFLAGS) -DMDASSEMBLE ifdef MDASSEMBLE_AUTO @@ -89,7 +94,7 @@ ASSEMBLE_SRCS += $(ASSEMBLE_AUTO_SRCS) ASSEMBLE_FLAGS += -DMDASSEMBLE_AUTO endif -all : mdadm 
mdadm.man md.man mdadm.conf.man +all : mdadm mdmon mdadm.man md.man mdadm.conf.man everything: all mdadm.static swap_super test_stripe \ mdassemble mdassemble.auto mdassemble.static mdassemble.man \ @@ -119,6 +124,10 @@ mdadm.Os : $(SRCS) mdadm.h mdadm.O2 : $(SRCS) mdadm.h gcc -o mdadm.O2 $(CFLAGS) -DHAVE_STDINT_H -O2 $(SRCS) +mdmon : $(MON_OBJS) + $(CC) $(LDFLAGS) -o mdmon $(MON_OBJS) $(LDLIBS) +msg.o: msg.c msg.h + test_stripe : restripe.c mdadm.h $(CC) $(CXFLAGS) $(LDFLAGS) -o test_stripe -DMAIN restripe.c @@ -161,8 +170,9 @@ $(OBJS) : mdadm.h bitmap.h sha1.o : sha1.c sha1.h md5.h $(CC) $(CFLAGS) -DHAVE_STDINT_H -o sha1.o -c sha1.c -install : mdadm install-man +install : mdadm mdmon install-man $(INSTALL) -D $(STRIP) -m 755 mdadm $(DESTDIR)$(BINDIR)/mdadm + $(INSTALL) -D $(STRIP) -m 755 mdmon $(DESTDIR)$(BINDIR)/mdmon install-static : mdadm.static install-man $(INSTALL) -D $(STRIP) -m 755 mdadm.static $(DESTDIR)$(BINDIR)/mdadm @@ -188,7 +198,8 @@ test: mdadm test_stripe swap_super @echo "Please run 'sh ./test' as root" clean : - rm -f mdadm $(OBJS) $(STATICOBJS) core *.man mdadm.tcc mdadm.uclibc mdadm.static *.orig *.porig *.rej *.alt \ + rm -f mdadm mdmon $(OBJS) $(MON_OBJS) $(STATICOBJS) core *.man \ + mdadm.tcc mdadm.uclibc mdadm.static *.orig *.porig *.rej *.alt \ mdadm.Os mdadm.O2 \ mdassemble mdassemble.static mdassemble.auto mdassemble.uclibc \ mdassemble.klibc swap_super \ @@ -78,13 +78,18 @@ int Manage_runstop(char *devname, int fd, int runstop, int quiet) { /* Run or stop the array. array must already be configured * required >= 0.90.0 + * Only print failure messages if quiet == 0; + * quiet > 0 means really be quiet + * quiet < 0 means we will try again if it fails. 
*/ mdu_param_t param; /* unused */ if (runstop == -1 && md_get_version(fd) < 9000) { if (ioctl(fd, STOP_MD, 0)) { - if (!quiet) fprintf(stderr, Name ": stopping device %s failed: %s\n", - devname, strerror(errno)); + if (quiet == 0) fprintf(stderr, + Name ": stopping device %s " + "failed: %s\n", + devname, strerror(errno)); return 1; } } @@ -111,9 +116,51 @@ int Manage_runstop(char *devname, int fd, int runstop, int quiet) } else if (runstop < 0){ struct map_ent *map = NULL; struct stat stb; - if (ioctl(fd, STOP_ARRAY, NULL)) { - if (quiet==0) { - fprintf(stderr, Name ": fail to stop array %s: %s\n", + struct mdinfo *mdi; + /* If this is an mdmon managed array, just write 'inactive' + * to the array state and let mdmon clear up. + */ + mdi = sysfs_read(fd, -1, GET_LEVEL|GET_VERSION); + if (mdi && + mdi->array.level > 0 && + mdi->text_version[0] == '/') { + char *cp; + + /* This is mdmon managed. */ + close(fd); + if (sysfs_set_str(mdi, NULL, + "array_state", "inactive") < 0) { + if (quiet == 0) + fprintf(stderr, Name + ": failed to stop array %s: %s\n", + devname, strerror(errno)); + return 1; + } + + /* Give monitor a chance to act */ + cp = strchr(mdi->text_version+1, '/'); + if (*cp) + *cp = 0; + ping_monitor(mdi->text_version+1); + + fd = open(devname, O_RDONLY); + } else if (mdi && + mdi->array.major_version == -1 && + mdi->array.minor_version == -2 && + mdi->text_version[0] != '/') { + /* container, possibly mdmon-managed. 
+ * Make sure mdmon isn't opening it, which + * would interfere with the 'stop' + */ + ping_monitor(mdi->sys_name); + } + if (mdi) + sysfs_free(mdi); + + if (fd >= 0 && ioctl(fd, STOP_ARRAY, NULL)) { + if (quiet == 0) { + fprintf(stderr, Name + ": failed to stop array %s: %s\n", devname, strerror(errno)); if (errno == EBUSY) fprintf(stderr, "Perhaps a running " @@ -122,9 +169,10 @@ int Manage_runstop(char *devname, int fd, int runstop, int quiet) } return 1; } + if (quiet <= 0) fprintf(stderr, Name ": stopped %s\n", devname); - if (fstat(fd, &stb) == 0) { + if (fd >= 0 && fstat(fd, &stb) == 0) { int devnum; if (major(stb.st_rdev) == MD_MAJOR) devnum = minor(stb.st_rdev); @@ -201,6 +249,7 @@ int Manage_subdevs(char *devname, int fd, struct supertype *st, *tst; int duuid[4]; int ouuid[4]; + int lfd = -1; if (ioctl(fd, GET_ARRAY_INFO, &array)) { fprintf(stderr, Name ": cannot get array info for %s\n", @@ -227,6 +276,7 @@ int Manage_subdevs(char *devname, int fd, unsigned long long ldsize; char dvname[20]; char *dnprintable = dv->devname; + int err; next = dv->next; jnext = 0; @@ -311,9 +361,14 @@ int Manage_subdevs(char *devname, int fd, return 1; case 'a': /* add the device */ - + if (tst->subarray[0]) { + fprintf(stderr, Name ": Cannot add disks to a" + " \'member\' array, perform this" + " operation on the parent container\n"); + return 1; + } /* Make sure it isn't in use (in 2.6 or later) */ - tfd = open(dv->devname, O_RDONLY|O_EXCL); + tfd = open(dv->devname, O_RDONLY|O_EXCL|O_DIRECT); if (tfd < 0) { fprintf(stderr, Name ": Cannot open %s: %s\n", dv->devname, strerror(errno)); @@ -332,7 +387,9 @@ int Manage_subdevs(char *devname, int fd, } close(tfd); - if (array.major_version == 0 && + + if (!tst->ss->external && + array.major_version == 0 && md_get_version(fd)%100 < 2) { if (ioctl(fd, HOT_ADD_DISK, (unsigned long)stb.st_rdev)==0) { @@ -451,11 +508,14 @@ int Manage_subdevs(char *devname, int fd, disc.number =j; disc.state = 0; if (array.not_persistent==0) { + 
int dfd; if (dv->writemostly) disc.state |= 1 << MD_DISK_WRITEMOSTLY; - tst->ss->add_to_super(tst, &disc); - if (tst->ss->write_init_super(tst, &disc, - dv->devname)) + dfd = open(dv->devname, O_RDWR | O_EXCL|O_DIRECT); + tst->ss->add_to_super(tst, &disc, dfd, + dv->devname); + /* write_init_super will close 'dfd' */ + if (tst->ss->write_init_super(tst)) return 1; } else if (dv->re_add) { /* this had better be raid1. @@ -499,13 +559,70 @@ int Manage_subdevs(char *devname, int fd, case 'r': /* hot remove */ + if (tst->subarray[0]) { + fprintf(stderr, Name ": Cannot remove disks from a" + " \'member\' array, perform this" + " operation on the parent container\n"); + return 1; + } + if (tst->ss->external) { + /* To remove a device from a container, we must + * check that it isn't in use in an array. + * This involves looking in the 'holders' + * directory - there must be just one entry, + * the container. + * To ensure that it doesn't get used as a + * hold spare while we are checking, we + * get an O_EXCL open on the container + */ + int dnum = fd2devnum(fd); + lfd = open_dev_excl(dnum); + if (lfd < 0) { + fprintf(stderr, Name + ": Cannot get exclusive access " + " to container - odd\n"); + return 1; + } + if (!sysfs_unique_holder(dnum, stb.st_rdev)) { + fprintf(stderr, Name + ": %s is %s, cannot remove.\n", + dnprintable, + errno == EEXIST ? 
"still in use": + "not a member"); + close(lfd); + return 1; + } + } /* FIXME check that it is a current member */ - if (ioctl(fd, HOT_REMOVE_DISK, (unsigned long)stb.st_rdev)) { + err = ioctl(fd, HOT_REMOVE_DISK, (unsigned long)stb.st_rdev); + if (err && errno == ENODEV) { + /* Old kernels rejected this if no personality + * registered */ + struct mdinfo *sra = sysfs_read(fd, 0, GET_DEVS); + struct mdinfo *dv = NULL; + if (sra) + dv = sra->devs; + for ( ; dv ; dv=dv->next) + if (dv->disk.major == major(stb.st_rdev) && + dv->disk.minor == minor(stb.st_rdev)) + break; + if (dv) + err = sysfs_set_str(sra, dv, + "state", "remove"); + else + err = -1; + if (sra) + sysfs_free(sra); + } + if (err) { fprintf(stderr, Name ": hot remove failed " "for %s: %s\n", dnprintable, strerror(errno)); + if (lfd >= 0) + close(lfd); return 1; } + close(lfd); if (verbose >= 0) fprintf(stderr, Name ": hot removed %s\n", dnprintable); @@ -96,7 +96,7 @@ int Query(char *dev) if (superror == 0) { /* array might be active... */ st->ss->getinfo_super(st, &info); - if (st->ss->major == 0) { + if (st->ss == &super0) { mddev = get_md_name(info.array.md_minor); disc.number = info.disk.number; activity = "undetected"; @@ -121,7 +121,7 @@ int Query(char *dev) activity, map_num(pers, info.array.level), mddev); - if (st->ss->major == 0) + if (st->ss == &super0) put_md_name(mddev); } return 0; @@ -612,6 +612,7 @@ mapping_t pers[] = { { "raid10", 10}, { "10", 10}, { "faulty", LEVEL_FAULTY}, + { "container", LEVEL_CONTAINER}, { NULL, 0} }; @@ -1,3 +1,38 @@ + - add 'name' field to metadata type and use it. + - use validate_geometry more + - metadata should be able to check/reject bitmap stuff. + +DDF: + Three new metadata types: + ddf - used only to create a container. + ddf-bvd - used to create an array in a container + ddf-svd - used to create a secondary array from bvds. 
+ + Usage: + mdadm -C /dev/ddf1 /dev/sd[abcdef] + mdadm -C /dev/md1 -e ddf /dev/sd[a-f] + mdadm -C /dev/md1 -l container /dev/sd[a-f] + + Each of these creates a new ddf container using all those + devices. The name 'ddf*' signals that ddf metadata should be used. + '-e ddf' only supports one level - 'container'. 'container' is only + supported by ddf. + + mdadm -C /dev/md1 -l0 -n4 /dev/ddf1 # or maybe not ??? + mdadm -C /dev/md1 -l1 -n2 /dev/sda /dev/sdb + If exactly one device is given, and it is a container, we select + devices from that container. + If devices are given that are already in use, they must be in use by + a container, and the array is created in the container. + If devices given are bvds, we slip under the hood to make + the svd arrays. + + mdadm -A /dev/ddf ...... + Base drives make a container. Anything in that container is started + auto-read-only. + If /dev/ddf is already assembled, we assemble bvds and svds inside it. + + 2005-dec-20 Want an incremental assembly mode to work nicely with udev. 
Core usage would be something like @@ -122,11 +122,10 @@ bitmap_info_t *bitmap_fd_read(int fd, int brief) */ unsigned long long total_bits = 0, read_bits = 0, dirty_bits = 0; bitmap_info_t *info; - char *buf, *unaligned; + void *buf; int n, skip; - unaligned = malloc(8192*2); - buf = (char*) ((unsigned long)unaligned | 8191)+1; + posix_memalign(&buf, 512, 8192); n = read(fd, buf, 8192); info = malloc(sizeof(*info)); @@ -145,7 +144,6 @@ bitmap_info_t *bitmap_fd_read(int fd, int brief) fprintf(stderr, Name ": failed to read superblock of bitmap " "file: %s\n", strerror(errno)); free(info); - free(unaligned); return NULL; } memcpy(&info->sb, buf, sizeof(info->sb)); diff --git a/crc32.c b/crc32.c new file mode 100644 index 00000000..12d08e52 --- /dev/null +++ b/crc32.c @@ -0,0 +1,340 @@ +/* crc32.c -- compute the CRC-32 of a data stream + * Copyright (C) 1995-2003 Mark Adler + * For conditions of distribution and use, see copyright notice in zlib.h + * + * Thanks to Rodney Brown <rbrown64@csc.com.au> for his contribution of faster + * CRC methods: exclusive-oring 32 bits of data at a time, and pre-computing + * tables for updating the shift register in one step with three exclusive-ors + * instead of four steps with four exclusive-ors. This results about a factor + * of two increase in speed on a Power PC G4 (PPC7455) using gcc -O3. + */ + +/* @(#) $Id$ */ + +/* + Note on the use of DYNAMIC_CRC_TABLE: there is no mutex or semaphore + protection on the static variables used to control the first-use generation + of the crc tables. Therefore, if you #define DYNAMIC_CRC_TABLE, you should + first call get_crc_table() to initialize the tables before allowing more than + one thread to use crc32(). 
+ */ + +#ifdef MAKECRCH +# include <stdio.h> +# ifndef DYNAMIC_CRC_TABLE +# define DYNAMIC_CRC_TABLE +# endif /* !DYNAMIC_CRC_TABLE */ +#endif /* MAKECRCH */ + +/* #include "zutil.h" / * for STDC and FAR definitions */ +#define STDC +#define FAR +#define Z_NULL ((void*)0) +#define OF(X) X +#define ZEXPORT +typedef long ptrdiff_t; +#define NOBYFOUR + +#define local static + +/* Find a four-byte integer type for crc32_little() and crc32_big(). */ +#ifndef NOBYFOUR +# ifdef STDC /* need ANSI C limits.h to determine sizes */ +# include <limits.h> +# define BYFOUR +# if (UINT_MAX == 0xffffffffUL) + typedef unsigned int u4; +# else +# if (ULONG_MAX == 0xffffffffUL) + typedef unsigned long u4; +# else +# if (USHRT_MAX == 0xffffffffUL) + typedef unsigned short u4; +# else +# undef BYFOUR /* can't find a four-byte integer type! */ +# endif +# endif +# endif +# endif /* STDC */ +#endif /* !NOBYFOUR */ + +/* Definitions for doing the crc four data bytes at a time. */ +#ifdef BYFOUR +# define REV(w) (((w)>>24)+(((w)>>8)&0xff00)+ \ + (((w)&0xff00)<<8)+(((w)&0xff)<<24)) + local unsigned long crc32_little OF((unsigned long, + const unsigned char FAR *, unsigned)); + local unsigned long crc32_big OF((unsigned long, + const unsigned char FAR *, unsigned)); +# define TBLS 8 +#else +# define TBLS 1 +#endif /* BYFOUR */ + +#ifdef DYNAMIC_CRC_TABLE + +local volatile int crc_table_empty = 1; +local unsigned long FAR crc_table[TBLS][256]; +local void make_crc_table OF((void)); +#ifdef MAKECRCH + local void write_table OF((FILE *, const unsigned long FAR *)); +#endif /* MAKECRCH */ + +/* + Generate tables for a byte-wise 32-bit CRC calculation on the polynomial: + x^32+x^26+x^23+x^22+x^16+x^12+x^11+x^10+x^8+x^7+x^5+x^4+x^2+x+1. + + Polynomials over GF(2) are represented in binary, one bit per coefficient, + with the lowest powers in the most significant bit. Then adding polynomials + is just exclusive-or, and multiplying a polynomial by x is a right shift by + one. 
If we call the above polynomial p, and represent a byte as the + polynomial q, also with the lowest power in the most significant bit (so the + byte 0xb1 is the polynomial x^7+x^3+x^2+1), then the CRC is (q*x^32) mod p, + where a mod b means the remainder after dividing a by b. + + This calculation is done using the shift-register method of multiplying and + taking the remainder. The register is initialized to zero, and for each + incoming bit, x^32 is added mod p to the register if the bit is a one (where + x^32 mod p is p+x^32 = x^26+...+1), and the register is multiplied mod p by + x (which is shifting right by one and adding x^32 mod p if the bit shifted + out is a one). We start with the highest power (least significant bit) of + q and repeat for all eight bits of q. + + The first table is simply the CRC of all possible eight bit values. This is + all the information needed to generate CRCs on data a byte at a time for all + combinations of CRC register values and incoming bytes. The remaining tables + allow for word-at-a-time CRC calculation for both big-endian and little- + endian machines, where a word is four bytes. 
+*/ +local void make_crc_table() +{ + unsigned long c; + int n, k; + unsigned long poly; /* polynomial exclusive-or pattern */ + /* terms of polynomial defining this crc (except x^32): */ + static volatile int first = 1; /* flag to limit concurrent making */ + static const unsigned char p[] = {0,1,2,4,5,7,8,10,11,12,16,22,23,26}; + + /* See if another task is already doing this (not thread-safe, but better + than nothing -- significantly reduces duration of vulnerability in + case the advice about DYNAMIC_CRC_TABLE is ignored) */ + if (first) { + first = 0; + + /* make exclusive-or pattern from polynomial (0xedb88320UL) */ + poly = 0UL; + for (n = 0; n < sizeof(p)/sizeof(unsigned char); n++) + poly |= 1UL << (31 - p[n]); + + /* generate a crc for every 8-bit value */ + for (n = 0; n < 256; n++) { + c = (unsigned long)n; + for (k = 0; k < 8; k++) + c = c & 1 ? poly ^ (c >> 1) : c >> 1; + crc_table[0][n] = c; + } + +#ifdef BYFOUR + /* generate crc for each value followed by one, two, and three zeros, + and then the byte reversal of those as well as the first table */ + for (n = 0; n < 256; n++) { + c = crc_table[0][n]; + crc_table[4][n] = REV(c); + for (k = 1; k < 4; k++) { + c = crc_table[0][c & 0xff] ^ (c >> 8); + crc_table[k][n] = c; + crc_table[k + 4][n] = REV(c); + } + } +#endif /* BYFOUR */ + + crc_table_empty = 0; + } + else { /* not first */ + /* wait for the other guy to finish (not efficient, but rare) */ + while (crc_table_empty) + ; + } + +#ifdef MAKECRCH + /* write out CRC tables to crc32.h */ + { + FILE *out; + + out = fopen("crc32.h", "w"); + if (out == NULL) return; + fprintf(out, "/* crc32.h -- tables for rapid CRC calculation\n"); + fprintf(out, " * Generated automatically by crc32.c\n */\n\n"); + fprintf(out, "local const unsigned long FAR "); + fprintf(out, "crc_table[TBLS][256] =\n{\n {\n"); + write_table(out, crc_table[0]); +# ifdef BYFOUR + fprintf(out, "#ifdef BYFOUR\n"); + for (k = 1; k < 8; k++) { + fprintf(out, " },\n {\n"); + 
write_table(out, crc_table[k]); + } + fprintf(out, "#endif\n"); +# endif /* BYFOUR */ + fprintf(out, " }\n};\n"); + fclose(out); + } +#endif /* MAKECRCH */ +} + +#ifdef MAKECRCH +local void write_table(out, table) + FILE *out; + const unsigned long FAR *table; +{ + int n; + + for (n = 0; n < 256; n++) + fprintf(out, "%s0x%08lxUL%s", n % 5 ? "" : " ", table[n], + n == 255 ? "\n" : (n % 5 == 4 ? ",\n" : ", ")); +} +#endif /* MAKECRCH */ + +#else /* !DYNAMIC_CRC_TABLE */ +/* ======================================================================== + * Tables of CRC-32s of all single-byte values, made by make_crc_table(). + */ +#include "crc32.h" +#endif /* DYNAMIC_CRC_TABLE */ + +/* ========================================================================= + * This function can be used by asm versions of crc32() + */ +const unsigned long FAR * ZEXPORT get_crc_table(void) +{ +#ifdef DYNAMIC_CRC_TABLE + if (crc_table_empty) + make_crc_table(); +#endif /* DYNAMIC_CRC_TABLE */ + return (const unsigned long FAR *)crc_table; +} + +/* ========================================================================= */ +#define DO1 crc = crc_table[0][((int)crc ^ (*buf++)) & 0xff] ^ (crc >> 8) +#define DO8 DO1; DO1; DO1; DO1; DO1; DO1; DO1; DO1 + +/* ========================================================================= */ +unsigned long ZEXPORT crc32( + unsigned long crc, + const unsigned char FAR *buf, + unsigned len) +{ + if (buf == Z_NULL) return 0UL; + +#ifdef DYNAMIC_CRC_TABLE + if (crc_table_empty) + make_crc_table(); +#endif /* DYNAMIC_CRC_TABLE */ + +#ifdef BYFOUR + if (sizeof(void *) == sizeof(ptrdiff_t)) { + u4 endian; + + endian = 1; + if (*((unsigned char *)(&endian))) + return crc32_little(crc, buf, len); + else + return crc32_big(crc, buf, len); + } +#endif /* BYFOUR */ +/* crc = crc ^ 0xffffffffUL;*/ + while (len >= 8) { + DO8; + len -= 8; + } + if (len) do { + DO1; + } while (--len); + return crc /* ^ 0xffffffffUL*/; +} + +#ifdef BYFOUR + +/* 
========================================================================= */ +#define DOLIT4 c ^= *buf4++; \ + c = crc_table[3][c & 0xff] ^ crc_table[2][(c >> 8) & 0xff] ^ \ + crc_table[1][(c >> 16) & 0xff] ^ crc_table[0][c >> 24] +#define DOLIT32 DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4 + +/* ========================================================================= */ +local unsigned long crc32_little(crc, buf, len) + unsigned long crc; + const unsigned char FAR *buf; + unsigned len; +{ + register u4 c; + register const u4 FAR *buf4; + + c = (u4)crc; + c = ~c; + while (len && ((ptrdiff_t)buf & 3)) { + c = crc_table[0][(c ^ *buf++) & 0xff] ^ (c >> 8); + len--; + } + + buf4 = (const u4 FAR *)buf; + while (len >= 32) { + DOLIT32; + len -= 32; + } + while (len >= 4) { + DOLIT4; + len -= 4; + } + buf = (const unsigned char FAR *)buf4; + + if (len) do { + c = crc_table[0][(c ^ *buf++) & 0xff] ^ (c >> 8); + } while (--len); + c = ~c; + return (unsigned long)c; +} + +/* ========================================================================= */ +#define DOBIG4 c ^= *++buf4; \ + c = crc_table[4][c & 0xff] ^ crc_table[5][(c >> 8) & 0xff] ^ \ + crc_table[6][(c >> 16) & 0xff] ^ crc_table[7][c >> 24] +#define DOBIG32 DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4 + +/* ========================================================================= */ +local unsigned long crc32_big(crc, buf, len) + unsigned long crc; + const unsigned char FAR *buf; + unsigned len; +{ + register u4 c; + register const u4 FAR *buf4; + + c = REV((u4)crc); + c = ~c; + while (len && ((ptrdiff_t)buf & 3)) { + c = crc_table[4][(c >> 24) ^ *buf++] ^ (c << 8); + len--; + } + + buf4 = (const u4 FAR *)buf; + buf4--; + while (len >= 32) { + DOBIG32; + len -= 32; + } + while (len >= 4) { + DOBIG4; + len -= 4; + } + buf4++; + buf = (const unsigned char FAR *)buf4; + + if (len) do { + c = crc_table[4][(c >> 24) ^ *buf++] ^ (c << 8); + } while (--len); + c = ~c; + return 
(unsigned long)(REV(c)); +} + +#endif /* BYFOUR */ diff --git a/crc32.h b/crc32.h new file mode 100644 index 00000000..8053b611 --- /dev/null +++ b/crc32.h @@ -0,0 +1,441 @@ +/* crc32.h -- tables for rapid CRC calculation + * Generated automatically by crc32.c + */ + +local const unsigned long FAR crc_table[TBLS][256] = +{ + { + 0x00000000UL, 0x77073096UL, 0xee0e612cUL, 0x990951baUL, 0x076dc419UL, + 0x706af48fUL, 0xe963a535UL, 0x9e6495a3UL, 0x0edb8832UL, 0x79dcb8a4UL, + 0xe0d5e91eUL, 0x97d2d988UL, 0x09b64c2bUL, 0x7eb17cbdUL, 0xe7b82d07UL, + 0x90bf1d91UL, 0x1db71064UL, 0x6ab020f2UL, 0xf3b97148UL, 0x84be41deUL, + 0x1adad47dUL, 0x6ddde4ebUL, 0xf4d4b551UL, 0x83d385c7UL, 0x136c9856UL, + 0x646ba8c0UL, 0xfd62f97aUL, 0x8a65c9ecUL, 0x14015c4fUL, 0x63066cd9UL, + 0xfa0f3d63UL, 0x8d080df5UL, 0x3b6e20c8UL, 0x4c69105eUL, 0xd56041e4UL, + 0xa2677172UL, 0x3c03e4d1UL, 0x4b04d447UL, 0xd20d85fdUL, 0xa50ab56bUL, + 0x35b5a8faUL, 0x42b2986cUL, 0xdbbbc9d6UL, 0xacbcf940UL, 0x32d86ce3UL, + 0x45df5c75UL, 0xdcd60dcfUL, 0xabd13d59UL, 0x26d930acUL, 0x51de003aUL, + 0xc8d75180UL, 0xbfd06116UL, 0x21b4f4b5UL, 0x56b3c423UL, 0xcfba9599UL, + 0xb8bda50fUL, 0x2802b89eUL, 0x5f058808UL, 0xc60cd9b2UL, 0xb10be924UL, + 0x2f6f7c87UL, 0x58684c11UL, 0xc1611dabUL, 0xb6662d3dUL, 0x76dc4190UL, + 0x01db7106UL, 0x98d220bcUL, 0xefd5102aUL, 0x71b18589UL, 0x06b6b51fUL, + 0x9fbfe4a5UL, 0xe8b8d433UL, 0x7807c9a2UL, 0x0f00f934UL, 0x9609a88eUL, + 0xe10e9818UL, 0x7f6a0dbbUL, 0x086d3d2dUL, 0x91646c97UL, 0xe6635c01UL, + 0x6b6b51f4UL, 0x1c6c6162UL, 0x856530d8UL, 0xf262004eUL, 0x6c0695edUL, + 0x1b01a57bUL, 0x8208f4c1UL, 0xf50fc457UL, 0x65b0d9c6UL, 0x12b7e950UL, + 0x8bbeb8eaUL, 0xfcb9887cUL, 0x62dd1ddfUL, 0x15da2d49UL, 0x8cd37cf3UL, + 0xfbd44c65UL, 0x4db26158UL, 0x3ab551ceUL, 0xa3bc0074UL, 0xd4bb30e2UL, + 0x4adfa541UL, 0x3dd895d7UL, 0xa4d1c46dUL, 0xd3d6f4fbUL, 0x4369e96aUL, + 0x346ed9fcUL, 0xad678846UL, 0xda60b8d0UL, 0x44042d73UL, 0x33031de5UL, + 0xaa0a4c5fUL, 0xdd0d7cc9UL, 0x5005713cUL, 0x270241aaUL, 0xbe0b1010UL, + 
0xc90c2086UL, 0x5768b525UL, 0x206f85b3UL, 0xb966d409UL, 0xce61e49fUL, + 0x5edef90eUL, 0x29d9c998UL, 0xb0d09822UL, 0xc7d7a8b4UL, 0x59b33d17UL, + 0x2eb40d81UL, 0xb7bd5c3bUL, 0xc0ba6cadUL, 0xedb88320UL, 0x9abfb3b6UL, + 0x03b6e20cUL, 0x74b1d29aUL, 0xead54739UL, 0x9dd277afUL, 0x04db2615UL, + 0x73dc1683UL, 0xe3630b12UL, 0x94643b84UL, 0x0d6d6a3eUL, 0x7a6a5aa8UL, + 0xe40ecf0bUL, 0x9309ff9dUL, 0x0a00ae27UL, 0x7d079eb1UL, 0xf00f9344UL, + 0x8708a3d2UL, 0x1e01f268UL, 0x6906c2feUL, 0xf762575dUL, 0x806567cbUL, + 0x196c3671UL, 0x6e6b06e7UL, 0xfed41b76UL, 0x89d32be0UL, 0x10da7a5aUL, + 0x67dd4accUL, 0xf9b9df6fUL, 0x8ebeeff9UL, 0x17b7be43UL, 0x60b08ed5UL, + 0xd6d6a3e8UL, 0xa1d1937eUL, 0x38d8c2c4UL, 0x4fdff252UL, 0xd1bb67f1UL, + 0xa6bc5767UL, 0x3fb506ddUL, 0x48b2364bUL, 0xd80d2bdaUL, 0xaf0a1b4cUL, + 0x36034af6UL, 0x41047a60UL, 0xdf60efc3UL, 0xa867df55UL, 0x316e8eefUL, + 0x4669be79UL, 0xcb61b38cUL, 0xbc66831aUL, 0x256fd2a0UL, 0x5268e236UL, + 0xcc0c7795UL, 0xbb0b4703UL, 0x220216b9UL, 0x5505262fUL, 0xc5ba3bbeUL, + 0xb2bd0b28UL, 0x2bb45a92UL, 0x5cb36a04UL, 0xc2d7ffa7UL, 0xb5d0cf31UL, + 0x2cd99e8bUL, 0x5bdeae1dUL, 0x9b64c2b0UL, 0xec63f226UL, 0x756aa39cUL, + 0x026d930aUL, 0x9c0906a9UL, 0xeb0e363fUL, 0x72076785UL, 0x05005713UL, + 0x95bf4a82UL, 0xe2b87a14UL, 0x7bb12baeUL, 0x0cb61b38UL, 0x92d28e9bUL, + 0xe5d5be0dUL, 0x7cdcefb7UL, 0x0bdbdf21UL, 0x86d3d2d4UL, 0xf1d4e242UL, + 0x68ddb3f8UL, 0x1fda836eUL, 0x81be16cdUL, 0xf6b9265bUL, 0x6fb077e1UL, + 0x18b74777UL, 0x88085ae6UL, 0xff0f6a70UL, 0x66063bcaUL, 0x11010b5cUL, + 0x8f659effUL, 0xf862ae69UL, 0x616bffd3UL, 0x166ccf45UL, 0xa00ae278UL, + 0xd70dd2eeUL, 0x4e048354UL, 0x3903b3c2UL, 0xa7672661UL, 0xd06016f7UL, + 0x4969474dUL, 0x3e6e77dbUL, 0xaed16a4aUL, 0xd9d65adcUL, 0x40df0b66UL, + 0x37d83bf0UL, 0xa9bcae53UL, 0xdebb9ec5UL, 0x47b2cf7fUL, 0x30b5ffe9UL, + 0xbdbdf21cUL, 0xcabac28aUL, 0x53b39330UL, 0x24b4a3a6UL, 0xbad03605UL, + 0xcdd70693UL, 0x54de5729UL, 0x23d967bfUL, 0xb3667a2eUL, 0xc4614ab8UL, + 0x5d681b02UL, 0x2a6f2b94UL, 0xb40bbe37UL, 0xc30c8ea1UL, 
0x5a05df1bUL, + 0x2d02ef8dUL +#ifdef BYFOUR + }, + { + 0x00000000UL, 0x191b3141UL, 0x32366282UL, 0x2b2d53c3UL, 0x646cc504UL, + 0x7d77f445UL, 0x565aa786UL, 0x4f4196c7UL, 0xc8d98a08UL, 0xd1c2bb49UL, + 0xfaefe88aUL, 0xe3f4d9cbUL, 0xacb54f0cUL, 0xb5ae7e4dUL, 0x9e832d8eUL, + 0x87981ccfUL, 0x4ac21251UL, 0x53d92310UL, 0x78f470d3UL, 0x61ef4192UL, + 0x2eaed755UL, 0x37b5e614UL, 0x1c98b5d7UL, 0x05838496UL, 0x821b9859UL, + 0x9b00a918UL, 0xb02dfadbUL, 0xa936cb9aUL, 0xe6775d5dUL, 0xff6c6c1cUL, + 0xd4413fdfUL, 0xcd5a0e9eUL, 0x958424a2UL, 0x8c9f15e3UL, 0xa7b24620UL, + 0xbea97761UL, 0xf1e8e1a6UL, 0xe8f3d0e7UL, 0xc3de8324UL, 0xdac5b265UL, + 0x5d5daeaaUL, 0x44469febUL, 0x6f6bcc28UL, 0x7670fd69UL, 0x39316baeUL, + 0x202a5aefUL, 0x0b07092cUL, 0x121c386dUL, 0xdf4636f3UL, 0xc65d07b2UL, + 0xed705471UL, 0xf46b6530UL, 0xbb2af3f7UL, 0xa231c2b6UL, 0x891c9175UL, + 0x9007a034UL, 0x179fbcfbUL, 0x0e848dbaUL, 0x25a9de79UL, 0x3cb2ef38UL, + 0x73f379ffUL, 0x6ae848beUL, 0x41c51b7dUL, 0x58de2a3cUL, 0xf0794f05UL, + 0xe9627e44UL, 0xc24f2d87UL, 0xdb541cc6UL, 0x94158a01UL, 0x8d0ebb40UL, + 0xa623e883UL, 0xbf38d9c2UL, 0x38a0c50dUL, 0x21bbf44cUL, 0x0a96a78fUL, + 0x138d96ceUL, 0x5ccc0009UL, 0x45d73148UL, 0x6efa628bUL, 0x77e153caUL, + 0xbabb5d54UL, 0xa3a06c15UL, 0x888d3fd6UL, 0x91960e97UL, 0xded79850UL, + 0xc7cca911UL, 0xece1fad2UL, 0xf5facb93UL, 0x7262d75cUL, 0x6b79e61dUL, + 0x4054b5deUL, 0x594f849fUL, 0x160e1258UL, 0x0f152319UL, 0x243870daUL, + 0x3d23419bUL, 0x65fd6ba7UL, 0x7ce65ae6UL, 0x57cb0925UL, 0x4ed03864UL, + 0x0191aea3UL, 0x188a9fe2UL, 0x33a7cc21UL, 0x2abcfd60UL, 0xad24e1afUL, + 0xb43fd0eeUL, 0x9f12832dUL, 0x8609b26cUL, 0xc94824abUL, 0xd05315eaUL, + 0xfb7e4629UL, 0xe2657768UL, 0x2f3f79f6UL, 0x362448b7UL, 0x1d091b74UL, + 0x04122a35UL, 0x4b53bcf2UL, 0x52488db3UL, 0x7965de70UL, 0x607eef31UL, + 0xe7e6f3feUL, 0xfefdc2bfUL, 0xd5d0917cUL, 0xcccba03dUL, 0x838a36faUL, + 0x9a9107bbUL, 0xb1bc5478UL, 0xa8a76539UL, 0x3b83984bUL, 0x2298a90aUL, + 0x09b5fac9UL, 0x10aecb88UL, 0x5fef5d4fUL, 0x46f46c0eUL, 0x6dd93fcdUL, + 
0x74c20e8cUL, 0xf35a1243UL, 0xea412302UL, 0xc16c70c1UL, 0xd8774180UL, + 0x9736d747UL, 0x8e2de606UL, 0xa500b5c5UL, 0xbc1b8484UL, 0x71418a1aUL, + 0x685abb5bUL, 0x4377e898UL, 0x5a6cd9d9UL, 0x152d4f1eUL, 0x0c367e5fUL, + 0x271b2d9cUL, 0x3e001cddUL, 0xb9980012UL, 0xa0833153UL, 0x8bae6290UL, + 0x92b553d1UL, 0xddf4c516UL, 0xc4eff457UL, 0xefc2a794UL, 0xf6d996d5UL, + 0xae07bce9UL, 0xb71c8da8UL, 0x9c31de6bUL, 0x852aef2aUL, 0xca6b79edUL, + 0xd37048acUL, 0xf85d1b6fUL, 0xe1462a2eUL, 0x66de36e1UL, 0x7fc507a0UL, + 0x54e85463UL, 0x4df36522UL, 0x02b2f3e5UL, 0x1ba9c2a4UL, 0x30849167UL, + 0x299fa026UL, 0xe4c5aeb8UL, 0xfdde9ff9UL, 0xd6f3cc3aUL, 0xcfe8fd7bUL, + 0x80a96bbcUL, 0x99b25afdUL, 0xb29f093eUL, 0xab84387fUL, 0x2c1c24b0UL, + 0x350715f1UL, 0x1e2a4632UL, 0x07317773UL, 0x4870e1b4UL, 0x516bd0f5UL, + 0x7a468336UL, 0x635db277UL, 0xcbfad74eUL, 0xd2e1e60fUL, 0xf9ccb5ccUL, + 0xe0d7848dUL, 0xaf96124aUL, 0xb68d230bUL, 0x9da070c8UL, 0x84bb4189UL, + 0x03235d46UL, 0x1a386c07UL, 0x31153fc4UL, 0x280e0e85UL, 0x674f9842UL, + 0x7e54a903UL, 0x5579fac0UL, 0x4c62cb81UL, 0x8138c51fUL, 0x9823f45eUL, + 0xb30ea79dUL, 0xaa1596dcUL, 0xe554001bUL, 0xfc4f315aUL, 0xd7626299UL, + 0xce7953d8UL, 0x49e14f17UL, 0x50fa7e56UL, 0x7bd72d95UL, 0x62cc1cd4UL, + 0x2d8d8a13UL, 0x3496bb52UL, 0x1fbbe891UL, 0x06a0d9d0UL, 0x5e7ef3ecUL, + 0x4765c2adUL, 0x6c48916eUL, 0x7553a02fUL, 0x3a1236e8UL, 0x230907a9UL, + 0x0824546aUL, 0x113f652bUL, 0x96a779e4UL, 0x8fbc48a5UL, 0xa4911b66UL, + 0xbd8a2a27UL, 0xf2cbbce0UL, 0xebd08da1UL, 0xc0fdde62UL, 0xd9e6ef23UL, + 0x14bce1bdUL, 0x0da7d0fcUL, 0x268a833fUL, 0x3f91b27eUL, 0x70d024b9UL, + 0x69cb15f8UL, 0x42e6463bUL, 0x5bfd777aUL, 0xdc656bb5UL, 0xc57e5af4UL, + 0xee530937UL, 0xf7483876UL, 0xb809aeb1UL, 0xa1129ff0UL, 0x8a3fcc33UL, + 0x9324fd72UL + }, + { + 0x00000000UL, 0x01c26a37UL, 0x0384d46eUL, 0x0246be59UL, 0x0709a8dcUL, + 0x06cbc2ebUL, 0x048d7cb2UL, 0x054f1685UL, 0x0e1351b8UL, 0x0fd13b8fUL, + 0x0d9785d6UL, 0x0c55efe1UL, 0x091af964UL, 0x08d89353UL, 0x0a9e2d0aUL, + 0x0b5c473dUL, 0x1c26a370UL, 
0x1de4c947UL, 0x1fa2771eUL, 0x1e601d29UL, + 0x1b2f0bacUL, 0x1aed619bUL, 0x18abdfc2UL, 0x1969b5f5UL, 0x1235f2c8UL, + 0x13f798ffUL, 0x11b126a6UL, 0x10734c91UL, 0x153c5a14UL, 0x14fe3023UL, + 0x16b88e7aUL, 0x177ae44dUL, 0x384d46e0UL, 0x398f2cd7UL, 0x3bc9928eUL, + 0x3a0bf8b9UL, 0x3f44ee3cUL, 0x3e86840bUL, 0x3cc03a52UL, 0x3d025065UL, + 0x365e1758UL, 0x379c7d6fUL, 0x35dac336UL, 0x3418a901UL, 0x3157bf84UL, + 0x3095d5b3UL, 0x32d36beaUL, 0x331101ddUL, 0x246be590UL, 0x25a98fa7UL, + 0x27ef31feUL, 0x262d5bc9UL, 0x23624d4cUL, 0x22a0277bUL, 0x20e69922UL, + 0x2124f315UL, 0x2a78b428UL, 0x2bbade1fUL, 0x29fc6046UL, 0x283e0a71UL, + 0x2d711cf4UL, 0x2cb376c3UL, 0x2ef5c89aUL, 0x2f37a2adUL, 0x709a8dc0UL, + 0x7158e7f7UL, 0x731e59aeUL, 0x72dc3399UL, 0x7793251cUL, 0x76514f2bUL, + 0x7417f172UL, 0x75d59b45UL, 0x7e89dc78UL, 0x7f4bb64fUL, 0x7d0d0816UL, + 0x7ccf6221UL, 0x798074a4UL, 0x78421e93UL, 0x7a04a0caUL, 0x7bc6cafdUL, + 0x6cbc2eb0UL, 0x6d7e4487UL, 0x6f38fadeUL, 0x6efa90e9UL, 0x6bb5866cUL, + 0x6a77ec5bUL, 0x68315202UL, 0x69f33835UL, 0x62af7f08UL, 0x636d153fUL, + 0x612bab66UL, 0x60e9c151UL, 0x65a6d7d4UL, 0x6464bde3UL, 0x662203baUL, + 0x67e0698dUL, 0x48d7cb20UL, 0x4915a117UL, 0x4b531f4eUL, 0x4a917579UL, + 0x4fde63fcUL, 0x4e1c09cbUL, 0x4c5ab792UL, 0x4d98dda5UL, 0x46c49a98UL, + 0x4706f0afUL, 0x45404ef6UL, 0x448224c1UL, 0x41cd3244UL, 0x400f5873UL, + 0x4249e62aUL, 0x438b8c1dUL, 0x54f16850UL, 0x55330267UL, 0x5775bc3eUL, + 0x56b7d609UL, 0x53f8c08cUL, 0x523aaabbUL, 0x507c14e2UL, 0x51be7ed5UL, + 0x5ae239e8UL, 0x5b2053dfUL, 0x5966ed86UL, 0x58a487b1UL, 0x5deb9134UL, + 0x5c29fb03UL, 0x5e6f455aUL, 0x5fad2f6dUL, 0xe1351b80UL, 0xe0f771b7UL, + 0xe2b1cfeeUL, 0xe373a5d9UL, 0xe63cb35cUL, 0xe7fed96bUL, 0xe5b86732UL, + 0xe47a0d05UL, 0xef264a38UL, 0xeee4200fUL, 0xeca29e56UL, 0xed60f461UL, + 0xe82fe2e4UL, 0xe9ed88d3UL, 0xebab368aUL, 0xea695cbdUL, 0xfd13b8f0UL, + 0xfcd1d2c7UL, 0xfe976c9eUL, 0xff5506a9UL, 0xfa1a102cUL, 0xfbd87a1bUL, + 0xf99ec442UL, 0xf85cae75UL, 0xf300e948UL, 0xf2c2837fUL, 0xf0843d26UL, + 
0xf1465711UL, 0xf4094194UL, 0xf5cb2ba3UL, 0xf78d95faUL, 0xf64fffcdUL, + 0xd9785d60UL, 0xd8ba3757UL, 0xdafc890eUL, 0xdb3ee339UL, 0xde71f5bcUL, + 0xdfb39f8bUL, 0xddf521d2UL, 0xdc374be5UL, 0xd76b0cd8UL, 0xd6a966efUL, + 0xd4efd8b6UL, 0xd52db281UL, 0xd062a404UL, 0xd1a0ce33UL, 0xd3e6706aUL, + 0xd2241a5dUL, 0xc55efe10UL, 0xc49c9427UL, 0xc6da2a7eUL, 0xc7184049UL, + 0xc25756ccUL, 0xc3953cfbUL, 0xc1d382a2UL, 0xc011e895UL, 0xcb4dafa8UL, + 0xca8fc59fUL, 0xc8c97bc6UL, 0xc90b11f1UL, 0xcc440774UL, 0xcd866d43UL, + 0xcfc0d31aUL, 0xce02b92dUL, 0x91af9640UL, 0x906dfc77UL, 0x922b422eUL, + 0x93e92819UL, 0x96a63e9cUL, 0x976454abUL, 0x9522eaf2UL, 0x94e080c5UL, + 0x9fbcc7f8UL, 0x9e7eadcfUL, 0x9c381396UL, 0x9dfa79a1UL, 0x98b56f24UL, + 0x99770513UL, 0x9b31bb4aUL, 0x9af3d17dUL, 0x8d893530UL, 0x8c4b5f07UL, + 0x8e0de15eUL, 0x8fcf8b69UL, 0x8a809decUL, 0x8b42f7dbUL, 0x89044982UL, + 0x88c623b5UL, 0x839a6488UL, 0x82580ebfUL, 0x801eb0e6UL, 0x81dcdad1UL, + 0x8493cc54UL, 0x8551a663UL, 0x8717183aUL, 0x86d5720dUL, 0xa9e2d0a0UL, + 0xa820ba97UL, 0xaa6604ceUL, 0xaba46ef9UL, 0xaeeb787cUL, 0xaf29124bUL, + 0xad6fac12UL, 0xacadc625UL, 0xa7f18118UL, 0xa633eb2fUL, 0xa4755576UL, + 0xa5b73f41UL, 0xa0f829c4UL, 0xa13a43f3UL, 0xa37cfdaaUL, 0xa2be979dUL, + 0xb5c473d0UL, 0xb40619e7UL, 0xb640a7beUL, 0xb782cd89UL, 0xb2cddb0cUL, + 0xb30fb13bUL, 0xb1490f62UL, 0xb08b6555UL, 0xbbd72268UL, 0xba15485fUL, + 0xb853f606UL, 0xb9919c31UL, 0xbcde8ab4UL, 0xbd1ce083UL, 0xbf5a5edaUL, + 0xbe9834edUL + }, + { + 0x00000000UL, 0xb8bc6765UL, 0xaa09c88bUL, 0x12b5afeeUL, 0x8f629757UL, + 0x37def032UL, 0x256b5fdcUL, 0x9dd738b9UL, 0xc5b428efUL, 0x7d084f8aUL, + 0x6fbde064UL, 0xd7018701UL, 0x4ad6bfb8UL, 0xf26ad8ddUL, 0xe0df7733UL, + 0x58631056UL, 0x5019579fUL, 0xe8a530faUL, 0xfa109f14UL, 0x42acf871UL, + 0xdf7bc0c8UL, 0x67c7a7adUL, 0x75720843UL, 0xcdce6f26UL, 0x95ad7f70UL, + 0x2d111815UL, 0x3fa4b7fbUL, 0x8718d09eUL, 0x1acfe827UL, 0xa2738f42UL, + 0xb0c620acUL, 0x087a47c9UL, 0xa032af3eUL, 0x188ec85bUL, 0x0a3b67b5UL, + 0xb28700d0UL, 0x2f503869UL, 
0x97ec5f0cUL, 0x8559f0e2UL, 0x3de59787UL, + 0x658687d1UL, 0xdd3ae0b4UL, 0xcf8f4f5aUL, 0x7733283fUL, 0xeae41086UL, + 0x525877e3UL, 0x40edd80dUL, 0xf851bf68UL, 0xf02bf8a1UL, 0x48979fc4UL, + 0x5a22302aUL, 0xe29e574fUL, 0x7f496ff6UL, 0xc7f50893UL, 0xd540a77dUL, + 0x6dfcc018UL, 0x359fd04eUL, 0x8d23b72bUL, 0x9f9618c5UL, 0x272a7fa0UL, + 0xbafd4719UL, 0x0241207cUL, 0x10f48f92UL, 0xa848e8f7UL, 0x9b14583dUL, + 0x23a83f58UL, 0x311d90b6UL, 0x89a1f7d3UL, 0x1476cf6aUL, 0xaccaa80fUL, + 0xbe7f07e1UL, 0x06c36084UL, 0x5ea070d2UL, 0xe61c17b7UL, 0xf4a9b859UL, + 0x4c15df3cUL, 0xd1c2e785UL, 0x697e80e0UL, 0x7bcb2f0eUL, 0xc377486bUL, + 0xcb0d0fa2UL, 0x73b168c7UL, 0x6104c729UL, 0xd9b8a04cUL, 0x446f98f5UL, + 0xfcd3ff90UL, 0xee66507eUL, 0x56da371bUL, 0x0eb9274dUL, 0xb6054028UL, + 0xa4b0efc6UL, 0x1c0c88a3UL, 0x81dbb01aUL, 0x3967d77fUL, 0x2bd27891UL, + 0x936e1ff4UL, 0x3b26f703UL, 0x839a9066UL, 0x912f3f88UL, 0x299358edUL, + 0xb4446054UL, 0x0cf80731UL, 0x1e4da8dfUL, 0xa6f1cfbaUL, 0xfe92dfecUL, + 0x462eb889UL, 0x549b1767UL, 0xec277002UL, 0x71f048bbUL, 0xc94c2fdeUL, + 0xdbf98030UL, 0x6345e755UL, 0x6b3fa09cUL, 0xd383c7f9UL, 0xc1366817UL, + 0x798a0f72UL, 0xe45d37cbUL, 0x5ce150aeUL, 0x4e54ff40UL, 0xf6e89825UL, + 0xae8b8873UL, 0x1637ef16UL, 0x048240f8UL, 0xbc3e279dUL, 0x21e91f24UL, + 0x99557841UL, 0x8be0d7afUL, 0x335cb0caUL, 0xed59b63bUL, 0x55e5d15eUL, + 0x47507eb0UL, 0xffec19d5UL, 0x623b216cUL, 0xda874609UL, 0xc832e9e7UL, + 0x708e8e82UL, 0x28ed9ed4UL, 0x9051f9b1UL, 0x82e4565fUL, 0x3a58313aUL, + 0xa78f0983UL, 0x1f336ee6UL, 0x0d86c108UL, 0xb53aa66dUL, 0xbd40e1a4UL, + 0x05fc86c1UL, 0x1749292fUL, 0xaff54e4aUL, 0x322276f3UL, 0x8a9e1196UL, + 0x982bbe78UL, 0x2097d91dUL, 0x78f4c94bUL, 0xc048ae2eUL, 0xd2fd01c0UL, + 0x6a4166a5UL, 0xf7965e1cUL, 0x4f2a3979UL, 0x5d9f9697UL, 0xe523f1f2UL, + 0x4d6b1905UL, 0xf5d77e60UL, 0xe762d18eUL, 0x5fdeb6ebUL, 0xc2098e52UL, + 0x7ab5e937UL, 0x680046d9UL, 0xd0bc21bcUL, 0x88df31eaUL, 0x3063568fUL, + 0x22d6f961UL, 0x9a6a9e04UL, 0x07bda6bdUL, 0xbf01c1d8UL, 0xadb46e36UL, + 
0x15080953UL, 0x1d724e9aUL, 0xa5ce29ffUL, 0xb77b8611UL, 0x0fc7e174UL, + 0x9210d9cdUL, 0x2aacbea8UL, 0x38191146UL, 0x80a57623UL, 0xd8c66675UL, + 0x607a0110UL, 0x72cfaefeUL, 0xca73c99bUL, 0x57a4f122UL, 0xef189647UL, + 0xfdad39a9UL, 0x45115eccUL, 0x764dee06UL, 0xcef18963UL, 0xdc44268dUL, + 0x64f841e8UL, 0xf92f7951UL, 0x41931e34UL, 0x5326b1daUL, 0xeb9ad6bfUL, + 0xb3f9c6e9UL, 0x0b45a18cUL, 0x19f00e62UL, 0xa14c6907UL, 0x3c9b51beUL, + 0x842736dbUL, 0x96929935UL, 0x2e2efe50UL, 0x2654b999UL, 0x9ee8defcUL, + 0x8c5d7112UL, 0x34e11677UL, 0xa9362eceUL, 0x118a49abUL, 0x033fe645UL, + 0xbb838120UL, 0xe3e09176UL, 0x5b5cf613UL, 0x49e959fdUL, 0xf1553e98UL, + 0x6c820621UL, 0xd43e6144UL, 0xc68bceaaUL, 0x7e37a9cfUL, 0xd67f4138UL, + 0x6ec3265dUL, 0x7c7689b3UL, 0xc4caeed6UL, 0x591dd66fUL, 0xe1a1b10aUL, + 0xf3141ee4UL, 0x4ba87981UL, 0x13cb69d7UL, 0xab770eb2UL, 0xb9c2a15cUL, + 0x017ec639UL, 0x9ca9fe80UL, 0x241599e5UL, 0x36a0360bUL, 0x8e1c516eUL, + 0x866616a7UL, 0x3eda71c2UL, 0x2c6fde2cUL, 0x94d3b949UL, 0x090481f0UL, + 0xb1b8e695UL, 0xa30d497bUL, 0x1bb12e1eUL, 0x43d23e48UL, 0xfb6e592dUL, + 0xe9dbf6c3UL, 0x516791a6UL, 0xccb0a91fUL, 0x740cce7aUL, 0x66b96194UL, + 0xde0506f1UL + }, + { + 0x00000000UL, 0x96300777UL, 0x2c610eeeUL, 0xba510999UL, 0x19c46d07UL, + 0x8ff46a70UL, 0x35a563e9UL, 0xa395649eUL, 0x3288db0eUL, 0xa4b8dc79UL, + 0x1ee9d5e0UL, 0x88d9d297UL, 0x2b4cb609UL, 0xbd7cb17eUL, 0x072db8e7UL, + 0x911dbf90UL, 0x6410b71dUL, 0xf220b06aUL, 0x4871b9f3UL, 0xde41be84UL, + 0x7dd4da1aUL, 0xebe4dd6dUL, 0x51b5d4f4UL, 0xc785d383UL, 0x56986c13UL, + 0xc0a86b64UL, 0x7af962fdUL, 0xecc9658aUL, 0x4f5c0114UL, 0xd96c0663UL, + 0x633d0ffaUL, 0xf50d088dUL, 0xc8206e3bUL, 0x5e10694cUL, 0xe44160d5UL, + 0x727167a2UL, 0xd1e4033cUL, 0x47d4044bUL, 0xfd850dd2UL, 0x6bb50aa5UL, + 0xfaa8b535UL, 0x6c98b242UL, 0xd6c9bbdbUL, 0x40f9bcacUL, 0xe36cd832UL, + 0x755cdf45UL, 0xcf0dd6dcUL, 0x593dd1abUL, 0xac30d926UL, 0x3a00de51UL, + 0x8051d7c8UL, 0x1661d0bfUL, 0xb5f4b421UL, 0x23c4b356UL, 0x9995bacfUL, + 0x0fa5bdb8UL, 0x9eb80228UL, 
0x0888055fUL, 0xb2d90cc6UL, 0x24e90bb1UL, + 0x877c6f2fUL, 0x114c6858UL, 0xab1d61c1UL, 0x3d2d66b6UL, 0x9041dc76UL, + 0x0671db01UL, 0xbc20d298UL, 0x2a10d5efUL, 0x8985b171UL, 0x1fb5b606UL, + 0xa5e4bf9fUL, 0x33d4b8e8UL, 0xa2c90778UL, 0x34f9000fUL, 0x8ea80996UL, + 0x18980ee1UL, 0xbb0d6a7fUL, 0x2d3d6d08UL, 0x976c6491UL, 0x015c63e6UL, + 0xf4516b6bUL, 0x62616c1cUL, 0xd8306585UL, 0x4e0062f2UL, 0xed95066cUL, + 0x7ba5011bUL, 0xc1f40882UL, 0x57c40ff5UL, 0xc6d9b065UL, 0x50e9b712UL, + 0xeab8be8bUL, 0x7c88b9fcUL, 0xdf1ddd62UL, 0x492dda15UL, 0xf37cd38cUL, + 0x654cd4fbUL, 0x5861b24dUL, 0xce51b53aUL, 0x7400bca3UL, 0xe230bbd4UL, + 0x41a5df4aUL, 0xd795d83dUL, 0x6dc4d1a4UL, 0xfbf4d6d3UL, 0x6ae96943UL, + 0xfcd96e34UL, 0x468867adUL, 0xd0b860daUL, 0x732d0444UL, 0xe51d0333UL, + 0x5f4c0aaaUL, 0xc97c0dddUL, 0x3c710550UL, 0xaa410227UL, 0x10100bbeUL, + 0x86200cc9UL, 0x25b56857UL, 0xb3856f20UL, 0x09d466b9UL, 0x9fe461ceUL, + 0x0ef9de5eUL, 0x98c9d929UL, 0x2298d0b0UL, 0xb4a8d7c7UL, 0x173db359UL, + 0x810db42eUL, 0x3b5cbdb7UL, 0xad6cbac0UL, 0x2083b8edUL, 0xb6b3bf9aUL, + 0x0ce2b603UL, 0x9ad2b174UL, 0x3947d5eaUL, 0xaf77d29dUL, 0x1526db04UL, + 0x8316dc73UL, 0x120b63e3UL, 0x843b6494UL, 0x3e6a6d0dUL, 0xa85a6a7aUL, + 0x0bcf0ee4UL, 0x9dff0993UL, 0x27ae000aUL, 0xb19e077dUL, 0x44930ff0UL, + 0xd2a30887UL, 0x68f2011eUL, 0xfec20669UL, 0x5d5762f7UL, 0xcb676580UL, + 0x71366c19UL, 0xe7066b6eUL, 0x761bd4feUL, 0xe02bd389UL, 0x5a7ada10UL, + 0xcc4add67UL, 0x6fdfb9f9UL, 0xf9efbe8eUL, 0x43beb717UL, 0xd58eb060UL, + 0xe8a3d6d6UL, 0x7e93d1a1UL, 0xc4c2d838UL, 0x52f2df4fUL, 0xf167bbd1UL, + 0x6757bca6UL, 0xdd06b53fUL, 0x4b36b248UL, 0xda2b0dd8UL, 0x4c1b0aafUL, + 0xf64a0336UL, 0x607a0441UL, 0xc3ef60dfUL, 0x55df67a8UL, 0xef8e6e31UL, + 0x79be6946UL, 0x8cb361cbUL, 0x1a8366bcUL, 0xa0d26f25UL, 0x36e26852UL, + 0x95770cccUL, 0x03470bbbUL, 0xb9160222UL, 0x2f260555UL, 0xbe3bbac5UL, + 0x280bbdb2UL, 0x925ab42bUL, 0x046ab35cUL, 0xa7ffd7c2UL, 0x31cfd0b5UL, + 0x8b9ed92cUL, 0x1daede5bUL, 0xb0c2649bUL, 0x26f263ecUL, 0x9ca36a75UL, + 
0x0a936d02UL, 0xa906099cUL, 0x3f360eebUL, 0x85670772UL, 0x13570005UL, + 0x824abf95UL, 0x147ab8e2UL, 0xae2bb17bUL, 0x381bb60cUL, 0x9b8ed292UL, + 0x0dbed5e5UL, 0xb7efdc7cUL, 0x21dfdb0bUL, 0xd4d2d386UL, 0x42e2d4f1UL, + 0xf8b3dd68UL, 0x6e83da1fUL, 0xcd16be81UL, 0x5b26b9f6UL, 0xe177b06fUL, + 0x7747b718UL, 0xe65a0888UL, 0x706a0fffUL, 0xca3b0666UL, 0x5c0b0111UL, + 0xff9e658fUL, 0x69ae62f8UL, 0xd3ff6b61UL, 0x45cf6c16UL, 0x78e20aa0UL, + 0xeed20dd7UL, 0x5483044eUL, 0xc2b30339UL, 0x612667a7UL, 0xf71660d0UL, + 0x4d476949UL, 0xdb776e3eUL, 0x4a6ad1aeUL, 0xdc5ad6d9UL, 0x660bdf40UL, + 0xf03bd837UL, 0x53aebca9UL, 0xc59ebbdeUL, 0x7fcfb247UL, 0xe9ffb530UL, + 0x1cf2bdbdUL, 0x8ac2bacaUL, 0x3093b353UL, 0xa6a3b424UL, 0x0536d0baUL, + 0x9306d7cdUL, 0x2957de54UL, 0xbf67d923UL, 0x2e7a66b3UL, 0xb84a61c4UL, + 0x021b685dUL, 0x942b6f2aUL, 0x37be0bb4UL, 0xa18e0cc3UL, 0x1bdf055aUL, + 0x8def022dUL + }, + { + 0x00000000UL, 0x41311b19UL, 0x82623632UL, 0xc3532d2bUL, 0x04c56c64UL, + 0x45f4777dUL, 0x86a75a56UL, 0xc796414fUL, 0x088ad9c8UL, 0x49bbc2d1UL, + 0x8ae8effaUL, 0xcbd9f4e3UL, 0x0c4fb5acUL, 0x4d7eaeb5UL, 0x8e2d839eUL, + 0xcf1c9887UL, 0x5112c24aUL, 0x1023d953UL, 0xd370f478UL, 0x9241ef61UL, + 0x55d7ae2eUL, 0x14e6b537UL, 0xd7b5981cUL, 0x96848305UL, 0x59981b82UL, + 0x18a9009bUL, 0xdbfa2db0UL, 0x9acb36a9UL, 0x5d5d77e6UL, 0x1c6c6cffUL, + 0xdf3f41d4UL, 0x9e0e5acdUL, 0xa2248495UL, 0xe3159f8cUL, 0x2046b2a7UL, + 0x6177a9beUL, 0xa6e1e8f1UL, 0xe7d0f3e8UL, 0x2483dec3UL, 0x65b2c5daUL, + 0xaaae5d5dUL, 0xeb9f4644UL, 0x28cc6b6fUL, 0x69fd7076UL, 0xae6b3139UL, + 0xef5a2a20UL, 0x2c09070bUL, 0x6d381c12UL, 0xf33646dfUL, 0xb2075dc6UL, + 0x715470edUL, 0x30656bf4UL, 0xf7f32abbUL, 0xb6c231a2UL, 0x75911c89UL, + 0x34a00790UL, 0xfbbc9f17UL, 0xba8d840eUL, 0x79dea925UL, 0x38efb23cUL, + 0xff79f373UL, 0xbe48e86aUL, 0x7d1bc541UL, 0x3c2ade58UL, 0x054f79f0UL, + 0x447e62e9UL, 0x872d4fc2UL, 0xc61c54dbUL, 0x018a1594UL, 0x40bb0e8dUL, + 0x83e823a6UL, 0xc2d938bfUL, 0x0dc5a038UL, 0x4cf4bb21UL, 0x8fa7960aUL, + 0xce968d13UL, 0x0900cc5cUL, 
0x4831d745UL, 0x8b62fa6eUL, 0xca53e177UL, + 0x545dbbbaUL, 0x156ca0a3UL, 0xd63f8d88UL, 0x970e9691UL, 0x5098d7deUL, + 0x11a9ccc7UL, 0xd2fae1ecUL, 0x93cbfaf5UL, 0x5cd76272UL, 0x1de6796bUL, + 0xdeb55440UL, 0x9f844f59UL, 0x58120e16UL, 0x1923150fUL, 0xda703824UL, + 0x9b41233dUL, 0xa76bfd65UL, 0xe65ae67cUL, 0x2509cb57UL, 0x6438d04eUL, + 0xa3ae9101UL, 0xe29f8a18UL, 0x21cca733UL, 0x60fdbc2aUL, 0xafe124adUL, + 0xeed03fb4UL, 0x2d83129fUL, 0x6cb20986UL, 0xab2448c9UL, 0xea1553d0UL, + 0x29467efbUL, 0x687765e2UL, 0xf6793f2fUL, 0xb7482436UL, 0x741b091dUL, + 0x352a1204UL, 0xf2bc534bUL, 0xb38d4852UL, 0x70de6579UL, 0x31ef7e60UL, + 0xfef3e6e7UL, 0xbfc2fdfeUL, 0x7c91d0d5UL, 0x3da0cbccUL, 0xfa368a83UL, + 0xbb07919aUL, 0x7854bcb1UL, 0x3965a7a8UL, 0x4b98833bUL, 0x0aa99822UL, + 0xc9fab509UL, 0x88cbae10UL, 0x4f5def5fUL, 0x0e6cf446UL, 0xcd3fd96dUL, + 0x8c0ec274UL, 0x43125af3UL, 0x022341eaUL, 0xc1706cc1UL, 0x804177d8UL, + 0x47d73697UL, 0x06e62d8eUL, 0xc5b500a5UL, 0x84841bbcUL, 0x1a8a4171UL, + 0x5bbb5a68UL, 0x98e87743UL, 0xd9d96c5aUL, 0x1e4f2d15UL, 0x5f7e360cUL, + 0x9c2d1b27UL, 0xdd1c003eUL, 0x120098b9UL, 0x533183a0UL, 0x9062ae8bUL, + 0xd153b592UL, 0x16c5f4ddUL, 0x57f4efc4UL, 0x94a7c2efUL, 0xd596d9f6UL, + 0xe9bc07aeUL, 0xa88d1cb7UL, 0x6bde319cUL, 0x2aef2a85UL, 0xed796bcaUL, + 0xac4870d3UL, 0x6f1b5df8UL, 0x2e2a46e1UL, 0xe136de66UL, 0xa007c57fUL, + 0x6354e854UL, 0x2265f34dUL, 0xe5f3b202UL, 0xa4c2a91bUL, 0x67918430UL, + 0x26a09f29UL, 0xb8aec5e4UL, 0xf99fdefdUL, 0x3accf3d6UL, 0x7bfde8cfUL, + 0xbc6ba980UL, 0xfd5ab299UL, 0x3e099fb2UL, 0x7f3884abUL, 0xb0241c2cUL, + 0xf1150735UL, 0x32462a1eUL, 0x73773107UL, 0xb4e17048UL, 0xf5d06b51UL, + 0x3683467aUL, 0x77b25d63UL, 0x4ed7facbUL, 0x0fe6e1d2UL, 0xccb5ccf9UL, + 0x8d84d7e0UL, 0x4a1296afUL, 0x0b238db6UL, 0xc870a09dUL, 0x8941bb84UL, + 0x465d2303UL, 0x076c381aUL, 0xc43f1531UL, 0x850e0e28UL, 0x42984f67UL, + 0x03a9547eUL, 0xc0fa7955UL, 0x81cb624cUL, 0x1fc53881UL, 0x5ef42398UL, + 0x9da70eb3UL, 0xdc9615aaUL, 0x1b0054e5UL, 0x5a314ffcUL, 0x996262d7UL, + 
0xd85379ceUL, 0x174fe149UL, 0x567efa50UL, 0x952dd77bUL, 0xd41ccc62UL, + 0x138a8d2dUL, 0x52bb9634UL, 0x91e8bb1fUL, 0xd0d9a006UL, 0xecf37e5eUL, + 0xadc26547UL, 0x6e91486cUL, 0x2fa05375UL, 0xe836123aUL, 0xa9070923UL, + 0x6a542408UL, 0x2b653f11UL, 0xe479a796UL, 0xa548bc8fUL, 0x661b91a4UL, + 0x272a8abdUL, 0xe0bccbf2UL, 0xa18dd0ebUL, 0x62defdc0UL, 0x23efe6d9UL, + 0xbde1bc14UL, 0xfcd0a70dUL, 0x3f838a26UL, 0x7eb2913fUL, 0xb924d070UL, + 0xf815cb69UL, 0x3b46e642UL, 0x7a77fd5bUL, 0xb56b65dcUL, 0xf45a7ec5UL, + 0x370953eeUL, 0x763848f7UL, 0xb1ae09b8UL, 0xf09f12a1UL, 0x33cc3f8aUL, + 0x72fd2493UL + }, + { + 0x00000000UL, 0x376ac201UL, 0x6ed48403UL, 0x59be4602UL, 0xdca80907UL, + 0xebc2cb06UL, 0xb27c8d04UL, 0x85164f05UL, 0xb851130eUL, 0x8f3bd10fUL, + 0xd685970dUL, 0xe1ef550cUL, 0x64f91a09UL, 0x5393d808UL, 0x0a2d9e0aUL, + 0x3d475c0bUL, 0x70a3261cUL, 0x47c9e41dUL, 0x1e77a21fUL, 0x291d601eUL, + 0xac0b2f1bUL, 0x9b61ed1aUL, 0xc2dfab18UL, 0xf5b56919UL, 0xc8f23512UL, + 0xff98f713UL, 0xa626b111UL, 0x914c7310UL, 0x145a3c15UL, 0x2330fe14UL, + 0x7a8eb816UL, 0x4de47a17UL, 0xe0464d38UL, 0xd72c8f39UL, 0x8e92c93bUL, + 0xb9f80b3aUL, 0x3cee443fUL, 0x0b84863eUL, 0x523ac03cUL, 0x6550023dUL, + 0x58175e36UL, 0x6f7d9c37UL, 0x36c3da35UL, 0x01a91834UL, 0x84bf5731UL, + 0xb3d59530UL, 0xea6bd332UL, 0xdd011133UL, 0x90e56b24UL, 0xa78fa925UL, + 0xfe31ef27UL, 0xc95b2d26UL, 0x4c4d6223UL, 0x7b27a022UL, 0x2299e620UL, + 0x15f32421UL, 0x28b4782aUL, 0x1fdeba2bUL, 0x4660fc29UL, 0x710a3e28UL, + 0xf41c712dUL, 0xc376b32cUL, 0x9ac8f52eUL, 0xada2372fUL, 0xc08d9a70UL, + 0xf7e75871UL, 0xae591e73UL, 0x9933dc72UL, 0x1c259377UL, 0x2b4f5176UL, + 0x72f11774UL, 0x459bd575UL, 0x78dc897eUL, 0x4fb64b7fUL, 0x16080d7dUL, + 0x2162cf7cUL, 0xa4748079UL, 0x931e4278UL, 0xcaa0047aUL, 0xfdcac67bUL, + 0xb02ebc6cUL, 0x87447e6dUL, 0xdefa386fUL, 0xe990fa6eUL, 0x6c86b56bUL, + 0x5bec776aUL, 0x02523168UL, 0x3538f369UL, 0x087faf62UL, 0x3f156d63UL, + 0x66ab2b61UL, 0x51c1e960UL, 0xd4d7a665UL, 0xe3bd6464UL, 0xba032266UL, + 0x8d69e067UL, 0x20cbd748UL, 
0x17a11549UL, 0x4e1f534bUL, 0x7975914aUL, + 0xfc63de4fUL, 0xcb091c4eUL, 0x92b75a4cUL, 0xa5dd984dUL, 0x989ac446UL, + 0xaff00647UL, 0xf64e4045UL, 0xc1248244UL, 0x4432cd41UL, 0x73580f40UL, + 0x2ae64942UL, 0x1d8c8b43UL, 0x5068f154UL, 0x67023355UL, 0x3ebc7557UL, + 0x09d6b756UL, 0x8cc0f853UL, 0xbbaa3a52UL, 0xe2147c50UL, 0xd57ebe51UL, + 0xe839e25aUL, 0xdf53205bUL, 0x86ed6659UL, 0xb187a458UL, 0x3491eb5dUL, + 0x03fb295cUL, 0x5a456f5eUL, 0x6d2fad5fUL, 0x801b35e1UL, 0xb771f7e0UL, + 0xeecfb1e2UL, 0xd9a573e3UL, 0x5cb33ce6UL, 0x6bd9fee7UL, 0x3267b8e5UL, + 0x050d7ae4UL, 0x384a26efUL, 0x0f20e4eeUL, 0x569ea2ecUL, 0x61f460edUL, + 0xe4e22fe8UL, 0xd388ede9UL, 0x8a36abebUL, 0xbd5c69eaUL, 0xf0b813fdUL, + 0xc7d2d1fcUL, 0x9e6c97feUL, 0xa90655ffUL, 0x2c101afaUL, 0x1b7ad8fbUL, + 0x42c49ef9UL, 0x75ae5cf8UL, 0x48e900f3UL, 0x7f83c2f2UL, 0x263d84f0UL, + 0x115746f1UL, 0x944109f4UL, 0xa32bcbf5UL, 0xfa958df7UL, 0xcdff4ff6UL, + 0x605d78d9UL, 0x5737bad8UL, 0x0e89fcdaUL, 0x39e33edbUL, 0xbcf571deUL, + 0x8b9fb3dfUL, 0xd221f5ddUL, 0xe54b37dcUL, 0xd80c6bd7UL, 0xef66a9d6UL, + 0xb6d8efd4UL, 0x81b22dd5UL, 0x04a462d0UL, 0x33cea0d1UL, 0x6a70e6d3UL, + 0x5d1a24d2UL, 0x10fe5ec5UL, 0x27949cc4UL, 0x7e2adac6UL, 0x494018c7UL, + 0xcc5657c2UL, 0xfb3c95c3UL, 0xa282d3c1UL, 0x95e811c0UL, 0xa8af4dcbUL, + 0x9fc58fcaUL, 0xc67bc9c8UL, 0xf1110bc9UL, 0x740744ccUL, 0x436d86cdUL, + 0x1ad3c0cfUL, 0x2db902ceUL, 0x4096af91UL, 0x77fc6d90UL, 0x2e422b92UL, + 0x1928e993UL, 0x9c3ea696UL, 0xab546497UL, 0xf2ea2295UL, 0xc580e094UL, + 0xf8c7bc9fUL, 0xcfad7e9eUL, 0x9613389cUL, 0xa179fa9dUL, 0x246fb598UL, + 0x13057799UL, 0x4abb319bUL, 0x7dd1f39aUL, 0x3035898dUL, 0x075f4b8cUL, + 0x5ee10d8eUL, 0x698bcf8fUL, 0xec9d808aUL, 0xdbf7428bUL, 0x82490489UL, + 0xb523c688UL, 0x88649a83UL, 0xbf0e5882UL, 0xe6b01e80UL, 0xd1dadc81UL, + 0x54cc9384UL, 0x63a65185UL, 0x3a181787UL, 0x0d72d586UL, 0xa0d0e2a9UL, + 0x97ba20a8UL, 0xce0466aaUL, 0xf96ea4abUL, 0x7c78ebaeUL, 0x4b1229afUL, + 0x12ac6fadUL, 0x25c6adacUL, 0x1881f1a7UL, 0x2feb33a6UL, 0x765575a4UL, + 
0x413fb7a5UL, 0xc429f8a0UL, 0xf3433aa1UL, 0xaafd7ca3UL, 0x9d97bea2UL, + 0xd073c4b5UL, 0xe71906b4UL, 0xbea740b6UL, 0x89cd82b7UL, 0x0cdbcdb2UL, + 0x3bb10fb3UL, 0x620f49b1UL, 0x55658bb0UL, 0x6822d7bbUL, 0x5f4815baUL, + 0x06f653b8UL, 0x319c91b9UL, 0xb48adebcUL, 0x83e01cbdUL, 0xda5e5abfUL, + 0xed3498beUL + }, + { + 0x00000000UL, 0x6567bcb8UL, 0x8bc809aaUL, 0xeeafb512UL, 0x5797628fUL, + 0x32f0de37UL, 0xdc5f6b25UL, 0xb938d79dUL, 0xef28b4c5UL, 0x8a4f087dUL, + 0x64e0bd6fUL, 0x018701d7UL, 0xb8bfd64aUL, 0xddd86af2UL, 0x3377dfe0UL, + 0x56106358UL, 0x9f571950UL, 0xfa30a5e8UL, 0x149f10faUL, 0x71f8ac42UL, + 0xc8c07bdfUL, 0xada7c767UL, 0x43087275UL, 0x266fcecdUL, 0x707fad95UL, + 0x1518112dUL, 0xfbb7a43fUL, 0x9ed01887UL, 0x27e8cf1aUL, 0x428f73a2UL, + 0xac20c6b0UL, 0xc9477a08UL, 0x3eaf32a0UL, 0x5bc88e18UL, 0xb5673b0aUL, + 0xd00087b2UL, 0x6938502fUL, 0x0c5fec97UL, 0xe2f05985UL, 0x8797e53dUL, + 0xd1878665UL, 0xb4e03addUL, 0x5a4f8fcfUL, 0x3f283377UL, 0x8610e4eaUL, + 0xe3775852UL, 0x0dd8ed40UL, 0x68bf51f8UL, 0xa1f82bf0UL, 0xc49f9748UL, + 0x2a30225aUL, 0x4f579ee2UL, 0xf66f497fUL, 0x9308f5c7UL, 0x7da740d5UL, + 0x18c0fc6dUL, 0x4ed09f35UL, 0x2bb7238dUL, 0xc518969fUL, 0xa07f2a27UL, + 0x1947fdbaUL, 0x7c204102UL, 0x928ff410UL, 0xf7e848a8UL, 0x3d58149bUL, + 0x583fa823UL, 0xb6901d31UL, 0xd3f7a189UL, 0x6acf7614UL, 0x0fa8caacUL, + 0xe1077fbeUL, 0x8460c306UL, 0xd270a05eUL, 0xb7171ce6UL, 0x59b8a9f4UL, + 0x3cdf154cUL, 0x85e7c2d1UL, 0xe0807e69UL, 0x0e2fcb7bUL, 0x6b4877c3UL, + 0xa20f0dcbUL, 0xc768b173UL, 0x29c70461UL, 0x4ca0b8d9UL, 0xf5986f44UL, + 0x90ffd3fcUL, 0x7e5066eeUL, 0x1b37da56UL, 0x4d27b90eUL, 0x284005b6UL, + 0xc6efb0a4UL, 0xa3880c1cUL, 0x1ab0db81UL, 0x7fd76739UL, 0x9178d22bUL, + 0xf41f6e93UL, 0x03f7263bUL, 0x66909a83UL, 0x883f2f91UL, 0xed589329UL, + 0x546044b4UL, 0x3107f80cUL, 0xdfa84d1eUL, 0xbacff1a6UL, 0xecdf92feUL, + 0x89b82e46UL, 0x67179b54UL, 0x027027ecUL, 0xbb48f071UL, 0xde2f4cc9UL, + 0x3080f9dbUL, 0x55e74563UL, 0x9ca03f6bUL, 0xf9c783d3UL, 0x176836c1UL, + 0x720f8a79UL, 0xcb375de4UL, 
0xae50e15cUL, 0x40ff544eUL, 0x2598e8f6UL, + 0x73888baeUL, 0x16ef3716UL, 0xf8408204UL, 0x9d273ebcUL, 0x241fe921UL, + 0x41785599UL, 0xafd7e08bUL, 0xcab05c33UL, 0x3bb659edUL, 0x5ed1e555UL, + 0xb07e5047UL, 0xd519ecffUL, 0x6c213b62UL, 0x094687daUL, 0xe7e932c8UL, + 0x828e8e70UL, 0xd49eed28UL, 0xb1f95190UL, 0x5f56e482UL, 0x3a31583aUL, + 0x83098fa7UL, 0xe66e331fUL, 0x08c1860dUL, 0x6da63ab5UL, 0xa4e140bdUL, + 0xc186fc05UL, 0x2f294917UL, 0x4a4ef5afUL, 0xf3762232UL, 0x96119e8aUL, + 0x78be2b98UL, 0x1dd99720UL, 0x4bc9f478UL, 0x2eae48c0UL, 0xc001fdd2UL, + 0xa566416aUL, 0x1c5e96f7UL, 0x79392a4fUL, 0x97969f5dUL, 0xf2f123e5UL, + 0x05196b4dUL, 0x607ed7f5UL, 0x8ed162e7UL, 0xebb6de5fUL, 0x528e09c2UL, + 0x37e9b57aUL, 0xd9460068UL, 0xbc21bcd0UL, 0xea31df88UL, 0x8f566330UL, + 0x61f9d622UL, 0x049e6a9aUL, 0xbda6bd07UL, 0xd8c101bfUL, 0x366eb4adUL, + 0x53090815UL, 0x9a4e721dUL, 0xff29cea5UL, 0x11867bb7UL, 0x74e1c70fUL, + 0xcdd91092UL, 0xa8beac2aUL, 0x46111938UL, 0x2376a580UL, 0x7566c6d8UL, + 0x10017a60UL, 0xfeaecf72UL, 0x9bc973caUL, 0x22f1a457UL, 0x479618efUL, + 0xa939adfdUL, 0xcc5e1145UL, 0x06ee4d76UL, 0x6389f1ceUL, 0x8d2644dcUL, + 0xe841f864UL, 0x51792ff9UL, 0x341e9341UL, 0xdab12653UL, 0xbfd69aebUL, + 0xe9c6f9b3UL, 0x8ca1450bUL, 0x620ef019UL, 0x07694ca1UL, 0xbe519b3cUL, + 0xdb362784UL, 0x35999296UL, 0x50fe2e2eUL, 0x99b95426UL, 0xfcdee89eUL, + 0x12715d8cUL, 0x7716e134UL, 0xce2e36a9UL, 0xab498a11UL, 0x45e63f03UL, + 0x208183bbUL, 0x7691e0e3UL, 0x13f65c5bUL, 0xfd59e949UL, 0x983e55f1UL, + 0x2106826cUL, 0x44613ed4UL, 0xaace8bc6UL, 0xcfa9377eUL, 0x38417fd6UL, + 0x5d26c36eUL, 0xb389767cUL, 0xd6eecac4UL, 0x6fd61d59UL, 0x0ab1a1e1UL, + 0xe41e14f3UL, 0x8179a84bUL, 0xd769cb13UL, 0xb20e77abUL, 0x5ca1c2b9UL, + 0x39c67e01UL, 0x80fea99cUL, 0xe5991524UL, 0x0b36a036UL, 0x6e511c8eUL, + 0xa7166686UL, 0xc271da3eUL, 0x2cde6f2cUL, 0x49b9d394UL, 0xf0810409UL, + 0x95e6b8b1UL, 0x7b490da3UL, 0x1e2eb11bUL, 0x483ed243UL, 0x2d596efbUL, + 0xc3f6dbe9UL, 0xa6916751UL, 0x1fa9b0ccUL, 0x7ace0c74UL, 0x9461b966UL, + 
0xf10605deUL +#endif + } +}; diff --git a/kernel-patch-2.6.25 b/kernel-patch-2.6.25 new file mode 100644 index 00000000..23290078 --- /dev/null +++ b/kernel-patch-2.6.25 @@ -0,0 +1,199 @@ +Status: ok + +Support adding a spare to a live md array with external metadata. + +i.e. extend the 'md/dev-XXX/slot' attribute so that you can +tell a device to fill a vacant slot in an md array. + + +Signed-off-by: Neil Brown <neilb@suse.de> + +### Diffstat output + ./drivers/md/md.c | 44 ++++++++++++++++++++++++++++++++++++++++---- + ./drivers/md/multipath.c | 7 ++++++- + ./drivers/md/raid1.c | 7 ++++++- + ./drivers/md/raid10.c | 10 ++++++++-- + ./drivers/md/raid5.c | 10 ++++++++-- + 5 files changed, 68 insertions(+), 10 deletions(-) + +diff .prev/drivers/md/md.c ./drivers/md/md.c +--- .prev/drivers/md/md.c 2008-06-05 09:19:56.000000000 +1000 ++++ ./drivers/md/md.c 2008-06-10 10:41:21.000000000 +1000 +@@ -1932,7 +1932,7 @@ slot_store(mdk_rdev_t *rdev, const char + slot = -1; + else if (e==buf || (*e && *e!= '\n')) + return -EINVAL; +- if (rdev->mddev->pers) { ++ if (rdev->mddev->pers && slot == -1) { + /* Setting 'slot' on an active array requires also + * updating the 'rd%d' link, and communicating + * with the personality with ->hot_*_disk. +@@ -1940,8 +1940,6 @@ slot_store(mdk_rdev_t *rdev, const char + * failed/spare devices. This normally happens automatically, + * but not when the metadata is externally managed. + */ +- if (slot != -1) +- return -EBUSY; + if (rdev->raid_disk == -1) + return -EEXIST; + /* personality does all needed checks */ +@@ -1955,6 +1953,44 @@ slot_store(mdk_rdev_t *rdev, const char + sysfs_remove_link(&rdev->mddev->kobj, nm); + set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); + md_wakeup_thread(rdev->mddev->thread); ++ } else if (rdev->mddev->pers) { ++ mdk_rdev_t *rdev2; ++ struct list_head *tmp; ++ /* Activating a spare .. or possibly reactivating ++ * if we ever get bitmaps working here. 
++ */ ++ ++ if (rdev->raid_disk != -1) ++ return -EBUSY; ++ ++ if (rdev->mddev->pers->hot_add_disk == NULL) ++ return -EINVAL; ++ ++ rdev_for_each(rdev2, tmp, rdev->mddev) ++ if (rdev2->raid_disk == slot) ++ return -EEXIST; ++ ++ rdev->raid_disk = slot; ++ if (test_bit(In_sync, &rdev->flags)) ++ rdev->saved_raid_disk = slot; ++ else ++ rdev->saved_raid_disk = -1; ++ err = rdev->mddev->pers-> ++ hot_add_disk(rdev->mddev, rdev); ++ if (err != 1) { ++ rdev->raid_disk = -1; ++ if (err == 0) ++ return -EEXIST; ++ return err; ++ } ++ sprintf(nm, "rd%d", rdev->raid_disk); ++ if (sysfs_create_link(&rdev->mddev->kobj, &rdev->kobj, nm)) ++ printk(KERN_WARNING ++ "md: cannot register " ++ "%s for %s\n", ++ nm, mdname(rdev->mddev)); ++ ++ /* don't wakeup anyone, leave that to userspace. */ + } else { + if (slot >= rdev->mddev->raid_disks) + return -ENOSPC; +@@ -4205,7 +4241,7 @@ static int add_new_disk(mddev_t * mddev, + super_types[mddev->major_version]. + validate_super(mddev, rdev); + err = mddev->pers->hot_add_disk(mddev, rdev); +- if (err) ++ if (err < 0) + unbind_rdev_from_array(rdev); + } + if (err) + +diff .prev/drivers/md/multipath.c ./drivers/md/multipath.c +--- .prev/drivers/md/multipath.c 2008-05-30 14:49:31.000000000 +1000 ++++ ./drivers/md/multipath.c 2008-06-10 10:35:03.000000000 +1000 +@@ -284,10 +284,15 @@ static int multipath_add_disk(mddev_t *m + int found = 0; + int path; + struct multipath_info *p; ++ int first = 0; ++ int last = mddev->raid_disks - 1; ++ ++ if (rdev->raid_disk >= 0) ++ first = last = rdev->raid_disk; + + print_multipath_conf(conf); + +- for (path=0; path<mddev->raid_disks; path++) ++ for (path = first; path <= last; path++) + if ((p=conf->multipaths+path)->rdev == NULL) { + q = rdev->bdev->bd_disk->queue; + blk_queue_stack_limits(mddev->queue, q); + +diff .prev/drivers/md/raid10.c ./drivers/md/raid10.c +--- .prev/drivers/md/raid10.c 2008-05-30 14:49:31.000000000 +1000 ++++ ./drivers/md/raid10.c 2008-06-10 10:28:53.000000000 +1000 +@@ 
-1116,6 +1116,8 @@ static int raid10_add_disk(mddev_t *mdde + int found = 0; + int mirror; + mirror_info_t *p; ++ int first = 0; ++ int last = mddev->raid_disks - 1; + + if (mddev->recovery_cp < MaxSector) + /* only hot-add to in-sync arrays, as recovery is +@@ -1125,12 +1127,16 @@ static int raid10_add_disk(mddev_t *mdde + if (!enough(conf)) + return 0; + ++ if (rdev->raid_disk) ++ first = last = rdev->raid_disk; ++ + if (rdev->saved_raid_disk >= 0 && ++ rdev->saved_raid_disk >= first && + conf->mirrors[rdev->saved_raid_disk].rdev == NULL) + mirror = rdev->saved_raid_disk; + else +- mirror = 0; +- for ( ; mirror < mddev->raid_disks; mirror++) ++ mirror = first; ++ for ( ; mirror <= last ; mirror++) + if ( !(p=conf->mirrors+mirror)->rdev) { + + blk_queue_stack_limits(mddev->queue, + +diff .prev/drivers/md/raid1.c ./drivers/md/raid1.c +--- .prev/drivers/md/raid1.c 2008-05-30 14:49:31.000000000 +1000 ++++ ./drivers/md/raid1.c 2008-06-10 10:41:00.000000000 +1000 +@@ -1103,8 +1103,13 @@ static int raid1_add_disk(mddev_t *mddev + int found = 0; + int mirror = 0; + mirror_info_t *p; ++ int first = 0; ++ int last = mddev->raid_disks - 1; + +- for (mirror=0; mirror < mddev->raid_disks; mirror++) ++ if (rdev->raid_disk >= 0) ++ first = last = rdev->raid_disk; ++ ++ for (mirror = first; mirror <= last; mirror++) + if ( !(p=conf->mirrors+mirror)->rdev) { + + blk_queue_stack_limits(mddev->queue, + +diff .prev/drivers/md/raid5.c ./drivers/md/raid5.c +--- .prev/drivers/md/raid5.c 2008-05-30 14:49:35.000000000 +1000 ++++ ./drivers/md/raid5.c 2008-06-10 10:27:51.000000000 +1000 +@@ -4399,21 +4399,27 @@ static int raid5_add_disk(mddev_t *mddev + int found = 0; + int disk; + struct disk_info *p; ++ int first = 0; ++ int last = conf->raid_disks - 1; + + if (mddev->degraded > conf->max_degraded) + /* no point adding a device */ + return 0; + ++ if (rdev->raid_disk >= 0) ++ first = last = rdev->raid_disk; ++ + /* + * find the disk ... but prefer rdev->saved_raid_disk + * if possible. 
+ */ + if (rdev->saved_raid_disk >= 0 && ++ rdev->saved_raid_disk >= first && + conf->disks[rdev->saved_raid_disk].rdev == NULL) + disk = rdev->saved_raid_disk; + else +- disk = 0; +- for ( ; disk < conf->raid_disks; disk++) ++ disk = first; ++ for ( ; disk <= last ; disk++) + if ((p=conf->disks + disk)->rdev == NULL) { + clear_bit(In_sync, &rdev->flags); + rdev->raid_disk = disk; diff --git a/managemon.c b/managemon.c new file mode 100644 index 00000000..c947552e --- /dev/null +++ b/managemon.c @@ -0,0 +1,524 @@ + +/* + * The management thread for monitoring active md arrays. + * This thread does things which might block such as memory + * allocation. + * In particular: + * + * - Find out about new arrays in this container. + * Allocate the data structures and open the files. + * + * For this we watch /proc/mdstat and find new arrays with + * metadata type that confirms sharing. e.g. "md4" + * When we find a new array we slip it into the list of + * arrays and signal 'monitor' by writing to a pipe. + * + * - Respond to reshape requests by allocating new data structures + * and opening new files. + * + * These come as a change to raid_disks. We allocate a new + * version of the data structures and slip it into the list. + * 'monitor' will notice and release the old version. + * Changes to level, chunksize, layout.. do not need re-allocation. + * Reductions in raid_disks don't really either, but we handle + * them the same way for consistency. + * + * - When a device is added to the container, we add it to the metadata + * as a spare. + * + * - Deal with degraded array + * We only do this when first noticing the array is degraded. + * This can be when we first see the array, when sync completes or + * when recovery completes. + * + * Check if number of failed devices suggests recovery is needed, and + * skip if not. + * Ask metadata to allocate a spare device + * Add device as not in_sync and give a role + * Update metadata. + * Open sysfs files and pass to monitor. 
+ * Make sure that monitor starts recovery. + * + * - Pass on metadata updates from external programs such as + * mdadm creating a new array. + * + * This is the messiest part. + * It might involve adding a new array or changing the status of + * a spare, or any reconfig that the kernel doesn't get involved in. + * + * The required updates are received via a named pipe. There will + * be one named pipe for each container. Each message contains a + * sync marker: 0x5a5aa5a5, a byte count, and the message. This is + * passed to the metadata handler, which will interpret and process it. + * For 'DDF', messages are internal data blocks with the leading + * 'magic number' signifying what sort of data each one is. + * + */ + +/* + * We select on /proc/mdstat and the named pipe. + * We create new arrays or updated versions of arrays and slip + * them into the head of the list, then signal 'monitor' via a pipe write. + * 'monitor' will notice and place the old array on a return list. + * Metadata updates are placed on a queue just as they arrive + * from the named pipe. + * + * When new arrays are found based on the correct metadata string, we + * need to identify them with an entry in the metadata. Maybe we require + * the metadata to be mdX/NN where NN is the index into an appropriate table. + * + */ + +/* + * List of tasks: + * - Watch for spares to be added to the container, and write updated + * metadata to them. + * - Watch for new arrays using this container, confirm they match metadata + * and if so, start monitoring them. + * - Watch for spares being added to monitored arrays. This shouldn't + * happen, as we should do all the adding. Just remove them. + * - Watch for changes in raid-disks, chunk-size, etc. Update metadata and + * start a reshape. 
+ */ +#ifndef _GNU_SOURCE +#define _GNU_SOURCE +#endif +#include "mdadm.h" +#include "mdmon.h" +#include <sys/syscall.h> +#include <sys/socket.h> +#include <signal.h> + +static void close_aa(struct active_array *aa) +{ + struct mdinfo *d; + + for (d = aa->info.devs; d; d = d->next) + close(d->state_fd); + + close(aa->action_fd); + close(aa->info.state_fd); + close(aa->resync_start_fd); +} + +static void free_aa(struct active_array *aa) +{ + /* Note that this doesn't close fds if they are being used + * by a clone. ->container will be set for a clone + */ + dprintf("%s: devnum: %d\n", __func__, aa->devnum); + if (!aa->container) + close_aa(aa); + while (aa->info.devs) { + struct mdinfo *d = aa->info.devs; + aa->info.devs = d->next; + free(d); + } + free(aa); +} + +static struct active_array *duplicate_aa(struct active_array *aa) +{ + struct active_array *newa = malloc(sizeof(*newa)); + struct mdinfo **dp1, **dp2; + + *newa = *aa; + newa->next = NULL; + newa->replaces = NULL; + newa->info.next = NULL; + + dp2 = &newa->info.devs; + + for (dp1 = &aa->info.devs; *dp1; dp1 = &(*dp1)->next) { + struct mdinfo *d; + if ((*dp1)->state_fd < 0) + continue; + + d = malloc(sizeof(*d)); + *d = **dp1; + *dp2 = d; + dp2 = & d->next; + } + *dp2 = NULL; + + return newa; +} + +static void wakeup_monitor(void) +{ + /* tgkill(getpid(), mon_tid, SIGUSR1); */ + int pid = getpid(); + syscall(SYS_tgkill, pid, mon_tid, SIGUSR1); +} + +static void remove_old(void) +{ + if (discard_this) { + discard_this->next = NULL; + free_aa(discard_this); + if (pending_discard == discard_this) + pending_discard = NULL; + discard_this = NULL; + wakeup_monitor(); + } +} + +static void replace_array(struct supertype *container, + struct active_array *old, + struct active_array *new) +{ + /* To replace an array, we add it to the top of the list + * marked with ->replaces to point to the original. + * 'monitor' will take the original out of the list + * and put it on 'discard_this'. 
We take it from there + * and discard it. + */ + remove_old(); + while (pending_discard) { + while (discard_this == NULL) + sleep(1); + remove_old(); + } + pending_discard = old; + new->replaces = old; + new->next = container->arrays; + container->arrays = new; + wakeup_monitor(); +} + +struct metadata_update *update_queue = NULL; +struct metadata_update *update_queue_handled = NULL; +struct metadata_update *update_queue_pending = NULL; + +void check_update_queue(struct supertype *container) +{ + while (update_queue_handled) { + struct metadata_update *this = update_queue_handled; + update_queue_handled = this->next; + free(this->buf); + if (this->space) + free(this->space); + free(this); + } + if (update_queue == NULL && + update_queue_pending) { + update_queue = update_queue_pending; + update_queue_pending = NULL; + wakeup_monitor(); + } +} + +static void queue_metadata_update(struct metadata_update *mu) +{ + struct metadata_update **qp; + + qp = &update_queue_pending; + while (*qp) + qp = & ((*qp)->next); + *qp = mu; +} + +void wait_update_handled(void) +{ + /* Wait for any pending update to be handled by monitor. + * i.e. wait until update_queue is NULL + */ + while (update_queue) + usleep(100 * 1000); +} + +static void manage_container(struct mdstat_ent *mdstat, + struct supertype *container) +{ + /* The only thing of interest here is if a new device + * has been added to the container. We add it to the + * array ignoring any metadata on it. + * FIXME should we look for compatible metadata and take hints + * about spare assignment.... probably not. + */ + if (mdstat->devcnt != container->devcnt) { + /* read /sys/block/NAME/md/dev-??/block/dev to find out + * what is there, and compare with container->info.devs + * To see what is removed and what is added. 
+ * These need to be remove from, or added to, the array + */ + // FIXME + container->devcnt = mdstat->devcnt; + } +} + +static void manage_member(struct mdstat_ent *mdstat, + struct active_array *a) +{ + /* Compare mdstat info with known state of member array. + * We do not need to look for device state changes here, that + * is dealt with by the monitor. + * + * We just look for changes which suggest that a reshape is + * being requested. + * Unfortunately decreases in raid_disks don't show up in + * mdstat until the reshape completes FIXME. + * + * Actually, we also want to handle degraded arrays here by + * trying to find and assign a spare. + * We do that whenever the monitor tells us too. + */ + // FIXME + a->info.array.raid_disks = mdstat->raid_disks; + a->info.array.chunk_size = mdstat->chunk_size; + // MORE + + if (a->check_degraded) { + struct metadata_update *updates = NULL; + struct mdinfo *newdev; + struct active_array *newa; + wait_update_handled(); + a->check_degraded = 0; + + /* The array may not be degraded, this is just a good time + * to check. + */ + newdev = a->container->ss->activate_spare(a, &updates); + if (newdev) { + struct mdinfo *d; + /* Cool, we can add a device or several. */ + newa = duplicate_aa(a); + /* suspend recovery - maybe not needed */ + + /* Add device to array and set offset/size/slot. 
+ * and open files for each newdev */ + for (d = newdev; d ; d = d->next) { + struct mdinfo *newd; + if (sysfs_add_disk(&newa->info, d) < 0) + continue; + newd = newa->info.devs; + newd->state_fd = sysfs_open(a->devnum, + newd->sys_name, + "state"); + newd->prev_state + = read_dev_state(newd->state_fd); + newd->curr_state = newd->prev_state; + } + queue_metadata_update(updates); + replace_array(a->container, a, newa); + sysfs_set_str(&a->info, NULL, "sync_action", "repair"); + } + } +} + +static void manage_new(struct mdstat_ent *mdstat, + struct supertype *container, + struct active_array *victim) +{ + /* A new array has appeared in this container. + * Hopefully it is already recorded in the metadata. + * Check, then create the new array to report it to + * the monitor. + */ + + struct active_array *new; + struct mdinfo *mdi, *di; + char *inst; + int i; + + new = malloc(sizeof(*new)); + + memset(new, 0, sizeof(*new)); + + new->devnum = mdstat->devnum; + strcpy(new->info.sys_name, devnum2devname(new->devnum)); + + new->prev_state = new->curr_state = new->next_state = inactive; + new->prev_action= new->curr_action= new->next_action= idle; + + new->container = container; + + inst = &mdstat->metadata_version[10+strlen(container->devname)+1]; + + mdi = sysfs_read(-1, new->devnum, + GET_LEVEL|GET_CHUNK|GET_DISKS|GET_COMPONENT| + GET_DEVS|GET_OFFSET|GET_SIZE|GET_STATE); + if (!mdi) { + /* Eeek. Cannot monitor this array. 
+ * Mark it to be ignored by setting container to NULL + */ + new->container = NULL; + replace_array(container, victim, new); + return; + } + + new->info.array = mdi->array; + new->info.component_size = mdi->component_size; + + for (i = 0; i < new->info.array.raid_disks; i++) { + struct mdinfo *newd = malloc(sizeof(*newd)); + + for (di = mdi->devs; di; di = di->next) + if (i == di->disk.raid_disk) + break; + + if (di) { + memcpy(newd, di, sizeof(*newd)); + + newd->state_fd = sysfs_open(new->devnum, + newd->sys_name, + "state"); + + newd->prev_state = read_dev_state(newd->state_fd); + newd->curr_state = newd->prev_state; + } else { + newd->state_fd = -1; + newd->disk.raid_disk = i; + newd->prev_state = DS_REMOVE; + newd->curr_state = DS_REMOVE; + } + sprintf(newd->sys_name, "rd%d", i); + newd->next = new->info.devs; + new->info.devs = newd; + } + new->action_fd = sysfs_open(new->devnum, NULL, "sync_action"); + new->info.state_fd = sysfs_open(new->devnum, NULL, "array_state"); + new->resync_start_fd = sysfs_open(new->devnum, NULL, "resync_start"); + get_resync_start(new); + dprintf("%s: inst: %d action: %d state: %d\n", __func__, atoi(inst), + new->action_fd, new->info.state_fd); + + sysfs_free(mdi); + // finds and compares. + if (container->ss->open_new(container, new, inst) < 0) { + // FIXME close all those files + new->container = NULL; + replace_array(container, victim, new); + return; + } + replace_array(container, victim, new); + return; +} + +void manage(struct mdstat_ent *mdstat, struct supertype *container) +{ + /* We have just read mdstat and need to compare it with + * the known active arrays. + * Arrays with the wrong metadata are ignored. 
+ */ + + for ( ; mdstat ; mdstat = mdstat->next) { + struct active_array *a; + if (mdstat->devnum == container->devnum) { + manage_container(mdstat, container); + continue; + } + if (mdstat->metadata_version == NULL || + strncmp(mdstat->metadata_version, "external:/", 10) != 0 || + strncmp(mdstat->metadata_version+10, container->devname, + strlen(container->devname)) != 0 || + mdstat->metadata_version[10+strlen(container->devname)] + != '/') + /* Not for this array */ + continue; + /* Looks like a member of this container */ + for (a = container->arrays; a; a = a->next) { + if (mdstat->devnum == a->devnum) { + if (a->container) + manage_member(mdstat, a); + break; + } + } + if (a == NULL || !a->container) + manage_new(mdstat, container, a); + } +} + +static void handle_message(struct supertype *container, struct metadata_update *msg) +{ + /* queue this metadata update through to the monitor */ + + struct metadata_update *mu; + + if (msg->len == 0) { + int cnt = monitor_loop_cnt; + if (cnt & 1) + cnt += 2; /* wait until next pselect */ + else + cnt += 3; /* wait for 2 pselects */ + wakeup_monitor(); + wait_update_handled(); + while (monitor_loop_cnt - cnt < 0) + usleep(10 * 1000); + } else { + mu = malloc(sizeof(*mu)); + mu->len = msg->len; + mu->buf = msg->buf; + msg->buf = NULL; + mu->space = NULL; + mu->next = NULL; + if (container->ss->prepare_update) + container->ss->prepare_update(container, mu); + queue_metadata_update(mu); + } +} + +void read_sock(struct supertype *container) +{ + int fd; + struct metadata_update msg; + int terminate = 0; + long fl; + int tmo = 3; /* 3 second timeout before hanging up the socket */ + + fd = accept(container->sock, NULL, NULL); + if (fd < 0) + return; + + fl = fcntl(fd, F_GETFL, 0); + fl |= O_NONBLOCK; + fcntl(fd, F_SETFL, fl); + + do { + msg.buf = NULL; + + /* read and validate the message */ + if (receive_message(fd, &msg, tmo) == 0) { + handle_message(container, &msg); + if (ack(fd, tmo) < 0) + terminate = 1; + } else + 
terminate = 1; + + } while (!terminate); + + close(fd); +} + +int exit_now = 0; +int manager_ready = 0; +void do_manager(struct supertype *container) +{ + struct mdstat_ent *mdstat; + sigset_t set; + + sigprocmask(SIG_UNBLOCK, NULL, &set); + sigdelset(&set, SIGUSR1); + + do { + + if (exit_now) + exit(0); + + mdstat = mdstat_read(1, 0); + + manage(mdstat, container); + + read_sock(container); + + free_mdstat(mdstat); + + remove_old(); + + check_update_queue(container); + + manager_ready = 1; + + mdstat_wait_fd(container->sock, &set); + } while(1); +} @@ -33,8 +33,8 @@ * also allows the array device name to be easily found. * * The map file is line based with space separated fields. The fields are: - * Device id - mdX or mdpX where is a number. - * metadata - 0.90 1.0 1.1 1.2 + * Device id - mdX or mdpX where X is a number. + * metadata - 0.90 1.0 1.1 1.2 ddf ... * UUID - uuid of the array * path - path where device created: /dev/md/home * @@ -62,7 +62,7 @@ int map_write(struct map_ent *mel) fprintf(f, "mdp%d ", -1-mel->devnum); else fprintf(f, "md%d ", mel->devnum); - fprintf(f, "%d.%d ", mel->major, mel->minor); + fprintf(f, "%s ", mel->metadata); fprintf(f, "%08x:%08x:%08x:%08x ", mel->uuid[0], mel->uuid[1], mel->uuid[2], mel->uuid[3]); fprintf(f, "%s\n", mel->path); @@ -87,13 +87,12 @@ int map_write(struct map_ent *mel) } void map_add(struct map_ent **melp, - int devnum, int major, int minor, int uuid[4], char *path) + int devnum, char *metadata, int uuid[4], char *path) { struct map_ent *me = malloc(sizeof(*me)); me->devnum = devnum; - me->major = major; - me->minor = minor; + strcpy(me->metadata, metadata); memcpy(me->uuid, uuid, 16); me->path = strdup(path); me->next = *melp; @@ -105,7 +104,8 @@ void map_read(struct map_ent **melp) FILE *f; char buf[8192]; char path[200]; - int devnum, major, minor, uuid[4]; + int devnum, uuid[4]; + char metadata[30]; char nam[4]; *melp = NULL; @@ -117,12 +117,12 @@ void map_read(struct map_ent **melp) return; while 
(fgets(buf, sizeof(buf), f)) { - if (sscanf(buf, " md%1[p]%d %d.%d %x:%x:%x:%x %200s", - nam, &devnum, &major, &minor, uuid, uuid+1, + if (sscanf(buf, " md%1[p]%d %s %x:%x:%x:%x %200s", + nam, &devnum, metadata, uuid, uuid+1, uuid+2, uuid+3, path) == 9) { if (nam[0] == 'p') devnum = -1 - devnum; - map_add(melp, devnum, major, minor, uuid, path); + map_add(melp, devnum, metadata, uuid, path); } } fclose(f); @@ -138,7 +138,7 @@ void map_free(struct map_ent *map) } } -int map_update(struct map_ent **mpp, int devnum, int major, int minor, +int map_update(struct map_ent **mpp, int devnum, char *metadata, int *uuid, char *path) { struct map_ent *map, *mp; @@ -151,15 +151,14 @@ int map_update(struct map_ent **mpp, int devnum, int major, int minor, for (mp = map ; mp ; mp=mp->next) if (mp->devnum == devnum) { - mp->major = major; - mp->minor = minor; + strcpy(mp->metadata, metadata); memcpy(mp->uuid, uuid, 16); free(mp->path); mp->path = strdup(path); break; } if (!mp) - map_add(&map, devnum, major, minor, uuid, path); + map_add(&map, devnum, metadata, uuid, path); *mpp = NULL; rv = map_write(map); map_free(map); @@ -526,10 +526,22 @@ Finally, "idle" can be written to stop the check/repair process. .B md/stripe_cache_size This is only available on RAID5 and RAID6. It records the size (in pages per device) of the stripe cache which is used for synchronising -all read and write operations to the array. The default is 128. +all write operations to the array and all read operations if the array +is degraded. The default is 256. Valid values are 17 to 32768. Increasing this number can increase performance in some situations, at -some cost in system memory. +some cost in system memory. Note, setting this value too high can +result in an "out of memory" condition for the system. +memory_consumed = system_page_size * nr_disks * stripe_cache_size + +.TP +.B md/preread_bypass_threshold +This is only available on RAID5 and RAID6. 
This variable sets the +number of times MD will service a full-stripe-write before servicing a +stripe that requires some "prereading". For fairness this defaults to +1. Valid values are 0 to stripe_cache_size. Setting this to 0 +maximizes sequential-write throughput at the cost of fairness to threads +doing small or random writes. .SS KERNEL PARAMETERS @@ -1937,6 +1937,16 @@ that no metadata updates are made and no attempt at resync or recovery happens. Further devices that are found before the first write can still be added safely. + +.SH ENVIRONMENT +This section describes environment variables that affect how mdadm +operates. + +.TP +.B MDADM_NO_MDMON +Setting this value to 1 will prevent mdadm from automatically launching +mdmon. This variable is intended primarily for debugging mdadm/mdmon. + .SH EXAMPLES .B " mdadm \-\-query /dev/name-of-device" @@ -1272,7 +1272,8 @@ int main(int argc, char *argv[]) export, test, homehost); continue; case 'K': /* Zero superblock */ - rv |= Kill(dv->devname, force, quiet); continue; + rv |= Kill(dv->devname, force, quiet,0); + continue; case 'Q': rv |= Query(dv->devname); continue; case 'X': @@ -76,6 +76,7 @@ extern __off64_t lseek64 __P ((int __fd, __off64_t __offset, int __whence)); #include "md_u.h" #include "md_p.h" #include "bitmap.h" +#include "msg.h" #include <endian.h> /* Redhat don't like to #include <asm/byteorder.h>, and @@ -106,6 +107,13 @@ extern __off64_t lseek64 __P ((int __fd, __off64_t __offset, int __whence)); #define __le16_to_cpu(_x) (_x) #define __le32_to_cpu(_x) (_x) #define __le64_to_cpu(_x) (_x) + +#define __cpu_to_be16(_x) bswap_16(_x) +#define __cpu_to_be32(_x) bswap_32(_x) +#define __cpu_to_be64(_x) bswap_64(_x) +#define __be16_to_cpu(_x) bswap_16(_x) +#define __be32_to_cpu(_x) bswap_32(_x) +#define __be64_to_cpu(_x) bswap_64(_x) #elif BYTE_ORDER == BIG_ENDIAN #define __cpu_to_le16(_x) bswap_16(_x) #define __cpu_to_le32(_x) bswap_32(_x) @@ -113,6 +121,13 @@ extern __off64_t lseek64 __P ((int __fd, 
__off64_t __offset, int __whence)); #define __le16_to_cpu(_x) bswap_16(_x) #define __le32_to_cpu(_x) bswap_32(_x) #define __le64_to_cpu(_x) bswap_64(_x) + +#define __cpu_to_be16(_x) (_x) +#define __cpu_to_be32(_x) (_x) +#define __cpu_to_be64(_x) (_x) +#define __be16_to_cpu(_x) (_x) +#define __be32_to_cpu(_x) (_x) +#define __be64_to_cpu(_x) (_x) #else # error "unknown endianness." #endif @@ -128,18 +143,36 @@ struct mdinfo { int uuid[4]; char name[33]; unsigned long long data_offset; - unsigned long long component_size; + unsigned long long component_size; /* same as array.size, except in + * sectors and up to 64bits. + */ int reshape_active; unsigned long long reshape_progress; + unsigned long long resync_start; int new_level, delta_disks, new_layout, new_chunk; int errors; int cache_size; /* size of raid456 stripe cache*/ int mismatch_cnt; char text_version[50]; + int container_member; /* for assembling external-metatdata arrays + * This is to be used internally by metadata + * handler only */ + char sys_name[20]; struct mdinfo *devs; struct mdinfo *next; + + /* Device info for mdmon: */ + int state_fd; + #define DS_FAULTY 1 + #define DS_INSYNC 2 + #define DS_WRITE_MOSTLY 4 + #define DS_SPARE 8 + #define DS_BLOCKED 16 + #define DS_REMOVE 1024 + int prev_state, curr_state, next_state; + }; struct createinfo { @@ -252,22 +285,27 @@ struct mdstat_ent { char *pattern; /* U or up, _ for down */ int percent; /* -1 if no resync */ int resync; /* 1 if resync, 0 if recovery */ + int devcnt; + int raid_disks; + int chunk_size; + char * metadata_version; struct mdstat_ent *next; }; extern struct mdstat_ent *mdstat_read(int hold, int start); extern void free_mdstat(struct mdstat_ent *ms); extern void mdstat_wait(int seconds); +extern void mdstat_wait_fd(int fd, const sigset_t *sigmask); extern int mddev_busy(int devnum); struct map_ent { struct map_ent *next; int devnum; - int major,minor; + char metadata[20]; int uuid[4]; char *path; }; -extern int map_update(struct map_ent 
**mpp, int devnum, int major, int minor, +extern int map_update(struct map_ent **mpp, int devnum, char *metadata, int uuid[4], char *path); extern struct map_ent *map_by_uuid(struct map_ent **map, int uuid[4]); extern void map_read(struct map_ent **melp); @@ -275,7 +313,7 @@ extern int map_write(struct map_ent *mel); extern void map_delete(struct map_ent **mapp, int devnum); extern void map_free(struct map_ent *map); extern void map_add(struct map_ent **melp, - int devnum, int major, int minor, int uuid[4], char *path); + int devnum, char *metadata, int uuid[4], char *path); /* various details can be requested */ #define GET_LEVEL 1 @@ -285,6 +323,7 @@ extern void map_add(struct map_ent **melp, #define GET_CACHE 16 #define GET_MISMATCH 32 #define GET_VERSION 64 +#define GET_DISKS 128 #define GET_DEVS 1024 /* gets role, major, minor */ #define GET_OFFSET 2048 @@ -295,6 +334,7 @@ extern void map_add(struct map_ent **melp, /* If fd >= 0, get the array it is open on, * else use devnum. >=0 -> major9. <0..... 
*/ +extern int sysfs_open(int devnum, char *devname, char *attr); extern void sysfs_free(struct mdinfo *sra); extern struct mdinfo *sysfs_read(int fd, int devnum, unsigned long options); extern int sysfs_set_str(struct mdinfo *sra, struct mdinfo *dev, @@ -303,6 +343,11 @@ extern int sysfs_set_num(struct mdinfo *sra, struct mdinfo *dev, char *name, unsigned long long val); extern int sysfs_get_ll(struct mdinfo *sra, struct mdinfo *dev, char *name, unsigned long long *val); +extern int sysfs_set_array(struct mdinfo *sra, + struct mdinfo *info); +extern int sysfs_add_disk(struct mdinfo *sra, struct mdinfo *sd); +extern int sysfs_disk_to_scsi_id(int fd, __u32 *id); +extern int sysfs_unique_holder(int devnum, long rdev); extern int save_stripes(int *source, unsigned long long *offsets, @@ -326,28 +371,126 @@ extern mapping_t r5layout[], pers[], modes[], faultylayout[]; extern char *map_dev(int major, int minor, int create); +struct active_array; +struct metadata_update; +/* A superswitch provides entry point the a metadata handler. + * + * The super_switch primarily operates on some "metadata" that + * is accessed via the 'supertype'. + * This metadata has one of three possible sources. + * 1/ It is read from a single device. In this case it may not completely + * describe the array or arrays as some information might be on other + * devices. + * 2/ It is read from all devices in a container. In this case all + * information is present. + * 3/ It is created by ->init_super / ->add_to_super. In this case it will + * be complete once enough ->add_to_super calls have completed. + * + * When creating an array inside a container, the metadata will be + * formed by a combination of 2 and 3. The metadata or the array is read, + * then new information is added. + * + * The metadata must sometimes have a concept of a 'current' array + * and a 'current' device. 
+ * The 'current' array is set by init_super to be the newly created array, + * or is set by super_by_fd when it finds it is looking at an array inside + * a container. + * + * The 'current' device is either the device that the metadata was read from + * in case 1, or the last device added by add_to_super in case 3. + * Case 2 does not identify a 'current' device. + */ extern struct superswitch { + + /* Used to report details of metadata read from a component + * device. ->load_super has been called. + */ void (*examine_super)(struct supertype *st, char *homehost); void (*brief_examine_super)(struct supertype *st); void (*export_examine_super)(struct supertype *st); + + /* Used to report details of an active array. + * ->load_super was possibly given a 'component' string. + */ void (*detail_super)(struct supertype *st, char *homehost); void (*brief_detail_super)(struct supertype *st); void (*export_detail_super)(struct supertype *st); + + /* Used: + * to get uuid to storing in bitmap metadata + * and 'reshape' backup-data metadata + * To see if a device is being re-added to an array it was part of. + */ void (*uuid_from_super)(struct supertype *st, int uuid[4]); + + /* Extra generic details from metadata. This could be details about + * the container, or about an individual array within the container. + * The determination is made either by: + * load_super being given a 'component' string. + * validate_geometry determining what to create. + * The info includes both array information and device information. + * The particular device should be: + * The last device added by add_to_super + * The device the metadata was loaded from by load_super + */ void (*getinfo_super)(struct supertype *st, struct mdinfo *info); + + /* Check if the given metadata is flagged as belonging to "this" + * host. For arrays that don't determine a minor-number, this + * can always be true (??) 
+ */ int (*match_home)(struct supertype *st, char *homehost); + + /* Make one of several generic modifications to metadata + * prior to assembly (or other times). + * sparc2.2 - first bug in early 0.90 metadata + * super-minor - change name of 0.90 metadata + * summaries - 'correct' any redundant data + * resync - mark array as dirty to trigger a resync. + * uuid - set new uuid - only 0.90 or 1.x + * name - change the name of the array (where supported) + * homehost - change which host this array is tied to. + * devicesize - If metadata is at start of device, change recorded + * device size to match actual device size + * byteorder - swap bytes for 0.90 metadata + * + * force-one - mark that device as uptodate, not old or failed. + * force-array - mark array as clean if it would not otherwise + * assemble + * assemble - not sure how this is different from force-one... + * linear-grow-new - add a new device to a linear array, but don't + * change the size: so superblock still matches + * linear-grow-update - now change the size of the array. + */ int (*update_super)(struct supertype *st, struct mdinfo *info, char *update, char *devname, int verbose, int uuid_set, char *homehost); + + /* Create new metadata for new array as described. This could + * be a new container, or an array in a pre-existing container. + * Also used to zero metadata prior to writing it to invalidate old + * metadata. + */ int (*init_super)(struct supertype *st, mdu_array_info_t *info, unsigned long long size, char *name, char *homehost, int *uuid); - void (*add_to_super)(struct supertype *st, mdu_disk_info_t *dinfo); + + /* update the metadata to include new device, either at create or + * when hot-adding a spare. + */ + void (*add_to_super)(struct supertype *st, mdu_disk_info_t *dinfo, + int fd, char *devname); + + /* Write metadata to one device when fixing problems or adding + * a new device. 
+ */ int (*store_super)(struct supertype *st, int fd); - int (*write_init_super)(struct supertype *st, mdu_disk_info_t *dinfo, - char *devname); + + /* Write all metadata for this array. + */ + int (*write_init_super)(struct supertype *st); int (*compare_super)(struct supertype *st, struct supertype *tst); int (*load_super)(struct supertype *st, int fd, char *devname); struct supertype * (*match_metadata_desc)(char *arg); @@ -358,15 +501,108 @@ extern struct superswitch { void (*locate_bitmap)(struct supertype *st, int fd); int (*write_bitmap)(struct supertype *st, int fd); void (*free_super)(struct supertype *st); - int major; + + /* validate_geometry is called with an st returned by + * match_metadata_desc. + * It should check that the geometry described in compatible with + * the metadata type. It will be called repeatedly as devices + * added to validate changing size and new devices. If there are + * inter-device dependencies, it should record sufficient details + * so these can be validated. + */ + int (*validate_geometry)(struct supertype *st, int level, int layout, + int raiddisks, + int chunk, unsigned long long size, + char *subdev, unsigned long long *freesize, + int verbose); + + struct mdinfo *(*container_content)(struct supertype *st); + +/* for mdmon */ + int (*open_new)(struct supertype *c, struct active_array *a, + char *inst); + + /* Tell the metadata handler the current state of the array. + * This covers whether it is known to be consistent (no pending writes) + * when how far along a resync is known to have progressed + * (in a->resync_start). + * resync status is really irrelevant if the array is not consistent, + * but some metadata (DDF!) have a place to record the distinction. + */ + void (*set_array_state)(struct active_array *a, int consistent); + + /* When the state of a device might have changed, we call set_disk to + * tell the metadata what the current state is. 
+ * Typically this happens on spare->in_sync and (spare|in_sync)->faulty + * transitions. + * set_disk might be called when the state of the particular disk has + * not in fact changed. + */ + void (*set_disk)(struct active_array *a, int n, int state); + void (*sync_metadata)(struct supertype *st); + void (*process_update)(struct supertype *st, + struct metadata_update *update); + void (*prepare_update)(struct supertype *st, + struct metadata_update *update); + + /* activate_spare will check if the array is degraded and, if it + * is, try to find some spare space in the container. + * On success, it add appropriate updates (For process_update) to + * to the 'updates' list and returns a list of 'mdinfo' identifying + * the device, or devices as there might be multiple missing + * devices and multiple spares available. + */ + struct mdinfo *(*activate_spare)(struct active_array *a, + struct metadata_update **updates); + int swapuuid; /* true if uuid is bigending rather than hostendian */ -} super0, super1, *superlist[]; + int external; +} super0, super1, super_ddf, *superlist[]; +extern struct superswitch super_imsm; + +struct metadata_update { + int len; + char *buf; + void *space; /* allocated space that monitor will use */ + struct metadata_update *next; +}; + +/* A supertype holds a particular collection of metadata. + * It identifies the metadata type by the superswitch, and the particular + * sub-version of that metadata type. + * metadata read in or created is stored in 'sb' and 'info'. + * There are also fields used by mdmon to track containers. 
+ * + * A supertype is created by: + * super_by_fd + * guess_super + * dup_super + */ struct supertype { struct superswitch *ss; int minor_version; int max_devs; + int container_dev; /* devnum of container */ + char subarray[32]; /* name of array inside container */ void *sb; + void *info; + + struct metadata_update *updates; + struct metadata_update **update_tail; + + /* extra stuff used by mdmon */ + struct active_array *arrays; + int sock; /* listen to external programs */ + int devnum; + char *devname; /* e.g. md0. This appears in metadata_verison: + * external:/md0/12 + */ + int devcnt; + char *device_name; /* e.g. /dev/md/whatever */ + + struct mdinfo *devs; + }; extern struct supertype *super_by_fd(int fd); @@ -459,11 +695,13 @@ extern int Monitor(mddev_dev_t devlist, int period, int daemonise, int scan, int oneshot, int dosyslog, int test, char *pidfile); -extern int Kill(char *dev, int force, int quiet); +extern int Kill(char *dev, int force, int quiet, int noexcl); extern int Wait(char *dev); extern int Incremental(char *devname, int verbose, int runstop, struct supertype *st, char *homehost, int autof); +extern int Incremental_container(struct supertype *st, char *devname, + int verbose, int runstop, int autof); extern void RebuildMap(void); extern int IncrementalScan(int verbose); @@ -484,6 +722,7 @@ extern int check_raid(int fd, char *name); extern int get_mdp_major(void); extern int dev_open(char *dev, int flags); +extern int open_dev_excl(int devnum); extern int is_standard(char *dev, int *nump); extern int parse_auto(char *str, char *msg, int config); @@ -509,6 +748,10 @@ extern int enough(int level, int raid_disks, int layout, int clean, extern int ask(char *mesg); extern unsigned long long get_component_size(int fd); extern void remove_partitions(int fd); +extern unsigned long long calc_array_size(int level, int raid_disks, int layout, + int chunksize, unsigned long long devsize); +extern int flush_metadata_updates(struct supertype *st); +extern 
void append_metadata_update(struct supertype *st, void *buf, int len); extern char *human_size(long long bytes); @@ -525,12 +768,45 @@ extern char DefaultConfFile[]; extern int open_mddev(char *dev, int autof); extern int open_mddev_devnum(char *devname, int devnum, char *name, char *chosen_name, int parts); - +extern int open_container(int fd); + +extern int mdmon_running(int devnum); +extern int signal_mdmon(int devnum); +extern int env_no_mdmon(void); +extern int start_mdmon(int devnum); + +extern char *devnum2devname(int num); +extern int devname2devnum(char *name); +extern int fd2devnum(int fd); + +static inline int dev2major(int d) +{ + if (d >= 0) + return MD_MAJOR; + else + return get_mdp_major(); +} + +static inline int dev2minor(int d) +{ + if (d >= 0) + return d; + return (-1-d) << MdpMinorShift; +} + +static inline int ROUND_UP(int a, int base) +{ + return ((a+base-1)/base)*base; +} #define LEVEL_MULTIPATH (-4) #define LEVEL_LINEAR (-1) #define LEVEL_FAULTY (-5) +/* kernel module doesn't know about these */ +#define LEVEL_CONTAINER (-100) +#define LEVEL_UNSUPPORTED (-200) + /* faulty stuff */ diff --git a/mdmon.c b/mdmon.c new file mode 100644 index 00000000..85f44bc2 --- /dev/null +++ b/mdmon.c @@ -0,0 +1,348 @@ + +/* + * md array manager. + * When md arrays have user-space managed metadata, this is the program + * that does the managing. + * + * Given one argument: the name of the array (e.g. /dev/md0) that is + * the container. + * We fork off a helper that runs high priority and mlocked. It responds to + * device failures and other events that might stop writeout, or that are + * trivial to deal with. + * The main thread then watches for new arrays being created in the container + * and starts monitoring them too ... along with a few other tasks. + * + * The main thread communicates with the priority thread by writing over + * a pipe. + * Separate programs can communicate with the main thread via Unix-domain + * socket. 
+ * The two threads share address space and open file table. + * + */ + +#ifndef _GNU_SOURCE +#define _GNU_SOURCE +#endif + +#include <unistd.h> +#include <stdlib.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <sys/socket.h> +#include <sys/un.h> +#include <sys/mman.h> +#include <sys/syscall.h> +#include <sys/wait.h> +#include <stdio.h> +#include <errno.h> +#include <string.h> +#include <fcntl.h> +#include <signal.h> + +#include <sched.h> + +#include "mdadm.h" +#include "mdmon.h" + +struct active_array *discard_this; +struct active_array *pending_discard; + +int mon_tid, mgr_tid; + +int run_child(void *v) +{ + struct supertype *c = v; + + do_monitor(c); + return 0; +} + +int clone_monitor(struct supertype *container) +{ + static char stack[4096]; + + mon_tid = clone(run_child, stack+4096-64, + CLONE_FS|CLONE_FILES|CLONE_VM|CLONE_SIGHAND|CLONE_THREAD, + container); + + mgr_tid = syscall(SYS_gettid); + + return mon_tid; +} + +static struct superswitch *find_metadata_methods(char *vers) +{ + if (strcmp(vers, "ddf") == 0) + return &super_ddf; + if (strcmp(vers, "imsm") == 0) + return &super_imsm; + return NULL; +} + + +static int make_pidfile(char *devname, int o_excl) +{ + char path[100]; + char pid[10]; + int fd; + sprintf(path, "/var/run/mdadm/%s.pid", devname); + + fd = open(path, O_RDWR|O_CREAT|o_excl, 0600); + if (fd < 0) + return -1; + sprintf(pid, "%d\n", getpid()); + write(fd, pid, strlen(pid)); + close(fd); + return 0; +} + +static void try_kill_monitor(char *devname) +{ + char buf[100]; + int fd; + pid_t pid; + + sprintf(buf, "/var/run/mdadm/%s.pid", devname); + fd = open(buf, O_RDONLY); + if (fd < 0) + return; + + if (read(fd, buf, sizeof(buf)) < 0) { + close(fd); + return; + } + + close(fd); + pid = strtoul(buf, NULL, 10); + + /* kill this process if it is mdmon */ + sprintf(buf, "/proc/%lu/cmdline", (unsigned long) pid); + fd = open(buf, O_RDONLY); + if (fd < 0) + return; + + if (read(fd, buf, sizeof(buf)) < 0) { + close(fd); + return; + } + + 
if (strstr(buf, "mdmon") != NULL) + kill(pid, SIGTERM); +} + +void remove_pidfile(char *devname) +{ + char buf[100]; + + sprintf(buf, "/var/run/mdadm/%s.pid", devname); + unlink(buf); +} + +static int make_control_sock(char *devname) +{ + char path[100]; + int sfd; + long fl; + struct sockaddr_un addr; + + sprintf(path, "/var/run/mdadm/%s.sock", devname); + unlink(path); + sfd = socket(PF_LOCAL, SOCK_STREAM, 0); + if (sfd < 0) + return -1; + + addr.sun_family = PF_LOCAL; + strcpy(addr.sun_path, path); + if (bind(sfd, &addr, sizeof(addr)) < 0) { + close(sfd); + return -1; + } + listen(sfd, 10); + fl = fcntl(sfd, F_GETFL, 0); + fl |= O_NONBLOCK; + fcntl(sfd, F_SETFL, fl); + return sfd; +} + +static void wake_me(int sig) +{ + +} + +/* if we are debugging and starting mdmon by hand then don't fork */ +static int do_fork(void) +{ + #ifdef DEBUG + if (env_no_mdmon()) + return 0; + #endif + + return 1; +} + + + +int main(int argc, char *argv[]) +{ + int mdfd; + struct mdinfo *mdi, *di; + struct supertype *container; + sigset_t set; + struct sigaction act; + int pfd[2]; + int status; + + if (argc != 2) { + fprintf(stderr, "Usage: md-manage /device/name/for/container\n"); + exit(2); + } + mdfd = open(argv[1], O_RDWR); + if (mdfd < 0) { + fprintf(stderr, "md-manage: %s: %s\n", argv[1], + strerror(errno)); + exit(1); + } + if (md_get_version(mdfd) < 0) { + fprintf(stderr, "md-manage: %s: Not an md device\n", + argv[1]); + exit(1); + } + + /* Fork, and have the child tell us when they are ready */ + if (do_fork()) { + pipe(pfd); + switch(fork()) { + case -1: + fprintf(stderr, "mdmon: failed to fork: %s\n", + strerror(errno)); + exit(1); + case 0: /* child */ + close(pfd[0]); + break; + default: /* parent */ + close(pfd[1]); + if (read(pfd[0], &status, sizeof(status)) != sizeof(status)) { + wait(&status); + status = WEXITSTATUS(status); + } + exit(status); + } + } else + pfd[0] = pfd[1] = -1; + /* hopefully it is a container - we'll check later */ + + container = 
malloc(sizeof(*container)); + container->devnum = fd2devnum(mdfd); + container->devname = devnum2devname(container->devnum); + container->device_name = argv[1]; + + /* If this fails, we hope it already exists */ + mkdir("/var/run/mdadm", 0600); + /* pid file lives in /var/run/mdadm/mdXX.pid */ + if (make_pidfile(container->devname, O_EXCL) < 0) { + if (ping_monitor(container->devname) == 0) { + fprintf(stderr, "mdmon: %s already managed\n", + container->devname); + exit(3); + } else { + /* cleanup the old monitor, this one is taking over */ + try_kill_monitor(container->devname); + if (make_pidfile(container->devname, 0) < 0) { + fprintf(stderr, "mdmon: %s Cannot create pidfile\n", + container->devname); + exit(3); + } + } + } + + container->sock = make_control_sock(container->devname); + if (container->sock < 0) { + fprintf(stderr, "mdmon: Cannot create socket in /var/run/mdadm\n"); + exit(3); + } + container->arrays = NULL; + + mdi = sysfs_read(mdfd, container->devnum, + GET_VERSION|GET_LEVEL|GET_DEVS); + + if (!mdi) { + fprintf(stderr, "mdmon: failed to load sysfs info for %s\n", + container->devname); + exit(3); + } + if (mdi->array.level != UnSet) { + fprintf(stderr, "mdmon: %s is not a container - cannot monitor\n", + argv[1]); + exit(3); + } + if (mdi->array.major_version != -1 || + mdi->array.minor_version != -2) { + fprintf(stderr, "mdmon: %s does not use external metadata - cannot monitor\n", + argv[1]); + exit(3); + } + + container->ss = find_metadata_methods(mdi->text_version); + if (container->ss == NULL) { + fprintf(stderr, "mdmon: %s uses unknown metadata: %s\n", + argv[1], mdi->text_version); + exit(3); + } + + container->devs = NULL; + for (di = mdi->devs; di; di = di->next) { + struct mdinfo *cd = malloc(sizeof(*cd)); + cd = di; + cd->next = container->devs; + container->devs = cd; + } + sysfs_free(mdi); + + + if (container->ss->load_super(container, mdfd, argv[1])) { + fprintf(stderr, "mdmon: Cannot load metadata for %s\n", + argv[1]); + exit(3); 
+ } + + /* Ok, this is close enough. We can say goodbye to our parent now. + */ + status = 0; + write(pfd[1], &status, sizeof(status)); + close(pfd[1]); + + chdir("/"); + setsid(); + close(0); + open("/dev/null", O_RDWR); + close(1); + dup(0); +#ifndef DEBUG + close(2); + dup(0); +#endif + + mlockall(MCL_FUTURE); + + /* SIGUSR is sent between parent and child. So both block it + * and enable it only with pselect. + */ + sigemptyset(&set); + sigaddset(&set, SIGUSR1); + sigprocmask(SIG_BLOCK, &set, NULL); + act.sa_handler = wake_me; + act.sa_flags = 0; + sigaction(SIGUSR1, &act, NULL); + act.sa_handler = SIG_IGN; + sigaction(SIGPIPE, &act, NULL); + + if (clone_monitor(container) < 0) { + fprintf(stderr, "md-manage: failed to start monitor process: %s\n", + strerror(errno)); + exit(2); + } + + do_manager(container); + + exit(0); +} diff --git a/mdmon.h b/mdmon.h new file mode 100644 index 00000000..6c1961ad --- /dev/null +++ b/mdmon.h @@ -0,0 +1,65 @@ +#ifdef DEBUG +#define dprintf(fmt, arg...) \ + fprintf(stderr, fmt, ##arg) +#else +#define dprintf(fmt, arg...) \ + ({ if (0) fprintf(stderr, fmt, ##arg); 0; }) +#endif + +enum array_state { clear, inactive, suspended, readonly, read_auto, + clean, active, write_pending, active_idle, bad_word}; + +enum sync_action { idle, reshape, resync, recover, check, repair, bad_action }; + + +struct active_array { + struct mdinfo info; + struct supertype *container; + struct active_array *next, *replaces; + + int action_fd; + int resync_start_fd; + + enum array_state prev_state, curr_state, next_state; + enum sync_action prev_action, curr_action, next_action; + + int check_degraded; /* flag set by mon, read by manage */ + + int devnum; + + unsigned long long resync_start; +}; + +/* + * Metadata updates are handled by the monitor thread, + * as it has exclusive access to the metadata. + * When the manager want to updates metadata, either + * for it's own reason (e.g. 
committing a spare) or + * on behalf of mdadm, it creates a metadata_update + * structure and queues it to the monitor. + * Updates are created and processed by code under the + * superswitch. All common code sees them as opaque + * blobs. + */ +extern struct metadata_update *update_queue, *update_queue_handled; + +#define MD_MAJOR 9 + +extern struct active_array *container; +extern struct active_array *discard_this; +extern struct active_array *pending_discard; +extern struct md_generic_cmd *active_cmd; + + +void remove_pidfile(char *devname); +void do_monitor(struct supertype *container); +void do_manager(struct supertype *container); + +int read_dev_state(int fd); +int get_resync_start(struct active_array *a); + +struct mdstat_ent *mdstat_read(int hold, int start); + +extern int exit_now, manager_ready; +extern int mon_tid, mgr_tid; +extern int monitor_loop_cnt; @@ -86,6 +86,7 @@ #include "mdadm.h" #include "dlink.h" #include <sys/select.h> +#include <ctype.h> void free_mdstat(struct mdstat_ent *ms) { @@ -94,6 +95,7 @@ void free_mdstat(struct mdstat_ent *ms) if (ms->dev) free(ms->dev); if (ms->level) free(ms->level); if (ms->pattern) free(ms->pattern); + if (ms->metadata_version) free(ms->metadata_version); t = ms; ms = ms->next; free(t); @@ -158,6 +160,10 @@ struct mdstat_ent *mdstat_read(int hold, int start) ent->percent = -1; ent->active = -1; ent->resync = 0; + ent->metadata_version = NULL; + ent->raid_disks = 0; + ent->chunk_size = 0; + ent->devcnt = 0; ent->dev = strdup(line); ent->devnum = devnum; @@ -176,22 +182,28 @@ struct mdstat_ent *mdstat_read(int hold, int start) in_devs = 1; } else if (in_devs && strcmp(w, "blocks")==0) in_devs = 0; - else if (in_devs && strncmp(w, "md", 2)==0) { - /* This has an md device as a component. - * If that device is already in the list, - * make sure we insert before there. 
- */ - struct mdstat_ent **ih; - int dn2; - if (strncmp(w, "md_d", 4)==0) - dn2 = -1-strtoul(w+4, &ep, 10); - else - dn2 = strtoul(w+2, &ep, 10); - ih = &all; - while (ih != insert_here && *ih && - (*ih)->devnum != dn2) - ih = & (*ih)->next; - insert_here = ih; + else if (in_devs) { + ent->devcnt++; + if (strncmp(w, "md", 2)==0) { + /* This has an md device as a component. + * If that device is already in the + * list, make sure we insert before + * there. + */ + struct mdstat_ent **ih; + int dn2 = devname2devnum(w); + ih = &all; + while (ih != insert_here && *ih && + (*ih)->devnum != dn2) + ih = & (*ih)->next; + insert_here = ih; + } + } else if (strcmp(w, "super") == 0 && + dl_next(w) != line) { + w = dl_next(w); + ent->metadata_version = strdup(w); + } else if (w[0] == '[' && isdigit(w[1])) { + ent->raid_disks = atoi(w+1); } else if (!ent->pattern && w[0] == '[' && (w[1] == 'U' || w[1] == '_')) { @@ -256,6 +268,20 @@ void mdstat_wait(int seconds) select(mdstat_fd >2 ? mdstat_fd+1:3, NULL, NULL, &fds, &tm); } +void mdstat_wait_fd(int fd, const sigset_t *sigmask) +{ + fd_set fds, rfds; + + FD_ZERO(&fds); + FD_ZERO(&rfds); + if (mdstat_fd >= 0) + FD_SET(mdstat_fd, &fds); + FD_SET(fd, &rfds); + + pselect(mdstat_fd >2 ? 
mdstat_fd+1:3, &rfds, NULL, &fds, + NULL, sigmask); +} + int mddev_busy(int devnum) { struct mdstat_ent *mdstat = mdstat_read(0, 0); diff --git a/monitor.c b/monitor.c new file mode 100644 index 00000000..7cce5a8b --- /dev/null +++ b/monitor.c @@ -0,0 +1,527 @@ + +#include "mdadm.h" +#include "mdmon.h" +#include <sys/syscall.h> +#include <sys/select.h> +#include <signal.h> + +static char *array_states[] = { + "clear", "inactive", "suspended", "readonly", "read-auto", + "clean", "active", "write-pending", "active-idle", NULL }; +static char *sync_actions[] = { + "idle", "reshape", "resync", "recover", "check", "repair", NULL +}; + +static int write_attr(char *attr, int fd) +{ + return write(fd, attr, strlen(attr)); +} + +static void add_fd(fd_set *fds, int *maxfd, int fd) +{ + if (fd < 0) + return; + if (fd > *maxfd) + *maxfd = fd; + FD_SET(fd, fds); +} + +static int read_attr(char *buf, int len, int fd) +{ + int n; + + if (fd < 0) { + buf[0] = 0; + return 0; + } + lseek(fd, 0, 0); + n = read(fd, buf, len - 1); + + if (n <= 0) { + buf[0] = 0; + return 0; + } + buf[n] = 0; + if (buf[n-1] == '\n') + buf[n-1] = 0; + return n; +} + + +int get_resync_start(struct active_array *a) +{ + char buf[30]; + int n; + + n = read_attr(buf, 30, a->resync_start_fd); + if (n <= 0) + return n; + + a->resync_start = strtoull(buf, NULL, 10); + + return 1; +} + +static int attr_match(const char *attr, const char *str) +{ + /* See if attr, read from a sysfs file, matches + * str. 
They must either be the same, or attr can + * have a trailing newline or comma + */ + while (*attr && *str && *attr == *str) { + attr++; + str++; + } + + if (*str || (*attr && *attr != ',' && *attr != '\n')) + return 0; + return 1; +} + +static int match_word(const char *word, char **list) +{ + int n; + for (n=0; list[n]; n++) + if (attr_match(word, list[n])) + break; + return n; +} + +static enum array_state read_state(int fd) +{ + char buf[20]; + int n = read_attr(buf, 20, fd); + + if (n <= 0) + return bad_word; + return (enum array_state) match_word(buf, array_states); +} + +static enum sync_action read_action( int fd) +{ + char buf[20]; + int n = read_attr(buf, 20, fd); + + if (n <= 0) + return bad_action; + return (enum sync_action) match_word(buf, sync_actions); +} + +int read_dev_state(int fd) +{ + char buf[60]; + int n = read_attr(buf, 60, fd); + char *cp; + int rv = 0; + + if (n <= 0) + return 0; + + cp = buf; + while (cp) { + if (attr_match(cp, "faulty")) + rv |= DS_FAULTY; + if (attr_match(cp, "in_sync")) + rv |= DS_INSYNC; + if (attr_match(cp, "write_mostly")) + rv |= DS_WRITE_MOSTLY; + if (attr_match(cp, "spare")) + rv |= DS_SPARE; + if (attr_match(cp, "blocked")) + rv |= DS_BLOCKED; + cp = strchr(cp, ','); + if (cp) + cp++; + } + return rv; +} + +static void signal_manager(void) +{ + /* tgkill(getpid(), mon_tid, SIGUSR1); */ + int pid = getpid(); + syscall(SYS_tgkill, pid, mgr_tid, SIGUSR1); +} + +/* Monitor a set of active md arrays - all of which share the + * same metadata - and respond to events that require + * metadata update. + * + * New arrays are detected by another thread which allocates + * required memory and attaches the data structure to our list. + * + * Events: + * Array stops. + * This is detected by array_state going to 'clear' or 'inactive'. + * while we thought it was active. + * Response is to mark metadata as clean and 'clear' the array(??) 
+ * write-pending + * array_state is 'write-pending' + * We mark metadata as 'dirty' then set array to 'active'. + * active_idle + * Either ignore, or mark clean, then mark metadata as clean. + * + * device fails + * detected by rd-N/state reporting "faulty" + * mark device as 'failed' in metadata, let the kernel release the + * device by writing '-blocked' to rd/state, and finally write 'remove' to + * rd/state. Before a disk can be replaced it must be failed and removed + * from all container members, this will be preemptive for the other + * arrays... safe? + * + * sync completes + * sync_action was 'resync' and becomes 'idle' and resync_start becomes + * MaxSector + * Notify metadata that sync is complete. + * + * recovery completes + * sync_action changes from 'recover' to 'idle' + * Check each device state and mark metadata if 'faulty' or 'in_sync'. + * + * deal with resync + * This only happens on finding a new array... mdadm will have set + * 'resync_start' to the correct value. If 'resync_start' indicates that a + * resync needs to occur, set the array to the 'active' state rather than the + * initial read-auto state. + * + * + * + * We wait for a change (poll/select) on array_state, sync_action, and + * each rd-X/state file. + * When we get any change, we check everything. So read each state file, + * then decide what to do. + * + * The core action is to write new metadata to all devices in the array. + * This is done at most once on any wakeup. + * After that we might: + * - update the array_state + * - set the role of some devices. 
+ * - request a sync_action + * + */ + +static int read_and_act(struct active_array *a) +{ + int check_degraded = 0; + int deactivate = 0; + struct mdinfo *mdi; + + a->next_state = bad_word; + a->next_action = bad_action; + + a->curr_state = read_state(a->info.state_fd); + a->curr_action = read_action(a->action_fd); + for (mdi = a->info.devs; mdi ; mdi = mdi->next) { + mdi->next_state = 0; + if (mdi->state_fd >= 0) + mdi->curr_state = read_dev_state(mdi->state_fd); + } + + if (a->curr_state <= inactive && + a->prev_state > inactive) { + /* array has been stopped */ + get_resync_start(a); + a->container->ss->set_array_state(a, 1); + a->next_state = clear; + deactivate = 1; + } + if (a->curr_state == write_pending) { + get_resync_start(a); + a->container->ss->set_array_state(a, 0); + a->next_state = active; + } + if (a->curr_state == active_idle) { + /* Set array to 'clean' FIRST, then + * a->ss->mark_clean(a, ~0ULL); + * just ignore for now. + */ + } + + if (a->curr_state == readonly) { + /* Well, I'm ready to handle things, so + * read-auto is OK. FIXME what if we really want + * readonly ??? + */ + get_resync_start(a); +// printf("Found a readonly array at %llu\n", a->resync_start); + if (a->resync_start == ~0ULL) + a->next_state = read_auto; /* array is clean */ + else { + a->container->ss->set_array_state(a, 0); + a->next_state = active; + } + } + + if (!deactivate && + a->curr_action == idle && + a->prev_action == resync) { + /* A resync has finished. The endpoint is recorded in + * 'sync_start'. We don't update the metadata + * until the array goes inactive or readonly though. + * Just check if we need to fiddle spares. + */ + get_resync_start(a); + a->container->ss->set_array_state(a, 0); + check_degraded = 1; + } + + if (!deactivate && + a->curr_action == idle && + a->prev_action == recover) { + /* A recovery has finished. 
Some disks may be in sync now, + * and the array may no longer be degraded + */ + for (mdi = a->info.devs ; mdi ; mdi = mdi->next) { + a->container->ss->set_disk(a, mdi->disk.raid_disk, + mdi->curr_state); + if (! (mdi->curr_state & DS_INSYNC)) + check_degraded = 1; + } + } + + for (mdi = a->info.devs ; mdi ; mdi = mdi->next) { + if (mdi->curr_state & DS_FAULTY) { + a->container->ss->set_disk(a, mdi->disk.raid_disk, + mdi->curr_state); + check_degraded = 1; + mdi->next_state = DS_REMOVE; + } + } + + a->container->ss->sync_metadata(a->container); + dprintf("%s: update[%d]: (", __func__, a->info.container_member); + + /* Effect state changes in the array */ + if (a->next_state != bad_word) { + dprintf(" state:%s", array_states[a->next_state]); + write_attr(array_states[a->next_state], a->info.state_fd); + } + if (a->next_action != bad_action) { + write_attr(sync_actions[a->next_action], a->action_fd); + dprintf(" action:%s", array_states[a->next_state]); + } + for (mdi = a->info.devs; mdi ; mdi = mdi->next) { + if (mdi->next_state == DS_REMOVE && mdi->state_fd >= 0) { + int remove_result; + + write_attr("-blocked", mdi->state_fd); + /* the kernel may not be able to immediately remove the + * disk, we can simply wait until the next event to try + * again. 
+ */ + dprintf(" %d:-blocked", mdi->disk.raid_disk); + remove_result = write_attr("remove", mdi->state_fd); + if (remove_result > 0) { + dprintf(" %d:removed", mdi->disk.raid_disk); + close(mdi->state_fd); + mdi->state_fd = -1; + } + } + if (mdi->next_state & DS_INSYNC) { + write_attr("+in_sync", mdi->state_fd); + dprintf(" %d:+in_sync", mdi->disk.raid_disk); + } + } + dprintf(" )\n"); + + /* move curr_ to prev_ */ + a->prev_state = a->curr_state; + + a->prev_action = a->curr_action; + + for (mdi = a->info.devs; mdi ; mdi = mdi->next) { + mdi->prev_state = mdi->curr_state; + mdi->next_state = 0; + } + + if (check_degraded) { + /* manager will do the actual check */ + a->check_degraded = 1; + signal_manager(); + } + + if (deactivate) + a->container = NULL; + + return 1; +} + +static struct mdinfo * +find_device(struct active_array *a, int major, int minor) +{ + struct mdinfo *mdi; + + for (mdi = a->info.devs ; mdi ; mdi = mdi->next) + if (mdi->disk.major == major && mdi->disk.minor == minor) + return mdi; + + return NULL; +} + +static void reconcile_failed(struct active_array *aa, struct mdinfo *failed) +{ + struct active_array *a; + struct mdinfo *victim; + + for (a = aa; a; a = a->next) { + if (!a->container) + continue; + victim = find_device(a, failed->disk.major, failed->disk.minor); + if (!victim) + continue; + + if (!(victim->curr_state & DS_FAULTY)) + write_attr("faulty", victim->state_fd); + } +} + +#ifdef DEBUG +static void dprint_wake_reasons(fd_set *fds) +{ + int i; + char proc_path[256]; + char link[256]; + char *basename; + int rv; + + fprintf(stderr, "monitor: wake ( "); + for (i = 0; i < FD_SETSIZE; i++) { + if (FD_ISSET(i, fds)) { + sprintf(proc_path, "/proc/%d/fd/%d", + (int) getpid(), i); + + rv = readlink(proc_path, link, sizeof(link) - 1); + if (rv < 0) { + fprintf(stderr, "%d:unknown ", i); + continue; + } + link[rv] = '\0'; + basename = strrchr(link, '/'); + fprintf(stderr, "%d:%s ", + i, basename ? 
++basename : link); + } + } + fprintf(stderr, ")\n"); +} +#endif + +int monitor_loop_cnt; + +static int wait_and_act(struct supertype *container, int nowait) +{ + fd_set rfds; + int maxfd = 0; + struct active_array **aap = &container->arrays; + struct active_array *a, **ap; + int rv; + struct mdinfo *mdi; + + FD_ZERO(&rfds); + + for (ap = aap ; *ap ;) { + a = *ap; + /* once an array has been deactivated we want to + * ask the manager to discard it. + */ + if (!a->container) { + if (discard_this) { + ap = &(*ap)->next; + continue; + } + *ap = a->next; + a->next = NULL; + discard_this = a; + signal_manager(); + continue; + } + + add_fd(&rfds, &maxfd, a->info.state_fd); + add_fd(&rfds, &maxfd, a->action_fd); + for (mdi = a->info.devs ; mdi ; mdi = mdi->next) + add_fd(&rfds, &maxfd, mdi->state_fd); + + ap = &(*ap)->next; + } + + if (manager_ready && *aap == NULL) { + /* No interesting arrays. Lets see about exiting. + * Note that blocking at this point is not a problem + * as there are no active arrays, there is nothing that + * we need to be ready to do. + */ + int fd = open(container->device_name, O_RDONLY|O_EXCL); + if (fd >= 0 || errno != EBUSY) { + /* OK, we are safe to leave */ + dprintf("no arrays to monitor... 
exiting\n"); + remove_pidfile(container->devname); + exit_now = 1; + signal_manager(); + exit(0); + } + } + + if (!nowait) { + sigset_t set; + sigprocmask(SIG_UNBLOCK, NULL, &set); + sigdelset(&set, SIGUSR1); + monitor_loop_cnt |= 1; + rv = pselect(maxfd+1, &rfds, NULL, NULL, NULL, &set); + monitor_loop_cnt += 1; + if (rv == -1 && errno == EINTR) + rv = 0; + #ifdef DEBUG + dprint_wake_reasons(&rfds); + #endif + + } + + if (update_queue) { + struct metadata_update *this; + + for (this = update_queue; this ; this = this->next) + container->ss->process_update(container, this); + + update_queue_handled = update_queue; + update_queue = NULL; + signal_manager(); + container->ss->sync_metadata(container); + } + + for (a = *aap; a ; a = a->next) { + if (a->replaces && !discard_this) { + struct active_array **ap; + for (ap = &a->next; *ap && *ap != a->replaces; + ap = & (*ap)->next) + ; + if (*ap) + *ap = (*ap)->next; + discard_this = a->replaces; + a->replaces = NULL; + /* FIXME check if device->state_fd need to be cleared?*/ + signal_manager(); + } + if (a->container) + rv += read_and_act(a); + } + + /* propagate failures across container members */ + for (a = *aap; a ; a = a->next) { + if (!a->container) + continue; + for (mdi = a->info.devs ; mdi ; mdi = mdi->next) + if (mdi->curr_state & DS_FAULTY) + reconcile_failed(*aap, mdi); + } + + return rv; +} + +void do_monitor(struct supertype *container) +{ + int rv; + int first = 1; + do { + rv = wait_and_act(container, first); + first = 0; + } while (rv >= 0); +} @@ -0,0 +1,185 @@ +/* + * Copyright (C) 2008 Intel Corporation + * + * mdmon socket / message handling + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. + */ +#ifndef _GNU_SOURCE +#define _GNU_SOURCE +#endif +#include <unistd.h> +#include <stdio.h> +#include <string.h> +#include <stdlib.h> +#include <errno.h> +#include <fcntl.h> +#include <sys/types.h> +#include <sys/socket.h> +#include <sys/un.h> +#include "mdadm.h" +#include "mdmon.h" + +static const __u32 start_magic = 0x5a5aa5a5; +static const __u32 end_magic = 0xa5a55a5a; + +static int send_buf(int fd, const void* buf, int len, int tmo) +{ + fd_set set; + int rv; + struct timeval timeout = {tmo, 0}; + struct timeval *ptmo = tmo ? &timeout : NULL; + + while (len) { + FD_ZERO(&set); + FD_SET(fd, &set); + rv = select(fd+1, NULL, &set, NULL, ptmo); + if (rv <= 0) + return -1; + rv = write(fd, buf, len); + if (rv <= 0) + return -1; + len -= rv; + buf += rv; + } + return 0; +} + +static int recv_buf(int fd, void* buf, int len, int tmo) +{ + fd_set set; + int rv; + struct timeval timeout = {tmo, 0}; + struct timeval *ptmo = tmo ? 
&timeout : NULL; + + while (len) { + FD_ZERO(&set); + FD_SET(fd, &set); + rv = select(fd+1, &set, NULL, NULL, ptmo); + if (rv <= 0) + return -1; + rv = read(fd, buf, len); + if (rv <= 0) + return -1; + len -= rv; + buf += rv; + } + return 0; +} + + +int send_message(int fd, struct metadata_update *msg, int tmo) +{ + __u32 len = msg->len; + int rv; + + rv = send_buf(fd, &start_magic, 4, tmo); + rv = rv ?: send_buf(fd, &len, 4, tmo); + if (len) + rv = rv ?: send_buf(fd, msg->buf, msg->len, tmo); + rv = send_buf(fd, &end_magic, 4, tmo); + + return rv; +} + +int receive_message(int fd, struct metadata_update *msg, int tmo) +{ + __u32 magic; + __u32 len; + int rv; + + rv = recv_buf(fd, &magic, 4, tmo); + if (rv < 0 || magic != start_magic) + return -1; + rv = recv_buf(fd, &len, 4, tmo); + if (rv < 0 || len > MSG_MAX_LEN) + return -1; + if (len) { + msg->buf = malloc(len); + if (msg->buf == NULL) + return -1; + rv = recv_buf(fd, msg->buf, len, tmo); + if (rv < 0) { + free(msg->buf); + return -1; + } + } else + msg->buf = NULL; + rv = recv_buf(fd, &magic, 4, tmo); + if (rv < 0 || magic != end_magic) { + free(msg->buf); + return -1; + } + msg->len = len; + return 0; +} + +int ack(int fd, int tmo) +{ + struct metadata_update msg = { .len = 0 }; + + return send_message(fd, &msg, tmo); +} + +int wait_reply(int fd, int tmo) +{ + struct metadata_update msg; + return receive_message(fd, &msg, tmo); +} + +int connect_monitor(char *devname) +{ + char path[100]; + int sfd; + long fl; + struct sockaddr_un addr; + + sprintf(path, "/var/run/mdadm/%s.sock", devname); + sfd = socket(PF_LOCAL, SOCK_STREAM, 0); + if (sfd < 0) + return -1; + + addr.sun_family = PF_LOCAL; + strcpy(addr.sun_path, path); + if (connect(sfd, &addr, sizeof(addr)) < 0) { + close(sfd); + return -1; + } + + fl = fcntl(sfd, F_GETFL, 0); + fl |= O_NONBLOCK; + fcntl(sfd, F_SETFL, fl); + + return sfd; +} + +int ping_monitor(char *devname) +{ + int sfd = connect_monitor(devname); + int err = 0; + + if (sfd < 0) + return 
sfd; + + /* try to ping existing socket */ + if (ack(sfd, 20) != 0) + err = -1; + + /* check the reply */ + if (!err && wait_reply(sfd, 20) != 0) + err = -1; + + close(sfd); + return err; +} @@ -0,0 +1,31 @@ +/* + * Copyright (C) 2008 Intel Corporation + * + * mdmon socket / message handling + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. + */ + + +struct mdinfo; +struct metadata_update; + +extern int receive_message(int fd, struct metadata_update *msg, int tmo); +extern int send_message(int fd, struct metadata_update *msg, int tmo); +extern int ack(int fd, int tmo); +extern int wait_reply(int fd, int tmo); +extern int connect_monitor(char *devname); +extern int ping_monitor(char *devname); + +#define MSG_MAX_LEN (4*1024*1024) diff --git a/sg_io.c b/sg_io.c new file mode 100644 index 00000000..4ae5d927 --- /dev/null +++ b/sg_io.c @@ -0,0 +1,42 @@ +/* + * Copyright (C) 2007 Intel Corporation + * + * Retrieve drive serial numbers for scsi disks + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. + */ +#include <string.h> +#include <scsi/scsi.h> +#include <scsi/sg.h> +#include <sys/ioctl.h> + +int scsi_get_serial(int fd, void *buf, size_t buf_len) +{ + unsigned char inq_cmd[] = {INQUIRY, 1, 0x80, 0, buf_len, 0}; + unsigned char sense[32]; + struct sg_io_hdr io_hdr; + + memset(&io_hdr, 0, sizeof(io_hdr)); + io_hdr.interface_id = 'S'; + io_hdr.cmdp = inq_cmd; + io_hdr.cmd_len = sizeof(inq_cmd); + io_hdr.dxferp = buf; + io_hdr.dxfer_len = buf_len; + io_hdr.dxfer_direction = SG_DXFER_FROM_DEV; + io_hdr.sbp = sense; + io_hdr.mx_sb_len = sizeof(sense); + io_hdr.timeout = 5000; + + return ioctl(fd, SG_IO, &io_hdr); +} diff --git a/super-ddf.c b/super-ddf.c new file mode 100644 index 00000000..5d387504 --- /dev/null +++ b/super-ddf.c @@ -0,0 +1,3227 @@ +/* + * mdadm - manage Linux "md" devices aka RAID arrays. + * + * Copyright (C) 2006-2007 Neil Brown <neilb@suse.de> + * + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Author: Neil Brown + * Email: <neil@brown.name> + * + * Specifications for DDF taken from Common RAID DDF Specification Revision 1.2 + * (July 28 2006). Reused by permission of SNIA. + */ + +#define HAVE_STDINT_H 1 +#include "mdadm.h" +#include "mdmon.h" +#include "sha1.h" +#include <values.h> + +/* a non-official T10 name for creation GUIDs */ +static char T10[] = "Linux-MD"; + +/* DDF timestamps are 1980 based, so we need to add + * second-in-decade-of-seventies to convert to linux timestamps. + * 10 years with 2 leap years. + */ +#define DECADE (3600*24*(365*10+2)) +unsigned long crc32( + unsigned long crc, + const unsigned char *buf, + unsigned len); + +/* The DDF metadata handling. + * DDF metadata lives at the end of the device. + * The last 512 byte block provides an 'anchor' which is used to locate + * the rest of the metadata which usually lives immediately behind the anchor. + * + * Note: + * - all multibyte numeric fields are bigendian. + * - all strings are space padded. 
+ * + */ + +/* Primary Raid Level (PRL) */ +#define DDF_RAID0 0x00 +#define DDF_RAID1 0x01 +#define DDF_RAID3 0x03 +#define DDF_RAID4 0x04 +#define DDF_RAID5 0x05 +#define DDF_RAID1E 0x11 +#define DDF_JBOD 0x0f +#define DDF_CONCAT 0x1f +#define DDF_RAID5E 0x15 +#define DDF_RAID5EE 0x25 +#define DDF_RAID6 0x06 + +/* Raid Level Qualifier (RLQ) */ +#define DDF_RAID0_SIMPLE 0x00 +#define DDF_RAID1_SIMPLE 0x00 /* just 2 devices in this plex */ +#define DDF_RAID1_MULTI 0x01 /* exactly 3 devices in this plex */ +#define DDF_RAID3_0 0x00 /* parity in first extent */ +#define DDF_RAID3_N 0x01 /* parity in last extent */ +#define DDF_RAID4_0 0x00 /* parity in first extent */ +#define DDF_RAID4_N 0x01 /* parity in last extent */ +/* these apply to raid5e and raid5ee as well */ +#define DDF_RAID5_0_RESTART 0x00 /* same as 'right asymmetric' - layout 1 */ +#define DDF_RAID6_0_RESTART 0x01 /* raid6 different from raid5 here!!! */ +#define DDF_RAID5_N_RESTART 0x02 /* same as 'left asymmetric' - layout 0 */ +#define DDF_RAID5_N_CONTINUE 0x03 /* same as 'left symmetric' - layout 2 */ + +#define DDF_RAID1E_ADJACENT 0x00 /* raid10 nearcopies==2 */ +#define DDF_RAID1E_OFFSET 0x01 /* raid10 offsetcopies==2 */ + +/* Secondary RAID Level (SRL) */ +#define DDF_2STRIPED 0x00 /* This is weirder than RAID0 !! 
*/ +#define DDF_2MIRRORED 0x01 +#define DDF_2CONCAT 0x02 +#define DDF_2SPANNED 0x03 /* This is also weird - be careful */ + +/* Magic numbers */ +#define DDF_HEADER_MAGIC __cpu_to_be32(0xDE11DE11) +#define DDF_CONTROLLER_MAGIC __cpu_to_be32(0xAD111111) +#define DDF_PHYS_RECORDS_MAGIC __cpu_to_be32(0x22222222) +#define DDF_PHYS_DATA_MAGIC __cpu_to_be32(0x33333333) +#define DDF_VIRT_RECORDS_MAGIC __cpu_to_be32(0xDDDDDDDD) +#define DDF_VD_CONF_MAGIC __cpu_to_be32(0xEEEEEEEE) +#define DDF_SPARE_ASSIGN_MAGIC __cpu_to_be32(0x55555555) +#define DDF_VU_CONF_MAGIC __cpu_to_be32(0x88888888) +#define DDF_VENDOR_LOG_MAGIC __cpu_to_be32(0x01dBEEF0) +#define DDF_BBM_LOG_MAGIC __cpu_to_be32(0xABADB10C) + +#define DDF_GUID_LEN 24 +#define DDF_REVISION_0 "01.00.00" +#define DDF_REVISION_2 "01.02.00" + +struct ddf_header { + __u32 magic; /* DDF_HEADER_MAGIC */ + __u32 crc; + char guid[DDF_GUID_LEN]; + char revision[8]; /* 01.02.00 */ + __u32 seq; /* starts at '1' */ + __u32 timestamp; + __u8 openflag; + __u8 foreignflag; + __u8 enforcegroups; + __u8 pad0; /* 0xff */ + __u8 pad1[12]; /* 12 * 0xff */ + /* 64 bytes so far */ + __u8 header_ext[32]; /* reserved: fill with 0xff */ + __u64 primary_lba; + __u64 secondary_lba; + __u8 type; + __u8 pad2[3]; /* 0xff */ + __u32 workspace_len; /* sectors for vendor space - + * at least 32768(sectors) */ + __u64 workspace_lba; + __u16 max_pd_entries; /* one of 15, 63, 255, 1023, 4095 */ + __u16 max_vd_entries; /* 2^(4,6,8,10,12)-1 : i.e. as above */ + __u16 max_partitions; /* i.e. 
max num of configuration + record entries per disk */ + __u16 config_record_len; /* 1 +ROUNDUP(max_primary_element_entries + *12/512) */ + __u16 max_primary_element_entries; /* 16, 64, 256, 1024, or 4096 */ + __u8 pad3[54]; /* 0xff */ + /* 192 bytes so far */ + __u32 controller_section_offset; + __u32 controller_section_length; + __u32 phys_section_offset; + __u32 phys_section_length; + __u32 virt_section_offset; + __u32 virt_section_length; + __u32 config_section_offset; + __u32 config_section_length; + __u32 data_section_offset; + __u32 data_section_length; + __u32 bbm_section_offset; + __u32 bbm_section_length; + __u32 diag_space_offset; + __u32 diag_space_length; + __u32 vendor_offset; + __u32 vendor_length; + /* 256 bytes so far */ + __u8 pad4[256]; /* 0xff */ +}; + +/* type field */ +#define DDF_HEADER_ANCHOR 0x00 +#define DDF_HEADER_PRIMARY 0x01 +#define DDF_HEADER_SECONDARY 0x02 + +/* The content of the 'controller section' - global scope */ +struct ddf_controller_data { + __u32 magic; /* DDF_CONTROLLER_MAGIC */ + __u32 crc; + char guid[DDF_GUID_LEN]; + struct controller_type { + __u16 vendor_id; + __u16 device_id; + __u16 sub_vendor_id; + __u16 sub_device_id; + } type; + char product_id[16]; + __u8 pad[8]; /* 0xff */ + __u8 vendor_data[448]; +}; + +/* The content of phys_section - global scope */ +struct phys_disk { + __u32 magic; /* DDF_PHYS_RECORDS_MAGIC */ + __u32 crc; + __u16 used_pdes; + __u16 max_pdes; + __u8 pad[52]; + struct phys_disk_entry { + char guid[DDF_GUID_LEN]; + __u32 refnum; + __u16 type; + __u16 state; + __u64 config_size; /* DDF structures must be after here */ + char path[18]; /* another horrible structure really */ + __u8 pad[6]; + } entries[0]; +}; + +/* phys_disk_entry.type is a bitmap - bigendian remember */ +#define DDF_Forced_PD_GUID 1 +#define DDF_Active_in_VD 2 +#define DDF_Global_Spare 4 /* VD_CONF records are ignored */ +#define DDF_Spare 8 /* overrides Global_spare */ +#define DDF_Foreign 16 +#define DDF_Legacy 32 /* no DDF 
on this device */ + +#define DDF_Interface_mask 0xf00 +#define DDF_Interface_SCSI 0x100 +#define DDF_Interface_SAS 0x200 +#define DDF_Interface_SATA 0x300 +#define DDF_Interface_FC 0x400 + +/* phys_disk_entry.state is a bigendian bitmap */ +#define DDF_Online 1 +#define DDF_Failed 2 /* overrides 1,4,8 */ +#define DDF_Rebuilding 4 +#define DDF_Transition 8 +#define DDF_SMART 16 +#define DDF_ReadErrors 32 +#define DDF_Missing 64 + +/* The content of the virt_section global scope */ +struct virtual_disk { + __u32 magic; /* DDF_VIRT_RECORDS_MAGIC */ + __u32 crc; + __u16 populated_vdes; + __u16 max_vdes; + __u8 pad[52]; + struct virtual_entry { + char guid[DDF_GUID_LEN]; + __u16 unit; + __u16 pad0; /* 0xffff */ + __u16 guid_crc; + __u16 type; + __u8 state; + __u8 init_state; + __u8 pad1[14]; + char name[16]; + } entries[0]; +}; + +/* virtual_entry.type is a bitmap - bigendian */ +#define DDF_Shared 1 +#define DDF_Enforce_Groups 2 +#define DDF_Unicode 4 +#define DDF_Owner_Valid 8 + +/* virtual_entry.state is a bigendian bitmap */ +#define DDF_state_mask 0x7 +#define DDF_state_optimal 0x0 +#define DDF_state_degraded 0x1 +#define DDF_state_deleted 0x2 +#define DDF_state_missing 0x3 +#define DDF_state_failed 0x4 +#define DDF_state_part_optimal 0x5 + +#define DDF_state_morphing 0x8 +#define DDF_state_inconsistent 0x10 + +/* virtual_entry.init_state is a bigendian bitmap */ +#define DDF_initstate_mask 0x03 +#define DDF_init_not 0x00 +#define DDF_init_quick 0x01 /* initialisation is progress. + * i.e. 
'state_inconsistent' */ +#define DDF_init_full 0x02 + +#define DDF_access_mask 0xc0 +#define DDF_access_rw 0x00 +#define DDF_access_ro 0x80 +#define DDF_access_blocked 0xc0 + +/* The content of the config_section - local scope + * It has multiple records each config_record_len sectors + * They can be vd_config or spare_assign + */ + +struct vd_config { + __u32 magic; /* DDF_VD_CONF_MAGIC */ + __u32 crc; + char guid[DDF_GUID_LEN]; + __u32 timestamp; + __u32 seqnum; + __u8 pad0[24]; + __u16 prim_elmnt_count; + __u8 chunk_shift; /* 0 == 512, 1==1024 etc */ + __u8 prl; + __u8 rlq; + __u8 sec_elmnt_count; + __u8 sec_elmnt_seq; + __u8 srl; + __u64 blocks; /* blocks per component could be different + * on different component devices...(only + * for concat I hope) */ + __u64 array_blocks; /* blocks in array */ + __u8 pad1[8]; + __u32 spare_refs[8]; + __u8 cache_pol[8]; + __u8 bg_rate; + __u8 pad2[3]; + __u8 pad3[52]; + __u8 pad4[192]; + __u8 v0[32]; /* reserved- 0xff */ + __u8 v1[32]; /* reserved- 0xff */ + __u8 v2[16]; /* reserved- 0xff */ + __u8 v3[16]; /* reserved- 0xff */ + __u8 vendor[32]; + __u32 phys_refnum[0]; /* refnum of each disk in sequence */ + /*__u64 lba_offset[0]; LBA offset in each phys. 
Note extents in a + bvd are always the same size */ +}; + +/* vd_config.cache_pol[7] is a bitmap */ +#define DDF_cache_writeback 1 /* else writethrough */ +#define DDF_cache_wadaptive 2 /* only applies if writeback */ +#define DDF_cache_readahead 4 +#define DDF_cache_radaptive 8 /* only if doing read-ahead */ +#define DDF_cache_ifnobatt 16 /* even to write cache if battery is poor */ +#define DDF_cache_wallowed 32 /* enable write caching */ +#define DDF_cache_rallowed 64 /* enable read caching */ + +struct spare_assign { + __u32 magic; /* DDF_SPARE_ASSIGN_MAGIC */ + __u32 crc; + __u32 timestamp; + __u8 reserved[7]; + __u8 type; + __u16 populated; /* SAEs used */ + __u16 max; /* max SAEs */ + __u8 pad[8]; + struct spare_assign_entry { + char guid[DDF_GUID_LEN]; + __u16 secondary_element; + __u8 pad[6]; + } spare_ents[0]; +}; +/* spare_assign.type is a bitmap */ +#define DDF_spare_dedicated 0x1 /* else global */ +#define DDF_spare_revertible 0x2 /* else committable */ +#define DDF_spare_active 0x4 /* else not active */ +#define DDF_spare_affinity 0x8 /* enclosure affinity */ + +/* The data_section contents - local scope */ +struct disk_data { + __u32 magic; /* DDF_PHYS_DATA_MAGIC */ + __u32 crc; + char guid[DDF_GUID_LEN]; + __u32 refnum; /* crc of some magic drive data ... */ + __u8 forced_ref; /* set when above was not result of magic */ + __u8 forced_guid; /* set if guid was forced rather than magic */ + __u8 vendor[32]; + __u8 pad[442]; +}; + +/* bbm_section content */ +struct bad_block_log { + __u32 magic; + __u32 crc; + __u16 entry_count; + __u32 spare_count; + __u8 pad[10]; + __u64 first_spare; + struct mapped_block { + __u64 defective_start; + __u32 replacement_start; + __u16 remap_count; + __u8 pad[2]; + } entries[0]; +}; + +/* Struct for internally holding ddf structures */ +/* The DDF structure stored on each device is potentially + * quite different, as some data is global and some is local. 
+ * The global data is: + * - ddf header + * - controller_data + * - Physical disk records + * - Virtual disk records + * The local data is: + * - Configuration records + * - Physical Disk data section + * ( and Bad block and vendor which I don't care about yet). + * + * The local data is parsed into separate lists as it is read + * and reconstructed for writing. This means that we only need + * to make config changes once and they are automatically + * propagated to all devices. + * Note that the ddf_super has space of the conf and disk data + * for this disk and also for a list of all such data. + * The list is only used for the superblock that is being + * built in Create or Assemble to describe the whole array. + */ +struct ddf_super { + struct ddf_header anchor, primary, secondary; + struct ddf_controller_data controller; + struct ddf_header *active; + struct phys_disk *phys; + struct virtual_disk *virt; + int pdsize, vdsize; + int max_part, mppe, conf_rec_len; + int currentdev; + int updates_pending; + struct vcl { + union { + char space[512]; + struct { + struct vcl *next; + __u64 *lba_offset; /* location in 'conf' of + * the lba table */ + int vcnum; /* index into ->virt */ + __u64 *block_sizes; /* NULL if all the same */ + }; + }; + struct vd_config conf; + } *conflist, *currentconf; + struct dl { + union { + char space[512]; + struct { + struct dl *next; + int major, minor; + char *devname; + int fd; + unsigned long long size; /* sectors */ + int pdnum; /* index in ->phys */ + struct spare_assign *spare; + }; + }; + struct disk_data disk; + struct vcl *vlist[0]; /* max_part in size */ + } *dlist; +}; + +#ifndef offsetof +#define offsetof(t,f) ((size_t)&(((t*)0)->f)) +#endif + + +static int calc_crc(void *buf, int len) +{ + /* crcs are always at the same place as in the ddf_header */ + struct ddf_header *ddf = buf; + __u32 oldcrc = ddf->crc; + __u32 newcrc; + ddf->crc = 0xffffffff; + + newcrc = crc32(0, buf, len); + ddf->crc = oldcrc; + return newcrc; +} + 
+static int load_ddf_header(int fd, unsigned long long lba, + unsigned long long size, + int type, + struct ddf_header *hdr, struct ddf_header *anchor) +{ + /* read a ddf header (primary or secondary) from fd/lba + * and check that it is consistent with anchor + * Need to check: + * magic, crc, guid, rev, and LBA's header_type, and + * everything after header_type must be the same + */ + if (lba >= size-1) + return 0; + + if (lseek64(fd, lba<<9, 0) < 0) + return 0; + + if (read(fd, hdr, 512) != 512) + return 0; + + if (hdr->magic != DDF_HEADER_MAGIC) + return 0; + if (calc_crc(hdr, 512) != hdr->crc) + return 0; + if (memcmp(anchor->guid, hdr->guid, DDF_GUID_LEN) != 0 || + memcmp(anchor->revision, hdr->revision, 8) != 0 || + anchor->primary_lba != hdr->primary_lba || + anchor->secondary_lba != hdr->secondary_lba || + hdr->type != type || + memcmp(anchor->pad2, hdr->pad2, 512 - + offsetof(struct ddf_header, pad2)) != 0) + return 0; + + /* Looks good enough to me... */ + return 1; +} + +static void *load_section(int fd, struct ddf_super *super, void *buf, + __u32 offset_be, __u32 len_be, int check) +{ + unsigned long long offset = __be32_to_cpu(offset_be); + unsigned long long len = __be32_to_cpu(len_be); + int dofree = (buf == NULL); + + if (check) + if (len != 2 && len != 8 && len != 32 + && len != 128 && len != 512) + return NULL; + + if (len > 1024) + return NULL; + if (buf) { + /* All pre-allocated sections are a single block */ + if (len != 1) + return NULL; + } else { + posix_memalign(&buf, 512, len<<9); + } + + if (!buf) + return NULL; + + if (super->active->type == 1) + offset += __be64_to_cpu(super->active->primary_lba); + else + offset += __be64_to_cpu(super->active->secondary_lba); + + if (lseek64(fd, offset<<9, 0) != (offset<<9)) { + if (dofree) + free(buf); + return NULL; + } + if (read(fd, buf, len<<9) != (len<<9)) { + if (dofree) + free(buf); + return NULL; + } + return buf; +} + +static int load_ddf_headers(int fd, struct ddf_super *super, char 
*devname) +{ + unsigned long long dsize; + + get_dev_size(fd, NULL, &dsize); + + if (lseek64(fd, dsize-512, 0) < 0) { + if (devname) + fprintf(stderr, + Name": Cannot seek to anchor block on %s: %s\n", + devname, strerror(errno)); + return 1; + } + if (read(fd, &super->anchor, 512) != 512) { + if (devname) + fprintf(stderr, + Name ": Cannot read anchor block on %s: %s\n", + devname, strerror(errno)); + return 1; + } + if (super->anchor.magic != DDF_HEADER_MAGIC) { + if (devname) + fprintf(stderr, Name ": no DDF anchor found on %s\n", + devname); + return 2; + } + if (calc_crc(&super->anchor, 512) != super->anchor.crc) { + if (devname) + fprintf(stderr, Name ": bad CRC on anchor on %s\n", + devname); + return 2; + } + if (memcmp(super->anchor.revision, DDF_REVISION_0, 8) != 0 && + memcmp(super->anchor.revision, DDF_REVISION_2, 8) != 0) { + if (devname) + fprintf(stderr, Name ": can only support super revision" + " %.8s and earlier, not %.8s on %s\n", + DDF_REVISION_2, super->anchor.revision,devname); + return 2; + } + if (load_ddf_header(fd, __be64_to_cpu(super->anchor.primary_lba), + dsize >> 9, 1, + &super->primary, &super->anchor) == 0) { + if (devname) + fprintf(stderr, + Name ": Failed to load primary DDF header " + "on %s\n", devname); + return 2; + } + super->active = &super->primary; + if (load_ddf_header(fd, __be64_to_cpu(super->anchor.secondary_lba), + dsize >> 9, 2, + &super->secondary, &super->anchor)) { + if ((__be32_to_cpu(super->primary.seq) + < __be32_to_cpu(super->secondary.seq) && + !super->secondary.openflag) + || (__be32_to_cpu(super->primary.seq) + == __be32_to_cpu(super->secondary.seq) && + super->primary.openflag && !super->secondary.openflag) + ) + super->active = &super->secondary; + } + return 0; +} + +static int load_ddf_global(int fd, struct ddf_super *super, char *devname) +{ + void *ok; + ok = load_section(fd, super, &super->controller, + super->active->controller_section_offset, + super->active->controller_section_length, + 0); + 
super->phys = load_section(fd, super, NULL, + super->active->phys_section_offset, + super->active->phys_section_length, + 1); + super->pdsize = __be32_to_cpu(super->active->phys_section_length) * 512; + + super->virt = load_section(fd, super, NULL, + super->active->virt_section_offset, + super->active->virt_section_length, + 1); + super->vdsize = __be32_to_cpu(super->active->virt_section_length) * 512; + if (!ok || + !super->phys || + !super->virt) { + free(super->phys); + free(super->virt); + super->phys = NULL; + super->virt = NULL; + return 2; + } + super->conflist = NULL; + super->dlist = NULL; + + super->max_part = __be16_to_cpu(super->active->max_partitions); + super->mppe = __be16_to_cpu(super->active->max_primary_element_entries); + super->conf_rec_len = __be16_to_cpu(super->active->config_record_len); + return 0; +} + +static int load_ddf_local(int fd, struct ddf_super *super, + char *devname, int keep) +{ + struct dl *dl; + struct stat stb; + char *conf; + int i; + int vnum; + int max_virt_disks = __be16_to_cpu(super->active->max_vd_entries); + unsigned long long dsize; + + /* First the local disk info */ + posix_memalign((void**)&dl, 512, + sizeof(*dl) + + (super->max_part) * sizeof(dl->vlist[0])); + + load_section(fd, super, &dl->disk, + super->active->data_section_offset, + super->active->data_section_length, + 0); + dl->devname = devname ? strdup(devname) : NULL; + + fstat(fd, &stb); + dl->major = major(stb.st_rdev); + dl->minor = minor(stb.st_rdev); + dl->next = super->dlist; + dl->fd = keep ? fd : -1; + + dl->size = 0; + if (get_dev_size(fd, devname, &dsize)) + dl->size = dsize >> 9; + dl->spare = NULL; + for (i=0 ; i < super->max_part ; i++) + dl->vlist[i] = NULL; + super->dlist = dl; + dl->pdnum = -1; + for (i=0; i < __be16_to_cpu(super->active->max_pd_entries); i++) + if (memcmp(super->phys->entries[i].guid, + dl->disk.guid, DDF_GUID_LEN) == 0) + dl->pdnum = i; + + /* Now the config list. 
*/ + /* 'conf' is an array of config entries, some of which are + * probably invalid. Those which are good need to be copied into + * the conflist + */ + + conf = load_section(fd, super, NULL, + super->active->config_section_offset, + super->active->config_section_length, + 0); + + vnum = 0; + for (i = 0; + i < __be32_to_cpu(super->active->config_section_length); + i += super->conf_rec_len) { + struct vd_config *vd = + (struct vd_config *)((char*)conf + i*512); + struct vcl *vcl; + + if (vd->magic == DDF_SPARE_ASSIGN_MAGIC) { + if (dl->spare) + continue; + posix_memalign((void**)&dl->spare, 512, + super->conf_rec_len*512); + memcpy(dl->spare, vd, super->conf_rec_len*512); + continue; + } + if (vd->magic != DDF_VD_CONF_MAGIC) + continue; + for (vcl = super->conflist; vcl; vcl = vcl->next) { + if (memcmp(vcl->conf.guid, + vd->guid, DDF_GUID_LEN) == 0) + break; + } + + if (vcl) { + dl->vlist[vnum++] = vcl; + if (__be32_to_cpu(vd->seqnum) <= + __be32_to_cpu(vcl->conf.seqnum)) + continue; + } else { + posix_memalign((void**)&vcl, 512, + (super->conf_rec_len*512 + + offsetof(struct vcl, conf))); + vcl->next = super->conflist; + vcl->block_sizes = NULL; /* FIXME not for CONCAT */ + super->conflist = vcl; + dl->vlist[vnum++] = vcl; + } + memcpy(&vcl->conf, vd, super->conf_rec_len*512); + vcl->lba_offset = (__u64*) + &vcl->conf.phys_refnum[super->mppe]; + + for (i=0; i < max_virt_disks ; i++) + if (memcmp(super->virt->entries[i].guid, + vcl->conf.guid, DDF_GUID_LEN)==0) + break; + if (i < max_virt_disks) + vcl->vcnum = i; + } + free(conf); + + return 0; +} + +#ifndef MDASSEMBLE +static int load_super_ddf_all(struct supertype *st, int fd, + void **sbp, char *devname, int keep_fd); +#endif +static int load_super_ddf(struct supertype *st, int fd, + char *devname) +{ + unsigned long long dsize; + struct ddf_super *super; + int rv; + +#ifndef MDASSEMBLE + /* if 'fd' is a container, load metadata from all the devices */ + if (load_super_ddf_all(st, fd, &st->sb, devname, 1) == 0) 
+ return 0; +#endif + if (st->subarray[0]) + return 1; /* FIXME Is this correct */ + + if (get_dev_size(fd, devname, &dsize) == 0) + return 1; + + /* 32M is a lower bound */ + if (dsize <= 32*1024*1024) { + if (devname) { + fprintf(stderr, + Name ": %s is too small for ddf: " + "size is %llu sectors.\n", + devname, dsize>>9); + return 1; + } + } + if (dsize & 511) { + if (devname) { + fprintf(stderr, + Name ": %s is an odd size for ddf: " + "size is %llu bytes.\n", + devname, dsize); + return 1; + } + } + + if (posix_memalign((void**)&super, 512, sizeof(*super))!= 0) { + fprintf(stderr, Name ": malloc of %zu failed.\n", + sizeof(*super)); + return 1; + } + memset(super, 0, sizeof(*super)); + + rv = load_ddf_headers(fd, super, devname); + if (rv) { + free(super); + return rv; + } + + /* Have valid headers and have chosen the best. Let's read in the rest*/ + + rv = load_ddf_global(fd, super, devname); + + if (rv) { + if (devname) + fprintf(stderr, + Name ": Failed to load all information " + "sections on %s\n", devname); + free(super); + return rv; + } + + load_ddf_local(fd, super, devname, 0); + + /* Should possibly check the sections .... 
*/ + + st->sb = super; + if (st->ss == NULL) { + st->ss = &super_ddf; + st->minor_version = 0; + st->max_devs = 512; + } + return 0; + +} + +static void free_super_ddf(struct supertype *st) +{ + struct ddf_super *ddf = st->sb; + if (ddf == NULL) + return; + free(ddf->phys); + free(ddf->virt); + while (ddf->conflist) { + struct vcl *v = ddf->conflist; + ddf->conflist = v->next; + if (v->block_sizes) + free(v->block_sizes); + free(v); + } + while (ddf->dlist) { + struct dl *d = ddf->dlist; + ddf->dlist = d->next; + if (d->fd >= 0) + close(d->fd); + if (d->spare) + free(d->spare); + free(d); + } + free(ddf); + st->sb = NULL; +} + +static struct supertype *match_metadata_desc_ddf(char *arg) +{ + /* 'ddf' only support containers */ + struct supertype *st; + if (strcmp(arg, "ddf") != 0 && + strcmp(arg, "default") != 0 + ) + return NULL; + + st = malloc(sizeof(*st)); + memset(st, 0, sizeof(*st)); + st->ss = &super_ddf; + st->max_devs = 512; + st->minor_version = 0; + st->sb = NULL; + return st; +} + + +#ifndef MDASSEMBLE + +static mapping_t ddf_state[] = { + { "Optimal", 0}, + { "Degraded", 1}, + { "Deleted", 2}, + { "Missing", 3}, + { "Failed", 4}, + { "Partially Optimal", 5}, + { "-reserved-", 6}, + { "-reserved-", 7}, + { NULL, 0} +}; + +static mapping_t ddf_init_state[] = { + { "Not Initialised", 0}, + { "QuickInit in Progress", 1}, + { "Fully Initialised", 2}, + { "*UNKNOWN*", 3}, + { NULL, 0} +}; +static mapping_t ddf_access[] = { + { "Read/Write", 0}, + { "Reserved", 1}, + { "Read Only", 2}, + { "Blocked (no access)", 3}, + { NULL ,0} +}; + +static mapping_t ddf_level[] = { + { "RAID0", DDF_RAID0}, + { "RAID1", DDF_RAID1}, + { "RAID3", DDF_RAID3}, + { "RAID4", DDF_RAID4}, + { "RAID5", DDF_RAID5}, + { "RAID1E",DDF_RAID1E}, + { "JBOD", DDF_JBOD}, + { "CONCAT",DDF_CONCAT}, + { "RAID5E",DDF_RAID5E}, + { "RAID5EE",DDF_RAID5EE}, + { "RAID6", DDF_RAID6}, + { NULL, 0} +}; +static mapping_t ddf_sec_level[] = { + { "Striped", DDF_2STRIPED}, + { "Mirrored", DDF_2MIRRORED}, + 
{ "Concat", DDF_2CONCAT}, + { "Spanned", DDF_2SPANNED}, + { NULL, 0} +}; +#endif + +struct num_mapping { + int num1, num2; +}; +static struct num_mapping ddf_level_num[] = { + { DDF_RAID0, 0 }, + { DDF_RAID1, 1 }, + { DDF_RAID3, LEVEL_UNSUPPORTED }, + { DDF_RAID4, 4 }, + { DDF_RAID5, 5 }, + { DDF_RAID1E, LEVEL_UNSUPPORTED }, + { DDF_JBOD, LEVEL_UNSUPPORTED }, + { DDF_CONCAT, LEVEL_LINEAR }, + { DDF_RAID5E, LEVEL_UNSUPPORTED }, + { DDF_RAID5EE, LEVEL_UNSUPPORTED }, + { DDF_RAID6, 6}, + { MAXINT, MAXINT } +}; + +static int map_num1(struct num_mapping *map, int num) +{ + int i; + for (i=0 ; map[i].num1 != MAXINT; i++) + if (map[i].num1 == num) + break; + return map[i].num2; +} + +#ifndef MDASSEMBLE +static void print_guid(char *guid, int tstamp) +{ + /* A GUIDs are part (or all) ASCII and part binary. + * They tend to be space padded. + * We print the GUID in HEX, then in parentheses add + * any initial ASCII sequence, and a possible + * time stamp from bytes 16-19 + */ + int l = DDF_GUID_LEN; + int i; + + for (i=0 ; i<DDF_GUID_LEN ; i++) { + if ((i&3)==0 && i != 0) printf(":"); + printf("%02X", guid[i]&255); + } + + printf(" ("); + while (l && guid[l-1] == ' ') + l--; + for (i=0 ; i<l ; i++) { + if (guid[i] >= 0x20 && guid[i] < 0x7f) + fputc(guid[i], stdout); + else + break; + } + if (tstamp) { + time_t then = __be32_to_cpu(*(__u32*)(guid+16)) + DECADE; + char tbuf[100]; + struct tm *tm; + tm = localtime(&then); + strftime(tbuf, 100, " %D %T",tm); + fputs(tbuf, stdout); + } + printf(")"); +} + +static void examine_vd(int n, struct ddf_super *sb, char *guid) +{ + int crl = sb->conf_rec_len; + struct vcl *vcl; + + for (vcl = sb->conflist ; vcl ; vcl = vcl->next) { + struct vd_config *vc = &vcl->conf; + + if (calc_crc(vc, crl*512) != vc->crc) + continue; + if (memcmp(vc->guid, guid, DDF_GUID_LEN) != 0) + continue; + + /* Ok, we know about this VD, let's give more details */ + printf(" Raid Devices[%d] : %d\n", n, + __be16_to_cpu(vc->prim_elmnt_count)); + printf(" Chunk 
Size[%d] : %d sectors\n", n, + 1 << vc->chunk_shift); + printf(" Raid Level[%d] : %s\n", n, + map_num(ddf_level, vc->prl)?:"-unknown-"); + if (vc->sec_elmnt_count != 1) { + printf(" Secondary Position[%d] : %d of %d\n", n, + vc->sec_elmnt_seq, vc->sec_elmnt_count); + printf(" Secondary Level[%d] : %s\n", n, + map_num(ddf_sec_level, vc->srl) ?: "-unknown-"); + } + printf(" Device Size[%d] : %llu\n", n, + __be64_to_cpu(vc->blocks)/2); + printf(" Array Size[%d] : %llu\n", n, + __be64_to_cpu(vc->array_blocks)/2); + } +} + +static void examine_vds(struct ddf_super *sb) +{ + int cnt = __be16_to_cpu(sb->virt->populated_vdes); + int i; + printf(" Virtual Disks : %d\n", cnt); + + for (i=0; i<cnt; i++) { + struct virtual_entry *ve = &sb->virt->entries[i]; + printf(" VD GUID[%d] : ", i); print_guid(ve->guid, 1); + printf("\n"); + printf(" unit[%d] : %d\n", i, __be16_to_cpu(ve->unit)); + printf(" state[%d] : %s, %s%s\n", i, + map_num(ddf_state, ve->state & 7), + (ve->state & 8) ? "Morphing, ": "", + (ve->state & 16)? "Not Consistent" : "Consistent"); + printf(" init state[%d] : %s\n", i, + map_num(ddf_init_state, ve->init_state&3)); + printf(" access[%d] : %s\n", i, + map_num(ddf_access, (ve->init_state>>6) & 3)); + printf(" Name[%d] : %.16s\n", i, ve->name); + examine_vd(i, sb, ve->guid); + } + if (cnt) printf("\n"); +} + +static void examine_pds(struct ddf_super *sb) +{ + int cnt = __be16_to_cpu(sb->phys->used_pdes); + int i; + struct dl *dl; + printf(" Physical Disks : %d\n", cnt); + + for (i=0 ; i<cnt ; i++) { + struct phys_disk_entry *pd = &sb->phys->entries[i]; + int type = __be16_to_cpu(pd->type); + int state = __be16_to_cpu(pd->state); + + printf(" PD GUID[%d] : ", i); print_guid(pd->guid, 0); + printf("\n"); + printf(" ref[%d] : %08x\n", i, + __be32_to_cpu(pd->refnum)); + printf(" mode[%d] : %s%s%s%s%s\n", i, + (type&2) ? "active":"", + (type&4) ? "Global Spare":"", + (type&8) ? "spare" : "", + (type&16)? ", foreign" : "", + (type&32)? 
"pass-through" : ""); + printf(" state[%d] : %s%s%s%s%s%s%s\n", i, + (state&1)? "Online": "Offline", + (state&2)? ", Failed": "", + (state&4)? ", Rebuilding": "", + (state&8)? ", in-transition": "", + (state&16)? ", SMART errors": "", + (state&32)? ", Unrecovered Read Errors": "", + (state&64)? ", Missing" : ""); + printf(" Avail Size[%d] : %llu K\n", i, + __be64_to_cpu(pd->config_size)>>1); + for (dl = sb->dlist; dl ; dl = dl->next) { + if (dl->disk.refnum == pd->refnum) { + char *dv = map_dev(dl->major, dl->minor, 0); + if (dv) + printf(" Device[%d] : %s\n", + i, dv); + } + } + printf("\n"); + } +} + +static void examine_super_ddf(struct supertype *st, char *homehost) +{ + struct ddf_super *sb = st->sb; + + printf(" Magic : %08x\n", __be32_to_cpu(sb->anchor.magic)); + printf(" Version : %.8s\n", sb->anchor.revision); + printf("Controller GUID : "); print_guid(sb->controller.guid, 0); + printf("\n"); + printf(" Container GUID : "); print_guid(sb->anchor.guid, 1); + printf("\n"); + printf(" Seq : %08x\n", __be32_to_cpu(sb->active->seq)); + printf(" Redundant hdr : %s\n", sb->secondary.magic == DDF_HEADER_MAGIC + ?"yes" : "no"); + examine_vds(sb); + examine_pds(sb); +} + +static void brief_examine_super_ddf(struct supertype *st) +{ + /* We just write a generic DDF ARRAY entry + * The uuid is all hex, 6 groups of 4 bytes + */ + struct ddf_super *ddf = st->sb; + int i; + printf("ARRAY /dev/ddf metadata=ddf UUID="); + for (i = 0; i < DDF_GUID_LEN; i++) { + if ((i&3) == 0 && i != 0) + printf(":"); + printf("%02X", 255&ddf->anchor.guid[i]); + } + printf("\n"); +} + +static void detail_super_ddf(struct supertype *st, char *homehost) +{ + /* FIXME later + * Could print DDF GUID + * Need to find which array + * If whole, briefly list all arrays + * If one, give name + */ +} + +static void brief_detail_super_ddf(struct supertype *st) +{ + /* FIXME I really need to know which array we are detailing. + * Can that be stored in ddf_super?? 
+ */ +// struct ddf_super *ddf = st->sb; +} +#endif + +static int match_home_ddf(struct supertype *st, char *homehost) +{ + /* It matches 'this' host if the controller is a + * Linux-MD controller with vendor_data matching + * the hostname + */ + struct ddf_super *ddf = st->sb; + int len = strlen(homehost); + + return (memcmp(ddf->controller.guid, T10, 8) == 0 && + len < sizeof(ddf->controller.vendor_data) && + memcmp(ddf->controller.vendor_data, homehost,len) == 0 && + ddf->controller.vendor_data[len] == 0); +} + +static struct vd_config *find_vdcr(struct ddf_super *ddf, int inst) +{ + struct vcl *v; + + for (v = ddf->conflist; v; v = v->next) + if (inst == v->vcnum) + return &v->conf; + return NULL; +} + +static int find_phys(struct ddf_super *ddf, __u32 phys_refnum) +{ + /* Find the entry in phys_disk which has the given refnum + * and return it's index + */ + int i; + for (i=0; i < __be16_to_cpu(ddf->phys->max_pdes); i++) + if (ddf->phys->entries[i].refnum == phys_refnum) + return i; + return -1; +} + +static void uuid_from_super_ddf(struct supertype *st, int uuid[4]) +{ + /* The uuid returned here is used for: + * uuid to put into bitmap file (Create, Grow) + * uuid for backup header when saving critical section (Grow) + * comparing uuids when re-adding a device into an array + * For each of these we can make do with a truncated + * or hashed uuid rather than the original, as long as + * everyone agrees. + * In each case the uuid required is that of the data-array, + * not the device-set. + * In the case of SVD we assume the BVD is of interest, + * though that might be the case if a bitmap were made for + * a mirrored SVD - worry about that later. + * So we need to find the VD configuration record for the + * relevant BVD and extract the GUID and Secondary_Element_Seq. + * The first 16 bytes of the sha1 of these is used. 
+ */ + struct ddf_super *ddf = st->sb; + struct vcl *vcl = ddf->currentconf; + + if (!vcl) + memset(uuid, 0, sizeof (uuid)); + else { + char buf[20]; + struct sha1_ctx ctx; + sha1_init_ctx(&ctx); + sha1_process_bytes(&vcl->conf.guid, DDF_GUID_LEN, &ctx); + if (vcl->conf.sec_elmnt_count > 1) + sha1_process_bytes(&vcl->conf.sec_elmnt_seq, 1, &ctx); + sha1_finish_ctx(&ctx, buf); + memcpy(uuid, buf, sizeof(uuid)); + } +} + +static void getinfo_super_ddf_bvd(struct supertype *st, struct mdinfo *info); + +static void getinfo_super_ddf(struct supertype *st, struct mdinfo *info) +{ + struct ddf_super *ddf = st->sb; + + if (ddf->currentconf) { + getinfo_super_ddf_bvd(st, info); + return; + } + + info->array.raid_disks = __be16_to_cpu(ddf->phys->used_pdes); + info->array.level = LEVEL_CONTAINER; + info->array.layout = 0; + info->array.md_minor = -1; + info->array.ctime = DECADE + __be32_to_cpu(*(__u32*) + (ddf->anchor.guid+16)); + info->array.utime = 0; + info->array.chunk_size = 0; + + + info->disk.major = 0; + info->disk.minor = 0; + if (ddf->dlist) { + info->disk.number = __be32_to_cpu(ddf->dlist->disk.refnum); + info->disk.raid_disk = find_phys(ddf, ddf->dlist->disk.refnum); + + info->data_offset = __be64_to_cpu(ddf->phys-> + entries[info->disk.raid_disk]. + config_size); + info->component_size = ddf->dlist->size - info->data_offset; + } else { + info->disk.number = -1; +// info->disk.raid_disk = find refnum in the table and use index; + } + info->disk.state = (1 << MD_DISK_SYNC); + + + info->reshape_active = 0; + + strcpy(info->text_version, "ddf"); + +// uuid_from_super_ddf(info->uuid, sbv); + +// info->name[] ?? ; +} + +static int rlq_to_layout(int rlq, int prl, int raiddisks); + +static void getinfo_super_ddf_bvd(struct supertype *st, struct mdinfo *info) +{ + struct ddf_super *ddf = st->sb; + struct vcl *vc = ddf->currentconf; + int cd = ddf->currentdev; + + /* FIXME this returns BVD info - what if we want SVD ?? 
*/ + + info->array.raid_disks = __be16_to_cpu(vc->conf.prim_elmnt_count); + info->array.level = map_num1(ddf_level_num, vc->conf.prl); + info->array.layout = rlq_to_layout(vc->conf.rlq, vc->conf.prl, + info->array.raid_disks); + info->array.md_minor = -1; + info->array.ctime = DECADE + + __be32_to_cpu(*(__u32*)(vc->conf.guid+16)); + info->array.utime = DECADE + __be32_to_cpu(vc->conf.timestamp); + info->array.chunk_size = 512 << vc->conf.chunk_shift; + + if (cd >= 0 && cd < ddf->mppe) { + info->data_offset = __be64_to_cpu(vc->lba_offset[cd]); + if (vc->block_sizes) + info->component_size = vc->block_sizes[cd]; + else + info->component_size = __be64_to_cpu(vc->conf.blocks); + } + + info->disk.major = 0; + info->disk.minor = 0; +// info->disk.number = __be32_to_cpu(ddf->disk.refnum); +// info->disk.raid_disk = find refnum in the table and use index; +// info->disk.state = ???; + + info->container_member = ddf->currentconf->vcnum; + + info->resync_start = 0; + if (!(ddf->virt->entries[info->container_member].state + & DDF_state_inconsistent) && + (ddf->virt->entries[info->container_member].init_state + & DDF_initstate_mask) + == DDF_init_full) + info->resync_start = ~0ULL; + + uuid_from_super_ddf(st, info->uuid); + + info->container_member = atoi(st->subarray); + sprintf(info->text_version, "/%s/%s", + devnum2devname(st->container_dev), + st->subarray); + +// info->name[] ?? ; +} + + +static int update_super_ddf(struct supertype *st, struct mdinfo *info, + char *update, + char *devname, int verbose, + int uuid_set, char *homehost) +{ + /* For 'assemble' and 'force' we need to return non-zero if any + * change was made. For others, the return value is ignored. + * Update options are: + * force-one : This device looks a bit old but needs to be included, + * update age info appropriately. + * assemble: clear any 'faulty' flag to allow this device to + * be assembled. 
+ * force-array: Array is degraded but being forced, mark it clean + * if that will be needed to assemble it. + * + * newdev: not used ???? + * grow: Array has gained a new device - this is currently for + * linear only + * resync: mark as dirty so a resync will happen. + * uuid: Change the uuid of the array to match what is given + * homehost: update the recorded homehost + * name: update the name - preserving the homehost + * _reshape_progress: record new reshape_progress position. + * + * Following are not relevant for this version: + * sparc2.2 : update from old dodgey metadata + * super-minor: change the preferred_minor number + * summaries: update redundant counters. + */ + int rv = 0; +// struct ddf_super *ddf = st->sb; +// struct vd_config *vd = find_vdcr(ddf, info->container_member); +// struct virtual_entry *ve = find_ve(ddf); + + /* we don't need to handle "force-*" or "assemble" as + * there is no need to 'trick' the kernel. We the metadata is + * first updated to activate the array, all the implied modifications + * will just happen. + */ + + if (strcmp(update, "grow") == 0) { + /* FIXME */ + } + if (strcmp(update, "resync") == 0) { +// info->resync_checkpoint = 0; + } + /* We ignore UUID updates as they make even less sense + * with DDF + */ + if (strcmp(update, "homehost") == 0) { + /* homehost is stored in controller->vendor_data, + * or it is when we are the vendor + */ +// if (info->vendor_is_local) +// strcpy(ddf->controller.vendor_data, homehost); + } + if (strcmp(update, "name") == 0) { + /* name is stored in virtual_entry->name */ +// memset(ve->name, ' ', 16); +// strncpy(ve->name, info->name, 16); + } + if (strcmp(update, "_reshape_progress") == 0) { + /* We don't support reshape yet */ + } + +// update_all_csum(ddf); + + return rv; +} + +static void make_header_guid(char *guid) +{ + __u32 stamp; + int rfd; + /* Create a DDF Header of Virtual Disk GUID */ + + /* 24 bytes of fiction required. 
+ * first 8 are a 'vendor-id' - "Linux-MD" + * next 8 are controller type.. how about 0X DEAD BEEF 0000 0000 + * Remaining 8 random number plus timestamp + */ + memcpy(guid, T10, sizeof(T10)); + stamp = __cpu_to_be32(0xdeadbeef); + memcpy(guid+8, &stamp, 4); + stamp = __cpu_to_be32(0); + memcpy(guid+12, &stamp, 4); + stamp = __cpu_to_be32(time(0) - DECADE); + memcpy(guid+16, &stamp, 4); + rfd = open("/dev/urandom", O_RDONLY); + if (rfd < 0 || read(rfd, &stamp, 4) != 4) + stamp = random(); + memcpy(guid+20, &stamp, 4); + if (rfd >= 0) close(rfd); +} + +static int init_super_ddf_bvd(struct supertype *st, + mdu_array_info_t *info, + unsigned long long size, + char *name, char *homehost, + int *uuid); + +static int init_super_ddf(struct supertype *st, + mdu_array_info_t *info, + unsigned long long size, char *name, char *homehost, + int *uuid) +{ + /* This is primarily called by Create when creating a new array. + * We will then get add_to_super called for each component, and then + * write_init_super called to write it out to each device. + * For DDF, Create can create on fresh devices or on a pre-existing + * array. + * To create on a pre-existing array a different method will be called. + * This one is just for fresh drives. + * + * We need to create the entire 'ddf' structure which includes: + * DDF headers - these are easy. + * Controller data - a Sector describing this controller .. not that + * this is a controller exactly. + * Physical Disk Record - one entry per device, so + * leave plenty of space. + * Virtual Disk Records - again, just leave plenty of space. + * This just lists VDs, doesn't give details + * Config records - describes the VDs that use this disk + * DiskData - describes 'this' device. + * BadBlockManagement - empty + * Diag Space - empty + * Vendor Logs - Could we put bitmaps here? 
+ * + */ + struct ddf_super *ddf; + char hostname[17]; + int hostlen; + int max_phys_disks, max_virt_disks; + unsigned long long sector; + int clen; + int i; + int pdsize, vdsize; + struct phys_disk *pd; + struct virtual_disk *vd; + + if (!info) { + st->sb = NULL; + return 0; + } + if (st->sb) + return init_super_ddf_bvd(st, info, size, name, homehost, + uuid); + + posix_memalign((void**)&ddf, 512, sizeof(*ddf)); + memset(ddf, 0, sizeof(*ddf)); + ddf->dlist = NULL; /* no physical disks yet */ + ddf->conflist = NULL; /* No virtual disks yet */ + + /* At least 32MB *must* be reserved for the ddf. So let's just + * start 32MB from the end, and put the primary header there. + * Don't do secondary for now. + * We don't know exactly where that will be yet as it could be + * different on each device. To just set up the lengths. + * + */ + + ddf->anchor.magic = DDF_HEADER_MAGIC; + make_header_guid(ddf->anchor.guid); + + memcpy(ddf->anchor.revision, DDF_REVISION_2, 8); + ddf->anchor.seq = __cpu_to_be32(1); + ddf->anchor.timestamp = __cpu_to_be32(time(0) - DECADE); + ddf->anchor.openflag = 0xFF; + ddf->anchor.foreignflag = 0; + ddf->anchor.enforcegroups = 0; /* Is this best?? */ + ddf->anchor.pad0 = 0xff; + memset(ddf->anchor.pad1, 0xff, 12); + memset(ddf->anchor.header_ext, 0xff, 32); + ddf->anchor.primary_lba = ~(__u64)0; + ddf->anchor.secondary_lba = ~(__u64)0; + ddf->anchor.type = DDF_HEADER_ANCHOR; + memset(ddf->anchor.pad2, 0xff, 3); + ddf->anchor.workspace_len = __cpu_to_be32(32768); /* Must be reserved */ + ddf->anchor.workspace_lba = ~(__u64)0; /* Put this at bottom + of 32M reserved.. */ + max_phys_disks = 1023; /* Should be enough */ + ddf->anchor.max_pd_entries = __cpu_to_be16(max_phys_disks); + max_virt_disks = 255; + ddf->anchor.max_vd_entries = __cpu_to_be16(max_virt_disks); /* ?? */ + ddf->anchor.max_partitions = __cpu_to_be16(64); /* ?? 
*/ + ddf->max_part = 64; + ddf->mppe = 256; + ddf->conf_rec_len = 1 + ROUND_UP(ddf->mppe * (4+8), 512)/512; + ddf->anchor.config_record_len = __cpu_to_be16(ddf->conf_rec_len); + ddf->anchor.max_primary_element_entries = __cpu_to_be16(ddf->mppe); + memset(ddf->anchor.pad3, 0xff, 54); + /* controller sections is one sector long immediately + * after the ddf header */ + sector = 1; + ddf->anchor.controller_section_offset = __cpu_to_be32(sector); + ddf->anchor.controller_section_length = __cpu_to_be32(1); + sector += 1; + + /* phys is 8 sectors after that */ + pdsize = ROUND_UP(sizeof(struct phys_disk) + + sizeof(struct phys_disk_entry)*max_phys_disks, + 512); + switch(pdsize/512) { + case 2: case 8: case 32: case 128: case 512: break; + default: abort(); + } + ddf->anchor.phys_section_offset = __cpu_to_be32(sector); + ddf->anchor.phys_section_length = + __cpu_to_be32(pdsize/512); /* max_primary_element_entries/8 */ + sector += pdsize/512; + + /* virt is another 32 sectors */ + vdsize = ROUND_UP(sizeof(struct virtual_disk) + + sizeof(struct virtual_entry) * max_virt_disks, + 512); + switch(vdsize/512) { + case 2: case 8: case 32: case 128: case 512: break; + default: abort(); + } + ddf->anchor.virt_section_offset = __cpu_to_be32(sector); + ddf->anchor.virt_section_length = + __cpu_to_be32(vdsize/512); /* max_vd_entries/8 */ + sector += vdsize/512; + + clen = ddf->conf_rec_len * (ddf->max_part+1); + ddf->anchor.config_section_offset = __cpu_to_be32(sector); + ddf->anchor.config_section_length = __cpu_to_be32(clen); + sector += clen; + + ddf->anchor.data_section_offset = __cpu_to_be32(sector); + ddf->anchor.data_section_length = __cpu_to_be32(1); + sector += 1; + + ddf->anchor.bbm_section_length = __cpu_to_be32(0); + ddf->anchor.bbm_section_offset = __cpu_to_be32(0xFFFFFFFF); + ddf->anchor.diag_space_length = __cpu_to_be32(0); + ddf->anchor.diag_space_offset = __cpu_to_be32(0xFFFFFFFF); + ddf->anchor.vendor_length = __cpu_to_be32(0); + ddf->anchor.vendor_offset = 
__cpu_to_be32(0xFFFFFFFF); + + memset(ddf->anchor.pad4, 0xff, 256); + + memcpy(&ddf->primary, &ddf->anchor, 512); + memcpy(&ddf->secondary, &ddf->anchor, 512); + + ddf->primary.openflag = 1; /* I guess.. */ + ddf->primary.type = DDF_HEADER_PRIMARY; + + ddf->secondary.openflag = 1; /* I guess.. */ + ddf->secondary.type = DDF_HEADER_SECONDARY; + + ddf->active = &ddf->primary; + + ddf->controller.magic = DDF_CONTROLLER_MAGIC; + + /* 24 more bytes of fiction required. + * first 8 are a 'vendor-id' - "Linux-MD" + * Remaining 16 are serial number.... maybe a hostname would do? + */ + memcpy(ddf->controller.guid, T10, sizeof(T10)); + gethostname(hostname, sizeof(hostname)); + hostname[sizeof(hostname) - 1] = 0; + hostlen = strlen(hostname); + memcpy(ddf->controller.guid + 24 - hostlen, hostname, hostlen); + for (i = strlen(T10) ; i+hostlen < 24; i++) + ddf->controller.guid[i] = ' '; + + ddf->controller.type.vendor_id = __cpu_to_be16(0xDEAD); + ddf->controller.type.device_id = __cpu_to_be16(0xBEEF); + ddf->controller.type.sub_vendor_id = 0; + ddf->controller.type.sub_device_id = 0; + memcpy(ddf->controller.product_id, "What Is My PID??", 16); + memset(ddf->controller.pad, 0xff, 8); + memset(ddf->controller.vendor_data, 0xff, 448); + + posix_memalign((void**)&pd, 512, pdsize); + ddf->phys = pd; + ddf->pdsize = pdsize; + + memset(pd, 0xff, pdsize); + memset(pd, 0, sizeof(*pd)); + pd->magic = DDF_PHYS_DATA_MAGIC; + pd->used_pdes = __cpu_to_be16(0); + pd->max_pdes = __cpu_to_be16(max_phys_disks); + memset(pd->pad, 0xff, 52); + + posix_memalign((void**)&vd, 512, vdsize); + ddf->virt = vd; + ddf->vdsize = vdsize; + memset(vd, 0, vdsize); + vd->magic = DDF_VIRT_RECORDS_MAGIC; + vd->populated_vdes = __cpu_to_be16(0); + vd->max_vdes = __cpu_to_be16(max_virt_disks); + memset(vd->pad, 0xff, 52); + + for (i=0; i<max_virt_disks; i++) + memset(&vd->entries[i], 0xff, sizeof(struct virtual_entry)); + + st->sb = ddf; + ddf->updates_pending = 1; + return 1; +} + +static int all_ff(char 
*guid) +{ + int i; + for (i = 0; i < DDF_GUID_LEN; i++) + if (guid[i] != (char)0xff) + return 0; + return 1; +} +static int chunk_to_shift(int chunksize) +{ + return ffs(chunksize/512)-1; +} + +static int level_to_prl(int level) +{ + switch (level) { + case LEVEL_LINEAR: return DDF_CONCAT; + case 0: return DDF_RAID0; + case 1: return DDF_RAID1; + case 4: return DDF_RAID4; + case 5: return DDF_RAID5; + case 6: return DDF_RAID6; + default: return -1; + } +} +static int layout_to_rlq(int level, int layout, int raiddisks) +{ + switch(level) { + case 0: + return DDF_RAID0_SIMPLE; + case 1: + switch(raiddisks) { + case 2: return DDF_RAID1_SIMPLE; + case 3: return DDF_RAID1_MULTI; + default: return -1; + } + case 4: + switch(layout) { + case 0: return DDF_RAID4_N; + } + break; + case 5: + case 6: + switch(layout) { + case ALGORITHM_LEFT_ASYMMETRIC: + return DDF_RAID5_N_RESTART; + case ALGORITHM_RIGHT_ASYMMETRIC: + if (level == 5) + return DDF_RAID5_0_RESTART; + else + return DDF_RAID6_0_RESTART; + case ALGORITHM_LEFT_SYMMETRIC: + return DDF_RAID5_N_CONTINUE; + case ALGORITHM_RIGHT_SYMMETRIC: + return -1; /* not mentioned in standard */ + } + } + return -1; +} + +static int rlq_to_layout(int rlq, int prl, int raiddisks) +{ + switch(prl) { + case DDF_RAID0: + return 0; /* hopefully rlq == DDF_RAID0_SIMPLE */ + case DDF_RAID1: + return 0; /* hopefully rlq == SIMPLE or MULTI depending + on raiddisks*/ + case DDF_RAID4: + switch(rlq) { + case DDF_RAID4_N: + return 0; + default: + /* not supported */ + return -1; /* FIXME this isn't checked */ + } + case DDF_RAID5: + switch(rlq) { + case DDF_RAID5_N_RESTART: + return ALGORITHM_LEFT_ASYMMETRIC; + case DDF_RAID5_0_RESTART: + return ALGORITHM_RIGHT_ASYMMETRIC; + case DDF_RAID5_N_CONTINUE: + return ALGORITHM_LEFT_SYMMETRIC; + default: + return -1; + } + case DDF_RAID6: + switch(rlq) { + case DDF_RAID5_N_RESTART: + return ALGORITHM_LEFT_ASYMMETRIC; + case DDF_RAID6_0_RESTART: + return ALGORITHM_RIGHT_ASYMMETRIC; + case 
DDF_RAID5_N_CONTINUE: + return ALGORITHM_LEFT_SYMMETRIC; + default: + return -1; + } + } + return -1; +} + +struct extent { + unsigned long long start, size; +}; +static int cmp_extent(const void *av, const void *bv) +{ + const struct extent *a = av; + const struct extent *b = bv; + if (a->start < b->start) + return -1; + if (a->start > b->start) + return 1; + return 0; +} + +static struct extent *get_extents(struct ddf_super *ddf, struct dl *dl) +{ + /* find a list of used extents on the give physical device + * (dnum) of the given ddf. + * Return a malloced array of 'struct extent' + +FIXME ignore DDF_Legacy devices? + + */ + struct extent *rv; + int n = 0; + int i, j; + + rv = malloc(sizeof(struct extent) * (ddf->max_part + 2)); + if (!rv) + return NULL; + + for (i = 0; i < ddf->max_part; i++) { + struct vcl *v = dl->vlist[i]; + if (v == NULL) + continue; + for (j=0; j < v->conf.prim_elmnt_count; j++) + if (v->conf.phys_refnum[j] == dl->disk.refnum) { + /* This device plays role 'j' in 'v'. */ + rv[n].start = __be64_to_cpu(v->lba_offset[j]); + rv[n].size = __be64_to_cpu(v->conf.blocks); + n++; + break; + } + } + qsort(rv, n, sizeof(*rv), cmp_extent); + + rv[n].start = __be64_to_cpu(ddf->phys->entries[dl->pdnum].config_size); + rv[n].size = 0; + return rv; +} + +static int init_super_ddf_bvd(struct supertype *st, + mdu_array_info_t *info, + unsigned long long size, + char *name, char *homehost, + int *uuid) +{ + /* We are creating a BVD inside a pre-existing container. + * so st->sb is already set. 
+ * We need to create a new vd_config and a new virtual_entry + */ + struct ddf_super *ddf = st->sb; + int venum; + struct virtual_entry *ve; + struct vcl *vcl; + struct vd_config *vc; + + if (__be16_to_cpu(ddf->virt->populated_vdes) + >= __be16_to_cpu(ddf->virt->max_vdes)) { + fprintf(stderr, Name": This ddf already has the " + "maximum of %d virtual devices\n", + __be16_to_cpu(ddf->virt->max_vdes)); + return 0; + } + + for (venum = 0; venum < __be16_to_cpu(ddf->virt->max_vdes); venum++) + if (all_ff(ddf->virt->entries[venum].guid)) + break; + if (venum == __be16_to_cpu(ddf->virt->max_vdes)) { + fprintf(stderr, Name ": Cannot find spare slot for " + "virtual disk - DDF is corrupt\n"); + return 0; + } + ve = &ddf->virt->entries[venum]; + + /* A Virtual Disk GUID contains the T10 Vendor ID, controller type, + * timestamp, random number + */ + make_header_guid(ve->guid); + ve->unit = __cpu_to_be16(info->md_minor); + ve->pad0 = 0xFFFF; + ve->guid_crc = crc32(0, (unsigned char*)ddf->anchor.guid, DDF_GUID_LEN); + ve->type = 0; + ve->state = DDF_state_degraded; /* Will be modified as devices are added */ + if (info->state & 1) /* clean */ + ve->init_state = DDF_init_full; + else + ve->init_state = DDF_init_not; + + memset(ve->pad1, 0xff, 14); + memset(ve->name, ' ', 16); + if (name) + strncpy(ve->name, name, 16); + ddf->virt->populated_vdes = + __cpu_to_be16(__be16_to_cpu(ddf->virt->populated_vdes)+1); + + /* Now create a new vd_config */ + posix_memalign((void**)&vcl, 512, + (offsetof(struct vcl, conf) + ddf->conf_rec_len * 512)); + vcl->lba_offset = (__u64*) &vcl->conf.phys_refnum[ddf->mppe]; + vcl->vcnum = venum; + sprintf(st->subarray, "%d", venum); + vcl->block_sizes = NULL; /* FIXME not for CONCAT */ + + vc = &vcl->conf; + + vc->magic = DDF_VD_CONF_MAGIC; + memcpy(vc->guid, ve->guid, DDF_GUID_LEN); + vc->timestamp = __cpu_to_be32(time(0)-DECADE); + vc->seqnum = __cpu_to_be32(1); + memset(vc->pad0, 0xff, 24); + vc->prim_elmnt_count = __cpu_to_be16(info->raid_disks); 
+ vc->chunk_shift = chunk_to_shift(info->chunk_size); + vc->prl = level_to_prl(info->level); + vc->rlq = layout_to_rlq(info->level, info->layout, info->raid_disks); + vc->sec_elmnt_count = 1; + vc->sec_elmnt_seq = 0; + vc->srl = 0; + vc->blocks = __cpu_to_be64(info->size * 2); + vc->array_blocks = __cpu_to_be64( + calc_array_size(info->level, info->raid_disks, info->layout, + info->chunk_size, info->size*2)); + memset(vc->pad1, 0xff, 8); + vc->spare_refs[0] = 0xffffffff; + vc->spare_refs[1] = 0xffffffff; + vc->spare_refs[2] = 0xffffffff; + vc->spare_refs[3] = 0xffffffff; + vc->spare_refs[4] = 0xffffffff; + vc->spare_refs[5] = 0xffffffff; + vc->spare_refs[6] = 0xffffffff; + vc->spare_refs[7] = 0xffffffff; + memset(vc->cache_pol, 0, 8); + vc->bg_rate = 0x80; + memset(vc->pad2, 0xff, 3); + memset(vc->pad3, 0xff, 52); + memset(vc->pad4, 0xff, 192); + memset(vc->v0, 0xff, 32); + memset(vc->v1, 0xff, 32); + memset(vc->v2, 0xff, 16); + memset(vc->v3, 0xff, 16); + memset(vc->vendor, 0xff, 32); + + memset(vc->phys_refnum, 0xff, 4*ddf->mppe); + memset(vc->phys_refnum+(ddf->mppe * 4), 0x00, 8*ddf->mppe); + + vcl->next = ddf->conflist; + ddf->conflist = vcl; + ddf->currentconf = vcl; + ddf->updates_pending = 1; + return 1; +} + +static void add_to_super_ddf_bvd(struct supertype *st, + mdu_disk_info_t *dk, int fd, char *devname) +{ + /* fd and devname identify a device with-in the ddf container (st). + * dk identifies a location in the new BVD. + * We need to find suitable free space in that device and update + * the phys_refnum and lba_offset for the newly created vd_config. + * We might also want to update the type in the phys_disk + * section. + */ + struct dl *dl; + struct ddf_super *ddf = st->sb; + struct vd_config *vc; + __u64 *lba_offset; + int working; + int i; + unsigned long long blocks, pos, esize; + struct extent *ex; + + for (dl = ddf->dlist; dl ; dl = dl->next) + if (dl->major == dk->major && + dl->minor == dk->minor) + break; + if (!dl || ! 
(dk->state & (1<<MD_DISK_SYNC))) + return; + + vc = &ddf->currentconf->conf; + lba_offset = ddf->currentconf->lba_offset; + + ex = get_extents(ddf, dl); + if (!ex) + return; + + i = 0; pos = 0; + blocks = __be64_to_cpu(vc->blocks); + if (ddf->currentconf->block_sizes) + blocks = ddf->currentconf->block_sizes[dk->raid_disk]; + + do { + esize = ex[i].start - pos; + if (esize >= blocks) + break; + pos = ex[i].start + ex[i].size; + i++; + } while (ex[i-1].size); + + free(ex); + if (esize < blocks) + return; + + ddf->currentdev = dk->raid_disk; + vc->phys_refnum[dk->raid_disk] = dl->disk.refnum; + lba_offset[dk->raid_disk] = __cpu_to_be64(pos); + + for (i=0; i < ddf->max_part ; i++) + if (dl->vlist[i] == NULL) + break; + if (i == ddf->max_part) + return; + dl->vlist[i] = ddf->currentconf; + + dl->fd = fd; + dl->devname = devname; + + /* Check how many working raid_disks, and if we can mark + * array as optimal yet + */ + working = 0; + + for (i=0; i < __be16_to_cpu(vc->prim_elmnt_count); i++) + if (vc->phys_refnum[i] != 0xffffffff) + working++; + + /* Find which virtual_entry */ + i = ddf->currentconf->vcnum; + if (working == __be16_to_cpu(vc->prim_elmnt_count)) + ddf->virt->entries[i].state = + (ddf->virt->entries[i].state & ~DDF_state_mask) + | DDF_state_optimal; + + if (vc->prl == DDF_RAID6 && + working+1 == __be16_to_cpu(vc->prim_elmnt_count)) + ddf->virt->entries[i].state = + (ddf->virt->entries[i].state & ~DDF_state_mask) + | DDF_state_part_optimal; + + ddf->phys->entries[dl->pdnum].type &= ~__cpu_to_be16(DDF_Global_Spare); + ddf->phys->entries[dl->pdnum].type |= __cpu_to_be16(DDF_Active_in_VD); + ddf->updates_pending = 1; +} + +/* add a device to a container, either while creating it or while + * expanding a pre-existing container + */ +static void add_to_super_ddf(struct supertype *st, + mdu_disk_info_t *dk, int fd, char *devname) +{ + struct ddf_super *ddf = st->sb; + struct dl *dd; + time_t now; + struct tm *tm; + unsigned long long size; + struct 
phys_disk_entry *pde; + int n, i; + struct stat stb; + + if (ddf->currentconf) { + add_to_super_ddf_bvd(st, dk, fd, devname); + return; + } + + /* This is device numbered dk->number. We need to create + * a phys_disk entry and a more detailed disk_data entry. + */ + fstat(fd, &stb); + posix_memalign((void**)&dd, 512, + sizeof(*dd) + sizeof(dd->vlist[0]) * ddf->max_part); + dd->major = major(stb.st_rdev); + dd->minor = minor(stb.st_rdev); + dd->devname = devname; + dd->next = ddf->dlist; + dd->fd = fd; + dd->spare = NULL; + + dd->disk.magic = DDF_PHYS_DATA_MAGIC; + now = time(0); + tm = localtime(&now); + sprintf(dd->disk.guid, "%8s%04d%02d%02d", + T10, tm->tm_year+1900, tm->tm_mon+1, tm->tm_mday); + *(__u32*)(dd->disk.guid + 16) = random(); + *(__u32*)(dd->disk.guid + 20) = random(); + + do { + /* Cannot be bothered finding a CRC of some irrelevant details*/ + dd->disk.refnum = random(); + for (i = __be16_to_cpu(ddf->active->max_pd_entries) - 1; + i >= 0; i--) + if (ddf->phys->entries[i].refnum == dd->disk.refnum) + break; + } while (i >= 0); + + dd->disk.forced_ref = 1; + dd->disk.forced_guid = 1; + memset(dd->disk.vendor, ' ', 32); + memcpy(dd->disk.vendor, "Linux", 5); + memset(dd->disk.pad, 0xff, 442); + for (i = 0; i < ddf->max_part ; i++) + dd->vlist[i] = NULL; + + n = __be16_to_cpu(ddf->phys->used_pdes); + pde = &ddf->phys->entries[n]; + dd->pdnum = n; + + n++; + ddf->phys->used_pdes = __cpu_to_be16(n); + + memcpy(pde->guid, dd->disk.guid, DDF_GUID_LEN); + pde->refnum = dd->disk.refnum; + pde->type = __cpu_to_be16(DDF_Forced_PD_GUID | DDF_Global_Spare); + pde->state = __cpu_to_be16(DDF_Online); + get_dev_size(fd, NULL, &size); + /* We are required to reserve 32Meg, and record the size in sectors */ + pde->config_size = __cpu_to_be64( (size - 32*1024*1024) / 512); + sprintf(pde->path, "%17.17s","Information: nil") ; + memset(pde->pad, 0xff, 6); + + dd->size = size >> 9; + ddf->dlist = dd; + ddf->updates_pending = 1; +} + +/* + * This is the write_init_super 
method for a ddf container. It is + * called when creating a container or adding another device to a + * container. + */ + +#ifndef MDASSEMBLE + +static unsigned char null_conf[4096+512]; + +static int __write_init_super_ddf(struct supertype *st, int do_close) +{ + + struct ddf_super *ddf = st->sb; + int i; + struct dl *d; + int n_config; + int conf_size; + + unsigned long long size, sector; + + for (d = ddf->dlist; d; d=d->next) { + int fd = d->fd; + + if (fd < 0) + continue; + + /* We need to fill in the primary, (secondary) and workspace + * lba's in the headers, set their checksums, + * Also checksum phys, virt.... + * + * Then write everything out, finally the anchor is written. + */ + get_dev_size(fd, NULL, &size); + size /= 512; + ddf->anchor.workspace_lba = __cpu_to_be64(size - 32*1024*2); + ddf->anchor.primary_lba = __cpu_to_be64(size - 16*1024*2); + ddf->anchor.seq = __cpu_to_be32(1); + memcpy(&ddf->primary, &ddf->anchor, 512); + memcpy(&ddf->secondary, &ddf->anchor, 512); + + ddf->anchor.openflag = 0xFF; /* 'open' means nothing */ + ddf->anchor.seq = 0xFFFFFFFF; /* no sequencing in anchor */ + ddf->anchor.crc = calc_crc(&ddf->anchor, 512); + + ddf->primary.openflag = 0; + ddf->primary.type = DDF_HEADER_PRIMARY; + + ddf->secondary.openflag = 0; + ddf->secondary.type = DDF_HEADER_SECONDARY; + + ddf->primary.crc = calc_crc(&ddf->primary, 512); + ddf->secondary.crc = calc_crc(&ddf->secondary, 512); + + sector = size - 16*1024*2; + lseek64(fd, sector<<9, 0); + write(fd, &ddf->primary, 512); + + ddf->controller.crc = calc_crc(&ddf->controller, 512); + write(fd, &ddf->controller, 512); + + ddf->phys->crc = calc_crc(ddf->phys, ddf->pdsize); + + write(fd, ddf->phys, ddf->pdsize); + + ddf->virt->crc = calc_crc(ddf->virt, ddf->vdsize); + write(fd, ddf->virt, ddf->vdsize); + + /* Now write lots of config records. 
*/ + n_config = ddf->max_part; + conf_size = ddf->conf_rec_len * 512; + for (i = 0 ; i <= n_config ; i++) { + struct vcl *c = d->vlist[i]; + if (i == n_config) + c = (struct vcl*)d->spare; + + if (c) { + c->conf.crc = calc_crc(&c->conf, conf_size); + write(fd, &c->conf, conf_size); + } else { + char *null_aligned = (char*)((((unsigned long)null_conf)+511)&~511UL); + if (null_conf[0] != 0xff) + memset(null_conf, 0xff, sizeof(null_conf)); + int togo = conf_size; + while (togo > sizeof(null_conf)-512) { + write(fd, null_aligned, sizeof(null_conf)-512); + togo -= sizeof(null_conf)-512; + } + write(fd, null_aligned, togo); + } + } + d->disk.crc = calc_crc(&d->disk, 512); + write(fd, &d->disk, 512); + + /* Maybe do the same for secondary */ + + lseek64(fd, (size-1)*512, SEEK_SET); + write(fd, &ddf->anchor, 512); + if (do_close) { + close(fd); + d->fd = -1; + } + } + return 1; +} + +static int write_init_super_ddf(struct supertype *st) +{ + + if (st->update_tail) { + /* queue the virtual_disk and vd_config as metadata updates */ + struct virtual_disk *vd; + struct vd_config *vc; + struct ddf_super *ddf = st->sb; + int len; + + /* First the virtual disk. We have a slightly fake header */ + len = sizeof(struct virtual_disk) + sizeof(struct virtual_entry); + vd = malloc(len); + *vd = *ddf->virt; + vd->entries[0] = ddf->virt->entries[ddf->currentconf->vcnum]; + vd->populated_vdes = __cpu_to_be16(ddf->currentconf->vcnum); + append_metadata_update(st, vd, len); + + /* Then the vd_config */ + len = ddf->conf_rec_len * 512; + vc = malloc(len); + memcpy(vc, &ddf->currentconf->conf, len); + append_metadata_update(st, vc, len); + + /* FIXME I need to close the fds! 
*/ + return 0; + } else + return __write_init_super_ddf(st, 1); +} + +#endif + +static __u64 avail_size_ddf(struct supertype *st, __u64 devsize) +{ + /* We must reserve the last 32Meg */ + if (devsize <= 32*1024*2) + return 0; + return devsize - 32*1024*2; +} + +#ifndef MDASSEMBLE +static int +validate_geometry_ddf_container(struct supertype *st, + int level, int layout, int raiddisks, + int chunk, unsigned long long size, + char *dev, unsigned long long *freesize, + int verbose); + +static int validate_geometry_ddf_bvd(struct supertype *st, + int level, int layout, int raiddisks, + int chunk, unsigned long long size, + char *dev, unsigned long long *freesize, + int verbose); + +static int validate_geometry_ddf(struct supertype *st, + int level, int layout, int raiddisks, + int chunk, unsigned long long size, + char *dev, unsigned long long *freesize, + int verbose) +{ + int fd; + struct mdinfo *sra; + int cfd; + + /* ddf potentially supports lots of things, but it depends on + * what devices are offered (and maybe kernel version?) + * If given unused devices, we will make a container. + * If given devices in a container, we will make a BVD. + * If given BVDs, we make an SVD, changing all the GUIDs in the process. + */ + + if (level == LEVEL_CONTAINER) { + /* Must be a fresh device to add to a container */ + return validate_geometry_ddf_container(st, level, layout, + raiddisks, chunk, + size, dev, freesize, + verbose); + } + + if (st->sb) { + /* A container has already been opened, so we are + * creating in there. Maybe a BVD, maybe an SVD. + * Should make a distinction one day. + */ + return validate_geometry_ddf_bvd(st, level, layout, raiddisks, + chunk, size, dev, freesize, + verbose); + } + if (!dev) { + /* Initial sanity check. Exclude illegal levels. */ + int i; + for (i=0; ddf_level_num[i].num1 != MAXINT; i++) + if (ddf_level_num[i].num2 == level) + break; + if (ddf_level_num[i].num1 == MAXINT) + return 0; + /* Should check layout? 
etc */ + return 1; + } + + /* This is the first device for the array. + * If it is a container, we read it in and do automagic allocations, + * no other devices should be given. + * Otherwise it must be a member device of a container, and we + * do manual allocation. + * Later we should check for a BVD and make an SVD. + */ + fd = open(dev, O_RDONLY|O_EXCL, 0); + if (fd >= 0) { + sra = sysfs_read(fd, 0, GET_VERSION); + close(fd); + if (sra && sra->array.major_version == -1 && + strcmp(sra->text_version, "ddf") == 0) { + + /* load super */ + /* find space for 'n' devices. */ + /* remember the devices */ + /* Somehow return the fact that we have enough */ + } + + if (verbose) + fprintf(stderr, + Name ": ddf: Cannot create this array " + "on device %s\n", + dev); + return 0; + } + if (errno != EBUSY || (fd = open(dev, O_RDONLY, 0)) < 0) { + if (verbose) + fprintf(stderr, Name ": ddf: Cannot open %s: %s\n", + dev, strerror(errno)); + return 0; + } + /* Well, it is in use by someone, maybe a 'ddf' container. */ + cfd = open_container(fd); + if (cfd < 0) { + close(fd); + if (verbose) + fprintf(stderr, Name ": ddf: Cannot use %s: %s\n", + dev, strerror(EBUSY)); + return 0; + } + sra = sysfs_read(cfd, 0, GET_VERSION); + close(fd); + if (sra && sra->array.major_version == -1 && + strcmp(sra->text_version, "ddf") == 0) { + /* This is a member of a ddf container. 
Load the container + * and try to create a bvd + */ + struct ddf_super *ddf; + if (load_super_ddf_all(st, cfd, (void **)&ddf, NULL, 1) == 0) { + st->sb = ddf; + st->container_dev = fd2devnum(cfd); + close(cfd); + return validate_geometry_ddf_bvd(st, level, layout, + raiddisks, chunk, size, + dev, freesize, + verbose); + } + close(cfd); + } else /* device may belong to a different container */ + return 0; + + return 1; +} + +static int +validate_geometry_ddf_container(struct supertype *st, + int level, int layout, int raiddisks, + int chunk, unsigned long long size, + char *dev, unsigned long long *freesize, + int verbose) +{ + int fd; + unsigned long long ldsize; + + if (level != LEVEL_CONTAINER) + return 0; + if (!dev) + return 1; + + fd = open(dev, O_RDONLY|O_EXCL, 0); + if (fd < 0) { + if (verbose) + fprintf(stderr, Name ": ddf: Cannot open %s: %s\n", + dev, strerror(errno)); + return 0; + } + if (!get_dev_size(fd, dev, &ldsize)) { + close(fd); + return 0; + } + close(fd); + + *freesize = avail_size_ddf(st, ldsize >> 9); + + return 1; +} + +static int validate_geometry_ddf_bvd(struct supertype *st, + int level, int layout, int raiddisks, + int chunk, unsigned long long size, + char *dev, unsigned long long *freesize, + int verbose) +{ + struct stat stb; + struct ddf_super *ddf = st->sb; + struct dl *dl; + unsigned long long pos = 0; + unsigned long long maxsize; + struct extent *e; + int i; + /* ddf/bvd supports lots of things, but not containers */ + if (level == LEVEL_CONTAINER) + return 0; + /* We must have the container info already read in. */ + if (!ddf) + return 0; + + if (!dev) { + /* General test: make sure there is space for + * 'raiddisks' device extents of size 'size'. 
+ */ + unsigned long long minsize = size; + int dcnt = 0; + if (minsize == 0) + minsize = 8; + for (dl = ddf->dlist; dl ; dl = dl->next) + { + int found = 0; + pos = 0; + + i = 0; + e = get_extents(ddf, dl); + if (!e) continue; + do { + unsigned long long esize; + esize = e[i].start - pos; + if (esize >= minsize) + found = 1; + pos = e[i].start + e[i].size; + i++; + } while (e[i-1].size); + if (found) + dcnt++; + free(e); + } + if (dcnt < raiddisks) { + if (verbose) + fprintf(stderr, + Name ": ddf: Not enough devices with " + "space for this array (%d < %d)\n", + dcnt, raiddisks); + return 0; + } + return 1; + } + /* This device must be a member of the set */ + if (stat(dev, &stb) < 0) + return 0; + if ((S_IFMT & stb.st_mode) != S_IFBLK) + return 0; + for (dl = ddf->dlist ; dl ; dl = dl->next) { + if (dl->major == major(stb.st_rdev) && + dl->minor == minor(stb.st_rdev)) + break; + } + if (!dl) { + if (verbose) + fprintf(stderr, Name ": ddf: %s is not in the " + "same DDF set\n", + dev); + return 0; + } + e = get_extents(ddf, dl); + maxsize = 0; + i = 0; + if (e) do { + unsigned long long esize; + esize = e[i].start - pos; + if (esize >= maxsize) + maxsize = esize; + pos = e[i].start + e[i].size; + i++; + } while (e[i-1].size); + *freesize = maxsize; + // FIXME here I am + + return 1; +} + +static int load_super_ddf_all(struct supertype *st, int fd, + void **sbp, char *devname, int keep_fd) +{ + struct mdinfo *sra; + struct ddf_super *super; + struct mdinfo *sd, *best = NULL; + int bestseq = 0; + int seq; + char nm[20]; + int dfd; + + sra = sysfs_read(fd, 0, GET_LEVEL|GET_VERSION|GET_DEVS|GET_STATE); + if (!sra) + return 1; + if (sra->array.major_version != -1 || + sra->array.minor_version != -2 || + strcmp(sra->text_version, "ddf") != 0) + return 1; + + if (posix_memalign((void**)&super, 512, sizeof(*super)) != 0) + return 1; + memset(super, 0, sizeof(*super)); + + /* first, try each device, and choose the best ddf */ + for (sd = sra->devs ; sd ; sd = sd->next) { + 
int rv; + sprintf(nm, "%d:%d", sd->disk.major, sd->disk.minor); + dfd = dev_open(nm, O_RDONLY); + if (dfd < 0) + return 2; + rv = load_ddf_headers(dfd, super, NULL); + close(dfd); + if (rv == 0) { + seq = __be32_to_cpu(super->active->seq); + if (super->active->openflag) + seq--; + if (!best || seq > bestseq) { + bestseq = seq; + best = sd; + } + } + } + if (!best) + return 1; + /* OK, load this ddf */ + sprintf(nm, "%d:%d", best->disk.major, best->disk.minor); + dfd = dev_open(nm, O_RDONLY); + if (dfd < 0) + return 1; + load_ddf_headers(dfd, super, NULL); + load_ddf_global(dfd, super, NULL); + close(dfd); + /* Now we need the device-local bits */ + for (sd = sra->devs ; sd ; sd = sd->next) { + sprintf(nm, "%d:%d", sd->disk.major, sd->disk.minor); + dfd = dev_open(nm, keep_fd? O_RDWR : O_RDONLY); + if (dfd < 0) + return 2; + seq = load_ddf_local(dfd, super, NULL, keep_fd); + if (!keep_fd) close(dfd); + } + if (st->subarray[0]) { + struct vcl *v; + + for (v = super->conflist; v; v = v->next) + if (v->vcnum == atoi(st->subarray)) + super->currentconf = v; + if (!super->currentconf) + return 1; + } + *sbp = super; + if (st->ss == NULL) { + st->ss = &super_ddf; + st->minor_version = 0; + st->max_devs = 512; + st->container_dev = fd2devnum(fd); + } + return 0; +} +#endif + +static struct mdinfo *container_content_ddf(struct supertype *st) +{ + /* Given a container loaded by load_super_ddf_all, + * extract information about all the arrays into + * an mdinfo tree. + * + * For each vcl in conflist: create an mdinfo, fill it in, + * then look for matching devices (phys_refnum) in dlist + * and create appropriate device mdinfo. 
+ */ + struct ddf_super *ddf = st->sb; + struct mdinfo *rest = NULL; + struct vcl *vc; + + for (vc = ddf->conflist ; vc ; vc=vc->next) + { + int i; + struct mdinfo *this; + this = malloc(sizeof(*this)); + memset(this, 0, sizeof(*this)); + this->next = rest; + rest = this; + + this->array.level = map_num1(ddf_level_num, vc->conf.prl); + this->array.raid_disks = + __be16_to_cpu(vc->conf.prim_elmnt_count); + this->array.layout = rlq_to_layout(vc->conf.rlq, vc->conf.prl, + this->array.raid_disks); + this->array.md_minor = -1; + this->array.ctime = DECADE + + __be32_to_cpu(*(__u32*)(vc->conf.guid+16)); + this->array.utime = DECADE + + __be32_to_cpu(vc->conf.timestamp); + this->array.chunk_size = 512 << vc->conf.chunk_shift; + + i = vc->vcnum; + if ((ddf->virt->entries[i].state & DDF_state_inconsistent) || + (ddf->virt->entries[i].init_state & DDF_initstate_mask) != + DDF_init_full) { + this->array.state = 0; + this->resync_start = 0; + } else { + this->array.state = 1; + this->resync_start = ~0ULL; + } + memcpy(this->name, ddf->virt->entries[i].name, 32); + this->name[33]=0; + + memset(this->uuid, 0, sizeof(this->uuid)); + this->component_size = __be64_to_cpu(vc->conf.blocks); + this->array.size = this->component_size / 2; + this->container_member = i; + + sprintf(this->text_version, "/%s/%d", + devnum2devname(st->container_dev), + this->container_member); + + for (i=0 ; i < ddf->mppe ; i++) { + struct mdinfo *dev; + struct dl *d; + + if (vc->conf.phys_refnum[i] == 0xFFFFFFFF) + continue; + + this->array.working_disks++; + + for (d = ddf->dlist; d ; d=d->next) + if (d->disk.refnum == vc->conf.phys_refnum[i]) + break; + if (d == NULL) + break; + + dev = malloc(sizeof(*dev)); + memset(dev, 0, sizeof(*dev)); + dev->next = this->devs; + this->devs = dev; + + dev->disk.number = __be32_to_cpu(d->disk.refnum); + dev->disk.major = d->major; + dev->disk.minor = d->minor; + dev->disk.raid_disk = i; + dev->disk.state = (1<<MD_DISK_SYNC)|(1<<MD_DISK_ACTIVE); + + dev->events = 
__be32_to_cpu(ddf->primary.seq); + dev->data_offset = __be64_to_cpu(vc->lba_offset[i]); + dev->component_size = __be64_to_cpu(vc->conf.blocks); + if (d->devname) + strcpy(dev->name, d->devname); + } + } + return rest; +} + +static int store_zero_ddf(struct supertype *st, int fd) +{ + unsigned long long dsize; + void *buf; + + if (!get_dev_size(fd, NULL, &dsize)) + return 1; + + posix_memalign(&buf, 512, 512); + memset(buf, 0, 512); + + lseek64(fd, dsize-512, 0); + write(fd, buf, 512); + free(buf); + return 0; +} + +static int compare_super_ddf(struct supertype *st, struct supertype *tst) +{ + /* + * return: + * 0 same, or first was empty, and second was copied + * 1 second had wrong number + * 2 wrong uuid + * 3 wrong other info + */ + struct ddf_super *first = st->sb; + struct ddf_super *second = tst->sb; + + if (!first) { + st->sb = tst->sb; + tst->sb = NULL; + return 0; + } + + if (memcmp(first->anchor.guid, second->anchor.guid, DDF_GUID_LEN) != 0) + return 2; + + /* FIXME should I look at anything else? */ + return 0; +} + +/* + * A new array 'a' has been started which claims to be instance 'inst' + * within container 'c'. + * We need to confirm that the array matches the metadata in 'c' so + * that we don't corrupt any metadata. + */ +static int ddf_open_new(struct supertype *c, struct active_array *a, char *inst) +{ + dprintf("ddf: open_new %s\n", inst); + a->info.container_member = atoi(inst); + return 0; +} + +/* + * The array 'a' is to be marked clean in the metadata. + * If '->resync_start' is not ~(unsigned long long)0, then the array is only + * clean up to the point (in sectors). If that cannot be recorded in the + * metadata, then leave it as dirty. + * + * For DDF, we need to clear the DDF_state_inconsistent bit in the + * !global! virtual_disk.virtual_entry structure. 
+ */ +static void ddf_set_array_state(struct active_array *a, int consistent) +{ + struct ddf_super *ddf = a->container->sb; + int inst = a->info.container_member; + int old = ddf->virt->entries[inst].state; + if (consistent) + ddf->virt->entries[inst].state &= ~DDF_state_inconsistent; + else + ddf->virt->entries[inst].state |= DDF_state_inconsistent; + if (old != ddf->virt->entries[inst].state) + ddf->updates_pending = 1; + + old = ddf->virt->entries[inst].init_state; + ddf->virt->entries[inst].init_state &= ~DDF_initstate_mask; + if (a->resync_start == ~0ULL) + ddf->virt->entries[inst].init_state |= DDF_init_full; + else if (a->resync_start == 0) + ddf->virt->entries[inst].init_state |= DDF_init_not; + else + ddf->virt->entries[inst].init_state |= DDF_init_quick; + if (old != ddf->virt->entries[inst].init_state) + ddf->updates_pending = 1; + + dprintf("ddf mark %d %s %llu\n", inst, consistent?"clean":"dirty", + a->resync_start); +} + +/* + * The state of each disk is stored in the global phys_disk structure + * in phys_disk.entries[n].state. + * This makes various combinations awkward. + * - When a device fails in any array, it must be failed in all arrays + * that include a part of this device. + * - When a component is rebuilding, we cannot include it officially in the + * array unless this is the only array that uses the device. + * + * So: when transitioning: + * Online -> failed, just set failed flag. monitor will propagate + * spare -> online, the device might need to be added to the array. + * spare -> failed, just set failed. Don't worry if in array or not. 
+ */ +static void ddf_set_disk(struct active_array *a, int n, int state) +{ + struct ddf_super *ddf = a->container->sb; + int inst = a->info.container_member; + struct vd_config *vc = find_vdcr(ddf, inst); + int pd = find_phys(ddf, vc->phys_refnum[n]); + int i, st, working; + + if (vc == NULL) { + dprintf("ddf: cannot find instance %d!!\n", inst); + return; + } + if (pd < 0) { + /* disk doesn't currently exist. If it is now in_sync, + * insert it. */ + if ((state & DS_INSYNC) && ! (state & DS_FAULTY)) { + /* Find dev 'n' in a->info->devs, determine the + * ddf refnum, and set vc->phys_refnum and update + * phys->entries[] + */ + /* FIXME */ + } + } else { + int old = ddf->phys->entries[pd].state; + if (state & DS_FAULTY) + ddf->phys->entries[pd].state |= __cpu_to_be16(DDF_Failed); + if (state & DS_INSYNC) { + ddf->phys->entries[pd].state |= __cpu_to_be16(DDF_Online); + ddf->phys->entries[pd].state &= __cpu_to_be16(~DDF_Rebuilding); + } + if (old != ddf->phys->entries[pd].state) + ddf->updates_pending = 1; + } + + dprintf("ddf: set_disk %d to %x\n", n, state); + + /* Now we need to check the state of the array and update + * virtual_disk.entries[n].state. + * It needs to be one of "optimal", "degraded", "failed". + * I don't understand 'deleted' or 'missing'. 
+ */ + working = 0; + for (i=0; i < a->info.array.raid_disks; i++) { + pd = find_phys(ddf, vc->phys_refnum[i]); + if (pd < 0) + continue; + st = __be16_to_cpu(ddf->phys->entries[pd].state); + if ((st & (DDF_Online|DDF_Failed|DDF_Rebuilding)) + == DDF_Online) + working++; + } + state = DDF_state_degraded; + if (working == a->info.array.raid_disks) + state = DDF_state_optimal; + else switch(vc->prl) { + case DDF_RAID0: + case DDF_CONCAT: + case DDF_JBOD: + state = DDF_state_failed; + break; + case DDF_RAID1: + if (working == 0) + state = DDF_state_failed; + break; + case DDF_RAID4: + case DDF_RAID5: + if (working < a->info.array.raid_disks-1) + state = DDF_state_failed; + break; + case DDF_RAID6: + if (working < a->info.array.raid_disks-2) + state = DDF_state_failed; + else if (working == a->info.array.raid_disks-1) + state = DDF_state_part_optimal; + break; + } + + if (ddf->virt->entries[inst].state != + ((ddf->virt->entries[inst].state & ~DDF_state_mask) + | state)) { + + ddf->virt->entries[inst].state = + (ddf->virt->entries[inst].state & ~DDF_state_mask) + | state; + ddf->updates_pending = 1; + } + +} + +static void ddf_sync_metadata(struct supertype *st) +{ + + /* + * Write all data to all devices. + * Later, we might be able to track whether only local changes + * have been made, or whether any global data has been changed, + * but ddf is sufficiently weird that it probably always + * changes global data .... + */ + struct ddf_super *ddf = st->sb; + if (!ddf->updates_pending) + return; + ddf->updates_pending = 0; + __write_init_super_ddf(st, 0); + dprintf("ddf: sync_metadata\n"); +} + +static void ddf_process_update(struct supertype *st, + struct metadata_update *update) +{ + /* Apply this update to the metadata. + * The first 4 bytes are a DDF_*_MAGIC which guides + * our actions. + * Possible update are: + * DDF_PHYS_RECORDS_MAGIC + * Add a new physical device. Changes to this record + * only happen implicitly. + * used_pdes is the device number. 
+ * DDF_VIRT_RECORDS_MAGIC + * Add a new VD. Possibly also change the 'access' bits. + * populated_vdes is the entry number. + * DDF_VD_CONF_MAGIC + * New or updated VD. the VIRT_RECORD must already + * exist. For an update, phys_refnum and lba_offset + * (at least) are updated, and the VD_CONF must + * be written to precisely those devices listed with + * a phys_refnum. + * DDF_SPARE_ASSIGN_MAGIC + * replacement Spare Assignment Record... but for which device? + * + * So, e.g.: + * - to create a new array, we send a VIRT_RECORD and + * a VD_CONF. Then assemble and start the array. + * - to activate a spare we send a VD_CONF to add the phys_refnum + * and offset. This will also mark the spare as active with + * a spare-assignment record. + */ + struct ddf_super *ddf = st->sb; + __u32 *magic = (__u32*)update->buf; + struct phys_disk *pd; + struct virtual_disk *vd; + struct vd_config *vc; + struct vcl *vcl; + struct dl *dl; + int mppe; + int ent; + + dprintf("Process update %x\n", *magic); + + switch (*magic) { + case DDF_PHYS_RECORDS_MAGIC: + + if (update->len != (sizeof(struct phys_disk) + + sizeof(struct phys_disk_entry))) + return; + pd = (struct phys_disk*)update->buf; + + ent = __be16_to_cpu(pd->used_pdes); + if (ent >= __be16_to_cpu(ddf->phys->max_pdes)) + return; + if (!all_ff(ddf->phys->entries[ent].guid)) + return; + ddf->phys->entries[ent] = pd->entries[0]; + ddf->phys->used_pdes = __cpu_to_be16(1 + + __be16_to_cpu(ddf->phys->used_pdes)); + ddf->updates_pending = 1; + break; + + case DDF_VIRT_RECORDS_MAGIC: + + if (update->len != (sizeof(struct virtual_disk) + + sizeof(struct virtual_entry))) + return; + vd = (struct virtual_disk*)update->buf; + + ent = __be16_to_cpu(vd->populated_vdes); + if (ent >= __be16_to_cpu(ddf->virt->max_vdes)) + return; + if (!all_ff(ddf->virt->entries[ent].guid)) + return; + ddf->virt->entries[ent] = vd->entries[0]; + ddf->virt->populated_vdes = __cpu_to_be16(1 + + __be16_to_cpu(ddf->virt->populated_vdes)); + ddf->updates_pending 
= 1; + break; + + case DDF_VD_CONF_MAGIC: + dprintf("len %d %d\n", update->len, ddf->conf_rec_len); + + mppe = __be16_to_cpu(ddf->anchor.max_primary_element_entries); + if (update->len != ddf->conf_rec_len * 512) + return; + vc = (struct vd_config*)update->buf; + for (vcl = ddf->conflist; vcl ; vcl = vcl->next) + if (memcmp(vcl->conf.guid, vc->guid, DDF_GUID_LEN) == 0) + break; + dprintf("vcl = %p\n", vcl); + if (vcl) { + /* An update, just copy the phys_refnum and lba_offset + * fields + */ + memcpy(vcl->conf.phys_refnum, vc->phys_refnum, + mppe * (sizeof(__u32) + sizeof(__u64))); + } else { + /* A new VD_CONF */ + vcl = update->space; + update->space = NULL; + vcl->next = ddf->conflist; + memcpy(&vcl->conf, vc, update->len); + vcl->lba_offset = (__u64*) + &vcl->conf.phys_refnum[mppe]; + ddf->conflist = vcl; + } + /* Now make sure vlist is correct for each dl. */ + for (dl = ddf->dlist; dl; dl = dl->next) { + int dn; + int vn = 0; + for (vcl = ddf->conflist; vcl ; vcl = vcl->next) + for (dn=0; dn < ddf->mppe ; dn++) + if (vcl->conf.phys_refnum[dn] == + dl->disk.refnum) { + dprintf("dev %d has %p at %d\n", + dl->pdnum, vcl, vn); + dl->vlist[vn++] = vcl; + break; + } + while (vn < ddf->max_part) + dl->vlist[vn++] = NULL; + if (dl->vlist[0]) { + ddf->phys->entries[dl->pdnum].type &= + ~__cpu_to_be16(DDF_Global_Spare); + ddf->phys->entries[dl->pdnum].type |= + __cpu_to_be16(DDF_Active_in_VD); + } + if (dl->spare) { + ddf->phys->entries[dl->pdnum].type &= + ~__cpu_to_be16(DDF_Global_Spare); + ddf->phys->entries[dl->pdnum].type |= + __cpu_to_be16(DDF_Spare); + } + if (!dl->vlist[0] && !dl->spare) { + ddf->phys->entries[dl->pdnum].type |= + __cpu_to_be16(DDF_Global_Spare); + ddf->phys->entries[dl->pdnum].type &= + ~__cpu_to_be16(DDF_Spare | + DDF_Active_in_VD); + } + } + ddf->updates_pending = 1; + break; + case DDF_SPARE_ASSIGN_MAGIC: + default: break; + } +} + +static void ddf_prepare_update(struct supertype *st, + struct metadata_update *update) +{ + /* This update 
arrived at managemon.
+	 * We are about to pass it to monitor.
+	 * If a malloc is needed, do it here.
+	 *
+	 * Only a DDF_VD_CONF_MAGIC update needs space: ddf_process_update
+	 * takes update->space as the new 'struct vcl' when the GUID in the
+	 * update is not yet on the conflist.
+	 */
+	struct ddf_super *ddf = st->sb;
+	__u32 *magic = (__u32*)update->buf;
+	if (*magic == DDF_VD_CONF_MAGIC &&
+	    posix_memalign(&update->space, 512,
+			   offsetof(struct vcl, conf)
+			   + ddf->conf_rec_len * 512) != 0)
+		/* allocation failed: do not leave update->space holding
+		 * whatever value it had before the call
+		 */
+		update->space = NULL;
+}
+
+/*
+ * Check if the array 'a' is degraded but not failed.
+ * If it is, find as many spares as are available and needed and
+ * arrange for their inclusion.
+ * We only choose devices which are not already in the array,
+ * and prefer those with a spare-assignment to this array.
+ * otherwise we choose global spares - assuming always that
+ * there is enough room.
+ * For each spare that we assign, we return an 'mdinfo' which
+ * describes the position for the device in the array.
+ * We also add to 'updates' a DDF_VD_CONF_MAGIC update with
+ * the new phys_refnum and lba_offset values.
+ *
+ * Returns the list of chosen spares (linked through ->next), or NULL
+ * when a faulty device is still attached, the array is not degraded,
+ * the array has failed, no suitable spare has room, or memory for the
+ * update record cannot be allocated.
+ *
+ * Only worry about BVDs at the moment.
+ */
+static struct mdinfo *ddf_activate_spare(struct active_array *a,
+					 struct metadata_update **updates)
+{
+	int working = 0;
+	struct mdinfo *d;
+	struct ddf_super *ddf = a->container->sb;
+	int global_ok = 0;
+	struct mdinfo *rv = NULL;
+	struct mdinfo *di;
+	struct metadata_update *mu;
+	struct dl *dl;
+	int i;
+	struct vd_config *vc;
+	__u64 *lba;
+
+	for (d = a->info.devs ; d ; d = d->next) {
+		if ((d->curr_state & DS_FAULTY) &&
+			d->state_fd >= 0)
+			/* wait for Removal to happen */
+			return NULL;
+		if (d->state_fd >= 0)
+			working ++;
+	}
+
+	dprintf("ddf_activate: working=%d (%d) level=%d\n", working, a->info.array.raid_disks,
+		a->info.array.level);
+	if (working == a->info.array.raid_disks)
+		return NULL; /* array not degraded */
+	switch (a->info.array.level) {
+	case 1:
+		if (working == 0)
+			return NULL; /* failed */
+		break;
+	case 4:
+	case 5:
+		if (working < a->info.array.raid_disks - 1)
+			return NULL; /* failed */
+		break;
+	case 6:
+		if (working < a->info.array.raid_disks - 2)
+			return NULL; /* failed */
+		break;
+	default: /* concat or stripe */
+		return NULL; /* failed */
+	}
+
+	/* For each slot, if it is not working, find a spare */
+	dl = ddf->dlist;
+	for (i = 0; i < a->info.array.raid_disks; i++) {
+		for (d = a->info.devs ; d ; d = d->next)
+			if (d->disk.raid_disk == i)
+				break;
+		dprintf("found %d: %p %x\n", i, d, d?d->curr_state:0);
+		if (d && (d->state_fd >= 0))
+			continue;
+
+		/* OK, this device needs recovery. Find a spare */
+	again:
+		for ( ; dl ; dl = dl->next) {
+			unsigned long long esize;
+			unsigned long long pos;
+			struct mdinfo *d2;
+			int is_global = 0;
+			int is_dedicated = 0;
+			struct extent *ex;
+			int j;
+			/* If in this array, skip */
+			for (d2 = a->info.devs ; d2 ; d2 = d2->next)
+				if (d2->disk.major == dl->major &&
+				    d2->disk.minor == dl->minor) {
+					dprintf("%x:%x already in array\n", dl->major, dl->minor);
+					break;
+				}
+			if (d2)
+				continue;
+			/* Also skip a device already chosen for another slot
+			 * in this pass: the scan resumes at the device picked
+			 * last (and restarts from the head for global spares),
+			 * and a->info.devs does not list it yet.
+			 */
+			for (d2 = rv ; d2 ; d2 = d2->next)
+				if (d2->disk.major == dl->major &&
+				    d2->disk.minor == dl->minor)
+					break;
+			if (d2)
+				continue;
+			if (ddf->phys->entries[dl->pdnum].type &
+			    __cpu_to_be16(DDF_Spare)) {
+				/* Check spare assign record */
+				if (dl->spare) {
+					if (dl->spare->type & DDF_spare_dedicated) {
+						/* check spare_ents for guid */
+						for (j = 0 ;
+						     j < __be16_to_cpu(dl->spare->populated);
+						     j++) {
+							if (memcmp(dl->spare->spare_ents[j].guid,
+								   ddf->virt->entries[a->info.container_member].guid,
+								   DDF_GUID_LEN) == 0)
+								is_dedicated = 1;
+						}
+					} else
+						is_global = 1;
+				}
+			} else if (ddf->phys->entries[dl->pdnum].type &
+				   __cpu_to_be16(DDF_Global_Spare)) {
+				is_global = 1;
+			}
+			if ( ! (is_dedicated ||
+				(is_global && global_ok))) {
+				dprintf("%x:%x not suitable: %d %d\n", dl->major, dl->minor,
+				       is_dedicated, is_global);
+				continue;
+			}
+
+			/* We are allowed to use this device - is there space?
+			 * We need a->info.component_size sectors */
+			ex = get_extents(ddf, dl);
+			if (!ex) {
+				dprintf("cannot get extents\n");
+				continue;
+			}
+			j = 0; pos = 0;
+			esize = 0;
+
+			/* Walk the used extents looking for a large enough gap.
+			 * The walk must use its own index 'j'; 'i' is the slot
+			 * being filled and must not be disturbed here.  The
+			 * list ends with an entry whose size is 0.
+			 */
+			do {
+				esize = ex[j].start - pos;
+				if (esize >= a->info.component_size)
+					break;
+				pos = ex[j].start + ex[j].size;
+				j++;
+			} while (ex[j-1].size);
+
+			free(ex);
+			if (esize < a->info.component_size) {
+				dprintf("%x:%x has no room: %llu %llu\n", dl->major, dl->minor,
+					esize, a->info.component_size);
+				/* No room */
+				continue;
+			}
+
+			/* Cool, we have a device with some space at pos */
+			di = malloc(sizeof(*di));
+			if (!di)
+				/* out of memory: leave this slot unfilled */
+				break;
+			memset(di, 0, sizeof(*di));
+			di->disk.number = i;
+			di->disk.raid_disk = i;
+			di->disk.major = dl->major;
+			di->disk.minor = dl->minor;
+			di->disk.state = 0;
+			di->data_offset = pos;
+			di->component_size = a->info.component_size;
+			/* remember which phys_disk entry this spare is */
+			di->container_member = dl->pdnum;
+			di->next = rv;
+			rv = di;
+			dprintf("%x:%x to be %d at %llu\n", dl->major, dl->minor,
+				i, pos);
+
+			break;
+		}
+		if (!dl && ! global_ok) {
+			/* not enough dedicated spares, try global */
+			global_ok = 1;
+			dl = ddf->dlist;
+			goto again;
+		}
+	}
+
+	if (!rv)
+		/* No spares found */
+		return rv;
+	/* Now 'rv' has a list of devices to return.
+	 * Create a metadata_update record to update the
+	 * phys_refnum and lba_offset values
+	 */
+	mu = malloc(sizeof(*mu));
+	if (mu)
+		mu->buf = malloc(ddf->conf_rec_len * 512);
+	if (!mu || !mu->buf) {
+		/* Without the update record the metadata cannot follow
+		 * the change, so do not hand out any spares either.
+		 */
+		free(mu);
+		while (rv) {
+			di = rv;
+			rv = rv->next;
+			free(di);
+		}
+		return NULL;
+	}
+	if (posix_memalign(&mu->space, 512, sizeof(struct vcl)) != 0)
+		mu->space = NULL;
+	/* ddf_process_update only accepts a VD_CONF update whose length
+	 * is conf_rec_len * 512 bytes, so the length is in bytes.
+	 */
+	mu->len = ddf->conf_rec_len * 512;
+	mu->next = *updates;
+	vc = find_vdcr(ddf, a->info.container_member);
+	memcpy(mu->buf, vc, ddf->conf_rec_len * 512);
+
+	vc = (struct vd_config*)mu->buf;
+	lba = (__u64*)&vc->phys_refnum[ddf->mppe];
+	for (di = rv ; di ; di = di->next) {
+		/* 'dl' is stale (possibly NULL) once the slot loop is done;
+		 * use the phys_disk index recorded for each chosen spare.
+		 */
+		vc->phys_refnum[di->disk.raid_disk] =
+			ddf->phys->entries[di->container_member].refnum;
+		/* NOTE(review): stored in CPU byte order while other DDF
+		 * fields here go through __cpu_to_be16() - confirm whether
+		 * a big-endian conversion is needed.
+		 */
+		lba[di->disk.raid_disk] = di->data_offset;
+	}
+	*updates = mu;
+	return rv;
+}
+
+struct superswitch super_ddf = {
+#ifndef MDASSEMBLE
+	.examine_super = examine_super_ddf,
+	.brief_examine_super = brief_examine_super_ddf,
+	.detail_super = detail_super_ddf,
+	.brief_detail_super = brief_detail_super_ddf,
+	.validate_geometry = validate_geometry_ddf,
+	.write_init_super = write_init_super_ddf,
+#endif
+	.match_home = match_home_ddf,
+	.uuid_from_super= uuid_from_super_ddf,
+	.getinfo_super = getinfo_super_ddf,
+	.update_super = update_super_ddf,
+
+	.avail_size = avail_size_ddf,
+
+	.compare_super = compare_super_ddf,
+
+	.load_super = load_super_ddf,
+	.init_super = init_super_ddf,
+	.store_super = store_zero_ddf,
+	.free_super = free_super_ddf,
+	.match_metadata_desc = match_metadata_desc_ddf,
+	.add_to_super = add_to_super_ddf,
+	.container_content = container_content_ddf,
+
+	.external = 1,
+
+/* for mdmon */
+	.open_new = ddf_open_new,
+	.set_array_state= ddf_set_array_state,
+	.set_disk = ddf_set_disk,
+	.sync_metadata = ddf_sync_metadata,
+	.process_update = ddf_process_update,
+	.prepare_update = ddf_prepare_update,
+	.activate_spare = ddf_activate_spare,
+
+};
diff --git a/super-intel.c b/super-intel.c
new file mode 100644
index 00000000..caa3881b
--- /dev/null
+++ b/super-intel.c
@@ -0,0 +1,2552 @@
+/*
+ * mdadm - Intel(R) Matrix Storage Manager Support
+ *
+ * Copyright (C) 2002-2007 Intel
Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include "mdadm.h"
+#include "mdmon.h"
+#include <values.h>
+#include <scsi/sg.h>
+#include <ctype.h>
+
+/* MPB == Metadata Parameter Block */
+#define MPB_SIGNATURE "Intel Raid ISM Cfg Sig. "
+/* Length of the signature text (computed with strlen() at every use).
+ * The version string is read from sig[] starting at this offset, see
+ * get_imsm_version().
+ */
+#define MPB_SIG_LEN (strlen(MPB_SIGNATURE))
+#define MPB_VERSION_RAID0 "1.0.00"
+#define MPB_VERSION_RAID1 "1.1.00"
+#define MPB_VERSION_RAID5 "1.2.02"
+#define MAX_SIGNATURE_LENGTH 32
+#define MAX_RAID_SERIAL_LEN 16
+/* MPB_SECTOR_CNT + IMSM_RESERVED_SECTORS is what avail_size_imsm()
+ * subtracts from a device's size; both are sector counts.
+ */
+#define MPB_SECTOR_CNT 418
+#define IMSM_RESERVED_SECTORS 4096
+
+/* Disk configuration info.
+ * On-disk record; the 32-bit fields are read with __le32_to_cpu().
+ */
+#define IMSM_MAX_DEVICES 255
+struct imsm_disk {
+	__u8 serial[MAX_RAID_SERIAL_LEN];/* 0xD8 - 0xE7 ascii serial number */
+	__u32 total_blocks; /* 0xE8 - 0xEB total blocks */
+	__u32 scsi_id; /* 0xEC - 0xEF scsi ID */
+	__u32 status; /* 0xF0 - 0xF3 */
+#define SPARE_DISK 0x01 /* Spare */
+#define CONFIGURED_DISK 0x02 /* Member of some RaidDev */
+#define FAILED_DISK 0x04 /* Permanent failure */
+#define USABLE_DISK 0x08 /* Fully usable unless FAILED_DISK is set */
+
+#define IMSM_DISK_FILLERS 5
+	__u32 filler[IMSM_DISK_FILLERS]; /* 0xF4 - 0x107 MPB_DISK_FILLERS for future expansion */
+};
+
+/* RAID map configuration info.
+ * Variable length: disk_ord_tbl[] really holds num_members entries.
+ */
+struct imsm_map {
+	__u32 pba_of_lba0; /* start address of partition */
+	__u32 blocks_per_member;/* blocks per member */
+	__u32 num_data_stripes; /* number of data stripes */
+	__u16 blocks_per_strip;
+	__u8 map_state; /* Normal, Uninitialized, Degraded, Failed */
+#define IMSM_T_STATE_NORMAL 0
+#define IMSM_T_STATE_UNINITIALIZED 1
+#define IMSM_T_STATE_DEGRADED 2 /* FIXME: is this correct? */
+#define IMSM_T_STATE_FAILED 3 /* FIXME: is this correct? */
+	__u8 raid_level;
+#define IMSM_T_RAID0 0
+#define IMSM_T_RAID1 1
+#define IMSM_T_RAID5 5 /* since metadata version 1.2.02 ? */
+	__u8 num_members; /* number of member disks */
+	__u8 reserved[3];
+	__u32 filler[7]; /* expansion area */
+	__u32 disk_ord_tbl[1]; /* disk_ord_tbl[num_members],
+				  top byte special */
+} __attribute__ ((packed));
+
+/* Per-volume state; holds the first map, a second one follows it when
+ * migr_state is set.
+ */
+struct imsm_vol {
+	__u32 reserved[2];
+	__u8 migr_state; /* Normal or Migrating */
+	__u8 migr_type; /* Initializing, Rebuilding, ... */
+	__u8 dirty;
+	__u8 fill[1];
+	__u32 filler[5];
+	struct imsm_map map[1];
+	/* here comes another one if migr_state */
+} __attribute__ ((packed));
+
+/* One raid device (volume) record; variable length because of the
+ * trailing map(s) in 'vol', see sizeof_imsm_dev().
+ */
+struct imsm_dev {
+	__u8 volume[MAX_RAID_SERIAL_LEN];
+	__u32 size_low;
+	__u32 size_high;
+	__u32 status; /* Persistent RaidDev status */
+	__u32 reserved_blocks; /* Reserved blocks at beginning of volume */
+#define IMSM_DEV_FILLERS 12
+	__u32 filler[IMSM_DEV_FILLERS];
+	struct imsm_vol vol;
+} __attribute__ ((packed));
+
+/* The MPB anchor: fixed header, then the disk table, then the raid
+ * device records.
+ */
+struct imsm_super {
+	__u8 sig[MAX_SIGNATURE_LENGTH]; /* 0x00 - 0x1F */
+	__u32 check_sum; /* 0x20 - 0x23 MPB Checksum */
+	__u32 mpb_size; /* 0x24 - 0x27 Size of MPB */
+	__u32 family_num; /* 0x28 - 0x2B Checksum from first time this config was written */
+	__u32 generation_num; /* 0x2C - 0x2F Incremented each time this array's MPB is written */
+	__u32 reserved[2]; /* 0x30 - 0x37 */
+	__u8 num_disks; /* 0x38 Number of configured disks */
+	__u8 num_raid_devs; /* 0x39 Number of configured volumes */
+	__u8 fill[2]; /* 0x3A - 0x3B */
+#define IMSM_FILLERS 39
+
__u32 filler[IMSM_FILLERS]; /* 0x3C - 0xD7 RAID_MPB_FILLERS */ + struct imsm_disk disk[1]; /* 0xD8 diskTbl[numDisks] */ + /* here comes imsm_dev[num_raid_devs] */ +} __attribute__ ((packed)); + +#ifndef MDASSEMBLE +static char *map_state_str[] = { "normal", "uninitialized", "degraded", "failed" }; +#endif + +static unsigned int sector_count(__u32 bytes) +{ + return ((bytes + (512-1)) & (~(512-1))) / 512; +} + +static unsigned int mpb_sectors(struct imsm_super *mpb) +{ + return sector_count(__le32_to_cpu(mpb->mpb_size)); +} + +/* internal representation of IMSM metadata */ +struct intel_super { + union { + void *buf; /* O_DIRECT buffer for reading/writing metadata */ + struct imsm_super *anchor; /* immovable parameters */ + }; + size_t len; /* size of the 'buf' allocation */ + int updates_pending; /* count of pending updates for mdmon */ + int creating_imsm; /* flag to indicate container creation */ + int current_vol; /* index of raid device undergoing creation */ + #define IMSM_MAX_DISKS 6 + struct imsm_disk *disk_tbl[IMSM_MAX_DISKS]; + #define IMSM_MAX_RAID_DEVS 2 + struct imsm_dev *dev_tbl[IMSM_MAX_RAID_DEVS]; + struct dl { + struct dl *next; + int index; + __u8 serial[MAX_RAID_SERIAL_LEN]; + int major, minor; + char *devname; + int fd; + } *disks; +}; + +struct extent { + unsigned long long start, size; +}; + +/* definition of messages passed to imsm_process_update */ +enum imsm_update_type { + update_activate_spare, + update_create_array, +}; + +struct imsm_update_activate_spare { + enum imsm_update_type type; + int disk_idx; + int slot; + int array; + struct imsm_update_activate_spare *next; +}; + +struct imsm_update_create_array { + enum imsm_update_type type; + struct imsm_dev dev; + int dev_idx; +}; + +static int imsm_env_devname_as_serial(void) +{ + char *val = getenv("IMSM_DEVNAME_AS_SERIAL"); + + if (val && atoi(val) == 1) + return 1; + + return 0; +} + + +static struct supertype *match_metadata_desc_imsm(char *arg) +{ + struct supertype *st; + + if 
(strcmp(arg, "imsm") != 0 && + strcmp(arg, "default") != 0 + ) + return NULL; + + st = malloc(sizeof(*st)); + memset(st, 0, sizeof(*st)); + st->ss = &super_imsm; + st->max_devs = IMSM_MAX_DEVICES; + st->minor_version = 0; + st->sb = NULL; + return st; +} + +static __u8 *get_imsm_version(struct imsm_super *mpb) +{ + return &mpb->sig[MPB_SIG_LEN]; +} + +/* retrieve a disk directly from the anchor when the anchor is known to be + * up-to-date, currently only at load time + */ +static struct imsm_disk *__get_imsm_disk(struct imsm_super *mpb, __u8 index) +{ + if (index >= mpb->num_disks) + return NULL; + return &mpb->disk[index]; +} + +static struct imsm_disk *get_imsm_disk(struct intel_super *super, __u8 index) +{ + if (index >= super->anchor->num_disks) + return NULL; + return super->disk_tbl[index]; +} + +/* generate a checksum directly from the anchor when the anchor is known to be + * up-to-date, currently only at load or write_super after coalescing + */ +static __u32 __gen_imsm_checksum(struct imsm_super *mpb) +{ + __u32 end = mpb->mpb_size / sizeof(end); + __u32 *p = (__u32 *) mpb; + __u32 sum = 0; + + while (end--) + sum += __le32_to_cpu(*p++); + + return sum - __le32_to_cpu(mpb->check_sum); +} + +static size_t sizeof_imsm_dev(struct imsm_dev *dev) +{ + size_t size = sizeof(*dev); + + /* each map has disk_ord_tbl[num_members - 1] additional space */ + size += sizeof(__u32) * (dev->vol.map[0].num_members - 1); + + /* migrating means an additional map */ + if (dev->vol.migr_state) { + size += sizeof(struct imsm_map); + size += sizeof(__u32) * (dev->vol.map[1].num_members - 1); + } + + return size; +} + +static struct imsm_dev *__get_imsm_dev(struct imsm_super *mpb, __u8 index) +{ + int offset; + int i; + void *_mpb = mpb; + + if (index >= mpb->num_raid_devs) + return NULL; + + /* devices start after all disks */ + offset = ((void *) &mpb->disk[mpb->num_disks]) - _mpb; + + for (i = 0; i <= index; i++) + if (i == index) + return _mpb + offset; + else + offset += 
sizeof_imsm_dev(_mpb + offset);
+
+	return NULL;
+}
+
+/* retrieve a raid device from the in-memory table, NULL if 'index' is
+ * not below the anchor's num_raid_devs
+ */
+static struct imsm_dev *get_imsm_dev(struct intel_super *super, __u8 index)
+{
+	if (index >= super->anchor->num_raid_devs)
+		return NULL;
+	return super->dev_tbl[index];
+}
+
+/* Return the disk index stored for 'slot' in the map's disk_ord_tbl,
+ * with the top byte cleared.
+ */
+static __u32 get_imsm_disk_idx(struct imsm_map *map, int slot)
+{
+	__u32 *ord_tbl = &map->disk_ord_tbl[slot];
+
+	/* top byte is 'special': convert to CPU order first, then mask,
+	 * so that the same byte is dropped on every host byte order
+	 */
+	return __le32_to_cpu(*ord_tbl) & ~(0xffU << 24);
+}
+
+/* md level for a map: raid_level 1 with other than 2 members is
+ * reported as 10, everything else is passed through
+ */
+static int get_imsm_raid_level(struct imsm_map *map)
+{
+	if (map->raid_level == 1) {
+		if (map->num_members == 2)
+			return 1;
+		else
+			return 10;
+	}
+
+	return map->raid_level;
+}
+
+/* qsort() comparator: orders extents by ascending start */
+static int cmp_extent(const void *av, const void *bv)
+{
+	const struct extent *a = av;
+	const struct extent *b = bv;
+	if (a->start < b->start)
+		return -1;
+	if (a->start > b->start)
+		return 1;
+	return 0;
+}
+
+static struct extent *get_extents(struct intel_super *super, struct dl *dl)
+{
+	/* find a list of used extents on the given physical device */
+	struct imsm_disk *disk;
+	struct extent *rv, *e;
+	int i, j;
+	int memberships = 0;
+
+	disk = get_imsm_disk(super, dl->index);
+	if (!disk)
+		return NULL;
+
+	for (i = 0; i < super->anchor->num_raid_devs; i++) {
+		struct imsm_dev *dev = get_imsm_dev(super, i);
+		struct imsm_map *map = dev->vol.map;
+
+		for (j = 0; j < map->num_members; j++) {
+			__u32 index = get_imsm_disk_idx(map, j);
+
+			if (index == dl->index)
+				memberships++;
+		}
+	}
+	rv = malloc(sizeof(struct extent) * (memberships + 1));
+	if (!rv)
+		return NULL;
+	e = rv;
+
+	for (i = 0; i < super->anchor->num_raid_devs; i++) {
+		struct imsm_dev *dev = get_imsm_dev(super, i);
+		struct imsm_map *map = dev->vol.map;
+
+		for (j = 0; j < map->num_members; j++) {
+			__u32 index = get_imsm_disk_idx(map, j);
+
+			if (index == dl->index) {
+				e->start = __le32_to_cpu(map->pba_of_lba0);
+				e->size = __le32_to_cpu(map->blocks_per_member);
+				e++;
+			}
+		}
+	}
+	qsort(rv, memberships, sizeof(*rv), cmp_extent);
+
+	e->start = __le32_to_cpu(disk->total_blocks) -
+
(MPB_SECTOR_CNT + IMSM_RESERVED_SECTORS);
+	e->size = 0;
+	return rv;
+}
+
+#ifndef MDASSEMBLE
+/* Print one raid device record.  'index' is the disk index of the
+ * device being examined and is used to report its slot in the map.
+ */
+static void print_imsm_dev(struct imsm_dev *dev, int index)
+{
+	__u64 sz;
+	int slot;
+	struct imsm_map *map = dev->vol.map;
+	const char *state = "unknown";
+
+	printf("\n");
+	/* volume[] is a fixed-width field and need not be NUL terminated */
+	printf("[%.*s]:\n", MAX_RAID_SERIAL_LEN, dev->volume);
+	printf(" RAID Level : %d\n", get_imsm_raid_level(map));
+	printf(" Members : %d\n", map->num_members);
+	for (slot = 0; slot < map->num_members; slot++)
+		if (index == get_imsm_disk_idx(map, slot))
+			break;
+	if (slot < map->num_members)
+		printf(" This Slot : %d\n", slot);
+	else
+		printf(" This Slot : ?\n");
+	sz = __le32_to_cpu(dev->size_high);
+	sz <<= 32;
+	sz += __le32_to_cpu(dev->size_low);
+	printf(" Array Size : %llu%s\n", (unsigned long long)sz,
+	       human_size(sz * 512));
+	sz = __le32_to_cpu(map->blocks_per_member);
+	printf(" Per Dev Size : %llu%s\n", (unsigned long long)sz,
+	       human_size(sz * 512));
+	printf(" Sector Offset : %u\n",
+	       __le32_to_cpu(map->pba_of_lba0));
+	printf(" Num Stripes : %u\n",
+	       __le32_to_cpu(map->num_data_stripes));
+	printf(" Chunk Size : %u KiB\n",
+	       __le16_to_cpu(map->blocks_per_strip) / 2);
+	printf(" Reserved : %d\n", __le32_to_cpu(dev->reserved_blocks));
+	printf(" Migrate State : %s\n", dev->vol.migr_state ? "migrating" : "idle");
+	printf(" Dirty State : %s\n", dev->vol.dirty ? "dirty" : "clean");
+	/* map_state is a raw byte from the metadata; only index the
+	 * name table with values it actually has an entry for
+	 */
+	if (map->map_state < sizeof(map_state_str) / sizeof(map_state_str[0]))
+		state = map_state_str[map->map_state];
+	printf(" Map State : %s\n", state);
+}
+
+/* Print the disk table entry 'index' of the anchor.  Does nothing for
+ * a negative index or one that is not a configured disk.
+ */
+static void print_imsm_disk(struct imsm_super *mpb, int index)
+{
+	struct imsm_disk *disk;
+	char str[MAX_RAID_SERIAL_LEN + 1];
+	__u32 s;
+	__u64 sz;
+
+	if (index < 0)
+		return;
+	disk = __get_imsm_disk(mpb, index);
+	if (!disk)
+		return;
+
+	printf("\n");
+	/* serial[] is a fixed-width field: bound the read with a precision
+	 * instead of relying on a terminating NUL
+	 */
+	snprintf(str, sizeof(str), "%.*s", MAX_RAID_SERIAL_LEN,
+		 (char *) disk->serial);
+	printf(" Disk%02d Serial : %s\n", index, str);
+	s = __le32_to_cpu(disk->status);
+	printf(" State :%s%s%s%s\n", s&SPARE_DISK ? " spare" : "",
+	       s&CONFIGURED_DISK ? " active" : "",
+	       s&FAILED_DISK ? " failed" : "",
+	       s&USABLE_DISK ? " usable" : "");
+	printf(" Id : %08x\n", __le32_to_cpu(disk->scsi_id));
+	/* NOTE(review): the reservation is scaled by num_raid_devs here
+	 * but not in avail_size_imsm() - confirm which is intended.
+	 */
+	sz = __le32_to_cpu(disk->total_blocks) -
+	     (MPB_SECTOR_CNT + IMSM_RESERVED_SECTORS * mpb->num_raid_devs);
+	printf(" Usable Size : %llu%s\n", (unsigned long long)sz,
+	       human_size(sz * 512));
+}
+
+/* Print the anchor, the disk this superblock was loaded from, every
+ * raid device, and then the remaining disks.
+ */
+static void examine_super_imsm(struct supertype *st, char *homehost)
+{
+	struct intel_super *super = st->sb;
+	struct imsm_super *mpb = super->anchor;
+	char str[MAX_SIGNATURE_LENGTH];
+	int i;
+	__u32 sum;
+
+	/* sig[] is a fixed-width field, so never read past its end */
+	snprintf(str, MPB_SIG_LEN, "%.*s", MAX_SIGNATURE_LENGTH,
+		 (char *) mpb->sig);
+	printf(" Magic : %s\n", str);
+	/* the version occupies the rest of sig[] after the signature */
+	printf(" Version : %.*s\n", (int) (MAX_SIGNATURE_LENGTH - MPB_SIG_LEN),
+	       (char *) get_imsm_version(mpb));
+	printf(" Family : %08x\n", __le32_to_cpu(mpb->family_num));
+	printf(" Generation : %08x\n", __le32_to_cpu(mpb->generation_num));
+	sum = __le32_to_cpu(mpb->check_sum);
+	printf(" Checksum : %08x %s\n", sum,
+	       __gen_imsm_checksum(mpb) == sum ? "correct" : "incorrect");
+	printf(" MPB Sectors : %d\n", mpb_sectors(mpb));
+	printf(" Disks : %d\n", mpb->num_disks);
+	printf(" RAID Devices : %d\n", mpb->num_raid_devs);
+	print_imsm_disk(mpb, super->disks->index);
+	for (i = 0; i < mpb->num_raid_devs; i++)
+		print_imsm_dev(__get_imsm_dev(mpb, i), super->disks->index);
+	for (i = 0; i < mpb->num_disks; i++) {
+		if (i == super->disks->index)
+			continue;
+		print_imsm_disk(mpb, i);
+	}
+}
+
+static void brief_examine_super_imsm(struct supertype *st)
+{
+	struct intel_super *super = st->sb;
+
+	printf("ARRAY /dev/imsm family=%08x metadata=external:imsm\n",
+	       __le32_to_cpu(super->anchor->family_num));
+}
+
+/* not implemented yet: only prints its own name */
+static void detail_super_imsm(struct supertype *st, char *homehost)
+{
+	printf("%s\n", __FUNCTION__);
+}
+
+/* not implemented yet: only prints its own name */
+static void brief_detail_super_imsm(struct supertype *st)
+{
+	printf("%s\n", __FUNCTION__);
+}
+#endif
+
+/* not implemented yet: prints its own name and returns 0 */
+static int match_home_imsm(struct supertype *st, char *homehost)
+{
+	printf("%s\n", __FUNCTION__);
+
+	return 0;
+}
+
+static void uuid_from_super_imsm(struct supertype *st, int uuid[4])
+{
+	printf("%s\n", __FUNCTION__);
+}
+
+#if 0
+static void
+get_imsm_numerical_version(struct imsm_super *mpb, int *m, int *p)
+{
+	__u8 *v = get_imsm_version(mpb);
+	__u8 *end = mpb->sig + MAX_SIGNATURE_LENGTH;
+	char major[] = { 0, 0, 0 };
+	char minor[] = { 0 ,0, 0 };
+	char patch[] = { 0, 0, 0 };
+	char *ver_parse[] = { major, minor, patch };
+	int i, j;
+
+	i = j = 0;
+	while (*v != '\0' && v < end) {
+		if (*v != '.' && j < 2)
+			ver_parse[i][j++] = *v;
+		else {
+			i++;
+			j = 0;
+		}
+		v++;
+	}
+
+	*m = strtol(minor, NULL, 0);
+	*p = strtol(patch, NULL, 0);
+}
+#endif
+
+/* md layout value used for a given md level; -1 for levels not listed */
+static int imsm_level_to_layout(int level)
+{
+	switch (level) {
+	case 0:
+	case 1:
+		return 0;
+	case 5:
+	case 6:
+		return ALGORITHM_LEFT_SYMMETRIC;
+	case 10:
+		return 0x102; //FIXME is this correct?
+	}
+	return -1;
+}
+
+/* Fill 'info' for the raid device selected by super->current_vol,
+ * using the first map of that device.
+ */
+static void getinfo_super_imsm_volume(struct supertype *st, struct mdinfo *info)
+{
+	struct intel_super *super = st->sb;
+	struct imsm_dev *dev = get_imsm_dev(super, super->current_vol);
+	struct imsm_map *map = &dev->vol.map[0];
+
+	info->container_member = super->current_vol;
+	info->array.raid_disks = map->num_members;
+	info->array.level = get_imsm_raid_level(map);
+	info->array.layout = imsm_level_to_layout(info->array.level);
+	info->array.md_minor = -1;
+	info->array.ctime = 0;
+	info->array.utime = 0;
+	/* blocks_per_strip is a 16-bit little-endian count of sectors:
+	 * convert the field itself, then scale it to bytes.  Converting
+	 * the product instead byte-swaps the wrong value.
+	 */
+	info->array.chunk_size = __le16_to_cpu(map->blocks_per_strip) * 512;
+
+	info->data_offset = __le32_to_cpu(map->pba_of_lba0);
+	info->component_size = __le32_to_cpu(map->blocks_per_member);
+
+	info->disk.major = 0;
+	info->disk.minor = 0;
+
+	sprintf(info->text_version, "/%s/%d",
+		devnum2devname(st->container_dev),
+		info->container_member);
+}
+
+
+static void getinfo_super_imsm(struct supertype *st, struct mdinfo *info)
+{
+	struct intel_super *super = st->sb;
+	struct imsm_disk *disk;
+	__u32 s;
+
+	if (super->current_vol >= 0) {
+		getinfo_super_imsm_volume(st, info);
+		return;
+	}
+	info->array.raid_disks = super->anchor->num_disks;
+	info->array.level =
LEVEL_CONTAINER;
+	info->array.layout = 0;
+	info->array.md_minor = -1;
+	info->array.ctime = 0; /* N/A for imsm */
+	info->array.utime = 0;
+	info->array.chunk_size = 0;
+
+	info->disk.major = 0;
+	info->disk.minor = 0;
+	info->disk.raid_disk = -1;
+	info->reshape_active = 0;
+	strcpy(info->text_version, "imsm");
+	info->disk.number = -1;
+	info->disk.state = 0;
+
+	if (super->disks) {
+		disk = get_imsm_disk(super, super->disks->index);
+		if (!disk) {
+			info->disk.number = -1;
+			info->disk.raid_disk = -1;
+			return;
+		}
+		info->disk.number = super->disks->index;
+		info->disk.raid_disk = super->disks->index;
+		info->data_offset = __le32_to_cpu(disk->total_blocks) -
+				    (MPB_SECTOR_CNT + IMSM_RESERVED_SECTORS);
+		info->component_size = MPB_SECTOR_CNT + IMSM_RESERVED_SECTORS;
+		s = __le32_to_cpu(disk->status);
+		info->disk.state = s & CONFIGURED_DISK ? (1 << MD_DISK_ACTIVE) : 0;
+		info->disk.state |= s & FAILED_DISK ? (1 << MD_DISK_FAULTY) : 0;
+		info->disk.state |= s & USABLE_DISK ? (1 << MD_DISK_SYNC) : 0;
+	}
+}
+
+/* Placeholder: no update is implemented for imsm yet, so nothing is
+ * ever changed and 0 is always returned.
+ */
+static int update_super_imsm(struct supertype *st, struct mdinfo *info,
+			     char *update, char *devname, int verbose,
+			     int uuid_set, char *homehost)
+{
+	/* FIXME */
+
+	/* For 'assemble' and 'force' we need to return non-zero if any
+	 * change was made. For others, the return value is ignored.
+	 * Update options are:
+	 * force-one : This device looks a bit old but needs to be included,
+	 * update age info appropriately.
+	 * assemble: clear any 'faulty' flag to allow this device to
+	 * be assembled.
+	 * force-array: Array is degraded but being forced, mark it clean
+	 * if that will be needed to assemble it.
+	 *
+	 * newdev: not used ????
+	 * grow: Array has gained a new device - this is currently for
+	 * linear only
+	 * resync: mark as dirty so a resync will happen.
+	 * name: update the name - preserving the homehost
+	 *
+	 * Following are not relevant for this imsm:
+	 * sparc2.2 : update from old dodgy metadata
+	 * super-minor: change the preferred_minor number
+	 * summaries: update redundant counters.
+	 * uuid: Change the uuid of the array to match what is given
+	 * homehost: update the recorded homehost
+	 * _reshape_progress: record new reshape_progress position.
+	 */
+	int changed = 0;
+
+	/* will need: struct intel_super *super = st->sb;
+	 * and the mpb behind it, once something is implemented
+	 */
+	if (strcmp(update, "grow") == 0) {
+		/* nothing yet */
+	}
+	if (strcmp(update, "resync") == 0) {
+		/* dev->vol.dirty = 1; */
+	}
+
+	/* IMSM has no concept of UUID or homehost */
+
+	return changed;
+}
+
+/* Bytes to allow for an MPB describing 'disks' disks: the anchor with
+ * its disk table plus two raid devices carrying two maps each.
+ */
+static size_t disks_to_mpb_size(int disks)
+{
+	size_t total = sizeof(struct imsm_super);
+
+	/* the anchor already contains the first disk entry */
+	total += (disks - 1) * sizeof(struct imsm_disk);
+	total += 2 * sizeof(struct imsm_dev);
+	/* up to 2 maps per raid device (-2 for the imsm_maps already
+	 * inside the imsm_devs)
+	 */
+	total += (4 - 2) * sizeof(struct imsm_map);
+	/* 4 possible disk_ord_tbl's */
+	total += 4 * (disks - 1) * sizeof(__u32);
+
+	return total;
+}
+
+/* Sectors of 'devsize' left once the MPB and reserved sectors are taken
+ * off; 0 when the device is smaller than that overhead.
+ */
+static __u64 avail_size_imsm(struct supertype *st, __u64 devsize)
+{
+	return devsize < (MPB_SECTOR_CNT + IMSM_RESERVED_SECTORS)
+		? 0
+		: devsize - (MPB_SECTOR_CNT + IMSM_RESERVED_SECTORS);
+}
+
+static int compare_super_imsm(struct supertype *st, struct supertype *tst)
+{
+	/*
+	 * return:
+	 * 0 same, or first was empty, and second was copied
+	 * 1 second had wrong number
+	 * 2 wrong uuid
+	 * 3 wrong other info
+	 */
+	struct intel_super *have = st->sb;
+	struct intel_super *cand = tst->sb;
+
+	if (have == NULL) {
+		/* nothing loaded yet: adopt the candidate as is */
+		st->sb = cand;
+		tst->sb = NULL;
+		return 0;
+	}
+
+	/* any difference in these anchor fields is "wrong other info" */
+	if (memcmp(have->anchor->sig, cand->anchor->sig,
+		   MAX_SIGNATURE_LENGTH) != 0 ||
+	    have->anchor->family_num != cand->anchor->family_num ||
+	    have->anchor->mpb_size != cand->anchor->mpb_size ||
+	    have->anchor->check_sum != cand->anchor->check_sum)
+		return 3;
+
+	return 0;
+}
+
+/* Write "/dev/<name>" for the block device open on 'fd' into 'name',
+ * truncated to MAX_RAID_SERIAL_LEN bytes including the NUL.  <name> is
+ * the last path component of the /sys/dev/block/<major>:<minor> link.
+ * 'name' is left as the empty string if the device cannot be resolved.
+ */
+static void fd2devname(int fd, char *name)
+{
+	struct stat st;
+	char path[256];
+	char dname[100];
+	char *nm;
+	int rv;
+
+	name[0] = '\0';
+	if (fstat(fd, &st) != 0)
+		return;
+	sprintf(path, "/sys/dev/block/%d:%d",
+		major(st.st_rdev), minor(st.st_rdev));
+
+	/* readlink() does not terminate the result and may fill the whole
+	 * buffer, so keep one byte back for the NUL written below
+	 */
+	rv = readlink(path, dname, sizeof(dname) - 1);
+	if (rv <= 0)
+		return;
+
+	dname[rv] = '\0';
+	nm = strrchr(dname, '/');
+	if (nm)
+		nm++;
+	else
+		/* link target without any '/': use it as it is */
+		nm = dname;
+	snprintf(name, MAX_RAID_SERIAL_LEN, "/dev/%s", nm);
+}
+
+
+extern int scsi_get_serial(int fd, void *buf, size_t buf_len);
+
+/* Fill 'serial' for the device open on 'fd'.
+ * With IMSM_DEVNAME_AS_SERIAL=1 the device name is used instead of
+ * asking the device.  Otherwise the non-blank bytes of the response
+ * from scsi_get_serial() are copied and the last byte of 'serial' is
+ * forced to NUL.
+ * Returns 0 on success or the error from scsi_get_serial().
+ */
+static int imsm_read_serial(int fd, char *devname,
+			    __u8 serial[MAX_RAID_SERIAL_LEN])
+{
+	unsigned char scsi_serial[255];
+	int rv;
+	int rsp_len;
+	int i, cnt;
+
+	memset(scsi_serial, 0, sizeof(scsi_serial));
+
+	if (imsm_env_devname_as_serial()) {
+		char name[MAX_RAID_SERIAL_LEN];
+
+		fd2devname(fd, name);
+		strcpy((char *) serial, name);
+		return 0;
+	}
+
+	rv = scsi_get_serial(fd, scsi_serial, sizeof(scsi_serial));
+
+	if (rv != 0) {
+		if (devname)
+			fprintf(stderr,
+				Name ": Failed to retrieve serial for %s\n",
+				devname);
+		return rv;
+	}
+
+	/* byte 3 is the length of the data that follows the 4 byte
+	 * header; do not let it run past the end of our buffer
+	 */
+	rsp_len = scsi_serial[3];
+	if (rsp_len > (int) sizeof(scsi_serial) - 4)
+		rsp_len = sizeof(scsi_serial) - 4;
+	for (i = 0, cnt = 0; i < rsp_len; i++) {
+		if (!isspace(scsi_serial[4 + i]))
+			serial[cnt++] = scsi_serial[4 + i];
+		if (cnt == MAX_RAID_SERIAL_LEN)
+			break;
+	}
+
+	serial[MAX_RAID_SERIAL_LEN - 1] = '\0';
+
+	return 0;
+}
+
+/* Record the device open on 'fd' at the head of super->disks and, when
+ * its serial number matches an entry of the anchor's disk table, store
+ * a private copy of that entry in super->disk_tbl[] and its index in
+ * dl->index.
+ * Returns 2 if memory or the serial number cannot be obtained, else 0
+ * (also when no entry matches; dl->index then stays -1).
+ */
+static int
+load_imsm_disk(int fd, struct intel_super *super, char *devname, int keep_fd)
+{
+	struct dl *dl;
+	struct stat stb;
+	struct imsm_disk *disk;
+	int rv;
+	int i;
+	int ndisks;
+	int found = 0;
+
+	dl = malloc(sizeof(*dl));
+	disk = malloc(sizeof(*disk));
+	if (!dl || !disk) {
+		if (devname)
+			fprintf(stderr,
+				Name ": failed to allocate disk buffer for %s\n",
+				devname);
+		free(disk);
+		free(dl);
+		return 2;
+	}
+	memset(dl, 0, sizeof(*dl));
+	memset(disk, 0, sizeof(*disk));
+
+	fstat(fd, &stb);
+	dl->major = major(stb.st_rdev);
+	dl->minor = minor(stb.st_rdev);
+	dl->next = super->disks;
+	dl->fd = keep_fd ? fd : -1;
+	dl->devname = devname ? strdup(devname) : NULL;
+	dl->index = -1;
+	super->disks = dl;
+	rv = imsm_read_serial(fd, devname, dl->serial);
+
+	if (rv != 0) {
+		/* 'dl' is now owned by super->disks, 'disk' is not */
+		free(disk);
+		return 2;
+	}
+
+	/* look up this disk's index; disk_tbl[] only has IMSM_MAX_DISKS
+	 * slots while num_disks comes straight from the metadata
+	 */
+	ndisks = super->anchor->num_disks;
+	if (ndisks > IMSM_MAX_DISKS)
+		ndisks = IMSM_MAX_DISKS;
+	for (i = 0; i < ndisks; i++) {
+		struct imsm_disk *disk_iter;
+
+		disk_iter = __get_imsm_disk(super->anchor, i);
+
+		if (memcmp(disk_iter->serial, dl->serial,
+			   MAX_RAID_SERIAL_LEN) == 0) {
+			*disk = *disk_iter;
+			super->disk_tbl[i] = disk;
+			dl->index = i;
+			found = 1;
+			break;
+		}
+	}
+
+	if (!found) {
+		if (devname)
+			fprintf(stderr,
+				Name ": failed to match serial \'%s\' for %s\n",
+				dl->serial, devname);
+		free(disk);
+		return 0;
+	}
+
+	return 0;
+}
+
+/* Copy a raid device record including the disk_ord_tbl entries of its
+ * map(s); 'dest' must be at least sizeof_imsm_dev(src) bytes.
+ */
+static void imsm_copy_dev(struct imsm_dev *dest, struct imsm_dev *src)
+{
+	int i;
+
+	*dest = *src;
+
+	for (i = 0; i < src->vol.map[0].num_members; i++)
+		dest->vol.map[0].disk_ord_tbl[i] = src->vol.map[0].disk_ord_tbl[i];
+
+	if (!src->vol.migr_state)
+		return;
+
+	dest->vol.map[1] = src->vol.map[1];
+	for (i = 0; i < src->vol.map[1].num_members; i++)
+		dest->vol.map[1].disk_ord_tbl[i] = src->vol.map[1].disk_ord_tbl[i];
+}
+
+/* Copy every raid device record of the anchor into super->dev_tbl[].
+ * Returns 0 on success, 1 if memory runs out or the anchor lists more
+ * raid devices than dev_tbl[] can hold.
+ */
+static int parse_raid_devices(struct intel_super *super)
+{
+	int i;
+	struct imsm_dev *dev_new;
+	size_t len;
+
+	/* num_raid_devs comes straight from the metadata */
+	if (super->anchor->num_raid_devs > IMSM_MAX_RAID_DEVS)
+		return 1;
+
+	for (i = 0; i < super->anchor->num_raid_devs; i++) {
+		struct imsm_dev *dev_iter = __get_imsm_dev(super->anchor, i);
+
+		len = sizeof_imsm_dev(dev_iter);
+		dev_new = malloc(len);
+		if (!dev_new)
+			return 1;
+		imsm_copy_dev(dev_new, dev_iter);
+		super->dev_tbl[i] = dev_new;
+	}
+
+	return 0;
+}
+
+static void __free_imsm(struct intel_super *super);
+
+/* load_imsm_mpb - read matrix metadata
+ * allocates super->mpb to be freed by free_super
+ */
+static int load_imsm_mpb(int fd, struct intel_super *super, char *devname)
+{
+	unsigned long long dsize;
+	unsigned long long sectors;
+	struct stat;
+	struct imsm_super *anchor;
+	__u32 check_sum;
+	int rc;
+
+	get_dev_size(fd, NULL, &dsize);
+
+	if (lseek64(fd, dsize - (512 * 2), SEEK_SET) < 0) {
+		if (devname)
+
fprintf(stderr, + Name ": Cannot seek to anchor block on %s: %s\n", + devname, strerror(errno)); + return 1; + } + + if (posix_memalign((void**)&anchor, 512, 512) != 0) { + if (devname) + fprintf(stderr, + Name ": Failed to allocate imsm anchor buffer" + " on %s\n", devname); + return 1; + } + if (read(fd, anchor, 512) != 512) { + if (devname) + fprintf(stderr, + Name ": Cannot read anchor block on %s: %s\n", + devname, strerror(errno)); + free(anchor); + return 1; + } + + if (strncmp((char *) anchor->sig, MPB_SIGNATURE, MPB_SIG_LEN) != 0) { + if (devname) + fprintf(stderr, + Name ": no IMSM anchor on %s\n", devname); + free(anchor); + return 2; + } + + __free_imsm(super); + super->len = __le32_to_cpu(anchor->mpb_size); + super->len = ROUND_UP(anchor->mpb_size, 512); + if (posix_memalign(&super->buf, 512, super->len) != 0) { + if (devname) + fprintf(stderr, + Name ": unable to allocate %zu byte mpb buffer\n", + super->len); + free(anchor); + return 2; + } + memcpy(super->buf, anchor, 512); + + sectors = mpb_sectors(anchor) - 1; + free(anchor); + if (!sectors) { + rc = load_imsm_disk(fd, super, devname, 0); + if (rc == 0) + rc = parse_raid_devices(super); + return rc; + } + + /* read the extended mpb */ + if (lseek64(fd, dsize - (512 * (2 + sectors)), SEEK_SET) < 0) { + if (devname) + fprintf(stderr, + Name ": Cannot seek to extended mpb on %s: %s\n", + devname, strerror(errno)); + return 1; + } + + if (read(fd, super->buf + 512, super->len - 512) != super->len - 512) { + if (devname) + fprintf(stderr, + Name ": Cannot read extended mpb on %s: %s\n", + devname, strerror(errno)); + return 2; + } + + check_sum = __gen_imsm_checksum(super->anchor); + if (check_sum != __le32_to_cpu(super->anchor->check_sum)) { + if (devname) + fprintf(stderr, + Name ": IMSM checksum %x != %x on %s\n", + check_sum, __le32_to_cpu(super->anchor->check_sum), + devname); + return 2; + } + + rc = load_imsm_disk(fd, super, devname, 0); + if (rc == 0) + rc = parse_raid_devices(super); + return 
rc; +} + +static void free_imsm_disks(struct intel_super *super) +{ + int i; + + while (super->disks) { + struct dl *d = super->disks; + + super->disks = d->next; + if (d->fd >= 0) + close(d->fd); + if (d->devname) + free(d->devname); + free(d); + } + for (i = 0; i < IMSM_MAX_DISKS; i++) + if (super->disk_tbl[i]) { + free(super->disk_tbl[i]); + super->disk_tbl[i] = NULL; + } +} + +/* free all the pieces hanging off of a super pointer */ +static void __free_imsm(struct intel_super *super) +{ + int i; + + if (super->buf) { + free(super->buf); + super->buf = NULL; + } + free_imsm_disks(super); + for (i = 0; i < IMSM_MAX_RAID_DEVS; i++) + if (super->dev_tbl[i]) { + free(super->dev_tbl[i]); + super->dev_tbl[i] = NULL; + } +} + +static void free_imsm(struct intel_super *super) +{ + __free_imsm(super); + free(super); +} + +static void free_super_imsm(struct supertype *st) +{ + struct intel_super *super = st->sb; + + if (!super) + return; + + free_imsm(super); + st->sb = NULL; +} + +static struct intel_super *alloc_super(int creating_imsm) +{ + struct intel_super *super = malloc(sizeof(*super)); + + if (super) { + memset(super, 0, sizeof(*super)); + super->creating_imsm = creating_imsm; + super->current_vol = -1; + } + + return super; +} + +#ifndef MDASSEMBLE +static int load_super_imsm_all(struct supertype *st, int fd, void **sbp, + char *devname, int keep_fd) +{ + struct mdinfo *sra; + struct intel_super *super; + struct mdinfo *sd, *best = NULL; + __u32 bestgen = 0; + __u32 gen; + char nm[20]; + int dfd; + int rv; + + /* check if this disk is a member of an active array */ + sra = sysfs_read(fd, 0, GET_LEVEL|GET_VERSION|GET_DEVS|GET_STATE); + if (!sra) + return 1; + + if (sra->array.major_version != -1 || + sra->array.minor_version != -2 || + strcmp(sra->text_version, "imsm") != 0) + return 1; + + super = alloc_super(0); + if (!super) + return 1; + + /* find the most up to date disk in this array */ + for (sd = sra->devs; sd; sd = sd->next) { + sprintf(nm, "%d:%d", 
sd->disk.major, sd->disk.minor); + dfd = dev_open(nm, keep_fd ? O_RDWR : O_RDONLY); + if (!dfd) { + free_imsm(super); + return 2; + } + rv = load_imsm_mpb(dfd, super, NULL); + if (!keep_fd) + close(dfd); + if (rv == 0) { + gen = __le32_to_cpu(super->anchor->generation_num); + if (!best || gen > bestgen) { + bestgen = gen; + best = sd; + } + } else { + free_imsm(super); + return 2; + } + } + + if (!best) { + free_imsm(super); + return 1; + } + + /* load the most up to date anchor */ + sprintf(nm, "%d:%d", best->disk.major, best->disk.minor); + dfd = dev_open(nm, O_RDONLY); + if (!dfd) { + free_imsm(super); + return 1; + } + rv = load_imsm_mpb(dfd, super, NULL); + close(dfd); + if (rv != 0) { + free_imsm(super); + return 2; + } + + /* reset the disk list */ + free_imsm_disks(super); + + /* populate disk list */ + for (sd = sra->devs ; sd ; sd = sd->next) { + sprintf(nm, "%d:%d", sd->disk.major, sd->disk.minor); + dfd = dev_open(nm, keep_fd? O_RDWR : O_RDONLY); + if (!dfd) { + free_imsm(super); + return 2; + } + load_imsm_disk(dfd, super, NULL, keep_fd); + if (!keep_fd) + close(dfd); + } + + if (st->subarray[0]) { + if (atoi(st->subarray) <= super->anchor->num_raid_devs) + super->current_vol = atoi(st->subarray); + else + return 1; + } + + *sbp = super; + if (st->ss == NULL) { + st->ss = &super_imsm; + st->minor_version = 0; + st->max_devs = IMSM_MAX_DEVICES; + st->container_dev = fd2devnum(fd); + } + + return 0; +} +#endif + +static int load_super_imsm(struct supertype *st, int fd, char *devname) +{ + struct intel_super *super; + int rv; + +#ifndef MDASSEMBLE + if (load_super_imsm_all(st, fd, &st->sb, devname, 1) == 0) + return 0; +#endif + if (st->subarray[0]) + return 1; /* FIXME */ + + super = alloc_super(0); + if (!super) { + fprintf(stderr, + Name ": malloc of %zu failed.\n", + sizeof(*super)); + return 1; + } + + rv = load_imsm_mpb(fd, super, devname); + + if (rv) { + if (devname) + fprintf(stderr, + Name ": Failed to load all information " + "sections on 
%s\n", devname); + free_imsm(super); + return rv; + } + + st->sb = super; + if (st->ss == NULL) { + st->ss = &super_imsm; + st->minor_version = 0; + st->max_devs = IMSM_MAX_DEVICES; + } + + return 0; +} + +static __u16 info_to_blocks_per_strip(mdu_array_info_t *info) +{ + if (info->level == 1) + return 128; + return info->chunk_size >> 9; +} + +static __u32 info_to_num_data_stripes(mdu_array_info_t *info) +{ + __u32 num_stripes; + + num_stripes = (info->size * 2) / info_to_blocks_per_strip(info); + if (info->level == 1) + num_stripes /= 2; + + return num_stripes; +} + +static __u32 info_to_blocks_per_member(mdu_array_info_t *info) +{ + return (info->size * 2) & ~(info_to_blocks_per_strip(info) - 1); +} + +static int init_super_imsm_volume(struct supertype *st, mdu_array_info_t *info, + unsigned long long size, char *name, + char *homehost, int *uuid) +{ + /* We are creating a volume inside a pre-existing container. + * so st->sb is already set. + */ + struct intel_super *super = st->sb; + struct imsm_super *mpb = super->anchor; + struct imsm_dev *dev; + struct imsm_vol *vol; + struct imsm_map *map; + int idx = mpb->num_raid_devs; + int i; + unsigned long long array_blocks; + __u32 offset = 0; + size_t size_old, size_new; + + if (mpb->num_raid_devs >= 2) { + fprintf(stderr, Name": This imsm-container already has the " + "maximum of 2 volumes\n"); + return 0; + } + + /* ensure the mpb is large enough for the new data */ + size_old = __le32_to_cpu(mpb->mpb_size); + size_new = disks_to_mpb_size(info->nr_disks); + if (size_new > size_old) { + void *mpb_new; + size_t size_round = ROUND_UP(size_new, 512); + + if (posix_memalign(&mpb_new, 512, size_round) != 0) { + fprintf(stderr, Name": could not allocate new mpb\n"); + return 0; + } + memcpy(mpb_new, mpb, size_old); + free(mpb); + mpb = mpb_new; + super->anchor = mpb_new; + mpb->mpb_size = __cpu_to_le32(size_new); + memset(mpb_new + size_old, 0, size_round - size_old); + } + super->current_vol = idx; + 
sprintf(st->subarray, "%d", idx); + dev = malloc(sizeof(*dev) + sizeof(__u32) * (info->raid_disks - 1)); + if (!dev) { + fprintf(stderr, Name": could not allocate raid device\n"); + return 0; + } + strncpy((char *) dev->volume, name, MAX_RAID_SERIAL_LEN); + array_blocks = calc_array_size(info->level, info->raid_disks, + info->layout, info->chunk_size, + info->size*2); + dev->size_low = __cpu_to_le32((__u32) array_blocks); + dev->size_high = __cpu_to_le32((__u32) (array_blocks >> 32)); + dev->status = __cpu_to_le32(0); + dev->reserved_blocks = __cpu_to_le32(0); + vol = &dev->vol; + vol->migr_state = 0; + vol->migr_type = 0; + vol->dirty = 0; + for (i = 0; i < idx; i++) { + struct imsm_dev *prev = get_imsm_dev(super, i); + struct imsm_map *pmap = &prev->vol.map[0]; + + offset += __le32_to_cpu(pmap->blocks_per_member); + offset += IMSM_RESERVED_SECTORS; + } + map = &vol->map[0]; + map->pba_of_lba0 = __cpu_to_le32(offset); + map->blocks_per_member = __cpu_to_le32(info_to_blocks_per_member(info)); + map->blocks_per_strip = __cpu_to_le16(info_to_blocks_per_strip(info)); + map->num_data_stripes = __cpu_to_le32(info_to_num_data_stripes(info)); + map->map_state = info->level ? IMSM_T_STATE_UNINITIALIZED : + IMSM_T_STATE_NORMAL; + + if (info->level == 1 && info->raid_disks > 2) { + fprintf(stderr, Name": imsm does not support more than 2 disks" + "in a raid1 volume\n"); + return 0; + } + if (info->level == 10) + map->raid_level = 1; + else + map->raid_level = info->level; + + map->num_members = info->raid_disks; + for (i = 0; i < map->num_members; i++) { + /* initialized in add_to_super */ + map->disk_ord_tbl[i] = __cpu_to_le32(0); + } + mpb->num_raid_devs++; + super->dev_tbl[super->current_vol] = dev; + + return 1; +} + +static int init_super_imsm(struct supertype *st, mdu_array_info_t *info, + unsigned long long size, char *name, + char *homehost, int *uuid) +{ + /* This is primarily called by Create when creating a new array. 
+ * We will then get add_to_super called for each component, and then + * write_init_super called to write it out to each device. + * For IMSM, Create can create on fresh devices or on a pre-existing + * array. + * To create on a pre-existing array a different method will be called. + * This one is just for fresh drives. + */ + struct intel_super *super; + struct imsm_super *mpb; + size_t mpb_size; + + if (!info) { + st->sb = NULL; + return 0; + } + if (st->sb) + return init_super_imsm_volume(st, info, size, name, homehost, + uuid); + + super = alloc_super(1); + if (!super) + return 0; + mpb_size = disks_to_mpb_size(info->nr_disks); + if (posix_memalign(&super->buf, 512, mpb_size) != 0) { + free(super); + return 0; + } + mpb = super->buf; + memset(mpb, 0, mpb_size); + + memcpy(mpb->sig, MPB_SIGNATURE, strlen(MPB_SIGNATURE)); + memcpy(mpb->sig + strlen(MPB_SIGNATURE), MPB_VERSION_RAID5, + strlen(MPB_VERSION_RAID5)); + mpb->mpb_size = mpb_size; + + st->sb = super; + return 1; +} + +static void add_to_super_imsm_volume(struct supertype *st, mdu_disk_info_t *dk, + int fd, char *devname) +{ + struct intel_super *super = st->sb; + struct dl *dl; + struct imsm_dev *dev; + struct imsm_map *map; + struct imsm_disk *disk; + __u32 status; + + dev = get_imsm_dev(super, super->current_vol); + map = &dev->vol.map[0]; + + for (dl = super->disks; dl ; dl = dl->next) + if (dl->major == dk->major && + dl->minor == dk->minor) + break; + if (!dl || ! 
(dk->state & (1<<MD_DISK_SYNC))) + return; + + map->disk_ord_tbl[dk->number] = __cpu_to_le32(dl->index); + + disk = get_imsm_disk(super, dl->index); + status = CONFIGURED_DISK | USABLE_DISK; + disk->status = __cpu_to_le32(status); +} + +static void add_to_super_imsm(struct supertype *st, mdu_disk_info_t *dk, + int fd, char *devname) +{ + struct intel_super *super = st->sb; + struct imsm_super *mpb = super->anchor; + struct imsm_disk *disk; + struct dl *dd; + unsigned long long size; + __u32 status, id; + int rv; + struct stat stb; + + if (super->current_vol >= 0) { + add_to_super_imsm_volume(st, dk, fd, devname); + return; + } + + fstat(fd, &stb); + dd = malloc(sizeof(*dd)); + disk = malloc(sizeof(*disk)); + if (!dd || !disk) { + fprintf(stderr, + Name ": malloc failed %s:%d.\n", __func__, __LINE__); + if (!dd) + free(dd); + if (!disk) + free(disk); + abort(); + } + memset(dd, 0, sizeof(*dd)); + memset(disk, 0, sizeof(*disk)); + dd->major = major(stb.st_rdev); + dd->minor = minor(stb.st_rdev); + dd->index = dk->number; + dd->devname = devname ? 
strdup(devname) : NULL; + dd->next = super->disks; + dd->fd = fd; + rv = imsm_read_serial(fd, devname, dd->serial); + if (rv) { + fprintf(stderr, + Name ": failed to retrieve scsi serial, aborting\n"); + free(dd); + free(disk); + abort(); + } + + if (mpb->num_disks <= dk->number) + mpb->num_disks = dk->number + 1; + + get_dev_size(fd, NULL, &size); + size /= 512; + status = USABLE_DISK | SPARE_DISK; + strcpy((char *) disk->serial, (char *) dd->serial); + disk->total_blocks = __cpu_to_le32(size); + disk->status = __cpu_to_le32(status); + if (sysfs_disk_to_scsi_id(fd, &id) == 0) + disk->scsi_id = __cpu_to_le32(id); + else + disk->scsi_id = __cpu_to_le32(0); + super->disk_tbl[dd->index] = disk; + + /* update the family number if we are creating a container */ + if (super->creating_imsm) { + disk = __get_imsm_disk(mpb, dd->index); + *disk = *super->disk_tbl[dd->index]; /* copy in new disk */ + mpb->family_num = __cpu_to_le32(__gen_imsm_checksum(mpb)); + } + + super->disks = dd; +} + +static int store_imsm_mpb(int fd, struct intel_super *super); + +static int write_super_imsm(struct intel_super *super, int doclose) +{ + struct imsm_super *mpb = super->anchor; + struct dl *d; + __u32 generation; + __u32 sum; + int i; + + /* 'generation' is incremented everytime the metadata is written */ + generation = __le32_to_cpu(mpb->generation_num); + generation++; + mpb->generation_num = __cpu_to_le32(generation); + + for (i = 0; i < mpb->num_disks; i++) + mpb->disk[i] = *super->disk_tbl[i]; + for (i = 0; i < mpb->num_raid_devs; i++) { + struct imsm_dev *dev = __get_imsm_dev(mpb, i); + + imsm_copy_dev(dev, super->dev_tbl[i]); + } + + /* recalculate checksum */ + sum = __gen_imsm_checksum(mpb); + mpb->check_sum = __cpu_to_le32(sum); + + for (d = super->disks; d ; d = d->next) { + if (store_imsm_mpb(d->fd, super)) { + fprintf(stderr, "%s: failed for device %d:%d %s\n", + __func__, d->major, d->minor, strerror(errno)); + return 0; + } + if (doclose) { + close(d->fd); + d->fd = -1; + } 
+ } + + return 1; +} + +static int write_init_super_imsm(struct supertype *st) +{ + if (st->update_tail) { + /* queue the recently created array as a metadata update */ + size_t len; + struct imsm_update_create_array *u; + struct intel_super *super = st->sb; + struct imsm_dev *dev; + struct dl *d; + + if (super->current_vol < 0 || + !(dev = get_imsm_dev(super, super->current_vol))) { + fprintf(stderr, "%s: could not determine sub-array\n", + __func__); + return 1; + } + + + len = sizeof(*u) - sizeof(*dev) + sizeof_imsm_dev(dev); + u = malloc(len); + if (!u) { + fprintf(stderr, "%s: failed to allocate update buffer\n", + __func__); + return 1; + } + + u->type = update_create_array; + u->dev_idx = super->current_vol; + imsm_copy_dev(&u->dev, dev); + append_metadata_update(st, u, len); + + for (d = super->disks; d ; d = d->next) { + close(d->fd); + d->fd = -1; + } + + return 0; + } else + return write_super_imsm(st->sb, 1); +} + +static int store_zero_imsm(struct supertype *st, int fd) +{ + unsigned long long dsize; + void *buf; + + get_dev_size(fd, NULL, &dsize); + + /* first block is stored on second to last sector of the disk */ + if (lseek64(fd, dsize - (512 * 2), SEEK_SET) < 0) + return 1; + + if (posix_memalign(&buf, 512, 512) != 0) + return 1; + + memset(buf, 0, 512); + if (write(fd, buf, 512) != 512) + return 1; + return 0; +} + +static int validate_geometry_imsm_container(struct supertype *st, int level, + int layout, int raiddisks, int chunk, + unsigned long long size, char *dev, + unsigned long long *freesize, + int verbose) +{ + int fd; + unsigned long long ldsize; + + if (level != LEVEL_CONTAINER) + return 0; + if (!dev) + return 1; + + fd = open(dev, O_RDONLY|O_EXCL, 0); + if (fd < 0) { + if (verbose) + fprintf(stderr, Name ": imsm: Cannot open %s: %s\n", + dev, strerror(errno)); + return 0; + } + if (!get_dev_size(fd, dev, &ldsize)) { + close(fd); + return 0; + } + close(fd); + + *freesize = avail_size_imsm(st, ldsize >> 9); + + return 1; +} + +/* 
validate_geometry_imsm_volume - lifted from validate_geometry_ddf_bvd + * FIX ME add ahci details + */ +static int validate_geometry_imsm_volume(struct supertype *st, int level, + int layout, int raiddisks, int chunk, + unsigned long long size, char *dev, + unsigned long long *freesize, + int verbose) +{ + struct stat stb; + struct intel_super *super = st->sb; + struct dl *dl; + unsigned long long pos = 0; + unsigned long long maxsize; + struct extent *e; + int i; + + if (level == LEVEL_CONTAINER) + return 0; + + if (level == 1 && raiddisks > 2) { + if (verbose) + fprintf(stderr, Name ": imsm does not support more " + "than 2 in a raid1 configuration\n"); + return 0; + } + + /* We must have the container info already read in. */ + if (!super) + return 0; + + if (!dev) { + /* General test: make sure there is space for + * 'raiddisks' device extents of size 'size' at a given + * offset + */ + unsigned long long minsize = size*2 /* convert to blocks */; + unsigned long long start_offset = ~0ULL; + int dcnt = 0; + if (minsize == 0) + minsize = MPB_SECTOR_CNT + IMSM_RESERVED_SECTORS; + for (dl = super->disks; dl ; dl = dl->next) { + int found = 0; + + pos = 0; + i = 0; + e = get_extents(super, dl); + if (!e) continue; + do { + unsigned long long esize; + esize = e[i].start - pos; + if (esize >= minsize) + found = 1; + if (found && start_offset == ~0ULL) { + start_offset = pos; + break; + } else if (found && pos != start_offset) { + found = 0; + break; + } + pos = e[i].start + e[i].size; + i++; + } while (e[i-1].size); + if (found) + dcnt++; + free(e); + } + if (dcnt < raiddisks) { + if (verbose) + fprintf(stderr, Name ": imsm: Not enough " + "devices with space for this array " + "(%d < %d)\n", + dcnt, raiddisks); + return 0; + } + return 1; + } + /* This device must be a member of the set */ + if (stat(dev, &stb) < 0) + return 0; + if ((S_IFMT & stb.st_mode) != S_IFBLK) + return 0; + for (dl = super->disks ; dl ; dl = dl->next) { + if (dl->major == major(stb.st_rdev) 
&& + dl->minor == minor(stb.st_rdev)) + break; + } + if (!dl) { + if (verbose) + fprintf(stderr, Name ": %s is not in the " + "same imsm set\n", dev); + return 0; + } + e = get_extents(super, dl); + maxsize = 0; + i = 0; + if (e) do { + unsigned long long esize; + esize = e[i].start - pos; + if (esize >= maxsize) + maxsize = esize; + pos = e[i].start + e[i].size; + i++; + } while (e[i-1].size); + *freesize = maxsize; + + return 1; +} + +static int validate_geometry_imsm(struct supertype *st, int level, int layout, + int raiddisks, int chunk, unsigned long long size, + char *dev, unsigned long long *freesize, + int verbose) +{ + int fd, cfd; + struct mdinfo *sra; + + /* if given unused devices create a container + * if given given devices in a container create a member volume + */ + if (level == LEVEL_CONTAINER) { + /* Must be a fresh device to add to a container */ + return validate_geometry_imsm_container(st, level, layout, + raiddisks, chunk, size, + dev, freesize, + verbose); + } + + if (st->sb) { + /* creating in a given container */ + return validate_geometry_imsm_volume(st, level, layout, + raiddisks, chunk, size, + dev, freesize, verbose); + } + + /* limit creation to the following levels */ + if (!dev) + switch (level) { + case 0: + case 1: + case 10: + case 5: + break; + default: + return 1; + } + + /* This device needs to be a device in an 'imsm' container */ + fd = open(dev, O_RDONLY|O_EXCL, 0); + if (fd >= 0) { + if (verbose) + fprintf(stderr, + Name ": Cannot create this array on device %s\n", + dev); + close(fd); + return 0; + } + if (errno != EBUSY || (fd = open(dev, O_RDONLY, 0)) < 0) { + if (verbose) + fprintf(stderr, Name ": Cannot open %s: %s\n", + dev, strerror(errno)); + return 0; + } + /* Well, it is in use by someone, maybe an 'imsm' container. 
*/ + cfd = open_container(fd); + if (cfd < 0) { + close(fd); + if (verbose) + fprintf(stderr, Name ": Cannot use %s: It is busy\n", + dev); + return 0; + } + sra = sysfs_read(cfd, 0, GET_VERSION); + close(fd); + if (sra && sra->array.major_version == -1 && + strcmp(sra->text_version, "imsm") == 0) { + /* This is a member of a imsm container. Load the container + * and try to create a volume + */ + struct intel_super *super; + + if (load_super_imsm_all(st, cfd, (void **) &super, NULL, 1) == 0) { + st->sb = super; + st->container_dev = fd2devnum(cfd); + close(cfd); + return validate_geometry_imsm_volume(st, level, layout, + raiddisks, chunk, + size, dev, + freesize, verbose); + } + close(cfd); + } else /* may belong to another container */ + return 0; + + return 1; +} + +static struct mdinfo *container_content_imsm(struct supertype *st) +{ + /* Given a container loaded by load_super_imsm_all, + * extract information about all the arrays into + * an mdinfo tree. + * + * For each imsm_dev create an mdinfo, fill it in, + * then look for matching devices in super->disks + * and create appropriate device mdinfo. 
+ */ + struct intel_super *super = st->sb; + struct imsm_super *mpb = super->anchor; + struct mdinfo *rest = NULL; + int i; + + for (i = 0; i < mpb->num_raid_devs; i++) { + struct imsm_dev *dev = get_imsm_dev(super, i); + struct imsm_vol *vol = &dev->vol; + struct imsm_map *map = vol->map; + struct mdinfo *this; + int slot; + + this = malloc(sizeof(*this)); + memset(this, 0, sizeof(*this)); + this->next = rest; + rest = this; + + this->array.level = get_imsm_raid_level(map); + this->array.raid_disks = map->num_members; + this->array.layout = imsm_level_to_layout(this->array.level); + this->array.md_minor = -1; + this->array.ctime = 0; + this->array.utime = 0; + this->array.chunk_size = __le16_to_cpu(map->blocks_per_strip) << 9; + this->array.state = !vol->dirty; + this->container_member = i; + if (map->map_state == IMSM_T_STATE_UNINITIALIZED || dev->vol.dirty) + this->resync_start = 0; + else + this->resync_start = ~0ULL; + + strncpy(this->name, (char *) dev->volume, MAX_RAID_SERIAL_LEN); + this->name[MAX_RAID_SERIAL_LEN] = 0; + + sprintf(this->text_version, "/%s/%d", + devnum2devname(st->container_dev), + this->container_member); + + memset(this->uuid, 0, sizeof(this->uuid)); + + this->component_size = __le32_to_cpu(map->blocks_per_member); + + for (slot = 0 ; slot < map->num_members; slot++) { + struct imsm_disk *disk; + struct mdinfo *info_d; + struct dl *d; + int idx; + __u32 s; + + idx = __le32_to_cpu(map->disk_ord_tbl[slot] & ~(0xff << 24)); + for (d = super->disks; d ; d = d->next) + if (d->index == idx) + break; + + if (d == NULL) + break; /* shouldn't this be continue ?? */ + + info_d = malloc(sizeof(*info_d)); + if (!info_d) + break; /* ditto ?? 
*/ + memset(info_d, 0, sizeof(*info_d)); + info_d->next = this->devs; + this->devs = info_d; + + disk = get_imsm_disk(super, idx); + s = __le32_to_cpu(disk->status); + + info_d->disk.number = d->index; + info_d->disk.major = d->major; + info_d->disk.minor = d->minor; + info_d->disk.raid_disk = slot; + info_d->disk.state = s & CONFIGURED_DISK ? (1 << MD_DISK_ACTIVE) : 0; + info_d->disk.state |= s & FAILED_DISK ? (1 << MD_DISK_FAULTY) : 0; + info_d->disk.state |= s & USABLE_DISK ? (1 << MD_DISK_SYNC) : 0; + + this->array.working_disks++; + + info_d->events = __le32_to_cpu(mpb->generation_num); + info_d->data_offset = __le32_to_cpu(map->pba_of_lba0); + info_d->component_size = __le32_to_cpu(map->blocks_per_member); + if (d->devname) + strcpy(info_d->name, d->devname); + } + } + + return rest; +} + + +static int imsm_open_new(struct supertype *c, struct active_array *a, + char *inst) +{ + struct intel_super *super = c->sb; + struct imsm_super *mpb = super->anchor; + + if (atoi(inst) >= mpb->num_raid_devs) { + fprintf(stderr, "%s: subarry index %d, out of range\n", + __func__, atoi(inst)); + return -ENODEV; + } + + dprintf("imsm: open_new %s\n", inst); + a->info.container_member = atoi(inst); + return 0; +} + +static __u8 imsm_check_degraded(struct intel_super *super, int n, int failed) +{ + struct imsm_dev *dev = get_imsm_dev(super, n); + struct imsm_map *map = dev->vol.map; + + if (!failed) + return map->map_state; + + switch (get_imsm_raid_level(map)) { + case 0: + return IMSM_T_STATE_FAILED; + break; + case 1: + if (failed < map->num_members) + return IMSM_T_STATE_DEGRADED; + else + return IMSM_T_STATE_FAILED; + break; + case 10: + { + /** + * check to see if any mirrors have failed, + * otherwise we are degraded + */ + int device_per_mirror = 2; /* FIXME is this always the case? + * and are they always adjacent? 
+ */ + int failed = 0; + int i; + + for (i = 0; i < map->num_members; i++) { + int idx = get_imsm_disk_idx(map, i); + struct imsm_disk *disk = get_imsm_disk(super, idx); + + if (__le32_to_cpu(disk->status) & FAILED_DISK) + failed++; + + if (failed >= device_per_mirror) + return IMSM_T_STATE_FAILED; + + /* reset 'failed' for next mirror set */ + if (!((i + 1) % device_per_mirror)) + failed = 0; + } + + return IMSM_T_STATE_DEGRADED; + } + case 5: + if (failed < 2) + return IMSM_T_STATE_DEGRADED; + else + return IMSM_T_STATE_FAILED; + break; + default: + break; + } + + return map->map_state; +} + +static int imsm_count_failed(struct intel_super *super, struct imsm_map *map) +{ + int i; + int failed = 0; + struct imsm_disk *disk; + + for (i = 0; i < map->num_members; i++) { + int idx = get_imsm_disk_idx(map, i); + + disk = get_imsm_disk(super, idx); + if (__le32_to_cpu(disk->status) & FAILED_DISK) + failed++; + } + + return failed; +} + +static void imsm_set_array_state(struct active_array *a, int consistent) +{ + int inst = a->info.container_member; + struct intel_super *super = a->container->sb; + struct imsm_dev *dev = get_imsm_dev(super, inst); + struct imsm_map *map = &dev->vol.map[0]; + int dirty = !consistent; + int failed; + __u8 map_state; + + if (a->resync_start == ~0ULL) { + failed = imsm_count_failed(super, map); + map_state = imsm_check_degraded(super, inst, failed); + if (!failed) + map_state = IMSM_T_STATE_NORMAL; + if (map->map_state != map_state) { + dprintf("imsm: map_state %d: %d\n", + inst, map_state); + map->map_state = map_state; + super->updates_pending++; + } + } + + if (dev->vol.dirty != dirty) { + dprintf("imsm: mark '%s' (%llu)\n", + dirty?"dirty":"clean", a->resync_start); + + dev->vol.dirty = dirty; + super->updates_pending++; + } +} + +static void imsm_set_disk(struct active_array *a, int n, int state) +{ + int inst = a->info.container_member; + struct intel_super *super = a->container->sb; + struct imsm_dev *dev = get_imsm_dev(super, 
inst); + struct imsm_map *map = dev->vol.map; + struct imsm_disk *disk; + __u32 status; + int failed = 0; + int new_failure = 0; + + if (n > map->num_members) + fprintf(stderr, "imsm: set_disk %d out of range 0..%d\n", + n, map->num_members - 1); + + if (n < 0) + return; + + dprintf("imsm: set_disk %d:%x\n", n, state); + + disk = get_imsm_disk(super, get_imsm_disk_idx(map, n)); + + /* check for new failures */ + status = __le32_to_cpu(disk->status); + if ((state & DS_FAULTY) && !(status & FAILED_DISK)) { + status |= FAILED_DISK; + disk->status = __cpu_to_le32(status); + new_failure = 1; + super->updates_pending++; + } + + /* the number of failures have changed, count up 'failed' to determine + * degraded / failed status + */ + if (new_failure && map->map_state != IMSM_T_STATE_FAILED) + failed = imsm_count_failed(super, map); + + /* determine map_state based on failed or in_sync count */ + if (failed) + map->map_state = imsm_check_degraded(super, inst, failed); + else if (map->map_state == IMSM_T_STATE_DEGRADED) { + struct mdinfo *d; + int working = 0; + + for (d = a->info.devs ; d ; d = d->next) + if (d->curr_state & DS_INSYNC) + working++; + + if (working == a->info.array.raid_disks) { + map->map_state = IMSM_T_STATE_NORMAL; + super->updates_pending++; + } + } +} + +static int store_imsm_mpb(int fd, struct intel_super *super) +{ + struct imsm_super *mpb = super->anchor; + __u32 mpb_size = __le32_to_cpu(mpb->mpb_size); + unsigned long long dsize; + unsigned long long sectors; + + get_dev_size(fd, NULL, &dsize); + + if (mpb_size > 512) { + /* -1 to account for anchor */ + sectors = mpb_sectors(mpb) - 1; + + /* write the extended mpb to the sectors preceeding the anchor */ + if (lseek64(fd, dsize - (512 * (2 + sectors)), SEEK_SET) < 0) + return 1; + + if (write(fd, super->buf + 512, 512 * sectors) != 512 * sectors) + return 1; + } + + /* first block is stored on second to last sector of the disk */ + if (lseek64(fd, dsize - (512 * 2), SEEK_SET) < 0) + return 1; + + 
if (write(fd, super->buf, 512) != 512) + return 1; + + return 0; +} + +static void imsm_sync_metadata(struct supertype *container) +{ + struct intel_super *super = container->sb; + + if (!super->updates_pending) + return; + + write_super_imsm(super, 0); + + super->updates_pending = 0; +} + +static struct mdinfo *imsm_activate_spare(struct active_array *a, + struct metadata_update **updates) +{ + /** + * Take a device that is marked spare in the metadata and use it to + * replace a failed/vacant slot in an array. There may be a case where + * a device is failed in one array but active in a second. + * imsm_process_update catches this case and does not clear the SPARE_DISK + * flag, allowing the second array to start using the device on failure. + * SPARE_DISK is cleared when all arrays are using a device. + * + * FIXME: is this a valid use of SPARE_DISK? + */ + + struct intel_super *super = a->container->sb; + int inst = a->info.container_member; + struct imsm_dev *dev = get_imsm_dev(super, inst); + struct imsm_map *map = dev->vol.map; + int failed = a->info.array.raid_disks; + struct mdinfo *rv = NULL; + struct mdinfo *d; + struct mdinfo *di; + struct metadata_update *mu; + struct dl *dl; + struct imsm_update_activate_spare *u; + int num_spares = 0; + int i; + + for (d = a->info.devs ; d ; d = d->next) { + if ((d->curr_state & DS_FAULTY) && + d->state_fd >= 0) + /* wait for Removal to happen */ + return NULL; + if (d->state_fd >= 0) + failed--; + } + + dprintf("imsm: activate spare: inst=%d failed=%d (%d) level=%d\n", + inst, failed, a->info.array.raid_disks, a->info.array.level); + if (imsm_check_degraded(super, inst, failed) != IMSM_T_STATE_DEGRADED) + return NULL; + + /* For each slot, if it is not working, find a spare */ + dl = super->disks; + for (i = 0; i < a->info.array.raid_disks; i++) { + for (d = a->info.devs ; d ; d = d->next) + if (d->disk.raid_disk == i) + break; + dprintf("found %d: %p %x\n", i, d, d?d->curr_state:0); + if (d && (d->state_fd >= 0)) + 
continue; + + /* OK, this device needs recovery. Find a spare */ + for ( ; dl ; dl = dl->next) { + unsigned long long esize; + unsigned long long pos; + struct mdinfo *d2; + struct extent *ex; + struct imsm_disk *disk; + int j; + int found; + __u32 array_start; + + /* If in this array, skip */ + for (d2 = a->info.devs ; d2 ; d2 = d2->next) + if (d2->disk.major == dl->major && + d2->disk.minor == dl->minor) { + dprintf("%x:%x already in array\n", dl->major, dl->minor); + break; + } + if (d2) + continue; + + /* is this unused device marked as a spare? */ + disk = get_imsm_disk(super, dl->index); + if (!(__le32_to_cpu(disk->status) & SPARE_DISK)) + continue; + + /* We are allowed to use this device - is there space? + * We need a->info.component_size sectors */ + ex = get_extents(super, dl); + if (!ex) { + dprintf("cannot get extents\n"); + continue; + } + found = 0; + j = 0; + pos = 0; + array_start = __le32_to_cpu(map->pba_of_lba0); + + do { + /* check that we can start at pba_of_lba0 with + * a->info.component_size of space + */ + esize = ex[j].start - pos; + if (array_start >= pos && + array_start + a->info.component_size < ex[j].start) { + found = 1; + break; + } + pos = ex[j].start + ex[j].size; + j++; + + } while (ex[j-1].size); + + free(ex); + if (!found) { + dprintf("%x:%x does not have %llu at %d\n", + dl->major, dl->minor, + a->info.component_size, + __le32_to_cpu(map->pba_of_lba0)); + /* No room */ + continue; + } + + /* found a usable disk with enough space */ + di = malloc(sizeof(*di)); + memset(di, 0, sizeof(*di)); + di->disk.number = dl->index; + di->disk.raid_disk = i; + di->disk.major = dl->major; + di->disk.minor = dl->minor; + di->disk.state = 0; + di->data_offset = array_start; + di->component_size = a->info.component_size; + di->container_member = inst; + di->next = rv; + rv = di; + num_spares++; + dprintf("%x:%x to be %d at %llu\n", dl->major, dl->minor, + i, pos); + + break; + } + } + + if (!rv) + /* No spares found */ + return rv; + /* Now 
'rv' has a list of devices to return. + * Create a metadata_update record to update the + * disk_ord_tbl for the array + */ + mu = malloc(sizeof(*mu)); + mu->buf = malloc(sizeof(struct imsm_update_activate_spare) * num_spares); + mu->space = NULL; + mu->len = sizeof(struct imsm_update_activate_spare) * num_spares; + mu->next = *updates; + u = (struct imsm_update_activate_spare *) mu->buf; + + for (di = rv ; di ; di = di->next) { + u->type = update_activate_spare; + u->disk_idx = di->disk.number; + u->slot = di->disk.raid_disk; + u->array = inst; + u->next = u + 1; + u++; + } + (u-1)->next = NULL; + *updates = mu; + + return rv; +} + +static int weight(unsigned int field) +{ + int weight; + + for (weight = 0; field; weight++) + field &= field - 1; + + return weight; +} + +static int disks_overlap(struct imsm_map *m1, struct imsm_map *m2) +{ + int i; + int j; + int idx; + + for (i = 0; i < m1->num_members; i++) { + idx = get_imsm_disk_idx(m1, i); + for (j = 0; j < m2->num_members; j++) + if (idx == get_imsm_disk_idx(m2, j)) + return 1; + } + + return 0; +} + +static void imsm_process_update(struct supertype *st, + struct metadata_update *update) +{ + /** + * crack open the metadata_update envelope to find the update record + * update can be one of: + * update_activate_spare - a spare device has replaced a failed + * device in an array, update the disk_ord_tbl. 
If this disk is + * present in all member arrays then also clear the SPARE_DISK + * flag + */ + struct intel_super *super = st->sb; + struct imsm_super *mpb = super->anchor; + enum imsm_update_type type = *(enum imsm_update_type *) update->buf; + + switch (type) { + case update_activate_spare: { + struct imsm_update_activate_spare *u = (void *) update->buf; + struct imsm_dev *dev = get_imsm_dev(super, u->array); + struct imsm_map *map = &dev->vol.map[0]; + struct active_array *a; + struct imsm_disk *disk; + __u32 status; + struct dl *dl; + struct mdinfo *d; + unsigned int members; + unsigned int found; + int victim; + int i; + + for (dl = super->disks; dl; dl = dl->next) + if (dl->index == u->disk_idx) + break; + + if (!dl) { + fprintf(stderr, "error: imsm_activate_spare passed " + "an unknown disk_idx: %d\n", u->disk_idx); + return; + } + + super->updates_pending++; + + victim = get_imsm_disk_idx(map, u->slot); + map->disk_ord_tbl[u->slot] = __cpu_to_le32(u->disk_idx); + disk = get_imsm_disk(super, u->disk_idx); + status = __le32_to_cpu(disk->status); + status |= CONFIGURED_DISK; + disk->status = __cpu_to_le32(status); + + /* map unique/live arrays using the spare */ + members = 0; + found = 0; + for (a = st->arrays; a; a = a->next) { + int inst = a->info.container_member; + + dev = get_imsm_dev(super, inst); + map = &dev->vol.map[0]; + if (map->raid_level > 0) + members |= 1 << inst; + for (d = a->info.devs; d; d = d->next) + if (d->disk.major == dl->major && + d->disk.minor == dl->minor) + found |= 1 << inst; + } + + /* until all arrays that can absorb this disk have absorbed + * this disk it can still be considered a spare + */ + if (weight(found) >= weight(members)) { + status = __le32_to_cpu(disk->status); + status &= ~SPARE_DISK; + disk->status = __cpu_to_le32(status); + } + + /* count arrays using the victim in the metadata */ + found = 0; + for (a = st->arrays; a ; a = a->next) { + dev = get_imsm_dev(super, a->info.container_member); + map = 
&dev->vol.map[0]; + for (i = 0; i < map->num_members; i++) + if (victim == get_imsm_disk_idx(map, i)) + found++; + } + + /* clear some flags if the victim is no longer being + * utilized anywhere + */ + disk = get_imsm_disk(super, victim); + if (!found) { + status = __le32_to_cpu(disk->status); + status &= ~(CONFIGURED_DISK | USABLE_DISK); + disk->status = __cpu_to_le32(status); + } + break; + } + case update_create_array: { + /* someone wants to create a new array, we need to be aware of + * a few races/collisions: + * 1/ 'Create' called by two separate instances of mdadm + * 2/ 'Create' versus 'activate_spare': mdadm has chosen + * devices that have since been assimilated via + * activate_spare. + * In the event this update can not be carried out mdadm will + * (FIX ME) notice that its update did not take hold. + */ + struct imsm_update_create_array *u = (void *) update->buf; + struct imsm_dev *dev; + struct imsm_map *map, *new_map; + unsigned long long start, end; + unsigned long long new_start, new_end; + int i; + int overlap = 0; + + /* handle racing creates: first come first serve */ + if (u->dev_idx < mpb->num_raid_devs) { + dprintf("%s: subarray %d already defined\n", + __func__, u->dev_idx); + return; + } + + /* check update is next in sequence */ + if (u->dev_idx != mpb->num_raid_devs) { + dprintf("%s: can not create arrays out of sequence\n", + __func__); + return; + } + + new_map = &u->dev.vol.map[0]; + new_start = __le32_to_cpu(new_map->pba_of_lba0); + new_end = new_start + __le32_to_cpu(new_map->blocks_per_member); + + /* handle activate_spare versus create race: + * check to make sure that overlapping arrays do not include + * overalpping disks + */ + for (i = 0; i < mpb->num_raid_devs; i++) { + dev = get_imsm_dev(super, i); + map = &dev->vol.map[0]; + start = __le32_to_cpu(map->pba_of_lba0); + end = start + __le32_to_cpu(map->blocks_per_member); + if ((new_start >= start && new_start <= end) || + (start >= new_start && start <= new_end)) + overlap = 
1; + if (overlap && disks_overlap(map, new_map)) { + dprintf("%s: arrays overlap\n", __func__); + return; + } + } + /* check num_members sanity */ + if (new_map->num_members > mpb->num_disks) { + dprintf("%s: num_disks out of range\n", __func__); + return; + } + + /* check that prepare update was successful */ + if (!update->space) { + dprintf("%s: prepare update failed\n", __func__); + return; + } + + super->updates_pending++; + dev = update->space; + update->space = NULL; + imsm_copy_dev(dev, &u->dev); + super->dev_tbl[u->dev_idx] = dev; + mpb->num_raid_devs++; + + /* fix up flags, if arrays overlap then the drives can not be + * spares + */ + for (i = 0; i < map->num_members; i++) { + struct imsm_disk *disk; + __u32 status; + + disk = get_imsm_disk(super, get_imsm_disk_idx(map, i)); + status = __le32_to_cpu(disk->status); + status |= CONFIGURED_DISK; + if (overlap) + status &= ~SPARE_DISK; + disk->status = __cpu_to_le32(status); + } + break; + } + } +} + +static void imsm_prepare_update(struct supertype *st, + struct metadata_update *update) +{ + /** + * Allocate space to hold new disk entries, raid-device entries or a + * new mpb if necessary. We currently maintain an mpb large enough to + * hold 2 subarrays for the given number of disks. This may not be + * sufficient when reshaping. + * + * FIX ME handle the reshape case. + * + * The monitor will be able to safely change super->mpb by arranging + * for it to be freed in check_update_queue(). I.e. the monitor thread + * will start using the new pointer and the manager can continue to use + * the old value until check_update_queue() runs. 
+ */ + enum imsm_update_type type = *(enum imsm_update_type *) update->buf; + + switch (type) { + case update_create_array: { + struct imsm_update_create_array *u = (void *) update->buf; + size_t len = sizeof_imsm_dev(&u->dev); + + update->space = malloc(len); + break; + default: + break; + } + } + + return; +} + +struct superswitch super_imsm = { +#ifndef MDASSEMBLE + .examine_super = examine_super_imsm, + .brief_examine_super = brief_examine_super_imsm, + .detail_super = detail_super_imsm, + .brief_detail_super = brief_detail_super_imsm, + .write_init_super = write_init_super_imsm, +#endif + .match_home = match_home_imsm, + .uuid_from_super= uuid_from_super_imsm, + .getinfo_super = getinfo_super_imsm, + .update_super = update_super_imsm, + + .avail_size = avail_size_imsm, + + .compare_super = compare_super_imsm, + + .load_super = load_super_imsm, + .init_super = init_super_imsm, + .add_to_super = add_to_super_imsm, + .store_super = store_zero_imsm, + .free_super = free_super_imsm, + .match_metadata_desc = match_metadata_desc_imsm, + .container_content = container_content_imsm, + + .validate_geometry = validate_geometry_imsm, + .external = 1, + +/* for mdmon */ + .open_new = imsm_open_new, + .load_super = load_super_imsm, + .set_array_state= imsm_set_array_state, + .set_disk = imsm_set_disk, + .sync_metadata = imsm_sync_metadata, + .activate_spare = imsm_activate_spare, + .process_update = imsm_process_update, + .prepare_update = imsm_prepare_update, +}; @@ -53,7 +53,7 @@ static unsigned long calc_sb0_csum(mdp_super_t *super) } -void super0_swap_endian(struct mdp_superblock_s *sb) +static void super0_swap_endian(struct mdp_superblock_s *sb) { /* as super0 superblocks are host-endian, it is sometimes * useful to be able to swap the endianness @@ -369,6 +369,8 @@ static void getinfo_super0(struct supertype *st, struct mdinfo *info) info->events = md_event(sb); info->data_offset = 0; + sprintf(info->text_version, "0.%d", sb->minor_version); + uuid_from_super0(st, 
info->uuid); if (sb->minor_version > 90 && (sb->reshape_position+1) != 0) { @@ -552,12 +554,14 @@ static int init_super0(struct supertype *st, mdu_array_info_t *info, unsigned long long size, char *ignored_name, char *homehost, int *uuid) { - mdp_super_t *sb = malloc(MD_SB_BYTES + sizeof(bitmap_super_t)); + mdp_super_t *sb; int spares; + + posix_memalign((void**)&sb, 512, MD_SB_BYTES + sizeof(bitmap_super_t)); memset(sb, 0, MD_SB_BYTES + sizeof(bitmap_super_t)); st->sb = sb; - if (info->major_version == -1) { + if (info == NULL) { /* zeroing the superblock */ return 0; } @@ -623,17 +627,38 @@ static int init_super0(struct supertype *st, mdu_array_info_t *info, return 1; } +struct devinfo { + int fd; + char *devname; + mdu_disk_info_t disk; + struct devinfo *next; +}; /* Add a device to the superblock being created */ -static void add_to_super0(struct supertype *st, mdu_disk_info_t *dinfo) +static void add_to_super0(struct supertype *st, mdu_disk_info_t *dinfo, + int fd, char *devname) { mdp_super_t *sb = st->sb; mdp_disk_t *dk = &sb->disks[dinfo->number]; + struct devinfo *di, **dip; dk->number = dinfo->number; dk->major = dinfo->major; dk->minor = dinfo->minor; dk->raid_disk = dinfo->raid_disk; dk->state = dinfo->state; + + sb->this_disk = sb->disks[dinfo->number]; + sb->sb_csum = calc_sb0_csum(sb); + + dip = (struct devinfo **)&st->info; + while (*dip) + dip = &(*dip)->next; + di = malloc(sizeof(struct devinfo)); + di->fd = fd; + di->devname = devname; + di->disk = *dinfo; + di->next = NULL; + *dip = di; } static int store_super0(struct supertype *st, int fd) @@ -661,7 +686,8 @@ static int store_super0(struct supertype *st, int fd) if (super->state & (1<<MD_SB_BITMAP_PRESENT)) { struct bitmap_super_s * bm = (struct bitmap_super_s*)(super+1); if (__le32_to_cpu(bm->magic) == BITMAP_MAGIC) - if (write(fd, bm, sizeof(*bm)) != sizeof(*bm)) + if (write(fd, bm, ROUND_UP(sizeof(*bm),512)) != + ROUND_UP(sizeof(*bm),512)) return 5; } @@ -669,32 +695,41 @@ static int 
store_super0(struct supertype *st, int fd) return 0; } -static int write_init_super0(struct supertype *st, - mdu_disk_info_t *dinfo, char *devname) +#ifndef MDASSEMBLE +static int write_init_super0(struct supertype *st) { mdp_super_t *sb = st->sb; - int fd = open(devname, O_RDWR|O_EXCL); - int rv; + int rv = 0; + struct devinfo *di; - if (fd < 0) { - fprintf(stderr, Name ": Failed to open %s to write superblock\n", devname); - return -1; - } + for (di = st->info ; di && ! rv ; di = di->next) { - sb->disks[dinfo->number].state &= ~(1<<MD_DISK_FAULTY); + if (di->disk.state == 1) + continue; + if (di->fd == -1) + continue; + Kill(di->devname, 0, 1, 1); + Kill(di->devname, 0, 1, 1); - sb->this_disk = sb->disks[dinfo->number]; - sb->sb_csum = calc_sb0_csum(sb); - rv = store_super0(st, fd); + sb->disks[di->disk.number].state &= ~(1<<MD_DISK_FAULTY); - if (rv == 0 && (sb->state & (1<<MD_SB_BITMAP_PRESENT))) - rv = st->ss->write_bitmap(st, fd); + sb->this_disk = sb->disks[di->disk.number]; + sb->sb_csum = calc_sb0_csum(sb); + rv = store_super0(st, di->fd); - close(fd); - if (rv) - fprintf(stderr, Name ": failed to write superblock to %s\n", devname); + if (rv == 0 && (sb->state & (1<<MD_SB_BITMAP_PRESENT))) + rv = st->ss->write_bitmap(st, di->fd); + + if (rv) + fprintf(stderr, + Name ": failed to write superblock to %s\n", + di->devname); + close(di->fd); + di->fd = -1; + } return rv; } +#endif static int compare_super0(struct supertype *st, struct supertype *tst) { @@ -712,7 +747,8 @@ static int compare_super0(struct supertype *st, struct supertype *tst) if (second->md_magic != MD_SB_MAGIC) return 1; if (!first) { - first = malloc(MD_SB_BYTES + sizeof(struct bitmap_super_s)); + posix_memalign((void**)&first, 512, + MD_SB_BYTES + sizeof(struct bitmap_super_s)); memcpy(first, second, MD_SB_BYTES + sizeof(struct bitmap_super_s)); st->sb = first; return 0; @@ -754,6 +790,9 @@ static int load_super0(struct supertype *st, int fd, char *devname) free_super0(st); + if 
(st->subarray[0]) + return 1; + if (!get_dev_size(fd, devname, &dsize)) return 1; @@ -778,7 +817,7 @@ static int load_super0(struct supertype *st, int fd, char *devname) return 1; } - super = malloc(MD_SB_BYTES + sizeof(bitmap_super_t)); + posix_memalign((void**)&super, 512, MD_SB_BYTES + sizeof(bitmap_super_t)+512); if (read(fd, super, sizeof(*super)) != MD_SB_BYTES) { if (devname) @@ -812,6 +851,7 @@ static int load_super0(struct supertype *st, int fd, char *devname) st->ss = &super0; st->minor_version = super->minor_version; st->max_devs = MD_SB_DISKS; + st->info = NULL; } /* Now check on the bitmap superblock */ @@ -821,8 +861,8 @@ static int load_super0(struct supertype *st, int fd, char *devname) * valid. If it doesn't clear the bit. An --assemble --force * should get that written out. */ - if (read(fd, super+1, sizeof(struct bitmap_super_s)) - != sizeof(struct bitmap_super_s)) + if (read(fd, super+1, ROUND_UP(sizeof(struct bitmap_super_s),512)) + != ROUND_UP(sizeof(struct bitmap_super_s),512)) goto no_bitmap; uuid_from_super0(st, uuid); @@ -843,7 +883,9 @@ static struct supertype *match_metadata_desc0(char *arg) struct supertype *st = malloc(sizeof(*st)); if (!st) return st; + memset(st, 0, sizeof(*st)); st->ss = &super0; + st->info = NULL; st->minor_version = 90; st->max_devs = MD_SB_DISKS; st->sb = NULL; @@ -919,7 +961,7 @@ static int add_internal_bitmap0(struct supertype *st, int *chunkp, } -void locate_bitmap0(struct supertype *st, int fd) +static void locate_bitmap0(struct supertype *st, int fd) { unsigned long long dsize; unsigned long long offset; @@ -939,7 +981,7 @@ void locate_bitmap0(struct supertype *st, int fd) lseek64(fd, offset, 0); } -int write_bitmap0(struct supertype *st, int fd) +static int write_bitmap0(struct supertype *st, int fd) { unsigned long long dsize; unsigned long long offset; @@ -948,7 +990,8 @@ int write_bitmap0(struct supertype *st, int fd) int rv = 0; int towrite, n; - char buf[4096]; + char abuf[4096+512]; + char *buf = 
(char*)(((long)(abuf+512))&~511UL); if (!get_dev_size(fd, NULL, &dsize)) return 1; @@ -964,21 +1007,19 @@ int write_bitmap0(struct supertype *st, int fd) if (lseek64(fd, offset + 4096, 0)< 0LL) return 3; - - if (write(fd, ((char*)sb)+MD_SB_BYTES, sizeof(bitmap_super_t)) != - sizeof(bitmap_super_t)) - return -2; - towrite = 64*1024 - MD_SB_BYTES - sizeof(bitmap_super_t); - memset(buf, 0xff, sizeof(buf)); + memset(buf, 0xff, 4096); + memcpy(buf, ((char*)sb)+MD_SB_BYTES, sizeof(bitmap_super_t)); + towrite = 64*1024; while (towrite > 0) { n = towrite; - if (n > sizeof(buf)) - n = sizeof(buf); + if (n > 4096) + n = 4096; n = write(fd, buf, n); if (n > 0) towrite -= n; else break; + memset(buf, 0xff, 4096); } fsync(fd); if (towrite) @@ -994,6 +1035,46 @@ static void free_super0(struct supertype *st) st->sb = NULL; } +static int validate_geometry0(struct supertype *st, int level, + int layout, int raiddisks, + int chunk, unsigned long long size, + char *subdev, unsigned long long *freesize, + int verbose) +{ + unsigned long long ldsize; + int fd; + + if (level == LEVEL_CONTAINER) + return 0; + if (raiddisks > MD_SB_DISKS) + return 0; + if (size > (0x7fffffffULL<<10)) + return 0; + if (!subdev) + return 1; + + fd = open(subdev, O_RDONLY|O_EXCL, 0); + if (fd < 0) { + if (verbose) + fprintf(stderr, Name ": super0.90 cannot open %s: %s\n", + subdev, strerror(errno)); + return 0; + } + + if (!get_dev_size(fd, subdev, &ldsize)) { + close(fd); + return 0; + } + close(fd); + + if (ldsize < MD_RESERVED_SECTORS * 512) + return 0; + if (size > (0x7fffffffULL<<10)) + return 0; + *freesize = MD_NEW_SIZE_SECTORS(ldsize >> 9); + return 1; +} + struct superswitch super0 = { #ifndef MDASSEMBLE .examine_super = examine_super0, @@ -1002,6 +1083,7 @@ struct superswitch super0 = { .detail_super = detail_super0, .brief_detail_super = brief_detail_super0, .export_detail_super = export_detail_super0, + .write_init_super = write_init_super0, #endif .match_home = match_home0, .uuid_from_super = 
uuid_from_super0, @@ -1010,7 +1092,6 @@ struct superswitch super0 = { .init_super = init_super0, .add_to_super = add_to_super0, .store_super = store_super0, - .write_init_super = write_init_super0, .compare_super = compare_super0, .load_super = load_super0, .match_metadata_desc = match_metadata_desc0, @@ -1019,6 +1100,5 @@ struct superswitch super0 = { .locate_bitmap = locate_bitmap0, .write_bitmap = write_bitmap0, .free_super = free_super0, - .major = 0, - .swapuuid = 0, + .validate_geometry = validate_geometry0, }; @@ -493,7 +493,7 @@ static void getinfo_super1(struct supertype *st, struct mdinfo *info) int role; info->array.major_version = 1; - info->array.minor_version = __le32_to_cpu(sb->feature_map); + info->array.minor_version = st->minor_version; info->array.patch_version = 0; info->array.raid_disks = __le32_to_cpu(sb->raid_disks); info->array.level = __le32_to_cpu(sb->level); @@ -531,6 +531,7 @@ static void getinfo_super1(struct supertype *st, struct mdinfo *info) info->disk.raid_disk = role; } info->events = __le64_to_cpu(sb->events); + sprintf(info->text_version, "1.%d", st->minor_version); memcpy(info->uuid, sb->set_uuid, 16); @@ -670,7 +671,7 @@ static int update_super1(struct supertype *st, struct mdinfo *info, __le64_to_cpu(sb->data_offset)) { /* set data_size to device size less data_offset */ struct misc_dev_info *misc = (struct misc_dev_info*) - (st->sb + 1024 + sizeof(struct bitmap_super_s)); + (st->sb + 1024 + 512); printf("Size was %llu\n", (unsigned long long) __le64_to_cpu(sb->data_size)); sb->data_size = __cpu_to_le64( @@ -688,15 +689,17 @@ static int update_super1(struct supertype *st, struct mdinfo *info, static int init_super1(struct supertype *st, mdu_array_info_t *info, unsigned long long size, char *name, char *homehost, int *uuid) { - struct mdp_superblock_1 *sb = malloc(1024 + sizeof(bitmap_super_t) + - sizeof(struct misc_dev_info)); + struct mdp_superblock_1 *sb; int spares; int rfd; char defname[10]; + + posix_memalign((void**)&sb, 
512, (1024 + 512 + + sizeof(struct misc_dev_info))); memset(sb, 0, 1024); st->sb = sb; - if (info->major_version == -1) { + if (info == NULL) { /* zeroing superblock */ return 0; } @@ -767,17 +770,39 @@ static int init_super1(struct supertype *st, mdu_array_info_t *info, return 1; } +struct devinfo { + int fd; + char *devname; + mdu_disk_info_t disk; + struct devinfo *next; +}; /* Add a device to the superblock being created */ -static void add_to_super1(struct supertype *st, mdu_disk_info_t *dk) +static void add_to_super1(struct supertype *st, mdu_disk_info_t *dk, + int fd, char *devname) { struct mdp_superblock_1 *sb = st->sb; __u16 *rp = sb->dev_roles + dk->number; + struct devinfo *di, **dip; + if ((dk->state & 6) == 6) /* active, sync */ *rp = __cpu_to_le16(dk->raid_disk); else if ((dk->state & ~2) == 0) /* active or idle -> spare */ *rp = 0xffff; else *rp = 0xfffe; + + sb->dev_number = __cpu_to_le32(dk->number); + sb->sb_csum = calc_sb_1_csum(sb); + + dip = (struct devinfo **)&st->info; + while (*dip) + dip = &(*dip)->next; + di = malloc(sizeof(struct devinfo)); + di->fd = fd; + di->devname = devname; + di->disk = *dk; + di->next = NULL; + *dip = di; } static void locate_bitmap1(struct supertype *st, int fd); @@ -834,6 +859,7 @@ static int store_super1(struct supertype *st, int fd) return 3; sbsize = sizeof(*sb) + 2 * __le32_to_cpu(sb->max_dev); + sbsize = (sbsize+511)&(~511UL); if (write(fd, sb, sbsize) != sbsize) return 4; @@ -843,7 +869,8 @@ static int store_super1(struct supertype *st, int fd) (((char*)sb)+1024); if (__le32_to_cpu(bm->magic) == BITMAP_MAGIC) { locate_bitmap1(st, fd); - if (write(fd, bm, sizeof(*bm)) != sizeof(*bm)) + if (write(fd, bm, ROUND_UP(sizeof(*bm),512)) != + ROUND_UP(sizeof(*bm),512)) return 5; } } @@ -866,123 +893,133 @@ static unsigned long choose_bm_space(unsigned long devsize) return 4*2; } -static int write_init_super1(struct supertype *st, - mdu_disk_info_t *dinfo, char *devname) +#ifndef MDASSEMBLE +static int 
write_init_super1(struct supertype *st) { struct mdp_superblock_1 *sb = st->sb; struct supertype refst; - int fd = open(devname, O_RDWR | O_EXCL); int rfd; - int rv; + int rv = 0; int bm_space; - + struct devinfo *di; unsigned long long dsize, array_size; long long sb_offset; + for (di = st->info; di && ! rv ; di = di->next) { + if (di->disk.state == 1) + continue; + if (di->fd < 0) + continue; - if (fd < 0) { - fprintf(stderr, Name ": Failed to open %s to write superblock\n", - devname); - return -1; - } + Kill(di->devname, 0, 1, 1); + Kill(di->devname, 0, 1, 1); - sb->dev_number = __cpu_to_le32(dinfo->number); - if (dinfo->state & (1<<MD_DISK_WRITEMOSTLY)) - sb->devflags |= __cpu_to_le32(WriteMostly1); + sb->dev_number = __cpu_to_le32(di->disk.number); + if (di->disk.state & (1<<MD_DISK_WRITEMOSTLY)) + sb->devflags |= __cpu_to_le32(WriteMostly1); - if ((rfd = open("/dev/urandom", O_RDONLY)) < 0 || - read(rfd, sb->device_uuid, 16) != 16) { - *(__u32*)(sb->device_uuid) = random(); - *(__u32*)(sb->device_uuid+4) = random(); - *(__u32*)(sb->device_uuid+8) = random(); - *(__u32*)(sb->device_uuid+12) = random(); - } - if (rfd >= 0) close(rfd); - sb->events = 0; - - refst =*st; - refst.sb = NULL; - if (load_super1(&refst, fd, NULL)==0) { - struct mdp_superblock_1 *refsb = refst.sb; - - memcpy(sb->device_uuid, refsb->device_uuid, 16); - if (memcmp(sb->set_uuid, refsb->set_uuid, 16)==0) { - /* same array, so preserve events and dev_number */ - sb->events = refsb->events; - /* bugs in 2.6.17 and earlier mean the dev_number - * chosen in Manage must be preserved - */ - if (get_linux_version() >= 2006018) - sb->dev_number = refsb->dev_number; + if ((rfd = open("/dev/urandom", O_RDONLY)) < 0 || + read(rfd, sb->device_uuid, 16) != 16) { + *(__u32*)(sb->device_uuid) = random(); + *(__u32*)(sb->device_uuid+4) = random(); + *(__u32*)(sb->device_uuid+8) = random(); + *(__u32*)(sb->device_uuid+12) = random(); + } + if (rfd >= 0) close(rfd); + sb->events = 0; + + refst =*st; + 
refst.sb = NULL; + if (load_super1(&refst, di->fd, NULL)==0) { + struct mdp_superblock_1 *refsb = refst.sb; + + memcpy(sb->device_uuid, refsb->device_uuid, 16); + if (memcmp(sb->set_uuid, refsb->set_uuid, 16)==0) { + /* same array, so preserve events and + * dev_number */ + sb->events = refsb->events; + /* bugs in 2.6.17 and earlier mean the + * dev_number chosen in Manage must be preserved + */ + if (get_linux_version() >= 2006018) + sb->dev_number = refsb->dev_number; + } + free(refsb); } - free(refsb); - } - - if (!get_dev_size(fd, NULL, &dsize)) - return 1; - dsize >>= 9; - if (dsize < 24) { - close(fd); - return 2; - } + if (!get_dev_size(di->fd, NULL, &dsize)) + return 1; + dsize >>= 9; + if (dsize < 24) { + close(di->fd); + return 2; + } - /* - * Calculate the position of the superblock. - * It is always aligned to a 4K boundary and - * depending on minor_version, it can be: - * 0: At least 8K, but less than 12K, from end of device - * 1: At start of device - * 2: 4K from start of device. - * Depending on the array size, we might leave extra space - * for a bitmap. - */ - array_size = __le64_to_cpu(sb->size); - /* work out how much space we left for a bitmap */ - bm_space = choose_bm_space(array_size); - switch(st->minor_version) { - case 0: - sb_offset = dsize; - sb_offset -= 8*2; - sb_offset &= ~(4*2-1); - sb->super_offset = __cpu_to_le64(sb_offset); - sb->data_offset = __cpu_to_le64(0); + /* + * Calculate the position of the superblock. + * It is always aligned to a 4K boundary and + * depending on minor_version, it can be: + * 0: At least 8K, but less than 12K, from end of device + * 1: At start of device + * 2: 4K from start of device. + * Depending on the array size, we might leave extra space + * for a bitmap. 
+ */ + array_size = __le64_to_cpu(sb->size); + /* work out how much space we left for a bitmap */ + bm_space = choose_bm_space(array_size); + + switch(st->minor_version) { + case 0: + sb_offset = dsize; + sb_offset -= 8*2; + sb_offset &= ~(4*2-1); + sb->super_offset = __cpu_to_le64(sb_offset); + sb->data_offset = __cpu_to_le64(0); if (sb_offset - bm_space < array_size) bm_space = sb_offset - array_size; - sb->data_size = __cpu_to_le64(sb_offset - bm_space); - break; - case 1: - sb->super_offset = __cpu_to_le64(0); - if (4*2 + bm_space + __le64_to_cpu(sb->size) > dsize) - bm_space = dsize - __le64_to_cpu(sb->size) - 4*2; - sb->data_offset = __cpu_to_le64(bm_space + 4*2); - sb->data_size = __cpu_to_le64(dsize - bm_space - 4*2); - break; - case 2: - sb_offset = 4*2; - sb->super_offset = __cpu_to_le64(4*2); - if (4*2 + 4*2 + bm_space + __le64_to_cpu(sb->size) > dsize) - bm_space = dsize - __le64_to_cpu(sb->size) - 4*2 - 4*2; - sb->data_offset = __cpu_to_le64(4*2 + 4*2 + bm_space); - sb->data_size = __cpu_to_le64(dsize - 4*2 - 4*2 - bm_space ); - break; - default: - return -EINVAL; - } + sb->data_size = __cpu_to_le64(sb_offset - bm_space); + break; + case 1: + sb->super_offset = __cpu_to_le64(0); + if (4*2 + bm_space + __le64_to_cpu(sb->size) > dsize) + bm_space = dsize - __le64_to_cpu(sb->size) -4*2; + sb->data_offset = __cpu_to_le64(bm_space + 4*2); + sb->data_size = __cpu_to_le64(dsize - bm_space - 4*2); + break; + case 2: + sb_offset = 4*2; + sb->super_offset = __cpu_to_le64(4*2); + if (4*2 + 4*2 + bm_space + __le64_to_cpu(sb->size) + > dsize) + bm_space = dsize - __le64_to_cpu(sb->size) + - 4*2 - 4*2; + sb->data_offset = __cpu_to_le64(4*2 + 4*2 + bm_space); + sb->data_size = __cpu_to_le64(dsize - 4*2 - 4*2 + - bm_space ); + break; + default: + return -EINVAL; + } - sb->sb_csum = calc_sb_1_csum(sb); - rv = store_super1(st, fd); - if (rv) - fprintf(stderr, Name ": failed to write superblock to %s\n", devname); + sb->sb_csum = calc_sb_1_csum(sb); + rv = 
store_super1(st, di->fd); + if (rv) + fprintf(stderr, + Name ": failed to write superblock to %s\n", + di->devname); - if (rv == 0 && (__le32_to_cpu(sb->feature_map) & 1)) - rv = st->ss->write_bitmap(st, fd); - close(fd); + if (rv == 0 && (__le32_to_cpu(sb->feature_map) & 1)) + rv = st->ss->write_bitmap(st, di->fd); + close(di->fd); + di->fd = -1; + } return rv; } +#endif static int compare_super1(struct supertype *st, struct supertype *tst) { @@ -1002,9 +1039,10 @@ static int compare_super1(struct supertype *st, struct supertype *tst) return 1; if (!first) { - first = malloc(1024+sizeof(bitmap_super_t) + + posix_memalign((void**)&first, 512, + 1024 + 512 + sizeof(struct misc_dev_info)); - memcpy(first, second, 1024+sizeof(bitmap_super_t) + + memcpy(first, second, 1024 + 512 + sizeof(struct misc_dev_info)); st->sb = first; return 0; @@ -1035,13 +1073,16 @@ static int load_super1(struct supertype *st, int fd, char *devname) free_super1(st); + if (st->subarray[0]) + return 1; + if (st->ss == NULL || st->minor_version == -1) { int bestvers = -1; struct supertype tst; __u64 bestctime = 0; /* guess... 
choose latest ctime */ + memset(&tst, 0, sizeof(tst)); tst.ss = &super1; - tst.sb = NULL; for (tst.minor_version = 0; tst.minor_version <= 2 ; tst.minor_version++) { switch(load_super1(&tst, fd, devname)) { case 0: super = tst.sb; @@ -1114,7 +1155,8 @@ static int load_super1(struct supertype *st, int fd, char *devname) return 1; } - super = malloc(1024 + sizeof(bitmap_super_t) + + posix_memalign((void**)&super, 512, + 1024 + 512 + sizeof(struct misc_dev_info)); if (read(fd, super, 1024) != 1024) { @@ -1151,7 +1193,7 @@ static int load_super1(struct supertype *st, int fd, char *devname) bsb = (struct bitmap_super_s *)(((char*)super)+1024); - misc = (struct misc_dev_info*) (bsb+1); + misc = (struct misc_dev_info*) (((char*)super)+1024+512); misc->device_size = dsize; /* Now check on the bitmap superblock */ @@ -1162,8 +1204,8 @@ static int load_super1(struct supertype *st, int fd, char *devname) * should get that written out. */ locate_bitmap1(st, fd); - if (read(fd, ((char*)super)+1024, sizeof(struct bitmap_super_s)) - != sizeof(struct bitmap_super_s)) + if (read(fd, ((char*)super)+1024, 512) + != 512) goto no_bitmap; uuid_from_super1(st, uuid); @@ -1183,6 +1225,7 @@ static struct supertype *match_metadata_desc1(char *arg) struct supertype *st = malloc(sizeof(*st)); if (!st) return st; + memset(st, 0, sizeof(*st)); st->ss = &super1; st->max_devs = 384; st->sb = NULL; @@ -1199,7 +1242,7 @@ static struct supertype *match_metadata_desc1(char *arg) return st; } if (strcmp(arg, "1") == 0 || - strcmp(arg, "default/large") == 0) { + strcmp(arg, "default") == 0) { st->minor_version = -1; return st; } @@ -1382,25 +1425,28 @@ static int write_bitmap1(struct supertype *st, int fd) int rv = 0; int towrite, n; - char buf[4096]; + char abuf[4096+512]; + char *buf = (char*)(((long)(abuf+512))&~511UL); locate_bitmap1(st, fd); - if (write(fd, ((char*)sb)+1024, sizeof(bitmap_super_t)) != - sizeof(bitmap_super_t)) - return -2; + memset(buf, 0xff, 4096); + memcpy(buf, ((char*)sb)+1024, 
sizeof(bitmap_super_t)); + towrite = __le64_to_cpu(bms->sync_size) / (__le32_to_cpu(bms->chunksize)>>9); towrite = (towrite+7) >> 3; /* bits to bytes */ - memset(buf, 0xff, sizeof(buf)); + towrite += sizeof(bitmap_super_t); + towrite = ROUND_UP(towrite, 512); while (towrite > 0) { n = towrite; - if (n > sizeof(buf)) - n = sizeof(buf); + if (n > 4096) + n = 4096; n = write(fd, buf, n); if (n > 0) towrite -= n; else break; + memset(buf, 0xff, 4096); } fsync(fd); if (towrite) @@ -1416,6 +1462,38 @@ static void free_super1(struct supertype *st) st->sb = NULL; } +static int validate_geometry1(struct supertype *st, int level, + int layout, int raiddisks, + int chunk, unsigned long long size, + char *subdev, unsigned long long *freesize, + int verbose) +{ + unsigned long long ldsize; + int fd; + + if (level == LEVEL_CONTAINER) + return 0; + if (!subdev) + return 1; + + fd = open(subdev, O_RDONLY|O_EXCL, 0); + if (fd < 0) { + if (verbose) + fprintf(stderr, Name ": super1.x cannot open %s: %s\n", + subdev, strerror(errno)); + return 0; + } + + if (!get_dev_size(fd, subdev, &ldsize)) { + close(fd); + return 0; + } + close(fd); + + *freesize = avail_size1(st, ldsize >> 9); + return 1; +} + struct superswitch super1 = { #ifndef MDASSEMBLE .examine_super = examine_super1, @@ -1424,6 +1502,7 @@ struct superswitch super1 = { .detail_super = detail_super1, .brief_detail_super = brief_detail_super1, .export_detail_super = export_detail_super1, + .write_init_super = write_init_super1, #endif .match_home = match_home1, .uuid_from_super = uuid_from_super1, @@ -1432,7 +1511,6 @@ struct superswitch super1 = { .init_super = init_super1, .add_to_super = add_to_super1, .store_super = store_super1, - .write_init_super = write_init_super1, .compare_super = compare_super1, .load_super = load_super1, .match_metadata_desc = match_metadata_desc1, @@ -1441,7 +1519,7 @@ struct superswitch super1 = { .locate_bitmap = locate_bitmap1, .write_bitmap = write_bitmap1, .free_super = free_super1, - .major 
= 1, + .validate_geometry = validate_geometry1, #if __BYTE_ORDER == BIG_ENDIAN .swapuuid = 0, #else @@ -34,10 +34,10 @@ int load_sys(char *path, char *buf) return -1; n = read(fd, buf, 1024); close(fd); - if (n <=0 || n >= 1024) + if (n <0 || n >= 1024) return -1; buf[n] = 0; - if (buf[n-1] == '\n') + if (n && buf[n-1] == '\n') buf[n-1] = 0; return 0; } @@ -56,6 +56,23 @@ void sysfs_free(struct mdinfo *sra) } } +int sysfs_open(int devnum, char *devname, char *attr) +{ + char fname[50]; + int fd; + + sprintf(fname, "/sys/block/%s/md/", devnum2devname(devnum)); + if (devname) { + strcat(fname, devname); + strcat(fname, "/"); + } + strcat(fname, attr); + fd = open(fname, O_RDWR); + if (fd < 0 && errno == EACCES) + fd = open(fname, O_RDONLY); + return fd; +} + struct mdinfo *sysfs_read(int fd, int devnum, unsigned long options) { /* Longest possible name in sysfs, mounted at /sys, is @@ -69,7 +86,7 @@ struct mdinfo *sysfs_read(int fd, int devnum, unsigned long options) char *dbase; struct mdinfo *sra; struct mdinfo *dev; - DIR *dir; + DIR *dir = NULL; struct dirent *de; sra = malloc(sizeof(*sra)); @@ -111,10 +128,12 @@ struct mdinfo *sysfs_read(int fd, int devnum, unsigned long options) sra->array.major_version = -1; sra->array.minor_version = -2; strcpy(sra->text_version, buf+9); - } else + } else { sscanf(buf, "%d.%d", &sra->array.major_version, &sra->array.minor_version); + strcpy(sra->text_version, buf); + } } if (options & GET_LEVEL) { strcpy(base, "level"); @@ -128,6 +147,12 @@ struct mdinfo *sysfs_read(int fd, int devnum, unsigned long options) goto abort; sra->array.layout = strtoul(buf, NULL, 0); } + if (options & GET_DISKS) { + strcpy(base, "raid_disks"); + if (load_sys(fname, buf)) + goto abort; + sra->array.raid_disks = strtoul(buf, NULL, 0); + } if (options & GET_COMPONENT) { strcpy(base, "component_size"); if (load_sys(fname, buf)) @@ -203,7 +228,7 @@ struct mdinfo *sysfs_read(int fd, int devnum, unsigned long options) strcpy(dbase, "size"); if 
(load_sys(fname, buf)) goto abort; - dev->component_size = strtoull(buf, NULL, 0); + dev->component_size = strtoull(buf, NULL, 0) * 2; } if (options & GET_STATE) { dev->disk.state = 0; @@ -224,9 +249,12 @@ struct mdinfo *sysfs_read(int fd, int devnum, unsigned long options) dev->errors = strtoul(buf, NULL, 0); } } + closedir(dir); return sra; abort: + if (dir) + closedir(dir); sysfs_free(sra); return NULL; } @@ -267,6 +295,7 @@ int sysfs_set_str(struct mdinfo *sra, struct mdinfo *dev, char fname[50]; int n; int fd; + sprintf(fname, "/sys/block/%s/md/%s/%s", sra->sys_name, dev?dev->sys_name:"", name); fd = open(fname, O_WRONLY); @@ -310,3 +339,240 @@ int sysfs_get_ll(struct mdinfo *sra, struct mdinfo *dev, return -1; return 0; } + +int sysfs_set_array(struct mdinfo *sra, + struct mdinfo *info) +{ + int rv = 0; + sra->array = info->array; + + if (info->array.level < 0) + return 0; /* FIXME */ + rv |= sysfs_set_str(sra, NULL, "level", + map_num(pers, info->array.level)); + rv |= sysfs_set_num(sra, NULL, "raid_disks", info->array.raid_disks); + rv |= sysfs_set_num(sra, NULL, "chunk_size", info->array.chunk_size); + rv |= sysfs_set_num(sra, NULL, "layout", info->array.layout); + rv |= sysfs_set_num(sra, NULL, "component_size", info->component_size/2); + rv |= sysfs_set_num(sra, NULL, "resync_start", info->resync_start); + sra->array = info->array; + return rv; +} + +int sysfs_add_disk(struct mdinfo *sra, struct mdinfo *sd) +{ + char dv[100]; + char nm[100]; + struct mdinfo *sd2; + char *dname; + int rv; + + sprintf(dv, "%d:%d", sd->disk.major, sd->disk.minor); + rv = sysfs_set_str(sra, NULL, "new_dev", dv); + if (rv) + return rv; + + memset(nm, 0, sizeof(nm)); + sprintf(dv, "/sys/dev/block/%d:%d", sd->disk.major, sd->disk.minor); + rv = readlink(dv, nm, sizeof(nm)); + if (rv <= 0) + return -1; + nm[rv] = '\0'; + dname = strrchr(nm, '/'); + if (dname) dname++; + strcpy(sd->sys_name, "dev-"); + strcpy(sd->sys_name+4, dname); + + rv = sysfs_set_num(sra, sd, "offset", 
sd->data_offset); + rv |= sysfs_set_num(sra, sd, "size", (sd->component_size+1) / 2); + if (sra->array.level != LEVEL_CONTAINER) { + rv |= sysfs_set_num(sra, sd, "slot", sd->disk.raid_disk); +// rv |= sysfs_set_str(sra, sd, "state", "in_sync"); + } + if (! rv) { + sd2 = malloc(sizeof(*sd2)); + *sd2 = *sd; + sd2->next = sra->devs; + sra->devs = sd2; + } + return rv; +} + +#if 0 +int sysfs_disk_to_sg(int fd) +{ + /* from an open block device, try find and open its corresponding + * scsi_generic interface + */ + struct stat st; + char path[256]; + char sg_path[256]; + char sg_major_minor[8]; + char *c; + DIR *dir; + struct dirent *de; + int major, minor, rv; + + if (fstat(fd, &st)) + return -1; + + snprintf(path, sizeof(path), "/sys/dev/block/%d:%d/device", + major(st.st_rdev), minor(st.st_rdev)); + + dir = opendir(path); + if (!dir) + return -1; + + de = readdir(dir); + while (de) { + if (strncmp("scsi_generic:", de->d_name, + strlen("scsi_generic:")) == 0) + break; + de = readdir(dir); + } + closedir(dir); + + if (!de) + return -1; + + snprintf(sg_path, sizeof(sg_path), "%s/%s/dev", path, de->d_name); + fd = open(sg_path, O_RDONLY); + if (fd < 0) + return fd; + + rv = read(fd, sg_major_minor, sizeof(sg_major_minor)); + close(fd); + if (rv < 0) + return -1; + else + sg_major_minor[rv - 1] = '\0'; + + c = strchr(sg_major_minor, ':'); + *c = '\0'; + c++; + major = strtol(sg_major_minor, NULL, 10); + minor = strtol(c, NULL, 10); + snprintf(path, sizeof(path), "/dev/.tmp.md.%d:%d:%d", + (int) getpid(), major, minor); + if (mknod(path, S_IFCHR|0600, makedev(major, minor))==0) { + fd = open(path, O_RDONLY); + unlink(path); + return fd; + } + + return -1; +} +#endif + +int sysfs_disk_to_scsi_id(int fd, __u32 *id) +{ + /* from an open block device, try to retrieve it scsi_id */ + struct stat st; + char path[256]; + char *c1, *c2; + DIR *dir; + struct dirent *de; + + if (fstat(fd, &st)) + return 1; + + snprintf(path, sizeof(path), "/sys/dev/block/%d:%d/device", + 
major(st.st_rdev), minor(st.st_rdev)); + + dir = opendir(path); + if (!dir) + return 1; + + de = readdir(dir); + while (de) { + if (strncmp("scsi_disk:", de->d_name, + strlen("scsi_disk:")) == 0) + break; + de = readdir(dir); + } + closedir(dir); + + if (!de) + return 1; + + c1 = strchr(de->d_name, ':'); + c1++; + c2 = strchr(c1, ':'); + *c2 = '\0'; + *id = strtol(c1, NULL, 10) << 24; /* host */ + c1 = c2 + 1; + c2 = strchr(c1, ':'); + *c2 = '\0'; + *id |= strtol(c1, NULL, 10) << 16; /* channel */ + c1 = c2 + 1; + c2 = strchr(c1, ':'); + *c2 = '\0'; + *id |= strtol(c1, NULL, 10) << 8; /* lun */ + c1 = c2 + 1; + *id |= strtol(c1, NULL, 10); /* id */ + + return 0; +} + + +int sysfs_unique_holder(int devnum, long rdev) +{ + /* Check that devnum is a holder of rdev, + * and is the only holder. + * we should be locked against races by + * an O_EXCL on devnum + */ + DIR *dir; + struct dirent *de; + char dirname[100]; + char l; + int found = 0; + sprintf(dirname, "/sys/dev/block/%d:%d/holders", + major(rdev), minor(rdev)); + dir = opendir(dirname); + errno = ENOENT; + if (!dir) + return 0; + l = strlen(dirname); + while ((de = readdir(dir)) != NULL) { + char buf[10]; + int n; + int mj, mn; + char c; + int fd; + + if (de->d_ino == 0) + continue; + if (de->d_name[0] == '.') + continue; + strcpy(dirname+l, "/"); + strcat(dirname+l, de->d_name); + strcat(dirname+l, "/dev"); + fd = open(dirname, O_RDONLY); + if (fd < 0) { + errno = ENOENT; + break; + } + n = read(fd, buf, sizeof(buf)-1); + close(fd); + buf[n] = 0; + if (sscanf(buf, "%d:%d%c", &mj, &mn, &c) != 3 || + c != '\n') { + errno = ENOENT; + break; + } + if (mj != MD_MAJOR) + mn = -1-(mn>>6); + + if (devnum != mn) { + errno = EEXIST; + break; + } + found = 1; + } + closedir(dir); + if (de) + return 0; + else + return found; +} @@ -174,6 +174,8 @@ do if [ -f "$script" ] then rm -f $targetdir/stderr + # stop all arrays, just incase some script left an array active. 
+ mdadm -Ssq # source script in a subshell, so it has access to our # namespace, but cannot change it. if ( set -ex ; . $script ) 2> $targetdir/log @@ -29,8 +29,13 @@ #include "mdadm.h" #include "md_p.h" +#include <sys/socket.h> #include <sys/utsname.h> +#include <sys/wait.h> +#include <sys/un.h> #include <ctype.h> +#include <dirent.h> +#include <signal.h> /* * following taken from linux/blkpg.h because they aren't @@ -389,6 +394,9 @@ int is_standard(char *dev, int *nump) /* tests if dev is a "standard" md dev name. * i.e if the last component is "/dNN" or "/mdNN", * where NN is a string of digits + * Returns 1 if a partitionable standard, + * -1 if non-partitonable, + * 0 if not a standard name. */ char *d = strrchr(dev, '/'); int type=0; @@ -608,6 +616,23 @@ char *human_size_brief(long long bytes) } #endif +unsigned long long calc_array_size(int level, int raid_disks, int layout, + int chunksize, unsigned long long devsize) +{ + int data_disks = 0; + switch (level) { + case 0: data_disks = raid_disks; break; + case 1: data_disks = 1; break; + case 4: + case 5: data_disks = raid_disks - 1; break; + case 6: data_disks = raid_disks - 2; break; + case 10: data_disks = raid_disks / (layout & 255) / ((layout>>8)&255); + break; + } + devsize &= ~(unsigned long long)((chunksize>>9)-1); + return data_disks * devsize; +} + #if !defined(MDASSEMBLE) || defined(MDASSEMBLE) && defined(MDASSEMBLE_AUTO) int get_mdp_major(void) { @@ -693,21 +718,6 @@ void put_md_name(char *name) unlink(name); } -static int dev2major(int d) -{ - if (d >= 0) - return MD_MAJOR; - else - return get_mdp_major(); -} - -static int dev2minor(int d) -{ - if (d >= 0) - return d; - return (-1-d) << MdpMinorShift; -} - int find_free_devnum(int use_partitions) { int devnum; @@ -749,19 +759,38 @@ int dev_open(char *dev, int flags) if (e > dev && *e == ':' && e[1] && (minor = strtoul(e+1, &e, 0)) >= 0 && *e == 0) { - snprintf(devname, sizeof(devname), "/dev/.tmp.md.%d:%d", major, minor); + snprintf(devname, 
sizeof(devname), "/dev/.tmp.md.%d:%d:%d", + (int)getpid(), major, minor); if (mknod(devname, S_IFBLK|0600, makedev(major, minor))==0) { - fd = open(devname, flags); + fd = open(devname, flags|O_DIRECT); unlink(devname); } } else - fd = open(dev, flags); + fd = open(dev, flags|O_DIRECT); return fd; } -struct superswitch *superlist[] = { &super0, &super1, NULL }; +int open_dev_excl(int devnum) +{ + char buf[20]; + int i; + + sprintf(buf, "%d:%d", dev2major(devnum), dev2minor(devnum)); + for (i=0 ; i<25 ; i++) { + int fd = dev_open(buf, O_RDWR|O_EXCL); + if (fd >= 0) + return fd; + if (errno != EBUSY) + return fd; + usleep(200000); + } + return -1; +} + +struct superswitch *superlist[] = { &super0, &super1, &super_ddf, &super_imsm, NULL }; #if !defined(MDASSEMBLE) || defined(MDASSEMBLE) && defined(MDASSEMBLE_AUTO) + struct supertype *super_by_fd(int fd) { mdu_array_info_t array; @@ -772,6 +801,7 @@ struct supertype *super_by_fd(int fd) char *verstr; char version[20]; int i; + char *subarray = NULL; sra = sysfs_read(fd, 0, GET_VERSION); @@ -791,40 +821,56 @@ struct supertype *super_by_fd(int fd) sprintf(version, "%d.%d", vers, minor); verstr = version; } + if (minor == -2 && verstr[0] == '/') { + char *dev = verstr+1; + subarray = strchr(dev, '/'); + int devnum; + if (subarray) + *subarray++ = '\0'; + devnum = devname2devnum(dev); + subarray = strdup(subarray); + if (sra) + sysfs_free(sra); + sra = sysfs_read(-1, devnum, GET_VERSION); + verstr = sra->text_version ? 
: "-no-metadata-"; + } + for (i = 0; st == NULL && superlist[i] ; i++) st = superlist[i]->match_metadata_desc(verstr); if (sra) sysfs_free(sra); - if (st) + if (st) { st->sb = NULL; + if (subarray) { + strncpy(st->subarray, subarray, 32); + st->subarray[31] = 0; + free(subarray); + } else + st->subarray[0] = 0; + } return st; } #endif /* !defined(MDASSEMBLE) || defined(MDASSEMBLE) && defined(MDASSEMBLE_AUTO) */ -struct supertype *dup_super(struct supertype *st) +struct supertype *dup_super(struct supertype *orig) { - struct supertype *stnew = NULL; - char *verstr = NULL; - char version[20]; - int i; + struct supertype *st; + if (!orig) + return orig; + st = malloc(sizeof(*st)); if (!st) return st; - - if (st->minor_version == -1) - sprintf(version, "%d", st->ss->major); - else - sprintf(version, "%d.%d", st->ss->major, st->minor_version); - verstr = version; - - for (i = 0; stnew == NULL && superlist[i] ; i++) - stnew = superlist[i]->match_metadata_desc(verstr); - - if (stnew) - stnew->sb = NULL; - return stnew; + memset(st, 0, sizeof(*st)); + st->ss = orig->ss; + st->max_devs = orig->max_devs; + st->minor_version = orig->minor_version; + strcpy(st->subarray, orig->subarray); + st->sb = NULL; + st->info = NULL; + return st; } struct supertype *guess_super(int fd) @@ -839,11 +885,10 @@ struct supertype *guess_super(int fd) int i; st = malloc(sizeof(*st)); - memset(st, 0, sizeof(*st)); for (i=0 ; superlist[i]; i++) { int rv; ss = superlist[i]; - st->ss = NULL; + memset(st, 0, sizeof(*st)); rv = ss->load_super(st, fd, NULL); if (rv == 0) { struct mdinfo info; @@ -858,7 +903,7 @@ struct supertype *guess_super(int fd) } if (bestsuper != -1) { int rv; - st->ss = NULL; + memset(st, 0, sizeof(*st)); rv = superlist[bestsuper]->load_super(st, fd, NULL); if (rv == 0) { superlist[bestsuper]->free_super(st); @@ -906,6 +951,236 @@ void get_one_disk(int mdfd, mdu_array_info_t *ainf, mdu_disk_info_t *disk) return; } +int open_container(int fd) +{ + /* 'fd' is a block device. 
Find out if it is in use + * by a container, and return an open fd on that container. + */ + char path[256]; + char *e; + DIR *dir; + struct dirent *de; + int dfd, n; + char buf[200]; + int major, minor; + struct stat st; + + if (fstat(fd, &st) != 0) + return -1; + sprintf(path, "/sys/dev/block/%d:%d/holders", + (int)major(st.st_rdev), (int)minor(st.st_rdev)); + e = path + strlen(path); + + dir = opendir(path); + if (!dir) + return -1; + while ((de = readdir(dir))) { + if (de->d_ino == 0) + continue; + if (de->d_name[0] == '.') + continue; + sprintf(e, "/%s/dev", de->d_name); + dfd = open(path, O_RDONLY); + if (dfd < 0) + continue; + n = read(dfd, buf, sizeof(buf)); + close(dfd); + if (n <= 0 || n >= sizeof(buf)) + continue; + buf[n] = 0; + if (sscanf(buf, "%d:%d", &major, &minor) != 2) + continue; + sprintf(buf, "%d:%d", major, minor); + dfd = dev_open(buf, O_RDONLY); + if (dfd >= 0) { + closedir(dir); + return dfd; + } + } + closedir(dir); + return -1; +} + +char *devnum2devname(int num) +{ + char name[100]; + if (num > 0) + sprintf(name, "md%d", num); + else + sprintf(name, "md_d%d", -1-num); + return strdup(name); +} + +int devname2devnum(char *name) +{ + char *ep; + int num; + if (strncmp(name, "md_d", 4)==0) + num = -1-strtoul(name+4, &ep, 10); + else + num = strtoul(name+2, &ep, 10); + return num; +} + +int fd2devnum(int fd) +{ + struct stat stb; + if (fstat(fd, &stb) == 0 && + (S_IFMT&stb.st_mode)==S_IFBLK) { + if (major(stb.st_rdev) == MD_MAJOR) + return minor(stb.st_rdev); + else + return -1- (minor(stb.st_rdev)>>6); + } + return -1; +} + +int mdmon_running(int devnum) +{ + char path[100]; + char pid[10]; + int fd; + int n; + sprintf(path, "/var/run/mdadm/%s.pid", devnum2devname(devnum)); + fd = open(path, O_RDONLY, 0); + + if (fd < 0) + return 0; + n = read(fd, pid, 9); + close(fd); + if (n <= 0) + return 0; + if (kill(atoi(pid), 0) == 0) + return 1; + return 0; +} + +int signal_mdmon(int devnum) +{ + char path[100]; + char pid[10]; + int fd; + int n; + 
sprintf(path, "/var/run/mdadm/%s.pid", devnum2devname(devnum)); + fd = open(path, O_RDONLY, 0); + + if (fd < 0) + return 0; + n = read(fd, pid, 9); + close(fd); + if (n <= 0) + return 0; + if (kill(atoi(pid), SIGUSR1) == 0) + return 1; + return 0; +} + +int start_mdmon(int devnum) +{ + int i; + int len; + pid_t pid; + int status; + char pathbuf[1024]; + char *paths[4] = { + pathbuf, + "/sbin/mdmon", + "mdmon", + NULL + }; + + if (env_no_mdmon()) + return 0; + + len = readlink("/proc/self/exe", pathbuf, sizeof(pathbuf)); + if (len > 0) { + char *sl; + pathbuf[len] = 0; + sl = strrchr(pathbuf, '/'); + if (sl) + sl++; + else + sl = pathbuf; + strcpy(sl, "mdmon"); + } else + pathbuf[0] = '\0'; + + switch(fork()) { + case 0: + /* FIXME yuk. CLOSE_EXEC?? */ + for (i=3; i < 100; i++) + close(i); + for (i=0; paths[i]; i++) + if (paths[i][0]) + execl(paths[i], "mdmon", + map_dev(dev2major(devnum), + dev2minor(devnum), + 1), NULL); + exit(1); + case -1: fprintf(stderr, Name ": cannot run mdmon. " + "Array remains readonly\n"); + return -1; + default: /* parent - good */ + pid = wait(&status); + if (pid < 0 || status != 0) + return -1; + } + return 0; +} + +int env_no_mdmon(void) +{ + char *val = getenv("MDADM_NO_MDMON"); + + if (val && atoi(val) == 1) + return 1; + + return 0; +} + + +int flush_metadata_updates(struct supertype *st) +{ + int sfd; + if (!st->updates) { + st->update_tail = NULL; + return -1; + } + + sfd = connect_monitor(devnum2devname(st->container_dev)); + if (sfd < 0) + return -1; + + while (st->updates) { + struct metadata_update *mu = st->updates; + st->updates = mu->next; + + send_message(sfd, mu, 0); + wait_reply(sfd, 0); + free(mu->buf); + free(mu); + } + ack(sfd, 0); + wait_reply(sfd, 0); + close(sfd); + st->update_tail = NULL; + return 0; +} + +void append_metadata_update(struct supertype *st, void *buf, int len) +{ + + struct metadata_update *mu = malloc(sizeof(*mu)); + + mu->buf = buf; + mu->len = len; + mu->space = NULL; + mu->next = NULL; + 
*st->update_tail = mu; + st->update_tail = &mu->next; +} + + #ifdef __TINYC__ /* tinyc doesn't optimize this check in ioctl.h out ... */ unsigned int __invalid_size_argument_for_IOC = 0; |