author     Jari Aalto <jari.aalto@cante.net>   2013-05-23 18:30:58 +0100
committer  Jari Aalto <jari.aalto@cante.net>   2013-05-23 18:30:58 +0100
commit     b4b24000fc06b35a9c20df920f58eff067dc8643 (patch)
tree       63b22179814b6da828b7672ac929a14e77702623
wiggle (0.9.1-1) unstable; urgency=low

  * New upstream release
    - Non-DFSG file DOC/diff.ps no longer included. See bug #698846.
  * debian/install
    - Rename from debian/wiggle.install.
  * debian/patches:
    - (10): Rm. Fix for a typo in ReadMe.c accepted upstream.
    - (20): New. Correct hyphen in manual page.
  * debian/watch
    - Add URL to check new releases. Thanks to Bart Martens <bartm@debian.org>.

# imported from the archive
-rw-r--r--  .gitignore  10
-rw-r--r--  .pc/.quilt_patches  1
-rw-r--r--  .pc/.quilt_series  1
-rw-r--r--  .pc/.version  1
-rw-r--r--  .pc/20-manpage.patch/wiggle.1  544
-rw-r--r--  .pc/applied-patches  1
-rw-r--r--  ANNOUNCE  95
-rw-r--r--  COPYING  339
-rw-r--r--  ChangeLog  4
-rw-r--r--  DOC/Algorithm  33
-rw-r--r--  INSTALL  11
-rw-r--r--  Makefile  57
-rw-r--r--  ReadMe.c  169
-rw-r--r--  TODO  318
-rw-r--r--  bestmatch.c  503
-rw-r--r--  ccan/build_assert/_info  49
-rw-r--r--  ccan/build_assert/build_assert.h  39
-rw-r--r--  ccan/build_assert/test/compile_fail-expr.c  10
-rw-r--r--  ccan/build_assert/test/compile_fail.c  9
-rw-r--r--  ccan/build_assert/test/compile_ok.c  7
-rw-r--r--  ccan/build_assert/test/run-BUILD_ASSERT_OR_ZERO.c  9
-rw-r--r--  ccan/hash/_info  31
-rw-r--r--  ccan/hash/hash.c  925
-rw-r--r--  ccan/hash/hash.h  312
-rw-r--r--  ccan/hash/test/api-hash_stable.c  300
-rw-r--r--  ccan/hash/test/run.c  149
-rw-r--r--  config.h  16
-rw-r--r--  debian/README.Debian  9
-rw-r--r--  debian/changelog  169
-rw-r--r--  debian/compat  1
-rw-r--r--  debian/control  21
-rw-r--r--  debian/copyright  38
-rw-r--r--  debian/docs  3
-rw-r--r--  debian/install  2
-rw-r--r--  debian/patches/20-manpage.patch  78
-rw-r--r--  debian/patches/series  1
-rwxr-xr-x  debian/rules  13
-rw-r--r--  debian/source/format  1
-rw-r--r--  debian/watch  2
-rw-r--r--  demo.orig/Makefile  53
-rw-r--r--  demo.orig/README  60
-rw-r--r--  demo.orig/vpatch.c  666
-rw-r--r--  demo.orig/wiggle.c  643
-rw-r--r--  demo.patched/Makefile  56
-rw-r--r--  demo.patched/README  70
-rw-r--r--  demo.patched/vpatch.c  667
-rw-r--r--  demo.patched/wiggle.c  643
-rw-r--r--  demo/383MdBlocked  271
-rw-r--r--  demo/Makefile  52
-rw-r--r--  demo/README  57
-rw-r--r--  demo/md.c  5769
-rw-r--r--  demo/vpatch.c  668
-rw-r--r--  demo/wiggle.c  643
-rw-r--r--  diff.c  635
-rwxr-xr-x  dotest  108
-rw-r--r--  extract.c  325
-rw-r--r--  get-p-options  8
-rw-r--r--  load.c  161
-rw-r--r--  merge2.c  640
-rw-r--r--  notes  134
-rwxr-xr-x  p  1155
-rw-r--r--  p.help  335
-rw-r--r--  parse.c  324
-rw-r--r--  patch_depends.c  92
-rw-r--r--  split.c  118
-rw-r--r--  tests/contrib/mod_tbill/merge  36
-rw-r--r--  tests/contrib/mod_tbill/orig  35
-rw-r--r--  tests/contrib/mod_tbill/patch  16
-rw-r--r--  tests/contrib/nmi.c/merge  471
-rw-r--r--  tests/contrib/nmi.c/orig  470
-rw-r--r--  tests/contrib/nmi.c/patch  8
-rw-r--r--  tests/contrib/pfkey_v2_parse.c/merge  1789
-rw-r--r--  tests/contrib/pfkey_v2_parse.c/orig  1778
-rw-r--r--  tests/contrib/pfkey_v2_parse.c/patch  57
-rw-r--r--  tests/contrib/xfaces/merge  7269
-rw-r--r--  tests/contrib/xfaces/orig  7253
-rw-r--r--  tests/contrib/xfaces/patch  51
-rw-r--r--  tests/linux/idmap.h/merge  18
-rw-r--r--  tests/linux/idmap.h/orig  0
-rw-r--r--  tests/linux/idmap.h/patch  17
-rw-r--r--  tests/linux/inode-fullpatch/diff  1330
-rw-r--r--  tests/linux/inode-fullpatch/merge  1358
-rw-r--r--  tests/linux/inode-fullpatch/orig  1323
-rw-r--r--  tests/linux/inode-fullpatch/patch  77
-rw-r--r--  tests/linux/inode-fullpatch/rediff  73
-rw-r--r--  tests/linux/inode-fullpatch/wmerge  1352
-rw-r--r--  tests/linux/inode-justrej/lmerge  1360
-rw-r--r--  tests/linux/inode-justrej/merge  1358
-rw-r--r--  tests/linux/inode-justrej/orig  1353
-rw-r--r--  tests/linux/inode-justrej/patch  16
-rw-r--r--  tests/linux/inode-justrej/wmerge  1352
-rw-r--r--  tests/linux/md-autostart/merge  4025
-rw-r--r--  tests/linux/md-autostart/orig  4025
-rw-r--r--  tests/linux/md-autostart/patch  27
-rw-r--r--  tests/linux/md-loop/1  3949
-rw-r--r--  tests/linux/md-loop/2  3949
-rw-r--r--  tests/linux/md-loop/merge  3962
-rw-r--r--  tests/linux/md-loop/orig  3960
-rw-r--r--  tests/linux/md-messy/diff  93
-rw-r--r--  tests/linux/md-messy/new  90
-rw-r--r--  tests/linux/md-messy/orig  91
-rw-r--r--  tests/linux/md-resync/merge  1911
-rw-r--r--  tests/linux/md-resync/orig  1848
-rw-r--r--  tests/linux/md-resync/patch  312
-rw-r--r--  tests/linux/md/diff  3680
-rw-r--r--  tests/linux/md/lmerge  3589
-rw-r--r--  tests/linux/md/merge  3589
-rw-r--r--  tests/linux/md/orig  3674
-rw-r--r--  tests/linux/md/patch  117
-rw-r--r--  tests/linux/md/rediff  101
-rw-r--r--  tests/linux/md/replace  0
-rw-r--r--  tests/linux/md/wmerge  3589
-rw-r--r--  tests/linux/nfsd-defines/merge  270
-rw-r--r--  tests/linux/nfsd-defines/orig  270
-rw-r--r--  tests/linux/nfsd-defines/patch  24
-rw-r--r--  tests/linux/raid1-A/merge  2333
-rw-r--r--  tests/linux/raid1-A/orig  2273
-rw-r--r--  tests/linux/raid1-A/patch  64
-rw-r--r--  tests/linux/raid5/orig  2079
-rw-r--r--  tests/linux/raid5/patch  962
-rw-r--r--  tests/linux/raid5build/merge  38
-rw-r--r--  tests/linux/raid5build/orig  15
-rw-r--r--  tests/linux/raid5build/patch  31
-rw-r--r--  tests/linux/raid5line/lmerge  7
-rw-r--r--  tests/linux/raid5line/merge  7
-rw-r--r--  tests/linux/raid5line/orig  1
-rw-r--r--  tests/linux/raid5line/patch  3
-rw-r--r--  tests/linux/raid5line/wmerge  1
-rw-r--r--  tests/linux/rpc_tcp_nonagle/merge  1528
-rw-r--r--  tests/linux/rpc_tcp_nonagle/orig  1511
-rw-r--r--  tests/linux/rpc_tcp_nonagle/patch  33
-rw-r--r--  tests/simple/all-different-2/lmerge  34
-rw-r--r--  tests/simple/all-different-2/merge  34
-rw-r--r--  tests/simple/all-different-2/new  10
-rw-r--r--  tests/simple/all-different-2/new2  10
-rw-r--r--  tests/simple/all-different-2/orig  10
-rw-r--r--  tests/simple/all-different-2/wmerge  10
-rw-r--r--  tests/simple/all-different/lmerge  35
-rw-r--r--  tests/simple/all-different/merge  37
-rw-r--r--  tests/simple/all-different/new  11
-rw-r--r--  tests/simple/all-different/new2  11
-rw-r--r--  tests/simple/all-different/orig  11
-rw-r--r--  tests/simple/all-different/wmerge  11
-rw-r--r--  tests/simple/already-applied/merge  3
-rw-r--r--  tests/simple/already-applied/new  2
-rw-r--r--  tests/simple/already-applied/new2  2
-rw-r--r--  tests/simple/already-applied/orig  3
-rw-r--r--  tests/simple/base/diff  23
-rw-r--r--  tests/simple/base/ldiff  25
-rw-r--r--  tests/simple/base/merge  20
-rw-r--r--  tests/simple/base/new  21
-rw-r--r--  tests/simple/base/new2  21
-rw-r--r--  tests/simple/base/orig  20
-rw-r--r--  tests/simple/bothadd/lmerge  4
-rw-r--r--  tests/simple/bothadd/merge  4
-rw-r--r--  tests/simple/bothadd/new  3
-rw-r--r--  tests/simple/bothadd/new2  4
-rw-r--r--  tests/simple/bothadd/orig  4
-rw-r--r--  tests/simple/brokenlines/diff  7
-rw-r--r--  tests/simple/brokenlines/merge  5
-rw-r--r--  tests/simple/brokenlines/new  3
-rw-r--r--  tests/simple/brokenlines/new2  3
-rw-r--r--  tests/simple/brokenlines/orig  5
-rw-r--r--  tests/simple/changeafteradd/merge  5
-rw-r--r--  tests/simple/changeafteradd/new  6
-rw-r--r--  tests/simple/changeafteradd/new2  6
-rw-r--r--  tests/simple/changeafteradd/orig  5
-rw-r--r--  tests/simple/conflict/diff  5
-rw-r--r--  tests/simple/conflict/ldiff  6
-rw-r--r--  tests/simple/conflict/merge  16
-rw-r--r--  tests/simple/conflict/new  4
-rw-r--r--  tests/simple/conflict/new2  4
-rw-r--r--  tests/simple/conflict/orig  4
-rw-r--r--  tests/simple/conflict/wmerge  4
-rw-r--r--  tests/simple/conflictmixed/diff  5
-rw-r--r--  tests/simple/conflictmixed/ldiff  6
-rw-r--r--  tests/simple/conflictmixed/lmerge  14
-rw-r--r--  tests/simple/conflictmixed/merge  16
-rw-r--r--  tests/simple/conflictmixed/new  4
-rw-r--r--  tests/simple/conflictmixed/new2  4
-rw-r--r--  tests/simple/conflictmixed/orig  4
-rw-r--r--  tests/simple/conflictmixed/wmerge  4
-rw-r--r--  tests/simple/multideletes/lmerge  2
-rw-r--r--  tests/simple/multideletes/merge  2
-rw-r--r--  tests/simple/multideletes/new  8
-rw-r--r--  tests/simple/multideletes/new2  5
-rw-r--r--  tests/simple/multideletes/orig  5
-rw-r--r--  tests/simple/multiple-add/lmerge  17
-rw-r--r--  tests/simple/multiple-add/merge  17
-rw-r--r--  tests/simple/multiple-add/new  9
-rw-r--r--  tests/simple/multiple-add/new2  9
-rw-r--r--  tests/simple/multiple-add/orig  9
-rw-r--r--  tests/simple/multiple-add/wmerge  9
-rw-r--r--  tests/simple/show-wiggle-1/Wmerge  20
-rw-r--r--  tests/simple/show-wiggle-1/new  5
-rw-r--r--  tests/simple/show-wiggle-1/new2  5
-rw-r--r--  tests/simple/show-wiggle-1/orig  6
-rw-r--r--  tests/simple/show-wiggle-2/Wmerge  13
-rw-r--r--  tests/simple/show-wiggle-2/new  5
-rw-r--r--  tests/simple/show-wiggle-2/new2  5
-rw-r--r--  tests/simple/show-wiggle-2/orig  5
-rw-r--r--  tests/simple/trivial-conflict/merge  7
-rw-r--r--  tests/simple/trivial-conflict/orig  1
-rw-r--r--  tests/simple/trivial-conflict/patch  5
-rw-r--r--  vpatch.c  2409
-rw-r--r--  wiggle.1  544
-rw-r--r--  wiggle.c  813
-rw-r--r--  wiggle.h  201
-rw-r--r--  wiggle.spec  55
209 files changed, 111604 insertions, 0 deletions
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..92f6f2c
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,10 @@
+*.o
+demo.patch
+wiggle
+TAGS
+.time
+wiggle.man
+.version-*
+version
+DIST
+RCS
diff --git a/.pc/.quilt_patches b/.pc/.quilt_patches
new file mode 100644
index 0000000..6857a8d
--- /dev/null
+++ b/.pc/.quilt_patches
@@ -0,0 +1 @@
+debian/patches
diff --git a/.pc/.quilt_series b/.pc/.quilt_series
new file mode 100644
index 0000000..c206706
--- /dev/null
+++ b/.pc/.quilt_series
@@ -0,0 +1 @@
+series
diff --git a/.pc/.version b/.pc/.version
new file mode 100644
index 0000000..0cfbf08
--- /dev/null
+++ b/.pc/.version
@@ -0,0 +1 @@
+2
diff --git a/.pc/20-manpage.patch/wiggle.1 b/.pc/20-manpage.patch/wiggle.1
new file mode 100644
index 0000000..c4ed0dd
--- /dev/null
+++ b/.pc/20-manpage.patch/wiggle.1
@@ -0,0 +1,544 @@
+.\" -*- nroff -*-
+.\" wiggle - apply rejected patches
+.\"
+.\" Copyright (C) 2003 Neil Brown <neilb@cse.unsw.edu.au>
+.\" Copyright (C) 2010 Neil Brown <neilb@suse.de>
+.\"
+.\"
+.\" This program is free software; you can redistribute it and/or modify
+.\" it under the terms of the GNU General Public License as published by
+.\" the Free Software Foundation; either version 2 of the License, or
+.\" (at your option) any later version.
+.\"
+.\" This program is distributed in the hope that it will be useful,
+.\" but WITHOUT ANY WARRANTY; without even the implied warranty of
+.\" MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+.\" GNU General Public License for more details.
+.\"
+.\" You should have received a copy of the GNU General Public License
+.\" along with this program; if not, write to the Free Software
+.\" Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+.\"
+.\" Author: Neil Brown
+.\" Email: <neilb@suse.de>
+.\"
+.TH WIGGLE 1 "" v0.9.1
+.SH NAME
+wiggle \- apply rejected patches and perform word-wise diffs
+
+.SH SYNOPSIS
+
+.BI wiggle " [function] [options] file [files]"
+
+.SH DESCRIPTION
+The main function of
+.I wiggle
+is to apply a patch to a file in a similar manner to the
+.BR patch (1)
+program.
+
+The distinctive difference of
+.I wiggle
+is that it will attempt to apply a patch even if the "before" part of
+the patch doesn't match the target file perfectly.
+This is achieved by breaking the file and patch into words and finding
+the best alignment of words in the file with words in the patch.
+Once this alignment has been found, any differences (word-wise) in the
+patch are applied to the file as best as possible.
+
+Also,
+.I wiggle
+will (in some cases) detect changes that have already been applied,
+and will ignore them.
+
+.I wiggle
+ensures that every change in the patch is applied to the target
+file somehow. If a particular change cannot be made in the file, the
+file is annotated to show where the change should be made in a similar
+way to the
+.BR merge (1)
+program with the
+.B \-A
+option.
+Each annotation contains 3 components: a portion of the original file
+where the change should be applied, a portion of the patch that
+couldn't be matched precisely in the file, and the text that should
+replace that portion of the patch. These are separated by lines
+containing precisely 7 identical characters, either '<', '|', '=', or '>', so
+.in +5
+.nf
+.ft CW
+<<<<<<<
+Some portion of the original file
+|||||||
+text to replace
+=======
+text to replace it with
+>>>>>>>
+.ft
+.fi
+.in -5
+
+indicates that "text to replace" should be replaced by "text to
+replace it with" somewhere in the portion of the original file.
+However
+.I wiggle
+was not able to find a place to make this change.
+
+.I wiggle
+can also produce conflict reports showing only the words that are
+involved rather than showing whole lines.
+In this case the output looks like:
+.ft CW
+.ti +5
+<<<---original|||old===new--->>>
+.ft
+
+A typical usage of
+.I wiggle
+is to run
+.I patch
+to apply some patch, and to collect a list of rejects by monitoring
+the error messages from patch. Then for each file for which a
+reject was found, run
+.ti +5
+wiggle \-\-replace originalfile originalfile.rej
+
+Finally each file must be examined to resolve any unresolved
+conflicts, and to make sure the applied patch is semantically correct.
+
+.SS OPTIONS
+The following options are understood by
+.IR wiggle .
+Some of these are explained in more detail in the following sections
+on MERGE, DIFF, EXTRACT, and BROWSE.
+
+.TP
+.BR \-m ", " \-\-merge
+Select the "merge" function. This is the default function.
+
+.TP
+.BR \-d ", " \-\-diff
+Select the "diff" function. This displays the differences between files.
+
+.TP
+.BR \-x ", " \-\-extract
+Select the "extract" function. This extracts one branch of a patch or
+merge file.
+
+.TP
+.BR \-B ", " \-\-browse
+Select the "browse" function. This is similar to "merge" only with a
+different presentation. Instead of the result simply being sent to
+standard output, it is presented using an ncurses-based GUI so that
+each hunk of the patch can be examined to understand what conflicts
+were involved and what needed to be ignored in order for the patch to
+be wiggled into place.
+
+.TP
+.BR -w ", " \-\-words
+Request that all operations and display be word based. This is the
+default for the "diff" function.
+
+.TP
+.BR -l ", " \-\-lines
+Request that all operations and display be line based.
+
+.TP
+.BR -p ", " \-\-patch
+Treat the last named file as a patch instead of a file (with \-\-diff)
+or a merge (\-\-extract).
+In
+.I merge
+or
+.B browse
+mode,
+.B -p
+requires there be exactly one file which is a patch and which can
+contain patches to multiple files. The patches are merged into each
+file. When used in
+.I merge
+mode, this usage requires the
+.B \-\-replace
+option as writing lots of merged files to standard-out is impractical.
+
+When processing a multi-file patch,
+.B -p
+can be followed by a numeric argument indicating how many file name
+components should be stripped from files named in the patch file. If no
+numeric argument is given,
+.I wiggle
+will deduce an appropriate number based on what files are visible.
+
+.TP
+.BR -r ", " \-\-replace
+Normally the merged output is written to standard-output. With
+\-\-replace, the original file is replaced with the merge output.
+
+.TP
+.BR -R ", " \-\-reverse
+When used with the "diff" function, swap the files before calculating
+the differences.
+When used with the "merge" function,
+.I wiggle
+attempts to revert changes rather than apply them.
+
+.TP
+.BR -i ", " \-\-no\-ignore
+Normally wiggle will ignore changes in the patch which appear to
+already have been applied in the original. With this flag those
+changes are reported as conflicts rather than being ignored.
+
+.TP
+.BR -W ", " \-\-show\-wiggle
+When used with
+.IR \-\-merge ,
+conflicts that can be wiggled into place are reported as conflicts
+with an extra stanza which shows what the result would be if this flag
+had not been used. The extra stanza is introduced with a line
+containing 7 ampersand
+.RB ( & )
+characters thus:
+.in +5
+.nf
+.ft CW
+<<<<<<<
+Some portion of the original file
+|||||||
+text to replace
+=======
+text to replace it with
+&&&&&&&
+Text that would result from a successful wiggle
+>>>>>>>
+.ft
+.fi
+.in -5
+
+.TP
+.BR -h ", " \-\-help
+Print a simple help message. If given after one of the function
+selectors (\-\-merge, \-\-diff, \-\-extract) help specific to that function
+is displayed.
+
+.TP
+.BR -V ", " \-\-version
+Display the version number of
+.IR wiggle .
+
+.TP
+.BR -v ", " \-\-verbose
+Enable verbose mode. Currently this makes no difference.
+
+.TP
+.BR -q ", " \-\-quiet
+Enable quiet mode. This suppresses the message from the merge
+function when there are unresolvable conflicts.
+
+.SS WORDS
+.I wiggle
+can divide a text into lines or words when performing its tasks.
+A line is simply a string of characters terminated by a newline.
+A word is either a maximal contiguous string of alphanumerics
+(including underscore), a maximal contiguous string of space or tab
+characters, or any other single character.
+
+.SS MERGE
+The merge function modifies a given text by finding all changes between
+two other texts and imposing those changes on the given text.
+
+Normally
+.I wiggle
+focuses on which words have changed so as to maximise the possibility
+of finding a good match in the given text for the context of a given
+change. However it can consider only whole lines instead.
+
+.I wiggle
+extracts the three texts that it needs from files listed on the
+command line. Either 1, 2, or 3 files may be listed, and any one of
+them may be a lone hyphen signifying standard-input.
+
+If one file is given and the
+.B \-p
+option is not present, the file is treated as a
+.B merge
+file, i.e. the output of "merge \-A" or "wiggle". Such a file
+implicitly contains three streams and these are extracted and
+compared.
+
+If two files are given, then the first simply contains the primary
+text, and the second is treated as a patch file (the output of "diff\ \-u"
+or "diff\ \-c", or a ".rej" file from
+.IR patch )
+and the two other texts
+are extracted from that.
+
+If one file is given together with the
+.B \-p
+option, the file is treated as a patch file containing the names of
+the files that it patches. In this case multiple merge operations can
+happen and each takes one stream from a file named in the patch, and
+the other two from the patch itself. The
+.B \-\-replace
+option is required and the results are written back to the
+target files.
+
+Finally if three files are listed, they are taken to contain the given
+text and the two other texts, in order.
+
+Normally the result of the merge is written to standard-output.
+However if the "\-r" flag is given, the output is written to a file
+which replaces the original given file. In this case the original file
+is renamed to have a
+.B .porig
+suffix (for "patched original" which makes sense if you first use
+.I patch
+to apply a patch, and then use
+.I wiggle
+to wiggle the rejects in).
+
+If no errors occur (such as file access errors)
+.I wiggle
+will exit with a status of 0 if all changes were successfully merged,
+and with an exit status of 1 and a brief message if any changes could
+not be fully merged and were instead inserted as annotations.
+
+The merge function can operate in three different modes with respect
+to lines or words.
+
+With the
+.B \-\-lines
+option, whole lines are compared and any conflicts
+are reported as whole lines that need to be replaced.
+
+With the
+.B \-\-words
+option, individual words are compared and any
+conflicts are reported just covering the words affected. This uses
+the \f(CW <<<|||===>>> \fP conflict format.
+
+Without either of these options, a hybrid approach is taken.
+Individual words are compared and merged, but when a conflict is found
+the whole surrounding line is reported as being in conflict.
+
+.I wiggle
+will ensure that every change between the two other texts is reflected
+in the result of the merge somehow. There are four different ways
+that a change can be reflected.
+.IP 1
+If a change converts
+.B A
+to
+.B B
+and
+.B A
+is found at a suitable place in the original file, it is
+replaced with
+.BR B .
+This includes the possibility that
+.B B
+is empty, but
+not that
+.B A
+is empty.
+
+.IP 2
+If a change is found which simply adds
+.B B
+and the text immediately preceding and following the insertion are
+found adjacent in the original file in a suitable place, then
+.B B
+is inserted between those adjacent texts.
+
+.IP 3
+If a change is found which changes
+.B A
+to
+.B B
+and this appears (based on context) to align with
+.B B
+in the original, then it is assumed that this change has already been
+applied, and the change is ignored. When this happens, a message
+reflecting the number of ignored changes is printed by
+.IR wiggle .
+This optimisation can be suppressed with the
+.B \-i
+flag.
+
+.IP 4
+If a change is found that does not fit any of the above possibilities,
+then a conflict is reported as described earlier.
+
+.SS DIFF
+
+The diff function is provided primarily to allow inspection of the
+alignments that
+.I wiggle
+calculated between texts and that it uses for performing a merge.
+
+The output of the diff function is similar to the unified output of
+diff. However while diff does not output long stretches of common text,
+.IR wiggle 's
+diff mode outputs everything.
+
+When calculating a word-based alignment (the default),
+.I wiggle
+may need to show these word-based differences. This is done using an
+extension to the unified-diff format. If a line starts with a
+vertical bar, then it may contain sections surrounded by special
+multi-character brackets. The brackets "<<<++" and "++>>>" surround
+added text while "<<<--" and "-->>>" surround removed text.
+
+.I wiggle
+can be given the two texts to compare in one of three ways.
+
+If only one file is given, then it is treated as a patch and the two
+branches of that diff are compared. This effectively allows a patch
+to be refined from a line-based patch to a word-based patch.
+
+If two files are given, then they are normally assumed to be simple
+texts to be compared.
+
+If two files are given along with the \-\-patch option, then the second
+file is assumed to be a patch and either the first (with \-1) or the
+second (with \-2) branch is extracted and compared with text found in
+the first file.
+
+This last option causes
+.I wiggle
+to apply a "best-fit" algorithm for aligning patch hunks with the
+file before computing the differences. This algorithm is used when
+merging a patch with a file, and its value can be seen by comparing
+the difference produced this way with the difference produced by first
+extracting one branch of a patch into a file, and then computing the
+difference of that file with the main file.
+
+
+.SS EXTRACT
+
+The extract function of
+.I wiggle
+simply exposes the internal functionality for extracting one branch of
+a patch or a merge file.
+
+Precisely one file should be given, and it will be assumed to be a
+merge file unless
+.B \-\-patch
+is given, in which case a patch is assumed.
+
+The choice of branch is made by providing one of
+.BR -1 ,
+.BR -2 ,
+or
+.B -3
+with obvious meanings.
+
+.SS BROWSE
+
+The browse function of
+.I wiggle
+presents the result of a merge in a text-based GUI that can be
+navigated using keystrokes similar to vi(1) or emacs(1).
+
+The browser allows each of the three streams to be viewed individually
+with colours used to highlight different sorts of text - green for
+added text, red for deleted text etc. It can also show the patch by
+itself, the full result of the merge, or the merge and the patch
+side-by-side.
+
+The browser provides a number of context-sensitive help pages which
+can be accessed by typing '?'.
+
+.SH WARNING
+
+Caution should always be exercised when applying a rejected patch with
+.IR wiggle .
+When
+.I patch
+rejects a patch, it does so for a good reason. Even though
+.I wiggle
+may be able to find a believable place to apply each textual change,
+there is no guarantee that the result is correct in any semantic
+sense. The result should always be inspected to make sure it is
+correct.
+
+.SH EXAMPLES
+
+.B " wiggle \-\-replace file file.rej"
+.br
+This is the normal usage of
+.I wiggle
+and will take any changes in
+.B file.rej
+that
+.I patch
+could not apply, and merge them into
+.BR file .
+
+.B " wiggle -dp1 file file.rej"
+.br
+This will perform a word-wise comparison between the
+.B file
+and the
+.I before
+branch of the diff in
+.B file.rej
+and display the differences. This allows you to see where a given
+patch would apply.
+
+.B " wiggle \-\-merge \-\-help"
+.br
+Get help about the merge function of
+.IR wiggle .
+
+.B " wiggle --browse --patch update.patch"
+.br
+Parse the
+.B update.patch
+file for patches and present a list of patched files which can be
+browsed to examine each patch in detail.
+
+.SH QUOTE
+The name of
+.I wiggle
+was inspired by the following quote. However
+.I wiggle
+does not yet
+.B help
+you to wiggle a patch into place. It either does the wiggle itself,
+or leaves it for you to finish off.
+
+.nf
+The problem I find is that I often want to take
+ (file1+patch) -> file2,
+when I don't have file1. But merge tools want to take
+ (file1|file2) -> file3.
+I haven't seen a graphical tool which helps you to wiggle a patch
+into a file.
+
+\-\- Andrew Morton - 2002
+.fi
+
+.SH SHORTCOMINGS
+.IP -
+.I wiggle
+cannot read the extended unified-diff output that it produces for
+\-\-diff \-\-words.
+
+.IP -
+.I wiggle
+cannot read the word-based merge format that it produces for \-\-merge
+\-\-words.
+
+.SH AUTHOR
+
+Neil Brown at Computer Science and Engineering at
+The University of New South Wales, Sydney, Australia;
+and later at SUSE, still in Sydney, Australia.
+
+.SH SEE ALSO
+.IR patch (1),
+.IR diff (1),
+.IR merge (1),
+.IR wdiff (1),
+.IR diff3 (1).
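The WORDS section of the manual page above defines wiggle's tokens: a word is a maximal run of alphanumerics and underscores, a maximal run of spaces or tabs, or any other single character. The C fragment below is a minimal sketch of that rule for illustration only; it is not wiggle's split.c, and the function name word_len is invented.

#include <ctype.h>
#include <stdio.h>

/* Sketch of the word-splitting rule from the WORDS section above
 * (hypothetical code, not wiggle's split.c): a word is a maximal run
 * of alphanumerics/underscore, a maximal run of spaces/tabs, or any
 * other single character. */
static int word_len(const char *s)
{
	int n = 1;

	if (isalnum((unsigned char)*s) || *s == '_')
		while (isalnum((unsigned char)s[n]) || s[n] == '_')
			n++;
	else if (*s == ' ' || *s == '\t')
		while (s[n] == ' ' || s[n] == '\t')
			n++;
	return n;		/* anything else is a single-character word */
}

int main(void)
{
	const char *text = "if (x < 10)\treturn x_y;\n";
	const char *p;

	for (p = text; *p; p += word_len(p))
		printf("[%.*s]", word_len(p), p);
	printf("\n");
	return 0;
}

Splitting at this granularity is what lets a change at the end of a line be applied even when an unrelated word earlier in the same line has since been edited.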
diff --git a/.pc/applied-patches b/.pc/applied-patches
new file mode 100644
index 0000000..42835a2
--- /dev/null
+++ b/.pc/applied-patches
@@ -0,0 +1 @@
+20-manpage.patch
diff --git a/ANNOUNCE b/ANNOUNCE
new file mode 100644
index 0000000..d84e935
--- /dev/null
+++ b/ANNOUNCE
@@ -0,0 +1,95 @@
+ANNOUNCE: wiggle - a tool for applying patches with conflicts
+
+I am pleased to announce the first public release of 'wiggle'.
+
+Wiggle is a program for applying patches that 'patch' cannot
+apply due to conflicting changes in the original.
+
+Wiggle will always apply all changes in the patch to the original.
+If it cannot find a way to cleanly apply a patch, it inserts it
+in the original in a manner similar to 'merge', and reports an
+unresolvable conflict. Such a conflict will look like:
+
+<<<<<<<
+Some text from
+the original file
+|||||||
+Some text that the patch changes
+=======
+Some text that is the result of the patch
+>>>>>>>
+
+with the meaning that the "text that the patch changes"
+was expected somewhere in the "text from the original file"
+and should be replaced with "the result of the patch".
+
+wiggle analyses the file and the patch in terms of words rather than
+whole lines and so is able to find matches that patch is
+unable to find. If a patch changes a word at the end of a line, and
+a word at the start of that line has been modified since the patch
+was made, then wiggle will have no trouble applying the patch.
+
+wiggle has proved very useful for back-porting patches that were
+generated for the development kernel, onto the stable kernel.
+Sometimes it does exactly the right thing with the patch. When it doesn't
+it reports a conflict which is easy to resolve with an understanding of
+what the code and the patch were trying to achieve.
+
+Wiggle is available under the GPL and can be fetched from:
+
+ http://www.cse.unsw.edu.au/~neilb/source/wiggle/
+
+The name 'wiggle' was inspired by Andrew Morton's comment:
+
+ The problem I find is that I often want to take
+ (file1+patch) -> file2,
+ when I don't have file1. But merge tools want to take
+ (file1|file2) -> file3.
+ I haven't seen a graphical tool which helps you to wiggle a patch
+ into a file.
+
+which google can find for you:
+ http://www.google.com/search?q=graphical+tool+which+helps+you+to+wiggle+a+patch
+
+It isn't a graphical tool, but it is a good first step.
+
+NOTES:
+
+This release contains a 'tests' directory with a number of test cases
+that have proved invaluable in developing the program and my
+understanding of the subtleties of some of the issues involved. If you
+find a case where wiggle behaves sub-optimally (e.g. dumps core),
+please consider sending me a test case to add to the tests directory.
+
+This release also contains a script 'p' and accompanying 'p.help'.
+This is a script that I use for patch management for my kernel patches
+and it makes use of wiggle to allow me to apply patches that
+'patch' cannot manage. It is included both as an example of
+how wiggle can be used, and as a tool that some might find useful.
+
+One shortcoming I find with wiggle is that I would like to be able
+to 'see' what it has done. I would love it if someone were to write
+a program that allowed the results of wiggle to be visualised.
+The closest that I have come to imagining a workable UI is to
+have two side-by-side windows, one of which shows the original patch,
+and the other shows a "diff -u" of before and after wiggle has done its
+thing, and to have these windows automatically aligned so that when
+a change is shown in one, the corresponding change appears in the other.
+Maybe something like tkdiff, but that knows about patches and knows
+about word-based diffs....
+
+Wiggle is also able to perform a function similar to 'diff' and show the
+differences and similarities between two files. It can show these differences
+and similarities at a word-by-word level. The output format is not machine
+readable as the character sequences used to delimit inserted and deleted
+words are not quoted in the output. Hence this format will probably change
+at some stage and should not be depended upon.
+
+If you read the source, beware of comments: they were probably written
+while I was still trying to understand the issues myself, and so are
+probably wrong and out-of-date. I would like to review all the code and
+comments, but if I wait until I do that before releasing it, it'll never
+get released!
+
+NeilBrown
+The University of New South Wales
diff --git a/COPYING b/COPYING
new file mode 100644
index 0000000..d159169
--- /dev/null
+++ b/COPYING
@@ -0,0 +1,339 @@
+ GNU GENERAL PUBLIC LICENSE
+ Version 2, June 1991
+
+ Copyright (C) 1989, 1991 Free Software Foundation, Inc.,
+ 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+ Preamble
+
+ The licenses for most software are designed to take away your
+freedom to share and change it. By contrast, the GNU General Public
+License is intended to guarantee your freedom to share and change free
+software--to make sure the software is free for all its users. This
+General Public License applies to most of the Free Software
+Foundation's software and to any other program whose authors commit to
+using it. (Some other Free Software Foundation software is covered by
+the GNU Lesser General Public License instead.) You can apply it to
+your programs, too.
+
+ When we speak of free software, we are referring to freedom, not
+price. Our General Public Licenses are designed to make sure that you
+have the freedom to distribute copies of free software (and charge for
+this service if you wish), that you receive source code or can get it
+if you want it, that you can change the software or use pieces of it
+in new free programs; and that you know you can do these things.
+
+ To protect your rights, we need to make restrictions that forbid
+anyone to deny you these rights or to ask you to surrender the rights.
+These restrictions translate to certain responsibilities for you if you
+distribute copies of the software, or if you modify it.
+
+ For example, if you distribute copies of such a program, whether
+gratis or for a fee, you must give the recipients all the rights that
+you have. You must make sure that they, too, receive or can get the
+source code. And you must show them these terms so they know their
+rights.
+
+ We protect your rights with two steps: (1) copyright the software, and
+(2) offer you this license which gives you legal permission to copy,
+distribute and/or modify the software.
+
+ Also, for each author's protection and ours, we want to make certain
+that everyone understands that there is no warranty for this free
+software. If the software is modified by someone else and passed on, we
+want its recipients to know that what they have is not the original, so
+that any problems introduced by others will not reflect on the original
+authors' reputations.
+
+ Finally, any free program is threatened constantly by software
+patents. We wish to avoid the danger that redistributors of a free
+program will individually obtain patent licenses, in effect making the
+program proprietary. To prevent this, we have made it clear that any
+patent must be licensed for everyone's free use or not licensed at all.
+
+ The precise terms and conditions for copying, distribution and
+modification follow.
+
+ GNU GENERAL PUBLIC LICENSE
+ TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
+
+ 0. This License applies to any program or other work which contains
+a notice placed by the copyright holder saying it may be distributed
+under the terms of this General Public License. The "Program", below,
+refers to any such program or work, and a "work based on the Program"
+means either the Program or any derivative work under copyright law:
+that is to say, a work containing the Program or a portion of it,
+either verbatim or with modifications and/or translated into another
+language. (Hereinafter, translation is included without limitation in
+the term "modification".) Each licensee is addressed as "you".
+
+Activities other than copying, distribution and modification are not
+covered by this License; they are outside its scope. The act of
+running the Program is not restricted, and the output from the Program
+is covered only if its contents constitute a work based on the
+Program (independent of having been made by running the Program).
+Whether that is true depends on what the Program does.
+
+ 1. You may copy and distribute verbatim copies of the Program's
+source code as you receive it, in any medium, provided that you
+conspicuously and appropriately publish on each copy an appropriate
+copyright notice and disclaimer of warranty; keep intact all the
+notices that refer to this License and to the absence of any warranty;
+and give any other recipients of the Program a copy of this License
+along with the Program.
+
+You may charge a fee for the physical act of transferring a copy, and
+you may at your option offer warranty protection in exchange for a fee.
+
+ 2. You may modify your copy or copies of the Program or any portion
+of it, thus forming a work based on the Program, and copy and
+distribute such modifications or work under the terms of Section 1
+above, provided that you also meet all of these conditions:
+
+ a) You must cause the modified files to carry prominent notices
+ stating that you changed the files and the date of any change.
+
+ b) You must cause any work that you distribute or publish, that in
+ whole or in part contains or is derived from the Program or any
+ part thereof, to be licensed as a whole at no charge to all third
+ parties under the terms of this License.
+
+ c) If the modified program normally reads commands interactively
+ when run, you must cause it, when started running for such
+ interactive use in the most ordinary way, to print or display an
+ announcement including an appropriate copyright notice and a
+ notice that there is no warranty (or else, saying that you provide
+ a warranty) and that users may redistribute the program under
+ these conditions, and telling the user how to view a copy of this
+ License. (Exception: if the Program itself is interactive but
+ does not normally print such an announcement, your work based on
+ the Program is not required to print an announcement.)
+
+These requirements apply to the modified work as a whole. If
+identifiable sections of that work are not derived from the Program,
+and can be reasonably considered independent and separate works in
+themselves, then this License, and its terms, do not apply to those
+sections when you distribute them as separate works. But when you
+distribute the same sections as part of a whole which is a work based
+on the Program, the distribution of the whole must be on the terms of
+this License, whose permissions for other licensees extend to the
+entire whole, and thus to each and every part regardless of who wrote it.
+
+Thus, it is not the intent of this section to claim rights or contest
+your rights to work written entirely by you; rather, the intent is to
+exercise the right to control the distribution of derivative or
+collective works based on the Program.
+
+In addition, mere aggregation of another work not based on the Program
+with the Program (or with a work based on the Program) on a volume of
+a storage or distribution medium does not bring the other work under
+the scope of this License.
+
+ 3. You may copy and distribute the Program (or a work based on it,
+under Section 2) in object code or executable form under the terms of
+Sections 1 and 2 above provided that you also do one of the following:
+
+ a) Accompany it with the complete corresponding machine-readable
+ source code, which must be distributed under the terms of Sections
+ 1 and 2 above on a medium customarily used for software interchange; or,
+
+ b) Accompany it with a written offer, valid for at least three
+ years, to give any third party, for a charge no more than your
+ cost of physically performing source distribution, a complete
+ machine-readable copy of the corresponding source code, to be
+ distributed under the terms of Sections 1 and 2 above on a medium
+ customarily used for software interchange; or,
+
+ c) Accompany it with the information you received as to the offer
+ to distribute corresponding source code. (This alternative is
+ allowed only for noncommercial distribution and only if you
+ received the program in object code or executable form with such
+ an offer, in accord with Subsection b above.)
+
+The source code for a work means the preferred form of the work for
+making modifications to it. For an executable work, complete source
+code means all the source code for all modules it contains, plus any
+associated interface definition files, plus the scripts used to
+control compilation and installation of the executable. However, as a
+special exception, the source code distributed need not include
+anything that is normally distributed (in either source or binary
+form) with the major components (compiler, kernel, and so on) of the
+operating system on which the executable runs, unless that component
+itself accompanies the executable.
+
+If distribution of executable or object code is made by offering
+access to copy from a designated place, then offering equivalent
+access to copy the source code from the same place counts as
+distribution of the source code, even though third parties are not
+compelled to copy the source along with the object code.
+
+ 4. You may not copy, modify, sublicense, or distribute the Program
+except as expressly provided under this License. Any attempt
+otherwise to copy, modify, sublicense or distribute the Program is
+void, and will automatically terminate your rights under this License.
+However, parties who have received copies, or rights, from you under
+this License will not have their licenses terminated so long as such
+parties remain in full compliance.
+
+ 5. You are not required to accept this License, since you have not
+signed it. However, nothing else grants you permission to modify or
+distribute the Program or its derivative works. These actions are
+prohibited by law if you do not accept this License. Therefore, by
+modifying or distributing the Program (or any work based on the
+Program), you indicate your acceptance of this License to do so, and
+all its terms and conditions for copying, distributing or modifying
+the Program or works based on it.
+
+ 6. Each time you redistribute the Program (or any work based on the
+Program), the recipient automatically receives a license from the
+original licensor to copy, distribute or modify the Program subject to
+these terms and conditions. You may not impose any further
+restrictions on the recipients' exercise of the rights granted herein.
+You are not responsible for enforcing compliance by third parties to
+this License.
+
+ 7. If, as a consequence of a court judgment or allegation of patent
+infringement or for any other reason (not limited to patent issues),
+conditions are imposed on you (whether by court order, agreement or
+otherwise) that contradict the conditions of this License, they do not
+excuse you from the conditions of this License. If you cannot
+distribute so as to satisfy simultaneously your obligations under this
+License and any other pertinent obligations, then as a consequence you
+may not distribute the Program at all. For example, if a patent
+license would not permit royalty-free redistribution of the Program by
+all those who receive copies directly or indirectly through you, then
+the only way you could satisfy both it and this License would be to
+refrain entirely from distribution of the Program.
+
+If any portion of this section is held invalid or unenforceable under
+any particular circumstance, the balance of the section is intended to
+apply and the section as a whole is intended to apply in other
+circumstances.
+
+It is not the purpose of this section to induce you to infringe any
+patents or other property right claims or to contest validity of any
+such claims; this section has the sole purpose of protecting the
+integrity of the free software distribution system, which is
+implemented by public license practices. Many people have made
+generous contributions to the wide range of software distributed
+through that system in reliance on consistent application of that
+system; it is up to the author/donor to decide if he or she is willing
+to distribute software through any other system and a licensee cannot
+impose that choice.
+
+This section is intended to make thoroughly clear what is believed to
+be a consequence of the rest of this License.
+
+ 8. If the distribution and/or use of the Program is restricted in
+certain countries either by patents or by copyrighted interfaces, the
+original copyright holder who places the Program under this License
+may add an explicit geographical distribution limitation excluding
+those countries, so that distribution is permitted only in or among
+countries not thus excluded. In such case, this License incorporates
+the limitation as if written in the body of this License.
+
+ 9. The Free Software Foundation may publish revised and/or new versions
+of the General Public License from time to time. Such new versions will
+be similar in spirit to the present version, but may differ in detail to
+address new problems or concerns.
+
+Each version is given a distinguishing version number. If the Program
+specifies a version number of this License which applies to it and "any
+later version", you have the option of following the terms and conditions
+either of that version or of any later version published by the Free
+Software Foundation. If the Program does not specify a version number of
+this License, you may choose any version ever published by the Free Software
+Foundation.
+
+ 10. If you wish to incorporate parts of the Program into other free
+programs whose distribution conditions are different, write to the author
+to ask for permission. For software which is copyrighted by the Free
+Software Foundation, write to the Free Software Foundation; we sometimes
+make exceptions for this. Our decision will be guided by the two goals
+of preserving the free status of all derivatives of our free software and
+of promoting the sharing and reuse of software generally.
+
+ NO WARRANTY
+
+ 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY
+FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN
+OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES
+PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED
+OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS
+TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE
+PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING,
+REPAIR OR CORRECTION.
+
+ 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
+WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR
+REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES,
+INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING
+OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED
+TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY
+YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER
+PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGES.
+
+ END OF TERMS AND CONDITIONS
+
+ How to Apply These Terms to Your New Programs
+
+ If you develop a new program, and you want it to be of the greatest
+possible use to the public, the best way to achieve this is to make it
+free software which everyone can redistribute and change under these terms.
+
+ To do so, attach the following notices to the program. It is safest
+to attach them to the start of each source file to most effectively
+convey the exclusion of warranty; and each file should have at least
+the "copyright" line and a pointer to where the full notice is found.
+
+ <one line to give the program's name and a brief idea of what it does.>
+ Copyright (C) <year> <name of author>
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License along
+ with this program; if not, write to the Free Software Foundation, Inc.,
+ 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+Also add information on how to contact you by electronic and paper mail.
+
+If the program is interactive, make it output a short notice like this
+when it starts in an interactive mode:
+
+ Gnomovision version 69, Copyright (C) year name of author
+ Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
+ This is free software, and you are welcome to redistribute it
+ under certain conditions; type `show c' for details.
+
+The hypothetical commands `show w' and `show c' should show the appropriate
+parts of the General Public License. Of course, the commands you use may
+be called something other than `show w' and `show c'; they could even be
+mouse-clicks or menu items--whatever suits your program.
+
+You should also get your employer (if you work as a programmer) or your
+school, if any, to sign a "copyright disclaimer" for the program, if
+necessary. Here is a sample; alter the names:
+
+ Yoyodyne, Inc., hereby disclaims all copyright interest in the program
+ `Gnomovision' (which makes passes at compilers) written by James Hacker.
+
+ <signature of Ty Coon>, 1 April 1989
+ Ty Coon, President of Vice
+
+This General Public License does not permit incorporating your program into
+proprietary programs. If your program is a subroutine library, you may
+consider it more useful to permit linking proprietary applications with the
+library. If this is what you want to do, use the GNU Lesser General
+Public License instead of this License.
diff --git a/ChangeLog b/ChangeLog
new file mode 100644
index 0000000..8337da0
--- /dev/null
+++ b/ChangeLog
@@ -0,0 +1,4 @@
+
+ - Don't use --quiet in dotest as it is a Debian specific extension
+ to /usr/bin/time
+
diff --git a/DOC/Algorithm b/DOC/Algorithm
new file mode 100644
index 0000000..515ba07
--- /dev/null
+++ b/DOC/Algorithm
@@ -0,0 +1,33 @@
+
+This directory previously contained a copy of
+ An O (ND) Difference Algorithm and Its Variations
+by EUGENE W. MYERS
+
+However it isn't clear that I have the right to redistribute this, so
+I've removed it. It can easily be found by searching the internet.
+
+The code in wiggle differs from the algorithm presented in that paper
+in one fairly minor way.
+
+The paper describes how to find an optimal path or "snake" through the
+edit graph, but only stores the end-point and cost of the snake, not
+the full path (as that would require order-n^2 space).
+
+It then suggests that you run the same algorithm concurrently but in
+reverse from the end of the graph towards the start. When you find
+that the longest snakes in both directions cross, you have a midpoint
+on the path.
+
+This is more useful than an end-point as you can recurse on either
+side and build up the full path using linear space and only doubling
+your work.
+
+Wiggle takes a different approach. Finding where the snakes cross
+seemed awkward to me, and having two blocks of similar but not
+identical code (one to search forward, one to search backwards) didn't
+appeal at all.
+
+So wiggle only searches forward, but it remembers where the half-way
+line was crossed, i.e. it remembers the 'x' value at which x+y
+changed from below to above (max_x + max_y)/2.
+This uses much the same amount of storage, but significantly less code.
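To make the forward-only variation described above concrete, here is a compact C sketch of the furthest-reaching-path search that also records, for each path, the 'x' value at which x+y first crosses the half-way line (n+m)/2. It is only an illustration of the idea: the names (half_diff, struct half_point) are invented, it compares bytes rather than wiggle's word symbols, and it is not the code in diff.c.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Hypothetical illustration of the idea described above, not wiggle's
 * diff.c: a forward-only Myers search that returns the edit distance
 * and the 'x' value at which the winning path first crossed the
 * half-way line (n+m)/2. */
struct half_point { int dist; int x; };

static struct half_point half_diff(const char *a, int n, const char *b, int m)
{
	int max = n + m, half = max / 2;
	int *v0 = calloc(2 * max + 3, sizeof(int));	/* furthest x per diagonal */
	int *m0 = malloc((2 * max + 3) * sizeof(int));	/* crossing x per diagonal */
	int *v = v0 + max + 1, *mid = m0 + max + 1;	/* allow index k in -max..max */
	struct half_point res = { -1, -1 };
	int d, k, i;

	for (i = 0; i < 2 * max + 3; i++)
		m0[i] = -1;			/* -1: half-way line not yet crossed */

	for (d = 0; d <= max && res.dist < 0; d++) {
		for (k = -d; k <= d; k += 2) {
			int x, y, mx;

			if (k == -d || (k != d && v[k - 1] < v[k + 1])) {
				x = v[k + 1];		/* step down: a symbol of b inserted */
				mx = mid[k + 1];
			} else {
				x = v[k - 1] + 1;	/* step right: a symbol of a deleted */
				mx = mid[k - 1];
			}
			y = x - k;
			if (mx < 0 && x + y >= half)
				mx = x;
			while (x < n && y < m && a[x] == b[y]) {
				x++; y++;		/* follow the diagonal "snake" */
				if (mx < 0 && x + y >= half)
					mx = x;
			}
			v[k] = x;
			mid[k] = mx;
			if (x >= n && y >= m) {		/* reached the end of both texts */
				res.dist = d;
				res.x = mx;
				break;
			}
		}
	}
	free(v0);
	free(m0);
	return res;
}

int main(void)
{
	const char *a = "ABCABBA", *b = "CBABAC";	/* example pair from Myers' paper */
	struct half_point r = half_diff(a, (int)strlen(a), b, (int)strlen(b));

	printf("edit distance %d, half-way line first crossed at x=%d\n", r.dist, r.x);
	return 0;
}

The recorded crossing point serves the same purpose as the meeting point of the forward and backward searches in the paper: it gives a split position around which the caller can recurse on each half while still using only linear space.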
diff --git a/INSTALL b/INSTALL
new file mode 100644
index 0000000..e106c89
--- /dev/null
+++ b/INSTALL
@@ -0,0 +1,11 @@
+
+To build and install wiggle, simply type:
+
+ make install
+
+This will install /usr/bin/wiggle and /usr/share/man/man1/wiggle.1
+
+You might like to inspect the Makefile and change
+ OptDbg=-ggdb
+to something that will compile faster code on your computer, such as
+ OptDbg=-O3 -march=pentium2
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..46ab769
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,57 @@
+
+# Note on my Mobile Pentium II, -march=pentium2 delivers twice the performance of i386
+#OptDbg=-O3
+#OptDbg=-O3 -march=pentium2
+OptDbg=-ggdb
+CFLAGS=$(OptDbg) -I. -Wall -Werror -Wstrict-prototypes -Wextra -Wno-unused-parameter
+
+# STRIP = -s
+INSTALL = /usr/bin/install
+DESTDIR =
+BINDIR = /usr/bin
+MANDIR = /usr/share/man
+MAN1DIR = $(MANDIR)/man1
+MAN5DIR = $(MANDIR)/man5
+LDLIBS = -lncurses
+
+all: wiggle wiggle.man test
+
+wiggle : wiggle.o load.o parse.o split.o extract.o diff.o bestmatch.o ReadMe.o \
+ merge2.o vpatch.o ccan/hash/hash.o
+wiggle.o load.o parse.o split.o extract.o diff.o bestmatch.o ReadMe.o \
+ merge2.o vpatch.o :: wiggle.h
+split.o :: ccan/hash/hash.h config.h
+
+
+test: wiggle dotest
+ ./dotest
+
+wiggle.man : wiggle.1
+ nroff -man wiggle.1 > wiggle.man
+
+clean:
+ rm -f *.o ccan/hash/*.o *.man wiggle .version* demo.patch version
+ find . -name core -o -name '*.tmp*' -o -name .tmp -o -name .time | xargs rm -f
+
+install : wiggle wiggle.1
+ $(INSTALL) -D $(STRIP) -m 755 wiggle $(DESTDIR)$(BINDIR)/wiggle
+ $(INSTALL) -D -m 644 wiggle.1 $(DESTDIR)$(MAN1DIR)/wiggle.1
+
+version : ReadMe.c wiggle.1
+ @rm -f version
+ @sed -n -e 's/.*wiggle \([0-9.]*\) .*/\1/p' ReadMe.c > .version-readme
+ @sed -n -e 's/.*WIGGLE 1 "" v\([0-9.]*\)$$/\1/p' wiggle.1 > .version-man
+@cmp -s .version-readme .version-man && cat .version-man > version || { echo Inconsistent versions.; exit 1;}
+
+dist : test clean version
+ mkdir -p DIST
+ rm -f DIST/wiggle-`cat version`
+ git archive --prefix wiggle-`cat version`/ v`cat version` | gzip -9 > DIST/wiggle-`cat version`.tar.gz
+
+v : version
+ cat version
+
+demo.patch: force
+ diff -ru demo.orig demo.patched | sed 's/demo.patched/demo/' > demo.patch
+
+force:
diff --git a/ReadMe.c b/ReadMe.c
new file mode 100644
index 0000000..f28c749
--- /dev/null
+++ b/ReadMe.c
@@ -0,0 +1,169 @@
+/*
+ * wiggle - apply rejected patches
+ *
+ * Copyright (C) 2003 Neil Brown <neilb@cse.unsw.edu.au>
+ * Copyright (C) 2010 Neil Brown <neilb@suse.de>
+ *
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Author: Neil Brown
+ * Email: <neilb@suse.de>
+ */
+
+/*
+ * Options and help text for wiggle
+ */
+
+#include "wiggle.h"
+
+char Version[] = "wiggle 0.9.1 2013-02-05 GPL-2+ http://neil.brown.name/wiggle/\n";
+
+char short_options[] = "xdmwlrhiW123p::VRvqB";
+
+struct option long_options[] = {
+ {"browse", 0, 0, 'B'},
+ {"extract", 0, 0, 'x'},
+ {"diff", 0, 0, 'd'},
+ {"merge", 0, 0, 'm'},
+ {"words", 0, 0, 'w'},
+ {"lines", 0, 0, 'l'},
+ {"patch", 0, 0, 'p'},
+ {"replace", 0, 0, 'r'},
+ {"help", 0, 0, 'h'},
+ {"version", 0, 0, 'V'},
+ {"reverse", 0, 0, 'R'},
+ {"verbose", 0, 0, 'v'},
+ {"quiet", 0, 0, 'q'},
+ {"strip", 1, 0, 'p'},
+ {"no-ignore", 0, 0, 'i'},
+ {"show-wiggle", 0, 0, 'W'},
+ {0, 0, 0, 0}
+};
+
+char Usage[] =
+"Usage: wiggle --diff|--extract|--merge|--browse --lines|--words [--replace] files...\n";
+
+char Help[] = "\n"
+"Wiggle - apply patches that 'patch' rejects.\n"
+"\n"
+"Wiggle provides four distinct but related functions:\n"
+"merge, diff, extract, and browse.\n"
+"To get more detailed help on a function, select the function\n"
+"before requesting help. e.g.\n"
+" wiggle --diff --help\n"
+"\n"
+"Options:\n"
+" --extract -x : select 'extract' function.\n"
+" --diff -d : select 'diff' function.\n"
+" --merge -m : select 'merge' function (default).\n"
+" --browse -B : select 'browse' function.\n"
+"\n"
+" --words -w : word-wise diff and merge.\n"
+" --lines -l : line-wise diff and merge.\n"
+"\n"
+" --patch -p : treat last file as a patch file.\n"
+" -1 -2 -3 : select which component of patch or merge to use.\n"
+" --reverse -R : swap 'before' and 'after' for diff function.\n"
+" --no-ignore -i : Don't ignore already-applied changes.\n"
+" --show-wiggle -W : Report wiggles like conflicts with an extra stanza.\n"
+"\n"
+" --help -h : get help.\n"
+" --version -V : get version of wiggle.\n"
+" --verbose -v : (potentially) be more verbose.\n"
+" --quiet -q : don't print un-necessary messages.\n"
+"\n"
+" --replace -r : replace first file with result of merger.\n"
+"\n"
+" --strip= -p : number of path components to strip from file names.\n"
+"\n"
+"Wiggle needs to be given 1, 2, or 3 files. Any one of these can\n"
+"be given as '-' to signify standard input.\n"
+"\n";
+
+char HelpExtract[] = "\n"
+"wiggle --extract -[123] [--patch] merge-or-patch\n"
+"\n"
+"The extract function allows one branch of a patch or merge file\n"
+"to be extracted. A 'patch' is the output of 'diff -c' or 'diff -u'.\n"
+"Either the before (-1) or after (-2) branch can be extracted.\n"
+"\n"
+"A 'merge' is the output of 'diff3 -m' or 'merge -A'. Either the\n"
+"first, second, or third branch can be extracted.\n"
+"\n"
+"A 'merge' file is assumed unless --patch is given.\n"
+"\n";
+
+char HelpDiff[] = "\n"
+"wiggle --diff [-wl] [-p12] [-R] file-or-patch [file-or-patch]\n"
+"\n"
+"The diff function will report the differencs and similarities between\n"
+"two files in a format similar to 'diff -u'. With --word mode\n"
+"(the default) word-wise differences are displayed on lines starting\n"
+"with a '|'. With --line mode, only whole lines are considered\n"
+"much like normal diff.\n"
+"\n"
+"If one file is given is it assumed to be a patch, and the two\n"
+"branches of the patch are extracted and compared. If two files\n"
+"are given they are normally assumed to be whole files and are compared.\n"
+"However if the --patch option is given with two files, then the\n"
+"second is treated as a patch and the first or (with -2) second branch\n"
+"is extracted and compared against the first file.\n"
+"\n"
+"--reverse (-R) with cause diff to swap the two files before comparing\n"
+"them.\n"
+"\n";
+
+char HelpMerge[] = "\n"
+"wiggle --merge [-wl] [--replace] file-or-merge [file-or-patch [file]]\n"
+"\n"
+"The merge function is the primary function of wiggle and is assumed\n"
+"if no function is explicitly chosen.\n"
+"\n"
+"Normally wiggle will compare three files on a word-by-word basis and\n"
+"output unresolvable conflicts in the resulting merge by showing\n"
+"whole-line differences.\n"
+"With the --lines option, the files are compared line-wise much\n"
+"like 'merge'. With the (default) --words option, files are compared\n"
+"word-wise and unresolvable conflicts are reported word-wise.\n"
+"\n"
+"If --merge is given one file, it is treated as a merge (merge -A\n"
+"output) and the three needed streams are extracted from it.\n"
+"If --merge is given one file and -p, it is a patch which identifies\n"
+"the files that should be patched.\n"
+"If --merge is given two files, the second is treated as a patch\n"
+"file and the first is the original file.\n"
+"If --merge is given three files, they are each treated as whole files\n"
+"and differences between the second and third are merged into the first.\n"
+"This usage is much like 'merge'.\n"
+"\n";
+
+char HelpBrowse[] = "\n"
+"wiggle --browse [-R] [--strip=n] [-p] [files]\n"
+"\n"
+"The 'browse' function provides an interactive mode for browsing a\n"
+"patch or set of patches. It allows the application of a patch to each\n"
+"file to be inspected and will eventually allow limited editing to correct\n"
+"mis-application of patches where wiggling was required, and where conflicts\n"
+"occurred.\n"
+"If no files are given, a patch file is read from stdin\n"
+"If one file is given with -p, it is treated as a patch file\n"
+"If one file is given with a name ending .rej, it is treated as a reject for\n"
+" a file with matching basename\n"
+"Otherwise a single file is assumed to be a merge output with conflicts.\n"
+"If two files are given, the second is a patch to apply to the first.\n"
+"If three files are given then the difference between 2nd and 3rd is applied\n"
+" to the first\n"
+"\n";
diff --git a/TODO b/TODO
new file mode 100644
index 0000000..09af8d6
--- /dev/null
+++ b/TODO
@@ -0,0 +1,318 @@
+
+- search should go to right column and highlight
+- do we always recenter top when we split - what if near but not at bottom
+- side-by-side sometimes shows last line of a chunk with
+ the next chunk
+- '|' should go from sidebyside to merge
+- sometimes the split page gets white spaces - 'int bl_count'
+- use next_mline in print_merge2
++ discard merge.c
+
+
+- extract.c should be able to extract half of a word-diff
+- extract.c should work on word-merges
+- review all test output to make sure it looks right
+- document 'p' DOING
+- can find_best be optimised more?
+- --verbose flag ?? what should it do?
+- review commented code and discard some of it
+- test on raid code
+- possibly encourage "###...####" onto line by itself in diff output
+- possibly remember match information while reading patch/merge
+ to help matching.
+- is there anything useful to be done with linenumber information?
+- document diff algorithm
+- document best-match algorithm
+- document merge algorithm
+- enhance 'p'
+ - editmail? reviewmail
+ - review wiggle failures
+
+- Application of patch-03-MdRaid5Works caused some odd matches
+
+- possible verbosity:
+ report lines at which each patch was applied.??
+- add examples to man page
+
+- Design viewer.
+ Maybe:
+ 3 windows: before, patch, after
+
+-----------------------------------
+p - md.c - wait_event_interruptible
+ The preceding tabs aren't noticed as being the same...
+
+
+-----------------------------------
+31 March 2005
+Some possible targets:
+
+ - check new merge code on all tests
+ - output merge as a diff from original
+ - handle multi-file patches, producing new patch or updating files
+ - improve diff3 markers
+ - modified
+ - preserve permissions in created file
+ - allow output to have just one stream selected of conflicts.
+ - allow 'output' to include .rej files
+ - fix "produced this was" -> "produced this way" in man page
+
+other things
+ - Read a series of patches and determine dependencies
+ Then push a given patch forward or backward in the list.
+ Possibly full determination of dependencies isn't needed.
+ Just push the target patch until it hits a wall, then
+ push the wall as far as it goes.
+ A potential 'wall' happens when inserted text is deleted.
+ We refine A -> B -> C and see if it can be broken up with
+ common a->a->a sections and between them,
+ x->x->y or p->q->q
+ There can then become x->y->y and p->p->q
+ But suppose we find x->x->y and p->q->q with no
+ a->a->a in between. Are we allowed to handle that?
+
+ This is a sentence
+ (is -> was)
+ This was a sentence
+ (a -> my)
+ This was my sentence
+
+ Commuting the patches gives
+ This is a sentence
+ (a -> my)
+ This is my sentence
+ (is -> was)
+ This was my sentence
+
+ That seems safe enough. How about insertions and deletions?
+ This a sentence
+ (add is)
+ This is a sentence
+ (remove a)
+ This is sentence
+
+ This a sentence
+ (remove a)
+ This sentence
+ (add is)
+ This is sentence
+
+ Seems ok... Maybe the fact that we have perfect matches (no extraneous stuff)
+ makes it easier....
+
+
+ So: first sort the blocks (well, the files) and see if there is any overlap
+ of after-A with before-B.
+ If not, we update offsets. Maybe store each chunk as offset-from-end-of-last.
+ If so, we extend both blocks, possibly including other blocks, to get two blocks
+ that start and end at the same place.
+ Then run a word-wise merge-like thing. If there are no conflicts, extract the new
+ intermediate file and create the new diff from that.
+
+ So: each patch is a list of files with hunks
+ the hunks may grow extra context as it is found in other hunks
+ and may even merge.
+ To commute two patches:
+ If a chunk doesn't match any chunk in other file, just retain it.
+ If it does, expand both chunks with common data from the other
+ then run the diff code, then extract the new middle
+
+----------------------------
+27May2006
+
+I need to improve the browsing mode. Displaying before and
+after on the one line doesn't work very well.
+I think both the original/result and the before/after views need
+to have -/+ lines for any difference.
+So, for each newline we need to record whether there are any differences
+on the line or not. If there are, we will need to display that
+line twice, once before and once after.
+This means we need to acknowledge two 'line' cursor positions?
+ Yes, if we are ever to have an edit cursor...
+ But don't we only need to edit the 'after'
+ Probably safer to provide for editing both as we might want to
+ do that before reapplying the diff.
+
+When we have consecutive newlines that are flagged, how do we display them?
+Grouped or interleaved?
+Grouped:
+ - AAA
+ - BBB
+ + aaa
+ + bbb
+Interleaved
+ - AAA
+ + aaa
+ - BBB
+ + bbb
+Grouped is what 'diff -u' generally does, however we know more about the
+content of the lines and whether it is a line that has been changed or a larger
+chunk. Maybe try one and see...
+
+If a line only has additions, or only has deletions it might be safe to just
+display the more complete line.. maybe leave that for wait-and-see too.
+
+For Grouped diff, we need to locate a group. Maybe we flag newlines as
+group-begin and group-end.
+
+Need to think this through..
+
+A 'pos' is the end of a line. We only 'sit' on eols that are visible
+We know the merger entry that covers this eol, and a position in one stream.
+If whole line is in
+ Unmatched, Unchanged, Extraneous,
+Then we just show the line undecorated.
+If the line contains Changed, we show orig/result as separate lines.
+If the line contains Changed, AlreadyApplied or Conflict, we show before/after as separate lines.
+
+So, when we 'draw_mline', we first do the 'before/orig' lines.
+If there is a Changed we signal that we need a RESULT and put in a '-'
+If there is a Changed, AlreadyApplied or Conflict, we signal the need for
+ an AFTER and put in a '-'.
+If either were needed, we put in another line
+
+Grouped diff:
+ as well as 'pos' we have start and end, and 'side'.
+ start is the first line in the set containing pos that has CHANGES set.
+ end is the last line in the same set
+ side is either 0 if row is a ' ', -1 if a '-' or 1 if row is a '+'
+
+ We need to allow for all of this when walking up and down for drawing lines.
+
+TODO
+ - what is 'cleanlist' meant to do - it seems to badly break things.
+DONE - implemented Grouped diffs
+DONE - at same time, lines with no diff should show no diff.
+ - put line/col number in status bar
+DONE - allow cursor to move left/right and scroll-on-demand.
+ - If we have selected 'before', then don't show 'after' lines..
+DONE - blank after end and before beginning
+ - better movement:
+ DONE top
+ DONE bottom
+ DONE next/prev diff
+ next/prev conflict
+ incr-search
+ find char pos in search and highlight
+ multiple finds per line
+ search to loop around
+ switch from forward to reverse.
+ DONE page up/down
+ DONE beginning/end of line
+ left-right to handle line breaks.
+ - handle single .rej file
+ - allow updates to be saved
+ - allow editing???
+ - better colours? configurable?
+
+- extract.c should be able to extract half of a word-diff
+- extract.c should work on word-merges
+- review all test output to make sure it looks right
+- document 'p' DOING
+- can find_best be optimised more?
+- --verbose flag ?? what should it do?
+- review commented code and discard some of it
+- test on raid code
+- possibly encourage "###...####" onto line by itself in diff output
+- possibly remember match information while reading patch/merge
+ to help matching.
+- is there anything useful to be done with linenumber information?
+- document diff algorithm
+- document best-match algorithm
+- document merge algorithm
+- enhance 'p'
+ - editmail? reviewmail
+ - review wiggle failures
+
+- Application of patch-03-MdRaid5Works caused some odd matches
+
+- possible verbosity:
+ report lines at which each patch was applied.??
+- add examples to man page
+
+- Design viewer.
+ Maybe:
+ 3 windows: before, patch, after
+
+-----------------------------------
+p - md.c - wait_event_interruptible
+ The preceding tabs aren't noticed as being the same...
+
+
+-----------------------------------
+31 March 2005
+Some possible targets:
+
+ - check new merge code on all tests
+ - output merge as a diff from original
+ - handle multi-file patches, producing new patch or updating files
+ - improve diff3 markers
+ - modified
+ - preserve permissions in created file
+ - allow output to have just one stream selected of conflicts.
+ - allow 'output' to include .rej files
+ - fix "produced this was" -> "produced this way" in man page
+
+other things
+ - Read a series of patches and determine dependencies
+ Then push a given patch forward or backward in the list.
+ Possibly full determination of dependencies isn't needed.
+ Just push the target patch until it hits a wall, then
+ push the wall as far as it goes.
+ A potential 'wall' happens when inserted text is deleted.
+ We refine A -> B -> C and see if it can be broken up with
+ common a->a->a sections and between them,
+ x->x->y or p->q->q
+ There can then become x->y->y and p->p->q
+ But suppose we find x->x->y and p->q->q with no
+ a->a->a in between. Are we allowed to handle that?
+
+ This is a sentence
+ (is -> was)
+ This was a sentence
+ (a -> my)
+ This was my sentence
+
+ Commuting the patches gives
+ This is a sentence
+ (a -> my)
+ This is my sentence
+ (is -> was)
+ This was my sentence
+
+ That seems safe enough. How about insertions and deletions?
+ This a sentence
+ (add is)
+ This is a sentence
+ (remove a)
+ This is sentence
+
+ This a sentence
+ (remove a)
+ This sentence
+ (add is)
+ This is sentence
+
+ Seems ok... Maybe the fact that we have perfect matches (no extraneous stuff)
+ makes it easier....
+
+
+ So: first sort the blocks (well, the files) and see if there is any overlap
+ of after-A with before-B.
+ If not, we update offsets. Maybe store each chunk as offset-from-end-of-last.
+ If so, we extend both blocks, possibly including other blocks, to get two blocks
+ that start and end at the same place.
+ Then run a word-wise merge-like thing. If there are no conflicts, extract the new
+ intermediate file and create the new diff from that.
+
+ So: each patch is a list of files with hunks
+ the hunks may grow extra context as it is found in other hunks
+ and may even merge.
+ To commute two patches:
+ If a chunk doesn't match any chunk in other file, just retain it.
+ If it does, expand both chunks with common data from the other
+ then run the diff code, then extract the new middle
+
+ - move to left in tabs doesn't work right
+ - check out md.c and 383MdBlocked in demo directory
+
diff --git a/bestmatch.c b/bestmatch.c
new file mode 100644
index 0000000..70cde6c
--- /dev/null
+++ b/bestmatch.c
@@ -0,0 +1,503 @@
+/*
+ * wiggle - apply rejected patches
+ *
+ * Copyright (C) 2003 Neil Brown <neilb@cse.unsw.edu.au>
+ * Copyright (C) 2011 Neil Brown <neilb@suse.de>
+ *
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Author: Neil Brown
+ * Email: <neilb@suse.de>
+ */
+
+/*
+ * Find the best match for a patch against a file. A patch is a
+ * sequence of chunks each of which is expected to match a particular
+ * locality of the file. So we expect big gaps between where chunks
+ * match, but only small gaps within chunks.
+ *
+ * The matching algorithm is similar to that in diff.c, so you should
+ * understand that first. However it takes fewer shortcuts and
+ * analyses cost in a more detailed way.
+ *
+ * We walk the whole matrix in a breadth first fashion following a
+ * 'front' on which x+y is constant. Along this front we examine each
+ * diagonal. For each point we calculate a 'value' for the match so
+ * far. This will be in some particular chunk. For each chunk we
+ * separately record the best value found so far, and where it was.
+ * To choose a new value for each point we calculate based on the
+ * previous value on each neighbouring diagonal and on this diagonal.
+ *
+ * This can result in a set of 'best' matches for each chunk which are
+ * not in the same order that the chunks initially were. This
+ * probably isn't desired, so we choose a 'best' best match and
+ * recurse on each side of it.
+ *
+ * The quality of a match is a somewhat complex function that is
+ * roughly 3 times the number of matching symbols minus the number
+ * of symbols replaced, added, or deleted. This seems to work.
+ *
+ * For any point, the best possible score using that point
+ * is a complete diagonal to the nearest edge. We ignore points
+ * which cannot contribute to a better overall score.
+ *
+ * As this is a fairly expensive search we remove uninteresting
+ * symbols before searching. Specifically we only keep alphanumeric
+ * (plus '_') strings. Spaces and punctuation are ignored. This should
+ * contain enough information to achieve a reliable match while scanning
+ * many fewer symbols.
+ */
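+
+/* Concretely, a "front" is the set of points with x+y == f, and the
+ * point of that front lying on diagonal k (where k == x-y) is
+ * x = (k+f)/2, y = x-k; that is how find_best() below indexes the
+ * front as it advances f.
+ */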
+
+#include <ctype.h>
+#include <stdlib.h>
+#include "wiggle.h"
+
+/* This structure keeps track of the current match at each point.
+ * It holds the start of the match as x,y and the value of the
+ * match so far.
+ * If val <= 0, there is no current match.
+ */
+struct v {
+ int x, y; /* location of start of match */
+ int val; /* value of match from x,y to here */
+ int k; /* diagonal of last match - if val > 0 */
+ int inmatch; /* 1 if last point was a match */
+ int c; /* chunk number */
+};
+
+/*
+ * Here we must determine the 'value' of a partial match.
+ * The input parameters are:
+ * length - the total number of symbols matched
+ * errs - the total number of insertions or deletions
+ * dif - the absolute difference between number of insertions and deletions.
+ *
+ * In general we want length to be high, errs to be low, and dif to be low.
+ * Particular questions that must be answered include:
+ * - When does adding an extra symbol after a small gap improve the match
+ * - When does a match become so bad that we would rather start again.
+ *
+ * We would like symmetry in our answers so that a good sequence with
+ * an out-rider on one end is evaluated the same as a good sequence
+ * with an out-rider on the other end.
+ *
+ * However to do this we cannot really use the value of the good
+ * sequence to weigh in the out-rider's favour as in the case of a
+ * leading outrider, we do not yet know the value of the good
+ * sequence.
+ *
+ * First, we need an arbitrary number, X, to say "Given a single
+ * symbol, after X errors, we forget that symbol". 5 seems a good
+ * number.
+ *
+ * Next we need to understand how replacements compare to insertions
+ * or deletions. Probably a replacement is the same cost as an
+ * insertion or deletion. Finally, a few large stretches are better
+ * then lots of little ones, so the number of disjoint stretches
+ * should be kept low.
+ *
+ * So:
+ * The first match sets the value to 6.
+ * Each consecutive match adds 3
+ * A non-consecutive match whose value is still +ve adds 2
+ * Each non-match subtracts one unless it is the other half of a replacement.
+ * A value of 0 causes us to forget where we are and start again.
+ *
+ * We need to not only assess the value at a particular location, but
+ * also assess the maximum value we could get if all remaining symbols
+ * matched, to help exclude parts of the matrix. The value of that
+ * possibility is 6 times the number of remaining symbols, -1 if we
+ * just had a match.
+ */
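+/*
+ * For example, by the rules above a run of two consecutive matches,
+ * one mismatch, and then another match scores 6 + 3 - 1 + 2 = 10.
+ */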
+/* dir == 0 for match, 1 for k increase, -1 for k decrease */
+static inline void update_value(struct v *v, int dir, int k, int x)
+{
+ if (dir == 0) {
+ if (v->val <= 0) {
+ v->x = x-1;
+ v->y = x-k-1;
+ v->inmatch = 0;
+ v->val = 4;
+ }
+ v->val += 2+v->inmatch;
+ v->inmatch = 1;
+ v->k = k;
+ } else if (v->val > 0) {
+ v->inmatch = 0;
+ if (dir * (v->k - k) > 0) {
+ /* other half of replacement */
+ } else {
+ v->val -= 1;
+ }
+ }
+}
+
+/* Calculate the best possible value that this 'struct v'
+ * could reach if there are 'max' symbols remaining
+ * that could possibly be matches.
+ */
+static inline int best_val(struct v *v, int max)
+{
+ if (v->val <= 0)
+ return 4+max*3-1;
+ else
+ return max*3-1+v->inmatch+v->val;
+}
+
+struct best {
+ int xlo, ylo;
+ int xhi, yhi;
+ int val;
+};
+
+static inline int min(int a, int b)
+{
+ return a < b ? a : b;
+}
+
+static void find_best(struct file *a, struct file *b,
+ int alo, int ahi,
+ int blo, int bhi, struct best *best)
+{
+ int klo, khi, k;
+ int f;
+
+ struct v *valloc = xmalloc(sizeof(struct v)*((ahi-alo)+(bhi-blo)+5));
+ struct v *v = valloc + (bhi-alo+2);
+
+ k = klo = khi = alo-blo;
+ f = alo+blo; /* front that moves forward */
+ v[k].val = 0;
+ v[k].c = -1;
+
+ while (f < ahi+bhi) {
+ int x, y;
+
+ f++;
+ for (k = klo+1; k <= khi-1 ; k += 2) {
+ struct v vnew, vnew2;
+ x = (k+f)/2;
+ y = x-k;
+ /* first consider the diagonal - if possible
+ * it is always preferred
+ */
+ if (match(&a->list[x-1], &b->list[y-1])) {
+ vnew = v[k];
+ update_value(&v[k], 0, k, x);
+ if (v[k].c < 0)
+ abort();
+ if (v[k].val > best[v[k].c].val) {
+ int chunk = v[k].c;
+ best[chunk].xlo = v[k].x;
+ best[chunk].ylo = v[k].y;
+ best[chunk].xhi = x;
+ best[chunk].yhi = y;
+ best[chunk].val = v[k].val;
+ }
+ } else {
+ /* First consider a y-step: adding a
+ * symbol from B */
+ vnew = v[k+1];
+ update_value(&vnew, -1, k, x);
+ /* might cross a chunk boundary */
+ if (b->list[y-1].len && b->list[y-1].start[0] == 0) {
+ vnew.c = atoi(b->list[y-1].start+1);
+ vnew.val = 0;
+ }
+
+ /* Now consider an x-step: deleting
+ * a symbol. This cannot be a chunk
+ * boundary as there aren't any in 'A'
+ */
+ vnew2 = v[k-1];
+ update_value(&vnew2, 1, k, x);
+
+ /* Now choose the best. */
+ if (vnew2.val > vnew.val)
+ v[k] = vnew2;
+ else
+ v[k] = vnew;
+ }
+ }
+ /* extend or contract range */
+ klo--;
+ v[klo] = v[klo+1];
+ x = (klo+f)/2; y = x-klo;
+ update_value(&v[klo], -1, klo, x);
+ if (y <= bhi && b->list[y-1].len && b->list[y-1].start[0] == 0) {
+ v[klo].c = atoi(b->list[y-1].start+1);
+ v[klo].val = 0;
+ }
+ while (klo+2 < (ahi-bhi) &&
+ (y > bhi ||
+ (best_val(&v[klo], min(ahi-x, bhi-y)) < best[v[klo].c].val &&
+ best_val(&v[klo+1], min(ahi-x, bhi-y+1)) < best[v[klo+1].c].val
+ )
+ )) {
+ klo += 2;
+ x = (klo+f)/2; y = x-klo;
+ }
+
+ khi++;
+ v[khi] = v[khi-1];
+ x = (khi+f)/2; y = x - khi;
+ update_value(&v[khi], -1, khi, x);
+ while (khi-2 > (ahi-bhi) &&
+ (x > ahi ||
+ (v[khi].c >= 0 &&
+ best_val(&v[khi], min(ahi-x, bhi-y)) < best[v[khi].c].val &&
+ best_val(&v[khi-1], min(ahi-x+1, bhi-y)) < best[v[khi].c].val
+ )
+ )) {
+ khi -= 2;
+ x = (khi+f)/2; y = x - khi;
+ }
+
+ }
+ free(valloc);
+}
+
+/* Join two csl lists together.
+ * Simply allocate new space and copy everything in.
+ */
+static struct csl *csl_join(struct csl *c1, struct csl *c2)
+{
+ struct csl *c, *cd, *rv;
+ int cnt;
+
+ if (c1 == NULL)
+ return c2;
+ if (c2 == NULL)
+ return c1;
+
+ cnt = 1; /* the sentinel */
+ for (c = c1; c->len; c++)
+ cnt++;
+ for (c = c2; c->len; c++)
+ cnt++;
+ cd = rv = xmalloc(sizeof(*rv)*cnt);
+ for (c = c1; c->len; c++)
+ *cd++ = *c;
+ for (c = c2; c->len; c++)
+ *cd++ = *c;
+ cd->len = 0;
+ free(c1);
+ free(c2);
+ return rv;
+}
+
+/*
+ * Reduce a file by discarding less interesting words
+ * Words that end with a newline are interesting (so all words
+ * in line-mode are interesting) and words that start with
+ * an alphanumeric are interesting. This excludes spaces and
+ * special characters in word mode.
+ * Doing a best-fit comparison on only interesting words is
+ * much faster than on all words, and is nearly as good.
+ */
+
+static inline int is_skipped(struct elmnt e)
+{
+ return !(ends_line(e) ||
+ isalnum(e.start[0]) ||
+ e.start[0] == '_');
+}
+
+static struct file reduce(struct file orig)
+{
+ int cnt = 0;
+ int i;
+ struct file rv;
+
+ for (i = 0; i < orig.elcnt; i++)
+ if (!is_skipped(orig.list[i]))
+ cnt++;
+
+ if (cnt == orig.elcnt)
+ return orig;
+
+ rv.elcnt = cnt;
+ rv.list = xmalloc(cnt*sizeof(struct elmnt));
+ cnt = 0;
+ for (i = 0; i < orig.elcnt; i++)
+ if (!is_skipped(orig.list[i]))
+ rv.list[cnt++] = orig.list[i];
+ return rv;
+}
+
+/* Given a list of best matches between a1 and b1 which are
+ * subsets of a2 and b2, convert that list to indexes into a2/b2
+ *
+ * When we find the location in a2/b2, we expand to include all
+ * immediately surrounding words which were skipped
+ */
+static void remap(struct best *best, int cnt,
+ struct file a1, struct file b1,
+ struct file a2, struct file b2)
+{
+ int b;
+ int pa, pb; /* pointers into the a2 and b2 arrays */
+
+ pa = pb = 0;
+
+ if (a1.elcnt == 0 && a2.elcnt == 0)
+ return;
+
+ for (b = 1; b < cnt; b++)
+ if (best[b].val > 0) {
+ while (pa < a2.elcnt &&
+ a2.list[pa].start != a1.list[best[b].xlo].start)
+ pa++;
+ if (pa == a2.elcnt)
+ abort();
+ while (pb < b2.elcnt &&
+ b2.list[pb].start != b1.list[best[b].ylo].start)
+ pb++;
+ if (pb == b2.elcnt)
+ abort();
+
+ /* pa,pb is the start of this best bit. Step
+ * backward over ignored words
+ */
+ while (pa > 0 && is_skipped(a2.list[pa-1]))
+ pa--;
+ while (pb > 0 && is_skipped(b2.list[pb-1]))
+ pb--;
+
+ if (pa <= 0)
+ pa = 1;
+ if (pb <= 0)
+ pb = 1;
+
+ best[b].xlo = pa;
+ best[b].ylo = pb;
+
+ while (pa < a2.elcnt &&
+ (pa == 0 || (a2.list[pa-1].start
+ != a1.list[best[b].xhi-1].start)))
+ pa++;
+ if (pa == a2.elcnt && best[b].xhi != a1.elcnt)
+ abort();
+ while (pb < b2.elcnt &&
+ (pb == 0 || (b2.list[pb-1].start
+ != b1.list[best[b].yhi-1].start)))
+ pb++;
+ if (pb == b2.elcnt && best[b].yhi != b1.elcnt)
+ abort();
+
+ /* pa,pb is now the end of the best bit.
+ * Step pa,pb forward over ignored words.
+ */
+ while (pa < a2.elcnt && is_skipped(a2.list[pa]))
+ pa++;
+ while (pb < b2.elcnt && is_skipped(b2.list[pb]))
+ pb++;
+ best[b].xhi = pa;
+ best[b].yhi = pb;
+ }
+}
+
+static void find_best_inorder(struct file *a, struct file *b,
+ int alo, int ahi, int blo, int bhi,
+ struct best *best, int bestlo, int besthi)
+{
+ /* Make sure the best matches we find are in order.
+ * If they aren't, we find an overall best and
+ * recurse on either side of that.
+ */
+ int i;
+ int bad = 0;
+ int bestval, bestpos = 0;
+
+ for (i = bestlo; i < besthi; i++)
+ best[i].val = 0;
+ find_best(a, b, alo, ahi, blo, bhi, best);
+ for (i = bestlo + 1; i < besthi; i++)
+ if (best[i-1].val > 0 &&
+ best[i].val > 0 &&
+ best[i-1].xhi >= best[i].xlo)
+ bad = 1;
+
+ if (!bad)
+ return;
+ bestval = 0;
+ for (i = bestlo; i < besthi; i++)
+ if (best[i].val > bestval) {
+ bestval = best[i].val;
+ bestpos = i;
+ }
+ if (bestpos > bestlo) {
+ /* move top down below chunk marker */
+ int y = best[bestpos].ylo;
+ while (b->list[y].start[0])
+ y--;
+ find_best_inorder(a, b,
+ alo, best[bestpos].xlo,
+ blo, y,
+ best, bestlo, bestpos);
+ }
+ if (bestpos < besthi-1) {
+ /* move bottom up to chunk marker */
+ int y = best[bestpos].yhi;
+ while (b->list[y].start[0])
+ y++;
+ find_best_inorder(a, b,
+ best[bestpos].xhi, ahi,
+ y, bhi,
+ best, bestpos+1, besthi);
+ }
+}
+
+struct csl *pdiff(struct file a, struct file b, int chunks)
+{
+ struct csl *csl1, *csl2;
+ struct best *best = xmalloc(sizeof(struct best)*(chunks+1));
+ int i;
+ struct file asmall, bsmall;
+
+ asmall = reduce(a);
+ bsmall = reduce(b);
+
+ for (i = 0; i < chunks+1; i++)
+ best[i].val = 0;
+ find_best_inorder(&asmall, &bsmall,
+ 0, asmall.elcnt, 0, bsmall.elcnt,
+ best, 1, chunks+1);
+ remap(best, chunks+1, asmall, bsmall, a, b);
+
+ csl1 = NULL;
+ for (i = 1; i <= chunks; i++)
+ if (best[i].val > 0) {
+ csl2 = diff_partial(a, b,
+ best[i].xlo, best[i].xhi,
+ best[i].ylo, best[i].yhi);
+ csl1 = csl_join(csl1, csl2);
+ }
+ if (csl1) {
+ for (csl2 = csl1; csl2->len; csl2++)
+ ;
+ csl2->a = a.elcnt;
+ csl2->b = b.elcnt;
+ } else {
+ csl1 = xmalloc(sizeof(*csl1));
+ csl1->len = 0;
+ csl1->a = a.elcnt;
+ csl1->b = b.elcnt;
+ }
+ free(best);
+ return csl1;
+}
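The csl list returned by pdiff() is terminated by a zero-length entry whose a and b fields hold the two file lengths, as set up at the end of the function above. A minimal sketch of walking such a list, assuming wiggle.h declares pdiff() and the struct file/struct csl fields used in this file; it is not code from the wiggle tree:

    #include <stdio.h>
    #include <stdlib.h>
    #include "wiggle.h"

    static void report_matches(struct file a, struct file b, int chunks)
    {
            struct csl *csl = pdiff(a, b, chunks);
            struct csl *c;

            /* each entry: c->len elements from a[c->a] match b[c->b] */
            for (c = csl; c->len; c++)
                    printf("a[%d..%d] matches b[%d..%d]\n",
                           c->a, c->a + c->len - 1,
                           c->b, c->b + c->len - 1);
            free(csl);
    }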
diff --git a/ccan/build_assert/_info b/ccan/build_assert/_info
new file mode 100644
index 0000000..284c6cf
--- /dev/null
+++ b/ccan/build_assert/_info
@@ -0,0 +1,49 @@
+#include <stdio.h>
+#include <string.h>
+#include "config.h"
+
+/**
+ * build_assert - routines for build-time assertions
+ *
+ * This code provides routines which will cause compilation to fail should some
+ * assertion be untrue: such failures are preferable to run-time assertions,
+ * but much more limited since they can only depend on compile-time constants.
+ *
+ * These assertions are most useful when two parts of the code must be kept in
+ * sync: it is better to avoid such cases if possible, but second best is to
+ * detect invalid changes at build time.
+ *
+ * For example, a tricky piece of code might rely on a certain element being at
+ * the start of the structure. To ensure that future changes don't break it,
+ * you would catch such changes in your code like so:
+ *
+ * Example:
+ * #include <stddef.h>
+ * #include <ccan/build_assert/build_assert.h>
+ *
+ * struct foo {
+ * char string[5];
+ * int x;
+ * };
+ *
+ * static char *foo_string(struct foo *foo)
+ * {
+ * // This trick requires that the string be first in the structure
+ * BUILD_ASSERT(offsetof(struct foo, string) == 0);
+ * return (char *)foo;
+ * }
+ *
+ * License: Public domain
+ * Author: Rusty Russell <rusty@rustcorp.com.au>
+ */
+int main(int argc, char *argv[])
+{
+ if (argc != 2)
+ return 1;
+
+ if (strcmp(argv[1], "depends") == 0)
+ /* Nothing. */
+ return 0;
+
+ return 1;
+}
diff --git a/ccan/build_assert/build_assert.h b/ccan/build_assert/build_assert.h
new file mode 100644
index 0000000..24e59c4
--- /dev/null
+++ b/ccan/build_assert/build_assert.h
@@ -0,0 +1,39 @@
+#ifndef CCAN_BUILD_ASSERT_H
+#define CCAN_BUILD_ASSERT_H
+
+/**
+ * BUILD_ASSERT - assert a build-time dependency.
+ * @cond: the compile-time condition which must be true.
+ *
+ * Your compile will fail if the condition isn't true, or can't be evaluated
+ * by the compiler. This can only be used within a function.
+ *
+ * Example:
+ * #include <stddef.h>
+ * ...
+ * static char *foo_to_char(struct foo *foo)
+ * {
+ * // This code needs string to be at start of foo.
+ * BUILD_ASSERT(offsetof(struct foo, string) == 0);
+ * return (char *)foo;
+ * }
+ */
+#define BUILD_ASSERT(cond) \
+ do { (void) sizeof(char [1 - 2*!(cond)]); } while(0)
+
+/**
+ * BUILD_ASSERT_OR_ZERO - assert a build-time dependency, as an expression.
+ * @cond: the compile-time condition which must be true.
+ *
+ * Your compile will fail if the condition isn't true, or can't be evaluated
+ * by the compiler. This can be used in an expression: its value is "0".
+ *
+ * Example:
+ * #define foo_to_char(foo) \
+ * ((char *)(foo) \
+ * + BUILD_ASSERT_OR_ZERO(offsetof(struct foo, string) == 0))
+ */
+#define BUILD_ASSERT_OR_ZERO(cond) \
+ (sizeof(char [1 - 2*!(cond)]) - 1)
+
+#endif /* CCAN_BUILD_ASSERT_H */
diff --git a/ccan/build_assert/test/compile_fail-expr.c b/ccan/build_assert/test/compile_fail-expr.c
new file mode 100644
index 0000000..109215b
--- /dev/null
+++ b/ccan/build_assert/test/compile_fail-expr.c
@@ -0,0 +1,10 @@
+#include <ccan/build_assert/build_assert.h>
+
+int main(int argc, char *argv[])
+{
+#ifdef FAIL
+ return BUILD_ASSERT_OR_ZERO(1 == 0);
+#else
+ return 0;
+#endif
+}
diff --git a/ccan/build_assert/test/compile_fail.c b/ccan/build_assert/test/compile_fail.c
new file mode 100644
index 0000000..37d95ed
--- /dev/null
+++ b/ccan/build_assert/test/compile_fail.c
@@ -0,0 +1,9 @@
+#include <ccan/build_assert/build_assert.h>
+
+int main(int argc, char *argv[])
+{
+#ifdef FAIL
+ BUILD_ASSERT(1 == 0);
+#endif
+ return 0;
+}
diff --git a/ccan/build_assert/test/compile_ok.c b/ccan/build_assert/test/compile_ok.c
new file mode 100644
index 0000000..4105484
--- /dev/null
+++ b/ccan/build_assert/test/compile_ok.c
@@ -0,0 +1,7 @@
+#include <ccan/build_assert/build_assert.h>
+
+int main(int argc, char *argv[])
+{
+ BUILD_ASSERT(1 == 1);
+ return 0;
+}
diff --git a/ccan/build_assert/test/run-BUILD_ASSERT_OR_ZERO.c b/ccan/build_assert/test/run-BUILD_ASSERT_OR_ZERO.c
new file mode 100644
index 0000000..4185821
--- /dev/null
+++ b/ccan/build_assert/test/run-BUILD_ASSERT_OR_ZERO.c
@@ -0,0 +1,9 @@
+#include <ccan/build_assert/build_assert.h>
+#include <ccan/tap/tap.h>
+
+int main(int argc, char *argv[])
+{
+ plan_tests(1);
+ ok1(BUILD_ASSERT_OR_ZERO(1 == 1) == 0);
+ return exit_status();
+}
diff --git a/ccan/hash/_info b/ccan/hash/_info
new file mode 100644
index 0000000..5aeb912
--- /dev/null
+++ b/ccan/hash/_info
@@ -0,0 +1,31 @@
+#include <string.h>
+#include <stdio.h>
+
+/**
+ * hash - routines for hashing bytes
+ *
+ * When creating a hash table it's important to have a hash function
+ * which mixes well and is fast. This package supplies such functions.
+ *
+ * The hash functions come in two flavors: the normal ones and the
+ * stable ones. The normal ones can vary from machine-to-machine and
+ * may change if we find better or faster hash algorithms in future.
+ * The stable ones will always give the same results on any computer,
+ * and on any version of this package.
+ *
+ * License: Public Domain
+ * Maintainer: Rusty Russell <rusty@rustcorp.com.au>
+ * Author: Bob Jenkins <bob_jenkins@burtleburtle.net>
+ */
+int main(int argc, char *argv[])
+{
+ if (argc != 2)
+ return 1;
+
+ if (strcmp(argv[1], "depends") == 0) {
+ printf("ccan/build_assert\n");
+ return 0;
+ }
+
+ return 1;
+}
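A short sketch of the two flavours described above, using hash_any() and hash_stable_32() as defined in hash.c below; the include path follows the ccan layout used elsewhere in this tree, and the bucket scheme is only illustrative:

    #include <string.h>
    #include <stdint.h>
    #include <ccan/hash/hash.h>

    /* Machine-dependent flavour: fine for an in-memory hash table.
     * nbuckets is assumed to be a power of two. */
    static uint32_t bucket_of(const char *name, uint32_t nbuckets)
    {
            return hash_any(name, strlen(name), 0) & (nbuckets - 1);
    }

    /* Stable flavour: the same result on any machine and any version,
     * so it is suitable when the value must be reproducible later.
     * n is the number of 32-bit values in vals. */
    static uint32_t stable_id(const uint32_t *vals, size_t n)
    {
            return hash_stable_32(vals, n, 0);
    }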
diff --git a/ccan/hash/hash.c b/ccan/hash/hash.c
new file mode 100644
index 0000000..59c4d24
--- /dev/null
+++ b/ccan/hash/hash.c
@@ -0,0 +1,925 @@
+/*
+-------------------------------------------------------------------------------
+lookup3.c, by Bob Jenkins, May 2006, Public Domain.
+
+These are functions for producing 32-bit hashes for hash table lookup.
+hash_word(), hashlittle(), hashlittle2(), hashbig(), mix(), and final()
+are externally useful functions. Routines to test the hash are included
+if SELF_TEST is defined. You can use this free for any purpose. It's in
+the public domain. It has no warranty.
+
+You probably want to use hashlittle(). hashlittle() and hashbig()
+hash byte arrays. hashlittle() is faster than hashbig() on
+little-endian machines. Intel and AMD are little-endian machines.
+On second thought, you probably want hashlittle2(), which is identical to
+hashlittle() except it returns two 32-bit hashes for the price of one.
+You could implement hashbig2() if you wanted but I haven't bothered here.
+
+If you want to find a hash of, say, exactly 7 integers, do
+ a = i1; b = i2; c = i3;
+ mix(a,b,c);
+ a += i4; b += i5; c += i6;
+ mix(a,b,c);
+ a += i7;
+ final(a,b,c);
+then use c as the hash value. If you have a variable length array of
+4-byte integers to hash, use hash_word(). If you have a byte array (like
+a character string), use hashlittle(). If you have several byte arrays, or
+a mix of things, see the comments above hashlittle().
+
+Why is this so big? I read 12 bytes at a time into 3 4-byte integers,
+then mix those integers. This is fast (you can do a lot more thorough
+mixing with 12*3 instructions on 3 integers than you can with 3 instructions
+on 1 byte), but shoehorning those bytes into integers efficiently is messy.
+-------------------------------------------------------------------------------
+*/
+//#define SELF_TEST 1
+
+#if 0
+#include <stdio.h> /* defines printf for tests */
+#include <time.h> /* defines time_t for timings in the test */
+#include <stdint.h> /* defines uint32_t etc */
+#include <sys/param.h> /* attempt to define endianness */
+
+#ifdef linux
+# include <endian.h> /* attempt to define endianness */
+#endif
+
+/*
+ * My best guess at if you are big-endian or little-endian. This may
+ * need adjustment.
+ */
+#if (defined(__BYTE_ORDER) && defined(__LITTLE_ENDIAN) && \
+ __BYTE_ORDER == __LITTLE_ENDIAN) || \
+ (defined(i386) || defined(__i386__) || defined(__i486__) || \
+ defined(__i586__) || defined(__i686__) || defined(__x86_64) || \
+ defined(vax) || defined(MIPSEL))
+# define HASH_LITTLE_ENDIAN 1
+# define HASH_BIG_ENDIAN 0
+#elif (defined(__BYTE_ORDER) && defined(__BIG_ENDIAN) && \
+ __BYTE_ORDER == __BIG_ENDIAN) || \
+ (defined(sparc) || defined(POWERPC) || defined(mc68000) || defined(sel))
+# define HASH_LITTLE_ENDIAN 0
+# define HASH_BIG_ENDIAN 1
+#else
+# error Unknown endian
+#endif
+#endif /* old hash.c headers. */
+
+#include "hash.h"
+
+#if HAVE_LITTLE_ENDIAN
+#define HASH_LITTLE_ENDIAN 1
+#define HASH_BIG_ENDIAN 0
+#elif HAVE_BIG_ENDIAN
+#define HASH_LITTLE_ENDIAN 0
+#define HASH_BIG_ENDIAN 1
+#else
+#error Unknown endian
+#endif
+
+#define hashsize(n) ((uint32_t)1<<(n))
+#define hashmask(n) (hashsize(n)-1)
+#define rot(x,k) (((x)<<(k)) | ((x)>>(32-(k))))
+
+/*
+-------------------------------------------------------------------------------
+mix -- mix 3 32-bit values reversibly.
+
+This is reversible, so any information in (a,b,c) before mix() is
+still in (a,b,c) after mix().
+
+If four pairs of (a,b,c) inputs are run through mix(), or through
+mix() in reverse, there are at least 32 bits of the output that
+are sometimes the same for one pair and different for another pair.
+This was tested for:
+* pairs that differed by one bit, by two bits, in any combination
+ of top bits of (a,b,c), or in any combination of bottom bits of
+ (a,b,c).
+* "differ" is defined as +, -, ^, or ~^. For + and -, I transformed
+ the output delta to a Gray code (a^(a>>1)) so a string of 1's (as
+ is commonly produced by subtraction) looks like a single 1-bit
+ difference.
+* the base values were pseudorandom, all zero but one bit set, or
+ all zero plus a counter that starts at zero.
+
+Some k values for my "a-=c; a^=rot(c,k); c+=b;" arrangement that
+satisfy this are
+ 4 6 8 16 19 4
+ 9 15 3 18 27 15
+ 14 9 3 7 17 3
+Well, "9 15 3 18 27 15" didn't quite get 32 bits diffing
+for "differ" defined as + with a one-bit base and a two-bit delta. I
+used http://burtleburtle.net/bob/hash/avalanche.html to choose
+the operations, constants, and arrangements of the variables.
+
+This does not achieve avalanche. There are input bits of (a,b,c)
+that fail to affect some output bits of (a,b,c), especially of a. The
+most thoroughly mixed value is c, but it doesn't really even achieve
+avalanche in c.
+
+This allows some parallelism. Read-after-writes are good at doubling
+the number of bits affected, so the goal of mixing pulls in the opposite
+direction as the goal of parallelism. I did what I could. Rotates
+seem to cost as much as shifts on every machine I could lay my hands
+on, and rotates are much kinder to the top and bottom bits, so I used
+rotates.
+-------------------------------------------------------------------------------
+*/
+#define mix(a,b,c) \
+{ \
+ a -= c; a ^= rot(c, 4); c += b; \
+ b -= a; b ^= rot(a, 6); a += c; \
+ c -= b; c ^= rot(b, 8); b += a; \
+ a -= c; a ^= rot(c,16); c += b; \
+ b -= a; b ^= rot(a,19); a += c; \
+ c -= b; c ^= rot(b, 4); b += a; \
+}
+
+/*
+-------------------------------------------------------------------------------
+final -- final mixing of 3 32-bit values (a,b,c) into c
+
+Pairs of (a,b,c) values differing in only a few bits will usually
+produce values of c that look totally different. This was tested for
+* pairs that differed by one bit, by two bits, in any combination
+ of top bits of (a,b,c), or in any combination of bottom bits of
+ (a,b,c).
+* "differ" is defined as +, -, ^, or ~^. For + and -, I transformed
+ the output delta to a Gray code (a^(a>>1)) so a string of 1's (as
+ is commonly produced by subtraction) looks like a single 1-bit
+ difference.
+* the base values were pseudorandom, all zero but one bit set, or
+ all zero plus a counter that starts at zero.
+
+These constants passed:
+ 14 11 25 16 4 14 24
+ 12 14 25 16 4 14 24
+and these came close:
+ 4 8 15 26 3 22 24
+ 10 8 15 26 3 22 24
+ 11 8 15 26 3 22 24
+-------------------------------------------------------------------------------
+*/
+#define final(a,b,c) \
+{ \
+ c ^= b; c -= rot(b,14); \
+ a ^= c; a -= rot(c,11); \
+ b ^= a; b -= rot(a,25); \
+ c ^= b; c -= rot(b,16); \
+ a ^= c; a -= rot(c,4); \
+ b ^= a; b -= rot(a,14); \
+ c ^= b; c -= rot(b,24); \
+}
+
+/*
+--------------------------------------------------------------------
+ This works on all machines. To be useful, it requires
+ -- that the key be an array of uint32_t's, and
+ -- that the length be the number of uint32_t's in the key
+
+ The function hash_word() is identical to hashlittle() on little-endian
+ machines, and identical to hashbig() on big-endian machines,
+ except that the length has to be measured in uint32_ts rather than in
+ bytes. hashlittle() is more complicated than hash_word() only because
+ hashlittle() has to dance around fitting the key bytes into registers.
+--------------------------------------------------------------------
+*/
+uint32_t hash_u32(
+const uint32_t *k, /* the key, an array of uint32_t values */
+size_t length, /* the length of the key, in uint32_ts */
+uint32_t initval) /* the previous hash, or an arbitrary value */
+{
+ uint32_t a,b,c;
+
+ /* Set up the internal state */
+ a = b = c = 0xdeadbeef + (((uint32_t)length)<<2) + initval;
+
+ /*------------------------------------------------- handle most of the key */
+ while (length > 3)
+ {
+ a += k[0];
+ b += k[1];
+ c += k[2];
+ mix(a,b,c);
+ length -= 3;
+ k += 3;
+ }
+
+ /*------------------------------------------- handle the last 3 uint32_t's */
+ switch(length) /* all the case statements fall through */
+ {
+ case 3 : c+=k[2];
+ case 2 : b+=k[1];
+ case 1 : a+=k[0];
+ final(a,b,c);
+ case 0: /* case 0: nothing left to add */
+ break;
+ }
+ /*------------------------------------------------------ report the result */
+ return c;
+}
+
+/*
+-------------------------------------------------------------------------------
+hashlittle() -- hash a variable-length key into a 32-bit value
+ k : the key (the unaligned variable-length array of bytes)
+ length : the length of the key, counting by bytes
+ val2 : IN: can be any 4-byte value OUT: second 32 bit hash.
+Returns a 32-bit value. Every bit of the key affects every bit of
+the return value. Two keys differing by one or two bits will have
+totally different hash values. Note that the return value is better
+mixed than val2, so use that first.
+
+The best hash table sizes are powers of 2. There is no need to do
+mod a prime (mod is sooo slow!). If you need less than 32 bits,
+use a bitmask. For example, if you need only 10 bits, do
+ h = (h & hashmask(10));
+In which case, the hash table should have hashsize(10) elements.
+
+If you are hashing n strings (uint8_t **)k, do it like this:
+ for (i=0, h=0; i<n; ++i) h = hashlittle( k[i], len[i], h);
+
+By Bob Jenkins, 2006. bob_jenkins@burtleburtle.net. You may use this
+code any way you wish, private, educational, or commercial. It's free.
+
+Use for hash table lookup, or anything where one collision in 2^^32 is
+acceptable. Do NOT use for cryptographic purposes.
+-------------------------------------------------------------------------------
+*/
+
+static uint32_t hashlittle( const void *key, size_t length, uint32_t *val2 )
+{
+ uint32_t a,b,c; /* internal state */
+ union { const void *ptr; size_t i; } u; /* needed for Mac Powerbook G4 */
+
+ /* Set up the internal state */
+ a = b = c = 0xdeadbeef + ((uint32_t)length) + *val2;
+
+ u.ptr = key;
+ if (HASH_LITTLE_ENDIAN && ((u.i & 0x3) == 0)) {
+ const uint32_t *k = (const uint32_t *)key; /* read 32-bit chunks */
+ const uint8_t *k8;
+
+ /*------ all but last block: aligned reads and affect 32 bits of (a,b,c) */
+ while (length > 12)
+ {
+ a += k[0];
+ b += k[1];
+ c += k[2];
+ mix(a,b,c);
+ length -= 12;
+ k += 3;
+ }
+
+ /*----------------------------- handle the last (probably partial) block */
+ /*
+ * "k[2]&0xffffff" actually reads beyond the end of the string, but
+ * then masks off the part it's not allowed to read. Because the
+ * string is aligned, the masked-off tail is in the same word as the
+ * rest of the string. Every machine with memory protection I've seen
+ * does it on word boundaries, so is OK with this. But VALGRIND will
+ * still catch it and complain. The masking trick does make the hash
+ * noticeably faster for short strings (like English words).
+ *
+ * Not on my testing with gcc 4.5 on an intel i5 CPU, at least --RR.
+ */
+#if 0
+ switch(length)
+ {
+ case 12: c+=k[2]; b+=k[1]; a+=k[0]; break;
+ case 11: c+=k[2]&0xffffff; b+=k[1]; a+=k[0]; break;
+ case 10: c+=k[2]&0xffff; b+=k[1]; a+=k[0]; break;
+ case 9 : c+=k[2]&0xff; b+=k[1]; a+=k[0]; break;
+ case 8 : b+=k[1]; a+=k[0]; break;
+ case 7 : b+=k[1]&0xffffff; a+=k[0]; break;
+ case 6 : b+=k[1]&0xffff; a+=k[0]; break;
+ case 5 : b+=k[1]&0xff; a+=k[0]; break;
+ case 4 : a+=k[0]; break;
+ case 3 : a+=k[0]&0xffffff; break;
+ case 2 : a+=k[0]&0xffff; break;
+ case 1 : a+=k[0]&0xff; break;
+ case 0 : return c; /* zero length strings require no mixing */
+ }
+
+#else /* make valgrind happy */
+
+ k8 = (const uint8_t *)k;
+ switch(length)
+ {
+ case 12: c+=k[2]; b+=k[1]; a+=k[0]; break;
+ case 11: c+=((uint32_t)k8[10])<<16; /* fall through */
+ case 10: c+=((uint32_t)k8[9])<<8; /* fall through */
+ case 9 : c+=k8[8]; /* fall through */
+ case 8 : b+=k[1]; a+=k[0]; break;
+ case 7 : b+=((uint32_t)k8[6])<<16; /* fall through */
+ case 6 : b+=((uint32_t)k8[5])<<8; /* fall through */
+ case 5 : b+=k8[4]; /* fall through */
+ case 4 : a+=k[0]; break;
+ case 3 : a+=((uint32_t)k8[2])<<16; /* fall through */
+ case 2 : a+=((uint32_t)k8[1])<<8; /* fall through */
+ case 1 : a+=k8[0]; break;
+ case 0 : return c;
+ }
+
+#endif /* !valgrind */
+
+ } else if (HASH_LITTLE_ENDIAN && ((u.i & 0x1) == 0)) {
+ const uint16_t *k = (const uint16_t *)key; /* read 16-bit chunks */
+ const uint8_t *k8;
+
+ /*--------------- all but last block: aligned reads and different mixing */
+ while (length > 12)
+ {
+ a += k[0] + (((uint32_t)k[1])<<16);
+ b += k[2] + (((uint32_t)k[3])<<16);
+ c += k[4] + (((uint32_t)k[5])<<16);
+ mix(a,b,c);
+ length -= 12;
+ k += 6;
+ }
+
+ /*----------------------------- handle the last (probably partial) block */
+ k8 = (const uint8_t *)k;
+ switch(length)
+ {
+ case 12: c+=k[4]+(((uint32_t)k[5])<<16);
+ b+=k[2]+(((uint32_t)k[3])<<16);
+ a+=k[0]+(((uint32_t)k[1])<<16);
+ break;
+ case 11: c+=((uint32_t)k8[10])<<16; /* fall through */
+ case 10: c+=k[4];
+ b+=k[2]+(((uint32_t)k[3])<<16);
+ a+=k[0]+(((uint32_t)k[1])<<16);
+ break;
+ case 9 : c+=k8[8]; /* fall through */
+ case 8 : b+=k[2]+(((uint32_t)k[3])<<16);
+ a+=k[0]+(((uint32_t)k[1])<<16);
+ break;
+ case 7 : b+=((uint32_t)k8[6])<<16; /* fall through */
+ case 6 : b+=k[2];
+ a+=k[0]+(((uint32_t)k[1])<<16);
+ break;
+ case 5 : b+=k8[4]; /* fall through */
+ case 4 : a+=k[0]+(((uint32_t)k[1])<<16);
+ break;
+ case 3 : a+=((uint32_t)k8[2])<<16; /* fall through */
+ case 2 : a+=k[0];
+ break;
+ case 1 : a+=k8[0];
+ break;
+ case 0 : return c; /* zero length requires no mixing */
+ }
+
+ } else { /* need to read the key one byte at a time */
+ const uint8_t *k = (const uint8_t *)key;
+
+ /*--------------- all but the last block: affect some 32 bits of (a,b,c) */
+ while (length > 12)
+ {
+ a += k[0];
+ a += ((uint32_t)k[1])<<8;
+ a += ((uint32_t)k[2])<<16;
+ a += ((uint32_t)k[3])<<24;
+ b += k[4];
+ b += ((uint32_t)k[5])<<8;
+ b += ((uint32_t)k[6])<<16;
+ b += ((uint32_t)k[7])<<24;
+ c += k[8];
+ c += ((uint32_t)k[9])<<8;
+ c += ((uint32_t)k[10])<<16;
+ c += ((uint32_t)k[11])<<24;
+ mix(a,b,c);
+ length -= 12;
+ k += 12;
+ }
+
+ /*-------------------------------- last block: affect all 32 bits of (c) */
+ switch(length) /* all the case statements fall through */
+ {
+ case 12: c+=((uint32_t)k[11])<<24;
+ case 11: c+=((uint32_t)k[10])<<16;
+ case 10: c+=((uint32_t)k[9])<<8;
+ case 9 : c+=k[8];
+ case 8 : b+=((uint32_t)k[7])<<24;
+ case 7 : b+=((uint32_t)k[6])<<16;
+ case 6 : b+=((uint32_t)k[5])<<8;
+ case 5 : b+=k[4];
+ case 4 : a+=((uint32_t)k[3])<<24;
+ case 3 : a+=((uint32_t)k[2])<<16;
+ case 2 : a+=((uint32_t)k[1])<<8;
+ case 1 : a+=k[0];
+ break;
+ case 0 : return c;
+ }
+ }
+
+ final(a,b,c);
+ *val2 = b;
+ return c;
+}
+
+/*
+ * hashbig():
+ * This is the same as hash_word() on big-endian machines. It is different
+ * from hashlittle() on all machines. hashbig() takes advantage of
+ * big-endian byte ordering.
+ */
+static uint32_t hashbig( const void *key, size_t length, uint32_t *val2)
+{
+ uint32_t a,b,c;
+ union { const void *ptr; size_t i; } u; /* to cast key to (size_t) happily */
+
+ /* Set up the internal state */
+ a = b = c = 0xdeadbeef + ((uint32_t)length) + *val2;
+
+ u.ptr = key;
+ if (HASH_BIG_ENDIAN && ((u.i & 0x3) == 0)) {
+ const uint32_t *k = (const uint32_t *)key; /* read 32-bit chunks */
+ const uint8_t *k8;
+
+ /*------ all but last block: aligned reads and affect 32 bits of (a,b,c) */
+ while (length > 12)
+ {
+ a += k[0];
+ b += k[1];
+ c += k[2];
+ mix(a,b,c);
+ length -= 12;
+ k += 3;
+ }
+
+ /*----------------------------- handle the last (probably partial) block */
+ /*
+ * "k[2]<<8" actually reads beyond the end of the string, but
+ * then shifts out the part it's not allowed to read. Because the
+ * string is aligned, the illegal read is in the same word as the
+ * rest of the string. Every machine with memory protection I've seen
+ * does it on word boundaries, so is OK with this. But VALGRIND will
+ * still catch it and complain. The masking trick does make the hash
+ * noticeably faster for short strings (like English words).
+ *
+ * Not on my testing with gcc 4.5 on an intel i5 CPU, at least --RR.
+ */
+#if 0
+ switch(length)
+ {
+ case 12: c+=k[2]; b+=k[1]; a+=k[0]; break;
+ case 11: c+=k[2]&0xffffff00; b+=k[1]; a+=k[0]; break;
+ case 10: c+=k[2]&0xffff0000; b+=k[1]; a+=k[0]; break;
+ case 9 : c+=k[2]&0xff000000; b+=k[1]; a+=k[0]; break;
+ case 8 : b+=k[1]; a+=k[0]; break;
+ case 7 : b+=k[1]&0xffffff00; a+=k[0]; break;
+ case 6 : b+=k[1]&0xffff0000; a+=k[0]; break;
+ case 5 : b+=k[1]&0xff000000; a+=k[0]; break;
+ case 4 : a+=k[0]; break;
+ case 3 : a+=k[0]&0xffffff00; break;
+ case 2 : a+=k[0]&0xffff0000; break;
+ case 1 : a+=k[0]&0xff000000; break;
+ case 0 : return c; /* zero length strings require no mixing */
+ }
+
+#else /* make valgrind happy */
+
+ k8 = (const uint8_t *)k;
+ switch(length) /* all the case statements fall through */
+ {
+ case 12: c+=k[2]; b+=k[1]; a+=k[0]; break;
+ case 11: c+=((uint32_t)k8[10])<<8; /* fall through */
+ case 10: c+=((uint32_t)k8[9])<<16; /* fall through */
+ case 9 : c+=((uint32_t)k8[8])<<24; /* fall through */
+ case 8 : b+=k[1]; a+=k[0]; break;
+ case 7 : b+=((uint32_t)k8[6])<<8; /* fall through */
+ case 6 : b+=((uint32_t)k8[5])<<16; /* fall through */
+ case 5 : b+=((uint32_t)k8[4])<<24; /* fall through */
+ case 4 : a+=k[0]; break;
+ case 3 : a+=((uint32_t)k8[2])<<8; /* fall through */
+ case 2 : a+=((uint32_t)k8[1])<<16; /* fall through */
+ case 1 : a+=((uint32_t)k8[0])<<24; break;
+ case 0 : return c;
+ }
+
+#endif /* !VALGRIND */
+
+ } else { /* need to read the key one byte at a time */
+ const uint8_t *k = (const uint8_t *)key;
+
+ /*--------------- all but the last block: affect some 32 bits of (a,b,c) */
+ while (length > 12)
+ {
+ a += ((uint32_t)k[0])<<24;
+ a += ((uint32_t)k[1])<<16;
+ a += ((uint32_t)k[2])<<8;
+ a += ((uint32_t)k[3]);
+ b += ((uint32_t)k[4])<<24;
+ b += ((uint32_t)k[5])<<16;
+ b += ((uint32_t)k[6])<<8;
+ b += ((uint32_t)k[7]);
+ c += ((uint32_t)k[8])<<24;
+ c += ((uint32_t)k[9])<<16;
+ c += ((uint32_t)k[10])<<8;
+ c += ((uint32_t)k[11]);
+ mix(a,b,c);
+ length -= 12;
+ k += 12;
+ }
+
+ /*-------------------------------- last block: affect all 32 bits of (c) */
+ switch(length) /* all the case statements fall through */
+ {
+ case 12: c+=k[11];
+ case 11: c+=((uint32_t)k[10])<<8;
+ case 10: c+=((uint32_t)k[9])<<16;
+ case 9 : c+=((uint32_t)k[8])<<24;
+ case 8 : b+=k[7];
+ case 7 : b+=((uint32_t)k[6])<<8;
+ case 6 : b+=((uint32_t)k[5])<<16;
+ case 5 : b+=((uint32_t)k[4])<<24;
+ case 4 : a+=k[3];
+ case 3 : a+=((uint32_t)k[2])<<8;
+ case 2 : a+=((uint32_t)k[1])<<16;
+ case 1 : a+=((uint32_t)k[0])<<24;
+ break;
+ case 0 : return c;
+ }
+ }
+
+ final(a,b,c);
+ *val2 = b;
+ return c;
+}
+
+/* I basically use hashlittle here, but use native endian within each
+ * element. This delivers least-surprise: hash such as "int arr[] = {
+ * 1, 2 }; hash_stable(arr, 2, 0);" will be the same on big and little
+ * endian machines, even though a bytewise hash wouldn't be. */
+uint64_t hash64_stable_64(const void *key, size_t n, uint64_t base)
+{
+ const uint64_t *k = key;
+ uint32_t a,b,c;
+
+ /* Set up the internal state */
+ a = b = c = 0xdeadbeef + ((uint32_t)n*8) + (base >> 32) + base;
+
+ while (n > 3) {
+ a += (uint32_t)k[0];
+ b += (uint32_t)(k[0] >> 32);
+ c += (uint32_t)k[1];
+ mix(a,b,c);
+ a += (uint32_t)(k[1] >> 32);
+ b += (uint32_t)k[2];
+ c += (uint32_t)(k[2] >> 32);
+ mix(a,b,c);
+ n -= 3;
+ k += 3;
+ }
+ switch (n) {
+ case 2:
+ a += (uint32_t)k[0];
+ b += (uint32_t)(k[0] >> 32);
+ c += (uint32_t)k[1];
+ mix(a,b,c);
+ a += (uint32_t)(k[1] >> 32);
+ break;
+ case 1:
+ a += (uint32_t)k[0];
+ b += (uint32_t)(k[0] >> 32);
+ break;
+ case 0:
+ return c;
+ }
+ final(a,b,c);
+ return ((uint64_t)b << 32) | c;
+}
+
+uint64_t hash64_stable_32(const void *key, size_t n, uint64_t base)
+{
+ const uint32_t *k = key;
+ uint32_t a,b,c;
+
+ /* Set up the internal state */
+ a = b = c = 0xdeadbeef + ((uint32_t)n*4) + (base >> 32) + base;
+
+ while (n > 3) {
+ a += k[0];
+ b += k[1];
+ c += k[2];
+ mix(a,b,c);
+
+ n -= 3;
+ k += 3;
+ }
+ switch (n) {
+ case 2:
+ b += (uint32_t)k[1];
+ case 1:
+ a += (uint32_t)k[0];
+ break;
+ case 0:
+ return c;
+ }
+ final(a,b,c);
+ return ((uint64_t)b << 32) | c;
+}
+
+uint64_t hash64_stable_16(const void *key, size_t n, uint64_t base)
+{
+ const uint16_t *k = key;
+ uint32_t a,b,c;
+
+ /* Set up the internal state */
+ a = b = c = 0xdeadbeef + ((uint32_t)n*2) + (base >> 32) + base;
+
+ while (n > 6) {
+ a += (uint32_t)k[0] + ((uint32_t)k[1] << 16);
+ b += (uint32_t)k[2] + ((uint32_t)k[3] << 16);
+ c += (uint32_t)k[4] + ((uint32_t)k[5] << 16);
+ mix(a,b,c);
+
+ n -= 6;
+ k += 6;
+ }
+
+ switch (n) {
+ case 5:
+ c += (uint32_t)k[4];
+ case 4:
+ b += ((uint32_t)k[3] << 16);
+ case 3:
+ b += (uint32_t)k[2];
+ case 2:
+ a += ((uint32_t)k[1] << 16);
+ case 1:
+ a += (uint32_t)k[0];
+ break;
+ case 0:
+ return c;
+ }
+ final(a,b,c);
+ return ((uint64_t)b << 32) | c;
+}
+
+uint64_t hash64_stable_8(const void *key, size_t n, uint64_t base)
+{
+ uint32_t b32 = base + (base >> 32);
+ uint32_t lower = hashlittle(key, n, &b32);
+
+ return ((uint64_t)b32 << 32) | lower;
+}
+
+uint32_t hash_any(const void *key, size_t length, uint32_t base)
+{
+ if (HASH_BIG_ENDIAN)
+ return hashbig(key, length, &base);
+ else
+ return hashlittle(key, length, &base);
+}
+
+uint32_t hash_stable_64(const void *key, size_t n, uint32_t base)
+{
+ return hash64_stable_64(key, n, base);
+}
+
+uint32_t hash_stable_32(const void *key, size_t n, uint32_t base)
+{
+ return hash64_stable_32(key, n, base);
+}
+
+uint32_t hash_stable_16(const void *key, size_t n, uint32_t base)
+{
+ return hash64_stable_16(key, n, base);
+}
+
+uint32_t hash_stable_8(const void *key, size_t n, uint32_t base)
+{
+ return hashlittle(key, n, &base);
+}
+
+/* Jenkins' lookup8 is a 64 bit hash, but he says it's obsolete. Use
+ * the plain one and recombine into 64 bits. */
+uint64_t hash64_any(const void *key, size_t length, uint64_t base)
+{
+ uint32_t b32 = base + (base >> 32);
+ uint32_t lower;
+
+ if (HASH_BIG_ENDIAN)
+ lower = hashbig(key, length, &b32);
+ else
+ lower = hashlittle(key, length, &b32);
+
+ return ((uint64_t)b32 << 32) | lower;
+}
+
+#ifdef SELF_TEST
+
+/* used for timings */
+void driver1()
+{
+ uint8_t buf[256];
+ uint32_t i;
+ uint32_t h=0;
+ time_t a,z;
+
+ time(&a);
+ for (i=0; i<256; ++i) buf[i] = 'x';
+ for (i=0; i<1; ++i)
+ {
+ h = hashlittle(&buf[0],1,h);
+ }
+ time(&z);
+ if (z-a > 0) printf("time %d %.8x\n", z-a, h);
+}
+
+/* check that every input bit changes every output bit half the time */
+#define HASHSTATE 1
+#define HASHLEN 1
+#define MAXPAIR 60
+#define MAXLEN 70
+void driver2()
+{
+ uint8_t qa[MAXLEN+1], qb[MAXLEN+2], *a = &qa[0], *b = &qb[1];
+ uint32_t c[HASHSTATE], d[HASHSTATE], i=0, j=0, k, l, m=0, z;
+ uint32_t e[HASHSTATE],f[HASHSTATE],g[HASHSTATE],h[HASHSTATE];
+ uint32_t x[HASHSTATE],y[HASHSTATE];
+ uint32_t hlen;
+
+ printf("No more than %d trials should ever be needed \n",MAXPAIR/2);
+ for (hlen=0; hlen < MAXLEN; ++hlen)
+ {
+ z=0;
+ for (i=0; i<hlen; ++i) /*----------------------- for each input byte, */
+ {
+ for (j=0; j<8; ++j) /*------------------------ for each input bit, */
+ {
+ for (m=1; m<8; ++m) /*------------ for several possible initvals, */
+ {
+ for (l=0; l<HASHSTATE; ++l)
+ e[l]=f[l]=g[l]=h[l]=x[l]=y[l]=~((uint32_t)0);
+
+ /*---- check that every output bit is affected by that input bit */
+ for (k=0; k<MAXPAIR; k+=2)
+ {
+ uint32_t finished=1;
+ /* keys have one bit different */
+ for (l=0; l<hlen+1; ++l) {a[l] = b[l] = (uint8_t)0;}
+ /* have a and b be two keys differing in only one bit */
+ a[i] ^= (k<<j);
+ a[i] ^= (k>>(8-j));
+ c[0] = hashlittle(a, hlen, m);
+ b[i] ^= ((k+1)<<j);
+ b[i] ^= ((k+1)>>(8-j));
+ d[0] = hashlittle(b, hlen, m);
+ /* check every bit is 1, 0, set, and not set at least once */
+ for (l=0; l<HASHSTATE; ++l)
+ {
+ e[l] &= (c[l]^d[l]);
+ f[l] &= ~(c[l]^d[l]);
+ g[l] &= c[l];
+ h[l] &= ~c[l];
+ x[l] &= d[l];
+ y[l] &= ~d[l];
+ if (e[l]|f[l]|g[l]|h[l]|x[l]|y[l]) finished=0;
+ }
+ if (finished) break;
+ }
+ if (k>z) z=k;
+ if (k==MAXPAIR)
+ {
+ printf("Some bit didn't change: ");
+ printf("%.8x %.8x %.8x %.8x %.8x %.8x ",
+ e[0],f[0],g[0],h[0],x[0],y[0]);
+ printf("i %d j %d m %d len %d\n", i, j, m, hlen);
+ }
+ if (z==MAXPAIR) goto done;
+ }
+ }
+ }
+ done:
+ if (z < MAXPAIR)
+ {
+ printf("Mix success %2d bytes %2d initvals ",i,m);
+ printf("required %d trials\n", z/2);
+ }
+ }
+ printf("\n");
+}
+
+/* Check for reading beyond the end of the buffer and alignment problems */
+void driver3()
+{
+ uint8_t buf[MAXLEN+20], *b;
+ uint32_t len;
+ uint8_t q[] = "This is the time for all good men to come to the aid of their country...";
+ uint32_t h;
+ uint8_t qq[] = "xThis is the time for all good men to come to the aid of their country...";
+ uint32_t i;
+ uint8_t qqq[] = "xxThis is the time for all good men to come to the aid of their country...";
+ uint32_t j;
+ uint8_t qqqq[] = "xxxThis is the time for all good men to come to the aid of their country...";
+ uint32_t ref,x,y;
+ uint8_t *p;
+
+ printf("Endianness. These lines should all be the same (for values filled in):\n");
+ printf("%.8x %.8x %.8x\n",
+ hash_word((const uint32_t *)q, (sizeof(q)-1)/4, 13),
+ hash_word((const uint32_t *)q, (sizeof(q)-5)/4, 13),
+ hash_word((const uint32_t *)q, (sizeof(q)-9)/4, 13));
+ p = q;
+ printf("%.8x %.8x %.8x %.8x %.8x %.8x %.8x %.8x %.8x %.8x %.8x %.8x\n",
+ hashlittle(p, sizeof(q)-1, 13), hashlittle(p, sizeof(q)-2, 13),
+ hashlittle(p, sizeof(q)-3, 13), hashlittle(p, sizeof(q)-4, 13),
+ hashlittle(p, sizeof(q)-5, 13), hashlittle(p, sizeof(q)-6, 13),
+ hashlittle(p, sizeof(q)-7, 13), hashlittle(p, sizeof(q)-8, 13),
+ hashlittle(p, sizeof(q)-9, 13), hashlittle(p, sizeof(q)-10, 13),
+ hashlittle(p, sizeof(q)-11, 13), hashlittle(p, sizeof(q)-12, 13));
+ p = &qq[1];
+ printf("%.8x %.8x %.8x %.8x %.8x %.8x %.8x %.8x %.8x %.8x %.8x %.8x\n",
+ hashlittle(p, sizeof(q)-1, 13), hashlittle(p, sizeof(q)-2, 13),
+ hashlittle(p, sizeof(q)-3, 13), hashlittle(p, sizeof(q)-4, 13),
+ hashlittle(p, sizeof(q)-5, 13), hashlittle(p, sizeof(q)-6, 13),
+ hashlittle(p, sizeof(q)-7, 13), hashlittle(p, sizeof(q)-8, 13),
+ hashlittle(p, sizeof(q)-9, 13), hashlittle(p, sizeof(q)-10, 13),
+ hashlittle(p, sizeof(q)-11, 13), hashlittle(p, sizeof(q)-12, 13));
+ p = &qqq[2];
+ printf("%.8x %.8x %.8x %.8x %.8x %.8x %.8x %.8x %.8x %.8x %.8x %.8x\n",
+ hashlittle(p, sizeof(q)-1, 13), hashlittle(p, sizeof(q)-2, 13),
+ hashlittle(p, sizeof(q)-3, 13), hashlittle(p, sizeof(q)-4, 13),
+ hashlittle(p, sizeof(q)-5, 13), hashlittle(p, sizeof(q)-6, 13),
+ hashlittle(p, sizeof(q)-7, 13), hashlittle(p, sizeof(q)-8, 13),
+ hashlittle(p, sizeof(q)-9, 13), hashlittle(p, sizeof(q)-10, 13),
+ hashlittle(p, sizeof(q)-11, 13), hashlittle(p, sizeof(q)-12, 13));
+ p = &qqqq[3];
+ printf("%.8x %.8x %.8x %.8x %.8x %.8x %.8x %.8x %.8x %.8x %.8x %.8x\n",
+ hashlittle(p, sizeof(q)-1, 13), hashlittle(p, sizeof(q)-2, 13),
+ hashlittle(p, sizeof(q)-3, 13), hashlittle(p, sizeof(q)-4, 13),
+ hashlittle(p, sizeof(q)-5, 13), hashlittle(p, sizeof(q)-6, 13),
+ hashlittle(p, sizeof(q)-7, 13), hashlittle(p, sizeof(q)-8, 13),
+ hashlittle(p, sizeof(q)-9, 13), hashlittle(p, sizeof(q)-10, 13),
+ hashlittle(p, sizeof(q)-11, 13), hashlittle(p, sizeof(q)-12, 13));
+ printf("\n");
+
+ /* check that hashlittle2 and hashlittle produce the same results */
+ i=47; j=0;
+ hashlittle2(q, sizeof(q), &i, &j);
+ if (hashlittle(q, sizeof(q), 47) != i)
+ printf("hashlittle2 and hashlittle mismatch\n");
+
+ /* check that hash_word2 and hash_word produce the same results */
+ len = 0xdeadbeef;
+ i=47, j=0;
+ hash_word2(&len, 1, &i, &j);
+ if (hash_word(&len, 1, 47) != i)
+ printf("hash_word2 and hash_word mismatch %x %x\n",
+ i, hash_word(&len, 1, 47));
+
+ /* check hashlittle doesn't read before or after the ends of the string */
+ for (h=0, b=buf+1; h<8; ++h, ++b)
+ {
+ for (i=0; i<MAXLEN; ++i)
+ {
+ len = i;
+ for (j=0; j<i; ++j) *(b+j)=0;
+
+ /* these should all be equal */
+ ref = hashlittle(b, len, (uint32_t)1);
+ *(b+i)=(uint8_t)~0;
+ *(b-1)=(uint8_t)~0;
+ x = hashlittle(b, len, (uint32_t)1);
+ y = hashlittle(b, len, (uint32_t)1);
+ if ((ref != x) || (ref != y))
+ {
+ printf("alignment error: %.8x %.8x %.8x %d %d\n",ref,x,y,
+ h, i);
+ }
+ }
+ }
+}
+
+/* check for problems with nulls */
+void driver4()
+{
+ uint8_t buf[1];
+ uint32_t h,i,state[HASHSTATE];
+
+
+ buf[0] = ~0;
+ for (i=0; i<HASHSTATE; ++i) state[i] = 1;
+ printf("These should all be different\n");
+ for (i=0, h=0; i<8; ++i)
+ {
+ h = hashlittle(buf, 0, h);
+ printf("%2ld 0-byte strings, hash is %.8x\n", i, h);
+ }
+}
+
+
+int main()
+{
+ driver1(); /* test that the key is hashed: used for timings */
+ driver2(); /* test that whole key is hashed thoroughly */
+ driver3(); /* test that nothing but the key is hashed */
+ driver4(); /* test hashing multiple buffers (all buffers are null) */
+ return 1;
+}
+
+#endif /* SELF_TEST */
diff --git a/ccan/hash/hash.h b/ccan/hash/hash.h
new file mode 100644
index 0000000..0400e6a
--- /dev/null
+++ b/ccan/hash/hash.h
@@ -0,0 +1,312 @@
+#ifndef CCAN_HASH_H
+#define CCAN_HASH_H
+#include "config.h"
+#include <stdint.h>
+#include <stdlib.h>
+#include <ccan/build_assert/build_assert.h>
+
+/* Stolen mostly from: lookup3.c, by Bob Jenkins, May 2006, Public Domain.
+ *
+ * http://burtleburtle.net/bob/c/lookup3.c
+ */
+
+/**
+ * hash - fast hash of an array for internal use
+ * @p: the array or pointer to first element
+ * @num: the number of elements to hash
+ * @base: the base number to roll into the hash (usually 0)
+ *
+ * The memory region pointed to by p is combined with the base to form
+ * a 32-bit hash.
+ *
+ * This hash will have different results on different machines, so is
+ * only useful for internal hashes (ie. not hashes sent across the
+ * network or saved to disk).
+ *
+ * It may also change with future versions: it could even detect at runtime
+ * what the fastest hash to use is.
+ *
+ * See also: hash64, hash_stable.
+ *
+ * Example:
+ * #include <ccan/hash/hash.h>
+ * #include <err.h>
+ * #include <stdio.h>
+ * #include <string.h>
+ *
+ * // Simple demonstration: identical strings will have the same hash, but
+ * // two different strings will probably not.
+ * int main(int argc, char *argv[])
+ * {
+ * uint32_t hash1, hash2;
+ *
+ * if (argc != 3)
+ * err(1, "Usage: %s <string1> <string2>", argv[0]);
+ *
+ * hash1 = hash(argv[1], strlen(argv[1]), 0);
+ * hash2 = hash(argv[2], strlen(argv[2]), 0);
+ * printf("Hash is %s\n", hash1 == hash2 ? "same" : "different");
+ * return 0;
+ * }
+ */
+#define hash(p, num, base) hash_any((p), (num)*sizeof(*(p)), (base))
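
Since @num counts elements rather than bytes (the macro multiplies by sizeof(*p)), an array of structs can be hashed by passing its element count; a hedged sketch, with struct point and hash_points as illustrative names:

    #include <ccan/hash/hash.h>

    struct point { int x, y; };

    static uint32_t hash_points(const struct point *pts, size_t n)
    {
        /* n is the element count; the macro scales it by sizeof(*pts). */
        return hash(pts, n, 0);
    }
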
+
+/**
+ * hash_stable - hash of an array for external use
+ * @p: the array or pointer to first element
+ * @num: the number of elements to hash
+ * @base: the base number to roll into the hash (usually 0)
+ *
+ * The array of simple integer types pointed to by p is combined with
+ * the base to form a 32-bit hash.
+ *
+ * This hash will have the same results on different machines, so can
+ * be used for external hashes (ie. hashes sent across the network or
+ * saved to disk). The results will not change in future versions of
+ * this module.
+ *
+ * Note that it is only legal to hand an array of simple integer types
+ * to this hash (ie. char, uint16_t, int64_t, etc). In these cases,
+ * the same values will have the same hash result, even though the
+ * memory representations of integers depend on the machine
+ * endianness.
+ *
+ * See also:
+ * hash64_stable
+ *
+ * Example:
+ * #include <ccan/hash/hash.h>
+ * #include <err.h>
+ * #include <stdio.h>
+ * #include <string.h>
+ *
+ * int main(int argc, char *argv[])
+ * {
+ * if (argc != 2)
+ * err(1, "Usage: %s <string-to-hash>", argv[0]);
+ *
+ * printf("Hash stable result is %u\n",
+ * hash_stable(argv[1], strlen(argv[1]), 0));
+ * return 0;
+ * }
+ */
+#define hash_stable(p, num, base) \
+ (BUILD_ASSERT_OR_ZERO(sizeof(*(p)) == 8 || sizeof(*(p)) == 4 \
+ || sizeof(*(p)) == 2 || sizeof(*(p)) == 1) + \
+ sizeof(*(p)) == 8 ? hash_stable_64((p), (num), (base)) \
+ : sizeof(*(p)) == 4 ? hash_stable_32((p), (num), (base)) \
+ : sizeof(*(p)) == 2 ? hash_stable_16((p), (num), (base)) \
+ : hash_stable_8((p), (num), (base)))
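
A short sketch of the size dispatch above: for a uint16_t array the sizeof check selects hash_stable_16(), so machines of either endianness agree on the result as long as the element values agree (hash_ports is an illustrative name):

    #include <ccan/hash/hash.h>

    static uint32_t hash_ports(const uint16_t *ports, size_t n)
    {
        /* sizeof(*ports) == 2, so this expands to hash_stable_16(). */
        return hash_stable(ports, n, 0);
    }
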
+
+/**
+ * hash_u32 - fast hash an array of 32-bit values for internal use
+ * @key: the array of uint32_t
+ * @num: the number of elements to hash
+ * @base: the base number to roll into the hash (usually 0)
+ *
+ * The array of uint32_t pointed to by @key is combined with the base
+ * to form a 32-bit hash. This is 2-3 times faster than hash() on small
+ * arrays, but the advantage vanishes over large hashes.
+ *
+ * This hash will have different results on different machines, so is
+ * only useful for internal hashes (ie. not hashes sent across the
+ * network or saved to disk).
+ */
+uint32_t hash_u32(const uint32_t *key, size_t num, uint32_t base);
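
hash_u32() carries no example of its own; a minimal sketch of internal-only use on an array of 32-bit IDs (the bucket count and helper name are illustrative):

    #include <ccan/hash/hash.h>

    #define ID_BUCKETS 256

    static unsigned int id_bucket(const uint32_t *ids, size_t n)
    {
        /* Internal hash: fine for an in-memory table, not for disk. */
        return hash_u32(ids, n, 0) % ID_BUCKETS;
    }
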
+
+/**
+ * hash_string - very fast hash of an ascii string
+ * @str: the nul-terminated string
+ *
+ * The string is hashed, using a hash function optimized for ASCII and
+ * similar strings. It's weaker than the other hash functions.
+ *
+ * This hash may have different results on different machines, so is
+ * only useful for internal hashes (ie. not hashes sent across the
+ * network or saved to disk). The results will be different from the
+ * other hash functions in this module, too.
+ */
+static inline uint32_t hash_string(const char *string)
+{
+ /* This is Karl Nelson <kenelson@ece.ucdavis.edu>'s X31 hash.
+ * It's a little faster than the (much better) lookup3 hash(): 56ns vs
+ * 84ns on my 2GHz Intel Core Duo 2 laptop for a 10 char string. */
+ uint32_t ret;
+
+ for (ret = 0; *string; string++)
+ ret = (ret << 5) - ret + *string;
+
+ return ret;
+}
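
Given that this is the weak X31 hash, a typical use is a small in-memory string table; a minimal sketch (bucket count and helper name are illustrative):

    #include <ccan/hash/hash.h>

    #define STR_BUCKETS 128

    static unsigned int str_bucket(const char *name)
    {
        return hash_string(name) % STR_BUCKETS;
    }
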
+
+/**
+ * hash64 - fast 64-bit hash of an array for internal use
+ * @p: the array or pointer to first element
+ * @num: the number of elements to hash
+ * @base: the 64-bit base number to roll into the hash (usually 0)
+ *
+ * The memory region pointed to by p is combined with the base to form
+ * a 64-bit hash.
+ *
+ * This hash will have different results on different machines, so is
+ * only useful for internal hashes (ie. not hashes sent across the
+ * network or saved to disk).
+ *
+ * It may also change with future versions: it could even detect at runtime
+ * what the fastest hash to use is.
+ *
+ * See also: hash.
+ *
+ * Example:
+ * #include <ccan/hash/hash.h>
+ * #include <err.h>
+ * #include <stdio.h>
+ * #include <string.h>
+ *
+ * // Simple demonstration: identical strings will have the same hash, but
+ * // two different strings will probably not.
+ * int main(int argc, char *argv[])
+ * {
+ * uint64_t hash1, hash2;
+ *
+ * if (argc != 3)
+ * err(1, "Usage: %s <string1> <string2>", argv[0]);
+ *
+ * hash1 = hash64(argv[1], strlen(argv[1]), 0);
+ * hash2 = hash64(argv[2], strlen(argv[2]), 0);
+ * printf("Hash is %s\n", hash1 == hash2 ? "same" : "different");
+ * return 0;
+ * }
+ */
+#define hash64(p, num, base) hash64_any((p), (num)*sizeof(*(p)), (base))
+
+/**
+ * hash64_stable - 64 bit hash of an array for external use
+ * @p: the array or pointer to first element
+ * @num: the number of elements to hash
+ * @base: the base number to roll into the hash (usually 0)
+ *
+ * The array of simple integer types pointed to by p is combined with
+ * the base to form a 64-bit hash.
+ *
+ * This hash will have the same results on different machines, so can
+ * be used for external hashes (ie. hashes sent across the network or
+ * saved to disk). The results will not change in future versions of
+ * this module.
+ *
+ * Note that it is only legal to hand an array of simple integer types
+ * to this hash (ie. char, uint16_t, int64_t, etc). In these cases,
+ * the same values will have the same hash result, even though the
+ * memory representations of integers depend on the machine
+ * endianness.
+ *
+ * See also:
+ * hash_stable
+ *
+ * Example:
+ * #include <ccan/hash/hash.h>
+ * #include <err.h>
+ * #include <stdio.h>
+ * #include <string.h>
+ *
+ * int main(int argc, char *argv[])
+ * {
+ * if (argc != 2)
+ * err(1, "Usage: %s <string-to-hash>", argv[0]);
+ *
+ * printf("Hash stable result is %llu\n",
+ * (long long)hash64_stable(argv[1], strlen(argv[1]), 0));
+ * return 0;
+ * }
+ */
+#define hash64_stable(p, num, base) \
+ (BUILD_ASSERT_OR_ZERO(sizeof(*(p)) == 8 || sizeof(*(p)) == 4 \
+ || sizeof(*(p)) == 2 || sizeof(*(p)) == 1) + \
+ sizeof(*(p)) == 8 ? hash64_stable_64((p), (num), (base)) \
+ : sizeof(*(p)) == 4 ? hash64_stable_32((p), (num), (base)) \
+ : sizeof(*(p)) == 2 ? hash64_stable_16((p), (num), (base)) \
+ : hash64_stable_8((p), (num), (base)))
+
+
+/**
+ * hashl - fast 32/64-bit hash of an array for internal use
+ * @p: the array or pointer to first element
+ * @num: the number of elements to hash
+ * @base: the base number to roll into the hash (usually 0)
+ *
+ * This is either hash() or hash64(), on 32/64 bit long machines.
+ */
+#define hashl(p, num, base) \
+ (BUILD_ASSERT_OR_ZERO(sizeof(long) == sizeof(uint32_t) \
+ || sizeof(long) == sizeof(uint64_t)) + \
+ (sizeof(long) == sizeof(uint64_t) \
+ ? hash64((p), (num), (base)) : hash((p), (num), (base))))
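
A brief sketch of hashl() on long-sized data, assuming only the macros above (hash_offsets is an illustrative name):

    #include <ccan/hash/hash.h>

    static unsigned long hash_offsets(const long *offsets, size_t n)
    {
        /* 32-bit result where long is 32 bits, 64-bit where it is 64. */
        return hashl(offsets, n, 0);
    }
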
+
+/* Our underlying operations. */
+uint32_t hash_any(const void *key, size_t length, uint32_t base);
+uint32_t hash_stable_64(const void *key, size_t n, uint32_t base);
+uint32_t hash_stable_32(const void *key, size_t n, uint32_t base);
+uint32_t hash_stable_16(const void *key, size_t n, uint32_t base);
+uint32_t hash_stable_8(const void *key, size_t n, uint32_t base);
+uint64_t hash64_any(const void *key, size_t length, uint64_t base);
+uint64_t hash64_stable_64(const void *key, size_t n, uint64_t base);
+uint64_t hash64_stable_32(const void *key, size_t n, uint64_t base);
+uint64_t hash64_stable_16(const void *key, size_t n, uint64_t base);
+uint64_t hash64_stable_8(const void *key, size_t n, uint64_t base);
+
+/**
+ * hash_pointer - hash a pointer for internal use
+ * @p: the pointer value to hash
+ * @base: the base number to roll into the hash (usually 0)
+ *
+ * The pointer p (not what p points to!) is combined with the base to form
+ * a 32-bit hash.
+ *
+ * This hash will have different results on different machines, so is
+ * only useful for internal hashes (ie. not hashes sent across the
+ * network or saved to disk).
+ *
+ * Example:
+ * #include <ccan/hash/hash.h>
+ *
+ * // Code to keep track of memory regions.
+ * struct region {
+ * struct region *chain;
+ * void *start;
+ * unsigned int size;
+ * };
+ * // We keep a simple hash table.
+ * static struct region *region_hash[128];
+ *
+ * static void add_region(struct region *r)
+ * {
+ * unsigned int h = hash_pointer(r->start, 0) % 128;
+ *
+ * r->chain = region_hash[h];
+ * region_hash[h] = r;
+ * }
+ *
+ * static struct region *find_region(const void *start)
+ * {
+ * struct region *r;
+ *
+ * for (r = region_hash[hash_pointer(start, 0) % 128]; r; r = r->chain)
+ * if (r->start == start)
+ * return r;
+ * return NULL;
+ * }
+ */
+static inline uint32_t hash_pointer(const void *p, uint32_t base)
+{
+ if (sizeof(p) % sizeof(uint32_t) == 0) {
+ /* This convoluted union is the right way of aliasing. */
+ union {
+ uint32_t u32[sizeof(p) / sizeof(uint32_t)];
+ const void *p;
+ } u;
+ u.p = p;
+ return hash_u32(u.u32, sizeof(p) / sizeof(uint32_t), base);
+ } else
+ return hash(&p, 1, base);
+}
+#endif /* CCAN_HASH_H */
diff --git a/ccan/hash/test/api-hash_stable.c b/ccan/hash/test/api-hash_stable.c
new file mode 100644
index 0000000..bb58d16
--- /dev/null
+++ b/ccan/hash/test/api-hash_stable.c
@@ -0,0 +1,300 @@
+#include <ccan/hash/hash.h>
+#include <ccan/tap/tap.h>
+#include <stdbool.h>
+#include <string.h>
+
+#define ARRAY_WORDS 5
+
+int main(int argc, char *argv[])
+{
+ unsigned int i;
+ uint8_t u8array[ARRAY_WORDS];
+ uint16_t u16array[ARRAY_WORDS];
+ uint32_t u32array[ARRAY_WORDS];
+ uint64_t u64array[ARRAY_WORDS];
+
+ /* Initialize arrays. */
+ for (i = 0; i < ARRAY_WORDS; i++) {
+ u8array[i] = i;
+ u16array[i] = i;
+ u32array[i] = i;
+ u64array[i] = i;
+ }
+
+ plan_tests(264);
+
+ /* hash_stable is API-guaranteed. */
+ ok1(hash_stable(u8array, ARRAY_WORDS, 0) == 0x1d4833cc);
+ ok1(hash_stable(u8array, ARRAY_WORDS, 1) == 0x37125e2 );
+ ok1(hash_stable(u8array, ARRAY_WORDS, 2) == 0x330a007a);
+ ok1(hash_stable(u8array, ARRAY_WORDS, 4) == 0x7b0df29b);
+ ok1(hash_stable(u8array, ARRAY_WORDS, 8) == 0xe7e5d741);
+ ok1(hash_stable(u8array, ARRAY_WORDS, 16) == 0xaae57471);
+ ok1(hash_stable(u8array, ARRAY_WORDS, 32) == 0xc55399e5);
+ ok1(hash_stable(u8array, ARRAY_WORDS, 64) == 0x67f21f7 );
+ ok1(hash_stable(u8array, ARRAY_WORDS, 128) == 0x1d795b71);
+ ok1(hash_stable(u8array, ARRAY_WORDS, 256) == 0xeb961671);
+ ok1(hash_stable(u8array, ARRAY_WORDS, 512) == 0xc2597247);
+ ok1(hash_stable(u8array, ARRAY_WORDS, 1024) == 0x3f5c4d75);
+ ok1(hash_stable(u8array, ARRAY_WORDS, 2048) == 0xe65cf4f9);
+ ok1(hash_stable(u8array, ARRAY_WORDS, 4096) == 0xf2cd06cb);
+ ok1(hash_stable(u8array, ARRAY_WORDS, 8192) == 0x443041e1);
+ ok1(hash_stable(u8array, ARRAY_WORDS, 16384) == 0xdfc618f5);
+ ok1(hash_stable(u8array, ARRAY_WORDS, 32768) == 0x5e3d5b97);
+ ok1(hash_stable(u8array, ARRAY_WORDS, 65536) == 0xd5f64730);
+ ok1(hash_stable(u8array, ARRAY_WORDS, 131072) == 0x372bbecc);
+ ok1(hash_stable(u8array, ARRAY_WORDS, 262144) == 0x7c194c8d);
+ ok1(hash_stable(u8array, ARRAY_WORDS, 524288) == 0x16cbb416);
+ ok1(hash_stable(u8array, ARRAY_WORDS, 1048576) == 0x53e99222);
+ ok1(hash_stable(u8array, ARRAY_WORDS, 2097152) == 0x6394554a);
+ ok1(hash_stable(u8array, ARRAY_WORDS, 4194304) == 0xd83a506d);
+ ok1(hash_stable(u8array, ARRAY_WORDS, 8388608) == 0x7619d9a4);
+ ok1(hash_stable(u8array, ARRAY_WORDS, 16777216) == 0xfe98e5f6);
+ ok1(hash_stable(u8array, ARRAY_WORDS, 33554432) == 0x6c262927);
+ ok1(hash_stable(u8array, ARRAY_WORDS, 67108864) == 0x3f0106fd);
+ ok1(hash_stable(u8array, ARRAY_WORDS, 134217728) == 0xc91e3a28);
+ ok1(hash_stable(u8array, ARRAY_WORDS, 268435456) == 0x14229579);
+ ok1(hash_stable(u8array, ARRAY_WORDS, 536870912) == 0x9dbefa76);
+ ok1(hash_stable(u8array, ARRAY_WORDS, 1073741824) == 0xb05c0c78);
+ ok1(hash_stable(u8array, ARRAY_WORDS, 2147483648U) == 0x88f24d81);
+
+ ok1(hash_stable(u16array, ARRAY_WORDS, 0) == 0xecb5f507);
+ ok1(hash_stable(u16array, ARRAY_WORDS, 1) == 0xadd666e6);
+ ok1(hash_stable(u16array, ARRAY_WORDS, 2) == 0xea0f214c);
+ ok1(hash_stable(u16array, ARRAY_WORDS, 4) == 0xae4051ba);
+ ok1(hash_stable(u16array, ARRAY_WORDS, 8) == 0x6ed28026);
+ ok1(hash_stable(u16array, ARRAY_WORDS, 16) == 0xa3917a19);
+ ok1(hash_stable(u16array, ARRAY_WORDS, 32) == 0xf370f32b);
+ ok1(hash_stable(u16array, ARRAY_WORDS, 64) == 0x807af460);
+ ok1(hash_stable(u16array, ARRAY_WORDS, 128) == 0xb4c8cd83);
+ ok1(hash_stable(u16array, ARRAY_WORDS, 256) == 0xa10cb5b0);
+ ok1(hash_stable(u16array, ARRAY_WORDS, 512) == 0x8b7d7387);
+ ok1(hash_stable(u16array, ARRAY_WORDS, 1024) == 0x9e49d1c );
+ ok1(hash_stable(u16array, ARRAY_WORDS, 2048) == 0x288830d1);
+ ok1(hash_stable(u16array, ARRAY_WORDS, 4096) == 0xbe078a43);
+ ok1(hash_stable(u16array, ARRAY_WORDS, 8192) == 0xa16d5d88);
+ ok1(hash_stable(u16array, ARRAY_WORDS, 16384) == 0x46839fcd);
+ ok1(hash_stable(u16array, ARRAY_WORDS, 32768) == 0x9db9bd4f);
+ ok1(hash_stable(u16array, ARRAY_WORDS, 65536) == 0xedff58f8);
+ ok1(hash_stable(u16array, ARRAY_WORDS, 131072) == 0x95ecef18);
+ ok1(hash_stable(u16array, ARRAY_WORDS, 262144) == 0x23c31b7d);
+ ok1(hash_stable(u16array, ARRAY_WORDS, 524288) == 0x1d85c7d0);
+ ok1(hash_stable(u16array, ARRAY_WORDS, 1048576) == 0x25218842);
+ ok1(hash_stable(u16array, ARRAY_WORDS, 2097152) == 0x711d985c);
+ ok1(hash_stable(u16array, ARRAY_WORDS, 4194304) == 0x85470eca);
+ ok1(hash_stable(u16array, ARRAY_WORDS, 8388608) == 0x99ed4ceb);
+ ok1(hash_stable(u16array, ARRAY_WORDS, 16777216) == 0x67b3710c);
+ ok1(hash_stable(u16array, ARRAY_WORDS, 33554432) == 0x77f1ab35);
+ ok1(hash_stable(u16array, ARRAY_WORDS, 67108864) == 0x81f688aa);
+ ok1(hash_stable(u16array, ARRAY_WORDS, 134217728) == 0x27b56ca5);
+ ok1(hash_stable(u16array, ARRAY_WORDS, 268435456) == 0xf21ba203);
+ ok1(hash_stable(u16array, ARRAY_WORDS, 536870912) == 0xd48d1d1 );
+ ok1(hash_stable(u16array, ARRAY_WORDS, 1073741824) == 0xa542b62d);
+ ok1(hash_stable(u16array, ARRAY_WORDS, 2147483648U) == 0xa04c7058);
+
+ ok1(hash_stable(u32array, ARRAY_WORDS, 0) == 0x13305f8c);
+ ok1(hash_stable(u32array, ARRAY_WORDS, 1) == 0x171abf74);
+ ok1(hash_stable(u32array, ARRAY_WORDS, 2) == 0x7646fcc7);
+ ok1(hash_stable(u32array, ARRAY_WORDS, 4) == 0xa758ed5);
+ ok1(hash_stable(u32array, ARRAY_WORDS, 8) == 0x2dedc2e4);
+ ok1(hash_stable(u32array, ARRAY_WORDS, 16) == 0x28e2076b);
+ ok1(hash_stable(u32array, ARRAY_WORDS, 32) == 0xb73091c5);
+ ok1(hash_stable(u32array, ARRAY_WORDS, 64) == 0x87daf5db);
+ ok1(hash_stable(u32array, ARRAY_WORDS, 128) == 0xa16dfe20);
+ ok1(hash_stable(u32array, ARRAY_WORDS, 256) == 0x300c63c3);
+ ok1(hash_stable(u32array, ARRAY_WORDS, 512) == 0x255c91fc);
+ ok1(hash_stable(u32array, ARRAY_WORDS, 1024) == 0x6357b26);
+ ok1(hash_stable(u32array, ARRAY_WORDS, 2048) == 0x4bc5f339);
+ ok1(hash_stable(u32array, ARRAY_WORDS, 4096) == 0x1301617c);
+ ok1(hash_stable(u32array, ARRAY_WORDS, 8192) == 0x506792c9);
+ ok1(hash_stable(u32array, ARRAY_WORDS, 16384) == 0xcd596705);
+ ok1(hash_stable(u32array, ARRAY_WORDS, 32768) == 0xa8713cac);
+ ok1(hash_stable(u32array, ARRAY_WORDS, 65536) == 0x94d9794);
+ ok1(hash_stable(u32array, ARRAY_WORDS, 131072) == 0xac753e8);
+ ok1(hash_stable(u32array, ARRAY_WORDS, 262144) == 0xcd8bdd20);
+ ok1(hash_stable(u32array, ARRAY_WORDS, 524288) == 0xd44faf80);
+ ok1(hash_stable(u32array, ARRAY_WORDS, 1048576) == 0x2547ccbe);
+ ok1(hash_stable(u32array, ARRAY_WORDS, 2097152) == 0xbab06dbc);
+ ok1(hash_stable(u32array, ARRAY_WORDS, 4194304) == 0xaac0e882);
+ ok1(hash_stable(u32array, ARRAY_WORDS, 8388608) == 0x443f48d0);
+ ok1(hash_stable(u32array, ARRAY_WORDS, 16777216) == 0xdff49fcc);
+ ok1(hash_stable(u32array, ARRAY_WORDS, 33554432) == 0x9ce0fd65);
+ ok1(hash_stable(u32array, ARRAY_WORDS, 67108864) == 0x9ddb1def);
+ ok1(hash_stable(u32array, ARRAY_WORDS, 134217728) == 0x86096f25);
+ ok1(hash_stable(u32array, ARRAY_WORDS, 268435456) == 0xe713b7b5);
+ ok1(hash_stable(u32array, ARRAY_WORDS, 536870912) == 0x5baeffc5);
+ ok1(hash_stable(u32array, ARRAY_WORDS, 1073741824) == 0xde874f52);
+ ok1(hash_stable(u32array, ARRAY_WORDS, 2147483648U) == 0xeca13b4e);
+
+ ok1(hash_stable(u64array, ARRAY_WORDS, 0) == 0x12ef6302);
+ ok1(hash_stable(u64array, ARRAY_WORDS, 1) == 0xe9aeb406);
+ ok1(hash_stable(u64array, ARRAY_WORDS, 2) == 0xc4218ceb);
+ ok1(hash_stable(u64array, ARRAY_WORDS, 4) == 0xb3d11412);
+ ok1(hash_stable(u64array, ARRAY_WORDS, 8) == 0xdafbd654);
+ ok1(hash_stable(u64array, ARRAY_WORDS, 16) == 0x9c336cba);
+ ok1(hash_stable(u64array, ARRAY_WORDS, 32) == 0x65059721);
+ ok1(hash_stable(u64array, ARRAY_WORDS, 64) == 0x95b5bbe6);
+ ok1(hash_stable(u64array, ARRAY_WORDS, 128) == 0xe7596b84);
+ ok1(hash_stable(u64array, ARRAY_WORDS, 256) == 0x503622a2);
+ ok1(hash_stable(u64array, ARRAY_WORDS, 512) == 0xecdcc5ca);
+ ok1(hash_stable(u64array, ARRAY_WORDS, 1024) == 0xc40d0513);
+ ok1(hash_stable(u64array, ARRAY_WORDS, 2048) == 0xaab25e4d);
+ ok1(hash_stable(u64array, ARRAY_WORDS, 4096) == 0xcc353fb9);
+ ok1(hash_stable(u64array, ARRAY_WORDS, 8192) == 0x18e2319f);
+ ok1(hash_stable(u64array, ARRAY_WORDS, 16384) == 0xfddaae8d);
+ ok1(hash_stable(u64array, ARRAY_WORDS, 32768) == 0xef7976f2);
+ ok1(hash_stable(u64array, ARRAY_WORDS, 65536) == 0x86359fc9);
+ ok1(hash_stable(u64array, ARRAY_WORDS, 131072) == 0x8b5af385);
+ ok1(hash_stable(u64array, ARRAY_WORDS, 262144) == 0x80d4ee31);
+ ok1(hash_stable(u64array, ARRAY_WORDS, 524288) == 0x42f5f85b);
+ ok1(hash_stable(u64array, ARRAY_WORDS, 1048576) == 0x9a6920e1);
+ ok1(hash_stable(u64array, ARRAY_WORDS, 2097152) == 0x7b7c9850);
+ ok1(hash_stable(u64array, ARRAY_WORDS, 4194304) == 0x69573e09);
+ ok1(hash_stable(u64array, ARRAY_WORDS, 8388608) == 0xc942bc0e);
+ ok1(hash_stable(u64array, ARRAY_WORDS, 16777216) == 0x7a89f0f1);
+ ok1(hash_stable(u64array, ARRAY_WORDS, 33554432) == 0x2dd641ca);
+ ok1(hash_stable(u64array, ARRAY_WORDS, 67108864) == 0x89bbd391);
+ ok1(hash_stable(u64array, ARRAY_WORDS, 134217728) == 0xbcf88e31);
+ ok1(hash_stable(u64array, ARRAY_WORDS, 268435456) == 0xfa7a3460);
+ ok1(hash_stable(u64array, ARRAY_WORDS, 536870912) == 0x49a37be0);
+ ok1(hash_stable(u64array, ARRAY_WORDS, 1073741824) == 0x1b346394);
+ ok1(hash_stable(u64array, ARRAY_WORDS, 2147483648U) == 0x6c3a1592);
+
+ ok1(hash64_stable(u8array, ARRAY_WORDS, 0) == 16887282882572727244ULL);
+ ok1(hash64_stable(u8array, ARRAY_WORDS, 1) == 12032777473133454818ULL);
+ ok1(hash64_stable(u8array, ARRAY_WORDS, 2) == 18183407363221487738ULL);
+ ok1(hash64_stable(u8array, ARRAY_WORDS, 4) == 17860764172704150171ULL);
+ ok1(hash64_stable(u8array, ARRAY_WORDS, 8) == 18076051600675559233ULL);
+ ok1(hash64_stable(u8array, ARRAY_WORDS, 16) == 9909361918431556721ULL);
+ ok1(hash64_stable(u8array, ARRAY_WORDS, 32) == 12937969888744675813ULL);
+ ok1(hash64_stable(u8array, ARRAY_WORDS, 64) == 5245669057381736951ULL);
+ ok1(hash64_stable(u8array, ARRAY_WORDS, 128) == 4376874646406519665ULL);
+ ok1(hash64_stable(u8array, ARRAY_WORDS, 256) == 14219974419871569521ULL);
+ ok1(hash64_stable(u8array, ARRAY_WORDS, 512) == 2263415354134458951ULL);
+ ok1(hash64_stable(u8array, ARRAY_WORDS, 1024) == 4953859694526221685ULL);
+ ok1(hash64_stable(u8array, ARRAY_WORDS, 2048) == 3432228642067641593ULL);
+ ok1(hash64_stable(u8array, ARRAY_WORDS, 4096) == 1219647244417697483ULL);
+ ok1(hash64_stable(u8array, ARRAY_WORDS, 8192) == 7629939424585859553ULL);
+ ok1(hash64_stable(u8array, ARRAY_WORDS, 16384) == 10041660531376789749ULL);
+ ok1(hash64_stable(u8array, ARRAY_WORDS, 32768) == 13859885793922603927ULL);
+ ok1(hash64_stable(u8array, ARRAY_WORDS, 65536) == 15069060338344675120ULL);
+ ok1(hash64_stable(u8array, ARRAY_WORDS, 131072) == 818163430835601100ULL);
+ ok1(hash64_stable(u8array, ARRAY_WORDS, 262144) == 14914314323019517069ULL);
+ ok1(hash64_stable(u8array, ARRAY_WORDS, 524288) == 17518437749769352214ULL);
+ ok1(hash64_stable(u8array, ARRAY_WORDS, 1048576) == 14920048004901212706ULL);
+ ok1(hash64_stable(u8array, ARRAY_WORDS, 2097152) == 8758567366332536138ULL);
+ ok1(hash64_stable(u8array, ARRAY_WORDS, 4194304) == 6226655736088907885ULL);
+ ok1(hash64_stable(u8array, ARRAY_WORDS, 8388608) == 13716650013685832100ULL);
+ ok1(hash64_stable(u8array, ARRAY_WORDS, 16777216) == 305325651636315638ULL);
+ ok1(hash64_stable(u8array, ARRAY_WORDS, 33554432) == 16784147606583781671ULL);
+ ok1(hash64_stable(u8array, ARRAY_WORDS, 67108864) == 16509467555140798205ULL);
+ ok1(hash64_stable(u8array, ARRAY_WORDS, 134217728) == 8717281234694060584ULL);
+ ok1(hash64_stable(u8array, ARRAY_WORDS, 268435456) == 8098476701725660537ULL);
+ ok1(hash64_stable(u8array, ARRAY_WORDS, 536870912) == 16345871539461094006ULL);
+ ok1(hash64_stable(u8array, ARRAY_WORDS, 1073741824) == 3755557000429964408ULL);
+ ok1(hash64_stable(u8array, ARRAY_WORDS, 2147483648U) == 15017348801959710081ULL);
+
+ ok1(hash64_stable(u16array, ARRAY_WORDS, 0) == 1038028831307724039ULL);
+ ok1(hash64_stable(u16array, ARRAY_WORDS, 1) == 10155473272642627302ULL);
+ ok1(hash64_stable(u16array, ARRAY_WORDS, 2) == 5714751190106841420ULL);
+ ok1(hash64_stable(u16array, ARRAY_WORDS, 4) == 3923885607767527866ULL);
+ ok1(hash64_stable(u16array, ARRAY_WORDS, 8) == 3931017318293995558ULL);
+ ok1(hash64_stable(u16array, ARRAY_WORDS, 16) == 1469696588339313177ULL);
+ ok1(hash64_stable(u16array, ARRAY_WORDS, 32) == 11522218526952715051ULL);
+ ok1(hash64_stable(u16array, ARRAY_WORDS, 64) == 6953517591561958496ULL);
+ ok1(hash64_stable(u16array, ARRAY_WORDS, 128) == 7406689491740052867ULL);
+ ok1(hash64_stable(u16array, ARRAY_WORDS, 256) == 10101844489704093104ULL);
+ ok1(hash64_stable(u16array, ARRAY_WORDS, 512) == 12511348870707245959ULL);
+ ok1(hash64_stable(u16array, ARRAY_WORDS, 1024) == 1614019938016861468ULL);
+ ok1(hash64_stable(u16array, ARRAY_WORDS, 2048) == 5294796182374592721ULL);
+ ok1(hash64_stable(u16array, ARRAY_WORDS, 4096) == 16089570706643716675ULL);
+ ok1(hash64_stable(u16array, ARRAY_WORDS, 8192) == 1689302638424579464ULL);
+ ok1(hash64_stable(u16array, ARRAY_WORDS, 16384) == 1446340172370386893ULL);
+ ok1(hash64_stable(u16array, ARRAY_WORDS, 32768) == 16535503506744393039ULL);
+ ok1(hash64_stable(u16array, ARRAY_WORDS, 65536) == 3496794142527150328ULL);
+ ok1(hash64_stable(u16array, ARRAY_WORDS, 131072) == 6568245367474548504ULL);
+ ok1(hash64_stable(u16array, ARRAY_WORDS, 262144) == 9487676460765485949ULL);
+ ok1(hash64_stable(u16array, ARRAY_WORDS, 524288) == 4519762130966530000ULL);
+ ok1(hash64_stable(u16array, ARRAY_WORDS, 1048576) == 15623412069215340610ULL);
+ ok1(hash64_stable(u16array, ARRAY_WORDS, 2097152) == 544013388676438108ULL);
+ ok1(hash64_stable(u16array, ARRAY_WORDS, 4194304) == 5594904760290840266ULL);
+ ok1(hash64_stable(u16array, ARRAY_WORDS, 8388608) == 18098755780041592043ULL);
+ ok1(hash64_stable(u16array, ARRAY_WORDS, 16777216) == 6389168672387330316ULL);
+ ok1(hash64_stable(u16array, ARRAY_WORDS, 33554432) == 896986127732419381ULL);
+ ok1(hash64_stable(u16array, ARRAY_WORDS, 67108864) == 13232626471143901354ULL);
+ ok1(hash64_stable(u16array, ARRAY_WORDS, 134217728) == 53378562890493093ULL);
+ ok1(hash64_stable(u16array, ARRAY_WORDS, 268435456) == 10072361400297824771ULL);
+ ok1(hash64_stable(u16array, ARRAY_WORDS, 536870912) == 14511948118285144529ULL);
+ ok1(hash64_stable(u16array, ARRAY_WORDS, 1073741824) == 6981033484844447277ULL);
+ ok1(hash64_stable(u16array, ARRAY_WORDS, 2147483648U) == 5619339091684126808ULL);
+
+ ok1(hash64_stable(u32array, ARRAY_WORDS, 0) == 3037571077312110476ULL);
+ ok1(hash64_stable(u32array, ARRAY_WORDS, 1) == 14732398743825071988ULL);
+ ok1(hash64_stable(u32array, ARRAY_WORDS, 2) == 14949132158206672071ULL);
+ ok1(hash64_stable(u32array, ARRAY_WORDS, 4) == 1291370080511561429ULL);
+ ok1(hash64_stable(u32array, ARRAY_WORDS, 8) == 10792665964172133092ULL);
+ ok1(hash64_stable(u32array, ARRAY_WORDS, 16) == 14250138032054339435ULL);
+ ok1(hash64_stable(u32array, ARRAY_WORDS, 32) == 17136741522078732741ULL);
+ ok1(hash64_stable(u32array, ARRAY_WORDS, 64) == 3260193403318236635ULL);
+ ok1(hash64_stable(u32array, ARRAY_WORDS, 128) == 10526616652205653536ULL);
+ ok1(hash64_stable(u32array, ARRAY_WORDS, 256) == 9019690373358576579ULL);
+ ok1(hash64_stable(u32array, ARRAY_WORDS, 512) == 6997491436599677436ULL);
+ ok1(hash64_stable(u32array, ARRAY_WORDS, 1024) == 18302783371416533798ULL);
+ ok1(hash64_stable(u32array, ARRAY_WORDS, 2048) == 10149320644446516025ULL);
+ ok1(hash64_stable(u32array, ARRAY_WORDS, 4096) == 7073759949410623868ULL);
+ ok1(hash64_stable(u32array, ARRAY_WORDS, 8192) == 17442399482223760073ULL);
+ ok1(hash64_stable(u32array, ARRAY_WORDS, 16384) == 2983906194216281861ULL);
+ ok1(hash64_stable(u32array, ARRAY_WORDS, 32768) == 4975845419129060524ULL);
+ ok1(hash64_stable(u32array, ARRAY_WORDS, 65536) == 594019910205413268ULL);
+ ok1(hash64_stable(u32array, ARRAY_WORDS, 131072) == 11903010186073691112ULL);
+ ok1(hash64_stable(u32array, ARRAY_WORDS, 262144) == 7339636527154847008ULL);
+ ok1(hash64_stable(u32array, ARRAY_WORDS, 524288) == 15243305400579108736ULL);
+ ok1(hash64_stable(u32array, ARRAY_WORDS, 1048576) == 16737926245392043198ULL);
+ ok1(hash64_stable(u32array, ARRAY_WORDS, 2097152) == 15725083267699862972ULL);
+ ok1(hash64_stable(u32array, ARRAY_WORDS, 4194304) == 12527834265678833794ULL);
+ ok1(hash64_stable(u32array, ARRAY_WORDS, 8388608) == 13908436455987824848ULL);
+ ok1(hash64_stable(u32array, ARRAY_WORDS, 16777216) == 9672773345173872588ULL);
+ ok1(hash64_stable(u32array, ARRAY_WORDS, 33554432) == 2305314279896710501ULL);
+ ok1(hash64_stable(u32array, ARRAY_WORDS, 67108864) == 1866733780381408751ULL);
+ ok1(hash64_stable(u32array, ARRAY_WORDS, 134217728) == 11906263969465724709ULL);
+ ok1(hash64_stable(u32array, ARRAY_WORDS, 268435456) == 5501594918093830069ULL);
+ ok1(hash64_stable(u32array, ARRAY_WORDS, 536870912) == 15823785789276225477ULL);
+ ok1(hash64_stable(u32array, ARRAY_WORDS, 1073741824) == 17353000723889475410ULL);
+ ok1(hash64_stable(u32array, ARRAY_WORDS, 2147483648U) == 7494736910655503182ULL);
+
+ ok1(hash64_stable(u64array, ARRAY_WORDS, 0) == 9765419389786481410ULL);
+ ok1(hash64_stable(u64array, ARRAY_WORDS, 1) == 11182806172127114246ULL);
+ ok1(hash64_stable(u64array, ARRAY_WORDS, 2) == 2559155171395472619ULL);
+ ok1(hash64_stable(u64array, ARRAY_WORDS, 4) == 3311692033324815378ULL);
+ ok1(hash64_stable(u64array, ARRAY_WORDS, 8) == 1297175419505333844ULL);
+ ok1(hash64_stable(u64array, ARRAY_WORDS, 16) == 617896928653569210ULL);
+ ok1(hash64_stable(u64array, ARRAY_WORDS, 32) == 1517398559958603553ULL);
+ ok1(hash64_stable(u64array, ARRAY_WORDS, 64) == 4504821917445110758ULL);
+ ok1(hash64_stable(u64array, ARRAY_WORDS, 128) == 1971743331114904452ULL);
+ ok1(hash64_stable(u64array, ARRAY_WORDS, 256) == 6177667912354374306ULL);
+ ok1(hash64_stable(u64array, ARRAY_WORDS, 512) == 15570521289777792458ULL);
+ ok1(hash64_stable(u64array, ARRAY_WORDS, 1024) == 9204559632415917331ULL);
+ ok1(hash64_stable(u64array, ARRAY_WORDS, 2048) == 9008982669760028237ULL);
+ ok1(hash64_stable(u64array, ARRAY_WORDS, 4096) == 14803537660281700281ULL);
+ ok1(hash64_stable(u64array, ARRAY_WORDS, 8192) == 2873966517448487327ULL);
+ ok1(hash64_stable(u64array, ARRAY_WORDS, 16384) == 5859277625928363661ULL);
+ ok1(hash64_stable(u64array, ARRAY_WORDS, 32768) == 15520461285618185970ULL);
+ ok1(hash64_stable(u64array, ARRAY_WORDS, 65536) == 16746489793331175369ULL);
+ ok1(hash64_stable(u64array, ARRAY_WORDS, 131072) == 514952025484227461ULL);
+ ok1(hash64_stable(u64array, ARRAY_WORDS, 262144) == 10867212269810675249ULL);
+ ok1(hash64_stable(u64array, ARRAY_WORDS, 524288) == 9822204377278314587ULL);
+ ok1(hash64_stable(u64array, ARRAY_WORDS, 1048576) == 3295088921987850465ULL);
+ ok1(hash64_stable(u64array, ARRAY_WORDS, 2097152) == 7559197431498053712ULL);
+ ok1(hash64_stable(u64array, ARRAY_WORDS, 4194304) == 1667267269116771849ULL);
+ ok1(hash64_stable(u64array, ARRAY_WORDS, 8388608) == 2916804068951374862ULL);
+ ok1(hash64_stable(u64array, ARRAY_WORDS, 16777216) == 14422558383125688561ULL);
+ ok1(hash64_stable(u64array, ARRAY_WORDS, 33554432) == 10083112683694342602ULL);
+ ok1(hash64_stable(u64array, ARRAY_WORDS, 67108864) == 7222777647078298513ULL);
+ ok1(hash64_stable(u64array, ARRAY_WORDS, 134217728) == 18424513674048212529ULL);
+ ok1(hash64_stable(u64array, ARRAY_WORDS, 268435456) == 14913668581101810784ULL);
+ ok1(hash64_stable(u64array, ARRAY_WORDS, 536870912) == 14377721174297902048ULL);
+ ok1(hash64_stable(u64array, ARRAY_WORDS, 1073741824) == 6031715005667500948ULL);
+ ok1(hash64_stable(u64array, ARRAY_WORDS, 2147483648U) == 4827100319722378642ULL);
+
+ return exit_status();
+}
diff --git a/ccan/hash/test/run.c b/ccan/hash/test/run.c
new file mode 100644
index 0000000..dad8e86
--- /dev/null
+++ b/ccan/hash/test/run.c
@@ -0,0 +1,149 @@
+#include <ccan/hash/hash.h>
+#include <ccan/tap/tap.h>
+#include <ccan/hash/hash.c>
+#include <stdbool.h>
+#include <string.h>
+
+#define ARRAY_WORDS 5
+
+int main(int argc, char *argv[])
+{
+ unsigned int i, j, k;
+ uint32_t array[ARRAY_WORDS], val;
+ char array2[sizeof(array) + sizeof(uint32_t)];
+ uint32_t results[256];
+
+ /* Initialize array. */
+ for (i = 0; i < ARRAY_WORDS; i++)
+ array[i] = i;
+
+ plan_tests(39);
+ /* Hash should be the same, indep of memory alignment. */
+ val = hash(array, ARRAY_WORDS, 0);
+ for (i = 0; i < sizeof(uint32_t); i++) {
+ memcpy(array2 + i, array, sizeof(array));
+ ok(hash(array2 + i, ARRAY_WORDS, 0) != val,
+ "hash matched at offset %i", i);
+ }
+
+ /* Hash of random values should have random distribution:
+ * check one byte at a time. */
+ for (i = 0; i < sizeof(uint32_t); i++) {
+ unsigned int lowest = -1U, highest = 0;
+
+ memset(results, 0, sizeof(results));
+
+ for (j = 0; j < 256000; j++) {
+ for (k = 0; k < ARRAY_WORDS; k++)
+ array[k] = random();
+ results[(hash(array, ARRAY_WORDS, 0) >> i*8)&0xFF]++;
+ }
+
+ for (j = 0; j < 256; j++) {
+ if (results[j] < lowest)
+ lowest = results[j];
+ if (results[j] > highest)
+ highest = results[j];
+ }
+ /* Expect within 20% */
+ ok(lowest > 800, "Byte %i lowest %i", i, lowest);
+ ok(highest < 1200, "Byte %i highest %i", i, highest);
+ diag("Byte %i, range %u-%u", i, lowest, highest);
+ }
+
+ /* Hash of random values should have random distribution:
+ * check one byte at a time. */
+ for (i = 0; i < sizeof(uint64_t); i++) {
+ unsigned int lowest = -1U, highest = 0;
+
+ memset(results, 0, sizeof(results));
+
+ for (j = 0; j < 256000; j++) {
+ for (k = 0; k < ARRAY_WORDS; k++)
+ array[k] = random();
+ results[(hash64(array, sizeof(array)/sizeof(uint64_t),
+ 0) >> i*8)&0xFF]++;
+ }
+
+ for (j = 0; j < 256; j++) {
+ if (results[j] < lowest)
+ lowest = results[j];
+ if (results[j] > highest)
+ highest = results[j];
+ }
+ /* Expect within 20% */
+ ok(lowest > 800, "Byte %i lowest %i", i, lowest);
+ ok(highest < 1200, "Byte %i highest %i", i, highest);
+ diag("Byte %i, range %u-%u", i, lowest, highest);
+ }
+
+ /* Hash of pointer values should also have random distribution. */
+ for (i = 0; i < sizeof(uint32_t); i++) {
+ unsigned int lowest = -1U, highest = 0;
+ char *p = malloc(256000);
+
+ memset(results, 0, sizeof(results));
+
+ for (j = 0; j < 256000; j++)
+ results[(hash_pointer(p + j, 0) >> i*8)&0xFF]++;
+ free(p);
+
+ for (j = 0; j < 256; j++) {
+ if (results[j] < lowest)
+ lowest = results[j];
+ if (results[j] > highest)
+ highest = results[j];
+ }
+ /* Expect within 20% */
+ ok(lowest > 800, "hash_pointer byte %i lowest %i", i, lowest);
+ ok(highest < 1200, "hash_pointer byte %i highest %i",
+ i, highest);
+ diag("hash_pointer byte %i, range %u-%u", i, lowest, highest);
+ }
+
+ if (sizeof(long) == sizeof(uint32_t))
+ ok1(hashl(array, ARRAY_WORDS, 0)
+ == hash(array, ARRAY_WORDS, 0));
+ else
+ ok1(hashl(array, ARRAY_WORDS, 0)
+ == hash64(array, ARRAY_WORDS, 0));
+
+ /* String hash: weak, so only test bottom byte */
+ for (i = 0; i < 1; i++) {
+ unsigned int num = 0, cursor, lowest = -1U, highest = 0;
+ char p[5];
+
+ memset(results, 0, sizeof(results));
+
+ memset(p, 'A', sizeof(p));
+ p[sizeof(p)-1] = '\0';
+
+ for (;;) {
+ for (cursor = 0; cursor < sizeof(p)-1; cursor++) {
+ p[cursor]++;
+ if (p[cursor] <= 'z')
+ break;
+ p[cursor] = 'A';
+ }
+ if (cursor == sizeof(p)-1)
+ break;
+
+ results[(hash_string(p) >> i*8)&0xFF]++;
+ num++;
+ }
+
+ for (j = 0; j < 256; j++) {
+ if (results[j] < lowest)
+ lowest = results[j];
+ if (results[j] > highest)
+ highest = results[j];
+ }
+ /* Expect within 20% */
+ ok(lowest > 35000, "hash_pointer byte %i lowest %i", i, lowest);
+ ok(highest < 53000, "hash_pointer byte %i highest %i",
+ i, highest);
+ diag("hash_pointer byte %i, range %u-%u", i, lowest, highest);
+ }
+
+ return exit_status();
+}
diff --git a/config.h b/config.h
new file mode 100644
index 0000000..0109c4a
--- /dev/null
+++ b/config.h
@@ -0,0 +1,16 @@
+
+/* Includes and defines for ccan files */
+
+#if !defined(LITTLE_ENDIAN) && !defined(BIG_ENDIAN)
+ #include <endian.h>
+#endif
+#ifdef LITTLE_ENDIAN
+ #define HAVE_LITTLE_ENDIAN 1
+ #define HAVE_BIG_ENDIAN 0
+#elif defined(BIG_ENDIAN)
+ #define HAVE_LITTLE_ENDIAN 0
+ #define HAVE_BIG_ENDIAN 1
+#else
+ #error Unknown endian
+#endif
+
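
Both HAVE_*_ENDIAN macros are always defined here (one of them as 0), so consumers test them with #if rather than #ifdef; a minimal sketch of such a consumer, with MY_LITTLE_ENDIAN as an illustrative name:

    #include "config.h"

    #if HAVE_LITTLE_ENDIAN
    #define MY_LITTLE_ENDIAN 1
    #else
    #define MY_LITTLE_ENDIAN 0
    #endif
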
diff --git a/debian/README.Debian b/debian/README.Debian
new file mode 100644
index 0000000..52803d5
--- /dev/null
+++ b/debian/README.Debian
@@ -0,0 +1,9 @@
+wiggle for Debian
+-----------------
+
+The package also provides a script developed by Neil Brown that
+is useful for manipulating patches. See files in:
+
+ /usr/share/doc/wiggle/contrib
+
+ -- Jari Aalto <jari.aalto@cante.net>, Thu, 4 Mar 2010 01:15:20 +0200
diff --git a/debian/changelog b/debian/changelog
new file mode 100644
index 0000000..e54f0d9
--- /dev/null
+++ b/debian/changelog
@@ -0,0 +1,169 @@
+wiggle (0.9.1-1) unstable; urgency=low
+
+ * New upstream release
+ - Non-DSFG file DOC/diff.ps no longer included. See bug #698846.
+ * debian/install
+ - Rename from debian/wiggle.install.
+ * debian/patches:
+ - (10): Rm. Fix for a type in ReadMe.c accepted upstream.
+ - (20): New. Correct hyphen in manual page.
+ * debian/watch
+ - Add URL to check new releases. Thanks to Bart Martens
+ <bartm@debian.org>.
+
+ -- Jari Aalto <jari.aalto@cante.net> Thu, 23 May 2013 20:30:58 +0300
+
+wiggle (0.8+dfsg1-2) unstable; urgency=low
+
+ * debian/control
+ - (Homepage): Update broken URL to Freecode (Closes: #699715).
+ * debian/copyright
+ - Update upstream URL locations. See above bug.
+
+ -- Jari Aalto <jari.aalto@cante.net> Thu, 28 Feb 2013 10:59:53 +0200
+
+wiggle (0.8+dfsg1-1) unstable; urgency=low
+
+ * Repackage upstream tarball: remove .git and non-free *.ps
+ documentation about used algorithm (Closes: #698846). Upstream
+ has removed the file in the next release.
+ * debian/control
+ - (Standards-Version): Update to 3.9.4.
+ * debian/copyright
+ - Update year.
+ * debian/docs
+ - Add missing newline to the last line.
+ * debian/wiggle.docs
+ - Delete file due to #698846. There is already debian/docs.
+
+ -- Jari Aalto <jari.aalto@cante.net> Wed, 20 Feb 2013 12:12:10 +0200
+
+wiggle (0.8-2) unstable; urgency=low
+
+ * debian/control
+ - (Build-Depends): Update to debhelper 9. Add libncurses5-dev.
+ - (Description): Adjust first line.
+ - (Standards-Version): Update to 3.9.3.1.
+ * debian/copyright
+ - Update to format 1.0.
+ * debian/rules
+ - (override_dh_*_clean): New. Save and restore original files.
+ - Use hardened build flags
+ http://wiki.debian.org/ReleaseGoals/SecurityHardeningBuildFlags
+
+ -- Jari Aalto <jari.aalto@cante.net> Sat, 24 Mar 2012 06:07:32 -0400
+
+wiggle (0.8-1) unstable; urgency=low
+
+ * New upstream release
+ - Manual page hyphens (Closes: #574576).
+ - Gcc 4.4 pointer initialization with NULL (Closes: #574577)
+ - Strange bogus conflict (Closes: #271766)
+ * debian/control
+ - (Build-Depends): Add libncurses5-dev.
+ * debian/patches
+ - Remove those integrated to upstream.
+ - (10): Fix spelling.
+ - (20): Fix include.
+ * debian/watch
+ - Add URL.
+ * debian/rules
+ - Simplify even further.
+ - (binary-arch): delete.
+ - (override_dh_installchangelogs): add.
+
+ -- Jari Aalto <jari.aalto@cante.net> Tue, 11 May 2010 09:43:43 +0300
+
+wiggle (0.6.20100304+git1c5bfa7-1) unstable; urgency=low
+
+ * New upstream release.
+ * debian/control
+ - (Vcs-*): new fields.
+ - (Section): Change from utils to vcs (used by the FTP masters).
+ * debian/patches
+ - (02 p.help, 30 dotest): remove. Integrated to upstream.
+ - (05 Makefile): Adjust for new upstream.
+
+ -- Jari Aalto <jari.aalto@cante.net> Fri, 05 Mar 2010 09:36:48 +0200
+
+wiggle (0.6-7) unstable; urgency=low
+
+ * New maintainer (Closes: #568575).
+ - Move to packaging format "3.0 (quilt)".
+ * debian/control
+ - (Build-Depends): update to debhelper 7.1.
+ - (Depends): add ${misc:Depends}.
+ - (Standards-Version): Update to 3.8.4.
+ * debian/copyright
+ - Update layout.
+ - Update download URL (Closes: #515124).
+ * debian/dirs
+ - Deleted. Not needed.
+ * debian/patches
+ - Convert inline changes to patches.
+ * debian/README.Debian
+ - Correct 'perl script' to just script (actually Bourne shell).
+ - Remove indication to patches, not relevant for the end-user.
+ * debian/rules
+ - Update to dh(1).
+ - (CFLAGS): Change -Werror to -Wall -pedantic
+ (gcc-4.5 FTBFS; Closes: #565074).
+ * debian/source/format
+ - New file.
+ * debian/wiggle.docs
+ - New file.
+ * debian/wiggle.install
+ - New file.
+
+ -- Jari Aalto <jari.aalto@cante.net> Thu, 04 Mar 2010 18:08:16 +0200
+
+wiggle (0.6-6) unstable; urgency=low
+
+ * Take sole ownership of the package, Nicolas is unresponsive
+ * Add Homepage field to control header
+ * Change to dh compat level 5
+ * Do not ignore errors in make clean
+ * Update Standards-Version to 3.7.3, no changes needed
+ * Update the FSF address in the debian/copyright file
+ * Switch to use the git upstream
+
+ -- Baruch Even <baruch@debian.org> Sat, 29 Dec 2007 21:22:49 +0200
+
+wiggle (0.6-5) unstable; urgency=low
+
+ * Adopt this package (closes: #233661)
+ * Modify build to use dpatch
+ * New Standard: 3.6.1.1
+ - No need to change
+ * Add upstream fixes 004ExtractFix 005Pchanges (006NoDebug is already
+ applied)
+ * Add the notes files to the documentation
+
+ -- Nicolas Thomas <nthomas@free.fr> Sat, 4 Dec 2004 22:42:17 +0100
+
+wiggle (0.6-4) unstable; urgency=low
+
+ * Orphan this package
+
+ -- Igor Genibel <igenibel@debian.org> Thu, 19 Feb 2004 11:07:37 +0100
+
+wiggle (0.6-3) unstable; urgency=low
+
+ * Fix upstream bug (Thanks Jan Hudec <bulb@ucw.cz>)
+ - upstream provided a patch
+ * New Standard: 3.6.0
+ - No need to change
+
+ -- Igor Genibel <igenibel@debian.org> Sat, 12 Jul 2003 10:39:15 +0200
+
+wiggle (0.6-2) unstable; urgency=low
+
+ * Add new build dependency to groff (Closes: #197994)
+
+ -- Igor Genibel <igenibel@debian.org> Thu, 19 Jun 2003 09:31:14 +0200
+
+wiggle (0.6-1) unstable; urgency=low
+
+ * Initial Release. (Closes: #196862)
+
+ -- Igor Genibel <igenibel@debian.org> Tue, 10 Jun 2003 12:24:49 +0200
diff --git a/debian/compat b/debian/compat
new file mode 100644
index 0000000..ec63514
--- /dev/null
+++ b/debian/compat
@@ -0,0 +1 @@
+9
diff --git a/debian/control b/debian/control
new file mode 100644
index 0000000..4062e19
--- /dev/null
+++ b/debian/control
@@ -0,0 +1,21 @@
+Source: wiggle
+Section: vcs
+Priority: optional
+Maintainer: Jari Aalto <jari.aalto@cante.net>
+Build-Depends: debhelper (>= 9), time, groff, libncurses5-dev
+Standards-Version: 3.9.4
+Vcs-Browser: http://git.debian.org/?p=collab-maint/wiggle.git
+Vcs-Git: git://git.debian.org/git/collab-maint/wiggle.git
+Homepage: http://freecode.com/projects/wiggle
+
+Package: wiggle
+Architecture: any
+Depends: ${misc:Depends}, ${shlibs:Depends}
+Description: apply patches with conflicting changes
+ Wiggle is a program for applying patches that 'patch' cannot
+ apply due to conflicting changes in the original.
+ .
+ Wiggle will always apply all changes in the patch to the original.
+ If it cannot find a way to cleanly apply a patch, it inserts it
+ in the original in a manner similar to 'merge', and report an
+ unresolvable conflict.
diff --git a/debian/copyright b/debian/copyright
new file mode 100644
index 0000000..46fadc5
--- /dev/null
+++ b/debian/copyright
@@ -0,0 +1,38 @@
+Format: http://www.debian.org/doc/packaging-manuals/copyright-format/1.0
+Upstream-Name: wiggle
+Upstream-Contact: Neil Brown <neilb@suse.de>
+Source: http://neil.brown.name/wiggle
+X-Source: http://freecode.com/projects/wiggle
+X-Upstream-Vcs: http://git.neil.brown.name/git/wiggle
+X-Upstream-Comment:
+ Upstream homepage is at http://neil.brown.name
+
+Files: *
+Copyright:
+ 2010-2012 Jari Aalto <jari.aalto@cante.net>
+ 2007 Baruch Even <baruch@debian.org>
+ 2004 Nicolas Thomas <nthomas@free.fr>
+ 2003 Igor Genibel <igenibel@debian.org>
+License: GPL-2+
+
+Files: debian/*
+Copyright:
+ 2012-2013 Jari Aalto <jari.aalto@cante.net>
+License: GPL-2+
+
+License: GPL-2+
+ This package is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+ .
+ This package is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+ .
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ .
+ On Debian systems, the complete text of the GNU General
+ Public License can be found in "/usr/share/common-licenses/GPL-2".
diff --git a/debian/docs b/debian/docs
new file mode 100644
index 0000000..421cc0e
--- /dev/null
+++ b/debian/docs
@@ -0,0 +1,3 @@
+TODO
+ANNOUNCE
+notes
diff --git a/debian/install b/debian/install
new file mode 100644
index 0000000..ea26398
--- /dev/null
+++ b/debian/install
@@ -0,0 +1,2 @@
+p usr/share/doc/wiggle/contrib
+p.help usr/share/doc/wiggle/contrib
diff --git a/debian/patches/20-manpage.patch b/debian/patches/20-manpage.patch
new file mode 100644
index 0000000..2464eb0
--- /dev/null
+++ b/debian/patches/20-manpage.patch
@@ -0,0 +1,78 @@
+From: Jari Aalto <jari.aalto@cante.net>
+Subject: Corrections manual page (hyphens).
+
+---
+ wiggle.1 | 18 +++++++++---------
+ 1 file changed, 9 insertions(+), 9 deletions(-)
+
+--- a/wiggle.1
++++ b/wiggle.1
+@@ -136,16 +136,16 @@
+ be wiggled in to place.
+
+ .TP
+-.BR -w ", " \-\-words
++.BR \-w ", " \-\-words
+ Request that all operations and display be word based. This is the
+ default for the "diff" function.
+
+ .TP
+-.BR -l ", " \-\-lines
++.BR \-l ", " \-\-lines
+ Request that all operations and display be line based.
+
+ .TP
+-.BR -p ", " \-\-patch
++.BR \-p ", " \-\-patch
+ Treat the last named file as a patch instead of a file (with \-\-diff)
+ or a merge (\-\-extract).
+ In
+@@ -153,7 +153,7 @@
+ or
+ .B browse
+ mode,
+-.B -p
++.B \-p
+ requires there be exactly one file which is a patch and which can
+ contain patches to multiple file. The patches are merged into each
+ file. When used in
+@@ -163,7 +163,7 @@
+ option as writing lots of merged files to standard-out is impractical.
+
+ When processing a multi-file patch,
+-B -p
++B \-p
+ can be followed by a numeric argument indicating how many file name
+ components should be stripped from files named in the patch file. If no
+ numeric argument is given,
+@@ -171,12 +171,12 @@
+ will deduce an appropriate number based what files are visible.
+
+ .TP
+-.BR -r ", " \-\-replace
++.BR \-r ", " \-\-replace
+ Normally the merged output is written to standard-output. With
+ \-\-replace, the original file is replaced with the merge output.
+
+ .TP
+-.BR -R ", " \-\-reverse
++.BR \-R ", " \-\-reverse
+ When used with the "diff" function, swap the files before calculating
+ the differences.
+ When used with the "merge" function,
+@@ -184,13 +184,13 @@
+ attempts to revert changes rather than apply them.
+
+ .TP
+-.BR -i ", " \-\-no\-ignore
++.BR \-i ", " \-\-no\-ignore
+ Normally wiggle will ignore changes in the patch which appear to
+ already have been applied in the original. With this flag those
+ changes are reported as conflicts rather than being ignored.
+
+ .TP
+-.BR -W ", " \-\-show\-wiggle
++.BR \-W ", " \-\-show\-wiggle
+ When used with
+ .IR \-\-merge ,
+ conflicts that can be wiggled into place are reported as conflicts
diff --git a/debian/patches/series b/debian/patches/series
new file mode 100644
index 0000000..42835a2
--- /dev/null
+++ b/debian/patches/series
@@ -0,0 +1 @@
+20-manpage.patch
diff --git a/debian/rules b/debian/rules
new file mode 100755
index 0000000..72da94e
--- /dev/null
+++ b/debian/rules
@@ -0,0 +1,13 @@
+#!/usr/bin/make -f
+
+export DEB_BUILD_MAINT_OPTIONS = hardening=+all
+export DEB_CFLAGS_MAINT_APPEND = -Wall -pedantic -I.
+export DEB_LDFLAGS_MAINT_APPEND = -Wl,--as-needed
+
+override_dh_auto_build:
+ $(MAKE) CFLAGS="$(CFLAGS) $(CPPFLAGS) $(LDFLAGS)"
+
+%:
+ dh $@
+
+# End of file
diff --git a/debian/source/format b/debian/source/format
new file mode 100644
index 0000000..163aaf8
--- /dev/null
+++ b/debian/source/format
@@ -0,0 +1 @@
+3.0 (quilt)
diff --git a/debian/watch b/debian/watch
new file mode 100644
index 0000000..8ed6c68
--- /dev/null
+++ b/debian/watch
@@ -0,0 +1,2 @@
+version=3
+http://neil.brown.name/wiggle/wiggle-(\d.*)\.(?:tgz|tbz2|txz|tar\.(?:gz|bz2|xz))
diff --git a/demo.orig/Makefile b/demo.orig/Makefile
new file mode 100644
index 0000000..831723c
--- /dev/null
+++ b/demo.orig/Makefile
@@ -0,0 +1,53 @@
+
+# Note on my Mobile Pentium II, -march=pentium2 delivers twice the performance of i386
+#OptDbg=-O3
+#OptDbg=-O3 -march=pentium2
+OptDbg=-ggdb
+CFLAGS=$(OptDbg) -Wall -Werror
+
+# STRIP = -s
+INSTALL = /usr/bin/install
+DESTDIR =
+BINDIR = /usr/bin
+MANDIR = /usr/share/man
+MAN1DIR = $(MANDIR)/man1
+MAN5DIR = $(MANDIR)/man5
+LDLIBS=-lncurses
+
+all: wiggle wiggle.man test
+
+vpatch : vpatch.o extract.o split.o diff.o
+
+
+wiggle : wiggle.o load.o split.o extract.o diff.o bestmatch.o ReadMe.o merge.o
+wiggle.o load.o split.o extract.o diff.o bestmatch.o ReadMe.o merge.o : wiggle.h
+
+test: wiggle dotest
+ sh dotest
+
+wiggle.man : wiggle.1
+ nroff -man wiggle.1 > wiggle.man
+
+clean:
+ rm -f *.o *.man wiggle .version* version
+ find . -name core -o -name '*.tmp*' -o -name .tmp | xargs rm -f
+
+install : wiggle wiggle.1
+ $(INSTALL) -D $(STRIP) -m 755 wiggle $(DESTDIR)$(BINDIR)/wiggle
+ $(INSTALL) -D -m 644 wiggle.1 $(DESTDIR)$(MAN1DIR)/wiggle.1
+
+version : ReadMe.c wiggle.1
+ @rm -f version
+ @sed -n -e 's/.*wiggle - v\([0-9.]*\) - .*/\1/p' ReadMe.c > .version-readme
+ @sed -n -e 's/.*WIGGLE 1 "" v\([0-9.]*\)$$/\1/p' wiggle.1 > .version-man
+	@cmp -s .version-readme .version-man && cat .version-man > version || { echo Inconsistent versions.; exit 1;}
+
+dist : test clean version
+ mkdir -p DIST
+ rm -f DIST/wiggle-`cat version`
+ ln -s .. DIST/wiggle-`cat version`
+ tar czvf DIST/wiggle-`cat version`.tar.gz -h -C DIST --exclude RCS --exclude DIST wiggle-`cat version`
+ rm -f DIST/wiggle-`cat version`
+
+v : version
+ cat version
diff --git a/demo.orig/README b/demo.orig/README
new file mode 100644
index 0000000..64eed86
--- /dev/null
+++ b/demo.orig/README
@@ -0,0 +1,60 @@
+
+This is a demo file for wiggle's --browse mode.
+
+Browse mode is intended to let you look through a patch
+to see how it will apply to a set of files. It is
+possible that the patch will have some conflicts.
+i.e. the patch was created from a different version of
+the source to the version that you are applying the patch
+to. This is what makes it interesting.
+
+You can use normal cursor motion to scroll around,
+both vertically and horizontally.
+From the initial file-list patch, use space
+or <return> to open/close a directory or file.
+From the file-view mode, use 'q' to get back
+to the file list.
+
+Differences applied by the patch are shown as
+RED for removal and BLUE for addition.
+Text with a pink background was not matched -
+maybe it has been changed since the patch was
+created.
+
+green-background text is text that the patch wants
+to change, but the exact correct change has already
+been made.
+
+Capital-N might go to the next interesting chunk
+of the file.
+
+
+---------------------------------------
+This demonstrates where a diff on one line
+applies to text that is now split
+
+The swift brown fox jumps over the lazy dog.
+
+------------
+
+This demonstrates a diff which contains some
+extraneous lines.
+(this is an extra line)
+
+The swift brown she
+fox jumps over the lazy dog.
+
+There will be extra lines in the diff
+(as is this)
+
+--------------
+
+Here are some lines
+without the least sign of
+a clear conflict
+
+--------------
+
+Two different lines,
+both with changes
+
diff --git a/demo.orig/vpatch.c b/demo.orig/vpatch.c
new file mode 100644
index 0000000..d3bd584
--- /dev/null
+++ b/demo.orig/vpatch.c
@@ -0,0 +1,666 @@
+
+/*
+ * vpatch - visual front end for wiggle
+ *
+ * "files" display, lists all files with statistics
+ * - can hide various lines including subdirectories
+ * and files without wiggles or conflicts
+ * "diff" display shows merged file with different parts
+ * in different colours
+ * - untouched are pale A_DIM
+ * - matched/remaining are regular A_NORMAL
+ * - matched/removed are red/underlined A_UNDERLINE
+ * - unmatched in file are A_STANDOUT
+ * - unmatched in patch are A_STANDOUT|A_UNDERLINE ???
+ * - inserted are inverse/green ?? A_REVERSE
+ *
+ * The window can be split horiz or vert and two different
+ * views displayed. They will have different parts missing
+ *
+ * So a display of NORMAL, underline, standout|underline reverse
+ * should show a normal patch.
+ *
+ */
+
+#include "wiggle.h"
+#include <malloc.h>
+#include <string.h>
+#include <curses.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <signal.h>
+
+#define assert(x) do { if (!(x)) abort(); } while (0)
+
+struct plist {
+ char *file;
+ unsigned int start, end;
+ int parent;
+ int next, prev, last;
+ int open;
+ int chunks, wiggles, conflicts;
+};
+
+struct plist *patch_add_file(struct plist *pl, int *np, char *file,
+ unsigned int start, unsigned int end)
+{
+ /* size of pl is 0, 16, n^2 */
+ int n = *np;
+ int asize;
+
+/* printf("adding %s at %d: %u %u\n", file, n, start, end); */
+ if (n==0) asize = 0;
+ else if (n<=16) asize = 16;
+ else if ((n&(n-1))==0) asize = n;
+ else asize = n+1; /* not accurate, but not too large */
+ if (asize <= n) {
+ /* need to extend array */
+ struct plist *npl;
+ if (asize < 16) asize = 16;
+ else asize += asize;
+ npl = realloc(pl, asize * sizeof(struct plist));
+ if (!npl) {
+ fprintf(stderr, "malloc failed - skipping %s\n", file);
+ return pl;
+ }
+ pl = npl;
+ }
+ pl[n].file = file;
+ pl[n].start = start;
+ pl[n].end = end;
+ pl[n].last = pl[n].next = pl[n].prev = pl[n].parent = -1;
+ pl[n].chunks = pl[n].wiggles = pl[n].conflicts = 0;
+ pl[n].open = 1;
+ *np = n+1;
+ return pl;
+}
+
+
+
+struct plist *parse_patch(FILE *f, FILE *of, int *np)
+{
+ /* read a multi-file patch from 'f' and record relevant
+ * details in a plist.
+ * if 'of' >= 0, fd might not be seekable so we write
+ * to 'of' and use lseek on 'of' to determine position
+ */
+ struct plist *plist = NULL;
+
+ while (!feof(f)) {
+ /* first, find the start of a patch: "\n+++ "
+ * grab the file name and scan to the end of a line
+ */
+ char *target="\n+++ ";
+ char *target2="\n--- ";
+ char *pos = target;
+ int c;
+ char name[1024];
+ unsigned start, end;
+
+ while (*pos && (c=fgetc(f)) != EOF ) {
+ if (of) fputc(c, of);
+ if (c == *pos)
+ pos++;
+ else pos = target;
+ }
+ if (c == EOF)
+ break;
+ assert(c == ' ');
+ /* now read a file name */
+ pos = name;
+ while ((c=fgetc(f)) != EOF && c != '\t' && c != '\n' && c != ' ' &&
+ pos - name < 1023) {
+ *pos++ = c;
+ if (of) fputc(c, of);
+ }
+ *pos = 0;
+ if (c == EOF)
+ break;
+ if (of) fputc(c, of);
+ while (c != '\n' && (c=fgetc(f)) != EOF) {
+ if (of) fputc(c, of);
+ }
+ start = of ? ftell(of) : ftell(f);
+
+ if (c == EOF) break;
+
+ /* now skip to end - "\n--- " */
+ pos = target2+1;
+
+ while (*pos && (c=fgetc(f)) != EOF) {
+ if (of) fputc(c, of);
+ if (c == *pos)
+ pos++;
+ else pos = target2;
+ }
+ if (pos > target2) {
+ end = of ? ftell(of) : ftell(f);
+ end -= (pos - target2) - 1;
+ plist = patch_add_file(plist, np,
+ strdup(name), start, end);
+ }
+ }
+ return plist;
+}
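
parse_patch locates each per-file section with a single streaming pass: it walks a cursor along a literal anchor string ("\n+++ " for the start of a file's hunks, then "\n--- " for the end), advancing on a match and resetting on a mismatch. A minimal self-contained sketch of that scanning idiom (hypothetical code, not wiggle's API):

	#include <stdio.h>

	/* Consume stream f up to and including the first occurrence of 'anchor',
	 * using the same simple reset-on-mismatch cursor as parse_patch above.
	 * Returns 1 if the anchor was found before EOF. */
	static int scan_to_anchor(FILE *f, const char *anchor)
	{
		const char *pos = anchor;
		int c;

		while (*pos && (c = fgetc(f)) != EOF) {
			if (c == *pos)
				pos++;		/* matched the next anchor character */
			else
				pos = anchor;	/* mismatch: start matching over again */
		}
		return *pos == '\0';
	}

	int main(void)
	{
		printf(scan_to_anchor(stdin, "\n+++ ")
		       ? "found a +++ header\n" : "no +++ header\n");
		return 0;
	}
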
+void die()
+{
+ fprintf(stderr,"vpatch: fatal error\n");
+ abort();
+ exit(3);
+}
+
+
+static struct stream load_segment(FILE *f,
+ unsigned int start, unsigned int end)
+{
+ struct stream s;
+ s.len = end - start;
+ s.body = malloc(s.len);
+ if (s.body) {
+ fseek(f, start, 0);
+ if (fread(s.body, 1, s.len, f) != s.len) {
+ free(s.body);
+ s.body = NULL;
+ }
+ } else
+ die();
+ return s;
+}
+
+
+void catch(int sig)
+{
+ if (sig == SIGINT) {
+ signal(sig, catch);
+ return;
+ }
+ nocbreak();nl();endwin();
+ printf("Died on signal %d\n", sig);
+ exit(2);
+}
+
+int pl_cmp(const void *av, const void *bv)
+{
+ const struct plist *a = av;
+ const struct plist *b = bv;
+ return strcmp(a->file, b->file);
+}
+
+int common_depth(char *a, char *b)
+{
+ /* find number of patch segments that these two have
+ * in common
+ */
+ int depth = 0;
+ while(1) {
+ char *c;
+ int al, bl;
+ c = strchr(a, '/');
+ if (c) al = c-a; else al = strlen(a);
+ c = strchr(b, '/');
+ if (c) bl = c-b; else bl = strlen(b);
+ if (al == 0 || al != bl || strncmp(a,b,al) != 0)
+ return depth;
+ a+= al;
+ while (*a=='/') a++;
+ b+= bl;
+ while(*b=='/') b++;
+
+ depth++;
+ }
+}
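
common_depth is easiest to read with a concrete case: it returns how many leading path components two names share, which sort_patches below uses to attach each entry to its parent directory. A hypothetical check (it only links if the function above is in scope):

	#include <stdio.h>

	int common_depth(char *a, char *b);	/* the function defined above */

	int test_common_depth(void)
	{
		char p1[] = "drivers/md/md.c";
		char p2[] = "drivers/md/raid1.c";
		char p3[] = "include/linux/raid/md_k.h";

		printf("%d\n", common_depth(p1, p2));	/* shares "drivers", "md": prints 2 */
		printf("%d\n", common_depth(p3, p1));	/* first components differ: prints 0 */
		return 0;
	}
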
+
+struct plist *add_dir(struct plist *pl, int *np, char *file, char *curr)
+{
+ /* any parent of file that is not a parent of curr
+ * needs to be added to pl
+ */
+ int d = common_depth(file, curr);
+ char *buf = curr;
+ while (d) {
+ char *c = strchr(file, '/');
+ int l;
+ if (c) l = c-file; else l = strlen(file);
+ file += l;
+ curr += l;
+ while (*file == '/') file++;
+ while (*curr == '/') curr++;
+ d--;
+ }
+ while (*file) {
+ if (curr > buf && curr[-1] != '/')
+ *curr++ = '/';
+ while (*file && *file != '/')
+ *curr++ = *file++;
+ while (*file == '/') *file++;
+ *curr = '\0';
+ if (*file)
+ pl = patch_add_file(pl, np, strdup(buf),
+ 0, 0);
+ }
+ return pl;
+}
+
+struct plist *sort_patches(struct plist *pl, int *np)
+{
+ /* sort the patches, add directory names, and re-sort */
+ char curr[1024];
+ char *prev;
+ int parents[100];
+ int prevnode[100];
+ int i, n;
+ qsort(pl, *np, sizeof(struct plist), pl_cmp);
+ curr[0] = 0;
+ n = *np;
+ for (i=0; i<n; i++)
+ pl = add_dir(pl, np, pl[i].file, curr);
+
+ qsort(pl, *np, sizeof(struct plist), pl_cmp);
+
+ /* array is now stable, so set up parent pointers */
+ n = *np;
+ curr[0] = 0;
+ prevnode[0] = -1;
+ prev = "";
+ for (i=0; i<n; i++) {
+ int d = common_depth(prev, pl[i].file);
+ if (d == 0)
+ pl[i].parent = -1;
+ else {
+ pl[i].parent = parents[d-1];
+ pl[pl[i].parent].last = i;
+ }
+ pl[i].prev = prevnode[d];
+ if (pl[i].prev > -1)
+ pl[pl[i].prev].next = i;
+ prev = pl[i].file;
+ parents[d] = i;
+ prevnode[d] = i;
+ prevnode[d+1] = -1;
+ }
+ return pl;
+}
+
+int get_prev(int pos, struct plist *pl, int n)
+{
+ if (pos == -1) return pos;
+ if (pl[pos].prev == -1)
+ return pl[pos].parent;
+ pos = pl[pos].prev;
+ while (pl[pos].open &&
+ pl[pos].last >= 0)
+ pos = pl[pos].last;
+ return pos;
+}
+
+int get_next(int pos, struct plist *pl, int n)
+{
+ if (pos == -1) return pos;
+ if (pl[pos].open) {
+ if (pos +1 < n)
+ return pos+1;
+ else
+ return -1;
+ }
+ while (pos >= 0 && pl[pos].next == -1)
+ pos = pl[pos].parent;
+ if (pos >= 0)
+ pos = pl[pos].next;
+ return pos;
+}
+
+void draw_one(int row, struct plist *pl)
+{
+ char hdr[10];
+ hdr[0] = 0;
+
+ if (pl == NULL) {
+ move(row,0);
+ clrtoeol();
+ return;
+ }
+ if (pl->chunks > 99)
+ strcpy(hdr, "XX");
+ else sprintf(hdr, "%02d", pl->chunks);
+ if (pl->wiggles > 99)
+ strcpy(hdr, " XX");
+ else sprintf(hdr+2, " %02d", pl->wiggles);
+ if (pl->conflicts > 99)
+ strcpy(hdr, " XX");
+ else sprintf(hdr+5, " %02d ", pl->conflicts);
+ if (pl->end)
+ strcpy(hdr+9, "= ");
+ else if (pl->open)
+ strcpy(hdr+9, "+ ");
+ else strcpy(hdr+9, "- ");
+
+ mvaddstr(row, 0, hdr);
+ mvaddstr(row, 11, pl->file);
+ clrtoeol();
+}
+
+void addword(struct elmnt e)
+{
+ addnstr(e.start, e.len);
+}
+
+void diff_window(struct plist *p, FILE *f)
+{
+ /*
+ * I wonder what to display here ....
+ */
+ struct stream s;
+ struct stream s1, s2;
+ struct file f1, f2;
+ struct csl *csl;
+ char buf[100];
+ int ch;
+ s = load_segment(f, p->start, p->end);
+ ch = split_patch(s, &s1, &s2);
+
+ clear();
+ sprintf(buf, "Chunk count: %d\n", ch);
+ mvaddstr(1,1,buf); clrtoeol();
+
+
+ f1 = split_stream(s1, ByWord, 0);
+ f2 = split_stream(s2, ByWord, 0);
+
+ csl = diff(f1, f2);
+
+ /* now try to display the diff highlighted */
+ int sol = 1;
+ int a=0, b=0;
+
+ while(a<f1.elcnt || b < f2.elcnt) {
+ if (a < csl->a) {
+ if (sol) {
+ int a1;
+ /* if we remove a whole line, output +line,
+ * else clear sol and retry
+ */
+ sol = 0;
+ for (a1=a; a1<csl->a; a1++)
+ if (f1.list[a1].start[0] == '\n') {
+ sol = 1;
+ break;
+ }
+ if (sol) {
+ addch('-');
+ attron(A_UNDERLINE);
+ for (; a<csl->a; a++) {
+ addword(f1.list[a]);
+ if (f1.list[a].start[0] == '\n') {
+ a++;
+ break;
+ }
+ }
+ attroff(A_UNDERLINE);
+ } else addch('|');
+ }
+ if (!sol) {
+ attron(A_UNDERLINE);
+ do {
+ if (sol) {
+ attroff(A_UNDERLINE);
+ addch('|');
+ attron(A_UNDERLINE);
+ }
+ addword(f1.list[a]);
+ sol = (f1.list[a].start[0] == '\n');
+ a++;
+ } while (a < csl->a);
+ attroff(A_UNDERLINE);
+ if (sol) addch('|');
+ sol = 0;
+ }
+ } else if (b < csl->b) {
+ if (sol) {
+ int b1;
+ sol = 0;
+ for (b1=b; b1<csl->b; b1++)
+ if (f2.list[b1].start[0] == '\n') {
+ sol = 1;
+ break;
+ }
+ if (sol) {
+ addch('+');
+ attron(A_BOLD);
+ for (; b<csl->b; b++) {
+ addword(f2.list[b]);
+ if (f2.list[b].start[0] == '\n') {
+ b++;
+ break;
+ }
+ }
+ attroff(A_BOLD);
+ } else addch('|');
+ }
+ if (!sol) {
+ attron(A_BOLD);
+ do {
+ if (sol) {
+ attroff(A_BOLD);
+ addch('|');
+ attron(A_BOLD);
+ }
+ addword(f2.list[b]);
+ sol = (f2.list[b].start[0] == '\n');
+ b++;
+ } while (b < csl->b);
+ attroff(A_BOLD);
+ if (sol) addch('|');
+ sol = 0;
+ }
+ } else {
+ if (sol) {
+ int a1;
+ sol = 0;
+ for (a1=a; a1<csl->a+csl->len; a1++)
+ if (f1.list[a1].start[0] == '\n')
+ sol = 1;
+ if (sol) {
+ if (f1.list[a].start[0]) {
+ addch(' ');
+ for (; a< csl->a+csl->len; a++,b++) {
+ addword(f1.list[a]);
+ if (f1.list[a].start[0]=='\n') {
+ a++,b++;
+ break;
+ }
+ }
+ } else {
+ addstr("SEP\n");
+ a++; b++;
+ }
+ } else addch('|');
+ }
+ if (!sol) {
+ addword(f1.list[a]);
+ if (f1.list[a].start[0] == '\n')
+ sol = 1;
+ a++;
+ b++;
+ }
+ if (a >= csl->a+csl->len)
+ csl++;
+ }
+ }
+
+
+ getch();
+
+ free(s1.body);
+ free(s2.body);
+ free(f1.list);
+ free(f2.list);
+}
+
+void main_window(struct plist *pl, int n, FILE *f)
+{
+ /* The main window lists all files together with summary information:
+ * number of chunks, number of wiggles, number of conflicts.
+ * The list is scrollable
+ * When an entry is 'selected', we switch to the 'file' window
+ * The list can be condensed by removing files with no conflict
+ * or no wiggles, or removing subdirectories
+ *
+ * We record which file in the list is 'current', and which
+ * screen line it is on. We try to keep things stable while
+ * moving.
+ *
+ * Counts are printed before the name using at most 2 digits.
+ * Numbers greater than 99 are XX
+ * Ch Wi Co File
+ * 27 5 1 drivers/md/md.c
+ *
+ * A directory shows the sum of all its children.
+ *
+ * Commands:
+ * select: enter, space, mouseclick
+ * on file, go to file window
+ * on directory, toggle open
+ * up: k, p, control-p uparrow
+ * Move to previous open object
+ * down: j, n, control-n, downarrow
+ * Move to next open object
+ *
+ */
+ int pos=0; /* position in file */
+ int row=1; /* position on screen */
+ int rows; /* size of screen in rows */
+ int cols;
+ int tpos, i;
+ int refresh = 2;
+ int c;
+
+ while(1) {
+ if (refresh == 2) {
+ clear();
+ attron(A_BOLD);
+ mvaddstr(0,0,"Ch Wi Co Patched Files");
+ move(2,0);
+ attroff(A_BOLD);
+ refresh = 1;
+ }
+ if (row <1 || row >= rows)
+ refresh = 1;
+ if (refresh) {
+ refresh = 0;
+ getmaxyx(stdscr, rows, cols);
+ if (row >= rows +3)
+ row = (rows+1)/2;
+ if (row >= rows)
+ row = rows-1;
+ tpos = pos;
+ for (i=row; i>1; i--) {
+ tpos = get_prev(tpos, pl, n);
+ if (tpos == -1) {
+ row = row - i + 1;
+ break;
+ }
+ }
+ /* Ok, row and pos could be trustworthy now */
+ tpos = pos;
+ for (i=row; i>=1; i--) {
+ draw_one(i, &pl[tpos]);
+ tpos = get_prev(tpos, pl, n);
+ }
+ tpos = pos;
+ for (i=row+1; i<rows; i++) {
+ tpos = get_next(tpos, pl, n);
+ if (tpos >= 0)
+ draw_one(i, &pl[tpos]);
+ else
+ draw_one(i, NULL);
+ }
+ }
+ move(row, 9);
+ c = getch();
+ switch(c) {
+ case 'j':
+ case 'n':
+ case 'N':
+ case 'N'-64:
+ case KEY_DOWN:
+ tpos = get_next(pos, pl, n);
+ if (tpos >= 0) {
+ pos = tpos;
+ row++;
+ }
+ break;
+ case 'k':
+ case 'p':
+ case 'P':
+ case 'P'-64:
+ case KEY_UP:
+ tpos = get_prev(pos, pl, n);
+ if (tpos >= 0) {
+ pos = tpos;
+ row--;
+ }
+ break;
+
+ case ' ':
+ case 13:
+ if (pl[pos].end == 0) {
+ pl[pos].open = ! pl[pos].open;
+ refresh = 1;
+ } else {
+ diff_window(&pl[pos], f);
+ refresh = 2;
+ }
+ break;
+ case 27: /* escape */
+ case 'q':
+ return;
+ }
+ }
+}
+
+
+int main(int argc, char *argv[])
+{
+ int n = 0;
+ FILE *f = NULL;
+ FILE *in = stdin;
+ struct plist *pl;
+
+ if (argc == 3)
+ f = fopen(argv[argc-1], "w+");
+ if (argc >=2)
+ in = fopen(argv[1], "r");
+ else {
+ printf("no arg...\n");
+ exit(2);
+ }
+
+ pl = parse_patch(in, f, &n);
+ pl = sort_patches(pl, &n);
+
+ if (f) {
+ fclose(in);
+ in = f;
+ }
+#if 0
+ int i;
+ for (i=0; i<n ; i++) {
+ printf("%3d: %3d %2d/%2d %s\n", i, pl[i].parent, pl[i].prev, pl[i].next, pl[i].file);
+ }
+ exit(0);
+#endif
+ signal(SIGINT, catch);
+ signal(SIGQUIT, catch);
+ signal(SIGTERM, catch);
+ signal(SIGBUS, catch);
+ signal(SIGSEGV, catch);
+
+ initscr(); cbreak(); noecho();
+ nonl(); intrflush(stdscr, FALSE); keypad(stdscr, TRUE);
+ mousemask(ALL_MOUSE_EVENTS, NULL);
+
+ main_window(pl, n, in);
+
+ nocbreak();nl();endwin();
+ return 0;
+}
diff --git a/demo.orig/wiggle.c b/demo.orig/wiggle.c
new file mode 100644
index 0000000..2bbb90f
--- /dev/null
+++ b/demo.orig/wiggle.c
@@ -0,0 +1,643 @@
+/*
+ * wiggle - apply rejected patches
+ *
+ * Copyright (C) 2003 Neil Brown <neilb@cse.unsw.edu.au>
+ *
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * Author: Neil Brown
+ * Email: <neilb@cse.unsw.edu.au>
+ * Paper: Neil Brown
+ * School of Computer Science and Engineering
+ * The University of New South Wales
+ * Sydney, 2052
+ * Australia
+ */
+
+/*
+ * Wiggle is a tool for working with patches that don't quite apply properly.
+ * It provides functionality similar to 'diff' and 'merge' but can
+ * work at the level of individual words thus allowing the merging of
+ * two changes that affect the same line, but not the same parts of that line.
+ *
+ * Wiggle can also read patch and merge files. Unlike 'merge' it does not
+ * need to be given three separate files, but can be given a file and a patch
+ * and it will extract the pieces of the two other files that it needs from
+ * the patch.
+ *
+ * Wiggle performs one of three core functions:
+ * --extract -x extract part of a patch or merge file
+ * --diff -d report differences between two files
+ * --merge -m merge the changes between two files into a third file
+ *
+ * To perform these, wiggle requires 1, 2, or 3 input streams respectively.
+ * It can get these from individual files, from a diff (unified or context) or
+ * from a merge file.
+ *
+ * For merge:
+ * If one file is given, it is a merge file (output of 'merge').
+ * If two files are given, the second is assumed to be a patch, the first is a normal file.
+ * If three files are given, they are taken to be normal files.
+ *
+ * For diff:
+ * If one file is given, it is a patch
+ * If two files are given, they are normal files.
+ *
+ * For extract:
+ * Only one file can be given. -p indicates it is a patch, otherwise it is a merge.
+ * One of the flags -1 -2 or -3 must also be given and they indicate which
+ * part of the patch or merge to extract.
+ *
+ * Difference calculation and merging are performed on lines (-l) or words (-w).
+ * In the case of -w, an initial diff is computed based on non-trivial words.
+ * i.e. spaces are ignored
+ * This diff is computed from the ends of the file and is used to find a suitable
+ * starting point and range. Then a more precise diff is computed over that
+ * restricted range
+ *
+ * Other options available are:
+ * --replace -r replace first file with result of merge.
+ * --help -h provide help
+ * --version -v version
+ *
+ * Defaults are --merge --words
+ *
+ */
+
+#include "wiggle.h"
+#include <errno.h>
+#include <string.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <ctype.h>
+
+void die()
+{
+ fprintf(stderr,"wiggle: fatal error\n");
+ abort();
+ exit(3);
+}
+
+void printword(FILE *f, struct elmnt e)
+{
+ if (e.start[0])
+ fprintf(f, "%.*s", e.len, e.start);
+ else {
+ int a,b,c;
+ sscanf(e.start+1, "%d %d %d", &a, &b, &c);
+ fprintf(f, "*** %d,%d **** %d\n", b,c,a);
+ }
+}
+
+static void printsep(struct elmnt e1, struct elmnt e2)
+{
+ int a,b,c,d,e,f;
+ sscanf(e1.start+1, "%d %d %d", &a, &b, &c);
+ sscanf(e2.start+1, "%d %d %d", &d, &e, &f);
+ printf("@@ -%d,%d +%d,%d @@\n", b,c,e,f);
+}
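
printword and printsep above rely on a convention used elsewhere in wiggle: an element whose text begins with a '\0' byte is a chunk separator, and the characters after that NUL hold three decimal numbers (read back with sscanf). The exact meaning of the three numbers is wiggle-internal; this hypothetical snippet only illustrates the decoding itself:

	#include <stdio.h>

	int main(void)
	{
		/* a separator element's text: a leading NUL, then three numbers */
		const char sep[] = "\0" "5 120 7";
		int a, b, c;

		if (sep[0] == '\0' && sscanf(sep + 1, "%d %d %d", &a, &b, &c) == 3)
			printf("separator carries %d, %d and %d\n", a, b, c);
		return 0;
	}
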
+
+
+/* Remove any entries from the common-sublist that are
+ * just spaces, tabs, or newlines
+ */
+void cleanlist(struct file a, struct file b, struct csl *list)
+{
+ struct csl *new = list;
+
+ while (list->len) {
+ int i;
+ int ap;
+ for( ap = list->a; ap< list->a+list->len; ap++) {
+ for (i=0; i<a.list[ap].len; i++) {
+ char c = a.list[ap].start[i];
+ if (isalnum(c))
+ break;
+ }
+ if (i != a.list[ap].len)
+ break;
+ }
+ if (ap == list->a+list->len)
+ list++;
+ else
+ *new++ = *list++;
+ }
+ *new = *list;
+}
+
+int main(int argc, char *argv[])
+{
+ int opt;
+ int option_index;
+ int mode = 0;
+ int obj = 0;
+ int replace = 0;
+ char *replacename=NULL, *orignew=NULL;
+ int which = 0;
+ int ispatch = 0;
+ int reverse = 0;
+ int verbose=0, quiet=0;
+ int i;
+ int chunks1=0, chunks2=0, chunks3=0;
+ int exit_status = 0;
+ FILE *outfile = stdout;
+ char *helpmsg;
+
+ struct stream f, flist[3];
+ struct file fl[3];
+ struct csl *csl1, *csl2;
+
+ while ((opt = getopt_long(argc, argv,
+ short_options, long_options,
+ &option_index)) != -1)
+ switch(opt) {
+ case 'h':
+ helpmsg = Help;
+ switch(mode) {
+ case 'x': helpmsg = HelpExtract; break;
+ case 'd': helpmsg = HelpDiff; break;
+ case 'm': helpmsg = HelpMerge; break;
+ }
+ fputs(helpmsg, stderr);
+ exit(0);
+
+ case 'V':
+ fputs(Version, stderr);
+ exit(0);
+ case ':':
+ case '?':
+ default:
+ fputs(Usage, stderr);
+ exit(2);
+
+ case 'x':
+ case 'd':
+ case 'm':
+ if (mode ==0){
+ mode = opt;
+ continue;
+ }
+ fprintf(stderr, "wiggle: mode is '%c' - cannot set to '%c'\n",
+ mode, opt);
+ exit(2);
+
+ case 'w':
+ case 'l':
+ if (obj == 0 || obj == opt) {
+ obj = opt;
+ continue;
+ }
+ fprintf(stderr, "wiggle: cannot select both words and lines.\n");
+ exit(2);
+
+ case 'r':
+ replace = 1;
+ continue;
+ case 'R':
+ reverse = 1;
+ continue;
+
+ case '1':
+ case '2':
+ case '3':
+ if (which == 0 || which == opt) {
+ which = opt;
+ continue;
+ }
+ fprintf(stderr, "wiggle: can only select one of -1, -2, -3\n");
+ exit(2);
+
+ case 'p':
+ ispatch = 1;
+ continue;
+
+ case 'v': verbose++; continue;
+ case 'q': quiet=1 ; continue;
+ }
+ if (!mode)
+ mode = 'm';
+
+ if (obj && mode == 'x') {
+ fprintf(stderr,"wiggle: cannot specify --line or --word with --extract\n");
+ exit(2);
+ }
+ if (mode != 'm' && !obj) obj = 'w';
+ if (replace && mode != 'm') {
+ fprintf(stderr, "wiggle: --replace only allowed with --merge\n");
+ exit(2);
+ }
+ if (mode == 'x' && !which) {
+ fprintf(stderr, "wiggle: must specify -1, -2 or -3 with --extract\n");
+ exit(2);
+ }
+ if (mode != 'x' && mode != 'd' && which) {
+ fprintf(stderr, "wiggle: -1, -2 or -3 only allowed with --extract or --diff\n");
+ exit(2);
+ }
+ if (ispatch && (mode != 'x' && mode != 'd')) {
+ fprintf(stderr, "wiggle: --patch only allowed with --extract or --diff\n");
+ exit(2);
+ }
+ if (ispatch && which == '3') {
+ fprintf(stderr, "wiggle: cannot extract -3 from a patch.\n");
+ exit(2);
+ }
+
+ switch(mode) {
+ case 'x':
+ /* extract a branch of a diff or diff3 or merge output
+ * We need one file
+ */
+ if (optind == argc) {
+ fprintf(stderr, "wiggle: no file given for --extract\n");
+ exit(2);
+ }
+ if (optind < argc-1) {
+ fprintf(stderr, "wiggle: only give one file for --extract\n");
+ exit(2);
+ }
+ f = load_file(argv[optind]);
+ if (f.body==NULL) {
+ fprintf(stderr, "wiggle: cannot load file '%s' - %s\n",
+ argv[optind], strerror(errno));
+ exit(2);
+ }
+ if (ispatch)
+ chunks1 = chunks2 = split_patch(f, &flist[0], &flist[1]);
+ else {
+ if (!split_merge(f, &flist[0], &flist[1], &flist[2])) {
+ fprintf(stderr, "wiggle: merge file %s looks bad.\n",
+ argv[optind]);
+ exit(2);
+ }
+ }
+ if (flist[which-'1'].body == NULL) {
+ fprintf(stderr, "wiggle: %s has no -%c component.\n",
+ argv[optind], which);
+ exit(2);
+ } else {
+ write(1, flist[which-'1'].body, flist[which-'1'].len);
+ }
+
+ break;
+ case 'd':
+ /* create a diff (line or char) of two streams */
+ switch (argc-optind) {
+ case 0:
+ fprintf(stderr, "wiggle: no file given for --diff\n");
+ exit(2);
+ case 1:
+ f = load_file(argv[optind]);
+ if (f.body == NULL) {
+ fprintf(stderr, "wiggle: cannot load file '%s' - %s\n",
+ argv[optind], strerror(errno));
+ exit(2);
+ }
+ chunks1 = chunks2 = split_patch(f, &flist[0], &flist[1]);
+ if (!flist[0].body || !flist[1].body) {
+ fprintf(stderr, "wiggle: couldn't parse patch %s\n",
+ argv[optind]);
+ exit(2);
+ }
+ break;
+ case 2:
+ flist[0] = load_file(argv[optind]);
+ if (flist[0].body == NULL) {
+ fprintf(stderr, "wiggle: cannot load file '%s' - %s\n",
+ argv[optind], strerror(errno));
+ exit(2);
+ }
+ if (ispatch) {
+ f = load_file(argv[optind+1]);
+ if (f.body == NULL) {
+ fprintf(stderr, "wiggle: cannot load patch '%s' - %s\n",
+ argv[optind], strerror(errno));
+ exit(2);
+ }
+ if (which == '2')
+ chunks2 = chunks3 = split_patch(f, &flist[2], &flist[1]);
+ else
+ chunks2 = chunks3 = split_patch(f, &flist[1], &flist[2]);
+
+ } else
+ flist[1] = load_file(argv[optind+1]);
+ if (flist[1].body == NULL) {
+ fprintf(stderr, "wiggle: cannot load file '%s' - %s\n",
+ argv[optind+1], strerror(errno));
+ exit(2);
+ }
+ break;
+ default:
+ fprintf(stderr, "wiggle: too many files given for --diff\n");
+ exit(2);
+ }
+ if (reverse) {
+ f=flist[1];
+ flist[1] = flist[2];
+ flist[2]= f;
+ }
+ if (obj == 'l') {
+ int a,b;
+ fl[0] = split_stream(flist[0], ByLine, 0);
+ fl[1] = split_stream(flist[1], ByLine, 0);
+ if (chunks2 && ! chunks1)
+ csl1 = pdiff(fl[0], fl[1], chunks2);
+ else
+ csl1 = diff(fl[0], fl[1]);
+
+ if (!chunks1)
+ printf("@@ -1,%d +1,%d @@\n", fl[0].elcnt, fl[1].elcnt);
+ a = b = 0;
+ while (a<fl[0].elcnt || b < fl[1].elcnt) {
+ if (a < csl1->a) {
+ if (fl[0].list[a].start[0]) {
+ printf("-");
+ printword(stdout, fl[0].list[a]);
+ }
+ a++;
+ exit_status++;
+ } else if (b < csl1->b) {
+ if (fl[1].list[b].start[0]) {
+ printf("+");
+ printword(stdout, fl[1].list[b]);
+ }
+ b++;
+ exit_status++;
+ } else {
+ if (fl[0].list[a].start[0] == '\0')
+ printsep(fl[0].list[a], fl[1].list[b]);
+ else {
+ printf(" ");
+ printword(stdout, fl[0].list[a]);
+ }
+ a++;
+ b++;
+ if (a >= csl1->a+csl1->len)
+ csl1++;
+ }
+ }
+ } else {
+ int a,b;
+ int sol = 1; /* start of line */
+ fl[0] = split_stream(flist[0], ByWord, 0);
+ fl[1] = split_stream(flist[1], ByWord, 0);
+ if (chunks2 && !chunks1)
+ csl1 = pdiff(fl[0], fl[1], chunks2);
+ else
+ csl1 = diff(fl[0], fl[1]);
+
+ if (!chunks1) {
+ /* count lines in each file */
+ int l1, l2, i;
+ l1=l2=0;
+ for (i=0;i<fl[0].elcnt;i++)
+ if (ends_line(fl[0].list[i]))
+ l1++;
+ for (i=0;i<fl[1].elcnt;i++)
+ if (ends_line(fl[1].list[i]))
+ l2++;
+ printf("@@ -1,%d +1,%d @@\n", l1,l2);
+ }
+ a = b = 0;
+ while (a < fl[0].elcnt || b < fl[1].elcnt) {
+ if (a < csl1->a) {
+ exit_status++;
+ if (sol) {
+ int a1;
+ /* If we remove a whole line, output +line
+ * else clear sol and retry */
+ sol = 0;
+ for (a1=a; a1<csl1->a;a1++)
+ if (ends_line(fl[0].list[a1])) {
+ sol=1;
+ break;
+ }
+ if (sol) {
+ printf("-");
+ for (; a<csl1->a; a++) {
+ printword(stdout, fl[0].list[a]);
+ if (ends_line(fl[0].list[a])) {
+ a++;
+ break;
+ }
+ }
+ } else printf("|");
+ }
+ if (!sol) {
+ printf("<<<--");
+ do {
+ if (sol) printf("|");
+ printword(stdout, fl[0].list[a]);
+ sol = ends_line(fl[0].list[a]);
+ a++;
+ } while (a < csl1->a);
+ printf("%s-->>>", sol?"|":"");
+ sol=0;
+ }
+ } else if (b < csl1->b) {
+ exit_status++;
+ if (sol) {
+ int b1;
+ sol = 0;
+ for (b1=b; b1<csl1->b;b1++)
+ if(ends_line(fl[1].list[b1])) {
+ sol=1;
+ break;
+ }
+ if (sol) {
+ printf("+");
+ for(; b<csl1->b ; b++) {
+ printword(stdout, fl[1].list[b]);
+ if(ends_line(fl[1].list[b])) {
+ b++;
+ break;
+ }
+ }
+ } else printf("|");
+ }
+ if (!sol) {
+ printf("<<<++");
+ do {
+ if (sol) printf("|");
+ printword(stdout, fl[1].list[b]);
+ sol = ends_line(fl[1].list[b]);
+ b++;
+ } while (b < csl1->b);
+ printf("%s++>>>",sol?"|":"");
+ sol=0;
+ }
+ } else {
+ if (sol) {
+ int a1;
+ sol = 0;
+ for (a1=a; a1<csl1->a+csl1->len; a1++)
+ if (ends_line(fl[0].list[a1]))
+ sol=1;
+ if (sol) {
+ if (fl[0].list[a].start[0]) {
+ printf(" ");
+ for(; a<csl1->a+csl1->len; a++,b++) {
+ printword(stdout, fl[0].list[a]);
+ if (ends_line(fl[0].list[a])) {
+ a++,b++;
+ break;
+ }
+ }
+ } else {
+ printsep(fl[0].list[a], fl[1].list[b]);
+ a++; b++;
+ }
+ }
+ else printf("|");
+ }
+ if (!sol) {
+ printword(stdout, fl[0].list[a]);
+ if (ends_line(fl[0].list[a]))
+ sol=1;
+ a++;
+ b++;
+ }
+ if (a >= csl1->a+csl1->len)
+ csl1++;
+ }
+ }
+
+ }
+ break;
+ case 'm':
+ /* merge three files, A B C, so changed between B and C get made to A
+ */
+ switch (argc-optind) {
+ case 0:
+ fprintf(stderr, "wiggle: no files given for --merge\n");
+ exit(2);
+ case 3:
+ case 2:
+ case 1:
+ for (i=0; i< argc-optind; i++) {
+ flist[i] = load_file(argv[optind+i]);
+ if (flist[i].body == NULL) {
+ fprintf(stderr, "wiggle: cannot load file '%s' - %s\n",
+ argv[optind+i], strerror(errno));
+ exit(2);
+ }
+ }
+ break;
+ default:
+ fprintf(stderr, "wiggle: too many files given for --merge\n");
+ exit(2);
+ }
+ switch(argc-optind) {
+ case 1: /* a merge file */
+ f = flist[0];
+ if (!split_merge(f, &flist[0], &flist[1], &flist[2])) {
+ fprintf(stderr,"wiggle: merge file %s looks bad.\n",
+ argv[optind]);
+ exit(2);
+ }
+ break;
+ case 2: /* a file and a patch */
+ f = flist[1];
+ chunks2 = chunks3 = split_patch(f, &flist[1], &flist[2]);
+ break;
+ case 3: /* three separate files */
+ break;
+ }
+ if (reverse) {
+ f=flist[1];
+ flist[1] = flist[2];
+ flist[2]= f;
+ }
+
+ for (i=0; i<3; i++) {
+ if (flist[i].body==NULL) {
+ fprintf(stderr, "wiggle: file %d missing\n", i);
+ exit(2);
+ }
+ }
+ if (replace) {
+ int fd;
+ replacename = malloc(strlen(argv[optind])+ 20);
+ if (!replacename) die();
+ orignew = malloc(strlen(argv[optind])+20);
+ if (!orignew) die();
+ strcpy(replacename, argv[optind]);
+ strcpy(orignew, argv[optind]);
+ strcat(orignew, ".porig");
+ if (open(orignew, O_RDONLY) >= 0 ||
+ errno != ENOENT) {
+ fprintf(stderr,"wiggle: %s already exists\n",
+ orignew);
+ exit(2);
+ }
+ strcat(replacename,"XXXXXX");
+ fd = mkstemp(replacename);
+ if (fd == -1) {
+ fprintf(stderr,"wiggle: could not create temporary file for %s\n",
+ replacename);
+ exit(2);
+ }
+ outfile = fdopen(fd, "w");
+
+ }
+
+ if (obj == 'l') {
+ fl[0] = split_stream(flist[0], ByLine, 0);
+ fl[1] = split_stream(flist[1], ByLine, 0);
+ fl[2] = split_stream(flist[2], ByLine, 0);
+ } else {
+ fl[0] = split_stream(flist[0], ByWord, 0);
+ fl[1] = split_stream(flist[1], ByWord, 0);
+ fl[2] = split_stream(flist[2], ByWord, 0);
+ }
+ if (chunks2 && !chunks1)
+ csl1 = pdiff(fl[0], fl[1], chunks2);
+ else
+ csl1 = diff(fl[0], fl[1]);
+ csl2 = diff(fl[1], fl[2]);
+
+#if 0
+ cleanlist(fl[0],fl[1],csl1);
+ cleanlist(fl[1],fl[2],csl2);
+#endif
+
+ {
+ struct ci ci;
+
+ ci = print_merge(outfile, &fl[0], &fl[1], &fl[2],
+ csl1, csl2, obj=='w');
+ if (!quiet && ci.conflicts)
+ fprintf(stderr, "%d unresolved conflict%s found\n", ci.conflicts, ci.conflicts==1?"":"s");
+ if (!quiet && ci.ignored)
+ fprintf(stderr, "%d already-applied change%s ignored\n", ci.ignored, ci.ignored==1?"":"s");
+ exit_status = (ci.conflicts > 0);
+ }
+ if (replace) {
+ fclose(outfile);
+ if (rename(argv[optind], orignew) ==0 &&
+ rename(replacename, argv[optind]) ==0)
+ /* all ok */;
+ else {
+ fprintf(stderr, "wiggle: failed to move new file into place.\n");
+ exit(2);
+ }
+ }
+ break;
+
+ }
+ exit(exit_status);
+}
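
The --replace branch above follows a conservative update pattern: refuse to run if NAME.porig already exists, write the merge result to a mkstemp() temporary next to the original, then rename the original to NAME.porig and the temporary into place. A condensed sketch of just that pattern (hypothetical helper, not wiggle's API):

	#include <errno.h>
	#include <fcntl.h>
	#include <stdio.h>
	#include <stdlib.h>

	/* Replace 'name' with 'contents', keeping the old file as name.porig. */
	static int replace_with_backup(const char *name, const char *contents)
	{
		char tmp[1024], backup[1024];
		int fd;
		FILE *out;

		snprintf(backup, sizeof(backup), "%s.porig", name);
		if (open(backup, O_RDONLY) >= 0 || errno != ENOENT)
			return -1;		/* a previous backup is already there */

		snprintf(tmp, sizeof(tmp), "%sXXXXXX", name);
		fd = mkstemp(tmp);
		if (fd < 0)
			return -1;
		out = fdopen(fd, "w");
		fputs(contents, out);
		if (fclose(out) != 0)
			return -1;

		/* keep the original as the backup, then move the new file into place */
		if (rename(name, backup) != 0 || rename(tmp, name) != 0)
			return -1;
		return 0;
	}

	int main(int argc, char *argv[])
	{
		if (argc == 2 && replace_with_backup(argv[1], "merged result\n") == 0)
			printf("replaced %s (original kept as %s.porig)\n",
			       argv[1], argv[1]);
		return 0;
	}
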
diff --git a/demo.patched/Makefile b/demo.patched/Makefile
new file mode 100644
index 0000000..c7d827e
--- /dev/null
+++ b/demo.patched/Makefile
@@ -0,0 +1,56 @@
+
+# Note on my Mobile Pentium II, -march=pentium2 delivers twice the performance of i386
+#OptDbg=-O3
+#OptDbg=-O3 -march=pentium2
+OptDbg=-ggdb
+CFLAGS=$(OptDbg) -Wall -Werror
+
+# STRIP = -s
+INSTALL = /usr/bin/install
+DESTDIR =
+BINDIR = /usr/bin
+MANDIR = /usr/share/man
+MAN1DIR = $(MANDIR)/man1
+MAN5DIR = $(MANDIR)/man5
+LDLIBS=-lncurses
+
+all: wiggle wiggle.man test
+
+vpatch : vpatch.o extract.o split.o diff.o
+
+
+wiggle : wiggle.o load.o split.o extract.o diff.o bestmatch.o ReadMe.o merge.o
+wiggle.o load.o split.o extract.o diff.o bestmatch.o ReadMe.o merge.o : wiggle.h
+
+test: wiggle dotest
+ sh dotest
+
+wiggle.man : wiggle.1
+ nroff -man wiggle.1 > wiggle.man
+
+clean:
+ rm -f *.o *.man wiggle .version* version
+ find . -name core -o -name '*.tmp*' -o -name .tmp | xargs rm -f
+
+install : wiggle wiggle.1
+ $(INSTALL) -D $(STRIP) -m 755 wiggle $(DESTDIR)$(BINDIR)/wiggle
+ $(INSTALL) -D -m 644 wiggle.1 $(DESTDIR)$(MAN1DIR)/wiggle.1
+
+version : wiggle.1
+ @rm -f version
+ @sed -n -e 's/.*wiggle - v\([0-9.]*\) - .*/\1/p' ReadMe.c > .version-readme
+ @sed -n -e 's/.*WIGGLE 1 "" v\([0-9.]*\)$$/\1/p' wiggle.1 > .version-man
+	@cmp -s .version-readme .version-man && cat .version-man > version || { echo Inconsistent versions.; exit 1;}
+
+dist : clean version
+ mkdir -p DIST
+ rm -f DIST/wiggle-`cat version`
+ ln -s .. DIST/wiggle-`cat version`
+ tar czvf DIST/wiggle-`cat version`.tar.gz -h -C DIST --exclude RCS --exclude DIST wiggle-`cat version`
+ rm -f DIST/wiggle-`cat version`
+
+v : version
+ cat version
+
+demo.patch:
+ diff -ru demo.orig demo.patched | sed 's/demo.patched/demo/' > demo.patch
diff --git a/demo.patched/README b/demo.patched/README
new file mode 100644
index 0000000..f00d0b5
--- /dev/null
+++ b/demo.patched/README
@@ -0,0 +1,70 @@
+
+This is demo file for wiggle's --browse mode.
+
+Browse mode is intended to let you look through a patch
+to see how it will apply to a set of files. It is
+possible that the patch will have some conflicts.
+That is: the patch was created from a different version of
+the source to the version that you are applying the patch
+to. This is what makes it interesting.
+
+You can use normal cursor motion to scroll around,
+both vertically and horizontally (both emacs and vi style).
+From the initial file-list patch, use space
+or <return> to open/close a directory or file.
+From the file-view mode, use 'q' to get back
+to the file list.
+
+Differences applied by the patch are shown as
+RED for removal and BLUE for addition.
+Text with a pink background was not matched -
+maybe it has been changed since the patch was
+created.
+
+Green-background text is text that the patch wants
+to change, but the exact correct change has already
+been made.
+
+Capital-N might go to the next interesting chunk
+of the file.
+
+You can use 'o' and 'r' to view the original or result.
+You can use 'b' and 'a' to view the before or after
+sides of the patch.
+You can use 'd' to view the diff (patch, before and
+after) or 'm' to view the merge (original and result).
+
+'|' returns to the original split-window view with
+merge on the left and diff on the right.
+
+Have fun.
+
+---------------------------------------
+This demonstrates where a diff on one line
+applies to text that is now split
+
+The quick brown fox jumps over the lazy hound.
+
+------------
+
+This demonstrates a diff which contains some
+extraneous lines.
+(this is an extra line)
+
+The quick brown she
+fox jumps over the lazy cat.
+
+There will be extra lines in the diff
+(as is this)
+
+--------------
+
+Here are some lines
+without any mention of
+a clear conflict
+
+--------------
+
+Two dissimilar lines,
+both having changes
+
diff --git a/demo.patched/vpatch.c b/demo.patched/vpatch.c
new file mode 100644
index 0000000..1746ab0
--- /dev/null
+++ b/demo.patched/vpatch.c
@@ -0,0 +1,667 @@
+
+/*
+ * vpatch - visual front end for wiggle
+ *
+ * "files" Display, lists all files with statistics
+ * - can hide various lines including subdirectories
+ * and files without wiggles or conflicts
+ * "diff" display shows merged file with different parts
+ * in different colours
+ * - untouched are pale A_DIM
+ * - matched/remaining are regular A_NORMAL
+ * - matched/removed are red/underlined A_UNDERLINE
+ * - unmatched in file are A_STANDOUT
+ * - unmatched in patch are A_STANDOUT|A_UNDERLINE ???
+ * - inserted are inverse/green ?? A_REVERSE
+ *
+ * The window can be split horizontally or vertically and
+ * two different views displayed. They will have different
+ * parts missing
+ *
+ * So a display of NORMAL, underline, standout|underline reverse
+ * should show a normal patch.
+ *
+ */
+
+#include "wiggle.h"
+#include <malloc.h>
+#include <string.h>
+#include <curses.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <signal.h>
+
+#define assert(x) do { if (!(x)) abort(); } while (0)
+
+struct plist {
+ char *file;
+ unsigned int start, end;
+ int parent;
+ int next, prev, last;
+ int open;
+ int chunks, wiggles, conflicts;
+};
+
+struct plist *patch_add_file(struct plist *pl, int *np, char *file,
+ unsigned int start, unsigned int end)
+{
+ /* size of pl is 0, 16, n^2 */
+ int n = *np;
+ int asize;
+
+/* printf("adding %s at %d: %u %u\n", file, n, start, end); */
+ if (n==0) asize = 0;
+ else if (n<=16) asize = 16;
+ else if ((n&(n-1))==0) asize = n;
+ else asize = n+1; /* not accurate, but not too large */
+ if (asize <= n) {
+ /* need to extend array */
+ struct plist *npl;
+ if (asize < 16) asize = 16;
+ else asize += asize;
+ npl = realloc(pl, asize * sizeof(struct plist));
+ if (!npl) {
+ fprintf(stderr, "malloc failed - skipping %s\n", file);
+ return pl;
+ }
+ pl = npl;
+ }
+ pl[n].file = file;
+ pl[n].start = start;
+ pl[n].end = end;
+ pl[n].last = pl[n].next = pl[n].prev = pl[n].parent = -1;
+ pl[n].chunks = pl[n].wiggles = pl[n].conflicts = 0;
+ pl[n].open = 1;
+ *np = n+1;
+ return pl;
+}
+
+
+
+struct plist *parse_patch(FILE *f, FILE *of, int *np)
+{
+ /* read a multi-file patch from 'f' and record relevant
+ * details in a plist.
+ * if 'of' >= 0, fd might not be seekable so we write
+ * to 'of' and use lseek on 'of' to determine position
+ */
+ struct plist *plist = NULL;
+
+ while (!feof(f)) {
+ /* first, find the start of a patch: "\n+++ "
+ * grab the file name and scan to the end of a line
+ */
+ char *target="\n+++ ";
+ char *target2="\n--- ";
+ char *pos = target;
+ int c;
+ char name[1024];
+ unsigned start, end;
+
+ while (*pos && (c=fgetc(f)) != EOF ) {
+ if (of) fputc(c, of);
+ if (c == *pos)
+ pos++;
+ else pos = target;
+ }
+ if (c == EOF)
+ break;
+ assert(c == ' ');
+ /* now read a file name */
+ pos = name;
+ while ((c=fgetc(f)) != EOF && c != '\t' && c != '\n' && c != ' ' &&
+ pos - name < 1023) {
+ *pos++ = c;
+ if (of) fputc(c, of);
+ }
+ *pos = 0;
+ if (c == EOF)
+ break;
+ if (of) fputc(c, of);
+ while (c != '\n' && (c=fgetc(f)) != EOF) {
+ if (of) fputc(c, of);
+ }
+ start = of ? ftell(of) : ftell(f);
+
+ if (c == EOF) break;
+
+ /* now skip to end - "\n--- " */
+ pos = target2+1;
+
+ while (*pos && (c=fgetc(f)) != EOF) {
+ if (of) fputc(c, of);
+ if (c == *pos)
+ pos++;
+ else pos = target2;
+ }
+ if (pos > target2) {
+ end = of ? ftell(of) : ftell(f);
+ end -= (pos - target2) - 1;
+ plist = patch_add_file(plist, np,
+ strdup(name), start, end);
+ }
+ }
+ return plist;
+}
+void die()
+{
+ fprintf(stderr,"vpatch: fatal error\n");
+ abort();
+ exit(3);
+}
+
+
+static struct stream load_segment(FILE *f,
+ unsigned int start, unsigned int end)
+{
+ struct stream s;
+ s.len = end - start;
+ s.body = malloc(s.len);
+ if (s.body) {
+ fseek(f, start, 0);
+ if (fread(s.body, 1, s.len, f) != s.len) {
+ free(s.body);
+ s.body = NULL;
+ }
+ } else
+ die();
+ return s;
+}
+
+
+void catch(int sig)
+{
+ if (sig == SIGINT) {
+ signal(sig, catch);
+ return;
+ }
+ nocbreak();nl();endwin();
+ printf("Died on signal %d\n", sig);
+ exit(2);
+}
+
+int pl_cmp(const void *av, const void *bv)
+{
+ const struct plist *a = av;
+ const struct plist *b = bv;
+ return strcmp(a->file, b->file);
+}
+
+int common_depth(char *a, char *b)
+{
+ /* find number of patch segments that these two have
+ * in common
+ */
+ int depth = 0;
+ while(1) {
+ char *c;
+ int al, bl;
+ c = strchr(a, '/');
+ if (c) al = c-a; else al = strlen(a);
+ c = strchr(b, '/');
+ if (c) bl = c-b; else bl = strlen(b);
+ if (al == 0 || al != bl || strncmp(a,b,al) != 0)
+ return depth;
+ a+= al;
+ while (*a=='/') a++;
+ b+= bl;
+ while(*b=='/') b++;
+
+ depth++;
+ }
+}
+
+struct plist *add_dir(struct plist *pl, int *np, char *file, char *curr)
+{
+ /* any parent of file that is not a parent of curr
+ * needs to be added to pl
+ */
+ int d = common_depth(file, curr);
+ char *buf = curr;
+ while (d) {
+ char *c = strchr(file, '/');
+ int l;
+ if (c) l = c-file; else l = strlen(file);
+ file += l;
+ curr += l;
+ while (*file == '/') file++;
+ while (*curr == '/') curr++;
+ d--;
+ }
+ while (*file) {
+ if (curr > buf && curr[-1] != '/')
+ *curr++ = '/';
+ while (*file && *file != '/')
+ *curr++ = *file++;
+ while (*file == '/') *file++;
+ *curr = '\0';
+ if (*file)
+ pl = patch_add_file(pl, np, strdup(buf),
+ 0, 0);
+ }
+ return pl;
+}
+
+struct plist *sort_patches(struct plist *pl, int *np)
+{
+ /* sort the patches, add directory names, and re-sort */
+ char curr[1024];
+ char *prev;
+ int parents[100];
+ int prevnode[100];
+ int i, n;
+ qsort(pl, *np, sizeof(struct plist), pl_cmp);
+ curr[0] = 0;
+ n = *np;
+ for (i=0; i<n; i++)
+ pl = add_dir(pl, np, pl[i].file, curr);
+
+ qsort(pl, *np, sizeof(struct plist), pl_cmp);
+
+ /* array is now stable, so set up parent pointers */
+ n = *np;
+ curr[0] = 0;
+ prevnode[0] = -1;
+ prev = "";
+ for (i=0; i<n; i++) {
+ int d = common_depth(prev, pl[i].file);
+ if (d == 0)
+ pl[i].parent = -1;
+ else {
+ pl[i].parent = parents[d-1];
+ pl[pl[i].parent].last = i;
+ }
+ pl[i].prev = prevnode[d];
+ if (pl[i].prev > -1)
+ pl[pl[i].prev].next = i;
+ prev = pl[i].file;
+ parents[d] = i;
+ prevnode[d] = i;
+ prevnode[d+1] = -1;
+ }
+ return pl;
+}
+
+int get_prev(int pos, struct plist *pl, int n)
+{
+ if (pos == -1) return pos;
+ if (pl[pos].prev == -1)
+ return pl[pos].parent;
+ pos = pl[pos].prev;
+ while (pl[pos].open &&
+ pl[pos].last >= 0)
+ pos = pl[pos].last;
+ return pos;
+}
+
+int get_next(int pos, struct plist *pl, int n)
+{
+ if (pos == -1) return pos;
+ if (pl[pos].open) {
+ if (pos +1 < n)
+ return pos+1;
+ else
+ return -1;
+ }
+ while (pos >= 0 && pl[pos].next == -1)
+ pos = pl[pos].parent;
+ if (pos >= 0)
+ pos = pl[pos].next;
+ return pos;
+}
+
+void draw_one(int row, struct plist *pl)
+{
+ char hdr[10];
+ hdr[0] = 0;
+
+ if (pl == NULL) {
+ move(row,0);
+ clrtoeol();
+ return;
+ }
+ if (pl->chunks > 99)
+ strcpy(hdr, "XX");
+ else sprintf(hdr, "%02d", pl->chunks);
+ if (pl->wiggles > 99)
+ strcpy(hdr, " XX");
+ else sprintf(hdr+2, " %02d", pl->wiggles);
+ if (pl->conflicts > 99)
+ strcpy(hdr, " XX");
+ else sprintf(hdr+5, " %02d ", pl->conflicts);
+ if (pl->end)
+ strcpy(hdr+9, "= ");
+ else if (pl->open)
+ strcpy(hdr+9, "+ ");
+ else strcpy(hdr+9, "- ");
+
+ mvaddstr(row, 0, hdr);
+ mvaddstr(row, 11, pl->file);
+ clrtoeol();
+}
+
+void addword(struct elmnt e)
+{
+ addnstr(e.start, e.len);
+}
+
+void diff_window(struct plist *p, FILE *f)
+{
+ /*
+ * I wonder what to display here ....
+ */
+ struct stream s;
+ struct stream s1, s2;
+ struct file f1, f2;
+ struct csl *csl;
+ char buf[100];
+ int ch;
+ s = load_segment(f, p->start, p->end);
+ ch = split_patch(s, &s1, &s2);
+
+ clear();
+ sprintf(buf, "Chunk count: %d\n", ch);
+ mvaddstr(1,1,buf); clrtoeol();
+
+
+ f1 = split_stream(s1, ByWord, 0);
+ f2 = split_stream(s2, ByWord, 0);
+
+ csl = diff(f1, f2);
+
+ /* now try to display the diff highlighted */
+ int sol = 1;
+ int a=0, b=0;
+
+ while(a<f1.elcnt || b < f2.elcnt) {
+ if (a < csl->a) {
+ if (sol) {
+ int a1;
+ /* if we remove a whole line, output +line,
+ * else clear sol and retry
+ */
+ sol = 0;
+ for (a1=a; a1<csl->a; a1++)
+ if (f1.list[a1].start[0] == '\n') {
+ sol = 1;
+ break;
+ }
+ if (sol) {
+ addch('-');
+ attron(A_UNDERLINE);
+ for (; a<csl->a; a++) {
+ addword(f1.list[a]);
+ if (f1.list[a].start[0] == '\n') {
+ a++;
+ break;
+ }
+ }
+ attroff(A_UNDERLINE);
+ } else addch('|');
+ }
+ if (!sol) {
+ attron(A_UNDERLINE);
+ do {
+ if (sol) {
+ attroff(A_UNDERLINE);
+ addch('|');
+ attron(A_UNDERLINE);
+ }
+ addword(f1.list[a]);
+ sol = (f1.list[a].start[0] == '\n');
+ a++;
+ } while (a < csl->a);
+ attroff(A_UNDERLINE);
+ if (sol) addch('|');
+ sol = 0;
+ }
+ } else if (b < csl->b) {
+ if (sol) {
+ int b1;
+ sol = 0;
+ for (b1=b; b1<csl->b; b1++)
+ if (f2.list[b1].start[0] == '\n') {
+ sol = 1;
+ break;
+ }
+ if (sol) {
+ addch('+');
+ attron(A_BOLD);
+ for (; b<csl->b; b++) {
+ addword(f2.list[b]);
+ if (f2.list[b].start[0] == '\n') {
+ b++;
+ break;
+ }
+ }
+ attroff(A_BOLD);
+ } else addch('|');
+ }
+ if (!sol) {
+ attron(A_BOLD);
+ do {
+ if (sol) {
+ attroff(A_BOLD);
+ addch('|');
+ attron(A_BOLD);
+ }
+ addword(f2.list[b]);
+ sol = (f2.list[b].start[0] == '\n');
+ b++;
+ } while (b < csl->b);
+ attroff(A_BOLD);
+ if (sol) addch('|');
+ sol = 0;
+ }
+ } else {
+ if (sol) {
+ int a1;
+ sol = 0;
+ for (a1=a; a1<csl->a+csl->len; a1++)
+ if (f1.list[a1].start[0] == '\n')
+ sol = 1;
+ if (sol) {
+ if (f1.list[a].start[0]) {
+ addch(' ');
+ for (; a< csl->a+csl->len; a++,b++) {
+ addword(f1.list[a]);
+ if (f1.list[a].start[0]=='\n') {
+ a++,b++;
+ break;
+ }
+ }
+ } else {
+ addstr("SEP\n");
+ a++; b++;
+ }
+ } else addch('|');
+ }
+ if (!sol) {
+ addword(f1.list[a]);
+ if (f1.list[a].start[0] == '\n')
+ sol = 1;
+ a++;
+ b++;
+ }
+ if (a >= csl->a+csl->len)
+ csl++;
+ }
+ }
+
+
+ getch();
+
+ free(s1.body);
+ free(s2.body);
+ free(f1.list);
+ free(f2.list);
+}
+
+void main_window(struct plist *pl, int n, FILE *f)
+{
+ /* The main window lists all files together with summary information:
+ * number of chunks, number of wiggles, number of conflicts.
+ * The list is scrollable
+ * When an entry is 'selected', we switch to the 'file' window
+ * The list can be condensed by removing files with no conflict
+ * or no wiggles, or removing subdirectories
+ *
+ * We record which file in the list is 'current', and which
+ * screen line it is on. We try to keep things stable while
+ * moving.
+ *
+ * Counts are printed before the name using at most 2 digits.
+ * Numbers greater than 99 are XX
+ * Ch Wi Co File
+ * 27 5 1 drivers/md/md.c
+ *
+ * A directory shows the sum of all its children.
+ *
+ * Commands:
+ * select: enter, space, mouseclick
+ * on file, go to file window
+ * on directory, toggle open
+ * up: k, p, control-p uparrow
+ * Move to previous open object
+ * down: j, n, control-n, downarrow
+ * Move to next open object
+ *
+ */
+ int pos=0; /* position in file */
+ int row=1; /* position on screen */
+ int rows; /* size of screen in rows */
+ int cols;
+ int tpos, i;
+ int refresh = 2;
+ int c;
+
+ while(1) {
+ if (refresh == 2) {
+ clear();
+ attron(A_BOLD);
+ mvaddstr(0,0,"Ch Wi Co Patched Files");
+ move(2,0);
+ attroff(A_BOLD);
+ refresh = 1;
+ }
+ if (row <1 || row >= rows)
+ refresh = 1;
+ if (refresh) {
+ refresh = 0;
+ getmaxyx(stdscr, rows, cols);
+ if (row >= rows +3)
+ row = (rows+1)/2;
+ if (row >= rows)
+ row = rows-1;
+ tpos = pos;
+ for (i=row; i>1; i--) {
+ tpos = get_prev(tpos, pl, n);
+ if (tpos == -1) {
+ row = row - i + 1;
+ break;
+ }
+ }
+ /* Ok, row and pos could be trustworthy now */
+ tpos = pos;
+ for (i=row; i>=1; i--) {
+ draw_one(i, &pl[tpos]);
+ tpos = get_prev(tpos, pl, n);
+ }
+ tpos = pos;
+ for (i=row+1; i<rows; i++) {
+ tpos = get_next(tpos, pl, n);
+ if (tpos >= 0)
+ draw_one(i, &pl[tpos]);
+ else
+ draw_one(i, NULL);
+ }
+ }
+ move(row, 9);
+ c = getch();
+ switch(c) {
+ case 'j':
+ case 'n':
+ case 'N':
+ case 'N'-64:
+ case KEY_DOWN:
+ tpos = get_next(pos, pl, n);
+ if (tpos >= 0) {
+ pos = tpos;
+ row++;
+ }
+ break;
+ case 'k':
+ case 'p':
+ case 'P':
+ case 'P'-64:
+ case KEY_UP:
+ tpos = get_prev(pos, pl, n);
+ if (tpos >= 0) {
+ pos = tpos;
+ row--;
+ }
+ break;
+
+ case ' ':
+ case 13:
+ if (pl[pos].end == 0) {
+ pl[pos].open = ! pl[pos].open;
+ refresh = 1;
+ } else {
+ diff_window(&pl[pos], f);
+ refresh = 2;
+ }
+ break;
+ case 27: /* escape */
+ case 'q':
+ return;
+ }
+ }
+}
+
+
+int main(int argc, char *argv[])
+{
+ int n = 0;
+ FILE *f = NULL;
+ FILE *in = stdin;
+ struct plist *pl;
+
+ if (argc == 3)
+ f = fopen(argv[argc-1], "w+");
+ if (argc >=2)
+ in = fopen(argv[1], "r");
+ else {
+ printf("no arg...\n");
+ exit(2);
+ }
+
+ pl = parse_patch(in, f, &n);
+ pl = sort_patches(pl, &n);
+
+ if (f) {
+ fclose(in);
+ in = f;
+ }
+#if 0
+ int i;
+ for (i=0; i<n ; i++) {
+ printf("%3d: %3d %2d/%2d %s\n", i, pl[i].parent, pl[i].prev, pl[i].next, pl[i].file);
+ }
+ exit(0);
+#endif
+ signal(SIGINT, catch);
+ signal(SIGQUIT, catch);
+ signal(SIGBUS, catch);
+ signal(SIGTERM, catch);
+ signal(SIGSEGV, catch);
+
+ initscr(); cbreak(); noecho();
+ nonl(); intrflush(stdscr, FALSE); keypad(stdscr, TRUE);
+ mousemask(ALL_MOUSE_EVENTS, NULL);
+
+ main_window(pl, n, in);
+
+ nocbreak();nl();endwin();
+ return 0;
+}
diff --git a/demo.patched/wiggle.c b/demo.patched/wiggle.c
new file mode 100644
index 0000000..2bbb90f
--- /dev/null
+++ b/demo.patched/wiggle.c
@@ -0,0 +1,643 @@
+/*
+ * wiggle - apply rejected patches
+ *
+ * Copyright (C) 2003 Neil Brown <neilb@cse.unsw.edu.au>
+ *
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * Author: Neil Brown
+ * Email: <neilb@cse.unsw.edu.au>
+ * Paper: Neil Brown
+ * School of Computer Science and Engineering
+ * The University of New South Wales
+ * Sydney, 2052
+ * Australia
+ */
+
+/*
+ * Wiggle is a tool for working with patches that don't quite apply properly.
+ * It provides functionality similar to 'diff' and 'merge' but can
+ * work at the level of individual words thus allowing the merging of
+ * two changes that affect the same line, but not the same parts of that line.
+ *
+ * Wiggle can also read patch and merge files. Unlike 'merge' it does not
+ * need to be given three separate files, but can be given a file and a patch
+ * and it will extract the pieces of the two other files that it needs from
+ * the patch.
+ *
+ * Wiggle performs one of three core functions:
+ * --extract -x extract part of a patch or merge file
+ * --diff -d report differences between two files
+ * --merge -m merge the changes between two files into a third file
+ *
+ * To perform these, wiggle requires 1, 2, or 3 input streams respectively.
+ * It can get these from individual files, from a diff (unified or context) or
+ * from a merge file.
+ *
+ * For merge:
+ * If one file is given, it is a merge file (output of 'merge').
+ * If two files are given, the second is assumed to be a patch, the first is a normal file.
+ * If three files are given, they are taken to be normal files.
+ *
+ * For diff:
+ * If one file is given, it is a patch
+ * If two files are given, they are normal files.
+ *
+ * For extract:
+ * Only one file can be given. -p indicates it is a patch, otherwise it is a merge.
+ * One of the flags -1 -2 or -3 must also be given and they indicate which
+ * part of the patch or merge to extract.
+ *
+ * Difference calculation and merging are performed on lines (-l) or words (-w).
+ * In the case of -w, an initial diff is computed based on non-trivial words.
+ * i.e. spaces are ignored
+ * This diff is computed from the ends of the file and is used to find a suitable
+ * starting point and range. Then a more precise diff is computed over that
+ * restricted range
+ *
+ * Other options available are:
+ * --replace -r replace first file with result of merge.
+ * --help -h provide help
+ * --version -v version
+ *
+ * Defaults are --merge --words
+ *
+ */
+
+#include "wiggle.h"
+#include <errno.h>
+#include <string.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <ctype.h>
+
+void die()
+{
+ fprintf(stderr,"wiggle: fatal error\n");
+ abort();
+ exit(3);
+}
+
+void printword(FILE *f, struct elmnt e)
+{
+ if (e.start[0])
+ fprintf(f, "%.*s", e.len, e.start);
+ else {
+ int a,b,c;
+ sscanf(e.start+1, "%d %d %d", &a, &b, &c);
+ fprintf(f, "*** %d,%d **** %d\n", b,c,a);
+ }
+}
+
+static void printsep(struct elmnt e1, struct elmnt e2)
+{
+ int a,b,c,d,e,f;
+ sscanf(e1.start+1, "%d %d %d", &a, &b, &c);
+ sscanf(e2.start+1, "%d %d %d", &d, &e, &f);
+ printf("@@ -%d,%d +%d,%d @@\n", b,c,e,f);
+}
+
+
+/* Remove any entries from the common-sublist that are
+ * just spaces, tabs, or newlines
+ */
+void cleanlist(struct file a, struct file b, struct csl *list)
+{
+ struct csl *new = list;
+
+ while (list->len) {
+ int i;
+ int ap;
+ for( ap = list->a; ap< list->a+list->len; ap++) {
+ for (i=0; i<a.list[ap].len; i++) {
+ char c = a.list[ap].start[i];
+ if (isalnum(c))
+ break;
+ }
+ if (i != a.list[ap].len)
+ break;
+ }
+ if (ap == list->a+list->len)
+ list++;
+ else
+ *new++ = *list++;
+ }
+ *new = *list;
+}
+
+int main(int argc, char *argv[])
+{
+ int opt;
+ int option_index;
+ int mode = 0;
+ int obj = 0;
+ int replace = 0;
+ char *replacename=NULL, *orignew=NULL;
+ int which = 0;
+ int ispatch = 0;
+ int reverse = 0;
+ int verbose=0, quiet=0;
+ int i;
+ int chunks1=0, chunks2=0, chunks3=0;
+ int exit_status = 0;
+ FILE *outfile = stdout;
+ char *helpmsg;
+
+ struct stream f, flist[3];
+ struct file fl[3];
+ struct csl *csl1, *csl2;
+
+ while ((opt = getopt_long(argc, argv,
+ short_options, long_options,
+ &option_index)) != -1)
+ switch(opt) {
+ case 'h':
+ helpmsg = Help;
+ switch(mode) {
+ case 'x': helpmsg = HelpExtract; break;
+ case 'd': helpmsg = HelpDiff; break;
+ case 'm': helpmsg = HelpMerge; break;
+ }
+ fputs(helpmsg, stderr);
+ exit(0);
+
+ case 'V':
+ fputs(Version, stderr);
+ exit(0);
+ case ':':
+ case '?':
+ default:
+ fputs(Usage, stderr);
+ exit(2);
+
+ case 'x':
+ case 'd':
+ case 'm':
+ if (mode ==0){
+ mode = opt;
+ continue;
+ }
+ fprintf(stderr, "wiggle: mode is '%c' - cannot set to '%c'\n",
+ mode, opt);
+ exit(2);
+
+ case 'w':
+ case 'l':
+ if (obj == 0 || obj == opt) {
+ obj = opt;
+ continue;
+ }
+ fprintf(stderr, "wiggle: cannot select both words and lines.\n");
+ exit(2);
+
+ case 'r':
+ replace = 1;
+ continue;
+ case 'R':
+ reverse = 1;
+ continue;
+
+ case '1':
+ case '2':
+ case '3':
+ if (which == 0 || which == opt) {
+ which = opt;
+ continue;
+ }
+ fprintf(stderr, "wiggle: can only select one of -1, -2, -3\n");
+ exit(2);
+
+ case 'p':
+ ispatch = 1;
+ continue;
+
+ case 'v': verbose++; continue;
+ case 'q': quiet=1 ; continue;
+ }
+ if (!mode)
+ mode = 'm';
+
+ if (obj && mode == 'x') {
+ fprintf(stderr,"wiggle: cannot specify --line or --word with --extract\n");
+ exit(2);
+ }
+ if (mode != 'm' && !obj) obj = 'w';
+ if (replace && mode != 'm') {
+ fprintf(stderr, "wiggle: --replace only allowed with --merge\n");
+ exit(2);
+ }
+ if (mode == 'x' && !which) {
+ fprintf(stderr, "wiggle: must specify -1, -2 or -3 with --extract\n");
+ exit(2);
+ }
+ if (mode != 'x' && mode != 'd' && which) {
+ fprintf(stderr, "wiggle: -1, -2 or -3 only allowed with --extract or --diff\n");
+ exit(2);
+ }
+ if (ispatch && (mode != 'x' && mode != 'd')) {
+ fprintf(stderr, "wiggle: --patch only allowed with --extract or --diff\n");
+ exit(2);
+ }
+ if (ispatch && which == '3') {
+ fprintf(stderr, "wiggle: cannot extract -3 from a patch.\n");
+ exit(2);
+ }
+
+ switch(mode) {
+ case 'x':
+ /* extract a branch of a diff or diff3 or merge output
+ * We need one file
+ */
+ if (optind == argc) {
+ fprintf(stderr, "wiggle: no file given for --extract\n");
+ exit(2);
+ }
+ if (optind < argc-1) {
+ fprintf(stderr, "wiggle: only give one file for --extract\n");
+ exit(2);
+ }
+ f = load_file(argv[optind]);
+ if (f.body==NULL) {
+ fprintf(stderr, "wiggle: cannot load file '%s' - %s\n",
+ argv[optind], strerror(errno));
+ exit(2);
+ }
+ if (ispatch)
+ chunks1 = chunks2 = split_patch(f, &flist[0], &flist[1]);
+ else {
+ if (!split_merge(f, &flist[0], &flist[1], &flist[2])) {
+ fprintf(stderr, "wiggle: merge file %s looks bad.\n",
+ argv[optind]);
+ exit(2);
+ }
+ }
+ if (flist[which-'1'].body == NULL) {
+ fprintf(stderr, "wiggle: %s has no -%c component.\n",
+ argv[optind], which);
+ exit(2);
+ } else {
+ write(1, flist[which-'1'].body, flist[which-'1'].len);
+ }
+
+ break;
+ case 'd':
+ /* create a diff (line or char) of two streams */
+ switch (argc-optind) {
+ case 0:
+ fprintf(stderr, "wiggle: no file given for --diff\n");
+ exit(2);
+ case 1:
+ f = load_file(argv[optind]);
+ if (f.body == NULL) {
+ fprintf(stderr, "wiggle: cannot load file '%s' - %s\n",
+ argv[optind], strerror(errno));
+ exit(2);
+ }
+ chunks1 = chunks2 = split_patch(f, &flist[0], &flist[1]);
+ if (!flist[0].body || !flist[1].body) {
+ fprintf(stderr, "wiggle: couldn't parse patch %s\n",
+ argv[optind]);
+ exit(2);
+ }
+ break;
+ case 2:
+ flist[0] = load_file(argv[optind]);
+ if (flist[0].body == NULL) {
+ fprintf(stderr, "wiggle: cannot load file '%s' - %s\n",
+ argv[optind], strerror(errno));
+ exit(2);
+ }
+ if (ispatch) {
+ f = load_file(argv[optind+1]);
+ if (f.body == NULL) {
+ fprintf(stderr, "wiggle: cannot load patch '%s' - %s\n",
+ argv[optind], strerror(errno));
+ exit(2);
+ }
+ if (which == '2')
+ chunks2 = chunks3 = split_patch(f, &flist[2], &flist[1]);
+ else
+ chunks2 = chunks3 = split_patch(f, &flist[1], &flist[2]);
+
+ } else
+ flist[1] = load_file(argv[optind+1]);
+ if (flist[1].body == NULL) {
+ fprintf(stderr, "wiggle: cannot load file '%s' - %s\n",
+ argv[optind+1], strerror(errno));
+ exit(2);
+ }
+ break;
+ default:
+ fprintf(stderr, "wiggle: too many files given for --diff\n");
+ exit(2);
+ }
+ if (reverse) {
+ f=flist[1];
+ flist[1] = flist[2];
+ flist[2]= f;
+ }
+ if (obj == 'l') {
+ int a,b;
+ fl[0] = split_stream(flist[0], ByLine, 0);
+ fl[1] = split_stream(flist[1], ByLine, 0);
+ if (chunks2 && ! chunks1)
+ csl1 = pdiff(fl[0], fl[1], chunks2);
+ else
+ csl1 = diff(fl[0], fl[1]);
+
+ if (!chunks1)
+ printf("@@ -1,%d +1,%d @@\n", fl[0].elcnt, fl[1].elcnt);
+ a = b = 0;
+ while (a<fl[0].elcnt || b < fl[1].elcnt) {
+ if (a < csl1->a) {
+ if (fl[0].list[a].start[0]) {
+ printf("-");
+ printword(stdout, fl[0].list[a]);
+ }
+ a++;
+ exit_status++;
+ } else if (b < csl1->b) {
+ if (fl[1].list[b].start[0]) {
+ printf("+");
+ printword(stdout, fl[1].list[b]);
+ }
+ b++;
+ exit_status++;
+ } else {
+ if (fl[0].list[a].start[0] == '\0')
+ printsep(fl[0].list[a], fl[1].list[b]);
+ else {
+ printf(" ");
+ printword(stdout, fl[0].list[a]);
+ }
+ a++;
+ b++;
+ if (a >= csl1->a+csl1->len)
+ csl1++;
+ }
+ }
+ } else {
+ int a,b;
+ int sol = 1; /* start of line */
+ fl[0] = split_stream(flist[0], ByWord, 0);
+ fl[1] = split_stream(flist[1], ByWord, 0);
+ if (chunks2 && !chunks1)
+ csl1 = pdiff(fl[0], fl[1], chunks2);
+ else
+ csl1 = diff(fl[0], fl[1]);
+
+ if (!chunks1) {
+ /* count lines in each file */
+ int l1, l2, i;
+ l1=l2=0;
+ for (i=0;i<fl[0].elcnt;i++)
+ if (ends_line(fl[0].list[i]))
+ l1++;
+ for (i=0;i<fl[1].elcnt;i++)
+ if (ends_line(fl[1].list[i]))
+ l2++;
+ printf("@@ -1,%d +1,%d @@\n", l1,l2);
+ }
+ a = b = 0;
+ while (a < fl[0].elcnt || b < fl[1].elcnt) {
+ if (a < csl1->a) {
+ exit_status++;
+ if (sol) {
+ int a1;
+ /* If we remove a whole line, output +line
+ * else clear sol and retry */
+ sol = 0;
+ for (a1=a; a1<csl1->a;a1++)
+ if (ends_line(fl[0].list[a1])) {
+ sol=1;
+ break;
+ }
+ if (sol) {
+ printf("-");
+ for (; a<csl1->a; a++) {
+ printword(stdout, fl[0].list[a]);
+ if (ends_line(fl[0].list[a])) {
+ a++;
+ break;
+ }
+ }
+ } else printf("|");
+ }
+ if (!sol) {
+ printf("<<<--");
+ do {
+ if (sol) printf("|");
+ printword(stdout, fl[0].list[a]);
+ sol = ends_line(fl[0].list[a]);
+ a++;
+ } while (a < csl1->a);
+ printf("%s-->>>", sol?"|":"");
+ sol=0;
+ }
+ } else if (b < csl1->b) {
+ exit_status++;
+ if (sol) {
+ int b1;
+ sol = 0;
+ for (b1=b; b1<csl1->b;b1++)
+ if(ends_line(fl[1].list[b1])) {
+ sol=1;
+ break;
+ }
+ if (sol) {
+ printf("+");
+ for(; b<csl1->b ; b++) {
+ printword(stdout, fl[1].list[b]);
+ if(ends_line(fl[1].list[b])) {
+ b++;
+ break;
+ }
+ }
+ } else printf("|");
+ }
+ if (!sol) {
+ printf("<<<++");
+ do {
+ if (sol) printf("|");
+ printword(stdout, fl[1].list[b]);
+ sol = ends_line(fl[1].list[b]);
+ b++;
+ } while (b < csl1->b);
+ printf("%s++>>>",sol?"|":"");
+ sol=0;
+ }
+ } else {
+ if (sol) {
+ int a1;
+ sol = 0;
+ for (a1=a; a1<csl1->a+csl1->len; a1++)
+ if (ends_line(fl[0].list[a1]))
+ sol=1;
+ if (sol) {
+ if (fl[0].list[a].start[0]) {
+ printf(" ");
+ for(; a<csl1->a+csl1->len; a++,b++) {
+ printword(stdout, fl[0].list[a]);
+ if (ends_line(fl[0].list[a])) {
+ a++,b++;
+ break;
+ }
+ }
+ } else {
+ printsep(fl[0].list[a], fl[1].list[b]);
+ a++; b++;
+ }
+ }
+ else printf("|");
+ }
+ if (!sol) {
+ printword(stdout, fl[0].list[a]);
+ if (ends_line(fl[0].list[a]))
+ sol=1;
+ a++;
+ b++;
+ }
+ if (a >= csl1->a+csl1->len)
+ csl1++;
+ }
+ }
+
+ }
+ break;
+ case 'm':
+ /* merge three files, A B C, so changed between B and C get made to A
+ */
+ switch (argc-optind) {
+ case 0:
+ fprintf(stderr, "wiggle: no files given for --merge\n");
+ exit(2);
+ case 3:
+ case 2:
+ case 1:
+ for (i=0; i< argc-optind; i++) {
+ flist[i] = load_file(argv[optind+i]);
+ if (flist[i].body == NULL) {
+ fprintf(stderr, "wiggle: cannot load file '%s' - %s\n",
+ argv[optind+i], strerror(errno));
+ exit(2);
+ }
+ }
+ break;
+ default:
+ fprintf(stderr, "wiggle: too many files given for --merge\n");
+ exit(2);
+ }
+ switch(argc-optind) {
+ case 1: /* a merge file */
+ f = flist[0];
+ if (!split_merge(f, &flist[0], &flist[1], &flist[2])) {
+ fprintf(stderr,"wiggle: merge file %s looks bad.\n",
+ argv[optind]);
+ exit(2);
+ }
+ break;
+ case 2: /* a file and a patch */
+ f = flist[1];
+ chunks2 = chunks3 = split_patch(f, &flist[1], &flist[2]);
+ break;
+ case 3: /* three separate files */
+ break;
+ }
+ if (reverse) {
+ f=flist[1];
+ flist[1] = flist[2];
+ flist[2]= f;
+ }
+
+ for (i=0; i<3; i++) {
+ if (flist[i].body==NULL) {
+ fprintf(stderr, "wiggle: file %d missing\n", i);
+ exit(2);
+ }
+ }
+ if (replace) {
+ int fd;
+ replacename = malloc(strlen(argv[optind])+ 20);
+ if (!replacename) die();
+ orignew = malloc(strlen(argv[optind])+20);
+ if (!orignew) die();
+ strcpy(replacename, argv[optind]);
+ strcpy(orignew, argv[optind]);
+ strcat(orignew, ".porig");
+ if (open(orignew, O_RDONLY) >= 0 ||
+ errno != ENOENT) {
+ fprintf(stderr,"wiggle: %s already exists\n",
+ orignew);
+ exit(2);
+ }
+ strcat(replacename,"XXXXXX");
+ fd = mkstemp(replacename);
+ if (fd == -1) {
+ fprintf(stderr,"wiggle: could not create temporary file for %s\n",
+ replacename);
+ exit(2);
+ }
+ outfile = fdopen(fd, "w");
+
+ }
+
+ if (obj == 'l') {
+ fl[0] = split_stream(flist[0], ByLine, 0);
+ fl[1] = split_stream(flist[1], ByLine, 0);
+ fl[2] = split_stream(flist[2], ByLine, 0);
+ } else {
+ fl[0] = split_stream(flist[0], ByWord, 0);
+ fl[1] = split_stream(flist[1], ByWord, 0);
+ fl[2] = split_stream(flist[2], ByWord, 0);
+ }
+ if (chunks2 && !chunks1)
+ csl1 = pdiff(fl[0], fl[1], chunks2);
+ else
+ csl1 = diff(fl[0], fl[1]);
+ csl2 = diff(fl[1], fl[2]);
+
+#if 0
+ cleanlist(fl[0],fl[1],csl1);
+ cleanlist(fl[1],fl[2],csl2);
+#endif
+
+ {
+ struct ci ci;
+
+ ci = print_merge(outfile, &fl[0], &fl[1], &fl[2],
+ csl1, csl2, obj=='w');
+ if (!quiet && ci.conflicts)
+ fprintf(stderr, "%d unresolved conflict%s found\n", ci.conflicts, ci.conflicts==1?"":"s");
+ if (!quiet && ci.ignored)
+ fprintf(stderr, "%d already-applied change%s ignored\n", ci.ignored, ci.ignored==1?"":"s");
+ exit_status = (ci.conflicts > 0);
+ }
+ if (replace) {
+ fclose(outfile);
+ if (rename(argv[optind], orignew) ==0 &&
+ rename(replacename, argv[optind]) ==0)
+ /* all ok */;
+ else {
+ fprintf(stderr, "wiggle: failed to move new file into place.\n");
+ exit(2);
+ }
+ }
+ break;
+
+ }
+ exit(exit_status);
+}
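
For orientation, the word-level merge path above reduces to the following minimal sketch in C. It uses only the calls visible in this commit (load_file, split_patch, split_stream, diff, pdiff, print_merge); the struct names (struct stream, struct file, struct csl) are assumptions taken from wiggle.h rather than shown in this excerpt, and error handling plus the --replace logic are left out.

/* Condensed sketch of the "file + patch" merge case, word-by-word. */
#include <stdio.h>
#include "wiggle.h"

static int merge_file_with_patch(char *filename, char *patchname)
{
	struct stream flist[3];	/* assumed type from wiggle.h */
	struct file fl[3];	/* assumed type from wiggle.h */
	struct csl *csl1, *csl2;	/* assumed type from wiggle.h */
	struct ci ci;
	int chunks;

	flist[0] = load_file(filename);		/* original file */
	flist[1] = load_file(patchname);	/* unified diff */
	chunks = split_patch(flist[1], &flist[1], &flist[2]);

	fl[0] = split_stream(flist[0], ByWord, 0);	/* word-level merge */
	fl[1] = split_stream(flist[1], ByWord, 0);
	fl[2] = split_stream(flist[2], ByWord, 0);

	csl1 = chunks ? pdiff(fl[0], fl[1], chunks)	/* patch-guided diff */
		      : diff(fl[0], fl[1]);
	csl2 = diff(fl[1], fl[2]);

	ci = print_merge(stdout, &fl[0], &fl[1], &fl[2], csl1, csl2, 1);
	fprintf(stderr, "%d unresolved conflict%s found\n",
		ci.conflicts, ci.conflicts == 1 ? "" : "s");
	return ci.conflicts > 0;	/* non-zero exit status on conflicts */
}

The three-file (--merge A B C) and single merge-file cases handled above differ only in how flist[0..2] are populated before this point.
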
diff --git a/demo/383MdBlocked b/demo/383MdBlocked
new file mode 100644
index 0000000..efa1f02
--- /dev/null
+++ b/demo/383MdBlocked
@@ -0,0 +1,271 @@
+Status: ok
+
+Support 'blocked' flag on devices.
+
+MORE DETAILS and more code
+
+Signed-off-by: Neil Brown <neilb@suse.de>
+
+### Diffstat output
+ ./drivers/md/md.c | 28 ++++++++++++++++++++++++++++
+ ./drivers/md/raid1.c | 21 ++++++++++++++++++++-
+ ./drivers/md/raid10.c | 24 +++++++++++++++++++++++-
+ ./drivers/md/raid5.c | 14 ++++++++++++--
+ ./include/linux/raid/md_k.h | 7 +++++++
+ 5 files changed, 90 insertions(+), 4 deletions(-)
+
+diff ./drivers/md/md.c~current~ ./drivers/md/md.c
+--- ./drivers/md/md.c~current~ 2006-04-18 13:00:05.000000000 +1000
++++ ./drivers/md/md.c 2006-04-11 15:44:12.000000000 +1000
+@@ -1724,6 +1724,18 @@ state_show(mdk_rdev_t *rdev, char *page)
+ len += sprintf(page+len, "%sin_sync",sep);
+ sep = ",";
+ }
++ if (test_bit(WriteMostly, &rdev->flags)) {
++ len += sprintf(page+len, "%swrite_mostly",sep);
++ sep = ",";
++ }
++ if (test_bit(ReadFault, &rdev->flags)) {
++ len += sprintf(page+len, "%sread_fault",sep);
++ sep = ",";
++ }
++ if (test_bit(Blocked, &rdev->flags)) {
++ len += sprintf(page+len, "%sblocked",sep);
++ sep = ",";
++ }
+ if (!test_bit(Faulty, &rdev->flags) &&
+ !test_bit(In_sync, &rdev->flags)) {
+ len += sprintf(page+len, "%sspare", sep);
+@@ -1738,6 +1750,10 @@ state_store(mdk_rdev_t *rdev, const char
+ /* can write
+ * faulty - simulates and error
+ * remove - disconnects the device
++ * writemostly - sets write_mostly
++ * -writemostly - clears write_mostly
++ * blocked - sets Blocked flag
++ * -blocked - clears Blocked flag
+ */
+ int err = -EINVAL;
+ if (cmd_match(buf, "faulty") && rdev->mddev->pers) {
+@@ -1753,6 +1769,17 @@ state_store(mdk_rdev_t *rdev, const char
+ md_new_event(mddev);
+ err = 0;
+ }
++ } else if (cmd_match(buf, "writemostly")) {
++ set_bit(WriteMostly, &rdev->flags);
++ } else if (cmd_match(buf, "-writemostly")) {
++ clear_bit(WriteMostly, &rdev->flags);
++ } else if (cmd_match(buf, "blocked")) {
++ set_bit(Blocked, &rdev->flags);
++ } else if (cmd_match(buf, "-blocked")) {
++ clear_bit(Blocked, &rdev->flags);
++ set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
++ md_wakeup_thread(mddev->thread);
++ wake_up(&??);
+ }
+ return err ? err : len;
+ }
+@@ -5459,6 +5486,7 @@ void md_check_recovery(mddev_t *mddev)
+ ITERATE_RDEV(mddev,rdev,rtmp)
+ if (rdev->raid_disk >= 0 &&
+ (test_bit(Faulty, &rdev->flags) || ! test_bit(In_sync, &rdev->flags)) &&
++ !test_bit(Blocked, &rdev->flags) &&
+ atomic_read(&rdev->nr_pending)==0) {
+ if (mddev->pers->hot_remove_disk(mddev, rdev->raid_disk)==0) {
+ char nm[20];
+
+diff ./drivers/md/raid1.c~current~ ./drivers/md/raid1.c
+--- ./drivers/md/raid1.c~current~ 2006-04-18 13:00:05.000000000 +1000
++++ ./drivers/md/raid1.c 2006-04-18 12:59:28.000000000 +1000
+@@ -428,7 +428,7 @@ static int read_balance(conf_t *conf, r1
+ retry:
+ if (conf->mddev->recovery_cp < MaxSector &&
+ (this_sector + sectors >= conf->next_resync)) {
+- /* Choose the first operation device, for consistancy */
++ /* Choose the first operational device, for consistency */
+ new_disk = 0;
+
+ for (rdev = rcu_dereference(conf->mirrors[new_disk].rdev);
+@@ -745,6 +745,7 @@ static int make_request(request_queue_t
+ struct page **behind_pages = NULL;
+ const int rw = bio_data_dir(bio);
+ int do_barriers;
++ int blocked = 0;
+
+ /*
+ * Register the new request and wait if the reconstruction
+@@ -824,12 +825,18 @@ static int make_request(request_queue_t
+ first = 0;
+ }
+ #endif
++ retry_write:
+ rcu_read_lock();
+ for (i = 0; i < disks; i++) {
+ if ((rdev=rcu_dereference(conf->mirrors[i].rdev)) != NULL &&
+ !test_bit(Faulty, &rdev->flags)) {
+ atomic_inc(&rdev->nr_pending);
+ if (test_bit(Faulty, &rdev->flags)) {
++ if (unlikely(test_bit(Block??, &rdev->flags))) {
++ blocked = 1;
++ rdev_dec_pending(rdev, mddev);
++ break;
++ }
+ rdev_dec_pending(rdev, mddev);
+ r1_bio->bios[i] = NULL;
+ } else
+@@ -839,6 +846,18 @@ static int make_request(request_queue_t
+ r1_bio->bios[i] = NULL;
+ }
+ rcu_read_unlock();
++ if (unlikely(blocked)) {
++ /* Have to wait for this device to get unblocked, then retry. */
++ int j;
++ for (j=0; j<i; i++)
++ if (r1_bio->bios[j])
++ rdev_dec_pending(conf->mirrors[j].rdev, mddev);
++ allow_barrier(conf);
++ wait_event(mddev->recovery_wait,
++ !(test_bit(Faulty, &rdev->flags....);
++ wait_barrier(conf);
++ goto retry_write;
++ }
+
+ BUG_ON(targets == 0); /* we never fail the last device */
+
+
+diff ./drivers/md/raid10.c~current~ ./drivers/md/raid10.c
+--- ./drivers/md/raid10.c~current~ 2006-04-18 13:00:05.000000000 +1000
++++ ./drivers/md/raid10.c 2006-04-12 14:19:51.000000000 +1000
+@@ -764,6 +764,7 @@ static int make_request(request_queue_t
+ const int rw = bio_data_dir(bio);
+ struct bio_list bl;
+ unsigned long flags;
++ int blocked = 0;
+
+ if (unlikely(bio_barrier(bio))) {
+ bio_endio(bio, bio->bi_size, -EOPNOTSUPP);
+@@ -853,17 +854,22 @@ static int make_request(request_queue_t
+ /*
+ * WRITE:
+ */
+- /* first select target devices under spinlock and
++ /* first select target devices under rcu_lock and
+ * inc refcount on their rdev. Record them by setting
+ * bios[x] to bio
+ */
+ raid10_find_phys(conf, r10_bio);
++ retry_write:
+ rcu_read_lock();
+ for (i = 0; i < conf->copies; i++) {
+ int d = r10_bio->devs[i].devnum;
+ mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[d].rdev);
+ if (rdev &&
+ !test_bit(Faulty, &rdev->flags)) {
++ if (unlikely(test_bit(Blocked, &rdev->flags))) {
++ blocked = 1;
++ break;
++ }
+ atomic_inc(&rdev->nr_pending);
+ r10_bio->devs[i].bio = bio;
+ } else {
+@@ -873,6 +879,22 @@ static int make_request(request_queue_t
+ }
+ rcu_read_unlock();
+
++ if (unlikely(blocked)) {
++ /* Have to wait for this device to get unblocked, then retry */
++ int j;
++ int d;
++ for (j=0; j<i; j++)
++ if (r10_bio->devs[j].bio) {
++ d = r10_bio->devs[j].devnum;
++ rdev_dec_pending(conf->mirrors[d].rdev, mddev);
++ }
++ allow_barrier(conf);
++ d = r10_bio->devs[i].devnum;
++ wait_event(...);
++ wait_barrier(conf);
++ goto retry_write;
++ }
++
+ atomic_set(&r10_bio->remaining, 0);
+
+ bio_list_init(&bl);
+
+diff ./drivers/md/raid5.c~current~ ./drivers/md/raid5.c
+--- ./drivers/md/raid5.c~current~ 2006-04-18 13:00:05.000000000 +1000
++++ ./drivers/md/raid5.c 2006-04-12 10:26:53.000000000 +1000
+@@ -1349,6 +1349,7 @@ static void handle_stripe5(struct stripe
+ int syncing, expanding, expanded;
+ int locked=0, uptodate=0, to_read=0, to_write=0, failed=0, written=0;
+ int non_overwrite = 0;
++ int blocked = 0;
+ int failed_num=0;
+ struct r5dev *dev;
+
+@@ -1400,7 +1401,6 @@ static void handle_stripe5(struct stripe
+ if (test_bit(R5_LOCKED, &dev->flags)) locked++;
+ if (test_bit(R5_UPTODATE, &dev->flags)) uptodate++;
+
+-
+ if (dev->toread) to_read++;
+ if (dev->towrite) {
+ to_write++;
+@@ -1414,6 +1414,9 @@ static void handle_stripe5(struct stripe
+ clear_bit(R5_ReadError, &dev->flags);
+ clear_bit(R5_ReWrite, &dev->flags);
+ }
++ if (rdev && test_bit(Blocked, &rdev->flags) &&
++ test_bit(Faulty, &rdev->flags))
++ blocked = 1;
+ if (!rdev || !test_bit(In_sync, &rdev->flags)
+ || test_bit(R5_ReadError, &dev->flags)) {
+ failed++;
+@@ -1505,6 +1508,11 @@ static void handle_stripe5(struct stripe
+ clear_bit(STRIPE_SYNCING, &sh->state);
+ syncing = 0;
+ }
++ if (blocked) {
++ set_bit(STRIPE_DELAYED, &sh->state);
++ set_bit(STRIPE_HANDLE, &sh->state);
++ goto unlock;
++ }
+
+ /* might be able to return some write requests if the parity block
+ * is safe, or on a failed drive
+@@ -1818,7 +1826,7 @@ static void handle_stripe5(struct stripe
+ release_stripe(sh2);
+ }
+ }
+-
++ unlock:
+ spin_unlock(&sh->lock);
+
+ while ((bi=return_bi)) {
+@@ -2454,6 +2462,8 @@ static void handle_stripe(struct stripe_
+
+ static void raid5_activate_delayed(raid5_conf_t *conf)
+ {
++ if (conf->blocked)
++ return;
+ if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) {
+ while (!list_empty(&conf->delayed_list)) {
+ struct list_head *l = conf->delayed_list.next;
+
+diff ./include/linux/raid/md_k.h~current~ ./include/linux/raid/md_k.h
+--- ./include/linux/raid/md_k.h~current~ 2006-04-18 13:00:05.000000000 +1000
++++ ./include/linux/raid/md_k.h 2006-04-18 12:59:43.000000000 +1000
+@@ -82,6 +82,13 @@ struct mdk_rdev_s
+ #define In_sync 2 /* device is in_sync with rest of array */
+ #define WriteMostly 4 /* Avoid reading if at all possible */
+ #define BarriersNotsupp 5 /* BIO_RW_BARRIER is not supported */
++#define DoBlock 6 /* If an error occurs, block IO until
++ * error or DoBlock is cleared
++ */
++#define Blocked 7 /* An error occurred and DoBlock was
++ * set, so don't touch this device until
++ * it is cleared.
++ */
+
+ int desc_nr; /* descriptor index in the superblock */
+ int raid_disk; /* role of device in array */
diff --git a/demo/Makefile b/demo/Makefile
new file mode 100644
index 0000000..a8652de
--- /dev/null
+++ b/demo/Makefile
@@ -0,0 +1,52 @@
+# Note on my Mobile Pentium II, -march=pentium2 delivers twice the performance of i386
+#OptDbg=-O3
+#OptDbg=-O3 -march=pentium2
+CFLAGS=$(OptDbg) -Wall -Werror
+
+# STRIP =
+INSTALL = /usr/bin/install
+DESTDIR =
+BINDIR = /usr/bin
+MANDIR = /var/share/man
+MAN1DIR = $(MANDIR)/man1
+MAN5DIR = $(MANDIR)/man5
+LDLIBS=-lncurses
+
+all: wiggle wiggle.man test
+
+vpatch : vpatch.o extract.o split.o diff.o
+
+
+# comment
+wiggle : wiggle.o load.o split.o extract.o diff.o bestmatch.o ReadMe.o merge.o
+wiggle.o load.o split.o extract.o diff.o bestmatch.o ReadMe.o merge.o : wiggle.h
+
+test: wiggle dotest
+ sh dotest
+
+wiggle.man : wiggle.1
+ nroff -man wiggle.1 > wiggle.man
+
+clean:
+ rm -f *.o *.man wiggle .version* version
+ find . -name core -o -name '*.tmp*' -o -name .tmp | xargs rm -f
+
+install : wiggle wiggle.1
+ $(INSTALL) -D $(STRIP) -m 755 wiggle $(DESTDIR)$(BINDIR)/wiggle
+ $(INSTALL) -D -m 644 wiggle.1 $(DESTDIR)$(MAN1DIR)/wiggle.1
+
+version : ReadMe.c wiggle.1 Readme.c
+ @rm -f version
+ @sed -n -e 's/.*wiggle - v\([0-9.]*\) - .*/\1/p' ReadMe.c > .version-readme
+ @sed -n -e 's/.*WIGGLE 1 "" v\([0-9.]*\)$$/\1/p' wiggle.1 > .version-man
+	@cmp -s .version-readme .version-man && cat .version-man > version || { echo Inconsistent versions.; exit 1;}
+
+dist : test clean version Test
+ mkdir -p DIST
+ rm -f DIST/wiggle-`cat version`
+ ln -s .. DIST/wiggle-`cat version`
+ tar czvf DIST/wiggle-`cat version`.tar.gz -h -C DIST --exclude RCS --exclude DIST wiggle-`cat version`
+ rm -f DIST/wiggle-`cat version`
+
+v : version
+ cat version
diff --git a/demo/README b/demo/README
new file mode 100644
index 0000000..095ed3a
--- /dev/null
+++ b/demo/README
@@ -0,0 +1,57 @@
+This is a demo file for wiggle's --browse mode.
+
+Browse mode is intended to let you look through a patch
+to see how it will apply to a set of files. It is
+possible that the patch will have some conflicts.
+i.e. the patch was created from a different version of
+the source to the version that you are applying the patch
+to. This is what makes it interesting.
+
+You can use normal cursor motion to scroll around,
+both vertically and horizontally (both EMACS and VI style).
+From the initial file-list patch, use space
+or <return> to open/close a directory or file.
+From the file-view mode, use 'q' to get back
+to the file list.
+
+Differences applied by the patch are shown as
+red for removal and blue for addition.
+Text with a pink background was not matched -
+maybe it has been changed since the patch was
+created.
+
+Green-background text is text that the patch wants
+to change, but the exact correct change has already
+been made.
+
+Capital-N might go to the next interesting chunk
+of the file.
+
+
+---------------------------------------
+This demonstrates where a diff on one line
+applies to text that is now split
+
+The swift brown fox
+jumps over the fairly lazy dog.
+
+------------
+
+This demonstrates a diff which contains some
+extraneous lines.
+
+The swift brown fox jumps over the lazy dog.
+
+There will be extra lines in the diff
+
+--------------
+
+Here are some lines
+in which there is
+a clear conflict
+
+--------------
+
+Two different lines,
+both with changes
+
diff --git a/demo/md.c b/demo/md.c
new file mode 100644
index 0000000..3b8f0f8
--- /dev/null
+++ b/demo/md.c
@@ -0,0 +1,5769 @@
+/*
+ md.c : Multiple Devices driver for Linux
+ Copyright (C) 1998, 1999, 2000 Ingo Molnar
+
+ completely rewritten, based on the MD driver code from Marc Zyngier
+
+ Changes:
+
+ - RAID-1/RAID-5 extensions by Miguel de Icaza, Gadi Oxman, Ingo Molnar
+ - RAID-6 extensions by H. Peter Anvin <hpa@zytor.com>
+ - boot support for linear and striped mode by Harald Hoyer <HarryH@Royal.Net>
+ - kerneld support by Boris Tobotras <boris@xtalk.msk.su>
+ - kmod support by: Cyrus Durgin
+ - RAID0 bugfixes: Mark Anthony Lisher <markal@iname.com>
+ - Devfs support by Richard Gooch <rgooch@atnf.csiro.au>
+
+ - lots of fixes and improvements to the RAID1/RAID5 and generic
+ RAID code (such as request based resynchronization):
+
+ Neil Brown <neilb@cse.unsw.edu.au>.
+
+ - persistent bitmap code
+ Copyright (C) 2003-2004, Paul Clements, SteelEye Technology, Inc.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2, or (at your option)
+ any later version.
+
+ You should have received a copy of the GNU General Public License
+ (for example /usr/src/linux/COPYING); if not, write to the Free
+ Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+*/
+
+#include <linux/module.h>
+#include <linux/config.h>
+#include <linux/kthread.h>
+#include <linux/linkage.h>
+#include <linux/raid/md.h>
+#include <linux/raid/bitmap.h>
+#include <linux/sysctl.h>
+#include <linux/devfs_fs_kernel.h>
+#include <linux/buffer_head.h> /* for invalidate_bdev */
+#include <linux/suspend.h>
+#include <linux/poll.h>
+#include <linux/mutex.h>
+#include <linux/ctype.h>
+
+#include <linux/init.h>
+
+#include <linux/file.h>
+
+#ifdef CONFIG_KMOD
+#include <linux/kmod.h>
+#endif
+
+#include <asm/unaligned.h>
+
+#define MAJOR_NR MD_MAJOR
+#define MD_DRIVER
+
+/* 63 partitions with the alternate major number (mdp) */
+#define MdpMinorShift 6
+
+#define DEBUG 0
+#define dprintk(x...) ((void)(DEBUG && printk(x)))
+
+
+#ifndef MODULE
+static void autostart_arrays (int part);
+#endif
+
+static LIST_HEAD(pers_list);
+static DEFINE_SPINLOCK(pers_lock);
+
+static void md_print_devices(void);
+
+#define MD_BUG(x...) { printk("md: bug in file %s, line %d\n", __FILE__, __LINE__); md_print_devices(); }
+
+/*
+ * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit'
+ * is 1000 KB/sec, so the extra system load does not show up that much.
+ * Increase it if you want to have more _guaranteed_ speed. Note that
+ * the RAID driver will use the maximum available bandwidth if the IO
+ * subsystem is idle. There is also an 'absolute maximum' reconstruction
+ * speed limit - in case reconstruction slows down your system despite
+ * idle IO detection.
+ *
+ * you can change it via /proc/sys/dev/raid/speed_limit_min and _max.
+ * or /sys/block/mdX/md/sync_speed_{min,max}
+ */
+
+static int sysctl_speed_limit_min = 1000;
+static int sysctl_speed_limit_max = 200000;
+static inline int speed_min(mddev_t *mddev)
+{
+ return mddev->sync_speed_min ?
+ mddev->sync_speed_min : sysctl_speed_limit_min;
+}
+
+static inline int speed_max(mddev_t *mddev)
+{
+ return mddev->sync_speed_max ?
+ mddev->sync_speed_max : sysctl_speed_limit_max;
+}
+
+static struct ctl_table_header *raid_table_header;
+
+static ctl_table raid_table[] = {
+ {
+ .ctl_name = DEV_RAID_SPEED_LIMIT_MIN,
+ .procname = "speed_limit_min",
+ .data = &sysctl_speed_limit_min,
+ .maxlen = sizeof(int),
+ .mode = S_IRUGO|S_IWUSR,
+ .proc_handler = &proc_dointvec,
+ },
+ {
+ .ctl_name = DEV_RAID_SPEED_LIMIT_MAX,
+ .procname = "speed_limit_max",
+ .data = &sysctl_speed_limit_max,
+ .maxlen = sizeof(int),
+ .mode = S_IRUGO|S_IWUSR,
+ .proc_handler = &proc_dointvec,
+ },
+ { .ctl_name = 0 }
+};
+
+static ctl_table raid_dir_table[] = {
+ {
+ .ctl_name = DEV_RAID,
+ .procname = "raid",
+ .maxlen = 0,
+ .mode = S_IRUGO|S_IWUGO,
+ .child = raid_table,
+ },
+ { .ctl_name = 0 }
+};
+
+static ctl_table raid_root_table[] = {
+ {
+ .ctl_name = CTL_DEV,
+ .procname = "dev",
+ .maxlen = 0,
+ .mode = 0555,
+ .child = raid_dir_table,
+ },
+ { .ctl_name = 0 }
+};
+
+static struct block_device_operations md_fops;
+
+static int start_readonly;
+
+/*
+ * We have a system wide 'event count' that is incremented
+ * on any 'interesting' event, and readers of /proc/mdstat
+ * can use 'poll' or 'select' to find out when the event
+ * count increases.
+ *
+ * Events are:
+ * start array, stop array, error, add device, remove device,
+ * start build, activate spare
+ */
+static DECLARE_WAIT_QUEUE_HEAD(md_event_waiters);
+static atomic_t md_event_count;
+void md_new_event(mddev_t *mddev)
+{
+ atomic_inc(&md_event_count);
+ wake_up(&md_event_waiters);
+ sysfs_notify(&mddev->kobj, NULL, "sync_action");
+}
+EXPORT_SYMBOL_GPL(md_new_event);
+
+/* Alternate version that can be called from interrupts
+ * when calling sysfs_notify isn't needed.
+ */
+void md_new_event_inintr(mddev_t *mddev)
+{
+ atomic_inc(&md_event_count);
+ wake_up(&md_event_waiters);
+}
+
+/*
+ * Enables to iterate over all existing md arrays
+ * all_mddevs_lock protects this list.
+ */
+static LIST_HEAD(all_mddevs);
+static DEFINE_SPINLOCK(all_mddevs_lock);
+
+
+/*
+ * iterates through all used mddevs in the system.
+ * We take care to grab the all_mddevs_lock whenever navigating
+ * the list, and to always hold a refcount when unlocked.
+ * Any code which breaks out of this loop while own
+ * a reference to the current mddev and must mddev_put it.
+ */
+#define ITERATE_MDDEV(mddev,tmp) \
+ \
+ for (({ spin_lock(&all_mddevs_lock); \
+ tmp = all_mddevs.next; \
+ mddev = NULL;}); \
+ ({ if (tmp != &all_mddevs) \
+ mddev_get(list_entry(tmp, mddev_t, all_mddevs));\
+ spin_unlock(&all_mddevs_lock); \
+ if (mddev) mddev_put(mddev); \
+ mddev = list_entry(tmp, mddev_t, all_mddevs); \
+ tmp != &all_mddevs;}); \
+ ({ spin_lock(&all_mddevs_lock); \
+ tmp = tmp->next;}) \
+ )
+
+
+static int md_fail_request (request_queue_t *q, struct bio *bio)
+{
+ bio_io_error(bio, bio->bi_size);
+ return 0;
+}
+
+static inline mddev_t *mddev_get(mddev_t *mddev)
+{
+ atomic_inc(&mddev->active);
+ return mddev;
+}
+
+static void mddev_put(mddev_t *mddev)
+{
+ if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock))
+ return;
+ if (!mddev->raid_disks && list_empty(&mddev->disks)) {
+ list_del(&mddev->all_mddevs);
+ spin_unlock(&all_mddevs_lock);
+ blk_cleanup_queue(mddev->queue);
+ kobject_unregister(&mddev->kobj);
+ } else
+ spin_unlock(&all_mddevs_lock);
+}
+
+static mddev_t * mddev_find(dev_t unit)
+{
+ mddev_t *mddev, *new = NULL;
+
+ retry:
+ spin_lock(&all_mddevs_lock);
+ list_for_each_entry(mddev, &all_mddevs, all_mddevs)
+ if (mddev->unit == unit) {
+ mddev_get(mddev);
+ spin_unlock(&all_mddevs_lock);
+ kfree(new);
+ return mddev;
+ }
+
+ if (new) {
+ list_add(&new->all_mddevs, &all_mddevs);
+ spin_unlock(&all_mddevs_lock);
+ return new;
+ }
+ spin_unlock(&all_mddevs_lock);
+
+ new = kzalloc(sizeof(*new), GFP_KERNEL);
+ if (!new)
+ return NULL;
+
+ new->unit = unit;
+ if (MAJOR(unit) == MD_MAJOR)
+ new->md_minor = MINOR(unit);
+ else
+ new->md_minor = MINOR(unit) >> MdpMinorShift;
+
+ mutex_init(&new->reconfig_mutex);
+ INIT_LIST_HEAD(&new->disks);
+ INIT_LIST_HEAD(&new->all_mddevs);
+ init_timer(&new->safemode_timer);
+ atomic_set(&new->active, 1);
+ spin_lock_init(&new->write_lock);
+ init_waitqueue_head(&new->sb_wait);
+ new->resync_max = MaxSector;
+
+ new->queue = blk_alloc_queue(GFP_KERNEL);
+ if (!new->queue) {
+ kfree(new);
+ return NULL;
+ }
+ set_bit(QUEUE_FLAG_CLUSTER, &new->queue->queue_flags);
+
+ blk_queue_make_request(new->queue, md_fail_request);
+
+ goto retry;
+}
+
+static inline int mddev_lock(mddev_t * mddev)
+{
+ return mutex_lock_interruptible(&mddev->reconfig_mutex);
+}
+
+static inline int mddev_trylock(mddev_t * mddev)
+{
+ return mutex_trylock(&mddev->reconfig_mutex);
+}
+
+static inline void mddev_unlock(mddev_t * mddev)
+{
+ mutex_unlock(&mddev->reconfig_mutex);
+
+ md_wakeup_thread(mddev->thread);
+}
+
+static mdk_rdev_t * find_rdev_nr(mddev_t *mddev, int nr)
+{
+ mdk_rdev_t * rdev;
+ struct list_head *tmp;
+
+ ITERATE_RDEV(mddev,rdev,tmp) {
+ if (rdev->desc_nr == nr)
+ return rdev;
+ }
+ return NULL;
+}
+
+static mdk_rdev_t * find_rdev(mddev_t * mddev, dev_t dev)
+{
+ struct list_head *tmp;
+ mdk_rdev_t *rdev;
+
+ ITERATE_RDEV(mddev,rdev,tmp) {
+ if (rdev->bdev->bd_dev == dev)
+ return rdev;
+ }
+ return NULL;
+}
+
+static struct mdk_personality *find_pers(int level, char *clevel)
+{
+ struct mdk_personality *pers;
+ list_for_each_entry(pers, &pers_list, list) {
+ if (level != LEVEL_NONE && pers->level == level)
+ return pers;
+ if (strcmp(pers->name, clevel)==0)
+ return pers;
+ }
+ return NULL;
+}
+
+static inline sector_t calc_dev_sboffset(struct block_device *bdev)
+{
+ sector_t size = bdev->bd_inode->i_size >> BLOCK_SIZE_BITS;
+ return MD_NEW_SIZE_BLOCKS(size);
+}
+
+static sector_t calc_dev_size(mdk_rdev_t *rdev, unsigned chunk_size)
+{
+ sector_t size;
+
+ size = rdev->sb_offset;
+
+ if (chunk_size)
+ size &= ~((sector_t)chunk_size/1024 - 1);
+ return size;
+}
+
+static int alloc_disk_sb(mdk_rdev_t * rdev)
+{
+ if (rdev->sb_page)
+ MD_BUG();
+
+ rdev->sb_page = alloc_page(GFP_KERNEL);
+ if (!rdev->sb_page) {
+ printk(KERN_ALERT "md: out of memory.\n");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static void free_disk_sb(mdk_rdev_t * rdev)
+{
+ if (rdev->sb_page) {
+ put_page(rdev->sb_page);
+ rdev->sb_loaded = 0;
+ rdev->sb_page = NULL;
+ rdev->sb_offset = 0;
+ rdev->size = 0;
+ }
+}
+
+
+static int super_written(struct bio *bio, unsigned int bytes_done, int error)
+{
+ mdk_rdev_t *rdev = bio->bi_private;
+ mddev_t *mddev = rdev->mddev;
+ if (bio->bi_size)
+ return 1;
+
+ if (error || !test_bit(BIO_UPTODATE, &bio->bi_flags))
+ md_error(mddev, rdev);
+
+ if (atomic_dec_and_test(&mddev->pending_writes))
+ wake_up(&mddev->sb_wait);
+ bio_put(bio);
+ return 0;
+}
+
+static int super_written_barrier(struct bio *bio, unsigned int bytes_done, int error)
+{
+ struct bio *bio2 = bio->bi_private;
+ mdk_rdev_t *rdev = bio2->bi_private;
+ mddev_t *mddev = rdev->mddev;
+ if (bio->bi_size)
+ return 1;
+
+ if (!test_bit(BIO_UPTODATE, &bio->bi_flags) &&
+ error == -EOPNOTSUPP) {
+ unsigned long flags;
+ /* barriers don't appear to be supported :-( */
+ set_bit(BarriersNotsupp, &rdev->flags);
+ mddev->barriers_work = 0;
+ spin_lock_irqsave(&mddev->write_lock, flags);
+ bio2->bi_next = mddev->biolist;
+ mddev->biolist = bio2;
+ spin_unlock_irqrestore(&mddev->write_lock, flags);
+ wake_up(&mddev->sb_wait);
+ bio_put(bio);
+ return 0;
+ }
+ bio_put(bio2);
+ bio->bi_private = rdev;
+ return super_written(bio, bytes_done, error);
+}
+
+void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev,
+ sector_t sector, int size, struct page *page)
+{
+ /* write first size bytes of page to sector of rdev
+ * Increment mddev->pending_writes before returning
+ * and decrement it on completion, waking up sb_wait
+ * if zero is reached.
+ * If an error occurred, call md_error
+ *
+ * As we might need to resubmit the request if BIO_RW_BARRIER
+ * causes ENOTSUPP, we allocate a spare bio...
+ */
+ struct bio *bio = bio_alloc(GFP_NOIO, 1);
+ int rw = (1<<BIO_RW) | (1<<BIO_RW_SYNC);
+
+ bio->bi_bdev = rdev->bdev;
+ bio->bi_sector = sector;
+ bio_add_page(bio, page, size, 0);
+ bio->bi_private = rdev;
+ bio->bi_end_io = super_written;
+ bio->bi_rw = rw;
+
+ atomic_inc(&mddev->pending_writes);
+ if (!test_bit(BarriersNotsupp, &rdev->flags)) {
+ struct bio *rbio;
+ rw |= (1<<BIO_RW_BARRIER);
+ rbio = bio_clone(bio, GFP_NOIO);
+ rbio->bi_private = bio;
+ rbio->bi_end_io = super_written_barrier;
+ submit_bio(rw, rbio);
+ } else
+ submit_bio(rw, bio);
+}
+
+void md_super_wait(mddev_t *mddev)
+{
+ /* wait for all superblock writes that were scheduled to complete.
+ * if any had to be retried (due to BARRIER problems), retry them
+ */
+ DEFINE_WAIT(wq);
+ for(;;) {
+ prepare_to_wait(&mddev->sb_wait, &wq, TASK_UNINTERRUPTIBLE);
+ if (atomic_read(&mddev->pending_writes)==0)
+ break;
+ while (mddev->biolist) {
+ struct bio *bio;
+ spin_lock_irq(&mddev->write_lock);
+ bio = mddev->biolist;
+ mddev->biolist = bio->bi_next ;
+ bio->bi_next = NULL;
+ spin_unlock_irq(&mddev->write_lock);
+ submit_bio(bio->bi_rw, bio);
+ }
+ schedule();
+ }
+ finish_wait(&mddev->sb_wait, &wq);
+}
+
+static int bi_complete(struct bio *bio, unsigned int bytes_done, int error)
+{
+ if (bio->bi_size)
+ return 1;
+
+ complete((struct completion*)bio->bi_private);
+ return 0;
+}
+
+int sync_page_io(struct block_device *bdev, sector_t sector, int size,
+ struct page *page, int rw)
+{
+ struct bio *bio = bio_alloc(GFP_NOIO, 1);
+ struct completion event;
+ int ret;
+
+ rw |= (1 << BIO_RW_SYNC);
+
+ bio->bi_bdev = bdev;
+ bio->bi_sector = sector;
+ bio_add_page(bio, page, size, 0);
+ init_completion(&event);
+ bio->bi_private = &event;
+ bio->bi_end_io = bi_complete;
+ submit_bio(rw, bio);
+ wait_for_completion(&event);
+
+ ret = test_bit(BIO_UPTODATE, &bio->bi_flags);
+ bio_put(bio);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(sync_page_io);
+
+static int read_disk_sb(mdk_rdev_t * rdev, int size)
+{
+ char b[BDEVNAME_SIZE];
+ if (!rdev->sb_page) {
+ MD_BUG();
+ return -EINVAL;
+ }
+ if (rdev->sb_loaded)
+ return 0;
+
+
+ if (!sync_page_io(rdev->bdev, rdev->sb_offset<<1, size, rdev->sb_page, READ))
+ goto fail;
+ rdev->sb_loaded = 1;
+ return 0;
+
+fail:
+ printk(KERN_WARNING "md: disabled device %s, could not read superblock.\n",
+ bdevname(rdev->bdev,b));
+ return -EINVAL;
+}
+
+static int uuid_equal(mdp_super_t *sb1, mdp_super_t *sb2)
+{
+ if ( (sb1->set_uuid0 == sb2->set_uuid0) &&
+ (sb1->set_uuid1 == sb2->set_uuid1) &&
+ (sb1->set_uuid2 == sb2->set_uuid2) &&
+ (sb1->set_uuid3 == sb2->set_uuid3))
+
+ return 1;
+
+ return 0;
+}
+
+
+static int sb_equal(mdp_super_t *sb1, mdp_super_t *sb2)
+{
+ int ret;
+ mdp_super_t *tmp1, *tmp2;
+
+ tmp1 = kmalloc(sizeof(*tmp1),GFP_KERNEL);
+ tmp2 = kmalloc(sizeof(*tmp2),GFP_KERNEL);
+
+ if (!tmp1 || !tmp2) {
+ ret = 0;
+ printk(KERN_INFO "md.c: sb1 is not equal to sb2!\n");
+ goto abort;
+ }
+
+ *tmp1 = *sb1;
+ *tmp2 = *sb2;
+
+ /*
+ * nr_disks is not constant
+ */
+ tmp1->nr_disks = 0;
+ tmp2->nr_disks = 0;
+
+ if (memcmp(tmp1, tmp2, MD_SB_GENERIC_CONSTANT_WORDS * 4))
+ ret = 0;
+ else
+ ret = 1;
+
+abort:
+ kfree(tmp1);
+ kfree(tmp2);
+ return ret;
+}
+
+static unsigned int calc_sb_csum(mdp_super_t * sb)
+{
+ unsigned int disk_csum, csum;
+
+ disk_csum = sb->sb_csum;
+ sb->sb_csum = 0;
+ csum = csum_partial((void *)sb, MD_SB_BYTES, 0);
+ sb->sb_csum = disk_csum;
+ return csum;
+}
+
+
+/*
+ * Handle superblock details.
+ * We want to be able to handle multiple superblock formats
+ * so we have a common interface to them all, and an array of
+ * different handlers.
+ * We rely on user-space to write the initial superblock, and support
+ * reading and updating of superblocks.
+ * Interface methods are:
+ * int load_super(mdk_rdev_t *dev, mdk_rdev_t *refdev, int minor_version)
+ * loads and validates a superblock on dev.
+ * if refdev != NULL, compare superblocks on both devices
+ * Return:
+ * 0 - dev has a superblock that is compatible with refdev
+ * 1 - dev has a superblock that is compatible and newer than refdev
+ * so dev should be used as the refdev in future
+ * -EINVAL superblock incompatible or invalid
+ * -othererror e.g. -EIO
+ *
+ * int validate_super(mddev_t *mddev, mdk_rdev_t *dev)
+ * Verify that dev is acceptable into mddev.
+ * The first time, mddev->raid_disks will be 0, and data from
+ * dev should be merged in. Subsequent calls check that dev
+ * is new enough. Return 0 or -EINVAL
+ *
+ * void sync_super(mddev_t *mddev, mdk_rdev_t *dev)
+ * Update the superblock for rdev with data in mddev
+ * This does not write to disc.
+ *
+ */
+
+struct super_type {
+ char *name;
+ struct module *owner;
+ int (*load_super)(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version);
+ int (*validate_super)(mddev_t *mddev, mdk_rdev_t *rdev);
+ void (*sync_super)(mddev_t *mddev, mdk_rdev_t *rdev);
+};
+
+/*
+ * load_super for 0.90.0
+ */
+static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
+{
+ char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
+ mdp_super_t *sb;
+ int ret;
+ sector_t sb_offset;
+
+ /*
+ * Calculate the position of the superblock,
+ * it's at the end of the disk.
+ *
+ * It also happens to be a multiple of 4Kb.
+ */
+ sb_offset = calc_dev_sboffset(rdev->bdev);
+ rdev->sb_offset = sb_offset;
+
+ ret = read_disk_sb(rdev, MD_SB_BYTES);
+ if (ret) return ret;
+
+ ret = -EINVAL;
+
+ bdevname(rdev->bdev, b);
+ sb = (mdp_super_t*)page_address(rdev->sb_page);
+
+ if (sb->md_magic != MD_SB_MAGIC) {
+ printk(KERN_ERR "md: invalid raid superblock magic on %s\n",
+ b);
+ goto abort;
+ }
+
+ if (sb->major_version != 0 ||
+ sb->minor_version < 90 ||
+ sb->minor_version > 91) {
+ printk(KERN_WARNING "Bad version number %d.%d on %s\n",
+ sb->major_version, sb->minor_version,
+ b);
+ goto abort;
+ }
+
+ if (sb->raid_disks <= 0)
+ goto abort;
+
+ if (csum_fold(calc_sb_csum(sb)) != csum_fold(sb->sb_csum)) {
+ printk(KERN_WARNING "md: invalid superblock checksum on %s\n",
+ b);
+ goto abort;
+ }
+
+ rdev->preferred_minor = sb->md_minor;
+ rdev->data_offset = 0;
+ rdev->sb_size = MD_SB_BYTES;
+
+ if (sb->level == LEVEL_MULTIPATH)
+ rdev->desc_nr = -1;
+ else
+ rdev->desc_nr = sb->this_disk.number;
+
+ if (refdev == 0)
+ ret = 1;
+ else {
+ __u64 ev1, ev2;
+ mdp_super_t *refsb = (mdp_super_t*)page_address(refdev->sb_page);
+ if (!uuid_equal(refsb, sb)) {
+ printk(KERN_WARNING "md: %s has different UUID to %s\n",
+ b, bdevname(refdev->bdev,b2));
+ goto abort;
+ }
+ if (!sb_equal(refsb, sb)) {
+ printk(KERN_WARNING "md: %s has same UUID"
+ " but different superblock to %s\n",
+ b, bdevname(refdev->bdev, b2));
+ goto abort;
+ }
+ ev1 = md_event(sb);
+ ev2 = md_event(refsb);
+ if (ev1 > ev2)
+ ret = 1;
+ else
+ ret = 0;
+ }
+ rdev->size = calc_dev_size(rdev, sb->chunk_size);
+
+ if (rdev->size < sb->size && sb->level > 1)
+ /* "this cannot possibly happen" ... */
+ ret = -EINVAL;
+
+ abort:
+ return ret;
+}
+
+/*
+ * validate_super for 0.90.0
+ */
+static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev)
+{
+ mdp_disk_t *desc;
+ mdp_super_t *sb = (mdp_super_t *)page_address(rdev->sb_page);
+ __u64 ev1 = md_event(sb);
+
+ rdev->raid_disk = -1;
+ rdev->flags = 0;
+ if (mddev->raid_disks == 0) {
+ mddev->major_version = 0;
+ mddev->minor_version = sb->minor_version;
+ mddev->patch_version = sb->patch_version;
+ mddev->persistent = 1;
+ mddev->external = 0;
+ mddev->chunk_size = sb->chunk_size;
+ mddev->ctime = sb->ctime;
+ mddev->utime = sb->utime;
+ mddev->level = sb->level;
+ mddev->clevel[0] = 0;
+ mddev->layout = sb->layout;
+ mddev->raid_disks = sb->raid_disks;
+ mddev->size = sb->size;
+ mddev->events = ev1;
+ mddev->bitmap_offset = 0;
+ mddev->default_bitmap_offset = MD_SB_BYTES >> 9;
+
+ if (mddev->minor_version >= 91) {
+ mddev->reshape_position = sb->reshape_position;
+ mddev->delta_disks = sb->delta_disks;
+ mddev->new_level = sb->new_level;
+ mddev->new_layout = sb->new_layout;
+ mddev->new_chunk = sb->new_chunk;
+ } else {
+ mddev->reshape_position = MaxSector;
+ mddev->delta_disks = 0;
+ mddev->new_level = mddev->level;
+ mddev->new_layout = mddev->layout;
+ mddev->new_chunk = mddev->chunk_size;
+ }
+
+ if (sb->state & (1<<MD_SB_CLEAN))
+ mddev->recovery_cp = MaxSector;
+ else {
+ if (sb->events_hi == sb->cp_events_hi &&
+ sb->events_lo == sb->cp_events_lo) {
+ mddev->recovery_cp = sb->recovery_cp;
+ } else
+ mddev->recovery_cp = 0;
+ }
+
+ memcpy(mddev->uuid+0, &sb->set_uuid0, 4);
+ memcpy(mddev->uuid+4, &sb->set_uuid1, 4);
+ memcpy(mddev->uuid+8, &sb->set_uuid2, 4);
+ memcpy(mddev->uuid+12,&sb->set_uuid3, 4);
+
+ mddev->max_disks = MD_SB_DISKS;
+
+ if (sb->state & (1<<MD_SB_BITMAP_PRESENT) &&
+ mddev->bitmap_file == NULL) {
+ if (mddev->level != 1 && mddev->level != 4
+ && mddev->level != 5 && mddev->level != 6
+ && mddev->level != 10) {
+ /* FIXME use a better test */
+ printk(KERN_WARNING "md: bitmaps not supported for this level.\n");
+ return -EINVAL;
+ }
+ mddev->bitmap_offset = mddev->default_bitmap_offset;
+ }
+
+ } else if (mddev->pers == NULL) {
+ /* Insist on good event counter while assembling */
+ ++ev1;
+ if (ev1 < mddev->events)
+ return -EINVAL;
+ } else if (mddev->bitmap) {
+ /* if adding to array with a bitmap, then we can accept an
+ * older device ... but not too old.
+ */
+ if (ev1 < mddev->bitmap->events_cleared)
+ return 0;
+ } else {
+ if (ev1 < mddev->events)
+ /* just a hot-add of a new device, leave raid_disk at -1 */
+ return 0;
+ }
+
+ if (mddev->level != LEVEL_MULTIPATH) {
+ desc = sb->disks + rdev->desc_nr;
+
+ if (desc->state & (1<<MD_DISK_FAULTY))
+ set_bit(Faulty, &rdev->flags);
+ else if (desc->state & (1<<MD_DISK_SYNC) /* &&
+ desc->raid_disk < mddev->raid_disks */) {
+ set_bit(In_sync, &rdev->flags);
+ rdev->raid_disk = desc->raid_disk;
+ }
+ if (desc->state & (1<<MD_DISK_WRITEMOSTLY))
+ set_bit(WriteMostly, &rdev->flags);
+ } else /* MULTIPATH are always insync */
+ set_bit(In_sync, &rdev->flags);
+ return 0;
+}
+
+/*
+ * sync_super for 0.90.0
+ */
+static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev)
+{
+ mdp_super_t *sb;
+ struct list_head *tmp;
+ mdk_rdev_t *rdev2;
+ int next_spare = mddev->raid_disks;
+
+
+ /* make rdev->sb match mddev data..
+ *
+ * 1/ zero out disks
+ * 2/ Add info for each disk, keeping track of highest desc_nr (next_spare);
+ * 3/ any empty disks < next_spare become removed
+ *
+ * disks[0] gets initialised to REMOVED because
+ * we cannot be sure from other fields if it has
+ * been initialised or not.
+ */
+ int i;
+ int active=0, working=0,failed=0,spare=0,nr_disks=0;
+
+ rdev->sb_size = MD_SB_BYTES;
+
+ sb = (mdp_super_t*)page_address(rdev->sb_page);
+
+ memset(sb, 0, sizeof(*sb));
+
+ sb->md_magic = MD_SB_MAGIC;
+ sb->major_version = mddev->major_version;
+ sb->patch_version = mddev->patch_version;
+ sb->gvalid_words = 0; /* ignored */
+ memcpy(&sb->set_uuid0, mddev->uuid+0, 4);
+ memcpy(&sb->set_uuid1, mddev->uuid+4, 4);
+ memcpy(&sb->set_uuid2, mddev->uuid+8, 4);
+ memcpy(&sb->set_uuid3, mddev->uuid+12,4);
+
+ sb->ctime = mddev->ctime;
+ sb->level = mddev->level;
+ sb->size = mddev->size;
+ sb->raid_disks = mddev->raid_disks;
+ sb->md_minor = mddev->md_minor;
+ sb->not_persistent = 0;
+ sb->utime = mddev->utime;
+ sb->state = 0;
+ sb->events_hi = (mddev->events>>32);
+ sb->events_lo = (u32)mddev->events;
+
+ if (mddev->reshape_position == MaxSector)
+ sb->minor_version = 90;
+ else {
+ sb->minor_version = 91;
+ sb->reshape_position = mddev->reshape_position;
+ sb->new_level = mddev->new_level;
+ sb->delta_disks = mddev->delta_disks;
+ sb->new_layout = mddev->new_layout;
+ sb->new_chunk = mddev->new_chunk;
+ }
+ mddev->minor_version = sb->minor_version;
+ if (mddev->in_sync)
+ {
+ sb->recovery_cp = mddev->recovery_cp;
+ sb->cp_events_hi = (mddev->events>>32);
+ sb->cp_events_lo = (u32)mddev->events;
+ if (mddev->recovery_cp == MaxSector)
+ sb->state = (1<< MD_SB_CLEAN);
+ } else
+ sb->recovery_cp = 0;
+
+ sb->layout = mddev->layout;
+ sb->chunk_size = mddev->chunk_size;
+
+ if (mddev->bitmap && mddev->bitmap_file == NULL)
+ sb->state |= (1<<MD_SB_BITMAP_PRESENT);
+
+ sb->disks[0].state = (1<<MD_DISK_REMOVED);
+ ITERATE_RDEV(mddev,rdev2,tmp) {
+ mdp_disk_t *d;
+ int desc_nr;
+ if (rdev2->raid_disk >= 0 && test_bit(In_sync, &rdev2->flags)
+ && !test_bit(Faulty, &rdev2->flags))
+ desc_nr = rdev2->raid_disk;
+ else
+ desc_nr = next_spare++;
+ rdev2->desc_nr = desc_nr;
+ d = &sb->disks[rdev2->desc_nr];
+ nr_disks++;
+ d->number = rdev2->desc_nr;
+ d->major = MAJOR(rdev2->bdev->bd_dev);
+ d->minor = MINOR(rdev2->bdev->bd_dev);
+ if (rdev2->raid_disk >= 0 && test_bit(In_sync, &rdev2->flags)
+ && !test_bit(Faulty, &rdev2->flags))
+ d->raid_disk = rdev2->raid_disk;
+ else
+ d->raid_disk = rdev2->desc_nr; /* compatibility */
+ if (test_bit(Faulty, &rdev2->flags))
+ d->state = (1<<MD_DISK_FAULTY);
+ else if (test_bit(In_sync, &rdev2->flags)) {
+ d->state = (1<<MD_DISK_ACTIVE);
+ d->state |= (1<<MD_DISK_SYNC);
+ active++;
+ working++;
+ } else {
+ d->state = 0;
+ spare++;
+ working++;
+ }
+ if (test_bit(WriteMostly, &rdev2->flags))
+ d->state |= (1<<MD_DISK_WRITEMOSTLY);
+ }
+ /* now set the "removed" and "faulty" bits on any missing devices */
+ for (i=0 ; i < mddev->raid_disks ; i++) {
+ mdp_disk_t *d = &sb->disks[i];
+ if (d->state == 0 && d->number == 0) {
+ d->number = i;
+ d->raid_disk = i;
+ d->state = (1<<MD_DISK_REMOVED);
+ d->state |= (1<<MD_DISK_FAULTY);
+ failed++;
+ }
+ }
+ sb->nr_disks = nr_disks;
+ sb->active_disks = active;
+ sb->working_disks = working;
+ sb->failed_disks = failed;
+ sb->spare_disks = spare;
+
+ sb->this_disk = sb->disks[rdev->desc_nr];
+ sb->sb_csum = calc_sb_csum(sb);
+}
+
+/*
+ * version 1 superblock
+ */
+
+static unsigned int calc_sb_1_csum(struct mdp_superblock_1 * sb)
+{
+ unsigned int disk_csum, csum;
+ unsigned long long newcsum;
+ int size = 256 + le32_to_cpu(sb->max_dev)*2;
+ unsigned int *isuper = (unsigned int*)sb;
+ int i;
+
+ disk_csum = sb->sb_csum;
+ sb->sb_csum = 0;
+ newcsum = 0;
+ for (i=0; size>=4; size -= 4 )
+ newcsum += le32_to_cpu(*isuper++);
+
+ if (size == 2)
+ newcsum += le16_to_cpu(*(unsigned short*) isuper);
+
+ csum = (newcsum & 0xffffffff) + (newcsum >> 32);
+ sb->sb_csum = disk_csum;
+ return cpu_to_le32(csum);
+}
+
+static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
+{
+ struct mdp_superblock_1 *sb;
+ int ret;
+ sector_t sb_offset;
+ char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
+ int bmask;
+
+ /*
+ * Calculate the position of the superblock.
+ * It is always aligned to a 4K boundary and
+ * depeding on minor_version, it can be:
+ * 0: At least 8K, but less than 12K, from end of device
+ * 1: At start of device
+ * 2: 4K from start of device.
+ */
+ switch(minor_version) {
+ case 0:
+ sb_offset = rdev->bdev->bd_inode->i_size >> 9;
+ sb_offset -= 8*2;
+ sb_offset &= ~(sector_t)(4*2-1);
+ /* convert from sectors to K */
+ sb_offset /= 2;
+ break;
+ case 1:
+ sb_offset = 0;
+ break;
+ case 2:
+ sb_offset = 4;
+ break;
+ default:
+ return -EINVAL;
+ }
+ rdev->sb_offset = sb_offset;
+
+ /* superblock is rarely larger than 1K, but it can be larger,
+ * and it is safe to read 4k, so we do that
+ */
+ ret = read_disk_sb(rdev, 4096);
+ if (ret) return ret;
+
+
+ sb = (struct mdp_superblock_1*)page_address(rdev->sb_page);
+
+ if (sb->magic != cpu_to_le32(MD_SB_MAGIC) ||
+ sb->major_version != cpu_to_le32(1) ||
+ le32_to_cpu(sb->max_dev) > (4096-256)/2 ||
+ le64_to_cpu(sb->super_offset) != (rdev->sb_offset<<1) ||
+ (le32_to_cpu(sb->feature_map) & ~MD_FEATURE_ALL) != 0)
+ return -EINVAL;
+
+ if (calc_sb_1_csum(sb) != sb->sb_csum) {
+ printk("md: invalid superblock checksum on %s\n",
+ bdevname(rdev->bdev,b));
+ return -EINVAL;
+ }
+ if (le64_to_cpu(sb->data_size) < 10) {
+ printk("md: data_size too small on %s\n",
+ bdevname(rdev->bdev,b));
+ return -EINVAL;
+ }
+ rdev->preferred_minor = 0xffff;
+ rdev->data_offset = le64_to_cpu(sb->data_offset);
+ atomic_set(&rdev->corrected_errors, le32_to_cpu(sb->cnt_corrected_read));
+
+ rdev->sb_size = le32_to_cpu(sb->max_dev) * 2 + 256;
+ bmask = queue_hardsect_size(rdev->bdev->bd_disk->queue)-1;
+ if (rdev->sb_size & bmask)
+ rdev-> sb_size = (rdev->sb_size | bmask)+1;
+
+ if (refdev == 0)
+ ret = 1;
+ else {
+ __u64 ev1, ev2;
+ struct mdp_superblock_1 *refsb =
+ (struct mdp_superblock_1*)page_address(refdev->sb_page);
+
+ if (memcmp(sb->set_uuid, refsb->set_uuid, 16) != 0 ||
+ sb->level != refsb->level ||
+ sb->layout != refsb->layout ||
+ sb->chunksize != refsb->chunksize) {
+ printk(KERN_WARNING "md: %s has strangely different"
+ " superblock to %s\n",
+ bdevname(rdev->bdev,b),
+ bdevname(refdev->bdev,b2));
+ return -EINVAL;
+ }
+ ev1 = le64_to_cpu(sb->events);
+ ev2 = le64_to_cpu(refsb->events);
+
+ if (ev1 > ev2)
+ ret = 1;
+ else
+ ret = 0;
+ }
+ if (minor_version)
+ rdev->size = ((rdev->bdev->bd_inode->i_size>>9) - le64_to_cpu(sb->data_offset)) / 2;
+ else
+ rdev->size = rdev->sb_offset;
+ if (rdev->size < le64_to_cpu(sb->data_size)/2)
+ return -EINVAL;
+ rdev->size = le64_to_cpu(sb->data_size)/2;
+ if (le32_to_cpu(sb->chunksize))
+ rdev->size &= ~((sector_t)le32_to_cpu(sb->chunksize)/2 - 1);
+
+ if (le32_to_cpu(sb->size) > rdev->size*2)
+ return -EINVAL;
+ return ret;
+}
+
+static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev)
+{
+ struct mdp_superblock_1 *sb = (struct mdp_superblock_1*)page_address(rdev->sb_page);
+ __u64 ev1 = le64_to_cpu(sb->events);
+
+ rdev->raid_disk = -1;
+ rdev->flags = 0;
+ if (mddev->raid_disks == 0) {
+ mddev->major_version = 1;
+ mddev->patch_version = 0;
+ mddev->persistent = 1;
+ mddev->external = 0;
+ mddev->chunk_size = le32_to_cpu(sb->chunksize) << 9;
+ mddev->ctime = le64_to_cpu(sb->ctime) & ((1ULL << 32)-1);
+ mddev->utime = le64_to_cpu(sb->utime) & ((1ULL << 32)-1);
+ mddev->level = le32_to_cpu(sb->level);
+ mddev->clevel[0] = 0;
+ mddev->layout = le32_to_cpu(sb->layout);
+ mddev->raid_disks = le32_to_cpu(sb->raid_disks);
+ mddev->size = le64_to_cpu(sb->size)/2;
+ mddev->events = ev1;
+ mddev->bitmap_offset = 0;
+ mddev->default_bitmap_offset = 1024 >> 9;
+
+ mddev->recovery_cp = le64_to_cpu(sb->resync_offset);
+ memcpy(mddev->uuid, sb->set_uuid, 16);
+
+ mddev->max_disks = (4096-256)/2;
+
+ if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET) &&
+ mddev->bitmap_file == NULL ) {
+ if (mddev->level != 1 && mddev->level != 5 && mddev->level != 6
+ && mddev->level != 10) {
+ printk(KERN_WARNING "md: bitmaps not supported for this level.\n");
+ return -EINVAL;
+ }
+ mddev->bitmap_offset = (__s32)le32_to_cpu(sb->bitmap_offset);
+ }
+ if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) {
+ mddev->reshape_position = le64_to_cpu(sb->reshape_position);
+ mddev->delta_disks = le32_to_cpu(sb->delta_disks);
+ mddev->new_level = le32_to_cpu(sb->new_level);
+ mddev->new_layout = le32_to_cpu(sb->new_layout);
+ mddev->new_chunk = le32_to_cpu(sb->new_chunk)<<9;
+ } else {
+ mddev->reshape_position = MaxSector;
+ mddev->delta_disks = 0;
+ mddev->new_level = mddev->level;
+ mddev->new_layout = mddev->layout;
+ mddev->new_chunk = mddev->chunk_size;
+ }
+
+ } else if (mddev->pers == NULL) {
+ /* Insist of good event counter while assembling */
+ ++ev1;
+ if (ev1 < mddev->events)
+ return -EINVAL;
+ } else if (mddev->bitmap) {
+ /* If adding to array with a bitmap, then we can accept an
+ * older device, but not too old.
+ */
+ if (ev1 < mddev->bitmap->events_cleared)
+ return 0;
+ } else {
+ if (ev1 < mddev->events)
+ /* just a hot-add of a new device, leave raid_disk at -1 */
+ return 0;
+ }
+ if (mddev->level != LEVEL_MULTIPATH) {
+ int role;
+ rdev->desc_nr = le32_to_cpu(sb->dev_number);
+ role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]);
+ switch(role) {
+ case 0xffff: /* spare */
+ break;
+ case 0xfffe: /* faulty */
+ set_bit(Faulty, &rdev->flags);
+ break;
+ default:
+ if ((le32_to_cpu(sb->feature_map) &
+ MD_FEATURE_RECOVERY_OFFSET))
+ rdev->recovery_offset = le64_to_cpu(sb->recovery_offset);
+ else
+ set_bit(In_sync, &rdev->flags);
+ rdev->raid_disk = role;
+ break;
+ }
+ if (sb->devflags & WriteMostly1)
+ set_bit(WriteMostly, &rdev->flags);
+ } else /* MULTIPATH are always insync */
+ set_bit(In_sync, &rdev->flags);
+
+ return 0;
+}
+
+static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev)
+{
+ struct mdp_superblock_1 *sb;
+ struct list_head *tmp;
+ mdk_rdev_t *rdev2;
+ int max_dev, i;
+ /* make rdev->sb match mddev and rdev data. */
+
+ sb = (struct mdp_superblock_1*)page_address(rdev->sb_page);
+
+ sb->feature_map = 0;
+ sb->pad0 = 0;
+ sb->recovery_offset = cpu_to_le64(0);
+ memset(sb->pad1, 0, sizeof(sb->pad1));
+ memset(sb->pad2, 0, sizeof(sb->pad2));
+ memset(sb->pad3, 0, sizeof(sb->pad3));
+
+ sb->utime = cpu_to_le64((__u64)mddev->utime);
+ sb->events = cpu_to_le64(mddev->events);
+ if (mddev->in_sync)
+ sb->resync_offset = cpu_to_le64(mddev->recovery_cp);
+ else
+ sb->resync_offset = cpu_to_le64(0);
+
+ sb->cnt_corrected_read = atomic_read(&rdev->corrected_errors);
+
+ sb->raid_disks = cpu_to_le32(mddev->raid_disks);
+ sb->size = cpu_to_le64(mddev->size<<1);
+
+ if (mddev->bitmap && mddev->bitmap_file == NULL) {
+ sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_offset);
+ sb->feature_map = cpu_to_le32(MD_FEATURE_BITMAP_OFFSET);
+ }
+
+ if (rdev->raid_disk >= 0 &&
+ !test_bit(In_sync, &rdev->flags) &&
+ rdev->recovery_offset > 0) {
+ sb->feature_map |= cpu_to_le32(MD_FEATURE_RECOVERY_OFFSET);
+ sb->recovery_offset = cpu_to_le64(rdev->recovery_offset);
+ }
+
+ if (mddev->reshape_position != MaxSector) {
+ sb->feature_map |= cpu_to_le32(MD_FEATURE_RESHAPE_ACTIVE);
+ sb->reshape_position = cpu_to_le64(mddev->reshape_position);
+ sb->new_layout = cpu_to_le32(mddev->new_layout);
+ sb->delta_disks = cpu_to_le32(mddev->delta_disks);
+ sb->new_level = cpu_to_le32(mddev->new_level);
+ sb->new_chunk = cpu_to_le32(mddev->new_chunk>>9);
+ }
+
+ max_dev = 0;
+ ITERATE_RDEV(mddev,rdev2,tmp)
+ if (rdev2->desc_nr+1 > max_dev)
+ max_dev = rdev2->desc_nr+1;
+
+ sb->max_dev = cpu_to_le32(max_dev);
+ for (i=0; i<max_dev;i++)
+ sb->dev_roles[i] = cpu_to_le16(0xfffe);
+
+ ITERATE_RDEV(mddev,rdev2,tmp) {
+ i = rdev2->desc_nr;
+ if (test_bit(Faulty, &rdev2->flags))
+ sb->dev_roles[i] = cpu_to_le16(0xfffe);
+ else if (test_bit(In_sync, &rdev2->flags))
+ sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk);
+ else if (rdev2->raid_disk >= 0 && rdev2->recovery_offset > 0)
+ sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk);
+ else
+ sb->dev_roles[i] = cpu_to_le16(0xffff);
+ }
+
+ sb->sb_csum = calc_sb_1_csum(sb);
+}
+
+
+static struct super_type super_types[] = {
+ [0] = {
+ .name = "0.90.0",
+ .owner = THIS_MODULE,
+ .load_super = super_90_load,
+ .validate_super = super_90_validate,
+ .sync_super = super_90_sync,
+ },
+ [1] = {
+ .name = "md-1",
+ .owner = THIS_MODULE,
+ .load_super = super_1_load,
+ .validate_super = super_1_validate,
+ .sync_super = super_1_sync,
+ },
+};
+
+static mdk_rdev_t * match_dev_unit(mddev_t *mddev, mdk_rdev_t *dev)
+{
+ struct list_head *tmp;
+ mdk_rdev_t *rdev;
+
+ ITERATE_RDEV(mddev,rdev,tmp)
+ if (rdev->bdev->bd_contains == dev->bdev->bd_contains)
+ return rdev;
+
+ return NULL;
+}
+
+static int match_mddev_units(mddev_t *mddev1, mddev_t *mddev2)
+{
+ struct list_head *tmp;
+ mdk_rdev_t *rdev;
+
+ ITERATE_RDEV(mddev1,rdev,tmp)
+ if (match_dev_unit(mddev2, rdev))
+ return 1;
+
+ return 0;
+}
+
+static LIST_HEAD(pending_raid_disks);
+
+static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev)
+{
+ mdk_rdev_t *same_pdev;
+ char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
+ struct kobject *ko;
+ char *s;
+
+ if (rdev->mddev) {
+ MD_BUG();
+ return -EINVAL;
+ }
+ /* make sure rdev->size exceeds mddev->size */
+ if (rdev->size && (mddev->size == 0 || rdev->size < mddev->size)) {
+ if (mddev->pers)
+ /* Cannot change size, so fail */
+ return -ENOSPC;
+ else
+ mddev->size = rdev->size;
+ }
+ same_pdev = match_dev_unit(mddev, rdev);
+ if (same_pdev)
+ printk(KERN_WARNING
+ "%s: WARNING: %s appears to be on the same physical"
+ " disk as %s. True\n protection against single-disk"
+ " failure might be compromised.\n",
+ mdname(mddev), bdevname(rdev->bdev,b),
+ bdevname(same_pdev->bdev,b2));
+
+ /* Verify rdev->desc_nr is unique.
+ * If it is -1, assign a free number, else
+ * check number is not in use
+ */
+ if (rdev->desc_nr < 0) {
+ int choice = 0;
+ if (mddev->pers) choice = mddev->raid_disks;
+ while (find_rdev_nr(mddev, choice))
+ choice++;
+ rdev->desc_nr = choice;
+ } else {
+ if (find_rdev_nr(mddev, rdev->desc_nr))
+ return -EBUSY;
+ }
+ bdevname(rdev->bdev,b);
+ if (kobject_set_name(&rdev->kobj, "dev-%s", b) < 0)
+ return -ENOMEM;
+ while ( (s=strchr(rdev->kobj.k_name, '/')) != NULL)
+ *s = '!';
+
+ list_add(&rdev->same_set, &mddev->disks);
+ rdev->mddev = mddev;
+ printk(KERN_INFO "md: bind<%s>\n", b);
+
+ rdev->kobj.parent = &mddev->kobj;
+ kobject_add(&rdev->kobj);
+
+ if (rdev->bdev->bd_part)
+ ko = &rdev->bdev->bd_part->kobj;
+ else
+ ko = &rdev->bdev->bd_disk->kobj;
+ sysfs_create_link(&rdev->kobj, ko, "block");
+ bd_claim_by_disk(rdev->bdev, rdev, mddev->gendisk);
+ return 0;
+}
+
+static void unbind_rdev_from_array(mdk_rdev_t * rdev)
+{
+ char b[BDEVNAME_SIZE];
+ if (!rdev->mddev) {
+ MD_BUG();
+ return;
+ }
+ bd_release_from_disk(rdev->bdev, rdev->mddev->gendisk);
+ list_del_init(&rdev->same_set);
+ printk(KERN_INFO "md: unbind<%s>\n", bdevname(rdev->bdev,b));
+ rdev->mddev = NULL;
+ sysfs_remove_link(&rdev->kobj, "block");
+ kobject_del(&rdev->kobj);
+}
+
+/*
+ * prevent the device from being mounted, repartitioned or
+ * otherwise reused by a RAID array (or any other kernel
+ * subsystem), by bd_claiming the device.
+ */
+static int lock_rdev(mdk_rdev_t *rdev, dev_t dev)
+{
+ int err = 0;
+ struct block_device *bdev;
+ char b[BDEVNAME_SIZE];
+
+ bdev = open_partition_by_devnum(dev, FMODE_READ|FMODE_WRITE);
+ if (IS_ERR(bdev)) {
+ printk(KERN_ERR "md: could not open %s.\n",
+ __bdevname(dev, b));
+ return PTR_ERR(bdev);
+ }
+ err = bd_claim(bdev, rdev);
+ if (err) {
+ printk(KERN_ERR "md: could not bd_claim %s.\n",
+ bdevname(bdev, b));
+ blkdev_put_partition(bdev);
+ return err;
+ }
+ rdev->bdev = bdev;
+ return err;
+}
+
+static void unlock_rdev(mdk_rdev_t *rdev)
+{
+ struct block_device *bdev = rdev->bdev;
+ rdev->bdev = NULL;
+ if (!bdev)
+ MD_BUG();
+ bd_release(bdev);
+ blkdev_put_partition(bdev);
+}
+
+void md_autodetect_dev(dev_t dev);
+
+static void export_rdev(mdk_rdev_t * rdev)
+{
+ char b[BDEVNAME_SIZE];
+ printk(KERN_INFO "md: export_rdev(%s)\n",
+ bdevname(rdev->bdev,b));
+ if (rdev->mddev)
+ MD_BUG();
+ free_disk_sb(rdev);
+ list_del_init(&rdev->same_set);
+#ifndef MODULE
+ md_autodetect_dev(rdev->bdev->bd_dev);
+#endif
+ unlock_rdev(rdev);
+ kobject_put(&rdev->kobj);
+}
+
+static void kick_rdev_from_array(mdk_rdev_t * rdev)
+{
+ unbind_rdev_from_array(rdev);
+ export_rdev(rdev);
+}
+
+static void export_array(mddev_t *mddev)
+{
+ struct list_head *tmp;
+ mdk_rdev_t *rdev;
+
+ ITERATE_RDEV(mddev,rdev,tmp) {
+ if (!rdev->mddev) {
+ MD_BUG();
+ continue;
+ }
+ kick_rdev_from_array(rdev);
+ }
+ if (!list_empty(&mddev->disks))
+ MD_BUG();
+ mddev->raid_disks = 0;
+ mddev->major_version = 0;
+}
+
+static void print_desc(mdp_disk_t *desc)
+{
+ printk(" DISK<N:%d,(%d,%d),R:%d,S:%d>\n", desc->number,
+ desc->major,desc->minor,desc->raid_disk,desc->state);
+}
+
+static void print_sb(mdp_super_t *sb)
+{
+ int i;
+
+ printk(KERN_INFO
+ "md: SB: (V:%d.%d.%d) ID:<%08x.%08x.%08x.%08x> CT:%08x\n",
+ sb->major_version, sb->minor_version, sb->patch_version,
+ sb->set_uuid0, sb->set_uuid1, sb->set_uuid2, sb->set_uuid3,
+ sb->ctime);
+ printk(KERN_INFO "md: L%d S%08d ND:%d RD:%d md%d LO:%d CS:%d\n",
+ sb->level, sb->size, sb->nr_disks, sb->raid_disks,
+ sb->md_minor, sb->layout, sb->chunk_size);
+ printk(KERN_INFO "md: UT:%08x ST:%d AD:%d WD:%d"
+ " FD:%d SD:%d CSUM:%08x E:%08lx\n",
+ sb->utime, sb->state, sb->active_disks, sb->working_disks,
+ sb->failed_disks, sb->spare_disks,
+ sb->sb_csum, (unsigned long)sb->events_lo);
+
+ printk(KERN_INFO);
+ for (i = 0; i < MD_SB_DISKS; i++) {
+ mdp_disk_t *desc;
+
+ desc = sb->disks + i;
+ if (desc->number || desc->major || desc->minor ||
+ desc->raid_disk || (desc->state && (desc->state != 4))) {
+ printk(" D %2d: ", i);
+ print_desc(desc);
+ }
+ }
+ printk(KERN_INFO "md: THIS: ");
+ print_desc(&sb->this_disk);
+
+}
+
+static void print_rdev(mdk_rdev_t *rdev)
+{
+ char b[BDEVNAME_SIZE];
+ printk(KERN_INFO "md: rdev %s, SZ:%08llu F:%d S:%d DN:%u\n",
+ bdevname(rdev->bdev,b), (unsigned long long)rdev->size,
+ test_bit(Faulty, &rdev->flags), test_bit(In_sync, &rdev->flags),
+ rdev->desc_nr);
+ if (rdev->sb_loaded) {
+ printk(KERN_INFO "md: rdev superblock:\n");
+ print_sb((mdp_super_t*)page_address(rdev->sb_page));
+ } else
+ printk(KERN_INFO "md: no rdev superblock!\n");
+}
+
+static void md_print_devices(void)
+{
+ struct list_head *tmp, *tmp2;
+ mdk_rdev_t *rdev;
+ mddev_t *mddev;
+ char b[BDEVNAME_SIZE];
+
+ printk("\n");
+ printk("md: **********************************\n");
+ printk("md: * <COMPLETE RAID STATE PRINTOUT> *\n");
+ printk("md: **********************************\n");
+ ITERATE_MDDEV(mddev,tmp) {
+
+ if (mddev->bitmap)
+ bitmap_print_sb(mddev->bitmap);
+ else
+ printk("%s: ", mdname(mddev));
+ ITERATE_RDEV(mddev,rdev,tmp2)
+ printk("<%s>", bdevname(rdev->bdev,b));
+ printk("\n");
+
+ ITERATE_RDEV(mddev,rdev,tmp2)
+ print_rdev(rdev);
+ }
+ printk("md: **********************************\n");
+ printk("\n");
+}
+
+
+static void sync_sbs(mddev_t * mddev, int nospares)
+{
+ /* Update each superblock (in-memory image), but
+ * if we are allowed to, skip spares which already
+ * have the right event counter, or have one earlier
+ * (which would mean they aren't being marked as dirty
+ * with the rest of the array)
+ */
+ mdk_rdev_t *rdev;
+ struct list_head *tmp;
+
+ ITERATE_RDEV(mddev,rdev,tmp) {
+ if (rdev->sb_events == mddev->events ||
+ (nospares &&
+ rdev->raid_disk < 0 &&
+ (rdev->sb_events&1)==0 &&
+ rdev->sb_events+1 == mddev->events)) {
+ /* Don't update this superblock */
+ rdev->sb_loaded = 2;
+ } else {
+ super_types[mddev->major_version].
+ sync_super(mddev, rdev);
+ rdev->sb_loaded = 1;
+ }
+ }
+}
+
+void md_update_sb(mddev_t * mddev)
+{
+ int err;
+ struct list_head *tmp;
+ mdk_rdev_t *rdev;
+ int sync_req;
+ int nospares = 0;
+
+repeat:
+ spin_lock_irq(&mddev->write_lock);
+ sync_req = mddev->in_sync;
+ mddev->utime = get_seconds();
+ if (mddev->sb_dirty == 3)
+ /* just a clean<-> dirty transition, possibly leave spares alone,
+ * though if events isn't the right even/odd, we will have to do
+ * spares after all
+ */
+ nospares = 1;
+
+ /* If this is just a dirty<->clean transition, and the array is clean
+ * and 'events' is odd, we can roll back to the previous clean state */
+ if (mddev->sb_dirty == 3
+ && (mddev->in_sync && mddev->recovery_cp == MaxSector)
+ && (mddev->events & 1))
+ mddev->events--;
+ else {
+ /* otherwise we have to go forward and ... */
+ mddev->events ++;
+ if (!mddev->in_sync || mddev->recovery_cp != MaxSector) { /* not clean */
+ /* .. if the array isn't clean, insist on an odd 'events' */
+ if ((mddev->events&1)==0) {
+ mddev->events++;
+ nospares = 0;
+ }
+ } else {
+ /* otherwise insist on an even 'events' (for clean states) */
+ if ((mddev->events&1)) {
+ mddev->events++;
+ nospares = 0;
+ }
+ }
+ }
+
+ if (!mddev->events) {
+ /*
+ * oops, this 64-bit counter should never wrap.
+ * Either we are in around ~1 trillion A.C., assuming
+ * 1 reboot per second, or we have a bug:
+ */
+ MD_BUG();
+ mddev->events --;
+ }
+
+ /*
+ * do not write anything to disk if using
+ * nonpersistent superblocks
+ */
+ if (!mddev->persistent) {
+ if (!mddev->external)
+ mddev->sb_dirty = 0;
+ spin_unlock_irq(&mddev->write_lock);
+ wake_up(&mddev->sb_wait);
+ return;
+ }
+ mddev->sb_dirty = 2;
+ sync_sbs(mddev, nospares);
+ spin_unlock_irq(&mddev->write_lock);
+
+ dprintk(KERN_INFO
+ "md: updating %s RAID superblock on device (in sync %d)\n",
+ mdname(mddev),mddev->in_sync);
+
+ err = bitmap_update_sb(mddev->bitmap);
+ ITERATE_RDEV(mddev,rdev,tmp) {
+ char b[BDEVNAME_SIZE];
+ dprintk(KERN_INFO "md: ");
+ if (rdev->sb_loaded != 1)
+ continue; /* no noise on spare devices */
+ if (test_bit(Faulty, &rdev->flags))
+ dprintk("(skipping faulty ");
+
+ dprintk("%s ", bdevname(rdev->bdev,b));
+ if (!test_bit(Faulty, &rdev->flags)) {
+ md_super_write(mddev,rdev,
+ rdev->sb_offset<<1, rdev->sb_size,
+ rdev->sb_page);
+ dprintk(KERN_INFO "(write) %s's sb offset: %llu\n",
+ bdevname(rdev->bdev,b),
+ (unsigned long long)rdev->sb_offset);
+ rdev->sb_events = mddev->events;
+
+ } else
+ dprintk(")\n");
+ if (mddev->level == LEVEL_MULTIPATH)
+ /* only need to write one superblock... */
+ break;
+ }
+ md_super_wait(mddev);
+ /* if there was a failure, sb_dirty was set to 1, and we re-write super */
+
+ spin_lock_irq(&mddev->write_lock);
+ if (mddev->in_sync != sync_req|| mddev->sb_dirty == 1) {
+ /* have to write it out again */
+ spin_unlock_irq(&mddev->write_lock);
+ goto repeat;
+ }
+ mddev->sb_dirty = 0;
+ spin_unlock_irq(&mddev->write_lock);
+ wake_up(&mddev->sb_wait);
+
+}
+EXPORT_SYMBOL_GPL(md_update_sb);
+
+/* Words written to sysfs files may, or may not, be \n terminated.
+ * We want to accept either case. For this we use cmd_match.
+ */
+static int cmd_match(const char *cmd, const char *str)
+{
+ /* See if cmd, written into a sysfs file, matches
+ * str. They must either be the same, or cmd can
+ * have a trailing newline
+ */
+ while (*cmd && *str && *cmd == *str) {
+ cmd++;
+ str++;
+ }
+ if (*cmd == '\n')
+ cmd++;
+ if (*str || *cmd)
+ return 0;
+ return 1;
+}
+
+struct rdev_sysfs_entry {
+ struct attribute attr;
+ ssize_t (*show)(mdk_rdev_t *, char *);
+ ssize_t (*store)(mdk_rdev_t *, const char *, size_t);
+};
+
+static ssize_t
+state_show(mdk_rdev_t *rdev, char *page)
+{
+ char *sep = "";
+ int len=0;
+
+ if (test_bit(Faulty, &rdev->flags)) {
+ len+= sprintf(page+len, "%sfaulty",sep);
+ sep = ",";
+ }
+ if (test_bit(In_sync, &rdev->flags)) {
+ len += sprintf(page+len, "%sin_sync",sep);
+ sep = ",";
+ }
+ if (test_bit(WriteMostly, &rdev->flags)) {
+ len += sprintf(page+len, "%swrite_mostly",sep);
+ sep = ",";
+ }
+ if (!test_bit(Faulty, &rdev->flags) &&
+ !test_bit(In_sync, &rdev->flags)) {
+ len += sprintf(page+len, "%sspare", sep);
+ sep = ",";
+ }
+ return len+sprintf(page+len, "\n");
+}
+
+static ssize_t
+state_store(mdk_rdev_t *rdev, const char *buf, size_t len)
+{
+ /* can write
+	 * faulty - simulates an error
+ * remove - disconnects the device
+ * writemostly - sets write_mostly
+ * -writemostly - clears write_mostly
+ */
+ int err = -EINVAL;
+ if (cmd_match(buf, "faulty") && rdev->mddev->pers) {
+ md_error(rdev->mddev, rdev);
+ err = 0;
+ } else if (cmd_match(buf, "remove")) {
+ if (rdev->raid_disk >= 0)
+ err = -EBUSY;
+ else {
+ mddev_t *mddev = rdev->mddev;
+ kick_rdev_from_array(rdev);
+ md_update_sb(mddev);
+ md_new_event(mddev);
+ err = 0;
+ }
+ } else if (cmd_match(buf, "writemostly")) {
+ set_bit(WriteMostly, &rdev->flags);
+ err = 0;
+ } else if (cmd_match(buf, "-writemostly")) {
+ clear_bit(WriteMostly, &rdev->flags);
+ err = 0;
+ }
+ return err ? err : len;
+}
+static struct rdev_sysfs_entry rdev_state =
+__ATTR(state, S_IRUGO|S_IWUSR, state_show, state_store);
+
+static ssize_t
+super_show(mdk_rdev_t *rdev, char *page)
+{
+ if (rdev->sb_loaded && rdev->sb_size) {
+ memcpy(page, page_address(rdev->sb_page), rdev->sb_size);
+ return rdev->sb_size;
+ } else
+ return 0;
+}
+static struct rdev_sysfs_entry rdev_super = __ATTR_RO(super);
+
+static ssize_t
+errors_show(mdk_rdev_t *rdev, char *page)
+{
+ return sprintf(page, "%d\n", atomic_read(&rdev->corrected_errors));
+}
+
+static ssize_t
+errors_store(mdk_rdev_t *rdev, const char *buf, size_t len)
+{
+ char *e;
+ unsigned long n = simple_strtoul(buf, &e, 10);
+ if (*buf && (*e == 0 || *e == '\n')) {
+ atomic_set(&rdev->corrected_errors, n);
+ return len;
+ }
+ return -EINVAL;
+}
+static struct rdev_sysfs_entry rdev_errors =
+__ATTR(errors, S_IRUGO|S_IWUSR, errors_show, errors_store);
+
+static ssize_t
+slot_show(mdk_rdev_t *rdev, char *page)
+{
+ if (rdev->raid_disk < 0)
+ return sprintf(page, "none\n");
+ else
+ return sprintf(page, "%d\n", rdev->raid_disk);
+}
+
+static ssize_t
+slot_store(mdk_rdev_t *rdev, const char *buf, size_t len)
+{
+ char *e;
+ int slot = simple_strtoul(buf, &e, 10);
+ if (strncmp(buf, "none", 4)==0)
+ slot = -1;
+ else if (e==buf || (*e && *e!= '\n'))
+ return -EINVAL;
+ if (rdev->mddev->pers)
+ /* Cannot set slot in active array (yet) */
+ return -EBUSY;
+ if (slot >= rdev->mddev->raid_disks)
+ return -ENOSPC;
+ rdev->raid_disk = slot;
+ /* assume it is working */
+ rdev->flags = 0;
+ set_bit(In_sync, &rdev->flags);
+ return len;
+}
+
+
+static struct rdev_sysfs_entry rdev_slot =
+__ATTR(slot, S_IRUGO|S_IWUSR, slot_show, slot_store);
+
+static ssize_t
+offset_show(mdk_rdev_t *rdev, char *page)
+{
+ return sprintf(page, "%llu\n", (unsigned long long)rdev->data_offset);
+}
+
+static ssize_t
+offset_store(mdk_rdev_t *rdev, const char *buf, size_t len)
+{
+ char *e;
+ unsigned long long offset = simple_strtoull(buf, &e, 10);
+ if (e==buf || (*e && *e != '\n'))
+ return -EINVAL;
+ if (rdev->mddev->pers)
+ return -EBUSY;
+ rdev->data_offset = offset;
+ return len;
+}
+
+static struct rdev_sysfs_entry rdev_offset =
+__ATTR(offset, S_IRUGO|S_IWUSR, offset_show, offset_store);
+
+static ssize_t
+rdev_size_show(mdk_rdev_t *rdev, char *page)
+{
+ return sprintf(page, "%llu\n", (unsigned long long)rdev->size);
+}
+
+static ssize_t
+rdev_size_store(mdk_rdev_t *rdev, const char *buf, size_t len)
+{
+ char *e;
+ unsigned long long size = simple_strtoull(buf, &e, 10);
+ if (e==buf || (*e && *e != '\n'))
+ return -EINVAL;
+ if (rdev->mddev->pers)
+ return -EBUSY;
+ rdev->size = size;
+ if (size < rdev->mddev->size || rdev->mddev->size == 0)
+ rdev->mddev->size = size;
+ return len;
+}
+
+static struct rdev_sysfs_entry rdev_size =
+__ATTR(size, S_IRUGO|S_IWUSR, rdev_size_show, rdev_size_store);
+
+static struct attribute *rdev_default_attrs[] = {
+ &rdev_state.attr,
+ &rdev_super.attr,
+ &rdev_errors.attr,
+ &rdev_slot.attr,
+ &rdev_offset.attr,
+ &rdev_size.attr,
+ NULL,
+};
+static ssize_t
+rdev_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
+{
+ struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr);
+ mdk_rdev_t *rdev = container_of(kobj, mdk_rdev_t, kobj);
+
+ if (!entry->show)
+ return -EIO;
+ return entry->show(rdev, page);
+}
+
+static ssize_t
+rdev_attr_store(struct kobject *kobj, struct attribute *attr,
+ const char *page, size_t length)
+{
+ struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr);
+ mdk_rdev_t *rdev = container_of(kobj, mdk_rdev_t, kobj);
+
+ if (!entry->store)
+ return -EIO;
+ if (!capable(CAP_SYS_ADMIN))
+ return -EACCES;
+ return entry->store(rdev, page, length);
+}
+
+static void rdev_free(struct kobject *ko)
+{
+ mdk_rdev_t *rdev = container_of(ko, mdk_rdev_t, kobj);
+ kfree(rdev);
+}
+static struct sysfs_ops rdev_sysfs_ops = {
+ .show = rdev_attr_show,
+ .store = rdev_attr_store,
+};
+static struct kobj_type rdev_ktype = {
+ .release = rdev_free,
+ .sysfs_ops = &rdev_sysfs_ops,
+ .default_attrs = rdev_default_attrs,
+};
+
+/*
+ * Import a device. If 'super_format' >= 0, then sanity check the superblock
+ *
+ * mark the device faulty if:
+ *
+ * - the device is nonexistent (zero size)
+ * - the device has no valid superblock
+ *
+ * a faulty rdev _never_ has rdev->sb set.
+ */
+static mdk_rdev_t *md_import_device(dev_t newdev, int super_format, int super_minor)
+{
+ char b[BDEVNAME_SIZE];
+ int err;
+ mdk_rdev_t *rdev;
+ sector_t size;
+
+ rdev = kzalloc(sizeof(*rdev), GFP_KERNEL);
+ if (!rdev) {
+ printk(KERN_ERR "md: could not alloc mem for new device!\n");
+ return ERR_PTR(-ENOMEM);
+ }
+
+ if ((err = alloc_disk_sb(rdev)))
+ goto abort_free;
+
+ err = lock_rdev(rdev, newdev);
+ if (err)
+ goto abort_free;
+
+ rdev->kobj.parent = NULL;
+ rdev->kobj.ktype = &rdev_ktype;
+ kobject_init(&rdev->kobj);
+
+ rdev->desc_nr = -1;
+ rdev->flags = 0;
+ rdev->data_offset = 0;
+ rdev->sb_events = 0;
+ atomic_set(&rdev->nr_pending, 0);
+ atomic_set(&rdev->read_errors, 0);
+ atomic_set(&rdev->corrected_errors, 0);
+
+ size = rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS;
+ if (!size) {
+ printk(KERN_WARNING
+ "md: %s has zero or unknown size, marking faulty!\n",
+ bdevname(rdev->bdev,b));
+ err = -EINVAL;
+ goto abort_free;
+ }
+
+ if (super_format >= 0) {
+ err = super_types[super_format].
+ load_super(rdev, NULL, super_minor);
+ if (err == -EINVAL) {
+ printk(KERN_WARNING
+ "md: %s has invalid sb, not importing!\n",
+ bdevname(rdev->bdev,b));
+ goto abort_free;
+ }
+ if (err < 0) {
+ printk(KERN_WARNING
+ "md: could not read %s's sb, not importing!\n",
+ bdevname(rdev->bdev,b));
+ goto abort_free;
+ }
+ }
+ INIT_LIST_HEAD(&rdev->same_set);
+
+ return rdev;
+
+abort_free:
+ if (rdev->sb_page) {
+ if (rdev->bdev)
+ unlock_rdev(rdev);
+ free_disk_sb(rdev);
+ }
+ kfree(rdev);
+ return ERR_PTR(err);
+}
+
+/*
+ * Check a full RAID array for plausibility
+ */
+
+
+static void analyze_sbs(mddev_t * mddev)
+{
+ int i;
+ struct list_head *tmp;
+ mdk_rdev_t *rdev, *freshest;
+ char b[BDEVNAME_SIZE];
+
+ freshest = NULL;
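+	/* load_super() is expected to return 1 when this rdev holds the
+	 * freshest superblock seen so far, 0 when it is valid but not fresher,
+	 * and anything else on a fatal inconsistency (see the cases below).
+	 */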
+ ITERATE_RDEV(mddev,rdev,tmp)
+ switch (super_types[mddev->major_version].
+ load_super(rdev, freshest, mddev->minor_version)) {
+ case 1:
+ freshest = rdev;
+ break;
+ case 0:
+ break;
+ default:
+ printk( KERN_ERR \
+ "md: fatal superblock inconsistency in %s"
+ " -- removing from array\n",
+ bdevname(rdev->bdev,b));
+ kick_rdev_from_array(rdev);
+ }
+
+
+ super_types[mddev->major_version].
+ validate_super(mddev, freshest);
+
+ i = 0;
+ ITERATE_RDEV(mddev,rdev,tmp) {
+ if (rdev != freshest)
+ if (super_types[mddev->major_version].
+ validate_super(mddev, rdev)) {
+ printk(KERN_WARNING "md: kicking non-fresh %s"
+ " from array!\n",
+ bdevname(rdev->bdev,b));
+ kick_rdev_from_array(rdev);
+ continue;
+ }
+ if (mddev->level == LEVEL_MULTIPATH) {
+ rdev->desc_nr = i++;
+ rdev->raid_disk = rdev->desc_nr;
+ set_bit(In_sync, &rdev->flags);
+ }
+ }
+
+
+
+ if (mddev->recovery_cp != MaxSector &&
+ mddev->level >= 1)
+ printk(KERN_ERR "md: %s: raid array is not clean"
+ " -- starting background reconstruction\n",
+ mdname(mddev));
+
+}
+
+static ssize_t
+safe_delay_show(mddev_t *mddev, char *page)
+{
+ int msec = (mddev->safemode_delay*1000)/HZ;
+ return sprintf(page, "%d.%03d\n", msec/1000, msec%1000);
+}
+static ssize_t
+safe_delay_store(mddev_t *mddev, const char *cbuf, size_t len)
+{
+ int scale=1;
+ int dot=0;
+ int i;
+ unsigned long msec;
+ char buf[30];
+ char *e;
+ /* remove a period, and count digits after it */
+ if (len >= sizeof(buf))
+ return -EINVAL;
+ strlcpy(buf, cbuf, len);
+ buf[len] = 0;
+ for (i=0; i<len; i++) {
+ if (dot) {
+ if (isdigit(buf[i])) {
+ buf[i-1] = buf[i];
+ scale *= 10;
+ }
+ buf[i] = 0;
+ } else if (buf[i] == '.') {
+ dot=1;
+ buf[i] = 0;
+ }
+ }
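+	/* e.g. a write of "0.35\n" leaves buf == "035" with scale == 100, so
+	 * the computation below works out to msec == 350 */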
+ msec = simple_strtoul(buf, &e, 10);
+ if (e == buf || (*e && *e != '\n'))
+ return -EINVAL;
+ msec = (msec * 1000) / scale;
+ if (msec == 0)
+ mddev->safemode_delay = 0;
+ else {
+ mddev->safemode_delay = (msec*HZ)/1000;
+ if (mddev->safemode_delay == 0)
+ mddev->safemode_delay = 1;
+ }
+ return len;
+}
+static struct md_sysfs_entry md_safe_delay =
+__ATTR(safe_mode_delay, S_IRUGO|S_IWUSR,safe_delay_show, safe_delay_store);
+
+static ssize_t
+level_show(mddev_t *mddev, char *page)
+{
+ struct mdk_personality *p = mddev->pers;
+ if (p)
+ return sprintf(page, "%s\n", p->name);
+ else if (mddev->clevel[0])
+ return sprintf(page, "%s\n", mddev->clevel);
+ else if (mddev->level != LEVEL_NONE)
+ return sprintf(page, "%d\n", mddev->level);
+ else
+ return 0;
+}
+
+static ssize_t
+level_store(mddev_t *mddev, const char *buf, size_t len)
+{
+ int rv = len;
+ if (mddev->pers)
+ return -EBUSY;
+ if (len == 0)
+ return 0;
+ if (len >= sizeof(mddev->clevel))
+ return -ENOSPC;
+ strncpy(mddev->clevel, buf, len);
+ if (mddev->clevel[len-1] == '\n')
+ len--;
+ mddev->clevel[len] = 0;
+ mddev->level = LEVEL_NONE;
+ return rv;
+}
+
+static struct md_sysfs_entry md_level =
+__ATTR(level, S_IRUGO|S_IWUSR, level_show, level_store);
+
+
+static ssize_t
+layout_show(mddev_t *mddev, char *page)
+{
+ /* just a number, not meaningful for all levels */
+ return sprintf(page, "%d\n", mddev->layout);
+}
+
+static ssize_t
+layout_store(mddev_t *mddev, const char *buf, size_t len)
+{
+ char *e;
+ unsigned long n = simple_strtoul(buf, &e, 10);
+ if (mddev->pers)
+ return -EBUSY;
+
+ if (!*buf || (*e && *e != '\n'))
+ return -EINVAL;
+
+ mddev->layout = n;
+ return len;
+}
+static struct md_sysfs_entry md_layout =
+__ATTR(layout, S_IRUGO|S_IWUSR, layout_show, layout_store);
+
+
+static ssize_t
+raid_disks_show(mddev_t *mddev, char *page)
+{
+ if (mddev->raid_disks == 0)
+ return 0;
+ return sprintf(page, "%d\n", mddev->raid_disks);
+}
+
+static int update_raid_disks(mddev_t *mddev, int raid_disks);
+
+static ssize_t
+raid_disks_store(mddev_t *mddev, const char *buf, size_t len)
+{
+ /* can only set raid_disks if array is not yet active */
+ char *e;
+ int rv = 0;
+ unsigned long n = simple_strtoul(buf, &e, 10);
+
+ if (!*buf || (*e && *e != '\n'))
+ return -EINVAL;
+
+ if (mddev->pers)
+ rv = update_raid_disks(mddev, n);
+ else
+ mddev->raid_disks = n;
+ return rv ? rv : len;
+}
+static struct md_sysfs_entry md_raid_disks =
+__ATTR(raid_disks, S_IRUGO|S_IWUSR, raid_disks_show, raid_disks_store);
+
+static ssize_t
+chunk_size_show(mddev_t *mddev, char *page)
+{
+ return sprintf(page, "%d\n", mddev->chunk_size);
+}
+
+static ssize_t
+chunk_size_store(mddev_t *mddev, const char *buf, size_t len)
+{
+ /* can only set chunk_size if array is not yet active */
+ char *e;
+ unsigned long n = simple_strtoul(buf, &e, 10);
+
+ if (mddev->pers)
+ return -EBUSY;
+ if (!*buf || (*e && *e != '\n'))
+ return -EINVAL;
+
+ mddev->chunk_size = n;
+ return len;
+}
+static struct md_sysfs_entry md_chunk_size =
+__ATTR(chunk_size, S_IRUGO|S_IWUSR, chunk_size_show, chunk_size_store);
+
+static ssize_t
+resync_start_show(mddev_t *mddev, char *page)
+{
+ return sprintf(page, "%llu\n", (unsigned long long)mddev->recovery_cp);
+}
+
+static ssize_t
+resync_start_store(mddev_t *mddev, const char *buf, size_t len)
+{
+	/* can only set resync_start if array is not yet active */
+ char *e;
+ unsigned long long n = simple_strtoull(buf, &e, 10);
+
+ if (mddev->pers)
+ return -EBUSY;
+ if (!*buf || (*e && *e != '\n'))
+ return -EINVAL;
+
+ mddev->recovery_cp = n;
+ return len;
+}
+static struct md_sysfs_entry md_resync_start =
+__ATTR(resync_start, S_IRUGO|S_IWUSR, resync_start_show, resync_start_store);
+
+/*
+ * The array state can be:
+ *
+ * clear
+ * No devices, no size, no level
+ * Equivalent to STOP_ARRAY ioctl
+ * inactive
+ * May have some settings, but array is not active
+ * all IO results in error
+ * When written, doesn't tear down array, but just stops it
+ * suspended (not supported yet)
+ * All IO requests will block. The array can be reconfigured.
+ *     Writing this, if accepted, will block until the array is quiescent
+ * readonly
+ * no resync can happen. no superblocks get written.
+ * write requests fail
+ * read-auto
+ * like readonly, but behaves like 'clean' on a write request.
+ *
+ * clean - no pending writes, but otherwise active.
+ * When written to inactive array, starts without resync
+ * If a write request arrives then
+ * if metadata is known, mark 'dirty' and switch to 'active'.
+ * if not known, block and switch to write-pending
+ * If written to an active array that has pending writes, then fails.
+ * active
+ * fully active: IO and resync can be happening.
+ * When written to inactive array, starts with resync
+ *
+ * write-pending
+ * clean, but writes are blocked waiting for 'active' to be written.
+ *
+ * active-idle
+ * like active, but no writes have been seen for a while (100msec).
+ *
+ */
+enum array_state { clear, inactive, suspended, readonly, read_auto, clean, active,
+ write_pending, active_idle, bad_word};
+char *array_states[] = {
+ "clear", "inactive", "suspended", "readonly", "read-auto", "clean", "active",
+ "write-pending", "active-idle", NULL };
+
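+/* Return the index of the entry in 'list' matching 'word', or the index of
+ * the terminating NULL if nothing matches -- for array_states above that
+ * index corresponds to 'bad_word'.
+ */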
+static int match_word(const char *word, char **list)
+{
+ int n;
+ for (n=0; list[n]; n++)
+ if (cmd_match(word, list[n]))
+ break;
+ return n;
+}
+
+static ssize_t
+array_state_show(mddev_t *mddev, char *page)
+{
+ enum array_state st = inactive;
+
+ if (mddev->pers)
+ switch(mddev->ro) {
+ case 1:
+ st = readonly;
+ break;
+ case 2:
+ st = read_auto;
+ break;
+ case 0:
+ if (mddev->in_sync)
+ st = clean;
+ else if (mddev->sb_dirty)
+ st = write_pending;
+ else if (mddev->safemode)
+ st = active_idle;
+ else
+ st = active;
+ }
+ else {
+ if (list_empty(&mddev->disks) &&
+ mddev->raid_disks == 0 &&
+ mddev->size == 0)
+ st = clear;
+ else
+ st = inactive;
+ }
+ return sprintf(page, "%s\n", array_states[st]);
+}
+
+static int do_md_stop(mddev_t * mddev, int ro);
+static int do_md_run(mddev_t * mddev);
+static int restart_array(mddev_t *mddev);
+
+static ssize_t
+array_state_store(mddev_t *mddev, const char *buf, size_t len)
+{
+ int err = -EINVAL;
+ enum array_state st = match_word(buf, array_states);
+ switch(st) {
+ case bad_word:
+ break;
+ case clear:
+ /* stopping an active array */
+ if (atomic_read(&mddev->active) > 1)
+ return -EBUSY;
+ err = do_md_stop(mddev, 0);
+ break;
+ case inactive:
+ /* stopping an active array */
+ if (mddev->pers) {
+ if (atomic_read(&mddev->active) > 1)
+ return -EBUSY;
+ err = do_md_stop(mddev, 2);
+ } else
+ err = 0; /* already inactive */
+ break;
+ case suspended:
+ break; /* not supported yet */
+ case readonly:
+ if (mddev->pers)
+ err = do_md_stop(mddev, 1);
+ else {
+ mddev->ro = 1;
+ err = do_md_run(mddev);
+ }
+ break;
+ case read_auto:
+ /* stopping an active array */
+ if (mddev->pers) {
+ err = do_md_stop(mddev, 1);
+ if (err == 0)
+ mddev->ro = 2; /* FIXME mark devices writable */
+ } else {
+ mddev->ro = 2;
+ err = do_md_run(mddev);
+ }
+ break;
+ case clean:
+ if (mddev->pers) {
+ restart_array(mddev);
+ spin_lock_irq(&mddev->write_lock);
+ if (atomic_read(&mddev->writes_pending) == 0) {
+ if (mddev->in_sync == 0) {
+ mddev->in_sync = 1;
+ if (mddev->persistent)
+ mddev->sb_dirty = 1;
+ }
+ err = 0;
+ } else
+ err = -EBUSY;
+ spin_unlock_irq(&mddev->write_lock);
+ } else {
+ mddev->ro = 0;
+ mddev->recovery_cp = MaxSector;
+ err = do_md_run(mddev);
+ }
+ break;
+ case active:
+ if (mddev->pers) {
+ restart_array(mddev);
+ mddev->sb_dirty = 0;
+ wake_up(&mddev->sb_wait);
+ err = 0;
+ } else {
+ mddev->ro = 0;
+ err = do_md_run(mddev);
+ }
+ break;
+ case write_pending:
+ case active_idle:
+ /* these cannot be set */
+ break;
+ }
+ if (err)
+ return err;
+ else
+ return len;
+}
+static struct md_sysfs_entry md_array_state =
+__ATTR(array_state, S_IRUGO|S_IWUSR, array_state_show, array_state_store);
+
+static ssize_t
+null_show(mddev_t *mddev, char *page)
+{
+ return -EINVAL;
+}
+
+static ssize_t
+new_dev_store(mddev_t *mddev, const char *buf, size_t len)
+{
+ /* buf must be %d:%d\n? giving major and minor numbers */
+ /* The new device is added to the array.
+ * If the array has a persistent superblock, we read the
+ * superblock to initialise info and check validity.
+	 * Otherwise, the only checking done is that in bind_rdev_to_array,
+ * which mainly checks size.
+ */
+ char *e;
+ int major = simple_strtoul(buf, &e, 10);
+ int minor;
+ dev_t dev;
+ mdk_rdev_t *rdev;
+ int err;
+
+ if (!*buf || *e != ':' || !e[1] || e[1] == '\n')
+ return -EINVAL;
+ minor = simple_strtoul(e+1, &e, 10);
+ if (*e && *e != '\n')
+ return -EINVAL;
+ dev = MKDEV(major, minor);
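+	/* reject numbers that do not survive the MKDEV()/MAJOR()/MINOR()
+	 * round trip, i.e. values too large to encode in a dev_t */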
+ if (major != MAJOR(dev) ||
+ minor != MINOR(dev))
+ return -EOVERFLOW;
+
+
+ if (mddev->persistent) {
+ rdev = md_import_device(dev, mddev->major_version,
+ mddev->minor_version);
+ if (!IS_ERR(rdev) && !list_empty(&mddev->disks)) {
+ mdk_rdev_t *rdev0 = list_entry(mddev->disks.next,
+ mdk_rdev_t, same_set);
+ err = super_types[mddev->major_version]
+ .load_super(rdev, rdev0, mddev->minor_version);
+ if (err < 0)
+ goto out;
+ }
+ } else
+ rdev = md_import_device(dev, -1, -1);
+
+ if (IS_ERR(rdev))
+ return PTR_ERR(rdev);
+ err = bind_rdev_to_array(rdev, mddev);
+ out:
+ if (err)
+ export_rdev(rdev);
+ return err ? err : len;
+}
+
+static struct md_sysfs_entry md_new_device =
+__ATTR(new_dev, S_IWUSR, null_show, new_dev_store);
+
+static ssize_t
+size_show(mddev_t *mddev, char *page)
+{
+ return sprintf(page, "%llu\n", (unsigned long long)mddev->size);
+}
+
+static int update_size(mddev_t *mddev, unsigned long size);
+
+static ssize_t
+size_store(mddev_t *mddev, const char *buf, size_t len)
+{
+ /* If array is inactive, we can reduce the component size, but
+ * not increase it (except from 0).
+ * If array is active, we can try an on-line resize
+ */
+ char *e;
+ int err = 0;
+ unsigned long long size = simple_strtoull(buf, &e, 10);
+ if (!*buf || *buf == '\n' ||
+ (*e && *e != '\n'))
+ return -EINVAL;
+
+ if (mddev->pers) {
+ err = update_size(mddev, size);
+ md_update_sb(mddev);
+ } else {
+ if (mddev->size == 0 ||
+ mddev->size > size)
+ mddev->size = size;
+ else
+ err = -ENOSPC;
+ }
+ return err ? err : len;
+}
+
+static struct md_sysfs_entry md_size =
+__ATTR(component_size, S_IRUGO|S_IWUSR, size_show, size_store);
+
+
+/* Metadata version.
+ * This is one of
+ * 'none' for arrays with no metadata (good luck...)
+ * 'external' for arrays with externally managed metadata,
+ * or N.M for internally known formats
+ */
+static ssize_t
+metadata_show(mddev_t *mddev, char *page)
+{
+ if (mddev->persistent)
+ return sprintf(page, "%d.%d\n",
+ mddev->major_version, mddev->minor_version);
+ else if (mddev->external)
+ return sprintf(page, "external\n");
+ else
+ return sprintf(page, "none\n");
+}
+
+static ssize_t
+metadata_store(mddev_t *mddev, const char *buf, size_t len)
+{
+ int major, minor;
+ char *e;
+ if (!list_empty(&mddev->disks))
+ return -EBUSY;
+
+ if (cmd_match(buf, "none")) {
+ mddev->persistent = 0;
+ mddev->external = 0;
+ mddev->major_version = 0;
+ mddev->minor_version = 90;
+ return len;
+ }
+ if (cmd_match(buf, "external")) {
+ mddev->persistent = 0;
+ mddev->external = 1;
+ mddev->major_version = 0;
+ mddev->minor_version = 90;
+ return len;
+ }
+ major = simple_strtoul(buf, &e, 10);
+ if (e==buf || *e != '.')
+ return -EINVAL;
+ buf = e+1;
+ minor = simple_strtoul(buf, &e, 10);
+ if (e==buf || *e != '\n')
+ return -EINVAL;
+ if (major >= sizeof(super_types)/sizeof(super_types[0]) ||
+ super_types[major].name == NULL)
+ return -ENOENT;
+ mddev->major_version = major;
+ mddev->minor_version = minor;
+ mddev->persistent = 1;
+ mddev->external = 0;
+ return len;
+}
+
+static struct md_sysfs_entry md_metadata =
+__ATTR(metadata_version, S_IRUGO|S_IWUSR, metadata_show, metadata_store);
+
+static ssize_t
+action_show(mddev_t *mddev, char *page)
+{
+ char *type = "idle";
+ if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
+ test_bit(MD_RECOVERY_NEEDED, &mddev->recovery)) {
+ if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
+ type = "reshape";
+ else if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
+ if (!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
+ type = "resync";
+ else if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery))
+ type = "check";
+ else
+ type = "repair";
+ } else
+ type = "recover";
+ }
+ return sprintf(page, "%s\n", type);
+}
+
+static ssize_t
+action_store(mddev_t *mddev, const char *page, size_t len)
+{
+ if (!mddev->pers || !mddev->pers->sync_request)
+ return -EINVAL;
+
+ if (cmd_match(page, "idle")) {
+ if (mddev->sync_thread) {
+ set_bit(MD_RECOVERY_INTR, &mddev->recovery);
+ md_unregister_thread(mddev->sync_thread);
+ mddev->sync_thread = NULL;
+ mddev->recovery = 0;
+ }
+ } else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
+ test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))
+ return -EBUSY;
+ else if (cmd_match(page, "resync") || cmd_match(page, "recover"))
+ set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+ else if (cmd_match(page, "reshape")) {
+ int err;
+ if (mddev->pers->start_reshape == NULL)
+ return -EINVAL;
+ err = mddev->pers->start_reshape(mddev);
+ if (err)
+ return err;
+ } else {
+ if (cmd_match(page, "check"))
+ set_bit(MD_RECOVERY_CHECK, &mddev->recovery);
+ else if (!cmd_match(page, "repair"))
+ return -EINVAL;
+ set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
+ set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
+ }
+ set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+ md_wakeup_thread(mddev->thread);
+ return len;
+}
+
+static ssize_t
+mismatch_cnt_show(mddev_t *mddev, char *page)
+{
+ return sprintf(page, "%llu\n",
+ (unsigned long long) mddev->resync_mismatches);
+}
+
+static struct md_sysfs_entry md_scan_mode =
+__ATTR(sync_action, S_IRUGO|S_IWUSR, action_show, action_store);
+
+
+static struct md_sysfs_entry md_mismatches = __ATTR_RO(mismatch_cnt);
+
+static ssize_t
+sync_min_show(mddev_t *mddev, char *page)
+{
+ return sprintf(page, "%d (%s)\n", speed_min(mddev),
+ mddev->sync_speed_min ? "local": "system");
+}
+
+static ssize_t
+sync_min_store(mddev_t *mddev, const char *buf, size_t len)
+{
+ int min;
+ char *e;
+ if (strncmp(buf, "system", 6)==0) {
+ mddev->sync_speed_min = 0;
+ return len;
+ }
+ min = simple_strtoul(buf, &e, 10);
+ if (buf == e || (*e && *e != '\n') || min <= 0)
+ return -EINVAL;
+ mddev->sync_speed_min = min;
+ return len;
+}
+
+static struct md_sysfs_entry md_sync_min =
+__ATTR(sync_speed_min, S_IRUGO|S_IWUSR, sync_min_show, sync_min_store);
+
+static ssize_t
+sync_max_show(mddev_t *mddev, char *page)
+{
+ return sprintf(page, "%d (%s)\n", speed_max(mddev),
+ mddev->sync_speed_max ? "local": "system");
+}
+
+static ssize_t
+sync_max_store(mddev_t *mddev, const char *buf, size_t len)
+{
+ int max;
+ char *e;
+ if (strncmp(buf, "system", 6)==0) {
+ mddev->sync_speed_max = 0;
+ return len;
+ }
+ max = simple_strtoul(buf, &e, 10);
+ if (buf == e || (*e && *e != '\n') || max <= 0)
+ return -EINVAL;
+ mddev->sync_speed_max = max;
+ return len;
+}
+
+static struct md_sysfs_entry md_sync_max =
+__ATTR(sync_speed_max, S_IRUGO|S_IWUSR, sync_max_show, sync_max_store);
+
+
+static ssize_t
+sync_speed_show(mddev_t *mddev, char *page)
+{
+ unsigned long resync, dt, db;
+ resync = (mddev->curr_resync - atomic_read(&mddev->recovery_active));
+ dt = ((jiffies - mddev->resync_mark) / HZ);
+ if (!dt) dt++;
+ db = resync - (mddev->resync_mark_cnt);
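+	/* db is a count of 512-byte sectors, hence the extra /2 below to
+	 * report KiB per second */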
+ return sprintf(page, "%ld\n", db/dt/2); /* K/sec */
+}
+
+static struct md_sysfs_entry md_sync_speed = __ATTR_RO(sync_speed);
+
+static ssize_t
+sync_completed_show(mddev_t *mddev, char *page)
+{
+ unsigned long max_blocks, resync;
+
+ if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
+ max_blocks = mddev->resync_max_sectors;
+ else
+ max_blocks = mddev->size << 1;
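+	/* mddev->size is in KiB; the <<1 above converts it to 512-byte sectors
+	 * so it is comparable with the sector-based resync count */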
+
+ resync = (mddev->curr_resync - atomic_read(&mddev->recovery_active));
+ return sprintf(page, "%lu / %lu\n", resync, max_blocks);
+}
+
+static struct md_sysfs_entry md_sync_completed = __ATTR_RO(sync_completed);
+
+static ssize_t
+max_sync_show(mddev_t *mddev, char *page)
+{
+ if (mddev->resync_max == MaxSector)
+ return sprintf(page, "max\n");
+ else
+ return sprintf(page, "%llu\n",
+ (unsigned long long)mddev->resync_max);
+}
+static ssize_t
+max_sync_store(mddev_t *mddev, const char *buf, size_t len)
+{
+ if (strncmp(buf, "max", 3)==0)
+ mddev->resync_max = MaxSector;
+ else {
+ char *ep;
+ unsigned long long max = simple_strtoull(buf, &ep, 10);
+ if (ep == buf || (*ep != 0 && *ep != '\n'))
+ return -EINVAL;
+ if (max < mddev->resync_max &&
+ test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
+ return -EBUSY;
+
+ /* Must be a multiple of chunk_size */
+ if (mddev->chunk_size) {
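+			/* chunk_size is in bytes; >>9 converts it to 512-byte
+			 * sectors so the mask below tests sector alignment */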
+ if (max & (sector_t)((mddev->chunk_size>>9)-1))
+ return -EINVAL;
+ }
+ mddev->resync_max = max;
+ }
+ wake_up(&mddev->recovery_wait);
+ return len;
+}
+
+static struct md_sysfs_entry md_max_sync =
+__ATTR(sync_max, S_IRUGO|S_IWUSR, max_sync_show, max_sync_store);
+
+static ssize_t
+suspend_lo_show(mddev_t *mddev, char *page)
+{
+ return sprintf(page, "%llu\n", (unsigned long long)mddev->suspend_lo);
+}
+
+static ssize_t
+suspend_lo_store(mddev_t *mddev, const char *buf, size_t len)
+{
+ char *e;
+ unsigned long long new = simple_strtoull(buf, &e, 10);
+
+ if (mddev->pers->quiesce == NULL)
+ return -EINVAL;
+ if (buf == e || (*e && *e != '\n'))
+ return -EINVAL;
+ if (new >= mddev->suspend_hi ||
+ (new > mddev->suspend_lo && new < mddev->suspend_hi)) {
+ mddev->suspend_lo = new;
+ mddev->pers->quiesce(mddev, 2);
+ return len;
+ } else
+ return -EINVAL;
+}
+static struct md_sysfs_entry md_suspend_lo =
+__ATTR(suspend_lo, S_IRUGO|S_IWUSR, suspend_lo_show, suspend_lo_store);
+
+
+static ssize_t
+suspend_hi_show(mddev_t *mddev, char *page)
+{
+ return sprintf(page, "%llu\n", (unsigned long long)mddev->suspend_hi);
+}
+
+static ssize_t
+suspend_hi_store(mddev_t *mddev, const char *buf, size_t len)
+{
+ char *e;
+ unsigned long long new = simple_strtoull(buf, &e, 10);
+
+ if (mddev->pers->quiesce == NULL)
+ return -EINVAL;
+ if (buf == e || (*e && *e != '\n'))
+ return -EINVAL;
+ if ((new <= mddev->suspend_lo && mddev->suspend_lo >= mddev->suspend_hi) ||
+ (new > mddev->suspend_lo && new > mddev->suspend_hi)) {
+ mddev->suspend_hi = new;
+ mddev->pers->quiesce(mddev, 1);
+ mddev->pers->quiesce(mddev, 0);
+ return len;
+ } else
+ return -EINVAL;
+}
+static struct md_sysfs_entry md_suspend_hi =
+__ATTR(suspend_hi, S_IRUGO|S_IWUSR, suspend_hi_show, suspend_hi_store);
+
+
+static struct attribute *md_default_attrs[] = {
+ &md_level.attr,
+ &md_layout.attr,
+ &md_raid_disks.attr,
+ &md_chunk_size.attr,
+ &md_size.attr,
+ &md_resync_start.attr,
+ &md_metadata.attr,
+ &md_new_device.attr,
+ &md_safe_delay.attr,
+ &md_array_state.attr,
+ NULL,
+};
+
+static struct attribute *md_redundancy_attrs[] = {
+ &md_scan_mode.attr,
+ &md_mismatches.attr,
+ &md_sync_min.attr,
+ &md_sync_max.attr,
+ &md_sync_speed.attr,
+ &md_sync_completed.attr,
+ &md_max_sync.attr,
+ &md_suspend_lo.attr,
+ &md_suspend_hi.attr,
+ NULL,
+};
+static struct attribute_group md_redundancy_group = {
+ .name = NULL,
+ .attrs = md_redundancy_attrs,
+};
+
+
+static ssize_t
+md_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
+{
+ struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr);
+ mddev_t *mddev = container_of(kobj, struct mddev_s, kobj);
+ ssize_t rv;
+
+ if (!entry->show)
+ return -EIO;
+ rv = mddev_lock(mddev);
+ if (!rv) {
+ rv = entry->show(mddev, page);
+ mddev_unlock(mddev);
+ }
+ return rv;
+}
+
+static ssize_t
+md_attr_store(struct kobject *kobj, struct attribute *attr,
+ const char *page, size_t length)
+{
+ struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr);
+ mddev_t *mddev = container_of(kobj, struct mddev_s, kobj);
+ ssize_t rv;
+
+ if (!entry->store)
+ return -EIO;
+ if (!capable(CAP_SYS_ADMIN))
+ return -EACCES;
+ rv = mddev_lock(mddev);
+ if (!rv) {
+ rv = entry->store(mddev, page, length);
+ mddev_unlock(mddev);
+ }
+ return rv;
+}
+
+static void md_free(struct kobject *ko)
+{
+ mddev_t *mddev = container_of(ko, mddev_t, kobj);
+ kfree(mddev);
+}
+
+static struct sysfs_ops md_sysfs_ops = {
+ .show = md_attr_show,
+ .store = md_attr_store,
+};
+static struct kobj_type md_ktype = {
+ .release = md_free,
+ .sysfs_ops = &md_sysfs_ops,
+ .default_attrs = md_default_attrs,
+};
+
+int mdp_major = 0;
+
+static struct kobject *md_probe(dev_t dev, int *part, void *data)
+{
+ static DEFINE_MUTEX(disks_mutex);
+ mddev_t *mddev = mddev_find(dev);
+ struct gendisk *disk;
+ int partitioned = (MAJOR(dev) != MD_MAJOR);
+ int shift = partitioned ? MdpMinorShift : 0;
+ int unit = MINOR(dev) >> shift;
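+	/* partitionable md devices (major != MD_MAJOR) reserve 1<<MdpMinorShift
+	 * minor numbers per array, hence the shift when computing 'unit' */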
+
+ if (!mddev)
+ return NULL;
+
+ mutex_lock(&disks_mutex);
+ if (mddev->gendisk) {
+ mutex_unlock(&disks_mutex);
+ mddev_put(mddev);
+ return NULL;
+ }
+ disk = alloc_disk(1 << shift);
+ if (!disk) {
+ mutex_unlock(&disks_mutex);
+ mddev_put(mddev);
+ return NULL;
+ }
+ disk->major = MAJOR(dev);
+ disk->first_minor = unit << shift;
+ if (partitioned) {
+ sprintf(disk->disk_name, "md_d%d", unit);
+ sprintf(disk->devfs_name, "md/d%d", unit);
+ } else {
+ sprintf(disk->disk_name, "md%d", unit);
+ sprintf(disk->devfs_name, "md/%d", unit);
+ }
+ disk->fops = &md_fops;
+ disk->private_data = mddev;
+ disk->queue = mddev->queue;
+ add_disk(disk);
+ mddev->gendisk = disk;
+ mutex_unlock(&disks_mutex);
+ mddev->kobj.parent = &disk->kobj;
+ mddev->kobj.k_name = NULL;
+ snprintf(mddev->kobj.name, KOBJ_NAME_LEN, "%s", "md");
+ mddev->kobj.ktype = &md_ktype;
+ kobject_register(&mddev->kobj);
+ return NULL;
+}
+
+static void md_safemode_timeout(unsigned long data)
+{
+ mddev_t *mddev = (mddev_t *) data;
+
+ mddev->safemode = 1;
+ md_wakeup_thread(mddev->thread);
+}
+
+static int start_dirty_degraded;
+
+static int do_md_run(mddev_t * mddev)
+{
+ int err;
+ int chunk_size;
+ struct list_head *tmp;
+ mdk_rdev_t *rdev;
+ struct gendisk *disk;
+ struct mdk_personality *pers;
+ char b[BDEVNAME_SIZE];
+
+ if (list_empty(&mddev->disks))
+ /* cannot run an array with no devices.. */
+ return -EINVAL;
+
+ if (mddev->pers)
+ return -EBUSY;
+
+ /*
+ * Analyze all RAID superblock(s)
+ */
+ if (!mddev->raid_disks)
+ analyze_sbs(mddev);
+
+ chunk_size = mddev->chunk_size;
+
+ if (chunk_size) {
+ if (chunk_size > MAX_CHUNK_SIZE) {
+ printk(KERN_ERR "too big chunk_size: %d > %d\n",
+ chunk_size, MAX_CHUNK_SIZE);
+ return -EINVAL;
+ }
+ /*
+		 * chunk-size has to be a power of 2 and a multiple of PAGE_SIZE
+ */
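+		/* ffz(~chunk_size) is the position of the lowest set bit; for a
+		 * power of two that single bit equals the whole value */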
+ if ( (1 << ffz(~chunk_size)) != chunk_size) {
+ printk(KERN_ERR "chunk_size of %d not valid\n", chunk_size);
+ return -EINVAL;
+ }
+ if (chunk_size < PAGE_SIZE) {
+ printk(KERN_ERR "too small chunk_size: %d < %ld\n",
+ chunk_size, PAGE_SIZE);
+ return -EINVAL;
+ }
+
+ /* devices must have minimum size of one chunk */
+ ITERATE_RDEV(mddev,rdev,tmp) {
+ if (test_bit(Faulty, &rdev->flags))
+ continue;
+ if (rdev->size < chunk_size / 1024) {
+ printk(KERN_WARNING
+ "md: Dev %s smaller than chunk_size:"
+ " %lluk < %dk\n",
+ bdevname(rdev->bdev,b),
+ (unsigned long long)rdev->size,
+ chunk_size / 1024);
+ return -EINVAL;
+ }
+ }
+ }
+
+#ifdef CONFIG_KMOD
+ if (mddev->level != LEVEL_NONE)
+ request_module("md-level-%d", mddev->level);
+ else if (mddev->clevel[0])
+ request_module("md-%s", mddev->clevel);
+#endif
+
+ /*
+ * Drop all container device buffers, from now on
+ * the only valid external interface is through the md
+ * device.
+ * Also find largest hardsector size
+ */
+ ITERATE_RDEV(mddev,rdev,tmp) {
+ if (test_bit(Faulty, &rdev->flags))
+ continue;
+ sync_blockdev(rdev->bdev);
+ invalidate_bdev(rdev->bdev, 0);
+ }
+
+ md_probe(mddev->unit, NULL, NULL);
+ disk = mddev->gendisk;
+ if (!disk)
+ return -ENOMEM;
+
+ spin_lock(&pers_lock);
+ pers = find_pers(mddev->level, mddev->clevel);
+ if (!pers || !try_module_get(pers->owner)) {
+ spin_unlock(&pers_lock);
+ if (mddev->level != LEVEL_NONE)
+ printk(KERN_WARNING "md: personality for level %d is not loaded!\n",
+ mddev->level);
+ else
+ printk(KERN_WARNING "md: personality for level %s is not loaded!\n",
+ mddev->clevel);
+ return -EINVAL;
+ }
+ mddev->pers = pers;
+ spin_unlock(&pers_lock);
+ mddev->level = pers->level;
+ strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel));
+
+ if (mddev->reshape_position != MaxSector &&
+ pers->start_reshape == NULL) {
+ /* This personality cannot handle reshaping... */
+ mddev->pers = NULL;
+ module_put(pers->owner);
+ return -EINVAL;
+ }
+
+ mddev->recovery = 0;
+ mddev->resync_max_sectors = mddev->size << 1; /* may be over-ridden by personality */
+ mddev->barriers_work = 1;
+ mddev->ok_start_degraded = start_dirty_degraded;
+
+ if (start_readonly)
+ mddev->ro = 2; /* read-only, but switch on first write */
+
+ err = mddev->pers->run(mddev);
+ if (!err && mddev->pers->sync_request) {
+ err = bitmap_create(mddev);
+ if (err) {
+ printk(KERN_ERR "%s: failed to create bitmap (%d)\n",
+ mdname(mddev), err);
+ mddev->pers->stop(mddev);
+ }
+ }
+ if (err) {
+ printk(KERN_ERR "md: pers->run() failed ...\n");
+ module_put(mddev->pers->owner);
+ mddev->pers = NULL;
+ bitmap_destroy(mddev);
+ return err;
+ }
+ if (mddev->pers->sync_request)
+ sysfs_create_group(&mddev->kobj, &md_redundancy_group);
+ else if (mddev->ro == 2) /* auto-readonly not meaningful */
+ mddev->ro = 0;
+
+ atomic_set(&mddev->writes_pending,0);
+ mddev->safemode = 0;
+ mddev->safemode_timer.function = md_safemode_timeout;
+ mddev->safemode_timer.data = (unsigned long) mddev;
+ mddev->safemode_delay = (200 * HZ)/1000 +1; /* 200 msec delay */
+ mddev->in_sync = 1;
+
+ ITERATE_RDEV(mddev,rdev,tmp)
+ if (rdev->raid_disk >= 0) {
+ char nm[20];
+ sprintf(nm, "rd%d", rdev->raid_disk);
+ sysfs_create_link(&mddev->kobj, &rdev->kobj, nm);
+ }
+
+ set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+ md_wakeup_thread(mddev->thread);
+
+ if (mddev->sb_dirty)
+ md_update_sb(mddev);
+
+ set_capacity(disk, mddev->array_size<<1);
+
+ /* If we call blk_queue_make_request here, it will
+ * re-initialise max_sectors etc which may have been
+ * refined inside -> run. So just set the bits we need to set.
+	 * Most initialisation happened when we called
+ * blk_queue_make_request(..., md_fail_request)
+ * earlier.
+ */
+ mddev->queue->queuedata = mddev;
+ mddev->queue->make_request_fn = mddev->pers->make_request;
+
+ /* If there is a partially-recovered drive we need to
+ * start recovery here. If we leave it to md_check_recovery,
+ * it will remove the drives and not do the right thing
+ */
+ if (mddev->degraded) {
+ struct list_head *rtmp;
+ int spares = 0;
+ ITERATE_RDEV(mddev,rdev,rtmp)
+ if (rdev->raid_disk >= 0 &&
+ !test_bit(In_sync, &rdev->flags) &&
+ !test_bit(Faulty, &rdev->flags))
+ /* complete an interrupted recovery */
+ spares++;
+ if (spares && mddev->pers->sync_request) {
+ mddev->recovery = 0;
+ set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
+ mddev->sync_thread = md_register_thread(md_do_sync,
+ mddev,
+ "%s_resync");
+ if (!mddev->sync_thread) {
+ printk(KERN_ERR "%s: could not start resync"
+ " thread...\n",
+ mdname(mddev));
+ /* leave the spares where they are, it shouldn't hurt */
+ mddev->recovery = 0;
+ } else
+ md_wakeup_thread(mddev->sync_thread);
+ }
+ }
+
+ mddev->changed = 1;
+ md_new_event(mddev);
+ return 0;
+}
+
+static int restart_array(mddev_t *mddev)
+{
+ struct gendisk *disk = mddev->gendisk;
+ int err;
+
+ /*
+ * Complain if it has no devices
+ */
+ err = -ENXIO;
+ if (list_empty(&mddev->disks))
+ goto out;
+
+ if (mddev->pers) {
+ err = -EBUSY;
+ if (!mddev->ro)
+ goto out;
+
+ mddev->safemode = 0;
+ mddev->ro = 0;
+ set_disk_ro(disk, 0);
+
+ printk(KERN_INFO "md: %s switched to read-write mode.\n",
+ mdname(mddev));
+ /*
+ * Kick recovery or resync if necessary
+ */
+ set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+ md_wakeup_thread(mddev->thread);
+ md_wakeup_thread(mddev->sync_thread);
+ err = 0;
+ } else
+ err = -EINVAL;
+
+out:
+ return err;
+}
+
+/* similar to deny_write_access, but accounts for our holding a reference
+ * to the file ourselves */
+static int deny_bitmap_write_access(struct file * file)
+{
+ struct inode *inode = file->f_mapping->host;
+
+ spin_lock(&inode->i_lock);
+ if (atomic_read(&inode->i_writecount) > 1) {
+ spin_unlock(&inode->i_lock);
+ return -ETXTBSY;
+ }
+ atomic_set(&inode->i_writecount, -1);
+ spin_unlock(&inode->i_lock);
+
+ return 0;
+}
+
+static void restore_bitmap_write_access(struct file *file)
+{
+ struct inode *inode = file->f_mapping->host;
+
+ spin_lock(&inode->i_lock);
+ atomic_set(&inode->i_writecount, 1);
+ spin_unlock(&inode->i_lock);
+}
+
+/* mode:
+ * 0 - completely stop and dis-assemble array
+ * 1 - switch to readonly
+ * 2 - stop but do not disassemble array
+ */
+static int do_md_stop(mddev_t * mddev, int mode)
+{
+ int err = 0;
+ struct gendisk *disk = mddev->gendisk;
+
+ if (mddev->pers) {
+ if (atomic_read(&mddev->active)>2) {
+ printk("md: %s still in use.\n",mdname(mddev));
+ return -EBUSY;
+ }
+
+ if (mddev->sync_thread) {
+ set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
+ set_bit(MD_RECOVERY_INTR, &mddev->recovery);
+ md_unregister_thread(mddev->sync_thread);
+ mddev->sync_thread = NULL;
+ }
+
+ del_timer_sync(&mddev->safemode_timer);
+
+ invalidate_partition(disk, 0);
+
+ switch(mode) {
+ case 1: /* readonly */
+ err = -ENXIO;
+ if (mddev->ro==1)
+ goto out;
+ mddev->ro = 1;
+ break;
+ case 0: /* disassemble */
+ case 2: /* stop */
+ bitmap_flush(mddev);
+ md_super_wait(mddev);
+ if (mddev->ro)
+ set_disk_ro(disk, 0);
+ blk_queue_make_request(mddev->queue, md_fail_request);
+ mddev->pers->stop(mddev);
+ if (mddev->pers->sync_request)
+ sysfs_remove_group(&mddev->kobj, &md_redundancy_group);
+
+ module_put(mddev->pers->owner);
+ mddev->pers = NULL;
+ if (mddev->ro)
+ mddev->ro = 0;
+ }
+ if (!mddev->in_sync || mddev->sb_dirty) {
+ /* mark array as shutdown cleanly */
+ mddev->in_sync = 1;
+ md_update_sb(mddev);
+ }
+ if (mode == 1)
+ set_disk_ro(disk, 1);
+ clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
+ }
+
+ /*
+ * Free resources if final stop
+ */
+ if (mode == 0) {
+ mdk_rdev_t *rdev;
+ struct list_head *tmp;
+ struct gendisk *disk;
+ printk(KERN_INFO "md: %s stopped.\n", mdname(mddev));
+
+ bitmap_destroy(mddev);
+ if (mddev->bitmap_file) {
+ restore_bitmap_write_access(mddev->bitmap_file);
+ fput(mddev->bitmap_file);
+ mddev->bitmap_file = NULL;
+ }
+ mddev->bitmap_offset = 0;
+
+ ITERATE_RDEV(mddev,rdev,tmp)
+ if (rdev->raid_disk >= 0) {
+ char nm[20];
+ sprintf(nm, "rd%d", rdev->raid_disk);
+ sysfs_remove_link(&mddev->kobj, nm);
+ }
+
+ export_array(mddev);
+
+ mddev->array_size = 0;
+ mddev->size = 0;
+ mddev->raid_disks = 0;
+ mddev->recovery_cp = 0;
+ mddev->resync_max = MaxSector;
+
+ disk = mddev->gendisk;
+ if (disk)
+ set_capacity(disk, 0);
+ mddev->changed = 1;
+ } else if (mddev->pers)
+ printk(KERN_INFO "md: %s switched to read-only mode.\n",
+ mdname(mddev));
+ err = 0;
+ md_new_event(mddev);
+out:
+ return err;
+}
+
+static void autorun_array(mddev_t *mddev)
+{
+ mdk_rdev_t *rdev;
+ struct list_head *tmp;
+ int err;
+
+ if (list_empty(&mddev->disks))
+ return;
+
+ printk(KERN_INFO "md: running: ");
+
+ ITERATE_RDEV(mddev,rdev,tmp) {
+ char b[BDEVNAME_SIZE];
+ printk("<%s>", bdevname(rdev->bdev,b));
+ }
+ printk("\n");
+
+ err = do_md_run (mddev);
+ if (err) {
+ printk(KERN_WARNING "md: do_md_run() returned %d\n", err);
+ do_md_stop (mddev, 0);
+ }
+}
+
+/*
+ * let's try to run arrays based on all disks that have arrived
+ * until now. (those are in pending_raid_disks)
+ *
+ * the method: pick the first pending disk, collect all disks with
+ * the same UUID, remove all from the pending list and put them into
+ * the 'same_array' list. Then order this list based on superblock
+ * update time (freshest comes first), kick out 'old' disks and
+ * compare superblocks. If everything's fine then run it.
+ *
+ * If "unit" is allocated, then bump its reference count
+ */
+static void autorun_devices(int part)
+{
+ struct list_head *tmp;
+ mdk_rdev_t *rdev0, *rdev;
+ mddev_t *mddev;
+ char b[BDEVNAME_SIZE];
+
+ printk(KERN_INFO "md: autorun ...\n");
+ while (!list_empty(&pending_raid_disks)) {
+ dev_t dev;
+ LIST_HEAD(candidates);
+ rdev0 = list_entry(pending_raid_disks.next,
+ mdk_rdev_t, same_set);
+
+ printk(KERN_INFO "md: considering %s ...\n",
+ bdevname(rdev0->bdev,b));
+ INIT_LIST_HEAD(&candidates);
+ ITERATE_RDEV_PENDING(rdev,tmp)
+ if (super_90_load(rdev, rdev0, 0) >= 0) {
+ printk(KERN_INFO "md: adding %s ...\n",
+ bdevname(rdev->bdev,b));
+ list_move(&rdev->same_set, &candidates);
+ }
+ /*
+ * now we have a set of devices, with all of them having
+ * mostly sane superblocks. It's time to allocate the
+ * mddev.
+ */
+ if (rdev0->preferred_minor < 0 || rdev0->preferred_minor >= MAX_MD_DEVS) {
+ printk(KERN_INFO "md: unit number in %s is bad: %d\n",
+ bdevname(rdev0->bdev, b), rdev0->preferred_minor);
+ break;
+ }
+ if (part)
+ dev = MKDEV(mdp_major,
+ rdev0->preferred_minor << MdpMinorShift);
+ else
+ dev = MKDEV(MD_MAJOR, rdev0->preferred_minor);
+
+ md_probe(dev, NULL, NULL);
+ mddev = mddev_find(dev);
+ if (!mddev) {
+ printk(KERN_ERR
+ "md: cannot allocate memory for md drive.\n");
+ break;
+ }
+ if (mddev_lock(mddev))
+ printk(KERN_WARNING "md: %s locked, cannot run\n",
+ mdname(mddev));
+ else if (mddev->raid_disks || mddev->major_version
+ || !list_empty(&mddev->disks)) {
+ printk(KERN_WARNING
+ "md: %s already running, cannot run %s\n",
+ mdname(mddev), bdevname(rdev0->bdev,b));
+ mddev_unlock(mddev);
+ } else {
+ printk(KERN_INFO "md: created %s\n", mdname(mddev));
+ ITERATE_RDEV_GENERIC(candidates,rdev,tmp) {
+ list_del_init(&rdev->same_set);
+ if (bind_rdev_to_array(rdev, mddev))
+ export_rdev(rdev);
+ }
+ autorun_array(mddev);
+ mddev_unlock(mddev);
+ }
+		/* on success, candidates will be empty, on error
+		 * they won't...
+ */
+ ITERATE_RDEV_GENERIC(candidates,rdev,tmp)
+ export_rdev(rdev);
+ mddev_put(mddev);
+ }
+ printk(KERN_INFO "md: ... autorun DONE.\n");
+}
+
+/*
+ * import RAID devices based on one partition
+ * if possible, the array gets run as well.
+ */
+
+static int autostart_array(dev_t startdev)
+{
+ char b[BDEVNAME_SIZE];
+ int err = -EINVAL, i;
+ mdp_super_t *sb = NULL;
+ mdk_rdev_t *start_rdev = NULL, *rdev;
+
+ start_rdev = md_import_device(startdev, 0, 0);
+ if (IS_ERR(start_rdev))
+ return err;
+
+
+ /* NOTE: this can only work for 0.90.0 superblocks */
+ sb = (mdp_super_t*)page_address(start_rdev->sb_page);
+ if (sb->major_version != 0 ||
+ sb->minor_version != 90 ) {
+ printk(KERN_WARNING "md: can only autostart 0.90.0 arrays\n");
+ export_rdev(start_rdev);
+ return err;
+ }
+
+ if (test_bit(Faulty, &start_rdev->flags)) {
+ printk(KERN_WARNING
+ "md: can not autostart based on faulty %s!\n",
+ bdevname(start_rdev->bdev,b));
+ export_rdev(start_rdev);
+ return err;
+ }
+ list_add(&start_rdev->same_set, &pending_raid_disks);
+
+ for (i = 0; i < MD_SB_DISKS; i++) {
+ mdp_disk_t *desc = sb->disks + i;
+ dev_t dev = MKDEV(desc->major, desc->minor);
+
+ if (!dev)
+ continue;
+ if (dev == startdev)
+ continue;
+ if (MAJOR(dev) != desc->major || MINOR(dev) != desc->minor)
+ continue;
+ rdev = md_import_device(dev, 0, 0);
+ if (IS_ERR(rdev))
+ continue;
+
+ list_add(&rdev->same_set, &pending_raid_disks);
+ }
+
+ /*
+ * possibly return codes
+ */
+ autorun_devices(0);
+ return 0;
+
+}
+
+
+static int get_version(void __user * arg)
+{
+ mdu_version_t ver;
+
+ ver.major = MD_MAJOR_VERSION;
+ ver.minor = MD_MINOR_VERSION;
+ ver.patchlevel = MD_PATCHLEVEL_VERSION;
+
+ if (copy_to_user(arg, &ver, sizeof(ver)))
+ return -EFAULT;
+
+ return 0;
+}
+
+static int get_array_info(mddev_t * mddev, void __user * arg)
+{
+ mdu_array_info_t info;
+ int nr,working,active,failed,spare;
+ mdk_rdev_t *rdev;
+ struct list_head *tmp;
+
+ nr=working=active=failed=spare=0;
+ ITERATE_RDEV(mddev,rdev,tmp) {
+ nr++;
+ if (test_bit(Faulty, &rdev->flags))
+ failed++;
+ else {
+ working++;
+ if (test_bit(In_sync, &rdev->flags))
+ active++;
+ else
+ spare++;
+ }
+ }
+
+ info.major_version = mddev->major_version;
+ info.minor_version = mddev->minor_version;
+ info.patch_version = MD_PATCHLEVEL_VERSION;
+ info.ctime = mddev->ctime;
+ info.level = mddev->level;
+ info.size = mddev->size;
+ if (info.size != mddev->size) /* overflow */
+ info.size = -1;
+ info.nr_disks = nr;
+ info.raid_disks = mddev->raid_disks;
+ info.md_minor = mddev->md_minor;
+ info.not_persistent= !mddev->persistent;
+
+ info.utime = mddev->utime;
+ info.state = 0;
+ if (mddev->in_sync)
+ info.state = (1<<MD_SB_CLEAN);
+ if (mddev->bitmap && mddev->bitmap_offset)
+ info.state = (1<<MD_SB_BITMAP_PRESENT);
+ info.active_disks = active;
+ info.working_disks = working;
+ info.failed_disks = failed;
+ info.spare_disks = spare;
+
+ info.layout = mddev->layout;
+ info.chunk_size = mddev->chunk_size;
+
+ if (copy_to_user(arg, &info, sizeof(info)))
+ return -EFAULT;
+
+ return 0;
+}
+
+static int get_bitmap_file(mddev_t * mddev, void __user * arg)
+{
+ mdu_bitmap_file_t *file = NULL; /* too big for stack allocation */
+ char *ptr, *buf = NULL;
+ int err = -ENOMEM;
+
+ file = kmalloc(sizeof(*file), GFP_KERNEL);
+ if (!file)
+ goto out;
+
+ /* bitmap disabled, zero the first byte and copy out */
+ if (!mddev->bitmap || !mddev->bitmap->file) {
+ file->pathname[0] = '\0';
+ goto copy_out;
+ }
+
+ buf = kmalloc(sizeof(file->pathname), GFP_KERNEL);
+ if (!buf)
+ goto out;
+
+ ptr = file_path(mddev->bitmap->file, buf, sizeof(file->pathname));
+ if (!ptr)
+ goto out;
+
+ strcpy(file->pathname, ptr);
+
+copy_out:
+ err = 0;
+ if (copy_to_user(arg, file, sizeof(*file)))
+ err = -EFAULT;
+out:
+ kfree(buf);
+ kfree(file);
+ return err;
+}
+
+static int get_disk_info(mddev_t * mddev, void __user * arg)
+{
+ mdu_disk_info_t info;
+ unsigned int nr;
+ mdk_rdev_t *rdev;
+
+ if (copy_from_user(&info, arg, sizeof(info)))
+ return -EFAULT;
+
+ nr = info.number;
+
+ rdev = find_rdev_nr(mddev, nr);
+ if (rdev) {
+ info.major = MAJOR(rdev->bdev->bd_dev);
+ info.minor = MINOR(rdev->bdev->bd_dev);
+ info.raid_disk = rdev->raid_disk;
+ info.state = 0;
+ if (test_bit(Faulty, &rdev->flags))
+ info.state |= (1<<MD_DISK_FAULTY);
+ else if (test_bit(In_sync, &rdev->flags)) {
+ info.state |= (1<<MD_DISK_ACTIVE);
+ info.state |= (1<<MD_DISK_SYNC);
+ }
+ if (test_bit(WriteMostly, &rdev->flags))
+ info.state |= (1<<MD_DISK_WRITEMOSTLY);
+ } else {
+ info.major = info.minor = 0;
+ info.raid_disk = -1;
+ info.state = (1<<MD_DISK_REMOVED);
+ }
+
+ if (copy_to_user(arg, &info, sizeof(info)))
+ return -EFAULT;
+
+ return 0;
+}
+
+static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info)
+{
+ char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
+ mdk_rdev_t *rdev;
+ dev_t dev = MKDEV(info->major,info->minor);
+
+ if (info->major != MAJOR(dev) || info->minor != MINOR(dev))
+ return -EOVERFLOW;
+
+ if (!mddev->raid_disks) {
+ int err;
+ /* expecting a device which has a superblock */
+ rdev = md_import_device(dev, mddev->major_version, mddev->minor_version);
+ if (IS_ERR(rdev)) {
+ printk(KERN_WARNING
+ "md: md_import_device returned %ld\n",
+ PTR_ERR(rdev));
+ return PTR_ERR(rdev);
+ }
+ if (!list_empty(&mddev->disks)) {
+ mdk_rdev_t *rdev0 = list_entry(mddev->disks.next,
+ mdk_rdev_t, same_set);
+ int err = super_types[mddev->major_version]
+ .load_super(rdev, rdev0, mddev->minor_version);
+ if (err < 0) {
+ printk(KERN_WARNING
+ "md: %s has different UUID to %s\n",
+ bdevname(rdev->bdev,b),
+ bdevname(rdev0->bdev,b2));
+ export_rdev(rdev);
+ return -EINVAL;
+ }
+ }
+ err = bind_rdev_to_array(rdev, mddev);
+ if (err)
+ export_rdev(rdev);
+ return err;
+ }
+
+ /*
+ * add_new_disk can be used once the array is assembled
+ * to add "hot spares". They must already have a superblock
+ * written
+ */
+ if (mddev->pers) {
+ int err;
+ if (!mddev->pers->hot_add_disk) {
+ printk(KERN_WARNING
+ "%s: personality does not support diskops!\n",
+ mdname(mddev));
+ return -EINVAL;
+ }
+ if (mddev->persistent)
+ rdev = md_import_device(dev, mddev->major_version,
+ mddev->minor_version);
+ else
+ rdev = md_import_device(dev, -1, -1);
+ if (IS_ERR(rdev)) {
+ printk(KERN_WARNING
+ "md: md_import_device returned %ld\n",
+ PTR_ERR(rdev));
+ return PTR_ERR(rdev);
+ }
+ /* set save_raid_disk if appropriate */
+ if (!mddev->persistent) {
+ if (info->state & (1<<MD_DISK_SYNC) &&
+ info->raid_disk < mddev->raid_disks)
+ rdev->raid_disk = info->raid_disk;
+ else
+ rdev->raid_disk = -1;
+ } else
+ super_types[mddev->major_version].
+ validate_super(mddev, rdev);
+ rdev->saved_raid_disk = rdev->raid_disk;
+
+ clear_bit(In_sync, &rdev->flags); /* just to be sure */
+ if (info->state & (1<<MD_DISK_WRITEMOSTLY))
+ set_bit(WriteMostly, &rdev->flags);
+
+ rdev->raid_disk = -1;
+ err = bind_rdev_to_array(rdev, mddev);
+ if (!err && !mddev->pers->hot_remove_disk) {
+ /* If there is hot_add_disk but no hot_remove_disk
+			 * then added disks are for geometry changes,
+ * and should be added immediately.
+ */
+ super_types[mddev->major_version].
+ validate_super(mddev, rdev);
+ err = mddev->pers->hot_add_disk(mddev, rdev);
+ if (err)
+ unbind_rdev_from_array(rdev);
+ }
+ if (err)
+ export_rdev(rdev);
+
+ set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+ md_wakeup_thread(mddev->thread);
+ return err;
+ }
+
+ /* otherwise, add_new_disk is only allowed
+ * for major_version==0 superblocks
+ */
+ if (mddev->major_version != 0) {
+ printk(KERN_WARNING "%s: ADD_NEW_DISK not supported\n",
+ mdname(mddev));
+ return -EINVAL;
+ }
+
+ if (!(info->state & (1<<MD_DISK_FAULTY))) {
+ int err;
+ rdev = md_import_device (dev, -1, 0);
+ if (IS_ERR(rdev)) {
+ printk(KERN_WARNING
+ "md: error, md_import_device() returned %ld\n",
+ PTR_ERR(rdev));
+ return PTR_ERR(rdev);
+ }
+ rdev->desc_nr = info->number;
+ if (info->raid_disk < mddev->raid_disks)
+ rdev->raid_disk = info->raid_disk;
+ else
+ rdev->raid_disk = -1;
+
+ rdev->flags = 0;
+
+ if (rdev->raid_disk < mddev->raid_disks)
+ if (info->state & (1<<MD_DISK_SYNC))
+ set_bit(In_sync, &rdev->flags);
+
+ if (info->state & (1<<MD_DISK_WRITEMOSTLY))
+ set_bit(WriteMostly, &rdev->flags);
+
+ if (!mddev->persistent) {
+ printk(KERN_INFO "md: nonpersistent superblock ...\n");
+ rdev->sb_offset = rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS;
+ } else
+ rdev->sb_offset = calc_dev_sboffset(rdev->bdev);
+ rdev->size = calc_dev_size(rdev, mddev->chunk_size);
+
+ err = bind_rdev_to_array(rdev, mddev);
+ if (err) {
+ export_rdev(rdev);
+ return err;
+ }
+ }
+
+ return 0;
+}
+
+static int hot_remove_disk(mddev_t * mddev, dev_t dev)
+{
+ char b[BDEVNAME_SIZE];
+ mdk_rdev_t *rdev;
+
+ if (!mddev->pers)
+ return -ENODEV;
+
+ rdev = find_rdev(mddev, dev);
+ if (!rdev)
+ return -ENXIO;
+
+ if (rdev->raid_disk >= 0)
+ goto busy;
+
+ kick_rdev_from_array(rdev);
+ md_update_sb(mddev);
+ md_new_event(mddev);
+
+ return 0;
+busy:
+ printk(KERN_WARNING "md: cannot remove active disk %s from %s ... \n",
+ bdevname(rdev->bdev,b), mdname(mddev));
+ return -EBUSY;
+}
+
+static int hot_add_disk(mddev_t * mddev, dev_t dev)
+{
+ char b[BDEVNAME_SIZE];
+ int err;
+ unsigned int size;
+ mdk_rdev_t *rdev;
+
+ if (!mddev->pers)
+ return -ENODEV;
+
+ if (mddev->major_version != 0) {
+ printk(KERN_WARNING "%s: HOT_ADD may only be used with"
+ " version-0 superblocks.\n",
+ mdname(mddev));
+ return -EINVAL;
+ }
+ if (!mddev->pers->hot_add_disk) {
+ printk(KERN_WARNING
+ "%s: personality does not support diskops!\n",
+ mdname(mddev));
+ return -EINVAL;
+ }
+
+ rdev = md_import_device (dev, -1, 0);
+ if (IS_ERR(rdev)) {
+ printk(KERN_WARNING
+ "md: error, md_import_device() returned %ld\n",
+ PTR_ERR(rdev));
+ return -EINVAL;
+ }
+
+ if (mddev->persistent)
+ rdev->sb_offset = calc_dev_sboffset(rdev->bdev);
+ else
+ rdev->sb_offset =
+ rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS;
+
+ size = calc_dev_size(rdev, mddev->chunk_size);
+ rdev->size = size;
+
+ if (test_bit(Faulty, &rdev->flags)) {
+ printk(KERN_WARNING
+ "md: can not hot-add faulty %s disk to %s!\n",
+ bdevname(rdev->bdev,b), mdname(mddev));
+ err = -EINVAL;
+ goto abort_export;
+ }
+ clear_bit(In_sync, &rdev->flags);
+ rdev->desc_nr = -1;
+ err = bind_rdev_to_array(rdev, mddev);
+ if (err)
+ goto abort_export;
+
+ /*
+	 * The rest had better be atomic; we can have disk failures
+ * noticed in interrupt contexts ...
+ */
+
+ if (rdev->desc_nr == mddev->max_disks) {
+ printk(KERN_WARNING "%s: can not hot-add to full array!\n",
+ mdname(mddev));
+ err = -EBUSY;
+ goto abort_unbind_export;
+ }
+
+ rdev->raid_disk = -1;
+
+ md_update_sb(mddev);
+
+ /*
+ * Kick recovery, maybe this spare has to be added to the
+ * array immediately.
+ */
+ set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+ md_wakeup_thread(mddev->thread);
+ md_new_event(mddev);
+ return 0;
+
+abort_unbind_export:
+ unbind_rdev_from_array(rdev);
+
+abort_export:
+ export_rdev(rdev);
+ return err;
+}
+
+static int set_bitmap_file(mddev_t *mddev, int fd)
+{
+ int err;
+
+ if (mddev->pers) {
+ if (!mddev->pers->quiesce)
+ return -EBUSY;
+ if (mddev->recovery || mddev->sync_thread)
+ return -EBUSY;
+ /* we should be able to change the bitmap.. */
+ }
+
+
+ if (fd >= 0) {
+ if (mddev->bitmap)
+ return -EEXIST; /* cannot add when bitmap is present */
+ mddev->bitmap_file = fget(fd);
+
+ if (mddev->bitmap_file == NULL) {
+ printk(KERN_ERR "%s: error: failed to get bitmap file\n",
+ mdname(mddev));
+ return -EBADF;
+ }
+
+ err = deny_bitmap_write_access(mddev->bitmap_file);
+ if (err) {
+ printk(KERN_ERR "%s: error: bitmap file is already in use\n",
+ mdname(mddev));
+ fput(mddev->bitmap_file);
+ mddev->bitmap_file = NULL;
+ return err;
+ }
+ mddev->bitmap_offset = 0; /* file overrides offset */
+ } else if (mddev->bitmap == NULL)
+ return -ENOENT; /* cannot remove what isn't there */
+ err = 0;
+ if (mddev->pers) {
+ mddev->pers->quiesce(mddev, 1);
+ if (fd >= 0)
+ err = bitmap_create(mddev);
+ if (fd < 0 || err) {
+ bitmap_destroy(mddev);
+ fd = -1; /* make sure to put the file */
+ }
+ mddev->pers->quiesce(mddev, 0);
+ }
+ if (fd < 0) {
+ if (mddev->bitmap_file) {
+ restore_bitmap_write_access(mddev->bitmap_file);
+ fput(mddev->bitmap_file);
+ }
+ mddev->bitmap_file = NULL;
+ }
+
+ return err;
+}
+
+/*
+ * set_array_info is used in two different ways.
+ * The original usage is when creating a new array.
+ * In this usage, raid_disks is > 0 and it, together with
+ * level, size, not_persistent, layout and chunksize, determines the
+ * shape of the array.
+ * This will always create an array with a type-0.90.0 superblock.
+ * The newer usage is when assembling an array.
+ * In this case raid_disks will be 0, and the major_version field is
+ * used to determine which style of super-blocks are to be found on the devices.
+ * The minor and patch _version numbers are also kept in case the
+ * super_block handler wishes to interpret them.
+ */
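+/*
+ * Illustrative payloads (values chosen purely for illustration): creating a
+ * fresh array might pass raid_disks=2, level=1 with size, layout and
+ * chunk_size filled in, while assembling an existing array passes
+ * raid_disks=0 and, for instance, major_version=0, minor_version=90 to pick
+ * the 0.90 superblock handler from super_types[].
+ */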
+static int set_array_info(mddev_t * mddev, mdu_array_info_t *info)
+{
+
+ if (info->raid_disks == 0) {
+ /* just setting version number for superblock loading */
+ if (info->major_version < 0 ||
+ info->major_version >= sizeof(super_types)/sizeof(super_types[0]) ||
+ super_types[info->major_version].name == NULL) {
+ /* maybe try to auto-load a module? */
+ printk(KERN_INFO
+ "md: superblock version %d not known\n",
+ info->major_version);
+ return -EINVAL;
+ }
+ mddev->major_version = info->major_version;
+ mddev->minor_version = info->minor_version;
+ mddev->patch_version = info->patch_version;
+ return 0;
+ }
+ mddev->major_version = MD_MAJOR_VERSION;
+ mddev->minor_version = MD_MINOR_VERSION;
+ mddev->patch_version = MD_PATCHLEVEL_VERSION;
+ mddev->ctime = get_seconds();
+
+ mddev->level = info->level;
+ mddev->clevel[0] = 0;
+ mddev->size = info->size;
+ mddev->raid_disks = info->raid_disks;
+ /* don't set md_minor, it is determined by which /dev/md* was
+	 * opened
+ */
+ if (info->state & (1<<MD_SB_CLEAN))
+ mddev->recovery_cp = MaxSector;
+ else
+ mddev->recovery_cp = 0;
+ mddev->persistent = ! info->not_persistent;
+ mddev->external = 0;
+
+ mddev->layout = info->layout;
+ mddev->chunk_size = info->chunk_size;
+
+ mddev->max_disks = MD_SB_DISKS;
+
+ if (mddev->persistent)
+ mddev->sb_dirty = 1;
+
+ mddev->default_bitmap_offset = MD_SB_BYTES >> 9;
+ mddev->bitmap_offset = 0;
+
+ mddev->reshape_position = MaxSector;
+
+ /*
+ * Generate a 128 bit UUID
+ */
+ get_random_bytes(mddev->uuid, 16);
+
+ mddev->new_level = mddev->level;
+ mddev->new_chunk = mddev->chunk_size;
+ mddev->new_layout = mddev->layout;
+ mddev->delta_disks = 0;
+
+ return 0;
+}
+
+static int update_size(mddev_t *mddev, unsigned long size)
+{
+ mdk_rdev_t * rdev;
+ int rv;
+ struct list_head *tmp;
+ int fit = (size == 0);
+
+ if (mddev->pers->resize == NULL)
+ return -EINVAL;
+ /* The "size" is the amount of each device that is used.
+ * This can only make sense for arrays with redundancy.
+ * linear and raid0 always use whatever space is available
+ * We can only consider changing the size if no resync
+ * or reconstruction is happening, and if the new size
+ * is acceptable. It must fit before the sb_offset or,
+ * if that is <data_offset, it must fit before the
+ * size of each device.
+ * If size is zero, we find the largest size that fits.
+ */
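+	/* Worked illustration: on a mirror whose members offer, say, 80GB
+	 * and 100GB of usable space, update_size(mddev, 0) settles on
+	 * roughly 80GB per device (the largest size that fits on every
+	 * member), while asking for more than any one member can hold
+	 * returns -ENOSPC.  "size" is in KB per device.
+	 */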
+ if (mddev->sync_thread)
+ return -EBUSY;
+ ITERATE_RDEV(mddev,rdev,tmp) {
+ sector_t avail;
+ if (rdev->sb_offset > rdev->data_offset)
+ avail = (rdev->sb_offset*2) - rdev->data_offset;
+ else
+ avail = get_capacity(rdev->bdev->bd_disk)
+ - rdev->data_offset;
+ if (fit && (size == 0 || size > avail/2))
+ size = avail/2;
+ if (avail < ((sector_t)size << 1))
+ return -ENOSPC;
+ }
+ rv = mddev->pers->resize(mddev, (sector_t)size *2);
+ if (!rv) {
+ struct block_device *bdev;
+
+ bdev = bdget_disk(mddev->gendisk, 0);
+ if (bdev) {
+ mutex_lock(&bdev->bd_inode->i_mutex);
+ i_size_write(bdev->bd_inode, (loff_t)mddev->array_size << 10);
+ mutex_unlock(&bdev->bd_inode->i_mutex);
+ bdput(bdev);
+ }
+ }
+ return rv;
+}
+
+static int update_raid_disks(mddev_t *mddev, int raid_disks)
+{
+ int rv;
+ /* change the number of raid disks */
+ if (mddev->pers->check_reshape == NULL)
+ return -EINVAL;
+ if (raid_disks <= 0 ||
+ raid_disks >= mddev->max_disks)
+ return -EINVAL;
+ if (mddev->sync_thread || mddev->reshape_position != MaxSector)
+ return -EBUSY;
+ mddev->delta_disks = raid_disks - mddev->raid_disks;
+
+ rv = mddev->pers->check_reshape(mddev);
+ return rv;
+}
+
+
+/*
+ * update_array_info is used to change the configuration of an
+ * on-line array.
+ * The version, ctime, level, size, raid_disks, not_persistent, layout and chunk_size
+ * fields in the info are checked against the array.
+ * Any differences that cannot be handled will cause an error.
+ * Normally, only one change can be managed at a time.
+ */
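+/*
+ * Only one of size, raid_disks, layout or bitmap-presence may differ from
+ * the current configuration per call, so e.g. growing raid_disks and adding
+ * an internal bitmap needs two separate SET_ARRAY_INFO ioctls.
+ */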
+static int update_array_info(mddev_t *mddev, mdu_array_info_t *info)
+{
+ int rv = 0;
+ int cnt = 0;
+ int state = 0;
+
+	/* calculate expected state, ignoring low bits */
+ if (mddev->bitmap && mddev->bitmap_offset)
+ state |= (1 << MD_SB_BITMAP_PRESENT);
+
+ if (mddev->major_version != info->major_version ||
+ mddev->minor_version != info->minor_version ||
+/* mddev->patch_version != info->patch_version || */
+ mddev->ctime != info->ctime ||
+ mddev->level != info->level ||
+/* mddev->layout != info->layout || */
+ !mddev->persistent != info->not_persistent||
+ mddev->chunk_size != info->chunk_size ||
+ /* ignore bottom 8 bits of state, and allow SB_BITMAP_PRESENT to change */
+ ((state^info->state) & 0xfffffe00)
+ )
+ return -EINVAL;
+ /* Check there is only one change */
+ if (info->size >= 0 && mddev->size != info->size) cnt++;
+ if (mddev->raid_disks != info->raid_disks) cnt++;
+ if (mddev->layout != info->layout) cnt++;
+ if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) cnt++;
+ if (cnt == 0) return 0;
+ if (cnt > 1) return -EINVAL;
+
+ if (mddev->layout != info->layout) {
+ /* Change layout
+ * we don't need to do anything at the md level, the
+ * personality will take care of it all.
+ */
+ if (mddev->pers->reconfig == NULL)
+ return -EINVAL;
+ else
+ return mddev->pers->reconfig(mddev, info->layout, -1);
+ }
+ if (info->size >= 0 && mddev->size != info->size)
+ rv = update_size(mddev, info->size);
+
+ if (mddev->raid_disks != info->raid_disks)
+ rv = update_raid_disks(mddev, info->raid_disks);
+
+ if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) {
+ if (mddev->pers->quiesce == NULL)
+ return -EINVAL;
+ if (mddev->recovery || mddev->sync_thread)
+ return -EBUSY;
+ if (info->state & (1<<MD_SB_BITMAP_PRESENT)) {
+ /* add the bitmap */
+ if (mddev->bitmap)
+ return -EEXIST;
+ if (mddev->default_bitmap_offset == 0)
+ return -EINVAL;
+ mddev->bitmap_offset = mddev->default_bitmap_offset;
+ mddev->pers->quiesce(mddev, 1);
+ rv = bitmap_create(mddev);
+ if (rv)
+ bitmap_destroy(mddev);
+ mddev->pers->quiesce(mddev, 0);
+ } else {
+ /* remove the bitmap */
+ if (!mddev->bitmap)
+ return -ENOENT;
+ if (mddev->bitmap->file)
+ return -EINVAL;
+ mddev->pers->quiesce(mddev, 1);
+ bitmap_destroy(mddev);
+ mddev->pers->quiesce(mddev, 0);
+ mddev->bitmap_offset = 0;
+ }
+ }
+ md_update_sb(mddev);
+ return rv;
+}
+
+static int set_disk_faulty(mddev_t *mddev, dev_t dev)
+{
+ mdk_rdev_t *rdev;
+
+ if (mddev->pers == NULL)
+ return -ENODEV;
+
+ rdev = find_rdev(mddev, dev);
+ if (!rdev)
+ return -ENODEV;
+
+ md_error(mddev, rdev);
+ return 0;
+}
+
+static int md_getgeo(struct block_device *bdev, struct hd_geometry *geo)
+{
+ mddev_t *mddev = bdev->bd_disk->private_data;
+
+ geo->heads = 2;
+ geo->sectors = 4;
+ geo->cylinders = get_capacity(mddev->gendisk) / 8;
+ return 0;
+}
+
+static int md_ioctl(struct inode *inode, struct file *file,
+ unsigned int cmd, unsigned long arg)
+{
+ int err = 0;
+ void __user *argp = (void __user *)arg;
+ mddev_t *mddev = NULL;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EACCES;
+
+ /*
+ * Commands dealing with the RAID driver but not any
+ * particular array:
+ */
+ switch (cmd)
+ {
+ case RAID_VERSION:
+ err = get_version(argp);
+ goto done;
+
+ case PRINT_RAID_DEBUG:
+ err = 0;
+ md_print_devices();
+ goto done;
+
+#ifndef MODULE
+ case RAID_AUTORUN:
+ err = 0;
+ autostart_arrays(arg);
+ goto done;
+#endif
+ default:;
+ }
+
+ /*
+ * Commands creating/starting a new array:
+ */
+
+ mddev = inode->i_bdev->bd_disk->private_data;
+
+ if (!mddev) {
+ BUG();
+ goto abort;
+ }
+
+
+ if (cmd == START_ARRAY) {
+ /* START_ARRAY doesn't need to lock the array as autostart_array
+ * does the locking, and it could even be a different array
+ */
+ static int cnt = 3;
+ if (cnt > 0 ) {
+ printk(KERN_WARNING
+ "md: %s(pid %d) used deprecated START_ARRAY ioctl. "
+ "This will not be supported beyond July 2006\n",
+ current->comm, current->pid);
+ cnt--;
+ }
+ err = autostart_array(new_decode_dev(arg));
+ if (err) {
+ printk(KERN_WARNING "md: autostart failed!\n");
+ goto abort;
+ }
+ goto done;
+ }
+
+ err = mddev_lock(mddev);
+ if (err) {
+ printk(KERN_INFO
+ "md: ioctl lock interrupted, reason %d, cmd %d\n",
+ err, cmd);
+ goto abort;
+ }
+
+ switch (cmd)
+ {
+ case SET_ARRAY_INFO:
+ {
+ mdu_array_info_t info;
+ if (!arg)
+ memset(&info, 0, sizeof(info));
+ else if (copy_from_user(&info, argp, sizeof(info))) {
+ err = -EFAULT;
+ goto abort_unlock;
+ }
+ if (mddev->pers) {
+ err = update_array_info(mddev, &info);
+ if (err) {
+ printk(KERN_WARNING "md: couldn't update"
+ " array info. %d\n", err);
+ goto abort_unlock;
+ }
+ goto done_unlock;
+ }
+ if (!list_empty(&mddev->disks)) {
+ printk(KERN_WARNING
+ "md: array %s already has disks!\n",
+ mdname(mddev));
+ err = -EBUSY;
+ goto abort_unlock;
+ }
+ if (mddev->raid_disks) {
+ printk(KERN_WARNING
+ "md: array %s already initialised!\n",
+ mdname(mddev));
+ err = -EBUSY;
+ goto abort_unlock;
+ }
+ err = set_array_info(mddev, &info);
+ if (err) {
+ printk(KERN_WARNING "md: couldn't set"
+ " array info. %d\n", err);
+ goto abort_unlock;
+ }
+ }
+ goto done_unlock;
+
+ default:;
+ }
+
+ /*
+ * Commands querying/configuring an existing array:
+ */
+ /* if we are not initialised yet, only ADD_NEW_DISK, STOP_ARRAY,
+ * RUN_ARRAY, and SET_BITMAP_FILE are allowed */
+ if (!mddev->raid_disks && cmd != ADD_NEW_DISK && cmd != STOP_ARRAY
+ && cmd != RUN_ARRAY && cmd != SET_BITMAP_FILE) {
+ err = -ENODEV;
+ goto abort_unlock;
+ }
+
+ /*
+ * Commands even a read-only array can execute:
+ */
+ switch (cmd)
+ {
+ case GET_ARRAY_INFO:
+ err = get_array_info(mddev, argp);
+ goto done_unlock;
+
+ case GET_BITMAP_FILE:
+ err = get_bitmap_file(mddev, argp);
+ goto done_unlock;
+
+ case GET_DISK_INFO:
+ err = get_disk_info(mddev, argp);
+ goto done_unlock;
+
+ case RESTART_ARRAY_RW:
+ err = restart_array(mddev);
+ goto done_unlock;
+
+ case STOP_ARRAY:
+ err = do_md_stop (mddev, 0);
+ goto done_unlock;
+
+ case STOP_ARRAY_RO:
+ err = do_md_stop (mddev, 1);
+ goto done_unlock;
+
+ /*
+	 * We have a problem here: there is no easy way to give a CHS
+	 * virtual geometry. We currently pretend that we have 2 heads and
+	 * 4 sectors (with a BIG number of cylinders...). This drives
+ * dosfs just mad... ;-)
+ */
+ }
+
+ /*
+ * The remaining ioctls are changing the state of the
+ * superblock, so we do not allow them on read-only arrays.
+ * However non-MD ioctls (e.g. get-size) will still come through
+ * here and hit the 'default' below, so only disallow
+ * 'md' ioctls, and switch to rw mode if started auto-readonly.
+ */
+ if (_IOC_TYPE(cmd) == MD_MAJOR &&
+ mddev->ro && mddev->pers) {
+ if (mddev->ro == 2) {
+ mddev->ro = 0;
+ set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+ md_wakeup_thread(mddev->thread);
+
+ } else {
+ err = -EROFS;
+ goto abort_unlock;
+ }
+ }
+
+ switch (cmd)
+ {
+ case ADD_NEW_DISK:
+ {
+ mdu_disk_info_t info;
+ if (copy_from_user(&info, argp, sizeof(info)))
+ err = -EFAULT;
+ else
+ err = add_new_disk(mddev, &info);
+ goto done_unlock;
+ }
+
+ case HOT_REMOVE_DISK:
+ err = hot_remove_disk(mddev, new_decode_dev(arg));
+ goto done_unlock;
+
+ case HOT_ADD_DISK:
+ err = hot_add_disk(mddev, new_decode_dev(arg));
+ goto done_unlock;
+
+ case SET_DISK_FAULTY:
+ err = set_disk_faulty(mddev, new_decode_dev(arg));
+ goto done_unlock;
+
+ case RUN_ARRAY:
+ err = do_md_run (mddev);
+ goto done_unlock;
+
+ case SET_BITMAP_FILE:
+ err = set_bitmap_file(mddev, (int)arg);
+ goto done_unlock;
+
+ default:
+ err = -EINVAL;
+ goto abort_unlock;
+ }
+
+done_unlock:
+abort_unlock:
+ mddev_unlock(mddev);
+
+ return err;
+done:
+ if (err)
+ MD_BUG();
+abort:
+ return err;
+}
+
+static int md_open(struct inode *inode, struct file *file)
+{
+ /*
+ * Succeed if we can lock the mddev, which confirms that
+ * it isn't being stopped right now.
+ */
+ mddev_t *mddev = inode->i_bdev->bd_disk->private_data;
+ int err;
+
+ if ((err = mddev_lock(mddev)))
+ goto out;
+
+ err = 0;
+ mddev_get(mddev);
+ mddev_unlock(mddev);
+
+ check_disk_change(inode->i_bdev);
+ out:
+ return err;
+}
+
+static int md_release(struct inode *inode, struct file * file)
+{
+ mddev_t *mddev = inode->i_bdev->bd_disk->private_data;
+
+ if (!mddev)
+ BUG();
+ mddev_put(mddev);
+
+ return 0;
+}
+
+static int md_media_changed(struct gendisk *disk)
+{
+ mddev_t *mddev = disk->private_data;
+
+ return mddev->changed;
+}
+
+static int md_revalidate(struct gendisk *disk)
+{
+ mddev_t *mddev = disk->private_data;
+
+ mddev->changed = 0;
+ return 0;
+}
+static struct block_device_operations md_fops =
+{
+ .owner = THIS_MODULE,
+ .open = md_open,
+ .release = md_release,
+ .ioctl = md_ioctl,
+ .getgeo = md_getgeo,
+ .media_changed = md_media_changed,
+ .revalidate_disk= md_revalidate,
+};
+
+static int md_thread(void * arg)
+{
+ mdk_thread_t *thread = arg;
+
+ /*
+	 * md_thread is a 'system-thread'; its priority should be very
+ * high. We avoid resource deadlocks individually in each
+ * raid personality. (RAID5 does preallocation) We also use RR and
+ * the very same RT priority as kswapd, thus we will never get
+ * into a priority inversion deadlock.
+ *
+	 * We definitely have to have equal or higher priority than
+ * bdflush, otherwise bdflush will deadlock if there are too
+ * many dirty RAID5 blocks.
+ */
+
+ allow_signal(SIGKILL);
+ while (!kthread_should_stop()) {
+
+ /* We need to wait INTERRUPTIBLE so that
+ * we don't add to the load-average.
+ * That means we need to be sure no signals are
+ * pending
+ */
+ if (signal_pending(current))
+ flush_signals(current);
+
+ wait_event_interruptible_timeout
+ (thread->wqueue,
+ test_bit(THREAD_WAKEUP, &thread->flags)
+ || kthread_should_stop(),
+ thread->timeout);
+ try_to_freeze();
+
+ clear_bit(THREAD_WAKEUP, &thread->flags);
+
+ thread->run(thread->mddev);
+ }
+
+ return 0;
+}
+
+void md_wakeup_thread(mdk_thread_t *thread)
+{
+ if (thread) {
+ dprintk("md: waking up MD thread %s.\n", thread->tsk->comm);
+ set_bit(THREAD_WAKEUP, &thread->flags);
+ wake_up(&thread->wqueue);
+ }
+}
+
+mdk_thread_t *md_register_thread(void (*run) (mddev_t *), mddev_t *mddev,
+ const char *name)
+{
+ mdk_thread_t *thread;
+
+ thread = kzalloc(sizeof(mdk_thread_t), GFP_KERNEL);
+ if (!thread)
+ return NULL;
+
+ init_waitqueue_head(&thread->wqueue);
+
+ thread->run = run;
+ thread->mddev = mddev;
+ thread->timeout = MAX_SCHEDULE_TIMEOUT;
+ thread->tsk = kthread_run(md_thread, thread, name, mdname(thread->mddev));
+ if (IS_ERR(thread->tsk)) {
+ kfree(thread);
+ return NULL;
+ }
+ return thread;
+}
+
+void md_unregister_thread(mdk_thread_t *thread)
+{
+ dprintk("interrupting MD-thread pid %d\n", thread->tsk->pid);
+
+ kthread_stop(thread->tsk);
+ kfree(thread);
+}
+
+void md_error(mddev_t *mddev, mdk_rdev_t *rdev)
+{
+ if (!mddev) {
+ MD_BUG();
+ return;
+ }
+
+ if (!rdev || test_bit(Faulty, &rdev->flags))
+ return;
+/*
+ dprintk("md_error dev:%s, rdev:(%d:%d), (caller: %p,%p,%p,%p).\n",
+ mdname(mddev),
+ MAJOR(rdev->bdev->bd_dev), MINOR(rdev->bdev->bd_dev),
+ __builtin_return_address(0),__builtin_return_address(1),
+ __builtin_return_address(2),__builtin_return_address(3));
+*/
+ if (!mddev->pers->error_handler)
+ return;
+ mddev->pers->error_handler(mddev,rdev);
+ set_bit(MD_RECOVERY_INTR, &mddev->recovery);
+ set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+ md_wakeup_thread(mddev->thread);
+ md_new_event_inintr(mddev);
+}
+
+/* seq_file implementation for /proc/mdstat */
+
+static void status_unused(struct seq_file *seq)
+{
+ int i = 0;
+ mdk_rdev_t *rdev;
+ struct list_head *tmp;
+
+ seq_printf(seq, "unused devices: ");
+
+ ITERATE_RDEV_PENDING(rdev,tmp) {
+ char b[BDEVNAME_SIZE];
+ i++;
+ seq_printf(seq, "%s ",
+ bdevname(rdev->bdev,b));
+ }
+ if (!i)
+ seq_printf(seq, "<none>");
+
+ seq_printf(seq, "\n");
+}
+
+
+static void status_resync(struct seq_file *seq, mddev_t * mddev)
+{
+ sector_t max_blocks, resync, res;
+ unsigned long dt, db, rt;
+ int scale;
+ unsigned int per_milli;
+
+ resync = (mddev->curr_resync - atomic_read(&mddev->recovery_active))/2;
+
+ if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
+ max_blocks = mddev->resync_max_sectors >> 1;
+ else
+ max_blocks = mddev->size;
+
+ /*
+ * Should not happen.
+ */
+ if (!max_blocks) {
+ MD_BUG();
+ return;
+ }
+ /* Pick 'scale' such that (resync>>scale)*1000 will fit
+ * in a sector_t, and (max_blocks>>scale) will fit in a
+ * u32, as those are the requirements for sector_div.
+ * Thus 'scale' must be at least 10
+ */
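+	/* Worked illustration: on a 32-bit kernel with a 64-bit sector_t
+	 * and, say, max_blocks = 2^36, the loop below leaves scale at 10
+	 * because max_blocks/2 = 2^35 is below 2^(10+32), and then
+	 * (max_blocks>>10)+1 fits in the u32 that sector_div() requires.
+	 */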
+ scale = 10;
+ if (sizeof(sector_t) > sizeof(unsigned long)) {
+ while ( max_blocks/2 > (1ULL<<(scale+32)))
+ scale++;
+ }
+ res = (resync>>scale)*1000;
+ sector_div(res, (u32)((max_blocks>>scale)+1));
+
+ per_milli = res;
+ {
+ int i, x = per_milli/50, y = 20-x;
+ seq_printf(seq, "[");
+ for (i = 0; i < x; i++)
+ seq_printf(seq, "=");
+ seq_printf(seq, ">");
+ for (i = 0; i < y; i++)
+ seq_printf(seq, ".");
+ seq_printf(seq, "] ");
+ }
+ seq_printf(seq, " %s =%3u.%u%% (%llu/%llu)",
+ (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)?
+ "reshape" :
+ (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ?
+ "resync" : "recovery")),
+ per_milli/10, per_milli % 10,
+ (unsigned long long) resync,
+ (unsigned long long) max_blocks);
+
+ /*
+ * We do not want to overflow, so the order of operands and
+ * the * 100 / 100 trick are important. We do a +1 to be
+ * safe against division by zero. We only estimate anyway.
+ *
+ * dt: time from mark until now
+ * db: blocks written from mark until now
+ * rt: remaining time
+ */
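+	/* Rough illustration: 30 seconds after the mark (dt=30) with 60000
+	 * blocks written (db=60000) and 1200000 blocks still to go, rt comes
+	 * out near 600 seconds, printed as roughly ten minutes, and the
+	 * speed line reports about 2000K/sec.
+	 */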
+ dt = ((jiffies - mddev->resync_mark) / HZ);
+ if (!dt) dt++;
+ db = resync - (mddev->resync_mark_cnt/2);
+ rt = (dt * ((unsigned long)(max_blocks-resync) / (db/100+1)))/100;
+
+ seq_printf(seq, " finish=%lu.%lumin", rt / 60, (rt % 60)/6);
+
+ seq_printf(seq, " speed=%ldK/sec", db/dt);
+}
+
+static void *md_seq_start(struct seq_file *seq, loff_t *pos)
+{
+ struct list_head *tmp;
+ loff_t l = *pos;
+ mddev_t *mddev;
+
+ if (l >= 0x10000)
+ return NULL;
+ if (!l--)
+ /* header */
+ return (void*)1;
+
+ spin_lock(&all_mddevs_lock);
+ list_for_each(tmp,&all_mddevs)
+ if (!l--) {
+ mddev = list_entry(tmp, mddev_t, all_mddevs);
+ mddev_get(mddev);
+ spin_unlock(&all_mddevs_lock);
+ return mddev;
+ }
+ spin_unlock(&all_mddevs_lock);
+ if (!l--)
+ return (void*)2;/* tail */
+ return NULL;
+}
+
+static void *md_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ struct list_head *tmp;
+ mddev_t *next_mddev, *mddev = v;
+
+ ++*pos;
+ if (v == (void*)2)
+ return NULL;
+
+ spin_lock(&all_mddevs_lock);
+ if (v == (void*)1)
+ tmp = all_mddevs.next;
+ else
+ tmp = mddev->all_mddevs.next;
+ if (tmp != &all_mddevs)
+ next_mddev = mddev_get(list_entry(tmp,mddev_t,all_mddevs));
+ else {
+ next_mddev = (void*)2;
+ *pos = 0x10000;
+ }
+ spin_unlock(&all_mddevs_lock);
+
+ if (v != (void*)1)
+ mddev_put(mddev);
+ return next_mddev;
+
+}
+
+static void md_seq_stop(struct seq_file *seq, void *v)
+{
+ mddev_t *mddev = v;
+
+ if (mddev && v != (void*)1 && v != (void*)2)
+ mddev_put(mddev);
+}
+
+struct mdstat_info {
+ int event;
+};
+
+static int md_seq_show(struct seq_file *seq, void *v)
+{
+ mddev_t *mddev = v;
+ sector_t size;
+ struct list_head *tmp2;
+ mdk_rdev_t *rdev;
+ struct mdstat_info *mi = seq->private;
+ struct bitmap *bitmap;
+
+ if (v == (void*)1) {
+ struct mdk_personality *pers;
+ seq_printf(seq, "Personalities : ");
+ spin_lock(&pers_lock);
+ list_for_each_entry(pers, &pers_list, list)
+ seq_printf(seq, "[%s] ", pers->name);
+
+ spin_unlock(&pers_lock);
+ seq_printf(seq, "\n");
+ mi->event = atomic_read(&md_event_count);
+ return 0;
+ }
+ if (v == (void*)2) {
+ status_unused(seq);
+ return 0;
+ }
+
+ if (mddev_lock(mddev) < 0)
+ return -EINTR;
+
+ if (mddev->pers || mddev->raid_disks || !list_empty(&mddev->disks)) {
+ seq_printf(seq, "%s : %sactive", mdname(mddev),
+ mddev->pers ? "" : "in");
+ if (mddev->pers) {
+ if (mddev->ro==1)
+ seq_printf(seq, " (read-only)");
+ if (mddev->ro==2)
+ seq_printf(seq, "(auto-read-only)");
+ seq_printf(seq, " %s", mddev->pers->name);
+ }
+
+ size = 0;
+ ITERATE_RDEV(mddev,rdev,tmp2) {
+ char b[BDEVNAME_SIZE];
+ seq_printf(seq, " %s[%d]",
+ bdevname(rdev->bdev,b), rdev->desc_nr);
+ if (test_bit(WriteMostly, &rdev->flags))
+ seq_printf(seq, "(W)");
+ if (test_bit(Faulty, &rdev->flags)) {
+ seq_printf(seq, "(F)");
+ continue;
+ } else if (rdev->raid_disk < 0)
+ seq_printf(seq, "(S)"); /* spare */
+ size += rdev->size;
+ }
+
+ if (!list_empty(&mddev->disks)) {
+ if (mddev->pers)
+ seq_printf(seq, "\n %llu blocks",
+ (unsigned long long)mddev->array_size);
+ else
+ seq_printf(seq, "\n %llu blocks",
+ (unsigned long long)size);
+ }
+ if (mddev->persistent) {
+ if (mddev->major_version != 0 ||
+ mddev->minor_version != 90) {
+ seq_printf(seq," super %d.%d",
+ mddev->major_version,
+ mddev->minor_version);
+ }
+ } else
+ seq_printf(seq, " super non-persistent");
+
+ if (mddev->pers) {
+ mddev->pers->status (seq, mddev);
+ seq_printf(seq, "\n ");
+ if (mddev->pers->sync_request) {
+ if (mddev->curr_resync > 2) {
+ status_resync (seq, mddev);
+ seq_printf(seq, "\n ");
+ } else if (mddev->curr_resync == 1 || mddev->curr_resync == 2)
+ seq_printf(seq, "\tresync=DELAYED\n ");
+ else if (mddev->recovery_cp < MaxSector)
+ seq_printf(seq, "\tresync=PENDING\n ");
+ }
+ } else
+ seq_printf(seq, "\n ");
+
+ if ((bitmap = mddev->bitmap)) {
+ unsigned long chunk_kb;
+ unsigned long flags;
+ spin_lock_irqsave(&bitmap->lock, flags);
+ chunk_kb = bitmap->chunksize >> 10;
+ seq_printf(seq, "bitmap: %lu/%lu pages [%luKB], "
+ "%lu%s chunk",
+ bitmap->pages - bitmap->missing_pages,
+ bitmap->pages,
+ (bitmap->pages - bitmap->missing_pages)
+ << (PAGE_SHIFT - 10),
+ chunk_kb ? chunk_kb : bitmap->chunksize,
+ chunk_kb ? "KB" : "B");
+ if (bitmap->file) {
+ seq_printf(seq, ", file: ");
+ seq_path(seq, bitmap->file->f_vfsmnt,
+ bitmap->file->f_dentry," \t\n");
+ }
+
+ seq_printf(seq, "\n");
+ spin_unlock_irqrestore(&bitmap->lock, flags);
+ }
+
+ seq_printf(seq, "\n");
+ }
+ mddev_unlock(mddev);
+
+ return 0;
+}
+
+static struct seq_operations md_seq_ops = {
+ .start = md_seq_start,
+ .next = md_seq_next,
+ .stop = md_seq_stop,
+ .show = md_seq_show,
+};
+
+static int md_seq_open(struct inode *inode, struct file *file)
+{
+ int error;
+ struct mdstat_info *mi = kmalloc(sizeof(*mi), GFP_KERNEL);
+ if (mi == NULL)
+ return -ENOMEM;
+
+ error = seq_open(file, &md_seq_ops);
+ if (error)
+ kfree(mi);
+ else {
+ struct seq_file *p = file->private_data;
+ p->private = mi;
+ mi->event = atomic_read(&md_event_count);
+ }
+ return error;
+}
+
+static int md_seq_release(struct inode *inode, struct file *file)
+{
+ struct seq_file *m = file->private_data;
+ struct mdstat_info *mi = m->private;
+ m->private = NULL;
+ kfree(mi);
+ return seq_release(inode, file);
+}
+
+static unsigned int mdstat_poll(struct file *filp, poll_table *wait)
+{
+ struct seq_file *m = filp->private_data;
+ struct mdstat_info *mi = m->private;
+ int mask;
+
+ poll_wait(filp, &md_event_waiters, wait);
+
+ /* always allow read */
+ mask = POLLIN | POLLRDNORM;
+
+ if (mi->event != atomic_read(&md_event_count))
+ mask |= POLLERR | POLLPRI;
+ return mask;
+}
+
+static struct file_operations md_seq_fops = {
+ .open = md_seq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = md_seq_release,
+ .poll = mdstat_poll,
+};
+
+int register_md_personality(struct mdk_personality *p)
+{
+ spin_lock(&pers_lock);
+ list_add_tail(&p->list, &pers_list);
+ printk(KERN_INFO "md: %s personality registered for level %d\n", p->name, p->level);
+ spin_unlock(&pers_lock);
+ return 0;
+}
+
+int unregister_md_personality(struct mdk_personality *p)
+{
+ printk(KERN_INFO "md: %s personality unregistered\n", p->name);
+ spin_lock(&pers_lock);
+ list_del_init(&p->list);
+ spin_unlock(&pers_lock);
+ return 0;
+}
+
+static int is_mddev_idle(mddev_t *mddev)
+{
+ mdk_rdev_t * rdev;
+ struct list_head *tmp;
+ int idle;
+ unsigned long curr_events;
+
+ idle = 1;
+ ITERATE_RDEV(mddev,rdev,tmp) {
+ struct gendisk *disk = rdev->bdev->bd_contains->bd_disk;
+ curr_events = disk_stat_read(disk, sectors[0]) +
+ disk_stat_read(disk, sectors[1]) -
+ atomic_read(&disk->sync_io);
+ /* The difference between curr_events and last_events
+ * will be affected by any new non-sync IO (making
+ * curr_events bigger) and any difference in the amount of
+		 * in-flight sync IO (making curr_events bigger or smaller).
+ * The amount in-flight is currently limited to
+ * 32*64K in raid1/10 and 256*PAGE_SIZE in raid5/6
+ * which is at most 4096 sectors.
+ * These numbers are fairly fragile and should be made
+ * more robust, probably by enforcing the
+ * 'window size' that md_do_sync sort-of uses.
+ *
+ * Note: the following is an unsigned comparison.
+ */
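+		/* In effect, a drift of up to about 4096 sectors in either
+		 * direction since the last check counts as idle; anything
+		 * larger, positive or negative (thanks to the unsigned wrap),
+		 * marks the array as busy.
+		 */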
+ if ((curr_events - rdev->last_events + 4096) > 8192) {
+ rdev->last_events = curr_events;
+ idle = 0;
+ }
+ }
+ return idle;
+}
+
+void md_done_sync(mddev_t *mddev, int blocks, int ok)
+{
+	/* another "blocks" (512-byte) blocks have been synced */
+ atomic_sub(blocks, &mddev->recovery_active);
+ wake_up(&mddev->recovery_wait);
+ if (!ok) {
+ set_bit(MD_RECOVERY_ERR, &mddev->recovery);
+ md_wakeup_thread(mddev->thread);
+ // stop recovery, signal do_sync ....
+ }
+}
+
+
+/* md_write_start(mddev, bi)
+ * If we need to update some array metadata (e.g. 'active' flag
+ * in superblock) before writing, schedule a superblock update
+ * and wait for it to complete.
+ */
+void md_write_start(mddev_t *mddev, struct bio *bi)
+{
+ if (bio_data_dir(bi) != WRITE)
+ return;
+
+ BUG_ON(mddev->ro == 1);
+ if (mddev->ro == 2) {
+ /* need to switch to read/write */
+ mddev->ro = 0;
+ set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+ md_wakeup_thread(mddev->thread);
+ }
+ atomic_inc(&mddev->writes_pending);
+ if (mddev->in_sync) {
+ spin_lock_irq(&mddev->write_lock);
+ if (mddev->in_sync) {
+ mddev->in_sync = 0;
+ mddev->sb_dirty = 3;
+ md_wakeup_thread(mddev->thread);
+ }
+ spin_unlock_irq(&mddev->write_lock);
+ }
+ wait_event(mddev->sb_wait, mddev->sb_dirty==0);
+}
+
+void md_write_end(mddev_t *mddev)
+{
+ if (atomic_dec_and_test(&mddev->writes_pending)) {
+ if (mddev->safemode == 2)
+ md_wakeup_thread(mddev->thread);
+ else if (mddev->safemode_delay)
+ mod_timer(&mddev->safemode_timer, jiffies + mddev->safemode_delay);
+ }
+}
+
+static DECLARE_WAIT_QUEUE_HEAD(resync_wait);
+
+#define SYNC_MARKS 10
+#define SYNC_MARK_STEP (3*HZ)
+void md_do_sync(mddev_t *mddev)
+{
+ mddev_t *mddev2;
+ unsigned int currspeed = 0,
+ window;
+ sector_t max_sectors,j, io_sectors;
+ unsigned long mark[SYNC_MARKS];
+ sector_t mark_cnt[SYNC_MARKS];
+ int last_mark,m;
+ struct list_head *tmp;
+ sector_t last_check;
+ int skipped = 0;
+ struct list_head *rtmp;
+ mdk_rdev_t *rdev;
+
+	/* just in case thread restarts... */
+ if (test_bit(MD_RECOVERY_DONE, &mddev->recovery))
+ return;
+ if (mddev->ro) /* never try to sync a read-only array */
+ return;
+
+ /* we overload curr_resync somewhat here.
+ * 0 == not engaged in resync at all
+ * 2 == checking that there is no conflict with another sync
+ * 1 == like 2, but have yielded to allow conflicting resync to
+	 *    commence
+ * other == active in resync - this many blocks
+ *
+ * Before starting a resync we must have set curr_resync to
+ * 2, and then checked that every "conflicting" array has curr_resync
+ * less than ours. When we find one that is the same or higher
+ * we wait on resync_wait. To avoid deadlock, we reduce curr_resync
+ * to 1 if we choose to yield (based arbitrarily on address of mddev structure).
+ * This will mean we have to start checking from the beginning again.
+ *
+ */
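+	/* Illustrative scenario: two arrays built from partitions of the
+	 * same disk both want to resync; both set curr_resync to 2, the one
+	 * with the lower mddev address yields by dropping to 1 and waiting
+	 * on resync_wait, and once the other finishes it starts checking
+	 * again from the top.
+	 */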
+
+ do {
+ mddev->curr_resync = 2;
+
+ try_again:
+ if (kthread_should_stop()) {
+ set_bit(MD_RECOVERY_INTR, &mddev->recovery);
+ goto skip;
+ }
+ ITERATE_MDDEV(mddev2,tmp) {
+ if (mddev2 == mddev)
+ continue;
+ if (mddev2->curr_resync &&
+ match_mddev_units(mddev,mddev2)) {
+ DEFINE_WAIT(wq);
+ if (mddev < mddev2 && mddev->curr_resync == 2) {
+ /* arbitrarily yield */
+ mddev->curr_resync = 1;
+ wake_up(&resync_wait);
+ }
+ if (mddev > mddev2 && mddev->curr_resync == 1)
+ /* no need to wait here, we can wait the next
+ * time 'round when curr_resync == 2
+ */
+ continue;
+ prepare_to_wait(&resync_wait, &wq, TASK_UNINTERRUPTIBLE);
+ if (!kthread_should_stop() &&
+ mddev2->curr_resync >= mddev->curr_resync) {
+ printk(KERN_INFO "md: delaying resync of %s"
+ " until %s has finished resync (they"
+ " share one or more physical units)\n",
+ mdname(mddev), mdname(mddev2));
+ mddev_put(mddev2);
+ schedule();
+ finish_wait(&resync_wait, &wq);
+ goto try_again;
+ }
+ finish_wait(&resync_wait, &wq);
+ }
+ }
+ } while (mddev->curr_resync < 2);
+
+ j = 0;
+ if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
+ /* resync follows the size requested by the personality,
+ * which defaults to physical size, but can be virtual size
+ */
+ max_sectors = mddev->resync_max_sectors;
+ mddev->resync_mismatches = 0;
+ /* we don't use the checkpoint if there's a bitmap */
+ if (!mddev->bitmap &&
+ !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
+ j = mddev->recovery_cp;
+ } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
+ max_sectors = mddev->size << 1;
+ else {
+ /* recovery follows the physical size of devices */
+ max_sectors = mddev->size << 1;
+ j = MaxSector;
+ ITERATE_RDEV(mddev,rdev,rtmp)
+ if (rdev->raid_disk >= 0 &&
+ !test_bit(Faulty, &rdev->flags) &&
+ !test_bit(In_sync, &rdev->flags) &&
+ rdev->recovery_offset < j)
+ j = rdev->recovery_offset;
+ }
+
+ printk(KERN_INFO "md: syncing RAID array %s\n", mdname(mddev));
+ printk(KERN_INFO "md: minimum _guaranteed_ reconstruction speed:"
+ " %d KB/sec/disc.\n", speed_min(mddev));
+ printk(KERN_INFO "md: using maximum available idle IO bandwidth "
+ "(but not more than %d KB/sec) for reconstruction.\n",
+ speed_max(mddev));
+
+ is_mddev_idle(mddev); /* this also initializes IO event counters */
+
+ io_sectors = 0;
+ for (m = 0; m < SYNC_MARKS; m++) {
+ mark[m] = jiffies;
+ mark_cnt[m] = io_sectors;
+ }
+ last_mark = 0;
+ mddev->resync_mark = mark[last_mark];
+ mddev->resync_mark_cnt = mark_cnt[last_mark];
+
+ /*
+ * Tune reconstruction:
+ */
+ window = 32*(PAGE_SIZE/512);
+ printk(KERN_INFO "md: using %dk window, over a total of %llu blocks.\n",
+ window/2,(unsigned long long) max_sectors/2);
+
+ atomic_set(&mddev->recovery_active, 0);
+ init_waitqueue_head(&mddev->recovery_wait);
+ last_check = 0;
+
+ if (j>2) {
+ printk(KERN_INFO
+ "md: resuming recovery of %s from checkpoint.\n",
+ mdname(mddev));
+ mddev->curr_resync = j;
+ }
+
+ while (j < max_sectors) {
+ sector_t sectors;
+
+ skipped = 0;
+ if (j >= mddev->resync_max) {
+ sysfs_notify(&mddev->kobj, NULL, "sync_completed");
+ wait_event(mddev->recovery_wait,
+ mddev->resync_max > j || kthread_should_stop());
+ }
+ if (kthread_should_stop())
+ goto interrupted;
+ sectors = mddev->pers->sync_request(mddev, j, &skipped,
+ currspeed < speed_min(mddev));
+ if (sectors == 0) {
+ set_bit(MD_RECOVERY_ERR, &mddev->recovery);
+ goto out;
+ }
+
+ if (!skipped) { /* actual IO requested */
+ io_sectors += sectors;
+ atomic_add(sectors, &mddev->recovery_active);
+ }
+
+ j += sectors;
+ if (j>1) mddev->curr_resync = j;
+ if (last_check == 0)
+			/* this is the earliest that rebuild will be
+			 * visible in /proc/mdstat
+ */
+ md_new_event(mddev);
+
+ if (last_check + window > io_sectors || j == max_sectors)
+ continue;
+
+ last_check = io_sectors;
+
+ if (test_bit(MD_RECOVERY_INTR, &mddev->recovery) ||
+ test_bit(MD_RECOVERY_ERR, &mddev->recovery))
+ break;
+
+ repeat:
+ if (time_after_eq(jiffies, mark[last_mark] + SYNC_MARK_STEP )) {
+ /* step marks */
+ int next = (last_mark+1) % SYNC_MARKS;
+
+ mddev->resync_mark = mark[next];
+ mddev->resync_mark_cnt = mark_cnt[next];
+ mark[next] = jiffies;
+ mark_cnt[next] = io_sectors - atomic_read(&mddev->recovery_active);
+ last_mark = next;
+ }
+
+
+ if (kthread_should_stop())
+ goto interrupted;
+
+
+ /*
+		 * this loop exits only when we are either slower than
+		 * the 'hard' speed limit, or the system was IO-idle for
+		 * a jiffy.
+		 * The system might be non-idle CPU-wise, but we only care
+		 * about not overloading the IO subsystem (things like an
+		 * e2fsck being done on the RAID array should execute fast).
+ */
+ mddev->queue->unplug_fn(mddev->queue);
+ cond_resched();
+
+ currspeed = ((unsigned long)(io_sectors-mddev->resync_mark_cnt))/2
+ /((jiffies-mddev->resync_mark)/HZ +1) +1;
+
+ if (currspeed > speed_min(mddev)) {
+ if ((currspeed > speed_max(mddev)) ||
+ !is_mddev_idle(mddev)) {
+ msleep(500);
+ goto repeat;
+ }
+ }
+ }
+ printk(KERN_INFO "md: %s: sync done.\n",mdname(mddev));
+ /*
+ * this also signals 'finished resyncing' to md_stop
+ */
+ out:
+ mddev->queue->unplug_fn(mddev->queue);
+
+ wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active));
+
+ /* tell personality that we are finished */
+ mddev->pers->sync_request(mddev, max_sectors, &skipped, 1);
+
+ if (!test_bit(MD_RECOVERY_ERR, &mddev->recovery) &&
+ test_bit(MD_RECOVERY_SYNC, &mddev->recovery) &&
+ !test_bit(MD_RECOVERY_CHECK, &mddev->recovery) &&
+ mddev->curr_resync > 2) {
+ if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
+ if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
+ if (mddev->curr_resync >= mddev->recovery_cp) {
+ printk(KERN_INFO
+ "md: checkpointing recovery of %s.\n",
+ mdname(mddev));
+ mddev->recovery_cp = mddev->curr_resync;
+ }
+ } else
+ mddev->recovery_cp = MaxSector;
+ } else {
+ if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery))
+ mddev->curr_resync = MaxSector;
+ ITERATE_RDEV(mddev,rdev,rtmp)
+ if (rdev->raid_disk >= 0 &&
+ !test_bit(Faulty, &rdev->flags) &&
+ !test_bit(In_sync, &rdev->flags) &&
+ rdev->recovery_offset < mddev->curr_resync)
+ rdev->recovery_offset = mddev->curr_resync;
+ }
+ }
+
+ skip:
+ mddev->curr_resync = 0;
+ mddev->resync_max = MaxSector;
+ sysfs_notify(&mddev->kobj, NULL, "sync_completed");
+ wake_up(&resync_wait);
+ set_bit(MD_RECOVERY_DONE, &mddev->recovery);
+ md_wakeup_thread(mddev->thread);
+ return;
+
+ interrupted:
+ /*
+ * got a signal, exit.
+ */
+ printk(KERN_INFO
+ "md: md_do_sync() got signal ... exiting\n");
+ set_bit(MD_RECOVERY_INTR, &mddev->recovery);
+ goto out;
+
+}
+EXPORT_SYMBOL_GPL(md_do_sync);
+
+
+/*
+ * This routine is regularly called by all per-raid-array threads to
+ * deal with generic issues like resync and super-block update.
+ * Raid personalities that don't have a thread (linear/raid0) do not
+ * need this as they never do any recovery or update the superblock.
+ *
+ * It does not do any resync itself, but rather "forks" off other threads
+ * to do that as needed.
+ * When it is determined that resync is needed, we set MD_RECOVERY_RUNNING in
+ * "->recovery" and create a thread at ->sync_thread.
+ * When the thread finishes it sets MD_RECOVERY_DONE (and might set MD_RECOVERY_ERR)
+ * and wakes up this thread, which will reap the sync thread and finish up.
+ * This thread also removes any faulty devices (with nr_pending == 0).
+ *
+ * The overall approach is:
+ * 1/ if the superblock needs updating, update it.
+ * 2/ If a recovery thread is running, don't do anything else.
+ * 3/ If recovery has finished, clean up, possibly marking spares active.
+ * 4/ If there are any faulty devices, remove them.
+ * 5/ If array is degraded, try to add spare devices
+ * 6/ If array has spares or is not in-sync, start a resync thread.
+ */
+void md_check_recovery(mddev_t *mddev)
+{
+ mdk_rdev_t *rdev;
+ struct list_head *rtmp;
+
+
+ if (mddev->bitmap)
+ bitmap_daemon_work(mddev->bitmap);
+
+ if (mddev->ro)
+ return;
+
+ if (signal_pending(current)) {
+ if (mddev->pers->sync_request) {
+ printk(KERN_INFO "md: %s in immediate safe mode\n",
+ mdname(mddev));
+ mddev->safemode = 2;
+ }
+ flush_signals(current);
+ }
+
+ if ( ! (
+ (mddev->sb_dirty && !mddev->external) ||
+ test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) ||
+ test_bit(MD_RECOVERY_DONE, &mddev->recovery) ||
+ (mddev->safemode == 1) ||
+ (mddev->safemode == 2 && ! atomic_read(&mddev->writes_pending)
+ && !mddev->in_sync && mddev->recovery_cp == MaxSector)
+ ))
+ return;
+
+ if (mddev_trylock(mddev)) {
+ int spares =0;
+
+ spin_lock_irq(&mddev->write_lock);
+ if (mddev->safemode && !atomic_read(&mddev->writes_pending) &&
+ !mddev->in_sync && mddev->recovery_cp == MaxSector) {
+ mddev->in_sync = 1;
+ if (mddev->persistent)
+ mddev->sb_dirty = 3;
+ }
+ if (mddev->safemode == 1)
+ mddev->safemode = 0;
+ spin_unlock_irq(&mddev->write_lock);
+
+ if (mddev->sb_dirty)
+ md_update_sb(mddev);
+
+
+ if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) &&
+ !test_bit(MD_RECOVERY_DONE, &mddev->recovery)) {
+ /* resync/recovery still happening */
+ clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+ goto unlock;
+ }
+ if (mddev->sync_thread) {
+ /* resync has finished, collect result */
+ md_unregister_thread(mddev->sync_thread);
+ mddev->sync_thread = NULL;
+ if (!test_bit(MD_RECOVERY_ERR, &mddev->recovery) &&
+ !test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
+ /* success...*/
+ /* activate any spares */
+ mddev->pers->spare_active(mddev);
+ }
+ md_update_sb(mddev);
+
+			/* if array is no longer degraded, then any saved_raid_disk
+ * information must be scrapped
+ */
+ if (!mddev->degraded)
+ ITERATE_RDEV(mddev,rdev,rtmp)
+ rdev->saved_raid_disk = -1;
+
+ mddev->recovery = 0;
+ /* flag recovery needed just to double check */
+ set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+ md_new_event(mddev);
+ goto unlock;
+ }
+ /* Clear some bits that don't mean anything, but
+ * might be left set
+ */
+ clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+ clear_bit(MD_RECOVERY_ERR, &mddev->recovery);
+ clear_bit(MD_RECOVERY_INTR, &mddev->recovery);
+ clear_bit(MD_RECOVERY_DONE, &mddev->recovery);
+
+ if (test_bit(MD_RECOVERY_FROZEN, &mddev->recovery))
+ goto unlock;
+ /* no recovery is running.
+ * remove any failed drives, then
+ * add spares if possible.
+		 * Spares are also removed and re-added, to allow
+ * the personality to fail the re-add.
+ */
+ ITERATE_RDEV(mddev,rdev,rtmp)
+ if (rdev->raid_disk >= 0 &&
+ (test_bit(Faulty, &rdev->flags) || ! test_bit(In_sync, &rdev->flags)) &&
+ atomic_read(&rdev->nr_pending)==0) {
+ if (mddev->pers->hot_remove_disk(mddev, rdev->raid_disk)==0) {
+ char nm[20];
+ sprintf(nm,"rd%d", rdev->raid_disk);
+ sysfs_remove_link(&mddev->kobj, nm);
+ rdev->raid_disk = -1;
+ }
+ }
+
+ if (mddev->degraded) {
+ ITERATE_RDEV(mddev,rdev,rtmp)
+ if (rdev->raid_disk < 0
+ && !test_bit(Faulty, &rdev->flags)) {
+ rdev->recovery_offset = 0;
+ if (mddev->pers->hot_add_disk(mddev,rdev)) {
+ char nm[20];
+ sprintf(nm, "rd%d", rdev->raid_disk);
+ sysfs_create_link(&mddev->kobj, &rdev->kobj, nm);
+ spares++;
+ md_new_event(mddev);
+ } else
+ break;
+ }
+ }
+
+ if (spares) {
+ clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
+ clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
+ } else if (mddev->recovery_cp < MaxSector) {
+ set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
+ } else if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
+ /* nothing to be done ... */
+ goto unlock;
+
+ if (mddev->pers->sync_request) {
+ set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
+ if (spares && mddev->bitmap && ! mddev->bitmap->file) {
+ /* We are adding a device or devices to an array
+ * which has the bitmap stored on all devices.
+ * So make sure all bitmap pages get written
+ */
+ bitmap_write_all(mddev->bitmap);
+ }
+ mddev->sync_thread = md_register_thread(md_do_sync,
+ mddev,
+ "%s_resync");
+ if (!mddev->sync_thread) {
+ printk(KERN_ERR "%s: could not start resync"
+ " thread...\n",
+ mdname(mddev));
+ /* leave the spares where they are, it shouldn't hurt */
+ mddev->recovery = 0;
+ } else
+ md_wakeup_thread(mddev->sync_thread);
+ md_new_event(mddev);
+ }
+ unlock:
+ mddev_unlock(mddev);
+ }
+}
+
+static int md_notify_reboot(struct notifier_block *this,
+ unsigned long code, void *x)
+{
+ struct list_head *tmp;
+ mddev_t *mddev;
+
+ if ((code == SYS_DOWN) || (code == SYS_HALT) || (code == SYS_POWER_OFF)) {
+
+ printk(KERN_INFO "md: stopping all md devices.\n");
+
+ ITERATE_MDDEV(mddev,tmp)
+ if (mddev_trylock(mddev)) {
+ do_md_stop (mddev, 1);
+ mddev_unlock(mddev);
+ }
+ /*
+		 * certain more exotic SCSI devices are known to be
+		 * volatile with respect to overly early system reboots.
+		 * While the right place to handle this issue is the
+		 * individual driver, we do want to have a safe RAID driver ...
+ */
+ mdelay(1000*1);
+ }
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block md_notifier = {
+ .notifier_call = md_notify_reboot,
+ .next = NULL,
+ .priority = INT_MAX, /* before any real devices */
+};
+
+static void md_geninit(void)
+{
+ struct proc_dir_entry *p;
+
+ dprintk("md: sizeof(mdp_super_t) = %d\n", (int)sizeof(mdp_super_t));
+
+ p = create_proc_entry("mdstat", S_IRUGO, NULL);
+ if (p)
+ p->proc_fops = &md_seq_fops;
+}
+
+static int __init md_init(void)
+{
+ int minor;
+
+ printk(KERN_INFO "md: md driver %d.%d.%d MAX_MD_DEVS=%d,"
+ " MD_SB_DISKS=%d\n",
+ MD_MAJOR_VERSION, MD_MINOR_VERSION,
+ MD_PATCHLEVEL_VERSION, MAX_MD_DEVS, MD_SB_DISKS);
+ printk(KERN_INFO "md: bitmap version %d.%d\n", BITMAP_MAJOR_HI,
+ BITMAP_MINOR);
+
+ if (register_blkdev(MAJOR_NR, "md"))
+ return -1;
+ if ((mdp_major=register_blkdev(0, "mdp"))<=0) {
+ unregister_blkdev(MAJOR_NR, "md");
+ return -1;
+ }
+ devfs_mk_dir("md");
+ blk_register_region(MKDEV(MAJOR_NR, 0), MAX_MD_DEVS, THIS_MODULE,
+ md_probe, NULL, NULL);
+ blk_register_region(MKDEV(mdp_major, 0), MAX_MD_DEVS<<MdpMinorShift, THIS_MODULE,
+ md_probe, NULL, NULL);
+
+ for (minor=0; minor < MAX_MD_DEVS; ++minor)
+ devfs_mk_bdev(MKDEV(MAJOR_NR, minor),
+ S_IFBLK|S_IRUSR|S_IWUSR,
+ "md/%d", minor);
+
+ for (minor=0; minor < MAX_MD_DEVS; ++minor)
+ devfs_mk_bdev(MKDEV(mdp_major, minor<<MdpMinorShift),
+ S_IFBLK|S_IRUSR|S_IWUSR,
+ "md/mdp%d", minor);
+
+
+ register_reboot_notifier(&md_notifier);
+ raid_table_header = register_sysctl_table(raid_root_table, 1);
+
+ md_geninit();
+ return (0);
+}
+
+
+#ifndef MODULE
+
+/*
+ * Searches all registered partitions for autorun RAID arrays
+ * at boot time.
+ */
+static dev_t detected_devices[128];
+static int dev_cnt;
+
+void md_autodetect_dev(dev_t dev)
+{
+ if (dev_cnt >= 0 && dev_cnt < 127)
+ detected_devices[dev_cnt++] = dev;
+}
+
+
+static void autostart_arrays(int part)
+{
+ mdk_rdev_t *rdev;
+ int i;
+
+ printk(KERN_INFO "md: Autodetecting RAID arrays.\n");
+
+ for (i = 0; i < dev_cnt; i++) {
+ dev_t dev = detected_devices[i];
+
+ rdev = md_import_device(dev,0, 0);
+ if (IS_ERR(rdev))
+ continue;
+
+ if (test_bit(Faulty, &rdev->flags)) {
+ MD_BUG();
+ continue;
+ }
+ list_add(&rdev->same_set, &pending_raid_disks);
+ }
+ dev_cnt = 0;
+
+ autorun_devices(part);
+}
+
+#endif
+
+static __exit void md_exit(void)
+{
+ mddev_t *mddev;
+ struct list_head *tmp;
+ int i;
+ blk_unregister_region(MKDEV(MAJOR_NR,0), MAX_MD_DEVS);
+ blk_unregister_region(MKDEV(mdp_major,0), MAX_MD_DEVS << MdpMinorShift);
+ for (i=0; i < MAX_MD_DEVS; i++)
+ devfs_remove("md/%d", i);
+ for (i=0; i < MAX_MD_DEVS; i++)
+ devfs_remove("md/d%d", i);
+
+ devfs_remove("md");
+
+ unregister_blkdev(MAJOR_NR,"md");
+ unregister_blkdev(mdp_major, "mdp");
+ unregister_reboot_notifier(&md_notifier);
+ unregister_sysctl_table(raid_table_header);
+ remove_proc_entry("mdstat", NULL);
+ ITERATE_MDDEV(mddev,tmp) {
+ struct gendisk *disk = mddev->gendisk;
+ if (!disk)
+ continue;
+ export_array(mddev);
+ del_gendisk(disk);
+ put_disk(disk);
+ mddev->gendisk = NULL;
+ mddev_put(mddev);
+ }
+}
+
+module_init(md_init)
+module_exit(md_exit)
+
+static int get_ro(char *buffer, struct kernel_param *kp)
+{
+ return sprintf(buffer, "%d", start_readonly);
+}
+static int set_ro(const char *val, struct kernel_param *kp)
+{
+ char *e;
+ int num = simple_strtoul(val, &e, 10);
+ if (*val && (*e == '\0' || *e == '\n')) {
+ start_readonly = num;
+ return 0;
+ }
+ return -EINVAL;
+}
+
+module_param_call(start_ro, set_ro, get_ro, NULL, S_IRUSR|S_IWUSR);
+module_param(start_dirty_degraded, int, S_IRUGO|S_IWUSR);
+
+
+EXPORT_SYMBOL(register_md_personality);
+EXPORT_SYMBOL(unregister_md_personality);
+EXPORT_SYMBOL(md_error);
+EXPORT_SYMBOL(md_done_sync);
+EXPORT_SYMBOL(md_write_start);
+EXPORT_SYMBOL(md_write_end);
+EXPORT_SYMBOL(md_register_thread);
+EXPORT_SYMBOL(md_unregister_thread);
+EXPORT_SYMBOL(md_wakeup_thread);
+EXPORT_SYMBOL(md_check_recovery);
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("md");
+MODULE_ALIAS_BLOCKDEV_MAJOR(MD_MAJOR);
diff --git a/demo/vpatch.c b/demo/vpatch.c
new file mode 100644
index 0000000..fae8714
--- /dev/null
+++ b/demo/vpatch.c
@@ -0,0 +1,668 @@
+
+/*
+ * vpatch - visual front end for wiggle
+ *
+ * "files" display, lists all files with Statistics
+ * - can hide various lines including subdirectories
+ * and files without wiggles or conflicts
+ * "diff" display shows merged file with different parts
+ * in different colours
+ * - untouched are pale A_DIM
+ * - matched/remaining are regular A_NORMAL
+ *   - matched/removed are red/underlined   A_UNDERLINE
+ *   - unmatched in file are A_STANDOUT
+ * - unmatched in patch are A_STANDOUT|A_UNDERLINE ???
+ * - inserted are inverse/green ?? A_REVERSE
+ *
+ * The window can be split horizontally or vertically and two different
+ * views displayed. They will have different parts missing
+ *
+ * So a display of NORMAL, underline, standout|underline reverse
+ * should show a normal patch.
+ *
+ */
+
+#include "wiggle.h"
+#include <malloc.h>
+#include <string.h>
+#include <curses.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <signal.h>
+
+#define assert(x) do { if (!(x)) abort(); } while (0)
+
+struct plist {
+ char *file;
+ unsigned int start, end;
+ int parent;
+ int next, prev, last;
+ int open;
+ int chunks, wiggles, conflicts;
+};
+
+struct plist *patch_add_file(struct plist *pl, int *np, char *file,
+ unsigned int start, unsigned int end)
+{
+	/* allocated size of pl grows as 0, 16, then doubling powers of two */
+ int n = *np;
+ int asize;
+
+/* printf("adding %s at %d: %u %u\n", file, n, start, end); */
+ if (n==0) asize = 0;
+ else if (n<=16) asize = 16;
+ else if ((n&(n-1))==0) asize = n;
+ else asize = n+1; /* not accurate, but not too large */
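+	/* Worked illustration: at n=16 the guess asize=16 is not bigger than
+	 * n, so the array is doubled to 32 entries; at n=20 the guess n+1=21
+	 * exceeds n and no reallocation happens, which is safe because the
+	 * earlier doubling already left room for 32.
+	 */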
+ if (asize <= n) {
+ /* need to extend array */
+ struct plist *npl;
+ if (asize < 16) asize = 16;
+ else asize += asize;
+ npl = realloc(pl, asize * sizeof(struct plist));
+ if (!npl) {
+ fprintf(stderr, "malloc failed - skipping %s\n", file);
+ return pl;
+ }
+ pl = npl;
+ }
+ pl[n].file = file;
+ pl[n].start = start;
+ pl[n].end = end;
+ pl[n].last = pl[n].next = pl[n].prev = pl[n].parent = -1;
+ pl[n].chunks = pl[n].wiggles = pl[n].conflicts = 0;
+ pl[n].open = 1;
+ *np = n+1;
+ return pl;
+}
+
+
+
+struct plist *parse_patch(FILE *f, FILE *of, int *np)
+{
+ /* read a multi-file patch from 'f' and record relevant
+ * details in a plist.
+	 * if 'of' is not NULL, 'f' might not be seekable, so we copy
+	 * what we read to 'of' and use ftell on 'of' to determine positions
+ */
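+	/* Illustrative input: for a patch with a header pair "--- a/foo.c" /
+	 * "+++ b/foo.c" followed by hunks, one plist entry is recorded with
+	 * file "b/foo.c" and a [start,end) range spanning the hunk text up
+	 * to the next "--- " header.
+	 */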
+ struct plist *plist = NULL;
+
+ while (!feof(f)) {
+ /* first, find the start of a patch: "\n+++ "
+ * grab the file name and scan to the end of a line
+ */
+ char *target="\n+++ ";
+ char *target2="\n--- ";
+ char *pos = target;
+ int c;
+ char name[1024];
+ unsigned start, end;
+
+ while (*pos && (c=fgetc(f)) != EOF ) {
+ if (of) fputc(c, of);
+ if (c == *pos)
+ pos++;
+ else pos = target;
+ }
+ if (c == EOF)
+ break;
+ assert(c == ' ');
+ /* now read a file name */
+ pos = name;
+ while ((c=fgetc(f)) != EOF && c != '\t' && c != '\n' && c != ' ' &&
+ pos - name < 1023) {
+ *pos++ = c;
+ if (of) fputc(c, of);
+ }
+ *pos = 0;
+ if (c == EOF)
+ break;
+ if (of) fputc(c, of);
+ while (c != '\n' && (c=fgetc(f)) != EOF) {
+ if (of) fputc(c, of);
+ }
+ start = of ? ftell(of) : ftell(f);
+
+ if (c == EOF) break;
+
+ /* now skip to end - "\n--- " */
+ pos = target2+1;
+
+ while (*pos && (c=fgetc(f)) != EOF) {
+ if (of) fputc(c, of);
+ if (c == *pos)
+ pos++;
+ else pos = target2;
+ }
+ if (pos > target2) {
+ end = of ? ftell(of) : ftell(f);
+ end -= (pos - target2) - 1;
+ plist = patch_add_file(plist, np,
+ strdup(name), start, end);
+ }
+ }
+ return plist;
+}
+void die()
+{
+ fprintf(stderr,"vpatch: fatal error\n");
+ abort();
+ exit(3);
+}
+
+
+static struct stream load_segment(FILE *f,
+ unsigned int start, unsigned int end)
+{
+ struct stream s;
+ s.len = end - start;
+ s.body = malloc(s.len);
+ if (s.body) {
+ fseek(f, start, 0);
+ if (fread(s.body, 1, s.len, f) != s.len) {
+ free(s.body);
+ s.body = NULL;
+ }
+ } else
+ die();
+ return s;
+}
+
+
+void catch(int sig)
+{
+ if (sig == SIGINT) {
+ signal(sig, catch);
+ return;
+ }
+ nocbreak();nl();endwin();
+ printf("Died on signal %d\n", sig);
+ exit(2);
+}
+
+int pl_cmp(const void *av, const void *bv)
+{
+ const struct plist *a = av;
+ const struct plist *b = bv;
+ return strcmp(a->file, b->file);
+}
+
+int common_depth(char *a, char *b)
+{
+	/* find number of path segments that these two have
+ * in common
+ */
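+	/* e.g. common_depth("fs/ext3/inode.c", "fs/ext3/super.c") is 2,
+	 * while common_depth("fs/ext3/inode.c", "drivers/md/md.c") is 0.
+	 */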
+ int depth = 0;
+ while(1) {
+ char *c;
+ int al, bl;
+ c = strchr(a, '/');
+ if (c) al = c-a; else al = strlen(a);
+ c = strchr(b, '/');
+ if (c) bl = c-b; else bl = strlen(b);
+ if (al == 0 || al != bl || strncmp(a,b,al) != 0)
+ return depth;
+ a+= al;
+ while (*a=='/') a++;
+ b+= bl;
+ while(*b=='/') b++;
+
+ depth++;
+ }
+}
+
+struct plist *add_dir(struct plist *pl, int *np, char *file, char *curr)
+{
+ /* any parent of file that is not a parent of curr
+ * needs to be added to pl
+ */
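+	/* Example: with curr initially empty and file "drivers/md/md.c",
+	 * the directory entries "drivers" and "drivers/md" are appended
+	 * (with start == end == 0 so they are treated as directories), and
+	 * curr ends up holding the full path.
+	 */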
+ int d = common_depth(file, curr);
+ char *buf = curr;
+ while (d) {
+ char *c = strchr(file, '/');
+ int l;
+ if (c) l = c-file; else l = strlen(file);
+ file += l;
+ curr += l;
+ while (*file == '/') file++;
+ while (*curr == '/') curr++;
+ d--;
+ }
+ while (*file) {
+ if (curr > buf && curr[-1] != '/')
+ *curr++ = '/';
+ while (*file && *file != '/')
+ *curr++ = *file++;
+ while (*file == '/') *file++;
+ *curr = '\0';
+ if (*file)
+ pl = patch_add_file(pl, np, strdup(buf),
+ 0, 0);
+ }
+ return pl;
+}
+
+struct plist *sort_patches(struct plist *pl, int *np)
+{
+ /* sort the patches, add directory names, and re-sort */
+ char curr[1024];
+ char *prev;
+ int parents[100];
+ int prevnode[100];
+ int i, n;
+ qsort(pl, *np, sizeof(struct plist), pl_cmp);
+ curr[0] = 0;
+ n = *np;
+ for (i=0; i<n; i++)
+ pl = add_dir(pl, np, pl[i].file, curr);
+
+ qsort(pl, *np, sizeof(struct plist), pl_cmp);
+
+ /* array is now stable, so set up parent pointers */
+ n = *np;
+ curr[0] = 0;
+ prevnode[0] = -1;
+ prev = "";
+ for (i=0; i<n; i++) {
+ int d = common_depth(prev, pl[i].file);
+ if (d == 0)
+ pl[i].parent = -1;
+ else {
+ pl[i].parent = parents[d-1];
+ pl[pl[i].parent].last = i;
+ }
+ pl[i].prev = prevnode[d];
+ if (pl[i].prev > -1)
+ pl[pl[i].prev].next = i;
+ prev = pl[i].file;
+ parents[d] = i;
+ prevnode[d] = i;
+ prevnode[d+1] = -1;
+ }
+ return pl;
+}
+
+int get_prev(int pos, struct plist *pl, int n)
+{
+ if (pos == -1) return pos;
+ if (pl[pos].prev == -1)
+ return pl[pos].parent;
+ pos = pl[pos].prev;
+ while (pl[pos].open &&
+ pl[pos].last >= 0)
+ pos = pl[pos].last;
+ return pos;
+}
+
+int get_next(int pos, struct plist *pl, int n)
+{
+ if (pos == -1) return pos;
+ if (pl[pos].open) {
+ if (pos +1 < n)
+ return pos+1;
+ else
+ return -1;
+ }
+ while (pos >= 0 && pl[pos].next == -1)
+ pos = pl[pos].parent;
+ if (pos >= 0)
+ pos = pl[pos].next;
+ return pos;
+}
+
+void draw_one(int row, struct plist *pl)
+{
+ char hdr[10];
+ hdr[0] = 0;
+
+ if (pl == NULL) {
+ move(row,0);
+ clrtoeol();
+ return;
+ }
+ if (pl->chunks > 99)
+ strcpy(hdr, "XX");
+ else sprintf(hdr, "%02d", pl->chunks);
+ if (pl->wiggles > 99)
+ strcpy(hdr, " XX");
+ else sprintf(hdr+2, " %02d", pl->wiggles);
+ if (pl->conflicts > 99)
+ strcpy(hdr, " XX");
+ else sprintf(hdr+5, " %02d ", pl->conflicts);
+ if (pl->end)
+ strcpy(hdr+9, "= ");
+ else if (pl->open)
+ strcpy(hdr+9, "+ ");
+ else strcpy(hdr+9, "- ");
+
+ mvaddstr(row, 0, hdr);
+ mvaddstr(row, 11, pl->file);
+ clrtoeol();
+}
+
+void addword(struct elmnt e)
+{
+ addnstr(e.start, e.len);
+}
+
+void diff_window(struct plist *p, FILE *f)
+{
+ /*
+ * I wonder what to display here ....
+ */
+ struct stream s;
+ struct stream s1, s2;
+ struct file f1, f2;
+ struct csl *csl;
+ char buf[100];
+ int ch;
+ s = load_segment(f, p->start, p->end);
+ ch = split_patch(s, &s1, &s2);
+
+ clear();
+ sprintf(buf, "Chunk count: %d\n", ch);
+ mvaddstr(1,1,buf); clrtoeol();
+
+
+ f1 = split_stream(s1, ByWord, 0);
+ f2 = split_stream(s2, ByWord, 0);
+
+ csl = diff(f1, f2);
+
+ /* now try to display the diff highlighted */
+ int sol = 1;
+ int a=0, b=0;
+
+ while(a<f1.elcnt || b < f2.elcnt) {
+ if (a < csl->a) {
+ if (sol) {
+ int a1;
+				/* if we remove a whole line, output -line,
+ * else clear sol and retry
+ */
+ sol = 0;
+ for (a1=a; a1<csl->a; a1++)
+ if (f1.list[a1].start[0] == '\n') {
+ sol = 1;
+ break;
+ }
+ if (sol) {
+ addch('-');
+ attron(A_UNDERLINE);
+ for (; a<csl->a; a++) {
+ addword(f1.list[a]);
+ if (f1.list[a].start[0] == '\n') {
+ a++;
+ break;
+ }
+ }
+ attroff(A_UNDERLINE);
+ } else addch('|');
+ }
+ if (!sol) {
+ attron(A_UNDERLINE);
+ do {
+ if (sol) {
+ attroff(A_UNDERLINE);
+ addch('|');
+ attron(A_UNDERLINE);
+ }
+ addword(f1.list[a]);
+ sol = (f1.list[a].start[0] == '\n');
+ a++;
+ } while (a < csl->a);
+ attroff(A_UNDERLINE);
+ if (sol) addch('|');
+ sol = 0;
+ }
+ } else if (b < csl->b) {
+ if (sol) {
+ int b1;
+ sol = 0;
+ for (b1=b; b1<csl->b; b1++)
+ if (f2.list[b1].start[0] == '\n') {
+ sol = 1;
+ break;
+ }
+ if (sol) {
+ addch('+');
+ attron(A_BOLD);
+ for (; b<csl->b; b++) {
+ addword(f2.list[b]);
+ if (f2.list[b].start[0] == '\n') {
+ b++;
+ break;
+ }
+ }
+ attroff(A_BOLD);
+ } else addch('|');
+ }
+ if (!sol) {
+ attron(A_BOLD);
+ do {
+ if (sol) {
+ attroff(A_BOLD);
+ addch('|');
+ attron(A_BOLD);
+ }
+ addword(f2.list[b]);
+ sol = (f2.list[b].start[0] == '\n');
+ b++;
+ } while (b < csl->b);
+ attroff(A_BOLD);
+ if (sol) addch('|');
+ sol = 0;
+ }
+ } else {
+ if (sol) {
+ int a1;
+ sol = 0;
+ for (a1=a; a1<csl->a+csl->len; a1++)
+ if (f1.list[a1].start[0] == '\n')
+ sol = 1;
+ if (sol) {
+ if (f1.list[a].start[0]) {
+ addch(' ');
+ for (; a< csl->a+csl->len; a++,b++) {
+ addword(f1.list[a]);
+ if (f1.list[a].start[0]=='\n') {
+ a++,b++;
+ break;
+ }
+ }
+ } else {
+ addstr("SEP\n");
+ a++; b++;
+ }
+ } else addch('|');
+ }
+ if (!sol) {
+ addword(f1.list[a]);
+ if (f1.list[a].start[0] == '\n')
+ sol = 1;
+ a++;
+ b++;
+ }
+ if (a >= csl->a+csl->len)
+ csl++;
+ }
+ }
+
+
+ getch();
+
+ free(s1.body);
+ free(s2.body);
+ free(f1.list);
+ free(f2.list);
+}
+
+void main_window(struct plist *pl, int n, FILE *f)
+{
+ /* The main window lists all files together with summary information:
+ * number of chunks, number of wiggles, number of conflicts.
+ * The list is scrollable
+	 * When an entry is 'selected', we switch to the 'file' window
+ * The list can be condensed by removing files with no conflict
+ * or no wiggles, or removing subdirectories
+ *
+ * We record which file in the list is 'current', and which
+ * screen line it is on. We try to keep things stable while
+ * moving.
+ *
+ * Counts are printed before the name using at most 2 digits.
+ * Numbers greater than 99 are XX
+ * Ch Wi Co File
+ * 27 5 1 drivers/md/md.c
+ *
+	 * A directory shows the sum over all its children.
+ *
+ * Commands:
+ * select: enter, space, mouseclick
+ * on file, go to file window
+ * on directory, toggle open
+ * up: k, p, control-p uparrow
+ * Move to previous open object
+ * down: j, n, control-n, downarrow
+ * Move to next open object
+ *
+ */
+ int pos=0; /* position in file */
+ int row=1; /* position on screen */
+ int rows; /* size of screen in rows */
+ int cols;
+ int tpos, i;
+ int refresh = 2;
+ int c;
+
+ while(1) {
+ if (refresh == 2) {
+ clear();
+ attron(A_BOLD);
+ mvaddstr(0,0,"Ch Wi Co Patched Files");
+ move(2,0);
+ attroff(A_BOLD);
+ refresh = 1;
+ }
+ if (row <1 || row >= rows)
+ refresh = 1;
+ if (refresh) {
+ refresh = 0;
+ getmaxyx(stdscr, rows, cols);
+ if (row >= rows +3)
+ row = (rows+1)/2;
+ if (row >= rows)
+ row = rows-1;
+ tpos = pos;
+ for (i=row; i>1; i--) {
+ tpos = get_prev(tpos, pl, n);
+ if (tpos == -1) {
+ row = row - i + 1;
+ break;
+ }
+ }
+ /* Ok, row and pos could be trustworthy now */
+ tpos = pos;
+ for (i=row; i>=1; i--) {
+ draw_one(i, &pl[tpos]);
+ tpos = get_prev(tpos, pl, n);
+ }
+ tpos = pos;
+ for (i=row+1; i<rows; i++) {
+ tpos = get_next(tpos, pl, n);
+ if (tpos >= 0)
+ draw_one(i, &pl[tpos]);
+ else
+ draw_one(i, NULL);
+ }
+ }
+ move(row, 9);
+ c = getch();
+ switch(c) {
+ case 'j':
+ case 'n':
+ case 'N':
+ case 'N'-64:
+ case KEY_DOWN:
+ tpos = get_next(pos, pl, n);
+ if (tpos >= 0) {
+ pos = tpos;
+ row++;
+ }
+ break;
+ case 'k':
+ case 'p':
+ case 'P':
+ case 'P'-64:
+ case KEY_UP:
+ tpos = get_prev(pos, pl, n);
+ if (tpos >= 0) {
+ pos = tpos;
+ row--;
+ }
+ break;
+
+ case ' ':
+ case 13:
+ if (pl[pos].end == 0) {
+ pl[pos].open = ! pl[pos].open;
+ refresh = 1;
+ } else {
+ diff_window(&pl[pos], f);
+ refresh = 2;
+ }
+ break;
+ case 27: /* escape */
+ case 'q':
+ return;
+ }
+ }
+}
+
+
+int main(int argc, char *argv[])
+{
+ int n = 0;
+ FILE *f = NULL;
+ FILE *in = stdin;
+ struct plist *pl;
+
+ if (argc == 3)
+ f = fopen(argv[argc-1], "w+");
+ if (argc >=2)
+ in = fopen(argv[1], "r");
+ else {
+ printf("no arg...\n");
+ exit(2);
+ }
+
+ pl = parse_patch(in, f, &n);
+ pl = sort_patches(pl, &n);
+
+ if (f) {
+ fclose(in);
+ in = f;
+ }
+#if 0
+ int i;
+ for (i=0; i<n ; i++) {
+ printf("%3d: %3d %2d/%2d %s\n", i, pl[i].parent, pl[i].prev, pl[i].next, pl[i].file);
+ }
+ exit(0);
+#endif
+ signal(SIGINT, catch);
+ signal(SIGQUIT, catch);
+ signal(SIGTERM, catch);
+ signal(SIGBUS, catch);
+ signal(SIGSEGV, catch);
+
+ initscr(); cbreak(); noecho();
+ nonl(); intrflush(stdscr, FALSE); keypad(stdscr, TRUE);
+ mousemask(ALL_MOUSE_EVENTS, NULL);
+
+ main_window(pl, n, in);
+
+ nocbreak();nl();endwin();
+ return 0;
+}
diff --git a/demo/wiggle.c b/demo/wiggle.c
new file mode 100644
index 0000000..2bbb90f
--- /dev/null
+++ b/demo/wiggle.c
@@ -0,0 +1,643 @@
+/*
+ * wiggle - apply rejected patches
+ *
+ * Copyright (C) 2003 Neil Brown <neilb@cse.unsw.edu.au>
+ *
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * Author: Neil Brown
+ * Email: <neilb@cse.unsw.edu.au>
+ * Paper: Neil Brown
+ * School of Computer Science and Engineering
+ * The University of New South Wales
+ * Sydney, 2052
+ * Australia
+ */
+
+/*
+ * Wiggle is a tool for working with patches that don't quite apply properly.
+ * It provides functionality similar to 'diff' and 'merge' but can
+ * work at the level of individual words thus allowing the merging of
+ * two changes that affect the same line, but not the same parts of that line.
+ *
+ * Wiggle can also read patch and merge files. Unlike 'merge' it does not
+ * need to be given three separate files, but can be given a file and a patch
+ * and it will extract the pieces of the two other files that it needs from
+ * the patch.
+ *
+ * Wiggle performs one of three core functions:
+ * --extract -x extract part of a patch or merge file
+ * --diff -d report differences between two files
+ * --merge -m merge the changes between two files into a third file
+ *
+ * To perform these, wiggle requires 1, 2, or 3 input streams respectively.
+ * It can get these from individual files, from a diff (unified or context) or
+ * from a merge file.
+ *
+ * For merge:
+ * If one file is given, it is a merge file (output of 'merge').
+ * If two files are given, the second is assumed to be a patch, the first is a normal file.
+ * If three files are given, they are taken to be normal files.
+ *
+ * For diff:
+ * If one file is given, it is a patch
+ * If two files are given, they are normal files.
+ *
+ * For extract:
+ * Only one file can be given. -p indicates it is a patch, otherwise it is a merge.
+ * One of the flags -1 -2 or -3 must also be given and they indicate which
+ * part of the patch or merge to extract.
+ *
+ * Difference calculation and merging are performed on lines (-l) or words (-w).
+ * In the case of -w, an initial diff is computed based on non-trivial words.
+ * i.e. spaces are ignored
+ * This diff is computed from the ends of the file and is used to find a suitable
+ * starting point and range. Then a more precise diff is computed over that
+ * restricted range
+ *
+ * Other options available are:
+ * --replace -r replace first file with result of merge.
+ * --help -h provide help
+ * --version -v version
+ *
+ * Defaults are --merge --words
+ *
+ */
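+/*
+ * A few illustrative invocations (the file names below are invented for
+ * the example; option spellings follow the summary above):
+ *
+ *   wiggle --merge file.c file.c.rej    # file + patch: wiggle the patch in
+ *   wiggle -m -r file.c file.c.rej      # as above, but replace file.c
+ *                                       # (the original is kept as file.c.porig)
+ *   wiggle --diff -w orig new           # word-based diff of two plain files
+ *   wiggle --extract -2 -p some.patch   # extract the "new" half of a patch
+ *
+ * The defaults are --merge --words, so the first form is the common case.
+ */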
+
+#include "wiggle.h"
+#include <errno.h>
+#include <string.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <ctype.h>
+
+void die()
+{
+ fprintf(stderr,"wiggle: fatal error\n");
+ abort();
+ exit(3);
+}
+
+void printword(FILE *f, struct elmnt e)
+{
+ if (e.start[0])
+ fprintf(f, "%.*s", e.len, e.start);
+ else {
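+ /* A leading '\0' marks a synthetic hunk-header element
+ * (see split_patch): the bytes after it hold
+ * "chunk start length", which we re-format here.
+ */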
+ int a,b,c;
+ sscanf(e.start+1, "%d %d %d", &a, &b, &c);
+ fprintf(f, "*** %d,%d **** %d\n", b,c,a);
+ }
+}
+
+static void printsep(struct elmnt e1, struct elmnt e2)
+{
+ int a,b,c,d,e,f;
+ sscanf(e1.start+1, "%d %d %d", &a, &b, &c);
+ sscanf(e2.start+1, "%d %d %d", &d, &e, &f);
+ printf("@@ -%d,%d +%d,%d @@\n", b,c,e,f);
+}
+
+
+/* Remove any entries from the common-sublist that are
+ * just spaces, tabs, or newlines
+ */
+void cleanlist(struct file a, struct file b, struct csl *list)
+{
+ struct csl *new = list;
+
+ while (list->len) {
+ int i;
+ int ap;
+ for( ap = list->a; ap< list->a+list->len; ap++) {
+ for (i=0; i<a.list[ap].len; i++) {
+ char c = a.list[ap].start[i];
+ if (isalnum(c))
+ break;
+ }
+ if (i != a.list[ap].len)
+ break;
+ }
+ if (ap == list->a+list->len)
+ list++;
+ else
+ *new++ = *list++;
+ }
+ *new = *list;
+}
+
+int main(int argc, char *argv[])
+{
+ int opt;
+ int option_index;
+ int mode = 0;
+ int obj = 0;
+ int replace = 0;
+ char *replacename=NULL, *orignew=NULL;
+ int which = 0;
+ int ispatch = 0;
+ int reverse = 0;
+ int verbose=0, quiet=0;
+ int i;
+ int chunks1=0, chunks2=0, chunks3=0;
+ int exit_status = 0;
+ FILE *outfile = stdout;
+ char *helpmsg;
+
+ struct stream f, flist[3];
+ struct file fl[3];
+ struct csl *csl1, *csl2;
+
+ while ((opt = getopt_long(argc, argv,
+ short_options, long_options,
+ &option_index)) != -1)
+ switch(opt) {
+ case 'h':
+ helpmsg = Help;
+ switch(mode) {
+ case 'x': helpmsg = HelpExtract; break;
+ case 'd': helpmsg = HelpDiff; break;
+ case 'm': helpmsg = HelpMerge; break;
+ }
+ fputs(helpmsg, stderr);
+ exit(0);
+
+ case 'V':
+ fputs(Version, stderr);
+ exit(0);
+ case ':':
+ case '?':
+ default:
+ fputs(Usage, stderr);
+ exit(2);
+
+ case 'x':
+ case 'd':
+ case 'm':
+ if (mode ==0){
+ mode = opt;
+ continue;
+ }
+ fprintf(stderr, "wiggle: mode is '%c' - cannot set to '%c'\n",
+ mode, opt);
+ exit(2);
+
+ case 'w':
+ case 'l':
+ if (obj == 0 || obj == opt) {
+ obj = opt;
+ continue;
+ }
+ fprintf(stderr, "wiggle: cannot select both words and lines.\n");
+ exit(2);
+
+ case 'r':
+ replace = 1;
+ continue;
+ case 'R':
+ reverse = 1;
+ continue;
+
+ case '1':
+ case '2':
+ case '3':
+ if (which == 0 || which == opt) {
+ which = opt;
+ continue;
+ }
+ fprintf(stderr, "wiggle: can only select one of -1, -2, -3\n");
+ exit(2);
+
+ case 'p':
+ ispatch = 1;
+ continue;
+
+ case 'v': verbose++; continue;
+ case 'q': quiet=1 ; continue;
+ }
+ if (!mode)
+ mode = 'm';
+
+ if (obj && mode == 'x') {
+ fprintf(stderr,"wiggle: cannot specify --line or --word with --extract\n");
+ exit(2);
+ }
+ if (mode != 'm' && !obj) obj = 'w';
+ if (replace && mode != 'm') {
+ fprintf(stderr, "wiggle: --replace only allowed with --merge\n");
+ exit(2);
+ }
+ if (mode == 'x' && !which) {
+ fprintf(stderr, "wiggle: must specify -1, -2 or -3 with --extract\n");
+ exit(2);
+ }
+ if (mode != 'x' && mode != 'd' && which) {
+ fprintf(stderr, "wiggle: -1, -2 or -3 only allowed with --extract or --diff\n");
+ exit(2);
+ }
+ if (ispatch && (mode != 'x' && mode != 'd')) {
+ fprintf(stderr, "wiggle: --patch only allowed with --extract or --diff\n");
+ exit(2);
+ }
+ if (ispatch && which == '3') {
+ fprintf(stderr, "wiggle: cannot extract -3 from a patch.\n");
+ exit(2);
+ }
+
+ switch(mode) {
+ case 'x':
+ /* extract a branch of a diff or diff3 or merge output
+ * We need one file
+ */
+ if (optind == argc) {
+ fprintf(stderr, "wiggle: no file given for --extract\n");
+ exit(2);
+ }
+ if (optind < argc-1) {
+ fprintf(stderr, "wiggle: only give one file for --extract\n");
+ exit(2);
+ }
+ f = load_file(argv[optind]);
+ if (f.body==NULL) {
+ fprintf(stderr, "wiggle: cannot load file '%s' - %s\n",
+ argv[optind], strerror(errno));
+ exit(2);
+ }
+ if (ispatch)
+ chunks1 = chunks2 = split_patch(f, &flist[0], &flist[1]);
+ else {
+ if (!split_merge(f, &flist[0], &flist[1], &flist[2])) {
+ fprintf(stderr, "wiggle: merge file %s looks bad.\n",
+ argv[optind]);
+ exit(2);
+ }
+ }
+ if (flist[which-'1'].body == NULL) {
+ fprintf(stderr, "wiggle: %s has no -%c component.\n",
+ argv[optind], which);
+ exit(2);
+ } else {
+ write(1, flist[which-'1'].body, flist[which-'1'].len);
+ }
+
+ break;
+ case 'd':
+ /* create a diff (line or char) of two streams */
+ switch (argc-optind) {
+ case 0:
+ fprintf(stderr, "wiggle: no file given for --diff\n");
+ exit(2);
+ case 1:
+ f = load_file(argv[optind]);
+ if (f.body == NULL) {
+ fprintf(stderr, "wiggle: cannot load file '%s' - %s\n",
+ argv[optind], strerror(errno));
+ exit(2);
+ }
+ chunks1 = chunks2 = split_patch(f, &flist[0], &flist[1]);
+ if (!flist[0].body || !flist[1].body) {
+ fprintf(stderr, "wiggle: couldn't parse patch %s\n",
+ argv[optind]);
+ exit(2);
+ }
+ break;
+ case 2:
+ flist[0] = load_file(argv[optind]);
+ if (flist[0].body == NULL) {
+ fprintf(stderr, "wiggle: cannot load file '%s' - %s\n",
+ argv[optind], strerror(errno));
+ exit(2);
+ }
+ if (ispatch) {
+ f = load_file(argv[optind+1]);
+ if (f.body == NULL) {
+ fprintf(stderr, "wiggle: cannot load patch '%s' - %s\n",
+ argv[optind], strerror(errno));
+ exit(2);
+ }
+ if (which == '2')
+ chunks2 = chunks3 = split_patch(f, &flist[2], &flist[1]);
+ else
+ chunks2 = chunks3 = split_patch(f, &flist[1], &flist[2]);
+
+ } else
+ flist[1] = load_file(argv[optind+1]);
+ if (flist[1].body == NULL) {
+ fprintf(stderr, "wiggle: cannot load file '%s' - %s\n",
+ argv[optind+1], strerror(errno));
+ exit(2);
+ }
+ break;
+ default:
+ fprintf(stderr, "wiggle: too many files given for --diff\n");
+ exit(2);
+ }
+ if (reverse) {
+ f=flist[1];
+ flist[1] = flist[2];
+ flist[2]= f;
+ }
+ if (obj == 'l') {
+ int a,b;
+ fl[0] = split_stream(flist[0], ByLine, 0);
+ fl[1] = split_stream(flist[1], ByLine, 0);
+ if (chunks2 && ! chunks1)
+ csl1 = pdiff(fl[0], fl[1], chunks2);
+ else
+ csl1 = diff(fl[0], fl[1]);
+
+ if (!chunks1)
+ printf("@@ -1,%d +1,%d @@\n", fl[0].elcnt, fl[1].elcnt);
+ a = b = 0;
+ while (a<fl[0].elcnt || b < fl[1].elcnt) {
+ if (a < csl1->a) {
+ if (fl[0].list[a].start[0]) {
+ printf("-");
+ printword(stdout, fl[0].list[a]);
+ }
+ a++;
+ exit_status++;
+ } else if (b < csl1->b) {
+ if (fl[1].list[b].start[0]) {
+ printf("+");
+ printword(stdout, fl[1].list[b]);
+ }
+ b++;
+ exit_status++;
+ } else {
+ if (fl[0].list[a].start[0] == '\0')
+ printsep(fl[0].list[a], fl[1].list[b]);
+ else {
+ printf(" ");
+ printword(stdout, fl[0].list[a]);
+ }
+ a++;
+ b++;
+ if (a >= csl1->a+csl1->len)
+ csl1++;
+ }
+ }
+ } else {
+ int a,b;
+ int sol = 1; /* start of line */
+ fl[0] = split_stream(flist[0], ByWord, 0);
+ fl[1] = split_stream(flist[1], ByWord, 0);
+ if (chunks2 && !chunks1)
+ csl1 = pdiff(fl[0], fl[1], chunks2);
+ else
+ csl1 = diff(fl[0], fl[1]);
+
+ if (!chunks1) {
+ /* count lines in each file */
+ int l1, l2, i;
+ l1=l2=0;
+ for (i=0;i<fl[0].elcnt;i++)
+ if (ends_line(fl[0].list[i]))
+ l1++;
+ for (i=0;i<fl[1].elcnt;i++)
+ if (ends_line(fl[1].list[i]))
+ l2++;
+ printf("@@ -1,%d +1,%d @@\n", l1,l2);
+ }
+ a = b = 0;
+ while (a < fl[0].elcnt || b < fl[1].elcnt) {
+ if (a < csl1->a) {
+ exit_status++;
+ if (sol) {
+ int a1;
+ /* If we remove a whole line, output -line
+ * else clear sol and retry */
+ sol = 0;
+ for (a1=a; a1<csl1->a;a1++)
+ if (ends_line(fl[0].list[a1])) {
+ sol=1;
+ break;
+ }
+ if (sol) {
+ printf("-");
+ for (; a<csl1->a; a++) {
+ printword(stdout, fl[0].list[a]);
+ if (ends_line(fl[0].list[a])) {
+ a++;
+ break;
+ }
+ }
+ } else printf("|");
+ }
+ if (!sol) {
+ printf("<<<--");
+ do {
+ if (sol) printf("|");
+ printword(stdout, fl[0].list[a]);
+ sol = ends_line(fl[0].list[a]);
+ a++;
+ } while (a < csl1->a);
+ printf("%s-->>>", sol?"|":"");
+ sol=0;
+ }
+ } else if (b < csl1->b) {
+ exit_status++;
+ if (sol) {
+ int b1;
+ sol = 0;
+ for (b1=b; b1<csl1->b;b1++)
+ if(ends_line(fl[1].list[b1])) {
+ sol=1;
+ break;
+ }
+ if (sol) {
+ printf("+");
+ for(; b<csl1->b ; b++) {
+ printword(stdout, fl[1].list[b]);
+ if(ends_line(fl[1].list[b])) {
+ b++;
+ break;
+ }
+ }
+ } else printf("|");
+ }
+ if (!sol) {
+ printf("<<<++");
+ do {
+ if (sol) printf("|");
+ printword(stdout, fl[1].list[b]);
+ sol = ends_line(fl[1].list[b]);
+ b++;
+ } while (b < csl1->b);
+ printf("%s++>>>",sol?"|":"");
+ sol=0;
+ }
+ } else {
+ if (sol) {
+ int a1;
+ sol = 0;
+ for (a1=a; a1<csl1->a+csl1->len; a1++)
+ if (ends_line(fl[0].list[a1]))
+ sol=1;
+ if (sol) {
+ if (fl[0].list[a].start[0]) {
+ printf(" ");
+ for(; a<csl1->a+csl1->len; a++,b++) {
+ printword(stdout, fl[0].list[a]);
+ if (ends_line(fl[0].list[a])) {
+ a++,b++;
+ break;
+ }
+ }
+ } else {
+ printsep(fl[0].list[a], fl[1].list[b]);
+ a++; b++;
+ }
+ }
+ else printf("|");
+ }
+ if (!sol) {
+ printword(stdout, fl[0].list[a]);
+ if (ends_line(fl[0].list[a]))
+ sol=1;
+ a++;
+ b++;
+ }
+ if (a >= csl1->a+csl1->len)
+ csl1++;
+ }
+ }
+
+ }
+ break;
+ case 'm':
+ /* merge three files, A B C, so changed between B and C get made to A
+ */
+ switch (argc-optind) {
+ case 0:
+ fprintf(stderr, "wiggle: no files given for --merge\n");
+ exit(2);
+ case 3:
+ case 2:
+ case 1:
+ for (i=0; i< argc-optind; i++) {
+ flist[i] = load_file(argv[optind+i]);
+ if (flist[i].body == NULL) {
+ fprintf(stderr, "wiggle: cannot load file '%s' - %s\n",
+ argv[optind+i], strerror(errno));
+ exit(2);
+ }
+ }
+ break;
+ default:
+ fprintf(stderr, "wiggle: too many files given for --merge\n");
+ exit(2);
+ }
+ switch(argc-optind) {
+ case 1: /* a merge file */
+ f = flist[0];
+ if (!split_merge(f, &flist[0], &flist[1], &flist[2])) {
+ fprintf(stderr,"wiggle: merge file %s looks bad.\n",
+ argv[optind]);
+ exit(2);
+ }
+ break;
+ case 2: /* a file and a patch */
+ f = flist[1];
+ chunks2 = chunks3 = split_patch(f, &flist[1], &flist[2]);
+ break;
+ case 3: /* three separate files */
+ break;
+ }
+ if (reverse) {
+ f=flist[1];
+ flist[1] = flist[2];
+ flist[2]= f;
+ }
+
+ for (i=0; i<3; i++) {
+ if (flist[i].body==NULL) {
+ fprintf(stderr, "wiggle: file %d missing\n", i);
+ exit(2);
+ }
+ }
+ if (replace) {
+ int fd;
+ replacename = malloc(strlen(argv[optind])+ 20);
+ if (!replacename) die();
+ orignew = malloc(strlen(argv[optind])+20);
+ if (!orignew) die();
+ strcpy(replacename, argv[optind]);
+ strcpy(orignew, argv[optind]);
+ strcat(orignew, ".porig");
+ if (open(orignew, O_RDONLY) >= 0 ||
+ errno != ENOENT) {
+ fprintf(stderr,"wiggle: %s already exists\n",
+ orignew);
+ exit(2);
+ }
+ strcat(replacename,"XXXXXX");
+ fd = mkstemp(replacename);
+ if (fd == -1) {
+ fprintf(stderr,"wiggle: could not create temporary file for %s\n",
+ replacename);
+ exit(2);
+ }
+ outfile = fdopen(fd, "w");
+
+ }
+
+ if (obj == 'l') {
+ fl[0] = split_stream(flist[0], ByLine, 0);
+ fl[1] = split_stream(flist[1], ByLine, 0);
+ fl[2] = split_stream(flist[2], ByLine, 0);
+ } else {
+ fl[0] = split_stream(flist[0], ByWord, 0);
+ fl[1] = split_stream(flist[1], ByWord, 0);
+ fl[2] = split_stream(flist[2], ByWord, 0);
+ }
+ if (chunks2 && !chunks1)
+ csl1 = pdiff(fl[0], fl[1], chunks2);
+ else
+ csl1 = diff(fl[0], fl[1]);
+ csl2 = diff(fl[1], fl[2]);
+
+#if 0
+ cleanlist(fl[0],fl[1],csl1);
+ cleanlist(fl[1],fl[2],csl2);
+#endif
+
+ {
+ struct ci ci;
+
+ ci = print_merge(outfile, &fl[0], &fl[1], &fl[2],
+ csl1, csl2, obj=='w');
+ if (!quiet && ci.conflicts)
+ fprintf(stderr, "%d unresolved conflict%s found\n", ci.conflicts, ci.conflicts==1?"":"s");
+ if (!quiet && ci.ignored)
+ fprintf(stderr, "%d already-applied change%s ignored\n", ci.ignored, ci.ignored==1?"":"s");
+ exit_status = (ci.conflicts > 0);
+ }
+ if (replace) {
+ fclose(outfile);
+ if (rename(argv[optind], orignew) ==0 &&
+ rename(replacename, argv[optind]) ==0)
+ /* all ok */;
+ else {
+ fprintf(stderr, "wiggle: failed to move new file into place.\n");
+ exit(2);
+ }
+ }
+ break;
+
+ }
+ exit(exit_status);
+}
diff --git a/diff.c b/diff.c
new file mode 100644
index 0000000..239d409
--- /dev/null
+++ b/diff.c
@@ -0,0 +1,635 @@
+/*
+ * wiggle - apply rejected patches
+ *
+ * Copyright (C) 2003 Neil Brown <neilb@cse.unsw.edu.au>
+ * Copyright (C) 2011 Neil Brown <neilb@suse.de>
+ *
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Author: Neil Brown
+ * Email: <neilb@suse.de>
+ */
+
+/*
+ * Calculate longest common subsequence between two sequences
+ *
+ * Each sequence contains strings with
+ * hash start length
+ * We produce a list of triples: a b len
+ * where a and b point to elements in the two sequences, and len is the number
+ * of common elements there. The list is terminated by an entry with len==0.
+ *
+ * This is roughly based on
+ * "An O(ND) Difference Algorithm and its Variations", Eugene Myers,
+ * Algorithmica Vol. 1 No. 2, 1986, pp. 251-266;
+ * http://xmailserver.org/diff2.pdf
+ *
+ * However we don't run the basic algorithm both forward and backward until
+ * we find an overlap as Myers suggests. Rather we always run forwards, but
+ * we record the location of the (possibly empty) snake that crosses the
+ * midline. When we finish, this recorded location for the best path shows
+ * us where to divide and find further midpoints.
+ *
+ * In brief, the algorithm is as follows.
+ *
+ * Imagine a Cartesian Matrix where x co-ordinates correspond to symbols in
+ * the first sequence (A, length a) and y co-ordinates correspond to symbols
+ * in the second sequence (B, length b). At the origin we have the first
+ * sequence.
+ * Movement in the x direction represents deleting the symbol at that point,
+ * so from x=i-1 to x=i deletes symbol i from A.
+ *
+ * Movement in the y direction represents adding the corresponding symbol
+ * from B. So to move from the origin 'a' spaces along X and then 'b' spaces
+ * up Y will remove all of the first sequence and then add all of the second
+ * sequence. Similarly moving firstly up the Y axis, then along the X
+ * direction will add the new sequence, then remove the old sequence. Thus
+ * the point a,b represents the second sequence and a path from 0,0 to a,b
+ * represents a sequence of edits to change A into B.
+ *
+ * There are clearly many paths from 0,0 to a,b going through different
+ * points in the matrix in different orders. At some points in the matrix
+ * the next symbol to be added from B is the same as the next symbol to be
+ * removed from A. At these points we can take a diagonal step to a new
+ * point in the matrix without actually changing any symbol. A sequence of
+ * these diagonal steps is called a 'snake'. The goal then is to find a path
+ * of x-steps (removals), y-steps (additions) and diagonals (steps and
+ * snakes) where the number of (non-diagonal) steps is minimal.
+ *
+ * i.e. we aim for as many long snakes as possible.
+ * If the total number of 'steps' is called the 'cost', we aim to minimise
+ * the cost.
+ *
+ * As storing the whole matrix in memory would be prohibitive with large
+ * sequences we limit ourselves to linear storage proportional to a+b and
+ * repeat the search at most log2(a+b) times building up the path as we go.
+ * Specifically we perform a search on the full matrix and record where each
+ * path crosses the half-way point. i.e. where x+y = (a+b)/2 (== mid). This
+ * tells us the mid point of the best path. We then perform two searches,
+ * one on each of the two halves and find the 1/4 and 3/4 way points. This
+ * continues recursively until we have all points.
+ *
+ * The storage is an array v of 'struct v'. This is indexed by the
+ * diagonal-number k = x-y. Thus k can be negative and the array is
+ * allocated to allow for that. During the search there is an implicit value
+ * 'c' which is the cost (length in steps) of all the paths currently under
+ * consideration.
+ * v[k] stores details of the longest reaching path of cost c that finishes
+ * on diagonal k. "longest reaching" means "finishes closest to a,b".
+ * Details are:
+ * The location of the end point. 'x' is stored. y = x - k.
+ * The diagonal of the midpoint crossing. md is stored. x = (mid + md)/2
+ * y = (mid - md)/2
+ * = x - md
+ * (note: md is a diagonal so md = x-y. mid is an anti-diagonal: mid = x+y)
+ * The number of 'snakes' in the path (l). This is used to allocate the
+ * array which will record the snakes and to terminate recursion.
+ *
+ * A path with an even cost (c is even) must end on an even diagonal (k is
+ * even) and when c is odd, k must be odd. So the v[] array is treated as
+ * two sub arrays, the even part and the odd part. One represents paths of
+ * cost 'c', the other paths of cost c-1.
+ *
+ * Initially only v[0] is meaningful and there are no snakes. We firstly
+ * extend all paths under consideration with the longest possible snake on
+ * that diagonal.
+ *
+ * Then we increment 'c' and calculate for each suitable 'k' whether the best
+ * path to diagonal k of cost c comes from taking an x-step from the c-1 path
+ * on diagonal k-1, or from taking a y-step from the c-1 path on diagonal
+ * k+1. Obviously we need to avoid stepping out of the matrix. Finally we
+ * check if the 'v' array can be extended or reduced at the boundaries. If
+ * we hit a border we must reduce. If the best we could possibly do on that
+ * diagonal is less than the worst result from the current leading path, then
+ * we also reduce. Otherwise we extend the range of 'k's we consider.
+ *
+ * We continue until we find a path that has reached a,b. This must be a minimal
+ * cost path (cost==c). At this point re-check the end of the snake at the
+ * midpoint and report that.
+ *
+ * This all happens recursively for smaller and smaller subranges stopping
+ * when we examine a submatrix and find that it contains no snakes. As we
+ * are usually dealing with sub-matrices we are not walking from 0,0 to a,b
+ * but from alo,blo to ahi,bhi - low point to high point. So the initial k is
+ * alo-blo, not 0.
+ *
+ */
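+/*
+ * A small worked example (not from the code, purely illustrative):
+ * with A = [x a b c] (a=4) and B = [a b c y] (b=4) the only snake is the
+ * common run "a b c", so diff() would return the csl
+ *
+ *     { a=1, b=0, len=3 }, { a=4, b=4, len=0 }
+ *
+ * where the final len==0 entry is the terminator pointing past both ends.
+ * For the midpoint bookkeeping: a crossing at x=3,y=1 lies on diagonal
+ * md = x-y = 2 and anti-diagonal x+y = 4 = mid, and indeed
+ * x = (mid+md)/2 = 3 and y = x-md = 1.
+ */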
+
+#include "wiggle.h"
+#include <stdlib.h>
+
+struct v {
+ int x; /* x location of furthest reaching path of current cost */
+ int md; /* diagonal location of midline crossing */
+ int l; /* number of continuous common sequences found so far */
+};
+
+static int find_common(struct file *a, struct file *b,
+ int *alop, int *ahip,
+ int *blop, int *bhip,
+ struct v *v)
+{
+ /* Examine matrix from alo to ahi and blo to bhi.
+ * i.e. including alo and blo, but less than ahi and bhi.
+ * Finding longest subsequence and
+ * return new {a,b}{lo,hi} either side of midline.
+ * i.e. mid = ( (ahi-alo) + (bhi-blo) ) / 2
+ * alo+blo <= mid <= ahi+bhi
+ * and alo,blo to ahi,bhi is a common (possibly empty)
+ * subseq - a snake.
+ *
+ * v is scratch space which is indexable from
+ * alo-bhi to ahi-blo inclusive.
+ * i.e. even though there is no symbol at ahi or bhi, we do
+ * consider paths that reach there as they simply cannot
+ * go further in that direction.
+ *
+ * Return the number of snakes found.
+ */
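+ /* For example (illustrative numbers only): with alo=2, ahi=5, blo=1,
+ * bhi=7, 'v' must be indexable from k = alo-bhi = -5 up to
+ * k = ahi-blo = 4, and mid = (ahi+bhi+alo+blo)/2 = 7.
+ */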
+
+ int klo, khi;
+ int alo = *alop;
+ int ahi = *ahip;
+ int blo = *blop;
+ int bhi = *bhip;
+
+ int mid = (ahi+bhi+alo+blo)/2;
+
+ /* 'worst' is the worst-case extra cost that we need
+ * to pay before reaching our destination. It assumes
+ * no more snakes in the furthest-reaching path so far.
+ * We use this to know when we can trim the extreme
+ * diagonals - when their best case does not improve on
+ * the current worst case.
+ */
+ int worst = (ahi-alo)+(bhi-blo);
+
+ klo = khi = alo-blo;
+ v[klo].x = alo;
+ v[klo].l = 0;
+
+ while (1) {
+ int x, y;
+ int cost;
+ int k;
+
+ /* Find the longest snake extending on each current
+ * diagonal, and record if it crosses the midline.
+ * If we reach the end, return.
+ */
+ for (k = klo ; k <= khi ; k += 2) {
+ int snake = 0;
+
+ x = v[k].x;
+ y = x-k;
+ if (y > bhi)
+ abort();
+
+ /* Follow any snake that is here */
+ while (x < ahi && y < bhi &&
+ match(&a->list[x], &b->list[y])
+ ) {
+ x++;
+ y++;
+ snake = 1;
+ }
+
+ /* Refine the worst-case remaining cost */
+ cost = (ahi-x)+(bhi-y);
+ if (cost < worst)
+ worst = cost;
+
+ /* Check for midline crossing */
+ if (x+y >= mid &&
+ v[k].x + v[k].x-k <= mid)
+ v[k].md = k;
+
+ v[k].x = x;
+ v[k].l += snake;
+
+ if (cost == 0) {
+ /* OK! We have arrived.
+ * We crossed the midpoint on diagonal v[k].md
+ */
+ if (x != ahi)
+ abort();
+
+ /* The snake could start earlier than the
+ * midline. We cannot just search backwards
+ * as that might find the wrong path - the
+ * greediness of the diff algorithm is
+ * asymmetric.
+ * We could record the start of the snake in
+ * 'v', but we will find the actual snake when
+ * we recurse so there is no need.
+ */
+ x = (v[k].md+mid)/2;
+ y = x-v[k].md;
+
+ *alop = x;
+ *blop = y;
+
+ /* Find the end of the snake using the same
+ * greedy approach as when we first found the
+ * snake
+ */
+ while (x < ahi && y < bhi &&
+ match(&a->list[x], &b->list[y])
+ ) {
+ x++;
+ y++;
+ }
+ *ahip = x;
+ *bhip = y;
+
+ return v[k].l;
+ }
+ }
+
+ /* No success with previous cost, so increment cost (c) by 1
+ * and for each other diagonal, set from the end point of the
+ * diagonal on one side of it or the other.
+ */
+ for (k = klo+1; k <= khi-1 ; k += 2) {
+ if (v[k-1].x+1 > ahi) {
+ /* cannot step to the right from previous
+ * diagonal as there is no room.
+ * So step up from next diagonal.
+ */
+ v[k] = v[k+1];
+ } else if (v[k+1].x - k > bhi || v[k-1].x+1 >= v[k+1].x) {
+ /* Cannot step up from next diagonal as either
+ * there is no room, or doing so wouldn't get us
+ * as close to the endpoint.
+ * So step to the right.
+ */
+ v[k] = v[k-1];
+ v[k].x++;
+ } else {
+ /* There is room in both directions, but
+ * stepping up from the next diagonal gets us
+ * closer
+ */
+ v[k] = v[k+1];
+ }
+ }
+
+ /* Now we need to either extend or contract klo and khi
+ * so they both change parity (odd vs even).
+ * If we extend we need to step up (for klo) or to the
+ * right (khi) from the adjacent diagonal. This is
+ * not possible if we have hit the edge of the matrix, and
+ * not sensible if the new point has a best case remaining
+ * cost that is worse than our current worst case remaining
+ * cost.
+ * The best-case remaining cost is the absolute difference
+ * between the remaining number of additions and the remaining
+ * number of deletions - and assumes lots of snakes.
+ */
+ /* new location if we step up from klo to klo-1*/
+ x = v[klo].x; y = x - (klo-1);
+ cost = abs((ahi-x)-(bhi-y));
+ if (y <= bhi && cost <= worst) {
+ /* Looks acceptable - step up. */
+ v[klo-1] = v[klo];
+ klo--;
+ } else
+ klo++;
+
+ /* new location if we step to the right from khi to khi+1 */
+ x = v[khi].x+1; y = x - (khi+1);
+ cost = abs((ahi-x)-(bhi-y));
+ if (x <= ahi && cost <= worst) {
+ /* Looks acceptable - step to the right */
+ v[khi+1] = v[khi];
+ v[khi+1].x++;
+ khi++;
+ } else
+ khi--;
+ }
+}
+
+static struct csl *lcsl(struct file *a, int alo, int ahi,
+ struct file *b, int blo, int bhi,
+ struct csl *csl,
+ struct v *v)
+{
+ /* lcsl == longest common sub-list.
+ * This calls itself recursively as it finds the midpoint
+ * of the best path.
+ * On first call, 'csl' is NULL and will need to be allocated and
+ * is returned.
+ * On subsequent calls when 'csl' is not NULL, we add all the
+ * snakes we find to csl, and return a pointer to the next
+ * location where future snakes can be stored.
+ */
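+ /* Typical use (a sketch, not a prescription): the first caller passes
+ * csl == NULL and then walks the returned list with something like
+ * for (i = 0; csl[i].len; i++)
+ * ... use csl[i].a, csl[i].b, csl[i].len ...
+ * as make_merger() in merge2.c and the MAIN test at the end of this
+ * file both do.
+ */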
+ int len;
+ int alo1 = alo;
+ int ahi1 = ahi;
+ int blo1 = blo;
+ int bhi1 = bhi;
+ struct csl *rv = NULL;
+
+ if (ahi <= alo || bhi <= blo)
+ return csl;
+
+ len = find_common(a, b,
+ &alo1, &ahi1,
+ &blo1, &bhi1,
+ v);
+
+ if (csl == NULL) {
+ /* 'len+1' to hold a sentinel */
+ rv = csl = xmalloc((len+1)*sizeof(*csl));
+ csl->len = 0;
+ }
+ if (len) {
+ /* There are more snakes to find - keep looking. */
+
+ /* With depth-first recursion, this adds all the snakes
+ * before 'alo1' to 'csl'
+ */
+ csl = lcsl(a, alo, alo1,
+ b, blo, blo1,
+ csl, v);
+
+ if (ahi1 > alo1) {
+ /* need to add this common seq, possibly attach
+ * to last
+ */
+ if (csl->len &&
+ csl->a+csl->len == alo1 &&
+ csl->b+csl->len == blo1) {
+ csl->len += ahi1-alo1;
+ } else {
+ if (csl->len)
+ csl++;
+ csl->len = ahi1-alo1;
+ csl->a = alo1;
+ csl->b = blo1;
+ csl[1].len = 0;
+ }
+ }
+ /* Now recurse to add all the snakes after ahi1 to csl */
+ csl = lcsl(a, ahi1, ahi,
+ b, bhi1, bhi,
+ csl, v);
+ }
+ if (rv) {
+ /* This was the first call. Record the endpoint
+ * as a snake of length 0. This might be extended
+ * by 'fixup()' below.
+ */
+ if (csl->len)
+ csl++;
+ csl->a = ahi;
+ csl->b = bhi;
+#if 1
+ if (rv+len != csl || csl->len != 0)
+ abort(); /* number of runs was wrong */
+#endif
+ return rv;
+ } else
+ /* intermediate call - return where we are up to */
+ return csl;
+}
+
+/* If two common sequences are separated by only an add or remove,
+ * and the first sequence ends the same as the middle text,
+ * extend the second and contract the first in the hope that the
+ * first might become empty. This mitigates the greediness
+ * of the 'diff' algorithm.
+ * i.e. if we have:
+ * [ foo X ] X [ bar ]
+ * [ foo X ] [ bar ]
+ * Then change it to:
+ * [ foo ] X [ X bar ]
+ * [ foo ] [ X bar ]
+ * We treat the final zero-length 'csl' as a common sequence which
+ * can be extended so we must make sure to add a new zero-length csl
+ * to the end.
+ * If this doesn't make the first sequence disappear, and (one of the)
+ * X(s) was a newline, then move back so the newline is at the end
+ * of the first sequence. This encourages common sequences
+ * to be whole-line units where possible.
+ */
+static void fixup(struct file *a, struct file *b, struct csl *list)
+{
+ struct csl *list1, *orig;
+ int lasteol = -1;
+ int found_end = 0;
+
+ if (!list)
+ return;
+
+ /* 'list' and 'list1' are adjacent pointers into the csl.
+ * If a match gets deleted, they might not be physically
+ * adjacent any more. Once we get to the end of the list
+ * this will cease to matter - the list will be a bit
+ * shorter is all.
+ */
+ orig = list;
+ list1 = list+1;
+ while (list->len) {
+ if (list1->len == 0)
+ found_end = 1;
+
+ /* If a single token is either inserted or deleted
+ * immediately after a matching token...
+ */
+ if ((list->a+list->len == list1->a &&
+ list->b+list->len != list1->b &&
+ /* text at b inserted */
+ match(&b->list[list->b+list->len-1],
+ &b->list[list1->b-1])
+ )
+ ||
+ (list->b+list->len == list1->b &&
+ list->a+list->len != list1->a &&
+ /* text at a deleted */
+ match(&a->list[list->a+list->len-1],
+ &a->list[list1->a-1])
+ )
+ ) {
+ /* If the last common token is a simple end-of-line
+ * record where it is. For a word-wise diff, this is
+ * any EOL. For a line-wise diff this is a blank line.
+ * If we are looking at a deletion it must be deleting
+ * the eol, so record that deleted eol.
+ */
+ if (ends_line(a->list[list->a+list->len-1])
+ && a->list[list->a+list->len-1].len == 1
+ && lasteol == -1
+ ) {
+ lasteol = list1->a-1;
+ }
+ /* Expand the second match, shrink the first */
+ list1->a--;
+ list1->b--;
+ list1->len++;
+ list->len--;
+
+ /* If the first match has become empty, make it
+ * disappear.. (and forget about the eol).
+ */
+ if (list->len == 0) {
+ lasteol = -1;
+ if (found_end) {
+ /* Deleting just before the last
+ * entry */
+ *list = *list1;
+ list1->a += list1->len;
+ list1->b += list1->len;
+ list1->len = 0;
+ } else if (list > orig)
+ /* Deleting in the middle */
+ list--;
+ else {
+ /* deleting the first entry */
+ *list = *list1++;
+ }
+ }
+ } else {
+ /* Nothing interesting here, though if we
+ * shuffled back past an eol, shuffle
+ * forward to line up with that eol.
+ * This causes an eol to bind more strongly
+ * with the preceding line than the following.
+ */
+ if (lasteol >= 0) {
+ while (list1->a <= lasteol
+ && (list1->len > 1 ||
+ (found_end && list1->len > 0))) {
+ list1->a++;
+ list1->b++;
+ list1->len--;
+ list->len++;
+ }
+ lasteol = -1;
+ }
+ *++list = *list1;
+ if (found_end) {
+ list1->a += list1->len;
+ list1->b += list1->len;
+ list1->len = 0;
+ } else
+ list1++;
+ }
+ if (list->len && list1 == list)
+ abort();
+ }
+}
+
+/* Main entry point - find the common-sub-list of files 'a' and 'b'.
+ * The final element in the list will have 'len' == 0 and will point
+ * beyond the end of the files.
+ */
+struct csl *diff(struct file a, struct file b)
+{
+ struct v *v;
+ struct csl *csl;
+ v = xmalloc(sizeof(struct v)*(a.elcnt+b.elcnt+2));
+ v += b.elcnt+1;
+
+ csl = lcsl(&a, 0, a.elcnt,
+ &b, 0, b.elcnt,
+ NULL, v);
+ free(v-(b.elcnt+1));
+ fixup(&a, &b, csl);
+ if (!csl) {
+ csl = xmalloc(sizeof(*csl));
+ csl->len = 0;
+ csl->a = a.elcnt;
+ csl->b = b.elcnt;
+ }
+ return csl;
+}
+
+/* Alternate entry point - find the common-sub-list in two
+ * subranges of files.
+ */
+struct csl *diff_partial(struct file a, struct file b,
+ int alo, int ahi, int blo, int bhi)
+{
+ struct v *v;
+ struct csl *csl;
+ v = xmalloc(sizeof(struct v)*(ahi-alo+bhi-blo+2));
+ v += bhi-alo+1;
+
+ csl = lcsl(&a, alo, ahi,
+ &b, blo, bhi,
+ NULL, v);
+ free(v-(bhi-alo+1));
+ fixup(&a, &b, csl);
+ return csl;
+}
+
+#ifdef MAIN
+
+main(int argc, char *argv[])
+{
+ struct file a, b;
+ struct csl *csl;
+ struct elmnt *lst = xmalloc(argc*sizeof(*lst));
+ int arg;
+ struct v *v;
+ int ln;
+
+ arg = 1;
+ a.elcnt = 0;
+ a.list = lst;
+ while (argv[arg] && strcmp(argv[arg], "--")) {
+ lst->hash = 0;
+ lst->start = argv[arg];
+ lst->len = strlen(argv[arg]);
+ a.elcnt++;
+ lst++;
+ arg++;
+ }
+ if (!argv[arg]) {
+ printf("AARGH\n");
+ exit(1);
+ }
+ arg++;
+ b.elcnt = 0;
+ b.list = lst;
+ while (argv[arg] && strcmp(argv[arg], "--")) {
+ lst->hash = 0;
+ lst->start = argv[arg];
+ lst->len = strlen(argv[arg]);
+ b.elcnt++;
+ lst++;
+ arg++;
+ }
+
+ csl = diff(a, b);
+ fixup(&a, &b, csl);
+ while (csl && csl->len) {
+ int i;
+ printf("%d,%d for %d:\n", csl->a, csl->b, csl->len);
+ for (i = 0; i < csl->len; i++) {
+ printf(" %.*s (%.*s)\n",
+ a.list[csl->a+i].len, a.list[csl->a+i].start,
+ b.list[csl->b+i].len, b.list[csl->b+i].start);
+ }
+ csl++;
+ }
+
+ exit(0);
+}
+
+#endif
+
diff --git a/dotest b/dotest
new file mode 100755
index 0000000..18fdf05
--- /dev/null
+++ b/dotest
@@ -0,0 +1,108 @@
+#!/bin/bash
+
+# Redirect for non-GNU time systems
+if [ "xx$TIME_CMD" = "xx" ]
+then
+ time_cmd=/usr/bin/time
+else
+ time_cmd=$TIME_CMD
+fi
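+# Example (assumes a GNU-compatible 'time' binary, e.g. 'gtime', is installed):
+#   TIME_CMD=gtime ./dotest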
+
+dir=$PWD
+
+while [ ! -f $dir/wiggle ]
+do
+ case $dir in
+ / ) echo >&2 Cannot find wiggle program : $WIGGLE
+ exit 1;;
+ * ) dir=${dir%/*}
+ esac
+done
+export WIGGLE=$dir/wiggle
+
+if [ -d tests ]
+then cd tests
+fi
+
+TIME="$time_cmd -o .time -f %U"
+if $TIME true > /dev/null 2>&1
+then :
+else TIME=
+fi
+#TIME=valgrind
+status=0
+ok=0
+fail=0
+
+find . -name core | xargs rm -f
+list=$(find . -type f \( -name script -o -name diff -o -name ldiff \
+ -o -name rediff -o -name merge -o -name wmerge -o -name lmerge \
+ -o -name replace -o -name Wmerge \)
+ )
+for path in $list
+do
+ dir=${path%/*}
+ base=${path##*/}
+ (
+ cd $dir
+ > .time
+ case $base in
+ script ) ./script ;;
+ diff ) if [ -f new ]
+ then $TIME $WIGGLE -dw orig new | diff -u diff - ; xit=$?
+ else $TIME $WIGGLE -dwp1 orig patch | diff -u diff - ; xit=$?
+ fi
+ ;;
+ ldiff ) if [ -f new ]
+ then $TIME $WIGGLE -dl orig new | diff -u ldiff - ; xit=$?
+ else $TIME $WIGGLE -dlp1 orig patch | diff -u ldiff - ; xit=$?
+ fi
+ ;;
+ reldiff ) $TIME $WIGGLE -dl patch | diff -u reldiff - ; xit=$?
+ ;;
+ rediff ) $TIME $WIGGLE -dw patch | diff -u rediff - ; xit=$?
+ ;;
+ merge ) if [ -f patch ]
+ then $TIME $WIGGLE -m orig patch | diff -u merge - ; xit=$?
+ elif [ -f new ]
+ then $TIME $WIGGLE -m orig new new2 | diff -u merge - ; xit=$?
+ else $TIME $WIGGLE -m orig | diff -u merge - ; xit=$?
+ fi
+ ;;
+ replace ) cp orig orig.tmp
+ if [ -f patch ]
+ then $TIME $WIGGLE -mr orig.tmp patch
+ else $TIME $WIGGLE -mr orig.tmp new new2
+ fi
+ diff -u merge orig.tmp ; xit=$?
+ rm orig.tmp orig.tmp.porig
+ ;;
+ lmerge ) if [ -f patch ]
+ then $TIME $WIGGLE -ml orig patch | diff -u lmerge - ; xit=$?
+ else $TIME $WIGGLE -ml orig new new2 | diff -u lmerge - ; xit=$?
+ fi
+ ;;
+ wmerge ) if [ -f patch ]
+ then $TIME $WIGGLE -mw orig patch | diff -u wmerge - ; xit=$?
+ else $TIME $WIGGLE -mw orig new new2 | diff -u wmerge - ; xit=$?
+ fi
+ ;;
+ Wmerge ) if [ -f patch ]
+ then $TIME $WIGGLE -mW orig patch | diff -u Wmerge - ; xit=$?
+ else $TIME $WIGGLE -mW orig new new2 | diff -u Wmerge - ; xit=$?
+ fi
+ ;;
+ esac
+ if [ $xit = 0 ]; then msg=SUCCEEDED; else msg=FAILED; fi
+ echo $path $msg `grep -v 'Command exited' .time 2> /dev/null`
+ rm -f .time
+ exit $xit
+ )
+ if [ $? = 0 ]
+ then let ok++;
+ else status=1 ; let fail++
+ fi
+done
+find . -name core -ls
+echo $ok succeeded and $fail failed
+exit $status
diff --git a/extract.c b/extract.c
new file mode 100644
index 0000000..ba81ec0
--- /dev/null
+++ b/extract.c
@@ -0,0 +1,325 @@
+/*
+ * wiggle - apply rejected patches
+ *
+ * Copyright (C) 2003 Neil Brown <neilb@cse.unsw.edu.au>
+ * Copyright (C) 2010 Neil Brown <neilb@suse.de>
+ *
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Author: Neil Brown
+ * Email: <neilb@suse.de>
+ */
+
+/*
+ * split patch or merge files.
+ */
+
+#include "wiggle.h"
+#include <stdlib.h>
+
+/* skip 'cp' past the next '\n', or all the way to 'end' */
+static void skip_eol(char **cp, char *end)
+{
+ char *c = *cp;
+ while (c < end && *c != '\n')
+ c++;
+ if (c < end)
+ c++;
+ *cp = c;
+}
+
+/* copy one line, or to end, from 'cp' into the stream, extending
+ * the stream.
+ */
+static void copyline(struct stream *s, char **cp, char *end)
+{
+ char *from = *cp;
+ char *to = s->body+s->len;
+
+ while (from < end && *from != '\n')
+ *to++ = *from++;
+ if (from < end)
+ *to++ = *from++;
+ s->len = to-s->body;
+ *cp = from;
+}
+
+int split_patch(struct stream f, struct stream *f1, struct stream *f2)
+{
+ struct stream r1, r2;
+ int chunks = 0;
+ char *cp, *end;
+ int state = 0;
+ int acnt = 0, bcnt = 0;
+ int a, b, c, d;
+ int lineno = 0;
+ char before[100], after[100];
+
+ f1->body = f2->body = NULL;
+
+ r1.body = xmalloc(f.len);
+ r2.body = xmalloc(f.len);
+ r1.len = r2.len = 0;
+
+ cp = f.body;
+ end = f.body+f.len;
+ while (cp < end) {
+ /* state:
+ * 0 not in a patch
+ * 1 first half of context
+ * 2 second half of context
+ * 3 unified
+ */
+ lineno++;
+ switch (state) {
+ case 0:
+ if (sscanf(cp, "@@ -%s +%s @@", before, after) == 2) {
+ int ok = 1;
+ if (sscanf(before, "%d,%d", &a, &b) == 2)
+ acnt = b;
+ else if (sscanf(before, "%d", &a) == 1)
+ acnt = 1;
+ else
+ ok = 0;
+
+ if (sscanf(after, "%d,%d", &c, &d) == 2)
+ bcnt = d;
+ else if (sscanf(after, "%d", &c) == 1)
+ bcnt = 1;
+ else
+ ok = 0;
+ if (ok)
+ state = 3;
+ else
+ state = 0;
+ } else if (sscanf(cp, "*** %d,%d ****", &a, &b) == 2) {
+ acnt = b-a+1;
+ state = 1;
+ } else if (sscanf(cp, "--- %d,%d ----", &c, &d) == 2) {
+ bcnt = d-c+1;
+ state = 2;
+ }
+ skip_eol(&cp, end);
+ if (state == 1 || state == 3) {
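+ /* This writes a 20-byte hunk-header record: a leading '\0'
+ * byte followed by "chunk start count". printword() and
+ * printsep() in wiggle.c later recognise such an element by
+ * the '\0' and turn it back into a readable header.
+ */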
+ char buf[20];
+ buf[0] = 0;
+ chunks++;
+ sprintf(buf+1, "%5d %5d %5d\n", chunks, a, acnt);
+ memcpy(r1.body+r1.len, buf, 20);
+ r1.len += 20;
+ }
+ if (state == 2 || state == 3) {
+ char buf[20];
+ buf[0] = 0;
+ sprintf(buf+1, "%5d %5d %5d\n", chunks, c, bcnt);
+ memcpy(r2.body+r2.len, buf, 20);
+ r2.len += 20;
+ }
+ break;
+ case 1:
+ if ((*cp == ' ' || *cp == '!' || *cp == '-' || *cp == '+')
+ && cp[1] == ' ') {
+ cp += 2;
+ copyline(&r1, &cp, end);
+ acnt--;
+ if (acnt == 0)
+ state = 0;
+ } else {
+ fprintf(stderr, "%s: bad context patch at line %d\n",
+ Cmd, lineno);
+ return 0;
+ }
+ break;
+ case 2:
+ if ((*cp == ' ' || *cp == '!' || *cp == '-' || *cp == '+')
+ && cp[1] == ' ') {
+ cp += 2;
+ copyline(&r2, &cp, end);
+ bcnt--;
+ if (bcnt == 0)
+ state = 0;
+ } else {
+ fprintf(stderr, "%s: bad context patch/2 at line %d\n",
+ Cmd, lineno);
+ return 0;
+ }
+ break;
+ case 3:
+ if (*cp == ' ') {
+ char *cp2;
+ cp++;
+ cp2 = cp;
+ copyline(&r1, &cp, end);
+ copyline(&r2, &cp2, end);
+ acnt--; bcnt--;
+ } else if (*cp == '-') {
+ cp++;
+ copyline(&r1, &cp, end);
+ acnt--;
+ } else if (*cp == '+') {
+ cp++;
+ copyline(&r2, &cp, end);
+ bcnt--;
+ } else {
+ fprintf(stderr, "%s: bad unified patch at line %d\n",
+ Cmd, lineno);
+ return 0;
+ }
+ if (acnt <= 0 && bcnt <= 0)
+ state = 0;
+ break;
+ }
+ }
+ if (r1.len > f.len || r2.len > f.len)
+ abort();
+ *f1 = r1;
+ *f2 = r2;
+ return chunks;
+}
+
+/*
+ * extract parts of a "diff3 -m" or "wiggle -m" output
+ */
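+/*
+ * The input is expected to contain conflict sections of the usual form
+ * (a sketch - text outside the markers is copied to all three outputs):
+ *
+ *   <<<<<<<
+ *   ...text for the 1st output...
+ *   |||||||
+ *   ...text for the 2nd output...
+ *   =======
+ *   ...text for the 3rd output...
+ *   >>>>>>>
+ *
+ * plus the degenerate diff3 form with no ||||||| line, handled by
+ * states 4 and 5 below.
+ */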
+int split_merge(struct stream f, struct stream *f1, struct stream *f2, struct stream *f3)
+{
+ int lineno;
+ int state = 0;
+ char *cp, *end;
+ struct stream r1, r2, r3;
+ f1->body = NULL;
+ f2->body = NULL;
+
+ r1.body = xmalloc(f.len);
+ r2.body = xmalloc(f.len);
+ r3.body = xmalloc(f.len);
+ r1.len = r2.len = r3.len = 0;
+
+ cp = f.body;
+ end = f.body+f.len;
+ while (cp < end) {
+ /* state:
+ * 0 not in conflict
+ * 1 in file 1 of conflict
+ * 2 in file 2 of conflict
+ * 3 in file 3 of conflict
+ * 4 in file 2 but expecting 1/3 next
+ * 5 in file 1/3
+ */
+ int len = end-cp;
+ lineno++;
+ switch (state) {
+ case 0:
+ if (len >= 8 &&
+ strncmp(cp, "<<<<<<<", 7) == 0 &&
+ (cp[7] == ' ' || cp[7] == '\n')
+ ) {
+ char *peek;
+ state = 1;
+ skip_eol(&cp, end);
+ /* diff3 will do something a bit strange when
+ * the 1st and 3rd sections are the same.
+ * it reports
+ * <<<<<<<
+ * 2nd
+ * =======
+ * 1st and 3rd
+ * >>>>>>>
+ * Without a ||||||| at all.
+ * so to know if we are in '1' or '2', skip forward
+ * and have a peek.
+ */
+ peek = cp;
+ while (peek < end) {
+ if (end-peek >= 8 &&
+ (peek[7] == ' ' || peek[7] == '\n')) {
+ if (strncmp(peek, "|||||||", 7) == 0 ||
+ strncmp(peek, ">>>>>>>", 7) == 0)
+ break;
+ else if (strncmp(peek, "=======", 7) == 0) {
+ state = 4;
+ break;
+ }
+ }
+ skip_eol(&peek, end);
+ }
+ } else {
+ char *cp2 = cp;
+ copyline(&r1, &cp2, end);
+ cp2 = cp;
+ copyline(&r2, &cp2, end);
+ copyline(&r3, &cp, end);
+ }
+ break;
+ case 1:
+ if (len >= 8 &&
+ strncmp(cp, "|||||||", 7) == 0 &&
+ (cp[7] == ' ' || cp[7] == '\n')
+ ) {
+ state = 2;
+ skip_eol(&cp, end);
+ } else
+ copyline(&r1, &cp, end);
+ break;
+ case 2:
+ if (len >= 8 &&
+ strncmp(cp, "=======", 7) == 0 &&
+ (cp[7] == ' ' || cp[7] == '\n')
+ ) {
+ state = 3;
+ skip_eol(&cp, end);
+ } else
+ copyline(&r2, &cp, end);
+ break;
+ case 3:
+ if (len >= 8 &&
+ strncmp(cp, ">>>>>>>", 7) == 0 &&
+ (cp[7] == ' ' || cp[7] == '\n')
+ ) {
+ state = 0;
+ skip_eol(&cp, end);
+ } else
+ copyline(&r3, &cp, end);
+ break;
+ case 4:
+ if (len >= 8 &&
+ strncmp(cp, "=======", 7) == 0 &&
+ (cp[7] == ' ' || cp[7] == '\n')
+ ) {
+ state = 5;
+ skip_eol(&cp, end);
+ } else
+ copyline(&r2, &cp, end);
+ break;
+ case 5:
+ if (len >= 8 &&
+ strncmp(cp, ">>>>>>>", 7) == 0 &&
+ (cp[7] == ' ' || cp[7] == '\n')
+ ) {
+ state = 0;
+ skip_eol(&cp, end);
+ } else {
+ char *t = cp;
+ copyline(&r1, &t, end);
+ copyline(&r3, &cp, end);
+ }
+ break;
+ }
+ }
+ *f1 = r1;
+ *f2 = r2;
+ *f3 = r3;
+ return state == 0;
+}
diff --git a/get-p-options b/get-p-options
new file mode 100644
index 0000000..dec5352
--- /dev/null
+++ b/get-p-options
@@ -0,0 +1,8 @@
+#!/bin/sh
+#
+# make sure all p commands are in the help file
+
+sed -n -e '/^case/,/^esac/p' p | grep ')$' | grep -v '(' |
+ tr '\011' '@' | grep -v '@' | tr -cs '[A-Za-z0-9]' '\n' | sort > /tmp/p.cmds
+sed -n -e '/^[a-z]/p' p.help | sort > /tmp/p.hlp
+comm -3 /tmp/p.cmds /tmp/p.hlp
diff --git a/load.c b/load.c
new file mode 100644
index 0000000..90ca1af
--- /dev/null
+++ b/load.c
@@ -0,0 +1,161 @@
+/*
+ * wiggle - apply rejected patches
+ *
+ * Copyright (C) 2003 Neil Brown <neilb@cse.unsw.edu.au>
+ *
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Author: Neil Brown
+ * Email: <neilb@suse.de>
+ */
+
+/*
+ * read in files
+ *
+ * Files are read in whole and stored in a
+ * struct stream {char*, len}
+ *
+ *
+ * loading the file "-" reads from stdin which might require
+ * reading into several buffers
+ */
+
+#include "wiggle.h"
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <stdlib.h>
+
+static void join_streams(struct stream list[], int cnt)
+{
+ /* join the first 'cnt' streams in the list
+ * into one by re-allocing list[0].body and copying
+ */
+ int len = 0;
+ int i;
+ char *c;
+
+ for (i = 0; i < cnt ; i++)
+ len += list[i].len;
+
+ c = realloc(list[0].body, len+1);
+ if (c == NULL)
+ die();
+
+ list[0].body = c;
+ c += list[0].len;
+ list[0].len = len;
+ for (i = 1; i < cnt; i++) {
+ memcpy(c, list[i].body, list[i].len);
+ c += list[i].len;
+ list[i].len = 0;
+ free(list[i].body);
+ }
+ c[0] = 0;
+}
+
+static struct stream load_regular(int fd)
+{
+ struct stat stb;
+ struct stream s;
+ fstat(fd, &stb);
+
+ s.len = stb.st_size;
+ s.body = xmalloc(s.len+1);
+ if (read(fd, s.body, s.len) != s.len)
+ die();
+
+ s.body[s.len] = 0;
+ return s;
+}
+
+static struct stream load_other(int fd)
+{
+
+ struct stream list[10];
+ int i = 0;
+
+ while (1) {
+ list[i].body = xmalloc(8192);
+ list[i].len = read(fd, list[i].body, 8192);
+ if (list[i].len < 0)
+ die();
+ if (list[i].len == 0)
+ break;
+ i++;
+ if (i == 10) {
+ join_streams(list, i);
+ i = 1;
+ }
+ }
+ join_streams(list, i);
+ return list[0];
+}
+
+struct stream load_segment(FILE *f,
+ unsigned int start, unsigned int end)
+{
+ struct stream s;
+ s.len = end - start;
+ s.body = xmalloc(s.len);
+ fseek(f, start, 0);
+ if (fread(s.body, 1, s.len, f) != (size_t)s.len)
+ die();
+ return s;
+}
+
+struct stream load_file(char *name)
+{
+ struct stream s;
+ struct stat stb;
+ int fd;
+ int start, end;
+ int prefix_len = 0;
+
+ s.body = NULL;
+ s.len = 0;
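+ /* A name of the form "_wiggle_:<start>:<end>:<path>" refers to the
+ * byte range [start,end) inside <path>; load just that segment.
+ */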
+ if (sscanf(name, "_wiggle_:%d:%d:%n", &start, &end,
+ &prefix_len) >= 2 && prefix_len > 0) {
+ FILE *f = fopen(name + prefix_len, "r");
+ if (f) {
+ s = load_segment(f, start, end);
+ fclose(f);
+ } else {
+ s.body = NULL;
+ s.len = 0;
+ }
+ } else {
+ if (strcmp(name, "-") == 0)
+ fd = 0;
+ else {
+ fd = open(name, O_RDONLY);
+ if (fd < 0)
+ return s;
+ }
+
+ if (fstat(fd, &stb) == 0) {
+
+ if (S_ISREG(stb.st_mode))
+ s = load_regular(fd);
+ else
+ s = load_other(fd);
+ }
+ close(fd);
+ }
+ return s;
+}
+
diff --git a/merge2.c b/merge2.c
new file mode 100644
index 0000000..96768fb
--- /dev/null
+++ b/merge2.c
@@ -0,0 +1,640 @@
+/*
+ * wiggle - apply rejected patches
+ *
+ * Copyright (C) 2005 Neil Brown <neilb@cse.unsw.edu.au>
+ * Copyright (C) 2010 Neil Brown <neilb@suse.de>
+ *
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Author: Neil Brown
+ * Email: <neilb@suse.de>
+ */
+
+#include "wiggle.h"
+
+/*
+ * Second attempt at merging....
+ *
+ * We want to create a mergelist which identifies 'orig' and 'after'
+ * sections (from a and c) and conflicts (which are ranges of a,b,c which
+ * all don't match).
+ * It is also helpful to differentiate 'orig' sections that aren't
+ * matched in 'b' with orig sections that are.
+ * To help with highlighting, it will be useful to know where
+ * the conflicts match the csl lists.
+ *
+ * This can all be achieved with a list of (a,b,c,c1,c2) 5-tuples.
+ * If two consecutive differ in more than one of a,b,c, it is a
+ * conflict.
+ * If only 'a' differs, it is un-matched original.
+ * If only 'b' differs, it is matched, unchanged original
+ * If only 'c' differs, it is 1
+ */
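+/*
+ * (An informal example of one tuple type: check_alreadyapplied() below
+ * marks a tuple AlreadyApplied when the 'a' and 'c' ranges are the same
+ * length and compare equal word for word - the change in the patch is
+ * already present in the original, so it is reported rather than
+ * re-applied.)
+ */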
+
+static inline int min(int a, int b)
+{
+ return a < b ? a : b;
+}
+
+static int check_alreadyapplied(struct file af, struct file cf,
+ struct merge *m)
+{
+ int i;
+ if (m->al != m->cl)
+ return 0;
+ for (i = 0; i < m->al; i++) {
+ if (af.list[m->a+i].len != cf.list[m->c+i].len)
+ return 0;
+ if (strncmp(af.list[m->a+i].start,
+ cf.list[m->c+i].start,
+ af.list[m->a+i].len) != 0)
+ return 0;
+ }
+ if (do_trace) {
+ printf("already applied %d,%d,%d - %d,%d,%d\n",
+ m->a, m->b, m->c, m->al, m->bl, m->cl);
+ printf(" %.10s - %.10s\n", af.list[m->a].start,
+ cf.list[m->c].start);
+ }
+ m->type = AlreadyApplied;
+ return 1;
+}
+
+/* A 'cut-point' is a location in the merger where it is reasonable
+ * to change the mode of display - between displaying the merger
+ * and displaying the separate streams.
+ * A 'conflict' can only be displayed as separate streams, so when
+ * one is found, we need to find a preceding and a trailing cut-point
+ * and enlarge the conflict to that range.
+ * A suitable location is one where all three streams are at a line-end.
+ */
+static int is_cutpoint(struct merge m,
+ struct file af, struct file bf, struct file cf)
+{
+ return ((m.a == 0 || ends_line(af.list[m.a-1])) &&
+ (m.b == 0 || ends_line(bf.list[m.b-1])) &&
+ (m.c == 0 || ends_line(cf.list[m.c-1])));
+}
+
+static int isolate_conflicts(struct file af, struct file bf, struct file cf,
+ struct csl *csl1, struct csl *csl2, int words,
+ struct merge *m, int show_wiggles)
+{
+ /* A conflict indicates that something is definitely wrong
+ * and so we need to be a bit suspicious of nearby apparent matches.
+ * To display a conflict effectively we expand its effect to
+ * include any Extraneous, Unmatched, Changed or AlreadyApplied text.
+ * Also, unless 'words', we need to include any partial lines
+ * in the Unchanged text that forms the border of a conflict.
+ *
+ * A Changed text may also border a conflict, but it can
+ * only border one conflict (where as an Unchanged can border
+ * a preceeding and a following conflict).
+ * The 'new' section of a Changed text appears in the
+ * conflict as does any part of the original before
+ * a newline.
+ *
+ * If 'show_wiggles' is set we treat wiggles like conflicts.
+ * A 'wiggle' is implied by any Extraneous text being ignored,
+ * or any line that has both Changed and Unmatched content.
+ * (Unmatched content where nothing is changed is common and not
+ * really a 'wiggle').
+ *
+ * A hunk header is never considered part of a conflict. It
+ * thereby can serve as a separator between conflicts.
+ *
+ * We need to ensure there is adequate context for the conflict.
+ * So ensure there are at least 3 newlines in Extraneous or
+ * Unchanged on both sides of a Conflict - but don't go so far
+ * as including a hunk header.
+ * If there are 3, and they are all in 'Unchanged' sections, then
+ * that much context is not really needed - reduce it a bit.
+ */
+ int i, j, k;
+ int cnt = 0;
+ int changed = 0;
+ int unmatched = 0;
+
+ for (i = 0; m[i].type != End; i++) {
+ if (m[i].type == Changed)
+ changed = 1;
+ if (m[i].type == Unmatched)
+ unmatched = 1;
+ if (m[i].type == Conflict ||
+ (show_wiggles && ((changed && unmatched)
+ || m[i].type == Extraneous))) {
+ /* We have a conflict (or wiggle) here.
+ * First search backwards for an Unchanged marking
+ * things as in_conflict. Then find the
+ * cut-point in the Unchanged. If there isn't one,
+ * keep looking.
+ *
+ * Then search forward doing the same thing.
+ */
+ int newlines = 0;
+ cnt++;
+ m[i].in_conflict = 1;
+ j = i;
+ while (--j >= 0) {
+ if (m[j].type == Extraneous &&
+ bf.list[m[j].b].start[0] == '\0')
+ /* hunk header - not conflict any more */
+ break;
+ if (!m[j].in_conflict) {
+ m[j].in_conflict = 1;
+ m[j].lo = 0;
+ } else if (m[j].type == Changed) {
+ /* This can no longer form a border */
+ m[j].hi = -1;
+ /* We merge these conflicts and stop searching */
+ cnt--;
+ break;
+ }
+ if (m[j].type == Extraneous) {
+ for (k = m[j].bl; k > 0; k--)
+ if (ends_line(bf.list[m[j].b+k-1]))
+ newlines++;
+ }
+
+ if (m[j].type == Unchanged || m[j].type == Changed) {
+ /* If we find enough newlines in this section,
+ * then we only really need 1, but would rather
+ * it wasn't the first one. 'firstk' allows us
+ * to track which newline we actually use
+ */
+ int firstk = m[j].al+1;
+ if (words) {
+ m[j].hi = m[j].al;
+ break;
+ }
+ /* need to find the last line-break, which
+ * might be after the last newline, if there
+ * is one, or might be at the start
+ */
+ for (k = m[j].al; k > 0; k--)
+ if (ends_line(af.list[m[j].a+k-1])) {
+ if (firstk >= m[j].al)
+ firstk = k;
+ newlines++;
+ if (newlines >= 3) {
+ k = firstk;
+ break;
+ }
+ }
+ if (k > 0)
+ m[j].hi = k;
+ else if (is_cutpoint(m[j], af,bf,cf))
+ m[j].hi = 0;
+ else
+ /* no start-of-line found... */
+ m[j].hi = -1;
+ if (m[j].hi > 0 && m[j].type == Changed) {
+ /* this can only work if start is
+ * also a line break */
+ if (is_cutpoint(m[j], af,bf,cf))
+ /* ok */;
+ else
+ m[j].hi = -1;
+ }
+ if (m[j].hi >= 0)
+ break;
+ }
+ }
+
+ /* now the forward search */
+ newlines = 0;
+ for (j = i+1; m[j].type != End; j++) {
+ if (m[j].type == Extraneous &&
+ bf.list[m[j].b].start[0] == '\0')
+ /* hunk header - not conflict any more */
+ break;
+ m[j].in_conflict = 1;
+ if (m[j].type == Extraneous) {
+ for (k = 0; k < m[j].bl; k++)
+ if (ends_line(bf.list[m[j].b+k]))
+ newlines++;
+ }
+ if (m[j].type == Unchanged || m[j].type == Changed) {
+ m[j].hi = m[j].al;
+ if (words) {
+ m[j].lo = 0;
+ break;
+ }
+ /* need to find a line-break, which might be at
+ * the very beginning, or might be after the
+ * first newline - if there is one
+ */
+ if (is_cutpoint(m[j], af,bf,cf))
+ m[j].lo = 0;
+ else {
+ /* If we find enough newlines in this section,
+ * then we only really need 1, but would rather
+ * it wasn't the first one. 'firstk' allows us
+ * to track which newline we actually use
+ */
+ int firstk = -1;
+ for (k = 0 ; k < m[j].al ; k++)
+ if (ends_line(af.list[m[j].a+k])) {
+ if (firstk <= 0)
+ firstk = k;
+ newlines++;
+ if (newlines >= 3) {
+ k = firstk;
+ break;
+ }
+ }
+ if (firstk >= 0 &&
+ m[j+1].type == Unmatched) {
+ /* If this Unmatched exceeds 3 lines, just stop here */
+ int p;
+ int nl = 0;
+ for (p = 0; p < m[j+1].al ; p++)
+ if (ends_line(af.list[m[j+1].a+p])) {
+ nl++;
+ if (nl > 3)
+ break;
+ }
+ if (nl > 3)
+ k = firstk;
+ }
+ if (k < m[j].al)
+ m[j].lo = k+1;
+ else
+ /* no start-of-line found */
+ m[j].lo = m[j].al+1;
+ }
+ if (m[j].lo <= m[j].al+1 && m[j].type == Changed) {
+ /* this can only work if the end is a line break */
+ if (is_cutpoint(m[j+1], af,bf,cf))
+ /* ok */;
+ else
+ m[j].lo = m[j].al+1;
+ }
+ if (m[j].lo < m[j].al+1)
+ break;
+ }
+ }
+ i = j - 1;
+ }
+ if (m[i].al > 0 && ends_line(af.list[m[i].a+m[i].al-1])) {
+ unmatched = 0;
+ changed = 0;
+ }
+ }
+ return cnt;
+}
+
+struct ci make_merger(struct file af, struct file bf, struct file cf,
+ struct csl *csl1, struct csl *csl2, int words,
+ int ignore_already, int show_wiggles)
+{
+ /* find the wiggles and conflicts between csl1 and csl2
+ */
+ struct ci rv;
+ int i, l;
+ int a, b, c, c1, c2;
+ int wiggle_found = 0;
+
+ rv.conflicts = rv.wiggles = rv.ignored = 0;
+
+ for (i = 0; csl1[i].len; i++)
+ ;
+ l = i;
+ for (i = 0; csl2[i].len; i++)
+ ;
+ l += i;
+ /* maybe a bit of slack at each end */
+ l = l * 4 + 10;
+
+ rv.merger = xmalloc(sizeof(struct merge)*l);
+
+ a = b = c = c1 = c2 = 0;
+ i = 0;
+ while (1) {
+ int match1, match2;
+ match1 = (a >= csl1[c1].a && b >= csl1[c1].b); /* c1 doesn't match */
+ match2 = (b >= csl2[c2].a && c >= csl2[c2].b);
+
+ rv.merger[i].a = a;
+ rv.merger[i].b = b;
+ rv.merger[i].c = c;
+ rv.merger[i].c1 = c1;
+ rv.merger[i].c2 = c2;
+ rv.merger[i].in_conflict = 0;
+
+ if (!match1 && match2) {
+ /* This is either Unmatched or Extraneous - probably both.
+ * If the match2 is a hunk-header Extraneous, it must
+ * align with an end-of-line in 'a', so adjust endpoint
+ */
+ int newa = csl1[c1].a;
+ if (b < bf.elcnt && bf.list[b].start
+ && bf.list[b].start[0] == '\0') {
+ while (newa > a &&
+ !ends_line(af.list[newa-1]))
+ newa--;
+ while (newa < af.elcnt && !(newa == 0 || ends_line(af.list[newa-1])))
+ newa++;
+ }
+ if (a < newa) {
+ /* some unmatched text */
+ rv.merger[i].type = Unmatched;
+ rv.merger[i].al = newa - a;
+ rv.merger[i].bl = 0;
+ rv.merger[i].cl = 0;
+ wiggle_found++;
+ } else {
+ int newb;
+ int j;
+ assert(b < csl1[c1].b);
+ /* some Extraneous text */
+ /* length is min of unmatched on left
+ * and matched on right.
+ * However a hunk-header must be an
+ * Extraneous section by itself, so if this
+ * starts with one, the length is 1, and if
+ * there is one in the middle, only take the
+ * text up to there for now.
+ */
+ rv.merger[i].type = Extraneous;
+ rv.merger[i].al = 0;
+ newb = b +
+ min(csl1[c1].b - b,
+ csl2[c2].len - (b-csl2[c2].a));
+ if (bf.list[b].start[0] == '\0')
+ newb = b + 1;
+ for (j = b; j < newb; j++) {
+ if (bf.list[j].start[0] == '\0') {
+ if (wiggle_found > 1)
+ rv.wiggles++;
+ wiggle_found = 0;
+ if (j > b)
+ newb = j;
+ } else
+ wiggle_found++;
+ }
+ rv.merger[i].cl =
+ rv.merger[i].bl = newb - b;
+ }
+ } else if (match1 && !match2) {
+ /* some changed text
+ * if 'c' is currently at a suitable cut-point, then
+ * we can look for a triple-cut-point for start.
+ * Also, if csl2[c2].b isn't in a conflict, and is
+ * a suitable cut-point, then we could make a
+ * triple-cut-point for end of a conflict.
+ */
+
+ rv.merger[i].type = Changed;
+ rv.merger[i].bl = min(csl1[c1].b+csl1[c1].len, csl2[c2].a) - b;
+ rv.merger[i].al = rv.merger[i].bl;
+ rv.merger[i].cl = csl2[c2].b - c;
+ } else if (match1 && match2) {
+ /* Some unchanged text
+ */
+ rv.merger[i].type = Unchanged;
+ rv.merger[i].bl =
+ min(csl1[c1].len - (b-csl1[c1].b),
+ csl2[c2].len - (b-csl2[c2].a));
+ rv.merger[i].al = rv.merger[i].cl =
+ rv.merger[i].bl;
+ } else {
+ /* must be a conflict.
+ * Move a and c to next match, and b to closest of the two
+ */
+ rv.merger[i].type = Conflict;
+ rv.merger[i].al = csl1[c1].a - a;
+ rv.merger[i].cl = csl2[c2].b - c;
+ rv.merger[i].bl = min(csl1[c1].b, csl2[c2].a) - b;
+ if (ignore_already &&
+ check_alreadyapplied(af, cf, &rv.merger[i]))
+ rv.ignored++;
+ }
+ a += rv.merger[i].al;
+ b += rv.merger[i].bl;
+ c += rv.merger[i].cl;
+ i++;
+
+ while (csl1[c1].a + csl1[c1].len <= a && csl1[c1].len)
+ c1++;
+ assert(csl1[c1].b + csl1[c1].len >= b);
+ while (csl2[c2].b + csl2[c2].len <= c && csl2[c2].len)
+ c2++;
+ assert(csl2[c2].a + csl2[c2].len >= b);
+ if (csl1[c1].len == 0 && csl2[c2].len == 0 &&
+ a == csl1[c1].a && b == csl1[c1].b &&
+ b == csl2[c2].a && c == csl2[c2].b)
+ break;
+ }
+ rv.merger[i].type = End;
+ rv.merger[i].a = a;
+ rv.merger[i].b = b;
+ rv.merger[i].c = c;
+ rv.merger[i].c1 = c1;
+ rv.merger[i].c2 = c2;
+ rv.merger[i].in_conflict = 0;
+ assert(i < l);
+ rv.conflicts = isolate_conflicts(af, bf, cf, csl1, csl2, words,
+ rv.merger, show_wiggles);
+ if (wiggle_found)
+ rv.wiggles++;
+ return rv;
+}
+
+static void printrange(FILE *out, struct file *f, int start, int len)
+{
+ while (len > 0) {
+ printword(out, f->list[start]);
+ start++;
+ len--;
+ }
+}
+
+struct ci print_merge2(FILE *out, struct file *a, struct file *b, struct file *c,
+ struct csl *c1, struct csl *c2,
+ int words, int ignore_already, int show_wiggles)
+{
+ struct ci rv = make_merger(*a, *b, *c, c1, c2,
+ words, ignore_already, show_wiggles);
+ struct merge *m;
+
+ for (m = rv.merger; m->type != End ; m++) {
+ struct merge *cm;
+ if (do_trace)
+ printf("[%s: %d-%d,%d-%d,%d-%d%s(%d,%d)]\n",
+ m->type==Unmatched ? "Unmatched" :
+ m->type==Unchanged ? "Unchanged" :
+ m->type==Extraneous ? "Extraneous" :
+ m->type==Changed ? "Changed" :
+ m->type==AlreadyApplied ? "AlreadyApplied" :
+ m->type==Conflict ? "Conflict":"unknown",
+ m->a, m->a+m->al-1,
+ m->b, m->b+m->bl-1,
+ m->c, m->c+m->cl-1,
+ m->in_conflict ? " in_conflict" : "",
+ m->lo, m->hi);
+
+ while (m->in_conflict) {
+ /* need to print from 'hi' to 'lo' of next
+ * Unchanged which is < its hi
+ */
+ int found_conflict = 0;
+ int st = 0, st1;
+ if (m->type == Unchanged || m->type == Changed)
+ if (m->hi >= m->lo)
+ st = m->hi;
+ st1 = st;
+
+ if (m->type == Unchanged)
+ printrange(out, a, m->a+m->lo, m->hi - m->lo);
+
+ if (do_trace)
+ for (cm = m; cm->in_conflict; cm++) {
+ printf("{%s: %d-%d,%d-%d,%d-%d%s(%d,%d)}\n",
+ cm->type==Unmatched?"Unmatched":
+ cm->type==Unchanged?"Unchanged":
+ cm->type==Extraneous?"Extraneous":
+ cm->type==Changed?"Changed":
+ cm->type==AlreadyApplied?"AlreadyApplied":
+ cm->type==Conflict?"Conflict":"unknown",
+ cm->a, cm->a+cm->al-1,
+ cm->b, cm->b+cm->bl-1,
+ cm->c, cm->c+cm->cl-1,
+ cm->in_conflict ? " in_conflict" : "",
+ cm->lo, cm->hi);
+ if ((cm->type == Unchanged || cm->type == Changed)
+ && cm != m && cm->lo < cm->hi)
+ break;
+ }
+
+ fputs(words ? "<<<---" : "<<<<<<<\n", out);
+ for (cm = m; cm->in_conflict; cm++) {
+ if (cm->type == Conflict)
+ found_conflict = 1;
+ if ((cm->type == Unchanged || cm->type == Changed)
+ && cm != m && cm->lo < cm->hi) {
+ printrange(out, a, cm->a, cm->lo);
+ break;
+ }
+ printrange(out, a, cm->a+st1, cm->al-st1);
+ st1 = 0;
+ }
+ fputs(words ? "|||" : "|||||||\n", out);
+ st1 = st;
+ for (cm = m; cm->in_conflict; cm++) {
+ if ((cm->type == Unchanged || cm->type == Changed)
+ && cm != m && cm->lo < cm->hi) {
+ printrange(out, b, cm->b, cm->lo);
+ break;
+ }
+ printrange(out, b, cm->b+st1, cm->bl-st1);
+ st1 = 0;
+ }
+ fputs(words ? "===" : "=======\n", out);
+ st1 = st;
+ for (cm = m; cm->in_conflict; cm++) {
+ if (cm->type == Unchanged &&
+ cm != m && cm->lo < cm->hi) {
+ printrange(out, c, cm->c, cm->lo);
+ break;
+ }
+ if (cm->type == Changed)
+ st1 = 0; /* All of the result of the change must be printed */
+ printrange(out, c, cm->c+st1, cm->cl-st1);
+ st1 = 0;
+ }
+ if (!found_conflict) {
+ /* This section was wiggled in successfully,
+ * but full conflict display was requested.
+ * So now print out the wiggled result as well.
+ */
+ fputs(words ? "&&&" : "&&&&&&&\n", out);
+ st1 = st;
+ for (cm = m; cm->in_conflict; cm++) {
+ int last = 0;
+ if ((cm->type == Unchanged || cm->type == Changed)
+ && cm != m && cm->lo < cm->hi)
+ last = 1;
+ switch (cm->type) {
+ case Unchanged:
+ case AlreadyApplied:
+ case Unmatched:
+ printrange(out, a, cm->a+st1,
+ last ? cm->lo : cm->al-st1);
+ break;
+ case Extraneous:
+ break;
+ case Changed:
+ printrange(out, c, cm->c+st1,
+ last ? cm->lo : cm->cl-st1);
+ break;
+ case Conflict:
+ case End:
+ assert(0);
+ }
+ if (last)
+ break;
+ st1 = 0;
+ }
+ }
+ fputs(words ? "--->>>" : ">>>>>>>\n", out);
+ m = cm;
+ if (m->in_conflict && m->type == Unchanged
+ && m->hi >= m->al) {
+ printrange(out, a, m->a+m->lo, m->hi-m->lo);
+ m++;
+ }
+ }
+
+ /* there is always some non-conflict after a conflict,
+ * unless we hit the end
+ */
+ if (m->type == End)
+ break;
+
+ if (do_trace) {
+ printf("<<%s: %d-%d,%d-%d,%d-%d%s(%d,%d)>>\n",
+ m->type==Unmatched?"Unmatched":
+ m->type==Unchanged?"Unchanged":
+ m->type==Extraneous?"Extraneous":
+ m->type==Changed?"Changed":
+ m->type==AlreadyApplied?"AlreadyApplied":
+ m->type==Conflict?"Conflict":"unknown",
+ m->a, m->a+m->al-1,
+ m->b, m->b+m->bl-1,
+ m->c, m->c+m->cl-1,
+ m->in_conflict ? " in_conflict" : "",
+ m->lo, m->hi);
+ }
+
+ switch (m->type) {
+ case Unchanged:
+ case AlreadyApplied:
+ case Unmatched:
+ printrange(out, a, m->a, m->al);
+ break;
+ case Extraneous:
+ break;
+ case Changed:
+ printrange(out, c, m->c, m->cl);
+ break;
+ case Conflict:
+ case End:
+ assert(0);
+ }
+ }
+ return rv;
+}
diff --git a/notes b/notes
new file mode 100644
index 0000000..4d11ab0
--- /dev/null
+++ b/notes
@@ -0,0 +1,134 @@
+
+Wiggle - wiggle a mis-matched patch into a file.
+
+Given
+ 1/ a file
+ 2/ a patch - which is two file fragments
+
+ find the minimal differences between the fragments in the patch
+ and apply those to the file.
+ This requires us to do a word-diff of the file with frag-A, and
+ frag-A with frag-B, and then merge the results.
+
+ We read in the file and the 2 frags and break them into words, keeping
+ an index and hash for each.
+
+ We then perform the two diffs producing lists of inserts and deletes.
+
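+ For example, using the same file names as the tests mentioned later
+ in these notes, a three-file merge such as
+
+     wiggle -m orig new new2
+
+ treats 'orig' as the file to patch and applies to it the changes
+ needed to turn 'new' into 'new2', emitting conflict markers where
+ those changes cannot be placed cleanly.
+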
+
+
+ToDo
+
+ implement --replace
+ describe and implement correct replacement procedure
+ Reject matches that have a disproportionate cost
+ implement testing structure. DONE
+
+
+
+Testing:
+ A directory tree containing tests. We look for key files
+ and run the appropriate test.
+ Key files are:
+ script : run that script in that directory
+ diff : if new exists, diff orig with new
+ else diff 'orig' with -1 of 'patch'
+ ldiff : as above, but lines
+ rediff : rediff 'patch'
+ merge : if 'patch' merge 'orig' with 'patch'
+ else merge 'orig' 'new' 'new2'
+
+
+Replacement procedure:
+
+ Goal: Every change between A' and B' must be merged into
+ A somehow to produce B.
+
+ We can think of changes as additions, deletions, or replacements.
+
+ Every addition must be inserted somewhere, at the site of
+ best match for the context. If there is no good match...
+ I guess we insert at start or finish.
+
+ Every deletion is merged either by deleting matching text,
+ or inserting the string <<<---deleted-text--->>> at some
+ reasonably appropriate location.
+
+ Every replacement is merged either by removing the original
+ and replacing it with the new, or by inserting
+ <<<---oldtext///newtext+++>>>
+
+
+ For each difference b->c between B and C:
+ if b precisely aligns with a in A, then replace a with c
+ else find some set of lines that b may be in and produce:
+
+ <<<<<<<<<<
+ segment from A
+ ||||||||||
+ b, up to newlines
+ ==========
+ c, up to newlines
+ >>>>>>>>>>
+
+
+ Maybe several (two?) passes.
+
+-mw orig new new2 in tests/test dies. - FIXED
+
+in test5, -dw orig new
+ produces strange output FIXED
+
+
+if no matches are found, core is dumped as lcsl is NULL FIXED
+
+wdiff to look more like udiff
+ unchanged
++addition
+-deletion
+|change<<<+++additions+++>>> and <<<---deletions--->>>>
+
+
+@@ line,numbers @@ in diff output
+
+Speed: use aproxword for pdiff lineup. DONE
+
+"refine" takes a diff and refines it, sortof
+
+return a lcsl when reading a patch and refine that
+rather than computing from scratch.
+
+FIXME: pdiff should pick best bit, and rediff the two sides. DONE
+
+---------------------------------
+Thoughts about editing a merge.
+
+When viewing a merge we might decide that:
+
+ - a change is not wanted
+ - a conflict should be resolved for the original
+ - a conflict should be resolved for the new
+
+ - some text needs to be edited in place
+ - a change should be applied against a different place in the original
+
+ - These can apply to a single change, to a line, or to
+ a chunk
+
+We can achieve most of these by changing the merge result,
+e.g. Changed to Unchanged or Conflict to one of the above.
+
+Moving a chunk will require shuffling the merger array.
+
+Replacing text is probably best done with a special merge type??
+
+Selecting the region to act on is awkward. Need to track 'current'
+merge point for the cursor. Maybe insert $$ at the other end??
+
+How about:
+ press E
+ current change is surrounded with '$'
+ cursor movement can extend the range
+ K to keep original
+ C to change
+ R to retype
diff --git a/p b/p
new file mode 100755
index 0000000..b29ddce
--- /dev/null
+++ b/p
@@ -0,0 +1,1155 @@
+#!/bin/bash
+
+# patch management
+#
+# Copyright (C) 2003 Neil Brown <neilb@cse.unsw.edu.au>
+#
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+#
+# Author: Neil Brown
+# Email: <neilb@cse.unsw.edu.au>
+# Paper: Neil Brown
+# School of Computer Science and Engineering
+# The University of New South Wales
+# Sydney, 2052
+# Australia
+
+
+# metadata is in .patches
+# there is:
+# files: list of all files checked out
+# name: name of current patch
+# status: status of current patch
+# notes: notes on current patch
+# applied/ patches applied nnn-name
+# removed/ patches removed nnn-name
+# included/ patches that have been included upstream
+# patch: a recent copy of the 'current' patch
+# get-version: a script which will report the version number of the base dist
+# dest/ symlink to directory to publish snapshots to
+# mail/ composed mail messages ready for sending
+# maintainer who to email patches to (Linus etc)
+# cc who to CC patches to: prefix address
+#
+# the nnn in names in applied and removed are sequence numbers
+# whenever we add a file we choose one more than the highest used number
+# patch files contain then name implicitly and start with
+# Status: status
+# then a blank line, normally a one line description, another blank, and more detail.
+#
+
+#
+# Todo - auto bk pull:
+# bk pull
+# bk export -t patch -r DEVEL, > /tmp/apatch
+# bk tag DEVEL
+# while p open last && p discard ; do : ; done
+# p clean
+# patch -p1 -f < /tmp/apatch
+
+find_home()
+{
+ # walk up directory tree until a .patches directory
+ # is found.
+ # set OrigDir to name of where we were .. not dots.
+ OrigDir=
+ dir=`pwd`
+ while [ ! -d .patches -a " $dir" != " /" ]
+ do
+ base=${dir##*/}
+ base=${base#/}
+ dir=${dir%/*}
+ case $dir in
+ "" ) dir=/
+ esac
+ OrigDir=$base/$OrigDir
+ cd ..
+ done
+ test -d .patches
+}
+
+get_meta()
+{
+ name=`cat .patches/name 2> /dev/null`
+ status=`cat .patches/status 2> /dev/null`
+}
+
+nl='
+'
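+# get_conf name [context]
+# Read 'name = value' settings from .patches/config, honouring the
+# [global] section and, when given, the [context] section.  Repeated
+# settings accumulate one per line, an empty value clears the list
+# collected so far, and the result is left in a variable named 'name'.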
+get_conf()
+{
+ _name=$1
+ _context=$2
+ _result=
+ _active=yes
+ _sep=
+ [ -f .patches/config ] || >> .patches/config
+ while read a b c
+ do
+ case $a in
+ '[global]' ) _active=yes ;;
+ "[$_context]") _active=yes ;;
+ "["*"]" ) _active= ;;
+ * ) if [ " $b" == " =" -a " $a" = " $_name" -a -n "$_active" ];
+ then
+ if [ -z "$c" ]; then
+ _result= _sep=
+ else
+ _result="$_result$_sep$c"
+ _sep=$nl
+ fi
+ fi
+ ;;
+ esac
+ done < .patches/config
+ _result=$(echo "$_result" | sed 's/^"//' )
+ eval $_name=\"\$_result\"
+}
+
+upgrade_one()
+{
+ # move $1~current~ to .patches/current/$1 and same for orig
+ fl=/$1
+ for f in current orig
+ do
+ if [ -f "$1~$f~" ]
+ then
+ mkdir -p ".patches/$f${fl%/*}"
+ mv "$1~$f~" ".patches/$f/$1"
+ fi
+ done
+}
+
+
+forget_one()
+{
+ if true # || cmp -s "$1" ".patches/current/$1~" && cmp -s "$1" ".patches/orig/$1"
+ then
+ rm -f ".patches/current/$1" ".patches/orig/$1"
+ chmod -w "$1"
+ else
+ echo >&2 "ERROR $1 doesn't match original"
+ fi
+}
+
+rebase_one()
+{
+ f="/$1"
+ mkdir -p .patches/orig${f%/*}
+ mkdir -p .patches/current${f%/*}
+ rm -f .patches/orig$f .patches/current$f
+ cp -p $1 .patches/orig$f
+ cp -p $1 .patches/current$f
+}
+
+snap_one()
+{
+ cp "$1" "$1~snapshot~"
+}
+
+snap_diff()
+{
+ diff -u "$1" "$1~snapshot~"
+}
+snap_back()
+{
+ cp "$1~snapshot~" "$1"
+}
+
+check_out()
+{
+ file=$1
+ file=${file#./}
+ f=/$file; f=${f%/*}
+ [ -f $file ] || >> $file
+ if [ -f $file ]
+ then
+ if [ ! -f ".patches/orig/$file" ] ; then
+ mkdir -p .patches/orig/$f
+ mv "$file" ".patches/orig/$file"
+ cp ".patches/orig/$file" "$file"
+ echo $file >> .patches/files
+ sort -o .patches/files .patches/files
+ chmod u+w "$file"
+ fi
+ if [ ! -f ".patches/current/$file" ] ; then
+ mkdir -p .patches/current/$f
+ mv "$file" ".patches/current/$file"
+ cp ".patches/current/$file" "$file"
+ fi
+ else
+ echo >&2 Cannot checkout $file
+ fi
+}
+
+all_files()
+{
+ >> .patches/files
+ while read file
+ do eval $1 $file $2
+ done < .patches/files
+}
+
+diff_one()
+{
+ if cmp -s ".patches/current/$1" "$1" || [ ! -f "$1" -a ! -f ".patches/current/$1" ]
+ then :
+ else
+ echo
+ echo "diff .prev/$1 ./$1"
+ if [ " $2" = " -R" ]
+ then
+ diff -N --show-c-function -u "./$1" "./.patches/current/$1"
+ else
+ diff -N --show-c-function -u "./.patches/current/$1" "./$1"
+ fi
+ fi
+}
+
+diff_one_orig()
+{
+ if cmp -s ".patches/orig/$1" "$1"
+ then :
+ else
+ echo
+ echo "diff ./.patches/orig/$1 ./$1"
+ diff --show-c-function -u "./.patches/orig/$1" "./$1"
+ fi
+}
+
+commit_one()
+{
+ rm -f ".patches/current/$1"
+ if [ -f "$1" ] ; then
+ mv "$1" ".patches/current/$1"
+ cp -p ".patches/current/$1" $1
+ chmod u+w $1
+ fi
+}
+
+discard_one()
+{
+ cmp -s ".patches/current/$1" $1 || { rm -f "$1" ; cp ".patches/current/$1" $1; }
+ chmod u+w $1
+}
+
+swap_one()
+{
+ mv "$1" "$1.tmp"
+ mv ".patches/current/$1" "$1"
+ mv "$1.tmp" ".patches/current/$1"
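+ A typical first session might look like this (the file name is
+ purely illustrative):
+
+      p co drivers/md.c      # start tracking changes to the file
+      vi drivers/md.c        # edit as usual
+      p make                 # build and review the resulting patch
+      p name my-fix          # give the patch a name
+      p commit               # record it on the applied stack
+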
+}
+
+make_diff()
+{
+ get_conf tagline
+ upgrade_one "$1"
+ { {
+ [ -s .patches/status ] && echo "Status: `cat .patches/status`"
+ [ -s .patches/notes ] && { echo; cat .patches/notes ; }
+ if [ -z "$tagline" ] || grep -F "$tagline" .patches/notes > /dev/null 2>&1
+ then :
+ else echo "$tagline"
+ fi
+ echo
+ all_files diff_one $1 > .patches/tmp
+ echo "### Diffstat output"
+ diffstat -p0 2> /dev/null < .patches/tmp
+ cat .patches/tmp
+ [ -s .patches/tmp ] || rm .patches/patch
+ rm .patches/tmp
+ } | sed 's,^--- ./.patches/current/,--- .prev/,' ; } > .patches/patch
+}
+
+save_patch()
+{
+ dir=.patches/$1
+ name=$2
+ # move .patches/patch to $dir/nnn$name
+ # for some new nnn
+ [ -d $dir ] || mkdir $dir || exit 1
+ largest=`ls $dir | sed -n -e 's/^\([0-9][0-9][0-9]\).*/\1/p' | sort -n | tail -1`
+ if [ "0$largest" -eq 999 ]
+ then echo >&2 'ARRG - too many patches!' ; exit 1
+ fi
+ new=`expr "0$largest" + 1001`
+ new=${new#1}
+ mv .patches/patch $dir/$new$name
+}
+
+find_prefix()
+{
+ # set "prefix" to number for -pn by looking at first file in given patch.
+ n=${2-1}
+ file=`lsdiff $1 | head -$n | tail -1`
+ orig=$file
+ prefix=0
+ case $file in
+ b/* ) prefix=1; return
+ esac
+ while [ \( -n "$file" -a ! -f "$file" \) -o " $file" != " ${file#/}" ]
+ do
+ file=`expr "$file" : '[^/]*/\(.*\)'`
+ prefix=`expr $prefix + 1`
+ done
+ if [ -z "$file" ]
+ then echo "Cannot find $orig" >&2
+ if [ $n -gt 4 ]
+ then exit 2;
+ else find_prefix "$1" $[n+1]
+ fi
+ fi
+ if [ " $orig" != " $file" ]
+ then
+ echo "Found $orig as $file - prefix $prefix"
+ fi
+}
+
+extract_notes()
+{
+ # remove first line, Status: line, leading blanks,
+ # everything from ' *---' and trailing blanks
+ awk '
+ BEGIN { head= 1; blanks=0 ; }
+ head == 1 && ( $1 == "Status:" || $0 == "" ) {
+ next;
+ }
+ { head = 0; }
+ $0 == "" { blanks++; next; }
+ $0 ~ /^ *---/ { exit }
+ $0 ~ /^###/ { exit }
+ { while (blanks > 0) {
+ blanks--; print "";
+ }
+ print $0;
+ }
+ ' $1
+}
+
+
+if [ $# -eq 0 ]
+then
+ echo >&2 'Usage: p [help|co|make|discard|commit|status|name|...] args'
+ exit 1
+fi
+cmd=$1
+shift
+
+if [ " $cmd" = " help" ] || find_home
+then :
+else echo >&2 "p $cmd: cannot find .patches directory"
+ exit 1
+fi
+
+case $cmd in
+ co )
+ if [ $# -ne 1 ] ; then
+ echo >&2 Usage: p co file; exit 1
+ fi
+ file=$1
+ if [ ! -f "$OrigDir$file" ]
+ then
+ echo >&2 "p co: file $file not found"; exit 1;
+ fi
+ check_out "$OrigDir$file"
+
+ ;;
+ make | view )
+ case $1 in
+ "" )
+ make_diff
+ if [ -s .patches/patch ] ; then
+ pfile=.patches/patch
+ else
+ echo >&2 "No current patch" ; exit 1;
+ fi
+ ;;
+
+ */* ) pfile=$1;;
+ * ) pfile=`echo .patches/[ra][ep][mp]*/*$1*`
+ esac
+ if [ ! -f "$pfile" ]
+ then echo >&2 "Cannot find unique patch '$1' - found: $pfile"; exit 1;
+ fi
+ ptn='^\+.*(( |[^ ].{7}){10}.|[ ]$)'
+ if grep -E -s "$ptn" $pfile > /dev/null
+ then
+ ${PAGER-less -p "$ptn"} $pfile
+ else
+ ${PAGER-less} $pfile
+ fi
+ ;;
+
+ all )
+ all_files diff_one_orig
+ ;;
+ status | name )
+ case $# in
+ 1 )
+ get_meta
+ if [ $cmd = name ] ; then
+ if [ -n "$name" ]; then
+ echo "changing name from '$name' to '$1'"
+ else
+ echo "Setting name to '$1'"
+ fi
+ echo "$1" > .patches/name
+ fi
+ if [ $cmd = status ] ; then
+ if [ -n "$status" ]; then
+ echo "changing status from '$status' to '$1'"
+ else
+ echo "Setting status to '$1'"
+ fi
+ echo "$1" > .patches/status
+ fi
+ ;;
+ 0 )
+ get_meta
+ echo -n "Name ($name)? " ; read name
+ echo -n "Status ($status)? " ; read status
+ [ -n "$name" ] && { echo $name > .patches/name ; }
+ [ -n "$status" ] && { echo $status > .patches/status ; }
+ ;;
+ * )
+ echo "Usage: p $cmd [new-$cmd]"; exit 1;
+ esac
+ ;;
+ note* )
+ >> .patches/notes
+ ${EDITOR:-vi} .patches/notes
+ ;;
+ discard|commit )
+ make_diff
+ if [ -s .patches/patch ]
+ then :
+ else echo >&2 No patch to $cmd ; exit 1
+ fi
+ if grep -s '^+.*[ ]$' .patches/patch > /dev/null
+ then
+ echo >&2 remove trailing spaces/tabs first !!
+# exit 1
+ fi
+ if [ $cmd == "commit" -a -f scripts/checkpatch.pl ] ; then
+ perl scripts/checkpatch.pl .patches/patch
+ fi
+ if [ -s .patches/to-resolve ]
+ then echo "Please resolve outstanding conflicts first with 'p resolve'"
+ exit 1
+ fi
+ get_meta
+ if [ -z "$name" ] ; then
+ echo -n "Name? " ; read name
+ if [ -z "$name" ] ; then
+ echo >&2 "No current name, please set with 'p name'"
+ exit 1;
+ fi
+ echo $name > .patches/name
+ fi
+ if [ -z "$status" ] ; then
+ echo -n "Status? " ; read status
+ if [ -z "$status" ] ; then
+ echo >&2 "No current status, please set with 'p status'"
+ exit 1;
+ fi
+ echo $status > .patches/status
+ fi
+ if [ -s .patches/notes ]
+ then :
+ else
+ { echo "Title...."
+ echo
+ echo "Description..."
+ echo
+ echo "====Do Not Remove===="
+ cat .patches/patch
+ } > .patches/notes
+ ${EDITOR-vi} .patches/notes
+ mv .patches/notes .patches/tmp
+ sed '/^====Do Not Remove====/,$d' .patches/tmp > .patches/notes
+ rm .patches/tmp
+ fi
+ make_diff
+
+ if [ $cmd = commit ] ; then
+ save_patch applied "$name"
+ echo Saved as $new$name
+ all_files commit_one
+ else
+ save_patch removed "$name"
+ echo Saved as $new$name
+ all_files discard_one
+ fi
+ rm -f .patches/name .patches/status .patches/notes
+ ;;
+
+ purge )
+ make_diff
+ mv .patches/patch .patches/last-purge
+ all_files discard_one
+ rm -f .patches/name .patches/status .patches/notes
+ ;;
+ open )
+ make_diff
+ get_meta
+ if [ -s .patches/patch ]
+ then
+ echo >&2 Patch $name already open - please commit; exit 1;
+ fi
+ if [ $# -eq 0 ]
+ then
+ echo "Available patches are:"
+ ls .patches/applied
+ exit 0
+ fi
+ if [ $# -ne 1 ]
+ then echo >&2 "Usage: p open patchname" ; exit 1
+ fi
+ if [ " $1" = " last" ]
+ then
+ pfile=`ls -d .patches/applied/[0-9]* | tail -1`
+ else
+ pfile=`echo .patches/applied/*$1*`
+ fi
+ if [ ! -f "$pfile" ]
+ then echo >&2 "Cannot find unique patch '$1' - found: $pfile"; exit 1
+ fi
+ # lets see if it applies cleanly
+ if patch -s --fuzz=0 --dry-run -R -f -p0 < "$pfile"
+ then echo Ok, it seems to apply
+ else echo >&2 "Sorry, that patch doesn't apply" ; exit 1
+ fi
+ # lets go for it ...
+ patch --fuzz=0 -R -f -p0 < "$pfile"
+ all_files swap_one
+ sed -n -e '2q' -e 's/^Status: *//p' $pfile > .patches/status
+ base=${pfile##*/[0-9][0-9][0-9]}
+ [ -s .patches/name ] || echo $base > .patches/name
+ extract_notes $pfile >> .patches/notes
+ mv $pfile .patches/patch
+
+ ;;
+ included )
+ force=
+ if [ " $1" = " -f" ] ; then
+ force=yes; shift
+ fi
+ make_diff; get_meta
+ if [ -s .patches/patch ]
+ then
+ echo >&2 Patch $name already open, please commit; exit 1;
+ fi
+ if [ $# -eq 0 ]
+ then
+ echo "Unapplied patches are:"
+ ls .patches/removed
+ exit 0;
+ fi
+ if [ $# -ne 1 ]
+ then
+ echo >&2 "Usage: p included patchname"; exit 1
+ fi
+ case $1 in
+ last ) pfile=`ls -d .patches/removed/[0-9]* | tail -1` ;;
+ */* ) echo >&2 "Only local patches can have been included"; exit 1 ;;
+ *) pfile=`echo .patches/removed/*$1*`
+ esac
+ if [ ! -f "$pfile" ]
+ then echo >&2 "Cannot find unique patch '$1' - found $pfile"; exit 1
+ fi
+ echo "Using $pfile..."
+
+ # make sure patch applies in reverse
+ if patch -s --fuzz=2 -l --dry-run -f -p0 -R < "$pfile"
+ then echo "Yep, that seems to be included"
+ elif [ -n "$force" ]
+ then echo "It doesn't apply reverse-out cleanly, but you asked for it..."
+ else echo >&2 "Sorry, patch cannot be removed"; exit 1
+ fi
+ mv "$pfile" .patches/patch
+ name=${pfile##*/[0-9][0-9][0-9]}
+ save_patch included $name
+ echo "Moved to $new$name"
+ ;;
+ review )
+ # there are some patches in .removed that may be included in the current source
+ # we try to back out each one. If it backs out successfully, we move it to
+ # .reviewed and continue, else we abort
+ # Once this has been done often enough, 'reviewed' should be run to
+ # move stuff to 'included' and to revert those patches
+ force=
+ if [ " $1" = " -f" ] ; then
+ force=yes; shift
+ fi
+ make_diff; get_meta
+ if [ -s .patches/patch ]
+ then
+ echo >&2 Patch $name already open, please deal with it; exit 1;
+ fi
+ if [ -f .patches/in-review ]
+ then :
+ else
+ applied=`ls .patches/applied`
+ if [ -n "$applied" ]
+ then
+ echo >&2 Cannot review patches while any are applied.
+ exit 1;
+ fi
+ > .patches/in-review
+ fi
+ if [ $# -eq 0 ]
+ then
+ echo "Pending patches are:"
+ ls .patches/removed
+ exit 0;
+ fi
+ if [ $# -ne 1 ]
+ then
+ echo >&2 "Usage: p review patchname"; exit 1
+ fi
+ case $1 in
+ */* ) echo >&2 "Only local patches can have been included"; exit 1 ;;
+ *) pfile=`echo .patches/removed/*$1*`
+ esac
+ if [ ! -f "$pfile" ]
+ then echo >&2 "Cannot find unique patch '$1' - found $pfile"; exit 1
+ fi
+ echo "Starting from $pfile..."
+ found=
+ for fl in .patches/removed/*
+ do
+ if [ " $fl" = " $pfile" ]; then found=yes ; fi
+ if [ -n "$found" ]; then
+ echo Checking $fl
+ find_prefix "$fl"
+ lsdiff --strip=$prefix "$fl" | grep -v 'file.*changed' | while read a b
+ do check_out $a
+ done
+ if patch -s --fuzz=0 --dry-run -f -p$prefix -R < "$fl"
+ then echo Looks good..
+ elif [ -n "$force" ]
+ then echo "It doesn't backout cleanly, but you asked for it..."
+ cp $fl .patches/last-backed
+ else echo "Patch won't back out, sorry"
+ exit 1
+ fi
+ patch --fuzz=0 -f -p$prefix -R < "$fl" | tee .patches/tmp
+ sed -n -e '2q' -e 's/^Status: *//p' $fl > .patches/status
+ base=${fl##*/}
+ base=${base##[0-9][0-9][0-9]}
+ base=${base##patch-?-}
+ [ -s .patches/name ] || echo $base > .patches/name
+ extract_notes $fl >> .patches/notes
+ rm -f .patches/wiggled
+ sed -n -e 's/.*saving rejects to file \(.*\).rej/\1/p' .patches/tmp |
+ while read file
+ do echo Wiggling $file.rej into place
+ rm -f $file.porig
+ > .patches/wiggled
+ wiggle --replace --merge $file $file.rej ||
+ echo $file >> .patches/to-resolve
+ done
+
+ mv $fl .patches/patch
+ save_patch reviewed $base
+ if [ -f .patches/wiggled ]
+ then echo 'Some wiggling was needed. Please review and commit'
+ exit 0
+ fi
+ p commit || exit 1
+ fi
+ done
+ ;;
+
+ reviewed )
+ # all the currently applied patches are patches that have been
+ # reviewed as included.
+ # rip them out and stick them (reversed) into included.
+ if [ ! -f .patches/in-review ]
+ then
+ echo >&2 Not currently reviewing patches!
+ exit 1;
+ fi
+ while p open last
+ do
+ make_diff -R
+ get_meta
+ save_patch included "$name"
+ echo Saved as "$new$name"
+ all_files discard_one
+ rm -f .patches/name .patches/status .patches/notes
+ done
+ rm .patches/in-review
+ ;;
+ list )
+ echo "Applied patches are:"
+ ls .patches/applied
+
+ echo "Unapplied patches are:"
+ ls .patches/removed
+ exit 0
+ ;;
+ lista )
+ echo "Applied patches are:"
+ ls .patches/applied
+ exit 0
+ ;;
+ apply )
+ if [ -f .patches/in-review ]
+ then
+ echo >&2 Cannot apply patches while reviewing other - use p reviewed
+ exit 1
+ fi
+ force= append=
+ if [ " $1" = " -f" ]; then
+ force=yes; shift
+ fi
+ if [ " $1" = " -a" ]; then
+ append=yes; shift
+ fi
+ make_diff
+ get_meta
+ if [ -s .patches/patch -a -z "$append" ]
+ then
+ echo >&2 Patch $name already open - please commit ; exit 1;
+ fi
+ if [ $# -eq 0 ]
+ then
+ echo "Unapplied patches are:"
+ ls .patches/removed
+ exit 0
+ fi
+ if [ $# -ne 1 ]
+ then echo >&2 "Usage: p apply patchname"; exit 1
+ fi
+ case $1 in
+ last ) pfile=`ls -d .patches/removed/[0-9]* | tail -1` ; echo last is "$pfile";;
+ */* ) pfile=$1 ;;
+ * ) pfile=`echo .patches/removed/*$1*`
+ esac
+ if [ ! -f "$pfile" ]
+ then echo >&2 "Cannot find unique patch '$1' - found: $pfile"; exit 1
+ fi
+ find_prefix "$pfile"
+ lsdiff --strip=$prefix "$pfile" | grep -v 'file.*changed' | while read a b
+ do check_out $a
+ done
+ # lets see if it applies cleanly
+ if patch -s --fuzz=0 --dry-run -f -p$prefix < "$pfile"
+ then echo OK, it seems to apply
+ elif [ -n "$force" ]
+ then echo "It doesn't apply cleanly, but you asked for it...."
+ echo "Saving original at .patches/last-conflict"
+ cp $pfile .patches/last-conflict
+ else echo >&2 "Sorry, patch doesn't apply"; exit 1
+ fi
+ # lets go for it ...
+ cp $pfile .patches/last-applied
+ patch --fuzz=0 -f -p$prefix < "$pfile" | tee .patches/tmp
+ sed -n -e '2q' -e 's/^Status: *//p' $pfile > .patches/status
+ base=${pfile##*/}
+ base=${base##[0-9][0-9][0-9]}
+ base=${base##patch-?-}
+ [ -s .patches/name ] || echo $base > .patches/name
+ extract_notes $pfile >> .patches/notes
+
+ sed -n -e 's/.*saving rejects to file \(.*\).rej/\1/p' .patches/tmp |
+ while read file
+ do echo Wiggling $file.rej into place
+ rm -f $file.porig
+ wiggle --replace --merge $file $file.rej ||
+ echo $file >> .patches/to-resolve
+ done
+
+ case $pfile in
+ .patches/removed/* )
+ mv $pfile .patches/patch
+ esac
+ ;;
+
+ unapply )
+ get_meta
+ mv .patches/last-applied .patches/patch
+ save_patch removed $name
+ echo Restored to $new$name
+ make_diff
+ mv .patches/patch .patches/last-purge
+ all_files discard_one
+ rm -f .patches/name .patches/status .patches/notes
+ ;;
+ publish )
+ name=`date -u +%Y-%m-%d-%H`
+ if [ -d .patches/dest ]
+ then : good
+ else echo >&2 No destination specified at .patches/dest ; exit 1;
+ fi
+ if [ -d .patches/dest/$name ]
+ then
+ echo >&2 $name already exists ; exit 1
+ fi
+ target=.patches/dest/$name
+ mkdir $target
+ if [ -f .patches/get-version ] ;
+ then ./.patches/get-version > $target/version
+ fi
+ [ -f .config ] && cp .config $target
+ cp .patches/applied/* $target
+ mkdir $target/misc
+ cp 2> /dev/null .patches/removed/* $target/misc || rmdir $target/misc
+ chmod -R a+rX $target
+ all_files diff_one_orig > $target/patch-all-$name
+ cd $target
+ echo Published at `/bin/pwd`
+ ;;
+ clean )
+ all_files forget_one
+ > .patches/files
+ ;;
+ openall )
+ while $0 open last && $0 discard ; do : ; done
+ ;;
+ recommit )
+ make_diff
+ get_meta
+ if [ -s .patches/patch ]
+ then
+ echo >&2 Patch $name already open - please commit ; exit 1;
+ fi
+ if [ $# -eq 0 ]
+ then
+ echo "Unapplied patches are:"
+ ls .patches/removed
+ exit 0
+ fi
+ if [ $# -ne 1 ]
+ then echo >&2 "Usage: p recommit patchname"; exit 1
+ fi
+ case $1 in
+ last ) pfile=`ls -d .patches/removed/[0-9]* | tail -1` ; echo last is "$pfile";;
+ */* ) pfile=$1 ;;
+ * ) pfile=`echo .patches/removed/*$1*`
+ esac
+ if [ ! -f "$pfile" ]
+ then echo >&2 "Cannot find unique patch '$1' - found: $pfile"; exit 1
+ fi
+ while [ -s "$pfile" ] &&
+ $0 apply last && $0 commit ; do : ; done
+ ;;
+ decommit )
+ make_diff
+ get_meta
+ if [ -s .patches/patch ]
+ then
+ echo >&2 Patch $name already open - please commit ; exit 1;
+ fi
+ if [ $# -eq 0 ]
+ then
+ echo "Applied patches are:"
+ ls .patches/applied
+ exit 0
+ fi
+ if [ $# -ne 1 ]
+ then echo >&2 "Usage: p decommit patchname"; exit 1
+ fi
+ case $1 in
+ last ) pfile=`ls -d .patches/applied/[0-9]* | tail -1` ; echo last is "$pfile";;
+ */* ) pfile=$1 ;;
+ * ) pfile=`echo .patches/applied/*$1*`
+ esac
+ if [ ! -f "$pfile" ]
+ then echo >&2 "Cannot find unique patch '$1' - found: $pfile"; exit 1
+ fi
+ while [ -s "$pfile" ] &&
+ $0 open last && $0 discard ; do : ; done
+ ;;
+
+ rebase )
+ # move all applied patches to included, and
+ # copy current to orig and current
+ make_diff
+ if [ -s .patches/patch ]
+ then
+ echo >&2 Patch already open - please commit; exit 1;
+ fi
+ for p in `ls .patches/applied`
+ do
+ name=${p##[0-9][0-9][0-9]}
+ mv .patches/applied/$p .patches/patch
+ save_patch included $name
+ done
+ all_files rebase_one
+ ;;
+ snapshot )
+ all_files snap_one
+ ;;
+ snapdiff )
+ all_files snap_diff
+ ;;
+ snapback )
+ all_files snap_back
+ ;;
+ upgrade )
+ all_files upgrade_one
+ ;;
+ resolve )
+ if [ ! -s .patches/resolving ]
+ then sort -u .patches/to-resolve > .patches/resolving ; > .patches/to-resolve
+ fi
+ if [ ! -s .patches/resolving ]
+ then echo "Nothing to resolve" ; exit 0;
+ fi
+ echo "Resolving: " ; cat .patches/resolving
+ for file in `cat .patches/resolving`
+ do
+ ${EDITOR:-vi} $file
+ rm -f $file.porig
+ wiggle --replace --merge $file ||
+ echo $file >> .patches/to-resolve
+ done
+ > .patches/resolving
+ ;;
+
+ export )
+ # there must be only one patch. We
+ # git commit, p commit, p rebase
+ if [ -n "`ls .patches/applied`" ]
+ then
+ echo 'Cannot export when there are applied patches'
+ exit 1;
+ fi
+ make_diff
+ if [ -s .patches/patch ]
+ then
+ # Ok, go for it.
+ git add `cat .patches/files`
+ author=`grep '^From:' .patches/notes | head -n 1 | sed 's/From: *//'`
+ if [ -n "$author" ]
+ then git commit --author="$author" -a -F .patches/notes
+ else git commit -a -F .patches/notes
+ fi
+ $0 commit
+ $0 rebase
+ fi
+ ;;
+ pull )
+ cd .patches/SOURCE && bk pull
+ ;;
+ update )
+ make_diff
+ get_meta
+ if [ -s .patches/patch ]
+ then
+ echo >&2 Patch $name already open - please commit; exit 1;
+ fi
+ p openall && p clean &&
+ (cd .patches/SOURCE ; bk export -tpatch -rLATEST, ) > .patches/imported-patch &&
+ patch --dry-run -f -p1 < .patches/imported-patch &&
+ patch -f -p1 < .patches/imported-patch &&
+ ( rm .patches/imported-patch ; cd .patches/SOURCE ; bk tag LATEST )
+ ;;
+
+ premail )
+ # Convert some applied patches into email messages.
+ # Select patches that start with $1. Look in .patches/cc for who to Cc: to
+ rmdir .patches/mail 2>/dev/null
+ if [ -d .patches/mail ] ; then
+ echo >&2 There is already some email - run "email" or "nomail"
+ ls .patches/mail
+ exit 1;
+ fi
+ mkdir .patches/mail
+
+ get_conf author $1
+ get_conf header $1
+ if [ -n "$author" ]
+ then
+ headers="From: $author"
+ if [ -n "$header" ] ; then
+ headers="$headers$nl$header"
+ fi
+ elif [ -s .patches/owner ]; then
+ headers=`cat .patches/owner`;
+ else
+ echo Please add author information to .patches/config
+ exit 1
+ fi
+ get_conf maintainer $1
+ if [ -z "$maintainer" -a -s .patches/maintainer ]
+ then
+ maintainer=`cat .patches/maintainer`
+ fi
+
+ if [ -z "$maintainer" ] ; then
+ echo "No maintainer - please add one"
+ exit 1;
+ fi
+
+ messid="<`date +'%Y%m%d%H%M%S'`.$$.patches@`uname -n`>"
+ cnt=0
+ > .patches/.tmp.cc
+ for patch in .patches/applied/???${1}*
+ do
+ n=${patch##*/}
+ n=${n:0:3}
+ if [ -n "$2" ] && [ $2 -gt $n ] ; then continue; fi
+ if [ -n "$3" ] && [ $3 -lt $n ] ; then continue; fi
+ if [ -n "$4" ]; then
+ case ,$4, in *,$n,* ) ;; *) continue; esac
+ fi
+ cnt=$(expr $cnt + 1 )
+ sed -n -e 's/^\(Signed-[Oo]ff-[Bb]y\|Acked-[Bb]y\|Cc\|From\): */Cc: /p' $patch | grep -v neilb >> .patches/.tmp.cc
+ done
+ get_conf cc $1
+ get_conf tag $1
+ this=1
+ if [ $cnt -gt 1 ]
+ then
+ {
+ echo "$headers"
+ echo "To: $maintainer"
+
+ if [ -n "$cc" ]; then
+ echo "Cc: $cc"
+ fi
+ if [ -n "$tag" ]; then
+ sprefix="$tag: "
+ fi
+ if [ -s .patches/.tmp.cc ]
+ then sort -u .patches/.tmp.cc
+ fi
+ if [ -s .patches/cc ] ; then
+ while read word prefix addr
+ do if [ " $word" = " $1" ] ; then
+ echo "Cc: $addr"
+ sprefix="$prefix: "
+ fi
+ done < .patches/cc
+ fi
+ if [ $cnt = 1 ]
+ then
+ echo "Subject: [PATCH] ${sprefix}Intro"
+ else
+ echo "Subject: [PATCH 000 of $cnt] ${sprefix}Introduction EXPLAIN PATCH SET HERE"
+ fi
+ echo "Message-ID: $messid"
+ echo
+ echo PUT COMMENTS HERE
+ } > .patches/mail/000Intro
+ fi
+
+ for patch in .patches/applied/???${1}*
+ do
+ n=${patch##*/}
+ n=${n:0:3}
+ if [ -n "$2" ] && [ $2 -gt $n ] ; then continue; fi
+ if [ -n "$3" ] && [ $3 -lt $n ] ; then continue; fi
+ if [ -n "$4" ]; then
+ case ,$4, in *,$n,* ) ;; *) continue; esac
+ fi
+ if [ -f ./scripts/checkpatch.pl ]
+ then perl ./scripts/checkpatch.pl $patch
+ fi
+ {
+ sprefix=
+ echo "$headers"
+ echo "To: $maintainer"
+ if [ -n "$cc" ]; then
+ echo "Cc: $cc"
+ fi
+ sed -n -e 's/^\(Signed-[Oo]ff-[Bb]y\|Acked-[Bb]y\|Cc\|From\): */Cc: /p' $patch | grep -v neilb | sort -u
+ if [ -n "$tag" ]; then
+ sprefix="$tag: "
+ fi
+ if [ -s .patches/cc ] ; then
+ while read word prefix addr
+ do if [ " $word" = " $1" ] ; then
+ echo "Cc: $addr"
+ sprefix="$prefix: "
+ fi
+ done < .patches/cc
+ fi
+ head=`sed -e '/^Status/d' -e '/^$/d' -e q $patch`
+ zerothis=$(expr $this + 1000)
+ if [ $cnt = 1 ]
+ then
+ echo "Subject: [PATCH] $sprefix$head"
+ else
+ echo "Subject: [PATCH ${zerothis#1} of $cnt] $sprefix$head"
+ fi
+ echo "References: $messid"
+ echo
+ if [ $cnt = 1 ] ; then
+ echo "### Comments for Changeset"
+ fi
+ sed -e '1,3d' $patch
+ } > .patches/mail/${patch#.patches/applied/}
+ this=$(expr $this + 1)
+ done
+ if [ -f .patches/mail/000Intro ]; then cat .patches/mail/* | sed -n -e 's/^Subject://p' >> .patches/mail/000Intro ; fi
+ ls .patches/mail
+ ;;
+
+ nomail )
+ echo "Removing .patches/mail directory"
+ rm -rf .patches/mail
+ ;;
+
+ email )
+ PATH=$HOME/bin:/usr/lib:/usr/sbin:$PATH
+ for i in .patches/mail/*
+ do
+ if [ -f "$i" ]
+ then
+ echo Sending $i.
+ sendmail -t < $i && rm $i
+ fi
+ done
+ ;;
+
+ test )
+ # test all removed patches to see which ones are clearly included
+ for p in .patches/removed/*
+ do
+ if patch -R --dry-run -p0 --fuzz=0 -s -f < "$p" > /dev/null 2>&1
+ then echo $p
+ fi
+ done
+ ;;
+ help )
+ helpfile=$0.help
+ if [ ! -f $helpfile ]
+ then echo >&2 $helpfile not found: no help available ; exit 2;
+ fi
+ if [ -z "$1" ] ; then
+ echo
+ sed -n -e '/^ /p' -e '/^[^ ]/q' $helpfile
+ echo
+ echo "Available help topics are:"
+ sed -n '/^[^ ]/p' $helpfile | sort | column
+ else
+ echo
+ awk '$0 ~ /^[^ ]/ && printed {doprint=0; printed=0}
+ doprint && $0 !~ /^[^ ]/ {print; printed=1;}
+ $0 == "'$1'" {doprint=1; found=1}
+ END { if (!found) print "No help available for '$1'"; }
+ ' $helpfile
+ echo
+ fi
+ ;;
+ * )
+ echo >&2 "p $cmd - unknown command - try 'p help'"; exit 1;
+esac
+exit 0;
diff --git a/p.help b/p.help
new file mode 100644
index 0000000..4c16328
--- /dev/null
+++ b/p.help
@@ -0,0 +1,335 @@
+ p is a tool for managing patches. It contains many
+ subcommands. To use a particular subcommand, give it
+ as the first argument to p, and then give any arguments
+ that subcommand requires.
+
+files
+ p keeps all its files and patches in a subdirectory of
+ the toplevel directory of a project. This subdirectory
+ is called ".patches". It is often convenient for
+ ".patches" to actually be a symbolic link to somewhere
+ else altogether.
+
+ The files and directories contained in .patches are:
+ applied/ A directory containing applied patches
+ removed/ A directory containing removed patches
+ included/ A directory containing included patches
+ Files in these directories are prefixed by a 3-digit number
+ which indicates the order in which patches were added.
+ The remainder of the filename is the name of the patch.
+ Each file contains:
+ Status: status
+ ... notes ...
+ ... diffstat output ...
+ the actual patch
+ name A file containing the name of the current patch
+ status A file containing the status of the current patch
+ notes A file with notes about the patch
+ patch A recently generated copy of the current patch
+ files A list of files that are 'checked out'
+ to-resolve A list of files that might have conflicts that need resolving
+ tmp A temporary file
+ last-applied The most recently applied patch that had conflicts
+ last-purge
+ dest/ A directory where 'p publish' puts patch sets.
+ SOURCE/ A directory where a bk repository lives.
+ mail/ A directory of patches converted to email messages
+ cc A file listing: prefix name emailaddr
+ When mailing patches which start with prefix, name
+ is put on the subject line, and the mail is cc:ed to
+ emailaddr
+ maintainer This is where patches are mailed to
+ owner These mail headers are included in each mail message
+ get-version A script to get a base version number for use when publishing
+ to-resolve List of files that have outstanding conflicts to be resolved.
+
+
+model
+overview
+ What is 'p' ?
+
+ 'p' is a patch management system, not a source code control system.
+ It allows you to create a set of patches against a base release, to
+ annotate those patches with comments, and to revisit and edit patches
+ after they have been committed.
+
+ It also allows you to update the base release that the patches are
+ against, and then re-apply all patches.
+
+ At any time, there are a number of applied patches, a number of
+ removed patches and possibly a current patch.
+ The sets of applied and removed patches act much like stacks. The current
+ patch can be moved to the top of either (commit or discard), and the top
+ of either stack can be moved to the current patch (open or apply).
+ open and apply actually allow any patch in the corresponding stack to be
+ made current, and assume that the user won't re-order patches that
+ should not be re-ordered.
+
+ To enable 'p' for a project, you simply create a directory called ".patches"
+ in the top level directory of that project. Files should be checked out
+ ("p co filename") before editing but never need to be checked in. Applying
+ an external patch automatically checks out all modified files.
+
+ Often it is appropriate to have the .patches directory elsewhere (for
+ example in an http-export directory tree for public access) and have a
+ symlink from .patches to that location.
+
+ p can be run from any subdirectory of a project containing a .patches
+ directory.
+
+ To find out about the contents of the .patches directory, see
+ p help files
+
+ Some common commands are:
+ p co filename # monitor changes to filename
+ p make # create and view the current patch
+ p commit # commit the current patch
+ p discard # discard current patch, saving it as
+ # a removed patch
+ p apply # re-apply a removed patch, or apply
+ # an external patch
+ p list # list current patches
+
+co
+ Usage: p co filename
+
+ prepare filename for editing. This makes sure there is a
+ copy of the file with a ~current~ suffix, and that the file
+ is listed in .patches/files. This command can be run from
+ a subdirectory of the project, and it will still do the
+ right thing.
+
+make
+view
+ Usage: p make
+ p view [patchnamefragment]
+
+ make and view provide the same functionality.
+ When given a patch name fragment, they will allow the unique
+ patch with that name (either applied or removed) to be viewed
+ (using the pager $PAGER, or less).
+ Without an argument, the current patch is calculated and
+ displayed. This explains the two names: with no argument,
+ they both make and view the current patch.
+
+all
+ Usage: p all
+
+ Generate a composite patch of all currently applied patches.
+ This involves creating a patch from the ~orig~ version of every
+ file to its current version.
+
+status
+name
+
+ Usage: p status [newstatus]
+ p name [newname]
+
+ If a new status or name is given, it is recorded as the current
+ status or name for the current patch. If no argument is given,
+ the command will prompt for both a new name and a new status.
+ The current value is offered as a default in each case.
+
+note
+notes
+ Usage: p notes
+
+ Open the notes describing the current patch in an $EDITOR
+ The notes should contain a simple one-line description,
+ a blank line, and then a detailed description.
+
+discard
+ Usage: p discard
+
+ The current patch is discarded: moved to the .patches/removed
+ directory. If it doesn't have a name or status, these are
+ prompted for.
+
+commit
+ Usage: p commit
+
+ The current patch is committed: moved to the .patches/applied
+ directory. If name or status aren't set, these are prompted
+ for. If no notes have been written, an $EDITOR session is
+ started with a template for some notes.
+ The patch is presented in the file being edited for reference,
+ but will be removed from the notes on exit.
+
+open
+ Usage: p open [last | patch-name-fragment]
+
+ The open command is used to open a previously committed
+ patch for further editing.
+
+ Without any argument, a list of available committed patches
+ is given.
+ If the argument 'last' is given, then the most recently committed
+ patch is opened.
+ Otherwise a unique patch with a name containing the name fragment
+ is opened. If there is no such unique patch, an error message
+ is given.
+
+included
+ Usage: p included [-f] [last | patch-name-fragment]
+
+ After updating the base release of a project, some of the patches
+ which are currently "removed" may already have been included in that
+ release and so don't need to be maintained any more.
+
+ The "included" command will check if a given patch appears to have
+ been included and if so, moves it to the .patches/included directory.
+ The test is performed by seeing if 'patch' is able to remove the
+ patch. If it cannot, but you are sure that the patch has been included
+ (the problems patch reports are spurious) then using '-f' will cause
+ the patch to be moved to 'included' anyway.
+
+list
+ Usage: p list
+
+ List all the patches in either 'applied' or 'removed'.
+
+apply
+ Usage: p apply [-f] [-a] [last | patch-name-fragment | filename]
+
+ This command is used for applying a patch to the project.
+ If a patch in 'removed' is given, then it is moved out of 'removed'
+ and is applied. If a filename is given, the patch in that file is
+ applied but the file is left unchanged.
+
+ When applying a patch, all affected files are checked-out first.
+
+ If 'patch' cannot apply the patch without error, 'apply' will fail.
+ Giving the '-f' option will cause 'apply' to apply the patch anyway,
+ and then run 'wiggle' to merge any rejected patch chunks as best
+ as possible. Any files for which wiggle finds unresolvaable conflicts
+ while have its name saved in a file (.patches/to-resolve). This
+ list is used by the 'p resolve' command.
+
+ Normally, 'apply' will not apply a patch if there is
+ one already open. However the '-a' option may be given to ask
+ 'apply' to "append" the patch to the current patch.
+
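+ For example (the patch name fragment is purely illustrative):
+
+      p apply -f md-fix
+
+ applies the unique removed patch whose name contains 'md-fix',
+ checking out each affected file first; any hunks that 'patch'
+ rejects are then wiggled into place, and files that still contain
+ conflicts are listed in .patches/to-resolve for 'p resolve'.
+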
+resolve
+ Usage: p resolve
+
+ This is used to resolve any conflicts found by wiggle. Each file
+ listed in .patches/to-resolve is presented for editing, and then
+ has wiggle run over it again to check that all conflicts have
+ been resolved.
+
+publish
+ Usage: p publish
+
+ The 'publish' command will create a new subdirectory of
+ .patches/dest
+ (which is often a symlink to a web-page area) and copy
+ all current applied and removed patches into that directory.
+ It also creates a complete patch (with "p all") and stores
+ that in the directory.
+
+clean
+ Usage: p clean
+
+ clean checks that no patches are currently applied, and
+ cleans up any ~current~ or ~orig~ files that have been left
+ in the source tree. It also removes write permission from
+ all checked-out files.
+
+ It effectively undoes all check-outs.
+
+ It is run as part of 'update' which incorporates upstream
+ changes into a source tree.
+
+openall
+ Usage: p openall
+
+ This command repeatedly runs "p open last && p discard" until
+ that fails, which usually means that all patches have been
+ discarded. This is part of the preparation for incorporating
+ upstream changes.
+
+recommit
+ Usage: p recommit patchname
+
+ This command will re-apply and re-commit removed patches
+ that successfully apply until the named patch has been applied.
+ Patches are applied in reverse order, which is consistent with
+ the order in which they were removed.
+
+snapshot
+ Usage: p snapshot
+
+ This command takes a snapshot of the current patch so that further
+ work can be done in the patch, but it can easily be removed if
+ there are problems.
+
+ This might be used before appending a patch in case something goes
+ wrong in the appending process.
+
+snapdiff
+ Usage: p snapdiff
+
+ Display the differences between the latest snapshot and the current
+ source.
+
+snapback
+ Usage: p snapback
+
+ Revert all changes since the last snapshot.
+
+pull
+ Usage: p pull
+
+ Update the local copy of the official source repository. This
+ can be found by following the .patches/SOURCE link.
+
+ Currently the code assumes it is a BitKeeper repository and
+ runs "bk pull". It should be enhanced to recognise CVS and
+ run "cvs update".
+
+update
+ Usage: p update
+
+ This command updates the base release of the package. To
+ do this it removes all patches (p openall), cleans up (p clean),
+ creates a patch from information in .patches/SOURCE, and applies
+ that patch. It currently makes no attempt to re-apply any
+ patches, or to "p included" and patches.
+
+ Currently the code assumes a BitKeeper repository and uses
+ "bk export -tpatch -rLASTEST," to extract a patch, and then
+ retags the repository with "bk tag LATEST". It should be
+ enhanced to recognise and work with CVS as well.
+
+premail
+ Usage: p premail [patch-name-prefix]
+
+ This command converts a selection of patches to Email messages.
+ The email messages are stored in .patches/mail.
+ SAY MORE HERE
+
+nomail
+ Usage: p nomail
+
+ Remove the .patches/mail directory and contents.
+
+email
+ Usage: p email
+
+ Send all mail messages in .patches/mail. On success, each
+ email message is removed.
+
+help
+ Usage: p help [topic]
+
+ Print out help messages, which are contained in a file
+ p.help
+ in the same directory that p was run from.
+ Without a topic, a general introduction and a list of topics
+ is presented. With a topic, help on that topic is presented.
+
+purge
+ Usage: p purge
+
+ Make a copy of the current patch in .patches/last-purge (just
+ in case) and then purge the current patch completely.
diff --git a/parse.c b/parse.c
new file mode 100644
index 0000000..15b06ae
--- /dev/null
+++ b/parse.c
@@ -0,0 +1,324 @@
+/*
+ * wiggle - apply rejected patches
+ *
+ * Copyright (C) 2003-2012 Neil Brown <neilb@suse.de>
+ *
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Author: Neil Brown
+ * Email: <neilb@suse.de>
+ */
+
+/*
+ * Parse a patch file to find the names of the different
+ * files to patch and record which parts of the patch
+ * file applies to which target file.
+ */
+
+#include "wiggle.h"
+#include <unistd.h>
+#include <fcntl.h>
+
+/* determine how much we need to strip off the front of
+ * paths to find them from the current directory. This is
+ * used to guess the correct '-p' value.
+ */
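+/* For example (hypothetical path): if a patch names "a/drivers/md.c"
+ * and the tree is checked out in the current directory, the open of
+ * "a/drivers/md.c" fails, one component is stripped, "drivers/md.c"
+ * is found, and 1 is returned - the value to use with -p1/--strip=1.
+ */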
+static int get_strip(char *file)
+{
+ int fd;
+ int strip = 0;
+
+ while (file && *file) {
+ fd = open(file, O_RDONLY);
+ if (fd >= 0) {
+ close(fd);
+ return strip;
+ }
+ strip++;
+ file = strchr(file, '/');
+ if (file)
+ while (*file == '/')
+ file++;
+ }
+ return -1;
+
+}
+
+int set_prefix(struct plist *pl, int n, int strip)
+{
+ int i;
+ for (i = 0; i < 4 && i < n && strip < 0; i++)
+ strip = get_strip(pl[i].file);
+
+ if (strip < 0) {
+ fprintf(stderr, "%s: Cannot find files to patch: please specify --strip\n",
+ Cmd);
+ return 0;
+ }
+ for (i = 0; i < n; i++) {
+ char *p = pl[i].file;
+ int j;
+ for (j = 0; j < strip; j++) {
+ if (p)
+ p = strchr(p, '/');
+ while (p && *p == '/')
+ p++;
+ }
+ if (p == NULL) {
+ fprintf(stderr, "%s: cannot strip %d segments from %s\n",
+ Cmd, strip, pl[i].file);
+ return 0;
+ }
+ pl[i].file = p;
+ }
+ return 1;
+}
+
+static int pl_cmp(const void *av, const void *bv)
+{
+ const struct plist *a = av;
+ const struct plist *b = bv;
+ return strcmp(a->file, b->file);
+}
+
+static int common_depth(char *a, char *b)
+{
+ /* find number of path segments that these two have
+ * in common
+ */
+ int depth = 0;
+ while (1) {
+ char *c;
+ int al, bl;
+ c = strchr(a, '/');
+ if (c)
+ al = c-a;
+ else
+ al = strlen(a);
+ c = strchr(b, '/');
+ if (c)
+ bl = c-b;
+ else
+ bl = strlen(b);
+ if (al == 0 || al != bl || strncmp(a, b, al) != 0)
+ return depth;
+ a += al;
+ while (*a == '/')
+ a++;
+ b += bl;
+ while (*b == '/')
+ b++;
+
+ depth++;
+ }
+}
+
+static struct plist *patch_add_file(struct plist *pl, int *np, char *file,
+ unsigned int start, unsigned int end)
+{
+ /* size of pl grows as 0, 16, then powers of 2 */
+ int n = *np;
+ int asize;
+
+ while (*file == '/')
+ /* leading '/' are bad... */
+ file++;
+
+ if (n == 0)
+ asize = 0;
+ else if (n <= 16)
+ asize = 16;
+ else if ((n&(n-1)) == 0)
+ asize = n;
+ else
+ asize = n+1; /* not accurate, but not too large */
+ if (asize <= n) {
+ /* need to extend array */
+ struct plist *npl;
+ if (asize < 16)
+ asize = 16;
+ else
+ asize += asize;
+ npl = realloc(pl, asize * sizeof(struct plist));
+ if (!npl) {
+ fprintf(stderr, "realloc failed - skipping %s\n", file);
+ return pl;
+ }
+ pl = npl;
+ }
+ pl[n].file = file;
+ pl[n].start = start;
+ pl[n].end = end;
+ pl[n].last = pl[n].next = pl[n].prev = pl[n].parent = -1;
+ pl[n].chunks = pl[n].wiggles = 0; pl[n].conflicts = 100;
+ pl[n].open = 1;
+ pl[n].calced = 0;
+ pl[n].is_merge = 0;
+ *np = n+1;
+ return pl;
+}
+
+static struct plist *add_dir(struct plist *pl, int *np, char *file, char *curr)
+{
+ /* any parent of file that is not a parent of curr
+ * needs to be added to pl
+ */
+ int d = common_depth(file, curr);
+ char *buf = curr;
+ while (d) {
+ char *c = strchr(file, '/');
+ int l;
+ if (c)
+ l = c-file;
+ else
+ l = strlen(file);
+ file += l;
+ curr += l;
+ while (*file == '/')
+ file++;
+ while (*curr == '/')
+ curr++;
+ d--;
+ }
+ while (*file) {
+ if (curr > buf && curr[-1] != '/')
+ *curr++ = '/';
+ while (*file && *file != '/')
+ *curr++ = *file++;
+ while (*file == '/')
+ file++;
+ *curr = '\0';
+ if (*file)
+ pl = patch_add_file(pl, np, strdup(buf),
+ 0, 0);
+ }
+ return pl;
+}
+
+struct plist *sort_patches(struct plist *pl, int *np)
+{
+ /* sort the patches, add directory names, and re-sort */
+ char curr[1024];
+ char *prev;
+ int parents[100];
+ int prevnode[100];
+ int i, n;
+ qsort(pl, *np, sizeof(struct plist), pl_cmp);
+ curr[0] = 0;
+ n = *np;
+ for (i = 0; i < n; i++)
+ pl = add_dir(pl, np, pl[i].file, curr);
+
+ qsort(pl, *np, sizeof(struct plist), pl_cmp);
+
+ /* array is now stable, so set up parent pointers */
+ n = *np;
+ curr[0] = 0;
+ prevnode[0] = -1;
+ prev = "";
+ for (i = 0; i < n; i++) {
+ int d = common_depth(prev, pl[i].file);
+ if (d == 0)
+ pl[i].parent = -1;
+ else {
+ pl[i].parent = parents[d-1];
+ pl[pl[i].parent].last = i;
+ }
+ pl[i].prev = prevnode[d];
+ if (pl[i].prev > -1)
+ pl[pl[i].prev].next = i;
+ prev = pl[i].file;
+ parents[d] = i;
+ prevnode[d] = i;
+ prevnode[d+1] = -1;
+ }
+ return pl;
+}
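+
+/* For example, given patch entries for "a/b/x.c" and "a/y.c", sort_patches()
+ * also creates plist entries for the directories "a" and "a/b"; after the
+ * second sort the parent/prev/next indices describe the tree, with "a" the
+ * parent of "a/b" and "a/y.c", and "a/b" the parent of "a/b/x.c".
+ */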
+
+struct plist *parse_patch(FILE *f, FILE *of, int *np)
+{
+ /* read a multi-file patch from 'f' and record relevant
+ * details in a plist.
+ * if 'of' is not NULL, 'f' might not be seekable, so we copy
+ * everything we read to 'of' and use ftell on 'of' to determine position
+ */
+ struct plist *plist = NULL;
+
+ *np = 0;
+ while (!feof(f)) {
+ /* first, find the start of a patch: "\n+++ "
+ * grab the file name and scan to the end of a line
+ */
+ char *target = "\n+++ ";
+ char *target2 = "\n--- ";
+ char *pos = target;
+ int c;
+ char name[1024];
+ unsigned start, end;
+
+ while (*pos && (c = fgetc(f)) != EOF) {
+ if (of)
+ fputc(c, of);
+ if (c == *pos)
+ pos++;
+ else
+ pos = target;
+ }
+ if (c == EOF)
+ break;
+ assert(c == ' ');
+ /* now read a file name */
+ pos = name;
+ while ((c = fgetc(f)) != EOF
+ && c != '\t' && c != '\n' && c != ' ' &&
+ pos - name < 1023) {
+ *pos++ = c;
+ if (of)
+ fputc(c, of);
+ }
+ *pos = 0;
+ if (c == EOF)
+ break;
+ if (of)
+ fputc(c, of);
+ while (c != '\n' && (c = fgetc(f)) != EOF)
+ if (of)
+ fputc(c, of);
+
+ start = ftell(of ?: f);
+
+ if (c == EOF)
+ break;
+
+ /* now skip to end - "\n--- " */
+ pos = target2+1;
+
+ while (*pos && (c = fgetc(f)) != EOF) {
+ if (of)
+ fputc(c, of);
+ if (c == *pos)
+ pos++;
+ else
+ pos = target2;
+ }
+ end = ftell(of ?: f);
+ if (pos > target2)
+ end -= (pos - target2) - 1;
+ plist = patch_add_file(plist, np,
+ strdup(name), start, end);
+ }
+ return plist;
+}
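+
+/* For example, on a diff whose header lines read "--- a/load.c" and
+ * "+++ b/load.c", parse_patch() records the name "b/load.c" together with
+ * the offsets of the text between that header and the next "--- " line;
+ * set_prefix() can then strip the leading "b/" so the target resolves to
+ * ./load.c.
+ */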
diff --git a/patch_depends.c b/patch_depends.c
new file mode 100644
index 0000000..9b3600c
--- /dev/null
+++ b/patch_depends.c
@@ -0,0 +1,92 @@
+
+/*
+ * Given a list of files containing patches, we determine any dependency
+ * relationship between them.
+ * If a chunk in one file overlaps a chunk in a previous file then the one
+ * depends on the other.
+ *
+ * Each patch contains a list of chunks that apply to a file. Each
+ * chunk has an original start/end and a new start/end.
+ *
+ * Each target file links to a list of chunks, each of which points to its
+ * patch file.  The chunks are sorted by new start.
+ *
+ * When we add a chunk which changes size, we update the new start/end of all
+ * previous chunks in that file which end after this one starts.
+ *
+ */
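+
+/* For example: if patch A turns lines 10-20 of a file into new lines 10-25,
+ * and patch B, applied on top of A, touches new lines 22-30 of the same
+ * file, then B's chunk overlaps A's chunk in the new numbering, so B is
+ * recorded as depending on A.
+ */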
+
+struct chunk {
+ struct patch *patch; /* the patch this chunk is from */
+ struct file *file; /* the file this chunk patches */
+ int old_start, old_end;
+ int new_start, new_end;
+ struct chunk *next; /* next chunk for this file */
+};
+
+struct file {
+ char * name; /* name of the file */
+ struct chunk *chunks; /* chunks which patch this file */
+};
+
+struct patch {
+ char * name; /* name of file containing this patch */
+ int cnt; /* number of patches we depend on (so far) */
+ struct patch **depends; /* array of patches we depend on */
+ struct patch *next; /* previous patch that was loaded */
+} *patches = NULL;
+
+void report(void)
+{
+ struct patch *p;
+ int c;
+
+ for (p= patches; p ; p=p->next) {
+ printf("%s :", p->name);
+ for (c=0 ; c < p->cnt ; c++)
+ printf(" %s", p->depends[c]);
+ printf("\n");
+ }
+}
+
+int check_depends(struct patch *new, struct patch *old)
+{
+ /* see if new already depends on old */
+ int i;
+ if (new == old) return 1;
+ for (i=0; i<new->cnt ; i++)
+ if (check_depends(new->depends[i], old))
+ return 1;
+ return 0;
+}
+
+void add_depends(struct patch *new, struct patch *old)
+{
+ /* patch new depends on patch old, but this hasn't
+ * been recorded yet
+ */
+ int size = InitDepends;
+ while (size < new->cnt) size<<= 1;
+
+ new->cnt++;
+ if (new->cnt > size)
+ new->depends = realloc(new->depends, size*sizeof(struct patch *));
+ new->depends[new->cnt-1] = old;
+}
+
+void add_chunk(struct patch *p, struct file *f, int os, int oe, int ns, int ne)
+{
+ struct chunk *c = xmalloc(sizeof(struct chunk));
+ c->patch = p;
+ c->file = f;
+ c->old_start = os;
+ c->old_end = oe;
+ c->new_start = ns;
+ c->new_end = ne;
+
+ for (c1 = f->chunks ; c1 ; c1=c1->next) {
+ if (ns < c1->new_end && ne > c1->new_start) {
+ /* goody, found a dependency */
+ if (!check_depends(c->patch, c1->patch))
+ add_depends(c->patch, c1->patch);
+ }
diff --git a/split.c b/split.c
new file mode 100644
index 0000000..9455768
--- /dev/null
+++ b/split.c
@@ -0,0 +1,118 @@
+/*
+ * wiggle - apply rejected patches
+ *
+ * Copyright (C) 2003 Neil Brown <neilb@cse.unsw.edu.au>
+ *
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Author: Neil Brown
+ * Email: <neilb@suse.de>
+ */
+
+/*
+ * Split a stream into words or lines
+ *
+ * A word is one of:
+ * string of [A-Za-z0-9_]
+ * or string of [ \t]
+ * or single char (i.e. punctuation and newlines).
+ *
+ * A line is any string that ends with \n
+ *
+ * As a special case to allow proper aligning of multiple chunks
+ * in a patch, a word starting with '\0' will include 20 chars, with a newline
+ * second from the end.
+ *
+ * We make two passes through the stream.
+ * First we count the number of items so that an array can be allocated,
+ * then we store the start and length of each item in the array.
+ *
+ */
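+
+/* For example, splitting the text "if (x1)\n" ByWord yields the elements
+ * "if", " ", "(", "x1", ")" and "\n", while ByLine yields the single
+ * element "if (x1)\n".
+ */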
+
+#include "wiggle.h"
+#include <stdlib.h>
+#include <ctype.h>
+
+#include "ccan/hash/hash.h"
+
+static int split_internal(char *start, char *end, int type,
+ struct elmnt *list)
+{
+ int cnt = 0;
+
+ while (start < end) {
+ char *cp = start;
+
+ if (*cp == '\0' && cp+19 < end && cp[18] == '\n') {
+ /* special word */
+ cp += 20;
+ } else
+ switch (type) {
+ case ByLine:
+ while (cp < end && *cp != '\n')
+ cp++;
+ if (cp < end)
+ cp++;
+ break;
+ case ByWord:
+ if (isalnum(*cp) || *cp == '_') {
+ do
+ cp++;
+ while (cp < end
+ && (isalnum(*cp)
+ || *cp == '_'));
+ } else if (*cp == ' ' || *cp == '\t') {
+ do
+ cp++;
+ while (cp < end
+ && (*cp == ' '
+ || *cp == '\t'));
+ } else
+ cp++;
+ break;
+ }
+ if (list) {
+ list->start = start;
+ list->len = cp-start;
+ if (*start)
+ list->hash = hash(start, list->len, 0);
+ else
+ list->hash = atoi(start+1);
+ list++;
+ }
+ cnt++;
+ start = cp;
+ }
+ return cnt;
+}
+
+struct file split_stream(struct stream s, int type)
+{
+ int cnt;
+ struct file f;
+
+ char *c, *end;
+
+ end = s.body+s.len;
+ c = s.body;
+
+ cnt = split_internal(c, end, type, NULL);
+ f.list = xmalloc(cnt*sizeof(struct elmnt));
+
+ f.elcnt = split_internal(c, end, type, f.list);
+ return f;
+}
diff --git a/tests/contrib/mod_tbill/merge b/tests/contrib/mod_tbill/merge
new file mode 100644
index 0000000..c70263e
--- /dev/null
+++ b/tests/contrib/mod_tbill/merge
@@ -0,0 +1,36 @@
+#ifndef MOD_TBILL_H
+#define MOD_TBILL_H
+
+class DB;
+class ServerRequest;
+
+#include <time.h>
+#include <cbtcommon++/String.h>
+#include "functionHash.h"
+
+typedef struct _tbill_state
+{
+ /* settable via conf file */
+ char *logon;
+ char *template_dir;
+ char *graph_dir;
+ char *pdf_dir;
+ char *http_proxy_server_addr; //for reverse lookups
+ int http_proxy_server_port; //for reverse lookups
+ int auth_d_reload_interval;
+ char *debuglvl_path;
+
+ /* internal state */
+ DB *db;
+ FunctionHash * fh;
+ int available;
+ String *linkPathPrefix;
+ int auth_d_pipe_fd;
+ int auth_d_fd;
+ int production_mode;
+} tbill_state;
+
+void generatePage(tbill_state *, ServerRequest *);
+
+#endif /* MOD_TBILL_H */
+
diff --git a/tests/contrib/mod_tbill/orig b/tests/contrib/mod_tbill/orig
new file mode 100644
index 0000000..8a5f0eb
--- /dev/null
+++ b/tests/contrib/mod_tbill/orig
@@ -0,0 +1,35 @@
+#ifndef MOD_TBILL_H
+#define MOD_TBILL_H
+
+class DB;
+class ServerRequest;
+
+#include <time.h>
+#include <cbtcommon++/String.h>
+#include "functionHash.h"
+
+typedef struct _tbill_state
+{
+ /* settable via conf file */
+ char *logon;
+ char *template_dir;
+ char *graph_dir;
+ char *pdf_dir;
+ char *http_proxy_server_addr; //for reverse lookups
+ int http_proxy_server_port; //for reverse lookups
+ int auth_d_reload_interval;
+ char *debuglvl_path;
+
+ /* internal state */
+ DB *db;
+ FunctionHash * fh;
+ int available;
+ String *linkPathPrefix;
+ int auth_d_pipe_fd;
+ int auth_d_fd;
+} tbill_state;
+
+void generatePage(tbill_state *, ServerRequest *);
+
+#endif /* MOD_TBILL_H */
+
diff --git a/tests/contrib/mod_tbill/patch b/tests/contrib/mod_tbill/patch
new file mode 100644
index 0000000..ec53405
--- /dev/null
+++ b/tests/contrib/mod_tbill/patch
@@ -0,0 +1,16 @@
+***************
+*** 22,27 ****
+ int auth_d_reload_interval;
+ int auth_d_fd;
+ char *debuglvl_path;
+ } tbill_state;
+
+ void generatePage(tbill_state *, ServerRequest *);
+--- 22,28 ----
+ int auth_d_reload_interval;
+ int auth_d_fd;
+ char *debuglvl_path;
++ int production_mode;
+ } tbill_state;
+
+ void generatePage(tbill_state *, ServerRequest *);
diff --git a/tests/contrib/nmi.c/merge b/tests/contrib/nmi.c/merge
new file mode 100644
index 0000000..928c470
--- /dev/null
+++ b/tests/contrib/nmi.c/merge
@@ -0,0 +1,471 @@
+/*
+ * linux/arch/i386/nmi.c
+ *
+ * NMI watchdog support on APIC systems
+ *
+ * Started by Ingo Molnar <mingo@redhat.com>
+ *
+ * Fixes:
+ * Mikael Pettersson : AMD K7 support for local APIC NMI watchdog.
+ * Mikael Pettersson : Power Management for local APIC NMI watchdog.
+ * Mikael Pettersson : Pentium 4 support for local APIC NMI watchdog.
+ * Pavel Machek and
+ * Mikael Pettersson : PM converted to driver model. Disable/enable API.
+ */
+
+#include <linux/config.h>
+#include <linux/mm.h>
+#include <linux/irq.h>
+#include <linux/delay.h>
+#include <linux/bootmem.h>
+#include <linux/smp_lock.h>
+#include <linux/interrupt.h>
+#include <linux/mc146818rtc.h>
+#include <linux/kernel_stat.h>
+#include <linux/module.h>
+#include <linux/nmi.h>
+#include <linux/sysdev.h>
+#include <linux/dump.h>
+
+#include <asm/smp.h>
+#include <asm/mtrr.h>
+#include <asm/mpspec.h>
+#include <asm/nmi.h>
+
+unsigned int nmi_watchdog = NMI_NONE;
+static unsigned int nmi_hz = HZ;
+unsigned int nmi_perfctr_msr; /* the MSR to reset in NMI handler */
+extern void show_registers(struct pt_regs *regs);
+
+/* nmi_active:
+ * +1: the lapic NMI watchdog is active, but can be disabled
+ * 0: the lapic NMI watchdog has not been set up, and cannot
+ * be enabled
+ * -1: the lapic NMI watchdog is disabled, but can be enabled
+ */
+static int nmi_active;
+
+#define K7_EVNTSEL_ENABLE (1 << 22)
+#define K7_EVNTSEL_INT (1 << 20)
+#define K7_EVNTSEL_OS (1 << 17)
+#define K7_EVNTSEL_USR (1 << 16)
+#define K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING 0x76
+#define K7_NMI_EVENT K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING
+
+#define P6_EVNTSEL0_ENABLE (1 << 22)
+#define P6_EVNTSEL_INT (1 << 20)
+#define P6_EVNTSEL_OS (1 << 17)
+#define P6_EVNTSEL_USR (1 << 16)
+#define P6_EVENT_CPU_CLOCKS_NOT_HALTED 0x79
+#define P6_NMI_EVENT P6_EVENT_CPU_CLOCKS_NOT_HALTED
+
+#define MSR_P4_MISC_ENABLE 0x1A0
+#define MSR_P4_MISC_ENABLE_PERF_AVAIL (1<<7)
+#define MSR_P4_MISC_ENABLE_PEBS_UNAVAIL (1<<12)
+#define MSR_P4_PERFCTR0 0x300
+#define MSR_P4_CCCR0 0x360
+#define P4_ESCR_EVENT_SELECT(N) ((N)<<25)
+#define P4_ESCR_OS (1<<3)
+#define P4_ESCR_USR (1<<2)
+#define P4_CCCR_OVF_PMI (1<<26)
+#define P4_CCCR_THRESHOLD(N) ((N)<<20)
+#define P4_CCCR_COMPLEMENT (1<<19)
+#define P4_CCCR_COMPARE (1<<18)
+#define P4_CCCR_REQUIRED (3<<16)
+#define P4_CCCR_ESCR_SELECT(N) ((N)<<13)
+#define P4_CCCR_ENABLE (1<<12)
+/* Set up IQ_COUNTER0 to behave like a clock, by having IQ_CCCR0 filter
+ CRU_ESCR0 (with any non-null event selector) through a complemented
+ max threshold. [IA32-Vol3, Section 14.9.9] */
+#define MSR_P4_IQ_COUNTER0 0x30C
+#define P4_NMI_CRU_ESCR0 (P4_ESCR_EVENT_SELECT(0x3F)|P4_ESCR_OS|P4_ESCR_USR)
+#define P4_NMI_IQ_CCCR0 \
+ (P4_CCCR_OVF_PMI|P4_CCCR_THRESHOLD(15)|P4_CCCR_COMPLEMENT| \
+ P4_CCCR_COMPARE|P4_CCCR_REQUIRED|P4_CCCR_ESCR_SELECT(4)|P4_CCCR_ENABLE)
+
+int __init check_nmi_watchdog (void)
+{
+ unsigned int prev_nmi_count[NR_CPUS];
+ int cpu;
+
+ printk(KERN_INFO "testing NMI watchdog ... ");
+
+ for (cpu = 0; cpu < NR_CPUS; cpu++)
+ prev_nmi_count[cpu] = irq_stat[cpu].__nmi_count;
+ local_irq_enable();
+ mdelay((10*1000)/nmi_hz); // wait 10 ticks
+
+ /* FIXME: Only boot CPU is online at this stage. Check CPUs
+ as they come up. */
+ for (cpu = 0; cpu < NR_CPUS; cpu++) {
+ if (!cpu_online(cpu))
+ continue;
+ if (nmi_count(cpu) - prev_nmi_count[cpu] <= 5) {
+ printk("CPU#%d: NMI appears to be stuck!\n", cpu);
+ nmi_active = 0;
+ return -1;
+ }
+ }
+ printk("OK.\n");
+
+ /* now that we know it works we can reduce NMI frequency to
+ something more reasonable; makes a difference in some configs */
+ if (nmi_watchdog == NMI_LOCAL_APIC)
+ nmi_hz = 1;
+
+ return 0;
+}
+
+static int __init setup_nmi_watchdog(char *str)
+{
+ int nmi;
+
+ get_option(&str, &nmi);
+
+ if (nmi >= NMI_INVALID)
+ return 0;
+ if (nmi == NMI_NONE)
+ nmi_watchdog = nmi;
+ /*
+ * If any other x86 CPU has a local APIC, then
+ * please test the NMI stuff there and send me the
+ * missing bits. Right now Intel P6/P4 and AMD K7 only.
+ */
+ if ((nmi == NMI_LOCAL_APIC) &&
+ (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) &&
+ (boot_cpu_data.x86 == 6 || boot_cpu_data.x86 == 15))
+ nmi_watchdog = nmi;
+ if ((nmi == NMI_LOCAL_APIC) &&
+ (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) &&
+ (boot_cpu_data.x86 == 6 || boot_cpu_data.x86 == 15))
+ nmi_watchdog = nmi;
+ /*
+ * We can enable the IO-APIC watchdog
+ * unconditionally.
+ */
+ if (nmi == NMI_IO_APIC) {
+ nmi_active = 1;
+ nmi_watchdog = nmi;
+ }
+ return 1;
+}
+
+__setup("nmi_watchdog=", setup_nmi_watchdog);
+
+void disable_lapic_nmi_watchdog(void)
+{
+ if (nmi_active <= 0)
+ return;
+ switch (boot_cpu_data.x86_vendor) {
+ case X86_VENDOR_AMD:
+ wrmsr(MSR_K7_EVNTSEL0, 0, 0);
+ break;
+ case X86_VENDOR_INTEL:
+ switch (boot_cpu_data.x86) {
+ case 6:
+ if (boot_cpu_data.x86_model > 0xd)
+ break;
+
+ wrmsr(MSR_P6_EVNTSEL0, 0, 0);
+ break;
+ case 15:
+ if (boot_cpu_data.x86_model > 0x3)
+ break;
+
+ wrmsr(MSR_P4_IQ_CCCR0, 0, 0);
+ wrmsr(MSR_P4_CRU_ESCR0, 0, 0);
+ break;
+ }
+ break;
+ }
+ nmi_active = -1;
+ /* tell do_nmi() and others that we're not active any more */
+ nmi_watchdog = 0;
+}
+
+void enable_lapic_nmi_watchdog(void)
+{
+ if (nmi_active < 0) {
+ nmi_watchdog = NMI_LOCAL_APIC;
+ setup_apic_nmi_watchdog();
+ }
+}
+
+void disable_timer_nmi_watchdog(void)
+{
+ if ((nmi_watchdog != NMI_IO_APIC) || (nmi_active <= 0))
+ return;
+
+ unset_nmi_callback();
+ nmi_active = -1;
+ nmi_watchdog = NMI_NONE;
+}
+
+void enable_timer_nmi_watchdog(void)
+{
+ if (nmi_active < 0) {
+ nmi_watchdog = NMI_IO_APIC;
+ touch_nmi_watchdog();
+ nmi_active = 1;
+ }
+}
+
+#ifdef CONFIG_PM
+
+static int nmi_pm_active; /* nmi_active before suspend */
+
+static int lapic_nmi_suspend(struct sys_device *dev, u32 state)
+{
+ nmi_pm_active = nmi_active;
+ disable_lapic_nmi_watchdog();
+ return 0;
+}
+
+static int lapic_nmi_resume(struct sys_device *dev)
+{
+ if (nmi_pm_active > 0)
+ enable_lapic_nmi_watchdog();
+ return 0;
+}
+
+
+static struct sysdev_class nmi_sysclass = {
+ set_kset_name("lapic_nmi"),
+ .resume = lapic_nmi_resume,
+ .suspend = lapic_nmi_suspend,
+};
+
+static struct sys_device device_lapic_nmi = {
+ .id = 0,
+ .cls = &nmi_sysclass,
+};
+
+static int __init init_lapic_nmi_sysfs(void)
+{
+ int error;
+
+ if (nmi_active == 0)
+ return 0;
+
+ error = sysdev_class_register(&nmi_sysclass);
+ if (!error)
+ error = sys_device_register(&device_lapic_nmi);
+ return error;
+}
+/* must come after the local APIC's device_initcall() */
+late_initcall(init_lapic_nmi_sysfs);
+
+#endif /* CONFIG_PM */
+
+/*
+ * Activate the NMI watchdog via the local APIC.
+ * Original code written by Keith Owens.
+ */
+
+static void clear_msr_range(unsigned int base, unsigned int n)
+{
+ unsigned int i;
+
+ for(i = 0; i < n; ++i)
+ wrmsr(base+i, 0, 0);
+}
+
+static void setup_k7_watchdog(void)
+{
+ unsigned int evntsel;
+
+ nmi_perfctr_msr = MSR_K7_PERFCTR0;
+
+ clear_msr_range(MSR_K7_EVNTSEL0, 4);
+ clear_msr_range(MSR_K7_PERFCTR0, 4);
+
+ evntsel = K7_EVNTSEL_INT
+ | K7_EVNTSEL_OS
+ | K7_EVNTSEL_USR
+ | K7_NMI_EVENT;
+
+ wrmsr(MSR_K7_EVNTSEL0, evntsel, 0);
+ Dprintk("setting K7_PERFCTR0 to %08lx\n", -(cpu_khz/nmi_hz*1000));
+ wrmsr(MSR_K7_PERFCTR0, -(cpu_khz/nmi_hz*1000), -1);
+ apic_write(APIC_LVTPC, APIC_DM_NMI);
+ evntsel |= K7_EVNTSEL_ENABLE;
+ wrmsr(MSR_K7_EVNTSEL0, evntsel, 0);
+}
+
+static void setup_p6_watchdog(void)
+{
+ unsigned int evntsel;
+
+ nmi_perfctr_msr = MSR_P6_PERFCTR0;
+
+ clear_msr_range(MSR_P6_EVNTSEL0, 2);
+ clear_msr_range(MSR_P6_PERFCTR0, 2);
+
+ evntsel = P6_EVNTSEL_INT
+ | P6_EVNTSEL_OS
+ | P6_EVNTSEL_USR
+ | P6_NMI_EVENT;
+
+ wrmsr(MSR_P6_EVNTSEL0, evntsel, 0);
+ Dprintk("setting P6_PERFCTR0 to %08lx\n", -(cpu_khz/nmi_hz*1000));
+ wrmsr(MSR_P6_PERFCTR0, -(cpu_khz/nmi_hz*1000), 0);
+ apic_write(APIC_LVTPC, APIC_DM_NMI);
+ evntsel |= P6_EVNTSEL0_ENABLE;
+ wrmsr(MSR_P6_EVNTSEL0, evntsel, 0);
+}
+
+static int setup_p4_watchdog(void)
+{
+ unsigned int misc_enable, dummy;
+
+ rdmsr(MSR_P4_MISC_ENABLE, misc_enable, dummy);
+ if (!(misc_enable & MSR_P4_MISC_ENABLE_PERF_AVAIL))
+ return 0;
+
+ nmi_perfctr_msr = MSR_P4_IQ_COUNTER0;
+
+ if (!(misc_enable & MSR_P4_MISC_ENABLE_PEBS_UNAVAIL))
+ clear_msr_range(0x3F1, 2);
+ /* MSR 0x3F0 seems to have a default value of 0xFC00, but current
+ docs doesn't fully define it, so leave it alone for now. */
+ clear_msr_range(0x3A0, 31);
+ clear_msr_range(0x3C0, 6);
+ clear_msr_range(0x3C8, 6);
+ clear_msr_range(0x3E0, 2);
+ clear_msr_range(MSR_P4_CCCR0, 18);
+ clear_msr_range(MSR_P4_PERFCTR0, 18);
+
+ wrmsr(MSR_P4_CRU_ESCR0, P4_NMI_CRU_ESCR0, 0);
+ wrmsr(MSR_P4_IQ_CCCR0, P4_NMI_IQ_CCCR0 & ~P4_CCCR_ENABLE, 0);
+ Dprintk("setting P4_IQ_COUNTER0 to 0x%08lx\n", -(cpu_khz/nmi_hz*1000));
+ wrmsr(MSR_P4_IQ_COUNTER0, -(cpu_khz/nmi_hz*1000), -1);
+ apic_write(APIC_LVTPC, APIC_DM_NMI);
+ wrmsr(MSR_P4_IQ_CCCR0, P4_NMI_IQ_CCCR0, 0);
+ return 1;
+}
+
+void setup_apic_nmi_watchdog (void)
+{
+ switch (boot_cpu_data.x86_vendor) {
+ case X86_VENDOR_AMD:
+ if (boot_cpu_data.x86 != 6 && boot_cpu_data.x86 != 15)
+ return;
+ setup_k7_watchdog();
+ break;
+ case X86_VENDOR_INTEL:
+ switch (boot_cpu_data.x86) {
+ case 6:
+ if (boot_cpu_data.x86_model > 0xd)
+ return;
+
+ setup_p6_watchdog();
+ break;
+ case 15:
+ if (boot_cpu_data.x86_model > 0x3)
+ return;
+
+ if (!setup_p4_watchdog())
+ return;
+ break;
+ default:
+ return;
+ }
+ break;
+ default:
+ return;
+ }
+ nmi_active = 1;
+}
+
+static spinlock_t nmi_print_lock = SPIN_LOCK_UNLOCKED;
+
+/*
+ * the best way to detect whether a CPU has a 'hard lockup' problem
+ * is to check it's local APIC timer IRQ counts. If they are not
+ * changing then that CPU has some problem.
+ *
+ * as these watchdog NMI IRQs are generated on every CPU, we only
+ * have to check the current processor.
+ *
+ * since NMIs don't listen to _any_ locks, we have to be extremely
+ * careful not to rely on unsafe variables. The printk might lock
+ * up though, so we have to break up any console locks first ...
+ * [when there will be more tty-related locks, break them up
+ * here too!]
+ */
+
+static unsigned int
+ last_irq_sums [NR_CPUS],
+ alert_counter [NR_CPUS];
+
+void touch_nmi_watchdog (void)
+{
+ int i;
+
+ /*
+ * Just reset the alert counters, (other CPUs might be
+ * spinning on locks we hold):
+ */
+ for (i = 0; i < NR_CPUS; i++)
+ alert_counter[i] = 0;
+}
+
+void nmi_watchdog_tick (struct pt_regs * regs)
+{
+
+ /*
+ * Since current_thread_info()-> is always on the stack, and we
+ * always switch the stack NMI-atomically, it's safe to use
+ * smp_processor_id().
+ */
+ int sum, cpu = smp_processor_id();
+
+ sum = irq_stat[cpu].apic_timer_irqs;
+
+ if (last_irq_sums[cpu] == sum) {
+ /*
+ * Ayiee, looks like this CPU is stuck ...
+ * wait a few IRQs (5 seconds) before doing the oops ...
+ */
+ alert_counter[cpu]++;
+ if (alert_counter[cpu] == 5*nmi_hz) {
+ spin_lock(&nmi_print_lock);
+ /*
+ * We are in trouble anyway, lets at least try
+ * to get a message out.
+ */
+ bust_spinlocks(1);
+ printk("NMI Watchdog detected LOCKUP on CPU%d, eip %08lx, registers:\n", cpu, regs->eip);
+ show_registers(regs);
+ dump("NMI Watchdog detected LOCKUP", regs);
+ printk("console shuts up ...\n");
+ console_silent();
+ spin_unlock(&nmi_print_lock);
+ bust_spinlocks(0);
+ do_exit(SIGSEGV);
+ }
+ } else {
+ last_irq_sums[cpu] = sum;
+ alert_counter[cpu] = 0;
+ }
+ if (nmi_perfctr_msr) {
+ if (nmi_perfctr_msr == MSR_P4_IQ_COUNTER0) {
+ /*
+ * P4 quirks:
+ * - An overflown perfctr will assert its interrupt
+ * until the OVF flag in its CCCR is cleared.
+ * - LVTPC is masked on interrupt and must be
+ * unmasked by the LVTPC handler.
+ */
+ wrmsr(MSR_P4_IQ_CCCR0, P4_NMI_IQ_CCCR0, 0);
+ apic_write(APIC_LVTPC, APIC_DM_NMI);
+ }
+ wrmsr(nmi_perfctr_msr, -(cpu_khz/nmi_hz*1000), -1);
+ }
+}
+
+EXPORT_SYMBOL(nmi_watchdog);
+EXPORT_SYMBOL(disable_lapic_nmi_watchdog);
+EXPORT_SYMBOL(enable_lapic_nmi_watchdog);
+EXPORT_SYMBOL(disable_timer_nmi_watchdog);
+EXPORT_SYMBOL(enable_timer_nmi_watchdog);
diff --git a/tests/contrib/nmi.c/orig b/tests/contrib/nmi.c/orig
new file mode 100644
index 0000000..b7ebd8f
--- /dev/null
+++ b/tests/contrib/nmi.c/orig
@@ -0,0 +1,470 @@
+/*
+ * linux/arch/i386/nmi.c
+ *
+ * NMI watchdog support on APIC systems
+ *
+ * Started by Ingo Molnar <mingo@redhat.com>
+ *
+ * Fixes:
+ * Mikael Pettersson : AMD K7 support for local APIC NMI watchdog.
+ * Mikael Pettersson : Power Management for local APIC NMI watchdog.
+ * Mikael Pettersson : Pentium 4 support for local APIC NMI watchdog.
+ * Pavel Machek and
+ * Mikael Pettersson : PM converted to driver model. Disable/enable API.
+ */
+
+#include <linux/config.h>
+#include <linux/mm.h>
+#include <linux/irq.h>
+#include <linux/delay.h>
+#include <linux/bootmem.h>
+#include <linux/smp_lock.h>
+#include <linux/interrupt.h>
+#include <linux/mc146818rtc.h>
+#include <linux/kernel_stat.h>
+#include <linux/module.h>
+#include <linux/nmi.h>
+#include <linux/sysdev.h>
+
+#include <asm/smp.h>
+#include <asm/mtrr.h>
+#include <asm/mpspec.h>
+#include <asm/nmi.h>
+
+unsigned int nmi_watchdog = NMI_NONE;
+static unsigned int nmi_hz = HZ;
+unsigned int nmi_perfctr_msr; /* the MSR to reset in NMI handler */
+extern void show_registers(struct pt_regs *regs);
+
+/* nmi_active:
+ * +1: the lapic NMI watchdog is active, but can be disabled
+ * 0: the lapic NMI watchdog has not been set up, and cannot
+ * be enabled
+ * -1: the lapic NMI watchdog is disabled, but can be enabled
+ */
+static int nmi_active;
+
+#define K7_EVNTSEL_ENABLE (1 << 22)
+#define K7_EVNTSEL_INT (1 << 20)
+#define K7_EVNTSEL_OS (1 << 17)
+#define K7_EVNTSEL_USR (1 << 16)
+#define K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING 0x76
+#define K7_NMI_EVENT K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING
+
+#define P6_EVNTSEL0_ENABLE (1 << 22)
+#define P6_EVNTSEL_INT (1 << 20)
+#define P6_EVNTSEL_OS (1 << 17)
+#define P6_EVNTSEL_USR (1 << 16)
+#define P6_EVENT_CPU_CLOCKS_NOT_HALTED 0x79
+#define P6_NMI_EVENT P6_EVENT_CPU_CLOCKS_NOT_HALTED
+
+#define MSR_P4_MISC_ENABLE 0x1A0
+#define MSR_P4_MISC_ENABLE_PERF_AVAIL (1<<7)
+#define MSR_P4_MISC_ENABLE_PEBS_UNAVAIL (1<<12)
+#define MSR_P4_PERFCTR0 0x300
+#define MSR_P4_CCCR0 0x360
+#define P4_ESCR_EVENT_SELECT(N) ((N)<<25)
+#define P4_ESCR_OS (1<<3)
+#define P4_ESCR_USR (1<<2)
+#define P4_CCCR_OVF_PMI (1<<26)
+#define P4_CCCR_THRESHOLD(N) ((N)<<20)
+#define P4_CCCR_COMPLEMENT (1<<19)
+#define P4_CCCR_COMPARE (1<<18)
+#define P4_CCCR_REQUIRED (3<<16)
+#define P4_CCCR_ESCR_SELECT(N) ((N)<<13)
+#define P4_CCCR_ENABLE (1<<12)
+/* Set up IQ_COUNTER0 to behave like a clock, by having IQ_CCCR0 filter
+ CRU_ESCR0 (with any non-null event selector) through a complemented
+ max threshold. [IA32-Vol3, Section 14.9.9] */
+#define MSR_P4_IQ_COUNTER0 0x30C
+#define P4_NMI_CRU_ESCR0 (P4_ESCR_EVENT_SELECT(0x3F)|P4_ESCR_OS|P4_ESCR_USR)
+#define P4_NMI_IQ_CCCR0 \
+ (P4_CCCR_OVF_PMI|P4_CCCR_THRESHOLD(15)|P4_CCCR_COMPLEMENT| \
+ P4_CCCR_COMPARE|P4_CCCR_REQUIRED|P4_CCCR_ESCR_SELECT(4)|P4_CCCR_ENABLE)
+
+int __init check_nmi_watchdog (void)
+{
+ unsigned int prev_nmi_count[NR_CPUS];
+ int cpu;
+
+ printk(KERN_INFO "testing NMI watchdog ... ");
+
+ for (cpu = 0; cpu < NR_CPUS; cpu++)
+ prev_nmi_count[cpu] = irq_stat[cpu].__nmi_count;
+ local_irq_enable();
+ mdelay((10*1000)/nmi_hz); // wait 10 ticks
+
+ /* FIXME: Only boot CPU is online at this stage. Check CPUs
+ as they come up. */
+ for (cpu = 0; cpu < NR_CPUS; cpu++) {
+ if (!cpu_online(cpu))
+ continue;
+ if (nmi_count(cpu) - prev_nmi_count[cpu] <= 5) {
+ printk("CPU#%d: NMI appears to be stuck!\n", cpu);
+ nmi_active = 0;
+ return -1;
+ }
+ }
+ printk("OK.\n");
+
+ /* now that we know it works we can reduce NMI frequency to
+ something more reasonable; makes a difference in some configs */
+ if (nmi_watchdog == NMI_LOCAL_APIC)
+ nmi_hz = 1;
+
+ return 0;
+}
+
+static int __init setup_nmi_watchdog(char *str)
+{
+ int nmi;
+
+ get_option(&str, &nmi);
+
+ if (nmi >= NMI_INVALID)
+ return 0;
+ if (nmi == NMI_NONE)
+ nmi_watchdog = nmi;
+ /*
+ * If any other x86 CPU has a local APIC, then
+ * please test the NMI stuff there and send me the
+ * missing bits. Right now Intel P6/P4 and AMD K7 only.
+ */
+ if ((nmi == NMI_LOCAL_APIC) &&
+ (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) &&
+ (boot_cpu_data.x86 == 6 || boot_cpu_data.x86 == 15))
+ nmi_watchdog = nmi;
+ if ((nmi == NMI_LOCAL_APIC) &&
+ (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) &&
+ (boot_cpu_data.x86 == 6 || boot_cpu_data.x86 == 15))
+ nmi_watchdog = nmi;
+ /*
+ * We can enable the IO-APIC watchdog
+ * unconditionally.
+ */
+ if (nmi == NMI_IO_APIC) {
+ nmi_active = 1;
+ nmi_watchdog = nmi;
+ }
+ return 1;
+}
+
+__setup("nmi_watchdog=", setup_nmi_watchdog);
+
+void disable_lapic_nmi_watchdog(void)
+{
+ if (nmi_active <= 0)
+ return;
+ switch (boot_cpu_data.x86_vendor) {
+ case X86_VENDOR_AMD:
+ wrmsr(MSR_K7_EVNTSEL0, 0, 0);
+ break;
+ case X86_VENDOR_INTEL:
+ switch (boot_cpu_data.x86) {
+ case 6:
+ if (boot_cpu_data.x86_model > 0xd)
+ break;
+
+ wrmsr(MSR_P6_EVNTSEL0, 0, 0);
+ break;
+ case 15:
+ if (boot_cpu_data.x86_model > 0x3)
+ break;
+
+ wrmsr(MSR_P4_IQ_CCCR0, 0, 0);
+ wrmsr(MSR_P4_CRU_ESCR0, 0, 0);
+ break;
+ }
+ break;
+ }
+ nmi_active = -1;
+ /* tell do_nmi() and others that we're not active any more */
+ nmi_watchdog = 0;
+}
+
+void enable_lapic_nmi_watchdog(void)
+{
+ if (nmi_active < 0) {
+ nmi_watchdog = NMI_LOCAL_APIC;
+ setup_apic_nmi_watchdog();
+ }
+}
+
+void disable_timer_nmi_watchdog(void)
+{
+ if ((nmi_watchdog != NMI_IO_APIC) || (nmi_active <= 0))
+ return;
+
+ unset_nmi_callback();
+ nmi_active = -1;
+ nmi_watchdog = NMI_NONE;
+}
+
+void enable_timer_nmi_watchdog(void)
+{
+ if (nmi_active < 0) {
+ nmi_watchdog = NMI_IO_APIC;
+ touch_nmi_watchdog();
+ nmi_active = 1;
+ }
+}
+
+#ifdef CONFIG_PM
+
+static int nmi_pm_active; /* nmi_active before suspend */
+
+static int lapic_nmi_suspend(struct sys_device *dev, u32 state)
+{
+ nmi_pm_active = nmi_active;
+ disable_lapic_nmi_watchdog();
+ return 0;
+}
+
+static int lapic_nmi_resume(struct sys_device *dev)
+{
+ if (nmi_pm_active > 0)
+ enable_lapic_nmi_watchdog();
+ return 0;
+}
+
+
+static struct sysdev_class nmi_sysclass = {
+ set_kset_name("lapic_nmi"),
+ .resume = lapic_nmi_resume,
+ .suspend = lapic_nmi_suspend,
+};
+
+static struct sys_device device_lapic_nmi = {
+ .id = 0,
+ .cls = &nmi_sysclass,
+};
+
+static int __init init_lapic_nmi_sysfs(void)
+{
+ int error;
+
+ if (nmi_active == 0)
+ return 0;
+
+ error = sysdev_class_register(&nmi_sysclass);
+ if (!error)
+ error = sys_device_register(&device_lapic_nmi);
+ return error;
+}
+/* must come after the local APIC's device_initcall() */
+late_initcall(init_lapic_nmi_sysfs);
+
+#endif /* CONFIG_PM */
+
+/*
+ * Activate the NMI watchdog via the local APIC.
+ * Original code written by Keith Owens.
+ */
+
+static void clear_msr_range(unsigned int base, unsigned int n)
+{
+ unsigned int i;
+
+ for(i = 0; i < n; ++i)
+ wrmsr(base+i, 0, 0);
+}
+
+static void setup_k7_watchdog(void)
+{
+ unsigned int evntsel;
+
+ nmi_perfctr_msr = MSR_K7_PERFCTR0;
+
+ clear_msr_range(MSR_K7_EVNTSEL0, 4);
+ clear_msr_range(MSR_K7_PERFCTR0, 4);
+
+ evntsel = K7_EVNTSEL_INT
+ | K7_EVNTSEL_OS
+ | K7_EVNTSEL_USR
+ | K7_NMI_EVENT;
+
+ wrmsr(MSR_K7_EVNTSEL0, evntsel, 0);
+ Dprintk("setting K7_PERFCTR0 to %08lx\n", -(cpu_khz/nmi_hz*1000));
+ wrmsr(MSR_K7_PERFCTR0, -(cpu_khz/nmi_hz*1000), -1);
+ apic_write(APIC_LVTPC, APIC_DM_NMI);
+ evntsel |= K7_EVNTSEL_ENABLE;
+ wrmsr(MSR_K7_EVNTSEL0, evntsel, 0);
+}
+
+static void setup_p6_watchdog(void)
+{
+ unsigned int evntsel;
+
+ nmi_perfctr_msr = MSR_P6_PERFCTR0;
+
+ clear_msr_range(MSR_P6_EVNTSEL0, 2);
+ clear_msr_range(MSR_P6_PERFCTR0, 2);
+
+ evntsel = P6_EVNTSEL_INT
+ | P6_EVNTSEL_OS
+ | P6_EVNTSEL_USR
+ | P6_NMI_EVENT;
+
+ wrmsr(MSR_P6_EVNTSEL0, evntsel, 0);
+ Dprintk("setting P6_PERFCTR0 to %08lx\n", -(cpu_khz/nmi_hz*1000));
+ wrmsr(MSR_P6_PERFCTR0, -(cpu_khz/nmi_hz*1000), 0);
+ apic_write(APIC_LVTPC, APIC_DM_NMI);
+ evntsel |= P6_EVNTSEL0_ENABLE;
+ wrmsr(MSR_P6_EVNTSEL0, evntsel, 0);
+}
+
+static int setup_p4_watchdog(void)
+{
+ unsigned int misc_enable, dummy;
+
+ rdmsr(MSR_P4_MISC_ENABLE, misc_enable, dummy);
+ if (!(misc_enable & MSR_P4_MISC_ENABLE_PERF_AVAIL))
+ return 0;
+
+ nmi_perfctr_msr = MSR_P4_IQ_COUNTER0;
+
+ if (!(misc_enable & MSR_P4_MISC_ENABLE_PEBS_UNAVAIL))
+ clear_msr_range(0x3F1, 2);
+ /* MSR 0x3F0 seems to have a default value of 0xFC00, but current
+ docs doesn't fully define it, so leave it alone for now. */
+ clear_msr_range(0x3A0, 31);
+ clear_msr_range(0x3C0, 6);
+ clear_msr_range(0x3C8, 6);
+ clear_msr_range(0x3E0, 2);
+ clear_msr_range(MSR_P4_CCCR0, 18);
+ clear_msr_range(MSR_P4_PERFCTR0, 18);
+
+ wrmsr(MSR_P4_CRU_ESCR0, P4_NMI_CRU_ESCR0, 0);
+ wrmsr(MSR_P4_IQ_CCCR0, P4_NMI_IQ_CCCR0 & ~P4_CCCR_ENABLE, 0);
+ Dprintk("setting P4_IQ_COUNTER0 to 0x%08lx\n", -(cpu_khz/nmi_hz*1000));
+ wrmsr(MSR_P4_IQ_COUNTER0, -(cpu_khz/nmi_hz*1000), -1);
+ apic_write(APIC_LVTPC, APIC_DM_NMI);
+ wrmsr(MSR_P4_IQ_CCCR0, P4_NMI_IQ_CCCR0, 0);
+ return 1;
+}
+
+void setup_apic_nmi_watchdog (void)
+{
+ switch (boot_cpu_data.x86_vendor) {
+ case X86_VENDOR_AMD:
+ if (boot_cpu_data.x86 != 6 && boot_cpu_data.x86 != 15)
+ return;
+ setup_k7_watchdog();
+ break;
+ case X86_VENDOR_INTEL:
+ switch (boot_cpu_data.x86) {
+ case 6:
+ if (boot_cpu_data.x86_model > 0xd)
+ return;
+
+ setup_p6_watchdog();
+ break;
+ case 15:
+ if (boot_cpu_data.x86_model > 0x3)
+ return;
+
+ if (!setup_p4_watchdog())
+ return;
+ break;
+ default:
+ return;
+ }
+ break;
+ default:
+ return;
+ }
+ nmi_active = 1;
+}
+
+static spinlock_t nmi_print_lock = SPIN_LOCK_UNLOCKED;
+
+/*
+ * the best way to detect whether a CPU has a 'hard lockup' problem
+ * is to check it's local APIC timer IRQ counts. If they are not
+ * changing then that CPU has some problem.
+ *
+ * as these watchdog NMI IRQs are generated on every CPU, we only
+ * have to check the current processor.
+ *
+ * since NMIs don't listen to _any_ locks, we have to be extremely
+ * careful not to rely on unsafe variables. The printk might lock
+ * up though, so we have to break up any console locks first ...
+ * [when there will be more tty-related locks, break them up
+ * here too!]
+ */
+
+static unsigned int
+ last_irq_sums [NR_CPUS],
+ alert_counter [NR_CPUS];
+
+void touch_nmi_watchdog (void)
+{
+ int i;
+
+ /*
+ * Just reset the alert counters, (other CPUs might be
+ * spinning on locks we hold):
+ */
+ for (i = 0; i < NR_CPUS; i++)
+ alert_counter[i] = 0;
+}
+
+void nmi_watchdog_tick (struct pt_regs * regs)
+{
+
+ /*
+ * Since current_thread_info()-> is always on the stack, and we
+ * always switch the stack NMI-atomically, it's safe to use
+ * smp_processor_id().
+ */
+ int sum, cpu = smp_processor_id();
+
+ sum = irq_stat[cpu].apic_timer_irqs;
+
+ if (last_irq_sums[cpu] == sum) {
+ /*
+ * Ayiee, looks like this CPU is stuck ...
+ * wait a few IRQs (5 seconds) before doing the oops ...
+ */
+ alert_counter[cpu]++;
+ if (alert_counter[cpu] == 5*nmi_hz) {
+ spin_lock(&nmi_print_lock);
+ /*
+ * We are in trouble anyway, lets at least try
+ * to get a message out.
+ */
+ bust_spinlocks(1);
+ printk("NMI Watchdog detected LOCKUP on CPU%d, eip %08lx, registers:\n", cpu, regs->eip);
+ show_registers(regs);
+ dump("NMI Watchdog detected LOCKUP", regs);
+ printk("console shuts up ...\n");
+ console_silent();
+ spin_unlock(&nmi_print_lock);
+ bust_spinlocks(0);
+ do_exit(SIGSEGV);
+ }
+ } else {
+ last_irq_sums[cpu] = sum;
+ alert_counter[cpu] = 0;
+ }
+ if (nmi_perfctr_msr) {
+ if (nmi_perfctr_msr == MSR_P4_IQ_COUNTER0) {
+ /*
+ * P4 quirks:
+ * - An overflown perfctr will assert its interrupt
+ * until the OVF flag in its CCCR is cleared.
+ * - LVTPC is masked on interrupt and must be
+ * unmasked by the LVTPC handler.
+ */
+ wrmsr(MSR_P4_IQ_CCCR0, P4_NMI_IQ_CCCR0, 0);
+ apic_write(APIC_LVTPC, APIC_DM_NMI);
+ }
+ wrmsr(nmi_perfctr_msr, -(cpu_khz/nmi_hz*1000), -1);
+ }
+}
+
+EXPORT_SYMBOL(nmi_watchdog);
+EXPORT_SYMBOL(disable_lapic_nmi_watchdog);
+EXPORT_SYMBOL(enable_lapic_nmi_watchdog);
+EXPORT_SYMBOL(disable_timer_nmi_watchdog);
+EXPORT_SYMBOL(enable_timer_nmi_watchdog);
diff --git a/tests/contrib/nmi.c/patch b/tests/contrib/nmi.c/patch
new file mode 100644
index 0000000..225b6df
--- /dev/null
+++ b/tests/contrib/nmi.c/patch
@@ -0,0 +1,8 @@
+@@ -26,6 +26,7 @@
+ #include <linux/nmi.h>
+ #include <linux/sysdev.h>
+ #include <linux/debugger.h>
++#include <linux/dump.h>
+
+ #include <asm/smp.h>
+ #include <asm/mtrr.h>
diff --git a/tests/contrib/pfkey_v2_parse.c/merge b/tests/contrib/pfkey_v2_parse.c/merge
new file mode 100644
index 0000000..b943e7c
--- /dev/null
+++ b/tests/contrib/pfkey_v2_parse.c/merge
@@ -0,0 +1,1789 @@
+/*
+ * RFC2367 PF_KEYv2 Key management API message parser
+ * Copyright (C) 1999, 2000, 2001 Richard Guy Briggs.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version. See <http://www.fsf.org/copyleft/gpl.txt>.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * RCSID $Id: pfkey_v2_parse.c,v 1.53 2003/01/30 02:32:09 rgb Exp $
+ */
+
+/*
+ * Template from klips/net/ipsec/ipsec/ipsec_parser.c.
+ */
+
+char pfkey_v2_parse_c_version[] = "$Id: pfkey_v2_parse.c,v 1.53 2003/01/30 02:32:09 rgb Exp $";
+
+/*
+ * Some ugly stuff to allow consistent debugging code for use in the
+ * kernel and in user space
+*/
+
+#ifdef __KERNEL__
+
+# include <linux/kernel.h> /* for printk */
+
+#include "freeswan/ipsec_kversion.h" /* for malloc switch */
+
+# ifdef MALLOC_SLAB
+# include <linux/slab.h> /* kmalloc() */
+# else /* MALLOC_SLAB */
+# include <linux/malloc.h> /* kmalloc() */
+# endif /* MALLOC_SLAB */
+# include <linux/errno.h> /* error codes */
+# include <linux/types.h> /* size_t */
+# include <linux/interrupt.h> /* mark_bh */
+
+# include <linux/netdevice.h> /* struct device, and other headers */
+# include <linux/etherdevice.h> /* eth_type_trans */
+# include <linux/ip.h> /* struct iphdr */
+# if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+# include <linux/ipv6.h> /* struct ipv6hdr */
+# endif /* if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) */
+extern int debug_pfkey;
+
+# include <freeswan.h>
+
+#include "freeswan/ipsec_encap.h"
+
+#else /* __KERNEL__ */
+
+# include <sys/types.h>
+# include <linux/types.h>
+# include <linux/errno.h>
+
+# include <freeswan.h>
+# include "programs/pluto/constants.h"
+# include "programs/pluto/defs.h" /* for PRINTF_LIKE */
+# include "programs/pluto/log.h" /* for debugging and DBG_log */
+
+/* #define PLUTO */
+
+# ifdef PLUTO
+# define DEBUGGING(level, args...) { DBG_log("pfkey_lib_debug:" args); }
+# else
+# define DEBUGGING(level, args...) if(pfkey_lib_debug & level) { printf("pfkey_lib_debug:" args); } else { ; }
+# endif
+
+#endif /* __KERNEL__ */
+
+
+#include <pfkeyv2.h>
+#include <pfkey.h>
+
+#ifdef __KERNEL__
+# include "freeswan/ipsec_netlink.h" /* KLIPS_PRINT */
+extern int sysctl_ipsec_debug_verbose;
+# define DEBUGGING(level, args...) \
+ KLIPS_PRINT( \
+ ((debug_pfkey & level & (PF_KEY_DEBUG_PARSE_STRUCT | PF_KEY_DEBUG_PARSE_PROBLEM)) \
+ || (sysctl_ipsec_debug_verbose && (debug_pfkey & level & PF_KEY_DEBUG_PARSE_FLOW))) \
+ , "klips_debug:" args)
+#endif /* __KERNEL__ */
+#include "freeswan/ipsec_sa.h" /* IPSEC_SAREF_NULL, IPSEC_SA_REF_TABLE_IDX_WIDTH */
+
+
+#define SENDERR(_x) do { error = -(_x); goto errlab; } while (0)
+
+struct satype_tbl {
+ uint8_t proto;
+ uint8_t satype;
+ char* name;
+} static satype_tbl[] = {
+#ifdef __KERNEL__
+ { IPPROTO_ESP, SADB_SATYPE_ESP, "ESP" },
+ { IPPROTO_AH, SADB_SATYPE_AH, "AH" },
+ { IPPROTO_IPIP, SADB_X_SATYPE_IPIP, "IPIP" },
+#ifdef CONFIG_IPSEC_IPCOMP
+ { IPPROTO_COMP, SADB_X_SATYPE_COMP, "COMP" },
+#endif /* CONFIG_IPSEC_IPCOMP */
+ { IPPROTO_INT, SADB_X_SATYPE_INT, "INT" },
+#else /* __KERNEL__ */
+ { SA_ESP, SADB_SATYPE_ESP, "ESP" },
+ { SA_AH, SADB_SATYPE_AH, "AH" },
+ { SA_IPIP, SADB_X_SATYPE_IPIP, "IPIP" },
+ { SA_COMP, SADB_X_SATYPE_COMP, "COMP" },
+ { SA_INT, SADB_X_SATYPE_INT, "INT" },
+#endif /* __KERNEL__ */
+ { 0, 0, "UNKNOWN" }
+};
+
+uint8_t
+satype2proto(uint8_t satype)
+{
+ int i =0;
+
+ while(satype_tbl[i].satype != satype && satype_tbl[i].satype != 0) {
+ i++;
+ }
+ return satype_tbl[i].proto;
+}
+
+uint8_t
+proto2satype(uint8_t proto)
+{
+ int i = 0;
+
+ while(satype_tbl[i].proto != proto && satype_tbl[i].proto != 0) {
+ i++;
+ }
+ return satype_tbl[i].satype;
+}
+
+char*
+satype2name(uint8_t satype)
+{
+ int i = 0;
+
+ while(satype_tbl[i].satype != satype && satype_tbl[i].satype != 0) {
+ i++;
+ }
+ return satype_tbl[i].name;
+}
+
+char*
+proto2name(uint8_t proto)
+{
+ int i = 0;
+
+ while(satype_tbl[i].proto != proto && satype_tbl[i].proto != 0) {
+ i++;
+ }
+ return satype_tbl[i].name;
+}
+
+/* Default extension parsers taken from the KLIPS code */
+
+DEBUG_NO_STATIC int
+pfkey_sa_parse(struct sadb_ext *pfkey_ext)
+{
+ int error = 0;
+ struct sadb_sa *pfkey_sa = (struct sadb_sa *)pfkey_ext;
+#if 0
+ struct sadb_sa sav2;
+#endif
+
+ DEBUGGING(PF_KEY_DEBUG_PARSE_FLOW,
+ "pfkey_sa_parse: entry\n");
+ /* sanity checks... */
+ if(!pfkey_sa) {
+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM,
+ "pfkey_sa_parse: "
+ "NULL pointer passed in.\n");
+ SENDERR(EINVAL);
+ }
+
+#if 0
+ /* check if this structure is short, and if so, fix it up.
+ * XXX this is NOT the way to do things.
+ */
+ if(pfkey_sa->sadb_sa_len == sizeof(struct sadb_sa_v1)/IPSEC_PFKEYv2_ALIGN) {
+
+ /* yes, so clear out a temporary structure, and copy first */
+ memset(&sav2, 0, sizeof(sav2));
+ memcpy(&sav2, pfkey_sa, sizeof(struct sadb_sa_v1));
+ sav2.sadb_x_sa_ref=-1;
+ sav2.sadb_sa_len = sizeof(struct sadb_sa) / IPSEC_PFKEYv2_ALIGN;
+
+ pfkey_sa = &sav2;
+ }
+#endif
+
+
+ if(pfkey_sa->sadb_sa_len != sizeof(struct sadb_sa) / IPSEC_PFKEYv2_ALIGN) {
+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM,
+ "pfkey_sa_parse: "
+ "length wrong pfkey_sa->sadb_sa_len=%d sizeof(struct sadb_sa)=%d.\n",
+ pfkey_sa->sadb_sa_len,
+ (int)sizeof(struct sadb_sa));
+ SENDERR(EINVAL);
+ }
+
+ if(pfkey_sa->sadb_sa_encrypt > SADB_EALG_MAX) {
+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM,
+ "pfkey_sa_parse: "
+ "pfkey_sa->sadb_sa_encrypt=%d > SADB_EALG_MAX=%d.\n",
+ pfkey_sa->sadb_sa_encrypt,
+ SADB_EALG_MAX);
+ SENDERR(EINVAL);
+ }
+
+ if(pfkey_sa->sadb_sa_auth > SADB_AALG_MAX) {
+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM,
+ "pfkey_sa_parse: "
+ "pfkey_sa->sadb_sa_auth=%d > SADB_AALG_MAX=%d.\n",
+ pfkey_sa->sadb_sa_auth,
+ SADB_AALG_MAX);
+ SENDERR(EINVAL);
+ }
+
+ if(pfkey_sa->sadb_sa_state > SADB_SASTATE_MAX) {
+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM,
+ "pfkey_sa_parse: "
+ "state=%d exceeds MAX=%d.\n",
+ pfkey_sa->sadb_sa_state,
+ SADB_SASTATE_MAX);
+ SENDERR(EINVAL);
+ }
+
+ if(pfkey_sa->sadb_sa_state == SADB_SASTATE_DEAD) {
+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM,
+ "pfkey_sa_parse: "
+ "state=%d is DEAD=%d.\n",
+ pfkey_sa->sadb_sa_state,
+ SADB_SASTATE_DEAD);
+ SENDERR(EINVAL);
+ }
+
+ if(pfkey_sa->sadb_sa_replay > 64) {
+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM,
+ "pfkey_sa_parse: "
+ "replay window size: %d -- must be 0 <= size <= 64\n",
+ pfkey_sa->sadb_sa_replay);
+ SENDERR(EINVAL);
+ }
+
+ if(! ((pfkey_sa->sadb_sa_exttype == SADB_EXT_SA) ||
+ (pfkey_sa->sadb_sa_exttype == SADB_X_EXT_SA2)))
+ {
+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM,
+ "pfkey_sa_parse: "
+ "unknown exttype=%d, expecting SADB_EXT_SA=%d or SADB_X_EXT_SA2=%d.\n",
+ pfkey_sa->sadb_sa_exttype,
+ SADB_EXT_SA,
+ SADB_X_EXT_SA2);
+ SENDERR(EINVAL);
+ }
+
+ if((IPSEC_SAREF_NULL != pfkey_sa->sadb_x_sa_ref) && (pfkey_sa->sadb_x_sa_ref >= (1 << IPSEC_SA_REF_TABLE_IDX_WIDTH))) {
+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM,
+ "pfkey_sa_parse: "
+ "SAref=%d must be (SAref == IPSEC_SAREF_NULL(%d) || SAref < IPSEC_SA_REF_TABLE_NUM_ENTRIES(%d)).\n",
+ pfkey_sa->sadb_x_sa_ref,
+ IPSEC_SAREF_NULL,
+ IPSEC_SA_REF_TABLE_NUM_ENTRIES);
+ SENDERR(EINVAL);
+ }
+
+ DEBUGGING(PF_KEY_DEBUG_PARSE_STRUCT,
+ "pfkey_sa_parse: "
+ "successfully found len=%d exttype=%d(%s) spi=%08lx replay=%d state=%d auth=%d encrypt=%d flags=%d ref=%d.\n",
+ pfkey_sa->sadb_sa_len,
+ pfkey_sa->sadb_sa_exttype,
+ pfkey_v2_sadb_ext_string(pfkey_sa->sadb_sa_exttype),
+ (long unsigned int)ntohl(pfkey_sa->sadb_sa_spi),
+ pfkey_sa->sadb_sa_replay,
+ pfkey_sa->sadb_sa_state,
+ pfkey_sa->sadb_sa_auth,
+ pfkey_sa->sadb_sa_encrypt,
+ pfkey_sa->sadb_sa_flags,
+ pfkey_sa->sadb_x_sa_ref);
+
+ errlab:
+ return error;
+}
+
+DEBUG_NO_STATIC int
+pfkey_lifetime_parse(struct sadb_ext *pfkey_ext)
+{
+ int error = 0;
+ struct sadb_lifetime *pfkey_lifetime = (struct sadb_lifetime *)pfkey_ext;
+
+ DEBUGGING(PF_KEY_DEBUG_PARSE_FLOW,
+ "pfkey_lifetime_parse:enter\n");
+ /* sanity checks... */
+ if(!pfkey_lifetime) {
+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM,
+ "pfkey_lifetime_parse: "
+ "NULL pointer passed in.\n");
+ SENDERR(EINVAL);
+ }
+
+ if(pfkey_lifetime->sadb_lifetime_len !=
+ sizeof(struct sadb_lifetime) / IPSEC_PFKEYv2_ALIGN) {
+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM,
+ "pfkey_lifetime_parse: "
+ "length wrong pfkey_lifetime->sadb_lifetime_len=%d sizeof(struct sadb_lifetime)=%d.\n",
+ pfkey_lifetime->sadb_lifetime_len,
+ (int)sizeof(struct sadb_lifetime));
+ SENDERR(EINVAL);
+ }
+
+ if((pfkey_lifetime->sadb_lifetime_exttype != SADB_EXT_LIFETIME_HARD) &&
+ (pfkey_lifetime->sadb_lifetime_exttype != SADB_EXT_LIFETIME_SOFT) &&
+ (pfkey_lifetime->sadb_lifetime_exttype != SADB_EXT_LIFETIME_CURRENT)) {
+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM,
+ "pfkey_lifetime_parse: "
+ "unexpected ext_type=%d.\n",
+ pfkey_lifetime->sadb_lifetime_exttype);
+ SENDERR(EINVAL);
+ }
+
+ DEBUGGING(PF_KEY_DEBUG_PARSE_STRUCT,
+ "pfkey_lifetime_parse: "
+ "life_type=%d(%s) alloc=%u bytes=%u add=%u use=%u pkts=%u.\n",
+ pfkey_lifetime->sadb_lifetime_exttype,
+ pfkey_v2_sadb_ext_string(pfkey_lifetime->sadb_lifetime_exttype),
+ pfkey_lifetime->sadb_lifetime_allocations,
+ (unsigned)pfkey_lifetime->sadb_lifetime_bytes,
+ (unsigned)pfkey_lifetime->sadb_lifetime_addtime,
+ (unsigned)pfkey_lifetime->sadb_lifetime_usetime,
+ pfkey_lifetime->sadb_x_lifetime_packets);
+errlab:
+ return error;
+}
+
+DEBUG_NO_STATIC int
+pfkey_address_parse(struct sadb_ext *pfkey_ext)
+{
+ int error = 0;
+ int saddr_len = 0;
+ struct sadb_address *pfkey_address = (struct sadb_address *)pfkey_ext;
+ struct sockaddr* s = (struct sockaddr*)((char*)pfkey_address + sizeof(*pfkey_address));
+ char ipaddr_txt[ADDRTOT_BUF];
+
+ DEBUGGING(PF_KEY_DEBUG_PARSE_FLOW,
+ "pfkey_address_parse:enter\n");
+ /* sanity checks... */
+ if(!pfkey_address) {
+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM,
+ "pfkey_address_parse: "
+ "NULL pointer passed in.\n");
+ SENDERR(EINVAL);
+ }
+
+ if(pfkey_address->sadb_address_len <
+ (sizeof(struct sadb_address) + sizeof(struct sockaddr))/
+ IPSEC_PFKEYv2_ALIGN) {
+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM,
+ "pfkey_address_parse: "
+ "size wrong 1 ext_len=%d, adr_ext_len=%d, saddr_len=%d.\n",
+ pfkey_address->sadb_address_len,
+ (int)sizeof(struct sadb_address),
+ (int)sizeof(struct sockaddr));
+ SENDERR(EINVAL);
+ }
+
+ if(pfkey_address->sadb_address_reserved) {
+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM,
+ "pfkey_address_parse: "
+ "res=%d, must be zero.\n",
+ pfkey_address->sadb_address_reserved);
+ SENDERR(EINVAL);
+ }
+
+ switch(pfkey_address->sadb_address_exttype) {
+ case SADB_EXT_ADDRESS_SRC:
+ case SADB_EXT_ADDRESS_DST:
+ case SADB_EXT_ADDRESS_PROXY:
+ case SADB_X_EXT_ADDRESS_DST2:
+ case SADB_X_EXT_ADDRESS_SRC_FLOW:
+ case SADB_X_EXT_ADDRESS_DST_FLOW:
+ case SADB_X_EXT_ADDRESS_SRC_MASK:
+ case SADB_X_EXT_ADDRESS_DST_MASK:
+#ifdef NAT_TRAVERSAL
+ case SADB_X_EXT_NAT_T_OA:
+#endif
+ break;
+ default:
+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM,
+ "pfkey_address_parse: "
+ "unexpected ext_type=%d.\n",
+ pfkey_address->sadb_address_exttype);
+ SENDERR(EINVAL);
+ }
+
+ switch(s->sa_family) {
+ case AF_INET:
+ saddr_len = sizeof(struct sockaddr_in);
+ sprintf(ipaddr_txt, "%d.%d.%d.%d"
+ , (((struct sockaddr_in*)s)->sin_addr.s_addr >> 0) & 0xFF
+ , (((struct sockaddr_in*)s)->sin_addr.s_addr >> 8) & 0xFF
+ , (((struct sockaddr_in*)s)->sin_addr.s_addr >> 16) & 0xFF
+ , (((struct sockaddr_in*)s)->sin_addr.s_addr >> 24) & 0xFF);
+ DEBUGGING(PF_KEY_DEBUG_PARSE_STRUCT,
+ "pfkey_address_parse: "
+ "found exttype=%u(%s) family=%d(AF_INET) address=%s proto=%u port=%u.\n",
+ pfkey_address->sadb_address_exttype,
+ pfkey_v2_sadb_ext_string(pfkey_address->sadb_address_exttype),
+ s->sa_family,
+ ipaddr_txt,
+ pfkey_address->sadb_address_proto,
+ ((struct sockaddr_in*)s)->sin_port);
+ break;
+ case AF_INET6:
+ saddr_len = sizeof(struct sockaddr_in6);
+ sprintf(ipaddr_txt, "%x:%x:%x:%x:%x:%x:%x:%x"
+ , ntohs(((struct sockaddr_in6*)s)->sin6_addr.s6_addr16[0])
+ , ntohs(((struct sockaddr_in6*)s)->sin6_addr.s6_addr16[1])
+ , ntohs(((struct sockaddr_in6*)s)->sin6_addr.s6_addr16[2])
+ , ntohs(((struct sockaddr_in6*)s)->sin6_addr.s6_addr16[3])
+ , ntohs(((struct sockaddr_in6*)s)->sin6_addr.s6_addr16[4])
+ , ntohs(((struct sockaddr_in6*)s)->sin6_addr.s6_addr16[5])
+ , ntohs(((struct sockaddr_in6*)s)->sin6_addr.s6_addr16[6])
+ , ntohs(((struct sockaddr_in6*)s)->sin6_addr.s6_addr16[7]));
+ DEBUGGING(PF_KEY_DEBUG_PARSE_STRUCT,
+ "pfkey_address_parse: "
+ "found exttype=%u(%s) family=%d(AF_INET6) address=%s proto=%u port=%u.\n",
+ pfkey_address->sadb_address_exttype,
+ pfkey_v2_sadb_ext_string(pfkey_address->sadb_address_exttype),
+ s->sa_family,
+ ipaddr_txt,
+ pfkey_address->sadb_address_proto,
+ ((struct sockaddr_in6*)s)->sin6_port);
+ break;
+ default:
+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM,
+ "pfkey_address_parse: "
+ "s->sa_family=%d not supported.\n",
+ s->sa_family);
+ SENDERR(EPFNOSUPPORT);
+ }
+
+ if(pfkey_address->sadb_address_len !=
+ DIVUP(sizeof(struct sadb_address) + saddr_len, IPSEC_PFKEYv2_ALIGN)) {
+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM,
+ "pfkey_address_parse: "
+ "size wrong 2 ext_len=%d, adr_ext_len=%d, saddr_len=%d.\n",
+ pfkey_address->sadb_address_len,
+ (int)sizeof(struct sadb_address),
+ saddr_len);
+ SENDERR(EINVAL);
+ }
+
+ if(pfkey_address->sadb_address_prefixlen != 0) {
+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM,
+ "pfkey_address_parse: "
+ "address prefixes not supported yet.\n");
+ SENDERR(EAFNOSUPPORT); /* not supported yet */
+ }
+
+ /* XXX check if port!=0 */
+
+ DEBUGGING(PF_KEY_DEBUG_PARSE_FLOW,
+ "pfkey_address_parse: successful.\n");
+ errlab:
+ return error;
+}
+
+DEBUG_NO_STATIC int
+pfkey_key_parse(struct sadb_ext *pfkey_ext)
+{
+ int error = 0;
+ struct sadb_key *pfkey_key = (struct sadb_key *)pfkey_ext;
+
+ DEBUGGING(PF_KEY_DEBUG_PARSE_FLOW,
+ "pfkey_key_parse:enter\n");
+ /* sanity checks... */
+
+ if(!pfkey_key) {
+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM,
+ "pfkey_key_parse: "
+ "NULL pointer passed in.\n");
+ SENDERR(EINVAL);
+ }
+
+ if(pfkey_key->sadb_key_len < sizeof(struct sadb_key) / IPSEC_PFKEYv2_ALIGN) {
+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM,
+ "pfkey_key_parse: "
+ "size wrong ext_len=%d, key_ext_len=%d.\n",
+ pfkey_key->sadb_key_len,
+ (int)sizeof(struct sadb_key));
+ SENDERR(EINVAL);
+ }
+
+ if(!pfkey_key->sadb_key_bits) {
+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM,
+ "pfkey_key_parse: "
+ "key length set to zero, must be non-zero.\n");
+ SENDERR(EINVAL);
+ }
+
+ if(pfkey_key->sadb_key_len !=
+ DIVUP(sizeof(struct sadb_key) * OCTETBITS + pfkey_key->sadb_key_bits,
+ PFKEYBITS)) {
+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM,
+ "pfkey_key_parse: "
+ "key length=%d does not agree with extension length=%d.\n",
+ pfkey_key->sadb_key_bits,
+ pfkey_key->sadb_key_len);
+ SENDERR(EINVAL);
+ }
+
+ if(pfkey_key->sadb_key_reserved) {
+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM,
+ "pfkey_key_parse: "
+ "res=%d, must be zero.\n",
+ pfkey_key->sadb_key_reserved);
+ SENDERR(EINVAL);
+ }
+
+ if(! ( (pfkey_key->sadb_key_exttype == SADB_EXT_KEY_AUTH) ||
+ (pfkey_key->sadb_key_exttype == SADB_EXT_KEY_ENCRYPT))) {
+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM,
+ "pfkey_key_parse: "
+ "expecting extension type AUTH or ENCRYPT, got %d.\n",
+ pfkey_key->sadb_key_exttype);
+ SENDERR(EINVAL);
+ }
+
+ DEBUGGING(PF_KEY_DEBUG_PARSE_STRUCT,
+ "pfkey_key_parse: "
+ "success, found len=%d exttype=%d(%s) bits=%d reserved=%d.\n",
+ pfkey_key->sadb_key_len,
+ pfkey_key->sadb_key_exttype,
+ pfkey_v2_sadb_ext_string(pfkey_key->sadb_key_exttype),
+ pfkey_key->sadb_key_bits,
+ pfkey_key->sadb_key_reserved);
+
+errlab:
+ return error;
+}
+
+DEBUG_NO_STATIC int
+pfkey_ident_parse(struct sadb_ext *pfkey_ext)
+{
+ int error = 0;
+ struct sadb_ident *pfkey_ident = (struct sadb_ident *)pfkey_ext;
+
+ /* sanity checks... */
+ if(pfkey_ident->sadb_ident_len < sizeof(struct sadb_ident) / IPSEC_PFKEYv2_ALIGN) {
+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM,
+ "pfkey_ident_parse: "
+ "size wrong ext_len=%d, key_ext_len=%d.\n",
+ pfkey_ident->sadb_ident_len,
+ (int)sizeof(struct sadb_ident));
+ SENDERR(EINVAL);
+ }
+
+ if(pfkey_ident->sadb_ident_type > SADB_IDENTTYPE_MAX) {
+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM,
+ "pfkey_ident_parse: "
+ "ident_type=%d out of range, must be less than %d.\n",
+ pfkey_ident->sadb_ident_type,
+ SADB_IDENTTYPE_MAX);
+ SENDERR(EINVAL);
+ }
+
+ if(pfkey_ident->sadb_ident_reserved) {
+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM,
+ "pfkey_ident_parse: "
+ "res=%d, must be zero.\n",
+ pfkey_ident->sadb_ident_reserved);
+ SENDERR(EINVAL);
+ }
+
+ /* string terminator/padding must be zero */
+ if(pfkey_ident->sadb_ident_len > sizeof(struct sadb_ident) / IPSEC_PFKEYv2_ALIGN) {
+ if(*((char*)pfkey_ident + pfkey_ident->sadb_ident_len * IPSEC_PFKEYv2_ALIGN - 1)) {
+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM,
+ "pfkey_ident_parse: "
+ "string padding must be zero, last is 0x%02x.\n",
+ *((char*)pfkey_ident +
+ pfkey_ident->sadb_ident_len * IPSEC_PFKEYv2_ALIGN - 1));
+ SENDERR(EINVAL);
+ }
+ }
+
+ if( ! ((pfkey_ident->sadb_ident_exttype == SADB_EXT_IDENTITY_SRC) ||
+ (pfkey_ident->sadb_ident_exttype == SADB_EXT_IDENTITY_DST))) {
+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM,
+ "pfkey_key_parse: "
+ "expecting extension type IDENTITY_SRC or IDENTITY_DST, got %d.\n",
+ pfkey_ident->sadb_ident_exttype);
+ SENDERR(EINVAL);
+ }
+
+errlab:
+ return error;
+}
+
+DEBUG_NO_STATIC int
+pfkey_sens_parse(struct sadb_ext *pfkey_ext)
+{
+ int error = 0;
+ struct sadb_sens *pfkey_sens = (struct sadb_sens *)pfkey_ext;
+
+ /* sanity checks... */
+ if(pfkey_sens->sadb_sens_len < sizeof(struct sadb_sens) / IPSEC_PFKEYv2_ALIGN) {
+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM,
+ "pfkey_sens_parse: "
+ "size wrong ext_len=%d, key_ext_len=%d.\n",
+ pfkey_sens->sadb_sens_len,
+ (int)sizeof(struct sadb_sens));
+ SENDERR(EINVAL);
+ }
+
+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM,
+ "pfkey_sens_parse: "
+ "Sorry, I can't parse exttype=%d yet.\n",
+ pfkey_ext->sadb_ext_type);
+#if 0
+ SENDERR(EINVAL); /* don't process these yet */
+#endif
+
+errlab:
+ return error;
+}
+
+DEBUG_NO_STATIC int
+pfkey_prop_parse(struct sadb_ext *pfkey_ext)
+{
+ int error = 0;
+ int i, num_comb;
+ struct sadb_prop *pfkey_prop = (struct sadb_prop *)pfkey_ext;
+ struct sadb_comb *pfkey_comb = (struct sadb_comb *)((char*)pfkey_ext + sizeof(struct sadb_prop));
+
+ /* sanity checks... */
+ if((pfkey_prop->sadb_prop_len < sizeof(struct sadb_prop) / IPSEC_PFKEYv2_ALIGN) ||
+ (((pfkey_prop->sadb_prop_len * IPSEC_PFKEYv2_ALIGN) - sizeof(struct sadb_prop)) % sizeof(struct sadb_comb))) {
+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM,
+ "pfkey_prop_parse: "
+ "size wrong ext_len=%d, prop_ext_len=%d comb_ext_len=%d.\n",
+ pfkey_prop->sadb_prop_len,
+ (int)sizeof(struct sadb_prop),
+ (int)sizeof(struct sadb_comb));
+ SENDERR(EINVAL);
+ }
+
+ if(pfkey_prop->sadb_prop_replay > 64) {
+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM,
+ "pfkey_prop_parse: "
+ "replay window size: %d -- must be 0 <= size <= 64\n",
+ pfkey_prop->sadb_prop_replay);
+ SENDERR(EINVAL);
+ }
+
+ for(i=0; i<3; i++) {
+ if(pfkey_prop->sadb_prop_reserved[i]) {
+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM,
+ "pfkey_prop_parse: "
+ "res[%d]=%d, must be zero.\n",
+ i, pfkey_prop->sadb_prop_reserved[i]);
+ SENDERR(EINVAL);
+ }
+ }
+
+ num_comb = ((pfkey_prop->sadb_prop_len * IPSEC_PFKEYv2_ALIGN) - sizeof(struct sadb_prop)) / sizeof(struct sadb_comb);
+
+ for(i = 0; i < num_comb; i++) {
+ if(pfkey_comb->sadb_comb_auth > SADB_AALG_MAX) {
+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM,
+ "pfkey_prop_parse: "
+ "pfkey_comb[%d]->sadb_comb_auth=%d > SADB_AALG_MAX=%d.\n",
+ i,
+ pfkey_comb->sadb_comb_auth,
+ SADB_AALG_MAX);
+ SENDERR(EINVAL);
+ }
+
+ if(pfkey_comb->sadb_comb_auth) {
+ if(!pfkey_comb->sadb_comb_auth_minbits) {
+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM,
+ "pfkey_prop_parse: "
+ "pfkey_comb[%d]->sadb_comb_auth_minbits=0, fatal.\n",
+ i);
+ SENDERR(EINVAL);
+ }
+ if(!pfkey_comb->sadb_comb_auth_maxbits) {
+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM,
+ "pfkey_prop_parse: "
+ "pfkey_comb[%d]->sadb_comb_auth_maxbits=0, fatal.\n",
+ i);
+ SENDERR(EINVAL);
+ }
+ if(pfkey_comb->sadb_comb_auth_minbits > pfkey_comb->sadb_comb_auth_maxbits) {
+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM,
+ "pfkey_prop_parse: "
+ "pfkey_comb[%d]->sadb_comb_auth_minbits=%d > maxbits=%d, fatal.\n",
+ i,
+ pfkey_comb->sadb_comb_auth_minbits,
+ pfkey_comb->sadb_comb_auth_maxbits);
+ SENDERR(EINVAL);
+ }
+ } else {
+ if(pfkey_comb->sadb_comb_auth_minbits) {
+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM,
+ "pfkey_prop_parse: "
+ "pfkey_comb[%d]->sadb_comb_auth_minbits=%d != 0, fatal.\n",
+ i,
+ pfkey_comb->sadb_comb_auth_minbits);
+ SENDERR(EINVAL);
+ }
+ if(pfkey_comb->sadb_comb_auth_maxbits) {
+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM,
+ "pfkey_prop_parse: "
+ "pfkey_comb[%d]->sadb_comb_auth_maxbits=%d != 0, fatal.\n",
+ i,
+ pfkey_comb->sadb_comb_auth_maxbits);
+ SENDERR(EINVAL);
+ }
+ }
+
+ if(pfkey_comb->sadb_comb_encrypt > SADB_EALG_MAX) {
+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM,
+ "pfkey_comb_parse: "
+ "pfkey_comb[%d]->sadb_comb_encrypt=%d > SADB_EALG_MAX=%d.\n",
+ i,
+ pfkey_comb->sadb_comb_encrypt,
+ SADB_EALG_MAX);
+ SENDERR(EINVAL);
+ }
+
+ if(pfkey_comb->sadb_comb_encrypt) {
+ if(!pfkey_comb->sadb_comb_encrypt_minbits) {
+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM,
+ "pfkey_prop_parse: "
+ "pfkey_comb[%d]->sadb_comb_encrypt_minbits=0, fatal.\n",
+ i);
+ SENDERR(EINVAL);
+ }
+ if(!pfkey_comb->sadb_comb_encrypt_maxbits) {
+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM,
+ "pfkey_prop_parse: "
+ "pfkey_comb[%d]->sadb_comb_encrypt_maxbits=0, fatal.\n",
+ i);
+ SENDERR(EINVAL);
+ }
+ if(pfkey_comb->sadb_comb_encrypt_minbits > pfkey_comb->sadb_comb_encrypt_maxbits) {
+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM,
+ "pfkey_prop_parse: "
+ "pfkey_comb[%d]->sadb_comb_encrypt_minbits=%d > maxbits=%d, fatal.\n",
+ i,
+ pfkey_comb->sadb_comb_encrypt_minbits,
+ pfkey_comb->sadb_comb_encrypt_maxbits);
+ SENDERR(EINVAL);
+ }
+ } else {
+ if(pfkey_comb->sadb_comb_encrypt_minbits) {
+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM,
+ "pfkey_prop_parse: "
+ "pfkey_comb[%d]->sadb_comb_encrypt_minbits=%d != 0, fatal.\n",
+ i,
+ pfkey_comb->sadb_comb_encrypt_minbits);
+ SENDERR(EINVAL);
+ }
+ if(pfkey_comb->sadb_comb_encrypt_maxbits) {
+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM,
+ "pfkey_prop_parse: "
+ "pfkey_comb[%d]->sadb_comb_encrypt_maxbits=%d != 0, fatal.\n",
+ i,
+ pfkey_comb->sadb_comb_encrypt_maxbits);
+ SENDERR(EINVAL);
+ }
+ }
+
+ /* XXX do sanity check on flags */
+
+ if(pfkey_comb->sadb_comb_hard_allocations && pfkey_comb->sadb_comb_soft_allocations > pfkey_comb->sadb_comb_hard_allocations) {
+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM,
+ "pfkey_prop_parse: "
+ "pfkey_comb[%d]->sadb_comb_soft_allocations=%d > hard_allocations=%d, fatal.\n",
+ i,
+ pfkey_comb->sadb_comb_soft_allocations,
+ pfkey_comb->sadb_comb_hard_allocations);
+ SENDERR(EINVAL);
+ }
+
+ if(pfkey_comb->sadb_comb_hard_bytes && pfkey_comb->sadb_comb_soft_bytes > pfkey_comb->sadb_comb_hard_bytes) {
+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM,
+ "pfkey_prop_parse: "
+ "pfkey_comb[%d]->sadb_comb_soft_bytes=%Ld > hard_bytes=%Ld, fatal.\n",
+ i,
+ (unsigned long long int)pfkey_comb->sadb_comb_soft_bytes,
+ (unsigned long long int)pfkey_comb->sadb_comb_hard_bytes);
+ SENDERR(EINVAL);
+ }
+
+ if(pfkey_comb->sadb_comb_hard_addtime && pfkey_comb->sadb_comb_soft_addtime > pfkey_comb->sadb_comb_hard_addtime) {
+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM,
+ "pfkey_prop_parse: "
+ "pfkey_comb[%d]->sadb_comb_soft_addtime=%Ld > hard_addtime=%Ld, fatal.\n",
+ i,
+ (unsigned long long int)pfkey_comb->sadb_comb_soft_addtime,
+ (unsigned long long int)pfkey_comb->sadb_comb_hard_addtime);
+ SENDERR(EINVAL);
+ }
+
+ if(pfkey_comb->sadb_comb_hard_usetime && pfkey_comb->sadb_comb_soft_usetime > pfkey_comb->sadb_comb_hard_usetime) {
+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM,
+ "pfkey_prop_parse: "
+ "pfkey_comb[%d]->sadb_comb_soft_usetime=%Ld > hard_usetime=%Ld, fatal.\n",
+ i,
+ (unsigned long long int)pfkey_comb->sadb_comb_soft_usetime,
+ (unsigned long long int)pfkey_comb->sadb_comb_hard_usetime);
+ SENDERR(EINVAL);
+ }
+
+ if(pfkey_comb->sadb_x_comb_hard_packets && pfkey_comb->sadb_x_comb_soft_packets > pfkey_comb->sadb_x_comb_hard_packets) {
+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM,
+ "pfkey_prop_parse: "
+ "pfkey_comb[%d]->sadb_x_comb_soft_packets=%d > hard_packets=%d, fatal.\n",
+ i,
+ pfkey_comb->sadb_x_comb_soft_packets,
+ pfkey_comb->sadb_x_comb_hard_packets);
+ SENDERR(EINVAL);
+ }
+
+ if(pfkey_comb->sadb_comb_reserved) {
+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM,
+ "pfkey_prop_parse: "
+ "comb[%d].res=%d, must be zero.\n",
+ i,
+ pfkey_comb->sadb_comb_reserved);
+ SENDERR(EINVAL);
+ }
+ pfkey_comb++;
+ }
+
+errlab:
+ return error;
+}
+
+DEBUG_NO_STATIC int
+pfkey_supported_parse(struct sadb_ext *pfkey_ext)
+{
+ int error = 0;
+ unsigned int i, num_alg;
+ struct sadb_supported *pfkey_supported = (struct sadb_supported *)pfkey_ext;
+ struct sadb_alg *pfkey_alg = (struct sadb_alg*)((char*)pfkey_ext + sizeof(struct sadb_supported));
+
+ /* sanity checks... */
+ if((pfkey_supported->sadb_supported_len <
+ sizeof(struct sadb_supported) / IPSEC_PFKEYv2_ALIGN) ||
+ (((pfkey_supported->sadb_supported_len * IPSEC_PFKEYv2_ALIGN) -
+ sizeof(struct sadb_supported)) % sizeof(struct sadb_alg))) {
+
+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM,
+ "pfkey_supported_parse: "
+ "size wrong ext_len=%d, supported_ext_len=%d alg_ext_len=%d.\n",
+ pfkey_supported->sadb_supported_len,
+ (int)sizeof(struct sadb_supported),
+ (int)sizeof(struct sadb_alg));
+ SENDERR(EINVAL);
+ }
+
+ if(pfkey_supported->sadb_supported_reserved) {
+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM,
+ "pfkey_supported_parse: "
+ "res=%d, must be zero.\n",
+ pfkey_supported->sadb_supported_reserved);
+ SENDERR(EINVAL);
+ }
+
+ num_alg = ((pfkey_supported->sadb_supported_len * IPSEC_PFKEYv2_ALIGN) - sizeof(struct sadb_supported)) / sizeof(struct sadb_alg);
+
+ for(i = 0; i < num_alg; i++) {
+ /* process algo description */
+ if(pfkey_alg->sadb_alg_reserved) {
+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM,
+ "pfkey_supported_parse: "
+ "alg[%d], id=%d, ivlen=%d, minbits=%d, maxbits=%d, res=%d, must be zero.\n",
+ i,
+ pfkey_alg->sadb_alg_id,
+ pfkey_alg->sadb_alg_ivlen,
+ pfkey_alg->sadb_alg_minbits,
+ pfkey_alg->sadb_alg_maxbits,
+ pfkey_alg->sadb_alg_reserved);
+ SENDERR(EINVAL);
+ }
+
+ /* XXX can alg_id auth/enc be determined from info given?
+		   Yes, but OpenBSD's method does not interoperate with rfc2367.
+ rgb, 2000-04-06 */
+
+ switch(pfkey_supported->sadb_supported_exttype) {
+ case SADB_EXT_SUPPORTED_AUTH:
+ if(pfkey_alg->sadb_alg_id > SADB_AALG_MAX) {
+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM,
+ "pfkey_supported_parse: "
+ "alg[%d], alg_id=%d > SADB_AALG_MAX=%d, fatal.\n",
+ i,
+ pfkey_alg->sadb_alg_id,
+ SADB_AALG_MAX);
+ SENDERR(EINVAL);
+ }
+ break;
+ case SADB_EXT_SUPPORTED_ENCRYPT:
+ if(pfkey_alg->sadb_alg_id > SADB_EALG_MAX) {
+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM,
+ "pfkey_supported_parse: "
+ "alg[%d], alg_id=%d > SADB_EALG_MAX=%d, fatal.\n",
+ i,
+ pfkey_alg->sadb_alg_id,
+ SADB_EALG_MAX);
+ SENDERR(EINVAL);
+ }
+ break;
+ default:
+			DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM,
+				  "pfkey_supported_parse: "
+				  "alg[%d], unexpected supported_exttype=%d, fatal.\n",
+				  i,
+				  pfkey_supported->sadb_supported_exttype);
+ SENDERR(EINVAL);
+ }
+ pfkey_alg++;
+ }
+
+ errlab:
+ return error;
+}
+
+DEBUG_NO_STATIC int
+pfkey_spirange_parse(struct sadb_ext *pfkey_ext)
+{
+ int error = 0;
+ struct sadb_spirange *pfkey_spirange = (struct sadb_spirange *)pfkey_ext;
+
+ /* sanity checks... */
+ if(pfkey_spirange->sadb_spirange_len !=
+ sizeof(struct sadb_spirange) / IPSEC_PFKEYv2_ALIGN) {
+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM,
+ "pfkey_spirange_parse: "
+ "size wrong ext_len=%d, key_ext_len=%d.\n",
+ pfkey_spirange->sadb_spirange_len,
+ (int)sizeof(struct sadb_spirange));
+ SENDERR(EINVAL);
+ }
+
+ if(pfkey_spirange->sadb_spirange_reserved) {
+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM,
+ "pfkey_spirange_parse: "
+ "reserved=%d must be set to zero.\n",
+ pfkey_spirange->sadb_spirange_reserved);
+ SENDERR(EINVAL);
+ }
+
+ if(ntohl(pfkey_spirange->sadb_spirange_max) < ntohl(pfkey_spirange->sadb_spirange_min)) {
+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM,
+ "pfkey_spirange_parse: "
+ "minspi=%08x must be < maxspi=%08x.\n",
+ ntohl(pfkey_spirange->sadb_spirange_min),
+ ntohl(pfkey_spirange->sadb_spirange_max));
+ SENDERR(EINVAL);
+ }
+
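+	/* SPI values up to 255 are reserved, so a requested range must start
+	 * above them; see the "spi > 0x100" note in the revision history at
+	 * the end of this file. */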
+ if(ntohl(pfkey_spirange->sadb_spirange_min) <= 255) {
+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM,
+ "pfkey_spirange_parse: "
+ "minspi=%08x must be > 255.\n",
+ ntohl(pfkey_spirange->sadb_spirange_min));
+ SENDERR(EEXIST);
+ }
+
+ DEBUGGING(PF_KEY_DEBUG_PARSE_STRUCT,
+ "pfkey_spirange_parse: "
+ "ext_len=%u ext_type=%u(%s) min=%u max=%u res=%u.\n",
+ pfkey_spirange->sadb_spirange_len,
+ pfkey_spirange->sadb_spirange_exttype,
+ pfkey_v2_sadb_ext_string(pfkey_spirange->sadb_spirange_exttype),
+ pfkey_spirange->sadb_spirange_min,
+ pfkey_spirange->sadb_spirange_max,
+ pfkey_spirange->sadb_spirange_reserved);
+ errlab:
+ return error;
+}
+
+DEBUG_NO_STATIC int
+pfkey_x_kmprivate_parse(struct sadb_ext *pfkey_ext)
+{
+ int error = 0;
+ struct sadb_x_kmprivate *pfkey_x_kmprivate = (struct sadb_x_kmprivate *)pfkey_ext;
+
+ /* sanity checks... */
+ if(pfkey_x_kmprivate->sadb_x_kmprivate_len <
+ sizeof(struct sadb_x_kmprivate) / IPSEC_PFKEYv2_ALIGN) {
+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM,
+ "pfkey_x_kmprivate_parse: "
+ "size wrong ext_len=%d, key_ext_len=%d.\n",
+ pfkey_x_kmprivate->sadb_x_kmprivate_len,
+ (int)sizeof(struct sadb_x_kmprivate));
+ SENDERR(EINVAL);
+ }
+
+ if(pfkey_x_kmprivate->sadb_x_kmprivate_reserved) {
+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM,
+ "pfkey_x_kmprivate_parse: "
+ "reserved=%d must be set to zero.\n",
+ pfkey_x_kmprivate->sadb_x_kmprivate_reserved);
+ SENDERR(EINVAL);
+ }
+
+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM,
+ "pfkey_x_kmprivate_parse: "
+ "Sorry, I can't parse exttype=%d yet.\n",
+ pfkey_ext->sadb_ext_type);
+ SENDERR(EINVAL); /* don't process these yet */
+
+errlab:
+ return error;
+}
+
+DEBUG_NO_STATIC int
+pfkey_x_satype_parse(struct sadb_ext *pfkey_ext)
+{
+ int error = 0;
+ int i;
+ struct sadb_x_satype *pfkey_x_satype = (struct sadb_x_satype *)pfkey_ext;
+
+ DEBUGGING(PF_KEY_DEBUG_PARSE_FLOW,
+ "pfkey_x_satype_parse: enter\n");
+ /* sanity checks... */
+ if(pfkey_x_satype->sadb_x_satype_len !=
+ sizeof(struct sadb_x_satype) / IPSEC_PFKEYv2_ALIGN) {
+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM,
+ "pfkey_x_satype_parse: "
+ "size wrong ext_len=%d, key_ext_len=%d.\n",
+ pfkey_x_satype->sadb_x_satype_len,
+ (int)sizeof(struct sadb_x_satype));
+ SENDERR(EINVAL);
+ }
+
+ if(!pfkey_x_satype->sadb_x_satype_satype) {
+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM,
+ "pfkey_x_satype_parse: "
+ "satype is zero, must be non-zero.\n");
+ SENDERR(EINVAL);
+ }
+
+ if(pfkey_x_satype->sadb_x_satype_satype > SADB_SATYPE_MAX) {
+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM,
+ "pfkey_x_satype_parse: "
+ "satype %d > max %d, invalid.\n",
+ pfkey_x_satype->sadb_x_satype_satype, SADB_SATYPE_MAX);
+ SENDERR(EINVAL);
+ }
+
+ if(!(satype2proto(pfkey_x_satype->sadb_x_satype_satype))) {
+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM,
+ "pfkey_x_satype_parse: "
+ "proto lookup from satype=%d failed.\n",
+ pfkey_x_satype->sadb_x_satype_satype);
+ SENDERR(EINVAL);
+ }
+
+ for(i = 0; i < 3; i++) {
+ if(pfkey_x_satype->sadb_x_satype_reserved[i]) {
+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM,
+ "pfkey_x_satype_parse: "
+ "reserved[%d]=%d must be set to zero.\n",
+ i, pfkey_x_satype->sadb_x_satype_reserved[i]);
+ SENDERR(EINVAL);
+ }
+ }
+
+ DEBUGGING(PF_KEY_DEBUG_PARSE_STRUCT,
+ "pfkey_x_satype_parse: "
+ "len=%u ext=%u(%s) satype=%u(%s) res=%u,%u,%u.\n",
+ pfkey_x_satype->sadb_x_satype_len,
+ pfkey_x_satype->sadb_x_satype_exttype,
+ pfkey_v2_sadb_ext_string(pfkey_x_satype->sadb_x_satype_exttype),
+ pfkey_x_satype->sadb_x_satype_satype,
+ satype2name(pfkey_x_satype->sadb_x_satype_satype),
+ pfkey_x_satype->sadb_x_satype_reserved[0],
+ pfkey_x_satype->sadb_x_satype_reserved[1],
+ pfkey_x_satype->sadb_x_satype_reserved[2]);
+errlab:
+ return error;
+}
+
+DEBUG_NO_STATIC int
+pfkey_x_ext_debug_parse(struct sadb_ext *pfkey_ext)
+{
+ int error = 0;
+ int i;
+ struct sadb_x_debug *pfkey_x_debug = (struct sadb_x_debug *)pfkey_ext;
+
+ DEBUGGING(PF_KEY_DEBUG_PARSE_FLOW,
+ "pfkey_x_debug_parse: enter\n");
+ /* sanity checks... */
+ if(pfkey_x_debug->sadb_x_debug_len !=
+ sizeof(struct sadb_x_debug) / IPSEC_PFKEYv2_ALIGN) {
+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM,
+ "pfkey_x_debug_parse: "
+ "size wrong ext_len=%d, key_ext_len=%d.\n",
+ pfkey_x_debug->sadb_x_debug_len,
+ (int)sizeof(struct sadb_x_debug));
+ SENDERR(EINVAL);
+ }
+
+ for(i = 0; i < 4; i++) {
+ if(pfkey_x_debug->sadb_x_debug_reserved[i]) {
+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM,
+ "pfkey_x_debug_parse: "
+ "reserved[%d]=%d must be set to zero.\n",
+ i, pfkey_x_debug->sadb_x_debug_reserved[i]);
+ SENDERR(EINVAL);
+ }
+ }
+
+errlab:
+ return error;
+}
+
+#ifdef NAT_TRAVERSAL
+DEBUG_NO_STATIC int
+pfkey_x_ext_nat_t_type_parse(struct sadb_ext *pfkey_ext)
+{
+ return 0;
+}
+DEBUG_NO_STATIC int
+pfkey_x_ext_nat_t_port_parse(struct sadb_ext *pfkey_ext)
+{
+ return 0;
+}
+#endif
+
+#define DEFINEPARSER(NAME) static struct pf_key_ext_parsers_def NAME##_def={NAME, #NAME};
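+/* For example, DEFINEPARSER(pfkey_sa_parse) expands to:
+ *   static struct pf_key_ext_parsers_def pfkey_sa_parse_def = {pfkey_sa_parse, "pfkey_sa_parse"};
+ */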
+
+DEFINEPARSER(pfkey_sa_parse);
+DEFINEPARSER(pfkey_lifetime_parse);
+DEFINEPARSER(pfkey_address_parse);
+DEFINEPARSER(pfkey_key_parse);
+DEFINEPARSER(pfkey_ident_parse);
+DEFINEPARSER(pfkey_sens_parse);
+DEFINEPARSER(pfkey_prop_parse);
+DEFINEPARSER(pfkey_supported_parse);
+DEFINEPARSER(pfkey_spirange_parse);
+DEFINEPARSER(pfkey_x_kmprivate_parse);
+DEFINEPARSER(pfkey_x_satype_parse);
+DEFINEPARSER(pfkey_x_ext_debug_parse);
+#ifdef NAT_TRAVERSAL
+DEFINEPARSER(pfkey_x_ext_nat_t_type_parse);
+DEFINEPARSER(pfkey_x_ext_nat_t_port_parse);
+#endif
+
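+/* Default dispatch table, indexed by sadb_ext_type: pfkey_msg_parse() below
+ * invokes ext_parsers[sadb_ext_type]->parser() for each extension header, so
+ * the repeated entries simply share one parser across the related lifetime,
+ * address, key, identity and supported extension types. */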
+struct pf_key_ext_parsers_def *ext_default_parsers[]=
+{
+ NULL, /* pfkey_msg_parse, */
+ &pfkey_sa_parse_def,
+ &pfkey_lifetime_parse_def,
+ &pfkey_lifetime_parse_def,
+ &pfkey_lifetime_parse_def,
+ &pfkey_address_parse_def,
+ &pfkey_address_parse_def,
+ &pfkey_address_parse_def,
+ &pfkey_key_parse_def,
+ &pfkey_key_parse_def,
+ &pfkey_ident_parse_def,
+ &pfkey_ident_parse_def,
+ &pfkey_sens_parse_def,
+ &pfkey_prop_parse_def,
+ &pfkey_supported_parse_def,
+ &pfkey_supported_parse_def,
+ &pfkey_spirange_parse_def,
+ &pfkey_x_kmprivate_parse_def,
+ &pfkey_x_satype_parse_def,
+ &pfkey_sa_parse_def,
+ &pfkey_address_parse_def,
+ &pfkey_address_parse_def,
+ &pfkey_address_parse_def,
+ &pfkey_address_parse_def,
+ &pfkey_address_parse_def,
+ &pfkey_x_ext_debug_parse_def
+#ifdef NAT_TRAVERSAL
+ ,
+ &pfkey_x_ext_nat_t_type_parse_def,
+ &pfkey_x_ext_nat_t_port_parse_def,
+ &pfkey_x_ext_nat_t_port_parse_def,
+ &pfkey_address_parse_def
+#endif
+};
+
+int
+pfkey_msg_parse(struct sadb_msg *pfkey_msg,
+ struct pf_key_ext_parsers_def *ext_parsers[],
+ struct sadb_ext *extensions[],
+ int dir)
+{
+ int error = 0;
+ int remain;
+ struct sadb_ext *pfkey_ext;
+ int extensions_seen = 0;
+
+ DEBUGGING(PF_KEY_DEBUG_PARSE_STRUCT,
+ "pfkey_msg_parse: "
+ "parsing message ver=%d, type=%d(%s), errno=%d, satype=%d(%s), len=%d, res=%d, seq=%d, pid=%d.\n",
+ pfkey_msg->sadb_msg_version,
+ pfkey_msg->sadb_msg_type,
+ pfkey_v2_sadb_type_string(pfkey_msg->sadb_msg_type),
+ pfkey_msg->sadb_msg_errno,
+ pfkey_msg->sadb_msg_satype,
+ satype2name(pfkey_msg->sadb_msg_satype),
+ pfkey_msg->sadb_msg_len,
+ pfkey_msg->sadb_msg_reserved,
+ pfkey_msg->sadb_msg_seq,
+ pfkey_msg->sadb_msg_pid);
+
+ if(ext_parsers == NULL) ext_parsers = ext_default_parsers;
+
+ pfkey_extensions_init(extensions);
+
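+	/* sadb_msg_len, sadb_ext_len and hence 'remain' are all counted in
+	 * IPSEC_PFKEYv2_ALIGN-byte units, not bytes. */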
+ remain = pfkey_msg->sadb_msg_len;
+ remain -= sizeof(struct sadb_msg) / IPSEC_PFKEYv2_ALIGN;
+
+ pfkey_ext = (struct sadb_ext*)((char*)pfkey_msg +
+ sizeof(struct sadb_msg));
+
+ extensions[0] = (struct sadb_ext *) pfkey_msg;
+
+
+ if(pfkey_msg->sadb_msg_version != PF_KEY_V2) {
+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM,
+ "pfkey_msg_parse: "
+ "not PF_KEY_V2 msg, found %d, should be %d.\n",
+ pfkey_msg->sadb_msg_version,
+ PF_KEY_V2);
+ SENDERR(EINVAL);
+ }
+
+ if(!pfkey_msg->sadb_msg_type) {
+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM,
+ "pfkey_msg_parse: "
+			  "msg type not set, must be non-zero.\n");
+ SENDERR(EINVAL);
+ }
+
+ if(pfkey_msg->sadb_msg_type > SADB_MAX) {
+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM,
+ "pfkey_msg_parse: "
+ "msg type=%d > max=%d.\n",
+ pfkey_msg->sadb_msg_type,
+ SADB_MAX);
+ SENDERR(EINVAL);
+ }
+
+ switch(pfkey_msg->sadb_msg_type) {
+ case SADB_GETSPI:
+ case SADB_UPDATE:
+ case SADB_ADD:
+ case SADB_DELETE:
+ case SADB_GET:
+ case SADB_X_GRPSA:
+ case SADB_X_ADDFLOW:
+ if(!satype2proto(pfkey_msg->sadb_msg_satype)) {
+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM,
+ "pfkey_msg_parse: "
+ "satype %d conversion to proto failed for msg_type %d (%s).\n",
+ pfkey_msg->sadb_msg_satype,
+ pfkey_msg->sadb_msg_type,
+ pfkey_v2_sadb_type_string(pfkey_msg->sadb_msg_type));
+ SENDERR(EINVAL);
+ } else {
+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM,
+ "pfkey_msg_parse: "
+ "satype %d(%s) conversion to proto gives %d for msg_type %d(%s).\n",
+ pfkey_msg->sadb_msg_satype,
+ satype2name(pfkey_msg->sadb_msg_satype),
+ satype2proto(pfkey_msg->sadb_msg_satype),
+ pfkey_msg->sadb_msg_type,
+ pfkey_v2_sadb_type_string(pfkey_msg->sadb_msg_type));
+ }
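+		/* FALLTHROUGH: the message types above must also pass the
+		 * non-zero satype check below */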
+ case SADB_ACQUIRE:
+ case SADB_REGISTER:
+ case SADB_EXPIRE:
+ if(!pfkey_msg->sadb_msg_satype) {
+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM,
+ "pfkey_msg_parse: "
+ "satype is zero, must be non-zero for msg_type %d(%s).\n",
+ pfkey_msg->sadb_msg_type,
+ pfkey_v2_sadb_type_string(pfkey_msg->sadb_msg_type));
+ SENDERR(EINVAL);
+ }
+ default:
+ break;
+ }
+
+ /* errno must not be set in downward messages */
+ /* this is not entirely true... a response to an ACQUIRE could return an error */
+ if((dir == EXT_BITS_IN) && (pfkey_msg->sadb_msg_type != SADB_ACQUIRE) && pfkey_msg->sadb_msg_errno) {
+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM,
+ "pfkey_msg_parse: "
+ "errno set to %d.\n",
+ pfkey_msg->sadb_msg_errno);
+ SENDERR(EINVAL);
+ }
+
+ DEBUGGING(PF_KEY_DEBUG_PARSE_FLOW,
+ "pfkey_msg_parse: "
+ "remain=%d, ext_type=%d(%s), ext_len=%d.\n",
+ remain,
+ pfkey_ext->sadb_ext_type,
+ pfkey_v2_sadb_ext_string(pfkey_ext->sadb_ext_type),
+ pfkey_ext->sadb_ext_len);
+
+ DEBUGGING(PF_KEY_DEBUG_PARSE_FLOW,
+ "pfkey_msg_parse: "
+ "extensions permitted=%08x, required=%08x.\n",
+ extensions_bitmaps[dir][EXT_BITS_PERM][pfkey_msg->sadb_msg_type],
+ extensions_bitmaps[dir][EXT_BITS_REQ][pfkey_msg->sadb_msg_type]);
+
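+	/* extensions_seen is a bitmask over sadb_ext_type values; bit 0
+	 * (SADB_EXT_RESERVED) represents the sadb_msg header itself, which was
+	 * stored in extensions[0] above. */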
+ extensions_seen = 1;
+
+ while( (remain * IPSEC_PFKEYv2_ALIGN) >= sizeof(struct sadb_ext) ) {
+ /* Is there enough message left to support another extension header? */
+ if(remain < pfkey_ext->sadb_ext_len) {
+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM,
+ "pfkey_msg_parse: "
+ "remain %d less than ext len %d.\n",
+ remain, pfkey_ext->sadb_ext_len);
+ SENDERR(EINVAL);
+ }
+
+ DEBUGGING(PF_KEY_DEBUG_PARSE_FLOW,
+ "pfkey_msg_parse: "
+ "parsing ext type=%d(%s) remain=%d.\n",
+ pfkey_ext->sadb_ext_type,
+ pfkey_v2_sadb_ext_string(pfkey_ext->sadb_ext_type),
+ remain);
+
+ /* Is the extension header type valid? */
+ if((pfkey_ext->sadb_ext_type > SADB_EXT_MAX) || (!pfkey_ext->sadb_ext_type)) {
+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM,
+ "pfkey_msg_parse: "
+ "ext type %d(%s) invalid, SADB_EXT_MAX=%d.\n",
+ pfkey_ext->sadb_ext_type,
+ pfkey_v2_sadb_ext_string(pfkey_ext->sadb_ext_type),
+ SADB_EXT_MAX);
+ SENDERR(EINVAL);
+ }
+
+ /* Have we already seen this type of extension? */
+ if((extensions_seen & ( 1 << pfkey_ext->sadb_ext_type )) != 0)
+ {
+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM,
+ "pfkey_msg_parse: "
+ "ext type %d(%s) already seen.\n",
+ pfkey_ext->sadb_ext_type,
+ pfkey_v2_sadb_ext_string(pfkey_ext->sadb_ext_type));
+ SENDERR(EINVAL);
+ }
+
+ /* Do I even know about this type of extension? */
+ if(ext_parsers[pfkey_ext->sadb_ext_type]==NULL) {
+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM,
+ "pfkey_msg_parse: "
+ "ext type %d(%s) unknown, ignoring.\n",
+ pfkey_ext->sadb_ext_type,
+ pfkey_v2_sadb_ext_string(pfkey_ext->sadb_ext_type));
+ goto next_ext;
+ }
+
+ /* Is this type of extension permitted for this type of message? */
+ if(!(extensions_bitmaps[dir][EXT_BITS_PERM][pfkey_msg->sadb_msg_type] &
+ 1<<pfkey_ext->sadb_ext_type)) {
+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM,
+ "pfkey_msg_parse: "
+ "ext type %d(%s) not permitted, exts_perm_in=%08x, 1<<type=%08x\n",
+ pfkey_ext->sadb_ext_type,
+ pfkey_v2_sadb_ext_string(pfkey_ext->sadb_ext_type),
+ extensions_bitmaps[dir][EXT_BITS_PERM][pfkey_msg->sadb_msg_type],
+ 1<<pfkey_ext->sadb_ext_type);
+ SENDERR(EINVAL);
+ }
+
+ DEBUGGING(PF_KEY_DEBUG_PARSE_STRUCT,
+ "pfkey_msg_parse: "
+ "remain=%d ext_type=%d(%s) ext_len=%d parsing ext 0p%p with parser %s.\n",
+ remain,
+ pfkey_ext->sadb_ext_type,
+ pfkey_v2_sadb_ext_string(pfkey_ext->sadb_ext_type),
+ pfkey_ext->sadb_ext_len,
+ pfkey_ext,
+ ext_parsers[pfkey_ext->sadb_ext_type]->parser_name);
+
+ /* Parse the extension */
+ if((error =
+ (*ext_parsers[pfkey_ext->sadb_ext_type]->parser)(pfkey_ext))) {
+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM,
+ "pfkey_msg_parse: "
+ "extension parsing for type %d(%s) failed with error %d.\n",
+ pfkey_ext->sadb_ext_type,
+ pfkey_v2_sadb_ext_string(pfkey_ext->sadb_ext_type),
+ error);
+ SENDERR(-error);
+ }
+ DEBUGGING(PF_KEY_DEBUG_PARSE_FLOW,
+ "pfkey_msg_parse: "
+ "Extension %d(%s) parsed.\n",
+ pfkey_ext->sadb_ext_type,
+ pfkey_v2_sadb_ext_string(pfkey_ext->sadb_ext_type));
+
+ /* Mark that we have seen this extension and remember the header location */
+ extensions_seen |= ( 1 << pfkey_ext->sadb_ext_type );
+ extensions[pfkey_ext->sadb_ext_type] = pfkey_ext;
+
+ next_ext:
+ /* Calculate how much message remains */
+ remain -= pfkey_ext->sadb_ext_len;
+
+ if(!remain) {
+ break;
+ }
+ /* Find the next extension header */
+ pfkey_ext = (struct sadb_ext*)((char*)pfkey_ext +
+ pfkey_ext->sadb_ext_len * IPSEC_PFKEYv2_ALIGN);
+ }
+
+ if(remain) {
+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM,
+ "pfkey_msg_parse: "
+ "unexpected remainder of %d.\n",
+ remain);
+ /* why is there still something remaining? */
+ SENDERR(EINVAL);
+ }
+
+ /* check required extensions */
+ DEBUGGING(PF_KEY_DEBUG_PARSE_STRUCT,
+ "pfkey_msg_parse: "
+ "extensions permitted=%08x, seen=%08x, required=%08x.\n",
+ extensions_bitmaps[dir][EXT_BITS_PERM][pfkey_msg->sadb_msg_type],
+ extensions_seen,
+ extensions_bitmaps[dir][EXT_BITS_REQ][pfkey_msg->sadb_msg_type]);
+
+ /* don't check further if it is an error return message since it
+ may not have a body */
+ if(pfkey_msg->sadb_msg_errno) {
+ SENDERR(-error);
+ }
+
+ if((extensions_seen &
+ extensions_bitmaps[dir][EXT_BITS_REQ][pfkey_msg->sadb_msg_type]) !=
+ extensions_bitmaps[dir][EXT_BITS_REQ][pfkey_msg->sadb_msg_type]) {
+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM,
+ "pfkey_msg_parse: "
+ "required extensions missing:%08x.\n",
+ extensions_bitmaps[dir][EXT_BITS_REQ][pfkey_msg->sadb_msg_type] -
+ (extensions_seen &
+ extensions_bitmaps[dir][EXT_BITS_REQ][pfkey_msg->sadb_msg_type]));
+ SENDERR(EINVAL);
+ }
+
+ if((dir == EXT_BITS_IN) && (pfkey_msg->sadb_msg_type == SADB_X_DELFLOW)
+ && ((extensions_seen & SADB_X_EXT_ADDRESS_DELFLOW)
+ != SADB_X_EXT_ADDRESS_DELFLOW)
+ && (((extensions_seen & (1<<SADB_EXT_SA)) != (1<<SADB_EXT_SA))
+ || ((((struct sadb_sa*)extensions[SADB_EXT_SA])->sadb_sa_flags
+ & SADB_X_SAFLAGS_CLEARFLOW)
+ != SADB_X_SAFLAGS_CLEARFLOW))) {
+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM,
+ "pfkey_msg_parse: "
+ "required SADB_X_DELFLOW extensions missing: either %08x must be present or %08x must be present with SADB_X_SAFLAGS_CLEARFLOW set.\n",
+ SADB_X_EXT_ADDRESS_DELFLOW
+ - (extensions_seen & SADB_X_EXT_ADDRESS_DELFLOW),
+ (1<<SADB_EXT_SA) - (extensions_seen & (1<<SADB_EXT_SA)));
+ SENDERR(EINVAL);
+ }
+
+ switch(pfkey_msg->sadb_msg_type) {
+ case SADB_ADD:
+ case SADB_UPDATE:
+ /* check maturity */
+ if(((struct sadb_sa*)extensions[SADB_EXT_SA])->sadb_sa_state !=
+ SADB_SASTATE_MATURE) {
+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM,
+ "pfkey_msg_parse: "
+ "state=%d for add or update should be MATURE=%d.\n",
+ ((struct sadb_sa*)extensions[SADB_EXT_SA])->sadb_sa_state,
+ SADB_SASTATE_MATURE);
+ SENDERR(EINVAL);
+ }
+
+ /* check AH and ESP */
+ switch(((struct sadb_msg*)extensions[SADB_EXT_RESERVED])->sadb_msg_satype) {
+ case SADB_SATYPE_AH:
+ if(!(((struct sadb_sa*)extensions[SADB_EXT_SA]) &&
+ ((struct sadb_sa*)extensions[SADB_EXT_SA])->sadb_sa_auth !=
+ SADB_AALG_NONE)) {
+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM,
+ "pfkey_msg_parse: "
+ "auth alg is zero, must be non-zero for AH SAs.\n");
+ SENDERR(EINVAL);
+ }
+ if(((struct sadb_sa*)(extensions[SADB_EXT_SA]))->sadb_sa_encrypt !=
+ SADB_EALG_NONE) {
+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM,
+ "pfkey_msg_parse: "
+ "AH handed encalg=%d, must be zero.\n",
+ ((struct sadb_sa*)(extensions[SADB_EXT_SA]))->sadb_sa_encrypt);
+ SENDERR(EINVAL);
+ }
+ break;
+ case SADB_SATYPE_ESP:
+ if(!(((struct sadb_sa*)extensions[SADB_EXT_SA]) &&
+ ((struct sadb_sa*)extensions[SADB_EXT_SA])->sadb_sa_encrypt !=
+ SADB_EALG_NONE)) {
+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM,
+ "pfkey_msg_parse: "
+ "encrypt alg=%d is zero, must be non-zero for ESP=%d SAs.\n",
+ ((struct sadb_sa*)extensions[SADB_EXT_SA])->sadb_sa_encrypt,
+ ((struct sadb_msg*)extensions[SADB_EXT_RESERVED])->sadb_msg_satype);
+ SENDERR(EINVAL);
+ }
+ if((((struct sadb_sa*)(extensions[SADB_EXT_SA]))->sadb_sa_encrypt ==
+ SADB_EALG_NULL) &&
+ (((struct sadb_sa*)(extensions[SADB_EXT_SA]))->sadb_sa_auth ==
+ SADB_AALG_NONE) ) {
+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM,
+ "pfkey_msg_parse: "
+ "ESP handed encNULL+authNONE, illegal combination.\n");
+ SENDERR(EINVAL);
+ }
+ break;
+ case SADB_X_SATYPE_COMP:
+ if(!(((struct sadb_sa*)extensions[SADB_EXT_SA]) &&
+ ((struct sadb_sa*)extensions[SADB_EXT_SA])->sadb_sa_encrypt !=
+ SADB_EALG_NONE)) {
+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM,
+ "pfkey_msg_parse: "
+ "encrypt alg=%d is zero, must be non-zero for COMP=%d SAs.\n",
+ ((struct sadb_sa*)extensions[SADB_EXT_SA])->sadb_sa_encrypt,
+ ((struct sadb_msg*)extensions[SADB_EXT_RESERVED])->sadb_msg_satype);
+ SENDERR(EINVAL);
+ }
+ if(((struct sadb_sa*)(extensions[SADB_EXT_SA]))->sadb_sa_auth !=
+ SADB_AALG_NONE) {
+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM,
+ "pfkey_msg_parse: "
+ "COMP handed auth=%d, must be zero.\n",
+ ((struct sadb_sa*)(extensions[SADB_EXT_SA]))->sadb_sa_auth);
+ SENDERR(EINVAL);
+ }
+ break;
+ default:
+ break;
+ }
+ if(ntohl(((struct sadb_sa*)(extensions[SADB_EXT_SA]))->sadb_sa_spi) <= 255) {
+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM,
+ "pfkey_msg_parse: "
+ "spi=%08x must be > 255.\n",
+ ntohl(((struct sadb_sa*)(extensions[SADB_EXT_SA]))->sadb_sa_spi));
+ SENDERR(EINVAL);
+ }
+ default:
+ break;
+ }
+errlab:
+
+ return error;
+}
+
+/*
+ * $Log: pfkey_v2_parse.c,v $
+ * Revision 1.53 2003/01/30 02:32:09 rgb
+ *
+ * Rename SAref table macro names for clarity.
+ * Convert IPsecSAref_t from signed to unsigned to fix apparent SAref exhaustion bug.
+ *
+ * Revision 1.52 2002/12/30 06:53:07 mcr
+ * deal with short SA structures... #if 0 out for now. Probably
+ * not quite the right way.
+ *
+ * Revision 1.51 2002/12/13 18:16:02 mcr
+ * restored sa_ref code
+ *
+ * Revision 1.50 2002/12/13 18:06:52 mcr
+ * temporarily removed sadb_x_sa_ref reference for 2.xx
+ *
+ * Revision 1.49 2002/10/05 05:02:58 dhr
+ *
+ * C labels go on statements
+ *
+ * Revision 1.48 2002/09/20 15:40:45 rgb
+ * Added sadb_x_sa_ref to struct sadb_sa.
+ *
+ * Revision 1.47 2002/09/20 05:01:31 rgb
+ * Fixed usage of pfkey_lib_debug.
+ * Format for function declaration style consistency.
+ * Added text labels to elucidate numeric values presented.
+ * Re-organised debug output to reduce noise in output.
+ *
+ * Revision 1.46 2002/07/24 18:44:54 rgb
+ * Type fiddling to tame ia64 compiler.
+ *
+ * Revision 1.45 2002/05/23 07:14:11 rgb
+ * Cleaned up %p variants to 0p%p for test suite cleanup.
+ *
+ * Revision 1.44 2002/04/24 07:55:32 mcr
+ * #include patches and Makefiles for post-reorg compilation.
+ *
+ * Revision 1.43 2002/04/24 07:36:40 mcr
+ * Moved from ./lib/pfkey_v2_parse.c,v
+ *
+ * Revision 1.42 2002/01/29 22:25:36 rgb
+ * Re-add ipsec_kversion.h to keep MALLOC happy.
+ *
+ * Revision 1.41 2002/01/29 01:59:10 mcr
+ * removal of kversions.h - sources that needed it now use ipsec_param.h.
+ * updating of IPv6 structures to match latest in6.h version.
+ * removed dead code from freeswan.h that also duplicated kversions.h
+ * code.
+ *
+ * Revision 1.40 2002/01/20 20:34:50 mcr
+ * added pfkey_v2_sadb_type_string to decode sadb_type to string.
+ *
+ * Revision 1.39 2001/11/27 05:29:22 mcr
+ * pfkey parsers are now maintained by a structure
+ * that includes their name for debug purposes.
+ * DEBUGGING() macro changed so that it takes a debug
+ * level so that pf_key() can use this to decode the
+ * structures without inundating humans.
+ * Also uses pfkey_v2_sadb_ext_string() in messages.
+ *
+ * Revision 1.38 2001/11/06 19:47:47 rgb
+ * Added packet parameter to lifetime and comb structures.
+ *
+ * Revision 1.37 2001/10/18 04:45:24 rgb
+ * 2.4.9 kernel deprecates linux/malloc.h in favour of linux/slab.h,
+ * lib/freeswan.h version macros moved to lib/kversions.h.
+ * Other compiler directive cleanups.
+ *
+ * Revision 1.36 2001/06/14 19:35:16 rgb
+ * Update copyright date.
+ *
+ * Revision 1.35 2001/05/03 19:44:51 rgb
+ * Standardise on SENDERR() macro.
+ *
+ * Revision 1.34 2001/03/16 07:41:51 rgb
+ * Put freeswan.h include before pluto includes.
+ *
+ * Revision 1.33 2001/02/27 07:13:51 rgb
+ * Added satype2name() function.
+ * Added text to default satype_tbl entry.
+ * Added satype2name() conversions for most satype debug output.
+ *
+ * Revision 1.32 2001/02/26 20:01:09 rgb
+ * Added internal IP protocol 61 for magic SAs.
+ * Ditch unused sadb_satype2proto[], replaced by satype2proto().
+ * Re-formatted debug output (split lines, consistent spacing).
+ * Removed acquire, register and expire requirements for a known satype.
+ * Changed message type checking to a switch structure.
+ * Verify expected NULL auth for IPCOMP.
+ * Enforced spi > 0x100 requirement, now that pass uses a magic SA for
+ * appropriate message types.
+ *
+ * Revision 1.31 2000/12/01 07:09:00 rgb
+ * Added ipcomp sanity check to require encalgo is set.
+ *
+ * Revision 1.30 2000/11/17 18:10:30 rgb
+ * Fixed bugs mostly relating to spirange, to treat all spi variables as
+ * network byte order since this is the way PF_KEYv2 stored spis.
+ *
+ * Revision 1.29 2000/10/12 00:02:39 rgb
+ * Removed 'format, ##' nonsense from debug macros for RH7.0.
+ *
+ * Revision 1.28 2000/09/20 16:23:04 rgb
+ * Remove over-paranoid extension check in the presence of sadb_msg_errno.
+ *
+ * Revision 1.27 2000/09/20 04:04:21 rgb
+ * Changed static functions to DEBUG_NO_STATIC to reveal function names in
+ * oopsen.
+ *
+ * Revision 1.26 2000/09/15 11:37:02 rgb
+ * Merge in heavily modified Svenning Soerensen's <svenning@post5.tele.dk>
+ * IPCOMP zlib deflate code.
+ *
+ * Revision 1.25 2000/09/12 22:35:37 rgb
+ * Restructured to remove unused extensions from CLEARFLOW messages.
+ *
+ * Revision 1.24 2000/09/12 18:59:54 rgb
+ * Added Gerhard's IPv6 support to pfkey parts of libfreeswan.
+ *
+ * Revision 1.23 2000/09/12 03:27:00 rgb
+ * Moved DEBUGGING definition to compile kernel with debug off.
+ *
+ * Revision 1.22 2000/09/09 06:39:27 rgb
+ * Restrict pfkey errno check to downward messages only.
+ *
+ * Revision 1.21 2000/09/08 19:22:34 rgb
+ * Enabled pfkey_sens_parse().
+ * Added check for errno on downward acquire messages only.
+ *
+ * Revision 1.20 2000/09/01 18:48:23 rgb
+ * Fixed reserved check bug and added debug output in
+ * pfkey_supported_parse().
+ * Fixed debug output label bug in pfkey_ident_parse().
+ *
+ * Revision 1.19 2000/08/27 01:55:26 rgb
+ * Define OCTETBITS and PFKEYBITS to avoid using 'magic' numbers in code.
+ *
+ * Revision 1.18 2000/08/24 17:00:36 rgb
+ * Ignore unknown extensions instead of failing.
+ *
+ * Revision 1.17 2000/06/02 22:54:14 rgb
+ * Added Gerhard Gessler's struct sockaddr_storage mods for IPv6 support.
+ *
+ * Revision 1.16 2000/05/10 19:25:11 rgb
+ * Fleshed out proposal and supported extensions.
+ *
+ * Revision 1.15 2000/01/24 21:15:31 rgb
+ * Added disabled pluto pfkey lib debug flag.
+ * Added algo debugging reporting.
+ *
+ * Revision 1.14 2000/01/22 23:24:29 rgb
+ * Added new functions proto2satype() and satype2proto() and lookup
+ * table satype_tbl. Also added proto2name() since it was easy.
+ *
+ * Revision 1.13 2000/01/21 09:43:59 rgb
+ * Cast ntohl(spi) as (unsigned long int) to shut up compiler.
+ *
+ * Revision 1.12 2000/01/21 06:28:19 rgb
+ * Added address cases for eroute flows.
+ * Indented compiler directives for readability.
+ * Added klipsdebug switching capability.
+ *
+ * Revision 1.11 1999/12/29 21:14:59 rgb
+ * Fixed debug text cut and paste typo.
+ *
+ * Revision 1.10 1999/12/10 17:45:24 rgb
+ * Added address debugging.
+ *
+ * Revision 1.9 1999/12/09 23:11:42 rgb
+ * Ditched <string.h> include since we no longer use memset().
+ * Use new pfkey_extensions_init() instead of memset().
+ * Added check for SATYPE in pfkey_msg_build().
+ * Tidy up comments and debugging comments.
+ *
+ * Revision 1.8 1999/12/07 19:55:26 rgb
+ * Removed unused first argument from extension parsers.
+ * Removed static pluto debug flag.
+ * Moved message type and state checking to pfkey_msg_parse().
+ * Changed print[fk] type from lx to x to quiet compiler.
+ * Removed redundant remain check.
+ * Changed __u* types to uint* to avoid use of asm/types.h and
+ * sys/types.h in userspace code.
+ *
+ * Revision 1.7 1999/12/01 22:20:51 rgb
+ * Moved pfkey_lib_debug variable into the library.
+ * Added pfkey version check into header parsing.
+ * Added check for SATYPE only for those extensions that require a
+ * non-zero value.
+ *
+ * Revision 1.6 1999/11/27 11:58:05 rgb
+ * Added ipv6 headers.
+ * Moved sadb_satype2proto protocol lookup table from
+ * klips/net/ipsec/pfkey_v2_parser.c.
+ * Enable lifetime_current checking.
+ * Debugging error messages added.
+ * Add argument to pfkey_msg_parse() for direction.
+ * Consolidated the 4 1-d extension bitmap arrays into one 4-d array.
+ * Add CVS log entry to bottom of file.
+ * Moved auth and enc alg check to pfkey_msg_parse().
+ * Enable accidentally disabled spirange parsing.
+ * Moved protocol/algorithm checks from klips/net/ipsec/pfkey_v2_parser.c
+ *
+ * Local variables:
+ * c-file-style: "linux"
+ * End:
+ *
+ */
diff --git a/tests/contrib/pfkey_v2_parse.c/orig b/tests/contrib/pfkey_v2_parse.c/orig
new file mode 100644
index 0000000..8d09797
--- /dev/null
+++ b/tests/contrib/pfkey_v2_parse.c/orig
@@ -0,0 +1,1778 @@
+/*
+ * RFC2367 PF_KEYv2 Key management API message parser
+ * Copyright (C) 1999, 2000, 2001 Richard Guy Briggs.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version. See <http://www.fsf.org/copyleft/gpl.txt>.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * RCSID $Id: pfkey_v2_parse.c,v 1.53 2003/01/30 02:32:09 rgb Exp $
+ */
+
+/*
+ * Template from klips/net/ipsec/ipsec/ipsec_parser.c.
+ */
+
+char pfkey_v2_parse_c_version[] = "$Id: pfkey_v2_parse.c,v 1.53 2003/01/30 02:32:09 rgb Exp $";
+
+/*
+ * Some ugly stuff to allow consistent debugging code for use in the
+ * kernel and in user space
+*/
+
+#ifdef __KERNEL__
+
+# include <linux/kernel.h> /* for printk */
+
+#include "freeswan/ipsec_kversion.h" /* for malloc switch */
+
+# ifdef MALLOC_SLAB
+# include <linux/slab.h> /* kmalloc() */
+# else /* MALLOC_SLAB */
+# include <linux/malloc.h> /* kmalloc() */
+# endif /* MALLOC_SLAB */
+# include <linux/errno.h> /* error codes */
+# include <linux/types.h> /* size_t */
+# include <linux/interrupt.h> /* mark_bh */
+
+# include <linux/netdevice.h> /* struct device, and other headers */
+# include <linux/etherdevice.h> /* eth_type_trans */
+# include <linux/ip.h> /* struct iphdr */
+# if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+# include <linux/ipv6.h> /* struct ipv6hdr */
+# endif /* if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) */
+extern int debug_pfkey;
+
+# include <freeswan.h>
+
+#include "freeswan/ipsec_encap.h"
+
+#else /* __KERNEL__ */
+
+# include <sys/types.h>
+# include <linux/types.h>
+# include <linux/errno.h>
+
+# include <freeswan.h>
+# include "programs/pluto/constants.h"
+# include "programs/pluto/defs.h" /* for PRINTF_LIKE */
+# include "programs/pluto/log.h" /* for debugging and DBG_log */
+
+/* #define PLUTO */
+
+# ifdef PLUTO
+# define DEBUGGING(level, args...) { DBG_log("pfkey_lib_debug:" args); }
+# else
+# define DEBUGGING(level, args...) if(pfkey_lib_debug & level) { printf("pfkey_lib_debug:" args); } else { ; }
+# endif
+
+#endif /* __KERNEL__ */
+
+
+#include <pfkeyv2.h>
+#include <pfkey.h>
+
+#ifdef __KERNEL__
+# include "freeswan/ipsec_netlink.h" /* KLIPS_PRINT */
+extern int sysctl_ipsec_debug_verbose;
+# define DEBUGGING(level, args...) \
+ KLIPS_PRINT( \
+ ((debug_pfkey & level & (PF_KEY_DEBUG_PARSE_STRUCT | PF_KEY_DEBUG_PARSE_PROBLEM)) \
+ || (sysctl_ipsec_debug_verbose && (debug_pfkey & level & PF_KEY_DEBUG_PARSE_FLOW))) \
+ , "klips_debug:" args)
+#endif /* __KERNEL__ */
+#include "freeswan/ipsec_sa.h" /* IPSEC_SAREF_NULL, IPSEC_SA_REF_TABLE_IDX_WIDTH */
+
+
+#define SENDERR(_x) do { error = -(_x); goto errlab; } while (0)
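+/* SENDERR(x): record the error as -(x) and jump to the errlab cleanup label
+ * at the end of the enclosing parser function. */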
+
+struct satype_tbl {
+ uint8_t proto;
+ uint8_t satype;
+ char* name;
+} static satype_tbl[] = {
+#ifdef __KERNEL__
+ { IPPROTO_ESP, SADB_SATYPE_ESP, "ESP" },
+ { IPPROTO_AH, SADB_SATYPE_AH, "AH" },
+ { IPPROTO_IPIP, SADB_X_SATYPE_IPIP, "IPIP" },
+#ifdef CONFIG_IPSEC_IPCOMP
+ { IPPROTO_COMP, SADB_X_SATYPE_COMP, "COMP" },
+#endif /* CONFIG_IPSEC_IPCOMP */
+ { IPPROTO_INT, SADB_X_SATYPE_INT, "INT" },
+#else /* __KERNEL__ */
+ { SA_ESP, SADB_SATYPE_ESP, "ESP" },
+ { SA_AH, SADB_SATYPE_AH, "AH" },
+ { SA_IPIP, SADB_X_SATYPE_IPIP, "IPIP" },
+ { SA_COMP, SADB_X_SATYPE_COMP, "COMP" },
+ { SA_INT, SADB_X_SATYPE_INT, "INT" },
+#endif /* __KERNEL__ */
+ { 0, 0, "UNKNOWN" }
+};
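+/* The lookup helpers below scan this table linearly; the terminating
+ * { 0, 0, "UNKNOWN" } entry means an unrecognised value maps to 0 and
+ * the name "UNKNOWN". */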
+
+uint8_t
+satype2proto(uint8_t satype)
+{
+ int i =0;
+
+ while(satype_tbl[i].satype != satype && satype_tbl[i].satype != 0) {
+ i++;
+ }
+ return satype_tbl[i].proto;
+}
+
+uint8_t
+proto2satype(uint8_t proto)
+{
+ int i = 0;
+
+ while(satype_tbl[i].proto != proto && satype_tbl[i].proto != 0) {
+ i++;
+ }
+ return satype_tbl[i].satype;
+}
+
+char*
+satype2name(uint8_t satype)
+{
+ int i = 0;
+
+ while(satype_tbl[i].satype != satype && satype_tbl[i].satype != 0) {
+ i++;
+ }
+ return satype_tbl[i].name;
+}
+
+char*
+proto2name(uint8_t proto)
+{
+ int i = 0;
+
+ while(satype_tbl[i].proto != proto && satype_tbl[i].proto != 0) {
+ i++;
+ }
+ return satype_tbl[i].name;
+}
+
+/* Default extension parsers taken from the KLIPS code */
+
+DEBUG_NO_STATIC int
+pfkey_sa_parse(struct sadb_ext *pfkey_ext)
+{
+ int error = 0;
+ struct sadb_sa *pfkey_sa = (struct sadb_sa *)pfkey_ext;
+#if 0
+ struct sadb_sa sav2;
+#endif
+
+ DEBUGGING(PF_KEY_DEBUG_PARSE_FLOW,
+ "pfkey_sa_parse: entry\n");
+ /* sanity checks... */
+ if(!pfkey_sa) {
+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM,
+ "pfkey_sa_parse: "
+ "NULL pointer passed in.\n");
+ SENDERR(EINVAL);
+ }
+
+#if 0
+ /* check if this structure is short, and if so, fix it up.
+ * XXX this is NOT the way to do things.
+ */
+ if(pfkey_sa->sadb_sa_len == sizeof(struct sadb_sa_v1)/IPSEC_PFKEYv2_ALIGN) {
+
+ /* yes, so clear out a temporary structure, and copy first */
+ memset(&sav2, 0, sizeof(sav2));
+ memcpy(&sav2, pfkey_sa, sizeof(struct sadb_sa_v1));
+ sav2.sadb_x_sa_ref=-1;
+ sav2.sadb_sa_len = sizeof(struct sadb_sa) / IPSEC_PFKEYv2_ALIGN;
+
+ pfkey_sa = &sav2;
+ }
+#endif
+
+
+ if(pfkey_sa->sadb_sa_len != sizeof(struct sadb_sa) / IPSEC_PFKEYv2_ALIGN) {
+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM,
+ "pfkey_sa_parse: "
+ "length wrong pfkey_sa->sadb_sa_len=%d sizeof(struct sadb_sa)=%d.\n",
+ pfkey_sa->sadb_sa_len,
+ (int)sizeof(struct sadb_sa));
+ SENDERR(EINVAL);
+ }
+
+ if(pfkey_sa->sadb_sa_encrypt > SADB_EALG_MAX) {
+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM,
+ "pfkey_sa_parse: "
+ "pfkey_sa->sadb_sa_encrypt=%d > SADB_EALG_MAX=%d.\n",
+ pfkey_sa->sadb_sa_encrypt,
+ SADB_EALG_MAX);
+ SENDERR(EINVAL);
+ }
+
+ if(pfkey_sa->sadb_sa_auth > SADB_AALG_MAX) {
+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM,
+ "pfkey_sa_parse: "
+ "pfkey_sa->sadb_sa_auth=%d > SADB_AALG_MAX=%d.\n",
+ pfkey_sa->sadb_sa_auth,
+ SADB_AALG_MAX);
+ SENDERR(EINVAL);
+ }
+
+ if(pfkey_sa->sadb_sa_state > SADB_SASTATE_MAX) {
+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM,
+ "pfkey_sa_parse: "
+ "state=%d exceeds MAX=%d.\n",
+ pfkey_sa->sadb_sa_state,
+ SADB_SASTATE_MAX);
+ SENDERR(EINVAL);
+ }
+
+ if(pfkey_sa->sadb_sa_state == SADB_SASTATE_DEAD) {
+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM,
+ "pfkey_sa_parse: "
+ "state=%d is DEAD=%d.\n",
+ pfkey_sa->sadb_sa_state,
+ SADB_SASTATE_DEAD);
+ SENDERR(EINVAL);
+ }
+
+ if(pfkey_sa->sadb_sa_replay > 64) {
+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM,
+ "pfkey_sa_parse: "
+ "replay window size: %d -- must be 0 <= size <= 64\n",
+ pfkey_sa->sadb_sa_replay);
+ SENDERR(EINVAL);
+ }
+
+ if(! ((pfkey_sa->sadb_sa_exttype == SADB_EXT_SA) ||
+ (pfkey_sa->sadb_sa_exttype == SADB_X_EXT_SA2)))
+ {
+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM,
+ "pfkey_sa_parse: "
+ "unknown exttype=%d, expecting SADB_EXT_SA=%d or SADB_X_EXT_SA2=%d.\n",
+ pfkey_sa->sadb_sa_exttype,
+ SADB_EXT_SA,
+ SADB_X_EXT_SA2);
+ SENDERR(EINVAL);
+ }
+
+ if((IPSEC_SAREF_NULL != pfkey_sa->sadb_x_sa_ref) && (pfkey_sa->sadb_x_sa_ref >= (1 << IPSEC_SA_REF_TABLE_IDX_WIDTH))) {
+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM,
+ "pfkey_sa_parse: "
+ "SAref=%d must be (SAref == IPSEC_SAREF_NULL(%d) || SAref < IPSEC_SA_REF_TABLE_NUM_ENTRIES(%d)).\n",
+ pfkey_sa->sadb_x_sa_ref,
+ IPSEC_SAREF_NULL,
+ IPSEC_SA_REF_TABLE_NUM_ENTRIES);
+ SENDERR(EINVAL);
+ }
+
+ DEBUGGING(PF_KEY_DEBUG_PARSE_STRUCT,
+ "pfkey_sa_parse: "
+ "successfully found len=%d exttype=%d(%s) spi=%08lx replay=%d state=%d auth=%d encrypt=%d flags=%d ref=%d.\n",
+ pfkey_sa->sadb_sa_len,
+ pfkey_sa->sadb_sa_exttype,
+ pfkey_v2_sadb_ext_string(pfkey_sa->sadb_sa_exttype),
+ (long unsigned int)ntohl(pfkey_sa->sadb_sa_spi),
+ pfkey_sa->sadb_sa_replay,
+ pfkey_sa->sadb_sa_state,
+ pfkey_sa->sadb_sa_auth,
+ pfkey_sa->sadb_sa_encrypt,
+ pfkey_sa->sadb_sa_flags,
+ pfkey_sa->sadb_x_sa_ref);
+
+ errlab:
+ return error;
+}
+
+DEBUG_NO_STATIC int
+pfkey_lifetime_parse(struct sadb_ext *pfkey_ext)
+{
+ int error = 0;
+ struct sadb_lifetime *pfkey_lifetime = (struct sadb_lifetime *)pfkey_ext;
+
+ DEBUGGING(PF_KEY_DEBUG_PARSE_FLOW,
+ "pfkey_lifetime_parse:enter\n");
+ /* sanity checks... */
+ if(!pfkey_lifetime) {
+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM,
+ "pfkey_lifetime_parse: "
+ "NULL pointer passed in.\n");
+ SENDERR(EINVAL);
+ }
+
+ if(pfkey_lifetime->sadb_lifetime_len !=
+ sizeof(struct sadb_lifetime) / IPSEC_PFKEYv2_ALIGN) {
+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM,
+ "pfkey_lifetime_parse: "
+ "length wrong pfkey_lifetime->sadb_lifetime_len=%d sizeof(struct sadb_lifetime)=%d.\n",
+ pfkey_lifetime->sadb_lifetime_len,
+ (int)sizeof(struct sadb_lifetime));
+ SENDERR(EINVAL);
+ }
+
+ if((pfkey_lifetime->sadb_lifetime_exttype != SADB_EXT_LIFETIME_HARD) &&
+ (pfkey_lifetime->sadb_lifetime_exttype != SADB_EXT_LIFETIME_SOFT) &&
+ (pfkey_lifetime->sadb_lifetime_exttype != SADB_EXT_LIFETIME_CURRENT)) {
+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM,
+ "pfkey_lifetime_parse: "
+ "unexpected ext_type=%d.\n",
+ pfkey_lifetime->sadb_lifetime_exttype);
+ SENDERR(EINVAL);
+ }
+
+ DEBUGGING(PF_KEY_DEBUG_PARSE_STRUCT,
+ "pfkey_lifetime_parse: "
+ "life_type=%d(%s) alloc=%u bytes=%u add=%u use=%u pkts=%u.\n",
+ pfkey_lifetime->sadb_lifetime_exttype,
+ pfkey_v2_sadb_ext_string(pfkey_lifetime->sadb_lifetime_exttype),
+ pfkey_lifetime->sadb_lifetime_allocations,
+ (unsigned)pfkey_lifetime->sadb_lifetime_bytes,
+ (unsigned)pfkey_lifetime->sadb_lifetime_addtime,
+ (unsigned)pfkey_lifetime->sadb_lifetime_usetime,
+ pfkey_lifetime->sadb_x_lifetime_packets);
+errlab:
+ return error;
+}
+
+DEBUG_NO_STATIC int
+pfkey_address_parse(struct sadb_ext *pfkey_ext)
+{
+ int error = 0;
+ int saddr_len = 0;
+ struct sadb_address *pfkey_address = (struct sadb_address *)pfkey_ext;
+ struct sockaddr* s = (struct sockaddr*)((char*)pfkey_address + sizeof(*pfkey_address));
+ char ipaddr_txt[ADDRTOT_BUF];
+
+ DEBUGGING(PF_KEY_DEBUG_PARSE_FLOW,
+ "pfkey_address_parse:enter\n");
+ /* sanity checks... */
+ if(!pfkey_address) {
+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM,
+ "pfkey_address_parse: "
+ "NULL pointer passed in.\n");
+ SENDERR(EINVAL);
+ }
+
+ if(pfkey_address->sadb_address_len <
+ (sizeof(struct sadb_address) + sizeof(struct sockaddr))/
+ IPSEC_PFKEYv2_ALIGN) {
+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM,
+ "pfkey_address_parse: "
+ "size wrong 1 ext_len=%d, adr_ext_len=%d, saddr_len=%d.\n",
+ pfkey_address->sadb_address_len,
+ (int)sizeof(struct sadb_address),
+ (int)sizeof(struct sockaddr));
+ SENDERR(EINVAL);
+ }
+
+ if(pfkey_address->sadb_address_reserved) {
+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM,
+ "pfkey_address_parse: "
+ "res=%d, must be zero.\n",
+ pfkey_address->sadb_address_reserved);
+ SENDERR(EINVAL);
+ }
+
+ switch(pfkey_address->sadb_address_exttype) {
+ case SADB_EXT_ADDRESS_SRC:
+ case SADB_EXT_ADDRESS_DST:
+ case SADB_EXT_ADDRESS_PROXY:
+ case SADB_X_EXT_ADDRESS_DST2:
+ case SADB_X_EXT_ADDRESS_SRC_FLOW:
+ case SADB_X_EXT_ADDRESS_DST_FLOW:
+ case SADB_X_EXT_ADDRESS_SRC_MASK:
+ case SADB_X_EXT_ADDRESS_DST_MASK:
+#ifdef NAT_TRAVERSAL
+ case SADB_X_EXT_NAT_T_OA:
+#endif
+ break;
+ default:
+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM,
+ "pfkey_address_parse: "
+ "unexpected ext_type=%d.\n",
+ pfkey_address->sadb_address_exttype);
+ SENDERR(EINVAL);
+ }
+
+ switch(s->sa_family) {
+ case AF_INET:
+ saddr_len = sizeof(struct sockaddr_in);
+ sprintf(ipaddr_txt, "%d.%d.%d.%d"
+ , (((struct sockaddr_in*)s)->sin_addr.s_addr >> 0) & 0xFF
+ , (((struct sockaddr_in*)s)->sin_addr.s_addr >> 8) & 0xFF
+ , (((struct sockaddr_in*)s)->sin_addr.s_addr >> 16) & 0xFF
+ , (((struct sockaddr_in*)s)->sin_addr.s_addr >> 24) & 0xFF);
+ DEBUGGING(PF_KEY_DEBUG_PARSE_STRUCT,
+ "pfkey_address_parse: "
+ "found exttype=%u(%s) family=%d(AF_INET) address=%s proto=%u port=%u.\n",
+ pfkey_address->sadb_address_exttype,
+ pfkey_v2_sadb_ext_string(pfkey_address->sadb_address_exttype),
+ s->sa_family,
+ ipaddr_txt,
+ pfkey_address->sadb_address_proto,
+ ((struct sockaddr_in*)s)->sin_port);
+ break;
+ case AF_INET6:
+ saddr_len = sizeof(struct sockaddr_in6);
+ sprintf(ipaddr_txt, "%x:%x:%x:%x:%x:%x:%x:%x"
+ , ntohs(((struct sockaddr_in6*)s)->sin6_addr.s6_addr16[0])
+ , ntohs(((struct sockaddr_in6*)s)->sin6_addr.s6_addr16[1])
+ , ntohs(((struct sockaddr_in6*)s)->sin6_addr.s6_addr16[2])
+ , ntohs(((struct sockaddr_in6*)s)->sin6_addr.s6_addr16[3])
+ , ntohs(((struct sockaddr_in6*)s)->sin6_addr.s6_addr16[4])
+ , ntohs(((struct sockaddr_in6*)s)->sin6_addr.s6_addr16[5])
+ , ntohs(((struct sockaddr_in6*)s)->sin6_addr.s6_addr16[6])
+ , ntohs(((struct sockaddr_in6*)s)->sin6_addr.s6_addr16[7]));
+ DEBUGGING(PF_KEY_DEBUG_PARSE_STRUCT,
+ "pfkey_address_parse: "
+ "found exttype=%u(%s) family=%d(AF_INET6) address=%s proto=%u port=%u.\n",
+ pfkey_address->sadb_address_exttype,
+ pfkey_v2_sadb_ext_string(pfkey_address->sadb_address_exttype),
+ s->sa_family,
+ ipaddr_txt,
+ pfkey_address->sadb_address_proto,
+ ((struct sockaddr_in6*)s)->sin6_port);
+ break;
+ default:
+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM,
+ "pfkey_address_parse: "
+ "s->sa_family=%d not supported.\n",
+ s->sa_family);
+ SENDERR(EPFNOSUPPORT);
+ }
+
+ if(pfkey_address->sadb_address_len !=
+ DIVUP(sizeof(struct sadb_address) + saddr_len, IPSEC_PFKEYv2_ALIGN)) {
+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM,
+ "pfkey_address_parse: "
+ "size wrong 2 ext_len=%d, adr_ext_len=%d, saddr_len=%d.\n",
+ pfkey_address->sadb_address_len,
+ (int)sizeof(struct sadb_address),
+ saddr_len);
+ SENDERR(EINVAL);
+ }
+
+ if(pfkey_address->sadb_address_prefixlen != 0) {
+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM,
+ "pfkey_address_parse: "
+ "address prefixes not supported yet.\n");
+ SENDERR(EAFNOSUPPORT); /* not supported yet */
+ }
+
+ /* XXX check if port!=0 */
+
+ DEBUGGING(PF_KEY_DEBUG_PARSE_FLOW,
+ "pfkey_address_parse: successful.\n");
+ errlab:
+ return error;
+}
+
+DEBUG_NO_STATIC int
+pfkey_key_parse(struct sadb_ext *pfkey_ext)
+{
+ int error = 0;
+ struct sadb_key *pfkey_key = (struct sadb_key *)pfkey_ext;
+
+ DEBUGGING(PF_KEY_DEBUG_PARSE_FLOW,
+ "pfkey_key_parse:enter\n");
+ /* sanity checks... */
+
+ if(!pfkey_key) {
+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM,
+ "pfkey_key_parse: "
+ "NULL pointer passed in.\n");
+ SENDERR(EINVAL);
+ }
+
+ if(pfkey_key->sadb_key_len < sizeof(struct sadb_key) / IPSEC_PFKEYv2_ALIGN) {
+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM,
+ "pfkey_key_parse: "
+ "size wrong ext_len=%d, key_ext_len=%d.\n",
+ pfkey_key->sadb_key_len,
+ (int)sizeof(struct sadb_key));
+ SENDERR(EINVAL);
+ }
+
+ if(!pfkey_key->sadb_key_bits) {
+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM,
+ "pfkey_key_parse: "
+ "key length set to zero, must be non-zero.\n");
+ SENDERR(EINVAL);
+ }
+
+ if(pfkey_key->sadb_key_len !=
+ DIVUP(sizeof(struct sadb_key) * OCTETBITS + pfkey_key->sadb_key_bits,
+ PFKEYBITS)) {
+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM,
+ "pfkey_key_parse: "
+ "key length=%d does not agree with extension length=%d.\n",
+ pfkey_key->sadb_key_bits,
+ pfkey_key->sadb_key_len);
+ SENDERR(EINVAL);
+ }
+
+ if(pfkey_key->sadb_key_reserved) {
+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM,
+ "pfkey_key_parse: "
+ "res=%d, must be zero.\n",
+ pfkey_key->sadb_key_reserved);
+ SENDERR(EINVAL);
+ }
+
+ if(! ( (pfkey_key->sadb_key_exttype == SADB_EXT_KEY_AUTH) ||
+ (pfkey_key->sadb_key_exttype == SADB_EXT_KEY_ENCRYPT))) {
+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM,
+ "pfkey_key_parse: "
+ "expecting extension type AUTH or ENCRYPT, got %d.\n",
+ pfkey_key->sadb_key_exttype);
+ SENDERR(EINVAL);
+ }
+
+ DEBUGGING(PF_KEY_DEBUG_PARSE_STRUCT,
+ "pfkey_key_parse: "
+ "success, found len=%d exttype=%d(%s) bits=%d reserved=%d.\n",
+ pfkey_key->sadb_key_len,
+ pfkey_key->sadb_key_exttype,
+ pfkey_v2_sadb_ext_string(pfkey_key->sadb_key_exttype),
+ pfkey_key->sadb_key_bits,
+ pfkey_key->sadb_key_reserved);
+
+errlab:
+ return error;
+}
+
+DEBUG_NO_STATIC int
+pfkey_ident_parse(struct sadb_ext *pfkey_ext)
+{
+ int error = 0;
+ struct sadb_ident *pfkey_ident = (struct sadb_ident *)pfkey_ext;
+
+ /* sanity checks... */
+ if(pfkey_ident->sadb_ident_len < sizeof(struct sadb_ident) / IPSEC_PFKEYv2_ALIGN) {
+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM,
+ "pfkey_ident_parse: "
+ "size wrong ext_len=%d, key_ext_len=%d.\n",
+ pfkey_ident->sadb_ident_len,
+ (int)sizeof(struct sadb_ident));
+ SENDERR(EINVAL);
+ }
+
+ if(pfkey_ident->sadb_ident_type > SADB_IDENTTYPE_MAX) {
+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM,
+ "pfkey_ident_parse: "
+ "ident_type=%d out of range, must be less than %d.\n",
+ pfkey_ident->sadb_ident_type,
+ SADB_IDENTTYPE_MAX);
+ SENDERR(EINVAL);
+ }
+
+ if(pfkey_ident->sadb_ident_reserved) {
+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM,
+ "pfkey_ident_parse: "
+ "res=%d, must be zero.\n",
+ pfkey_ident->sadb_ident_reserved);
+ SENDERR(EINVAL);
+ }
+
+ /* string terminator/padding must be zero */
+ if(pfkey_ident->sadb_ident_len > sizeof(struct sadb_ident) / IPSEC_PFKEYv2_ALIGN) {
+ if(*((char*)pfkey_ident + pfkey_ident->sadb_ident_len * IPSEC_PFKEYv2_ALIGN - 1)) {
+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM,
+ "pfkey_ident_parse: "
+ "string padding must be zero, last is 0x%02x.\n",
+ *((char*)pfkey_ident +
+ pfkey_ident->sadb_ident_len * IPSEC_PFKEYv2_ALIGN - 1));
+ SENDERR(EINVAL);
+ }
+ }
+
+ if( ! ((pfkey_ident->sadb_ident_exttype == SADB_EXT_IDENTITY_SRC) ||
+ (pfkey_ident->sadb_ident_exttype == SADB_EXT_IDENTITY_DST))) {
+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM,
+ "pfkey_key_parse: "
+ "expecting extension type IDENTITY_SRC or IDENTITY_DST, got %d.\n",
+ pfkey_ident->sadb_ident_exttype);
+ SENDERR(EINVAL);
+ }
+
+errlab:
+ return error;
+}
+
+DEBUG_NO_STATIC int
+pfkey_sens_parse(struct sadb_ext *pfkey_ext)
+{
+ int error = 0;
+ struct sadb_sens *pfkey_sens = (struct sadb_sens *)pfkey_ext;
+
+ /* sanity checks... */
+ if(pfkey_sens->sadb_sens_len < sizeof(struct sadb_sens) / IPSEC_PFKEYv2_ALIGN) {
+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM,
+ "pfkey_sens_parse: "
+ "size wrong ext_len=%d, key_ext_len=%d.\n",
+ pfkey_sens->sadb_sens_len,
+ (int)sizeof(struct sadb_sens));
+ SENDERR(EINVAL);
+ }
+
+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM,
+ "pfkey_sens_parse: "
+ "Sorry, I can't parse exttype=%d yet.\n",
+ pfkey_ext->sadb_ext_type);
+#if 0
+ SENDERR(EINVAL); /* don't process these yet */
+#endif
+
+errlab:
+ return error;
+}
+
+DEBUG_NO_STATIC int
+pfkey_prop_parse(struct sadb_ext *pfkey_ext)
+{
+ int error = 0;
+ int i, num_comb;
+ struct sadb_prop *pfkey_prop = (struct sadb_prop *)pfkey_ext;
+ struct sadb_comb *pfkey_comb = (struct sadb_comb *)((char*)pfkey_ext + sizeof(struct sadb_prop));
+
+ /* sanity checks... */
+ if((pfkey_prop->sadb_prop_len < sizeof(struct sadb_prop) / IPSEC_PFKEYv2_ALIGN) ||
+ (((pfkey_prop->sadb_prop_len * IPSEC_PFKEYv2_ALIGN) - sizeof(struct sadb_prop)) % sizeof(struct sadb_comb))) {
+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM,
+ "pfkey_prop_parse: "
+ "size wrong ext_len=%d, prop_ext_len=%d comb_ext_len=%d.\n",
+ pfkey_prop->sadb_prop_len,
+ (int)sizeof(struct sadb_prop),
+ (int)sizeof(struct sadb_comb));
+ SENDERR(EINVAL);
+ }
+
+ if(pfkey_prop->sadb_prop_replay > 64) {
+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM,
+ "pfkey_prop_parse: "
+ "replay window size: %d -- must be 0 <= size <= 64\n",
+ pfkey_prop->sadb_prop_replay);
+ SENDERR(EINVAL);
+ }
+
+ for(i=0; i<3; i++) {
+ if(pfkey_prop->sadb_prop_reserved[i]) {
+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM,
+ "pfkey_prop_parse: "
+ "res[%d]=%d, must be zero.\n",
+ i, pfkey_prop->sadb_prop_reserved[i]);
+ SENDERR(EINVAL);
+ }
+ }
+
+ num_comb = ((pfkey_prop->sadb_prop_len * IPSEC_PFKEYv2_ALIGN) - sizeof(struct sadb_prop)) / sizeof(struct sadb_comb);
+
+ for(i = 0; i < num_comb; i++) {
+ if(pfkey_comb->sadb_comb_auth > SADB_AALG_MAX) {
+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM,
+ "pfkey_prop_parse: "
+ "pfkey_comb[%d]->sadb_comb_auth=%d > SADB_AALG_MAX=%d.\n",
+ i,
+ pfkey_comb->sadb_comb_auth,
+ SADB_AALG_MAX);
+ SENDERR(EINVAL);
+ }
+
+ if(pfkey_comb->sadb_comb_auth) {
+ if(!pfkey_comb->sadb_comb_auth_minbits) {
+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM,
+ "pfkey_prop_parse: "
+ "pfkey_comb[%d]->sadb_comb_auth_minbits=0, fatal.\n",
+ i);
+ SENDERR(EINVAL);
+ }
+ if(!pfkey_comb->sadb_comb_auth_maxbits) {
+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM,
+ "pfkey_prop_parse: "
+ "pfkey_comb[%d]->sadb_comb_auth_maxbits=0, fatal.\n",
+ i);
+ SENDERR(EINVAL);
+ }
+ if(pfkey_comb->sadb_comb_auth_minbits > pfkey_comb->sadb_comb_auth_maxbits) {
+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM,
+ "pfkey_prop_parse: "
+ "pfkey_comb[%d]->sadb_comb_auth_minbits=%d > maxbits=%d, fatal.\n",
+ i,
+ pfkey_comb->sadb_comb_auth_minbits,
+ pfkey_comb->sadb_comb_auth_maxbits);
+ SENDERR(EINVAL);
+ }
+ } else {
+ if(pfkey_comb->sadb_comb_auth_minbits) {
+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM,
+ "pfkey_prop_parse: "
+ "pfkey_comb[%d]->sadb_comb_auth_minbits=%d != 0, fatal.\n",
+ i,
+ pfkey_comb->sadb_comb_auth_minbits);
+ SENDERR(EINVAL);
+ }
+ if(pfkey_comb->sadb_comb_auth_maxbits) {
+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM,
+ "pfkey_prop_parse: "
+ "pfkey_comb[%d]->sadb_comb_auth_maxbits=%d != 0, fatal.\n",
+ i,
+ pfkey_comb->sadb_comb_auth_maxbits);
+ SENDERR(EINVAL);
+ }
+ }
+
+ if(pfkey_comb->sadb_comb_encrypt > SADB_EALG_MAX) {
+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM,
+			  "pfkey_prop_parse: "
+ "pfkey_comb[%d]->sadb_comb_encrypt=%d > SADB_EALG_MAX=%d.\n",
+ i,
+ pfkey_comb->sadb_comb_encrypt,
+ SADB_EALG_MAX);
+ SENDERR(EINVAL);
+ }
+
+ if(pfkey_comb->sadb_comb_encrypt) {
+ if(!pfkey_comb->sadb_comb_encrypt_minbits) {
+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM,
+ "pfkey_prop_parse: "
+ "pfkey_comb[%d]->sadb_comb_encrypt_minbits=0, fatal.\n",
+ i);
+ SENDERR(EINVAL);
+ }
+ if(!pfkey_comb->sadb_comb_encrypt_maxbits) {
+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM,
+ "pfkey_prop_parse: "
+ "pfkey_comb[%d]->sadb_comb_encrypt_maxbits=0, fatal.\n",
+ i);
+ SENDERR(EINVAL);
+ }
+ if(pfkey_comb->sadb_comb_encrypt_minbits > pfkey_comb->sadb_comb_encrypt_maxbits) {
+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM,
+ "pfkey_prop_parse: "
+ "pfkey_comb[%d]->sadb_comb_encrypt_minbits=%d > maxbits=%d, fatal.\n",
+ i,
+ pfkey_comb->sadb_comb_encrypt_minbits,
+ pfkey_comb->sadb_comb_encrypt_maxbits);
+ SENDERR(EINVAL);
+ }
+ } else {
+ if(pfkey_comb->sadb_comb_encrypt_minbits) {
+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM,
+ "pfkey_prop_parse: "
+ "pfkey_comb[%d]->sadb_comb_encrypt_minbits=%d != 0, fatal.\n",
+ i,
+ pfkey_comb->sadb_comb_encrypt_minbits);
+ SENDERR(EINVAL);
+ }
+ if(pfkey_comb->sadb_comb_encrypt_maxbits) {
+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM,
+ "pfkey_prop_parse: "
+ "pfkey_comb[%d]->sadb_comb_encrypt_maxbits=%d != 0, fatal.\n",
+ i,
+ pfkey_comb->sadb_comb_encrypt_maxbits);
+ SENDERR(EINVAL);
+ }
+ }
+
+ /* XXX do sanity check on flags */
+
+ if(pfkey_comb->sadb_comb_hard_allocations && pfkey_comb->sadb_comb_soft_allocations > pfkey_comb->sadb_comb_hard_allocations) {
+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM,
+ "pfkey_prop_parse: "
+ "pfkey_comb[%d]->sadb_comb_soft_allocations=%d > hard_allocations=%d, fatal.\n",
+ i,
+ pfkey_comb->sadb_comb_soft_allocations,
+ pfkey_comb->sadb_comb_hard_allocations);
+ SENDERR(EINVAL);
+ }
+
+ if(pfkey_comb->sadb_comb_hard_bytes && pfkey_comb->sadb_comb_soft_bytes > pfkey_comb->sadb_comb_hard_bytes) {
+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM,
+ "pfkey_prop_parse: "
+ "pfkey_comb[%d]->sadb_comb_soft_bytes=%Ld > hard_bytes=%Ld, fatal.\n",
+ i,
+ (unsigned long long int)pfkey_comb->sadb_comb_soft_bytes,
+ (unsigned long long int)pfkey_comb->sadb_comb_hard_bytes);
+ SENDERR(EINVAL);
+ }
+
+ if(pfkey_comb->sadb_comb_hard_addtime && pfkey_comb->sadb_comb_soft_addtime > pfkey_comb->sadb_comb_hard_addtime) {
+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM,
+ "pfkey_prop_parse: "
+ "pfkey_comb[%d]->sadb_comb_soft_addtime=%Ld > hard_addtime=%Ld, fatal.\n",
+ i,
+ (unsigned long long int)pfkey_comb->sadb_comb_soft_addtime,
+ (unsigned long long int)pfkey_comb->sadb_comb_hard_addtime);
+ SENDERR(EINVAL);
+ }
+
+ if(pfkey_comb->sadb_comb_hard_usetime && pfkey_comb->sadb_comb_soft_usetime > pfkey_comb->sadb_comb_hard_usetime) {
+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM,
+ "pfkey_prop_parse: "
+ "pfkey_comb[%d]->sadb_comb_soft_usetime=%Ld > hard_usetime=%Ld, fatal.\n",
+ i,
+ (unsigned long long int)pfkey_comb->sadb_comb_soft_usetime,
+ (unsigned long long int)pfkey_comb->sadb_comb_hard_usetime);
+ SENDERR(EINVAL);
+ }
+
+ if(pfkey_comb->sadb_x_comb_hard_packets && pfkey_comb->sadb_x_comb_soft_packets > pfkey_comb->sadb_x_comb_hard_packets) {
+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM,
+ "pfkey_prop_parse: "
+ "pfkey_comb[%d]->sadb_x_comb_soft_packets=%d > hard_packets=%d, fatal.\n",
+ i,
+ pfkey_comb->sadb_x_comb_soft_packets,
+ pfkey_comb->sadb_x_comb_hard_packets);
+ SENDERR(EINVAL);
+ }
+
+ if(pfkey_comb->sadb_comb_reserved) {
+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM,
+ "pfkey_prop_parse: "
+ "comb[%d].res=%d, must be zero.\n",
+ i,
+ pfkey_comb->sadb_comb_reserved);
+ SENDERR(EINVAL);
+ }
+ pfkey_comb++;
+ }
+
+errlab:
+ return error;
+}
+
+DEBUG_NO_STATIC int
+pfkey_supported_parse(struct sadb_ext *pfkey_ext)
+{
+ int error = 0;
+ unsigned int i, num_alg;
+ struct sadb_supported *pfkey_supported = (struct sadb_supported *)pfkey_ext;
+ struct sadb_alg *pfkey_alg = (struct sadb_alg*)((char*)pfkey_ext + sizeof(struct sadb_supported));
+
+ /* sanity checks... */
+ if((pfkey_supported->sadb_supported_len <
+ sizeof(struct sadb_supported) / IPSEC_PFKEYv2_ALIGN) ||
+ (((pfkey_supported->sadb_supported_len * IPSEC_PFKEYv2_ALIGN) -
+ sizeof(struct sadb_supported)) % sizeof(struct sadb_alg))) {
+
+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM,
+ "pfkey_supported_parse: "
+ "size wrong ext_len=%d, supported_ext_len=%d alg_ext_len=%d.\n",
+ pfkey_supported->sadb_supported_len,
+ (int)sizeof(struct sadb_supported),
+ (int)sizeof(struct sadb_alg));
+ SENDERR(EINVAL);
+ }
+
+ if(pfkey_supported->sadb_supported_reserved) {
+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM,
+ "pfkey_supported_parse: "
+ "res=%d, must be zero.\n",
+ pfkey_supported->sadb_supported_reserved);
+ SENDERR(EINVAL);
+ }
+
+ num_alg = ((pfkey_supported->sadb_supported_len * IPSEC_PFKEYv2_ALIGN) - sizeof(struct sadb_supported)) / sizeof(struct sadb_alg);
+
+ for(i = 0; i < num_alg; i++) {
+ /* process algo description */
+ if(pfkey_alg->sadb_alg_reserved) {
+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM,
+ "pfkey_supported_parse: "
+ "alg[%d], id=%d, ivlen=%d, minbits=%d, maxbits=%d, res=%d, must be zero.\n",
+ i,
+ pfkey_alg->sadb_alg_id,
+ pfkey_alg->sadb_alg_ivlen,
+ pfkey_alg->sadb_alg_minbits,
+ pfkey_alg->sadb_alg_maxbits,
+ pfkey_alg->sadb_alg_reserved);
+ SENDERR(EINVAL);
+ }
+
+ /* XXX can alg_id auth/enc be determined from info given?
+ Yes, but OpenBSD's method does not interoperate with rfc2367.
+ rgb, 2000-04-06 */
+
+ switch(pfkey_supported->sadb_supported_exttype) {
+ case SADB_EXT_SUPPORTED_AUTH:
+ if(pfkey_alg->sadb_alg_id > SADB_AALG_MAX) {
+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM,
+ "pfkey_supported_parse: "
+ "alg[%d], alg_id=%d > SADB_AALG_MAX=%d, fatal.\n",
+ i,
+ pfkey_alg->sadb_alg_id,
+ SADB_AALG_MAX);
+ SENDERR(EINVAL);
+ }
+ break;
+ case SADB_EXT_SUPPORTED_ENCRYPT:
+ if(pfkey_alg->sadb_alg_id > SADB_EALG_MAX) {
+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM,
+ "pfkey_supported_parse: "
+ "alg[%d], alg_id=%d > SADB_EALG_MAX=%d, fatal.\n",
+ i,
+ pfkey_alg->sadb_alg_id,
+ SADB_EALG_MAX);
+ SENDERR(EINVAL);
+ }
+ break;
+ default:
+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM,
+ "pfkey_supported_parse: "
+ "alg[%d], alg_id=%d > SADB_EALG_MAX=%d, fatal.\n",
+ i,
+ pfkey_alg->sadb_alg_id,
+ SADB_EALG_MAX);
+ SENDERR(EINVAL);
+ }
+ pfkey_alg++;
+ }
+
+ errlab:
+ return error;
+}
+
+DEBUG_NO_STATIC int
+pfkey_spirange_parse(struct sadb_ext *pfkey_ext)
+{
+ int error = 0;
+ struct sadb_spirange *pfkey_spirange = (struct sadb_spirange *)pfkey_ext;
+
+ /* sanity checks... */
+ if(pfkey_spirange->sadb_spirange_len !=
+ sizeof(struct sadb_spirange) / IPSEC_PFKEYv2_ALIGN) {
+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM,
+ "pfkey_spirange_parse: "
+ "size wrong ext_len=%d, key_ext_len=%d.\n",
+ pfkey_spirange->sadb_spirange_len,
+ (int)sizeof(struct sadb_spirange));
+ SENDERR(EINVAL);
+ }
+
+ if(pfkey_spirange->sadb_spirange_reserved) {
+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM,
+ "pfkey_spirange_parse: "
+ "reserved=%d must be set to zero.\n",
+ pfkey_spirange->sadb_spirange_reserved);
+ SENDERR(EINVAL);
+ }
+
+ if(ntohl(pfkey_spirange->sadb_spirange_max) < ntohl(pfkey_spirange->sadb_spirange_min)) {
+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM,
+ "pfkey_spirange_parse: "
+ "minspi=%08x must be < maxspi=%08x.\n",
+ ntohl(pfkey_spirange->sadb_spirange_min),
+ ntohl(pfkey_spirange->sadb_spirange_max));
+ SENDERR(EINVAL);
+ }
+
+ if(ntohl(pfkey_spirange->sadb_spirange_min) <= 255) {
+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM,
+ "pfkey_spirange_parse: "
+ "minspi=%08x must be > 255.\n",
+ ntohl(pfkey_spirange->sadb_spirange_min));
+ SENDERR(EEXIST);
+ }
+
+ DEBUGGING(PF_KEY_DEBUG_PARSE_STRUCT,
+ "pfkey_spirange_parse: "
+ "ext_len=%u ext_type=%u(%s) min=%u max=%u res=%u.\n",
+ pfkey_spirange->sadb_spirange_len,
+ pfkey_spirange->sadb_spirange_exttype,
+ pfkey_v2_sadb_ext_string(pfkey_spirange->sadb_spirange_exttype),
+ pfkey_spirange->sadb_spirange_min,
+ pfkey_spirange->sadb_spirange_max,
+ pfkey_spirange->sadb_spirange_reserved);
+ errlab:
+ return error;
+}
+
+DEBUG_NO_STATIC int
+pfkey_x_kmprivate_parse(struct sadb_ext *pfkey_ext)
+{
+ int error = 0;
+ struct sadb_x_kmprivate *pfkey_x_kmprivate = (struct sadb_x_kmprivate *)pfkey_ext;
+
+ /* sanity checks... */
+ if(pfkey_x_kmprivate->sadb_x_kmprivate_len <
+ sizeof(struct sadb_x_kmprivate) / IPSEC_PFKEYv2_ALIGN) {
+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM,
+ "pfkey_x_kmprivate_parse: "
+ "size wrong ext_len=%d, key_ext_len=%d.\n",
+ pfkey_x_kmprivate->sadb_x_kmprivate_len,
+ (int)sizeof(struct sadb_x_kmprivate));
+ SENDERR(EINVAL);
+ }
+
+ if(pfkey_x_kmprivate->sadb_x_kmprivate_reserved) {
+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM,
+ "pfkey_x_kmprivate_parse: "
+ "reserved=%d must be set to zero.\n",
+ pfkey_x_kmprivate->sadb_x_kmprivate_reserved);
+ SENDERR(EINVAL);
+ }
+
+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM,
+ "pfkey_x_kmprivate_parse: "
+ "Sorry, I can't parse exttype=%d yet.\n",
+ pfkey_ext->sadb_ext_type);
+ SENDERR(EINVAL); /* don't process these yet */
+
+errlab:
+ return error;
+}
+
+DEBUG_NO_STATIC int
+pfkey_x_satype_parse(struct sadb_ext *pfkey_ext)
+{
+ int error = 0;
+ int i;
+ struct sadb_x_satype *pfkey_x_satype = (struct sadb_x_satype *)pfkey_ext;
+
+ DEBUGGING(PF_KEY_DEBUG_PARSE_FLOW,
+ "pfkey_x_satype_parse: enter\n");
+ /* sanity checks... */
+ if(pfkey_x_satype->sadb_x_satype_len !=
+ sizeof(struct sadb_x_satype) / IPSEC_PFKEYv2_ALIGN) {
+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM,
+ "pfkey_x_satype_parse: "
+ "size wrong ext_len=%d, key_ext_len=%d.\n",
+ pfkey_x_satype->sadb_x_satype_len,
+ (int)sizeof(struct sadb_x_satype));
+ SENDERR(EINVAL);
+ }
+
+ if(!pfkey_x_satype->sadb_x_satype_satype) {
+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM,
+ "pfkey_x_satype_parse: "
+ "satype is zero, must be non-zero.\n");
+ SENDERR(EINVAL);
+ }
+
+ if(pfkey_x_satype->sadb_x_satype_satype > SADB_SATYPE_MAX) {
+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM,
+ "pfkey_x_satype_parse: "
+ "satype %d > max %d, invalid.\n",
+ pfkey_x_satype->sadb_x_satype_satype, SADB_SATYPE_MAX);
+ SENDERR(EINVAL);
+ }
+
+ if(!(satype2proto(pfkey_x_satype->sadb_x_satype_satype))) {
+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM,
+ "pfkey_x_satype_parse: "
+ "proto lookup from satype=%d failed.\n",
+ pfkey_x_satype->sadb_x_satype_satype);
+ SENDERR(EINVAL);
+ }
+
+ for(i = 0; i < 3; i++) {
+ if(pfkey_x_satype->sadb_x_satype_reserved[i]) {
+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM,
+ "pfkey_x_satype_parse: "
+ "reserved[%d]=%d must be set to zero.\n",
+ i, pfkey_x_satype->sadb_x_satype_reserved[i]);
+ SENDERR(EINVAL);
+ }
+ }
+
+ DEBUGGING(PF_KEY_DEBUG_PARSE_STRUCT,
+ "pfkey_x_satype_parse: "
+ "len=%u ext=%u(%s) satype=%u(%s) res=%u,%u,%u.\n",
+ pfkey_x_satype->sadb_x_satype_len,
+ pfkey_x_satype->sadb_x_satype_exttype,
+ pfkey_v2_sadb_ext_string(pfkey_x_satype->sadb_x_satype_exttype),
+ pfkey_x_satype->sadb_x_satype_satype,
+ satype2name(pfkey_x_satype->sadb_x_satype_satype),
+ pfkey_x_satype->sadb_x_satype_reserved[0],
+ pfkey_x_satype->sadb_x_satype_reserved[1],
+ pfkey_x_satype->sadb_x_satype_reserved[2]);
+errlab:
+ return error;
+}
+
+DEBUG_NO_STATIC int
+pfkey_x_ext_debug_parse(struct sadb_ext *pfkey_ext)
+{
+ int error = 0;
+ int i;
+ struct sadb_x_debug *pfkey_x_debug = (struct sadb_x_debug *)pfkey_ext;
+
+ DEBUGGING(PF_KEY_DEBUG_PARSE_FLOW,
+ "pfkey_x_debug_parse: enter\n");
+ /* sanity checks... */
+ if(pfkey_x_debug->sadb_x_debug_len !=
+ sizeof(struct sadb_x_debug) / IPSEC_PFKEYv2_ALIGN) {
+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM,
+ "pfkey_x_debug_parse: "
+ "size wrong ext_len=%d, key_ext_len=%d.\n",
+ pfkey_x_debug->sadb_x_debug_len,
+ (int)sizeof(struct sadb_x_debug));
+ SENDERR(EINVAL);
+ }
+
+ for(i = 0; i < 4; i++) {
+ if(pfkey_x_debug->sadb_x_debug_reserved[i]) {
+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM,
+ "pfkey_x_debug_parse: "
+ "reserved[%d]=%d must be set to zero.\n",
+ i, pfkey_x_debug->sadb_x_debug_reserved[i]);
+ SENDERR(EINVAL);
+ }
+ }
+
+errlab:
+ return error;
+}
+
+#ifdef NAT_TRAVERSAL
+DEBUG_NO_STATIC int
+pfkey_x_ext_nat_t_type_parse(struct sadb_ext *pfkey_ext)
+{
+ return 0;
+}
+DEBUG_NO_STATIC int
+pfkey_x_ext_nat_t_port_parse(struct sadb_ext *pfkey_ext)
+{
+ return 0;
+}
+#endif
+
+#define DEFINEPARSER(NAME) static struct pf_key_ext_parsers_def NAME##_def={NAME, #NAME};
+
+DEFINEPARSER(pfkey_sa_parse);
+DEFINEPARSER(pfkey_lifetime_parse);
+DEFINEPARSER(pfkey_address_parse);
+DEFINEPARSER(pfkey_key_parse);
+DEFINEPARSER(pfkey_ident_parse);
+DEFINEPARSER(pfkey_sens_parse);
+DEFINEPARSER(pfkey_prop_parse);
+DEFINEPARSER(pfkey_supported_parse);
+DEFINEPARSER(pfkey_spirange_parse);
+DEFINEPARSER(pfkey_x_kmprivate_parse);
+DEFINEPARSER(pfkey_x_satype_parse);
+DEFINEPARSER(pfkey_x_ext_debug_parse);
+
+struct pf_key_ext_parsers_def *ext_default_parsers[]=
+{
+ NULL, /* pfkey_msg_parse, */
+ &pfkey_sa_parse_def,
+ &pfkey_lifetime_parse_def,
+ &pfkey_lifetime_parse_def,
+ &pfkey_lifetime_parse_def,
+ &pfkey_address_parse_def,
+ &pfkey_address_parse_def,
+ &pfkey_address_parse_def,
+ &pfkey_key_parse_def,
+ &pfkey_key_parse_def,
+ &pfkey_ident_parse_def,
+ &pfkey_ident_parse_def,
+ &pfkey_sens_parse_def,
+ &pfkey_prop_parse_def,
+ &pfkey_supported_parse_def,
+ &pfkey_supported_parse_def,
+ &pfkey_spirange_parse_def,
+ &pfkey_x_kmprivate_parse_def,
+ &pfkey_x_satype_parse_def,
+ &pfkey_sa_parse_def,
+ &pfkey_address_parse_def,
+ &pfkey_address_parse_def,
+ &pfkey_address_parse_def,
+ &pfkey_address_parse_def,
+ &pfkey_address_parse_def,
+ &pfkey_x_ext_debug_parse_def
+};
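+
+/* Editorial illustration, not part of the original source: the
+   DEFINEPARSER macro above expands, for example,
+   DEFINEPARSER(pfkey_sa_parse) into
+
+       static struct pf_key_ext_parsers_def pfkey_sa_parse_def =
+               {pfkey_sa_parse, "pfkey_sa_parse"};
+
+   so each *_def pairs a parser function with its printable name for
+   debug output, and ext_default_parsers[] is indexed by the SADB
+   extension type so that pfkey_msg_parse() can dispatch directly on
+   sadb_ext_type. */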
+
+int
+pfkey_msg_parse(struct sadb_msg *pfkey_msg,
+ struct pf_key_ext_parsers_def *ext_parsers[],
+ struct sadb_ext *extensions[],
+ int dir)
+{
+ int error = 0;
+ int remain;
+ struct sadb_ext *pfkey_ext;
+ int extensions_seen = 0;
+
+ DEBUGGING(PF_KEY_DEBUG_PARSE_STRUCT,
+ "pfkey_msg_parse: "
+ "parsing message ver=%d, type=%d(%s), errno=%d, satype=%d(%s), len=%d, res=%d, seq=%d, pid=%d.\n",
+ pfkey_msg->sadb_msg_version,
+ pfkey_msg->sadb_msg_type,
+ pfkey_v2_sadb_type_string(pfkey_msg->sadb_msg_type),
+ pfkey_msg->sadb_msg_errno,
+ pfkey_msg->sadb_msg_satype,
+ satype2name(pfkey_msg->sadb_msg_satype),
+ pfkey_msg->sadb_msg_len,
+ pfkey_msg->sadb_msg_reserved,
+ pfkey_msg->sadb_msg_seq,
+ pfkey_msg->sadb_msg_pid);
+
+ if(ext_parsers == NULL) ext_parsers = ext_default_parsers;
+
+ pfkey_extensions_init(extensions);
+
+ remain = pfkey_msg->sadb_msg_len;
+ remain -= sizeof(struct sadb_msg) / IPSEC_PFKEYv2_ALIGN;
+
+ pfkey_ext = (struct sadb_ext*)((char*)pfkey_msg +
+ sizeof(struct sadb_msg));
+
+ extensions[0] = (struct sadb_ext *) pfkey_msg;
+
+
+ if(pfkey_msg->sadb_msg_version != PF_KEY_V2) {
+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM,
+ "pfkey_msg_parse: "
+ "not PF_KEY_V2 msg, found %d, should be %d.\n",
+ pfkey_msg->sadb_msg_version,
+ PF_KEY_V2);
+ SENDERR(EINVAL);
+ }
+
+ if(!pfkey_msg->sadb_msg_type) {
+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM,
+ "pfkey_msg_parse: "
+ "msg type not set, must be non-zero..\n");
+ SENDERR(EINVAL);
+ }
+
+ if(pfkey_msg->sadb_msg_type > SADB_MAX) {
+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM,
+ "pfkey_msg_parse: "
+ "msg type=%d > max=%d.\n",
+ pfkey_msg->sadb_msg_type,
+ SADB_MAX);
+ SENDERR(EINVAL);
+ }
+
+ switch(pfkey_msg->sadb_msg_type) {
+ case SADB_GETSPI:
+ case SADB_UPDATE:
+ case SADB_ADD:
+ case SADB_DELETE:
+ case SADB_GET:
+ case SADB_X_GRPSA:
+ case SADB_X_ADDFLOW:
+ if(!satype2proto(pfkey_msg->sadb_msg_satype)) {
+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM,
+ "pfkey_msg_parse: "
+ "satype %d conversion to proto failed for msg_type %d (%s).\n",
+ pfkey_msg->sadb_msg_satype,
+ pfkey_msg->sadb_msg_type,
+ pfkey_v2_sadb_type_string(pfkey_msg->sadb_msg_type));
+ SENDERR(EINVAL);
+ } else {
+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM,
+ "pfkey_msg_parse: "
+ "satype %d(%s) conversion to proto gives %d for msg_type %d(%s).\n",
+ pfkey_msg->sadb_msg_satype,
+ satype2name(pfkey_msg->sadb_msg_satype),
+ satype2proto(pfkey_msg->sadb_msg_satype),
+ pfkey_msg->sadb_msg_type,
+ pfkey_v2_sadb_type_string(pfkey_msg->sadb_msg_type));
+ }
+ case SADB_ACQUIRE:
+ case SADB_REGISTER:
+ case SADB_EXPIRE:
+ if(!pfkey_msg->sadb_msg_satype) {
+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM,
+ "pfkey_msg_parse: "
+ "satype is zero, must be non-zero for msg_type %d(%s).\n",
+ pfkey_msg->sadb_msg_type,
+ pfkey_v2_sadb_type_string(pfkey_msg->sadb_msg_type));
+ SENDERR(EINVAL);
+ }
+ default:
+ break;
+ }
+
+ /* errno must not be set in downward messages */
+ /* this is not entirely true... a response to an ACQUIRE could return an error */
+ if((dir == EXT_BITS_IN) && (pfkey_msg->sadb_msg_type != SADB_ACQUIRE) && pfkey_msg->sadb_msg_errno) {
+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM,
+ "pfkey_msg_parse: "
+ "errno set to %d.\n",
+ pfkey_msg->sadb_msg_errno);
+ SENDERR(EINVAL);
+ }
+
+ DEBUGGING(PF_KEY_DEBUG_PARSE_FLOW,
+ "pfkey_msg_parse: "
+ "remain=%d, ext_type=%d(%s), ext_len=%d.\n",
+ remain,
+ pfkey_ext->sadb_ext_type,
+ pfkey_v2_sadb_ext_string(pfkey_ext->sadb_ext_type),
+ pfkey_ext->sadb_ext_len);
+
+ DEBUGGING(PF_KEY_DEBUG_PARSE_FLOW,
+ "pfkey_msg_parse: "
+ "extensions permitted=%08x, required=%08x.\n",
+ extensions_bitmaps[dir][EXT_BITS_PERM][pfkey_msg->sadb_msg_type],
+ extensions_bitmaps[dir][EXT_BITS_REQ][pfkey_msg->sadb_msg_type]);
+
+ extensions_seen = 1;
+
+ while( (remain * IPSEC_PFKEYv2_ALIGN) >= sizeof(struct sadb_ext) ) {
+ /* Is there enough message left to support another extension header? */
+ if(remain < pfkey_ext->sadb_ext_len) {
+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM,
+ "pfkey_msg_parse: "
+ "remain %d less than ext len %d.\n",
+ remain, pfkey_ext->sadb_ext_len);
+ SENDERR(EINVAL);
+ }
+
+ DEBUGGING(PF_KEY_DEBUG_PARSE_FLOW,
+ "pfkey_msg_parse: "
+ "parsing ext type=%d(%s) remain=%d.\n",
+ pfkey_ext->sadb_ext_type,
+ pfkey_v2_sadb_ext_string(pfkey_ext->sadb_ext_type),
+ remain);
+
+ /* Is the extension header type valid? */
+ if((pfkey_ext->sadb_ext_type > SADB_EXT_MAX) || (!pfkey_ext->sadb_ext_type)) {
+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM,
+ "pfkey_msg_parse: "
+ "ext type %d(%s) invalid, SADB_EXT_MAX=%d.\n",
+ pfkey_ext->sadb_ext_type,
+ pfkey_v2_sadb_ext_string(pfkey_ext->sadb_ext_type),
+ SADB_EXT_MAX);
+ SENDERR(EINVAL);
+ }
+
+ /* Have we already seen this type of extension? */
+ if((extensions_seen & ( 1 << pfkey_ext->sadb_ext_type )) != 0)
+ {
+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM,
+ "pfkey_msg_parse: "
+ "ext type %d(%s) already seen.\n",
+ pfkey_ext->sadb_ext_type,
+ pfkey_v2_sadb_ext_string(pfkey_ext->sadb_ext_type));
+ SENDERR(EINVAL);
+ }
+
+ /* Do I even know about this type of extension? */
+ if(ext_parsers[pfkey_ext->sadb_ext_type]==NULL) {
+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM,
+ "pfkey_msg_parse: "
+ "ext type %d(%s) unknown, ignoring.\n",
+ pfkey_ext->sadb_ext_type,
+ pfkey_v2_sadb_ext_string(pfkey_ext->sadb_ext_type));
+ goto next_ext;
+ }
+
+ /* Is this type of extension permitted for this type of message? */
+ if(!(extensions_bitmaps[dir][EXT_BITS_PERM][pfkey_msg->sadb_msg_type] &
+ 1<<pfkey_ext->sadb_ext_type)) {
+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM,
+ "pfkey_msg_parse: "
+ "ext type %d(%s) not permitted, exts_perm_in=%08x, 1<<type=%08x\n",
+ pfkey_ext->sadb_ext_type,
+ pfkey_v2_sadb_ext_string(pfkey_ext->sadb_ext_type),
+ extensions_bitmaps[dir][EXT_BITS_PERM][pfkey_msg->sadb_msg_type],
+ 1<<pfkey_ext->sadb_ext_type);
+ SENDERR(EINVAL);
+ }
+
+ DEBUGGING(PF_KEY_DEBUG_PARSE_STRUCT,
+ "pfkey_msg_parse: "
+ "remain=%d ext_type=%d(%s) ext_len=%d parsing ext 0p%p with parser %s.\n",
+ remain,
+ pfkey_ext->sadb_ext_type,
+ pfkey_v2_sadb_ext_string(pfkey_ext->sadb_ext_type),
+ pfkey_ext->sadb_ext_len,
+ pfkey_ext,
+ ext_parsers[pfkey_ext->sadb_ext_type]->parser_name);
+
+ /* Parse the extension */
+ if((error =
+ (*ext_parsers[pfkey_ext->sadb_ext_type]->parser)(pfkey_ext))) {
+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM,
+ "pfkey_msg_parse: "
+ "extension parsing for type %d(%s) failed with error %d.\n",
+ pfkey_ext->sadb_ext_type,
+ pfkey_v2_sadb_ext_string(pfkey_ext->sadb_ext_type),
+ error);
+ SENDERR(-error);
+ }
+ DEBUGGING(PF_KEY_DEBUG_PARSE_FLOW,
+ "pfkey_msg_parse: "
+ "Extension %d(%s) parsed.\n",
+ pfkey_ext->sadb_ext_type,
+ pfkey_v2_sadb_ext_string(pfkey_ext->sadb_ext_type));
+
+ /* Mark that we have seen this extension and remember the header location */
+ extensions_seen |= ( 1 << pfkey_ext->sadb_ext_type );
+ extensions[pfkey_ext->sadb_ext_type] = pfkey_ext;
+
+ next_ext:
+ /* Calculate how much message remains */
+ remain -= pfkey_ext->sadb_ext_len;
+
+ if(!remain) {
+ break;
+ }
+ /* Find the next extension header */
+ pfkey_ext = (struct sadb_ext*)((char*)pfkey_ext +
+ pfkey_ext->sadb_ext_len * IPSEC_PFKEYv2_ALIGN);
+ }
+
+ if(remain) {
+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM,
+ "pfkey_msg_parse: "
+ "unexpected remainder of %d.\n",
+ remain);
+ /* why is there still something remaining? */
+ SENDERR(EINVAL);
+ }
+
+ /* check required extensions */
+ DEBUGGING(PF_KEY_DEBUG_PARSE_STRUCT,
+ "pfkey_msg_parse: "
+ "extensions permitted=%08x, seen=%08x, required=%08x.\n",
+ extensions_bitmaps[dir][EXT_BITS_PERM][pfkey_msg->sadb_msg_type],
+ extensions_seen,
+ extensions_bitmaps[dir][EXT_BITS_REQ][pfkey_msg->sadb_msg_type]);
+
+ /* don't check further if it is an error return message since it
+ may not have a body */
+ if(pfkey_msg->sadb_msg_errno) {
+ SENDERR(-error);
+ }
+
+ if((extensions_seen &
+ extensions_bitmaps[dir][EXT_BITS_REQ][pfkey_msg->sadb_msg_type]) !=
+ extensions_bitmaps[dir][EXT_BITS_REQ][pfkey_msg->sadb_msg_type]) {
+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM,
+ "pfkey_msg_parse: "
+ "required extensions missing:%08x.\n",
+ extensions_bitmaps[dir][EXT_BITS_REQ][pfkey_msg->sadb_msg_type] -
+ (extensions_seen &
+ extensions_bitmaps[dir][EXT_BITS_REQ][pfkey_msg->sadb_msg_type]));
+ SENDERR(EINVAL);
+ }
+
+ if((dir == EXT_BITS_IN) && (pfkey_msg->sadb_msg_type == SADB_X_DELFLOW)
+ && ((extensions_seen & SADB_X_EXT_ADDRESS_DELFLOW)
+ != SADB_X_EXT_ADDRESS_DELFLOW)
+ && (((extensions_seen & (1<<SADB_EXT_SA)) != (1<<SADB_EXT_SA))
+ || ((((struct sadb_sa*)extensions[SADB_EXT_SA])->sadb_sa_flags
+ & SADB_X_SAFLAGS_CLEARFLOW)
+ != SADB_X_SAFLAGS_CLEARFLOW))) {
+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM,
+ "pfkey_msg_parse: "
+ "required SADB_X_DELFLOW extensions missing: either %08x must be present or %08x must be present with SADB_X_SAFLAGS_CLEARFLOW set.\n",
+ SADB_X_EXT_ADDRESS_DELFLOW
+ - (extensions_seen & SADB_X_EXT_ADDRESS_DELFLOW),
+ (1<<SADB_EXT_SA) - (extensions_seen & (1<<SADB_EXT_SA)));
+ SENDERR(EINVAL);
+ }
+
+ switch(pfkey_msg->sadb_msg_type) {
+ case SADB_ADD:
+ case SADB_UPDATE:
+ /* check maturity */
+ if(((struct sadb_sa*)extensions[SADB_EXT_SA])->sadb_sa_state !=
+ SADB_SASTATE_MATURE) {
+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM,
+ "pfkey_msg_parse: "
+ "state=%d for add or update should be MATURE=%d.\n",
+ ((struct sadb_sa*)extensions[SADB_EXT_SA])->sadb_sa_state,
+ SADB_SASTATE_MATURE);
+ SENDERR(EINVAL);
+ }
+
+ /* check AH and ESP */
+ switch(((struct sadb_msg*)extensions[SADB_EXT_RESERVED])->sadb_msg_satype) {
+ case SADB_SATYPE_AH:
+ if(!(((struct sadb_sa*)extensions[SADB_EXT_SA]) &&
+ ((struct sadb_sa*)extensions[SADB_EXT_SA])->sadb_sa_auth !=
+ SADB_AALG_NONE)) {
+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM,
+ "pfkey_msg_parse: "
+ "auth alg is zero, must be non-zero for AH SAs.\n");
+ SENDERR(EINVAL);
+ }
+ if(((struct sadb_sa*)(extensions[SADB_EXT_SA]))->sadb_sa_encrypt !=
+ SADB_EALG_NONE) {
+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM,
+ "pfkey_msg_parse: "
+ "AH handed encalg=%d, must be zero.\n",
+ ((struct sadb_sa*)(extensions[SADB_EXT_SA]))->sadb_sa_encrypt);
+ SENDERR(EINVAL);
+ }
+ break;
+ case SADB_SATYPE_ESP:
+ if(!(((struct sadb_sa*)extensions[SADB_EXT_SA]) &&
+ ((struct sadb_sa*)extensions[SADB_EXT_SA])->sadb_sa_encrypt !=
+ SADB_EALG_NONE)) {
+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM,
+ "pfkey_msg_parse: "
+ "encrypt alg=%d is zero, must be non-zero for ESP=%d SAs.\n",
+ ((struct sadb_sa*)extensions[SADB_EXT_SA])->sadb_sa_encrypt,
+ ((struct sadb_msg*)extensions[SADB_EXT_RESERVED])->sadb_msg_satype);
+ SENDERR(EINVAL);
+ }
+ if((((struct sadb_sa*)(extensions[SADB_EXT_SA]))->sadb_sa_encrypt ==
+ SADB_EALG_NULL) &&
+ (((struct sadb_sa*)(extensions[SADB_EXT_SA]))->sadb_sa_auth ==
+ SADB_AALG_NONE) ) {
+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM,
+ "pfkey_msg_parse: "
+ "ESP handed encNULL+authNONE, illegal combination.\n");
+ SENDERR(EINVAL);
+ }
+ break;
+ case SADB_X_SATYPE_COMP:
+ if(!(((struct sadb_sa*)extensions[SADB_EXT_SA]) &&
+ ((struct sadb_sa*)extensions[SADB_EXT_SA])->sadb_sa_encrypt !=
+ SADB_EALG_NONE)) {
+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM,
+ "pfkey_msg_parse: "
+ "encrypt alg=%d is zero, must be non-zero for COMP=%d SAs.\n",
+ ((struct sadb_sa*)extensions[SADB_EXT_SA])->sadb_sa_encrypt,
+ ((struct sadb_msg*)extensions[SADB_EXT_RESERVED])->sadb_msg_satype);
+ SENDERR(EINVAL);
+ }
+ if(((struct sadb_sa*)(extensions[SADB_EXT_SA]))->sadb_sa_auth !=
+ SADB_AALG_NONE) {
+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM,
+ "pfkey_msg_parse: "
+ "COMP handed auth=%d, must be zero.\n",
+ ((struct sadb_sa*)(extensions[SADB_EXT_SA]))->sadb_sa_auth);
+ SENDERR(EINVAL);
+ }
+ break;
+ default:
+ break;
+ }
+ if(ntohl(((struct sadb_sa*)(extensions[SADB_EXT_SA]))->sadb_sa_spi) <= 255) {
+ DEBUGGING(PF_KEY_DEBUG_PARSE_PROBLEM,
+ "pfkey_msg_parse: "
+ "spi=%08x must be > 255.\n",
+ ntohl(((struct sadb_sa*)(extensions[SADB_EXT_SA]))->sadb_sa_spi));
+ SENDERR(EINVAL);
+ }
+ default:
+ break;
+ }
+errlab:
+
+ return error;
+}
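+
+/* Editorial usage sketch, not part of the original source: a typical
+   caller validates an incoming message roughly as
+
+       struct sadb_ext *extensions[SADB_EXT_MAX + 1];
+       int error = pfkey_msg_parse(msg, NULL, extensions, EXT_BITS_IN);
+
+   Passing NULL for ext_parsers selects ext_default_parsers; on
+   success, extensions[] holds a pointer to each extension header
+   seen, indexed by its SADB_EXT_* type, with slot 0 pointing at the
+   sadb_msg header itself.  The array size shown here is an
+   assumption for illustration. */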
+
+/*
+ * $Log: pfkey_v2_parse.c,v $
+ * Revision 1.53 2003/01/30 02:32:09 rgb
+ *
+ * Rename SAref table macro names for clarity.
+ * Convert IPsecSAref_t from signed to unsigned to fix apparent SAref exhaustion bug.
+ *
+ * Revision 1.52 2002/12/30 06:53:07 mcr
+ * deal with short SA structures... #if 0 out for now. Probably
+ * not quite the right way.
+ *
+ * Revision 1.51 2002/12/13 18:16:02 mcr
+ * restored sa_ref code
+ *
+ * Revision 1.50 2002/12/13 18:06:52 mcr
+ * temporarily removed sadb_x_sa_ref reference for 2.xx
+ *
+ * Revision 1.49 2002/10/05 05:02:58 dhr
+ *
+ * C labels go on statements
+ *
+ * Revision 1.48 2002/09/20 15:40:45 rgb
+ * Added sadb_x_sa_ref to struct sadb_sa.
+ *
+ * Revision 1.47 2002/09/20 05:01:31 rgb
+ * Fixed usage of pfkey_lib_debug.
+ * Format for function declaration style consistency.
+ * Added text labels to elucidate numeric values presented.
+ * Re-organised debug output to reduce noise in output.
+ *
+ * Revision 1.46 2002/07/24 18:44:54 rgb
+ * Type fiddling to tame ia64 compiler.
+ *
+ * Revision 1.45 2002/05/23 07:14:11 rgb
+ * Cleaned up %p variants to 0p%p for test suite cleanup.
+ *
+ * Revision 1.44 2002/04/24 07:55:32 mcr
+ * #include patches and Makefiles for post-reorg compilation.
+ *
+ * Revision 1.43 2002/04/24 07:36:40 mcr
+ * Moved from ./lib/pfkey_v2_parse.c,v
+ *
+ * Revision 1.42 2002/01/29 22:25:36 rgb
+ * Re-add ipsec_kversion.h to keep MALLOC happy.
+ *
+ * Revision 1.41 2002/01/29 01:59:10 mcr
+ * removal of kversions.h - sources that needed it now use ipsec_param.h.
+ * updating of IPv6 structures to match latest in6.h version.
+ * removed dead code from freeswan.h that also duplicated kversions.h
+ * code.
+ *
+ * Revision 1.40 2002/01/20 20:34:50 mcr
+ * added pfkey_v2_sadb_type_string to decode sadb_type to string.
+ *
+ * Revision 1.39 2001/11/27 05:29:22 mcr
+ * pfkey parsers are now maintained by a structure
+ * that includes their name for debug purposes.
+ * DEBUGGING() macro changed so that it takes a debug
+ * level so that pf_key() can use this to decode the
+ * structures without inundating humans.
+ * Also uses pfkey_v2_sadb_ext_string() in messages.
+ *
+ * Revision 1.38 2001/11/06 19:47:47 rgb
+ * Added packet parameter to lifetime and comb structures.
+ *
+ * Revision 1.37 2001/10/18 04:45:24 rgb
+ * 2.4.9 kernel deprecates linux/malloc.h in favour of linux/slab.h,
+ * lib/freeswan.h version macros moved to lib/kversions.h.
+ * Other compiler directive cleanups.
+ *
+ * Revision 1.36 2001/06/14 19:35:16 rgb
+ * Update copyright date.
+ *
+ * Revision 1.35 2001/05/03 19:44:51 rgb
+ * Standardise on SENDERR() macro.
+ *
+ * Revision 1.34 2001/03/16 07:41:51 rgb
+ * Put freeswan.h include before pluto includes.
+ *
+ * Revision 1.33 2001/02/27 07:13:51 rgb
+ * Added satype2name() function.
+ * Added text to default satype_tbl entry.
+ * Added satype2name() conversions for most satype debug output.
+ *
+ * Revision 1.32 2001/02/26 20:01:09 rgb
+ * Added internal IP protocol 61 for magic SAs.
+ * Ditch unused sadb_satype2proto[], replaced by satype2proto().
+ * Re-formatted debug output (split lines, consistent spacing).
+ * Removed acquire, register and expire requirements for a known satype.
+ * Changed message type checking to a switch structure.
+ * Verify expected NULL auth for IPCOMP.
+ * Enforced spi > 0x100 requirement, now that pass uses a magic SA for
+ * appropriate message types.
+ *
+ * Revision 1.31 2000/12/01 07:09:00 rgb
+ * Added ipcomp sanity check to require encalgo is set.
+ *
+ * Revision 1.30 2000/11/17 18:10:30 rgb
+ * Fixed bugs mostly relating to spirange, to treat all spi variables as
+ * network byte order since this is the way PF_KEYv2 stored spis.
+ *
+ * Revision 1.29 2000/10/12 00:02:39 rgb
+ * Removed 'format, ##' nonsense from debug macros for RH7.0.
+ *
+ * Revision 1.28 2000/09/20 16:23:04 rgb
+ * Remove over-paranoid extension check in the presence of sadb_msg_errno.
+ *
+ * Revision 1.27 2000/09/20 04:04:21 rgb
+ * Changed static functions to DEBUG_NO_STATIC to reveal function names in
+ * oopsen.
+ *
+ * Revision 1.26 2000/09/15 11:37:02 rgb
+ * Merge in heavily modified Svenning Soerensen's <svenning@post5.tele.dk>
+ * IPCOMP zlib deflate code.
+ *
+ * Revision 1.25 2000/09/12 22:35:37 rgb
+ * Restructured to remove unused extensions from CLEARFLOW messages.
+ *
+ * Revision 1.24 2000/09/12 18:59:54 rgb
+ * Added Gerhard's IPv6 support to pfkey parts of libfreeswan.
+ *
+ * Revision 1.23 2000/09/12 03:27:00 rgb
+ * Moved DEBUGGING definition to compile kernel with debug off.
+ *
+ * Revision 1.22 2000/09/09 06:39:27 rgb
+ * Restrict pfkey errno check to downward messages only.
+ *
+ * Revision 1.21 2000/09/08 19:22:34 rgb
+ * Enabled pfkey_sens_parse().
+ * Added check for errno on downward acquire messages only.
+ *
+ * Revision 1.20 2000/09/01 18:48:23 rgb
+ * Fixed reserved check bug and added debug output in
+ * pfkey_supported_parse().
+ * Fixed debug output label bug in pfkey_ident_parse().
+ *
+ * Revision 1.19 2000/08/27 01:55:26 rgb
+ * Define OCTETBITS and PFKEYBITS to avoid using 'magic' numbers in code.
+ *
+ * Revision 1.18 2000/08/24 17:00:36 rgb
+ * Ignore unknown extensions instead of failing.
+ *
+ * Revision 1.17 2000/06/02 22:54:14 rgb
+ * Added Gerhard Gessler's struct sockaddr_storage mods for IPv6 support.
+ *
+ * Revision 1.16 2000/05/10 19:25:11 rgb
+ * Fleshed out proposal and supported extensions.
+ *
+ * Revision 1.15 2000/01/24 21:15:31 rgb
+ * Added disabled pluto pfkey lib debug flag.
+ * Added algo debugging reporting.
+ *
+ * Revision 1.14 2000/01/22 23:24:29 rgb
+ * Added new functions proto2satype() and satype2proto() and lookup
+ * table satype_tbl. Also added proto2name() since it was easy.
+ *
+ * Revision 1.13 2000/01/21 09:43:59 rgb
+ * Cast ntohl(spi) as (unsigned long int) to shut up compiler.
+ *
+ * Revision 1.12 2000/01/21 06:28:19 rgb
+ * Added address cases for eroute flows.
+ * Indented compiler directives for readability.
+ * Added klipsdebug switching capability.
+ *
+ * Revision 1.11 1999/12/29 21:14:59 rgb
+ * Fixed debug text cut and paste typo.
+ *
+ * Revision 1.10 1999/12/10 17:45:24 rgb
+ * Added address debugging.
+ *
+ * Revision 1.9 1999/12/09 23:11:42 rgb
+ * Ditched <string.h> include since we no longer use memset().
+ * Use new pfkey_extensions_init() instead of memset().
+ * Added check for SATYPE in pfkey_msg_build().
+ * Tidy up comments and debugging comments.
+ *
+ * Revision 1.8 1999/12/07 19:55:26 rgb
+ * Removed unused first argument from extension parsers.
+ * Removed static pluto debug flag.
+ * Moved message type and state checking to pfkey_msg_parse().
+ * Changed print[fk] type from lx to x to quiet compiler.
+ * Removed redundant remain check.
+ * Changed __u* types to uint* to avoid use of asm/types.h and
+ * sys/types.h in userspace code.
+ *
+ * Revision 1.7 1999/12/01 22:20:51 rgb
+ * Moved pfkey_lib_debug variable into the library.
+ * Added pfkey version check into header parsing.
+ * Added check for SATYPE only for those extensions that require a
+ * non-zero value.
+ *
+ * Revision 1.6 1999/11/27 11:58:05 rgb
+ * Added ipv6 headers.
+ * Moved sadb_satype2proto protocol lookup table from
+ * klips/net/ipsec/pfkey_v2_parser.c.
+ * Enable lifetime_current checking.
+ * Debugging error messages added.
+ * Add argument to pfkey_msg_parse() for direction.
+ * Consolidated the 4 1-d extension bitmap arrays into one 4-d array.
+ * Add CVS log entry to bottom of file.
+ * Moved auth and enc alg check to pfkey_msg_parse().
+ * Enable accidentally disabled spirange parsing.
+ * Moved protocol/algorithm checks from klips/net/ipsec/pfkey_v2_parser.c
+ *
+ * Local variables:
+ * c-file-style: "linux"
+ * End:
+ *
+ */
diff --git a/tests/contrib/pfkey_v2_parse.c/patch b/tests/contrib/pfkey_v2_parse.c/patch
new file mode 100644
index 0000000..39b0b53
--- /dev/null
+++ b/tests/contrib/pfkey_v2_parse.c/patch
@@ -0,0 +1,57 @@
+***************
+*** 1140,1149 ****
+ DEFINEPARSER(pfkey_spirange_parse);
+ DEFINEPARSER(pfkey_x_kmprivate_parse);
+ DEFINEPARSER(pfkey_x_satype_parse);
+ DEFINEPARSER(pfkey_x_ext_debug_parse);
+ DEFINEPARSER(pfkey_x_ext_protocol_parse);
+
+ struct pf_key_ext_parsers_def *ext_default_parsers[]=
+ {
+ NULL, /* pfkey_msg_parse, */
+ &pfkey_sa_parse_def,
+--- 1156,1169 ----
+ DEFINEPARSER(pfkey_spirange_parse);
+ DEFINEPARSER(pfkey_x_kmprivate_parse);
+ DEFINEPARSER(pfkey_x_satype_parse);
+ DEFINEPARSER(pfkey_x_ext_debug_parse);
+ DEFINEPARSER(pfkey_x_ext_protocol_parse);
++ #ifdef NAT_TRAVERSAL
++ DEFINEPARSER(pfkey_x_ext_nat_t_type_parse);
++ DEFINEPARSER(pfkey_x_ext_nat_t_port_parse);
++ #endif
+
+ struct pf_key_ext_parsers_def *ext_default_parsers[]=
+ {
+ NULL, /* pfkey_msg_parse, */
+ &pfkey_sa_parse_def,
+***************
+*** 1170,1179 ****
+ &pfkey_address_parse_def,
+ &pfkey_address_parse_def,
+ &pfkey_address_parse_def,
+ &pfkey_x_ext_debug_parse_def,
+ &pfkey_x_ext_protocol_parse_def
+ };
+
+ int
+ pfkey_msg_parse(struct sadb_msg *pfkey_msg,
+ struct pf_key_ext_parsers_def *ext_parsers[],
+--- 1190,1206 ----
+ &pfkey_address_parse_def,
+ &pfkey_address_parse_def,
+ &pfkey_address_parse_def,
+ &pfkey_x_ext_debug_parse_def,
+ &pfkey_x_ext_protocol_parse_def
++ #ifdef NAT_TRAVERSAL
++ ,
++ &pfkey_x_ext_nat_t_type_parse_def,
++ &pfkey_x_ext_nat_t_port_parse_def,
++ &pfkey_x_ext_nat_t_port_parse_def,
++ &pfkey_address_parse_def
++ #endif
+ };
+
+ int
+ pfkey_msg_parse(struct sadb_msg *pfkey_msg,
+ struct pf_key_ext_parsers_def *ext_parsers[],
diff --git a/tests/contrib/xfaces/merge b/tests/contrib/xfaces/merge
new file mode 100644
index 0000000..b425b99
--- /dev/null
+++ b/tests/contrib/xfaces/merge
@@ -0,0 +1,7269 @@
+/* xfaces.c -- "Face" primitives.
+ Copyright (C) 1993, 1994, 1998, 1999, 2000, 2001
+ Free Software Foundation.
+
+This file is part of GNU Emacs.
+
+GNU Emacs is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2, or (at your option)
+any later version.
+
+GNU Emacs is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with GNU Emacs; see the file COPYING. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+Boston, MA 02111-1307, USA. */
+
+/* New face implementation by Gerd Moellmann <gerd@gnu.org>. */
+
+/* Faces.
+
+ When using Emacs with X, the display style of characters can be
+ changed by defining `faces'. Each face can specify the following
+ display attributes:
+
+ 1. Font family name.
+
+ 2. Relative proportionate width, aka character set width or set
+ width (swidth), e.g. `semi-compressed'.
+
+ 3. Font height in 1/10pt.
+
+ 4. Font weight, e.g. `bold'.
+
+ 5. Font slant, e.g. `italic'.
+
+ 6. Foreground color.
+
+ 7. Background color.
+
+ 8. Whether or not characters should be underlined, and in what color.
+
+ 9. Whether or not characters should be displayed in inverse video.
+
+ 10. A background stipple, a bitmap.
+
+ 11. Whether or not characters should be overlined, and in what color.
+
+ 12. Whether or not characters should be strike-through, and in what
+ color.
+
+ 13. Whether or not a box should be drawn around characters, the box
+ type, and, for simple boxes, in what color.
+
+ 14. Font or fontset pattern, or nil. This is a special attribute.
+ When this attribute is specified, the face uses a font opened by
+ that pattern as is. In addition, all the other font-related
+ attributes (1st thru 5th) are generated from the opened font name.
+ On the other hand, if one of the other font-related attributes is
+ specified, this attribute is set to nil. In that case, the face
+ doesn't inherit this attribute from the `default' face, and uses a
+ font determined by the other attributes (those may be inherited
+ from the `default' face).
+
+ 15. A face name or list of face names from which to inherit attributes.
+
+ 16. A specified average font width, which is invisible from Lisp,
+ and is used to ensure that a font specified on the command line,
+ for example, can be matched exactly.
+
+ Faces are frame-local by nature because Emacs allows defining the
+ same named face (face names are symbols) differently for different
+ frames. Each frame has an alist of face definitions for all named
+ faces. The value of a named face in such an alist is a Lisp vector
+ with the symbol `face' in slot 0, and a slot for each of the face
+ attributes mentioned above.
+
+ There is also a global face alist `Vface_new_frame_defaults'. Face
+ definitions from this list are used to initialize faces of newly
+ created frames.
+
+ A face doesn't have to specify all attributes. Those not specified
+ have a value of `unspecified'. Faces specifying all attributes but
+ the 14th are called `fully-specified'.
+
+
+ Face merging.
+
+ The display style of a given character in the text is determined by
+ combining several faces. This process is called `face merging'.
+ Any aspect of the display style that isn't specified by overlays or
+ text properties is taken from the `default' face. Since it is made
+ sure that the default face is always fully-specified, face merging
+ always results in a fully-specified face.
+
+
+ Face realization.
+
+ After all face attributes for a character have been determined by
+ merging faces of that character, that face is `realized'. The
+ realization process maps face attributes to what is physically
+ available on the system where Emacs runs. The result is a
+ `realized face' in form of a struct face which is stored in the
+ face cache of the frame on which it was realized.
+
+ Face realization is done in the context of the character to display
+ because different fonts may be used for different characters. In
+ other words, for characters that have different font
+ specifications, different realized faces are needed to display
+ them.
+
+ Font specification is done by fontsets. See the comment in
+ fontset.c for the details. In the current implementation, all ASCII
+ characters share the same font in a fontset.
+
+ Faces are at first realized for ASCII characters, and, at that
+ time, assigned a specific realized fontset. Hereafter, we call
+ such a face as `ASCII face'. When a face for a multibyte character
+ is realized, it inherits (thus shares) a fontset of an ASCII face
+ that has the same attributes other than font-related ones.
+
+ Thus, all realized faces have a realized fontset.
+
+
+ Unibyte text.
+
+ Unibyte text (i.e. raw 8-bit characters) is displayed with the same
+ font as ASCII characters. That is because it is expected that
+ unibyte text users specify a font that is suitable both for ASCII
+ and raw 8-bit characters.
+
+
+ Font selection.
+
+ Font selection tries to find the best available matching font for a
+ given (character, face) combination.
+
+ If the face specifies a fontset name, that fontset determines a
+ pattern for fonts of the given character. If the face specifies a
+ font name or the other font-related attributes, a fontset is
+ realized from the default fontset. In that case, that
+ specification determines a pattern for ASCII characters and the
+ default fontset determines a pattern for multibyte characters.
+
+ Available fonts on the system on which Emacs runs are then matched
+ against the font pattern. The result of font selection is the best
+ match for the given face attributes in this font list.
+
+ Font selection can be influenced by the user.
+
+ 1. The user can specify the relative importance he gives the face
+ attributes width, height, weight, and slant by setting
+ face-font-selection-order (faces.el) to a list of face attribute
+ names. The default is '(:width :height :weight :slant), and means
+ that font selection first tries to find a good match for the font
+ width specified by a face, then---within fonts with that
+ width---tries to find a best match for the specified font height,
+ etc.
+
+ 2. Setting face-font-family-alternatives allows the user to
+ specify alternative font families to try if a family specified by a
+ face doesn't exist.
+
+ 3. Setting face-font-registry-alternatives allows the user to
+ specify all alternative font registries to try for a face
+ specifying a registry.
+
+ 4. Setting face-ignored-fonts allows the user to ignore specific
+ fonts.
+
+
+ Character composition.
+
+ Usually, the realization process is already finished when Emacs
+ actually reflects the desired glyph matrix on the screen. However,
+ on displaying a composition (sequence of characters to be composed
+ on the screen), a suitable font for the components of the
+ composition is selected and realized while drawing them on the
+ screen, i.e. the realization process is delayed but in principle
+ the same.
+
+
+ Initialization of basic faces.
+
+ The faces `default', `modeline' are considered `basic faces'.
+ When redisplay happens the first time for a newly created frame,
+ basic faces are realized for CHARSET_ASCII. Frame parameters are
+ used to fill in unspecified attributes of the default face. */
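+
+/* Editorial illustration, not part of the original source: face
+   merging as described above conceptually copies every attribute the
+   higher-priority face specifies and leaves the rest alone (slot 0
+   holds the symbol `face', so merging starts at slot 1):
+
+       for (i = 1; i < NSLOTS; ++i)
+           if (!UNSPECIFIEDP (from[i]))
+               to[i] = from[i];
+
+   Because the `default' face is always fully specified, the merged
+   result is fully specified as well.  NSLOTS, from and to are
+   placeholder names used only in this sketch. */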
+
+#include <config.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include "lisp.h"
+#include "charset.h"
+#include "keyboard.h"
+#include "frame.h"
+
+#ifdef HAVE_WINDOW_SYSTEM
+#include "fontset.h"
+#endif /* HAVE_WINDOW_SYSTEM */
+
+#ifdef HAVE_X_WINDOWS
+#include "xterm.h"
+#ifdef USE_MOTIF
+#include <Xm/Xm.h>
+#include <Xm/XmStrDefs.h>
+#endif /* USE_MOTIF */
+#endif /* HAVE_X_WINDOWS */
+
+#ifdef MSDOS
+#include "dosfns.h"
+#endif
+
+#ifdef WINDOWSNT
+#include "w32term.h"
+#include "fontset.h"
+/* Redefine X specifics to W32 equivalents to avoid cluttering the
+ code with #ifdef blocks. */
+#undef FRAME_X_DISPLAY_INFO
+#define FRAME_X_DISPLAY_INFO FRAME_W32_DISPLAY_INFO
+#define x_display_info w32_display_info
+#define FRAME_X_FONT_TABLE FRAME_W32_FONT_TABLE
+#define check_x check_w32
+#define x_list_fonts w32_list_fonts
+#define GCGraphicsExposures 0
+/* For historic reasons, FONT_WIDTH refers to average width on W32,
+ not maximum as on X. Redefine here. */
+#undef FONT_WIDTH
+#define FONT_WIDTH FONT_MAX_WIDTH
+#endif /* WINDOWSNT */
+
+#ifdef macintosh
+#include "macterm.h"
+#define x_display_info mac_display_info
+#define check_x check_mac
+
+extern XGCValues *XCreateGC (void *, WindowPtr, unsigned long, XGCValues *);
+
+static INLINE GC
+x_create_gc (f, mask, xgcv)
+ struct frame *f;
+ unsigned long mask;
+ XGCValues *xgcv;
+{
+ GC gc;
+ gc = XCreateGC (FRAME_MAC_DISPLAY (f), FRAME_MAC_WINDOW (f), mask, xgcv);
+ return gc;
+}
+
+static INLINE void
+x_free_gc (f, gc)
+ struct frame *f;
+ GC gc;
+{
+ XFreeGC (FRAME_MAC_DISPLAY (f), gc);
+}
+#endif
+
+#include "buffer.h"
+#include "dispextern.h"
+#include "blockinput.h"
+#include "window.h"
+#include "intervals.h"
+
+#ifdef HAVE_X_WINDOWS
+
+/* Compensate for a bug in Xos.h on some systems, on which it requires
+ time.h. On some such systems, Xos.h tries to redefine struct
+ timeval and struct timezone if USG is #defined while it is
+ #included. */
+
+#ifdef XOS_NEEDS_TIME_H
+#include <time.h>
+#undef USG
+#include <X11/Xos.h>
+#define USG
+#define __TIMEVAL__
+#else /* not XOS_NEEDS_TIME_H */
+#include <X11/Xos.h>
+#endif /* not XOS_NEEDS_TIME_H */
+
+#endif /* HAVE_X_WINDOWS */
+
+#include <stdio.h>
+#include <ctype.h>
+
+#ifndef max
+#define max(A, B) ((A) > (B) ? (A) : (B))
+#define min(A, B) ((A) < (B) ? (A) : (B))
+#define abs(X) ((X) < 0 ? -(X) : (X))
+#endif
+
+/* Number of pt per inch (from the TeXbook). */
+
+#define PT_PER_INCH 72.27
+
+/* Non-zero if face attribute ATTR is unspecified. */
+
+#define UNSPECIFIEDP(ATTR) EQ ((ATTR), Qunspecified)
+
+/* Value is the number of elements of VECTOR. */
+
+#define DIM(VECTOR) (sizeof (VECTOR) / sizeof *(VECTOR))
+
+/* Make a copy of string S on the stack using alloca. Value is a pointer
+ to the copy. */
+
+#define STRDUPA(S) strcpy ((char *) alloca (strlen ((S)) + 1), (S))
+
+/* Make a copy of the contents of Lisp string S on the stack using
+ alloca. Value is a pointer to the copy. */
+
+#define LSTRDUPA(S) STRDUPA (XSTRING ((S))->data)
+
+/* Size of hash table of realized faces in face caches (should be a
+ prime number). */
+
+#define FACE_CACHE_BUCKETS_SIZE 1001
+
+/* A definition of XColor for non-X frames. */
+
+#ifndef HAVE_X_WINDOWS
+
+typedef struct
+{
+ unsigned long pixel;
+ unsigned short red, green, blue;
+ char flags;
+ char pad;
+}
+XColor;
+
+#endif /* not HAVE_X_WINDOWS */
+
+/* Keyword symbols used for face attribute names. */
+
+Lisp_Object QCfamily, QCheight, QCweight, QCslant, QCunderline;
+Lisp_Object QCinverse_video, QCforeground, QCbackground, QCstipple;
+Lisp_Object QCwidth, QCfont, QCbold, QCitalic;
+Lisp_Object QCreverse_video;
+Lisp_Object QCoverline, QCstrike_through, QCbox, QCinherit;
+
+/* Symbols used for attribute values. */
+
+Lisp_Object Qnormal, Qbold, Qultra_light, Qextra_light, Qlight;
+Lisp_Object Qsemi_light, Qsemi_bold, Qextra_bold, Qultra_bold;
+Lisp_Object Qoblique, Qitalic, Qreverse_oblique, Qreverse_italic;
+Lisp_Object Qultra_condensed, Qextra_condensed, Qcondensed;
+Lisp_Object Qsemi_condensed, Qsemi_expanded, Qexpanded, Qextra_expanded;
+Lisp_Object Qultra_expanded;
+Lisp_Object Qreleased_button, Qpressed_button;
+Lisp_Object QCstyle, QCcolor, QCline_width;
+Lisp_Object Qunspecified;
+
+char unspecified_fg[] = "unspecified-fg", unspecified_bg[] = "unspecified-bg";
+
+/* The name of the function to call when the background of the frame
+ has changed, frame_update_face_colors. */
+
+Lisp_Object Qframe_update_face_colors;
+
+/* Names of basic faces. */
+
+Lisp_Object Qdefault, Qtool_bar, Qregion, Qfringe;
+Lisp_Object Qheader_line, Qscroll_bar, Qcursor, Qborder, Qmouse, Qmenu;
+extern Lisp_Object Qmode_line;
+
+/* The symbol `face-alias'. A symbols having that property is an
+ alias for another face. Value of the property is the name of
+ the aliased face. */
+
+Lisp_Object Qface_alias;
+
+/* Names of frame parameters related to faces. */
+
+extern Lisp_Object Qscroll_bar_foreground, Qscroll_bar_background;
+extern Lisp_Object Qborder_color, Qcursor_color, Qmouse_color;
+
+/* Default stipple pattern used on monochrome displays. This stipple
+ pattern is used on monochrome displays instead of shades of gray
+ for a face background color. See `set-face-stipple' for possible
+ values for this variable. */
+
+Lisp_Object Vface_default_stipple;
+
+/* Alist of alternative font families. Each element is of the form
+ (FAMILY FAMILY1 FAMILY2 ...). If fonts of FAMILY can't be loaded,
+ try FAMILY1, then FAMILY2, ... */
+
+Lisp_Object Vface_alternative_font_family_alist;
+
+/* Alist of alternative font registries. Each element is of the form
+ (REGISTRY REGISTRY1 REGISTRY2...). If fonts of REGISTRY can't be
+ loaded, try REGISTRY1, then REGISTRY2, ... */
+
+Lisp_Object Vface_alternative_font_registry_alist;
+
+/* Allowed scalable fonts. A value of nil means don't allow any
+ scalable fonts. A value of t means allow the use of any scalable
+ font. Otherwise, value must be a list of regular expressions. A
+ font may be scaled if its name matches a regular expression in the
+ list. */
+
+Lisp_Object Vscalable_fonts_allowed, Qscalable_fonts_allowed;
+
+/* List of regular expressions that matches names of fonts to ignore. */
+
+Lisp_Object Vface_ignored_fonts;
+
+/* Maximum number of fonts to consider in font_list. If not an
+ integer > 0, DEFAULT_FONT_LIST_LIMIT is used instead. */
+
+Lisp_Object Vfont_list_limit;
+#define DEFAULT_FONT_LIST_LIMIT 100
+
+/* The symbols `foreground-color' and `background-color' which can be
+ used as part of a `face' property. This is for compatibility with
+ Emacs 20.2. */
+
+Lisp_Object Qforeground_color, Qbackground_color;
+
+/* The symbols `face' and `mouse-face' used as text properties. */
+
+Lisp_Object Qface;
+extern Lisp_Object Qmouse_face;
+
+/* Error symbol for wrong_type_argument in load_pixmap. */
+
+Lisp_Object Qbitmap_spec_p;
+
+/* Alist of global face definitions. Each element is of the form
+ (FACE . LFACE) where FACE is a symbol naming a face and LFACE
+ is a Lisp vector of face attributes. These faces are used
+ to initialize faces for new frames. */
+
+Lisp_Object Vface_new_frame_defaults;
+
+/* The next ID to assign to Lisp faces. */
+
+static int next_lface_id;
+
+/* A vector mapping Lisp face Id's to face names. */
+
+static Lisp_Object *lface_id_to_name;
+static int lface_id_to_name_size;
+
+/* TTY color-related functions (defined in tty-colors.el). */
+
+Lisp_Object Qtty_color_desc, Qtty_color_by_index;
+
+/* The name of the function used to compute colors on TTYs. */
+
+Lisp_Object Qtty_color_alist;
+
+/* An alist of defined terminal colors and their RGB values. */
+
+Lisp_Object Vtty_defined_color_alist;
+
+/* Counter for calls to clear_face_cache. If this counter reaches
+ CLEAR_FONT_TABLE_COUNT, and a frame has more than
+ CLEAR_FONT_TABLE_NFONTS fonts loaded, unused fonts are freed. */
+
+static int clear_font_table_count;
+#define CLEAR_FONT_TABLE_COUNT 100
+#define CLEAR_FONT_TABLE_NFONTS 10
+
+/* Non-zero means face attributes have been changed since the last
+ redisplay. Used in redisplay_internal. */
+
+int face_change_count;
+
+/* Non-zero means don't display bold text if a face's foreground
+ and background colors are the inverse of the default colors of the
+ display. This is a kluge to suppress `bold black' foreground text
+ which is hard to read on an LCD monitor. */
+
+int tty_suppress_bold_inverse_default_colors_p;
+
+/* A list of the form `((x . y))' used to avoid consing in
+ Finternal_set_lisp_face_attribute. */
+
+static Lisp_Object Vparam_value_alist;
+
+/* The total number of colors currently allocated. */
+
+#if GLYPH_DEBUG
+static int ncolors_allocated;
+static int npixmaps_allocated;
+static int ngcs;
+#endif
+
+/* Non-zero means the definition of the `menu' face for new frames has
+ been changed. */
+
+int menu_face_changed_default;
+
+
+/* Function prototypes. */
+
+struct font_name;
+struct table_entry;
+
+static void map_tty_color P_ ((struct frame *, struct face *,
+ enum lface_attribute_index, int *));
+static Lisp_Object resolve_face_name P_ ((Lisp_Object));
+static int may_use_scalable_font_p P_ ((char *));
+static void set_font_frame_param P_ ((Lisp_Object, Lisp_Object));
+static int better_font_p P_ ((int *, struct font_name *, struct font_name *,
+ int, int));
+static int x_face_list_fonts P_ ((struct frame *, char *,
+ struct font_name *, int, int));
+static int font_scalable_p P_ ((struct font_name *));
+static int get_lface_attributes P_ ((struct frame *, Lisp_Object, Lisp_Object *, int));
+static int load_pixmap P_ ((struct frame *, Lisp_Object, unsigned *, unsigned *));
+static unsigned char *xstrlwr P_ ((unsigned char *));
+static void signal_error P_ ((char *, Lisp_Object));
+static struct frame *frame_or_selected_frame P_ ((Lisp_Object, int));
+static void load_face_font P_ ((struct frame *, struct face *, int));
+static void load_face_colors P_ ((struct frame *, struct face *, Lisp_Object *));
+static void free_face_colors P_ ((struct frame *, struct face *));
+static int face_color_gray_p P_ ((struct frame *, char *));
+static char *build_font_name P_ ((struct font_name *));
+static void free_font_names P_ ((struct font_name *, int));
+static int sorted_font_list P_ ((struct frame *, char *,
+ int (*cmpfn) P_ ((const void *, const void *)),
+ struct font_name **));
+static int font_list_1 P_ ((struct frame *, Lisp_Object, Lisp_Object,
+ Lisp_Object, struct font_name **));
+static int font_list P_ ((struct frame *, Lisp_Object, Lisp_Object,
+ Lisp_Object, struct font_name **));
+static int try_font_list P_ ((struct frame *, Lisp_Object *,
+ Lisp_Object, Lisp_Object, struct font_name **));
+static int try_alternative_families P_ ((struct frame *f, Lisp_Object,
+ Lisp_Object, struct font_name **));
+static int cmp_font_names P_ ((const void *, const void *));
+static struct face *realize_face P_ ((struct face_cache *, Lisp_Object *, int,
+ struct face *, int));
+static struct face *realize_x_face P_ ((struct face_cache *,
+ Lisp_Object *, int, struct face *));
+static struct face *realize_tty_face P_ ((struct face_cache *,
+ Lisp_Object *, int));
+static int realize_basic_faces P_ ((struct frame *));
+static int realize_default_face P_ ((struct frame *));
+static void realize_named_face P_ ((struct frame *, Lisp_Object, int));
+static int lface_fully_specified_p P_ ((Lisp_Object *));
+static int lface_equal_p P_ ((Lisp_Object *, Lisp_Object *));
+static unsigned hash_string_case_insensitive P_ ((Lisp_Object));
+static unsigned lface_hash P_ ((Lisp_Object *));
+static int lface_same_font_attributes_p P_ ((Lisp_Object *, Lisp_Object *));
+static struct face_cache *make_face_cache P_ ((struct frame *));
+static void free_realized_face P_ ((struct frame *, struct face *));
+static void clear_face_gcs P_ ((struct face_cache *));
+static void free_face_cache P_ ((struct face_cache *));
+static int face_numeric_weight P_ ((Lisp_Object));
+static int face_numeric_slant P_ ((Lisp_Object));
+static int face_numeric_swidth P_ ((Lisp_Object));
+static int face_fontset P_ ((Lisp_Object *));
+static char *choose_face_font P_ ((struct frame *, Lisp_Object *, int, int));
+static void merge_face_vectors P_ ((struct frame *, Lisp_Object *, Lisp_Object*, Lisp_Object));
+static void merge_face_inheritance P_ ((struct frame *f, Lisp_Object,
+ Lisp_Object *, Lisp_Object));
+static void merge_face_vector_with_property P_ ((struct frame *, Lisp_Object *,
+ Lisp_Object));
+static int set_lface_from_font_name P_ ((struct frame *, Lisp_Object,
+ Lisp_Object, int, int));
+static Lisp_Object lface_from_face_name P_ ((struct frame *, Lisp_Object, int));
+static struct face *make_realized_face P_ ((Lisp_Object *));
+static void free_realized_faces P_ ((struct face_cache *));
+static char *best_matching_font P_ ((struct frame *, Lisp_Object *,
+ struct font_name *, int, int));
+static void cache_face P_ ((struct face_cache *, struct face *, unsigned));
+static void uncache_face P_ ((struct face_cache *, struct face *));
+static int xlfd_numeric_slant P_ ((struct font_name *));
+static int xlfd_numeric_weight P_ ((struct font_name *));
+static int xlfd_numeric_swidth P_ ((struct font_name *));
+static Lisp_Object xlfd_symbolic_slant P_ ((struct font_name *));
+static Lisp_Object xlfd_symbolic_weight P_ ((struct font_name *));
+static Lisp_Object xlfd_symbolic_swidth P_ ((struct font_name *));
+static int xlfd_fixed_p P_ ((struct font_name *));
+static int xlfd_numeric_value P_ ((struct table_entry *, int, struct font_name *,
+ int, int));
+static Lisp_Object xlfd_symbolic_value P_ ((struct table_entry *, int,
+ struct font_name *, int,
+ Lisp_Object));
+static struct table_entry *xlfd_lookup_field_contents P_ ((struct table_entry *, int,
+ struct font_name *, int));
+
+#ifdef HAVE_WINDOW_SYSTEM
+
+static int split_font_name P_ ((struct frame *, struct font_name *, int));
+static int xlfd_point_size P_ ((struct frame *, struct font_name *));
+static void sort_fonts P_ ((struct frame *, struct font_name *, int,
+ int (*cmpfn) P_ ((const void *, const void *))));
+static GC x_create_gc P_ ((struct frame *, unsigned long, XGCValues *));
+static void x_free_gc P_ ((struct frame *, GC));
+static void clear_font_table P_ ((struct x_display_info *));
+
+#ifdef WINDOWSNT
+extern Lisp_Object w32_list_fonts P_ ((struct frame *, Lisp_Object, int, int));
+#endif /* WINDOWSNT */
+
+#ifdef USE_X_TOOLKIT
+static void x_update_menu_appearance P_ ((struct frame *));
+#endif /* USE_X_TOOLKIT */
+
+#endif /* HAVE_WINDOW_SYSTEM */
+
+
+/***********************************************************************
+ Utilities
+ ***********************************************************************/
+
+#ifdef HAVE_X_WINDOWS
+
+#ifdef DEBUG_X_COLORS
+
+/* The following is a poor man's infrastructure for debugging X color
+ allocation problems on displays with PseudoColor-8. Some X servers
+ like 3.3.5 XF86_SVGA with Matrox cards apparently don't implement
+ color reference counts completely so that they don't signal an
+ error when a color is freed whose reference count is already 0.
+ Other X servers do. To help me debug this, the following code
+ implements a simple reference counting schema of its own, for a
+ single display/screen. --gerd. */
+
+/* Reference counts for pixel colors. */
+
+int color_count[256];
+
+/* Register color PIXEL as allocated. */
+
+void
+register_color (pixel)
+ unsigned long pixel;
+{
+ xassert (pixel < 256);
+ ++color_count[pixel];
+}
+
+
+/* Register color PIXEL as deallocated. */
+
+void
+unregister_color (pixel)
+ unsigned long pixel;
+{
+ xassert (pixel < 256);
+ if (color_count[pixel] > 0)
+ --color_count[pixel];
+ else
+ abort ();
+}
+
+
+/* Register N colors from PIXELS as deallocated. */
+
+void
+unregister_colors (pixels, n)
+ unsigned long *pixels;
+ int n;
+{
+ int i;
+ for (i = 0; i < n; ++i)
+ unregister_color (pixels[i]);
+}
+
+
+DEFUN ("dump-colors", Fdump_colors, Sdump_colors, 0, 0, 0,
+ "Dump currently allocated colors and their reference counts to stderr.")
+ ()
+{
+ int i, n;
+
+ fputc ('\n', stderr);
+
+ for (i = n = 0; i < sizeof color_count / sizeof color_count[0]; ++i)
+ if (color_count[i])
+ {
+ fprintf (stderr, "%3d: %5d", i, color_count[i]);
+ ++n;
+ if (n % 5 == 0)
+ fputc ('\n', stderr);
+ else
+ fputc ('\t', stderr);
+ }
+
+ if (n % 5 != 0)
+ fputc ('\n', stderr);
+ return Qnil;
+}
+
+#endif /* DEBUG_X_COLORS */
+
+
+/* Free colors used on frame F. PIXELS is an array of NPIXELS pixel
+ color values. Interrupt input must be blocked when this function
+ is called. */
+
+void
+x_free_colors (f, pixels, npixels)
+ struct frame *f;
+ unsigned long *pixels;
+ int npixels;
+{
+ int class = FRAME_X_DISPLAY_INFO (f)->visual->class;
+
+ /* If display has an immutable color map, freeing colors is not
+ necessary and some servers don't allow it. So don't do it. */
+ if (class != StaticColor && class != StaticGray && class != TrueColor)
+ {
+#ifdef DEBUG_X_COLORS
+ unregister_colors (pixels, npixels);
+#endif
+ XFreeColors (FRAME_X_DISPLAY (f), FRAME_X_COLORMAP (f),
+ pixels, npixels, 0);
+ }
+}
+
+
+/* Free colors in colormap CMAP on display DPY, screen SCREEN.  PIXELS
+   is an array of NPIXELS pixel color values.  Interrupt input must be
+   blocked when this function is called.  */
+
+void
+x_free_dpy_colors (dpy, screen, cmap, pixels, npixels)
+ Display *dpy;
+ Screen *screen;
+ Colormap cmap;
+ unsigned long *pixels;
+ int npixels;
+{
+ struct x_display_info *dpyinfo = x_display_info_for_display (dpy);
+ int class = dpyinfo->visual->class;
+
+ /* If display has an immutable color map, freeing colors is not
+ necessary and some servers don't allow it. So don't do it. */
+ if (class != StaticColor && class != StaticGray && class != TrueColor)
+ {
+#ifdef DEBUG_X_COLORS
+ unregister_colors (pixels, npixels);
+#endif
+ XFreeColors (dpy, cmap, pixels, npixels, 0);
+ }
+}
+
+
+/* Create and return a GC for use on frame F. GC values and mask
+ are given by XGCV and MASK. */
+
+static INLINE GC
+x_create_gc (f, mask, xgcv)
+ struct frame *f;
+ unsigned long mask;
+ XGCValues *xgcv;
+{
+ GC gc;
+ BLOCK_INPUT;
+ gc = XCreateGC (FRAME_X_DISPLAY (f), FRAME_X_WINDOW (f), mask, xgcv);
+ UNBLOCK_INPUT;
+ IF_DEBUG (++ngcs);
+ return gc;
+}
+
+
+/* Free GC which was used on frame F. */
+
+static INLINE void
+x_free_gc (f, gc)
+ struct frame *f;
+ GC gc;
+{
+ BLOCK_INPUT;
+ xassert (--ngcs >= 0);
+ XFreeGC (FRAME_X_DISPLAY (f), gc);
+ UNBLOCK_INPUT;
+}
+
+#endif /* HAVE_X_WINDOWS */
+
+#ifdef WINDOWSNT
+/* W32 emulation of GCs */
+
+static INLINE GC
+x_create_gc (f, mask, xgcv)
+ struct frame *f;
+ unsigned long mask;
+ XGCValues *xgcv;
+{
+ GC gc;
+ BLOCK_INPUT;
+ gc = XCreateGC (NULL, FRAME_W32_WINDOW (f), mask, xgcv);
+ UNBLOCK_INPUT;
+ IF_DEBUG (++ngcs);
+ return gc;
+}
+
+
+/* Free GC which was used on frame F. */
+
+static INLINE void
+x_free_gc (f, gc)
+ struct frame *f;
+ GC gc;
+{
+ BLOCK_INPUT;
+ xassert (--ngcs >= 0);
+ xfree (gc);
+ UNBLOCK_INPUT;
+}
+
+#endif /* WINDOWSNT */
+
+/* Like stricmp. Used to compare parts of font names which are in
+ ISO8859-1. */
+
+int
+xstricmp (s1, s2)
+ unsigned char *s1, *s2;
+{
+ while (*s1 && *s2)
+ {
+ unsigned char c1 = tolower (*s1);
+ unsigned char c2 = tolower (*s2);
+ if (c1 != c2)
+ return c1 < c2 ? -1 : 1;
+ ++s1, ++s2;
+ }
+
+ if (*s1 == 0)
+ return *s2 == 0 ? 0 : -1;
+ return 1;
+}
+
+
+/* Like strlwr, which might not always be available. */
+
+static unsigned char *
+xstrlwr (s)
+ unsigned char *s;
+{
+ unsigned char *p = s;
+
+ for (p = s; *p; ++p)
+ *p = tolower (*p);
+
+ return s;
+}
+
+
+/* Signal `error' with message S, and additional argument ARG. */
+
+static void
+signal_error (s, arg)
+ char *s;
+ Lisp_Object arg;
+{
+ Fsignal (Qerror, Fcons (build_string (s), Fcons (arg, Qnil)));
+}
+
+
+/* If FRAME is nil, return a pointer to the selected frame.
+ Otherwise, check that FRAME is a live frame, and return a pointer
+ to it. NPARAM is the parameter number of FRAME, for
+ CHECK_LIVE_FRAME. This is here because it's a frequent pattern in
+ Lisp function definitions. */
+
+static INLINE struct frame *
+frame_or_selected_frame (frame, nparam)
+ Lisp_Object frame;
+ int nparam;
+{
+ if (NILP (frame))
+ frame = selected_frame;
+
+ CHECK_LIVE_FRAME (frame, nparam);
+ return XFRAME (frame);
+}
+
+
+/***********************************************************************
+ Frames and faces
+ ***********************************************************************/
+
+/* Initialize face cache and basic faces for frame F. */
+
+void
+init_frame_faces (f)
+ struct frame *f;
+{
+ /* Make a face cache, if F doesn't have one. */
+ if (FRAME_FACE_CACHE (f) == NULL)
+ FRAME_FACE_CACHE (f) = make_face_cache (f);
+
+#ifdef HAVE_WINDOW_SYSTEM
+ /* Make the image cache. */
+ if (FRAME_WINDOW_P (f))
+ {
+ if (FRAME_X_IMAGE_CACHE (f) == NULL)
+ FRAME_X_IMAGE_CACHE (f) = make_image_cache ();
+ ++FRAME_X_IMAGE_CACHE (f)->refcount;
+ }
+#endif /* HAVE_WINDOW_SYSTEM */
+
+ /* Realize basic faces. Must have enough information in frame
+ parameters to realize basic faces at this point. */
+#ifdef HAVE_X_WINDOWS
+ if (!FRAME_X_P (f) || FRAME_X_WINDOW (f))
+#endif
+#ifdef WINDOWSNT
+ if (!FRAME_WINDOW_P (f) || FRAME_W32_WINDOW (f))
+#endif
+ if (!realize_basic_faces (f))
+ abort ();
+}
+
+
+/* Free face cache of frame F. Called from Fdelete_frame. */
+
+void
+free_frame_faces (f)
+ struct frame *f;
+{
+ struct face_cache *face_cache = FRAME_FACE_CACHE (f);
+
+ if (face_cache)
+ {
+ free_face_cache (face_cache);
+ FRAME_FACE_CACHE (f) = NULL;
+ }
+
+#ifdef HAVE_WINDOW_SYSTEM
+ if (FRAME_WINDOW_P (f))
+ {
+ struct image_cache *image_cache = FRAME_X_IMAGE_CACHE (f);
+ if (image_cache)
+ {
+ --image_cache->refcount;
+ if (image_cache->refcount == 0)
+ free_image_cache (f);
+ }
+ }
+#endif /* HAVE_WINDOW_SYSTEM */
+}
+
+
+/* Clear face caches, and recompute basic faces for frame F. Call
+ this after changing frame parameters on which those faces depend,
+ or when realized faces have been freed due to changing attributes
+ of named faces. */
+
+void
+recompute_basic_faces (f)
+ struct frame *f;
+{
+ if (FRAME_FACE_CACHE (f))
+ {
+ clear_face_cache (0);
+ if (!realize_basic_faces (f))
+ abort ();
+ }
+}
+
+
+/* Clear the face caches of all frames. CLEAR_FONTS_P non-zero means
+ try to free unused fonts, too. */
+
+void
+clear_face_cache (clear_fonts_p)
+ int clear_fonts_p;
+{
+#ifdef HAVE_WINDOW_SYSTEM
+ Lisp_Object tail, frame;
+ struct frame *f;
+
+ if (clear_fonts_p
+ || ++clear_font_table_count == CLEAR_FONT_TABLE_COUNT)
+ {
+ struct x_display_info *dpyinfo;
+
+ /* Fonts are common for frames on one display, i.e. on
+ one X screen. */
+ for (dpyinfo = x_display_list; dpyinfo; dpyinfo = dpyinfo->next)
+ if (dpyinfo->n_fonts > CLEAR_FONT_TABLE_NFONTS)
+ clear_font_table (dpyinfo);
+
+ /* From time to time see if we can unload some fonts. This also
+ frees all realized faces on all frames. Fonts needed by
+ faces will be loaded again when faces are realized again. */
+ clear_font_table_count = 0;
+
+ FOR_EACH_FRAME (tail, frame)
+ {
+ struct frame *f = XFRAME (frame);
+ if (FRAME_WINDOW_P (f)
+ && FRAME_X_DISPLAY_INFO (f)->n_fonts > CLEAR_FONT_TABLE_NFONTS)
+ free_all_realized_faces (frame);
+ }
+ }
+ else
+ {
+ /* Clear GCs of realized faces. */
+ FOR_EACH_FRAME (tail, frame)
+ {
+ f = XFRAME (frame);
+ if (FRAME_WINDOW_P (f))
+ {
+ clear_face_gcs (FRAME_FACE_CACHE (f));
+ clear_image_cache (f, 0);
+ }
+ }
+ }
+#endif /* HAVE_WINDOW_SYSTEM */
+}
+
+
+DEFUN ("clear-face-cache", Fclear_face_cache, Sclear_face_cache, 0, 1, 0,
+ "Clear face caches on all frames.\n\
+Optional THOROUGHLY non-nil means try to free unused fonts, too.")
+ (thoroughly)
+ Lisp_Object thoroughly;
+{
+ clear_face_cache (!NILP (thoroughly));
+ ++face_change_count;
+ ++windows_or_buffers_changed;
+ return Qnil;
+}
+
+
+
+#ifdef HAVE_WINDOW_SYSTEM
+
+
+/* Remove fonts from the font table of DPYINFO except for the default
+ ASCII fonts of frames on that display. Called from clear_face_cache
+ from time to time. */
+
+static void
+clear_font_table (dpyinfo)
+ struct x_display_info *dpyinfo;
+{
+ int i;
+
+ /* Free those fonts that are not used by frames on DPYINFO. */
+ for (i = 0; i < dpyinfo->n_fonts; ++i)
+ {
+ struct font_info *font_info = dpyinfo->font_table + i;
+ Lisp_Object tail, frame;
+
+ /* Check if slot is already free. */
+ if (font_info->name == NULL)
+ continue;
+
+ /* Don't free a default font of some frame on this display. */
+ FOR_EACH_FRAME (tail, frame)
+ {
+ struct frame *f = XFRAME (frame);
+ if (FRAME_WINDOW_P (f)
+ && FRAME_X_DISPLAY_INFO (f) == dpyinfo
+ && font_info->font == FRAME_FONT (f))
+ break;
+ }
+
+ if (!NILP (tail))
+ continue;
+
+ /* Free names. */
+ if (font_info->full_name != font_info->name)
+ xfree (font_info->full_name);
+ xfree (font_info->name);
+
+ /* Free the font. */
+ BLOCK_INPUT;
+#ifdef HAVE_X_WINDOWS
+ XFreeFont (dpyinfo->display, font_info->font);
+#endif
+#ifdef WINDOWSNT
+ w32_unload_font (dpyinfo, font_info->font);
+#endif
+ UNBLOCK_INPUT;
+
+ /* Mark font table slot free. */
+ font_info->font = NULL;
+ font_info->name = font_info->full_name = NULL;
+ }
+}
+
+#endif /* HAVE_WINDOW_SYSTEM */
+
+
+
+/***********************************************************************
+ X Pixmaps
+ ***********************************************************************/
+
+#ifdef HAVE_WINDOW_SYSTEM
+
+DEFUN ("bitmap-spec-p", Fbitmap_spec_p, Sbitmap_spec_p, 1, 1, 0,
+ "Value is non-nil if OBJECT is a valid bitmap specification.\n\
+A bitmap specification is either a string (a file name) or a list\n\
+(WIDTH HEIGHT DATA) where WIDTH is the pixel width of the bitmap,\n\
+HEIGHT is its height, and DATA is a string containing the bits of\n\
+the pixmap.  Bits are stored row by row, each row occupying\n\
+(WIDTH + 7)/8 bytes.")
+ (object)
+ Lisp_Object object;
+{
+ int pixmap_p = 0;
+
+ if (STRINGP (object))
+ /* If OBJECT is a string, it's a file name. */
+ pixmap_p = 1;
+ else if (CONSP (object))
+ {
+ /* Otherwise OBJECT must be (WIDTH HEIGHT DATA), WIDTH and
+ HEIGHT must be integers > 0, and DATA must be string large
+ enough to hold a bitmap of the specified size. */
+ Lisp_Object width, height, data;
+
+ height = width = data = Qnil;
+
+ if (CONSP (object))
+ {
+ width = XCAR (object);
+ object = XCDR (object);
+ if (CONSP (object))
+ {
+ height = XCAR (object);
+ object = XCDR (object);
+ if (CONSP (object))
+ data = XCAR (object);
+ }
+ }
+
+ if (NATNUMP (width) && NATNUMP (height) && STRINGP (data))
+ {
+ int bytes_per_row = ((XFASTINT (width) + BITS_PER_CHAR - 1)
+ / BITS_PER_CHAR);
+ if (STRING_BYTES (XSTRING (data)) >= bytes_per_row * XINT (height))
+ pixmap_p = 1;
+ }
+ }
+
+ return pixmap_p ? Qt : Qnil;
+}
+
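+/* Illustrative example (added commentary, not in the original source):
+   assuming BITS_PER_CHAR is 8, a 9x2 bitmap needs (9 + 7)/8 = 2 bytes
+   per row, so the spec '(9 2 "\xff\x01\xff\x01"), whose DATA string is
+   4 bytes long, satisfies bitmap-spec-p, as does any file name given
+   as a string.  */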
+
+/* Load a bitmap according to NAME (which is either a file name or a
+ pixmap spec) for use on frame F. Value is the bitmap_id (see
+ xfns.c). If NAME is nil, return with a bitmap id of zero. If
+ bitmap cannot be loaded, display a message saying so, and return
+ zero. Store the bitmap width in *W_PTR and its height in *H_PTR,
+ if these pointers are not null. */
+
+static int
+load_pixmap (f, name, w_ptr, h_ptr)
+ FRAME_PTR f;
+ Lisp_Object name;
+ unsigned int *w_ptr, *h_ptr;
+{
+ int bitmap_id;
+ Lisp_Object tem;
+
+ if (NILP (name))
+ return 0;
+
+ tem = Fbitmap_spec_p (name);
+ if (NILP (tem))
+ wrong_type_argument (Qbitmap_spec_p, name);
+
+ BLOCK_INPUT;
+ if (CONSP (name))
+ {
+ /* Decode a bitmap spec into a bitmap. */
+
+ int h, w;
+ Lisp_Object bits;
+
+ w = XINT (Fcar (name));
+ h = XINT (Fcar (Fcdr (name)));
+ bits = Fcar (Fcdr (Fcdr (name)));
+
+ bitmap_id = x_create_bitmap_from_data (f, XSTRING (bits)->data,
+ w, h);
+ }
+ else
+ {
+ /* It must be a string -- a file name. */
+ bitmap_id = x_create_bitmap_from_file (f, name);
+ }
+ UNBLOCK_INPUT;
+
+ if (bitmap_id < 0)
+ {
+ add_to_log ("Invalid or undefined bitmap %s", name, Qnil);
+ bitmap_id = 0;
+
+ if (w_ptr)
+ *w_ptr = 0;
+ if (h_ptr)
+ *h_ptr = 0;
+ }
+ else
+ {
+#if GLYPH_DEBUG
+ ++npixmaps_allocated;
+#endif
+ if (w_ptr)
+ *w_ptr = x_bitmap_width (f, bitmap_id);
+
+ if (h_ptr)
+ *h_ptr = x_bitmap_height (f, bitmap_id);
+ }
+
+ return bitmap_id;
+}
+
+#endif /* HAVE_WINDOW_SYSTEM */
+
+
+
+/***********************************************************************
+ Minimum font bounds
+ ***********************************************************************/
+
+#ifdef HAVE_WINDOW_SYSTEM
+
+/* Update the line_height of frame F. Return non-zero if line height
+ changes. */
+
+int
+frame_update_line_height (f)
+ struct frame *f;
+{
+ int line_height, changed_p;
+
+ line_height = FONT_HEIGHT (FRAME_FONT (f));
+ changed_p = line_height != FRAME_LINE_HEIGHT (f);
+ FRAME_LINE_HEIGHT (f) = line_height;
+ return changed_p;
+}
+
+#endif /* HAVE_WINDOW_SYSTEM */
+
+
+/***********************************************************************
+ Fonts
+ ***********************************************************************/
+
+#ifdef HAVE_WINDOW_SYSTEM
+
+/* Load font of face FACE which is used on frame F to display
+ character C. The name of the font to load is determined by lface
+ and fontset of FACE. */
+
+static void
+load_face_font (f, face, c)
+ struct frame *f;
+ struct face *face;
+ int c;
+{
+ struct font_info *font_info = NULL;
+ char *font_name;
+
+ face->font_info_id = -1;
+ face->font = NULL;
+
+ font_name = choose_face_font (f, face->lface, face->fontset, c);
+ if (!font_name)
+ return;
+
+ BLOCK_INPUT;
+ font_info = FS_LOAD_FACE_FONT (f, c, font_name, face);
+ UNBLOCK_INPUT;
+
+ if (font_info)
+ {
+ face->font_info_id = font_info->font_idx;
+ face->font = font_info->font;
+ face->font_name = font_info->full_name;
+ if (face->gc)
+ {
+ x_free_gc (f, face->gc);
+ face->gc = 0;
+ }
+ }
+ else
+ add_to_log ("Unable to load font %s",
+ build_string (font_name), Qnil);
+ xfree (font_name);
+}
+
+#endif /* HAVE_WINDOW_SYSTEM */
+
+
+
+/***********************************************************************
+ X Colors
+ ***********************************************************************/
+
+/* A version of defined_color for non-X frames. */
+
+int
+tty_defined_color (f, color_name, color_def, alloc)
+ struct frame *f;
+ char *color_name;
+ XColor *color_def;
+ int alloc;
+{
+ Lisp_Object color_desc;
+ unsigned long color_idx = FACE_TTY_DEFAULT_COLOR;
+ unsigned long red = 0, green = 0, blue = 0;
+ int status = 1;
+
+ if (*color_name && !NILP (Ffboundp (Qtty_color_desc)))
+ {
+ Lisp_Object frame;
+
+ XSETFRAME (frame, f);
+ status = 0;
+ color_desc = call2 (Qtty_color_desc, build_string (color_name), frame);
+ if (CONSP (color_desc) && CONSP (XCDR (color_desc)))
+ {
+ color_idx = XINT (XCAR (XCDR (color_desc)));
+ if (CONSP (XCDR (XCDR (color_desc))))
+ {
+ red = XINT (XCAR (XCDR (XCDR (color_desc))));
+ green = XINT (XCAR (XCDR (XCDR (XCDR (color_desc)))));
+ blue = XINT (XCAR (XCDR (XCDR (XCDR (XCDR (color_desc))))));
+ }
+ status = 1;
+ }
+ else if (NILP (Fsymbol_value (intern ("tty-defined-color-alist"))))
+ /* We were called early during startup, and the colors are not
+ yet set up in tty-defined-color-alist. Don't return a failure
+ indication, since this produces the annoying "Unable to
+ load color" messages in the *Messages* buffer. */
+ status = 1;
+ }
+ if (color_idx == FACE_TTY_DEFAULT_COLOR && *color_name)
+ {
+ if (strcmp (color_name, "unspecified-fg") == 0)
+ color_idx = FACE_TTY_DEFAULT_FG_COLOR;
+ else if (strcmp (color_name, "unspecified-bg") == 0)
+ color_idx = FACE_TTY_DEFAULT_BG_COLOR;
+ }
+
+ if (color_idx != FACE_TTY_DEFAULT_COLOR)
+ status = 1;
+
+ color_def->pixel = color_idx;
+ color_def->red = red;
+ color_def->green = green;
+ color_def->blue = blue;
+
+ return status;
+}
+
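+/* Note (added commentary, not in the original source): the code above
+   expects tty-color-desc to return a list of the form (NAME INDEX) or
+   (NAME INDEX R G B), e.g. something like ("red" 1 65535 0 0); it uses
+   the index and, when present, the RGB components.  */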
+
+/* Decide if color named COLOR_NAME is valid for the display
+ associated with the frame F; if so, return the rgb values in
+ COLOR_DEF. If ALLOC is nonzero, allocate a new colormap cell.
+
+ This does the right thing for any type of frame. */
+
+int
+defined_color (f, color_name, color_def, alloc)
+ struct frame *f;
+ char *color_name;
+ XColor *color_def;
+ int alloc;
+{
+ if (!FRAME_WINDOW_P (f))
+ return tty_defined_color (f, color_name, color_def, alloc);
+#ifdef HAVE_X_WINDOWS
+ else if (FRAME_X_P (f))
+ return x_defined_color (f, color_name, color_def, alloc);
+#endif
+#ifdef WINDOWSNT
+ else if (FRAME_W32_P (f))
+ return w32_defined_color (f, color_name, color_def, alloc);
+#endif
+#ifdef macintosh
+ else if (FRAME_MAC_P (f))
+ return mac_defined_color (f, color_name, color_def, alloc);
+#endif
+ else
+ abort ();
+}
+
+
+/* Given the index IDX of a tty color on frame F, return its name, a
+ Lisp string. */
+
+Lisp_Object
+tty_color_name (f, idx)
+ struct frame *f;
+ int idx;
+{
+ if (idx >= 0 && !NILP (Ffboundp (Qtty_color_by_index)))
+ {
+ Lisp_Object frame;
+ Lisp_Object coldesc;
+
+ XSETFRAME (frame, f);
+ coldesc = call2 (Qtty_color_by_index, make_number (idx), frame);
+
+ if (!NILP (coldesc))
+ return XCAR (coldesc);
+ }
+#ifdef MSDOS
+ /* We can have an MSDOG frame under -nw for a short window of
+ opportunity before internal_terminal_init is called. DTRT. */
+ if (FRAME_MSDOS_P (f) && !inhibit_window_system)
+ return msdos_stdcolor_name (idx);
+#endif
+
+ if (idx == FACE_TTY_DEFAULT_FG_COLOR)
+ return build_string (unspecified_fg);
+ if (idx == FACE_TTY_DEFAULT_BG_COLOR)
+ return build_string (unspecified_bg);
+
+#ifdef WINDOWSNT
+ return vga_stdcolor_name (idx);
+#endif
+
+ return Qunspecified;
+}
+
+
+/* Return non-zero if COLOR_NAME is a shade of gray (or white or
+ black) on frame F. The algorithm is taken from 20.2 faces.el. */
+
+static int
+face_color_gray_p (f, color_name)
+ struct frame *f;
+ char *color_name;
+{
+ XColor color;
+ int gray_p;
+
+ if (defined_color (f, color_name, &color, 0))
+ gray_p = ((abs (color.red - color.green)
+ < max (color.red, color.green) / 20)
+ && (abs (color.green - color.blue)
+ < max (color.green, color.blue) / 20)
+ && (abs (color.blue - color.red)
+ < max (color.blue, color.red) / 20));
+ else
+ gray_p = 0;
+
+ return gray_p;
+}
+
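+/* Worked example (added commentary, not in the original source): a
+   color is treated as gray when each pair of its RGB components
+   differs by less than 1/20 of the larger of the two.  For instance,
+   with red = green = 0x8000 and blue = 0x7f00, the largest difference
+   is 0x100 = 256, well below 0x8000 / 20 = 1638, so the color counts
+   as gray.  */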
+
+/* Return non-zero if color COLOR_NAME can be displayed on frame F.
+ BACKGROUND_P non-zero means the color will be used as background
+ color. */
+
+static int
+face_color_supported_p (f, color_name, background_p)
+ struct frame *f;
+ char *color_name;
+ int background_p;
+{
+ Lisp_Object frame;
+ XColor not_used;
+
+ XSETFRAME (frame, f);
+ return (FRAME_WINDOW_P (f)
+ ? (!NILP (Fxw_display_color_p (frame))
+ || xstricmp (color_name, "black") == 0
+ || xstricmp (color_name, "white") == 0
+ || (background_p
+ && face_color_gray_p (f, color_name))
+ || (!NILP (Fx_display_grayscale_p (frame))
+ && face_color_gray_p (f, color_name)))
+ : tty_defined_color (f, color_name, &not_used, 0));
+}
+
+
+DEFUN ("color-gray-p", Fcolor_gray_p, Scolor_gray_p, 1, 2, 0,
+ "Return non-nil if COLOR is a shade of gray (or white or black).\n\
+FRAME specifies the frame and thus the display for interpreting COLOR.\n\
+If FRAME is nil or omitted, use the selected frame.")
+ (color, frame)
+ Lisp_Object color, frame;
+{
+ struct frame *f;
+
+ CHECK_FRAME (frame, 0);
+ CHECK_STRING (color, 0);
+ f = XFRAME (frame);
+ return face_color_gray_p (f, XSTRING (color)->data) ? Qt : Qnil;
+}
+
+
+DEFUN ("color-supported-p", Fcolor_supported_p,
+ Scolor_supported_p, 2, 3, 0,
+ "Return non-nil if COLOR can be displayed on FRAME.\n\
+BACKGROUND-P non-nil means COLOR is used as a background.\n\
+If FRAME is nil or omitted, use the selected frame.\n\
+COLOR must be a valid color name.")
+ (color, frame, background_p)
+ Lisp_Object frame, color, background_p;
+{
+ struct frame *f;
+
+ CHECK_FRAME (frame, 0);
+ CHECK_STRING (color, 0);
+ f = XFRAME (frame);
+ if (face_color_supported_p (f, XSTRING (color)->data, !NILP (background_p)))
+ return Qt;
+ return Qnil;
+}
+
+
+/* Load color with name NAME for use by face FACE on frame F.
+ TARGET_INDEX must be one of LFACE_FOREGROUND_INDEX,
+ LFACE_BACKGROUND_INDEX, LFACE_UNDERLINE_INDEX, LFACE_OVERLINE_INDEX,
+ LFACE_STRIKE_THROUGH_INDEX, or LFACE_BOX_INDEX. Value is the
+ pixel color. If color cannot be loaded, display a message, and
+ return the foreground, background or underline color of F, but
+ record that fact in flags of the face so that we don't try to free
+ these colors. */
+
+unsigned long
+load_color (f, face, name, target_index)
+ struct frame *f;
+ struct face *face;
+ Lisp_Object name;
+ enum lface_attribute_index target_index;
+{
+ XColor color;
+
+ xassert (STRINGP (name));
+ xassert (target_index == LFACE_FOREGROUND_INDEX
+ || target_index == LFACE_BACKGROUND_INDEX
+ || target_index == LFACE_UNDERLINE_INDEX
+ || target_index == LFACE_OVERLINE_INDEX
+ || target_index == LFACE_STRIKE_THROUGH_INDEX
+ || target_index == LFACE_BOX_INDEX);
+
+ /* if the color map is full, defined_color will return a best match
+ to the values in an existing cell. */
+ if (!defined_color (f, XSTRING (name)->data, &color, 1))
+ {
+ add_to_log ("Unable to load color \"%s\"", name, Qnil);
+
+ switch (target_index)
+ {
+ case LFACE_FOREGROUND_INDEX:
+ face->foreground_defaulted_p = 1;
+ color.pixel = FRAME_FOREGROUND_PIXEL (f);
+ break;
+
+ case LFACE_BACKGROUND_INDEX:
+ face->background_defaulted_p = 1;
+ color.pixel = FRAME_BACKGROUND_PIXEL (f);
+ break;
+
+ case LFACE_UNDERLINE_INDEX:
+ face->underline_defaulted_p = 1;
+ color.pixel = FRAME_FOREGROUND_PIXEL (f);
+ break;
+
+ case LFACE_OVERLINE_INDEX:
+ face->overline_color_defaulted_p = 1;
+ color.pixel = FRAME_FOREGROUND_PIXEL (f);
+ break;
+
+ case LFACE_STRIKE_THROUGH_INDEX:
+ face->strike_through_color_defaulted_p = 1;
+ color.pixel = FRAME_FOREGROUND_PIXEL (f);
+ break;
+
+ case LFACE_BOX_INDEX:
+ face->box_color_defaulted_p = 1;
+ color.pixel = FRAME_FOREGROUND_PIXEL (f);
+ break;
+
+ default:
+ abort ();
+ }
+ }
+#if GLYPH_DEBUG
+ else
+ ++ncolors_allocated;
+#endif
+
+ return color.pixel;
+}
+
+
+#ifdef HAVE_WINDOW_SYSTEM
+
+/* Load colors for face FACE which is used on frame F. Colors are
+ specified by slots LFACE_BACKGROUND_INDEX and LFACE_FOREGROUND_INDEX
+ of ATTRS. If the background color specified is not supported on F,
+ try to emulate gray colors with a stipple from Vface_default_stipple. */
+
+static void
+load_face_colors (f, face, attrs)
+ struct frame *f;
+ struct face *face;
+ Lisp_Object *attrs;
+{
+ Lisp_Object fg, bg;
+
+ bg = attrs[LFACE_BACKGROUND_INDEX];
+ fg = attrs[LFACE_FOREGROUND_INDEX];
+
+ /* Swap colors if face is inverse-video. */
+ if (EQ (attrs[LFACE_INVERSE_INDEX], Qt))
+ {
+ Lisp_Object tmp;
+ tmp = fg;
+ fg = bg;
+ bg = tmp;
+ }
+
+  /* Check support for BG as if it were a foreground color.  Had we
+     asked about background support, face_color_supported_p would
+     report grays as "supported" because we are supposed to emulate
+     them with a stipple; here we want to set up that stipple
+     ourselves.  */
+ if (!face_color_supported_p (f, XSTRING (bg)->data, 0)
+ && !NILP (Fbitmap_spec_p (Vface_default_stipple)))
+ {
+ x_destroy_bitmap (f, face->stipple);
+ face->stipple = load_pixmap (f, Vface_default_stipple,
+ &face->pixmap_w, &face->pixmap_h);
+ }
+
+ face->background = load_color (f, face, bg, LFACE_BACKGROUND_INDEX);
+ face->foreground = load_color (f, face, fg, LFACE_FOREGROUND_INDEX);
+}
+
+
+/* Free color PIXEL on frame F. */
+
+void
+unload_color (f, pixel)
+ struct frame *f;
+ unsigned long pixel;
+{
+#ifdef HAVE_X_WINDOWS
+ if (pixel != -1)
+ {
+ BLOCK_INPUT;
+ x_free_colors (f, &pixel, 1);
+ UNBLOCK_INPUT;
+ }
+#endif
+}
+
+
+/* Free colors allocated for FACE. */
+
+static void
+free_face_colors (f, face)
+ struct frame *f;
+ struct face *face;
+{
+#ifdef HAVE_X_WINDOWS
+ if (face->colors_copied_bitwise_p)
+ return;
+
+ BLOCK_INPUT;
+
+ if (!face->foreground_defaulted_p)
+ {
+ x_free_colors (f, &face->foreground, 1);
+ IF_DEBUG (--ncolors_allocated);
+ }
+
+ if (!face->background_defaulted_p)
+ {
+ x_free_colors (f, &face->background, 1);
+ IF_DEBUG (--ncolors_allocated);
+ }
+
+ if (face->underline_p
+ && !face->underline_defaulted_p)
+ {
+ x_free_colors (f, &face->underline_color, 1);
+ IF_DEBUG (--ncolors_allocated);
+ }
+
+ if (face->overline_p
+ && !face->overline_color_defaulted_p)
+ {
+ x_free_colors (f, &face->overline_color, 1);
+ IF_DEBUG (--ncolors_allocated);
+ }
+
+ if (face->strike_through_p
+ && !face->strike_through_color_defaulted_p)
+ {
+ x_free_colors (f, &face->strike_through_color, 1);
+ IF_DEBUG (--ncolors_allocated);
+ }
+
+ if (face->box != FACE_NO_BOX
+ && !face->box_color_defaulted_p)
+ {
+ x_free_colors (f, &face->box_color, 1);
+ IF_DEBUG (--ncolors_allocated);
+ }
+
+ UNBLOCK_INPUT;
+#endif /* HAVE_X_WINDOWS */
+}
+
+#endif /* HAVE_WINDOW_SYSTEM */
+
+
+
+/***********************************************************************
+ XLFD Font Names
+ ***********************************************************************/
+
+/* An enumerator for each field of an XLFD font name. */
+
+enum xlfd_field
+{
+ XLFD_FOUNDRY,
+ XLFD_FAMILY,
+ XLFD_WEIGHT,
+ XLFD_SLANT,
+ XLFD_SWIDTH,
+ XLFD_ADSTYLE,
+ XLFD_PIXEL_SIZE,
+ XLFD_POINT_SIZE,
+ XLFD_RESX,
+ XLFD_RESY,
+ XLFD_SPACING,
+ XLFD_AVGWIDTH,
+ XLFD_REGISTRY,
+ XLFD_ENCODING,
+ XLFD_LAST
+};
+
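+/* For illustration (added commentary, not in the original source): in
+   an XLFD name such as
+     -adobe-courier-medium-r-normal--12-120-75-75-m-60-iso8859-1
+   the fields are FOUNDRY=adobe, FAMILY=courier, WEIGHT=medium, SLANT=r,
+   SWIDTH=normal, ADSTYLE= (empty), PIXEL_SIZE=12, POINT_SIZE=120 (in
+   1/10 pt), RESX=75, RESY=75, SPACING=m, AVGWIDTH=60, REGISTRY=iso8859,
+   ENCODING=1.  */
+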
+/* An enumerator for each possible slant value of a font. Taken from
+ the XLFD specification. */
+
+enum xlfd_slant
+{
+ XLFD_SLANT_UNKNOWN,
+ XLFD_SLANT_ROMAN,
+ XLFD_SLANT_ITALIC,
+ XLFD_SLANT_OBLIQUE,
+ XLFD_SLANT_REVERSE_ITALIC,
+ XLFD_SLANT_REVERSE_OBLIQUE,
+ XLFD_SLANT_OTHER
+};
+
+/* Relative font weight according to XLFD documentation. */
+
+enum xlfd_weight
+{
+ XLFD_WEIGHT_UNKNOWN,
+ XLFD_WEIGHT_ULTRA_LIGHT, /* 10 */
+ XLFD_WEIGHT_EXTRA_LIGHT, /* 20 */
+ XLFD_WEIGHT_LIGHT, /* 30 */
+ XLFD_WEIGHT_SEMI_LIGHT, /* 40: SemiLight, Book, ... */
+ XLFD_WEIGHT_MEDIUM, /* 50: Medium, Normal, Regular, ... */
+ XLFD_WEIGHT_SEMI_BOLD, /* 60: SemiBold, DemiBold, ... */
+ XLFD_WEIGHT_BOLD, /* 70: Bold, ... */
+ XLFD_WEIGHT_EXTRA_BOLD, /* 80: ExtraBold, Heavy, ... */
+ XLFD_WEIGHT_ULTRA_BOLD /* 90: UltraBold, Black, ... */
+};
+
+/* Relative proportionate width. */
+
+enum xlfd_swidth
+{
+ XLFD_SWIDTH_UNKNOWN,
+ XLFD_SWIDTH_ULTRA_CONDENSED, /* 10 */
+ XLFD_SWIDTH_EXTRA_CONDENSED, /* 20 */
+ XLFD_SWIDTH_CONDENSED, /* 30: Condensed, Narrow, Compressed, ... */
+ XLFD_SWIDTH_SEMI_CONDENSED, /* 40: semicondensed */
+ XLFD_SWIDTH_MEDIUM, /* 50: Medium, Normal, Regular, ... */
+ XLFD_SWIDTH_SEMI_EXPANDED, /* 60: SemiExpanded, DemiExpanded, ... */
+ XLFD_SWIDTH_EXPANDED, /* 70: Expanded... */
+ XLFD_SWIDTH_EXTRA_EXPANDED, /* 80: ExtraExpanded, Wide... */
+ XLFD_SWIDTH_ULTRA_EXPANDED /* 90: UltraExpanded... */
+};
+
+/* Structure used for tables mapping XLFD weight, slant, and width
+ names to numeric and symbolic values. */
+
+struct table_entry
+{
+ char *name;
+ int numeric;
+ Lisp_Object *symbol;
+};
+
+/* Table of XLFD slant names and their numeric and symbolic
+ representations. This table must be sorted by slant names in
+ ascending order. */
+
+static struct table_entry slant_table[] =
+{
+ {"i", XLFD_SLANT_ITALIC, &Qitalic},
+ {"o", XLFD_SLANT_OBLIQUE, &Qoblique},
+ {"ot", XLFD_SLANT_OTHER, &Qitalic},
+ {"r", XLFD_SLANT_ROMAN, &Qnormal},
+ {"ri", XLFD_SLANT_REVERSE_ITALIC, &Qreverse_italic},
+ {"ro", XLFD_SLANT_REVERSE_OBLIQUE, &Qreverse_oblique}
+};
+
+/* Table of XLFD weight names. This table must be sorted by weight
+ names in ascending order. */
+
+static struct table_entry weight_table[] =
+{
+ {"black", XLFD_WEIGHT_ULTRA_BOLD, &Qultra_bold},
+ {"bold", XLFD_WEIGHT_BOLD, &Qbold},
+ {"book", XLFD_WEIGHT_SEMI_LIGHT, &Qsemi_light},
+ {"demi", XLFD_WEIGHT_SEMI_BOLD, &Qsemi_bold},
+ {"demibold", XLFD_WEIGHT_SEMI_BOLD, &Qsemi_bold},
+ {"extralight", XLFD_WEIGHT_EXTRA_LIGHT, &Qextra_light},
+ {"extrabold", XLFD_WEIGHT_EXTRA_BOLD, &Qextra_bold},
+ {"heavy", XLFD_WEIGHT_EXTRA_BOLD, &Qextra_bold},
+ {"light", XLFD_WEIGHT_LIGHT, &Qlight},
+ {"medium", XLFD_WEIGHT_MEDIUM, &Qnormal},
+ {"normal", XLFD_WEIGHT_MEDIUM, &Qnormal},
+ {"regular", XLFD_WEIGHT_MEDIUM, &Qnormal},
+ {"semibold", XLFD_WEIGHT_SEMI_BOLD, &Qsemi_bold},
+ {"semilight", XLFD_WEIGHT_SEMI_LIGHT, &Qsemi_light},
+ {"ultralight", XLFD_WEIGHT_ULTRA_LIGHT, &Qultra_light},
+ {"ultrabold", XLFD_WEIGHT_ULTRA_BOLD, &Qultra_bold}
+};
+
+/* Table of XLFD width names. This table must be sorted by width
+ names in ascending order. */
+
+static struct table_entry swidth_table[] =
+{
+ {"compressed", XLFD_SWIDTH_CONDENSED, &Qcondensed},
+ {"condensed", XLFD_SWIDTH_CONDENSED, &Qcondensed},
+ {"demiexpanded", XLFD_SWIDTH_SEMI_EXPANDED, &Qsemi_expanded},
+ {"expanded", XLFD_SWIDTH_EXPANDED, &Qexpanded},
+ {"extracondensed", XLFD_SWIDTH_EXTRA_CONDENSED, &Qextra_condensed},
+ {"extraexpanded", XLFD_SWIDTH_EXTRA_EXPANDED, &Qextra_expanded},
+ {"medium", XLFD_SWIDTH_MEDIUM, &Qnormal},
+ {"narrow", XLFD_SWIDTH_CONDENSED, &Qcondensed},
+ {"normal", XLFD_SWIDTH_MEDIUM, &Qnormal},
+ {"regular", XLFD_SWIDTH_MEDIUM, &Qnormal},
+ {"semicondensed", XLFD_SWIDTH_SEMI_CONDENSED, &Qsemi_condensed},
+ {"semiexpanded", XLFD_SWIDTH_SEMI_EXPANDED, &Qsemi_expanded},
+ {"ultracondensed", XLFD_SWIDTH_ULTRA_CONDENSED, &Qultra_condensed},
+ {"ultraexpanded", XLFD_SWIDTH_ULTRA_EXPANDED, &Qultra_expanded},
+ {"wide", XLFD_SWIDTH_EXTRA_EXPANDED, &Qextra_expanded}
+};
+
+/* Structure used to hold the result of splitting font names in XLFD
+ format into their fields. */
+
+struct font_name
+{
+ /* The original name which is modified destructively by
+ split_font_name. The pointer is kept here to be able to free it
+ if it was allocated from the heap. */
+ char *name;
+
+ /* Font name fields. Each vector element points into `name' above.
+ Fields are NUL-terminated. */
+ char *fields[XLFD_LAST];
+
+ /* Numeric values for those fields that interest us. See
+ split_font_name for which these are. */
+ int numeric[XLFD_LAST];
+
+  /* Lower values mean higher priority.  */
+ int registry_priority;
+};
+
+/* The frame in effect when sorting font names. Set temporarily in
+ sort_fonts so that it is available in font comparison functions. */
+
+static struct frame *font_frame;
+
+/* Order by which font selection chooses fonts. The default values
+ mean `first, find a best match for the font width, then for the
+ font height, then for weight, then for slant.' This variable can be
+ set via set-face-font-sort-order. */
+
+#ifdef macintosh
+static int font_sort_order[4] = {
+ XLFD_SWIDTH, XLFD_POINT_SIZE, XLFD_WEIGHT, XLFD_SLANT
+};
+#else
+static int font_sort_order[4];
+#endif
+
+/* Look up FONT.fields[FIELD_INDEX] in TABLE which has DIM entries.
+ TABLE must be sorted by TABLE[i]->name in ascending order. Value
+ is a pointer to the matching table entry or null if no table entry
+ matches. */
+
+static struct table_entry *
+xlfd_lookup_field_contents (table, dim, font, field_index)
+ struct table_entry *table;
+ int dim;
+ struct font_name *font;
+ int field_index;
+{
+ /* Function split_font_name converts fields to lower-case, so there
+ is no need to use xstrlwr or xstricmp here. */
+ char *s = font->fields[field_index];
+ int low, mid, high, cmp;
+
+ low = 0;
+ high = dim - 1;
+
+ while (low <= high)
+ {
+ mid = (low + high) / 2;
+ cmp = strcmp (table[mid].name, s);
+
+ if (cmp < 0)
+ low = mid + 1;
+ else if (cmp > 0)
+ high = mid - 1;
+ else
+ return table + mid;
+ }
+
+ return NULL;
+}
+
+
+/* Return a numeric representation for font name field
+ FONT.fields[FIELD_INDEX]. The field is looked up in TABLE which
+ has DIM entries. Value is the numeric value found or DFLT if no
+ table entry matches. This function is used to translate weight,
+ slant, and swidth names of XLFD font names to numeric values. */
+
+static INLINE int
+xlfd_numeric_value (table, dim, font, field_index, dflt)
+ struct table_entry *table;
+ int dim;
+ struct font_name *font;
+ int field_index;
+ int dflt;
+{
+ struct table_entry *p;
+ p = xlfd_lookup_field_contents (table, dim, font, field_index);
+ return p ? p->numeric : dflt;
+}
+
+
+/* Return a symbolic representation for font name field
+ FONT.fields[FIELD_INDEX]. The field is looked up in TABLE which
+ has DIM entries. Value is the symbolic value found or DFLT if no
+ table entry matches. This function is used to translate weight,
+ slant, and swidth names of XLFD font names to symbols. */
+
+static INLINE Lisp_Object
+xlfd_symbolic_value (table, dim, font, field_index, dflt)
+ struct table_entry *table;
+ int dim;
+ struct font_name *font;
+ int field_index;
+ Lisp_Object dflt;
+{
+ struct table_entry *p;
+ p = xlfd_lookup_field_contents (table, dim, font, field_index);
+ return p ? *p->symbol : dflt;
+}
+
+
+/* Return a numeric value for the slant of the font given by FONT. */
+
+static INLINE int
+xlfd_numeric_slant (font)
+ struct font_name *font;
+{
+ return xlfd_numeric_value (slant_table, DIM (slant_table),
+ font, XLFD_SLANT, XLFD_SLANT_ROMAN);
+}
+
+
+/* Return a symbol representing the slant of the font given by FONT.  */
+
+static INLINE Lisp_Object
+xlfd_symbolic_slant (font)
+ struct font_name *font;
+{
+ return xlfd_symbolic_value (slant_table, DIM (slant_table),
+ font, XLFD_SLANT, Qnormal);
+}
+
+
+/* Return a numeric value for the weight of the font given by FONT. */
+
+static INLINE int
+xlfd_numeric_weight (font)
+ struct font_name *font;
+{
+ return xlfd_numeric_value (weight_table, DIM (weight_table),
+ font, XLFD_WEIGHT, XLFD_WEIGHT_MEDIUM);
+}
+
+
+/* Return a symbol representing the weight of the font given by FONT.  */
+
+static INLINE Lisp_Object
+xlfd_symbolic_weight (font)
+ struct font_name *font;
+{
+ return xlfd_symbolic_value (weight_table, DIM (weight_table),
+ font, XLFD_WEIGHT, Qnormal);
+}
+
+
+/* Return a numeric value for the swidth of the font whose XLFD font
+ name fields are found in FONT. */
+
+static INLINE int
+xlfd_numeric_swidth (font)
+ struct font_name *font;
+{
+ return xlfd_numeric_value (swidth_table, DIM (swidth_table),
+ font, XLFD_SWIDTH, XLFD_SWIDTH_MEDIUM);
+}
+
+
+/* Return a symbolic value for the swidth of FONT. */
+
+static INLINE Lisp_Object
+xlfd_symbolic_swidth (font)
+ struct font_name *font;
+{
+ return xlfd_symbolic_value (swidth_table, DIM (swidth_table),
+ font, XLFD_SWIDTH, Qnormal);
+}
+
+
+/* Look up the entry of SYMBOL in the vector TABLE which has DIM
+ entries. Value is a pointer to the matching table entry or null if
+ no element of TABLE contains SYMBOL. */
+
+static struct table_entry *
+face_value (table, dim, symbol)
+ struct table_entry *table;
+ int dim;
+ Lisp_Object symbol;
+{
+ int i;
+
+ xassert (SYMBOLP (symbol));
+
+ for (i = 0; i < dim; ++i)
+ if (EQ (*table[i].symbol, symbol))
+ break;
+
+ return i < dim ? table + i : NULL;
+}
+
+
+/* Return a numeric value for SYMBOL in the vector TABLE which has DIM
+ entries. Value is -1 if SYMBOL is not found in TABLE. */
+
+static INLINE int
+face_numeric_value (table, dim, symbol)
+ struct table_entry *table;
+ int dim;
+ Lisp_Object symbol;
+{
+ struct table_entry *p = face_value (table, dim, symbol);
+ return p ? p->numeric : -1;
+}
+
+
+/* Return a numeric value representing the weight specified by Lisp
+ symbol WEIGHT. Value is one of the enumerators of enum
+ xlfd_weight. */
+
+static INLINE int
+face_numeric_weight (weight)
+ Lisp_Object weight;
+{
+ return face_numeric_value (weight_table, DIM (weight_table), weight);
+}
+
+
+/* Return a numeric value representing the slant specified by Lisp
+ symbol SLANT. Value is one of the enumerators of enum xlfd_slant. */
+
+static INLINE int
+face_numeric_slant (slant)
+ Lisp_Object slant;
+{
+ return face_numeric_value (slant_table, DIM (slant_table), slant);
+}
+
+
+/* Return a numeric value representing the swidth specified by Lisp
+ symbol WIDTH. Value is one of the enumerators of enum xlfd_swidth. */
+
+static int
+face_numeric_swidth (width)
+ Lisp_Object width;
+{
+ return face_numeric_value (swidth_table, DIM (swidth_table), width);
+}
+
+
+#ifdef HAVE_WINDOW_SYSTEM
+
+/* Return non-zero if FONT is the name of a fixed-pitch font. */
+
+static INLINE int
+xlfd_fixed_p (font)
+ struct font_name *font;
+{
+ /* Function split_font_name converts fields to lower-case, so there
+ is no need to use tolower here. */
+ return *font->fields[XLFD_SPACING] != 'p';
+}
+
+
+/* Return the point size of FONT on frame F, measured in 1/10 pt.
+
+ The actual height of the font when displayed on F depends on the
+ resolution of both the font and frame. For example, a 10pt font
+ designed for a 100dpi display will display larger than 10pt on a
+ 75dpi display. (It's not unusual to use fonts not designed for the
+ display one is using. For example, some intlfonts are available in
+   72dpi versions only.)
+
+ Value is the real point size of FONT on frame F, or 0 if it cannot
+ be determined. */
+
+static INLINE int
+xlfd_point_size (f, font)
+ struct frame *f;
+ struct font_name *font;
+{
+ double resy = FRAME_X_DISPLAY_INFO (f)->resy;
+ char *pixel_field = font->fields[XLFD_PIXEL_SIZE];
+ double pixel;
+ int real_pt;
+
+ if (*pixel_field == '[')
+ {
+ /* The pixel size field is `[A B C D]' which specifies
+ a transformation matrix.
+
+ A B 0
+ C D 0
+ 0 0 1
+
+ by which all glyphs of the font are transformed. The spec
+	 says that a scalar value N for the pixel size is equivalent
+ to A = N * resx/resy, B = C = 0, D = N. */
+ char *start = pixel_field + 1, *end;
+ double matrix[4];
+ int i;
+
+ for (i = 0; i < 4; ++i)
+ {
+ matrix[i] = strtod (start, &end);
+ start = end;
+ }
+
+ pixel = matrix[3];
+ }
+ else
+ pixel = atoi (pixel_field);
+
+ if (pixel == 0)
+ real_pt = 0;
+ else
+ real_pt = PT_PER_INCH * 10.0 * pixel / resy + 0.5;
+
+ return real_pt;
+}
+
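+/* Worked example (added commentary, not in the original source; it
+   assumes PT_PER_INCH is the usual 72.27): for a font whose pixel size
+   field is "13" on a 75 dpi display, real_pt is
+   72.27 * 10 * 13 / 75 + 0.5, about 125, i.e. roughly 12.5 pt.  */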
+
+/* Return the point size corresponding to a height of PIXEL dots,
+   taking the Y-resolution (DPI) of frame F into account.  This
+   function is used to guess the point size of a font when only the
+   pixel height of the font is available.  */
+
+static INLINE int
+pixel_point_size (f, pixel)
+ struct frame *f;
+ int pixel;
+{
+ double resy = FRAME_X_DISPLAY_INFO (f)->resy;
+ double real_pt;
+ int int_pt;
+
+ /* As one inch is PT_PER_INCH points, PT_PER_INCH/RESY gives the
+ point size of one dot. */
+ real_pt = pixel * PT_PER_INCH / resy;
+ int_pt = real_pt + 0.5;
+
+ return int_pt;
+}
+
+
+/* Split XLFD font name FONT->name destructively into NUL-terminated,
+ lower-case fields in FONT->fields. NUMERIC_P non-zero means
+ compute numeric values for fields XLFD_POINT_SIZE, XLFD_SWIDTH,
+ XLFD_RESY, XLFD_SLANT, and XLFD_WEIGHT in FONT->numeric. Value is
+ zero if the font name doesn't have the format we expect. The
+ expected format is a font name that starts with a `-' and has
+ XLFD_LAST fields separated by `-'. */
+
+static int
+split_font_name (f, font, numeric_p)
+ struct frame *f;
+ struct font_name *font;
+ int numeric_p;
+{
+ int i = 0;
+ int success_p;
+
+ if (*font->name == '-')
+ {
+ char *p = xstrlwr (font->name) + 1;
+
+ while (i < XLFD_LAST)
+ {
+ font->fields[i] = p;
+ ++i;
+
+ /* Pixel and point size may be of the form `[....]'. For
+ BNF, see XLFD spec, chapter 4. Negative values are
+ indicated by tilde characters which we replace with
+ `-' characters, here. */
+ if (*p == '['
+ && (i - 1 == XLFD_PIXEL_SIZE
+ || i - 1 == XLFD_POINT_SIZE))
+ {
+ char *start, *end;
+ int j;
+
+ for (++p; *p && *p != ']'; ++p)
+ if (*p == '~')
+ *p = '-';
+
+ /* Check that the matrix contains 4 floating point
+ numbers. */
+ for (j = 0, start = font->fields[i - 1] + 1;
+ j < 4;
+ ++j, start = end)
+ if (strtod (start, &end) == 0 && start == end)
+ break;
+
+ if (j < 4)
+ break;
+ }
+
+ while (*p && *p != '-')
+ ++p;
+
+ if (*p != '-')
+ break;
+
+ *p++ = 0;
+ }
+ }
+
+ success_p = i == XLFD_LAST;
+
+ /* If requested, and font name was in the expected format,
+ compute numeric values for some fields. */
+ if (numeric_p && success_p)
+ {
+ font->numeric[XLFD_POINT_SIZE] = xlfd_point_size (f, font);
+ font->numeric[XLFD_RESY] = atoi (font->fields[XLFD_RESY]);
+ font->numeric[XLFD_SLANT] = xlfd_numeric_slant (font);
+ font->numeric[XLFD_WEIGHT] = xlfd_numeric_weight (font);
+ font->numeric[XLFD_SWIDTH] = xlfd_numeric_swidth (font);
+ font->numeric[XLFD_AVGWIDTH] = atoi (font->fields[XLFD_AVGWIDTH]);
+ }
+
+ /* Initialize it to zero. It will be overridden by font_list while
+ trying alternate registries. */
+ font->registry_priority = 0;
+
+ return success_p;
+}
+
+
+/* Build an XLFD font name from font name fields in FONT. Value is a
+ pointer to the font name, which is allocated via xmalloc. */
+
+static char *
+build_font_name (font)
+ struct font_name *font;
+{
+ int i;
+ int size = 100;
+ char *font_name = (char *) xmalloc (size);
+ int total_length = 0;
+
+ for (i = 0; i < XLFD_LAST; ++i)
+ {
+ /* Add 1 because of the leading `-'. */
+ int len = strlen (font->fields[i]) + 1;
+
+ /* Reallocate font_name if necessary. Add 1 for the final
+ NUL-byte. */
+ if (total_length + len + 1 >= size)
+ {
+ int new_size = max (2 * size, size + len + 1);
+ int sz = new_size * sizeof *font_name;
+ font_name = (char *) xrealloc (font_name, sz);
+ size = new_size;
+ }
+
+ font_name[total_length] = '-';
+ bcopy (font->fields[i], font_name + total_length + 1, len - 1);
+ total_length += len;
+ }
+
+ font_name[total_length] = 0;
+ return font_name;
+}
+
+
+/* Free an array FONTS of N font_name structures. This frees FONTS
+ itself and all `name' fields in its elements. */
+
+static INLINE void
+free_font_names (fonts, n)
+ struct font_name *fonts;
+ int n;
+{
+ while (n)
+ xfree (fonts[--n].name);
+ xfree (fonts);
+}
+
+
+/* Sort vector FONTS of font_name structures which contains NFONTS
+ elements using qsort and comparison function CMPFN. F is the frame
+ on which the fonts will be used. The global variable font_frame
+ is temporarily set to F to make it available in CMPFN. */
+
+static INLINE void
+sort_fonts (f, fonts, nfonts, cmpfn)
+ struct frame *f;
+ struct font_name *fonts;
+ int nfonts;
+ int (*cmpfn) P_ ((const void *, const void *));
+{
+ font_frame = f;
+ qsort (fonts, nfonts, sizeof *fonts, cmpfn);
+ font_frame = NULL;
+}
+
+
+/* Get fonts matching PATTERN on frame F. If F is null, use the first
+ display in x_display_list. FONTS is a pointer to a vector of
+ NFONTS font_name structures. TRY_ALTERNATIVES_P non-zero means try
+ alternative patterns from Valternate_fontname_alist if no fonts are
+ found matching PATTERN.
+
+ For all fonts found, set FONTS[i].name to the name of the font,
+ allocated via xmalloc, and split font names into fields. Ignore
+ fonts that we can't parse. Value is the number of fonts found. */
+
+static int
+x_face_list_fonts (f, pattern, fonts, nfonts, try_alternatives_p)
+ struct frame *f;
+ char *pattern;
+ struct font_name *fonts;
+ int nfonts, try_alternatives_p;
+{
+ int n, nignored;
+
+ /* NTEMACS_TODO : currently this uses w32_list_fonts, but it may be
+ better to do it the other way around. */
+ Lisp_Object lfonts;
+ Lisp_Object lpattern, tem;
+
+ lpattern = build_string (pattern);
+
+ /* Get the list of fonts matching PATTERN. */
+#ifdef WINDOWSNT
+ BLOCK_INPUT;
+ lfonts = w32_list_fonts (f, lpattern, 0, nfonts);
+ UNBLOCK_INPUT;
+#else
+ lfonts = x_list_fonts (f, lpattern, -1, nfonts);
+#endif
+
+ /* Make a copy of the font names we got from X, and
+ split them into fields. */
+ n = nignored = 0;
+ for (tem = lfonts; CONSP (tem) && n < nfonts; tem = XCDR (tem))
+ {
+ Lisp_Object elt, tail;
+ char *name = XSTRING (XCAR (tem))->data;
+
+ /* Ignore fonts matching a pattern from face-ignored-fonts. */
+ for (tail = Vface_ignored_fonts; CONSP (tail); tail = XCDR (tail))
+ {
+ elt = XCAR (tail);
+ if (STRINGP (elt)
+ && fast_c_string_match_ignore_case (elt, name) >= 0)
+ break;
+ }
+ if (!NILP (tail))
+ {
+ ++nignored;
+ continue;
+ }
+
+ /* Make a copy of the font name. */
+ fonts[n].name = xstrdup (name);
+
+ if (split_font_name (f, fonts + n, 1))
+ {
+ if (font_scalable_p (fonts + n)
+ && !may_use_scalable_font_p (name))
+ {
+ ++nignored;
+ xfree (fonts[n].name);
+ }
+ else
+ ++n;
+ }
+ else
+ xfree (fonts[n].name);
+ }
+
+ /* If no fonts found, try patterns from Valternate_fontname_alist. */
+ if (n == 0 && try_alternatives_p)
+ {
+ Lisp_Object list = Valternate_fontname_alist;
+
+ while (CONSP (list))
+ {
+ Lisp_Object entry = XCAR (list);
+ if (CONSP (entry)
+ && STRINGP (XCAR (entry))
+ && strcmp (XSTRING (XCAR (entry))->data, pattern) == 0)
+ break;
+ list = XCDR (list);
+ }
+
+ if (CONSP (list))
+ {
+ Lisp_Object patterns = XCAR (list);
+ Lisp_Object name;
+
+ while (CONSP (patterns)
+ /* If list is screwed up, give up. */
+ && (name = XCAR (patterns),
+ STRINGP (name))
+ /* Ignore patterns equal to PATTERN because we tried that
+ already with no success. */
+ && (strcmp (XSTRING (name)->data, pattern) == 0
+ || (n = x_face_list_fonts (f, XSTRING (name)->data,
+ fonts, nfonts, 0),
+ n == 0)))
+ patterns = XCDR (patterns);
+ }
+ }
+
+ return n;
+}
+
+
+/* Determine fonts matching PATTERN on frame F. Sort resulting fonts
+ using comparison function CMPFN. Value is the number of fonts
+ found. If value is non-zero, *FONTS is set to a vector of
+ font_name structures allocated from the heap containing matching
+ fonts. Each element of *FONTS contains a name member that is also
+ allocated from the heap. Font names in these structures are split
+ into fields. Use free_font_names to free such an array. */
+
+static int
+sorted_font_list (f, pattern, cmpfn, fonts)
+ struct frame *f;
+ char *pattern;
+ int (*cmpfn) P_ ((const void *, const void *));
+ struct font_name **fonts;
+{
+ int nfonts;
+
+ /* Get the list of fonts matching pattern. 100 should suffice. */
+ nfonts = DEFAULT_FONT_LIST_LIMIT;
+ if (INTEGERP (Vfont_list_limit) && XINT (Vfont_list_limit) > 0)
+ nfonts = XFASTINT (Vfont_list_limit);
+
+ *fonts = (struct font_name *) xmalloc (nfonts * sizeof **fonts);
+ nfonts = x_face_list_fonts (f, pattern, *fonts, nfonts, 1);
+
+ /* Sort the resulting array and return it in *FONTS. If no
+ fonts were found, make sure to set *FONTS to null. */
+ if (nfonts)
+ sort_fonts (f, *fonts, nfonts, cmpfn);
+ else
+ {
+ xfree (*fonts);
+ *fonts = NULL;
+ }
+
+ return nfonts;
+}
+
+
+/* Compare two font_name structures *A and *B. Value is analogous to
+ strcmp. Sort order is given by the global variable
+ font_sort_order. Font names are sorted so that, everything else
+ being equal, fonts with a resolution closer to that of the frame on
+ which they are used are listed first. The global variable
+ font_frame is the frame on which we operate. */
+
+static int
+cmp_font_names (a, b)
+ const void *a, *b;
+{
+ struct font_name *x = (struct font_name *) a;
+ struct font_name *y = (struct font_name *) b;
+ int cmp;
+
+ /* All strings have been converted to lower-case by split_font_name,
+ so we can use strcmp here. */
+ cmp = strcmp (x->fields[XLFD_FAMILY], y->fields[XLFD_FAMILY]);
+ if (cmp == 0)
+ {
+ int i;
+
+ for (i = 0; i < DIM (font_sort_order) && cmp == 0; ++i)
+ {
+ int j = font_sort_order[i];
+ cmp = x->numeric[j] - y->numeric[j];
+ }
+
+ if (cmp == 0)
+ {
+	  /* Everything else being equal, we prefer fonts with a
+	     y-resolution closer to that of the frame.  */
+ int resy = FRAME_X_DISPLAY_INFO (font_frame)->resy;
+ int x_resy = x->numeric[XLFD_RESY];
+ int y_resy = y->numeric[XLFD_RESY];
+ cmp = abs (resy - x_resy) - abs (resy - y_resy);
+ }
+ }
+
+ return cmp;
+}
+
+
+/* Get a sorted list of fonts of family FAMILY on frame F. If PATTERN
+ is non-nil list fonts matching that pattern. Otherwise, if
+ REGISTRY is non-nil return only fonts with that registry, otherwise
+ return fonts of any registry. Set *FONTS to a vector of font_name
+ structures allocated from the heap containing the fonts found.
+ Value is the number of fonts found. */
+
+static int
+font_list_1 (f, pattern, family, registry, fonts)
+ struct frame *f;
+ Lisp_Object pattern, family, registry;
+ struct font_name **fonts;
+{
+ char *pattern_str, *family_str, *registry_str;
+
+ if (NILP (pattern))
+ {
+ family_str = (NILP (family) ? "*" : (char *) XSTRING (family)->data);
+ registry_str = (NILP (registry) ? "*" : (char *) XSTRING (registry)->data);
+
+ pattern_str = (char *) alloca (strlen (family_str)
+ + strlen (registry_str)
+ + 10);
+ strcpy (pattern_str, index (family_str, '-') ? "-" : "-*-");
+ strcat (pattern_str, family_str);
+ strcat (pattern_str, "-*-");
+ strcat (pattern_str, registry_str);
+ if (!index (registry_str, '-'))
+ {
+ if (registry_str[strlen (registry_str) - 1] == '*')
+ strcat (pattern_str, "-*");
+ else
+ strcat (pattern_str, "*-*");
+ }
+ }
+ else
+ pattern_str = (char *) XSTRING (pattern)->data;
+
+ return sorted_font_list (f, pattern_str, cmp_font_names, fonts);
+}
+
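+/* Illustrative example (added commentary, not in the original source):
+   with PATTERN nil, FAMILY "helvetica" and REGISTRY "iso8859-1", the
+   code above builds the pattern "-*-helvetica-*-iso8859-1"; with
+   REGISTRY "iso8859" (no dash) it appends a wildcard encoding and
+   builds "-*-helvetica-*-iso8859*-*".  */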
+
+/* Concatenate font list FONTS1 and FONTS2. FONTS1 and FONTS2
+ contains NFONTS1 fonts and NFONTS2 fonts respectively. Return a
+ pointer to a newly allocated font list. FONTS1 and FONTS2 are
+ freed. */
+
+static struct font_name *
+concat_font_list (fonts1, nfonts1, fonts2, nfonts2)
+ struct font_name *fonts1, *fonts2;
+ int nfonts1, nfonts2;
+{
+ int new_nfonts = nfonts1 + nfonts2;
+ struct font_name *new_fonts;
+
+ new_fonts = (struct font_name *) xmalloc (sizeof *new_fonts * new_nfonts);
+ bcopy (fonts1, new_fonts, sizeof *new_fonts * nfonts1);
+ bcopy (fonts2, new_fonts + nfonts1, sizeof *new_fonts * nfonts2);
+ xfree (fonts1);
+ xfree (fonts2);
+ return new_fonts;
+}
+
+
+/* Get a sorted list of fonts of family FAMILY on frame F.
+
+ If PATTERN is non-nil list fonts matching that pattern.
+
+ If REGISTRY is non-nil, return fonts with that registry and the
+ alternative registries from Vface_alternative_font_registry_alist.
+
+ If REGISTRY is nil return fonts of any registry.
+
+ Set *FONTS to a vector of font_name structures allocated from the
+ heap containing the fonts found. Value is the number of fonts
+ found. */
+
+static int
+font_list (f, pattern, family, registry, fonts)
+ struct frame *f;
+ Lisp_Object pattern, family, registry;
+ struct font_name **fonts;
+{
+ int nfonts = font_list_1 (f, pattern, family, registry, fonts);
+
+ if (!NILP (registry)
+ && CONSP (Vface_alternative_font_registry_alist))
+ {
+ Lisp_Object alter;
+
+ alter = Fassoc (registry, Vface_alternative_font_registry_alist);
+ if (CONSP (alter))
+ {
+ int reg_prio, i;
+
+ for (alter = XCDR (alter), reg_prio = 1;
+ CONSP (alter);
+ alter = XCDR (alter), reg_prio++)
+ if (STRINGP (XCAR (alter)))
+ {
+ int nfonts2;
+ struct font_name *fonts2;
+
+ nfonts2 = font_list_1 (f, pattern, family, XCAR (alter),
+ &fonts2);
+ for (i = 0; i < nfonts2; i++)
+ fonts2[i].registry_priority = reg_prio;
+ *fonts = (nfonts > 0
+ ? concat_font_list (*fonts, nfonts, fonts2, nfonts2)
+ : fonts2);
+ nfonts += nfonts2;
+ }
+ }
+ }
+
+ return nfonts;
+}
+
+
+/* Destructively remove adjacent elements of LIST whose cars are
+   `equal'.  Called from x-family-fonts and x-font-family-list to
+   remove duplicate font entries.  */
+
+static void
+remove_duplicates (list)
+ Lisp_Object list;
+{
+ Lisp_Object tail = list;
+
+ while (!NILP (tail) && !NILP (XCDR (tail)))
+ {
+ Lisp_Object next = XCDR (tail);
+ if (!NILP (Fequal (XCAR (next), XCAR (tail))))
+ XCDR (tail) = XCDR (next);
+ else
+ tail = XCDR (tail);
+ }
+}
+
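+/* Example (added commentary, not in the original source): only
+   adjacent duplicates are removed, so a list (A A B B) becomes (A B);
+   this suffices here because the callers build their lists from sorted
+   font arrays, which keeps `equal' entries adjacent.  */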
+
+DEFUN ("x-family-fonts", Fx_family_fonts, Sx_family_fonts, 0, 2, 0,
+ "Return a list of available fonts of family FAMILY on FRAME.\n\
+If FAMILY is omitted or nil, list all families.\n\
+Otherwise, FAMILY must be a string, possibly containing wildcards\n\
+`?' and `*'.\n\
+If FRAME is omitted or nil, use the selected frame.\n\
+Each element of the result is a vector [FAMILY WIDTH POINT-SIZE WEIGHT\n\
+SLANT FIXED-P FULL REGISTRY-AND-ENCODING].\n\
+FAMILY is the font family name. POINT-SIZE is the size of the\n\
+font in 1/10 pt. WIDTH, WEIGHT, and SLANT are symbols describing the\n\
+width, weight and slant of the font. These symbols are the same as for\n\
+face attributes. FIXED-P is non-nil if the font is fixed-pitch.\n\
+FULL is the full name of the font, and REGISTRY-AND-ENCODING is a string\n\
+giving the registry and encoding of the font.\n\
+The result list is sorted according to the current setting of\n\
+the face font sort order.")
+ (family, frame)
+ Lisp_Object family, frame;
+{
+ struct frame *f = check_x_frame (frame);
+ struct font_name *fonts;
+ int i, nfonts;
+ Lisp_Object result;
+ struct gcpro gcpro1;
+
+ if (!NILP (family))
+ CHECK_STRING (family, 1);
+
+ result = Qnil;
+ GCPRO1 (result);
+ nfonts = font_list (f, Qnil, family, Qnil, &fonts);
+ for (i = nfonts - 1; i >= 0; --i)
+ {
+ Lisp_Object v = Fmake_vector (make_number (8), Qnil);
+ char *tem;
+
+ ASET (v, 0, build_string (fonts[i].fields[XLFD_FAMILY]));
+ ASET (v, 1, xlfd_symbolic_swidth (fonts + i));
+ ASET (v, 2, make_number (xlfd_point_size (f, fonts + i)));
+ ASET (v, 3, xlfd_symbolic_weight (fonts + i));
+ ASET (v, 4, xlfd_symbolic_slant (fonts + i));
+ ASET (v, 5, xlfd_fixed_p (fonts + i) ? Qt : Qnil);
+ tem = build_font_name (fonts + i);
+ ASET (v, 6, build_string (tem));
+ sprintf (tem, "%s-%s", fonts[i].fields[XLFD_REGISTRY],
+ fonts[i].fields[XLFD_ENCODING]);
+ ASET (v, 7, build_string (tem));
+ xfree (tem);
+
+ result = Fcons (v, result);
+ }
+
+ remove_duplicates (result);
+ free_font_names (fonts, nfonts);
+ UNGCPRO;
+ return result;
+}
+
+
+DEFUN ("x-font-family-list", Fx_font_family_list, Sx_font_family_list,
+ 0, 1, 0,
+ "Return a list of available font families on FRAME.\n\
+If FRAME is omitted or nil, use the selected frame.\n\
+Value is a list of conses (FAMILY . FIXED-P) where FAMILY\n\
+is a font family, and FIXED-P is non-nil if fonts of that family\n\
+are fixed-pitch.")
+ (frame)
+ Lisp_Object frame;
+{
+ struct frame *f = check_x_frame (frame);
+ int nfonts, i;
+ struct font_name *fonts;
+ Lisp_Object result;
+ struct gcpro gcpro1;
+ int count = specpdl_ptr - specpdl;
+ int limit;
+
+ /* Let's consider all fonts. Increase the limit for matching
+ fonts until we have them all. */
+ for (limit = 500;;)
+ {
+ specbind (intern ("font-list-limit"), make_number (limit));
+ nfonts = font_list (f, Qnil, Qnil, Qnil, &fonts);
+
+ if (nfonts == limit)
+ {
+ free_font_names (fonts, nfonts);
+ limit *= 2;
+ }
+ else
+ break;
+ }
+
+ result = Qnil;
+ GCPRO1 (result);
+ for (i = nfonts - 1; i >= 0; --i)
+ result = Fcons (Fcons (build_string (fonts[i].fields[XLFD_FAMILY]),
+ xlfd_fixed_p (fonts + i) ? Qt : Qnil),
+ result);
+
+ remove_duplicates (result);
+ free_font_names (fonts, nfonts);
+ UNGCPRO;
+ return unbind_to (count, result);
+}
+
+
+DEFUN ("x-list-fonts", Fx_list_fonts, Sx_list_fonts, 1, 5, 0,
+ "Return a list of the names of available fonts matching PATTERN.\n\
+If optional arguments FACE and FRAME are specified, return only fonts\n\
+the same size as FACE on FRAME.\n\
+PATTERN is a string, perhaps with wildcard characters;\n\
+ the * character matches any substring, and\n\
+ the ? character matches any single character.\n\
+ PATTERN is case-insensitive.\n\
+FACE is a face name--a symbol.\n\
+\n\
+The return value is a list of strings, suitable as arguments to\n\
+set-face-font.\n\
+\n\
+Fonts Emacs can't use may or may not be excluded\n\
+even if they match PATTERN and FACE.\n\
+The optional fourth argument MAXIMUM sets a limit on how many\n\
+fonts to match. The first MAXIMUM fonts are reported.\n\
+The optional fifth argument WIDTH, if specified, is a number of columns\n\
+occupied by a character of a font. In that case, return only fonts\n\
+that are WIDTH times as wide as FACE on FRAME.")
+ (pattern, face, frame, maximum, width)
+ Lisp_Object pattern, face, frame, maximum, width;
+{
+ struct frame *f;
+ int size;
+ int maxnames;
+
+ check_x ();
+ CHECK_STRING (pattern, 0);
+
+ if (NILP (maximum))
+ maxnames = 2000;
+ else
+ {
+ CHECK_NATNUM (maximum, 0);
+ maxnames = XINT (maximum);
+ }
+
+ if (!NILP (width))
+ CHECK_NUMBER (width, 4);
+
+ /* We can't simply call check_x_frame because this function may be
+ called before any frame is created. */
+ f = frame_or_selected_frame (frame, 2);
+ if (!FRAME_WINDOW_P (f))
+ {
+ /* Perhaps we have not yet created any frame. */
+ f = NULL;
+ face = Qnil;
+ }
+
+ /* Determine the width standard for comparison with the fonts we find. */
+
+ if (NILP (face))
+ size = 0;
+ else
+ {
+ /* This is of limited utility since it works with character
+ widths. Keep it for compatibility. --gerd. */
+ int face_id = lookup_named_face (f, face, 0);
+ struct face *face = (face_id < 0
+ ? NULL
+ : FACE_FROM_ID (f, face_id));
+
+ if (face && face->font)
+ size = FONT_WIDTH (face->font);
+ else
+ size = FONT_WIDTH (FRAME_FONT (f));
+
+ if (!NILP (width))
+ size *= XINT (width);
+ }
+
+ {
+ Lisp_Object args[2];
+
+ args[0] = x_list_fonts (f, pattern, size, maxnames);
+ if (f == NULL)
+ /* We don't have to check fontsets. */
+ return args[0];
+ args[1] = list_fontsets (f, pattern, size);
+ return Fnconc (2, args);
+ }
+}
+
+#endif /* HAVE_WINDOW_SYSTEM */
+
+
+
+/***********************************************************************
+ Lisp Faces
+ ***********************************************************************/
+
+/* Access face attributes of face LFACE, a Lisp vector. */
+
+#define LFACE_FAMILY(LFACE) AREF ((LFACE), LFACE_FAMILY_INDEX)
+#define LFACE_HEIGHT(LFACE) AREF ((LFACE), LFACE_HEIGHT_INDEX)
+#define LFACE_WEIGHT(LFACE) AREF ((LFACE), LFACE_WEIGHT_INDEX)
+#define LFACE_SLANT(LFACE) AREF ((LFACE), LFACE_SLANT_INDEX)
+#define LFACE_UNDERLINE(LFACE) AREF ((LFACE), LFACE_UNDERLINE_INDEX)
+#define LFACE_INVERSE(LFACE) AREF ((LFACE), LFACE_INVERSE_INDEX)
+#define LFACE_FOREGROUND(LFACE) AREF ((LFACE), LFACE_FOREGROUND_INDEX)
+#define LFACE_BACKGROUND(LFACE) AREF ((LFACE), LFACE_BACKGROUND_INDEX)
+#define LFACE_STIPPLE(LFACE) AREF ((LFACE), LFACE_STIPPLE_INDEX)
+#define LFACE_SWIDTH(LFACE) AREF ((LFACE), LFACE_SWIDTH_INDEX)
+#define LFACE_OVERLINE(LFACE) AREF ((LFACE), LFACE_OVERLINE_INDEX)
+#define LFACE_STRIKE_THROUGH(LFACE) AREF ((LFACE), LFACE_STRIKE_THROUGH_INDEX)
+#define LFACE_BOX(LFACE) AREF ((LFACE), LFACE_BOX_INDEX)
+#define LFACE_FONT(LFACE) AREF ((LFACE), LFACE_FONT_INDEX)
+#define LFACE_INHERIT(LFACE) AREF ((LFACE), LFACE_INHERIT_INDEX)
+#define LFACE_AVGWIDTH(LFACE) AREF ((LFACE), LFACE_AVGWIDTH_INDEX)
+
+/* Non-zero if LFACE is a Lisp face. A Lisp face is a vector of size
+ LFACE_VECTOR_SIZE which has the symbol `face' in slot 0. */
+
+#define LFACEP(LFACE) \
+ (VECTORP (LFACE) \
+ && XVECTOR (LFACE)->size == LFACE_VECTOR_SIZE \
+ && EQ (AREF (LFACE, 0), Qface))
+
+
+#if GLYPH_DEBUG
+
+/* Check consistency of Lisp face attribute vector ATTRS. */
+
+static void
+check_lface_attrs (attrs)
+ Lisp_Object *attrs;
+{
+ xassert (UNSPECIFIEDP (attrs[LFACE_FAMILY_INDEX])
+ || STRINGP (attrs[LFACE_FAMILY_INDEX]));
+ xassert (UNSPECIFIEDP (attrs[LFACE_SWIDTH_INDEX])
+ || SYMBOLP (attrs[LFACE_SWIDTH_INDEX]));
+ xassert (UNSPECIFIEDP (attrs[LFACE_AVGWIDTH_INDEX])
+ || INTEGERP (attrs[LFACE_AVGWIDTH_INDEX]));
+ xassert (UNSPECIFIEDP (attrs[LFACE_HEIGHT_INDEX])
+ || INTEGERP (attrs[LFACE_HEIGHT_INDEX])
+ || FLOATP (attrs[LFACE_HEIGHT_INDEX])
+ || FUNCTIONP (attrs[LFACE_HEIGHT_INDEX]));
+ xassert (UNSPECIFIEDP (attrs[LFACE_WEIGHT_INDEX])
+ || SYMBOLP (attrs[LFACE_WEIGHT_INDEX]));
+ xassert (UNSPECIFIEDP (attrs[LFACE_SLANT_INDEX])
+ || SYMBOLP (attrs[LFACE_SLANT_INDEX]));
+ xassert (UNSPECIFIEDP (attrs[LFACE_UNDERLINE_INDEX])
+ || SYMBOLP (attrs[LFACE_UNDERLINE_INDEX])
+ || STRINGP (attrs[LFACE_UNDERLINE_INDEX]));
+ xassert (UNSPECIFIEDP (attrs[LFACE_OVERLINE_INDEX])
+ || SYMBOLP (attrs[LFACE_OVERLINE_INDEX])
+ || STRINGP (attrs[LFACE_OVERLINE_INDEX]));
+ xassert (UNSPECIFIEDP (attrs[LFACE_STRIKE_THROUGH_INDEX])
+ || SYMBOLP (attrs[LFACE_STRIKE_THROUGH_INDEX])
+ || STRINGP (attrs[LFACE_STRIKE_THROUGH_INDEX]));
+ xassert (UNSPECIFIEDP (attrs[LFACE_BOX_INDEX])
+ || SYMBOLP (attrs[LFACE_BOX_INDEX])
+ || STRINGP (attrs[LFACE_BOX_INDEX])
+ || INTEGERP (attrs[LFACE_BOX_INDEX])
+ || CONSP (attrs[LFACE_BOX_INDEX]));
+ xassert (UNSPECIFIEDP (attrs[LFACE_INVERSE_INDEX])
+ || SYMBOLP (attrs[LFACE_INVERSE_INDEX]));
+ xassert (UNSPECIFIEDP (attrs[LFACE_FOREGROUND_INDEX])
+ || STRINGP (attrs[LFACE_FOREGROUND_INDEX]));
+ xassert (UNSPECIFIEDP (attrs[LFACE_BACKGROUND_INDEX])
+ || STRINGP (attrs[LFACE_BACKGROUND_INDEX]));
+ xassert (UNSPECIFIEDP (attrs[LFACE_INHERIT_INDEX])
+ || NILP (attrs[LFACE_INHERIT_INDEX])
+ || SYMBOLP (attrs[LFACE_INHERIT_INDEX])
+ || CONSP (attrs[LFACE_INHERIT_INDEX]));
+#ifdef HAVE_WINDOW_SYSTEM
+ xassert (UNSPECIFIEDP (attrs[LFACE_STIPPLE_INDEX])
+ || SYMBOLP (attrs[LFACE_STIPPLE_INDEX])
+ || !NILP (Fbitmap_spec_p (attrs[LFACE_STIPPLE_INDEX])));
+ xassert (UNSPECIFIEDP (attrs[LFACE_FONT_INDEX])
+ || NILP (attrs[LFACE_FONT_INDEX])
+ || STRINGP (attrs[LFACE_FONT_INDEX]));
+#endif
+}
+
+
+/* Check consistency of attributes of Lisp face LFACE (a Lisp vector). */
+
+static void
+check_lface (lface)
+ Lisp_Object lface;
+{
+ if (!NILP (lface))
+ {
+ xassert (LFACEP (lface));
+ check_lface_attrs (XVECTOR (lface)->contents);
+ }
+}
+
+#else /* GLYPH_DEBUG == 0 */
+
+#define check_lface_attrs(attrs) (void) 0
+#define check_lface(lface) (void) 0
+
+#endif /* GLYPH_DEBUG == 0 */
+
+
+/* Resolve face name FACE_NAME. If FACE_NAME is a string, intern it
+   to make it a symbol.  If FACE_NAME is an alias for another face,
+ return that face's name. */
+
+static Lisp_Object
+resolve_face_name (face_name)
+ Lisp_Object face_name;
+{
+ Lisp_Object aliased;
+
+ if (STRINGP (face_name))
+ face_name = intern (XSTRING (face_name)->data);
+
+ while (SYMBOLP (face_name))
+ {
+ aliased = Fget (face_name, Qface_alias);
+ if (NILP (aliased))
+ break;
+ else
+ face_name = aliased;
+ }
+
+ return face_name;
+}
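+
+/* For illustration, assuming a face alias set up elsewhere with
+   (put 'modeline 'face-alias 'mode-line):
+     resolve_face_name (build_string ("default"))  yields  Qdefault
+     resolve_face_name (intern ("modeline"))       yields  `mode-line'
+   Strings are interned first, and `face-alias' chains are followed
+   until a symbol with no alias is reached.  */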
+
+
+/* Return the face definition of FACE_NAME on frame F. F null means
+ return the definition for new frames. FACE_NAME may be a string or
+ a symbol (apparently Emacs 20.2 allowed strings as face names in
+ face text properties; Ediff uses that). If FACE_NAME is an alias
+ for another face, return that face's definition. If SIGNAL_P is
+ non-zero, signal an error if FACE_NAME is not a valid face name.
+ If SIGNAL_P is zero, value is nil if FACE_NAME is not a valid face
+ name. */
+
+static INLINE Lisp_Object
+lface_from_face_name (f, face_name, signal_p)
+ struct frame *f;
+ Lisp_Object face_name;
+ int signal_p;
+{
+ Lisp_Object lface;
+
+ face_name = resolve_face_name (face_name);
+
+ if (f)
+ lface = assq_no_quit (face_name, f->face_alist);
+ else
+ lface = assq_no_quit (face_name, Vface_new_frame_defaults);
+
+ if (CONSP (lface))
+ lface = XCDR (lface);
+ else if (signal_p)
+ signal_error ("Invalid face", face_name);
+
+ check_lface (lface);
+ return lface;
+}
+
+
+/* Get face attributes of face FACE_NAME from frame-local faces on
+ frame F. Store the resulting attributes in ATTRS which must point
+ to a vector of Lisp_Objects of size LFACE_VECTOR_SIZE. If SIGNAL_P
+ is non-zero, signal an error if FACE_NAME does not name a face.
+ Otherwise, value is zero if FACE_NAME is not a face. */
+
+static INLINE int
+get_lface_attributes (f, face_name, attrs, signal_p)
+ struct frame *f;
+ Lisp_Object face_name;
+ Lisp_Object *attrs;
+ int signal_p;
+{
+ Lisp_Object lface;
+ int success_p;
+
+ lface = lface_from_face_name (f, face_name, signal_p);
+ if (!NILP (lface))
+ {
+ bcopy (XVECTOR (lface)->contents, attrs,
+ LFACE_VECTOR_SIZE * sizeof *attrs);
+ success_p = 1;
+ }
+ else
+ success_p = 0;
+
+ return success_p;
+}
+
+
+/* Non-zero if all attributes in face attribute vector ATTRS are
+   specified, i.e. are not `unspecified'.  */
+
+static int
+lface_fully_specified_p (attrs)
+ Lisp_Object *attrs;
+{
+ int i;
+
+ for (i = 1; i < LFACE_VECTOR_SIZE; ++i)
+ if (i != LFACE_FONT_INDEX && i != LFACE_INHERIT_INDEX
+ && i != LFACE_AVGWIDTH_INDEX)
+ if (UNSPECIFIEDP (attrs[i]))
+ break;
+
+ return i == LFACE_VECTOR_SIZE;
+}
+
+#ifdef HAVE_WINDOW_SYSTEM
+
+/* Set font-related attributes of Lisp face LFACE from the fullname of
+ the font opened by FONTNAME. If FORCE_P is zero, set only
+ unspecified attributes of LFACE. The exception is `font'
+ attribute. It is set to FONTNAME as is regardless of FORCE_P.
+
+ If FONTNAME is not available on frame F,
+ return 0 if MAY_FAIL_P is non-zero, otherwise abort.
+ If the fullname is not in a valid XLFD format,
+ return 0 if MAY_FAIL_P is non-zero, otherwise set normal values
+ in LFACE and return 1.
+ Otherwise, return 1. */
+
+static int
+set_lface_from_font_name (f, lface, fontname, force_p, may_fail_p)
+ struct frame *f;
+ Lisp_Object lface;
+ Lisp_Object fontname;
+ int force_p, may_fail_p;
+{
+ struct font_name font;
+ char *buffer;
+ int pt;
+ int have_xlfd_p;
+ int fontset;
+ char *font_name = XSTRING (fontname)->data;
+ struct font_info *font_info;
+
+ /* If FONTNAME is actually a fontset name, get ASCII font name of it. */
+ fontset = fs_query_fontset (fontname, 0);
+ if (fontset >= 0)
+ font_name = XSTRING (fontset_ascii (fontset))->data;
+
+ /* Check if FONT_NAME is surely available on the system. Usually
+ FONT_NAME is already cached for the frame F and FS_LOAD_FONT
+ returns quickly. But, even if FONT_NAME is not yet cached,
+     caching it now is not futile because we load the font later
+     anyway.  */
+ BLOCK_INPUT;
+ font_info = FS_LOAD_FONT (f, 0, font_name, -1);
+ UNBLOCK_INPUT;
+
+ if (!font_info)
+ {
+ if (may_fail_p)
+ return 0;
+ abort ();
+ }
+
+ font.name = STRDUPA (font_info->full_name);
+ have_xlfd_p = split_font_name (f, &font, 1);
+
+ /* Set attributes only if unspecified, otherwise face defaults for
+ new frames would never take effect. If we couldn't get a font
+ name conforming to XLFD, set normal values. */
+
+ if (force_p || UNSPECIFIEDP (LFACE_FAMILY (lface)))
+ {
+ Lisp_Object val;
+ if (have_xlfd_p)
+ {
+ buffer = (char *) alloca (strlen (font.fields[XLFD_FAMILY])
+ + strlen (font.fields[XLFD_FOUNDRY])
+ + 2);
+ sprintf (buffer, "%s-%s", font.fields[XLFD_FOUNDRY],
+ font.fields[XLFD_FAMILY]);
+ val = build_string (buffer);
+ }
+ else
+ val = build_string ("*");
+ LFACE_FAMILY (lface) = val;
+ }
+
+ if (force_p || UNSPECIFIEDP (LFACE_HEIGHT (lface)))
+ {
+ if (have_xlfd_p)
+ pt = xlfd_point_size (f, &font);
+ else
+ pt = pixel_point_size (f, font_info->height * 10);
+ xassert (pt > 0);
+ LFACE_HEIGHT (lface) = make_number (pt);
+ }
+
+ if (force_p || UNSPECIFIEDP (LFACE_SWIDTH (lface)))
+ LFACE_SWIDTH (lface)
+ = have_xlfd_p ? xlfd_symbolic_swidth (&font) : Qnormal;
+
+ if (force_p || UNSPECIFIEDP (LFACE_AVGWIDTH (lface)))
+ LFACE_AVGWIDTH (lface)
+ = (have_xlfd_p
+ ? make_number (font.numeric[XLFD_AVGWIDTH])
+ : Qunspecified);
+
+ if (force_p || UNSPECIFIEDP (LFACE_WEIGHT (lface)))
+ LFACE_WEIGHT (lface)
+ = have_xlfd_p ? xlfd_symbolic_weight (&font) : Qnormal;
+
+ if (force_p || UNSPECIFIEDP (LFACE_SLANT (lface)))
+ LFACE_SLANT (lface)
+ = have_xlfd_p ? xlfd_symbolic_slant (&font) : Qnormal;
+
+ LFACE_FONT (lface) = fontname;
+
+ return 1;
+}
+
+#endif /* HAVE_WINDOW_SYSTEM */
+
+
+/* Merges the face height FROM with the face height TO, and returns the
+ merged height. If FROM is an invalid height, then INVALID is
+   returned instead.  FROM may be either an absolute face height or a
+ `relative' height, and TO must be an absolute height. The returned
+ value is always an absolute height. GCPRO is a lisp value that will
+ be protected from garbage-collection if this function makes a call
+ into lisp. */
+
+Lisp_Object
+merge_face_heights (from, to, invalid, gcpro)
+ Lisp_Object from, to, invalid, gcpro;
+{
+ int result = 0;
+
+ if (INTEGERP (from))
+ result = XINT (from);
+ else if (NUMBERP (from))
+ result = XFLOATINT (from) * XINT (to);
+#if 0 /* Probably not so useful. */
+ else if (CONSP (from) && CONSP (XCDR (from)))
+ {
+ if (EQ (XCAR(from), Qplus) || EQ (XCAR(from), Qminus))
+ {
+ if (INTEGERP (XCAR (XCDR (from))))
+ {
+ int inc = XINT (XCAR (XCDR (from)));
+ if (EQ (XCAR (from), Qminus))
+ inc = -inc;
+
+ result = XFASTINT (to);
+ if (result + inc > 0)
+ /* Note that `underflows' don't mean FROM is invalid, so
+ we just pin the result at TO if it would otherwise be
+ negative or 0. */
+ result += inc;
+ }
+ }
+ }
+#endif
+ else if (FUNCTIONP (from))
+ {
+      /* FROM is a function: call it with the current height TO as
+         argument; the value it returns is the new height.  */
+ Lisp_Object args[2], height;
+ struct gcpro gcpro1;
+
+ GCPRO1 (gcpro);
+
+ args[0] = from;
+ args[1] = to;
+ height = safe_call (2, args);
+
+ UNGCPRO;
+
+ if (NUMBERP (height))
+ result = XFLOATINT (height);
+ }
+
+ if (result > 0)
+ return make_number (result);
+ else
+ return invalid;
+}
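+
+/* Worked examples for merge_face_heights, assuming TO is an absolute
+   height of 120 (tenths of a point, i.e. 12pt):
+     FROM = 144          yields 144  (absolute heights pass through)
+     FROM = 1.2 (float)  yields 144  (relative height: 1.2 * 120)
+     FROM = `foo'        yields INVALID unless `foo' is a function;
+   a function-valued FROM is called with TO as argument, and its
+   result, if a positive number, becomes the merged height.  */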
+
+
+/* Merge two Lisp face attribute vectors on frame F, FROM and TO, and
+   store the resulting attributes in TO, which must already be
+ completely specified and contain only absolute attributes. Every
+ specified attribute of FROM overrides the corresponding attribute of
+ TO; relative attributes in FROM are merged with the absolute value in
+ TO and replace it. CYCLE_CHECK is used internally to detect loops in
+ face inheritance; it should be Qnil when called from other places. */
+
+static INLINE void
+merge_face_vectors (f, from, to, cycle_check)
+ struct frame *f;
+ Lisp_Object *from, *to;
+ Lisp_Object cycle_check;
+{
+ int i;
+
+ /* If FROM inherits from some other faces, merge their attributes into
+ TO before merging FROM's direct attributes. Note that an :inherit
+ attribute of `unspecified' is the same as one of nil; we never
+ merge :inherit attributes, so nil is more correct, but lots of
+ other code uses `unspecified' as a generic value for face attributes. */
+ if (!UNSPECIFIEDP (from[LFACE_INHERIT_INDEX])
+ && !NILP (from[LFACE_INHERIT_INDEX]))
+ merge_face_inheritance (f, from[LFACE_INHERIT_INDEX], to, cycle_check);
+
+ /* If TO specifies a :font attribute, and FROM specifies some
+ font-related attribute, we need to clear TO's :font attribute
+ (because it will be inconsistent with whatever FROM specifies, and
+ FROM takes precedence). */
+ if (!NILP (to[LFACE_FONT_INDEX])
+ && (!UNSPECIFIEDP (from[LFACE_FAMILY_INDEX])
+ || !UNSPECIFIEDP (from[LFACE_HEIGHT_INDEX])
+ || !UNSPECIFIEDP (from[LFACE_WEIGHT_INDEX])
+ || !UNSPECIFIEDP (from[LFACE_SLANT_INDEX])
+ || !UNSPECIFIEDP (from[LFACE_SWIDTH_INDEX])
+ || !UNSPECIFIEDP (from[LFACE_AVGWIDTH_INDEX])))
+ to[LFACE_FONT_INDEX] = Qnil;
+
+ for (i = 1; i < LFACE_VECTOR_SIZE; ++i)
+ if (!UNSPECIFIEDP (from[i]))
+ if (i == LFACE_HEIGHT_INDEX && !INTEGERP (from[i]))
+ to[i] = merge_face_heights (from[i], to[i], to[i], cycle_check);
+ else
+ to[i] = from[i];
+
+ /* TO is always an absolute face, which should inherit from nothing.
+ We blindly copy the :inherit attribute above and fix it up here. */
+ to[LFACE_INHERIT_INDEX] = Qnil;
+}
+
+
+/* Checks the `cycle check' variable CHECK to see if it indicates that
+ EL is part of a cycle; CHECK must be either Qnil or a value returned
+ by an earlier use of CYCLE_CHECK. SUSPICIOUS is the number of
+ elements after which a cycle might be suspected; after that many
+ elements, this macro begins consing in order to keep more precise
+ track of elements.
+
+ Returns NIL if a cycle was detected, otherwise a new value for CHECK
+ that includes EL.
+
+ CHECK is evaluated multiple times, EL and SUSPICIOUS 0 or 1 times, so
+ the caller should make sure that's ok. */
+
+#define CYCLE_CHECK(check, el, suspicious) \
+ (NILP (check) \
+ ? make_number (0) \
+ : (INTEGERP (check) \
+ ? (XFASTINT (check) < (suspicious) \
+ ? make_number (XFASTINT (check) + 1) \
+ : Fcons (el, Qnil)) \
+ : (!NILP (Fmemq ((el), (check))) \
+ ? Qnil \
+ : Fcons ((el), (check)))))
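+
+/* For illustration, with SUSPICIOUS == 3: starting from Qnil the
+   successive values of CHECK are 0, 1, 2, 3; once the count reaches
+   SUSPICIOUS the macro switches to consing up the elements seen, and
+   a repeated EL then makes it return Qnil, which callers treat as
+   "cycle detected".  */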
+
+
+/* Merge face attributes from the face on frame F whose name is
+ INHERITS, into the vector of face attributes TO; INHERITS may also be
+ a list of face names, in which case they are applied in order.
+   CYCLE_CHECK is used to detect loops in face inheritance.  */
+
+static void
+merge_face_inheritance (f, inherit, to, cycle_check)
+ struct frame *f;
+ Lisp_Object inherit;
+ Lisp_Object *to;
+ Lisp_Object cycle_check;
+{
+ if (SYMBOLP (inherit) && !EQ (inherit, Qunspecified))
+ /* Inherit from the named face INHERIT. */
+ {
+ Lisp_Object lface;
+
+ /* Make sure we're not in an inheritance loop. */
+ cycle_check = CYCLE_CHECK (cycle_check, inherit, 15);
+ if (NILP (cycle_check))
+ /* Cycle detected, ignore any further inheritance. */
+ return;
+
+ lface = lface_from_face_name (f, inherit, 0);
+ if (!NILP (lface))
+ merge_face_vectors (f, XVECTOR (lface)->contents, to, cycle_check);
+ }
+ else if (CONSP (inherit))
+ /* Handle a list of inherited faces by calling ourselves recursively
+ on each element. Note that we only do so for symbol elements, so
+ it's not possible to infinitely recurse. */
+ {
+ while (CONSP (inherit))
+ {
+ if (SYMBOLP (XCAR (inherit)))
+ merge_face_inheritance (f, XCAR (inherit), to, cycle_check);
+
+ /* Check for a circular inheritance list. */
+ cycle_check = CYCLE_CHECK (cycle_check, inherit, 15);
+ if (NILP (cycle_check))
+ /* Cycle detected. */
+ break;
+
+ inherit = XCDR (inherit);
+ }
+ }
+}
+
+
+/* Given a Lisp face attribute vector TO and a Lisp object PROP that
+ is a face property, determine the resulting face attributes on
+ frame F, and store them in TO. PROP may be a single face
+ specification or a list of such specifications. Each face
+ specification can be
+
+ 1. A symbol or string naming a Lisp face.
+
+ 2. A property list of the form (KEYWORD VALUE ...) where each
+ KEYWORD is a face attribute name, and value is an appropriate value
+ for that attribute.
+
+   3. Conses of the form (FOREGROUND-COLOR . COLOR) or
+ (BACKGROUND-COLOR . COLOR) where COLOR is a color name. This is
+ for compatibility with 20.2.
+
+ Face specifications earlier in lists take precedence over later
+ specifications. */
+
+static void
+merge_face_vector_with_property (f, to, prop)
+ struct frame *f;
+ Lisp_Object *to;
+ Lisp_Object prop;
+{
+ if (CONSP (prop))
+ {
+ Lisp_Object first = XCAR (prop);
+
+ if (EQ (first, Qforeground_color)
+ || EQ (first, Qbackground_color))
+ {
+ /* One of (FOREGROUND-COLOR . COLOR) or (BACKGROUND-COLOR
+ . COLOR). COLOR must be a string. */
+ Lisp_Object color_name = XCDR (prop);
+ Lisp_Object color = first;
+
+ if (STRINGP (color_name))
+ {
+ if (EQ (color, Qforeground_color))
+ to[LFACE_FOREGROUND_INDEX] = color_name;
+ else
+ to[LFACE_BACKGROUND_INDEX] = color_name;
+ }
+ else
+ add_to_log ("Invalid face color", color_name, Qnil);
+ }
+ else if (SYMBOLP (first)
+ && *XSYMBOL (first)->name->data == ':')
+ {
+ /* Assume this is the property list form. */
+ while (CONSP (prop) && CONSP (XCDR (prop)))
+ {
+ Lisp_Object keyword = XCAR (prop);
+ Lisp_Object value = XCAR (XCDR (prop));
+
+ if (EQ (keyword, QCfamily))
+ {
+ if (STRINGP (value))
+ to[LFACE_FAMILY_INDEX] = value;
+ else
+ add_to_log ("Invalid face font family", value, Qnil);
+ }
+ else if (EQ (keyword, QCheight))
+ {
+ Lisp_Object new_height =
+ merge_face_heights (value, to[LFACE_HEIGHT_INDEX],
+ Qnil, Qnil);
+
+ if (NILP (new_height))
+ add_to_log ("Invalid face font height", value, Qnil);
+ else
+ to[LFACE_HEIGHT_INDEX] = new_height;
+ }
+ else if (EQ (keyword, QCweight))
+ {
+ if (SYMBOLP (value)
+ && face_numeric_weight (value) >= 0)
+ to[LFACE_WEIGHT_INDEX] = value;
+ else
+ add_to_log ("Invalid face weight", value, Qnil);
+ }
+ else if (EQ (keyword, QCslant))
+ {
+ if (SYMBOLP (value)
+ && face_numeric_slant (value) >= 0)
+ to[LFACE_SLANT_INDEX] = value;
+ else
+ add_to_log ("Invalid face slant", value, Qnil);
+ }
+ else if (EQ (keyword, QCunderline))
+ {
+ if (EQ (value, Qt)
+ || NILP (value)
+ || STRINGP (value))
+ to[LFACE_UNDERLINE_INDEX] = value;
+ else
+ add_to_log ("Invalid face underline", value, Qnil);
+ }
+ else if (EQ (keyword, QCoverline))
+ {
+ if (EQ (value, Qt)
+ || NILP (value)
+ || STRINGP (value))
+ to[LFACE_OVERLINE_INDEX] = value;
+ else
+ add_to_log ("Invalid face overline", value, Qnil);
+ }
+ else if (EQ (keyword, QCstrike_through))
+ {
+ if (EQ (value, Qt)
+ || NILP (value)
+ || STRINGP (value))
+ to[LFACE_STRIKE_THROUGH_INDEX] = value;
+ else
+ add_to_log ("Invalid face strike-through", value, Qnil);
+ }
+ else if (EQ (keyword, QCbox))
+ {
+ if (EQ (value, Qt))
+ value = make_number (1);
+ if (INTEGERP (value)
+ || STRINGP (value)
+ || CONSP (value)
+ || NILP (value))
+ to[LFACE_BOX_INDEX] = value;
+ else
+ add_to_log ("Invalid face box", value, Qnil);
+ }
+ else if (EQ (keyword, QCinverse_video)
+ || EQ (keyword, QCreverse_video))
+ {
+ if (EQ (value, Qt) || NILP (value))
+ to[LFACE_INVERSE_INDEX] = value;
+ else
+ add_to_log ("Invalid face inverse-video", value, Qnil);
+ }
+ else if (EQ (keyword, QCforeground))
+ {
+ if (STRINGP (value))
+ to[LFACE_FOREGROUND_INDEX] = value;
+ else
+ add_to_log ("Invalid face foreground", value, Qnil);
+ }
+ else if (EQ (keyword, QCbackground))
+ {
+ if (STRINGP (value))
+ to[LFACE_BACKGROUND_INDEX] = value;
+ else
+ add_to_log ("Invalid face background", value, Qnil);
+ }
+ else if (EQ (keyword, QCstipple))
+ {
+#ifdef HAVE_X_WINDOWS
+ Lisp_Object pixmap_p = Fbitmap_spec_p (value);
+ if (!NILP (pixmap_p))
+ to[LFACE_STIPPLE_INDEX] = value;
+ else
+ add_to_log ("Invalid face stipple", value, Qnil);
+#endif
+ }
+ else if (EQ (keyword, QCwidth))
+ {
+ if (SYMBOLP (value)
+ && face_numeric_swidth (value) >= 0)
+ to[LFACE_SWIDTH_INDEX] = value;
+ else
+ add_to_log ("Invalid face width", value, Qnil);
+ }
+ else if (EQ (keyword, QCinherit))
+ {
+ if (SYMBOLP (value))
+ to[LFACE_INHERIT_INDEX] = value;
+ else
+ {
+ Lisp_Object tail;
+ for (tail = value; CONSP (tail); tail = XCDR (tail))
+ if (!SYMBOLP (XCAR (tail)))
+ break;
+ if (NILP (tail))
+ to[LFACE_INHERIT_INDEX] = value;
+ else
+ add_to_log ("Invalid face inherit", value, Qnil);
+ }
+ }
+ else
+ add_to_log ("Invalid attribute %s in face property",
+ keyword, Qnil);
+
+ prop = XCDR (XCDR (prop));
+ }
+ }
+ else
+ {
+ /* This is a list of face specs. Specifications at the
+ beginning of the list take precedence over later
+ specifications, so we have to merge starting with the
+ last specification. */
+ Lisp_Object next = XCDR (prop);
+ if (!NILP (next))
+ merge_face_vector_with_property (f, to, next);
+ merge_face_vector_with_property (f, to, first);
+ }
+ }
+ else
+ {
+ /* PROP ought to be a face name. */
+ Lisp_Object lface = lface_from_face_name (f, prop, 0);
+ if (NILP (lface))
+ add_to_log ("Invalid face text property value: %s", prop, Qnil);
+ else
+ merge_face_vectors (f, XVECTOR (lface)->contents, to, Qnil);
+ }
+}
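+
+/* For illustration, each of the following is a valid PROP for the
+   function above (the color and face names are arbitrary examples):
+     bold
+     (:foreground "red" :weight bold)
+     ((foreground-color . "red") italic)
+   In the last form the cons is merged after `italic', because
+   specifications earlier in the list take precedence.  */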
+
+
+DEFUN ("internal-make-lisp-face", Finternal_make_lisp_face,
+ Sinternal_make_lisp_face, 1, 2, 0,
+ "Make FACE, a symbol, a Lisp face with all attributes nil.\n\
+If FACE was not known as a face before, create a new one.\n\
+If optional argument FRAME is specified, make a frame-local face\n\
+for that frame. Otherwise operate on the global face definition.\n\
+Value is a vector of face attributes.")
+ (face, frame)
+ Lisp_Object face, frame;
+{
+ Lisp_Object global_lface, lface;
+ struct frame *f;
+ int i;
+
+ CHECK_SYMBOL (face, 0);
+ global_lface = lface_from_face_name (NULL, face, 0);
+
+ if (!NILP (frame))
+ {
+ CHECK_LIVE_FRAME (frame, 1);
+ f = XFRAME (frame);
+ lface = lface_from_face_name (f, face, 0);
+ }
+ else
+ f = NULL, lface = Qnil;
+
+ /* Add a global definition if there is none. */
+ if (NILP (global_lface))
+ {
+ global_lface = Fmake_vector (make_number (LFACE_VECTOR_SIZE),
+ Qunspecified);
+ AREF (global_lface, 0) = Qface;
+ Vface_new_frame_defaults = Fcons (Fcons (face, global_lface),
+ Vface_new_frame_defaults);
+
+ /* Assign the new Lisp face a unique ID. The mapping from Lisp
+ face id to Lisp face is given by the vector lface_id_to_name.
+ The mapping from Lisp face to Lisp face id is given by the
+ property `face' of the Lisp face name. */
+ if (next_lface_id == lface_id_to_name_size)
+ {
+ int new_size = max (50, 2 * lface_id_to_name_size);
+ int sz = new_size * sizeof *lface_id_to_name;
+ lface_id_to_name = (Lisp_Object *) xrealloc (lface_id_to_name, sz);
+ lface_id_to_name_size = new_size;
+ }
+
+ lface_id_to_name[next_lface_id] = face;
+ Fput (face, Qface, make_number (next_lface_id));
+ ++next_lface_id;
+ }
+ else if (f == NULL)
+ for (i = 1; i < LFACE_VECTOR_SIZE; ++i)
+ AREF (global_lface, i) = Qunspecified;
+
+ /* Add a frame-local definition. */
+ if (f)
+ {
+ if (NILP (lface))
+ {
+ lface = Fmake_vector (make_number (LFACE_VECTOR_SIZE),
+ Qunspecified);
+ AREF (lface, 0) = Qface;
+ f->face_alist = Fcons (Fcons (face, lface), f->face_alist);
+ }
+ else
+ for (i = 1; i < LFACE_VECTOR_SIZE; ++i)
+ AREF (lface, i) = Qunspecified;
+ }
+ else
+ lface = global_lface;
+
+ xassert (LFACEP (lface));
+ check_lface (lface);
+ return lface;
+}
+
+
+DEFUN ("internal-lisp-face-p", Finternal_lisp_face_p,
+ Sinternal_lisp_face_p, 1, 2, 0,
+ "Return non-nil if FACE names a face.\n\
+If optional second parameter FRAME is non-nil, check for the\n\
+existence of a frame-local face with name FACE on that frame.\n\
+Otherwise check for the existence of a global face.")
+ (face, frame)
+ Lisp_Object face, frame;
+{
+ Lisp_Object lface;
+
+ if (!NILP (frame))
+ {
+ CHECK_LIVE_FRAME (frame, 1);
+ lface = lface_from_face_name (XFRAME (frame), face, 0);
+ }
+ else
+ lface = lface_from_face_name (NULL, face, 0);
+
+ return lface;
+}
+
+
+DEFUN ("internal-copy-lisp-face", Finternal_copy_lisp_face,
+ Sinternal_copy_lisp_face, 4, 4, 0,
+ "Copy face FROM to TO.\n\
+If FRAME is t, copy the global face definition of FROM to the\n\
+global face definition of TO. Otherwise, copy the frame-local\n\
+definition of FROM on FRAME to the frame-local definition of TO\n\
+on NEW-FRAME, or FRAME if NEW-FRAME is nil.\n\
+\n\
+Value is TO.")
+ (from, to, frame, new_frame)
+ Lisp_Object from, to, frame, new_frame;
+{
+ Lisp_Object lface, copy;
+
+ CHECK_SYMBOL (from, 0);
+ CHECK_SYMBOL (to, 1);
+ if (NILP (new_frame))
+ new_frame = frame;
+
+ if (EQ (frame, Qt))
+ {
+ /* Copy global definition of FROM. We don't make copies of
+ strings etc. because 20.2 didn't do it either. */
+ lface = lface_from_face_name (NULL, from, 1);
+ copy = Finternal_make_lisp_face (to, Qnil);
+ }
+ else
+ {
+ /* Copy frame-local definition of FROM. */
+ CHECK_LIVE_FRAME (frame, 2);
+ CHECK_LIVE_FRAME (new_frame, 3);
+ lface = lface_from_face_name (XFRAME (frame), from, 1);
+ copy = Finternal_make_lisp_face (to, new_frame);
+ }
+
+ bcopy (XVECTOR (lface)->contents, XVECTOR (copy)->contents,
+ LFACE_VECTOR_SIZE * sizeof (Lisp_Object));
+
+ return to;
+}
+
+
+DEFUN ("internal-set-lisp-face-attribute", Finternal_set_lisp_face_attribute,
+ Sinternal_set_lisp_face_attribute, 3, 4, 0,
+ "Set attribute ATTR of FACE to VALUE.\n\
+FRAME being a frame means change the face on that frame.\n\
+FRAME nil means change the face of the selected frame.\n\
+FRAME t means change the default for new frames.\n\
+FRAME 0 means change the face on all frames, and change the default\n\
+ for new frames.")
+ (face, attr, value, frame)
+ Lisp_Object face, attr, value, frame;
+{
+ Lisp_Object lface;
+ Lisp_Object old_value = Qnil;
+ /* Set 1 if ATTR is QCfont. */
+ int font_attr_p = 0;
+ /* Set 1 if ATTR is one of font-related attributes other than QCfont. */
+ int font_related_attr_p = 0;
+
+ CHECK_SYMBOL (face, 0);
+ CHECK_SYMBOL (attr, 1);
+
+ face = resolve_face_name (face);
+
+ /* If FRAME is 0, change face on all frames, and change the
+ default for new frames. */
+ if (INTEGERP (frame) && XINT (frame) == 0)
+ {
+ Lisp_Object tail;
+ Finternal_set_lisp_face_attribute (face, attr, value, Qt);
+ FOR_EACH_FRAME (tail, frame)
+ Finternal_set_lisp_face_attribute (face, attr, value, frame);
+ return face;
+ }
+
+ /* Set lface to the Lisp attribute vector of FACE. */
+ if (EQ (frame, Qt))
+ lface = lface_from_face_name (NULL, face, 1);
+ else
+ {
+ if (NILP (frame))
+ frame = selected_frame;
+
+ CHECK_LIVE_FRAME (frame, 3);
+ lface = lface_from_face_name (XFRAME (frame), face, 0);
+
+ /* If a frame-local face doesn't exist yet, create one. */
+ if (NILP (lface))
+ lface = Finternal_make_lisp_face (face, frame);
+ }
+
+ if (EQ (attr, QCfamily))
+ {
+ if (!UNSPECIFIEDP (value))
+ {
+ CHECK_STRING (value, 3);
+ if (XSTRING (value)->size == 0)
+ signal_error ("Invalid face family", value);
+ }
+ old_value = LFACE_FAMILY (lface);
+ LFACE_FAMILY (lface) = value;
+ font_related_attr_p = 1;
+ }
+ else if (EQ (attr, QCheight))
+ {
+ if (!UNSPECIFIEDP (value))
+ {
+ Lisp_Object test =
+ (EQ (face, Qdefault) ? value :
+ /* The default face must have an absolute size, otherwise, we do
+ a test merge with a random height to see if VALUE's ok. */
+ merge_face_heights (value, make_number(10), Qnil, Qnil));
+
+ if (!INTEGERP(test) || XINT(test) <= 0)
+ signal_error ("Invalid face height", value);
+ }
+
+ old_value = LFACE_HEIGHT (lface);
+ LFACE_HEIGHT (lface) = value;
+ font_related_attr_p = 1;
+ }
+ else if (EQ (attr, QCweight))
+ {
+ if (!UNSPECIFIEDP (value))
+ {
+ CHECK_SYMBOL (value, 3);
+ if (face_numeric_weight (value) < 0)
+ signal_error ("Invalid face weight", value);
+ }
+ old_value = LFACE_WEIGHT (lface);
+ LFACE_WEIGHT (lface) = value;
+ font_related_attr_p = 1;
+ }
+ else if (EQ (attr, QCslant))
+ {
+ if (!UNSPECIFIEDP (value))
+ {
+ CHECK_SYMBOL (value, 3);
+ if (face_numeric_slant (value) < 0)
+ signal_error ("Invalid face slant", value);
+ }
+ old_value = LFACE_SLANT (lface);
+ LFACE_SLANT (lface) = value;
+ font_related_attr_p = 1;
+ }
+ else if (EQ (attr, QCunderline))
+ {
+ if (!UNSPECIFIEDP (value))
+ if ((SYMBOLP (value)
+ && !EQ (value, Qt)
+ && !EQ (value, Qnil))
+ /* Underline color. */
+ || (STRINGP (value)
+ && XSTRING (value)->size == 0))
+ signal_error ("Invalid face underline", value);
+
+ old_value = LFACE_UNDERLINE (lface);
+ LFACE_UNDERLINE (lface) = value;
+ }
+ else if (EQ (attr, QCoverline))
+ {
+ if (!UNSPECIFIEDP (value))
+ if ((SYMBOLP (value)
+ && !EQ (value, Qt)
+ && !EQ (value, Qnil))
+ /* Overline color. */
+ || (STRINGP (value)
+ && XSTRING (value)->size == 0))
+ signal_error ("Invalid face overline", value);
+
+ old_value = LFACE_OVERLINE (lface);
+ LFACE_OVERLINE (lface) = value;
+ }
+ else if (EQ (attr, QCstrike_through))
+ {
+ if (!UNSPECIFIEDP (value))
+ if ((SYMBOLP (value)
+ && !EQ (value, Qt)
+ && !EQ (value, Qnil))
+ /* Strike-through color. */
+ || (STRINGP (value)
+ && XSTRING (value)->size == 0))
+ signal_error ("Invalid face strike-through", value);
+
+ old_value = LFACE_STRIKE_THROUGH (lface);
+ LFACE_STRIKE_THROUGH (lface) = value;
+ }
+ else if (EQ (attr, QCbox))
+ {
+ int valid_p;
+
+ /* Allow t meaning a simple box of width 1 in foreground color
+ of the face. */
+ if (EQ (value, Qt))
+ value = make_number (1);
+
+ if (UNSPECIFIEDP (value))
+ valid_p = 1;
+ else if (NILP (value))
+ valid_p = 1;
+ else if (INTEGERP (value))
+ valid_p = XINT (value) != 0;
+ else if (STRINGP (value))
+ valid_p = XSTRING (value)->size > 0;
+ else if (CONSP (value))
+ {
+ Lisp_Object tem;
+
+ tem = value;
+ while (CONSP (tem))
+ {
+ Lisp_Object k, v;
+
+ k = XCAR (tem);
+ tem = XCDR (tem);
+ if (!CONSP (tem))
+ break;
+ v = XCAR (tem);
+ tem = XCDR (tem);
+
+ if (EQ (k, QCline_width))
+ {
+ if (!INTEGERP (v) || XINT (v) == 0)
+ break;
+ }
+ else if (EQ (k, QCcolor))
+ {
+ if (!STRINGP (v) || XSTRING (v)->size == 0)
+ break;
+ }
+ else if (EQ (k, QCstyle))
+ {
+ if (!EQ (v, Qpressed_button) && !EQ (v, Qreleased_button))
+ break;
+ }
+ else
+ break;
+ }
+
+ valid_p = NILP (tem);
+ }
+ else
+ valid_p = 0;
+
+ if (!valid_p)
+ signal_error ("Invalid face box", value);
+
+ old_value = LFACE_BOX (lface);
+ LFACE_BOX (lface) = value;
+ }
+ else if (EQ (attr, QCinverse_video)
+ || EQ (attr, QCreverse_video))
+ {
+ if (!UNSPECIFIEDP (value))
+ {
+ CHECK_SYMBOL (value, 3);
+ if (!EQ (value, Qt) && !NILP (value))
+ signal_error ("Invalid inverse-video face attribute value", value);
+ }
+ old_value = LFACE_INVERSE (lface);
+ LFACE_INVERSE (lface) = value;
+ }
+ else if (EQ (attr, QCforeground))
+ {
+ if (!UNSPECIFIEDP (value))
+ {
+ /* Don't check for valid color names here because it depends
+ on the frame (display) whether the color will be valid
+ when the face is realized. */
+ CHECK_STRING (value, 3);
+ if (XSTRING (value)->size == 0)
+ signal_error ("Empty foreground color value", value);
+ }
+ old_value = LFACE_FOREGROUND (lface);
+ LFACE_FOREGROUND (lface) = value;
+ }
+ else if (EQ (attr, QCbackground))
+ {
+ if (!UNSPECIFIEDP (value))
+ {
+ /* Don't check for valid color names here because it depends
+ on the frame (display) whether the color will be valid
+ when the face is realized. */
+ CHECK_STRING (value, 3);
+ if (XSTRING (value)->size == 0)
+ signal_error ("Empty background color value", value);
+ }
+ old_value = LFACE_BACKGROUND (lface);
+ LFACE_BACKGROUND (lface) = value;
+ }
+ else if (EQ (attr, QCstipple))
+ {
+#ifdef HAVE_X_WINDOWS
+ if (!UNSPECIFIEDP (value)
+ && !NILP (value)
+ && NILP (Fbitmap_spec_p (value)))
+ signal_error ("Invalid stipple attribute", value);
+ old_value = LFACE_STIPPLE (lface);
+ LFACE_STIPPLE (lface) = value;
+#endif /* HAVE_X_WINDOWS */
+ }
+ else if (EQ (attr, QCwidth))
+ {
+ if (!UNSPECIFIEDP (value))
+ {
+ CHECK_SYMBOL (value, 3);
+ if (face_numeric_swidth (value) < 0)
+ signal_error ("Invalid face width", value);
+ }
+ old_value = LFACE_SWIDTH (lface);
+ LFACE_SWIDTH (lface) = value;
+ font_related_attr_p = 1;
+ }
+ else if (EQ (attr, QCfont))
+ {
+#ifdef HAVE_WINDOW_SYSTEM
+ /* Set font-related attributes of the Lisp face from an
+ XLFD font name. */
+ struct frame *f;
+ Lisp_Object tmp;
+
+ CHECK_STRING (value, 3);
+ if (EQ (frame, Qt))
+ f = SELECTED_FRAME ();
+ else
+ f = check_x_frame (frame);
+
+ /* VALUE may be a fontset name or an alias of fontset. In such
+ a case, use the base fontset name. */
+ tmp = Fquery_fontset (value, Qnil);
+ if (!NILP (tmp))
+ value = tmp;
+
+ if (!set_lface_from_font_name (f, lface, value, 1, 1))
+ signal_error ("Invalid font or fontset name", value);
+
+ font_attr_p = 1;
+#endif /* HAVE_WINDOW_SYSTEM */
+ }
+ else if (EQ (attr, QCinherit))
+ {
+ Lisp_Object tail;
+ if (SYMBOLP (value))
+ tail = Qnil;
+ else
+ for (tail = value; CONSP (tail); tail = XCDR (tail))
+ if (!SYMBOLP (XCAR (tail)))
+ break;
+ if (NILP (tail))
+ LFACE_INHERIT (lface) = value;
+ else
+ signal_error ("Invalid face inheritance", value);
+ }
+ else if (EQ (attr, QCbold))
+ {
+ old_value = LFACE_WEIGHT (lface);
+ LFACE_WEIGHT (lface) = NILP (value) ? Qnormal : Qbold;
+ font_related_attr_p = 1;
+ }
+ else if (EQ (attr, QCitalic))
+ {
+ old_value = LFACE_SLANT (lface);
+ LFACE_SLANT (lface) = NILP (value) ? Qnormal : Qitalic;
+ font_related_attr_p = 1;
+ }
+ else
+ signal_error ("Invalid face attribute name", attr);
+
+ if (font_related_attr_p
+ && !UNSPECIFIEDP (value))
+    /* If a font-related attribute other than QCfont is specified,
+       neither the original `font' attribute nor that of the default
+       face is useful for determining a new font.  Thus, we set it to
+       nil so that the font selection mechanism doesn't use it.  */
+ LFACE_FONT (lface) = Qnil;
+
+ /* Changing a named face means that all realized faces depending on
+ that face are invalid. Since we cannot tell which realized faces
+ depend on the face, make sure they are all removed. This is done
+ by incrementing face_change_count. The next call to
+ init_iterator will then free realized faces. */
+ if (!EQ (frame, Qt)
+ && (EQ (attr, QCfont)
+ || NILP (Fequal (old_value, value))))
+ {
+ ++face_change_count;
+ ++windows_or_buffers_changed;
+ }
+
+ if (!UNSPECIFIEDP (value)
+ && NILP (Fequal (old_value, value)))
+ {
+ Lisp_Object param;
+
+ param = Qnil;
+
+ if (EQ (face, Qdefault))
+ {
+#ifdef HAVE_WINDOW_SYSTEM
+ /* Changed font-related attributes of the `default' face are
+ reflected in changed `font' frame parameters. */
+ if (FRAMEP (frame)
+ && (font_related_attr_p || font_attr_p)
+ && lface_fully_specified_p (XVECTOR (lface)->contents))
+ set_font_frame_param (frame, lface);
+ else
+#endif /* HAVE_WINDOW_SYSTEM */
+
+ if (EQ (attr, QCforeground))
+ param = Qforeground_color;
+ else if (EQ (attr, QCbackground))
+ param = Qbackground_color;
+ }
+#ifdef HAVE_WINDOW_SYSTEM
+#ifndef WINDOWSNT
+ else if (EQ (face, Qscroll_bar))
+ {
+ /* Changing the colors of `scroll-bar' sets frame parameters
+ `scroll-bar-foreground' and `scroll-bar-background'. */
+ if (EQ (attr, QCforeground))
+ param = Qscroll_bar_foreground;
+ else if (EQ (attr, QCbackground))
+ param = Qscroll_bar_background;
+ }
+#endif /* not WINDOWSNT */
+ else if (EQ (face, Qborder))
+ {
+ /* Changing background color of `border' sets frame parameter
+ `border-color'. */
+ if (EQ (attr, QCbackground))
+ param = Qborder_color;
+ }
+ else if (EQ (face, Qcursor))
+ {
+ /* Changing background color of `cursor' sets frame parameter
+ `cursor-color'. */
+ if (EQ (attr, QCbackground))
+ param = Qcursor_color;
+ }
+ else if (EQ (face, Qmouse))
+ {
+ /* Changing background color of `mouse' sets frame parameter
+ `mouse-color'. */
+ if (EQ (attr, QCbackground))
+ param = Qmouse_color;
+ }
+#endif /* HAVE_WINDOW_SYSTEM */
+ else if (EQ (face, Qmenu))
+ {
+ /* Indicate that we have to update the menu bar when
+             realizing faces on FRAME.  FRAME t means change the
+             default for new frames.  We do this by setting the flag
+             in new face caches.  */
+ if (FRAMEP (frame))
+ {
+ struct frame *f = XFRAME (frame);
+ if (FRAME_FACE_CACHE (f) == NULL)
+ FRAME_FACE_CACHE (f) = make_face_cache (f);
+ FRAME_FACE_CACHE (f)->menu_face_changed_p = 1;
+ }
+ else
+ menu_face_changed_default = 1;
+ }
+
+ if (!NILP (param))
+ if (EQ (frame, Qt))
+ /* Update `default-frame-alist', which is used for new frames. */
+ {
+ store_in_alist (&Vdefault_frame_alist, param, value);
+ }
+ else
+ /* Update the current frame's parameters. */
+ {
+ Lisp_Object cons;
+ cons = XCAR (Vparam_value_alist);
+ XCAR (cons) = param;
+ XCDR (cons) = value;
+ Fmodify_frame_parameters (frame, Vparam_value_alist);
+ }
+ }
+
+ return face;
+}
+
+
+#ifdef HAVE_WINDOW_SYSTEM
+
+/* Set the `font' frame parameter of FRAME determined from `default'
+   face attributes LFACE.  If a face or fontset name is explicitly
+   specified in LFACE, use it as is.  Otherwise, determine a font name
+   from the other font-related attributes of LFACE.  In that case, if
+   there's no matching font, signal an error.  */
+
+static void
+set_font_frame_param (frame, lface)
+ Lisp_Object frame, lface;
+{
+ struct frame *f = XFRAME (frame);
+
+ if (FRAME_WINDOW_P (f))
+ {
+ Lisp_Object font_name;
+ char *font;
+
+ if (STRINGP (LFACE_FONT (lface)))
+ font_name = LFACE_FONT (lface);
+ else
+ {
+ /* Choose a font name that reflects LFACE's attributes and has
+ the registry and encoding pattern specified in the default
+ fontset (3rd arg: -1) for ASCII characters (4th arg: 0). */
+ font = choose_face_font (f, XVECTOR (lface)->contents, -1, 0);
+ if (!font)
+ error ("No font matches the specified attribute");
+ font_name = build_string (font);
+ xfree (font);
+ }
+
+ Fmodify_frame_parameters (frame, Fcons (Fcons (Qfont, font_name), Qnil));
+ }
+}
+
+
+/* Update the corresponding face when frame parameter PARAM on frame F
+ has been assigned the value NEW_VALUE. */
+
+void
+update_face_from_frame_parameter (f, param, new_value)
+ struct frame *f;
+ Lisp_Object param, new_value;
+{
+  Lisp_Object face = Qnil;
+  Lisp_Object lface;
+
+ /* If there are no faces yet, give up. This is the case when called
+ from Fx_create_frame, and we do the necessary things later in
+ face-set-after-frame-defaults. */
+ if (NILP (f->face_alist))
+ return;
+
+ if (EQ (param, Qforeground_color))
+ {
+ lface = lface_from_face_name (f, Qdefault, 1);
+ LFACE_FOREGROUND (lface) = (STRINGP (new_value)
+ ? new_value : Qunspecified);
+ realize_basic_faces (f);
+ }
+ else if (EQ (param, Qbackground_color))
+ {
+ Lisp_Object frame;
+
+ /* Changing the background color might change the background
+ mode, so that we have to load new defface specs. Call
+ frame-update-face-colors to do that. */
+ XSETFRAME (frame, f);
+ call1 (Qframe_update_face_colors, frame);
+
+ face = Qdefault;
+ lface = lface_from_face_name (f, face, 1);
+ LFACE_BACKGROUND (lface) = (STRINGP (new_value)
+ ? new_value : Qunspecified);
+ realize_basic_faces (f);
+ }
+ else if (EQ (param, Qborder_color))
+ {
+ face = Qborder;
+ lface = lface_from_face_name (f, face, 1);
+ LFACE_BACKGROUND (lface) = (STRINGP (new_value)
+ ? new_value : Qunspecified);
+ }
+ else if (EQ (param, Qcursor_color))
+ {
+ face = Qcursor;
+ lface = lface_from_face_name (f, face, 1);
+ LFACE_BACKGROUND (lface) = (STRINGP (new_value)
+ ? new_value : Qunspecified);
+ }
+ else if (EQ (param, Qmouse_color))
+ {
+ face = Qmouse;
+ lface = lface_from_face_name (f, face, 1);
+ LFACE_BACKGROUND (lface) = (STRINGP (new_value)
+ ? new_value : Qunspecified);
+ }
+
+ /* Changing a named face means that all realized faces depending on
+ that face are invalid. Since we cannot tell which realized faces
+ depend on the face, make sure they are all removed. This is done
+ by incrementing face_change_count. The next call to
+ init_iterator will then free realized faces. */
+ if (!NILP (face)
+ && NILP (Fget (face, Qface_no_inherit)))
+ {
+ ++face_change_count;
+ ++windows_or_buffers_changed;
+ }
+}
+
+
+/* Get the value of X resource RESOURCE, class CLASS for the display
+ of frame FRAME. This is here because ordinary `x-get-resource'
+ doesn't take a frame argument. */
+
+DEFUN ("internal-face-x-get-resource", Finternal_face_x_get_resource,
+ Sinternal_face_x_get_resource, 3, 3, 0, "")
+ (resource, class, frame)
+ Lisp_Object resource, class, frame;
+{
+ Lisp_Object value = Qnil;
+#ifndef WINDOWSNT
+#ifndef macintosh
+ CHECK_STRING (resource, 0);
+ CHECK_STRING (class, 1);
+ CHECK_LIVE_FRAME (frame, 2);
+ BLOCK_INPUT;
+ value = display_x_get_resource (FRAME_X_DISPLAY_INFO (XFRAME (frame)),
+ resource, class, Qnil, Qnil);
+ UNBLOCK_INPUT;
+#endif /* not macintosh */
+#endif /* not WINDOWSNT */
+ return value;
+}
+
+
+/* Return resource string VALUE as a boolean value, i.e. nil, or t.
+ If VALUE is "on" or "true", return t. If VALUE is "off" or
+ "false", return nil. Otherwise, if SIGNAL_P is non-zero, signal an
+ error; if SIGNAL_P is zero, return 0. */
+
+static Lisp_Object
+face_boolean_x_resource_value (value, signal_p)
+ Lisp_Object value;
+ int signal_p;
+{
+ Lisp_Object result = make_number (0);
+
+ xassert (STRINGP (value));
+
+ if (xstricmp (XSTRING (value)->data, "on") == 0
+ || xstricmp (XSTRING (value)->data, "true") == 0)
+ result = Qt;
+ else if (xstricmp (XSTRING (value)->data, "off") == 0
+ || xstricmp (XSTRING (value)->data, "false") == 0)
+ result = Qnil;
+ else if (xstricmp (XSTRING (value)->data, "unspecified") == 0)
+ result = Qunspecified;
+ else if (signal_p)
+ signal_error ("Invalid face attribute value from X resource", value);
+
+ return result;
+}
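+
+/* For illustration: "on" and "True" map to Qt, "OFF" and "false" map
+   to Qnil (the comparison ignores case), "unspecified" maps to
+   Qunspecified, and any other string either signals an error or,
+   with SIGNAL_P zero, returns the integer 0 RESULT starts out as.  */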
+
+
+DEFUN ("internal-set-lisp-face-attribute-from-resource",
+ Finternal_set_lisp_face_attribute_from_resource,
+ Sinternal_set_lisp_face_attribute_from_resource,
+ 3, 4, 0, "")
+ (face, attr, value, frame)
+ Lisp_Object face, attr, value, frame;
+{
+ CHECK_SYMBOL (face, 0);
+ CHECK_SYMBOL (attr, 1);
+ CHECK_STRING (value, 2);
+
+ if (xstricmp (XSTRING (value)->data, "unspecified") == 0)
+ value = Qunspecified;
+ else if (EQ (attr, QCheight))
+ {
+ value = Fstring_to_number (value, make_number (10));
+ if (XINT (value) <= 0)
+ signal_error ("Invalid face height from X resource", value);
+ }
+ else if (EQ (attr, QCbold) || EQ (attr, QCitalic))
+ value = face_boolean_x_resource_value (value, 1);
+ else if (EQ (attr, QCweight) || EQ (attr, QCslant) || EQ (attr, QCwidth))
+ value = intern (XSTRING (value)->data);
+ else if (EQ (attr, QCreverse_video) || EQ (attr, QCinverse_video))
+ value = face_boolean_x_resource_value (value, 1);
+ else if (EQ (attr, QCunderline)
+ || EQ (attr, QCoverline)
+ || EQ (attr, QCstrike_through))
+ {
+ Lisp_Object boolean_value;
+
+ /* If the result of face_boolean_x_resource_value is t or nil,
+ VALUE does NOT specify a color. */
+ boolean_value = face_boolean_x_resource_value (value, 0);
+ if (SYMBOLP (boolean_value))
+ value = boolean_value;
+ }
+ else if (EQ (attr, QCbox))
+ value = Fcar (Fread_from_string (value, Qnil, Qnil));
+
+ return Finternal_set_lisp_face_attribute (face, attr, value, frame);
+}
+
+#endif /* HAVE_WINDOW_SYSTEM */
+
+
+/***********************************************************************
+ Menu face
+ ***********************************************************************/
+
+#if defined HAVE_X_WINDOWS && defined USE_X_TOOLKIT
+
+/* Make menus on frame F appear as specified by the `menu' face. */
+
+static void
+x_update_menu_appearance (f)
+ struct frame *f;
+{
+ struct x_display_info *dpyinfo = FRAME_X_DISPLAY_INFO (f);
+ XrmDatabase rdb;
+
+ if (dpyinfo
+ && (rdb = XrmGetDatabase (FRAME_X_DISPLAY (f)),
+ rdb != NULL))
+ {
+ char line[512];
+ Lisp_Object lface = lface_from_face_name (f, Qmenu, 1);
+ struct face *face = FACE_FROM_ID (f, MENU_FACE_ID);
+ char *myname = XSTRING (Vx_resource_name)->data;
+ int changed_p = 0;
+#ifdef USE_MOTIF
+ const char *popup_path = "popup_menu";
+#else
+ const char *popup_path = "menu.popup";
+#endif
+
+ if (STRINGP (LFACE_FOREGROUND (lface)))
+ {
+ sprintf (line, "%s.%s*foreground: %s",
+ myname, popup_path,
+ XSTRING (LFACE_FOREGROUND (lface))->data);
+ XrmPutLineResource (&rdb, line);
+ sprintf (line, "%s.pane.menubar*foreground: %s",
+ myname, XSTRING (LFACE_FOREGROUND (lface))->data);
+ XrmPutLineResource (&rdb, line);
+ changed_p = 1;
+ }
+
+ if (STRINGP (LFACE_BACKGROUND (lface)))
+ {
+ sprintf (line, "%s.%s*background: %s",
+ myname, popup_path,
+ XSTRING (LFACE_BACKGROUND (lface))->data);
+ XrmPutLineResource (&rdb, line);
+ sprintf (line, "%s.pane.menubar*background: %s",
+ myname, XSTRING (LFACE_BACKGROUND (lface))->data);
+ XrmPutLineResource (&rdb, line);
+ changed_p = 1;
+ }
+
+ if (face->font_name
+ && (!UNSPECIFIEDP (LFACE_FAMILY (lface))
+ || !UNSPECIFIEDP (LFACE_SWIDTH (lface))
+ || !UNSPECIFIEDP (LFACE_AVGWIDTH (lface))
+ || !UNSPECIFIEDP (LFACE_WEIGHT (lface))
+ || !UNSPECIFIEDP (LFACE_SLANT (lface))
+ || !UNSPECIFIEDP (LFACE_HEIGHT (lface))))
+ {
+#ifdef USE_MOTIF
+ const char *suffix = "List";
+#else
+ const char *suffix = "";
+#endif
+ sprintf (line, "%s.pane.menubar*font%s: %s",
+ myname, suffix, face->font_name);
+ XrmPutLineResource (&rdb, line);
+ sprintf (line, "%s.%s*font%s: %s",
+ myname, popup_path, suffix, face->font_name);
+ XrmPutLineResource (&rdb, line);
+ changed_p = 1;
+ }
+
+ if (changed_p && f->output_data.x->menubar_widget)
+ free_frame_menubar (f);
+ }
+}
+
+#endif /* HAVE_X_WINDOWS && USE_X_TOOLKIT */
+
+
+
+DEFUN ("internal-get-lisp-face-attribute", Finternal_get_lisp_face_attribute,
+ Sinternal_get_lisp_face_attribute,
+ 2, 3, 0,
+ "Return face attribute KEYWORD of face SYMBOL.\n\
+If SYMBOL does not name a valid Lisp face or KEYWORD isn't a valid\n\
+face attribute name, signal an error.\n\
+If the optional argument FRAME is given, report on face FACE in that\n\
+frame. If FRAME is t, report on the defaults for face FACE (for new\n\
+frames). If FRAME is omitted or nil, use the selected frame.")
+ (symbol, keyword, frame)
+ Lisp_Object symbol, keyword, frame;
+{
+ Lisp_Object lface, value = Qnil;
+
+ CHECK_SYMBOL (symbol, 0);
+ CHECK_SYMBOL (keyword, 1);
+
+ if (EQ (frame, Qt))
+ lface = lface_from_face_name (NULL, symbol, 1);
+ else
+ {
+ if (NILP (frame))
+ frame = selected_frame;
+ CHECK_LIVE_FRAME (frame, 2);
+ lface = lface_from_face_name (XFRAME (frame), symbol, 1);
+ }
+
+ if (EQ (keyword, QCfamily))
+ value = LFACE_FAMILY (lface);
+ else if (EQ (keyword, QCheight))
+ value = LFACE_HEIGHT (lface);
+ else if (EQ (keyword, QCweight))
+ value = LFACE_WEIGHT (lface);
+ else if (EQ (keyword, QCslant))
+ value = LFACE_SLANT (lface);
+ else if (EQ (keyword, QCunderline))
+ value = LFACE_UNDERLINE (lface);
+ else if (EQ (keyword, QCoverline))
+ value = LFACE_OVERLINE (lface);
+ else if (EQ (keyword, QCstrike_through))
+ value = LFACE_STRIKE_THROUGH (lface);
+ else if (EQ (keyword, QCbox))
+ value = LFACE_BOX (lface);
+ else if (EQ (keyword, QCinverse_video)
+ || EQ (keyword, QCreverse_video))
+ value = LFACE_INVERSE (lface);
+ else if (EQ (keyword, QCforeground))
+ value = LFACE_FOREGROUND (lface);
+ else if (EQ (keyword, QCbackground))
+ value = LFACE_BACKGROUND (lface);
+ else if (EQ (keyword, QCstipple))
+ value = LFACE_STIPPLE (lface);
+ else if (EQ (keyword, QCwidth))
+ value = LFACE_SWIDTH (lface);
+ else if (EQ (keyword, QCinherit))
+ value = LFACE_INHERIT (lface);
+ else if (EQ (keyword, QCfont))
+ value = LFACE_FONT (lface);
+ else
+ signal_error ("Invalid face attribute name", keyword);
+
+ return value;
+}
+
+
+DEFUN ("internal-lisp-face-attribute-values",
+ Finternal_lisp_face_attribute_values,
+ Sinternal_lisp_face_attribute_values, 1, 1, 0,
+ "Return a list of valid discrete values for face attribute ATTR.\n\
+Value is nil if ATTR doesn't have a discrete set of valid values.")
+ (attr)
+ Lisp_Object attr;
+{
+ Lisp_Object result = Qnil;
+
+ CHECK_SYMBOL (attr, 0);
+
+ if (EQ (attr, QCweight)
+ || EQ (attr, QCslant)
+ || EQ (attr, QCwidth))
+ {
+ /* Extract permissible symbols from tables. */
+ struct table_entry *table;
+ int i, dim;
+
+ if (EQ (attr, QCweight))
+ table = weight_table, dim = DIM (weight_table);
+ else if (EQ (attr, QCslant))
+ table = slant_table, dim = DIM (slant_table);
+ else
+ table = swidth_table, dim = DIM (swidth_table);
+
+ for (i = 0; i < dim; ++i)
+ {
+ Lisp_Object symbol = *table[i].symbol;
+ Lisp_Object tail = result;
+
+ while (!NILP (tail)
+ && !EQ (XCAR (tail), symbol))
+ tail = XCDR (tail);
+
+ if (NILP (tail))
+ result = Fcons (symbol, result);
+ }
+ }
+ else if (EQ (attr, QCunderline))
+ result = Fcons (Qt, Fcons (Qnil, Qnil));
+ else if (EQ (attr, QCoverline))
+ result = Fcons (Qt, Fcons (Qnil, Qnil));
+ else if (EQ (attr, QCstrike_through))
+ result = Fcons (Qt, Fcons (Qnil, Qnil));
+ else if (EQ (attr, QCinverse_video) || EQ (attr, QCreverse_video))
+ result = Fcons (Qt, Fcons (Qnil, Qnil));
+
+ return result;
+}
+
+
+DEFUN ("internal-merge-in-global-face", Finternal_merge_in_global_face,
+ Sinternal_merge_in_global_face, 2, 2, 0,
+ "Add attributes from frame-default definition of FACE to FACE on FRAME.\n\
+Default face attributes override any local face attributes.")
+ (face, frame)
+ Lisp_Object face, frame;
+{
+ int i;
+ Lisp_Object global_lface, local_lface, *gvec, *lvec;
+
+ CHECK_LIVE_FRAME (frame, 1);
+ global_lface = lface_from_face_name (NULL, face, 1);
+ local_lface = lface_from_face_name (XFRAME (frame), face, 0);
+ if (NILP (local_lface))
+ local_lface = Finternal_make_lisp_face (face, frame);
+
+ /* Make every specified global attribute override the local one.
+ BEWARE!! This is only used from `face-set-after-frame-default' where
+ the local frame is defined from default specs in `face-defface-spec'
+ and those should be overridden by global settings. Hence the strange
+ "global before local" priority. */
+ lvec = XVECTOR (local_lface)->contents;
+ gvec = XVECTOR (global_lface)->contents;
+ for (i = 1; i < LFACE_VECTOR_SIZE; ++i)
+ if (! UNSPECIFIEDP (gvec[i]))
+ lvec[i] = gvec[i];
+
+ return Qnil;
+}
+
+
+/* The following function is implemented for compatibility with 20.2.
+ The function is used in x-resolve-fonts when it is asked to
+ return fonts with the same size as the font of a face. This is
+ done in fontset.el. */
+
+DEFUN ("face-font", Fface_font, Sface_font, 1, 2, 0,
+ "Return the font name of face FACE, or nil if it is unspecified.\n\
+If the optional argument FRAME is given, report on face FACE in that frame.\n\
+If FRAME is t, report on the defaults for face FACE (for new frames).\n\
+ The font default for a face is either nil, or a list\n\
+ of the form (bold), (italic) or (bold italic).\n\
+If FRAME is omitted or nil, use the selected frame.")
+ (face, frame)
+ Lisp_Object face, frame;
+{
+ if (EQ (frame, Qt))
+ {
+ Lisp_Object result = Qnil;
+ Lisp_Object lface = lface_from_face_name (NULL, face, 1);
+
+ if (!UNSPECIFIEDP (LFACE_WEIGHT (lface))
+ && !EQ (LFACE_WEIGHT (lface), Qnormal))
+ result = Fcons (Qbold, result);
+
+ if (!UNSPECIFIEDP (LFACE_SLANT (lface))
+ && !EQ (LFACE_SLANT (lface), Qnormal))
+ result = Fcons (Qitalic, result);
+
+ return result;
+ }
+ else
+ {
+ struct frame *f = frame_or_selected_frame (frame, 1);
+ int face_id = lookup_named_face (f, face, 0);
+ struct face *face = FACE_FROM_ID (f, face_id);
+ return face ? build_string (face->font_name) : Qnil;
+ }
+}
+
+
+/* Compare face vectors V1 and V2 for equality. Value is non-zero if
+ all attributes are `equal'. Tries to be fast because this function
+ is called quite often. */
+
+static INLINE int
+lface_equal_p (v1, v2)
+ Lisp_Object *v1, *v2;
+{
+ int i, equal_p = 1;
+
+ for (i = 1; i < LFACE_VECTOR_SIZE && equal_p; ++i)
+ {
+ Lisp_Object a = v1[i];
+ Lisp_Object b = v2[i];
+
+ /* Type can differ, e.g. when one attribute is unspecified, i.e. nil,
+ and the other is specified. */
+ equal_p = XTYPE (a) == XTYPE (b);
+ if (!equal_p)
+ break;
+
+ if (!EQ (a, b))
+ {
+ switch (XTYPE (a))
+ {
+ case Lisp_String:
+ equal_p = ((STRING_BYTES (XSTRING (a))
+ == STRING_BYTES (XSTRING (b)))
+ && bcmp (XSTRING (a)->data, XSTRING (b)->data,
+ STRING_BYTES (XSTRING (a))) == 0);
+ break;
+
+ case Lisp_Int:
+ case Lisp_Symbol:
+ equal_p = 0;
+ break;
+
+ default:
+ equal_p = !NILP (Fequal (a, b));
+ break;
+ }
+ }
+ }
+
+ return equal_p;
+}
+
+
+DEFUN ("internal-lisp-face-equal-p", Finternal_lisp_face_equal_p,
+ Sinternal_lisp_face_equal_p, 2, 3, 0,
+ "True if FACE1 and FACE2 are equal.\n\
+If the optional argument FRAME is given, report on face FACE in that frame.\n\
+If FRAME is t, report on the defaults for face FACE (for new frames).\n\
+If FRAME is omitted or nil, use the selected frame.")
+ (face1, face2, frame)
+ Lisp_Object face1, face2, frame;
+{
+ int equal_p;
+ struct frame *f;
+ Lisp_Object lface1, lface2;
+
+ if (EQ (frame, Qt))
+ f = NULL;
+ else
+ /* Don't use check_x_frame here because this function is called
+ before X frames exist. At that time, if FRAME is nil,
+ selected_frame will be used which is the frame dumped with
+ Emacs. That frame is not an X frame. */
+ f = frame_or_selected_frame (frame, 2);
+
+ lface1 = lface_from_face_name (NULL, face1, 1);
+ lface2 = lface_from_face_name (NULL, face2, 1);
+ equal_p = lface_equal_p (XVECTOR (lface1)->contents,
+ XVECTOR (lface2)->contents);
+ return equal_p ? Qt : Qnil;
+}
+
+
+DEFUN ("internal-lisp-face-empty-p", Finternal_lisp_face_empty_p,
+ Sinternal_lisp_face_empty_p, 1, 2, 0,
+ "True if FACE has no attribute specified.\n\
+If the optional argument FRAME is given, report on face FACE in that frame.\n\
+If FRAME is t, report on the defaults for face FACE (for new frames).\n\
+If FRAME is omitted or nil, use the selected frame.")
+ (face, frame)
+ Lisp_Object face, frame;
+{
+ struct frame *f;
+ Lisp_Object lface;
+ int i;
+
+ if (NILP (frame))
+ frame = selected_frame;
+ CHECK_LIVE_FRAME (frame, 0);
+ f = XFRAME (frame);
+
+ if (EQ (frame, Qt))
+ lface = lface_from_face_name (NULL, face, 1);
+ else
+ lface = lface_from_face_name (f, face, 1);
+
+ for (i = 1; i < LFACE_VECTOR_SIZE; ++i)
+ if (!UNSPECIFIEDP (AREF (lface, i)))
+ break;
+
+ return i == LFACE_VECTOR_SIZE ? Qt : Qnil;
+}
+
+
+DEFUN ("frame-face-alist", Fframe_face_alist, Sframe_face_alist,
+ 0, 1, 0,
+ "Return an alist of frame-local faces defined on FRAME.\n\
+For internal use only.")
+ (frame)
+ Lisp_Object frame;
+{
+ struct frame *f = frame_or_selected_frame (frame, 0);
+ return f->face_alist;
+}
+
+
+/* Return a hash code for Lisp string STRING with case ignored. Used
+ below in computing a hash value for a Lisp face. */
+
+static INLINE unsigned
+hash_string_case_insensitive (string)
+ Lisp_Object string;
+{
+ unsigned char *s;
+ unsigned hash = 0;
+ xassert (STRINGP (string));
+ for (s = XSTRING (string)->data; *s; ++s)
+ hash = (hash << 1) ^ tolower (*s);
+ return hash;
+}
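+
+/* For illustration: "Courier" and "courier" hash to the same value
+   here because each byte is down-cased before being mixed in, so
+   strings differing only in letter case contribute identically to
+   lface_hash below.  */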
+
+
+/* Return a hash code for face attribute vector V. */
+
+static INLINE unsigned
+lface_hash (v)
+ Lisp_Object *v;
+{
+ return (hash_string_case_insensitive (v[LFACE_FAMILY_INDEX])
+ ^ hash_string_case_insensitive (v[LFACE_FOREGROUND_INDEX])
+ ^ hash_string_case_insensitive (v[LFACE_BACKGROUND_INDEX])
+ ^ XFASTINT (v[LFACE_WEIGHT_INDEX])
+ ^ XFASTINT (v[LFACE_SLANT_INDEX])
+ ^ XFASTINT (v[LFACE_SWIDTH_INDEX])
+ ^ XFASTINT (v[LFACE_HEIGHT_INDEX]));
+}
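+
+/* For illustration: two fully specified face vectors that agree on
+   family, foreground, background, weight, slant, swidth and height
+   hash to the same value, so equal faces land in the same bucket of
+   the face cache; unrelated vectors may still collide, which is why
+   an exact comparison like lface_equal_p above is still needed.  */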
+
+
+/* Return non-zero if LFACE1 and LFACE2 specify the same font (without
+ considering charsets/registries). They do if they specify the same
+ family, point size, weight, width, slant, and fontset. Both LFACE1
+ and LFACE2 must be fully-specified. */
+
+static INLINE int
+lface_same_font_attributes_p (lface1, lface2)
+ Lisp_Object *lface1, *lface2;
+{
+ xassert (lface_fully_specified_p (lface1)
+ && lface_fully_specified_p (lface2));
+ return (xstricmp (XSTRING (lface1[LFACE_FAMILY_INDEX])->data,
+ XSTRING (lface2[LFACE_FAMILY_INDEX])->data) == 0
+ && EQ (lface1[LFACE_HEIGHT_INDEX], lface2[LFACE_HEIGHT_INDEX])
+ && EQ (lface1[LFACE_SWIDTH_INDEX], lface2[LFACE_SWIDTH_INDEX])
+ && EQ (lface1[LFACE_AVGWIDTH_INDEX], lface2[LFACE_AVGWIDTH_INDEX])
+ && EQ (lface1[LFACE_WEIGHT_INDEX], lface2[LFACE_WEIGHT_INDEX])
+ && EQ (lface1[LFACE_SLANT_INDEX], lface2[LFACE_SLANT_INDEX])
+ && (EQ (lface1[LFACE_FONT_INDEX], lface2[LFACE_FONT_INDEX])
+ || (STRINGP (lface1[LFACE_FONT_INDEX])
+ && STRINGP (lface2[LFACE_FONT_INDEX])
+                  && xstricmp (XSTRING (lface1[LFACE_FONT_INDEX])->data,
+                               XSTRING (lface2[LFACE_FONT_INDEX])->data) == 0)));
+}
+
+
+
+/***********************************************************************
+ Realized Faces
+ ***********************************************************************/
+
+/* Allocate and return a new realized face for Lisp face attribute
+ vector ATTR. */
+
+static struct face *
+make_realized_face (attr)
+ Lisp_Object *attr;
+{
+ struct face *face = (struct face *) xmalloc (sizeof *face);
+ bzero (face, sizeof *face);
+ face->ascii_face = face;
+ bcopy (attr, face->lface, sizeof face->lface);
+ return face;
+}
+
+
+/* Free realized face FACE, including its X resources. FACE may
+ be null. */
+
+static void
+free_realized_face (f, face)
+ struct frame *f;
+ struct face *face;
+{
+ if (face)
+ {
+#ifdef HAVE_WINDOW_SYSTEM
+ if (FRAME_WINDOW_P (f))
+ {
+ /* Free fontset of FACE if it is ASCII face. */
+ if (face->fontset >= 0 && face == face->ascii_face)
+ free_face_fontset (f, face);
+ if (face->gc)
+ {
+ x_free_gc (f, face->gc);
+ face->gc = 0;
+ }
+
+ free_face_colors (f, face);
+ x_destroy_bitmap (f, face->stipple);
+ }
+#endif /* HAVE_WINDOW_SYSTEM */
+
+ xfree (face);
+ }
+}
+
+
+/* Prepare face FACE for subsequent display on frame F. This
+   allocates GCs if they haven't been allocated yet or have been freed
+ by clearing the face cache. */
+
+void
+prepare_face_for_display (f, face)
+ struct frame *f;
+ struct face *face;
+{
+#ifdef HAVE_WINDOW_SYSTEM
+ xassert (FRAME_WINDOW_P (f));
+
+ if (face->gc == 0)
+ {
+ XGCValues xgcv;
+ unsigned long mask = GCForeground | GCBackground | GCGraphicsExposures;
+
+ xgcv.foreground = face->foreground;
+ xgcv.background = face->background;
+#ifdef HAVE_X_WINDOWS
+ xgcv.graphics_exposures = False;
+#endif
+ /* The font of FACE may be null if we couldn't load it. */
+ if (face->font)
+ {
+#ifdef HAVE_X_WINDOWS
+ xgcv.font = face->font->fid;
+#endif
+#ifdef WINDOWSNT
+ xgcv.font = face->font;
+#endif
+#ifdef macintosh
+ xgcv.font = face->font;
+#endif
+ mask |= GCFont;
+ }
+
+ BLOCK_INPUT;
+#ifdef HAVE_X_WINDOWS
+ if (face->stipple)
+ {
+ xgcv.fill_style = FillOpaqueStippled;
+ xgcv.stipple = x_bitmap_pixmap (f, face->stipple);
+ mask |= GCFillStyle | GCStipple;
+ }
+#endif
+ face->gc = x_create_gc (f, mask, &xgcv);
+ UNBLOCK_INPUT;
+ }
+#endif /* HAVE_WINDOW_SYSTEM */
+}
+
+
+/***********************************************************************
+ Face Cache
+ ***********************************************************************/
+
+/* Return a new face cache for frame F. */
+
+static struct face_cache *
+make_face_cache (f)
+ struct frame *f;
+{
+ struct face_cache *c;
+ int size;
+
+ c = (struct face_cache *) xmalloc (sizeof *c);
+ bzero (c, sizeof *c);
+ size = FACE_CACHE_BUCKETS_SIZE * sizeof *c->buckets;
+ c->buckets = (struct face **) xmalloc (size);
+ bzero (c->buckets, size);
+ c->size = 50;
+ c->faces_by_id = (struct face **) xmalloc (c->size * sizeof *c->faces_by_id);
+ c->f = f;
+ c->menu_face_changed_p = menu_face_changed_default;
+ return c;
+}
+
+
+/* Clear out all graphics contexts for all realized faces, except for
+ the basic faces. This should be done from time to time just to avoid
+ keeping too many graphics contexts that are no longer needed. */
+
+static void
+clear_face_gcs (c)
+ struct face_cache *c;
+{
+ if (c && FRAME_WINDOW_P (c->f))
+ {
+#ifdef HAVE_WINDOW_SYSTEM
+ int i;
+ for (i = BASIC_FACE_ID_SENTINEL; i < c->used; ++i)
+ {
+ struct face *face = c->faces_by_id[i];
+ if (face && face->gc)
+ {
+ x_free_gc (c->f, face->gc);
+ face->gc = 0;
+ }
+ }
+#endif /* HAVE_WINDOW_SYSTEM */
+ }
+}
+
+
+/* Free all realized faces in face cache C, including basic faces. C
+ may be null. If faces are freed, make sure the frame's current
+ matrix is marked invalid, so that a display caused by an expose
+ event doesn't try to use faces we destroyed. */
+
+static void
+free_realized_faces (c)
+ struct face_cache *c;
+{
+ if (c && c->used)
+ {
+ int i, size;
+ struct frame *f = c->f;
+
+ /* We must block input here because we can't process X events
+ safely while only some faces are freed, or when the frame's
+ current matrix still references freed faces. */
+ BLOCK_INPUT;
+
+ for (i = 0; i < c->used; ++i)
+ {
+ free_realized_face (f, c->faces_by_id[i]);
+ c->faces_by_id[i] = NULL;
+ }
+
+ c->used = 0;
+ size = FACE_CACHE_BUCKETS_SIZE * sizeof *c->buckets;
+ bzero (c->buckets, size);
+
+ /* Must do a thorough redisplay the next time. Mark current
+ matrices as invalid because they will reference faces freed
+ above. This function is also called when a frame is
+ destroyed. In this case, the root window of F is nil. */
+ if (WINDOWP (f->root_window))
+ {
+ clear_current_matrices (f);
+ ++windows_or_buffers_changed;
+ }
+
+ UNBLOCK_INPUT;
+ }
+}
+
+
+/* Free all faces realized for multibyte characters on frame F that
+   use fontset FONTSET. */
+
+void
+free_realized_multibyte_face (f, fontset)
+ struct frame *f;
+ int fontset;
+{
+ struct face_cache *cache = FRAME_FACE_CACHE (f);
+ struct face *face;
+ int i;
+
+ /* We must block input here because we can't process X events safely
+ while only some faces are freed, or when the frame's current
+ matrix still references freed faces. */
+ BLOCK_INPUT;
+
+ for (i = 0; i < cache->used; i++)
+ {
+ face = cache->faces_by_id[i];
+ if (face
+ && face != face->ascii_face
+ && face->fontset == fontset)
+ {
+ uncache_face (cache, face);
+ free_realized_face (f, face);
+ }
+ }
+
+ /* Must do a thorough redisplay the next time. Mark current
+ matrices as invalid because they will reference faces freed
+ above. This function is also called when a frame is destroyed.
+ In this case, the root window of F is nil. */
+ if (WINDOWP (f->root_window))
+ {
+ clear_current_matrices (f);
+ ++windows_or_buffers_changed;
+ }
+
+ UNBLOCK_INPUT;
+}
+
+
+/* Free all realized faces on FRAME or on all frames if FRAME is nil.
+ This is done after attributes of a named face have been changed,
+ because we can't tell which realized faces depend on that face. */
+
+void
+free_all_realized_faces (frame)
+ Lisp_Object frame;
+{
+ if (NILP (frame))
+ {
+ Lisp_Object rest;
+ FOR_EACH_FRAME (rest, frame)
+ free_realized_faces (FRAME_FACE_CACHE (XFRAME (frame)));
+ }
+ else
+ free_realized_faces (FRAME_FACE_CACHE (XFRAME (frame)));
+}
+
+
+/* Free face cache C and faces in it, including their X resources. */
+
+static void
+free_face_cache (c)
+ struct face_cache *c;
+{
+ if (c)
+ {
+ free_realized_faces (c);
+ xfree (c->buckets);
+ xfree (c->faces_by_id);
+ xfree (c);
+ }
+}
+
+
+/* Cache realized face FACE in face cache C. HASH is the hash value
+ of FACE. If FACE->fontset >= 0, add the new face to the end of the
+ collision list of the face hash table of C. This is done because
+ otherwise lookup_face would find FACE for every character, even if
+ faces with the same attributes but for specific characters exist. */
+
+static void
+cache_face (c, face, hash)
+ struct face_cache *c;
+ struct face *face;
+ unsigned hash;
+{
+ int i = hash % FACE_CACHE_BUCKETS_SIZE;
+
+ face->hash = hash;
+
+ if (face->fontset >= 0)
+ {
+ struct face *last = c->buckets[i];
+ if (last)
+ {
+ while (last->next)
+ last = last->next;
+ last->next = face;
+ face->prev = last;
+ face->next = NULL;
+ }
+ else
+ {
+ c->buckets[i] = face;
+ face->prev = face->next = NULL;
+ }
+ }
+ else
+ {
+ face->prev = NULL;
+ face->next = c->buckets[i];
+ if (face->next)
+ face->next->prev = face;
+ c->buckets[i] = face;
+ }
+
+ /* Find a free slot in C->faces_by_id and use the index of the free
+ slot as FACE->id. */
+ for (i = 0; i < c->used; ++i)
+ if (c->faces_by_id[i] == NULL)
+ break;
+ face->id = i;
+
+ /* Maybe enlarge C->faces_by_id. */
+ if (i == c->used && c->used == c->size)
+ {
+ int new_size = 2 * c->size;
+ int sz = new_size * sizeof *c->faces_by_id;
+ c->faces_by_id = (struct face **) xrealloc (c->faces_by_id, sz);
+ c->size = new_size;
+ }
+
+#if GLYPH_DEBUG
+ /* Check that FACE got a unique id. */
+ {
+ int j, n;
+ struct face *face;
+
+ for (j = n = 0; j < FACE_CACHE_BUCKETS_SIZE; ++j)
+ for (face = c->buckets[j]; face; face = face->next)
+ if (face->id == i)
+ ++n;
+
+ xassert (n == 1);
+ }
+#endif /* GLYPH_DEBUG */
+
+ c->faces_by_id[i] = face;
+ if (i == c->used)
+ ++c->used;
+}
+
+
+/* Remove face FACE from cache C. */
+
+static void
+uncache_face (c, face)
+ struct face_cache *c;
+ struct face *face;
+{
+ int i = face->hash % FACE_CACHE_BUCKETS_SIZE;
+
+ if (face->prev)
+ face->prev->next = face->next;
+ else
+ c->buckets[i] = face->next;
+
+ if (face->next)
+ face->next->prev = face->prev;
+
+ c->faces_by_id[face->id] = NULL;
+ if (face->id == c->used)
+ --c->used;
+}
+
+
+/* Look up a realized face with face attributes ATTR in the face cache
+ of frame F. The face will be used to display character C. Value
+ is the ID of the face found. If no suitable face is found, realize
+ a new one. In that case, if C is a multibyte character, BASE_FACE
+ is a face that has the same attributes. */
+
+INLINE int
+lookup_face (f, attr, c, base_face)
+ struct frame *f;
+ Lisp_Object *attr;
+ int c;
+ struct face *base_face;
+{
+ struct face_cache *cache = FRAME_FACE_CACHE (f);
+ unsigned hash;
+ int i;
+ struct face *face;
+
+ xassert (cache != NULL);
+ check_lface_attrs (attr);
+
+ /* Look up ATTR in the face cache. */
+ hash = lface_hash (attr);
+ i = hash % FACE_CACHE_BUCKETS_SIZE;
+
+ for (face = cache->buckets[i]; face; face = face->next)
+ if (face->hash == hash
+ && (!FRAME_WINDOW_P (f)
+ || FACE_SUITABLE_FOR_CHAR_P (face, c))
+ && lface_equal_p (face->lface, attr))
+ break;
+
+ /* If not found, realize a new face. */
+ if (face == NULL)
+ face = realize_face (cache, attr, c, base_face, -1);
+
+#if GLYPH_DEBUG
+ xassert (face == FACE_FROM_ID (f, face->id));
+
+/* When this function is called from face_for_char (in this case, C is
+   a multibyte character), the fontset of the face returned by
+   realize_face is not yet set, i.e. FACE_SUITABLE_FOR_CHAR_P (FACE,
+   C) is not satisfied.  The fontset is set for this face by
+ face_for_char later. */
+#if 0
+ if (FRAME_WINDOW_P (f))
+ xassert (FACE_SUITABLE_FOR_CHAR_P (face, c));
+#endif
+#endif /* GLYPH_DEBUG */
+
+ return face->id;
+}
+
+
+/* Return the face id of the realized face for named face SYMBOL on
+ frame F suitable for displaying character C. Value is -1 if the
+   face couldn't be determined, which might happen if the default face
+   hasn't been realized yet and cannot be realized. */
+
+int
+lookup_named_face (f, symbol, c)
+ struct frame *f;
+ Lisp_Object symbol;
+ int c;
+{
+ Lisp_Object attrs[LFACE_VECTOR_SIZE];
+ Lisp_Object symbol_attrs[LFACE_VECTOR_SIZE];
+ struct face *default_face = FACE_FROM_ID (f, DEFAULT_FACE_ID);
+
+ if (default_face == NULL)
+ {
+ if (!realize_basic_faces (f))
+ return -1;
+ default_face = FACE_FROM_ID (f, DEFAULT_FACE_ID);
+ }
+
+ get_lface_attributes (f, symbol, symbol_attrs, 1);
+ bcopy (default_face->lface, attrs, sizeof attrs);
+ merge_face_vectors (f, symbol_attrs, attrs, Qnil);
+ return lookup_face (f, attrs, c, NULL);
+}
+
+
+/* Return the ID of the realized ASCII face of Lisp face with ID
+ LFACE_ID on frame F. Value is -1 if LFACE_ID isn't valid. */
+
+int
+ascii_face_of_lisp_face (f, lface_id)
+ struct frame *f;
+ int lface_id;
+{
+ int face_id;
+
+ if (lface_id >= 0 && lface_id < lface_id_to_name_size)
+ {
+ Lisp_Object face_name = lface_id_to_name[lface_id];
+ face_id = lookup_named_face (f, face_name, 0);
+ }
+ else
+ face_id = -1;
+
+ return face_id;
+}
+
+
+/* Return a face for charset ASCII that is like the face with id
+ FACE_ID on frame F, but has a font that is STEPS steps smaller.
+ STEPS < 0 means larger. Value is the id of the face. */
+
+int
+smaller_face (f, face_id, steps)
+ struct frame *f;
+ int face_id, steps;
+{
+#ifdef HAVE_WINDOW_SYSTEM
+ struct face *face;
+ Lisp_Object attrs[LFACE_VECTOR_SIZE];
+ int pt, last_pt, last_height;
+ int delta;
+ int new_face_id;
+ struct face *new_face;
+
+ /* If not called for an X frame, just return the original face. */
+ if (FRAME_TERMCAP_P (f))
+ return face_id;
+
+ /* Try in increments of 1/2 pt. */
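+  /* Face heights (LFACE_HEIGHT) are stored in 1/10 pt, so a delta of 5
+     corresponds to half a point. */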
+ delta = steps < 0 ? 5 : -5;
+ steps = abs (steps);
+
+ face = FACE_FROM_ID (f, face_id);
+ bcopy (face->lface, attrs, sizeof attrs);
+ pt = last_pt = XFASTINT (attrs[LFACE_HEIGHT_INDEX]);
+ new_face_id = face_id;
+ last_height = FONT_HEIGHT (face->font);
+
+ while (steps
+ && pt + delta > 0
+ /* Give up if we cannot find a font within 10pt. */
+ && abs (last_pt - pt) < 100)
+ {
+ /* Look up a face for a slightly smaller/larger font. */
+ pt += delta;
+ attrs[LFACE_HEIGHT_INDEX] = make_number (pt);
+ new_face_id = lookup_face (f, attrs, 0, NULL);
+ new_face = FACE_FROM_ID (f, new_face_id);
+
+ /* If height changes, count that as one step. */
+ if ((delta < 0 && FONT_HEIGHT (new_face->font) < last_height)
+ || (delta > 0 && FONT_HEIGHT (new_face->font) > last_height))
+ {
+ --steps;
+ last_height = FONT_HEIGHT (new_face->font);
+ last_pt = pt;
+ }
+ }
+
+ return new_face_id;
+
+#else /* not HAVE_WINDOW_SYSTEM */
+
+ return face_id;
+
+#endif /* not HAVE_WINDOW_SYSTEM */
+}
+
+
+/* Return a face for charset ASCII that is like the face with id
+ FACE_ID on frame F, but has height HEIGHT. */
+
+int
+face_with_height (f, face_id, height)
+ struct frame *f;
+ int face_id;
+ int height;
+{
+#ifdef HAVE_WINDOW_SYSTEM
+ struct face *face;
+ Lisp_Object attrs[LFACE_VECTOR_SIZE];
+
+ if (FRAME_TERMCAP_P (f)
+ || height <= 0)
+ return face_id;
+
+ face = FACE_FROM_ID (f, face_id);
+ bcopy (face->lface, attrs, sizeof attrs);
+ attrs[LFACE_HEIGHT_INDEX] = make_number (height);
+ face_id = lookup_face (f, attrs, 0, NULL);
+#endif /* HAVE_WINDOW_SYSTEM */
+
+ return face_id;
+}
+
+
+/* Return the face id of the realized face for named face SYMBOL on
+ frame F suitable for displaying character C, and use attributes of
+ the face FACE_ID for attributes that aren't completely specified by
+ SYMBOL. This is like lookup_named_face, except that the default
+ attributes come from FACE_ID, not from the default face. FACE_ID
+ is assumed to be already realized. */
+
+int
+lookup_derived_face (f, symbol, c, face_id)
+ struct frame *f;
+ Lisp_Object symbol;
+ int c;
+ int face_id;
+{
+ Lisp_Object attrs[LFACE_VECTOR_SIZE];
+ Lisp_Object symbol_attrs[LFACE_VECTOR_SIZE];
+ struct face *default_face = FACE_FROM_ID (f, face_id);
+
+ if (!default_face)
+ abort ();
+
+ get_lface_attributes (f, symbol, symbol_attrs, 1);
+ bcopy (default_face->lface, attrs, sizeof attrs);
+ merge_face_vectors (f, symbol_attrs, attrs, Qnil);
+ return lookup_face (f, attrs, c, default_face);
+}
+
+
+
+/***********************************************************************
+ Font selection
+ ***********************************************************************/
+
+DEFUN ("internal-set-font-selection-order",
+ Finternal_set_font_selection_order,
+ Sinternal_set_font_selection_order, 1, 1, 0,
+ "Set font selection order for face font selection to ORDER.\n\
+ORDER must be a list of length 4 containing the symbols `:width',\n\
+`:height', `:weight', and `:slant'. Face attributes appearing\n\
+first in ORDER are matched first, e.g. if `:height' appears before\n\
+`:weight' in ORDER, font selection first tries to find a font with\n\
+a suitable height, and then tries to match the font weight.\n\
+Value is ORDER.")
+ (order)
+ Lisp_Object order;
+{
+ Lisp_Object list;
+ int i;
+ int indices[DIM (font_sort_order)];
+
+ CHECK_LIST (order, 0);
+ bzero (indices, sizeof indices);
+ i = 0;
+
+ for (list = order;
+ CONSP (list) && i < DIM (indices);
+ list = XCDR (list), ++i)
+ {
+ Lisp_Object attr = XCAR (list);
+ int xlfd;
+
+ if (EQ (attr, QCwidth))
+ xlfd = XLFD_SWIDTH;
+ else if (EQ (attr, QCheight))
+ xlfd = XLFD_POINT_SIZE;
+ else if (EQ (attr, QCweight))
+ xlfd = XLFD_WEIGHT;
+ else if (EQ (attr, QCslant))
+ xlfd = XLFD_SLANT;
+ else
+ break;
+
+ if (indices[i] != 0)
+ break;
+ indices[i] = xlfd;
+ }
+
+ if (!NILP (list) || i != DIM (indices))
+ signal_error ("Invalid font sort order", order);
+ for (i = 0; i < DIM (font_sort_order); ++i)
+ if (indices[i] == 0)
+ signal_error ("Invalid font sort order", order);
+
+ if (bcmp (indices, font_sort_order, sizeof indices) != 0)
+ {
+ bcopy (indices, font_sort_order, sizeof font_sort_order);
+ free_all_realized_faces (Qnil);
+ }
+
+ return Qnil;
+}
+
+
+DEFUN ("internal-set-alternative-font-family-alist",
+ Finternal_set_alternative_font_family_alist,
+ Sinternal_set_alternative_font_family_alist, 1, 1, 0,
+ "Define alternative font families to try in face font selection.\n\
+ALIST is an alist of (FAMILY ALTERNATIVE1 ALTERNATIVE2 ...) entries.\n\
+Each ALTERNATIVE is tried in order if no fonts of font family FAMILY can\n\
+be found. Value is ALIST.")
+ (alist)
+ Lisp_Object alist;
+{
+ CHECK_LIST (alist, 0);
+ Vface_alternative_font_family_alist = alist;
+ free_all_realized_faces (Qnil);
+ return alist;
+}
+
+
+DEFUN ("internal-set-alternative-font-registry-alist",
+ Finternal_set_alternative_font_registry_alist,
+ Sinternal_set_alternative_font_registry_alist, 1, 1, 0,
+ "Define alternative font registries to try in face font selection.\n\
+ALIST is an alist of (REGISTRY ALTERNATIVE1 ALTERNATIVE2 ...) entries.\n\
+Each ALTERNATIVE is tried in order if no fonts of font registry REGISTRY can\n\
+be found. Value is ALIST.")
+ (alist)
+ Lisp_Object alist;
+{
+ CHECK_LIST (alist, 0);
+ Vface_alternative_font_registry_alist = alist;
+ free_all_realized_faces (Qnil);
+ return alist;
+}
+
+
+#ifdef HAVE_WINDOW_SYSTEM
+
+/* Value is non-zero if FONT is the name of a scalable font. The
+ X11R6 XLFD spec says that point size, pixel size, and average width
+ are zero for scalable fonts. Intlfonts contain at least one
+ scalable font ("*-muleindian-1") for which this isn't true, so we
+ just test average width. */
+
+static int
+font_scalable_p (font)
+ struct font_name *font;
+{
+ char *s = font->fields[XLFD_AVGWIDTH];
+ return (*s == '0' && *(s + 1) == '\0')
+#ifdef WINDOWSNT
+ /* Windows implementation of XLFD is slightly broken for backward
+ compatibility with previous broken versions, so test for
+ wildcards as well as 0. */
+ || *s == '*'
+#endif
+ ;
+}
+
+
+/* Ignore font point size differences smaller than this value. */
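+/* XLFD point sizes are given in 1/10 pt, so a quantum of 5 amounts to
+   half a point. */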
+
+#define FONT_POINT_SIZE_QUANTUM 5
+
+/* Value is non-zero if FONT1 is a better match for font attributes
+ VALUES than FONT2. VALUES is an array of face attribute values in
+ font sort order. COMPARE_PT_P zero means don't compare point
+ sizes. AVGWIDTH, if not zero, is a specified font average width
+ to compare with. */
+
+static int
+better_font_p (values, font1, font2, compare_pt_p, avgwidth)
+ int *values;
+ struct font_name *font1, *font2;
+ int compare_pt_p, avgwidth;
+{
+ int i;
+
+ for (i = 0; i < DIM (font_sort_order); ++i)
+ {
+ int xlfd_idx = font_sort_order[i];
+
+ if (compare_pt_p || xlfd_idx != XLFD_POINT_SIZE)
+ {
+ int delta1 = abs (values[i] - font1->numeric[xlfd_idx]);
+ int delta2 = abs (values[i] - font2->numeric[xlfd_idx]);
+
+ if (xlfd_idx == XLFD_POINT_SIZE
+ && abs (delta1 - delta2) < FONT_POINT_SIZE_QUANTUM)
+ continue;
+ if (delta1 > delta2)
+ return 0;
+ else if (delta1 < delta2)
+ return 1;
+ else
+ {
+ /* The difference may be equal because, e.g., the face
+ specifies `italic' but we have only `regular' and
+ `oblique'. Prefer `oblique' in this case. */
+ if ((xlfd_idx == XLFD_WEIGHT || xlfd_idx == XLFD_SLANT)
+ && font1->numeric[xlfd_idx] > values[i]
+ && font2->numeric[xlfd_idx] < values[i])
+ return 1;
+ }
+ }
+ }
+
+ if (avgwidth)
+ {
+ int delta1 = abs (avgwidth - font1->numeric[XLFD_AVGWIDTH]);
+ int delta2 = abs (avgwidth - font2->numeric[XLFD_AVGWIDTH]);
+ if (delta1 > delta2)
+ return 0;
+ else if (delta1 < delta2)
+ return 1;
+ }
+
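+  /* All other attributes being equal, the font whose registry has the
+     smaller priority value is considered the better match. */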
+ return font1->registry_priority < font2->registry_priority;
+}
+
+
+/* Value is non-zero if FONT is an exact match for face attributes in
+ SPECIFIED. SPECIFIED is an array of face attribute values in font
+ sort order. AVGWIDTH, if non-zero, is an average width to compare
+ with. */
+
+static int
+exact_face_match_p (specified, font, avgwidth)
+ int *specified;
+ struct font_name *font;
+ int avgwidth;
+{
+ int i;
+
+ for (i = 0; i < DIM (font_sort_order); ++i)
+ if (specified[i] != font->numeric[font_sort_order[i]])
+ break;
+
+ return (i == DIM (font_sort_order)
+ && (avgwidth <= 0
+ || avgwidth == font->numeric[XLFD_AVGWIDTH]));
+}
+
+
+/* Value is the name of a scaled font, generated from scalable font
+ FONT on frame F. SPECIFIED_PT is the point-size to scale FONT to.
+   Value is allocated from the heap. */
+
+static char *
+build_scalable_font_name (f, font, specified_pt)
+ struct frame *f;
+ struct font_name *font;
+ int specified_pt;
+{
+ char point_size[20], pixel_size[20];
+ int pixel_value;
+ double resy = FRAME_X_DISPLAY_INFO (f)->resy;
+ double pt;
+
+ /* If scalable font is for a specific resolution, compute
+ the point size we must specify from the resolution of
+ the display and the specified resolution of the font. */
+ if (font->numeric[XLFD_RESY] != 0)
+ {
+ pt = resy / font->numeric[XLFD_RESY] * specified_pt + 0.5;
+ pixel_value = font->numeric[XLFD_RESY] / (PT_PER_INCH * 10.0) * pt;
+ }
+ else
+ {
+ pt = specified_pt;
+ pixel_value = resy / (PT_PER_INCH * 10.0) * pt;
+ }
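+
+  /* Illustrative example (assuming PT_PER_INCH is the usual 72.27): on a
+     100 dpi display with SPECIFIED_PT = 120, i.e. 12 pt in 1/10 pt units,
+     pixel_value comes out to about 100 / 722.7 * 120, i.e. 16 pixels. */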
+
+ /* Set point size of the font. */
+ sprintf (point_size, "%d", (int) pt);
+ font->fields[XLFD_POINT_SIZE] = point_size;
+ font->numeric[XLFD_POINT_SIZE] = pt;
+
+ /* Set pixel size. */
+ sprintf (pixel_size, "%d", pixel_value);
+ font->fields[XLFD_PIXEL_SIZE] = pixel_size;
+ font->numeric[XLFD_PIXEL_SIZE] = pixel_value;
+
+ /* If font doesn't specify its resolution, use the
+ resolution of the display. */
+ if (font->numeric[XLFD_RESY] == 0)
+ {
+ char buffer[20];
+ sprintf (buffer, "%d", (int) resy);
+ font->fields[XLFD_RESY] = buffer;
+ font->numeric[XLFD_RESY] = resy;
+ }
+
+ if (strcmp (font->fields[XLFD_RESX], "0") == 0)
+ {
+ char buffer[20];
+ int resx = FRAME_X_DISPLAY_INFO (f)->resx;
+ sprintf (buffer, "%d", resx);
+ font->fields[XLFD_RESX] = buffer;
+ font->numeric[XLFD_RESX] = resx;
+ }
+
+ return build_font_name (font);
+}
+
+
+/* Value is non-zero if we are allowed to use scalable font FONT. We
+ can't run a Lisp function here since this function may be called
+ with input blocked. */
+
+static int
+may_use_scalable_font_p (font)
+ char *font;
+{
+ if (EQ (Vscalable_fonts_allowed, Qt))
+ return 1;
+ else if (CONSP (Vscalable_fonts_allowed))
+ {
+ Lisp_Object tail, regexp;
+
+ for (tail = Vscalable_fonts_allowed; CONSP (tail); tail = XCDR (tail))
+ {
+ regexp = XCAR (tail);
+ if (STRINGP (regexp)
+ && fast_c_string_match_ignore_case (regexp, font) >= 0)
+ return 1;
+ }
+ }
+
+ return 0;
+}
+
+
+
+/* Return the name of the best matching font for face attributes ATTRS
+ in the array of font_name structures FONTS which contains NFONTS
+ elements. WIDTH_RATIO is a factor with which to multiply average
+ widths if ATTRS specifies such a width.
+
+ Value is a font name which is allocated from the heap. FONTS is
+ freed by this function. */
+
+static char *
+best_matching_font (f, attrs, fonts, nfonts, width_ratio)
+ struct frame *f;
+ Lisp_Object *attrs;
+ struct font_name *fonts;
+ int nfonts;
+ int width_ratio;
+{
+ char *font_name;
+ struct font_name *best;
+ int i, pt = 0;
+ int specified[5];
+ int exact_p, avgwidth;
+
+ if (nfonts == 0)
+ return NULL;
+
+ /* Make specified font attributes available in `specified',
+ indexed by sort order. */
+ for (i = 0; i < DIM (font_sort_order); ++i)
+ {
+ int xlfd_idx = font_sort_order[i];
+
+ if (xlfd_idx == XLFD_SWIDTH)
+ specified[i] = face_numeric_swidth (attrs[LFACE_SWIDTH_INDEX]);
+ else if (xlfd_idx == XLFD_POINT_SIZE)
+ specified[i] = pt = XFASTINT (attrs[LFACE_HEIGHT_INDEX]);
+ else if (xlfd_idx == XLFD_WEIGHT)
+ specified[i] = face_numeric_weight (attrs[LFACE_WEIGHT_INDEX]);
+ else if (xlfd_idx == XLFD_SLANT)
+ specified[i] = face_numeric_slant (attrs[LFACE_SLANT_INDEX]);
+ else
+ abort ();
+ }
+
+ avgwidth = (UNSPECIFIEDP (attrs[LFACE_AVGWIDTH_INDEX])
+ ? 0
+ : XFASTINT (attrs[LFACE_AVGWIDTH_INDEX]) * width_ratio);
+
+ exact_p = 0;
+
+ /* Start with the first non-scalable font in the list. */
+ for (i = 0; i < nfonts; ++i)
+ if (!font_scalable_p (fonts + i))
+ break;
+
+ /* Find the best match among the non-scalable fonts. */
+ if (i < nfonts)
+ {
+ best = fonts + i;
+
+ for (i = 1; i < nfonts; ++i)
+ if (!font_scalable_p (fonts + i)
+ && better_font_p (specified, fonts + i, best, 1, avgwidth))
+ {
+ best = fonts + i;
+
+ exact_p = exact_face_match_p (specified, best, avgwidth);
+ if (exact_p)
+ break;
+ }
+
+ }
+ else
+ best = NULL;
+
+ /* Unless we found an exact match among non-scalable fonts, see if
+ we can find a better match among scalable fonts. */
+ if (!exact_p)
+ {
+ /* A scalable font is better if
+
+	 1. its weight, slant, and swidth attributes are better, or
+
+	 2. the best non-scalable font doesn't have the required
+	    point size, and the scalable font's weight, slant, and
+	    swidth aren't worse. */
+
+ int non_scalable_has_exact_height_p;
+
+ if (best && best->numeric[XLFD_POINT_SIZE] == pt)
+ non_scalable_has_exact_height_p = 1;
+ else
+ non_scalable_has_exact_height_p = 0;
+
+ for (i = 0; i < nfonts; ++i)
+ if (font_scalable_p (fonts + i))
+ {
+ if (best == NULL
+ || better_font_p (specified, fonts + i, best, 0, 0)
+ || (!non_scalable_has_exact_height_p
+ && !better_font_p (specified, best, fonts + i, 0, 0)))
+ best = fonts + i;
+ }
+ }
+
+ if (font_scalable_p (best))
+ font_name = build_scalable_font_name (f, best, pt);
+ else
+ font_name = build_font_name (best);
+
+ /* Free font_name structures. */
+ free_font_names (fonts, nfonts);
+
+ return font_name;
+}
+
+
+/* Get a list of matching fonts on frame F, considering FAMILY
+   and alternative font families from Vface_alternative_font_family_alist.
+
+ FAMILY is the font family whose alternatives are considered.
+
+ REGISTRY, if a string, specifies a font registry and encoding to
+ match. A value of nil means include fonts of any registry and
+ encoding.
+
+ Return in *FONTS a pointer to a vector of font_name structures for
+ the fonts matched. Value is the number of fonts found. */
+
+static int
+try_alternative_families (f, family, registry, fonts)
+ struct frame *f;
+ Lisp_Object family, registry;
+ struct font_name **fonts;
+{
+ Lisp_Object alter;
+ int nfonts = 0;
+
+ nfonts = font_list (f, Qnil, family, registry, fonts);
+ if (nfonts == 0)
+ {
+ /* Try alternative font families. */
+ alter = Fassoc (family, Vface_alternative_font_family_alist);
+ if (CONSP (alter))
+ {
+ for (alter = XCDR (alter);
+ CONSP (alter) && nfonts == 0;
+ alter = XCDR (alter))
+ {
+ if (STRINGP (XCAR (alter)))
+ nfonts = font_list (f, Qnil, XCAR (alter), registry, fonts);
+ }
+ }
+
+ /* Try scalable fonts before giving up. */
+ if (nfonts == 0 && NILP (Vscalable_fonts_allowed))
+ {
+ int count = BINDING_STACK_SIZE ();
+ specbind (Qscalable_fonts_allowed, Qt);
+ nfonts = try_alternative_families (f, family, registry, fonts);
+ unbind_to (count, Qnil);
+ }
+ }
+ return nfonts;
+}
+
+
+/* Get a list of matching fonts on frame F.
+
+ FAMILY, if a string, specifies a font family derived from the fontset.
+ It is only used if the face does not specify any family in ATTRS or
+ if we cannot find any font of the face's family.
+
+ REGISTRY, if a string, specifies a font registry and encoding to
+ match. A value of nil means include fonts of any registry and
+ encoding.
+
+ Return in *FONTS a pointer to a vector of font_name structures for
+ the fonts matched. Value is the number of fonts found. */
+
+static int
+try_font_list (f, attrs, family, registry, fonts)
+ struct frame *f;
+ Lisp_Object *attrs;
+ Lisp_Object family, registry;
+ struct font_name **fonts;
+{
+ int nfonts = 0;
+ Lisp_Object face_family = attrs[LFACE_FAMILY_INDEX];
+
+ if (STRINGP (face_family))
+ nfonts = try_alternative_families (f, face_family, registry, fonts);
+
+ if (nfonts == 0 && !NILP (family))
+ nfonts = try_alternative_families (f, family, registry, fonts);
+
+ /* Try font family of the default face or "fixed". */
+ if (nfonts == 0)
+ {
+ struct face *default_face = FACE_FROM_ID (f, DEFAULT_FACE_ID);
+ if (default_face)
+ family = default_face->lface[LFACE_FAMILY_INDEX];
+ else
+ family = build_string ("fixed");
+ nfonts = font_list (f, Qnil, family, registry, fonts);
+ }
+
+ /* Try any family with the given registry. */
+ if (nfonts == 0)
+ nfonts = font_list (f, Qnil, Qnil, registry, fonts);
+
+ return nfonts;
+}
+
+
+/* Return the fontset id of the base fontset name or alias name given
+ by the fontset attribute of ATTRS. Value is -1 if the fontset
+ attribute of ATTRS doesn't name a fontset. */
+
+static int
+face_fontset (attrs)
+ Lisp_Object *attrs;
+{
+ Lisp_Object name;
+
+ name = attrs[LFACE_FONT_INDEX];
+ if (!STRINGP (name))
+ return -1;
+ return fs_query_fontset (name, 0);
+}
+
+
+/* Choose a name of font to use on frame F to display character C with
+ Lisp face attributes specified by ATTRS. The font name is
+ determined by the font-related attributes in ATTRS and the name
+ pattern for C in FONTSET. Value is the font name which is
+ allocated from the heap and must be freed by the caller, or NULL if
+   we can get no information about the font name of C.  It is guaranteed
+   that we always get some information for a single-byte
+   character. */
+
+static char *
+choose_face_font (f, attrs, fontset, c)
+ struct frame *f;
+ Lisp_Object *attrs;
+ int fontset, c;
+{
+ Lisp_Object pattern;
+ char *font_name = NULL;
+ struct font_name *fonts;
+ int nfonts, width_ratio;
+
+ /* Get (foundry and) family name and registry (and encoding) name of
+ a font for C. */
+ pattern = fontset_font_pattern (f, fontset, c);
+ if (NILP (pattern))
+ {
+ xassert (!SINGLE_BYTE_CHAR_P (c));
+ return NULL;
+ }
+
+ /* If what we got is a name pattern, return it. */
+ if (STRINGP (pattern))
+ return xstrdup (XSTRING (pattern)->data);
+
+ /* Get a list of fonts matching that pattern and choose the
+ best match for the specified face attributes from it. */
+ nfonts = try_font_list (f, attrs, XCAR (pattern), XCDR (pattern), &fonts);
+ width_ratio = (SINGLE_BYTE_CHAR_P (c)
+ ? 1
+ : CHARSET_WIDTH (CHAR_CHARSET (c)));
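+  /* CHARSET_WIDTH is the number of display columns occupied by
+     characters of C's charset, so the specified average width is
+     scaled up for double-width charsets. */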
+ font_name = best_matching_font (f, attrs, fonts, nfonts, width_ratio);
+ return font_name;
+}
+
+#endif /* HAVE_WINDOW_SYSTEM */
+
+
+
+/***********************************************************************
+ Face Realization
+ ***********************************************************************/
+
+/* Realize basic faces on frame F. Value is zero if frame parameters
+ of F don't contain enough information needed to realize the default
+ face. */
+
+static int
+realize_basic_faces (f)
+ struct frame *f;
+{
+ int success_p = 0;
+ int count = BINDING_STACK_SIZE ();
+
+ /* Block input here so that we won't be surprised by an X expose
+ event, for instance, without having the faces set up. */
+ BLOCK_INPUT;
+ specbind (Qscalable_fonts_allowed, Qt);
+
+ if (realize_default_face (f))
+ {
+ realize_named_face (f, Qmode_line, MODE_LINE_FACE_ID);
+ realize_named_face (f, Qtool_bar, TOOL_BAR_FACE_ID);
+ realize_named_face (f, Qfringe, BITMAP_AREA_FACE_ID);
+ realize_named_face (f, Qheader_line, HEADER_LINE_FACE_ID);
+ realize_named_face (f, Qscroll_bar, SCROLL_BAR_FACE_ID);
+ realize_named_face (f, Qborder, BORDER_FACE_ID);
+ realize_named_face (f, Qcursor, CURSOR_FACE_ID);
+ realize_named_face (f, Qmouse, MOUSE_FACE_ID);
+ realize_named_face (f, Qmenu, MENU_FACE_ID);
+
+ /* Reflect changes in the `menu' face in menu bars. */
+ if (FRAME_FACE_CACHE (f)->menu_face_changed_p)
+ {
+ FRAME_FACE_CACHE (f)->menu_face_changed_p = 0;
+#ifdef USE_X_TOOLKIT
+ x_update_menu_appearance (f);
+#endif
+ }
+
+ success_p = 1;
+ }
+
+ unbind_to (count, Qnil);
+ UNBLOCK_INPUT;
+ return success_p;
+}
+
+
+/* Realize the default face on frame F. If the face is not fully
+ specified, make it fully-specified. Attributes of the default face
+ that are not explicitly specified are taken from frame parameters. */
+
+static int
+realize_default_face (f)
+ struct frame *f;
+{
+ struct face_cache *c = FRAME_FACE_CACHE (f);
+ Lisp_Object lface;
+ Lisp_Object attrs[LFACE_VECTOR_SIZE];
+ Lisp_Object frame_font;
+ struct face *face;
+
+ /* If the `default' face is not yet known, create it. */
+ lface = lface_from_face_name (f, Qdefault, 0);
+ if (NILP (lface))
+ {
+ Lisp_Object frame;
+ XSETFRAME (frame, f);
+ lface = Finternal_make_lisp_face (Qdefault, frame);
+ }
+
+#ifdef HAVE_WINDOW_SYSTEM
+ if (FRAME_WINDOW_P (f))
+ {
+ /* Set frame_font to the value of the `font' frame parameter. */
+ frame_font = Fassq (Qfont, f->param_alist);
+ xassert (CONSP (frame_font) && STRINGP (XCDR (frame_font)));
+ frame_font = XCDR (frame_font);
+ set_lface_from_font_name (f, lface, frame_font, 1, 1);
+ }
+#endif /* HAVE_WINDOW_SYSTEM */
+
+ if (!FRAME_WINDOW_P (f))
+ {
+ LFACE_FAMILY (lface) = build_string ("default");
+ LFACE_SWIDTH (lface) = Qnormal;
+ LFACE_HEIGHT (lface) = make_number (1);
+ LFACE_WEIGHT (lface) = Qnormal;
+ LFACE_SLANT (lface) = Qnormal;
+ LFACE_AVGWIDTH (lface) = Qunspecified;
+ }
+
+ if (UNSPECIFIEDP (LFACE_UNDERLINE (lface)))
+ LFACE_UNDERLINE (lface) = Qnil;
+
+ if (UNSPECIFIEDP (LFACE_OVERLINE (lface)))
+ LFACE_OVERLINE (lface) = Qnil;
+
+ if (UNSPECIFIEDP (LFACE_STRIKE_THROUGH (lface)))
+ LFACE_STRIKE_THROUGH (lface) = Qnil;
+
+ if (UNSPECIFIEDP (LFACE_BOX (lface)))
+ LFACE_BOX (lface) = Qnil;
+
+ if (UNSPECIFIEDP (LFACE_INVERSE (lface)))
+ LFACE_INVERSE (lface) = Qnil;
+
+ if (UNSPECIFIEDP (LFACE_FOREGROUND (lface)))
+ {
+ /* This function is called so early that colors are not yet
+ set in the frame parameter list. */
+ Lisp_Object color = Fassq (Qforeground_color, f->param_alist);
+
+ if (CONSP (color) && STRINGP (XCDR (color)))
+ LFACE_FOREGROUND (lface) = XCDR (color);
+ else if (FRAME_WINDOW_P (f))
+ return 0;
+ else if (FRAME_TERMCAP_P (f) || FRAME_MSDOS_P (f))
+ LFACE_FOREGROUND (lface) = build_string (unspecified_fg);
+ else
+ abort ();
+ }
+
+ if (UNSPECIFIEDP (LFACE_BACKGROUND (lface)))
+ {
+ /* This function is called so early that colors are not yet
+ set in the frame parameter list. */
+ Lisp_Object color = Fassq (Qbackground_color, f->param_alist);
+ if (CONSP (color) && STRINGP (XCDR (color)))
+ LFACE_BACKGROUND (lface) = XCDR (color);
+ else if (FRAME_WINDOW_P (f))
+ return 0;
+ else if (FRAME_TERMCAP_P (f) || FRAME_MSDOS_P (f))
+ LFACE_BACKGROUND (lface) = build_string (unspecified_bg);
+ else
+ abort ();
+ }
+
+ if (UNSPECIFIEDP (LFACE_STIPPLE (lface)))
+ LFACE_STIPPLE (lface) = Qnil;
+
+ /* Realize the face; it must be fully-specified now. */
+ xassert (lface_fully_specified_p (XVECTOR (lface)->contents));
+ check_lface (lface);
+ bcopy (XVECTOR (lface)->contents, attrs, sizeof attrs);
+ face = realize_face (c, attrs, 0, NULL, DEFAULT_FACE_ID);
+ return 1;
+}
+
+
+/* Realize basic faces other than the default face in face cache C.
+ SYMBOL is the face name, ID is the face id the realized face must
+ have. The default face must have been realized already. */
+
+static void
+realize_named_face (f, symbol, id)
+ struct frame *f;
+ Lisp_Object symbol;
+ int id;
+{
+ struct face_cache *c = FRAME_FACE_CACHE (f);
+ Lisp_Object lface = lface_from_face_name (f, symbol, 0);
+ Lisp_Object attrs[LFACE_VECTOR_SIZE];
+ Lisp_Object symbol_attrs[LFACE_VECTOR_SIZE];
+ struct face *new_face;
+
+ /* The default face must exist and be fully specified. */
+ get_lface_attributes (f, Qdefault, attrs, 1);
+ check_lface_attrs (attrs);
+ xassert (lface_fully_specified_p (attrs));
+
+  /* If SYMBOL isn't known as a face, create it. */
+ if (NILP (lface))
+ {
+ Lisp_Object frame;
+ XSETFRAME (frame, f);
+ lface = Finternal_make_lisp_face (symbol, frame);
+ }
+
+ /* Merge SYMBOL's face with the default face. */
+ get_lface_attributes (f, symbol, symbol_attrs, 1);
+ merge_face_vectors (f, symbol_attrs, attrs, Qnil);
+
+ /* Realize the face. */
+ new_face = realize_face (c, attrs, 0, NULL, id);
+}
+
+
+/* Realize the fully-specified face with attributes ATTRS in face
+ cache CACHE for character C. If C is a multibyte character,
+ BASE_FACE is a face that has the same attributes. Otherwise,
+ BASE_FACE is ignored. If FORMER_FACE_ID is non-negative, it is an
+ ID of face to remove before caching the new face. Value is a
+ pointer to the newly created realized face. */
+
+static struct face *
+realize_face (cache, attrs, c, base_face, former_face_id)
+ struct face_cache *cache;
+ Lisp_Object *attrs;
+ int c;
+ struct face *base_face;
+ int former_face_id;
+{
+ struct face *face;
+
+ /* LFACE must be fully specified. */
+ xassert (cache != NULL);
+ check_lface_attrs (attrs);
+
+ if (former_face_id >= 0 && cache->used > former_face_id)
+ {
+ /* Remove the former face. */
+ struct face *former_face = cache->faces_by_id[former_face_id];
+ uncache_face (cache, former_face);
+ free_realized_face (cache->f, former_face);
+ }
+
+ if (FRAME_WINDOW_P (cache->f))
+ face = realize_x_face (cache, attrs, c, base_face);
+ else if (FRAME_TERMCAP_P (cache->f) || FRAME_MSDOS_P (cache->f))
+ face = realize_tty_face (cache, attrs, c);
+ else
+ abort ();
+
+ /* Insert the new face. */
+ cache_face (cache, face, lface_hash (attrs));
+#ifdef HAVE_WINDOW_SYSTEM
+ if (FRAME_WINDOW_P (cache->f) && face->font == NULL)
+ load_face_font (cache->f, face, c);
+#endif /* HAVE_WINDOW_SYSTEM */
+ return face;
+}
+
+
+/* Realize the fully-specified face with attributes ATTRS in face
+ cache CACHE for character C. Do it for X frame CACHE->f. If C is
+ a multibyte character, BASE_FACE is a face that has the same
+ attributes. Otherwise, BASE_FACE is ignored. If the new face
+ doesn't share font with the default face, a fontname is allocated
+ from the heap and set in `font_name' of the new face, but it is not
+ yet loaded here. Value is a pointer to the newly created realized
+ face. */
+
+static struct face *
+realize_x_face (cache, attrs, c, base_face)
+ struct face_cache *cache;
+ Lisp_Object *attrs;
+ int c;
+ struct face *base_face;
+{
+#ifdef HAVE_WINDOW_SYSTEM
+ struct face *face, *default_face;
+ struct frame *f;
+ Lisp_Object stipple, overline, strike_through, box;
+
+ xassert (FRAME_WINDOW_P (cache->f));
+ xassert (SINGLE_BYTE_CHAR_P (c)
+ || base_face);
+
+ /* Allocate a new realized face. */
+ face = make_realized_face (attrs);
+
+ f = cache->f;
+
+  /* If C is a multibyte character, we share all face attributes with
+ BASE_FACE including the realized fontset. But, we must load a
+ different font. */
+ if (!SINGLE_BYTE_CHAR_P (c))
+ {
+ bcopy (base_face, face, sizeof *face);
+ face->gc = 0;
+
+ /* Don't try to free the colors copied bitwise from BASE_FACE. */
+ face->colors_copied_bitwise_p = 1;
+
+ /* to force realize_face to load font */
+ face->font = NULL;
+ return face;
+ }
+
+ /* Now we are realizing a face for ASCII (and unibyte) characters. */
+
+ /* Determine the font to use. Most of the time, the font will be
+ the same as the font of the default face, so try that first. */
+ default_face = FACE_FROM_ID (f, DEFAULT_FACE_ID);
+ if (default_face
+ && FACE_SUITABLE_FOR_CHAR_P (default_face, c)
+ && lface_same_font_attributes_p (default_face->lface, attrs))
+ {
+ face->font = default_face->font;
+ face->fontset = default_face->fontset;
+ face->font_info_id = default_face->font_info_id;
+ face->font_name = default_face->font_name;
+ face->ascii_face = face;
+
+ /* But, as we can't share the fontset, make a new realized
+	 fontset that has the same base fontset as that of the default
+	 face. */
+ face->fontset
+ = make_fontset_for_ascii_face (f, default_face->fontset);
+ }
+ else
+ {
+ /* If the face attribute ATTRS specifies a fontset, use it as
+ the base of a new realized fontset. Otherwise, use the same
+	 base fontset as that of the default face.  The base determines
+ registry and encoding of a font. It may also determine
+ foundry and family. The other fields of font name pattern
+ are constructed from ATTRS. */
+ int fontset = face_fontset (attrs);
+
+ if ((fontset == -1) && default_face)
+ fontset = default_face->fontset;
+ face->fontset = make_fontset_for_ascii_face (f, fontset);
+ face->font = NULL; /* to force realize_face to load font */
+
+#ifdef macintosh
+ /* Load the font if it is specified in ATTRS. This fixes
+ changing frame font on the Mac. */
+ if (STRINGP (attrs[LFACE_FONT_INDEX]))
+ {
+ struct font_info *font_info =
+ FS_LOAD_FONT (f, 0, XSTRING (attrs[LFACE_FONT_INDEX])->data, -1);
+ if (font_info)
+ face->font = font_info->font;
+ }
+#endif
+ }
+
+ /* Load colors, and set remaining attributes. */
+
+ load_face_colors (f, face, attrs);
+
+ /* Set up box. */
+ box = attrs[LFACE_BOX_INDEX];
+ if (STRINGP (box))
+ {
+ /* A simple box of line width 1 drawn in color given by
+ the string. */
+ face->box_color = load_color (f, face, attrs[LFACE_BOX_INDEX],
+ LFACE_BOX_INDEX);
+ face->box = FACE_SIMPLE_BOX;
+ face->box_line_width = 1;
+ }
+ else if (INTEGERP (box))
+ {
+ /* Simple box of specified line width in foreground color of the
+ face. */
+ xassert (XINT (box) != 0);
+ face->box = FACE_SIMPLE_BOX;
+ face->box_line_width = XINT (box);
+ face->box_color = face->foreground;
+ face->box_color_defaulted_p = 1;
+ }
+ else if (CONSP (box))
+ {
+      /* `(:line-width WIDTH :color COLOR :style STYLE)'.  STYLE
+	 being one of `released-button' or `pressed-button'. */
+ face->box = FACE_SIMPLE_BOX;
+ face->box_color = face->foreground;
+ face->box_color_defaulted_p = 1;
+ face->box_line_width = 1;
+
+ while (CONSP (box))
+ {
+ Lisp_Object keyword, value;
+
+ keyword = XCAR (box);
+ box = XCDR (box);
+
+ if (!CONSP (box))
+ break;
+ value = XCAR (box);
+ box = XCDR (box);
+
+ if (EQ (keyword, QCline_width))
+ {
+ if (INTEGERP (value) && XINT (value) != 0)
+ face->box_line_width = XINT (value);
+ }
+ else if (EQ (keyword, QCcolor))
+ {
+ if (STRINGP (value))
+ {
+ face->box_color = load_color (f, face, value,
+ LFACE_BOX_INDEX);
+ face->use_box_color_for_shadows_p = 1;
+ }
+ }
+ else if (EQ (keyword, QCstyle))
+ {
+ if (EQ (value, Qreleased_button))
+ face->box = FACE_RAISED_BOX;
+ else if (EQ (value, Qpressed_button))
+ face->box = FACE_SUNKEN_BOX;
+ }
+ }
+ }
+
+ /* Text underline, overline, strike-through. */
+
+ if (EQ (attrs[LFACE_UNDERLINE_INDEX], Qt))
+ {
+ /* Use default color (same as foreground color). */
+ face->underline_p = 1;
+ face->underline_defaulted_p = 1;
+ face->underline_color = 0;
+ }
+ else if (STRINGP (attrs[LFACE_UNDERLINE_INDEX]))
+ {
+ /* Use specified color. */
+ face->underline_p = 1;
+ face->underline_defaulted_p = 0;
+ face->underline_color
+ = load_color (f, face, attrs[LFACE_UNDERLINE_INDEX],
+ LFACE_UNDERLINE_INDEX);
+ }
+ else if (NILP (attrs[LFACE_UNDERLINE_INDEX]))
+ {
+ face->underline_p = 0;
+ face->underline_defaulted_p = 0;
+ face->underline_color = 0;
+ }
+
+ overline = attrs[LFACE_OVERLINE_INDEX];
+ if (STRINGP (overline))
+ {
+ face->overline_color
+ = load_color (f, face, attrs[LFACE_OVERLINE_INDEX],
+ LFACE_OVERLINE_INDEX);
+ face->overline_p = 1;
+ }
+ else if (EQ (overline, Qt))
+ {
+ face->overline_color = face->foreground;
+ face->overline_color_defaulted_p = 1;
+ face->overline_p = 1;
+ }
+
+ strike_through = attrs[LFACE_STRIKE_THROUGH_INDEX];
+ if (STRINGP (strike_through))
+ {
+ face->strike_through_color
+ = load_color (f, face, attrs[LFACE_STRIKE_THROUGH_INDEX],
+ LFACE_STRIKE_THROUGH_INDEX);
+ face->strike_through_p = 1;
+ }
+ else if (EQ (strike_through, Qt))
+ {
+ face->strike_through_color = face->foreground;
+ face->strike_through_color_defaulted_p = 1;
+ face->strike_through_p = 1;
+ }
+
+ stipple = attrs[LFACE_STIPPLE_INDEX];
+ if (!NILP (stipple))
+ face->stipple = load_pixmap (f, stipple, &face->pixmap_w, &face->pixmap_h);
+
+ xassert (FACE_SUITABLE_FOR_CHAR_P (face, c));
+ return face;
+#endif /* HAVE_WINDOW_SYSTEM */
+}
+
+
+/* Map a specified color of face FACE on frame F to a tty color index.
+ IDX is either LFACE_FOREGROUND_INDEX or LFACE_BACKGROUND_INDEX, and
+ specifies which color to map. Set *DEFAULTED to 1 if mapping to the
+ default foreground/background colors. */
+
+static void
+map_tty_color (f, face, idx, defaulted)
+ struct frame *f;
+ struct face *face;
+ enum lface_attribute_index idx;
+ int *defaulted;
+{
+ Lisp_Object frame, color, def;
+ int foreground_p = idx == LFACE_FOREGROUND_INDEX;
+ unsigned long default_pixel, default_other_pixel, pixel;
+
+ xassert (idx == LFACE_FOREGROUND_INDEX || idx == LFACE_BACKGROUND_INDEX);
+
+ if (foreground_p)
+ {
+ pixel = default_pixel = FACE_TTY_DEFAULT_FG_COLOR;
+ default_other_pixel = FACE_TTY_DEFAULT_BG_COLOR;
+ }
+ else
+ {
+ pixel = default_pixel = FACE_TTY_DEFAULT_BG_COLOR;
+ default_other_pixel = FACE_TTY_DEFAULT_FG_COLOR;
+ }
+
+ XSETFRAME (frame, f);
+ color = face->lface[idx];
+
+ if (STRINGP (color)
+ && XSTRING (color)->size
+ && CONSP (Vtty_defined_color_alist)
+ && (def = assq_no_quit (color, call1 (Qtty_color_alist, frame)),
+ CONSP (def)))
+ {
+ /* Associations in tty-defined-color-alist are of the form
+ (NAME INDEX R G B). We need the INDEX part. */
+ pixel = XINT (XCAR (XCDR (def)));
+ }
+
+ if (pixel == default_pixel && STRINGP (color))
+ {
+ pixel = load_color (f, face, color, idx);
+
+#if defined (MSDOS) || defined (WINDOWSNT)
+ /* If the foreground of the default face is the default color,
+ use the foreground color defined by the frame. */
+#ifdef MSDOS
+ if (FRAME_MSDOS_P (f))
+ {
+#endif /* MSDOS */
+ if (pixel == default_pixel
+ || pixel == FACE_TTY_DEFAULT_COLOR)
+ {
+ if (foreground_p)
+ pixel = FRAME_FOREGROUND_PIXEL (f);
+ else
+ pixel = FRAME_BACKGROUND_PIXEL (f);
+ face->lface[idx] = tty_color_name (f, pixel);
+ *defaulted = 1;
+ }
+ else if (pixel == default_other_pixel)
+ {
+ if (foreground_p)
+ pixel = FRAME_BACKGROUND_PIXEL (f);
+ else
+ pixel = FRAME_FOREGROUND_PIXEL (f);
+ face->lface[idx] = tty_color_name (f, pixel);
+ *defaulted = 1;
+ }
+#ifdef MSDOS
+ }
+#endif
+#endif /* MSDOS or WINDOWSNT */
+ }
+
+ if (foreground_p)
+ face->foreground = pixel;
+ else
+ face->background = pixel;
+}
+
+
+/* Realize the fully-specified face with attributes ATTRS in face
+ cache CACHE for character C. Do it for TTY frame CACHE->f. Value is a
+ pointer to the newly created realized face. */
+
+static struct face *
+realize_tty_face (cache, attrs, c)
+ struct face_cache *cache;
+ Lisp_Object *attrs;
+ int c;
+{
+ struct face *face;
+ int weight, slant;
+ int face_colors_defaulted = 0;
+ struct frame *f = cache->f;
+
+ /* Frame must be a termcap frame. */
+ xassert (FRAME_TERMCAP_P (cache->f) || FRAME_MSDOS_P (cache->f));
+
+ /* Allocate a new realized face. */
+ face = make_realized_face (attrs);
+ face->font_name = FRAME_MSDOS_P (cache->f) ? "ms-dos" : "tty";
+
+ /* Map face attributes to TTY appearances. We map slant to
+ dimmed text because we want italic text to appear differently
+ and because dimmed text is probably used infrequently. */
+ weight = face_numeric_weight (attrs[LFACE_WEIGHT_INDEX]);
+ slant = face_numeric_slant (attrs[LFACE_SLANT_INDEX]);
+
+ if (weight > XLFD_WEIGHT_MEDIUM)
+ face->tty_bold_p = 1;
+ if (weight < XLFD_WEIGHT_MEDIUM || slant != XLFD_SLANT_ROMAN)
+ face->tty_dim_p = 1;
+ if (!NILP (attrs[LFACE_UNDERLINE_INDEX]))
+ face->tty_underline_p = 1;
+ if (!NILP (attrs[LFACE_INVERSE_INDEX]))
+ face->tty_reverse_p = 1;
+
+ /* Map color names to color indices. */
+ map_tty_color (f, face, LFACE_FOREGROUND_INDEX, &face_colors_defaulted);
+ map_tty_color (f, face, LFACE_BACKGROUND_INDEX, &face_colors_defaulted);
+
+ /* Swap colors if face is inverse-video. If the colors are taken
+ from the frame colors, they are already inverted, since the
+ frame-creation function calls x-handle-reverse-video. */
+ if (face->tty_reverse_p && !face_colors_defaulted)
+ {
+ unsigned long tem = face->foreground;
+ face->foreground = face->background;
+ face->background = tem;
+ }
+
+ if (tty_suppress_bold_inverse_default_colors_p
+ && face->tty_bold_p
+ && face->background == FACE_TTY_DEFAULT_FG_COLOR
+ && face->foreground == FACE_TTY_DEFAULT_BG_COLOR)
+ face->tty_bold_p = 0;
+
+ return face;
+}
+
+
+DEFUN ("tty-suppress-bold-inverse-default-colors",
+ Ftty_suppress_bold_inverse_default_colors,
+ Stty_suppress_bold_inverse_default_colors, 1, 1, 0,
+ "Suppress/allow boldness of faces with inverse default colors.\n\
+SUPPRESS non-nil means suppress it.\n\
+This affects bold faces on TTYs whose foreground is the default background\n\
+color of the display and whose background is the default foreground color.\n\
+For such faces, the bold face attribute is ignored if this variable\n\
+is non-nil.")
+ (suppress)
+ Lisp_Object suppress;
+{
+ tty_suppress_bold_inverse_default_colors_p = !NILP (suppress);
+ ++face_change_count;
+ return suppress;
+}
+
+
+
+/***********************************************************************
+ Computing Faces
+ ***********************************************************************/
+
+/* Return the ID of the face to use to display character CH with face
+ property PROP on frame F in current_buffer. */
+
+int
+compute_char_face (f, ch, prop)
+ struct frame *f;
+ int ch;
+ Lisp_Object prop;
+{
+ int face_id;
+
+ if (NILP (current_buffer->enable_multibyte_characters))
+ ch = 0;
+
+ if (NILP (prop))
+ {
+ struct face *face = FACE_FROM_ID (f, DEFAULT_FACE_ID);
+ face_id = FACE_FOR_CHAR (f, face, ch);
+ }
+ else
+ {
+ Lisp_Object attrs[LFACE_VECTOR_SIZE];
+ struct face *default_face = FACE_FROM_ID (f, DEFAULT_FACE_ID);
+ bcopy (default_face->lface, attrs, sizeof attrs);
+ merge_face_vector_with_property (f, attrs, prop);
+ face_id = lookup_face (f, attrs, ch, NULL);
+ }
+
+ return face_id;
+}
+
+
+/* Return the face ID associated with buffer position POS for
+ displaying ASCII characters. Return in *ENDPTR the position at
+ which a different face is needed, as far as text properties and
+ overlays are concerned. W is a window displaying current_buffer.
+
+ REGION_BEG, REGION_END delimit the region, so it can be
+ highlighted.
+
+ LIMIT is a position not to scan beyond. That is to limit the time
+ this function can take.
+
+ If MOUSE is non-zero, use the character's mouse-face, not its face.
+
+ The face returned is suitable for displaying ASCII characters. */
+
+int
+face_at_buffer_position (w, pos, region_beg, region_end,
+ endptr, limit, mouse)
+ struct window *w;
+ int pos;
+ int region_beg, region_end;
+ int *endptr;
+ int limit;
+ int mouse;
+{
+ struct frame *f = XFRAME (w->frame);
+ Lisp_Object attrs[LFACE_VECTOR_SIZE];
+ Lisp_Object prop, position;
+ int i, noverlays;
+ Lisp_Object *overlay_vec;
+ Lisp_Object frame;
+ int endpos;
+ Lisp_Object propname = mouse ? Qmouse_face : Qface;
+ Lisp_Object limit1, end;
+ struct face *default_face;
+
+ /* W must display the current buffer. We could write this function
+ to use the frame and buffer of W, but right now it doesn't. */
+ /* xassert (XBUFFER (w->buffer) == current_buffer); */
+
+ XSETFRAME (frame, f);
+ XSETFASTINT (position, pos);
+
+ endpos = ZV;
+ if (pos < region_beg && region_beg < endpos)
+ endpos = region_beg;
+
+ /* Get the `face' or `mouse_face' text property at POS, and
+ determine the next position at which the property changes. */
+ prop = Fget_text_property (position, propname, w->buffer);
+ XSETFASTINT (limit1, (limit < endpos ? limit : endpos));
+ end = Fnext_single_property_change (position, propname, w->buffer, limit1);
+ if (INTEGERP (end))
+ endpos = XINT (end);
+
+ /* Look at properties from overlays. */
+ {
+ int next_overlay;
+ int len;
+
+ /* First try with room for 40 overlays. */
+ len = 40;
+ overlay_vec = (Lisp_Object *) alloca (len * sizeof (Lisp_Object));
+ noverlays = overlays_at (pos, 0, &overlay_vec, &len,
+ &next_overlay, NULL, 0);
+
+ /* If there are more than 40, make enough space for all, and try
+ again. */
+ if (noverlays > len)
+ {
+ len = noverlays;
+ overlay_vec = (Lisp_Object *) alloca (len * sizeof (Lisp_Object));
+ noverlays = overlays_at (pos, 0, &overlay_vec, &len,
+ &next_overlay, NULL, 0);
+ }
+
+ if (next_overlay < endpos)
+ endpos = next_overlay;
+ }
+
+ *endptr = endpos;
+
+ default_face = FACE_FROM_ID (f, DEFAULT_FACE_ID);
+
+ /* Optimize common cases where we can use the default face. */
+ if (noverlays == 0
+ && NILP (prop)
+ && !(pos >= region_beg && pos < region_end))
+ return DEFAULT_FACE_ID;
+
+ /* Begin with attributes from the default face. */
+ bcopy (default_face->lface, attrs, sizeof attrs);
+
+ /* Merge in attributes specified via text properties. */
+ if (!NILP (prop))
+ merge_face_vector_with_property (f, attrs, prop);
+
+ /* Now merge the overlay data. */
+ noverlays = sort_overlays (overlay_vec, noverlays, w);
+ for (i = 0; i < noverlays; i++)
+ {
+ Lisp_Object oend;
+ int oendpos;
+
+ prop = Foverlay_get (overlay_vec[i], propname);
+ if (!NILP (prop))
+ merge_face_vector_with_property (f, attrs, prop);
+
+ oend = OVERLAY_END (overlay_vec[i]);
+ oendpos = OVERLAY_POSITION (oend);
+ if (oendpos < endpos)
+ endpos = oendpos;
+ }
+
+ /* If in the region, merge in the region face. */
+ if (pos >= region_beg && pos < region_end)
+ {
+ Lisp_Object region_face = lface_from_face_name (f, Qregion, 0);
+ merge_face_vectors (f, XVECTOR (region_face)->contents, attrs, Qnil);
+
+ if (region_end < endpos)
+ endpos = region_end;
+ }
+
+ *endptr = endpos;
+
+ /* Look up a realized face with the given face attributes,
+ or realize a new one for ASCII characters. */
+ return lookup_face (f, attrs, 0, NULL);
+}
+
+
+/* Compute the face at character position POS in Lisp string STRING on
+ window W, for ASCII characters.
+
+ If STRING is an overlay string, it comes from position BUFPOS in
+ current_buffer, otherwise BUFPOS is zero to indicate that STRING is
+ not an overlay string. W must display the current buffer.
+ REGION_BEG and REGION_END give the start and end positions of the
+ region; both are -1 if no region is visible.
+
+ BASE_FACE_ID is the id of a face to merge with. For strings coming
+ from overlays or the `display' property it is the face at BUFPOS.
+
+ If MOUSE_P is non-zero, use the character's mouse-face, not its face.
+
+ Set *ENDPTR to the next position where to check for faces in
+ STRING; -1 if the face is constant from POS to the end of the
+ string.
+
+ Value is the id of the face to use. The face returned is suitable
+ for displaying ASCII characters. */
+
+int
+face_at_string_position (w, string, pos, bufpos, region_beg,
+ region_end, endptr, base_face_id, mouse_p)
+ struct window *w;
+ Lisp_Object string;
+ int pos, bufpos;
+ int region_beg, region_end;
+ int *endptr;
+ enum face_id base_face_id;
+ int mouse_p;
+{
+ Lisp_Object prop, position, end, limit;
+ struct frame *f = XFRAME (WINDOW_FRAME (w));
+ Lisp_Object attrs[LFACE_VECTOR_SIZE];
+ struct face *base_face;
+ int multibyte_p = STRING_MULTIBYTE (string);
+ Lisp_Object prop_name = mouse_p ? Qmouse_face : Qface;
+
+ /* Get the value of the face property at the current position within
+ STRING. Value is nil if there is no face property. */
+ XSETFASTINT (position, pos);
+ prop = Fget_text_property (position, prop_name, string);
+
+ /* Get the next position at which to check for faces. Value of end
+ is nil if face is constant all the way to the end of the string.
+ Otherwise it is a string position where to check faces next.
+ Limit is the maximum position up to which to check for property
+ changes in Fnext_single_property_change. Strings are usually
+ short, so set the limit to the end of the string. */
+ XSETFASTINT (limit, XSTRING (string)->size);
+ end = Fnext_single_property_change (position, prop_name, string, limit);
+ if (INTEGERP (end))
+ *endptr = XFASTINT (end);
+ else
+ *endptr = -1;
+
+ base_face = FACE_FROM_ID (f, base_face_id);
+ xassert (base_face);
+
+ /* Optimize the default case that there is no face property and we
+ are not in the region. */
+ if (NILP (prop)
+ && (base_face_id != DEFAULT_FACE_ID
+ /* BUFPOS <= 0 means STRING is not an overlay string, so
+ that the region doesn't have to be taken into account. */
+ || bufpos <= 0
+ || bufpos < region_beg
+ || bufpos >= region_end)
+ && (multibyte_p
+ /* We can't realize faces for different charsets differently
+ if we don't have fonts, so we can stop here if not working
+ on a window-system frame. */
+ || !FRAME_WINDOW_P (f)
+ || FACE_SUITABLE_FOR_CHAR_P (base_face, 0)))
+ return base_face->id;
+
+ /* Begin with attributes from the base face. */
+ bcopy (base_face->lface, attrs, sizeof attrs);
+
+ /* Merge in attributes specified via text properties. */
+ if (!NILP (prop))
+ merge_face_vector_with_property (f, attrs, prop);
+
+ /* If in the region, merge in the region face. */
+ if (bufpos
+ && bufpos >= region_beg
+ && bufpos < region_end)
+ {
+ Lisp_Object region_face = lface_from_face_name (f, Qregion, 0);
+ merge_face_vectors (f, XVECTOR (region_face)->contents, attrs, Qnil);
+ }
+
+ /* Look up a realized face with the given face attributes,
+ or realize a new one for ASCII characters. */
+ return lookup_face (f, attrs, 0, NULL);
+}
+
+
+
+/***********************************************************************
+ Tests
+ ***********************************************************************/
+
+#if GLYPH_DEBUG
+
+/* Print the contents of the realized face FACE to stderr. */
+
+static void
+dump_realized_face (face)
+ struct face *face;
+{
+ fprintf (stderr, "ID: %d\n", face->id);
+#ifdef HAVE_X_WINDOWS
+ fprintf (stderr, "gc: %d\n", (int) face->gc);
+#endif
+ fprintf (stderr, "foreground: 0x%lx (%s)\n",
+ face->foreground,
+ XSTRING (face->lface[LFACE_FOREGROUND_INDEX])->data);
+ fprintf (stderr, "background: 0x%lx (%s)\n",
+ face->background,
+ XSTRING (face->lface[LFACE_BACKGROUND_INDEX])->data);
+ fprintf (stderr, "font_name: %s (%s)\n",
+ face->font_name,
+ XSTRING (face->lface[LFACE_FAMILY_INDEX])->data);
+#ifdef HAVE_X_WINDOWS
+ fprintf (stderr, "font = %p\n", face->font);
+#endif
+ fprintf (stderr, "font_info_id = %d\n", face->font_info_id);
+ fprintf (stderr, "fontset: %d\n", face->fontset);
+ fprintf (stderr, "underline: %d (%s)\n",
+ face->underline_p,
+ XSTRING (Fsymbol_name (face->lface[LFACE_UNDERLINE_INDEX]))->data);
+ fprintf (stderr, "hash: %d\n", face->hash);
+ fprintf (stderr, "charset: %d\n", face->charset);
+}
+
+
+DEFUN ("dump-face", Fdump_face, Sdump_face, 0, 1, 0, "")
+ (n)
+ Lisp_Object n;
+{
+ if (NILP (n))
+ {
+ int i;
+
+ fprintf (stderr, "font selection order: ");
+ for (i = 0; i < DIM (font_sort_order); ++i)
+ fprintf (stderr, "%d ", font_sort_order[i]);
+ fprintf (stderr, "\n");
+
+ fprintf (stderr, "alternative fonts: ");
+ debug_print (Vface_alternative_font_family_alist);
+ fprintf (stderr, "\n");
+
+ for (i = 0; i < FRAME_FACE_CACHE (SELECTED_FRAME ())->used; ++i)
+ Fdump_face (make_number (i));
+ }
+ else
+ {
+ struct face *face;
+ CHECK_NUMBER (n, 0);
+ face = FACE_FROM_ID (SELECTED_FRAME (), XINT (n));
+ if (face == NULL)
+ error ("Not a valid face");
+ dump_realized_face (face);
+ }
+
+ return Qnil;
+}
+
+
+DEFUN ("show-face-resources", Fshow_face_resources, Sshow_face_resources,
+ 0, 0, 0, "")
+ ()
+{
+ fprintf (stderr, "number of colors = %d\n", ncolors_allocated);
+ fprintf (stderr, "number of pixmaps = %d\n", npixmaps_allocated);
+ fprintf (stderr, "number of GCs = %d\n", ngcs);
+ return Qnil;
+}
+
+#endif /* GLYPH_DEBUG != 0 */
+
+
+
+/***********************************************************************
+ Initialization
+ ***********************************************************************/
+
+void
+syms_of_xfaces ()
+{
+ Qface = intern ("face");
+ staticpro (&Qface);
+ Qbitmap_spec_p = intern ("bitmap-spec-p");
+ staticpro (&Qbitmap_spec_p);
+ Qframe_update_face_colors = intern ("frame-update-face-colors");
+ staticpro (&Qframe_update_face_colors);
+
+ /* Lisp face attribute keywords. */
+ QCfamily = intern (":family");
+ staticpro (&QCfamily);
+ QCheight = intern (":height");
+ staticpro (&QCheight);
+ QCweight = intern (":weight");
+ staticpro (&QCweight);
+ QCslant = intern (":slant");
+ staticpro (&QCslant);
+ QCunderline = intern (":underline");
+ staticpro (&QCunderline);
+ QCinverse_video = intern (":inverse-video");
+ staticpro (&QCinverse_video);
+ QCreverse_video = intern (":reverse-video");
+ staticpro (&QCreverse_video);
+ QCforeground = intern (":foreground");
+ staticpro (&QCforeground);
+ QCbackground = intern (":background");
+ staticpro (&QCbackground);
+  QCstipple = intern (":stipple");
+ staticpro (&QCstipple);
+ QCwidth = intern (":width");
+ staticpro (&QCwidth);
+ QCfont = intern (":font");
+ staticpro (&QCfont);
+ QCbold = intern (":bold");
+ staticpro (&QCbold);
+ QCitalic = intern (":italic");
+ staticpro (&QCitalic);
+ QCoverline = intern (":overline");
+ staticpro (&QCoverline);
+ QCstrike_through = intern (":strike-through");
+ staticpro (&QCstrike_through);
+ QCbox = intern (":box");
+ staticpro (&QCbox);
+ QCinherit = intern (":inherit");
+ staticpro (&QCinherit);
+
+ /* Symbols used for Lisp face attribute values. */
+ QCcolor = intern (":color");
+ staticpro (&QCcolor);
+ QCline_width = intern (":line-width");
+ staticpro (&QCline_width);
+ QCstyle = intern (":style");
+ staticpro (&QCstyle);
+ Qreleased_button = intern ("released-button");
+ staticpro (&Qreleased_button);
+ Qpressed_button = intern ("pressed-button");
+ staticpro (&Qpressed_button);
+ Qnormal = intern ("normal");
+ staticpro (&Qnormal);
+ Qultra_light = intern ("ultra-light");
+ staticpro (&Qultra_light);
+ Qextra_light = intern ("extra-light");
+ staticpro (&Qextra_light);
+ Qlight = intern ("light");
+ staticpro (&Qlight);
+ Qsemi_light = intern ("semi-light");
+ staticpro (&Qsemi_light);
+ Qsemi_bold = intern ("semi-bold");
+ staticpro (&Qsemi_bold);
+ Qbold = intern ("bold");
+ staticpro (&Qbold);
+ Qextra_bold = intern ("extra-bold");
+ staticpro (&Qextra_bold);
+ Qultra_bold = intern ("ultra-bold");
+ staticpro (&Qultra_bold);
+ Qoblique = intern ("oblique");
+ staticpro (&Qoblique);
+ Qitalic = intern ("italic");
+ staticpro (&Qitalic);
+ Qreverse_oblique = intern ("reverse-oblique");
+ staticpro (&Qreverse_oblique);
+ Qreverse_italic = intern ("reverse-italic");
+ staticpro (&Qreverse_italic);
+ Qultra_condensed = intern ("ultra-condensed");
+ staticpro (&Qultra_condensed);
+ Qextra_condensed = intern ("extra-condensed");
+ staticpro (&Qextra_condensed);
+ Qcondensed = intern ("condensed");
+ staticpro (&Qcondensed);
+ Qsemi_condensed = intern ("semi-condensed");
+ staticpro (&Qsemi_condensed);
+ Qsemi_expanded = intern ("semi-expanded");
+ staticpro (&Qsemi_expanded);
+ Qexpanded = intern ("expanded");
+ staticpro (&Qexpanded);
+ Qextra_expanded = intern ("extra-expanded");
+ staticpro (&Qextra_expanded);
+ Qultra_expanded = intern ("ultra-expanded");
+ staticpro (&Qultra_expanded);
+ Qbackground_color = intern ("background-color");
+ staticpro (&Qbackground_color);
+ Qforeground_color = intern ("foreground-color");
+ staticpro (&Qforeground_color);
+ Qunspecified = intern ("unspecified");
+ staticpro (&Qunspecified);
+
+ Qface_alias = intern ("face-alias");
+ staticpro (&Qface_alias);
+ Qdefault = intern ("default");
+ staticpro (&Qdefault);
+ Qtool_bar = intern ("tool-bar");
+ staticpro (&Qtool_bar);
+ Qregion = intern ("region");
+ staticpro (&Qregion);
+ Qfringe = intern ("fringe");
+ staticpro (&Qfringe);
+ Qheader_line = intern ("header-line");
+ staticpro (&Qheader_line);
+ Qscroll_bar = intern ("scroll-bar");
+ staticpro (&Qscroll_bar);
+ Qmenu = intern ("menu");
+ staticpro (&Qmenu);
+ Qcursor = intern ("cursor");
+ staticpro (&Qcursor);
+ Qborder = intern ("border");
+ staticpro (&Qborder);
+ Qmouse = intern ("mouse");
+ staticpro (&Qmouse);
+ Qtty_color_desc = intern ("tty-color-desc");
+ staticpro (&Qtty_color_desc);
+ Qtty_color_by_index = intern ("tty-color-by-index");
+ staticpro (&Qtty_color_by_index);
+ Qtty_color_alist = intern ("tty-color-alist");
+ staticpro (&Qtty_color_alist);
+ Qscalable_fonts_allowed = intern ("scalable-fonts-allowed");
+ staticpro (&Qscalable_fonts_allowed);
+
+ Vparam_value_alist = Fcons (Fcons (Qnil, Qnil), Qnil);
+ staticpro (&Vparam_value_alist);
+ Vface_alternative_font_family_alist = Qnil;
+ staticpro (&Vface_alternative_font_family_alist);
+ Vface_alternative_font_registry_alist = Qnil;
+ staticpro (&Vface_alternative_font_registry_alist);
+
+ defsubr (&Sinternal_make_lisp_face);
+ defsubr (&Sinternal_lisp_face_p);
+ defsubr (&Sinternal_set_lisp_face_attribute);
+#ifdef HAVE_WINDOW_SYSTEM
+ defsubr (&Sinternal_set_lisp_face_attribute_from_resource);
+#endif
+ defsubr (&Scolor_gray_p);
+ defsubr (&Scolor_supported_p);
+ defsubr (&Sinternal_get_lisp_face_attribute);
+ defsubr (&Sinternal_lisp_face_attribute_values);
+ defsubr (&Sinternal_lisp_face_equal_p);
+ defsubr (&Sinternal_lisp_face_empty_p);
+ defsubr (&Sinternal_copy_lisp_face);
+ defsubr (&Sinternal_merge_in_global_face);
+ defsubr (&Sface_font);
+ defsubr (&Sframe_face_alist);
+ defsubr (&Sinternal_set_font_selection_order);
+ defsubr (&Sinternal_set_alternative_font_family_alist);
+ defsubr (&Sinternal_set_alternative_font_registry_alist);
+#if GLYPH_DEBUG
+ defsubr (&Sdump_face);
+ defsubr (&Sshow_face_resources);
+#endif /* GLYPH_DEBUG */
+ defsubr (&Sclear_face_cache);
+ defsubr (&Stty_suppress_bold_inverse_default_colors);
+
+#if defined DEBUG_X_COLORS && defined HAVE_X_WINDOWS
+ defsubr (&Sdump_colors);
+#endif
+
+ DEFVAR_LISP ("font-list-limit", &Vfont_list_limit,
+ "*Limit for font matching.\n\
+If an integer > 0, font matching functions won't load more than\n\
+that number of fonts when searching for a matching font.");
+ Vfont_list_limit = make_number (DEFAULT_FONT_LIST_LIMIT);
+
+ DEFVAR_LISP ("face-new-frame-defaults", &Vface_new_frame_defaults,
+ "List of global face definitions (for internal use only.)");
+ Vface_new_frame_defaults = Qnil;
+
+ DEFVAR_LISP ("face-default-stipple", &Vface_default_stipple,
+ "*Default stipple pattern used on monochrome displays.\n\
+This stipple pattern is used on monochrome displays\n\
+instead of shades of gray for a face background color.\n\
+See `set-face-stipple' for possible values for this variable.");
+ Vface_default_stipple = build_string ("gray3");
+
+ DEFVAR_LISP ("tty-defined-color-alist", &Vtty_defined_color_alist,
+ "An alist of defined terminal colors and their RGB values.");
+ Vtty_defined_color_alist = Qnil;
+
+ DEFVAR_LISP ("scalable-fonts-allowed", &Vscalable_fonts_allowed,
+ "Allowed scalable fonts.\n\
+A value of nil means don't allow any scalable fonts.\n\
+A value of t means allow any scalable font.\n\
+Otherwise, value must be a list of regular expressions. A font may be\n\
+scaled if its name matches a regular expression in the list.\n\
+Note that if value is nil, a scalable font might still be used, if no\n\
+other font of the appropriate family and registry is available.");
+ Vscalable_fonts_allowed = Qnil;
+
+ DEFVAR_LISP ("face-ignored-fonts", &Vface_ignored_fonts,
+ "List of ignored fonts.\n\
+Each element is a regular expression that matches names of fonts to ignore.");
+ Vface_ignored_fonts = Qnil;
+
+#ifdef HAVE_WINDOW_SYSTEM
+ defsubr (&Sbitmap_spec_p);
+ defsubr (&Sx_list_fonts);
+ defsubr (&Sinternal_face_x_get_resource);
+ defsubr (&Sx_family_fonts);
+ defsubr (&Sx_font_family_list);
+#endif /* HAVE_WINDOW_SYSTEM */
+}
diff --git a/tests/contrib/xfaces/orig b/tests/contrib/xfaces/orig
new file mode 100644
index 0000000..1aaacd1
--- /dev/null
+++ b/tests/contrib/xfaces/orig
@@ -0,0 +1,7253 @@
+/* xfaces.c -- "Face" primitives.
+ Copyright (C) 1993, 1994, 1998, 1999, 2000, 2001
+ Free Software Foundation.
+
+This file is part of GNU Emacs.
+
+GNU Emacs is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2, or (at your option)
+any later version.
+
+GNU Emacs is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with GNU Emacs; see the file COPYING. If not, write to
+the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+Boston, MA 02111-1307, USA. */
+
+/* New face implementation by Gerd Moellmann <gerd@gnu.org>. */
+
+/* Faces.
+
+ When using Emacs with X, the display style of characters can be
+ changed by defining `faces'. Each face can specify the following
+ display attributes:
+
+ 1. Font family name.
+
+ 2. Relative proportionate width, aka character set width or set
+ width (swidth), e.g. `semi-compressed'.
+
+ 3. Font height in 1/10pt.
+
+ 4. Font weight, e.g. `bold'.
+
+ 5. Font slant, e.g. `italic'.
+
+ 6. Foreground color.
+
+ 7. Background color.
+
+ 8. Whether or not characters should be underlined, and in what color.
+
+ 9. Whether or not characters should be displayed in inverse video.
+
+ 10. A background stipple, a bitmap.
+
+ 11. Whether or not characters should be overlined, and in what color.
+
+ 12. Whether or not characters should be strike-through, and in what
+ color.
+
+ 13. Whether or not a box should be drawn around characters, the box
+ type, and, for simple boxes, in what color.
+
+ 14. Font or fontset pattern, or nil. This is a special attribute.
+ When this attribute is specified, the face uses a font opened by
+ that pattern as is. In addition, all the other font-related
+ attributes (1st thru 5th) are generated from the opened font name.
+   On the other hand, if one of the other font-related attributes is
+ specified, this attribute is set to nil. In that case, the face
+ doesn't inherit this attribute from the `default' face, and uses a
+ font determined by the other attributes (those may be inherited
+ from the `default' face).
+
+ 15. A face name or list of face names from which to inherit attributes.
+
+ 16. A specified average font width, which is invisible from Lisp,
+ and is used to ensure that a font specified on the command line,
+ for example, can be matched exactly.
+
+   Faces are frame-local by nature because Emacs allows defining the
+ same named face (face names are symbols) differently for different
+ frames. Each frame has an alist of face definitions for all named
+ faces. The value of a named face in such an alist is a Lisp vector
+ with the symbol `face' in slot 0, and a slot for each of the face
+ attributes mentioned above.
+
+ There is also a global face alist `Vface_new_frame_defaults'. Face
+ definitions from this list are used to initialize faces of newly
+ created frames.
+
+ A face doesn't have to specify all attributes. Those not specified
+ have a value of `unspecified'. Faces specifying all attributes but
+ the 14th are called `fully-specified'.
+
+
+ Face merging.
+
+ The display style of a given character in the text is determined by
+ combining several faces. This process is called `face merging'.
+ Any aspect of the display style that isn't specified by overlays or
+   text properties is taken from the `default' face.  Since the
+   default face is guaranteed to be fully-specified, face merging
+ always results in a fully-specified face.
+
+
+ Face realization.
+
+ After all face attributes for a character have been determined by
+ merging faces of that character, that face is `realized'. The
+ realization process maps face attributes to what is physically
+ available on the system where Emacs runs. The result is a
+ `realized face' in form of a struct face which is stored in the
+ face cache of the frame on which it was realized.
+
+ Face realization is done in the context of the character to display
+ because different fonts may be used for different characters. In
+ other words, for characters that have different font
+ specifications, different realized faces are needed to display
+ them.
+
+ Font specification is done by fontsets. See the comment in
+ fontset.c for the details. In the current implementation, all ASCII
+ characters share the same font in a fontset.
+
+ Faces are at first realized for ASCII characters, and, at that
+ time, assigned a specific realized fontset. Hereafter, we call
+   such a face an `ASCII face'.  When a face for a multibyte character
+ is realized, it inherits (thus shares) a fontset of an ASCII face
+ that has the same attributes other than font-related ones.
+
+   Thus, all realized faces have a realized fontset.
+
+
+ Unibyte text.
+
+ Unibyte text (i.e. raw 8-bit characters) is displayed with the same
+ font as ASCII characters. That is because it is expected that
+ unibyte text users specify a font that is suitable both for ASCII
+ and raw 8-bit characters.
+
+
+ Font selection.
+
+ Font selection tries to find the best available matching font for a
+ given (character, face) combination.
+
+ If the face specifies a fontset name, that fontset determines a
+ pattern for fonts of the given character. If the face specifies a
+ font name or the other font-related attributes, a fontset is
+ realized from the default fontset. In that case, that
+ specification determines a pattern for ASCII characters and the
+ default fontset determines a pattern for multibyte characters.
+
+ Available fonts on the system on which Emacs runs are then matched
+ against the font pattern. The result of font selection is the best
+ match for the given face attributes in this font list.
+
+ Font selection can be influenced by the user.
+
+ 1. The user can specify the relative importance he gives the face
+ attributes width, height, weight, and slant by setting
+ face-font-selection-order (faces.el) to a list of face attribute
+ names. The default is '(:width :height :weight :slant), and means
+ that font selection first tries to find a good match for the font
+ width specified by a face, then---within fonts with that
+ width---tries to find a best match for the specified font height,
+ etc.
+
+ 2. Setting face-font-family-alternatives allows the user to
+ specify alternative font families to try if a family specified by a
+ face doesn't exist.
+
+ 3. Setting face-font-registry-alternatives allows the user to
+ specify all alternative font registries to try for a face
+ specifying a registry.
+
+ 4. Setting face-ignored-fonts allows the user to ignore specific
+ fonts.
+
+
+ Character composition.
+
+ Usually, the realization process is already finished when Emacs
+ actually reflects the desired glyph matrix on the screen. However,
+ on displaying a composition (sequence of characters to be composed
+ on the screen), a suitable font for the components of the
+ composition is selected and realized while drawing them on the
+ screen, i.e. the realization process is delayed but in principle
+ the same.
+
+
+ Initialization of basic faces.
+
+   The faces `default' and `modeline' are considered `basic faces'.
+ When redisplay happens the first time for a newly created frame,
+ basic faces are realized for CHARSET_ASCII. Frame parameters are
+ used to fill in unspecified attributes of the default face. */
+
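+/* Illustrative sketch (an added comment, not part of the original
+   text): with the representation described above, a named face such
+   as `bold' appears in a frame's face alist roughly as
+
+       (bold . [face unspecified ... bold ... unspecified])
+
+   i.e. the value is a Lisp vector whose slot 0 is the symbol `face',
+   whose remaining slots hold the attributes listed above, and in
+   which `unspecified' fills every attribute the face does not set.  */
+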
+#include <config.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include "lisp.h"
+#include "charset.h"
+#include "keyboard.h"
+#include "frame.h"
+
+#ifdef HAVE_WINDOW_SYSTEM
+#include "fontset.h"
+#endif /* HAVE_WINDOW_SYSTEM */
+
+#ifdef HAVE_X_WINDOWS
+#include "xterm.h"
+#ifdef USE_MOTIF
+#include <Xm/Xm.h>
+#include <Xm/XmStrDefs.h>
+#endif /* USE_MOTIF */
+#endif /* HAVE_X_WINDOWS */
+
+#ifdef MSDOS
+#include "dosfns.h"
+#endif
+
+#ifdef WINDOWSNT
+#include "w32term.h"
+#include "fontset.h"
+/* Redefine X specifics to W32 equivalents to avoid cluttering the
+ code with #ifdef blocks. */
+#undef FRAME_X_DISPLAY_INFO
+#define FRAME_X_DISPLAY_INFO FRAME_W32_DISPLAY_INFO
+#define x_display_info w32_display_info
+#define FRAME_X_FONT_TABLE FRAME_W32_FONT_TABLE
+#define check_x check_w32
+#define x_list_fonts w32_list_fonts
+#define GCGraphicsExposures 0
+/* For historic reasons, FONT_WIDTH refers to average width on W32,
+ not maximum as on X. Redefine here. */
+#undef FONT_WIDTH
+#define FONT_WIDTH FONT_MAX_WIDTH
+#endif /* WINDOWSNT */
+
+#ifdef macintosh
+#include "macterm.h"
+#define x_display_info mac_display_info
+#define check_x check_mac
+
+extern XGCValues *XCreateGC (void *, WindowPtr, unsigned long, XGCValues *);
+
+static INLINE GC
+x_create_gc (f, mask, xgcv)
+ struct frame *f;
+ unsigned long mask;
+ XGCValues *xgcv;
+{
+ GC gc;
+ gc = XCreateGC (FRAME_MAC_DISPLAY (f), FRAME_MAC_WINDOW (f), mask, xgcv);
+ return gc;
+}
+
+static INLINE void
+x_free_gc (f, gc)
+ struct frame *f;
+ GC gc;
+{
+ XFreeGC (FRAME_MAC_DISPLAY (f), gc);
+}
+#endif
+
+#include "buffer.h"
+#include "dispextern.h"
+#include "blockinput.h"
+#include "window.h"
+#include "intervals.h"
+
+#ifdef HAVE_X_WINDOWS
+
+/* Compensate for a bug in Xos.h on some systems, on which it requires
+ time.h. On some such systems, Xos.h tries to redefine struct
+ timeval and struct timezone if USG is #defined while it is
+ #included. */
+
+#ifdef XOS_NEEDS_TIME_H
+#include <time.h>
+#undef USG
+#include <X11/Xos.h>
+#define USG
+#define __TIMEVAL__
+#else /* not XOS_NEEDS_TIME_H */
+#include <X11/Xos.h>
+#endif /* not XOS_NEEDS_TIME_H */
+
+#endif /* HAVE_X_WINDOWS */
+
+#include <stdio.h>
+#include <ctype.h>
+
+#ifndef max
+#define max(A, B) ((A) > (B) ? (A) : (B))
+#define min(A, B) ((A) < (B) ? (A) : (B))
+#define abs(X) ((X) < 0 ? -(X) : (X))
+#endif
+
+/* Number of pt per inch (from the TeXbook). */
+
+#define PT_PER_INCH 72.27
+
+/* Non-zero if face attribute ATTR is unspecified. */
+
+#define UNSPECIFIEDP(ATTR) EQ ((ATTR), Qunspecified)
+
+/* Value is the number of elements of VECTOR. */
+
+#define DIM(VECTOR) (sizeof (VECTOR) / sizeof *(VECTOR))
+
+/* Make a copy of string S on the stack using alloca. Value is a pointer
+ to the copy. */
+
+#define STRDUPA(S) strcpy ((char *) alloca (strlen ((S)) + 1), (S))
+
+/* Make a copy of the contents of Lisp string S on the stack using
+ alloca. Value is a pointer to the copy. */
+
+#define LSTRDUPA(S) STRDUPA (XSTRING ((S))->data)
+
+/* Size of hash table of realized faces in face caches (should be a
+ prime number). */
+
+#define FACE_CACHE_BUCKETS_SIZE 1001
+
+/* A definition of XColor for non-X frames. */
+
+#ifndef HAVE_X_WINDOWS
+
+typedef struct
+{
+ unsigned long pixel;
+ unsigned short red, green, blue;
+ char flags;
+ char pad;
+}
+XColor;
+
+#endif /* not HAVE_X_WINDOWS */
+
+/* Keyword symbols used for face attribute names. */
+
+Lisp_Object QCfamily, QCheight, QCweight, QCslant, QCunderline;
+Lisp_Object QCinverse_video, QCforeground, QCbackground, QCstipple;
+Lisp_Object QCwidth, QCfont, QCbold, QCitalic;
+Lisp_Object QCreverse_video;
+Lisp_Object QCoverline, QCstrike_through, QCbox, QCinherit;
+
+/* Symbols used for attribute values. */
+
+Lisp_Object Qnormal, Qbold, Qultra_light, Qextra_light, Qlight;
+Lisp_Object Qsemi_light, Qsemi_bold, Qextra_bold, Qultra_bold;
+Lisp_Object Qoblique, Qitalic, Qreverse_oblique, Qreverse_italic;
+Lisp_Object Qultra_condensed, Qextra_condensed, Qcondensed;
+Lisp_Object Qsemi_condensed, Qsemi_expanded, Qexpanded, Qextra_expanded;
+Lisp_Object Qultra_expanded;
+Lisp_Object Qreleased_button, Qpressed_button;
+Lisp_Object QCstyle, QCcolor, QCline_width;
+Lisp_Object Qunspecified;
+
+char unspecified_fg[] = "unspecified-fg", unspecified_bg[] = "unspecified-bg";
+
+/* The name of the function to call when the background of the frame
+ has changed, frame_update_face_colors. */
+
+Lisp_Object Qframe_update_face_colors;
+
+/* Names of basic faces. */
+
+Lisp_Object Qdefault, Qtool_bar, Qregion, Qfringe;
+Lisp_Object Qheader_line, Qscroll_bar, Qcursor, Qborder, Qmouse, Qmenu;
+extern Lisp_Object Qmode_line;
+
+/* The symbol `face-alias'.  A symbol having that property is an
+   alias for another face.  The value of the property is the name of
+ the aliased face. */
+
+Lisp_Object Qface_alias;
+
+/* Names of frame parameters related to faces. */
+
+extern Lisp_Object Qscroll_bar_foreground, Qscroll_bar_background;
+extern Lisp_Object Qborder_color, Qcursor_color, Qmouse_color;
+
+/* Default stipple pattern used on monochrome displays. This stipple
+ pattern is used on monochrome displays instead of shades of gray
+ for a face background color. See `set-face-stipple' for possible
+ values for this variable. */
+
+Lisp_Object Vface_default_stipple;
+
+/* Alist of alternative font families. Each element is of the form
+ (FAMILY FAMILY1 FAMILY2 ...). If fonts of FAMILY can't be loaded,
+ try FAMILY1, then FAMILY2, ... */
+
+Lisp_Object Vface_alternative_font_family_alist;
+
+/* Alist of alternative font registries. Each element is of the form
+ (REGISTRY REGISTRY1 REGISTRY2...). If fonts of REGISTRY can't be
+ loaded, try REGISTRY1, then REGISTRY2, ... */
+
+Lisp_Object Vface_alternative_font_registry_alist;
+
+/* Allowed scalable fonts. A value of nil means don't allow any
+ scalable fonts. A value of t means allow the use of any scalable
+ font. Otherwise, value must be a list of regular expressions. A
+ font may be scaled if its name matches a regular expression in the
+ list. */
+
+Lisp_Object Vscalable_fonts_allowed, Qscalable_fonts_allowed;
+
+/* List of regular expressions that matches names of fonts to ignore. */
+
+Lisp_Object Vface_ignored_fonts;
+
+/* Maximum number of fonts to consider in font_list. If not an
+ integer > 0, DEFAULT_FONT_LIST_LIMIT is used instead. */
+
+Lisp_Object Vfont_list_limit;
+#define DEFAULT_FONT_LIST_LIMIT 100
+
+/* The symbols `foreground-color' and `background-color' which can be
+ used as part of a `face' property. This is for compatibility with
+ Emacs 20.2. */
+
+Lisp_Object Qforeground_color, Qbackground_color;
+
+/* The symbols `face' and `mouse-face' used as text properties. */
+
+Lisp_Object Qface;
+extern Lisp_Object Qmouse_face;
+
+/* Error symbol for wrong_type_argument in load_pixmap. */
+
+Lisp_Object Qbitmap_spec_p;
+
+/* Alist of global face definitions. Each element is of the form
+ (FACE . LFACE) where FACE is a symbol naming a face and LFACE
+ is a Lisp vector of face attributes. These faces are used
+ to initialize faces for new frames. */
+
+Lisp_Object Vface_new_frame_defaults;
+
+/* The next ID to assign to Lisp faces. */
+
+static int next_lface_id;
+
+/* A vector mapping Lisp face Id's to face names. */
+
+static Lisp_Object *lface_id_to_name;
+static int lface_id_to_name_size;
+
+/* TTY color-related functions (defined in tty-colors.el). */
+
+Lisp_Object Qtty_color_desc, Qtty_color_by_index;
+
+/* The name of the function used to compute colors on TTYs. */
+
+Lisp_Object Qtty_color_alist;
+
+/* An alist of defined terminal colors and their RGB values. */
+
+Lisp_Object Vtty_defined_color_alist;
+
+/* Counter for calls to clear_face_cache. If this counter reaches
+ CLEAR_FONT_TABLE_COUNT, and a frame has more than
+   CLEAR_FONT_TABLE_NFONTS fonts loaded, unused fonts are freed.  */
+
+static int clear_font_table_count;
+#define CLEAR_FONT_TABLE_COUNT 100
+#define CLEAR_FONT_TABLE_NFONTS 10
+
+/* Non-zero means face attributes have been changed since the last
+ redisplay. Used in redisplay_internal. */
+
+int face_change_count;
+
+/* Non-zero means don't display bold text if a face's foreground
+ and background colors are the inverse of the default colors of the
+ display. This is a kluge to suppress `bold black' foreground text
+ which is hard to read on an LCD monitor. */
+
+int tty_suppress_bold_inverse_default_colors_p;
+
+/* A list of the form `((x . y))' used to avoid consing in
+ Finternal_set_lisp_face_attribute. */
+
+static Lisp_Object Vparam_value_alist;
+
+/* The total number of colors currently allocated. */
+
+#if GLYPH_DEBUG
+static int ncolors_allocated;
+static int npixmaps_allocated;
+static int ngcs;
+#endif
+
+/* Non-zero means the definition of the `menu' face for new frames has
+ been changed. */
+
+int menu_face_changed_default;
+
+
+/* Function prototypes. */
+
+struct font_name;
+struct table_entry;
+
+static void map_tty_color P_ ((struct frame *, struct face *,
+ enum lface_attribute_index, int *));
+static Lisp_Object resolve_face_name P_ ((Lisp_Object));
+static int may_use_scalable_font_p P_ ((char *));
+static void set_font_frame_param P_ ((Lisp_Object, Lisp_Object));
+static int better_font_p P_ ((int *, struct font_name *, struct font_name *,
+ int, int));
+static int x_face_list_fonts P_ ((struct frame *, char *,
+ struct font_name *, int, int));
+static int font_scalable_p P_ ((struct font_name *));
+static int get_lface_attributes P_ ((struct frame *, Lisp_Object, Lisp_Object *, int));
+static int load_pixmap P_ ((struct frame *, Lisp_Object, unsigned *, unsigned *));
+static unsigned char *xstrlwr P_ ((unsigned char *));
+static void signal_error P_ ((char *, Lisp_Object));
+static struct frame *frame_or_selected_frame P_ ((Lisp_Object, int));
+static void load_face_font P_ ((struct frame *, struct face *, int));
+static void load_face_colors P_ ((struct frame *, struct face *, Lisp_Object *));
+static void free_face_colors P_ ((struct frame *, struct face *));
+static int face_color_gray_p P_ ((struct frame *, char *));
+static char *build_font_name P_ ((struct font_name *));
+static void free_font_names P_ ((struct font_name *, int));
+static int sorted_font_list P_ ((struct frame *, char *,
+ int (*cmpfn) P_ ((const void *, const void *)),
+ struct font_name **));
+static int font_list_1 P_ ((struct frame *, Lisp_Object, Lisp_Object,
+ Lisp_Object, struct font_name **));
+static int font_list P_ ((struct frame *, Lisp_Object, Lisp_Object,
+ Lisp_Object, struct font_name **));
+static int try_font_list P_ ((struct frame *, Lisp_Object *,
+ Lisp_Object, Lisp_Object, struct font_name **));
+static int try_alternative_families P_ ((struct frame *f, Lisp_Object,
+ Lisp_Object, struct font_name **));
+static int cmp_font_names P_ ((const void *, const void *));
+static struct face *realize_face P_ ((struct face_cache *, Lisp_Object *, int,
+ struct face *, int));
+static struct face *realize_x_face P_ ((struct face_cache *,
+ Lisp_Object *, int, struct face *));
+static struct face *realize_tty_face P_ ((struct face_cache *,
+ Lisp_Object *, int));
+static int realize_basic_faces P_ ((struct frame *));
+static int realize_default_face P_ ((struct frame *));
+static void realize_named_face P_ ((struct frame *, Lisp_Object, int));
+static int lface_fully_specified_p P_ ((Lisp_Object *));
+static int lface_equal_p P_ ((Lisp_Object *, Lisp_Object *));
+static unsigned hash_string_case_insensitive P_ ((Lisp_Object));
+static unsigned lface_hash P_ ((Lisp_Object *));
+static int lface_same_font_attributes_p P_ ((Lisp_Object *, Lisp_Object *));
+static struct face_cache *make_face_cache P_ ((struct frame *));
+static void free_realized_face P_ ((struct frame *, struct face *));
+static void clear_face_gcs P_ ((struct face_cache *));
+static void free_face_cache P_ ((struct face_cache *));
+static int face_numeric_weight P_ ((Lisp_Object));
+static int face_numeric_slant P_ ((Lisp_Object));
+static int face_numeric_swidth P_ ((Lisp_Object));
+static int face_fontset P_ ((Lisp_Object *));
+static char *choose_face_font P_ ((struct frame *, Lisp_Object *, int, int));
+static void merge_face_vectors P_ ((struct frame *, Lisp_Object *, Lisp_Object*, Lisp_Object));
+static void merge_face_inheritance P_ ((struct frame *f, Lisp_Object,
+ Lisp_Object *, Lisp_Object));
+static void merge_face_vector_with_property P_ ((struct frame *, Lisp_Object *,
+ Lisp_Object));
+static int set_lface_from_font_name P_ ((struct frame *, Lisp_Object,
+ Lisp_Object, int, int));
+static Lisp_Object lface_from_face_name P_ ((struct frame *, Lisp_Object, int));
+static struct face *make_realized_face P_ ((Lisp_Object *));
+static void free_realized_faces P_ ((struct face_cache *));
+static char *best_matching_font P_ ((struct frame *, Lisp_Object *,
+ struct font_name *, int, int));
+static void cache_face P_ ((struct face_cache *, struct face *, unsigned));
+static void uncache_face P_ ((struct face_cache *, struct face *));
+static int xlfd_numeric_slant P_ ((struct font_name *));
+static int xlfd_numeric_weight P_ ((struct font_name *));
+static int xlfd_numeric_swidth P_ ((struct font_name *));
+static Lisp_Object xlfd_symbolic_slant P_ ((struct font_name *));
+static Lisp_Object xlfd_symbolic_weight P_ ((struct font_name *));
+static Lisp_Object xlfd_symbolic_swidth P_ ((struct font_name *));
+static int xlfd_fixed_p P_ ((struct font_name *));
+static int xlfd_numeric_value P_ ((struct table_entry *, int, struct font_name *,
+ int, int));
+static Lisp_Object xlfd_symbolic_value P_ ((struct table_entry *, int,
+ struct font_name *, int,
+ Lisp_Object));
+static struct table_entry *xlfd_lookup_field_contents P_ ((struct table_entry *, int,
+ struct font_name *, int));
+
+#ifdef HAVE_WINDOW_SYSTEM
+
+static int split_font_name P_ ((struct frame *, struct font_name *, int));
+static int xlfd_point_size P_ ((struct frame *, struct font_name *));
+static void sort_fonts P_ ((struct frame *, struct font_name *, int,
+ int (*cmpfn) P_ ((const void *, const void *))));
+static GC x_create_gc P_ ((struct frame *, unsigned long, XGCValues *));
+static void x_free_gc P_ ((struct frame *, GC));
+static void clear_font_table P_ ((struct x_display_info *));
+
+#ifdef WINDOWSNT
+extern Lisp_Object w32_list_fonts P_ ((struct frame *, Lisp_Object, int, int));
+#endif /* WINDOWSNT */
+
+#ifdef USE_X_TOOLKIT
+static void x_update_menu_appearance P_ ((struct frame *));
+#endif /* USE_X_TOOLKIT */
+
+#endif /* HAVE_WINDOW_SYSTEM */
+
+
+/***********************************************************************
+ Utilities
+ ***********************************************************************/
+
+#ifdef HAVE_X_WINDOWS
+
+#ifdef DEBUG_X_COLORS
+
+/* The following is a poor man's infrastructure for debugging X color
+ allocation problems on displays with PseudoColor-8. Some X servers
+ like 3.3.5 XF86_SVGA with Matrox cards apparently don't implement
+ color reference counts completely so that they don't signal an
+ error when a color is freed whose reference count is already 0.
+ Other X servers do. To help me debug this, the following code
+   implements a simple reference counting scheme of its own, for a
+ single display/screen. --gerd. */
+
+/* Reference counts for pixel colors. */
+
+int color_count[256];
+
+/* Register color PIXEL as allocated. */
+
+void
+register_color (pixel)
+ unsigned long pixel;
+{
+ xassert (pixel < 256);
+ ++color_count[pixel];
+}
+
+
+/* Register color PIXEL as deallocated. */
+
+void
+unregister_color (pixel)
+ unsigned long pixel;
+{
+ xassert (pixel < 256);
+ if (color_count[pixel] > 0)
+ --color_count[pixel];
+ else
+ abort ();
+}
+
+
+/* Register N colors from PIXELS as deallocated. */
+
+void
+unregister_colors (pixels, n)
+ unsigned long *pixels;
+ int n;
+{
+ int i;
+ for (i = 0; i < n; ++i)
+ unregister_color (pixels[i]);
+}
+
+
+DEFUN ("dump-colors", Fdump_colors, Sdump_colors, 0, 0, 0,
+ "Dump currently allocated colors and their reference counts to stderr.")
+ ()
+{
+ int i, n;
+
+ fputc ('\n', stderr);
+
+ for (i = n = 0; i < sizeof color_count / sizeof color_count[0]; ++i)
+ if (color_count[i])
+ {
+ fprintf (stderr, "%3d: %5d", i, color_count[i]);
+ ++n;
+ if (n % 5 == 0)
+ fputc ('\n', stderr);
+ else
+ fputc ('\t', stderr);
+ }
+
+ if (n % 5 != 0)
+ fputc ('\n', stderr);
+ return Qnil;
+}
+
+#endif /* DEBUG_X_COLORS */
+
+
+/* Free colors used on frame F. PIXELS is an array of NPIXELS pixel
+ color values. Interrupt input must be blocked when this function
+ is called. */
+
+void
+x_free_colors (f, pixels, npixels)
+ struct frame *f;
+ unsigned long *pixels;
+ int npixels;
+{
+ int class = FRAME_X_DISPLAY_INFO (f)->visual->class;
+
+ /* If display has an immutable color map, freeing colors is not
+ necessary and some servers don't allow it. So don't do it. */
+ if (class != StaticColor && class != StaticGray && class != TrueColor)
+ {
+#ifdef DEBUG_X_COLORS
+ unregister_colors (pixels, npixels);
+#endif
+ XFreeColors (FRAME_X_DISPLAY (f), FRAME_X_COLORMAP (f),
+ pixels, npixels, 0);
+ }
+}
+
+
+/* Free colors used on frame F. PIXELS is an array of NPIXELS pixel
+ color values. Interrupt input must be blocked when this function
+ is called. */
+
+void
+x_free_dpy_colors (dpy, screen, cmap, pixels, npixels)
+ Display *dpy;
+ Screen *screen;
+ Colormap cmap;
+ unsigned long *pixels;
+ int npixels;
+{
+ struct x_display_info *dpyinfo = x_display_info_for_display (dpy);
+ int class = dpyinfo->visual->class;
+
+ /* If display has an immutable color map, freeing colors is not
+ necessary and some servers don't allow it. So don't do it. */
+ if (class != StaticColor && class != StaticGray && class != TrueColor)
+ {
+#ifdef DEBUG_X_COLORS
+ unregister_colors (pixels, npixels);
+#endif
+ XFreeColors (dpy, cmap, pixels, npixels, 0);
+ }
+}
+
+
+/* Create and return a GC for use on frame F. GC values and mask
+ are given by XGCV and MASK. */
+
+static INLINE GC
+x_create_gc (f, mask, xgcv)
+ struct frame *f;
+ unsigned long mask;
+ XGCValues *xgcv;
+{
+ GC gc;
+ BLOCK_INPUT;
+ gc = XCreateGC (FRAME_X_DISPLAY (f), FRAME_X_WINDOW (f), mask, xgcv);
+ UNBLOCK_INPUT;
+ IF_DEBUG (++ngcs);
+ return gc;
+}
+
+
+/* Free GC which was used on frame F. */
+
+static INLINE void
+x_free_gc (f, gc)
+ struct frame *f;
+ GC gc;
+{
+ BLOCK_INPUT;
+ xassert (--ngcs >= 0);
+ XFreeGC (FRAME_X_DISPLAY (f), gc);
+ UNBLOCK_INPUT;
+}
+
+#endif /* HAVE_X_WINDOWS */
+
+#ifdef WINDOWSNT
+/* W32 emulation of GCs */
+
+static INLINE GC
+x_create_gc (f, mask, xgcv)
+ struct frame *f;
+ unsigned long mask;
+ XGCValues *xgcv;
+{
+ GC gc;
+ BLOCK_INPUT;
+ gc = XCreateGC (NULL, FRAME_W32_WINDOW (f), mask, xgcv);
+ UNBLOCK_INPUT;
+ IF_DEBUG (++ngcs);
+ return gc;
+}
+
+
+/* Free GC which was used on frame F. */
+
+static INLINE void
+x_free_gc (f, gc)
+ struct frame *f;
+ GC gc;
+{
+ BLOCK_INPUT;
+ xassert (--ngcs >= 0);
+ xfree (gc);
+ UNBLOCK_INPUT;
+}
+
+#endif /* WINDOWSNT */
+
+/* Like stricmp. Used to compare parts of font names which are in
+ ISO8859-1. */
+
+int
+xstricmp (s1, s2)
+ unsigned char *s1, *s2;
+{
+ while (*s1 && *s2)
+ {
+ unsigned char c1 = tolower (*s1);
+ unsigned char c2 = tolower (*s2);
+ if (c1 != c2)
+ return c1 < c2 ? -1 : 1;
+ ++s1, ++s2;
+ }
+
+ if (*s1 == 0)
+ return *s2 == 0 ? 0 : -1;
+ return 1;
+}
+
+
+/* Like strlwr, which might not always be available. */
+
+static unsigned char *
+xstrlwr (s)
+ unsigned char *s;
+{
+ unsigned char *p = s;
+
+ for (p = s; *p; ++p)
+ *p = tolower (*p);
+
+ return s;
+}
+
+
+/* Signal `error' with message S, and additional argument ARG. */
+
+static void
+signal_error (s, arg)
+ char *s;
+ Lisp_Object arg;
+{
+ Fsignal (Qerror, Fcons (build_string (s), Fcons (arg, Qnil)));
+}
+
+
+/* If FRAME is nil, return a pointer to the selected frame.
+ Otherwise, check that FRAME is a live frame, and return a pointer
+ to it. NPARAM is the parameter number of FRAME, for
+ CHECK_LIVE_FRAME. This is here because it's a frequent pattern in
+ Lisp function definitions. */
+
+static INLINE struct frame *
+frame_or_selected_frame (frame, nparam)
+ Lisp_Object frame;
+ int nparam;
+{
+ if (NILP (frame))
+ frame = selected_frame;
+
+ CHECK_LIVE_FRAME (frame, nparam);
+ return XFRAME (frame);
+}
+
+
+/***********************************************************************
+ Frames and faces
+ ***********************************************************************/
+
+/* Initialize face cache and basic faces for frame F. */
+
+void
+init_frame_faces (f)
+ struct frame *f;
+{
+ /* Make a face cache, if F doesn't have one. */
+ if (FRAME_FACE_CACHE (f) == NULL)
+ FRAME_FACE_CACHE (f) = make_face_cache (f);
+
+#ifdef HAVE_WINDOW_SYSTEM
+ /* Make the image cache. */
+ if (FRAME_WINDOW_P (f))
+ {
+ if (FRAME_X_IMAGE_CACHE (f) == NULL)
+ FRAME_X_IMAGE_CACHE (f) = make_image_cache ();
+ ++FRAME_X_IMAGE_CACHE (f)->refcount;
+ }
+#endif /* HAVE_WINDOW_SYSTEM */
+
+ /* Realize basic faces. Must have enough information in frame
+ parameters to realize basic faces at this point. */
+#ifdef HAVE_X_WINDOWS
+ if (!FRAME_X_P (f) || FRAME_X_WINDOW (f))
+#endif
+#ifdef WINDOWSNT
+ if (!FRAME_WINDOW_P (f) || FRAME_W32_WINDOW (f))
+#endif
+ if (!realize_basic_faces (f))
+ abort ();
+}
+
+
+/* Free face cache of frame F. Called from Fdelete_frame. */
+
+void
+free_frame_faces (f)
+ struct frame *f;
+{
+ struct face_cache *face_cache = FRAME_FACE_CACHE (f);
+
+ if (face_cache)
+ {
+ free_face_cache (face_cache);
+ FRAME_FACE_CACHE (f) = NULL;
+ }
+
+#ifdef HAVE_WINDOW_SYSTEM
+ if (FRAME_WINDOW_P (f))
+ {
+ struct image_cache *image_cache = FRAME_X_IMAGE_CACHE (f);
+ if (image_cache)
+ {
+ --image_cache->refcount;
+ if (image_cache->refcount == 0)
+ free_image_cache (f);
+ }
+ }
+#endif /* HAVE_WINDOW_SYSTEM */
+}
+
+
+/* Clear face caches, and recompute basic faces for frame F. Call
+ this after changing frame parameters on which those faces depend,
+ or when realized faces have been freed due to changing attributes
+ of named faces. */
+
+void
+recompute_basic_faces (f)
+ struct frame *f;
+{
+ if (FRAME_FACE_CACHE (f))
+ {
+ clear_face_cache (0);
+ if (!realize_basic_faces (f))
+ abort ();
+ }
+}
+
+
+/* Clear the face caches of all frames. CLEAR_FONTS_P non-zero means
+ try to free unused fonts, too. */
+
+void
+clear_face_cache (clear_fonts_p)
+ int clear_fonts_p;
+{
+#ifdef HAVE_WINDOW_SYSTEM
+ Lisp_Object tail, frame;
+ struct frame *f;
+
+ if (clear_fonts_p
+ || ++clear_font_table_count == CLEAR_FONT_TABLE_COUNT)
+ {
+ struct x_display_info *dpyinfo;
+
+ /* Fonts are common for frames on one display, i.e. on
+ one X screen. */
+ for (dpyinfo = x_display_list; dpyinfo; dpyinfo = dpyinfo->next)
+ if (dpyinfo->n_fonts > CLEAR_FONT_TABLE_NFONTS)
+ clear_font_table (dpyinfo);
+
+ /* From time to time see if we can unload some fonts. This also
+ frees all realized faces on all frames. Fonts needed by
+ faces will be loaded again when faces are realized again. */
+ clear_font_table_count = 0;
+
+ FOR_EACH_FRAME (tail, frame)
+ {
+ struct frame *f = XFRAME (frame);
+ if (FRAME_WINDOW_P (f)
+ && FRAME_X_DISPLAY_INFO (f)->n_fonts > CLEAR_FONT_TABLE_NFONTS)
+ free_all_realized_faces (frame);
+ }
+ }
+ else
+ {
+ /* Clear GCs of realized faces. */
+ FOR_EACH_FRAME (tail, frame)
+ {
+ f = XFRAME (frame);
+ if (FRAME_WINDOW_P (f))
+ {
+ clear_face_gcs (FRAME_FACE_CACHE (f));
+ clear_image_cache (f, 0);
+ }
+ }
+ }
+#endif /* HAVE_WINDOW_SYSTEM */
+}
+
+
+DEFUN ("clear-face-cache", Fclear_face_cache, Sclear_face_cache, 0, 1, 0,
+ "Clear face caches on all frames.\n\
+Optional THOROUGHLY non-nil means try to free unused fonts, too.")
+ (thoroughly)
+ Lisp_Object thoroughly;
+{
+ clear_face_cache (!NILP (thoroughly));
+ ++face_change_count;
+ ++windows_or_buffers_changed;
+ return Qnil;
+}
+
+
+
+#ifdef HAVE_WINDOW_SYSTEM
+
+
+/* Remove fonts from the font table of DPYINFO except for the default
+ ASCII fonts of frames on that display. Called from clear_face_cache
+ from time to time. */
+
+static void
+clear_font_table (dpyinfo)
+ struct x_display_info *dpyinfo;
+{
+ int i;
+
+ /* Free those fonts that are not used by frames on DPYINFO. */
+ for (i = 0; i < dpyinfo->n_fonts; ++i)
+ {
+ struct font_info *font_info = dpyinfo->font_table + i;
+ Lisp_Object tail, frame;
+
+ /* Check if slot is already free. */
+ if (font_info->name == NULL)
+ continue;
+
+ /* Don't free a default font of some frame on this display. */
+ FOR_EACH_FRAME (tail, frame)
+ {
+ struct frame *f = XFRAME (frame);
+ if (FRAME_WINDOW_P (f)
+ && FRAME_X_DISPLAY_INFO (f) == dpyinfo
+ && font_info->font == FRAME_FONT (f))
+ break;
+ }
+
+ if (!NILP (tail))
+ continue;
+
+ /* Free names. */
+ if (font_info->full_name != font_info->name)
+ xfree (font_info->full_name);
+ xfree (font_info->name);
+
+ /* Free the font. */
+ BLOCK_INPUT;
+#ifdef HAVE_X_WINDOWS
+ XFreeFont (dpyinfo->display, font_info->font);
+#endif
+#ifdef WINDOWSNT
+ w32_unload_font (dpyinfo, font_info->font);
+#endif
+ UNBLOCK_INPUT;
+
+ /* Mark font table slot free. */
+ font_info->font = NULL;
+ font_info->name = font_info->full_name = NULL;
+ }
+}
+
+#endif /* HAVE_WINDOW_SYSTEM */
+
+
+
+/***********************************************************************
+ X Pixmaps
+ ***********************************************************************/
+
+#ifdef HAVE_WINDOW_SYSTEM
+
+DEFUN ("bitmap-spec-p", Fbitmap_spec_p, Sbitmap_spec_p, 1, 1, 0,
+ "Value is non-nil if OBJECT is a valid bitmap specification.\n\
+A bitmap specification is either a string, a file name, or a list\n\
+(WIDTH HEIGHT DATA) where WIDTH is the pixel width of the bitmap,\n\
+HEIGHT is its height, and DATA is a string containing the bits of\n\
+the pixmap. Bits are stored row by row, each row occupies\n\
+(WIDTH + 7)/8 bytes.")
+ (object)
+ Lisp_Object object;
+{
+ int pixmap_p = 0;
+
+ if (STRINGP (object))
+ /* If OBJECT is a string, it's a file name. */
+ pixmap_p = 1;
+ else if (CONSP (object))
+ {
+ /* Otherwise OBJECT must be (WIDTH HEIGHT DATA), WIDTH and
+ HEIGHT must be integers > 0, and DATA must be string large
+ enough to hold a bitmap of the specified size. */
+ Lisp_Object width, height, data;
+
+ height = width = data = Qnil;
+
+ if (CONSP (object))
+ {
+ width = XCAR (object);
+ object = XCDR (object);
+ if (CONSP (object))
+ {
+ height = XCAR (object);
+ object = XCDR (object);
+ if (CONSP (object))
+ data = XCAR (object);
+ }
+ }
+
+ if (NATNUMP (width) && NATNUMP (height) && STRINGP (data))
+ {
+ int bytes_per_row = ((XFASTINT (width) + BITS_PER_CHAR - 1)
+ / BITS_PER_CHAR);
+ if (STRING_BYTES (XSTRING (data)) >= bytes_per_row * XINT (height))
+ pixmap_p = 1;
+ }
+ }
+
+ return pixmap_p ? Qt : Qnil;
+}
+
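+/* Illustrative example (an added comment, not part of the original
+   text): under the rules above, a 16x2 stipple could be given either
+   as a file name string or as a list such as
+       '(16 2 "\xff\x00\xff\x00")
+   where each row occupies (16 + 7) / 8 = 2 bytes and the data string
+   therefore needs at least 4 bytes.  */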
+
+/* Load a bitmap according to NAME (which is either a file name or a
+ pixmap spec) for use on frame F. Value is the bitmap_id (see
+ xfns.c). If NAME is nil, return with a bitmap id of zero. If
+ bitmap cannot be loaded, display a message saying so, and return
+ zero. Store the bitmap width in *W_PTR and its height in *H_PTR,
+ if these pointers are not null. */
+
+static int
+load_pixmap (f, name, w_ptr, h_ptr)
+ FRAME_PTR f;
+ Lisp_Object name;
+ unsigned int *w_ptr, *h_ptr;
+{
+ int bitmap_id;
+ Lisp_Object tem;
+
+ if (NILP (name))
+ return 0;
+
+ tem = Fbitmap_spec_p (name);
+ if (NILP (tem))
+ wrong_type_argument (Qbitmap_spec_p, name);
+
+ BLOCK_INPUT;
+ if (CONSP (name))
+ {
+ /* Decode a bitmap spec into a bitmap. */
+
+ int h, w;
+ Lisp_Object bits;
+
+ w = XINT (Fcar (name));
+ h = XINT (Fcar (Fcdr (name)));
+ bits = Fcar (Fcdr (Fcdr (name)));
+
+ bitmap_id = x_create_bitmap_from_data (f, XSTRING (bits)->data,
+ w, h);
+ }
+ else
+ {
+ /* It must be a string -- a file name. */
+ bitmap_id = x_create_bitmap_from_file (f, name);
+ }
+ UNBLOCK_INPUT;
+
+ if (bitmap_id < 0)
+ {
+ add_to_log ("Invalid or undefined bitmap %s", name, Qnil);
+ bitmap_id = 0;
+
+ if (w_ptr)
+ *w_ptr = 0;
+ if (h_ptr)
+ *h_ptr = 0;
+ }
+ else
+ {
+#if GLYPH_DEBUG
+ ++npixmaps_allocated;
+#endif
+ if (w_ptr)
+ *w_ptr = x_bitmap_width (f, bitmap_id);
+
+ if (h_ptr)
+ *h_ptr = x_bitmap_height (f, bitmap_id);
+ }
+
+ return bitmap_id;
+}
+
+#endif /* HAVE_WINDOW_SYSTEM */
+
+
+
+/***********************************************************************
+ Minimum font bounds
+ ***********************************************************************/
+
+#ifdef HAVE_WINDOW_SYSTEM
+
+/* Update the line_height of frame F. Return non-zero if line height
+ changes. */
+
+int
+frame_update_line_height (f)
+ struct frame *f;
+{
+ int line_height, changed_p;
+
+ line_height = FONT_HEIGHT (FRAME_FONT (f));
+ changed_p = line_height != FRAME_LINE_HEIGHT (f);
+ FRAME_LINE_HEIGHT (f) = line_height;
+ return changed_p;
+}
+
+#endif /* HAVE_WINDOW_SYSTEM */
+
+
+/***********************************************************************
+ Fonts
+ ***********************************************************************/
+
+#ifdef HAVE_WINDOW_SYSTEM
+
+/* Load font of face FACE which is used on frame F to display
+ character C. The name of the font to load is determined by lface
+ and fontset of FACE. */
+
+static void
+load_face_font (f, face, c)
+ struct frame *f;
+ struct face *face;
+ int c;
+{
+ struct font_info *font_info = NULL;
+ char *font_name;
+
+ face->font_info_id = -1;
+ face->font = NULL;
+
+ font_name = choose_face_font (f, face->lface, face->fontset, c);
+ if (!font_name)
+ return;
+
+ BLOCK_INPUT;
+ font_info = FS_LOAD_FACE_FONT (f, c, font_name, face);
+ UNBLOCK_INPUT;
+
+ if (font_info)
+ {
+ face->font_info_id = font_info->font_idx;
+ face->font = font_info->font;
+ face->font_name = font_info->full_name;
+ if (face->gc)
+ {
+ x_free_gc (f, face->gc);
+ face->gc = 0;
+ }
+ }
+ else
+ add_to_log ("Unable to load font %s",
+ build_string (font_name), Qnil);
+ xfree (font_name);
+}
+
+#endif /* HAVE_WINDOW_SYSTEM */
+
+
+
+/***********************************************************************
+ X Colors
+ ***********************************************************************/
+
+/* A version of defined_color for non-X frames. */
+
+int
+tty_defined_color (f, color_name, color_def, alloc)
+ struct frame *f;
+ char *color_name;
+ XColor *color_def;
+ int alloc;
+{
+ Lisp_Object color_desc;
+ unsigned long color_idx = FACE_TTY_DEFAULT_COLOR;
+ unsigned long red = 0, green = 0, blue = 0;
+ int status = 1;
+
+ if (*color_name && !NILP (Ffboundp (Qtty_color_desc)))
+ {
+ Lisp_Object frame;
+
+ XSETFRAME (frame, f);
+ status = 0;
+ color_desc = call2 (Qtty_color_desc, build_string (color_name), frame);
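+      /* The descriptor returned by tty-color-desc is expected to be of
+         the form (NAME INDEX [R G B]): the second element is the
+         terminal color index, and the optional trailing elements are
+         the color's RGB values, as extracted below.  */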
+ if (CONSP (color_desc) && CONSP (XCDR (color_desc)))
+ {
+ color_idx = XINT (XCAR (XCDR (color_desc)));
+ if (CONSP (XCDR (XCDR (color_desc))))
+ {
+ red = XINT (XCAR (XCDR (XCDR (color_desc))));
+ green = XINT (XCAR (XCDR (XCDR (XCDR (color_desc)))));
+ blue = XINT (XCAR (XCDR (XCDR (XCDR (XCDR (color_desc))))));
+ }
+ status = 1;
+ }
+ else if (NILP (Fsymbol_value (intern ("tty-defined-color-alist"))))
+ /* We were called early during startup, and the colors are not
+ yet set up in tty-defined-color-alist. Don't return a failure
+ indication, since this produces the annoying "Unable to
+ load color" messages in the *Messages* buffer. */
+ status = 1;
+ }
+ if (color_idx == FACE_TTY_DEFAULT_COLOR && *color_name)
+ {
+ if (strcmp (color_name, "unspecified-fg") == 0)
+ color_idx = FACE_TTY_DEFAULT_FG_COLOR;
+ else if (strcmp (color_name, "unspecified-bg") == 0)
+ color_idx = FACE_TTY_DEFAULT_BG_COLOR;
+ }
+
+ if (color_idx != FACE_TTY_DEFAULT_COLOR)
+ status = 1;
+
+ color_def->pixel = color_idx;
+ color_def->red = red;
+ color_def->green = green;
+ color_def->blue = blue;
+
+ return status;
+}
+
+
+/* Decide if color named COLOR_NAME is valid for the display
+ associated with the frame F; if so, return the rgb values in
+ COLOR_DEF. If ALLOC is nonzero, allocate a new colormap cell.
+
+ This does the right thing for any type of frame. */
+
+int
+defined_color (f, color_name, color_def, alloc)
+ struct frame *f;
+ char *color_name;
+ XColor *color_def;
+ int alloc;
+{
+ if (!FRAME_WINDOW_P (f))
+ return tty_defined_color (f, color_name, color_def, alloc);
+#ifdef HAVE_X_WINDOWS
+ else if (FRAME_X_P (f))
+ return x_defined_color (f, color_name, color_def, alloc);
+#endif
+#ifdef WINDOWSNT
+ else if (FRAME_W32_P (f))
+ return w32_defined_color (f, color_name, color_def, alloc);
+#endif
+#ifdef macintosh
+ else if (FRAME_MAC_P (f))
+ return mac_defined_color (f, color_name, color_def, alloc);
+#endif
+ else
+ abort ();
+}
+
+
+/* Given the index IDX of a tty color on frame F, return its name, a
+ Lisp string. */
+
+Lisp_Object
+tty_color_name (f, idx)
+ struct frame *f;
+ int idx;
+{
+ if (idx >= 0 && !NILP (Ffboundp (Qtty_color_by_index)))
+ {
+ Lisp_Object frame;
+ Lisp_Object coldesc;
+
+ XSETFRAME (frame, f);
+ coldesc = call2 (Qtty_color_by_index, make_number (idx), frame);
+
+ if (!NILP (coldesc))
+ return XCAR (coldesc);
+ }
+#ifdef MSDOS
+ /* We can have an MSDOG frame under -nw for a short window of
+ opportunity before internal_terminal_init is called. DTRT. */
+ if (FRAME_MSDOS_P (f) && !inhibit_window_system)
+ return msdos_stdcolor_name (idx);
+#endif
+
+ if (idx == FACE_TTY_DEFAULT_FG_COLOR)
+ return build_string (unspecified_fg);
+ if (idx == FACE_TTY_DEFAULT_BG_COLOR)
+ return build_string (unspecified_bg);
+
+#ifdef WINDOWSNT
+ return vga_stdcolor_name (idx);
+#endif
+
+ return Qunspecified;
+}
+
+
+/* Return non-zero if COLOR_NAME is a shade of gray (or white or
+ black) on frame F. The algorithm is taken from 20.2 faces.el. */
+
+static int
+face_color_gray_p (f, color_name)
+ struct frame *f;
+ char *color_name;
+{
+ XColor color;
+ int gray_p;
+
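+  /* Treat the color as gray when each pair of its RGB components
+     differs by less than 5% (1/20) of the larger of the two.  */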
+ if (defined_color (f, color_name, &color, 0))
+ gray_p = ((abs (color.red - color.green)
+ < max (color.red, color.green) / 20)
+ && (abs (color.green - color.blue)
+ < max (color.green, color.blue) / 20)
+ && (abs (color.blue - color.red)
+ < max (color.blue, color.red) / 20));
+ else
+ gray_p = 0;
+
+ return gray_p;
+}
+
+
+/* Return non-zero if color COLOR_NAME can be displayed on frame F.
+ BACKGROUND_P non-zero means the color will be used as background
+ color. */
+
+static int
+face_color_supported_p (f, color_name, background_p)
+ struct frame *f;
+ char *color_name;
+ int background_p;
+{
+ Lisp_Object frame;
+ XColor not_used;
+
+ XSETFRAME (frame, f);
+ return (FRAME_WINDOW_P (f)
+ ? (!NILP (Fxw_display_color_p (frame))
+ || xstricmp (color_name, "black") == 0
+ || xstricmp (color_name, "white") == 0
+ || (background_p
+ && face_color_gray_p (f, color_name))
+ || (!NILP (Fx_display_grayscale_p (frame))
+ && face_color_gray_p (f, color_name)))
+ : tty_defined_color (f, color_name, &not_used, 0));
+}
+
+
+DEFUN ("color-gray-p", Fcolor_gray_p, Scolor_gray_p, 1, 2, 0,
+ "Return non-nil if COLOR is a shade of gray (or white or black).\n\
+FRAME specifies the frame and thus the display for interpreting COLOR.\n\
+If FRAME is nil or omitted, use the selected frame.")
+ (color, frame)
+ Lisp_Object color, frame;
+{
+ struct frame *f;
+
+ CHECK_FRAME (frame, 0);
+ CHECK_STRING (color, 0);
+ f = XFRAME (frame);
+ return face_color_gray_p (f, XSTRING (color)->data) ? Qt : Qnil;
+}
+
+
+DEFUN ("color-supported-p", Fcolor_supported_p,
+ Scolor_supported_p, 2, 3, 0,
+ "Return non-nil if COLOR can be displayed on FRAME.\n\
+BACKGROUND-P non-nil means COLOR is used as a background.\n\
+If FRAME is nil or omitted, use the selected frame.\n\
+COLOR must be a valid color name.")
+ (color, frame, background_p)
+ Lisp_Object frame, color, background_p;
+{
+ struct frame *f;
+
+ CHECK_FRAME (frame, 0);
+ CHECK_STRING (color, 0);
+ f = XFRAME (frame);
+ if (face_color_supported_p (f, XSTRING (color)->data, !NILP (background_p)))
+ return Qt;
+ return Qnil;
+}
+
+
+/* Load color with name NAME for use by face FACE on frame F.
+ TARGET_INDEX must be one of LFACE_FOREGROUND_INDEX,
+ LFACE_BACKGROUND_INDEX, LFACE_UNDERLINE_INDEX, LFACE_OVERLINE_INDEX,
+ LFACE_STRIKE_THROUGH_INDEX, or LFACE_BOX_INDEX. Value is the
+ pixel color. If color cannot be loaded, display a message, and
+ return the foreground, background or underline color of F, but
+ record that fact in flags of the face so that we don't try to free
+ these colors. */
+
+unsigned long
+load_color (f, face, name, target_index)
+ struct frame *f;
+ struct face *face;
+ Lisp_Object name;
+ enum lface_attribute_index target_index;
+{
+ XColor color;
+
+ xassert (STRINGP (name));
+ xassert (target_index == LFACE_FOREGROUND_INDEX
+ || target_index == LFACE_BACKGROUND_INDEX
+ || target_index == LFACE_UNDERLINE_INDEX
+ || target_index == LFACE_OVERLINE_INDEX
+ || target_index == LFACE_STRIKE_THROUGH_INDEX
+ || target_index == LFACE_BOX_INDEX);
+
+  /* If the color map is full, defined_color will return a best match
+ to the values in an existing cell. */
+ if (!defined_color (f, XSTRING (name)->data, &color, 1))
+ {
+ add_to_log ("Unable to load color \"%s\"", name, Qnil);
+
+ switch (target_index)
+ {
+ case LFACE_FOREGROUND_INDEX:
+ face->foreground_defaulted_p = 1;
+ color.pixel = FRAME_FOREGROUND_PIXEL (f);
+ break;
+
+ case LFACE_BACKGROUND_INDEX:
+ face->background_defaulted_p = 1;
+ color.pixel = FRAME_BACKGROUND_PIXEL (f);
+ break;
+
+ case LFACE_UNDERLINE_INDEX:
+ face->underline_defaulted_p = 1;
+ color.pixel = FRAME_FOREGROUND_PIXEL (f);
+ break;
+
+ case LFACE_OVERLINE_INDEX:
+ face->overline_color_defaulted_p = 1;
+ color.pixel = FRAME_FOREGROUND_PIXEL (f);
+ break;
+
+ case LFACE_STRIKE_THROUGH_INDEX:
+ face->strike_through_color_defaulted_p = 1;
+ color.pixel = FRAME_FOREGROUND_PIXEL (f);
+ break;
+
+ case LFACE_BOX_INDEX:
+ face->box_color_defaulted_p = 1;
+ color.pixel = FRAME_FOREGROUND_PIXEL (f);
+ break;
+
+ default:
+ abort ();
+ }
+ }
+#if GLYPH_DEBUG
+ else
+ ++ncolors_allocated;
+#endif
+
+ return color.pixel;
+}
+
+
+#ifdef HAVE_WINDOW_SYSTEM
+
+/* Load colors for face FACE which is used on frame F. Colors are
+ specified by slots LFACE_BACKGROUND_INDEX and LFACE_FOREGROUND_INDEX
+ of ATTRS. If the background color specified is not supported on F,
+ try to emulate gray colors with a stipple from Vface_default_stipple. */
+
+static void
+load_face_colors (f, face, attrs)
+ struct frame *f;
+ struct face *face;
+ Lisp_Object *attrs;
+{
+ Lisp_Object fg, bg;
+
+ bg = attrs[LFACE_BACKGROUND_INDEX];
+ fg = attrs[LFACE_FOREGROUND_INDEX];
+
+ /* Swap colors if face is inverse-video. */
+ if (EQ (attrs[LFACE_INVERSE_INDEX], Qt))
+ {
+ Lisp_Object tmp;
+ tmp = fg;
+ fg = bg;
+ bg = tmp;
+ }
+
+ /* Check for support for foreground, not for background because
+ face_color_supported_p is smart enough to know that grays are
+ "supported" as background because we are supposed to use stipple
+ for them. */
+ if (!face_color_supported_p (f, XSTRING (bg)->data, 0)
+ && !NILP (Fbitmap_spec_p (Vface_default_stipple)))
+ {
+ x_destroy_bitmap (f, face->stipple);
+ face->stipple = load_pixmap (f, Vface_default_stipple,
+ &face->pixmap_w, &face->pixmap_h);
+ }
+
+ face->background = load_color (f, face, bg, LFACE_BACKGROUND_INDEX);
+ face->foreground = load_color (f, face, fg, LFACE_FOREGROUND_INDEX);
+}
+
+
+/* Free color PIXEL on frame F. */
+
+void
+unload_color (f, pixel)
+ struct frame *f;
+ unsigned long pixel;
+{
+#ifdef HAVE_X_WINDOWS
+ if (pixel != -1)
+ {
+ BLOCK_INPUT;
+ x_free_colors (f, &pixel, 1);
+ UNBLOCK_INPUT;
+ }
+#endif
+}
+
+
+/* Free colors allocated for FACE. */
+
+static void
+free_face_colors (f, face)
+ struct frame *f;
+ struct face *face;
+{
+#ifdef HAVE_X_WINDOWS
+ if (face->colors_copied_bitwise_p)
+ return;
+
+ BLOCK_INPUT;
+
+ if (!face->foreground_defaulted_p)
+ {
+ x_free_colors (f, &face->foreground, 1);
+ IF_DEBUG (--ncolors_allocated);
+ }
+
+ if (!face->background_defaulted_p)
+ {
+ x_free_colors (f, &face->background, 1);
+ IF_DEBUG (--ncolors_allocated);
+ }
+
+ if (face->underline_p
+ && !face->underline_defaulted_p)
+ {
+ x_free_colors (f, &face->underline_color, 1);
+ IF_DEBUG (--ncolors_allocated);
+ }
+
+ if (face->overline_p
+ && !face->overline_color_defaulted_p)
+ {
+ x_free_colors (f, &face->overline_color, 1);
+ IF_DEBUG (--ncolors_allocated);
+ }
+
+ if (face->strike_through_p
+ && !face->strike_through_color_defaulted_p)
+ {
+ x_free_colors (f, &face->strike_through_color, 1);
+ IF_DEBUG (--ncolors_allocated);
+ }
+
+ if (face->box != FACE_NO_BOX
+ && !face->box_color_defaulted_p)
+ {
+ x_free_colors (f, &face->box_color, 1);
+ IF_DEBUG (--ncolors_allocated);
+ }
+
+ UNBLOCK_INPUT;
+#endif /* HAVE_X_WINDOWS */
+}
+
+#endif /* HAVE_WINDOW_SYSTEM */
+
+
+
+/***********************************************************************
+ XLFD Font Names
+ ***********************************************************************/
+
+/* An enumerator for each field of an XLFD font name. */
+
+enum xlfd_field
+{
+ XLFD_FOUNDRY,
+ XLFD_FAMILY,
+ XLFD_WEIGHT,
+ XLFD_SLANT,
+ XLFD_SWIDTH,
+ XLFD_ADSTYLE,
+ XLFD_PIXEL_SIZE,
+ XLFD_POINT_SIZE,
+ XLFD_RESX,
+ XLFD_RESY,
+ XLFD_SPACING,
+ XLFD_AVGWIDTH,
+ XLFD_REGISTRY,
+ XLFD_ENCODING,
+ XLFD_LAST
+};
+
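+/* Illustrative example (an added comment, not part of the original
+   text): in the XLFD name
+
+       -adobe-courier-medium-r-normal--12-120-75-75-m-70-iso8859-1
+
+   the fields are FOUNDRY=adobe, FAMILY=courier, WEIGHT=medium,
+   SLANT=r, SWIDTH=normal, ADSTYLE empty, PIXEL_SIZE=12,
+   POINT_SIZE=120, RESX=RESY=75, SPACING=m, AVGWIDTH=70,
+   REGISTRY=iso8859, ENCODING=1.  */
+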
+/* An enumerator for each possible slant value of a font. Taken from
+ the XLFD specification. */
+
+enum xlfd_slant
+{
+ XLFD_SLANT_UNKNOWN,
+ XLFD_SLANT_ROMAN,
+ XLFD_SLANT_ITALIC,
+ XLFD_SLANT_OBLIQUE,
+ XLFD_SLANT_REVERSE_ITALIC,
+ XLFD_SLANT_REVERSE_OBLIQUE,
+ XLFD_SLANT_OTHER
+};
+
+/* Relative font weight according to XLFD documentation. */
+
+enum xlfd_weight
+{
+ XLFD_WEIGHT_UNKNOWN,
+ XLFD_WEIGHT_ULTRA_LIGHT, /* 10 */
+ XLFD_WEIGHT_EXTRA_LIGHT, /* 20 */
+ XLFD_WEIGHT_LIGHT, /* 30 */
+ XLFD_WEIGHT_SEMI_LIGHT, /* 40: SemiLight, Book, ... */
+ XLFD_WEIGHT_MEDIUM, /* 50: Medium, Normal, Regular, ... */
+ XLFD_WEIGHT_SEMI_BOLD, /* 60: SemiBold, DemiBold, ... */
+ XLFD_WEIGHT_BOLD, /* 70: Bold, ... */
+ XLFD_WEIGHT_EXTRA_BOLD, /* 80: ExtraBold, Heavy, ... */
+ XLFD_WEIGHT_ULTRA_BOLD /* 90: UltraBold, Black, ... */
+};
+
+/* Relative proportionate width. */
+
+enum xlfd_swidth
+{
+ XLFD_SWIDTH_UNKNOWN,
+ XLFD_SWIDTH_ULTRA_CONDENSED, /* 10 */
+ XLFD_SWIDTH_EXTRA_CONDENSED, /* 20 */
+ XLFD_SWIDTH_CONDENSED, /* 30: Condensed, Narrow, Compressed, ... */
+ XLFD_SWIDTH_SEMI_CONDENSED, /* 40: semicondensed */
+ XLFD_SWIDTH_MEDIUM, /* 50: Medium, Normal, Regular, ... */
+ XLFD_SWIDTH_SEMI_EXPANDED, /* 60: SemiExpanded, DemiExpanded, ... */
+ XLFD_SWIDTH_EXPANDED, /* 70: Expanded... */
+ XLFD_SWIDTH_EXTRA_EXPANDED, /* 80: ExtraExpanded, Wide... */
+ XLFD_SWIDTH_ULTRA_EXPANDED /* 90: UltraExpanded... */
+};
+
+/* Structure used for tables mapping XLFD weight, slant, and width
+ names to numeric and symbolic values. */
+
+struct table_entry
+{
+ char *name;
+ int numeric;
+ Lisp_Object *symbol;
+};
+
+/* Table of XLFD slant names and their numeric and symbolic
+ representations. This table must be sorted by slant names in
+ ascending order. */
+
+static struct table_entry slant_table[] =
+{
+ {"i", XLFD_SLANT_ITALIC, &Qitalic},
+ {"o", XLFD_SLANT_OBLIQUE, &Qoblique},
+ {"ot", XLFD_SLANT_OTHER, &Qitalic},
+ {"r", XLFD_SLANT_ROMAN, &Qnormal},
+ {"ri", XLFD_SLANT_REVERSE_ITALIC, &Qreverse_italic},
+ {"ro", XLFD_SLANT_REVERSE_OBLIQUE, &Qreverse_oblique}
+};
+
+/* Table of XLFD weight names. This table must be sorted by weight
+ names in ascending order. */
+
+static struct table_entry weight_table[] =
+{
+ {"black", XLFD_WEIGHT_ULTRA_BOLD, &Qultra_bold},
+ {"bold", XLFD_WEIGHT_BOLD, &Qbold},
+ {"book", XLFD_WEIGHT_SEMI_LIGHT, &Qsemi_light},
+ {"demi", XLFD_WEIGHT_SEMI_BOLD, &Qsemi_bold},
+ {"demibold", XLFD_WEIGHT_SEMI_BOLD, &Qsemi_bold},
+  {"extrabold", XLFD_WEIGHT_EXTRA_BOLD, &Qextra_bold},
+  {"extralight", XLFD_WEIGHT_EXTRA_LIGHT, &Qextra_light},
+ {"heavy", XLFD_WEIGHT_EXTRA_BOLD, &Qextra_bold},
+ {"light", XLFD_WEIGHT_LIGHT, &Qlight},
+ {"medium", XLFD_WEIGHT_MEDIUM, &Qnormal},
+ {"normal", XLFD_WEIGHT_MEDIUM, &Qnormal},
+ {"regular", XLFD_WEIGHT_MEDIUM, &Qnormal},
+ {"semibold", XLFD_WEIGHT_SEMI_BOLD, &Qsemi_bold},
+ {"semilight", XLFD_WEIGHT_SEMI_LIGHT, &Qsemi_light},
+ {"ultralight", XLFD_WEIGHT_ULTRA_LIGHT, &Qultra_light},
+ {"ultrabold", XLFD_WEIGHT_ULTRA_BOLD, &Qultra_bold}
+};
+
+/* Table of XLFD width names. This table must be sorted by width
+ names in ascending order. */
+
+static struct table_entry swidth_table[] =
+{
+ {"compressed", XLFD_SWIDTH_CONDENSED, &Qcondensed},
+ {"condensed", XLFD_SWIDTH_CONDENSED, &Qcondensed},
+ {"demiexpanded", XLFD_SWIDTH_SEMI_EXPANDED, &Qsemi_expanded},
+ {"expanded", XLFD_SWIDTH_EXPANDED, &Qexpanded},
+ {"extracondensed", XLFD_SWIDTH_EXTRA_CONDENSED, &Qextra_condensed},
+ {"extraexpanded", XLFD_SWIDTH_EXTRA_EXPANDED, &Qextra_expanded},
+ {"medium", XLFD_SWIDTH_MEDIUM, &Qnormal},
+ {"narrow", XLFD_SWIDTH_CONDENSED, &Qcondensed},
+ {"normal", XLFD_SWIDTH_MEDIUM, &Qnormal},
+ {"regular", XLFD_SWIDTH_MEDIUM, &Qnormal},
+ {"semicondensed", XLFD_SWIDTH_SEMI_CONDENSED, &Qsemi_condensed},
+ {"semiexpanded", XLFD_SWIDTH_SEMI_EXPANDED, &Qsemi_expanded},
+ {"ultracondensed", XLFD_SWIDTH_ULTRA_CONDENSED, &Qultra_condensed},
+ {"ultraexpanded", XLFD_SWIDTH_ULTRA_EXPANDED, &Qultra_expanded},
+ {"wide", XLFD_SWIDTH_EXTRA_EXPANDED, &Qextra_expanded}
+};
+
+/* Structure used to hold the result of splitting font names in XLFD
+ format into their fields. */
+
+struct font_name
+{
+ /* The original name which is modified destructively by
+ split_font_name. The pointer is kept here to be able to free it
+ if it was allocated from the heap. */
+ char *name;
+
+ /* Font name fields. Each vector element points into `name' above.
+ Fields are NUL-terminated. */
+ char *fields[XLFD_LAST];
+
+ /* Numeric values for those fields that interest us. See
+ split_font_name for which these are. */
+ int numeric[XLFD_LAST];
+
+  /* Lower values mean higher priority. */
+ int registry_priority;
+};
+
+/* The frame in effect when sorting font names. Set temporarily in
+ sort_fonts so that it is available in font comparison functions. */
+
+static struct frame *font_frame;
+
+/* Order by which font selection chooses fonts. The default values
+ mean `first, find a best match for the font width, then for the
+ font height, then for weight, then for slant.' This variable can be
+ set via set-face-font-sort-order. */
+
+#ifdef macintosh
+static int font_sort_order[4] = {
+ XLFD_SWIDTH, XLFD_POINT_SIZE, XLFD_WEIGHT, XLFD_SLANT
+};
+#else
+static int font_sort_order[4];
+#endif
+
+/* Look up FONT.fields[FIELD_INDEX] in TABLE which has DIM entries.
+   TABLE must be sorted by TABLE[i].name in ascending order.  Value
+ is a pointer to the matching table entry or null if no table entry
+ matches. */
+
+static struct table_entry *
+xlfd_lookup_field_contents (table, dim, font, field_index)
+ struct table_entry *table;
+ int dim;
+ struct font_name *font;
+ int field_index;
+{
+ /* Function split_font_name converts fields to lower-case, so there
+ is no need to use xstrlwr or xstricmp here. */
+ char *s = font->fields[field_index];
+ int low, mid, high, cmp;
+
+ low = 0;
+ high = dim - 1;
+
+ while (low <= high)
+ {
+ mid = (low + high) / 2;
+ cmp = strcmp (table[mid].name, s);
+
+ if (cmp < 0)
+ low = mid + 1;
+ else if (cmp > 0)
+ high = mid - 1;
+ else
+ return table + mid;
+ }
+
+ return NULL;
+}
+
+
+/* Return a numeric representation for font name field
+ FONT.fields[FIELD_INDEX]. The field is looked up in TABLE which
+ has DIM entries. Value is the numeric value found or DFLT if no
+ table entry matches. This function is used to translate weight,
+ slant, and swidth names of XLFD font names to numeric values. */
+
+static INLINE int
+xlfd_numeric_value (table, dim, font, field_index, dflt)
+ struct table_entry *table;
+ int dim;
+ struct font_name *font;
+ int field_index;
+ int dflt;
+{
+ struct table_entry *p;
+ p = xlfd_lookup_field_contents (table, dim, font, field_index);
+ return p ? p->numeric : dflt;
+}
+
+
+/* Return a symbolic representation for font name field
+ FONT.fields[FIELD_INDEX]. The field is looked up in TABLE which
+ has DIM entries. Value is the symbolic value found or DFLT if no
+ table entry matches. This function is used to translate weight,
+ slant, and swidth names of XLFD font names to symbols. */
+
+static INLINE Lisp_Object
+xlfd_symbolic_value (table, dim, font, field_index, dflt)
+ struct table_entry *table;
+ int dim;
+ struct font_name *font;
+ int field_index;
+ Lisp_Object dflt;
+{
+ struct table_entry *p;
+ p = xlfd_lookup_field_contents (table, dim, font, field_index);
+ return p ? *p->symbol : dflt;
+}
+
+
+/* Return a numeric value for the slant of the font given by FONT. */
+
+static INLINE int
+xlfd_numeric_slant (font)
+ struct font_name *font;
+{
+ return xlfd_numeric_value (slant_table, DIM (slant_table),
+ font, XLFD_SLANT, XLFD_SLANT_ROMAN);
+}
+
+
+/* Return a symbol representing the slant of the font given by FONT. */
+
+static INLINE Lisp_Object
+xlfd_symbolic_slant (font)
+ struct font_name *font;
+{
+ return xlfd_symbolic_value (slant_table, DIM (slant_table),
+ font, XLFD_SLANT, Qnormal);
+}
+
+
+/* Return a numeric value for the weight of the font given by FONT. */
+
+static INLINE int
+xlfd_numeric_weight (font)
+ struct font_name *font;
+{
+ return xlfd_numeric_value (weight_table, DIM (weight_table),
+ font, XLFD_WEIGHT, XLFD_WEIGHT_MEDIUM);
+}
+
+
+/* Return a symbol representing the weight of the font given by FONT. */
+
+static INLINE Lisp_Object
+xlfd_symbolic_weight (font)
+ struct font_name *font;
+{
+ return xlfd_symbolic_value (weight_table, DIM (weight_table),
+ font, XLFD_WEIGHT, Qnormal);
+}
+
+
+/* Return a numeric value for the swidth of the font whose XLFD font
+ name fields are found in FONT. */
+
+static INLINE int
+xlfd_numeric_swidth (font)
+ struct font_name *font;
+{
+ return xlfd_numeric_value (swidth_table, DIM (swidth_table),
+ font, XLFD_SWIDTH, XLFD_SWIDTH_MEDIUM);
+}
+
+
+/* Return a symbolic value for the swidth of FONT. */
+
+static INLINE Lisp_Object
+xlfd_symbolic_swidth (font)
+ struct font_name *font;
+{
+ return xlfd_symbolic_value (swidth_table, DIM (swidth_table),
+ font, XLFD_SWIDTH, Qnormal);
+}
+
+
+/* Look up the entry of SYMBOL in the vector TABLE which has DIM
+ entries. Value is a pointer to the matching table entry or null if
+ no element of TABLE contains SYMBOL. */
+
+static struct table_entry *
+face_value (table, dim, symbol)
+ struct table_entry *table;
+ int dim;
+ Lisp_Object symbol;
+{
+ int i;
+
+ xassert (SYMBOLP (symbol));
+
+ for (i = 0; i < dim; ++i)
+ if (EQ (*table[i].symbol, symbol))
+ break;
+
+ return i < dim ? table + i : NULL;
+}
+
+
+/* Return a numeric value for SYMBOL in the vector TABLE which has DIM
+ entries. Value is -1 if SYMBOL is not found in TABLE. */
+
+static INLINE int
+face_numeric_value (table, dim, symbol)
+ struct table_entry *table;
+ int dim;
+ Lisp_Object symbol;
+{
+ struct table_entry *p = face_value (table, dim, symbol);
+ return p ? p->numeric : -1;
+}
+
+
+/* Return a numeric value representing the weight specified by Lisp
+ symbol WEIGHT. Value is one of the enumerators of enum
+ xlfd_weight. */
+
+static INLINE int
+face_numeric_weight (weight)
+ Lisp_Object weight;
+{
+ return face_numeric_value (weight_table, DIM (weight_table), weight);
+}
+
+
+/* Return a numeric value representing the slant specified by Lisp
+ symbol SLANT. Value is one of the enumerators of enum xlfd_slant. */
+
+static INLINE int
+face_numeric_slant (slant)
+ Lisp_Object slant;
+{
+ return face_numeric_value (slant_table, DIM (slant_table), slant);
+}
+
+
+/* Return a numeric value representing the swidth specified by Lisp
+ symbol WIDTH. Value is one of the enumerators of enum xlfd_swidth. */
+
+static int
+face_numeric_swidth (width)
+ Lisp_Object width;
+{
+ return face_numeric_value (swidth_table, DIM (swidth_table), width);
+}
+
+
+#ifdef HAVE_WINDOW_SYSTEM
+
+/* Return non-zero if FONT is the name of a fixed-pitch font. */
+
+static INLINE int
+xlfd_fixed_p (font)
+ struct font_name *font;
+{
+ /* Function split_font_name converts fields to lower-case, so there
+ is no need to use tolower here. */
+ return *font->fields[XLFD_SPACING] != 'p';
+}
+
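+/* Example: in XLFD the spacing field is "m" (monospaced), "c"
+   (character cell) or "p" (proportional), so a spacing of "m" or "c"
+   makes xlfd_fixed_p return non-zero, while "p" yields zero. */
+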
+
+/* Return the point size of FONT on frame F, measured in 1/10 pt.
+
+ The actual height of the font when displayed on F depends on the
+ resolution of both the font and frame. For example, a 10pt font
+ designed for a 100dpi display will display larger than 10pt on a
+ 75dpi display. (It's not unusual to use fonts not designed for the
+   display one is using.  For example, some intlfonts are available
+   only in 72dpi versions.)
+
+ Value is the real point size of FONT on frame F, or 0 if it cannot
+ be determined. */
+
+static INLINE int
+xlfd_point_size (f, font)
+ struct frame *f;
+ struct font_name *font;
+{
+ double resy = FRAME_X_DISPLAY_INFO (f)->resy;
+ char *pixel_field = font->fields[XLFD_PIXEL_SIZE];
+ double pixel;
+ int real_pt;
+
+ if (*pixel_field == '[')
+ {
+ /* The pixel size field is `[A B C D]' which specifies
+ a transformation matrix.
+
+ A B 0
+ C D 0
+ 0 0 1
+
+ by which all glyphs of the font are transformed. The spec
+	 says that a scalar value N for the pixel size is equivalent
+ to A = N * resx/resy, B = C = 0, D = N. */
+ char *start = pixel_field + 1, *end;
+ double matrix[4];
+ int i;
+
+ for (i = 0; i < 4; ++i)
+ {
+ matrix[i] = strtod (start, &end);
+ start = end;
+ }
+
+ pixel = matrix[3];
+ }
+ else
+ pixel = atoi (pixel_field);
+
+ if (pixel == 0)
+ real_pt = 0;
+ else
+ real_pt = PT_PER_INCH * 10.0 * pixel / resy + 0.5;
+
+ return real_pt;
+}
+
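+/* Worked example, assuming PT_PER_INCH is the usual 72.27: a pixel
+   size field of "12" on a frame whose resy is 75 dpi gives
+   72.27 * 10.0 * 12 / 75 + 0.5 = 116.1, so the function returns 116,
+   i.e. roughly 11.6pt.  The numbers are illustrative only; the
+   actual result depends on the frame's resolution. */
+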
+
+/* Return the point size of PIXEL dots, taking into account the
+   Y-resolution (DPI) of frame F.  This function is used to guess the
+   point size of a font when only its pixel height is available. */
+
+static INLINE int
+pixel_point_size (f, pixel)
+ struct frame *f;
+ int pixel;
+{
+ double resy = FRAME_X_DISPLAY_INFO (f)->resy;
+ double real_pt;
+ int int_pt;
+
+ /* As one inch is PT_PER_INCH points, PT_PER_INCH/RESY gives the
+ point size of one dot. */
+ real_pt = pixel * PT_PER_INCH / resy;
+ int_pt = real_pt + 0.5;
+
+ return int_pt;
+}
+
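+/* Worked example, again assuming PT_PER_INCH is 72.27: a font 14
+   pixels high on a 96 dpi frame gives 14 * 72.27 / 96 = 10.5, which
+   is rounded to 11 points. */
+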
+
+/* Split XLFD font name FONT->name destructively into NUL-terminated,
+ lower-case fields in FONT->fields. NUMERIC_P non-zero means
+   compute numeric values for fields XLFD_POINT_SIZE, XLFD_SWIDTH,
+   XLFD_RESY, XLFD_SLANT, XLFD_WEIGHT, and XLFD_AVGWIDTH in
+   FONT->numeric.  Value is
+ zero if the font name doesn't have the format we expect. The
+ expected format is a font name that starts with a `-' and has
+ XLFD_LAST fields separated by `-'. */
+
+static int
+split_font_name (f, font, numeric_p)
+ struct frame *f;
+ struct font_name *font;
+ int numeric_p;
+{
+ int i = 0;
+ int success_p;
+
+ if (*font->name == '-')
+ {
+ char *p = xstrlwr (font->name) + 1;
+
+ while (i < XLFD_LAST)
+ {
+ font->fields[i] = p;
+ ++i;
+
+ /* Pixel and point size may be of the form `[....]'. For
+ BNF, see XLFD spec, chapter 4. Negative values are
+ indicated by tilde characters which we replace with
+ `-' characters, here. */
+ if (*p == '['
+ && (i - 1 == XLFD_PIXEL_SIZE
+ || i - 1 == XLFD_POINT_SIZE))
+ {
+ char *start, *end;
+ int j;
+
+ for (++p; *p && *p != ']'; ++p)
+ if (*p == '~')
+ *p = '-';
+
+ /* Check that the matrix contains 4 floating point
+ numbers. */
+ for (j = 0, start = font->fields[i - 1] + 1;
+ j < 4;
+ ++j, start = end)
+ if (strtod (start, &end) == 0 && start == end)
+ break;
+
+ if (j < 4)
+ break;
+ }
+
+ while (*p && *p != '-')
+ ++p;
+
+ if (*p != '-')
+ break;
+
+ *p++ = 0;
+ }
+ }
+
+ success_p = i == XLFD_LAST;
+
+ /* If requested, and font name was in the expected format,
+ compute numeric values for some fields. */
+ if (numeric_p && success_p)
+ {
+ font->numeric[XLFD_POINT_SIZE] = xlfd_point_size (f, font);
+ font->numeric[XLFD_RESY] = atoi (font->fields[XLFD_RESY]);
+ font->numeric[XLFD_SLANT] = xlfd_numeric_slant (font);
+ font->numeric[XLFD_WEIGHT] = xlfd_numeric_weight (font);
+ font->numeric[XLFD_SWIDTH] = xlfd_numeric_swidth (font);
+ font->numeric[XLFD_AVGWIDTH] = atoi (font->fields[XLFD_AVGWIDTH]);
+ }
+
+  /* Initialize the registry priority to zero.  It will be overridden
+     by font_list while trying alternate registries. */
+ font->registry_priority = 0;
+
+ return success_p;
+}
+
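+/* Usage sketch: with FONT->name set to a copy of the example XLFD
+   name shown above the field enumerators, a successful call leaves,
+   e.g., FONT->fields[XLFD_FAMILY] pointing at the string "courier"
+   and, when NUMERIC_P is non-zero, FONT->numeric[XLFD_WEIGHT] equal
+   to XLFD_WEIGHT_MEDIUM.  The name is modified in place, so callers
+   that need the original string keep their own copy
+   (x_face_list_fonts below uses xstrdup). */
+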
+
+/* Build an XLFD font name from font name fields in FONT. Value is a
+ pointer to the font name, which is allocated via xmalloc. */
+
+static char *
+build_font_name (font)
+ struct font_name *font;
+{
+ int i;
+ int size = 100;
+ char *font_name = (char *) xmalloc (size);
+ int total_length = 0;
+
+ for (i = 0; i < XLFD_LAST; ++i)
+ {
+ /* Add 1 because of the leading `-'. */
+ int len = strlen (font->fields[i]) + 1;
+
+ /* Reallocate font_name if necessary. Add 1 for the final
+ NUL-byte. */
+ if (total_length + len + 1 >= size)
+ {
+ int new_size = max (2 * size, size + len + 1);
+ int sz = new_size * sizeof *font_name;
+ font_name = (char *) xrealloc (font_name, sz);
+ size = new_size;
+ }
+
+ font_name[total_length] = '-';
+ bcopy (font->fields[i], font_name + total_length + 1, len - 1);
+ total_length += len;
+ }
+
+ font_name[total_length] = 0;
+ return font_name;
+}
+
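+/* Usage sketch (hypothetical field values): build_font_name is the
+   inverse of split_font_name, so after something like
+
+     font->fields[XLFD_PIXEL_SIZE] = "14";
+     name = build_font_name (font);
+     ...
+     xfree (name);
+
+   NAME is a freshly xmalloc'd "-foundry-family-..." string which the
+   caller must free; the field strings themselves are neither copied
+   nor freed here. */
+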
+
+/* Free an array FONTS of N font_name structures. This frees FONTS
+ itself and all `name' fields in its elements. */
+
+static INLINE void
+free_font_names (fonts, n)
+ struct font_name *fonts;
+ int n;
+{
+ while (n)
+ xfree (fonts[--n].name);
+ xfree (fonts);
+}
+
+
+/* Sort vector FONTS of font_name structures which contains NFONTS
+ elements using qsort and comparison function CMPFN. F is the frame
+ on which the fonts will be used. The global variable font_frame
+ is temporarily set to F to make it available in CMPFN. */
+
+static INLINE void
+sort_fonts (f, fonts, nfonts, cmpfn)
+ struct frame *f;
+ struct font_name *fonts;
+ int nfonts;
+ int (*cmpfn) P_ ((const void *, const void *));
+{
+ font_frame = f;
+ qsort (fonts, nfonts, sizeof *fonts, cmpfn);
+ font_frame = NULL;
+}
+
+
+/* Get fonts matching PATTERN on frame F. If F is null, use the first
+ display in x_display_list. FONTS is a pointer to a vector of
+ NFONTS font_name structures. TRY_ALTERNATIVES_P non-zero means try
+ alternative patterns from Valternate_fontname_alist if no fonts are
+ found matching PATTERN.
+
+ For all fonts found, set FONTS[i].name to the name of the font,
+ allocated via xmalloc, and split font names into fields. Ignore
+ fonts that we can't parse. Value is the number of fonts found. */
+
+static int
+x_face_list_fonts (f, pattern, fonts, nfonts, try_alternatives_p)
+ struct frame *f;
+ char *pattern;
+ struct font_name *fonts;
+ int nfonts, try_alternatives_p;
+{
+ int n, nignored;
+
+ /* NTEMACS_TODO : currently this uses w32_list_fonts, but it may be
+ better to do it the other way around. */
+ Lisp_Object lfonts;
+ Lisp_Object lpattern, tem;
+
+ lpattern = build_string (pattern);
+
+ /* Get the list of fonts matching PATTERN. */
+#ifdef WINDOWSNT
+ BLOCK_INPUT;
+ lfonts = w32_list_fonts (f, lpattern, 0, nfonts);
+ UNBLOCK_INPUT;
+#else
+ lfonts = x_list_fonts (f, lpattern, -1, nfonts);
+#endif
+
+ /* Make a copy of the font names we got from X, and
+ split them into fields. */
+ n = nignored = 0;
+ for (tem = lfonts; CONSP (tem) && n < nfonts; tem = XCDR (tem))
+ {
+ Lisp_Object elt, tail;
+ char *name = XSTRING (XCAR (tem))->data;
+
+ /* Ignore fonts matching a pattern from face-ignored-fonts. */
+ for (tail = Vface_ignored_fonts; CONSP (tail); tail = XCDR (tail))
+ {
+ elt = XCAR (tail);
+ if (STRINGP (elt)
+ && fast_c_string_match_ignore_case (elt, name) >= 0)
+ break;
+ }
+ if (!NILP (tail))
+ {
+ ++nignored;
+ continue;
+ }
+
+ /* Make a copy of the font name. */
+ fonts[n].name = xstrdup (name);
+
+ if (split_font_name (f, fonts + n, 1))
+ {
+ if (font_scalable_p (fonts + n)
+ && !may_use_scalable_font_p (name))
+ {
+ ++nignored;
+ xfree (fonts[n].name);
+ }
+ else
+ ++n;
+ }
+ else
+ xfree (fonts[n].name);
+ }
+
+ /* If no fonts found, try patterns from Valternate_fontname_alist. */
+ if (n == 0 && try_alternatives_p)
+ {
+ Lisp_Object list = Valternate_fontname_alist;
+
+ while (CONSP (list))
+ {
+ Lisp_Object entry = XCAR (list);
+ if (CONSP (entry)
+ && STRINGP (XCAR (entry))
+ && strcmp (XSTRING (XCAR (entry))->data, pattern) == 0)
+ break;
+ list = XCDR (list);
+ }
+
+ if (CONSP (list))
+ {
+ Lisp_Object patterns = XCAR (list);
+ Lisp_Object name;
+
+ while (CONSP (patterns)
+ /* If list is screwed up, give up. */
+ && (name = XCAR (patterns),
+ STRINGP (name))
+ /* Ignore patterns equal to PATTERN because we tried that
+ already with no success. */
+ && (strcmp (XSTRING (name)->data, pattern) == 0
+ || (n = x_face_list_fonts (f, XSTRING (name)->data,
+ fonts, nfonts, 0),
+ n == 0)))
+ patterns = XCDR (patterns);
+ }
+ }
+
+ return n;
+}
+
+
+/* Determine fonts matching PATTERN on frame F. Sort resulting fonts
+ using comparison function CMPFN. Value is the number of fonts
+ found. If value is non-zero, *FONTS is set to a vector of
+ font_name structures allocated from the heap containing matching
+ fonts. Each element of *FONTS contains a name member that is also
+ allocated from the heap. Font names in these structures are split
+ into fields. Use free_font_names to free such an array. */
+
+static int
+sorted_font_list (f, pattern, cmpfn, fonts)
+ struct frame *f;
+ char *pattern;
+ int (*cmpfn) P_ ((const void *, const void *));
+ struct font_name **fonts;
+{
+ int nfonts;
+
+  /* Get the list of fonts matching PATTERN.  The number of fonts
+     considered is limited by font-list-limit, or
+     DEFAULT_FONT_LIST_LIMIT if that is not a positive integer. */
+ nfonts = DEFAULT_FONT_LIST_LIMIT;
+ if (INTEGERP (Vfont_list_limit) && XINT (Vfont_list_limit) > 0)
+ nfonts = XFASTINT (Vfont_list_limit);
+
+ *fonts = (struct font_name *) xmalloc (nfonts * sizeof **fonts);
+ nfonts = x_face_list_fonts (f, pattern, *fonts, nfonts, 1);
+
+ /* Sort the resulting array and return it in *FONTS. If no
+ fonts were found, make sure to set *FONTS to null. */
+ if (nfonts)
+ sort_fonts (f, *fonts, nfonts, cmpfn);
+ else
+ {
+ xfree (*fonts);
+ *fonts = NULL;
+ }
+
+ return nfonts;
+}
+
+
+/* Compare two font_name structures *A and *B. Value is analogous to
+ strcmp. Sort order is given by the global variable
+ font_sort_order. Font names are sorted so that, everything else
+ being equal, fonts with a resolution closer to that of the frame on
+ which they are used are listed first. The global variable
+ font_frame is the frame on which we operate. */
+
+static int
+cmp_font_names (a, b)
+ const void *a, *b;
+{
+ struct font_name *x = (struct font_name *) a;
+ struct font_name *y = (struct font_name *) b;
+ int cmp;
+
+ /* All strings have been converted to lower-case by split_font_name,
+ so we can use strcmp here. */
+ cmp = strcmp (x->fields[XLFD_FAMILY], y->fields[XLFD_FAMILY]);
+ if (cmp == 0)
+ {
+ int i;
+
+ for (i = 0; i < DIM (font_sort_order) && cmp == 0; ++i)
+ {
+ int j = font_sort_order[i];
+ cmp = x->numeric[j] - y->numeric[j];
+ }
+
+ if (cmp == 0)
+ {
+	  /* Everything else being equal, we prefer fonts with a
+	     y-resolution closer to that of the frame. */
+ int resy = FRAME_X_DISPLAY_INFO (font_frame)->resy;
+ int x_resy = x->numeric[XLFD_RESY];
+ int y_resy = y->numeric[XLFD_RESY];
+ cmp = abs (resy - x_resy) - abs (resy - y_resy);
+ }
+ }
+
+ return cmp;
+}
+
+
+/* Get a sorted list of fonts of family FAMILY on frame F. If PATTERN
+   is non-nil, list fonts matching that pattern.  Otherwise, if
+ REGISTRY is non-nil return only fonts with that registry, otherwise
+ return fonts of any registry. Set *FONTS to a vector of font_name
+ structures allocated from the heap containing the fonts found.
+ Value is the number of fonts found. */
+
+static int
+font_list_1 (f, pattern, family, registry, fonts)
+ struct frame *f;
+ Lisp_Object pattern, family, registry;
+ struct font_name **fonts;
+{
+ char *pattern_str, *family_str, *registry_str;
+
+ if (NILP (pattern))
+ {
+ family_str = (NILP (family) ? "*" : (char *) XSTRING (family)->data);
+ registry_str = (NILP (registry) ? "*" : (char *) XSTRING (registry)->data);
+
+ pattern_str = (char *) alloca (strlen (family_str)
+ + strlen (registry_str)
+ + 10);
+ strcpy (pattern_str, index (family_str, '-') ? "-" : "-*-");
+ strcat (pattern_str, family_str);
+ strcat (pattern_str, "-*-");
+ strcat (pattern_str, registry_str);
+ if (!index (registry_str, '-'))
+ {
+ if (registry_str[strlen (registry_str) - 1] == '*')
+ strcat (pattern_str, "-*");
+ else
+ strcat (pattern_str, "*-*");
+ }
+ }
+ else
+ pattern_str = (char *) XSTRING (pattern)->data;
+
+ return sorted_font_list (f, pattern_str, cmp_font_names, fonts);
+}
+
+
+/* Concatenate font list FONTS1 and FONTS2. FONTS1 and FONTS2
+ contains NFONTS1 fonts and NFONTS2 fonts respectively. Return a
+ pointer to a newly allocated font list. FONTS1 and FONTS2 are
+ freed. */
+
+static struct font_name *
+concat_font_list (fonts1, nfonts1, fonts2, nfonts2)
+ struct font_name *fonts1, *fonts2;
+ int nfonts1, nfonts2;
+{
+ int new_nfonts = nfonts1 + nfonts2;
+ struct font_name *new_fonts;
+
+ new_fonts = (struct font_name *) xmalloc (sizeof *new_fonts * new_nfonts);
+ bcopy (fonts1, new_fonts, sizeof *new_fonts * nfonts1);
+ bcopy (fonts2, new_fonts + nfonts1, sizeof *new_fonts * nfonts2);
+ xfree (fonts1);
+ xfree (fonts2);
+ return new_fonts;
+}
+
+
+/* Get a sorted list of fonts of family FAMILY on frame F.
+
+   If PATTERN is non-nil, list fonts matching that pattern.
+
+ If REGISTRY is non-nil, return fonts with that registry and the
+ alternative registries from Vface_alternative_font_registry_alist.
+
+ If REGISTRY is nil return fonts of any registry.
+
+ Set *FONTS to a vector of font_name structures allocated from the
+ heap containing the fonts found. Value is the number of fonts
+ found. */
+
+static int
+font_list (f, pattern, family, registry, fonts)
+ struct frame *f;
+ Lisp_Object pattern, family, registry;
+ struct font_name **fonts;
+{
+ int nfonts = font_list_1 (f, pattern, family, registry, fonts);
+
+ if (!NILP (registry)
+ && CONSP (Vface_alternative_font_registry_alist))
+ {
+ Lisp_Object alter;
+
+ alter = Fassoc (registry, Vface_alternative_font_registry_alist);
+ if (CONSP (alter))
+ {
+ int reg_prio, i;
+
+ for (alter = XCDR (alter), reg_prio = 1;
+ CONSP (alter);
+ alter = XCDR (alter), reg_prio++)
+ if (STRINGP (XCAR (alter)))
+ {
+ int nfonts2;
+ struct font_name *fonts2;
+
+ nfonts2 = font_list_1 (f, pattern, family, XCAR (alter),
+ &fonts2);
+ for (i = 0; i < nfonts2; i++)
+ fonts2[i].registry_priority = reg_prio;
+ *fonts = (nfonts > 0
+ ? concat_font_list (*fonts, nfonts, fonts2, nfonts2)
+ : fonts2);
+ nfonts += nfonts2;
+ }
+ }
+ }
+
+ return nfonts;
+}
+
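+/* Illustration, with a purely hypothetical alist entry
+   ("iso8859-1" "iso10646-1" "iso8859-15") in
+   Vface_alternative_font_registry_alist: a call with REGISTRY
+   "iso8859-1" first lists the iso8859-1 fonts (registry_priority 0),
+   then appends iso10646-1 fonts with priority 1 and iso8859-15 fonts
+   with priority 2; per struct font_name, lower values mean higher
+   priority. */
+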
+
+/* Remove elements from LIST whose cars are `equal'. Called from
+ x-family-fonts and x-font-family-list to remove duplicate font
+ entries. */
+
+static void
+remove_duplicates (list)
+ Lisp_Object list;
+{
+ Lisp_Object tail = list;
+
+ while (!NILP (tail) && !NILP (XCDR (tail)))
+ {
+ Lisp_Object next = XCDR (tail);
+ if (!NILP (Fequal (XCAR (next), XCAR (tail))))
+ XCDR (tail) = XCDR (next);
+ else
+ tail = XCDR (tail);
+ }
+}
+
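+/* Note that only adjacent `equal' elements are removed; the callers
+   below build RESULT from the sorted font array, so equal entries
+   normally end up next to each other. */
+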
+
+DEFUN ("x-family-fonts", Fx_family_fonts, Sx_family_fonts, 0, 2, 0,
+ "Return a list of available fonts of family FAMILY on FRAME.\n\
+If FAMILY is omitted or nil, list all families.\n\
+Otherwise, FAMILY must be a string, possibly containing wildcards\n\
+`?' and `*'.\n\
+If FRAME is omitted or nil, use the selected frame.\n\
+Each element of the result is a vector [FAMILY WIDTH POINT-SIZE WEIGHT\n\
+SLANT FIXED-P FULL REGISTRY-AND-ENCODING].\n\
+FAMILY is the font family name. POINT-SIZE is the size of the\n\
+font in 1/10 pt. WIDTH, WEIGHT, and SLANT are symbols describing the\n\
+width, weight and slant of the font. These symbols are the same as for\n\
+face attributes. FIXED-P is non-nil if the font is fixed-pitch.\n\
+FULL is the full name of the font, and REGISTRY-AND-ENCODING is a string\n\
+giving the registry and encoding of the font.\n\
+The result list is sorted according to the current setting of\n\
+the face font sort order.")
+ (family, frame)
+ Lisp_Object family, frame;
+{
+ struct frame *f = check_x_frame (frame);
+ struct font_name *fonts;
+ int i, nfonts;
+ Lisp_Object result;
+ struct gcpro gcpro1;
+
+ if (!NILP (family))
+ CHECK_STRING (family, 1);
+
+ result = Qnil;
+ GCPRO1 (result);
+ nfonts = font_list (f, Qnil, family, Qnil, &fonts);
+ for (i = nfonts - 1; i >= 0; --i)
+ {
+ Lisp_Object v = Fmake_vector (make_number (8), Qnil);
+ char *tem;
+
+ ASET (v, 0, build_string (fonts[i].fields[XLFD_FAMILY]));
+ ASET (v, 1, xlfd_symbolic_swidth (fonts + i));
+ ASET (v, 2, make_number (xlfd_point_size (f, fonts + i)));
+ ASET (v, 3, xlfd_symbolic_weight (fonts + i));
+ ASET (v, 4, xlfd_symbolic_slant (fonts + i));
+ ASET (v, 5, xlfd_fixed_p (fonts + i) ? Qt : Qnil);
+ tem = build_font_name (fonts + i);
+ ASET (v, 6, build_string (tem));
+ sprintf (tem, "%s-%s", fonts[i].fields[XLFD_REGISTRY],
+ fonts[i].fields[XLFD_ENCODING]);
+ ASET (v, 7, build_string (tem));
+ xfree (tem);
+
+ result = Fcons (v, result);
+ }
+
+ remove_duplicates (result);
+ free_font_names (fonts, nfonts);
+ UNGCPRO;
+ return result;
+}
+
+
+DEFUN ("x-font-family-list", Fx_font_family_list, Sx_font_family_list,
+ 0, 1, 0,
+ "Return a list of available font families on FRAME.\n\
+If FRAME is omitted or nil, use the selected frame.\n\
+Value is a list of conses (FAMILY . FIXED-P) where FAMILY\n\
+is a font family, and FIXED-P is non-nil if fonts of that family\n\
+are fixed-pitch.")
+ (frame)
+ Lisp_Object frame;
+{
+ struct frame *f = check_x_frame (frame);
+ int nfonts, i;
+ struct font_name *fonts;
+ Lisp_Object result;
+ struct gcpro gcpro1;
+ int count = specpdl_ptr - specpdl;
+ int limit;
+
+ /* Let's consider all fonts. Increase the limit for matching
+ fonts until we have them all. */
+ for (limit = 500;;)
+ {
+ specbind (intern ("font-list-limit"), make_number (limit));
+ nfonts = font_list (f, Qnil, Qnil, Qnil, &fonts);
+
+ if (nfonts == limit)
+ {
+ free_font_names (fonts, nfonts);
+ limit *= 2;
+ }
+ else
+ break;
+ }
+
+ result = Qnil;
+ GCPRO1 (result);
+ for (i = nfonts - 1; i >= 0; --i)
+ result = Fcons (Fcons (build_string (fonts[i].fields[XLFD_FAMILY]),
+ xlfd_fixed_p (fonts + i) ? Qt : Qnil),
+ result);
+
+ remove_duplicates (result);
+ free_font_names (fonts, nfonts);
+ UNGCPRO;
+ return unbind_to (count, result);
+}
+
+
+DEFUN ("x-list-fonts", Fx_list_fonts, Sx_list_fonts, 1, 5, 0,
+ "Return a list of the names of available fonts matching PATTERN.\n\
+If optional arguments FACE and FRAME are specified, return only fonts\n\
+the same size as FACE on FRAME.\n\
+PATTERN is a string, perhaps with wildcard characters;\n\
+ the * character matches any substring, and\n\
+ the ? character matches any single character.\n\
+ PATTERN is case-insensitive.\n\
+FACE is a face name--a symbol.\n\
+\n\
+The return value is a list of strings, suitable as arguments to\n\
+set-face-font.\n\
+\n\
+Fonts Emacs can't use may or may not be excluded\n\
+even if they match PATTERN and FACE.\n\
+The optional fourth argument MAXIMUM sets a limit on how many\n\
+fonts to match. The first MAXIMUM fonts are reported.\n\
+The optional fifth argument WIDTH, if specified, is a number of columns\n\
+occupied by a character of a font.  In that case, return only fonts\n\
+that are WIDTH times as wide as FACE on FRAME.")
+ (pattern, face, frame, maximum, width)
+ Lisp_Object pattern, face, frame, maximum, width;
+{
+ struct frame *f;
+ int size;
+ int maxnames;
+
+ check_x ();
+ CHECK_STRING (pattern, 0);
+
+ if (NILP (maximum))
+ maxnames = 2000;
+ else
+ {
+ CHECK_NATNUM (maximum, 0);
+ maxnames = XINT (maximum);
+ }
+
+ if (!NILP (width))
+ CHECK_NUMBER (width, 4);
+
+ /* We can't simply call check_x_frame because this function may be
+ called before any frame is created. */
+ f = frame_or_selected_frame (frame, 2);
+ if (!FRAME_WINDOW_P (f))
+ {
+ /* Perhaps we have not yet created any frame. */
+ f = NULL;
+ face = Qnil;
+ }
+
+ /* Determine the width standard for comparison with the fonts we find. */
+
+ if (NILP (face))
+ size = 0;
+ else
+ {
+ /* This is of limited utility since it works with character
+ widths. Keep it for compatibility. --gerd. */
+ int face_id = lookup_named_face (f, face, 0);
+ struct face *face = (face_id < 0
+ ? NULL
+ : FACE_FROM_ID (f, face_id));
+
+ if (face && face->font)
+ size = FONT_WIDTH (face->font);
+ else
+ size = FONT_WIDTH (FRAME_FONT (f));
+
+ if (!NILP (width))
+ size *= XINT (width);
+ }
+
+ {
+ Lisp_Object args[2];
+
+ args[0] = x_list_fonts (f, pattern, size, maxnames);
+ if (f == NULL)
+ /* We don't have to check fontsets. */
+ return args[0];
+ args[1] = list_fontsets (f, pattern, size);
+ return Fnconc (2, args);
+ }
+}
+
+#endif /* HAVE_WINDOW_SYSTEM */
+
+
+
+/***********************************************************************
+ Lisp Faces
+ ***********************************************************************/
+
+/* Access face attributes of face LFACE, a Lisp vector. */
+
+#define LFACE_FAMILY(LFACE) AREF ((LFACE), LFACE_FAMILY_INDEX)
+#define LFACE_HEIGHT(LFACE) AREF ((LFACE), LFACE_HEIGHT_INDEX)
+#define LFACE_WEIGHT(LFACE) AREF ((LFACE), LFACE_WEIGHT_INDEX)
+#define LFACE_SLANT(LFACE) AREF ((LFACE), LFACE_SLANT_INDEX)
+#define LFACE_UNDERLINE(LFACE) AREF ((LFACE), LFACE_UNDERLINE_INDEX)
+#define LFACE_INVERSE(LFACE) AREF ((LFACE), LFACE_INVERSE_INDEX)
+#define LFACE_FOREGROUND(LFACE) AREF ((LFACE), LFACE_FOREGROUND_INDEX)
+#define LFACE_BACKGROUND(LFACE) AREF ((LFACE), LFACE_BACKGROUND_INDEX)
+#define LFACE_STIPPLE(LFACE) AREF ((LFACE), LFACE_STIPPLE_INDEX)
+#define LFACE_SWIDTH(LFACE) AREF ((LFACE), LFACE_SWIDTH_INDEX)
+#define LFACE_OVERLINE(LFACE) AREF ((LFACE), LFACE_OVERLINE_INDEX)
+#define LFACE_STRIKE_THROUGH(LFACE) AREF ((LFACE), LFACE_STRIKE_THROUGH_INDEX)
+#define LFACE_BOX(LFACE) AREF ((LFACE), LFACE_BOX_INDEX)
+#define LFACE_FONT(LFACE) AREF ((LFACE), LFACE_FONT_INDEX)
+#define LFACE_INHERIT(LFACE) AREF ((LFACE), LFACE_INHERIT_INDEX)
+#define LFACE_AVGWIDTH(LFACE) AREF ((LFACE), LFACE_AVGWIDTH_INDEX)
+
+/* Non-zero if LFACE is a Lisp face. A Lisp face is a vector of size
+ LFACE_VECTOR_SIZE which has the symbol `face' in slot 0. */
+
+#define LFACEP(LFACE) \
+ (VECTORP (LFACE) \
+ && XVECTOR (LFACE)->size == LFACE_VECTOR_SIZE \
+ && EQ (AREF (LFACE, 0), Qface))
+
+
+#if GLYPH_DEBUG
+
+/* Check consistency of Lisp face attribute vector ATTRS. */
+
+static void
+check_lface_attrs (attrs)
+ Lisp_Object *attrs;
+{
+ xassert (UNSPECIFIEDP (attrs[LFACE_FAMILY_INDEX])
+ || STRINGP (attrs[LFACE_FAMILY_INDEX]));
+ xassert (UNSPECIFIEDP (attrs[LFACE_SWIDTH_INDEX])
+ || SYMBOLP (attrs[LFACE_SWIDTH_INDEX]));
+ xassert (UNSPECIFIEDP (attrs[LFACE_AVGWIDTH_INDEX])
+ || INTEGERP (attrs[LFACE_AVGWIDTH_INDEX]));
+ xassert (UNSPECIFIEDP (attrs[LFACE_HEIGHT_INDEX])
+ || INTEGERP (attrs[LFACE_HEIGHT_INDEX])
+ || FLOATP (attrs[LFACE_HEIGHT_INDEX])
+ || FUNCTIONP (attrs[LFACE_HEIGHT_INDEX]));
+ xassert (UNSPECIFIEDP (attrs[LFACE_WEIGHT_INDEX])
+ || SYMBOLP (attrs[LFACE_WEIGHT_INDEX]));
+ xassert (UNSPECIFIEDP (attrs[LFACE_SLANT_INDEX])
+ || SYMBOLP (attrs[LFACE_SLANT_INDEX]));
+ xassert (UNSPECIFIEDP (attrs[LFACE_UNDERLINE_INDEX])
+ || SYMBOLP (attrs[LFACE_UNDERLINE_INDEX])
+ || STRINGP (attrs[LFACE_UNDERLINE_INDEX]));
+ xassert (UNSPECIFIEDP (attrs[LFACE_OVERLINE_INDEX])
+ || SYMBOLP (attrs[LFACE_OVERLINE_INDEX])
+ || STRINGP (attrs[LFACE_OVERLINE_INDEX]));
+ xassert (UNSPECIFIEDP (attrs[LFACE_STRIKE_THROUGH_INDEX])
+ || SYMBOLP (attrs[LFACE_STRIKE_THROUGH_INDEX])
+ || STRINGP (attrs[LFACE_STRIKE_THROUGH_INDEX]));
+ xassert (UNSPECIFIEDP (attrs[LFACE_BOX_INDEX])
+ || SYMBOLP (attrs[LFACE_BOX_INDEX])
+ || STRINGP (attrs[LFACE_BOX_INDEX])
+ || INTEGERP (attrs[LFACE_BOX_INDEX])
+ || CONSP (attrs[LFACE_BOX_INDEX]));
+ xassert (UNSPECIFIEDP (attrs[LFACE_INVERSE_INDEX])
+ || SYMBOLP (attrs[LFACE_INVERSE_INDEX]));
+ xassert (UNSPECIFIEDP (attrs[LFACE_FOREGROUND_INDEX])
+ || STRINGP (attrs[LFACE_FOREGROUND_INDEX]));
+ xassert (UNSPECIFIEDP (attrs[LFACE_BACKGROUND_INDEX])
+ || STRINGP (attrs[LFACE_BACKGROUND_INDEX]));
+ xassert (UNSPECIFIEDP (attrs[LFACE_INHERIT_INDEX])
+ || NILP (attrs[LFACE_INHERIT_INDEX])
+ || SYMBOLP (attrs[LFACE_INHERIT_INDEX])
+ || CONSP (attrs[LFACE_INHERIT_INDEX]));
+#ifdef HAVE_WINDOW_SYSTEM
+ xassert (UNSPECIFIEDP (attrs[LFACE_STIPPLE_INDEX])
+ || SYMBOLP (attrs[LFACE_STIPPLE_INDEX])
+ || !NILP (Fbitmap_spec_p (attrs[LFACE_STIPPLE_INDEX])));
+ xassert (UNSPECIFIEDP (attrs[LFACE_FONT_INDEX])
+ || NILP (attrs[LFACE_FONT_INDEX])
+ || STRINGP (attrs[LFACE_FONT_INDEX]));
+#endif
+}
+
+
+/* Check consistency of attributes of Lisp face LFACE (a Lisp vector). */
+
+static void
+check_lface (lface)
+ Lisp_Object lface;
+{
+ if (!NILP (lface))
+ {
+ xassert (LFACEP (lface));
+ check_lface_attrs (XVECTOR (lface)->contents);
+ }
+}
+
+#else /* GLYPH_DEBUG == 0 */
+
+#define check_lface_attrs(attrs) (void) 0
+#define check_lface(lface) (void) 0
+
+#endif /* GLYPH_DEBUG == 0 */
+
+
+/* Resolve face name FACE_NAME. If FACE_NAME is a string, intern it
+   to make it a symbol.  If FACE_NAME is an alias for another face,
+ return that face's name. */
+
+static Lisp_Object
+resolve_face_name (face_name)
+ Lisp_Object face_name;
+{
+ Lisp_Object aliased;
+
+ if (STRINGP (face_name))
+ face_name = intern (XSTRING (face_name)->data);
+
+ while (SYMBOLP (face_name))
+ {
+ aliased = Fget (face_name, Qface_alias);
+ if (NILP (aliased))
+ break;
+ else
+ face_name = aliased;
+ }
+
+ return face_name;
+}
+
+
+/* Return the face definition of FACE_NAME on frame F. F null means
+ return the definition for new frames. FACE_NAME may be a string or
+ a symbol (apparently Emacs 20.2 allowed strings as face names in
+ face text properties; Ediff uses that). If FACE_NAME is an alias
+ for another face, return that face's definition. If SIGNAL_P is
+ non-zero, signal an error if FACE_NAME is not a valid face name.
+ If SIGNAL_P is zero, value is nil if FACE_NAME is not a valid face
+ name. */
+
+static INLINE Lisp_Object
+lface_from_face_name (f, face_name, signal_p)
+ struct frame *f;
+ Lisp_Object face_name;
+ int signal_p;
+{
+ Lisp_Object lface;
+
+ face_name = resolve_face_name (face_name);
+
+ if (f)
+ lface = assq_no_quit (face_name, f->face_alist);
+ else
+ lface = assq_no_quit (face_name, Vface_new_frame_defaults);
+
+ if (CONSP (lface))
+ lface = XCDR (lface);
+ else if (signal_p)
+ signal_error ("Invalid face", face_name);
+
+ check_lface (lface);
+ return lface;
+}
+
+
+/* Get face attributes of face FACE_NAME from frame-local faces on
+ frame F. Store the resulting attributes in ATTRS which must point
+ to a vector of Lisp_Objects of size LFACE_VECTOR_SIZE. If SIGNAL_P
+ is non-zero, signal an error if FACE_NAME does not name a face.
+ Otherwise, value is zero if FACE_NAME is not a face. */
+
+static INLINE int
+get_lface_attributes (f, face_name, attrs, signal_p)
+ struct frame *f;
+ Lisp_Object face_name;
+ Lisp_Object *attrs;
+ int signal_p;
+{
+ Lisp_Object lface;
+ int success_p;
+
+ lface = lface_from_face_name (f, face_name, signal_p);
+ if (!NILP (lface))
+ {
+ bcopy (XVECTOR (lface)->contents, attrs,
+ LFACE_VECTOR_SIZE * sizeof *attrs);
+ success_p = 1;
+ }
+ else
+ success_p = 0;
+
+ return success_p;
+}
+
+
+/* Non-zero if all attributes in face attribute vector ATTRS are
+   specified, i.e. none of them is `unspecified'. */
+
+static int
+lface_fully_specified_p (attrs)
+ Lisp_Object *attrs;
+{
+ int i;
+
+ for (i = 1; i < LFACE_VECTOR_SIZE; ++i)
+ if (i != LFACE_FONT_INDEX && i != LFACE_INHERIT_INDEX
+ && i != LFACE_AVGWIDTH_INDEX)
+ if (UNSPECIFIEDP (attrs[i]))
+ break;
+
+ return i == LFACE_VECTOR_SIZE;
+}
+
+#ifdef HAVE_WINDOW_SYSTEM
+
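+/* Example: face_numeric_weight (Qbold) yields XLFD_WEIGHT_BOLD and
+   face_numeric_slant (Qitalic) yields XLFD_SLANT_ITALIC, per the
+   tables above; a symbol that appears in no table yields -1. */
+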
+/* Set font-related attributes of Lisp face LFACE from the fullname of
+ the font opened by FONTNAME. If FORCE_P is zero, set only
+ unspecified attributes of LFACE. The exception is `font'
+ attribute. It is set to FONTNAME as is regardless of FORCE_P.
+
+ If FONTNAME is not available on frame F,
+ return 0 if MAY_FAIL_P is non-zero, otherwise abort.
+ If the fullname is not in a valid XLFD format,
+ return 0 if MAY_FAIL_P is non-zero, otherwise set normal values
+ in LFACE and return 1.
+ Otherwise, return 1. */
+
+static int
+set_lface_from_font_name (f, lface, fontname, force_p, may_fail_p)
+ struct frame *f;
+ Lisp_Object lface;
+ Lisp_Object fontname;
+ int force_p, may_fail_p;
+{
+ struct font_name font;
+ char *buffer;
+ int pt;
+ int have_xlfd_p;
+ int fontset;
+ char *font_name = XSTRING (fontname)->data;
+ struct font_info *font_info;
+
+ /* If FONTNAME is actually a fontset name, get ASCII font name of it. */
+ fontset = fs_query_fontset (fontname, 0);
+ if (fontset >= 0)
+ font_name = XSTRING (fontset_ascii (fontset))->data;
+
+ /* Check if FONT_NAME is surely available on the system. Usually
+ FONT_NAME is already cached for the frame F and FS_LOAD_FONT
+ returns quickly. But, even if FONT_NAME is not yet cached,
+     caching it now is not futile because we will load the font
+     later anyway. */
+ BLOCK_INPUT;
+ font_info = FS_LOAD_FONT (f, 0, font_name, -1);
+ UNBLOCK_INPUT;
+
+ if (!font_info)
+ {
+ if (may_fail_p)
+ return 0;
+ abort ();
+ }
+
+ font.name = STRDUPA (font_info->full_name);
+ have_xlfd_p = split_font_name (f, &font, 1);
+
+ /* Set attributes only if unspecified, otherwise face defaults for
+ new frames would never take effect. If we couldn't get a font
+ name conforming to XLFD, set normal values. */
+
+ if (force_p || UNSPECIFIEDP (LFACE_FAMILY (lface)))
+ {
+ Lisp_Object val;
+ if (have_xlfd_p)
+ {
+ buffer = (char *) alloca (strlen (font.fields[XLFD_FAMILY])
+ + strlen (font.fields[XLFD_FOUNDRY])
+ + 2);
+ sprintf (buffer, "%s-%s", font.fields[XLFD_FOUNDRY],
+ font.fields[XLFD_FAMILY]);
+ val = build_string (buffer);
+ }
+ else
+ val = build_string ("*");
+ LFACE_FAMILY (lface) = val;
+ }
+
+ if (force_p || UNSPECIFIEDP (LFACE_HEIGHT (lface)))
+ {
+ if (have_xlfd_p)
+ pt = xlfd_point_size (f, &font);
+ else
+ pt = pixel_point_size (f, font_info->height * 10);
+ xassert (pt > 0);
+ LFACE_HEIGHT (lface) = make_number (pt);
+ }
+
+ if (force_p || UNSPECIFIEDP (LFACE_SWIDTH (lface)))
+ LFACE_SWIDTH (lface)
+ = have_xlfd_p ? xlfd_symbolic_swidth (&font) : Qnormal;
+
+ if (force_p || UNSPECIFIEDP (LFACE_AVGWIDTH (lface)))
+ LFACE_AVGWIDTH (lface)
+ = (have_xlfd_p
+ ? make_number (font.numeric[XLFD_AVGWIDTH])
+ : Qunspecified);
+
+ if (force_p || UNSPECIFIEDP (LFACE_WEIGHT (lface)))
+ LFACE_WEIGHT (lface)
+ = have_xlfd_p ? xlfd_symbolic_weight (&font) : Qnormal;
+
+ if (force_p || UNSPECIFIEDP (LFACE_SLANT (lface)))
+ LFACE_SLANT (lface)
+ = have_xlfd_p ? xlfd_symbolic_slant (&font) : Qnormal;
+
+ LFACE_FONT (lface) = fontname;
+
+ return 1;
+}
+
+#endif /* HAVE_WINDOW_SYSTEM */
+
+
+/* Merges the face height FROM with the face height TO, and returns the
+ merged height. If FROM is an invalid height, then INVALID is
+   returned instead.  FROM may be either an absolute face height or a
+ `relative' height, and TO must be an absolute height. The returned
+ value is always an absolute height. GCPRO is a lisp value that will
+ be protected from garbage-collection if this function makes a call
+ into lisp. */
+
+Lisp_Object
+merge_face_heights (from, to, invalid, gcpro)
+ Lisp_Object from, to, invalid, gcpro;
+{
+ int result = 0;
+
+ if (INTEGERP (from))
+ result = XINT (from);
+ else if (NUMBERP (from))
+ result = XFLOATINT (from) * XINT (to);
+#if 0 /* Probably not so useful. */
+ else if (CONSP (from) && CONSP (XCDR (from)))
+ {
+ if (EQ (XCAR(from), Qplus) || EQ (XCAR(from), Qminus))
+ {
+ if (INTEGERP (XCAR (XCDR (from))))
+ {
+ int inc = XINT (XCAR (XCDR (from)));
+ if (EQ (XCAR (from), Qminus))
+ inc = -inc;
+
+ result = XFASTINT (to);
+ if (result + inc > 0)
+ /* Note that `underflows' don't mean FROM is invalid, so
+ we just pin the result at TO if it would otherwise be
+ negative or 0. */
+ result += inc;
+ }
+ }
+ }
+#endif
+ else if (FUNCTIONP (from))
+ {
+      /* FROM is a function: call it with the current height TO as
+	 argument; its return value is the new height. */
+ Lisp_Object args[2], height;
+ struct gcpro gcpro1;
+
+ GCPRO1 (gcpro);
+
+ args[0] = from;
+ args[1] = to;
+ height = safe_call (2, args);
+
+ UNGCPRO;
+
+ if (NUMBERP (height))
+ result = XFLOATINT (height);
+ }
+
+ if (result > 0)
+ return make_number (result);
+ else
+ return invalid;
+}
+
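+/* Worked examples: merging the relative height 1.2 with an absolute
+   height of 120 gives 1.2 * 120 = 144; an integer FROM such as 140
+   simply replaces TO; and a FROM of 0, or a function returning a
+   non-positive number, makes the function return INVALID. */
+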
+
+/* Merge two Lisp face attribute vectors on frame F, FROM and TO, and
+   store the resulting attributes in TO, which must already be
+ completely specified and contain only absolute attributes. Every
+ specified attribute of FROM overrides the corresponding attribute of
+ TO; relative attributes in FROM are merged with the absolute value in
+ TO and replace it. CYCLE_CHECK is used internally to detect loops in
+ face inheritance; it should be Qnil when called from other places. */
+
+static INLINE void
+merge_face_vectors (f, from, to, cycle_check)
+ struct frame *f;
+ Lisp_Object *from, *to;
+ Lisp_Object cycle_check;
+{
+ int i;
+
+ /* If FROM inherits from some other faces, merge their attributes into
+ TO before merging FROM's direct attributes. Note that an :inherit
+ attribute of `unspecified' is the same as one of nil; we never
+ merge :inherit attributes, so nil is more correct, but lots of
+ other code uses `unspecified' as a generic value for face attributes. */
+ if (!UNSPECIFIEDP (from[LFACE_INHERIT_INDEX])
+ && !NILP (from[LFACE_INHERIT_INDEX]))
+ merge_face_inheritance (f, from[LFACE_INHERIT_INDEX], to, cycle_check);
+
+ /* If TO specifies a :font attribute, and FROM specifies some
+ font-related attribute, we need to clear TO's :font attribute
+ (because it will be inconsistent with whatever FROM specifies, and
+ FROM takes precedence). */
+ if (!NILP (to[LFACE_FONT_INDEX])
+ && (!UNSPECIFIEDP (from[LFACE_FAMILY_INDEX])
+ || !UNSPECIFIEDP (from[LFACE_HEIGHT_INDEX])
+ || !UNSPECIFIEDP (from[LFACE_WEIGHT_INDEX])
+ || !UNSPECIFIEDP (from[LFACE_SLANT_INDEX])
+ || !UNSPECIFIEDP (from[LFACE_SWIDTH_INDEX])
+ || !UNSPECIFIEDP (from[LFACE_AVGWIDTH_INDEX])))
+ to[LFACE_FONT_INDEX] = Qnil;
+
+ for (i = 1; i < LFACE_VECTOR_SIZE; ++i)
+ if (!UNSPECIFIEDP (from[i]))
+ if (i == LFACE_HEIGHT_INDEX && !INTEGERP (from[i]))
+ to[i] = merge_face_heights (from[i], to[i], to[i], cycle_check);
+ else
+ to[i] = from[i];
+
+ /* TO is always an absolute face, which should inherit from nothing.
+ We blindly copy the :inherit attribute above and fix it up here. */
+ to[LFACE_INHERIT_INDEX] = Qnil;
+}
+
+
+/* Checks the `cycle check' variable CHECK to see if it indicates that
+ EL is part of a cycle; CHECK must be either Qnil or a value returned
+ by an earlier use of CYCLE_CHECK. SUSPICIOUS is the number of
+ elements after which a cycle might be suspected; after that many
+ elements, this macro begins consing in order to keep more precise
+ track of elements.
+
+ Returns NIL if a cycle was detected, otherwise a new value for CHECK
+ that includes EL.
+
+ CHECK is evaluated multiple times, EL and SUSPICIOUS 0 or 1 times, so
+ the caller should make sure that's ok. */
+
+#define CYCLE_CHECK(check, el, suspicious) \
+ (NILP (check) \
+ ? make_number (0) \
+ : (INTEGERP (check) \
+ ? (XFASTINT (check) < (suspicious) \
+ ? make_number (XFASTINT (check) + 1) \
+ : Fcons (el, Qnil)) \
+ : (!NILP (Fmemq ((el), (check))) \
+ ? Qnil \
+ : Fcons ((el), (check)))))
+
+
+/* Merge face attributes from the face on frame F whose name is
+ INHERITS, into the vector of face attributes TO; INHERITS may also be
+ a list of face names, in which case they are applied in order.
+   CYCLE_CHECK is used to detect loops in face inheritance. */
+
+static void
+merge_face_inheritance (f, inherit, to, cycle_check)
+ struct frame *f;
+ Lisp_Object inherit;
+ Lisp_Object *to;
+ Lisp_Object cycle_check;
+{
+ if (SYMBOLP (inherit) && !EQ (inherit, Qunspecified))
+ /* Inherit from the named face INHERIT. */
+ {
+ Lisp_Object lface;
+
+ /* Make sure we're not in an inheritance loop. */
+ cycle_check = CYCLE_CHECK (cycle_check, inherit, 15);
+ if (NILP (cycle_check))
+ /* Cycle detected, ignore any further inheritance. */
+ return;
+
+ lface = lface_from_face_name (f, inherit, 0);
+ if (!NILP (lface))
+ merge_face_vectors (f, XVECTOR (lface)->contents, to, cycle_check);
+ }
+ else if (CONSP (inherit))
+ /* Handle a list of inherited faces by calling ourselves recursively
+ on each element. Note that we only do so for symbol elements, so
+ it's not possible to infinitely recurse. */
+ {
+ while (CONSP (inherit))
+ {
+ if (SYMBOLP (XCAR (inherit)))
+ merge_face_inheritance (f, XCAR (inherit), to, cycle_check);
+
+ /* Check for a circular inheritance list. */
+ cycle_check = CYCLE_CHECK (cycle_check, inherit, 15);
+ if (NILP (cycle_check))
+ /* Cycle detected. */
+ break;
+
+ inherit = XCDR (inherit);
+ }
+ }
+}
+
+
+/* Given a Lisp face attribute vector TO and a Lisp object PROP that
+ is a face property, determine the resulting face attributes on
+ frame F, and store them in TO. PROP may be a single face
+ specification or a list of such specifications. Each face
+ specification can be
+
+ 1. A symbol or string naming a Lisp face.
+
+ 2. A property list of the form (KEYWORD VALUE ...) where each
+ KEYWORD is a face attribute name, and value is an appropriate value
+ for that attribute.
+
+   3. Conses of the form (FOREGROUND-COLOR . COLOR) or
+ (BACKGROUND-COLOR . COLOR) where COLOR is a color name. This is
+ for compatibility with 20.2.
+
+ Face specifications earlier in lists take precedence over later
+ specifications. */
+
+static void
+merge_face_vector_with_property (f, to, prop)
+ struct frame *f;
+ Lisp_Object *to;
+ Lisp_Object prop;
+{
+ if (CONSP (prop))
+ {
+ Lisp_Object first = XCAR (prop);
+
+ if (EQ (first, Qforeground_color)
+ || EQ (first, Qbackground_color))
+ {
+ /* One of (FOREGROUND-COLOR . COLOR) or (BACKGROUND-COLOR
+ . COLOR). COLOR must be a string. */
+ Lisp_Object color_name = XCDR (prop);
+ Lisp_Object color = first;
+
+ if (STRINGP (color_name))
+ {
+ if (EQ (color, Qforeground_color))
+ to[LFACE_FOREGROUND_INDEX] = color_name;
+ else
+ to[LFACE_BACKGROUND_INDEX] = color_name;
+ }
+ else
+ add_to_log ("Invalid face color", color_name, Qnil);
+ }
+ else if (SYMBOLP (first)
+ && *XSYMBOL (first)->name->data == ':')
+ {
+ /* Assume this is the property list form. */
+ while (CONSP (prop) && CONSP (XCDR (prop)))
+ {
+ Lisp_Object keyword = XCAR (prop);
+ Lisp_Object value = XCAR (XCDR (prop));
+
+ if (EQ (keyword, QCfamily))
+ {
+ if (STRINGP (value))
+ to[LFACE_FAMILY_INDEX] = value;
+ else
+ add_to_log ("Invalid face font family", value, Qnil);
+ }
+ else if (EQ (keyword, QCheight))
+ {
+ Lisp_Object new_height =
+ merge_face_heights (value, to[LFACE_HEIGHT_INDEX],
+ Qnil, Qnil);
+
+ if (NILP (new_height))
+ add_to_log ("Invalid face font height", value, Qnil);
+ else
+ to[LFACE_HEIGHT_INDEX] = new_height;
+ }
+ else if (EQ (keyword, QCweight))
+ {
+ if (SYMBOLP (value)
+ && face_numeric_weight (value) >= 0)
+ to[LFACE_WEIGHT_INDEX] = value;
+ else
+ add_to_log ("Invalid face weight", value, Qnil);
+ }
+ else if (EQ (keyword, QCslant))
+ {
+ if (SYMBOLP (value)
+ && face_numeric_slant (value) >= 0)
+ to[LFACE_SLANT_INDEX] = value;
+ else
+ add_to_log ("Invalid face slant", value, Qnil);
+ }
+ else if (EQ (keyword, QCunderline))
+ {
+ if (EQ (value, Qt)
+ || NILP (value)
+ || STRINGP (value))
+ to[LFACE_UNDERLINE_INDEX] = value;
+ else
+ add_to_log ("Invalid face underline", value, Qnil);
+ }
+ else if (EQ (keyword, QCoverline))
+ {
+ if (EQ (value, Qt)
+ || NILP (value)
+ || STRINGP (value))
+ to[LFACE_OVERLINE_INDEX] = value;
+ else
+ add_to_log ("Invalid face overline", value, Qnil);
+ }
+ else if (EQ (keyword, QCstrike_through))
+ {
+ if (EQ (value, Qt)
+ || NILP (value)
+ || STRINGP (value))
+ to[LFACE_STRIKE_THROUGH_INDEX] = value;
+ else
+ add_to_log ("Invalid face strike-through", value, Qnil);
+ }
+ else if (EQ (keyword, QCbox))
+ {
+ if (EQ (value, Qt))
+ value = make_number (1);
+ if (INTEGERP (value)
+ || STRINGP (value)
+ || CONSP (value)
+ || NILP (value))
+ to[LFACE_BOX_INDEX] = value;
+ else
+ add_to_log ("Invalid face box", value, Qnil);
+ }
+ else if (EQ (keyword, QCinverse_video)
+ || EQ (keyword, QCreverse_video))
+ {
+ if (EQ (value, Qt) || NILP (value))
+ to[LFACE_INVERSE_INDEX] = value;
+ else
+ add_to_log ("Invalid face inverse-video", value, Qnil);
+ }
+ else if (EQ (keyword, QCforeground))
+ {
+ if (STRINGP (value))
+ to[LFACE_FOREGROUND_INDEX] = value;
+ else
+ add_to_log ("Invalid face foreground", value, Qnil);
+ }
+ else if (EQ (keyword, QCbackground))
+ {
+ if (STRINGP (value))
+ to[LFACE_BACKGROUND_INDEX] = value;
+ else
+ add_to_log ("Invalid face background", value, Qnil);
+ }
+ else if (EQ (keyword, QCstipple))
+ {
+#ifdef HAVE_X_WINDOWS
+ Lisp_Object pixmap_p = Fbitmap_spec_p (value);
+ if (!NILP (pixmap_p))
+ to[LFACE_STIPPLE_INDEX] = value;
+ else
+ add_to_log ("Invalid face stipple", value, Qnil);
+#endif
+ }
+ else if (EQ (keyword, QCwidth))
+ {
+ if (SYMBOLP (value)
+ && face_numeric_swidth (value) >= 0)
+ to[LFACE_SWIDTH_INDEX] = value;
+ else
+ add_to_log ("Invalid face width", value, Qnil);
+ }
+ else if (EQ (keyword, QCinherit))
+ {
+ if (SYMBOLP (value))
+ to[LFACE_INHERIT_INDEX] = value;
+ else
+ {
+ Lisp_Object tail;
+ for (tail = value; CONSP (tail); tail = XCDR (tail))
+ if (!SYMBOLP (XCAR (tail)))
+ break;
+ if (NILP (tail))
+ to[LFACE_INHERIT_INDEX] = value;
+ else
+ add_to_log ("Invalid face inherit", value, Qnil);
+ }
+ }
+ else
+ add_to_log ("Invalid attribute %s in face property",
+ keyword, Qnil);
+
+ prop = XCDR (XCDR (prop));
+ }
+ }
+ else
+ {
+ /* This is a list of face specs. Specifications at the
+ beginning of the list take precedence over later
+ specifications, so we have to merge starting with the
+ last specification. */
+ Lisp_Object next = XCDR (prop);
+ if (!NILP (next))
+ merge_face_vector_with_property (f, to, next);
+ merge_face_vector_with_property (f, to, first);
+ }
+ }
+ else
+ {
+ /* PROP ought to be a face name. */
+ Lisp_Object lface = lface_from_face_name (f, prop, 0);
+ if (NILP (lface))
+ add_to_log ("Invalid face text property value: %s", prop, Qnil);
+ else
+ merge_face_vectors (f, XVECTOR (lface)->contents, to, Qnil);
+ }
+}
+
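+/* For illustration, the three forms of face specification accepted
+   above correspond to Lisp values such as the face name `bold', the
+   property list (:foreground "red" :weight bold), and the Emacs 20.2
+   style cons (background-color . "grey90").  A list such as
+   (bold (:slant italic)) is merged back to front, so the earlier
+   element `bold' takes precedence where attributes conflict. */
+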
+
+DEFUN ("internal-make-lisp-face", Finternal_make_lisp_face,
+ Sinternal_make_lisp_face, 1, 2, 0,
+ "Make FACE, a symbol, a Lisp face with all attributes nil.\n\
+If FACE was not known as a face before, create a new one.\n\
+If optional argument FRAME is specified, make a frame-local face\n\
+for that frame. Otherwise operate on the global face definition.\n\
+Value is a vector of face attributes.")
+ (face, frame)
+ Lisp_Object face, frame;
+{
+ Lisp_Object global_lface, lface;
+ struct frame *f;
+ int i;
+
+ CHECK_SYMBOL (face, 0);
+ global_lface = lface_from_face_name (NULL, face, 0);
+
+ if (!NILP (frame))
+ {
+ CHECK_LIVE_FRAME (frame, 1);
+ f = XFRAME (frame);
+ lface = lface_from_face_name (f, face, 0);
+ }
+ else
+ f = NULL, lface = Qnil;
+
+ /* Add a global definition if there is none. */
+ if (NILP (global_lface))
+ {
+ global_lface = Fmake_vector (make_number (LFACE_VECTOR_SIZE),
+ Qunspecified);
+ AREF (global_lface, 0) = Qface;
+ Vface_new_frame_defaults = Fcons (Fcons (face, global_lface),
+ Vface_new_frame_defaults);
+
+ /* Assign the new Lisp face a unique ID. The mapping from Lisp
+ face id to Lisp face is given by the vector lface_id_to_name.
+ The mapping from Lisp face to Lisp face id is given by the
+ property `face' of the Lisp face name. */
+ if (next_lface_id == lface_id_to_name_size)
+ {
+ int new_size = max (50, 2 * lface_id_to_name_size);
+ int sz = new_size * sizeof *lface_id_to_name;
+ lface_id_to_name = (Lisp_Object *) xrealloc (lface_id_to_name, sz);
+ lface_id_to_name_size = new_size;
+ }
+
+ lface_id_to_name[next_lface_id] = face;
+ Fput (face, Qface, make_number (next_lface_id));
+ ++next_lface_id;
+ }
+ else if (f == NULL)
+ for (i = 1; i < LFACE_VECTOR_SIZE; ++i)
+ AREF (global_lface, i) = Qunspecified;
+
+ /* Add a frame-local definition. */
+ if (f)
+ {
+ if (NILP (lface))
+ {
+ lface = Fmake_vector (make_number (LFACE_VECTOR_SIZE),
+ Qunspecified);
+ AREF (lface, 0) = Qface;
+ f->face_alist = Fcons (Fcons (face, lface), f->face_alist);
+ }
+ else
+ for (i = 1; i < LFACE_VECTOR_SIZE; ++i)
+ AREF (lface, i) = Qunspecified;
+ }
+ else
+ lface = global_lface;
+
+ xassert (LFACEP (lface));
+ check_lface (lface);
+ return lface;
+}
+
+
+DEFUN ("internal-lisp-face-p", Finternal_lisp_face_p,
+ Sinternal_lisp_face_p, 1, 2, 0,
+ "Return non-nil if FACE names a face.\n\
+If optional second parameter FRAME is non-nil, check for the\n\
+existence of a frame-local face with name FACE on that frame.\n\
+Otherwise check for the existence of a global face.")
+ (face, frame)
+ Lisp_Object face, frame;
+{
+ Lisp_Object lface;
+
+ if (!NILP (frame))
+ {
+ CHECK_LIVE_FRAME (frame, 1);
+ lface = lface_from_face_name (XFRAME (frame), face, 0);
+ }
+ else
+ lface = lface_from_face_name (NULL, face, 0);
+
+ return lface;
+}
+
+
+DEFUN ("internal-copy-lisp-face", Finternal_copy_lisp_face,
+ Sinternal_copy_lisp_face, 4, 4, 0,
+ "Copy face FROM to TO.\n\
+If FRAME is t, copy the global face definition of FROM to the\n\
+global face definition of TO. Otherwise, copy the frame-local\n\
+definition of FROM on FRAME to the frame-local definition of TO\n\
+on NEW-FRAME, or FRAME if NEW-FRAME is nil.\n\
+\n\
+Value is TO.")
+ (from, to, frame, new_frame)
+ Lisp_Object from, to, frame, new_frame;
+{
+ Lisp_Object lface, copy;
+
+ CHECK_SYMBOL (from, 0);
+ CHECK_SYMBOL (to, 1);
+ if (NILP (new_frame))
+ new_frame = frame;
+
+ if (EQ (frame, Qt))
+ {
+ /* Copy global definition of FROM. We don't make copies of
+ strings etc. because 20.2 didn't do it either. */
+ lface = lface_from_face_name (NULL, from, 1);
+ copy = Finternal_make_lisp_face (to, Qnil);
+ }
+ else
+ {
+ /* Copy frame-local definition of FROM. */
+ CHECK_LIVE_FRAME (frame, 2);
+ CHECK_LIVE_FRAME (new_frame, 3);
+ lface = lface_from_face_name (XFRAME (frame), from, 1);
+ copy = Finternal_make_lisp_face (to, new_frame);
+ }
+
+ bcopy (XVECTOR (lface)->contents, XVECTOR (copy)->contents,
+ LFACE_VECTOR_SIZE * sizeof (Lisp_Object));
+
+ return to;
+}
+
+
+DEFUN ("internal-set-lisp-face-attribute", Finternal_set_lisp_face_attribute,
+ Sinternal_set_lisp_face_attribute, 3, 4, 0,
+ "Set attribute ATTR of FACE to VALUE.\n\
+FRAME being a frame means change the face on that frame.\n\
+FRAME nil means change the face of the selected frame.\n\
+FRAME t means change the default for new frames.\n\
+FRAME 0 means change the face on all frames, and change the default\n\
+ for new frames.")
+ (face, attr, value, frame)
+ Lisp_Object face, attr, value, frame;
+{
+ Lisp_Object lface;
+ Lisp_Object old_value = Qnil;
+ /* Set to 1 if ATTR is QCfont. */
+ int font_attr_p = 0;
+ /* Set to 1 if ATTR is a font-related attribute other than QCfont. */
+ int font_related_attr_p = 0;
+
+ CHECK_SYMBOL (face, 0);
+ CHECK_SYMBOL (attr, 1);
+
+ face = resolve_face_name (face);
+
+ /* If FRAME is 0, change face on all frames, and change the
+ default for new frames. */
+ if (INTEGERP (frame) && XINT (frame) == 0)
+ {
+ Lisp_Object tail;
+ Finternal_set_lisp_face_attribute (face, attr, value, Qt);
+ FOR_EACH_FRAME (tail, frame)
+ Finternal_set_lisp_face_attribute (face, attr, value, frame);
+ return face;
+ }
+
+ /* Set lface to the Lisp attribute vector of FACE. */
+ if (EQ (frame, Qt))
+ lface = lface_from_face_name (NULL, face, 1);
+ else
+ {
+ if (NILP (frame))
+ frame = selected_frame;
+
+ CHECK_LIVE_FRAME (frame, 3);
+ lface = lface_from_face_name (XFRAME (frame), face, 0);
+
+ /* If a frame-local face doesn't exist yet, create one. */
+ if (NILP (lface))
+ lface = Finternal_make_lisp_face (face, frame);
+ }
+
+ if (EQ (attr, QCfamily))
+ {
+ if (!UNSPECIFIEDP (value))
+ {
+ CHECK_STRING (value, 3);
+ if (XSTRING (value)->size == 0)
+ signal_error ("Invalid face family", value);
+ }
+ old_value = LFACE_FAMILY (lface);
+ LFACE_FAMILY (lface) = value;
+ font_related_attr_p = 1;
+ }
+ else if (EQ (attr, QCheight))
+ {
+ if (!UNSPECIFIEDP (value))
+ {
+ Lisp_Object test =
+ (EQ (face, Qdefault) ? value :
+ /* The default face must have an absolute size; otherwise, do a
+ test merge with an arbitrary height to see whether VALUE is valid. */
+ merge_face_heights (value, make_number(10), Qnil, Qnil));
+
+ if (!INTEGERP(test) || XINT(test) <= 0)
+ signal_error ("Invalid face height", value);
+ }
+
+ old_value = LFACE_HEIGHT (lface);
+ LFACE_HEIGHT (lface) = value;
+ font_related_attr_p = 1;
+ }
+ else if (EQ (attr, QCweight))
+ {
+ if (!UNSPECIFIEDP (value))
+ {
+ CHECK_SYMBOL (value, 3);
+ if (face_numeric_weight (value) < 0)
+ signal_error ("Invalid face weight", value);
+ }
+ old_value = LFACE_WEIGHT (lface);
+ LFACE_WEIGHT (lface) = value;
+ font_related_attr_p = 1;
+ }
+ else if (EQ (attr, QCslant))
+ {
+ if (!UNSPECIFIEDP (value))
+ {
+ CHECK_SYMBOL (value, 3);
+ if (face_numeric_slant (value) < 0)
+ signal_error ("Invalid face slant", value);
+ }
+ old_value = LFACE_SLANT (lface);
+ LFACE_SLANT (lface) = value;
+ font_related_attr_p = 1;
+ }
+ else if (EQ (attr, QCunderline))
+ {
+ if (!UNSPECIFIEDP (value))
+ if ((SYMBOLP (value)
+ && !EQ (value, Qt)
+ && !EQ (value, Qnil))
+ /* Underline color. */
+ || (STRINGP (value)
+ && XSTRING (value)->size == 0))
+ signal_error ("Invalid face underline", value);
+
+ old_value = LFACE_UNDERLINE (lface);
+ LFACE_UNDERLINE (lface) = value;
+ }
+ else if (EQ (attr, QCoverline))
+ {
+ if (!UNSPECIFIEDP (value))
+ if ((SYMBOLP (value)
+ && !EQ (value, Qt)
+ && !EQ (value, Qnil))
+ /* Overline color. */
+ || (STRINGP (value)
+ && XSTRING (value)->size == 0))
+ signal_error ("Invalid face overline", value);
+
+ old_value = LFACE_OVERLINE (lface);
+ LFACE_OVERLINE (lface) = value;
+ }
+ else if (EQ (attr, QCstrike_through))
+ {
+ if (!UNSPECIFIEDP (value))
+ if ((SYMBOLP (value)
+ && !EQ (value, Qt)
+ && !EQ (value, Qnil))
+ /* Strike-through color. */
+ || (STRINGP (value)
+ && XSTRING (value)->size == 0))
+ signal_error ("Invalid face strike-through", value);
+
+ old_value = LFACE_STRIKE_THROUGH (lface);
+ LFACE_STRIKE_THROUGH (lface) = value;
+ }
+ else if (EQ (attr, QCbox))
+ {
+ int valid_p;
+
+ /* Allow t, meaning a simple box of width 1 in the foreground
+ color of the face. */
+ if (EQ (value, Qt))
+ value = make_number (1);
+
+ if (UNSPECIFIEDP (value))
+ valid_p = 1;
+ else if (NILP (value))
+ valid_p = 1;
+ else if (INTEGERP (value))
+ valid_p = XINT (value) != 0;
+ else if (STRINGP (value))
+ valid_p = XSTRING (value)->size > 0;
+ else if (CONSP (value))
+ {
+ Lisp_Object tem;
+
+ tem = value;
+ while (CONSP (tem))
+ {
+ Lisp_Object k, v;
+
+ k = XCAR (tem);
+ tem = XCDR (tem);
+ if (!CONSP (tem))
+ break;
+ v = XCAR (tem);
+ tem = XCDR (tem);
+
+ if (EQ (k, QCline_width))
+ {
+ if (!INTEGERP (v) || XINT (v) == 0)
+ break;
+ }
+ else if (EQ (k, QCcolor))
+ {
+ if (!STRINGP (v) || XSTRING (v)->size == 0)
+ break;
+ }
+ else if (EQ (k, QCstyle))
+ {
+ if (!EQ (v, Qpressed_button) && !EQ (v, Qreleased_button))
+ break;
+ }
+ else
+ break;
+ }
+
+ valid_p = NILP (tem);
+ }
+ else
+ valid_p = 0;
+
+ if (!valid_p)
+ signal_error ("Invalid face box", value);
+
+ old_value = LFACE_BOX (lface);
+ LFACE_BOX (lface) = value;
+ }
+ else if (EQ (attr, QCinverse_video)
+ || EQ (attr, QCreverse_video))
+ {
+ if (!UNSPECIFIEDP (value))
+ {
+ CHECK_SYMBOL (value, 3);
+ if (!EQ (value, Qt) && !NILP (value))
+ signal_error ("Invalid inverse-video face attribute value", value);
+ }
+ old_value = LFACE_INVERSE (lface);
+ LFACE_INVERSE (lface) = value;
+ }
+ else if (EQ (attr, QCforeground))
+ {
+ if (!UNSPECIFIEDP (value))
+ {
+ /* Don't check for valid color names here because it depends
+ on the frame (display) whether the color will be valid
+ when the face is realized. */
+ CHECK_STRING (value, 3);
+ if (XSTRING (value)->size == 0)
+ signal_error ("Empty foreground color value", value);
+ }
+ old_value = LFACE_FOREGROUND (lface);
+ LFACE_FOREGROUND (lface) = value;
+ }
+ else if (EQ (attr, QCbackground))
+ {
+ if (!UNSPECIFIEDP (value))
+ {
+ /* Don't check for valid color names here because it depends
+ on the frame (display) whether the color will be valid
+ when the face is realized. */
+ CHECK_STRING (value, 3);
+ if (XSTRING (value)->size == 0)
+ signal_error ("Empty background color value", value);
+ }
+ old_value = LFACE_BACKGROUND (lface);
+ LFACE_BACKGROUND (lface) = value;
+ }
+ else if (EQ (attr, QCstipple))
+ {
+#ifdef HAVE_X_WINDOWS
+ if (!UNSPECIFIEDP (value)
+ && !NILP (value)
+ && NILP (Fbitmap_spec_p (value)))
+ signal_error ("Invalid stipple attribute", value);
+ old_value = LFACE_STIPPLE (lface);
+ LFACE_STIPPLE (lface) = value;
+#endif /* HAVE_X_WINDOWS */
+ }
+ else if (EQ (attr, QCwidth))
+ {
+ if (!UNSPECIFIEDP (value))
+ {
+ CHECK_SYMBOL (value, 3);
+ if (face_numeric_swidth (value) < 0)
+ signal_error ("Invalid face width", value);
+ }
+ old_value = LFACE_SWIDTH (lface);
+ LFACE_SWIDTH (lface) = value;
+ font_related_attr_p = 1;
+ }
+ else if (EQ (attr, QCfont))
+ {
+#ifdef HAVE_WINDOW_SYSTEM
+ /* Set font-related attributes of the Lisp face from an
+ XLFD font name. */
+ struct frame *f;
+ Lisp_Object tmp;
+
+ CHECK_STRING (value, 3);
+ if (EQ (frame, Qt))
+ f = SELECTED_FRAME ();
+ else
+ f = check_x_frame (frame);
+
+ /* VALUE may be a fontset name or an alias of a fontset. In
+ that case, use the base fontset name. */
+ tmp = Fquery_fontset (value, Qnil);
+ if (!NILP (tmp))
+ value = tmp;
+
+ if (!set_lface_from_font_name (f, lface, value, 1, 1))
+ signal_error ("Invalid font or fontset name", value);
+
+ font_attr_p = 1;
+#endif /* HAVE_WINDOW_SYSTEM */
+ }
+ else if (EQ (attr, QCinherit))
+ {
+ Lisp_Object tail;
+ if (SYMBOLP (value))
+ tail = Qnil;
+ else
+ for (tail = value; CONSP (tail); tail = XCDR (tail))
+ if (!SYMBOLP (XCAR (tail)))
+ break;
+ if (NILP (tail))
+ LFACE_INHERIT (lface) = value;
+ else
+ signal_error ("Invalid face inheritance", value);
+ }
+ else if (EQ (attr, QCbold))
+ {
+ old_value = LFACE_WEIGHT (lface);
+ LFACE_WEIGHT (lface) = NILP (value) ? Qnormal : Qbold;
+ font_related_attr_p = 1;
+ }
+ else if (EQ (attr, QCitalic))
+ {
+ old_value = LFACE_SLANT (lface);
+ LFACE_SLANT (lface) = NILP (value) ? Qnormal : Qitalic;
+ font_related_attr_p = 1;
+ }
+ else
+ signal_error ("Invalid face attribute name", attr);
+
+ if (font_related_attr_p
+ && !UNSPECIFIEDP (value))
+ /* If a font-related attribute other than QCfont is specified,
+ neither the original `font' attribute nor that of the default
+ face is useful for determining a new font. Thus, set it to nil
+ so that the font selection mechanism doesn't use it. */
+ LFACE_FONT (lface) = Qnil;
+
+ /* Changing a named face means that all realized faces depending on
+ that face are invalid. Since we cannot tell which realized faces
+ depend on the face, make sure they are all removed. This is done
+ by incrementing face_change_count. The next call to
+ init_iterator will then free realized faces. */
+ if (!EQ (frame, Qt)
+ && (EQ (attr, QCfont)
+ || NILP (Fequal (old_value, value))))
+ {
+ ++face_change_count;
+ ++windows_or_buffers_changed;
+ }
+
+ if (!UNSPECIFIEDP (value)
+ && NILP (Fequal (old_value, value)))
+ {
+ Lisp_Object param;
+
+ param = Qnil;
+
+ if (EQ (face, Qdefault))
+ {
+#ifdef HAVE_WINDOW_SYSTEM
+ /* Changed font-related attributes of the `default' face are
+ reflected in changed `font' frame parameters. */
+ if (FRAMEP (frame)
+ && (font_related_attr_p || font_attr_p)
+ && lface_fully_specified_p (XVECTOR (lface)->contents))
+ set_font_frame_param (frame, lface);
+ else
+#endif /* HAVE_WINDOW_SYSTEM */
+
+ if (EQ (attr, QCforeground))
+ param = Qforeground_color;
+ else if (EQ (attr, QCbackground))
+ param = Qbackground_color;
+ }
+#ifdef HAVE_WINDOW_SYSTEM
+#ifndef WINDOWSNT
+ else if (EQ (face, Qscroll_bar))
+ {
+ /* Changing the colors of `scroll-bar' sets frame parameters
+ `scroll-bar-foreground' and `scroll-bar-background'. */
+ if (EQ (attr, QCforeground))
+ param = Qscroll_bar_foreground;
+ else if (EQ (attr, QCbackground))
+ param = Qscroll_bar_background;
+ }
+#endif /* not WINDOWSNT */
+ else if (EQ (face, Qborder))
+ {
+ /* Changing background color of `border' sets frame parameter
+ `border-color'. */
+ if (EQ (attr, QCbackground))
+ param = Qborder_color;
+ }
+ else if (EQ (face, Qcursor))
+ {
+ /* Changing background color of `cursor' sets frame parameter
+ `cursor-color'. */
+ if (EQ (attr, QCbackground))
+ param = Qcursor_color;
+ }
+ else if (EQ (face, Qmouse))
+ {
+ /* Changing background color of `mouse' sets frame parameter
+ `mouse-color'. */
+ if (EQ (attr, QCbackground))
+ param = Qmouse_color;
+ }
+#endif /* HAVE_WINDOW_SYSTEM */
+ else if (EQ (face, Qmenu))
+ {
+ /* Indicate that we have to update the menu bar when
+ realizing faces on FRAME. FRAME t means change the
+ default for new frames. We do this by setting the
+ flag in new face caches. */
+ if (FRAMEP (frame))
+ {
+ struct frame *f = XFRAME (frame);
+ if (FRAME_FACE_CACHE (f) == NULL)
+ FRAME_FACE_CACHE (f) = make_face_cache (f);
+ FRAME_FACE_CACHE (f)->menu_face_changed_p = 1;
+ }
+ else
+ menu_face_changed_default = 1;
+ }
+
+ if (!NILP (param))
+ if (EQ (frame, Qt))
+ /* Update `default-frame-alist', which is used for new frames. */
+ {
+ store_in_alist (&Vdefault_frame_alist, param, value);
+ }
+ else
+ /* Update the current frame's parameters. */
+ {
+ Lisp_Object cons;
+ cons = XCAR (Vparam_value_alist);
+ XCAR (cons) = param;
+ XCDR (cons) = value;
+ Fmodify_frame_parameters (frame, Vparam_value_alist);
+ }
+ }
+
+ return face;
+}
+
+
+#ifdef HAVE_WINDOW_SYSTEM
+
+/* Set the `font' frame parameter of FRAME determined from `default'
+ face attributes LFACE. If a face or fontset name is explicitly
+ specified in LFACE, use it as is. Otherwise, determine a font name
+ from the other font-related attributes of LFACE. In that case, if
+ there's no matching font, an error is signaled. */
+
+static void
+set_font_frame_param (frame, lface)
+ Lisp_Object frame, lface;
+{
+ struct frame *f = XFRAME (frame);
+
+ if (FRAME_WINDOW_P (f))
+ {
+ Lisp_Object font_name;
+ char *font;
+
+ if (STRINGP (LFACE_FONT (lface)))
+ font_name = LFACE_FONT (lface);
+ else
+ {
+ /* Choose a font name that reflects LFACE's attributes and has
+ the registry and encoding pattern specified in the default
+ fontset (3rd arg: -1) for ASCII characters (4th arg: 0). */
+ font = choose_face_font (f, XVECTOR (lface)->contents, -1, 0);
+ if (!font)
+ error ("No font matches the specified attribute");
+ font_name = build_string (font);
+ xfree (font);
+ }
+
+ Fmodify_frame_parameters (frame, Fcons (Fcons (Qfont, font_name), Qnil));
+ }
+}
+
+
+/* Update the corresponding face when frame parameter PARAM on frame F
+ has been assigned the value NEW_VALUE. */
+
+void
+update_face_from_frame_parameter (f, param, new_value)
+ struct frame *f;
+ Lisp_Object param, new_value;
+{
+ Lisp_Object lface;
+
+ /* If there are no faces yet, give up. This is the case when called
+ from Fx_create_frame, and we do the necessary things later in
+ face-set-after-frame-defaults. */
+ if (NILP (f->face_alist))
+ return;
+
+ if (EQ (param, Qforeground_color))
+ {
+ lface = lface_from_face_name (f, Qdefault, 1);
+ LFACE_FOREGROUND (lface) = (STRINGP (new_value)
+ ? new_value : Qunspecified);
+ realize_basic_faces (f);
+ }
+ else if (EQ (param, Qbackground_color))
+ {
+ Lisp_Object frame;
+
+ /* Changing the background color might change the background
+ mode, so that we have to load new defface specs. Call
+ frame-update-face-colors to do that. */
+ XSETFRAME (frame, f);
+ call1 (Qframe_update_face_colors, frame);
+
+ lface = lface_from_face_name (f, Qdefault, 1);
+ LFACE_BACKGROUND (lface) = (STRINGP (new_value)
+ ? new_value : Qunspecified);
+ realize_basic_faces (f);
+ }
+ else if (EQ (param, Qborder_color))
+ {
+ lface = lface_from_face_name (f, Qborder, 1);
+ LFACE_BACKGROUND (lface) = (STRINGP (new_value)
+ ? new_value : Qunspecified);
+ }
+ else if (EQ (param, Qcursor_color))
+ {
+ lface = lface_from_face_name (f, Qcursor, 1);
+ LFACE_BACKGROUND (lface) = (STRINGP (new_value)
+ ? new_value : Qunspecified);
+ }
+ else if (EQ (param, Qmouse_color))
+ {
+ lface = lface_from_face_name (f, Qmouse, 1);
+ LFACE_BACKGROUND (lface) = (STRINGP (new_value)
+ ? new_value : Qunspecified);
+ }
+}
+
+
+/* Get the value of X resource RESOURCE, class CLASS for the display
+ of frame FRAME. This is here because ordinary `x-get-resource'
+ doesn't take a frame argument. */
+
+DEFUN ("internal-face-x-get-resource", Finternal_face_x_get_resource,
+ Sinternal_face_x_get_resource, 3, 3, 0, "")
+ (resource, class, frame)
+ Lisp_Object resource, class, frame;
+{
+ Lisp_Object value = Qnil;
+#ifndef WINDOWSNT
+#ifndef macintosh
+ CHECK_STRING (resource, 0);
+ CHECK_STRING (class, 1);
+ CHECK_LIVE_FRAME (frame, 2);
+ BLOCK_INPUT;
+ value = display_x_get_resource (FRAME_X_DISPLAY_INFO (XFRAME (frame)),
+ resource, class, Qnil, Qnil);
+ UNBLOCK_INPUT;
+#endif /* not macintosh */
+#endif /* not WINDOWSNT */
+ return value;
+}
+
+
+/* Return resource string VALUE as a boolean value, i.e. nil, or t.
+ If VALUE is "on" or "true", return t. If VALUE is "off" or
+ "false", return nil. Otherwise, if SIGNAL_P is non-zero, signal an
+ error; if SIGNAL_P is zero, return 0. */
+
+static Lisp_Object
+face_boolean_x_resource_value (value, signal_p)
+ Lisp_Object value;
+ int signal_p;
+{
+ Lisp_Object result = make_number (0);
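+ /* The Lisp integer 0 (not nil) marks an unrecognized VALUE; it is
+ what callers see when SIGNAL_P is zero and VALUE is invalid. */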
+
+ xassert (STRINGP (value));
+
+ if (xstricmp (XSTRING (value)->data, "on") == 0
+ || xstricmp (XSTRING (value)->data, "true") == 0)
+ result = Qt;
+ else if (xstricmp (XSTRING (value)->data, "off") == 0
+ || xstricmp (XSTRING (value)->data, "false") == 0)
+ result = Qnil;
+ else if (xstricmp (XSTRING (value)->data, "unspecified") == 0)
+ result = Qunspecified;
+ else if (signal_p)
+ signal_error ("Invalid face attribute value from X resource", value);
+
+ return result;
+}
+
+
+DEFUN ("internal-set-lisp-face-attribute-from-resource",
+ Finternal_set_lisp_face_attribute_from_resource,
+ Sinternal_set_lisp_face_attribute_from_resource,
+ 3, 4, 0, "")
+ (face, attr, value, frame)
+ Lisp_Object face, attr, value, frame;
+{
+ CHECK_SYMBOL (face, 0);
+ CHECK_SYMBOL (attr, 1);
+ CHECK_STRING (value, 2);
+
+ if (xstricmp (XSTRING (value)->data, "unspecified") == 0)
+ value = Qunspecified;
+ else if (EQ (attr, QCheight))
+ {
+ value = Fstring_to_number (value, make_number (10));
+ if (XINT (value) <= 0)
+ signal_error ("Invalid face height from X resource", value);
+ }
+ else if (EQ (attr, QCbold) || EQ (attr, QCitalic))
+ value = face_boolean_x_resource_value (value, 1);
+ else if (EQ (attr, QCweight) || EQ (attr, QCslant) || EQ (attr, QCwidth))
+ value = intern (XSTRING (value)->data);
+ else if (EQ (attr, QCreverse_video) || EQ (attr, QCinverse_video))
+ value = face_boolean_x_resource_value (value, 1);
+ else if (EQ (attr, QCunderline)
+ || EQ (attr, QCoverline)
+ || EQ (attr, QCstrike_through))
+ {
+ Lisp_Object boolean_value;
+
+ /* If the result of face_boolean_x_resource_value is t or nil,
+ VALUE does NOT specify a color. */
+ boolean_value = face_boolean_x_resource_value (value, 0);
+ if (SYMBOLP (boolean_value))
+ value = boolean_value;
+ }
+ else if (EQ (attr, QCbox))
+ value = Fcar (Fread_from_string (value, Qnil, Qnil));
+
+ return Finternal_set_lisp_face_attribute (face, attr, value, frame);
+}
+
+#endif /* HAVE_WINDOW_SYSTEM */
+
+
+/***********************************************************************
+ Menu face
+ ***********************************************************************/
+
+#if defined HAVE_X_WINDOWS && defined USE_X_TOOLKIT
+
+/* Make menus on frame F appear as specified by the `menu' face. */
+
+static void
+x_update_menu_appearance (f)
+ struct frame *f;
+{
+ struct x_display_info *dpyinfo = FRAME_X_DISPLAY_INFO (f);
+ XrmDatabase rdb;
+
+ if (dpyinfo
+ && (rdb = XrmGetDatabase (FRAME_X_DISPLAY (f)),
+ rdb != NULL))
+ {
+ char line[512];
+ Lisp_Object lface = lface_from_face_name (f, Qmenu, 1);
+ struct face *face = FACE_FROM_ID (f, MENU_FACE_ID);
+ char *myname = XSTRING (Vx_resource_name)->data;
+ int changed_p = 0;
+#ifdef USE_MOTIF
+ const char *popup_path = "popup_menu";
+#else
+ const char *popup_path = "menu.popup";
+#endif
+
+ if (STRINGP (LFACE_FOREGROUND (lface)))
+ {
+ sprintf (line, "%s.%s*foreground: %s",
+ myname, popup_path,
+ XSTRING (LFACE_FOREGROUND (lface))->data);
+ XrmPutLineResource (&rdb, line);
+ sprintf (line, "%s.pane.menubar*foreground: %s",
+ myname, XSTRING (LFACE_FOREGROUND (lface))->data);
+ XrmPutLineResource (&rdb, line);
+ changed_p = 1;
+ }
+
+ if (STRINGP (LFACE_BACKGROUND (lface)))
+ {
+ sprintf (line, "%s.%s*background: %s",
+ myname, popup_path,
+ XSTRING (LFACE_BACKGROUND (lface))->data);
+ XrmPutLineResource (&rdb, line);
+ sprintf (line, "%s.pane.menubar*background: %s",
+ myname, XSTRING (LFACE_BACKGROUND (lface))->data);
+ XrmPutLineResource (&rdb, line);
+ changed_p = 1;
+ }
+
+ if (face->font_name
+ && (!UNSPECIFIEDP (LFACE_FAMILY (lface))
+ || !UNSPECIFIEDP (LFACE_SWIDTH (lface))
+ || !UNSPECIFIEDP (LFACE_AVGWIDTH (lface))
+ || !UNSPECIFIEDP (LFACE_WEIGHT (lface))
+ || !UNSPECIFIEDP (LFACE_SLANT (lface))
+ || !UNSPECIFIEDP (LFACE_HEIGHT (lface))))
+ {
+#ifdef USE_MOTIF
+ const char *suffix = "List";
+#else
+ const char *suffix = "";
+#endif
+ sprintf (line, "%s.pane.menubar*font%s: %s",
+ myname, suffix, face->font_name);
+ XrmPutLineResource (&rdb, line);
+ sprintf (line, "%s.%s*font%s: %s",
+ myname, popup_path, suffix, face->font_name);
+ XrmPutLineResource (&rdb, line);
+ changed_p = 1;
+ }
+
+ if (changed_p && f->output_data.x->menubar_widget)
+ free_frame_menubar (f);
+ }
+}
+
+#endif /* HAVE_X_WINDOWS && USE_X_TOOLKIT */
+
+
+
+DEFUN ("internal-get-lisp-face-attribute", Finternal_get_lisp_face_attribute,
+ Sinternal_get_lisp_face_attribute,
+ 2, 3, 0,
+ "Return face attribute KEYWORD of face SYMBOL.\n\
+If SYMBOL does not name a valid Lisp face or KEYWORD isn't a valid\n\
+face attribute name, signal an error.\n\
+If the optional argument FRAME is given, report on face FACE in that\n\
+frame. If FRAME is t, report on the defaults for face FACE (for new\n\
+frames). If FRAME is omitted or nil, use the selected frame.")
+ (symbol, keyword, frame)
+ Lisp_Object symbol, keyword, frame;
+{
+ Lisp_Object lface, value = Qnil;
+
+ CHECK_SYMBOL (symbol, 0);
+ CHECK_SYMBOL (keyword, 1);
+
+ if (EQ (frame, Qt))
+ lface = lface_from_face_name (NULL, symbol, 1);
+ else
+ {
+ if (NILP (frame))
+ frame = selected_frame;
+ CHECK_LIVE_FRAME (frame, 2);
+ lface = lface_from_face_name (XFRAME (frame), symbol, 1);
+ }
+
+ if (EQ (keyword, QCfamily))
+ value = LFACE_FAMILY (lface);
+ else if (EQ (keyword, QCheight))
+ value = LFACE_HEIGHT (lface);
+ else if (EQ (keyword, QCweight))
+ value = LFACE_WEIGHT (lface);
+ else if (EQ (keyword, QCslant))
+ value = LFACE_SLANT (lface);
+ else if (EQ (keyword, QCunderline))
+ value = LFACE_UNDERLINE (lface);
+ else if (EQ (keyword, QCoverline))
+ value = LFACE_OVERLINE (lface);
+ else if (EQ (keyword, QCstrike_through))
+ value = LFACE_STRIKE_THROUGH (lface);
+ else if (EQ (keyword, QCbox))
+ value = LFACE_BOX (lface);
+ else if (EQ (keyword, QCinverse_video)
+ || EQ (keyword, QCreverse_video))
+ value = LFACE_INVERSE (lface);
+ else if (EQ (keyword, QCforeground))
+ value = LFACE_FOREGROUND (lface);
+ else if (EQ (keyword, QCbackground))
+ value = LFACE_BACKGROUND (lface);
+ else if (EQ (keyword, QCstipple))
+ value = LFACE_STIPPLE (lface);
+ else if (EQ (keyword, QCwidth))
+ value = LFACE_SWIDTH (lface);
+ else if (EQ (keyword, QCinherit))
+ value = LFACE_INHERIT (lface);
+ else if (EQ (keyword, QCfont))
+ value = LFACE_FONT (lface);
+ else
+ signal_error ("Invalid face attribute name", keyword);
+
+ return value;
+}
+
+
+DEFUN ("internal-lisp-face-attribute-values",
+ Finternal_lisp_face_attribute_values,
+ Sinternal_lisp_face_attribute_values, 1, 1, 0,
+ "Return a list of valid discrete values for face attribute ATTR.\n\
+Value is nil if ATTR doesn't have a discrete set of valid values.")
+ (attr)
+ Lisp_Object attr;
+{
+ Lisp_Object result = Qnil;
+
+ CHECK_SYMBOL (attr, 0);
+
+ if (EQ (attr, QCweight)
+ || EQ (attr, QCslant)
+ || EQ (attr, QCwidth))
+ {
+ /* Extract permissible symbols from tables. */
+ struct table_entry *table;
+ int i, dim;
+
+ if (EQ (attr, QCweight))
+ table = weight_table, dim = DIM (weight_table);
+ else if (EQ (attr, QCslant))
+ table = slant_table, dim = DIM (slant_table);
+ else
+ table = swidth_table, dim = DIM (swidth_table);
+
+ for (i = 0; i < dim; ++i)
+ {
+ Lisp_Object symbol = *table[i].symbol;
+ Lisp_Object tail = result;
+
+ while (!NILP (tail)
+ && !EQ (XCAR (tail), symbol))
+ tail = XCDR (tail);
+
+ if (NILP (tail))
+ result = Fcons (symbol, result);
+ }
+ }
+ else if (EQ (attr, QCunderline))
+ result = Fcons (Qt, Fcons (Qnil, Qnil));
+ else if (EQ (attr, QCoverline))
+ result = Fcons (Qt, Fcons (Qnil, Qnil));
+ else if (EQ (attr, QCstrike_through))
+ result = Fcons (Qt, Fcons (Qnil, Qnil));
+ else if (EQ (attr, QCinverse_video) || EQ (attr, QCreverse_video))
+ result = Fcons (Qt, Fcons (Qnil, Qnil));
+
+ return result;
+}
+
+
+DEFUN ("internal-merge-in-global-face", Finternal_merge_in_global_face,
+ Sinternal_merge_in_global_face, 2, 2, 0,
+ "Add attributes from frame-default definition of FACE to FACE on FRAME.\n\
+Default face attributes override any local face attributes.")
+ (face, frame)
+ Lisp_Object face, frame;
+{
+ int i;
+ Lisp_Object global_lface, local_lface, *gvec, *lvec;
+
+ CHECK_LIVE_FRAME (frame, 1);
+ global_lface = lface_from_face_name (NULL, face, 1);
+ local_lface = lface_from_face_name (XFRAME (frame), face, 0);
+ if (NILP (local_lface))
+ local_lface = Finternal_make_lisp_face (face, frame);
+
+ /* Make every specified global attribute override the local one.
+ BEWARE!! This is only used from `face-set-after-frame-default' where
+ the local frame is defined from default specs in `face-defface-spec'
+ and those should be overridden by global settings. Hence the strange
+ "global before local" priority. */
+ lvec = XVECTOR (local_lface)->contents;
+ gvec = XVECTOR (global_lface)->contents;
+ for (i = 1; i < LFACE_VECTOR_SIZE; ++i)
+ if (! UNSPECIFIEDP (gvec[i]))
+ lvec[i] = gvec[i];
+
+ return Qnil;
+}
+
+
+/* The following function is implemented for compatibility with 20.2.
+ The function is used in x-resolve-fonts when it is asked to
+ return fonts with the same size as the font of a face. This is
+ done in fontset.el. */
+
+DEFUN ("face-font", Fface_font, Sface_font, 1, 2, 0,
+ "Return the font name of face FACE, or nil if it is unspecified.\n\
+If the optional argument FRAME is given, report on face FACE in that frame.\n\
+If FRAME is t, report on the defaults for face FACE (for new frames).\n\
+ The font default for a face is either nil, or a list\n\
+ of the form (bold), (italic) or (bold italic).\n\
+If FRAME is omitted or nil, use the selected frame.")
+ (face, frame)
+ Lisp_Object face, frame;
+{
+ if (EQ (frame, Qt))
+ {
+ Lisp_Object result = Qnil;
+ Lisp_Object lface = lface_from_face_name (NULL, face, 1);
+
+ if (!UNSPECIFIEDP (LFACE_WEIGHT (lface))
+ && !EQ (LFACE_WEIGHT (lface), Qnormal))
+ result = Fcons (Qbold, result);
+
+ if (!UNSPECIFIEDP (LFACE_SLANT (lface))
+ && !EQ (LFACE_SLANT (lface), Qnormal))
+ result = Fcons (Qitalic, result);
+
+ return result;
+ }
+ else
+ {
+ struct frame *f = frame_or_selected_frame (frame, 1);
+ int face_id = lookup_named_face (f, face, 0);
+ struct face *face = FACE_FROM_ID (f, face_id);
+ return face ? build_string (face->font_name) : Qnil;
+ }
+}
+
+
+/* Compare face vectors V1 and V2 for equality. Value is non-zero if
+ all attributes are `equal'. Tries to be fast because this function
+ is called quite often. */
+
+static INLINE int
+lface_equal_p (v1, v2)
+ Lisp_Object *v1, *v2;
+{
+ int i, equal_p = 1;
+
+ for (i = 1; i < LFACE_VECTOR_SIZE && equal_p; ++i)
+ {
+ Lisp_Object a = v1[i];
+ Lisp_Object b = v2[i];
+
+ /* Type can differ, e.g. when one attribute is unspecified, i.e. nil,
+ and the other is specified. */
+ equal_p = XTYPE (a) == XTYPE (b);
+ if (!equal_p)
+ break;
+
+ if (!EQ (a, b))
+ {
+ switch (XTYPE (a))
+ {
+ case Lisp_String:
+ equal_p = ((STRING_BYTES (XSTRING (a))
+ == STRING_BYTES (XSTRING (b)))
+ && bcmp (XSTRING (a)->data, XSTRING (b)->data,
+ STRING_BYTES (XSTRING (a))) == 0);
+ break;
+
+ case Lisp_Int:
+ case Lisp_Symbol:
+ equal_p = 0;
+ break;
+
+ default:
+ equal_p = !NILP (Fequal (a, b));
+ break;
+ }
+ }
+ }
+
+ return equal_p;
+}
+
+
+DEFUN ("internal-lisp-face-equal-p", Finternal_lisp_face_equal_p,
+ Sinternal_lisp_face_equal_p, 2, 3, 0,
+ "True if FACE1 and FACE2 are equal.\n\
+If the optional argument FRAME is given, report on face FACE in that frame.\n\
+If FRAME is t, report on the defaults for face FACE (for new frames).\n\
+If FRAME is omitted or nil, use the selected frame.")
+ (face1, face2, frame)
+ Lisp_Object face1, face2, frame;
+{
+ int equal_p;
+ struct frame *f;
+ Lisp_Object lface1, lface2;
+
+ if (EQ (frame, Qt))
+ f = NULL;
+ else
+ /* Don't use check_x_frame here because this function is called
+ before X frames exist. At that time, if FRAME is nil,
+ selected_frame will be used which is the frame dumped with
+ Emacs. That frame is not an X frame. */
+ f = frame_or_selected_frame (frame, 2);
+
+ lface1 = lface_from_face_name (NULL, face1, 1);
+ lface2 = lface_from_face_name (NULL, face2, 1);
+ equal_p = lface_equal_p (XVECTOR (lface1)->contents,
+ XVECTOR (lface2)->contents);
+ return equal_p ? Qt : Qnil;
+}
+
+
+DEFUN ("internal-lisp-face-empty-p", Finternal_lisp_face_empty_p,
+ Sinternal_lisp_face_empty_p, 1, 2, 0,
+ "True if FACE has no attribute specified.\n\
+If the optional argument FRAME is given, report on face FACE in that frame.\n\
+If FRAME is t, report on the defaults for face FACE (for new frames).\n\
+If FRAME is omitted or nil, use the selected frame.")
+ (face, frame)
+ Lisp_Object face, frame;
+{
+ struct frame *f;
+ Lisp_Object lface;
+ int i;
+
+ if (NILP (frame))
+ frame = selected_frame;
+ CHECK_LIVE_FRAME (frame, 0);
+ f = XFRAME (frame);
+
+ if (EQ (frame, Qt))
+ lface = lface_from_face_name (NULL, face, 1);
+ else
+ lface = lface_from_face_name (f, face, 1);
+
+ for (i = 1; i < LFACE_VECTOR_SIZE; ++i)
+ if (!UNSPECIFIEDP (AREF (lface, i)))
+ break;
+
+ return i == LFACE_VECTOR_SIZE ? Qt : Qnil;
+}
+
+
+DEFUN ("frame-face-alist", Fframe_face_alist, Sframe_face_alist,
+ 0, 1, 0,
+ "Return an alist of frame-local faces defined on FRAME.\n\
+For internal use only.")
+ (frame)
+ Lisp_Object frame;
+{
+ struct frame *f = frame_or_selected_frame (frame, 0);
+ return f->face_alist;
+}
+
+
+/* Return a hash code for Lisp string STRING with case ignored. Used
+ below in computing a hash value for a Lisp face. */
+
+static INLINE unsigned
+hash_string_case_insensitive (string)
+ Lisp_Object string;
+{
+ unsigned char *s;
+ unsigned hash = 0;
+ xassert (STRINGP (string));
+ for (s = XSTRING (string)->data; *s; ++s)
+ hash = (hash << 1) ^ tolower (*s);
+ return hash;
+}
+
+
+/* Return a hash code for face attribute vector V. */
+
+static INLINE unsigned
+lface_hash (v)
+ Lisp_Object *v;
+{
+ return (hash_string_case_insensitive (v[LFACE_FAMILY_INDEX])
+ ^ hash_string_case_insensitive (v[LFACE_FOREGROUND_INDEX])
+ ^ hash_string_case_insensitive (v[LFACE_BACKGROUND_INDEX])
+ ^ XFASTINT (v[LFACE_WEIGHT_INDEX])
+ ^ XFASTINT (v[LFACE_SLANT_INDEX])
+ ^ XFASTINT (v[LFACE_SWIDTH_INDEX])
+ ^ XFASTINT (v[LFACE_HEIGHT_INDEX]));
+}
+
+
+/* Return non-zero if LFACE1 and LFACE2 specify the same font (without
+ considering charsets/registries). They do if they specify the same
+ family, point size, weight, width, slant, and fontset. Both LFACE1
+ and LFACE2 must be fully-specified. */
+
+static INLINE int
+lface_same_font_attributes_p (lface1, lface2)
+ Lisp_Object *lface1, *lface2;
+{
+ xassert (lface_fully_specified_p (lface1)
+ && lface_fully_specified_p (lface2));
+ return (xstricmp (XSTRING (lface1[LFACE_FAMILY_INDEX])->data,
+ XSTRING (lface2[LFACE_FAMILY_INDEX])->data) == 0
+ && EQ (lface1[LFACE_HEIGHT_INDEX], lface2[LFACE_HEIGHT_INDEX])
+ && EQ (lface1[LFACE_SWIDTH_INDEX], lface2[LFACE_SWIDTH_INDEX])
+ && EQ (lface1[LFACE_AVGWIDTH_INDEX], lface2[LFACE_AVGWIDTH_INDEX])
+ && EQ (lface1[LFACE_WEIGHT_INDEX], lface2[LFACE_WEIGHT_INDEX])
+ && EQ (lface1[LFACE_SLANT_INDEX], lface2[LFACE_SLANT_INDEX])
+ && (EQ (lface1[LFACE_FONT_INDEX], lface2[LFACE_FONT_INDEX])
+ || (STRINGP (lface1[LFACE_FONT_INDEX])
+ && STRINGP (lface2[LFACE_FONT_INDEX])
+ && xstricmp (XSTRING (lface1[LFACE_FONT_INDEX])->data,
+ XSTRING (lface2[LFACE_FONT_INDEX])->data))));
+}
+
+
+
+/***********************************************************************
+ Realized Faces
+ ***********************************************************************/
+
+/* Allocate and return a new realized face for Lisp face attribute
+ vector ATTR. */
+
+static struct face *
+make_realized_face (attr)
+ Lisp_Object *attr;
+{
+ struct face *face = (struct face *) xmalloc (sizeof *face);
+ bzero (face, sizeof *face);
+ face->ascii_face = face;
+ bcopy (attr, face->lface, sizeof face->lface);
+ return face;
+}
+
+
+/* Free realized face FACE, including its X resources. FACE may
+ be null. */
+
+static void
+free_realized_face (f, face)
+ struct frame *f;
+ struct face *face;
+{
+ if (face)
+ {
+#ifdef HAVE_WINDOW_SYSTEM
+ if (FRAME_WINDOW_P (f))
+ {
+ /* Free the fontset of FACE if it is an ASCII face. */
+ if (face->fontset >= 0 && face == face->ascii_face)
+ free_face_fontset (f, face);
+ if (face->gc)
+ {
+ x_free_gc (f, face->gc);
+ face->gc = 0;
+ }
+
+ free_face_colors (f, face);
+ x_destroy_bitmap (f, face->stipple);
+ }
+#endif /* HAVE_WINDOW_SYSTEM */
+
+ xfree (face);
+ }
+}
+
+
+/* Prepare face FACE for subsequent display on frame F. This
+ allocates GCs if they haven't been allocated yet or have been freed
+ by clearing the face cache. */
+
+void
+prepare_face_for_display (f, face)
+ struct frame *f;
+ struct face *face;
+{
+#ifdef HAVE_WINDOW_SYSTEM
+ xassert (FRAME_WINDOW_P (f));
+
+ if (face->gc == 0)
+ {
+ XGCValues xgcv;
+ unsigned long mask = GCForeground | GCBackground | GCGraphicsExposures;
+
+ xgcv.foreground = face->foreground;
+ xgcv.background = face->background;
+#ifdef HAVE_X_WINDOWS
+ xgcv.graphics_exposures = False;
+#endif
+ /* The font of FACE may be null if we couldn't load it. */
+ if (face->font)
+ {
+#ifdef HAVE_X_WINDOWS
+ xgcv.font = face->font->fid;
+#endif
+#ifdef WINDOWSNT
+ xgcv.font = face->font;
+#endif
+#ifdef macintosh
+ xgcv.font = face->font;
+#endif
+ mask |= GCFont;
+ }
+
+ BLOCK_INPUT;
+#ifdef HAVE_X_WINDOWS
+ if (face->stipple)
+ {
+ xgcv.fill_style = FillOpaqueStippled;
+ xgcv.stipple = x_bitmap_pixmap (f, face->stipple);
+ mask |= GCFillStyle | GCStipple;
+ }
+#endif
+ face->gc = x_create_gc (f, mask, &xgcv);
+ UNBLOCK_INPUT;
+ }
+#endif /* HAVE_WINDOW_SYSTEM */
+}
+
+
+/***********************************************************************
+ Face Cache
+ ***********************************************************************/
+
+/* Return a new face cache for frame F. */
+
+static struct face_cache *
+make_face_cache (f)
+ struct frame *f;
+{
+ struct face_cache *c;
+ int size;
+
+ c = (struct face_cache *) xmalloc (sizeof *c);
+ bzero (c, sizeof *c);
+ size = FACE_CACHE_BUCKETS_SIZE * sizeof *c->buckets;
+ c->buckets = (struct face **) xmalloc (size);
+ bzero (c->buckets, size);
+ c->size = 50;
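+ /* Initial capacity of faces_by_id; cache_face doubles it on demand. */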
+ c->faces_by_id = (struct face **) xmalloc (c->size * sizeof *c->faces_by_id);
+ c->f = f;
+ c->menu_face_changed_p = menu_face_changed_default;
+ return c;
+}
+
+
+/* Clear out all graphics contexts for all realized faces, except for
+ the basic faces. This should be done from time to time just to avoid
+ keeping too many graphics contexts that are no longer needed. */
+
+static void
+clear_face_gcs (c)
+ struct face_cache *c;
+{
+ if (c && FRAME_WINDOW_P (c->f))
+ {
+#ifdef HAVE_WINDOW_SYSTEM
+ int i;
+ for (i = BASIC_FACE_ID_SENTINEL; i < c->used; ++i)
+ {
+ struct face *face = c->faces_by_id[i];
+ if (face && face->gc)
+ {
+ x_free_gc (c->f, face->gc);
+ face->gc = 0;
+ }
+ }
+#endif /* HAVE_WINDOW_SYSTEM */
+ }
+}
+
+
+/* Free all realized faces in face cache C, including basic faces. C
+ may be null. If faces are freed, make sure the frame's current
+ matrix is marked invalid, so that a display caused by an expose
+ event doesn't try to use faces we destroyed. */
+
+static void
+free_realized_faces (c)
+ struct face_cache *c;
+{
+ if (c && c->used)
+ {
+ int i, size;
+ struct frame *f = c->f;
+
+ /* We must block input here because we can't process X events
+ safely while only some faces are freed, or when the frame's
+ current matrix still references freed faces. */
+ BLOCK_INPUT;
+
+ for (i = 0; i < c->used; ++i)
+ {
+ free_realized_face (f, c->faces_by_id[i]);
+ c->faces_by_id[i] = NULL;
+ }
+
+ c->used = 0;
+ size = FACE_CACHE_BUCKETS_SIZE * sizeof *c->buckets;
+ bzero (c->buckets, size);
+
+ /* Must do a thorough redisplay the next time. Mark current
+ matrices as invalid because they will reference faces freed
+ above. This function is also called when a frame is
+ destroyed. In this case, the root window of F is nil. */
+ if (WINDOWP (f->root_window))
+ {
+ clear_current_matrices (f);
+ ++windows_or_buffers_changed;
+ }
+
+ UNBLOCK_INPUT;
+ }
+}
+
+
+/* Free all faces realized for multibyte characters on frame F that
+ use fontset FONTSET. */
+
+void
+free_realized_multibyte_face (f, fontset)
+ struct frame *f;
+ int fontset;
+{
+ struct face_cache *cache = FRAME_FACE_CACHE (f);
+ struct face *face;
+ int i;
+
+ /* We must block input here because we can't process X events safely
+ while only some faces are freed, or when the frame's current
+ matrix still references freed faces. */
+ BLOCK_INPUT;
+
+ for (i = 0; i < cache->used; i++)
+ {
+ face = cache->faces_by_id[i];
+ if (face
+ && face != face->ascii_face
+ && face->fontset == fontset)
+ {
+ uncache_face (cache, face);
+ free_realized_face (f, face);
+ }
+ }
+
+ /* Must do a thorough redisplay the next time. Mark current
+ matrices as invalid because they will reference faces freed
+ above. This function is also called when a frame is destroyed.
+ In this case, the root window of F is nil. */
+ if (WINDOWP (f->root_window))
+ {
+ clear_current_matrices (f);
+ ++windows_or_buffers_changed;
+ }
+
+ UNBLOCK_INPUT;
+}
+
+
+/* Free all realized faces on FRAME or on all frames if FRAME is nil.
+ This is done after attributes of a named face have been changed,
+ because we can't tell which realized faces depend on that face. */
+
+void
+free_all_realized_faces (frame)
+ Lisp_Object frame;
+{
+ if (NILP (frame))
+ {
+ Lisp_Object rest;
+ FOR_EACH_FRAME (rest, frame)
+ free_realized_faces (FRAME_FACE_CACHE (XFRAME (frame)));
+ }
+ else
+ free_realized_faces (FRAME_FACE_CACHE (XFRAME (frame)));
+}
+
+
+/* Free face cache C and faces in it, including their X resources. */
+
+static void
+free_face_cache (c)
+ struct face_cache *c;
+{
+ if (c)
+ {
+ free_realized_faces (c);
+ xfree (c->buckets);
+ xfree (c->faces_by_id);
+ xfree (c);
+ }
+}
+
+
+/* Cache realized face FACE in face cache C. HASH is the hash value
+ of FACE. If FACE->fontset >= 0, add the new face to the end of the
+ collision list of the face hash table of C. This is done because
+ otherwise lookup_face would find FACE for every character, even if
+ faces with the same attributes but for specific characters exist. */
+
+static void
+cache_face (c, face, hash)
+ struct face_cache *c;
+ struct face *face;
+ unsigned hash;
+{
+ int i = hash % FACE_CACHE_BUCKETS_SIZE;
+
+ face->hash = hash;
+
+ if (face->fontset >= 0)
+ {
+ struct face *last = c->buckets[i];
+ if (last)
+ {
+ while (last->next)
+ last = last->next;
+ last->next = face;
+ face->prev = last;
+ face->next = NULL;
+ }
+ else
+ {
+ c->buckets[i] = face;
+ face->prev = face->next = NULL;
+ }
+ }
+ else
+ {
+ face->prev = NULL;
+ face->next = c->buckets[i];
+ if (face->next)
+ face->next->prev = face;
+ c->buckets[i] = face;
+ }
+
+ /* Find a free slot in C->faces_by_id and use the index of the free
+ slot as FACE->id. */
+ for (i = 0; i < c->used; ++i)
+ if (c->faces_by_id[i] == NULL)
+ break;
+ face->id = i;
+
+ /* Maybe enlarge C->faces_by_id. */
+ if (i == c->used && c->used == c->size)
+ {
+ int new_size = 2 * c->size;
+ int sz = new_size * sizeof *c->faces_by_id;
+ c->faces_by_id = (struct face **) xrealloc (c->faces_by_id, sz);
+ c->size = new_size;
+ }
+
+#if GLYPH_DEBUG
+ /* Check that FACE got a unique id. */
+ {
+ int j, n;
+ struct face *face;
+
+ for (j = n = 0; j < FACE_CACHE_BUCKETS_SIZE; ++j)
+ for (face = c->buckets[j]; face; face = face->next)
+ if (face->id == i)
+ ++n;
+
+ xassert (n == 1);
+ }
+#endif /* GLYPH_DEBUG */
+
+ c->faces_by_id[i] = face;
+ if (i == c->used)
+ ++c->used;
+}
+
+
+/* Remove face FACE from cache C. */
+
+static void
+uncache_face (c, face)
+ struct face_cache *c;
+ struct face *face;
+{
+ int i = face->hash % FACE_CACHE_BUCKETS_SIZE;
+
+ if (face->prev)
+ face->prev->next = face->next;
+ else
+ c->buckets[i] = face->next;
+
+ if (face->next)
+ face->next->prev = face->prev;
+
+ c->faces_by_id[face->id] = NULL;
+ if (face->id == c->used)
+ --c->used;
+}
+
+
+/* Look up a realized face with face attributes ATTR in the face cache
+ of frame F. The face will be used to display character C. Value
+ is the ID of the face found. If no suitable face is found, realize
+ a new one. In that case, if C is a multibyte character, BASE_FACE
+ is a face that has the same attributes. */
+
+INLINE int
+lookup_face (f, attr, c, base_face)
+ struct frame *f;
+ Lisp_Object *attr;
+ int c;
+ struct face *base_face;
+{
+ struct face_cache *cache = FRAME_FACE_CACHE (f);
+ unsigned hash;
+ int i;
+ struct face *face;
+
+ xassert (cache != NULL);
+ check_lface_attrs (attr);
+
+ /* Look up ATTR in the face cache. */
+ hash = lface_hash (attr);
+ i = hash % FACE_CACHE_BUCKETS_SIZE;
+
+ for (face = cache->buckets[i]; face; face = face->next)
+ if (face->hash == hash
+ && (!FRAME_WINDOW_P (f)
+ || FACE_SUITABLE_FOR_CHAR_P (face, c))
+ && lface_equal_p (face->lface, attr))
+ break;
+
+ /* If not found, realize a new face. */
+ if (face == NULL)
+ face = realize_face (cache, attr, c, base_face, -1);
+
+#if GLYPH_DEBUG
+ xassert (face == FACE_FROM_ID (f, face->id));
+
+/* When this function is called from face_for_char (in this case, C is
+ a multibyte character), the fontset of the face returned by
+ realize_face is not yet set, i.e. FACE_SUITABLE_FOR_CHAR_P (FACE,
+ C) is not satisfied. The fontset is set for this face by
+ face_for_char later. */
+#if 0
+ if (FRAME_WINDOW_P (f))
+ xassert (FACE_SUITABLE_FOR_CHAR_P (face, c));
+#endif
+#endif /* GLYPH_DEBUG */
+
+ return face->id;
+}
+
+
+/* Return the face id of the realized face for named face SYMBOL on
+ frame F suitable for displaying character C. Value is -1 if the
+ face couldn't be determined, which might happen if the default face
+ isn't realized and cannot be realized. */
+
+int
+lookup_named_face (f, symbol, c)
+ struct frame *f;
+ Lisp_Object symbol;
+ int c;
+{
+ Lisp_Object attrs[LFACE_VECTOR_SIZE];
+ Lisp_Object symbol_attrs[LFACE_VECTOR_SIZE];
+ struct face *default_face = FACE_FROM_ID (f, DEFAULT_FACE_ID);
+
+ if (default_face == NULL)
+ {
+ if (!realize_basic_faces (f))
+ return -1;
+ default_face = FACE_FROM_ID (f, DEFAULT_FACE_ID);
+ }
+
+ get_lface_attributes (f, symbol, symbol_attrs, 1);
+ bcopy (default_face->lface, attrs, sizeof attrs);
+ merge_face_vectors (f, symbol_attrs, attrs, Qnil);
+ return lookup_face (f, attrs, c, NULL);
+}
+
+
+/* Return the ID of the realized ASCII face of Lisp face with ID
+ LFACE_ID on frame F. Value is -1 if LFACE_ID isn't valid. */
+
+int
+ascii_face_of_lisp_face (f, lface_id)
+ struct frame *f;
+ int lface_id;
+{
+ int face_id;
+
+ if (lface_id >= 0 && lface_id < lface_id_to_name_size)
+ {
+ Lisp_Object face_name = lface_id_to_name[lface_id];
+ face_id = lookup_named_face (f, face_name, 0);
+ }
+ else
+ face_id = -1;
+
+ return face_id;
+}
+
+
+/* Return a face for charset ASCII that is like the face with id
+ FACE_ID on frame F, but has a font that is STEPS steps smaller.
+ STEPS < 0 means larger. Value is the id of the face. */
+
+int
+smaller_face (f, face_id, steps)
+ struct frame *f;
+ int face_id, steps;
+{
+#ifdef HAVE_WINDOW_SYSTEM
+ struct face *face;
+ Lisp_Object attrs[LFACE_VECTOR_SIZE];
+ int pt, last_pt, last_height;
+ int delta;
+ int new_face_id;
+ struct face *new_face;
+
+ /* If not called for an X frame, just return the original face. */
+ if (FRAME_TERMCAP_P (f))
+ return face_id;
+
+ /* Try in increments of 1/2 pt. */
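+ /* Face heights are stored in units of 1/10 pt, so a delta of 5
+ amounts to half a point. */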
+ delta = steps < 0 ? 5 : -5;
+ steps = abs (steps);
+
+ face = FACE_FROM_ID (f, face_id);
+ bcopy (face->lface, attrs, sizeof attrs);
+ pt = last_pt = XFASTINT (attrs[LFACE_HEIGHT_INDEX]);
+ new_face_id = face_id;
+ last_height = FONT_HEIGHT (face->font);
+
+ while (steps
+ && pt + delta > 0
+ /* Give up if we cannot find a font within 10pt. */
+ && abs (last_pt - pt) < 100)
+ {
+ /* Look up a face for a slightly smaller/larger font. */
+ pt += delta;
+ attrs[LFACE_HEIGHT_INDEX] = make_number (pt);
+ new_face_id = lookup_face (f, attrs, 0, NULL);
+ new_face = FACE_FROM_ID (f, new_face_id);
+
+ /* If height changes, count that as one step. */
+ if ((delta < 0 && FONT_HEIGHT (new_face->font) < last_height)
+ || (delta > 0 && FONT_HEIGHT (new_face->font) > last_height))
+ {
+ --steps;
+ last_height = FONT_HEIGHT (new_face->font);
+ last_pt = pt;
+ }
+ }
+
+ return new_face_id;
+
+#else /* not HAVE_WINDOW_SYSTEM */
+
+ return face_id;
+
+#endif /* not HAVE_WINDOW_SYSTEM */
+}
+
+
+/* Return a face for charset ASCII that is like the face with id
+ FACE_ID on frame F, but has height HEIGHT. */
+
+int
+face_with_height (f, face_id, height)
+ struct frame *f;
+ int face_id;
+ int height;
+{
+#ifdef HAVE_WINDOW_SYSTEM
+ struct face *face;
+ Lisp_Object attrs[LFACE_VECTOR_SIZE];
+
+ if (FRAME_TERMCAP_P (f)
+ || height <= 0)
+ return face_id;
+
+ face = FACE_FROM_ID (f, face_id);
+ bcopy (face->lface, attrs, sizeof attrs);
+ attrs[LFACE_HEIGHT_INDEX] = make_number (height);
+ face_id = lookup_face (f, attrs, 0, NULL);
+#endif /* HAVE_WINDOW_SYSTEM */
+
+ return face_id;
+}
+
+
+/* Return the face id of the realized face for named face SYMBOL on
+ frame F suitable for displaying character C, and use attributes of
+ the face FACE_ID for attributes that aren't completely specified by
+ SYMBOL. This is like lookup_named_face, except that the default
+ attributes come from FACE_ID, not from the default face. FACE_ID
+ is assumed to be already realized. */
+
+int
+lookup_derived_face (f, symbol, c, face_id)
+ struct frame *f;
+ Lisp_Object symbol;
+ int c;
+ int face_id;
+{
+ Lisp_Object attrs[LFACE_VECTOR_SIZE];
+ Lisp_Object symbol_attrs[LFACE_VECTOR_SIZE];
+ struct face *default_face = FACE_FROM_ID (f, face_id);
+
+ if (!default_face)
+ abort ();
+
+ get_lface_attributes (f, symbol, symbol_attrs, 1);
+ bcopy (default_face->lface, attrs, sizeof attrs);
+ merge_face_vectors (f, symbol_attrs, attrs, Qnil);
+ return lookup_face (f, attrs, c, default_face);
+}
+
+
+
+/***********************************************************************
+ Font selection
+ ***********************************************************************/
+
+DEFUN ("internal-set-font-selection-order",
+ Finternal_set_font_selection_order,
+ Sinternal_set_font_selection_order, 1, 1, 0,
+ "Set font selection order for face font selection to ORDER.\n\
+ORDER must be a list of length 4 containing the symbols `:width',\n\
+`:height', `:weight', and `:slant'. Face attributes appearing\n\
+first in ORDER are matched first, e.g. if `:height' appears before\n\
+`:weight' in ORDER, font selection first tries to find a font with\n\
+a suitable height, and then tries to match the font weight.\n\
+Value is ORDER.")
+ (order)
+ Lisp_Object order;
+{
+ Lisp_Object list;
+ int i;
+ int indices[DIM (font_sort_order)];
+
+ CHECK_LIST (order, 0);
+ bzero (indices, sizeof indices);
+ i = 0;
+
+ for (list = order;
+ CONSP (list) && i < DIM (indices);
+ list = XCDR (list), ++i)
+ {
+ Lisp_Object attr = XCAR (list);
+ int xlfd;
+
+ if (EQ (attr, QCwidth))
+ xlfd = XLFD_SWIDTH;
+ else if (EQ (attr, QCheight))
+ xlfd = XLFD_POINT_SIZE;
+ else if (EQ (attr, QCweight))
+ xlfd = XLFD_WEIGHT;
+ else if (EQ (attr, QCslant))
+ xlfd = XLFD_SLANT;
+ else
+ break;
+
+ if (indices[i] != 0)
+ break;
+ indices[i] = xlfd;
+ }
+
+ if (!NILP (list) || i != DIM (indices))
+ signal_error ("Invalid font sort order", order);
+ for (i = 0; i < DIM (font_sort_order); ++i)
+ if (indices[i] == 0)
+ signal_error ("Invalid font sort order", order);
+
+ if (bcmp (indices, font_sort_order, sizeof indices) != 0)
+ {
+ bcopy (indices, font_sort_order, sizeof font_sort_order);
+ free_all_realized_faces (Qnil);
+ }
+
+ return Qnil;
+}
+
+
+DEFUN ("internal-set-alternative-font-family-alist",
+ Finternal_set_alternative_font_family_alist,
+ Sinternal_set_alternative_font_family_alist, 1, 1, 0,
+ "Define alternative font families to try in face font selection.\n\
+ALIST is an alist of (FAMILY ALTERNATIVE1 ALTERNATIVE2 ...) entries.\n\
+Each ALTERNATIVE is tried in order if no fonts of font family FAMILY can\n\
+be found. Value is ALIST.")
+ (alist)
+ Lisp_Object alist;
+{
+ CHECK_LIST (alist, 0);
+ Vface_alternative_font_family_alist = alist;
+ free_all_realized_faces (Qnil);
+ return alist;
+}
+
+
+DEFUN ("internal-set-alternative-font-registry-alist",
+ Finternal_set_alternative_font_registry_alist,
+ Sinternal_set_alternative_font_registry_alist, 1, 1, 0,
+ "Define alternative font registries to try in face font selection.\n\
+ALIST is an alist of (REGISTRY ALTERNATIVE1 ALTERNATIVE2 ...) entries.\n\
+Each ALTERNATIVE is tried in order if no fonts of font registry REGISTRY can\n\
+be found. Value is ALIST.")
+ (alist)
+ Lisp_Object alist;
+{
+ CHECK_LIST (alist, 0);
+ Vface_alternative_font_registry_alist = alist;
+ free_all_realized_faces (Qnil);
+ return alist;
+}
+
+
+#ifdef HAVE_WINDOW_SYSTEM
+
+/* Value is non-zero if FONT is the name of a scalable font. The
+ X11R6 XLFD spec says that point size, pixel size, and average width
+ are zero for scalable fonts. Intlfonts contain at least one
+ scalable font ("*-muleindian-1") for which this isn't true, so we
+ just test average width. */
+
+static int
+font_scalable_p (font)
+ struct font_name *font;
+{
+ char *s = font->fields[XLFD_AVGWIDTH];
+ return (*s == '0' && *(s + 1) == '\0')
+#ifdef WINDOWSNT
+ /* Windows implementation of XLFD is slightly broken for backward
+ compatibility with previous broken versions, so test for
+ wildcards as well as 0. */
+ || *s == '*'
+#endif
+ ;
+}
+
+
+/* Ignore the difference of font point size less than this value. */
+
+#define FONT_POINT_SIZE_QUANTUM 5
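+/* XLFD point sizes are expressed in tenths of a point, so this
+ quantum corresponds to half a point. */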
+
+/* Value is non-zero if FONT1 is a better match for font attributes
+ VALUES than FONT2. VALUES is an array of face attribute values in
+ font sort order. COMPARE_PT_P zero means don't compare point
+ sizes. AVGWIDTH, if not zero, is a specified font average width
+ to compare with. */
+
+static int
+better_font_p (values, font1, font2, compare_pt_p, avgwidth)
+ int *values;
+ struct font_name *font1, *font2;
+ int compare_pt_p, avgwidth;
+{
+ int i;
+
+ for (i = 0; i < DIM (font_sort_order); ++i)
+ {
+ int xlfd_idx = font_sort_order[i];
+
+ if (compare_pt_p || xlfd_idx != XLFD_POINT_SIZE)
+ {
+ int delta1 = abs (values[i] - font1->numeric[xlfd_idx]);
+ int delta2 = abs (values[i] - font2->numeric[xlfd_idx]);
+
+ if (xlfd_idx == XLFD_POINT_SIZE
+ && abs (delta1 - delta2) < FONT_POINT_SIZE_QUANTUM)
+ continue;
+ if (delta1 > delta2)
+ return 0;
+ else if (delta1 < delta2)
+ return 1;
+ else
+ {
+ /* The difference may be equal because, e.g., the face
+ specifies `italic' but we have only `regular' and
+ `oblique'. Prefer `oblique' in this case. */
+ if ((xlfd_idx == XLFD_WEIGHT || xlfd_idx == XLFD_SLANT)
+ && font1->numeric[xlfd_idx] > values[i]
+ && font2->numeric[xlfd_idx] < values[i])
+ return 1;
+ }
+ }
+ }
+
+ if (avgwidth)
+ {
+ int delta1 = abs (avgwidth - font1->numeric[XLFD_AVGWIDTH]);
+ int delta2 = abs (avgwidth - font2->numeric[XLFD_AVGWIDTH]);
+ if (delta1 > delta2)
+ return 0;
+ else if (delta1 < delta2)
+ return 1;
+ }
+
+ return font1->registry_priority < font2->registry_priority;
+}
+
+
+/* Value is non-zero if FONT is an exact match for face attributes in
+ SPECIFIED. SPECIFIED is an array of face attribute values in font
+ sort order. AVGWIDTH, if non-zero, is an average width to compare
+ with. */
+
+static int
+exact_face_match_p (specified, font, avgwidth)
+ int *specified;
+ struct font_name *font;
+ int avgwidth;
+{
+ int i;
+
+ for (i = 0; i < DIM (font_sort_order); ++i)
+ if (specified[i] != font->numeric[font_sort_order[i]])
+ break;
+
+ return (i == DIM (font_sort_order)
+ && (avgwidth <= 0
+ || avgwidth == font->numeric[XLFD_AVGWIDTH]));
+}
+
+
+/* Value is the name of a scaled font, generated from scalable font
+ FONT on frame F. SPECIFIED_PT is the point-size to scale FONT to.
+ Value is allocated from heap. */
+
+static char *
+build_scalable_font_name (f, font, specified_pt)
+ struct frame *f;
+ struct font_name *font;
+ int specified_pt;
+{
+ char point_size[20], pixel_size[20];
+ int pixel_value;
+ double resy = FRAME_X_DISPLAY_INFO (f)->resy;
+ double pt;
+
+ /* If the scalable font is for a specific resolution, compute
+ the point size we must specify from the resolution of
+ the display and the specified resolution of the font. */
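+ /* Note: SPECIFIED_PT is in tenths of a point, which is why the
+ conversions to pixels below divide by PT_PER_INCH * 10. */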
+ if (font->numeric[XLFD_RESY] != 0)
+ {
+ pt = resy / font->numeric[XLFD_RESY] * specified_pt + 0.5;
+ pixel_value = font->numeric[XLFD_RESY] / (PT_PER_INCH * 10.0) * pt;
+ }
+ else
+ {
+ pt = specified_pt;
+ pixel_value = resy / (PT_PER_INCH * 10.0) * pt;
+ }
+
+ /* Set point size of the font. */
+ sprintf (point_size, "%d", (int) pt);
+ font->fields[XLFD_POINT_SIZE] = point_size;
+ font->numeric[XLFD_POINT_SIZE] = pt;
+
+ /* Set pixel size. */
+ sprintf (pixel_size, "%d", pixel_value);
+ font->fields[XLFD_PIXEL_SIZE] = pixel_size;
+ font->numeric[XLFD_PIXEL_SIZE] = pixel_value;
+
+ /* If font doesn't specify its resolution, use the
+ resolution of the display. */
+ if (font->numeric[XLFD_RESY] == 0)
+ {
+ char buffer[20];
+ sprintf (buffer, "%d", (int) resy);
+ font->fields[XLFD_RESY] = buffer;
+ font->numeric[XLFD_RESY] = resy;
+ }
+
+ if (strcmp (font->fields[XLFD_RESX], "0") == 0)
+ {
+ char buffer[20];
+ int resx = FRAME_X_DISPLAY_INFO (f)->resx;
+ sprintf (buffer, "%d", resx);
+ font->fields[XLFD_RESX] = buffer;
+ font->numeric[XLFD_RESX] = resx;
+ }
+
+ return build_font_name (font);
+}
+
+
+/* Value is non-zero if we are allowed to use scalable font FONT. We
+ can't run a Lisp function here since this function may be called
+ with input blocked. */
+
+static int
+may_use_scalable_font_p (font)
+ char *font;
+{
+ if (EQ (Vscalable_fonts_allowed, Qt))
+ return 1;
+ else if (CONSP (Vscalable_fonts_allowed))
+ {
+ Lisp_Object tail, regexp;
+
+ for (tail = Vscalable_fonts_allowed; CONSP (tail); tail = XCDR (tail))
+ {
+ regexp = XCAR (tail);
+ if (STRINGP (regexp)
+ && fast_c_string_match_ignore_case (regexp, font) >= 0)
+ return 1;
+ }
+ }
+
+ return 0;
+}
+
+
+
+/* Return the name of the best matching font for face attributes ATTRS
+ in the array of font_name structures FONTS which contains NFONTS
+ elements. WIDTH_RATIO is a factor with which to multiply average
+ widths if ATTRS specifies such a width.
+
+ Value is a font name which is allocated from the heap. FONTS is
+ freed by this function. */
+
+static char *
+best_matching_font (f, attrs, fonts, nfonts, width_ratio)
+ struct frame *f;
+ Lisp_Object *attrs;
+ struct font_name *fonts;
+ int nfonts;
+ int width_ratio;
+{
+ char *font_name;
+ struct font_name *best;
+ int i, pt = 0;
+ int specified[5];
+ int exact_p, avgwidth;
+
+ if (nfonts == 0)
+ return NULL;
+
+ /* Make specified font attributes available in `specified',
+ indexed by sort order. */
+ for (i = 0; i < DIM (font_sort_order); ++i)
+ {
+ int xlfd_idx = font_sort_order[i];
+
+ if (xlfd_idx == XLFD_SWIDTH)
+ specified[i] = face_numeric_swidth (attrs[LFACE_SWIDTH_INDEX]);
+ else if (xlfd_idx == XLFD_POINT_SIZE)
+ specified[i] = pt = XFASTINT (attrs[LFACE_HEIGHT_INDEX]);
+ else if (xlfd_idx == XLFD_WEIGHT)
+ specified[i] = face_numeric_weight (attrs[LFACE_WEIGHT_INDEX]);
+ else if (xlfd_idx == XLFD_SLANT)
+ specified[i] = face_numeric_slant (attrs[LFACE_SLANT_INDEX]);
+ else
+ abort ();
+ }
+
+ avgwidth = (UNSPECIFIEDP (attrs[LFACE_AVGWIDTH_INDEX])
+ ? 0
+ : XFASTINT (attrs[LFACE_AVGWIDTH_INDEX]) * width_ratio);
+
+ exact_p = 0;
+
+ /* Start with the first non-scalable font in the list. */
+ for (i = 0; i < nfonts; ++i)
+ if (!font_scalable_p (fonts + i))
+ break;
+
+ /* Find the best match among the non-scalable fonts. */
+ if (i < nfonts)
+ {
+ best = fonts + i;
+
+ for (i = 1; i < nfonts; ++i)
+ if (!font_scalable_p (fonts + i)
+ && better_font_p (specified, fonts + i, best, 1, avgwidth))
+ {
+ best = fonts + i;
+
+ exact_p = exact_face_match_p (specified, best, avgwidth);
+ if (exact_p)
+ break;
+ }
+
+ }
+ else
+ best = NULL;
+
+ /* Unless we found an exact match among non-scalable fonts, see if
+ we can find a better match among scalable fonts. */
+ if (!exact_p)
+ {
+ /* A scalable font is better if
+
+ 1. its weight, slant, and swidth attributes are better, or
+
+ 2. the best non-scalable font doesn't have the required
+ point size, and the scalable font's weight, slant, and swidth
+ aren't worse. */
+
+ int non_scalable_has_exact_height_p;
+
+ if (best && best->numeric[XLFD_POINT_SIZE] == pt)
+ non_scalable_has_exact_height_p = 1;
+ else
+ non_scalable_has_exact_height_p = 0;
+
+ for (i = 0; i < nfonts; ++i)
+ if (font_scalable_p (fonts + i))
+ {
+ if (best == NULL
+ || better_font_p (specified, fonts + i, best, 0, 0)
+ || (!non_scalable_has_exact_height_p
+ && !better_font_p (specified, best, fonts + i, 0, 0)))
+ best = fonts + i;
+ }
+ }
+
+ if (font_scalable_p (best))
+ font_name = build_scalable_font_name (f, best, pt);
+ else
+ font_name = build_font_name (best);
+
+ /* Free font_name structures. */
+ free_font_names (fonts, nfonts);
+
+ return font_name;
+}
+
+
+/* Get a list of matching fonts on frame F, considering FAMILY
+ and alternative font families from Vface_alternative_font_family_alist.
+
+ FAMILY is the font family whose alternatives are considered.
+
+ REGISTRY, if a string, specifies a font registry and encoding to
+ match. A value of nil means include fonts of any registry and
+ encoding.
+
+ Return in *FONTS a pointer to a vector of font_name structures for
+ the fonts matched. Value is the number of fonts found. */
+
+static int
+try_alternative_families (f, family, registry, fonts)
+ struct frame *f;
+ Lisp_Object family, registry;
+ struct font_name **fonts;
+{
+ Lisp_Object alter;
+ int nfonts = 0;
+
+ nfonts = font_list (f, Qnil, family, registry, fonts);
+ if (nfonts == 0)
+ {
+ /* Try alternative font families. */
+ alter = Fassoc (family, Vface_alternative_font_family_alist);
+ if (CONSP (alter))
+ {
+ for (alter = XCDR (alter);
+ CONSP (alter) && nfonts == 0;
+ alter = XCDR (alter))
+ {
+ if (STRINGP (XCAR (alter)))
+ nfonts = font_list (f, Qnil, XCAR (alter), registry, fonts);
+ }
+ }
+
+ /* Try scalable fonts before giving up. */
+ if (nfonts == 0 && NILP (Vscalable_fonts_allowed))
+ {
+ int count = BINDING_STACK_SIZE ();
+ specbind (Qscalable_fonts_allowed, Qt);
+ nfonts = try_alternative_families (f, family, registry, fonts);
+ unbind_to (count, Qnil);
+ }
+ }
+ return nfonts;
+}
+
+
+/* Get a list of matching fonts on frame F.
+
+ FAMILY, if a string, specifies a font family derived from the fontset.
+ It is only used if the face does not specify any family in ATTRS or
+ if we cannot find any font of the face's family.
+
+ REGISTRY, if a string, specifies a font registry and encoding to
+ match. A value of nil means include fonts of any registry and
+ encoding.
+
+ Return in *FONTS a pointer to a vector of font_name structures for
+ the fonts matched. Value is the number of fonts found. */
+
+static int
+try_font_list (f, attrs, family, registry, fonts)
+ struct frame *f;
+ Lisp_Object *attrs;
+ Lisp_Object family, registry;
+ struct font_name **fonts;
+{
+ int nfonts = 0;
+ Lisp_Object face_family = attrs[LFACE_FAMILY_INDEX];
+
+ if (STRINGP (face_family))
+ nfonts = try_alternative_families (f, face_family, registry, fonts);
+
+ if (nfonts == 0 && !NILP (family))
+ nfonts = try_alternative_families (f, family, registry, fonts);
+
+ /* Try font family of the default face or "fixed". */
+ if (nfonts == 0)
+ {
+ struct face *default_face = FACE_FROM_ID (f, DEFAULT_FACE_ID);
+ if (default_face)
+ family = default_face->lface[LFACE_FAMILY_INDEX];
+ else
+ family = build_string ("fixed");
+ nfonts = font_list (f, Qnil, family, registry, fonts);
+ }
+
+ /* Try any family with the given registry. */
+ if (nfonts == 0)
+ nfonts = font_list (f, Qnil, Qnil, registry, fonts);
+
+ return nfonts;
+}
+
+
+/* Return the fontset id of the base fontset name or alias name given
+ by the fontset attribute of ATTRS. Value is -1 if the fontset
+ attribute of ATTRS doesn't name a fontset. */
+
+static int
+face_fontset (attrs)
+ Lisp_Object *attrs;
+{
+ Lisp_Object name;
+
+ name = attrs[LFACE_FONT_INDEX];
+ if (!STRINGP (name))
+ return -1;
+ return fs_query_fontset (name, 0);
+}
+
+
+/* Choose a name of font to use on frame F to display character C with
+ Lisp face attributes specified by ATTRS. The font name is
+ determined by the font-related attributes in ATTRS and the name
+ pattern for C in FONTSET. Value is the font name which is
+ allocated from the heap and must be freed by the caller, or NULL if
+ we can get no information about the font name of C. It is assured
+ that we always get some information for a single byte
+ character. */
+
+static char *
+choose_face_font (f, attrs, fontset, c)
+ struct frame *f;
+ Lisp_Object *attrs;
+ int fontset, c;
+{
+ Lisp_Object pattern;
+ char *font_name = NULL;
+ struct font_name *fonts;
+ int nfonts, width_ratio;
+
+ /* Get (foundry and) family name and registry (and encoding) name of
+ a font for C. */
+ pattern = fontset_font_pattern (f, fontset, c);
+ if (NILP (pattern))
+ {
+ xassert (!SINGLE_BYTE_CHAR_P (c));
+ return NULL;
+ }
+
+ /* If what we got is a name pattern, return it. */
+ if (STRINGP (pattern))
+ return xstrdup (XSTRING (pattern)->data);
+
+ /* Get a list of fonts matching that pattern and choose the
+ best match for the specified face attributes from it. */
+ nfonts = try_font_list (f, attrs, XCAR (pattern), XCDR (pattern), &fonts);
+ width_ratio = (SINGLE_BYTE_CHAR_P (c)
+ ? 1
+ : CHARSET_WIDTH (CHAR_CHARSET (c)));
+ font_name = best_matching_font (f, attrs, fonts, nfonts, width_ratio);
+ return font_name;
+}
+
+#endif /* HAVE_WINDOW_SYSTEM */
+
+
+
+/***********************************************************************
+ Face Realization
+ ***********************************************************************/
+
+/* Realize basic faces on frame F. Value is zero if frame parameters
+ of F don't contain enough information needed to realize the default
+ face. */
+
+static int
+realize_basic_faces (f)
+ struct frame *f;
+{
+ int success_p = 0;
+ int count = BINDING_STACK_SIZE ();
+
+ /* Block input here so that we won't be surprised by an X expose
+ event, for instance, without having the faces set up. */
+ BLOCK_INPUT;
+ specbind (Qscalable_fonts_allowed, Qt);
+
+ if (realize_default_face (f))
+ {
+ realize_named_face (f, Qmode_line, MODE_LINE_FACE_ID);
+ realize_named_face (f, Qtool_bar, TOOL_BAR_FACE_ID);
+ realize_named_face (f, Qfringe, BITMAP_AREA_FACE_ID);
+ realize_named_face (f, Qheader_line, HEADER_LINE_FACE_ID);
+ realize_named_face (f, Qscroll_bar, SCROLL_BAR_FACE_ID);
+ realize_named_face (f, Qborder, BORDER_FACE_ID);
+ realize_named_face (f, Qcursor, CURSOR_FACE_ID);
+ realize_named_face (f, Qmouse, MOUSE_FACE_ID);
+ realize_named_face (f, Qmenu, MENU_FACE_ID);
+
+ /* Reflect changes in the `menu' face in menu bars. */
+ if (FRAME_FACE_CACHE (f)->menu_face_changed_p)
+ {
+ FRAME_FACE_CACHE (f)->menu_face_changed_p = 0;
+#ifdef USE_X_TOOLKIT
+ x_update_menu_appearance (f);
+#endif
+ }
+
+ success_p = 1;
+ }
+
+ unbind_to (count, Qnil);
+ UNBLOCK_INPUT;
+ return success_p;
+}
+
+
+/* Realize the default face on frame F. If the face is not fully
+ specified, make it fully-specified. Attributes of the default face
+ that are not explicitly specified are taken from frame parameters. */
+
+static int
+realize_default_face (f)
+ struct frame *f;
+{
+ struct face_cache *c = FRAME_FACE_CACHE (f);
+ Lisp_Object lface;
+ Lisp_Object attrs[LFACE_VECTOR_SIZE];
+ Lisp_Object frame_font;
+ struct face *face;
+
+ /* If the `default' face is not yet known, create it. */
+ lface = lface_from_face_name (f, Qdefault, 0);
+ if (NILP (lface))
+ {
+ Lisp_Object frame;
+ XSETFRAME (frame, f);
+ lface = Finternal_make_lisp_face (Qdefault, frame);
+ }
+
+#ifdef HAVE_WINDOW_SYSTEM
+ if (FRAME_WINDOW_P (f))
+ {
+ /* Set frame_font to the value of the `font' frame parameter. */
+ frame_font = Fassq (Qfont, f->param_alist);
+ xassert (CONSP (frame_font) && STRINGP (XCDR (frame_font)));
+ frame_font = XCDR (frame_font);
+ set_lface_from_font_name (f, lface, frame_font, 1, 1);
+ }
+#endif /* HAVE_WINDOW_SYSTEM */
+
+ if (!FRAME_WINDOW_P (f))
+ {
+ LFACE_FAMILY (lface) = build_string ("default");
+ LFACE_SWIDTH (lface) = Qnormal;
+ LFACE_HEIGHT (lface) = make_number (1);
+ LFACE_WEIGHT (lface) = Qnormal;
+ LFACE_SLANT (lface) = Qnormal;
+ LFACE_AVGWIDTH (lface) = Qunspecified;
+ }
+
+ if (UNSPECIFIEDP (LFACE_UNDERLINE (lface)))
+ LFACE_UNDERLINE (lface) = Qnil;
+
+ if (UNSPECIFIEDP (LFACE_OVERLINE (lface)))
+ LFACE_OVERLINE (lface) = Qnil;
+
+ if (UNSPECIFIEDP (LFACE_STRIKE_THROUGH (lface)))
+ LFACE_STRIKE_THROUGH (lface) = Qnil;
+
+ if (UNSPECIFIEDP (LFACE_BOX (lface)))
+ LFACE_BOX (lface) = Qnil;
+
+ if (UNSPECIFIEDP (LFACE_INVERSE (lface)))
+ LFACE_INVERSE (lface) = Qnil;
+
+ if (UNSPECIFIEDP (LFACE_FOREGROUND (lface)))
+ {
+ /* This function is called so early that colors are not yet
+ set in the frame parameter list. */
+ Lisp_Object color = Fassq (Qforeground_color, f->param_alist);
+
+ if (CONSP (color) && STRINGP (XCDR (color)))
+ LFACE_FOREGROUND (lface) = XCDR (color);
+ else if (FRAME_WINDOW_P (f))
+ return 0;
+ else if (FRAME_TERMCAP_P (f) || FRAME_MSDOS_P (f))
+ LFACE_FOREGROUND (lface) = build_string (unspecified_fg);
+ else
+ abort ();
+ }
+
+ if (UNSPECIFIEDP (LFACE_BACKGROUND (lface)))
+ {
+ /* This function is called so early that colors are not yet
+ set in the frame parameter list. */
+ Lisp_Object color = Fassq (Qbackground_color, f->param_alist);
+ if (CONSP (color) && STRINGP (XCDR (color)))
+ LFACE_BACKGROUND (lface) = XCDR (color);
+ else if (FRAME_WINDOW_P (f))
+ return 0;
+ else if (FRAME_TERMCAP_P (f) || FRAME_MSDOS_P (f))
+ LFACE_BACKGROUND (lface) = build_string (unspecified_bg);
+ else
+ abort ();
+ }
+
+ if (UNSPECIFIEDP (LFACE_STIPPLE (lface)))
+ LFACE_STIPPLE (lface) = Qnil;
+
+ /* Realize the face; it must be fully-specified now. */
+ xassert (lface_fully_specified_p (XVECTOR (lface)->contents));
+ check_lface (lface);
+ bcopy (XVECTOR (lface)->contents, attrs, sizeof attrs);
+ face = realize_face (c, attrs, 0, NULL, DEFAULT_FACE_ID);
+ return 1;
+}
+
+
+/* Realize basic faces other than the default face in face cache C.
+ SYMBOL is the face name, ID is the face id the realized face must
+ have. The default face must have been realized already. */
+
+static void
+realize_named_face (f, symbol, id)
+ struct frame *f;
+ Lisp_Object symbol;
+ int id;
+{
+ struct face_cache *c = FRAME_FACE_CACHE (f);
+ Lisp_Object lface = lface_from_face_name (f, symbol, 0);
+ Lisp_Object attrs[LFACE_VECTOR_SIZE];
+ Lisp_Object symbol_attrs[LFACE_VECTOR_SIZE];
+ struct face *new_face;
+
+ /* The default face must exist and be fully specified. */
+ get_lface_attributes (f, Qdefault, attrs, 1);
+ check_lface_attrs (attrs);
+ xassert (lface_fully_specified_p (attrs));
+
+ /* If SYMBOL isn't known as a face, create it. */
+ if (NILP (lface))
+ {
+ Lisp_Object frame;
+ XSETFRAME (frame, f);
+ lface = Finternal_make_lisp_face (symbol, frame);
+ }
+
+ /* Merge SYMBOL's face with the default face. */
+ get_lface_attributes (f, symbol, symbol_attrs, 1);
+ merge_face_vectors (f, symbol_attrs, attrs, Qnil);
+
+ /* Realize the face. */
+ new_face = realize_face (c, attrs, 0, NULL, id);
+}
+
+
+/* Realize the fully-specified face with attributes ATTRS in face
+ cache CACHE for character C. If C is a multibyte character,
+ BASE_FACE is a face that has the same attributes. Otherwise,
+ BASE_FACE is ignored. If FORMER_FACE_ID is non-negative, it is an
+ ID of face to remove before caching the new face. Value is a
+ pointer to the newly created realized face. */
+
+static struct face *
+realize_face (cache, attrs, c, base_face, former_face_id)
+ struct face_cache *cache;
+ Lisp_Object *attrs;
+ int c;
+ struct face *base_face;
+ int former_face_id;
+{
+ struct face *face;
+
+ /* LFACE must be fully specified. */
+ xassert (cache != NULL);
+ check_lface_attrs (attrs);
+
+ if (former_face_id >= 0 && cache->used > former_face_id)
+ {
+ /* Remove the former face. */
+ struct face *former_face = cache->faces_by_id[former_face_id];
+ uncache_face (cache, former_face);
+ free_realized_face (cache->f, former_face);
+ }
+
+ if (FRAME_WINDOW_P (cache->f))
+ face = realize_x_face (cache, attrs, c, base_face);
+ else if (FRAME_TERMCAP_P (cache->f) || FRAME_MSDOS_P (cache->f))
+ face = realize_tty_face (cache, attrs, c);
+ else
+ abort ();
+
+ /* Insert the new face. */
+ cache_face (cache, face, lface_hash (attrs));
+#ifdef HAVE_WINDOW_SYSTEM
+ if (FRAME_WINDOW_P (cache->f) && face->font == NULL)
+ load_face_font (cache->f, face, c);
+#endif /* HAVE_WINDOW_SYSTEM */
+ return face;
+}
+
+
+/* Realize the fully-specified face with attributes ATTRS in face
+ cache CACHE for character C. Do it for X frame CACHE->f. If C is
+ a multibyte character, BASE_FACE is a face that has the same
+ attributes. Otherwise, BASE_FACE is ignored. If the new face
+ doesn't share font with the default face, a fontname is allocated
+ from the heap and set in `font_name' of the new face, but it is not
+ yet loaded here. Value is a pointer to the newly created realized
+ face. */
+
+static struct face *
+realize_x_face (cache, attrs, c, base_face)
+ struct face_cache *cache;
+ Lisp_Object *attrs;
+ int c;
+ struct face *base_face;
+{
+#ifdef HAVE_WINDOW_SYSTEM
+ struct face *face, *default_face;
+ struct frame *f;
+ Lisp_Object stipple, overline, strike_through, box;
+
+ xassert (FRAME_WINDOW_P (cache->f));
+ xassert (SINGLE_BYTE_CHAR_P (c)
+ || base_face);
+
+ /* Allocate a new realized face. */
+ face = make_realized_face (attrs);
+
+ f = cache->f;
+
+ /* If C is a multibyte character, we share all face attributes with
+ BASE_FACE including the realized fontset. But, we must load a
+ different font. */
+ if (!SINGLE_BYTE_CHAR_P (c))
+ {
+ bcopy (base_face, face, sizeof *face);
+ face->gc = 0;
+
+ /* Don't try to free the colors copied bitwise from BASE_FACE. */
+ face->colors_copied_bitwise_p = 1;
+
+ /* to force realize_face to load font */
+ face->font = NULL;
+ return face;
+ }
+
+ /* Now we are realizing a face for ASCII (and unibyte) characters. */
+
+ /* Determine the font to use. Most of the time, the font will be
+ the same as the font of the default face, so try that first. */
+ default_face = FACE_FROM_ID (f, DEFAULT_FACE_ID);
+ if (default_face
+ && FACE_SUITABLE_FOR_CHAR_P (default_face, c)
+ && lface_same_font_attributes_p (default_face->lface, attrs))
+ {
+ face->font = default_face->font;
+ face->fontset = default_face->fontset;
+ face->font_info_id = default_face->font_info_id;
+ face->font_name = default_face->font_name;
+ face->ascii_face = face;
+
+ /* But, as we can't share the fontset, make a new realized
+ fontset that has the same base fontset as that of the default
+ face. */
+ face->fontset
+ = make_fontset_for_ascii_face (f, default_face->fontset);
+ }
+ else
+ {
+ /* If the face attribute ATTRS specifies a fontset, use it as
+ the base of a new realized fontset. Otherwise, use the same
+ base fontset as that of the default face. The base determines
+ registry and encoding of a font. It may also determine
+ foundry and family. The other fields of font name pattern
+ are constructed from ATTRS. */
+ int fontset = face_fontset (attrs);
+
+ if ((fontset == -1) && default_face)
+ fontset = default_face->fontset;
+ face->fontset = make_fontset_for_ascii_face (f, fontset);
+ face->font = NULL; /* to force realize_face to load font */
+
+#ifdef macintosh
+ /* Load the font if it is specified in ATTRS. This fixes
+ changing frame font on the Mac. */
+ if (STRINGP (attrs[LFACE_FONT_INDEX]))
+ {
+ struct font_info *font_info =
+ FS_LOAD_FONT (f, 0, XSTRING (attrs[LFACE_FONT_INDEX])->data, -1);
+ if (font_info)
+ face->font = font_info->font;
+ }
+#endif
+ }
+
+ /* Load colors, and set remaining attributes. */
+
+ load_face_colors (f, face, attrs);
+
+ /* Set up box. */
+ box = attrs[LFACE_BOX_INDEX];
+ if (STRINGP (box))
+ {
+ /* A simple box of line width 1 drawn in color given by
+ the string. */
+ face->box_color = load_color (f, face, attrs[LFACE_BOX_INDEX],
+ LFACE_BOX_INDEX);
+ face->box = FACE_SIMPLE_BOX;
+ face->box_line_width = 1;
+ }
+ else if (INTEGERP (box))
+ {
+ /* Simple box of specified line width in foreground color of the
+ face. */
+ xassert (XINT (box) != 0);
+ face->box = FACE_SIMPLE_BOX;
+ face->box_line_width = XINT (box);
+ face->box_color = face->foreground;
+ face->box_color_defaulted_p = 1;
+ }
+ else if (CONSP (box))
+ {
+ /* `(:line-width WIDTH :color COLOR :style STYLE)'. STYLE
+ being one of `released-button' or `pressed-button'. */
+ face->box = FACE_SIMPLE_BOX;
+ face->box_color = face->foreground;
+ face->box_color_defaulted_p = 1;
+ face->box_line_width = 1;
+
+ while (CONSP (box))
+ {
+ Lisp_Object keyword, value;
+
+ keyword = XCAR (box);
+ box = XCDR (box);
+
+ if (!CONSP (box))
+ break;
+ value = XCAR (box);
+ box = XCDR (box);
+
+ if (EQ (keyword, QCline_width))
+ {
+ if (INTEGERP (value) && XINT (value) != 0)
+ face->box_line_width = XINT (value);
+ }
+ else if (EQ (keyword, QCcolor))
+ {
+ if (STRINGP (value))
+ {
+ face->box_color = load_color (f, face, value,
+ LFACE_BOX_INDEX);
+ face->use_box_color_for_shadows_p = 1;
+ }
+ }
+ else if (EQ (keyword, QCstyle))
+ {
+ if (EQ (value, Qreleased_button))
+ face->box = FACE_RAISED_BOX;
+ else if (EQ (value, Qpressed_button))
+ face->box = FACE_SUNKEN_BOX;
+ }
+ }
+ }
+
+ /* Text underline, overline, strike-through. */
+
+ if (EQ (attrs[LFACE_UNDERLINE_INDEX], Qt))
+ {
+ /* Use default color (same as foreground color). */
+ face->underline_p = 1;
+ face->underline_defaulted_p = 1;
+ face->underline_color = 0;
+ }
+ else if (STRINGP (attrs[LFACE_UNDERLINE_INDEX]))
+ {
+ /* Use specified color. */
+ face->underline_p = 1;
+ face->underline_defaulted_p = 0;
+ face->underline_color
+ = load_color (f, face, attrs[LFACE_UNDERLINE_INDEX],
+ LFACE_UNDERLINE_INDEX);
+ }
+ else if (NILP (attrs[LFACE_UNDERLINE_INDEX]))
+ {
+ face->underline_p = 0;
+ face->underline_defaulted_p = 0;
+ face->underline_color = 0;
+ }
+
+ overline = attrs[LFACE_OVERLINE_INDEX];
+ if (STRINGP (overline))
+ {
+ face->overline_color
+ = load_color (f, face, attrs[LFACE_OVERLINE_INDEX],
+ LFACE_OVERLINE_INDEX);
+ face->overline_p = 1;
+ }
+ else if (EQ (overline, Qt))
+ {
+ face->overline_color = face->foreground;
+ face->overline_color_defaulted_p = 1;
+ face->overline_p = 1;
+ }
+
+ strike_through = attrs[LFACE_STRIKE_THROUGH_INDEX];
+ if (STRINGP (strike_through))
+ {
+ face->strike_through_color
+ = load_color (f, face, attrs[LFACE_STRIKE_THROUGH_INDEX],
+ LFACE_STRIKE_THROUGH_INDEX);
+ face->strike_through_p = 1;
+ }
+ else if (EQ (strike_through, Qt))
+ {
+ face->strike_through_color = face->foreground;
+ face->strike_through_color_defaulted_p = 1;
+ face->strike_through_p = 1;
+ }
+
+ stipple = attrs[LFACE_STIPPLE_INDEX];
+ if (!NILP (stipple))
+ face->stipple = load_pixmap (f, stipple, &face->pixmap_w, &face->pixmap_h);
+
+ xassert (FACE_SUITABLE_FOR_CHAR_P (face, c));
+ return face;
+#endif /* HAVE_WINDOW_SYSTEM */
+}
+
+
+/* Map a specified color of face FACE on frame F to a tty color index.
+ IDX is either LFACE_FOREGROUND_INDEX or LFACE_BACKGROUND_INDEX, and
+ specifies which color to map. Set *DEFAULTED to 1 if mapping to the
+ default foreground/background colors. */
+
+static void
+map_tty_color (f, face, idx, defaulted)
+ struct frame *f;
+ struct face *face;
+ enum lface_attribute_index idx;
+ int *defaulted;
+{
+ Lisp_Object frame, color, def;
+ int foreground_p = idx == LFACE_FOREGROUND_INDEX;
+ unsigned long default_pixel, default_other_pixel, pixel;
+
+ xassert (idx == LFACE_FOREGROUND_INDEX || idx == LFACE_BACKGROUND_INDEX);
+
+ if (foreground_p)
+ {
+ pixel = default_pixel = FACE_TTY_DEFAULT_FG_COLOR;
+ default_other_pixel = FACE_TTY_DEFAULT_BG_COLOR;
+ }
+ else
+ {
+ pixel = default_pixel = FACE_TTY_DEFAULT_BG_COLOR;
+ default_other_pixel = FACE_TTY_DEFAULT_FG_COLOR;
+ }
+
+ XSETFRAME (frame, f);
+ color = face->lface[idx];
+
+ if (STRINGP (color)
+ && XSTRING (color)->size
+ && CONSP (Vtty_defined_color_alist)
+ && (def = assq_no_quit (color, call1 (Qtty_color_alist, frame)),
+ CONSP (def)))
+ {
+ /* Associations in tty-defined-color-alist are of the form
+ (NAME INDEX R G B). We need the INDEX part. */
+ pixel = XINT (XCAR (XCDR (def)));
+ }
+
+ if (pixel == default_pixel && STRINGP (color))
+ {
+ pixel = load_color (f, face, color, idx);
+
+#if defined (MSDOS) || defined (WINDOWSNT)
+ /* If the foreground of the default face is the default color,
+ use the foreground color defined by the frame. */
+#ifdef MSDOS
+ if (FRAME_MSDOS_P (f))
+ {
+#endif /* MSDOS */
+ if (pixel == default_pixel
+ || pixel == FACE_TTY_DEFAULT_COLOR)
+ {
+ if (foreground_p)
+ pixel = FRAME_FOREGROUND_PIXEL (f);
+ else
+ pixel = FRAME_BACKGROUND_PIXEL (f);
+ face->lface[idx] = tty_color_name (f, pixel);
+ *defaulted = 1;
+ }
+ else if (pixel == default_other_pixel)
+ {
+ if (foreground_p)
+ pixel = FRAME_BACKGROUND_PIXEL (f);
+ else
+ pixel = FRAME_FOREGROUND_PIXEL (f);
+ face->lface[idx] = tty_color_name (f, pixel);
+ *defaulted = 1;
+ }
+#ifdef MSDOS
+ }
+#endif
+#endif /* MSDOS or WINDOWSNT */
+ }
+
+ if (foreground_p)
+ face->foreground = pixel;
+ else
+ face->background = pixel;
+}
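Not part of the original xfaces.c: a minimal, self-contained C sketch of the lookup-with-fallback idea used by map_tty_color above, scanning a small name-to-index table and falling back to a frame default when the name is unknown. The table entries, FRAME_FG_INDEX, and map_color are invented for illustration and are not the real Emacs constants or functions.

    #include <stdio.h>
    #include <string.h>

    /* Hypothetical stand-in for the frame's own foreground pixel. */
    #define FRAME_FG_INDEX 7

    struct tty_color { const char *name; int index; };

    /* A toy tty-defined-color-alist: (NAME INDEX) pairs. */
    static const struct tty_color colors[] = {
      { "black", 0 }, { "red", 1 }, { "green", 2 }, { "blue", 4 },
    };

    /* Return the index for NAME, or the frame default when NAME is
       unknown, mirroring the "pixel == default_pixel" fallback above. */
    static int map_color (const char *name)
    {
      size_t i;
      for (i = 0; i < sizeof colors / sizeof colors[0]; ++i)
        if (strcmp (colors[i].name, name) == 0)
          return colors[i].index;
      return FRAME_FG_INDEX;   /* fall back to the frame's color */
    }

    int main (void)
    {
      printf ("red   -> %d\n", map_color ("red"));    /* 1 */
      printf ("mauve -> %d\n", map_color ("mauve"));  /* 7, the fallback */
      return 0;
    }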
+
+
+/* Realize the fully-specified face with attributes ATTRS in face
+ cache CACHE for character C. Do it for TTY frame CACHE->f. Value is a
+ pointer to the newly created realized face. */
+
+static struct face *
+realize_tty_face (cache, attrs, c)
+ struct face_cache *cache;
+ Lisp_Object *attrs;
+ int c;
+{
+ struct face *face;
+ int weight, slant;
+ int face_colors_defaulted = 0;
+ struct frame *f = cache->f;
+
+ /* Frame must be a termcap frame. */
+ xassert (FRAME_TERMCAP_P (cache->f) || FRAME_MSDOS_P (cache->f));
+
+ /* Allocate a new realized face. */
+ face = make_realized_face (attrs);
+ face->font_name = FRAME_MSDOS_P (cache->f) ? "ms-dos" : "tty";
+
+ /* Map face attributes to TTY appearances. We map slant to
+ dimmed text because we want italic text to appear differently
+ and because dimmed text is probably used infrequently. */
+ weight = face_numeric_weight (attrs[LFACE_WEIGHT_INDEX]);
+ slant = face_numeric_slant (attrs[LFACE_SLANT_INDEX]);
+
+ if (weight > XLFD_WEIGHT_MEDIUM)
+ face->tty_bold_p = 1;
+ if (weight < XLFD_WEIGHT_MEDIUM || slant != XLFD_SLANT_ROMAN)
+ face->tty_dim_p = 1;
+ if (!NILP (attrs[LFACE_UNDERLINE_INDEX]))
+ face->tty_underline_p = 1;
+ if (!NILP (attrs[LFACE_INVERSE_INDEX]))
+ face->tty_reverse_p = 1;
+
+ /* Map color names to color indices. */
+ map_tty_color (f, face, LFACE_FOREGROUND_INDEX, &face_colors_defaulted);
+ map_tty_color (f, face, LFACE_BACKGROUND_INDEX, &face_colors_defaulted);
+
+ /* Swap colors if face is inverse-video. If the colors are taken
+ from the frame colors, they are already inverted, since the
+ frame-creation function calls x-handle-reverse-video. */
+ if (face->tty_reverse_p && !face_colors_defaulted)
+ {
+ unsigned long tem = face->foreground;
+ face->foreground = face->background;
+ face->background = tem;
+ }
+
+ if (tty_suppress_bold_inverse_default_colors_p
+ && face->tty_bold_p
+ && face->background == FACE_TTY_DEFAULT_FG_COLOR
+ && face->foreground == FACE_TTY_DEFAULT_BG_COLOR)
+ face->tty_bold_p = 0;
+
+ return face;
+}
+
+
+DEFUN ("tty-suppress-bold-inverse-default-colors",
+ Ftty_suppress_bold_inverse_default_colors,
+ Stty_suppress_bold_inverse_default_colors, 1, 1, 0,
+ "Suppress/allow boldness of faces with inverse default colors.\n\
+SUPPRESS non-nil means suppress it.\n\
+This affects bold faces on TTYs whose foreground is the default background\n\
+color of the display and whose background is the default foreground color.\n\
+For such faces, the bold face attribute is ignored if this variable\n\
+is non-nil.")
+ (suppress)
+ Lisp_Object suppress;
+{
+ tty_suppress_bold_inverse_default_colors_p = !NILP (suppress);
+ ++face_change_count;
+ return suppress;
+}
+
+
+
+/***********************************************************************
+ Computing Faces
+ ***********************************************************************/
+
+/* Return the ID of the face to use to display character CH with face
+ property PROP on frame F in current_buffer. */
+
+int
+compute_char_face (f, ch, prop)
+ struct frame *f;
+ int ch;
+ Lisp_Object prop;
+{
+ int face_id;
+
+ if (NILP (current_buffer->enable_multibyte_characters))
+ ch = 0;
+
+ if (NILP (prop))
+ {
+ struct face *face = FACE_FROM_ID (f, DEFAULT_FACE_ID);
+ face_id = FACE_FOR_CHAR (f, face, ch);
+ }
+ else
+ {
+ Lisp_Object attrs[LFACE_VECTOR_SIZE];
+ struct face *default_face = FACE_FROM_ID (f, DEFAULT_FACE_ID);
+ bcopy (default_face->lface, attrs, sizeof attrs);
+ merge_face_vector_with_property (f, attrs, prop);
+ face_id = lookup_face (f, attrs, ch, NULL);
+ }
+
+ return face_id;
+}
+
+
+/* Return the face ID associated with buffer position POS for
+ displaying ASCII characters. Return in *ENDPTR the position at
+ which a different face is needed, as far as text properties and
+ overlays are concerned. W is a window displaying current_buffer.
+
+ REGION_BEG, REGION_END delimit the region, so it can be
+ highlighted.
+
+ LIMIT is a position not to scan beyond. That is to limit the time
+ this function can take.
+
+ If MOUSE is non-zero, use the character's mouse-face, not its face.
+
+ The face returned is suitable for displaying ASCII characters. */
+
+int
+face_at_buffer_position (w, pos, region_beg, region_end,
+ endptr, limit, mouse)
+ struct window *w;
+ int pos;
+ int region_beg, region_end;
+ int *endptr;
+ int limit;
+ int mouse;
+{
+ struct frame *f = XFRAME (w->frame);
+ Lisp_Object attrs[LFACE_VECTOR_SIZE];
+ Lisp_Object prop, position;
+ int i, noverlays;
+ Lisp_Object *overlay_vec;
+ Lisp_Object frame;
+ int endpos;
+ Lisp_Object propname = mouse ? Qmouse_face : Qface;
+ Lisp_Object limit1, end;
+ struct face *default_face;
+
+ /* W must display the current buffer. We could write this function
+ to use the frame and buffer of W, but right now it doesn't. */
+ /* xassert (XBUFFER (w->buffer) == current_buffer); */
+
+ XSETFRAME (frame, f);
+ XSETFASTINT (position, pos);
+
+ endpos = ZV;
+ if (pos < region_beg && region_beg < endpos)
+ endpos = region_beg;
+
+ /* Get the `face' or `mouse_face' text property at POS, and
+ determine the next position at which the property changes. */
+ prop = Fget_text_property (position, propname, w->buffer);
+ XSETFASTINT (limit1, (limit < endpos ? limit : endpos));
+ end = Fnext_single_property_change (position, propname, w->buffer, limit1);
+ if (INTEGERP (end))
+ endpos = XINT (end);
+
+ /* Look at properties from overlays. */
+ {
+ int next_overlay;
+ int len;
+
+ /* First try with room for 40 overlays. */
+ len = 40;
+ overlay_vec = (Lisp_Object *) alloca (len * sizeof (Lisp_Object));
+ noverlays = overlays_at (pos, 0, &overlay_vec, &len,
+ &next_overlay, NULL, 0);
+
+ /* If there are more than 40, make enough space for all, and try
+ again. */
+ if (noverlays > len)
+ {
+ len = noverlays;
+ overlay_vec = (Lisp_Object *) alloca (len * sizeof (Lisp_Object));
+ noverlays = overlays_at (pos, 0, &overlay_vec, &len,
+ &next_overlay, NULL, 0);
+ }
+
+ if (next_overlay < endpos)
+ endpos = next_overlay;
+ }
+
+ *endptr = endpos;
+
+ default_face = FACE_FROM_ID (f, DEFAULT_FACE_ID);
+
+ /* Optimize common cases where we can use the default face. */
+ if (noverlays == 0
+ && NILP (prop)
+ && !(pos >= region_beg && pos < region_end))
+ return DEFAULT_FACE_ID;
+
+ /* Begin with attributes from the default face. */
+ bcopy (default_face->lface, attrs, sizeof attrs);
+
+ /* Merge in attributes specified via text properties. */
+ if (!NILP (prop))
+ merge_face_vector_with_property (f, attrs, prop);
+
+ /* Now merge the overlay data. */
+ noverlays = sort_overlays (overlay_vec, noverlays, w);
+ for (i = 0; i < noverlays; i++)
+ {
+ Lisp_Object oend;
+ int oendpos;
+
+ prop = Foverlay_get (overlay_vec[i], propname);
+ if (!NILP (prop))
+ merge_face_vector_with_property (f, attrs, prop);
+
+ oend = OVERLAY_END (overlay_vec[i]);
+ oendpos = OVERLAY_POSITION (oend);
+ if (oendpos < endpos)
+ endpos = oendpos;
+ }
+
+ /* If in the region, merge in the region face. */
+ if (pos >= region_beg && pos < region_end)
+ {
+ Lisp_Object region_face = lface_from_face_name (f, Qregion, 0);
+ merge_face_vectors (f, XVECTOR (region_face)->contents, attrs, Qnil);
+
+ if (region_end < endpos)
+ endpos = region_end;
+ }
+
+ *endptr = endpos;
+
+ /* Look up a realized face with the given face attributes,
+ or realize a new one for ASCII characters. */
+ return lookup_face (f, attrs, 0, NULL);
+}
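Not from the original source: a minimal sketch of the guess-then-retry allocation pattern used for the overlay vector above. fill_items is an invented stand-in for overlays_at (it writes up to CAP items and returns how many exist in total), and malloc/realloc replace the original alloca; try a fixed-size buffer first, and resize to the exact count only when the first call reports more items than fit.

    #include <stdio.h>
    #include <stdlib.h>

    /* Hypothetical stand-in for overlays_at: writes up to CAP items
       into BUF and returns how many items exist in total. */
    static int fill_items (int *buf, int cap)
    {
      int total = 100;                    /* pretend 100 items exist */
      int n = total < cap ? total : cap;
      for (int i = 0; i < n; ++i)
        buf[i] = i;
      return total;
    }

    int main (void)
    {
      int len = 40;                       /* first guess, as in the code above */
      int *vec = malloc (len * sizeof *vec);
      if (!vec)
        return 1;

      int n = fill_items (vec, len);
      if (n > len)                        /* guess was too small: retry exactly */
        {
          len = n;
          int *bigger = realloc (vec, len * sizeof *vec);
          if (!bigger)
            { free (vec); return 1; }
          vec = bigger;
          n = fill_items (vec, len);
        }

      printf ("got %d items\n", n);
      free (vec);
      return 0;
    }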
+
+
+/* Compute the face at character position POS in Lisp string STRING on
+ window W, for ASCII characters.
+
+ If STRING is an overlay string, it comes from position BUFPOS in
+ current_buffer, otherwise BUFPOS is zero to indicate that STRING is
+ not an overlay string. W must display the current buffer.
+ REGION_BEG and REGION_END give the start and end positions of the
+ region; both are -1 if no region is visible.
+
+ BASE_FACE_ID is the id of a face to merge with. For strings coming
+ from overlays or the `display' property it is the face at BUFPOS.
+
+ If MOUSE_P is non-zero, use the character's mouse-face, not its face.
+
+ Set *ENDPTR to the next position where to check for faces in
+ STRING; -1 if the face is constant from POS to the end of the
+ string.
+
+ Value is the id of the face to use. The face returned is suitable
+ for displaying ASCII characters. */
+
+int
+face_at_string_position (w, string, pos, bufpos, region_beg,
+ region_end, endptr, base_face_id, mouse_p)
+ struct window *w;
+ Lisp_Object string;
+ int pos, bufpos;
+ int region_beg, region_end;
+ int *endptr;
+ enum face_id base_face_id;
+ int mouse_p;
+{
+ Lisp_Object prop, position, end, limit;
+ struct frame *f = XFRAME (WINDOW_FRAME (w));
+ Lisp_Object attrs[LFACE_VECTOR_SIZE];
+ struct face *base_face;
+ int multibyte_p = STRING_MULTIBYTE (string);
+ Lisp_Object prop_name = mouse_p ? Qmouse_face : Qface;
+
+ /* Get the value of the face property at the current position within
+ STRING. Value is nil if there is no face property. */
+ XSETFASTINT (position, pos);
+ prop = Fget_text_property (position, prop_name, string);
+
+ /* Get the next position at which to check for faces. Value of end
+ is nil if face is constant all the way to the end of the string.
+ Otherwise it is a string position where to check faces next.
+ Limit is the maximum position up to which to check for property
+ changes in Fnext_single_property_change. Strings are usually
+ short, so set the limit to the end of the string. */
+ XSETFASTINT (limit, XSTRING (string)->size);
+ end = Fnext_single_property_change (position, prop_name, string, limit);
+ if (INTEGERP (end))
+ *endptr = XFASTINT (end);
+ else
+ *endptr = -1;
+
+ base_face = FACE_FROM_ID (f, base_face_id);
+ xassert (base_face);
+
+ /* Optimize the default case that there is no face property and we
+ are not in the region. */
+ if (NILP (prop)
+ && (base_face_id != DEFAULT_FACE_ID
+ /* BUFPOS <= 0 means STRING is not an overlay string, so
+ that the region doesn't have to be taken into account. */
+ || bufpos <= 0
+ || bufpos < region_beg
+ || bufpos >= region_end)
+ && (multibyte_p
+ /* We can't realize faces for different charsets differently
+ if we don't have fonts, so we can stop here if not working
+ on a window-system frame. */
+ || !FRAME_WINDOW_P (f)
+ || FACE_SUITABLE_FOR_CHAR_P (base_face, 0)))
+ return base_face->id;
+
+ /* Begin with attributes from the base face. */
+ bcopy (base_face->lface, attrs, sizeof attrs);
+
+ /* Merge in attributes specified via text properties. */
+ if (!NILP (prop))
+ merge_face_vector_with_property (f, attrs, prop);
+
+ /* If in the region, merge in the region face. */
+ if (bufpos
+ && bufpos >= region_beg
+ && bufpos < region_end)
+ {
+ Lisp_Object region_face = lface_from_face_name (f, Qregion, 0);
+ merge_face_vectors (f, XVECTOR (region_face)->contents, attrs, Qnil);
+ }
+
+ /* Look up a realized face with the given face attributes,
+ or realize a new one for ASCII characters. */
+ return lookup_face (f, attrs, 0, NULL);
+}
+
+
+
+/***********************************************************************
+ Tests
+ ***********************************************************************/
+
+#if GLYPH_DEBUG
+
+/* Print the contents of the realized face FACE to stderr. */
+
+static void
+dump_realized_face (face)
+ struct face *face;
+{
+ fprintf (stderr, "ID: %d\n", face->id);
+#ifdef HAVE_X_WINDOWS
+ fprintf (stderr, "gc: %d\n", (int) face->gc);
+#endif
+ fprintf (stderr, "foreground: 0x%lx (%s)\n",
+ face->foreground,
+ XSTRING (face->lface[LFACE_FOREGROUND_INDEX])->data);
+ fprintf (stderr, "background: 0x%lx (%s)\n",
+ face->background,
+ XSTRING (face->lface[LFACE_BACKGROUND_INDEX])->data);
+ fprintf (stderr, "font_name: %s (%s)\n",
+ face->font_name,
+ XSTRING (face->lface[LFACE_FAMILY_INDEX])->data);
+#ifdef HAVE_X_WINDOWS
+ fprintf (stderr, "font = %p\n", face->font);
+#endif
+ fprintf (stderr, "font_info_id = %d\n", face->font_info_id);
+ fprintf (stderr, "fontset: %d\n", face->fontset);
+ fprintf (stderr, "underline: %d (%s)\n",
+ face->underline_p,
+ XSTRING (Fsymbol_name (face->lface[LFACE_UNDERLINE_INDEX]))->data);
+ fprintf (stderr, "hash: %d\n", face->hash);
+ fprintf (stderr, "charset: %d\n", face->charset);
+}
+
+
+DEFUN ("dump-face", Fdump_face, Sdump_face, 0, 1, 0, "")
+ (n)
+ Lisp_Object n;
+{
+ if (NILP (n))
+ {
+ int i;
+
+ fprintf (stderr, "font selection order: ");
+ for (i = 0; i < DIM (font_sort_order); ++i)
+ fprintf (stderr, "%d ", font_sort_order[i]);
+ fprintf (stderr, "\n");
+
+ fprintf (stderr, "alternative fonts: ");
+ debug_print (Vface_alternative_font_family_alist);
+ fprintf (stderr, "\n");
+
+ for (i = 0; i < FRAME_FACE_CACHE (SELECTED_FRAME ())->used; ++i)
+ Fdump_face (make_number (i));
+ }
+ else
+ {
+ struct face *face;
+ CHECK_NUMBER (n, 0);
+ face = FACE_FROM_ID (SELECTED_FRAME (), XINT (n));
+ if (face == NULL)
+ error ("Not a valid face");
+ dump_realized_face (face);
+ }
+
+ return Qnil;
+}
+
+
+DEFUN ("show-face-resources", Fshow_face_resources, Sshow_face_resources,
+ 0, 0, 0, "")
+ ()
+{
+ fprintf (stderr, "number of colors = %d\n", ncolors_allocated);
+ fprintf (stderr, "number of pixmaps = %d\n", npixmaps_allocated);
+ fprintf (stderr, "number of GCs = %d\n", ngcs);
+ return Qnil;
+}
+
+#endif /* GLYPH_DEBUG != 0 */
+
+
+
+/***********************************************************************
+ Initialization
+ ***********************************************************************/
+
+void
+syms_of_xfaces ()
+{
+ Qface = intern ("face");
+ staticpro (&Qface);
+ Qbitmap_spec_p = intern ("bitmap-spec-p");
+ staticpro (&Qbitmap_spec_p);
+ Qframe_update_face_colors = intern ("frame-update-face-colors");
+ staticpro (&Qframe_update_face_colors);
+
+ /* Lisp face attribute keywords. */
+ QCfamily = intern (":family");
+ staticpro (&QCfamily);
+ QCheight = intern (":height");
+ staticpro (&QCheight);
+ QCweight = intern (":weight");
+ staticpro (&QCweight);
+ QCslant = intern (":slant");
+ staticpro (&QCslant);
+ QCunderline = intern (":underline");
+ staticpro (&QCunderline);
+ QCinverse_video = intern (":inverse-video");
+ staticpro (&QCinverse_video);
+ QCreverse_video = intern (":reverse-video");
+ staticpro (&QCreverse_video);
+ QCforeground = intern (":foreground");
+ staticpro (&QCforeground);
+ QCbackground = intern (":background");
+ staticpro (&QCbackground);
+ QCstipple = intern (":stipple");
+ staticpro (&QCstipple);
+ QCwidth = intern (":width");
+ staticpro (&QCwidth);
+ QCfont = intern (":font");
+ staticpro (&QCfont);
+ QCbold = intern (":bold");
+ staticpro (&QCbold);
+ QCitalic = intern (":italic");
+ staticpro (&QCitalic);
+ QCoverline = intern (":overline");
+ staticpro (&QCoverline);
+ QCstrike_through = intern (":strike-through");
+ staticpro (&QCstrike_through);
+ QCbox = intern (":box");
+ staticpro (&QCbox);
+ QCinherit = intern (":inherit");
+ staticpro (&QCinherit);
+
+ /* Symbols used for Lisp face attribute values. */
+ QCcolor = intern (":color");
+ staticpro (&QCcolor);
+ QCline_width = intern (":line-width");
+ staticpro (&QCline_width);
+ QCstyle = intern (":style");
+ staticpro (&QCstyle);
+ Qreleased_button = intern ("released-button");
+ staticpro (&Qreleased_button);
+ Qpressed_button = intern ("pressed-button");
+ staticpro (&Qpressed_button);
+ Qnormal = intern ("normal");
+ staticpro (&Qnormal);
+ Qultra_light = intern ("ultra-light");
+ staticpro (&Qultra_light);
+ Qextra_light = intern ("extra-light");
+ staticpro (&Qextra_light);
+ Qlight = intern ("light");
+ staticpro (&Qlight);
+ Qsemi_light = intern ("semi-light");
+ staticpro (&Qsemi_light);
+ Qsemi_bold = intern ("semi-bold");
+ staticpro (&Qsemi_bold);
+ Qbold = intern ("bold");
+ staticpro (&Qbold);
+ Qextra_bold = intern ("extra-bold");
+ staticpro (&Qextra_bold);
+ Qultra_bold = intern ("ultra-bold");
+ staticpro (&Qultra_bold);
+ Qoblique = intern ("oblique");
+ staticpro (&Qoblique);
+ Qitalic = intern ("italic");
+ staticpro (&Qitalic);
+ Qreverse_oblique = intern ("reverse-oblique");
+ staticpro (&Qreverse_oblique);
+ Qreverse_italic = intern ("reverse-italic");
+ staticpro (&Qreverse_italic);
+ Qultra_condensed = intern ("ultra-condensed");
+ staticpro (&Qultra_condensed);
+ Qextra_condensed = intern ("extra-condensed");
+ staticpro (&Qextra_condensed);
+ Qcondensed = intern ("condensed");
+ staticpro (&Qcondensed);
+ Qsemi_condensed = intern ("semi-condensed");
+ staticpro (&Qsemi_condensed);
+ Qsemi_expanded = intern ("semi-expanded");
+ staticpro (&Qsemi_expanded);
+ Qexpanded = intern ("expanded");
+ staticpro (&Qexpanded);
+ Qextra_expanded = intern ("extra-expanded");
+ staticpro (&Qextra_expanded);
+ Qultra_expanded = intern ("ultra-expanded");
+ staticpro (&Qultra_expanded);
+ Qbackground_color = intern ("background-color");
+ staticpro (&Qbackground_color);
+ Qforeground_color = intern ("foreground-color");
+ staticpro (&Qforeground_color);
+ Qunspecified = intern ("unspecified");
+ staticpro (&Qunspecified);
+
+ Qface_alias = intern ("face-alias");
+ staticpro (&Qface_alias);
+ Qdefault = intern ("default");
+ staticpro (&Qdefault);
+ Qtool_bar = intern ("tool-bar");
+ staticpro (&Qtool_bar);
+ Qregion = intern ("region");
+ staticpro (&Qregion);
+ Qfringe = intern ("fringe");
+ staticpro (&Qfringe);
+ Qheader_line = intern ("header-line");
+ staticpro (&Qheader_line);
+ Qscroll_bar = intern ("scroll-bar");
+ staticpro (&Qscroll_bar);
+ Qmenu = intern ("menu");
+ staticpro (&Qmenu);
+ Qcursor = intern ("cursor");
+ staticpro (&Qcursor);
+ Qborder = intern ("border");
+ staticpro (&Qborder);
+ Qmouse = intern ("mouse");
+ staticpro (&Qmouse);
+ Qtty_color_desc = intern ("tty-color-desc");
+ staticpro (&Qtty_color_desc);
+ Qtty_color_by_index = intern ("tty-color-by-index");
+ staticpro (&Qtty_color_by_index);
+ Qtty_color_alist = intern ("tty-color-alist");
+ staticpro (&Qtty_color_alist);
+ Qscalable_fonts_allowed = intern ("scalable-fonts-allowed");
+ staticpro (&Qscalable_fonts_allowed);
+
+ Vparam_value_alist = Fcons (Fcons (Qnil, Qnil), Qnil);
+ staticpro (&Vparam_value_alist);
+ Vface_alternative_font_family_alist = Qnil;
+ staticpro (&Vface_alternative_font_family_alist);
+ Vface_alternative_font_registry_alist = Qnil;
+ staticpro (&Vface_alternative_font_registry_alist);
+
+ defsubr (&Sinternal_make_lisp_face);
+ defsubr (&Sinternal_lisp_face_p);
+ defsubr (&Sinternal_set_lisp_face_attribute);
+#ifdef HAVE_WINDOW_SYSTEM
+ defsubr (&Sinternal_set_lisp_face_attribute_from_resource);
+#endif
+ defsubr (&Scolor_gray_p);
+ defsubr (&Scolor_supported_p);
+ defsubr (&Sinternal_get_lisp_face_attribute);
+ defsubr (&Sinternal_lisp_face_attribute_values);
+ defsubr (&Sinternal_lisp_face_equal_p);
+ defsubr (&Sinternal_lisp_face_empty_p);
+ defsubr (&Sinternal_copy_lisp_face);
+ defsubr (&Sinternal_merge_in_global_face);
+ defsubr (&Sface_font);
+ defsubr (&Sframe_face_alist);
+ defsubr (&Sinternal_set_font_selection_order);
+ defsubr (&Sinternal_set_alternative_font_family_alist);
+ defsubr (&Sinternal_set_alternative_font_registry_alist);
+#if GLYPH_DEBUG
+ defsubr (&Sdump_face);
+ defsubr (&Sshow_face_resources);
+#endif /* GLYPH_DEBUG */
+ defsubr (&Sclear_face_cache);
+ defsubr (&Stty_suppress_bold_inverse_default_colors);
+
+#if defined DEBUG_X_COLORS && defined HAVE_X_WINDOWS
+ defsubr (&Sdump_colors);
+#endif
+
+ DEFVAR_LISP ("font-list-limit", &Vfont_list_limit,
+ "*Limit for font matching.\n\
+If an integer > 0, font matching functions won't load more than\n\
+that number of fonts when searching for a matching font.");
+ Vfont_list_limit = make_number (DEFAULT_FONT_LIST_LIMIT);
+
+ DEFVAR_LISP ("face-new-frame-defaults", &Vface_new_frame_defaults,
+ "List of global face definitions (for internal use only.)");
+ Vface_new_frame_defaults = Qnil;
+
+ DEFVAR_LISP ("face-default-stipple", &Vface_default_stipple,
+ "*Default stipple pattern used on monochrome displays.\n\
+This stipple pattern is used on monochrome displays\n\
+instead of shades of gray for a face background color.\n\
+See `set-face-stipple' for possible values for this variable.");
+ Vface_default_stipple = build_string ("gray3");
+
+ DEFVAR_LISP ("tty-defined-color-alist", &Vtty_defined_color_alist,
+ "An alist of defined terminal colors and their RGB values.");
+ Vtty_defined_color_alist = Qnil;
+
+ DEFVAR_LISP ("scalable-fonts-allowed", &Vscalable_fonts_allowed,
+ "Allowed scalable fonts.\n\
+A value of nil means don't allow any scalable fonts.\n\
+A value of t means allow any scalable font.\n\
+Otherwise, value must be a list of regular expressions. A font may be\n\
+scaled if its name matches a regular expression in the list.\n\
+Note that if value is nil, a scalable font might still be used, if no\n\
+other font of the appropriate family and registry is available.");
+ Vscalable_fonts_allowed = Qnil;
+
+ DEFVAR_LISP ("face-ignored-fonts", &Vface_ignored_fonts,
+ "List of ignored fonts.\n\
+Each element is a regular expression that matches names of fonts to ignore.");
+ Vface_ignored_fonts = Qnil;
+
+#ifdef HAVE_WINDOW_SYSTEM
+ defsubr (&Sbitmap_spec_p);
+ defsubr (&Sx_list_fonts);
+ defsubr (&Sinternal_face_x_get_resource);
+ defsubr (&Sx_family_fonts);
+ defsubr (&Sx_font_family_list);
+#endif /* HAVE_WINDOW_SYSTEM */
+}
diff --git a/tests/contrib/xfaces/patch b/tests/contrib/xfaces/patch
new file mode 100644
index 0000000..814005d
--- /dev/null
+++ b/tests/contrib/xfaces/patch
@@ -0,0 +1,51 @@
+@@ -4503,29 +4508,45 @@
+ XSETFRAME (frame, f);
+ call1 (Qframe_update_face_colors, frame);
+
+- lface = lface_from_face_name (f, Qdefault, 1);
++ face = Qdefault;
++ lface = lface_from_face_name (f, face, 1);
+ LFACE_BACKGROUND (lface) = (STRINGP (new_value)
+ ? new_value : Qunspecified);
+ realize_basic_faces (f);
+ }
+- if (EQ (param, Qborder_color))
++ else if (EQ (param, Qborder_color))
+ {
+- lface = lface_from_face_name (f, Qborder, 1);
++ face = Qborder;
++ lface = lface_from_face_name (f, face, 1);
+ LFACE_BACKGROUND (lface) = (STRINGP (new_value)
+ ? new_value : Qunspecified);
+ }
+ else if (EQ (param, Qcursor_color))
+ {
+- lface = lface_from_face_name (f, Qcursor, 1);
++ face = Qcursor;
++ lface = lface_from_face_name (f, face, 1);
+ LFACE_BACKGROUND (lface) = (STRINGP (new_value)
+ ? new_value : Qunspecified);
+ }
+ else if (EQ (param, Qmouse_color))
+ {
+- lface = lface_from_face_name (f, Qmouse, 1);
++ face = Qmouse;
++ lface = lface_from_face_name (f, face, 1);
+ LFACE_BACKGROUND (lface) = (STRINGP (new_value)
+ ? new_value : Qunspecified);
+ }
++
++ /* Changing a named face means that all realized faces depending on
++ that face are invalid. Since we cannot tell which realized faces
++ depend on the face, make sure they are all removed. This is done
++ by incrementing face_change_count. The next call to
++ init_iterator will then free realized faces. */
++ if (!NILP (face)
++ && NILP (Fget (face, Qface_no_inherit)))
++ {
++ ++face_change_count;
++ ++windows_or_buffers_changed;
++ }
+ }
+
+
diff --git a/tests/linux/idmap.h/merge b/tests/linux/idmap.h/merge
new file mode 100644
index 0000000..52028f8
--- /dev/null
+++ b/tests/linux/idmap.h/merge
@@ -0,0 +1,18 @@
+<<<<<<<
+|||||||
+#define IDMAP_STATUS_LOOKUPFAIL IDMAP_STATUS_FAIL
+
+
+/* XXX get (include) from bits/utmp.h */
+#define IDMAP_NAMESZ 128
+
+=======
+#define IDMAP_STATUS_LOOKUPFAIL IDMAP_STATUS_FAIL
+
+
+#define IDMAP_MAXMSGSZ 256
+
+/* XXX get (include) from bits/utmp.h */
+#define IDMAP_NAMESZ 128
+
+>>>>>>>
diff --git a/tests/linux/idmap.h/orig b/tests/linux/idmap.h/orig
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/tests/linux/idmap.h/orig
diff --git a/tests/linux/idmap.h/patch b/tests/linux/idmap.h/patch
new file mode 100644
index 0000000..2230a2b
--- /dev/null
+++ b/tests/linux/idmap.h/patch
@@ -0,0 +1,17 @@
+***************
+*** 55,60 ****
+ #define IDMAP_STATUS_LOOKUPFAIL IDMAP_STATUS_FAIL
+
+
+ /* XXX get (include) from bits/utmp.h */
+ #define IDMAP_NAMESZ 128
+
+--- 55,62 ----
+ #define IDMAP_STATUS_LOOKUPFAIL IDMAP_STATUS_FAIL
+
+
++ #define IDMAP_MAXMSGSZ 256
++
+ /* XXX get (include) from bits/utmp.h */
+ #define IDMAP_NAMESZ 128
+
diff --git a/tests/linux/inode-fullpatch/diff b/tests/linux/inode-fullpatch/diff
new file mode 100644
index 0000000..d2a8b0d
--- /dev/null
+++ b/tests/linux/inode-fullpatch/diff
@@ -0,0 +1,1330 @@
+@@ -1,1323 +1,43 @@
+-/*
+- * linux/fs/inode.c
+- *
+- * (C) 1997 Linus Torvalds
+- */
+-
+-#include <linux/config.h>
+-#include <linux/fs.h>
+-#include <linux/mm.h>
+-#include <linux/dcache.h>
+-#include <linux/init.h>
+-#include <linux/quotaops.h>
+-#include <linux/slab.h>
+-#include <linux/writeback.h>
+-#include <linux/module.h>
+-#include <linux/backing-dev.h>
+-#include <linux/wait.h>
+-#include <linux/hash.h>
+-#include <linux/swap.h>
+-#include <linux/security.h>
+-
+-/*
+- * This is needed for the following functions:
+- * - inode_has_buffers
+- * - invalidate_inode_buffers
+- * - fsync_bdev
+- * - invalidate_bdev
+- *
+- * FIXME: remove all knowledge of the buffer layer from this file
+- */
+-#include <linux/buffer_head.h>
+-
+-/*
+- * New inode.c implementation.
+- *
+- * This implementation has the basic premise of trying
+- * to be extremely low-overhead and SMP-safe, yet be
+- * simple enough to be "obviously correct".
+- *
+- * Famous last words.
+- */
+-
+-/* inode dynamic allocation 1999, Andrea Arcangeli <andrea@suse.de> */
+-
+-/* #define INODE_PARANOIA 1 */
+-/* #define INODE_DEBUG 1 */
+-
+-/*
+- * Inode lookup is no longer as critical as it used to be:
+- * most of the lookups are going to be through the dcache.
+- */
+-#define I_HASHBITS i_hash_shift
+-#define I_HASHMASK i_hash_mask
+-
+-static unsigned int i_hash_mask;
+-static unsigned int i_hash_shift;
+-
+-/*
+- * Each inode can be on two separate lists. One is
+- * the hash list of the inode, used for lookups. The
+- * other linked list is the "type" list:
+- * "in_use" - valid inode, i_count > 0, i_nlink > 0
+- * "dirty" - as "in_use" but also dirty
+- * "unused" - valid inode, i_count = 0
+- *
+- * A "dirty" list is maintained for each super block,
+- * allowing for low-overhead inode sync() operations.
+- */
+-
+-LIST_HEAD(inode_in_use);
+-LIST_HEAD(inode_unused);
+-static struct hlist_head *inode_hashtable;
+-static HLIST_HEAD(anon_hash_chain); /* for inodes with NULL i_sb */
+-
+-/*
+- * A simple spinlock to protect the list manipulations.
+- *
+- * NOTE! You also have to own the lock if you change
+- * the i_state of an inode while it is in use..
+- */
+-spinlock_t inode_lock = SPIN_LOCK_UNLOCKED;
+-
+-/*
+- * iprune_sem provides exclusion between the kswapd or try_to_free_pages
+- * icache shrinking path, and the umount path. Without this exclusion,
+- * by the time prune_icache calls iput for the inode whose pages it has
+- * been invalidating, or by the time it calls clear_inode & destroy_inode
+- * from its final dispose_list, the struct super_block they refer to
+- * (for inode->i_sb->s_op) may already have been freed and reused.
+- */
+-static DECLARE_MUTEX(iprune_sem);
+-
+-/*
+- * Statistics gathering..
+- */
+-struct inodes_stat_t inodes_stat;
+-
+-static kmem_cache_t * inode_cachep;
+-
+-static struct inode *alloc_inode(struct super_block *sb)
+-{
+- static struct address_space_operations empty_aops;
+- static struct inode_operations empty_iops;
+- static struct file_operations empty_fops;
+- struct inode *inode;
+-
+- if (sb->s_op->alloc_inode)
+- inode = sb->s_op->alloc_inode(sb);
+- else
+- inode = (struct inode *) kmem_cache_alloc(inode_cachep, SLAB_KERNEL);
+-
+- if (inode) {
+- struct address_space * const mapping = &inode->i_data;
+-
+- inode->i_sb = sb;
+- inode->i_blkbits = sb->s_blocksize_bits;
+- inode->i_flags = 0;
+- atomic_set(&inode->i_count, 1);
+- inode->i_sock = 0;
+- inode->i_op = &empty_iops;
+- inode->i_fop = &empty_fops;
+- inode->i_nlink = 1;
+- atomic_set(&inode->i_writecount, 0);
+- inode->i_size = 0;
+- inode->i_blocks = 0;
+- inode->i_bytes = 0;
+- inode->i_generation = 0;
+- memset(&inode->i_dquot, 0, sizeof(inode->i_dquot));
+- inode->i_pipe = NULL;
+- inode->i_bdev = NULL;
+- inode->i_rdev = to_kdev_t(0);
+- inode->i_security = NULL;
+- if (security_inode_alloc(inode)) {
+- if (inode->i_sb->s_op->destroy_inode)
+- inode->i_sb->s_op->destroy_inode(inode);
+- else
+- kmem_cache_free(inode_cachep, (inode));
+- return NULL;
+- }
+-
+- mapping->a_ops = &empty_aops;
+- mapping->host = inode;
+- mapping->gfp_mask = GFP_HIGHUSER;
+- mapping->dirtied_when = 0;
+- mapping->assoc_mapping = NULL;
+- mapping->backing_dev_info = &default_backing_dev_info;
+- if (sb->s_bdev)
+- mapping->backing_dev_info = sb->s_bdev->bd_inode->i_mapping->backing_dev_info;
+- memset(&inode->u, 0, sizeof(inode->u));
+- inode->i_mapping = mapping;
+- }
+- return inode;
+-}
+-
+-void destroy_inode(struct inode *inode)
+-{
+- if (inode_has_buffers(inode))
+- BUG();
+- security_inode_free(inode);
+- if (inode->i_sb->s_op->destroy_inode)
+- inode->i_sb->s_op->destroy_inode(inode);
+- else
+- kmem_cache_free(inode_cachep, (inode));
+-}
+-
+-
+-/*
+- * These are initializations that only need to be done
+- * once, because the fields are idempotent across use
+- * of the inode, so let the slab aware of that.
+- */
+-void inode_init_once(struct inode *inode)
+-{
+- memset(inode, 0, sizeof(*inode));
+- INIT_HLIST_NODE(&inode->i_hash);
+- INIT_LIST_HEAD(&inode->i_data.clean_pages);
+- INIT_LIST_HEAD(&inode->i_data.dirty_pages);
+- INIT_LIST_HEAD(&inode->i_data.locked_pages);
+- INIT_LIST_HEAD(&inode->i_data.io_pages);
+- INIT_LIST_HEAD(&inode->i_dentry);
+- INIT_LIST_HEAD(&inode->i_devices);
+- sema_init(&inode->i_sem, 1);
+- INIT_RADIX_TREE(&inode->i_data.page_tree, GFP_ATOMIC);
+- rwlock_init(&inode->i_data.page_lock);
+- init_MUTEX(&inode->i_data.i_shared_sem);
+- INIT_LIST_HEAD(&inode->i_data.private_list);
+- spin_lock_init(&inode->i_data.private_lock);
+- INIT_LIST_HEAD(&inode->i_data.i_mmap);
+- INIT_LIST_HEAD(&inode->i_data.i_mmap_shared);
+- spin_lock_init(&inode->i_lock);
+-}
+-
+-static void init_once(void * foo, kmem_cache_t * cachep, unsigned long flags)
+-{
+- struct inode * inode = (struct inode *) foo;
+-
+- if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
+- SLAB_CTOR_CONSTRUCTOR)
+- inode_init_once(inode);
+-}
+-
+-/*
+- * inode_lock must be held
+- */
+-void __iget(struct inode * inode)
+-{
+- if (atomic_read(&inode->i_count)) {
+- atomic_inc(&inode->i_count);
+- return;
+- }
+- atomic_inc(&inode->i_count);
+- if (!(inode->i_state & (I_DIRTY|I_LOCK))) {
+- list_del(&inode->i_list);
+- list_add(&inode->i_list, &inode_in_use);
+- }
+- inodes_stat.nr_unused--;
+-}
+-
+-/**
+- * clear_inode - clear an inode
+- * @inode: inode to clear
+- *
+- * This is called by the filesystem to tell us
+- * that the inode is no longer useful. We just
+- * terminate it with extreme prejudice.
+- */
+-
+-void clear_inode(struct inode *inode)
+-{
+- invalidate_inode_buffers(inode);
+-
+- if (inode->i_data.nrpages)
+- BUG();
+- if (!(inode->i_state & I_FREEING))
+- BUG();
+- if (inode->i_state & I_CLEAR)
+- BUG();
+- wait_on_inode(inode);
+- DQUOT_DROP(inode);
+- if (inode->i_sb && inode->i_sb->s_op->clear_inode)
+- inode->i_sb->s_op->clear_inode(inode);
+- if (inode->i_bdev)
+- bd_forget(inode);
+- inode->i_state = I_CLEAR;
+-}
+-
+-/*
+- * Dispose-list gets a local list with local inodes in it, so it doesn't
+- * need to worry about list corruption and SMP locks.
+- */
+-static void dispose_list(struct list_head *head)
+-{
+- int nr_disposed = 0;
+-
+- while (!list_empty(head)) {
+- struct inode *inode;
+-
+- inode = list_entry(head->next, struct inode, i_list);
+- list_del(&inode->i_list);
+-
+- if (inode->i_data.nrpages)
+- truncate_inode_pages(&inode->i_data, 0);
+- clear_inode(inode);
+- destroy_inode(inode);
+- nr_disposed++;
+- }
+- spin_lock(&inode_lock);
+- inodes_stat.nr_inodes -= nr_disposed;
+- spin_unlock(&inode_lock);
+-}
+-
+-/*
+- * Invalidate all inodes for a device.
+- */
+-static int invalidate_list(struct list_head *head, struct super_block * sb, struct list_head * dispose)
+-{
+- struct list_head *next;
+- int busy = 0, count = 0;
+-
+- next = head->next;
+- for (;;) {
+- struct list_head * tmp = next;
+- struct inode * inode;
+-
+- next = next->next;
+- if (tmp == head)
+- break;
+- inode = list_entry(tmp, struct inode, i_list);
+- if (inode->i_sb != sb)
+- continue;
+- invalidate_inode_buffers(inode);
+- if (!atomic_read(&inode->i_count)) {
+- hlist_del_init(&inode->i_hash);
+- list_del(&inode->i_list);
+- list_add(&inode->i_list, dispose);
+- inode->i_state |= I_FREEING;
+- count++;
+- continue;
+- }
+- busy = 1;
+- }
+- /* only unused inodes may be cached with i_count zero */
+- inodes_stat.nr_unused -= count;
+- return busy;
+-}
+-
+-/*
+- * This is a two-stage process. First we collect all
+- * offending inodes onto the throw-away list, and in
+- * the second stage we actually dispose of them. This
+- * is because we don't want to sleep while messing
+- * with the global lists..
+- */
+-
+-/**
+- * invalidate_inodes - discard the inodes on a device
+- * @sb: superblock
+- *
+- * Discard all of the inodes for a given superblock. If the discard
+- * fails because there are busy inodes then a non zero value is returned.
+- * If the discard is successful all the inodes have been discarded.
+- */
+-
+-int invalidate_inodes(struct super_block * sb)
+-{
+- int busy;
+- LIST_HEAD(throw_away);
+-
+- down(&iprune_sem);
+- spin_lock(&inode_lock);
+- busy = invalidate_list(&inode_in_use, sb, &throw_away);
+- busy |= invalidate_list(&inode_unused, sb, &throw_away);
+- busy |= invalidate_list(&sb->s_dirty, sb, &throw_away);
+- busy |= invalidate_list(&sb->s_io, sb, &throw_away);
+- spin_unlock(&inode_lock);
+-
+- dispose_list(&throw_away);
+- up(&iprune_sem);
+-
+- return busy;
+-}
+-
+-int invalidate_device(kdev_t dev, int do_sync)
+-{
+- struct super_block *sb;
+- struct block_device *bdev = bdget(kdev_t_to_nr(dev));
+- int res;
+-
+- if (!bdev)
+- return 0;
+-
+- if (do_sync)
+- fsync_bdev(bdev);
+-
+- res = 0;
+- sb = get_super(bdev);
+- if (sb) {
+- /*
+- * no need to lock the super, get_super holds the
+- * read semaphore so the filesystem cannot go away
+- * under us (->put_super runs with the write lock
+- * hold).
+- */
+- shrink_dcache_sb(sb);
+- res = invalidate_inodes(sb);
+- drop_super(sb);
+- }
+- invalidate_bdev(bdev, 0);
+- bdput(bdev);
+- return res;
+-}
+-
+-static int can_unuse(struct inode *inode)
+-{
+- if (inode->i_state)
+- return 0;
+- if (inode_has_buffers(inode))
+- return 0;
+- if (atomic_read(&inode->i_count))
+- return 0;
+- if (inode->i_data.nrpages)
+- return 0;
+- return 1;
+-}
+-
+-/*
+- * Scan `goal' inodes on the unused list for freeable ones. They are moved to
+- * a temporary list and then are freed outside inode_lock by dispose_list().
+- *
+- * Any inodes which are pinned purely because of attached pagecache have their
+- * pagecache removed. We expect the final iput() on that inode to add it to
+- * the front of the inode_unused list. So look for it there and if the
+- * inode is still freeable, proceed. The right inode is found 99.9% of the
+- * time in testing on a 4-way.
+- *
+- * If the inode has metadata buffers attached to mapping->private_list then
+- * try to remove them.
+- */
+-static void prune_icache(int nr_to_scan)
+-{
+- LIST_HEAD(freeable);
+- int nr_pruned = 0;
+- int nr_scanned;
+- unsigned long reap = 0;
+-
+- down(&iprune_sem);
+- spin_lock(&inode_lock);
+- for (nr_scanned = 0; nr_scanned < nr_to_scan; nr_scanned++) {
+- struct inode *inode;
+-
+- if (list_empty(&inode_unused))
+- break;
+-
+- inode = list_entry(inode_unused.prev, struct inode, i_list);
+-
+- if (inode->i_state || atomic_read(&inode->i_count)) {
+- list_move(&inode->i_list, &inode_unused);
+- continue;
+- }
+- if (inode_has_buffers(inode) || inode->i_data.nrpages) {
+- __iget(inode);
+- spin_unlock(&inode_lock);
+- if (remove_inode_buffers(inode))
+- reap += invalidate_inode_pages(&inode->i_data);
+- iput(inode);
+- spin_lock(&inode_lock);
+-
+- if (inode != list_entry(inode_unused.next,
+- struct inode, i_list))
+- continue; /* wrong inode or list_empty */
+- if (!can_unuse(inode))
+- continue;
+- }
+- hlist_del_init(&inode->i_hash);
+- list_move(&inode->i_list, &freeable);
+- inode->i_state |= I_FREEING;
+- nr_pruned++;
+- }
+- inodes_stat.nr_unused -= nr_pruned;
+- spin_unlock(&inode_lock);
+-
+- dispose_list(&freeable);
+- up(&iprune_sem);
+-
+- if (current_is_kswapd)
+- mod_page_state(kswapd_inodesteal, reap);
+- else
+- mod_page_state(pginodesteal, reap);
+-}
+-
+-/*
+- * shrink_icache_memory() will attempt to reclaim some unused inodes. Here,
+- * "unused" means that no dentries are referring to the inodes: the files are
+- * not open and the dcache references to those inodes have already been
+- * reclaimed.
+- *
+- * This function is passed the number of inodes to scan, and it returns the
+- * total number of remaining possibly-reclaimable inodes.
+- */
+-static int shrink_icache_memory(int nr, unsigned int gfp_mask)
+-{
+- if (nr) {
+- /*
+- * Nasty deadlock avoidance. We may hold various FS locks,
+- * and we don't want to recurse into the FS that called us
+- * in clear_inode() and friends..
+- */
+- if (gfp_mask & __GFP_FS)
+- prune_icache(nr);
+- }
++*** 470,6 **** 1
+| return inodes_stat.<<<--nr_unused-->>><<<++nr_inodes++>>>;
+ }
+
+ /*
+ * Called with the inode lock held.
+ * NOTE: we are not increasing the inode-refcount, you must call __iget()
+- * by hand after calling find_inode now! This simplifies iunique and won't
+- * add any additional branch in the common code.
+- */
+-static struct inode * find_inode(struct super_block * sb, struct hlist_head *head, int (*test)(struct inode *, void *), void *data)
+-{
+- struct hlist_node *node;
+- struct inode * inode = NULL;
+-
+- hlist_for_each (node, head) {
+- prefetch(node->next);
+- inode = hlist_entry(node, struct inode, i_hash);
+- if (inode->i_sb != sb)
++*** 492,6 **** 2
+ continue;
+ if (!test(inode, data))
+ continue;
+ break;
+ }
+| return<<<-- node ?-->>> inode<<<-- : NULL-->>>;
+-}
+-
+-/*
+- * find_inode_fast is the fast path version of find_inode, see the comment at
+- * iget_locked for details.
+- */
+-static struct inode * find_inode_fast(struct super_block * sb, struct hlist_head *head, unsigned long ino)
+-{
+- struct hlist_node *node;
+- struct inode * inode = NULL;
+-
+- hlist_for_each (node, head) {
+- prefetch(node->next);
+- inode = list_entry(node, struct inode, i_hash);
+- if (inode->i_ino != ino)
++*** 517,6 **** 3
+ continue;
+ if (inode->i_sb != sb)
+ continue;
+ break;
+ }
+| return<<<-- node ?-->>> inode<<<-- : NULL-->>>;
+-}
+-
+-/**
+- * new_inode - obtain an inode
+- * @sb: superblock
+- *
+- * Allocates a new inode for given superblock.
+- */
+-
+-struct inode *new_inode(struct super_block *sb)
+-{
+- static unsigned long last_ino;
+- struct inode * inode;
+-
+- spin_lock_prefetch(&inode_lock);
+-
+- inode = alloc_inode(sb);
+- if (inode) {
+- spin_lock(&inode_lock);
+- inodes_stat.nr_inodes++;
+- list_add(&inode->i_list, &inode_in_use);
+- inode->i_ino = ++last_ino;
+- inode->i_state = 0;
+- spin_unlock(&inode_lock);
+- }
+- return inode;
+-}
+-
+-void unlock_new_inode(struct inode *inode)
+-{
+- /*
+- * This is special! We do not need the spinlock
+- * when clearing I_LOCK, because we're guaranteed
+- * that nobody else tries to do anything about the
+- * state of the inode when it is locked, as we
+- * just created it (so there can be no old holders
+- * that haven't tested I_LOCK).
+- */
+- inode->i_state &= ~(I_LOCK|I_NEW);
+- wake_up_inode(inode);
+-}
+-EXPORT_SYMBOL(unlock_new_inode);
+-
+-/*
+- * This is called without the inode lock held.. Be careful.
+- *
+- * We no longer cache the sb_flags in i_flags - see fs.h
+- * -- rmk@arm.uk.linux.org
+- */
+-static struct inode * get_new_inode(struct super_block *sb, struct hlist_head *head, int (*test)(struct inode *, void *), int (*set)(struct inode *, void *), void *data)
+-{
+- struct inode * inode;
+-
+- inode = alloc_inode(sb);
+- if (inode) {
+- struct inode * old;
+-
+- spin_lock(&inode_lock);
+- /* We released the lock, so.. */
+- old = find_inode(sb, head, test, data);
+- if (!old) {
+- if (set(inode, data))
+- goto set_failed;
+-
+- inodes_stat.nr_inodes++;
+- list_add(&inode->i_list, &inode_in_use);
+- hlist_add_head(&inode->i_hash, head);
+- inode->i_state = I_LOCK|I_NEW;
+- spin_unlock(&inode_lock);
+-
+- /* Return the locked inode with I_NEW set, the
+- * caller is responsible for filling in the contents
+- */
+- return inode;
+- }
+-
+- /*
+- * Uhhuh, somebody else created the same inode under
+- * us. Use the old inode instead of the one we just
+- * allocated.
+- */
+- __iget(old);
+- spin_unlock(&inode_lock);
+- destroy_inode(inode);
+- inode = old;
+- wait_on_inode(inode);
+- }
+- return inode;
+-
+-set_failed:
+- spin_unlock(&inode_lock);
+- destroy_inode(inode);
+- return NULL;
+-}
+-
+-/*
+- * get_new_inode_fast is the fast path version of get_new_inode, see the
+- * comment at iget_locked for details.
+- */
+-static struct inode * get_new_inode_fast(struct super_block *sb, struct hlist_head *head, unsigned long ino)
+-{
+- struct inode * inode;
+-
+- inode = alloc_inode(sb);
+- if (inode) {
+- struct inode * old;
+-
+- spin_lock(&inode_lock);
+- /* We released the lock, so.. */
+- old = find_inode_fast(sb, head, ino);
+- if (!old) {
+- inode->i_ino = ino;
+- inodes_stat.nr_inodes++;
+- list_add(&inode->i_list, &inode_in_use);
+- hlist_add_head(&inode->i_hash, head);
+- inode->i_state = I_LOCK|I_NEW;
+- spin_unlock(&inode_lock);
+-
+- /* Return the locked inode with I_NEW set, the
+- * caller is responsible for filling in the contents
+- */
+- return inode;
+- }
+-
+- /*
+- * Uhhuh, somebody else created the same inode under
+- * us. Use the old inode instead of the one we just
+- * allocated.
+- */
+- __iget(old);
+- spin_unlock(&inode_lock);
+- destroy_inode(inode);
+- inode = old;
+- wait_on_inode(inode);
+- }
+- return inode;
+-}
+-
+-static inline unsigned long hash(struct super_block *sb, unsigned long hashval)
+-{
+- unsigned long tmp = hashval + ((unsigned long) sb / L1_CACHE_BYTES);
+- tmp = tmp + (tmp >> I_HASHBITS);
+- return tmp & I_HASHMASK;
+-}
+-
+-/* Yeah, I know about quadratic hash. Maybe, later. */
+-
+-/**
+- * iunique - get a unique inode number
+- * @sb: superblock
+- * @max_reserved: highest reserved inode number
+- *
+- * Obtain an inode number that is unique on the system for a given
+- * superblock. This is used by file systems that have no natural
+- * permanent inode numbering system. An inode number is returned that
+- * is higher than the reserved limit but unique.
+- *
+- * BUGS:
+- * With a large number of inodes live on the file system this function
+- * currently becomes quite slow.
+- */
+-
+-ino_t iunique(struct super_block *sb, ino_t max_reserved)
+-{
+- static ino_t counter = 0;
+- struct inode *inode;
+- struct hlist_head * head;
+- ino_t res;
+- spin_lock(&inode_lock);
+-retry:
+- if (counter > max_reserved) {
+- head = inode_hashtable + hash(sb,counter);
+- res = counter++;
+- inode = find_inode_fast(sb, head, res);
+- if (!inode) {
+- spin_unlock(&inode_lock);
+- return res;
+- }
+- } else {
+- counter = max_reserved + 1;
+- }
+- goto retry;
+-
+-}
+-
+-struct inode *igrab(struct inode *inode)
+-{
+- spin_lock(&inode_lock);
+- if (!(inode->i_state & I_FREEING))
+- __iget(inode);
+- else
+- /*
+- * Handle the case where s_op->clear_inode is not been
+- * called yet, and somebody is calling igrab
+- * while the inode is getting freed.
+- */
+- inode = NULL;
+- spin_unlock(&inode_lock);
+- return inode;
+-}
+-
+-/**
+- * ifind - internal function, you want ilookup5() or iget5().
+- * @sb: super block of file system to search
+- * @hashval: hash value (usually inode number) to search for
+- * @test: callback used for comparisons between inodes
+- * @data: opaque data pointer to pass to @test
+- *
+- * ifind() searches for the inode specified by @hashval and @data in the inode
+- * cache. This is a generalized version of ifind_fast() for file systems where
+- * the inode number is not sufficient for unique identification of an inode.
+- *
+- * If the inode is in the cache, the inode is returned with an incremented
+- * reference count.
+- *
+- * Otherwise NULL is returned.
+- *
+- * Note, @test is called with the inode_lock held, so can't sleep.
+- */
+-static inline struct inode *ifind(struct super_block *sb,
+- struct hlist_head *head, int (*test)(struct inode *, void *),
+- void *data)
+-{
+- struct inode *inode;
+-
+- spin_lock(&inode_lock);
+- inode = find_inode(sb, head, test, data);
+- if (inode) {
+- __iget(inode);
+- spin_unlock(&inode_lock);
+- wait_on_inode(inode);
+- return inode;
+- }
+- spin_unlock(&inode_lock);
+- return NULL;
+-}
+-
+-/**
+- * ifind_fast - internal function, you want ilookup() or iget().
+- * @sb: super block of file system to search
+- * @ino: inode number to search for
+- *
+- * ifind_fast() searches for the inode @ino in the inode cache. This is for
+- * file systems where the inode number is sufficient for unique identification
+- * of an inode.
+- *
+- * If the inode is in the cache, the inode is returned with an incremented
+- * reference count.
+- *
+- * Otherwise NULL is returned.
+- */
+-static inline struct inode *ifind_fast(struct super_block *sb,
+- struct hlist_head *head, unsigned long ino)
+-{
+- struct inode *inode;
+-
+- spin_lock(&inode_lock);
+- inode = find_inode_fast(sb, head, ino);
+- if (inode) {
+- __iget(inode);
+- spin_unlock(&inode_lock);
+- wait_on_inode(inode);
+- return inode;
+- }
+- spin_unlock(&inode_lock);
+- return NULL;
+-}
+-
+-/**
+- * ilookup5 - search for an inode in the inode cache
+- * @sb: super block of file system to search
+- * @hashval: hash value (usually inode number) to search for
+- * @test: callback used for comparisons between inodes
+- * @data: opaque data pointer to pass to @test
+- *
+- * ilookup5() uses ifind() to search for the inode specified by @hashval and
+- * @data in the inode cache. This is a generalized version of ilookup() for
+- * file systems where the inode number is not sufficient for unique
+- * identification of an inode.
+- *
+- * If the inode is in the cache, the inode is returned with an incremented
+- * reference count.
+- *
+- * Otherwise NULL is returned.
+- *
+- * Note, @test is called with the inode_lock held, so can't sleep.
+- */
+-struct inode *ilookup5(struct super_block *sb, unsigned long hashval,
+- int (*test)(struct inode *, void *), void *data)
+-{
+- struct hlist_head *head = inode_hashtable + hash(sb, hashval);
+-
+- return ifind(sb, head, test, data);
+-}
+-EXPORT_SYMBOL(ilookup5);
+-
+-/**
+- * ilookup - search for an inode in the inode cache
+- * @sb: super block of file system to search
+- * @ino: inode number to search for
+- *
+- * ilookup() uses ifind_fast() to search for the inode @ino in the inode cache.
+- * This is for file systems where the inode number is sufficient for unique
+- * identification of an inode.
+- *
+- * If the inode is in the cache, the inode is returned with an incremented
+- * reference count.
+- *
+- * Otherwise NULL is returned.
+- */
+-struct inode *ilookup(struct super_block *sb, unsigned long ino)
+-{
+- struct hlist_head *head = inode_hashtable + hash(sb, ino);
+-
+- return ifind_fast(sb, head, ino);
+-}
+-EXPORT_SYMBOL(ilookup);
+-
+-/**
+- * iget5_locked - obtain an inode from a mounted file system
+- * @sb: super block of file system
+- * @hashval: hash value (usually inode number) to get
+- * @test: callback used for comparisons between inodes
+- * @set: callback used to initialize a new struct inode
+- * @data: opaque data pointer to pass to @test and @set
+- *
+- * This is iget() without the read_inode() portion of get_new_inode().
+- *
+- * iget5_locked() uses ifind() to search for the inode specified by @hashval
+- * and @data in the inode cache and if present it is returned with an increased
+- * reference count. This is a generalized version of iget_locked() for file
+- * systems where the inode number is not sufficient for unique identification
+- * of an inode.
+- *
+- * If the inode is not in cache, get_new_inode() is called to allocate a new
+- * inode and this is returned locked, hashed, and with the I_NEW flag set. The
+- * file system gets to fill it in before unlocking it via unlock_new_inode().
+- *
+- * Note both @test and @set are called with the inode_lock held, so can't sleep.
+- */
+-struct inode *iget5_locked(struct super_block *sb, unsigned long hashval,
+- int (*test)(struct inode *, void *),
+- int (*set)(struct inode *, void *), void *data)
+-{
+- struct hlist_head *head = inode_hashtable + hash(sb, hashval);
+- struct inode *inode;
+-
+- inode = ifind(sb, head, test, data);
+- if (inode)
+- return inode;
+- /*
+- * get_new_inode() will do the right thing, re-trying the search
+- * in case it had to block at any point.
+- */
+- return get_new_inode(sb, head, test, set, data);
+-}
+-EXPORT_SYMBOL(iget5_locked);
+-
+-/**
+- * iget_locked - obtain an inode from a mounted file system
+- * @sb: super block of file system
+- * @ino: inode number to get
+- *
+- * This is iget() without the read_inode() portion of get_new_inode_fast().
+- *
+- * iget_locked() uses ifind_fast() to search for the inode specified by @ino in
+- * the inode cache and if present it is returned with an increased reference
+- * count. This is for file systems where the inode number is sufficient for
+- * unique identification of an inode.
+- *
+- * If the inode is not in cache, get_new_inode_fast() is called to allocate a
+- * new inode and this is returned locked, hashed, and with the I_NEW flag set.
+- * The file system gets to fill it in before unlocking it via
+- * unlock_new_inode().
+- */
+-struct inode *iget_locked(struct super_block *sb, unsigned long ino)
+-{
+- struct hlist_head *head = inode_hashtable + hash(sb, ino);
+- struct inode *inode;
+-
+- inode = ifind_fast(sb, head, ino);
+- if (inode)
+- return inode;
+- /*
+- * get_new_inode_fast() will do the right thing, re-trying the search
+- * in case it had to block at any point.
+- */
+- return get_new_inode_fast(sb, head, ino);
+-}
+-EXPORT_SYMBOL(iget_locked);
+-
+-/**
+- * __insert_inode_hash - hash an inode
+- * @inode: unhashed inode
+- * @hashval: unsigned long value used to locate this object in the
+- * inode_hashtable.
+- *
+- * Add an inode to the inode hash for this superblock. If the inode
+- * has no superblock it is added to a separate anonymous chain.
+- */
+-
+-void __insert_inode_hash(struct inode *inode, unsigned long hashval)
+-{
+- struct hlist_head *head = &anon_hash_chain;
+- if (inode->i_sb)
+- head = inode_hashtable + hash(inode->i_sb, hashval);
+- spin_lock(&inode_lock);
+- hlist_add_head(&inode->i_hash, head);
+- spin_unlock(&inode_lock);
+-}
+-
+-/**
+- * remove_inode_hash - remove an inode from the hash
+- * @inode: inode to unhash
+- *
+- * Remove an inode from the superblock or anonymous hash.
+- */
+-
+-void remove_inode_hash(struct inode *inode)
+-{
+- spin_lock(&inode_lock);
+- hlist_del_init(&inode->i_hash);
+- spin_unlock(&inode_lock);
+-}
+-
+-void generic_delete_inode(struct inode *inode)
++*** 949,7 **** 4
+ {
+ struct super_operations *op = inode->i_sb->s_op;
+
+| <<<--hlist_del_init-->>><<<++list_del_init++>>>(&inode->i_hash);
+ list_del_init(&inode->i_list);
+ inode->i_state|=I_FREEING;
+ inodes_stat.nr_inodes--;
+- spin_unlock(&inode_lock);
+-
+- if (inode->i_data.nrpages)
+- truncate_inode_pages(&inode->i_data, 0);
+-
+- security_inode_delete(inode);
+-
+- if (op->delete_inode) {
+- void (*delete)(struct inode *) = op->delete_inode;
+- if (!is_bad_inode(inode))
+- DQUOT_INIT(inode);
+- /* s_op->delete_inode internally recalls clear_inode() */
++*** 968,6 **** 5
+ delete(inode);
+ } else
+ clear_inode(inode);
+ if (inode->i_state != I_CLEAR)
+ BUG();
+ destroy_inode(inode);
+-}
+-EXPORT_SYMBOL(generic_delete_inode);
+-
+-static void generic_forget_inode(struct inode *inode)
+-{
+- struct super_block *sb = inode->i_sb;
+-
+- if (!hlist_unhashed(&inode->i_hash)) {
+- if (!(inode->i_state & (I_DIRTY|I_LOCK))) {
+- list_del(&inode->i_list);
+- list_add(&inode->i_list, &inode_unused);
+- }
+- inodes_stat.nr_unused++;
+- spin_unlock(&inode_lock);
+- if (!sb || (sb->s_flags & MS_ACTIVE))
+- return;
+- write_inode_now(inode, 1);
+- spin_lock(&inode_lock);
+- inodes_stat.nr_unused--;
+- hlist_del_init(&inode->i_hash);
+- }
+- list_del_init(&inode->i_list);
+- inode->i_state|=I_FREEING;
+- inodes_stat.nr_inodes--;
+- spin_unlock(&inode_lock);
+- if (inode->i_data.nrpages)
+- truncate_inode_pages(&inode->i_data, 0);
+- clear_inode(inode);
+- destroy_inode(inode);
+-}
+-
+-/*
+- * Normal UNIX filesystem behaviour: delete the
+- * inode when the usage count drops to zero, and
+- * i_nlink is zero.
+- */
+-static void generic_drop_inode(struct inode *inode)
+-{
+- if (!inode->i_nlink)
+- generic_delete_inode(inode);
+- else
+- generic_forget_inode(inode);
+-}
+-
+-/*
+- * Called when we're dropping the last reference
+- * to an inode.
+- *
+- * Call the FS "drop()" function, defaulting to
+- * the legacy UNIX filesystem behaviour..
+- *
+- * NOTE! NOTE! NOTE! We're called with the inode lock
+- * held, and the drop function is supposed to release
+- * the lock!
+- */
+-static inline void iput_final(struct inode *inode)
+-{
+- struct super_operations *op = inode->i_sb->s_op;
+- void (*drop)(struct inode *) = generic_drop_inode;
+-
+- if (op && op->drop_inode)
+- drop = op->drop_inode;
+- drop(inode);
+-}
+-
+-/**
+- * iput - put an inode
+- * @inode: inode to put
+- *
+- * Puts an inode, dropping its usage count. If the inode use count hits
+- * zero the inode is also then freed and may be destroyed.
+- */
+-
+-void iput(struct inode *inode)
+-{
+- if (inode) {
+- struct super_operations *op = inode->i_sb->s_op;
+-
+- if (inode->i_state == I_CLEAR)
+- BUG();
+-
+- if (op && op->put_inode)
+- op->put_inode(inode);
+-
+- if (atomic_dec_and_lock(&inode->i_count, &inode_lock))
+- iput_final(inode);
+- }
+-}
+-
+-/**
+- * bmap - find a block number in a file
+- * @inode: inode of file
+- * @block: block to find
+- *
+- * Returns the block number on the device holding the inode that
+- * is the disk block number for the block of the file requested.
+- * That is, asked for block 4 of inode 1 the function will return the
+- * disk block relative to the disk start that holds that block of the
+- * file.
+- */
+-
+-sector_t bmap(struct inode * inode, sector_t block)
+-{
+- sector_t res = 0;
+- if (inode->i_mapping->a_ops->bmap)
+- res = inode->i_mapping->a_ops->bmap(inode->i_mapping, block);
+- return res;
+-}
+-
+-/*
+- * Return true if the filesystem which backs this inode considers the two
+- * passed timespecs to be sufficiently different to warrant flushing the
+- * altered time out to disk.
+- */
+-static int inode_times_differ(struct inode *inode,
+- struct timespec *old, struct timespec *new)
+-{
+- if (IS_ONE_SECOND(inode))
+- return old->tv_sec != new->tv_sec;
+- return !timespec_equal(old, new);
+-}
+-
+-/**
+- * update_atime - update the access time
+- * @inode: inode accessed
+- *
+- * Update the accessed time on an inode and mark it for writeback.
+- * This function automatically handles read only file systems and media,
+- * as well as the "noatime" flag and inode specific "noatime" markers.
+- */
+-
+-void update_atime(struct inode *inode)
+-{
+- struct timespec now;
+-
+- if (IS_NOATIME(inode))
+- return;
+- if (IS_NODIRATIME(inode) && S_ISDIR(inode->i_mode))
+- return;
+- if (IS_RDONLY(inode))
+- return;
+-
+- now = current_kernel_time();
+- if (inode_times_differ(inode, &inode->i_atime, &now)) {
+- inode->i_atime = now;
+- mark_inode_dirty_sync(inode);
+- } else {
+- if (!timespec_equal(&inode->i_atime, &now))
+- inode->i_atime = now;
+- }
+-}
+-
+-/**
+- * inode_update_time - update mtime and ctime time
+- * @inode: inode accessed
+- * @ctime_too: update ctime too
+- *
+- * Update the mtime time on an inode and mark it for writeback.
+- * When ctime_too is specified update the ctime too.
+- */
+-
+-void inode_update_time(struct inode *inode, int ctime_too)
+-{
+- struct timespec now = current_kernel_time();
+- int sync_it = 0;
+-
+- if (inode_times_differ(inode, &inode->i_mtime, &now))
+- sync_it = 1;
+- inode->i_mtime = now;
+-
+- if (ctime_too) {
+- if (inode_times_differ(inode, &inode->i_ctime, &now))
+- sync_it = 1;
+- inode->i_ctime = now;
+- }
+- if (sync_it)
+- mark_inode_dirty_sync(inode);
+-}
+-EXPORT_SYMBOL(inode_update_time);
+-
+-int inode_needs_sync(struct inode *inode)
+-{
+- if (IS_SYNC(inode))
+- return 1;
+- if (S_ISDIR(inode->i_mode) && IS_DIRSYNC(inode))
+- return 1;
+- return 0;
+-}
+-EXPORT_SYMBOL(inode_needs_sync);
+-
+-/*
+- * Quota functions that want to walk the inode lists..
+- */
+-#ifdef CONFIG_QUOTA
+-
+-/* Functions back in dquot.c */
+-void put_dquot_list(struct list_head *);
+-int remove_inode_dquot_ref(struct inode *, int, struct list_head *);
+-
+-void remove_dquot_ref(struct super_block *sb, int type)
+-{
+- struct inode *inode;
+- struct list_head *act_head;
+- LIST_HEAD(tofree_head);
+-
+- if (!sb->dq_op)
+- return; /* nothing to do */
+- spin_lock(&inode_lock); /* This lock is for inodes code */
+- /* We don't have to lock against quota code - test IS_QUOTAINIT is just for speedup... */
+-
+- list_for_each(act_head, &inode_in_use) {
+- inode = list_entry(act_head, struct inode, i_list);
+- if (inode->i_sb == sb && IS_QUOTAINIT(inode))
+- remove_inode_dquot_ref(inode, type, &tofree_head);
+- }
+- list_for_each(act_head, &inode_unused) {
+- inode = list_entry(act_head, struct inode, i_list);
+- if (inode->i_sb == sb && IS_QUOTAINIT(inode))
+- remove_inode_dquot_ref(inode, type, &tofree_head);
+- }
+- list_for_each(act_head, &sb->s_dirty) {
+- inode = list_entry(act_head, struct inode, i_list);
+- if (IS_QUOTAINIT(inode))
+- remove_inode_dquot_ref(inode, type, &tofree_head);
+- }
+- list_for_each(act_head, &sb->s_io) {
+- inode = list_entry(act_head, struct inode, i_list);
+- if (IS_QUOTAINIT(inode))
+- remove_inode_dquot_ref(inode, type, &tofree_head);
+- }
+- spin_unlock(&inode_lock);
+-
+- put_dquot_list(&tofree_head);
+-}
+-
+-#endif
+-
+-/*
+- * Hashed waitqueues for wait_on_inode(). The table is pretty small - the
+- * kernel doesn't lock many inodes at the same time.
+- */
+-#define I_WAIT_TABLE_ORDER 3
+-static struct i_wait_queue_head {
+- wait_queue_head_t wqh;
+-} ____cacheline_aligned_in_smp i_wait_queue_heads[1<<I_WAIT_TABLE_ORDER];
+-
+-/*
+- * Return the address of the waitqueue_head to be used for this inode
+- */
+-static wait_queue_head_t *i_waitq_head(struct inode *inode)
+-{
+- return &i_wait_queue_heads[hash_ptr(inode, I_WAIT_TABLE_ORDER)].wqh;
+-}
+-
+-void __wait_on_inode(struct inode *inode)
+-{
+- DECLARE_WAITQUEUE(wait, current);
+- wait_queue_head_t *wq = i_waitq_head(inode);
+-
+- add_wait_queue(wq, &wait);
+-repeat:
+- set_current_state(TASK_UNINTERRUPTIBLE);
+- if (inode->i_state & I_LOCK) {
+- schedule();
+- goto repeat;
+- }
+- remove_wait_queue(wq, &wait);
+|<<<-- __set_current_state(-->>><<<++*** 1219,6 **** 6
+| current->state = ++>>>TASK_RUNNING<<<--)-->>>;
+ }
+
+ void wake_up_inode(struct inode *inode)
+ {
+ wait_queue_head_t *wq = i_waitq_head(inode);
+-
+- /*
+- * Prevent speculative execution through spin_unlock(&inode_lock);
+- */
+- smp_mb();
+- if (waitqueue_active(wq))
+- wake_up_all(wq);
+-}
+-
+-/*
+- * Initialize the waitqueues and inode hash table.
+- */
+-void __init inode_init(unsigned long mempages)
+-{
+- struct hlist_head *head;
+- unsigned long order;
+- unsigned int nr_hash;
+- int i;
+-
+- for (i = 0; i < ARRAY_SIZE(i_wait_queue_heads); i++)
+- init_waitqueue_head(&i_wait_queue_heads[i].wqh);
+-
+- mempages >>= (14 - PAGE_SHIFT);
+- mempages *= sizeof(struct list_head);
+- for (order = 0; ((1UL << order) << PAGE_SHIFT) < mempages; order++)
+- ;
+-
+- do {
+- unsigned long tmp;
+-
+- nr_hash = (1UL << order) * PAGE_SIZE /
+- sizeof(struct hlist_head);
+- i_hash_mask = (nr_hash - 1);
+-
+- tmp = nr_hash;
+- i_hash_shift = 0;
+- while ((tmp >>= 1UL) != 0UL)
+- i_hash_shift++;
+-
+- inode_hashtable = (struct hlist_head *)
+- __get_free_pages(GFP_ATOMIC, order);
+- } while (inode_hashtable == NULL && --order >= 0);
+-
+- printk("Inode-cache hash table entries: %d (order: %ld, %ld bytes)\n",
+- nr_hash, order, (PAGE_SIZE << order));
+-
+- if (!inode_hashtable)
+- panic("Failed to allocate inode hash table\n");
+-
+- head = inode_hashtable;
+- i = nr_hash;
+- do {
+- INIT_HLIST_HEAD(head);
+- head++;
+- i--;
+- } while (i);
+-
+- /* inode slab cache */
+- inode_cachep = kmem_cache_create("inode_cache", sizeof(struct inode),
+- 0, SLAB_HWCACHE_ALIGN, init_once,
+- NULL);
+- if (!inode_cachep)
+- panic("cannot create inode slab cache");
+-
+- set_shrinker(DEFAULT_SEEKS, shrink_icache_memory);
+-}
+-
+-void init_special_inode(struct inode *inode, umode_t mode, dev_t rdev)
+-{
+- inode->i_mode = mode;
+- if (S_ISCHR(mode)) {
+- inode->i_fop = &def_chr_fops;
+- inode->i_rdev = to_kdev_t(rdev);
+- } else if (S_ISBLK(mode)) {
+- inode->i_fop = &def_blk_fops;
+- inode->i_rdev = to_kdev_t(rdev);
+- } else if (S_ISFIFO(mode))
+- inode->i_fop = &def_fifo_fops;
+- else if (S_ISSOCK(mode))
+- inode->i_fop = &bad_sock_fops;
+- else
+- printk(KERN_DEBUG "init_special_inode: bogus i_mode (%o)\n",
+- mode);
+-}
diff --git a/tests/linux/inode-fullpatch/merge b/tests/linux/inode-fullpatch/merge
new file mode 100644
index 0000000..685b14e
--- /dev/null
+++ b/tests/linux/inode-fullpatch/merge
@@ -0,0 +1,1358 @@
+/*
+ * linux/fs/inode.c
+ *
+ * (C) 1997 Linus Torvalds
+ */
+
+#include <linux/config.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/dcache.h>
+#include <linux/init.h>
+#include <linux/quotaops.h>
+#include <linux/slab.h>
+#include <linux/writeback.h>
+#include <linux/module.h>
+#include <linux/backing-dev.h>
+#include <linux/wait.h>
+#include <linux/hash.h>
+#include <linux/swap.h>
+#include <linux/security.h>
+
+/*
+ * This is needed for the following functions:
+ * - inode_has_buffers
+ * - invalidate_inode_buffers
+ * - fsync_bdev
+ * - invalidate_bdev
+ *
+ * FIXME: remove all knowledge of the buffer layer from this file
+ */
+#include <linux/buffer_head.h>
+
+/*
+ * New inode.c implementation.
+ *
+ * This implementation has the basic premise of trying
+ * to be extremely low-overhead and SMP-safe, yet be
+ * simple enough to be "obviously correct".
+ *
+ * Famous last words.
+ */
+
+/* inode dynamic allocation 1999, Andrea Arcangeli <andrea@suse.de> */
+
+/* #define INODE_PARANOIA 1 */
+/* #define INODE_DEBUG 1 */
+
+/*
+ * Inode lookup is no longer as critical as it used to be:
+ * most of the lookups are going to be through the dcache.
+ */
+#define I_HASHBITS i_hash_shift
+#define I_HASHMASK i_hash_mask
+
+static unsigned int i_hash_mask;
+static unsigned int i_hash_shift;
+
+/*
+ * Each inode can be on two separate lists. One is
+ * the hash list of the inode, used for lookups. The
+ * other linked list is the "type" list:
+ * "in_use" - valid inode, i_count > 0, i_nlink > 0
+ * "dirty" - as "in_use" but also dirty
+ * "unused" - valid inode, i_count = 0
+ *
+ * A "dirty" list is maintained for each super block,
+ * allowing for low-overhead inode sync() operations.
+ */
+
+LIST_HEAD(inode_in_use);
+LIST_HEAD(inode_unused);
+static struct hlist_head *inode_hashtable;
+static HLIST_HEAD(anon_hash_chain); /* for inodes with NULL i_sb */
+
+/*
+ * A simple spinlock to protect the list manipulations.
+ *
+ * NOTE! You also have to own the lock if you change
+ * the i_state of an inode while it is in use..
+ */
+spinlock_t inode_lock = SPIN_LOCK_UNLOCKED;
+
+/*
+ * iprune_sem provides exclusion between the kswapd or try_to_free_pages
+ * icache shrinking path, and the umount path. Without this exclusion,
+ * by the time prune_icache calls iput for the inode whose pages it has
+ * been invalidating, or by the time it calls clear_inode & destroy_inode
+ * from its final dispose_list, the struct super_block they refer to
+ * (for inode->i_sb->s_op) may already have been freed and reused.
+ */
+static DECLARE_MUTEX(iprune_sem);
+
+/*
+ * Statistics gathering..
+ */
+struct inodes_stat_t inodes_stat;
+
+static kmem_cache_t * inode_cachep;
+
+static struct inode *alloc_inode(struct super_block *sb)
+{
+ static struct address_space_operations empty_aops;
+ static struct inode_operations empty_iops;
+ static struct file_operations empty_fops;
+ struct inode *inode;
+
+ if (sb->s_op->alloc_inode)
+ inode = sb->s_op->alloc_inode(sb);
+ else
+ inode = (struct inode *) kmem_cache_alloc(inode_cachep, SLAB_KERNEL);
+
+ if (inode) {
+ struct address_space * const mapping = &inode->i_data;
+
+ inode->i_sb = sb;
+ inode->i_blkbits = sb->s_blocksize_bits;
+ inode->i_flags = 0;
+ atomic_set(&inode->i_count, 1);
+ inode->i_sock = 0;
+ inode->i_op = &empty_iops;
+ inode->i_fop = &empty_fops;
+ inode->i_nlink = 1;
+ atomic_set(&inode->i_writecount, 0);
+ inode->i_size = 0;
+ inode->i_blocks = 0;
+ inode->i_bytes = 0;
+ inode->i_generation = 0;
+ memset(&inode->i_dquot, 0, sizeof(inode->i_dquot));
+ inode->i_pipe = NULL;
+ inode->i_bdev = NULL;
+ inode->i_rdev = to_kdev_t(0);
+ inode->i_security = NULL;
+ if (security_inode_alloc(inode)) {
+ if (inode->i_sb->s_op->destroy_inode)
+ inode->i_sb->s_op->destroy_inode(inode);
+ else
+ kmem_cache_free(inode_cachep, (inode));
+ return NULL;
+ }
+
+ mapping->a_ops = &empty_aops;
+ mapping->host = inode;
+ mapping->gfp_mask = GFP_HIGHUSER;
+ mapping->dirtied_when = 0;
+ mapping->assoc_mapping = NULL;
+ mapping->backing_dev_info = &default_backing_dev_info;
+ if (sb->s_bdev)
+ mapping->backing_dev_info = sb->s_bdev->bd_inode->i_mapping->backing_dev_info;
+ memset(&inode->u, 0, sizeof(inode->u));
+ inode->i_mapping = mapping;
+ }
+ return inode;
+}
+
+void destroy_inode(struct inode *inode)
+{
+ if (inode_has_buffers(inode))
+ BUG();
+ security_inode_free(inode);
+ if (inode->i_sb->s_op->destroy_inode)
+ inode->i_sb->s_op->destroy_inode(inode);
+ else
+ kmem_cache_free(inode_cachep, (inode));
+}
+
+
+/*
+ * These are initializations that only need to be done
+ * once, because the fields are idempotent across use
+ * of the inode, so let the slab aware of that.
+ */
+void inode_init_once(struct inode *inode)
+{
+ memset(inode, 0, sizeof(*inode));
+ INIT_HLIST_NODE(&inode->i_hash);
+ INIT_LIST_HEAD(&inode->i_data.clean_pages);
+ INIT_LIST_HEAD(&inode->i_data.dirty_pages);
+ INIT_LIST_HEAD(&inode->i_data.locked_pages);
+ INIT_LIST_HEAD(&inode->i_data.io_pages);
+ INIT_LIST_HEAD(&inode->i_dentry);
+ INIT_LIST_HEAD(&inode->i_devices);
+ sema_init(&inode->i_sem, 1);
+ INIT_RADIX_TREE(&inode->i_data.page_tree, GFP_ATOMIC);
+ rwlock_init(&inode->i_data.page_lock);
+ init_MUTEX(&inode->i_data.i_shared_sem);
+ INIT_LIST_HEAD(&inode->i_data.private_list);
+ spin_lock_init(&inode->i_data.private_lock);
+ INIT_LIST_HEAD(&inode->i_data.i_mmap);
+ INIT_LIST_HEAD(&inode->i_data.i_mmap_shared);
+ spin_lock_init(&inode->i_lock);
+}
+
+static void init_once(void * foo, kmem_cache_t * cachep, unsigned long flags)
+{
+ struct inode * inode = (struct inode *) foo;
+
+ if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
+ SLAB_CTOR_CONSTRUCTOR)
+ inode_init_once(inode);
+}
+
+/*
+ * inode_lock must be held
+ */
+void __iget(struct inode * inode)
+{
+ if (atomic_read(&inode->i_count)) {
+ atomic_inc(&inode->i_count);
+ return;
+ }
+ atomic_inc(&inode->i_count);
+ if (!(inode->i_state & (I_DIRTY|I_LOCK))) {
+ list_del(&inode->i_list);
+ list_add(&inode->i_list, &inode_in_use);
+ }
+ inodes_stat.nr_unused--;
+}
+
+/**
+ * clear_inode - clear an inode
+ * @inode: inode to clear
+ *
+ * This is called by the filesystem to tell us
+ * that the inode is no longer useful. We just
+ * terminate it with extreme prejudice.
+ */
+
+void clear_inode(struct inode *inode)
+{
+ invalidate_inode_buffers(inode);
+
+ if (inode->i_data.nrpages)
+ BUG();
+ if (!(inode->i_state & I_FREEING))
+ BUG();
+ if (inode->i_state & I_CLEAR)
+ BUG();
+ wait_on_inode(inode);
+ DQUOT_DROP(inode);
+ if (inode->i_sb && inode->i_sb->s_op->clear_inode)
+ inode->i_sb->s_op->clear_inode(inode);
+ if (inode->i_bdev)
+ bd_forget(inode);
+ inode->i_state = I_CLEAR;
+}
+
+/*
+ * Dispose-list gets a local list with local inodes in it, so it doesn't
+ * need to worry about list corruption and SMP locks.
+ */
+static void dispose_list(struct list_head *head)
+{
+ int nr_disposed = 0;
+
+ while (!list_empty(head)) {
+ struct inode *inode;
+
+ inode = list_entry(head->next, struct inode, i_list);
+ list_del(&inode->i_list);
+
+ if (inode->i_data.nrpages)
+ truncate_inode_pages(&inode->i_data, 0);
+ clear_inode(inode);
+ destroy_inode(inode);
+ nr_disposed++;
+ }
+ spin_lock(&inode_lock);
+ inodes_stat.nr_inodes -= nr_disposed;
+ spin_unlock(&inode_lock);
+}
+
+/*
+ * Invalidate all inodes for a device.
+ */
+static int invalidate_list(struct list_head *head, struct super_block * sb, struct list_head * dispose)
+{
+ struct list_head *next;
+ int busy = 0, count = 0;
+
+ next = head->next;
+ for (;;) {
+ struct list_head * tmp = next;
+ struct inode * inode;
+
+ next = next->next;
+ if (tmp == head)
+ break;
+ inode = list_entry(tmp, struct inode, i_list);
+ if (inode->i_sb != sb)
+ continue;
+ invalidate_inode_buffers(inode);
+ if (!atomic_read(&inode->i_count)) {
+ hlist_del_init(&inode->i_hash);
+ list_del(&inode->i_list);
+ list_add(&inode->i_list, dispose);
+ inode->i_state |= I_FREEING;
+ count++;
+ continue;
+ }
+ busy = 1;
+ }
+ /* only unused inodes may be cached with i_count zero */
+ inodes_stat.nr_unused -= count;
+ return busy;
+}
+
+/*
+ * This is a two-stage process. First we collect all
+ * offending inodes onto the throw-away list, and in
+ * the second stage we actually dispose of them. This
+ * is because we don't want to sleep while messing
+ * with the global lists..
+ */
+
+/**
+ * invalidate_inodes - discard the inodes on a device
+ * @sb: superblock
+ *
+ * Discard all of the inodes for a given superblock. If the discard
+ * fails because there are busy inodes then a non zero value is returned.
+ * If the discard is successful all the inodes have been discarded.
+ */
+
+int invalidate_inodes(struct super_block * sb)
+{
+ int busy;
+ LIST_HEAD(throw_away);
+
+ down(&iprune_sem);
+ spin_lock(&inode_lock);
+ busy = invalidate_list(&inode_in_use, sb, &throw_away);
+ busy |= invalidate_list(&inode_unused, sb, &throw_away);
+ busy |= invalidate_list(&sb->s_dirty, sb, &throw_away);
+ busy |= invalidate_list(&sb->s_io, sb, &throw_away);
+ spin_unlock(&inode_lock);
+
+ dispose_list(&throw_away);
+ up(&iprune_sem);
+
+ return busy;
+}
+
+int invalidate_device(kdev_t dev, int do_sync)
+{
+ struct super_block *sb;
+ struct block_device *bdev = bdget(kdev_t_to_nr(dev));
+ int res;
+
+ if (!bdev)
+ return 0;
+
+ if (do_sync)
+ fsync_bdev(bdev);
+
+ res = 0;
+ sb = get_super(bdev);
+ if (sb) {
+ /*
+ * no need to lock the super, get_super holds the
+ * read semaphore so the filesystem cannot go away
+ * under us (->put_super runs with the write lock
+ * hold).
+ */
+ shrink_dcache_sb(sb);
+ res = invalidate_inodes(sb);
+ drop_super(sb);
+ }
+ invalidate_bdev(bdev, 0);
+ bdput(bdev);
+ return res;
+}
+
+static int can_unuse(struct inode *inode)
+{
+ if (inode->i_state)
+ return 0;
+ if (inode_has_buffers(inode))
+ return 0;
+ if (atomic_read(&inode->i_count))
+ return 0;
+ if (inode->i_data.nrpages)
+ return 0;
+ return 1;
+}
+
+/*
+ * Scan `goal' inodes on the unused list for freeable ones. They are moved to
+ * a temporary list and then are freed outside inode_lock by dispose_list().
+ *
+ * Any inodes which are pinned purely because of attached pagecache have their
+ * pagecache removed. We expect the final iput() on that inode to add it to
+ * the front of the inode_unused list. So look for it there and if the
+ * inode is still freeable, proceed. The right inode is found 99.9% of the
+ * time in testing on a 4-way.
+ *
+ * If the inode has metadata buffers attached to mapping->private_list then
+ * try to remove them.
+ */
+static void prune_icache(int nr_to_scan)
+{
+ LIST_HEAD(freeable);
+ int nr_pruned = 0;
+ int nr_scanned;
+ unsigned long reap = 0;
+
+ down(&iprune_sem);
+ spin_lock(&inode_lock);
+ for (nr_scanned = 0; nr_scanned < nr_to_scan; nr_scanned++) {
+ struct inode *inode;
+
+ if (list_empty(&inode_unused))
+ break;
+
+ inode = list_entry(inode_unused.prev, struct inode, i_list);
+
+ if (inode->i_state || atomic_read(&inode->i_count)) {
+ list_move(&inode->i_list, &inode_unused);
+ continue;
+ }
+ if (inode_has_buffers(inode) || inode->i_data.nrpages) {
+ __iget(inode);
+ spin_unlock(&inode_lock);
+ if (remove_inode_buffers(inode))
+ reap += invalidate_inode_pages(&inode->i_data);
+ iput(inode);
+ spin_lock(&inode_lock);
+
+ if (inode != list_entry(inode_unused.next,
+ struct inode, i_list))
+ continue; /* wrong inode or list_empty */
+ if (!can_unuse(inode))
+ continue;
+ }
+ hlist_del_init(&inode->i_hash);
+ list_move(&inode->i_list, &freeable);
+ inode->i_state |= I_FREEING;
+ nr_pruned++;
+ }
+ inodes_stat.nr_unused -= nr_pruned;
+ spin_unlock(&inode_lock);
+
+ dispose_list(&freeable);
+ up(&iprune_sem);
+
+ if (current_is_kswapd)
+ mod_page_state(kswapd_inodesteal, reap);
+ else
+ mod_page_state(pginodesteal, reap);
+}
+
+/*
+ * shrink_icache_memory() will attempt to reclaim some unused inodes. Here,
+ * "unused" means that no dentries are referring to the inodes: the files are
+ * not open and the dcache references to those inodes have already been
+ * reclaimed.
+ *
+ * This function is passed the number of inodes to scan, and it returns the
+ * total number of remaining possibly-reclaimable inodes.
+ */
+static int shrink_icache_memory(int nr, unsigned int gfp_mask)
+{
+ if (nr) {
+ /*
+ * Nasty deadlock avoidance. We may hold various FS locks,
+ * and we don't want to recurse into the FS that called us
+ * in clear_inode() and friends..
+ */
+ if (gfp_mask & __GFP_FS)
+ prune_icache(nr);
+ }
+ return inodes_stat.nr_unused;
+}
+
+void __wait_on_freeing_inode(struct inode *inode);
+/*
+ * Called with the inode lock held.
+ * NOTE: we are not increasing the inode-refcount, you must call __iget()
+ * by hand after calling find_inode now! This simplifies iunique and won't
+ * add any additional branch in the common code.
+ */
+static struct inode * find_inode(struct super_block * sb, struct hlist_head *head, int (*test)(struct inode *, void *), void *data)
+{
+ struct hlist_node *node;
+ struct inode * inode = NULL;
+
+ hlist_for_each (node, head) {
+ prefetch(node->next);
+ inode = hlist_entry(node, struct inode, i_hash);
+ if (inode->i_sb != sb)
+ continue;
+ if (!test(inode, data))
+ continue;
+ if (inode->i_state & (I_FREEING|I_CLEAR)) {
+ __wait_on_freeing_inode(inode);
+ tmp = head;
+ continue;
+ }
+ break;
+ }
+ return node ? inode : NULL;
+}
+
+/*
+ * find_inode_fast is the fast path version of find_inode, see the comment at
+ * iget_locked for details.
+ */
+static struct inode * find_inode_fast(struct super_block * sb, struct hlist_head *head, unsigned long ino)
+{
+ struct hlist_node *node;
+ struct inode * inode = NULL;
+
+ hlist_for_each (node, head) {
+ prefetch(node->next);
+ inode = list_entry(node, struct inode, i_hash);
+ if (inode->i_ino != ino)
+ continue;
+ if (inode->i_sb != sb)
+ continue;
+ if (inode->i_state & (I_FREEING|I_CLEAR)) {
+ __wait_on_freeing_inode(inode);
+ tmp = head;
+ continue;
+ }
+ break;
+ }
+ return node ? inode : NULL;
+}
+
+/**
+ * new_inode - obtain an inode
+ * @sb: superblock
+ *
+ * Allocates a new inode for given superblock.
+ */
+
+struct inode *new_inode(struct super_block *sb)
+{
+ static unsigned long last_ino;
+ struct inode * inode;
+
+ spin_lock_prefetch(&inode_lock);
+
+ inode = alloc_inode(sb);
+ if (inode) {
+ spin_lock(&inode_lock);
+ inodes_stat.nr_inodes++;
+ list_add(&inode->i_list, &inode_in_use);
+ inode->i_ino = ++last_ino;
+ inode->i_state = 0;
+ spin_unlock(&inode_lock);
+ }
+ return inode;
+}
+
+void unlock_new_inode(struct inode *inode)
+{
+ /*
+ * This is special! We do not need the spinlock
+ * when clearing I_LOCK, because we're guaranteed
+ * that nobody else tries to do anything about the
+ * state of the inode when it is locked, as we
+ * just created it (so there can be no old holders
+ * that haven't tested I_LOCK).
+ */
+ inode->i_state &= ~(I_LOCK|I_NEW);
+ wake_up_inode(inode);
+}
+EXPORT_SYMBOL(unlock_new_inode);
+
+/*
+ * This is called without the inode lock held.. Be careful.
+ *
+ * We no longer cache the sb_flags in i_flags - see fs.h
+ * -- rmk@arm.uk.linux.org
+ */
+static struct inode * get_new_inode(struct super_block *sb, struct hlist_head *head, int (*test)(struct inode *, void *), int (*set)(struct inode *, void *), void *data)
+{
+ struct inode * inode;
+
+ inode = alloc_inode(sb);
+ if (inode) {
+ struct inode * old;
+
+ spin_lock(&inode_lock);
+ /* We released the lock, so.. */
+ old = find_inode(sb, head, test, data);
+ if (!old) {
+ if (set(inode, data))
+ goto set_failed;
+
+ inodes_stat.nr_inodes++;
+ list_add(&inode->i_list, &inode_in_use);
+ hlist_add_head(&inode->i_hash, head);
+ inode->i_state = I_LOCK|I_NEW;
+ spin_unlock(&inode_lock);
+
+ /* Return the locked inode with I_NEW set, the
+ * caller is responsible for filling in the contents
+ */
+ return inode;
+ }
+
+ /*
+ * Uhhuh, somebody else created the same inode under
+ * us. Use the old inode instead of the one we just
+ * allocated.
+ */
+ __iget(old);
+ spin_unlock(&inode_lock);
+ destroy_inode(inode);
+ inode = old;
+ wait_on_inode(inode);
+ }
+ return inode;
+
+set_failed:
+ spin_unlock(&inode_lock);
+ destroy_inode(inode);
+ return NULL;
+}
+
+/*
+ * get_new_inode_fast is the fast path version of get_new_inode, see the
+ * comment at iget_locked for details.
+ */
+static struct inode * get_new_inode_fast(struct super_block *sb, struct hlist_head *head, unsigned long ino)
+{
+ struct inode * inode;
+
+ inode = alloc_inode(sb);
+ if (inode) {
+ struct inode * old;
+
+ spin_lock(&inode_lock);
+ /* We released the lock, so.. */
+ old = find_inode_fast(sb, head, ino);
+ if (!old) {
+ inode->i_ino = ino;
+ inodes_stat.nr_inodes++;
+ list_add(&inode->i_list, &inode_in_use);
+ hlist_add_head(&inode->i_hash, head);
+ inode->i_state = I_LOCK|I_NEW;
+ spin_unlock(&inode_lock);
+
+ /* Return the locked inode with I_NEW set, the
+ * caller is responsible for filling in the contents
+ */
+ return inode;
+ }
+
+ /*
+ * Uhhuh, somebody else created the same inode under
+ * us. Use the old inode instead of the one we just
+ * allocated.
+ */
+ __iget(old);
+ spin_unlock(&inode_lock);
+ destroy_inode(inode);
+ inode = old;
+ wait_on_inode(inode);
+ }
+ return inode;
+}
+
+static inline unsigned long hash(struct super_block *sb, unsigned long hashval)
+{
+ unsigned long tmp = hashval + ((unsigned long) sb / L1_CACHE_BYTES);
+ tmp = tmp + (tmp >> I_HASHBITS);
+ return tmp & I_HASHMASK;
+}
+
+/* Yeah, I know about quadratic hash. Maybe, later. */
+
+/**
+ * iunique - get a unique inode number
+ * @sb: superblock
+ * @max_reserved: highest reserved inode number
+ *
+ * Obtain an inode number that is unique on the system for a given
+ * superblock. This is used by file systems that have no natural
+ * permanent inode numbering system. An inode number is returned that
+ * is higher than the reserved limit but unique.
+ *
+ * BUGS:
+ * With a large number of inodes live on the file system this function
+ * currently becomes quite slow.
+ */
+
+ino_t iunique(struct super_block *sb, ino_t max_reserved)
+{
+ static ino_t counter = 0;
+ struct inode *inode;
+ struct hlist_head * head;
+ ino_t res;
+ spin_lock(&inode_lock);
+retry:
+ if (counter > max_reserved) {
+ head = inode_hashtable + hash(sb,counter);
+ res = counter++;
+ inode = find_inode_fast(sb, head, res);
+ if (!inode) {
+ spin_unlock(&inode_lock);
+ return res;
+ }
+ } else {
+ counter = max_reserved + 1;
+ }
+ goto retry;
+
+}
+
+struct inode *igrab(struct inode *inode)
+{
+ spin_lock(&inode_lock);
+ if (!(inode->i_state & I_FREEING))
+ __iget(inode);
+ else
+ /*
+ * Handle the case where s_op->clear_inode is not been
+ * called yet, and somebody is calling igrab
+ * while the inode is getting freed.
+ */
+ inode = NULL;
+ spin_unlock(&inode_lock);
+ return inode;
+}
+
+/**
+ * ifind - internal function, you want ilookup5() or iget5().
+ * @sb: super block of file system to search
+ * @hashval: hash value (usually inode number) to search for
+ * @test: callback used for comparisons between inodes
+ * @data: opaque data pointer to pass to @test
+ *
+ * ifind() searches for the inode specified by @hashval and @data in the inode
+ * cache. This is a generalized version of ifind_fast() for file systems where
+ * the inode number is not sufficient for unique identification of an inode.
+ *
+ * If the inode is in the cache, the inode is returned with an incremented
+ * reference count.
+ *
+ * Otherwise NULL is returned.
+ *
+ * Note, @test is called with the inode_lock held, so can't sleep.
+ */
+static inline struct inode *ifind(struct super_block *sb,
+ struct hlist_head *head, int (*test)(struct inode *, void *),
+ void *data)
+{
+ struct inode *inode;
+
+ spin_lock(&inode_lock);
+ inode = find_inode(sb, head, test, data);
+ if (inode) {
+ __iget(inode);
+ spin_unlock(&inode_lock);
+ wait_on_inode(inode);
+ return inode;
+ }
+ spin_unlock(&inode_lock);
+ return NULL;
+}
+
+/**
+ * ifind_fast - internal function, you want ilookup() or iget().
+ * @sb: super block of file system to search
+ * @ino: inode number to search for
+ *
+ * ifind_fast() searches for the inode @ino in the inode cache. This is for
+ * file systems where the inode number is sufficient for unique identification
+ * of an inode.
+ *
+ * If the inode is in the cache, the inode is returned with an incremented
+ * reference count.
+ *
+ * Otherwise NULL is returned.
+ */
+static inline struct inode *ifind_fast(struct super_block *sb,
+ struct hlist_head *head, unsigned long ino)
+{
+ struct inode *inode;
+
+ spin_lock(&inode_lock);
+ inode = find_inode_fast(sb, head, ino);
+ if (inode) {
+ __iget(inode);
+ spin_unlock(&inode_lock);
+ wait_on_inode(inode);
+ return inode;
+ }
+ spin_unlock(&inode_lock);
+ return NULL;
+}
+
+/**
+ * ilookup5 - search for an inode in the inode cache
+ * @sb: super block of file system to search
+ * @hashval: hash value (usually inode number) to search for
+ * @test: callback used for comparisons between inodes
+ * @data: opaque data pointer to pass to @test
+ *
+ * ilookup5() uses ifind() to search for the inode specified by @hashval and
+ * @data in the inode cache. This is a generalized version of ilookup() for
+ * file systems where the inode number is not sufficient for unique
+ * identification of an inode.
+ *
+ * If the inode is in the cache, the inode is returned with an incremented
+ * reference count.
+ *
+ * Otherwise NULL is returned.
+ *
+ * Note, @test is called with the inode_lock held, so can't sleep.
+ */
+struct inode *ilookup5(struct super_block *sb, unsigned long hashval,
+ int (*test)(struct inode *, void *), void *data)
+{
+ struct hlist_head *head = inode_hashtable + hash(sb, hashval);
+
+ return ifind(sb, head, test, data);
+}
+EXPORT_SYMBOL(ilookup5);
+
+/**
+ * ilookup - search for an inode in the inode cache
+ * @sb: super block of file system to search
+ * @ino: inode number to search for
+ *
+ * ilookup() uses ifind_fast() to search for the inode @ino in the inode cache.
+ * This is for file systems where the inode number is sufficient for unique
+ * identification of an inode.
+ *
+ * If the inode is in the cache, the inode is returned with an incremented
+ * reference count.
+ *
+ * Otherwise NULL is returned.
+ */
+struct inode *ilookup(struct super_block *sb, unsigned long ino)
+{
+ struct hlist_head *head = inode_hashtable + hash(sb, ino);
+
+ return ifind_fast(sb, head, ino);
+}
+EXPORT_SYMBOL(ilookup);
+
+/**
+ * iget5_locked - obtain an inode from a mounted file system
+ * @sb: super block of file system
+ * @hashval: hash value (usually inode number) to get
+ * @test: callback used for comparisons between inodes
+ * @set: callback used to initialize a new struct inode
+ * @data: opaque data pointer to pass to @test and @set
+ *
+ * This is iget() without the read_inode() portion of get_new_inode().
+ *
+ * iget5_locked() uses ifind() to search for the inode specified by @hashval
+ * and @data in the inode cache and if present it is returned with an increased
+ * reference count. This is a generalized version of iget_locked() for file
+ * systems where the inode number is not sufficient for unique identification
+ * of an inode.
+ *
+ * If the inode is not in cache, get_new_inode() is called to allocate a new
+ * inode and this is returned locked, hashed, and with the I_NEW flag set. The
+ * file system gets to fill it in before unlocking it via unlock_new_inode().
+ *
+ * Note both @test and @set are called with the inode_lock held, so can't sleep.
+ */
+struct inode *iget5_locked(struct super_block *sb, unsigned long hashval,
+ int (*test)(struct inode *, void *),
+ int (*set)(struct inode *, void *), void *data)
+{
+ struct hlist_head *head = inode_hashtable + hash(sb, hashval);
+ struct inode *inode;
+
+ inode = ifind(sb, head, test, data);
+ if (inode)
+ return inode;
+ /*
+ * get_new_inode() will do the right thing, re-trying the search
+ * in case it had to block at any point.
+ */
+ return get_new_inode(sb, head, test, set, data);
+}
+EXPORT_SYMBOL(iget5_locked);
+
+/**
+ * iget_locked - obtain an inode from a mounted file system
+ * @sb: super block of file system
+ * @ino: inode number to get
+ *
+ * This is iget() without the read_inode() portion of get_new_inode_fast().
+ *
+ * iget_locked() uses ifind_fast() to search for the inode specified by @ino in
+ * the inode cache and if present it is returned with an increased reference
+ * count. This is for file systems where the inode number is sufficient for
+ * unique identification of an inode.
+ *
+ * If the inode is not in cache, get_new_inode_fast() is called to allocate a
+ * new inode and this is returned locked, hashed, and with the I_NEW flag set.
+ * The file system gets to fill it in before unlocking it via
+ * unlock_new_inode().
+ */
+struct inode *iget_locked(struct super_block *sb, unsigned long ino)
+{
+ struct hlist_head *head = inode_hashtable + hash(sb, ino);
+ struct inode *inode;
+
+ inode = ifind_fast(sb, head, ino);
+ if (inode)
+ return inode;
+ /*
+ * get_new_inode_fast() will do the right thing, re-trying the search
+ * in case it had to block at any point.
+ */
+ return get_new_inode_fast(sb, head, ino);
+}
+EXPORT_SYMBOL(iget_locked);
+
+/**
+ * __insert_inode_hash - hash an inode
+ * @inode: unhashed inode
+ * @hashval: unsigned long value used to locate this object in the
+ * inode_hashtable.
+ *
+ * Add an inode to the inode hash for this superblock. If the inode
+ * has no superblock it is added to a separate anonymous chain.
+ */
+
+void __insert_inode_hash(struct inode *inode, unsigned long hashval)
+{
+ struct hlist_head *head = &anon_hash_chain;
+ if (inode->i_sb)
+ head = inode_hashtable + hash(inode->i_sb, hashval);
+ spin_lock(&inode_lock);
+ hlist_add_head(&inode->i_hash, head);
+ spin_unlock(&inode_lock);
+}
+
+/**
+ * remove_inode_hash - remove an inode from the hash
+ * @inode: inode to unhash
+ *
+ * Remove an inode from the superblock or anonymous hash.
+ */
+
+void remove_inode_hash(struct inode *inode)
+{
+ spin_lock(&inode_lock);
+ hlist_del_init(&inode->i_hash);
+ spin_unlock(&inode_lock);
+}
+
+void generic_delete_inode(struct inode *inode)
+{
+ struct super_operations *op = inode->i_sb->s_op;
+
+<<<<<<<
+ hlist_del_init(&inode->i_hash);
+|||||||
+ list_del_init(&inode->i_hash);
+=======
+>>>>>>>
+ list_del_init(&inode->i_list);
+ inode->i_state|=I_FREEING;
+ inodes_stat.nr_inodes--;
+ spin_unlock(&inode_lock);
+
+ if (inode->i_data.nrpages)
+ truncate_inode_pages(&inode->i_data, 0);
+
+ security_inode_delete(inode);
+
+ if (op->delete_inode) {
+ void (*delete)(struct inode *) = op->delete_inode;
+ if (!is_bad_inode(inode))
+ DQUOT_INIT(inode);
+ /* s_op->delete_inode internally recalls clear_inode() */
+ delete(inode);
+ } else
+ clear_inode(inode);
+ spin_lock(&inode_lock);
+ list_del_init(&inode->i_hash);
+ spin_unlock(&inode_lock);
+ wake_up_inode(inode);
+ if (inode->i_state != I_CLEAR)
+ BUG();
+ destroy_inode(inode);
+}
+EXPORT_SYMBOL(generic_delete_inode);
+
+static void generic_forget_inode(struct inode *inode)
+{
+ struct super_block *sb = inode->i_sb;
+
+ if (!hlist_unhashed(&inode->i_hash)) {
+ if (!(inode->i_state & (I_DIRTY|I_LOCK))) {
+ list_del(&inode->i_list);
+ list_add(&inode->i_list, &inode_unused);
+ }
+ inodes_stat.nr_unused++;
+ spin_unlock(&inode_lock);
+ if (!sb || (sb->s_flags & MS_ACTIVE))
+ return;
+ write_inode_now(inode, 1);
+ spin_lock(&inode_lock);
+ inodes_stat.nr_unused--;
+ hlist_del_init(&inode->i_hash);
+ }
+ list_del_init(&inode->i_list);
+ inode->i_state|=I_FREEING;
+ inodes_stat.nr_inodes--;
+ spin_unlock(&inode_lock);
+ if (inode->i_data.nrpages)
+ truncate_inode_pages(&inode->i_data, 0);
+ clear_inode(inode);
+ destroy_inode(inode);
+}
+
+/*
+ * Normal UNIX filesystem behaviour: delete the
+ * inode when the usage count drops to zero, and
+ * i_nlink is zero.
+ */
+static void generic_drop_inode(struct inode *inode)
+{
+ if (!inode->i_nlink)
+ generic_delete_inode(inode);
+ else
+ generic_forget_inode(inode);
+}
+
+/*
+ * Called when we're dropping the last reference
+ * to an inode.
+ *
+ * Call the FS "drop()" function, defaulting to
+ * the legacy UNIX filesystem behaviour..
+ *
+ * NOTE! NOTE! NOTE! We're called with the inode lock
+ * held, and the drop function is supposed to release
+ * the lock!
+ */
+static inline void iput_final(struct inode *inode)
+{
+ struct super_operations *op = inode->i_sb->s_op;
+ void (*drop)(struct inode *) = generic_drop_inode;
+
+ if (op && op->drop_inode)
+ drop = op->drop_inode;
+ drop(inode);
+}
+
+/**
+ * iput - put an inode
+ * @inode: inode to put
+ *
+ * Puts an inode, dropping its usage count. If the inode use count hits
+ * zero the inode is also then freed and may be destroyed.
+ */
+
+void iput(struct inode *inode)
+{
+ if (inode) {
+ struct super_operations *op = inode->i_sb->s_op;
+
+ if (inode->i_state == I_CLEAR)
+ BUG();
+
+ if (op && op->put_inode)
+ op->put_inode(inode);
+
+ if (atomic_dec_and_lock(&inode->i_count, &inode_lock))
+ iput_final(inode);
+ }
+}
+
+/**
+ * bmap - find a block number in a file
+ * @inode: inode of file
+ * @block: block to find
+ *
+ * Returns the block number on the device holding the inode that
+ * is the disk block number for the block of the file requested.
+ * That is, asked for block 4 of inode 1 the function will return the
+ * disk block relative to the disk start that holds that block of the
+ * file.
+ */
+
+sector_t bmap(struct inode * inode, sector_t block)
+{
+ sector_t res = 0;
+ if (inode->i_mapping->a_ops->bmap)
+ res = inode->i_mapping->a_ops->bmap(inode->i_mapping, block);
+ return res;
+}
+
+/*
+ * Return true if the filesystem which backs this inode considers the two
+ * passed timespecs to be sufficiently different to warrant flushing the
+ * altered time out to disk.
+ */
+static int inode_times_differ(struct inode *inode,
+ struct timespec *old, struct timespec *new)
+{
+ if (IS_ONE_SECOND(inode))
+ return old->tv_sec != new->tv_sec;
+ return !timespec_equal(old, new);
+}
+
+/**
+ * update_atime - update the access time
+ * @inode: inode accessed
+ *
+ * Update the accessed time on an inode and mark it for writeback.
+ * This function automatically handles read only file systems and media,
+ * as well as the "noatime" flag and inode specific "noatime" markers.
+ */
+
+void update_atime(struct inode *inode)
+{
+ struct timespec now;
+
+ if (IS_NOATIME(inode))
+ return;
+ if (IS_NODIRATIME(inode) && S_ISDIR(inode->i_mode))
+ return;
+ if (IS_RDONLY(inode))
+ return;
+
+ now = current_kernel_time();
+ if (inode_times_differ(inode, &inode->i_atime, &now)) {
+ inode->i_atime = now;
+ mark_inode_dirty_sync(inode);
+ } else {
+ if (!timespec_equal(&inode->i_atime, &now))
+ inode->i_atime = now;
+ }
+}
+
+/**
+ * inode_update_time - update mtime and ctime time
+ * @inode: inode accessed
+ * @ctime_too: update ctime too
+ *
+ * Update the mtime time on an inode and mark it for writeback.
+ * When ctime_too is specified update the ctime too.
+ */
+
+void inode_update_time(struct inode *inode, int ctime_too)
+{
+ struct timespec now = current_kernel_time();
+ int sync_it = 0;
+
+ if (inode_times_differ(inode, &inode->i_mtime, &now))
+ sync_it = 1;
+ inode->i_mtime = now;
+
+ if (ctime_too) {
+ if (inode_times_differ(inode, &inode->i_ctime, &now))
+ sync_it = 1;
+ inode->i_ctime = now;
+ }
+ if (sync_it)
+ mark_inode_dirty_sync(inode);
+}
+EXPORT_SYMBOL(inode_update_time);
+
+int inode_needs_sync(struct inode *inode)
+{
+ if (IS_SYNC(inode))
+ return 1;
+ if (S_ISDIR(inode->i_mode) && IS_DIRSYNC(inode))
+ return 1;
+ return 0;
+}
+EXPORT_SYMBOL(inode_needs_sync);
+
+/*
+ * Quota functions that want to walk the inode lists..
+ */
+#ifdef CONFIG_QUOTA
+
+/* Functions back in dquot.c */
+void put_dquot_list(struct list_head *);
+int remove_inode_dquot_ref(struct inode *, int, struct list_head *);
+
+void remove_dquot_ref(struct super_block *sb, int type)
+{
+ struct inode *inode;
+ struct list_head *act_head;
+ LIST_HEAD(tofree_head);
+
+ if (!sb->dq_op)
+ return; /* nothing to do */
+ spin_lock(&inode_lock); /* This lock is for inodes code */
+ /* We don't have to lock against quota code - test IS_QUOTAINIT is just for speedup... */
+
+ list_for_each(act_head, &inode_in_use) {
+ inode = list_entry(act_head, struct inode, i_list);
+ if (inode->i_sb == sb && IS_QUOTAINIT(inode))
+ remove_inode_dquot_ref(inode, type, &tofree_head);
+ }
+ list_for_each(act_head, &inode_unused) {
+ inode = list_entry(act_head, struct inode, i_list);
+ if (inode->i_sb == sb && IS_QUOTAINIT(inode))
+ remove_inode_dquot_ref(inode, type, &tofree_head);
+ }
+ list_for_each(act_head, &sb->s_dirty) {
+ inode = list_entry(act_head, struct inode, i_list);
+ if (IS_QUOTAINIT(inode))
+ remove_inode_dquot_ref(inode, type, &tofree_head);
+ }
+ list_for_each(act_head, &sb->s_io) {
+ inode = list_entry(act_head, struct inode, i_list);
+ if (IS_QUOTAINIT(inode))
+ remove_inode_dquot_ref(inode, type, &tofree_head);
+ }
+ spin_unlock(&inode_lock);
+
+ put_dquot_list(&tofree_head);
+}
+
+#endif
+
+/*
+ * Hashed waitqueues for wait_on_inode(). The table is pretty small - the
+ * kernel doesn't lock many inodes at the same time.
+ */
+#define I_WAIT_TABLE_ORDER 3
+static struct i_wait_queue_head {
+ wait_queue_head_t wqh;
+} ____cacheline_aligned_in_smp i_wait_queue_heads[1<<I_WAIT_TABLE_ORDER];
+
+/*
+ * Return the address of the waitqueue_head to be used for this inode
+ */
+static wait_queue_head_t *i_waitq_head(struct inode *inode)
+{
+ return &i_wait_queue_heads[hash_ptr(inode, I_WAIT_TABLE_ORDER)].wqh;
+}
+
+void __wait_on_inode(struct inode *inode)
+{
+ DECLARE_WAITQUEUE(wait, current);
+ wait_queue_head_t *wq = i_waitq_head(inode);
+
+ add_wait_queue(wq, &wait);
+repeat:
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ if (inode->i_state & I_LOCK) {
+ schedule();
+ goto repeat;
+ }
+ remove_wait_queue(wq, &wait);
+ __set_current_state(TASK_RUNNING);
+}
+
+void __wait_on_freeing_inode(struct inode *inode)
+{
+ DECLARE_WAITQUEUE(wait, current);
+ wait_queue_head_t *wq = i_waitq_head(inode);
+
+ add_wait_queue(wq, &wait);
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ spin_unlock(&inode_lock);
+ schedule();
+ remove_wait_queue(wq, &wait);
+ current->state = TASK_RUNNING;
+ spin_lock(&inode_lock);
+}
+
+
+void wake_up_inode(struct inode *inode)
+{
+ wait_queue_head_t *wq = i_waitq_head(inode);
+
+ /*
+ * Prevent speculative execution through spin_unlock(&inode_lock);
+ */
+ smp_mb();
+ if (waitqueue_active(wq))
+ wake_up_all(wq);
+}
+
+/*
+ * Initialize the waitqueues and inode hash table.
+ */
+void __init inode_init(unsigned long mempages)
+{
+ struct hlist_head *head;
+ unsigned long order;
+ unsigned int nr_hash;
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(i_wait_queue_heads); i++)
+ init_waitqueue_head(&i_wait_queue_heads[i].wqh);
+
+ mempages >>= (14 - PAGE_SHIFT);
+ mempages *= sizeof(struct list_head);
+ for (order = 0; ((1UL << order) << PAGE_SHIFT) < mempages; order++)
+ ;
+
+ do {
+ unsigned long tmp;
+
+ nr_hash = (1UL << order) * PAGE_SIZE /
+ sizeof(struct hlist_head);
+ i_hash_mask = (nr_hash - 1);
+
+ tmp = nr_hash;
+ i_hash_shift = 0;
+ while ((tmp >>= 1UL) != 0UL)
+ i_hash_shift++;
+
+ inode_hashtable = (struct hlist_head *)
+ __get_free_pages(GFP_ATOMIC, order);
+ } while (inode_hashtable == NULL && --order >= 0);
+
+ printk("Inode-cache hash table entries: %d (order: %ld, %ld bytes)\n",
+ nr_hash, order, (PAGE_SIZE << order));
+
+ if (!inode_hashtable)
+ panic("Failed to allocate inode hash table\n");
+
+ head = inode_hashtable;
+ i = nr_hash;
+ do {
+ INIT_HLIST_HEAD(head);
+ head++;
+ i--;
+ } while (i);
+
+ /* inode slab cache */
+ inode_cachep = kmem_cache_create("inode_cache", sizeof(struct inode),
+ 0, SLAB_HWCACHE_ALIGN, init_once,
+ NULL);
+ if (!inode_cachep)
+ panic("cannot create inode slab cache");
+
+ set_shrinker(DEFAULT_SEEKS, shrink_icache_memory);
+}
+
+void init_special_inode(struct inode *inode, umode_t mode, dev_t rdev)
+{
+ inode->i_mode = mode;
+ if (S_ISCHR(mode)) {
+ inode->i_fop = &def_chr_fops;
+ inode->i_rdev = to_kdev_t(rdev);
+ } else if (S_ISBLK(mode)) {
+ inode->i_fop = &def_blk_fops;
+ inode->i_rdev = to_kdev_t(rdev);
+ } else if (S_ISFIFO(mode))
+ inode->i_fop = &def_fifo_fops;
+ else if (S_ISSOCK(mode))
+ inode->i_fop = &bad_sock_fops;
+ else
+ printk(KERN_DEBUG "init_special_inode: bogus i_mode (%o)\n",
+ mode);
+}
diff --git a/tests/linux/inode-fullpatch/orig b/tests/linux/inode-fullpatch/orig
new file mode 100644
index 0000000..47e7429
--- /dev/null
+++ b/tests/linux/inode-fullpatch/orig
@@ -0,0 +1,1323 @@
+/*
+ * linux/fs/inode.c
+ *
+ * (C) 1997 Linus Torvalds
+ */
+
+#include <linux/config.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/dcache.h>
+#include <linux/init.h>
+#include <linux/quotaops.h>
+#include <linux/slab.h>
+#include <linux/writeback.h>
+#include <linux/module.h>
+#include <linux/backing-dev.h>
+#include <linux/wait.h>
+#include <linux/hash.h>
+#include <linux/swap.h>
+#include <linux/security.h>
+
+/*
+ * This is needed for the following functions:
+ * - inode_has_buffers
+ * - invalidate_inode_buffers
+ * - fsync_bdev
+ * - invalidate_bdev
+ *
+ * FIXME: remove all knowledge of the buffer layer from this file
+ */
+#include <linux/buffer_head.h>
+
+/*
+ * New inode.c implementation.
+ *
+ * This implementation has the basic premise of trying
+ * to be extremely low-overhead and SMP-safe, yet be
+ * simple enough to be "obviously correct".
+ *
+ * Famous last words.
+ */
+
+/* inode dynamic allocation 1999, Andrea Arcangeli <andrea@suse.de> */
+
+/* #define INODE_PARANOIA 1 */
+/* #define INODE_DEBUG 1 */
+
+/*
+ * Inode lookup is no longer as critical as it used to be:
+ * most of the lookups are going to be through the dcache.
+ */
+#define I_HASHBITS i_hash_shift
+#define I_HASHMASK i_hash_mask
+
+static unsigned int i_hash_mask;
+static unsigned int i_hash_shift;
+
+/*
+ * Each inode can be on two separate lists. One is
+ * the hash list of the inode, used for lookups. The
+ * other linked list is the "type" list:
+ * "in_use" - valid inode, i_count > 0, i_nlink > 0
+ * "dirty" - as "in_use" but also dirty
+ * "unused" - valid inode, i_count = 0
+ *
+ * A "dirty" list is maintained for each super block,
+ * allowing for low-overhead inode sync() operations.
+ */
+
+LIST_HEAD(inode_in_use);
+LIST_HEAD(inode_unused);
+static struct hlist_head *inode_hashtable;
+static HLIST_HEAD(anon_hash_chain); /* for inodes with NULL i_sb */
+
+/*
+ * A simple spinlock to protect the list manipulations.
+ *
+ * NOTE! You also have to own the lock if you change
+ * the i_state of an inode while it is in use..
+ */
+spinlock_t inode_lock = SPIN_LOCK_UNLOCKED;
+
+/*
+ * iprune_sem provides exclusion between the kswapd or try_to_free_pages
+ * icache shrinking path, and the umount path. Without this exclusion,
+ * by the time prune_icache calls iput for the inode whose pages it has
+ * been invalidating, or by the time it calls clear_inode & destroy_inode
+ * from its final dispose_list, the struct super_block they refer to
+ * (for inode->i_sb->s_op) may already have been freed and reused.
+ */
+static DECLARE_MUTEX(iprune_sem);
+
+/*
+ * Statistics gathering..
+ */
+struct inodes_stat_t inodes_stat;
+
+static kmem_cache_t * inode_cachep;
+
+static struct inode *alloc_inode(struct super_block *sb)
+{
+ static struct address_space_operations empty_aops;
+ static struct inode_operations empty_iops;
+ static struct file_operations empty_fops;
+ struct inode *inode;
+
+ if (sb->s_op->alloc_inode)
+ inode = sb->s_op->alloc_inode(sb);
+ else
+ inode = (struct inode *) kmem_cache_alloc(inode_cachep, SLAB_KERNEL);
+
+ if (inode) {
+ struct address_space * const mapping = &inode->i_data;
+
+ inode->i_sb = sb;
+ inode->i_blkbits = sb->s_blocksize_bits;
+ inode->i_flags = 0;
+ atomic_set(&inode->i_count, 1);
+ inode->i_sock = 0;
+ inode->i_op = &empty_iops;
+ inode->i_fop = &empty_fops;
+ inode->i_nlink = 1;
+ atomic_set(&inode->i_writecount, 0);
+ inode->i_size = 0;
+ inode->i_blocks = 0;
+ inode->i_bytes = 0;
+ inode->i_generation = 0;
+ memset(&inode->i_dquot, 0, sizeof(inode->i_dquot));
+ inode->i_pipe = NULL;
+ inode->i_bdev = NULL;
+ inode->i_rdev = to_kdev_t(0);
+ inode->i_security = NULL;
+ if (security_inode_alloc(inode)) {
+ if (inode->i_sb->s_op->destroy_inode)
+ inode->i_sb->s_op->destroy_inode(inode);
+ else
+ kmem_cache_free(inode_cachep, (inode));
+ return NULL;
+ }
+
+ mapping->a_ops = &empty_aops;
+ mapping->host = inode;
+ mapping->gfp_mask = GFP_HIGHUSER;
+ mapping->dirtied_when = 0;
+ mapping->assoc_mapping = NULL;
+ mapping->backing_dev_info = &default_backing_dev_info;
+ if (sb->s_bdev)
+ mapping->backing_dev_info = sb->s_bdev->bd_inode->i_mapping->backing_dev_info;
+ memset(&inode->u, 0, sizeof(inode->u));
+ inode->i_mapping = mapping;
+ }
+ return inode;
+}
+
+void destroy_inode(struct inode *inode)
+{
+ if (inode_has_buffers(inode))
+ BUG();
+ security_inode_free(inode);
+ if (inode->i_sb->s_op->destroy_inode)
+ inode->i_sb->s_op->destroy_inode(inode);
+ else
+ kmem_cache_free(inode_cachep, (inode));
+}
+
+
+/*
+ * These are initializations that only need to be done
+ * once, because the fields are idempotent across use
+ * of the inode, so let the slab aware of that.
+ */
+void inode_init_once(struct inode *inode)
+{
+ memset(inode, 0, sizeof(*inode));
+ INIT_HLIST_NODE(&inode->i_hash);
+ INIT_LIST_HEAD(&inode->i_data.clean_pages);
+ INIT_LIST_HEAD(&inode->i_data.dirty_pages);
+ INIT_LIST_HEAD(&inode->i_data.locked_pages);
+ INIT_LIST_HEAD(&inode->i_data.io_pages);
+ INIT_LIST_HEAD(&inode->i_dentry);
+ INIT_LIST_HEAD(&inode->i_devices);
+ sema_init(&inode->i_sem, 1);
+ INIT_RADIX_TREE(&inode->i_data.page_tree, GFP_ATOMIC);
+ rwlock_init(&inode->i_data.page_lock);
+ init_MUTEX(&inode->i_data.i_shared_sem);
+ INIT_LIST_HEAD(&inode->i_data.private_list);
+ spin_lock_init(&inode->i_data.private_lock);
+ INIT_LIST_HEAD(&inode->i_data.i_mmap);
+ INIT_LIST_HEAD(&inode->i_data.i_mmap_shared);
+ spin_lock_init(&inode->i_lock);
+}
+
+static void init_once(void * foo, kmem_cache_t * cachep, unsigned long flags)
+{
+ struct inode * inode = (struct inode *) foo;
+
+ if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
+ SLAB_CTOR_CONSTRUCTOR)
+ inode_init_once(inode);
+}
+
+/*
+ * inode_lock must be held
+ */
+void __iget(struct inode * inode)
+{
+ if (atomic_read(&inode->i_count)) {
+ atomic_inc(&inode->i_count);
+ return;
+ }
+ atomic_inc(&inode->i_count);
+ if (!(inode->i_state & (I_DIRTY|I_LOCK))) {
+ list_del(&inode->i_list);
+ list_add(&inode->i_list, &inode_in_use);
+ }
+ inodes_stat.nr_unused--;
+}
+
+/**
+ * clear_inode - clear an inode
+ * @inode: inode to clear
+ *
+ * This is called by the filesystem to tell us
+ * that the inode is no longer useful. We just
+ * terminate it with extreme prejudice.
+ */
+
+void clear_inode(struct inode *inode)
+{
+ invalidate_inode_buffers(inode);
+
+ if (inode->i_data.nrpages)
+ BUG();
+ if (!(inode->i_state & I_FREEING))
+ BUG();
+ if (inode->i_state & I_CLEAR)
+ BUG();
+ wait_on_inode(inode);
+ DQUOT_DROP(inode);
+ if (inode->i_sb && inode->i_sb->s_op->clear_inode)
+ inode->i_sb->s_op->clear_inode(inode);
+ if (inode->i_bdev)
+ bd_forget(inode);
+ inode->i_state = I_CLEAR;
+}
+
+/*
+ * Dispose-list gets a local list with local inodes in it, so it doesn't
+ * need to worry about list corruption and SMP locks.
+ */
+static void dispose_list(struct list_head *head)
+{
+ int nr_disposed = 0;
+
+ while (!list_empty(head)) {
+ struct inode *inode;
+
+ inode = list_entry(head->next, struct inode, i_list);
+ list_del(&inode->i_list);
+
+ if (inode->i_data.nrpages)
+ truncate_inode_pages(&inode->i_data, 0);
+ clear_inode(inode);
+ destroy_inode(inode);
+ nr_disposed++;
+ }
+ spin_lock(&inode_lock);
+ inodes_stat.nr_inodes -= nr_disposed;
+ spin_unlock(&inode_lock);
+}
+
+/*
+ * Invalidate all inodes for a device.
+ */
+static int invalidate_list(struct list_head *head, struct super_block * sb, struct list_head * dispose)
+{
+ struct list_head *next;
+ int busy = 0, count = 0;
+
+ next = head->next;
+ for (;;) {
+ struct list_head * tmp = next;
+ struct inode * inode;
+
+ next = next->next;
+ if (tmp == head)
+ break;
+ inode = list_entry(tmp, struct inode, i_list);
+ if (inode->i_sb != sb)
+ continue;
+ invalidate_inode_buffers(inode);
+ if (!atomic_read(&inode->i_count)) {
+ hlist_del_init(&inode->i_hash);
+ list_del(&inode->i_list);
+ list_add(&inode->i_list, dispose);
+ inode->i_state |= I_FREEING;
+ count++;
+ continue;
+ }
+ busy = 1;
+ }
+ /* only unused inodes may be cached with i_count zero */
+ inodes_stat.nr_unused -= count;
+ return busy;
+}
+
+/*
+ * This is a two-stage process. First we collect all
+ * offending inodes onto the throw-away list, and in
+ * the second stage we actually dispose of them. This
+ * is because we don't want to sleep while messing
+ * with the global lists..
+ */
+
+/**
+ * invalidate_inodes - discard the inodes on a device
+ * @sb: superblock
+ *
+ * Discard all of the inodes for a given superblock. If the discard
+ * fails because there are busy inodes then a non zero value is returned.
+ * If the discard is successful all the inodes have been discarded.
+ */
+
+int invalidate_inodes(struct super_block * sb)
+{
+ int busy;
+ LIST_HEAD(throw_away);
+
+ down(&iprune_sem);
+ spin_lock(&inode_lock);
+ busy = invalidate_list(&inode_in_use, sb, &throw_away);
+ busy |= invalidate_list(&inode_unused, sb, &throw_away);
+ busy |= invalidate_list(&sb->s_dirty, sb, &throw_away);
+ busy |= invalidate_list(&sb->s_io, sb, &throw_away);
+ spin_unlock(&inode_lock);
+
+ dispose_list(&throw_away);
+ up(&iprune_sem);
+
+ return busy;
+}
+
+int invalidate_device(kdev_t dev, int do_sync)
+{
+ struct super_block *sb;
+ struct block_device *bdev = bdget(kdev_t_to_nr(dev));
+ int res;
+
+ if (!bdev)
+ return 0;
+
+ if (do_sync)
+ fsync_bdev(bdev);
+
+ res = 0;
+ sb = get_super(bdev);
+ if (sb) {
+ /*
+ * no need to lock the super, get_super holds the
+ * read semaphore so the filesystem cannot go away
+ * under us (->put_super runs with the write lock
+ * hold).
+ */
+ shrink_dcache_sb(sb);
+ res = invalidate_inodes(sb);
+ drop_super(sb);
+ }
+ invalidate_bdev(bdev, 0);
+ bdput(bdev);
+ return res;
+}
+
+static int can_unuse(struct inode *inode)
+{
+ if (inode->i_state)
+ return 0;
+ if (inode_has_buffers(inode))
+ return 0;
+ if (atomic_read(&inode->i_count))
+ return 0;
+ if (inode->i_data.nrpages)
+ return 0;
+ return 1;
+}
+
+/*
+ * Scan `goal' inodes on the unused list for freeable ones. They are moved to
+ * a temporary list and then are freed outside inode_lock by dispose_list().
+ *
+ * Any inodes which are pinned purely because of attached pagecache have their
+ * pagecache removed. We expect the final iput() on that inode to add it to
+ * the front of the inode_unused list. So look for it there and if the
+ * inode is still freeable, proceed. The right inode is found 99.9% of the
+ * time in testing on a 4-way.
+ *
+ * If the inode has metadata buffers attached to mapping->private_list then
+ * try to remove them.
+ */
+static void prune_icache(int nr_to_scan)
+{
+ LIST_HEAD(freeable);
+ int nr_pruned = 0;
+ int nr_scanned;
+ unsigned long reap = 0;
+
+ down(&iprune_sem);
+ spin_lock(&inode_lock);
+ for (nr_scanned = 0; nr_scanned < nr_to_scan; nr_scanned++) {
+ struct inode *inode;
+
+ if (list_empty(&inode_unused))
+ break;
+
+ inode = list_entry(inode_unused.prev, struct inode, i_list);
+
+ if (inode->i_state || atomic_read(&inode->i_count)) {
+ list_move(&inode->i_list, &inode_unused);
+ continue;
+ }
+ if (inode_has_buffers(inode) || inode->i_data.nrpages) {
+ __iget(inode);
+ spin_unlock(&inode_lock);
+ if (remove_inode_buffers(inode))
+ reap += invalidate_inode_pages(&inode->i_data);
+ iput(inode);
+ spin_lock(&inode_lock);
+
+ if (inode != list_entry(inode_unused.next,
+ struct inode, i_list))
+ continue; /* wrong inode or list_empty */
+ if (!can_unuse(inode))
+ continue;
+ }
+ hlist_del_init(&inode->i_hash);
+ list_move(&inode->i_list, &freeable);
+ inode->i_state |= I_FREEING;
+ nr_pruned++;
+ }
+ inodes_stat.nr_unused -= nr_pruned;
+ spin_unlock(&inode_lock);
+
+ dispose_list(&freeable);
+ up(&iprune_sem);
+
+ if (current_is_kswapd)
+ mod_page_state(kswapd_inodesteal, reap);
+ else
+ mod_page_state(pginodesteal, reap);
+}
+
+/*
+ * shrink_icache_memory() will attempt to reclaim some unused inodes. Here,
+ * "unused" means that no dentries are referring to the inodes: the files are
+ * not open and the dcache references to those inodes have already been
+ * reclaimed.
+ *
+ * This function is passed the number of inodes to scan, and it returns the
+ * total number of remaining possibly-reclaimable inodes.
+ */
+static int shrink_icache_memory(int nr, unsigned int gfp_mask)
+{
+ if (nr) {
+ /*
+ * Nasty deadlock avoidance. We may hold various FS locks,
+ * and we don't want to recurse into the FS that called us
+ * in clear_inode() and friends..
+ */
+ if (gfp_mask & __GFP_FS)
+ prune_icache(nr);
+ }
+ return inodes_stat.nr_unused;
+}
+
+/*
+ * Called with the inode lock held.
+ * NOTE: we are not increasing the inode-refcount, you must call __iget()
+ * by hand after calling find_inode now! This simplifies iunique and won't
+ * add any additional branch in the common code.
+ */
+static struct inode * find_inode(struct super_block * sb, struct hlist_head *head, int (*test)(struct inode *, void *), void *data)
+{
+ struct hlist_node *node;
+ struct inode * inode = NULL;
+
+ hlist_for_each (node, head) {
+ prefetch(node->next);
+ inode = hlist_entry(node, struct inode, i_hash);
+ if (inode->i_sb != sb)
+ continue;
+ if (!test(inode, data))
+ continue;
+ break;
+ }
+ return node ? inode : NULL;
+}
+
+/*
+ * find_inode_fast is the fast path version of find_inode, see the comment at
+ * iget_locked for details.
+ */
+static struct inode * find_inode_fast(struct super_block * sb, struct hlist_head *head, unsigned long ino)
+{
+ struct hlist_node *node;
+ struct inode * inode = NULL;
+
+ hlist_for_each (node, head) {
+ prefetch(node->next);
+ inode = list_entry(node, struct inode, i_hash);
+ if (inode->i_ino != ino)
+ continue;
+ if (inode->i_sb != sb)
+ continue;
+ break;
+ }
+ return node ? inode : NULL;
+}
+
+/**
+ * new_inode - obtain an inode
+ * @sb: superblock
+ *
+ * Allocates a new inode for given superblock.
+ */
+
+struct inode *new_inode(struct super_block *sb)
+{
+ static unsigned long last_ino;
+ struct inode * inode;
+
+ spin_lock_prefetch(&inode_lock);
+
+ inode = alloc_inode(sb);
+ if (inode) {
+ spin_lock(&inode_lock);
+ inodes_stat.nr_inodes++;
+ list_add(&inode->i_list, &inode_in_use);
+ inode->i_ino = ++last_ino;
+ inode->i_state = 0;
+ spin_unlock(&inode_lock);
+ }
+ return inode;
+}
+
+void unlock_new_inode(struct inode *inode)
+{
+ /*
+ * This is special! We do not need the spinlock
+ * when clearing I_LOCK, because we're guaranteed
+ * that nobody else tries to do anything about the
+ * state of the inode when it is locked, as we
+ * just created it (so there can be no old holders
+ * that haven't tested I_LOCK).
+ */
+ inode->i_state &= ~(I_LOCK|I_NEW);
+ wake_up_inode(inode);
+}
+EXPORT_SYMBOL(unlock_new_inode);
+
+/*
+ * This is called without the inode lock held.. Be careful.
+ *
+ * We no longer cache the sb_flags in i_flags - see fs.h
+ * -- rmk@arm.uk.linux.org
+ */
+static struct inode * get_new_inode(struct super_block *sb, struct hlist_head *head, int (*test)(struct inode *, void *), int (*set)(struct inode *, void *), void *data)
+{
+ struct inode * inode;
+
+ inode = alloc_inode(sb);
+ if (inode) {
+ struct inode * old;
+
+ spin_lock(&inode_lock);
+ /* We released the lock, so.. */
+ old = find_inode(sb, head, test, data);
+ if (!old) {
+ if (set(inode, data))
+ goto set_failed;
+
+ inodes_stat.nr_inodes++;
+ list_add(&inode->i_list, &inode_in_use);
+ hlist_add_head(&inode->i_hash, head);
+ inode->i_state = I_LOCK|I_NEW;
+ spin_unlock(&inode_lock);
+
+ /* Return the locked inode with I_NEW set, the
+ * caller is responsible for filling in the contents
+ */
+ return inode;
+ }
+
+ /*
+ * Uhhuh, somebody else created the same inode under
+ * us. Use the old inode instead of the one we just
+ * allocated.
+ */
+ __iget(old);
+ spin_unlock(&inode_lock);
+ destroy_inode(inode);
+ inode = old;
+ wait_on_inode(inode);
+ }
+ return inode;
+
+set_failed:
+ spin_unlock(&inode_lock);
+ destroy_inode(inode);
+ return NULL;
+}
+
+/*
+ * get_new_inode_fast is the fast path version of get_new_inode, see the
+ * comment at iget_locked for details.
+ */
+static struct inode * get_new_inode_fast(struct super_block *sb, struct hlist_head *head, unsigned long ino)
+{
+ struct inode * inode;
+
+ inode = alloc_inode(sb);
+ if (inode) {
+ struct inode * old;
+
+ spin_lock(&inode_lock);
+ /* We released the lock, so.. */
+ old = find_inode_fast(sb, head, ino);
+ if (!old) {
+ inode->i_ino = ino;
+ inodes_stat.nr_inodes++;
+ list_add(&inode->i_list, &inode_in_use);
+ hlist_add_head(&inode->i_hash, head);
+ inode->i_state = I_LOCK|I_NEW;
+ spin_unlock(&inode_lock);
+
+ /* Return the locked inode with I_NEW set, the
+ * caller is responsible for filling in the contents
+ */
+ return inode;
+ }
+
+ /*
+ * Uhhuh, somebody else created the same inode under
+ * us. Use the old inode instead of the one we just
+ * allocated.
+ */
+ __iget(old);
+ spin_unlock(&inode_lock);
+ destroy_inode(inode);
+ inode = old;
+ wait_on_inode(inode);
+ }
+ return inode;
+}
+
+static inline unsigned long hash(struct super_block *sb, unsigned long hashval)
+{
+ unsigned long tmp = hashval + ((unsigned long) sb / L1_CACHE_BYTES);
+ tmp = tmp + (tmp >> I_HASHBITS);
+ return tmp & I_HASHMASK;
+}
+
+/* Yeah, I know about quadratic hash. Maybe, later. */
+
+/**
+ * iunique - get a unique inode number
+ * @sb: superblock
+ * @max_reserved: highest reserved inode number
+ *
+ * Obtain an inode number that is unique on the system for a given
+ * superblock. This is used by file systems that have no natural
+ * permanent inode numbering system. An inode number is returned that
+ * is higher than the reserved limit but unique.
+ *
+ * BUGS:
+ * With a large number of inodes live on the file system this function
+ * currently becomes quite slow.
+ */
+
+ino_t iunique(struct super_block *sb, ino_t max_reserved)
+{
+ static ino_t counter = 0;
+ struct inode *inode;
+ struct hlist_head * head;
+ ino_t res;
+ spin_lock(&inode_lock);
+retry:
+ if (counter > max_reserved) {
+ head = inode_hashtable + hash(sb,counter);
+ res = counter++;
+ inode = find_inode_fast(sb, head, res);
+ if (!inode) {
+ spin_unlock(&inode_lock);
+ return res;
+ }
+ } else {
+ counter = max_reserved + 1;
+ }
+ goto retry;
+
+}
+
+struct inode *igrab(struct inode *inode)
+{
+ spin_lock(&inode_lock);
+ if (!(inode->i_state & I_FREEING))
+ __iget(inode);
+ else
+ /*
+ * Handle the case where s_op->clear_inode is not been
+ * called yet, and somebody is calling igrab
+ * while the inode is getting freed.
+ */
+ inode = NULL;
+ spin_unlock(&inode_lock);
+ return inode;
+}
+
+/**
+ * ifind - internal function, you want ilookup5() or iget5().
+ * @sb: super block of file system to search
+ * @hashval: hash value (usually inode number) to search for
+ * @test: callback used for comparisons between inodes
+ * @data: opaque data pointer to pass to @test
+ *
+ * ifind() searches for the inode specified by @hashval and @data in the inode
+ * cache. This is a generalized version of ifind_fast() for file systems where
+ * the inode number is not sufficient for unique identification of an inode.
+ *
+ * If the inode is in the cache, the inode is returned with an incremented
+ * reference count.
+ *
+ * Otherwise NULL is returned.
+ *
+ * Note, @test is called with the inode_lock held, so can't sleep.
+ */
+static inline struct inode *ifind(struct super_block *sb,
+ struct hlist_head *head, int (*test)(struct inode *, void *),
+ void *data)
+{
+ struct inode *inode;
+
+ spin_lock(&inode_lock);
+ inode = find_inode(sb, head, test, data);
+ if (inode) {
+ __iget(inode);
+ spin_unlock(&inode_lock);
+ wait_on_inode(inode);
+ return inode;
+ }
+ spin_unlock(&inode_lock);
+ return NULL;
+}
+
+/**
+ * ifind_fast - internal function, you want ilookup() or iget().
+ * @sb: super block of file system to search
+ * @ino: inode number to search for
+ *
+ * ifind_fast() searches for the inode @ino in the inode cache. This is for
+ * file systems where the inode number is sufficient for unique identification
+ * of an inode.
+ *
+ * If the inode is in the cache, the inode is returned with an incremented
+ * reference count.
+ *
+ * Otherwise NULL is returned.
+ */
+static inline struct inode *ifind_fast(struct super_block *sb,
+ struct hlist_head *head, unsigned long ino)
+{
+ struct inode *inode;
+
+ spin_lock(&inode_lock);
+ inode = find_inode_fast(sb, head, ino);
+ if (inode) {
+ __iget(inode);
+ spin_unlock(&inode_lock);
+ wait_on_inode(inode);
+ return inode;
+ }
+ spin_unlock(&inode_lock);
+ return NULL;
+}
+
+/**
+ * ilookup5 - search for an inode in the inode cache
+ * @sb: super block of file system to search
+ * @hashval: hash value (usually inode number) to search for
+ * @test: callback used for comparisons between inodes
+ * @data: opaque data pointer to pass to @test
+ *
+ * ilookup5() uses ifind() to search for the inode specified by @hashval and
+ * @data in the inode cache. This is a generalized version of ilookup() for
+ * file systems where the inode number is not sufficient for unique
+ * identification of an inode.
+ *
+ * If the inode is in the cache, the inode is returned with an incremented
+ * reference count.
+ *
+ * Otherwise NULL is returned.
+ *
+ * Note, @test is called with the inode_lock held, so can't sleep.
+ */
+struct inode *ilookup5(struct super_block *sb, unsigned long hashval,
+ int (*test)(struct inode *, void *), void *data)
+{
+ struct hlist_head *head = inode_hashtable + hash(sb, hashval);
+
+ return ifind(sb, head, test, data);
+}
+EXPORT_SYMBOL(ilookup5);
+
+/**
+ * ilookup - search for an inode in the inode cache
+ * @sb: super block of file system to search
+ * @ino: inode number to search for
+ *
+ * ilookup() uses ifind_fast() to search for the inode @ino in the inode cache.
+ * This is for file systems where the inode number is sufficient for unique
+ * identification of an inode.
+ *
+ * If the inode is in the cache, the inode is returned with an incremented
+ * reference count.
+ *
+ * Otherwise NULL is returned.
+ */
+struct inode *ilookup(struct super_block *sb, unsigned long ino)
+{
+ struct hlist_head *head = inode_hashtable + hash(sb, ino);
+
+ return ifind_fast(sb, head, ino);
+}
+EXPORT_SYMBOL(ilookup);
+
+/**
+ * iget5_locked - obtain an inode from a mounted file system
+ * @sb: super block of file system
+ * @hashval: hash value (usually inode number) to get
+ * @test: callback used for comparisons between inodes
+ * @set: callback used to initialize a new struct inode
+ * @data: opaque data pointer to pass to @test and @set
+ *
+ * This is iget() without the read_inode() portion of get_new_inode().
+ *
+ * iget5_locked() uses ifind() to search for the inode specified by @hashval
+ * and @data in the inode cache and if present it is returned with an increased
+ * reference count. This is a generalized version of iget_locked() for file
+ * systems where the inode number is not sufficient for unique identification
+ * of an inode.
+ *
+ * If the inode is not in cache, get_new_inode() is called to allocate a new
+ * inode and this is returned locked, hashed, and with the I_NEW flag set. The
+ * file system gets to fill it in before unlocking it via unlock_new_inode().
+ *
+ * Note both @test and @set are called with the inode_lock held, so can't sleep.
+ */
+struct inode *iget5_locked(struct super_block *sb, unsigned long hashval,
+ int (*test)(struct inode *, void *),
+ int (*set)(struct inode *, void *), void *data)
+{
+ struct hlist_head *head = inode_hashtable + hash(sb, hashval);
+ struct inode *inode;
+
+ inode = ifind(sb, head, test, data);
+ if (inode)
+ return inode;
+ /*
+ * get_new_inode() will do the right thing, re-trying the search
+ * in case it had to block at any point.
+ */
+ return get_new_inode(sb, head, test, set, data);
+}
+EXPORT_SYMBOL(iget5_locked);
+
+/**
+ * iget_locked - obtain an inode from a mounted file system
+ * @sb: super block of file system
+ * @ino: inode number to get
+ *
+ * This is iget() without the read_inode() portion of get_new_inode_fast().
+ *
+ * iget_locked() uses ifind_fast() to search for the inode specified by @ino in
+ * the inode cache and if present it is returned with an increased reference
+ * count. This is for file systems where the inode number is sufficient for
+ * unique identification of an inode.
+ *
+ * If the inode is not in cache, get_new_inode_fast() is called to allocate a
+ * new inode and this is returned locked, hashed, and with the I_NEW flag set.
+ * The file system gets to fill it in before unlocking it via
+ * unlock_new_inode().
+ */
+struct inode *iget_locked(struct super_block *sb, unsigned long ino)
+{
+ struct hlist_head *head = inode_hashtable + hash(sb, ino);
+ struct inode *inode;
+
+ inode = ifind_fast(sb, head, ino);
+ if (inode)
+ return inode;
+ /*
+ * get_new_inode_fast() will do the right thing, re-trying the search
+ * in case it had to block at any point.
+ */
+ return get_new_inode_fast(sb, head, ino);
+}
+EXPORT_SYMBOL(iget_locked);
+
+/**
+ * __insert_inode_hash - hash an inode
+ * @inode: unhashed inode
+ * @hashval: unsigned long value used to locate this object in the
+ * inode_hashtable.
+ *
+ * Add an inode to the inode hash for this superblock. If the inode
+ * has no superblock it is added to a separate anonymous chain.
+ */
+
+void __insert_inode_hash(struct inode *inode, unsigned long hashval)
+{
+ struct hlist_head *head = &anon_hash_chain;
+ if (inode->i_sb)
+ head = inode_hashtable + hash(inode->i_sb, hashval);
+ spin_lock(&inode_lock);
+ hlist_add_head(&inode->i_hash, head);
+ spin_unlock(&inode_lock);
+}
+
+/**
+ * remove_inode_hash - remove an inode from the hash
+ * @inode: inode to unhash
+ *
+ * Remove an inode from the superblock or anonymous hash.
+ */
+
+void remove_inode_hash(struct inode *inode)
+{
+ spin_lock(&inode_lock);
+ hlist_del_init(&inode->i_hash);
+ spin_unlock(&inode_lock);
+}
+
+void generic_delete_inode(struct inode *inode)
+{
+ struct super_operations *op = inode->i_sb->s_op;
+
+ hlist_del_init(&inode->i_hash);
+ list_del_init(&inode->i_list);
+ inode->i_state|=I_FREEING;
+ inodes_stat.nr_inodes--;
+ spin_unlock(&inode_lock);
+
+ if (inode->i_data.nrpages)
+ truncate_inode_pages(&inode->i_data, 0);
+
+ security_inode_delete(inode);
+
+ if (op->delete_inode) {
+ void (*delete)(struct inode *) = op->delete_inode;
+ if (!is_bad_inode(inode))
+ DQUOT_INIT(inode);
+ /* s_op->delete_inode internally recalls clear_inode() */
+ delete(inode);
+ } else
+ clear_inode(inode);
+ if (inode->i_state != I_CLEAR)
+ BUG();
+ destroy_inode(inode);
+}
+EXPORT_SYMBOL(generic_delete_inode);
+
+static void generic_forget_inode(struct inode *inode)
+{
+ struct super_block *sb = inode->i_sb;
+
+ if (!hlist_unhashed(&inode->i_hash)) {
+ if (!(inode->i_state & (I_DIRTY|I_LOCK))) {
+ list_del(&inode->i_list);
+ list_add(&inode->i_list, &inode_unused);
+ }
+ inodes_stat.nr_unused++;
+ spin_unlock(&inode_lock);
+ if (!sb || (sb->s_flags & MS_ACTIVE))
+ return;
+ write_inode_now(inode, 1);
+ spin_lock(&inode_lock);
+ inodes_stat.nr_unused--;
+ hlist_del_init(&inode->i_hash);
+ }
+ list_del_init(&inode->i_list);
+ inode->i_state|=I_FREEING;
+ inodes_stat.nr_inodes--;
+ spin_unlock(&inode_lock);
+ if (inode->i_data.nrpages)
+ truncate_inode_pages(&inode->i_data, 0);
+ clear_inode(inode);
+ destroy_inode(inode);
+}
+
+/*
+ * Normal UNIX filesystem behaviour: delete the
+ * inode when the usage count drops to zero, and
+ * i_nlink is zero.
+ */
+static void generic_drop_inode(struct inode *inode)
+{
+ if (!inode->i_nlink)
+ generic_delete_inode(inode);
+ else
+ generic_forget_inode(inode);
+}
+
+/*
+ * Called when we're dropping the last reference
+ * to an inode.
+ *
+ * Call the FS "drop()" function, defaulting to
+ * the legacy UNIX filesystem behaviour..
+ *
+ * NOTE! NOTE! NOTE! We're called with the inode lock
+ * held, and the drop function is supposed to release
+ * the lock!
+ */
+static inline void iput_final(struct inode *inode)
+{
+ struct super_operations *op = inode->i_sb->s_op;
+ void (*drop)(struct inode *) = generic_drop_inode;
+
+ if (op && op->drop_inode)
+ drop = op->drop_inode;
+ drop(inode);
+}
+
+/**
+ * iput - put an inode
+ * @inode: inode to put
+ *
+ * Puts an inode, dropping its usage count. If the inode use count hits
+ * zero the inode is also then freed and may be destroyed.
+ */
+
+void iput(struct inode *inode)
+{
+ if (inode) {
+ struct super_operations *op = inode->i_sb->s_op;
+
+ if (inode->i_state == I_CLEAR)
+ BUG();
+
+ if (op && op->put_inode)
+ op->put_inode(inode);
+
+ if (atomic_dec_and_lock(&inode->i_count, &inode_lock))
+ iput_final(inode);
+ }
+}
+
+/**
+ * bmap - find a block number in a file
+ * @inode: inode of file
+ * @block: block to find
+ *
+ * Returns the block number on the device holding the inode that
+ * is the disk block number for the block of the file requested.
+ * That is, asked for block 4 of inode 1 the function will return the
+ * disk block relative to the disk start that holds that block of the
+ * file.
+ */
+
+sector_t bmap(struct inode * inode, sector_t block)
+{
+ sector_t res = 0;
+ if (inode->i_mapping->a_ops->bmap)
+ res = inode->i_mapping->a_ops->bmap(inode->i_mapping, block);
+ return res;
+}
+
+/*
+ * Return true if the filesystem which backs this inode considers the two
+ * passed timespecs to be sufficiently different to warrant flushing the
+ * altered time out to disk.
+ */
+static int inode_times_differ(struct inode *inode,
+ struct timespec *old, struct timespec *new)
+{
+ if (IS_ONE_SECOND(inode))
+ return old->tv_sec != new->tv_sec;
+ return !timespec_equal(old, new);
+}
+
+/**
+ * update_atime - update the access time
+ * @inode: inode accessed
+ *
+ * Update the accessed time on an inode and mark it for writeback.
+ * This function automatically handles read only file systems and media,
+ * as well as the "noatime" flag and inode specific "noatime" markers.
+ */
+
+void update_atime(struct inode *inode)
+{
+ struct timespec now;
+
+ if (IS_NOATIME(inode))
+ return;
+ if (IS_NODIRATIME(inode) && S_ISDIR(inode->i_mode))
+ return;
+ if (IS_RDONLY(inode))
+ return;
+
+ now = current_kernel_time();
+ if (inode_times_differ(inode, &inode->i_atime, &now)) {
+ inode->i_atime = now;
+ mark_inode_dirty_sync(inode);
+ } else {
+ if (!timespec_equal(&inode->i_atime, &now))
+ inode->i_atime = now;
+ }
+}
+
+/**
+ * inode_update_time - update mtime and ctime time
+ * @inode: inode accessed
+ * @ctime_too: update ctime too
+ *
+ * Update the mtime time on an inode and mark it for writeback.
+ * When ctime_too is specified update the ctime too.
+ */
+
+void inode_update_time(struct inode *inode, int ctime_too)
+{
+ struct timespec now = current_kernel_time();
+ int sync_it = 0;
+
+ if (inode_times_differ(inode, &inode->i_mtime, &now))
+ sync_it = 1;
+ inode->i_mtime = now;
+
+ if (ctime_too) {
+ if (inode_times_differ(inode, &inode->i_ctime, &now))
+ sync_it = 1;
+ inode->i_ctime = now;
+ }
+ if (sync_it)
+ mark_inode_dirty_sync(inode);
+}
+EXPORT_SYMBOL(inode_update_time);
+
+int inode_needs_sync(struct inode *inode)
+{
+ if (IS_SYNC(inode))
+ return 1;
+ if (S_ISDIR(inode->i_mode) && IS_DIRSYNC(inode))
+ return 1;
+ return 0;
+}
+EXPORT_SYMBOL(inode_needs_sync);
+
+/*
+ * Quota functions that want to walk the inode lists..
+ */
+#ifdef CONFIG_QUOTA
+
+/* Functions back in dquot.c */
+void put_dquot_list(struct list_head *);
+int remove_inode_dquot_ref(struct inode *, int, struct list_head *);
+
+void remove_dquot_ref(struct super_block *sb, int type)
+{
+ struct inode *inode;
+ struct list_head *act_head;
+ LIST_HEAD(tofree_head);
+
+ if (!sb->dq_op)
+ return; /* nothing to do */
+ spin_lock(&inode_lock); /* This lock is for inodes code */
+ /* We don't have to lock against quota code - test IS_QUOTAINIT is just for speedup... */
+
+ list_for_each(act_head, &inode_in_use) {
+ inode = list_entry(act_head, struct inode, i_list);
+ if (inode->i_sb == sb && IS_QUOTAINIT(inode))
+ remove_inode_dquot_ref(inode, type, &tofree_head);
+ }
+ list_for_each(act_head, &inode_unused) {
+ inode = list_entry(act_head, struct inode, i_list);
+ if (inode->i_sb == sb && IS_QUOTAINIT(inode))
+ remove_inode_dquot_ref(inode, type, &tofree_head);
+ }
+ list_for_each(act_head, &sb->s_dirty) {
+ inode = list_entry(act_head, struct inode, i_list);
+ if (IS_QUOTAINIT(inode))
+ remove_inode_dquot_ref(inode, type, &tofree_head);
+ }
+ list_for_each(act_head, &sb->s_io) {
+ inode = list_entry(act_head, struct inode, i_list);
+ if (IS_QUOTAINIT(inode))
+ remove_inode_dquot_ref(inode, type, &tofree_head);
+ }
+ spin_unlock(&inode_lock);
+
+ put_dquot_list(&tofree_head);
+}
+
+#endif
+
+/*
+ * Hashed waitqueues for wait_on_inode(). The table is pretty small - the
+ * kernel doesn't lock many inodes at the same time.
+ */
+#define I_WAIT_TABLE_ORDER 3
+static struct i_wait_queue_head {
+ wait_queue_head_t wqh;
+} ____cacheline_aligned_in_smp i_wait_queue_heads[1<<I_WAIT_TABLE_ORDER];
+
+/*
+ * Return the address of the waitqueue_head to be used for this inode
+ */
+static wait_queue_head_t *i_waitq_head(struct inode *inode)
+{
+ return &i_wait_queue_heads[hash_ptr(inode, I_WAIT_TABLE_ORDER)].wqh;
+}
+
+void __wait_on_inode(struct inode *inode)
+{
+ DECLARE_WAITQUEUE(wait, current);
+ wait_queue_head_t *wq = i_waitq_head(inode);
+
+ add_wait_queue(wq, &wait);
+repeat:
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ if (inode->i_state & I_LOCK) {
+ schedule();
+ goto repeat;
+ }
+ remove_wait_queue(wq, &wait);
+ __set_current_state(TASK_RUNNING);
+}
+
+void wake_up_inode(struct inode *inode)
+{
+ wait_queue_head_t *wq = i_waitq_head(inode);
+
+ /*
+ * Prevent speculative execution through spin_unlock(&inode_lock);
+ */
+ smp_mb();
+ if (waitqueue_active(wq))
+ wake_up_all(wq);
+}
+
+/*
+ * Initialize the waitqueues and inode hash table.
+ */
+void __init inode_init(unsigned long mempages)
+{
+ struct hlist_head *head;
+ unsigned long order;
+ unsigned int nr_hash;
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(i_wait_queue_heads); i++)
+ init_waitqueue_head(&i_wait_queue_heads[i].wqh);
+
+ mempages >>= (14 - PAGE_SHIFT);
+ mempages *= sizeof(struct list_head);
+ for (order = 0; ((1UL << order) << PAGE_SHIFT) < mempages; order++)
+ ;
+
+ do {
+ unsigned long tmp;
+
+ nr_hash = (1UL << order) * PAGE_SIZE /
+ sizeof(struct hlist_head);
+ i_hash_mask = (nr_hash - 1);
+
+ tmp = nr_hash;
+ i_hash_shift = 0;
+ while ((tmp >>= 1UL) != 0UL)
+ i_hash_shift++;
+
+ inode_hashtable = (struct hlist_head *)
+ __get_free_pages(GFP_ATOMIC, order);
+ } while (inode_hashtable == NULL && --order >= 0);
+
+ printk("Inode-cache hash table entries: %d (order: %ld, %ld bytes)\n",
+ nr_hash, order, (PAGE_SIZE << order));
+
+ if (!inode_hashtable)
+ panic("Failed to allocate inode hash table\n");
+
+ head = inode_hashtable;
+ i = nr_hash;
+ do {
+ INIT_HLIST_HEAD(head);
+ head++;
+ i--;
+ } while (i);
+
+ /* inode slab cache */
+ inode_cachep = kmem_cache_create("inode_cache", sizeof(struct inode),
+ 0, SLAB_HWCACHE_ALIGN, init_once,
+ NULL);
+ if (!inode_cachep)
+ panic("cannot create inode slab cache");
+
+ set_shrinker(DEFAULT_SEEKS, shrink_icache_memory);
+}
+
+void init_special_inode(struct inode *inode, umode_t mode, dev_t rdev)
+{
+ inode->i_mode = mode;
+ if (S_ISCHR(mode)) {
+ inode->i_fop = &def_chr_fops;
+ inode->i_rdev = to_kdev_t(rdev);
+ } else if (S_ISBLK(mode)) {
+ inode->i_fop = &def_blk_fops;
+ inode->i_rdev = to_kdev_t(rdev);
+ } else if (S_ISFIFO(mode))
+ inode->i_fop = &def_fifo_fops;
+ else if (S_ISSOCK(mode))
+ inode->i_fop = &bad_sock_fops;
+ else
+ printk(KERN_DEBUG "init_special_inode: bogus i_mode (%o)\n",
+ mode);
+}
diff --git a/tests/linux/inode-fullpatch/patch b/tests/linux/inode-fullpatch/patch
new file mode 100644
index 0000000..aeafa41
--- /dev/null
+++ b/tests/linux/inode-fullpatch/patch
@@ -0,0 +1,77 @@
+
+diff ./fs/inode.c~current~ ./fs/inode.c
+--- ./fs/inode.c~current~ 2003-03-10 15:13:52.000000000 +1100
++++ ./fs/inode.c 2003-03-10 15:13:53.000000000 +1100
+@@ -470,6 +470,7 @@ static int shrink_icache_memory(int nr,
+ return inodes_stat.nr_inodes;
+ }
+
++void __wait_on_freeing_inode(struct inode *inode);
+ /*
+ * Called with the inode lock held.
+ * NOTE: we are not increasing the inode-refcount, you must call __iget()
+@@ -492,6 +493,11 @@ static struct inode * find_inode(struct
+ continue;
+ if (!test(inode, data))
+ continue;
++ if (inode->i_state & (I_FREEING|I_CLEAR)) {
++ __wait_on_freeing_inode(inode);
++ tmp = head;
++ continue;
++ }
+ break;
+ }
+ return inode;
+@@ -517,6 +523,11 @@ static struct inode * find_inode_fast(st
+ continue;
+ if (inode->i_sb != sb)
+ continue;
++ if (inode->i_state & (I_FREEING|I_CLEAR)) {
++ __wait_on_freeing_inode(inode);
++ tmp = head;
++ continue;
++ }
+ break;
+ }
+ return inode;
+@@ -949,7 +960,6 @@ void generic_delete_inode(struct inode *
+ {
+ struct super_operations *op = inode->i_sb->s_op;
+
+- list_del_init(&inode->i_hash);
+ list_del_init(&inode->i_list);
+ inode->i_state|=I_FREEING;
+ inodes_stat.nr_inodes--;
+@@ -968,6 +978,10 @@ void generic_delete_inode(struct inode *
+ delete(inode);
+ } else
+ clear_inode(inode);
++ spin_lock(&inode_lock);
++ list_del_init(&inode->i_hash);
++ spin_unlock(&inode_lock);
++ wake_up_inode(inode);
+ if (inode->i_state != I_CLEAR)
+ BUG();
+ destroy_inode(inode);
+@@ -1219,6 +1233,21 @@ repeat:
+ current->state = TASK_RUNNING;
+ }
+
++void __wait_on_freeing_inode(struct inode *inode)
++{
++ DECLARE_WAITQUEUE(wait, current);
++ wait_queue_head_t *wq = i_waitq_head(inode);
++
++ add_wait_queue(wq, &wait);
++ set_current_state(TASK_UNINTERRUPTIBLE);
++ spin_unlock(&inode_lock);
++ schedule();
++ remove_wait_queue(wq, &wait);
++ current->state = TASK_RUNNING;
++ spin_lock(&inode_lock);
++}
++
++
+ void wake_up_inode(struct inode *inode)
+ {
+ wait_queue_head_t *wq = i_waitq_head(inode);
diff --git a/tests/linux/inode-fullpatch/rediff b/tests/linux/inode-fullpatch/rediff
new file mode 100644
index 0000000..ea080cf
--- /dev/null
+++ b/tests/linux/inode-fullpatch/rediff
@@ -0,0 +1,73 @@
+@@ -470,6 +470,7 @@
+ return inodes_stat.nr_inodes;
+ }
+
++void __wait_on_freeing_inode(struct inode *inode);
+ /*
+ * Called with the inode lock held.
+ * NOTE: we are not increasing the inode-refcount, you must call __iget()
+@@ -492,6 +493,11 @@
+ continue;
+ if (!test(inode, data))
+ continue;
++ if (inode->i_state & (I_FREEING|I_CLEAR)) {
++ __wait_on_freeing_inode(inode);
++ tmp = head;
++ continue;
++ }
+ break;
+ }
+ return inode;
+@@ -517,6 +523,11 @@
+ continue;
+ if (inode->i_sb != sb)
+ continue;
++ if (inode->i_state & (I_FREEING|I_CLEAR)) {
++ __wait_on_freeing_inode(inode);
++ tmp = head;
++ continue;
++ }
+ break;
+ }
+ return inode;
+@@ -949,7 +960,6 @@
+ {
+ struct super_operations *op = inode->i_sb->s_op;
+
+- list_del_init(&inode->i_hash);
+ list_del_init(&inode->i_list);
+ inode->i_state|=I_FREEING;
+ inodes_stat.nr_inodes--;
+@@ -968,6 +978,10 @@
+ delete(inode);
+ } else
+ clear_inode(inode);
++ spin_lock(&inode_lock);
++ list_del_init(&inode->i_hash);
++ spin_unlock(&inode_lock);
++ wake_up_inode(inode);
+ if (inode->i_state != I_CLEAR)
+ BUG();
+ destroy_inode(inode);
+@@ -1219,6 +1233,21 @@
+ current->state = TASK_RUNNING;
+ }
+
++void __wait_on_freeing_inode(struct inode *inode)
++{
++ DECLARE_WAITQUEUE(wait, current);
++ wait_queue_head_t *wq = i_waitq_head(inode);
++
++ add_wait_queue(wq, &wait);
++ set_current_state(TASK_UNINTERRUPTIBLE);
++ spin_unlock(&inode_lock);
++ schedule();
++ remove_wait_queue(wq, &wait);
++ current->state = TASK_RUNNING;
++ spin_lock(&inode_lock);
++}
++
++
+ void wake_up_inode(struct inode *inode)
+ {
+ wait_queue_head_t *wq = i_waitq_head(inode);
diff --git a/tests/linux/inode-fullpatch/wmerge b/tests/linux/inode-fullpatch/wmerge
new file mode 100644
index 0000000..1ffda02
--- /dev/null
+++ b/tests/linux/inode-fullpatch/wmerge
@@ -0,0 +1,1352 @@
+/*
+ * linux/fs/inode.c
+ *
+ * (C) 1997 Linus Torvalds
+ */
+
+#include <linux/config.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/dcache.h>
+#include <linux/init.h>
+#include <linux/quotaops.h>
+#include <linux/slab.h>
+#include <linux/writeback.h>
+#include <linux/module.h>
+#include <linux/backing-dev.h>
+#include <linux/wait.h>
+#include <linux/hash.h>
+#include <linux/swap.h>
+#include <linux/security.h>
+
+/*
+ * This is needed for the following functions:
+ * - inode_has_buffers
+ * - invalidate_inode_buffers
+ * - fsync_bdev
+ * - invalidate_bdev
+ *
+ * FIXME: remove all knowledge of the buffer layer from this file
+ */
+#include <linux/buffer_head.h>
+
+/*
+ * New inode.c implementation.
+ *
+ * This implementation has the basic premise of trying
+ * to be extremely low-overhead and SMP-safe, yet be
+ * simple enough to be "obviously correct".
+ *
+ * Famous last words.
+ */
+
+/* inode dynamic allocation 1999, Andrea Arcangeli <andrea@suse.de> */
+
+/* #define INODE_PARANOIA 1 */
+/* #define INODE_DEBUG 1 */
+
+/*
+ * Inode lookup is no longer as critical as it used to be:
+ * most of the lookups are going to be through the dcache.
+ */
+#define I_HASHBITS i_hash_shift
+#define I_HASHMASK i_hash_mask
+
+static unsigned int i_hash_mask;
+static unsigned int i_hash_shift;
+
+/*
+ * Each inode can be on two separate lists. One is
+ * the hash list of the inode, used for lookups. The
+ * other linked list is the "type" list:
+ * "in_use" - valid inode, i_count > 0, i_nlink > 0
+ * "dirty" - as "in_use" but also dirty
+ * "unused" - valid inode, i_count = 0
+ *
+ * A "dirty" list is maintained for each super block,
+ * allowing for low-overhead inode sync() operations.
+ */
+
+LIST_HEAD(inode_in_use);
+LIST_HEAD(inode_unused);
+static struct hlist_head *inode_hashtable;
+static HLIST_HEAD(anon_hash_chain); /* for inodes with NULL i_sb */
+
+/*
+ * A simple spinlock to protect the list manipulations.
+ *
+ * NOTE! You also have to own the lock if you change
+ * the i_state of an inode while it is in use..
+ */
+spinlock_t inode_lock = SPIN_LOCK_UNLOCKED;
+
+/*
+ * iprune_sem provides exclusion between the kswapd or try_to_free_pages
+ * icache shrinking path, and the umount path. Without this exclusion,
+ * by the time prune_icache calls iput for the inode whose pages it has
+ * been invalidating, or by the time it calls clear_inode & destroy_inode
+ * from its final dispose_list, the struct super_block they refer to
+ * (for inode->i_sb->s_op) may already have been freed and reused.
+ */
+static DECLARE_MUTEX(iprune_sem);
+
+/*
+ * Statistics gathering..
+ */
+struct inodes_stat_t inodes_stat;
+
+static kmem_cache_t * inode_cachep;
+
+static struct inode *alloc_inode(struct super_block *sb)
+{
+ static struct address_space_operations empty_aops;
+ static struct inode_operations empty_iops;
+ static struct file_operations empty_fops;
+ struct inode *inode;
+
+ if (sb->s_op->alloc_inode)
+ inode = sb->s_op->alloc_inode(sb);
+ else
+ inode = (struct inode *) kmem_cache_alloc(inode_cachep, SLAB_KERNEL);
+
+ if (inode) {
+ struct address_space * const mapping = &inode->i_data;
+
+ inode->i_sb = sb;
+ inode->i_blkbits = sb->s_blocksize_bits;
+ inode->i_flags = 0;
+ atomic_set(&inode->i_count, 1);
+ inode->i_sock = 0;
+ inode->i_op = &empty_iops;
+ inode->i_fop = &empty_fops;
+ inode->i_nlink = 1;
+ atomic_set(&inode->i_writecount, 0);
+ inode->i_size = 0;
+ inode->i_blocks = 0;
+ inode->i_bytes = 0;
+ inode->i_generation = 0;
+ memset(&inode->i_dquot, 0, sizeof(inode->i_dquot));
+ inode->i_pipe = NULL;
+ inode->i_bdev = NULL;
+ inode->i_rdev = to_kdev_t(0);
+ inode->i_security = NULL;
+ if (security_inode_alloc(inode)) {
+ if (inode->i_sb->s_op->destroy_inode)
+ inode->i_sb->s_op->destroy_inode(inode);
+ else
+ kmem_cache_free(inode_cachep, (inode));
+ return NULL;
+ }
+
+ mapping->a_ops = &empty_aops;
+ mapping->host = inode;
+ mapping->gfp_mask = GFP_HIGHUSER;
+ mapping->dirtied_when = 0;
+ mapping->assoc_mapping = NULL;
+ mapping->backing_dev_info = &default_backing_dev_info;
+ if (sb->s_bdev)
+ mapping->backing_dev_info = sb->s_bdev->bd_inode->i_mapping->backing_dev_info;
+ memset(&inode->u, 0, sizeof(inode->u));
+ inode->i_mapping = mapping;
+ }
+ return inode;
+}
+
+void destroy_inode(struct inode *inode)
+{
+ if (inode_has_buffers(inode))
+ BUG();
+ security_inode_free(inode);
+ if (inode->i_sb->s_op->destroy_inode)
+ inode->i_sb->s_op->destroy_inode(inode);
+ else
+ kmem_cache_free(inode_cachep, (inode));
+}
+
+
+/*
+ * These are initializations that only need to be done
+ * once, because the fields are idempotent across use
+ * of the inode, so let the slab aware of that.
+ */
+void inode_init_once(struct inode *inode)
+{
+ memset(inode, 0, sizeof(*inode));
+ INIT_HLIST_NODE(&inode->i_hash);
+ INIT_LIST_HEAD(&inode->i_data.clean_pages);
+ INIT_LIST_HEAD(&inode->i_data.dirty_pages);
+ INIT_LIST_HEAD(&inode->i_data.locked_pages);
+ INIT_LIST_HEAD(&inode->i_data.io_pages);
+ INIT_LIST_HEAD(&inode->i_dentry);
+ INIT_LIST_HEAD(&inode->i_devices);
+ sema_init(&inode->i_sem, 1);
+ INIT_RADIX_TREE(&inode->i_data.page_tree, GFP_ATOMIC);
+ rwlock_init(&inode->i_data.page_lock);
+ init_MUTEX(&inode->i_data.i_shared_sem);
+ INIT_LIST_HEAD(&inode->i_data.private_list);
+ spin_lock_init(&inode->i_data.private_lock);
+ INIT_LIST_HEAD(&inode->i_data.i_mmap);
+ INIT_LIST_HEAD(&inode->i_data.i_mmap_shared);
+ spin_lock_init(&inode->i_lock);
+}
+
+static void init_once(void * foo, kmem_cache_t * cachep, unsigned long flags)
+{
+ struct inode * inode = (struct inode *) foo;
+
+ if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
+ SLAB_CTOR_CONSTRUCTOR)
+ inode_init_once(inode);
+}
+
+/*
+ * inode_lock must be held
+ */
+void __iget(struct inode * inode)
+{
+ if (atomic_read(&inode->i_count)) {
+ atomic_inc(&inode->i_count);
+ return;
+ }
+ atomic_inc(&inode->i_count);
+ if (!(inode->i_state & (I_DIRTY|I_LOCK))) {
+ list_del(&inode->i_list);
+ list_add(&inode->i_list, &inode_in_use);
+ }
+ inodes_stat.nr_unused--;
+}
+
+/**
+ * clear_inode - clear an inode
+ * @inode: inode to clear
+ *
+ * This is called by the filesystem to tell us
+ * that the inode is no longer useful. We just
+ * terminate it with extreme prejudice.
+ */
+
+void clear_inode(struct inode *inode)
+{
+ invalidate_inode_buffers(inode);
+
+ if (inode->i_data.nrpages)
+ BUG();
+ if (!(inode->i_state & I_FREEING))
+ BUG();
+ if (inode->i_state & I_CLEAR)
+ BUG();
+ wait_on_inode(inode);
+ DQUOT_DROP(inode);
+ if (inode->i_sb && inode->i_sb->s_op->clear_inode)
+ inode->i_sb->s_op->clear_inode(inode);
+ if (inode->i_bdev)
+ bd_forget(inode);
+ inode->i_state = I_CLEAR;
+}
+
+/*
+ * Dispose-list gets a local list with local inodes in it, so it doesn't
+ * need to worry about list corruption and SMP locks.
+ */
+static void dispose_list(struct list_head *head)
+{
+ int nr_disposed = 0;
+
+ while (!list_empty(head)) {
+ struct inode *inode;
+
+ inode = list_entry(head->next, struct inode, i_list);
+ list_del(&inode->i_list);
+
+ if (inode->i_data.nrpages)
+ truncate_inode_pages(&inode->i_data, 0);
+ clear_inode(inode);
+ destroy_inode(inode);
+ nr_disposed++;
+ }
+ spin_lock(&inode_lock);
+ inodes_stat.nr_inodes -= nr_disposed;
+ spin_unlock(&inode_lock);
+}
+
+/*
+ * Invalidate all inodes for a device.
+ */
+static int invalidate_list(struct list_head *head, struct super_block * sb, struct list_head * dispose)
+{
+ struct list_head *next;
+ int busy = 0, count = 0;
+
+ next = head->next;
+ for (;;) {
+ struct list_head * tmp = next;
+ struct inode * inode;
+
+ next = next->next;
+ if (tmp == head)
+ break;
+ inode = list_entry(tmp, struct inode, i_list);
+ if (inode->i_sb != sb)
+ continue;
+ invalidate_inode_buffers(inode);
+ if (!atomic_read(&inode->i_count)) {
+ hlist_del_init(&inode->i_hash);
+ list_del(&inode->i_list);
+ list_add(&inode->i_list, dispose);
+ inode->i_state |= I_FREEING;
+ count++;
+ continue;
+ }
+ busy = 1;
+ }
+ /* only unused inodes may be cached with i_count zero */
+ inodes_stat.nr_unused -= count;
+ return busy;
+}
+
+/*
+ * This is a two-stage process. First we collect all
+ * offending inodes onto the throw-away list, and in
+ * the second stage we actually dispose of them. This
+ * is because we don't want to sleep while messing
+ * with the global lists..
+ */
+
+/**
+ * invalidate_inodes - discard the inodes on a device
+ * @sb: superblock
+ *
+ * Discard all of the inodes for a given superblock. If the discard
+ * fails because there are busy inodes then a non zero value is returned.
+ * If the discard is successful all the inodes have been discarded.
+ */
+
+int invalidate_inodes(struct super_block * sb)
+{
+ int busy;
+ LIST_HEAD(throw_away);
+
+ down(&iprune_sem);
+ spin_lock(&inode_lock);
+ busy = invalidate_list(&inode_in_use, sb, &throw_away);
+ busy |= invalidate_list(&inode_unused, sb, &throw_away);
+ busy |= invalidate_list(&sb->s_dirty, sb, &throw_away);
+ busy |= invalidate_list(&sb->s_io, sb, &throw_away);
+ spin_unlock(&inode_lock);
+
+ dispose_list(&throw_away);
+ up(&iprune_sem);
+
+ return busy;
+}
+
+int invalidate_device(kdev_t dev, int do_sync)
+{
+ struct super_block *sb;
+ struct block_device *bdev = bdget(kdev_t_to_nr(dev));
+ int res;
+
+ if (!bdev)
+ return 0;
+
+ if (do_sync)
+ fsync_bdev(bdev);
+
+ res = 0;
+ sb = get_super(bdev);
+ if (sb) {
+ /*
+ * no need to lock the super, get_super holds the
+ * read semaphore so the filesystem cannot go away
+ * under us (->put_super runs with the write lock
+ * hold).
+ */
+ shrink_dcache_sb(sb);
+ res = invalidate_inodes(sb);
+ drop_super(sb);
+ }
+ invalidate_bdev(bdev, 0);
+ bdput(bdev);
+ return res;
+}
+
+static int can_unuse(struct inode *inode)
+{
+ if (inode->i_state)
+ return 0;
+ if (inode_has_buffers(inode))
+ return 0;
+ if (atomic_read(&inode->i_count))
+ return 0;
+ if (inode->i_data.nrpages)
+ return 0;
+ return 1;
+}
+
+/*
+ * Scan `goal' inodes on the unused list for freeable ones. They are moved to
+ * a temporary list and then are freed outside inode_lock by dispose_list().
+ *
+ * Any inodes which are pinned purely because of attached pagecache have their
+ * pagecache removed. We expect the final iput() on that inode to add it to
+ * the front of the inode_unused list. So look for it there and if the
+ * inode is still freeable, proceed. The right inode is found 99.9% of the
+ * time in testing on a 4-way.
+ *
+ * If the inode has metadata buffers attached to mapping->private_list then
+ * try to remove them.
+ */
+static void prune_icache(int nr_to_scan)
+{
+ LIST_HEAD(freeable);
+ int nr_pruned = 0;
+ int nr_scanned;
+ unsigned long reap = 0;
+
+ down(&iprune_sem);
+ spin_lock(&inode_lock);
+ for (nr_scanned = 0; nr_scanned < nr_to_scan; nr_scanned++) {
+ struct inode *inode;
+
+ if (list_empty(&inode_unused))
+ break;
+
+ inode = list_entry(inode_unused.prev, struct inode, i_list);
+
+ if (inode->i_state || atomic_read(&inode->i_count)) {
+ list_move(&inode->i_list, &inode_unused);
+ continue;
+ }
+ if (inode_has_buffers(inode) || inode->i_data.nrpages) {
+ __iget(inode);
+ spin_unlock(&inode_lock);
+ if (remove_inode_buffers(inode))
+ reap += invalidate_inode_pages(&inode->i_data);
+ iput(inode);
+ spin_lock(&inode_lock);
+
+ if (inode != list_entry(inode_unused.next,
+ struct inode, i_list))
+ continue; /* wrong inode or list_empty */
+ if (!can_unuse(inode))
+ continue;
+ }
+ hlist_del_init(&inode->i_hash);
+ list_move(&inode->i_list, &freeable);
+ inode->i_state |= I_FREEING;
+ nr_pruned++;
+ }
+ inodes_stat.nr_unused -= nr_pruned;
+ spin_unlock(&inode_lock);
+
+ dispose_list(&freeable);
+ up(&iprune_sem);
+
+ if (current_is_kswapd)
+ mod_page_state(kswapd_inodesteal, reap);
+ else
+ mod_page_state(pginodesteal, reap);
+}
+
+/*
+ * shrink_icache_memory() will attempt to reclaim some unused inodes. Here,
+ * "unused" means that no dentries are referring to the inodes: the files are
+ * not open and the dcache references to those inodes have already been
+ * reclaimed.
+ *
+ * This function is passed the number of inodes to scan, and it returns the
+ * total number of remaining possibly-reclaimable inodes.
+ */
+static int shrink_icache_memory(int nr, unsigned int gfp_mask)
+{
+ if (nr) {
+ /*
+ * Nasty deadlock avoidance. We may hold various FS locks,
+ * and we don't want to recurse into the FS that called us
+ * in clear_inode() and friends..
+ */
+ if (gfp_mask & __GFP_FS)
+ prune_icache(nr);
+ }
+ return inodes_stat.nr_unused;
+}
+
+void __wait_on_freeing_inode(struct inode *inode);
+/*
+ * Called with the inode lock held.
+ * NOTE: we are not increasing the inode-refcount, you must call __iget()
+ * by hand after calling find_inode now! This simplifies iunique and won't
+ * add any additional branch in the common code.
+ */
+static struct inode * find_inode(struct super_block * sb, struct hlist_head *head, int (*test)(struct inode *, void *), void *data)
+{
+ struct hlist_node *node;
+ struct inode * inode = NULL;
+
+ hlist_for_each (node, head) {
+ prefetch(node->next);
+ inode = hlist_entry(node, struct inode, i_hash);
+ if (inode->i_sb != sb)
+ continue;
+ if (!test(inode, data))
+ continue;
+ if (inode->i_state & (I_FREEING|I_CLEAR)) {
+ __wait_on_freeing_inode(inode);
+ tmp = head;
+ continue;
+ }
+ break;
+ }
+ return node ? inode : NULL;
+}
+
+/*
+ * find_inode_fast is the fast path version of find_inode, see the comment at
+ * iget_locked for details.
+ */
+static struct inode * find_inode_fast(struct super_block * sb, struct hlist_head *head, unsigned long ino)
+{
+ struct hlist_node *node;
+ struct inode * inode = NULL;
+
+ hlist_for_each (node, head) {
+ prefetch(node->next);
+ inode = list_entry(node, struct inode, i_hash);
+ if (inode->i_ino != ino)
+ continue;
+ if (inode->i_sb != sb)
+ continue;
+ if (inode->i_state & (I_FREEING|I_CLEAR)) {
+ __wait_on_freeing_inode(inode);
+ tmp = head;
+ continue;
+ }
+ break;
+ }
+ return node ? inode : NULL;
+}
+
+/**
+ * new_inode - obtain an inode
+ * @sb: superblock
+ *
+ * Allocates a new inode for given superblock.
+ */
+
+struct inode *new_inode(struct super_block *sb)
+{
+ static unsigned long last_ino;
+ struct inode * inode;
+
+ spin_lock_prefetch(&inode_lock);
+
+ inode = alloc_inode(sb);
+ if (inode) {
+ spin_lock(&inode_lock);
+ inodes_stat.nr_inodes++;
+ list_add(&inode->i_list, &inode_in_use);
+ inode->i_ino = ++last_ino;
+ inode->i_state = 0;
+ spin_unlock(&inode_lock);
+ }
+ return inode;
+}
+
+void unlock_new_inode(struct inode *inode)
+{
+ /*
+ * This is special! We do not need the spinlock
+ * when clearing I_LOCK, because we're guaranteed
+ * that nobody else tries to do anything about the
+ * state of the inode when it is locked, as we
+ * just created it (so there can be no old holders
+ * that haven't tested I_LOCK).
+ */
+ inode->i_state &= ~(I_LOCK|I_NEW);
+ wake_up_inode(inode);
+}
+EXPORT_SYMBOL(unlock_new_inode);
+
+/*
+ * This is called without the inode lock held.. Be careful.
+ *
+ * We no longer cache the sb_flags in i_flags - see fs.h
+ * -- rmk@arm.uk.linux.org
+ */
+static struct inode * get_new_inode(struct super_block *sb, struct hlist_head *head, int (*test)(struct inode *, void *), int (*set)(struct inode *, void *), void *data)
+{
+ struct inode * inode;
+
+ inode = alloc_inode(sb);
+ if (inode) {
+ struct inode * old;
+
+ spin_lock(&inode_lock);
+ /* We released the lock, so.. */
+ old = find_inode(sb, head, test, data);
+ if (!old) {
+ if (set(inode, data))
+ goto set_failed;
+
+ inodes_stat.nr_inodes++;
+ list_add(&inode->i_list, &inode_in_use);
+ hlist_add_head(&inode->i_hash, head);
+ inode->i_state = I_LOCK|I_NEW;
+ spin_unlock(&inode_lock);
+
+ /* Return the locked inode with I_NEW set, the
+ * caller is responsible for filling in the contents
+ */
+ return inode;
+ }
+
+ /*
+ * Uhhuh, somebody else created the same inode under
+ * us. Use the old inode instead of the one we just
+ * allocated.
+ */
+ __iget(old);
+ spin_unlock(&inode_lock);
+ destroy_inode(inode);
+ inode = old;
+ wait_on_inode(inode);
+ }
+ return inode;
+
+set_failed:
+ spin_unlock(&inode_lock);
+ destroy_inode(inode);
+ return NULL;
+}
+
+/*
+ * get_new_inode_fast is the fast path version of get_new_inode, see the
+ * comment at iget_locked for details.
+ */
+static struct inode * get_new_inode_fast(struct super_block *sb, struct hlist_head *head, unsigned long ino)
+{
+ struct inode * inode;
+
+ inode = alloc_inode(sb);
+ if (inode) {
+ struct inode * old;
+
+ spin_lock(&inode_lock);
+ /* We released the lock, so.. */
+ old = find_inode_fast(sb, head, ino);
+ if (!old) {
+ inode->i_ino = ino;
+ inodes_stat.nr_inodes++;
+ list_add(&inode->i_list, &inode_in_use);
+ hlist_add_head(&inode->i_hash, head);
+ inode->i_state = I_LOCK|I_NEW;
+ spin_unlock(&inode_lock);
+
+ /* Return the locked inode with I_NEW set, the
+ * caller is responsible for filling in the contents
+ */
+ return inode;
+ }
+
+ /*
+ * Uhhuh, somebody else created the same inode under
+ * us. Use the old inode instead of the one we just
+ * allocated.
+ */
+ __iget(old);
+ spin_unlock(&inode_lock);
+ destroy_inode(inode);
+ inode = old;
+ wait_on_inode(inode);
+ }
+ return inode;
+}
+
+static inline unsigned long hash(struct super_block *sb, unsigned long hashval)
+{
+ unsigned long tmp = hashval + ((unsigned long) sb / L1_CACHE_BYTES);
+ tmp = tmp + (tmp >> I_HASHBITS);
+ return tmp & I_HASHMASK;
+}
+
+/* Yeah, I know about quadratic hash. Maybe, later. */
+
+/**
+ * iunique - get a unique inode number
+ * @sb: superblock
+ * @max_reserved: highest reserved inode number
+ *
+ * Obtain an inode number that is unique on the system for a given
+ * superblock. This is used by file systems that have no natural
+ * permanent inode numbering system. An inode number is returned that
+ * is higher than the reserved limit but unique.
+ *
+ * BUGS:
+ * With a large number of inodes live on the file system this function
+ * currently becomes quite slow.
+ */
+
+ino_t iunique(struct super_block *sb, ino_t max_reserved)
+{
+ static ino_t counter = 0;
+ struct inode *inode;
+ struct hlist_head * head;
+ ino_t res;
+ spin_lock(&inode_lock);
+retry:
+ if (counter > max_reserved) {
+ head = inode_hashtable + hash(sb,counter);
+ res = counter++;
+ inode = find_inode_fast(sb, head, res);
+ if (!inode) {
+ spin_unlock(&inode_lock);
+ return res;
+ }
+ } else {
+ counter = max_reserved + 1;
+ }
+ goto retry;
+
+}
+
+struct inode *igrab(struct inode *inode)
+{
+ spin_lock(&inode_lock);
+ if (!(inode->i_state & I_FREEING))
+ __iget(inode);
+ else
+ /*
+		 * Handle the case where s_op->clear_inode has not been
+ * called yet, and somebody is calling igrab
+ * while the inode is getting freed.
+ */
+ inode = NULL;
+ spin_unlock(&inode_lock);
+ return inode;
+}
+
+/**
+ * ifind - internal function, you want ilookup5() or iget5().
+ * @sb: super block of file system to search
+ * @hashval: hash value (usually inode number) to search for
+ * @test: callback used for comparisons between inodes
+ * @data: opaque data pointer to pass to @test
+ *
+ * ifind() searches for the inode specified by @hashval and @data in the inode
+ * cache. This is a generalized version of ifind_fast() for file systems where
+ * the inode number is not sufficient for unique identification of an inode.
+ *
+ * If the inode is in the cache, the inode is returned with an incremented
+ * reference count.
+ *
+ * Otherwise NULL is returned.
+ *
+ * Note, @test is called with the inode_lock held, so can't sleep.
+ */
+static inline struct inode *ifind(struct super_block *sb,
+ struct hlist_head *head, int (*test)(struct inode *, void *),
+ void *data)
+{
+ struct inode *inode;
+
+ spin_lock(&inode_lock);
+ inode = find_inode(sb, head, test, data);
+ if (inode) {
+ __iget(inode);
+ spin_unlock(&inode_lock);
+ wait_on_inode(inode);
+ return inode;
+ }
+ spin_unlock(&inode_lock);
+ return NULL;
+}
+
+/**
+ * ifind_fast - internal function, you want ilookup() or iget().
+ * @sb: super block of file system to search
+ * @ino: inode number to search for
+ *
+ * ifind_fast() searches for the inode @ino in the inode cache. This is for
+ * file systems where the inode number is sufficient for unique identification
+ * of an inode.
+ *
+ * If the inode is in the cache, the inode is returned with an incremented
+ * reference count.
+ *
+ * Otherwise NULL is returned.
+ */
+static inline struct inode *ifind_fast(struct super_block *sb,
+ struct hlist_head *head, unsigned long ino)
+{
+ struct inode *inode;
+
+ spin_lock(&inode_lock);
+ inode = find_inode_fast(sb, head, ino);
+ if (inode) {
+ __iget(inode);
+ spin_unlock(&inode_lock);
+ wait_on_inode(inode);
+ return inode;
+ }
+ spin_unlock(&inode_lock);
+ return NULL;
+}
+
+/**
+ * ilookup5 - search for an inode in the inode cache
+ * @sb: super block of file system to search
+ * @hashval: hash value (usually inode number) to search for
+ * @test: callback used for comparisons between inodes
+ * @data: opaque data pointer to pass to @test
+ *
+ * ilookup5() uses ifind() to search for the inode specified by @hashval and
+ * @data in the inode cache. This is a generalized version of ilookup() for
+ * file systems where the inode number is not sufficient for unique
+ * identification of an inode.
+ *
+ * If the inode is in the cache, the inode is returned with an incremented
+ * reference count.
+ *
+ * Otherwise NULL is returned.
+ *
+ * Note, @test is called with the inode_lock held, so can't sleep.
+ */
+struct inode *ilookup5(struct super_block *sb, unsigned long hashval,
+ int (*test)(struct inode *, void *), void *data)
+{
+ struct hlist_head *head = inode_hashtable + hash(sb, hashval);
+
+ return ifind(sb, head, test, data);
+}
+EXPORT_SYMBOL(ilookup5);
+
+/**
+ * ilookup - search for an inode in the inode cache
+ * @sb: super block of file system to search
+ * @ino: inode number to search for
+ *
+ * ilookup() uses ifind_fast() to search for the inode @ino in the inode cache.
+ * This is for file systems where the inode number is sufficient for unique
+ * identification of an inode.
+ *
+ * If the inode is in the cache, the inode is returned with an incremented
+ * reference count.
+ *
+ * Otherwise NULL is returned.
+ */
+struct inode *ilookup(struct super_block *sb, unsigned long ino)
+{
+ struct hlist_head *head = inode_hashtable + hash(sb, ino);
+
+ return ifind_fast(sb, head, ino);
+}
+EXPORT_SYMBOL(ilookup);
+
+/**
+ * iget5_locked - obtain an inode from a mounted file system
+ * @sb: super block of file system
+ * @hashval: hash value (usually inode number) to get
+ * @test: callback used for comparisons between inodes
+ * @set: callback used to initialize a new struct inode
+ * @data: opaque data pointer to pass to @test and @set
+ *
+ * This is iget() without the read_inode() portion of get_new_inode().
+ *
+ * iget5_locked() uses ifind() to search for the inode specified by @hashval
+ * and @data in the inode cache and if present it is returned with an increased
+ * reference count. This is a generalized version of iget_locked() for file
+ * systems where the inode number is not sufficient for unique identification
+ * of an inode.
+ *
+ * If the inode is not in cache, get_new_inode() is called to allocate a new
+ * inode and this is returned locked, hashed, and with the I_NEW flag set. The
+ * file system gets to fill it in before unlocking it via unlock_new_inode().
+ *
+ * Note both @test and @set are called with the inode_lock held, so can't sleep.
+ */
+struct inode *iget5_locked(struct super_block *sb, unsigned long hashval,
+ int (*test)(struct inode *, void *),
+ int (*set)(struct inode *, void *), void *data)
+{
+ struct hlist_head *head = inode_hashtable + hash(sb, hashval);
+ struct inode *inode;
+
+ inode = ifind(sb, head, test, data);
+ if (inode)
+ return inode;
+ /*
+ * get_new_inode() will do the right thing, re-trying the search
+ * in case it had to block at any point.
+ */
+ return get_new_inode(sb, head, test, set, data);
+}
+EXPORT_SYMBOL(iget5_locked);
+
+/**
+ * iget_locked - obtain an inode from a mounted file system
+ * @sb: super block of file system
+ * @ino: inode number to get
+ *
+ * This is iget() without the read_inode() portion of get_new_inode_fast().
+ *
+ * iget_locked() uses ifind_fast() to search for the inode specified by @ino in
+ * the inode cache and if present it is returned with an increased reference
+ * count. This is for file systems where the inode number is sufficient for
+ * unique identification of an inode.
+ *
+ * If the inode is not in cache, get_new_inode_fast() is called to allocate a
+ * new inode and this is returned locked, hashed, and with the I_NEW flag set.
+ * The file system gets to fill it in before unlocking it via
+ * unlock_new_inode().
+ */
+struct inode *iget_locked(struct super_block *sb, unsigned long ino)
+{
+ struct hlist_head *head = inode_hashtable + hash(sb, ino);
+ struct inode *inode;
+
+ inode = ifind_fast(sb, head, ino);
+ if (inode)
+ return inode;
+ /*
+ * get_new_inode_fast() will do the right thing, re-trying the search
+ * in case it had to block at any point.
+ */
+ return get_new_inode_fast(sb, head, ino);
+}
+EXPORT_SYMBOL(iget_locked);
+
+/**
+ * __insert_inode_hash - hash an inode
+ * @inode: unhashed inode
+ * @hashval: unsigned long value used to locate this object in the
+ * inode_hashtable.
+ *
+ * Add an inode to the inode hash for this superblock. If the inode
+ * has no superblock it is added to a separate anonymous chain.
+ */
+
+void __insert_inode_hash(struct inode *inode, unsigned long hashval)
+{
+ struct hlist_head *head = &anon_hash_chain;
+ if (inode->i_sb)
+ head = inode_hashtable + hash(inode->i_sb, hashval);
+ spin_lock(&inode_lock);
+ hlist_add_head(&inode->i_hash, head);
+ spin_unlock(&inode_lock);
+}
+
+/**
+ * remove_inode_hash - remove an inode from the hash
+ * @inode: inode to unhash
+ *
+ * Remove an inode from the superblock or anonymous hash.
+ */
+
+void remove_inode_hash(struct inode *inode)
+{
+ spin_lock(&inode_lock);
+ hlist_del_init(&inode->i_hash);
+ spin_unlock(&inode_lock);
+}
+
+void generic_delete_inode(struct inode *inode)
+{
+ struct super_operations *op = inode->i_sb->s_op;
+
+<<<---hlist_del_init|||list_del_init===--->>> list_del_init(&inode->i_list);
+ inode->i_state|=I_FREEING;
+ inodes_stat.nr_inodes--;
+ spin_unlock(&inode_lock);
+
+ if (inode->i_data.nrpages)
+ truncate_inode_pages(&inode->i_data, 0);
+
+ security_inode_delete(inode);
+
+ if (op->delete_inode) {
+ void (*delete)(struct inode *) = op->delete_inode;
+ if (!is_bad_inode(inode))
+ DQUOT_INIT(inode);
+ /* s_op->delete_inode internally recalls clear_inode() */
+ delete(inode);
+ } else
+ clear_inode(inode);
+ spin_lock(&inode_lock);
+ list_del_init(&inode->i_hash);
+ spin_unlock(&inode_lock);
+ wake_up_inode(inode);
+ if (inode->i_state != I_CLEAR)
+ BUG();
+ destroy_inode(inode);
+}
+EXPORT_SYMBOL(generic_delete_inode);
+
+static void generic_forget_inode(struct inode *inode)
+{
+ struct super_block *sb = inode->i_sb;
+
+ if (!hlist_unhashed(&inode->i_hash)) {
+ if (!(inode->i_state & (I_DIRTY|I_LOCK))) {
+ list_del(&inode->i_list);
+ list_add(&inode->i_list, &inode_unused);
+ }
+ inodes_stat.nr_unused++;
+ spin_unlock(&inode_lock);
+ if (!sb || (sb->s_flags & MS_ACTIVE))
+ return;
+ write_inode_now(inode, 1);
+ spin_lock(&inode_lock);
+ inodes_stat.nr_unused--;
+ hlist_del_init(&inode->i_hash);
+ }
+ list_del_init(&inode->i_list);
+ inode->i_state|=I_FREEING;
+ inodes_stat.nr_inodes--;
+ spin_unlock(&inode_lock);
+ if (inode->i_data.nrpages)
+ truncate_inode_pages(&inode->i_data, 0);
+ clear_inode(inode);
+ destroy_inode(inode);
+}
+
+/*
+ * Normal UNIX filesystem behaviour: delete the
+ * inode when the usage count drops to zero, and
+ * i_nlink is zero.
+ */
+static void generic_drop_inode(struct inode *inode)
+{
+ if (!inode->i_nlink)
+ generic_delete_inode(inode);
+ else
+ generic_forget_inode(inode);
+}
+
+/*
+ * Called when we're dropping the last reference
+ * to an inode.
+ *
+ * Call the FS "drop()" function, defaulting to
+ * the legacy UNIX filesystem behaviour..
+ *
+ * NOTE! NOTE! NOTE! We're called with the inode lock
+ * held, and the drop function is supposed to release
+ * the lock!
+ */
+static inline void iput_final(struct inode *inode)
+{
+ struct super_operations *op = inode->i_sb->s_op;
+ void (*drop)(struct inode *) = generic_drop_inode;
+
+ if (op && op->drop_inode)
+ drop = op->drop_inode;
+ drop(inode);
+}
+
+/**
+ * iput - put an inode
+ * @inode: inode to put
+ *
+ * Puts an inode, dropping its usage count. If the inode use count hits
+ * zero the inode is also then freed and may be destroyed.
+ */
+
+void iput(struct inode *inode)
+{
+ if (inode) {
+ struct super_operations *op = inode->i_sb->s_op;
+
+ if (inode->i_state == I_CLEAR)
+ BUG();
+
+ if (op && op->put_inode)
+ op->put_inode(inode);
+
+ if (atomic_dec_and_lock(&inode->i_count, &inode_lock))
+ iput_final(inode);
+ }
+}
+
+/**
+ * bmap - find a block number in a file
+ * @inode: inode of file
+ * @block: block to find
+ *
+ * Returns the block number on the device holding the inode that
+ * is the disk block number for the block of the file requested.
+ * That is, asked for block 4 of inode 1 the function will return the
+ * disk block relative to the disk start that holds that block of the
+ * file.
+ */
+
+sector_t bmap(struct inode * inode, sector_t block)
+{
+ sector_t res = 0;
+ if (inode->i_mapping->a_ops->bmap)
+ res = inode->i_mapping->a_ops->bmap(inode->i_mapping, block);
+ return res;
+}
+
+/*
+ * Return true if the filesystem which backs this inode considers the two
+ * passed timespecs to be sufficiently different to warrant flushing the
+ * altered time out to disk.
+ */
+static int inode_times_differ(struct inode *inode,
+ struct timespec *old, struct timespec *new)
+{
+ if (IS_ONE_SECOND(inode))
+ return old->tv_sec != new->tv_sec;
+ return !timespec_equal(old, new);
+}
+
+/**
+ * update_atime - update the access time
+ * @inode: inode accessed
+ *
+ * Update the accessed time on an inode and mark it for writeback.
+ * This function automatically handles read only file systems and media,
+ * as well as the "noatime" flag and inode specific "noatime" markers.
+ */
+
+void update_atime(struct inode *inode)
+{
+ struct timespec now;
+
+ if (IS_NOATIME(inode))
+ return;
+ if (IS_NODIRATIME(inode) && S_ISDIR(inode->i_mode))
+ return;
+ if (IS_RDONLY(inode))
+ return;
+
+ now = current_kernel_time();
+ if (inode_times_differ(inode, &inode->i_atime, &now)) {
+ inode->i_atime = now;
+ mark_inode_dirty_sync(inode);
+ } else {
+ if (!timespec_equal(&inode->i_atime, &now))
+ inode->i_atime = now;
+ }
+}
+
+/**
+ * inode_update_time - update mtime and ctime time
+ * @inode: inode accessed
+ * @ctime_too: update ctime too
+ *
+ * Update the mtime time on an inode and mark it for writeback.
+ * When ctime_too is specified update the ctime too.
+ */
+
+void inode_update_time(struct inode *inode, int ctime_too)
+{
+ struct timespec now = current_kernel_time();
+ int sync_it = 0;
+
+ if (inode_times_differ(inode, &inode->i_mtime, &now))
+ sync_it = 1;
+ inode->i_mtime = now;
+
+ if (ctime_too) {
+ if (inode_times_differ(inode, &inode->i_ctime, &now))
+ sync_it = 1;
+ inode->i_ctime = now;
+ }
+ if (sync_it)
+ mark_inode_dirty_sync(inode);
+}
+EXPORT_SYMBOL(inode_update_time);
+
+int inode_needs_sync(struct inode *inode)
+{
+ if (IS_SYNC(inode))
+ return 1;
+ if (S_ISDIR(inode->i_mode) && IS_DIRSYNC(inode))
+ return 1;
+ return 0;
+}
+EXPORT_SYMBOL(inode_needs_sync);
+
+/*
+ * Quota functions that want to walk the inode lists..
+ */
+#ifdef CONFIG_QUOTA
+
+/* Functions back in dquot.c */
+void put_dquot_list(struct list_head *);
+int remove_inode_dquot_ref(struct inode *, int, struct list_head *);
+
+void remove_dquot_ref(struct super_block *sb, int type)
+{
+ struct inode *inode;
+ struct list_head *act_head;
+ LIST_HEAD(tofree_head);
+
+ if (!sb->dq_op)
+ return; /* nothing to do */
+ spin_lock(&inode_lock); /* This lock is for inodes code */
+ /* We don't have to lock against quota code - test IS_QUOTAINIT is just for speedup... */
+
+ list_for_each(act_head, &inode_in_use) {
+ inode = list_entry(act_head, struct inode, i_list);
+ if (inode->i_sb == sb && IS_QUOTAINIT(inode))
+ remove_inode_dquot_ref(inode, type, &tofree_head);
+ }
+ list_for_each(act_head, &inode_unused) {
+ inode = list_entry(act_head, struct inode, i_list);
+ if (inode->i_sb == sb && IS_QUOTAINIT(inode))
+ remove_inode_dquot_ref(inode, type, &tofree_head);
+ }
+ list_for_each(act_head, &sb->s_dirty) {
+ inode = list_entry(act_head, struct inode, i_list);
+ if (IS_QUOTAINIT(inode))
+ remove_inode_dquot_ref(inode, type, &tofree_head);
+ }
+ list_for_each(act_head, &sb->s_io) {
+ inode = list_entry(act_head, struct inode, i_list);
+ if (IS_QUOTAINIT(inode))
+ remove_inode_dquot_ref(inode, type, &tofree_head);
+ }
+ spin_unlock(&inode_lock);
+
+ put_dquot_list(&tofree_head);
+}
+
+#endif
+
+/*
+ * Hashed waitqueues for wait_on_inode(). The table is pretty small - the
+ * kernel doesn't lock many inodes at the same time.
+ */
+#define I_WAIT_TABLE_ORDER 3
+static struct i_wait_queue_head {
+ wait_queue_head_t wqh;
+} ____cacheline_aligned_in_smp i_wait_queue_heads[1<<I_WAIT_TABLE_ORDER];
+
+/*
+ * Return the address of the waitqueue_head to be used for this inode
+ */
+static wait_queue_head_t *i_waitq_head(struct inode *inode)
+{
+ return &i_wait_queue_heads[hash_ptr(inode, I_WAIT_TABLE_ORDER)].wqh;
+}
+
+void __wait_on_inode(struct inode *inode)
+{
+ DECLARE_WAITQUEUE(wait, current);
+ wait_queue_head_t *wq = i_waitq_head(inode);
+
+ add_wait_queue(wq, &wait);
+repeat:
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ if (inode->i_state & I_LOCK) {
+ schedule();
+ goto repeat;
+ }
+ remove_wait_queue(wq, &wait);
+ __set_current_state(TASK_RUNNING);
+}
+
+void __wait_on_freeing_inode(struct inode *inode)
+{
+ DECLARE_WAITQUEUE(wait, current);
+ wait_queue_head_t *wq = i_waitq_head(inode);
+
+ add_wait_queue(wq, &wait);
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ spin_unlock(&inode_lock);
+ schedule();
+ remove_wait_queue(wq, &wait);
+ current->state = TASK_RUNNING;
+ spin_lock(&inode_lock);
+}
+
+
+void wake_up_inode(struct inode *inode)
+{
+ wait_queue_head_t *wq = i_waitq_head(inode);
+
+ /*
+ * Prevent speculative execution through spin_unlock(&inode_lock);
+ */
+ smp_mb();
+ if (waitqueue_active(wq))
+ wake_up_all(wq);
+}
+
+/*
+ * Initialize the waitqueues and inode hash table.
+ */
+void __init inode_init(unsigned long mempages)
+{
+ struct hlist_head *head;
+ unsigned long order;
+ unsigned int nr_hash;
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(i_wait_queue_heads); i++)
+ init_waitqueue_head(&i_wait_queue_heads[i].wqh);
+
+ mempages >>= (14 - PAGE_SHIFT);
+ mempages *= sizeof(struct list_head);
+ for (order = 0; ((1UL << order) << PAGE_SHIFT) < mempages; order++)
+ ;
+
+ do {
+ unsigned long tmp;
+
+ nr_hash = (1UL << order) * PAGE_SIZE /
+ sizeof(struct hlist_head);
+ i_hash_mask = (nr_hash - 1);
+
+ tmp = nr_hash;
+ i_hash_shift = 0;
+ while ((tmp >>= 1UL) != 0UL)
+ i_hash_shift++;
+
+ inode_hashtable = (struct hlist_head *)
+ __get_free_pages(GFP_ATOMIC, order);
+ } while (inode_hashtable == NULL && --order >= 0);
+
+ printk("Inode-cache hash table entries: %d (order: %ld, %ld bytes)\n",
+ nr_hash, order, (PAGE_SIZE << order));
+
+ if (!inode_hashtable)
+ panic("Failed to allocate inode hash table\n");
+
+ head = inode_hashtable;
+ i = nr_hash;
+ do {
+ INIT_HLIST_HEAD(head);
+ head++;
+ i--;
+ } while (i);
+
+ /* inode slab cache */
+ inode_cachep = kmem_cache_create("inode_cache", sizeof(struct inode),
+ 0, SLAB_HWCACHE_ALIGN, init_once,
+ NULL);
+ if (!inode_cachep)
+ panic("cannot create inode slab cache");
+
+ set_shrinker(DEFAULT_SEEKS, shrink_icache_memory);
+}
+
+void init_special_inode(struct inode *inode, umode_t mode, dev_t rdev)
+{
+ inode->i_mode = mode;
+ if (S_ISCHR(mode)) {
+ inode->i_fop = &def_chr_fops;
+ inode->i_rdev = to_kdev_t(rdev);
+ } else if (S_ISBLK(mode)) {
+ inode->i_fop = &def_blk_fops;
+ inode->i_rdev = to_kdev_t(rdev);
+ } else if (S_ISFIFO(mode))
+ inode->i_fop = &def_fifo_fops;
+ else if (S_ISSOCK(mode))
+ inode->i_fop = &bad_sock_fops;
+ else
+ printk(KERN_DEBUG "init_special_inode: bogus i_mode (%o)\n",
+ mode);
+}
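
The hash() helper and the table-sizing loop in inode_init() above decide which bucket an inode lands in. As a rough standalone illustration only (not part of this patch or of the wiggle test data), the following sketch reproduces the same folding arithmetic with assumed constants — L1_CACHE_BYTES, I_HASHBITS and the fake superblock address are all invented for the example:

#include <stdio.h>

#define L1_CACHE_BYTES	64UL			/* assumed cache-line size */
#define I_HASHBITS	14			/* pretend inode_init() picked a 16384-bucket table */
#define I_HASHMASK	((1UL << I_HASHBITS) - 1)

/* Same shape as the kernel helper above: fold the superblock pointer in,
 * then fold the high bits down before masking to a bucket index. */
static unsigned long hash(unsigned long sb, unsigned long hashval)
{
	unsigned long tmp = hashval + (sb / L1_CACHE_BYTES);
	tmp = tmp + (tmp >> I_HASHBITS);
	return tmp & I_HASHMASK;
}

int main(void)
{
	unsigned long sb = 0xffff8800aabbcc00UL;	/* made-up superblock address */
	printf("ino 2 -> bucket %lu\n", hash(sb, 2));
	printf("ino 3 -> bucket %lu\n", hash(sb, 3));
	return 0;
}

Consecutive inode numbers on the same superblock usually land in adjacent buckets; the scheme is deliberately simple, as the "quadratic hash. Maybe, later." comment above acknowledges.
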
diff --git a/tests/linux/inode-justrej/lmerge b/tests/linux/inode-justrej/lmerge
new file mode 100644
index 0000000..08b0a9b
--- /dev/null
+++ b/tests/linux/inode-justrej/lmerge
@@ -0,0 +1,1360 @@
+/*
+ * linux/fs/inode.c
+ *
+ * (C) 1997 Linus Torvalds
+ */
+
+#include <linux/config.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/dcache.h>
+#include <linux/init.h>
+#include <linux/quotaops.h>
+#include <linux/slab.h>
+#include <linux/writeback.h>
+#include <linux/module.h>
+#include <linux/backing-dev.h>
+#include <linux/wait.h>
+#include <linux/hash.h>
+#include <linux/swap.h>
+#include <linux/security.h>
+
+/*
+ * This is needed for the following functions:
+ * - inode_has_buffers
+ * - invalidate_inode_buffers
+ * - fsync_bdev
+ * - invalidate_bdev
+ *
+ * FIXME: remove all knowledge of the buffer layer from this file
+ */
+#include <linux/buffer_head.h>
+
+/*
+ * New inode.c implementation.
+ *
+ * This implementation has the basic premise of trying
+ * to be extremely low-overhead and SMP-safe, yet be
+ * simple enough to be "obviously correct".
+ *
+ * Famous last words.
+ */
+
+/* inode dynamic allocation 1999, Andrea Arcangeli <andrea@suse.de> */
+
+/* #define INODE_PARANOIA 1 */
+/* #define INODE_DEBUG 1 */
+
+/*
+ * Inode lookup is no longer as critical as it used to be:
+ * most of the lookups are going to be through the dcache.
+ */
+#define I_HASHBITS i_hash_shift
+#define I_HASHMASK i_hash_mask
+
+static unsigned int i_hash_mask;
+static unsigned int i_hash_shift;
+
+/*
+ * Each inode can be on two separate lists. One is
+ * the hash list of the inode, used for lookups. The
+ * other linked list is the "type" list:
+ * "in_use" - valid inode, i_count > 0, i_nlink > 0
+ * "dirty" - as "in_use" but also dirty
+ * "unused" - valid inode, i_count = 0
+ *
+ * A "dirty" list is maintained for each super block,
+ * allowing for low-overhead inode sync() operations.
+ */
+
+LIST_HEAD(inode_in_use);
+LIST_HEAD(inode_unused);
+static struct hlist_head *inode_hashtable;
+static HLIST_HEAD(anon_hash_chain); /* for inodes with NULL i_sb */
+
+/*
+ * A simple spinlock to protect the list manipulations.
+ *
+ * NOTE! You also have to own the lock if you change
+ * the i_state of an inode while it is in use..
+ */
+spinlock_t inode_lock = SPIN_LOCK_UNLOCKED;
+
+/*
+ * iprune_sem provides exclusion between the kswapd or try_to_free_pages
+ * icache shrinking path, and the umount path. Without this exclusion,
+ * by the time prune_icache calls iput for the inode whose pages it has
+ * been invalidating, or by the time it calls clear_inode & destroy_inode
+ * from its final dispose_list, the struct super_block they refer to
+ * (for inode->i_sb->s_op) may already have been freed and reused.
+ */
+static DECLARE_MUTEX(iprune_sem);
+
+/*
+ * Statistics gathering..
+ */
+struct inodes_stat_t inodes_stat;
+
+static kmem_cache_t * inode_cachep;
+
+static struct inode *alloc_inode(struct super_block *sb)
+{
+ static struct address_space_operations empty_aops;
+ static struct inode_operations empty_iops;
+ static struct file_operations empty_fops;
+ struct inode *inode;
+
+ if (sb->s_op->alloc_inode)
+ inode = sb->s_op->alloc_inode(sb);
+ else
+ inode = (struct inode *) kmem_cache_alloc(inode_cachep, SLAB_KERNEL);
+
+ if (inode) {
+ struct address_space * const mapping = &inode->i_data;
+
+ inode->i_sb = sb;
+ inode->i_blkbits = sb->s_blocksize_bits;
+ inode->i_flags = 0;
+ atomic_set(&inode->i_count, 1);
+ inode->i_sock = 0;
+ inode->i_op = &empty_iops;
+ inode->i_fop = &empty_fops;
+ inode->i_nlink = 1;
+ atomic_set(&inode->i_writecount, 0);
+ inode->i_size = 0;
+ inode->i_blocks = 0;
+ inode->i_bytes = 0;
+ inode->i_generation = 0;
+ memset(&inode->i_dquot, 0, sizeof(inode->i_dquot));
+ inode->i_pipe = NULL;
+ inode->i_bdev = NULL;
+ inode->i_rdev = to_kdev_t(0);
+ inode->i_security = NULL;
+ if (security_inode_alloc(inode)) {
+ if (inode->i_sb->s_op->destroy_inode)
+ inode->i_sb->s_op->destroy_inode(inode);
+ else
+ kmem_cache_free(inode_cachep, (inode));
+ return NULL;
+ }
+
+ mapping->a_ops = &empty_aops;
+ mapping->host = inode;
+ mapping->gfp_mask = GFP_HIGHUSER;
+ mapping->dirtied_when = 0;
+ mapping->assoc_mapping = NULL;
+ mapping->backing_dev_info = &default_backing_dev_info;
+ if (sb->s_bdev)
+ mapping->backing_dev_info = sb->s_bdev->bd_inode->i_mapping->backing_dev_info;
+ memset(&inode->u, 0, sizeof(inode->u));
+ inode->i_mapping = mapping;
+ }
+ return inode;
+}
+
+void destroy_inode(struct inode *inode)
+{
+ if (inode_has_buffers(inode))
+ BUG();
+ security_inode_free(inode);
+ if (inode->i_sb->s_op->destroy_inode)
+ inode->i_sb->s_op->destroy_inode(inode);
+ else
+ kmem_cache_free(inode_cachep, (inode));
+}
+
+
+/*
+ * These are initializations that only need to be done
+ * once, because the fields are idempotent across use
+ * of the inode, so let the slab be aware of that.
+ */
+void inode_init_once(struct inode *inode)
+{
+ memset(inode, 0, sizeof(*inode));
+ INIT_HLIST_NODE(&inode->i_hash);
+ INIT_LIST_HEAD(&inode->i_data.clean_pages);
+ INIT_LIST_HEAD(&inode->i_data.dirty_pages);
+ INIT_LIST_HEAD(&inode->i_data.locked_pages);
+ INIT_LIST_HEAD(&inode->i_data.io_pages);
+ INIT_LIST_HEAD(&inode->i_dentry);
+ INIT_LIST_HEAD(&inode->i_devices);
+ sema_init(&inode->i_sem, 1);
+ INIT_RADIX_TREE(&inode->i_data.page_tree, GFP_ATOMIC);
+ rwlock_init(&inode->i_data.page_lock);
+ init_MUTEX(&inode->i_data.i_shared_sem);
+ INIT_LIST_HEAD(&inode->i_data.private_list);
+ spin_lock_init(&inode->i_data.private_lock);
+ INIT_LIST_HEAD(&inode->i_data.i_mmap);
+ INIT_LIST_HEAD(&inode->i_data.i_mmap_shared);
+ spin_lock_init(&inode->i_lock);
+}
+
+static void init_once(void * foo, kmem_cache_t * cachep, unsigned long flags)
+{
+ struct inode * inode = (struct inode *) foo;
+
+ if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
+ SLAB_CTOR_CONSTRUCTOR)
+ inode_init_once(inode);
+}
+
+/*
+ * inode_lock must be held
+ */
+void __iget(struct inode * inode)
+{
+ if (atomic_read(&inode->i_count)) {
+ atomic_inc(&inode->i_count);
+ return;
+ }
+ atomic_inc(&inode->i_count);
+ if (!(inode->i_state & (I_DIRTY|I_LOCK))) {
+ list_del(&inode->i_list);
+ list_add(&inode->i_list, &inode_in_use);
+ }
+ inodes_stat.nr_unused--;
+}
+
+/**
+ * clear_inode - clear an inode
+ * @inode: inode to clear
+ *
+ * This is called by the filesystem to tell us
+ * that the inode is no longer useful. We just
+ * terminate it with extreme prejudice.
+ */
+
+void clear_inode(struct inode *inode)
+{
+ invalidate_inode_buffers(inode);
+
+ if (inode->i_data.nrpages)
+ BUG();
+ if (!(inode->i_state & I_FREEING))
+ BUG();
+ if (inode->i_state & I_CLEAR)
+ BUG();
+ wait_on_inode(inode);
+ DQUOT_DROP(inode);
+ if (inode->i_sb && inode->i_sb->s_op->clear_inode)
+ inode->i_sb->s_op->clear_inode(inode);
+ if (inode->i_bdev)
+ bd_forget(inode);
+ inode->i_state = I_CLEAR;
+}
+
+/*
+ * Dispose-list gets a local list with local inodes in it, so it doesn't
+ * need to worry about list corruption and SMP locks.
+ */
+static void dispose_list(struct list_head *head)
+{
+ int nr_disposed = 0;
+
+ while (!list_empty(head)) {
+ struct inode *inode;
+
+ inode = list_entry(head->next, struct inode, i_list);
+ list_del(&inode->i_list);
+
+ if (inode->i_data.nrpages)
+ truncate_inode_pages(&inode->i_data, 0);
+ clear_inode(inode);
+ destroy_inode(inode);
+ nr_disposed++;
+ }
+ spin_lock(&inode_lock);
+ inodes_stat.nr_inodes -= nr_disposed;
+ spin_unlock(&inode_lock);
+}
+
+/*
+ * Invalidate all inodes for a device.
+ */
+static int invalidate_list(struct list_head *head, struct super_block * sb, struct list_head * dispose)
+{
+ struct list_head *next;
+ int busy = 0, count = 0;
+
+ next = head->next;
+ for (;;) {
+ struct list_head * tmp = next;
+ struct inode * inode;
+
+ next = next->next;
+ if (tmp == head)
+ break;
+ inode = list_entry(tmp, struct inode, i_list);
+ if (inode->i_sb != sb)
+ continue;
+ invalidate_inode_buffers(inode);
+ if (!atomic_read(&inode->i_count)) {
+ hlist_del_init(&inode->i_hash);
+ list_del(&inode->i_list);
+ list_add(&inode->i_list, dispose);
+ inode->i_state |= I_FREEING;
+ count++;
+ continue;
+ }
+ busy = 1;
+ }
+ /* only unused inodes may be cached with i_count zero */
+ inodes_stat.nr_unused -= count;
+ return busy;
+}
+
+/*
+ * This is a two-stage process. First we collect all
+ * offending inodes onto the throw-away list, and in
+ * the second stage we actually dispose of them. This
+ * is because we don't want to sleep while messing
+ * with the global lists..
+ */
+
+/**
+ * invalidate_inodes - discard the inodes on a device
+ * @sb: superblock
+ *
+ * Discard all of the inodes for a given superblock. If the discard
+ * fails because there are busy inodes then a non zero value is returned.
+ * If the discard is successful all the inodes have been discarded.
+ */
+
+int invalidate_inodes(struct super_block * sb)
+{
+ int busy;
+ LIST_HEAD(throw_away);
+
+ down(&iprune_sem);
+ spin_lock(&inode_lock);
+ busy = invalidate_list(&inode_in_use, sb, &throw_away);
+ busy |= invalidate_list(&inode_unused, sb, &throw_away);
+ busy |= invalidate_list(&sb->s_dirty, sb, &throw_away);
+ busy |= invalidate_list(&sb->s_io, sb, &throw_away);
+ spin_unlock(&inode_lock);
+
+ dispose_list(&throw_away);
+ up(&iprune_sem);
+
+ return busy;
+}
+
+int invalidate_device(kdev_t dev, int do_sync)
+{
+ struct super_block *sb;
+ struct block_device *bdev = bdget(kdev_t_to_nr(dev));
+ int res;
+
+ if (!bdev)
+ return 0;
+
+ if (do_sync)
+ fsync_bdev(bdev);
+
+ res = 0;
+ sb = get_super(bdev);
+ if (sb) {
+ /*
+ * no need to lock the super, get_super holds the
+ * read semaphore so the filesystem cannot go away
+ * under us (->put_super runs with the write lock
+ * hold).
+ */
+ shrink_dcache_sb(sb);
+ res = invalidate_inodes(sb);
+ drop_super(sb);
+ }
+ invalidate_bdev(bdev, 0);
+ bdput(bdev);
+ return res;
+}
+
+static int can_unuse(struct inode *inode)
+{
+ if (inode->i_state)
+ return 0;
+ if (inode_has_buffers(inode))
+ return 0;
+ if (atomic_read(&inode->i_count))
+ return 0;
+ if (inode->i_data.nrpages)
+ return 0;
+ return 1;
+}
+
+/*
+ * Scan `goal' inodes on the unused list for freeable ones. They are moved to
+ * a temporary list and then are freed outside inode_lock by dispose_list().
+ *
+ * Any inodes which are pinned purely because of attached pagecache have their
+ * pagecache removed. We expect the final iput() on that inode to add it to
+ * the front of the inode_unused list. So look for it there and if the
+ * inode is still freeable, proceed. The right inode is found 99.9% of the
+ * time in testing on a 4-way.
+ *
+ * If the inode has metadata buffers attached to mapping->private_list then
+ * try to remove them.
+ */
+static void prune_icache(int nr_to_scan)
+{
+ LIST_HEAD(freeable);
+ int nr_pruned = 0;
+ int nr_scanned;
+ unsigned long reap = 0;
+
+ down(&iprune_sem);
+ spin_lock(&inode_lock);
+ for (nr_scanned = 0; nr_scanned < nr_to_scan; nr_scanned++) {
+ struct inode *inode;
+
+ if (list_empty(&inode_unused))
+ break;
+
+ inode = list_entry(inode_unused.prev, struct inode, i_list);
+
+ if (inode->i_state || atomic_read(&inode->i_count)) {
+ list_move(&inode->i_list, &inode_unused);
+ continue;
+ }
+ if (inode_has_buffers(inode) || inode->i_data.nrpages) {
+ __iget(inode);
+ spin_unlock(&inode_lock);
+ if (remove_inode_buffers(inode))
+ reap += invalidate_inode_pages(&inode->i_data);
+ iput(inode);
+ spin_lock(&inode_lock);
+
+ if (inode != list_entry(inode_unused.next,
+ struct inode, i_list))
+ continue; /* wrong inode or list_empty */
+ if (!can_unuse(inode))
+ continue;
+ }
+ hlist_del_init(&inode->i_hash);
+ list_move(&inode->i_list, &freeable);
+ inode->i_state |= I_FREEING;
+ nr_pruned++;
+ }
+ inodes_stat.nr_unused -= nr_pruned;
+ spin_unlock(&inode_lock);
+
+ dispose_list(&freeable);
+ up(&iprune_sem);
+
+ if (current_is_kswapd)
+ mod_page_state(kswapd_inodesteal, reap);
+ else
+ mod_page_state(pginodesteal, reap);
+}
+
+/*
+ * shrink_icache_memory() will attempt to reclaim some unused inodes. Here,
+ * "unused" means that no dentries are referring to the inodes: the files are
+ * not open and the dcache references to those inodes have already been
+ * reclaimed.
+ *
+ * This function is passed the number of inodes to scan, and it returns the
+ * total number of remaining possibly-reclaimable inodes.
+ */
+static int shrink_icache_memory(int nr, unsigned int gfp_mask)
+{
+ if (nr) {
+ /*
+ * Nasty deadlock avoidance. We may hold various FS locks,
+ * and we don't want to recurse into the FS that called us
+ * in clear_inode() and friends..
+ */
+ if (gfp_mask & __GFP_FS)
+ prune_icache(nr);
+ }
+ return inodes_stat.nr_unused;
+}
+
+void __wait_on_freeing_inode(struct inode *inode);
+/*
+ * Called with the inode lock held.
+ * NOTE: we are not increasing the inode-refcount, you must call __iget()
+ * by hand after calling find_inode now! This simplifies iunique and won't
+ * add any additional branch in the common code.
+ */
+static struct inode * find_inode(struct super_block * sb, struct hlist_head *head, int (*test)(struct inode *, void *), void *data)
+{
+ struct hlist_node *node;
+ struct inode * inode = NULL;
+
+ hlist_for_each (node, head) {
+ prefetch(node->next);
+ inode = hlist_entry(node, struct inode, i_hash);
+ if (inode->i_sb != sb)
+ continue;
+ if (!test(inode, data))
+ continue;
+ if (inode->i_state & (I_FREEING|I_CLEAR)) {
+ __wait_on_freeing_inode(inode);
+ tmp = head;
+ continue;
+ }
+ break;
+ }
+ return node ? inode : NULL;
+}
+
+/*
+ * find_inode_fast is the fast path version of find_inode, see the comment at
+ * iget_locked for details.
+ */
+static struct inode * find_inode_fast(struct super_block * sb, struct hlist_head *head, unsigned long ino)
+{
+ struct hlist_node *node;
+ struct inode * inode = NULL;
+
+ hlist_for_each (node, head) {
+ prefetch(node->next);
+ inode = list_entry(node, struct inode, i_hash);
+ if (inode->i_ino != ino)
+ continue;
+ if (inode->i_sb != sb)
+ continue;
+ if (inode->i_state & (I_FREEING|I_CLEAR)) {
+ __wait_on_freeing_inode(inode);
+ tmp = head;
+ continue;
+ }
+ break;
+ }
+ return node ? inode : NULL;
+}
+
+/**
+ * new_inode - obtain an inode
+ * @sb: superblock
+ *
+ * Allocates a new inode for given superblock.
+ */
+
+struct inode *new_inode(struct super_block *sb)
+{
+ static unsigned long last_ino;
+ struct inode * inode;
+
+ spin_lock_prefetch(&inode_lock);
+
+ inode = alloc_inode(sb);
+ if (inode) {
+ spin_lock(&inode_lock);
+ inodes_stat.nr_inodes++;
+ list_add(&inode->i_list, &inode_in_use);
+ inode->i_ino = ++last_ino;
+ inode->i_state = 0;
+ spin_unlock(&inode_lock);
+ }
+ return inode;
+}
+
+void unlock_new_inode(struct inode *inode)
+{
+ /*
+ * This is special! We do not need the spinlock
+ * when clearing I_LOCK, because we're guaranteed
+ * that nobody else tries to do anything about the
+ * state of the inode when it is locked, as we
+ * just created it (so there can be no old holders
+ * that haven't tested I_LOCK).
+ */
+ inode->i_state &= ~(I_LOCK|I_NEW);
+ wake_up_inode(inode);
+}
+EXPORT_SYMBOL(unlock_new_inode);
+
+/*
+ * This is called without the inode lock held.. Be careful.
+ *
+ * We no longer cache the sb_flags in i_flags - see fs.h
+ * -- rmk@arm.uk.linux.org
+ */
+static struct inode * get_new_inode(struct super_block *sb, struct hlist_head *head, int (*test)(struct inode *, void *), int (*set)(struct inode *, void *), void *data)
+{
+ struct inode * inode;
+
+ inode = alloc_inode(sb);
+ if (inode) {
+ struct inode * old;
+
+ spin_lock(&inode_lock);
+ /* We released the lock, so.. */
+ old = find_inode(sb, head, test, data);
+ if (!old) {
+ if (set(inode, data))
+ goto set_failed;
+
+ inodes_stat.nr_inodes++;
+ list_add(&inode->i_list, &inode_in_use);
+ hlist_add_head(&inode->i_hash, head);
+ inode->i_state = I_LOCK|I_NEW;
+ spin_unlock(&inode_lock);
+
+ /* Return the locked inode with I_NEW set, the
+ * caller is responsible for filling in the contents
+ */
+ return inode;
+ }
+
+ /*
+ * Uhhuh, somebody else created the same inode under
+ * us. Use the old inode instead of the one we just
+ * allocated.
+ */
+ __iget(old);
+ spin_unlock(&inode_lock);
+ destroy_inode(inode);
+ inode = old;
+ wait_on_inode(inode);
+ }
+ return inode;
+
+set_failed:
+ spin_unlock(&inode_lock);
+ destroy_inode(inode);
+ return NULL;
+}
+
+/*
+ * get_new_inode_fast is the fast path version of get_new_inode, see the
+ * comment at iget_locked for details.
+ */
+static struct inode * get_new_inode_fast(struct super_block *sb, struct hlist_head *head, unsigned long ino)
+{
+ struct inode * inode;
+
+ inode = alloc_inode(sb);
+ if (inode) {
+ struct inode * old;
+
+ spin_lock(&inode_lock);
+ /* We released the lock, so.. */
+ old = find_inode_fast(sb, head, ino);
+ if (!old) {
+ inode->i_ino = ino;
+ inodes_stat.nr_inodes++;
+ list_add(&inode->i_list, &inode_in_use);
+ hlist_add_head(&inode->i_hash, head);
+ inode->i_state = I_LOCK|I_NEW;
+ spin_unlock(&inode_lock);
+
+ /* Return the locked inode with I_NEW set, the
+ * caller is responsible for filling in the contents
+ */
+ return inode;
+ }
+
+ /*
+ * Uhhuh, somebody else created the same inode under
+ * us. Use the old inode instead of the one we just
+ * allocated.
+ */
+ __iget(old);
+ spin_unlock(&inode_lock);
+ destroy_inode(inode);
+ inode = old;
+ wait_on_inode(inode);
+ }
+ return inode;
+}
+
+static inline unsigned long hash(struct super_block *sb, unsigned long hashval)
+{
+ unsigned long tmp = hashval + ((unsigned long) sb / L1_CACHE_BYTES);
+ tmp = tmp + (tmp >> I_HASHBITS);
+ return tmp & I_HASHMASK;
+}
+
+/* Yeah, I know about quadratic hash. Maybe, later. */
+
+/**
+ * iunique - get a unique inode number
+ * @sb: superblock
+ * @max_reserved: highest reserved inode number
+ *
+ * Obtain an inode number that is unique on the system for a given
+ * superblock. This is used by file systems that have no natural
+ * permanent inode numbering system. An inode number is returned that
+ * is higher than the reserved limit but unique.
+ *
+ * BUGS:
+ * With a large number of inodes live on the file system this function
+ * currently becomes quite slow.
+ */
+
+ino_t iunique(struct super_block *sb, ino_t max_reserved)
+{
+ static ino_t counter = 0;
+ struct inode *inode;
+ struct hlist_head * head;
+ ino_t res;
+ spin_lock(&inode_lock);
+retry:
+ if (counter > max_reserved) {
+ head = inode_hashtable + hash(sb,counter);
+ res = counter++;
+ inode = find_inode_fast(sb, head, res);
+ if (!inode) {
+ spin_unlock(&inode_lock);
+ return res;
+ }
+ } else {
+ counter = max_reserved + 1;
+ }
+ goto retry;
+
+}
+
+struct inode *igrab(struct inode *inode)
+{
+ spin_lock(&inode_lock);
+ if (!(inode->i_state & I_FREEING))
+ __iget(inode);
+ else
+ /*
+		 * Handle the case where s_op->clear_inode has not been
+ * called yet, and somebody is calling igrab
+ * while the inode is getting freed.
+ */
+ inode = NULL;
+ spin_unlock(&inode_lock);
+ return inode;
+}
+
+/**
+ * ifind - internal function, you want ilookup5() or iget5().
+ * @sb: super block of file system to search
+ * @hashval: hash value (usually inode number) to search for
+ * @test: callback used for comparisons between inodes
+ * @data: opaque data pointer to pass to @test
+ *
+ * ifind() searches for the inode specified by @hashval and @data in the inode
+ * cache. This is a generalized version of ifind_fast() for file systems where
+ * the inode number is not sufficient for unique identification of an inode.
+ *
+ * If the inode is in the cache, the inode is returned with an incremented
+ * reference count.
+ *
+ * Otherwise NULL is returned.
+ *
+ * Note, @test is called with the inode_lock held, so can't sleep.
+ */
+static inline struct inode *ifind(struct super_block *sb,
+ struct hlist_head *head, int (*test)(struct inode *, void *),
+ void *data)
+{
+ struct inode *inode;
+
+ spin_lock(&inode_lock);
+ inode = find_inode(sb, head, test, data);
+ if (inode) {
+ __iget(inode);
+ spin_unlock(&inode_lock);
+ wait_on_inode(inode);
+ return inode;
+ }
+ spin_unlock(&inode_lock);
+ return NULL;
+}
+
+/**
+ * ifind_fast - internal function, you want ilookup() or iget().
+ * @sb: super block of file system to search
+ * @ino: inode number to search for
+ *
+ * ifind_fast() searches for the inode @ino in the inode cache. This is for
+ * file systems where the inode number is sufficient for unique identification
+ * of an inode.
+ *
+ * If the inode is in the cache, the inode is returned with an incremented
+ * reference count.
+ *
+ * Otherwise NULL is returned.
+ */
+static inline struct inode *ifind_fast(struct super_block *sb,
+ struct hlist_head *head, unsigned long ino)
+{
+ struct inode *inode;
+
+ spin_lock(&inode_lock);
+ inode = find_inode_fast(sb, head, ino);
+ if (inode) {
+ __iget(inode);
+ spin_unlock(&inode_lock);
+ wait_on_inode(inode);
+ return inode;
+ }
+ spin_unlock(&inode_lock);
+ return NULL;
+}
+
+/**
+ * ilookup5 - search for an inode in the inode cache
+ * @sb: super block of file system to search
+ * @hashval: hash value (usually inode number) to search for
+ * @test: callback used for comparisons between inodes
+ * @data: opaque data pointer to pass to @test
+ *
+ * ilookup5() uses ifind() to search for the inode specified by @hashval and
+ * @data in the inode cache. This is a generalized version of ilookup() for
+ * file systems where the inode number is not sufficient for unique
+ * identification of an inode.
+ *
+ * If the inode is in the cache, the inode is returned with an incremented
+ * reference count.
+ *
+ * Otherwise NULL is returned.
+ *
+ * Note, @test is called with the inode_lock held, so can't sleep.
+ */
+struct inode *ilookup5(struct super_block *sb, unsigned long hashval,
+ int (*test)(struct inode *, void *), void *data)
+{
+ struct hlist_head *head = inode_hashtable + hash(sb, hashval);
+
+ return ifind(sb, head, test, data);
+}
+EXPORT_SYMBOL(ilookup5);
+
+/**
+ * ilookup - search for an inode in the inode cache
+ * @sb: super block of file system to search
+ * @ino: inode number to search for
+ *
+ * ilookup() uses ifind_fast() to search for the inode @ino in the inode cache.
+ * This is for file systems where the inode number is sufficient for unique
+ * identification of an inode.
+ *
+ * If the inode is in the cache, the inode is returned with an incremented
+ * reference count.
+ *
+ * Otherwise NULL is returned.
+ */
+struct inode *ilookup(struct super_block *sb, unsigned long ino)
+{
+ struct hlist_head *head = inode_hashtable + hash(sb, ino);
+
+ return ifind_fast(sb, head, ino);
+}
+EXPORT_SYMBOL(ilookup);
+
+/**
+ * iget5_locked - obtain an inode from a mounted file system
+ * @sb: super block of file system
+ * @hashval: hash value (usually inode number) to get
+ * @test: callback used for comparisons between inodes
+ * @set: callback used to initialize a new struct inode
+ * @data: opaque data pointer to pass to @test and @set
+ *
+ * This is iget() without the read_inode() portion of get_new_inode().
+ *
+ * iget5_locked() uses ifind() to search for the inode specified by @hashval
+ * and @data in the inode cache and if present it is returned with an increased
+ * reference count. This is a generalized version of iget_locked() for file
+ * systems where the inode number is not sufficient for unique identification
+ * of an inode.
+ *
+ * If the inode is not in cache, get_new_inode() is called to allocate a new
+ * inode and this is returned locked, hashed, and with the I_NEW flag set. The
+ * file system gets to fill it in before unlocking it via unlock_new_inode().
+ *
+ * Note both @test and @set are called with the inode_lock held, so can't sleep.
+ */
+struct inode *iget5_locked(struct super_block *sb, unsigned long hashval,
+ int (*test)(struct inode *, void *),
+ int (*set)(struct inode *, void *), void *data)
+{
+ struct hlist_head *head = inode_hashtable + hash(sb, hashval);
+ struct inode *inode;
+
+ inode = ifind(sb, head, test, data);
+ if (inode)
+ return inode;
+ /*
+ * get_new_inode() will do the right thing, re-trying the search
+ * in case it had to block at any point.
+ */
+ return get_new_inode(sb, head, test, set, data);
+}
+EXPORT_SYMBOL(iget5_locked);
+
+/**
+ * iget_locked - obtain an inode from a mounted file system
+ * @sb: super block of file system
+ * @ino: inode number to get
+ *
+ * This is iget() without the read_inode() portion of get_new_inode_fast().
+ *
+ * iget_locked() uses ifind_fast() to search for the inode specified by @ino in
+ * the inode cache and if present it is returned with an increased reference
+ * count. This is for file systems where the inode number is sufficient for
+ * unique identification of an inode.
+ *
+ * If the inode is not in cache, get_new_inode_fast() is called to allocate a
+ * new inode and this is returned locked, hashed, and with the I_NEW flag set.
+ * The file system gets to fill it in before unlocking it via
+ * unlock_new_inode().
+ */
+struct inode *iget_locked(struct super_block *sb, unsigned long ino)
+{
+ struct hlist_head *head = inode_hashtable + hash(sb, ino);
+ struct inode *inode;
+
+ inode = ifind_fast(sb, head, ino);
+ if (inode)
+ return inode;
+ /*
+ * get_new_inode_fast() will do the right thing, re-trying the search
+ * in case it had to block at any point.
+ */
+ return get_new_inode_fast(sb, head, ino);
+}
+EXPORT_SYMBOL(iget_locked);
+
+/**
+ * __insert_inode_hash - hash an inode
+ * @inode: unhashed inode
+ * @hashval: unsigned long value used to locate this object in the
+ * inode_hashtable.
+ *
+ * Add an inode to the inode hash for this superblock. If the inode
+ * has no superblock it is added to a separate anonymous chain.
+ */
+
+void __insert_inode_hash(struct inode *inode, unsigned long hashval)
+{
+ struct hlist_head *head = &anon_hash_chain;
+ if (inode->i_sb)
+ head = inode_hashtable + hash(inode->i_sb, hashval);
+ spin_lock(&inode_lock);
+ hlist_add_head(&inode->i_hash, head);
+ spin_unlock(&inode_lock);
+}
+
+/**
+ * remove_inode_hash - remove an inode from the hash
+ * @inode: inode to unhash
+ *
+ * Remove an inode from the superblock or anonymous hash.
+ */
+
+void remove_inode_hash(struct inode *inode)
+{
+ spin_lock(&inode_lock);
+ hlist_del_init(&inode->i_hash);
+ spin_unlock(&inode_lock);
+}
+
+void generic_delete_inode(struct inode *inode)
+{
+ struct super_operations *op = inode->i_sb->s_op;
+<<<<<<<
+
+ hlist_del_init(&inode->i_hash);
+|||||||
+
+ list_del_init(&inode->i_hash);
+=======
+
+>>>>>>>
+ list_del_init(&inode->i_list);
+ inode->i_state|=I_FREEING;
+ inodes_stat.nr_inodes--;
+ spin_unlock(&inode_lock);
+
+ if (inode->i_data.nrpages)
+ truncate_inode_pages(&inode->i_data, 0);
+
+ security_inode_delete(inode);
+
+ if (op->delete_inode) {
+ void (*delete)(struct inode *) = op->delete_inode;
+ if (!is_bad_inode(inode))
+ DQUOT_INIT(inode);
+ /* s_op->delete_inode internally recalls clear_inode() */
+ delete(inode);
+ } else
+ clear_inode(inode);
+ spin_lock(&inode_lock);
+ list_del_init(&inode->i_hash);
+ spin_unlock(&inode_lock);
+ wake_up_inode(inode);
+ if (inode->i_state != I_CLEAR)
+ BUG();
+ destroy_inode(inode);
+}
+EXPORT_SYMBOL(generic_delete_inode);
+
+static void generic_forget_inode(struct inode *inode)
+{
+ struct super_block *sb = inode->i_sb;
+
+ if (!hlist_unhashed(&inode->i_hash)) {
+ if (!(inode->i_state & (I_DIRTY|I_LOCK))) {
+ list_del(&inode->i_list);
+ list_add(&inode->i_list, &inode_unused);
+ }
+ inodes_stat.nr_unused++;
+ spin_unlock(&inode_lock);
+ if (!sb || (sb->s_flags & MS_ACTIVE))
+ return;
+ write_inode_now(inode, 1);
+ spin_lock(&inode_lock);
+ inodes_stat.nr_unused--;
+ hlist_del_init(&inode->i_hash);
+ }
+ list_del_init(&inode->i_list);
+ inode->i_state|=I_FREEING;
+ inodes_stat.nr_inodes--;
+ spin_unlock(&inode_lock);
+ if (inode->i_data.nrpages)
+ truncate_inode_pages(&inode->i_data, 0);
+ clear_inode(inode);
+ destroy_inode(inode);
+}
+
+/*
+ * Normal UNIX filesystem behaviour: delete the
+ * inode when the usage count drops to zero, and
+ * i_nlink is zero.
+ */
+static void generic_drop_inode(struct inode *inode)
+{
+ if (!inode->i_nlink)
+ generic_delete_inode(inode);
+ else
+ generic_forget_inode(inode);
+}
+
+/*
+ * Called when we're dropping the last reference
+ * to an inode.
+ *
+ * Call the FS "drop()" function, defaulting to
+ * the legacy UNIX filesystem behaviour..
+ *
+ * NOTE! NOTE! NOTE! We're called with the inode lock
+ * held, and the drop function is supposed to release
+ * the lock!
+ */
+static inline void iput_final(struct inode *inode)
+{
+ struct super_operations *op = inode->i_sb->s_op;
+ void (*drop)(struct inode *) = generic_drop_inode;
+
+ if (op && op->drop_inode)
+ drop = op->drop_inode;
+ drop(inode);
+}
+
+/**
+ * iput - put an inode
+ * @inode: inode to put
+ *
+ * Puts an inode, dropping its usage count. If the inode use count hits
+ * zero the inode is also then freed and may be destroyed.
+ */
+
+void iput(struct inode *inode)
+{
+ if (inode) {
+ struct super_operations *op = inode->i_sb->s_op;
+
+ if (inode->i_state == I_CLEAR)
+ BUG();
+
+ if (op && op->put_inode)
+ op->put_inode(inode);
+
+ if (atomic_dec_and_lock(&inode->i_count, &inode_lock))
+ iput_final(inode);
+ }
+}
+
+/**
+ * bmap - find a block number in a file
+ * @inode: inode of file
+ * @block: block to find
+ *
+ * Returns the block number on the device holding the inode that
+ * is the disk block number for the block of the file requested.
+ * That is, asked for block 4 of inode 1 the function will return the
+ * disk block relative to the disk start that holds that block of the
+ * file.
+ */
+
+sector_t bmap(struct inode * inode, sector_t block)
+{
+ sector_t res = 0;
+ if (inode->i_mapping->a_ops->bmap)
+ res = inode->i_mapping->a_ops->bmap(inode->i_mapping, block);
+ return res;
+}
+
+/*
+ * Return true if the filesystem which backs this inode considers the two
+ * passed timespecs to be sufficiently different to warrant flushing the
+ * altered time out to disk.
+ */
+static int inode_times_differ(struct inode *inode,
+ struct timespec *old, struct timespec *new)
+{
+ if (IS_ONE_SECOND(inode))
+ return old->tv_sec != new->tv_sec;
+ return !timespec_equal(old, new);
+}
+
+/**
+ * update_atime - update the access time
+ * @inode: inode accessed
+ *
+ * Update the accessed time on an inode and mark it for writeback.
+ * This function automatically handles read only file systems and media,
+ * as well as the "noatime" flag and inode specific "noatime" markers.
+ */
+
+void update_atime(struct inode *inode)
+{
+ struct timespec now;
+
+ if (IS_NOATIME(inode))
+ return;
+ if (IS_NODIRATIME(inode) && S_ISDIR(inode->i_mode))
+ return;
+ if (IS_RDONLY(inode))
+ return;
+
+ now = current_kernel_time();
+ if (inode_times_differ(inode, &inode->i_atime, &now)) {
+ inode->i_atime = now;
+ mark_inode_dirty_sync(inode);
+ } else {
+ if (!timespec_equal(&inode->i_atime, &now))
+ inode->i_atime = now;
+ }
+}
+
+/**
+ * inode_update_time - update mtime and ctime time
+ * @inode: inode accessed
+ * @ctime_too: update ctime too
+ *
+ * Update the mtime time on an inode and mark it for writeback.
+ * When ctime_too is specified update the ctime too.
+ */
+
+void inode_update_time(struct inode *inode, int ctime_too)
+{
+ struct timespec now = current_kernel_time();
+ int sync_it = 0;
+
+ if (inode_times_differ(inode, &inode->i_mtime, &now))
+ sync_it = 1;
+ inode->i_mtime = now;
+
+ if (ctime_too) {
+ if (inode_times_differ(inode, &inode->i_ctime, &now))
+ sync_it = 1;
+ inode->i_ctime = now;
+ }
+ if (sync_it)
+ mark_inode_dirty_sync(inode);
+}
+EXPORT_SYMBOL(inode_update_time);
+
+int inode_needs_sync(struct inode *inode)
+{
+ if (IS_SYNC(inode))
+ return 1;
+ if (S_ISDIR(inode->i_mode) && IS_DIRSYNC(inode))
+ return 1;
+ return 0;
+}
+EXPORT_SYMBOL(inode_needs_sync);
+
+/*
+ * Quota functions that want to walk the inode lists..
+ */
+#ifdef CONFIG_QUOTA
+
+/* Functions back in dquot.c */
+void put_dquot_list(struct list_head *);
+int remove_inode_dquot_ref(struct inode *, int, struct list_head *);
+
+void remove_dquot_ref(struct super_block *sb, int type)
+{
+ struct inode *inode;
+ struct list_head *act_head;
+ LIST_HEAD(tofree_head);
+
+ if (!sb->dq_op)
+ return; /* nothing to do */
+ spin_lock(&inode_lock); /* This lock is for inodes code */
+ /* We don't have to lock against quota code - test IS_QUOTAINIT is just for speedup... */
+
+ list_for_each(act_head, &inode_in_use) {
+ inode = list_entry(act_head, struct inode, i_list);
+ if (inode->i_sb == sb && IS_QUOTAINIT(inode))
+ remove_inode_dquot_ref(inode, type, &tofree_head);
+ }
+ list_for_each(act_head, &inode_unused) {
+ inode = list_entry(act_head, struct inode, i_list);
+ if (inode->i_sb == sb && IS_QUOTAINIT(inode))
+ remove_inode_dquot_ref(inode, type, &tofree_head);
+ }
+ list_for_each(act_head, &sb->s_dirty) {
+ inode = list_entry(act_head, struct inode, i_list);
+ if (IS_QUOTAINIT(inode))
+ remove_inode_dquot_ref(inode, type, &tofree_head);
+ }
+ list_for_each(act_head, &sb->s_io) {
+ inode = list_entry(act_head, struct inode, i_list);
+ if (IS_QUOTAINIT(inode))
+ remove_inode_dquot_ref(inode, type, &tofree_head);
+ }
+ spin_unlock(&inode_lock);
+
+ put_dquot_list(&tofree_head);
+}
+
+#endif
+
+/*
+ * Hashed waitqueues for wait_on_inode(). The table is pretty small - the
+ * kernel doesn't lock many inodes at the same time.
+ */
+#define I_WAIT_TABLE_ORDER 3
+static struct i_wait_queue_head {
+ wait_queue_head_t wqh;
+} ____cacheline_aligned_in_smp i_wait_queue_heads[1<<I_WAIT_TABLE_ORDER];
+
+/*
+ * Return the address of the waitqueue_head to be used for this inode
+ */
+static wait_queue_head_t *i_waitq_head(struct inode *inode)
+{
+ return &i_wait_queue_heads[hash_ptr(inode, I_WAIT_TABLE_ORDER)].wqh;
+}
+
+void __wait_on_inode(struct inode *inode)
+{
+ DECLARE_WAITQUEUE(wait, current);
+ wait_queue_head_t *wq = i_waitq_head(inode);
+
+ add_wait_queue(wq, &wait);
+repeat:
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ if (inode->i_state & I_LOCK) {
+ schedule();
+ goto repeat;
+ }
+ remove_wait_queue(wq, &wait);
+ __set_current_state(TASK_RUNNING);
+}
+
+void __wait_on_freeing_inode(struct inode *inode)
+{
+ DECLARE_WAITQUEUE(wait, current);
+ wait_queue_head_t *wq = i_waitq_head(inode);
+
+ add_wait_queue(wq, &wait);
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ spin_unlock(&inode_lock);
+ schedule();
+ remove_wait_queue(wq, &wait);
+ current->state = TASK_RUNNING;
+ spin_lock(&inode_lock);
+}
+
+
+void wake_up_inode(struct inode *inode)
+{
+ wait_queue_head_t *wq = i_waitq_head(inode);
+
+ /*
+ * Prevent speculative execution through spin_unlock(&inode_lock);
+ */
+ smp_mb();
+ if (waitqueue_active(wq))
+ wake_up_all(wq);
+}
+
+/*
+ * Initialize the waitqueues and inode hash table.
+ */
+void __init inode_init(unsigned long mempages)
+{
+ struct hlist_head *head;
+ unsigned long order;
+ unsigned int nr_hash;
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(i_wait_queue_heads); i++)
+ init_waitqueue_head(&i_wait_queue_heads[i].wqh);
+
+ mempages >>= (14 - PAGE_SHIFT);
+ mempages *= sizeof(struct list_head);
+ for (order = 0; ((1UL << order) << PAGE_SHIFT) < mempages; order++)
+ ;
+
+ do {
+ unsigned long tmp;
+
+ nr_hash = (1UL << order) * PAGE_SIZE /
+ sizeof(struct hlist_head);
+ i_hash_mask = (nr_hash - 1);
+
+ tmp = nr_hash;
+ i_hash_shift = 0;
+ while ((tmp >>= 1UL) != 0UL)
+ i_hash_shift++;
+
+ inode_hashtable = (struct hlist_head *)
+ __get_free_pages(GFP_ATOMIC, order);
+ } while (inode_hashtable == NULL && --order >= 0);
+
+ printk("Inode-cache hash table entries: %d (order: %ld, %ld bytes)\n",
+ nr_hash, order, (PAGE_SIZE << order));
+
+ if (!inode_hashtable)
+ panic("Failed to allocate inode hash table\n");
+
+ head = inode_hashtable;
+ i = nr_hash;
+ do {
+ INIT_HLIST_HEAD(head);
+ head++;
+ i--;
+ } while (i);
+
+ /* inode slab cache */
+ inode_cachep = kmem_cache_create("inode_cache", sizeof(struct inode),
+ 0, SLAB_HWCACHE_ALIGN, init_once,
+ NULL);
+ if (!inode_cachep)
+ panic("cannot create inode slab cache");
+
+ set_shrinker(DEFAULT_SEEKS, shrink_icache_memory);
+}
+
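Worked example of the sizing loop above, under assumed parameters (512 MiB of
RAM, 4 KiB pages, 32-bit pointers, so sizeof(struct list_head) == 8 and
sizeof(struct hlist_head) == 4):

	mempages = 131072                       (512 MiB / 4 KiB)
	mempages >>= (14 - 12)             ->   32768
	mempages *= 8                      ->   262144 bytes of table wanted
	order: smallest with (1 << order) * 4096 >= 262144   ->   6   (64 pages)
	nr_hash = 64 * 4096 / 4            ->   65536 buckets
	i_hash_mask = 0xffff, i_hash_shift = 16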
+void init_special_inode(struct inode *inode, umode_t mode, dev_t rdev)
+{
+ inode->i_mode = mode;
+ if (S_ISCHR(mode)) {
+ inode->i_fop = &def_chr_fops;
+ inode->i_rdev = to_kdev_t(rdev);
+ } else if (S_ISBLK(mode)) {
+ inode->i_fop = &def_blk_fops;
+ inode->i_rdev = to_kdev_t(rdev);
+ } else if (S_ISFIFO(mode))
+ inode->i_fop = &def_fifo_fops;
+ else if (S_ISSOCK(mode))
+ inode->i_fop = &bad_sock_fops;
+ else
+ printk(KERN_DEBUG "init_special_inode: bogus i_mode (%o)\n",
+ mode);
+}
diff --git a/tests/linux/inode-justrej/merge b/tests/linux/inode-justrej/merge
new file mode 100644
index 0000000..685b14e
--- /dev/null
+++ b/tests/linux/inode-justrej/merge
@@ -0,0 +1,1358 @@
+/*
+ * linux/fs/inode.c
+ *
+ * (C) 1997 Linus Torvalds
+ */
+
+#include <linux/config.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/dcache.h>
+#include <linux/init.h>
+#include <linux/quotaops.h>
+#include <linux/slab.h>
+#include <linux/writeback.h>
+#include <linux/module.h>
+#include <linux/backing-dev.h>
+#include <linux/wait.h>
+#include <linux/hash.h>
+#include <linux/swap.h>
+#include <linux/security.h>
+
+/*
+ * This is needed for the following functions:
+ * - inode_has_buffers
+ * - invalidate_inode_buffers
+ * - fsync_bdev
+ * - invalidate_bdev
+ *
+ * FIXME: remove all knowledge of the buffer layer from this file
+ */
+#include <linux/buffer_head.h>
+
+/*
+ * New inode.c implementation.
+ *
+ * This implementation has the basic premise of trying
+ * to be extremely low-overhead and SMP-safe, yet be
+ * simple enough to be "obviously correct".
+ *
+ * Famous last words.
+ */
+
+/* inode dynamic allocation 1999, Andrea Arcangeli <andrea@suse.de> */
+
+/* #define INODE_PARANOIA 1 */
+/* #define INODE_DEBUG 1 */
+
+/*
+ * Inode lookup is no longer as critical as it used to be:
+ * most of the lookups are going to be through the dcache.
+ */
+#define I_HASHBITS i_hash_shift
+#define I_HASHMASK i_hash_mask
+
+static unsigned int i_hash_mask;
+static unsigned int i_hash_shift;
+
+/*
+ * Each inode can be on two separate lists. One is
+ * the hash list of the inode, used for lookups. The
+ * other linked list is the "type" list:
+ * "in_use" - valid inode, i_count > 0, i_nlink > 0
+ * "dirty" - as "in_use" but also dirty
+ * "unused" - valid inode, i_count = 0
+ *
+ * A "dirty" list is maintained for each super block,
+ * allowing for low-overhead inode sync() operations.
+ */
+
+LIST_HEAD(inode_in_use);
+LIST_HEAD(inode_unused);
+static struct hlist_head *inode_hashtable;
+static HLIST_HEAD(anon_hash_chain); /* for inodes with NULL i_sb */
+
+/*
+ * A simple spinlock to protect the list manipulations.
+ *
+ * NOTE! You also have to own the lock if you change
+ * the i_state of an inode while it is in use..
+ */
+spinlock_t inode_lock = SPIN_LOCK_UNLOCKED;
+
+/*
+ * iprune_sem provides exclusion between the kswapd or try_to_free_pages
+ * icache shrinking path, and the umount path. Without this exclusion,
+ * by the time prune_icache calls iput for the inode whose pages it has
+ * been invalidating, or by the time it calls clear_inode & destroy_inode
+ * from its final dispose_list, the struct super_block they refer to
+ * (for inode->i_sb->s_op) may already have been freed and reused.
+ */
+static DECLARE_MUTEX(iprune_sem);
+
+/*
+ * Statistics gathering..
+ */
+struct inodes_stat_t inodes_stat;
+
+static kmem_cache_t * inode_cachep;
+
+static struct inode *alloc_inode(struct super_block *sb)
+{
+ static struct address_space_operations empty_aops;
+ static struct inode_operations empty_iops;
+ static struct file_operations empty_fops;
+ struct inode *inode;
+
+ if (sb->s_op->alloc_inode)
+ inode = sb->s_op->alloc_inode(sb);
+ else
+ inode = (struct inode *) kmem_cache_alloc(inode_cachep, SLAB_KERNEL);
+
+ if (inode) {
+ struct address_space * const mapping = &inode->i_data;
+
+ inode->i_sb = sb;
+ inode->i_blkbits = sb->s_blocksize_bits;
+ inode->i_flags = 0;
+ atomic_set(&inode->i_count, 1);
+ inode->i_sock = 0;
+ inode->i_op = &empty_iops;
+ inode->i_fop = &empty_fops;
+ inode->i_nlink = 1;
+ atomic_set(&inode->i_writecount, 0);
+ inode->i_size = 0;
+ inode->i_blocks = 0;
+ inode->i_bytes = 0;
+ inode->i_generation = 0;
+ memset(&inode->i_dquot, 0, sizeof(inode->i_dquot));
+ inode->i_pipe = NULL;
+ inode->i_bdev = NULL;
+ inode->i_rdev = to_kdev_t(0);
+ inode->i_security = NULL;
+ if (security_inode_alloc(inode)) {
+ if (inode->i_sb->s_op->destroy_inode)
+ inode->i_sb->s_op->destroy_inode(inode);
+ else
+ kmem_cache_free(inode_cachep, (inode));
+ return NULL;
+ }
+
+ mapping->a_ops = &empty_aops;
+ mapping->host = inode;
+ mapping->gfp_mask = GFP_HIGHUSER;
+ mapping->dirtied_when = 0;
+ mapping->assoc_mapping = NULL;
+ mapping->backing_dev_info = &default_backing_dev_info;
+ if (sb->s_bdev)
+ mapping->backing_dev_info = sb->s_bdev->bd_inode->i_mapping->backing_dev_info;
+ memset(&inode->u, 0, sizeof(inode->u));
+ inode->i_mapping = mapping;
+ }
+ return inode;
+}
+
+void destroy_inode(struct inode *inode)
+{
+ if (inode_has_buffers(inode))
+ BUG();
+ security_inode_free(inode);
+ if (inode->i_sb->s_op->destroy_inode)
+ inode->i_sb->s_op->destroy_inode(inode);
+ else
+ kmem_cache_free(inode_cachep, (inode));
+}
+
+
+/*
+ * These are initializations that only need to be done
+ * once, because the fields are idempotent across use
+ * of the inode, so let the slab be aware of that.
+ */
+void inode_init_once(struct inode *inode)
+{
+ memset(inode, 0, sizeof(*inode));
+ INIT_HLIST_NODE(&inode->i_hash);
+ INIT_LIST_HEAD(&inode->i_data.clean_pages);
+ INIT_LIST_HEAD(&inode->i_data.dirty_pages);
+ INIT_LIST_HEAD(&inode->i_data.locked_pages);
+ INIT_LIST_HEAD(&inode->i_data.io_pages);
+ INIT_LIST_HEAD(&inode->i_dentry);
+ INIT_LIST_HEAD(&inode->i_devices);
+ sema_init(&inode->i_sem, 1);
+ INIT_RADIX_TREE(&inode->i_data.page_tree, GFP_ATOMIC);
+ rwlock_init(&inode->i_data.page_lock);
+ init_MUTEX(&inode->i_data.i_shared_sem);
+ INIT_LIST_HEAD(&inode->i_data.private_list);
+ spin_lock_init(&inode->i_data.private_lock);
+ INIT_LIST_HEAD(&inode->i_data.i_mmap);
+ INIT_LIST_HEAD(&inode->i_data.i_mmap_shared);
+ spin_lock_init(&inode->i_lock);
+}
+
+static void init_once(void * foo, kmem_cache_t * cachep, unsigned long flags)
+{
+ struct inode * inode = (struct inode *) foo;
+
+ if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
+ SLAB_CTOR_CONSTRUCTOR)
+ inode_init_once(inode);
+}
+
+/*
+ * inode_lock must be held
+ */
+void __iget(struct inode * inode)
+{
+ if (atomic_read(&inode->i_count)) {
+ atomic_inc(&inode->i_count);
+ return;
+ }
+ atomic_inc(&inode->i_count);
+ if (!(inode->i_state & (I_DIRTY|I_LOCK))) {
+ list_del(&inode->i_list);
+ list_add(&inode->i_list, &inode_in_use);
+ }
+ inodes_stat.nr_unused--;
+}
+
+/**
+ * clear_inode - clear an inode
+ * @inode: inode to clear
+ *
+ * This is called by the filesystem to tell us
+ * that the inode is no longer useful. We just
+ * terminate it with extreme prejudice.
+ */
+
+void clear_inode(struct inode *inode)
+{
+ invalidate_inode_buffers(inode);
+
+ if (inode->i_data.nrpages)
+ BUG();
+ if (!(inode->i_state & I_FREEING))
+ BUG();
+ if (inode->i_state & I_CLEAR)
+ BUG();
+ wait_on_inode(inode);
+ DQUOT_DROP(inode);
+ if (inode->i_sb && inode->i_sb->s_op->clear_inode)
+ inode->i_sb->s_op->clear_inode(inode);
+ if (inode->i_bdev)
+ bd_forget(inode);
+ inode->i_state = I_CLEAR;
+}
+
+/*
+ * Dispose-list gets a local list with local inodes in it, so it doesn't
+ * need to worry about list corruption and SMP locks.
+ */
+static void dispose_list(struct list_head *head)
+{
+ int nr_disposed = 0;
+
+ while (!list_empty(head)) {
+ struct inode *inode;
+
+ inode = list_entry(head->next, struct inode, i_list);
+ list_del(&inode->i_list);
+
+ if (inode->i_data.nrpages)
+ truncate_inode_pages(&inode->i_data, 0);
+ clear_inode(inode);
+ destroy_inode(inode);
+ nr_disposed++;
+ }
+ spin_lock(&inode_lock);
+ inodes_stat.nr_inodes -= nr_disposed;
+ spin_unlock(&inode_lock);
+}
+
+/*
+ * Invalidate all inodes for a device.
+ */
+static int invalidate_list(struct list_head *head, struct super_block * sb, struct list_head * dispose)
+{
+ struct list_head *next;
+ int busy = 0, count = 0;
+
+ next = head->next;
+ for (;;) {
+ struct list_head * tmp = next;
+ struct inode * inode;
+
+ next = next->next;
+ if (tmp == head)
+ break;
+ inode = list_entry(tmp, struct inode, i_list);
+ if (inode->i_sb != sb)
+ continue;
+ invalidate_inode_buffers(inode);
+ if (!atomic_read(&inode->i_count)) {
+ hlist_del_init(&inode->i_hash);
+ list_del(&inode->i_list);
+ list_add(&inode->i_list, dispose);
+ inode->i_state |= I_FREEING;
+ count++;
+ continue;
+ }
+ busy = 1;
+ }
+ /* only unused inodes may be cached with i_count zero */
+ inodes_stat.nr_unused -= count;
+ return busy;
+}
+
+/*
+ * This is a two-stage process. First we collect all
+ * offending inodes onto the throw-away list, and in
+ * the second stage we actually dispose of them. This
+ * is because we don't want to sleep while messing
+ * with the global lists..
+ */
+
+/**
+ * invalidate_inodes - discard the inodes on a device
+ * @sb: superblock
+ *
+ * Discard all of the inodes for a given superblock. If the discard
+ * fails because there are busy inodes, then a non-zero value is returned.
+ * If the discard is successful all the inodes have been discarded.
+ */
+
+int invalidate_inodes(struct super_block * sb)
+{
+ int busy;
+ LIST_HEAD(throw_away);
+
+ down(&iprune_sem);
+ spin_lock(&inode_lock);
+ busy = invalidate_list(&inode_in_use, sb, &throw_away);
+ busy |= invalidate_list(&inode_unused, sb, &throw_away);
+ busy |= invalidate_list(&sb->s_dirty, sb, &throw_away);
+ busy |= invalidate_list(&sb->s_io, sb, &throw_away);
+ spin_unlock(&inode_lock);
+
+ dispose_list(&throw_away);
+ up(&iprune_sem);
+
+ return busy;
+}
+
+int invalidate_device(kdev_t dev, int do_sync)
+{
+ struct super_block *sb;
+ struct block_device *bdev = bdget(kdev_t_to_nr(dev));
+ int res;
+
+ if (!bdev)
+ return 0;
+
+ if (do_sync)
+ fsync_bdev(bdev);
+
+ res = 0;
+ sb = get_super(bdev);
+ if (sb) {
+ /*
+ * no need to lock the super, get_super holds the
+ * read semaphore so the filesystem cannot go away
+ * under us (->put_super runs with the write lock
+ * held).
+ */
+ shrink_dcache_sb(sb);
+ res = invalidate_inodes(sb);
+ drop_super(sb);
+ }
+ invalidate_bdev(bdev, 0);
+ bdput(bdev);
+ return res;
+}
+
+static int can_unuse(struct inode *inode)
+{
+ if (inode->i_state)
+ return 0;
+ if (inode_has_buffers(inode))
+ return 0;
+ if (atomic_read(&inode->i_count))
+ return 0;
+ if (inode->i_data.nrpages)
+ return 0;
+ return 1;
+}
+
+/*
+ * Scan `goal' inodes on the unused list for freeable ones. They are moved to
+ * a temporary list and then are freed outside inode_lock by dispose_list().
+ *
+ * Any inodes which are pinned purely because of attached pagecache have their
+ * pagecache removed. We expect the final iput() on that inode to add it to
+ * the front of the inode_unused list. So look for it there and if the
+ * inode is still freeable, proceed. The right inode is found 99.9% of the
+ * time in testing on a 4-way.
+ *
+ * If the inode has metadata buffers attached to mapping->private_list then
+ * try to remove them.
+ */
+static void prune_icache(int nr_to_scan)
+{
+ LIST_HEAD(freeable);
+ int nr_pruned = 0;
+ int nr_scanned;
+ unsigned long reap = 0;
+
+ down(&iprune_sem);
+ spin_lock(&inode_lock);
+ for (nr_scanned = 0; nr_scanned < nr_to_scan; nr_scanned++) {
+ struct inode *inode;
+
+ if (list_empty(&inode_unused))
+ break;
+
+ inode = list_entry(inode_unused.prev, struct inode, i_list);
+
+ if (inode->i_state || atomic_read(&inode->i_count)) {
+ list_move(&inode->i_list, &inode_unused);
+ continue;
+ }
+ if (inode_has_buffers(inode) || inode->i_data.nrpages) {
+ __iget(inode);
+ spin_unlock(&inode_lock);
+ if (remove_inode_buffers(inode))
+ reap += invalidate_inode_pages(&inode->i_data);
+ iput(inode);
+ spin_lock(&inode_lock);
+
+ if (inode != list_entry(inode_unused.next,
+ struct inode, i_list))
+ continue; /* wrong inode or list_empty */
+ if (!can_unuse(inode))
+ continue;
+ }
+ hlist_del_init(&inode->i_hash);
+ list_move(&inode->i_list, &freeable);
+ inode->i_state |= I_FREEING;
+ nr_pruned++;
+ }
+ inodes_stat.nr_unused -= nr_pruned;
+ spin_unlock(&inode_lock);
+
+ dispose_list(&freeable);
+ up(&iprune_sem);
+
+ if (current_is_kswapd)
+ mod_page_state(kswapd_inodesteal, reap);
+ else
+ mod_page_state(pginodesteal, reap);
+}
+
+/*
+ * shrink_icache_memory() will attempt to reclaim some unused inodes. Here,
+ * "unused" means that no dentries are referring to the inodes: the files are
+ * not open and the dcache references to those inodes have already been
+ * reclaimed.
+ *
+ * This function is passed the number of inodes to scan, and it returns the
+ * total number of remaining possibly-reclaimable inodes.
+ */
+static int shrink_icache_memory(int nr, unsigned int gfp_mask)
+{
+ if (nr) {
+ /*
+ * Nasty deadlock avoidance. We may hold various FS locks,
+ * and we don't want to recurse into the FS that called us
+ * in clear_inode() and friends..
+ */
+ if (gfp_mask & __GFP_FS)
+ prune_icache(nr);
+ }
+ return inodes_stat.nr_unused;
+}
+
+void __wait_on_freeing_inode(struct inode *inode);
+/*
+ * Called with the inode lock held.
+ * NOTE: we are not increasing the inode-refcount, you must call __iget()
+ * by hand after calling find_inode now! This simplifies iunique and won't
+ * add any additional branch in the common code.
+ */
+static struct inode * find_inode(struct super_block * sb, struct hlist_head *head, int (*test)(struct inode *, void *), void *data)
+{
+ struct hlist_node *node;
+ struct inode * inode = NULL;
+
+ hlist_for_each (node, head) {
+ prefetch(node->next);
+ inode = hlist_entry(node, struct inode, i_hash);
+ if (inode->i_sb != sb)
+ continue;
+ if (!test(inode, data))
+ continue;
+ if (inode->i_state & (I_FREEING|I_CLEAR)) {
+ __wait_on_freeing_inode(inode);
+ tmp = head;
+ continue;
+ }
+ break;
+ }
+ return node ? inode : NULL;
+}
+
+/*
+ * find_inode_fast is the fast path version of find_inode, see the comment at
+ * iget_locked for details.
+ */
+static struct inode * find_inode_fast(struct super_block * sb, struct hlist_head *head, unsigned long ino)
+{
+ struct hlist_node *node;
+ struct inode * inode = NULL;
+
+ hlist_for_each (node, head) {
+ prefetch(node->next);
+ inode = list_entry(node, struct inode, i_hash);
+ if (inode->i_ino != ino)
+ continue;
+ if (inode->i_sb != sb)
+ continue;
+ if (inode->i_state & (I_FREEING|I_CLEAR)) {
+ __wait_on_freeing_inode(inode);
+ tmp = head;
+ continue;
+ }
+ break;
+ }
+ return node ? inode : NULL;
+}
+
+/**
+ * new_inode - obtain an inode
+ * @sb: superblock
+ *
+ * Allocates a new inode for the given superblock.
+ */
+
+struct inode *new_inode(struct super_block *sb)
+{
+ static unsigned long last_ino;
+ struct inode * inode;
+
+ spin_lock_prefetch(&inode_lock);
+
+ inode = alloc_inode(sb);
+ if (inode) {
+ spin_lock(&inode_lock);
+ inodes_stat.nr_inodes++;
+ list_add(&inode->i_list, &inode_in_use);
+ inode->i_ino = ++last_ino;
+ inode->i_state = 0;
+ spin_unlock(&inode_lock);
+ }
+ return inode;
+}
+
+void unlock_new_inode(struct inode *inode)
+{
+ /*
+ * This is special! We do not need the spinlock
+ * when clearing I_LOCK, because we're guaranteed
+ * that nobody else tries to do anything about the
+ * state of the inode when it is locked, as we
+ * just created it (so there can be no old holders
+ * that haven't tested I_LOCK).
+ */
+ inode->i_state &= ~(I_LOCK|I_NEW);
+ wake_up_inode(inode);
+}
+EXPORT_SYMBOL(unlock_new_inode);
+
+/*
+ * This is called without the inode lock held.. Be careful.
+ *
+ * We no longer cache the sb_flags in i_flags - see fs.h
+ * -- rmk@arm.uk.linux.org
+ */
+static struct inode * get_new_inode(struct super_block *sb, struct hlist_head *head, int (*test)(struct inode *, void *), int (*set)(struct inode *, void *), void *data)
+{
+ struct inode * inode;
+
+ inode = alloc_inode(sb);
+ if (inode) {
+ struct inode * old;
+
+ spin_lock(&inode_lock);
+ /* We released the lock, so.. */
+ old = find_inode(sb, head, test, data);
+ if (!old) {
+ if (set(inode, data))
+ goto set_failed;
+
+ inodes_stat.nr_inodes++;
+ list_add(&inode->i_list, &inode_in_use);
+ hlist_add_head(&inode->i_hash, head);
+ inode->i_state = I_LOCK|I_NEW;
+ spin_unlock(&inode_lock);
+
+ /* Return the locked inode with I_NEW set, the
+ * caller is responsible for filling in the contents
+ */
+ return inode;
+ }
+
+ /*
+ * Uhhuh, somebody else created the same inode under
+ * us. Use the old inode instead of the one we just
+ * allocated.
+ */
+ __iget(old);
+ spin_unlock(&inode_lock);
+ destroy_inode(inode);
+ inode = old;
+ wait_on_inode(inode);
+ }
+ return inode;
+
+set_failed:
+ spin_unlock(&inode_lock);
+ destroy_inode(inode);
+ return NULL;
+}
+
+/*
+ * get_new_inode_fast is the fast path version of get_new_inode, see the
+ * comment at iget_locked for details.
+ */
+static struct inode * get_new_inode_fast(struct super_block *sb, struct hlist_head *head, unsigned long ino)
+{
+ struct inode * inode;
+
+ inode = alloc_inode(sb);
+ if (inode) {
+ struct inode * old;
+
+ spin_lock(&inode_lock);
+ /* We released the lock, so.. */
+ old = find_inode_fast(sb, head, ino);
+ if (!old) {
+ inode->i_ino = ino;
+ inodes_stat.nr_inodes++;
+ list_add(&inode->i_list, &inode_in_use);
+ hlist_add_head(&inode->i_hash, head);
+ inode->i_state = I_LOCK|I_NEW;
+ spin_unlock(&inode_lock);
+
+ /* Return the locked inode with I_NEW set, the
+ * caller is responsible for filling in the contents
+ */
+ return inode;
+ }
+
+ /*
+ * Uhhuh, somebody else created the same inode under
+ * us. Use the old inode instead of the one we just
+ * allocated.
+ */
+ __iget(old);
+ spin_unlock(&inode_lock);
+ destroy_inode(inode);
+ inode = old;
+ wait_on_inode(inode);
+ }
+ return inode;
+}
+
+static inline unsigned long hash(struct super_block *sb, unsigned long hashval)
+{
+ unsigned long tmp = hashval + ((unsigned long) sb / L1_CACHE_BYTES);
+ tmp = tmp + (tmp >> I_HASHBITS);
+ return tmp & I_HASHMASK;
+}
+
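Illustrative user-space sketch of the bucket computation above; all concrete
values (the superblock address, L1_CACHE_BYTES == 32, a 65536-entry table) are
assumptions. Folding the superblock address into the key keeps identical inode
numbers on different filesystems in different chains:

#include <stdio.h>

int main(void)
{
	unsigned long sb_addr = 0xc1234560UL;	/* pretend superblock address */
	unsigned long hashval = 42;		/* e.g. an inode number */
	unsigned long i_hash_shift = 16;	/* matches a 65536-entry table */
	unsigned long i_hash_mask = (1UL << 16) - 1;
	unsigned long tmp;

	tmp = hashval + sb_addr / 32;		/* 32 == assumed L1_CACHE_BYTES */
	tmp = tmp + (tmp >> i_hash_shift);
	printf("bucket index = %lu\n", tmp & i_hash_mask);
	return 0;
}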
+/* Yeah, I know about quadratic hash. Maybe, later. */
+
+/**
+ * iunique - get a unique inode number
+ * @sb: superblock
+ * @max_reserved: highest reserved inode number
+ *
+ * Obtain an inode number that is unique on the system for a given
+ * superblock. This is used by file systems that have no natural
+ * permanent inode numbering system. An inode number is returned that
+ * is higher than the reserved limit but unique.
+ *
+ * BUGS:
+ * With a large number of inodes live on the file system this function
+ * currently becomes quite slow.
+ */
+
+ino_t iunique(struct super_block *sb, ino_t max_reserved)
+{
+ static ino_t counter = 0;
+ struct inode *inode;
+ struct hlist_head * head;
+ ino_t res;
+ spin_lock(&inode_lock);
+retry:
+ if (counter > max_reserved) {
+ head = inode_hashtable + hash(sb,counter);
+ res = counter++;
+ inode = find_inode_fast(sb, head, res);
+ if (!inode) {
+ spin_unlock(&inode_lock);
+ return res;
+ }
+ } else {
+ counter = max_reserved + 1;
+ }
+ goto retry;
+
+}
+
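Illustrative sketch only; the examplefs_* names and EXAMPLEFS_MAX_RESERVED are
invented. A pseudo-filesystem with no stable on-disk inode numbers would
typically pair new_inode() with iunique(), keeping its fixed inodes below the
reserved limit:

#define EXAMPLEFS_MAX_RESERVED	15	/* assumed: root and special inodes */

static struct inode *examplefs_alloc(struct super_block *sb)
{
	struct inode *inode = new_inode(sb);

	if (inode)
		inode->i_ino = iunique(sb, EXAMPLEFS_MAX_RESERVED);
	return inode;
}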
+struct inode *igrab(struct inode *inode)
+{
+ spin_lock(&inode_lock);
+ if (!(inode->i_state & I_FREEING))
+ __iget(inode);
+ else
+ /*
+ * Handle the case where s_op->clear_inode has not been
+ * called yet, and somebody is calling igrab
+ * while the inode is getting freed.
+ */
+ inode = NULL;
+ spin_unlock(&inode_lock);
+ return inode;
+}
+
+/**
+ * ifind - internal function, you want ilookup5() or iget5().
+ * @sb: super block of file system to search
+ * @hashval: hash value (usually inode number) to search for
+ * @test: callback used for comparisons between inodes
+ * @data: opaque data pointer to pass to @test
+ *
+ * ifind() searches for the inode specified by @hashval and @data in the inode
+ * cache. This is a generalized version of ifind_fast() for file systems where
+ * the inode number is not sufficient for unique identification of an inode.
+ *
+ * If the inode is in the cache, the inode is returned with an incremented
+ * reference count.
+ *
+ * Otherwise NULL is returned.
+ *
+ * Note, @test is called with the inode_lock held, so can't sleep.
+ */
+static inline struct inode *ifind(struct super_block *sb,
+ struct hlist_head *head, int (*test)(struct inode *, void *),
+ void *data)
+{
+ struct inode *inode;
+
+ spin_lock(&inode_lock);
+ inode = find_inode(sb, head, test, data);
+ if (inode) {
+ __iget(inode);
+ spin_unlock(&inode_lock);
+ wait_on_inode(inode);
+ return inode;
+ }
+ spin_unlock(&inode_lock);
+ return NULL;
+}
+
+/**
+ * ifind_fast - internal function, you want ilookup() or iget().
+ * @sb: super block of file system to search
+ * @ino: inode number to search for
+ *
+ * ifind_fast() searches for the inode @ino in the inode cache. This is for
+ * file systems where the inode number is sufficient for unique identification
+ * of an inode.
+ *
+ * If the inode is in the cache, the inode is returned with an incremented
+ * reference count.
+ *
+ * Otherwise NULL is returned.
+ */
+static inline struct inode *ifind_fast(struct super_block *sb,
+ struct hlist_head *head, unsigned long ino)
+{
+ struct inode *inode;
+
+ spin_lock(&inode_lock);
+ inode = find_inode_fast(sb, head, ino);
+ if (inode) {
+ __iget(inode);
+ spin_unlock(&inode_lock);
+ wait_on_inode(inode);
+ return inode;
+ }
+ spin_unlock(&inode_lock);
+ return NULL;
+}
+
+/**
+ * ilookup5 - search for an inode in the inode cache
+ * @sb: super block of file system to search
+ * @hashval: hash value (usually inode number) to search for
+ * @test: callback used for comparisons between inodes
+ * @data: opaque data pointer to pass to @test
+ *
+ * ilookup5() uses ifind() to search for the inode specified by @hashval and
+ * @data in the inode cache. This is a generalized version of ilookup() for
+ * file systems where the inode number is not sufficient for unique
+ * identification of an inode.
+ *
+ * If the inode is in the cache, the inode is returned with an incremented
+ * reference count.
+ *
+ * Otherwise NULL is returned.
+ *
+ * Note, @test is called with the inode_lock held, so can't sleep.
+ */
+struct inode *ilookup5(struct super_block *sb, unsigned long hashval,
+ int (*test)(struct inode *, void *), void *data)
+{
+ struct hlist_head *head = inode_hashtable + hash(sb, hashval);
+
+ return ifind(sb, head, test, data);
+}
+EXPORT_SYMBOL(ilookup5);
+
+/**
+ * ilookup - search for an inode in the inode cache
+ * @sb: super block of file system to search
+ * @ino: inode number to search for
+ *
+ * ilookup() uses ifind_fast() to search for the inode @ino in the inode cache.
+ * This is for file systems where the inode number is sufficient for unique
+ * identification of an inode.
+ *
+ * If the inode is in the cache, the inode is returned with an incremented
+ * reference count.
+ *
+ * Otherwise NULL is returned.
+ */
+struct inode *ilookup(struct super_block *sb, unsigned long ino)
+{
+ struct hlist_head *head = inode_hashtable + hash(sb, ino);
+
+ return ifind_fast(sb, head, ino);
+}
+EXPORT_SYMBOL(ilookup);
+
+/**
+ * iget5_locked - obtain an inode from a mounted file system
+ * @sb: super block of file system
+ * @hashval: hash value (usually inode number) to get
+ * @test: callback used for comparisons between inodes
+ * @set: callback used to initialize a new struct inode
+ * @data: opaque data pointer to pass to @test and @set
+ *
+ * This is iget() without the read_inode() portion of get_new_inode().
+ *
+ * iget5_locked() uses ifind() to search for the inode specified by @hashval
+ * and @data in the inode cache and if present it is returned with an increased
+ * reference count. This is a generalized version of iget_locked() for file
+ * systems where the inode number is not sufficient for unique identification
+ * of an inode.
+ *
+ * If the inode is not in cache, get_new_inode() is called to allocate a new
+ * inode and this is returned locked, hashed, and with the I_NEW flag set. The
+ * file system gets to fill it in before unlocking it via unlock_new_inode().
+ *
+ * Note both @test and @set are called with the inode_lock held, so can't sleep.
+ */
+struct inode *iget5_locked(struct super_block *sb, unsigned long hashval,
+ int (*test)(struct inode *, void *),
+ int (*set)(struct inode *, void *), void *data)
+{
+ struct hlist_head *head = inode_hashtable + hash(sb, hashval);
+ struct inode *inode;
+
+ inode = ifind(sb, head, test, data);
+ if (inode)
+ return inode;
+ /*
+ * get_new_inode() will do the right thing, re-trying the search
+ * in case it had to block at any point.
+ */
+ return get_new_inode(sb, head, test, set, data);
+}
+EXPORT_SYMBOL(iget5_locked);
+
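Illustrative sketch only; every examplefs_* name is invented. A filesystem
whose identity is wider than i_ino supplies the @test/@set callbacks; both run
under inode_lock, so they only touch in-core fields, and the I_NEW /
unlock_new_inode() handshake follows the description above:

struct examplefs_inode_info {
	unsigned long long	objid;
	struct inode		vfs_inode;	/* returned by ->alloc_inode() */
};

static inline struct examplefs_inode_info *EXAMPLEFS_I(struct inode *inode)
{
	/* relies on the kernel's container_of() helper */
	return container_of(inode, struct examplefs_inode_info, vfs_inode);
}

static int examplefs_test(struct inode *inode, void *data)
{
	return EXAMPLEFS_I(inode)->objid == *(unsigned long long *)data;
}

static int examplefs_set(struct inode *inode, void *data)
{
	EXAMPLEFS_I(inode)->objid = *(unsigned long long *)data;
	return 0;
}

static struct inode *examplefs_iget(struct super_block *sb,
				    unsigned long long objid)
{
	struct inode *inode = iget5_locked(sb, (unsigned long)objid,
					   examplefs_test, examplefs_set,
					   &objid);

	if (inode && (inode->i_state & I_NEW)) {
		/* ... read the on-disk object and fill the inode here ... */
		unlock_new_inode(inode);
	}
	return inode;
}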
+/**
+ * iget_locked - obtain an inode from a mounted file system
+ * @sb: super block of file system
+ * @ino: inode number to get
+ *
+ * This is iget() without the read_inode() portion of get_new_inode_fast().
+ *
+ * iget_locked() uses ifind_fast() to search for the inode specified by @ino in
+ * the inode cache and if present it is returned with an increased reference
+ * count. This is for file systems where the inode number is sufficient for
+ * unique identification of an inode.
+ *
+ * If the inode is not in cache, get_new_inode_fast() is called to allocate a
+ * new inode and this is returned locked, hashed, and with the I_NEW flag set.
+ * The file system gets to fill it in before unlocking it via
+ * unlock_new_inode().
+ */
+struct inode *iget_locked(struct super_block *sb, unsigned long ino)
+{
+ struct hlist_head *head = inode_hashtable + hash(sb, ino);
+ struct inode *inode;
+
+ inode = ifind_fast(sb, head, ino);
+ if (inode)
+ return inode;
+ /*
+ * get_new_inode_fast() will do the right thing, re-trying the search
+ * in case it had to block at any point.
+ */
+ return get_new_inode_fast(sb, head, ino);
+}
+EXPORT_SYMBOL(iget_locked);
+
+/**
+ * __insert_inode_hash - hash an inode
+ * @inode: unhashed inode
+ * @hashval: unsigned long value used to locate this object in the
+ * inode_hashtable.
+ *
+ * Add an inode to the inode hash for this superblock. If the inode
+ * has no superblock it is added to a separate anonymous chain.
+ */
+
+void __insert_inode_hash(struct inode *inode, unsigned long hashval)
+{
+ struct hlist_head *head = &anon_hash_chain;
+ if (inode->i_sb)
+ head = inode_hashtable + hash(inode->i_sb, hashval);
+ spin_lock(&inode_lock);
+ hlist_add_head(&inode->i_hash, head);
+ spin_unlock(&inode_lock);
+}
+
+/**
+ * remove_inode_hash - remove an inode from the hash
+ * @inode: inode to unhash
+ *
+ * Remove an inode from the superblock or anonymous hash.
+ */
+
+void remove_inode_hash(struct inode *inode)
+{
+ spin_lock(&inode_lock);
+ hlist_del_init(&inode->i_hash);
+ spin_unlock(&inode_lock);
+}
+
+void generic_delete_inode(struct inode *inode)
+{
+ struct super_operations *op = inode->i_sb->s_op;
+
+<<<<<<<
+ hlist_del_init(&inode->i_hash);
+|||||||
+ list_del_init(&inode->i_hash);
+=======
+>>>>>>>
+ list_del_init(&inode->i_list);
+ inode->i_state|=I_FREEING;
+ inodes_stat.nr_inodes--;
+ spin_unlock(&inode_lock);
+
+ if (inode->i_data.nrpages)
+ truncate_inode_pages(&inode->i_data, 0);
+
+ security_inode_delete(inode);
+
+ if (op->delete_inode) {
+ void (*delete)(struct inode *) = op->delete_inode;
+ if (!is_bad_inode(inode))
+ DQUOT_INIT(inode);
+ /* s_op->delete_inode internally recalls clear_inode() */
+ delete(inode);
+ } else
+ clear_inode(inode);
+ spin_lock(&inode_lock);
+ list_del_init(&inode->i_hash);
+ spin_unlock(&inode_lock);
+ wake_up_inode(inode);
+ if (inode->i_state != I_CLEAR)
+ BUG();
+ destroy_inode(inode);
+}
+EXPORT_SYMBOL(generic_delete_inode);
+
+static void generic_forget_inode(struct inode *inode)
+{
+ struct super_block *sb = inode->i_sb;
+
+ if (!hlist_unhashed(&inode->i_hash)) {
+ if (!(inode->i_state & (I_DIRTY|I_LOCK))) {
+ list_del(&inode->i_list);
+ list_add(&inode->i_list, &inode_unused);
+ }
+ inodes_stat.nr_unused++;
+ spin_unlock(&inode_lock);
+ if (!sb || (sb->s_flags & MS_ACTIVE))
+ return;
+ write_inode_now(inode, 1);
+ spin_lock(&inode_lock);
+ inodes_stat.nr_unused--;
+ hlist_del_init(&inode->i_hash);
+ }
+ list_del_init(&inode->i_list);
+ inode->i_state|=I_FREEING;
+ inodes_stat.nr_inodes--;
+ spin_unlock(&inode_lock);
+ if (inode->i_data.nrpages)
+ truncate_inode_pages(&inode->i_data, 0);
+ clear_inode(inode);
+ destroy_inode(inode);
+}
+
+/*
+ * Normal UNIX filesystem behaviour: delete the
+ * inode when the usage count drops to zero, and
+ * i_nlink is zero.
+ */
+static void generic_drop_inode(struct inode *inode)
+{
+ if (!inode->i_nlink)
+ generic_delete_inode(inode);
+ else
+ generic_forget_inode(inode);
+}
+
+/*
+ * Called when we're dropping the last reference
+ * to an inode.
+ *
+ * Call the FS "drop()" function, defaulting to
+ * the legacy UNIX filesystem behaviour..
+ *
+ * NOTE! NOTE! NOTE! We're called with the inode lock
+ * held, and the drop function is supposed to release
+ * the lock!
+ */
+static inline void iput_final(struct inode *inode)
+{
+ struct super_operations *op = inode->i_sb->s_op;
+ void (*drop)(struct inode *) = generic_drop_inode;
+
+ if (op && op->drop_inode)
+ drop = op->drop_inode;
+ drop(inode);
+}
+
+/**
+ * iput - put an inode
+ * @inode: inode to put
+ *
+ * Puts an inode, dropping its usage count. If the inode use count hits
+ * zero the inode is also then freed and may be destroyed.
+ */
+
+void iput(struct inode *inode)
+{
+ if (inode) {
+ struct super_operations *op = inode->i_sb->s_op;
+
+ if (inode->i_state == I_CLEAR)
+ BUG();
+
+ if (op && op->put_inode)
+ op->put_inode(inode);
+
+ if (atomic_dec_and_lock(&inode->i_count, &inode_lock))
+ iput_final(inode);
+ }
+}
+
+/**
+ * bmap - find a block number in a file
+ * @inode: inode of file
+ * @block: block to find
+ *
+ * Returns the block number on the device holding the inode that
+ * is the disk block number for the block of the file requested.
+ * That is, asked for block 4 of inode 1, the function will return the
+ * disk block relative to the disk start that holds that block of the
+ * file.
+ */
+
+sector_t bmap(struct inode * inode, sector_t block)
+{
+ sector_t res = 0;
+ if (inode->i_mapping->a_ops->bmap)
+ res = inode->i_mapping->a_ops->bmap(inode->i_mapping, block);
+ return res;
+}
+
+/*
+ * Return true if the filesystem which backs this inode considers the two
+ * passed timespecs to be sufficiently different to warrant flushing the
+ * altered time out to disk.
+ */
+static int inode_times_differ(struct inode *inode,
+ struct timespec *old, struct timespec *new)
+{
+ if (IS_ONE_SECOND(inode))
+ return old->tv_sec != new->tv_sec;
+ return !timespec_equal(old, new);
+}
+
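A concrete instance of the check above: on a filesystem flagged IS_ONE_SECOND
only a change of tv_sec counts, which is what lets update_atime() further down
refresh the in-core atime without dirtying the inode. Assuming two sample
timespecs:

	old = { tv_sec = 1000, tv_nsec = 100000000 }
	new = { tv_sec = 1000, tv_nsec = 900000000 }   ->  not different (same second)
	new = { tv_sec = 1001, tv_nsec = 0 }           ->  different, flush the new time

On a filesystem with full timespec resolution the first pair would already
count as different.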
+/**
+ * update_atime - update the access time
+ * @inode: inode accessed
+ *
+ * Update the accessed time on an inode and mark it for writeback.
+ * This function automatically handles read only file systems and media,
+ * as well as the "noatime" flag and inode specific "noatime" markers.
+ */
+
+void update_atime(struct inode *inode)
+{
+ struct timespec now;
+
+ if (IS_NOATIME(inode))
+ return;
+ if (IS_NODIRATIME(inode) && S_ISDIR(inode->i_mode))
+ return;
+ if (IS_RDONLY(inode))
+ return;
+
+ now = current_kernel_time();
+ if (inode_times_differ(inode, &inode->i_atime, &now)) {
+ inode->i_atime = now;
+ mark_inode_dirty_sync(inode);
+ } else {
+ if (!timespec_equal(&inode->i_atime, &now))
+ inode->i_atime = now;
+ }
+}
+
+/**
+ * inode_update_time - update mtime and ctime
+ * @inode: inode accessed
+ * @ctime_too: update ctime too
+ *
+ * Update the mtime on an inode and mark it for writeback.
+ * When ctime_too is specified, update the ctime too.
+ */
+
+void inode_update_time(struct inode *inode, int ctime_too)
+{
+ struct timespec now = current_kernel_time();
+ int sync_it = 0;
+
+ if (inode_times_differ(inode, &inode->i_mtime, &now))
+ sync_it = 1;
+ inode->i_mtime = now;
+
+ if (ctime_too) {
+ if (inode_times_differ(inode, &inode->i_ctime, &now))
+ sync_it = 1;
+ inode->i_ctime = now;
+ }
+ if (sync_it)
+ mark_inode_dirty_sync(inode);
+}
+EXPORT_SYMBOL(inode_update_time);
+
+int inode_needs_sync(struct inode *inode)
+{
+ if (IS_SYNC(inode))
+ return 1;
+ if (S_ISDIR(inode->i_mode) && IS_DIRSYNC(inode))
+ return 1;
+ return 0;
+}
+EXPORT_SYMBOL(inode_needs_sync);
+
+/*
+ * Quota functions that want to walk the inode lists..
+ */
+#ifdef CONFIG_QUOTA
+
+/* Functions back in dquot.c */
+void put_dquot_list(struct list_head *);
+int remove_inode_dquot_ref(struct inode *, int, struct list_head *);
+
+void remove_dquot_ref(struct super_block *sb, int type)
+{
+ struct inode *inode;
+ struct list_head *act_head;
+ LIST_HEAD(tofree_head);
+
+ if (!sb->dq_op)
+ return; /* nothing to do */
+ spin_lock(&inode_lock); /* This lock is for inodes code */
+ /* We don't have to lock against quota code - test IS_QUOTAINIT is just for speedup... */
+
+ list_for_each(act_head, &inode_in_use) {
+ inode = list_entry(act_head, struct inode, i_list);
+ if (inode->i_sb == sb && IS_QUOTAINIT(inode))
+ remove_inode_dquot_ref(inode, type, &tofree_head);
+ }
+ list_for_each(act_head, &inode_unused) {
+ inode = list_entry(act_head, struct inode, i_list);
+ if (inode->i_sb == sb && IS_QUOTAINIT(inode))
+ remove_inode_dquot_ref(inode, type, &tofree_head);
+ }
+ list_for_each(act_head, &sb->s_dirty) {
+ inode = list_entry(act_head, struct inode, i_list);
+ if (IS_QUOTAINIT(inode))
+ remove_inode_dquot_ref(inode, type, &tofree_head);
+ }
+ list_for_each(act_head, &sb->s_io) {
+ inode = list_entry(act_head, struct inode, i_list);
+ if (IS_QUOTAINIT(inode))
+ remove_inode_dquot_ref(inode, type, &tofree_head);
+ }
+ spin_unlock(&inode_lock);
+
+ put_dquot_list(&tofree_head);
+}
+
+#endif
+
+/*
+ * Hashed waitqueues for wait_on_inode(). The table is pretty small - the
+ * kernel doesn't lock many inodes at the same time.
+ */
+#define I_WAIT_TABLE_ORDER 3
+static struct i_wait_queue_head {
+ wait_queue_head_t wqh;
+} ____cacheline_aligned_in_smp i_wait_queue_heads[1<<I_WAIT_TABLE_ORDER];
+
+/*
+ * Return the address of the waitqueue_head to be used for this inode
+ */
+static wait_queue_head_t *i_waitq_head(struct inode *inode)
+{
+ return &i_wait_queue_heads[hash_ptr(inode, I_WAIT_TABLE_ORDER)].wqh;
+}
+
+void __wait_on_inode(struct inode *inode)
+{
+ DECLARE_WAITQUEUE(wait, current);
+ wait_queue_head_t *wq = i_waitq_head(inode);
+
+ add_wait_queue(wq, &wait);
+repeat:
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ if (inode->i_state & I_LOCK) {
+ schedule();
+ goto repeat;
+ }
+ remove_wait_queue(wq, &wait);
+ __set_current_state(TASK_RUNNING);
+}
+
+void __wait_on_freeing_inode(struct inode *inode)
+{
+ DECLARE_WAITQUEUE(wait, current);
+ wait_queue_head_t *wq = i_waitq_head(inode);
+
+ add_wait_queue(wq, &wait);
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ spin_unlock(&inode_lock);
+ schedule();
+ remove_wait_queue(wq, &wait);
+ current->state = TASK_RUNNING;
+ spin_lock(&inode_lock);
+}
+
+
+void wake_up_inode(struct inode *inode)
+{
+ wait_queue_head_t *wq = i_waitq_head(inode);
+
+ /*
+ * Prevent speculative execution through spin_unlock(&inode_lock);
+ */
+ smp_mb();
+ if (waitqueue_active(wq))
+ wake_up_all(wq);
+}
+
+/*
+ * Initialize the waitqueues and inode hash table.
+ */
+void __init inode_init(unsigned long mempages)
+{
+ struct hlist_head *head;
+ unsigned long order;
+ unsigned int nr_hash;
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(i_wait_queue_heads); i++)
+ init_waitqueue_head(&i_wait_queue_heads[i].wqh);
+
+ mempages >>= (14 - PAGE_SHIFT);
+ mempages *= sizeof(struct list_head);
+ for (order = 0; ((1UL << order) << PAGE_SHIFT) < mempages; order++)
+ ;
+
+ do {
+ unsigned long tmp;
+
+ nr_hash = (1UL << order) * PAGE_SIZE /
+ sizeof(struct hlist_head);
+ i_hash_mask = (nr_hash - 1);
+
+ tmp = nr_hash;
+ i_hash_shift = 0;
+ while ((tmp >>= 1UL) != 0UL)
+ i_hash_shift++;
+
+ inode_hashtable = (struct hlist_head *)
+ __get_free_pages(GFP_ATOMIC, order);
+ } while (inode_hashtable == NULL && --order >= 0);
+
+ printk("Inode-cache hash table entries: %d (order: %ld, %ld bytes)\n",
+ nr_hash, order, (PAGE_SIZE << order));
+
+ if (!inode_hashtable)
+ panic("Failed to allocate inode hash table\n");
+
+ head = inode_hashtable;
+ i = nr_hash;
+ do {
+ INIT_HLIST_HEAD(head);
+ head++;
+ i--;
+ } while (i);
+
+ /* inode slab cache */
+ inode_cachep = kmem_cache_create("inode_cache", sizeof(struct inode),
+ 0, SLAB_HWCACHE_ALIGN, init_once,
+ NULL);
+ if (!inode_cachep)
+ panic("cannot create inode slab cache");
+
+ set_shrinker(DEFAULT_SEEKS, shrink_icache_memory);
+}
+
+void init_special_inode(struct inode *inode, umode_t mode, dev_t rdev)
+{
+ inode->i_mode = mode;
+ if (S_ISCHR(mode)) {
+ inode->i_fop = &def_chr_fops;
+ inode->i_rdev = to_kdev_t(rdev);
+ } else if (S_ISBLK(mode)) {
+ inode->i_fop = &def_blk_fops;
+ inode->i_rdev = to_kdev_t(rdev);
+ } else if (S_ISFIFO(mode))
+ inode->i_fop = &def_fifo_fops;
+ else if (S_ISSOCK(mode))
+ inode->i_fop = &bad_sock_fops;
+ else
+ printk(KERN_DEBUG "init_special_inode: bogus i_mode (%o)\n",
+ mode);
+}
diff --git a/tests/linux/inode-justrej/orig b/tests/linux/inode-justrej/orig
new file mode 100644
index 0000000..299c900
--- /dev/null
+++ b/tests/linux/inode-justrej/orig
@@ -0,0 +1,1353 @@
+/*
+ * linux/fs/inode.c
+ *
+ * (C) 1997 Linus Torvalds
+ */
+
+#include <linux/config.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/dcache.h>
+#include <linux/init.h>
+#include <linux/quotaops.h>
+#include <linux/slab.h>
+#include <linux/writeback.h>
+#include <linux/module.h>
+#include <linux/backing-dev.h>
+#include <linux/wait.h>
+#include <linux/hash.h>
+#include <linux/swap.h>
+#include <linux/security.h>
+
+/*
+ * This is needed for the following functions:
+ * - inode_has_buffers
+ * - invalidate_inode_buffers
+ * - fsync_bdev
+ * - invalidate_bdev
+ *
+ * FIXME: remove all knowledge of the buffer layer from this file
+ */
+#include <linux/buffer_head.h>
+
+/*
+ * New inode.c implementation.
+ *
+ * This implementation has the basic premise of trying
+ * to be extremely low-overhead and SMP-safe, yet be
+ * simple enough to be "obviously correct".
+ *
+ * Famous last words.
+ */
+
+/* inode dynamic allocation 1999, Andrea Arcangeli <andrea@suse.de> */
+
+/* #define INODE_PARANOIA 1 */
+/* #define INODE_DEBUG 1 */
+
+/*
+ * Inode lookup is no longer as critical as it used to be:
+ * most of the lookups are going to be through the dcache.
+ */
+#define I_HASHBITS i_hash_shift
+#define I_HASHMASK i_hash_mask
+
+static unsigned int i_hash_mask;
+static unsigned int i_hash_shift;
+
+/*
+ * Each inode can be on two separate lists. One is
+ * the hash list of the inode, used for lookups. The
+ * other linked list is the "type" list:
+ * "in_use" - valid inode, i_count > 0, i_nlink > 0
+ * "dirty" - as "in_use" but also dirty
+ * "unused" - valid inode, i_count = 0
+ *
+ * A "dirty" list is maintained for each super block,
+ * allowing for low-overhead inode sync() operations.
+ */
+
+LIST_HEAD(inode_in_use);
+LIST_HEAD(inode_unused);
+static struct hlist_head *inode_hashtable;
+static HLIST_HEAD(anon_hash_chain); /* for inodes with NULL i_sb */
+
+/*
+ * A simple spinlock to protect the list manipulations.
+ *
+ * NOTE! You also have to own the lock if you change
+ * the i_state of an inode while it is in use..
+ */
+spinlock_t inode_lock = SPIN_LOCK_UNLOCKED;
+
+/*
+ * iprune_sem provides exclusion between the kswapd or try_to_free_pages
+ * icache shrinking path, and the umount path. Without this exclusion,
+ * by the time prune_icache calls iput for the inode whose pages it has
+ * been invalidating, or by the time it calls clear_inode & destroy_inode
+ * from its final dispose_list, the struct super_block they refer to
+ * (for inode->i_sb->s_op) may already have been freed and reused.
+ */
+static DECLARE_MUTEX(iprune_sem);
+
+/*
+ * Statistics gathering..
+ */
+struct inodes_stat_t inodes_stat;
+
+static kmem_cache_t * inode_cachep;
+
+static struct inode *alloc_inode(struct super_block *sb)
+{
+ static struct address_space_operations empty_aops;
+ static struct inode_operations empty_iops;
+ static struct file_operations empty_fops;
+ struct inode *inode;
+
+ if (sb->s_op->alloc_inode)
+ inode = sb->s_op->alloc_inode(sb);
+ else
+ inode = (struct inode *) kmem_cache_alloc(inode_cachep, SLAB_KERNEL);
+
+ if (inode) {
+ struct address_space * const mapping = &inode->i_data;
+
+ inode->i_sb = sb;
+ inode->i_blkbits = sb->s_blocksize_bits;
+ inode->i_flags = 0;
+ atomic_set(&inode->i_count, 1);
+ inode->i_sock = 0;
+ inode->i_op = &empty_iops;
+ inode->i_fop = &empty_fops;
+ inode->i_nlink = 1;
+ atomic_set(&inode->i_writecount, 0);
+ inode->i_size = 0;
+ inode->i_blocks = 0;
+ inode->i_bytes = 0;
+ inode->i_generation = 0;
+ memset(&inode->i_dquot, 0, sizeof(inode->i_dquot));
+ inode->i_pipe = NULL;
+ inode->i_bdev = NULL;
+ inode->i_rdev = to_kdev_t(0);
+ inode->i_security = NULL;
+ if (security_inode_alloc(inode)) {
+ if (inode->i_sb->s_op->destroy_inode)
+ inode->i_sb->s_op->destroy_inode(inode);
+ else
+ kmem_cache_free(inode_cachep, (inode));
+ return NULL;
+ }
+
+ mapping->a_ops = &empty_aops;
+ mapping->host = inode;
+ mapping->gfp_mask = GFP_HIGHUSER;
+ mapping->dirtied_when = 0;
+ mapping->assoc_mapping = NULL;
+ mapping->backing_dev_info = &default_backing_dev_info;
+ if (sb->s_bdev)
+ mapping->backing_dev_info = sb->s_bdev->bd_inode->i_mapping->backing_dev_info;
+ memset(&inode->u, 0, sizeof(inode->u));
+ inode->i_mapping = mapping;
+ }
+ return inode;
+}
+
+void destroy_inode(struct inode *inode)
+{
+ if (inode_has_buffers(inode))
+ BUG();
+ security_inode_free(inode);
+ if (inode->i_sb->s_op->destroy_inode)
+ inode->i_sb->s_op->destroy_inode(inode);
+ else
+ kmem_cache_free(inode_cachep, (inode));
+}
+
+
+/*
+ * These are initializations that only need to be done
+ * once, because the fields are idempotent across use
+ * of the inode, so let the slab be aware of that.
+ */
+void inode_init_once(struct inode *inode)
+{
+ memset(inode, 0, sizeof(*inode));
+ INIT_HLIST_NODE(&inode->i_hash);
+ INIT_LIST_HEAD(&inode->i_data.clean_pages);
+ INIT_LIST_HEAD(&inode->i_data.dirty_pages);
+ INIT_LIST_HEAD(&inode->i_data.locked_pages);
+ INIT_LIST_HEAD(&inode->i_data.io_pages);
+ INIT_LIST_HEAD(&inode->i_dentry);
+ INIT_LIST_HEAD(&inode->i_devices);
+ sema_init(&inode->i_sem, 1);
+ INIT_RADIX_TREE(&inode->i_data.page_tree, GFP_ATOMIC);
+ rwlock_init(&inode->i_data.page_lock);
+ init_MUTEX(&inode->i_data.i_shared_sem);
+ INIT_LIST_HEAD(&inode->i_data.private_list);
+ spin_lock_init(&inode->i_data.private_lock);
+ INIT_LIST_HEAD(&inode->i_data.i_mmap);
+ INIT_LIST_HEAD(&inode->i_data.i_mmap_shared);
+ spin_lock_init(&inode->i_lock);
+}
+
+static void init_once(void * foo, kmem_cache_t * cachep, unsigned long flags)
+{
+ struct inode * inode = (struct inode *) foo;
+
+ if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
+ SLAB_CTOR_CONSTRUCTOR)
+ inode_init_once(inode);
+}
+
+/*
+ * inode_lock must be held
+ */
+void __iget(struct inode * inode)
+{
+ if (atomic_read(&inode->i_count)) {
+ atomic_inc(&inode->i_count);
+ return;
+ }
+ atomic_inc(&inode->i_count);
+ if (!(inode->i_state & (I_DIRTY|I_LOCK))) {
+ list_del(&inode->i_list);
+ list_add(&inode->i_list, &inode_in_use);
+ }
+ inodes_stat.nr_unused--;
+}
+
+/**
+ * clear_inode - clear an inode
+ * @inode: inode to clear
+ *
+ * This is called by the filesystem to tell us
+ * that the inode is no longer useful. We just
+ * terminate it with extreme prejudice.
+ */
+
+void clear_inode(struct inode *inode)
+{
+ invalidate_inode_buffers(inode);
+
+ if (inode->i_data.nrpages)
+ BUG();
+ if (!(inode->i_state & I_FREEING))
+ BUG();
+ if (inode->i_state & I_CLEAR)
+ BUG();
+ wait_on_inode(inode);
+ DQUOT_DROP(inode);
+ if (inode->i_sb && inode->i_sb->s_op->clear_inode)
+ inode->i_sb->s_op->clear_inode(inode);
+ if (inode->i_bdev)
+ bd_forget(inode);
+ inode->i_state = I_CLEAR;
+}
+
+/*
+ * Dispose-list gets a local list with local inodes in it, so it doesn't
+ * need to worry about list corruption and SMP locks.
+ */
+static void dispose_list(struct list_head *head)
+{
+ int nr_disposed = 0;
+
+ while (!list_empty(head)) {
+ struct inode *inode;
+
+ inode = list_entry(head->next, struct inode, i_list);
+ list_del(&inode->i_list);
+
+ if (inode->i_data.nrpages)
+ truncate_inode_pages(&inode->i_data, 0);
+ clear_inode(inode);
+ destroy_inode(inode);
+ nr_disposed++;
+ }
+ spin_lock(&inode_lock);
+ inodes_stat.nr_inodes -= nr_disposed;
+ spin_unlock(&inode_lock);
+}
+
+/*
+ * Invalidate all inodes for a device.
+ */
+static int invalidate_list(struct list_head *head, struct super_block * sb, struct list_head * dispose)
+{
+ struct list_head *next;
+ int busy = 0, count = 0;
+
+ next = head->next;
+ for (;;) {
+ struct list_head * tmp = next;
+ struct inode * inode;
+
+ next = next->next;
+ if (tmp == head)
+ break;
+ inode = list_entry(tmp, struct inode, i_list);
+ if (inode->i_sb != sb)
+ continue;
+ invalidate_inode_buffers(inode);
+ if (!atomic_read(&inode->i_count)) {
+ hlist_del_init(&inode->i_hash);
+ list_del(&inode->i_list);
+ list_add(&inode->i_list, dispose);
+ inode->i_state |= I_FREEING;
+ count++;
+ continue;
+ }
+ busy = 1;
+ }
+ /* only unused inodes may be cached with i_count zero */
+ inodes_stat.nr_unused -= count;
+ return busy;
+}
+
+/*
+ * This is a two-stage process. First we collect all
+ * offending inodes onto the throw-away list, and in
+ * the second stage we actually dispose of them. This
+ * is because we don't want to sleep while messing
+ * with the global lists..
+ */
+
+/**
+ * invalidate_inodes - discard the inodes on a device
+ * @sb: superblock
+ *
+ * Discard all of the inodes for a given superblock. If the discard
+ * fails because there are busy inodes, then a non-zero value is returned.
+ * If the discard is successful all the inodes have been discarded.
+ */
+
+int invalidate_inodes(struct super_block * sb)
+{
+ int busy;
+ LIST_HEAD(throw_away);
+
+ down(&iprune_sem);
+ spin_lock(&inode_lock);
+ busy = invalidate_list(&inode_in_use, sb, &throw_away);
+ busy |= invalidate_list(&inode_unused, sb, &throw_away);
+ busy |= invalidate_list(&sb->s_dirty, sb, &throw_away);
+ busy |= invalidate_list(&sb->s_io, sb, &throw_away);
+ spin_unlock(&inode_lock);
+
+ dispose_list(&throw_away);
+ up(&iprune_sem);
+
+ return busy;
+}
+
+int invalidate_device(kdev_t dev, int do_sync)
+{
+ struct super_block *sb;
+ struct block_device *bdev = bdget(kdev_t_to_nr(dev));
+ int res;
+
+ if (!bdev)
+ return 0;
+
+ if (do_sync)
+ fsync_bdev(bdev);
+
+ res = 0;
+ sb = get_super(bdev);
+ if (sb) {
+ /*
+ * no need to lock the super, get_super holds the
+ * read semaphore so the filesystem cannot go away
+ * under us (->put_super runs with the write lock
+ * held).
+ */
+ shrink_dcache_sb(sb);
+ res = invalidate_inodes(sb);
+ drop_super(sb);
+ }
+ invalidate_bdev(bdev, 0);
+ bdput(bdev);
+ return res;
+}
+
+static int can_unuse(struct inode *inode)
+{
+ if (inode->i_state)
+ return 0;
+ if (inode_has_buffers(inode))
+ return 0;
+ if (atomic_read(&inode->i_count))
+ return 0;
+ if (inode->i_data.nrpages)
+ return 0;
+ return 1;
+}
+
+/*
+ * Scan `goal' inodes on the unused list for freeable ones. They are moved to
+ * a temporary list and then are freed outside inode_lock by dispose_list().
+ *
+ * Any inodes which are pinned purely because of attached pagecache have their
+ * pagecache removed. We expect the final iput() on that inode to add it to
+ * the front of the inode_unused list. So look for it there and if the
+ * inode is still freeable, proceed. The right inode is found 99.9% of the
+ * time in testing on a 4-way.
+ *
+ * If the inode has metadata buffers attached to mapping->private_list then
+ * try to remove them.
+ */
+static void prune_icache(int nr_to_scan)
+{
+ LIST_HEAD(freeable);
+ int nr_pruned = 0;
+ int nr_scanned;
+ unsigned long reap = 0;
+
+ down(&iprune_sem);
+ spin_lock(&inode_lock);
+ for (nr_scanned = 0; nr_scanned < nr_to_scan; nr_scanned++) {
+ struct inode *inode;
+
+ if (list_empty(&inode_unused))
+ break;
+
+ inode = list_entry(inode_unused.prev, struct inode, i_list);
+
+ if (inode->i_state || atomic_read(&inode->i_count)) {
+ list_move(&inode->i_list, &inode_unused);
+ continue;
+ }
+ if (inode_has_buffers(inode) || inode->i_data.nrpages) {
+ __iget(inode);
+ spin_unlock(&inode_lock);
+ if (remove_inode_buffers(inode))
+ reap += invalidate_inode_pages(&inode->i_data);
+ iput(inode);
+ spin_lock(&inode_lock);
+
+ if (inode != list_entry(inode_unused.next,
+ struct inode, i_list))
+ continue; /* wrong inode or list_empty */
+ if (!can_unuse(inode))
+ continue;
+ }
+ hlist_del_init(&inode->i_hash);
+ list_move(&inode->i_list, &freeable);
+ inode->i_state |= I_FREEING;
+ nr_pruned++;
+ }
+ inodes_stat.nr_unused -= nr_pruned;
+ spin_unlock(&inode_lock);
+
+ dispose_list(&freeable);
+ up(&iprune_sem);
+
+ if (current_is_kswapd)
+ mod_page_state(kswapd_inodesteal, reap);
+ else
+ mod_page_state(pginodesteal, reap);
+}
+
+/*
+ * shrink_icache_memory() will attempt to reclaim some unused inodes. Here,
+ * "unused" means that no dentries are referring to the inodes: the files are
+ * not open and the dcache references to those inodes have already been
+ * reclaimed.
+ *
+ * This function is passed the number of inodes to scan, and it returns the
+ * total number of remaining possibly-reclaimable inodes.
+ */
+static int shrink_icache_memory(int nr, unsigned int gfp_mask)
+{
+ if (nr) {
+ /*
+ * Nasty deadlock avoidance. We may hold various FS locks,
+ * and we don't want to recurse into the FS that called us
+ * in clear_inode() and friends..
+ */
+ if (gfp_mask & __GFP_FS)
+ prune_icache(nr);
+ }
+ return inodes_stat.nr_unused;
+}
+
+void __wait_on_freeing_inode(struct inode *inode);
+/*
+ * Called with the inode lock held.
+ * NOTE: we are not increasing the inode-refcount, you must call __iget()
+ * by hand after calling find_inode now! This simplifies iunique and won't
+ * add any additional branch in the common code.
+ */
+static struct inode * find_inode(struct super_block * sb, struct hlist_head *head, int (*test)(struct inode *, void *), void *data)
+{
+ struct hlist_node *node;
+ struct inode * inode = NULL;
+
+ hlist_for_each (node, head) {
+ prefetch(node->next);
+ inode = hlist_entry(node, struct inode, i_hash);
+ if (inode->i_sb != sb)
+ continue;
+ if (!test(inode, data))
+ continue;
+ if (inode->i_state & (I_FREEING|I_CLEAR)) {
+ __wait_on_freeing_inode(inode);
+ tmp = head;
+ continue;
+ }
+ break;
+ }
+ return node ? inode : NULL;
+}
+
+/*
+ * find_inode_fast is the fast path version of find_inode, see the comment at
+ * iget_locked for details.
+ */
+static struct inode * find_inode_fast(struct super_block * sb, struct hlist_head *head, unsigned long ino)
+{
+ struct hlist_node *node;
+ struct inode * inode = NULL;
+
+ hlist_for_each (node, head) {
+ prefetch(node->next);
+ inode = list_entry(node, struct inode, i_hash);
+ if (inode->i_ino != ino)
+ continue;
+ if (inode->i_sb != sb)
+ continue;
+ if (inode->i_state & (I_FREEING|I_CLEAR)) {
+ __wait_on_freeing_inode(inode);
+ tmp = head;
+ continue;
+ }
+ break;
+ }
+ return node ? inode : NULL;
+}
+
+/**
+ * new_inode - obtain an inode
+ * @sb: superblock
+ *
+ * Allocates a new inode for the given superblock.
+ */
+
+struct inode *new_inode(struct super_block *sb)
+{
+ static unsigned long last_ino;
+ struct inode * inode;
+
+ spin_lock_prefetch(&inode_lock);
+
+ inode = alloc_inode(sb);
+ if (inode) {
+ spin_lock(&inode_lock);
+ inodes_stat.nr_inodes++;
+ list_add(&inode->i_list, &inode_in_use);
+ inode->i_ino = ++last_ino;
+ inode->i_state = 0;
+ spin_unlock(&inode_lock);
+ }
+ return inode;
+}
+
+void unlock_new_inode(struct inode *inode)
+{
+ /*
+ * This is special! We do not need the spinlock
+ * when clearing I_LOCK, because we're guaranteed
+ * that nobody else tries to do anything about the
+ * state of the inode when it is locked, as we
+ * just created it (so there can be no old holders
+ * that haven't tested I_LOCK).
+ */
+ inode->i_state &= ~(I_LOCK|I_NEW);
+ wake_up_inode(inode);
+}
+EXPORT_SYMBOL(unlock_new_inode);
+
+/*
+ * This is called without the inode lock held.. Be careful.
+ *
+ * We no longer cache the sb_flags in i_flags - see fs.h
+ * -- rmk@arm.uk.linux.org
+ */
+static struct inode * get_new_inode(struct super_block *sb, struct hlist_head *head, int (*test)(struct inode *, void *), int (*set)(struct inode *, void *), void *data)
+{
+ struct inode * inode;
+
+ inode = alloc_inode(sb);
+ if (inode) {
+ struct inode * old;
+
+ spin_lock(&inode_lock);
+ /* We released the lock, so.. */
+ old = find_inode(sb, head, test, data);
+ if (!old) {
+ if (set(inode, data))
+ goto set_failed;
+
+ inodes_stat.nr_inodes++;
+ list_add(&inode->i_list, &inode_in_use);
+ hlist_add_head(&inode->i_hash, head);
+ inode->i_state = I_LOCK|I_NEW;
+ spin_unlock(&inode_lock);
+
+ /* Return the locked inode with I_NEW set, the
+ * caller is responsible for filling in the contents
+ */
+ return inode;
+ }
+
+ /*
+ * Uhhuh, somebody else created the same inode under
+ * us. Use the old inode instead of the one we just
+ * allocated.
+ */
+ __iget(old);
+ spin_unlock(&inode_lock);
+ destroy_inode(inode);
+ inode = old;
+ wait_on_inode(inode);
+ }
+ return inode;
+
+set_failed:
+ spin_unlock(&inode_lock);
+ destroy_inode(inode);
+ return NULL;
+}
+
+/*
+ * get_new_inode_fast is the fast path version of get_new_inode, see the
+ * comment at iget_locked for details.
+ */
+static struct inode * get_new_inode_fast(struct super_block *sb, struct hlist_head *head, unsigned long ino)
+{
+ struct inode * inode;
+
+ inode = alloc_inode(sb);
+ if (inode) {
+ struct inode * old;
+
+ spin_lock(&inode_lock);
+ /* We released the lock, so.. */
+ old = find_inode_fast(sb, head, ino);
+ if (!old) {
+ inode->i_ino = ino;
+ inodes_stat.nr_inodes++;
+ list_add(&inode->i_list, &inode_in_use);
+ hlist_add_head(&inode->i_hash, head);
+ inode->i_state = I_LOCK|I_NEW;
+ spin_unlock(&inode_lock);
+
+ /* Return the locked inode with I_NEW set, the
+ * caller is responsible for filling in the contents
+ */
+ return inode;
+ }
+
+ /*
+ * Uhhuh, somebody else created the same inode under
+ * us. Use the old inode instead of the one we just
+ * allocated.
+ */
+ __iget(old);
+ spin_unlock(&inode_lock);
+ destroy_inode(inode);
+ inode = old;
+ wait_on_inode(inode);
+ }
+ return inode;
+}
+
+static inline unsigned long hash(struct super_block *sb, unsigned long hashval)
+{
+ unsigned long tmp = hashval + ((unsigned long) sb / L1_CACHE_BYTES);
+ tmp = tmp + (tmp >> I_HASHBITS);
+ return tmp & I_HASHMASK;
+}
+
+/* Yeah, I know about quadratic hash. Maybe, later. */
+
+/**
+ * iunique - get a unique inode number
+ * @sb: superblock
+ * @max_reserved: highest reserved inode number
+ *
+ * Obtain an inode number that is unique on the system for a given
+ * superblock. This is used by file systems that have no natural
+ * permanent inode numbering system. An inode number is returned that
+ * is higher than the reserved limit but unique.
+ *
+ * BUGS:
+ * With a large number of inodes live on the file system this function
+ * currently becomes quite slow.
+ */
+
+ino_t iunique(struct super_block *sb, ino_t max_reserved)
+{
+ static ino_t counter = 0;
+ struct inode *inode;
+ struct hlist_head * head;
+ ino_t res;
+ spin_lock(&inode_lock);
+retry:
+ if (counter > max_reserved) {
+ head = inode_hashtable + hash(sb,counter);
+ res = counter++;
+ inode = find_inode_fast(sb, head, res);
+ if (!inode) {
+ spin_unlock(&inode_lock);
+ return res;
+ }
+ } else {
+ counter = max_reserved + 1;
+ }
+ goto retry;
+
+}
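+/*
+ * Illustrative sketch (not part of the original fs/inode.c): a filesystem
+ * with no persistent inode numbers might take one from iunique() when it
+ * sets up an in-core inode.  "example_new_inode" and the reserved limit of
+ * 16 are hypothetical; new_inode() and iunique() are the real calls.
+ */
+#if 0	/* illustration only - not compiled */
+static struct inode *example_new_inode(struct super_block *sb)
+{
+ struct inode *inode = new_inode(sb);
+
+ if (inode)
+  /* pick a number above this filesystem's reserved range */
+  inode->i_ino = iunique(sb, 16);
+ return inode;
+}
+#endif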
+
+struct inode *igrab(struct inode *inode)
+{
+ spin_lock(&inode_lock);
+ if (!(inode->i_state & I_FREEING))
+ __iget(inode);
+ else
+ /*
+ * Handle the case where s_op->clear_inode has not been
+ * called yet and somebody calls igrab()
+ * while the inode is being freed.
+ */
+ inode = NULL;
+ spin_unlock(&inode_lock);
+ return inode;
+}
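+/*
+ * Illustrative sketch (not part of the original fs/inode.c): igrab() is the
+ * safe way to take an extra reference when all you hold is a pointer that
+ * may be racing with the final iput().  The helper name is hypothetical.
+ */
+#if 0	/* illustration only - not compiled */
+static void example_pin_inode(struct inode *inode)
+{
+ inode = igrab(inode);
+ if (!inode)
+  return;  /* inode was already being freed */
+ /* ... the inode can be used safely here ... */
+ iput(inode);  /* drop the reference taken by igrab() */
+}
+#endif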
+
+/**
+ * ifind - internal function, you want ilookup5() or iget5().
+ * @sb: super block of file system to search
+ * @hashval: hash value (usually inode number) to search for
+ * @test: callback used for comparisons between inodes
+ * @data: opaque data pointer to pass to @test
+ *
+ * ifind() searches for the inode specified by @hashval and @data in the inode
+ * cache. This is a generalized version of ifind_fast() for file systems where
+ * the inode number is not sufficient for unique identification of an inode.
+ *
+ * If the inode is in the cache, the inode is returned with an incremented
+ * reference count.
+ *
+ * Otherwise NULL is returned.
+ *
+ * Note, @test is called with the inode_lock held, so can't sleep.
+ */
+static inline struct inode *ifind(struct super_block *sb,
+ struct hlist_head *head, int (*test)(struct inode *, void *),
+ void *data)
+{
+ struct inode *inode;
+
+ spin_lock(&inode_lock);
+ inode = find_inode(sb, head, test, data);
+ if (inode) {
+ __iget(inode);
+ spin_unlock(&inode_lock);
+ wait_on_inode(inode);
+ return inode;
+ }
+ spin_unlock(&inode_lock);
+ return NULL;
+}
+
+/**
+ * ifind_fast - internal function, you want ilookup() or iget().
+ * @sb: super block of file system to search
+ * @ino: inode number to search for
+ *
+ * ifind_fast() searches for the inode @ino in the inode cache. This is for
+ * file systems where the inode number is sufficient for unique identification
+ * of an inode.
+ *
+ * If the inode is in the cache, the inode is returned with an incremented
+ * reference count.
+ *
+ * Otherwise NULL is returned.
+ */
+static inline struct inode *ifind_fast(struct super_block *sb,
+ struct hlist_head *head, unsigned long ino)
+{
+ struct inode *inode;
+
+ spin_lock(&inode_lock);
+ inode = find_inode_fast(sb, head, ino);
+ if (inode) {
+ __iget(inode);
+ spin_unlock(&inode_lock);
+ wait_on_inode(inode);
+ return inode;
+ }
+ spin_unlock(&inode_lock);
+ return NULL;
+}
+
+/**
+ * ilookup5 - search for an inode in the inode cache
+ * @sb: super block of file system to search
+ * @hashval: hash value (usually inode number) to search for
+ * @test: callback used for comparisons between inodes
+ * @data: opaque data pointer to pass to @test
+ *
+ * ilookup5() uses ifind() to search for the inode specified by @hashval and
+ * @data in the inode cache. This is a generalized version of ilookup() for
+ * file systems where the inode number is not sufficient for unique
+ * identification of an inode.
+ *
+ * If the inode is in the cache, the inode is returned with an incremented
+ * reference count.
+ *
+ * Otherwise NULL is returned.
+ *
+ * Note, @test is called with the inode_lock held, so can't sleep.
+ */
+struct inode *ilookup5(struct super_block *sb, unsigned long hashval,
+ int (*test)(struct inode *, void *), void *data)
+{
+ struct hlist_head *head = inode_hashtable + hash(sb, hashval);
+
+ return ifind(sb, head, test, data);
+}
+EXPORT_SYMBOL(ilookup5);
+
+/**
+ * ilookup - search for an inode in the inode cache
+ * @sb: super block of file system to search
+ * @ino: inode number to search for
+ *
+ * ilookup() uses ifind_fast() to search for the inode @ino in the inode cache.
+ * This is for file systems where the inode number is sufficient for unique
+ * identification of an inode.
+ *
+ * If the inode is in the cache, the inode is returned with an incremented
+ * reference count.
+ *
+ * Otherwise NULL is returned.
+ */
+struct inode *ilookup(struct super_block *sb, unsigned long ino)
+{
+ struct hlist_head *head = inode_hashtable + hash(sb, ino);
+
+ return ifind_fast(sb, head, ino);
+}
+EXPORT_SYMBOL(ilookup);
+
+/**
+ * iget5_locked - obtain an inode from a mounted file system
+ * @sb: super block of file system
+ * @hashval: hash value (usually inode number) to get
+ * @test: callback used for comparisons between inodes
+ * @set: callback used to initialize a new struct inode
+ * @data: opaque data pointer to pass to @test and @set
+ *
+ * This is iget() without the read_inode() portion of get_new_inode().
+ *
+ * iget5_locked() uses ifind() to search for the inode specified by @hashval
+ * and @data in the inode cache and if present it is returned with an increased
+ * reference count. This is a generalized version of iget_locked() for file
+ * systems where the inode number is not sufficient for unique identification
+ * of an inode.
+ *
+ * If the inode is not in cache, get_new_inode() is called to allocate a new
+ * inode and this is returned locked, hashed, and with the I_NEW flag set. The
+ * file system gets to fill it in before unlocking it via unlock_new_inode().
+ *
+ * Note both @test and @set are called with the inode_lock held, so can't sleep.
+ */
+struct inode *iget5_locked(struct super_block *sb, unsigned long hashval,
+ int (*test)(struct inode *, void *),
+ int (*set)(struct inode *, void *), void *data)
+{
+ struct hlist_head *head = inode_hashtable + hash(sb, hashval);
+ struct inode *inode;
+
+ inode = ifind(sb, head, test, data);
+ if (inode)
+ return inode;
+ /*
+ * get_new_inode() will do the right thing, re-trying the search
+ * in case it had to block at any point.
+ */
+ return get_new_inode(sb, head, test, set, data);
+}
+EXPORT_SYMBOL(iget5_locked);
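+/*
+ * Illustrative sketch (not part of the original fs/inode.c): a filesystem
+ * whose inodes are keyed by something richer than i_ino would pass callbacks
+ * like these to iget5_locked().  The "example_key" structure and the field
+ * choices are hypothetical; the callback signatures match those used above.
+ */
+#if 0	/* illustration only - not compiled */
+struct example_key {
+ unsigned long block;
+ unsigned int generation;
+};
+
+static int example_test(struct inode *inode, void *data)
+{
+ struct example_key *key = data;
+
+ /* called under inode_lock: compare only, never sleep */
+ return inode->i_ino == key->block &&
+        inode->i_generation == key->generation;
+}
+
+static int example_set(struct inode *inode, void *data)
+{
+ struct example_key *key = data;
+
+ inode->i_ino = key->block;
+ inode->i_generation = key->generation;
+ return 0;
+}
+
+/* typical call site:
+ * inode = iget5_locked(sb, key.block, example_test, example_set, &key);
+ */
+#endif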
+
+/**
+ * iget_locked - obtain an inode from a mounted file system
+ * @sb: super block of file system
+ * @ino: inode number to get
+ *
+ * This is iget() without the read_inode() portion of get_new_inode_fast().
+ *
+ * iget_locked() uses ifind_fast() to search for the inode specified by @ino in
+ * the inode cache and if present it is returned with an increased reference
+ * count. This is for file systems where the inode number is sufficient for
+ * unique identification of an inode.
+ *
+ * If the inode is not in cache, get_new_inode_fast() is called to allocate a
+ * new inode and this is returned locked, hashed, and with the I_NEW flag set.
+ * The file system gets to fill it in before unlocking it via
+ * unlock_new_inode().
+ */
+struct inode *iget_locked(struct super_block *sb, unsigned long ino)
+{
+ struct hlist_head *head = inode_hashtable + hash(sb, ino);
+ struct inode *inode;
+
+ inode = ifind_fast(sb, head, ino);
+ if (inode)
+ return inode;
+ /*
+ * get_new_inode_fast() will do the right thing, re-trying the search
+ * in case it had to block at any point.
+ */
+ return get_new_inode_fast(sb, head, ino);
+}
+EXPORT_SYMBOL(iget_locked);
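+/*
+ * Illustrative sketch (not part of the original fs/inode.c): the usual caller
+ * pattern for iget_locked().  A freshly allocated inode comes back with I_NEW
+ * set and must be filled in before unlock_new_inode() releases it.
+ * "example_read_inode" is a hypothetical routine that reads the on-disk data.
+ */
+#if 0	/* illustration only - not compiled */
+static struct inode *example_iget(struct super_block *sb, unsigned long ino)
+{
+ struct inode *inode = iget_locked(sb, ino);
+
+ if (!inode)
+  return NULL;   /* allocation failed */
+ if (!(inode->i_state & I_NEW))
+  return inode;   /* found in the cache */
+
+ if (example_read_inode(inode) < 0) {
+  make_bad_inode(inode);  /* mark it unusable */
+  unlock_new_inode(inode);
+  iput(inode);
+  return NULL;
+ }
+ unlock_new_inode(inode);
+ return inode;
+}
+#endif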
+
+/**
+ * __insert_inode_hash - hash an inode
+ * @inode: unhashed inode
+ * @hashval: unsigned long value used to locate this object in the
+ * inode_hashtable.
+ *
+ * Add an inode to the inode hash for this superblock. If the inode
+ * has no superblock it is added to a separate anonymous chain.
+ */
+
+void __insert_inode_hash(struct inode *inode, unsigned long hashval)
+{
+ struct hlist_head *head = &anon_hash_chain;
+ if (inode->i_sb)
+ head = inode_hashtable + hash(inode->i_sb, hashval);
+ spin_lock(&inode_lock);
+ hlist_add_head(&inode->i_hash, head);
+ spin_unlock(&inode_lock);
+}
+
+/**
+ * remove_inode_hash - remove an inode from the hash
+ * @inode: inode to unhash
+ *
+ * Remove an inode from the superblock or anonymous hash.
+ */
+
+void remove_inode_hash(struct inode *inode)
+{
+ spin_lock(&inode_lock);
+ hlist_del_init(&inode->i_hash);
+ spin_unlock(&inode_lock);
+}
+
+void generic_delete_inode(struct inode *inode)
+{
+ struct super_operations *op = inode->i_sb->s_op;
+
+ hlist_del_init(&inode->i_hash);
+ list_del_init(&inode->i_list);
+ inode->i_state|=I_FREEING;
+ inodes_stat.nr_inodes--;
+ spin_unlock(&inode_lock);
+
+ if (inode->i_data.nrpages)
+ truncate_inode_pages(&inode->i_data, 0);
+
+ security_inode_delete(inode);
+
+ if (op->delete_inode) {
+ void (*delete)(struct inode *) = op->delete_inode;
+ if (!is_bad_inode(inode))
+ DQUOT_INIT(inode);
+ /* s_op->delete_inode internally recalls clear_inode() */
+ delete(inode);
+ } else
+ clear_inode(inode);
+ spin_lock(&inode_lock);
+ list_del_init(&inode->i_hash);
+ spin_unlock(&inode_lock);
+ wake_up_inode(inode);
+ if (inode->i_state != I_CLEAR)
+ BUG();
+ destroy_inode(inode);
+}
+EXPORT_SYMBOL(generic_delete_inode);
+
+static void generic_forget_inode(struct inode *inode)
+{
+ struct super_block *sb = inode->i_sb;
+
+ if (!hlist_unhashed(&inode->i_hash)) {
+ if (!(inode->i_state & (I_DIRTY|I_LOCK))) {
+ list_del(&inode->i_list);
+ list_add(&inode->i_list, &inode_unused);
+ }
+ inodes_stat.nr_unused++;
+ spin_unlock(&inode_lock);
+ if (!sb || (sb->s_flags & MS_ACTIVE))
+ return;
+ write_inode_now(inode, 1);
+ spin_lock(&inode_lock);
+ inodes_stat.nr_unused--;
+ hlist_del_init(&inode->i_hash);
+ }
+ list_del_init(&inode->i_list);
+ inode->i_state|=I_FREEING;
+ inodes_stat.nr_inodes--;
+ spin_unlock(&inode_lock);
+ if (inode->i_data.nrpages)
+ truncate_inode_pages(&inode->i_data, 0);
+ clear_inode(inode);
+ destroy_inode(inode);
+}
+
+/*
+ * Normal UNIX filesystem behaviour: delete the
+ * inode when the usage count drops to zero, and
+ * i_nlink is zero.
+ */
+static void generic_drop_inode(struct inode *inode)
+{
+ if (!inode->i_nlink)
+ generic_delete_inode(inode);
+ else
+ generic_forget_inode(inode);
+}
+
+/*
+ * Called when we're dropping the last reference
+ * to an inode.
+ *
+ * Call the FS "drop()" function, defaulting to
+ * the legacy UNIX filesystem behaviour..
+ *
+ * NOTE! NOTE! NOTE! We're called with the inode lock
+ * held, and the drop function is supposed to release
+ * the lock!
+ */
+static inline void iput_final(struct inode *inode)
+{
+ struct super_operations *op = inode->i_sb->s_op;
+ void (*drop)(struct inode *) = generic_drop_inode;
+
+ if (op && op->drop_inode)
+ drop = op->drop_inode;
+ drop(inode);
+}
+
+/**
+ * iput - put an inode
+ * @inode: inode to put
+ *
+ * Puts an inode, dropping its usage count. If the inode use count hits
+ * zero the inode is also then freed and may be destroyed.
+ */
+
+void iput(struct inode *inode)
+{
+ if (inode) {
+ struct super_operations *op = inode->i_sb->s_op;
+
+ if (inode->i_state == I_CLEAR)
+ BUG();
+
+ if (op && op->put_inode)
+ op->put_inode(inode);
+
+ if (atomic_dec_and_lock(&inode->i_count, &inode_lock))
+ iput_final(inode);
+ }
+}
+
+/**
+ * bmap - find a block number in a file
+ * @inode: inode of file
+ * @block: block to find
+ *
+ * Returns the block number on the device holding the inode that
+ * is the disk block number for the block of the file requested.
+ * That is, asked for block 4 of inode 1 the function will return the
+ * disk block relative to the disk start that holds that block of the
+ * file.
+ */
+
+sector_t bmap(struct inode * inode, sector_t block)
+{
+ sector_t res = 0;
+ if (inode->i_mapping->a_ops->bmap)
+ res = inode->i_mapping->a_ops->bmap(inode->i_mapping, block);
+ return res;
+}
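+/*
+ * Illustrative sketch (not part of the original fs/inode.c): bmap() is what
+ * ultimately services the FIBMAP ioctl, so the mapping can be observed from
+ * user space on filesystems that implement ->bmap (needs CAP_SYS_RAWIO).
+ */
+#if 0	/* illustration only - not compiled */
+#include <stdio.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <sys/ioctl.h>
+#include <linux/fs.h>
+
+int main(int argc, char **argv)
+{
+ int fd, block = 4;  /* logical block 4 of the file */
+
+ if (argc < 2 || (fd = open(argv[1], O_RDONLY)) < 0)
+  return 1;
+ if (ioctl(fd, FIBMAP, &block) < 0) {
+  perror("FIBMAP");
+  close(fd);
+  return 1;
+ }
+ printf("file block 4 is device block %d\n", block);
+ close(fd);
+ return 0;
+}
+#endif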
+
+/*
+ * Return true if the filesystem which backs this inode considers the two
+ * passed timespecs to be sufficiently different to warrant flushing the
+ * altered time out to disk.
+ */
+static int inode_times_differ(struct inode *inode,
+ struct timespec *old, struct timespec *new)
+{
+ if (IS_ONE_SECOND(inode))
+ return old->tv_sec != new->tv_sec;
+ return !timespec_equal(old, new);
+}
+
+/**
+ * update_atime - update the access time
+ * @inode: inode accessed
+ *
+ * Update the accessed time on an inode and mark it for writeback.
+ * This function automatically handles read only file systems and media,
+ * as well as the "noatime" flag and inode specific "noatime" markers.
+ */
+
+void update_atime(struct inode *inode)
+{
+ struct timespec now;
+
+ if (IS_NOATIME(inode))
+ return;
+ if (IS_NODIRATIME(inode) && S_ISDIR(inode->i_mode))
+ return;
+ if (IS_RDONLY(inode))
+ return;
+
+ now = current_kernel_time();
+ if (inode_times_differ(inode, &inode->i_atime, &now)) {
+ inode->i_atime = now;
+ mark_inode_dirty_sync(inode);
+ } else {
+ if (!timespec_equal(&inode->i_atime, &now))
+ inode->i_atime = now;
+ }
+}
+
+/**
+ * inode_update_time - update mtime and, optionally, ctime
+ * @inode: inode accessed
+ * @ctime_too: update ctime too
+ *
+ * Update the mtime on an inode and mark it for writeback.
+ * When @ctime_too is set, update the ctime as well.
+ */
+
+void inode_update_time(struct inode *inode, int ctime_too)
+{
+ struct timespec now = current_kernel_time();
+ int sync_it = 0;
+
+ if (inode_times_differ(inode, &inode->i_mtime, &now))
+ sync_it = 1;
+ inode->i_mtime = now;
+
+ if (ctime_too) {
+ if (inode_times_differ(inode, &inode->i_ctime, &now))
+ sync_it = 1;
+ inode->i_ctime = now;
+ }
+ if (sync_it)
+ mark_inode_dirty_sync(inode);
+}
+EXPORT_SYMBOL(inode_update_time);
+
+int inode_needs_sync(struct inode *inode)
+{
+ if (IS_SYNC(inode))
+ return 1;
+ if (S_ISDIR(inode->i_mode) && IS_DIRSYNC(inode))
+ return 1;
+ return 0;
+}
+EXPORT_SYMBOL(inode_needs_sync);
+
+/*
+ * Quota functions that want to walk the inode lists..
+ */
+#ifdef CONFIG_QUOTA
+
+/* Functions back in dquot.c */
+void put_dquot_list(struct list_head *);
+int remove_inode_dquot_ref(struct inode *, int, struct list_head *);
+
+void remove_dquot_ref(struct super_block *sb, int type)
+{
+ struct inode *inode;
+ struct list_head *act_head;
+ LIST_HEAD(tofree_head);
+
+ if (!sb->dq_op)
+ return; /* nothing to do */
+ spin_lock(&inode_lock); /* This lock is for inodes code */
+ /* We don't have to lock against quota code - test IS_QUOTAINIT is just for speedup... */
+
+ list_for_each(act_head, &inode_in_use) {
+ inode = list_entry(act_head, struct inode, i_list);
+ if (inode->i_sb == sb && IS_QUOTAINIT(inode))
+ remove_inode_dquot_ref(inode, type, &tofree_head);
+ }
+ list_for_each(act_head, &inode_unused) {
+ inode = list_entry(act_head, struct inode, i_list);
+ if (inode->i_sb == sb && IS_QUOTAINIT(inode))
+ remove_inode_dquot_ref(inode, type, &tofree_head);
+ }
+ list_for_each(act_head, &sb->s_dirty) {
+ inode = list_entry(act_head, struct inode, i_list);
+ if (IS_QUOTAINIT(inode))
+ remove_inode_dquot_ref(inode, type, &tofree_head);
+ }
+ list_for_each(act_head, &sb->s_io) {
+ inode = list_entry(act_head, struct inode, i_list);
+ if (IS_QUOTAINIT(inode))
+ remove_inode_dquot_ref(inode, type, &tofree_head);
+ }
+ spin_unlock(&inode_lock);
+
+ put_dquot_list(&tofree_head);
+}
+
+#endif
+
+/*
+ * Hashed waitqueues for wait_on_inode(). The table is pretty small - the
+ * kernel doesn't lock many inodes at the same time.
+ */
+#define I_WAIT_TABLE_ORDER 3
+static struct i_wait_queue_head {
+ wait_queue_head_t wqh;
+} ____cacheline_aligned_in_smp i_wait_queue_heads[1<<I_WAIT_TABLE_ORDER];
+
+/*
+ * Return the address of the waitqueue_head to be used for this inode
+ */
+static wait_queue_head_t *i_waitq_head(struct inode *inode)
+{
+ return &i_wait_queue_heads[hash_ptr(inode, I_WAIT_TABLE_ORDER)].wqh;
+}
+
+void __wait_on_inode(struct inode *inode)
+{
+ DECLARE_WAITQUEUE(wait, current);
+ wait_queue_head_t *wq = i_waitq_head(inode);
+
+ add_wait_queue(wq, &wait);
+repeat:
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ if (inode->i_state & I_LOCK) {
+ schedule();
+ goto repeat;
+ }
+ remove_wait_queue(wq, &wait);
+ __set_current_state(TASK_RUNNING);
+}
+
+void __wait_on_freeing_inode(struct inode *inode)
+{
+ DECLARE_WAITQUEUE(wait, current);
+ wait_queue_head_t *wq = i_waitq_head(inode);
+
+ add_wait_queue(wq, &wait);
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ spin_unlock(&inode_lock);
+ schedule();
+ remove_wait_queue(wq, &wait);
+ current->state = TASK_RUNNING;
+ spin_lock(&inode_lock);
+}
+
+
+void wake_up_inode(struct inode *inode)
+{
+ wait_queue_head_t *wq = i_waitq_head(inode);
+
+ /*
+ * Prevent speculative execution through spin_unlock(&inode_lock);
+ */
+ smp_mb();
+ if (waitqueue_active(wq))
+ wake_up_all(wq);
+}
+
+/*
+ * Initialize the waitqueues and inode hash table.
+ */
+void __init inode_init(unsigned long mempages)
+{
+ struct hlist_head *head;
+ unsigned long order;
+ unsigned int nr_hash;
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(i_wait_queue_heads); i++)
+ init_waitqueue_head(&i_wait_queue_heads[i].wqh);
+
+ mempages >>= (14 - PAGE_SHIFT);
+ mempages *= sizeof(struct list_head);
+ for (order = 0; ((1UL << order) << PAGE_SHIFT) < mempages; order++)
+ ;
+
+ do {
+ unsigned long tmp;
+
+ nr_hash = (1UL << order) * PAGE_SIZE /
+ sizeof(struct hlist_head);
+ i_hash_mask = (nr_hash - 1);
+
+ tmp = nr_hash;
+ i_hash_shift = 0;
+ while ((tmp >>= 1UL) != 0UL)
+ i_hash_shift++;
+
+ inode_hashtable = (struct hlist_head *)
+ __get_free_pages(GFP_ATOMIC, order);
+ } while (inode_hashtable == NULL && --order >= 0);
+
+ printk("Inode-cache hash table entries: %d (order: %ld, %ld bytes)\n",
+ nr_hash, order, (PAGE_SIZE << order));
+
+ if (!inode_hashtable)
+ panic("Failed to allocate inode hash table\n");
+
+ head = inode_hashtable;
+ i = nr_hash;
+ do {
+ INIT_HLIST_HEAD(head);
+ head++;
+ i--;
+ } while (i);
+
+ /* inode slab cache */
+ inode_cachep = kmem_cache_create("inode_cache", sizeof(struct inode),
+ 0, SLAB_HWCACHE_ALIGN, init_once,
+ NULL);
+ if (!inode_cachep)
+ panic("cannot create inode slab cache");
+
+ set_shrinker(DEFAULT_SEEKS, shrink_icache_memory);
+}
+
+void init_special_inode(struct inode *inode, umode_t mode, dev_t rdev)
+{
+ inode->i_mode = mode;
+ if (S_ISCHR(mode)) {
+ inode->i_fop = &def_chr_fops;
+ inode->i_rdev = to_kdev_t(rdev);
+ } else if (S_ISBLK(mode)) {
+ inode->i_fop = &def_blk_fops;
+ inode->i_rdev = to_kdev_t(rdev);
+ } else if (S_ISFIFO(mode))
+ inode->i_fop = &def_fifo_fops;
+ else if (S_ISSOCK(mode))
+ inode->i_fop = &bad_sock_fops;
+ else
+ printk(KERN_DEBUG "init_special_inode: bogus i_mode (%o)\n",
+ mode);
+}
diff --git a/tests/linux/inode-justrej/patch b/tests/linux/inode-justrej/patch
new file mode 100644
index 0000000..ec42e22
--- /dev/null
+++ b/tests/linux/inode-justrej/patch
@@ -0,0 +1,16 @@
+***************
+*** 942,948 ****
+ {
+ struct super_operations *op = inode->i_sb->s_op;
+
+- list_del_init(&inode->i_hash);
+ list_del_init(&inode->i_list);
+ inode->i_state|=I_FREEING;
+ inodes_stat.nr_inodes--;
+--- 953,958 ----
+ {
+ struct super_operations *op = inode->i_sb->s_op;
+
+ list_del_init(&inode->i_list);
+ inode->i_state|=I_FREEING;
+ inodes_stat.nr_inodes--;
diff --git a/tests/linux/inode-justrej/wmerge b/tests/linux/inode-justrej/wmerge
new file mode 100644
index 0000000..1ffda02
--- /dev/null
+++ b/tests/linux/inode-justrej/wmerge
@@ -0,0 +1,1352 @@
+/*
+ * linux/fs/inode.c
+ *
+ * (C) 1997 Linus Torvalds
+ */
+
+#include <linux/config.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/dcache.h>
+#include <linux/init.h>
+#include <linux/quotaops.h>
+#include <linux/slab.h>
+#include <linux/writeback.h>
+#include <linux/module.h>
+#include <linux/backing-dev.h>
+#include <linux/wait.h>
+#include <linux/hash.h>
+#include <linux/swap.h>
+#include <linux/security.h>
+
+/*
+ * This is needed for the following functions:
+ * - inode_has_buffers
+ * - invalidate_inode_buffers
+ * - fsync_bdev
+ * - invalidate_bdev
+ *
+ * FIXME: remove all knowledge of the buffer layer from this file
+ */
+#include <linux/buffer_head.h>
+
+/*
+ * New inode.c implementation.
+ *
+ * This implementation has the basic premise of trying
+ * to be extremely low-overhead and SMP-safe, yet be
+ * simple enough to be "obviously correct".
+ *
+ * Famous last words.
+ */
+
+/* inode dynamic allocation 1999, Andrea Arcangeli <andrea@suse.de> */
+
+/* #define INODE_PARANOIA 1 */
+/* #define INODE_DEBUG 1 */
+
+/*
+ * Inode lookup is no longer as critical as it used to be:
+ * most of the lookups are going to be through the dcache.
+ */
+#define I_HASHBITS i_hash_shift
+#define I_HASHMASK i_hash_mask
+
+static unsigned int i_hash_mask;
+static unsigned int i_hash_shift;
+
+/*
+ * Each inode can be on two separate lists. One is
+ * the hash list of the inode, used for lookups. The
+ * other linked list is the "type" list:
+ * "in_use" - valid inode, i_count > 0, i_nlink > 0
+ * "dirty" - as "in_use" but also dirty
+ * "unused" - valid inode, i_count = 0
+ *
+ * A "dirty" list is maintained for each super block,
+ * allowing for low-overhead inode sync() operations.
+ */
+
+LIST_HEAD(inode_in_use);
+LIST_HEAD(inode_unused);
+static struct hlist_head *inode_hashtable;
+static HLIST_HEAD(anon_hash_chain); /* for inodes with NULL i_sb */
+
+/*
+ * A simple spinlock to protect the list manipulations.
+ *
+ * NOTE! You also have to own the lock if you change
+ * the i_state of an inode while it is in use..
+ */
+spinlock_t inode_lock = SPIN_LOCK_UNLOCKED;
+
+/*
+ * iprune_sem provides exclusion between the kswapd or try_to_free_pages
+ * icache shrinking path, and the umount path. Without this exclusion,
+ * by the time prune_icache calls iput for the inode whose pages it has
+ * been invalidating, or by the time it calls clear_inode & destroy_inode
+ * from its final dispose_list, the struct super_block they refer to
+ * (for inode->i_sb->s_op) may already have been freed and reused.
+ */
+static DECLARE_MUTEX(iprune_sem);
+
+/*
+ * Statistics gathering..
+ */
+struct inodes_stat_t inodes_stat;
+
+static kmem_cache_t * inode_cachep;
+
+static struct inode *alloc_inode(struct super_block *sb)
+{
+ static struct address_space_operations empty_aops;
+ static struct inode_operations empty_iops;
+ static struct file_operations empty_fops;
+ struct inode *inode;
+
+ if (sb->s_op->alloc_inode)
+ inode = sb->s_op->alloc_inode(sb);
+ else
+ inode = (struct inode *) kmem_cache_alloc(inode_cachep, SLAB_KERNEL);
+
+ if (inode) {
+ struct address_space * const mapping = &inode->i_data;
+
+ inode->i_sb = sb;
+ inode->i_blkbits = sb->s_blocksize_bits;
+ inode->i_flags = 0;
+ atomic_set(&inode->i_count, 1);
+ inode->i_sock = 0;
+ inode->i_op = &empty_iops;
+ inode->i_fop = &empty_fops;
+ inode->i_nlink = 1;
+ atomic_set(&inode->i_writecount, 0);
+ inode->i_size = 0;
+ inode->i_blocks = 0;
+ inode->i_bytes = 0;
+ inode->i_generation = 0;
+ memset(&inode->i_dquot, 0, sizeof(inode->i_dquot));
+ inode->i_pipe = NULL;
+ inode->i_bdev = NULL;
+ inode->i_rdev = to_kdev_t(0);
+ inode->i_security = NULL;
+ if (security_inode_alloc(inode)) {
+ if (inode->i_sb->s_op->destroy_inode)
+ inode->i_sb->s_op->destroy_inode(inode);
+ else
+ kmem_cache_free(inode_cachep, (inode));
+ return NULL;
+ }
+
+ mapping->a_ops = &empty_aops;
+ mapping->host = inode;
+ mapping->gfp_mask = GFP_HIGHUSER;
+ mapping->dirtied_when = 0;
+ mapping->assoc_mapping = NULL;
+ mapping->backing_dev_info = &default_backing_dev_info;
+ if (sb->s_bdev)
+ mapping->backing_dev_info = sb->s_bdev->bd_inode->i_mapping->backing_dev_info;
+ memset(&inode->u, 0, sizeof(inode->u));
+ inode->i_mapping = mapping;
+ }
+ return inode;
+}
+
+void destroy_inode(struct inode *inode)
+{
+ if (inode_has_buffers(inode))
+ BUG();
+ security_inode_free(inode);
+ if (inode->i_sb->s_op->destroy_inode)
+ inode->i_sb->s_op->destroy_inode(inode);
+ else
+ kmem_cache_free(inode_cachep, (inode));
+}
+
+
+/*
+ * These are initializations that only need to be done
+ * once, because the fields are idempotent across use
+ * of the inode, so let the slab cache be aware of that.
+ */
+void inode_init_once(struct inode *inode)
+{
+ memset(inode, 0, sizeof(*inode));
+ INIT_HLIST_NODE(&inode->i_hash);
+ INIT_LIST_HEAD(&inode->i_data.clean_pages);
+ INIT_LIST_HEAD(&inode->i_data.dirty_pages);
+ INIT_LIST_HEAD(&inode->i_data.locked_pages);
+ INIT_LIST_HEAD(&inode->i_data.io_pages);
+ INIT_LIST_HEAD(&inode->i_dentry);
+ INIT_LIST_HEAD(&inode->i_devices);
+ sema_init(&inode->i_sem, 1);
+ INIT_RADIX_TREE(&inode->i_data.page_tree, GFP_ATOMIC);
+ rwlock_init(&inode->i_data.page_lock);
+ init_MUTEX(&inode->i_data.i_shared_sem);
+ INIT_LIST_HEAD(&inode->i_data.private_list);
+ spin_lock_init(&inode->i_data.private_lock);
+ INIT_LIST_HEAD(&inode->i_data.i_mmap);
+ INIT_LIST_HEAD(&inode->i_data.i_mmap_shared);
+ spin_lock_init(&inode->i_lock);
+}
+
+static void init_once(void * foo, kmem_cache_t * cachep, unsigned long flags)
+{
+ struct inode * inode = (struct inode *) foo;
+
+ if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
+ SLAB_CTOR_CONSTRUCTOR)
+ inode_init_once(inode);
+}
+
+/*
+ * inode_lock must be held
+ */
+void __iget(struct inode * inode)
+{
+ if (atomic_read(&inode->i_count)) {
+ atomic_inc(&inode->i_count);
+ return;
+ }
+ atomic_inc(&inode->i_count);
+ if (!(inode->i_state & (I_DIRTY|I_LOCK))) {
+ list_del(&inode->i_list);
+ list_add(&inode->i_list, &inode_in_use);
+ }
+ inodes_stat.nr_unused--;
+}
+
+/**
+ * clear_inode - clear an inode
+ * @inode: inode to clear
+ *
+ * This is called by the filesystem to tell us
+ * that the inode is no longer useful. We just
+ * terminate it with extreme prejudice.
+ */
+
+void clear_inode(struct inode *inode)
+{
+ invalidate_inode_buffers(inode);
+
+ if (inode->i_data.nrpages)
+ BUG();
+ if (!(inode->i_state & I_FREEING))
+ BUG();
+ if (inode->i_state & I_CLEAR)
+ BUG();
+ wait_on_inode(inode);
+ DQUOT_DROP(inode);
+ if (inode->i_sb && inode->i_sb->s_op->clear_inode)
+ inode->i_sb->s_op->clear_inode(inode);
+ if (inode->i_bdev)
+ bd_forget(inode);
+ inode->i_state = I_CLEAR;
+}
+
+/*
+ * Dispose-list gets a local list with local inodes in it, so it doesn't
+ * need to worry about list corruption and SMP locks.
+ */
+static void dispose_list(struct list_head *head)
+{
+ int nr_disposed = 0;
+
+ while (!list_empty(head)) {
+ struct inode *inode;
+
+ inode = list_entry(head->next, struct inode, i_list);
+ list_del(&inode->i_list);
+
+ if (inode->i_data.nrpages)
+ truncate_inode_pages(&inode->i_data, 0);
+ clear_inode(inode);
+ destroy_inode(inode);
+ nr_disposed++;
+ }
+ spin_lock(&inode_lock);
+ inodes_stat.nr_inodes -= nr_disposed;
+ spin_unlock(&inode_lock);
+}
+
+/*
+ * Invalidate all inodes for a device.
+ */
+static int invalidate_list(struct list_head *head, struct super_block * sb, struct list_head * dispose)
+{
+ struct list_head *next;
+ int busy = 0, count = 0;
+
+ next = head->next;
+ for (;;) {
+ struct list_head * tmp = next;
+ struct inode * inode;
+
+ next = next->next;
+ if (tmp == head)
+ break;
+ inode = list_entry(tmp, struct inode, i_list);
+ if (inode->i_sb != sb)
+ continue;
+ invalidate_inode_buffers(inode);
+ if (!atomic_read(&inode->i_count)) {
+ hlist_del_init(&inode->i_hash);
+ list_del(&inode->i_list);
+ list_add(&inode->i_list, dispose);
+ inode->i_state |= I_FREEING;
+ count++;
+ continue;
+ }
+ busy = 1;
+ }
+ /* only unused inodes may be cached with i_count zero */
+ inodes_stat.nr_unused -= count;
+ return busy;
+}
+
+/*
+ * This is a two-stage process. First we collect all
+ * offending inodes onto the throw-away list, and in
+ * the second stage we actually dispose of them. This
+ * is because we don't want to sleep while messing
+ * with the global lists..
+ */
+
+/**
+ * invalidate_inodes - discard the inodes on a device
+ * @sb: superblock
+ *
+ * Discard all of the inodes for a given superblock. If the discard
+ * fails because there are busy inodes then a non zero value is returned.
+ * If the discard is successful all the inodes have been discarded.
+ */
+
+int invalidate_inodes(struct super_block * sb)
+{
+ int busy;
+ LIST_HEAD(throw_away);
+
+ down(&iprune_sem);
+ spin_lock(&inode_lock);
+ busy = invalidate_list(&inode_in_use, sb, &throw_away);
+ busy |= invalidate_list(&inode_unused, sb, &throw_away);
+ busy |= invalidate_list(&sb->s_dirty, sb, &throw_away);
+ busy |= invalidate_list(&sb->s_io, sb, &throw_away);
+ spin_unlock(&inode_lock);
+
+ dispose_list(&throw_away);
+ up(&iprune_sem);
+
+ return busy;
+}
+
+int invalidate_device(kdev_t dev, int do_sync)
+{
+ struct super_block *sb;
+ struct block_device *bdev = bdget(kdev_t_to_nr(dev));
+ int res;
+
+ if (!bdev)
+ return 0;
+
+ if (do_sync)
+ fsync_bdev(bdev);
+
+ res = 0;
+ sb = get_super(bdev);
+ if (sb) {
+ /*
+ * no need to lock the super, get_super holds the
+ * read semaphore so the filesystem cannot go away
+ * under us (->put_super runs with the write lock
+ * hold).
+ */
+ shrink_dcache_sb(sb);
+ res = invalidate_inodes(sb);
+ drop_super(sb);
+ }
+ invalidate_bdev(bdev, 0);
+ bdput(bdev);
+ return res;
+}
+
+static int can_unuse(struct inode *inode)
+{
+ if (inode->i_state)
+ return 0;
+ if (inode_has_buffers(inode))
+ return 0;
+ if (atomic_read(&inode->i_count))
+ return 0;
+ if (inode->i_data.nrpages)
+ return 0;
+ return 1;
+}
+
+/*
+ * Scan `goal' inodes on the unused list for freeable ones. They are moved to
+ * a temporary list and then are freed outside inode_lock by dispose_list().
+ *
+ * Any inodes which are pinned purely because of attached pagecache have their
+ * pagecache removed. We expect the final iput() on that inode to add it to
+ * the front of the inode_unused list. So look for it there and if the
+ * inode is still freeable, proceed. The right inode is found 99.9% of the
+ * time in testing on a 4-way.
+ *
+ * If the inode has metadata buffers attached to mapping->private_list then
+ * try to remove them.
+ */
+static void prune_icache(int nr_to_scan)
+{
+ LIST_HEAD(freeable);
+ int nr_pruned = 0;
+ int nr_scanned;
+ unsigned long reap = 0;
+
+ down(&iprune_sem);
+ spin_lock(&inode_lock);
+ for (nr_scanned = 0; nr_scanned < nr_to_scan; nr_scanned++) {
+ struct inode *inode;
+
+ if (list_empty(&inode_unused))
+ break;
+
+ inode = list_entry(inode_unused.prev, struct inode, i_list);
+
+ if (inode->i_state || atomic_read(&inode->i_count)) {
+ list_move(&inode->i_list, &inode_unused);
+ continue;
+ }
+ if (inode_has_buffers(inode) || inode->i_data.nrpages) {
+ __iget(inode);
+ spin_unlock(&inode_lock);
+ if (remove_inode_buffers(inode))
+ reap += invalidate_inode_pages(&inode->i_data);
+ iput(inode);
+ spin_lock(&inode_lock);
+
+ if (inode != list_entry(inode_unused.next,
+ struct inode, i_list))
+ continue; /* wrong inode or list_empty */
+ if (!can_unuse(inode))
+ continue;
+ }
+ hlist_del_init(&inode->i_hash);
+ list_move(&inode->i_list, &freeable);
+ inode->i_state |= I_FREEING;
+ nr_pruned++;
+ }
+ inodes_stat.nr_unused -= nr_pruned;
+ spin_unlock(&inode_lock);
+
+ dispose_list(&freeable);
+ up(&iprune_sem);
+
+ if (current_is_kswapd)
+ mod_page_state(kswapd_inodesteal, reap);
+ else
+ mod_page_state(pginodesteal, reap);
+}
+
+/*
+ * shrink_icache_memory() will attempt to reclaim some unused inodes. Here,
+ * "unused" means that no dentries are referring to the inodes: the files are
+ * not open and the dcache references to those inodes have already been
+ * reclaimed.
+ *
+ * This function is passed the number of inodes to scan, and it returns the
+ * total number of remaining possibly-reclaimable inodes.
+ */
+static int shrink_icache_memory(int nr, unsigned int gfp_mask)
+{
+ if (nr) {
+ /*
+ * Nasty deadlock avoidance. We may hold various FS locks,
+ * and we don't want to recurse into the FS that called us
+ * in clear_inode() and friends..
+ */
+ if (gfp_mask & __GFP_FS)
+ prune_icache(nr);
+ }
+ return inodes_stat.nr_unused;
+}
+
+void __wait_on_freeing_inode(struct inode *inode);
+/*
+ * Called with the inode lock held.
+ * NOTE: we are not increasing the inode-refcount, you must call __iget()
+ * by hand after calling find_inode now! This simplifies iunique and won't
+ * add any additional branch in the common code.
+ */
+static struct inode * find_inode(struct super_block * sb, struct hlist_head *head, int (*test)(struct inode *, void *), void *data)
+{
+ struct hlist_node *node;
+ struct inode * inode = NULL;
+
+ hlist_for_each (node, head) {
+ prefetch(node->next);
+ inode = hlist_entry(node, struct inode, i_hash);
+ if (inode->i_sb != sb)
+ continue;
+ if (!test(inode, data))
+ continue;
+ if (inode->i_state & (I_FREEING|I_CLEAR)) {
+ __wait_on_freeing_inode(inode);
+ tmp = head;
+ continue;
+ }
+ break;
+ }
+ return node ? inode : NULL;
+}
+
+/*
+ * find_inode_fast is the fast path version of find_inode, see the comment at
+ * iget_locked for details.
+ */
+static struct inode * find_inode_fast(struct super_block * sb, struct hlist_head *head, unsigned long ino)
+{
+ struct hlist_node *node;
+ struct inode * inode = NULL;
+
+ hlist_for_each (node, head) {
+ prefetch(node->next);
+ inode = list_entry(node, struct inode, i_hash);
+ if (inode->i_ino != ino)
+ continue;
+ if (inode->i_sb != sb)
+ continue;
+ if (inode->i_state & (I_FREEING|I_CLEAR)) {
+ __wait_on_freeing_inode(inode);
+ tmp = head;
+ continue;
+ }
+ break;
+ }
+ return node ? inode : NULL;
+}
+
+/**
+ * new_inode - obtain an inode
+ * @sb: superblock
+ *
+ * Allocates a new inode for given superblock.
+ */
+
+struct inode *new_inode(struct super_block *sb)
+{
+ static unsigned long last_ino;
+ struct inode * inode;
+
+ spin_lock_prefetch(&inode_lock);
+
+ inode = alloc_inode(sb);
+ if (inode) {
+ spin_lock(&inode_lock);
+ inodes_stat.nr_inodes++;
+ list_add(&inode->i_list, &inode_in_use);
+ inode->i_ino = ++last_ino;
+ inode->i_state = 0;
+ spin_unlock(&inode_lock);
+ }
+ return inode;
+}
+
+void unlock_new_inode(struct inode *inode)
+{
+ /*
+ * This is special! We do not need the spinlock
+ * when clearing I_LOCK, because we're guaranteed
+ * that nobody else tries to do anything about the
+ * state of the inode when it is locked, as we
+ * just created it (so there can be no old holders
+ * that haven't tested I_LOCK).
+ */
+ inode->i_state &= ~(I_LOCK|I_NEW);
+ wake_up_inode(inode);
+}
+EXPORT_SYMBOL(unlock_new_inode);
+
+/*
+ * This is called without the inode lock held.. Be careful.
+ *
+ * We no longer cache the sb_flags in i_flags - see fs.h
+ * -- rmk@arm.uk.linux.org
+ */
+static struct inode * get_new_inode(struct super_block *sb, struct hlist_head *head, int (*test)(struct inode *, void *), int (*set)(struct inode *, void *), void *data)
+{
+ struct inode * inode;
+
+ inode = alloc_inode(sb);
+ if (inode) {
+ struct inode * old;
+
+ spin_lock(&inode_lock);
+ /* We released the lock, so.. */
+ old = find_inode(sb, head, test, data);
+ if (!old) {
+ if (set(inode, data))
+ goto set_failed;
+
+ inodes_stat.nr_inodes++;
+ list_add(&inode->i_list, &inode_in_use);
+ hlist_add_head(&inode->i_hash, head);
+ inode->i_state = I_LOCK|I_NEW;
+ spin_unlock(&inode_lock);
+
+ /* Return the locked inode with I_NEW set, the
+ * caller is responsible for filling in the contents
+ */
+ return inode;
+ }
+
+ /*
+ * Uhhuh, somebody else created the same inode under
+ * us. Use the old inode instead of the one we just
+ * allocated.
+ */
+ __iget(old);
+ spin_unlock(&inode_lock);
+ destroy_inode(inode);
+ inode = old;
+ wait_on_inode(inode);
+ }
+ return inode;
+
+set_failed:
+ spin_unlock(&inode_lock);
+ destroy_inode(inode);
+ return NULL;
+}
+
+/*
+ * get_new_inode_fast is the fast path version of get_new_inode, see the
+ * comment at iget_locked for details.
+ */
+static struct inode * get_new_inode_fast(struct super_block *sb, struct hlist_head *head, unsigned long ino)
+{
+ struct inode * inode;
+
+ inode = alloc_inode(sb);
+ if (inode) {
+ struct inode * old;
+
+ spin_lock(&inode_lock);
+ /* We released the lock, so.. */
+ old = find_inode_fast(sb, head, ino);
+ if (!old) {
+ inode->i_ino = ino;
+ inodes_stat.nr_inodes++;
+ list_add(&inode->i_list, &inode_in_use);
+ hlist_add_head(&inode->i_hash, head);
+ inode->i_state = I_LOCK|I_NEW;
+ spin_unlock(&inode_lock);
+
+ /* Return the locked inode with I_NEW set, the
+ * caller is responsible for filling in the contents
+ */
+ return inode;
+ }
+
+ /*
+ * Uhhuh, somebody else created the same inode under
+ * us. Use the old inode instead of the one we just
+ * allocated.
+ */
+ __iget(old);
+ spin_unlock(&inode_lock);
+ destroy_inode(inode);
+ inode = old;
+ wait_on_inode(inode);
+ }
+ return inode;
+}
+
+static inline unsigned long hash(struct super_block *sb, unsigned long hashval)
+{
+ unsigned long tmp = hashval + ((unsigned long) sb / L1_CACHE_BYTES);
+ tmp = tmp + (tmp >> I_HASHBITS);
+ return tmp & I_HASHMASK;
+}
+
+/* Yeah, I know about quadratic hash. Maybe, later. */
+
+/**
+ * iunique - get a unique inode number
+ * @sb: superblock
+ * @max_reserved: highest reserved inode number
+ *
+ * Obtain an inode number that is unique on the system for a given
+ * superblock. This is used by file systems that have no natural
+ * permanent inode numbering system. An inode number is returned that
+ * is higher than the reserved limit but unique.
+ *
+ * BUGS:
+ * With a large number of inodes live on the file system this function
+ * currently becomes quite slow.
+ */
+
+ino_t iunique(struct super_block *sb, ino_t max_reserved)
+{
+ static ino_t counter = 0;
+ struct inode *inode;
+ struct hlist_head * head;
+ ino_t res;
+ spin_lock(&inode_lock);
+retry:
+ if (counter > max_reserved) {
+ head = inode_hashtable + hash(sb,counter);
+ res = counter++;
+ inode = find_inode_fast(sb, head, res);
+ if (!inode) {
+ spin_unlock(&inode_lock);
+ return res;
+ }
+ } else {
+ counter = max_reserved + 1;
+ }
+ goto retry;
+
+}
+
+struct inode *igrab(struct inode *inode)
+{
+ spin_lock(&inode_lock);
+ if (!(inode->i_state & I_FREEING))
+ __iget(inode);
+ else
+ /*
+ * Handle the case where s_op->clear_inode has not been
+ * called yet and somebody calls igrab()
+ * while the inode is being freed.
+ */
+ inode = NULL;
+ spin_unlock(&inode_lock);
+ return inode;
+}
+
+/**
+ * ifind - internal function, you want ilookup5() or iget5().
+ * @sb: super block of file system to search
+ * @hashval: hash value (usually inode number) to search for
+ * @test: callback used for comparisons between inodes
+ * @data: opaque data pointer to pass to @test
+ *
+ * ifind() searches for the inode specified by @hashval and @data in the inode
+ * cache. This is a generalized version of ifind_fast() for file systems where
+ * the inode number is not sufficient for unique identification of an inode.
+ *
+ * If the inode is in the cache, the inode is returned with an incremented
+ * reference count.
+ *
+ * Otherwise NULL is returned.
+ *
+ * Note, @test is called with the inode_lock held, so can't sleep.
+ */
+static inline struct inode *ifind(struct super_block *sb,
+ struct hlist_head *head, int (*test)(struct inode *, void *),
+ void *data)
+{
+ struct inode *inode;
+
+ spin_lock(&inode_lock);
+ inode = find_inode(sb, head, test, data);
+ if (inode) {
+ __iget(inode);
+ spin_unlock(&inode_lock);
+ wait_on_inode(inode);
+ return inode;
+ }
+ spin_unlock(&inode_lock);
+ return NULL;
+}
+
+/**
+ * ifind_fast - internal function, you want ilookup() or iget().
+ * @sb: super block of file system to search
+ * @ino: inode number to search for
+ *
+ * ifind_fast() searches for the inode @ino in the inode cache. This is for
+ * file systems where the inode number is sufficient for unique identification
+ * of an inode.
+ *
+ * If the inode is in the cache, the inode is returned with an incremented
+ * reference count.
+ *
+ * Otherwise NULL is returned.
+ */
+static inline struct inode *ifind_fast(struct super_block *sb,
+ struct hlist_head *head, unsigned long ino)
+{
+ struct inode *inode;
+
+ spin_lock(&inode_lock);
+ inode = find_inode_fast(sb, head, ino);
+ if (inode) {
+ __iget(inode);
+ spin_unlock(&inode_lock);
+ wait_on_inode(inode);
+ return inode;
+ }
+ spin_unlock(&inode_lock);
+ return NULL;
+}
+
+/**
+ * ilookup5 - search for an inode in the inode cache
+ * @sb: super block of file system to search
+ * @hashval: hash value (usually inode number) to search for
+ * @test: callback used for comparisons between inodes
+ * @data: opaque data pointer to pass to @test
+ *
+ * ilookup5() uses ifind() to search for the inode specified by @hashval and
+ * @data in the inode cache. This is a generalized version of ilookup() for
+ * file systems where the inode number is not sufficient for unique
+ * identification of an inode.
+ *
+ * If the inode is in the cache, the inode is returned with an incremented
+ * reference count.
+ *
+ * Otherwise NULL is returned.
+ *
+ * Note, @test is called with the inode_lock held, so can't sleep.
+ */
+struct inode *ilookup5(struct super_block *sb, unsigned long hashval,
+ int (*test)(struct inode *, void *), void *data)
+{
+ struct hlist_head *head = inode_hashtable + hash(sb, hashval);
+
+ return ifind(sb, head, test, data);
+}
+EXPORT_SYMBOL(ilookup5);
+
+/**
+ * ilookup - search for an inode in the inode cache
+ * @sb: super block of file system to search
+ * @ino: inode number to search for
+ *
+ * ilookup() uses ifind_fast() to search for the inode @ino in the inode cache.
+ * This is for file systems where the inode number is sufficient for unique
+ * identification of an inode.
+ *
+ * If the inode is in the cache, the inode is returned with an incremented
+ * reference count.
+ *
+ * Otherwise NULL is returned.
+ */
+struct inode *ilookup(struct super_block *sb, unsigned long ino)
+{
+ struct hlist_head *head = inode_hashtable + hash(sb, ino);
+
+ return ifind_fast(sb, head, ino);
+}
+EXPORT_SYMBOL(ilookup);
+
+/**
+ * iget5_locked - obtain an inode from a mounted file system
+ * @sb: super block of file system
+ * @hashval: hash value (usually inode number) to get
+ * @test: callback used for comparisons between inodes
+ * @set: callback used to initialize a new struct inode
+ * @data: opaque data pointer to pass to @test and @set
+ *
+ * This is iget() without the read_inode() portion of get_new_inode().
+ *
+ * iget5_locked() uses ifind() to search for the inode specified by @hashval
+ * and @data in the inode cache and if present it is returned with an increased
+ * reference count. This is a generalized version of iget_locked() for file
+ * systems where the inode number is not sufficient for unique identification
+ * of an inode.
+ *
+ * If the inode is not in cache, get_new_inode() is called to allocate a new
+ * inode and this is returned locked, hashed, and with the I_NEW flag set. The
+ * file system gets to fill it in before unlocking it via unlock_new_inode().
+ *
+ * Note both @test and @set are called with the inode_lock held, so can't sleep.
+ */
+struct inode *iget5_locked(struct super_block *sb, unsigned long hashval,
+ int (*test)(struct inode *, void *),
+ int (*set)(struct inode *, void *), void *data)
+{
+ struct hlist_head *head = inode_hashtable + hash(sb, hashval);
+ struct inode *inode;
+
+ inode = ifind(sb, head, test, data);
+ if (inode)
+ return inode;
+ /*
+ * get_new_inode() will do the right thing, re-trying the search
+ * in case it had to block at any point.
+ */
+ return get_new_inode(sb, head, test, set, data);
+}
+EXPORT_SYMBOL(iget5_locked);
+
+/**
+ * iget_locked - obtain an inode from a mounted file system
+ * @sb: super block of file system
+ * @ino: inode number to get
+ *
+ * This is iget() without the read_inode() portion of get_new_inode_fast().
+ *
+ * iget_locked() uses ifind_fast() to search for the inode specified by @ino in
+ * the inode cache and if present it is returned with an increased reference
+ * count. This is for file systems where the inode number is sufficient for
+ * unique identification of an inode.
+ *
+ * If the inode is not in cache, get_new_inode_fast() is called to allocate a
+ * new inode and this is returned locked, hashed, and with the I_NEW flag set.
+ * The file system gets to fill it in before unlocking it via
+ * unlock_new_inode().
+ */
+struct inode *iget_locked(struct super_block *sb, unsigned long ino)
+{
+ struct hlist_head *head = inode_hashtable + hash(sb, ino);
+ struct inode *inode;
+
+ inode = ifind_fast(sb, head, ino);
+ if (inode)
+ return inode;
+ /*
+ * get_new_inode_fast() will do the right thing, re-trying the search
+ * in case it had to block at any point.
+ */
+ return get_new_inode_fast(sb, head, ino);
+}
+EXPORT_SYMBOL(iget_locked);
+
+/**
+ * __insert_inode_hash - hash an inode
+ * @inode: unhashed inode
+ * @hashval: unsigned long value used to locate this object in the
+ * inode_hashtable.
+ *
+ * Add an inode to the inode hash for this superblock. If the inode
+ * has no superblock it is added to a separate anonymous chain.
+ */
+
+void __insert_inode_hash(struct inode *inode, unsigned long hashval)
+{
+ struct hlist_head *head = &anon_hash_chain;
+ if (inode->i_sb)
+ head = inode_hashtable + hash(inode->i_sb, hashval);
+ spin_lock(&inode_lock);
+ hlist_add_head(&inode->i_hash, head);
+ spin_unlock(&inode_lock);
+}
+
+/**
+ * remove_inode_hash - remove an inode from the hash
+ * @inode: inode to unhash
+ *
+ * Remove an inode from the superblock or anonymous hash.
+ */
+
+void remove_inode_hash(struct inode *inode)
+{
+ spin_lock(&inode_lock);
+ hlist_del_init(&inode->i_hash);
+ spin_unlock(&inode_lock);
+}
+
+void generic_delete_inode(struct inode *inode)
+{
+ struct super_operations *op = inode->i_sb->s_op;
+
+<<<---hlist_del_init|||list_del_init===--->>> list_del_init(&inode->i_list);
+ inode->i_state|=I_FREEING;
+ inodes_stat.nr_inodes--;
+ spin_unlock(&inode_lock);
+
+ if (inode->i_data.nrpages)
+ truncate_inode_pages(&inode->i_data, 0);
+
+ security_inode_delete(inode);
+
+ if (op->delete_inode) {
+ void (*delete)(struct inode *) = op->delete_inode;
+ if (!is_bad_inode(inode))
+ DQUOT_INIT(inode);
+ /* s_op->delete_inode internally recalls clear_inode() */
+ delete(inode);
+ } else
+ clear_inode(inode);
+ spin_lock(&inode_lock);
+ list_del_init(&inode->i_hash);
+ spin_unlock(&inode_lock);
+ wake_up_inode(inode);
+ if (inode->i_state != I_CLEAR)
+ BUG();
+ destroy_inode(inode);
+}
+EXPORT_SYMBOL(generic_delete_inode);
+
+static void generic_forget_inode(struct inode *inode)
+{
+ struct super_block *sb = inode->i_sb;
+
+ if (!hlist_unhashed(&inode->i_hash)) {
+ if (!(inode->i_state & (I_DIRTY|I_LOCK))) {
+ list_del(&inode->i_list);
+ list_add(&inode->i_list, &inode_unused);
+ }
+ inodes_stat.nr_unused++;
+ spin_unlock(&inode_lock);
+ if (!sb || (sb->s_flags & MS_ACTIVE))
+ return;
+ write_inode_now(inode, 1);
+ spin_lock(&inode_lock);
+ inodes_stat.nr_unused--;
+ hlist_del_init(&inode->i_hash);
+ }
+ list_del_init(&inode->i_list);
+ inode->i_state|=I_FREEING;
+ inodes_stat.nr_inodes--;
+ spin_unlock(&inode_lock);
+ if (inode->i_data.nrpages)
+ truncate_inode_pages(&inode->i_data, 0);
+ clear_inode(inode);
+ destroy_inode(inode);
+}
+
+/*
+ * Normal UNIX filesystem behaviour: delete the
+ * inode when the usage count drops to zero, and
+ * i_nlink is zero.
+ */
+static void generic_drop_inode(struct inode *inode)
+{
+ if (!inode->i_nlink)
+ generic_delete_inode(inode);
+ else
+ generic_forget_inode(inode);
+}
+
+/*
+ * Called when we're dropping the last reference
+ * to an inode.
+ *
+ * Call the FS "drop()" function, defaulting to
+ * the legacy UNIX filesystem behaviour..
+ *
+ * NOTE! NOTE! NOTE! We're called with the inode lock
+ * held, and the drop function is supposed to release
+ * the lock!
+ */
+static inline void iput_final(struct inode *inode)
+{
+ struct super_operations *op = inode->i_sb->s_op;
+ void (*drop)(struct inode *) = generic_drop_inode;
+
+ if (op && op->drop_inode)
+ drop = op->drop_inode;
+ drop(inode);
+}
+
+/**
+ * iput - put an inode
+ * @inode: inode to put
+ *
+ * Puts an inode, dropping its usage count. If the inode use count hits
+ * zero the inode is also then freed and may be destroyed.
+ */
+
+void iput(struct inode *inode)
+{
+ if (inode) {
+ struct super_operations *op = inode->i_sb->s_op;
+
+ if (inode->i_state == I_CLEAR)
+ BUG();
+
+ if (op && op->put_inode)
+ op->put_inode(inode);
+
+ if (atomic_dec_and_lock(&inode->i_count, &inode_lock))
+ iput_final(inode);
+ }
+}
+
+/**
+ * bmap - find a block number in a file
+ * @inode: inode of file
+ * @block: block to find
+ *
+ * Returns the block number on the device holding the inode that
+ * is the disk block number for the block of the file requested.
+ * That is, asked for block 4 of inode 1 the function will return the
+ * disk block relative to the disk start that holds that block of the
+ * file.
+ */
+
+sector_t bmap(struct inode * inode, sector_t block)
+{
+ sector_t res = 0;
+ if (inode->i_mapping->a_ops->bmap)
+ res = inode->i_mapping->a_ops->bmap(inode->i_mapping, block);
+ return res;
+}
+
+/*
+ * Return true if the filesystem which backs this inode considers the two
+ * passed timespecs to be sufficiently different to warrant flushing the
+ * altered time out to disk.
+ */
+static int inode_times_differ(struct inode *inode,
+ struct timespec *old, struct timespec *new)
+{
+ if (IS_ONE_SECOND(inode))
+ return old->tv_sec != new->tv_sec;
+ return !timespec_equal(old, new);
+}
+
+/**
+ * update_atime - update the access time
+ * @inode: inode accessed
+ *
+ * Update the accessed time on an inode and mark it for writeback.
+ * This function automatically handles read only file systems and media,
+ * as well as the "noatime" flag and inode specific "noatime" markers.
+ */
+
+void update_atime(struct inode *inode)
+{
+ struct timespec now;
+
+ if (IS_NOATIME(inode))
+ return;
+ if (IS_NODIRATIME(inode) && S_ISDIR(inode->i_mode))
+ return;
+ if (IS_RDONLY(inode))
+ return;
+
+ now = current_kernel_time();
+ if (inode_times_differ(inode, &inode->i_atime, &now)) {
+ inode->i_atime = now;
+ mark_inode_dirty_sync(inode);
+ } else {
+ if (!timespec_equal(&inode->i_atime, &now))
+ inode->i_atime = now;
+ }
+}
+
+/**
+ * inode_update_time - update mtime and, optionally, ctime
+ * @inode: inode accessed
+ * @ctime_too: update ctime too
+ *
+ * Update the mtime on an inode and mark it for writeback.
+ * When @ctime_too is set, update the ctime as well.
+ */
+
+void inode_update_time(struct inode *inode, int ctime_too)
+{
+ struct timespec now = current_kernel_time();
+ int sync_it = 0;
+
+ if (inode_times_differ(inode, &inode->i_mtime, &now))
+ sync_it = 1;
+ inode->i_mtime = now;
+
+ if (ctime_too) {
+ if (inode_times_differ(inode, &inode->i_ctime, &now))
+ sync_it = 1;
+ inode->i_ctime = now;
+ }
+ if (sync_it)
+ mark_inode_dirty_sync(inode);
+}
+EXPORT_SYMBOL(inode_update_time);
+
+int inode_needs_sync(struct inode *inode)
+{
+ if (IS_SYNC(inode))
+ return 1;
+ if (S_ISDIR(inode->i_mode) && IS_DIRSYNC(inode))
+ return 1;
+ return 0;
+}
+EXPORT_SYMBOL(inode_needs_sync);
+
+/*
+ * Quota functions that want to walk the inode lists..
+ */
+#ifdef CONFIG_QUOTA
+
+/* Functions back in dquot.c */
+void put_dquot_list(struct list_head *);
+int remove_inode_dquot_ref(struct inode *, int, struct list_head *);
+
+void remove_dquot_ref(struct super_block *sb, int type)
+{
+ struct inode *inode;
+ struct list_head *act_head;
+ LIST_HEAD(tofree_head);
+
+ if (!sb->dq_op)
+ return; /* nothing to do */
+ spin_lock(&inode_lock); /* This lock is for inodes code */
+ /* We don't have to lock against quota code - test IS_QUOTAINIT is just for speedup... */
+
+ list_for_each(act_head, &inode_in_use) {
+ inode = list_entry(act_head, struct inode, i_list);
+ if (inode->i_sb == sb && IS_QUOTAINIT(inode))
+ remove_inode_dquot_ref(inode, type, &tofree_head);
+ }
+ list_for_each(act_head, &inode_unused) {
+ inode = list_entry(act_head, struct inode, i_list);
+ if (inode->i_sb == sb && IS_QUOTAINIT(inode))
+ remove_inode_dquot_ref(inode, type, &tofree_head);
+ }
+ list_for_each(act_head, &sb->s_dirty) {
+ inode = list_entry(act_head, struct inode, i_list);
+ if (IS_QUOTAINIT(inode))
+ remove_inode_dquot_ref(inode, type, &tofree_head);
+ }
+ list_for_each(act_head, &sb->s_io) {
+ inode = list_entry(act_head, struct inode, i_list);
+ if (IS_QUOTAINIT(inode))
+ remove_inode_dquot_ref(inode, type, &tofree_head);
+ }
+ spin_unlock(&inode_lock);
+
+ put_dquot_list(&tofree_head);
+}
+
+#endif
+
+/*
+ * Hashed waitqueues for wait_on_inode(). The table is pretty small - the
+ * kernel doesn't lock many inodes at the same time.
+ */
+#define I_WAIT_TABLE_ORDER 3
+static struct i_wait_queue_head {
+ wait_queue_head_t wqh;
+} ____cacheline_aligned_in_smp i_wait_queue_heads[1<<I_WAIT_TABLE_ORDER];
+
+/*
+ * Return the address of the waitqueue_head to be used for this inode
+ */
+static wait_queue_head_t *i_waitq_head(struct inode *inode)
+{
+ return &i_wait_queue_heads[hash_ptr(inode, I_WAIT_TABLE_ORDER)].wqh;
+}
+
+void __wait_on_inode(struct inode *inode)
+{
+ DECLARE_WAITQUEUE(wait, current);
+ wait_queue_head_t *wq = i_waitq_head(inode);
+
+ add_wait_queue(wq, &wait);
+repeat:
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ if (inode->i_state & I_LOCK) {
+ schedule();
+ goto repeat;
+ }
+ remove_wait_queue(wq, &wait);
+ __set_current_state(TASK_RUNNING);
+}
+
+void __wait_on_freeing_inode(struct inode *inode)
+{
+ DECLARE_WAITQUEUE(wait, current);
+ wait_queue_head_t *wq = i_waitq_head(inode);
+
+ add_wait_queue(wq, &wait);
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ spin_unlock(&inode_lock);
+ schedule();
+ remove_wait_queue(wq, &wait);
+ current->state = TASK_RUNNING;
+ spin_lock(&inode_lock);
+}
+
+
+void wake_up_inode(struct inode *inode)
+{
+ wait_queue_head_t *wq = i_waitq_head(inode);
+
+ /*
+ * Prevent speculative execution through spin_unlock(&inode_lock);
+ */
+ smp_mb();
+ if (waitqueue_active(wq))
+ wake_up_all(wq);
+}
+
+/*
+ * Initialize the waitqueues and inode hash table.
+ */
+void __init inode_init(unsigned long mempages)
+{
+ struct hlist_head *head;
+ unsigned long order;
+ unsigned int nr_hash;
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(i_wait_queue_heads); i++)
+ init_waitqueue_head(&i_wait_queue_heads[i].wqh);
+
+ mempages >>= (14 - PAGE_SHIFT);
+ mempages *= sizeof(struct list_head);
+ for (order = 0; ((1UL << order) << PAGE_SHIFT) < mempages; order++)
+ ;
+
+ do {
+ unsigned long tmp;
+
+ nr_hash = (1UL << order) * PAGE_SIZE /
+ sizeof(struct hlist_head);
+ i_hash_mask = (nr_hash - 1);
+
+ tmp = nr_hash;
+ i_hash_shift = 0;
+ while ((tmp >>= 1UL) != 0UL)
+ i_hash_shift++;
+
+ inode_hashtable = (struct hlist_head *)
+ __get_free_pages(GFP_ATOMIC, order);
+ } while (inode_hashtable == NULL && --order >= 0);
+
+ printk("Inode-cache hash table entries: %d (order: %ld, %ld bytes)\n",
+ nr_hash, order, (PAGE_SIZE << order));
+
+ if (!inode_hashtable)
+ panic("Failed to allocate inode hash table\n");
+
+ head = inode_hashtable;
+ i = nr_hash;
+ do {
+ INIT_HLIST_HEAD(head);
+ head++;
+ i--;
+ } while (i);
+
+ /* inode slab cache */
+ inode_cachep = kmem_cache_create("inode_cache", sizeof(struct inode),
+ 0, SLAB_HWCACHE_ALIGN, init_once,
+ NULL);
+ if (!inode_cachep)
+ panic("cannot create inode slab cache");
+
+ set_shrinker(DEFAULT_SEEKS, shrink_icache_memory);
+}
+
+void init_special_inode(struct inode *inode, umode_t mode, dev_t rdev)
+{
+ inode->i_mode = mode;
+ if (S_ISCHR(mode)) {
+ inode->i_fop = &def_chr_fops;
+ inode->i_rdev = to_kdev_t(rdev);
+ } else if (S_ISBLK(mode)) {
+ inode->i_fop = &def_blk_fops;
+ inode->i_rdev = to_kdev_t(rdev);
+ } else if (S_ISFIFO(mode))
+ inode->i_fop = &def_fifo_fops;
+ else if (S_ISSOCK(mode))
+ inode->i_fop = &bad_sock_fops;
+ else
+ printk(KERN_DEBUG "init_special_inode: bogus i_mode (%o)\n",
+ mode);
+}
diff --git a/tests/linux/md-autostart/merge b/tests/linux/md-autostart/merge
new file mode 100644
index 0000000..b3bde61
--- /dev/null
+++ b/tests/linux/md-autostart/merge
@@ -0,0 +1,4025 @@
+/*
+ md.c : Multiple Devices driver for Linux
+ Copyright (C) 1998, 1999, 2000 Ingo Molnar
+
+ completely rewritten, based on the MD driver code from Marc Zyngier
+
+ Changes:
+
+ - RAID-1/RAID-5 extensions by Miguel de Icaza, Gadi Oxman, Ingo Molnar
+ - boot support for linear and striped mode by Harald Hoyer <HarryH@Royal.Net>
+ - kerneld support by Boris Tobotras <boris@xtalk.msk.su>
+ - kmod support by: Cyrus Durgin
+ - RAID0 bugfixes: Mark Anthony Lisher <markal@iname.com>
+ - Devfs support by Richard Gooch <rgooch@atnf.csiro.au>
+
+ - lots of fixes and improvements to the RAID1/RAID5 and generic
+ RAID code (such as request based resynchronization):
+
+ Neil Brown <neilb@cse.unsw.edu.au>.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2, or (at your option)
+ any later version.
+
+ You should have received a copy of the GNU General Public License
+ (for example /usr/src/linux/COPYING); if not, write to the Free
+ Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+*/
+
+#include <linux/module.h>
+#include <linux/config.h>
+#include <linux/raid/md.h>
+#include <linux/sysctl.h>
+#include <linux/raid/xor.h>
+#include <linux/devfs_fs_kernel.h>
+
+#include <linux/init.h>
+
+#ifdef CONFIG_KMOD
+#include <linux/kmod.h>
+#endif
+
+#define __KERNEL_SYSCALLS__
+#include <linux/unistd.h>
+
+#include <asm/unaligned.h>
+
+#define MAJOR_NR MD_MAJOR
+#define MD_DRIVER
+
+#include <linux/blk.h>
+
+#define DEBUG 0
+#if DEBUG
+# define dprintk(x...) printk(x)
+#else
+# define dprintk(x...) do { } while(0)
+#endif
+
+#ifndef MODULE
+static void autostart_arrays (void);
+#endif
+
+static mdk_personality_t *pers[MAX_PERSONALITY];
+
+/*
+ * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit'
+ * is 100 KB/sec, so the extra system load does not show up that much.
+ * Increase it if you want to have more _guaranteed_ speed. Note that
+ * the RAID driver will use the maximum available bandwidth if the IO
+ * subsystem is idle. There is also an 'absolute maximum' reconstruction
+ * speed limit - in case reconstruction slows down your system despite
+ * idle IO detection.
+ *
+ * you can change it via /proc/sys/dev/raid/speed_limit_min and _max.
+ */
+
+static int sysctl_speed_limit_min = 100;
+static int sysctl_speed_limit_max = 100000;
+
+static struct ctl_table_header *raid_table_header;
+
+static ctl_table raid_table[] = {
+ {DEV_RAID_SPEED_LIMIT_MIN, "speed_limit_min",
+ &sysctl_speed_limit_min, sizeof(int), 0644, NULL, &proc_dointvec},
+ {DEV_RAID_SPEED_LIMIT_MAX, "speed_limit_max",
+ &sysctl_speed_limit_max, sizeof(int), 0644, NULL, &proc_dointvec},
+ {0}
+};
+
+static ctl_table raid_dir_table[] = {
+ {DEV_RAID, "raid", NULL, 0, 0555, raid_table},
+ {0}
+};
+
+static ctl_table raid_root_table[] = {
+ {CTL_DEV, "dev", NULL, 0, 0555, raid_dir_table},
+ {0}
+};
+
+/*
+ * these have to be allocated separately because external
+ * subsystems want to have a pre-defined structure
+ */
+struct hd_struct md_hd_struct[MAX_MD_DEVS];
+static int md_blocksizes[MAX_MD_DEVS];
+static int md_hardsect_sizes[MAX_MD_DEVS];
+static void md_recover_arrays(void);
+static mdk_thread_t *md_recovery_thread;
+
+int md_size[MAX_MD_DEVS];
+
+static struct block_device_operations md_fops;
+static devfs_handle_t devfs_handle;
+
+static struct gendisk md_gendisk=
+{
+ major: MD_MAJOR,
+ major_name: "md",
+ minor_shift: 0,
+ max_p: 1,
+ part: md_hd_struct,
+ sizes: md_size,
+ nr_real: MAX_MD_DEVS,
+ real_devices: NULL,
+ next: NULL,
+ fops: &md_fops,
+};
+
+/*
+ * Enables iteration over all existing md arrays.
+ * all_mddevs_lock protects this list as well as mddev_map.
+ */
+static MD_LIST_HEAD(all_mddevs);
+static spinlock_t all_mddevs_lock = SPIN_LOCK_UNLOCKED;
+
+
+/*
+ * iterates through all used mddevs in the system.
+ * We take care to grab the all_mddevs_lock whenever navigating
+ * the list, and to always hold a refcount when unlocked.
+ * Any code which breaks out of this loop while owning
+ * a reference to the current mddev must mddev_put it.
+ */
+#define ITERATE_MDDEV(mddev,tmp) \
+ \
+ for (spin_lock(&all_mddevs_lock), \
+ (tmp = all_mddevs.next), \
+ (mddev = NULL); \
+ (void)(tmp != &all_mddevs && \
+ mddev_get(list_entry(tmp, mddev_t, all_mddevs))),\
+ spin_unlock(&all_mddevs_lock), \
+ (mddev ? mddev_put(mddev):(void)NULL), \
+ (mddev = list_entry(tmp, mddev_t, all_mddevs)), \
+ (tmp != &all_mddevs); \
+ spin_lock(&all_mddevs_lock), \
+ (tmp = tmp->next) \
+ )
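+
+/*
+ * Usage sketch (illustrative only; wants_this_array() is a hypothetical
+ * predicate, not part of this driver).  The loop condition above takes a
+ * reference on each array and drops the previous one, so code that breaks
+ * out early still owns 'mddev' and has to drop it itself:
+ *
+ *	mddev_t *mddev;
+ *	struct md_list_head *tmp;
+ *
+ *	ITERATE_MDDEV(mddev,tmp) {
+ *		if (wants_this_array(mddev)) {
+ *			mddev_put(mddev);
+ *			break;
+ *		}
+ *	}
+ */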
+
+static mddev_t *mddev_map[MAX_MD_DEVS];
+
+static int md_fail_request (request_queue_t *q, struct bio *bio)
+{
+ bio_io_error(bio);
+ return 0;
+}
+
+static inline mddev_t *mddev_get(mddev_t *mddev)
+{
+ atomic_inc(&mddev->active);
+ return mddev;
+}
+
+static void mddev_put(mddev_t *mddev)
+{
+ if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock))
+ return;
+ if (!mddev->sb && list_empty(&mddev->disks)) {
+ list_del(&mddev->all_mddevs);
+ mddev_map[mdidx(mddev)] = NULL;
+ kfree(mddev);
+ MOD_DEC_USE_COUNT;
+ }
+ spin_unlock(&all_mddevs_lock);
+}
+
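+/*
+ * Find the mddev for 'unit', creating it if it does not exist yet.
+ * The new mddev is allocated with all_mddevs_lock dropped and the lookup
+ * is then retried, so a racing creator either wins the slot (and our
+ * allocation is simply freed) or we install the freshly allocated mddev.
+ */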
+static mddev_t * mddev_find(int unit)
+{
+ mddev_t *mddev, *new = NULL;
+
+ retry:
+ spin_lock(&all_mddevs_lock);
+ if (mddev_map[unit]) {
+ mddev = mddev_get(mddev_map[unit]);
+ spin_unlock(&all_mddevs_lock);
+ if (new)
+ kfree(new);
+ return mddev;
+ }
+ if (new) {
+ mddev_map[unit] = new;
+ list_add(&new->all_mddevs, &all_mddevs);
+ spin_unlock(&all_mddevs_lock);
+ MOD_INC_USE_COUNT;
+ return new;
+ }
+ spin_unlock(&all_mddevs_lock);
+
+ new = (mddev_t *) kmalloc(sizeof(*new), GFP_KERNEL);
+ if (!new)
+ return NULL;
+
+ memset(new, 0, sizeof(*new));
+
+ new->__minor = unit;
+ init_MUTEX(&new->reconfig_sem);
+ MD_INIT_LIST_HEAD(&new->disks);
+ MD_INIT_LIST_HEAD(&new->all_mddevs);
+ atomic_set(&new->active, 1);
+
+ goto retry;
+}
+
+static inline int mddev_lock(mddev_t * mddev)
+{
+ return down_interruptible(&mddev->reconfig_sem);
+}
+
+static inline int mddev_trylock(mddev_t * mddev)
+{
+ return down_trylock(&mddev->reconfig_sem);
+}
+
+static inline void mddev_unlock(mddev_t * mddev)
+{
+ up(&mddev->reconfig_sem);
+}
+
+mdk_rdev_t * find_rdev_nr(mddev_t *mddev, int nr)
+{
+ mdk_rdev_t * rdev;
+ struct md_list_head *tmp;
+
+ ITERATE_RDEV(mddev,rdev,tmp) {
+ if (rdev->desc_nr == nr)
+ return rdev;
+ }
+ return NULL;
+}
+
+mdk_rdev_t * find_rdev(mddev_t * mddev, kdev_t dev)
+{
+ struct md_list_head *tmp;
+ mdk_rdev_t *rdev;
+
+ ITERATE_RDEV(mddev,rdev,tmp) {
+ if (rdev->dev == dev)
+ return rdev;
+ }
+ return NULL;
+}
+
+static MD_LIST_HEAD(device_names);
+
+char * partition_name(kdev_t dev)
+{
+ struct gendisk *hd;
+ static char nomem [] = "<nomem>";
+ dev_name_t *dname;
+ struct md_list_head *tmp;
+
+ list_for_each(tmp, &device_names) {
+ dname = md_list_entry(tmp, dev_name_t, list);
+ if (dname->dev == dev)
+ return dname->name;
+ }
+
+ dname = (dev_name_t *) kmalloc(sizeof(*dname), GFP_KERNEL);
+
+ if (!dname)
+ return nomem;
+ /*
+ * ok, add this new device name to the list
+ */
+ hd = get_gendisk (dev);
+ dname->name = NULL;
+ if (hd)
+ dname->name = disk_name (hd, MINOR(dev), dname->namebuf);
+ if (!dname->name) {
+ sprintf (dname->namebuf, "[dev %s]", kdevname(dev));
+ dname->name = dname->namebuf;
+ }
+
+ dname->dev = dev;
+ md_list_add(&dname->list, &device_names);
+
+ return dname->name;
+}
+
+static unsigned int calc_dev_sboffset(kdev_t dev, mddev_t *mddev,
+ int persistent)
+{
+ unsigned int size = 0;
+
+ if (blk_size[MAJOR(dev)])
+ size = blk_size[MAJOR(dev)][MINOR(dev)];
+ if (persistent)
+ size = MD_NEW_SIZE_BLOCKS(size);
+ return size;
+}
+
+static unsigned int calc_dev_size(kdev_t dev, mddev_t *mddev, int persistent)
+{
+ unsigned int size;
+
+ size = calc_dev_sboffset(dev, mddev, persistent);
+ if (!mddev->sb) {
+ MD_BUG();
+ return size;
+ }
+ if (mddev->sb->chunk_size)
+ size &= ~(mddev->sb->chunk_size/1024 - 1);
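+	/*
+	 * (Worked example: with chunk_size = 65536 the mask is ~(64 - 1),
+	 * so a size of 1000030 KB is rounded down to 1000000 KB, i.e. a
+	 * whole number of 64 KB chunks.)
+	 */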
+ return size;
+}
+
+static unsigned int zoned_raid_size(mddev_t *mddev)
+{
+ unsigned int mask;
+ mdk_rdev_t * rdev;
+ struct md_list_head *tmp;
+
+ if (!mddev->sb) {
+ MD_BUG();
+ return -EINVAL;
+ }
+ /*
+ * do size and offset calculations.
+ */
+ mask = ~(mddev->sb->chunk_size/1024 - 1);
+
+ ITERATE_RDEV(mddev,rdev,tmp) {
+ rdev->size &= mask;
+ md_size[mdidx(mddev)] += rdev->size;
+ }
+ return 0;
+}
+
+static void remove_descriptor(mdp_disk_t *disk, mdp_super_t *sb)
+{
+ if (disk_active(disk)) {
+ sb->working_disks--;
+ } else {
+ if (disk_spare(disk)) {
+ sb->spare_disks--;
+ sb->working_disks--;
+ } else {
+ sb->failed_disks--;
+ }
+ }
+ sb->nr_disks--;
+ disk->major = 0;
+ disk->minor = 0;
+ mark_disk_removed(disk);
+}
+
+#define BAD_MAGIC KERN_ERR \
+"md: invalid raid superblock magic on %s\n"
+
+#define BAD_MINOR KERN_ERR \
+"md: %s: invalid raid minor (%x)\n"
+
+#define OUT_OF_MEM KERN_ALERT \
+"md: out of memory.\n"
+
+#define NO_SB KERN_ERR \
+"md: disabled device %s, could not read superblock.\n"
+
+#define BAD_CSUM KERN_WARNING \
+"md: invalid superblock checksum on %s\n"
+
+static int alloc_array_sb(mddev_t * mddev)
+{
+ if (mddev->sb) {
+ MD_BUG();
+ return 0;
+ }
+
+ mddev->sb = (mdp_super_t *) __get_free_page (GFP_KERNEL);
+ if (!mddev->sb)
+ return -ENOMEM;
+ md_clear_page(mddev->sb);
+ return 0;
+}
+
+static int alloc_disk_sb(mdk_rdev_t * rdev)
+{
+ if (rdev->sb)
+ MD_BUG();
+
+ rdev->sb_page = alloc_page(GFP_KERNEL);
+ if (!rdev->sb_page) {
+ printk(OUT_OF_MEM);
+ return -EINVAL;
+ }
+ rdev->sb = (mdp_super_t *) page_address(rdev->sb_page);
+
+ return 0;
+}
+
+static void free_disk_sb(mdk_rdev_t * rdev)
+{
+ if (rdev->sb_page) {
+ page_cache_release(rdev->sb_page);
+ rdev->sb = NULL;
+ rdev->sb_page = NULL;
+ rdev->sb_offset = 0;
+ rdev->size = 0;
+ } else {
+ if (!rdev->faulty)
+ MD_BUG();
+ }
+}
+
+
+static void bh_complete(struct buffer_head *bh, int uptodate)
+{
+
+ if (uptodate)
+ set_bit(BH_Uptodate, &bh->b_state);
+
+ complete((struct completion*)bh->b_private);
+}
+
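+/*
+ * Synchronously read or write one chunk of a device: wrap 'page' in a
+ * temporary on-stack buffer_head, submit it with generic_make_request()
+ * and sleep on a completion until bh_complete() marks the I/O done.
+ * Returns non-zero if the buffer ended up uptodate.
+ */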
+static int sync_page_io(kdev_t dev, unsigned long sector, int size,
+ struct page *page, int rw)
+{
+ struct buffer_head bh;
+ struct completion event;
+
+ init_completion(&event);
+ init_buffer(&bh, bh_complete, &event);
+ bh.b_rdev = dev;
+ bh.b_rsector = sector;
+ bh.b_state = (1 << BH_Req) | (1 << BH_Mapped) | (1 << BH_Lock);
+ bh.b_size = size;
+ bh.b_page = page;
+ bh.b_reqnext = NULL;
+ bh.b_data = page_address(page);
+ generic_make_request(rw, &bh);
+
+ run_task_queue(&tq_disk);
+ wait_for_completion(&event);
+
+ return test_bit(BH_Uptodate, &bh.b_state);
+}
+
+static int read_disk_sb(mdk_rdev_t * rdev)
+{
+ int ret = -EINVAL;
+ kdev_t dev = rdev->dev;
+ unsigned long sb_offset;
+
+ if (!rdev->sb) {
+ MD_BUG();
+ goto abort;
+ }
+
+ /*
+ * Calculate the position of the superblock,
+ * it's at the end of the disk
+ */
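+	/*
+	 * (Illustrative, assuming the usual 0.90 layout where
+	 * MD_NEW_SIZE_BLOCKS() rounds the device size down to a 64 KB
+	 * boundary and then steps back one 64 KB block: for a 4194304 KB
+	 * partition the superblock would start at offset 4194240 KB.)
+	 */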
+ sb_offset = calc_dev_sboffset(rdev->dev, rdev->mddev, 1);
+ rdev->sb_offset = sb_offset;
+
+ if (!sync_page_io(dev, sb_offset<<1, MD_SB_BYTES, rdev->sb_page, READ)) {
+ printk(NO_SB,partition_name(dev));
+ return -EINVAL;
+ }
+ printk(KERN_INFO " [events: %08lx]\n", (unsigned long)rdev->sb->events_lo);
+ ret = 0;
+abort:
+ return ret;
+}
+
+static unsigned int calc_sb_csum(mdp_super_t * sb)
+{
+ unsigned int disk_csum, csum;
+
+ disk_csum = sb->sb_csum;
+ sb->sb_csum = 0;
+ csum = csum_partial((void *)sb, MD_SB_BYTES, 0);
+ sb->sb_csum = disk_csum;
+ return csum;
+}
+
+/*
+ * Check one RAID superblock for generic plausibility
+ */
+
+static int check_disk_sb(mdk_rdev_t * rdev)
+{
+ mdp_super_t *sb;
+ int ret = -EINVAL;
+
+ sb = rdev->sb;
+ if (!sb) {
+ MD_BUG();
+ goto abort;
+ }
+
+ if (sb->md_magic != MD_SB_MAGIC) {
+ printk(BAD_MAGIC, partition_name(rdev->dev));
+ goto abort;
+ }
+
+ if (sb->md_minor >= MAX_MD_DEVS) {
+ printk(BAD_MINOR, partition_name(rdev->dev), sb->md_minor);
+ goto abort;
+ }
+
+ if (calc_sb_csum(sb) != sb->sb_csum) {
+ printk(BAD_CSUM, partition_name(rdev->dev));
+ goto abort;
+ }
+ ret = 0;
+abort:
+ return ret;
+}
+
+static kdev_t dev_unit(kdev_t dev)
+{
+ unsigned int mask;
+ struct gendisk *hd = get_gendisk(dev);
+
+ if (!hd)
+ return 0;
+ mask = ~((1 << hd->minor_shift) - 1);
+
+ return MKDEV(MAJOR(dev), MINOR(dev) & mask);
+}
+
+static mdk_rdev_t * match_dev_unit(mddev_t *mddev, kdev_t dev)
+{
+ struct md_list_head *tmp;
+ mdk_rdev_t *rdev;
+
+ ITERATE_RDEV(mddev,rdev,tmp)
+ if (dev_unit(rdev->dev) == dev_unit(dev))
+ return rdev;
+
+ return NULL;
+}
+
+static int match_mddev_units(mddev_t *mddev1, mddev_t *mddev2)
+{
+ struct md_list_head *tmp;
+ mdk_rdev_t *rdev;
+
+ ITERATE_RDEV(mddev1,rdev,tmp)
+ if (match_dev_unit(mddev2, rdev->dev))
+ return 1;
+
+ return 0;
+}
+
+static MD_LIST_HEAD(all_raid_disks);
+static MD_LIST_HEAD(pending_raid_disks);
+
+static void bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev)
+{
+ mdk_rdev_t *same_pdev;
+
+ if (rdev->mddev) {
+ MD_BUG();
+ return;
+ }
+ same_pdev = match_dev_unit(mddev, rdev->dev);
+ if (same_pdev)
+ printk( KERN_WARNING
+"md%d: WARNING: %s appears to be on the same physical disk as %s. True\n"
+" protection against single-disk failure might be compromised.\n",
+ mdidx(mddev), partition_name(rdev->dev),
+ partition_name(same_pdev->dev));
+
+ md_list_add(&rdev->same_set, &mddev->disks);
+ rdev->mddev = mddev;
+ printk(KERN_INFO "md: bind<%s>\n", partition_name(rdev->dev));
+}
+
+static void unbind_rdev_from_array(mdk_rdev_t * rdev)
+{
+ if (!rdev->mddev) {
+ MD_BUG();
+ return;
+ }
+ list_del_init(&rdev->same_set);
+ printk(KERN_INFO "md: unbind<%s>\n", partition_name(rdev->dev));
+ rdev->mddev = NULL;
+}
+
+/*
+ * prevent the device from being mounted, repartitioned or
+ * otherwise reused by a RAID array (or any other kernel
+ * subsystem), by opening the device. [simply getting an
+ * inode is not enough, the SCSI module usage code needs
+ * an explicit open() on the device]
+ */
+static int lock_rdev(mdk_rdev_t *rdev)
+{
+ int err = 0;
+ struct block_device *bdev;
+
+ bdev = bdget(rdev->dev);
+ if (!bdev)
+ return -ENOMEM;
+ err = blkdev_get(bdev, FMODE_READ|FMODE_WRITE, 0, BDEV_RAW);
+ if (!err)
+ rdev->bdev = bdev;
+ return err;
+}
+
+static void unlock_rdev(mdk_rdev_t *rdev)
+{
+ struct block_device *bdev = rdev->bdev;
+ rdev->bdev = NULL;
+ if (!bdev)
+ MD_BUG();
+ blkdev_put(bdev, BDEV_RAW);
+}
+
+void md_autodetect_dev(kdev_t dev);
+
+static void export_rdev(mdk_rdev_t * rdev)
+{
+ printk(KERN_INFO "md: export_rdev(%s)\n",partition_name(rdev->dev));
+ if (rdev->mddev)
+ MD_BUG();
+ unlock_rdev(rdev);
+ free_disk_sb(rdev);
+ list_del_init(&rdev->all);
+ if (!list_empty(&rdev->pending)) {
+ printk(KERN_INFO "md: (%s was pending)\n",
+ partition_name(rdev->dev));
+ list_del_init(&rdev->pending);
+ }
+#ifndef MODULE
+ md_autodetect_dev(rdev->dev);
+#endif
+ rdev->dev = 0;
+ rdev->faulty = 0;
+ kfree(rdev);
+}
+
+static void kick_rdev_from_array(mdk_rdev_t * rdev)
+{
+ unbind_rdev_from_array(rdev);
+ export_rdev(rdev);
+}
+
+static void export_array(mddev_t *mddev)
+{
+ struct md_list_head *tmp;
+ mdk_rdev_t *rdev;
+ mdp_super_t *sb = mddev->sb;
+
+ if (mddev->sb) {
+ mddev->sb = NULL;
+ free_page((unsigned long) sb);
+ }
+
+ ITERATE_RDEV(mddev,rdev,tmp) {
+ if (!rdev->mddev) {
+ MD_BUG();
+ continue;
+ }
+ kick_rdev_from_array(rdev);
+ }
+ if (!list_empty(&mddev->disks))
+ MD_BUG();
+}
+
+static void free_mddev(mddev_t *mddev)
+{
+ if (!mddev) {
+ MD_BUG();
+ return;
+ }
+
+ export_array(mddev);
+ md_size[mdidx(mddev)] = 0;
+ md_hd_struct[mdidx(mddev)].nr_sects = 0;
+}
+
+#undef BAD_CSUM
+#undef BAD_MAGIC
+#undef OUT_OF_MEM
+#undef NO_SB
+
+static void print_desc(mdp_disk_t *desc)
+{
+ printk(" DISK<N:%d,%s(%d,%d),R:%d,S:%d>\n", desc->number,
+ partition_name(MKDEV(desc->major,desc->minor)),
+ desc->major,desc->minor,desc->raid_disk,desc->state);
+}
+
+static void print_sb(mdp_super_t *sb)
+{
+ int i;
+
+ printk(KERN_INFO "md: SB: (V:%d.%d.%d) ID:<%08x.%08x.%08x.%08x> CT:%08x\n",
+ sb->major_version, sb->minor_version, sb->patch_version,
+ sb->set_uuid0, sb->set_uuid1, sb->set_uuid2, sb->set_uuid3,
+ sb->ctime);
+ printk(KERN_INFO "md: L%d S%08d ND:%d RD:%d md%d LO:%d CS:%d\n", sb->level,
+ sb->size, sb->nr_disks, sb->raid_disks, sb->md_minor,
+ sb->layout, sb->chunk_size);
+ printk(KERN_INFO "md: UT:%08x ST:%d AD:%d WD:%d FD:%d SD:%d CSUM:%08x E:%08lx\n",
+ sb->utime, sb->state, sb->active_disks, sb->working_disks,
+ sb->failed_disks, sb->spare_disks,
+ sb->sb_csum, (unsigned long)sb->events_lo);
+
+ printk(KERN_INFO);
+ for (i = 0; i < MD_SB_DISKS; i++) {
+ mdp_disk_t *desc;
+
+ desc = sb->disks + i;
+ if (desc->number || desc->major || desc->minor ||
+ desc->raid_disk || (desc->state && (desc->state != 4))) {
+ printk(" D %2d: ", i);
+ print_desc(desc);
+ }
+ }
+ printk(KERN_INFO "md: THIS: ");
+ print_desc(&sb->this_disk);
+
+}
+
+static void print_rdev(mdk_rdev_t *rdev)
+{
+ printk(KERN_INFO "md: rdev %s: O:%s, SZ:%08ld F:%d DN:%d ",
+ partition_name(rdev->dev), partition_name(rdev->old_dev),
+ rdev->size, rdev->faulty, rdev->desc_nr);
+ if (rdev->sb) {
+ printk(KERN_INFO "md: rdev superblock:\n");
+ print_sb(rdev->sb);
+ } else
+ printk(KERN_INFO "md: no rdev superblock!\n");
+}
+
+void md_print_devices(void)
+{
+ struct md_list_head *tmp, *tmp2;
+ mdk_rdev_t *rdev;
+ mddev_t *mddev;
+
+ printk("\n");
+ printk("md: **********************************\n");
+ printk("md: * <COMPLETE RAID STATE PRINTOUT> *\n");
+ printk("md: **********************************\n");
+ ITERATE_MDDEV(mddev,tmp) if (mddev_lock(mddev)==0) {
+ printk("md%d: ", mdidx(mddev));
+
+ ITERATE_RDEV(mddev,rdev,tmp2)
+ printk("<%s>", partition_name(rdev->dev));
+
+ if (mddev->sb) {
+ printk(" array superblock:\n");
+ print_sb(mddev->sb);
+ } else
+ printk(" no array superblock.\n");
+
+ ITERATE_RDEV(mddev,rdev,tmp2)
+ print_rdev(rdev);
+ mddev_unlock(mddev);
+ }
+ printk("md: **********************************\n");
+ printk("\n");
+}
+
+static int sb_equal(mdp_super_t *sb1, mdp_super_t *sb2)
+{
+ int ret;
+ mdp_super_t *tmp1, *tmp2;
+
+ tmp1 = kmalloc(sizeof(*tmp1),GFP_KERNEL);
+ tmp2 = kmalloc(sizeof(*tmp2),GFP_KERNEL);
+
+ if (!tmp1 || !tmp2) {
+ ret = 0;
+ printk(KERN_INFO "md.c: sb1 is not equal to sb2!\n");
+ goto abort;
+ }
+
+ *tmp1 = *sb1;
+ *tmp2 = *sb2;
+
+ /*
+ * nr_disks is not constant
+ */
+ tmp1->nr_disks = 0;
+ tmp2->nr_disks = 0;
+
+ if (memcmp(tmp1, tmp2, MD_SB_GENERIC_CONSTANT_WORDS * 4))
+ ret = 0;
+ else
+ ret = 1;
+
+abort:
+ if (tmp1)
+ kfree(tmp1);
+ if (tmp2)
+ kfree(tmp2);
+
+ return ret;
+}
+
+static int uuid_equal(mdk_rdev_t *rdev1, mdk_rdev_t *rdev2)
+{
+ if ( (rdev1->sb->set_uuid0 == rdev2->sb->set_uuid0) &&
+ (rdev1->sb->set_uuid1 == rdev2->sb->set_uuid1) &&
+ (rdev1->sb->set_uuid2 == rdev2->sb->set_uuid2) &&
+ (rdev1->sb->set_uuid3 == rdev2->sb->set_uuid3))
+
+ return 1;
+
+ return 0;
+}
+
+static mdk_rdev_t * find_rdev_all(kdev_t dev)
+{
+ struct md_list_head *tmp;
+ mdk_rdev_t *rdev;
+
+ list_for_each(tmp, &all_raid_disks) {
+ rdev = md_list_entry(tmp, mdk_rdev_t, all);
+ if (rdev->dev == dev)
+ return rdev;
+ }
+ return NULL;
+}
+
+#define GETBLK_FAILED KERN_ERR \
+"md: getblk failed for device %s\n"
+
+static int write_disk_sb(mdk_rdev_t * rdev)
+{
+ kdev_t dev;
+ unsigned long sb_offset, size;
+
+ if (!rdev->sb) {
+ MD_BUG();
+ return 1;
+ }
+ if (rdev->faulty) {
+ MD_BUG();
+ return 1;
+ }
+ if (rdev->sb->md_magic != MD_SB_MAGIC) {
+ MD_BUG();
+ return 1;
+ }
+
+ dev = rdev->dev;
+ sb_offset = calc_dev_sboffset(dev, rdev->mddev, 1);
+ if (rdev->sb_offset != sb_offset) {
+ printk(KERN_INFO "%s's sb offset has changed from %ld to %ld, skipping\n",
+ partition_name(dev), rdev->sb_offset, sb_offset);
+ goto skip;
+ }
+ /*
+ * If the disk went offline meanwhile and it's just a spare, then
+ * its size has changed to zero silently, and the MD code does
+ * not yet know that it's faulty.
+ */
+ size = calc_dev_size(dev, rdev->mddev, 1);
+ if (size != rdev->size) {
+ printk(KERN_INFO "%s's size has changed from %ld to %ld since import, skipping\n",
+ partition_name(dev), rdev->size, size);
+ goto skip;
+ }
+
+ printk(KERN_INFO "(write) %s's sb offset: %ld\n", partition_name(dev), sb_offset);
+
+ if (!sync_page_io(dev, sb_offset<<1, MD_SB_BYTES, rdev->sb_page, WRITE)) {
+ printk("md: write_disk_sb failed for device %s\n", partition_name(dev));
+ return 1;
+ }
+skip:
+ return 0;
+}
+#undef GETBLK_FAILED
+
+static void set_this_disk(mddev_t *mddev, mdk_rdev_t *rdev)
+{
+ int i, ok = 0;
+ mdp_disk_t *desc;
+
+ for (i = 0; i < MD_SB_DISKS; i++) {
+ desc = mddev->sb->disks + i;
+#if 0
+ if (disk_faulty(desc)) {
+ if (MKDEV(desc->major,desc->minor) == rdev->dev)
+ ok = 1;
+ continue;
+ }
+#endif
+ if (MKDEV(desc->major,desc->minor) == rdev->dev) {
+ rdev->sb->this_disk = *desc;
+ rdev->desc_nr = desc->number;
+ ok = 1;
+ break;
+ }
+ }
+
+ if (!ok) {
+ MD_BUG();
+ }
+}
+
+static int sync_sbs(mddev_t * mddev)
+{
+ mdk_rdev_t *rdev;
+ mdp_super_t *sb;
+ struct md_list_head *tmp;
+
+ ITERATE_RDEV(mddev,rdev,tmp) {
+ if (rdev->faulty || rdev->alias_device)
+ continue;
+ sb = rdev->sb;
+ *sb = *mddev->sb;
+ set_this_disk(mddev, rdev);
+ sb->sb_csum = calc_sb_csum(sb);
+ }
+ return 0;
+}
+
+void __md_update_sb(mddev_t * mddev)
+{
+ int err, count = 100;
+ struct md_list_head *tmp;
+ mdk_rdev_t *rdev;
+
+ if (!mddev->sb_dirty) {
+ printk("hm, md_update_sb() called without ->sb_dirty == 1, from %p.\n", __builtin_return_address(0));
+		return;
+ }
+ mddev->sb_dirty = 0;
+repeat:
+ mddev->sb->utime = CURRENT_TIME;
+ if ((++mddev->sb->events_lo)==0)
+ ++mddev->sb->events_hi;
+
+ if ((mddev->sb->events_lo|mddev->sb->events_hi)==0) {
+ /*
+ * oops, this 64-bit counter should never wrap.
+ * Either we are in around ~1 trillion A.C., assuming
+ * 1 reboot per second, or we have a bug:
+ */
+ MD_BUG();
+ mddev->sb->events_lo = mddev->sb->events_hi = 0xffffffff;
+ }
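+	/*
+	 * (events_lo and events_hi together form the 64-bit event counter
+	 * that md_event() compares during analyze_sbs(); a wrap of events_lo
+	 * simply carries into events_hi, so both halves reaching zero at the
+	 * same time can only mean a full 64-bit wrap -- hence the MD_BUG().)
+	 */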
+ sync_sbs(mddev);
+
+ /*
+ * do not write anything to disk if using
+ * nonpersistent superblocks
+ */
+ if (mddev->sb->not_persistent)
+ return;
+
+ printk(KERN_INFO "md: updating md%d RAID superblock on device\n",
+ mdidx(mddev));
+
+ err = 0;
+ ITERATE_RDEV(mddev,rdev,tmp) {
+ printk(KERN_INFO "md: ");
+ if (rdev->faulty)
+ printk("(skipping faulty ");
+ if (rdev->alias_device)
+ printk("(skipping alias ");
+ if (!rdev->faulty && disk_faulty(&rdev->sb->this_disk)) {
+ printk("(skipping new-faulty %s )\n",
+ partition_name(rdev->dev));
+ continue;
+ }
+ printk("%s ", partition_name(rdev->dev));
+ if (!rdev->faulty && !rdev->alias_device) {
+ printk("[events: %08lx]",
+ (unsigned long)rdev->sb->events_lo);
+ err += write_disk_sb(rdev);
+ } else
+ printk(")\n");
+ }
+ if (err) {
+ if (--count) {
+ printk(KERN_ERR "md: errors occurred during superblock update, repeating\n");
+ goto repeat;
+ }
+ printk(KERN_ERR "md: excessive errors occurred during superblock update, exiting\n");
+ }
+}
+
+void md_update_sb(mddev_t *mddev)
+{
+ if (mddev_lock(mddev))
+ return;
+ if (mddev->sb_dirty)
+ __md_update_sb(mddev);
+ mddev_unlock(mddev);
+}
+
+
+/*
+ * Import a device. If 'on_disk', then sanity check the superblock
+ *
+ * mark the device faulty if:
+ *
+ * - the device is nonexistent (zero size)
+ * - the device has no valid superblock
+ *
+ */
+static int md_import_device(kdev_t newdev, int on_disk)
+{
+ int err;
+ mdk_rdev_t *rdev;
+ unsigned int size;
+
+ if (find_rdev_all(newdev))
+ return -EEXIST;
+
+ rdev = (mdk_rdev_t *) kmalloc(sizeof(*rdev), GFP_KERNEL);
+ if (!rdev) {
+ printk(KERN_ERR "md: could not alloc mem for %s!\n", partition_name(newdev));
+ return -ENOMEM;
+ }
+ memset(rdev, 0, sizeof(*rdev));
+
+ if (is_mounted(newdev)) {
+ printk(KERN_WARNING "md: can not import %s, has active inodes!\n",
+ partition_name(newdev));
+ err = -EBUSY;
+ goto abort_free;
+ }
+
+ if ((err = alloc_disk_sb(rdev)))
+ goto abort_free;
+
+ rdev->dev = newdev;
+ if (lock_rdev(rdev)) {
+ printk(KERN_ERR "md: could not lock %s, zero-size? Marking faulty.\n",
+ partition_name(newdev));
+ err = -EINVAL;
+ goto abort_free;
+ }
+ rdev->desc_nr = -1;
+ rdev->faulty = 0;
+
+ size = 0;
+ if (blk_size[MAJOR(newdev)])
+ size = blk_size[MAJOR(newdev)][MINOR(newdev)];
+ if (!size) {
+ printk(KERN_WARNING "md: %s has zero size, marking faulty!\n",
+ partition_name(newdev));
+ err = -EINVAL;
+ goto abort_free;
+ }
+
+ if (on_disk) {
+ if ((err = read_disk_sb(rdev))) {
+ printk(KERN_WARNING "md: could not read %s's sb, not importing!\n",
+ partition_name(newdev));
+ goto abort_free;
+ }
+ if ((err = check_disk_sb(rdev))) {
+ printk(KERN_WARNING "md: %s has invalid sb, not importing!\n",
+ partition_name(newdev));
+ goto abort_free;
+ }
+
+ if (rdev->sb->level != -4) {
+ rdev->old_dev = MKDEV(rdev->sb->this_disk.major,
+ rdev->sb->this_disk.minor);
+ rdev->desc_nr = rdev->sb->this_disk.number;
+ } else {
+ rdev->old_dev = MKDEV(0, 0);
+ rdev->desc_nr = -1;
+ }
+ }
+ md_list_add(&rdev->all, &all_raid_disks);
+ MD_INIT_LIST_HEAD(&rdev->pending);
+ INIT_LIST_HEAD(&rdev->same_set);
+
+ return 0;
+
+abort_free:
+ if (rdev->sb) {
+ if (rdev->bdev)
+ unlock_rdev(rdev);
+ free_disk_sb(rdev);
+ }
+ kfree(rdev);
+ return err;
+}
+
+/*
+ * Check a full RAID array for plausibility
+ */
+
+#define INCONSISTENT KERN_ERR \
+"md: fatal superblock inconsistency in %s -- removing from array\n"
+
+#define OUT_OF_DATE KERN_ERR \
+"md: superblock update time inconsistency -- using the most recent one\n"
+
+#define OLD_VERSION KERN_ALERT \
+"md: md%d: unsupported raid array version %d.%d.%d\n"
+
+#define NOT_CLEAN_IGNORE KERN_ERR \
+"md: md%d: raid array is not clean -- starting background reconstruction\n"
+
+#define UNKNOWN_LEVEL KERN_ERR \
+"md: md%d: unsupported raid level %d\n"
+
+static int analyze_sbs(mddev_t * mddev)
+{
+ int out_of_date = 0, i, first;
+ struct md_list_head *tmp, *tmp2;
+ mdk_rdev_t *rdev, *rdev2, *freshest;
+ mdp_super_t *sb;
+
+ /*
+ * Verify the RAID superblock on each real device
+ */
+ ITERATE_RDEV(mddev,rdev,tmp) {
+ if (rdev->faulty) {
+ MD_BUG();
+ goto abort;
+ }
+ if (!rdev->sb) {
+ MD_BUG();
+ goto abort;
+ }
+ if (check_disk_sb(rdev))
+ goto abort;
+ }
+
+ /*
+ * The superblock constant part has to be the same
+ * for all disks in the array.
+ */
+ sb = NULL;
+
+ ITERATE_RDEV(mddev,rdev,tmp) {
+ if (!sb) {
+ sb = rdev->sb;
+ continue;
+ }
+ if (!sb_equal(sb, rdev->sb)) {
+ printk(INCONSISTENT, partition_name(rdev->dev));
+ kick_rdev_from_array(rdev);
+ continue;
+ }
+ }
+
+ /*
+ * OK, we have all disks and the array is ready to run. Let's
+ * find the freshest superblock, that one will be the superblock
+ * that represents the whole array.
+ */
+ if (!mddev->sb)
+ if (alloc_array_sb(mddev))
+ goto abort;
+ sb = mddev->sb;
+ freshest = NULL;
+
+ ITERATE_RDEV(mddev,rdev,tmp) {
+ __u64 ev1, ev2;
+ /*
+ * if the checksum is invalid, use the superblock
+	 * only as a last resort. (decrease its age by
+ * one event)
+ */
+ if (calc_sb_csum(rdev->sb) != rdev->sb->sb_csum) {
+ if (rdev->sb->events_lo || rdev->sb->events_hi)
+ if ((rdev->sb->events_lo--)==0)
+ rdev->sb->events_hi--;
+ }
+
+ printk(KERN_INFO "md: %s's event counter: %08lx\n",
+ partition_name(rdev->dev),
+ (unsigned long)rdev->sb->events_lo);
+ if (!freshest) {
+ freshest = rdev;
+ continue;
+ }
+ /*
+ * Find the newest superblock version
+ */
+ ev1 = md_event(rdev->sb);
+ ev2 = md_event(freshest->sb);
+ if (ev1 != ev2) {
+ out_of_date = 1;
+ if (ev1 > ev2)
+ freshest = rdev;
+ }
+ }
+ if (out_of_date) {
+ printk(OUT_OF_DATE);
+ printk(KERN_INFO "md: freshest: %s\n", partition_name(freshest->dev));
+ }
+ memcpy (sb, freshest->sb, sizeof(*sb));
+
+ /*
+ * at this point we have picked the 'best' superblock
+ * from all available superblocks.
+ * now we validate this superblock and kick out possibly
+ * failed disks.
+ */
+ ITERATE_RDEV(mddev,rdev,tmp) {
+ /*
+ * Kick all non-fresh devices
+ */
+ __u64 ev1, ev2;
+ ev1 = md_event(rdev->sb);
+ ev2 = md_event(sb);
+ ++ev1;
+ if (ev1 < ev2) {
+ printk(KERN_WARNING "md: kicking non-fresh %s from array!\n",
+ partition_name(rdev->dev));
+ kick_rdev_from_array(rdev);
+ continue;
+ }
+ }
+
+ /*
+ * Fix up changed device names ... but only if this disk has a
+	 * recent update time. Disks with a faulty checksum are used too.
+ */
+ if (mddev->sb->level != -4)
+ ITERATE_RDEV(mddev,rdev,tmp) {
+ __u64 ev1, ev2, ev3;
+ if (rdev->faulty || rdev->alias_device) {
+ MD_BUG();
+ goto abort;
+ }
+ ev1 = md_event(rdev->sb);
+ ev2 = md_event(sb);
+ ev3 = ev2;
+ --ev3;
+ if ((rdev->dev != rdev->old_dev) &&
+ ((ev1 == ev2) || (ev1 == ev3))) {
+ mdp_disk_t *desc;
+
+ printk(KERN_WARNING "md: device name has changed from %s to %s since last import!\n",
+ partition_name(rdev->old_dev), partition_name(rdev->dev));
+ if (rdev->desc_nr == -1) {
+ MD_BUG();
+ goto abort;
+ }
+ desc = &sb->disks[rdev->desc_nr];
+ if (rdev->old_dev != MKDEV(desc->major, desc->minor)) {
+ MD_BUG();
+ goto abort;
+ }
+ desc->major = MAJOR(rdev->dev);
+ desc->minor = MINOR(rdev->dev);
+ desc = &rdev->sb->this_disk;
+ desc->major = MAJOR(rdev->dev);
+ desc->minor = MINOR(rdev->dev);
+ }
+ }
+
+ /*
+ * Remove unavailable and faulty devices ...
+ *
+ * note that if an array becomes completely unrunnable due to
+ * missing devices, we do not write the superblock back, so the
+ * administrator has a chance to fix things up. The removal thus
+ * only happens if it's nonfatal to the contents of the array.
+ */
+ for (i = 0; i < MD_SB_DISKS; i++) {
+ int found;
+ mdp_disk_t *desc;
+ kdev_t dev;
+
+ desc = sb->disks + i;
+ dev = MKDEV(desc->major, desc->minor);
+
+ /*
+ * We kick faulty devices/descriptors immediately.
+ *
+ * Note: multipath devices are a special case. Since we
+ * were able to read the superblock on the path, we don't
+ * care if it was previously marked as faulty, it's up now
+ * so enable it.
+ */
+ if (disk_faulty(desc) && mddev->sb->level != -4) {
+ found = 0;
+ ITERATE_RDEV(mddev,rdev,tmp) {
+ if (rdev->desc_nr != desc->number)
+ continue;
+ printk(KERN_WARNING "md%d: kicking faulty %s!\n",
+ mdidx(mddev),partition_name(rdev->dev));
+ kick_rdev_from_array(rdev);
+ found = 1;
+ break;
+ }
+ if (!found) {
+ if (dev == MKDEV(0,0))
+ continue;
+ printk(KERN_WARNING "md%d: removing former faulty %s!\n",
+ mdidx(mddev), partition_name(dev));
+ }
+ remove_descriptor(desc, sb);
+ continue;
+ } else if (disk_faulty(desc)) {
+ /*
+ * multipath entry marked as faulty, unfaulty it
+ */
+ rdev = find_rdev(mddev, dev);
+ if(rdev)
+ mark_disk_spare(desc);
+ else
+ remove_descriptor(desc, sb);
+ }
+
+ if (dev == MKDEV(0,0))
+ continue;
+ /*
+ * Is this device present in the rdev ring?
+ */
+ found = 0;
+ ITERATE_RDEV(mddev,rdev,tmp) {
+ /*
+ * Multi-path IO special-case: since we have no
+ * this_disk descriptor at auto-detect time,
+ * we cannot check rdev->number.
+ * We can check the device though.
+ */
+ if ((sb->level == -4) && (rdev->dev ==
+ MKDEV(desc->major,desc->minor))) {
+ found = 1;
+ break;
+ }
+ if (rdev->desc_nr == desc->number) {
+ found = 1;
+ break;
+ }
+ }
+ if (found)
+ continue;
+
+ printk(KERN_WARNING "md%d: former device %s is unavailable, removing from array!\n",
+ mdidx(mddev), partition_name(dev));
+ remove_descriptor(desc, sb);
+ }
+
+ /*
+	 * Double check whether all devices mentioned in the
+ * superblock are in the rdev ring.
+ */
+ first = 1;
+ for (i = 0; i < MD_SB_DISKS; i++) {
+ mdp_disk_t *desc;
+ kdev_t dev;
+
+ desc = sb->disks + i;
+ dev = MKDEV(desc->major, desc->minor);
+
+ if (dev == MKDEV(0,0))
+ continue;
+
+ if (disk_faulty(desc)) {
+ MD_BUG();
+ goto abort;
+ }
+
+ rdev = find_rdev(mddev, dev);
+ if (!rdev) {
+ MD_BUG();
+ goto abort;
+ }
+ /*
+ * In the case of Multipath-IO, we have no
+ * other information source to find out which
+ * disk is which, only the position of the device
+ * in the superblock:
+ */
+ if (mddev->sb->level == -4) {
+ if ((rdev->desc_nr != -1) && (rdev->desc_nr != i)) {
+ MD_BUG();
+ goto abort;
+ }
+ rdev->desc_nr = i;
+ if (!first)
+ rdev->alias_device = 1;
+ else
+ first = 0;
+ }
+ }
+
+ /*
+ * Kick all rdevs that are not in the
+ * descriptor array:
+ */
+ ITERATE_RDEV(mddev,rdev,tmp) {
+ if (rdev->desc_nr == -1)
+ kick_rdev_from_array(rdev);
+ }
+
+ /*
+ * Do a final reality check.
+ */
+ if (mddev->sb->level != -4) {
+ ITERATE_RDEV(mddev,rdev,tmp) {
+ if (rdev->desc_nr == -1) {
+ MD_BUG();
+ goto abort;
+ }
+ /*
+ * is the desc_nr unique?
+ */
+ ITERATE_RDEV(mddev,rdev2,tmp2) {
+ if ((rdev2 != rdev) &&
+ (rdev2->desc_nr == rdev->desc_nr)) {
+ MD_BUG();
+ goto abort;
+ }
+ }
+ /*
+ * is the device unique?
+ */
+ ITERATE_RDEV(mddev,rdev2,tmp2) {
+ if ((rdev2 != rdev) &&
+ (rdev2->dev == rdev->dev)) {
+ MD_BUG();
+ goto abort;
+ }
+ }
+ }
+ }
+
+ /*
+ * Check if we can support this RAID array
+ */
+ if (sb->major_version != MD_MAJOR_VERSION ||
+ sb->minor_version > MD_MINOR_VERSION) {
+
+ printk(OLD_VERSION, mdidx(mddev), sb->major_version,
+ sb->minor_version, sb->patch_version);
+ goto abort;
+ }
+
+ if ((sb->state != (1 << MD_SB_CLEAN)) && ((sb->level == 1) ||
+ (sb->level == 4) || (sb->level == 5)))
+ printk(NOT_CLEAN_IGNORE, mdidx(mddev));
+
+ return 0;
+abort:
+ return 1;
+}
+
+#undef INCONSISTENT
+#undef OUT_OF_DATE
+#undef OLD_VERSION
+#undef OLD_LEVEL
+
+static int device_size_calculation(mddev_t * mddev)
+{
+ int data_disks = 0, persistent;
+ unsigned int readahead;
+ mdp_super_t *sb = mddev->sb;
+ struct md_list_head *tmp;
+ mdk_rdev_t *rdev;
+
+ /*
+ * Do device size calculation. Bail out if too small.
+ * (we have to do this after having validated chunk_size,
+ * because device size has to be modulo chunk_size)
+ */
+ persistent = !mddev->sb->not_persistent;
+ ITERATE_RDEV(mddev,rdev,tmp) {
+ if (rdev->faulty)
+ continue;
+ if (rdev->size) {
+ MD_BUG();
+ continue;
+ }
+ rdev->size = calc_dev_size(rdev->dev, mddev, persistent);
+ if (rdev->size < sb->chunk_size / 1024) {
+ printk(KERN_WARNING
+ "md: Dev %s smaller than chunk_size: %ldk < %dk\n",
+ partition_name(rdev->dev),
+ rdev->size, sb->chunk_size / 1024);
+ return -EINVAL;
+ }
+ }
+
+ switch (sb->level) {
+ case -4:
+ data_disks = 1;
+ break;
+ case -3:
+ data_disks = 1;
+ break;
+ case -2:
+ data_disks = 1;
+ break;
+ case -1:
+ zoned_raid_size(mddev);
+ data_disks = 1;
+ break;
+ case 0:
+ zoned_raid_size(mddev);
+ data_disks = sb->raid_disks;
+ break;
+ case 1:
+ data_disks = 1;
+ break;
+ case 4:
+ case 5:
+ data_disks = sb->raid_disks-1;
+ break;
+ default:
+ printk(UNKNOWN_LEVEL, mdidx(mddev), sb->level);
+ goto abort;
+ }
+ if (!md_size[mdidx(mddev)])
+ md_size[mdidx(mddev)] = sb->size * data_disks;
+
+ readahead = MD_READAHEAD;
+ if ((sb->level == 0) || (sb->level == 4) || (sb->level == 5)) {
+ readahead = (mddev->sb->chunk_size>>PAGE_SHIFT) * 4 * data_disks;
+ if (readahead < data_disks * (MAX_SECTORS>>(PAGE_SHIFT-9))*2)
+ readahead = data_disks * (MAX_SECTORS>>(PAGE_SHIFT-9))*2;
+ } else {
+ // (no multipath branch - it uses the default setting)
+ if (sb->level == -3)
+ readahead = 0;
+ }
+
+ printk(KERN_INFO "md%d: max total readahead window set to %ldk\n",
+ mdidx(mddev), readahead*(PAGE_SIZE/1024));
+
+ printk(KERN_INFO
+ "md%d: %d data-disks, max readahead per data-disk: %ldk\n",
+ mdidx(mddev), data_disks, readahead/data_disks*(PAGE_SIZE/1024));
+ return 0;
+abort:
+ return 1;
+}
+
+
+#define TOO_BIG_CHUNKSIZE KERN_ERR \
+"too big chunk_size: %d > %d\n"
+
+#define TOO_SMALL_CHUNKSIZE KERN_ERR \
+"too small chunk_size: %d < %ld\n"
+
+#define BAD_CHUNKSIZE KERN_ERR \
+"no chunksize specified, see 'man raidtab'\n"
+
+static int do_md_run(mddev_t * mddev)
+{
+ int pnum, err;
+ int chunk_size;
+ struct md_list_head *tmp;
+ mdk_rdev_t *rdev;
+
+
+ if (list_empty(&mddev->disks)) {
+ MD_BUG();
+ return -EINVAL;
+ }
+
+ if (mddev->pers)
+ return -EBUSY;
+
+ /*
+ * Resize disks to align partitions size on a given
+ * chunk size.
+ */
+ md_size[mdidx(mddev)] = 0;
+
+ /*
+ * Analyze all RAID superblock(s)
+ */
+ if (analyze_sbs(mddev)) {
+ MD_BUG();
+ return -EINVAL;
+ }
+
+ chunk_size = mddev->sb->chunk_size;
+ pnum = level_to_pers(mddev->sb->level);
+
+ if ((pnum != MULTIPATH) && (pnum != RAID1)) {
+ if (!chunk_size) {
+ /*
+ * 'default chunksize' in the old md code used to
+ * be PAGE_SIZE, baaad.
+			 * we abort here to be on the safe side. We don't
+ * want to continue the bad practice.
+ */
+ printk(BAD_CHUNKSIZE);
+ return -EINVAL;
+ }
+ if (chunk_size > MAX_CHUNK_SIZE) {
+ printk(TOO_BIG_CHUNKSIZE, chunk_size, MAX_CHUNK_SIZE);
+ return -EINVAL;
+ }
+ /*
+		 * chunk-size has to be a power of 2 and a multiple of PAGE_SIZE
+ */
+ if ( (1 << ffz(~chunk_size)) != chunk_size) {
+ MD_BUG();
+ return -EINVAL;
+ }
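+		/*
+		 * (ffz(~chunk_size) is the index of the lowest set bit of
+		 * chunk_size, so 1 << ffz(~chunk_size) equals chunk_size only
+		 * when exactly one bit is set: e.g. 65536 passes the test,
+		 * 65536 + 4096 does not.)
+		 */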
+ if (chunk_size < PAGE_SIZE) {
+ printk(TOO_SMALL_CHUNKSIZE, chunk_size, PAGE_SIZE);
+ return -EINVAL;
+ }
+ } else
+ if (chunk_size)
+ printk(KERN_INFO "md: RAID level %d does not need chunksize! Continuing anyway.\n",
+ mddev->sb->level);
+
+ if (pnum >= MAX_PERSONALITY) {
+ MD_BUG();
+ return -EINVAL;
+ }
+
+ if (!pers[pnum])
+ {
+#ifdef CONFIG_KMOD
+ char module_name[80];
+ sprintf (module_name, "md-personality-%d", pnum);
+ request_module (module_name);
+ if (!pers[pnum])
+#endif
+ {
+ printk(KERN_ERR "md: personality %d is not loaded!\n",
+ pnum);
+ return -EINVAL;
+ }
+ }
+
+ if (device_size_calculation(mddev))
+ return -EINVAL;
+
+ /*
+ * Drop all container device buffers, from now on
+ * the only valid external interface is through the md
+ * device.
+ * Also find largest hardsector size
+ */
+ md_hardsect_sizes[mdidx(mddev)] = 512;
+ ITERATE_RDEV(mddev,rdev,tmp) {
+ if (rdev->faulty)
+ continue;
+ invalidate_device(rdev->dev, 1);
+ if (get_hardsect_size(rdev->dev)
+ > md_hardsect_sizes[mdidx(mddev)])
+ md_hardsect_sizes[mdidx(mddev)] =
+ get_hardsect_size(rdev->dev);
+ }
+ md_blocksizes[mdidx(mddev)] = 1024;
+ if (md_blocksizes[mdidx(mddev)] < md_hardsect_sizes[mdidx(mddev)])
+ md_blocksizes[mdidx(mddev)] = md_hardsect_sizes[mdidx(mddev)];
+ mddev->pers = pers[pnum];
+
+ blk_queue_make_request(&mddev->queue, mddev->pers->make_request);
+ mddev->queue.queuedata = mddev;
+
+ err = mddev->pers->run(mddev);
+ if (err) {
+ printk(KERN_ERR "md: pers->run() failed ...\n");
+ mddev->pers = NULL;
+ return -EINVAL;
+ }
+
+ mddev->in_sync = (mddev->sb->state & (1<<MD_SB_CLEAN));
+ /* if personality doesn't have "sync_request", then
+ * a dirty array doesn't mean anything
+ */
+ if (mddev->pers->sync_request)
+ mddev->sb->state &= ~(1 << MD_SB_CLEAN);
+ mddev->sb_dirty = 1;
+ __md_update_sb(mddev);
+
+ md_recover_arrays();
+ /*
+ * md_size has units of 1K blocks, which are
+ * twice as large as sectors.
+ */
+ md_hd_struct[mdidx(mddev)].start_sect = 0;
+ register_disk(&md_gendisk, MKDEV(MAJOR_NR,mdidx(mddev)),
+ 1, &md_fops, md_size[mdidx(mddev)]<<1);
+
+ read_ahead[MD_MAJOR] = 1024;
+ return (0);
+}
+
+#undef TOO_BIG_CHUNKSIZE
+#undef BAD_CHUNKSIZE
+
+static int restart_array(mddev_t *mddev)
+{
+ int err;
+
+ /*
+ * Complain if it has no devices
+ */
+ err = -ENXIO;
+ if (list_empty(&mddev->disks))
+ goto out;
+
+ if (mddev->pers) {
+ err = -EBUSY;
+ if (!mddev->ro)
+ goto out;
+
+ mddev->ro = 0;
+ set_device_ro(mddev_to_kdev(mddev), 0);
+
+ printk(KERN_INFO
+ "md: md%d switched to read-write mode.\n", mdidx(mddev));
+ /*
+ * Kick recovery or resync if necessary
+ */
+ md_recover_arrays();
+ err = 0;
+ } else {
+ printk(KERN_ERR "md: md%d has no personality assigned.\n",
+ mdidx(mddev));
+ err = -EINVAL;
+ }
+
+out:
+ return err;
+}
+
+#define STILL_MOUNTED KERN_WARNING \
+"md: md%d still mounted.\n"
+#define STILL_IN_USE \
+"md: md%d still in use.\n"
+
+static int do_md_stop(mddev_t * mddev, int ro)
+{
+ int err = 0;
+ kdev_t dev = mddev_to_kdev(mddev);
+
+ if (atomic_read(&mddev->active)>1) {
+ printk(STILL_IN_USE, mdidx(mddev));
+ err = -EBUSY;
+ goto out;
+ }
+
+ if (mddev->pers) {
+ if (mddev->sync_thread) {
+ if (mddev->recovery_running > 0)
+ mddev->recovery_running = -EINTR;
+ md_unregister_thread(mddev->sync_thread);
+ mddev->sync_thread = NULL;
+ if (mddev->spare) {
+ mddev->pers->diskop(mddev, &mddev->spare,
+ DISKOP_SPARE_INACTIVE);
+ mddev->spare = NULL;
+ }
+ }
+
+ invalidate_device(dev, 1);
+
+ if (ro) {
+ err = -ENXIO;
+ if (mddev->ro)
+ goto out;
+ mddev->ro = 1;
+ } else {
+ if (mddev->ro)
+ set_device_ro(dev, 0);
+ if (mddev->pers->stop(mddev)) {
+ err = -EBUSY;
+ if (mddev->ro)
+ set_device_ro(dev, 1);
+ goto out;
+ }
+ if (mddev->ro)
+ mddev->ro = 0;
+ }
+ if (mddev->sb) {
+ /*
+ * mark it clean only if there was no resync
+ * interrupted.
+ */
+ if (mddev->in_sync) {
+ printk(KERN_INFO "md: marking sb clean...\n");
+ mddev->sb->state |= 1 << MD_SB_CLEAN;
+ }
+ mddev->sb_dirty = 1;
+ __md_update_sb(mddev);
+ }
+ if (ro)
+ set_device_ro(dev, 1);
+ }
+
+ /*
+ * Free resources if final stop
+ */
+ if (!ro) {
+ printk(KERN_INFO "md: md%d stopped.\n", mdidx(mddev));
+ free_mddev(mddev);
+ } else
+ printk(KERN_INFO "md: md%d switched to read-only mode.\n", mdidx(mddev));
+ err = 0;
+out:
+ return err;
+}
+
+/*
+ * We have to safely support old arrays too.
+ */
+int detect_old_array(mdp_super_t *sb)
+{
+ if (sb->major_version > 0)
+ return 0;
+ if (sb->minor_version >= 90)
+ return 0;
+
+ return -EINVAL;
+}
+
+
+static void autorun_array(mddev_t *mddev)
+{
+ mdk_rdev_t *rdev;
+ struct md_list_head *tmp;
+ int err;
+
+ if (list_empty(&mddev->disks)) {
+ MD_BUG();
+ return;
+ }
+
+ printk(KERN_INFO "md: running: ");
+
+ ITERATE_RDEV(mddev,rdev,tmp) {
+ printk("<%s>", partition_name(rdev->dev));
+ }
+ printk("\n");
+
+ err = do_md_run (mddev);
+ if (err) {
+		printk(KERN_WARNING "md: do_md_run() returned %d\n", err);
+ /*
+ * prevent the writeback of an unrunnable array
+ */
+ mddev->sb_dirty = 0;
+ do_md_stop (mddev, 0);
+ }
+}
+
+/*
+ * lets try to run arrays based on all disks that have arrived
+ * until now. (those are in the ->pending list)
+ *
+ * the method: pick the first pending disk, collect all disks with
+ * the same UUID, remove all from the pending list and put them into
+ * the 'same_array' list. Then order this list based on superblock
+ * update time (freshest comes first), kick out 'old' disks and
+ * compare superblocks. If everything's fine then run it.
+ *
+ * If "unit" is allocated, then bump its reference count
+ */
+static void autorun_devices(void)
+{
+ struct md_list_head candidates;
+ struct md_list_head *tmp;
+ mdk_rdev_t *rdev0, *rdev;
+ mddev_t *mddev;
+
+ printk(KERN_INFO "md: autorun ...\n");
+ while (!list_empty(&pending_raid_disks)) {
+ rdev0 = md_list_entry(pending_raid_disks.next,
+ mdk_rdev_t, pending);
+
+ printk(KERN_INFO "md: considering %s ...\n", partition_name(rdev0->dev));
+ MD_INIT_LIST_HEAD(&candidates);
+ ITERATE_RDEV_PENDING(rdev,tmp) {
+ if (uuid_equal(rdev0, rdev)) {
+ if (!sb_equal(rdev0->sb, rdev->sb)) {
+ printk(KERN_WARNING
+ "md: %s has same UUID as %s, but superblocks differ ...\n",
+ partition_name(rdev->dev), partition_name(rdev0->dev));
+ continue;
+ }
+ printk(KERN_INFO "md: adding %s ...\n", partition_name(rdev->dev));
+ md_list_del(&rdev->pending);
+ md_list_add(&rdev->pending, &candidates);
+ }
+ }
+ /*
+ * now we have a set of devices, with all of them having
+ * mostly sane superblocks. It's time to allocate the
+ * mddev.
+ */
+
+ mddev = mddev_find(rdev0->sb->md_minor);
+ if (!mddev) {
+ printk(KERN_ERR "md: cannot allocate memory for md drive.\n");
+ break;
+ }
+ if (mddev_lock(mddev))
+ printk(KERN_WARNING "md: md%d locked, cannot run\n",
+ mdidx(mddev));
+ else if (mddev->sb || !list_empty(&mddev->disks)) {
+ printk(KERN_WARNING "md: md%d already running, cannot run %s\n",
+ mdidx(mddev), partition_name(rdev0->dev));
+ mddev_unlock(mddev);
+ } else {
+ printk(KERN_INFO "md: created md%d\n", mdidx(mddev));
+ ITERATE_RDEV_GENERIC(candidates,pending,rdev,tmp) {
+ bind_rdev_to_array(rdev, mddev);
+ list_del_init(&rdev->pending);
+ }
+ autorun_array(mddev);
+ mddev_unlock(mddev);
+ }
+ /* on success, candidates will be empty, on error
+		 * it won't...
+ */
+ ITERATE_RDEV_GENERIC(candidates,pending,rdev,tmp)
+ export_rdev(rdev);
+ mddev_put(mddev);
+ }
+ printk(KERN_INFO "md: ... autorun DONE.\n");
+}
+
+/*
+ * import RAID devices based on one partition
+ * if possible, the array gets run as well.
+ */
+
+#define BAD_VERSION KERN_ERR \
+"md: %s has RAID superblock version 0.%d, autodetect needs v0.90 or higher\n"
+
+#define OUT_OF_MEM KERN_ALERT \
+"md: out of memory.\n"
+
+#define NO_DEVICE KERN_ERR \
+"md: disabled device %s\n"
+
+#define AUTOADD_FAILED KERN_ERR \
+"md: auto-adding devices to md%d FAILED (error %d).\n"
+
+#define AUTOADD_FAILED_USED KERN_ERR \
+"md: cannot auto-add device %s to md%d, already used.\n"
+
+#define AUTORUN_FAILED KERN_ERR \
+"md: auto-running md%d FAILED (error %d).\n"
+
+#define MDDEV_BUSY KERN_ERR \
+"md: cannot auto-add to md%d, already running.\n"
+
+#define AUTOADDING KERN_INFO \
+"md: auto-adding devices to md%d, based on %s's superblock.\n"
+
+#define AUTORUNNING KERN_INFO \
+"md: auto-running md%d.\n"
+
+static int autostart_array(kdev_t startdev)
+{
+ int err = -EINVAL, i;
+ mdp_super_t *sb = NULL;
+ mdk_rdev_t *start_rdev = NULL, *rdev;
+
+ if (md_import_device(startdev, 1)) {
+ printk(KERN_WARNING "md: could not import %s!\n", partition_name(startdev));
+ goto abort;
+ }
+
+ start_rdev = find_rdev_all(startdev);
+ if (!start_rdev) {
+ MD_BUG();
+ goto abort;
+ }
+ if (start_rdev->faulty) {
+ printk(KERN_WARNING "md: can not autostart based on faulty %s!\n",
+ partition_name(startdev));
+ goto abort;
+ }
+ md_list_add(&start_rdev->pending, &pending_raid_disks);
+
+ sb = start_rdev->sb;
+
+ err = detect_old_array(sb);
+ if (err) {
+		printk(KERN_WARNING "md: array version is too old to be autostarted, "
+ "use raidtools 0.90 mkraid --upgrade to upgrade the array "
+ "without data loss!\n");
+ goto abort;
+ }
+
+ for (i = 0; i < MD_SB_DISKS; i++) {
+ mdp_disk_t *desc;
+ kdev_t dev;
+
+ desc = sb->disks + i;
+ dev = MKDEV(desc->major, desc->minor);
+
+ if (dev == MKDEV(0,0))
+ continue;
+ if (dev == startdev)
+ continue;
+ if (md_import_device(dev, 1)) {
+ printk(KERN_WARNING "md: could not import %s, trying to run array nevertheless.\n",
+ partition_name(dev));
+ continue;
+ }
+ rdev = find_rdev_all(dev);
+ if (!rdev) {
+ MD_BUG();
+ goto abort;
+ }
+ md_list_add(&rdev->pending, &pending_raid_disks);
+ }
+
+ /*
+ * possibly return codes
+ */
+ autorun_devices();
+ return 0;
+
+abort:
+ if (start_rdev)
+ export_rdev(start_rdev);
+ return err;
+}
+
+#undef BAD_VERSION
+#undef OUT_OF_MEM
+#undef NO_DEVICE
+#undef AUTOADD_FAILED_USED
+#undef AUTOADD_FAILED
+#undef AUTORUN_FAILED
+#undef AUTOADDING
+#undef AUTORUNNING
+
+
+static int get_version(void * arg)
+{
+ mdu_version_t ver;
+
+ ver.major = MD_MAJOR_VERSION;
+ ver.minor = MD_MINOR_VERSION;
+ ver.patchlevel = MD_PATCHLEVEL_VERSION;
+
+ if (md_copy_to_user(arg, &ver, sizeof(ver)))
+ return -EFAULT;
+
+ return 0;
+}
+
+#define SET_FROM_SB(x) info.x = mddev->sb->x
+static int get_array_info(mddev_t * mddev, void * arg)
+{
+ mdu_array_info_t info;
+
+ if (!mddev->sb) {
+ MD_BUG();
+ return -EINVAL;
+ }
+
+ SET_FROM_SB(major_version);
+ SET_FROM_SB(minor_version);
+ SET_FROM_SB(patch_version);
+ SET_FROM_SB(ctime);
+ SET_FROM_SB(level);
+ SET_FROM_SB(size);
+ SET_FROM_SB(nr_disks);
+ SET_FROM_SB(raid_disks);
+ SET_FROM_SB(md_minor);
+ SET_FROM_SB(not_persistent);
+
+ SET_FROM_SB(utime);
+ SET_FROM_SB(state);
+ SET_FROM_SB(active_disks);
+ SET_FROM_SB(working_disks);
+ SET_FROM_SB(failed_disks);
+ SET_FROM_SB(spare_disks);
+
+ SET_FROM_SB(layout);
+ SET_FROM_SB(chunk_size);
+
+ if (md_copy_to_user(arg, &info, sizeof(info)))
+ return -EFAULT;
+
+ return 0;
+}
+#undef SET_FROM_SB
+
+#define SET_FROM_SB(x) info.x = mddev->sb->disks[nr].x
+static int get_disk_info(mddev_t * mddev, void * arg)
+{
+ mdu_disk_info_t info;
+ unsigned int nr;
+
+ if (!mddev->sb)
+ return -EINVAL;
+
+ if (md_copy_from_user(&info, arg, sizeof(info)))
+ return -EFAULT;
+
+ nr = info.number;
+ if (nr >= MD_SB_DISKS)
+ return -EINVAL;
+
+ SET_FROM_SB(major);
+ SET_FROM_SB(minor);
+ SET_FROM_SB(raid_disk);
+ SET_FROM_SB(state);
+
+ if (md_copy_to_user(arg, &info, sizeof(info)))
+ return -EFAULT;
+
+ return 0;
+}
+#undef SET_FROM_SB
+
+#define SET_SB(x) mddev->sb->disks[nr].x = info->x
+
+static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info)
+{
+ int err, size, persistent;
+ mdk_rdev_t *rdev;
+ unsigned int nr;
+ kdev_t dev;
+ dev = MKDEV(info->major,info->minor);
+
+ if (find_rdev_all(dev)) {
+ printk(KERN_WARNING "md: device %s already used in a RAID array!\n",
+ partition_name(dev));
+ return -EBUSY;
+ }
+ if (!mddev->sb) {
+ /* expecting a device which has a superblock */
+ err = md_import_device(dev, 1);
+ if (err) {
+ printk(KERN_WARNING "md: md_import_device returned %d\n", err);
+ return -EINVAL;
+ }
+ rdev = find_rdev_all(dev);
+ if (!rdev) {
+ MD_BUG();
+ return -EINVAL;
+ }
+ if (!list_empty(&mddev->disks)) {
+ mdk_rdev_t *rdev0 = md_list_entry(mddev->disks.next,
+ mdk_rdev_t, same_set);
+ if (!uuid_equal(rdev0, rdev)) {
+ printk(KERN_WARNING "md: %s has different UUID to %s\n",
+ partition_name(rdev->dev), partition_name(rdev0->dev));
+ export_rdev(rdev);
+ return -EINVAL;
+ }
+ if (!sb_equal(rdev0->sb, rdev->sb)) {
+ printk(KERN_WARNING "md: %s has same UUID but different superblock to %s\n",
+ partition_name(rdev->dev), partition_name(rdev0->dev));
+ export_rdev(rdev);
+ return -EINVAL;
+ }
+ }
+ bind_rdev_to_array(rdev, mddev);
+ return 0;
+ }
+
+ nr = info->number;
+ if (nr >= mddev->sb->nr_disks) {
+ MD_BUG();
+ return -EINVAL;
+ }
+
+
+ SET_SB(number);
+ SET_SB(major);
+ SET_SB(minor);
+ SET_SB(raid_disk);
+ SET_SB(state);
+
+ if ((info->state & (1<<MD_DISK_FAULTY))==0) {
+ err = md_import_device (dev, 0);
+ if (err) {
+ printk(KERN_WARNING "md: error, md_import_device() returned %d\n", err);
+ return -EINVAL;
+ }
+ rdev = find_rdev_all(dev);
+ if (!rdev) {
+ MD_BUG();
+ return -EINVAL;
+ }
+
+ rdev->old_dev = dev;
+ rdev->desc_nr = info->number;
+
+ bind_rdev_to_array(rdev, mddev);
+
+ persistent = !mddev->sb->not_persistent;
+ if (!persistent)
+ printk(KERN_INFO "md: nonpersistent superblock ...\n");
+
+ size = calc_dev_size(dev, mddev, persistent);
+ rdev->sb_offset = calc_dev_sboffset(dev, mddev, persistent);
+
+ if (!mddev->sb->size || (mddev->sb->size > size))
+ mddev->sb->size = size;
+ }
+
+ /*
+ * sync all other superblocks with the main superblock
+ */
+ sync_sbs(mddev);
+
+ return 0;
+}
+#undef SET_SB
+
+static int hot_generate_error(mddev_t * mddev, kdev_t dev)
+{
+ struct request_queue *q;
+ mdk_rdev_t *rdev;
+ mdp_disk_t *disk;
+
+ if (!mddev->pers)
+ return -ENODEV;
+
+ printk(KERN_INFO "md: trying to generate %s error in md%d ... \n",
+ partition_name(dev), mdidx(mddev));
+
+ rdev = find_rdev(mddev, dev);
+ if (!rdev) {
+ MD_BUG();
+ return -ENXIO;
+ }
+
+ if (rdev->desc_nr == -1) {
+ MD_BUG();
+ return -EINVAL;
+ }
+ disk = &mddev->sb->disks[rdev->desc_nr];
+ if (!disk_active(disk))
+ return -ENODEV;
+
+ q = blk_get_queue(rdev->dev);
+ if (!q) {
+ MD_BUG();
+ return -ENODEV;
+ }
+ printk(KERN_INFO "md: okay, generating error!\n");
+// q->oneshot_error = 1; // disabled for now
+
+ return 0;
+}
+
+static int hot_remove_disk(mddev_t * mddev, kdev_t dev)
+{
+ int err;
+ mdk_rdev_t *rdev;
+ mdp_disk_t *disk;
+
+ if (!mddev->pers)
+ return -ENODEV;
+
+ printk(KERN_INFO "md: trying to remove %s from md%d ... \n",
+ partition_name(dev), mdidx(mddev));
+
+ if (!mddev->pers->diskop) {
+ printk(KERN_WARNING "md%d: personality does not support diskops!\n",
+ mdidx(mddev));
+ return -EINVAL;
+ }
+
+ rdev = find_rdev(mddev, dev);
+ if (!rdev)
+ return -ENXIO;
+
+ if (rdev->desc_nr == -1) {
+ MD_BUG();
+ return -EINVAL;
+ }
+ disk = &mddev->sb->disks[rdev->desc_nr];
+ if (disk_active(disk))
+ goto busy;
+
+ if (disk_removed(disk))
+ return -EINVAL;
+
+ err = mddev->pers->diskop(mddev, &disk, DISKOP_HOT_REMOVE_DISK);
+ if (err == -EBUSY)
+ goto busy;
+
+ if (err) {
+ MD_BUG();
+ return -EINVAL;
+ }
+
+ remove_descriptor(disk, mddev->sb);
+ kick_rdev_from_array(rdev);
+ __md_update_sb(mddev);
+
+ return 0;
+busy:
+ printk(KERN_WARNING "md: cannot remove active disk %s from md%d ... \n",
+ partition_name(dev), mdidx(mddev));
+ return -EBUSY;
+}
+
+static int hot_add_disk(mddev_t * mddev, kdev_t dev)
+{
+ int i, err, persistent;
+ unsigned int size;
+ mdk_rdev_t *rdev;
+ mdp_disk_t *disk;
+
+ if (!mddev->pers)
+ return -ENODEV;
+
+ printk(KERN_INFO "md: trying to hot-add %s to md%d ... \n",
+ partition_name(dev), mdidx(mddev));
+
+ if (!mddev->pers->diskop) {
+ printk(KERN_WARNING "md%d: personality does not support diskops!\n",
+ mdidx(mddev));
+ return -EINVAL;
+ }
+
+ persistent = !mddev->sb->not_persistent;
+
+ rdev = find_rdev(mddev, dev);
+ if (rdev)
+ return -EBUSY;
+
+ err = md_import_device (dev, 0);
+ if (err) {
+ printk(KERN_WARNING "md: error, md_import_device() returned %d\n", err);
+ return -EINVAL;
+ }
+ rdev = find_rdev_all(dev);
+ if (!rdev) {
+ MD_BUG();
+ return -EINVAL;
+ }
+ if (rdev->faulty) {
+ printk(KERN_WARNING "md: can not hot-add faulty %s disk to md%d!\n",
+ partition_name(dev), mdidx(mddev));
+ err = -EINVAL;
+ goto abort_export;
+ }
+ size = calc_dev_size(dev, mddev, persistent);
+
+ if (size < mddev->sb->size) {
+ printk(KERN_WARNING "md%d: disk size %d blocks < array size %d\n",
+ mdidx(mddev), size, mddev->sb->size);
+ err = -ENOSPC;
+ goto abort_export;
+ }
+ bind_rdev_to_array(rdev, mddev);
+
+ /*
+ * The rest should better be atomic, we can have disk failures
+ * noticed in interrupt contexts ...
+ */
+ rdev->old_dev = dev;
+ rdev->size = size;
+ rdev->sb_offset = calc_dev_sboffset(dev, mddev, persistent);
+
+ disk = mddev->sb->disks + mddev->sb->raid_disks;
+ for (i = mddev->sb->raid_disks; i < MD_SB_DISKS; i++) {
+ disk = mddev->sb->disks + i;
+
+ if (!disk->major && !disk->minor)
+ break;
+ if (disk_removed(disk))
+ break;
+ }
+ if (i == MD_SB_DISKS) {
+ printk(KERN_WARNING "md%d: can not hot-add to full array!\n",
+ mdidx(mddev));
+ err = -EBUSY;
+ goto abort_unbind_export;
+ }
+
+ if (disk_removed(disk)) {
+ /*
+ * reuse slot
+ */
+ if (disk->number != i) {
+ MD_BUG();
+ err = -EINVAL;
+ goto abort_unbind_export;
+ }
+ } else {
+ disk->number = i;
+ }
+
+ disk->raid_disk = disk->number;
+ disk->major = MAJOR(dev);
+ disk->minor = MINOR(dev);
+
+ if (mddev->pers->diskop(mddev, &disk, DISKOP_HOT_ADD_DISK)) {
+ MD_BUG();
+ err = -EINVAL;
+ goto abort_unbind_export;
+ }
+
+ mark_disk_spare(disk);
+ mddev->sb->nr_disks++;
+ mddev->sb->spare_disks++;
+ mddev->sb->working_disks++;
+
+ __md_update_sb(mddev);
+
+ /*
+ * Kick recovery, maybe this spare has to be added to the
+ * array immediately.
+ */
+ md_recover_arrays();
+
+ return 0;
+
+abort_unbind_export:
+ unbind_rdev_from_array(rdev);
+
+abort_export:
+ export_rdev(rdev);
+ return err;
+}
+
+#define SET_SB(x) mddev->sb->x = info->x
+static int set_array_info(mddev_t * mddev, mdu_array_info_t *info)
+{
+
+ if (alloc_array_sb(mddev))
+ return -ENOMEM;
+
+ mddev->sb->major_version = MD_MAJOR_VERSION;
+ mddev->sb->minor_version = MD_MINOR_VERSION;
+ mddev->sb->patch_version = MD_PATCHLEVEL_VERSION;
+ mddev->sb->ctime = CURRENT_TIME;
+
+ SET_SB(level);
+ SET_SB(size);
+ SET_SB(nr_disks);
+ SET_SB(raid_disks);
+ SET_SB(md_minor);
+ SET_SB(not_persistent);
+
+ SET_SB(state);
+ SET_SB(active_disks);
+ SET_SB(working_disks);
+ SET_SB(failed_disks);
+ SET_SB(spare_disks);
+
+ SET_SB(layout);
+ SET_SB(chunk_size);
+
+ mddev->sb->md_magic = MD_SB_MAGIC;
+
+ /*
+ * Generate a 128 bit UUID
+ */
+ get_random_bytes(&mddev->sb->set_uuid0, 4);
+ get_random_bytes(&mddev->sb->set_uuid1, 4);
+ get_random_bytes(&mddev->sb->set_uuid2, 4);
+ get_random_bytes(&mddev->sb->set_uuid3, 4);
+
+ return 0;
+}
+#undef SET_SB
+
+static int set_disk_faulty(mddev_t *mddev, kdev_t dev)
+{
+ int ret;
+
+ ret = md_error(mddev, dev);
+ return ret;
+}
+
+static int md_ioctl(struct inode *inode, struct file *file,
+ unsigned int cmd, unsigned long arg)
+{
+ unsigned int minor;
+ int err = 0;
+ struct hd_geometry *loc = (struct hd_geometry *) arg;
+ mddev_t *mddev = NULL;
+ kdev_t dev;
+
+ if (!md_capable_admin())
+ return -EACCES;
+
+ dev = inode->i_rdev;
+ minor = MINOR(dev);
+ if (minor >= MAX_MD_DEVS) {
+ MD_BUG();
+ return -EINVAL;
+ }
+
+ /*
+ * Commands dealing with the RAID driver but not any
+ * particular array:
+ */
+ switch (cmd)
+ {
+ case RAID_VERSION:
+ err = get_version((void *)arg);
+ goto done;
+
+ case PRINT_RAID_DEBUG:
+ err = 0;
+ md_print_devices();
+ goto done;
+
+#ifndef MODULE
+ case RAID_AUTORUN:
+ err = 0;
+ autostart_arrays();
+ goto done;
+#endif
+
+ case BLKGETSIZE:
+ case BLKGETSIZE64:
+ case BLKRAGET:
+ case BLKRASET:
+ case BLKFLSBUF:
+ case BLKBSZGET:
+ case BLKBSZSET:
+ err = blk_ioctl (dev, cmd, arg);
+ goto abort;
+
+ default:;
+ }
+
+ /*
+ * Commands creating/starting a new array:
+ */
+
+ mddev = inode->i_bdev->bd_inode->u.generic_ip;
+
+ if (!mddev) {
+ BUG();
+ goto abort;
+ }
+
+
+ if (cmd == START_ARRAY) {
+ /* START_ARRAY doesn't need to lock the array as autostart_array
+ * does the locking, and it could even be a different array
+ */
+ err = autostart_array(val_to_kdev(arg));
+ if (err) {
+ printk(KERN_WARNING "md: autostart %s failed!\n",
+ partition_name(val_to_kdev(arg)));
+ goto abort;
+ }
+ goto done;
+ }
+
+ err = mddev_lock(mddev);
+ if (err) {
+ printk(KERN_INFO "md: ioctl lock interrupted, reason %d, cmd %d\n",
+ err, cmd);
+ goto abort;
+ }
+
+ switch (cmd)
+ {
+ case SET_ARRAY_INFO:
+
+ if (!list_empty(&mddev->disks)) {
+ printk(KERN_WARNING "md: array md%d already has disks!\n",
+ mdidx(mddev));
+ err = -EBUSY;
+ goto abort_unlock;
+ }
+ if (mddev->sb) {
+ printk(KERN_WARNING "md: array md%d already has a superblock!\n",
+ mdidx(mddev));
+ err = -EBUSY;
+ goto abort_unlock;
+ }
+ if (arg) {
+ mdu_array_info_t info;
+ if (md_copy_from_user(&info, (void*)arg, sizeof(info))) {
+ err = -EFAULT;
+ goto abort_unlock;
+ }
+ err = set_array_info(mddev, &info);
+ if (err) {
+			printk(KERN_WARNING "md: couldn't set array info. %d\n", err);
+ goto abort_unlock;
+ }
+ }
+ goto done_unlock;
+
+<<<<<<<
+ err = autostart_array((kdev_t)arg);
+ if (err) {
+ printk(KERN_WARNING "md: autostart %s failed!\n",
+ partition_name((kdev_t)arg));
+|||||||
+ err = autostart_array(val_to_kdev(arg));
+ if (err) {
+ printk(KERN_WARNING "md: autostart %s failed!\n",
+ partition_name(val_to_kdev(arg)));
+=======
+>>>>>>>
+ default:;
+ }
+
+ /*
+ * Commands querying/configuring an existing array:
+ */
+ /* if we don't have a superblock yet, only ADD_NEW_DISK or STOP_ARRAY is allowed */
+ if (!mddev->sb && cmd != ADD_NEW_DISK && cmd != STOP_ARRAY && cmd != RUN_ARRAY) {
+ err = -ENODEV;
+ goto abort_unlock;
+ }
+
+ /*
+ * Commands even a read-only array can execute:
+ */
+ switch (cmd)
+ {
+ case GET_ARRAY_INFO:
+ err = get_array_info(mddev, (void *)arg);
+ goto done_unlock;
+
+ case GET_DISK_INFO:
+ err = get_disk_info(mddev, (void *)arg);
+ goto done_unlock;
+
+ case RESTART_ARRAY_RW:
+ err = restart_array(mddev);
+ goto done_unlock;
+
+ case STOP_ARRAY:
+ err = do_md_stop (mddev, 0);
+ goto done_unlock;
+
+ case STOP_ARRAY_RO:
+ err = do_md_stop (mddev, 1);
+ goto done_unlock;
+
+ /*
+ * We have a problem here : there is no easy way to give a CHS
+ * virtual geometry. We currently pretend that we have a 2 heads
+ * 4 sectors (with a BIG number of cylinders...). This drives
+ * dosfs just mad... ;-)
+ */
+ case HDIO_GETGEO:
+ if (!loc) {
+ err = -EINVAL;
+ goto abort_unlock;
+ }
+ err = md_put_user (2, (char *) &loc->heads);
+ if (err)
+ goto abort_unlock;
+ err = md_put_user (4, (char *) &loc->sectors);
+ if (err)
+ goto abort_unlock;
+ err = md_put_user (md_hd_struct[mdidx(mddev)].nr_sects/8,
+ (short *) &loc->cylinders);
+ if (err)
+ goto abort_unlock;
+ err = md_put_user (md_hd_struct[minor].start_sect,
+ (long *) &loc->start);
+ goto done_unlock;
+ }
+
+ /*
+ * The remaining ioctls are changing the state of the
+ * superblock, so we do not allow read-only arrays
+ * here:
+ */
+ if (mddev->ro) {
+ err = -EROFS;
+ goto abort_unlock;
+ }
+
+ switch (cmd)
+ {
+ case ADD_NEW_DISK:
+ {
+ mdu_disk_info_t info;
+ if (md_copy_from_user(&info, (void*)arg, sizeof(info)))
+ err = -EFAULT;
+ else
+ err = add_new_disk(mddev, &info);
+ goto done_unlock;
+ }
+ case HOT_GENERATE_ERROR:
+ err = hot_generate_error(mddev, (kdev_t)arg);
+ goto done_unlock;
+ case HOT_REMOVE_DISK:
+ err = hot_remove_disk(mddev, (kdev_t)arg);
+ goto done_unlock;
+
+ case HOT_ADD_DISK:
+ err = hot_add_disk(mddev, (kdev_t)arg);
+ goto done_unlock;
+
+ case SET_DISK_FAULTY:
+ err = set_disk_faulty(mddev, (kdev_t)arg);
+ goto done_unlock;
+
+ case RUN_ARRAY:
+ {
+ err = do_md_run (mddev);
+ /*
+ * we have to clean up the mess if
+ * the array cannot be run for some
+ * reason ...
+ */
+ if (err) {
+ mddev->sb_dirty = 0;
+ do_md_stop (mddev, 0);
+ }
+ goto done_unlock;
+ }
+
+ default:
+ printk(KERN_WARNING "md: %s(pid %d) used obsolete MD ioctl, "
+		       "upgrade your software to use new ioctls.\n",
+ current->comm, current->pid);
+ err = -EINVAL;
+ goto abort_unlock;
+ }
+
+done_unlock:
+abort_unlock:
+ mddev_unlock(mddev);
+
+ return err;
+done:
+ if (err)
+ MD_BUG();
+abort:
+ return err;
+}
+
+static int md_open(struct inode *inode, struct file *file)
+{
+ /*
+ * Succeed if we can find or allocate a mddev structure.
+ */
+ mddev_t *mddev = mddev_find(minor(inode->i_rdev));
+ int err = -ENOMEM;
+
+ if (!mddev)
+ goto out;
+
+ if ((err = mddev_lock(mddev)))
+ goto put;
+
+ err = 0;
+ mddev_unlock(mddev);
+ inode->i_bdev->bd_inode->u.generic_ip = mddev_get(mddev);
+ put:
+ mddev_put(mddev);
+ out:
+ return err;
+}
+
+static int md_release(struct inode *inode, struct file * file)
+{
+ mddev_t *mddev = inode->i_bdev->bd_inode->u.generic_ip;
+
+ if (!mddev)
+ BUG();
+ mddev_put(mddev);
+
+ return 0;
+}
+
+static struct block_device_operations md_fops=
+{
+ owner: THIS_MODULE,
+ open: md_open,
+ release: md_release,
+ ioctl: md_ioctl,
+};
+
+
+int md_thread(void * arg)
+{
+ mdk_thread_t *thread = arg;
+
+ md_lock_kernel();
+
+ /*
+ * Detach thread
+ */
+
+ daemonize();
+ reparent_to_init();
+
+ sprintf(current->comm, thread->name);
+ md_init_signals();
+ md_flush_signals();
+ thread->tsk = current;
+
+ /*
+	 * md_thread is a 'system-thread'; its priority should be very
+ * high. We avoid resource deadlocks individually in each
+ * raid personality. (RAID5 does preallocation) We also use RR and
+ * the very same RT priority as kswapd, thus we will never get
+ * into a priority inversion deadlock.
+ *
+ * we definitely have to have equal or higher priority than
+ * bdflush, otherwise bdflush will deadlock if there are too
+ * many dirty RAID5 blocks.
+ */
+ current->policy = SCHED_OTHER;
+ current->nice = -20;
+ md_unlock_kernel();
+
+ complete(thread->event);
+ while (thread->run) {
+ void (*run)(void *data);
+
+ wait_event_interruptible(thread->wqueue,
+ test_bit(THREAD_WAKEUP, &thread->flags));
+
+ clear_bit(THREAD_WAKEUP, &thread->flags);
+
+ run = thread->run;
+ if (run) {
+ run(thread->data);
+ run_task_queue(&tq_disk);
+ }
+ if (md_signal_pending(current))
+ md_flush_signals();
+ }
+ complete(thread->event);
+ return 0;
+}
+
+void md_wakeup_thread(mdk_thread_t *thread)
+{
+ dprintk("md: waking up MD thread %p.\n", thread);
+ set_bit(THREAD_WAKEUP, &thread->flags);
+ wake_up(&thread->wqueue);
+}
+
+mdk_thread_t *md_register_thread(void (*run) (void *),
+ void *data, const char *name)
+{
+ mdk_thread_t *thread;
+ int ret;
+ struct completion event;
+
+ thread = (mdk_thread_t *) kmalloc
+ (sizeof(mdk_thread_t), GFP_KERNEL);
+ if (!thread)
+ return NULL;
+
+ memset(thread, 0, sizeof(mdk_thread_t));
+ md_init_waitqueue_head(&thread->wqueue);
+
+ init_completion(&event);
+ thread->event = &event;
+ thread->run = run;
+ thread->data = data;
+ thread->name = name;
+ ret = kernel_thread(md_thread, thread, 0);
+ if (ret < 0) {
+ kfree(thread);
+ return NULL;
+ }
+ wait_for_completion(&event);
+ return thread;
+}
+
+void md_interrupt_thread(mdk_thread_t *thread)
+{
+ if (!thread->tsk) {
+ MD_BUG();
+ return;
+ }
+ dprintk("interrupting MD-thread pid %d\n", thread->tsk->pid);
+ send_sig(SIGKILL, thread->tsk, 1);
+}
+
+void md_unregister_thread(mdk_thread_t *thread)
+{
+ struct completion event;
+
+ init_completion(&event);
+
+ thread->event = &event;
+ thread->run = NULL;
+ thread->name = NULL;
+ md_interrupt_thread(thread);
+ wait_for_completion(&event);
+ kfree(thread);
+}
+
+static void md_recover_arrays(void)
+{
+ if (!md_recovery_thread) {
+ MD_BUG();
+ return;
+ }
+ md_wakeup_thread(md_recovery_thread);
+}
+
+
+int md_error(mddev_t *mddev, kdev_t rdev)
+{
+ mdk_rdev_t * rrdev;
+
+ dprintk("md_error dev:(%d:%d), rdev:(%d:%d), (caller: %p,%p,%p,%p).\n",
+ MD_MAJOR,mdidx(mddev),MAJOR(rdev),MINOR(rdev),
+ __builtin_return_address(0),__builtin_return_address(1),
+ __builtin_return_address(2),__builtin_return_address(3));
+
+ if (!mddev) {
+ MD_BUG();
+ return 0;
+ }
+ rrdev = find_rdev(mddev, rdev);
+ if (!rrdev || rrdev->faulty)
+ return 0;
+ if (!mddev->pers->error_handler
+ || mddev->pers->error_handler(mddev,rdev) <= 0) {
+ rrdev->faulty = 1;
+ } else
+ return 1;
+ /*
+ * if recovery was running, stop it now.
+ */
+ if (mddev->recovery_running)
+ mddev->recovery_running = -EIO;
+ md_recover_arrays();
+
+ return 0;
+}
+
+static void status_unused(struct seq_file *seq)
+{
+ int i = 0;
+ mdk_rdev_t *rdev;
+ struct md_list_head *tmp;
+
+ seq_printf(seq, "unused devices: ");
+
+ ITERATE_RDEV_ALL(rdev,tmp) {
+ if (list_empty(&rdev->same_set)) {
+ /*
+ * The device is not yet used by any array.
+ */
+ i++;
+ seq_printf(seq, "%s ",
+ partition_name(rdev->dev));
+ }
+ }
+ if (!i)
+ seq_printf(seq, "<none>");
+
+ seq_printf(seq, "\n");
+}
+
+
+static void status_resync(struct seq_file *seq, mddev_t * mddev)
+{
+ unsigned long max_blocks, resync, res, dt, db, rt;
+
+ resync = (mddev->curr_resync - atomic_read(&mddev->recovery_active))/2;
+ max_blocks = mddev->sb->size;
+
+ /*
+ * Should not happen.
+ */
+ if (!max_blocks)
+ MD_BUG();
+
+ res = (resync/1024)*1000/(max_blocks/1024 + 1);
+ {
+ int i, x = res/50, y = 20-x;
+ seq_printf(seq, "[");
+ for (i = 0; i < x; i++)
+ seq_printf(seq, "=");
+ seq_printf(seq, ">");
+ for (i = 0; i < y; i++)
+ seq_printf(seq, ".");
+ seq_printf(seq, "] ");
+ }
+ seq_printf(seq, " %s =%3lu.%lu%% (%lu/%lu)",
+ (mddev->spare ? "recovery" : "resync"),
+ res/10, res % 10, resync, max_blocks);
+
+ /*
+ * We do not want to overflow, so the order of operands and
+ * the * 100 / 100 trick are important. We do a +1 to be
+ * safe against division by zero. We only estimate anyway.
+ *
+ * dt: time from mark until now
+ * db: blocks written from mark until now
+ * rt: remaining time
+ */
+ dt = ((jiffies - mddev->resync_mark) / HZ);
+ if (!dt) dt++;
+ db = resync - (mddev->resync_mark_cnt/2);
+ rt = (dt * ((max_blocks-resync) / (db/100+1)))/100;
+
+ seq_printf(seq, " finish=%lu.%lumin", rt / 60, (rt % 60)/6);
+
+ seq_printf(seq, " speed=%ldK/sec", db/dt);
+
+}
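+
+/*
+ * Editorial sketch, not part of the original source: with the format
+ * strings used above, a resync roughly 45% through a 1000000-block
+ * array would be rendered in /proc/mdstat approximately as
+ *
+ *   [=========>...........]  resync =45.0% (450000/1000000) finish=12.3min speed=1234K/sec
+ *
+ * where the 20-character bar grows by one '=' per 5% of progress and
+ * "recovery" replaces "resync" when a spare is being rebuilt.
+ */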
+
+
+static void *md_seq_start(struct seq_file *seq, loff_t *pos)
+{
+ struct list_head *tmp;
+ loff_t l = *pos;
+ mddev_t *mddev;
+
+ if (l > 0x10000)
+ return NULL;
+ if (!l--)
+ /* header */
+ return (void*)1;
+
+ list_for_each(tmp,&all_mddevs)
+ if (!l--) {
+ mddev = list_entry(tmp, mddev_t, all_mddevs);
+ return mddev;
+ }
+ return (void*)2;/* tail */
+}
+
+static void *md_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ struct list_head *tmp;
+ mddev_t *next_mddev, *mddev = v;
+
+ ++*pos;
+ if (v == (void*)2)
+ return NULL;
+
+ if (v == (void*)1)
+ tmp = all_mddevs.next;
+ else
+ tmp = mddev->all_mddevs.next;
+ if (tmp != &all_mddevs)
+ next_mddev = list_entry(tmp,mddev_t,all_mddevs);
+ else {
+ next_mddev = (void*)2;
+ *pos = 0x10000;
+ }
+
+ return next_mddev;
+
+}
+
+static void md_seq_stop(struct seq_file *seq, void *v)
+{
+
+}
+
+static int md_seq_show(struct seq_file *seq, void *v)
+{
+ int j, size;
+ struct md_list_head *tmp2;
+ mdk_rdev_t *rdev;
+ mddev_t *mddev = v;
+
+ if (v == (void*)1) {
+ seq_printf(seq, "Personalities : ");
+ for (j = 0; j < MAX_PERSONALITY; j++)
+ if (pers[j])
+ seq_printf(seq, "[%s] ", pers[j]->name);
+
+ seq_printf(seq, "\n");
+ seq_printf(seq, "read_ahead ");
+ if (read_ahead[MD_MAJOR] == INT_MAX)
+ seq_printf(seq, "not set\n");
+ else
+ seq_printf(seq, "%d sectors\n", read_ahead[MD_MAJOR]);
+ return 0;
+ }
+ if (v == (void*)2) {
+ status_unused(seq);
+ return 0;
+ }
+
+ seq_printf(seq, "md%d : %sactive", mdidx(mddev),
+ mddev->pers ? "" : "in");
+ if (mddev->pers) {
+ if (mddev->ro)
+ seq_printf(seq, " (read-only)");
+ seq_printf(seq, " %s", mddev->pers->name);
+ }
+
+ size = 0;
+ ITERATE_RDEV(mddev,rdev,tmp2) {
+ seq_printf(seq, " %s[%d]",
+ partition_name(rdev->dev), rdev->desc_nr);
+ if (rdev->faulty) {
+ seq_printf(seq, "(F)");
+ continue;
+ }
+ size += rdev->size;
+ }
+
+ if (!list_empty(&mddev->disks)) {
+ if (mddev->pers)
+ seq_printf(seq, "\n %d blocks",
+ md_size[mdidx(mddev)]);
+ else
+ seq_printf(seq, "\n %d blocks", size);
+ }
+
+ if (mddev->pers) {
+
+ mddev->pers->status (seq, mddev);
+
+ seq_printf(seq, "\n ");
+ if (mddev->curr_resync > 1)
+ status_resync (seq, mddev);
+ else if (mddev->curr_resync == 1)
+ seq_printf(seq, " resync=DELAYED");
+
+ }
+ seq_printf(seq, "\n");
+ return 0;
+}
+
+
+static struct seq_operations md_seq_ops = {
+ .start = md_seq_start,
+ .next = md_seq_next,
+ .stop = md_seq_stop,
+ .show = md_seq_show,
+};
+
+static int md_seq_open(struct inode *inode, struct file *file)
+{
+ int error;
+
+ error = seq_open(file, &md_seq_ops);
+ return error;
+}
+
+static struct file_operations md_seq_fops = {
+ .open = md_seq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+
+int register_md_personality(int pnum, mdk_personality_t *p)
+{
+ if (pnum >= MAX_PERSONALITY) {
+ MD_BUG();
+ return -EINVAL;
+ }
+
+ if (pers[pnum]) {
+ MD_BUG();
+ return -EBUSY;
+ }
+
+ pers[pnum] = p;
+ printk(KERN_INFO "md: %s personality registered as nr %d\n", p->name, pnum);
+ return 0;
+}
+
+int unregister_md_personality(int pnum)
+{
+ if (pnum >= MAX_PERSONALITY) {
+ MD_BUG();
+ return -EINVAL;
+ }
+
+ printk(KERN_INFO "md: %s personality unregistered\n", pers[pnum]->name);
+ pers[pnum] = NULL;
+ return 0;
+}
+
+mdp_disk_t *get_spare(mddev_t *mddev)
+{
+ mdp_super_t *sb = mddev->sb;
+ mdp_disk_t *disk;
+ mdk_rdev_t *rdev;
+ struct md_list_head *tmp;
+
+ ITERATE_RDEV(mddev,rdev,tmp) {
+ if (rdev->faulty)
+ continue;
+ if (!rdev->sb) {
+ MD_BUG();
+ continue;
+ }
+ disk = &sb->disks[rdev->desc_nr];
+ if (disk_faulty(disk)) {
+ MD_BUG();
+ continue;
+ }
+ if (disk_active(disk))
+ continue;
+ return disk;
+ }
+ return NULL;
+}
+
+static unsigned int sync_io[DK_MAX_MAJOR][DK_MAX_DISK];
+void md_sync_acct(kdev_t dev, unsigned long nr_sectors)
+{
+ unsigned int major = MAJOR(dev);
+ unsigned int index;
+
+ index = disk_index(dev);
+ if ((index >= DK_MAX_DISK) || (major >= DK_MAX_MAJOR))
+ return;
+
+ sync_io[major][index] += nr_sectors;
+}
+
+static int is_mddev_idle(mddev_t *mddev)
+{
+ mdk_rdev_t * rdev;
+ struct md_list_head *tmp;
+ int idle;
+ unsigned long curr_events;
+
+ idle = 1;
+ ITERATE_RDEV(mddev,rdev,tmp) {
+ int major = MAJOR(rdev->dev);
+ int idx = disk_index(rdev->dev);
+
+ if ((idx >= DK_MAX_DISK) || (major >= DK_MAX_MAJOR))
+ continue;
+
+ curr_events = kstat.dk_drive_rblk[major][idx] +
+ kstat.dk_drive_wblk[major][idx] ;
+ curr_events -= sync_io[major][idx];
+ if ((curr_events - rdev->last_events) > 32) {
+ rdev->last_events = curr_events;
+ idle = 0;
+ }
+ }
+ return idle;
+}
+
+void md_done_sync(mddev_t *mddev, int blocks, int ok)
+{
+	/* another "blocks" (512-byte) blocks have been synced */
+ atomic_sub(blocks, &mddev->recovery_active);
+ wake_up(&mddev->recovery_wait);
+ if (!ok) {
+ mddev->recovery_running = -EIO;
+ md_recover_arrays();
+ // stop recovery, signal do_sync ....
+ if (mddev->pers->stop_resync)
+ mddev->pers->stop_resync(mddev);
+ if (mddev->recovery_running)
+ md_interrupt_thread(md_recovery_thread);
+ }
+}
+
+
+DECLARE_WAIT_QUEUE_HEAD(resync_wait);
+
+#define SYNC_MARKS 10
+#define SYNC_MARK_STEP (3*HZ)
+static void md_do_sync(void *data)
+{
+ mddev_t *mddev = data;
+ mddev_t *mddev2;
+ unsigned int max_sectors, currspeed,
+ j, window, err;
+ unsigned long mark[SYNC_MARKS];
+ unsigned long mark_cnt[SYNC_MARKS];
+ int last_mark,m;
+ struct md_list_head *tmp;
+ unsigned long last_check;
+
+	/* just in case the thread restarts... */
+ if (mddev->recovery_running <= 0)
+ return;
+
+ /* we overload curr_resync somewhat here.
+ * 0 == not engaged in resync at all
+ * 2 == checking that there is no conflict with another sync
+ * 1 == like 2, but have yielded to allow conflicting resync to
+	 *    commence
+ * other == active in resync - this many blocks
+ */
+ do {
+ mddev->curr_resync = 2;
+
+ ITERATE_MDDEV(mddev2,tmp) {
+ if (mddev2 == mddev)
+ continue;
+ if (mddev2->curr_resync &&
+ match_mddev_units(mddev,mddev2)) {
+ printk(KERN_INFO "md: delaying resync of md%d until md%d "
+ "has finished resync (they share one or more physical units)\n",
+ mdidx(mddev), mdidx(mddev2));
+ if (mddev < mddev2) /* arbitrarily yield */
+ mddev->curr_resync = 1;
+ if (wait_event_interruptible(resync_wait,
+ mddev2->curr_resync < 2)) {
+ md_flush_signals();
+ err = -EINTR;
+ mddev_put(mddev2);
+ goto out;
+ }
+ }
+ }
+ } while (mddev->curr_resync < 2);
+
+ max_sectors = mddev->sb->size<<1;
+
+ printk(KERN_INFO "md: syncing RAID array md%d\n", mdidx(mddev));
+ printk(KERN_INFO "md: minimum _guaranteed_ reconstruction speed: %d KB/sec/disc.\n",
+ sysctl_speed_limit_min);
+	printk(KERN_INFO "md: using maximum available idle IO bandwidth "
+ "(but not more than %d KB/sec) for reconstruction.\n",
+ sysctl_speed_limit_max);
+
+ /*
+ * Resync has low priority.
+ */
+ current->nice = 19;
+
+ is_mddev_idle(mddev); /* this also initializes IO event counters */
+ for (m = 0; m < SYNC_MARKS; m++) {
+ mark[m] = jiffies;
+ mark_cnt[m] = 0;
+ }
+ last_mark = 0;
+ mddev->resync_mark = mark[last_mark];
+ mddev->resync_mark_cnt = mark_cnt[last_mark];
+
+ /*
+ * Tune reconstruction:
+ */
+ window = vm_max_readahead*(PAGE_SIZE/512);
+ printk(KERN_INFO "md: using %dk window, over a total of %d blocks.\n",
+ window/2,max_sectors/2);
+
+ atomic_set(&mddev->recovery_active, 0);
+ init_waitqueue_head(&mddev->recovery_wait);
+ last_check = 0;
+ for (j = 0; j < max_sectors;) {
+ int sectors;
+
+ sectors = mddev->pers->sync_request(mddev, j);
+
+ if (sectors < 0) {
+ err = sectors;
+ goto out;
+ }
+ atomic_add(sectors, &mddev->recovery_active);
+ j += sectors;
+ if (j>1) mddev->curr_resync = j;
+
+ if (last_check + window > j)
+ continue;
+
+ last_check = j;
+
+ run_task_queue(&tq_disk);
+
+ repeat:
+ if (jiffies >= mark[last_mark] + SYNC_MARK_STEP ) {
+ /* step marks */
+ int next = (last_mark+1) % SYNC_MARKS;
+
+ mddev->resync_mark = mark[next];
+ mddev->resync_mark_cnt = mark_cnt[next];
+ mark[next] = jiffies;
+ mark_cnt[next] = j - atomic_read(&mddev->recovery_active);
+ last_mark = next;
+ }
+
+
+ if (md_signal_pending(current)) {
+ /*
+ * got a signal, exit.
+ */
+ printk(KERN_INFO "md: md_do_sync() got signal ... exiting\n");
+ md_flush_signals();
+ err = -EINTR;
+ goto out;
+ }
+
+ /*
+		 * this loop only exits when we are either slower than
+		 * the 'hard' speed limit, or the system has been IO-idle for
+		 * a jiffy.
+ * the system might be non-idle CPU-wise, but we only care
+ * about not overloading the IO subsystem. (things like an
+ * e2fsck being done on the RAID array should execute fast)
+ */
+ if (md_need_resched(current))
+ schedule();
+
+ currspeed = (j-mddev->resync_mark_cnt)/2/((jiffies-mddev->resync_mark)/HZ +1) +1;
+
+ if (currspeed > sysctl_speed_limit_min) {
+ current->nice = 19;
+
+ if ((currspeed > sysctl_speed_limit_max) ||
+ !is_mddev_idle(mddev)) {
+ current->state = TASK_INTERRUPTIBLE;
+ md_schedule_timeout(HZ/4);
+ goto repeat;
+ }
+ } else
+ current->nice = -20;
+ }
+ printk(KERN_INFO "md: md%d: sync done.\n",mdidx(mddev));
+ err = 0;
+ /*
+ * this also signals 'finished resyncing' to md_stop
+ */
+out:
+ wait_disk_event(mddev->recovery_wait, atomic_read(&mddev->recovery_active)==0);
+ /* tell personality that we are finished */
+ mddev->pers->sync_request(mddev, max_sectors, 1);
+
+ mddev->curr_resync = 0;
+ if (err)
+ mddev->recovery_running = err;
+ if (mddev->recovery_running > 0)
+ mddev->recovery_running = 0;
+ if (mddev->recovery_running == 0)
+ mddev->in_sync = 1;
+ md_recover_arrays();
+}
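+
+/*
+ * Editorial note on the throttling loop above: currspeed is the average
+ * resync rate in KB/sec since the last mark (sectors/2 gives KB, elapsed
+ * jiffies/HZ gives seconds, and the +1 in the divisor avoids division by
+ * zero).  While currspeed exceeds sysctl_speed_limit_min the thread runs
+ * at nice +19 and sleeps for HZ/4 whenever it also exceeds
+ * sysctl_speed_limit_max or the array is not idle; below the minimum it
+ * runs at nice -20, i.e. at full speed.
+ */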
+
+
+/*
+ * This is the kernel thread that watches all md arrays for re-sync action
+ * that might be needed.
+ * It does not do any resync itself, but rather "forks" off other threads
+ * to do that as needed.
+ * When it is determined that resync is needed, we set "->recovery_running" and
+ * create a thread at ->sync_thread.
+ * When the thread finishes it clears recovery_running (or sets an error)
+ * and wakes up this thread, which will reap the thread and finish up.
+ */
+void md_do_recovery(void *data)
+{
+ mddev_t *mddev;
+ mdp_super_t *sb;
+ struct md_list_head *tmp;
+
+ dprintk(KERN_INFO "md: recovery thread got woken up ...\n");
+
+ ITERATE_MDDEV(mddev,tmp) if (mddev_lock(mddev)==0) {
+ sb = mddev->sb;
+ if (!sb || !mddev->pers || !mddev->pers->diskop || mddev->ro)
+ goto unlock;
+ if (mddev->recovery_running > 0)
+ /* resync/recovery still happening */
+ goto unlock;
+ if (mddev->sb_dirty)
+ md_update_sb(mddev);
+ if (mddev->sync_thread) {
+ /* resync has finished, collect result */
+ md_unregister_thread(mddev->sync_thread);
+ mddev->sync_thread = NULL;
+ if (mddev->recovery_running < 0) {
+ /* some sort of failure.
+ * If we were doing a reconstruction,
+ * we need to retrieve the spare
+ */
+ if (mddev->spare) {
+ mddev->pers->diskop(mddev, &mddev->spare,
+ DISKOP_SPARE_INACTIVE);
+ mddev->spare = NULL;
+ }
+ } else {
+ /* success...*/
+ if (mddev->spare) {
+ mddev->pers->diskop(mddev, &mddev->spare,
+ DISKOP_SPARE_ACTIVE);
+ mark_disk_sync(mddev->spare);
+ mark_disk_active(mddev->spare);
+ sb->active_disks++;
+ sb->spare_disks--;
+ mddev->spare = NULL;
+ }
+ }
+ __md_update_sb(mddev);
+ mddev->recovery_running = 0;
+ wake_up(&resync_wait);
+ goto unlock;
+ }
+ if (mddev->recovery_running) {
+ /* that's odd.. */
+ mddev->recovery_running = 0;
+ wake_up(&resync_wait);
+ }
+
+ if (sb->active_disks < sb->raid_disks) {
+ mddev->spare = get_spare(mddev);
+ if (!mddev->spare)
+ printk(KERN_ERR "md%d: no spare disk to reconstruct array! "
+ "-- continuing in degraded mode\n", mdidx(mddev));
+ else
+ printk(KERN_INFO "md%d: resyncing spare disk %s to replace failed disk\n",
+ mdidx(mddev), partition_name(MKDEV(mddev->spare->major,mddev->spare->minor)));
+ }
+ if (!mddev->spare && mddev->in_sync) {
+ /* nothing we can do ... */
+ goto unlock;
+ }
+ if (mddev->pers->sync_request) {
+ mddev->sync_thread = md_register_thread(md_do_sync,
+ mddev,
+ "md_resync");
+ if (!mddev->sync_thread) {
+ printk(KERN_ERR "md%d: could not start resync thread...\n", mdidx(mddev));
+ if (mddev->spare)
+ mddev->pers->diskop(mddev, &mddev->spare, DISKOP_SPARE_INACTIVE);
+ mddev->spare = NULL;
+ mddev->recovery_running = 0;
+ } else {
+ if (mddev->spare)
+ mddev->pers->diskop(mddev, &mddev->spare, DISKOP_SPARE_WRITE);
+ mddev->recovery_running = 1;
+ md_wakeup_thread(mddev->sync_thread);
+ }
+ }
+ unlock:
+ mddev_unlock(mddev);
+ }
+ dprintk(KERN_INFO "md: recovery thread finished ...\n");
+
+}
+
+int md_notify_reboot(struct notifier_block *this,
+ unsigned long code, void *x)
+{
+ struct md_list_head *tmp;
+ mddev_t *mddev;
+
+ if ((code == MD_SYS_DOWN) || (code == MD_SYS_HALT)
+ || (code == MD_SYS_POWER_OFF)) {
+
+ printk(KERN_INFO "md: stopping all md devices.\n");
+
+ ITERATE_MDDEV(mddev,tmp)
+ if (mddev_trylock(mddev)==0)
+ do_md_stop (mddev, 1);
+ /*
+ * certain more exotic SCSI devices are known to be
+ * volatile wrt too early system reboots. While the
+ * right place to handle this issue is the given
+ * driver, we do want to have a safe RAID driver ...
+ */
+ md_mdelay(1000*1);
+ }
+ return NOTIFY_DONE;
+}
+
+struct notifier_block md_notifier = {
+ notifier_call: md_notify_reboot,
+ next: NULL,
+ priority: INT_MAX, /* before any real devices */
+};
+
+static void md_geninit(void)
+{
+ struct proc_dir_entry *p;
+ int i;
+
+ for(i = 0; i < MAX_MD_DEVS; i++) {
+ md_blocksizes[i] = 1024;
+ md_size[i] = 0;
+ md_hardsect_sizes[i] = 512;
+ }
+ blksize_size[MAJOR_NR] = md_blocksizes;
+ blk_size[MAJOR_NR] = md_size;
+ max_readahead[MAJOR_NR] = md_maxreadahead;
+ hardsect_size[MAJOR_NR] = md_hardsect_sizes;
+
+ dprintk("md: sizeof(mdp_super_t) = %d\n", (int)sizeof(mdp_super_t));
+
+#ifdef CONFIG_PROC_FS
+ p = create_proc_entry("mdstat", S_IRUGO, NULL);
+ if (p)
+ p->proc_fops = &md_seq_fops;
+#endif
+}
+
+request_queue_t * md_queue_proc(kdev_t dev)
+{
+ mddev_t *mddev = mddev_find(minor(dev));
+ request_queue_t *q = BLK_DEFAULT_QUEUE(MAJOR_NR);
+ if (!mddev || atomic_read(&mddev->active)<2)
+ BUG();
+ if (mddev->pers)
+ q = &mddev->queue;
+ mddev_put(mddev); /* the caller must hold a reference... */
+ return q;
+}
+
+int md__init md_init(void)
+{
+ static char * name = "mdrecoveryd";
+ int minor;
+
+ printk(KERN_INFO "md: md driver %d.%d.%d MAX_MD_DEVS=%d, MD_SB_DISKS=%d\n",
+ MD_MAJOR_VERSION, MD_MINOR_VERSION,
+ MD_PATCHLEVEL_VERSION, MAX_MD_DEVS, MD_SB_DISKS);
+
+ if (devfs_register_blkdev (MAJOR_NR, "md", &md_fops))
+ {
+ printk(KERN_ALERT "md: Unable to get major %d for md\n", MAJOR_NR);
+ return (-1);
+ }
+ devfs_handle = devfs_mk_dir (NULL, "md", NULL);
+ /* we don't use devfs_register_series because we want to fill md_hd_struct */
+ for (minor=0; minor < MAX_MD_DEVS; ++minor) {
+ char devname[128];
+ sprintf (devname, "%u", minor);
+ md_hd_struct[minor].de = devfs_register (devfs_handle,
+ devname, DEVFS_FL_DEFAULT, MAJOR_NR, minor,
+ S_IFBLK | S_IRUSR | S_IWUSR, &md_fops, NULL);
+ }
+
+ /* all requests on an uninitialised device get failed... */
+ blk_queue_make_request(BLK_DEFAULT_QUEUE(MAJOR_NR), md_fail_request);
+ blk_dev[MAJOR_NR].queue = md_queue_proc;
+
+
+ read_ahead[MAJOR_NR] = INT_MAX;
+
+ add_gendisk(&md_gendisk);
+
+ md_recovery_thread = md_register_thread(md_do_recovery, NULL, name);
+ if (!md_recovery_thread)
+ printk(KERN_ALERT "md: bug: couldn't allocate md_recovery_thread\n");
+
+ md_register_reboot_notifier(&md_notifier);
+ raid_table_header = register_sysctl_table(raid_root_table, 1);
+
+ md_geninit();
+ return (0);
+}
+
+
+#ifndef MODULE
+
+/*
+ * When md (and any required personalities) are compiled into the kernel
+ * (not as a module), arrays can be assembled at boot time with AUTODETECT,
+ * where specially marked partitions are registered with md_autodetect_dev(),
+ * and with MD_BOOT, where devices to be collected are given on the boot line
+ * with md=.....
+ * The code for that is here.
+ */
+
+struct {
+ int set;
+ int noautodetect;
+} raid_setup_args md__initdata;
+
+/*
+ * Searches all registered partitions for autorun RAID arrays
+ * at boot time.
+ */
+static kdev_t detected_devices[128];
+static int dev_cnt;
+
+void md_autodetect_dev(kdev_t dev)
+{
+ if (dev_cnt >= 0 && dev_cnt < 127)
+ detected_devices[dev_cnt++] = dev;
+}
+
+
+static void autostart_arrays(void)
+{
+ mdk_rdev_t *rdev;
+ int i;
+
+ printk(KERN_INFO "md: Autodetecting RAID arrays.\n");
+
+ for (i = 0; i < dev_cnt; i++) {
+ kdev_t dev = detected_devices[i];
+
+ if (md_import_device(dev,1)) {
+ printk(KERN_ALERT "md: could not import %s!\n",
+ partition_name(dev));
+ continue;
+ }
+ /*
+ * Sanity checks:
+ */
+ rdev = find_rdev_all(dev);
+ if (!rdev) {
+ MD_BUG();
+ continue;
+ }
+ if (rdev->faulty) {
+ MD_BUG();
+ continue;
+ }
+ md_list_add(&rdev->pending, &pending_raid_disks);
+ }
+ dev_cnt = 0;
+
+ autorun_devices();
+}
+
+static struct {
+ char device_set [MAX_MD_DEVS];
+ int pers[MAX_MD_DEVS];
+ int chunk[MAX_MD_DEVS];
+ char *device_names[MAX_MD_DEVS];
+} md_setup_args md__initdata;
+
+/*
+ * Parse the command-line parameters given to our kernel, but do not
+ * actually try to invoke the MD device now; that is handled by
+ * md_setup_drive after the low-level disk drivers have initialised.
+ *
+ * 27/11/1999: Fixed to work correctly with the 2.3 kernel (which
+ * assigns the task of parsing integer arguments to the
+ * invoked program now). Added ability to initialise all
+ * the MD devices (by specifying multiple "md=" lines)
+ * instead of just one. -- KTK
+ * 18May2000: Added support for persistent-superblock arrays:
+ * md=n,0,factor,fault,device-list uses RAID0 for device n
+ * md=n,-1,factor,fault,device-list uses LINEAR for device n
+ * md=n,device-list reads a RAID superblock from the devices
+ * elements in device-list are read by name_to_kdev_t so can be
+ * a hex number or something like /dev/hda1 /dev/sdb
+ * 2001-06-03: Dave Cinege <dcinege@psychosis.com>
+ * Shifted name_to_kdev_t() and related operations to md_set_drive()
+ * for later execution. Rewrote section to make devfs compatible.
+ */
+static int md__init md_setup(char *str)
+{
+ int minor, level, factor, fault;
+ char *pername = "";
+ char *str1 = str;
+
+ if (get_option(&str, &minor) != 2) { /* MD Number */
+ printk(KERN_WARNING "md: Too few arguments supplied to md=.\n");
+ return 0;
+ }
+ if (minor >= MAX_MD_DEVS) {
+ printk(KERN_WARNING "md: md=%d, Minor device number too high.\n", minor);
+ return 0;
+ } else if (md_setup_args.device_names[minor]) {
+		printk(KERN_WARNING "md: md=%d, Specified more than once. "
+ "Replacing previous definition.\n", minor);
+ }
+ switch (get_option(&str, &level)) { /* RAID Personality */
+ case 2: /* could be 0 or -1.. */
+ if (level == 0 || level == -1) {
+ if (get_option(&str, &factor) != 2 || /* Chunk Size */
+ get_option(&str, &fault) != 2) {
+ printk(KERN_WARNING "md: Too few arguments supplied to md=.\n");
+ return 0;
+ }
+ md_setup_args.pers[minor] = level;
+ md_setup_args.chunk[minor] = 1 << (factor+12);
+ switch(level) {
+ case -1:
+ level = LINEAR;
+ pername = "linear";
+ break;
+ case 0:
+ level = RAID0;
+ pername = "raid0";
+ break;
+ default:
+ printk(KERN_WARNING
+ "md: The kernel has not been configured for raid%d support!\n",
+ level);
+ return 0;
+ }
+ md_setup_args.pers[minor] = level;
+ break;
+ }
+ /* FALL THROUGH */
+ case 1: /* the first device is numeric */
+ str = str1;
+ /* FALL THROUGH */
+ case 0:
+ md_setup_args.pers[minor] = 0;
+ pername="super-block";
+ }
+
+ printk(KERN_INFO "md: Will configure md%d (%s) from %s, below.\n",
+ minor, pername, str);
+ md_setup_args.device_names[minor] = str;
+
+ return 1;
+}
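+
+/*
+ * Illustrative examples, not from the original source: a boot line of
+ *
+ *   md=0,0,4,0,/dev/hda1,/dev/hdb1
+ *
+ * asks md_setup() to configure md0 as RAID0 with a chunk size of
+ * 1 << (4+12) = 64KB from the two listed devices, while
+ *
+ *   md=1,/dev/hda2,/dev/hdb2
+ *
+ * records md1 for assembly from the persistent superblocks found on the
+ * listed devices (the "super-block" case above).
+ */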
+
+extern kdev_t name_to_kdev_t(char *line) md__init;
+void md__init md_setup_drive(void)
+{
+ int minor, i;
+ kdev_t dev;
+ mddev_t*mddev;
+ kdev_t devices[MD_SB_DISKS+1];
+
+ for (minor = 0; minor < MAX_MD_DEVS; minor++) {
+ int err = 0;
+ char *devname;
+ mdu_disk_info_t dinfo;
+
+ if ((devname = md_setup_args.device_names[minor]) == 0) continue;
+
+ for (i = 0; i < MD_SB_DISKS && devname != 0; i++) {
+
+ char *p;
+ void *handle;
+
+ p = strchr(devname, ',');
+ if (p)
+ *p++ = 0;
+
+ dev = name_to_kdev_t(devname);
+ handle = devfs_find_handle(NULL, devname, MAJOR (dev), MINOR (dev),
+ DEVFS_SPECIAL_BLK, 1);
+ if (handle != 0) {
+ unsigned major, minor;
+ devfs_get_maj_min(handle, &major, &minor);
+ dev = MKDEV(major, minor);
+ }
+ if (dev == 0) {
+ printk(KERN_WARNING "md: Unknown device name: %s\n", devname);
+ break;
+ }
+
+ devices[i] = dev;
+ md_setup_args.device_set[minor] = 1;
+
+ devname = p;
+ }
+ devices[i] = 0;
+
+ if (md_setup_args.device_set[minor] == 0)
+ continue;
+
+ printk(KERN_INFO "md: Loading md%d: %s\n", minor, md_setup_args.device_names[minor]);
+
+ mddev = mddev_find(minor);
+ if (!mddev) {
+ printk(KERN_ERR "md: kmalloc failed - cannot start array %d\n", minor);
+ continue;
+ }
+ if (mddev_lock(mddev)) {
+ printk(KERN_WARNING
+ "md: Ignoring md=%d, cannot lock!\n",
+ minor);
+ mddev_put(mddev);
+ continue;
+ }
+
+ if (mddev->sb || !list_empty(&mddev->disks)) {
+ printk(KERN_WARNING
+ "md: Ignoring md=%d, already autodetected. (Use raid=noautodetect)\n",
+ minor);
+ mddev_unlock(mddev);
+ mddev_put(mddev);
+ continue;
+ }
+ if (md_setup_args.pers[minor]) {
+ /* non-persistent */
+ mdu_array_info_t ainfo;
+ ainfo.level = pers_to_level(md_setup_args.pers[minor]);
+ ainfo.size = 0;
+ ainfo.nr_disks =0;
+ ainfo.raid_disks =0;
+ ainfo.md_minor =minor;
+ ainfo.not_persistent = 1;
+
+ ainfo.state = (1 << MD_SB_CLEAN);
+ ainfo.active_disks = 0;
+ ainfo.working_disks = 0;
+ ainfo.failed_disks = 0;
+ ainfo.spare_disks = 0;
+ ainfo.layout = 0;
+ ainfo.chunk_size = md_setup_args.chunk[minor];
+ err = set_array_info(mddev, &ainfo);
+ for (i = 0; !err && (dev = devices[i]); i++) {
+ dinfo.number = i;
+ dinfo.raid_disk = i;
+ dinfo.state = (1<<MD_DISK_ACTIVE)|(1<<MD_DISK_SYNC);
+ dinfo.major = MAJOR(dev);
+ dinfo.minor = MINOR(dev);
+ mddev->sb->nr_disks++;
+ mddev->sb->raid_disks++;
+ mddev->sb->active_disks++;
+ mddev->sb->working_disks++;
+ err = add_new_disk (mddev, &dinfo);
+ }
+ } else {
+ /* persistent */
+ for (i = 0; (dev = devices[i]); i++) {
+ dinfo.major = MAJOR(dev);
+ dinfo.minor = MINOR(dev);
+ add_new_disk (mddev, &dinfo);
+ }
+ }
+ if (!err)
+ err = do_md_run(mddev);
+ if (err) {
+ mddev->sb_dirty = 0;
+ do_md_stop(mddev, 0);
+ printk(KERN_WARNING "md: starting md%d failed\n", minor);
+ }
+ mddev_unlock(mddev);
+ mddev_put(mddev);
+ }
+}
+
+static int md__init raid_setup(char *str)
+{
+ int len, pos;
+
+ len = strlen(str) + 1;
+ pos = 0;
+
+ while (pos < len) {
+ char *comma = strchr(str+pos, ',');
+ int wlen;
+ if (comma)
+ wlen = (comma-str)-pos;
+ else wlen = (len-1)-pos;
+
+ if (strncmp(str, "noautodetect", wlen) == 0)
+ raid_setup_args.noautodetect = 1;
+ pos += wlen+1;
+ }
+ raid_setup_args.set = 1;
+ return 1;
+}
+
+int md__init md_run_setup(void)
+{
+ if (raid_setup_args.noautodetect)
+ printk(KERN_INFO "md: Skipping autodetection of RAID arrays. (raid=noautodetect)\n");
+ else
+ autostart_arrays();
+ md_setup_drive();
+ return 0;
+}
+
+__setup("raid=", raid_setup);
+__setup("md=", md_setup);
+
+__initcall(md_init);
+__initcall(md_run_setup);
+
+#else /* It is a MODULE */
+
+int init_module(void)
+{
+ return md_init();
+}
+
+static void free_device_names(void)
+{
+ while (!list_empty(&device_names)) {
+ struct dname *tmp = list_entry(device_names.next,
+ dev_name_t, list);
+ list_del(&tmp->list);
+ kfree(tmp);
+ }
+}
+
+
+void cleanup_module(void)
+{
+ md_unregister_thread(md_recovery_thread);
+ devfs_unregister(devfs_handle);
+
+ devfs_unregister_blkdev(MAJOR_NR,"md");
+ unregister_reboot_notifier(&md_notifier);
+ unregister_sysctl_table(raid_table_header);
+#ifdef CONFIG_PROC_FS
+ remove_proc_entry("mdstat", NULL);
+#endif
+
+ del_gendisk(&md_gendisk);
+
+ blk_dev[MAJOR_NR].queue = NULL;
+ blksize_size[MAJOR_NR] = NULL;
+ blk_size[MAJOR_NR] = NULL;
+ max_readahead[MAJOR_NR] = NULL;
+ hardsect_size[MAJOR_NR] = NULL;
+
+ free_device_names();
+
+}
+#endif
+
+MD_EXPORT_SYMBOL(md_size);
+MD_EXPORT_SYMBOL(register_md_personality);
+MD_EXPORT_SYMBOL(unregister_md_personality);
+MD_EXPORT_SYMBOL(partition_name);
+MD_EXPORT_SYMBOL(md_error);
+MD_EXPORT_SYMBOL(md_done_sync);
+MD_EXPORT_SYMBOL(md_unregister_thread);
+MD_EXPORT_SYMBOL(md_update_sb);
+MD_EXPORT_SYMBOL(md_wakeup_thread);
+MD_EXPORT_SYMBOL(md_print_devices);
+MD_EXPORT_SYMBOL(find_rdev_nr);
+MD_EXPORT_SYMBOL(md_interrupt_thread);
+MODULE_LICENSE("GPL");
diff --git a/tests/linux/md-autostart/orig b/tests/linux/md-autostart/orig
new file mode 100644
index 0000000..12a3519
--- /dev/null
+++ b/tests/linux/md-autostart/orig
@@ -0,0 +1,4025 @@
+/*
+ md.c : Multiple Devices driver for Linux
+ Copyright (C) 1998, 1999, 2000 Ingo Molnar
+
+ completely rewritten, based on the MD driver code from Marc Zyngier
+
+ Changes:
+
+ - RAID-1/RAID-5 extensions by Miguel de Icaza, Gadi Oxman, Ingo Molnar
+ - boot support for linear and striped mode by Harald Hoyer <HarryH@Royal.Net>
+ - kerneld support by Boris Tobotras <boris@xtalk.msk.su>
+ - kmod support by: Cyrus Durgin
+ - RAID0 bugfixes: Mark Anthony Lisher <markal@iname.com>
+ - Devfs support by Richard Gooch <rgooch@atnf.csiro.au>
+
+ - lots of fixes and improvements to the RAID1/RAID5 and generic
+ RAID code (such as request based resynchronization):
+
+ Neil Brown <neilb@cse.unsw.edu.au>.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2, or (at your option)
+ any later version.
+
+ You should have received a copy of the GNU General Public License
+ (for example /usr/src/linux/COPYING); if not, write to the Free
+ Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+*/
+
+#include <linux/module.h>
+#include <linux/config.h>
+#include <linux/raid/md.h>
+#include <linux/sysctl.h>
+#include <linux/raid/xor.h>
+#include <linux/devfs_fs_kernel.h>
+
+#include <linux/init.h>
+
+#ifdef CONFIG_KMOD
+#include <linux/kmod.h>
+#endif
+
+#define __KERNEL_SYSCALLS__
+#include <linux/unistd.h>
+
+#include <asm/unaligned.h>
+
+#define MAJOR_NR MD_MAJOR
+#define MD_DRIVER
+
+#include <linux/blk.h>
+
+#define DEBUG 0
+#if DEBUG
+# define dprintk(x...) printk(x)
+#else
+# define dprintk(x...) do { } while(0)
+#endif
+
+#ifndef MODULE
+static void autostart_arrays (void);
+#endif
+
+static mdk_personality_t *pers[MAX_PERSONALITY];
+
+/*
+ * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit'
+ * is 100 KB/sec, so the extra system load does not show up that much.
+ * Increase it if you want to have more _guaranteed_ speed. Note that
+ * the RAID driver will use the maximum available bandwidth if the IO
+ * subsystem is idle. There is also an 'absolute maximum' reconstruction
+ * speed limit - in case reconstruction slows down your system despite
+ * idle IO detection.
+ *
+ * you can change it via /proc/sys/dev/raid/speed_limit_min and _max.
+ */
+
+static int sysctl_speed_limit_min = 100;
+static int sysctl_speed_limit_max = 100000;
+
+static struct ctl_table_header *raid_table_header;
+
+static ctl_table raid_table[] = {
+ {DEV_RAID_SPEED_LIMIT_MIN, "speed_limit_min",
+ &sysctl_speed_limit_min, sizeof(int), 0644, NULL, &proc_dointvec},
+ {DEV_RAID_SPEED_LIMIT_MAX, "speed_limit_max",
+ &sysctl_speed_limit_max, sizeof(int), 0644, NULL, &proc_dointvec},
+ {0}
+};
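+
+/*
+ * Editorial note: together with raid_dir_table and raid_root_table below,
+ * these entries appear as /proc/sys/dev/raid/speed_limit_min and
+ * /proc/sys/dev/raid/speed_limit_max, so e.g.
+ *
+ *   echo 1000 > /proc/sys/dev/raid/speed_limit_min
+ *
+ * raises the guaranteed per-disc reconstruction speed to 1000 KB/sec.
+ */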
+
+static ctl_table raid_dir_table[] = {
+ {DEV_RAID, "raid", NULL, 0, 0555, raid_table},
+ {0}
+};
+
+static ctl_table raid_root_table[] = {
+ {CTL_DEV, "dev", NULL, 0, 0555, raid_dir_table},
+ {0}
+};
+
+/*
+ * these have to be allocated separately because external
+ * subsystems want to have a pre-defined structure
+ */
+struct hd_struct md_hd_struct[MAX_MD_DEVS];
+static int md_blocksizes[MAX_MD_DEVS];
+static int md_hardsect_sizes[MAX_MD_DEVS];
+static void md_recover_arrays(void);
+static mdk_thread_t *md_recovery_thread;
+
+int md_size[MAX_MD_DEVS];
+
+static struct block_device_operations md_fops;
+static devfs_handle_t devfs_handle;
+
+static struct gendisk md_gendisk=
+{
+ major: MD_MAJOR,
+ major_name: "md",
+ minor_shift: 0,
+ max_p: 1,
+ part: md_hd_struct,
+ sizes: md_size,
+ nr_real: MAX_MD_DEVS,
+ real_devices: NULL,
+ next: NULL,
+ fops: &md_fops,
+};
+
+/*
+ * Enables to iterate over all existing md arrays
+ * all_mddevs_lock protects this list as well as mddev_map.
+ */
+static MD_LIST_HEAD(all_mddevs);
+static spinlock_t all_mddevs_lock = SPIN_LOCK_UNLOCKED;
+
+
+/*
+ * iterates through all used mddevs in the system.
+ * We take care to grab the all_mddevs_lock whenever navigating
+ * the list, and to always hold a refcount when unlocked.
+ * Any code which breaks out of this loop will own
+ * a reference to the current mddev and must mddev_put it.
+ */
+#define ITERATE_MDDEV(mddev,tmp) \
+ \
+ for (spin_lock(&all_mddevs_lock), \
+ (tmp = all_mddevs.next), \
+ (mddev = NULL); \
+ (void)(tmp != &all_mddevs && \
+ mddev_get(list_entry(tmp, mddev_t, all_mddevs))),\
+ spin_unlock(&all_mddevs_lock), \
+ (mddev ? mddev_put(mddev):(void)NULL), \
+ (mddev = list_entry(tmp, mddev_t, all_mddevs)), \
+ (tmp != &all_mddevs); \
+ spin_lock(&all_mddevs_lock), \
+ (tmp = tmp->next) \
+ )
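+
+/*
+ * Typical use (editorial note, mirroring the callers later in this file):
+ *
+ *	struct md_list_head *tmp;
+ *	mddev_t *mddev;
+ *
+ *	ITERATE_MDDEV(mddev,tmp) if (mddev_lock(mddev)==0) {
+ *		... operate on mddev ...
+ *		mddev_unlock(mddev);
+ *	}
+ *
+ * The macro drops all_mddevs_lock while the loop body runs and keeps the
+ * current entry alive through the reference taken by mddev_get().
+ */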
+
+static mddev_t *mddev_map[MAX_MD_DEVS];
+
+static int md_fail_request (request_queue_t *q, struct bio *bio)
+{
+ bio_io_error(bio);
+ return 0;
+}
+
+static inline mddev_t *mddev_get(mddev_t *mddev)
+{
+ atomic_inc(&mddev->active);
+ return mddev;
+}
+
+static void mddev_put(mddev_t *mddev)
+{
+ if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock))
+ return;
+ if (!mddev->sb && list_empty(&mddev->disks)) {
+ list_del(&mddev->all_mddevs);
+ mddev_map[mdidx(mddev)] = NULL;
+ kfree(mddev);
+ MOD_DEC_USE_COUNT;
+ }
+ spin_unlock(&all_mddevs_lock);
+}
+
+static mddev_t * mddev_find(int unit)
+{
+ mddev_t *mddev, *new = NULL;
+
+ retry:
+ spin_lock(&all_mddevs_lock);
+ if (mddev_map[unit]) {
+ mddev = mddev_get(mddev_map[unit]);
+ spin_unlock(&all_mddevs_lock);
+ if (new)
+ kfree(new);
+ return mddev;
+ }
+ if (new) {
+ mddev_map[unit] = new;
+ list_add(&new->all_mddevs, &all_mddevs);
+ spin_unlock(&all_mddevs_lock);
+ MOD_INC_USE_COUNT;
+ return new;
+ }
+ spin_unlock(&all_mddevs_lock);
+
+ new = (mddev_t *) kmalloc(sizeof(*new), GFP_KERNEL);
+ if (!new)
+ return NULL;
+
+ memset(new, 0, sizeof(*new));
+
+ new->__minor = unit;
+ init_MUTEX(&new->reconfig_sem);
+ MD_INIT_LIST_HEAD(&new->disks);
+ MD_INIT_LIST_HEAD(&new->all_mddevs);
+ atomic_set(&new->active, 1);
+
+ goto retry;
+}
+
+static inline int mddev_lock(mddev_t * mddev)
+{
+ return down_interruptible(&mddev->reconfig_sem);
+}
+
+static inline int mddev_trylock(mddev_t * mddev)
+{
+ return down_trylock(&mddev->reconfig_sem);
+}
+
+static inline void mddev_unlock(mddev_t * mddev)
+{
+ up(&mddev->reconfig_sem);
+}
+
+mdk_rdev_t * find_rdev_nr(mddev_t *mddev, int nr)
+{
+ mdk_rdev_t * rdev;
+ struct md_list_head *tmp;
+
+ ITERATE_RDEV(mddev,rdev,tmp) {
+ if (rdev->desc_nr == nr)
+ return rdev;
+ }
+ return NULL;
+}
+
+mdk_rdev_t * find_rdev(mddev_t * mddev, kdev_t dev)
+{
+ struct md_list_head *tmp;
+ mdk_rdev_t *rdev;
+
+ ITERATE_RDEV(mddev,rdev,tmp) {
+ if (rdev->dev == dev)
+ return rdev;
+ }
+ return NULL;
+}
+
+static MD_LIST_HEAD(device_names);
+
+char * partition_name(kdev_t dev)
+{
+ struct gendisk *hd;
+ static char nomem [] = "<nomem>";
+ dev_name_t *dname;
+ struct md_list_head *tmp;
+
+ list_for_each(tmp, &device_names) {
+ dname = md_list_entry(tmp, dev_name_t, list);
+ if (dname->dev == dev)
+ return dname->name;
+ }
+
+ dname = (dev_name_t *) kmalloc(sizeof(*dname), GFP_KERNEL);
+
+ if (!dname)
+ return nomem;
+ /*
+ * ok, add this new device name to the list
+ */
+ hd = get_gendisk (dev);
+ dname->name = NULL;
+ if (hd)
+ dname->name = disk_name (hd, MINOR(dev), dname->namebuf);
+ if (!dname->name) {
+ sprintf (dname->namebuf, "[dev %s]", kdevname(dev));
+ dname->name = dname->namebuf;
+ }
+
+ dname->dev = dev;
+ md_list_add(&dname->list, &device_names);
+
+ return dname->name;
+}
+
+static unsigned int calc_dev_sboffset(kdev_t dev, mddev_t *mddev,
+ int persistent)
+{
+ unsigned int size = 0;
+
+ if (blk_size[MAJOR(dev)])
+ size = blk_size[MAJOR(dev)][MINOR(dev)];
+ if (persistent)
+ size = MD_NEW_SIZE_BLOCKS(size);
+ return size;
+}
+
+static unsigned int calc_dev_size(kdev_t dev, mddev_t *mddev, int persistent)
+{
+ unsigned int size;
+
+ size = calc_dev_sboffset(dev, mddev, persistent);
+ if (!mddev->sb) {
+ MD_BUG();
+ return size;
+ }
+ if (mddev->sb->chunk_size)
+ size &= ~(mddev->sb->chunk_size/1024 - 1);
+ return size;
+}
+
+static unsigned int zoned_raid_size(mddev_t *mddev)
+{
+ unsigned int mask;
+ mdk_rdev_t * rdev;
+ struct md_list_head *tmp;
+
+ if (!mddev->sb) {
+ MD_BUG();
+ return -EINVAL;
+ }
+ /*
+ * do size and offset calculations.
+ */
+ mask = ~(mddev->sb->chunk_size/1024 - 1);
+
+ ITERATE_RDEV(mddev,rdev,tmp) {
+ rdev->size &= mask;
+ md_size[mdidx(mddev)] += rdev->size;
+ }
+ return 0;
+}
+
+static void remove_descriptor(mdp_disk_t *disk, mdp_super_t *sb)
+{
+ if (disk_active(disk)) {
+ sb->working_disks--;
+ } else {
+ if (disk_spare(disk)) {
+ sb->spare_disks--;
+ sb->working_disks--;
+ } else {
+ sb->failed_disks--;
+ }
+ }
+ sb->nr_disks--;
+ disk->major = 0;
+ disk->minor = 0;
+ mark_disk_removed(disk);
+}
+
+#define BAD_MAGIC KERN_ERR \
+"md: invalid raid superblock magic on %s\n"
+
+#define BAD_MINOR KERN_ERR \
+"md: %s: invalid raid minor (%x)\n"
+
+#define OUT_OF_MEM KERN_ALERT \
+"md: out of memory.\n"
+
+#define NO_SB KERN_ERR \
+"md: disabled device %s, could not read superblock.\n"
+
+#define BAD_CSUM KERN_WARNING \
+"md: invalid superblock checksum on %s\n"
+
+static int alloc_array_sb(mddev_t * mddev)
+{
+ if (mddev->sb) {
+ MD_BUG();
+ return 0;
+ }
+
+ mddev->sb = (mdp_super_t *) __get_free_page (GFP_KERNEL);
+ if (!mddev->sb)
+ return -ENOMEM;
+ md_clear_page(mddev->sb);
+ return 0;
+}
+
+static int alloc_disk_sb(mdk_rdev_t * rdev)
+{
+ if (rdev->sb)
+ MD_BUG();
+
+ rdev->sb_page = alloc_page(GFP_KERNEL);
+ if (!rdev->sb_page) {
+ printk(OUT_OF_MEM);
+ return -EINVAL;
+ }
+ rdev->sb = (mdp_super_t *) page_address(rdev->sb_page);
+
+ return 0;
+}
+
+static void free_disk_sb(mdk_rdev_t * rdev)
+{
+ if (rdev->sb_page) {
+ page_cache_release(rdev->sb_page);
+ rdev->sb = NULL;
+ rdev->sb_page = NULL;
+ rdev->sb_offset = 0;
+ rdev->size = 0;
+ } else {
+ if (!rdev->faulty)
+ MD_BUG();
+ }
+}
+
+
+static void bh_complete(struct buffer_head *bh, int uptodate)
+{
+
+ if (uptodate)
+ set_bit(BH_Uptodate, &bh->b_state);
+
+ complete((struct completion*)bh->b_private);
+}
+
+static int sync_page_io(kdev_t dev, unsigned long sector, int size,
+ struct page *page, int rw)
+{
+ struct buffer_head bh;
+ struct completion event;
+
+ init_completion(&event);
+ init_buffer(&bh, bh_complete, &event);
+ bh.b_rdev = dev;
+ bh.b_rsector = sector;
+ bh.b_state = (1 << BH_Req) | (1 << BH_Mapped) | (1 << BH_Lock);
+ bh.b_size = size;
+ bh.b_page = page;
+ bh.b_reqnext = NULL;
+ bh.b_data = page_address(page);
+ generic_make_request(rw, &bh);
+
+ run_task_queue(&tq_disk);
+ wait_for_completion(&event);
+
+ return test_bit(BH_Uptodate, &bh.b_state);
+}
+
+static int read_disk_sb(mdk_rdev_t * rdev)
+{
+ int ret = -EINVAL;
+ kdev_t dev = rdev->dev;
+ unsigned long sb_offset;
+
+ if (!rdev->sb) {
+ MD_BUG();
+ goto abort;
+ }
+
+ /*
+ * Calculate the position of the superblock,
+ * it's at the end of the disk
+ */
+ sb_offset = calc_dev_sboffset(rdev->dev, rdev->mddev, 1);
+ rdev->sb_offset = sb_offset;
+
+ if (!sync_page_io(dev, sb_offset<<1, MD_SB_BYTES, rdev->sb_page, READ)) {
+ printk(NO_SB,partition_name(dev));
+ return -EINVAL;
+ }
+ printk(KERN_INFO " [events: %08lx]\n", (unsigned long)rdev->sb->events_lo);
+ ret = 0;
+abort:
+ return ret;
+}
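+
+/*
+ * Editorial note: calc_dev_sboffset() returns the offset in 1KB blocks
+ * (the units of blk_size[]), so the "sb_offset<<1" above converts it to
+ * the 512-byte sector number expected by sync_page_io().
+ */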
+
+static unsigned int calc_sb_csum(mdp_super_t * sb)
+{
+ unsigned int disk_csum, csum;
+
+ disk_csum = sb->sb_csum;
+ sb->sb_csum = 0;
+ csum = csum_partial((void *)sb, MD_SB_BYTES, 0);
+ sb->sb_csum = disk_csum;
+ return csum;
+}
+
+/*
+ * Check one RAID superblock for generic plausibility
+ */
+
+static int check_disk_sb(mdk_rdev_t * rdev)
+{
+ mdp_super_t *sb;
+ int ret = -EINVAL;
+
+ sb = rdev->sb;
+ if (!sb) {
+ MD_BUG();
+ goto abort;
+ }
+
+ if (sb->md_magic != MD_SB_MAGIC) {
+ printk(BAD_MAGIC, partition_name(rdev->dev));
+ goto abort;
+ }
+
+ if (sb->md_minor >= MAX_MD_DEVS) {
+ printk(BAD_MINOR, partition_name(rdev->dev), sb->md_minor);
+ goto abort;
+ }
+
+ if (calc_sb_csum(sb) != sb->sb_csum) {
+ printk(BAD_CSUM, partition_name(rdev->dev));
+ goto abort;
+ }
+ ret = 0;
+abort:
+ return ret;
+}
+
+static kdev_t dev_unit(kdev_t dev)
+{
+ unsigned int mask;
+ struct gendisk *hd = get_gendisk(dev);
+
+ if (!hd)
+ return 0;
+ mask = ~((1 << hd->minor_shift) - 1);
+
+ return MKDEV(MAJOR(dev), MINOR(dev) & mask);
+}
+
+static mdk_rdev_t * match_dev_unit(mddev_t *mddev, kdev_t dev)
+{
+ struct md_list_head *tmp;
+ mdk_rdev_t *rdev;
+
+ ITERATE_RDEV(mddev,rdev,tmp)
+ if (dev_unit(rdev->dev) == dev_unit(dev))
+ return rdev;
+
+ return NULL;
+}
+
+static int match_mddev_units(mddev_t *mddev1, mddev_t *mddev2)
+{
+ struct md_list_head *tmp;
+ mdk_rdev_t *rdev;
+
+ ITERATE_RDEV(mddev1,rdev,tmp)
+ if (match_dev_unit(mddev2, rdev->dev))
+ return 1;
+
+ return 0;
+}
+
+static MD_LIST_HEAD(all_raid_disks);
+static MD_LIST_HEAD(pending_raid_disks);
+
+static void bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev)
+{
+ mdk_rdev_t *same_pdev;
+
+ if (rdev->mddev) {
+ MD_BUG();
+ return;
+ }
+ same_pdev = match_dev_unit(mddev, rdev->dev);
+ if (same_pdev)
+ printk( KERN_WARNING
+"md%d: WARNING: %s appears to be on the same physical disk as %s. True\n"
+" protection against single-disk failure might be compromised.\n",
+ mdidx(mddev), partition_name(rdev->dev),
+ partition_name(same_pdev->dev));
+
+ md_list_add(&rdev->same_set, &mddev->disks);
+ rdev->mddev = mddev;
+ printk(KERN_INFO "md: bind<%s>\n", partition_name(rdev->dev));
+}
+
+static void unbind_rdev_from_array(mdk_rdev_t * rdev)
+{
+ if (!rdev->mddev) {
+ MD_BUG();
+ return;
+ }
+ list_del_init(&rdev->same_set);
+ printk(KERN_INFO "md: unbind<%s>\n", partition_name(rdev->dev));
+ rdev->mddev = NULL;
+}
+
+/*
+ * prevent the device from being mounted, repartitioned or
+ * otherwise reused by a RAID array (or any other kernel
+ * subsystem), by opening the device. [simply getting an
+ * inode is not enough, the SCSI module usage code needs
+ * an explicit open() on the device]
+ */
+static int lock_rdev(mdk_rdev_t *rdev)
+{
+ int err = 0;
+ struct block_device *bdev;
+
+ bdev = bdget(rdev->dev);
+ if (!bdev)
+ return -ENOMEM;
+ err = blkdev_get(bdev, FMODE_READ|FMODE_WRITE, 0, BDEV_RAW);
+ if (!err)
+ rdev->bdev = bdev;
+ return err;
+}
+
+static void unlock_rdev(mdk_rdev_t *rdev)
+{
+ struct block_device *bdev = rdev->bdev;
+ rdev->bdev = NULL;
+ if (!bdev)
+ MD_BUG();
+ blkdev_put(bdev, BDEV_RAW);
+}
+
+void md_autodetect_dev(kdev_t dev);
+
+static void export_rdev(mdk_rdev_t * rdev)
+{
+ printk(KERN_INFO "md: export_rdev(%s)\n",partition_name(rdev->dev));
+ if (rdev->mddev)
+ MD_BUG();
+ unlock_rdev(rdev);
+ free_disk_sb(rdev);
+ list_del_init(&rdev->all);
+ if (!list_empty(&rdev->pending)) {
+ printk(KERN_INFO "md: (%s was pending)\n",
+ partition_name(rdev->dev));
+ list_del_init(&rdev->pending);
+ }
+#ifndef MODULE
+ md_autodetect_dev(rdev->dev);
+#endif
+ rdev->dev = 0;
+ rdev->faulty = 0;
+ kfree(rdev);
+}
+
+static void kick_rdev_from_array(mdk_rdev_t * rdev)
+{
+ unbind_rdev_from_array(rdev);
+ export_rdev(rdev);
+}
+
+static void export_array(mddev_t *mddev)
+{
+ struct md_list_head *tmp;
+ mdk_rdev_t *rdev;
+ mdp_super_t *sb = mddev->sb;
+
+ if (mddev->sb) {
+ mddev->sb = NULL;
+ free_page((unsigned long) sb);
+ }
+
+ ITERATE_RDEV(mddev,rdev,tmp) {
+ if (!rdev->mddev) {
+ MD_BUG();
+ continue;
+ }
+ kick_rdev_from_array(rdev);
+ }
+ if (!list_empty(&mddev->disks))
+ MD_BUG();
+}
+
+static void free_mddev(mddev_t *mddev)
+{
+ if (!mddev) {
+ MD_BUG();
+ return;
+ }
+
+ export_array(mddev);
+ md_size[mdidx(mddev)] = 0;
+ md_hd_struct[mdidx(mddev)].nr_sects = 0;
+}
+
+#undef BAD_CSUM
+#undef BAD_MAGIC
+#undef OUT_OF_MEM
+#undef NO_SB
+
+static void print_desc(mdp_disk_t *desc)
+{
+ printk(" DISK<N:%d,%s(%d,%d),R:%d,S:%d>\n", desc->number,
+ partition_name(MKDEV(desc->major,desc->minor)),
+ desc->major,desc->minor,desc->raid_disk,desc->state);
+}
+
+static void print_sb(mdp_super_t *sb)
+{
+ int i;
+
+ printk(KERN_INFO "md: SB: (V:%d.%d.%d) ID:<%08x.%08x.%08x.%08x> CT:%08x\n",
+ sb->major_version, sb->minor_version, sb->patch_version,
+ sb->set_uuid0, sb->set_uuid1, sb->set_uuid2, sb->set_uuid3,
+ sb->ctime);
+ printk(KERN_INFO "md: L%d S%08d ND:%d RD:%d md%d LO:%d CS:%d\n", sb->level,
+ sb->size, sb->nr_disks, sb->raid_disks, sb->md_minor,
+ sb->layout, sb->chunk_size);
+ printk(KERN_INFO "md: UT:%08x ST:%d AD:%d WD:%d FD:%d SD:%d CSUM:%08x E:%08lx\n",
+ sb->utime, sb->state, sb->active_disks, sb->working_disks,
+ sb->failed_disks, sb->spare_disks,
+ sb->sb_csum, (unsigned long)sb->events_lo);
+
+ printk(KERN_INFO);
+ for (i = 0; i < MD_SB_DISKS; i++) {
+ mdp_disk_t *desc;
+
+ desc = sb->disks + i;
+ if (desc->number || desc->major || desc->minor ||
+ desc->raid_disk || (desc->state && (desc->state != 4))) {
+ printk(" D %2d: ", i);
+ print_desc(desc);
+ }
+ }
+ printk(KERN_INFO "md: THIS: ");
+ print_desc(&sb->this_disk);
+
+}
+
+static void print_rdev(mdk_rdev_t *rdev)
+{
+ printk(KERN_INFO "md: rdev %s: O:%s, SZ:%08ld F:%d DN:%d ",
+ partition_name(rdev->dev), partition_name(rdev->old_dev),
+ rdev->size, rdev->faulty, rdev->desc_nr);
+ if (rdev->sb) {
+ printk(KERN_INFO "md: rdev superblock:\n");
+ print_sb(rdev->sb);
+ } else
+ printk(KERN_INFO "md: no rdev superblock!\n");
+}
+
+void md_print_devices(void)
+{
+ struct md_list_head *tmp, *tmp2;
+ mdk_rdev_t *rdev;
+ mddev_t *mddev;
+
+ printk("\n");
+ printk("md: **********************************\n");
+ printk("md: * <COMPLETE RAID STATE PRINTOUT> *\n");
+ printk("md: **********************************\n");
+ ITERATE_MDDEV(mddev,tmp) if (mddev_lock(mddev)==0) {
+ printk("md%d: ", mdidx(mddev));
+
+ ITERATE_RDEV(mddev,rdev,tmp2)
+ printk("<%s>", partition_name(rdev->dev));
+
+ if (mddev->sb) {
+ printk(" array superblock:\n");
+ print_sb(mddev->sb);
+ } else
+ printk(" no array superblock.\n");
+
+ ITERATE_RDEV(mddev,rdev,tmp2)
+ print_rdev(rdev);
+ mddev_unlock(mddev);
+ }
+ printk("md: **********************************\n");
+ printk("\n");
+}
+
+static int sb_equal(mdp_super_t *sb1, mdp_super_t *sb2)
+{
+ int ret;
+ mdp_super_t *tmp1, *tmp2;
+
+ tmp1 = kmalloc(sizeof(*tmp1),GFP_KERNEL);
+ tmp2 = kmalloc(sizeof(*tmp2),GFP_KERNEL);
+
+ if (!tmp1 || !tmp2) {
+ ret = 0;
+ printk(KERN_INFO "md.c: sb1 is not equal to sb2!\n");
+ goto abort;
+ }
+
+ *tmp1 = *sb1;
+ *tmp2 = *sb2;
+
+ /*
+ * nr_disks is not constant
+ */
+ tmp1->nr_disks = 0;
+ tmp2->nr_disks = 0;
+
+ if (memcmp(tmp1, tmp2, MD_SB_GENERIC_CONSTANT_WORDS * 4))
+ ret = 0;
+ else
+ ret = 1;
+
+abort:
+ if (tmp1)
+ kfree(tmp1);
+ if (tmp2)
+ kfree(tmp2);
+
+ return ret;
+}
+
+static int uuid_equal(mdk_rdev_t *rdev1, mdk_rdev_t *rdev2)
+{
+ if ( (rdev1->sb->set_uuid0 == rdev2->sb->set_uuid0) &&
+ (rdev1->sb->set_uuid1 == rdev2->sb->set_uuid1) &&
+ (rdev1->sb->set_uuid2 == rdev2->sb->set_uuid2) &&
+ (rdev1->sb->set_uuid3 == rdev2->sb->set_uuid3))
+
+ return 1;
+
+ return 0;
+}
+
+static mdk_rdev_t * find_rdev_all(kdev_t dev)
+{
+ struct md_list_head *tmp;
+ mdk_rdev_t *rdev;
+
+ list_for_each(tmp, &all_raid_disks) {
+ rdev = md_list_entry(tmp, mdk_rdev_t, all);
+ if (rdev->dev == dev)
+ return rdev;
+ }
+ return NULL;
+}
+
+#define GETBLK_FAILED KERN_ERR \
+"md: getblk failed for device %s\n"
+
+static int write_disk_sb(mdk_rdev_t * rdev)
+{
+ kdev_t dev;
+ unsigned long sb_offset, size;
+
+ if (!rdev->sb) {
+ MD_BUG();
+ return 1;
+ }
+ if (rdev->faulty) {
+ MD_BUG();
+ return 1;
+ }
+ if (rdev->sb->md_magic != MD_SB_MAGIC) {
+ MD_BUG();
+ return 1;
+ }
+
+ dev = rdev->dev;
+ sb_offset = calc_dev_sboffset(dev, rdev->mddev, 1);
+ if (rdev->sb_offset != sb_offset) {
+ printk(KERN_INFO "%s's sb offset has changed from %ld to %ld, skipping\n",
+ partition_name(dev), rdev->sb_offset, sb_offset);
+ goto skip;
+ }
+ /*
+ * If the disk went offline meanwhile and it's just a spare, then
+ * its size has changed to zero silently, and the MD code does
+ * not yet know that it's faulty.
+ */
+ size = calc_dev_size(dev, rdev->mddev, 1);
+ if (size != rdev->size) {
+ printk(KERN_INFO "%s's size has changed from %ld to %ld since import, skipping\n",
+ partition_name(dev), rdev->size, size);
+ goto skip;
+ }
+
+ printk(KERN_INFO "(write) %s's sb offset: %ld\n", partition_name(dev), sb_offset);
+
+ if (!sync_page_io(dev, sb_offset<<1, MD_SB_BYTES, rdev->sb_page, WRITE)) {
+ printk("md: write_disk_sb failed for device %s\n", partition_name(dev));
+ return 1;
+ }
+skip:
+ return 0;
+}
+#undef GETBLK_FAILED
+
+static void set_this_disk(mddev_t *mddev, mdk_rdev_t *rdev)
+{
+ int i, ok = 0;
+ mdp_disk_t *desc;
+
+ for (i = 0; i < MD_SB_DISKS; i++) {
+ desc = mddev->sb->disks + i;
+#if 0
+ if (disk_faulty(desc)) {
+ if (MKDEV(desc->major,desc->minor) == rdev->dev)
+ ok = 1;
+ continue;
+ }
+#endif
+ if (MKDEV(desc->major,desc->minor) == rdev->dev) {
+ rdev->sb->this_disk = *desc;
+ rdev->desc_nr = desc->number;
+ ok = 1;
+ break;
+ }
+ }
+
+ if (!ok) {
+ MD_BUG();
+ }
+}
+
+static int sync_sbs(mddev_t * mddev)
+{
+ mdk_rdev_t *rdev;
+ mdp_super_t *sb;
+ struct md_list_head *tmp;
+
+ ITERATE_RDEV(mddev,rdev,tmp) {
+ if (rdev->faulty || rdev->alias_device)
+ continue;
+ sb = rdev->sb;
+ *sb = *mddev->sb;
+ set_this_disk(mddev, rdev);
+ sb->sb_csum = calc_sb_csum(sb);
+ }
+ return 0;
+}
+
+void __md_update_sb(mddev_t * mddev)
+{
+ int err, count = 100;
+ struct md_list_head *tmp;
+ mdk_rdev_t *rdev;
+
+ if (!mddev->sb_dirty) {
+ printk("hm, md_update_sb() called without ->sb_dirty == 1, from %p.\n", __builtin_return_address(0));
+		return;
+ }
+ mddev->sb_dirty = 0;
+repeat:
+ mddev->sb->utime = CURRENT_TIME;
+ if ((++mddev->sb->events_lo)==0)
+ ++mddev->sb->events_hi;
+
+ if ((mddev->sb->events_lo|mddev->sb->events_hi)==0) {
+ /*
+ * oops, this 64-bit counter should never wrap.
+ * Either we are in around ~1 trillion A.C., assuming
+ * 1 reboot per second, or we have a bug:
+ */
+ MD_BUG();
+ mddev->sb->events_lo = mddev->sb->events_hi = 0xffffffff;
+ }
+ sync_sbs(mddev);
+
+ /*
+ * do not write anything to disk if using
+ * nonpersistent superblocks
+ */
+ if (mddev->sb->not_persistent)
+ return;
+
+ printk(KERN_INFO "md: updating md%d RAID superblock on device\n",
+ mdidx(mddev));
+
+ err = 0;
+ ITERATE_RDEV(mddev,rdev,tmp) {
+ printk(KERN_INFO "md: ");
+ if (rdev->faulty)
+ printk("(skipping faulty ");
+ if (rdev->alias_device)
+ printk("(skipping alias ");
+ if (!rdev->faulty && disk_faulty(&rdev->sb->this_disk)) {
+ printk("(skipping new-faulty %s )\n",
+ partition_name(rdev->dev));
+ continue;
+ }
+ printk("%s ", partition_name(rdev->dev));
+ if (!rdev->faulty && !rdev->alias_device) {
+ printk("[events: %08lx]",
+ (unsigned long)rdev->sb->events_lo);
+ err += write_disk_sb(rdev);
+ } else
+ printk(")\n");
+ }
+ if (err) {
+ if (--count) {
+ printk(KERN_ERR "md: errors occurred during superblock update, repeating\n");
+ goto repeat;
+ }
+ printk(KERN_ERR "md: excessive errors occurred during superblock update, exiting\n");
+ }
+}
+
+void md_update_sb(mddev_t *mddev)
+{
+ if (mddev_lock(mddev))
+ return;
+ if (mddev->sb_dirty)
+ __md_update_sb(mddev);
+ mddev_unlock(mddev);
+}
+
+
+/*
+ * Import a device. If 'on_disk', then sanity check the superblock
+ *
+ * mark the device faulty if:
+ *
+ * - the device is nonexistent (zero size)
+ * - the device has no valid superblock
+ *
+ */
+static int md_import_device(kdev_t newdev, int on_disk)
+{
+ int err;
+ mdk_rdev_t *rdev;
+ unsigned int size;
+
+ if (find_rdev_all(newdev))
+ return -EEXIST;
+
+ rdev = (mdk_rdev_t *) kmalloc(sizeof(*rdev), GFP_KERNEL);
+ if (!rdev) {
+ printk(KERN_ERR "md: could not alloc mem for %s!\n", partition_name(newdev));
+ return -ENOMEM;
+ }
+ memset(rdev, 0, sizeof(*rdev));
+
+ if (is_mounted(newdev)) {
+ printk(KERN_WARNING "md: can not import %s, has active inodes!\n",
+ partition_name(newdev));
+ err = -EBUSY;
+ goto abort_free;
+ }
+
+ if ((err = alloc_disk_sb(rdev)))
+ goto abort_free;
+
+ rdev->dev = newdev;
+ if (lock_rdev(rdev)) {
+ printk(KERN_ERR "md: could not lock %s, zero-size? Marking faulty.\n",
+ partition_name(newdev));
+ err = -EINVAL;
+ goto abort_free;
+ }
+ rdev->desc_nr = -1;
+ rdev->faulty = 0;
+
+ size = 0;
+ if (blk_size[MAJOR(newdev)])
+ size = blk_size[MAJOR(newdev)][MINOR(newdev)];
+ if (!size) {
+ printk(KERN_WARNING "md: %s has zero size, marking faulty!\n",
+ partition_name(newdev));
+ err = -EINVAL;
+ goto abort_free;
+ }
+
+ if (on_disk) {
+ if ((err = read_disk_sb(rdev))) {
+ printk(KERN_WARNING "md: could not read %s's sb, not importing!\n",
+ partition_name(newdev));
+ goto abort_free;
+ }
+ if ((err = check_disk_sb(rdev))) {
+ printk(KERN_WARNING "md: %s has invalid sb, not importing!\n",
+ partition_name(newdev));
+ goto abort_free;
+ }
+
+ if (rdev->sb->level != -4) {
+ rdev->old_dev = MKDEV(rdev->sb->this_disk.major,
+ rdev->sb->this_disk.minor);
+ rdev->desc_nr = rdev->sb->this_disk.number;
+ } else {
+ rdev->old_dev = MKDEV(0, 0);
+ rdev->desc_nr = -1;
+ }
+ }
+ md_list_add(&rdev->all, &all_raid_disks);
+ MD_INIT_LIST_HEAD(&rdev->pending);
+ INIT_LIST_HEAD(&rdev->same_set);
+
+ return 0;
+
+abort_free:
+ if (rdev->sb) {
+ if (rdev->bdev)
+ unlock_rdev(rdev);
+ free_disk_sb(rdev);
+ }
+ kfree(rdev);
+ return err;
+}
+
+/*
+ * Check a full RAID array for plausibility
+ */
+
+#define INCONSISTENT KERN_ERR \
+"md: fatal superblock inconsistency in %s -- removing from array\n"
+
+#define OUT_OF_DATE KERN_ERR \
+"md: superblock update time inconsistency -- using the most recent one\n"
+
+#define OLD_VERSION KERN_ALERT \
+"md: md%d: unsupported raid array version %d.%d.%d\n"
+
+#define NOT_CLEAN_IGNORE KERN_ERR \
+"md: md%d: raid array is not clean -- starting background reconstruction\n"
+
+#define UNKNOWN_LEVEL KERN_ERR \
+"md: md%d: unsupported raid level %d\n"
+
+static int analyze_sbs(mddev_t * mddev)
+{
+ int out_of_date = 0, i, first;
+ struct md_list_head *tmp, *tmp2;
+ mdk_rdev_t *rdev, *rdev2, *freshest;
+ mdp_super_t *sb;
+
+ /*
+ * Verify the RAID superblock on each real device
+ */
+ ITERATE_RDEV(mddev,rdev,tmp) {
+ if (rdev->faulty) {
+ MD_BUG();
+ goto abort;
+ }
+ if (!rdev->sb) {
+ MD_BUG();
+ goto abort;
+ }
+ if (check_disk_sb(rdev))
+ goto abort;
+ }
+
+ /*
+ * The superblock constant part has to be the same
+ * for all disks in the array.
+ */
+ sb = NULL;
+
+ ITERATE_RDEV(mddev,rdev,tmp) {
+ if (!sb) {
+ sb = rdev->sb;
+ continue;
+ }
+ if (!sb_equal(sb, rdev->sb)) {
+ printk(INCONSISTENT, partition_name(rdev->dev));
+ kick_rdev_from_array(rdev);
+ continue;
+ }
+ }
+
+ /*
+ * OK, we have all disks and the array is ready to run. Let's
+ * find the freshest superblock, that one will be the superblock
+ * that represents the whole array.
+ */
+ if (!mddev->sb)
+ if (alloc_array_sb(mddev))
+ goto abort;
+ sb = mddev->sb;
+ freshest = NULL;
+
+ ITERATE_RDEV(mddev,rdev,tmp) {
+ __u64 ev1, ev2;
+ /*
+ * if the checksum is invalid, use the superblock
+ * only as a last resort. (decrease its age by
+ * one event)
+ */
+ if (calc_sb_csum(rdev->sb) != rdev->sb->sb_csum) {
+ if (rdev->sb->events_lo || rdev->sb->events_hi)
+ if ((rdev->sb->events_lo--)==0)
+ rdev->sb->events_hi--;
+ }
+
+ printk(KERN_INFO "md: %s's event counter: %08lx\n",
+ partition_name(rdev->dev),
+ (unsigned long)rdev->sb->events_lo);
+ if (!freshest) {
+ freshest = rdev;
+ continue;
+ }
+ /*
+ * Find the newest superblock version
+ */
+ ev1 = md_event(rdev->sb);
+ ev2 = md_event(freshest->sb);
+ if (ev1 != ev2) {
+ out_of_date = 1;
+ if (ev1 > ev2)
+ freshest = rdev;
+ }
+ }
+ if (out_of_date) {
+ printk(OUT_OF_DATE);
+ printk(KERN_INFO "md: freshest: %s\n", partition_name(freshest->dev));
+ }
+ memcpy (sb, freshest->sb, sizeof(*sb));
+
+ /*
+ * at this point we have picked the 'best' superblock
+ * from all available superblocks.
+ * now we validate this superblock and kick out possibly
+ * failed disks.
+ */
+ ITERATE_RDEV(mddev,rdev,tmp) {
+ /*
+ * Kick all non-fresh devices
+ */
+ __u64 ev1, ev2;
+ ev1 = md_event(rdev->sb);
+ ev2 = md_event(sb);
+ ++ev1;
+ if (ev1 < ev2) {
+ printk(KERN_WARNING "md: kicking non-fresh %s from array!\n",
+ partition_name(rdev->dev));
+ kick_rdev_from_array(rdev);
+ continue;
+ }
+ }
+
+ /*
+ * Fix up changed device names ... but only if this disk has a
+ * recent update time. Use faulty checksum ones too.
+ */
+ if (mddev->sb->level != -4)
+ ITERATE_RDEV(mddev,rdev,tmp) {
+ __u64 ev1, ev2, ev3;
+ if (rdev->faulty || rdev->alias_device) {
+ MD_BUG();
+ goto abort;
+ }
+ ev1 = md_event(rdev->sb);
+ ev2 = md_event(sb);
+ ev3 = ev2;
+ --ev3;
+ if ((rdev->dev != rdev->old_dev) &&
+ ((ev1 == ev2) || (ev1 == ev3))) {
+ mdp_disk_t *desc;
+
+ printk(KERN_WARNING "md: device name has changed from %s to %s since last import!\n",
+ partition_name(rdev->old_dev), partition_name(rdev->dev));
+ if (rdev->desc_nr == -1) {
+ MD_BUG();
+ goto abort;
+ }
+ desc = &sb->disks[rdev->desc_nr];
+ if (rdev->old_dev != MKDEV(desc->major, desc->minor)) {
+ MD_BUG();
+ goto abort;
+ }
+ desc->major = MAJOR(rdev->dev);
+ desc->minor = MINOR(rdev->dev);
+ desc = &rdev->sb->this_disk;
+ desc->major = MAJOR(rdev->dev);
+ desc->minor = MINOR(rdev->dev);
+ }
+ }
+
+ /*
+ * Remove unavailable and faulty devices ...
+ *
+ * note that if an array becomes completely unrunnable due to
+ * missing devices, we do not write the superblock back, so the
+ * administrator has a chance to fix things up. The removal thus
+ * only happens if it's nonfatal to the contents of the array.
+ */
+ for (i = 0; i < MD_SB_DISKS; i++) {
+ int found;
+ mdp_disk_t *desc;
+ kdev_t dev;
+
+ desc = sb->disks + i;
+ dev = MKDEV(desc->major, desc->minor);
+
+ /*
+ * We kick faulty devices/descriptors immediately.
+ *
+ * Note: multipath devices are a special case. Since we
+ * were able to read the superblock on the path, we don't
+ * care if it was previously marked as faulty, it's up now
+ * so enable it.
+ */
+ if (disk_faulty(desc) && mddev->sb->level != -4) {
+ found = 0;
+ ITERATE_RDEV(mddev,rdev,tmp) {
+ if (rdev->desc_nr != desc->number)
+ continue;
+ printk(KERN_WARNING "md%d: kicking faulty %s!\n",
+ mdidx(mddev),partition_name(rdev->dev));
+ kick_rdev_from_array(rdev);
+ found = 1;
+ break;
+ }
+ if (!found) {
+ if (dev == MKDEV(0,0))
+ continue;
+ printk(KERN_WARNING "md%d: removing former faulty %s!\n",
+ mdidx(mddev), partition_name(dev));
+ }
+ remove_descriptor(desc, sb);
+ continue;
+ } else if (disk_faulty(desc)) {
+ /*
+ * multipath entry marked as faulty, unfaulty it
+ */
+ rdev = find_rdev(mddev, dev);
+ if(rdev)
+ mark_disk_spare(desc);
+ else
+ remove_descriptor(desc, sb);
+ }
+
+ if (dev == MKDEV(0,0))
+ continue;
+ /*
+ * Is this device present in the rdev ring?
+ */
+ found = 0;
+ ITERATE_RDEV(mddev,rdev,tmp) {
+ /*
+ * Multi-path IO special-case: since we have no
+ * this_disk descriptor at auto-detect time,
+ * we cannot check rdev->number.
+ * We can check the device though.
+ */
+ if ((sb->level == -4) && (rdev->dev ==
+ MKDEV(desc->major,desc->minor))) {
+ found = 1;
+ break;
+ }
+ if (rdev->desc_nr == desc->number) {
+ found = 1;
+ break;
+ }
+ }
+ if (found)
+ continue;
+
+ printk(KERN_WARNING "md%d: former device %s is unavailable, removing from array!\n",
+ mdidx(mddev), partition_name(dev));
+ remove_descriptor(desc, sb);
+ }
+
+ /*
+ * Double check whether all devices mentioned in the
+ * superblock are in the rdev ring.
+ */
+ first = 1;
+ for (i = 0; i < MD_SB_DISKS; i++) {
+ mdp_disk_t *desc;
+ kdev_t dev;
+
+ desc = sb->disks + i;
+ dev = MKDEV(desc->major, desc->minor);
+
+ if (dev == MKDEV(0,0))
+ continue;
+
+ if (disk_faulty(desc)) {
+ MD_BUG();
+ goto abort;
+ }
+
+ rdev = find_rdev(mddev, dev);
+ if (!rdev) {
+ MD_BUG();
+ goto abort;
+ }
+ /*
+ * In the case of Multipath-IO, we have no
+ * other information source to find out which
+ * disk is which, only the position of the device
+ * in the superblock:
+ */
+ if (mddev->sb->level == -4) {
+ if ((rdev->desc_nr != -1) && (rdev->desc_nr != i)) {
+ MD_BUG();
+ goto abort;
+ }
+ rdev->desc_nr = i;
+ if (!first)
+ rdev->alias_device = 1;
+ else
+ first = 0;
+ }
+ }
+
+ /*
+ * Kick all rdevs that are not in the
+ * descriptor array:
+ */
+ ITERATE_RDEV(mddev,rdev,tmp) {
+ if (rdev->desc_nr == -1)
+ kick_rdev_from_array(rdev);
+ }
+
+ /*
+ * Do a final reality check.
+ */
+ if (mddev->sb->level != -4) {
+ ITERATE_RDEV(mddev,rdev,tmp) {
+ if (rdev->desc_nr == -1) {
+ MD_BUG();
+ goto abort;
+ }
+ /*
+ * is the desc_nr unique?
+ */
+ ITERATE_RDEV(mddev,rdev2,tmp2) {
+ if ((rdev2 != rdev) &&
+ (rdev2->desc_nr == rdev->desc_nr)) {
+ MD_BUG();
+ goto abort;
+ }
+ }
+ /*
+ * is the device unique?
+ */
+ ITERATE_RDEV(mddev,rdev2,tmp2) {
+ if ((rdev2 != rdev) &&
+ (rdev2->dev == rdev->dev)) {
+ MD_BUG();
+ goto abort;
+ }
+ }
+ }
+ }
+
+ /*
+ * Check if we can support this RAID array
+ */
+ if (sb->major_version != MD_MAJOR_VERSION ||
+ sb->minor_version > MD_MINOR_VERSION) {
+
+ printk(OLD_VERSION, mdidx(mddev), sb->major_version,
+ sb->minor_version, sb->patch_version);
+ goto abort;
+ }
+
+ if ((sb->state != (1 << MD_SB_CLEAN)) && ((sb->level == 1) ||
+ (sb->level == 4) || (sb->level == 5)))
+ printk(NOT_CLEAN_IGNORE, mdidx(mddev));
+
+ return 0;
+abort:
+ return 1;
+}
+
+#undef INCONSISTENT
+#undef OUT_OF_DATE
+#undef OLD_VERSION
+#undef NOT_CLEAN_IGNORE
+
+static int device_size_calculation(mddev_t * mddev)
+{
+ int data_disks = 0, persistent;
+ unsigned int readahead;
+ mdp_super_t *sb = mddev->sb;
+ struct md_list_head *tmp;
+ mdk_rdev_t *rdev;
+
+ /*
+ * Do device size calculation. Bail out if too small.
+ * (we have to do this after having validated chunk_size,
+ * because device size has to be modulo chunk_size)
+ */
+ persistent = !mddev->sb->not_persistent;
+ ITERATE_RDEV(mddev,rdev,tmp) {
+ if (rdev->faulty)
+ continue;
+ if (rdev->size) {
+ MD_BUG();
+ continue;
+ }
+ rdev->size = calc_dev_size(rdev->dev, mddev, persistent);
+ if (rdev->size < sb->chunk_size / 1024) {
+ printk(KERN_WARNING
+ "md: Dev %s smaller than chunk_size: %ldk < %dk\n",
+ partition_name(rdev->dev),
+ rdev->size, sb->chunk_size / 1024);
+ return -EINVAL;
+ }
+ }
+
+ switch (sb->level) {
+ case -4:
+ data_disks = 1;
+ break;
+ case -3:
+ data_disks = 1;
+ break;
+ case -2:
+ data_disks = 1;
+ break;
+ case -1:
+ zoned_raid_size(mddev);
+ data_disks = 1;
+ break;
+ case 0:
+ zoned_raid_size(mddev);
+ data_disks = sb->raid_disks;
+ break;
+ case 1:
+ data_disks = 1;
+ break;
+ case 4:
+ case 5:
+ data_disks = sb->raid_disks-1;
+ break;
+ default:
+ printk(UNKNOWN_LEVEL, mdidx(mddev), sb->level);
+ goto abort;
+ }
+ if (!md_size[mdidx(mddev)])
+ md_size[mdidx(mddev)] = sb->size * data_disks;
+
+ readahead = MD_READAHEAD;
+ if ((sb->level == 0) || (sb->level == 4) || (sb->level == 5)) {
+ readahead = (mddev->sb->chunk_size>>PAGE_SHIFT) * 4 * data_disks;
+ if (readahead < data_disks * (MAX_SECTORS>>(PAGE_SHIFT-9))*2)
+ readahead = data_disks * (MAX_SECTORS>>(PAGE_SHIFT-9))*2;
+ } else {
+ // (no multipath branch - it uses the default setting)
+ if (sb->level == -3)
+ readahead = 0;
+ }
+
+ printk(KERN_INFO "md%d: max total readahead window set to %ldk\n",
+ mdidx(mddev), readahead*(PAGE_SIZE/1024));
+
+ printk(KERN_INFO
+ "md%d: %d data-disks, max readahead per data-disk: %ldk\n",
+ mdidx(mddev), data_disks, readahead/data_disks*(PAGE_SIZE/1024));
+ return 0;
+abort:
+ return 1;
+}
+
+
+#define TOO_BIG_CHUNKSIZE KERN_ERR \
+"too big chunk_size: %d > %d\n"
+
+#define TOO_SMALL_CHUNKSIZE KERN_ERR \
+"too small chunk_size: %d < %ld\n"
+
+#define BAD_CHUNKSIZE KERN_ERR \
+"no chunksize specified, see 'man raidtab'\n"
+
+static int do_md_run(mddev_t * mddev)
+{
+ int pnum, err;
+ int chunk_size;
+ struct md_list_head *tmp;
+ mdk_rdev_t *rdev;
+
+
+ if (list_empty(&mddev->disks)) {
+ MD_BUG();
+ return -EINVAL;
+ }
+
+ if (mddev->pers)
+ return -EBUSY;
+
+ /*
+ * Resize disks to align partitions size on a given
+ * chunk size.
+ */
+ md_size[mdidx(mddev)] = 0;
+
+ /*
+ * Analyze all RAID superblock(s)
+ */
+ if (analyze_sbs(mddev)) {
+ MD_BUG();
+ return -EINVAL;
+ }
+
+ chunk_size = mddev->sb->chunk_size;
+ pnum = level_to_pers(mddev->sb->level);
+
+ if ((pnum != MULTIPATH) && (pnum != RAID1)) {
+ if (!chunk_size) {
+ /*
+ * 'default chunksize' in the old md code used to
+ * be PAGE_SIZE, baaad.
+ * we abort here to be on the safe side. We don't
+ * want to continue the bad practice.
+ */
+ printk(BAD_CHUNKSIZE);
+ return -EINVAL;
+ }
+ if (chunk_size > MAX_CHUNK_SIZE) {
+ printk(TOO_BIG_CHUNKSIZE, chunk_size, MAX_CHUNK_SIZE);
+ return -EINVAL;
+ }
+ /*
+ * chunk-size has to be a power of 2 and multiples of PAGE_SIZE
+ */
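+ /* ffz(~chunk_size) is the index of the lowest set bit, so this check rejects any value with more than one bit set */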
+ if ( (1 << ffz(~chunk_size)) != chunk_size) {
+ MD_BUG();
+ return -EINVAL;
+ }
+ if (chunk_size < PAGE_SIZE) {
+ printk(TOO_SMALL_CHUNKSIZE, chunk_size, PAGE_SIZE);
+ return -EINVAL;
+ }
+ } else
+ if (chunk_size)
+ printk(KERN_INFO "md: RAID level %d does not need chunksize! Continuing anyway.\n",
+ mddev->sb->level);
+
+ if (pnum >= MAX_PERSONALITY) {
+ MD_BUG();
+ return -EINVAL;
+ }
+
+ if (!pers[pnum])
+ {
+#ifdef CONFIG_KMOD
+ char module_name[80];
+ sprintf (module_name, "md-personality-%d", pnum);
+ request_module (module_name);
+ if (!pers[pnum])
+#endif
+ {
+ printk(KERN_ERR "md: personality %d is not loaded!\n",
+ pnum);
+ return -EINVAL;
+ }
+ }
+
+ if (device_size_calculation(mddev))
+ return -EINVAL;
+
+ /*
+ * Drop all container device buffers, from now on
+ * the only valid external interface is through the md
+ * device.
+ * Also find largest hardsector size
+ */
+ md_hardsect_sizes[mdidx(mddev)] = 512;
+ ITERATE_RDEV(mddev,rdev,tmp) {
+ if (rdev->faulty)
+ continue;
+ invalidate_device(rdev->dev, 1);
+ if (get_hardsect_size(rdev->dev)
+ > md_hardsect_sizes[mdidx(mddev)])
+ md_hardsect_sizes[mdidx(mddev)] =
+ get_hardsect_size(rdev->dev);
+ }
+ md_blocksizes[mdidx(mddev)] = 1024;
+ if (md_blocksizes[mdidx(mddev)] < md_hardsect_sizes[mdidx(mddev)])
+ md_blocksizes[mdidx(mddev)] = md_hardsect_sizes[mdidx(mddev)];
+ mddev->pers = pers[pnum];
+
+ blk_queue_make_request(&mddev->queue, mddev->pers->make_request);
+ mddev->queue.queuedata = mddev;
+
+ err = mddev->pers->run(mddev);
+ if (err) {
+ printk(KERN_ERR "md: pers->run() failed ...\n");
+ mddev->pers = NULL;
+ return -EINVAL;
+ }
+
+ mddev->in_sync = (mddev->sb->state & (1<<MD_SB_CLEAN));
+ /* if personality doesn't have "sync_request", then
+ * a dirty array doesn't mean anything
+ */
+ if (mddev->pers->sync_request)
+ mddev->sb->state &= ~(1 << MD_SB_CLEAN);
+ mddev->sb_dirty = 1;
+ __md_update_sb(mddev);
+
+ md_recover_arrays();
+ /*
+ * md_size has units of 1K blocks, which are
+ * twice as large as sectors.
+ */
+ md_hd_struct[mdidx(mddev)].start_sect = 0;
+ register_disk(&md_gendisk, MKDEV(MAJOR_NR,mdidx(mddev)),
+ 1, &md_fops, md_size[mdidx(mddev)]<<1);
+
+ read_ahead[MD_MAJOR] = 1024;
+ return (0);
+}
+
+#undef TOO_BIG_CHUNKSIZE
+#undef BAD_CHUNKSIZE
+
+static int restart_array(mddev_t *mddev)
+{
+ int err;
+
+ /*
+ * Complain if it has no devices
+ */
+ err = -ENXIO;
+ if (list_empty(&mddev->disks))
+ goto out;
+
+ if (mddev->pers) {
+ err = -EBUSY;
+ if (!mddev->ro)
+ goto out;
+
+ mddev->ro = 0;
+ set_device_ro(mddev_to_kdev(mddev), 0);
+
+ printk(KERN_INFO
+ "md: md%d switched to read-write mode.\n", mdidx(mddev));
+ /*
+ * Kick recovery or resync if necessary
+ */
+ md_recover_arrays();
+ err = 0;
+ } else {
+ printk(KERN_ERR "md: md%d has no personality assigned.\n",
+ mdidx(mddev));
+ err = -EINVAL;
+ }
+
+out:
+ return err;
+}
+
+#define STILL_MOUNTED KERN_WARNING \
+"md: md%d still mounted.\n"
+#define STILL_IN_USE \
+"md: md%d still in use.\n"
+
+static int do_md_stop(mddev_t * mddev, int ro)
+{
+ int err = 0;
+ kdev_t dev = mddev_to_kdev(mddev);
+
+ if (atomic_read(&mddev->active)>1) {
+ printk(STILL_IN_USE, mdidx(mddev));
+ err = -EBUSY;
+ goto out;
+ }
+
+ if (mddev->pers) {
+ if (mddev->sync_thread) {
+ if (mddev->recovery_running > 0)
+ mddev->recovery_running = -EINTR;
+ md_unregister_thread(mddev->sync_thread);
+ mddev->sync_thread = NULL;
+ if (mddev->spare) {
+ mddev->pers->diskop(mddev, &mddev->spare,
+ DISKOP_SPARE_INACTIVE);
+ mddev->spare = NULL;
+ }
+ }
+
+ invalidate_device(dev, 1);
+
+ if (ro) {
+ err = -ENXIO;
+ if (mddev->ro)
+ goto out;
+ mddev->ro = 1;
+ } else {
+ if (mddev->ro)
+ set_device_ro(dev, 0);
+ if (mddev->pers->stop(mddev)) {
+ err = -EBUSY;
+ if (mddev->ro)
+ set_device_ro(dev, 1);
+ goto out;
+ }
+ if (mddev->ro)
+ mddev->ro = 0;
+ }
+ if (mddev->sb) {
+ /*
+ * mark it clean only if there was no resync
+ * interrupted.
+ */
+ if (mddev->in_sync) {
+ printk(KERN_INFO "md: marking sb clean...\n");
+ mddev->sb->state |= 1 << MD_SB_CLEAN;
+ }
+ mddev->sb_dirty = 1;
+ __md_update_sb(mddev);
+ }
+ if (ro)
+ set_device_ro(dev, 1);
+ }
+
+ /*
+ * Free resources if final stop
+ */
+ if (!ro) {
+ printk(KERN_INFO "md: md%d stopped.\n", mdidx(mddev));
+ free_mddev(mddev);
+ } else
+ printk(KERN_INFO "md: md%d switched to read-only mode.\n", mdidx(mddev));
+ err = 0;
+out:
+ return err;
+}
+
+/*
+ * We have to safely support old arrays too.
+ */
+int detect_old_array(mdp_super_t *sb)
+{
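+ /* autostart only supports v0.90 or newer superblocks; anything older is rejected */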
+ if (sb->major_version > 0)
+ return 0;
+ if (sb->minor_version >= 90)
+ return 0;
+
+ return -EINVAL;
+}
+
+
+static void autorun_array(mddev_t *mddev)
+{
+ mdk_rdev_t *rdev;
+ struct md_list_head *tmp;
+ int err;
+
+ if (list_empty(&mddev->disks)) {
+ MD_BUG();
+ return;
+ }
+
+ printk(KERN_INFO "md: running: ");
+
+ ITERATE_RDEV(mddev,rdev,tmp) {
+ printk("<%s>", partition_name(rdev->dev));
+ }
+ printk("\n");
+
+ err = do_md_run (mddev);
+ if (err) {
+ printk(KERN_WARNING "md :do_md_run() returned %d\n", err);
+ /*
+ * prevent the writeback of an unrunnable array
+ */
+ mddev->sb_dirty = 0;
+ do_md_stop (mddev, 0);
+ }
+}
+
+/*
+ * let's try to run arrays based on all disks that have arrived
+ * until now. (those are in the ->pending list)
+ *
+ * the method: pick the first pending disk, collect all disks with
+ * the same UUID, remove all from the pending list and put them into
+ * the 'same_array' list. Then order this list based on superblock
+ * update time (freshest comes first), kick out 'old' disks and
+ * compare superblocks. If everything's fine then run it.
+ *
+ * If "unit" is allocated, then bump its reference count
+ */
+static void autorun_devices(void)
+{
+ struct md_list_head candidates;
+ struct md_list_head *tmp;
+ mdk_rdev_t *rdev0, *rdev;
+ mddev_t *mddev;
+
+ printk(KERN_INFO "md: autorun ...\n");
+ while (!list_empty(&pending_raid_disks)) {
+ rdev0 = md_list_entry(pending_raid_disks.next,
+ mdk_rdev_t, pending);
+
+ printk(KERN_INFO "md: considering %s ...\n", partition_name(rdev0->dev));
+ MD_INIT_LIST_HEAD(&candidates);
+ ITERATE_RDEV_PENDING(rdev,tmp) {
+ if (uuid_equal(rdev0, rdev)) {
+ if (!sb_equal(rdev0->sb, rdev->sb)) {
+ printk(KERN_WARNING
+ "md: %s has same UUID as %s, but superblocks differ ...\n",
+ partition_name(rdev->dev), partition_name(rdev0->dev));
+ continue;
+ }
+ printk(KERN_INFO "md: adding %s ...\n", partition_name(rdev->dev));
+ md_list_del(&rdev->pending);
+ md_list_add(&rdev->pending, &candidates);
+ }
+ }
+ /*
+ * now we have a set of devices, with all of them having
+ * mostly sane superblocks. It's time to allocate the
+ * mddev.
+ */
+
+ mddev = mddev_find(rdev0->sb->md_minor);
+ if (!mddev) {
+ printk(KERN_ERR "md: cannot allocate memory for md drive.\n");
+ break;
+ }
+ if (mddev_lock(mddev))
+ printk(KERN_WARNING "md: md%d locked, cannot run\n",
+ mdidx(mddev));
+ else if (mddev->sb || !list_empty(&mddev->disks)) {
+ printk(KERN_WARNING "md: md%d already running, cannot run %s\n",
+ mdidx(mddev), partition_name(rdev0->dev));
+ mddev_unlock(mddev);
+ } else {
+ printk(KERN_INFO "md: created md%d\n", mdidx(mddev));
+ ITERATE_RDEV_GENERIC(candidates,pending,rdev,tmp) {
+ bind_rdev_to_array(rdev, mddev);
+ list_del_init(&rdev->pending);
+ }
+ autorun_array(mddev);
+ mddev_unlock(mddev);
+ }
+ /* on success, candidates will be empty, on error
+ * it won't...
+ */
+ ITERATE_RDEV_GENERIC(candidates,pending,rdev,tmp)
+ export_rdev(rdev);
+ mddev_put(mddev);
+ }
+ printk(KERN_INFO "md: ... autorun DONE.\n");
+}
+
+/*
+ * import RAID devices based on one partition
+ * if possible, the array gets run as well.
+ */
+
+#define BAD_VERSION KERN_ERR \
+"md: %s has RAID superblock version 0.%d, autodetect needs v0.90 or higher\n"
+
+#define OUT_OF_MEM KERN_ALERT \
+"md: out of memory.\n"
+
+#define NO_DEVICE KERN_ERR \
+"md: disabled device %s\n"
+
+#define AUTOADD_FAILED KERN_ERR \
+"md: auto-adding devices to md%d FAILED (error %d).\n"
+
+#define AUTOADD_FAILED_USED KERN_ERR \
+"md: cannot auto-add device %s to md%d, already used.\n"
+
+#define AUTORUN_FAILED KERN_ERR \
+"md: auto-running md%d FAILED (error %d).\n"
+
+#define MDDEV_BUSY KERN_ERR \
+"md: cannot auto-add to md%d, already running.\n"
+
+#define AUTOADDING KERN_INFO \
+"md: auto-adding devices to md%d, based on %s's superblock.\n"
+
+#define AUTORUNNING KERN_INFO \
+"md: auto-running md%d.\n"
+
+static int autostart_array(kdev_t startdev)
+{
+ int err = -EINVAL, i;
+ mdp_super_t *sb = NULL;
+ mdk_rdev_t *start_rdev = NULL, *rdev;
+
+ if (md_import_device(startdev, 1)) {
+ printk(KERN_WARNING "md: could not import %s!\n", partition_name(startdev));
+ goto abort;
+ }
+
+ start_rdev = find_rdev_all(startdev);
+ if (!start_rdev) {
+ MD_BUG();
+ goto abort;
+ }
+ if (start_rdev->faulty) {
+ printk(KERN_WARNING "md: can not autostart based on faulty %s!\n",
+ partition_name(startdev));
+ goto abort;
+ }
+ md_list_add(&start_rdev->pending, &pending_raid_disks);
+
+ sb = start_rdev->sb;
+
+ err = detect_old_array(sb);
+ if (err) {
+ printk(KERN_WARNING "md: array version is too old to be autostarted ,"
+ "use raidtools 0.90 mkraid --upgrade to upgrade the array "
+ "without data loss!\n");
+ goto abort;
+ }
+
+ for (i = 0; i < MD_SB_DISKS; i++) {
+ mdp_disk_t *desc;
+ kdev_t dev;
+
+ desc = sb->disks + i;
+ dev = MKDEV(desc->major, desc->minor);
+
+ if (dev == MKDEV(0,0))
+ continue;
+ if (dev == startdev)
+ continue;
+ if (md_import_device(dev, 1)) {
+ printk(KERN_WARNING "md: could not import %s, trying to run array nevertheless.\n",
+ partition_name(dev));
+ continue;
+ }
+ rdev = find_rdev_all(dev);
+ if (!rdev) {
+ MD_BUG();
+ goto abort;
+ }
+ md_list_add(&rdev->pending, &pending_raid_disks);
+ }
+
+ /*
+ * possibly return codes
+ */
+ autorun_devices();
+ return 0;
+
+abort:
+ if (start_rdev)
+ export_rdev(start_rdev);
+ return err;
+}
+
+#undef BAD_VERSION
+#undef OUT_OF_MEM
+#undef NO_DEVICE
+#undef AUTOADD_FAILED_USED
+#undef AUTOADD_FAILED
+#undef AUTORUN_FAILED
+#undef AUTOADDING
+#undef AUTORUNNING
+
+
+static int get_version(void * arg)
+{
+ mdu_version_t ver;
+
+ ver.major = MD_MAJOR_VERSION;
+ ver.minor = MD_MINOR_VERSION;
+ ver.patchlevel = MD_PATCHLEVEL_VERSION;
+
+ if (md_copy_to_user(arg, &ver, sizeof(ver)))
+ return -EFAULT;
+
+ return 0;
+}
+
+#define SET_FROM_SB(x) info.x = mddev->sb->x
+static int get_array_info(mddev_t * mddev, void * arg)
+{
+ mdu_array_info_t info;
+
+ if (!mddev->sb) {
+ MD_BUG();
+ return -EINVAL;
+ }
+
+ SET_FROM_SB(major_version);
+ SET_FROM_SB(minor_version);
+ SET_FROM_SB(patch_version);
+ SET_FROM_SB(ctime);
+ SET_FROM_SB(level);
+ SET_FROM_SB(size);
+ SET_FROM_SB(nr_disks);
+ SET_FROM_SB(raid_disks);
+ SET_FROM_SB(md_minor);
+ SET_FROM_SB(not_persistent);
+
+ SET_FROM_SB(utime);
+ SET_FROM_SB(state);
+ SET_FROM_SB(active_disks);
+ SET_FROM_SB(working_disks);
+ SET_FROM_SB(failed_disks);
+ SET_FROM_SB(spare_disks);
+
+ SET_FROM_SB(layout);
+ SET_FROM_SB(chunk_size);
+
+ if (md_copy_to_user(arg, &info, sizeof(info)))
+ return -EFAULT;
+
+ return 0;
+}
+#undef SET_FROM_SB
+
+#define SET_FROM_SB(x) info.x = mddev->sb->disks[nr].x
+static int get_disk_info(mddev_t * mddev, void * arg)
+{
+ mdu_disk_info_t info;
+ unsigned int nr;
+
+ if (!mddev->sb)
+ return -EINVAL;
+
+ if (md_copy_from_user(&info, arg, sizeof(info)))
+ return -EFAULT;
+
+ nr = info.number;
+ if (nr >= MD_SB_DISKS)
+ return -EINVAL;
+
+ SET_FROM_SB(major);
+ SET_FROM_SB(minor);
+ SET_FROM_SB(raid_disk);
+ SET_FROM_SB(state);
+
+ if (md_copy_to_user(arg, &info, sizeof(info)))
+ return -EFAULT;
+
+ return 0;
+}
+#undef SET_FROM_SB
+
+#define SET_SB(x) mddev->sb->disks[nr].x = info->x
+
+static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info)
+{
+ int err, size, persistent;
+ mdk_rdev_t *rdev;
+ unsigned int nr;
+ kdev_t dev;
+ dev = MKDEV(info->major,info->minor);
+
+ if (find_rdev_all(dev)) {
+ printk(KERN_WARNING "md: device %s already used in a RAID array!\n",
+ partition_name(dev));
+ return -EBUSY;
+ }
+ if (!mddev->sb) {
+ /* expecting a device which has a superblock */
+ err = md_import_device(dev, 1);
+ if (err) {
+ printk(KERN_WARNING "md: md_import_device returned %d\n", err);
+ return -EINVAL;
+ }
+ rdev = find_rdev_all(dev);
+ if (!rdev) {
+ MD_BUG();
+ return -EINVAL;
+ }
+ if (!list_empty(&mddev->disks)) {
+ mdk_rdev_t *rdev0 = md_list_entry(mddev->disks.next,
+ mdk_rdev_t, same_set);
+ if (!uuid_equal(rdev0, rdev)) {
+ printk(KERN_WARNING "md: %s has different UUID to %s\n",
+ partition_name(rdev->dev), partition_name(rdev0->dev));
+ export_rdev(rdev);
+ return -EINVAL;
+ }
+ if (!sb_equal(rdev0->sb, rdev->sb)) {
+ printk(KERN_WARNING "md: %s has same UUID but different superblock to %s\n",
+ partition_name(rdev->dev), partition_name(rdev0->dev));
+ export_rdev(rdev);
+ return -EINVAL;
+ }
+ }
+ bind_rdev_to_array(rdev, mddev);
+ return 0;
+ }
+
+ nr = info->number;
+ if (nr >= mddev->sb->nr_disks) {
+ MD_BUG();
+ return -EINVAL;
+ }
+
+
+ SET_SB(number);
+ SET_SB(major);
+ SET_SB(minor);
+ SET_SB(raid_disk);
+ SET_SB(state);
+
+ if ((info->state & (1<<MD_DISK_FAULTY))==0) {
+ err = md_import_device (dev, 0);
+ if (err) {
+ printk(KERN_WARNING "md: error, md_import_device() returned %d\n", err);
+ return -EINVAL;
+ }
+ rdev = find_rdev_all(dev);
+ if (!rdev) {
+ MD_BUG();
+ return -EINVAL;
+ }
+
+ rdev->old_dev = dev;
+ rdev->desc_nr = info->number;
+
+ bind_rdev_to_array(rdev, mddev);
+
+ persistent = !mddev->sb->not_persistent;
+ if (!persistent)
+ printk(KERN_INFO "md: nonpersistent superblock ...\n");
+
+ size = calc_dev_size(dev, mddev, persistent);
+ rdev->sb_offset = calc_dev_sboffset(dev, mddev, persistent);
+
+ if (!mddev->sb->size || (mddev->sb->size > size))
+ mddev->sb->size = size;
+ }
+
+ /*
+ * sync all other superblocks with the main superblock
+ */
+ sync_sbs(mddev);
+
+ return 0;
+}
+#undef SET_SB
+
+static int hot_generate_error(mddev_t * mddev, kdev_t dev)
+{
+ struct request_queue *q;
+ mdk_rdev_t *rdev;
+ mdp_disk_t *disk;
+
+ if (!mddev->pers)
+ return -ENODEV;
+
+ printk(KERN_INFO "md: trying to generate %s error in md%d ... \n",
+ partition_name(dev), mdidx(mddev));
+
+ rdev = find_rdev(mddev, dev);
+ if (!rdev) {
+ MD_BUG();
+ return -ENXIO;
+ }
+
+ if (rdev->desc_nr == -1) {
+ MD_BUG();
+ return -EINVAL;
+ }
+ disk = &mddev->sb->disks[rdev->desc_nr];
+ if (!disk_active(disk))
+ return -ENODEV;
+
+ q = blk_get_queue(rdev->dev);
+ if (!q) {
+ MD_BUG();
+ return -ENODEV;
+ }
+ printk(KERN_INFO "md: okay, generating error!\n");
+// q->oneshot_error = 1; // disabled for now
+
+ return 0;
+}
+
+static int hot_remove_disk(mddev_t * mddev, kdev_t dev)
+{
+ int err;
+ mdk_rdev_t *rdev;
+ mdp_disk_t *disk;
+
+ if (!mddev->pers)
+ return -ENODEV;
+
+ printk(KERN_INFO "md: trying to remove %s from md%d ... \n",
+ partition_name(dev), mdidx(mddev));
+
+ if (!mddev->pers->diskop) {
+ printk(KERN_WARNING "md%d: personality does not support diskops!\n",
+ mdidx(mddev));
+ return -EINVAL;
+ }
+
+ rdev = find_rdev(mddev, dev);
+ if (!rdev)
+ return -ENXIO;
+
+ if (rdev->desc_nr == -1) {
+ MD_BUG();
+ return -EINVAL;
+ }
+ disk = &mddev->sb->disks[rdev->desc_nr];
+ if (disk_active(disk))
+ goto busy;
+
+ if (disk_removed(disk))
+ return -EINVAL;
+
+ err = mddev->pers->diskop(mddev, &disk, DISKOP_HOT_REMOVE_DISK);
+ if (err == -EBUSY)
+ goto busy;
+
+ if (err) {
+ MD_BUG();
+ return -EINVAL;
+ }
+
+ remove_descriptor(disk, mddev->sb);
+ kick_rdev_from_array(rdev);
+ __md_update_sb(mddev);
+
+ return 0;
+busy:
+ printk(KERN_WARNING "md: cannot remove active disk %s from md%d ... \n",
+ partition_name(dev), mdidx(mddev));
+ return -EBUSY;
+}
+
+static int hot_add_disk(mddev_t * mddev, kdev_t dev)
+{
+ int i, err, persistent;
+ unsigned int size;
+ mdk_rdev_t *rdev;
+ mdp_disk_t *disk;
+
+ if (!mddev->pers)
+ return -ENODEV;
+
+ printk(KERN_INFO "md: trying to hot-add %s to md%d ... \n",
+ partition_name(dev), mdidx(mddev));
+
+ if (!mddev->pers->diskop) {
+ printk(KERN_WARNING "md%d: personality does not support diskops!\n",
+ mdidx(mddev));
+ return -EINVAL;
+ }
+
+ persistent = !mddev->sb->not_persistent;
+
+ rdev = find_rdev(mddev, dev);
+ if (rdev)
+ return -EBUSY;
+
+ err = md_import_device (dev, 0);
+ if (err) {
+ printk(KERN_WARNING "md: error, md_import_device() returned %d\n", err);
+ return -EINVAL;
+ }
+ rdev = find_rdev_all(dev);
+ if (!rdev) {
+ MD_BUG();
+ return -EINVAL;
+ }
+ if (rdev->faulty) {
+ printk(KERN_WARNING "md: can not hot-add faulty %s disk to md%d!\n",
+ partition_name(dev), mdidx(mddev));
+ err = -EINVAL;
+ goto abort_export;
+ }
+ size = calc_dev_size(dev, mddev, persistent);
+
+ if (size < mddev->sb->size) {
+ printk(KERN_WARNING "md%d: disk size %d blocks < array size %d\n",
+ mdidx(mddev), size, mddev->sb->size);
+ err = -ENOSPC;
+ goto abort_export;
+ }
+ bind_rdev_to_array(rdev, mddev);
+
+ /*
+ * The rest had better be atomic; disk failures can be
+ * noticed in interrupt context ...
+ */
+ rdev->old_dev = dev;
+ rdev->size = size;
+ rdev->sb_offset = calc_dev_sboffset(dev, mddev, persistent);
+
+ disk = mddev->sb->disks + mddev->sb->raid_disks;
+ for (i = mddev->sb->raid_disks; i < MD_SB_DISKS; i++) {
+ disk = mddev->sb->disks + i;
+
+ if (!disk->major && !disk->minor)
+ break;
+ if (disk_removed(disk))
+ break;
+ }
+ if (i == MD_SB_DISKS) {
+ printk(KERN_WARNING "md%d: can not hot-add to full array!\n",
+ mdidx(mddev));
+ err = -EBUSY;
+ goto abort_unbind_export;
+ }
+
+ if (disk_removed(disk)) {
+ /*
+ * reuse slot
+ */
+ if (disk->number != i) {
+ MD_BUG();
+ err = -EINVAL;
+ goto abort_unbind_export;
+ }
+ } else {
+ disk->number = i;
+ }
+
+ disk->raid_disk = disk->number;
+ disk->major = MAJOR(dev);
+ disk->minor = MINOR(dev);
+
+ if (mddev->pers->diskop(mddev, &disk, DISKOP_HOT_ADD_DISK)) {
+ MD_BUG();
+ err = -EINVAL;
+ goto abort_unbind_export;
+ }
+
+ mark_disk_spare(disk);
+ mddev->sb->nr_disks++;
+ mddev->sb->spare_disks++;
+ mddev->sb->working_disks++;
+
+ __md_update_sb(mddev);
+
+ /*
+ * Kick recovery, maybe this spare has to be added to the
+ * array immediately.
+ */
+ md_recover_arrays();
+
+ return 0;
+
+abort_unbind_export:
+ unbind_rdev_from_array(rdev);
+
+abort_export:
+ export_rdev(rdev);
+ return err;
+}
+
+#define SET_SB(x) mddev->sb->x = info->x
+static int set_array_info(mddev_t * mddev, mdu_array_info_t *info)
+{
+
+ if (alloc_array_sb(mddev))
+ return -ENOMEM;
+
+ mddev->sb->major_version = MD_MAJOR_VERSION;
+ mddev->sb->minor_version = MD_MINOR_VERSION;
+ mddev->sb->patch_version = MD_PATCHLEVEL_VERSION;
+ mddev->sb->ctime = CURRENT_TIME;
+
+ SET_SB(level);
+ SET_SB(size);
+ SET_SB(nr_disks);
+ SET_SB(raid_disks);
+ SET_SB(md_minor);
+ SET_SB(not_persistent);
+
+ SET_SB(state);
+ SET_SB(active_disks);
+ SET_SB(working_disks);
+ SET_SB(failed_disks);
+ SET_SB(spare_disks);
+
+ SET_SB(layout);
+ SET_SB(chunk_size);
+
+ mddev->sb->md_magic = MD_SB_MAGIC;
+
+ /*
+ * Generate a 128 bit UUID
+ */
+ get_random_bytes(&mddev->sb->set_uuid0, 4);
+ get_random_bytes(&mddev->sb->set_uuid1, 4);
+ get_random_bytes(&mddev->sb->set_uuid2, 4);
+ get_random_bytes(&mddev->sb->set_uuid3, 4);
+
+ return 0;
+}
+#undef SET_SB
+
+static int set_disk_faulty(mddev_t *mddev, kdev_t dev)
+{
+ int ret;
+
+ ret = md_error(mddev, dev);
+ return ret;
+}
+
+static int md_ioctl(struct inode *inode, struct file *file,
+ unsigned int cmd, unsigned long arg)
+{
+ unsigned int minor;
+ int err = 0;
+ struct hd_geometry *loc = (struct hd_geometry *) arg;
+ mddev_t *mddev = NULL;
+ kdev_t dev;
+
+ if (!md_capable_admin())
+ return -EACCES;
+
+ dev = inode->i_rdev;
+ minor = MINOR(dev);
+ if (minor >= MAX_MD_DEVS) {
+ MD_BUG();
+ return -EINVAL;
+ }
+
+ /*
+ * Commands dealing with the RAID driver but not any
+ * particular array:
+ */
+ switch (cmd)
+ {
+ case RAID_VERSION:
+ err = get_version((void *)arg);
+ goto done;
+
+ case PRINT_RAID_DEBUG:
+ err = 0;
+ md_print_devices();
+ goto done;
+
+#ifndef MODULE
+ case RAID_AUTORUN:
+ err = 0;
+ autostart_arrays();
+ goto done;
+#endif
+
+ case BLKGETSIZE:
+ case BLKGETSIZE64:
+ case BLKRAGET:
+ case BLKRASET:
+ case BLKFLSBUF:
+ case BLKBSZGET:
+ case BLKBSZSET:
+ err = blk_ioctl (dev, cmd, arg);
+ goto abort;
+
+ default:;
+ }
+
+ /*
+ * Commands creating/starting a new array:
+ */
+
+ mddev = inode->i_bdev->bd_inode->u.generic_ip;
+
+ if (!mddev) {
+ BUG();
+ goto abort;
+ }
+
+
+ if (cmd == START_ARRAY) {
+ /* START_ARRAY doesn't need to lock the array as autostart_array
+ * does the locking, and it could even be a different array
+ */
+ err = autostart_array(val_to_kdev(arg));
+ if (err) {
+ printk(KERN_WARNING "md: autostart %s failed!\n",
+ partition_name(val_to_kdev(arg)));
+ goto abort;
+ }
+ goto done;
+ }
+
+ err = mddev_lock(mddev);
+ if (err) {
+ printk(KERN_INFO "md: ioctl lock interrupted, reason %d, cmd %d\n",
+ err, cmd);
+ goto abort;
+ }
+
+ switch (cmd)
+ {
+ case SET_ARRAY_INFO:
+
+ if (!list_empty(&mddev->disks)) {
+ printk(KERN_WARNING "md: array md%d already has disks!\n",
+ mdidx(mddev));
+ err = -EBUSY;
+ goto abort_unlock;
+ }
+ if (mddev->sb) {
+ printk(KERN_WARNING "md: array md%d already has a superblock!\n",
+ mdidx(mddev));
+ err = -EBUSY;
+ goto abort_unlock;
+ }
+ if (arg) {
+ mdu_array_info_t info;
+ if (md_copy_from_user(&info, (void*)arg, sizeof(info))) {
+ err = -EFAULT;
+ goto abort_unlock;
+ }
+ err = set_array_info(mddev, &info);
+ if (err) {
+ printk(KERN_WARNING "md: couldnt set array info. %d\n", err);
+ goto abort_unlock;
+ }
+ }
+ goto done_unlock;
+
+ case START_ARRAY:
+ /*
+ * possibly make it lock the array ...
+ */
+ err = autostart_array((kdev_t)arg);
+ if (err) {
+ printk(KERN_WARNING "md: autostart %s failed!\n",
+ partition_name((kdev_t)arg));
+ goto abort_unlock;
+ }
+ goto done_unlock;
+
+ default:;
+ }
+
+ /*
+ * Commands querying/configuring an existing array:
+ */
+ /* if we don't have a superblock yet, only ADD_NEW_DISK or STOP_ARRAY is allowed */
+ if (!mddev->sb && cmd != ADD_NEW_DISK && cmd != STOP_ARRAY && cmd != RUN_ARRAY) {
+ err = -ENODEV;
+ goto abort_unlock;
+ }
+
+ /*
+ * Commands even a read-only array can execute:
+ */
+ switch (cmd)
+ {
+ case GET_ARRAY_INFO:
+ err = get_array_info(mddev, (void *)arg);
+ goto done_unlock;
+
+ case GET_DISK_INFO:
+ err = get_disk_info(mddev, (void *)arg);
+ goto done_unlock;
+
+ case RESTART_ARRAY_RW:
+ err = restart_array(mddev);
+ goto done_unlock;
+
+ case STOP_ARRAY:
+ err = do_md_stop (mddev, 0);
+ goto done_unlock;
+
+ case STOP_ARRAY_RO:
+ err = do_md_stop (mddev, 1);
+ goto done_unlock;
+
+ /*
+ * We have a problem here: there is no easy way to give a CHS
+ * virtual geometry. We currently pretend that we have 2 heads,
+ * 4 sectors (with a BIG number of cylinders...). This drives
+ * dosfs just mad... ;-)
+ */
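+ /* with 2 heads and 4 sectors per track there are 8 sectors per cylinder, hence the nr_sects/8 below */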
+ case HDIO_GETGEO:
+ if (!loc) {
+ err = -EINVAL;
+ goto abort_unlock;
+ }
+ err = md_put_user (2, (char *) &loc->heads);
+ if (err)
+ goto abort_unlock;
+ err = md_put_user (4, (char *) &loc->sectors);
+ if (err)
+ goto abort_unlock;
+ err = md_put_user (md_hd_struct[mdidx(mddev)].nr_sects/8,
+ (short *) &loc->cylinders);
+ if (err)
+ goto abort_unlock;
+ err = md_put_user (md_hd_struct[minor].start_sect,
+ (long *) &loc->start);
+ goto done_unlock;
+ }
+
+ /*
+ * The remaining ioctls are changing the state of the
+ * superblock, so we do not allow read-only arrays
+ * here:
+ */
+ if (mddev->ro) {
+ err = -EROFS;
+ goto abort_unlock;
+ }
+
+ switch (cmd)
+ {
+ case ADD_NEW_DISK:
+ {
+ mdu_disk_info_t info;
+ if (md_copy_from_user(&info, (void*)arg, sizeof(info)))
+ err = -EFAULT;
+ else
+ err = add_new_disk(mddev, &info);
+ goto done_unlock;
+ }
+ case HOT_GENERATE_ERROR:
+ err = hot_generate_error(mddev, (kdev_t)arg);
+ goto done_unlock;
+ case HOT_REMOVE_DISK:
+ err = hot_remove_disk(mddev, (kdev_t)arg);
+ goto done_unlock;
+
+ case HOT_ADD_DISK:
+ err = hot_add_disk(mddev, (kdev_t)arg);
+ goto done_unlock;
+
+ case SET_DISK_FAULTY:
+ err = set_disk_faulty(mddev, (kdev_t)arg);
+ goto done_unlock;
+
+ case RUN_ARRAY:
+ {
+ err = do_md_run (mddev);
+ /*
+ * we have to clean up the mess if
+ * the array cannot be run for some
+ * reason ...
+ */
+ if (err) {
+ mddev->sb_dirty = 0;
+ do_md_stop (mddev, 0);
+ }
+ goto done_unlock;
+ }
+
+ default:
+ printk(KERN_WARNING "md: %s(pid %d) used obsolete MD ioctl, "
+ "upgrade your software to use new ictls.\n",
+ current->comm, current->pid);
+ err = -EINVAL;
+ goto abort_unlock;
+ }
+
+done_unlock:
+abort_unlock:
+ mddev_unlock(mddev);
+
+ return err;
+done:
+ if (err)
+ MD_BUG();
+abort:
+ return err;
+}
+
+static int md_open(struct inode *inode, struct file *file)
+{
+ /*
+ * Succeed if we can find or allocate a mddev structure.
+ */
+ mddev_t *mddev = mddev_find(minor(inode->i_rdev));
+ int err = -ENOMEM;
+
+ if (!mddev)
+ goto out;
+
+ if ((err = mddev_lock(mddev)))
+ goto put;
+
+ err = 0;
+ mddev_unlock(mddev);
+ inode->i_bdev->bd_inode->u.generic_ip = mddev_get(mddev);
+ put:
+ mddev_put(mddev);
+ out:
+ return err;
+}
+
+static int md_release(struct inode *inode, struct file * file)
+{
+ mddev_t *mddev = inode->i_bdev->bd_inode->u.generic_ip;
+
+ if (!mddev)
+ BUG();
+ mddev_put(mddev);
+
+ return 0;
+}
+
+static struct block_device_operations md_fops=
+{
+ owner: THIS_MODULE,
+ open: md_open,
+ release: md_release,
+ ioctl: md_ioctl,
+};
+
+
+int md_thread(void * arg)
+{
+ mdk_thread_t *thread = arg;
+
+ md_lock_kernel();
+
+ /*
+ * Detach thread
+ */
+
+ daemonize();
+ reparent_to_init();
+
+ sprintf(current->comm, "%s", thread->name);
+ md_init_signals();
+ md_flush_signals();
+ thread->tsk = current;
+
+ /*
+ * md_thread is a 'system-thread', its priority should be very
+ * high. We avoid resource deadlocks individually in each
+ * raid personality. (RAID5 does preallocation) We also use RR and
+ * the very same RT priority as kswapd, thus we will never get
+ * into a priority inversion deadlock.
+ *
+ * we definitely have to have equal or higher priority than
+ * bdflush, otherwise bdflush will deadlock if there are too
+ * many dirty RAID5 blocks.
+ */
+ current->policy = SCHED_OTHER;
+ current->nice = -20;
+ md_unlock_kernel();
+
+ complete(thread->event);
+ while (thread->run) {
+ void (*run)(void *data);
+
+ wait_event_interruptible(thread->wqueue,
+ test_bit(THREAD_WAKEUP, &thread->flags));
+
+ clear_bit(THREAD_WAKEUP, &thread->flags);
+
+ run = thread->run;
+ if (run) {
+ run(thread->data);
+ run_task_queue(&tq_disk);
+ }
+ if (md_signal_pending(current))
+ md_flush_signals();
+ }
+ complete(thread->event);
+ return 0;
+}
+
+void md_wakeup_thread(mdk_thread_t *thread)
+{
+ dprintk("md: waking up MD thread %p.\n", thread);
+ set_bit(THREAD_WAKEUP, &thread->flags);
+ wake_up(&thread->wqueue);
+}
+
+mdk_thread_t *md_register_thread(void (*run) (void *),
+ void *data, const char *name)
+{
+ mdk_thread_t *thread;
+ int ret;
+ struct completion event;
+
+ thread = (mdk_thread_t *) kmalloc
+ (sizeof(mdk_thread_t), GFP_KERNEL);
+ if (!thread)
+ return NULL;
+
+ memset(thread, 0, sizeof(mdk_thread_t));
+ md_init_waitqueue_head(&thread->wqueue);
+
+ init_completion(&event);
+ thread->event = &event;
+ thread->run = run;
+ thread->data = data;
+ thread->name = name;
+ ret = kernel_thread(md_thread, thread, 0);
+ if (ret < 0) {
+ kfree(thread);
+ return NULL;
+ }
+ wait_for_completion(&event);
+ return thread;
+}
+
+void md_interrupt_thread(mdk_thread_t *thread)
+{
+ if (!thread->tsk) {
+ MD_BUG();
+ return;
+ }
+ dprintk("interrupting MD-thread pid %d\n", thread->tsk->pid);
+ send_sig(SIGKILL, thread->tsk, 1);
+}
+
+void md_unregister_thread(mdk_thread_t *thread)
+{
+ struct completion event;
+
+ init_completion(&event);
+
+ thread->event = &event;
+ thread->run = NULL;
+ thread->name = NULL;
+ md_interrupt_thread(thread);
+ wait_for_completion(&event);
+ kfree(thread);
+}
+
+static void md_recover_arrays(void)
+{
+ if (!md_recovery_thread) {
+ MD_BUG();
+ return;
+ }
+ md_wakeup_thread(md_recovery_thread);
+}
+
+
+int md_error(mddev_t *mddev, kdev_t rdev)
+{
+ mdk_rdev_t * rrdev;
+
+ dprintk("md_error dev:(%d:%d), rdev:(%d:%d), (caller: %p,%p,%p,%p).\n",
+ MD_MAJOR,mdidx(mddev),MAJOR(rdev),MINOR(rdev),
+ __builtin_return_address(0),__builtin_return_address(1),
+ __builtin_return_address(2),__builtin_return_address(3));
+
+ if (!mddev) {
+ MD_BUG();
+ return 0;
+ }
+ rrdev = find_rdev(mddev, rdev);
+ if (!rrdev || rrdev->faulty)
+ return 0;
+ if (!mddev->pers->error_handler
+ || mddev->pers->error_handler(mddev,rdev) <= 0) {
+ rrdev->faulty = 1;
+ } else
+ return 1;
+ /*
+ * if recovery was running, stop it now.
+ */
+ if (mddev->recovery_running)
+ mddev->recovery_running = -EIO;
+ md_recover_arrays();
+
+ return 0;
+}
+
+static void status_unused(struct seq_file *seq)
+{
+ int i = 0;
+ mdk_rdev_t *rdev;
+ struct md_list_head *tmp;
+
+ seq_printf(seq, "unused devices: ");
+
+ ITERATE_RDEV_ALL(rdev,tmp) {
+ if (list_empty(&rdev->same_set)) {
+ /*
+ * The device is not yet used by any array.
+ */
+ i++;
+ seq_printf(seq, "%s ",
+ partition_name(rdev->dev));
+ }
+ }
+ if (!i)
+ seq_printf(seq, "<none>");
+
+ seq_printf(seq, "\n");
+}
+
+
+static void status_resync(struct seq_file *seq, mddev_t * mddev)
+{
+ unsigned long max_blocks, resync, res, dt, db, rt;
+
+ resync = (mddev->curr_resync - atomic_read(&mddev->recovery_active))/2;
+ max_blocks = mddev->sb->size;
+
+ /*
+ * Should not happen.
+ */
+ if (!max_blocks)
+ MD_BUG();
+
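+ /* res is progress in tenths of a percent (0..1000); the bar below maps it onto 20 cells of 5% each */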
+ res = (resync/1024)*1000/(max_blocks/1024 + 1);
+ {
+ int i, x = res/50, y = 20-x;
+ seq_printf(seq, "[");
+ for (i = 0; i < x; i++)
+ seq_printf(seq, "=");
+ seq_printf(seq, ">");
+ for (i = 0; i < y; i++)
+ seq_printf(seq, ".");
+ seq_printf(seq, "] ");
+ }
+ seq_printf(seq, " %s =%3lu.%lu%% (%lu/%lu)",
+ (mddev->spare ? "recovery" : "resync"),
+ res/10, res % 10, resync, max_blocks);
+
+ /*
+ * We do not want to overflow, so the order of operands and
+ * the * 100 / 100 trick are important. We do a +1 to be
+ * safe against division by zero. We only estimate anyway.
+ *
+ * dt: time from mark until now
+ * db: blocks written from mark until now
+ * rt: remaining time
+ */
+ dt = ((jiffies - mddev->resync_mark) / HZ);
+ if (!dt) dt++;
+ db = resync - (mddev->resync_mark_cnt/2);
+ rt = (dt * ((max_blocks-resync) / (db/100+1)))/100;
+
+ seq_printf(seq, " finish=%lu.%lumin", rt / 60, (rt % 60)/6);
+
+ seq_printf(seq, " speed=%ldK/sec", db/dt);
+
+}
+
+
+static void *md_seq_start(struct seq_file *seq, loff_t *pos)
+{
+ struct list_head *tmp;
+ loff_t l = *pos;
+ mddev_t *mddev;
+
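+ /* (void*)1 and (void*)2 are sentinel iterator values for the header and trailer of the /proc/mdstat output */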
+ if (l > 0x10000)
+ return NULL;
+ if (!l--)
+ /* header */
+ return (void*)1;
+
+ list_for_each(tmp,&all_mddevs)
+ if (!l--) {
+ mddev = list_entry(tmp, mddev_t, all_mddevs);
+ return mddev;
+ }
+ return (void*)2;/* tail */
+}
+
+static void *md_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ struct list_head *tmp;
+ mddev_t *next_mddev, *mddev = v;
+
+ ++*pos;
+ if (v == (void*)2)
+ return NULL;
+
+ if (v == (void*)1)
+ tmp = all_mddevs.next;
+ else
+ tmp = mddev->all_mddevs.next;
+ if (tmp != &all_mddevs)
+ next_mddev = list_entry(tmp,mddev_t,all_mddevs);
+ else {
+ next_mddev = (void*)2;
+ *pos = 0x10000;
+ }
+
+ return next_mddev;
+
+}
+
+static void md_seq_stop(struct seq_file *seq, void *v)
+{
+
+}
+
+static int md_seq_show(struct seq_file *seq, void *v)
+{
+ int j, size;
+ struct md_list_head *tmp2;
+ mdk_rdev_t *rdev;
+ mddev_t *mddev = v;
+
+ if (v == (void*)1) {
+ seq_printf(seq, "Personalities : ");
+ for (j = 0; j < MAX_PERSONALITY; j++)
+ if (pers[j])
+ seq_printf(seq, "[%s] ", pers[j]->name);
+
+ seq_printf(seq, "\n");
+ seq_printf(seq, "read_ahead ");
+ if (read_ahead[MD_MAJOR] == INT_MAX)
+ seq_printf(seq, "not set\n");
+ else
+ seq_printf(seq, "%d sectors\n", read_ahead[MD_MAJOR]);
+ return 0;
+ }
+ if (v == (void*)2) {
+ status_unused(seq);
+ return 0;
+ }
+
+ seq_printf(seq, "md%d : %sactive", mdidx(mddev),
+ mddev->pers ? "" : "in");
+ if (mddev->pers) {
+ if (mddev->ro)
+ seq_printf(seq, " (read-only)");
+ seq_printf(seq, " %s", mddev->pers->name);
+ }
+
+ size = 0;
+ ITERATE_RDEV(mddev,rdev,tmp2) {
+ seq_printf(seq, " %s[%d]",
+ partition_name(rdev->dev), rdev->desc_nr);
+ if (rdev->faulty) {
+ seq_printf(seq, "(F)");
+ continue;
+ }
+ size += rdev->size;
+ }
+
+ if (!list_empty(&mddev->disks)) {
+ if (mddev->pers)
+ seq_printf(seq, "\n %d blocks",
+ md_size[mdidx(mddev)]);
+ else
+ seq_printf(seq, "\n %d blocks", size);
+ }
+
+ if (mddev->pers) {
+
+ mddev->pers->status (seq, mddev);
+
+ seq_printf(seq, "\n ");
+ if (mddev->curr_resync > 1)
+ status_resync (seq, mddev);
+ else if (mddev->curr_resync == 1)
+ seq_printf(seq, " resync=DELAYED");
+
+ }
+ seq_printf(seq, "\n");
+ return 0;
+}
+
+
+static struct seq_operations md_seq_ops = {
+ .start = md_seq_start,
+ .next = md_seq_next,
+ .stop = md_seq_stop,
+ .show = md_seq_show,
+};
+
+static int md_seq_open(struct inode *inode, struct file *file)
+{
+ int error;
+
+ error = seq_open(file, &md_seq_ops);
+ return error;
+}
+
+static struct file_operations md_seq_fops = {
+ .open = md_seq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+
+int register_md_personality(int pnum, mdk_personality_t *p)
+{
+ if (pnum >= MAX_PERSONALITY) {
+ MD_BUG();
+ return -EINVAL;
+ }
+
+ if (pers[pnum]) {
+ MD_BUG();
+ return -EBUSY;
+ }
+
+ pers[pnum] = p;
+ printk(KERN_INFO "md: %s personality registered as nr %d\n", p->name, pnum);
+ return 0;
+}
+
+int unregister_md_personality(int pnum)
+{
+ if (pnum >= MAX_PERSONALITY) {
+ MD_BUG();
+ return -EINVAL;
+ }
+
+ printk(KERN_INFO "md: %s personality unregistered\n", pers[pnum]->name);
+ pers[pnum] = NULL;
+ return 0;
+}
+
+mdp_disk_t *get_spare(mddev_t *mddev)
+{
+ mdp_super_t *sb = mddev->sb;
+ mdp_disk_t *disk;
+ mdk_rdev_t *rdev;
+ struct md_list_head *tmp;
+
+ ITERATE_RDEV(mddev,rdev,tmp) {
+ if (rdev->faulty)
+ continue;
+ if (!rdev->sb) {
+ MD_BUG();
+ continue;
+ }
+ disk = &sb->disks[rdev->desc_nr];
+ if (disk_faulty(disk)) {
+ MD_BUG();
+ continue;
+ }
+ if (disk_active(disk))
+ continue;
+ return disk;
+ }
+ return NULL;
+}
+
+static unsigned int sync_io[DK_MAX_MAJOR][DK_MAX_DISK];
+void md_sync_acct(kdev_t dev, unsigned long nr_sectors)
+{
+ unsigned int major = MAJOR(dev);
+ unsigned int index;
+
+ index = disk_index(dev);
+ if ((index >= DK_MAX_DISK) || (major >= DK_MAX_MAJOR))
+ return;
+
+ sync_io[major][index] += nr_sectors;
+}
+
+static int is_mddev_idle(mddev_t *mddev)
+{
+ mdk_rdev_t * rdev;
+ struct md_list_head *tmp;
+ int idle;
+ unsigned long curr_events;
+
+ idle = 1;
+ ITERATE_RDEV(mddev,rdev,tmp) {
+ int major = MAJOR(rdev->dev);
+ int idx = disk_index(rdev->dev);
+
+ if ((idx >= DK_MAX_DISK) || (major >= DK_MAX_MAJOR))
+ continue;
+
+ curr_events = kstat.dk_drive_rblk[major][idx] +
+ kstat.dk_drive_wblk[major][idx] ;
+ curr_events -= sync_io[major][idx];
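+ /* more than 32 blocks of non-resync I/O since the last check means the disk is busy */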
+ if ((curr_events - rdev->last_events) > 32) {
+ rdev->last_events = curr_events;
+ idle = 0;
+ }
+ }
+ return idle;
+}
+
+void md_done_sync(mddev_t *mddev, int blocks, int ok)
+{
+ /* another "blocks" (512byte) blocks have been synced */
+ atomic_sub(blocks, &mddev->recovery_active);
+ wake_up(&mddev->recovery_wait);
+ if (!ok) {
+ mddev->recovery_running = -EIO;
+ md_recover_arrays();
+ // stop recovery, signal do_sync ....
+ if (mddev->pers->stop_resync)
+ mddev->pers->stop_resync(mddev);
+ if (mddev->recovery_running)
+ md_interrupt_thread(md_recovery_thread);
+ }
+}
+
+
+DECLARE_WAIT_QUEUE_HEAD(resync_wait);
+
+#define SYNC_MARKS 10
+#define SYNC_MARK_STEP (3*HZ)
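+/* resync speed is measured against a sliding window of SYNC_MARKS timestamps taken every SYNC_MARK_STEP jiffies */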
+static void md_do_sync(void *data)
+{
+ mddev_t *mddev = data;
+ mddev_t *mddev2;
+ unsigned int max_sectors, currspeed,
+ j, window, err;
+ unsigned long mark[SYNC_MARKS];
+ unsigned long mark_cnt[SYNC_MARKS];
+ int last_mark,m;
+ struct md_list_head *tmp;
+ unsigned long last_check;
+
+ /* just in case the thread restarts... */
+ if (mddev->recovery_running <= 0)
+ return;
+
+ /* we overload curr_resync somewhat here.
+ * 0 == not engaged in resync at all
+ * 2 == checking that there is no conflict with another sync
+ * 1 == like 2, but have yielded to allow conflicting resync to
+ * commence
+ * other == active in resync - this many blocks
+ */
+ do {
+ mddev->curr_resync = 2;
+
+ ITERATE_MDDEV(mddev2,tmp) {
+ if (mddev2 == mddev)
+ continue;
+ if (mddev2->curr_resync &&
+ match_mddev_units(mddev,mddev2)) {
+ printk(KERN_INFO "md: delaying resync of md%d until md%d "
+ "has finished resync (they share one or more physical units)\n",
+ mdidx(mddev), mdidx(mddev2));
+ if (mddev < mddev2) /* arbitrarily yield */
+ mddev->curr_resync = 1;
+ if (wait_event_interruptible(resync_wait,
+ mddev2->curr_resync < 2)) {
+ md_flush_signals();
+ err = -EINTR;
+ mddev_put(mddev2);
+ goto out;
+ }
+ }
+ }
+ } while (mddev->curr_resync < 2);
+
+ max_sectors = mddev->sb->size<<1;
+
+ printk(KERN_INFO "md: syncing RAID array md%d\n", mdidx(mddev));
+ printk(KERN_INFO "md: minimum _guaranteed_ reconstruction speed: %d KB/sec/disc.\n",
+ sysctl_speed_limit_min);
+ printk(KERN_INFO "md: using maximum available idle IO bandwith "
+ "(but not more than %d KB/sec) for reconstruction.\n",
+ sysctl_speed_limit_max);
+
+ /*
+ * Resync has low priority.
+ */
+ current->nice = 19;
+
+ is_mddev_idle(mddev); /* this also initializes IO event counters */
+ for (m = 0; m < SYNC_MARKS; m++) {
+ mark[m] = jiffies;
+ mark_cnt[m] = 0;
+ }
+ last_mark = 0;
+ mddev->resync_mark = mark[last_mark];
+ mddev->resync_mark_cnt = mark_cnt[last_mark];
+
+ /*
+ * Tune reconstruction:
+ */
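+ /* the speed and idle checks below run roughly once per 'window' sectors, sized from the VM readahead setting */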
+ window = vm_max_readahead*(PAGE_SIZE/512);
+ printk(KERN_INFO "md: using %dk window, over a total of %d blocks.\n",
+ window/2,max_sectors/2);
+
+ atomic_set(&mddev->recovery_active, 0);
+ init_waitqueue_head(&mddev->recovery_wait);
+ last_check = 0;
+ for (j = 0; j < max_sectors;) {
+ int sectors;
+
+ sectors = mddev->pers->sync_request(mddev, j);
+
+ if (sectors < 0) {
+ err = sectors;
+ goto out;
+ }
+ atomic_add(sectors, &mddev->recovery_active);
+ j += sectors;
+ if (j>1) mddev->curr_resync = j;
+
+ if (last_check + window > j)
+ continue;
+
+ last_check = j;
+
+ run_task_queue(&tq_disk);
+
+ repeat:
+ if (jiffies >= mark[last_mark] + SYNC_MARK_STEP ) {
+ /* step marks */
+ int next = (last_mark+1) % SYNC_MARKS;
+
+ mddev->resync_mark = mark[next];
+ mddev->resync_mark_cnt = mark_cnt[next];
+ mark[next] = jiffies;
+ mark_cnt[next] = j - atomic_read(&mddev->recovery_active);
+ last_mark = next;
+ }
+
+
+ if (md_signal_pending(current)) {
+ /*
+ * got a signal, exit.
+ */
+ printk(KERN_INFO "md: md_do_sync() got signal ... exiting\n");
+ md_flush_signals();
+ err = -EINTR;
+ goto out;
+ }
+
+ /*
+ * this loop exits only when we are slower than
+ * the 'hard' speed limit, or the system has been IO-idle for
+ * a jiffy.
+ * the system might be non-idle CPU-wise, but we only care
+ * about not overloading the IO subsystem. (things like an
+ * e2fsck being done on the RAID array should execute fast)
+ */
+ if (md_need_resched(current))
+ schedule();
+
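+ /* current speed in KB/sec measured since the oldest mark; /2 converts sectors to KB, the +1 in the divisor avoids division by zero */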
+ currspeed = (j-mddev->resync_mark_cnt)/2/((jiffies-mddev->resync_mark)/HZ +1) +1;
+
+ if (currspeed > sysctl_speed_limit_min) {
+ current->nice = 19;
+
+ if ((currspeed > sysctl_speed_limit_max) ||
+ !is_mddev_idle(mddev)) {
+ current->state = TASK_INTERRUPTIBLE;
+ md_schedule_timeout(HZ/4);
+ goto repeat;
+ }
+ } else
+ current->nice = -20;
+ }
+ printk(KERN_INFO "md: md%d: sync done.\n",mdidx(mddev));
+ err = 0;
+ /*
+ * this also signals 'finished resyncing' to md_stop
+ */
+out:
+ wait_disk_event(mddev->recovery_wait, atomic_read(&mddev->recovery_active)==0);
+ /* tell personality that we are finished */
+ mddev->pers->sync_request(mddev, max_sectors, 1);
+
+ mddev->curr_resync = 0;
+ if (err)
+ mddev->recovery_running = err;
+ if (mddev->recovery_running > 0)
+ mddev->recovery_running = 0;
+ if (mddev->recovery_running == 0)
+ mddev->in_sync = 1;
+ md_recover_arrays();
+}
+
+
+/*
+ * This is the kernel thread that watches all md arrays for re-sync action
+ * that might be needed.
+ * It does not do any resync itself, but rather "forks" off other threads
+ * to do that as needed.
+ * When it is determined that resync is needed, we set "->recovery_running" and
+ * create a thread at ->sync_thread.
+ * When the thread finishes it clears recovery_running (or sets an error)
+ * and wakes up this thread, which will reap the thread and finish up.
+ */
+void md_do_recovery(void *data)
+{
+ mddev_t *mddev;
+ mdp_super_t *sb;
+ struct md_list_head *tmp;
+
+ dprintk(KERN_INFO "md: recovery thread got woken up ...\n");
+
+ ITERATE_MDDEV(mddev,tmp) if (mddev_lock(mddev)==0) {
+ sb = mddev->sb;
+ if (!sb || !mddev->pers || !mddev->pers->diskop || mddev->ro)
+ goto unlock;
+ if (mddev->recovery_running > 0)
+ /* resync/recovery still happening */
+ goto unlock;
+ if (mddev->sb_dirty)
+ md_update_sb(mddev);
+ if (mddev->sync_thread) {
+ /* resync has finished, collect result */
+ md_unregister_thread(mddev->sync_thread);
+ mddev->sync_thread = NULL;
+ if (mddev->recovery_running < 0) {
+ /* some sort of failure.
+ * If we were doing a reconstruction,
+ * we need to retrieve the spare
+ */
+ if (mddev->spare) {
+ mddev->pers->diskop(mddev, &mddev->spare,
+ DISKOP_SPARE_INACTIVE);
+ mddev->spare = NULL;
+ }
+ } else {
+ /* success...*/
+ if (mddev->spare) {
+ mddev->pers->diskop(mddev, &mddev->spare,
+ DISKOP_SPARE_ACTIVE);
+ mark_disk_sync(mddev->spare);
+ mark_disk_active(mddev->spare);
+ sb->active_disks++;
+ sb->spare_disks--;
+ mddev->spare = NULL;
+ }
+ }
+ __md_update_sb(mddev);
+ mddev->recovery_running = 0;
+ wake_up(&resync_wait);
+ goto unlock;
+ }
+ if (mddev->recovery_running) {
+ /* that's odd.. */
+ mddev->recovery_running = 0;
+ wake_up(&resync_wait);
+ }
+
+ if (sb->active_disks < sb->raid_disks) {
+ mddev->spare = get_spare(mddev);
+ if (!mddev->spare)
+ printk(KERN_ERR "md%d: no spare disk to reconstruct array! "
+ "-- continuing in degraded mode\n", mdidx(mddev));
+ else
+ printk(KERN_INFO "md%d: resyncing spare disk %s to replace failed disk\n",
+ mdidx(mddev), partition_name(MKDEV(mddev->spare->major,mddev->spare->minor)));
+ }
+ if (!mddev->spare && mddev->in_sync) {
+ /* nothing we can do ... */
+ goto unlock;
+ }
+ if (mddev->pers->sync_request) {
+ mddev->sync_thread = md_register_thread(md_do_sync,
+ mddev,
+ "md_resync");
+ if (!mddev->sync_thread) {
+ printk(KERN_ERR "md%d: could not start resync thread...\n", mdidx(mddev));
+ if (mddev->spare)
+ mddev->pers->diskop(mddev, &mddev->spare, DISKOP_SPARE_INACTIVE);
+ mddev->spare = NULL;
+ mddev->recovery_running = 0;
+ } else {
+ if (mddev->spare)
+ mddev->pers->diskop(mddev, &mddev->spare, DISKOP_SPARE_WRITE);
+ mddev->recovery_running = 1;
+ md_wakeup_thread(mddev->sync_thread);
+ }
+ }
+ unlock:
+ mddev_unlock(mddev);
+ }
+ dprintk(KERN_INFO "md: recovery thread finished ...\n");
+
+}
+
+int md_notify_reboot(struct notifier_block *this,
+ unsigned long code, void *x)
+{
+ struct md_list_head *tmp;
+ mddev_t *mddev;
+
+ if ((code == MD_SYS_DOWN) || (code == MD_SYS_HALT)
+ || (code == MD_SYS_POWER_OFF)) {
+
+ printk(KERN_INFO "md: stopping all md devices.\n");
+
+ ITERATE_MDDEV(mddev,tmp)
+ if (mddev_trylock(mddev)==0)
+ do_md_stop (mddev, 1);
+ /*
+ * certain more exotic SCSI devices are known to be
+ * volatile wrt too early system reboots. While the
+ * right place to handle this issue is the given
+ * driver, we do want to have a safe RAID driver ...
+ */
+ md_mdelay(1000*1);
+ }
+ return NOTIFY_DONE;
+}
+
+struct notifier_block md_notifier = {
+ notifier_call: md_notify_reboot,
+ next: NULL,
+ priority: INT_MAX, /* before any real devices */
+};
+
+static void md_geninit(void)
+{
+ struct proc_dir_entry *p;
+ int i;
+
+ for(i = 0; i < MAX_MD_DEVS; i++) {
+ md_blocksizes[i] = 1024;
+ md_size[i] = 0;
+ md_hardsect_sizes[i] = 512;
+ }
+ blksize_size[MAJOR_NR] = md_blocksizes;
+ blk_size[MAJOR_NR] = md_size;
+ max_readahead[MAJOR_NR] = md_maxreadahead;
+ hardsect_size[MAJOR_NR] = md_hardsect_sizes;
+
+ dprintk("md: sizeof(mdp_super_t) = %d\n", (int)sizeof(mdp_super_t));
+
+#ifdef CONFIG_PROC_FS
+ p = create_proc_entry("mdstat", S_IRUGO, NULL);
+ if (p)
+ p->proc_fops = &md_seq_fops;
+#endif
+}
+
+request_queue_t * md_queue_proc(kdev_t dev)
+{
+ mddev_t *mddev = mddev_find(minor(dev));
+ request_queue_t *q = BLK_DEFAULT_QUEUE(MAJOR_NR);
+ if (!mddev || atomic_read(&mddev->active)<2)
+ BUG();
+ if (mddev->pers)
+ q = &mddev->queue;
+ mddev_put(mddev); /* the caller must hold a reference... */
+ return q;
+}
+
+int md__init md_init(void)
+{
+ static char * name = "mdrecoveryd";
+ int minor;
+
+ printk(KERN_INFO "md: md driver %d.%d.%d MAX_MD_DEVS=%d, MD_SB_DISKS=%d\n",
+ MD_MAJOR_VERSION, MD_MINOR_VERSION,
+ MD_PATCHLEVEL_VERSION, MAX_MD_DEVS, MD_SB_DISKS);
+
+ if (devfs_register_blkdev (MAJOR_NR, "md", &md_fops))
+ {
+ printk(KERN_ALERT "md: Unable to get major %d for md\n", MAJOR_NR);
+ return (-1);
+ }
+ devfs_handle = devfs_mk_dir (NULL, "md", NULL);
+ /* we don't use devfs_register_series because we want to fill md_hd_struct */
+ for (minor=0; minor < MAX_MD_DEVS; ++minor) {
+ char devname[128];
+ sprintf (devname, "%u", minor);
+ md_hd_struct[minor].de = devfs_register (devfs_handle,
+ devname, DEVFS_FL_DEFAULT, MAJOR_NR, minor,
+ S_IFBLK | S_IRUSR | S_IWUSR, &md_fops, NULL);
+ }
+
+ /* all requests on an uninitialised device get failed... */
+ blk_queue_make_request(BLK_DEFAULT_QUEUE(MAJOR_NR), md_fail_request);
+ blk_dev[MAJOR_NR].queue = md_queue_proc;
+
+
+ read_ahead[MAJOR_NR] = INT_MAX;
+
+ add_gendisk(&md_gendisk);
+
+ md_recovery_thread = md_register_thread(md_do_recovery, NULL, name);
+ if (!md_recovery_thread)
+ printk(KERN_ALERT "md: bug: couldn't allocate md_recovery_thread\n");
+
+ md_register_reboot_notifier(&md_notifier);
+ raid_table_header = register_sysctl_table(raid_root_table, 1);
+
+ md_geninit();
+ return (0);
+}
+
+
+#ifndef MODULE
+
+/*
+ * When md (and any required personalities) are compiled into the kernel
+ * (not a module), arrays can be assembled at boot time using AUTODETECT,
+ * where specially marked partitions are registered with md_autodetect_dev(),
+ * and with MD_BOOT where devices to be collected are given on the boot line
+ * with md=.....
+ * The code for that is here.
+ */
+
+struct {
+ int set;
+ int noautodetect;
+} raid_setup_args md__initdata;
+
+/*
+ * Searches all registered partitions for autorun RAID arrays
+ * at boot time.
+ */
+static kdev_t detected_devices[128];
+static int dev_cnt;
+
+void md_autodetect_dev(kdev_t dev)
+{
+ if (dev_cnt >= 0 && dev_cnt < 127)
+ detected_devices[dev_cnt++] = dev;
+}
+
+
+static void autostart_arrays(void)
+{
+ mdk_rdev_t *rdev;
+ int i;
+
+ printk(KERN_INFO "md: Autodetecting RAID arrays.\n");
+
+ for (i = 0; i < dev_cnt; i++) {
+ kdev_t dev = detected_devices[i];
+
+ if (md_import_device(dev,1)) {
+ printk(KERN_ALERT "md: could not import %s!\n",
+ partition_name(dev));
+ continue;
+ }
+ /*
+ * Sanity checks:
+ */
+ rdev = find_rdev_all(dev);
+ if (!rdev) {
+ MD_BUG();
+ continue;
+ }
+ if (rdev->faulty) {
+ MD_BUG();
+ continue;
+ }
+ md_list_add(&rdev->pending, &pending_raid_disks);
+ }
+ dev_cnt = 0;
+
+ autorun_devices();
+}
+
+static struct {
+ char device_set [MAX_MD_DEVS];
+ int pers[MAX_MD_DEVS];
+ int chunk[MAX_MD_DEVS];
+ char *device_names[MAX_MD_DEVS];
+} md_setup_args md__initdata;
+
+/*
+ * Parse the command-line parameters given to our kernel, but do not
+ * actually try to invoke the MD device now; that is handled by
+ * md_setup_drive after the low-level disk drivers have initialised.
+ *
+ * 27/11/1999: Fixed to work correctly with the 2.3 kernel (which
+ * assigns the task of parsing integer arguments to the
+ * invoked program now). Added ability to initialise all
+ * the MD devices (by specifying multiple "md=" lines)
+ * instead of just one. -- KTK
+ * 18May2000: Added support for persistent-superblock arrays:
+ * md=n,0,factor,fault,device-list uses RAID0 for device n
+ * md=n,-1,factor,fault,device-list uses LINEAR for device n
+ * md=n,device-list reads a RAID superblock from the devices
+ * elements in device-list are read by name_to_kdev_t so can be
+ * a hex number or something like /dev/hda1 /dev/sdb
+ * 2001-06-03: Dave Cinege <dcinege@psychosis.com>
+ * Shifted name_to_kdev_t() and related operations to md_setup_drive()
+ * for later execution. Rewrote section to make devfs compatible.
+ */
+static int md__init md_setup(char *str)
+{
+ int minor, level, factor, fault;
+ char *pername = "";
+ char *str1 = str;
+
+ if (get_option(&str, &minor) != 2) { /* MD Number */
+ printk(KERN_WARNING "md: Too few arguments supplied to md=.\n");
+ return 0;
+ }
+ if (minor >= MAX_MD_DEVS) {
+ printk(KERN_WARNING "md: md=%d, Minor device number too high.\n", minor);
+ return 0;
+ } else if (md_setup_args.device_names[minor]) {
+ printk(KERN_WARNING "md: md=%d, Specified more then once. "
+ "Replacing previous definition.\n", minor);
+ }
+ switch (get_option(&str, &level)) { /* RAID Personality */
+ case 2: /* could be 0 or -1.. */
+ if (level == 0 || level == -1) {
+ if (get_option(&str, &factor) != 2 || /* Chunk Size */
+ get_option(&str, &fault) != 2) {
+ printk(KERN_WARNING "md: Too few arguments supplied to md=.\n");
+ return 0;
+ }
+ md_setup_args.pers[minor] = level;
+ md_setup_args.chunk[minor] = 1 << (factor+12);
+ switch(level) {
+ case -1:
+ level = LINEAR;
+ pername = "linear";
+ break;
+ case 0:
+ level = RAID0;
+ pername = "raid0";
+ break;
+ default:
+ printk(KERN_WARNING
+ "md: The kernel has not been configured for raid%d support!\n",
+ level);
+ return 0;
+ }
+ md_setup_args.pers[minor] = level;
+ break;
+ }
+ /* FALL THROUGH */
+ case 1: /* the first device is numeric */
+ str = str1;
+ /* FALL THROUGH */
+ case 0:
+ md_setup_args.pers[minor] = 0;
+ pername="super-block";
+ }
+
+ printk(KERN_INFO "md: Will configure md%d (%s) from %s, below.\n",
+ minor, pername, str);
+ md_setup_args.device_names[minor] = str;
+
+ return 1;
+}
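+
+/*
+ * Example boot lines accepted by the parser above (device names are
+ * placeholders only):
+ *
+ *   md=0,0,4,0,/dev/hda1,/dev/hdc1    md0 as RAID0, chunk factor 4,
+ *                                     i.e. chunk_size = 1 << (4+12) = 64K
+ *   md=1,-1,0,0,/dev/sda1,/dev/sdb1   md1 as LINEAR
+ *   md=2,/dev/sda2,/dev/sdb2          md2 assembled from its on-disk
+ *                                     superblocks ("super-block" case)
+ */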
+
+extern kdev_t name_to_kdev_t(char *line) md__init;
+void md__init md_setup_drive(void)
+{
+ int minor, i;
+ kdev_t dev;
+ mddev_t*mddev;
+ kdev_t devices[MD_SB_DISKS+1];
+
+ for (minor = 0; minor < MAX_MD_DEVS; minor++) {
+ int err = 0;
+ char *devname;
+ mdu_disk_info_t dinfo;
+
+ if ((devname = md_setup_args.device_names[minor]) == 0) continue;
+
+ for (i = 0; i < MD_SB_DISKS && devname != 0; i++) {
+
+ char *p;
+ void *handle;
+
+ p = strchr(devname, ',');
+ if (p)
+ *p++ = 0;
+
+ dev = name_to_kdev_t(devname);
+ handle = devfs_find_handle(NULL, devname, MAJOR (dev), MINOR (dev),
+ DEVFS_SPECIAL_BLK, 1);
+ if (handle != 0) {
+ unsigned major, minor;
+ devfs_get_maj_min(handle, &major, &minor);
+ dev = MKDEV(major, minor);
+ }
+ if (dev == 0) {
+ printk(KERN_WARNING "md: Unknown device name: %s\n", devname);
+ break;
+ }
+
+ devices[i] = dev;
+ md_setup_args.device_set[minor] = 1;
+
+ devname = p;
+ }
+ devices[i] = 0;
+
+ if (md_setup_args.device_set[minor] == 0)
+ continue;
+
+ printk(KERN_INFO "md: Loading md%d: %s\n", minor, md_setup_args.device_names[minor]);
+
+ mddev = mddev_find(minor);
+ if (!mddev) {
+ printk(KERN_ERR "md: kmalloc failed - cannot start array %d\n", minor);
+ continue;
+ }
+ if (mddev_lock(mddev)) {
+ printk(KERN_WARNING
+ "md: Ignoring md=%d, cannot lock!\n",
+ minor);
+ mddev_put(mddev);
+ continue;
+ }
+
+ if (mddev->sb || !list_empty(&mddev->disks)) {
+ printk(KERN_WARNING
+ "md: Ignoring md=%d, already autodetected. (Use raid=noautodetect)\n",
+ minor);
+ mddev_unlock(mddev);
+ mddev_put(mddev);
+ continue;
+ }
+ if (md_setup_args.pers[minor]) {
+ /* non-persistent */
+ mdu_array_info_t ainfo;
+ ainfo.level = pers_to_level(md_setup_args.pers[minor]);
+ ainfo.size = 0;
+ ainfo.nr_disks =0;
+ ainfo.raid_disks =0;
+ ainfo.md_minor =minor;
+ ainfo.not_persistent = 1;
+
+ ainfo.state = (1 << MD_SB_CLEAN);
+ ainfo.active_disks = 0;
+ ainfo.working_disks = 0;
+ ainfo.failed_disks = 0;
+ ainfo.spare_disks = 0;
+ ainfo.layout = 0;
+ ainfo.chunk_size = md_setup_args.chunk[minor];
+ err = set_array_info(mddev, &ainfo);
+ for (i = 0; !err && (dev = devices[i]); i++) {
+ dinfo.number = i;
+ dinfo.raid_disk = i;
+ dinfo.state = (1<<MD_DISK_ACTIVE)|(1<<MD_DISK_SYNC);
+ dinfo.major = MAJOR(dev);
+ dinfo.minor = MINOR(dev);
+ mddev->sb->nr_disks++;
+ mddev->sb->raid_disks++;
+ mddev->sb->active_disks++;
+ mddev->sb->working_disks++;
+ err = add_new_disk (mddev, &dinfo);
+ }
+ } else {
+ /* persistent */
+ for (i = 0; (dev = devices[i]); i++) {
+ dinfo.major = MAJOR(dev);
+ dinfo.minor = MINOR(dev);
+ add_new_disk (mddev, &dinfo);
+ }
+ }
+ if (!err)
+ err = do_md_run(mddev);
+ if (err) {
+ mddev->sb_dirty = 0;
+ do_md_stop(mddev, 0);
+ printk(KERN_WARNING "md: starting md%d failed\n", minor);
+ }
+ mddev_unlock(mddev);
+ mddev_put(mddev);
+ }
+}
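+
+/*
+ * Two assembly paths above: when md= named an explicit level
+ * (md_setup_args.pers[minor] set), a non-persistent array is built by
+ * filling in an mdu_array_info_t and adding every listed device as
+ * active and in-sync; otherwise each device is merely handed to
+ * add_new_disk() and the array description is taken from the on-disk
+ * superblocks.
+ */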
+
+static int md__init raid_setup(char *str)
+{
+ int len, pos;
+
+ len = strlen(str) + 1;
+ pos = 0;
+
+ while (pos < len) {
+ char *comma = strchr(str+pos, ',');
+ int wlen;
+ if (comma)
+ wlen = (comma-str)-pos;
+ else wlen = (len-1)-pos;
+
+ if (strncmp(str, "noautodetect", wlen) == 0)
+ raid_setup_args.noautodetect = 1;
+ pos += wlen+1;
+ }
+ raid_setup_args.set = 1;
+ return 1;
+}
+
+int md__init md_run_setup(void)
+{
+ if (raid_setup_args.noautodetect)
+ printk(KERN_INFO "md: Skipping autodetection of RAID arrays. (raid=noautodetect)\n");
+ else
+ autostart_arrays();
+ md_setup_drive();
+ return 0;
+}
+
+__setup("raid=", raid_setup);
+__setup("md=", md_setup);
+
+__initcall(md_init);
+__initcall(md_run_setup);
+
+#else /* It is a MODULE */
+
+int init_module(void)
+{
+ return md_init();
+}
+
+static void free_device_names(void)
+{
+ while (!list_empty(&device_names)) {
+ struct dname *tmp = list_entry(device_names.next,
+ dev_name_t, list);
+ list_del(&tmp->list);
+ kfree(tmp);
+ }
+}
+
+
+void cleanup_module(void)
+{
+ md_unregister_thread(md_recovery_thread);
+ devfs_unregister(devfs_handle);
+
+ devfs_unregister_blkdev(MAJOR_NR,"md");
+ unregister_reboot_notifier(&md_notifier);
+ unregister_sysctl_table(raid_table_header);
+#ifdef CONFIG_PROC_FS
+ remove_proc_entry("mdstat", NULL);
+#endif
+
+ del_gendisk(&md_gendisk);
+
+ blk_dev[MAJOR_NR].queue = NULL;
+ blksize_size[MAJOR_NR] = NULL;
+ blk_size[MAJOR_NR] = NULL;
+ max_readahead[MAJOR_NR] = NULL;
+ hardsect_size[MAJOR_NR] = NULL;
+
+ free_device_names();
+
+}
+#endif
+
+MD_EXPORT_SYMBOL(md_size);
+MD_EXPORT_SYMBOL(register_md_personality);
+MD_EXPORT_SYMBOL(unregister_md_personality);
+MD_EXPORT_SYMBOL(partition_name);
+MD_EXPORT_SYMBOL(md_error);
+MD_EXPORT_SYMBOL(md_done_sync);
+MD_EXPORT_SYMBOL(md_unregister_thread);
+MD_EXPORT_SYMBOL(md_update_sb);
+MD_EXPORT_SYMBOL(md_wakeup_thread);
+MD_EXPORT_SYMBOL(md_print_devices);
+MD_EXPORT_SYMBOL(find_rdev_nr);
+MD_EXPORT_SYMBOL(md_interrupt_thread);
+MODULE_LICENSE("GPL");
diff --git a/tests/linux/md-autostart/patch b/tests/linux/md-autostart/patch
new file mode 100644
index 0000000..9d6d660
--- /dev/null
+++ b/tests/linux/md-autostart/patch
@@ -0,0 +1,27 @@
+***************
+*** 2584,2601 ****
+ printk(KERN_WARNING "md: couldnt set array info. %d\n", err);
+ goto abort_unlock;
+ }
+- }
+- goto done_unlock;
+-
+- case START_ARRAY:
+- /*
+- * possibly make it lock the array ...
+- */
+- err = autostart_array(val_to_kdev(arg));
+- if (err) {
+- printk(KERN_WARNING "md: autostart %s failed!\n",
+- partition_name(val_to_kdev(arg)));
+- goto abort_unlock;
+ }
+ goto done_unlock;
+
+--- 2598,2603 ----
+ printk(KERN_WARNING "md: couldnt set array info. %d\n", err);
+ goto abort_unlock;
+ }
+ }
+ goto done_unlock;
+
diff --git a/tests/linux/md-loop/1 b/tests/linux/md-loop/1
new file mode 100644
index 0000000..f0abb8e
--- /dev/null
+++ b/tests/linux/md-loop/1
@@ -0,0 +1,3949 @@
+/*
+ md.c : Multiple Devices driver for Linux
+ Copyright (C) 1998, 1999, 2000 Ingo Molnar
+
+ completely rewritten, based on the MD driver code from Marc Zyngier
+
+ Changes:
+
+ - RAID-1/RAID-5 extensions by Miguel de Icaza, Gadi Oxman, Ingo Molnar
+ - boot support for linear and striped mode by Harald Hoyer <HarryH@Royal.Net>
+ - kerneld support by Boris Tobotras <boris@xtalk.msk.su>
+ - kmod support by: Cyrus Durgin
+ - RAID0 bugfixes: Mark Anthony Lisher <markal@iname.com>
+ - Devfs support by Richard Gooch <rgooch@atnf.csiro.au>
+
+ - lots of fixes and improvements to the RAID1/RAID5 and generic
+ RAID code (such as request based resynchronization):
+
+ Neil Brown <neilb@cse.unsw.edu.au>.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2, or (at your option)
+ any later version.
+
+ You should have received a copy of the GNU General Public License
+ (for example /usr/src/linux/COPYING); if not, write to the Free
+ Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+*/
+
+#include <linux/module.h>
+#include <linux/config.h>
+#include <linux/raid/md.h>
+#include <linux/sysctl.h>
+#include <linux/raid/xor.h>
+#include <linux/devfs_fs_kernel.h>
+
+#include <linux/init.h>
+
+#ifdef CONFIG_KMOD
+#include <linux/kmod.h>
+#endif
+
+#define __KERNEL_SYSCALLS__
+#include <linux/unistd.h>
+
+#include <asm/unaligned.h>
+
+#define MAJOR_NR MD_MAJOR
+#define MD_DRIVER
+
+#include <linux/blk.h>
+
+#define DEBUG 0
+#if DEBUG
+# define dprintk(x...) printk(x)
+#else
+# define dprintk(x...) do { } while(0)
+#endif
+
+#ifndef MODULE
+static void autostart_arrays (void);
+#endif
+
+static mdk_personality_t *pers[MAX_PERSONALITY];
+
+/*
+ * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit'
+ * is 100 KB/sec, so the extra system load does not show up that much.
+ * Increase it if you want to have more _guaranteed_ speed. Note that
+ * the RAID driver will use the maximum available bandwidth if the IO
+ * subsystem is idle. There is also an 'absolute maximum' reconstruction
+ * speed limit - in case reconstruction slows down your system despite
+ * idle IO detection.
+ *
+ * you can change it via /proc/sys/dev/raid/speed_limit_min and _max.
+ */
+
+static int sysctl_speed_limit_min = 100;
+static int sysctl_speed_limit_max = 100000;
+
+static struct ctl_table_header *raid_table_header;
+
+static ctl_table raid_table[] = {
+ {DEV_RAID_SPEED_LIMIT_MIN, "speed_limit_min",
+ &sysctl_speed_limit_min, sizeof(int), 0644, NULL, &proc_dointvec},
+ {DEV_RAID_SPEED_LIMIT_MAX, "speed_limit_max",
+ &sysctl_speed_limit_max, sizeof(int), 0644, NULL, &proc_dointvec},
+ {0}
+};
+
+static ctl_table raid_dir_table[] = {
+ {DEV_RAID, "raid", NULL, 0, 0555, raid_table},
+ {0}
+};
+
+static ctl_table raid_root_table[] = {
+ {CTL_DEV, "dev", NULL, 0, 0555, raid_dir_table},
+ {0}
+};
+
+/*
+ * these have to be allocated separately because external
+ * subsystems want to have a pre-defined structure
+ */
+struct hd_struct md_hd_struct[MAX_MD_DEVS];
+static int md_blocksizes[MAX_MD_DEVS];
+static int md_hardsect_sizes[MAX_MD_DEVS];
+static mdk_thread_t *md_recovery_thread;
+
+int md_size[MAX_MD_DEVS];
+
+static struct block_device_operations md_fops;
+static devfs_handle_t devfs_handle;
+
+static struct gendisk md_gendisk=
+{
+ major: MD_MAJOR,
+ major_name: "md",
+ minor_shift: 0,
+ max_p: 1,
+ part: md_hd_struct,
+ sizes: md_size,
+ nr_real: MAX_MD_DEVS,
+ real_devices: NULL,
+ next: NULL,
+ fops: &md_fops,
+};
+
+/*
+ * Allows iteration over all existing md arrays
+ */
+static MD_LIST_HEAD(all_mddevs);
+
+static mddev_t *mddev_map[MAX_MD_DEVS];
+
+static inline mddev_t * kdev_to_mddev (kdev_t dev)
+{
+ if (MAJOR(dev) != MD_MAJOR)
+ BUG();
+ return mddev_map[MINOR(dev)];
+}
+
+static int md_fail_request (request_queue_t *q, struct bio *bio)
+{
+ bio_io_error(bio);
+ return 0;
+}
+
+static mddev_t * alloc_mddev(kdev_t dev)
+{
+ mddev_t *mddev;
+
+ if (MAJOR(dev) != MD_MAJOR) {
+ MD_BUG();
+ return 0;
+ }
+ mddev = (mddev_t *) kmalloc(sizeof(*mddev), GFP_KERNEL);
+ if (!mddev)
+ return NULL;
+
+ memset(mddev, 0, sizeof(*mddev));
+
+ mddev->__minor = MINOR(dev);
+ init_MUTEX(&mddev->reconfig_sem);
+ init_MUTEX(&mddev->recovery_sem);
+ init_MUTEX(&mddev->resync_sem);
+ MD_INIT_LIST_HEAD(&mddev->disks);
+ MD_INIT_LIST_HEAD(&mddev->all_mddevs);
+ atomic_set(&mddev->active, 0);
+
+ mddev_map[mdidx(mddev)] = mddev;
+ md_list_add(&mddev->all_mddevs, &all_mddevs);
+
+ MOD_INC_USE_COUNT;
+
+ return mddev;
+}
+
+mdk_rdev_t * find_rdev_nr(mddev_t *mddev, int nr)
+{
+ mdk_rdev_t * rdev;
+ struct md_list_head *tmp;
+
+ ITERATE_RDEV(mddev,rdev,tmp) {
+ if (rdev->desc_nr == nr)
+ return rdev;
+ }
+ return NULL;
+}
+
+mdk_rdev_t * find_rdev(mddev_t * mddev, kdev_t dev)
+{
+ struct md_list_head *tmp;
+ mdk_rdev_t *rdev;
+
+ ITERATE_RDEV(mddev,rdev,tmp) {
+ if (rdev->dev == dev)
+ return rdev;
+ }
+ return NULL;
+}
+
+static MD_LIST_HEAD(device_names);
+
+char * partition_name(kdev_t dev)
+{
+ struct gendisk *hd;
+ static char nomem [] = "<nomem>";
+ dev_name_t *dname;
+ struct md_list_head *tmp;
+
+ list_for_each(tmp, &device_names) {
+ dname = md_list_entry(tmp, dev_name_t, list);
+ if (dname->dev == dev)
+ return dname->name;
+ }
+
+ dname = (dev_name_t *) kmalloc(sizeof(*dname), GFP_KERNEL);
+
+ if (!dname)
+ return nomem;
+ /*
+ * ok, add this new device name to the list
+ */
+ hd = get_gendisk (dev);
+ dname->name = NULL;
+ if (hd)
+ dname->name = disk_name (hd, MINOR(dev), dname->namebuf);
+ if (!dname->name) {
+ sprintf (dname->namebuf, "[dev %s]", kdevname(dev));
+ dname->name = dname->namebuf;
+ }
+
+ dname->dev = dev;
+ md_list_add(&dname->list, &device_names);
+
+ return dname->name;
+}
+
+static unsigned int calc_dev_sboffset(kdev_t dev, mddev_t *mddev,
+ int persistent)
+{
+ unsigned int size = 0;
+
+ if (blk_size[MAJOR(dev)])
+ size = blk_size[MAJOR(dev)][MINOR(dev)];
+ if (persistent)
+ size = MD_NEW_SIZE_BLOCKS(size);
+ return size;
+}
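+
+/*
+ * For a persistent superblock, MD_NEW_SIZE_BLOCKS() above rounds the
+ * device size (in 1K blocks) down to the reserved-area boundary and
+ * steps back by one reserved area, which (assuming the conventional
+ * v0.90 layout of 64K reserved at the end) puts the superblock in the
+ * last 64K of the device; e.g. a 4200000K partition would yield
+ * sb_offset = 4200000 - 64 = 4199936 (1K blocks).
+ */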
+
+static unsigned int calc_dev_size(kdev_t dev, mddev_t *mddev, int persistent)
+{
+ unsigned int size;
+
+ size = calc_dev_sboffset(dev, mddev, persistent);
+ if (!mddev->sb) {
+ MD_BUG();
+ return size;
+ }
+ if (mddev->sb->chunk_size)
+ size &= ~(mddev->sb->chunk_size/1024 - 1);
+ return size;
+}
+
+static unsigned int zoned_raid_size(mddev_t *mddev)
+{
+ unsigned int mask;
+ mdk_rdev_t * rdev;
+ struct md_list_head *tmp;
+
+ if (!mddev->sb) {
+ MD_BUG();
+ return -EINVAL;
+ }
+ /*
+ * do size and offset calculations.
+ */
+ mask = ~(mddev->sb->chunk_size/1024 - 1);
+
+ ITERATE_RDEV(mddev,rdev,tmp) {
+ rdev->size &= mask;
+ md_size[mdidx(mddev)] += rdev->size;
+ }
+ return 0;
+}
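+
+/*
+ * Worked example of the mask above: with a 64K chunk_size the mask is
+ * ~(64 - 1) = ~63, so each rdev size (in 1K blocks) is rounded down to
+ * a multiple of 64; a 10000K member contributes 9984K to md_size.
+ */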
+
+static void remove_descriptor(mdp_disk_t *disk, mdp_super_t *sb)
+{
+ if (disk_active(disk)) {
+ sb->working_disks--;
+ } else {
+ if (disk_spare(disk)) {
+ sb->spare_disks--;
+ sb->working_disks--;
+ } else {
+ sb->failed_disks--;
+ }
+ }
+ sb->nr_disks--;
+ disk->major = 0;
+ disk->minor = 0;
+ mark_disk_removed(disk);
+}
+
+#define BAD_MAGIC KERN_ERR \
+"md: invalid raid superblock magic on %s\n"
+
+#define BAD_MINOR KERN_ERR \
+"md: %s: invalid raid minor (%x)\n"
+
+#define OUT_OF_MEM KERN_ALERT \
+"md: out of memory.\n"
+
+#define NO_SB KERN_ERR \
+"md: disabled device %s, could not read superblock.\n"
+
+#define BAD_CSUM KERN_WARNING \
+"md: invalid superblock checksum on %s\n"
+
+static int alloc_array_sb(mddev_t * mddev)
+{
+ if (mddev->sb) {
+ MD_BUG();
+ return 0;
+ }
+
+ mddev->sb = (mdp_super_t *) __get_free_page (GFP_KERNEL);
+ if (!mddev->sb)
+ return -ENOMEM;
+ md_clear_page(mddev->sb);
+ return 0;
+}
+
+static int alloc_disk_sb(mdk_rdev_t * rdev)
+{
+ if (rdev->sb)
+ MD_BUG();
+
+ rdev->sb_page = alloc_page(GFP_KERNEL);
+ if (!rdev->sb_page) {
+ printk(OUT_OF_MEM);
+ return -EINVAL;
+ }
+ rdev->sb = (mdp_super_t *) page_address(rdev->sb_page);
+
+ return 0;
+}
+
+static void free_disk_sb(mdk_rdev_t * rdev)
+{
+ if (rdev->sb_page) {
+ page_cache_release(rdev->sb_page);
+ rdev->sb = NULL;
+ rdev->sb_page = NULL;
+ rdev->sb_offset = 0;
+ rdev->size = 0;
+ } else {
+ if (!rdev->faulty)
+ MD_BUG();
+ }
+}
+
+
+static void bh_complete(struct buffer_head *bh, int uptodate)
+{
+
+ if (uptodate)
+ set_bit(BH_Uptodate, &bh->b_state);
+
+ complete((struct completion*)bh->b_private);
+}
+
+static int sync_page_io(kdev_t dev, unsigned long sector, int size,
+ struct page *page, int rw)
+{
+ struct buffer_head bh;
+ struct completion event;
+
+ init_completion(&event);
+ init_buffer(&bh, bh_complete, &event);
+ bh.b_rdev = dev;
+ bh.b_rsector = sector;
+ bh.b_state = (1 << BH_Req) | (1 << BH_Mapped) | (1 << BH_Lock);
+ bh.b_size = size;
+ bh.b_page = page;
+ bh.b_reqnext = NULL;
+ bh.b_data = page_address(page);
+ generic_make_request(rw, &bh);
+
+ run_task_queue(&tq_disk);
+ wait_for_completion(&event);
+
+ return test_bit(BH_Uptodate, &bh.b_state);
+}
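+
+/*
+ * sync_page_io() is the synchronous I/O helper used for superblock
+ * reads and writes: it builds a single on-stack buffer_head, submits
+ * it with generic_make_request(), and sleeps on a completion until
+ * bh_complete() signals it.  The sector argument is in 512-byte
+ * sectors, which is why callers pass sb_offset<<1 (sb_offset is kept
+ * in 1K blocks).
+ */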
+
+static int read_disk_sb(mdk_rdev_t * rdev)
+{
+ int ret = -EINVAL;
+ kdev_t dev = rdev->dev;
+ unsigned long sb_offset;
+
+ if (!rdev->sb) {
+ MD_BUG();
+ goto abort;
+ }
+
+ /*
+ * Calculate the position of the superblock,
+ * it's at the end of the disk
+ */
+ sb_offset = calc_dev_sboffset(rdev->dev, rdev->mddev, 1);
+ rdev->sb_offset = sb_offset;
+
+ if (!sync_page_io(dev, sb_offset<<1, MD_SB_BYTES, rdev->sb_page, READ)) {
+ printk(NO_SB,partition_name(dev));
+ return -EINVAL;
+ }
+ printk(KERN_INFO " [events: %08lx]\n", (unsigned long)rdev->sb->events_lo);
+ ret = 0;
+abort:
+ return ret;
+}
+
+static unsigned int calc_sb_csum(mdp_super_t * sb)
+{
+ unsigned int disk_csum, csum;
+
+ disk_csum = sb->sb_csum;
+ sb->sb_csum = 0;
+ csum = csum_partial((void *)sb, MD_SB_BYTES, 0);
+ sb->sb_csum = disk_csum;
+ return csum;
+}
+
+/*
+ * Check one RAID superblock for generic plausibility
+ */
+
+static int check_disk_sb(mdk_rdev_t * rdev)
+{
+ mdp_super_t *sb;
+ int ret = -EINVAL;
+
+ sb = rdev->sb;
+ if (!sb) {
+ MD_BUG();
+ goto abort;
+ }
+
+ if (sb->md_magic != MD_SB_MAGIC) {
+ printk(BAD_MAGIC, partition_name(rdev->dev));
+ goto abort;
+ }
+
+ if (sb->md_minor >= MAX_MD_DEVS) {
+ printk(BAD_MINOR, partition_name(rdev->dev), sb->md_minor);
+ goto abort;
+ }
+
+ if (calc_sb_csum(sb) != sb->sb_csum) {
+ printk(BAD_CSUM, partition_name(rdev->dev));
+ goto abort;
+ }
+ ret = 0;
+abort:
+ return ret;
+}
+
+static kdev_t dev_unit(kdev_t dev)
+{
+ unsigned int mask;
+ struct gendisk *hd = get_gendisk(dev);
+
+ if (!hd)
+ return 0;
+ mask = ~((1 << hd->minor_shift) - 1);
+
+ return MKDEV(MAJOR(dev), MINOR(dev) & mask);
+}
+
+static mdk_rdev_t * match_dev_unit(mddev_t *mddev, kdev_t dev)
+{
+ struct md_list_head *tmp;
+ mdk_rdev_t *rdev;
+
+ ITERATE_RDEV(mddev,rdev,tmp)
+ if (dev_unit(rdev->dev) == dev_unit(dev))
+ return rdev;
+
+ return NULL;
+}
+
+static int match_mddev_units(mddev_t *mddev1, mddev_t *mddev2)
+{
+ struct md_list_head *tmp;
+ mdk_rdev_t *rdev;
+
+ ITERATE_RDEV(mddev1,rdev,tmp)
+ if (match_dev_unit(mddev2, rdev->dev))
+ return 1;
+
+ return 0;
+}
+
+static MD_LIST_HEAD(all_raid_disks);
+static MD_LIST_HEAD(pending_raid_disks);
+
+static void bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev)
+{
+ mdk_rdev_t *same_pdev;
+
+ if (rdev->mddev) {
+ MD_BUG();
+ return;
+ }
+ same_pdev = match_dev_unit(mddev, rdev->dev);
+ if (same_pdev)
+ printk( KERN_WARNING
+"md%d: WARNING: %s appears to be on the same physical disk as %s. True\n"
+" protection against single-disk failure might be compromised.\n",
+ mdidx(mddev), partition_name(rdev->dev),
+ partition_name(same_pdev->dev));
+
+ md_list_add(&rdev->same_set, &mddev->disks);
+ rdev->mddev = mddev;
+ printk(KERN_INFO "md: bind<%s>\n", partition_name(rdev->dev));
+}
+
+static void unbind_rdev_from_array(mdk_rdev_t * rdev)
+{
+ if (!rdev->mddev) {
+ MD_BUG();
+ return;
+ }
+ list_del_init(&rdev->same_set);
+ printk(KERN_INFO "md: unbind<%s>\n", partition_name(rdev->dev));
+ rdev->mddev = NULL;
+}
+
+/*
+ * prevent the device from being mounted, repartitioned or
+ * otherwise reused by a RAID array (or any other kernel
+ * subsystem), by opening the device. [simply getting an
+ * inode is not enough, the SCSI module usage code needs
+ * an explicit open() on the device]
+ */
+static int lock_rdev(mdk_rdev_t *rdev)
+{
+ int err = 0;
+ struct block_device *bdev;
+
+ bdev = bdget(rdev->dev);
+ if (!bdev)
+ return -ENOMEM;
+ err = blkdev_get(bdev, FMODE_READ|FMODE_WRITE, 0, BDEV_RAW);
+ if (!err)
+ rdev->bdev = bdev;
+ return err;
+}
+
+static void unlock_rdev(mdk_rdev_t *rdev)
+{
+ struct block_device *bdev = rdev->bdev;
+ rdev->bdev = NULL;
+ if (!bdev)
+ MD_BUG();
+ blkdev_put(bdev, BDEV_RAW);
+}
+
+void md_autodetect_dev(kdev_t dev);
+
+static void export_rdev(mdk_rdev_t * rdev)
+{
+ printk(KERN_INFO "md: export_rdev(%s)\n",partition_name(rdev->dev));
+ if (rdev->mddev)
+ MD_BUG();
+ unlock_rdev(rdev);
+ free_disk_sb(rdev);
+ list_del_init(&rdev->all);
+ if (!list_empty(&rdev->pending)) {
+ printk(KERN_INFO "md: (%s was pending)\n",
+ partition_name(rdev->dev));
+ list_del_init(&rdev->pending);
+ }
+#ifndef MODULE
+ md_autodetect_dev(rdev->dev);
+#endif
+ rdev->dev = 0;
+ rdev->faulty = 0;
+ kfree(rdev);
+}
+
+static void kick_rdev_from_array(mdk_rdev_t * rdev)
+{
+ unbind_rdev_from_array(rdev);
+ export_rdev(rdev);
+}
+
+static void export_array(mddev_t *mddev)
+{
+ struct md_list_head *tmp;
+ mdk_rdev_t *rdev;
+ mdp_super_t *sb = mddev->sb;
+
+ if (mddev->sb) {
+ mddev->sb = NULL;
+ free_page((unsigned long) sb);
+ }
+
+ ITERATE_RDEV(mddev,rdev,tmp) {
+ if (!rdev->mddev) {
+ MD_BUG();
+ continue;
+ }
+ kick_rdev_from_array(rdev);
+ }
+ if (!list_empty(&mddev->disks))
+ MD_BUG();
+}
+
+static void free_mddev(mddev_t *mddev)
+{
+ if (!mddev) {
+ MD_BUG();
+ return;
+ }
+
+ export_array(mddev);
+ md_size[mdidx(mddev)] = 0;
+ md_hd_struct[mdidx(mddev)].nr_sects = 0;
+
+ /*
+ * Make sure nobody else is using this mddev
+ * (careful, we rely on the global kernel lock here)
+ */
+ while (sem_getcount(&mddev->resync_sem) != 1)
+ schedule();
+ while (sem_getcount(&mddev->recovery_sem) != 1)
+ schedule();
+
+ del_mddev_mapping(mddev, MKDEV(MD_MAJOR, mdidx(mddev)));
+ md_list_del(&mddev->all_mddevs);
+ kfree(mddev);
+ MOD_DEC_USE_COUNT;
+}
+
+#undef BAD_CSUM
+#undef BAD_MAGIC
+#undef OUT_OF_MEM
+#undef NO_SB
+
+static void print_desc(mdp_disk_t *desc)
+{
+ printk(" DISK<N:%d,%s(%d,%d),R:%d,S:%d>\n", desc->number,
+ partition_name(MKDEV(desc->major,desc->minor)),
+ desc->major,desc->minor,desc->raid_disk,desc->state);
+}
+
+static void print_sb(mdp_super_t *sb)
+{
+ int i;
+
+ printk(KERN_INFO "md: SB: (V:%d.%d.%d) ID:<%08x.%08x.%08x.%08x> CT:%08x\n",
+ sb->major_version, sb->minor_version, sb->patch_version,
+ sb->set_uuid0, sb->set_uuid1, sb->set_uuid2, sb->set_uuid3,
+ sb->ctime);
+ printk(KERN_INFO "md: L%d S%08d ND:%d RD:%d md%d LO:%d CS:%d\n", sb->level,
+ sb->size, sb->nr_disks, sb->raid_disks, sb->md_minor,
+ sb->layout, sb->chunk_size);
+ printk(KERN_INFO "md: UT:%08x ST:%d AD:%d WD:%d FD:%d SD:%d CSUM:%08x E:%08lx\n",
+ sb->utime, sb->state, sb->active_disks, sb->working_disks,
+ sb->failed_disks, sb->spare_disks,
+ sb->sb_csum, (unsigned long)sb->events_lo);
+
+ printk(KERN_INFO);
+ for (i = 0; i < MD_SB_DISKS; i++) {
+ mdp_disk_t *desc;
+
+ desc = sb->disks + i;
+ if (desc->number || desc->major || desc->minor ||
+ desc->raid_disk || (desc->state && (desc->state != 4))) {
+ printk(" D %2d: ", i);
+ print_desc(desc);
+ }
+ }
+ printk(KERN_INFO "md: THIS: ");
+ print_desc(&sb->this_disk);
+
+}
+
+static void print_rdev(mdk_rdev_t *rdev)
+{
+ printk(KERN_INFO "md: rdev %s: O:%s, SZ:%08ld F:%d DN:%d ",
+ partition_name(rdev->dev), partition_name(rdev->old_dev),
+ rdev->size, rdev->faulty, rdev->desc_nr);
+ if (rdev->sb) {
+ printk(KERN_INFO "md: rdev superblock:\n");
+ print_sb(rdev->sb);
+ } else
+ printk(KERN_INFO "md: no rdev superblock!\n");
+}
+
+void md_print_devices(void)
+{
+ struct md_list_head *tmp, *tmp2;
+ mdk_rdev_t *rdev;
+ mddev_t *mddev;
+
+ printk("\n");
+ printk("md: **********************************\n");
+ printk("md: * <COMPLETE RAID STATE PRINTOUT> *\n");
+ printk("md: **********************************\n");
+ ITERATE_MDDEV(mddev,tmp) {
+ printk("md%d: ", mdidx(mddev));
+
+ ITERATE_RDEV(mddev,rdev,tmp2)
+ printk("<%s>", partition_name(rdev->dev));
+
+ if (mddev->sb) {
+ printk(" array superblock:\n");
+ print_sb(mddev->sb);
+ } else
+ printk(" no array superblock.\n");
+
+ ITERATE_RDEV(mddev,rdev,tmp2)
+ print_rdev(rdev);
+ }
+ printk("md: **********************************\n");
+ printk("\n");
+}
+
+static int sb_equal(mdp_super_t *sb1, mdp_super_t *sb2)
+{
+ int ret;
+ mdp_super_t *tmp1, *tmp2;
+
+ tmp1 = kmalloc(sizeof(*tmp1),GFP_KERNEL);
+ tmp2 = kmalloc(sizeof(*tmp2),GFP_KERNEL);
+
+ if (!tmp1 || !tmp2) {
+ ret = 0;
+ printk(KERN_INFO "md.c: sb1 is not equal to sb2!\n");
+ goto abort;
+ }
+
+ *tmp1 = *sb1;
+ *tmp2 = *sb2;
+
+ /*
+ * nr_disks is not constant
+ */
+ tmp1->nr_disks = 0;
+ tmp2->nr_disks = 0;
+
+ if (memcmp(tmp1, tmp2, MD_SB_GENERIC_CONSTANT_WORDS * 4))
+ ret = 0;
+ else
+ ret = 1;
+
+abort:
+ if (tmp1)
+ kfree(tmp1);
+ if (tmp2)
+ kfree(tmp2);
+
+ return ret;
+}
+
+static int uuid_equal(mdk_rdev_t *rdev1, mdk_rdev_t *rdev2)
+{
+ if ( (rdev1->sb->set_uuid0 == rdev2->sb->set_uuid0) &&
+ (rdev1->sb->set_uuid1 == rdev2->sb->set_uuid1) &&
+ (rdev1->sb->set_uuid2 == rdev2->sb->set_uuid2) &&
+ (rdev1->sb->set_uuid3 == rdev2->sb->set_uuid3))
+
+ return 1;
+
+ return 0;
+}
+
+static mdk_rdev_t * find_rdev_all(kdev_t dev)
+{
+ struct md_list_head *tmp;
+ mdk_rdev_t *rdev;
+
+ list_for_each(tmp, &all_raid_disks) {
+ rdev = md_list_entry(tmp, mdk_rdev_t, all);
+ if (rdev->dev == dev)
+ return rdev;
+ }
+ return NULL;
+}
+
+#define GETBLK_FAILED KERN_ERR \
+"md: getblk failed for device %s\n"
+
+static int write_disk_sb(mdk_rdev_t * rdev)
+{
+ kdev_t dev;
+ unsigned long sb_offset, size;
+
+ if (!rdev->sb) {
+ MD_BUG();
+ return 1;
+ }
+ if (rdev->faulty) {
+ MD_BUG();
+ return 1;
+ }
+ if (rdev->sb->md_magic != MD_SB_MAGIC) {
+ MD_BUG();
+ return 1;
+ }
+
+ dev = rdev->dev;
+ sb_offset = calc_dev_sboffset(dev, rdev->mddev, 1);
+ if (rdev->sb_offset != sb_offset) {
+ printk(KERN_INFO "%s's sb offset has changed from %ld to %ld, skipping\n",
+ partition_name(dev), rdev->sb_offset, sb_offset);
+ goto skip;
+ }
+ /*
+ * If the disk went offline meanwhile and it's just a spare, then
+ * its size has changed to zero silently, and the MD code does
+ * not yet know that it's faulty.
+ */
+ size = calc_dev_size(dev, rdev->mddev, 1);
+ if (size != rdev->size) {
+ printk(KERN_INFO "%s's size has changed from %ld to %ld since import, skipping\n",
+ partition_name(dev), rdev->size, size);
+ goto skip;
+ }
+
+ printk(KERN_INFO "(write) %s's sb offset: %ld\n", partition_name(dev), sb_offset);
+
+ if (!sync_page_io(dev, sb_offset<<1, MD_SB_BYTES, rdev->sb_page, WRITE)) {
+ printk("md: write_disk_sb failed for device %s\n", partition_name(dev));
+ return 1;
+ }
+skip:
+ return 0;
+}
+#undef GETBLK_FAILED
+
+static void set_this_disk(mddev_t *mddev, mdk_rdev_t *rdev)
+{
+ int i, ok = 0;
+ mdp_disk_t *desc;
+
+ for (i = 0; i < MD_SB_DISKS; i++) {
+ desc = mddev->sb->disks + i;
+#if 0
+ if (disk_faulty(desc)) {
+ if (MKDEV(desc->major,desc->minor) == rdev->dev)
+ ok = 1;
+ continue;
+ }
+#endif
+ if (MKDEV(desc->major,desc->minor) == rdev->dev) {
+ rdev->sb->this_disk = *desc;
+ rdev->desc_nr = desc->number;
+ ok = 1;
+ break;
+ }
+ }
+
+ if (!ok) {
+ MD_BUG();
+ }
+}
+
+static int sync_sbs(mddev_t * mddev)
+{
+ mdk_rdev_t *rdev;
+ mdp_super_t *sb;
+ struct md_list_head *tmp;
+
+ ITERATE_RDEV(mddev,rdev,tmp) {
+ if (rdev->faulty || rdev->alias_device)
+ continue;
+ sb = rdev->sb;
+ *sb = *mddev->sb;
+ set_this_disk(mddev, rdev);
+ sb->sb_csum = calc_sb_csum(sb);
+ }
+ return 0;
+}
+
+int md_update_sb(mddev_t * mddev)
+{
+ int err, count = 100;
+ struct md_list_head *tmp;
+ mdk_rdev_t *rdev;
+
+ if (!mddev->sb_dirty) {
+ printk("hm, md_update_sb() called without ->sb_dirty == 1, from %p.\n", __builtin_return_address(0));
+ return 0;
+ }
+ mddev->sb_dirty = 0;
+repeat:
+ mddev->sb->utime = CURRENT_TIME;
+ if ((++mddev->sb->events_lo)==0)
+ ++mddev->sb->events_hi;
+
+ if ((mddev->sb->events_lo|mddev->sb->events_hi)==0) {
+ /*
+ * oops, this 64-bit counter should never wrap.
+ * Either we are in around ~1 trillion A.C., assuming
+ * 1 reboot per second, or we have a bug:
+ */
+ MD_BUG();
+ mddev->sb->events_lo = mddev->sb->events_hi = 0xffffffff;
+ }
+ sync_sbs(mddev);
+
+ /*
+ * do not write anything to disk if using
+ * nonpersistent superblocks
+ */
+ if (mddev->sb->not_persistent)
+ return 0;
+
+ printk(KERN_INFO "md: updating md%d RAID superblock on device\n",
+ mdidx(mddev));
+
+ err = 0;
+ ITERATE_RDEV(mddev,rdev,tmp) {
+ printk(KERN_INFO "md: ");
+ if (rdev->faulty)
+ printk("(skipping faulty ");
+ if (rdev->alias_device)
+ printk("(skipping alias ");
+ if (!rdev->faulty && disk_faulty(&rdev->sb->this_disk)) {
+ printk("(skipping new-faulty %s )\n",
+ partition_name(rdev->dev));
+ continue;
+ }
+ printk("%s ", partition_name(rdev->dev));
+ if (!rdev->faulty && !rdev->alias_device) {
+ printk("[events: %08lx]",
+ (unsigned long)rdev->sb->events_lo);
+ err += write_disk_sb(rdev);
+ } else
+ printk(")\n");
+ }
+ if (err) {
+ if (--count) {
+ printk(KERN_ERR "md: errors occurred during superblock update, repeating\n");
+ goto repeat;
+ }
+ printk(KERN_ERR "md: excessive errors occurred during superblock update, exiting\n");
+ }
+ return 0;
+}
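+
+/*
+ * Note on the update loop above: the event count is a 64-bit value kept
+ * as an events_lo/events_hi pair and is bumped on every superblock
+ * update; if any member fails to write, the whole pass is retried, up
+ * to 100 times, before giving up.
+ */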
+
+/*
+ * Import a device. If 'on_disk', then sanity check the superblock
+ *
+ * mark the device faulty if:
+ *
+ * - the device is nonexistent (zero size)
+ * - the device has no valid superblock
+ *
+ */
+static int md_import_device(kdev_t newdev, int on_disk)
+{
+ int err;
+ mdk_rdev_t *rdev;
+ unsigned int size;
+
+ if (find_rdev_all(newdev))
+ return -EEXIST;
+
+ rdev = (mdk_rdev_t *) kmalloc(sizeof(*rdev), GFP_KERNEL);
+ if (!rdev) {
+ printk(KERN_ERR "md: could not alloc mem for %s!\n", partition_name(newdev));
+ return -ENOMEM;
+ }
+ memset(rdev, 0, sizeof(*rdev));
+
+ if (is_mounted(newdev)) {
+ printk(KERN_WARNING "md: can not import %s, has active inodes!\n",
+ partition_name(newdev));
+ err = -EBUSY;
+ goto abort_free;
+ }
+
+ if ((err = alloc_disk_sb(rdev)))
+ goto abort_free;
+
+ rdev->dev = newdev;
+ if (lock_rdev(rdev)) {
+ printk(KERN_ERR "md: could not lock %s, zero-size? Marking faulty.\n",
+ partition_name(newdev));
+ err = -EINVAL;
+ goto abort_free;
+ }
+ rdev->desc_nr = -1;
+ rdev->faulty = 0;
+
+ size = 0;
+ if (blk_size[MAJOR(newdev)])
+ size = blk_size[MAJOR(newdev)][MINOR(newdev)];
+ if (!size) {
+ printk(KERN_WARNING "md: %s has zero size, marking faulty!\n",
+ partition_name(newdev));
+ err = -EINVAL;
+ goto abort_free;
+ }
+
+ if (on_disk) {
+ if ((err = read_disk_sb(rdev))) {
+ printk(KERN_WARNING "md: could not read %s's sb, not importing!\n",
+ partition_name(newdev));
+ goto abort_free;
+ }
+ if ((err = check_disk_sb(rdev))) {
+ printk(KERN_WARNING "md: %s has invalid sb, not importing!\n",
+ partition_name(newdev));
+ goto abort_free;
+ }
+
+ if (rdev->sb->level != -4) {
+ rdev->old_dev = MKDEV(rdev->sb->this_disk.major,
+ rdev->sb->this_disk.minor);
+ rdev->desc_nr = rdev->sb->this_disk.number;
+ } else {
+ rdev->old_dev = MKDEV(0, 0);
+ rdev->desc_nr = -1;
+ }
+ }
+ md_list_add(&rdev->all, &all_raid_disks);
+ MD_INIT_LIST_HEAD(&rdev->pending);
+ INIT_LIST_HEAD(&rdev->same_set);
+
+ return 0;
+
+abort_free:
+ if (rdev->sb) {
+ if (rdev->bdev)
+ unlock_rdev(rdev);
+ free_disk_sb(rdev);
+ }
+ kfree(rdev);
+ return err;
+}
+
+/*
+ * Check a full RAID array for plausibility
+ */
+
+#define INCONSISTENT KERN_ERR \
+"md: fatal superblock inconsistency in %s -- removing from array\n"
+
+#define OUT_OF_DATE KERN_ERR \
+"md: superblock update time inconsistency -- using the most recent one\n"
+
+#define OLD_VERSION KERN_ALERT \
+"md: md%d: unsupported raid array version %d.%d.%d\n"
+
+#define NOT_CLEAN_IGNORE KERN_ERR \
+"md: md%d: raid array is not clean -- starting background reconstruction\n"
+
+#define UNKNOWN_LEVEL KERN_ERR \
+"md: md%d: unsupported raid level %d\n"
+
+static int analyze_sbs(mddev_t * mddev)
+{
+ int out_of_date = 0, i, first;
+ struct md_list_head *tmp, *tmp2;
+ mdk_rdev_t *rdev, *rdev2, *freshest;
+ mdp_super_t *sb;
+
+ /*
+ * Verify the RAID superblock on each real device
+ */
+ ITERATE_RDEV(mddev,rdev,tmp) {
+ if (rdev->faulty) {
+ MD_BUG();
+ goto abort;
+ }
+ if (!rdev->sb) {
+ MD_BUG();
+ goto abort;
+ }
+ if (check_disk_sb(rdev))
+ goto abort;
+ }
+
+ /*
+ * The superblock constant part has to be the same
+ * for all disks in the array.
+ */
+ sb = NULL;
+
+ ITERATE_RDEV(mddev,rdev,tmp) {
+ if (!sb) {
+ sb = rdev->sb;
+ continue;
+ }
+ if (!sb_equal(sb, rdev->sb)) {
+ printk(INCONSISTENT, partition_name(rdev->dev));
+ kick_rdev_from_array(rdev);
+ continue;
+ }
+ }
+
+ /*
+ * OK, we have all disks and the array is ready to run. Let's
+ * find the freshest superblock, that one will be the superblock
+ * that represents the whole array.
+ */
+ if (!mddev->sb)
+ if (alloc_array_sb(mddev))
+ goto abort;
+ sb = mddev->sb;
+ freshest = NULL;
+
+ ITERATE_RDEV(mddev,rdev,tmp) {
+ __u64 ev1, ev2;
+ /*
+ * if the checksum is invalid, use the superblock
+ * only as a last resort. (decrease its age by
+ * one event)
+ */
+ if (calc_sb_csum(rdev->sb) != rdev->sb->sb_csum) {
+ if (rdev->sb->events_lo || rdev->sb->events_hi)
+ if ((rdev->sb->events_lo--)==0)
+ rdev->sb->events_hi--;
+ }
+
+ printk(KERN_INFO "md: %s's event counter: %08lx\n",
+ partition_name(rdev->dev),
+ (unsigned long)rdev->sb->events_lo);
+ if (!freshest) {
+ freshest = rdev;
+ continue;
+ }
+ /*
+ * Find the newest superblock version
+ */
+ ev1 = md_event(rdev->sb);
+ ev2 = md_event(freshest->sb);
+ if (ev1 != ev2) {
+ out_of_date = 1;
+ if (ev1 > ev2)
+ freshest = rdev;
+ }
+ }
+ if (out_of_date) {
+ printk(OUT_OF_DATE);
+ printk(KERN_INFO "md: freshest: %s\n", partition_name(freshest->dev));
+ }
+ memcpy (sb, freshest->sb, sizeof(*sb));
+
+ /*
+ * at this point we have picked the 'best' superblock
+ * from all available superblocks.
+ * now we validate this superblock and kick out possibly
+ * failed disks.
+ */
+ ITERATE_RDEV(mddev,rdev,tmp) {
+ /*
+ * Kick all non-fresh devices
+ */
+ __u64 ev1, ev2;
+ ev1 = md_event(rdev->sb);
+ ev2 = md_event(sb);
+ ++ev1;
+ if (ev1 < ev2) {
+ printk(KERN_WARNING "md: kicking non-fresh %s from array!\n",
+ partition_name(rdev->dev));
+ kick_rdev_from_array(rdev);
+ continue;
+ }
+ }
+
+ /*
+ * Fix up changed device names ... but only if this disk has a
+ * recent update time. Use faulty checksum ones too.
+ */
+ if (mddev->sb->level != -4)
+ ITERATE_RDEV(mddev,rdev,tmp) {
+ __u64 ev1, ev2, ev3;
+ if (rdev->faulty || rdev->alias_device) {
+ MD_BUG();
+ goto abort;
+ }
+ ev1 = md_event(rdev->sb);
+ ev2 = md_event(sb);
+ ev3 = ev2;
+ --ev3;
+ if ((rdev->dev != rdev->old_dev) &&
+ ((ev1 == ev2) || (ev1 == ev3))) {
+ mdp_disk_t *desc;
+
+ printk(KERN_WARNING "md: device name has changed from %s to %s since last import!\n",
+ partition_name(rdev->old_dev), partition_name(rdev->dev));
+ if (rdev->desc_nr == -1) {
+ MD_BUG();
+ goto abort;
+ }
+ desc = &sb->disks[rdev->desc_nr];
+ if (rdev->old_dev != MKDEV(desc->major, desc->minor)) {
+ MD_BUG();
+ goto abort;
+ }
+ desc->major = MAJOR(rdev->dev);
+ desc->minor = MINOR(rdev->dev);
+ desc = &rdev->sb->this_disk;
+ desc->major = MAJOR(rdev->dev);
+ desc->minor = MINOR(rdev->dev);
+ }
+ }
+
+ /*
+ * Remove unavailable and faulty devices ...
+ *
+ * note that if an array becomes completely unrunnable due to
+ * missing devices, we do not write the superblock back, so the
+ * administrator has a chance to fix things up. The removal thus
+ * only happens if it's nonfatal to the contents of the array.
+ */
+ for (i = 0; i < MD_SB_DISKS; i++) {
+ int found;
+ mdp_disk_t *desc;
+ kdev_t dev;
+
+ desc = sb->disks + i;
+ dev = MKDEV(desc->major, desc->minor);
+
+ /*
+ * We kick faulty devices/descriptors immediately.
+ *
+ * Note: multipath devices are a special case. Since we
+ * were able to read the superblock on the path, we don't
+ * care if it was previously marked as faulty, it's up now
+ * so enable it.
+ */
+ if (disk_faulty(desc) && mddev->sb->level != -4) {
+ found = 0;
+ ITERATE_RDEV(mddev,rdev,tmp) {
+ if (rdev->desc_nr != desc->number)
+ continue;
+ printk(KERN_WARNING "md%d: kicking faulty %s!\n",
+ mdidx(mddev),partition_name(rdev->dev));
+ kick_rdev_from_array(rdev);
+ found = 1;
+ break;
+ }
+ if (!found) {
+ if (dev == MKDEV(0,0))
+ continue;
+ printk(KERN_WARNING "md%d: removing former faulty %s!\n",
+ mdidx(mddev), partition_name(dev));
+ }
+ remove_descriptor(desc, sb);
+ continue;
+ } else if (disk_faulty(desc)) {
+ /*
+ * multipath entry marked as faulty, unfaulty it
+ */
+ rdev = find_rdev(mddev, dev);
+ if(rdev)
+ mark_disk_spare(desc);
+ else
+ remove_descriptor(desc, sb);
+ }
+
+ if (dev == MKDEV(0,0))
+ continue;
+ /*
+ * Is this device present in the rdev ring?
+ */
+ found = 0;
+ ITERATE_RDEV(mddev,rdev,tmp) {
+ /*
+ * Multi-path IO special-case: since we have no
+ * this_disk descriptor at auto-detect time,
+ * we cannot check rdev->number.
+ * We can check the device though.
+ */
+ if ((sb->level == -4) && (rdev->dev ==
+ MKDEV(desc->major,desc->minor))) {
+ found = 1;
+ break;
+ }
+ if (rdev->desc_nr == desc->number) {
+ found = 1;
+ break;
+ }
+ }
+ if (found)
+ continue;
+
+ printk(KERN_WARNING "md%d: former device %s is unavailable, removing from array!\n",
+ mdidx(mddev), partition_name(dev));
+ remove_descriptor(desc, sb);
+ }
+
+ /*
+ * Double check whether all devices mentioned in the
+ * superblock are in the rdev ring.
+ */
+ first = 1;
+ for (i = 0; i < MD_SB_DISKS; i++) {
+ mdp_disk_t *desc;
+ kdev_t dev;
+
+ desc = sb->disks + i;
+ dev = MKDEV(desc->major, desc->minor);
+
+ if (dev == MKDEV(0,0))
+ continue;
+
+ if (disk_faulty(desc)) {
+ MD_BUG();
+ goto abort;
+ }
+
+ rdev = find_rdev(mddev, dev);
+ if (!rdev) {
+ MD_BUG();
+ goto abort;
+ }
+ /*
+ * In the case of Multipath-IO, we have no
+ * other information source to find out which
+ * disk is which, only the position of the device
+ * in the superblock:
+ */
+ if (mddev->sb->level == -4) {
+ if ((rdev->desc_nr != -1) && (rdev->desc_nr != i)) {
+ MD_BUG();
+ goto abort;
+ }
+ rdev->desc_nr = i;
+ if (!first)
+ rdev->alias_device = 1;
+ else
+ first = 0;
+ }
+ }
+
+ /*
+ * Kick all rdevs that are not in the
+ * descriptor array:
+ */
+ ITERATE_RDEV(mddev,rdev,tmp) {
+ if (rdev->desc_nr == -1)
+ kick_rdev_from_array(rdev);
+ }
+
+ /*
+ * Do a final reality check.
+ */
+ if (mddev->sb->level != -4) {
+ ITERATE_RDEV(mddev,rdev,tmp) {
+ if (rdev->desc_nr == -1) {
+ MD_BUG();
+ goto abort;
+ }
+ /*
+ * is the desc_nr unique?
+ */
+ ITERATE_RDEV(mddev,rdev2,tmp2) {
+ if ((rdev2 != rdev) &&
+ (rdev2->desc_nr == rdev->desc_nr)) {
+ MD_BUG();
+ goto abort;
+ }
+ }
+ /*
+ * is the device unique?
+ */
+ ITERATE_RDEV(mddev,rdev2,tmp2) {
+ if ((rdev2 != rdev) &&
+ (rdev2->dev == rdev->dev)) {
+ MD_BUG();
+ goto abort;
+ }
+ }
+ }
+ }
+
+ /*
+ * Check if we can support this RAID array
+ */
+ if (sb->major_version != MD_MAJOR_VERSION ||
+ sb->minor_version > MD_MINOR_VERSION) {
+
+ printk(OLD_VERSION, mdidx(mddev), sb->major_version,
+ sb->minor_version, sb->patch_version);
+ goto abort;
+ }
+
+ if ((sb->state != (1 << MD_SB_CLEAN)) && ((sb->level == 1) ||
+ (sb->level == 4) || (sb->level == 5)))
+ printk(NOT_CLEAN_IGNORE, mdidx(mddev));
+
+ return 0;
+abort:
+ return 1;
+}
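+
+/*
+ * Freshness rule used above: a member whose event count is more than
+ * one behind the chosen "freshest" superblock (ev1 + 1 < ev2) is kicked
+ * from the array; being exactly one event behind is tolerated,
+ * presumably to cover a device that missed only the very last update.
+ */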
+
+#undef INCONSISTENT
+#undef OUT_OF_DATE
+#undef OLD_VERSION
+#undef OLD_LEVEL
+
+static int device_size_calculation(mddev_t * mddev)
+{
+ int data_disks = 0, persistent;
+ unsigned int readahead;
+ mdp_super_t *sb = mddev->sb;
+ struct md_list_head *tmp;
+ mdk_rdev_t *rdev;
+
+ /*
+ * Do device size calculation. Bail out if too small.
+ * (we have to do this after having validated chunk_size,
+ * because device size has to be modulo chunk_size)
+ */
+ persistent = !mddev->sb->not_persistent;
+ ITERATE_RDEV(mddev,rdev,tmp) {
+ if (rdev->faulty)
+ continue;
+ if (rdev->size) {
+ MD_BUG();
+ continue;
+ }
+ rdev->size = calc_dev_size(rdev->dev, mddev, persistent);
+ if (rdev->size < sb->chunk_size / 1024) {
+ printk(KERN_WARNING
+ "md: Dev %s smaller than chunk_size: %ldk < %dk\n",
+ partition_name(rdev->dev),
+ rdev->size, sb->chunk_size / 1024);
+ return -EINVAL;
+ }
+ }
+
+ switch (sb->level) {
+ case -4:
+ data_disks = 1;
+ break;
+ case -3:
+ data_disks = 1;
+ break;
+ case -2:
+ data_disks = 1;
+ break;
+ case -1:
+ zoned_raid_size(mddev);
+ data_disks = 1;
+ break;
+ case 0:
+ zoned_raid_size(mddev);
+ data_disks = sb->raid_disks;
+ break;
+ case 1:
+ data_disks = 1;
+ break;
+ case 4:
+ case 5:
+ data_disks = sb->raid_disks-1;
+ break;
+ default:
+ printk(UNKNOWN_LEVEL, mdidx(mddev), sb->level);
+ goto abort;
+ }
+ if (!md_size[mdidx(mddev)])
+ md_size[mdidx(mddev)] = sb->size * data_disks;
+
+ readahead = MD_READAHEAD;
+ if ((sb->level == 0) || (sb->level == 4) || (sb->level == 5)) {
+ readahead = (mddev->sb->chunk_size>>PAGE_SHIFT) * 4 * data_disks;
+ if (readahead < data_disks * (MAX_SECTORS>>(PAGE_SHIFT-9))*2)
+ readahead = data_disks * (MAX_SECTORS>>(PAGE_SHIFT-9))*2;
+ } else {
+ // (no multipath branch - it uses the default setting)
+ if (sb->level == -3)
+ readahead = 0;
+ }
+
+ printk(KERN_INFO "md%d: max total readahead window set to %ldk\n",
+ mdidx(mddev), readahead*(PAGE_SIZE/1024));
+
+ printk(KERN_INFO
+ "md%d: %d data-disks, max readahead per data-disk: %ldk\n",
+ mdidx(mddev), data_disks, readahead/data_disks*(PAGE_SIZE/1024));
+ return 0;
+abort:
+ return 1;
+}
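+
+/*
+ * Readahead example for the formula above: a 5-disk RAID5 (4 data
+ * disks) with 64K chunks and 4K pages gets
+ *   readahead = (64K >> PAGE_SHIFT) * 4 * 4 = 16 * 4 * 4 = 256 pages,
+ * i.e. a 1024K total readahead window.
+ */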
+
+
+#define TOO_BIG_CHUNKSIZE KERN_ERR \
+"too big chunk_size: %d > %d\n"
+
+#define TOO_SMALL_CHUNKSIZE KERN_ERR \
+"too small chunk_size: %d < %ld\n"
+
+#define BAD_CHUNKSIZE KERN_ERR \
+"no chunksize specified, see 'man raidtab'\n"
+
+static int do_md_run(mddev_t * mddev)
+{
+ int pnum, err;
+ int chunk_size;
+ struct md_list_head *tmp;
+ mdk_rdev_t *rdev;
+
+
+ if (list_empty(&mddev->disks)) {
+ MD_BUG();
+ return -EINVAL;
+ }
+
+ if (mddev->pers)
+ return -EBUSY;
+
+ /*
+ * Resize disks to align partitions size on a given
+ * chunk size.
+ */
+ md_size[mdidx(mddev)] = 0;
+
+ /*
+ * Analyze all RAID superblock(s)
+ */
+ if (analyze_sbs(mddev)) {
+ MD_BUG();
+ return -EINVAL;
+ }
+
+ chunk_size = mddev->sb->chunk_size;
+ pnum = level_to_pers(mddev->sb->level);
+
+ if ((pnum != MULTIPATH) && (pnum != RAID1)) {
+ if (!chunk_size) {
+ /*
+ * 'default chunksize' in the old md code used to
+ * be PAGE_SIZE, baaad.
+ * we abort here to be on the safe side. We don't
+ * want to continue the bad practice.
+ */
+ printk(BAD_CHUNKSIZE);
+ return -EINVAL;
+ }
+ if (chunk_size > MAX_CHUNK_SIZE) {
+ printk(TOO_BIG_CHUNKSIZE, chunk_size, MAX_CHUNK_SIZE);
+ return -EINVAL;
+ }
+ /*
+ * chunk-size has to be a power of 2 and a multiple of PAGE_SIZE
+ */
+ if ( (1 << ffz(~chunk_size)) != chunk_size) {
+ MD_BUG();
+ return -EINVAL;
+ }
+ if (chunk_size < PAGE_SIZE) {
+ printk(TOO_SMALL_CHUNKSIZE, chunk_size, PAGE_SIZE);
+ return -EINVAL;
+ }
+ } else
+ if (chunk_size)
+ printk(KERN_INFO "md: RAID level %d does not need chunksize! Continuing anyway.\n",
+ mddev->sb->level);
+
+ if (pnum >= MAX_PERSONALITY) {
+ MD_BUG();
+ return -EINVAL;
+ }
+
+ if (!pers[pnum])
+ {
+#ifdef CONFIG_KMOD
+ char module_name[80];
+ sprintf (module_name, "md-personality-%d", pnum);
+ request_module (module_name);
+ if (!pers[pnum])
+#endif
+ {
+ printk(KERN_ERR "md: personality %d is not loaded!\n",
+ pnum);
+ return -EINVAL;
+ }
+ }
+
+ if (device_size_calculation(mddev))
+ return -EINVAL;
+
+ /*
+ * Drop all container device buffers, from now on
+ * the only valid external interface is through the md
+ * device.
+ * Also find largest hardsector size
+ */
+ md_hardsect_sizes[mdidx(mddev)] = 512;
+ ITERATE_RDEV(mddev,rdev,tmp) {
+ if (rdev->faulty)
+ continue;
+ invalidate_device(rdev->dev, 1);
+ if (get_hardsect_size(rdev->dev)
+ > md_hardsect_sizes[mdidx(mddev)])
+ md_hardsect_sizes[mdidx(mddev)] =
+ get_hardsect_size(rdev->dev);
+ }
+ md_blocksizes[mdidx(mddev)] = 1024;
+ if (md_blocksizes[mdidx(mddev)] < md_hardsect_sizes[mdidx(mddev)])
+ md_blocksizes[mdidx(mddev)] = md_hardsect_sizes[mdidx(mddev)];
+ mddev->pers = pers[pnum];
+
+ blk_queue_make_request(&mddev->queue, mddev->pers->make_request);
+ mddev->queue.queuedata = mddev;
+
+ err = mddev->pers->run(mddev);
+ if (err) {
+ printk(KERN_ERR "md: pers->run() failed ...\n");
+ mddev->pers = NULL;
+ return -EINVAL;
+ }
+
+ mddev->sb->state &= ~(1 << MD_SB_CLEAN);
+ mddev->sb_dirty = 1;
+ md_update_sb(mddev);
+
+ /*
+ * md_size has units of 1K blocks, which are
+ * twice as large as sectors.
+ */
+ md_hd_struct[mdidx(mddev)].start_sect = 0;
+ register_disk(&md_gendisk, MKDEV(MAJOR_NR,mdidx(mddev)),
+ 1, &md_fops, md_size[mdidx(mddev)]<<1);
+
+ read_ahead[MD_MAJOR] = 1024;
+ return (0);
+}
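+
+/*
+ * Chunk-size check above, worked through: the (1 << ffz(~chunk_size))
+ * test passes only for powers of two, e.g. 64K (0x10000) passes while
+ * 96K (0x18000) is rejected, and the value must also lie between
+ * PAGE_SIZE and MAX_CHUNK_SIZE.
+ */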
+
+#undef TOO_BIG_CHUNKSIZE
+#undef BAD_CHUNKSIZE
+
+static int restart_array(mddev_t *mddev)
+{
+ int err;
+
+ /*
+ * Complain if it has no devices
+ */
+ err = -ENXIO;
+ if (list_empty(&mddev->disks))
+ goto out;
+
+ if (mddev->pers) {
+ err = -EBUSY;
+ if (!mddev->ro)
+ goto out;
+
+ mddev->ro = 0;
+ set_device_ro(mddev_to_kdev(mddev), 0);
+
+ printk(KERN_INFO
+ "md: md%d switched to read-write mode.\n", mdidx(mddev));
+ /*
+ * Kick recovery or resync if necessary
+ */
+ md_recover_arrays();
+ if (mddev->pers->restart_resync)
+ mddev->pers->restart_resync(mddev);
+ err = 0;
+ } else {
+ printk(KERN_ERR "md: md%d has no personality assigned.\n",
+ mdidx(mddev));
+ err = -EINVAL;
+ }
+
+out:
+ return err;
+}
+
+#define STILL_MOUNTED KERN_WARNING \
+"md: md%d still mounted.\n"
+#define STILL_IN_USE \
+"md: md%d still in use.\n"
+
+static int do_md_stop(mddev_t * mddev, int ro)
+{
+ int err = 0, resync_interrupted = 0;
+ kdev_t dev = mddev_to_kdev(mddev);
+
+ if (atomic_read(&mddev->active)>1) {
+ printk(STILL_IN_USE, mdidx(mddev));
+ err = -EBUSY;
+ goto out;
+ }
+
+ if (mddev->pers) {
+ /*
+ * It is safe to call stop here, it only frees private
+ * data. Also, it tells us if a device is unstoppable
+ * (eg. resyncing is in progress)
+ */
+ if (mddev->pers->stop_resync)
+ if (mddev->pers->stop_resync(mddev))
+ resync_interrupted = 1;
+
+ if (mddev->recovery_running)
+ md_interrupt_thread(md_recovery_thread);
+
+ /*
+ * This synchronizes with signal delivery to the
+ * resync or reconstruction thread. It also nicely
+ * hangs the process if some reconstruction has not
+ * finished.
+ */
+ down(&mddev->recovery_sem);
+ up(&mddev->recovery_sem);
+
+ invalidate_device(dev, 1);
+
+ if (ro) {
+ err = -ENXIO;
+ if (mddev->ro)
+ goto out;
+ mddev->ro = 1;
+ } else {
+ if (mddev->ro)
+ set_device_ro(dev, 0);
+ if (mddev->pers->stop(mddev)) {
+ err = -EBUSY;
+ if (mddev->ro)
+ set_device_ro(dev, 1);
+ goto out;
+ }
+ if (mddev->ro)
+ mddev->ro = 0;
+ }
+ if (mddev->sb) {
+ /*
+ * mark it clean only if no resync was
+ * interrupted.
+ */
+ if (!mddev->recovery_running && !resync_interrupted) {
+ printk(KERN_INFO "md: marking sb clean...\n");
+ mddev->sb->state |= 1 << MD_SB_CLEAN;
+ }
+ mddev->sb_dirty = 1;
+ md_update_sb(mddev);
+ }
+ if (ro)
+ set_device_ro(dev, 1);
+ }
+
+ /*
+ * Free resources if final stop
+ */
+ if (!ro) {
+ printk(KERN_INFO "md: md%d stopped.\n", mdidx(mddev));
+ free_mddev(mddev);
+ } else
+ printk(KERN_INFO "md: md%d switched to read-only mode.\n", mdidx(mddev));
+ err = 0;
+out:
+ return err;
+}
+
+/*
+ * We have to safely support old arrays too.
+ */
+int detect_old_array(mdp_super_t *sb)
+{
+ if (sb->major_version > 0)
+ return 0;
+ if (sb->minor_version >= 90)
+ return 0;
+
+ return -EINVAL;
+}
+
+
+static void autorun_array(mddev_t *mddev)
+{
+ mdk_rdev_t *rdev;
+ struct md_list_head *tmp;
+ int err;
+
+ if (list_empty(&mddev->disks)) {
+ MD_BUG();
+ return;
+ }
+
+ printk(KERN_INFO "md: running: ");
+
+ ITERATE_RDEV(mddev,rdev,tmp) {
+ printk("<%s>", partition_name(rdev->dev));
+ }
+ printk("\n");
+
+ err = do_md_run (mddev);
+ if (err) {
+ printk(KERN_WARNING "md :do_md_run() returned %d\n", err);
+ /*
+ * prevent the writeback of an unrunnable array
+ */
+ mddev->sb_dirty = 0;
+ do_md_stop (mddev, 0);
+ }
+}
+
+/*
+ * lets try to run arrays based on all disks that have arrived
+ * until now. (those are in the ->pending list)
+ *
+ * the method: pick the first pending disk, collect all disks with
+ * the same UUID, remove all from the pending list and put them into
+ * the 'same_array' list. Then order this list based on superblock
+ * update time (freshest comes first), kick out 'old' disks and
+ * compare superblocks. If everything's fine then run it.
+ *
+ * If "unit" is allocated, then bump its reference count
+ */
+static void autorun_devices(kdev_t countdev)
+{
+ struct md_list_head candidates;
+ struct md_list_head *tmp;
+ mdk_rdev_t *rdev0, *rdev;
+ mddev_t *mddev;
+ kdev_t md_kdev;
+
+
+ printk(KERN_INFO "md: autorun ...\n");
+ while (!list_empty(&pending_raid_disks)) {
+ rdev0 = md_list_entry(pending_raid_disks.next,
+ mdk_rdev_t, pending);
+
+ printk(KERN_INFO "md: considering %s ...\n", partition_name(rdev0->dev));
+ MD_INIT_LIST_HEAD(&candidates);
+ ITERATE_RDEV_PENDING(rdev,tmp) {
+ if (uuid_equal(rdev0, rdev)) {
+ if (!sb_equal(rdev0->sb, rdev->sb)) {
+ printk(KERN_WARNING
+ "md: %s has same UUID as %s, but superblocks differ ...\n",
+ partition_name(rdev->dev), partition_name(rdev0->dev));
+ continue;
+ }
+ printk(KERN_INFO "md: adding %s ...\n", partition_name(rdev->dev));
+ md_list_del(&rdev->pending);
+ md_list_add(&rdev->pending, &candidates);
+ }
+ }
+ /*
+ * now we have a set of devices, with all of them having
+ * mostly sane superblocks. It's time to allocate the
+ * mddev.
+ */
+ md_kdev = MKDEV(MD_MAJOR, rdev0->sb->md_minor);
+ mddev = kdev_to_mddev(md_kdev);
+ if (mddev) {
+ printk(KERN_WARNING "md: md%d already running, cannot run %s\n",
+ mdidx(mddev), partition_name(rdev0->dev));
+ ITERATE_RDEV_GENERIC(candidates,pending,rdev,tmp)
+ export_rdev(rdev);
+ continue;
+ }
+ mddev = alloc_mddev(md_kdev);
+ if (!mddev) {
+ printk(KERN_ERR "md: cannot allocate memory for md drive.\n");
+ break;
+ }
+ if (md_kdev == countdev)
+ atomic_inc(&mddev->active);
+ printk(KERN_INFO "md: created md%d\n", mdidx(mddev));
+ ITERATE_RDEV_GENERIC(candidates,pending,rdev,tmp) {
+ bind_rdev_to_array(rdev, mddev);
+ list_del_init(&rdev->pending);
+ }
+ autorun_array(mddev);
+ }
+ printk(KERN_INFO "md: ... autorun DONE.\n");
+}
+
+/*
+ * import RAID devices based on one partition
+ * if possible, the array gets run as well.
+ */
+
+#define BAD_VERSION KERN_ERR \
+"md: %s has RAID superblock version 0.%d, autodetect needs v0.90 or higher\n"
+
+#define OUT_OF_MEM KERN_ALERT \
+"md: out of memory.\n"
+
+#define NO_DEVICE KERN_ERR \
+"md: disabled device %s\n"
+
+#define AUTOADD_FAILED KERN_ERR \
+"md: auto-adding devices to md%d FAILED (error %d).\n"
+
+#define AUTOADD_FAILED_USED KERN_ERR \
+"md: cannot auto-add device %s to md%d, already used.\n"
+
+#define AUTORUN_FAILED KERN_ERR \
+"md: auto-running md%d FAILED (error %d).\n"
+
+#define MDDEV_BUSY KERN_ERR \
+"md: cannot auto-add to md%d, already running.\n"
+
+#define AUTOADDING KERN_INFO \
+"md: auto-adding devices to md%d, based on %s's superblock.\n"
+
+#define AUTORUNNING KERN_INFO \
+"md: auto-running md%d.\n"
+
+static int autostart_array(kdev_t startdev, kdev_t countdev)
+{
+ int err = -EINVAL, i;
+ mdp_super_t *sb = NULL;
+ mdk_rdev_t *start_rdev = NULL, *rdev;
+
+ if (md_import_device(startdev, 1)) {
+ printk(KERN_WARNING "md: could not import %s!\n", partition_name(startdev));
+ goto abort;
+ }
+
+ start_rdev = find_rdev_all(startdev);
+ if (!start_rdev) {
+ MD_BUG();
+ goto abort;
+ }
+ if (start_rdev->faulty) {
+ printk(KERN_WARNING "md: can not autostart based on faulty %s!\n",
+ partition_name(startdev));
+ goto abort;
+ }
+ md_list_add(&start_rdev->pending, &pending_raid_disks);
+
+ sb = start_rdev->sb;
+
+ err = detect_old_array(sb);
+ if (err) {
+ printk(KERN_WARNING "md: array version is too old to be autostarted ,"
+ "use raidtools 0.90 mkraid --upgrade to upgrade the array "
+ "without data loss!\n");
+ goto abort;
+ }
+
+ for (i = 0; i < MD_SB_DISKS; i++) {
+ mdp_disk_t *desc;
+ kdev_t dev;
+
+ desc = sb->disks + i;
+ dev = MKDEV(desc->major, desc->minor);
+
+ if (dev == MKDEV(0,0))
+ continue;
+ if (dev == startdev)
+ continue;
+ if (md_import_device(dev, 1)) {
+ printk(KERN_WARNING "md: could not import %s, trying to run array nevertheless.\n",
+ partition_name(dev));
+ continue;
+ }
+ rdev = find_rdev_all(dev);
+ if (!rdev) {
+ MD_BUG();
+ goto abort;
+ }
+ md_list_add(&rdev->pending, &pending_raid_disks);
+ }
+
+ /*
+ * possibly return error codes from autorun_devices() here
+ */
+ autorun_devices(countdev);
+ return 0;
+
+abort:
+ if (start_rdev)
+ export_rdev(start_rdev);
+ return err;
+}
+
+#undef BAD_VERSION
+#undef OUT_OF_MEM
+#undef NO_DEVICE
+#undef AUTOADD_FAILED_USED
+#undef AUTOADD_FAILED
+#undef AUTORUN_FAILED
+#undef AUTOADDING
+#undef AUTORUNNING
+
+
+static int get_version(void * arg)
+{
+ mdu_version_t ver;
+
+ ver.major = MD_MAJOR_VERSION;
+ ver.minor = MD_MINOR_VERSION;
+ ver.patchlevel = MD_PATCHLEVEL_VERSION;
+
+ if (md_copy_to_user(arg, &ver, sizeof(ver)))
+ return -EFAULT;
+
+ return 0;
+}
+
+#define SET_FROM_SB(x) info.x = mddev->sb->x
+static int get_array_info(mddev_t * mddev, void * arg)
+{
+ mdu_array_info_t info;
+
+ if (!mddev->sb) {
+ MD_BUG();
+ return -EINVAL;
+ }
+
+ SET_FROM_SB(major_version);
+ SET_FROM_SB(minor_version);
+ SET_FROM_SB(patch_version);
+ SET_FROM_SB(ctime);
+ SET_FROM_SB(level);
+ SET_FROM_SB(size);
+ SET_FROM_SB(nr_disks);
+ SET_FROM_SB(raid_disks);
+ SET_FROM_SB(md_minor);
+ SET_FROM_SB(not_persistent);
+
+ SET_FROM_SB(utime);
+ SET_FROM_SB(state);
+ SET_FROM_SB(active_disks);
+ SET_FROM_SB(working_disks);
+ SET_FROM_SB(failed_disks);
+ SET_FROM_SB(spare_disks);
+
+ SET_FROM_SB(layout);
+ SET_FROM_SB(chunk_size);
+
+ if (md_copy_to_user(arg, &info, sizeof(info)))
+ return -EFAULT;
+
+ return 0;
+}
+#undef SET_FROM_SB
+
+#define SET_FROM_SB(x) info.x = mddev->sb->disks[nr].x
+static int get_disk_info(mddev_t * mddev, void * arg)
+{
+ mdu_disk_info_t info;
+ unsigned int nr;
+
+ if (!mddev->sb)
+ return -EINVAL;
+
+ if (md_copy_from_user(&info, arg, sizeof(info)))
+ return -EFAULT;
+
+ nr = info.number;
+ if (nr >= MD_SB_DISKS)
+ return -EINVAL;
+
+ SET_FROM_SB(major);
+ SET_FROM_SB(minor);
+ SET_FROM_SB(raid_disk);
+ SET_FROM_SB(state);
+
+ if (md_copy_to_user(arg, &info, sizeof(info)))
+ return -EFAULT;
+
+ return 0;
+}
+#undef SET_FROM_SB
+
+#define SET_SB(x) mddev->sb->disks[nr].x = info->x
+
+static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info)
+{
+ int err, size, persistent;
+ mdk_rdev_t *rdev;
+ unsigned int nr;
+ kdev_t dev;
+ dev = MKDEV(info->major,info->minor);
+
+ if (find_rdev_all(dev)) {
+ printk(KERN_WARNING "md: device %s already used in a RAID array!\n",
+ partition_name(dev));
+ return -EBUSY;
+ }
+ if (!mddev->sb) {
+ /* expecting a device which has a superblock */
+ err = md_import_device(dev, 1);
+ if (err) {
+ printk(KERN_WARNING "md: md_import_device returned %d\n", err);
+ return -EINVAL;
+ }
+ rdev = find_rdev_all(dev);
+ if (!rdev) {
+ MD_BUG();
+ return -EINVAL;
+ }
+ if (!list_empty(&mddev->disks)) {
+ mdk_rdev_t *rdev0 = md_list_entry(mddev->disks.next,
+ mdk_rdev_t, same_set);
+ if (!uuid_equal(rdev0, rdev)) {
+ printk(KERN_WARNING "md: %s has different UUID to %s\n",
+ partition_name(rdev->dev), partition_name(rdev0->dev));
+ export_rdev(rdev);
+ return -EINVAL;
+ }
+ if (!sb_equal(rdev0->sb, rdev->sb)) {
+ printk(KERN_WARNING "md: %s has same UUID but different superblock to %s\n",
+ partition_name(rdev->dev), partition_name(rdev0->dev));
+ export_rdev(rdev);
+ return -EINVAL;
+ }
+ }
+ bind_rdev_to_array(rdev, mddev);
+ return 0;
+ }
+
+ nr = info->number;
+ if (nr >= mddev->sb->nr_disks) {
+ MD_BUG();
+ return -EINVAL;
+ }
+
+
+ SET_SB(number);
+ SET_SB(major);
+ SET_SB(minor);
+ SET_SB(raid_disk);
+ SET_SB(state);
+
+ if ((info->state & (1<<MD_DISK_FAULTY))==0) {
+ err = md_import_device (dev, 0);
+ if (err) {
+ printk(KERN_WARNING "md: error, md_import_device() returned %d\n", err);
+ return -EINVAL;
+ }
+ rdev = find_rdev_all(dev);
+ if (!rdev) {
+ MD_BUG();
+ return -EINVAL;
+ }
+
+ rdev->old_dev = dev;
+ rdev->desc_nr = info->number;
+
+ bind_rdev_to_array(rdev, mddev);
+
+ persistent = !mddev->sb->not_persistent;
+ if (!persistent)
+ printk(KERN_INFO "md: nonpersistent superblock ...\n");
+
+ size = calc_dev_size(dev, mddev, persistent);
+ rdev->sb_offset = calc_dev_sboffset(dev, mddev, persistent);
+
+ if (!mddev->sb->size || (mddev->sb->size > size))
+ mddev->sb->size = size;
+ }
+
+ /*
+ * sync all other superblocks with the main superblock
+ */
+ sync_sbs(mddev);
+
+ return 0;
+}
+#undef SET_SB
+
+static int hot_generate_error(mddev_t * mddev, kdev_t dev)
+{
+ struct request_queue *q;
+ mdk_rdev_t *rdev;
+ mdp_disk_t *disk;
+
+ if (!mddev->pers)
+ return -ENODEV;
+
+ printk(KERN_INFO "md: trying to generate %s error in md%d ... \n",
+ partition_name(dev), mdidx(mddev));
+
+ rdev = find_rdev(mddev, dev);
+ if (!rdev) {
+ MD_BUG();
+ return -ENXIO;
+ }
+
+ if (rdev->desc_nr == -1) {
+ MD_BUG();
+ return -EINVAL;
+ }
+ disk = &mddev->sb->disks[rdev->desc_nr];
+ if (!disk_active(disk))
+ return -ENODEV;
+
+ q = blk_get_queue(rdev->dev);
+ if (!q) {
+ MD_BUG();
+ return -ENODEV;
+ }
+ printk(KERN_INFO "md: okay, generating error!\n");
+// q->oneshot_error = 1; // disabled for now
+
+ return 0;
+}
+
+static int hot_remove_disk(mddev_t * mddev, kdev_t dev)
+{
+ int err;
+ mdk_rdev_t *rdev;
+ mdp_disk_t *disk;
+
+ if (!mddev->pers)
+ return -ENODEV;
+
+ printk(KERN_INFO "md: trying to remove %s from md%d ... \n",
+ partition_name(dev), mdidx(mddev));
+
+ if (!mddev->pers->diskop) {
+ printk(KERN_WARNING "md%d: personality does not support diskops!\n",
+ mdidx(mddev));
+ return -EINVAL;
+ }
+
+ rdev = find_rdev(mddev, dev);
+ if (!rdev)
+ return -ENXIO;
+
+ if (rdev->desc_nr == -1) {
+ MD_BUG();
+ return -EINVAL;
+ }
+ disk = &mddev->sb->disks[rdev->desc_nr];
+ if (disk_active(disk))
+ goto busy;
+
+ if (disk_removed(disk))
+ return -EINVAL;
+
+ err = mddev->pers->diskop(mddev, &disk, DISKOP_HOT_REMOVE_DISK);
+ if (err == -EBUSY)
+ goto busy;
+
+ if (err) {
+ MD_BUG();
+ return -EINVAL;
+ }
+
+ remove_descriptor(disk, mddev->sb);
+ kick_rdev_from_array(rdev);
+ mddev->sb_dirty = 1;
+ md_update_sb(mddev);
+
+ return 0;
+busy:
+ printk(KERN_WARNING "md: cannot remove active disk %s from md%d ... \n",
+ partition_name(dev), mdidx(mddev));
+ return -EBUSY;
+}
+
+static int hot_add_disk(mddev_t * mddev, kdev_t dev)
+{
+ int i, err, persistent;
+ unsigned int size;
+ mdk_rdev_t *rdev;
+ mdp_disk_t *disk;
+
+ if (!mddev->pers)
+ return -ENODEV;
+
+ printk(KERN_INFO "md: trying to hot-add %s to md%d ... \n",
+ partition_name(dev), mdidx(mddev));
+
+ if (!mddev->pers->diskop) {
+ printk(KERN_WARNING "md%d: personality does not support diskops!\n",
+ mdidx(mddev));
+ return -EINVAL;
+ }
+
+ persistent = !mddev->sb->not_persistent;
+
+ rdev = find_rdev(mddev, dev);
+ if (rdev)
+ return -EBUSY;
+
+ err = md_import_device (dev, 0);
+ if (err) {
+ printk(KERN_WARNING "md: error, md_import_device() returned %d\n", err);
+ return -EINVAL;
+ }
+ rdev = find_rdev_all(dev);
+ if (!rdev) {
+ MD_BUG();
+ return -EINVAL;
+ }
+ if (rdev->faulty) {
+ printk(KERN_WARNING "md: can not hot-add faulty %s disk to md%d!\n",
+ partition_name(dev), mdidx(mddev));
+ err = -EINVAL;
+ goto abort_export;
+ }
+ size = calc_dev_size(dev, mddev, persistent);
+
+ if (size < mddev->sb->size) {
+ printk(KERN_WARNING "md%d: disk size %d blocks < array size %d\n",
+ mdidx(mddev), size, mddev->sb->size);
+ err = -ENOSPC;
+ goto abort_export;
+ }
+ bind_rdev_to_array(rdev, mddev);
+
+ /*
+	 * The rest had better be atomic; disk failures can be
+	 * noticed in interrupt contexts ...
+ */
+ rdev->old_dev = dev;
+ rdev->size = size;
+ rdev->sb_offset = calc_dev_sboffset(dev, mddev, persistent);
+
+ disk = mddev->sb->disks + mddev->sb->raid_disks;
+ for (i = mddev->sb->raid_disks; i < MD_SB_DISKS; i++) {
+ disk = mddev->sb->disks + i;
+
+ if (!disk->major && !disk->minor)
+ break;
+ if (disk_removed(disk))
+ break;
+ }
+ if (i == MD_SB_DISKS) {
+ printk(KERN_WARNING "md%d: can not hot-add to full array!\n",
+ mdidx(mddev));
+ err = -EBUSY;
+ goto abort_unbind_export;
+ }
+
+ if (disk_removed(disk)) {
+ /*
+ * reuse slot
+ */
+ if (disk->number != i) {
+ MD_BUG();
+ err = -EINVAL;
+ goto abort_unbind_export;
+ }
+ } else {
+ disk->number = i;
+ }
+
+ disk->raid_disk = disk->number;
+ disk->major = MAJOR(dev);
+ disk->minor = MINOR(dev);
+
+ if (mddev->pers->diskop(mddev, &disk, DISKOP_HOT_ADD_DISK)) {
+ MD_BUG();
+ err = -EINVAL;
+ goto abort_unbind_export;
+ }
+
+ mark_disk_spare(disk);
+ mddev->sb->nr_disks++;
+ mddev->sb->spare_disks++;
+ mddev->sb->working_disks++;
+
+ mddev->sb_dirty = 1;
+ md_update_sb(mddev);
+
+ /*
+ * Kick recovery, maybe this spare has to be added to the
+ * array immediately.
+ */
+ md_recover_arrays();
+
+ return 0;
+
+abort_unbind_export:
+ unbind_rdev_from_array(rdev);
+
+abort_export:
+ export_rdev(rdev);
+ return err;
+}
+
+#define SET_SB(x) mddev->sb->x = info->x
+static int set_array_info(mddev_t * mddev, mdu_array_info_t *info)
+{
+
+ if (alloc_array_sb(mddev))
+ return -ENOMEM;
+
+ mddev->sb->major_version = MD_MAJOR_VERSION;
+ mddev->sb->minor_version = MD_MINOR_VERSION;
+ mddev->sb->patch_version = MD_PATCHLEVEL_VERSION;
+ mddev->sb->ctime = CURRENT_TIME;
+
+ SET_SB(level);
+ SET_SB(size);
+ SET_SB(nr_disks);
+ SET_SB(raid_disks);
+ SET_SB(md_minor);
+ SET_SB(not_persistent);
+
+ SET_SB(state);
+ SET_SB(active_disks);
+ SET_SB(working_disks);
+ SET_SB(failed_disks);
+ SET_SB(spare_disks);
+
+ SET_SB(layout);
+ SET_SB(chunk_size);
+
+ mddev->sb->md_magic = MD_SB_MAGIC;
+
+ /*
+ * Generate a 128 bit UUID
+ */
+ get_random_bytes(&mddev->sb->set_uuid0, 4);
+ get_random_bytes(&mddev->sb->set_uuid1, 4);
+ get_random_bytes(&mddev->sb->set_uuid2, 4);
+ get_random_bytes(&mddev->sb->set_uuid3, 4);
+
+ return 0;
+}
+#undef SET_SB
+
+static int set_disk_faulty(mddev_t *mddev, kdev_t dev)
+{
+ int ret;
+
+ ret = md_error(mddev, dev);
+ return ret;
+}
+
+static int md_ioctl(struct inode *inode, struct file *file,
+ unsigned int cmd, unsigned long arg)
+{
+ unsigned int minor;
+ int err = 0;
+ struct hd_geometry *loc = (struct hd_geometry *) arg;
+ mddev_t *mddev = NULL;
+ kdev_t dev;
+
+ if (!md_capable_admin())
+ return -EACCES;
+
+ dev = inode->i_rdev;
+ minor = MINOR(dev);
+ if (minor >= MAX_MD_DEVS) {
+ MD_BUG();
+ return -EINVAL;
+ }
+
+ /*
+ * Commands dealing with the RAID driver but not any
+ * particular array:
+ */
+ switch (cmd)
+ {
+ case RAID_VERSION:
+ err = get_version((void *)arg);
+ goto done;
+
+ case PRINT_RAID_DEBUG:
+ err = 0;
+ md_print_devices();
+ goto done_unlock;
+
+#ifndef MODULE
+ case RAID_AUTORUN:
+ err = 0;
+ autostart_arrays();
+ goto done;
+#endif
+
+ case BLKGETSIZE:
+ case BLKGETSIZE64:
+ case BLKRAGET:
+ case BLKRASET:
+ case BLKFLSBUF:
+ case BLKBSZGET:
+ case BLKBSZSET:
+ err = blk_ioctl (dev, cmd, arg);
+ goto abort;
+
+ default:;
+ }
+
+ /*
+ * Commands creating/starting a new array:
+ */
+
+ mddev = kdev_to_mddev(dev);
+
+ switch (cmd)
+ {
+ case SET_ARRAY_INFO:
+ case START_ARRAY:
+ if (mddev) {
+ printk(KERN_WARNING "md: array md%d already exists!\n",
+ mdidx(mddev));
+ err = -EEXIST;
+ goto abort;
+ }
+ default:;
+ }
+ switch (cmd)
+ {
+ case SET_ARRAY_INFO:
+ mddev = alloc_mddev(dev);
+ if (!mddev) {
+ err = -ENOMEM;
+ goto abort;
+ }
+ atomic_inc(&mddev->active);
+
+ /*
+ * alloc_mddev() should possibly self-lock.
+ */
+ err = lock_mddev(mddev);
+ if (err) {
+ printk(KERN_WARNING "md: ioctl, reason %d, cmd %d\n",
+ err, cmd);
+ goto abort;
+ }
+
+ if (mddev->sb) {
+ printk(KERN_WARNING "md: array md%d already has a superblock!\n",
+ mdidx(mddev));
+ err = -EBUSY;
+ goto abort_unlock;
+ }
+ if (arg) {
+ mdu_array_info_t info;
+ if (md_copy_from_user(&info, (void*)arg, sizeof(info))) {
+ err = -EFAULT;
+ goto abort_unlock;
+ }
+ err = set_array_info(mddev, &info);
+ if (err) {
+				printk(KERN_WARNING "md: couldn't set array info. %d\n", err);
+ goto abort_unlock;
+ }
+ }
+ goto done_unlock;
+
+ case START_ARRAY:
+ /*
+ * possibly make it lock the array ...
+ */
+ err = autostart_array((kdev_t)arg, dev);
+ if (err) {
+ printk(KERN_WARNING "md: autostart %s failed!\n",
+ partition_name((kdev_t)arg));
+ goto abort;
+ }
+ goto done;
+
+ default:;
+ }
+
+ /*
+ * Commands querying/configuring an existing array:
+ */
+
+ if (!mddev) {
+ err = -ENODEV;
+ goto abort;
+ }
+ err = lock_mddev(mddev);
+ if (err) {
+ printk(KERN_INFO "md: ioctl lock interrupted, reason %d, cmd %d\n",err, cmd);
+ goto abort;
+ }
+ /* if we don't have a superblock yet, only ADD_NEW_DISK or STOP_ARRAY is allowed */
+ if (!mddev->sb && cmd != ADD_NEW_DISK && cmd != STOP_ARRAY && cmd != RUN_ARRAY) {
+ err = -ENODEV;
+ goto abort_unlock;
+ }
+
+ /*
+ * Commands even a read-only array can execute:
+ */
+ switch (cmd)
+ {
+ case GET_ARRAY_INFO:
+ err = get_array_info(mddev, (void *)arg);
+ goto done_unlock;
+
+ case GET_DISK_INFO:
+ err = get_disk_info(mddev, (void *)arg);
+ goto done_unlock;
+
+ case RESTART_ARRAY_RW:
+ err = restart_array(mddev);
+ goto done_unlock;
+
+ case STOP_ARRAY:
+ if (!(err = do_md_stop (mddev, 0)))
+ mddev = NULL;
+ goto done_unlock;
+
+ case STOP_ARRAY_RO:
+ err = do_md_stop (mddev, 1);
+ goto done_unlock;
+
+ /*
+ * We have a problem here : there is no easy way to give a CHS
+	 * virtual geometry. We currently pretend that we have 2 heads and
+	 * 4 sectors (with a BIG number of cylinders...). This drives
+ * dosfs just mad... ;-)
+ */
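+	/* (2 heads * 4 sectors = 8 sectors per cylinder, hence nr_sects/8 below) */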
+ case HDIO_GETGEO:
+ if (!loc) {
+ err = -EINVAL;
+ goto abort_unlock;
+ }
+ err = md_put_user (2, (char *) &loc->heads);
+ if (err)
+ goto abort_unlock;
+ err = md_put_user (4, (char *) &loc->sectors);
+ if (err)
+ goto abort_unlock;
+ err = md_put_user (md_hd_struct[mdidx(mddev)].nr_sects/8,
+ (short *) &loc->cylinders);
+ if (err)
+ goto abort_unlock;
+ err = md_put_user (md_hd_struct[minor].start_sect,
+ (long *) &loc->start);
+ goto done_unlock;
+ }
+
+ /*
+ * The remaining ioctls are changing the state of the
+ * superblock, so we do not allow read-only arrays
+ * here:
+ */
+ if (mddev->ro) {
+ err = -EROFS;
+ goto abort_unlock;
+ }
+
+ switch (cmd)
+ {
+ case ADD_NEW_DISK:
+ {
+ mdu_disk_info_t info;
+ if (md_copy_from_user(&info, (void*)arg, sizeof(info)))
+ err = -EFAULT;
+ else
+ err = add_new_disk(mddev, &info);
+ goto done_unlock;
+ }
+ case HOT_GENERATE_ERROR:
+ err = hot_generate_error(mddev, (kdev_t)arg);
+ goto done_unlock;
+ case HOT_REMOVE_DISK:
+ err = hot_remove_disk(mddev, (kdev_t)arg);
+ goto done_unlock;
+
+ case HOT_ADD_DISK:
+ err = hot_add_disk(mddev, (kdev_t)arg);
+ goto done_unlock;
+
+ case SET_DISK_FAULTY:
+ err = set_disk_faulty(mddev, (kdev_t)arg);
+ goto done_unlock;
+
+ case RUN_ARRAY:
+ {
+ err = do_md_run (mddev);
+ /*
+ * we have to clean up the mess if
+ * the array cannot be run for some
+ * reason ...
+ */
+ if (err) {
+ mddev->sb_dirty = 0;
+ if (!do_md_stop (mddev, 0))
+ mddev = NULL;
+ }
+ goto done_unlock;
+ }
+
+ default:
+ printk(KERN_WARNING "md: %s(pid %d) used obsolete MD ioctl, "
+		       "upgrade your software to use new ioctls.\n",
+ current->comm, current->pid);
+ err = -EINVAL;
+ goto abort_unlock;
+ }
+
+done_unlock:
+abort_unlock:
+ if (mddev)
+ unlock_mddev(mddev);
+
+ return err;
+done:
+ if (err)
+ MD_BUG();
+abort:
+ return err;
+}
+
+static int md_open(struct inode *inode, struct file *file)
+{
+ /*
+ * Always succeed, but increment the usage count
+ */
+ mddev_t *mddev = kdev_to_mddev(inode->i_rdev);
+ if (mddev)
+ atomic_inc(&mddev->active);
+ return (0);
+}
+
+static int md_release(struct inode *inode, struct file * file)
+{
+ mddev_t *mddev = kdev_to_mddev(inode->i_rdev);
+ if (mddev)
+ atomic_dec(&mddev->active);
+ return 0;
+}
+
+static struct block_device_operations md_fops=
+{
+ owner: THIS_MODULE,
+ open: md_open,
+ release: md_release,
+ ioctl: md_ioctl,
+};
+
+
+int md_thread(void * arg)
+{
+ mdk_thread_t *thread = arg;
+
+ md_lock_kernel();
+
+ /*
+ * Detach thread
+ */
+
+ daemonize();
+
+ sprintf(current->comm, thread->name);
+ md_init_signals();
+ md_flush_signals();
+ thread->tsk = current;
+
+ /*
+	 * md_thread is a 'system thread'; its priority should be very
+ * high. We avoid resource deadlocks individually in each
+ * raid personality. (RAID5 does preallocation) We also use RR and
+ * the very same RT priority as kswapd, thus we will never get
+ * into a priority inversion deadlock.
+ *
+ * we definitely have to have equal or higher priority than
+ * bdflush, otherwise bdflush will deadlock if there are too
+ * many dirty RAID5 blocks.
+ */
+ current->policy = SCHED_OTHER;
+ current->nice = -20;
+ md_unlock_kernel();
+
+ complete(thread->event);
+ while (thread->run) {
+ void (*run)(void *data);
+
+ wait_event_interruptible(thread->wqueue,
+ test_bit(THREAD_WAKEUP, &thread->flags));
+
+ clear_bit(THREAD_WAKEUP, &thread->flags);
+
+ run = thread->run;
+ if (run) {
+ run(thread->data);
+ run_task_queue(&tq_disk);
+ }
+ if (md_signal_pending(current))
+ md_flush_signals();
+ }
+ complete(thread->event);
+ return 0;
+}
+
+void md_wakeup_thread(mdk_thread_t *thread)
+{
+ dprintk("md: waking up MD thread %p.\n", thread);
+ set_bit(THREAD_WAKEUP, &thread->flags);
+ wake_up(&thread->wqueue);
+}
+
+mdk_thread_t *md_register_thread(void (*run) (void *),
+ void *data, const char *name)
+{
+ mdk_thread_t *thread;
+ int ret;
+ struct completion event;
+
+ thread = (mdk_thread_t *) kmalloc
+ (sizeof(mdk_thread_t), GFP_KERNEL);
+ if (!thread)
+ return NULL;
+
+ memset(thread, 0, sizeof(mdk_thread_t));
+ md_init_waitqueue_head(&thread->wqueue);
+
+ init_completion(&event);
+ thread->event = &event;
+ thread->run = run;
+ thread->data = data;
+ thread->name = name;
+ ret = kernel_thread(md_thread, thread, 0);
+ if (ret < 0) {
+ kfree(thread);
+ return NULL;
+ }
+ wait_for_completion(&event);
+ return thread;
+}
+
+void md_interrupt_thread(mdk_thread_t *thread)
+{
+ if (!thread->tsk) {
+ MD_BUG();
+ return;
+ }
+ dprintk("interrupting MD-thread pid %d\n", thread->tsk->pid);
+ send_sig(SIGKILL, thread->tsk, 1);
+}
+
+void md_unregister_thread(mdk_thread_t *thread)
+{
+ struct completion event;
+
+ init_completion(&event);
+
+ thread->event = &event;
+ thread->run = NULL;
+ thread->name = NULL;
+ md_interrupt_thread(thread);
+ wait_for_completion(&event);
+ kfree(thread);
+}
+
+void md_recover_arrays(void)
+{
+ if (!md_recovery_thread) {
+ MD_BUG();
+ return;
+ }
+ md_wakeup_thread(md_recovery_thread);
+}
+
+
+int md_error(mddev_t *mddev, kdev_t rdev)
+{
+ mdk_rdev_t * rrdev;
+
+ dprintk("md_error dev:(%d:%d), rdev:(%d:%d), (caller: %p,%p,%p,%p).\n",
+ MD_MAJOR,mdidx(mddev),MAJOR(rdev),MINOR(rdev),
+ __builtin_return_address(0),__builtin_return_address(1),
+ __builtin_return_address(2),__builtin_return_address(3));
+
+ if (!mddev) {
+ MD_BUG();
+ return 0;
+ }
+ rrdev = find_rdev(mddev, rdev);
+ if (!rrdev || rrdev->faulty)
+ return 0;
+ if (!mddev->pers->error_handler
+ || mddev->pers->error_handler(mddev,rdev) <= 0) {
+ rrdev->faulty = 1;
+ } else
+ return 1;
+ /*
+ * if recovery was running, stop it now.
+ */
+ if (mddev->pers->stop_resync)
+ mddev->pers->stop_resync(mddev);
+ if (mddev->recovery_running)
+ md_interrupt_thread(md_recovery_thread);
+ md_recover_arrays();
+
+ return 0;
+}
+
+static void status_unused(struct seq_file *seq)
+{
+ int i = 0;
+ mdk_rdev_t *rdev;
+ struct md_list_head *tmp;
+
+ seq_printf(seq, "unused devices: ");
+
+ ITERATE_RDEV_ALL(rdev,tmp) {
+ if (list_empty(&rdev->same_set)) {
+ /*
+ * The device is not yet used by any array.
+ */
+ i++;
+ seq_printf(seq, "%s ",
+ partition_name(rdev->dev));
+ }
+ }
+ if (!i)
+ seq_printf(seq, "<none>");
+
+ seq_printf(seq, "\n");
+}
+
+
+static void status_resync(struct seq_file *seq, mddev_t * mddev)
+{
+ unsigned long max_blocks, resync, res, dt, db, rt;
+
+ resync = (mddev->curr_resync - atomic_read(&mddev->recovery_active))/2;
+ max_blocks = mddev->sb->size;
+
+ /*
+ * Should not happen.
+ */
+ if (!max_blocks)
+ MD_BUG();
+
+ res = (resync/1024)*1000/(max_blocks/1024 + 1);
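+	/* res is the done fraction in tenths of a percent (0..1000); the block
+	 * below draws a 20 character progress bar, one '=' per completed 5%. */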
+ {
+ int i, x = res/50, y = 20-x;
+ seq_printf(seq, "[");
+ for (i = 0; i < x; i++)
+ seq_printf(seq, "=");
+ seq_printf(seq, ">");
+ for (i = 0; i < y; i++)
+ seq_printf(seq, ".");
+ seq_printf(seq, "] ");
+ }
+ if (!mddev->recovery_running)
+ /*
+ * true resync
+ */
+ seq_printf(seq, " resync =%3lu.%lu%% (%lu/%lu)",
+ res/10, res % 10, resync, max_blocks);
+ else
+ /*
+ * recovery ...
+ */
+ seq_printf(seq, " recovery =%3lu.%lu%% (%lu/%lu)",
+ res/10, res % 10, resync, max_blocks);
+
+ /*
+ * We do not want to overflow, so the order of operands and
+ * the * 100 / 100 trick are important. We do a +1 to be
+ * safe against division by zero. We only estimate anyway.
+ *
+ * dt: time from mark until now
+ * db: blocks written from mark until now
+ * rt: remaining time
+ */
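+	/* In effect rt ~= (max_blocks - resync) / (db/dt): remaining blocks
+	 * divided by the recent rate, split up so that no intermediate
+	 * product overflows an unsigned long. */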
+ dt = ((jiffies - mddev->resync_mark) / HZ);
+ if (!dt) dt++;
+ db = resync - (mddev->resync_mark_cnt/2);
+ rt = (dt * ((max_blocks-resync) / (db/100+1)))/100;
+
+ seq_printf(seq, " finish=%lu.%lumin", rt / 60, (rt % 60)/6);
+
+ seq_printf(seq, " speed=%ldK/sec", db/dt);
+
+}
+
+
+static void *md_seq_start(struct seq_file *seq, loff_t *pos)
+{
+ struct list_head *tmp;
+ loff_t l = *pos;
+ mddev_t *mddev;
+
+ if (l > 0x10000)
+ return NULL;
+ if (!l--)
+ /* header */
+ return (void*)1;
+
+ list_for_each(tmp,&all_mddevs)
+ if (!l--) {
+ mddev = list_entry(tmp, mddev_t, all_mddevs);
+ return mddev;
+ }
+ return (void*)2;/* tail */
+}
+
+static void *md_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ struct list_head *tmp;
+ mddev_t *next_mddev, *mddev = v;
+
+ ++*pos;
+ if (v == (void*)2)
+ return NULL;
+
+ if (v == (void*)1)
+ tmp = all_mddevs.next;
+ else
+ tmp = mddev->all_mddevs.next;
+ if (tmp != &all_mddevs)
+ next_mddev = list_entry(tmp,mddev_t,all_mddevs);
+ else {
+ next_mddev = (void*)2;
+ *pos = 0x10000;
+ }
+
+ return next_mddev;
+
+}
+
+static void md_seq_stop(struct seq_file *seq, void *v)
+{
+
+}
+
+static int md_seq_show(struct seq_file *seq, void *v)
+{
+ int j, size;
+ struct md_list_head *tmp2;
+ mdk_rdev_t *rdev;
+ mddev_t *mddev = v;
+
+ if (v == (void*)1) {
+ seq_printf(seq, "Personalities : ");
+ for (j = 0; j < MAX_PERSONALITY; j++)
+ if (pers[j])
+ seq_printf(seq, "[%s] ", pers[j]->name);
+
+ seq_printf(seq, "\n");
+ seq_printf(seq, "read_ahead ");
+ if (read_ahead[MD_MAJOR] == INT_MAX)
+ seq_printf(seq, "not set\n");
+ else
+ seq_printf(seq, "%d sectors\n", read_ahead[MD_MAJOR]);
+ return 0;
+ }
+ if (v == (void*)2) {
+ status_unused(seq);
+ return 0;
+ }
+
+ seq_printf(seq, "md%d : %sactive", mdidx(mddev),
+ mddev->pers ? "" : "in");
+ if (mddev->pers) {
+ if (mddev->ro)
+ seq_printf(seq, " (read-only)");
+ seq_printf(seq, " %s", mddev->pers->name);
+ }
+
+ size = 0;
+ ITERATE_RDEV(mddev,rdev,tmp2) {
+ seq_printf(seq, " %s[%d]",
+ partition_name(rdev->dev), rdev->desc_nr);
+ if (rdev->faulty) {
+ seq_printf(seq, "(F)");
+ continue;
+ }
+ size += rdev->size;
+ }
+
+ if (!list_empty(&mddev->disks)) {
+ if (mddev->pers)
+ seq_printf(seq, "\n %d blocks",
+ md_size[mdidx(mddev)]);
+ else
+ seq_printf(seq, "\n %d blocks", size);
+ }
+
+ if (mddev->pers) {
+
+ mddev->pers->status (seq, mddev);
+
+ seq_printf(seq, "\n ");
+ if (mddev->curr_resync) {
+ status_resync (seq, mddev);
+ } else {
+ if (sem_getcount(&mddev->resync_sem) != 1)
+ seq_printf(seq, " resync=DELAYED");
+ }
+ }
+ seq_printf(seq, "\n");
+
+ return 0;
+}
+
+
+static struct seq_operations md_seq_ops = {
+ .start = md_seq_start,
+ .next = md_seq_next,
+ .stop = md_seq_stop,
+ .show = md_seq_show,
+};
+
+static int md_seq_open(struct inode *inode, struct file *file)
+{
+ int error;
+
+ error = seq_open(file, &md_seq_ops);
+ return error;
+}
+
+static struct file_operations md_seq_fops = {
+ .open = md_seq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+
+int register_md_personality(int pnum, mdk_personality_t *p)
+{
+ if (pnum >= MAX_PERSONALITY) {
+ MD_BUG();
+ return -EINVAL;
+ }
+
+ if (pers[pnum]) {
+ MD_BUG();
+ return -EBUSY;
+ }
+
+ pers[pnum] = p;
+ printk(KERN_INFO "md: %s personality registered as nr %d\n", p->name, pnum);
+ return 0;
+}
+
+int unregister_md_personality(int pnum)
+{
+ if (pnum >= MAX_PERSONALITY) {
+ MD_BUG();
+ return -EINVAL;
+ }
+
+ printk(KERN_INFO "md: %s personality unregistered\n", pers[pnum]->name);
+ pers[pnum] = NULL;
+ return 0;
+}
+
+mdp_disk_t *get_spare(mddev_t *mddev)
+{
+ mdp_super_t *sb = mddev->sb;
+ mdp_disk_t *disk;
+ mdk_rdev_t *rdev;
+ struct md_list_head *tmp;
+
+ ITERATE_RDEV(mddev,rdev,tmp) {
+ if (rdev->faulty)
+ continue;
+ if (!rdev->sb) {
+ MD_BUG();
+ continue;
+ }
+ disk = &sb->disks[rdev->desc_nr];
+ if (disk_faulty(disk)) {
+ MD_BUG();
+ continue;
+ }
+ if (disk_active(disk))
+ continue;
+ return disk;
+ }
+ return NULL;
+}
+
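+/*
+ * Sectors submitted by resync/recovery, per device; is_mddev_idle()
+ * subtracts these from the block statistics so that resync IO itself
+ * does not make the array look busy.
+ */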
+static unsigned int sync_io[DK_MAX_MAJOR][DK_MAX_DISK];
+void md_sync_acct(kdev_t dev, unsigned long nr_sectors)
+{
+ unsigned int major = MAJOR(dev);
+ unsigned int index;
+
+ index = disk_index(dev);
+ if ((index >= DK_MAX_DISK) || (major >= DK_MAX_MAJOR))
+ return;
+
+ sync_io[major][index] += nr_sectors;
+}
+
+static int is_mddev_idle(mddev_t *mddev)
+{
+ mdk_rdev_t * rdev;
+ struct md_list_head *tmp;
+ int idle;
+ unsigned long curr_events;
+
+ idle = 1;
+ ITERATE_RDEV(mddev,rdev,tmp) {
+ int major = MAJOR(rdev->dev);
+ int idx = disk_index(rdev->dev);
+
+ if ((idx >= DK_MAX_DISK) || (major >= DK_MAX_MAJOR))
+ continue;
+
+ curr_events = kstat.dk_drive_rblk[major][idx] +
+ kstat.dk_drive_wblk[major][idx] ;
+ curr_events -= sync_io[major][idx];
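+		/* more than 32 sectors of non-resync IO since the last check
+		 * means somebody else is using this device: not idle */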
+ if ((curr_events - rdev->last_events) > 32) {
+ rdev->last_events = curr_events;
+ idle = 0;
+ }
+ }
+ return idle;
+}
+
+MD_DECLARE_WAIT_QUEUE_HEAD(resync_wait);
+
+void md_done_sync(mddev_t *mddev, int blocks, int ok)
+{
+ /* another "blocks" (512byte) blocks have been synced */
+ atomic_sub(blocks, &mddev->recovery_active);
+ wake_up(&mddev->recovery_wait);
+ if (!ok) {
+ // stop recovery, signal do_sync ....
+ if (mddev->pers->stop_resync)
+ mddev->pers->stop_resync(mddev);
+ if (mddev->recovery_running)
+ md_interrupt_thread(md_recovery_thread);
+ }
+}
+
+#define SYNC_MARKS 10
+#define SYNC_MARK_STEP (3*HZ)
+int md_do_sync(mddev_t *mddev, mdp_disk_t *spare)
+{
+ mddev_t *mddev2;
+ unsigned int max_sectors, currspeed,
+ j, window, err, serialize;
+ unsigned long mark[SYNC_MARKS];
+ unsigned long mark_cnt[SYNC_MARKS];
+ int last_mark,m;
+ struct md_list_head *tmp;
+ unsigned long last_check;
+
+
+ err = down_interruptible(&mddev->resync_sem);
+ if (err)
+ goto out_nolock;
+
+recheck:
+ serialize = 0;
+ ITERATE_MDDEV(mddev2,tmp) {
+ if (mddev2 == mddev)
+ continue;
+ if (mddev2->curr_resync && match_mddev_units(mddev,mddev2)) {
+ printk(KERN_INFO "md: delaying resync of md%d until md%d "
+ "has finished resync (they share one or more physical units)\n",
+ mdidx(mddev), mdidx(mddev2));
+ serialize = 1;
+ break;
+ }
+ }
+ if (serialize) {
+ interruptible_sleep_on(&resync_wait);
+ if (md_signal_pending(current)) {
+ md_flush_signals();
+ err = -EINTR;
+ goto out;
+ }
+ goto recheck;
+ }
+
+ mddev->curr_resync = 1;
+
+ max_sectors = mddev->sb->size<<1;
+
+ printk(KERN_INFO "md: syncing RAID array md%d\n", mdidx(mddev));
+ printk(KERN_INFO "md: minimum _guaranteed_ reconstruction speed: %d KB/sec/disc.\n",
+ sysctl_speed_limit_min);
+	printk(KERN_INFO "md: using maximum available idle IO bandwidth "
+ "(but not more than %d KB/sec) for reconstruction.\n",
+ sysctl_speed_limit_max);
+
+ /*
+ * Resync has low priority.
+ */
+ current->nice = 19;
+
+ is_mddev_idle(mddev); /* this also initializes IO event counters */
+ for (m = 0; m < SYNC_MARKS; m++) {
+ mark[m] = jiffies;
+ mark_cnt[m] = 0;
+ }
+ last_mark = 0;
+ mddev->resync_mark = mark[last_mark];
+ mddev->resync_mark_cnt = mark_cnt[last_mark];
+
+ /*
+ * Tune reconstruction:
+ */
+ window = vm_max_readahead*(PAGE_SIZE/512);
+ printk(KERN_INFO "md: using %dk window, over a total of %d blocks.\n",
+ window/2,max_sectors/2);
+
+ atomic_set(&mddev->recovery_active, 0);
+ init_waitqueue_head(&mddev->recovery_wait);
+ last_check = 0;
+ for (j = 0; j < max_sectors;) {
+ int sectors;
+
+ sectors = mddev->pers->sync_request(mddev, j);
+
+ if (sectors < 0) {
+ err = sectors;
+ goto out;
+ }
+ atomic_add(sectors, &mddev->recovery_active);
+ j += sectors;
+ mddev->curr_resync = j;
+
+ if (last_check + window > j)
+ continue;
+
+ last_check = j;
+
+ run_task_queue(&tq_disk);
+
+ repeat:
+ if (jiffies >= mark[last_mark] + SYNC_MARK_STEP ) {
+ /* step marks */
+ int next = (last_mark+1) % SYNC_MARKS;
+
+ mddev->resync_mark = mark[next];
+ mddev->resync_mark_cnt = mark_cnt[next];
+ mark[next] = jiffies;
+ mark_cnt[next] = j - atomic_read(&mddev->recovery_active);
+ last_mark = next;
+ }
+
+
+ if (md_signal_pending(current)) {
+ /*
+ * got a signal, exit.
+ */
+ mddev->curr_resync = 0;
+ printk(KERN_INFO "md: md_do_sync() got signal ... exiting\n");
+ md_flush_signals();
+ err = -EINTR;
+ goto out;
+ }
+
+ /*
+	 * this loop exits only when either we are slower than
+ * the 'hard' speed limit, or the system was IO-idle for
+ * a jiffy.
+ * the system might be non-idle CPU-wise, but we only care
+ * about not overloading the IO subsystem. (things like an
+ * e2fsck being done on the RAID array should execute fast)
+ */
+ if (md_need_resched(current))
+ schedule();
+
+ currspeed = (j-mddev->resync_mark_cnt)/2/((jiffies-mddev->resync_mark)/HZ +1) +1;
+
+ if (currspeed > sysctl_speed_limit_min) {
+ current->nice = 19;
+
+ if ((currspeed > sysctl_speed_limit_max) ||
+ !is_mddev_idle(mddev)) {
+ current->state = TASK_INTERRUPTIBLE;
+ md_schedule_timeout(HZ/4);
+ goto repeat;
+ }
+ } else
+ current->nice = -20;
+ }
+ printk(KERN_INFO "md: md%d: sync done.\n",mdidx(mddev));
+ err = 0;
+ /*
+ * this also signals 'finished resyncing' to md_stop
+ */
+out:
+ wait_disk_event(mddev->recovery_wait, atomic_read(&mddev->recovery_active)==0);
+ up(&mddev->resync_sem);
+out_nolock:
+ mddev->curr_resync = 0;
+ wake_up(&resync_wait);
+ return err;
+}
+
+
+/*
+ * This is a kernel thread which syncs a spare disk with the active array
+ *
+ * the amount of foolproofing might seem to be a tad excessive, but an
+ * early (not so error-safe) version of raid1syncd synced the first 0.5 gigs
+ * of my root partition with the first 0.5 gigs of my /home partition ... so
+ * i'm a bit nervous ;)
+ */
+void md_do_recovery(void *data)
+{
+ int err;
+ mddev_t *mddev;
+ mdp_super_t *sb;
+ mdp_disk_t *spare;
+ struct md_list_head *tmp;
+
+ printk(KERN_INFO "md: recovery thread got woken up ...\n");
+restart:
+ ITERATE_MDDEV(mddev,tmp) {
+ sb = mddev->sb;
+ if (!sb)
+ continue;
+ if (mddev->recovery_running)
+ continue;
+ if (sb->active_disks == sb->raid_disks)
+ continue;
+ if (mddev->sb_dirty)
+ md_update_sb(mddev);
+ if (!sb->spare_disks) {
+ printk(KERN_ERR "md%d: no spare disk to reconstruct array! "
+ "-- continuing in degraded mode\n", mdidx(mddev));
+ continue;
+ }
+ /*
+ * now here we get the spare and resync it.
+ */
+ spare = get_spare(mddev);
+ if (!spare)
+ continue;
+ printk(KERN_INFO "md%d: resyncing spare disk %s to replace failed disk\n",
+ mdidx(mddev), partition_name(MKDEV(spare->major,spare->minor)));
+ if (!mddev->pers->diskop)
+ continue;
+ if (mddev->pers->diskop(mddev, &spare, DISKOP_SPARE_WRITE))
+ continue;
+ down(&mddev->recovery_sem);
+ mddev->recovery_running = 1;
+ err = md_do_sync(mddev, spare);
+ if (err == -EIO) {
+ printk(KERN_INFO "md%d: spare disk %s failed, skipping to next spare.\n",
+ mdidx(mddev), partition_name(MKDEV(spare->major,spare->minor)));
+ if (!disk_faulty(spare)) {
+ mddev->pers->diskop(mddev,&spare,DISKOP_SPARE_INACTIVE);
+ mark_disk_faulty(spare);
+ mark_disk_nonsync(spare);
+ mark_disk_inactive(spare);
+ sb->spare_disks--;
+ sb->working_disks--;
+ sb->failed_disks++;
+ }
+ } else
+ if (disk_faulty(spare))
+ mddev->pers->diskop(mddev, &spare,
+ DISKOP_SPARE_INACTIVE);
+ if (err == -EINTR || err == -ENOMEM) {
+ /*
+ * Recovery got interrupted, or ran out of mem ...
+ * signal back that we have finished using the array.
+ */
+ mddev->pers->diskop(mddev, &spare,
+ DISKOP_SPARE_INACTIVE);
+ up(&mddev->recovery_sem);
+ mddev->recovery_running = 0;
+ continue;
+ } else {
+ mddev->recovery_running = 0;
+ up(&mddev->recovery_sem);
+ }
+ if (!disk_faulty(spare)) {
+ /*
+ * the SPARE_ACTIVE diskop possibly changes the
+ * pointer too
+ */
+ mddev->pers->diskop(mddev, &spare, DISKOP_SPARE_ACTIVE);
+ mark_disk_sync(spare);
+ mark_disk_active(spare);
+ sb->active_disks++;
+ sb->spare_disks--;
+ }
+ mddev->sb_dirty = 1;
+ md_update_sb(mddev);
+ goto restart;
+ }
+ printk(KERN_INFO "md: recovery thread finished ...\n");
+
+}
+
+int md_notify_reboot(struct notifier_block *this,
+ unsigned long code, void *x)
+{
+ struct md_list_head *tmp;
+ mddev_t *mddev;
+
+ if ((code == MD_SYS_DOWN) || (code == MD_SYS_HALT)
+ || (code == MD_SYS_POWER_OFF)) {
+
+ printk(KERN_INFO "md: stopping all md devices.\n");
+
+ ITERATE_MDDEV(mddev,tmp)
+ do_md_stop (mddev, 1);
+ /*
+ * certain more exotic SCSI devices are known to be
+ * volatile wrt too early system reboots. While the
+ * right place to handle this issue is the given
+ * driver, we do want to have a safe RAID driver ...
+ */
+ md_mdelay(1000*1);
+ }
+ return NOTIFY_DONE;
+}
+
+struct notifier_block md_notifier = {
+ notifier_call: md_notify_reboot,
+ next: NULL,
+ priority: INT_MAX, /* before any real devices */
+};
+
+static void md_geninit(void)
+{
+ struct proc_dir_entry *p;
+ int i;
+
+ for(i = 0; i < MAX_MD_DEVS; i++) {
+ md_blocksizes[i] = 1024;
+ md_size[i] = 0;
+ md_hardsect_sizes[i] = 512;
+ }
+ blksize_size[MAJOR_NR] = md_blocksizes;
+ blk_size[MAJOR_NR] = md_size;
+ max_readahead[MAJOR_NR] = md_maxreadahead;
+ hardsect_size[MAJOR_NR] = md_hardsect_sizes;
+
+ dprintk("md: sizeof(mdp_super_t) = %d\n", (int)sizeof(mdp_super_t));
+
+#ifdef CONFIG_PROC_FS
+ p = create_proc_entry("mdstat", S_IRUGO, NULL);
+ if (p)
+ p->proc_fops = &md_seq_fops;
+#endif
+}
+
+request_queue_t * md_queue_proc(kdev_t dev)
+{
+ mddev_t *mddev = kdev_to_mddev(dev);
+ if (mddev == NULL)
+ return BLK_DEFAULT_QUEUE(MAJOR_NR);
+ else
+ return &mddev->queue;
+}
+
+int md__init md_init(void)
+{
+ static char * name = "mdrecoveryd";
+ int minor;
+
+ printk(KERN_INFO "md: md driver %d.%d.%d MAX_MD_DEVS=%d, MD_SB_DISKS=%d\n",
+ MD_MAJOR_VERSION, MD_MINOR_VERSION,
+ MD_PATCHLEVEL_VERSION, MAX_MD_DEVS, MD_SB_DISKS);
+
+ if (devfs_register_blkdev (MAJOR_NR, "md", &md_fops))
+ {
+ printk(KERN_ALERT "md: Unable to get major %d for md\n", MAJOR_NR);
+ return (-1);
+ }
+ devfs_handle = devfs_mk_dir (NULL, "md", NULL);
+ /* we don't use devfs_register_series because we want to fill md_hd_struct */
+ for (minor=0; minor < MAX_MD_DEVS; ++minor) {
+ char devname[128];
+ sprintf (devname, "%u", minor);
+ md_hd_struct[minor].de = devfs_register (devfs_handle,
+ devname, DEVFS_FL_DEFAULT, MAJOR_NR, minor,
+ S_IFBLK | S_IRUSR | S_IWUSR, &md_fops, NULL);
+ }
+
+ /* all requests on an uninitialised device get failed... */
+ blk_queue_make_request(BLK_DEFAULT_QUEUE(MAJOR_NR), md_fail_request);
+ blk_dev[MAJOR_NR].queue = md_queue_proc;
+
+
+ read_ahead[MAJOR_NR] = INT_MAX;
+
+ add_gendisk(&md_gendisk);
+
+ md_recovery_thread = md_register_thread(md_do_recovery, NULL, name);
+ if (!md_recovery_thread)
+ printk(KERN_ALERT "md: bug: couldn't allocate md_recovery_thread\n");
+
+ md_register_reboot_notifier(&md_notifier);
+ raid_table_header = register_sysctl_table(raid_root_table, 1);
+
+ md_geninit();
+ return (0);
+}
+
+
+#ifndef MODULE
+
+/*
+ * When md (and any required personalities) are compiled into the kernel
+ * (not as a module), arrays can be assembled at boot time: with AUTODETECT,
+ * where specially marked partitions are registered with md_autodetect_dev(),
+ * and with MD_BOOT where devices to be collected are given on the boot line
+ * with md=.....
+ * The code for that is here.
+ */
+
+struct {
+ int set;
+ int noautodetect;
+} raid_setup_args md__initdata;
+
+/*
+ * Searches all registered partitions for autorun RAID arrays
+ * at boot time.
+ */
+static kdev_t detected_devices[128];
+static int dev_cnt;
+
+void md_autodetect_dev(kdev_t dev)
+{
+ if (dev_cnt >= 0 && dev_cnt < 127)
+ detected_devices[dev_cnt++] = dev;
+}
+
+
+static void autostart_arrays(void)
+{
+ mdk_rdev_t *rdev;
+ int i;
+
+ printk(KERN_INFO "md: Autodetecting RAID arrays.\n");
+
+ for (i = 0; i < dev_cnt; i++) {
+ kdev_t dev = detected_devices[i];
+
+ if (md_import_device(dev,1)) {
+ printk(KERN_ALERT "md: could not import %s!\n",
+ partition_name(dev));
+ continue;
+ }
+ /*
+ * Sanity checks:
+ */
+ rdev = find_rdev_all(dev);
+ if (!rdev) {
+ MD_BUG();
+ continue;
+ }
+ if (rdev->faulty) {
+ MD_BUG();
+ continue;
+ }
+ md_list_add(&rdev->pending, &pending_raid_disks);
+ }
+ dev_cnt = 0;
+
+ autorun_devices(-1);
+}
+
+static struct {
+ char device_set [MAX_MD_DEVS];
+ int pers[MAX_MD_DEVS];
+ int chunk[MAX_MD_DEVS];
+ char *device_names[MAX_MD_DEVS];
+} md_setup_args md__initdata;
+
+/*
+ * Parse the command-line parameters given to our kernel, but do not
+ * actually try to invoke the MD device now; that is handled by
+ * md_setup_drive after the low-level disk drivers have initialised.
+ *
+ * 27/11/1999: Fixed to work correctly with the 2.3 kernel (which
+ * assigns the task of parsing integer arguments to the
+ * invoked program now). Added ability to initialise all
+ * the MD devices (by specifying multiple "md=" lines)
+ * instead of just one. -- KTK
+ * 18May2000: Added support for persistent-superblock arrays:
+ * md=n,0,factor,fault,device-list uses RAID0 for device n
+ * md=n,-1,factor,fault,device-list uses LINEAR for device n
+ * md=n,device-list reads a RAID superblock from the devices
+ * elements in device-list are read by name_to_kdev_t so can be
+ * a hex number or something like /dev/hda1 /dev/sdb
+ * 2001-06-03: Dave Cinege <dcinege@psychosis.com>
+ * Shifted name_to_kdev_t() and related operations to md_setup_drive()
+ * for later execution. Rewrote section to make devfs compatible.
+ */
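+/*
+ * Illustrative examples (not part of the original documentation):
+ *   md=0,0,4,0,/dev/sdb1,/dev/sdc1  md0 as RAID0, chunk size 1 << (4+12) bytes
+ *   md=1,/dev/sdb1,/dev/sdc1        assemble md1 from the devices' superblocks
+ */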
+static int md__init md_setup(char *str)
+{
+ int minor, level, factor, fault;
+ char *pername = "";
+ char *str1 = str;
+
+ if (get_option(&str, &minor) != 2) { /* MD Number */
+ printk(KERN_WARNING "md: Too few arguments supplied to md=.\n");
+ return 0;
+ }
+ if (minor >= MAX_MD_DEVS) {
+ printk(KERN_WARNING "md: md=%d, Minor device number too high.\n", minor);
+ return 0;
+ } else if (md_setup_args.device_names[minor]) {
+		printk(KERN_WARNING "md: md=%d, Specified more than once. "
+ "Replacing previous definition.\n", minor);
+ }
+ switch (get_option(&str, &level)) { /* RAID Personality */
+ case 2: /* could be 0 or -1.. */
+ if (level == 0 || level == -1) {
+ if (get_option(&str, &factor) != 2 || /* Chunk Size */
+ get_option(&str, &fault) != 2) {
+ printk(KERN_WARNING "md: Too few arguments supplied to md=.\n");
+ return 0;
+ }
+ md_setup_args.pers[minor] = level;
+ md_setup_args.chunk[minor] = 1 << (factor+12);
+ switch(level) {
+ case -1:
+ level = LINEAR;
+ pername = "linear";
+ break;
+ case 0:
+ level = RAID0;
+ pername = "raid0";
+ break;
+ default:
+ printk(KERN_WARNING
+ "md: The kernel has not been configured for raid%d support!\n",
+ level);
+ return 0;
+ }
+ md_setup_args.pers[minor] = level;
+ break;
+ }
+ /* FALL THROUGH */
+ case 1: /* the first device is numeric */
+ str = str1;
+ /* FALL THROUGH */
+ case 0:
+ md_setup_args.pers[minor] = 0;
+ pername="super-block";
+ }
+
+ printk(KERN_INFO "md: Will configure md%d (%s) from %s, below.\n",
+ minor, pername, str);
+ md_setup_args.device_names[minor] = str;
+
+ return 1;
+}
+
+extern kdev_t name_to_kdev_t(char *line) md__init;
+void md__init md_setup_drive(void)
+{
+ int minor, i;
+ kdev_t dev;
+ mddev_t*mddev;
+ kdev_t devices[MD_SB_DISKS+1];
+
+ for (minor = 0; minor < MAX_MD_DEVS; minor++) {
+ int err = 0;
+ char *devname;
+ mdu_disk_info_t dinfo;
+
+ if ((devname = md_setup_args.device_names[minor]) == 0) continue;
+
+ for (i = 0; i < MD_SB_DISKS && devname != 0; i++) {
+
+ char *p;
+ void *handle;
+
+ p = strchr(devname, ',');
+ if (p)
+ *p++ = 0;
+
+ dev = name_to_kdev_t(devname);
+ handle = devfs_find_handle(NULL, devname, MAJOR (dev), MINOR (dev),
+ DEVFS_SPECIAL_BLK, 1);
+ if (handle != 0) {
+ unsigned major, minor;
+ devfs_get_maj_min(handle, &major, &minor);
+ dev = MKDEV(major, minor);
+ }
+ if (dev == 0) {
+ printk(KERN_WARNING "md: Unknown device name: %s\n", devname);
+ break;
+ }
+
+ devices[i] = dev;
+ md_setup_args.device_set[minor] = 1;
+
+ devname = p;
+ }
+ devices[i] = 0;
+
+ if (md_setup_args.device_set[minor] == 0)
+ continue;
+
+ if (mddev_map[minor]) {
+ printk(KERN_WARNING
+ "md: Ignoring md=%d, already autodetected. (Use raid=noautodetect)\n",
+ minor);
+ continue;
+ }
+ printk(KERN_INFO "md: Loading md%d: %s\n", minor, md_setup_args.device_names[minor]);
+
+ mddev = alloc_mddev(MKDEV(MD_MAJOR,minor));
+ if (!mddev) {
+ printk(KERN_ERR "md: kmalloc failed - cannot start array %d\n", minor);
+ continue;
+ }
+ if (md_setup_args.pers[minor]) {
+ /* non-persistent */
+ mdu_array_info_t ainfo;
+ ainfo.level = pers_to_level(md_setup_args.pers[minor]);
+ ainfo.size = 0;
+ ainfo.nr_disks =0;
+ ainfo.raid_disks =0;
+ ainfo.md_minor =minor;
+ ainfo.not_persistent = 1;
+
+ ainfo.state = (1 << MD_SB_CLEAN);
+ ainfo.active_disks = 0;
+ ainfo.working_disks = 0;
+ ainfo.failed_disks = 0;
+ ainfo.spare_disks = 0;
+ ainfo.layout = 0;
+ ainfo.chunk_size = md_setup_args.chunk[minor];
+ err = set_array_info(mddev, &ainfo);
+ for (i = 0; !err && (dev = devices[i]); i++) {
+ dinfo.number = i;
+ dinfo.raid_disk = i;
+ dinfo.state = (1<<MD_DISK_ACTIVE)|(1<<MD_DISK_SYNC);
+ dinfo.major = MAJOR(dev);
+ dinfo.minor = MINOR(dev);
+ mddev->sb->nr_disks++;
+ mddev->sb->raid_disks++;
+ mddev->sb->active_disks++;
+ mddev->sb->working_disks++;
+ err = add_new_disk (mddev, &dinfo);
+ }
+ } else {
+ /* persistent */
+ for (i = 0; (dev = devices[i]); i++) {
+ dinfo.major = MAJOR(dev);
+ dinfo.minor = MINOR(dev);
+ add_new_disk (mddev, &dinfo);
+ }
+ }
+ if (!err)
+ err = do_md_run(mddev);
+ if (err) {
+ mddev->sb_dirty = 0;
+ do_md_stop(mddev, 0);
+ printk(KERN_WARNING "md: starting md%d failed\n", minor);
+ }
+ }
+}
+
+static int md__init raid_setup(char *str)
+{
+ int len, pos;
+
+ len = strlen(str) + 1;
+ pos = 0;
+
+ while (pos < len) {
+ char *comma = strchr(str+pos, ',');
+ int wlen;
+ if (comma)
+ wlen = (comma-str)-pos;
+ else wlen = (len-1)-pos;
+
+ if (strncmp(str, "noautodetect", wlen) == 0)
+ raid_setup_args.noautodetect = 1;
+ pos += wlen+1;
+ }
+ raid_setup_args.set = 1;
+ return 1;
+}
+
+int md__init md_run_setup(void)
+{
+ if (raid_setup_args.noautodetect)
+ printk(KERN_INFO "md: Skipping autodetection of RAID arrays. (raid=noautodetect)\n");
+ else
+ autostart_arrays();
+ md_setup_drive();
+ return 0;
+}
+
+__setup("raid=", raid_setup);
+__setup("md=", md_setup);
+
+__initcall(md_init);
+__initcall(md_run_setup);
+
+#else /* It is a MODULE */
+
+int init_module(void)
+{
+ return md_init();
+}
+
+static void free_device_names(void)
+{
+ while (!list_empty(&device_names)) {
+ struct dname *tmp = list_entry(device_names.next,
+ dev_name_t, list);
+ list_del(&tmp->list);
+ kfree(tmp);
+ }
+}
+
+
+void cleanup_module(void)
+{
+ md_unregister_thread(md_recovery_thread);
+ devfs_unregister(devfs_handle);
+
+ devfs_unregister_blkdev(MAJOR_NR,"md");
+ unregister_reboot_notifier(&md_notifier);
+ unregister_sysctl_table(raid_table_header);
+#ifdef CONFIG_PROC_FS
+ remove_proc_entry("mdstat", NULL);
+#endif
+
+ del_gendisk(&md_gendisk);
+
+ blk_dev[MAJOR_NR].queue = NULL;
+ blksize_size[MAJOR_NR] = NULL;
+ blk_size[MAJOR_NR] = NULL;
+ max_readahead[MAJOR_NR] = NULL;
+ hardsect_size[MAJOR_NR] = NULL;
+
+ free_device_names();
+
+}
+#endif
+
+MD_EXPORT_SYMBOL(md_size);
+MD_EXPORT_SYMBOL(register_md_personality);
+MD_EXPORT_SYMBOL(unregister_md_personality);
+MD_EXPORT_SYMBOL(partition_name);
+MD_EXPORT_SYMBOL(md_error);
+MD_EXPORT_SYMBOL(md_do_sync);
+MD_EXPORT_SYMBOL(md_sync_acct);
+MD_EXPORT_SYMBOL(md_done_sync);
+MD_EXPORT_SYMBOL(md_recover_arrays);
+MD_EXPORT_SYMBOL(md_register_thread);
+MD_EXPORT_SYMBOL(md_unregister_thread);
+MD_EXPORT_SYMBOL(md_update_sb);
+MD_EXPORT_SYMBOL(md_wakeup_thread);
+MD_EXPORT_SYMBOL(md_print_devices);
+MD_EXPORT_SYMBOL(find_rdev_nr);
+MD_EXPORT_SYMBOL(md_interrupt_thread);
+MD_EXPORT_SYMBOL(mddev_map);
+MODULE_LICENSE("GPL");
diff --git a/tests/linux/md-loop/2 b/tests/linux/md-loop/2
new file mode 100644
index 0000000..fc01423
--- /dev/null
+++ b/tests/linux/md-loop/2
@@ -0,0 +1,3949 @@
+/*
+ md.c : Multiple Devices driver for Linux
+ Copyright (C) 1998, 1999, 2000 Ingo Molnar
+
+ completely rewritten, based on the MD driver code from Marc Zyngier
+
+ Changes:
+
+ - RAID-1/RAID-5 extensions by Miguel de Icaza, Gadi Oxman, Ingo Molnar
+ - boot support for linear and striped mode by Harald Hoyer <HarryH@Royal.Net>
+ - kerneld support by Boris Tobotras <boris@xtalk.msk.su>
+ - kmod support by: Cyrus Durgin
+ - RAID0 bugfixes: Mark Anthony Lisher <markal@iname.com>
+ - Devfs support by Richard Gooch <rgooch@atnf.csiro.au>
+
+ - lots of fixes and improvements to the RAID1/RAID5 and generic
+ RAID code (such as request based resynchronization):
+
+ Neil Brown <neilb@cse.unsw.edu.au>.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2, or (at your option)
+ any later version.
+
+ You should have received a copy of the GNU General Public License
+ (for example /usr/src/linux/COPYING); if not, write to the Free
+ Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+*/
+
+#include <linux/module.h>
+#include <linux/config.h>
+#include <linux/raid/md.h>
+#include <linux/sysctl.h>
+#include <linux/raid/xor.h>
+#include <linux/devfs_fs_kernel.h>
+
+#include <linux/init.h>
+
+#ifdef CONFIG_KMOD
+#include <linux/kmod.h>
+#endif
+
+#define __KERNEL_SYSCALLS__
+#include <linux/unistd.h>
+
+#include <asm/unaligned.h>
+
+#define MAJOR_NR MD_MAJOR
+#define MD_DRIVER
+
+#include <linux/blk.h>
+
+#define DEBUG 0
+#if DEBUG
+# define dprintk(x...) printk(x)
+#else
+# define dprintk(x...) do { } while(0)
+#endif
+
+#ifndef MODULE
+static void autostart_arrays (void);
+#endif
+
+static mdk_personality_t *pers[MAX_PERSONALITY];
+
+/*
+ * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit'
+ * is 100 KB/sec, so the extra system load does not show up that much.
+ * Increase it if you want to have more _guaranteed_ speed. Note that
+ * the RAID driver will use the maximum available bandwidth if the IO
+ * subsystem is idle. There is also an 'absolute maximum' reconstruction
+ * speed limit - in case reconstruction slows down your system despite
+ * idle IO detection.
+ *
+ * you can change it via /proc/sys/dev/raid/speed_limit_min and _max.
+ */
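+/*
+ * For example, "echo 10000 > /proc/sys/dev/raid/speed_limit_min" raises the
+ * guaranteed floor to 10000 KB/sec.
+ */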
+
+static int sysctl_speed_limit_min = 100;
+static int sysctl_speed_limit_max = 100000;
+
+static struct ctl_table_header *raid_table_header;
+
+static ctl_table raid_table[] = {
+ {DEV_RAID_SPEED_LIMIT_MIN, "speed_limit_min",
+ &sysctl_speed_limit_min, sizeof(int), 0644, NULL, &proc_dointvec},
+ {DEV_RAID_SPEED_LIMIT_MAX, "speed_limit_max",
+ &sysctl_speed_limit_max, sizeof(int), 0644, NULL, &proc_dointvec},
+ {0}
+};
+
+static ctl_table raid_dir_table[] = {
+ {DEV_RAID, "raid", NULL, 0, 0555, raid_table},
+ {0}
+};
+
+static ctl_table raid_root_table[] = {
+ {CTL_DEV, "dev", NULL, 0, 0555, raid_dir_table},
+ {0}
+};
+
+/*
+ * these have to be allocated separately because external
+ * subsystems want to have a pre-defined structure
+ */
+struct hd_struct md_hd_struct[MAX_MD_DEVS];
+static int md_blocksizes[MAX_MD_DEVS];
+static int md_hardsect_sizes[MAX_MD_DEVS];
+static mdk_thread_t *md_recovery_thread;
+
+int md_size[MAX_MD_DEVS];
+
+static struct block_device_operations md_fops;
+static devfs_handle_t devfs_handle;
+
+static struct gendisk md_gendisk=
+{
+ major: MD_MAJOR,
+ major_name: "md",
+ minor_shift: 0,
+ max_p: 1,
+ part: md_hd_struct,
+ sizes: md_size,
+ nr_real: MAX_MD_DEVS,
+ real_devices: NULL,
+ next: NULL,
+ fops: &md_fops,
+};
+
+/*
+ * Enables iteration over all existing md arrays
+ */
+static MD_LIST_HEAD(all_mddevs);
+
+static mddev_t *mddev_map[MAX_MD_DEVS];
+
+static inline mddev_t * kdev_to_mddev (kdev_t dev)
+{
+ if (MAJOR(dev) != MD_MAJOR)
+ BUG();
+ return mddev_map[MINOR(dev)];
+}
+
+static int md_fail_request (request_queue_t *q, struct bio *bio)
+{
+ bio_io_error(bio);
+ return 0;
+}
+
+static mddev_t * alloc_mddev(kdev_t dev)
+{
+ mddev_t *mddev;
+
+ if (MAJOR(dev) != MD_MAJOR) {
+ MD_BUG();
+ return 0;
+ }
+ mddev = (mddev_t *) kmalloc(sizeof(*mddev), GFP_KERNEL);
+ if (!mddev)
+ return NULL;
+
+ memset(mddev, 0, sizeof(*mddev));
+
+ mddev->__minor = MINOR(dev);
+ init_MUTEX(&mddev->reconfig_sem);
+ init_MUTEX(&mddev->recovery_sem);
+ init_MUTEX(&mddev->resync_sem);
+ MD_INIT_LIST_HEAD(&mddev->disks);
+ MD_INIT_LIST_HEAD(&mddev->all_mddevs);
+ atomic_set(&mddev->active, 0);
+
+ mddev_map[mdidx(mddev)] = mddev;
+ md_list_add(&mddev->all_mddevs, &all_mddevs);
+
+ MOD_INC_USE_COUNT;
+
+ return mddev;
+}
+
+mdk_rdev_t * find_rdev_nr(mddev_t *mddev, int nr)
+{
+ mdk_rdev_t * rdev;
+ struct md_list_head *tmp;
+
+ ITERATE_RDEV(mddev,rdev,tmp) {
+ if (rdev->desc_nr == nr)
+ return rdev;
+ }
+ return NULL;
+}
+
+mdk_rdev_t * find_rdev(mddev_t * mddev, kdev_t dev)
+{
+ struct md_list_head *tmp;
+ mdk_rdev_t *rdev;
+
+ ITERATE_RDEV(mddev,rdev,tmp) {
+ if (rdev->dev == dev)
+ return rdev;
+ }
+ return NULL;
+}
+
+static MD_LIST_HEAD(device_names);
+
+char * partition_name(kdev_t dev)
+{
+ struct gendisk *hd;
+ static char nomem [] = "<nomem>";
+ dev_name_t *dname;
+ struct md_list_head *tmp;
+
+ list_for_each(tmp, &device_names) {
+ dname = md_list_entry(tmp, dev_name_t, list);
+ if (dname->dev == dev)
+ return dname->name;
+ }
+
+ dname = (dev_name_t *) kmalloc(sizeof(*dname), GFP_KERNEL);
+
+ if (!dname)
+ return nomem;
+ /*
+ * ok, add this new device name to the list
+ */
+ hd = get_gendisk (dev);
+ dname->name = NULL;
+ if (hd)
+ dname->name = disk_name (hd, MINOR(dev), dname->namebuf);
+ if (!dname->name) {
+ sprintf (dname->namebuf, "[dev %s]", kdevname(dev));
+ dname->name = dname->namebuf;
+ }
+
+ dname->dev = dev;
+ md_list_add(&dname->list, &device_names);
+
+ return dname->name;
+}
+
+static unsigned int calc_dev_sboffset(kdev_t dev, mddev_t *mddev,
+ int persistent)
+{
+ unsigned int size = 0;
+
+ if (blk_size[MAJOR(dev)])
+ size = blk_size[MAJOR(dev)][MINOR(dev)];
+ if (persistent)
+ size = MD_NEW_SIZE_BLOCKS(size);
+ return size;
+}
+
+static unsigned int calc_dev_size(kdev_t dev, mddev_t *mddev, int persistent)
+{
+ unsigned int size;
+
+ size = calc_dev_sboffset(dev, mddev, persistent);
+ if (!mddev->sb) {
+ MD_BUG();
+ return size;
+ }
+ if (mddev->sb->chunk_size)
+ size &= ~(mddev->sb->chunk_size/1024 - 1);
+ return size;
+}
+
+static unsigned int zoned_raid_size(mddev_t *mddev)
+{
+ unsigned int mask;
+ mdk_rdev_t * rdev;
+ struct md_list_head *tmp;
+
+ if (!mddev->sb) {
+ MD_BUG();
+ return -EINVAL;
+ }
+ /*
+ * do size and offset calculations.
+ */
+ mask = ~(mddev->sb->chunk_size/1024 - 1);
+
+ ITERATE_RDEV(mddev,rdev,tmp) {
+ rdev->size &= mask;
+ md_size[mdidx(mddev)] += rdev->size;
+ }
+ return 0;
+}
+
+static void remove_descriptor(mdp_disk_t *disk, mdp_super_t *sb)
+{
+ if (disk_active(disk)) {
+ sb->working_disks--;
+ } else {
+ if (disk_spare(disk)) {
+ sb->spare_disks--;
+ sb->working_disks--;
+ } else {
+ sb->failed_disks--;
+ }
+ }
+ sb->nr_disks--;
+ disk->major = 0;
+ disk->minor = 0;
+ mark_disk_removed(disk);
+}
+
+#define BAD_MAGIC KERN_ERR \
+"md: invalid raid superblock magic on %s\n"
+
+#define BAD_MINOR KERN_ERR \
+"md: %s: invalid raid minor (%x)\n"
+
+#define OUT_OF_MEM KERN_ALERT \
+"md: out of memory.\n"
+
+#define NO_SB KERN_ERR \
+"md: disabled device %s, could not read superblock.\n"
+
+#define BAD_CSUM KERN_WARNING \
+"md: invalid superblock checksum on %s\n"
+
+static int alloc_array_sb(mddev_t * mddev)
+{
+ if (mddev->sb) {
+ MD_BUG();
+ return 0;
+ }
+
+ mddev->sb = (mdp_super_t *) __get_free_page (GFP_KERNEL);
+ if (!mddev->sb)
+ return -ENOMEM;
+ md_clear_page(mddev->sb);
+ return 0;
+}
+
+static int alloc_disk_sb(mdk_rdev_t * rdev)
+{
+ if (rdev->sb)
+ MD_BUG();
+
+ rdev->sb_page = alloc_page(GFP_KERNEL);
+ if (!rdev->sb_page) {
+ printk(OUT_OF_MEM);
+ return -EINVAL;
+ }
+ rdev->sb = (mdp_super_t *) page_address(rdev->sb_page);
+
+ return 0;
+}
+
+static void free_disk_sb(mdk_rdev_t * rdev)
+{
+ if (rdev->sb_page) {
+ page_cache_release(rdev->sb_page);
+ rdev->sb = NULL;
+ rdev->sb_page = NULL;
+ rdev->sb_offset = 0;
+ rdev->size = 0;
+ } else {
+ if (!rdev->faulty)
+ MD_BUG();
+ }
+}
+
+
+static void bh_complete(struct buffer_head *bh, int uptodate)
+{
+
+ if (uptodate)
+ set_bit(BH_Uptodate, &bh->b_state);
+
+ complete((struct completion*)bh->b_private);
+}
+
+static int sync_page_io(kdev_t dev, unsigned long sector, int size,
+ struct page *page, int rw)
+{
+ struct buffer_head bh;
+ struct completion event;
+
+ init_completion(&event);
+ init_buffer(&bh, bh_complete, &event);
+ bh.b_rdev = dev;
+ bh.b_rsector = sector;
+ bh.b_state = (1 << BH_Req) | (1 << BH_Mapped) | (1 << BH_Lock);
+ bh.b_size = size;
+ bh.b_page = page;
+ bh.b_reqnext = NULL;
+ bh.b_data = page_address(page);
+ generic_make_request(rw, &bh);
+
+ run_task_queue(&tq_disk);
+ wait_for_completion(&event);
+
+ return test_bit(BH_Uptodate, &bh.b_state);
+}
+
+static int read_disk_sb(mdk_rdev_t * rdev)
+{
+ int ret = -EINVAL;
+ kdev_t dev = rdev->dev;
+ unsigned long sb_offset;
+
+ if (!rdev->sb) {
+ MD_BUG();
+ goto abort;
+ }
+
+ /*
+ * Calculate the position of the superblock,
+ * it's at the end of the disk
+ */
+ sb_offset = calc_dev_sboffset(rdev->dev, rdev->mddev, 1);
+ rdev->sb_offset = sb_offset;
+
+ if (!sync_page_io(dev, sb_offset<<1, MD_SB_BYTES, rdev->sb_page, READ)) {
+ printk(NO_SB,partition_name(dev));
+ return -EINVAL;
+ }
+ printk(KERN_INFO " [events: %08lx]\n", (unsigned long)rdev->sb->events_lo);
+ ret = 0;
+abort:
+ return ret;
+}
+
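+/*
+ * The checksum covers the whole superblock with sb_csum itself taken as
+ * zero, so clear it for the calculation and restore it afterwards.
+ */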
+static unsigned int calc_sb_csum(mdp_super_t * sb)
+{
+ unsigned int disk_csum, csum;
+
+ disk_csum = sb->sb_csum;
+ sb->sb_csum = 0;
+ csum = csum_partial((void *)sb, MD_SB_BYTES, 0);
+ sb->sb_csum = disk_csum;
+ return csum;
+}
+
+/*
+ * Check one RAID superblock for generic plausibility
+ */
+
+static int check_disk_sb(mdk_rdev_t * rdev)
+{
+ mdp_super_t *sb;
+ int ret = -EINVAL;
+
+ sb = rdev->sb;
+ if (!sb) {
+ MD_BUG();
+ goto abort;
+ }
+
+ if (sb->md_magic != MD_SB_MAGIC) {
+ printk(BAD_MAGIC, partition_name(rdev->dev));
+ goto abort;
+ }
+
+ if (sb->md_minor >= MAX_MD_DEVS) {
+ printk(BAD_MINOR, partition_name(rdev->dev), sb->md_minor);
+ goto abort;
+ }
+
+ if (calc_sb_csum(sb) != sb->sb_csum) {
+ printk(BAD_CSUM, partition_name(rdev->dev));
+ goto abort;
+ }
+ ret = 0;
+abort:
+ return ret;
+}
+
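+/*
+ * Mask off the partition bits of a device number, giving the whole-disk
+ * device; used to spot array members that share one physical disk.
+ */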
+static kdev_t dev_unit(kdev_t dev)
+{
+ unsigned int mask;
+ struct gendisk *hd = get_gendisk(dev);
+
+ if (!hd)
+ return 0;
+ mask = ~((1 << hd->minor_shift) - 1);
+
+ return MKDEV(MAJOR(dev), MINOR(dev) & mask);
+}
+
+static mdk_rdev_t * match_dev_unit(mddev_t *mddev, kdev_t dev)
+{
+ struct md_list_head *tmp;
+ mdk_rdev_t *rdev;
+
+ ITERATE_RDEV(mddev,rdev,tmp)
+ if (dev_unit(rdev->dev) == dev_unit(dev))
+ return rdev;
+
+ return NULL;
+}
+
+static int match_mddev_units(mddev_t *mddev1, mddev_t *mddev2)
+{
+ struct md_list_head *tmp;
+ mdk_rdev_t *rdev;
+
+ ITERATE_RDEV(mddev1,rdev,tmp)
+ if (match_dev_unit(mddev2, rdev->dev))
+ return 1;
+
+ return 0;
+}
+
+static MD_LIST_HEAD(all_raid_disks);
+static MD_LIST_HEAD(pending_raid_disks);
+
+static void bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev)
+{
+ mdk_rdev_t *same_pdev;
+
+ if (rdev->mddev) {
+ MD_BUG();
+ return;
+ }
+ same_pdev = match_dev_unit(mddev, rdev->dev);
+ if (same_pdev)
+ printk( KERN_WARNING
+"md%d: WARNING: %s appears to be on the same physical disk as %s. True\n"
+" protection against single-disk failure might be compromised.\n",
+ mdidx(mddev), partition_name(rdev->dev),
+ partition_name(same_pdev->dev));
+
+ md_list_add(&rdev->same_set, &mddev->disks);
+ rdev->mddev = mddev;
+ printk(KERN_INFO "md: bind<%s>\n", partition_name(rdev->dev));
+}
+
+static void unbind_rdev_from_array(mdk_rdev_t * rdev)
+{
+ if (!rdev->mddev) {
+ MD_BUG();
+ return;
+ }
+ list_del_init(&rdev->same_set);
+ printk(KERN_INFO "md: unbind<%s>\n", partition_name(rdev->dev));
+ rdev->mddev = NULL;
+}
+
+/*
+ * prevent the device from being mounted, repartitioned or
+ * otherwise reused by a RAID array (or any other kernel
+ * subsystem), by opening the device. [simply getting an
+ * inode is not enough, the SCSI module usage code needs
+ * an explicit open() on the device]
+ */
+static int lock_rdev(mdk_rdev_t *rdev)
+{
+ int err = 0;
+ struct block_device *bdev;
+
+ bdev = bdget(rdev->dev);
+ if (!bdev)
+ return -ENOMEM;
+ err = blkdev_get(bdev, FMODE_READ|FMODE_WRITE, 0, BDEV_RAW);
+ if (!err)
+ rdev->bdev = bdev;
+ return err;
+}
+
+static void unlock_rdev(mdk_rdev_t *rdev)
+{
+ struct block_device *bdev = rdev->bdev;
+ rdev->bdev = NULL;
+ if (!bdev)
+ MD_BUG();
+ blkdev_put(bdev, BDEV_RAW);
+}
+
+void md_autodetect_dev(kdev_t dev);
+
+static void export_rdev(mdk_rdev_t * rdev)
+{
+ printk(KERN_INFO "md: export_rdev(%s)\n",partition_name(rdev->dev));
+ if (rdev->mddev)
+ MD_BUG();
+ unlock_rdev(rdev);
+ free_disk_sb(rdev);
+ list_del_init(&rdev->all);
+ if (!list_empty(&rdev->pending)) {
+ printk(KERN_INFO "md: (%s was pending)\n",
+ partition_name(rdev->dev));
+ list_del_init(&rdev->pending);
+ }
+#ifndef MODULE
+ md_autodetect_dev(rdev->dev);
+#endif
+ rdev->dev = 0;
+ rdev->faulty = 0;
+ kfree(rdev);
+}
+
+static void kick_rdev_from_array(mdk_rdev_t * rdev)
+{
+ unbind_rdev_from_array(rdev);
+ export_rdev(rdev);
+}
+
+static void export_array(mddev_t *mddev)
+{
+ struct md_list_head *tmp;
+ mdk_rdev_t *rdev;
+ mdp_super_t *sb = mddev->sb;
+
+ if (mddev->sb) {
+ mddev->sb = NULL;
+ free_page((unsigned long) sb);
+ }
+
+ ITERATE_RDEV(mddev,rdev,tmp) {
+ if (!rdev->mddev) {
+ MD_BUG();
+ continue;
+ }
+ kick_rdev_from_array(rdev);
+ }
+ if (!list_empty(&mddev->disks))
+ MD_BUG();
+}
+
+static void free_mddev(mddev_t *mddev)
+{
+ if (!mddev) {
+ MD_BUG();
+ return;
+ }
+
+ export_array(mddev);
+ md_size[mdidx(mddev)] = 0;
+ md_hd_struct[mdidx(mddev)].nr_sects = 0;
+
+ /*
+ * Make sure nobody else is using this mddev
+ * (careful, we rely on the global kernel lock here)
+ */
+ while (sem_getcount(&mddev->resync_sem) != 1)
+ schedule();
+ while (sem_getcount(&mddev->recovery_sem) != 1)
+ schedule();
+
+ del_mddev_mapping(mddev, mk_kdev(MD_MAJOR, mdidx(mddev)));
+ md_list_del(&mddev->all_mddevs);
+ kfree(mddev);
+ MOD_DEC_USE_COUNT;
+}
+
+#undef BAD_CSUM
+#undef BAD_MAGIC
+#undef OUT_OF_MEM
+#undef NO_SB
+
+static void print_desc(mdp_disk_t *desc)
+{
+ printk(" DISK<N:%d,%s(%d,%d),R:%d,S:%d>\n", desc->number,
+ partition_name(MKDEV(desc->major,desc->minor)),
+ desc->major,desc->minor,desc->raid_disk,desc->state);
+}
+
+static void print_sb(mdp_super_t *sb)
+{
+ int i;
+
+ printk(KERN_INFO "md: SB: (V:%d.%d.%d) ID:<%08x.%08x.%08x.%08x> CT:%08x\n",
+ sb->major_version, sb->minor_version, sb->patch_version,
+ sb->set_uuid0, sb->set_uuid1, sb->set_uuid2, sb->set_uuid3,
+ sb->ctime);
+ printk(KERN_INFO "md: L%d S%08d ND:%d RD:%d md%d LO:%d CS:%d\n", sb->level,
+ sb->size, sb->nr_disks, sb->raid_disks, sb->md_minor,
+ sb->layout, sb->chunk_size);
+ printk(KERN_INFO "md: UT:%08x ST:%d AD:%d WD:%d FD:%d SD:%d CSUM:%08x E:%08lx\n",
+ sb->utime, sb->state, sb->active_disks, sb->working_disks,
+ sb->failed_disks, sb->spare_disks,
+ sb->sb_csum, (unsigned long)sb->events_lo);
+
+ printk(KERN_INFO);
+ for (i = 0; i < MD_SB_DISKS; i++) {
+ mdp_disk_t *desc;
+
+ desc = sb->disks + i;
+ if (desc->number || desc->major || desc->minor ||
+ desc->raid_disk || (desc->state && (desc->state != 4))) {
+ printk(" D %2d: ", i);
+ print_desc(desc);
+ }
+ }
+ printk(KERN_INFO "md: THIS: ");
+ print_desc(&sb->this_disk);
+
+}
+
+static void print_rdev(mdk_rdev_t *rdev)
+{
+ printk(KERN_INFO "md: rdev %s: O:%s, SZ:%08ld F:%d DN:%d ",
+ partition_name(rdev->dev), partition_name(rdev->old_dev),
+ rdev->size, rdev->faulty, rdev->desc_nr);
+ if (rdev->sb) {
+ printk(KERN_INFO "md: rdev superblock:\n");
+ print_sb(rdev->sb);
+ } else
+ printk(KERN_INFO "md: no rdev superblock!\n");
+}
+
+void md_print_devices(void)
+{
+ struct md_list_head *tmp, *tmp2;
+ mdk_rdev_t *rdev;
+ mddev_t *mddev;
+
+ printk("\n");
+ printk("md: **********************************\n");
+ printk("md: * <COMPLETE RAID STATE PRINTOUT> *\n");
+ printk("md: **********************************\n");
+ ITERATE_MDDEV(mddev,tmp) {
+ printk("md%d: ", mdidx(mddev));
+
+ ITERATE_RDEV(mddev,rdev,tmp2)
+ printk("<%s>", partition_name(rdev->dev));
+
+ if (mddev->sb) {
+ printk(" array superblock:\n");
+ print_sb(mddev->sb);
+ } else
+ printk(" no array superblock.\n");
+
+ ITERATE_RDEV(mddev,rdev,tmp2)
+ print_rdev(rdev);
+ }
+ printk("md: **********************************\n");
+ printk("\n");
+}
+
+static int sb_equal(mdp_super_t *sb1, mdp_super_t *sb2)
+{
+ int ret;
+ mdp_super_t *tmp1, *tmp2;
+
+ tmp1 = kmalloc(sizeof(*tmp1),GFP_KERNEL);
+ tmp2 = kmalloc(sizeof(*tmp2),GFP_KERNEL);
+
+ if (!tmp1 || !tmp2) {
+ ret = 0;
+ printk(KERN_INFO "md.c: sb1 is not equal to sb2!\n");
+ goto abort;
+ }
+
+ *tmp1 = *sb1;
+ *tmp2 = *sb2;
+
+ /*
+ * nr_disks is not constant
+ */
+ tmp1->nr_disks = 0;
+ tmp2->nr_disks = 0;
+
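+	/* only the generic, constant words of the superblock take part in the comparison */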
+ if (memcmp(tmp1, tmp2, MD_SB_GENERIC_CONSTANT_WORDS * 4))
+ ret = 0;
+ else
+ ret = 1;
+
+abort:
+ if (tmp1)
+ kfree(tmp1);
+ if (tmp2)
+ kfree(tmp2);
+
+ return ret;
+}
+
+static int uuid_equal(mdk_rdev_t *rdev1, mdk_rdev_t *rdev2)
+{
+ if ( (rdev1->sb->set_uuid0 == rdev2->sb->set_uuid0) &&
+ (rdev1->sb->set_uuid1 == rdev2->sb->set_uuid1) &&
+ (rdev1->sb->set_uuid2 == rdev2->sb->set_uuid2) &&
+ (rdev1->sb->set_uuid3 == rdev2->sb->set_uuid3))
+
+ return 1;
+
+ return 0;
+}
+
+static mdk_rdev_t * find_rdev_all(kdev_t dev)
+{
+ struct md_list_head *tmp;
+ mdk_rdev_t *rdev;
+
+ list_for_each(tmp, &all_raid_disks) {
+ rdev = md_list_entry(tmp, mdk_rdev_t, all);
+ if (rdev->dev == dev)
+ return rdev;
+ }
+ return NULL;
+}
+
+#define GETBLK_FAILED KERN_ERR \
+"md: getblk failed for device %s\n"
+
+static int write_disk_sb(mdk_rdev_t * rdev)
+{
+ kdev_t dev;
+ unsigned long sb_offset, size;
+
+ if (!rdev->sb) {
+ MD_BUG();
+ return 1;
+ }
+ if (rdev->faulty) {
+ MD_BUG();
+ return 1;
+ }
+ if (rdev->sb->md_magic != MD_SB_MAGIC) {
+ MD_BUG();
+ return 1;
+ }
+
+ dev = rdev->dev;
+ sb_offset = calc_dev_sboffset(dev, rdev->mddev, 1);
+ if (rdev->sb_offset != sb_offset) {
+ printk(KERN_INFO "%s's sb offset has changed from %ld to %ld, skipping\n",
+ partition_name(dev), rdev->sb_offset, sb_offset);
+ goto skip;
+ }
+ /*
+ * If the disk went offline meanwhile and it's just a spare, then
+ * its size has changed to zero silently, and the MD code does
+ * not yet know that it's faulty.
+ */
+ size = calc_dev_size(dev, rdev->mddev, 1);
+ if (size != rdev->size) {
+ printk(KERN_INFO "%s's size has changed from %ld to %ld since import, skipping\n",
+ partition_name(dev), rdev->size, size);
+ goto skip;
+ }
+
+ printk(KERN_INFO "(write) %s's sb offset: %ld\n", partition_name(dev), sb_offset);
+
+ if (!sync_page_io(dev, sb_offset<<1, MD_SB_BYTES, rdev->sb_page, WRITE)) {
+ printk("md: write_disk_sb failed for device %s\n", partition_name(dev));
+ return 1;
+ }
+skip:
+ return 0;
+}
+#undef GETBLK_FAILED
+
+static void set_this_disk(mddev_t *mddev, mdk_rdev_t *rdev)
+{
+ int i, ok = 0;
+ mdp_disk_t *desc;
+
+ for (i = 0; i < MD_SB_DISKS; i++) {
+ desc = mddev->sb->disks + i;
+#if 0
+ if (disk_faulty(desc)) {
+ if (MKDEV(desc->major,desc->minor) == rdev->dev)
+ ok = 1;
+ continue;
+ }
+#endif
+ if (MKDEV(desc->major,desc->minor) == rdev->dev) {
+ rdev->sb->this_disk = *desc;
+ rdev->desc_nr = desc->number;
+ ok = 1;
+ break;
+ }
+ }
+
+ if (!ok) {
+ MD_BUG();
+ }
+}
+
+static int sync_sbs(mddev_t * mddev)
+{
+ mdk_rdev_t *rdev;
+ mdp_super_t *sb;
+ struct md_list_head *tmp;
+
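+	/* copy the master superblock into every non-faulty, non-alias member and refresh its checksum */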
+ ITERATE_RDEV(mddev,rdev,tmp) {
+ if (rdev->faulty || rdev->alias_device)
+ continue;
+ sb = rdev->sb;
+ *sb = *mddev->sb;
+ set_this_disk(mddev, rdev);
+ sb->sb_csum = calc_sb_csum(sb);
+ }
+ return 0;
+}
+
+int md_update_sb(mddev_t * mddev)
+{
+ int err, count = 100;
+ struct md_list_head *tmp;
+ mdk_rdev_t *rdev;
+
+ if (!mddev->sb_dirty) {
+ printk("hm, md_update_sb() called without ->sb_dirty == 1, from %p.\n", __builtin_return_address(0));
+ return 0;
+ }
+ mddev->sb_dirty = 0;
+repeat:
+ mddev->sb->utime = CURRENT_TIME;
+ if ((++mddev->sb->events_lo)==0)
+ ++mddev->sb->events_hi;
+
+ if ((mddev->sb->events_lo|mddev->sb->events_hi)==0) {
+ /*
+ * oops, this 64-bit counter should never wrap.
+ * Either we are in around ~1 trillion A.C., assuming
+ * 1 reboot per second, or we have a bug:
+ */
+ MD_BUG();
+ mddev->sb->events_lo = mddev->sb->events_hi = 0xffffffff;
+ }
+ sync_sbs(mddev);
+
+ /*
+ * do not write anything to disk if using
+ * nonpersistent superblocks
+ */
+ if (mddev->sb->not_persistent)
+ return 0;
+
+ printk(KERN_INFO "md: updating md%d RAID superblock on device\n",
+ mdidx(mddev));
+
+ err = 0;
+ ITERATE_RDEV(mddev,rdev,tmp) {
+ printk(KERN_INFO "md: ");
+ if (rdev->faulty)
+ printk("(skipping faulty ");
+ if (rdev->alias_device)
+ printk("(skipping alias ");
+ if (!rdev->faulty && disk_faulty(&rdev->sb->this_disk)) {
+ printk("(skipping new-faulty %s )\n",
+ partition_name(rdev->dev));
+ continue;
+ }
+ printk("%s ", partition_name(rdev->dev));
+ if (!rdev->faulty && !rdev->alias_device) {
+ printk("[events: %08lx]",
+ (unsigned long)rdev->sb->events_lo);
+ err += write_disk_sb(rdev);
+ } else
+ printk(")\n");
+ }
+ if (err) {
+ if (--count) {
+ printk(KERN_ERR "md: errors occurred during superblock update, repeating\n");
+ goto repeat;
+ }
+ printk(KERN_ERR "md: excessive errors occurred during superblock update, exiting\n");
+ }
+ return 0;
+}
+
+/*
+ * Import a device. If 'on_disk', then sanity check the superblock
+ *
+ * mark the device faulty if:
+ *
+ * - the device is nonexistent (zero size)
+ * - the device has no valid superblock
+ *
+ */
+static int md_import_device(kdev_t newdev, int on_disk)
+{
+ int err;
+ mdk_rdev_t *rdev;
+ unsigned int size;
+
+ if (find_rdev_all(newdev))
+ return -EEXIST;
+
+ rdev = (mdk_rdev_t *) kmalloc(sizeof(*rdev), GFP_KERNEL);
+ if (!rdev) {
+ printk(KERN_ERR "md: could not alloc mem for %s!\n", partition_name(newdev));
+ return -ENOMEM;
+ }
+ memset(rdev, 0, sizeof(*rdev));
+
+ if (is_mounted(newdev)) {
+ printk(KERN_WARNING "md: can not import %s, has active inodes!\n",
+ partition_name(newdev));
+ err = -EBUSY;
+ goto abort_free;
+ }
+
+ if ((err = alloc_disk_sb(rdev)))
+ goto abort_free;
+
+ rdev->dev = newdev;
+ if (lock_rdev(rdev)) {
+ printk(KERN_ERR "md: could not lock %s, zero-size? Marking faulty.\n",
+ partition_name(newdev));
+ err = -EINVAL;
+ goto abort_free;
+ }
+ rdev->desc_nr = -1;
+ rdev->faulty = 0;
+
+ size = 0;
+ if (blk_size[MAJOR(newdev)])
+ size = blk_size[MAJOR(newdev)][MINOR(newdev)];
+ if (!size) {
+ printk(KERN_WARNING "md: %s has zero size, marking faulty!\n",
+ partition_name(newdev));
+ err = -EINVAL;
+ goto abort_free;
+ }
+
+ if (on_disk) {
+ if ((err = read_disk_sb(rdev))) {
+ printk(KERN_WARNING "md: could not read %s's sb, not importing!\n",
+ partition_name(newdev));
+ goto abort_free;
+ }
+ if ((err = check_disk_sb(rdev))) {
+ printk(KERN_WARNING "md: %s has invalid sb, not importing!\n",
+ partition_name(newdev));
+ goto abort_free;
+ }
+
+ if (rdev->sb->level != -4) {
+ rdev->old_dev = MKDEV(rdev->sb->this_disk.major,
+ rdev->sb->this_disk.minor);
+ rdev->desc_nr = rdev->sb->this_disk.number;
+ } else {
+ rdev->old_dev = MKDEV(0, 0);
+ rdev->desc_nr = -1;
+ }
+ }
+ md_list_add(&rdev->all, &all_raid_disks);
+ MD_INIT_LIST_HEAD(&rdev->pending);
+ INIT_LIST_HEAD(&rdev->same_set);
+
+ return 0;
+
+abort_free:
+ if (rdev->sb) {
+ if (rdev->bdev)
+ unlock_rdev(rdev);
+ free_disk_sb(rdev);
+ }
+ kfree(rdev);
+ return err;
+}
+
+/*
+ * Check a full RAID array for plausibility
+ */
+
+#define INCONSISTENT KERN_ERR \
+"md: fatal superblock inconsistency in %s -- removing from array\n"
+
+#define OUT_OF_DATE KERN_ERR \
+"md: superblock update time inconsistency -- using the most recent one\n"
+
+#define OLD_VERSION KERN_ALERT \
+"md: md%d: unsupported raid array version %d.%d.%d\n"
+
+#define NOT_CLEAN_IGNORE KERN_ERR \
+"md: md%d: raid array is not clean -- starting background reconstruction\n"
+
+#define UNKNOWN_LEVEL KERN_ERR \
+"md: md%d: unsupported raid level %d\n"
+
+static int analyze_sbs(mddev_t * mddev)
+{
+ int out_of_date = 0, i, first;
+ struct md_list_head *tmp, *tmp2;
+ mdk_rdev_t *rdev, *rdev2, *freshest;
+ mdp_super_t *sb;
+
+ /*
+ * Verify the RAID superblock on each real device
+ */
+ ITERATE_RDEV(mddev,rdev,tmp) {
+ if (rdev->faulty) {
+ MD_BUG();
+ goto abort;
+ }
+ if (!rdev->sb) {
+ MD_BUG();
+ goto abort;
+ }
+ if (check_disk_sb(rdev))
+ goto abort;
+ }
+
+ /*
+ * The superblock constant part has to be the same
+ * for all disks in the array.
+ */
+ sb = NULL;
+
+ ITERATE_RDEV(mddev,rdev,tmp) {
+ if (!sb) {
+ sb = rdev->sb;
+ continue;
+ }
+ if (!sb_equal(sb, rdev->sb)) {
+ printk(INCONSISTENT, partition_name(rdev->dev));
+ kick_rdev_from_array(rdev);
+ continue;
+ }
+ }
+
+ /*
+ * OK, we have all disks and the array is ready to run. Let's
+ * find the freshest superblock, that one will be the superblock
+ * that represents the whole array.
+ */
+ if (!mddev->sb)
+ if (alloc_array_sb(mddev))
+ goto abort;
+ sb = mddev->sb;
+ freshest = NULL;
+
+ ITERATE_RDEV(mddev,rdev,tmp) {
+ __u64 ev1, ev2;
+ /*
+ * if the checksum is invalid, use the superblock
+	 * only as a last resort. (decrease its age by
+ * one event)
+ */
+ if (calc_sb_csum(rdev->sb) != rdev->sb->sb_csum) {
+ if (rdev->sb->events_lo || rdev->sb->events_hi)
+ if ((rdev->sb->events_lo--)==0)
+ rdev->sb->events_hi--;
+ }
+
+ printk(KERN_INFO "md: %s's event counter: %08lx\n",
+ partition_name(rdev->dev),
+ (unsigned long)rdev->sb->events_lo);
+ if (!freshest) {
+ freshest = rdev;
+ continue;
+ }
+ /*
+ * Find the newest superblock version
+ */
+ ev1 = md_event(rdev->sb);
+ ev2 = md_event(freshest->sb);
+ if (ev1 != ev2) {
+ out_of_date = 1;
+ if (ev1 > ev2)
+ freshest = rdev;
+ }
+ }
+ if (out_of_date) {
+ printk(OUT_OF_DATE);
+ printk(KERN_INFO "md: freshest: %s\n", partition_name(freshest->dev));
+ }
+ memcpy (sb, freshest->sb, sizeof(*sb));
+
+ /*
+ * at this point we have picked the 'best' superblock
+ * from all available superblocks.
+ * now we validate this superblock and kick out possibly
+ * failed disks.
+ */
+ ITERATE_RDEV(mddev,rdev,tmp) {
+ /*
+ * Kick all non-fresh devices
+ */
+ __u64 ev1, ev2;
+ ev1 = md_event(rdev->sb);
+ ev2 = md_event(sb);
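+		/* a device at most one event behind the chosen superblock still counts as fresh */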
+ ++ev1;
+ if (ev1 < ev2) {
+ printk(KERN_WARNING "md: kicking non-fresh %s from array!\n",
+ partition_name(rdev->dev));
+ kick_rdev_from_array(rdev);
+ continue;
+ }
+ }
+
+ /*
+ * Fix up changed device names ... but only if this disk has a
+ * recent update time. Use faulty checksum ones too.
+ */
+ if (mddev->sb->level != -4)
+ ITERATE_RDEV(mddev,rdev,tmp) {
+ __u64 ev1, ev2, ev3;
+ if (rdev->faulty || rdev->alias_device) {
+ MD_BUG();
+ goto abort;
+ }
+ ev1 = md_event(rdev->sb);
+ ev2 = md_event(sb);
+ ev3 = ev2;
+ --ev3;
+ if ((rdev->dev != rdev->old_dev) &&
+ ((ev1 == ev2) || (ev1 == ev3))) {
+ mdp_disk_t *desc;
+
+ printk(KERN_WARNING "md: device name has changed from %s to %s since last import!\n",
+ partition_name(rdev->old_dev), partition_name(rdev->dev));
+ if (rdev->desc_nr == -1) {
+ MD_BUG();
+ goto abort;
+ }
+ desc = &sb->disks[rdev->desc_nr];
+ if (rdev->old_dev != MKDEV(desc->major, desc->minor)) {
+ MD_BUG();
+ goto abort;
+ }
+ desc->major = MAJOR(rdev->dev);
+ desc->minor = MINOR(rdev->dev);
+ desc = &rdev->sb->this_disk;
+ desc->major = MAJOR(rdev->dev);
+ desc->minor = MINOR(rdev->dev);
+ }
+ }
+
+ /*
+ * Remove unavailable and faulty devices ...
+ *
+ * note that if an array becomes completely unrunnable due to
+ * missing devices, we do not write the superblock back, so the
+ * administrator has a chance to fix things up. The removal thus
+ * only happens if it's nonfatal to the contents of the array.
+ */
+ for (i = 0; i < MD_SB_DISKS; i++) {
+ int found;
+ mdp_disk_t *desc;
+ kdev_t dev;
+
+ desc = sb->disks + i;
+ dev = MKDEV(desc->major, desc->minor);
+
+ /*
+ * We kick faulty devices/descriptors immediately.
+ *
+ * Note: multipath devices are a special case. Since we
+ * were able to read the superblock on the path, we don't
+ * care if it was previously marked as faulty, it's up now
+ * so enable it.
+ */
+ if (disk_faulty(desc) && mddev->sb->level != -4) {
+ found = 0;
+ ITERATE_RDEV(mddev,rdev,tmp) {
+ if (rdev->desc_nr != desc->number)
+ continue;
+ printk(KERN_WARNING "md%d: kicking faulty %s!\n",
+ mdidx(mddev),partition_name(rdev->dev));
+ kick_rdev_from_array(rdev);
+ found = 1;
+ break;
+ }
+ if (!found) {
+ if (dev == MKDEV(0,0))
+ continue;
+ printk(KERN_WARNING "md%d: removing former faulty %s!\n",
+ mdidx(mddev), partition_name(dev));
+ }
+ remove_descriptor(desc, sb);
+ continue;
+ } else if (disk_faulty(desc)) {
+ /*
+ * multipath entry marked as faulty, unfaulty it
+ */
+ rdev = find_rdev(mddev, dev);
+ if(rdev)
+ mark_disk_spare(desc);
+ else
+ remove_descriptor(desc, sb);
+ }
+
+ if (dev == MKDEV(0,0))
+ continue;
+ /*
+ * Is this device present in the rdev ring?
+ */
+ found = 0;
+ ITERATE_RDEV(mddev,rdev,tmp) {
+ /*
+ * Multi-path IO special-case: since we have no
+ * this_disk descriptor at auto-detect time,
+ * we cannot check rdev->number.
+ * We can check the device though.
+ */
+ if ((sb->level == -4) && (rdev->dev ==
+ MKDEV(desc->major,desc->minor))) {
+ found = 1;
+ break;
+ }
+ if (rdev->desc_nr == desc->number) {
+ found = 1;
+ break;
+ }
+ }
+ if (found)
+ continue;
+
+ printk(KERN_WARNING "md%d: former device %s is unavailable, removing from array!\n",
+ mdidx(mddev), partition_name(dev));
+ remove_descriptor(desc, sb);
+ }
+
+ /*
+	 * Double check whether all devices mentioned in the
+ * superblock are in the rdev ring.
+ */
+ first = 1;
+ for (i = 0; i < MD_SB_DISKS; i++) {
+ mdp_disk_t *desc;
+ kdev_t dev;
+
+ desc = sb->disks + i;
+ dev = MKDEV(desc->major, desc->minor);
+
+ if (dev == MKDEV(0,0))
+ continue;
+
+ if (disk_faulty(desc)) {
+ MD_BUG();
+ goto abort;
+ }
+
+ rdev = find_rdev(mddev, dev);
+ if (!rdev) {
+ MD_BUG();
+ goto abort;
+ }
+ /*
+ * In the case of Multipath-IO, we have no
+ * other information source to find out which
+ * disk is which, only the position of the device
+ * in the superblock:
+ */
+ if (mddev->sb->level == -4) {
+ if ((rdev->desc_nr != -1) && (rdev->desc_nr != i)) {
+ MD_BUG();
+ goto abort;
+ }
+ rdev->desc_nr = i;
+ if (!first)
+ rdev->alias_device = 1;
+ else
+ first = 0;
+ }
+ }
+
+ /*
+ * Kick all rdevs that are not in the
+ * descriptor array:
+ */
+ ITERATE_RDEV(mddev,rdev,tmp) {
+ if (rdev->desc_nr == -1)
+ kick_rdev_from_array(rdev);
+ }
+
+ /*
+ * Do a final reality check.
+ */
+ if (mddev->sb->level != -4) {
+ ITERATE_RDEV(mddev,rdev,tmp) {
+ if (rdev->desc_nr == -1) {
+ MD_BUG();
+ goto abort;
+ }
+ /*
+ * is the desc_nr unique?
+ */
+ ITERATE_RDEV(mddev,rdev2,tmp2) {
+ if ((rdev2 != rdev) &&
+ (rdev2->desc_nr == rdev->desc_nr)) {
+ MD_BUG();
+ goto abort;
+ }
+ }
+ /*
+ * is the device unique?
+ */
+ ITERATE_RDEV(mddev,rdev2,tmp2) {
+ if ((rdev2 != rdev) &&
+ (rdev2->dev == rdev->dev)) {
+ MD_BUG();
+ goto abort;
+ }
+ }
+ }
+ }
+
+ /*
+ * Check if we can support this RAID array
+ */
+ if (sb->major_version != MD_MAJOR_VERSION ||
+ sb->minor_version > MD_MINOR_VERSION) {
+
+ printk(OLD_VERSION, mdidx(mddev), sb->major_version,
+ sb->minor_version, sb->patch_version);
+ goto abort;
+ }
+
+ if ((sb->state != (1 << MD_SB_CLEAN)) && ((sb->level == 1) ||
+ (sb->level == 4) || (sb->level == 5)))
+ printk(NOT_CLEAN_IGNORE, mdidx(mddev));
+
+ return 0;
+abort:
+ return 1;
+}
+
+#undef INCONSISTENT
+#undef OUT_OF_DATE
+#undef OLD_VERSION
+#undef OLD_LEVEL
+
+static int device_size_calculation(mddev_t * mddev)
+{
+ int data_disks = 0, persistent;
+ unsigned int readahead;
+ mdp_super_t *sb = mddev->sb;
+ struct md_list_head *tmp;
+ mdk_rdev_t *rdev;
+
+ /*
+ * Do device size calculation. Bail out if too small.
+ * (we have to do this after having validated chunk_size,
+ * because device size has to be modulo chunk_size)
+ */
+ persistent = !mddev->sb->not_persistent;
+ ITERATE_RDEV(mddev,rdev,tmp) {
+ if (rdev->faulty)
+ continue;
+ if (rdev->size) {
+ MD_BUG();
+ continue;
+ }
+ rdev->size = calc_dev_size(rdev->dev, mddev, persistent);
+ if (rdev->size < sb->chunk_size / 1024) {
+ printk(KERN_WARNING
+ "md: Dev %s smaller than chunk_size: %ldk < %dk\n",
+ partition_name(rdev->dev),
+ rdev->size, sb->chunk_size / 1024);
+ return -EINVAL;
+ }
+ }
+
+ switch (sb->level) {
+ case -4:
+ data_disks = 1;
+ break;
+ case -3:
+ data_disks = 1;
+ break;
+ case -2:
+ data_disks = 1;
+ break;
+ case -1:
+ zoned_raid_size(mddev);
+ data_disks = 1;
+ break;
+ case 0:
+ zoned_raid_size(mddev);
+ data_disks = sb->raid_disks;
+ break;
+ case 1:
+ data_disks = 1;
+ break;
+ case 4:
+ case 5:
+ data_disks = sb->raid_disks-1;
+ break;
+ default:
+ printk(UNKNOWN_LEVEL, mdidx(mddev), sb->level);
+ goto abort;
+ }
+ if (!md_size[mdidx(mddev)])
+ md_size[mdidx(mddev)] = sb->size * data_disks;
+
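+	/* striped levels (0, 4, 5) get a larger readahead window, scaled by chunk size and data-disk count */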
+ readahead = MD_READAHEAD;
+ if ((sb->level == 0) || (sb->level == 4) || (sb->level == 5)) {
+ readahead = (mddev->sb->chunk_size>>PAGE_SHIFT) * 4 * data_disks;
+ if (readahead < data_disks * (MAX_SECTORS>>(PAGE_SHIFT-9))*2)
+ readahead = data_disks * (MAX_SECTORS>>(PAGE_SHIFT-9))*2;
+ } else {
+ // (no multipath branch - it uses the default setting)
+ if (sb->level == -3)
+ readahead = 0;
+ }
+
+ printk(KERN_INFO "md%d: max total readahead window set to %ldk\n",
+ mdidx(mddev), readahead*(PAGE_SIZE/1024));
+
+ printk(KERN_INFO
+ "md%d: %d data-disks, max readahead per data-disk: %ldk\n",
+ mdidx(mddev), data_disks, readahead/data_disks*(PAGE_SIZE/1024));
+ return 0;
+abort:
+ return 1;
+}
+
+
+#define TOO_BIG_CHUNKSIZE KERN_ERR \
+"too big chunk_size: %d > %d\n"
+
+#define TOO_SMALL_CHUNKSIZE KERN_ERR \
+"too small chunk_size: %d < %ld\n"
+
+#define BAD_CHUNKSIZE KERN_ERR \
+"no chunksize specified, see 'man raidtab'\n"
+
+static int do_md_run(mddev_t * mddev)
+{
+ int pnum, err;
+ int chunk_size;
+ struct md_list_head *tmp;
+ mdk_rdev_t *rdev;
+
+
+ if (list_empty(&mddev->disks)) {
+ MD_BUG();
+ return -EINVAL;
+ }
+
+ if (mddev->pers)
+ return -EBUSY;
+
+ /*
+	 * Resize disks to align partition sizes to a given
+ * chunk size.
+ */
+ md_size[mdidx(mddev)] = 0;
+
+ /*
+ * Analyze all RAID superblock(s)
+ */
+ if (analyze_sbs(mddev)) {
+ MD_BUG();
+ return -EINVAL;
+ }
+
+ chunk_size = mddev->sb->chunk_size;
+ pnum = level_to_pers(mddev->sb->level);
+
+ if ((pnum != MULTIPATH) && (pnum != RAID1)) {
+ if (!chunk_size) {
+ /*
+ * 'default chunksize' in the old md code used to
+ * be PAGE_SIZE, baaad.
+			 * we abort here to be on the safe side. We don't
+ * want to continue the bad practice.
+ */
+ printk(BAD_CHUNKSIZE);
+ return -EINVAL;
+ }
+ if (chunk_size > MAX_CHUNK_SIZE) {
+ printk(TOO_BIG_CHUNKSIZE, chunk_size, MAX_CHUNK_SIZE);
+ return -EINVAL;
+ }
+ /*
+		 * chunk-size has to be a power of 2 and a multiple of PAGE_SIZE
+ */
+ if ( (1 << ffz(~chunk_size)) != chunk_size) {
+ MD_BUG();
+ return -EINVAL;
+ }
+ if (chunk_size < PAGE_SIZE) {
+ printk(TOO_SMALL_CHUNKSIZE, chunk_size, PAGE_SIZE);
+ return -EINVAL;
+ }
+ } else
+ if (chunk_size)
+ printk(KERN_INFO "md: RAID level %d does not need chunksize! Continuing anyway.\n",
+ mddev->sb->level);
+
+ if (pnum >= MAX_PERSONALITY) {
+ MD_BUG();
+ return -EINVAL;
+ }
+
+ if (!pers[pnum])
+ {
+#ifdef CONFIG_KMOD
+ char module_name[80];
+ sprintf (module_name, "md-personality-%d", pnum);
+ request_module (module_name);
+ if (!pers[pnum])
+#endif
+ {
+ printk(KERN_ERR "md: personality %d is not loaded!\n",
+ pnum);
+ return -EINVAL;
+ }
+ }
+
+ if (device_size_calculation(mddev))
+ return -EINVAL;
+
+ /*
+ * Drop all container device buffers, from now on
+ * the only valid external interface is through the md
+ * device.
+ * Also find largest hardsector size
+ */
+ md_hardsect_sizes[mdidx(mddev)] = 512;
+ ITERATE_RDEV(mddev,rdev,tmp) {
+ if (rdev->faulty)
+ continue;
+ invalidate_device(rdev->dev, 1);
+ if (get_hardsect_size(rdev->dev)
+ > md_hardsect_sizes[mdidx(mddev)])
+ md_hardsect_sizes[mdidx(mddev)] =
+ get_hardsect_size(rdev->dev);
+ }
+ md_blocksizes[mdidx(mddev)] = 1024;
+ if (md_blocksizes[mdidx(mddev)] < md_hardsect_sizes[mdidx(mddev)])
+ md_blocksizes[mdidx(mddev)] = md_hardsect_sizes[mdidx(mddev)];
+ mddev->pers = pers[pnum];
+
+ blk_queue_make_request(&mddev->queue, mddev->pers->make_request);
+ mddev->queue.queuedata = mddev;
+
+ err = mddev->pers->run(mddev);
+ if (err) {
+ printk(KERN_ERR "md: pers->run() failed ...\n");
+ mddev->pers = NULL;
+ return -EINVAL;
+ }
+
+ mddev->sb->state &= ~(1 << MD_SB_CLEAN);
+ mddev->sb_dirty = 1;
+ md_update_sb(mddev);
+
+ /*
+ * md_size has units of 1K blocks, which are
+ * twice as large as sectors.
+ */
+ md_hd_struct[mdidx(mddev)].start_sect = 0;
+ register_disk(&md_gendisk, MKDEV(MAJOR_NR,mdidx(mddev)),
+ 1, &md_fops, md_size[mdidx(mddev)]<<1);
+
+ read_ahead[MD_MAJOR] = 1024;
+ return (0);
+}
+
+#undef TOO_BIG_CHUNKSIZE
+#undef BAD_CHUNKSIZE
+
+static int restart_array(mddev_t *mddev)
+{
+ int err;
+
+ /*
+ * Complain if it has no devices
+ */
+ err = -ENXIO;
+ if (list_empty(&mddev->disks))
+ goto out;
+
+ if (mddev->pers) {
+ err = -EBUSY;
+ if (!mddev->ro)
+ goto out;
+
+ mddev->ro = 0;
+ set_device_ro(mddev_to_kdev(mddev), 0);
+
+ printk(KERN_INFO
+ "md: md%d switched to read-write mode.\n", mdidx(mddev));
+ /*
+ * Kick recovery or resync if necessary
+ */
+ md_recover_arrays();
+ if (mddev->pers->restart_resync)
+ mddev->pers->restart_resync(mddev);
+ err = 0;
+ } else {
+ printk(KERN_ERR "md: md%d has no personality assigned.\n",
+ mdidx(mddev));
+ err = -EINVAL;
+ }
+
+out:
+ return err;
+}
+
+#define STILL_MOUNTED KERN_WARNING \
+"md: md%d still mounted.\n"
+#define STILL_IN_USE \
+"md: md%d still in use.\n"
+
+static int do_md_stop(mddev_t * mddev, int ro)
+{
+ int err = 0, resync_interrupted = 0;
+ kdev_t dev = mddev_to_kdev(mddev);
+
+ if (atomic_read(&mddev->active)>1) {
+ printk(STILL_IN_USE, mdidx(mddev));
+ err = -EBUSY;
+ goto out;
+ }
+
+ if (mddev->pers) {
+ /*
+ * It is safe to call stop here, it only frees private
+ * data. Also, it tells us if a device is unstoppable
+ * (eg. resyncing is in progress)
+ */
+ if (mddev->pers->stop_resync)
+ if (mddev->pers->stop_resync(mddev))
+ resync_interrupted = 1;
+
+ if (mddev->recovery_running)
+ md_interrupt_thread(md_recovery_thread);
+
+ /*
+ * This synchronizes with signal delivery to the
+ * resync or reconstruction thread. It also nicely
+ * hangs the process if some reconstruction has not
+ * finished.
+ */
+ down(&mddev->recovery_sem);
+ up(&mddev->recovery_sem);
+
+ invalidate_device(dev, 1);
+
+ if (ro) {
+ err = -ENXIO;
+ if (mddev->ro)
+ goto out;
+ mddev->ro = 1;
+ } else {
+ if (mddev->ro)
+ set_device_ro(dev, 0);
+ if (mddev->pers->stop(mddev)) {
+ err = -EBUSY;
+ if (mddev->ro)
+ set_device_ro(dev, 1);
+ goto out;
+ }
+ if (mddev->ro)
+ mddev->ro = 0;
+ }
+ if (mddev->sb) {
+ /*
+ * mark it clean only if there was no resync
+ * interrupted.
+ */
+ if (!mddev->recovery_running && !resync_interrupted) {
+ printk(KERN_INFO "md: marking sb clean...\n");
+ mddev->sb->state |= 1 << MD_SB_CLEAN;
+ }
+ mddev->sb_dirty = 1;
+ md_update_sb(mddev);
+ }
+ if (ro)
+ set_device_ro(dev, 1);
+ }
+
+ /*
+ * Free resources if final stop
+ */
+ if (!ro) {
+ printk(KERN_INFO "md: md%d stopped.\n", mdidx(mddev));
+ free_mddev(mddev);
+ } else
+ printk(KERN_INFO "md: md%d switched to read-only mode.\n", mdidx(mddev));
+ err = 0;
+out:
+ return err;
+}
+
+/*
+ * We have to safely support old arrays too.
+ */
+int detect_old_array(mdp_super_t *sb)
+{
+ if (sb->major_version > 0)
+ return 0;
+ if (sb->minor_version >= 90)
+ return 0;
+
+ return -EINVAL;
+}
+
+
+static void autorun_array(mddev_t *mddev)
+{
+ mdk_rdev_t *rdev;
+ struct md_list_head *tmp;
+ int err;
+
+ if (list_empty(&mddev->disks)) {
+ MD_BUG();
+ return;
+ }
+
+ printk(KERN_INFO "md: running: ");
+
+ ITERATE_RDEV(mddev,rdev,tmp) {
+ printk("<%s>", partition_name(rdev->dev));
+ }
+ printk("\n");
+
+ err = do_md_run (mddev);
+ if (err) {
+		printk(KERN_WARNING "md: do_md_run() returned %d\n", err);
+ /*
+ * prevent the writeback of an unrunnable array
+ */
+ mddev->sb_dirty = 0;
+ do_md_stop (mddev, 0);
+ }
+}
+
+/*
+ * let's try to run arrays based on all disks that have arrived
+ * until now. (those are in the ->pending list)
+ *
+ * the method: pick the first pending disk, collect all disks with
+ * the same UUID, remove all from the pending list and put them into
+ * the 'same_array' list. Then order this list based on superblock
+ * update time (freshest comes first), kick out 'old' disks and
+ * compare superblocks. If everything's fine then run it.
+ *
+ * If "unit" is allocated, then bump its reference count
+ */
+static void autorun_devices(kdev_t countdev)
+{
+ struct md_list_head candidates;
+ struct md_list_head *tmp;
+ mdk_rdev_t *rdev0, *rdev;
+ mddev_t *mddev;
+ kdev_t md_kdev;
+
+
+ printk(KERN_INFO "md: autorun ...\n");
+ while (!list_empty(&pending_raid_disks)) {
+ rdev0 = md_list_entry(pending_raid_disks.next,
+ mdk_rdev_t, pending);
+
+ printk(KERN_INFO "md: considering %s ...\n", partition_name(rdev0->dev));
+ MD_INIT_LIST_HEAD(&candidates);
+ ITERATE_RDEV_PENDING(rdev,tmp) {
+ if (uuid_equal(rdev0, rdev)) {
+ if (!sb_equal(rdev0->sb, rdev->sb)) {
+ printk(KERN_WARNING
+ "md: %s has same UUID as %s, but superblocks differ ...\n",
+ partition_name(rdev->dev), partition_name(rdev0->dev));
+ continue;
+ }
+ printk(KERN_INFO "md: adding %s ...\n", partition_name(rdev->dev));
+ md_list_del(&rdev->pending);
+ md_list_add(&rdev->pending, &candidates);
+ }
+ }
+ /*
+ * now we have a set of devices, with all of them having
+ * mostly sane superblocks. It's time to allocate the
+ * mddev.
+ */
+ md_kdev = MKDEV(MD_MAJOR, rdev0->sb->md_minor);
+ mddev = kdev_to_mddev(md_kdev);
+ if (mddev) {
+ printk(KERN_WARNING "md: md%d already running, cannot run %s\n",
+ mdidx(mddev), partition_name(rdev0->dev));
+ ITERATE_RDEV_GENERIC(candidates,pending,rdev,tmp)
+ export_rdev(rdev);
+ continue;
+ }
+ mddev = alloc_mddev(md_kdev);
+ if (!mddev) {
+ printk(KERN_ERR "md: cannot allocate memory for md drive.\n");
+ break;
+ }
+ if (md_kdev == countdev)
+ atomic_inc(&mddev->active);
+ printk(KERN_INFO "md: created md%d\n", mdidx(mddev));
+ ITERATE_RDEV_GENERIC(candidates,pending,rdev,tmp) {
+ bind_rdev_to_array(rdev, mddev);
+ list_del_init(&rdev->pending);
+ }
+ autorun_array(mddev);
+ }
+ printk(KERN_INFO "md: ... autorun DONE.\n");
+}
+
+/*
+ * import RAID devices based on one partition
+ * if possible, the array gets run as well.
+ */
+
+#define BAD_VERSION KERN_ERR \
+"md: %s has RAID superblock version 0.%d, autodetect needs v0.90 or higher\n"
+
+#define OUT_OF_MEM KERN_ALERT \
+"md: out of memory.\n"
+
+#define NO_DEVICE KERN_ERR \
+"md: disabled device %s\n"
+
+#define AUTOADD_FAILED KERN_ERR \
+"md: auto-adding devices to md%d FAILED (error %d).\n"
+
+#define AUTOADD_FAILED_USED KERN_ERR \
+"md: cannot auto-add device %s to md%d, already used.\n"
+
+#define AUTORUN_FAILED KERN_ERR \
+"md: auto-running md%d FAILED (error %d).\n"
+
+#define MDDEV_BUSY KERN_ERR \
+"md: cannot auto-add to md%d, already running.\n"
+
+#define AUTOADDING KERN_INFO \
+"md: auto-adding devices to md%d, based on %s's superblock.\n"
+
+#define AUTORUNNING KERN_INFO \
+"md: auto-running md%d.\n"
+
+static int autostart_array(kdev_t startdev, kdev_t countdev)
+{
+ int err = -EINVAL, i;
+ mdp_super_t *sb = NULL;
+ mdk_rdev_t *start_rdev = NULL, *rdev;
+
+ if (md_import_device(startdev, 1)) {
+ printk(KERN_WARNING "md: could not import %s!\n", partition_name(startdev));
+ goto abort;
+ }
+
+ start_rdev = find_rdev_all(startdev);
+ if (!start_rdev) {
+ MD_BUG();
+ goto abort;
+ }
+ if (start_rdev->faulty) {
+ printk(KERN_WARNING "md: can not autostart based on faulty %s!\n",
+ partition_name(startdev));
+ goto abort;
+ }
+ md_list_add(&start_rdev->pending, &pending_raid_disks);
+
+ sb = start_rdev->sb;
+
+ err = detect_old_array(sb);
+ if (err) {
+		printk(KERN_WARNING "md: array version is too old to be autostarted, "
+ "use raidtools 0.90 mkraid --upgrade to upgrade the array "
+ "without data loss!\n");
+ goto abort;
+ }
+
+ for (i = 0; i < MD_SB_DISKS; i++) {
+ mdp_disk_t *desc;
+ kdev_t dev;
+
+ desc = sb->disks + i;
+ dev = MKDEV(desc->major, desc->minor);
+
+ if (dev == MKDEV(0,0))
+ continue;
+ if (dev == startdev)
+ continue;
+ if (md_import_device(dev, 1)) {
+ printk(KERN_WARNING "md: could not import %s, trying to run array nevertheless.\n",
+ partition_name(dev));
+ continue;
+ }
+ rdev = find_rdev_all(dev);
+ if (!rdev) {
+ MD_BUG();
+ goto abort;
+ }
+ md_list_add(&rdev->pending, &pending_raid_disks);
+ }
+
+ /*
+ * possibly return codes
+ */
+ autorun_devices(countdev);
+ return 0;
+
+abort:
+ if (start_rdev)
+ export_rdev(start_rdev);
+ return err;
+}
+
+#undef BAD_VERSION
+#undef OUT_OF_MEM
+#undef NO_DEVICE
+#undef AUTOADD_FAILED_USED
+#undef AUTOADD_FAILED
+#undef AUTORUN_FAILED
+#undef AUTOADDING
+#undef AUTORUNNING
+
+
+static int get_version(void * arg)
+{
+ mdu_version_t ver;
+
+ ver.major = MD_MAJOR_VERSION;
+ ver.minor = MD_MINOR_VERSION;
+ ver.patchlevel = MD_PATCHLEVEL_VERSION;
+
+ if (md_copy_to_user(arg, &ver, sizeof(ver)))
+ return -EFAULT;
+
+ return 0;
+}
+
+#define SET_FROM_SB(x) info.x = mddev->sb->x
+static int get_array_info(mddev_t * mddev, void * arg)
+{
+ mdu_array_info_t info;
+
+ if (!mddev->sb) {
+ MD_BUG();
+ return -EINVAL;
+ }
+
+ SET_FROM_SB(major_version);
+ SET_FROM_SB(minor_version);
+ SET_FROM_SB(patch_version);
+ SET_FROM_SB(ctime);
+ SET_FROM_SB(level);
+ SET_FROM_SB(size);
+ SET_FROM_SB(nr_disks);
+ SET_FROM_SB(raid_disks);
+ SET_FROM_SB(md_minor);
+ SET_FROM_SB(not_persistent);
+
+ SET_FROM_SB(utime);
+ SET_FROM_SB(state);
+ SET_FROM_SB(active_disks);
+ SET_FROM_SB(working_disks);
+ SET_FROM_SB(failed_disks);
+ SET_FROM_SB(spare_disks);
+
+ SET_FROM_SB(layout);
+ SET_FROM_SB(chunk_size);
+
+ if (md_copy_to_user(arg, &info, sizeof(info)))
+ return -EFAULT;
+
+ return 0;
+}
+#undef SET_FROM_SB
+
+#define SET_FROM_SB(x) info.x = mddev->sb->disks[nr].x
+static int get_disk_info(mddev_t * mddev, void * arg)
+{
+ mdu_disk_info_t info;
+ unsigned int nr;
+
+ if (!mddev->sb)
+ return -EINVAL;
+
+ if (md_copy_from_user(&info, arg, sizeof(info)))
+ return -EFAULT;
+
+ nr = info.number;
+ if (nr >= MD_SB_DISKS)
+ return -EINVAL;
+
+ SET_FROM_SB(major);
+ SET_FROM_SB(minor);
+ SET_FROM_SB(raid_disk);
+ SET_FROM_SB(state);
+
+ if (md_copy_to_user(arg, &info, sizeof(info)))
+ return -EFAULT;
+
+ return 0;
+}
+#undef SET_FROM_SB
+
+#define SET_SB(x) mddev->sb->disks[nr].x = info->x
+
+static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info)
+{
+ int err, size, persistent;
+ mdk_rdev_t *rdev;
+ unsigned int nr;
+ kdev_t dev;
+ dev = MKDEV(info->major,info->minor);
+
+ if (find_rdev_all(dev)) {
+ printk(KERN_WARNING "md: device %s already used in a RAID array!\n",
+ partition_name(dev));
+ return -EBUSY;
+ }
+ if (!mddev->sb) {
+ /* expecting a device which has a superblock */
+ err = md_import_device(dev, 1);
+ if (err) {
+ printk(KERN_WARNING "md: md_import_device returned %d\n", err);
+ return -EINVAL;
+ }
+ rdev = find_rdev_all(dev);
+ if (!rdev) {
+ MD_BUG();
+ return -EINVAL;
+ }
+ if (!list_empty(&mddev->disks)) {
+ mdk_rdev_t *rdev0 = md_list_entry(mddev->disks.next,
+ mdk_rdev_t, same_set);
+ if (!uuid_equal(rdev0, rdev)) {
+ printk(KERN_WARNING "md: %s has different UUID to %s\n",
+ partition_name(rdev->dev), partition_name(rdev0->dev));
+ export_rdev(rdev);
+ return -EINVAL;
+ }
+ if (!sb_equal(rdev0->sb, rdev->sb)) {
+ printk(KERN_WARNING "md: %s has same UUID but different superblock to %s\n",
+ partition_name(rdev->dev), partition_name(rdev0->dev));
+ export_rdev(rdev);
+ return -EINVAL;
+ }
+ }
+ bind_rdev_to_array(rdev, mddev);
+ return 0;
+ }
+
+ nr = info->number;
+ if (nr >= mddev->sb->nr_disks) {
+ MD_BUG();
+ return -EINVAL;
+ }
+
+
+ SET_SB(number);
+ SET_SB(major);
+ SET_SB(minor);
+ SET_SB(raid_disk);
+ SET_SB(state);
+
+ if ((info->state & (1<<MD_DISK_FAULTY))==0) {
+ err = md_import_device (dev, 0);
+ if (err) {
+ printk(KERN_WARNING "md: error, md_import_device() returned %d\n", err);
+ return -EINVAL;
+ }
+ rdev = find_rdev_all(dev);
+ if (!rdev) {
+ MD_BUG();
+ return -EINVAL;
+ }
+
+ rdev->old_dev = dev;
+ rdev->desc_nr = info->number;
+
+ bind_rdev_to_array(rdev, mddev);
+
+ persistent = !mddev->sb->not_persistent;
+ if (!persistent)
+ printk(KERN_INFO "md: nonpersistent superblock ...\n");
+
+ size = calc_dev_size(dev, mddev, persistent);
+ rdev->sb_offset = calc_dev_sboffset(dev, mddev, persistent);
+
+ if (!mddev->sb->size || (mddev->sb->size > size))
+ mddev->sb->size = size;
+ }
+
+ /*
+ * sync all other superblocks with the main superblock
+ */
+ sync_sbs(mddev);
+
+ return 0;
+}
+#undef SET_SB
+
+static int hot_generate_error(mddev_t * mddev, kdev_t dev)
+{
+ struct request_queue *q;
+ mdk_rdev_t *rdev;
+ mdp_disk_t *disk;
+
+ if (!mddev->pers)
+ return -ENODEV;
+
+ printk(KERN_INFO "md: trying to generate %s error in md%d ... \n",
+ partition_name(dev), mdidx(mddev));
+
+ rdev = find_rdev(mddev, dev);
+ if (!rdev) {
+ MD_BUG();
+ return -ENXIO;
+ }
+
+ if (rdev->desc_nr == -1) {
+ MD_BUG();
+ return -EINVAL;
+ }
+ disk = &mddev->sb->disks[rdev->desc_nr];
+ if (!disk_active(disk))
+ return -ENODEV;
+
+ q = blk_get_queue(rdev->dev);
+ if (!q) {
+ MD_BUG();
+ return -ENODEV;
+ }
+ printk(KERN_INFO "md: okay, generating error!\n");
+// q->oneshot_error = 1; // disabled for now
+
+ return 0;
+}
+
+static int hot_remove_disk(mddev_t * mddev, kdev_t dev)
+{
+ int err;
+ mdk_rdev_t *rdev;
+ mdp_disk_t *disk;
+
+ if (!mddev->pers)
+ return -ENODEV;
+
+ printk(KERN_INFO "md: trying to remove %s from md%d ... \n",
+ partition_name(dev), mdidx(mddev));
+
+ if (!mddev->pers->diskop) {
+ printk(KERN_WARNING "md%d: personality does not support diskops!\n",
+ mdidx(mddev));
+ return -EINVAL;
+ }
+
+ rdev = find_rdev(mddev, dev);
+ if (!rdev)
+ return -ENXIO;
+
+ if (rdev->desc_nr == -1) {
+ MD_BUG();
+ return -EINVAL;
+ }
+ disk = &mddev->sb->disks[rdev->desc_nr];
+ if (disk_active(disk))
+ goto busy;
+
+ if (disk_removed(disk))
+ return -EINVAL;
+
+ err = mddev->pers->diskop(mddev, &disk, DISKOP_HOT_REMOVE_DISK);
+ if (err == -EBUSY)
+ goto busy;
+
+ if (err) {
+ MD_BUG();
+ return -EINVAL;
+ }
+
+ remove_descriptor(disk, mddev->sb);
+ kick_rdev_from_array(rdev);
+ mddev->sb_dirty = 1;
+ md_update_sb(mddev);
+
+ return 0;
+busy:
+ printk(KERN_WARNING "md: cannot remove active disk %s from md%d ... \n",
+ partition_name(dev), mdidx(mddev));
+ return -EBUSY;
+}
+
+static int hot_add_disk(mddev_t * mddev, kdev_t dev)
+{
+ int i, err, persistent;
+ unsigned int size;
+ mdk_rdev_t *rdev;
+ mdp_disk_t *disk;
+
+ if (!mddev->pers)
+ return -ENODEV;
+
+ printk(KERN_INFO "md: trying to hot-add %s to md%d ... \n",
+ partition_name(dev), mdidx(mddev));
+
+ if (!mddev->pers->diskop) {
+ printk(KERN_WARNING "md%d: personality does not support diskops!\n",
+ mdidx(mddev));
+ return -EINVAL;
+ }
+
+ persistent = !mddev->sb->not_persistent;
+
+ rdev = find_rdev(mddev, dev);
+ if (rdev)
+ return -EBUSY;
+
+ err = md_import_device (dev, 0);
+ if (err) {
+ printk(KERN_WARNING "md: error, md_import_device() returned %d\n", err);
+ return -EINVAL;
+ }
+ rdev = find_rdev_all(dev);
+ if (!rdev) {
+ MD_BUG();
+ return -EINVAL;
+ }
+ if (rdev->faulty) {
+ printk(KERN_WARNING "md: can not hot-add faulty %s disk to md%d!\n",
+ partition_name(dev), mdidx(mddev));
+ err = -EINVAL;
+ goto abort_export;
+ }
+ size = calc_dev_size(dev, mddev, persistent);
+
+ if (size < mddev->sb->size) {
+ printk(KERN_WARNING "md%d: disk size %d blocks < array size %d\n",
+ mdidx(mddev), size, mddev->sb->size);
+ err = -ENOSPC;
+ goto abort_export;
+ }
+ bind_rdev_to_array(rdev, mddev);
+
+ /*
+ * The rest should better be atomic, we can have disk failures
+ * noticed in interrupt contexts ...
+ */
+ rdev->old_dev = dev;
+ rdev->size = size;
+ rdev->sb_offset = calc_dev_sboffset(dev, mddev, persistent);
+
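+	/* look for a free descriptor slot, starting just past the active raid disks */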
+ disk = mddev->sb->disks + mddev->sb->raid_disks;
+ for (i = mddev->sb->raid_disks; i < MD_SB_DISKS; i++) {
+ disk = mddev->sb->disks + i;
+
+ if (!disk->major && !disk->minor)
+ break;
+ if (disk_removed(disk))
+ break;
+ }
+ if (i == MD_SB_DISKS) {
+ printk(KERN_WARNING "md%d: can not hot-add to full array!\n",
+ mdidx(mddev));
+ err = -EBUSY;
+ goto abort_unbind_export;
+ }
+
+ if (disk_removed(disk)) {
+ /*
+ * reuse slot
+ */
+ if (disk->number != i) {
+ MD_BUG();
+ err = -EINVAL;
+ goto abort_unbind_export;
+ }
+ } else {
+ disk->number = i;
+ }
+
+ disk->raid_disk = disk->number;
+ disk->major = MAJOR(dev);
+ disk->minor = MINOR(dev);
+
+ if (mddev->pers->diskop(mddev, &disk, DISKOP_HOT_ADD_DISK)) {
+ MD_BUG();
+ err = -EINVAL;
+ goto abort_unbind_export;
+ }
+
+ mark_disk_spare(disk);
+ mddev->sb->nr_disks++;
+ mddev->sb->spare_disks++;
+ mddev->sb->working_disks++;
+
+ mddev->sb_dirty = 1;
+ md_update_sb(mddev);
+
+ /*
+ * Kick recovery, maybe this spare has to be added to the
+ * array immediately.
+ */
+ md_recover_arrays();
+
+ return 0;
+
+abort_unbind_export:
+ unbind_rdev_from_array(rdev);
+
+abort_export:
+ export_rdev(rdev);
+ return err;
+}
+
+#define SET_SB(x) mddev->sb->x = info->x
+static int set_array_info(mddev_t * mddev, mdu_array_info_t *info)
+{
+
+ if (alloc_array_sb(mddev))
+ return -ENOMEM;
+
+ mddev->sb->major_version = MD_MAJOR_VERSION;
+ mddev->sb->minor_version = MD_MINOR_VERSION;
+ mddev->sb->patch_version = MD_PATCHLEVEL_VERSION;
+ mddev->sb->ctime = CURRENT_TIME;
+
+ SET_SB(level);
+ SET_SB(size);
+ SET_SB(nr_disks);
+ SET_SB(raid_disks);
+ SET_SB(md_minor);
+ SET_SB(not_persistent);
+
+ SET_SB(state);
+ SET_SB(active_disks);
+ SET_SB(working_disks);
+ SET_SB(failed_disks);
+ SET_SB(spare_disks);
+
+ SET_SB(layout);
+ SET_SB(chunk_size);
+
+ mddev->sb->md_magic = MD_SB_MAGIC;
+
+ /*
+ * Generate a 128 bit UUID
+ */
+ get_random_bytes(&mddev->sb->set_uuid0, 4);
+ get_random_bytes(&mddev->sb->set_uuid1, 4);
+ get_random_bytes(&mddev->sb->set_uuid2, 4);
+ get_random_bytes(&mddev->sb->set_uuid3, 4);
+
+ return 0;
+}
+#undef SET_SB
+
+static int set_disk_faulty(mddev_t *mddev, kdev_t dev)
+{
+ int ret;
+
+ ret = md_error(mddev, dev);
+ return ret;
+}
+
+static int md_ioctl(struct inode *inode, struct file *file,
+ unsigned int cmd, unsigned long arg)
+{
+ unsigned int minor;
+ int err = 0;
+ struct hd_geometry *loc = (struct hd_geometry *) arg;
+ mddev_t *mddev = NULL;
+ kdev_t dev;
+
+ if (!md_capable_admin())
+ return -EACCES;
+
+ dev = inode->i_rdev;
+ minor = MINOR(dev);
+ if (minor >= MAX_MD_DEVS) {
+ MD_BUG();
+ return -EINVAL;
+ }
+
+ /*
+ * Commands dealing with the RAID driver but not any
+ * particular array:
+ */
+ switch (cmd)
+ {
+ case RAID_VERSION:
+ err = get_version((void *)arg);
+ goto done;
+
+ case PRINT_RAID_DEBUG:
+ err = 0;
+ md_print_devices();
+ goto done_unlock;
+
+#ifndef MODULE
+ case RAID_AUTORUN:
+ err = 0;
+ autostart_arrays();
+ goto done;
+#endif
+
+ case BLKGETSIZE:
+ case BLKGETSIZE64:
+ case BLKRAGET:
+ case BLKRASET:
+ case BLKFLSBUF:
+ case BLKBSZGET:
+ case BLKBSZSET:
+ err = blk_ioctl (dev, cmd, arg);
+ goto abort;
+
+ default:;
+ }
+
+ /*
+ * Commands creating/starting a new array:
+ */
+
+ mddev = kdev_to_mddev(dev);
+
+ switch (cmd)
+ {
+ case SET_ARRAY_INFO:
+ case START_ARRAY:
+ if (mddev) {
+ printk(KERN_WARNING "md: array md%d already exists!\n",
+ mdidx(mddev));
+ err = -EEXIST;
+ goto abort;
+ }
+ default:;
+ }
+ switch (cmd)
+ {
+ case SET_ARRAY_INFO:
+ mddev = alloc_mddev(dev);
+ if (!mddev) {
+ err = -ENOMEM;
+ goto abort;
+ }
+ atomic_inc(&mddev->active);
+
+ /*
+ * alloc_mddev() should possibly self-lock.
+ */
+ err = lock_mddev(mddev);
+ if (err) {
+ printk(KERN_WARNING "md: ioctl, reason %d, cmd %d\n",
+ err, cmd);
+ goto abort;
+ }
+
+ if (mddev->sb) {
+ printk(KERN_WARNING "md: array md%d already has a superblock!\n",
+ mdidx(mddev));
+ err = -EBUSY;
+ goto abort_unlock;
+ }
+ if (arg) {
+ mdu_array_info_t info;
+ if (md_copy_from_user(&info, (void*)arg, sizeof(info))) {
+ err = -EFAULT;
+ goto abort_unlock;
+ }
+ err = set_array_info(mddev, &info);
+ if (err) {
+				printk(KERN_WARNING "md: couldn't set array info. %d\n", err);
+ goto abort_unlock;
+ }
+ }
+ goto done_unlock;
+
+ case START_ARRAY:
+ /*
+ * possibly make it lock the array ...
+ */
+ err = autostart_array((kdev_t)arg, dev);
+ if (err) {
+ printk(KERN_WARNING "md: autostart %s failed!\n",
+ partition_name((kdev_t)arg));
+ goto abort;
+ }
+ goto done;
+
+ default:;
+ }
+
+ /*
+ * Commands querying/configuring an existing array:
+ */
+
+ if (!mddev) {
+ err = -ENODEV;
+ goto abort;
+ }
+ err = lock_mddev(mddev);
+ if (err) {
+ printk(KERN_INFO "md: ioctl lock interrupted, reason %d, cmd %d\n",err, cmd);
+ goto abort;
+ }
+ /* if we don't have a superblock yet, only ADD_NEW_DISK or STOP_ARRAY is allowed */
+ if (!mddev->sb && cmd != ADD_NEW_DISK && cmd != STOP_ARRAY && cmd != RUN_ARRAY) {
+ err = -ENODEV;
+ goto abort_unlock;
+ }
+
+ /*
+ * Commands even a read-only array can execute:
+ */
+ switch (cmd)
+ {
+ case GET_ARRAY_INFO:
+ err = get_array_info(mddev, (void *)arg);
+ goto done_unlock;
+
+ case GET_DISK_INFO:
+ err = get_disk_info(mddev, (void *)arg);
+ goto done_unlock;
+
+ case RESTART_ARRAY_RW:
+ err = restart_array(mddev);
+ goto done_unlock;
+
+ case STOP_ARRAY:
+ if (!(err = do_md_stop (mddev, 0)))
+ mddev = NULL;
+ goto done_unlock;
+
+ case STOP_ARRAY_RO:
+ err = do_md_stop (mddev, 1);
+ goto done_unlock;
+
+ /*
+ * We have a problem here : there is no easy way to give a CHS
+	 * We have a problem here: there is no easy way to give a CHS
+	 * virtual geometry. We currently pretend that we have 2 heads and
+ * dosfs just mad... ;-)
+ */
+ case HDIO_GETGEO:
+ if (!loc) {
+ err = -EINVAL;
+ goto abort_unlock;
+ }
+ err = md_put_user (2, (char *) &loc->heads);
+ if (err)
+ goto abort_unlock;
+ err = md_put_user (4, (char *) &loc->sectors);
+ if (err)
+ goto abort_unlock;
+ err = md_put_user (md_hd_struct[mdidx(mddev)].nr_sects/8,
+ (short *) &loc->cylinders);
+ if (err)
+ goto abort_unlock;
+ err = md_put_user (md_hd_struct[minor].start_sect,
+ (long *) &loc->start);
+ goto done_unlock;
+ }
+
+ /*
+ * The remaining ioctls are changing the state of the
+ * superblock, so we do not allow read-only arrays
+ * here:
+ */
+ if (mddev->ro) {
+ err = -EROFS;
+ goto abort_unlock;
+ }
+
+ switch (cmd)
+ {
+ case ADD_NEW_DISK:
+ {
+ mdu_disk_info_t info;
+ if (md_copy_from_user(&info, (void*)arg, sizeof(info)))
+ err = -EFAULT;
+ else
+ err = add_new_disk(mddev, &info);
+ goto done_unlock;
+ }
+ case HOT_GENERATE_ERROR:
+ err = hot_generate_error(mddev, (kdev_t)arg);
+ goto done_unlock;
+ case HOT_REMOVE_DISK:
+ err = hot_remove_disk(mddev, (kdev_t)arg);
+ goto done_unlock;
+
+ case HOT_ADD_DISK:
+ err = hot_add_disk(mddev, (kdev_t)arg);
+ goto done_unlock;
+
+ case SET_DISK_FAULTY:
+ err = set_disk_faulty(mddev, (kdev_t)arg);
+ goto done_unlock;
+
+ case RUN_ARRAY:
+ {
+ err = do_md_run (mddev);
+ /*
+ * we have to clean up the mess if
+ * the array cannot be run for some
+ * reason ...
+ */
+ if (err) {
+ mddev->sb_dirty = 0;
+ if (!do_md_stop (mddev, 0))
+ mddev = NULL;
+ }
+ goto done_unlock;
+ }
+
+ default:
+ printk(KERN_WARNING "md: %s(pid %d) used obsolete MD ioctl, "
+		       "upgrade your software to use new ioctls.\n",
+ current->comm, current->pid);
+ err = -EINVAL;
+ goto abort_unlock;
+ }
+
+done_unlock:
+abort_unlock:
+ if (mddev)
+ unlock_mddev(mddev);
+
+ return err;
+done:
+ if (err)
+ MD_BUG();
+abort:
+ return err;
+}
+
+static int md_open(struct inode *inode, struct file *file)
+{
+ /*
+ * Always succeed, but increment the usage count
+ */
+ mddev_t *mddev = kdev_to_mddev(inode->i_rdev);
+ if (mddev)
+ atomic_inc(&mddev->active);
+ return (0);
+}
+
+static int md_release(struct inode *inode, struct file * file)
+{
+ mddev_t *mddev = kdev_to_mddev(inode->i_rdev);
+ if (mddev)
+ atomic_dec(&mddev->active);
+ return 0;
+}
+
+static struct block_device_operations md_fops=
+{
+ owner: THIS_MODULE,
+ open: md_open,
+ release: md_release,
+ ioctl: md_ioctl,
+};
+
+
+int md_thread(void * arg)
+{
+ mdk_thread_t *thread = arg;
+
+ md_lock_kernel();
+
+ /*
+ * Detach thread
+ */
+
+ daemonize();
+
+ sprintf(current->comm, thread->name);
+ md_init_signals();
+ md_flush_signals();
+ thread->tsk = current;
+
+ /*
+	 * md_thread is a 'system-thread', its priority should be very
+ * high. We avoid resource deadlocks individually in each
+ * raid personality. (RAID5 does preallocation) We also use RR and
+ * the very same RT priority as kswapd, thus we will never get
+ * into a priority inversion deadlock.
+ *
+ * we definitely have to have equal or higher priority than
+ * bdflush, otherwise bdflush will deadlock if there are too
+ * many dirty RAID5 blocks.
+ */
+ current->policy = SCHED_OTHER;
+ current->nice = -20;
+ md_unlock_kernel();
+
+ complete(thread->event);
+ while (thread->run) {
+ void (*run)(void *data);
+
+ wait_event_interruptible(thread->wqueue,
+ test_bit(THREAD_WAKEUP, &thread->flags));
+
+ clear_bit(THREAD_WAKEUP, &thread->flags);
+
+ run = thread->run;
+ if (run) {
+ run(thread->data);
+ run_task_queue(&tq_disk);
+ }
+ if (md_signal_pending(current))
+ md_flush_signals();
+ }
+ complete(thread->event);
+ return 0;
+}
+
+void md_wakeup_thread(mdk_thread_t *thread)
+{
+ dprintk("md: waking up MD thread %p.\n", thread);
+ set_bit(THREAD_WAKEUP, &thread->flags);
+ wake_up(&thread->wqueue);
+}
+
+mdk_thread_t *md_register_thread(void (*run) (void *),
+ void *data, const char *name)
+{
+ mdk_thread_t *thread;
+ int ret;
+ struct completion event;
+
+ thread = (mdk_thread_t *) kmalloc
+ (sizeof(mdk_thread_t), GFP_KERNEL);
+ if (!thread)
+ return NULL;
+
+ memset(thread, 0, sizeof(mdk_thread_t));
+ md_init_waitqueue_head(&thread->wqueue);
+
+ init_completion(&event);
+ thread->event = &event;
+ thread->run = run;
+ thread->data = data;
+ thread->name = name;
+ ret = kernel_thread(md_thread, thread, 0);
+ if (ret < 0) {
+ kfree(thread);
+ return NULL;
+ }
+ wait_for_completion(&event);
+ return thread;
+}
+
+void md_interrupt_thread(mdk_thread_t *thread)
+{
+ if (!thread->tsk) {
+ MD_BUG();
+ return;
+ }
+ dprintk("interrupting MD-thread pid %d\n", thread->tsk->pid);
+ send_sig(SIGKILL, thread->tsk, 1);
+}
+
+void md_unregister_thread(mdk_thread_t *thread)
+{
+ struct completion event;
+
+ init_completion(&event);
+
+ thread->event = &event;
+ thread->run = NULL;
+ thread->name = NULL;
+ md_interrupt_thread(thread);
+ wait_for_completion(&event);
+ kfree(thread);
+}
+
+void md_recover_arrays(void)
+{
+ if (!md_recovery_thread) {
+ MD_BUG();
+ return;
+ }
+ md_wakeup_thread(md_recovery_thread);
+}
+
+
+int md_error(mddev_t *mddev, kdev_t rdev)
+{
+ mdk_rdev_t * rrdev;
+
+ dprintk("md_error dev:(%d:%d), rdev:(%d:%d), (caller: %p,%p,%p,%p).\n",
+ MD_MAJOR,mdidx(mddev),MAJOR(rdev),MINOR(rdev),
+ __builtin_return_address(0),__builtin_return_address(1),
+ __builtin_return_address(2),__builtin_return_address(3));
+
+ if (!mddev) {
+ MD_BUG();
+ return 0;
+ }
+ rrdev = find_rdev(mddev, rdev);
+ if (!rrdev || rrdev->faulty)
+ return 0;
+ if (!mddev->pers->error_handler
+ || mddev->pers->error_handler(mddev,rdev) <= 0) {
+ rrdev->faulty = 1;
+ } else
+ return 1;
+ /*
+ * if recovery was running, stop it now.
+ */
+ if (mddev->pers->stop_resync)
+ mddev->pers->stop_resync(mddev);
+ if (mddev->recovery_running)
+ md_interrupt_thread(md_recovery_thread);
+ md_recover_arrays();
+
+ return 0;
+}
+
+static void status_unused(struct seq_file *seq)
+{
+ int i = 0;
+ mdk_rdev_t *rdev;
+ struct md_list_head *tmp;
+
+ seq_printf(seq, "unused devices: ");
+
+ ITERATE_RDEV_ALL(rdev,tmp) {
+ if (list_empty(&rdev->same_set)) {
+ /*
+ * The device is not yet used by any array.
+ */
+ i++;
+ seq_printf(seq, "%s ",
+ partition_name(rdev->dev));
+ }
+ }
+ if (!i)
+ seq_printf(seq, "<none>");
+
+ seq_printf(seq, "\n");
+}
+
+
+static void status_resync(struct seq_file *seq, mddev_t * mddev)
+{
+ unsigned long max_blocks, resync, res, dt, db, rt;
+
+ resync = (mddev->curr_resync - atomic_read(&mddev->recovery_active))/2;
+ max_blocks = mddev->sb->size;
+
+ /*
+ * Should not happen.
+ */
+ if (!max_blocks)
+ MD_BUG();
+
+ res = (resync/1024)*1000/(max_blocks/1024 + 1);
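+	/* res is completion in tenths of a percent; render it as a 20-cell progress bar */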
+ {
+ int i, x = res/50, y = 20-x;
+ seq_printf(seq, "[");
+ for (i = 0; i < x; i++)
+ seq_printf(seq, "=");
+ seq_printf(seq, ">");
+ for (i = 0; i < y; i++)
+ seq_printf(seq, ".");
+ seq_printf(seq, "] ");
+ }
+ if (!mddev->recovery_running)
+ /*
+ * true resync
+ */
+ seq_printf(seq, " resync =%3lu.%lu%% (%lu/%lu)",
+ res/10, res % 10, resync, max_blocks);
+ else
+ /*
+ * recovery ...
+ */
+ seq_printf(seq, " recovery =%3lu.%lu%% (%lu/%lu)",
+ res/10, res % 10, resync, max_blocks);
+
+ /*
+ * We do not want to overflow, so the order of operands and
+ * the * 100 / 100 trick are important. We do a +1 to be
+ * safe against division by zero. We only estimate anyway.
+ *
+ * dt: time from mark until now
+ * db: blocks written from mark until now
+ * rt: remaining time
+ */
+ dt = ((jiffies - mddev->resync_mark) / HZ);
+ if (!dt) dt++;
+ db = resync - (mddev->resync_mark_cnt/2);
+ rt = (dt * ((max_blocks-resync) / (db/100+1)))/100;
+
+ seq_printf(seq, " finish=%lu.%lumin", rt / 60, (rt % 60)/6);
+
+ seq_printf(seq, " speed=%ldK/sec", db/dt);
+
+}
+
+
+static void *md_seq_start(struct seq_file *seq, loff_t *pos)
+{
+ struct list_head *tmp;
+ loff_t l = *pos;
+ mddev_t *mddev;
+
+ if (l > 0x10000)
+ return NULL;
+ if (!l--)
+ /* header */
+ return (void*)1;
+
+ list_for_each(tmp,&all_mddevs)
+ if (!l--) {
+ mddev = list_entry(tmp, mddev_t, all_mddevs);
+ return mddev;
+ }
+ return (void*)2;/* tail */
+}
+
+static void *md_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ struct list_head *tmp;
+ mddev_t *next_mddev, *mddev = v;
+
+ ++*pos;
+ if (v == (void*)2)
+ return NULL;
+
+ if (v == (void*)1)
+ tmp = all_mddevs.next;
+ else
+ tmp = mddev->all_mddevs.next;
+ if (tmp != &all_mddevs)
+ next_mddev = list_entry(tmp,mddev_t,all_mddevs);
+ else {
+ next_mddev = (void*)2;
+ *pos = 0x10000;
+ }
+
+ return next_mddev;
+
+}
+
+static void md_seq_stop(struct seq_file *seq, void *v)
+{
+
+}
+
+static int md_seq_show(struct seq_file *seq, void *v)
+{
+ int j, size;
+ struct md_list_head *tmp2;
+ mdk_rdev_t *rdev;
+ mddev_t *mddev = v;
+
+ if (v == (void*)1) {
+ seq_printf(seq, "Personalities : ");
+ for (j = 0; j < MAX_PERSONALITY; j++)
+ if (pers[j])
+ seq_printf(seq, "[%s] ", pers[j]->name);
+
+ seq_printf(seq, "\n");
+ seq_printf(seq, "read_ahead ");
+ if (read_ahead[MD_MAJOR] == INT_MAX)
+ seq_printf(seq, "not set\n");
+ else
+ seq_printf(seq, "%d sectors\n", read_ahead[MD_MAJOR]);
+ return 0;
+ }
+ if (v == (void*)2) {
+ status_unused(seq);
+ return 0;
+ }
+
+ seq_printf(seq, "md%d : %sactive", mdidx(mddev),
+ mddev->pers ? "" : "in");
+ if (mddev->pers) {
+ if (mddev->ro)
+ seq_printf(seq, " (read-only)");
+ seq_printf(seq, " %s", mddev->pers->name);
+ }
+
+ size = 0;
+ ITERATE_RDEV(mddev,rdev,tmp2) {
+ seq_printf(seq, " %s[%d]",
+ partition_name(rdev->dev), rdev->desc_nr);
+ if (rdev->faulty) {
+ seq_printf(seq, "(F)");
+ continue;
+ }
+ size += rdev->size;
+ }
+
+ if (!list_empty(&mddev->disks)) {
+ if (mddev->pers)
+ seq_printf(seq, "\n %d blocks",
+ md_size[mdidx(mddev)]);
+ else
+ seq_printf(seq, "\n %d blocks", size);
+ }
+
+ if (mddev->pers) {
+
+ mddev->pers->status (seq, mddev);
+
+ seq_printf(seq, "\n ");
+ if (mddev->curr_resync) {
+ status_resync (seq, mddev);
+ } else {
+ if (sem_getcount(&mddev->resync_sem) != 1)
+ seq_printf(seq, " resync=DELAYED");
+ }
+ }
+ seq_printf(seq, "\n");
+
+ return 0;
+}
+
+
+static struct seq_operations md_seq_ops = {
+ .start = md_seq_start,
+ .next = md_seq_next,
+ .stop = md_seq_stop,
+ .show = md_seq_show,
+};
+
+static int md_seq_open(struct inode *inode, struct file *file)
+{
+ int error;
+
+ error = seq_open(file, &md_seq_ops);
+ return error;
+}
+
+static struct file_operations md_seq_fops = {
+ .open = md_seq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+
+int register_md_personality(int pnum, mdk_personality_t *p)
+{
+ if (pnum >= MAX_PERSONALITY) {
+ MD_BUG();
+ return -EINVAL;
+ }
+
+ if (pers[pnum]) {
+ MD_BUG();
+ return -EBUSY;
+ }
+
+ pers[pnum] = p;
+ printk(KERN_INFO "md: %s personality registered as nr %d\n", p->name, pnum);
+ return 0;
+}
+
+int unregister_md_personality(int pnum)
+{
+ if (pnum >= MAX_PERSONALITY) {
+ MD_BUG();
+ return -EINVAL;
+ }
+
+ printk(KERN_INFO "md: %s personality unregistered\n", pers[pnum]->name);
+ pers[pnum] = NULL;
+ return 0;
+}
+
+mdp_disk_t *get_spare(mddev_t *mddev)
+{
+ mdp_super_t *sb = mddev->sb;
+ mdp_disk_t *disk;
+ mdk_rdev_t *rdev;
+ struct md_list_head *tmp;
+
+ ITERATE_RDEV(mddev,rdev,tmp) {
+ if (rdev->faulty)
+ continue;
+ if (!rdev->sb) {
+ MD_BUG();
+ continue;
+ }
+ disk = &sb->disks[rdev->desc_nr];
+ if (disk_faulty(disk)) {
+ MD_BUG();
+ continue;
+ }
+ if (disk_active(disk))
+ continue;
+ return disk;
+ }
+ return NULL;
+}
+
+static unsigned int sync_io[DK_MAX_MAJOR][DK_MAX_DISK];
+void md_sync_acct(kdev_t dev, unsigned long nr_sectors)
+{
+ unsigned int major = MAJOR(dev);
+ unsigned int index;
+
+ index = disk_index(dev);
+ if ((index >= DK_MAX_DISK) || (major >= DK_MAX_MAJOR))
+ return;
+
+ sync_io[major][index] += nr_sectors;
+}
+
+static int is_mddev_idle(mddev_t *mddev)
+{
+ mdk_rdev_t * rdev;
+ struct md_list_head *tmp;
+ int idle;
+ unsigned long curr_events;
+
+ idle = 1;
+ ITERATE_RDEV(mddev,rdev,tmp) {
+ int major = MAJOR(rdev->dev);
+ int idx = disk_index(rdev->dev);
+
+ if ((idx >= DK_MAX_DISK) || (major >= DK_MAX_MAJOR))
+ continue;
+
+ curr_events = kstat.dk_drive_rblk[major][idx] +
+ kstat.dk_drive_wblk[major][idx] ;
+ curr_events -= sync_io[major][idx];
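+		/* more than 32 newly accounted non-resync I/O units since the last check marks the array busy */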
+ if ((curr_events - rdev->last_events) > 32) {
+ rdev->last_events = curr_events;
+ idle = 0;
+ }
+ }
+ return idle;
+}
+
+MD_DECLARE_WAIT_QUEUE_HEAD(resync_wait);
+
+void md_done_sync(mddev_t *mddev, int blocks, int ok)
+{
+	/* another "blocks" (512-byte) blocks have been synced */
+ atomic_sub(blocks, &mddev->recovery_active);
+ wake_up(&mddev->recovery_wait);
+ if (!ok) {
+ // stop recovery, signal do_sync ....
+ if (mddev->pers->stop_resync)
+ mddev->pers->stop_resync(mddev);
+ if (mddev->recovery_running)
+ md_interrupt_thread(md_recovery_thread);
+ }
+}
+
+#define SYNC_MARKS 10
+#define SYNC_MARK_STEP (3*HZ)
+int md_do_sync(mddev_t *mddev, mdp_disk_t *spare)
+{
+ mddev_t *mddev2;
+ unsigned int max_sectors, currspeed,
+ j, window, err, serialize;
+ unsigned long mark[SYNC_MARKS];
+ unsigned long mark_cnt[SYNC_MARKS];
+ int last_mark,m;
+ struct md_list_head *tmp;
+ unsigned long last_check;
+
+
+ err = down_interruptible(&mddev->resync_sem);
+ if (err)
+ goto out_nolock;
+
+recheck:
+ serialize = 0;
+ ITERATE_MDDEV(mddev2,tmp) {
+ if (mddev2 == mddev)
+ continue;
+ if (mddev2->curr_resync && match_mddev_units(mddev,mddev2)) {
+ printk(KERN_INFO "md: delaying resync of md%d until md%d "
+ "has finished resync (they share one or more physical units)\n",
+ mdidx(mddev), mdidx(mddev2));
+ serialize = 1;
+ break;
+ }
+ }
+ if (serialize) {
+ interruptible_sleep_on(&resync_wait);
+ if (md_signal_pending(current)) {
+ md_flush_signals();
+ err = -EINTR;
+ goto out;
+ }
+ goto recheck;
+ }
+
+ mddev->curr_resync = 1;
+
+ max_sectors = mddev->sb->size<<1;
+
+ printk(KERN_INFO "md: syncing RAID array md%d\n", mdidx(mddev));
+ printk(KERN_INFO "md: minimum _guaranteed_ reconstruction speed: %d KB/sec/disc.\n",
+ sysctl_speed_limit_min);
+ printk(KERN_INFO "md: using maximum available idle IO bandwith "
+ "(but not more than %d KB/sec) for reconstruction.\n",
+ sysctl_speed_limit_max);
+
+ /*
+ * Resync has low priority.
+ */
+ current->nice = 19;
+
+ is_mddev_idle(mddev); /* this also initializes IO event counters */
+ for (m = 0; m < SYNC_MARKS; m++) {
+ mark[m] = jiffies;
+ mark_cnt[m] = 0;
+ }
+ last_mark = 0;
+ mddev->resync_mark = mark[last_mark];
+ mddev->resync_mark_cnt = mark_cnt[last_mark];
+
+ /*
+ * Tune reconstruction:
+ */
+ window = vm_max_readahead*(PAGE_SIZE/512);
+ printk(KERN_INFO "md: using %dk window, over a total of %d blocks.\n",
+ window/2,max_sectors/2);
+
+ atomic_set(&mddev->recovery_active, 0);
+ init_waitqueue_head(&mddev->recovery_wait);
+ last_check = 0;
+ for (j = 0; j < max_sectors;) {
+ int sectors;
+
+ sectors = mddev->pers->sync_request(mddev, j);
+
+ if (sectors < 0) {
+ err = sectors;
+ goto out;
+ }
+ atomic_add(sectors, &mddev->recovery_active);
+ j += sectors;
+ mddev->curr_resync = j;
+
+ if (last_check + window > j)
+ continue;
+
+ last_check = j;
+
+ run_task_queue(&tq_disk);
+
+ repeat:
+ if (jiffies >= mark[last_mark] + SYNC_MARK_STEP ) {
+ /* step marks */
+ int next = (last_mark+1) % SYNC_MARKS;
+
+ mddev->resync_mark = mark[next];
+ mddev->resync_mark_cnt = mark_cnt[next];
+ mark[next] = jiffies;
+ mark_cnt[next] = j - atomic_read(&mddev->recovery_active);
+ last_mark = next;
+ }
+
+
+ if (md_signal_pending(current)) {
+ /*
+ * got a signal, exit.
+ */
+ mddev->curr_resync = 0;
+ printk(KERN_INFO "md: md_do_sync() got signal ... exiting\n");
+ md_flush_signals();
+ err = -EINTR;
+ goto out;
+ }
+
+ /*
+ * this loop exits only when either we are slower than
+ * the 'hard' speed limit, or the system has been IO-idle for
+ * a jiffy.
+ * the system might be non-idle CPU-wise, but we only care
+ * about not overloading the IO subsystem. (things like an
+ * e2fsck being done on the RAID array should execute fast)
+ */
+ if (md_need_resched(current))
+ schedule();
+
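+ /* resync throughput since the last mark, in KB/sec (counts are in 512-byte sectors, hence the /2) */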
+ currspeed = (j-mddev->resync_mark_cnt)/2/((jiffies-mddev->resync_mark)/HZ +1) +1;
+
+ if (currspeed > sysctl_speed_limit_min) {
+ current->nice = 19;
+
+ if ((currspeed > sysctl_speed_limit_max) ||
+ !is_mddev_idle(mddev)) {
+ current->state = TASK_INTERRUPTIBLE;
+ md_schedule_timeout(HZ/4);
+ goto repeat;
+ }
+ } else
+ current->nice = -20;
+ }
+ printk(KERN_INFO "md: md%d: sync done.\n",mdidx(mddev));
+ err = 0;
+ /*
+ * this also signals 'finished resyncing' to md_stop
+ */
+out:
+ wait_disk_event(mddev->recovery_wait, atomic_read(&mddev->recovery_active)==0);
+ up(&mddev->resync_sem);
+out_nolock:
+ mddev->curr_resync = 0;
+ wake_up(&resync_wait);
+ return err;
+}
+
+
+/*
+ * This is a kernel thread which syncs a spare disk with the active array
+ *
+ * the amount of foolproofing might seem to be a tad excessive, but an
+ * early (not so error-safe) version of raid1syncd synced the first 0.5 gigs
+ * of my root partition with the first 0.5 gigs of my /home partition ... so
+ * i'm a bit nervous ;)
+ */
+void md_do_recovery(void *data)
+{
+ int err;
+ mddev_t *mddev;
+ mdp_super_t *sb;
+ mdp_disk_t *spare;
+ struct md_list_head *tmp;
+
+ printk(KERN_INFO "md: recovery thread got woken up ...\n");
+restart:
+ ITERATE_MDDEV(mddev,tmp) {
+ sb = mddev->sb;
+ if (!sb)
+ continue;
+ if (mddev->recovery_running)
+ continue;
+ if (sb->active_disks == sb->raid_disks)
+ continue;
+ if (mddev->sb_dirty)
+ md_update_sb(mddev);
+ if (!sb->spare_disks) {
+ printk(KERN_ERR "md%d: no spare disk to reconstruct array! "
+ "-- continuing in degraded mode\n", mdidx(mddev));
+ continue;
+ }
+ /*
+ * now here we get the spare and resync it.
+ */
+ spare = get_spare(mddev);
+ if (!spare)
+ continue;
+ printk(KERN_INFO "md%d: resyncing spare disk %s to replace failed disk\n",
+ mdidx(mddev), partition_name(MKDEV(spare->major,spare->minor)));
+ if (!mddev->pers->diskop)
+ continue;
+ if (mddev->pers->diskop(mddev, &spare, DISKOP_SPARE_WRITE))
+ continue;
+ down(&mddev->recovery_sem);
+ mddev->recovery_running = 1;
+ err = md_do_sync(mddev, spare);
+ if (err == -EIO) {
+ printk(KERN_INFO "md%d: spare disk %s failed, skipping to next spare.\n",
+ mdidx(mddev), partition_name(MKDEV(spare->major,spare->minor)));
+ if (!disk_faulty(spare)) {
+ mddev->pers->diskop(mddev,&spare,DISKOP_SPARE_INACTIVE);
+ mark_disk_faulty(spare);
+ mark_disk_nonsync(spare);
+ mark_disk_inactive(spare);
+ sb->spare_disks--;
+ sb->working_disks--;
+ sb->failed_disks++;
+ }
+ } else
+ if (disk_faulty(spare))
+ mddev->pers->diskop(mddev, &spare,
+ DISKOP_SPARE_INACTIVE);
+ if (err == -EINTR || err == -ENOMEM) {
+ /*
+ * Recovery got interrupted, or ran out of mem ...
+ * signal back that we have finished using the array.
+ */
+ mddev->pers->diskop(mddev, &spare,
+ DISKOP_SPARE_INACTIVE);
+ up(&mddev->recovery_sem);
+ mddev->recovery_running = 0;
+ continue;
+ } else {
+ mddev->recovery_running = 0;
+ up(&mddev->recovery_sem);
+ }
+ if (!disk_faulty(spare)) {
+ /*
+ * the SPARE_ACTIVE diskop possibly changes the
+ * pointer too
+ */
+ mddev->pers->diskop(mddev, &spare, DISKOP_SPARE_ACTIVE);
+ mark_disk_sync(spare);
+ mark_disk_active(spare);
+ sb->active_disks++;
+ sb->spare_disks--;
+ }
+ mddev->sb_dirty = 1;
+ md_update_sb(mddev);
+ goto restart;
+ }
+ printk(KERN_INFO "md: recovery thread finished ...\n");
+
+}
+
+int md_notify_reboot(struct notifier_block *this,
+ unsigned long code, void *x)
+{
+ struct md_list_head *tmp;
+ mddev_t *mddev;
+
+ if ((code == MD_SYS_DOWN) || (code == MD_SYS_HALT)
+ || (code == MD_SYS_POWER_OFF)) {
+
+ printk(KERN_INFO "md: stopping all md devices.\n");
+
+ ITERATE_MDDEV(mddev,tmp)
+ do_md_stop (mddev, 1);
+ /*
+ * certain more exotic SCSI devices are known to be
+ * volatile wrt too early system reboots. While the
+ * right place to handle this issue is the given
+ * driver, we do want to have a safe RAID driver ...
+ */
+ md_mdelay(1000*1);
+ }
+ return NOTIFY_DONE;
+}
+
+struct notifier_block md_notifier = {
+ notifier_call: md_notify_reboot,
+ next: NULL,
+ priority: INT_MAX, /* before any real devices */
+};
+
+static void md_geninit(void)
+{
+ struct proc_dir_entry *p;
+ int i;
+
+ for(i = 0; i < MAX_MD_DEVS; i++) {
+ md_blocksizes[i] = 1024;
+ md_size[i] = 0;
+ md_hardsect_sizes[i] = 512;
+ }
+ blksize_size[MAJOR_NR] = md_blocksizes;
+ blk_size[MAJOR_NR] = md_size;
+ max_readahead[MAJOR_NR] = md_maxreadahead;
+ hardsect_size[MAJOR_NR] = md_hardsect_sizes;
+
+ dprintk("md: sizeof(mdp_super_t) = %d\n", (int)sizeof(mdp_super_t));
+
+#ifdef CONFIG_PROC_FS
+ p = create_proc_entry("mdstat", S_IRUGO, NULL);
+ if (p)
+ p->proc_fops = &md_seq_fops;
+#endif
+}
+
+request_queue_t * md_queue_proc(kdev_t dev)
+{
+ mddev_t *mddev = kdev_to_mddev(dev);
+ if (mddev == NULL)
+ return BLK_DEFAULT_QUEUE(MAJOR_NR);
+ else
+ return &mddev->queue;
+}
+
+int md__init md_init(void)
+{
+ static char * name = "mdrecoveryd";
+ int minor;
+
+ printk(KERN_INFO "md: md driver %d.%d.%d MAX_MD_DEVS=%d, MD_SB_DISKS=%d\n",
+ MD_MAJOR_VERSION, MD_MINOR_VERSION,
+ MD_PATCHLEVEL_VERSION, MAX_MD_DEVS, MD_SB_DISKS);
+
+ if (devfs_register_blkdev (MAJOR_NR, "md", &md_fops))
+ {
+ printk(KERN_ALERT "md: Unable to get major %d for md\n", MAJOR_NR);
+ return (-1);
+ }
+ devfs_handle = devfs_mk_dir (NULL, "md", NULL);
+ /* we don't use devfs_register_series because we want to fill md_hd_struct */
+ for (minor=0; minor < MAX_MD_DEVS; ++minor) {
+ char devname[128];
+ sprintf (devname, "%u", minor);
+ md_hd_struct[minor].de = devfs_register (devfs_handle,
+ devname, DEVFS_FL_DEFAULT, MAJOR_NR, minor,
+ S_IFBLK | S_IRUSR | S_IWUSR, &md_fops, NULL);
+ }
+
+ /* all requests on an uninitialised device get failed... */
+ blk_queue_make_request(BLK_DEFAULT_QUEUE(MAJOR_NR), md_fail_request);
+ blk_dev[MAJOR_NR].queue = md_queue_proc;
+
+
+ read_ahead[MAJOR_NR] = INT_MAX;
+
+ add_gendisk(&md_gendisk);
+
+ md_recovery_thread = md_register_thread(md_do_recovery, NULL, name);
+ if (!md_recovery_thread)
+ printk(KERN_ALERT "md: bug: couldn't allocate md_recovery_thread\n");
+
+ md_register_reboot_notifier(&md_notifier);
+ raid_table_header = register_sysctl_table(raid_root_table, 1);
+
+ md_geninit();
+ return (0);
+}
+
+
+#ifndef MODULE
+
+/*
+ * When md (and any required personalities) are compiled into the kernel
+ * (not as a module), arrays can be assembled at boot time using AUTODETECT,
+ * where specially marked partitions are registered with md_autodetect_dev(),
+ * and with MD_BOOT, where devices to be collected are given on the boot line
+ * with md=.....
+ * The code for that is here.
+ */
+
+struct {
+ int set;
+ int noautodetect;
+} raid_setup_args md__initdata;
+
+/*
+ * Searches all registered partitions for autorun RAID arrays
+ * at boot time.
+ */
+static kdev_t detected_devices[128];
+static int dev_cnt;
+
+void md_autodetect_dev(kdev_t dev)
+{
+ if (dev_cnt >= 0 && dev_cnt < 127)
+ detected_devices[dev_cnt++] = dev;
+}
+
+
+static void autostart_arrays(void)
+{
+ mdk_rdev_t *rdev;
+ int i;
+
+ printk(KERN_INFO "md: Autodetecting RAID arrays.\n");
+
+ for (i = 0; i < dev_cnt; i++) {
+ kdev_t dev = detected_devices[i];
+
+ if (md_import_device(dev,1)) {
+ printk(KERN_ALERT "md: could not import %s!\n",
+ partition_name(dev));
+ continue;
+ }
+ /*
+ * Sanity checks:
+ */
+ rdev = find_rdev_all(dev);
+ if (!rdev) {
+ MD_BUG();
+ continue;
+ }
+ if (rdev->faulty) {
+ MD_BUG();
+ continue;
+ }
+ md_list_add(&rdev->pending, &pending_raid_disks);
+ }
+ dev_cnt = 0;
+
+ autorun_devices(-1);
+}
+
+static struct {
+ char device_set [MAX_MD_DEVS];
+ int pers[MAX_MD_DEVS];
+ int chunk[MAX_MD_DEVS];
+ char *device_names[MAX_MD_DEVS];
+} md_setup_args md__initdata;
+
+/*
+ * Parse the command-line parameters given to our kernel, but do not
+ * actually try to invoke the MD device now; that is handled by
+ * md_setup_drive after the low-level disk drivers have initialised.
+ *
+ * 27/11/1999: Fixed to work correctly with the 2.3 kernel (which
+ * assigns the task of parsing integer arguments to the
+ * invoked program now). Added ability to initialise all
+ * the MD devices (by specifying multiple "md=" lines)
+ * instead of just one. -- KTK
+ * 18May2000: Added support for persistent-superblock arrays:
+ * md=n,0,factor,fault,device-list uses RAID0 for device n
+ * md=n,-1,factor,fault,device-list uses LINEAR for device n
+ * md=n,device-list reads a RAID superblock from the devices
+ * elements in device-list are read by name_to_kdev_t so they can be
+ * a hex number or something like /dev/hda1 /dev/sdb
+ * 2001-06-03: Dave Cinege <dcinege@psychosis.com>
+ * Shifted name_to_kdev_t() and related operations to md_set_drive()
+ * for later execution. Rewrote section to make devfs compatible.
+ */
+static int md__init md_setup(char *str)
+{
+ int minor, level, factor, fault;
+ char *pername = "";
+ char *str1 = str;
+
+ if (get_option(&str, &minor) != 2) { /* MD Number */
+ printk(KERN_WARNING "md: Too few arguments supplied to md=.\n");
+ return 0;
+ }
+ if (minor >= MAX_MD_DEVS) {
+ printk(KERN_WARNING "md: md=%d, Minor device number too high.\n", minor);
+ return 0;
+ } else if (md_setup_args.device_names[minor]) {
+ printk(KERN_WARNING "md: md=%d, Specified more then once. "
+ "Replacing previous definition.\n", minor);
+ }
+ switch (get_option(&str, &level)) { /* RAID Personality */
+ case 2: /* could be 0 or -1.. */
+ if (level == 0 || level == -1) {
+ if (get_option(&str, &factor) != 2 || /* Chunk Size */
+ get_option(&str, &fault) != 2) {
+ printk(KERN_WARNING "md: Too few arguments supplied to md=.\n");
+ return 0;
+ }
+ md_setup_args.pers[minor] = level;
+ md_setup_args.chunk[minor] = 1 << (factor+12);
+ switch(level) {
+ case -1:
+ level = LINEAR;
+ pername = "linear";
+ break;
+ case 0:
+ level = RAID0;
+ pername = "raid0";
+ break;
+ default:
+ printk(KERN_WARNING
+ "md: The kernel has not been configured for raid%d support!\n",
+ level);
+ return 0;
+ }
+ md_setup_args.pers[minor] = level;
+ break;
+ }
+ /* FALL THROUGH */
+ case 1: /* the first device is numeric */
+ str = str1;
+ /* FALL THROUGH */
+ case 0:
+ md_setup_args.pers[minor] = 0;
+ pername="super-block";
+ }
+
+ printk(KERN_INFO "md: Will configure md%d (%s) from %s, below.\n",
+ minor, pername, str);
+ md_setup_args.device_names[minor] = str;
+
+ return 1;
+}
+
+extern kdev_t name_to_kdev_t(char *line) md__init;
+void md__init md_setup_drive(void)
+{
+ int minor, i;
+ kdev_t dev;
+ mddev_t*mddev;
+ kdev_t devices[MD_SB_DISKS+1];
+
+ for (minor = 0; minor < MAX_MD_DEVS; minor++) {
+ int err = 0;
+ char *devname;
+ mdu_disk_info_t dinfo;
+
+ if ((devname = md_setup_args.device_names[minor]) == 0) continue;
+
+ for (i = 0; i < MD_SB_DISKS && devname != 0; i++) {
+
+ char *p;
+ void *handle;
+
+ p = strchr(devname, ',');
+ if (p)
+ *p++ = 0;
+
+ dev = name_to_kdev_t(devname);
+ handle = devfs_find_handle(NULL, devname, MAJOR (dev), MINOR (dev),
+ DEVFS_SPECIAL_BLK, 1);
+ if (handle != 0) {
+ unsigned major, minor;
+ devfs_get_maj_min(handle, &major, &minor);
+ dev = MKDEV(major, minor);
+ }
+ if (dev == 0) {
+ printk(KERN_WARNING "md: Unknown device name: %s\n", devname);
+ break;
+ }
+
+ devices[i] = dev;
+ md_setup_args.device_set[minor] = 1;
+
+ devname = p;
+ }
+ devices[i] = 0;
+
+ if (md_setup_args.device_set[minor] == 0)
+ continue;
+
+ if (mddev_map[minor]) {
+ printk(KERN_WARNING
+ "md: Ignoring md=%d, already autodetected. (Use raid=noautodetect)\n",
+ minor);
+ continue;
+ }
+ printk(KERN_INFO "md: Loading md%d: %s\n", minor, md_setup_args.device_names[minor]);
+
+ mddev = alloc_mddev(MKDEV(MD_MAJOR,minor));
+ if (!mddev) {
+ printk(KERN_ERR "md: kmalloc failed - cannot start array %d\n", minor);
+ continue;
+ }
+ if (md_setup_args.pers[minor]) {
+ /* non-persistent */
+ mdu_array_info_t ainfo;
+ ainfo.level = pers_to_level(md_setup_args.pers[minor]);
+ ainfo.size = 0;
+ ainfo.nr_disks =0;
+ ainfo.raid_disks =0;
+ ainfo.md_minor =minor;
+ ainfo.not_persistent = 1;
+
+ ainfo.state = (1 << MD_SB_CLEAN);
+ ainfo.active_disks = 0;
+ ainfo.working_disks = 0;
+ ainfo.failed_disks = 0;
+ ainfo.spare_disks = 0;
+ ainfo.layout = 0;
+ ainfo.chunk_size = md_setup_args.chunk[minor];
+ err = set_array_info(mddev, &ainfo);
+ for (i = 0; !err && (dev = devices[i]); i++) {
+ dinfo.number = i;
+ dinfo.raid_disk = i;
+ dinfo.state = (1<<MD_DISK_ACTIVE)|(1<<MD_DISK_SYNC);
+ dinfo.major = MAJOR(dev);
+ dinfo.minor = MINOR(dev);
+ mddev->sb->nr_disks++;
+ mddev->sb->raid_disks++;
+ mddev->sb->active_disks++;
+ mddev->sb->working_disks++;
+ err = add_new_disk (mddev, &dinfo);
+ }
+ } else {
+ /* persistent */
+ for (i = 0; (dev = devices[i]); i++) {
+ dinfo.major = MAJOR(dev);
+ dinfo.minor = MINOR(dev);
+ add_new_disk (mddev, &dinfo);
+ }
+ }
+ if (!err)
+ err = do_md_run(mddev);
+ if (err) {
+ mddev->sb_dirty = 0;
+ do_md_stop(mddev, 0);
+ printk(KERN_WARNING "md: starting md%d failed\n", minor);
+ }
+ }
+}
+
+static int md__init raid_setup(char *str)
+{
+ int len, pos;
+
+ len = strlen(str) + 1;
+ pos = 0;
+
+ while (pos < len) {
+ char *comma = strchr(str+pos, ',');
+ int wlen;
+ if (comma)
+ wlen = (comma-str)-pos;
+ else wlen = (len-1)-pos;
+
+ if (strncmp(str, "noautodetect", wlen) == 0)
+ raid_setup_args.noautodetect = 1;
+ pos += wlen+1;
+ }
+ raid_setup_args.set = 1;
+ return 1;
+}
+
+int md__init md_run_setup(void)
+{
+ if (raid_setup_args.noautodetect)
+ printk(KERN_INFO "md: Skipping autodetection of RAID arrays. (raid=noautodetect)\n");
+ else
+ autostart_arrays();
+ md_setup_drive();
+ return 0;
+}
+
+__setup("raid=", raid_setup);
+__setup("md=", md_setup);
+
+__initcall(md_init);
+__initcall(md_run_setup);
+
+#else /* It is a MODULE */
+
+int init_module(void)
+{
+ return md_init();
+}
+
+static void free_device_names(void)
+{
+ while (!list_empty(&device_names)) {
+ struct dname *tmp = list_entry(device_names.next,
+ dev_name_t, list);
+ list_del(&tmp->list);
+ kfree(tmp);
+ }
+}
+
+
+void cleanup_module(void)
+{
+ md_unregister_thread(md_recovery_thread);
+ devfs_unregister(devfs_handle);
+
+ devfs_unregister_blkdev(MAJOR_NR,"md");
+ unregister_reboot_notifier(&md_notifier);
+ unregister_sysctl_table(raid_table_header);
+#ifdef CONFIG_PROC_FS
+ remove_proc_entry("mdstat", NULL);
+#endif
+
+ del_gendisk(&md_gendisk);
+
+ blk_dev[MAJOR_NR].queue = NULL;
+ blksize_size[MAJOR_NR] = NULL;
+ blk_size[MAJOR_NR] = NULL;
+ max_readahead[MAJOR_NR] = NULL;
+ hardsect_size[MAJOR_NR] = NULL;
+
+ free_device_names();
+
+}
+#endif
+
+MD_EXPORT_SYMBOL(md_size);
+MD_EXPORT_SYMBOL(register_md_personality);
+MD_EXPORT_SYMBOL(unregister_md_personality);
+MD_EXPORT_SYMBOL(partition_name);
+MD_EXPORT_SYMBOL(md_error);
+MD_EXPORT_SYMBOL(md_do_sync);
+MD_EXPORT_SYMBOL(md_sync_acct);
+MD_EXPORT_SYMBOL(md_done_sync);
+MD_EXPORT_SYMBOL(md_recover_arrays);
+MD_EXPORT_SYMBOL(md_register_thread);
+MD_EXPORT_SYMBOL(md_unregister_thread);
+MD_EXPORT_SYMBOL(md_update_sb);
+MD_EXPORT_SYMBOL(md_wakeup_thread);
+MD_EXPORT_SYMBOL(md_print_devices);
+MD_EXPORT_SYMBOL(find_rdev_nr);
+MD_EXPORT_SYMBOL(md_interrupt_thread);
+EXPORT_SYMBOL(mddev_map);
+MODULE_LICENSE("GPL");
diff --git a/tests/linux/md-loop/merge b/tests/linux/md-loop/merge
new file mode 100644
index 0000000..ebb59ea
--- /dev/null
+++ b/tests/linux/md-loop/merge
@@ -0,0 +1,3962 @@
+/*
+ md.c : Multiple Devices driver for Linux
+ Copyright (C) 1998, 1999, 2000 Ingo Molnar
+
+ completely rewritten, based on the MD driver code from Marc Zyngier
+
+ Changes:
+
+ - RAID-1/RAID-5 extensions by Miguel de Icaza, Gadi Oxman, Ingo Molnar
+ - boot support for linear and striped mode by Harald Hoyer <HarryH@Royal.Net>
+ - kerneld support by Boris Tobotras <boris@xtalk.msk.su>
+ - kmod support by: Cyrus Durgin
+ - RAID0 bugfixes: Mark Anthony Lisher <markal@iname.com>
+ - Devfs support by Richard Gooch <rgooch@atnf.csiro.au>
+
+ - lots of fixes and improvements to the RAID1/RAID5 and generic
+ RAID code (such as request based resynchronization):
+
+ Neil Brown <neilb@cse.unsw.edu.au>.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2, or (at your option)
+ any later version.
+
+ You should have received a copy of the GNU General Public License
+ (for example /usr/src/linux/COPYING); if not, write to the Free
+ Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+*/
+
+#include <linux/module.h>
+#include <linux/config.h>
+#include <linux/raid/md.h>
+#include <linux/sysctl.h>
+#include <linux/raid/xor.h>
+#include <linux/devfs_fs_kernel.h>
+
+#include <linux/init.h>
+
+#ifdef CONFIG_KMOD
+#include <linux/kmod.h>
+#endif
+
+#define __KERNEL_SYSCALLS__
+#include <linux/unistd.h>
+
+#include <asm/unaligned.h>
+
+#define MAJOR_NR MD_MAJOR
+#define MD_DRIVER
+
+#include <linux/blk.h>
+
+#define DEBUG 0
+#if DEBUG
+# define dprintk(x...) printk(x)
+#else
+# define dprintk(x...) do { } while(0)
+#endif
+
+#ifndef MODULE
+static void autostart_arrays (void);
+#endif
+
+static mdk_personality_t *pers[MAX_PERSONALITY];
+
+/*
+ * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit'
+ * is 100 KB/sec, so the extra system load does not show up that much.
+ * Increase it if you want to have more _guaranteed_ speed. Note that
+ * the RAID driver will use the maximum available bandwidth if the IO
+ * subsystem is idle. There is also an 'absolute maximum' reconstruction
+ * speed limit - in case reconstruction slows down your system despite
+ * idle IO detection.
+ *
+ * you can change it via /proc/sys/dev/raid/speed_limit_min and _max.
+ */
+
+static int sysctl_speed_limit_min = 100;
+static int sysctl_speed_limit_max = 100000;
+
+static struct ctl_table_header *raid_table_header;
+
+static ctl_table raid_table[] = {
+ {DEV_RAID_SPEED_LIMIT_MIN, "speed_limit_min",
+ &sysctl_speed_limit_min, sizeof(int), 0644, NULL, &proc_dointvec},
+ {DEV_RAID_SPEED_LIMIT_MAX, "speed_limit_max",
+ &sysctl_speed_limit_max, sizeof(int), 0644, NULL, &proc_dointvec},
+ {0}
+};
+
+static ctl_table raid_dir_table[] = {
+ {DEV_RAID, "raid", NULL, 0, 0555, raid_table},
+ {0}
+};
+
+static ctl_table raid_root_table[] = {
+ {CTL_DEV, "dev", NULL, 0, 0555, raid_dir_table},
+ {0}
+};
+
+/*
+ * these have to be allocated separately because external
+ * subsystems want to have a pre-defined structure
+ */
+struct hd_struct md_hd_struct[MAX_MD_DEVS];
+static int md_blocksizes[MAX_MD_DEVS];
+static int md_hardsect_sizes[MAX_MD_DEVS];
+static mdk_thread_t *md_recovery_thread;
+
+int md_size[MAX_MD_DEVS];
+
+static struct block_device_operations md_fops;
+static devfs_handle_t devfs_handle;
+
+static struct gendisk md_gendisk=
+{
+ major: MD_MAJOR,
+ major_name: "md",
+ minor_shift: 0,
+ max_p: 1,
+ part: md_hd_struct,
+ sizes: md_size,
+ nr_real: MAX_MD_DEVS,
+ real_devices: NULL,
+ next: NULL,
+ fops: &md_fops,
+};
+
+/*
+ * Enables iteration over all existing md arrays
+ */
+static MD_LIST_HEAD(all_mddevs);
+
+static mddev_t *mddev_map[MAX_MD_DEVS];
+
+static inline mddev_t * kdev_to_mddev (kdev_t dev)
+{
+ if (MAJOR(dev) != MD_MAJOR)
+ BUG();
+ return mddev_map[MINOR(dev)];
+}
+
+static int md_fail_request (request_queue_t *q, struct bio *bio)
+{
+ bio_io_error(bio);
+ return 0;
+}
+
+static mddev_t * alloc_mddev(kdev_t dev)
+{
+ mddev_t *mddev;
+
+ if (MAJOR(dev) != MD_MAJOR) {
+ MD_BUG();
+ return 0;
+ }
+ mddev = (mddev_t *) kmalloc(sizeof(*mddev), GFP_KERNEL);
+ if (!mddev)
+ return NULL;
+
+ memset(mddev, 0, sizeof(*mddev));
+
+ mddev->__minor = MINOR(dev);
+ init_MUTEX(&mddev->reconfig_sem);
+ init_MUTEX(&mddev->recovery_sem);
+ init_MUTEX(&mddev->resync_sem);
+ MD_INIT_LIST_HEAD(&mddev->disks);
+ MD_INIT_LIST_HEAD(&mddev->all_mddevs);
+ atomic_set(&mddev->active, 0);
+
+ mddev_map[mdidx(mddev)] = mddev;
+ md_list_add(&mddev->all_mddevs, &all_mddevs);
+
+ MOD_INC_USE_COUNT;
+
+ return mddev;
+}
+
+mdk_rdev_t * find_rdev_nr(mddev_t *mddev, int nr)
+{
+ mdk_rdev_t * rdev;
+ struct md_list_head *tmp;
+
+ ITERATE_RDEV(mddev,rdev,tmp) {
+ if (rdev->desc_nr == nr)
+ return rdev;
+ }
+ return NULL;
+}
+
+mdk_rdev_t * find_rdev(mddev_t * mddev, kdev_t dev)
+{
+ struct md_list_head *tmp;
+ mdk_rdev_t *rdev;
+
+ ITERATE_RDEV(mddev,rdev,tmp) {
+ if (rdev->dev == dev)
+ return rdev;
+ }
+ return NULL;
+}
+
+static MD_LIST_HEAD(device_names);
+
+char * partition_name(kdev_t dev)
+{
+ struct gendisk *hd;
+ static char nomem [] = "<nomem>";
+ dev_name_t *dname;
+ struct md_list_head *tmp;
+
+ list_for_each(tmp, &device_names) {
+ dname = md_list_entry(tmp, dev_name_t, list);
+ if (dname->dev == dev)
+ return dname->name;
+ }
+
+ dname = (dev_name_t *) kmalloc(sizeof(*dname), GFP_KERNEL);
+
+ if (!dname)
+ return nomem;
+ /*
+ * ok, add this new device name to the list
+ */
+ hd = get_gendisk (dev);
+ dname->name = NULL;
+ if (hd)
+ dname->name = disk_name (hd, MINOR(dev), dname->namebuf);
+ if (!dname->name) {
+ sprintf (dname->namebuf, "[dev %s]", kdevname(dev));
+ dname->name = dname->namebuf;
+ }
+
+ dname->dev = dev;
+ md_list_add(&dname->list, &device_names);
+
+ return dname->name;
+}
+
+static unsigned int calc_dev_sboffset(kdev_t dev, mddev_t *mddev,
+ int persistent)
+{
+ unsigned int size = 0;
+
+ if (blk_size[MAJOR(dev)])
+ size = blk_size[MAJOR(dev)][MINOR(dev)];
+ if (persistent)
+ size = MD_NEW_SIZE_BLOCKS(size);
+ return size;
+}
+
+static unsigned int calc_dev_size(kdev_t dev, mddev_t *mddev, int persistent)
+{
+ unsigned int size;
+
+ size = calc_dev_sboffset(dev, mddev, persistent);
+ if (!mddev->sb) {
+ MD_BUG();
+ return size;
+ }
+ if (mddev->sb->chunk_size)
+ size &= ~(mddev->sb->chunk_size/1024 - 1);
+ return size;
+}
+
+static unsigned int zoned_raid_size(mddev_t *mddev)
+{
+ unsigned int mask;
+ mdk_rdev_t * rdev;
+ struct md_list_head *tmp;
+
+ if (!mddev->sb) {
+ MD_BUG();
+ return -EINVAL;
+ }
+ /*
+ * do size and offset calculations.
+ */
+ mask = ~(mddev->sb->chunk_size/1024 - 1);
+
+ ITERATE_RDEV(mddev,rdev,tmp) {
+ rdev->size &= mask;
+ md_size[mdidx(mddev)] += rdev->size;
+ }
+ return 0;
+}
+
+static void remove_descriptor(mdp_disk_t *disk, mdp_super_t *sb)
+{
+ if (disk_active(disk)) {
+ sb->working_disks--;
+ } else {
+ if (disk_spare(disk)) {
+ sb->spare_disks--;
+ sb->working_disks--;
+ } else {
+ sb->failed_disks--;
+ }
+ }
+ sb->nr_disks--;
+ disk->major = 0;
+ disk->minor = 0;
+ mark_disk_removed(disk);
+}
+
+#define BAD_MAGIC KERN_ERR \
+"md: invalid raid superblock magic on %s\n"
+
+#define BAD_MINOR KERN_ERR \
+"md: %s: invalid raid minor (%x)\n"
+
+#define OUT_OF_MEM KERN_ALERT \
+"md: out of memory.\n"
+
+#define NO_SB KERN_ERR \
+"md: disabled device %s, could not read superblock.\n"
+
+#define BAD_CSUM KERN_WARNING \
+"md: invalid superblock checksum on %s\n"
+
+static int alloc_array_sb(mddev_t * mddev)
+{
+ if (mddev->sb) {
+ MD_BUG();
+ return 0;
+ }
+
+ mddev->sb = (mdp_super_t *) __get_free_page (GFP_KERNEL);
+ if (!mddev->sb)
+ return -ENOMEM;
+ md_clear_page(mddev->sb);
+ return 0;
+}
+
+static int alloc_disk_sb(mdk_rdev_t * rdev)
+{
+ if (rdev->sb)
+ MD_BUG();
+
+ rdev->sb_page = alloc_page(GFP_KERNEL);
+ if (!rdev->sb_page) {
+ printk(OUT_OF_MEM);
+ return -EINVAL;
+ }
+ rdev->sb = (mdp_super_t *) page_address(rdev->sb_page);
+
+ return 0;
+}
+
+static void free_disk_sb(mdk_rdev_t * rdev)
+{
+ if (rdev->sb_page) {
+ page_cache_release(rdev->sb_page);
+ rdev->sb = NULL;
+ rdev->sb_page = NULL;
+ rdev->sb_offset = 0;
+ rdev->size = 0;
+ } else {
+ if (!rdev->faulty)
+ MD_BUG();
+ }
+}
+
+
+static void bh_complete(struct buffer_head *bh, int uptodate)
+{
+
+ if (uptodate)
+ set_bit(BH_Uptodate, &bh->b_state);
+
+ complete((struct completion*)bh->b_private);
+}
+
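+/* perform a synchronous single-page read or write at the given sector, using an on-stack buffer_head and a completion */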
+static int sync_page_io(kdev_t dev, unsigned long sector, int size,
+ struct page *page, int rw)
+{
+ struct buffer_head bh;
+ struct completion event;
+
+ init_completion(&event);
+ init_buffer(&bh, bh_complete, &event);
+ bh.b_rdev = dev;
+ bh.b_rsector = sector;
+ bh.b_state = (1 << BH_Req) | (1 << BH_Mapped) | (1 << BH_Lock);
+ bh.b_size = size;
+ bh.b_page = page;
+ bh.b_reqnext = NULL;
+ bh.b_data = page_address(page);
+ generic_make_request(rw, &bh);
+
+ run_task_queue(&tq_disk);
+ wait_for_completion(&event);
+
+ return test_bit(BH_Uptodate, &bh.b_state);
+}
+
+static int read_disk_sb(mdk_rdev_t * rdev)
+{
+ int ret = -EINVAL;
+ kdev_t dev = rdev->dev;
+ unsigned long sb_offset;
+
+ if (!rdev->sb) {
+ MD_BUG();
+ goto abort;
+ }
+
+ /*
+ * Calculate the position of the superblock,
+ * it's at the end of the disk
+ */
+ sb_offset = calc_dev_sboffset(rdev->dev, rdev->mddev, 1);
+ rdev->sb_offset = sb_offset;
+
+ if (!sync_page_io(dev, sb_offset<<1, MD_SB_BYTES, rdev->sb_page, READ)) {
+ printk(NO_SB,partition_name(dev));
+ return -EINVAL;
+ }
+ printk(KERN_INFO " [events: %08lx]\n", (unsigned long)rdev->sb->events_lo);
+ ret = 0;
+abort:
+ return ret;
+}
+
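+/* compute the superblock checksum with the sb_csum field temporarily zeroed, then restore it */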
+static unsigned int calc_sb_csum(mdp_super_t * sb)
+{
+ unsigned int disk_csum, csum;
+
+ disk_csum = sb->sb_csum;
+ sb->sb_csum = 0;
+ csum = csum_partial((void *)sb, MD_SB_BYTES, 0);
+ sb->sb_csum = disk_csum;
+ return csum;
+}
+
+/*
+ * Check one RAID superblock for generic plausibility
+ */
+
+static int check_disk_sb(mdk_rdev_t * rdev)
+{
+ mdp_super_t *sb;
+ int ret = -EINVAL;
+
+ sb = rdev->sb;
+ if (!sb) {
+ MD_BUG();
+ goto abort;
+ }
+
+ if (sb->md_magic != MD_SB_MAGIC) {
+ printk(BAD_MAGIC, partition_name(rdev->dev));
+ goto abort;
+ }
+
+ if (sb->md_minor >= MAX_MD_DEVS) {
+ printk(BAD_MINOR, partition_name(rdev->dev), sb->md_minor);
+ goto abort;
+ }
+
+ if (calc_sb_csum(sb) != sb->sb_csum) {
+ printk(BAD_CSUM, partition_name(rdev->dev));
+ goto abort;
+ }
+ ret = 0;
+abort:
+ return ret;
+}
+
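+/* mask off the partition bits so devices on the same physical disk map to the same unit */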
+static kdev_t dev_unit(kdev_t dev)
+{
+ unsigned int mask;
+ struct gendisk *hd = get_gendisk(dev);
+
+ if (!hd)
+ return 0;
+ mask = ~((1 << hd->minor_shift) - 1);
+
+ return MKDEV(MAJOR(dev), MINOR(dev) & mask);
+}
+
+static mdk_rdev_t * match_dev_unit(mddev_t *mddev, kdev_t dev)
+{
+ struct md_list_head *tmp;
+ mdk_rdev_t *rdev;
+
+ ITERATE_RDEV(mddev,rdev,tmp)
+ if (dev_unit(rdev->dev) == dev_unit(dev))
+ return rdev;
+
+ return NULL;
+}
+
+static int match_mddev_units(mddev_t *mddev1, mddev_t *mddev2)
+{
+ struct md_list_head *tmp;
+ mdk_rdev_t *rdev;
+
+ ITERATE_RDEV(mddev1,rdev,tmp)
+ if (match_dev_unit(mddev2, rdev->dev))
+ return 1;
+
+ return 0;
+}
+
+static MD_LIST_HEAD(all_raid_disks);
+static MD_LIST_HEAD(pending_raid_disks);
+
+static void bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev)
+{
+ mdk_rdev_t *same_pdev;
+
+ if (rdev->mddev) {
+ MD_BUG();
+ return;
+ }
+ same_pdev = match_dev_unit(mddev, rdev->dev);
+ if (same_pdev)
+ printk( KERN_WARNING
+"md%d: WARNING: %s appears to be on the same physical disk as %s. True\n"
+" protection against single-disk failure might be compromised.\n",
+ mdidx(mddev), partition_name(rdev->dev),
+ partition_name(same_pdev->dev));
+
+ md_list_add(&rdev->same_set, &mddev->disks);
+ rdev->mddev = mddev;
+ printk(KERN_INFO "md: bind<%s>\n", partition_name(rdev->dev));
+}
+
+static void unbind_rdev_from_array(mdk_rdev_t * rdev)
+{
+ if (!rdev->mddev) {
+ MD_BUG();
+ return;
+ }
+ list_del_init(&rdev->same_set);
+ printk(KERN_INFO "md: unbind<%s>\n", partition_name(rdev->dev));
+ rdev->mddev = NULL;
+}
+
+/*
+ * prevent the device from being mounted, repartitioned or
+ * otherwise reused by a RAID array (or any other kernel
+ * subsystem), by opening the device. [simply getting an
+ * inode is not enough, the SCSI module usage code needs
+ * an explicit open() on the device]
+ */
+static int lock_rdev(mdk_rdev_t *rdev)
+{
+ int err = 0;
+ struct block_device *bdev;
+
+ bdev = bdget(rdev->dev);
+ if (!bdev)
+ return -ENOMEM;
+ err = blkdev_get(bdev, FMODE_READ|FMODE_WRITE, 0, BDEV_RAW);
+ if (!err)
+ rdev->bdev = bdev;
+ return err;
+}
+
+static void unlock_rdev(mdk_rdev_t *rdev)
+{
+ struct block_device *bdev = rdev->bdev;
+ rdev->bdev = NULL;
+ if (!bdev)
+ MD_BUG();
+ blkdev_put(bdev, BDEV_RAW);
+}
+
+void md_autodetect_dev(kdev_t dev);
+
+static void export_rdev(mdk_rdev_t * rdev)
+{
+ printk(KERN_INFO "md: export_rdev(%s)\n",partition_name(rdev->dev));
+ if (rdev->mddev)
+ MD_BUG();
+ unlock_rdev(rdev);
+ free_disk_sb(rdev);
+ list_del_init(&rdev->all);
+ if (!list_empty(&rdev->pending)) {
+ printk(KERN_INFO "md: (%s was pending)\n",
+ partition_name(rdev->dev));
+ list_del_init(&rdev->pending);
+ }
+#ifndef MODULE
+ md_autodetect_dev(rdev->dev);
+#endif
+ rdev->dev = 0;
+ rdev->faulty = 0;
+ kfree(rdev);
+}
+
+static void kick_rdev_from_array(mdk_rdev_t * rdev)
+{
+ unbind_rdev_from_array(rdev);
+ export_rdev(rdev);
+}
+
+static void export_array(mddev_t *mddev)
+{
+ struct md_list_head *tmp;
+ mdk_rdev_t *rdev;
+ mdp_super_t *sb = mddev->sb;
+
+ if (mddev->sb) {
+ mddev->sb = NULL;
+ free_page((unsigned long) sb);
+ }
+
+ ITERATE_RDEV(mddev,rdev,tmp) {
+ if (!rdev->mddev) {
+ MD_BUG();
+ continue;
+ }
+ kick_rdev_from_array(rdev);
+ }
+ if (!list_empty(&mddev->disks))
+ MD_BUG();
+}
+
+static void free_mddev(mddev_t *mddev)
+{
+ if (!mddev) {
+ MD_BUG();
+ return;
+ }
+
+ export_array(mddev);
+ md_size[mdidx(mddev)] = 0;
+ md_hd_struct[mdidx(mddev)].nr_sects = 0;
+
+ /*
+ * Make sure nobody else is using this mddev
+ * (careful, we rely on the global kernel lock here)
+ */
+ while (sem_getcount(&mddev->resync_sem) != 1)
+ schedule();
+ while (sem_getcount(&mddev->recovery_sem) != 1)
+ schedule();
+
+<<<<<<<
+ del_mddev_mapping(mddev, MKDEV(MD_MAJOR, mdidx(mddev)));
+|||||||
+ del_mddev_mapping(mddev, mk_kdev(MD_MAJOR, mdidx(mddev)));
+=======
+ mddev_map[mdidx(mddev)] = NULL;
+>>>>>>>
+ md_list_del(&mddev->all_mddevs);
+ kfree(mddev);
+ MOD_DEC_USE_COUNT;
+}
+
+#undef BAD_CSUM
+#undef BAD_MAGIC
+#undef OUT_OF_MEM
+#undef NO_SB
+
+static void print_desc(mdp_disk_t *desc)
+{
+ printk(" DISK<N:%d,%s(%d,%d),R:%d,S:%d>\n", desc->number,
+ partition_name(MKDEV(desc->major,desc->minor)),
+ desc->major,desc->minor,desc->raid_disk,desc->state);
+}
+
+static void print_sb(mdp_super_t *sb)
+{
+ int i;
+
+ printk(KERN_INFO "md: SB: (V:%d.%d.%d) ID:<%08x.%08x.%08x.%08x> CT:%08x\n",
+ sb->major_version, sb->minor_version, sb->patch_version,
+ sb->set_uuid0, sb->set_uuid1, sb->set_uuid2, sb->set_uuid3,
+ sb->ctime);
+ printk(KERN_INFO "md: L%d S%08d ND:%d RD:%d md%d LO:%d CS:%d\n", sb->level,
+ sb->size, sb->nr_disks, sb->raid_disks, sb->md_minor,
+ sb->layout, sb->chunk_size);
+ printk(KERN_INFO "md: UT:%08x ST:%d AD:%d WD:%d FD:%d SD:%d CSUM:%08x E:%08lx\n",
+ sb->utime, sb->state, sb->active_disks, sb->working_disks,
+ sb->failed_disks, sb->spare_disks,
+ sb->sb_csum, (unsigned long)sb->events_lo);
+
+ printk(KERN_INFO);
+ for (i = 0; i < MD_SB_DISKS; i++) {
+ mdp_disk_t *desc;
+
+ desc = sb->disks + i;
+ if (desc->number || desc->major || desc->minor ||
+ desc->raid_disk || (desc->state && (desc->state != 4))) {
+ printk(" D %2d: ", i);
+ print_desc(desc);
+ }
+ }
+ printk(KERN_INFO "md: THIS: ");
+ print_desc(&sb->this_disk);
+
+}
+
+static void print_rdev(mdk_rdev_t *rdev)
+{
+ printk(KERN_INFO "md: rdev %s: O:%s, SZ:%08ld F:%d DN:%d ",
+ partition_name(rdev->dev), partition_name(rdev->old_dev),
+ rdev->size, rdev->faulty, rdev->desc_nr);
+ if (rdev->sb) {
+ printk(KERN_INFO "md: rdev superblock:\n");
+ print_sb(rdev->sb);
+ } else
+ printk(KERN_INFO "md: no rdev superblock!\n");
+}
+
+void md_print_devices(void)
+{
+ struct md_list_head *tmp, *tmp2;
+ mdk_rdev_t *rdev;
+ mddev_t *mddev;
+
+ printk("\n");
+ printk("md: **********************************\n");
+ printk("md: * <COMPLETE RAID STATE PRINTOUT> *\n");
+ printk("md: **********************************\n");
+ ITERATE_MDDEV(mddev,tmp) {
+ printk("md%d: ", mdidx(mddev));
+
+ ITERATE_RDEV(mddev,rdev,tmp2)
+ printk("<%s>", partition_name(rdev->dev));
+
+ if (mddev->sb) {
+ printk(" array superblock:\n");
+ print_sb(mddev->sb);
+ } else
+ printk(" no array superblock.\n");
+
+ ITERATE_RDEV(mddev,rdev,tmp2)
+ print_rdev(rdev);
+ }
+ printk("md: **********************************\n");
+ printk("\n");
+}
+
+static int sb_equal(mdp_super_t *sb1, mdp_super_t *sb2)
+{
+ int ret;
+ mdp_super_t *tmp1, *tmp2;
+
+ tmp1 = kmalloc(sizeof(*tmp1),GFP_KERNEL);
+ tmp2 = kmalloc(sizeof(*tmp2),GFP_KERNEL);
+
+ if (!tmp1 || !tmp2) {
+ ret = 0;
+ printk(KERN_INFO "md.c: sb1 is not equal to sb2!\n");
+ goto abort;
+ }
+
+ *tmp1 = *sb1;
+ *tmp2 = *sb2;
+
+ /*
+ * nr_disks is not constant
+ */
+ tmp1->nr_disks = 0;
+ tmp2->nr_disks = 0;
+
+ if (memcmp(tmp1, tmp2, MD_SB_GENERIC_CONSTANT_WORDS * 4))
+ ret = 0;
+ else
+ ret = 1;
+
+abort:
+ if (tmp1)
+ kfree(tmp1);
+ if (tmp2)
+ kfree(tmp2);
+
+ return ret;
+}
+
+static int uuid_equal(mdk_rdev_t *rdev1, mdk_rdev_t *rdev2)
+{
+ if ( (rdev1->sb->set_uuid0 == rdev2->sb->set_uuid0) &&
+ (rdev1->sb->set_uuid1 == rdev2->sb->set_uuid1) &&
+ (rdev1->sb->set_uuid2 == rdev2->sb->set_uuid2) &&
+ (rdev1->sb->set_uuid3 == rdev2->sb->set_uuid3))
+
+ return 1;
+
+ return 0;
+}
+
+static mdk_rdev_t * find_rdev_all(kdev_t dev)
+{
+ struct md_list_head *tmp;
+ mdk_rdev_t *rdev;
+
+ list_for_each(tmp, &all_raid_disks) {
+ rdev = md_list_entry(tmp, mdk_rdev_t, all);
+ if (rdev->dev == dev)
+ return rdev;
+ }
+ return NULL;
+}
+
+#define GETBLK_FAILED KERN_ERR \
+"md: getblk failed for device %s\n"
+
+static int write_disk_sb(mdk_rdev_t * rdev)
+{
+ kdev_t dev;
+ unsigned long sb_offset, size;
+
+ if (!rdev->sb) {
+ MD_BUG();
+ return 1;
+ }
+ if (rdev->faulty) {
+ MD_BUG();
+ return 1;
+ }
+ if (rdev->sb->md_magic != MD_SB_MAGIC) {
+ MD_BUG();
+ return 1;
+ }
+
+ dev = rdev->dev;
+ sb_offset = calc_dev_sboffset(dev, rdev->mddev, 1);
+ if (rdev->sb_offset != sb_offset) {
+ printk(KERN_INFO "%s's sb offset has changed from %ld to %ld, skipping\n",
+ partition_name(dev), rdev->sb_offset, sb_offset);
+ goto skip;
+ }
+ /*
+ * If the disk went offline meanwhile and it's just a spare, then
+ * its size has changed to zero silently, and the MD code does
+ * not yet know that it's faulty.
+ */
+ size = calc_dev_size(dev, rdev->mddev, 1);
+ if (size != rdev->size) {
+ printk(KERN_INFO "%s's size has changed from %ld to %ld since import, skipping\n",
+ partition_name(dev), rdev->size, size);
+ goto skip;
+ }
+
+ printk(KERN_INFO "(write) %s's sb offset: %ld\n", partition_name(dev), sb_offset);
+
+ if (!sync_page_io(dev, sb_offset<<1, MD_SB_BYTES, rdev->sb_page, WRITE)) {
+ printk("md: write_disk_sb failed for device %s\n", partition_name(dev));
+ return 1;
+ }
+skip:
+ return 0;
+}
+#undef GETBLK_FAILED
+
+static void set_this_disk(mddev_t *mddev, mdk_rdev_t *rdev)
+{
+ int i, ok = 0;
+ mdp_disk_t *desc;
+
+ for (i = 0; i < MD_SB_DISKS; i++) {
+ desc = mddev->sb->disks + i;
+#if 0
+ if (disk_faulty(desc)) {
+ if (MKDEV(desc->major,desc->minor) == rdev->dev)
+ ok = 1;
+ continue;
+ }
+#endif
+ if (MKDEV(desc->major,desc->minor) == rdev->dev) {
+ rdev->sb->this_disk = *desc;
+ rdev->desc_nr = desc->number;
+ ok = 1;
+ break;
+ }
+ }
+
+ if (!ok) {
+ MD_BUG();
+ }
+}
+
+static int sync_sbs(mddev_t * mddev)
+{
+ mdk_rdev_t *rdev;
+ mdp_super_t *sb;
+ struct md_list_head *tmp;
+
+ ITERATE_RDEV(mddev,rdev,tmp) {
+ if (rdev->faulty || rdev->alias_device)
+ continue;
+ sb = rdev->sb;
+ *sb = *mddev->sb;
+ set_this_disk(mddev, rdev);
+ sb->sb_csum = calc_sb_csum(sb);
+ }
+ return 0;
+}
+
+int md_update_sb(mddev_t * mddev)
+{
+ int err, count = 100;
+ struct md_list_head *tmp;
+ mdk_rdev_t *rdev;
+
+ if (!mddev->sb_dirty) {
+ printk("hm, md_update_sb() called without ->sb_dirty == 1, from %p.\n", __builtin_return_address(0));
+ return 0;
+ }
+ mddev->sb_dirty = 0;
+repeat:
+ mddev->sb->utime = CURRENT_TIME;
+ if ((++mddev->sb->events_lo)==0)
+ ++mddev->sb->events_hi;
+
+ if ((mddev->sb->events_lo|mddev->sb->events_hi)==0) {
+ /*
+ * oops, this 64-bit counter should never wrap.
+ * Either we are in around ~1 trillion A.C., assuming
+ * 1 reboot per second, or we have a bug:
+ */
+ MD_BUG();
+ mddev->sb->events_lo = mddev->sb->events_hi = 0xffffffff;
+ }
+ sync_sbs(mddev);
+
+ /*
+ * do not write anything to disk if using
+ * nonpersistent superblocks
+ */
+ if (mddev->sb->not_persistent)
+ return 0;
+
+ printk(KERN_INFO "md: updating md%d RAID superblock on device\n",
+ mdidx(mddev));
+
+ err = 0;
+ ITERATE_RDEV(mddev,rdev,tmp) {
+ printk(KERN_INFO "md: ");
+ if (rdev->faulty)
+ printk("(skipping faulty ");
+ if (rdev->alias_device)
+ printk("(skipping alias ");
+ if (!rdev->faulty && disk_faulty(&rdev->sb->this_disk)) {
+ printk("(skipping new-faulty %s )\n",
+ partition_name(rdev->dev));
+ continue;
+ }
+ printk("%s ", partition_name(rdev->dev));
+ if (!rdev->faulty && !rdev->alias_device) {
+ printk("[events: %08lx]",
+ (unsigned long)rdev->sb->events_lo);
+ err += write_disk_sb(rdev);
+ } else
+ printk(")\n");
+ }
+ if (err) {
+ if (--count) {
+ printk(KERN_ERR "md: errors occurred during superblock update, repeating\n");
+ goto repeat;
+ }
+ printk(KERN_ERR "md: excessive errors occurred during superblock update, exiting\n");
+ }
+ return 0;
+}
+
+/*
+ * Import a device. If 'on_disk', then sanity check the superblock
+ *
+ * mark the device faulty if:
+ *
+ * - the device is nonexistent (zero size)
+ * - the device has no valid superblock
+ *
+ */
+static int md_import_device(kdev_t newdev, int on_disk)
+{
+ int err;
+ mdk_rdev_t *rdev;
+ unsigned int size;
+
+ if (find_rdev_all(newdev))
+ return -EEXIST;
+
+ rdev = (mdk_rdev_t *) kmalloc(sizeof(*rdev), GFP_KERNEL);
+ if (!rdev) {
+ printk(KERN_ERR "md: could not alloc mem for %s!\n", partition_name(newdev));
+ return -ENOMEM;
+ }
+ memset(rdev, 0, sizeof(*rdev));
+
+ if (is_mounted(newdev)) {
+ printk(KERN_WARNING "md: can not import %s, has active inodes!\n",
+ partition_name(newdev));
+ err = -EBUSY;
+ goto abort_free;
+ }
+
+ if ((err = alloc_disk_sb(rdev)))
+ goto abort_free;
+
+ rdev->dev = newdev;
+ if (lock_rdev(rdev)) {
+ printk(KERN_ERR "md: could not lock %s, zero-size? Marking faulty.\n",
+ partition_name(newdev));
+ err = -EINVAL;
+ goto abort_free;
+ }
+ rdev->desc_nr = -1;
+ rdev->faulty = 0;
+
+ size = 0;
+ if (blk_size[MAJOR(newdev)])
+ size = blk_size[MAJOR(newdev)][MINOR(newdev)];
+ if (!size) {
+ printk(KERN_WARNING "md: %s has zero size, marking faulty!\n",
+ partition_name(newdev));
+ err = -EINVAL;
+ goto abort_free;
+ }
+
+ if (on_disk) {
+ if ((err = read_disk_sb(rdev))) {
+ printk(KERN_WARNING "md: could not read %s's sb, not importing!\n",
+ partition_name(newdev));
+ goto abort_free;
+ }
+ if ((err = check_disk_sb(rdev))) {
+ printk(KERN_WARNING "md: %s has invalid sb, not importing!\n",
+ partition_name(newdev));
+ goto abort_free;
+ }
+
+ if (rdev->sb->level != -4) {
+ rdev->old_dev = MKDEV(rdev->sb->this_disk.major,
+ rdev->sb->this_disk.minor);
+ rdev->desc_nr = rdev->sb->this_disk.number;
+ } else {
+ rdev->old_dev = MKDEV(0, 0);
+ rdev->desc_nr = -1;
+ }
+ }
+ md_list_add(&rdev->all, &all_raid_disks);
+ MD_INIT_LIST_HEAD(&rdev->pending);
+ INIT_LIST_HEAD(&rdev->same_set);
+
+ return 0;
+
+abort_free:
+ if (rdev->sb) {
+ if (rdev->bdev)
+ unlock_rdev(rdev);
+ free_disk_sb(rdev);
+ }
+ kfree(rdev);
+ return err;
+}
+
+/*
+ * Check a full RAID array for plausibility
+ */
+
+#define INCONSISTENT KERN_ERR \
+"md: fatal superblock inconsistency in %s -- removing from array\n"
+
+#define OUT_OF_DATE KERN_ERR \
+"md: superblock update time inconsistency -- using the most recent one\n"
+
+#define OLD_VERSION KERN_ALERT \
+"md: md%d: unsupported raid array version %d.%d.%d\n"
+
+#define NOT_CLEAN_IGNORE KERN_ERR \
+"md: md%d: raid array is not clean -- starting background reconstruction\n"
+
+#define UNKNOWN_LEVEL KERN_ERR \
+"md: md%d: unsupported raid level %d\n"
+
+static int analyze_sbs(mddev_t * mddev)
+{
+ int out_of_date = 0, i, first;
+ struct md_list_head *tmp, *tmp2;
+ mdk_rdev_t *rdev, *rdev2, *freshest;
+ mdp_super_t *sb;
+
+ /*
+ * Verify the RAID superblock on each real device
+ */
+ ITERATE_RDEV(mddev,rdev,tmp) {
+ if (rdev->faulty) {
+ MD_BUG();
+ goto abort;
+ }
+ if (!rdev->sb) {
+ MD_BUG();
+ goto abort;
+ }
+ if (check_disk_sb(rdev))
+ goto abort;
+ }
+
+ /*
+ * The superblock constant part has to be the same
+ * for all disks in the array.
+ */
+ sb = NULL;
+
+ ITERATE_RDEV(mddev,rdev,tmp) {
+ if (!sb) {
+ sb = rdev->sb;
+ continue;
+ }
+ if (!sb_equal(sb, rdev->sb)) {
+ printk(INCONSISTENT, partition_name(rdev->dev));
+ kick_rdev_from_array(rdev);
+ continue;
+ }
+ }
+
+ /*
+ * OK, we have all disks and the array is ready to run. Let's
+ * find the freshest superblock, that one will be the superblock
+ * that represents the whole array.
+ */
+ if (!mddev->sb)
+ if (alloc_array_sb(mddev))
+ goto abort;
+ sb = mddev->sb;
+ freshest = NULL;
+
+ ITERATE_RDEV(mddev,rdev,tmp) {
+ __u64 ev1, ev2;
+ /*
+ * if the checksum is invalid, use the superblock
+ * only as a last resort. (decrease its age by
+ * one event)
+ */
+ if (calc_sb_csum(rdev->sb) != rdev->sb->sb_csum) {
+ if (rdev->sb->events_lo || rdev->sb->events_hi)
+ if ((rdev->sb->events_lo--)==0)
+ rdev->sb->events_hi--;
+ }
+
+ printk(KERN_INFO "md: %s's event counter: %08lx\n",
+ partition_name(rdev->dev),
+ (unsigned long)rdev->sb->events_lo);
+ if (!freshest) {
+ freshest = rdev;
+ continue;
+ }
+ /*
+ * Find the newest superblock version
+ */
+ ev1 = md_event(rdev->sb);
+ ev2 = md_event(freshest->sb);
+ if (ev1 != ev2) {
+ out_of_date = 1;
+ if (ev1 > ev2)
+ freshest = rdev;
+ }
+ }
+ if (out_of_date) {
+ printk(OUT_OF_DATE);
+ printk(KERN_INFO "md: freshest: %s\n", partition_name(freshest->dev));
+ }
+ memcpy (sb, freshest->sb, sizeof(*sb));
+
+ /*
+ * at this point we have picked the 'best' superblock
+ * from all available superblocks.
+ * now we validate this superblock and kick out possibly
+ * failed disks.
+ */
+ ITERATE_RDEV(mddev,rdev,tmp) {
+ /*
+ * Kick all non-fresh devices
+ */
+ __u64 ev1, ev2;
+ ev1 = md_event(rdev->sb);
+ ev2 = md_event(sb);
+ ++ev1;
+ if (ev1 < ev2) {
+ printk(KERN_WARNING "md: kicking non-fresh %s from array!\n",
+ partition_name(rdev->dev));
+ kick_rdev_from_array(rdev);
+ continue;
+ }
+ }
+
+ /*
+ * Fix up changed device names ... but only if this disk has a
+ * recent update time. Use faulty checksum ones too.
+ */
+ if (mddev->sb->level != -4)
+ ITERATE_RDEV(mddev,rdev,tmp) {
+ __u64 ev1, ev2, ev3;
+ if (rdev->faulty || rdev->alias_device) {
+ MD_BUG();
+ goto abort;
+ }
+ ev1 = md_event(rdev->sb);
+ ev2 = md_event(sb);
+ ev3 = ev2;
+ --ev3;
+ if ((rdev->dev != rdev->old_dev) &&
+ ((ev1 == ev2) || (ev1 == ev3))) {
+ mdp_disk_t *desc;
+
+ printk(KERN_WARNING "md: device name has changed from %s to %s since last import!\n",
+ partition_name(rdev->old_dev), partition_name(rdev->dev));
+ if (rdev->desc_nr == -1) {
+ MD_BUG();
+ goto abort;
+ }
+ desc = &sb->disks[rdev->desc_nr];
+ if (rdev->old_dev != MKDEV(desc->major, desc->minor)) {
+ MD_BUG();
+ goto abort;
+ }
+ desc->major = MAJOR(rdev->dev);
+ desc->minor = MINOR(rdev->dev);
+ desc = &rdev->sb->this_disk;
+ desc->major = MAJOR(rdev->dev);
+ desc->minor = MINOR(rdev->dev);
+ }
+ }
+
+ /*
+ * Remove unavailable and faulty devices ...
+ *
+ * note that if an array becomes completely unrunnable due to
+ * missing devices, we do not write the superblock back, so the
+ * administrator has a chance to fix things up. The removal thus
+ * only happens if it's nonfatal to the contents of the array.
+ */
+ for (i = 0; i < MD_SB_DISKS; i++) {
+ int found;
+ mdp_disk_t *desc;
+ kdev_t dev;
+
+ desc = sb->disks + i;
+ dev = MKDEV(desc->major, desc->minor);
+
+ /*
+ * We kick faulty devices/descriptors immediately.
+ *
+ * Note: multipath devices are a special case. Since we
+ * were able to read the superblock on the path, we don't
+ * care if it was previously marked as faulty, it's up now
+ * so enable it.
+ */
+ if (disk_faulty(desc) && mddev->sb->level != -4) {
+ found = 0;
+ ITERATE_RDEV(mddev,rdev,tmp) {
+ if (rdev->desc_nr != desc->number)
+ continue;
+ printk(KERN_WARNING "md%d: kicking faulty %s!\n",
+ mdidx(mddev),partition_name(rdev->dev));
+ kick_rdev_from_array(rdev);
+ found = 1;
+ break;
+ }
+ if (!found) {
+ if (dev == MKDEV(0,0))
+ continue;
+ printk(KERN_WARNING "md%d: removing former faulty %s!\n",
+ mdidx(mddev), partition_name(dev));
+ }
+ remove_descriptor(desc, sb);
+ continue;
+ } else if (disk_faulty(desc)) {
+ /*
+ * multipath entry marked as faulty, unfaulty it
+ */
+ rdev = find_rdev(mddev, dev);
+ if(rdev)
+ mark_disk_spare(desc);
+ else
+ remove_descriptor(desc, sb);
+ }
+
+ if (dev == MKDEV(0,0))
+ continue;
+ /*
+ * Is this device present in the rdev ring?
+ */
+ found = 0;
+ ITERATE_RDEV(mddev,rdev,tmp) {
+ /*
+ * Multi-path IO special-case: since we have no
+ * this_disk descriptor at auto-detect time,
+ * we cannot check rdev->number.
+ * We can check the device though.
+ */
+ if ((sb->level == -4) && (rdev->dev ==
+ MKDEV(desc->major,desc->minor))) {
+ found = 1;
+ break;
+ }
+ if (rdev->desc_nr == desc->number) {
+ found = 1;
+ break;
+ }
+ }
+ if (found)
+ continue;
+
+ printk(KERN_WARNING "md%d: former device %s is unavailable, removing from array!\n",
+ mdidx(mddev), partition_name(dev));
+ remove_descriptor(desc, sb);
+ }
+
+ /*
+ * Double check whether all devices mentioned in the
+ * superblock are in the rdev ring.
+ */
+ first = 1;
+ for (i = 0; i < MD_SB_DISKS; i++) {
+ mdp_disk_t *desc;
+ kdev_t dev;
+
+ desc = sb->disks + i;
+ dev = MKDEV(desc->major, desc->minor);
+
+ if (dev == MKDEV(0,0))
+ continue;
+
+ if (disk_faulty(desc)) {
+ MD_BUG();
+ goto abort;
+ }
+
+ rdev = find_rdev(mddev, dev);
+ if (!rdev) {
+ MD_BUG();
+ goto abort;
+ }
+ /*
+ * In the case of Multipath-IO, we have no
+ * other information source to find out which
+ * disk is which, only the position of the device
+ * in the superblock:
+ */
+ if (mddev->sb->level == -4) {
+ if ((rdev->desc_nr != -1) && (rdev->desc_nr != i)) {
+ MD_BUG();
+ goto abort;
+ }
+ rdev->desc_nr = i;
+ if (!first)
+ rdev->alias_device = 1;
+ else
+ first = 0;
+ }
+ }
+
+ /*
+ * Kick all rdevs that are not in the
+ * descriptor array:
+ */
+ ITERATE_RDEV(mddev,rdev,tmp) {
+ if (rdev->desc_nr == -1)
+ kick_rdev_from_array(rdev);
+ }
+
+ /*
+ * Do a final reality check.
+ */
+ if (mddev->sb->level != -4) {
+ ITERATE_RDEV(mddev,rdev,tmp) {
+ if (rdev->desc_nr == -1) {
+ MD_BUG();
+ goto abort;
+ }
+ /*
+ * is the desc_nr unique?
+ */
+ ITERATE_RDEV(mddev,rdev2,tmp2) {
+ if ((rdev2 != rdev) &&
+ (rdev2->desc_nr == rdev->desc_nr)) {
+ MD_BUG();
+ goto abort;
+ }
+ }
+ /*
+ * is the device unique?
+ */
+ ITERATE_RDEV(mddev,rdev2,tmp2) {
+ if ((rdev2 != rdev) &&
+ (rdev2->dev == rdev->dev)) {
+ MD_BUG();
+ goto abort;
+ }
+ }
+ }
+ }
+
+ /*
+ * Check if we can support this RAID array
+ */
+ if (sb->major_version != MD_MAJOR_VERSION ||
+ sb->minor_version > MD_MINOR_VERSION) {
+
+ printk(OLD_VERSION, mdidx(mddev), sb->major_version,
+ sb->minor_version, sb->patch_version);
+ goto abort;
+ }
+
+ if ((sb->state != (1 << MD_SB_CLEAN)) && ((sb->level == 1) ||
+ (sb->level == 4) || (sb->level == 5)))
+ printk(NOT_CLEAN_IGNORE, mdidx(mddev));
+
+ return 0;
+abort:
+ return 1;
+}
+
+#undef INCONSISTENT
+#undef OUT_OF_DATE
+#undef OLD_VERSION
+#undef OLD_LEVEL
+
+static int device_size_calculation(mddev_t * mddev)
+{
+ int data_disks = 0, persistent;
+ unsigned int readahead;
+ mdp_super_t *sb = mddev->sb;
+ struct md_list_head *tmp;
+ mdk_rdev_t *rdev;
+
+ /*
+ * Do device size calculation. Bail out if too small.
+ * (we have to do this after having validated chunk_size,
+ * because device size has to be modulo chunk_size)
+ */
+ persistent = !mddev->sb->not_persistent;
+ ITERATE_RDEV(mddev,rdev,tmp) {
+ if (rdev->faulty)
+ continue;
+ if (rdev->size) {
+ MD_BUG();
+ continue;
+ }
+ rdev->size = calc_dev_size(rdev->dev, mddev, persistent);
+ if (rdev->size < sb->chunk_size / 1024) {
+ printk(KERN_WARNING
+ "md: Dev %s smaller than chunk_size: %ldk < %dk\n",
+ partition_name(rdev->dev),
+ rdev->size, sb->chunk_size / 1024);
+ return -EINVAL;
+ }
+ }
+
+ switch (sb->level) {
+ case -4:
+ data_disks = 1;
+ break;
+ case -3:
+ data_disks = 1;
+ break;
+ case -2:
+ data_disks = 1;
+ break;
+ case -1:
+ zoned_raid_size(mddev);
+ data_disks = 1;
+ break;
+ case 0:
+ zoned_raid_size(mddev);
+ data_disks = sb->raid_disks;
+ break;
+ case 1:
+ data_disks = 1;
+ break;
+ case 4:
+ case 5:
+ data_disks = sb->raid_disks-1;
+ break;
+ default:
+ printk(UNKNOWN_LEVEL, mdidx(mddev), sb->level);
+ goto abort;
+ }
+ if (!md_size[mdidx(mddev)])
+ md_size[mdidx(mddev)] = sb->size * data_disks;
+
+ readahead = MD_READAHEAD;
+ if ((sb->level == 0) || (sb->level == 4) || (sb->level == 5)) {
+ readahead = (mddev->sb->chunk_size>>PAGE_SHIFT) * 4 * data_disks;
+ if (readahead < data_disks * (MAX_SECTORS>>(PAGE_SHIFT-9))*2)
+ readahead = data_disks * (MAX_SECTORS>>(PAGE_SHIFT-9))*2;
+ } else {
+ // (no multipath branch - it uses the default setting)
+ if (sb->level == -3)
+ readahead = 0;
+ }
+
+ printk(KERN_INFO "md%d: max total readahead window set to %ldk\n",
+ mdidx(mddev), readahead*(PAGE_SIZE/1024));
+
+ printk(KERN_INFO
+ "md%d: %d data-disks, max readahead per data-disk: %ldk\n",
+ mdidx(mddev), data_disks, readahead/data_disks*(PAGE_SIZE/1024));
+ return 0;
+abort:
+ return 1;
+}
+
+
+#define TOO_BIG_CHUNKSIZE KERN_ERR \
+"too big chunk_size: %d > %d\n"
+
+#define TOO_SMALL_CHUNKSIZE KERN_ERR \
+"too small chunk_size: %d < %ld\n"
+
+#define BAD_CHUNKSIZE KERN_ERR \
+"no chunksize specified, see 'man raidtab'\n"
+
+static int do_md_run(mddev_t * mddev)
+{
+ int pnum, err;
+ int chunk_size;
+ struct md_list_head *tmp;
+ mdk_rdev_t *rdev;
+
+
+ if (list_empty(&mddev->disks)) {
+ MD_BUG();
+ return -EINVAL;
+ }
+
+ if (mddev->pers)
+ return -EBUSY;
+
+ /*
+ * Resize disks to align partitions size on a given
+ * chunk size.
+ */
+ md_size[mdidx(mddev)] = 0;
+
+ /*
+ * Analyze all RAID superblock(s)
+ */
+ if (analyze_sbs(mddev)) {
+ MD_BUG();
+ return -EINVAL;
+ }
+
+ chunk_size = mddev->sb->chunk_size;
+ pnum = level_to_pers(mddev->sb->level);
+
+ if ((pnum != MULTIPATH) && (pnum != RAID1)) {
+ if (!chunk_size) {
+ /*
+ * 'default chunksize' in the old md code used to
+ * be PAGE_SIZE, baaad.
+ * we abort here to be on the safe side. We don't
+ * want to continue the bad practice.
+ */
+ printk(BAD_CHUNKSIZE);
+ return -EINVAL;
+ }
+ if (chunk_size > MAX_CHUNK_SIZE) {
+ printk(TOO_BIG_CHUNKSIZE, chunk_size, MAX_CHUNK_SIZE);
+ return -EINVAL;
+ }
+ /*
+ * chunk-size has to be a power of 2 and a multiple of PAGE_SIZE
+ */
+ if ( (1 << ffz(~chunk_size)) != chunk_size) {
+ MD_BUG();
+ return -EINVAL;
+ }
+ if (chunk_size < PAGE_SIZE) {
+ printk(TOO_SMALL_CHUNKSIZE, chunk_size, PAGE_SIZE);
+ return -EINVAL;
+ }
+ } else
+ if (chunk_size)
+ printk(KERN_INFO "md: RAID level %d does not need chunksize! Continuing anyway.\n",
+ mddev->sb->level);
+
+ if (pnum >= MAX_PERSONALITY) {
+ MD_BUG();
+ return -EINVAL;
+ }
+
+ if (!pers[pnum])
+ {
+#ifdef CONFIG_KMOD
+ char module_name[80];
+ sprintf (module_name, "md-personality-%d", pnum);
+ request_module (module_name);
+ if (!pers[pnum])
+#endif
+ {
+ printk(KERN_ERR "md: personality %d is not loaded!\n",
+ pnum);
+ return -EINVAL;
+ }
+ }
+
+ if (device_size_calculation(mddev))
+ return -EINVAL;
+
+ /*
+ * Drop all container device buffers, from now on
+ * the only valid external interface is through the md
+ * device.
+ * Also find largest hardsector size
+ */
+ md_hardsect_sizes[mdidx(mddev)] = 512;
+ ITERATE_RDEV(mddev,rdev,tmp) {
+ if (rdev->faulty)
+ continue;
+ invalidate_device(rdev->dev, 1);
+ if (get_hardsect_size(rdev->dev)
+ > md_hardsect_sizes[mdidx(mddev)])
+ md_hardsect_sizes[mdidx(mddev)] =
+ get_hardsect_size(rdev->dev);
+ }
+ md_blocksizes[mdidx(mddev)] = 1024;
+ if (md_blocksizes[mdidx(mddev)] < md_hardsect_sizes[mdidx(mddev)])
+ md_blocksizes[mdidx(mddev)] = md_hardsect_sizes[mdidx(mddev)];
+ mddev->pers = pers[pnum];
+
+ blk_queue_make_request(&mddev->queue, mddev->pers->make_request);
+ mddev->queue.queuedata = mddev;
+
+ err = mddev->pers->run(mddev);
+ if (err) {
+ printk(KERN_ERR "md: pers->run() failed ...\n");
+ mddev->pers = NULL;
+ return -EINVAL;
+ }
+
+ mddev->sb->state &= ~(1 << MD_SB_CLEAN);
+ mddev->sb_dirty = 1;
+ md_update_sb(mddev);
+
+ /*
+ * md_size has units of 1K blocks, which are
+ * twice as large as sectors.
+ */
+ md_hd_struct[mdidx(mddev)].start_sect = 0;
+ register_disk(&md_gendisk, MKDEV(MAJOR_NR,mdidx(mddev)),
+ 1, &md_fops, md_size[mdidx(mddev)]<<1);
+
+ read_ahead[MD_MAJOR] = 1024;
+ return (0);
+}
+
+#undef TOO_BIG_CHUNKSIZE
+#undef BAD_CHUNKSIZE
+
+static int restart_array(mddev_t *mddev)
+{
+ int err;
+
+ /*
+ * Complain if it has no devices
+ */
+ err = -ENXIO;
+ if (list_empty(&mddev->disks))
+ goto out;
+
+ if (mddev->pers) {
+ err = -EBUSY;
+ if (!mddev->ro)
+ goto out;
+
+ mddev->ro = 0;
+ set_device_ro(mddev_to_kdev(mddev), 0);
+
+ printk(KERN_INFO
+ "md: md%d switched to read-write mode.\n", mdidx(mddev));
+ /*
+ * Kick recovery or resync if necessary
+ */
+ md_recover_arrays();
+ if (mddev->pers->restart_resync)
+ mddev->pers->restart_resync(mddev);
+ err = 0;
+ } else {
+ printk(KERN_ERR "md: md%d has no personality assigned.\n",
+ mdidx(mddev));
+ err = -EINVAL;
+ }
+
+out:
+ return err;
+}
+
+#define STILL_MOUNTED KERN_WARNING \
+"md: md%d still mounted.\n"
+#define STILL_IN_USE \
+"md: md%d still in use.\n"
+
+static int do_md_stop(mddev_t * mddev, int ro)
+{
+ int err = 0, resync_interrupted = 0;
+ kdev_t dev = mddev_to_kdev(mddev);
+
+ if (atomic_read(&mddev->active)>1) {
+ printk(STILL_IN_USE, mdidx(mddev));
+ err = -EBUSY;
+ goto out;
+ }
+
+ if (mddev->pers) {
+ /*
+ * It is safe to call stop here, it only frees private
+ * data. Also, it tells us if a device is unstoppable
+ * (eg. resyncing is in progress)
+ */
+ if (mddev->pers->stop_resync)
+ if (mddev->pers->stop_resync(mddev))
+ resync_interrupted = 1;
+
+ if (mddev->recovery_running)
+ md_interrupt_thread(md_recovery_thread);
+
+ /*
+ * This synchronizes with signal delivery to the
+ * resync or reconstruction thread. It also nicely
+ * hangs the process if some reconstruction has not
+ * finished.
+ */
+ down(&mddev->recovery_sem);
+ up(&mddev->recovery_sem);
+
+ invalidate_device(dev, 1);
+
+ if (ro) {
+ err = -ENXIO;
+ if (mddev->ro)
+ goto out;
+ mddev->ro = 1;
+ } else {
+ if (mddev->ro)
+ set_device_ro(dev, 0);
+ if (mddev->pers->stop(mddev)) {
+ err = -EBUSY;
+ if (mddev->ro)
+ set_device_ro(dev, 1);
+ goto out;
+ }
+ if (mddev->ro)
+ mddev->ro = 0;
+ }
+ if (mddev->sb) {
+ /*
+ * mark it clean only if there was no resync
+ * interrupted.
+ */
+ if (!mddev->recovery_running && !resync_interrupted) {
+ printk(KERN_INFO "md: marking sb clean...\n");
+ mddev->sb->state |= 1 << MD_SB_CLEAN;
+ }
+ mddev->sb_dirty = 1;
+ md_update_sb(mddev);
+ }
+ if (ro)
+ set_device_ro(dev, 1);
+ }
+
+ /*
+ * Free resources if final stop
+ */
+ if (!ro) {
+ printk(KERN_INFO "md: md%d stopped.\n", mdidx(mddev));
+ free_mddev(mddev);
+ } else
+ printk(KERN_INFO "md: md%d switched to read-only mode.\n", mdidx(mddev));
+ err = 0;
+out:
+ return err;
+}
+
+/*
+ * We have to safely support old arrays too.
+ */
+int detect_old_array(mdp_super_t *sb)
+{
+ if (sb->major_version > 0)
+ return 0;
+ if (sb->minor_version >= 90)
+ return 0;
+
+ return -EINVAL;
+}
+
+
+static void autorun_array(mddev_t *mddev)
+{
+ mdk_rdev_t *rdev;
+ struct md_list_head *tmp;
+ int err;
+
+ if (list_empty(&mddev->disks)) {
+ MD_BUG();
+ return;
+ }
+
+ printk(KERN_INFO "md: running: ");
+
+ ITERATE_RDEV(mddev,rdev,tmp) {
+ printk("<%s>", partition_name(rdev->dev));
+ }
+ printk("\n");
+
+ err = do_md_run (mddev);
+ if (err) {
+		printk(KERN_WARNING "md: do_md_run() returned %d\n", err);
+ /*
+ * prevent the writeback of an unrunnable array
+ */
+ mddev->sb_dirty = 0;
+ do_md_stop (mddev, 0);
+ }
+}
+
+/*
+ * lets try to run arrays based on all disks that have arrived
+ * until now. (those are in the ->pending list)
+ *
+ * the method: pick the first pending disk, collect all disks with
+ * the same UUID, remove all from the pending list and put them into
+ * the 'same_array' list. Then order this list based on superblock
+ * update time (freshest comes first), kick out 'old' disks and
+ * compare superblocks. If everything's fine then run it.
+ *
+ * If "unit" is allocated, then bump its reference count
+ */
+static void autorun_devices(kdev_t countdev)
+{
+ struct md_list_head candidates;
+ struct md_list_head *tmp;
+ mdk_rdev_t *rdev0, *rdev;
+ mddev_t *mddev;
+ kdev_t md_kdev;
+
+
+ printk(KERN_INFO "md: autorun ...\n");
+ while (!list_empty(&pending_raid_disks)) {
+ rdev0 = md_list_entry(pending_raid_disks.next,
+ mdk_rdev_t, pending);
+
+ printk(KERN_INFO "md: considering %s ...\n", partition_name(rdev0->dev));
+ MD_INIT_LIST_HEAD(&candidates);
+ ITERATE_RDEV_PENDING(rdev,tmp) {
+ if (uuid_equal(rdev0, rdev)) {
+ if (!sb_equal(rdev0->sb, rdev->sb)) {
+ printk(KERN_WARNING
+ "md: %s has same UUID as %s, but superblocks differ ...\n",
+ partition_name(rdev->dev), partition_name(rdev0->dev));
+ continue;
+ }
+ printk(KERN_INFO "md: adding %s ...\n", partition_name(rdev->dev));
+ md_list_del(&rdev->pending);
+ md_list_add(&rdev->pending, &candidates);
+ }
+ }
+ /*
+ * now we have a set of devices, with all of them having
+ * mostly sane superblocks. It's time to allocate the
+ * mddev.
+ */
+ md_kdev = MKDEV(MD_MAJOR, rdev0->sb->md_minor);
+ mddev = kdev_to_mddev(md_kdev);
+ if (mddev) {
+ printk(KERN_WARNING "md: md%d already running, cannot run %s\n",
+ mdidx(mddev), partition_name(rdev0->dev));
+ ITERATE_RDEV_GENERIC(candidates,pending,rdev,tmp)
+ export_rdev(rdev);
+ continue;
+ }
+ mddev = alloc_mddev(md_kdev);
+ if (!mddev) {
+ printk(KERN_ERR "md: cannot allocate memory for md drive.\n");
+ break;
+ }
+ if (md_kdev == countdev)
+ atomic_inc(&mddev->active);
+ printk(KERN_INFO "md: created md%d\n", mdidx(mddev));
+ ITERATE_RDEV_GENERIC(candidates,pending,rdev,tmp) {
+ bind_rdev_to_array(rdev, mddev);
+ list_del_init(&rdev->pending);
+ }
+ autorun_array(mddev);
+ }
+ printk(KERN_INFO "md: ... autorun DONE.\n");
+}
+
+/*
+ * import RAID devices based on one partition
+ * if possible, the array gets run as well.
+ */
+
+#define BAD_VERSION KERN_ERR \
+"md: %s has RAID superblock version 0.%d, autodetect needs v0.90 or higher\n"
+
+#define OUT_OF_MEM KERN_ALERT \
+"md: out of memory.\n"
+
+#define NO_DEVICE KERN_ERR \
+"md: disabled device %s\n"
+
+#define AUTOADD_FAILED KERN_ERR \
+"md: auto-adding devices to md%d FAILED (error %d).\n"
+
+#define AUTOADD_FAILED_USED KERN_ERR \
+"md: cannot auto-add device %s to md%d, already used.\n"
+
+#define AUTORUN_FAILED KERN_ERR \
+"md: auto-running md%d FAILED (error %d).\n"
+
+#define MDDEV_BUSY KERN_ERR \
+"md: cannot auto-add to md%d, already running.\n"
+
+#define AUTOADDING KERN_INFO \
+"md: auto-adding devices to md%d, based on %s's superblock.\n"
+
+#define AUTORUNNING KERN_INFO \
+"md: auto-running md%d.\n"
+
+static int autostart_array(kdev_t startdev, kdev_t countdev)
+{
+ int err = -EINVAL, i;
+ mdp_super_t *sb = NULL;
+ mdk_rdev_t *start_rdev = NULL, *rdev;
+
+ if (md_import_device(startdev, 1)) {
+ printk(KERN_WARNING "md: could not import %s!\n", partition_name(startdev));
+ goto abort;
+ }
+
+ start_rdev = find_rdev_all(startdev);
+ if (!start_rdev) {
+ MD_BUG();
+ goto abort;
+ }
+ if (start_rdev->faulty) {
+ printk(KERN_WARNING "md: can not autostart based on faulty %s!\n",
+ partition_name(startdev));
+ goto abort;
+ }
+ md_list_add(&start_rdev->pending, &pending_raid_disks);
+
+ sb = start_rdev->sb;
+
+ err = detect_old_array(sb);
+ if (err) {
+		printk(KERN_WARNING "md: array version is too old to be autostarted, "
+ "use raidtools 0.90 mkraid --upgrade to upgrade the array "
+ "without data loss!\n");
+ goto abort;
+ }
+
+ for (i = 0; i < MD_SB_DISKS; i++) {
+ mdp_disk_t *desc;
+ kdev_t dev;
+
+ desc = sb->disks + i;
+ dev = MKDEV(desc->major, desc->minor);
+
+ if (dev == MKDEV(0,0))
+ continue;
+ if (dev == startdev)
+ continue;
+ if (md_import_device(dev, 1)) {
+ printk(KERN_WARNING "md: could not import %s, trying to run array nevertheless.\n",
+ partition_name(dev));
+ continue;
+ }
+ rdev = find_rdev_all(dev);
+ if (!rdev) {
+ MD_BUG();
+ goto abort;
+ }
+ md_list_add(&rdev->pending, &pending_raid_disks);
+ }
+
+ /*
+ * possibly return codes
+ */
+ autorun_devices(countdev);
+ return 0;
+
+abort:
+ if (start_rdev)
+ export_rdev(start_rdev);
+ return err;
+}
+
+#undef BAD_VERSION
+#undef OUT_OF_MEM
+#undef NO_DEVICE
+#undef AUTOADD_FAILED_USED
+#undef AUTOADD_FAILED
+#undef AUTORUN_FAILED
+#undef AUTOADDING
+#undef AUTORUNNING
+
+
+static int get_version(void * arg)
+{
+ mdu_version_t ver;
+
+ ver.major = MD_MAJOR_VERSION;
+ ver.minor = MD_MINOR_VERSION;
+ ver.patchlevel = MD_PATCHLEVEL_VERSION;
+
+ if (md_copy_to_user(arg, &ver, sizeof(ver)))
+ return -EFAULT;
+
+ return 0;
+}
+
+#define SET_FROM_SB(x) info.x = mddev->sb->x
+static int get_array_info(mddev_t * mddev, void * arg)
+{
+ mdu_array_info_t info;
+
+ if (!mddev->sb) {
+ MD_BUG();
+ return -EINVAL;
+ }
+
+ SET_FROM_SB(major_version);
+ SET_FROM_SB(minor_version);
+ SET_FROM_SB(patch_version);
+ SET_FROM_SB(ctime);
+ SET_FROM_SB(level);
+ SET_FROM_SB(size);
+ SET_FROM_SB(nr_disks);
+ SET_FROM_SB(raid_disks);
+ SET_FROM_SB(md_minor);
+ SET_FROM_SB(not_persistent);
+
+ SET_FROM_SB(utime);
+ SET_FROM_SB(state);
+ SET_FROM_SB(active_disks);
+ SET_FROM_SB(working_disks);
+ SET_FROM_SB(failed_disks);
+ SET_FROM_SB(spare_disks);
+
+ SET_FROM_SB(layout);
+ SET_FROM_SB(chunk_size);
+
+ if (md_copy_to_user(arg, &info, sizeof(info)))
+ return -EFAULT;
+
+ return 0;
+}
+#undef SET_FROM_SB
+
+#define SET_FROM_SB(x) info.x = mddev->sb->disks[nr].x
+static int get_disk_info(mddev_t * mddev, void * arg)
+{
+ mdu_disk_info_t info;
+ unsigned int nr;
+
+ if (!mddev->sb)
+ return -EINVAL;
+
+ if (md_copy_from_user(&info, arg, sizeof(info)))
+ return -EFAULT;
+
+ nr = info.number;
+ if (nr >= MD_SB_DISKS)
+ return -EINVAL;
+
+ SET_FROM_SB(major);
+ SET_FROM_SB(minor);
+ SET_FROM_SB(raid_disk);
+ SET_FROM_SB(state);
+
+ if (md_copy_to_user(arg, &info, sizeof(info)))
+ return -EFAULT;
+
+ return 0;
+}
+#undef SET_FROM_SB
+
+#define SET_SB(x) mddev->sb->disks[nr].x = info->x
+
+static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info)
+{
+ int err, size, persistent;
+ mdk_rdev_t *rdev;
+ unsigned int nr;
+ kdev_t dev;
+ dev = MKDEV(info->major,info->minor);
+
+ if (find_rdev_all(dev)) {
+ printk(KERN_WARNING "md: device %s already used in a RAID array!\n",
+ partition_name(dev));
+ return -EBUSY;
+ }
+ if (!mddev->sb) {
+ /* expecting a device which has a superblock */
+ err = md_import_device(dev, 1);
+ if (err) {
+ printk(KERN_WARNING "md: md_import_device returned %d\n", err);
+ return -EINVAL;
+ }
+ rdev = find_rdev_all(dev);
+ if (!rdev) {
+ MD_BUG();
+ return -EINVAL;
+ }
+ if (!list_empty(&mddev->disks)) {
+ mdk_rdev_t *rdev0 = md_list_entry(mddev->disks.next,
+ mdk_rdev_t, same_set);
+ if (!uuid_equal(rdev0, rdev)) {
+ printk(KERN_WARNING "md: %s has different UUID to %s\n",
+ partition_name(rdev->dev), partition_name(rdev0->dev));
+ export_rdev(rdev);
+ return -EINVAL;
+ }
+ if (!sb_equal(rdev0->sb, rdev->sb)) {
+ printk(KERN_WARNING "md: %s has same UUID but different superblock to %s\n",
+ partition_name(rdev->dev), partition_name(rdev0->dev));
+ export_rdev(rdev);
+ return -EINVAL;
+ }
+ }
+ bind_rdev_to_array(rdev, mddev);
+ return 0;
+ }
+
+ nr = info->number;
+ if (nr >= mddev->sb->nr_disks) {
+ MD_BUG();
+ return -EINVAL;
+ }
+
+
+ SET_SB(number);
+ SET_SB(major);
+ SET_SB(minor);
+ SET_SB(raid_disk);
+ SET_SB(state);
+
+ if ((info->state & (1<<MD_DISK_FAULTY))==0) {
+ err = md_import_device (dev, 0);
+ if (err) {
+ printk(KERN_WARNING "md: error, md_import_device() returned %d\n", err);
+ return -EINVAL;
+ }
+ rdev = find_rdev_all(dev);
+ if (!rdev) {
+ MD_BUG();
+ return -EINVAL;
+ }
+
+ rdev->old_dev = dev;
+ rdev->desc_nr = info->number;
+
+ bind_rdev_to_array(rdev, mddev);
+
+ persistent = !mddev->sb->not_persistent;
+ if (!persistent)
+ printk(KERN_INFO "md: nonpersistent superblock ...\n");
+
+ size = calc_dev_size(dev, mddev, persistent);
+ rdev->sb_offset = calc_dev_sboffset(dev, mddev, persistent);
+
+ if (!mddev->sb->size || (mddev->sb->size > size))
+ mddev->sb->size = size;
+ }
+
+ /*
+ * sync all other superblocks with the main superblock
+ */
+ sync_sbs(mddev);
+
+ return 0;
+}
+#undef SET_SB
+
+static int hot_generate_error(mddev_t * mddev, kdev_t dev)
+{
+ struct request_queue *q;
+ mdk_rdev_t *rdev;
+ mdp_disk_t *disk;
+
+ if (!mddev->pers)
+ return -ENODEV;
+
+ printk(KERN_INFO "md: trying to generate %s error in md%d ... \n",
+ partition_name(dev), mdidx(mddev));
+
+ rdev = find_rdev(mddev, dev);
+ if (!rdev) {
+ MD_BUG();
+ return -ENXIO;
+ }
+
+ if (rdev->desc_nr == -1) {
+ MD_BUG();
+ return -EINVAL;
+ }
+ disk = &mddev->sb->disks[rdev->desc_nr];
+ if (!disk_active(disk))
+ return -ENODEV;
+
+ q = blk_get_queue(rdev->dev);
+ if (!q) {
+ MD_BUG();
+ return -ENODEV;
+ }
+ printk(KERN_INFO "md: okay, generating error!\n");
+// q->oneshot_error = 1; // disabled for now
+
+ return 0;
+}
+
+static int hot_remove_disk(mddev_t * mddev, kdev_t dev)
+{
+ int err;
+ mdk_rdev_t *rdev;
+ mdp_disk_t *disk;
+
+ if (!mddev->pers)
+ return -ENODEV;
+
+ printk(KERN_INFO "md: trying to remove %s from md%d ... \n",
+ partition_name(dev), mdidx(mddev));
+
+ if (!mddev->pers->diskop) {
+ printk(KERN_WARNING "md%d: personality does not support diskops!\n",
+ mdidx(mddev));
+ return -EINVAL;
+ }
+
+ rdev = find_rdev(mddev, dev);
+ if (!rdev)
+ return -ENXIO;
+
+ if (rdev->desc_nr == -1) {
+ MD_BUG();
+ return -EINVAL;
+ }
+ disk = &mddev->sb->disks[rdev->desc_nr];
+ if (disk_active(disk))
+ goto busy;
+
+ if (disk_removed(disk))
+ return -EINVAL;
+
+ err = mddev->pers->diskop(mddev, &disk, DISKOP_HOT_REMOVE_DISK);
+ if (err == -EBUSY)
+ goto busy;
+
+ if (err) {
+ MD_BUG();
+ return -EINVAL;
+ }
+
+ remove_descriptor(disk, mddev->sb);
+ kick_rdev_from_array(rdev);
+ mddev->sb_dirty = 1;
+ md_update_sb(mddev);
+
+ return 0;
+busy:
+ printk(KERN_WARNING "md: cannot remove active disk %s from md%d ... \n",
+ partition_name(dev), mdidx(mddev));
+ return -EBUSY;
+}
+
+static int hot_add_disk(mddev_t * mddev, kdev_t dev)
+{
+ int i, err, persistent;
+ unsigned int size;
+ mdk_rdev_t *rdev;
+ mdp_disk_t *disk;
+
+ if (!mddev->pers)
+ return -ENODEV;
+
+ printk(KERN_INFO "md: trying to hot-add %s to md%d ... \n",
+ partition_name(dev), mdidx(mddev));
+
+ if (!mddev->pers->diskop) {
+ printk(KERN_WARNING "md%d: personality does not support diskops!\n",
+ mdidx(mddev));
+ return -EINVAL;
+ }
+
+ persistent = !mddev->sb->not_persistent;
+
+ rdev = find_rdev(mddev, dev);
+ if (rdev)
+ return -EBUSY;
+
+ err = md_import_device (dev, 0);
+ if (err) {
+ printk(KERN_WARNING "md: error, md_import_device() returned %d\n", err);
+ return -EINVAL;
+ }
+ rdev = find_rdev_all(dev);
+ if (!rdev) {
+ MD_BUG();
+ return -EINVAL;
+ }
+ if (rdev->faulty) {
+ printk(KERN_WARNING "md: can not hot-add faulty %s disk to md%d!\n",
+ partition_name(dev), mdidx(mddev));
+ err = -EINVAL;
+ goto abort_export;
+ }
+ size = calc_dev_size(dev, mddev, persistent);
+
+ if (size < mddev->sb->size) {
+ printk(KERN_WARNING "md%d: disk size %d blocks < array size %d\n",
+ mdidx(mddev), size, mddev->sb->size);
+ err = -ENOSPC;
+ goto abort_export;
+ }
+ bind_rdev_to_array(rdev, mddev);
+
+ /*
+ * The rest should better be atomic, we can have disk failures
+ * noticed in interrupt contexts ...
+ */
+ rdev->old_dev = dev;
+ rdev->size = size;
+ rdev->sb_offset = calc_dev_sboffset(dev, mddev, persistent);
+
+ disk = mddev->sb->disks + mddev->sb->raid_disks;
+ for (i = mddev->sb->raid_disks; i < MD_SB_DISKS; i++) {
+ disk = mddev->sb->disks + i;
+
+ if (!disk->major && !disk->minor)
+ break;
+ if (disk_removed(disk))
+ break;
+ }
+ if (i == MD_SB_DISKS) {
+ printk(KERN_WARNING "md%d: can not hot-add to full array!\n",
+ mdidx(mddev));
+ err = -EBUSY;
+ goto abort_unbind_export;
+ }
+
+ if (disk_removed(disk)) {
+ /*
+ * reuse slot
+ */
+ if (disk->number != i) {
+ MD_BUG();
+ err = -EINVAL;
+ goto abort_unbind_export;
+ }
+ } else {
+ disk->number = i;
+ }
+
+ disk->raid_disk = disk->number;
+ disk->major = MAJOR(dev);
+ disk->minor = MINOR(dev);
+
+ if (mddev->pers->diskop(mddev, &disk, DISKOP_HOT_ADD_DISK)) {
+ MD_BUG();
+ err = -EINVAL;
+ goto abort_unbind_export;
+ }
+
+ mark_disk_spare(disk);
+ mddev->sb->nr_disks++;
+ mddev->sb->spare_disks++;
+ mddev->sb->working_disks++;
+
+ mddev->sb_dirty = 1;
+ md_update_sb(mddev);
+
+ /*
+ * Kick recovery, maybe this spare has to be added to the
+ * array immediately.
+ */
+ md_recover_arrays();
+
+ return 0;
+
+abort_unbind_export:
+ unbind_rdev_from_array(rdev);
+
+abort_export:
+ export_rdev(rdev);
+ return err;
+}
+
+#define SET_SB(x) mddev->sb->x = info->x
+static int set_array_info(mddev_t * mddev, mdu_array_info_t *info)
+{
+
+ if (alloc_array_sb(mddev))
+ return -ENOMEM;
+
+ mddev->sb->major_version = MD_MAJOR_VERSION;
+ mddev->sb->minor_version = MD_MINOR_VERSION;
+ mddev->sb->patch_version = MD_PATCHLEVEL_VERSION;
+ mddev->sb->ctime = CURRENT_TIME;
+
+ SET_SB(level);
+ SET_SB(size);
+ SET_SB(nr_disks);
+ SET_SB(raid_disks);
+ SET_SB(md_minor);
+ SET_SB(not_persistent);
+
+ SET_SB(state);
+ SET_SB(active_disks);
+ SET_SB(working_disks);
+ SET_SB(failed_disks);
+ SET_SB(spare_disks);
+
+ SET_SB(layout);
+ SET_SB(chunk_size);
+
+ mddev->sb->md_magic = MD_SB_MAGIC;
+
+ /*
+ * Generate a 128 bit UUID
+ */
+ get_random_bytes(&mddev->sb->set_uuid0, 4);
+ get_random_bytes(&mddev->sb->set_uuid1, 4);
+ get_random_bytes(&mddev->sb->set_uuid2, 4);
+ get_random_bytes(&mddev->sb->set_uuid3, 4);
+
+ return 0;
+}
+#undef SET_SB
+
+static int set_disk_faulty(mddev_t *mddev, kdev_t dev)
+{
+ int ret;
+
+ ret = md_error(mddev, dev);
+ return ret;
+}
+
+static int md_ioctl(struct inode *inode, struct file *file,
+ unsigned int cmd, unsigned long arg)
+{
+ unsigned int minor;
+ int err = 0;
+ struct hd_geometry *loc = (struct hd_geometry *) arg;
+ mddev_t *mddev = NULL;
+ kdev_t dev;
+
+ if (!md_capable_admin())
+ return -EACCES;
+
+ dev = inode->i_rdev;
+ minor = MINOR(dev);
+ if (minor >= MAX_MD_DEVS) {
+ MD_BUG();
+ return -EINVAL;
+ }
+
+ /*
+ * Commands dealing with the RAID driver but not any
+ * particular array:
+ */
+ switch (cmd)
+ {
+ case RAID_VERSION:
+ err = get_version((void *)arg);
+ goto done;
+
+ case PRINT_RAID_DEBUG:
+ err = 0;
+ md_print_devices();
+ goto done_unlock;
+
+#ifndef MODULE
+ case RAID_AUTORUN:
+ err = 0;
+ autostart_arrays();
+ goto done;
+#endif
+
+ case BLKGETSIZE:
+ case BLKGETSIZE64:
+ case BLKRAGET:
+ case BLKRASET:
+ case BLKFLSBUF:
+ case BLKBSZGET:
+ case BLKBSZSET:
+ err = blk_ioctl (dev, cmd, arg);
+ goto abort;
+
+ default:;
+ }
+
+ /*
+ * Commands creating/starting a new array:
+ */
+
+ mddev = kdev_to_mddev(dev);
+
+ switch (cmd)
+ {
+ case SET_ARRAY_INFO:
+ case START_ARRAY:
+ if (mddev) {
+ printk(KERN_WARNING "md: array md%d already exists!\n",
+ mdidx(mddev));
+ err = -EEXIST;
+ goto abort;
+ }
+ default:;
+ }
+ switch (cmd)
+ {
+ case SET_ARRAY_INFO:
+ mddev = alloc_mddev(dev);
+ if (!mddev) {
+ err = -ENOMEM;
+ goto abort;
+ }
+ atomic_inc(&mddev->active);
+
+ /*
+ * alloc_mddev() should possibly self-lock.
+ */
+ err = lock_mddev(mddev);
+ if (err) {
+ printk(KERN_WARNING "md: ioctl, reason %d, cmd %d\n",
+ err, cmd);
+ goto abort;
+ }
+
+ if (mddev->sb) {
+ printk(KERN_WARNING "md: array md%d already has a superblock!\n",
+ mdidx(mddev));
+ err = -EBUSY;
+ goto abort_unlock;
+ }
+ if (arg) {
+ mdu_array_info_t info;
+ if (md_copy_from_user(&info, (void*)arg, sizeof(info))) {
+ err = -EFAULT;
+ goto abort_unlock;
+ }
+ err = set_array_info(mddev, &info);
+ if (err) {
+					printk(KERN_WARNING "md: couldn't set array info. %d\n", err);
+ goto abort_unlock;
+ }
+ }
+ goto done_unlock;
+
+ case START_ARRAY:
+ /*
+ * possibly make it lock the array ...
+ */
+ err = autostart_array((kdev_t)arg, dev);
+ if (err) {
+ printk(KERN_WARNING "md: autostart %s failed!\n",
+ partition_name((kdev_t)arg));
+ goto abort;
+ }
+ goto done;
+
+ default:;
+ }
+
+ /*
+ * Commands querying/configuring an existing array:
+ */
+
+ if (!mddev) {
+ err = -ENODEV;
+ goto abort;
+ }
+ err = lock_mddev(mddev);
+ if (err) {
+ printk(KERN_INFO "md: ioctl lock interrupted, reason %d, cmd %d\n",err, cmd);
+ goto abort;
+ }
+ /* if we don't have a superblock yet, only ADD_NEW_DISK or STOP_ARRAY is allowed */
+ if (!mddev->sb && cmd != ADD_NEW_DISK && cmd != STOP_ARRAY && cmd != RUN_ARRAY) {
+ err = -ENODEV;
+ goto abort_unlock;
+ }
+
+ /*
+ * Commands even a read-only array can execute:
+ */
+ switch (cmd)
+ {
+ case GET_ARRAY_INFO:
+ err = get_array_info(mddev, (void *)arg);
+ goto done_unlock;
+
+ case GET_DISK_INFO:
+ err = get_disk_info(mddev, (void *)arg);
+ goto done_unlock;
+
+ case RESTART_ARRAY_RW:
+ err = restart_array(mddev);
+ goto done_unlock;
+
+ case STOP_ARRAY:
+ if (!(err = do_md_stop (mddev, 0)))
+ mddev = NULL;
+ goto done_unlock;
+
+ case STOP_ARRAY_RO:
+ err = do_md_stop (mddev, 1);
+ goto done_unlock;
+
+ /*
+ * We have a problem here : there is no easy way to give a CHS
+	 * virtual geometry. We currently pretend that we have 2 heads and
+	 * 4 sectors per track (with a BIG number of cylinders...). This drives
+ * dosfs just mad... ;-)
+ */
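+	/*
+	 * With 2 heads and 4 sectors per track, one cylinder is 8 sectors,
+	 * hence the nr_sects/8 reported as the cylinder count below.
+	 */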
+ case HDIO_GETGEO:
+ if (!loc) {
+ err = -EINVAL;
+ goto abort_unlock;
+ }
+ err = md_put_user (2, (char *) &loc->heads);
+ if (err)
+ goto abort_unlock;
+ err = md_put_user (4, (char *) &loc->sectors);
+ if (err)
+ goto abort_unlock;
+ err = md_put_user (md_hd_struct[mdidx(mddev)].nr_sects/8,
+ (short *) &loc->cylinders);
+ if (err)
+ goto abort_unlock;
+ err = md_put_user (md_hd_struct[minor].start_sect,
+ (long *) &loc->start);
+ goto done_unlock;
+ }
+
+ /*
+ * The remaining ioctls are changing the state of the
+ * superblock, so we do not allow read-only arrays
+ * here:
+ */
+ if (mddev->ro) {
+ err = -EROFS;
+ goto abort_unlock;
+ }
+
+ switch (cmd)
+ {
+ case ADD_NEW_DISK:
+ {
+ mdu_disk_info_t info;
+ if (md_copy_from_user(&info, (void*)arg, sizeof(info)))
+ err = -EFAULT;
+ else
+ err = add_new_disk(mddev, &info);
+ goto done_unlock;
+ }
+ case HOT_GENERATE_ERROR:
+ err = hot_generate_error(mddev, (kdev_t)arg);
+ goto done_unlock;
+ case HOT_REMOVE_DISK:
+ err = hot_remove_disk(mddev, (kdev_t)arg);
+ goto done_unlock;
+
+ case HOT_ADD_DISK:
+ err = hot_add_disk(mddev, (kdev_t)arg);
+ goto done_unlock;
+
+ case SET_DISK_FAULTY:
+ err = set_disk_faulty(mddev, (kdev_t)arg);
+ goto done_unlock;
+
+ case RUN_ARRAY:
+ {
+ err = do_md_run (mddev);
+ /*
+ * we have to clean up the mess if
+ * the array cannot be run for some
+ * reason ...
+ */
+ if (err) {
+ mddev->sb_dirty = 0;
+ if (!do_md_stop (mddev, 0))
+ mddev = NULL;
+ }
+ goto done_unlock;
+ }
+
+ default:
+ printk(KERN_WARNING "md: %s(pid %d) used obsolete MD ioctl, "
+			"upgrade your software to use new ioctls.\n",
+ current->comm, current->pid);
+ err = -EINVAL;
+ goto abort_unlock;
+ }
+
+done_unlock:
+abort_unlock:
+ if (mddev)
+ unlock_mddev(mddev);
+
+ return err;
+done:
+ if (err)
+ MD_BUG();
+abort:
+ return err;
+}
+
+static int md_open(struct inode *inode, struct file *file)
+{
+ /*
+ * Always succeed, but increment the usage count
+ */
+ mddev_t *mddev = kdev_to_mddev(inode->i_rdev);
+ if (mddev)
+ atomic_inc(&mddev->active);
+ return (0);
+}
+
+static int md_release(struct inode *inode, struct file * file)
+{
+ mddev_t *mddev = kdev_to_mddev(inode->i_rdev);
+ if (mddev)
+ atomic_dec(&mddev->active);
+ return 0;
+}
+
+static struct block_device_operations md_fops=
+{
+ owner: THIS_MODULE,
+ open: md_open,
+ release: md_release,
+ ioctl: md_ioctl,
+};
+
+
+int md_thread(void * arg)
+{
+ mdk_thread_t *thread = arg;
+
+ md_lock_kernel();
+
+ /*
+ * Detach thread
+ */
+
+ daemonize();
+
+ sprintf(current->comm, thread->name);
+ md_init_signals();
+ md_flush_signals();
+ thread->tsk = current;
+
+ /*
+	 * md_thread is a 'system-thread', its priority should be very
+ * high. We avoid resource deadlocks individually in each
+ * raid personality. (RAID5 does preallocation) We also use RR and
+ * the very same RT priority as kswapd, thus we will never get
+ * into a priority inversion deadlock.
+ *
+ * we definitely have to have equal or higher priority than
+ * bdflush, otherwise bdflush will deadlock if there are too
+ * many dirty RAID5 blocks.
+ */
+ current->policy = SCHED_OTHER;
+ current->nice = -20;
+ md_unlock_kernel();
+
+ complete(thread->event);
+ while (thread->run) {
+ void (*run)(void *data);
+
+ wait_event_interruptible(thread->wqueue,
+ test_bit(THREAD_WAKEUP, &thread->flags));
+
+ clear_bit(THREAD_WAKEUP, &thread->flags);
+
+ run = thread->run;
+ if (run) {
+ run(thread->data);
+ run_task_queue(&tq_disk);
+ }
+ if (md_signal_pending(current))
+ md_flush_signals();
+ }
+ complete(thread->event);
+ return 0;
+}
+
+void md_wakeup_thread(mdk_thread_t *thread)
+{
+ dprintk("md: waking up MD thread %p.\n", thread);
+ set_bit(THREAD_WAKEUP, &thread->flags);
+ wake_up(&thread->wqueue);
+}
+
+mdk_thread_t *md_register_thread(void (*run) (void *),
+ void *data, const char *name)
+{
+ mdk_thread_t *thread;
+ int ret;
+ struct completion event;
+
+ thread = (mdk_thread_t *) kmalloc
+ (sizeof(mdk_thread_t), GFP_KERNEL);
+ if (!thread)
+ return NULL;
+
+ memset(thread, 0, sizeof(mdk_thread_t));
+ md_init_waitqueue_head(&thread->wqueue);
+
+ init_completion(&event);
+ thread->event = &event;
+ thread->run = run;
+ thread->data = data;
+ thread->name = name;
+ ret = kernel_thread(md_thread, thread, 0);
+ if (ret < 0) {
+ kfree(thread);
+ return NULL;
+ }
+ wait_for_completion(&event);
+ return thread;
+}
+
+void md_interrupt_thread(mdk_thread_t *thread)
+{
+ if (!thread->tsk) {
+ MD_BUG();
+ return;
+ }
+ dprintk("interrupting MD-thread pid %d\n", thread->tsk->pid);
+ send_sig(SIGKILL, thread->tsk, 1);
+}
+
+void md_unregister_thread(mdk_thread_t *thread)
+{
+ struct completion event;
+
+ init_completion(&event);
+
+ thread->event = &event;
+ thread->run = NULL;
+ thread->name = NULL;
+ md_interrupt_thread(thread);
+ wait_for_completion(&event);
+ kfree(thread);
+}
+
+void md_recover_arrays(void)
+{
+ if (!md_recovery_thread) {
+ MD_BUG();
+ return;
+ }
+ md_wakeup_thread(md_recovery_thread);
+}
+
+
+int md_error(mddev_t *mddev, kdev_t rdev)
+{
+ mdk_rdev_t * rrdev;
+
+ dprintk("md_error dev:(%d:%d), rdev:(%d:%d), (caller: %p,%p,%p,%p).\n",
+ MD_MAJOR,mdidx(mddev),MAJOR(rdev),MINOR(rdev),
+ __builtin_return_address(0),__builtin_return_address(1),
+ __builtin_return_address(2),__builtin_return_address(3));
+
+ if (!mddev) {
+ MD_BUG();
+ return 0;
+ }
+ rrdev = find_rdev(mddev, rdev);
+ if (!rrdev || rrdev->faulty)
+ return 0;
+ if (!mddev->pers->error_handler
+ || mddev->pers->error_handler(mddev,rdev) <= 0) {
+ rrdev->faulty = 1;
+ } else
+ return 1;
+ /*
+ * if recovery was running, stop it now.
+ */
+ if (mddev->pers->stop_resync)
+ mddev->pers->stop_resync(mddev);
+ if (mddev->recovery_running)
+ md_interrupt_thread(md_recovery_thread);
+ md_recover_arrays();
+
+ return 0;
+}
+
+static void status_unused(struct seq_file *seq)
+{
+ int i = 0;
+ mdk_rdev_t *rdev;
+ struct md_list_head *tmp;
+
+ seq_printf(seq, "unused devices: ");
+
+ ITERATE_RDEV_ALL(rdev,tmp) {
+ if (list_empty(&rdev->same_set)) {
+ /*
+ * The device is not yet used by any array.
+ */
+ i++;
+ seq_printf(seq, "%s ",
+ partition_name(rdev->dev));
+ }
+ }
+ if (!i)
+ seq_printf(seq, "<none>");
+
+ seq_printf(seq, "\n");
+}
+
+
+static void status_resync(struct seq_file *seq, mddev_t * mddev)
+{
+ unsigned long max_blocks, resync, res, dt, db, rt;
+
+ resync = (mddev->curr_resync - atomic_read(&mddev->recovery_active))/2;
+ max_blocks = mddev->sb->size;
+
+ /*
+ * Should not happen.
+ */
+ if (!max_blocks)
+ MD_BUG();
+
+ res = (resync/1024)*1000/(max_blocks/1024 + 1);
+ {
+ int i, x = res/50, y = 20-x;
+ seq_printf(seq, "[");
+ for (i = 0; i < x; i++)
+ seq_printf(seq, "=");
+ seq_printf(seq, ">");
+ for (i = 0; i < y; i++)
+ seq_printf(seq, ".");
+ seq_printf(seq, "] ");
+ }
+ if (!mddev->recovery_running)
+ /*
+ * true resync
+ */
+ seq_printf(seq, " resync =%3lu.%lu%% (%lu/%lu)",
+ res/10, res % 10, resync, max_blocks);
+ else
+ /*
+ * recovery ...
+ */
+ seq_printf(seq, " recovery =%3lu.%lu%% (%lu/%lu)",
+ res/10, res % 10, resync, max_blocks);
+
+ /*
+ * We do not want to overflow, so the order of operands and
+ * the * 100 / 100 trick are important. We do a +1 to be
+ * safe against division by zero. We only estimate anyway.
+ *
+ * dt: time from mark until now
+ * db: blocks written from mark until now
+ * rt: remaining time
+ */
+ dt = ((jiffies - mddev->resync_mark) / HZ);
+ if (!dt) dt++;
+ db = resync - (mddev->resync_mark_cnt/2);
+ rt = (dt * ((max_blocks-resync) / (db/100+1)))/100;
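+	/*
+	 * Worked example with assumed numbers: dt = 10 seconds,
+	 * db = 20480 blocks done since the mark, 204800 blocks left:
+	 * rt = (10 * (204800 / (20480/100 + 1))) / 100 ~= 99 seconds.
+	 */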
+
+ seq_printf(seq, " finish=%lu.%lumin", rt / 60, (rt % 60)/6);
+
+ seq_printf(seq, " speed=%ldK/sec", db/dt);
+
+}
+
+
+static void *md_seq_start(struct seq_file *seq, loff_t *pos)
+{
+ struct list_head *tmp;
+ loff_t l = *pos;
+ mddev_t *mddev;
+
+ if (l > 0x10000)
+ return NULL;
+ if (!l--)
+ /* header */
+ return (void*)1;
+
+ list_for_each(tmp,&all_mddevs)
+ if (!l--) {
+ mddev = list_entry(tmp, mddev_t, all_mddevs);
+ return mddev;
+ }
+ return (void*)2;/* tail */
+}
+
+static void *md_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ struct list_head *tmp;
+ mddev_t *next_mddev, *mddev = v;
+
+ ++*pos;
+ if (v == (void*)2)
+ return NULL;
+
+ if (v == (void*)1)
+ tmp = all_mddevs.next;
+ else
+ tmp = mddev->all_mddevs.next;
+ if (tmp != &all_mddevs)
+ next_mddev = list_entry(tmp,mddev_t,all_mddevs);
+ else {
+ next_mddev = (void*)2;
+ *pos = 0x10000;
+ }
+
+ return next_mddev;
+
+}
+
+static void md_seq_stop(struct seq_file *seq, void *v)
+{
+
+}
+
+static int md_seq_show(struct seq_file *seq, void *v)
+{
+ int j, size;
+ struct md_list_head *tmp2;
+ mdk_rdev_t *rdev;
+ mddev_t *mddev = v;
+
+ if (v == (void*)1) {
+ seq_printf(seq, "Personalities : ");
+ for (j = 0; j < MAX_PERSONALITY; j++)
+ if (pers[j])
+ seq_printf(seq, "[%s] ", pers[j]->name);
+
+ seq_printf(seq, "\n");
+ seq_printf(seq, "read_ahead ");
+ if (read_ahead[MD_MAJOR] == INT_MAX)
+ seq_printf(seq, "not set\n");
+ else
+ seq_printf(seq, "%d sectors\n", read_ahead[MD_MAJOR]);
+ return 0;
+ }
+ if (v == (void*)2) {
+ status_unused(seq);
+ return 0;
+ }
+
+ seq_printf(seq, "md%d : %sactive", mdidx(mddev),
+ mddev->pers ? "" : "in");
+ if (mddev->pers) {
+ if (mddev->ro)
+ seq_printf(seq, " (read-only)");
+ seq_printf(seq, " %s", mddev->pers->name);
+ }
+
+ size = 0;
+ ITERATE_RDEV(mddev,rdev,tmp2) {
+ seq_printf(seq, " %s[%d]",
+ partition_name(rdev->dev), rdev->desc_nr);
+ if (rdev->faulty) {
+ seq_printf(seq, "(F)");
+ continue;
+ }
+ size += rdev->size;
+ }
+
+ if (!list_empty(&mddev->disks)) {
+ if (mddev->pers)
+ seq_printf(seq, "\n %d blocks",
+ md_size[mdidx(mddev)]);
+ else
+ seq_printf(seq, "\n %d blocks", size);
+ }
+
+ if (mddev->pers) {
+
+ mddev->pers->status (seq, mddev);
+
+ seq_printf(seq, "\n ");
+ if (mddev->curr_resync) {
+ status_resync (seq, mddev);
+ } else {
+ if (sem_getcount(&mddev->resync_sem) != 1)
+ seq_printf(seq, " resync=DELAYED");
+ }
+ }
+ seq_printf(seq, "\n");
+
+ return 0;
+}
+
+
+static struct seq_operations md_seq_ops = {
+ .start = md_seq_start,
+ .next = md_seq_next,
+ .stop = md_seq_stop,
+ .show = md_seq_show,
+};
+
+static int md_seq_open(struct inode *inode, struct file *file)
+{
+ int error;
+
+ error = seq_open(file, &md_seq_ops);
+ return error;
+}
+
+static struct file_operations md_seq_fops = {
+ .open = md_seq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+
+int register_md_personality(int pnum, mdk_personality_t *p)
+{
+ if (pnum >= MAX_PERSONALITY) {
+ MD_BUG();
+ return -EINVAL;
+ }
+
+ if (pers[pnum]) {
+ MD_BUG();
+ return -EBUSY;
+ }
+
+ pers[pnum] = p;
+ printk(KERN_INFO "md: %s personality registered as nr %d\n", p->name, pnum);
+ return 0;
+}
+
+int unregister_md_personality(int pnum)
+{
+ if (pnum >= MAX_PERSONALITY) {
+ MD_BUG();
+ return -EINVAL;
+ }
+
+ printk(KERN_INFO "md: %s personality unregistered\n", pers[pnum]->name);
+ pers[pnum] = NULL;
+ return 0;
+}
+
+mdp_disk_t *get_spare(mddev_t *mddev)
+{
+ mdp_super_t *sb = mddev->sb;
+ mdp_disk_t *disk;
+ mdk_rdev_t *rdev;
+ struct md_list_head *tmp;
+
+ ITERATE_RDEV(mddev,rdev,tmp) {
+ if (rdev->faulty)
+ continue;
+ if (!rdev->sb) {
+ MD_BUG();
+ continue;
+ }
+ disk = &sb->disks[rdev->desc_nr];
+ if (disk_faulty(disk)) {
+ MD_BUG();
+ continue;
+ }
+ if (disk_active(disk))
+ continue;
+ return disk;
+ }
+ return NULL;
+}
+
+static unsigned int sync_io[DK_MAX_MAJOR][DK_MAX_DISK];
+void md_sync_acct(kdev_t dev, unsigned long nr_sectors)
+{
+ unsigned int major = MAJOR(dev);
+ unsigned int index;
+
+ index = disk_index(dev);
+ if ((index >= DK_MAX_DISK) || (major >= DK_MAX_MAJOR))
+ return;
+
+ sync_io[major][index] += nr_sectors;
+}
+
+static int is_mddev_idle(mddev_t *mddev)
+{
+ mdk_rdev_t * rdev;
+ struct md_list_head *tmp;
+ int idle;
+ unsigned long curr_events;
+
+ idle = 1;
+ ITERATE_RDEV(mddev,rdev,tmp) {
+ int major = MAJOR(rdev->dev);
+ int idx = disk_index(rdev->dev);
+
+ if ((idx >= DK_MAX_DISK) || (major >= DK_MAX_MAJOR))
+ continue;
+
+ curr_events = kstat.dk_drive_rblk[major][idx] +
+ kstat.dk_drive_wblk[major][idx] ;
+ curr_events -= sync_io[major][idx];
+ if ((curr_events - rdev->last_events) > 32) {
+ rdev->last_events = curr_events;
+ idle = 0;
+ }
+ }
+ return idle;
+}
+
+MD_DECLARE_WAIT_QUEUE_HEAD(resync_wait);
+
+void md_done_sync(mddev_t *mddev, int blocks, int ok)
+{
+ /* another "blocks" (512byte) blocks have been synced */
+ atomic_sub(blocks, &mddev->recovery_active);
+ wake_up(&mddev->recovery_wait);
+ if (!ok) {
+ // stop recovery, signal do_sync ....
+ if (mddev->pers->stop_resync)
+ mddev->pers->stop_resync(mddev);
+ if (mddev->recovery_running)
+ md_interrupt_thread(md_recovery_thread);
+ }
+}
+
+#define SYNC_MARKS 10
+#define SYNC_MARK_STEP (3*HZ)
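+/*
+ * A mark is recorded every SYNC_MARK_STEP (3 seconds); SYNC_MARKS of them
+ * give roughly a 30 second window over which resync_mark/resync_mark_cnt
+ * feed the running speed estimate below.
+ */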
+int md_do_sync(mddev_t *mddev, mdp_disk_t *spare)
+{
+ mddev_t *mddev2;
+ unsigned int max_sectors, currspeed,
+ j, window, err, serialize;
+ unsigned long mark[SYNC_MARKS];
+ unsigned long mark_cnt[SYNC_MARKS];
+ int last_mark,m;
+ struct md_list_head *tmp;
+ unsigned long last_check;
+
+
+ err = down_interruptible(&mddev->resync_sem);
+ if (err)
+ goto out_nolock;
+
+recheck:
+ serialize = 0;
+ ITERATE_MDDEV(mddev2,tmp) {
+ if (mddev2 == mddev)
+ continue;
+ if (mddev2->curr_resync && match_mddev_units(mddev,mddev2)) {
+ printk(KERN_INFO "md: delaying resync of md%d until md%d "
+ "has finished resync (they share one or more physical units)\n",
+ mdidx(mddev), mdidx(mddev2));
+ serialize = 1;
+ break;
+ }
+ }
+ if (serialize) {
+ interruptible_sleep_on(&resync_wait);
+ if (md_signal_pending(current)) {
+ md_flush_signals();
+ err = -EINTR;
+ goto out;
+ }
+ goto recheck;
+ }
+
+ mddev->curr_resync = 1;
+
+ max_sectors = mddev->sb->size<<1;
+
+ printk(KERN_INFO "md: syncing RAID array md%d\n", mdidx(mddev));
+ printk(KERN_INFO "md: minimum _guaranteed_ reconstruction speed: %d KB/sec/disc.\n",
+ sysctl_speed_limit_min);
+	printk(KERN_INFO "md: using maximum available idle IO bandwidth "
+ "(but not more than %d KB/sec) for reconstruction.\n",
+ sysctl_speed_limit_max);
+
+ /*
+ * Resync has low priority.
+ */
+ current->nice = 19;
+
+ is_mddev_idle(mddev); /* this also initializes IO event counters */
+ for (m = 0; m < SYNC_MARKS; m++) {
+ mark[m] = jiffies;
+ mark_cnt[m] = 0;
+ }
+ last_mark = 0;
+ mddev->resync_mark = mark[last_mark];
+ mddev->resync_mark_cnt = mark_cnt[last_mark];
+
+ /*
+ * Tune reconstruction:
+ */
+ window = vm_max_readahead*(PAGE_SIZE/512);
+ printk(KERN_INFO "md: using %dk window, over a total of %d blocks.\n",
+ window/2,max_sectors/2);
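+	/*
+	 * Illustrative numbers (assumed, not taken from this kernel config):
+	 * with 4 KiB pages (8 sectors per page) and vm_max_readahead = 32
+	 * pages, window is 256 sectors, i.e. the "128k window" printed above.
+	 */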
+
+ atomic_set(&mddev->recovery_active, 0);
+ init_waitqueue_head(&mddev->recovery_wait);
+ last_check = 0;
+ for (j = 0; j < max_sectors;) {
+ int sectors;
+
+ sectors = mddev->pers->sync_request(mddev, j);
+
+ if (sectors < 0) {
+ err = sectors;
+ goto out;
+ }
+ atomic_add(sectors, &mddev->recovery_active);
+ j += sectors;
+ mddev->curr_resync = j;
+
+ if (last_check + window > j)
+ continue;
+
+ last_check = j;
+
+ run_task_queue(&tq_disk);
+
+ repeat:
+ if (jiffies >= mark[last_mark] + SYNC_MARK_STEP ) {
+ /* step marks */
+ int next = (last_mark+1) % SYNC_MARKS;
+
+ mddev->resync_mark = mark[next];
+ mddev->resync_mark_cnt = mark_cnt[next];
+ mark[next] = jiffies;
+ mark_cnt[next] = j - atomic_read(&mddev->recovery_active);
+ last_mark = next;
+ }
+
+
+ if (md_signal_pending(current)) {
+ /*
+ * got a signal, exit.
+ */
+ mddev->curr_resync = 0;
+ printk(KERN_INFO "md: md_do_sync() got signal ... exiting\n");
+ md_flush_signals();
+ err = -EINTR;
+ goto out;
+ }
+
+ /*
+		 * this loop exits only when either we are slower than
+ * the 'hard' speed limit, or the system was IO-idle for
+ * a jiffy.
+ * the system might be non-idle CPU-wise, but we only care
+ * about not overloading the IO subsystem. (things like an
+ * e2fsck being done on the RAID array should execute fast)
+ */
+ if (md_need_resched(current))
+ schedule();
+
+ currspeed = (j-mddev->resync_mark_cnt)/2/((jiffies-mddev->resync_mark)/HZ +1) +1;
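+		/*
+		 * (j - resync_mark_cnt) is sectors done since the mark; /2
+		 * converts to KB, and the elapsed-seconds divisor (+1 avoids
+		 * division by zero) makes currspeed KB/sec, directly
+		 * comparable with sysctl_speed_limit_min/max below.
+		 */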
+
+ if (currspeed > sysctl_speed_limit_min) {
+ current->nice = 19;
+
+ if ((currspeed > sysctl_speed_limit_max) ||
+ !is_mddev_idle(mddev)) {
+ current->state = TASK_INTERRUPTIBLE;
+ md_schedule_timeout(HZ/4);
+ goto repeat;
+ }
+ } else
+ current->nice = -20;
+ }
+ printk(KERN_INFO "md: md%d: sync done.\n",mdidx(mddev));
+ err = 0;
+ /*
+ * this also signals 'finished resyncing' to md_stop
+ */
+out:
+ wait_disk_event(mddev->recovery_wait, atomic_read(&mddev->recovery_active)==0);
+ up(&mddev->resync_sem);
+out_nolock:
+ mddev->curr_resync = 0;
+ wake_up(&resync_wait);
+ return err;
+}
+
+
+/*
+ * This is a kernel thread which syncs a spare disk with the active array
+ *
+ * the amount of foolproofing might seem to be a tad excessive, but an
+ * early (not so error-safe) version of raid1syncd synced the first 0.5 gigs
+ * of my root partition with the first 0.5 gigs of my /home partition ... so
+ * i'm a bit nervous ;)
+ */
+void md_do_recovery(void *data)
+{
+ int err;
+ mddev_t *mddev;
+ mdp_super_t *sb;
+ mdp_disk_t *spare;
+ struct md_list_head *tmp;
+
+ printk(KERN_INFO "md: recovery thread got woken up ...\n");
+restart:
+ ITERATE_MDDEV(mddev,tmp) {
+ sb = mddev->sb;
+ if (!sb)
+ continue;
+ if (mddev->recovery_running)
+ continue;
+ if (sb->active_disks == sb->raid_disks)
+ continue;
+ if (mddev->sb_dirty)
+ md_update_sb(mddev);
+ if (!sb->spare_disks) {
+ printk(KERN_ERR "md%d: no spare disk to reconstruct array! "
+ "-- continuing in degraded mode\n", mdidx(mddev));
+ continue;
+ }
+ /*
+ * now here we get the spare and resync it.
+ */
+ spare = get_spare(mddev);
+ if (!spare)
+ continue;
+ printk(KERN_INFO "md%d: resyncing spare disk %s to replace failed disk\n",
+ mdidx(mddev), partition_name(MKDEV(spare->major,spare->minor)));
+ if (!mddev->pers->diskop)
+ continue;
+ if (mddev->pers->diskop(mddev, &spare, DISKOP_SPARE_WRITE))
+ continue;
+ down(&mddev->recovery_sem);
+ mddev->recovery_running = 1;
+ err = md_do_sync(mddev, spare);
+ if (err == -EIO) {
+ printk(KERN_INFO "md%d: spare disk %s failed, skipping to next spare.\n",
+ mdidx(mddev), partition_name(MKDEV(spare->major,spare->minor)));
+ if (!disk_faulty(spare)) {
+ mddev->pers->diskop(mddev,&spare,DISKOP_SPARE_INACTIVE);
+ mark_disk_faulty(spare);
+ mark_disk_nonsync(spare);
+ mark_disk_inactive(spare);
+ sb->spare_disks--;
+ sb->working_disks--;
+ sb->failed_disks++;
+ }
+ } else
+ if (disk_faulty(spare))
+ mddev->pers->diskop(mddev, &spare,
+ DISKOP_SPARE_INACTIVE);
+ if (err == -EINTR || err == -ENOMEM) {
+ /*
+ * Recovery got interrupted, or ran out of mem ...
+ * signal back that we have finished using the array.
+ */
+ mddev->pers->diskop(mddev, &spare,
+ DISKOP_SPARE_INACTIVE);
+ up(&mddev->recovery_sem);
+ mddev->recovery_running = 0;
+ continue;
+ } else {
+ mddev->recovery_running = 0;
+ up(&mddev->recovery_sem);
+ }
+ if (!disk_faulty(spare)) {
+ /*
+ * the SPARE_ACTIVE diskop possibly changes the
+ * pointer too
+ */
+ mddev->pers->diskop(mddev, &spare, DISKOP_SPARE_ACTIVE);
+ mark_disk_sync(spare);
+ mark_disk_active(spare);
+ sb->active_disks++;
+ sb->spare_disks--;
+ }
+ mddev->sb_dirty = 1;
+ md_update_sb(mddev);
+ goto restart;
+ }
+ printk(KERN_INFO "md: recovery thread finished ...\n");
+
+}
+
+int md_notify_reboot(struct notifier_block *this,
+ unsigned long code, void *x)
+{
+ struct md_list_head *tmp;
+ mddev_t *mddev;
+
+ if ((code == MD_SYS_DOWN) || (code == MD_SYS_HALT)
+ || (code == MD_SYS_POWER_OFF)) {
+
+ printk(KERN_INFO "md: stopping all md devices.\n");
+
+ ITERATE_MDDEV(mddev,tmp)
+ do_md_stop (mddev, 1);
+ /*
+ * certain more exotic SCSI devices are known to be
+ * volatile wrt too early system reboots. While the
+ * right place to handle this issue is the given
+ * driver, we do want to have a safe RAID driver ...
+ */
+ md_mdelay(1000*1);
+ }
+ return NOTIFY_DONE;
+}
+
+struct notifier_block md_notifier = {
+ notifier_call: md_notify_reboot,
+ next: NULL,
+ priority: INT_MAX, /* before any real devices */
+};
+
+static void md_geninit(void)
+{
+ struct proc_dir_entry *p;
+ int i;
+
+ for(i = 0; i < MAX_MD_DEVS; i++) {
+ md_blocksizes[i] = 1024;
+ md_size[i] = 0;
+ md_hardsect_sizes[i] = 512;
+ }
+ blksize_size[MAJOR_NR] = md_blocksizes;
+ blk_size[MAJOR_NR] = md_size;
+ max_readahead[MAJOR_NR] = md_maxreadahead;
+ hardsect_size[MAJOR_NR] = md_hardsect_sizes;
+
+ dprintk("md: sizeof(mdp_super_t) = %d\n", (int)sizeof(mdp_super_t));
+
+#ifdef CONFIG_PROC_FS
+ p = create_proc_entry("mdstat", S_IRUGO, NULL);
+ if (p)
+ p->proc_fops = &md_seq_fops;
+#endif
+}
+
+request_queue_t * md_queue_proc(kdev_t dev)
+{
+ mddev_t *mddev = kdev_to_mddev(dev);
+ if (mddev == NULL)
+ return BLK_DEFAULT_QUEUE(MAJOR_NR);
+ else
+ return &mddev->queue;
+}
+
+int md__init md_init(void)
+{
+ static char * name = "mdrecoveryd";
+ int minor;
+
+ printk(KERN_INFO "md: md driver %d.%d.%d MAX_MD_DEVS=%d, MD_SB_DISKS=%d\n",
+ MD_MAJOR_VERSION, MD_MINOR_VERSION,
+ MD_PATCHLEVEL_VERSION, MAX_MD_DEVS, MD_SB_DISKS);
+
+ if (devfs_register_blkdev (MAJOR_NR, "md", &md_fops))
+ {
+ printk(KERN_ALERT "md: Unable to get major %d for md\n", MAJOR_NR);
+ return (-1);
+ }
+ devfs_handle = devfs_mk_dir (NULL, "md", NULL);
+ /* we don't use devfs_register_series because we want to fill md_hd_struct */
+ for (minor=0; minor < MAX_MD_DEVS; ++minor) {
+ char devname[128];
+ sprintf (devname, "%u", minor);
+ md_hd_struct[minor].de = devfs_register (devfs_handle,
+ devname, DEVFS_FL_DEFAULT, MAJOR_NR, minor,
+ S_IFBLK | S_IRUSR | S_IWUSR, &md_fops, NULL);
+ }
+
+ /* all requests on an uninitialised device get failed... */
+ blk_queue_make_request(BLK_DEFAULT_QUEUE(MAJOR_NR), md_fail_request);
+ blk_dev[MAJOR_NR].queue = md_queue_proc;
+
+
+ read_ahead[MAJOR_NR] = INT_MAX;
+
+ add_gendisk(&md_gendisk);
+
+ md_recovery_thread = md_register_thread(md_do_recovery, NULL, name);
+ if (!md_recovery_thread)
+ printk(KERN_ALERT "md: bug: couldn't allocate md_recovery_thread\n");
+
+ md_register_reboot_notifier(&md_notifier);
+ raid_table_header = register_sysctl_table(raid_root_table, 1);
+
+ md_geninit();
+ return (0);
+}
+
+
+#ifndef MODULE
+
+/*
+ * When md (and any required personalities) is compiled into the kernel
+ * (not as a module), arrays can be assembled at boot time using AUTODETECT,
+ * where specially marked partitions are registered with md_autodetect_dev(),
+ * and with MD_BOOT, where devices to be collected are given on the boot line
+ * with md=.....
+ * The code for that is here.
+ */
+
+struct {
+ int set;
+ int noautodetect;
+} raid_setup_args md__initdata;
+
+/*
+ * Searches all registered partitions for autorun RAID arrays
+ * at boot time.
+ */
+static kdev_t detected_devices[128];
+static int dev_cnt;
+
+void md_autodetect_dev(kdev_t dev)
+{
+ if (dev_cnt >= 0 && dev_cnt < 127)
+ detected_devices[dev_cnt++] = dev;
+}
+
+
+static void autostart_arrays(void)
+{
+ mdk_rdev_t *rdev;
+ int i;
+
+ printk(KERN_INFO "md: Autodetecting RAID arrays.\n");
+
+ for (i = 0; i < dev_cnt; i++) {
+ kdev_t dev = detected_devices[i];
+
+ if (md_import_device(dev,1)) {
+ printk(KERN_ALERT "md: could not import %s!\n",
+ partition_name(dev));
+ continue;
+ }
+ /*
+ * Sanity checks:
+ */
+ rdev = find_rdev_all(dev);
+ if (!rdev) {
+ MD_BUG();
+ continue;
+ }
+ if (rdev->faulty) {
+ MD_BUG();
+ continue;
+ }
+ md_list_add(&rdev->pending, &pending_raid_disks);
+ }
+ dev_cnt = 0;
+
+ autorun_devices(-1);
+}
+
+static struct {
+ char device_set [MAX_MD_DEVS];
+ int pers[MAX_MD_DEVS];
+ int chunk[MAX_MD_DEVS];
+ char *device_names[MAX_MD_DEVS];
+} md_setup_args md__initdata;
+
+/*
+ * Parse the command-line parameters given to our kernel, but do not
+ * actually try to invoke the MD device now; that is handled by
+ * md_setup_drive after the low-level disk drivers have initialised.
+ *
+ * 27/11/1999: Fixed to work correctly with the 2.3 kernel (which
+ * assigns the task of parsing integer arguments to the
+ * invoked program now). Added ability to initialise all
+ * the MD devices (by specifying multiple "md=" lines)
+ * instead of just one. -- KTK
+ * 18May2000: Added support for persistent-superblock arrays:
+ * md=n,0,factor,fault,device-list uses RAID0 for device n
+ * md=n,-1,factor,fault,device-list uses LINEAR for device n
+ * md=n,device-list reads a RAID superblock from the devices
+ *		elements in device-list are read by name_to_kdev_t, so they can be
+ *		a hex number or something like /dev/hda1 or /dev/sdb
+ * 2001-06-03: Dave Cinege <dcinege@psychosis.com>
+ * Shifted name_to_kdev_t() and related operations to md_set_drive()
+ * for later execution. Rewrote section to make devfs compatible.
+ */
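+/*
+ * Illustrative examples of the formats above (device names are placeholders):
+ *
+ *	md=0,0,4,0,/dev/sda1,/dev/sdb1	RAID0 md0, factor 4 => 1<<(4+12) = 64k chunks
+ *	md=1,-1,4,0,/dev/sda2,/dev/sdb2	LINEAR md1
+ *	md=2,/dev/sda3,/dev/sdb3	md2 assembled from the devices' RAID superblocks
+ */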
+static int md__init md_setup(char *str)
+{
+ int minor, level, factor, fault;
+ char *pername = "";
+ char *str1 = str;
+
+ if (get_option(&str, &minor) != 2) { /* MD Number */
+ printk(KERN_WARNING "md: Too few arguments supplied to md=.\n");
+ return 0;
+ }
+ if (minor >= MAX_MD_DEVS) {
+ printk(KERN_WARNING "md: md=%d, Minor device number too high.\n", minor);
+ return 0;
+ } else if (md_setup_args.device_names[minor]) {
+		printk(KERN_WARNING "md: md=%d, Specified more than once. "
+ "Replacing previous definition.\n", minor);
+ }
+ switch (get_option(&str, &level)) { /* RAID Personality */
+ case 2: /* could be 0 or -1.. */
+ if (level == 0 || level == -1) {
+ if (get_option(&str, &factor) != 2 || /* Chunk Size */
+ get_option(&str, &fault) != 2) {
+ printk(KERN_WARNING "md: Too few arguments supplied to md=.\n");
+ return 0;
+ }
+ md_setup_args.pers[minor] = level;
+ md_setup_args.chunk[minor] = 1 << (factor+12);
+ switch(level) {
+ case -1:
+ level = LINEAR;
+ pername = "linear";
+ break;
+ case 0:
+ level = RAID0;
+ pername = "raid0";
+ break;
+ default:
+ printk(KERN_WARNING
+ "md: The kernel has not been configured for raid%d support!\n",
+ level);
+ return 0;
+ }
+ md_setup_args.pers[minor] = level;
+ break;
+ }
+ /* FALL THROUGH */
+ case 1: /* the first device is numeric */
+ str = str1;
+ /* FALL THROUGH */
+ case 0:
+ md_setup_args.pers[minor] = 0;
+ pername="super-block";
+ }
+
+ printk(KERN_INFO "md: Will configure md%d (%s) from %s, below.\n",
+ minor, pername, str);
+ md_setup_args.device_names[minor] = str;
+
+ return 1;
+}
+
+extern kdev_t name_to_kdev_t(char *line) md__init;
+void md__init md_setup_drive(void)
+{
+ int minor, i;
+ kdev_t dev;
+ mddev_t*mddev;
+ kdev_t devices[MD_SB_DISKS+1];
+
+ for (minor = 0; minor < MAX_MD_DEVS; minor++) {
+ int err = 0;
+ char *devname;
+ mdu_disk_info_t dinfo;
+
+ if ((devname = md_setup_args.device_names[minor]) == 0) continue;
+
+ for (i = 0; i < MD_SB_DISKS && devname != 0; i++) {
+
+ char *p;
+ void *handle;
+
+ p = strchr(devname, ',');
+ if (p)
+ *p++ = 0;
+
+ dev = name_to_kdev_t(devname);
+ handle = devfs_find_handle(NULL, devname, MAJOR (dev), MINOR (dev),
+ DEVFS_SPECIAL_BLK, 1);
+ if (handle != 0) {
+ unsigned major, minor;
+ devfs_get_maj_min(handle, &major, &minor);
+ dev = MKDEV(major, minor);
+ }
+ if (dev == 0) {
+ printk(KERN_WARNING "md: Unknown device name: %s\n", devname);
+ break;
+ }
+
+ devices[i] = dev;
+ md_setup_args.device_set[minor] = 1;
+
+ devname = p;
+ }
+ devices[i] = 0;
+
+ if (md_setup_args.device_set[minor] == 0)
+ continue;
+
+ if (mddev_map[minor]) {
+ printk(KERN_WARNING
+ "md: Ignoring md=%d, already autodetected. (Use raid=noautodetect)\n",
+ minor);
+ continue;
+ }
+ printk(KERN_INFO "md: Loading md%d: %s\n", minor, md_setup_args.device_names[minor]);
+
+ mddev = alloc_mddev(MKDEV(MD_MAJOR,minor));
+ if (!mddev) {
+ printk(KERN_ERR "md: kmalloc failed - cannot start array %d\n", minor);
+ continue;
+ }
+ if (md_setup_args.pers[minor]) {
+ /* non-persistent */
+ mdu_array_info_t ainfo;
+ ainfo.level = pers_to_level(md_setup_args.pers[minor]);
+ ainfo.size = 0;
+ ainfo.nr_disks =0;
+ ainfo.raid_disks =0;
+ ainfo.md_minor =minor;
+ ainfo.not_persistent = 1;
+
+ ainfo.state = (1 << MD_SB_CLEAN);
+ ainfo.active_disks = 0;
+ ainfo.working_disks = 0;
+ ainfo.failed_disks = 0;
+ ainfo.spare_disks = 0;
+ ainfo.layout = 0;
+ ainfo.chunk_size = md_setup_args.chunk[minor];
+ err = set_array_info(mddev, &ainfo);
+ for (i = 0; !err && (dev = devices[i]); i++) {
+ dinfo.number = i;
+ dinfo.raid_disk = i;
+ dinfo.state = (1<<MD_DISK_ACTIVE)|(1<<MD_DISK_SYNC);
+ dinfo.major = MAJOR(dev);
+ dinfo.minor = MINOR(dev);
+ mddev->sb->nr_disks++;
+ mddev->sb->raid_disks++;
+ mddev->sb->active_disks++;
+ mddev->sb->working_disks++;
+ err = add_new_disk (mddev, &dinfo);
+ }
+ } else {
+ /* persistent */
+ for (i = 0; (dev = devices[i]); i++) {
+ dinfo.major = MAJOR(dev);
+ dinfo.minor = MINOR(dev);
+ add_new_disk (mddev, &dinfo);
+ }
+ }
+ if (!err)
+ err = do_md_run(mddev);
+ if (err) {
+ mddev->sb_dirty = 0;
+ do_md_stop(mddev, 0);
+ printk(KERN_WARNING "md: starting md%d failed\n", minor);
+ }
+ }
+}
+
+static int md__init raid_setup(char *str)
+{
+ int len, pos;
+
+ len = strlen(str) + 1;
+ pos = 0;
+
+ while (pos < len) {
+ char *comma = strchr(str+pos, ',');
+ int wlen;
+ if (comma)
+ wlen = (comma-str)-pos;
+ else wlen = (len-1)-pos;
+
+ if (strncmp(str, "noautodetect", wlen) == 0)
+ raid_setup_args.noautodetect = 1;
+ pos += wlen+1;
+ }
+ raid_setup_args.set = 1;
+ return 1;
+}
+
+int md__init md_run_setup(void)
+{
+ if (raid_setup_args.noautodetect)
+ printk(KERN_INFO "md: Skipping autodetection of RAID arrays. (raid=noautodetect)\n");
+ else
+ autostart_arrays();
+ md_setup_drive();
+ return 0;
+}
+
+__setup("raid=", raid_setup);
+__setup("md=", md_setup);
+
+__initcall(md_init);
+__initcall(md_run_setup);
+
+#else /* It is a MODULE */
+
+int init_module(void)
+{
+ return md_init();
+}
+
+static void free_device_names(void)
+{
+ while (!list_empty(&device_names)) {
+ struct dname *tmp = list_entry(device_names.next,
+ dev_name_t, list);
+ list_del(&tmp->list);
+ kfree(tmp);
+ }
+}
+
+
+void cleanup_module(void)
+{
+ md_unregister_thread(md_recovery_thread);
+ devfs_unregister(devfs_handle);
+
+ devfs_unregister_blkdev(MAJOR_NR,"md");
+ unregister_reboot_notifier(&md_notifier);
+ unregister_sysctl_table(raid_table_header);
+#ifdef CONFIG_PROC_FS
+ remove_proc_entry("mdstat", NULL);
+#endif
+
+ del_gendisk(&md_gendisk);
+
+ blk_dev[MAJOR_NR].queue = NULL;
+ blksize_size[MAJOR_NR] = NULL;
+ blk_size[MAJOR_NR] = NULL;
+ max_readahead[MAJOR_NR] = NULL;
+ hardsect_size[MAJOR_NR] = NULL;
+
+ free_device_names();
+
+}
+#endif
+
+MD_EXPORT_SYMBOL(md_size);
+MD_EXPORT_SYMBOL(register_md_personality);
+MD_EXPORT_SYMBOL(unregister_md_personality);
+MD_EXPORT_SYMBOL(partition_name);
+MD_EXPORT_SYMBOL(md_error);
+MD_EXPORT_SYMBOL(md_do_sync);
+MD_EXPORT_SYMBOL(md_sync_acct);
+MD_EXPORT_SYMBOL(md_done_sync);
+MD_EXPORT_SYMBOL(md_recover_arrays);
+MD_EXPORT_SYMBOL(md_register_thread);
+MD_EXPORT_SYMBOL(md_unregister_thread);
+MD_EXPORT_SYMBOL(md_update_sb);
+MD_EXPORT_SYMBOL(md_wakeup_thread);
+MD_EXPORT_SYMBOL(md_print_devices);
+MD_EXPORT_SYMBOL(find_rdev_nr);
+<<<<<<<
+MD_EXPORT_SYMBOL(md_interrupt_thread);
+MD_EXPORT_SYMBOL(mddev_map);
+|||||||
+MD_EXPORT_SYMBOL(md_interrupt_thread);
+EXPORT_SYMBOL(mddev_map);
+=======
+MD_EXPORT_SYMBOL(md_interrupt_thread);
+>>>>>>>
+MODULE_LICENSE("GPL");
diff --git a/tests/linux/md-loop/orig b/tests/linux/md-loop/orig
new file mode 100644
index 0000000..682ed20
--- /dev/null
+++ b/tests/linux/md-loop/orig
@@ -0,0 +1,3960 @@
+/*
+ md.c : Multiple Devices driver for Linux
+ Copyright (C) 1998, 1999, 2000 Ingo Molnar
+
+ completely rewritten, based on the MD driver code from Marc Zyngier
+
+ Changes:
+
+ - RAID-1/RAID-5 extensions by Miguel de Icaza, Gadi Oxman, Ingo Molnar
+ - boot support for linear and striped mode by Harald Hoyer <HarryH@Royal.Net>
+ - kerneld support by Boris Tobotras <boris@xtalk.msk.su>
+ - kmod support by: Cyrus Durgin
+ - RAID0 bugfixes: Mark Anthony Lisher <markal@iname.com>
+ - Devfs support by Richard Gooch <rgooch@atnf.csiro.au>
+
+ - lots of fixes and improvements to the RAID1/RAID5 and generic
+ RAID code (such as request based resynchronization):
+
+ Neil Brown <neilb@cse.unsw.edu.au>.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2, or (at your option)
+ any later version.
+
+ You should have received a copy of the GNU General Public License
+ (for example /usr/src/linux/COPYING); if not, write to the Free
+ Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+*/
+
+#include <linux/module.h>
+#include <linux/config.h>
+#include <linux/raid/md.h>
+#include <linux/sysctl.h>
+#include <linux/raid/xor.h>
+#include <linux/devfs_fs_kernel.h>
+
+#include <linux/init.h>
+
+#ifdef CONFIG_KMOD
+#include <linux/kmod.h>
+#endif
+
+#define __KERNEL_SYSCALLS__
+#include <linux/unistd.h>
+
+#include <asm/unaligned.h>
+
+#define MAJOR_NR MD_MAJOR
+#define MD_DRIVER
+
+#include <linux/blk.h>
+
+#define DEBUG 0
+#if DEBUG
+# define dprintk(x...) printk(x)
+#else
+# define dprintk(x...) do { } while(0)
+#endif
+
+#ifndef MODULE
+static void autostart_arrays (void);
+#endif
+
+static mdk_personality_t *pers[MAX_PERSONALITY];
+
+/*
+ * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit'
+ * is 100 KB/sec, so the extra system load does not show up that much.
+ * Increase it if you want to have more _guaranteed_ speed. Note that
+ * the RAID driver will use the maximum available bandwidth if the IO
+ * subsystem is idle. There is also an 'absolute maximum' reconstruction
+ * speed limit - in case reconstruction slows down your system despite
+ * idle IO detection.
+ *
+ * you can change it via /proc/sys/dev/raid/speed_limit_min and _max.
+ */
+
+static int sysctl_speed_limit_min = 100;
+static int sysctl_speed_limit_max = 100000;
+
+static struct ctl_table_header *raid_table_header;
+
+static ctl_table raid_table[] = {
+ {DEV_RAID_SPEED_LIMIT_MIN, "speed_limit_min",
+ &sysctl_speed_limit_min, sizeof(int), 0644, NULL, &proc_dointvec},
+ {DEV_RAID_SPEED_LIMIT_MAX, "speed_limit_max",
+ &sysctl_speed_limit_max, sizeof(int), 0644, NULL, &proc_dointvec},
+ {0}
+};
+
+static ctl_table raid_dir_table[] = {
+ {DEV_RAID, "raid", NULL, 0, 0555, raid_table},
+ {0}
+};
+
+static ctl_table raid_root_table[] = {
+ {CTL_DEV, "dev", NULL, 0, 0555, raid_dir_table},
+ {0}
+};
+
+/*
+ * these have to be allocated separately because external
+ * subsystems want to have a pre-defined structure
+ */
+struct hd_struct md_hd_struct[MAX_MD_DEVS];
+static int md_blocksizes[MAX_MD_DEVS];
+static int md_hardsect_sizes[MAX_MD_DEVS];
+static mdk_thread_t *md_recovery_thread;
+
+int md_size[MAX_MD_DEVS];
+
+static struct block_device_operations md_fops;
+static devfs_handle_t devfs_handle;
+
+static struct gendisk md_gendisk=
+{
+ major: MD_MAJOR,
+ major_name: "md",
+ minor_shift: 0,
+ max_p: 1,
+ part: md_hd_struct,
+ sizes: md_size,
+ nr_real: MAX_MD_DEVS,
+ real_devices: NULL,
+ next: NULL,
+ fops: &md_fops,
+};
+
+/*
+ * Enables iteration over all existing md arrays
+ */
+static MD_LIST_HEAD(all_mddevs);
+
+static mddev_t *mddev_map[MAX_MD_DEVS];
+
+static inline mddev_t * kdev_to_mddev (kdev_t dev)
+{
+ if (MAJOR(dev) != MD_MAJOR)
+ BUG();
+ return mddev_map[MINOR(dev)];
+}
+
+static int md_fail_request (request_queue_t *q, struct bio *bio)
+{
+ bio_io_error(bio);
+ return 0;
+}
+
+static mddev_t * alloc_mddev(kdev_t dev)
+{
+ mddev_t *mddev;
+
+ if (MAJOR(dev) != MD_MAJOR) {
+ MD_BUG();
+ return 0;
+ }
+ mddev = (mddev_t *) kmalloc(sizeof(*mddev), GFP_KERNEL);
+ if (!mddev)
+ return NULL;
+
+ memset(mddev, 0, sizeof(*mddev));
+
+ mddev->__minor = MINOR(dev);
+ init_MUTEX(&mddev->reconfig_sem);
+ init_MUTEX(&mddev->recovery_sem);
+ init_MUTEX(&mddev->resync_sem);
+ MD_INIT_LIST_HEAD(&mddev->disks);
+ MD_INIT_LIST_HEAD(&mddev->all_mddevs);
+ atomic_set(&mddev->active, 0);
+
+ mddev_map[mdidx(mddev)] = mddev;
+ md_list_add(&mddev->all_mddevs, &all_mddevs);
+
+ MOD_INC_USE_COUNT;
+
+ return mddev;
+}
+
+mdk_rdev_t * find_rdev_nr(mddev_t *mddev, int nr)
+{
+ mdk_rdev_t * rdev;
+ struct md_list_head *tmp;
+
+ ITERATE_RDEV(mddev,rdev,tmp) {
+ if (rdev->desc_nr == nr)
+ return rdev;
+ }
+ return NULL;
+}
+
+mdk_rdev_t * find_rdev(mddev_t * mddev, kdev_t dev)
+{
+ struct md_list_head *tmp;
+ mdk_rdev_t *rdev;
+
+ ITERATE_RDEV(mddev,rdev,tmp) {
+ if (rdev->dev == dev)
+ return rdev;
+ }
+ return NULL;
+}
+
+static MD_LIST_HEAD(device_names);
+
+char * partition_name(kdev_t dev)
+{
+ struct gendisk *hd;
+ static char nomem [] = "<nomem>";
+ dev_name_t *dname;
+ struct md_list_head *tmp;
+
+ list_for_each(tmp, &device_names) {
+ dname = md_list_entry(tmp, dev_name_t, list);
+ if (dname->dev == dev)
+ return dname->name;
+ }
+
+ dname = (dev_name_t *) kmalloc(sizeof(*dname), GFP_KERNEL);
+
+ if (!dname)
+ return nomem;
+ /*
+ * ok, add this new device name to the list
+ */
+ hd = get_gendisk (dev);
+ dname->name = NULL;
+ if (hd)
+ dname->name = disk_name (hd, MINOR(dev), dname->namebuf);
+ if (!dname->name) {
+ sprintf (dname->namebuf, "[dev %s]", kdevname(dev));
+ dname->name = dname->namebuf;
+ }
+
+ dname->dev = dev;
+ md_list_add(&dname->list, &device_names);
+
+ return dname->name;
+}
+
+static unsigned int calc_dev_sboffset(kdev_t dev, mddev_t *mddev,
+ int persistent)
+{
+ unsigned int size = 0;
+
+ if (blk_size[MAJOR(dev)])
+ size = blk_size[MAJOR(dev)][MINOR(dev)];
+ if (persistent)
+ size = MD_NEW_SIZE_BLOCKS(size);
+ return size;
+}
+
+static unsigned int calc_dev_size(kdev_t dev, mddev_t *mddev, int persistent)
+{
+ unsigned int size;
+
+ size = calc_dev_sboffset(dev, mddev, persistent);
+ if (!mddev->sb) {
+ MD_BUG();
+ return size;
+ }
+ if (mddev->sb->chunk_size)
+ size &= ~(mddev->sb->chunk_size/1024 - 1);
+ return size;
+}
+
+static unsigned int zoned_raid_size(mddev_t *mddev)
+{
+ unsigned int mask;
+ mdk_rdev_t * rdev;
+ struct md_list_head *tmp;
+
+ if (!mddev->sb) {
+ MD_BUG();
+ return -EINVAL;
+ }
+ /*
+ * do size and offset calculations.
+ */
+ mask = ~(mddev->sb->chunk_size/1024 - 1);
+
+ ITERATE_RDEV(mddev,rdev,tmp) {
+ rdev->size &= mask;
+ md_size[mdidx(mddev)] += rdev->size;
+ }
+ return 0;
+}
+
+static void remove_descriptor(mdp_disk_t *disk, mdp_super_t *sb)
+{
+ if (disk_active(disk)) {
+ sb->working_disks--;
+ } else {
+ if (disk_spare(disk)) {
+ sb->spare_disks--;
+ sb->working_disks--;
+ } else {
+ sb->failed_disks--;
+ }
+ }
+ sb->nr_disks--;
+ disk->major = 0;
+ disk->minor = 0;
+ mark_disk_removed(disk);
+}
+
+#define BAD_MAGIC KERN_ERR \
+"md: invalid raid superblock magic on %s\n"
+
+#define BAD_MINOR KERN_ERR \
+"md: %s: invalid raid minor (%x)\n"
+
+#define OUT_OF_MEM KERN_ALERT \
+"md: out of memory.\n"
+
+#define NO_SB KERN_ERR \
+"md: disabled device %s, could not read superblock.\n"
+
+#define BAD_CSUM KERN_WARNING \
+"md: invalid superblock checksum on %s\n"
+
+static int alloc_array_sb(mddev_t * mddev)
+{
+ if (mddev->sb) {
+ MD_BUG();
+ return 0;
+ }
+
+ mddev->sb = (mdp_super_t *) __get_free_page (GFP_KERNEL);
+ if (!mddev->sb)
+ return -ENOMEM;
+ md_clear_page(mddev->sb);
+ return 0;
+}
+
+static int alloc_disk_sb(mdk_rdev_t * rdev)
+{
+ if (rdev->sb)
+ MD_BUG();
+
+ rdev->sb_page = alloc_page(GFP_KERNEL);
+ if (!rdev->sb_page) {
+ printk(OUT_OF_MEM);
+ return -EINVAL;
+ }
+ rdev->sb = (mdp_super_t *) page_address(rdev->sb_page);
+
+ return 0;
+}
+
+static void free_disk_sb(mdk_rdev_t * rdev)
+{
+ if (rdev->sb_page) {
+ page_cache_release(rdev->sb_page);
+ rdev->sb = NULL;
+ rdev->sb_page = NULL;
+ rdev->sb_offset = 0;
+ rdev->size = 0;
+ } else {
+ if (!rdev->faulty)
+ MD_BUG();
+ }
+}
+
+
+static void bh_complete(struct buffer_head *bh, int uptodate)
+{
+
+ if (uptodate)
+ set_bit(BH_Uptodate, &bh->b_state);
+
+ complete((struct completion*)bh->b_private);
+}
+
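+/*
+ * Synchronous I/O on one superblock-sized area: build a temporary
+ * buffer_head on the stack, submit it with generic_make_request(),
+ * then sleep on a completion until bh_complete() marks it done.
+ * Returns non-zero if the buffer ended up uptodate.
+ */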
+static int sync_page_io(kdev_t dev, unsigned long sector, int size,
+ struct page *page, int rw)
+{
+ struct buffer_head bh;
+ struct completion event;
+
+ init_completion(&event);
+ init_buffer(&bh, bh_complete, &event);
+ bh.b_rdev = dev;
+ bh.b_rsector = sector;
+ bh.b_state = (1 << BH_Req) | (1 << BH_Mapped) | (1 << BH_Lock);
+ bh.b_size = size;
+ bh.b_page = page;
+ bh.b_reqnext = NULL;
+ bh.b_data = page_address(page);
+ generic_make_request(rw, &bh);
+
+ run_task_queue(&tq_disk);
+ wait_for_completion(&event);
+
+ return test_bit(BH_Uptodate, &bh.b_state);
+}
+
+static int read_disk_sb(mdk_rdev_t * rdev)
+{
+ int ret = -EINVAL;
+ kdev_t dev = rdev->dev;
+ unsigned long sb_offset;
+
+ if (!rdev->sb) {
+ MD_BUG();
+ goto abort;
+ }
+
+ /*
+ * Calculate the position of the superblock,
+ * it's at the end of the disk
+ */
+ sb_offset = calc_dev_sboffset(rdev->dev, rdev->mddev, 1);
+ rdev->sb_offset = sb_offset;
+
+ if (!sync_page_io(dev, sb_offset<<1, MD_SB_BYTES, rdev->sb_page, READ)) {
+ printk(NO_SB,partition_name(dev));
+ return -EINVAL;
+ }
+ printk(KERN_INFO " [events: %08lx]\n", (unsigned long)rdev->sb->events_lo);
+ ret = 0;
+abort:
+ return ret;
+}
+
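+/*
+ * The checksum covers the whole superblock with the sb_csum field
+ * treated as zero, so save the on-disk value, zero it for the
+ * calculation and restore it afterwards.
+ */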
+static unsigned int calc_sb_csum(mdp_super_t * sb)
+{
+ unsigned int disk_csum, csum;
+
+ disk_csum = sb->sb_csum;
+ sb->sb_csum = 0;
+ csum = csum_partial((void *)sb, MD_SB_BYTES, 0);
+ sb->sb_csum = disk_csum;
+ return csum;
+}
+
+/*
+ * Check one RAID superblock for generic plausibility
+ */
+
+static int check_disk_sb(mdk_rdev_t * rdev)
+{
+ mdp_super_t *sb;
+ int ret = -EINVAL;
+
+ sb = rdev->sb;
+ if (!sb) {
+ MD_BUG();
+ goto abort;
+ }
+
+ if (sb->md_magic != MD_SB_MAGIC) {
+ printk(BAD_MAGIC, partition_name(rdev->dev));
+ goto abort;
+ }
+
+ if (sb->md_minor >= MAX_MD_DEVS) {
+ printk(BAD_MINOR, partition_name(rdev->dev), sb->md_minor);
+ goto abort;
+ }
+
+ if (calc_sb_csum(sb) != sb->sb_csum) {
+ printk(BAD_CSUM, partition_name(rdev->dev));
+ goto abort;
+ }
+ ret = 0;
+abort:
+ return ret;
+}
+
+static kdev_t dev_unit(kdev_t dev)
+{
+ unsigned int mask;
+ struct gendisk *hd = get_gendisk(dev);
+
+ if (!hd)
+ return 0;
+ mask = ~((1 << hd->minor_shift) - 1);
+
+ return MKDEV(MAJOR(dev), MINOR(dev) & mask);
+}
+
+static mdk_rdev_t * match_dev_unit(mddev_t *mddev, kdev_t dev)
+{
+ struct md_list_head *tmp;
+ mdk_rdev_t *rdev;
+
+ ITERATE_RDEV(mddev,rdev,tmp)
+ if (dev_unit(rdev->dev) == dev_unit(dev))
+ return rdev;
+
+ return NULL;
+}
+
+static int match_mddev_units(mddev_t *mddev1, mddev_t *mddev2)
+{
+ struct md_list_head *tmp;
+ mdk_rdev_t *rdev;
+
+ ITERATE_RDEV(mddev1,rdev,tmp)
+ if (match_dev_unit(mddev2, rdev->dev))
+ return 1;
+
+ return 0;
+}
+
+static MD_LIST_HEAD(all_raid_disks);
+static MD_LIST_HEAD(pending_raid_disks);
+
+static void bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev)
+{
+ mdk_rdev_t *same_pdev;
+
+ if (rdev->mddev) {
+ MD_BUG();
+ return;
+ }
+ same_pdev = match_dev_unit(mddev, rdev->dev);
+ if (same_pdev)
+ printk( KERN_WARNING
+"md%d: WARNING: %s appears to be on the same physical disk as %s. True\n"
+" protection against single-disk failure might be compromised.\n",
+ mdidx(mddev), partition_name(rdev->dev),
+ partition_name(same_pdev->dev));
+
+ md_list_add(&rdev->same_set, &mddev->disks);
+ rdev->mddev = mddev;
+ printk(KERN_INFO "md: bind<%s>\n", partition_name(rdev->dev));
+}
+
+static void unbind_rdev_from_array(mdk_rdev_t * rdev)
+{
+ if (!rdev->mddev) {
+ MD_BUG();
+ return;
+ }
+ list_del_init(&rdev->same_set);
+ printk(KERN_INFO "md: unbind<%s>\n", partition_name(rdev->dev));
+ rdev->mddev = NULL;
+}
+
+/*
+ * prevent the device from being mounted, repartitioned or
+ * otherwise reused by a RAID array (or any other kernel
+ * subsystem), by opening the device. [simply getting an
+ * inode is not enough, the SCSI module usage code needs
+ * an explicit open() on the device]
+ */
+static int lock_rdev(mdk_rdev_t *rdev)
+{
+ int err = 0;
+ struct block_device *bdev;
+
+ bdev = bdget(rdev->dev);
+ if (!bdev)
+ return -ENOMEM;
+ err = blkdev_get(bdev, FMODE_READ|FMODE_WRITE, 0, BDEV_RAW);
+ if (!err)
+ rdev->bdev = bdev;
+ return err;
+}
+
+static void unlock_rdev(mdk_rdev_t *rdev)
+{
+ struct block_device *bdev = rdev->bdev;
+ rdev->bdev = NULL;
+ if (!bdev)
+ MD_BUG();
+ blkdev_put(bdev, BDEV_RAW);
+}
+
+void md_autodetect_dev(kdev_t dev);
+
+static void export_rdev(mdk_rdev_t * rdev)
+{
+ printk(KERN_INFO "md: export_rdev(%s)\n",partition_name(rdev->dev));
+ if (rdev->mddev)
+ MD_BUG();
+ unlock_rdev(rdev);
+ free_disk_sb(rdev);
+ list_del_init(&rdev->all);
+ if (!list_empty(&rdev->pending)) {
+ printk(KERN_INFO "md: (%s was pending)\n",
+ partition_name(rdev->dev));
+ list_del_init(&rdev->pending);
+ }
+#ifndef MODULE
+ md_autodetect_dev(rdev->dev);
+#endif
+ rdev->dev = 0;
+ rdev->faulty = 0;
+ kfree(rdev);
+}
+
+static void kick_rdev_from_array(mdk_rdev_t * rdev)
+{
+ unbind_rdev_from_array(rdev);
+ export_rdev(rdev);
+}
+
+static void export_array(mddev_t *mddev)
+{
+ struct md_list_head *tmp;
+ mdk_rdev_t *rdev;
+ mdp_super_t *sb = mddev->sb;
+
+ if (mddev->sb) {
+ mddev->sb = NULL;
+ free_page((unsigned long) sb);
+ }
+
+ ITERATE_RDEV(mddev,rdev,tmp) {
+ if (!rdev->mddev) {
+ MD_BUG();
+ continue;
+ }
+ kick_rdev_from_array(rdev);
+ }
+ if (!list_empty(&mddev->disks))
+ MD_BUG();
+}
+
+static void free_mddev(mddev_t *mddev)
+{
+ if (!mddev) {
+ MD_BUG();
+ return;
+ }
+
+ export_array(mddev);
+ md_size[mdidx(mddev)] = 0;
+ md_hd_struct[mdidx(mddev)].nr_sects = 0;
+
+ /*
+ * Make sure nobody else is using this mddev
+ * (careful, we rely on the global kernel lock here)
+ */
+ while (sem_getcount(&mddev->resync_sem) != 1)
+ schedule();
+ while (sem_getcount(&mddev->recovery_sem) != 1)
+ schedule();
+
+<<<<<<<
+ del_mddev_mapping(mddev, MKDEV(MD_MAJOR, mdidx(mddev)));
+|||||||
+ del_mddev_mapping(mddev, mk_kdev(MD_MAJOR, mdidx(mddev)));
+=======
+ mddev_map[mdidx(mddev)] = NULL;
+>>>>>>>
+ md_list_del(&mddev->all_mddevs);
+ kfree(mddev);
+ MOD_DEC_USE_COUNT;
+}
+
+#undef BAD_CSUM
+#undef BAD_MAGIC
+#undef OUT_OF_MEM
+#undef NO_SB
+
+static void print_desc(mdp_disk_t *desc)
+{
+ printk(" DISK<N:%d,%s(%d,%d),R:%d,S:%d>\n", desc->number,
+ partition_name(MKDEV(desc->major,desc->minor)),
+ desc->major,desc->minor,desc->raid_disk,desc->state);
+}
+
+static void print_sb(mdp_super_t *sb)
+{
+ int i;
+
+ printk(KERN_INFO "md: SB: (V:%d.%d.%d) ID:<%08x.%08x.%08x.%08x> CT:%08x\n",
+ sb->major_version, sb->minor_version, sb->patch_version,
+ sb->set_uuid0, sb->set_uuid1, sb->set_uuid2, sb->set_uuid3,
+ sb->ctime);
+ printk(KERN_INFO "md: L%d S%08d ND:%d RD:%d md%d LO:%d CS:%d\n", sb->level,
+ sb->size, sb->nr_disks, sb->raid_disks, sb->md_minor,
+ sb->layout, sb->chunk_size);
+ printk(KERN_INFO "md: UT:%08x ST:%d AD:%d WD:%d FD:%d SD:%d CSUM:%08x E:%08lx\n",
+ sb->utime, sb->state, sb->active_disks, sb->working_disks,
+ sb->failed_disks, sb->spare_disks,
+ sb->sb_csum, (unsigned long)sb->events_lo);
+
+ printk(KERN_INFO);
+ for (i = 0; i < MD_SB_DISKS; i++) {
+ mdp_disk_t *desc;
+
+ desc = sb->disks + i;
+ if (desc->number || desc->major || desc->minor ||
+ desc->raid_disk || (desc->state && (desc->state != 4))) {
+ printk(" D %2d: ", i);
+ print_desc(desc);
+ }
+ }
+ printk(KERN_INFO "md: THIS: ");
+ print_desc(&sb->this_disk);
+
+}
+
+static void print_rdev(mdk_rdev_t *rdev)
+{
+ printk(KERN_INFO "md: rdev %s: O:%s, SZ:%08ld F:%d DN:%d ",
+ partition_name(rdev->dev), partition_name(rdev->old_dev),
+ rdev->size, rdev->faulty, rdev->desc_nr);
+ if (rdev->sb) {
+ printk(KERN_INFO "md: rdev superblock:\n");
+ print_sb(rdev->sb);
+ } else
+ printk(KERN_INFO "md: no rdev superblock!\n");
+}
+
+void md_print_devices(void)
+{
+ struct md_list_head *tmp, *tmp2;
+ mdk_rdev_t *rdev;
+ mddev_t *mddev;
+
+ printk("\n");
+ printk("md: **********************************\n");
+ printk("md: * <COMPLETE RAID STATE PRINTOUT> *\n");
+ printk("md: **********************************\n");
+ ITERATE_MDDEV(mddev,tmp) {
+ printk("md%d: ", mdidx(mddev));
+
+ ITERATE_RDEV(mddev,rdev,tmp2)
+ printk("<%s>", partition_name(rdev->dev));
+
+ if (mddev->sb) {
+ printk(" array superblock:\n");
+ print_sb(mddev->sb);
+ } else
+ printk(" no array superblock.\n");
+
+ ITERATE_RDEV(mddev,rdev,tmp2)
+ print_rdev(rdev);
+ }
+ printk("md: **********************************\n");
+ printk("\n");
+}
+
+static int sb_equal(mdp_super_t *sb1, mdp_super_t *sb2)
+{
+ int ret;
+ mdp_super_t *tmp1, *tmp2;
+
+ tmp1 = kmalloc(sizeof(*tmp1),GFP_KERNEL);
+ tmp2 = kmalloc(sizeof(*tmp2),GFP_KERNEL);
+
+ if (!tmp1 || !tmp2) {
+ ret = 0;
+ printk(KERN_INFO "md.c: sb1 is not equal to sb2!\n");
+ goto abort;
+ }
+
+ *tmp1 = *sb1;
+ *tmp2 = *sb2;
+
+ /*
+ * nr_disks is not constant
+ */
+ tmp1->nr_disks = 0;
+ tmp2->nr_disks = 0;
+
+ if (memcmp(tmp1, tmp2, MD_SB_GENERIC_CONSTANT_WORDS * 4))
+ ret = 0;
+ else
+ ret = 1;
+
+abort:
+ if (tmp1)
+ kfree(tmp1);
+ if (tmp2)
+ kfree(tmp2);
+
+ return ret;
+}
+
+static int uuid_equal(mdk_rdev_t *rdev1, mdk_rdev_t *rdev2)
+{
+ if ( (rdev1->sb->set_uuid0 == rdev2->sb->set_uuid0) &&
+ (rdev1->sb->set_uuid1 == rdev2->sb->set_uuid1) &&
+ (rdev1->sb->set_uuid2 == rdev2->sb->set_uuid2) &&
+ (rdev1->sb->set_uuid3 == rdev2->sb->set_uuid3))
+
+ return 1;
+
+ return 0;
+}
+
+static mdk_rdev_t * find_rdev_all(kdev_t dev)
+{
+ struct md_list_head *tmp;
+ mdk_rdev_t *rdev;
+
+ list_for_each(tmp, &all_raid_disks) {
+ rdev = md_list_entry(tmp, mdk_rdev_t, all);
+ if (rdev->dev == dev)
+ return rdev;
+ }
+ return NULL;
+}
+
+#define GETBLK_FAILED KERN_ERR \
+"md: getblk failed for device %s\n"
+
+static int write_disk_sb(mdk_rdev_t * rdev)
+{
+ kdev_t dev;
+ unsigned long sb_offset, size;
+
+ if (!rdev->sb) {
+ MD_BUG();
+ return 1;
+ }
+ if (rdev->faulty) {
+ MD_BUG();
+ return 1;
+ }
+ if (rdev->sb->md_magic != MD_SB_MAGIC) {
+ MD_BUG();
+ return 1;
+ }
+
+ dev = rdev->dev;
+ sb_offset = calc_dev_sboffset(dev, rdev->mddev, 1);
+ if (rdev->sb_offset != sb_offset) {
+ printk(KERN_INFO "%s's sb offset has changed from %ld to %ld, skipping\n",
+ partition_name(dev), rdev->sb_offset, sb_offset);
+ goto skip;
+ }
+ /*
+ * If the disk went offline meanwhile and it's just a spare, then
+ * its size has changed to zero silently, and the MD code does
+ * not yet know that it's faulty.
+ */
+ size = calc_dev_size(dev, rdev->mddev, 1);
+ if (size != rdev->size) {
+ printk(KERN_INFO "%s's size has changed from %ld to %ld since import, skipping\n",
+ partition_name(dev), rdev->size, size);
+ goto skip;
+ }
+
+ printk(KERN_INFO "(write) %s's sb offset: %ld\n", partition_name(dev), sb_offset);
+
+ if (!sync_page_io(dev, sb_offset<<1, MD_SB_BYTES, rdev->sb_page, WRITE)) {
+ printk("md: write_disk_sb failed for device %s\n", partition_name(dev));
+ return 1;
+ }
+skip:
+ return 0;
+}
+#undef GETBLK_FAILED
+
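+/*
+ * Find the descriptor in the array superblock that refers to this
+ * rdev's device, copy it into the rdev's own sb->this_disk and
+ * remember the descriptor slot in rdev->desc_nr.
+ */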
+static void set_this_disk(mddev_t *mddev, mdk_rdev_t *rdev)
+{
+ int i, ok = 0;
+ mdp_disk_t *desc;
+
+ for (i = 0; i < MD_SB_DISKS; i++) {
+ desc = mddev->sb->disks + i;
+#if 0
+ if (disk_faulty(desc)) {
+ if (MKDEV(desc->major,desc->minor) == rdev->dev)
+ ok = 1;
+ continue;
+ }
+#endif
+ if (MKDEV(desc->major,desc->minor) == rdev->dev) {
+ rdev->sb->this_disk = *desc;
+ rdev->desc_nr = desc->number;
+ ok = 1;
+ break;
+ }
+ }
+
+ if (!ok) {
+ MD_BUG();
+ }
+}
+
+static int sync_sbs(mddev_t * mddev)
+{
+ mdk_rdev_t *rdev;
+ mdp_super_t *sb;
+ struct md_list_head *tmp;
+
+ ITERATE_RDEV(mddev,rdev,tmp) {
+ if (rdev->faulty || rdev->alias_device)
+ continue;
+ sb = rdev->sb;
+ *sb = *mddev->sb;
+ set_this_disk(mddev, rdev);
+ sb->sb_csum = calc_sb_csum(sb);
+ }
+ return 0;
+}
+
+int md_update_sb(mddev_t * mddev)
+{
+ int err, count = 100;
+ struct md_list_head *tmp;
+ mdk_rdev_t *rdev;
+
+ if (!mddev->sb_dirty) {
+ printk("hm, md_update_sb() called without ->sb_dirty == 1, from %p.\n", __builtin_return_address(0));
+ return 0;
+ }
+ mddev->sb_dirty = 0;
+repeat:
+ mddev->sb->utime = CURRENT_TIME;
+ if ((++mddev->sb->events_lo)==0)
+ ++mddev->sb->events_hi;
+
+ if ((mddev->sb->events_lo|mddev->sb->events_hi)==0) {
+ /*
+ * oops, this 64-bit counter should never wrap.
+ * Either we are in around ~1 trillion A.C., assuming
+ * 1 reboot per second, or we have a bug:
+ */
+ MD_BUG();
+ mddev->sb->events_lo = mddev->sb->events_hi = 0xffffffff;
+ }
+ sync_sbs(mddev);
+
+ /*
+ * do not write anything to disk if using
+ * nonpersistent superblocks
+ */
+ if (mddev->sb->not_persistent)
+ return 0;
+
+ printk(KERN_INFO "md: updating md%d RAID superblock on device\n",
+ mdidx(mddev));
+
+ err = 0;
+ ITERATE_RDEV(mddev,rdev,tmp) {
+ printk(KERN_INFO "md: ");
+ if (rdev->faulty)
+ printk("(skipping faulty ");
+ if (rdev->alias_device)
+ printk("(skipping alias ");
+ if (!rdev->faulty && disk_faulty(&rdev->sb->this_disk)) {
+ printk("(skipping new-faulty %s )\n",
+ partition_name(rdev->dev));
+ continue;
+ }
+ printk("%s ", partition_name(rdev->dev));
+ if (!rdev->faulty && !rdev->alias_device) {
+ printk("[events: %08lx]",
+ (unsigned long)rdev->sb->events_lo);
+ err += write_disk_sb(rdev);
+ } else
+ printk(")\n");
+ }
+ if (err) {
+ if (--count) {
+ printk(KERN_ERR "md: errors occurred during superblock update, repeating\n");
+ goto repeat;
+ }
+ printk(KERN_ERR "md: excessive errors occurred during superblock update, exiting\n");
+ }
+ return 0;
+}
+
+/*
+ * Import a device. If 'on_disk', then sanity check the superblock
+ *
+ * mark the device faulty if:
+ *
+ * - the device is nonexistent (zero size)
+ * - the device has no valid superblock
+ *
+ */
+static int md_import_device(kdev_t newdev, int on_disk)
+{
+ int err;
+ mdk_rdev_t *rdev;
+ unsigned int size;
+
+ if (find_rdev_all(newdev))
+ return -EEXIST;
+
+ rdev = (mdk_rdev_t *) kmalloc(sizeof(*rdev), GFP_KERNEL);
+ if (!rdev) {
+ printk(KERN_ERR "md: could not alloc mem for %s!\n", partition_name(newdev));
+ return -ENOMEM;
+ }
+ memset(rdev, 0, sizeof(*rdev));
+
+ if (is_mounted(newdev)) {
+ printk(KERN_WARNING "md: can not import %s, has active inodes!\n",
+ partition_name(newdev));
+ err = -EBUSY;
+ goto abort_free;
+ }
+
+ if ((err = alloc_disk_sb(rdev)))
+ goto abort_free;
+
+ rdev->dev = newdev;
+ if (lock_rdev(rdev)) {
+ printk(KERN_ERR "md: could not lock %s, zero-size? Marking faulty.\n",
+ partition_name(newdev));
+ err = -EINVAL;
+ goto abort_free;
+ }
+ rdev->desc_nr = -1;
+ rdev->faulty = 0;
+
+ size = 0;
+ if (blk_size[MAJOR(newdev)])
+ size = blk_size[MAJOR(newdev)][MINOR(newdev)];
+ if (!size) {
+ printk(KERN_WARNING "md: %s has zero size, marking faulty!\n",
+ partition_name(newdev));
+ err = -EINVAL;
+ goto abort_free;
+ }
+
+ if (on_disk) {
+ if ((err = read_disk_sb(rdev))) {
+ printk(KERN_WARNING "md: could not read %s's sb, not importing!\n",
+ partition_name(newdev));
+ goto abort_free;
+ }
+ if ((err = check_disk_sb(rdev))) {
+ printk(KERN_WARNING "md: %s has invalid sb, not importing!\n",
+ partition_name(newdev));
+ goto abort_free;
+ }
+
+ if (rdev->sb->level != -4) {
+ rdev->old_dev = MKDEV(rdev->sb->this_disk.major,
+ rdev->sb->this_disk.minor);
+ rdev->desc_nr = rdev->sb->this_disk.number;
+ } else {
+ rdev->old_dev = MKDEV(0, 0);
+ rdev->desc_nr = -1;
+ }
+ }
+ md_list_add(&rdev->all, &all_raid_disks);
+ MD_INIT_LIST_HEAD(&rdev->pending);
+ INIT_LIST_HEAD(&rdev->same_set);
+
+ return 0;
+
+abort_free:
+ if (rdev->sb) {
+ if (rdev->bdev)
+ unlock_rdev(rdev);
+ free_disk_sb(rdev);
+ }
+ kfree(rdev);
+ return err;
+}
+
+/*
+ * Check a full RAID array for plausibility
+ */
+
+#define INCONSISTENT KERN_ERR \
+"md: fatal superblock inconsistency in %s -- removing from array\n"
+
+#define OUT_OF_DATE KERN_ERR \
+"md: superblock update time inconsistency -- using the most recent one\n"
+
+#define OLD_VERSION KERN_ALERT \
+"md: md%d: unsupported raid array version %d.%d.%d\n"
+
+#define NOT_CLEAN_IGNORE KERN_ERR \
+"md: md%d: raid array is not clean -- starting background reconstruction\n"
+
+#define UNKNOWN_LEVEL KERN_ERR \
+"md: md%d: unsupported raid level %d\n"
+
+static int analyze_sbs(mddev_t * mddev)
+{
+ int out_of_date = 0, i, first;
+ struct md_list_head *tmp, *tmp2;
+ mdk_rdev_t *rdev, *rdev2, *freshest;
+ mdp_super_t *sb;
+
+ /*
+ * Verify the RAID superblock on each real device
+ */
+ ITERATE_RDEV(mddev,rdev,tmp) {
+ if (rdev->faulty) {
+ MD_BUG();
+ goto abort;
+ }
+ if (!rdev->sb) {
+ MD_BUG();
+ goto abort;
+ }
+ if (check_disk_sb(rdev))
+ goto abort;
+ }
+
+ /*
+ * The superblock constant part has to be the same
+ * for all disks in the array.
+ */
+ sb = NULL;
+
+ ITERATE_RDEV(mddev,rdev,tmp) {
+ if (!sb) {
+ sb = rdev->sb;
+ continue;
+ }
+ if (!sb_equal(sb, rdev->sb)) {
+ printk(INCONSISTENT, partition_name(rdev->dev));
+ kick_rdev_from_array(rdev);
+ continue;
+ }
+ }
+
+ /*
+ * OK, we have all disks and the array is ready to run. Let's
+ * find the freshest superblock, that one will be the superblock
+ * that represents the whole array.
+ */
+ if (!mddev->sb)
+ if (alloc_array_sb(mddev))
+ goto abort;
+ sb = mddev->sb;
+ freshest = NULL;
+
+ ITERATE_RDEV(mddev,rdev,tmp) {
+ __u64 ev1, ev2;
+ /*
+ * if the checksum is invalid, use the superblock
+ * only as a last resort. (decrease its age by
+ * one event)
+ */
+ if (calc_sb_csum(rdev->sb) != rdev->sb->sb_csum) {
+ if (rdev->sb->events_lo || rdev->sb->events_hi)
+ if ((rdev->sb->events_lo--)==0)
+ rdev->sb->events_hi--;
+ }
+
+ printk(KERN_INFO "md: %s's event counter: %08lx\n",
+ partition_name(rdev->dev),
+ (unsigned long)rdev->sb->events_lo);
+ if (!freshest) {
+ freshest = rdev;
+ continue;
+ }
+ /*
+ * Find the newest superblock version
+ */
+ ev1 = md_event(rdev->sb);
+ ev2 = md_event(freshest->sb);
+ if (ev1 != ev2) {
+ out_of_date = 1;
+ if (ev1 > ev2)
+ freshest = rdev;
+ }
+ }
+ if (out_of_date) {
+ printk(OUT_OF_DATE);
+ printk(KERN_INFO "md: freshest: %s\n", partition_name(freshest->dev));
+ }
+ memcpy (sb, freshest->sb, sizeof(*sb));
+
+ /*
+ * at this point we have picked the 'best' superblock
+ * from all available superblocks.
+ * now we validate this superblock and kick out possibly
+ * failed disks.
+ */
+ ITERATE_RDEV(mddev,rdev,tmp) {
+ /*
+ * Kick all non-fresh devices
+ */
+ __u64 ev1, ev2;
+ ev1 = md_event(rdev->sb);
+ ev2 = md_event(sb);
+ ++ev1;
+ if (ev1 < ev2) {
+ printk(KERN_WARNING "md: kicking non-fresh %s from array!\n",
+ partition_name(rdev->dev));
+ kick_rdev_from_array(rdev);
+ continue;
+ }
+ }
+
+ /*
+ * Fix up changed device names ... but only if this disk has a
+ * recent update time. Use faulty checksum ones too.
+ */
+ if (mddev->sb->level != -4)
+ ITERATE_RDEV(mddev,rdev,tmp) {
+ __u64 ev1, ev2, ev3;
+ if (rdev->faulty || rdev->alias_device) {
+ MD_BUG();
+ goto abort;
+ }
+ ev1 = md_event(rdev->sb);
+ ev2 = md_event(sb);
+ ev3 = ev2;
+ --ev3;
+ if ((rdev->dev != rdev->old_dev) &&
+ ((ev1 == ev2) || (ev1 == ev3))) {
+ mdp_disk_t *desc;
+
+ printk(KERN_WARNING "md: device name has changed from %s to %s since last import!\n",
+ partition_name(rdev->old_dev), partition_name(rdev->dev));
+ if (rdev->desc_nr == -1) {
+ MD_BUG();
+ goto abort;
+ }
+ desc = &sb->disks[rdev->desc_nr];
+ if (rdev->old_dev != MKDEV(desc->major, desc->minor)) {
+ MD_BUG();
+ goto abort;
+ }
+ desc->major = MAJOR(rdev->dev);
+ desc->minor = MINOR(rdev->dev);
+ desc = &rdev->sb->this_disk;
+ desc->major = MAJOR(rdev->dev);
+ desc->minor = MINOR(rdev->dev);
+ }
+ }
+
+ /*
+ * Remove unavailable and faulty devices ...
+ *
+ * note that if an array becomes completely unrunnable due to
+ * missing devices, we do not write the superblock back, so the
+ * administrator has a chance to fix things up. The removal thus
+ * only happens if it's nonfatal to the contents of the array.
+ */
+ for (i = 0; i < MD_SB_DISKS; i++) {
+ int found;
+ mdp_disk_t *desc;
+ kdev_t dev;
+
+ desc = sb->disks + i;
+ dev = MKDEV(desc->major, desc->minor);
+
+ /*
+ * We kick faulty devices/descriptors immediately.
+ *
+ * Note: multipath devices are a special case. Since we
+ * were able to read the superblock on the path, we don't
+ * care if it was previously marked as faulty, it's up now
+ * so enable it.
+ */
+ if (disk_faulty(desc) && mddev->sb->level != -4) {
+ found = 0;
+ ITERATE_RDEV(mddev,rdev,tmp) {
+ if (rdev->desc_nr != desc->number)
+ continue;
+ printk(KERN_WARNING "md%d: kicking faulty %s!\n",
+ mdidx(mddev),partition_name(rdev->dev));
+ kick_rdev_from_array(rdev);
+ found = 1;
+ break;
+ }
+ if (!found) {
+ if (dev == MKDEV(0,0))
+ continue;
+ printk(KERN_WARNING "md%d: removing former faulty %s!\n",
+ mdidx(mddev), partition_name(dev));
+ }
+ remove_descriptor(desc, sb);
+ continue;
+ } else if (disk_faulty(desc)) {
+ /*
+ * multipath entry marked as faulty, unfaulty it
+ */
+ rdev = find_rdev(mddev, dev);
+ if(rdev)
+ mark_disk_spare(desc);
+ else
+ remove_descriptor(desc, sb);
+ }
+
+ if (dev == MKDEV(0,0))
+ continue;
+ /*
+ * Is this device present in the rdev ring?
+ */
+ found = 0;
+ ITERATE_RDEV(mddev,rdev,tmp) {
+ /*
+ * Multi-path IO special-case: since we have no
+ * this_disk descriptor at auto-detect time,
+ * we cannot check rdev->number.
+ * We can check the device though.
+ */
+ if ((sb->level == -4) && (rdev->dev ==
+ MKDEV(desc->major,desc->minor))) {
+ found = 1;
+ break;
+ }
+ if (rdev->desc_nr == desc->number) {
+ found = 1;
+ break;
+ }
+ }
+ if (found)
+ continue;
+
+ printk(KERN_WARNING "md%d: former device %s is unavailable, removing from array!\n",
+ mdidx(mddev), partition_name(dev));
+ remove_descriptor(desc, sb);
+ }
+
+ /*
+ * Double check whether all devices mentioned in the
+ * superblock are in the rdev ring.
+ */
+ first = 1;
+ for (i = 0; i < MD_SB_DISKS; i++) {
+ mdp_disk_t *desc;
+ kdev_t dev;
+
+ desc = sb->disks + i;
+ dev = MKDEV(desc->major, desc->minor);
+
+ if (dev == MKDEV(0,0))
+ continue;
+
+ if (disk_faulty(desc)) {
+ MD_BUG();
+ goto abort;
+ }
+
+ rdev = find_rdev(mddev, dev);
+ if (!rdev) {
+ MD_BUG();
+ goto abort;
+ }
+ /*
+ * In the case of Multipath-IO, we have no
+ * other information source to find out which
+ * disk is which, only the position of the device
+ * in the superblock:
+ */
+ if (mddev->sb->level == -4) {
+ if ((rdev->desc_nr != -1) && (rdev->desc_nr != i)) {
+ MD_BUG();
+ goto abort;
+ }
+ rdev->desc_nr = i;
+ if (!first)
+ rdev->alias_device = 1;
+ else
+ first = 0;
+ }
+ }
+
+ /*
+ * Kick all rdevs that are not in the
+ * descriptor array:
+ */
+ ITERATE_RDEV(mddev,rdev,tmp) {
+ if (rdev->desc_nr == -1)
+ kick_rdev_from_array(rdev);
+ }
+
+ /*
+ * Do a final reality check.
+ */
+ if (mddev->sb->level != -4) {
+ ITERATE_RDEV(mddev,rdev,tmp) {
+ if (rdev->desc_nr == -1) {
+ MD_BUG();
+ goto abort;
+ }
+ /*
+ * is the desc_nr unique?
+ */
+ ITERATE_RDEV(mddev,rdev2,tmp2) {
+ if ((rdev2 != rdev) &&
+ (rdev2->desc_nr == rdev->desc_nr)) {
+ MD_BUG();
+ goto abort;
+ }
+ }
+ /*
+ * is the device unique?
+ */
+ ITERATE_RDEV(mddev,rdev2,tmp2) {
+ if ((rdev2 != rdev) &&
+ (rdev2->dev == rdev->dev)) {
+ MD_BUG();
+ goto abort;
+ }
+ }
+ }
+ }
+
+ /*
+ * Check if we can support this RAID array
+ */
+ if (sb->major_version != MD_MAJOR_VERSION ||
+ sb->minor_version > MD_MINOR_VERSION) {
+
+ printk(OLD_VERSION, mdidx(mddev), sb->major_version,
+ sb->minor_version, sb->patch_version);
+ goto abort;
+ }
+
+ if ((sb->state != (1 << MD_SB_CLEAN)) && ((sb->level == 1) ||
+ (sb->level == 4) || (sb->level == 5)))
+ printk(NOT_CLEAN_IGNORE, mdidx(mddev));
+
+ return 0;
+abort:
+ return 1;
+}
+
+#undef INCONSISTENT
+#undef OUT_OF_DATE
+#undef OLD_VERSION
+#undef OLD_LEVEL
+
+static int device_size_calculation(mddev_t * mddev)
+{
+ int data_disks = 0, persistent;
+ unsigned int readahead;
+ mdp_super_t *sb = mddev->sb;
+ struct md_list_head *tmp;
+ mdk_rdev_t *rdev;
+
+ /*
+ * Do device size calculation. Bail out if too small.
+ * (we have to do this after having validated chunk_size,
+ * because device size has to be modulo chunk_size)
+ */
+ persistent = !mddev->sb->not_persistent;
+ ITERATE_RDEV(mddev,rdev,tmp) {
+ if (rdev->faulty)
+ continue;
+ if (rdev->size) {
+ MD_BUG();
+ continue;
+ }
+ rdev->size = calc_dev_size(rdev->dev, mddev, persistent);
+ if (rdev->size < sb->chunk_size / 1024) {
+ printk(KERN_WARNING
+ "md: Dev %s smaller than chunk_size: %ldk < %dk\n",
+ partition_name(rdev->dev),
+ rdev->size, sb->chunk_size / 1024);
+ return -EINVAL;
+ }
+ }
+
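+ /*
+ * data_disks scales the default array size (sb->size * data_disks)
+ * and the readahead window below: level 0 stripes over all members,
+ * RAID4/5 lose one member to parity, every other level counts as one.
+ */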
+ switch (sb->level) {
+ case -4:
+ data_disks = 1;
+ break;
+ case -3:
+ data_disks = 1;
+ break;
+ case -2:
+ data_disks = 1;
+ break;
+ case -1:
+ zoned_raid_size(mddev);
+ data_disks = 1;
+ break;
+ case 0:
+ zoned_raid_size(mddev);
+ data_disks = sb->raid_disks;
+ break;
+ case 1:
+ data_disks = 1;
+ break;
+ case 4:
+ case 5:
+ data_disks = sb->raid_disks-1;
+ break;
+ default:
+ printk(UNKNOWN_LEVEL, mdidx(mddev), sb->level);
+ goto abort;
+ }
+ if (!md_size[mdidx(mddev)])
+ md_size[mdidx(mddev)] = sb->size * data_disks;
+
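+ /*
+ * readahead is counted in pages: the striped levels (0, 4, 5) get four
+ * chunks per data disk, but never less than two maximum-sized requests
+ * per data disk (MAX_SECTORS>>(PAGE_SHIFT-9) is the pages per request).
+ */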
+ readahead = MD_READAHEAD;
+ if ((sb->level == 0) || (sb->level == 4) || (sb->level == 5)) {
+ readahead = (mddev->sb->chunk_size>>PAGE_SHIFT) * 4 * data_disks;
+ if (readahead < data_disks * (MAX_SECTORS>>(PAGE_SHIFT-9))*2)
+ readahead = data_disks * (MAX_SECTORS>>(PAGE_SHIFT-9))*2;
+ } else {
+ // (no multipath branch - it uses the default setting)
+ if (sb->level == -3)
+ readahead = 0;
+ }
+
+ printk(KERN_INFO "md%d: max total readahead window set to %ldk\n",
+ mdidx(mddev), readahead*(PAGE_SIZE/1024));
+
+ printk(KERN_INFO
+ "md%d: %d data-disks, max readahead per data-disk: %ldk\n",
+ mdidx(mddev), data_disks, readahead/data_disks*(PAGE_SIZE/1024));
+ return 0;
+abort:
+ return 1;
+}
+
+
+#define TOO_BIG_CHUNKSIZE KERN_ERR \
+"too big chunk_size: %d > %d\n"
+
+#define TOO_SMALL_CHUNKSIZE KERN_ERR \
+"too small chunk_size: %d < %ld\n"
+
+#define BAD_CHUNKSIZE KERN_ERR \
+"no chunksize specified, see 'man raidtab'\n"
+
+static int do_md_run(mddev_t * mddev)
+{
+ int pnum, err;
+ int chunk_size;
+ struct md_list_head *tmp;
+ mdk_rdev_t *rdev;
+
+
+ if (list_empty(&mddev->disks)) {
+ MD_BUG();
+ return -EINVAL;
+ }
+
+ if (mddev->pers)
+ return -EBUSY;
+
+ /*
+ * Resize disks to align partition sizes on a given
+ * chunk size.
+ */
+ md_size[mdidx(mddev)] = 0;
+
+ /*
+ * Analyze all RAID superblock(s)
+ */
+ if (analyze_sbs(mddev)) {
+ MD_BUG();
+ return -EINVAL;
+ }
+
+ chunk_size = mddev->sb->chunk_size;
+ pnum = level_to_pers(mddev->sb->level);
+
+ if ((pnum != MULTIPATH) && (pnum != RAID1)) {
+ if (!chunk_size) {
+ /*
+ * 'default chunksize' in the old md code used to
+ * be PAGE_SIZE, baaad.
+ * we abort here to be on the safe side. We don't
+ * want to continue the bad practice.
+ */
+ printk(BAD_CHUNKSIZE);
+ return -EINVAL;
+ }
+ if (chunk_size > MAX_CHUNK_SIZE) {
+ printk(TOO_BIG_CHUNKSIZE, chunk_size, MAX_CHUNK_SIZE);
+ return -EINVAL;
+ }
+ /*
+ * chunk-size has to be a power of 2 and multiples of PAGE_SIZE
+ */
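+ /*
+ * ffz(~x) is the index of the lowest set bit of x, so the test
+ * below only passes when chunk_size has exactly one bit set,
+ * i.e. is a power of two.
+ */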
+ if ( (1 << ffz(~chunk_size)) != chunk_size) {
+ MD_BUG();
+ return -EINVAL;
+ }
+ if (chunk_size < PAGE_SIZE) {
+ printk(TOO_SMALL_CHUNKSIZE, chunk_size, PAGE_SIZE);
+ return -EINVAL;
+ }
+ } else
+ if (chunk_size)
+ printk(KERN_INFO "md: RAID level %d does not need chunksize! Continuing anyway.\n",
+ mddev->sb->level);
+
+ if (pnum >= MAX_PERSONALITY) {
+ MD_BUG();
+ return -EINVAL;
+ }
+
+ if (!pers[pnum])
+ {
+#ifdef CONFIG_KMOD
+ char module_name[80];
+ sprintf (module_name, "md-personality-%d", pnum);
+ request_module (module_name);
+ if (!pers[pnum])
+#endif
+ {
+ printk(KERN_ERR "md: personality %d is not loaded!\n",
+ pnum);
+ return -EINVAL;
+ }
+ }
+
+ if (device_size_calculation(mddev))
+ return -EINVAL;
+
+ /*
+ * Drop all container device buffers, from now on
+ * the only valid external interface is through the md
+ * device.
+ * Also find largest hardsector size
+ */
+ md_hardsect_sizes[mdidx(mddev)] = 512;
+ ITERATE_RDEV(mddev,rdev,tmp) {
+ if (rdev->faulty)
+ continue;
+ invalidate_device(rdev->dev, 1);
+ if (get_hardsect_size(rdev->dev)
+ > md_hardsect_sizes[mdidx(mddev)])
+ md_hardsect_sizes[mdidx(mddev)] =
+ get_hardsect_size(rdev->dev);
+ }
+ md_blocksizes[mdidx(mddev)] = 1024;
+ if (md_blocksizes[mdidx(mddev)] < md_hardsect_sizes[mdidx(mddev)])
+ md_blocksizes[mdidx(mddev)] = md_hardsect_sizes[mdidx(mddev)];
+ mddev->pers = pers[pnum];
+
+ blk_queue_make_request(&mddev->queue, mddev->pers->make_request);
+ mddev->queue.queuedata = mddev;
+
+ err = mddev->pers->run(mddev);
+ if (err) {
+ printk(KERN_ERR "md: pers->run() failed ...\n");
+ mddev->pers = NULL;
+ return -EINVAL;
+ }
+
+ mddev->sb->state &= ~(1 << MD_SB_CLEAN);
+ mddev->sb_dirty = 1;
+ md_update_sb(mddev);
+
+ /*
+ * md_size has units of 1K blocks, which are
+ * twice as large as sectors.
+ */
+ md_hd_struct[mdidx(mddev)].start_sect = 0;
+ register_disk(&md_gendisk, MKDEV(MAJOR_NR,mdidx(mddev)),
+ 1, &md_fops, md_size[mdidx(mddev)]<<1);
+
+ read_ahead[MD_MAJOR] = 1024;
+ return (0);
+}
+
+#undef TOO_BIG_CHUNKSIZE
+#undef BAD_CHUNKSIZE
+
+static int restart_array(mddev_t *mddev)
+{
+ int err;
+
+ /*
+ * Complain if it has no devices
+ */
+ err = -ENXIO;
+ if (list_empty(&mddev->disks))
+ goto out;
+
+ if (mddev->pers) {
+ err = -EBUSY;
+ if (!mddev->ro)
+ goto out;
+
+ mddev->ro = 0;
+ set_device_ro(mddev_to_kdev(mddev), 0);
+
+ printk(KERN_INFO
+ "md: md%d switched to read-write mode.\n", mdidx(mddev));
+ /*
+ * Kick recovery or resync if necessary
+ */
+ md_recover_arrays();
+ if (mddev->pers->restart_resync)
+ mddev->pers->restart_resync(mddev);
+ err = 0;
+ } else {
+ printk(KERN_ERR "md: md%d has no personality assigned.\n",
+ mdidx(mddev));
+ err = -EINVAL;
+ }
+
+out:
+ return err;
+}
+
+#define STILL_MOUNTED KERN_WARNING \
+"md: md%d still mounted.\n"
+#define STILL_IN_USE \
+"md: md%d still in use.\n"
+
+static int do_md_stop(mddev_t * mddev, int ro)
+{
+ int err = 0, resync_interrupted = 0;
+ kdev_t dev = mddev_to_kdev(mddev);
+
+ if (atomic_read(&mddev->active)>1) {
+ printk(STILL_IN_USE, mdidx(mddev));
+ err = -EBUSY;
+ goto out;
+ }
+
+ if (mddev->pers) {
+ /*
+ * It is safe to call stop here, it only frees private
+ * data. Also, it tells us if a device is unstoppable
+ * (eg. resyncing is in progress)
+ */
+ if (mddev->pers->stop_resync)
+ if (mddev->pers->stop_resync(mddev))
+ resync_interrupted = 1;
+
+ if (mddev->recovery_running)
+ md_interrupt_thread(md_recovery_thread);
+
+ /*
+ * This synchronizes with signal delivery to the
+ * resync or reconstruction thread. It also nicely
+ * hangs the process if some reconstruction has not
+ * finished.
+ */
+ down(&mddev->recovery_sem);
+ up(&mddev->recovery_sem);
+
+ invalidate_device(dev, 1);
+
+ if (ro) {
+ err = -ENXIO;
+ if (mddev->ro)
+ goto out;
+ mddev->ro = 1;
+ } else {
+ if (mddev->ro)
+ set_device_ro(dev, 0);
+ if (mddev->pers->stop(mddev)) {
+ err = -EBUSY;
+ if (mddev->ro)
+ set_device_ro(dev, 1);
+ goto out;
+ }
+ if (mddev->ro)
+ mddev->ro = 0;
+ }
+ if (mddev->sb) {
+ /*
+ * mark it clean only if there was no resync
+ * interrupted.
+ */
+ if (!mddev->recovery_running && !resync_interrupted) {
+ printk(KERN_INFO "md: marking sb clean...\n");
+ mddev->sb->state |= 1 << MD_SB_CLEAN;
+ }
+ mddev->sb_dirty = 1;
+ md_update_sb(mddev);
+ }
+ if (ro)
+ set_device_ro(dev, 1);
+ }
+
+ /*
+ * Free resources if final stop
+ */
+ if (!ro) {
+ printk(KERN_INFO "md: md%d stopped.\n", mdidx(mddev));
+ free_mddev(mddev);
+ } else
+ printk(KERN_INFO "md: md%d switched to read-only mode.\n", mdidx(mddev));
+ err = 0;
+out:
+ return err;
+}
+
+/*
+ * We have to safely support old arrays too.
+ */
+int detect_old_array(mdp_super_t *sb)
+{
+ if (sb->major_version > 0)
+ return 0;
+ if (sb->minor_version >= 90)
+ return 0;
+
+ return -EINVAL;
+}
+
+
+static void autorun_array(mddev_t *mddev)
+{
+ mdk_rdev_t *rdev;
+ struct md_list_head *tmp;
+ int err;
+
+ if (list_empty(&mddev->disks)) {
+ MD_BUG();
+ return;
+ }
+
+ printk(KERN_INFO "md: running: ");
+
+ ITERATE_RDEV(mddev,rdev,tmp) {
+ printk("<%s>", partition_name(rdev->dev));
+ }
+ printk("\n");
+
+ err = do_md_run (mddev);
+ if (err) {
+ printk(KERN_WARNING "md :do_md_run() returned %d\n", err);
+ /*
+ * prevent the writeback of an unrunnable array
+ */
+ mddev->sb_dirty = 0;
+ do_md_stop (mddev, 0);
+ }
+}
+
+/*
+ * let's try to run arrays based on all disks that have arrived
+ * until now. (those are in the ->pending list)
+ *
+ * the method: pick the first pending disk, collect all disks with
+ * the same UUID, remove all from the pending list and put them into
+ * the 'same_array' list. Then order this list based on superblock
+ * update time (freshest comes first), kick out 'old' disks and
+ * compare superblocks. If everything's fine then run it.
+ *
+ * If "unit" is allocated, then bump its reference count
+ */
+static void autorun_devices(kdev_t countdev)
+{
+ struct md_list_head candidates;
+ struct md_list_head *tmp;
+ mdk_rdev_t *rdev0, *rdev;
+ mddev_t *mddev;
+ kdev_t md_kdev;
+
+
+ printk(KERN_INFO "md: autorun ...\n");
+ while (!list_empty(&pending_raid_disks)) {
+ rdev0 = md_list_entry(pending_raid_disks.next,
+ mdk_rdev_t, pending);
+
+ printk(KERN_INFO "md: considering %s ...\n", partition_name(rdev0->dev));
+ MD_INIT_LIST_HEAD(&candidates);
+ ITERATE_RDEV_PENDING(rdev,tmp) {
+ if (uuid_equal(rdev0, rdev)) {
+ if (!sb_equal(rdev0->sb, rdev->sb)) {
+ printk(KERN_WARNING
+ "md: %s has same UUID as %s, but superblocks differ ...\n",
+ partition_name(rdev->dev), partition_name(rdev0->dev));
+ continue;
+ }
+ printk(KERN_INFO "md: adding %s ...\n", partition_name(rdev->dev));
+ md_list_del(&rdev->pending);
+ md_list_add(&rdev->pending, &candidates);
+ }
+ }
+ /*
+ * now we have a set of devices, with all of them having
+ * mostly sane superblocks. It's time to allocate the
+ * mddev.
+ */
+ md_kdev = MKDEV(MD_MAJOR, rdev0->sb->md_minor);
+ mddev = kdev_to_mddev(md_kdev);
+ if (mddev) {
+ printk(KERN_WARNING "md: md%d already running, cannot run %s\n",
+ mdidx(mddev), partition_name(rdev0->dev));
+ ITERATE_RDEV_GENERIC(candidates,pending,rdev,tmp)
+ export_rdev(rdev);
+ continue;
+ }
+ mddev = alloc_mddev(md_kdev);
+ if (!mddev) {
+ printk(KERN_ERR "md: cannot allocate memory for md drive.\n");
+ break;
+ }
+ if (md_kdev == countdev)
+ atomic_inc(&mddev->active);
+ printk(KERN_INFO "md: created md%d\n", mdidx(mddev));
+ ITERATE_RDEV_GENERIC(candidates,pending,rdev,tmp) {
+ bind_rdev_to_array(rdev, mddev);
+ list_del_init(&rdev->pending);
+ }
+ autorun_array(mddev);
+ }
+ printk(KERN_INFO "md: ... autorun DONE.\n");
+}
+
+/*
+ * import RAID devices based on one partition
+ * if possible, the array gets run as well.
+ */
+
+#define BAD_VERSION KERN_ERR \
+"md: %s has RAID superblock version 0.%d, autodetect needs v0.90 or higher\n"
+
+#define OUT_OF_MEM KERN_ALERT \
+"md: out of memory.\n"
+
+#define NO_DEVICE KERN_ERR \
+"md: disabled device %s\n"
+
+#define AUTOADD_FAILED KERN_ERR \
+"md: auto-adding devices to md%d FAILED (error %d).\n"
+
+#define AUTOADD_FAILED_USED KERN_ERR \
+"md: cannot auto-add device %s to md%d, already used.\n"
+
+#define AUTORUN_FAILED KERN_ERR \
+"md: auto-running md%d FAILED (error %d).\n"
+
+#define MDDEV_BUSY KERN_ERR \
+"md: cannot auto-add to md%d, already running.\n"
+
+#define AUTOADDING KERN_INFO \
+"md: auto-adding devices to md%d, based on %s's superblock.\n"
+
+#define AUTORUNNING KERN_INFO \
+"md: auto-running md%d.\n"
+
+static int autostart_array(kdev_t startdev, kdev_t countdev)
+{
+ int err = -EINVAL, i;
+ mdp_super_t *sb = NULL;
+ mdk_rdev_t *start_rdev = NULL, *rdev;
+
+ if (md_import_device(startdev, 1)) {
+ printk(KERN_WARNING "md: could not import %s!\n", partition_name(startdev));
+ goto abort;
+ }
+
+ start_rdev = find_rdev_all(startdev);
+ if (!start_rdev) {
+ MD_BUG();
+ goto abort;
+ }
+ if (start_rdev->faulty) {
+ printk(KERN_WARNING "md: can not autostart based on faulty %s!\n",
+ partition_name(startdev));
+ goto abort;
+ }
+ md_list_add(&start_rdev->pending, &pending_raid_disks);
+
+ sb = start_rdev->sb;
+
+ err = detect_old_array(sb);
+ if (err) {
+ printk(KERN_WARNING "md: array version is too old to be autostarted ,"
+ "use raidtools 0.90 mkraid --upgrade to upgrade the array "
+ "without data loss!\n");
+ goto abort;
+ }
+
+ for (i = 0; i < MD_SB_DISKS; i++) {
+ mdp_disk_t *desc;
+ kdev_t dev;
+
+ desc = sb->disks + i;
+ dev = MKDEV(desc->major, desc->minor);
+
+ if (dev == MKDEV(0,0))
+ continue;
+ if (dev == startdev)
+ continue;
+ if (md_import_device(dev, 1)) {
+ printk(KERN_WARNING "md: could not import %s, trying to run array nevertheless.\n",
+ partition_name(dev));
+ continue;
+ }
+ rdev = find_rdev_all(dev);
+ if (!rdev) {
+ MD_BUG();
+ goto abort;
+ }
+ md_list_add(&rdev->pending, &pending_raid_disks);
+ }
+
+ /*
+ * possibly return error codes here
+ */
+ autorun_devices(countdev);
+ return 0;
+
+abort:
+ if (start_rdev)
+ export_rdev(start_rdev);
+ return err;
+}
+
+#undef BAD_VERSION
+#undef OUT_OF_MEM
+#undef NO_DEVICE
+#undef AUTOADD_FAILED_USED
+#undef AUTOADD_FAILED
+#undef AUTORUN_FAILED
+#undef AUTOADDING
+#undef AUTORUNNING
+
+
+static int get_version(void * arg)
+{
+ mdu_version_t ver;
+
+ ver.major = MD_MAJOR_VERSION;
+ ver.minor = MD_MINOR_VERSION;
+ ver.patchlevel = MD_PATCHLEVEL_VERSION;
+
+ if (md_copy_to_user(arg, &ver, sizeof(ver)))
+ return -EFAULT;
+
+ return 0;
+}
+
+#define SET_FROM_SB(x) info.x = mddev->sb->x
+static int get_array_info(mddev_t * mddev, void * arg)
+{
+ mdu_array_info_t info;
+
+ if (!mddev->sb) {
+ MD_BUG();
+ return -EINVAL;
+ }
+
+ SET_FROM_SB(major_version);
+ SET_FROM_SB(minor_version);
+ SET_FROM_SB(patch_version);
+ SET_FROM_SB(ctime);
+ SET_FROM_SB(level);
+ SET_FROM_SB(size);
+ SET_FROM_SB(nr_disks);
+ SET_FROM_SB(raid_disks);
+ SET_FROM_SB(md_minor);
+ SET_FROM_SB(not_persistent);
+
+ SET_FROM_SB(utime);
+ SET_FROM_SB(state);
+ SET_FROM_SB(active_disks);
+ SET_FROM_SB(working_disks);
+ SET_FROM_SB(failed_disks);
+ SET_FROM_SB(spare_disks);
+
+ SET_FROM_SB(layout);
+ SET_FROM_SB(chunk_size);
+
+ if (md_copy_to_user(arg, &info, sizeof(info)))
+ return -EFAULT;
+
+ return 0;
+}
+#undef SET_FROM_SB
+
+#define SET_FROM_SB(x) info.x = mddev->sb->disks[nr].x
+static int get_disk_info(mddev_t * mddev, void * arg)
+{
+ mdu_disk_info_t info;
+ unsigned int nr;
+
+ if (!mddev->sb)
+ return -EINVAL;
+
+ if (md_copy_from_user(&info, arg, sizeof(info)))
+ return -EFAULT;
+
+ nr = info.number;
+ if (nr >= MD_SB_DISKS)
+ return -EINVAL;
+
+ SET_FROM_SB(major);
+ SET_FROM_SB(minor);
+ SET_FROM_SB(raid_disk);
+ SET_FROM_SB(state);
+
+ if (md_copy_to_user(arg, &info, sizeof(info)))
+ return -EFAULT;
+
+ return 0;
+}
+#undef SET_FROM_SB
+
+#define SET_SB(x) mddev->sb->disks[nr].x = info->x
+
+static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info)
+{
+ int err, size, persistent;
+ mdk_rdev_t *rdev;
+ unsigned int nr;
+ kdev_t dev;
+ dev = MKDEV(info->major,info->minor);
+
+ if (find_rdev_all(dev)) {
+ printk(KERN_WARNING "md: device %s already used in a RAID array!\n",
+ partition_name(dev));
+ return -EBUSY;
+ }
+ if (!mddev->sb) {
+ /* expecting a device which has a superblock */
+ err = md_import_device(dev, 1);
+ if (err) {
+ printk(KERN_WARNING "md: md_import_device returned %d\n", err);
+ return -EINVAL;
+ }
+ rdev = find_rdev_all(dev);
+ if (!rdev) {
+ MD_BUG();
+ return -EINVAL;
+ }
+ if (!list_empty(&mddev->disks)) {
+ mdk_rdev_t *rdev0 = md_list_entry(mddev->disks.next,
+ mdk_rdev_t, same_set);
+ if (!uuid_equal(rdev0, rdev)) {
+ printk(KERN_WARNING "md: %s has different UUID to %s\n",
+ partition_name(rdev->dev), partition_name(rdev0->dev));
+ export_rdev(rdev);
+ return -EINVAL;
+ }
+ if (!sb_equal(rdev0->sb, rdev->sb)) {
+ printk(KERN_WARNING "md: %s has same UUID but different superblock to %s\n",
+ partition_name(rdev->dev), partition_name(rdev0->dev));
+ export_rdev(rdev);
+ return -EINVAL;
+ }
+ }
+ bind_rdev_to_array(rdev, mddev);
+ return 0;
+ }
+
+ nr = info->number;
+ if (nr >= mddev->sb->nr_disks) {
+ MD_BUG();
+ return -EINVAL;
+ }
+
+
+ SET_SB(number);
+ SET_SB(major);
+ SET_SB(minor);
+ SET_SB(raid_disk);
+ SET_SB(state);
+
+ if ((info->state & (1<<MD_DISK_FAULTY))==0) {
+ err = md_import_device (dev, 0);
+ if (err) {
+ printk(KERN_WARNING "md: error, md_import_device() returned %d\n", err);
+ return -EINVAL;
+ }
+ rdev = find_rdev_all(dev);
+ if (!rdev) {
+ MD_BUG();
+ return -EINVAL;
+ }
+
+ rdev->old_dev = dev;
+ rdev->desc_nr = info->number;
+
+ bind_rdev_to_array(rdev, mddev);
+
+ persistent = !mddev->sb->not_persistent;
+ if (!persistent)
+ printk(KERN_INFO "md: nonpersistent superblock ...\n");
+
+ size = calc_dev_size(dev, mddev, persistent);
+ rdev->sb_offset = calc_dev_sboffset(dev, mddev, persistent);
+
+ if (!mddev->sb->size || (mddev->sb->size > size))
+ mddev->sb->size = size;
+ }
+
+ /*
+ * sync all other superblocks with the main superblock
+ */
+ sync_sbs(mddev);
+
+ return 0;
+}
+#undef SET_SB
+
+static int hot_generate_error(mddev_t * mddev, kdev_t dev)
+{
+ struct request_queue *q;
+ mdk_rdev_t *rdev;
+ mdp_disk_t *disk;
+
+ if (!mddev->pers)
+ return -ENODEV;
+
+ printk(KERN_INFO "md: trying to generate %s error in md%d ... \n",
+ partition_name(dev), mdidx(mddev));
+
+ rdev = find_rdev(mddev, dev);
+ if (!rdev) {
+ MD_BUG();
+ return -ENXIO;
+ }
+
+ if (rdev->desc_nr == -1) {
+ MD_BUG();
+ return -EINVAL;
+ }
+ disk = &mddev->sb->disks[rdev->desc_nr];
+ if (!disk_active(disk))
+ return -ENODEV;
+
+ q = blk_get_queue(rdev->dev);
+ if (!q) {
+ MD_BUG();
+ return -ENODEV;
+ }
+ printk(KERN_INFO "md: okay, generating error!\n");
+// q->oneshot_error = 1; // disabled for now
+
+ return 0;
+}
+
+static int hot_remove_disk(mddev_t * mddev, kdev_t dev)
+{
+ int err;
+ mdk_rdev_t *rdev;
+ mdp_disk_t *disk;
+
+ if (!mddev->pers)
+ return -ENODEV;
+
+ printk(KERN_INFO "md: trying to remove %s from md%d ... \n",
+ partition_name(dev), mdidx(mddev));
+
+ if (!mddev->pers->diskop) {
+ printk(KERN_WARNING "md%d: personality does not support diskops!\n",
+ mdidx(mddev));
+ return -EINVAL;
+ }
+
+ rdev = find_rdev(mddev, dev);
+ if (!rdev)
+ return -ENXIO;
+
+ if (rdev->desc_nr == -1) {
+ MD_BUG();
+ return -EINVAL;
+ }
+ disk = &mddev->sb->disks[rdev->desc_nr];
+ if (disk_active(disk))
+ goto busy;
+
+ if (disk_removed(disk))
+ return -EINVAL;
+
+ err = mddev->pers->diskop(mddev, &disk, DISKOP_HOT_REMOVE_DISK);
+ if (err == -EBUSY)
+ goto busy;
+
+ if (err) {
+ MD_BUG();
+ return -EINVAL;
+ }
+
+ remove_descriptor(disk, mddev->sb);
+ kick_rdev_from_array(rdev);
+ mddev->sb_dirty = 1;
+ md_update_sb(mddev);
+
+ return 0;
+busy:
+ printk(KERN_WARNING "md: cannot remove active disk %s from md%d ... \n",
+ partition_name(dev), mdidx(mddev));
+ return -EBUSY;
+}
+
+static int hot_add_disk(mddev_t * mddev, kdev_t dev)
+{
+ int i, err, persistent;
+ unsigned int size;
+ mdk_rdev_t *rdev;
+ mdp_disk_t *disk;
+
+ if (!mddev->pers)
+ return -ENODEV;
+
+ printk(KERN_INFO "md: trying to hot-add %s to md%d ... \n",
+ partition_name(dev), mdidx(mddev));
+
+ if (!mddev->pers->diskop) {
+ printk(KERN_WARNING "md%d: personality does not support diskops!\n",
+ mdidx(mddev));
+ return -EINVAL;
+ }
+
+ persistent = !mddev->sb->not_persistent;
+
+ rdev = find_rdev(mddev, dev);
+ if (rdev)
+ return -EBUSY;
+
+ err = md_import_device (dev, 0);
+ if (err) {
+ printk(KERN_WARNING "md: error, md_import_device() returned %d\n", err);
+ return -EINVAL;
+ }
+ rdev = find_rdev_all(dev);
+ if (!rdev) {
+ MD_BUG();
+ return -EINVAL;
+ }
+ if (rdev->faulty) {
+ printk(KERN_WARNING "md: can not hot-add faulty %s disk to md%d!\n",
+ partition_name(dev), mdidx(mddev));
+ err = -EINVAL;
+ goto abort_export;
+ }
+ size = calc_dev_size(dev, mddev, persistent);
+
+ if (size < mddev->sb->size) {
+ printk(KERN_WARNING "md%d: disk size %d blocks < array size %d\n",
+ mdidx(mddev), size, mddev->sb->size);
+ err = -ENOSPC;
+ goto abort_export;
+ }
+ bind_rdev_to_array(rdev, mddev);
+
+ /*
+ * The rest had better be atomic; disk failures can be
+ * noticed from interrupt context ...
+ */
+ rdev->old_dev = dev;
+ rdev->size = size;
+ rdev->sb_offset = calc_dev_sboffset(dev, mddev, persistent);
+
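+ /*
+ * Find a free descriptor slot behind the active raid disks: either a
+ * never-used entry (major and minor both zero) or one whose disk has
+ * been removed.
+ */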
+ disk = mddev->sb->disks + mddev->sb->raid_disks;
+ for (i = mddev->sb->raid_disks; i < MD_SB_DISKS; i++) {
+ disk = mddev->sb->disks + i;
+
+ if (!disk->major && !disk->minor)
+ break;
+ if (disk_removed(disk))
+ break;
+ }
+ if (i == MD_SB_DISKS) {
+ printk(KERN_WARNING "md%d: can not hot-add to full array!\n",
+ mdidx(mddev));
+ err = -EBUSY;
+ goto abort_unbind_export;
+ }
+
+ if (disk_removed(disk)) {
+ /*
+ * reuse slot
+ */
+ if (disk->number != i) {
+ MD_BUG();
+ err = -EINVAL;
+ goto abort_unbind_export;
+ }
+ } else {
+ disk->number = i;
+ }
+
+ disk->raid_disk = disk->number;
+ disk->major = MAJOR(dev);
+ disk->minor = MINOR(dev);
+
+ if (mddev->pers->diskop(mddev, &disk, DISKOP_HOT_ADD_DISK)) {
+ MD_BUG();
+ err = -EINVAL;
+ goto abort_unbind_export;
+ }
+
+ mark_disk_spare(disk);
+ mddev->sb->nr_disks++;
+ mddev->sb->spare_disks++;
+ mddev->sb->working_disks++;
+
+ mddev->sb_dirty = 1;
+ md_update_sb(mddev);
+
+ /*
+ * Kick recovery, maybe this spare has to be added to the
+ * array immediately.
+ */
+ md_recover_arrays();
+
+ return 0;
+
+abort_unbind_export:
+ unbind_rdev_from_array(rdev);
+
+abort_export:
+ export_rdev(rdev);
+ return err;
+}
+
+#define SET_SB(x) mddev->sb->x = info->x
+static int set_array_info(mddev_t * mddev, mdu_array_info_t *info)
+{
+
+ if (alloc_array_sb(mddev))
+ return -ENOMEM;
+
+ mddev->sb->major_version = MD_MAJOR_VERSION;
+ mddev->sb->minor_version = MD_MINOR_VERSION;
+ mddev->sb->patch_version = MD_PATCHLEVEL_VERSION;
+ mddev->sb->ctime = CURRENT_TIME;
+
+ SET_SB(level);
+ SET_SB(size);
+ SET_SB(nr_disks);
+ SET_SB(raid_disks);
+ SET_SB(md_minor);
+ SET_SB(not_persistent);
+
+ SET_SB(state);
+ SET_SB(active_disks);
+ SET_SB(working_disks);
+ SET_SB(failed_disks);
+ SET_SB(spare_disks);
+
+ SET_SB(layout);
+ SET_SB(chunk_size);
+
+ mddev->sb->md_magic = MD_SB_MAGIC;
+
+ /*
+ * Generate a 128 bit UUID
+ */
+ get_random_bytes(&mddev->sb->set_uuid0, 4);
+ get_random_bytes(&mddev->sb->set_uuid1, 4);
+ get_random_bytes(&mddev->sb->set_uuid2, 4);
+ get_random_bytes(&mddev->sb->set_uuid3, 4);
+
+ return 0;
+}
+#undef SET_SB
+
+static int set_disk_faulty(mddev_t *mddev, kdev_t dev)
+{
+ int ret;
+
+ ret = md_error(mddev, dev);
+ return ret;
+}
+
+static int md_ioctl(struct inode *inode, struct file *file,
+ unsigned int cmd, unsigned long arg)
+{
+ unsigned int minor;
+ int err = 0;
+ struct hd_geometry *loc = (struct hd_geometry *) arg;
+ mddev_t *mddev = NULL;
+ kdev_t dev;
+
+ if (!md_capable_admin())
+ return -EACCES;
+
+ dev = inode->i_rdev;
+ minor = MINOR(dev);
+ if (minor >= MAX_MD_DEVS) {
+ MD_BUG();
+ return -EINVAL;
+ }
+
+ /*
+ * Commands dealing with the RAID driver but not any
+ * particular array:
+ */
+ switch (cmd)
+ {
+ case RAID_VERSION:
+ err = get_version((void *)arg);
+ goto done;
+
+ case PRINT_RAID_DEBUG:
+ err = 0;
+ md_print_devices();
+ goto done_unlock;
+
+#ifndef MODULE
+ case RAID_AUTORUN:
+ err = 0;
+ autostart_arrays();
+ goto done;
+#endif
+
+ case BLKGETSIZE:
+ case BLKGETSIZE64:
+ case BLKRAGET:
+ case BLKRASET:
+ case BLKFLSBUF:
+ case BLKBSZGET:
+ case BLKBSZSET:
+ err = blk_ioctl (dev, cmd, arg);
+ goto abort;
+
+ default:;
+ }
+
+ /*
+ * Commands creating/starting a new array:
+ */
+
+ mddev = kdev_to_mddev(dev);
+
+ switch (cmd)
+ {
+ case SET_ARRAY_INFO:
+ case START_ARRAY:
+ if (mddev) {
+ printk(KERN_WARNING "md: array md%d already exists!\n",
+ mdidx(mddev));
+ err = -EEXIST;
+ goto abort;
+ }
+ default:;
+ }
+ switch (cmd)
+ {
+ case SET_ARRAY_INFO:
+ mddev = alloc_mddev(dev);
+ if (!mddev) {
+ err = -ENOMEM;
+ goto abort;
+ }
+ atomic_inc(&mddev->active);
+
+ /*
+ * alloc_mddev() should possibly self-lock.
+ */
+ err = lock_mddev(mddev);
+ if (err) {
+ printk(KERN_WARNING "md: ioctl, reason %d, cmd %d\n",
+ err, cmd);
+ goto abort;
+ }
+
+ if (mddev->sb) {
+ printk(KERN_WARNING "md: array md%d already has a superblock!\n",
+ mdidx(mddev));
+ err = -EBUSY;
+ goto abort_unlock;
+ }
+ if (arg) {
+ mdu_array_info_t info;
+ if (md_copy_from_user(&info, (void*)arg, sizeof(info))) {
+ err = -EFAULT;
+ goto abort_unlock;
+ }
+ err = set_array_info(mddev, &info);
+ if (err) {
+ printk(KERN_WARNING "md: couldnt set array info. %d\n", err);
+ goto abort_unlock;
+ }
+ }
+ goto done_unlock;
+
+ case START_ARRAY:
+ /*
+ * possibly make it lock the array ...
+ */
+ err = autostart_array((kdev_t)arg, dev);
+ if (err) {
+ printk(KERN_WARNING "md: autostart %s failed!\n",
+ partition_name((kdev_t)arg));
+ goto abort;
+ }
+ goto done;
+
+ default:;
+ }
+
+ /*
+ * Commands querying/configuring an existing array:
+ */
+
+ if (!mddev) {
+ err = -ENODEV;
+ goto abort;
+ }
+ err = lock_mddev(mddev);
+ if (err) {
+ printk(KERN_INFO "md: ioctl lock interrupted, reason %d, cmd %d\n",err, cmd);
+ goto abort;
+ }
+ /* if we don't have a superblock yet, only ADD_NEW_DISK or STOP_ARRAY is allowed */
+ if (!mddev->sb && cmd != ADD_NEW_DISK && cmd != STOP_ARRAY && cmd != RUN_ARRAY) {
+ err = -ENODEV;
+ goto abort_unlock;
+ }
+
+ /*
+ * Commands even a read-only array can execute:
+ */
+ switch (cmd)
+ {
+ case GET_ARRAY_INFO:
+ err = get_array_info(mddev, (void *)arg);
+ goto done_unlock;
+
+ case GET_DISK_INFO:
+ err = get_disk_info(mddev, (void *)arg);
+ goto done_unlock;
+
+ case RESTART_ARRAY_RW:
+ err = restart_array(mddev);
+ goto done_unlock;
+
+ case STOP_ARRAY:
+ if (!(err = do_md_stop (mddev, 0)))
+ mddev = NULL;
+ goto done_unlock;
+
+ case STOP_ARRAY_RO:
+ err = do_md_stop (mddev, 1);
+ goto done_unlock;
+
+ /*
+ * We have a problem here: there is no easy way to give a CHS
+ * virtual geometry. We currently pretend that we have 2 heads and
+ * 4 sectors (with a BIG number of cylinders...). This drives
+ * dosfs just mad... ;-)
+ */
+ case HDIO_GETGEO:
+ if (!loc) {
+ err = -EINVAL;
+ goto abort_unlock;
+ }
+ err = md_put_user (2, (char *) &loc->heads);
+ if (err)
+ goto abort_unlock;
+ err = md_put_user (4, (char *) &loc->sectors);
+ if (err)
+ goto abort_unlock;
+ err = md_put_user (md_hd_struct[mdidx(mddev)].nr_sects/8,
+ (short *) &loc->cylinders);
+ if (err)
+ goto abort_unlock;
+ err = md_put_user (md_hd_struct[minor].start_sect,
+ (long *) &loc->start);
+ goto done_unlock;
+ }
+
+ /*
+ * The remaining ioctls are changing the state of the
+ * superblock, so we do not allow read-only arrays
+ * here:
+ */
+ if (mddev->ro) {
+ err = -EROFS;
+ goto abort_unlock;
+ }
+
+ switch (cmd)
+ {
+ case ADD_NEW_DISK:
+ {
+ mdu_disk_info_t info;
+ if (md_copy_from_user(&info, (void*)arg, sizeof(info)))
+ err = -EFAULT;
+ else
+ err = add_new_disk(mddev, &info);
+ goto done_unlock;
+ }
+ case HOT_GENERATE_ERROR:
+ err = hot_generate_error(mddev, (kdev_t)arg);
+ goto done_unlock;
+ case HOT_REMOVE_DISK:
+ err = hot_remove_disk(mddev, (kdev_t)arg);
+ goto done_unlock;
+
+ case HOT_ADD_DISK:
+ err = hot_add_disk(mddev, (kdev_t)arg);
+ goto done_unlock;
+
+ case SET_DISK_FAULTY:
+ err = set_disk_faulty(mddev, (kdev_t)arg);
+ goto done_unlock;
+
+ case RUN_ARRAY:
+ {
+ err = do_md_run (mddev);
+ /*
+ * we have to clean up the mess if
+ * the array cannot be run for some
+ * reason ...
+ */
+ if (err) {
+ mddev->sb_dirty = 0;
+ if (!do_md_stop (mddev, 0))
+ mddev = NULL;
+ }
+ goto done_unlock;
+ }
+
+ default:
+ printk(KERN_WARNING "md: %s(pid %d) used obsolete MD ioctl, "
+ "upgrade your software to use new ictls.\n",
+ current->comm, current->pid);
+ err = -EINVAL;
+ goto abort_unlock;
+ }
+
+done_unlock:
+abort_unlock:
+ if (mddev)
+ unlock_mddev(mddev);
+
+ return err;
+done:
+ if (err)
+ MD_BUG();
+abort:
+ return err;
+}
+
+static int md_open(struct inode *inode, struct file *file)
+{
+ /*
+ * Always succeed, but increment the usage count
+ */
+ mddev_t *mddev = kdev_to_mddev(inode->i_rdev);
+ if (mddev)
+ atomic_inc(&mddev->active);
+ return (0);
+}
+
+static int md_release(struct inode *inode, struct file * file)
+{
+ mddev_t *mddev = kdev_to_mddev(inode->i_rdev);
+ if (mddev)
+ atomic_dec(&mddev->active);
+ return 0;
+}
+
+static struct block_device_operations md_fops=
+{
+ owner: THIS_MODULE,
+ open: md_open,
+ release: md_release,
+ ioctl: md_ioctl,
+};
+
+
+int md_thread(void * arg)
+{
+ mdk_thread_t *thread = arg;
+
+ md_lock_kernel();
+
+ /*
+ * Detach thread
+ */
+
+ daemonize();
+
+ sprintf(current->comm, thread->name);
+ md_init_signals();
+ md_flush_signals();
+ thread->tsk = current;
+
+ /*
+ * md_thread is a 'system-thread', its priority should be very
+ * high. We avoid resource deadlocks individually in each
+ * raid personality. (RAID5 does preallocation) We also use RR and
+ * the very same RT priority as kswapd, thus we will never get
+ * into a priority inversion deadlock.
+ *
+ * we definitely have to have equal or higher priority than
+ * bdflush, otherwise bdflush will deadlock if there are too
+ * many dirty RAID5 blocks.
+ */
+ current->policy = SCHED_OTHER;
+ current->nice = -20;
+ md_unlock_kernel();
+
+ complete(thread->event);
+ while (thread->run) {
+ void (*run)(void *data);
+
+ wait_event_interruptible(thread->wqueue,
+ test_bit(THREAD_WAKEUP, &thread->flags));
+
+ clear_bit(THREAD_WAKEUP, &thread->flags);
+
+ run = thread->run;
+ if (run) {
+ run(thread->data);
+ run_task_queue(&tq_disk);
+ }
+ if (md_signal_pending(current))
+ md_flush_signals();
+ }
+ complete(thread->event);
+ return 0;
+}
+
+void md_wakeup_thread(mdk_thread_t *thread)
+{
+ dprintk("md: waking up MD thread %p.\n", thread);
+ set_bit(THREAD_WAKEUP, &thread->flags);
+ wake_up(&thread->wqueue);
+}
+
+mdk_thread_t *md_register_thread(void (*run) (void *),
+ void *data, const char *name)
+{
+ mdk_thread_t *thread;
+ int ret;
+ struct completion event;
+
+ thread = (mdk_thread_t *) kmalloc
+ (sizeof(mdk_thread_t), GFP_KERNEL);
+ if (!thread)
+ return NULL;
+
+ memset(thread, 0, sizeof(mdk_thread_t));
+ md_init_waitqueue_head(&thread->wqueue);
+
+ init_completion(&event);
+ thread->event = &event;
+ thread->run = run;
+ thread->data = data;
+ thread->name = name;
+ ret = kernel_thread(md_thread, thread, 0);
+ if (ret < 0) {
+ kfree(thread);
+ return NULL;
+ }
+ wait_for_completion(&event);
+ return thread;
+}
+
+void md_interrupt_thread(mdk_thread_t *thread)
+{
+ if (!thread->tsk) {
+ MD_BUG();
+ return;
+ }
+ dprintk("interrupting MD-thread pid %d\n", thread->tsk->pid);
+ send_sig(SIGKILL, thread->tsk, 1);
+}
+
+void md_unregister_thread(mdk_thread_t *thread)
+{
+ struct completion event;
+
+ init_completion(&event);
+
+ thread->event = &event;
+ thread->run = NULL;
+ thread->name = NULL;
+ md_interrupt_thread(thread);
+ wait_for_completion(&event);
+ kfree(thread);
+}
+
+void md_recover_arrays(void)
+{
+ if (!md_recovery_thread) {
+ MD_BUG();
+ return;
+ }
+ md_wakeup_thread(md_recovery_thread);
+}
+
+
+int md_error(mddev_t *mddev, kdev_t rdev)
+{
+ mdk_rdev_t * rrdev;
+
+ dprintk("md_error dev:(%d:%d), rdev:(%d:%d), (caller: %p,%p,%p,%p).\n",
+ MD_MAJOR,mdidx(mddev),MAJOR(rdev),MINOR(rdev),
+ __builtin_return_address(0),__builtin_return_address(1),
+ __builtin_return_address(2),__builtin_return_address(3));
+
+ if (!mddev) {
+ MD_BUG();
+ return 0;
+ }
+ rrdev = find_rdev(mddev, rdev);
+ if (!rrdev || rrdev->faulty)
+ return 0;
+ if (!mddev->pers->error_handler
+ || mddev->pers->error_handler(mddev,rdev) <= 0) {
+ rrdev->faulty = 1;
+ } else
+ return 1;
+ /*
+ * if recovery was running, stop it now.
+ */
+ if (mddev->pers->stop_resync)
+ mddev->pers->stop_resync(mddev);
+ if (mddev->recovery_running)
+ md_interrupt_thread(md_recovery_thread);
+ md_recover_arrays();
+
+ return 0;
+}
+
+static void status_unused(struct seq_file *seq)
+{
+ int i = 0;
+ mdk_rdev_t *rdev;
+ struct md_list_head *tmp;
+
+ seq_printf(seq, "unused devices: ");
+
+ ITERATE_RDEV_ALL(rdev,tmp) {
+ if (list_empty(&rdev->same_set)) {
+ /*
+ * The device is not yet used by any array.
+ */
+ i++;
+ seq_printf(seq, "%s ",
+ partition_name(rdev->dev));
+ }
+ }
+ if (!i)
+ seq_printf(seq, "<none>");
+
+ seq_printf(seq, "\n");
+}
+
+
+static void status_resync(struct seq_file *seq, mddev_t * mddev)
+{
+ unsigned long max_blocks, resync, res, dt, db, rt;
+
+ resync = (mddev->curr_resync - atomic_read(&mddev->recovery_active))/2;
+ max_blocks = mddev->sb->size;
+
+ /*
+ * Should not happen.
+ */
+ if (!max_blocks)
+ MD_BUG();
+
+ res = (resync/1024)*1000/(max_blocks/1024 + 1);
+ {
+ int i, x = res/50, y = 20-x;
+ seq_printf(seq, "[");
+ for (i = 0; i < x; i++)
+ seq_printf(seq, "=");
+ seq_printf(seq, ">");
+ for (i = 0; i < y; i++)
+ seq_printf(seq, ".");
+ seq_printf(seq, "] ");
+ }
+ if (!mddev->recovery_running)
+ /*
+ * true resync
+ */
+ seq_printf(seq, " resync =%3lu.%lu%% (%lu/%lu)",
+ res/10, res % 10, resync, max_blocks);
+ else
+ /*
+ * recovery ...
+ */
+ seq_printf(seq, " recovery =%3lu.%lu%% (%lu/%lu)",
+ res/10, res % 10, resync, max_blocks);
+
+ /*
+ * We do not want to overflow, so the order of operands and
+ * the * 100 / 100 trick are important. We do a +1 to be
+ * safe against division by zero. We only estimate anyway.
+ *
+ * dt: time from mark until now
+ * db: blocks written from mark until now
+ * rt: remaining time
+ */
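+ /*
+ * An illustrative example (hypothetical numbers): if db = 3000
+ * blocks were written in the dt = 30 seconds since the mark and
+ * 60000 blocks remain, then
+ * rt = (30 * (60000 / (3000/100 + 1))) / 100 = 580 seconds,
+ * i.e. speed = db/dt = 100K/sec and "finish=9.6min".
+ */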
+ dt = ((jiffies - mddev->resync_mark) / HZ);
+ if (!dt) dt++;
+ db = resync - (mddev->resync_mark_cnt/2);
+ rt = (dt * ((max_blocks-resync) / (db/100+1)))/100;
+
+ seq_printf(seq, " finish=%lu.%lumin", rt / 60, (rt % 60)/6);
+
+ seq_printf(seq, " speed=%ldK/sec", db/dt);
+
+}
+
+
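+/*
+ * /proc/mdstat iteration below uses two sentinel cursor values in
+ * addition to real mddev pointers: (void*)1 stands for the header
+ * (personalities and read_ahead lines) and (void*)2 for the
+ * trailing "unused devices:" line printed by status_unused().
+ */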
+static void *md_seq_start(struct seq_file *seq, loff_t *pos)
+{
+ struct list_head *tmp;
+ loff_t l = *pos;
+ mddev_t *mddev;
+
+ if (l > 0x10000)
+ return NULL;
+ if (!l--)
+ /* header */
+ return (void*)1;
+
+ list_for_each(tmp,&all_mddevs)
+ if (!l--) {
+ mddev = list_entry(tmp, mddev_t, all_mddevs);
+ return mddev;
+ }
+ return (void*)2;/* tail */
+}
+
+static void *md_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ struct list_head *tmp;
+ mddev_t *next_mddev, *mddev = v;
+
+ ++*pos;
+ if (v == (void*)2)
+ return NULL;
+
+ if (v == (void*)1)
+ tmp = all_mddevs.next;
+ else
+ tmp = mddev->all_mddevs.next;
+ if (tmp != &all_mddevs)
+ next_mddev = list_entry(tmp,mddev_t,all_mddevs);
+ else {
+ next_mddev = (void*)2;
+ *pos = 0x10000;
+ }
+
+ return next_mddev;
+
+}
+
+static void md_seq_stop(struct seq_file *seq, void *v)
+{
+
+}
+
+static int md_seq_show(struct seq_file *seq, void *v)
+{
+ int j, size;
+ struct md_list_head *tmp2;
+ mdk_rdev_t *rdev;
+ mddev_t *mddev = v;
+
+ if (v == (void*)1) {
+ seq_printf(seq, "Personalities : ");
+ for (j = 0; j < MAX_PERSONALITY; j++)
+ if (pers[j])
+ seq_printf(seq, "[%s] ", pers[j]->name);
+
+ seq_printf(seq, "\n");
+ seq_printf(seq, "read_ahead ");
+ if (read_ahead[MD_MAJOR] == INT_MAX)
+ seq_printf(seq, "not set\n");
+ else
+ seq_printf(seq, "%d sectors\n", read_ahead[MD_MAJOR]);
+ return 0;
+ }
+ if (v == (void*)2) {
+ status_unused(seq);
+ return 0;
+ }
+
+ seq_printf(seq, "md%d : %sactive", mdidx(mddev),
+ mddev->pers ? "" : "in");
+ if (mddev->pers) {
+ if (mddev->ro)
+ seq_printf(seq, " (read-only)");
+ seq_printf(seq, " %s", mddev->pers->name);
+ }
+
+ size = 0;
+ ITERATE_RDEV(mddev,rdev,tmp2) {
+ seq_printf(seq, " %s[%d]",
+ partition_name(rdev->dev), rdev->desc_nr);
+ if (rdev->faulty) {
+ seq_printf(seq, "(F)");
+ continue;
+ }
+ size += rdev->size;
+ }
+
+ if (!list_empty(&mddev->disks)) {
+ if (mddev->pers)
+ seq_printf(seq, "\n %d blocks",
+ md_size[mdidx(mddev)]);
+ else
+ seq_printf(seq, "\n %d blocks", size);
+ }
+
+ if (mddev->pers) {
+
+ mddev->pers->status (seq, mddev);
+
+ seq_printf(seq, "\n ");
+ if (mddev->curr_resync) {
+ status_resync (seq, mddev);
+ } else {
+ if (sem_getcount(&mddev->resync_sem) != 1)
+ seq_printf(seq, " resync=DELAYED");
+ }
+ }
+ seq_printf(seq, "\n");
+
+ return 0;
+}
+
+
+static struct seq_operations md_seq_ops = {
+ .start = md_seq_start,
+ .next = md_seq_next,
+ .stop = md_seq_stop,
+ .show = md_seq_show,
+};
+
+static int md_seq_open(struct inode *inode, struct file *file)
+{
+ int error;
+
+ error = seq_open(file, &md_seq_ops);
+ return error;
+}
+
+static struct file_operations md_seq_fops = {
+ .open = md_seq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+
+int register_md_personality(int pnum, mdk_personality_t *p)
+{
+ if (pnum >= MAX_PERSONALITY) {
+ MD_BUG();
+ return -EINVAL;
+ }
+
+ if (pers[pnum]) {
+ MD_BUG();
+ return -EBUSY;
+ }
+
+ pers[pnum] = p;
+ printk(KERN_INFO "md: %s personality registered as nr %d\n", p->name, pnum);
+ return 0;
+}
+
+int unregister_md_personality(int pnum)
+{
+ if (pnum >= MAX_PERSONALITY) {
+ MD_BUG();
+ return -EINVAL;
+ }
+
+ printk(KERN_INFO "md: %s personality unregistered\n", pers[pnum]->name);
+ pers[pnum] = NULL;
+ return 0;
+}
+
+mdp_disk_t *get_spare(mddev_t *mddev)
+{
+ mdp_super_t *sb = mddev->sb;
+ mdp_disk_t *disk;
+ mdk_rdev_t *rdev;
+ struct md_list_head *tmp;
+
+ ITERATE_RDEV(mddev,rdev,tmp) {
+ if (rdev->faulty)
+ continue;
+ if (!rdev->sb) {
+ MD_BUG();
+ continue;
+ }
+ disk = &sb->disks[rdev->desc_nr];
+ if (disk_faulty(disk)) {
+ MD_BUG();
+ continue;
+ }
+ if (disk_active(disk))
+ continue;
+ return disk;
+ }
+ return NULL;
+}
+
+static unsigned int sync_io[DK_MAX_MAJOR][DK_MAX_DISK];
+void md_sync_acct(kdev_t dev, unsigned long nr_sectors)
+{
+ unsigned int major = MAJOR(dev);
+ unsigned int index;
+
+ index = disk_index(dev);
+ if ((index >= DK_MAX_DISK) || (major >= DK_MAX_MAJOR))
+ return;
+
+ sync_io[major][index] += nr_sectors;
+}
+
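+/*
+ * Idle heuristic: sync_io[][] above accumulates the sectors issued
+ * by the resync itself (accounted via md_sync_acct), so subtracting
+ * it from the kstat per-disk totals leaves only "foreign" IO. More
+ * than 32 such sectors since the last check marks the array as
+ * non-idle, and md_do_sync() then throttles itself.
+ */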
+static int is_mddev_idle(mddev_t *mddev)
+{
+ mdk_rdev_t * rdev;
+ struct md_list_head *tmp;
+ int idle;
+ unsigned long curr_events;
+
+ idle = 1;
+ ITERATE_RDEV(mddev,rdev,tmp) {
+ int major = MAJOR(rdev->dev);
+ int idx = disk_index(rdev->dev);
+
+ if ((idx >= DK_MAX_DISK) || (major >= DK_MAX_MAJOR))
+ continue;
+
+ curr_events = kstat.dk_drive_rblk[major][idx] +
+ kstat.dk_drive_wblk[major][idx] ;
+ curr_events -= sync_io[major][idx];
+ if ((curr_events - rdev->last_events) > 32) {
+ rdev->last_events = curr_events;
+ idle = 0;
+ }
+ }
+ return idle;
+}
+
+MD_DECLARE_WAIT_QUEUE_HEAD(resync_wait);
+
+void md_done_sync(mddev_t *mddev, int blocks, int ok)
+{
+ /* another "blocks" (512byte) blocks have been synced */
+ atomic_sub(blocks, &mddev->recovery_active);
+ wake_up(&mddev->recovery_wait);
+ if (!ok) {
+ // stop recovery, signal do_sync ....
+ if (mddev->pers->stop_resync)
+ mddev->pers->stop_resync(mddev);
+ if (mddev->recovery_running)
+ md_interrupt_thread(md_recovery_thread);
+ }
+}
+
+#define SYNC_MARKS 10
+#define SYNC_MARK_STEP (3*HZ)
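+/*
+ * Speed accounting uses a ring of SYNC_MARKS timestamps stepped
+ * every SYNC_MARK_STEP jiffies; resync_mark/resync_mark_cnt always
+ * refer to the oldest mark, so both the throttling below and the
+ * /proc/mdstat speed estimate are averaged over a sliding window of
+ * roughly SYNC_MARKS * 3 seconds.
+ */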
+int md_do_sync(mddev_t *mddev, mdp_disk_t *spare)
+{
+ mddev_t *mddev2;
+ unsigned int max_sectors, currspeed,
+ j, window, err, serialize;
+ unsigned long mark[SYNC_MARKS];
+ unsigned long mark_cnt[SYNC_MARKS];
+ int last_mark,m;
+ struct md_list_head *tmp;
+ unsigned long last_check;
+
+
+ err = down_interruptible(&mddev->resync_sem);
+ if (err)
+ goto out_nolock;
+
+recheck:
+ serialize = 0;
+ ITERATE_MDDEV(mddev2,tmp) {
+ if (mddev2 == mddev)
+ continue;
+ if (mddev2->curr_resync && match_mddev_units(mddev,mddev2)) {
+ printk(KERN_INFO "md: delaying resync of md%d until md%d "
+ "has finished resync (they share one or more physical units)\n",
+ mdidx(mddev), mdidx(mddev2));
+ serialize = 1;
+ break;
+ }
+ }
+ if (serialize) {
+ interruptible_sleep_on(&resync_wait);
+ if (md_signal_pending(current)) {
+ md_flush_signals();
+ err = -EINTR;
+ goto out;
+ }
+ goto recheck;
+ }
+
+ mddev->curr_resync = 1;
+
+ max_sectors = mddev->sb->size<<1;
+
+ printk(KERN_INFO "md: syncing RAID array md%d\n", mdidx(mddev));
+ printk(KERN_INFO "md: minimum _guaranteed_ reconstruction speed: %d KB/sec/disc.\n",
+ sysctl_speed_limit_min);
+ printk(KERN_INFO "md: using maximum available idle IO bandwith "
+ "(but not more than %d KB/sec) for reconstruction.\n",
+ sysctl_speed_limit_max);
+
+ /*
+ * Resync has low priority.
+ */
+ current->nice = 19;
+
+ is_mddev_idle(mddev); /* this also initializes IO event counters */
+ for (m = 0; m < SYNC_MARKS; m++) {
+ mark[m] = jiffies;
+ mark_cnt[m] = 0;
+ }
+ last_mark = 0;
+ mddev->resync_mark = mark[last_mark];
+ mddev->resync_mark_cnt = mark_cnt[last_mark];
+
+ /*
+ * Tune reconstruction:
+ */
+ window = vm_max_readahead*(PAGE_SIZE/512);
+ printk(KERN_INFO "md: using %dk window, over a total of %d blocks.\n",
+ window/2,max_sectors/2);
+
+ atomic_set(&mddev->recovery_active, 0);
+ init_waitqueue_head(&mddev->recovery_wait);
+ last_check = 0;
+ for (j = 0; j < max_sectors;) {
+ int sectors;
+
+ sectors = mddev->pers->sync_request(mddev, j);
+
+ if (sectors < 0) {
+ err = sectors;
+ goto out;
+ }
+ atomic_add(sectors, &mddev->recovery_active);
+ j += sectors;
+ mddev->curr_resync = j;
+
+ if (last_check + window > j)
+ continue;
+
+ last_check = j;
+
+ run_task_queue(&tq_disk);
+
+ repeat:
+ if (jiffies >= mark[last_mark] + SYNC_MARK_STEP ) {
+ /* step marks */
+ int next = (last_mark+1) % SYNC_MARKS;
+
+ mddev->resync_mark = mark[next];
+ mddev->resync_mark_cnt = mark_cnt[next];
+ mark[next] = jiffies;
+ mark_cnt[next] = j - atomic_read(&mddev->recovery_active);
+ last_mark = next;
+ }
+
+
+ if (md_signal_pending(current)) {
+ /*
+ * got a signal, exit.
+ */
+ mddev->curr_resync = 0;
+ printk(KERN_INFO "md: md_do_sync() got signal ... exiting\n");
+ md_flush_signals();
+ err = -EINTR;
+ goto out;
+ }
+
+ /*
+ * this loop exits only when we are either slower than the
+ * 'hard' speed limit, or the system was IO-idle for
+ * a jiffy.
+ * the system might be non-idle CPU-wise, but we only care
+ * about not overloading the IO subsystem. (things like an
+ * e2fsck being done on the RAID array should execute fast)
+ */
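+ /*
+ * currspeed below is in 1K blocks per second: sectors done since
+ * the mark, divided by 2 and by the elapsed seconds (the +1 terms
+ * only guard against division by zero). While above
+ * sysctl_speed_limit_min the thread stays at nice 19 and sleeps
+ * for HZ/4 whenever it exceeds sysctl_speed_limit_max or the
+ * array sees other IO; below the minimum it boosts itself to
+ * nice -20 so the resync is never starved completely.
+ */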
+ if (md_need_resched(current))
+ schedule();
+
+ currspeed = (j-mddev->resync_mark_cnt)/2/((jiffies-mddev->resync_mark)/HZ +1) +1;
+
+ if (currspeed > sysctl_speed_limit_min) {
+ current->nice = 19;
+
+ if ((currspeed > sysctl_speed_limit_max) ||
+ !is_mddev_idle(mddev)) {
+ current->state = TASK_INTERRUPTIBLE;
+ md_schedule_timeout(HZ/4);
+ goto repeat;
+ }
+ } else
+ current->nice = -20;
+ }
+ printk(KERN_INFO "md: md%d: sync done.\n",mdidx(mddev));
+ err = 0;
+ /*
+ * this also signals 'finished resyncing' to md_stop
+ */
+out:
+ wait_disk_event(mddev->recovery_wait, atomic_read(&mddev->recovery_active)==0);
+ up(&mddev->resync_sem);
+out_nolock:
+ mddev->curr_resync = 0;
+ wake_up(&resync_wait);
+ return err;
+}
+
+
+/*
+ * This is a kernel thread which syncs a spare disk with the active array
+ *
+ * the amount of foolproofing might seem to be a tad excessive, but an
+ * early (not so error-safe) version of raid1syncd synced the first 0.5 gigs
+ * of my root partition with the first 0.5 gigs of my /home partition ... so
+ * i'm a bit nervous ;)
+ */
+void md_do_recovery(void *data)
+{
+ int err;
+ mddev_t *mddev;
+ mdp_super_t *sb;
+ mdp_disk_t *spare;
+ struct md_list_head *tmp;
+
+ printk(KERN_INFO "md: recovery thread got woken up ...\n");
+restart:
+ ITERATE_MDDEV(mddev,tmp) {
+ sb = mddev->sb;
+ if (!sb)
+ continue;
+ if (mddev->recovery_running)
+ continue;
+ if (sb->active_disks == sb->raid_disks)
+ continue;
+ if (mddev->sb_dirty)
+ md_update_sb(mddev);
+ if (!sb->spare_disks) {
+ printk(KERN_ERR "md%d: no spare disk to reconstruct array! "
+ "-- continuing in degraded mode\n", mdidx(mddev));
+ continue;
+ }
+ /*
+ * now here we get the spare and resync it.
+ */
+ spare = get_spare(mddev);
+ if (!spare)
+ continue;
+ printk(KERN_INFO "md%d: resyncing spare disk %s to replace failed disk\n",
+ mdidx(mddev), partition_name(MKDEV(spare->major,spare->minor)));
+ if (!mddev->pers->diskop)
+ continue;
+ if (mddev->pers->diskop(mddev, &spare, DISKOP_SPARE_WRITE))
+ continue;
+ down(&mddev->recovery_sem);
+ mddev->recovery_running = 1;
+ err = md_do_sync(mddev, spare);
+ if (err == -EIO) {
+ printk(KERN_INFO "md%d: spare disk %s failed, skipping to next spare.\n",
+ mdidx(mddev), partition_name(MKDEV(spare->major,spare->minor)));
+ if (!disk_faulty(spare)) {
+ mddev->pers->diskop(mddev,&spare,DISKOP_SPARE_INACTIVE);
+ mark_disk_faulty(spare);
+ mark_disk_nonsync(spare);
+ mark_disk_inactive(spare);
+ sb->spare_disks--;
+ sb->working_disks--;
+ sb->failed_disks++;
+ }
+ } else
+ if (disk_faulty(spare))
+ mddev->pers->diskop(mddev, &spare,
+ DISKOP_SPARE_INACTIVE);
+ if (err == -EINTR || err == -ENOMEM) {
+ /*
+ * Recovery got interrupted, or ran out of mem ...
+ * signal back that we have finished using the array.
+ */
+ mddev->pers->diskop(mddev, &spare,
+ DISKOP_SPARE_INACTIVE);
+ up(&mddev->recovery_sem);
+ mddev->recovery_running = 0;
+ continue;
+ } else {
+ mddev->recovery_running = 0;
+ up(&mddev->recovery_sem);
+ }
+ if (!disk_faulty(spare)) {
+ /*
+ * the SPARE_ACTIVE diskop possibly changes the
+ * pointer too
+ */
+ mddev->pers->diskop(mddev, &spare, DISKOP_SPARE_ACTIVE);
+ mark_disk_sync(spare);
+ mark_disk_active(spare);
+ sb->active_disks++;
+ sb->spare_disks--;
+ }
+ mddev->sb_dirty = 1;
+ md_update_sb(mddev);
+ goto restart;
+ }
+ printk(KERN_INFO "md: recovery thread finished ...\n");
+
+}
+
+int md_notify_reboot(struct notifier_block *this,
+ unsigned long code, void *x)
+{
+ struct md_list_head *tmp;
+ mddev_t *mddev;
+
+ if ((code == MD_SYS_DOWN) || (code == MD_SYS_HALT)
+ || (code == MD_SYS_POWER_OFF)) {
+
+ printk(KERN_INFO "md: stopping all md devices.\n");
+
+ ITERATE_MDDEV(mddev,tmp)
+ do_md_stop (mddev, 1);
+ /*
+ * certain more exotic SCSI devices are known to be
+ * volatile wrt too early system reboots. While the
+ * right place to handle this issue is the given
+ * driver, we do want to have a safe RAID driver ...
+ */
+ md_mdelay(1000*1);
+ }
+ return NOTIFY_DONE;
+}
+
+struct notifier_block md_notifier = {
+ notifier_call: md_notify_reboot,
+ next: NULL,
+ priority: INT_MAX, /* before any real devices */
+};
+
+static void md_geninit(void)
+{
+ struct proc_dir_entry *p;
+ int i;
+
+ for(i = 0; i < MAX_MD_DEVS; i++) {
+ md_blocksizes[i] = 1024;
+ md_size[i] = 0;
+ md_hardsect_sizes[i] = 512;
+ }
+ blksize_size[MAJOR_NR] = md_blocksizes;
+ blk_size[MAJOR_NR] = md_size;
+ max_readahead[MAJOR_NR] = md_maxreadahead;
+ hardsect_size[MAJOR_NR] = md_hardsect_sizes;
+
+ dprintk("md: sizeof(mdp_super_t) = %d\n", (int)sizeof(mdp_super_t));
+
+#ifdef CONFIG_PROC_FS
+ p = create_proc_entry("mdstat", S_IRUGO, NULL);
+ if (p)
+ p->proc_fops = &md_seq_fops;
+#endif
+}
+
+request_queue_t * md_queue_proc(kdev_t dev)
+{
+ mddev_t *mddev = kdev_to_mddev(dev);
+ if (mddev == NULL)
+ return BLK_DEFAULT_QUEUE(MAJOR_NR);
+ else
+ return &mddev->queue;
+}
+
+int md__init md_init(void)
+{
+ static char * name = "mdrecoveryd";
+ int minor;
+
+ printk(KERN_INFO "md: md driver %d.%d.%d MAX_MD_DEVS=%d, MD_SB_DISKS=%d\n",
+ MD_MAJOR_VERSION, MD_MINOR_VERSION,
+ MD_PATCHLEVEL_VERSION, MAX_MD_DEVS, MD_SB_DISKS);
+
+ if (devfs_register_blkdev (MAJOR_NR, "md", &md_fops))
+ {
+ printk(KERN_ALERT "md: Unable to get major %d for md\n", MAJOR_NR);
+ return (-1);
+ }
+ devfs_handle = devfs_mk_dir (NULL, "md", NULL);
+ /* we don't use devfs_register_series because we want to fill md_hd_struct */
+ for (minor=0; minor < MAX_MD_DEVS; ++minor) {
+ char devname[128];
+ sprintf (devname, "%u", minor);
+ md_hd_struct[minor].de = devfs_register (devfs_handle,
+ devname, DEVFS_FL_DEFAULT, MAJOR_NR, minor,
+ S_IFBLK | S_IRUSR | S_IWUSR, &md_fops, NULL);
+ }
+
+ /* all requests on an uninitialised device get failed... */
+ blk_queue_make_request(BLK_DEFAULT_QUEUE(MAJOR_NR), md_fail_request);
+ blk_dev[MAJOR_NR].queue = md_queue_proc;
+
+
+ read_ahead[MAJOR_NR] = INT_MAX;
+
+ add_gendisk(&md_gendisk);
+
+ md_recovery_thread = md_register_thread(md_do_recovery, NULL, name);
+ if (!md_recovery_thread)
+ printk(KERN_ALERT "md: bug: couldn't allocate md_recovery_thread\n");
+
+ md_register_reboot_notifier(&md_notifier);
+ raid_table_header = register_sysctl_table(raid_root_table, 1);
+
+ md_geninit();
+ return (0);
+}
+
+
+#ifndef MODULE
+
+/*
+ * When md (and any required personalities) are compiled into the kernel
+ * (not a module), arrays can be assembled at boot time using AUTODETECT,
+ * where specially marked partitions are registered with md_autodetect_dev(),
+ * and with MD_BOOT where devices to be collected are given on the boot line
+ * with md=.....
+ * The code for that is here.
+ */
+
+struct {
+ int set;
+ int noautodetect;
+} raid_setup_args md__initdata;
+
+/*
+ * Searches all registered partitions for autorun RAID arrays
+ * at boot time.
+ */
+static kdev_t detected_devices[128];
+static int dev_cnt;
+
+void md_autodetect_dev(kdev_t dev)
+{
+ if (dev_cnt >= 0 && dev_cnt < 127)
+ detected_devices[dev_cnt++] = dev;
+}
+
+
+static void autostart_arrays(void)
+{
+ mdk_rdev_t *rdev;
+ int i;
+
+ printk(KERN_INFO "md: Autodetecting RAID arrays.\n");
+
+ for (i = 0; i < dev_cnt; i++) {
+ kdev_t dev = detected_devices[i];
+
+ if (md_import_device(dev,1)) {
+ printk(KERN_ALERT "md: could not import %s!\n",
+ partition_name(dev));
+ continue;
+ }
+ /*
+ * Sanity checks:
+ */
+ rdev = find_rdev_all(dev);
+ if (!rdev) {
+ MD_BUG();
+ continue;
+ }
+ if (rdev->faulty) {
+ MD_BUG();
+ continue;
+ }
+ md_list_add(&rdev->pending, &pending_raid_disks);
+ }
+ dev_cnt = 0;
+
+ autorun_devices(-1);
+}
+
+static struct {
+ char device_set [MAX_MD_DEVS];
+ int pers[MAX_MD_DEVS];
+ int chunk[MAX_MD_DEVS];
+ char *device_names[MAX_MD_DEVS];
+} md_setup_args md__initdata;
+
+/*
+ * Parse the command-line parameters given to our kernel, but do not
+ * actually try to invoke the MD device now; that is handled by
+ * md_setup_drive after the low-level disk drivers have initialised.
+ *
+ * 27/11/1999: Fixed to work correctly with the 2.3 kernel (which
+ * assigns the task of parsing integer arguments to the
+ * invoked program now). Added ability to initialise all
+ * the MD devices (by specifying multiple "md=" lines)
+ * instead of just one. -- KTK
+ * 18May2000: Added support for persistent-superblock arrays:
+ * md=n,0,factor,fault,device-list uses RAID0 for device n
+ * md=n,-1,factor,fault,device-list uses LINEAR for device n
+ * md=n,device-list reads a RAID superblock from the devices
+ * elements in device-list are read by name_to_kdev_t so can be
+ * a hex number or something like /dev/hda1 /dev/sdb
+ * 2001-06-03: Dave Cinege <dcinege@psychosis.com>
+ * Shifted name_to_kdev_t() and related operations to md_set_drive()
+ * for later execution. Rewrote section to make devfs compatible.
+ */
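+/*
+ * A hypothetical example of the non-persistent form:
+ * md=0,0,4,0,/dev/sda1,/dev/sdb1
+ * assembles md0 as RAID0 over the two listed partitions with a
+ * chunk size of 1 << (4+12) = 64k, while md=1,/dev/sda2,/dev/sdb2
+ * reads RAID superblocks from the devices instead.
+ */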
+static int md__init md_setup(char *str)
+{
+ int minor, level, factor, fault;
+ char *pername = "";
+ char *str1 = str;
+
+ if (get_option(&str, &minor) != 2) { /* MD Number */
+ printk(KERN_WARNING "md: Too few arguments supplied to md=.\n");
+ return 0;
+ }
+ if (minor >= MAX_MD_DEVS) {
+ printk(KERN_WARNING "md: md=%d, Minor device number too high.\n", minor);
+ return 0;
+ } else if (md_setup_args.device_names[minor]) {
+ printk(KERN_WARNING "md: md=%d, Specified more then once. "
+ "Replacing previous definition.\n", minor);
+ }
+ switch (get_option(&str, &level)) { /* RAID Personality */
+ case 2: /* could be 0 or -1.. */
+ if (level == 0 || level == -1) {
+ if (get_option(&str, &factor) != 2 || /* Chunk Size */
+ get_option(&str, &fault) != 2) {
+ printk(KERN_WARNING "md: Too few arguments supplied to md=.\n");
+ return 0;
+ }
+ md_setup_args.pers[minor] = level;
+ md_setup_args.chunk[minor] = 1 << (factor+12);
+ switch(level) {
+ case -1:
+ level = LINEAR;
+ pername = "linear";
+ break;
+ case 0:
+ level = RAID0;
+ pername = "raid0";
+ break;
+ default:
+ printk(KERN_WARNING
+ "md: The kernel has not been configured for raid%d support!\n",
+ level);
+ return 0;
+ }
+ md_setup_args.pers[minor] = level;
+ break;
+ }
+ /* FALL THROUGH */
+ case 1: /* the first device is numeric */
+ str = str1;
+ /* FALL THROUGH */
+ case 0:
+ md_setup_args.pers[minor] = 0;
+ pername="super-block";
+ }
+
+ printk(KERN_INFO "md: Will configure md%d (%s) from %s, below.\n",
+ minor, pername, str);
+ md_setup_args.device_names[minor] = str;
+
+ return 1;
+}
+
+extern kdev_t name_to_kdev_t(char *line) md__init;
+void md__init md_setup_drive(void)
+{
+ int minor, i;
+ kdev_t dev;
+ mddev_t*mddev;
+ kdev_t devices[MD_SB_DISKS+1];
+
+ for (minor = 0; minor < MAX_MD_DEVS; minor++) {
+ int err = 0;
+ char *devname;
+ mdu_disk_info_t dinfo;
+
+ if ((devname = md_setup_args.device_names[minor]) == 0) continue;
+
+ for (i = 0; i < MD_SB_DISKS && devname != 0; i++) {
+
+ char *p;
+ void *handle;
+
+ p = strchr(devname, ',');
+ if (p)
+ *p++ = 0;
+
+ dev = name_to_kdev_t(devname);
+ handle = devfs_find_handle(NULL, devname, MAJOR (dev), MINOR (dev),
+ DEVFS_SPECIAL_BLK, 1);
+ if (handle != 0) {
+ unsigned major, minor;
+ devfs_get_maj_min(handle, &major, &minor);
+ dev = MKDEV(major, minor);
+ }
+ if (dev == 0) {
+ printk(KERN_WARNING "md: Unknown device name: %s\n", devname);
+ break;
+ }
+
+ devices[i] = dev;
+ md_setup_args.device_set[minor] = 1;
+
+ devname = p;
+ }
+ devices[i] = 0;
+
+ if (md_setup_args.device_set[minor] == 0)
+ continue;
+
+ if (mddev_map[minor]) {
+ printk(KERN_WARNING
+ "md: Ignoring md=%d, already autodetected. (Use raid=noautodetect)\n",
+ minor);
+ continue;
+ }
+ printk(KERN_INFO "md: Loading md%d: %s\n", minor, md_setup_args.device_names[minor]);
+
+ mddev = alloc_mddev(MKDEV(MD_MAJOR,minor));
+ if (!mddev) {
+ printk(KERN_ERR "md: kmalloc failed - cannot start array %d\n", minor);
+ continue;
+ }
+ if (md_setup_args.pers[minor]) {
+ /* non-persistent */
+ mdu_array_info_t ainfo;
+ ainfo.level = pers_to_level(md_setup_args.pers[minor]);
+ ainfo.size = 0;
+ ainfo.nr_disks =0;
+ ainfo.raid_disks =0;
+ ainfo.md_minor =minor;
+ ainfo.not_persistent = 1;
+
+ ainfo.state = (1 << MD_SB_CLEAN);
+ ainfo.active_disks = 0;
+ ainfo.working_disks = 0;
+ ainfo.failed_disks = 0;
+ ainfo.spare_disks = 0;
+ ainfo.layout = 0;
+ ainfo.chunk_size = md_setup_args.chunk[minor];
+ err = set_array_info(mddev, &ainfo);
+ for (i = 0; !err && (dev = devices[i]); i++) {
+ dinfo.number = i;
+ dinfo.raid_disk = i;
+ dinfo.state = (1<<MD_DISK_ACTIVE)|(1<<MD_DISK_SYNC);
+ dinfo.major = MAJOR(dev);
+ dinfo.minor = MINOR(dev);
+ mddev->sb->nr_disks++;
+ mddev->sb->raid_disks++;
+ mddev->sb->active_disks++;
+ mddev->sb->working_disks++;
+ err = add_new_disk (mddev, &dinfo);
+ }
+ } else {
+ /* persistent */
+ for (i = 0; (dev = devices[i]); i++) {
+ dinfo.major = MAJOR(dev);
+ dinfo.minor = MINOR(dev);
+ add_new_disk (mddev, &dinfo);
+ }
+ }
+ if (!err)
+ err = do_md_run(mddev);
+ if (err) {
+ mddev->sb_dirty = 0;
+ do_md_stop(mddev, 0);
+ printk(KERN_WARNING "md: starting md%d failed\n", minor);
+ }
+ }
+}
+
+static int md__init raid_setup(char *str)
+{
+ int len, pos;
+
+ len = strlen(str) + 1;
+ pos = 0;
+
+ while (pos < len) {
+ char *comma = strchr(str+pos, ',');
+ int wlen;
+ if (comma)
+ wlen = (comma-str)-pos;
+ else wlen = (len-1)-pos;
+
+ if (strncmp(str, "noautodetect", wlen) == 0)
+ raid_setup_args.noautodetect = 1;
+ pos += wlen+1;
+ }
+ raid_setup_args.set = 1;
+ return 1;
+}
+
+int md__init md_run_setup(void)
+{
+ if (raid_setup_args.noautodetect)
+ printk(KERN_INFO "md: Skipping autodetection of RAID arrays. (raid=noautodetect)\n");
+ else
+ autostart_arrays();
+ md_setup_drive();
+ return 0;
+}
+
+__setup("raid=", raid_setup);
+__setup("md=", md_setup);
+
+__initcall(md_init);
+__initcall(md_run_setup);
+
+#else /* It is a MODULE */
+
+int init_module(void)
+{
+ return md_init();
+}
+
+static void free_device_names(void)
+{
+ while (!list_empty(&device_names)) {
+ struct dname *tmp = list_entry(device_names.next,
+ dev_name_t, list);
+ list_del(&tmp->list);
+ kfree(tmp);
+ }
+}
+
+
+void cleanup_module(void)
+{
+ md_unregister_thread(md_recovery_thread);
+ devfs_unregister(devfs_handle);
+
+ devfs_unregister_blkdev(MAJOR_NR,"md");
+ unregister_reboot_notifier(&md_notifier);
+ unregister_sysctl_table(raid_table_header);
+#ifdef CONFIG_PROC_FS
+ remove_proc_entry("mdstat", NULL);
+#endif
+
+ del_gendisk(&md_gendisk);
+
+ blk_dev[MAJOR_NR].queue = NULL;
+ blksize_size[MAJOR_NR] = NULL;
+ blk_size[MAJOR_NR] = NULL;
+ max_readahead[MAJOR_NR] = NULL;
+ hardsect_size[MAJOR_NR] = NULL;
+
+ free_device_names();
+
+}
+#endif
+
+MD_EXPORT_SYMBOL(md_size);
+MD_EXPORT_SYMBOL(register_md_personality);
+MD_EXPORT_SYMBOL(unregister_md_personality);
+MD_EXPORT_SYMBOL(partition_name);
+MD_EXPORT_SYMBOL(md_error);
+MD_EXPORT_SYMBOL(md_do_sync);
+MD_EXPORT_SYMBOL(md_sync_acct);
+MD_EXPORT_SYMBOL(md_done_sync);
+MD_EXPORT_SYMBOL(md_recover_arrays);
+MD_EXPORT_SYMBOL(md_register_thread);
+MD_EXPORT_SYMBOL(md_unregister_thread);
+MD_EXPORT_SYMBOL(md_update_sb);
+MD_EXPORT_SYMBOL(md_wakeup_thread);
+MD_EXPORT_SYMBOL(md_print_devices);
+MD_EXPORT_SYMBOL(find_rdev_nr);
+MD_EXPORT_SYMBOL(md_interrupt_thread);
+<<<<<<<
+MD_EXPORT_SYMBOL(mddev_map);
+|||||||
+EXPORT_SYMBOL(mddev_map);
+=======
+>>>>>>>
+MODULE_LICENSE("GPL");
diff --git a/tests/linux/md-messy/diff b/tests/linux/md-messy/diff
new file mode 100644
index 0000000..35d56b7
--- /dev/null
+++ b/tests/linux/md-messy/diff
@@ -0,0 +1,93 @@
+@@ -1,90 +1,90 @@
+| return <<<--0-->>><<<++1++>>>;
+|<<<--abort:-->>><<<++}++>>>
+|<<<-- return-->>><<<++
+|#undef++>>> <<<--1;
+|}-->>><<<++OLD_LEVEL++>>>
+
+ static int device_size_calculation(mddev_t * mddev)
+ {
+ int data_disks = 0;
+ unsigned int readahead;
+ struct list_head *tmp;
+ mdk_rdev_t *rdev;
+
+ /*
+ * Do device size calculation. Bail out if too small.
+ * (we have to do this after having validated chunk_size,
+ * because device size has to be modulo chunk_size)
+ */
+
+ ITERATE_RDEV(mddev,rdev,tmp) {
+ if (rdev->faulty)
+ continue;
+ if (rdev->size < mddev->chunk_size / 1024) {
+ printk(KERN_WARNING
+ "md: Dev %s smaller than chunk_size:"
+ " %lluk < %dk\n",
+ bdev_partition_name(rdev->bdev),
+ (unsigned long long)rdev->size,
+ mddev->chunk_size / 1024);
+ return -EINVAL;
+ }
+ }
+
+ switch (mddev->level) {
+ case LEVEL_MULTIPATH:
+ data_disks = 1;
+ break;
+ case -3:
+ data_disks = 1;
+ break;
+ case -2:
+ data_disks = 1;
+ break;
+ case LEVEL_LINEAR:
+ zoned_raid_size(mddev);
+ data_disks = 1;
+ break;
+ case 0:
+ zoned_raid_size(mddev);
+ data_disks = mddev->raid_disks;
+ break;
+ case 1:
+ data_disks = 1;
+ break;
+ case 4:
+ case 5:
+ data_disks = mddev->raid_disks-1;
+ break;
+ default:
+ printk(KERN_ERR "md: md%d: unsupported raid level %d\n",
+ mdidx(mddev), mddev->level);
+ goto abort;
+ }
+ if (!md_size[mdidx(mddev)])
+ md_size[mdidx(mddev)] = mddev->size * data_disks;
+
+ readahead = (VM_MAX_READAHEAD * 1024) / PAGE_SIZE;
+ if (!mddev->level || (mddev->level == 4) || (mddev->level == 5)) {
+ readahead = (mddev->chunk_size>>PAGE_SHIFT) * 4 * data_disks;
+ if (readahead < data_disks * (MAX_SECTORS>>(PAGE_SHIFT-9))*2)
+ readahead = data_disks * (MAX_SECTORS>>(PAGE_SHIFT-9))*2;
+ } else {
+ // (no multipath branch - it uses the default setting)
+ if (mddev->level == -3)
+ readahead = 0;
+ }
+
+ printk(KERN_INFO "md%d: max total readahead window set to %ldk\n",
+ mdidx(mddev), readahead*(PAGE_SIZE/1024));
+
+ printk(KERN_INFO
+ "md%d: %d data-disks, max readahead per data-disk: %ldk\n",
+ mdidx(mddev), data_disks, readahead/data_disks*(PAGE_SIZE/1024));
+ return 0;
+ abort:
+ return 1;
+ }
+
+ static struct gendisk *md_probe(dev_t dev, int *part, void *data)
+ {
+ static DECLARE_MUTEX(disks_sem);
+|<<<-- -->>> \ No newline at end of file
diff --git a/tests/linux/md-messy/new b/tests/linux/md-messy/new
new file mode 100644
index 0000000..c9b96f5
--- /dev/null
+++ b/tests/linux/md-messy/new
@@ -0,0 +1,90 @@
+ return 1;
+}
+
+#undef OLD_LEVEL
+
+static int device_size_calculation(mddev_t * mddev)
+{
+ int data_disks = 0;
+ unsigned int readahead;
+ struct list_head *tmp;
+ mdk_rdev_t *rdev;
+
+ /*
+ * Do device size calculation. Bail out if too small.
+ * (we have to do this after having validated chunk_size,
+ * because device size has to be modulo chunk_size)
+ */
+
+ ITERATE_RDEV(mddev,rdev,tmp) {
+ if (rdev->faulty)
+ continue;
+ if (rdev->size < mddev->chunk_size / 1024) {
+ printk(KERN_WARNING
+ "md: Dev %s smaller than chunk_size:"
+ " %lluk < %dk\n",
+ bdev_partition_name(rdev->bdev),
+ (unsigned long long)rdev->size,
+ mddev->chunk_size / 1024);
+ return -EINVAL;
+ }
+ }
+
+ switch (mddev->level) {
+ case LEVEL_MULTIPATH:
+ data_disks = 1;
+ break;
+ case -3:
+ data_disks = 1;
+ break;
+ case -2:
+ data_disks = 1;
+ break;
+ case LEVEL_LINEAR:
+ zoned_raid_size(mddev);
+ data_disks = 1;
+ break;
+ case 0:
+ zoned_raid_size(mddev);
+ data_disks = mddev->raid_disks;
+ break;
+ case 1:
+ data_disks = 1;
+ break;
+ case 4:
+ case 5:
+ data_disks = mddev->raid_disks-1;
+ break;
+ default:
+ printk(KERN_ERR "md: md%d: unsupported raid level %d\n",
+ mdidx(mddev), mddev->level);
+ goto abort;
+ }
+ if (!md_size[mdidx(mddev)])
+ md_size[mdidx(mddev)] = mddev->size * data_disks;
+
+ readahead = (VM_MAX_READAHEAD * 1024) / PAGE_SIZE;
+ if (!mddev->level || (mddev->level == 4) || (mddev->level == 5)) {
+ readahead = (mddev->chunk_size>>PAGE_SHIFT) * 4 * data_disks;
+ if (readahead < data_disks * (MAX_SECTORS>>(PAGE_SHIFT-9))*2)
+ readahead = data_disks * (MAX_SECTORS>>(PAGE_SHIFT-9))*2;
+ } else {
+ // (no multipath branch - it uses the default setting)
+ if (mddev->level == -3)
+ readahead = 0;
+ }
+
+ printk(KERN_INFO "md%d: max total readahead window set to %ldk\n",
+ mdidx(mddev), readahead*(PAGE_SIZE/1024));
+
+ printk(KERN_INFO
+ "md%d: %d data-disks, max readahead per data-disk: %ldk\n",
+ mdidx(mddev), data_disks, readahead/data_disks*(PAGE_SIZE/1024));
+ return 0;
+abort:
+ return 1;
+}
+
+static struct gendisk *md_probe(dev_t dev, int *part, void *data)
+{
+ static DECLARE_MUTEX(disks_sem);
diff --git a/tests/linux/md-messy/orig b/tests/linux/md-messy/orig
new file mode 100644
index 0000000..252f9de
--- /dev/null
+++ b/tests/linux/md-messy/orig
@@ -0,0 +1,91 @@
+ return 0;
+abort:
+ return 1;
+}
+
+static int device_size_calculation(mddev_t * mddev)
+{
+ int data_disks = 0;
+ unsigned int readahead;
+ struct list_head *tmp;
+ mdk_rdev_t *rdev;
+
+ /*
+ * Do device size calculation. Bail out if too small.
+ * (we have to do this after having validated chunk_size,
+ * because device size has to be modulo chunk_size)
+ */
+
+ ITERATE_RDEV(mddev,rdev,tmp) {
+ if (rdev->faulty)
+ continue;
+ if (rdev->size < mddev->chunk_size / 1024) {
+ printk(KERN_WARNING
+ "md: Dev %s smaller than chunk_size:"
+ " %lluk < %dk\n",
+ bdev_partition_name(rdev->bdev),
+ (unsigned long long)rdev->size,
+ mddev->chunk_size / 1024);
+ return -EINVAL;
+ }
+ }
+
+ switch (mddev->level) {
+ case LEVEL_MULTIPATH:
+ data_disks = 1;
+ break;
+ case -3:
+ data_disks = 1;
+ break;
+ case -2:
+ data_disks = 1;
+ break;
+ case LEVEL_LINEAR:
+ zoned_raid_size(mddev);
+ data_disks = 1;
+ break;
+ case 0:
+ zoned_raid_size(mddev);
+ data_disks = mddev->raid_disks;
+ break;
+ case 1:
+ data_disks = 1;
+ break;
+ case 4:
+ case 5:
+ data_disks = mddev->raid_disks-1;
+ break;
+ default:
+ printk(KERN_ERR "md: md%d: unsupported raid level %d\n",
+ mdidx(mddev), mddev->level);
+ goto abort;
+ }
+ if (!md_size[mdidx(mddev)])
+ md_size[mdidx(mddev)] = mddev->size * data_disks;
+
+ readahead = (VM_MAX_READAHEAD * 1024) / PAGE_SIZE;
+ if (!mddev->level || (mddev->level == 4) || (mddev->level == 5)) {
+ readahead = (mddev->chunk_size>>PAGE_SHIFT) * 4 * data_disks;
+ if (readahead < data_disks * (MAX_SECTORS>>(PAGE_SHIFT-9))*2)
+ readahead = data_disks * (MAX_SECTORS>>(PAGE_SHIFT-9))*2;
+ } else {
+ // (no multipath branch - it uses the default setting)
+ if (mddev->level == -3)
+ readahead = 0;
+ }
+
+ printk(KERN_INFO "md%d: max total readahead window set to %ldk\n",
+ mdidx(mddev), readahead*(PAGE_SIZE/1024));
+
+ printk(KERN_INFO
+ "md%d: %d data-disks, max readahead per data-disk: %ldk\n",
+ mdidx(mddev), data_disks, readahead/data_disks*(PAGE_SIZE/1024));
+ return 0;
+abort:
+ return 1;
+}
+
+static struct gendisk *md_probe(dev_t dev, int *part, void *data)
+{
+ static DECLARE_MUTEX(disks_sem);
+ \ No newline at end of file
diff --git a/tests/linux/md-resync/merge b/tests/linux/md-resync/merge
new file mode 100644
index 0000000..fd18d0f
--- /dev/null
+++ b/tests/linux/md-resync/merge
@@ -0,0 +1,1911 @@
+/*
+ * raid1.c : Multiple Devices driver for Linux
+ *
+ * Copyright (C) 1999, 2000 Ingo Molnar, Red Hat
+ *
+ * Copyright (C) 1996, 1997, 1998 Ingo Molnar, Miguel de Icaza, Gadi Oxman
+ *
+ * RAID-1 management functions.
+ *
+ * Better read-balancing code written by Mika Kuoppala <miku@iki.fi>, 2000
+ *
+ * Fixes to reconstruction by Jakob Østergaard <jakob@ostenfeld.dk>
+ * Various fixes by Neil Brown <neilb@cse.unsw.edu.au>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ *
+ * You should have received a copy of the GNU General Public License
+ * (for example /usr/src/linux/COPYING); if not, write to the Free
+ * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include <linux/module.h>
+#include <linux/config.h>
+#include <linux/slab.h>
+#include <linux/raid/raid1.h>
+#include <asm/atomic.h>
+
+#define MAJOR_NR MD_MAJOR
+#define MD_DRIVER
+#define MD_PERSONALITY
+
+#define MAX_WORK_PER_DISK 128
+
+#define NR_RESERVED_BUFS 32
+
+
+/*
+ * The following can be used to debug the driver
+ */
+#define RAID1_DEBUG 0
+
+#if RAID1_DEBUG
+#define PRINTK(x...) printk(x)
+#define inline
+#define __inline__
+#else
+#define PRINTK(x...) do { } while (0)
+#endif
+
+
+static mdk_personality_t raid1_personality;
+static md_spinlock_t retry_list_lock = MD_SPIN_LOCK_UNLOCKED;
+struct raid1_bh *raid1_retry_list = NULL, **raid1_retry_tail;
+
+static struct buffer_head *raid1_alloc_bh(raid1_conf_t *conf, int cnt)
+{
+ /* return a linked list of "cnt" struct buffer_heads.
+ * don't take any off the free list unless we know we can
+ * get all we need, otherwise we could deadlock
+ */
+ struct buffer_head *bh=NULL;
+
+ while(cnt) {
+ struct buffer_head *t;
+ md_spin_lock_irq(&conf->device_lock);
+ if (!conf->freebh_blocked && conf->freebh_cnt >= cnt)
+ while (cnt) {
+ t = conf->freebh;
+ conf->freebh = t->b_next;
+ t->b_next = bh;
+ bh = t;
+ t->b_state = 0;
+ conf->freebh_cnt--;
+ cnt--;
+ }
+ md_spin_unlock_irq(&conf->device_lock);
+ if (cnt == 0)
+ break;
+ t = kmem_cache_alloc(bh_cachep, SLAB_NOIO);
+ if (t) {
+ t->b_next = bh;
+ bh = t;
+ cnt--;
+ } else {
+ PRINTK("raid1: waiting for %d bh\n", cnt);
+ conf->freebh_blocked = 1;
+ wait_disk_event(conf->wait_buffer,
+ !conf->freebh_blocked ||
+ conf->freebh_cnt > conf->raid_disks * NR_RESERVED_BUFS/2);
+ conf->freebh_blocked = 0;
+ }
+ }
+ return bh;
+}
+
+static inline void raid1_free_bh(raid1_conf_t *conf, struct buffer_head *bh)
+{
+ unsigned long flags;
+ spin_lock_irqsave(&conf->device_lock, flags);
+ while (bh) {
+ struct buffer_head *t = bh;
+ bh=bh->b_next;
+ if (t->b_pprev == NULL)
+ kmem_cache_free(bh_cachep, t);
+ else {
+ t->b_next= conf->freebh;
+ conf->freebh = t;
+ conf->freebh_cnt++;
+ }
+ }
+ spin_unlock_irqrestore(&conf->device_lock, flags);
+ wake_up(&conf->wait_buffer);
+}
+
+static int raid1_grow_bh(raid1_conf_t *conf, int cnt)
+{
+ /* allocate cnt buffer_heads, possibly less if kmalloc fails */
+ int i = 0;
+
+ while (i < cnt) {
+ struct buffer_head *bh;
+ bh = kmem_cache_alloc(bh_cachep, SLAB_KERNEL);
+ if (!bh) break;
+
+ md_spin_lock_irq(&conf->device_lock);
+ bh->b_pprev = &conf->freebh;
+ bh->b_next = conf->freebh;
+ conf->freebh = bh;
+ conf->freebh_cnt++;
+ md_spin_unlock_irq(&conf->device_lock);
+
+ i++;
+ }
+ return i;
+}
+
+static void raid1_shrink_bh(raid1_conf_t *conf)
+{
+ /* discard all buffer_heads */
+
+ md_spin_lock_irq(&conf->device_lock);
+ while (conf->freebh) {
+ struct buffer_head *bh = conf->freebh;
+ conf->freebh = bh->b_next;
+ kmem_cache_free(bh_cachep, bh);
+ conf->freebh_cnt--;
+ }
+ md_spin_unlock_irq(&conf->device_lock);
+}
+
+
+static struct raid1_bh *raid1_alloc_r1bh(raid1_conf_t *conf)
+{
+ struct raid1_bh *r1_bh = NULL;
+
+ do {
+ md_spin_lock_irq(&conf->device_lock);
+ if (!conf->freer1_blocked && conf->freer1) {
+ r1_bh = conf->freer1;
+ conf->freer1 = r1_bh->next_r1;
+ conf->freer1_cnt--;
+ r1_bh->next_r1 = NULL;
+ r1_bh->state = (1 << R1BH_PreAlloc);
+ r1_bh->bh_req.b_state = 0;
+ }
+ md_spin_unlock_irq(&conf->device_lock);
+ if (r1_bh)
+ return r1_bh;
+ r1_bh = (struct raid1_bh *) kmalloc(sizeof(struct raid1_bh), GFP_NOIO);
+ if (r1_bh) {
+ memset(r1_bh, 0, sizeof(*r1_bh));
+ return r1_bh;
+ }
+ conf->freer1_blocked = 1;
+ wait_disk_event(conf->wait_buffer,
+ !conf->freer1_blocked ||
+ conf->freer1_cnt > NR_RESERVED_BUFS/2
+ );
+ conf->freer1_blocked = 0;
+ } while (1);
+}
+
+static inline void raid1_free_r1bh(struct raid1_bh *r1_bh)
+{
+ struct buffer_head *bh = r1_bh->mirror_bh_list;
+ raid1_conf_t *conf = mddev_to_conf(r1_bh->mddev);
+
+ r1_bh->mirror_bh_list = NULL;
+
+ if (test_bit(R1BH_PreAlloc, &r1_bh->state)) {
+ unsigned long flags;
+ spin_lock_irqsave(&conf->device_lock, flags);
+ r1_bh->next_r1 = conf->freer1;
+ conf->freer1 = r1_bh;
+ conf->freer1_cnt++;
+ spin_unlock_irqrestore(&conf->device_lock, flags);
+ /* don't need to wakeup wait_buffer because
+ * raid1_free_bh below will do that
+ */
+ } else {
+ kfree(r1_bh);
+ }
+ raid1_free_bh(conf, bh);
+}
+
+static int raid1_grow_r1bh (raid1_conf_t *conf, int cnt)
+{
+ int i = 0;
+
+ while (i < cnt) {
+ struct raid1_bh *r1_bh;
+ r1_bh = (struct raid1_bh*)kmalloc(sizeof(*r1_bh), GFP_KERNEL);
+ if (!r1_bh)
+ break;
+ memset(r1_bh, 0, sizeof(*r1_bh));
+ set_bit(R1BH_PreAlloc, &r1_bh->state);
+ r1_bh->mddev = conf->mddev;
+
+ raid1_free_r1bh(r1_bh);
+ i++;
+ }
+ return i;
+}
+
+static void raid1_shrink_r1bh(raid1_conf_t *conf)
+{
+ md_spin_lock_irq(&conf->device_lock);
+ while (conf->freer1) {
+ struct raid1_bh *r1_bh = conf->freer1;
+ conf->freer1 = r1_bh->next_r1;
+ conf->freer1_cnt--;
+ kfree(r1_bh);
+ }
+ md_spin_unlock_irq(&conf->device_lock);
+}
+
+
+
+static inline void raid1_free_buf(struct raid1_bh *r1_bh)
+{
+ unsigned long flags;
+ struct buffer_head *bh = r1_bh->mirror_bh_list;
+ raid1_conf_t *conf = mddev_to_conf(r1_bh->mddev);
+ r1_bh->mirror_bh_list = NULL;
+
+ spin_lock_irqsave(&conf->device_lock, flags);
+ r1_bh->next_r1 = conf->freebuf;
+ conf->freebuf = r1_bh;
+ spin_unlock_irqrestore(&conf->device_lock, flags);
+ raid1_free_bh(conf, bh);
+}
+
+static struct raid1_bh *raid1_alloc_buf(raid1_conf_t *conf)
+{
+ struct raid1_bh *r1_bh;
+
+ md_spin_lock_irq(&conf->device_lock);
+ wait_event_lock_irq(conf->wait_buffer, conf->freebuf, conf->device_lock);
+ r1_bh = conf->freebuf;
+ conf->freebuf = r1_bh->next_r1;
+ r1_bh->next_r1= NULL;
+ md_spin_unlock_irq(&conf->device_lock);
+
+ return r1_bh;
+}
+
+static int raid1_grow_buffers (raid1_conf_t *conf, int cnt)
+{
+ int i = 0;
+ struct raid1_bh *head = NULL, **tail;
+ tail = &head;
+
+ while (i < cnt) {
+ struct raid1_bh *r1_bh;
+ struct page *page;
+
+ page = alloc_page(GFP_KERNEL);
+ if (!page)
+ break;
+
+ r1_bh = (struct raid1_bh *) kmalloc(sizeof(*r1_bh), GFP_KERNEL);
+ if (!r1_bh) {
+ __free_page(page);
+ break;
+ }
+ memset(r1_bh, 0, sizeof(*r1_bh));
+ r1_bh->bh_req.b_page = page;
+ r1_bh->bh_req.b_data = page_address(page);
+ *tail = r1_bh;
+ r1_bh->next_r1 = NULL;
+ tail = & r1_bh->next_r1;
+ i++;
+ }
+ /* this lock probably isn't needed, as at the time when
+ * we are allocating buffers, nobody else will be touching the
+ * freebuf list. But it doesn't hurt....
+ */
+ md_spin_lock_irq(&conf->device_lock);
+ *tail = conf->freebuf;
+ conf->freebuf = head;
+ md_spin_unlock_irq(&conf->device_lock);
+ return i;
+}
+
+static void raid1_shrink_buffers (raid1_conf_t *conf)
+{
+ struct raid1_bh *head;
+ md_spin_lock_irq(&conf->device_lock);
+ head = conf->freebuf;
+ conf->freebuf = NULL;
+ md_spin_unlock_irq(&conf->device_lock);
+
+ while (head) {
+ struct raid1_bh *r1_bh = head;
+ head = r1_bh->next_r1;
+ __free_page(r1_bh->bh_req.b_page);
+ kfree(r1_bh);
+ }
+}
+
+static int raid1_map (mddev_t *mddev, kdev_t *rdev)
+{
+ raid1_conf_t *conf = mddev_to_conf(mddev);
+ int i, disks = MD_SB_DISKS;
+
+ /*
+ * Later we do read balancing on the read side
+ * now we use the first available disk.
+ */
+
+ for (i = 0; i < disks; i++) {
+ if (conf->mirrors[i].operational) {
+ *rdev = conf->mirrors[i].dev;
+ return (0);
+ }
+ }
+
+ printk (KERN_ERR "raid1_map(): huh, no more operational devices?\n");
+ return (-1);
+}
+
+static void raid1_reschedule_retry (struct raid1_bh *r1_bh)
+{
+ unsigned long flags;
+ mddev_t *mddev = r1_bh->mddev;
+ raid1_conf_t *conf = mddev_to_conf(mddev);
+
+ md_spin_lock_irqsave(&retry_list_lock, flags);
+ if (raid1_retry_list == NULL)
+ raid1_retry_tail = &raid1_retry_list;
+ *raid1_retry_tail = r1_bh;
+ raid1_retry_tail = &r1_bh->next_r1;
+ r1_bh->next_r1 = NULL;
+ md_spin_unlock_irqrestore(&retry_list_lock, flags);
+ md_wakeup_thread(conf->thread);
+}
+
+
+static void inline io_request_done(unsigned long sector, raid1_conf_t *conf, int phase)
+{
+ unsigned long flags;
+ spin_lock_irqsave(&conf->segment_lock, flags);
+ if (sector < conf->start_active)
+ conf->cnt_done--;
+ else if (sector >= conf->start_future && conf->phase == phase)
+ conf->cnt_future--;
+ else if (!--conf->cnt_pending)
+ wake_up(&conf->wait_ready);
+
+ spin_unlock_irqrestore(&conf->segment_lock, flags);
+}
+
+static void inline sync_request_done (unsigned long sector, raid1_conf_t *conf)
+{
+ unsigned long flags;
+ spin_lock_irqsave(&conf->segment_lock, flags);
+ if (sector >= conf->start_ready)
+ --conf->cnt_ready;
+ else if (sector >= conf->start_active) {
+ if (!--conf->cnt_active) {
+ conf->start_active = conf->start_ready;
+ wake_up(&conf->wait_done);
+ }
+ }
+ spin_unlock_irqrestore(&conf->segment_lock, flags);
+}
+
+/*
+ * raid1_end_bh_io() is called when we have finished servicing a mirrored
+ * operation and are ready to return a success/failure code to the buffer
+ * cache layer.
+ */
+static void raid1_end_bh_io (struct raid1_bh *r1_bh, int uptodate)
+{
+ struct buffer_head *bh = r1_bh->master_bh;
+
+ io_request_done(bh->b_rsector, mddev_to_conf(r1_bh->mddev),
+ test_bit(R1BH_SyncPhase, &r1_bh->state));
+
+ bh->b_end_io(bh, uptodate);
+ raid1_free_r1bh(r1_bh);
+}
+void raid1_end_request (struct buffer_head *bh, int uptodate)
+{
+ struct raid1_bh * r1_bh = (struct raid1_bh *)(bh->b_private);
+
+ /*
+ * this branch is our 'one mirror IO has finished' event handler:
+ */
+ if (!uptodate)
+ md_error (r1_bh->mddev, bh->b_dev);
+ else
+ /*
+ * Set R1BH_Uptodate in our master buffer_head, so that
+ * we will return a good error code to the higher
+ * levels even if IO on some other mirrored buffer fails.
+ *
+ * The 'master' represents the complex operation to
+ * user-side. So if something waits for IO, then it will
+ * wait for the 'master' buffer_head.
+ */
+ set_bit (R1BH_Uptodate, &r1_bh->state);
+
+ /*
+ * We split up the read and write side, imho they are
+ * conceptually different.
+ */
+
+ if ( (r1_bh->cmd == READ) || (r1_bh->cmd == READA) ) {
+ /*
+ * we have only one buffer_head on the read side
+ */
+
+ if (uptodate) {
+ raid1_end_bh_io(r1_bh, uptodate);
+ return;
+ }
+ /*
+ * oops, read error:
+ */
+ printk(KERN_ERR "raid1: %s: rescheduling block %lu\n",
+ partition_name(bh->b_dev), bh->b_blocknr);
+ raid1_reschedule_retry(r1_bh);
+ return;
+ }
+
+ /*
+ * WRITE:
+ *
+ * Let's see if all mirrored write operations have finished
+ * already.
+ */
+
+ if (atomic_dec_and_test(&r1_bh->remaining))
+ raid1_end_bh_io(r1_bh, test_bit(R1BH_Uptodate, &r1_bh->state));
+}
+
+/*
+ * This routine returns the disk from which the requested read should
+ * be done. It keeps track of the last read position for every disk
+ * in the array, and when a new read request comes in, the disk whose
+ * last position is nearest to the request is chosen.
+ *
+ * TODO: now if there are 2 mirrors in the same 2 devices, performance
+ * degrades dramatically because position is mirror, not device based.
+ * This should be changed to be device based. Also atomic sequential
+ * reads should be somehow balanced.
+ */
+
+static int raid1_read_balance (raid1_conf_t *conf, struct buffer_head *bh)
+{
+ int new_disk = conf->last_used;
+ const int sectors = bh->b_size >> 9;
+ const unsigned long this_sector = bh->b_rsector;
+ int disk = new_disk;
+ unsigned long new_distance;
+ unsigned long current_distance;
+
+ /*
+ * Check if it is sane at all to balance
+ */
+
+ if (!conf->mddev->in_sync)
+ goto rb_out;
+
+
+ /* make sure that disk is operational */
+ while( !conf->mirrors[new_disk].operational) {
+ if (new_disk <= 0) new_disk = conf->raid_disks;
+ new_disk--;
+ if (new_disk == disk) {
+ /*
+ * This means no working disk was found
+ * Nothing much to do, let's not change anything
+ * and hope for the best...
+ */
+
+ new_disk = conf->last_used;
+
+ goto rb_out;
+ }
+ }
+ disk = new_disk;
+ /* now disk == new_disk == starting point for search */
+
+ /*
+ * Don't touch anything for sequential reads.
+ */
+
+ if (this_sector == conf->mirrors[new_disk].head_position)
+ goto rb_out;
+
+ /*
+ * If reads have been done only on a single disk
+ * for a time, let's give another disk a chance.
+ * This is for kicking those idling disks so that
+ * they would find work near some hotspot.
+ */
+
+ if (conf->sect_count >= conf->mirrors[new_disk].sect_limit) {
+ conf->sect_count = 0;
+
+#if defined(CONFIG_SPARC64) && (__GNUC__ == 2) && (__GNUC_MINOR__ == 92)
+ /* Work around a compiler bug in egcs-2.92.11 19980921 */
+ new_disk = *(volatile int *)&new_disk;
+#endif
+ do {
+ if (new_disk<=0)
+ new_disk = conf->raid_disks;
+ new_disk--;
+ if (new_disk == disk)
+ break;
+ } while ((conf->mirrors[new_disk].write_only) ||
+ (!conf->mirrors[new_disk].operational));
+
+ goto rb_out;
+ }
+
+ current_distance = abs(this_sector -
+ conf->mirrors[disk].head_position);
+
+ /* Find the disk which is closest */
+
+ do {
+ if (disk <= 0)
+ disk = conf->raid_disks;
+ disk--;
+
+ if ((conf->mirrors[disk].write_only) ||
+ (!conf->mirrors[disk].operational))
+ continue;
+
+ new_distance = abs(this_sector -
+ conf->mirrors[disk].head_position);
+
+ if (new_distance < current_distance) {
+ conf->sect_count = 0;
+ current_distance = new_distance;
+ new_disk = disk;
+ }
+ } while (disk != conf->last_used);
+
+rb_out:
+ conf->mirrors[new_disk].head_position = this_sector + sectors;
+
+ conf->last_used = new_disk;
+ conf->sect_count += sectors;
+
+ return new_disk;
+}
+
+static int raid1_make_request (request_queue_t *q,
+ struct buffer_head * bh)
+{
+ mddev_t *mddev = q->queuedata;
+ raid1_conf_t *conf = mddev_to_conf(mddev);
+ struct buffer_head *bh_req, *bhl;
+ struct raid1_bh * r1_bh;
+ int disks = MD_SB_DISKS;
+ int i, sum_bhs = 0;
+ struct mirror_info *mirror;
+
+ if (!buffer_locked(bh))
+ BUG();
+
+/*
+ * make_request() can abort the operation when READA is being
+ * used and no empty request is available.
+ *
+ * Currently, just replace the command with READ/WRITE.
+ */
+ r1_bh = raid1_alloc_r1bh (conf);
+
+ spin_lock_irq(&conf->segment_lock);
+ wait_event_lock_irq(conf->wait_done,
+ bh->b_rsector < conf->start_active ||
+ bh->b_rsector >= conf->start_future,
+ conf->segment_lock);
+ if (bh->b_rsector < conf->start_active)
+ conf->cnt_done++;
+ else {
+ conf->cnt_future++;
+ if (conf->phase)
+ set_bit(R1BH_SyncPhase, &r1_bh->state);
+ }
+ spin_unlock_irq(&conf->segment_lock);
+
+ /*
+ * I think the read and write branches should be separated completely,
+ * since we want to do read balancing on the read side for example.
+ * Alternative implementations? :) --mingo
+ */
+
+ r1_bh->master_bh = bh;
+ r1_bh->mddev = mddev;
+ r1_bh->cmd = rw;
+
+ if (rw == READ) {
+ /*
+ * read balancing logic:
+ */
+ mirror = conf->mirrors + raid1_read_balance(conf, bh);
+
+ bh_req = &r1_bh->bh_req;
+ memcpy(bh_req, bh, sizeof(*bh));
+ bh_req->b_blocknr = bh->b_rsector;
+ bh_req->b_dev = mirror->dev;
+ bh_req->b_rdev = mirror->dev;
+ /* bh_req->b_rsector = bh->n_rsector; */
+ bh_req->b_end_io = raid1_end_request;
+ bh_req->b_private = r1_bh;
+ generic_make_request (rw, bh_req);
+ return 0;
+ }
+
+ /*
+ * WRITE:
+ */
+
+ bhl = raid1_alloc_bh(conf, conf->raid_disks);
+ for (i = 0; i < disks; i++) {
+ struct buffer_head *mbh;
+ if (!conf->mirrors[i].operational)
+ continue;
+
+ /*
+ * We should use a private pool (size depending on NR_REQUEST),
+ * to avoid writes filling up the memory with bhs
+ *
+ * Such pools are much faster than kmalloc anyway (so we waste
+ * almost nothing by not using the master bh when writing and
+ * win a lot of cleanness) but for now we are cool enough. --mingo
+ *
+ * It's safe to sleep here, buffer heads cannot be used in a shared
+ * manner in the write branch. Look how we lock the buffer at the
+ * beginning of this function to grok the difference ;)
+ */
+ mbh = bhl;
+ if (mbh == NULL) {
+ MD_BUG();
+ break;
+ }
+ bhl = mbh->b_next;
+ mbh->b_next = NULL;
+ mbh->b_this_page = (struct buffer_head *)1;
+
+ /*
+ * prepare mirrored mbh (fields ordered for max mem throughput):
+ */
+ mbh->b_blocknr = bh->b_rsector;
+ mbh->b_dev = conf->mirrors[i].dev;
+ mbh->b_rdev = conf->mirrors[i].dev;
+ mbh->b_rsector = bh->b_rsector;
+ mbh->b_state = (1<<BH_Req) | (1<<BH_Dirty) |
+ (1<<BH_Mapped) | (1<<BH_Lock);
+
+ atomic_set(&mbh->b_count, 1);
+ mbh->b_size = bh->b_size;
+ mbh->b_page = bh->b_page;
+ mbh->b_data = bh->b_data;
+ mbh->b_list = BUF_LOCKED;
+ mbh->b_end_io = raid1_end_request;
+ mbh->b_private = r1_bh;
+
+ mbh->b_next = r1_bh->mirror_bh_list;
+ r1_bh->mirror_bh_list = mbh;
+ sum_bhs++;
+ }
+ if (bhl) raid1_free_bh(conf,bhl);
+ if (!sum_bhs) {
+ /* Gag - all mirrors non-operational.. */
+ raid1_end_bh_io(r1_bh, 0);
+ return 0;
+ }
+ md_atomic_set(&r1_bh->remaining, sum_bhs);
+
+ /*
+ * We have to be a bit careful about the semaphore above, that's
+ * why we start the requests separately. Since kmalloc() could
+ * fail or sleep, and make_request() can sleep too, this is the
+ * safer solution. Imagine end_request decreasing the semaphore
+ * before we could have set it up ... We could play tricks with
+ * the semaphore (presetting it and correcting at the end if
+ * sum_bhs is not 'n', but we have to do end_request by hand if
+ * all requests finish before we had a chance to set up the
+ * semaphore correctly ... lots of races).
+ */
+ bh = r1_bh->mirror_bh_list;
+ while(bh) {
+ struct buffer_head *bh2 = bh;
+ bh = bh->b_next;
+ generic_make_request(rw, bh2);
+ }
+ return (0);
+}
+
+static void raid1_status(struct seq_file *seq, mddev_t *mddev)
+{
+ raid1_conf_t *conf = mddev_to_conf(mddev);
+ int i;
+
+ seq_printf(seq, " [%d/%d] [", conf->raid_disks,
+ conf->working_disks);
+ for (i = 0; i < conf->raid_disks; i++)
+ seq_printf(seq, "%s",
+ conf->mirrors[i].operational ? "U" : "_");
+ seq_printf(seq, "]");
+}
+
+#define LAST_DISK KERN_ALERT \
+"raid1: only one disk left and IO error.\n"
+
+#define NO_SPARE_DISK KERN_ALERT \
+"raid1: no spare disk left, degrading mirror level by one.\n"
+
+#define DISK_FAILED KERN_ALERT \
+"raid1: Disk failure on %s, disabling device. \n" \
+" Operation continuing on %d devices\n"
+
+#define START_SYNCING KERN_ALERT \
+"raid1: start syncing spare disk.\n"
+
+#define ALREADY_SYNCING KERN_INFO \
+"raid1: syncing already in progress.\n"
+
+static void mark_disk_bad (mddev_t *mddev, int failed)
+{
+ raid1_conf_t *conf = mddev_to_conf(mddev);
+ struct mirror_info *mirror = conf->mirrors+failed;
+ mdp_super_t *sb = mddev->sb;
+
+ mirror->operational = 0;
+ mark_disk_faulty(sb->disks+mirror->number);
+ mark_disk_nonsync(sb->disks+mirror->number);
+ mark_disk_inactive(sb->disks+mirror->number);
+ if (!mirror->write_only)
+ sb->active_disks--;
+ sb->working_disks--;
+ sb->failed_disks++;
+ mddev->sb_dirty = 1;
+ md_wakeup_thread(conf->thread);
+ if (!mirror->write_only)
+ conf->working_disks--;
+ printk (DISK_FAILED, partition_name (mirror->dev),
+ conf->working_disks);
+}
+
+static int raid1_error (mddev_t *mddev, kdev_t dev)
+{
+ raid1_conf_t *conf = mddev_to_conf(mddev);
+ struct mirror_info * mirrors = conf->mirrors;
+ int disks = MD_SB_DISKS;
+ int i;
+
+ /* Find the drive.
+ * If it is not operational, then we have already marked it as dead
+ * else if it is the last working disks, ignore the error, let the
+ * next level up know.
+ * else mark the drive as failed
+ */
+
+ for (i = 0; i < disks; i++)
+ if (mirrors[i].dev==dev && mirrors[i].operational)
+ break;
+ if (i == disks)
+ return 0;
+
+ if (i < conf->raid_disks && conf->working_disks == 1) {
+ /* Don't fail the drive, act as though we were just a
+ * normal single drive
+ */
+
+ return 1;
+ }
+ mark_disk_bad(mddev, i);
+ return 0;
+}
+
+#undef LAST_DISK
+#undef NO_SPARE_DISK
+#undef DISK_FAILED
+#undef START_SYNCING
+
+
+static void print_raid1_conf (raid1_conf_t *conf)
+{
+ int i;
+ struct mirror_info *tmp;
+
+ printk("RAID1 conf printout:\n");
+ if (!conf) {
+ printk("(conf==NULL)\n");
+ return;
+ }
+ printk(" --- wd:%d rd:%d nd:%d\n", conf->working_disks,
+ conf->raid_disks, conf->nr_disks);
+
+ for (i = 0; i < MD_SB_DISKS; i++) {
+ tmp = conf->mirrors + i;
+ printk(" disk %d, s:%d, o:%d, n:%d rd:%d us:%d dev:%s\n",
+ i, tmp->spare,tmp->operational,
+ tmp->number,tmp->raid_disk,tmp->used_slot,
+ partition_name(tmp->dev));
+ }
+}
+
+static void close_sync(raid1_conf_t *conf)
+{
+ mddev_t *mddev = conf->mddev;
+ /* If reconstruction was interrupted, we need to close the "active" and "pending"
+ * holes.
+ * we know that there are no active rebuild requests, so cnt_active == cnt_ready == 0
+ */
+ /* this is really needed when recovery stops too... */
+ spin_lock_irq(&conf->segment_lock);
+ conf->start_active = conf->start_pending;
+ conf->start_ready = conf->start_pending;
+ wait_event_lock_irq(conf->wait_ready, !conf->cnt_pending, conf->segment_lock);
+ conf->start_active =conf->start_ready = conf->start_pending = conf->start_future;
+ conf->start_future = (mddev->sb->size<<1)+1;
+ conf->cnt_pending = conf->cnt_future;
+ conf->cnt_future = 0;
+ conf->phase = conf->phase ^1;
+ wait_event_lock_irq(conf->wait_ready, !conf->cnt_pending, conf->segment_lock);
+ conf->start_active = conf->start_ready = conf->start_pending = conf->start_future = 0;
+ conf->phase = 0;
+ conf->cnt_future = conf->cnt_done;
+ conf->cnt_done = 0;
+ spin_unlock_irq(&conf->segment_lock);
+ wake_up(&conf->wait_done);
+
+ mempool_destroy(conf->r1buf_pool);
+ conf->r1buf_pool = NULL;
+}
+
+static int raid1_diskop(mddev_t *mddev, mdp_disk_t **d, int state)
+{
+ int err = 0;
+ int i, failed_disk=-1, spare_disk=-1, removed_disk=-1, added_disk=-1;
+ raid1_conf_t *conf = mddev->private;
+ struct mirror_info *tmp, *sdisk, *fdisk, *rdisk, *adisk;
+ mdp_super_t *sb = mddev->sb;
+ mdp_disk_t *failed_desc, *spare_desc, *added_desc;
+ mdk_rdev_t *spare_rdev, *failed_rdev;
+
+ print_raid1_conf(conf);
+
+ switch (state) {
+ case DISKOP_SPARE_ACTIVE:
+ case DISKOP_SPARE_INACTIVE:
+ /* need to wait for pending sync io before locking device */
+ close_sync(conf);
+ }
+
+ md_spin_lock_irq(&conf->device_lock);
+ /*
+ * find the disk ...
+ */
+ switch (state) {
+
+ case DISKOP_SPARE_ACTIVE:
+
+ /*
+ * Find the failed disk within the RAID1 configuration ...
+ * (this can only be in the first conf->working_disks part)
+ */
+ for (i = 0; i < conf->raid_disks; i++) {
+ tmp = conf->mirrors + i;
+ if ((!tmp->operational && !tmp->spare) ||
+ !tmp->used_slot) {
+ failed_disk = i;
+ break;
+ }
+ }
+ /*
+ * When we activate a spare disk we _must_ have a disk in
+ * the lower (active) part of the array to replace.
+ */
+ if ((failed_disk == -1) || (failed_disk >= conf->raid_disks)) {
+ MD_BUG();
+ err = 1;
+ goto abort;
+ }
+ /* fall through */
+
+ case DISKOP_SPARE_WRITE:
+ case DISKOP_SPARE_INACTIVE:
+
+ /*
+ * Find the spare disk ... (can only be in the 'high'
+ * area of the array)
+ */
+ for (i = conf->raid_disks; i < MD_SB_DISKS; i++) {
+ tmp = conf->mirrors + i;
+ if (tmp->spare && tmp->number == (*d)->number) {
+ spare_disk = i;
+ break;
+ }
+ }
+ if (spare_disk == -1) {
+ MD_BUG();
+ err = 1;
+ goto abort;
+ }
+ break;
+
+ case DISKOP_HOT_REMOVE_DISK:
+
+ for (i = 0; i < MD_SB_DISKS; i++) {
+ tmp = conf->mirrors + i;
+ if (tmp->used_slot && (tmp->number == (*d)->number)) {
+ if (tmp->operational) {
+ err = -EBUSY;
+ goto abort;
+ }
+ removed_disk = i;
+ break;
+ }
+ }
+ if (removed_disk == -1) {
+ MD_BUG();
+ err = 1;
+ goto abort;
+ }
+ break;
+
+ case DISKOP_HOT_ADD_DISK:
+
+ for (i = conf->raid_disks; i < MD_SB_DISKS; i++) {
+ tmp = conf->mirrors + i;
+ if (!tmp->used_slot) {
+ added_disk = i;
+ break;
+ }
+ }
+ if (added_disk == -1) {
+ MD_BUG();
+ err = 1;
+ goto abort;
+ }
+ break;
+ }
+
+ switch (state) {
+ /*
+ * Switch the spare disk to write-only mode:
+ */
+ case DISKOP_SPARE_WRITE:
+ sdisk = conf->mirrors + spare_disk;
+ sdisk->operational = 1;
+ sdisk->write_only = 1;
+ break;
+ /*
+ * Deactivate a spare disk:
+ */
+ case DISKOP_SPARE_INACTIVE:
+<<<<<<<
+ if (conf->start_future > 0) {
+ MD_BUG();
+ err = -EBUSY;
+ break;
+ }
+|||||||
+ close_sync(conf);
+=======
+>>>>>>>
+ sdisk = conf->mirrors + spare_disk;
+ sdisk->operational = 0;
+ sdisk->write_only = 0;
+ break;
+ /*
+ * Activate (mark read-write) the (now sync) spare disk,
+	 * which means we switch its 'raid position' (->raid_disk)
+ * with the failed disk. (only the first 'conf->nr_disks'
+ * slots are used for 'real' disks and we must preserve this
+ * property)
+ */
+ case DISKOP_SPARE_ACTIVE:
+<<<<<<<
+ if (conf->start_future > 0) {
+ MD_BUG();
+ err = -EBUSY;
+ break;
+ }
+|||||||
+ close_sync(conf);
+=======
+>>>>>>>
+ sdisk = conf->mirrors + spare_disk;
+ fdisk = conf->mirrors + failed_disk;
+
+ spare_desc = &sb->disks[sdisk->number];
+ failed_desc = &sb->disks[fdisk->number];
+
+ if (spare_desc != *d) {
+ MD_BUG();
+ err = 1;
+ goto abort;
+ }
+
+ if (spare_desc->raid_disk != sdisk->raid_disk) {
+ MD_BUG();
+ err = 1;
+ goto abort;
+ }
+
+ if (sdisk->raid_disk != spare_disk) {
+ MD_BUG();
+ err = 1;
+ goto abort;
+ }
+
+ if (failed_desc->raid_disk != fdisk->raid_disk) {
+ MD_BUG();
+ err = 1;
+ goto abort;
+ }
+
+ if (fdisk->raid_disk != failed_disk) {
+ MD_BUG();
+ err = 1;
+ goto abort;
+ }
+
+ /*
+ * do the switch finally
+ */
+ spare_rdev = find_rdev_nr(mddev, spare_desc->number);
+ failed_rdev = find_rdev_nr(mddev, failed_desc->number);
+
+ /* There must be a spare_rdev, but there may not be a
+ * failed_rdev. That slot might be empty...
+ */
+ spare_rdev->desc_nr = failed_desc->number;
+ if (failed_rdev)
+ failed_rdev->desc_nr = spare_desc->number;
+
+ xchg_values(*spare_desc, *failed_desc);
+ xchg_values(*fdisk, *sdisk);
+
+ /*
+ * (careful, 'failed' and 'spare' are switched from now on)
+ *
+ * we want to preserve linear numbering and we want to
+ * give the proper raid_disk number to the now activated
+ * disk. (this means we switch back these values)
+ */
+
+ xchg_values(spare_desc->raid_disk, failed_desc->raid_disk);
+ xchg_values(sdisk->raid_disk, fdisk->raid_disk);
+ xchg_values(spare_desc->number, failed_desc->number);
+ xchg_values(sdisk->number, fdisk->number);
+
+ *d = failed_desc;
+
+ if (sdisk->dev == MKDEV(0,0))
+ sdisk->used_slot = 0;
+ /*
+ * this really activates the spare.
+ */
+ fdisk->spare = 0;
+ fdisk->write_only = 0;
+
+ /*
+ * if we activate a spare, we definitely replace a
+ * non-operational disk slot in the 'low' area of
+ * the disk array.
+ */
+
+ conf->working_disks++;
+
+ break;
+
+ case DISKOP_HOT_REMOVE_DISK:
+ rdisk = conf->mirrors + removed_disk;
+
+ if (rdisk->spare && (removed_disk < conf->raid_disks)) {
+ MD_BUG();
+ err = 1;
+ goto abort;
+ }
+ rdisk->dev = MKDEV(0,0);
+ rdisk->used_slot = 0;
+ conf->nr_disks--;
+ break;
+
+ case DISKOP_HOT_ADD_DISK:
+ adisk = conf->mirrors + added_disk;
+ added_desc = *d;
+
+ if (added_disk != added_desc->number) {
+ MD_BUG();
+ err = 1;
+ goto abort;
+ }
+
+ adisk->number = added_desc->number;
+ adisk->raid_disk = added_desc->raid_disk;
+ adisk->dev = MKDEV(added_desc->major,added_desc->minor);
+
+ adisk->operational = 0;
+ adisk->write_only = 0;
+ adisk->spare = 1;
+ adisk->used_slot = 1;
+ adisk->head_position = 0;
+ conf->nr_disks++;
+
+ break;
+
+ default:
+ MD_BUG();
+ err = 1;
+ goto abort;
+ }
+abort:
+ md_spin_unlock_irq(&conf->device_lock);
+<<<<<<<
+ if (state == DISKOP_SPARE_ACTIVE || state == DISKOP_SPARE_INACTIVE)
+ /* should move to "END_REBUILD" when such exists */
+ raid1_shrink_buffers(conf);
+
+ print_raid1_conf(conf);
+|||||||
+ if (state == DISKOP_SPARE_ACTIVE || state == DISKOP_SPARE_INACTIVE) {
+ mempool_destroy(conf->r1buf_pool);
+ conf->r1buf_pool = NULL;
+ }
+
+ print_conf(conf);
+=======
+
+ print_conf(conf);
+>>>>>>>
+ return err;
+}
+
+
+#define IO_ERROR KERN_ALERT \
+"raid1: %s: unrecoverable I/O read error for block %lu\n"
+
+#define REDIRECT_SECTOR KERN_ERR \
+"raid1: %s: redirecting sector %lu to another mirror\n"
+
+/*
+ * This is a kernel thread which:
+ *
+ * 1. Retries failed read operations on working mirrors.
+ * 2. Updates the raid superblock when problems are encountered.
+ * 3. Performs writes following reads for array synchronising.
+ */
+static void end_sync_write(struct buffer_head *bh, int uptodate);
+static void end_sync_read(struct buffer_head *bh, int uptodate);
+
+static void raid1d (void *data)
+{
+ struct raid1_bh *r1_bh;
+ struct buffer_head *bh;
+ unsigned long flags;
+ raid1_conf_t *conf = data;
+ mddev_t *mddev = conf->mddev;
+ kdev_t dev;
+
+ if (mddev->sb_dirty)
+ md_update_sb(mddev);
+
+ for (;;) {
+ md_spin_lock_irqsave(&retry_list_lock, flags);
+ r1_bh = raid1_retry_list;
+ if (!r1_bh)
+ break;
+ raid1_retry_list = r1_bh->next_r1;
+ md_spin_unlock_irqrestore(&retry_list_lock, flags);
+
+ mddev = r1_bh->mddev;
+ bh = &r1_bh->bh_req;
+ switch(r1_bh->cmd) {
+ case SPECIAL:
+ /* have to allocate lots of bh structures and
+ * schedule writes
+ */
+ if (test_bit(R1BH_Uptodate, &r1_bh->state)) {
+ int i, sum_bhs = 0;
+ int disks = MD_SB_DISKS;
+ struct buffer_head *bhl, *mbh;
+
+ conf = mddev_to_conf(mddev);
+ bhl = raid1_alloc_bh(conf, conf->raid_disks); /* don't really need this many */
+ for (i = 0; i < disks ; i++) {
+ if (!conf->mirrors[i].operational)
+ continue;
+ if (i==conf->last_used)
+ /* we read from here, no need to write */
+ continue;
+ if (i < conf->raid_disks
+ && mddev->in_sync)
+ /* don't need to write this,
+ * we are just rebuilding */
+ continue;
+ mbh = bhl;
+ if (!mbh) {
+ MD_BUG();
+ break;
+ }
+ bhl = mbh->b_next;
+ mbh->b_this_page = (struct buffer_head *)1;
+
+
+ /*
+ * prepare mirrored bh (fields ordered for max mem throughput):
+ */
+ mbh->b_blocknr = bh->b_blocknr;
+ mbh->b_dev = conf->mirrors[i].dev;
+ mbh->b_rdev = conf->mirrors[i].dev;
+ mbh->b_rsector = bh->b_blocknr;
+ mbh->b_state = (1<<BH_Req) | (1<<BH_Dirty) |
+ (1<<BH_Mapped) | (1<<BH_Lock);
+ atomic_set(&mbh->b_count, 1);
+ mbh->b_size = bh->b_size;
+ mbh->b_page = bh->b_page;
+ mbh->b_data = bh->b_data;
+ mbh->b_list = BUF_LOCKED;
+ mbh->b_end_io = end_sync_write;
+ mbh->b_private = r1_bh;
+
+ mbh->b_next = r1_bh->mirror_bh_list;
+ r1_bh->mirror_bh_list = mbh;
+
+ sum_bhs++;
+ }
+ md_atomic_set(&r1_bh->remaining, sum_bhs);
+ if (bhl) raid1_free_bh(conf, bhl);
+ mbh = r1_bh->mirror_bh_list;
+
+ if (!sum_bhs) {
+					/* nowhere to write this to... I guess we
+ * must be done
+ */
+ sync_request_done(bh->b_blocknr, conf);
+ md_done_sync(mddev, bh->b_size>>9, 0);
+ raid1_free_buf(r1_bh);
+ } else
+ while (mbh) {
+ struct buffer_head *bh1 = mbh;
+ mbh = mbh->b_next;
+ generic_make_request(WRITE, bh1);
+ md_sync_acct(bh1->b_dev, bh1->b_size/512);
+ }
+ } else {
+ /* There is no point trying a read-for-reconstruct
+ * as reconstruct is about to be aborted
+ */
+
+ printk (IO_ERROR, partition_name(bh->b_dev), bh->b_blocknr);
+ md_done_sync(mddev, bh->b_size>>9, 0);
+ }
+
+ break;
+ case READ:
+ case READA:
+ dev = bh->b_dev;
+ raid1_map (mddev, &bh->b_dev);
+ if (bh->b_dev == dev) {
+ printk (IO_ERROR, partition_name(bh->b_dev), bh->b_blocknr);
+ raid1_end_bh_io(r1_bh, 0);
+ } else {
+ printk (REDIRECT_SECTOR,
+ partition_name(bh->b_dev), bh->b_blocknr);
+ bh->b_rdev = bh->b_dev;
+ bh->b_rsector = bh->b_blocknr;
+ generic_make_request (r1_bh->cmd, bh);
+ }
+ break;
+ }
+ }
+ md_spin_unlock_irqrestore(&retry_list_lock, flags);
+}
+#undef IO_ERROR
+#undef REDIRECT_SECTOR
+
+
+static int init_resync (conf_t *conf)
+{
+*** 1144,16 **** 8
+<<<<<<<
+ raid1_conf_t *conf = data;
+ mddev_t *mddev = conf->mddev;
+|||||||
+ conf_t *conf = data;
+ mddev_t *mddev = conf->mddev;
+=======
+ sector_t max_sector, nr_sectors;
+ int disk, partial;
+>>>>>>>
+
+ if (sector_nr == 0)
+ if (init_resync(conf))
+ return -ENOMEM;
+
+<<<<<<<
+ close_sync(conf);
+
+}
+
+/*
+ * perform a "sync" on one "block"
+ *
+ * We need to make sure that no normal I/O request - particularly write
+ * requests - conflict with active sync requests.
+ * This is achieved by conceptually dividing the device space into a
+ * number of sections:
+ * DONE: 0 .. a-1 These blocks are in-sync
+ * ACTIVE: a.. b-1 These blocks may have active sync requests, but
+ * no normal IO requests
+ * READY: b .. c-1 These blocks have no normal IO requests - sync
+ * request may be happening
+ * PENDING: c .. d-1 These blocks may have IO requests, but no new
+ * ones will be added
+ * FUTURE: d .. end These blocks are not to be considered yet. IO may
+ * be happening, but not sync
+ *
+ * We keep a
+ * phase which flips (0 or 1) each time d moves and
+ * a count of:
+ * z = active io requests in FUTURE since d moved - marked with
+ * current phase
+ * y = active io requests in FUTURE before d moved, or PENDING -
+ * marked with previous phase
+ * x = active sync requests in READY
+ * w = active sync requests in ACTIVE
+ * v = active io requests in DONE
+ *
+ * Normally, a=b=c=d=0 and z= active io requests
+ * or a=b=c=d=END and v= active io requests
+ * Allowed changes to a,b,c,d:
+ * A: c==d && y==0 -> d+=window, y=z, z=0, phase=!phase
+ * B: y==0 -> c=d
+ * C: b=c, w+=x, x=0
+ * D: w==0 -> a=b
+ * E: a==b==c==d==end -> a=b=c=d=0, z=v, v=0
+ *
+ * At start of sync we apply A.
+ * When y reaches 0, we apply B then A then begin sync requests
+ * When the sync point reaches c-1, we wait for y==0, and w==0, and
+ * then apply B then A then D then C.
+ * Finally, we apply E
+ *
+ * The sync request simply issues a "read" against a working drive
+ * This is marked so that on completion the raid1d thread is woken to
+ * issue suitable write requests
+ */
+
+static int raid1_sync_request (mddev_t *mddev, unsigned long sector_nr)
+{
+ raid1_conf_t *conf = mddev_to_conf(mddev);
+ struct mirror_info *mirror;
+ struct raid1_bh *r1_bh;
+ struct buffer_head *bh;
+ int bsize;
+ int disk;
+ int block_nr;
+ int buffs;
+
+ if (!sector_nr) {
+ /* we want enough buffers to hold twice the window of 128*/
+ buffs = 128 *2 / (PAGE_SIZE>>9);
+ buffs = raid1_grow_buffers(conf, buffs);
+ if (buffs < 2)
+ goto nomem;
+ conf->window = buffs*(PAGE_SIZE>>9)/2;
+ }
+ spin_lock_irq(&conf->segment_lock);
+ if (!sector_nr) {
+ /* initialize ...*/
+ conf->start_active = 0;
+ conf->start_ready = 0;
+ conf->start_pending = 0;
+ conf->start_future = 0;
+ conf->phase = 0;
+
+ conf->cnt_future += conf->cnt_done+conf->cnt_pending;
+ conf->cnt_done = conf->cnt_pending = 0;
+ if (conf->cnt_ready || conf->cnt_active)
+ MD_BUG();
+ }
+ while (sector_nr >= conf->start_pending) {
+ PRINTK("wait .. sect=%lu start_active=%d ready=%d pending=%d future=%d, cnt_done=%d active=%d ready=%d pending=%d future=%d\n",
+ sector_nr, conf->start_active, conf->start_ready, conf->start_pending, conf->start_future,
+ conf->cnt_done, conf->cnt_active, conf->cnt_ready, conf->cnt_pending, conf->cnt_future);
+ wait_event_lock_irq(conf->wait_done,
+ !conf->cnt_active,
+ conf->segment_lock);
+ wait_event_lock_irq(conf->wait_ready,
+ !conf->cnt_pending,
+ conf->segment_lock);
+ conf->start_active = conf->start_ready;
+ conf->start_ready = conf->start_pending;
+ conf->start_pending = conf->start_future;
+ conf->start_future = conf->start_future+conf->window;
+ // Note: falling off the end is not a problem
+ conf->phase = conf->phase ^1;
+ conf->cnt_active = conf->cnt_ready;
+ conf->cnt_ready = 0;
+ conf->cnt_pending = conf->cnt_future;
+ conf->cnt_future = 0;
+ wake_up(&conf->wait_done);
+ }
+ conf->cnt_ready++;
+ spin_unlock_irq(&conf->segment_lock);
+
+
+ /* If reconstructing, and >1 working disc,
+ * could dedicate one to rebuild and others to
+ * service read requests ..
+ */
+ disk = conf->last_used;
+ /* make sure disk is operational */
+ while (!conf->mirrors[disk].operational) {
+ if (disk <= 0) disk = conf->raid_disks;
+ disk--;
+ if (disk == conf->last_used)
+ break;
+ }
+ conf->last_used = disk;
+
+ mirror = conf->mirrors+conf->last_used;
+
+ r1_bh = raid1_alloc_buf (conf);
+ r1_bh->master_bh = NULL;
+ r1_bh->mddev = mddev;
+ r1_bh->cmd = SPECIAL;
+ bh = &r1_bh->bh_req;
+
+ block_nr = sector_nr;
+ bsize = 512;
+ while (!(block_nr & 1) && bsize < PAGE_SIZE
+ && (block_nr+2)*(bsize>>9) < (mddev->sb->size *2)) {
+ block_nr >>= 1;
+ bsize <<= 1;
+ }
+ bh->b_size = bsize;
+ bh->b_list = BUF_LOCKED;
+ bh->b_dev = mirror->dev;
+ bh->b_rdev = mirror->dev;
+ bh->b_state = (1<<BH_Req) | (1<<BH_Mapped) | (1<<BH_Lock);
+ if (!bh->b_page)
+ BUG();
+ if (!bh->b_data)
+ BUG();
+ if (bh->b_data != page_address(bh->b_page))
+ BUG();
+ bh->b_end_io = end_sync_read;
+ bh->b_private = r1_bh;
+ bh->b_blocknr = sector_nr;
+ bh->b_rsector = sector_nr;
+ init_waitqueue_head(&bh->b_wait);
+
+ generic_make_request(READ, bh);
+ md_sync_acct(bh->b_dev, bh->b_size/512);
+
+ return (bsize >> 9);
+|||||||
+ close_sync(conf);
+
+}
+
+static int init_resync(conf_t *conf)
+{
+*** 1170,9 **** 8
+ sector_t max_sector, nr_sectors;
+ int disk, partial;
+=======
+ max_sector = mddev->sb->size << 1;
+>>>>>>>
+nomem:
+<<<<<<<
+ raid1_shrink_buffers(conf);
+ return -ENOMEM;
+}
+|||||||
+ if (!sector_nr)
+ if (init_resync(conf))
+ return -ENOMEM;
+ /*
+=======
+ if (sector_nr >= max_sector) {
+ close_sync(conf);
+ return 0;
+ }
+
+ /*
+>>>>>>>
+
+static void end_sync_read(struct buffer_head *bh, int uptodate)
+{
+ struct raid1_bh * r1_bh = (struct raid1_bh *)(bh->b_private);
+
+ /* we have read a block, now it needs to be re-written,
+ * or re-read if the read failed.
+ * We don't do much here, just schedule handling by raid1d
+ */
+ if (!uptodate)
+ md_error (r1_bh->mddev, bh->b_dev);
+ else
+ set_bit(R1BH_Uptodate, &r1_bh->state);
+ raid1_reschedule_retry(r1_bh);
+}
+
+static void end_sync_write(struct buffer_head *bh, int uptodate)
+{
+ struct raid1_bh * r1_bh = (struct raid1_bh *)(bh->b_private);
+
+ if (!uptodate)
+ md_error (r1_bh->mddev, bh->b_dev);
+ if (atomic_dec_and_test(&r1_bh->remaining)) {
+ mddev_t *mddev = r1_bh->mddev;
+ unsigned long sect = bh->b_blocknr;
+ int size = bh->b_size;
+ raid1_free_buf(r1_bh);
+ sync_request_done(sect, mddev_to_conf(mddev));
+ md_done_sync(mddev,size>>9, uptodate);
+ }
+}
+
+#define INVALID_LEVEL KERN_WARNING \
+"raid1: md%d: raid level not set to mirroring (%d)\n"
+
+#define NO_SB KERN_ERR \
+"raid1: disabled mirror %s (couldn't access raid superblock)\n"
+
+#define ERRORS KERN_ERR \
+"raid1: disabled mirror %s (errors detected)\n"
+
+#define NOT_IN_SYNC KERN_ERR \
+"raid1: disabled mirror %s (not in sync)\n"
+
+#define INCONSISTENT KERN_ERR \
+"raid1: disabled mirror %s (inconsistent descriptor)\n"
+
+#define ALREADY_RUNNING KERN_ERR \
+"raid1: disabled mirror %s (mirror %d already operational)\n"
+
+#define OPERATIONAL KERN_INFO \
+"raid1: device %s operational as mirror %d\n"
+
+#define MEM_ERROR KERN_ERR \
+"raid1: couldn't allocate memory for md%d\n"
+
+#define SPARE KERN_INFO \
+"raid1: spare disk %s\n"
+
+#define NONE_OPERATIONAL KERN_ERR \
+"raid1: no operational mirrors for md%d\n"
+
+#define ARRAY_IS_ACTIVE KERN_INFO \
+"raid1: raid set md%d active with %d out of %d mirrors\n"
+
+#define THREAD_ERROR KERN_ERR \
+"raid1: couldn't allocate thread for md%d\n"
+
+#define START_RESYNC KERN_WARNING \
+"raid1: raid set md%d not clean; reconstructing mirrors\n"
+
+static int raid1_run (mddev_t *mddev)
+{
+ raid1_conf_t *conf;
+ int i, j, disk_idx;
+ struct mirror_info *disk;
+ mdp_super_t *sb = mddev->sb;
+ mdp_disk_t *descriptor;
+ mdk_rdev_t *rdev;
+ struct md_list_head *tmp;
+
+ MOD_INC_USE_COUNT;
+
+ if (sb->level != 1) {
+ printk(INVALID_LEVEL, mdidx(mddev), sb->level);
+ goto out;
+ }
+ /*
+ * copy the already verified devices into our private RAID1
+ * bookkeeping area. [whatever we allocate in raid1_run(),
+ * should be freed in raid1_stop()]
+ */
+
+ conf = kmalloc(sizeof(raid1_conf_t), GFP_KERNEL);
+ mddev->private = conf;
+ if (!conf) {
+ printk(MEM_ERROR, mdidx(mddev));
+ goto out;
+ }
+ memset(conf, 0, sizeof(*conf));
+
+ ITERATE_RDEV(mddev,rdev,tmp) {
+ if (rdev->faulty) {
+ printk(ERRORS, partition_name(rdev->dev));
+ } else {
+ if (!rdev->sb) {
+ MD_BUG();
+ continue;
+ }
+ }
+ if (rdev->desc_nr == -1) {
+ MD_BUG();
+ continue;
+ }
+ descriptor = &sb->disks[rdev->desc_nr];
+ disk_idx = descriptor->raid_disk;
+ disk = conf->mirrors + disk_idx;
+
+ if (disk_faulty(descriptor)) {
+ disk->number = descriptor->number;
+ disk->raid_disk = disk_idx;
+ disk->dev = rdev->dev;
+ disk->sect_limit = MAX_WORK_PER_DISK;
+ disk->operational = 0;
+ disk->write_only = 0;
+ disk->spare = 0;
+ disk->used_slot = 1;
+ disk->head_position = 0;
+ continue;
+ }
+ if (disk_active(descriptor)) {
+ if (!disk_sync(descriptor)) {
+ printk(NOT_IN_SYNC,
+ partition_name(rdev->dev));
+ continue;
+ }
+ if ((descriptor->number > MD_SB_DISKS) ||
+ (disk_idx > sb->raid_disks)) {
+
+ printk(INCONSISTENT,
+ partition_name(rdev->dev));
+ continue;
+ }
+ if (disk->operational) {
+ printk(ALREADY_RUNNING,
+ partition_name(rdev->dev),
+ disk_idx);
+ continue;
+ }
+ printk(OPERATIONAL, partition_name(rdev->dev),
+ disk_idx);
+ disk->number = descriptor->number;
+ disk->raid_disk = disk_idx;
+ disk->dev = rdev->dev;
+ disk->sect_limit = MAX_WORK_PER_DISK;
+ disk->operational = 1;
+ disk->write_only = 0;
+ disk->spare = 0;
+ disk->used_slot = 1;
+ disk->head_position = 0;
+ conf->working_disks++;
+ } else {
+ /*
+ * Must be a spare disk ..
+ */
+ printk(SPARE, partition_name(rdev->dev));
+ disk->number = descriptor->number;
+ disk->raid_disk = disk_idx;
+ disk->dev = rdev->dev;
+ disk->sect_limit = MAX_WORK_PER_DISK;
+ disk->operational = 0;
+ disk->write_only = 0;
+ disk->spare = 1;
+ disk->used_slot = 1;
+ disk->head_position = 0;
+ }
+ }
+ conf->raid_disks = sb->raid_disks;
+ conf->nr_disks = sb->nr_disks;
+ conf->mddev = mddev;
+ conf->device_lock = MD_SPIN_LOCK_UNLOCKED;
+
+ conf->segment_lock = MD_SPIN_LOCK_UNLOCKED;
+ init_waitqueue_head(&conf->wait_buffer);
+ init_waitqueue_head(&conf->wait_done);
+ init_waitqueue_head(&conf->wait_ready);
+
+ if (!conf->working_disks) {
+ printk(NONE_OPERATIONAL, mdidx(mddev));
+ goto out_free_conf;
+ }
+
+
+ /* pre-allocate some buffer_head structures.
+ * As a minimum, 1 r1bh and raid_disks buffer_heads
+ * would probably get us by in tight memory situations,
+ * but a few more is probably a good idea.
+ * For now, try NR_RESERVED_BUFS r1bh and
+ * NR_RESERVED_BUFS*raid_disks bufferheads
+ * This will allow at least NR_RESERVED_BUFS concurrent
+ * reads or writes even if kmalloc starts failing
+ */
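+	/* Illustrative arithmetic only: e.g. if NR_RESERVED_BUFS is 32 and
+	 * raid_disks is 2, the calls below reserve 32 r1bh structures and
+	 * 64 buffer_heads before the array goes live.
+	 */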
+ if (raid1_grow_r1bh(conf, NR_RESERVED_BUFS) < NR_RESERVED_BUFS ||
+ raid1_grow_bh(conf, NR_RESERVED_BUFS*conf->raid_disks)
+ < NR_RESERVED_BUFS*conf->raid_disks) {
+ printk(MEM_ERROR, mdidx(mddev));
+ goto out_free_conf;
+ }
+
+ for (i = 0; i < MD_SB_DISKS; i++) {
+
+ descriptor = sb->disks+i;
+ disk_idx = descriptor->raid_disk;
+ disk = conf->mirrors + disk_idx;
+
+ if (disk_faulty(descriptor) && (disk_idx < conf->raid_disks) &&
+ !disk->used_slot) {
+
+ disk->number = descriptor->number;
+ disk->raid_disk = disk_idx;
+ disk->dev = MKDEV(0,0);
+
+ disk->operational = 0;
+ disk->write_only = 0;
+ disk->spare = 0;
+ disk->used_slot = 1;
+ disk->head_position = 0;
+ }
+ }
+
+ /*
+ * find the first working one and use it as a starting point
+ * to read balancing.
+ */
+ for (j = 0; !conf->mirrors[j].operational && j < MD_SB_DISKS; j++)
+ /* nothing */;
+ conf->last_used = j;
+
+
+
+ {
+ const char * name = "raid1d";
+
+ conf->thread = md_register_thread(raid1d, conf, name);
+ if (!conf->thread) {
+ printk(THREAD_ERROR, mdidx(mddev));
+ goto out_free_conf;
+ }
+ }
+
+<<<<<<<
+ if (!start_recovery && !(sb->state & (1 << MD_SB_CLEAN)) &&
+ (conf->working_disks > 1)) {
+ const char * name = "raid1syncd";
+
+ conf->resync_thread = md_register_thread(raid1syncd, conf,name);
+|||||||
+ if (!start_recovery && !(sb->state & (1 << MD_SB_CLEAN)) &&
+ (conf->working_disks > 1)) {
+ const char * name = "raid1syncd";
+
+ conf->resync_thread = md_register_thread(raid1syncd, conf, name);
+=======
+>>>>>>>
+
+ /*
+ * Regenerate the "device is in sync with the raid set" bit for
+ * each device.
+ */
+ for (i = 0; i < MD_SB_DISKS; i++) {
+ mark_disk_nonsync(sb->disks+i);
+ for (j = 0; j < sb->raid_disks; j++) {
+ if (!conf->mirrors[j].operational)
+ continue;
+ if (sb->disks[i].number == conf->mirrors[j].number)
+ mark_disk_sync(sb->disks+i);
+ }
+ }
+ sb->active_disks = conf->working_disks;
+
+ printk(ARRAY_IS_ACTIVE, mdidx(mddev), sb->active_disks, sb->raid_disks);
+ /*
+ * Ok, everything is just fine now
+ */
+ return 0;
+
+out_free_conf:
+ raid1_shrink_r1bh(conf);
+ raid1_shrink_bh(conf);
+ raid1_shrink_buffers(conf);
+ kfree(conf);
+ mddev->private = NULL;
+out:
+ MOD_DEC_USE_COUNT;
+ return -EIO;
+}
+
+#undef INVALID_LEVEL
+#undef NO_SB
+#undef ERRORS
+#undef NOT_IN_SYNC
+#undef INCONSISTENT
+#undef ALREADY_RUNNING
+#undef OPERATIONAL
+#undef SPARE
+#undef NONE_OPERATIONAL
+#undef ARRAY_IS_ACTIVE
+
+<<<<<<<
+static int raid1_stop_resync (mddev_t *mddev)
+{
+ raid1_conf_t *conf = mddev_to_conf(mddev);
+
+ if (conf->resync_thread) {
+ if (conf->resync_mirrors) {
+ md_interrupt_thread(conf->resync_thread);
+
+ printk(KERN_INFO "raid1: mirror resync was not fully finished, restarting next time.\n");
+ return 1;
+ }
+ return 0;
+ }
+ return 0;
+}
+
+static int raid1_restart_resync (mddev_t *mddev)
+{
+ raid1_conf_t *conf = mddev_to_conf(mddev);
+|||||||
+static int stop_resync(mddev_t *mddev)
+{
+ conf_t *conf = mddev_to_conf(mddev);
+
+ if (conf->resync_thread) {
+ if (conf->resync_mirrors) {
+ md_interrupt_thread(conf->resync_thread);
+
+ printk(KERN_INFO "raid1: mirror resync was not fully finished, restarting next time.\n");
+ return 1;
+ }
+ return 0;
+ }
+ return 0;
+}
+
+static int restart_resync(mddev_t *mddev)
+{
+ conf_t *conf = mddev_to_conf(mddev);
+=======
+>>>>>>>
+static int raid1_stop (mddev_t *mddev)
+{
+ raid1_conf_t *conf = mddev_to_conf(mddev);
+
+ md_unregister_thread(conf->thread);
+ raid1_shrink_r1bh(conf);
+ raid1_shrink_bh(conf);
+ raid1_shrink_buffers(conf);
+ kfree(conf);
+ mddev->private = NULL;
+ MOD_DEC_USE_COUNT;
+ return 0;
+}
+
+static mdk_personality_t raid1_personality=
+{
+ name: "raid1",
+ make_request: raid1_make_request,
+ run: raid1_run,
+ stop: raid1_stop,
+ status: raid1_status,
+ error_handler: raid1_error,
+ diskop: raid1_diskop,
+<<<<<<<
+ stop_resync: raid1_stop_resync,
+ restart_resync: raid1_restart_resync,
+|||||||
+ stop_resync: stop_resync,
+ restart_resync: restart_resync,
+=======
+>>>>>>>
+ sync_request: raid1_sync_request
+};
+
+static int md__init raid1_init (void)
+{
+ return register_md_personality (RAID1, &raid1_personality);
+}
+
+static void raid1_exit (void)
+{
+ unregister_md_personality (RAID1);
+}
+
+module_init(raid1_init);
+module_exit(raid1_exit);
+MODULE_LICENSE("GPL");
diff --git a/tests/linux/md-resync/orig b/tests/linux/md-resync/orig
new file mode 100644
index 0000000..375e485
--- /dev/null
+++ b/tests/linux/md-resync/orig
@@ -0,0 +1,1848 @@
+/*
+ * raid1.c : Multiple Devices driver for Linux
+ *
+ * Copyright (C) 1999, 2000 Ingo Molnar, Red Hat
+ *
+ * Copyright (C) 1996, 1997, 1998 Ingo Molnar, Miguel de Icaza, Gadi Oxman
+ *
+ * RAID-1 management functions.
+ *
+ * Better read-balancing code written by Mika Kuoppala <miku@iki.fi>, 2000
+ *
+ * Fixes to reconstruction by Jakob Østergaard <jakob@ostenfeld.dk>
+ * Various fixes by Neil Brown <neilb@cse.unsw.edu.au>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ *
+ * You should have received a copy of the GNU General Public License
+ * (for example /usr/src/linux/COPYING); if not, write to the Free
+ * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include <linux/module.h>
+#include <linux/config.h>
+#include <linux/slab.h>
+#include <linux/raid/raid1.h>
+#include <asm/atomic.h>
+
+#define MAJOR_NR MD_MAJOR
+#define MD_DRIVER
+#define MD_PERSONALITY
+
+#define MAX_WORK_PER_DISK 128
+
+#define NR_RESERVED_BUFS 32
+
+
+/*
+ * The following can be used to debug the driver
+ */
+#define RAID1_DEBUG 0
+
+#if RAID1_DEBUG
+#define PRINTK(x...) printk(x)
+#define inline
+#define __inline__
+#else
+#define PRINTK(x...) do { } while (0)
+#endif
+
+
+static mdk_personality_t raid1_personality;
+static md_spinlock_t retry_list_lock = MD_SPIN_LOCK_UNLOCKED;
+struct raid1_bh *raid1_retry_list = NULL, **raid1_retry_tail;
+
+static struct buffer_head *raid1_alloc_bh(raid1_conf_t *conf, int cnt)
+{
+ /* return a linked list of "cnt" struct buffer_heads.
+ * don't take any off the free list unless we know we can
+ * get all we need, otherwise we could deadlock
+ */
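+	/* Illustrative note: if two writers each needed two heads from a
+	 * shared pool of three and each grabbed whatever was free, both
+	 * could stall forever; taking from the free list only when the
+	 * whole request can be met (and falling back to the slab
+	 * allocator otherwise) avoids that.
+	 */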
+ struct buffer_head *bh=NULL;
+
+ while(cnt) {
+ struct buffer_head *t;
+ md_spin_lock_irq(&conf->device_lock);
+ if (!conf->freebh_blocked && conf->freebh_cnt >= cnt)
+ while (cnt) {
+ t = conf->freebh;
+ conf->freebh = t->b_next;
+ t->b_next = bh;
+ bh = t;
+ t->b_state = 0;
+ conf->freebh_cnt--;
+ cnt--;
+ }
+ md_spin_unlock_irq(&conf->device_lock);
+ if (cnt == 0)
+ break;
+ t = kmem_cache_alloc(bh_cachep, SLAB_NOIO);
+ if (t) {
+ t->b_next = bh;
+ bh = t;
+ cnt--;
+ } else {
+ PRINTK("raid1: waiting for %d bh\n", cnt);
+ conf->freebh_blocked = 1;
+ wait_disk_event(conf->wait_buffer,
+ !conf->freebh_blocked ||
+ conf->freebh_cnt > conf->raid_disks * NR_RESERVED_BUFS/2);
+ conf->freebh_blocked = 0;
+ }
+ }
+ return bh;
+}
+
+static inline void raid1_free_bh(raid1_conf_t *conf, struct buffer_head *bh)
+{
+ unsigned long flags;
+ spin_lock_irqsave(&conf->device_lock, flags);
+ while (bh) {
+ struct buffer_head *t = bh;
+ bh=bh->b_next;
+ if (t->b_pprev == NULL)
+ kmem_cache_free(bh_cachep, t);
+ else {
+ t->b_next= conf->freebh;
+ conf->freebh = t;
+ conf->freebh_cnt++;
+ }
+ }
+ spin_unlock_irqrestore(&conf->device_lock, flags);
+ wake_up(&conf->wait_buffer);
+}
+
+static int raid1_grow_bh(raid1_conf_t *conf, int cnt)
+{
+ /* allocate cnt buffer_heads, possibly less if kmalloc fails */
+ int i = 0;
+
+ while (i < cnt) {
+ struct buffer_head *bh;
+ bh = kmem_cache_alloc(bh_cachep, SLAB_KERNEL);
+ if (!bh) break;
+
+ md_spin_lock_irq(&conf->device_lock);
+ bh->b_pprev = &conf->freebh;
+ bh->b_next = conf->freebh;
+ conf->freebh = bh;
+ conf->freebh_cnt++;
+ md_spin_unlock_irq(&conf->device_lock);
+
+ i++;
+ }
+ return i;
+}
+
+static void raid1_shrink_bh(raid1_conf_t *conf)
+{
+ /* discard all buffer_heads */
+
+ md_spin_lock_irq(&conf->device_lock);
+ while (conf->freebh) {
+ struct buffer_head *bh = conf->freebh;
+ conf->freebh = bh->b_next;
+ kmem_cache_free(bh_cachep, bh);
+ conf->freebh_cnt--;
+ }
+ md_spin_unlock_irq(&conf->device_lock);
+}
+
+
+static struct raid1_bh *raid1_alloc_r1bh(raid1_conf_t *conf)
+{
+ struct raid1_bh *r1_bh = NULL;
+
+ do {
+ md_spin_lock_irq(&conf->device_lock);
+ if (!conf->freer1_blocked && conf->freer1) {
+ r1_bh = conf->freer1;
+ conf->freer1 = r1_bh->next_r1;
+ conf->freer1_cnt--;
+ r1_bh->next_r1 = NULL;
+ r1_bh->state = (1 << R1BH_PreAlloc);
+ r1_bh->bh_req.b_state = 0;
+ }
+ md_spin_unlock_irq(&conf->device_lock);
+ if (r1_bh)
+ return r1_bh;
+ r1_bh = (struct raid1_bh *) kmalloc(sizeof(struct raid1_bh), GFP_NOIO);
+ if (r1_bh) {
+ memset(r1_bh, 0, sizeof(*r1_bh));
+ return r1_bh;
+ }
+ conf->freer1_blocked = 1;
+ wait_disk_event(conf->wait_buffer,
+ !conf->freer1_blocked ||
+ conf->freer1_cnt > NR_RESERVED_BUFS/2
+ );
+ conf->freer1_blocked = 0;
+ } while (1);
+}
+
+static inline void raid1_free_r1bh(struct raid1_bh *r1_bh)
+{
+ struct buffer_head *bh = r1_bh->mirror_bh_list;
+ raid1_conf_t *conf = mddev_to_conf(r1_bh->mddev);
+
+ r1_bh->mirror_bh_list = NULL;
+
+ if (test_bit(R1BH_PreAlloc, &r1_bh->state)) {
+ unsigned long flags;
+ spin_lock_irqsave(&conf->device_lock, flags);
+ r1_bh->next_r1 = conf->freer1;
+ conf->freer1 = r1_bh;
+ conf->freer1_cnt++;
+ spin_unlock_irqrestore(&conf->device_lock, flags);
+ /* don't need to wakeup wait_buffer because
+ * raid1_free_bh below will do that
+ */
+ } else {
+ kfree(r1_bh);
+ }
+ raid1_free_bh(conf, bh);
+}
+
+static int raid1_grow_r1bh (raid1_conf_t *conf, int cnt)
+{
+ int i = 0;
+
+ while (i < cnt) {
+ struct raid1_bh *r1_bh;
+ r1_bh = (struct raid1_bh*)kmalloc(sizeof(*r1_bh), GFP_KERNEL);
+ if (!r1_bh)
+ break;
+ memset(r1_bh, 0, sizeof(*r1_bh));
+ set_bit(R1BH_PreAlloc, &r1_bh->state);
+ r1_bh->mddev = conf->mddev;
+
+ raid1_free_r1bh(r1_bh);
+ i++;
+ }
+ return i;
+}
+
+static void raid1_shrink_r1bh(raid1_conf_t *conf)
+{
+ md_spin_lock_irq(&conf->device_lock);
+ while (conf->freer1) {
+ struct raid1_bh *r1_bh = conf->freer1;
+ conf->freer1 = r1_bh->next_r1;
+ conf->freer1_cnt--;
+ kfree(r1_bh);
+ }
+ md_spin_unlock_irq(&conf->device_lock);
+}
+
+
+
+static inline void raid1_free_buf(struct raid1_bh *r1_bh)
+{
+ unsigned long flags;
+ struct buffer_head *bh = r1_bh->mirror_bh_list;
+ raid1_conf_t *conf = mddev_to_conf(r1_bh->mddev);
+ r1_bh->mirror_bh_list = NULL;
+
+ spin_lock_irqsave(&conf->device_lock, flags);
+ r1_bh->next_r1 = conf->freebuf;
+ conf->freebuf = r1_bh;
+ spin_unlock_irqrestore(&conf->device_lock, flags);
+ raid1_free_bh(conf, bh);
+}
+
+static struct raid1_bh *raid1_alloc_buf(raid1_conf_t *conf)
+{
+ struct raid1_bh *r1_bh;
+
+ md_spin_lock_irq(&conf->device_lock);
+ wait_event_lock_irq(conf->wait_buffer, conf->freebuf, conf->device_lock);
+ r1_bh = conf->freebuf;
+ conf->freebuf = r1_bh->next_r1;
+ r1_bh->next_r1= NULL;
+ md_spin_unlock_irq(&conf->device_lock);
+
+ return r1_bh;
+}
+
+static int raid1_grow_buffers (raid1_conf_t *conf, int cnt)
+{
+ int i = 0;
+ struct raid1_bh *head = NULL, **tail;
+ tail = &head;
+
+ while (i < cnt) {
+ struct raid1_bh *r1_bh;
+ struct page *page;
+
+ page = alloc_page(GFP_KERNEL);
+ if (!page)
+ break;
+
+ r1_bh = (struct raid1_bh *) kmalloc(sizeof(*r1_bh), GFP_KERNEL);
+ if (!r1_bh) {
+ __free_page(page);
+ break;
+ }
+ memset(r1_bh, 0, sizeof(*r1_bh));
+ r1_bh->bh_req.b_page = page;
+ r1_bh->bh_req.b_data = page_address(page);
+ *tail = r1_bh;
+ r1_bh->next_r1 = NULL;
+ tail = & r1_bh->next_r1;
+ i++;
+ }
+ /* this lock probably isn't needed, as at the time when
+ * we are allocating buffers, nobody else will be touching the
+ * freebuf list. But it doesn't hurt....
+ */
+ md_spin_lock_irq(&conf->device_lock);
+ *tail = conf->freebuf;
+ conf->freebuf = head;
+ md_spin_unlock_irq(&conf->device_lock);
+ return i;
+}
+
+static void raid1_shrink_buffers (raid1_conf_t *conf)
+{
+ struct raid1_bh *head;
+ md_spin_lock_irq(&conf->device_lock);
+ head = conf->freebuf;
+ conf->freebuf = NULL;
+ md_spin_unlock_irq(&conf->device_lock);
+
+ while (head) {
+ struct raid1_bh *r1_bh = head;
+ head = r1_bh->next_r1;
+ __free_page(r1_bh->bh_req.b_page);
+ kfree(r1_bh);
+ }
+}
+
+static int raid1_map (mddev_t *mddev, kdev_t *rdev)
+{
+ raid1_conf_t *conf = mddev_to_conf(mddev);
+ int i, disks = MD_SB_DISKS;
+
+ /*
+ * Later we do read balancing on the read side
+ * now we use the first available disk.
+ */
+
+ for (i = 0; i < disks; i++) {
+ if (conf->mirrors[i].operational) {
+ *rdev = conf->mirrors[i].dev;
+ return (0);
+ }
+ }
+
+ printk (KERN_ERR "raid1_map(): huh, no more operational devices?\n");
+ return (-1);
+}
+
+static void raid1_reschedule_retry (struct raid1_bh *r1_bh)
+{
+ unsigned long flags;
+ mddev_t *mddev = r1_bh->mddev;
+ raid1_conf_t *conf = mddev_to_conf(mddev);
+
+ md_spin_lock_irqsave(&retry_list_lock, flags);
+ if (raid1_retry_list == NULL)
+ raid1_retry_tail = &raid1_retry_list;
+ *raid1_retry_tail = r1_bh;
+ raid1_retry_tail = &r1_bh->next_r1;
+ r1_bh->next_r1 = NULL;
+ md_spin_unlock_irqrestore(&retry_list_lock, flags);
+ md_wakeup_thread(conf->thread);
+}
+
+
+static void inline io_request_done(unsigned long sector, raid1_conf_t *conf, int phase)
+{
+ unsigned long flags;
+ spin_lock_irqsave(&conf->segment_lock, flags);
+ if (sector < conf->start_active)
+ conf->cnt_done--;
+ else if (sector >= conf->start_future && conf->phase == phase)
+ conf->cnt_future--;
+ else if (!--conf->cnt_pending)
+ wake_up(&conf->wait_ready);
+
+ spin_unlock_irqrestore(&conf->segment_lock, flags);
+}
+
+static void inline sync_request_done (unsigned long sector, raid1_conf_t *conf)
+{
+ unsigned long flags;
+ spin_lock_irqsave(&conf->segment_lock, flags);
+ if (sector >= conf->start_ready)
+ --conf->cnt_ready;
+ else if (sector >= conf->start_active) {
+ if (!--conf->cnt_active) {
+ conf->start_active = conf->start_ready;
+ wake_up(&conf->wait_done);
+ }
+ }
+ spin_unlock_irqrestore(&conf->segment_lock, flags);
+}
+
+/*
+ * raid1_end_bh_io() is called when we have finished servicing a mirrored
+ * operation and are ready to return a success/failure code to the buffer
+ * cache layer.
+ */
+static void raid1_end_bh_io (struct raid1_bh *r1_bh, int uptodate)
+{
+ struct buffer_head *bh = r1_bh->master_bh;
+
+ io_request_done(bh->b_rsector, mddev_to_conf(r1_bh->mddev),
+ test_bit(R1BH_SyncPhase, &r1_bh->state));
+
+ bh->b_end_io(bh, uptodate);
+ raid1_free_r1bh(r1_bh);
+}
+void raid1_end_request (struct buffer_head *bh, int uptodate)
+{
+ struct raid1_bh * r1_bh = (struct raid1_bh *)(bh->b_private);
+
+ /*
+ * this branch is our 'one mirror IO has finished' event handler:
+ */
+ if (!uptodate)
+ md_error (r1_bh->mddev, bh->b_dev);
+ else
+ /*
+ * Set R1BH_Uptodate in our master buffer_head, so that
+		 * we will return a good error code to the higher
+ * levels even if IO on some other mirrored buffer fails.
+ *
+ * The 'master' represents the complex operation to
+ * user-side. So if something waits for IO, then it will
+ * wait for the 'master' buffer_head.
+ */
+ set_bit (R1BH_Uptodate, &r1_bh->state);
+
+ /*
+ * We split up the read and write side, imho they are
+ * conceptually different.
+ */
+
+ if ( (r1_bh->cmd == READ) || (r1_bh->cmd == READA) ) {
+ /*
+ * we have only one buffer_head on the read side
+ */
+
+ if (uptodate) {
+ raid1_end_bh_io(r1_bh, uptodate);
+ return;
+ }
+ /*
+ * oops, read error:
+ */
+ printk(KERN_ERR "raid1: %s: rescheduling block %lu\n",
+ partition_name(bh->b_dev), bh->b_blocknr);
+ raid1_reschedule_retry(r1_bh);
+ return;
+ }
+
+ /*
+ * WRITE:
+ *
+ * Let's see if all mirrored write operations have finished
+ * already.
+ */
+
+ if (atomic_dec_and_test(&r1_bh->remaining))
+ raid1_end_bh_io(r1_bh, test_bit(R1BH_Uptodate, &r1_bh->state));
+}
+
+/*
+ * This routine returns the disk from which the requested read should
+ * be done. It keeps track of the last read position for every disk
+ * in the array; when a new read request comes in, the disk whose last
+ * position is nearest to the request is chosen.
+ *
+ * TODO: now if there are 2 mirrors in the same 2 devices, performance
+ * degrades dramatically because position is mirror, not device based.
+ * This should be changed to be device based. Also atomic sequential
+ * reads should be somehow balanced.
+ */
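+
+/*
+ * Illustrative example (numbers invented): if two operational mirrors
+ * last left their heads at sectors 1000 and 5000, a read at sector
+ * 1008 stays on the first mirror (distance 8 vs 3992) while a read at
+ * sector 4990 is redirected to the second; a read that lands exactly
+ * on the current mirror's head_position is treated as sequential and
+ * skips the search altogether.
+ */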
+
+static int raid1_read_balance (raid1_conf_t *conf, struct buffer_head *bh)
+{
+ int new_disk = conf->last_used;
+ const int sectors = bh->b_size >> 9;
+ const unsigned long this_sector = bh->b_rsector;
+ int disk = new_disk;
+ unsigned long new_distance;
+ unsigned long current_distance;
+
+ /*
+ * Check if it is sane at all to balance
+ */
+
+ if (conf->resync_mirrors)
+ goto rb_out;
+
+
+ /* make sure that disk is operational */
+ while( !conf->mirrors[new_disk].operational) {
+ if (new_disk <= 0) new_disk = conf->raid_disks;
+ new_disk--;
+ if (new_disk == disk) {
+ /*
+ * This means no working disk was found
+ * Nothing much to do, lets not change anything
+ * and hope for the best...
+ */
+
+ new_disk = conf->last_used;
+
+ goto rb_out;
+ }
+ }
+ disk = new_disk;
+ /* now disk == new_disk == starting point for search */
+
+ /*
+ * Don't touch anything for sequential reads.
+ */
+
+ if (this_sector == conf->mirrors[new_disk].head_position)
+ goto rb_out;
+
+ /*
+ * If reads have been done only on a single disk
+	 * for a time, let's give another disk a chance.
+ * This is for kicking those idling disks so that
+ * they would find work near some hotspot.
+ */
+
+ if (conf->sect_count >= conf->mirrors[new_disk].sect_limit) {
+ conf->sect_count = 0;
+
+#if defined(CONFIG_SPARC64) && (__GNUC__ == 2) && (__GNUC_MINOR__ == 92)
+ /* Work around a compiler bug in egcs-2.92.11 19980921 */
+ new_disk = *(volatile int *)&new_disk;
+#endif
+ do {
+ if (new_disk<=0)
+ new_disk = conf->raid_disks;
+ new_disk--;
+ if (new_disk == disk)
+ break;
+ } while ((conf->mirrors[new_disk].write_only) ||
+ (!conf->mirrors[new_disk].operational));
+
+ goto rb_out;
+ }
+
+ current_distance = abs(this_sector -
+ conf->mirrors[disk].head_position);
+
+ /* Find the disk which is closest */
+
+ do {
+ if (disk <= 0)
+ disk = conf->raid_disks;
+ disk--;
+
+ if ((conf->mirrors[disk].write_only) ||
+ (!conf->mirrors[disk].operational))
+ continue;
+
+ new_distance = abs(this_sector -
+ conf->mirrors[disk].head_position);
+
+ if (new_distance < current_distance) {
+ conf->sect_count = 0;
+ current_distance = new_distance;
+ new_disk = disk;
+ }
+ } while (disk != conf->last_used);
+
+rb_out:
+ conf->mirrors[new_disk].head_position = this_sector + sectors;
+
+ conf->last_used = new_disk;
+ conf->sect_count += sectors;
+
+ return new_disk;
+}
+
+static int raid1_make_request (request_queue_t *q,
+ struct buffer_head * bh)
+{
+ mddev_t *mddev = q->queuedata;
+ raid1_conf_t *conf = mddev_to_conf(mddev);
+ struct buffer_head *bh_req, *bhl;
+ struct raid1_bh * r1_bh;
+ int disks = MD_SB_DISKS;
+ int i, sum_bhs = 0;
+ struct mirror_info *mirror;
+
+ if (!buffer_locked(bh))
+ BUG();
+
+/*
+ * make_request() can abort the operation when READA is being
+ * used and no empty request is available.
+ *
+ * Currently, just replace the command with READ/WRITE.
+ */
+ r1_bh = raid1_alloc_r1bh (conf);
+
+ spin_lock_irq(&conf->segment_lock);
+ wait_event_lock_irq(conf->wait_done,
+ bh->b_rsector < conf->start_active ||
+ bh->b_rsector >= conf->start_future,
+ conf->segment_lock);
+ if (bh->b_rsector < conf->start_active)
+ conf->cnt_done++;
+ else {
+ conf->cnt_future++;
+ if (conf->phase)
+ set_bit(R1BH_SyncPhase, &r1_bh->state);
+ }
+ spin_unlock_irq(&conf->segment_lock);
+
+ /*
+ * i think the read and write branch should be separated completely,
+ * since we want to do read balancing on the read side for example.
+ * Alternative implementations? :) --mingo
+ */
+
+ r1_bh->master_bh = bh;
+ r1_bh->mddev = mddev;
+ r1_bh->cmd = rw;
+
+ if (rw == READ) {
+ /*
+ * read balancing logic:
+ */
+ mirror = conf->mirrors + raid1_read_balance(conf, bh);
+
+ bh_req = &r1_bh->bh_req;
+ memcpy(bh_req, bh, sizeof(*bh));
+ bh_req->b_blocknr = bh->b_rsector;
+ bh_req->b_dev = mirror->dev;
+ bh_req->b_rdev = mirror->dev;
+ /* bh_req->b_rsector = bh->n_rsector; */
+ bh_req->b_end_io = raid1_end_request;
+ bh_req->b_private = r1_bh;
+ generic_make_request (rw, bh_req);
+ return 0;
+ }
+
+ /*
+ * WRITE:
+ */
+
+ bhl = raid1_alloc_bh(conf, conf->raid_disks);
+ for (i = 0; i < disks; i++) {
+ struct buffer_head *mbh;
+ if (!conf->mirrors[i].operational)
+ continue;
+
+ /*
+ * We should use a private pool (size depending on NR_REQUEST),
+ * to avoid writes filling up the memory with bhs
+ *
+ * Such pools are much faster than kmalloc anyways (so we waste
+ * almost nothing by not using the master bh when writing and
+		 * win a lot of cleanliness) but for now we are cool enough. --mingo
+ *
+ * It's safe to sleep here, buffer heads cannot be used in a shared
+ * manner in the write branch. Look how we lock the buffer at the
+ * beginning of this function to grok the difference ;)
+ */
+ mbh = bhl;
+ if (mbh == NULL) {
+ MD_BUG();
+ break;
+ }
+ bhl = mbh->b_next;
+ mbh->b_next = NULL;
+ mbh->b_this_page = (struct buffer_head *)1;
+
+ /*
+ * prepare mirrored mbh (fields ordered for max mem throughput):
+ */
+ mbh->b_blocknr = bh->b_rsector;
+ mbh->b_dev = conf->mirrors[i].dev;
+ mbh->b_rdev = conf->mirrors[i].dev;
+ mbh->b_rsector = bh->b_rsector;
+ mbh->b_state = (1<<BH_Req) | (1<<BH_Dirty) |
+ (1<<BH_Mapped) | (1<<BH_Lock);
+
+ atomic_set(&mbh->b_count, 1);
+ mbh->b_size = bh->b_size;
+ mbh->b_page = bh->b_page;
+ mbh->b_data = bh->b_data;
+ mbh->b_list = BUF_LOCKED;
+ mbh->b_end_io = raid1_end_request;
+ mbh->b_private = r1_bh;
+
+ mbh->b_next = r1_bh->mirror_bh_list;
+ r1_bh->mirror_bh_list = mbh;
+ sum_bhs++;
+ }
+ if (bhl) raid1_free_bh(conf,bhl);
+ if (!sum_bhs) {
+ /* Gag - all mirrors non-operational.. */
+ raid1_end_bh_io(r1_bh, 0);
+ return 0;
+ }
+ md_atomic_set(&r1_bh->remaining, sum_bhs);
+
+ /*
+	 * We have to be a bit careful about the semaphore above, that's
+	 * why we start the requests separately. Since kmalloc() could
+	 * fail or sleep, and make_request() can sleep too, this is the
+ * safer solution. Imagine, end_request decreasing the semaphore
+ * before we could have set it up ... We could play tricks with
+ * the semaphore (presetting it and correcting at the end if
+ * sum_bhs is not 'n' but we have to do end_request by hand if
+ * all requests finish until we had a chance to set up the
+ * semaphore correctly ... lots of races).
+ */
+ bh = r1_bh->mirror_bh_list;
+ while(bh) {
+ struct buffer_head *bh2 = bh;
+ bh = bh->b_next;
+ generic_make_request(rw, bh2);
+ }
+ return (0);
+}
+
+static void raid1_status(struct seq_file *seq, mddev_t *mddev)
+{
+ raid1_conf_t *conf = mddev_to_conf(mddev);
+ int i;
+
+ seq_printf(seq, " [%d/%d] [", conf->raid_disks,
+ conf->working_disks);
+ for (i = 0; i < conf->raid_disks; i++)
+ seq_printf(seq, "%s",
+ conf->mirrors[i].operational ? "U" : "_");
+ seq_printf(seq, "]");
+}
+
+#define LAST_DISK KERN_ALERT \
+"raid1: only one disk left and IO error.\n"
+
+#define NO_SPARE_DISK KERN_ALERT \
+"raid1: no spare disk left, degrading mirror level by one.\n"
+
+#define DISK_FAILED KERN_ALERT \
+"raid1: Disk failure on %s, disabling device. \n" \
+" Operation continuing on %d devices\n"
+
+#define START_SYNCING KERN_ALERT \
+"raid1: start syncing spare disk.\n"
+
+#define ALREADY_SYNCING KERN_INFO \
+"raid1: syncing already in progress.\n"
+
+static void mark_disk_bad (mddev_t *mddev, int failed)
+{
+ raid1_conf_t *conf = mddev_to_conf(mddev);
+ struct mirror_info *mirror = conf->mirrors+failed;
+ mdp_super_t *sb = mddev->sb;
+
+ mirror->operational = 0;
+ mark_disk_faulty(sb->disks+mirror->number);
+ mark_disk_nonsync(sb->disks+mirror->number);
+ mark_disk_inactive(sb->disks+mirror->number);
+ if (!mirror->write_only)
+ sb->active_disks--;
+ sb->working_disks--;
+ sb->failed_disks++;
+ mddev->sb_dirty = 1;
+ md_wakeup_thread(conf->thread);
+ if (!mirror->write_only)
+ conf->working_disks--;
+ printk (DISK_FAILED, partition_name (mirror->dev),
+ conf->working_disks);
+}
+
+static int raid1_error (mddev_t *mddev, kdev_t dev)
+{
+ raid1_conf_t *conf = mddev_to_conf(mddev);
+ struct mirror_info * mirrors = conf->mirrors;
+ int disks = MD_SB_DISKS;
+ int i;
+
+ /* Find the drive.
+ * If it is not operational, then we have already marked it as dead
+	 * else if it is the last working disk, ignore the error, let the
+ * next level up know.
+ * else mark the drive as failed
+ */
+
+ for (i = 0; i < disks; i++)
+ if (mirrors[i].dev==dev && mirrors[i].operational)
+ break;
+ if (i == disks)
+ return 0;
+
+ if (i < conf->raid_disks && conf->working_disks == 1) {
+ /* Don't fail the drive, act as though we were just a
+ * normal single drive
+ */
+
+ return 1;
+ }
+ mark_disk_bad(mddev, i);
+ return 0;
+}
+
+#undef LAST_DISK
+#undef NO_SPARE_DISK
+#undef DISK_FAILED
+#undef START_SYNCING
+
+
+static void print_raid1_conf (raid1_conf_t *conf)
+{
+ int i;
+ struct mirror_info *tmp;
+
+ printk("RAID1 conf printout:\n");
+ if (!conf) {
+ printk("(conf==NULL)\n");
+ return;
+ }
+ printk(" --- wd:%d rd:%d nd:%d\n", conf->working_disks,
+ conf->raid_disks, conf->nr_disks);
+
+ for (i = 0; i < MD_SB_DISKS; i++) {
+ tmp = conf->mirrors + i;
+ printk(" disk %d, s:%d, o:%d, n:%d rd:%d us:%d dev:%s\n",
+ i, tmp->spare,tmp->operational,
+ tmp->number,tmp->raid_disk,tmp->used_slot,
+ partition_name(tmp->dev));
+ }
+}
+
+static void close_sync(raid1_conf_t *conf)
+{
+ mddev_t *mddev = conf->mddev;
+ /* If reconstruction was interrupted, we need to close the "active" and "pending"
+ * holes.
+ * we know that there are no active rebuild requests, os cnt_active == cnt_ready ==0
+	 * we know that there are no active rebuild requests, so cnt_active == cnt_ready == 0
+ /* this is really needed when recovery stops too... */
+ spin_lock_irq(&conf->segment_lock);
+ conf->start_active = conf->start_pending;
+ conf->start_ready = conf->start_pending;
+ wait_event_lock_irq(conf->wait_ready, !conf->cnt_pending, conf->segment_lock);
+	conf->start_active = conf->start_ready = conf->start_pending = conf->start_future;
+ conf->start_future = (mddev->sb->size<<1)+1;
+ conf->cnt_pending = conf->cnt_future;
+ conf->cnt_future = 0;
+ conf->phase = conf->phase ^1;
+ wait_event_lock_irq(conf->wait_ready, !conf->cnt_pending, conf->segment_lock);
+ conf->start_active = conf->start_ready = conf->start_pending = conf->start_future = 0;
+ conf->phase = 0;
+	conf->cnt_future = conf->cnt_done;
+ conf->cnt_done = 0;
+ spin_unlock_irq(&conf->segment_lock);
+ wake_up(&conf->wait_done);
+}
+
+static int raid1_diskop(mddev_t *mddev, mdp_disk_t **d, int state)
+{
+ int err = 0;
+ int i, failed_disk=-1, spare_disk=-1, removed_disk=-1, added_disk=-1;
+ raid1_conf_t *conf = mddev->private;
+ struct mirror_info *tmp, *sdisk, *fdisk, *rdisk, *adisk;
+ mdp_super_t *sb = mddev->sb;
+ mdp_disk_t *failed_desc, *spare_desc, *added_desc;
+ mdk_rdev_t *spare_rdev, *failed_rdev;
+
+ print_raid1_conf(conf);
+
+ switch (state) {
+ case DISKOP_SPARE_ACTIVE:
+ case DISKOP_SPARE_INACTIVE:
+ /* need to wait for pending sync io before locking device */
+ close_sync(conf);
+ }
+
+ md_spin_lock_irq(&conf->device_lock);
+ /*
+ * find the disk ...
+ */
+ switch (state) {
+
+ case DISKOP_SPARE_ACTIVE:
+
+ /*
+ * Find the failed disk within the RAID1 configuration ...
+ * (this can only be in the first conf->working_disks part)
+ */
+ for (i = 0; i < conf->raid_disks; i++) {
+ tmp = conf->mirrors + i;
+ if ((!tmp->operational && !tmp->spare) ||
+ !tmp->used_slot) {
+ failed_disk = i;
+ break;
+ }
+ }
+ /*
+ * When we activate a spare disk we _must_ have a disk in
+ * the lower (active) part of the array to replace.
+ */
+ if ((failed_disk == -1) || (failed_disk >= conf->raid_disks)) {
+ MD_BUG();
+ err = 1;
+ goto abort;
+ }
+ /* fall through */
+
+ case DISKOP_SPARE_WRITE:
+ case DISKOP_SPARE_INACTIVE:
+
+ /*
+ * Find the spare disk ... (can only be in the 'high'
+ * area of the array)
+ */
+ for (i = conf->raid_disks; i < MD_SB_DISKS; i++) {
+ tmp = conf->mirrors + i;
+ if (tmp->spare && tmp->number == (*d)->number) {
+ spare_disk = i;
+ break;
+ }
+ }
+ if (spare_disk == -1) {
+ MD_BUG();
+ err = 1;
+ goto abort;
+ }
+ break;
+
+ case DISKOP_HOT_REMOVE_DISK:
+
+ for (i = 0; i < MD_SB_DISKS; i++) {
+ tmp = conf->mirrors + i;
+ if (tmp->used_slot && (tmp->number == (*d)->number)) {
+ if (tmp->operational) {
+ err = -EBUSY;
+ goto abort;
+ }
+ removed_disk = i;
+ break;
+ }
+ }
+ if (removed_disk == -1) {
+ MD_BUG();
+ err = 1;
+ goto abort;
+ }
+ break;
+
+ case DISKOP_HOT_ADD_DISK:
+
+ for (i = conf->raid_disks; i < MD_SB_DISKS; i++) {
+ tmp = conf->mirrors + i;
+ if (!tmp->used_slot) {
+ added_disk = i;
+ break;
+ }
+ }
+ if (added_disk == -1) {
+ MD_BUG();
+ err = 1;
+ goto abort;
+ }
+ break;
+ }
+
+ switch (state) {
+ /*
+ * Switch the spare disk to write-only mode:
+ */
+ case DISKOP_SPARE_WRITE:
+ sdisk = conf->mirrors + spare_disk;
+ sdisk->operational = 1;
+ sdisk->write_only = 1;
+ break;
+ /*
+ * Deactivate a spare disk:
+ */
+ case DISKOP_SPARE_INACTIVE:
+ if (conf->start_future > 0) {
+ MD_BUG();
+ err = -EBUSY;
+ break;
+ }
+ sdisk = conf->mirrors + spare_disk;
+ sdisk->operational = 0;
+ sdisk->write_only = 0;
+ break;
+ /*
+ * Activate (mark read-write) the (now sync) spare disk,
+	 * which means we switch its 'raid position' (->raid_disk)
+ * with the failed disk. (only the first 'conf->nr_disks'
+ * slots are used for 'real' disks and we must preserve this
+ * property)
+ */
+ case DISKOP_SPARE_ACTIVE:
+ if (conf->start_future > 0) {
+ MD_BUG();
+ err = -EBUSY;
+ break;
+ }
+ sdisk = conf->mirrors + spare_disk;
+ fdisk = conf->mirrors + failed_disk;
+
+ spare_desc = &sb->disks[sdisk->number];
+ failed_desc = &sb->disks[fdisk->number];
+
+ if (spare_desc != *d) {
+ MD_BUG();
+ err = 1;
+ goto abort;
+ }
+
+ if (spare_desc->raid_disk != sdisk->raid_disk) {
+ MD_BUG();
+ err = 1;
+ goto abort;
+ }
+
+ if (sdisk->raid_disk != spare_disk) {
+ MD_BUG();
+ err = 1;
+ goto abort;
+ }
+
+ if (failed_desc->raid_disk != fdisk->raid_disk) {
+ MD_BUG();
+ err = 1;
+ goto abort;
+ }
+
+ if (fdisk->raid_disk != failed_disk) {
+ MD_BUG();
+ err = 1;
+ goto abort;
+ }
+
+ /*
+ * do the switch finally
+ */
+ spare_rdev = find_rdev_nr(mddev, spare_desc->number);
+ failed_rdev = find_rdev_nr(mddev, failed_desc->number);
+
+ /* There must be a spare_rdev, but there may not be a
+ * failed_rdev. That slot might be empty...
+ */
+ spare_rdev->desc_nr = failed_desc->number;
+ if (failed_rdev)
+ failed_rdev->desc_nr = spare_desc->number;
+
+ xchg_values(*spare_desc, *failed_desc);
+ xchg_values(*fdisk, *sdisk);
+
+ /*
+ * (careful, 'failed' and 'spare' are switched from now on)
+ *
+ * we want to preserve linear numbering and we want to
+ * give the proper raid_disk number to the now activated
+ * disk. (this means we switch back these values)
+ */
+
+ xchg_values(spare_desc->raid_disk, failed_desc->raid_disk);
+ xchg_values(sdisk->raid_disk, fdisk->raid_disk);
+ xchg_values(spare_desc->number, failed_desc->number);
+ xchg_values(sdisk->number, fdisk->number);
+
+ *d = failed_desc;
+
+ if (sdisk->dev == MKDEV(0,0))
+ sdisk->used_slot = 0;
+ /*
+ * this really activates the spare.
+ */
+ fdisk->spare = 0;
+ fdisk->write_only = 0;
+
+ /*
+ * if we activate a spare, we definitely replace a
+ * non-operational disk slot in the 'low' area of
+ * the disk array.
+ */
+
+ conf->working_disks++;
+
+ break;
+
+ case DISKOP_HOT_REMOVE_DISK:
+ rdisk = conf->mirrors + removed_disk;
+
+ if (rdisk->spare && (removed_disk < conf->raid_disks)) {
+ MD_BUG();
+ err = 1;
+ goto abort;
+ }
+ rdisk->dev = MKDEV(0,0);
+ rdisk->used_slot = 0;
+ conf->nr_disks--;
+ break;
+
+ case DISKOP_HOT_ADD_DISK:
+ adisk = conf->mirrors + added_disk;
+ added_desc = *d;
+
+ if (added_disk != added_desc->number) {
+ MD_BUG();
+ err = 1;
+ goto abort;
+ }
+
+ adisk->number = added_desc->number;
+ adisk->raid_disk = added_desc->raid_disk;
+ adisk->dev = MKDEV(added_desc->major,added_desc->minor);
+
+ adisk->operational = 0;
+ adisk->write_only = 0;
+ adisk->spare = 1;
+ adisk->used_slot = 1;
+ adisk->head_position = 0;
+ conf->nr_disks++;
+
+ break;
+
+ default:
+ MD_BUG();
+ err = 1;
+ goto abort;
+ }
+abort:
+ md_spin_unlock_irq(&conf->device_lock);
+ if (state == DISKOP_SPARE_ACTIVE || state == DISKOP_SPARE_INACTIVE)
+ /* should move to "END_REBUILD" when such exists */
+ raid1_shrink_buffers(conf);
+
+ print_raid1_conf(conf);
+ return err;
+}
+
+
+#define IO_ERROR KERN_ALERT \
+"raid1: %s: unrecoverable I/O read error for block %lu\n"
+
+#define REDIRECT_SECTOR KERN_ERR \
+"raid1: %s: redirecting sector %lu to another mirror\n"
+
+/*
+ * This is a kernel thread which:
+ *
+ * 1. Retries failed read operations on working mirrors.
+ * 2. Updates the raid superblock when problems are encountered.
+ * 3. Performs writes following reads for array synchronising.
+ */
+static void end_sync_write(struct buffer_head *bh, int uptodate);
+static void end_sync_read(struct buffer_head *bh, int uptodate);
+
+static void raid1d (void *data)
+{
+ struct raid1_bh *r1_bh;
+ struct buffer_head *bh;
+ unsigned long flags;
+ raid1_conf_t *conf = data;
+ mddev_t *mddev = conf->mddev;
+ kdev_t dev;
+
+ if (mddev->sb_dirty)
+ md_update_sb(mddev);
+
+ for (;;) {
+ md_spin_lock_irqsave(&retry_list_lock, flags);
+ r1_bh = raid1_retry_list;
+ if (!r1_bh)
+ break;
+ raid1_retry_list = r1_bh->next_r1;
+ md_spin_unlock_irqrestore(&retry_list_lock, flags);
+
+ mddev = r1_bh->mddev;
+ bh = &r1_bh->bh_req;
+ switch(r1_bh->cmd) {
+ case SPECIAL:
+ /* have to allocate lots of bh structures and
+ * schedule writes
+ */
+ if (test_bit(R1BH_Uptodate, &r1_bh->state)) {
+ int i, sum_bhs = 0;
+ int disks = MD_SB_DISKS;
+ struct buffer_head *bhl, *mbh;
+
+ conf = mddev_to_conf(mddev);
+ bhl = raid1_alloc_bh(conf, conf->raid_disks); /* don't really need this many */
+ for (i = 0; i < disks ; i++) {
+ if (!conf->mirrors[i].operational)
+ continue;
+ if (i==conf->last_used)
+ /* we read from here, no need to write */
+ continue;
+ if (i < conf->raid_disks
+ && !conf->resync_mirrors)
+ /* don't need to write this,
+ * we are just rebuilding */
+ continue;
+ mbh = bhl;
+ if (!mbh) {
+ MD_BUG();
+ break;
+ }
+ bhl = mbh->b_next;
+ mbh->b_this_page = (struct buffer_head *)1;
+
+
+ /*
+ * prepare mirrored bh (fields ordered for max mem throughput):
+ */
+ mbh->b_blocknr = bh->b_blocknr;
+ mbh->b_dev = conf->mirrors[i].dev;
+ mbh->b_rdev = conf->mirrors[i].dev;
+ mbh->b_rsector = bh->b_blocknr;
+ mbh->b_state = (1<<BH_Req) | (1<<BH_Dirty) |
+ (1<<BH_Mapped) | (1<<BH_Lock);
+ atomic_set(&mbh->b_count, 1);
+ mbh->b_size = bh->b_size;
+ mbh->b_page = bh->b_page;
+ mbh->b_data = bh->b_data;
+ mbh->b_list = BUF_LOCKED;
+ mbh->b_end_io = end_sync_write;
+ mbh->b_private = r1_bh;
+
+ mbh->b_next = r1_bh->mirror_bh_list;
+ r1_bh->mirror_bh_list = mbh;
+
+ sum_bhs++;
+ }
+ md_atomic_set(&r1_bh->remaining, sum_bhs);
+ if (bhl) raid1_free_bh(conf, bhl);
+ mbh = r1_bh->mirror_bh_list;
+
+ if (!sum_bhs) {
+					/* nowhere to write this to... I guess we
+ * must be done
+ */
+ sync_request_done(bh->b_blocknr, conf);
+ md_done_sync(mddev, bh->b_size>>9, 0);
+ raid1_free_buf(r1_bh);
+ } else
+ while (mbh) {
+ struct buffer_head *bh1 = mbh;
+ mbh = mbh->b_next;
+ generic_make_request(WRITE, bh1);
+ md_sync_acct(bh1->b_dev, bh1->b_size/512);
+ }
+ } else {
+ /* There is no point trying a read-for-reconstruct
+ * as reconstruct is about to be aborted
+ */
+
+ printk (IO_ERROR, partition_name(bh->b_dev), bh->b_blocknr);
+ md_done_sync(mddev, bh->b_size>>9, 0);
+ }
+
+ break;
+ case READ:
+ case READA:
+ dev = bh->b_dev;
+ raid1_map (mddev, &bh->b_dev);
+ if (bh->b_dev == dev) {
+ printk (IO_ERROR, partition_name(bh->b_dev), bh->b_blocknr);
+ raid1_end_bh_io(r1_bh, 0);
+ } else {
+ printk (REDIRECT_SECTOR,
+ partition_name(bh->b_dev), bh->b_blocknr);
+ bh->b_rdev = bh->b_dev;
+ bh->b_rsector = bh->b_blocknr;
+ generic_make_request (r1_bh->cmd, bh);
+ }
+ break;
+ }
+ }
+ md_spin_unlock_irqrestore(&retry_list_lock, flags);
+}
+#undef IO_ERROR
+#undef REDIRECT_SECTOR
+
+/*
+ * Private kernel thread to reconstruct mirrors after an unclean
+ * shutdown.
+ */
+static void raid1syncd (void *data)
+{
+ raid1_conf_t *conf = data;
+ mddev_t *mddev = conf->mddev;
+
+ if (!conf->resync_mirrors)
+ return;
+ if (mddev->recovery_running != 2)
+ return;
+ if (!md_do_sync(mddev, NULL)) {
+ /*
+ * Only if everything went Ok.
+ */
+ conf->resync_mirrors = 0;
+ }
+
+ close_sync(conf);
+
+}
+
+/*
+ * perform a "sync" on one "block"
+ *
+ * We need to make sure that no normal I/O request - particularly write
+ * requests - conflict with active sync requests.
+ * This is achieved by conceptually dividing the device space into a
+ * number of sections:
+ * DONE: 0 .. a-1 These blocks are in-sync
+ * ACTIVE: a.. b-1 These blocks may have active sync requests, but
+ * no normal IO requests
+ * READY: b .. c-1 These blocks have no normal IO requests - sync
+ * request may be happening
+ * PENDING: c .. d-1 These blocks may have IO requests, but no new
+ * ones will be added
+ * FUTURE: d .. end These blocks are not to be considered yet. IO may
+ * be happening, but not sync
+ *
+ * We keep a
+ * phase which flips (0 or 1) each time d moves and
+ * a count of:
+ * z = active io requests in FUTURE since d moved - marked with
+ * current phase
+ * y = active io requests in FUTURE before d moved, or PENDING -
+ * marked with previous phase
+ * x = active sync requests in READY
+ * w = active sync requests in ACTIVE
+ * v = active io requests in DONE
+ *
+ * Normally, a=b=c=d=0 and z= active io requests
+ * or a=b=c=d=END and v= active io requests
+ * Allowed changes to a,b,c,d:
+ * A: c==d && y==0 -> d+=window, y=z, z=0, phase=!phase
+ * B: y==0 -> c=d
+ * C: b=c, w+=x, x=0
+ * D: w==0 -> a=b
+ * E: a==b==c==d==end -> a=b=c=d=0, z=v, v=0
+ *
+ * At start of sync we apply A.
+ * When y reaches 0, we apply B then A then begin sync requests
+ * When the sync point reaches c-1, we wait for y==0, and w==0, and
+ * then apply B then A then D then C.
+ * Finally, we apply E
+ *
+ * The sync request simply issues a "read" against a working drive
+ * This is marked so that on completion the raid1d thread is woken to
+ * issue suitable write requests
+ */
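+
+/*
+ * Worked example (illustrative values only): suppose the window is 64
+ * sectors.  Each pass of the wait loop in raid1_sync_request() below
+ * waits for cnt_active and cnt_pending to drain and then slides every
+ * boundary up one slot:
+ *   start_active <- start_ready, start_ready <- start_pending,
+ *   start_pending <- start_future, start_future += 64,
+ * flipping the phase and shifting the counters the same way, which is
+ * the combined effect of transitions D, C, B and A above.
+ */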
+
+static int raid1_sync_request (mddev_t *mddev, unsigned long sector_nr)
+{
+ raid1_conf_t *conf = mddev_to_conf(mddev);
+ struct mirror_info *mirror;
+ struct raid1_bh *r1_bh;
+ struct buffer_head *bh;
+ int bsize;
+ int disk;
+ int block_nr;
+ int buffs;
+
+ if (!sector_nr) {
+ /* we want enough buffers to hold twice the window of 128*/
+ buffs = 128 *2 / (PAGE_SIZE>>9);
+ buffs = raid1_grow_buffers(conf, buffs);
+ if (buffs < 2)
+ goto nomem;
+ conf->window = buffs*(PAGE_SIZE>>9)/2;
+ }
+ spin_lock_irq(&conf->segment_lock);
+ if (!sector_nr) {
+ /* initialize ...*/
+ conf->start_active = 0;
+ conf->start_ready = 0;
+ conf->start_pending = 0;
+ conf->start_future = 0;
+ conf->phase = 0;
+
+ conf->cnt_future += conf->cnt_done+conf->cnt_pending;
+ conf->cnt_done = conf->cnt_pending = 0;
+ if (conf->cnt_ready || conf->cnt_active)
+ MD_BUG();
+ }
+ while (sector_nr >= conf->start_pending) {
+ PRINTK("wait .. sect=%lu start_active=%d ready=%d pending=%d future=%d, cnt_done=%d active=%d ready=%d pending=%d future=%d\n",
+ sector_nr, conf->start_active, conf->start_ready, conf->start_pending, conf->start_future,
+ conf->cnt_done, conf->cnt_active, conf->cnt_ready, conf->cnt_pending, conf->cnt_future);
+ wait_event_lock_irq(conf->wait_done,
+ !conf->cnt_active,
+ conf->segment_lock);
+ wait_event_lock_irq(conf->wait_ready,
+ !conf->cnt_pending,
+ conf->segment_lock);
+ conf->start_active = conf->start_ready;
+ conf->start_ready = conf->start_pending;
+ conf->start_pending = conf->start_future;
+ conf->start_future = conf->start_future+conf->window;
+ // Note: falling off the end is not a problem
+ conf->phase = conf->phase ^1;
+ conf->cnt_active = conf->cnt_ready;
+ conf->cnt_ready = 0;
+ conf->cnt_pending = conf->cnt_future;
+ conf->cnt_future = 0;
+ wake_up(&conf->wait_done);
+ }
+ conf->cnt_ready++;
+ spin_unlock_irq(&conf->segment_lock);
+
+
+ /* If reconstructing, and >1 working disk,
+ * could dedicate one to rebuild and others to
+ * service read requests ..
+ */
+ disk = conf->last_used;
+ /* make sure disk is operational */
+ while (!conf->mirrors[disk].operational) {
+ if (disk <= 0) disk = conf->raid_disks;
+ disk--;
+ if (disk == conf->last_used)
+ break;
+ }
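+ /* e.g. (illustrative) with raid_disks == 3 and last_used == 1 the
+ * loop above probes mirrors 1, 0, then 2, and gives up (keeping
+ * disk == last_used) if none of them is operational.
+ */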
+ conf->last_used = disk;
+
+ mirror = conf->mirrors+conf->last_used;
+
+ r1_bh = raid1_alloc_buf (conf);
+ r1_bh->master_bh = NULL;
+ r1_bh->mddev = mddev;
+ r1_bh->cmd = SPECIAL;
+ bh = &r1_bh->bh_req;
+
+ block_nr = sector_nr;
+ bsize = 512;
+ while (!(block_nr & 1) && bsize < PAGE_SIZE
+ && (block_nr+2)*(bsize>>9) < (mddev->sb->size *2)) {
+ block_nr >>= 1;
+ bsize <<= 1;
+ }
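+ /* e.g. (illustrative) for sector_nr == 8 on a 4K-page machine the
+ * loop above grows bsize 512 -> 1024 -> 2048 -> 4096 while halving
+ * block_nr 8 -> 4 -> 2 -> 1, so one page-sized buffer covers
+ * sectors 8..15 and the function returns 8.
+ */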
+ bh->b_size = bsize;
+ bh->b_list = BUF_LOCKED;
+ bh->b_dev = mirror->dev;
+ bh->b_rdev = mirror->dev;
+ bh->b_state = (1<<BH_Req) | (1<<BH_Mapped) | (1<<BH_Lock);
+ if (!bh->b_page)
+ BUG();
+ if (!bh->b_data)
+ BUG();
+ if (bh->b_data != page_address(bh->b_page))
+ BUG();
+ bh->b_end_io = end_sync_read;
+ bh->b_private = r1_bh;
+ bh->b_blocknr = sector_nr;
+ bh->b_rsector = sector_nr;
+ init_waitqueue_head(&bh->b_wait);
+
+ generic_make_request(READ, bh);
+ md_sync_acct(bh->b_dev, bh->b_size/512);
+
+ return (bsize >> 9);
+
+nomem:
+ raid1_shrink_buffers(conf);
+ return -ENOMEM;
+}
+
+static void end_sync_read(struct buffer_head *bh, int uptodate)
+{
+ struct raid1_bh * r1_bh = (struct raid1_bh *)(bh->b_private);
+
+ /* we have read a block, now it needs to be re-written,
+ * or re-read if the read failed.
+ * We don't do much here, just schedule handling by raid1d
+ */
+ if (!uptodate)
+ md_error (r1_bh->mddev, bh->b_dev);
+ else
+ set_bit(R1BH_Uptodate, &r1_bh->state);
+ raid1_reschedule_retry(r1_bh);
+}
+
+static void end_sync_write(struct buffer_head *bh, int uptodate)
+{
+ struct raid1_bh * r1_bh = (struct raid1_bh *)(bh->b_private);
+
+ if (!uptodate)
+ md_error (r1_bh->mddev, bh->b_dev);
+ if (atomic_dec_and_test(&r1_bh->remaining)) {
+ mddev_t *mddev = r1_bh->mddev;
+ unsigned long sect = bh->b_blocknr;
+ int size = bh->b_size;
+ raid1_free_buf(r1_bh);
+ sync_request_done(sect, mddev_to_conf(mddev));
+ md_done_sync(mddev,size>>9, uptodate);
+ }
+}
+
+#define INVALID_LEVEL KERN_WARNING \
+"raid1: md%d: raid level not set to mirroring (%d)\n"
+
+#define NO_SB KERN_ERR \
+"raid1: disabled mirror %s (couldn't access raid superblock)\n"
+
+#define ERRORS KERN_ERR \
+"raid1: disabled mirror %s (errors detected)\n"
+
+#define NOT_IN_SYNC KERN_ERR \
+"raid1: disabled mirror %s (not in sync)\n"
+
+#define INCONSISTENT KERN_ERR \
+"raid1: disabled mirror %s (inconsistent descriptor)\n"
+
+#define ALREADY_RUNNING KERN_ERR \
+"raid1: disabled mirror %s (mirror %d already operational)\n"
+
+#define OPERATIONAL KERN_INFO \
+"raid1: device %s operational as mirror %d\n"
+
+#define MEM_ERROR KERN_ERR \
+"raid1: couldn't allocate memory for md%d\n"
+
+#define SPARE KERN_INFO \
+"raid1: spare disk %s\n"
+
+#define NONE_OPERATIONAL KERN_ERR \
+"raid1: no operational mirrors for md%d\n"
+
+#define ARRAY_IS_ACTIVE KERN_INFO \
+"raid1: raid set md%d active with %d out of %d mirrors\n"
+
+#define THREAD_ERROR KERN_ERR \
+"raid1: couldn't allocate thread for md%d\n"
+
+#define START_RESYNC KERN_WARNING \
+"raid1: raid set md%d not clean; reconstructing mirrors\n"
+
+static int raid1_run (mddev_t *mddev)
+{
+ raid1_conf_t *conf;
+ int i, j, disk_idx;
+ struct mirror_info *disk;
+ mdp_super_t *sb = mddev->sb;
+ mdp_disk_t *descriptor;
+ mdk_rdev_t *rdev;
+ struct md_list_head *tmp;
+ int start_recovery = 0;
+
+ MOD_INC_USE_COUNT;
+
+ if (sb->level != 1) {
+ printk(INVALID_LEVEL, mdidx(mddev), sb->level);
+ goto out;
+ }
+ /*
+ * copy the already verified devices into our private RAID1
+ * bookkeeping area. [whatever we allocate in raid1_run(),
+ * should be freed in raid1_stop()]
+ */
+
+ conf = kmalloc(sizeof(raid1_conf_t), GFP_KERNEL);
+ mddev->private = conf;
+ if (!conf) {
+ printk(MEM_ERROR, mdidx(mddev));
+ goto out;
+ }
+ memset(conf, 0, sizeof(*conf));
+
+ ITERATE_RDEV(mddev,rdev,tmp) {
+ if (rdev->faulty) {
+ printk(ERRORS, partition_name(rdev->dev));
+ } else {
+ if (!rdev->sb) {
+ MD_BUG();
+ continue;
+ }
+ }
+ if (rdev->desc_nr == -1) {
+ MD_BUG();
+ continue;
+ }
+ descriptor = &sb->disks[rdev->desc_nr];
+ disk_idx = descriptor->raid_disk;
+ disk = conf->mirrors + disk_idx;
+
+ if (disk_faulty(descriptor)) {
+ disk->number = descriptor->number;
+ disk->raid_disk = disk_idx;
+ disk->dev = rdev->dev;
+ disk->sect_limit = MAX_WORK_PER_DISK;
+ disk->operational = 0;
+ disk->write_only = 0;
+ disk->spare = 0;
+ disk->used_slot = 1;
+ disk->head_position = 0;
+ continue;
+ }
+ if (disk_active(descriptor)) {
+ if (!disk_sync(descriptor)) {
+ printk(NOT_IN_SYNC,
+ partition_name(rdev->dev));
+ continue;
+ }
+ if ((descriptor->number > MD_SB_DISKS) ||
+ (disk_idx > sb->raid_disks)) {
+
+ printk(INCONSISTENT,
+ partition_name(rdev->dev));
+ continue;
+ }
+ if (disk->operational) {
+ printk(ALREADY_RUNNING,
+ partition_name(rdev->dev),
+ disk_idx);
+ continue;
+ }
+ printk(OPERATIONAL, partition_name(rdev->dev),
+ disk_idx);
+ disk->number = descriptor->number;
+ disk->raid_disk = disk_idx;
+ disk->dev = rdev->dev;
+ disk->sect_limit = MAX_WORK_PER_DISK;
+ disk->operational = 1;
+ disk->write_only = 0;
+ disk->spare = 0;
+ disk->used_slot = 1;
+ disk->head_position = 0;
+ conf->working_disks++;
+ } else {
+ /*
+ * Must be a spare disk ..
+ */
+ printk(SPARE, partition_name(rdev->dev));
+ disk->number = descriptor->number;
+ disk->raid_disk = disk_idx;
+ disk->dev = rdev->dev;
+ disk->sect_limit = MAX_WORK_PER_DISK;
+ disk->operational = 0;
+ disk->write_only = 0;
+ disk->spare = 1;
+ disk->used_slot = 1;
+ disk->head_position = 0;
+ }
+ }
+ conf->raid_disks = sb->raid_disks;
+ conf->nr_disks = sb->nr_disks;
+ conf->mddev = mddev;
+ conf->device_lock = MD_SPIN_LOCK_UNLOCKED;
+
+ conf->segment_lock = MD_SPIN_LOCK_UNLOCKED;
+ init_waitqueue_head(&conf->wait_buffer);
+ init_waitqueue_head(&conf->wait_done);
+ init_waitqueue_head(&conf->wait_ready);
+
+ if (!conf->working_disks) {
+ printk(NONE_OPERATIONAL, mdidx(mddev));
+ goto out_free_conf;
+ }
+
+
+ /* pre-allocate some buffer_head structures.
+ * As a minimum, 1 r1bh and raid_disks buffer_heads
+ * would probably get us by in tight memory situations,
+ * but a few more is probably a good idea.
+ * For now, try NR_RESERVED_BUFS r1bh and
+ * NR_RESERVED_BUFS*raid_disks bufferheads
+ * This will allow at least NR_RESERVED_BUFS concurrent
+ * reads or writes even if kmalloc starts failing
+ */
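+ /* e.g. (hypothetical numbers) if NR_RESERVED_BUFS were 32 and the
+ * set had 2 mirrors, this would reserve 32 r1bh structures plus
+ * 64 buffer_heads up front.
+ */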
+ if (raid1_grow_r1bh(conf, NR_RESERVED_BUFS) < NR_RESERVED_BUFS ||
+ raid1_grow_bh(conf, NR_RESERVED_BUFS*conf->raid_disks)
+ < NR_RESERVED_BUFS*conf->raid_disks) {
+ printk(MEM_ERROR, mdidx(mddev));
+ goto out_free_conf;
+ }
+
+ for (i = 0; i < MD_SB_DISKS; i++) {
+
+ descriptor = sb->disks+i;
+ disk_idx = descriptor->raid_disk;
+ disk = conf->mirrors + disk_idx;
+
+ if (disk_faulty(descriptor) && (disk_idx < conf->raid_disks) &&
+ !disk->used_slot) {
+
+ disk->number = descriptor->number;
+ disk->raid_disk = disk_idx;
+ disk->dev = MKDEV(0,0);
+
+ disk->operational = 0;
+ disk->write_only = 0;
+ disk->spare = 0;
+ disk->used_slot = 1;
+ disk->head_position = 0;
+ }
+ }
+
+ /*
+ * find the first working one and use it as a starting point
+ * for read balancing.
+ */
+ for (j = 0; j < MD_SB_DISKS && !conf->mirrors[j].operational; j++)
+ /* nothing */;
+ conf->last_used = j;
+
+
+
+ {
+ const char * name = "raid1d";
+
+ conf->thread = md_register_thread(raid1d, conf, name);
+ if (!conf->thread) {
+ printk(THREAD_ERROR, mdidx(mddev));
+ goto out_free_conf;
+ }
+ }
+
+ if (!start_recovery && !(sb->state & (1 << MD_SB_CLEAN)) &&
+ (conf->working_disks > 1)) {
+ const char * name = "raid1syncd";
+
+ conf->resync_thread = md_register_thread(raid1syncd, conf,name);
+ if (!conf->resync_thread) {
+ printk(THREAD_ERROR, mdidx(mddev));
+ goto out_free_conf;
+ }
+
+ printk(START_RESYNC, mdidx(mddev));
+ conf->resync_mirrors = 1;
+ mddev->recovery_running = 2;
+ md_wakeup_thread(conf->resync_thread);
+ }
+
+ /*
+ * Regenerate the "device is in sync with the raid set" bit for
+ * each device.
+ */
+ for (i = 0; i < MD_SB_DISKS; i++) {
+ mark_disk_nonsync(sb->disks+i);
+ for (j = 0; j < sb->raid_disks; j++) {
+ if (!conf->mirrors[j].operational)
+ continue;
+ if (sb->disks[i].number == conf->mirrors[j].number)
+ mark_disk_sync(sb->disks+i);
+ }
+ }
+ sb->active_disks = conf->working_disks;
+
+ printk(ARRAY_IS_ACTIVE, mdidx(mddev), sb->active_disks, sb->raid_disks);
+ /*
+ * Ok, everything is just fine now
+ */
+ return 0;
+
+out_free_conf:
+ raid1_shrink_r1bh(conf);
+ raid1_shrink_bh(conf);
+ raid1_shrink_buffers(conf);
+ kfree(conf);
+ mddev->private = NULL;
+out:
+ MOD_DEC_USE_COUNT;
+ return -EIO;
+}
+
+#undef INVALID_LEVEL
+#undef NO_SB
+#undef ERRORS
+#undef NOT_IN_SYNC
+#undef INCONSISTENT
+#undef ALREADY_RUNNING
+#undef OPERATIONAL
+#undef SPARE
+#undef NONE_OPERATIONAL
+#undef ARRAY_IS_ACTIVE
+
+static int raid1_stop_resync (mddev_t *mddev)
+{
+ raid1_conf_t *conf = mddev_to_conf(mddev);
+
+ if (conf->resync_thread) {
+ if (conf->resync_mirrors) {
+ md_interrupt_thread(conf->resync_thread);
+
+ printk(KERN_INFO "raid1: mirror resync was not fully finished, restarting next time.\n");
+ return 1;
+ }
+ return 0;
+ }
+ return 0;
+}
+
+static int raid1_restart_resync (mddev_t *mddev)
+{
+ raid1_conf_t *conf = mddev_to_conf(mddev);
+
+ if (conf->resync_mirrors) {
+ if (!conf->resync_thread) {
+ MD_BUG();
+ return 0;
+ }
+ mddev->recovery_running = 2;
+ md_wakeup_thread(conf->resync_thread);
+ return 1;
+ }
+ return 0;
+}
+
+static int raid1_stop (mddev_t *mddev)
+{
+ raid1_conf_t *conf = mddev_to_conf(mddev);
+
+ md_unregister_thread(conf->thread);
+ if (conf->resync_thread)
+ md_unregister_thread(conf->resync_thread);
+ raid1_shrink_r1bh(conf);
+ raid1_shrink_bh(conf);
+ raid1_shrink_buffers(conf);
+ kfree(conf);
+ mddev->private = NULL;
+ MOD_DEC_USE_COUNT;
+ return 0;
+}
+
+static mdk_personality_t raid1_personality=
+{
+ name: "raid1",
+ make_request: raid1_make_request,
+ run: raid1_run,
+ stop: raid1_stop,
+ status: raid1_status,
+ error_handler: raid1_error,
+ diskop: raid1_diskop,
+ stop_resync: raid1_stop_resync,
+ restart_resync: raid1_restart_resync,
+ sync_request: raid1_sync_request
+};
+
+static int md__init raid1_init (void)
+{
+ return register_md_personality (RAID1, &raid1_personality);
+}
+
+static void raid1_exit (void)
+{
+ unregister_md_personality (RAID1);
+}
+
+module_init(raid1_init);
+module_exit(raid1_exit);
+MODULE_LICENSE("GPL");
diff --git a/tests/linux/md-resync/patch b/tests/linux/md-resync/patch
new file mode 100644
index 0000000..1ed2ab1
--- /dev/null
+++ b/tests/linux/md-resync/patch
@@ -0,0 +1,312 @@
+***************
+*** 333,339 ****
+ * device if no resync is going on, or below the resync window.
+ * We take the first readable disk when above the resync window.
+ */
+- if (conf->resync_mirrors && (this_sector + sectors >= conf->next_resync)) {
+ /* make sure that disk is operational */
+ new_disk = 0;
+ while (!conf->mirrors[new_disk].operational || conf->mirrors[new_disk].write_only) {
+--- 333,339 ----
+ * device if no resync is going on, or below the resync window.
+ * We take the first readable disk when above the resync window.
+ */
++ if (!conf->mddev->in_sync && (this_sector + sectors >= conf->next_resync)) {
+ /* make sure that disk is operational */
+ new_disk = 0;
+ while (!conf->mirrors[new_disk].operational || conf->mirrors[new_disk].write_only) {
+***************
+*** 652,657 ****
+ if (conf->barrier) BUG();
+ if (waitqueue_active(&conf->wait_idle)) BUG();
+ if (waitqueue_active(&conf->wait_resume)) BUG();
+ }
+
+ static int diskop(mddev_t *mddev, mdp_disk_t **d, int state)
+--- 652,660 ----
+ if (conf->barrier) BUG();
+ if (waitqueue_active(&conf->wait_idle)) BUG();
+ if (waitqueue_active(&conf->wait_resume)) BUG();
++
++ mempool_destroy(conf->r1buf_pool);
++ conf->r1buf_pool = NULL;
+ }
+
+ static int diskop(mddev_t *mddev, mdp_disk_t **d, int state)
+***************
+*** 768,774 ****
+ * Deactivate a spare disk:
+ */
+ case DISKOP_SPARE_INACTIVE:
+- close_sync(conf);
+ sdisk = conf->mirrors + spare_disk;
+ sdisk->operational = 0;
+ sdisk->write_only = 0;
+--- 771,776 ----
+ * Deactivate a spare disk:
+ */
+ case DISKOP_SPARE_INACTIVE:
+ sdisk = conf->mirrors + spare_disk;
+ sdisk->operational = 0;
+ sdisk->write_only = 0;
+***************
+*** 781,787 ****
+ * property)
+ */
+ case DISKOP_SPARE_ACTIVE:
+- close_sync(conf);
+ sdisk = conf->mirrors + spare_disk;
+ fdisk = conf->mirrors + failed_disk;
+
+--- 783,788 ----
+ * property)
+ */
+ case DISKOP_SPARE_ACTIVE:
+ sdisk = conf->mirrors + spare_disk;
+ fdisk = conf->mirrors + failed_disk;
+
+***************
+*** 915,924 ****
+ }
+ abort:
+ spin_unlock_irq(&conf->device_lock);
+- if (state == DISKOP_SPARE_ACTIVE || state == DISKOP_SPARE_INACTIVE) {
+- mempool_destroy(conf->r1buf_pool);
+- conf->r1buf_pool = NULL;
+- }
+
+ print_conf(conf);
+ return err;
+--- 916,921 ----
+ }
+ abort:
+ spin_unlock_irq(&conf->device_lock);
+
+ print_conf(conf);
+ return err;
+***************
+*** 1008,1014 ****
+ * we read from here, no need to write
+ */
+ continue;
+- if (i < conf->raid_disks && !conf->resync_mirrors)
+ /*
+ * don't need to write this we are just rebuilding
+ */
+--- 1005,1011 ----
+ * we read from here, no need to write
+ */
+ continue;
++ if (i < conf->raid_disks && mddev->in_sync)
+ /*
+ * don't need to write this we are just rebuilding
+ */
+***************
+*** 1113,1141 ****
+ spin_unlock_irqrestore(&retry_list_lock, flags);
+ }
+
+- /*
+- * Private kernel thread to reconstruct mirrors after an unclean
+- * shutdown.
+- */
+- static void raid1syncd(void *data)
+- {
+- conf_t *conf = data;
+- mddev_t *mddev = conf->mddev;
+-
+- if (!conf->resync_mirrors)
+- return;
+- if (mddev->recovery_running != 2)
+- return;
+- if (!md_do_sync(mddev, NULL)) {
+- /*
+- * Only if everything went Ok.
+- */
+- conf->resync_mirrors = 0;
+- }
+-
+- close_sync(conf);
+-
+- }
+
+ static int init_resync(conf_t *conf)
+ {
+--- 1110,1115 ----
+ spin_unlock_irqrestore(&retry_list_lock, flags);
+ }
+
+
+ static int init_resync(conf_t *conf)
+ {
+***************
+*** 1170,1178 ****
+ sector_t max_sector, nr_sectors;
+ int disk, partial;
+
+- if (!sector_nr)
+ if (init_resync(conf))
+ return -ENOMEM;
+ /*
+ * If there is non-resync activity waiting for us then
+ * put in a delay to throttle resync.
+--- 1144,1159 ----
+ sector_t max_sector, nr_sectors;
+ int disk, partial;
+
++ if (sector_nr == 0)
+ if (init_resync(conf))
+ return -ENOMEM;
++
++ max_sector = mddev->sb->size << 1;
++ if (sector_nr >= max_sector) {
++ close_sync(conf);
++ return 0;
++ }
++
+ /*
+ * If there is non-resync activity waiting for us then
+ * put in a delay to throttle resync.
+***************
+*** 1209,1218 ****
+ r1_bio->sector = sector_nr;
+ r1_bio->cmd = SPECIAL;
+
+- max_sector = mddev->sb->size << 1;
+- if (sector_nr >= max_sector)
+- BUG();
+-
+ bio = r1_bio->master_bio;
+ nr_sectors = RESYNC_BLOCK_SIZE >> 9;
+ if (max_sector - sector_nr < nr_sectors)
+--- 1190,1195 ----
+ r1_bio->sector = sector_nr;
+ r1_bio->cmd = SPECIAL;
+
+ bio = r1_bio->master_bio;
+ nr_sectors = RESYNC_BLOCK_SIZE >> 9;
+ if (max_sector - sector_nr < nr_sectors)
+***************
+*** 1295,1301 ****
+ mdp_disk_t *descriptor;
+ mdk_rdev_t *rdev;
+ struct list_head *tmp;
+- int start_recovery = 0;
+
+ MOD_INC_USE_COUNT;
+
+--- 1272,1277 ----
+ mdp_disk_t *descriptor;
+ mdk_rdev_t *rdev;
+ struct list_head *tmp;
+
+ MOD_INC_USE_COUNT;
+
+***************
+*** 1716,1736 ****
+ }
+ }
+
+- if (!start_recovery && !(sb->state & (1 << MD_SB_CLEAN)) &&
+- (conf->working_disks > 1)) {
+- const char * name = "raid1syncd";
+-
+- conf->resync_thread = md_register_thread(raid1syncd, conf, name);
+- if (!conf->resync_thread) {
+- printk(THREAD_ERROR, mdidx(mddev));
+- goto out_free_conf;
+- }
+-
+- printk(START_RESYNC, mdidx(mddev));
+- conf->resync_mirrors = 1;
+- mddev->recovery_running = 2;
+- md_wakeup_thread(conf->resync_thread);
+- }
+
+ /*
+ * Regenerate the "device is in sync with the raid set" bit for
+--- 1688,1693 ----
+ }
+ }
+
+
+ /*
+ * Regenerate the "device is in sync with the raid set" bit for
+***************
+*** 1770,1815 ****
+ return -EIO;
+ }
+
+- static int stop_resync(mddev_t *mddev)
+- {
+- conf_t *conf = mddev_to_conf(mddev);
+-
+- if (conf->resync_thread) {
+- if (conf->resync_mirrors) {
+- md_interrupt_thread(conf->resync_thread);
+-
+- printk(KERN_INFO "raid1: mirror resync was not fully finished, restarting next time.\n");
+- return 1;
+- }
+- return 0;
+- }
+- return 0;
+- }
+-
+- static int restart_resync(mddev_t *mddev)
+- {
+- conf_t *conf = mddev_to_conf(mddev);
+-
+- if (conf->resync_mirrors) {
+- if (!conf->resync_thread) {
+- MD_BUG();
+- return 0;
+- }
+- mddev->recovery_running = 2;
+- md_wakeup_thread(conf->resync_thread);
+- return 1;
+- }
+- return 0;
+- }
+-
+ static int stop(mddev_t *mddev)
+ {
+ conf_t *conf = mddev_to_conf(mddev);
+ int i;
+
+ md_unregister_thread(conf->thread);
+- if (conf->resync_thread)
+- md_unregister_thread(conf->resync_thread);
+ if (conf->r1bio_pool)
+ mempool_destroy(conf->r1bio_pool);
+ for (i = 0; i < MD_SB_DISKS; i++)
+--- 1723,1734 ----
+ return -EIO;
+ }
+
+ static int stop(mddev_t *mddev)
+ {
+ conf_t *conf = mddev_to_conf(mddev);
+ int i;
+
+ md_unregister_thread(conf->thread);
+ if (conf->r1bio_pool)
+ mempool_destroy(conf->r1bio_pool);
+ for (i = 0; i < MD_SB_DISKS; i++)
+***************
+*** 1830,1837 ****
+ status: status,
+ error_handler: error,
+ diskop: diskop,
+- stop_resync: stop_resync,
+- restart_resync: restart_resync,
+ sync_request: sync_request
+ };
+
+--- 1749,1754 ----
+ status: status,
+ error_handler: error,
+ diskop: diskop,
+ sync_request: sync_request
+ };
+
diff --git a/tests/linux/md/diff b/tests/linux/md/diff
new file mode 100644
index 0000000..77e3f76
--- /dev/null
+++ b/tests/linux/md/diff
@@ -0,0 +1,3680 @@
+@@ -1,3674 +1,101 @@
+-/*
+- md.c : Multiple Devices driver for Linux
+- Copyright (C) 1998, 1999, 2000 Ingo Molnar
+-
+- completely rewritten, based on the MD driver code from Marc Zyngier
+-
+- Changes:
+-
+- - RAID-1/RAID-5 extensions by Miguel de Icaza, Gadi Oxman, Ingo Molnar
+- - boot support for linear and striped mode by Harald Hoyer <HarryH@Royal.Net>
+- - kerneld support by Boris Tobotras <boris@xtalk.msk.su>
+- - kmod support by: Cyrus Durgin
+- - RAID0 bugfixes: Mark Anthony Lisher <markal@iname.com>
+- - Devfs support by Richard Gooch <rgooch@atnf.csiro.au>
+-
+- - lots of fixes and improvements to the RAID1/RAID5 and generic
+- RAID code (such as request based resynchronization):
+-
+- Neil Brown <neilb@cse.unsw.edu.au>.
+-
+- This program is free software; you can redistribute it and/or modify
+- it under the terms of the GNU General Public License as published by
+- the Free Software Foundation; either version 2, or (at your option)
+- any later version.
+-
+- You should have received a copy of the GNU General Public License
+- (for example /usr/src/linux/COPYING); if not, write to the Free
+- Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+-*/
+-
+-#include <linux/module.h>
+-#include <linux/config.h>
+-#include <linux/linkage.h>
+-#include <linux/raid/md.h>
+-#include <linux/sysctl.h>
+-#include <linux/bio.h>
+-#include <linux/devfs_fs_kernel.h>
+-#include <linux/buffer_head.h> /* for invalidate_bdev */
+-#include <linux/suspend.h>
+-
+-#include <linux/init.h>
+-
+-#ifdef CONFIG_KMOD
+-#include <linux/kmod.h>
+-#endif
+-
+-#define __KERNEL_SYSCALLS__
+-#include <linux/unistd.h>
+-
+-#include <asm/unaligned.h>
+-
+-#define MAJOR_NR MD_MAJOR
+-#define MD_DRIVER
+-#define DEVICE_NR(device) (minor(device))
+-
+-#include <linux/blk.h>
+-
+-#define DEBUG 0
+-#define dprintk(x...) ((void)(DEBUG && printk(x)))
+-
+-
+-#ifndef MODULE
+-static void autostart_arrays (void);
+-#endif
+-
+-static mdk_personality_t *pers[MAX_PERSONALITY];
+-static spinlock_t pers_lock = SPIN_LOCK_UNLOCKED;
+-
+-/*
+- * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit'
+- * is 1000 KB/sec, so the extra system load does not show up that much.
+- * Increase it if you want to have more _guaranteed_ speed. Note that
+- * the RAID driver will use the maximum available bandwidth if the IO
+- * subsystem is idle. There is also an 'absolute maximum' reconstruction
+- * speed limit - in case reconstruction slows down your system despite
+- * idle IO detection.
+- *
+- * you can change it via /proc/sys/dev/raid/speed_limit_min and _max.
+- */
+-
+-static int sysctl_speed_limit_min = 1000;
+-static int sysctl_speed_limit_max = 200000;
+-
+-static struct ctl_table_header *raid_table_header;
+-
+-static ctl_table raid_table[] = {
+- {
+- .ctl_name = DEV_RAID_SPEED_LIMIT_MIN,
+- .procname = "speed_limit_min",
+- .data = &sysctl_speed_limit_min,
+- .maxlen = sizeof(int),
+- .mode = 0644,
+- .proc_handler = &proc_dointvec,
+- },
+- {
+- .ctl_name = DEV_RAID_SPEED_LIMIT_MAX,
+- .procname = "speed_limit_max",
+- .data = &sysctl_speed_limit_max,
+- .maxlen = sizeof(int),
+- .mode = 0644,
+- .proc_handler = &proc_dointvec,
+- },
+- { .ctl_name = 0 }
+-};
+-
+-static ctl_table raid_dir_table[] = {
+- {
+- .ctl_name = DEV_RAID,
+- .procname = "raid",
+- .maxlen = 0,
+- .mode = 0555,
+- .child = raid_table,
+- },
+- { .ctl_name = 0 }
+-};
+-
+-static ctl_table raid_root_table[] = {
+- {
+- .ctl_name = CTL_DEV,
+- .procname = "dev",
+- .maxlen = 0,
+- .mode = 0555,
+- .child = raid_dir_table,
+- },
+- { .ctl_name = 0 }
+-};
+-
+-static struct block_device_operations md_fops;
+-
+-static struct gendisk *disks[MAX_MD_DEVS];
+-
+-/*
+- * Allows iterating over all existing md arrays
+- * all_mddevs_lock protects this list as well as mddev_map.
+- */
+-static LIST_HEAD(all_mddevs);
+-static spinlock_t all_mddevs_lock = SPIN_LOCK_UNLOCKED;
+-
+-
+-/*
+- * iterates through all used mddevs in the system.
+- * We take care to grab the all_mddevs_lock whenever navigating
+- * the list, and to always hold a refcount when unlocked.
+- * Any code which breaks out of this loop still owns
+- * a reference to the current mddev and must mddev_put it.
+- */
+-#define ITERATE_MDDEV(mddev,tmp) \
+- \
+- for (({ spin_lock(&all_mddevs_lock); \
+- tmp = all_mddevs.next; \
+- mddev = NULL;}); \
+- ({ if (tmp != &all_mddevs) \
+- mddev_get(list_entry(tmp, mddev_t, all_mddevs));\
+- spin_unlock(&all_mddevs_lock); \
+- if (mddev) mddev_put(mddev); \
+- mddev = list_entry(tmp, mddev_t, all_mddevs); \
+- tmp != &all_mddevs;}); \
+- ({ spin_lock(&all_mddevs_lock); \
+- tmp = tmp->next;}) \
+- )
+-
+-static mddev_t *mddev_map[MAX_MD_DEVS];
+-
+-static int md_fail_request (request_queue_t *q, struct bio *bio)
+-{
+- bio_io_error(bio, bio->bi_size);
+- return 0;
+-}
+-
+-static inline mddev_t *mddev_get(mddev_t *mddev)
+-{
+- atomic_inc(&mddev->active);
+- return mddev;
+-}
+-
+-static void mddev_put(mddev_t *mddev)
+-{
+- if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock))
+- return;
+- if (!mddev->raid_disks && list_empty(&mddev->disks)) {
+- list_del(&mddev->all_mddevs);
+- mddev_map[mdidx(mddev)] = NULL;
+- kfree(mddev);
+- MOD_DEC_USE_COUNT;
+- }
+- spin_unlock(&all_mddevs_lock);
+-}
+-
+-static mddev_t * mddev_find(int unit)
+-{
+- mddev_t *mddev, *new = NULL;
+-
+- retry:
+- spin_lock(&all_mddevs_lock);
+- if (mddev_map[unit]) {
+- mddev = mddev_get(mddev_map[unit]);
+- spin_unlock(&all_mddevs_lock);
+- if (new)
+- kfree(new);
+- return mddev;
+- }
+- if (new) {
+- mddev_map[unit] = new;
+- list_add(&new->all_mddevs, &all_mddevs);
+- spin_unlock(&all_mddevs_lock);
+- MOD_INC_USE_COUNT;
+- return new;
+- }
+- spin_unlock(&all_mddevs_lock);
+-
+- new = (mddev_t *) kmalloc(sizeof(*new), GFP_KERNEL);
+- if (!new)
+- return NULL;
+-
+- memset(new, 0, sizeof(*new));
+-
+- new->__minor = unit;
+- init_MUTEX(&new->reconfig_sem);
+- INIT_LIST_HEAD(&new->disks);
+- INIT_LIST_HEAD(&new->all_mddevs);
+- init_timer(&new->safemode_timer);
+- atomic_set(&new->active, 1);
+- blk_queue_make_request(&new->queue, md_fail_request);
+-
+- goto retry;
+-}
+-
+-static inline int mddev_lock(mddev_t * mddev)
+-{
+- return down_interruptible(&mddev->reconfig_sem);
+-}
+-
+-static inline void mddev_lock_uninterruptible(mddev_t * mddev)
+-{
+- down(&mddev->reconfig_sem);
+-}
+-
+-static inline int mddev_trylock(mddev_t * mddev)
+-{
+- return down_trylock(&mddev->reconfig_sem);
+-}
+-
+-static inline void mddev_unlock(mddev_t * mddev)
+-{
+- up(&mddev->reconfig_sem);
+-}
+-
+-mdk_rdev_t * find_rdev_nr(mddev_t *mddev, int nr)
+-{
+- mdk_rdev_t * rdev;
+- struct list_head *tmp;
+-
+- ITERATE_RDEV(mddev,rdev,tmp) {
+- if (rdev->desc_nr == nr)
+- return rdev;
+- }
+- return NULL;
+-}
+-
+-static mdk_rdev_t * find_rdev(mddev_t * mddev, dev_t dev)
+-{
+- struct list_head *tmp;
+- mdk_rdev_t *rdev;
+-
+- ITERATE_RDEV(mddev,rdev,tmp) {
+- if (rdev->bdev->bd_dev == dev)
+- return rdev;
+- }
+- return NULL;
+-}
+-
+-inline static sector_t calc_dev_sboffset(struct block_device *bdev)
+-{
+- sector_t size = bdev->bd_inode->i_size >> BLOCK_SIZE_BITS;
+- return MD_NEW_SIZE_BLOCKS(size);
+-}
+-
+-static sector_t calc_dev_size(mdk_rdev_t *rdev, unsigned chunk_size)
+-{
+- sector_t size;
+-
+- size = rdev->sb_offset;
+-
+- if (chunk_size)
+- size &= ~((sector_t)chunk_size/1024 - 1);
+- return size;
+-}
+-
+-static int alloc_disk_sb(mdk_rdev_t * rdev)
+-{
+- if (rdev->sb_page)
+- MD_BUG();
+-
+- rdev->sb_page = alloc_page(GFP_KERNEL);
+- if (!rdev->sb_page) {
+- printk(KERN_ALERT "md: out of memory.\n");
+- return -EINVAL;
+- }
+-
+- return 0;
+-}
+-
+-static void free_disk_sb(mdk_rdev_t * rdev)
+-{
+- if (rdev->sb_page) {
+- page_cache_release(rdev->sb_page);
+- rdev->sb_loaded = 0;
+- rdev->sb_page = NULL;
+- rdev->sb_offset = 0;
+- rdev->size = 0;
+- }
+-}
+-
+-
+-static int bi_complete(struct bio *bio, unsigned int bytes_done, int error)
+-{
+- if (bio->bi_size)
+- return 1;
+-
+- complete((struct completion*)bio->bi_private);
+- return 0;
+-}
+-
+-static int sync_page_io(struct block_device *bdev, sector_t sector, int size,
+- struct page *page, int rw)
+-{
+- struct bio bio;
+- struct bio_vec vec;
+- struct completion event;
+-
+- bio_init(&bio);
+- bio.bi_io_vec = &vec;
+- vec.bv_page = page;
+- vec.bv_len = size;
+- vec.bv_offset = 0;
+- bio.bi_vcnt = 1;
+- bio.bi_idx = 0;
+- bio.bi_size = size;
+- bio.bi_bdev = bdev;
+- bio.bi_sector = sector;
+- init_completion(&event);
+- bio.bi_private = &event;
+- bio.bi_end_io = bi_complete;
+- submit_bio(rw, &bio);
+- blk_run_queues();
+- wait_for_completion(&event);
+-
+- return test_bit(BIO_UPTODATE, &bio.bi_flags);
+-}
+-
+-static int read_disk_sb(mdk_rdev_t * rdev)
+-{
+-
+- if (!rdev->sb_page) {
+- MD_BUG();
+- return -EINVAL;
+- }
+- if (rdev->sb_loaded)
+- return 0;
+-
+-
+- if (!sync_page_io(rdev->bdev, rdev->sb_offset<<1, MD_SB_BYTES, rdev->sb_page, READ))
+- goto fail;
+- rdev->sb_loaded = 1;
+- return 0;
+-
+-fail:
+- printk(KERN_ERR "md: disabled device %s, could not read superblock.\n",
+- bdev_partition_name(rdev->bdev));
+- return -EINVAL;
+-}
+-
+-static int uuid_equal(mdp_super_t *sb1, mdp_super_t *sb2)
+-{
+- if ( (sb1->set_uuid0 == sb2->set_uuid0) &&
+- (sb1->set_uuid1 == sb2->set_uuid1) &&
+- (sb1->set_uuid2 == sb2->set_uuid2) &&
+- (sb1->set_uuid3 == sb2->set_uuid3))
+-
+- return 1;
+-
+- return 0;
+-}
+-
+-
+-static int sb_equal(mdp_super_t *sb1, mdp_super_t *sb2)
+-{
+- int ret;
+- mdp_super_t *tmp1, *tmp2;
+-
+- tmp1 = kmalloc(sizeof(*tmp1),GFP_KERNEL);
+- tmp2 = kmalloc(sizeof(*tmp2),GFP_KERNEL);
+-
+- if (!tmp1 || !tmp2) {
+- ret = 0;
+- printk(KERN_INFO "md.c: sb1 is not equal to sb2!\n");
+- goto abort;
+- }
+-
+- *tmp1 = *sb1;
+- *tmp2 = *sb2;
+-
+- /*
+- * nr_disks is not constant
+- */
+- tmp1->nr_disks = 0;
+- tmp2->nr_disks = 0;
+-
+- if (memcmp(tmp1, tmp2, MD_SB_GENERIC_CONSTANT_WORDS * 4))
+- ret = 0;
+- else
+- ret = 1;
+-
+-abort:
+- if (tmp1)
+- kfree(tmp1);
+- if (tmp2)
+- kfree(tmp2);
+-
+- return ret;
+-}
+-
+-static unsigned int calc_sb_csum(mdp_super_t * sb)
+-{
+- unsigned int disk_csum, csum;
+-
+- disk_csum = sb->sb_csum;
+- sb->sb_csum = 0;
+- csum = csum_partial((void *)sb, MD_SB_BYTES, 0);
+- sb->sb_csum = disk_csum;
+- return csum;
+-}
+-
+-/*
+- * Handle superblock details.
+- * We want to be able to handle multiple superblock formats
+- * so we have a common interface to them all, and an array of
+- * different handlers.
+- * We rely on user-space to write the initial superblock, and support
+- * reading and updating of superblocks.
+- * Interface methods are:
+- * int load_super(mdk_rdev_t *dev, mdk_rdev_t *refdev, int minor_version)
+- * loads and validates a superblock on dev.
+- * if refdev != NULL, compare superblocks on both devices
+- * Return:
+- * 0 - dev has a superblock that is compatible with refdev
+- * 1 - dev has a superblock that is compatible and newer than refdev
+- * so dev should be used as the refdev in future
+- * -EINVAL superblock incompatible or invalid
+- * -othererror e.g. -EIO
+- *
+- * int validate_super(mddev_t *mddev, mdk_rdev_t *dev)
+- * Verify that dev is acceptable into mddev.
+- * The first time, mddev->raid_disks will be 0, and data from
+- * dev should be merged in. Subsequent calls check that dev
+- * is new enough. Return 0 or -EINVAL
+- *
+- * void sync_super(mddev_t *mddev, mdk_rdev_t *dev)
+- * Update the superblock for rdev with data in mddev
+- * This does not write to disc.
+- *
+- */
+-
+-struct super_type {
+- char *name;
+- struct module *owner;
+- int (*load_super)(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version);
+- int (*validate_super)(mddev_t *mddev, mdk_rdev_t *rdev);
+- void (*sync_super)(mddev_t *mddev, mdk_rdev_t *rdev);
+-};
+-
+-/*
+- * load_super for 0.90.0
+- */
+-static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
+-{
+- mdp_super_t *sb;
+- int ret;
+- sector_t sb_offset;
+-
+- /*
+- * Calculate the position of the superblock,
+- * it's at the end of the disk.
+- *
+- * It also happens to be a multiple of 4Kb.
+- */
+- sb_offset = calc_dev_sboffset(rdev->bdev);
+- rdev->sb_offset = sb_offset;
+-
+- ret = read_disk_sb(rdev);
+- if (ret) return ret;
+-
+- ret = -EINVAL;
+-
+- sb = (mdp_super_t*)page_address(rdev->sb_page);
+-
+- if (sb->md_magic != MD_SB_MAGIC) {
+- printk(KERN_ERR "md: invalid raid superblock magic on %s\n",
+- bdev_partition_name(rdev->bdev));
+- goto abort;
+- }
+-
+- if (sb->major_version != 0 ||
+- sb->minor_version != 90) {
+- printk(KERN_WARNING "Bad version number %d.%d on %s\n",
+- sb->major_version, sb->minor_version,
+- bdev_partition_name(rdev->bdev));
+- goto abort;
+- }
+-
+- if (sb->md_minor >= MAX_MD_DEVS) {
+- printk(KERN_ERR "md: %s: invalid raid minor (%x)\n",
+- bdev_partition_name(rdev->bdev), sb->md_minor);
+- goto abort;
+- }
+- if (sb->raid_disks <= 0)
+- goto abort;
+-
+- if (calc_sb_csum(sb) != sb->sb_csum) {
+- printk(KERN_WARNING "md: invalid superblock checksum on %s\n",
+- bdev_partition_name(rdev->bdev));
+- goto abort;
+- }
+-
+- rdev->preferred_minor = sb->md_minor;
+- rdev->data_offset = 0;
+-
+- if (sb->level == MULTIPATH)
+- rdev->desc_nr = -1;
+- else
+- rdev->desc_nr = sb->this_disk.number;
+-
+- if (refdev == 0)
+- ret = 1;
+- else {
+- __u64 ev1, ev2;
+- mdp_super_t *refsb = (mdp_super_t*)page_address(refdev->sb_page);
+- if (!uuid_equal(refsb, sb)) {
+- printk(KERN_WARNING "md: %s has different UUID to %s\n",
+- bdev_partition_name(rdev->bdev),
+- bdev_partition_name(refdev->bdev));
+- goto abort;
+- }
+- if (!sb_equal(refsb, sb)) {
+- printk(KERN_WARNING "md: %s has same UUID"
+- " but different superblock to %s\n",
+- bdev_partition_name(rdev->bdev),
+- bdev_partition_name(refdev->bdev));
+- goto abort;
+- }
+- ev1 = md_event(sb);
+- ev2 = md_event(refsb);
+- if (ev1 > ev2)
+- ret = 1;
+- else
+- ret = 0;
+- }
+- rdev->size = calc_dev_size(rdev, sb->chunk_size);
+-
+- abort:
+- return ret;
+-}
+-
+-/*
+- * validate_super for 0.90.0
+- */
+-static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev)
+-{
+- mdp_disk_t *desc;
+- mdp_super_t *sb = (mdp_super_t *)page_address(rdev->sb_page);
+-
+- if (mddev->raid_disks == 0) {
+- mddev->major_version = 0;
+- mddev->minor_version = sb->minor_version;
+- mddev->patch_version = sb->patch_version;
+- mddev->persistent = ! sb->not_persistent;
+- mddev->chunk_size = sb->chunk_size;
+- mddev->ctime = sb->ctime;
+- mddev->utime = sb->utime;
+- mddev->level = sb->level;
+- mddev->layout = sb->layout;
+- mddev->raid_disks = sb->raid_disks;
+- mddev->size = sb->size;
+- mddev->events = md_event(sb);
+-
+- if (sb->state & (1<<MD_SB_CLEAN))
+- mddev->recovery_cp = MaxSector;
+- else {
+- if (sb->events_hi == sb->cp_events_hi &&
+- sb->events_lo == sb->cp_events_lo) {
+- mddev->recovery_cp = sb->recovery_cp;
+- } else
+- mddev->recovery_cp = 0;
+- }
+-
+- memcpy(mddev->uuid+0, &sb->set_uuid0, 4);
+- memcpy(mddev->uuid+4, &sb->set_uuid1, 4);
+- memcpy(mddev->uuid+8, &sb->set_uuid2, 4);
+- memcpy(mddev->uuid+12,&sb->set_uuid3, 4);
+-
+- mddev->max_disks = MD_SB_DISKS;
+- } else {
+- __u64 ev1;
+- ev1 = md_event(sb);
+- ++ev1;
+- if (ev1 < mddev->events)
+- return -EINVAL;
+- }
+- if (mddev->level != LEVEL_MULTIPATH) {
+- rdev->raid_disk = -1;
+- rdev->in_sync = rdev->faulty = 0;
+- desc = sb->disks + rdev->desc_nr;
+-
+- if (desc->state & (1<<MD_DISK_FAULTY))
+- rdev->faulty = 1;
+- else if (desc->state & (1<<MD_DISK_SYNC) &&
+- desc->raid_disk < mddev->raid_disks) {
+- rdev->in_sync = 1;
+- rdev->raid_disk = desc->raid_disk;
+- }
+- }
+- return 0;
+-}
+-
+-/*
+- * sync_super for 0.90.0
+- */
+-static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev)
+-{
+- mdp_super_t *sb;
+- struct list_head *tmp;
+- mdk_rdev_t *rdev2;
+- int next_spare = mddev->raid_disks;
+-
+- /* make rdev->sb match mddev data..
+- *
+- * 1/ zero out disks
+- * 2/ Add info for each disk, keeping track of highest desc_nr
+- * 3/ any empty disks < highest become removed
+- *
+- * disks[0] gets initialised to REMOVED because
+- * we cannot be sure from other fields if it has
+- * been initialised or not.
+- */
+- int highest = 0;
+- int i;
+- int active=0, working=0,failed=0,spare=0,nr_disks=0;
+-
+- sb = (mdp_super_t*)page_address(rdev->sb_page);
+-
+- memset(sb, 0, sizeof(*sb));
+-
+- sb->md_magic = MD_SB_MAGIC;
+- sb->major_version = mddev->major_version;
+- sb->minor_version = mddev->minor_version;
+- sb->patch_version = mddev->patch_version;
+- sb->gvalid_words = 0; /* ignored */
+- memcpy(&sb->set_uuid0, mddev->uuid+0, 4);
+- memcpy(&sb->set_uuid1, mddev->uuid+4, 4);
+- memcpy(&sb->set_uuid2, mddev->uuid+8, 4);
+- memcpy(&sb->set_uuid3, mddev->uuid+12,4);
+-
+- sb->ctime = mddev->ctime;
+- sb->level = mddev->level;
+- sb->size = mddev->size;
+- sb->raid_disks = mddev->raid_disks;
+- sb->md_minor = mddev->__minor;
+- sb->not_persistent = !mddev->persistent;
+- sb->utime = mddev->utime;
+- sb->state = 0;
+- sb->events_hi = (mddev->events>>32);
+- sb->events_lo = (u32)mddev->events;
+-
+- if (mddev->in_sync)
+- {
+- sb->recovery_cp = mddev->recovery_cp;
+- sb->cp_events_hi = (mddev->events>>32);
+- sb->cp_events_lo = (u32)mddev->events;
+- if (mddev->recovery_cp == MaxSector)
+- sb->state = (1<< MD_SB_CLEAN);
+- } else
+- sb->recovery_cp = 0;
+-
+- sb->layout = mddev->layout;
+- sb->chunk_size = mddev->chunk_size;
+-
+- sb->disks[0].state = (1<<MD_DISK_REMOVED);
+- ITERATE_RDEV(mddev,rdev2,tmp) {
+- mdp_disk_t *d;
+- if (rdev2->raid_disk >= 0 && rdev2->in_sync && !rdev2->faulty)
+- rdev2->desc_nr = rdev2->raid_disk;
+- else
+- rdev2->desc_nr = next_spare++;
+- d = &sb->disks[rdev2->desc_nr];
+- nr_disks++;
+- d->number = rdev2->desc_nr;
+- d->major = MAJOR(rdev2->bdev->bd_dev);
+- d->minor = MINOR(rdev2->bdev->bd_dev);
+- if (rdev2->raid_disk >= 0 && rdev->in_sync && !rdev2->faulty)
+- d->raid_disk = rdev2->raid_disk;
+- else
+- d->raid_disk = rdev2->desc_nr; /* compatibility */
+- if (rdev2->faulty) {
+- d->state = (1<<MD_DISK_FAULTY);
+- failed++;
+- } else if (rdev2->in_sync) {
+- d->state = (1<<MD_DISK_ACTIVE);
+- d->state |= (1<<MD_DISK_SYNC);
+- active++;
+- working++;
+- } else {
+- d->state = 0;
+- spare++;
+- working++;
+- }
+- if (rdev2->desc_nr > highest)
+- highest = rdev2->desc_nr;
+- }
+-
+- /* now set the "removed" bit on any non-trailing holes */
+- for (i=0; i<highest; i++) {
+- mdp_disk_t *d = &sb->disks[i];
+- if (d->state == 0 && d->number == 0) {
+- d->number = i;
+- d->raid_disk = i;
+- d->state = (1<<MD_DISK_REMOVED);
+- }
+- }
+- sb->nr_disks = nr_disks;
+- sb->active_disks = active;
+- sb->working_disks = working;
+- sb->failed_disks = failed;
+- sb->spare_disks = spare;
+-
+- sb->this_disk = sb->disks[rdev->desc_nr];
+- sb->sb_csum = calc_sb_csum(sb);
+-}
+-
+-/*
+- * version 1 superblock
+- */
+-
+-static unsigned int calc_sb_1_csum(struct mdp_superblock_1 * sb)
+-{
+- unsigned int disk_csum, csum;
+- int size = 256 + sb->max_dev*2;
+-
+- disk_csum = sb->sb_csum;
+- sb->sb_csum = 0;
+- csum = csum_partial((void *)sb, size, 0);
+- sb->sb_csum = disk_csum;
+- return csum;
+-}
+-
+-static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
+-{
+- struct mdp_superblock_1 *sb;
+- int ret;
+- sector_t sb_offset;
+-
+- /*
+- * Calculate the position of the superblock.
+- * It is always aligned to a 4K boundary and
+- * depending on minor_version, it can be:
+- * 0: At least 8K, but less than 12K, from end of device
+- * 1: At start of device
+- * 2: 4K from start of device.
+- */
+- switch(minor_version) {
+- case 0:
+- sb_offset = rdev->bdev->bd_inode->i_size >> 9;
+- sb_offset -= 8*2;
+- sb_offset &= ~(4*2);
+- /* convert from sectors to K */
+- sb_offset /= 2;
+- break;
+- case 1:
+- sb_offset = 0;
+- break;
+- case 2:
+- sb_offset = 4;
+- break;
+- default:
+- return -EINVAL;
+- }
+- rdev->sb_offset = sb_offset;
+-
+- ret = read_disk_sb(rdev);
+- if (ret) return ret;
+-
+-
+- sb = (struct mdp_superblock_1*)page_address(rdev->sb_page);
+-
+- if (sb->magic != cpu_to_le32(MD_SB_MAGIC) ||
+- sb->major_version != cpu_to_le32(1) ||
+- le32_to_cpu(sb->max_dev) > (4096-256)/2 ||
+- le64_to_cpu(sb->super_offset) != (rdev->sb_offset<<1) ||
+- sb->feature_map != 0)
+- return -EINVAL;
+-
+- if (calc_sb_1_csum(sb) != sb->sb_csum) {
+- printk("md: invalid superblock checksum on %s\n",
+- bdev_partition_name(rdev->bdev));
+- return -EINVAL;
+- }
+- rdev->preferred_minor = 0xffff;
+- rdev->data_offset = le64_to_cpu(sb->data_offset);
+-
+- if (refdev == 0)
+- return 1;
+- else {
+- __u64 ev1, ev2;
+- struct mdp_superblock_1 *refsb =
+- (struct mdp_superblock_1*)page_address(refdev->sb_page);
+-
+- if (memcmp(sb->set_uuid, refsb->set_uuid, 16) != 0 ||
+- sb->level != refsb->level ||
+- sb->layout != refsb->layout ||
+- sb->chunksize != refsb->chunksize) {
+- printk(KERN_WARNING "md: %s has strangely different"
+- " superblock to %s\n",
+- bdev_partition_name(rdev->bdev),
+- bdev_partition_name(refdev->bdev));
+- return -EINVAL;
+- }
+- ev1 = le64_to_cpu(sb->events);
+- ev2 = le64_to_cpu(refsb->events);
+-
+- if (ev1 > ev2)
+- return 1;
+- }
+- if (minor_version)
+- rdev->size = ((rdev->bdev->bd_inode->i_size>>9) - le64_to_cpu(sb->data_offset)) / 2;
+- else
+- rdev->size = rdev->sb_offset;
+- if (rdev->size < le64_to_cpu(sb->data_size)/2)
+- return -EINVAL;
+- rdev->size = le64_to_cpu(sb->data_size)/2;
+- if (le32_to_cpu(sb->chunksize))
+- rdev->size &= ~((sector_t)le32_to_cpu(sb->chunksize)/2 - 1);
+- return 0;
+-}
+-
+-static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev)
+-{
+- struct mdp_superblock_1 *sb = (struct mdp_superblock_1*)page_address(rdev->sb_page);
+-
+- if (mddev->raid_disks == 0) {
+- mddev->major_version = 1;
+- mddev->minor_version = 0;
+- mddev->patch_version = 0;
+- mddev->persistent = 1;
+- mddev->chunk_size = le32_to_cpu(sb->chunksize) << 9;
+- mddev->ctime = le64_to_cpu(sb->ctime) & ((1ULL << 32)-1);
+- mddev->utime = le64_to_cpu(sb->utime) & ((1ULL << 32)-1);
+- mddev->level = le32_to_cpu(sb->level);
+- mddev->layout = le32_to_cpu(sb->layout);
+- mddev->raid_disks = le32_to_cpu(sb->raid_disks);
+- mddev->size = (u32)le64_to_cpu(sb->size);
+- mddev->events = le64_to_cpu(sb->events);
+-
+- mddev->recovery_cp = le64_to_cpu(sb->resync_offset);
+- memcpy(mddev->uuid, sb->set_uuid, 16);
+-
+- mddev->max_disks = (4096-256)/2;
+- } else {
+- __u64 ev1;
+- ev1 = le64_to_cpu(sb->events);
+- ++ev1;
+- if (ev1 < mddev->events)
+- return -EINVAL;
+- }
+-
+- if (mddev->level != LEVEL_MULTIPATH) {
+- int role;
+- rdev->desc_nr = le32_to_cpu(sb->dev_number);
+- role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]);
+- switch(role) {
+- case 0xffff: /* spare */
+- rdev->in_sync = 0;
+- rdev->faulty = 0;
+- rdev->raid_disk = -1;
+- break;
+- case 0xfffe: /* faulty */
+- rdev->in_sync = 0;
+- rdev->faulty = 1;
+- rdev->raid_disk = -1;
+- break;
+- default:
+- rdev->in_sync = 1;
+- rdev->faulty = 0;
+- rdev->raid_disk = role;
+- break;
+- }
+- }
+- return 0;
+-}
+-
+-static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev)
+-{
+- struct mdp_superblock_1 *sb;
+- struct list_head *tmp;
+- mdk_rdev_t *rdev2;
+- int max_dev, i;
+- /* make rdev->sb match mddev and rdev data. */
+-
+- sb = (struct mdp_superblock_1*)page_address(rdev->sb_page);
+-
+- sb->feature_map = 0;
+- sb->pad0 = 0;
+- memset(sb->pad1, 0, sizeof(sb->pad1));
+- memset(sb->pad2, 0, sizeof(sb->pad2));
+- memset(sb->pad3, 0, sizeof(sb->pad3));
+-
+- sb->utime = cpu_to_le64((__u64)mddev->utime);
+- sb->events = cpu_to_le64(mddev->events);
+- if (mddev->in_sync)
+- sb->resync_offset = cpu_to_le64(mddev->recovery_cp);
+- else
+- sb->resync_offset = cpu_to_le64(0);
+-
+- max_dev = 0;
+- ITERATE_RDEV(mddev,rdev2,tmp)
+- if (rdev2->desc_nr > max_dev)
+- max_dev = rdev2->desc_nr;
+-
+- sb->max_dev = max_dev;
+- for (i=0; i<max_dev;i++)
+- sb->dev_roles[max_dev] = cpu_to_le16(0xfffe);
+-
+- ITERATE_RDEV(mddev,rdev2,tmp) {
+- i = rdev2->desc_nr;
+- if (rdev2->faulty)
+- sb->dev_roles[i] = cpu_to_le16(0xfffe);
+- else if (rdev2->in_sync)
+- sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk);
+- else
+- sb->dev_roles[i] = cpu_to_le16(0xffff);
+- }
+-
+- sb->recovery_offset = cpu_to_le64(0); /* not supported yet */
+-}
+-
+-
+-struct super_type super_types[] = {
+- [0] = {
+- .name = "0.90.0",
+- .owner = THIS_MODULE,
+- .load_super = super_90_load,
+- .validate_super = super_90_validate,
+- .sync_super = super_90_sync,
+- },
+- [1] = {
+- .name = "md-1",
+- .owner = THIS_MODULE,
+- .load_super = super_1_load,
+- .validate_super = super_1_validate,
+- .sync_super = super_1_sync,
+- },
+-};
+-
+-static mdk_rdev_t * match_dev_unit(mddev_t *mddev, mdk_rdev_t *dev)
+-{
+- struct list_head *tmp;
+- mdk_rdev_t *rdev;
+-
+- ITERATE_RDEV(mddev,rdev,tmp)
+- if (rdev->bdev->bd_contains == dev->bdev->bd_contains)
+- return rdev;
+-
+- return NULL;
+-}
+-
+-static int match_mddev_units(mddev_t *mddev1, mddev_t *mddev2)
+-{
+- struct list_head *tmp;
+- mdk_rdev_t *rdev;
+-
+- ITERATE_RDEV(mddev1,rdev,tmp)
+- if (match_dev_unit(mddev2, rdev))
+- return 1;
+-
+- return 0;
+-}
+-
+-static LIST_HEAD(pending_raid_disks);
+-
+-static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev)
+-{
+- mdk_rdev_t *same_pdev;
+-
+- if (rdev->mddev) {
+- MD_BUG();
+- return -EINVAL;
+- }
+- same_pdev = match_dev_unit(mddev, rdev);
+- if (same_pdev)
+- printk(KERN_WARNING
+- "md%d: WARNING: %s appears to be on the same physical"
+- " disk as %s. True\n protection against single-disk"
+- " failure might be compromised.\n",
+- mdidx(mddev), bdev_partition_name(rdev->bdev),
+- bdev_partition_name(same_pdev->bdev));
+-
+- /* Verify rdev->desc_nr is unique.
+- * If it is -1, assign a free number, else
+- * check number is not in use
+- */
+- if (rdev->desc_nr < 0) {
+- int choice = 0;
+- if (mddev->pers) choice = mddev->raid_disks;
+- while (find_rdev_nr(mddev, choice))
+- choice++;
+- rdev->desc_nr = choice;
+- } else {
+- if (find_rdev_nr(mddev, rdev->desc_nr))
+- return -EBUSY;
+- }
+-
+- list_add(&rdev->same_set, &mddev->disks);
+- rdev->mddev = mddev;
+- printk(KERN_INFO "md: bind<%s>\n", bdev_partition_name(rdev->bdev));
+- return 0;
+-}
+-
+-static void unbind_rdev_from_array(mdk_rdev_t * rdev)
+-{
+- if (!rdev->mddev) {
+- MD_BUG();
+- return;
+- }
+- list_del_init(&rdev->same_set);
+- printk(KERN_INFO "md: unbind<%s>\n", bdev_partition_name(rdev->bdev));
+- rdev->mddev = NULL;
+-}
+-
+-/*
+- * prevent the device from being mounted, repartitioned or
+- * otherwise reused by a RAID array (or any other kernel
+- * subsystem), by opening the device. [simply getting an
+- * inode is not enough, the SCSI module usage code needs
+- * an explicit open() on the device]
+- */
+-static int lock_rdev(mdk_rdev_t *rdev, dev_t dev)
+-{
+- int err = 0;
+- struct block_device *bdev;
+-
+- bdev = bdget(dev);
+- if (!bdev)
+- return -ENOMEM;
+- err = blkdev_get(bdev, FMODE_READ|FMODE_WRITE, 0, BDEV_RAW);
+- if (err)
+- return err;
+- err = bd_claim(bdev, rdev);
+- if (err) {
+- blkdev_put(bdev, BDEV_RAW);
+- return err;
+- }
+- rdev->bdev = bdev;
+- return err;
+-}
+-
+-static void unlock_rdev(mdk_rdev_t *rdev)
+-{
+- struct block_device *bdev = rdev->bdev;
+- rdev->bdev = NULL;
+- if (!bdev)
+- MD_BUG();
+- bd_release(bdev);
+- blkdev_put(bdev, BDEV_RAW);
+-}
+-
+-void md_autodetect_dev(dev_t dev);
+-
+-static void export_rdev(mdk_rdev_t * rdev)
+-{
+- printk(KERN_INFO "md: export_rdev(%s)\n",
+- bdev_partition_name(rdev->bdev));
+- if (rdev->mddev)
+- MD_BUG();
+- free_disk_sb(rdev);
+- list_del_init(&rdev->same_set);
+-#ifndef MODULE
+- md_autodetect_dev(rdev->bdev->bd_dev);
+-#endif
+- unlock_rdev(rdev);
+- kfree(rdev);
+-}
+-
+-static void kick_rdev_from_array(mdk_rdev_t * rdev)
+-{
+- unbind_rdev_from_array(rdev);
+- export_rdev(rdev);
+-}
+-
+-static void export_array(mddev_t *mddev)
+-{
+- struct list_head *tmp;
+- mdk_rdev_t *rdev;
+-
+- ITERATE_RDEV(mddev,rdev,tmp) {
+- if (!rdev->mddev) {
+- MD_BUG();
+- continue;
+- }
+- kick_rdev_from_array(rdev);
+- }
+- if (!list_empty(&mddev->disks))
+- MD_BUG();
+- mddev->raid_disks = 0;
+- mddev->major_version = 0;
+-}
+-
+-static void print_desc(mdp_disk_t *desc)
+-{
+- printk(" DISK<N:%d,%s(%d,%d),R:%d,S:%d>\n", desc->number,
+- partition_name(MKDEV(desc->major,desc->minor)),
+- desc->major,desc->minor,desc->raid_disk,desc->state);
+-}
+-
+-static void print_sb(mdp_super_t *sb)
+-{
+- int i;
+-
+- printk(KERN_INFO
+- "md: SB: (V:%d.%d.%d) ID:<%08x.%08x.%08x.%08x> CT:%08x\n",
+- sb->major_version, sb->minor_version, sb->patch_version,
+- sb->set_uuid0, sb->set_uuid1, sb->set_uuid2, sb->set_uuid3,
+- sb->ctime);
+- printk(KERN_INFO "md: L%d S%08d ND:%d RD:%d md%d LO:%d CS:%d\n",
+- sb->level, sb->size, sb->nr_disks, sb->raid_disks,
+- sb->md_minor, sb->layout, sb->chunk_size);
+- printk(KERN_INFO "md: UT:%08x ST:%d AD:%d WD:%d"
+- " FD:%d SD:%d CSUM:%08x E:%08lx\n",
+- sb->utime, sb->state, sb->active_disks, sb->working_disks,
+- sb->failed_disks, sb->spare_disks,
+- sb->sb_csum, (unsigned long)sb->events_lo);
+-
+- printk(KERN_INFO);
+- for (i = 0; i < MD_SB_DISKS; i++) {
+- mdp_disk_t *desc;
+-
+- desc = sb->disks + i;
+- if (desc->number || desc->major || desc->minor ||
+- desc->raid_disk || (desc->state && (desc->state != 4))) {
+- printk(" D %2d: ", i);
+- print_desc(desc);
+- }
+- }
+- printk(KERN_INFO "md: THIS: ");
+- print_desc(&sb->this_disk);
+-
+-}
+-
+-static void print_rdev(mdk_rdev_t *rdev)
+-{
+- printk(KERN_INFO "md: rdev %s, SZ:%08llu F:%d S:%d DN:%d ",
+- bdev_partition_name(rdev->bdev), (unsigned long long)rdev->size,
+- rdev->faulty, rdev->in_sync, rdev->desc_nr);
+- if (rdev->sb_loaded) {
+- printk(KERN_INFO "md: rdev superblock:\n");
+- print_sb((mdp_super_t*)page_address(rdev->sb_page));
+- } else
+- printk(KERN_INFO "md: no rdev superblock!\n");
+-}
+-
+-void md_print_devices(void)
+-{
+- struct list_head *tmp, *tmp2;
+- mdk_rdev_t *rdev;
+- mddev_t *mddev;
+-
+- printk("\n");
+- printk("md: **********************************\n");
+- printk("md: * <COMPLETE RAID STATE PRINTOUT> *\n");
+- printk("md: **********************************\n");
+- ITERATE_MDDEV(mddev,tmp) {
+- printk("md%d: ", mdidx(mddev));
+-
+- ITERATE_RDEV(mddev,rdev,tmp2)
+- printk("<%s>", bdev_partition_name(rdev->bdev));
+-
+- ITERATE_RDEV(mddev,rdev,tmp2)
+- print_rdev(rdev);
+- }
+- printk("md: **********************************\n");
+- printk("\n");
+-}
+-
+-
+-static int write_disk_sb(mdk_rdev_t * rdev)
+-{
+-
+- if (!rdev->sb_loaded) {
+- MD_BUG();
+- return 1;
+- }
+- if (rdev->faulty) {
+- MD_BUG();
+- return 1;
+- }
+-
+- dprintk(KERN_INFO "(write) %s's sb offset: %llu\n",
+- bdev_partition_name(rdev->bdev),
+- (unsigned long long)rdev->sb_offset);
+-
+- if (sync_page_io(rdev->bdev, rdev->sb_offset<<1, MD_SB_BYTES, rdev->sb_page, WRITE))
+- return 0;
+-
+- printk("md: write_disk_sb failed for device %s\n",
+- bdev_partition_name(rdev->bdev));
+- return 1;
+-}
+-
+-static void sync_sbs(mddev_t * mddev)
+-{
+- mdk_rdev_t *rdev;
+- struct list_head *tmp;
+-
+- ITERATE_RDEV(mddev,rdev,tmp) {
+- super_types[mddev->major_version].
+- sync_super(mddev, rdev);
+- rdev->sb_loaded = 1;
+- }
+-}
+-
+-static void md_update_sb(mddev_t * mddev)
+-{
+- int err, count = 100;
+- struct list_head *tmp;
+- mdk_rdev_t *rdev;
+-
+- mddev->sb_dirty = 0;
+-repeat:
+- mddev->utime = get_seconds();
+- mddev->events ++;
+-
+- if (!mddev->events) {
+- /*
+- * oops, this 64-bit counter should never wrap.
+- * Either we are in around ~1 trillion A.C., assuming
+- * 1 reboot per second, or we have a bug:
+- */
+- MD_BUG();
+- mddev->events --;
+- }
+- sync_sbs(mddev);
+-
+- /*
+- * do not write anything to disk if using
+- * nonpersistent superblocks
+- */
+- if (!mddev->persistent)
+- return;
+-
+- dprintk(KERN_INFO
+- "md: updating md%d RAID superblock on device (in sync %d)\n",
+- mdidx(mddev),mddev->in_sync);
+-
+- err = 0;
+- ITERATE_RDEV(mddev,rdev,tmp) {
+- dprintk(KERN_INFO "md: ");
+- if (rdev->faulty)
+- dprintk("(skipping faulty ");
+-
+- dprintk("%s ", bdev_partition_name(rdev->bdev));
+- if (!rdev->faulty) {
+- err += write_disk_sb(rdev);
+- } else
+- dprintk(")\n");
+- if (!err && mddev->level == LEVEL_MULTIPATH)
+- /* only need to write one superblock... */
+- break;
+- }
+- if (err) {
+- if (--count) {
+- printk(KERN_ERR "md: errors occurred during superblock"
+- " update, repeating\n");
+- goto repeat;
+- }
+- printk(KERN_ERR \
+- "md: excessive errors occurred during superblock update, exiting\n");
+- }
+-}
+-
+-/*
+- * Import a device. If 'super_format' >= 0, then sanity check the superblock
+- *
+- * mark the device faulty if:
+- *
+- * - the device is nonexistent (zero size)
+- * - the device has no valid superblock
+- *
+- * a faulty rdev _never_ has rdev->sb set.
+- */
+-static mdk_rdev_t *md_import_device(dev_t newdev, int super_format, int super_minor)
+-{
+- int err;
+- mdk_rdev_t *rdev;
+- sector_t size;
+-
+- rdev = (mdk_rdev_t *) kmalloc(sizeof(*rdev), GFP_KERNEL);
+- if (!rdev) {
+- printk(KERN_ERR "md: could not alloc mem for %s!\n",
+- partition_name(newdev));
+- return ERR_PTR(-ENOMEM);
+- }
+- memset(rdev, 0, sizeof(*rdev));
+-
+- if ((err = alloc_disk_sb(rdev)))
+- goto abort_free;
+-
+- err = lock_rdev(rdev, newdev);
+- if (err) {
+- printk(KERN_ERR "md: could not lock %s.\n",
+- partition_name(newdev));
+- goto abort_free;
+- }
+- rdev->desc_nr = -1;
+- rdev->faulty = 0;
+- rdev->in_sync = 0;
+- rdev->data_offset = 0;
+- atomic_set(&rdev->nr_pending, 0);
+-
+- size = rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS;
+- if (!size) {
+- printk(KERN_WARNING
+- "md: %s has zero or unknown size, marking faulty!\n",
+- bdev_partition_name(rdev->bdev));
+- err = -EINVAL;
+- goto abort_free;
+- }
+-
+- if (super_format >= 0) {
+- err = super_types[super_format].
+- load_super(rdev, NULL, super_minor);
+- if (err == -EINVAL) {
+- printk(KERN_WARNING
+- "md: %s has invalid sb, not importing!\n",
+- bdev_partition_name(rdev->bdev));
+- goto abort_free;
+- }
+- if (err < 0) {
+- printk(KERN_WARNING
+- "md: could not read %s's sb, not importing!\n",
+- bdev_partition_name(rdev->bdev));
+- goto abort_free;
+- }
+- }
+- INIT_LIST_HEAD(&rdev->same_set);
+-
+- return rdev;
+-
+-abort_free:
+- if (rdev->sb_page) {
+- if (rdev->bdev)
+- unlock_rdev(rdev);
+- free_disk_sb(rdev);
+- }
+- kfree(rdev);
+- return ERR_PTR(err);
+-}
+-
+-/*
+- * Check a full RAID array for plausibility
+- */
+-
+-
+-static int analyze_sbs(mddev_t * mddev)
+-{
+- int i;
+- struct list_head *tmp;
+- mdk_rdev_t *rdev, *freshest;
+-
+- freshest = NULL;
+- ITERATE_RDEV(mddev,rdev,tmp)
+- switch (super_types[mddev->major_version].
+- load_super(rdev, freshest, mddev->minor_version)) {
+- case 1:
+- freshest = rdev;
+- break;
+- case 0:
+- break;
+- default:
+- printk( KERN_ERR \
+- "md: fatal superblock inconsistency in %s"
+- " -- removing from array\n",
+- bdev_partition_name(rdev->bdev));
+- kick_rdev_from_array(rdev);
+- }
+-
+-
+- super_types[mddev->major_version].
+- validate_super(mddev, freshest);
+-
+- i = 0;
+- ITERATE_RDEV(mddev,rdev,tmp) {
+- if (rdev != freshest)
+- if (super_types[mddev->major_version].
+- validate_super(mddev, rdev)) {
+- printk(KERN_WARNING "md: kicking non-fresh %s"
+- " from array!\n",
+- bdev_partition_name(rdev->bdev));
+- kick_rdev_from_array(rdev);
+- continue;
+- }
+- if (mddev->level == LEVEL_MULTIPATH) {
+- rdev->desc_nr = i++;
+- rdev->raid_disk = rdev->desc_nr;
+- rdev->in_sync = 1;
+- }
+- }
+-
+-
+- /*
+- * Check if we can support this RAID array
+- */
+- if (mddev->major_version != MD_MAJOR_VERSION ||
+- mddev->minor_version > MD_MINOR_VERSION) {
+- printk(KERN_ALERT
+- "md: md%d: unsupported raid array version %d.%d.%d\n",
+- mdidx(mddev), mddev->major_version,
+- mddev->minor_version, mddev->patch_version);
+- goto abort;
+- }
+-
+- if ((mddev->recovery_cp != MaxSector) && ((mddev->level == 1) ||
+- (mddev->level == 4) || (mddev->level == 5)))
+- printk(KERN_ERR "md: md%d: raid array is not clean"
+- " -- starting background reconstruction\n",
+- mdidx(mddev));
+-
+- return 0;
+-abort:
++*** 1453,90 **** 1
+ return 1;
+ }
+
++#undef OLD_LEVEL
++
+ static int device_size_calculation(mddev_t * mddev)
+ {
+ int data_disks = 0;
+ unsigned int readahead;
+ struct list_head *tmp;
+ mdk_rdev_t *rdev;
+
+ /*
+ * Do device size calculation. Bail out if too small.
+ * (we have to do this after having validated chunk_size,
+ * because device size has to be modulo chunk_size)
+ */
+
+ ITERATE_RDEV(mddev,rdev,tmp) {
+ if (rdev->faulty)
+ continue;
+ if (rdev->size < mddev->chunk_size / 1024) {
+ printk(KERN_WARNING
+ "md: Dev %s smaller than chunk_size:"
+ " %lluk < %dk\n",
+ bdev_partition_name(rdev->bdev),
+ (unsigned long long)rdev->size,
+ mddev->chunk_size / 1024);
+ return -EINVAL;
+ }
+ }
+
+ switch (mddev->level) {
+ case LEVEL_MULTIPATH:
+ data_disks = 1;
+ break;
+ case -3:
+ data_disks = 1;
+ break;
+ case -2:
+ data_disks = 1;
+ break;
+ case LEVEL_LINEAR:
+ zoned_raid_size(mddev);
+ data_disks = 1;
+ break;
+ case 0:
+ zoned_raid_size(mddev);
+ data_disks = mddev->raid_disks;
+ break;
+ case 1:
+ data_disks = 1;
+ break;
+ case 4:
+ case 5:
+ data_disks = mddev->raid_disks-1;
+ break;
+ default:
+ printk(KERN_ERR "md: md%d: unsupported raid level %d\n",
+ mdidx(mddev), mddev->level);
+ goto abort;
+ }
+ if (!md_size[mdidx(mddev)])
+ md_size[mdidx(mddev)] = mddev->size * data_disks;
+
+ readahead = (VM_MAX_READAHEAD * 1024) / PAGE_SIZE;
+ if (!mddev->level || (mddev->level == 4) || (mddev->level == 5)) {
+ readahead = (mddev->chunk_size>>PAGE_SHIFT) * 4 * data_disks;
+ if (readahead < data_disks * (MAX_SECTORS>>(PAGE_SHIFT-9))*2)
+ readahead = data_disks * (MAX_SECTORS>>(PAGE_SHIFT-9))*2;
+ } else {
+ // (no multipath branch - it uses the default setting)
+ if (mddev->level == -3)
+ readahead = 0;
+ }
+
+ printk(KERN_INFO "md%d: max total readahead window set to %ldk\n",
+ mdidx(mddev), readahead*(PAGE_SIZE/1024));
+
+ printk(KERN_INFO
+ "md%d: %d data-disks, max readahead per data-disk: %ldk\n",
+ mdidx(mddev), data_disks, readahead/data_disks*(PAGE_SIZE/1024));
+ return 0;
+ abort:
+ return 1;
+ }
+
+ static struct gendisk *md_probe(dev_t dev, int *part, void *data)
+ {
+ static DECLARE_MUTEX(disks_sem);
+- int unit = MINOR(dev);
+- mddev_t *mddev = mddev_find(unit);
+- struct gendisk *disk;
+-
+- if (!mddev)
+- return NULL;
+-
+- down(&disks_sem);
+- if (disks[unit]) {
+- up(&disks_sem);
+- mddev_put(mddev);
+- return NULL;
+- }
+- disk = alloc_disk(1);
+- if (!disk) {
+- up(&disks_sem);
+- mddev_put(mddev);
+- return NULL;
+- }
+- disk->major = MD_MAJOR;
+- disk->first_minor = mdidx(mddev);
+- sprintf(disk->disk_name, "md%d", mdidx(mddev));
+- disk->fops = &md_fops;
+- disk->private_data = mddev;
+- disk->queue = &mddev->queue;
+- add_disk(disk);
+- disks[mdidx(mddev)] = disk;
+- up(&disks_sem);
+- return NULL;
+-}
+-
+-void md_wakeup_thread(mdk_thread_t *thread);
+-
+-static void md_safemode_timeout(unsigned long data)
+-{
+- mddev_t *mddev = (mddev_t *) data;
+-
+- mddev->safemode = 1;
+- md_wakeup_thread(mddev->thread);
+-}
+-
+-
+-static int do_md_run(mddev_t * mddev)
+-{
+- int pnum, err;
+- int chunk_size;
+- struct list_head *tmp;
+- mdk_rdev_t *rdev;
+- struct gendisk *disk;
+-
+- if (list_empty(&mddev->disks)) {
+- MD_BUG();
+- return -EINVAL;
+- }
+-
+- if (mddev->pers)
+- return -EBUSY;
+-
+- /*
+- * Analyze all RAID superblock(s)
+- */
+- if (!mddev->raid_disks && analyze_sbs(mddev)) {
+- MD_BUG();
+- return -EINVAL;
+- }
+-
+- chunk_size = mddev->chunk_size;
+- pnum = level_to_pers(mddev->level);
+-
+- if ((pnum != MULTIPATH) && (pnum != RAID1)) {
+- if (!chunk_size) {
+- /*
+- * 'default chunksize' in the old md code used to
+- * be PAGE_SIZE, baaad.
+- * we abort here to be on the safe side. We don't
+- * want to continue the bad practice.
+- */
+- printk(KERN_ERR
+- "no chunksize specified, see 'man raidtab'\n");
+- return -EINVAL;
+- }
+- if (chunk_size > MAX_CHUNK_SIZE) {
+- printk(KERN_ERR "too big chunk_size: %d > %d\n",
+- chunk_size, MAX_CHUNK_SIZE);
+- return -EINVAL;
+- }
+- /*
+-		 * chunk-size has to be a power of 2 and a multiple of PAGE_SIZE
+- */
+- if ( (1 << ffz(~chunk_size)) != chunk_size) {
+- MD_BUG();
+- return -EINVAL;
+- }
+- if (chunk_size < PAGE_SIZE) {
+- printk(KERN_ERR "too small chunk_size: %d < %ld\n",
+- chunk_size, PAGE_SIZE);
+- return -EINVAL;
+- }
+-
+- /* devices must have minimum size of one chunk */
+- ITERATE_RDEV(mddev,rdev,tmp) {
+- if (rdev->faulty)
+- continue;
+- if (rdev->size < chunk_size / 1024) {
+- printk(KERN_WARNING
+- "md: Dev %s smaller than chunk_size:"
+- " %lluk < %dk\n",
+- bdev_partition_name(rdev->bdev),
+- (unsigned long long)rdev->size,
+- chunk_size / 1024);
+- return -EINVAL;
+- }
+- }
+- }
+- if (pnum >= MAX_PERSONALITY) {
+- MD_BUG();
+- return -EINVAL;
+- }
+-
+-#ifdef CONFIG_KMOD
+- if (!pers[pnum])
+- {
+- char module_name[80];
+- sprintf (module_name, "md-personality-%d", pnum);
+- request_module (module_name);
++*** 1664,9 **** 2
++ }
+ }
+-#endif
+
+ if (device_size_calculation(mddev))
+ return -EINVAL;
+
+ /*
+ * Drop all container device buffers, from now on
+ * the only valid external interface is through the md
+- * device.
+- * Also find largest hardsector size
+- */
+- ITERATE_RDEV(mddev,rdev,tmp) {
+- if (rdev->faulty)
+- continue;
+- sync_blockdev(rdev->bdev);
+- invalidate_bdev(rdev->bdev, 0);
+- }
+-
+- md_probe(mdidx(mddev), NULL, NULL);
+- disk = disks[mdidx(mddev)];
+- if (!disk)
+- return -ENOMEM;
+-
+- spin_lock(&pers_lock);
+- if (!pers[pnum] || !try_module_get(pers[pnum]->owner)) {
+- spin_unlock(&pers_lock);
+- printk(KERN_ERR "md: personality %d is not loaded!\n",
+- pnum);
+- return -EINVAL;
+- }
+-
+- mddev->pers = pers[pnum];
+- spin_unlock(&pers_lock);
+-
+- blk_queue_make_request(&mddev->queue, mddev->pers->make_request);
+- printk("%s: setting max_sectors to %d, segment boundary to %d\n",
+- disk->disk_name,
+- chunk_size >> 9,
+- (chunk_size>>1)-1);
+- blk_queue_max_sectors(&mddev->queue, chunk_size >> 9);
+- blk_queue_segment_boundary(&mddev->queue, (chunk_size>>1) - 1);
+- mddev->queue.queuedata = mddev;
+-
+- err = mddev->pers->run(mddev);
+- if (err) {
+- printk(KERN_ERR "md: pers->run() failed ...\n");
+- module_put(mddev->pers->owner);
+- mddev->pers = NULL;
+- return -EINVAL;
+- }
+- atomic_set(&mddev->writes_pending,0);
+- mddev->safemode = 0;
+- mddev->safemode_timer.function = md_safemode_timeout;
+- mddev->safemode_timer.data = (unsigned long) mddev;
+- mddev->safemode_delay = (20 * HZ)/1000 +1; /* 20 msec delay */
+- mddev->in_sync = 1;
+-
+- set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+- md_wakeup_thread(mddev->thread);
+- set_capacity(disk, mddev->array_size<<1);
+- return 0;
+-}
+-
+-static int restart_array(mddev_t *mddev)
+-{
+- struct gendisk *disk = disks[mdidx(mddev)];
+- int err;
+-
+- /*
+- * Complain if it has no devices
+- */
+- err = -ENXIO;
+- if (list_empty(&mddev->disks))
+- goto out;
+-
+- if (mddev->pers) {
+- err = -EBUSY;
+- if (!mddev->ro)
+- goto out;
+-
+- mddev->safemode = 0;
+- mddev->ro = 0;
+- set_disk_ro(disk, 0);
+-
+- printk(KERN_INFO "md: md%d switched to read-write mode.\n",
+- mdidx(mddev));
+- /*
+- * Kick recovery or resync if necessary
+- */
+- set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+- md_wakeup_thread(mddev->thread);
+- err = 0;
+- } else {
+- printk(KERN_ERR "md: md%d has no personality assigned.\n",
+- mdidx(mddev));
+- err = -EINVAL;
+- }
+-
+-out:
+- return err;
+-}
+-
+-static int do_md_stop(mddev_t * mddev, int ro)
+-{
+- int err = 0;
+- struct gendisk *disk = disks[mdidx(mddev)];
+-
+- if (atomic_read(&mddev->active)>2) {
+- printk("md: md%d still in use.\n",mdidx(mddev));
+- err = -EBUSY;
+- goto out;
+- }
+-
+- if (mddev->pers) {
+- if (mddev->sync_thread) {
+- set_bit(MD_RECOVERY_INTR, &mddev->recovery);
+- md_unregister_thread(mddev->sync_thread);
+- mddev->sync_thread = NULL;
+- }
+-
+- del_timer_sync(&mddev->safemode_timer);
+-
+- invalidate_device(mk_kdev(disk->major, disk->first_minor), 1);
+-
+- if (ro) {
+- err = -ENXIO;
+- if (mddev->ro)
+- goto out;
+- mddev->ro = 1;
+- } else {
+- if (mddev->ro)
+- set_disk_ro(disk, 0);
+- if (mddev->pers->stop(mddev)) {
+- err = -EBUSY;
+- if (mddev->ro)
+- set_disk_ro(disk, 1);
+- goto out;
+- }
+- module_put(mddev->pers->owner);
+- mddev->pers = NULL;
+- if (mddev->ro)
+- mddev->ro = 0;
+- }
+- if (mddev->raid_disks) {
+- /* mark array as shutdown cleanly */
+- mddev->in_sync = 1;
+- md_update_sb(mddev);
+- }
+- if (ro)
+- set_disk_ro(disk, 1);
+- }
+- /*
+- * Free resources if final stop
+- */
+- if (!ro) {
+- struct gendisk *disk;
+- printk(KERN_INFO "md: md%d stopped.\n", mdidx(mddev));
+-
+- export_array(mddev);
+-
+- mddev->array_size = 0;
+- disk = disks[mdidx(mddev)];
+- if (disk)
+- set_capacity(disk, 0);
+- } else
+- printk(KERN_INFO "md: md%d switched to read-only mode.\n",
+- mdidx(mddev));
+- err = 0;
+-out:
+- return err;
+-}
+-
+-static void autorun_array(mddev_t *mddev)
+-{
+- mdk_rdev_t *rdev;
+- struct list_head *tmp;
+- int err;
+-
+- if (list_empty(&mddev->disks)) {
+- MD_BUG();
+- return;
+- }
+-
+- printk(KERN_INFO "md: running: ");
+-
+- ITERATE_RDEV(mddev,rdev,tmp) {
+- printk("<%s>", bdev_partition_name(rdev->bdev));
+- }
+- printk("\n");
+-
+- err = do_md_run (mddev);
+- if (err) {
+- printk(KERN_WARNING "md :do_md_run() returned %d\n", err);
+- do_md_stop (mddev, 0);
+- }
+-}
+-
+-/*
+- * let's try to run arrays based on all disks that have arrived
+- * until now. (those are in pending_raid_disks)
+- *
+- * the method: pick the first pending disk, collect all disks with
+- * the same UUID, remove all from the pending list and put them into
+- * the 'same_array' list. Then order this list based on superblock
+- * update time (freshest comes first), kick out 'old' disks and
+- * compare superblocks. If everything's fine then run it.
+- *
+- * If "unit" is allocated, then bump its reference count
+- */
+-static void autorun_devices(void)
+-{
+- struct list_head candidates;
+- struct list_head *tmp;
+- mdk_rdev_t *rdev0, *rdev;
+- mddev_t *mddev;
+-
+- printk(KERN_INFO "md: autorun ...\n");
+- while (!list_empty(&pending_raid_disks)) {
+- rdev0 = list_entry(pending_raid_disks.next,
+- mdk_rdev_t, same_set);
+-
+- printk(KERN_INFO "md: considering %s ...\n",
+- bdev_partition_name(rdev0->bdev));
+- INIT_LIST_HEAD(&candidates);
+- ITERATE_RDEV_PENDING(rdev,tmp)
+- if (super_90_load(rdev, rdev0, 0) >= 0) {
+- printk(KERN_INFO "md: adding %s ...\n",
+- bdev_partition_name(rdev->bdev));
+- list_move(&rdev->same_set, &candidates);
+- }
+- /*
+- * now we have a set of devices, with all of them having
+- * mostly sane superblocks. It's time to allocate the
+- * mddev.
+- */
+-
+- mddev = mddev_find(rdev0->preferred_minor);
+- if (!mddev) {
+- printk(KERN_ERR
+- "md: cannot allocate memory for md drive.\n");
+- break;
+- }
+- if (mddev_lock(mddev))
+- printk(KERN_WARNING "md: md%d locked, cannot run\n",
+- mdidx(mddev));
+- else if (mddev->raid_disks || mddev->major_version
+- || !list_empty(&mddev->disks)) {
+- printk(KERN_WARNING
+- "md: md%d already running, cannot run %s\n",
+- mdidx(mddev), bdev_partition_name(rdev0->bdev));
+- mddev_unlock(mddev);
+- } else {
+- printk(KERN_INFO "md: created md%d\n", mdidx(mddev));
+- ITERATE_RDEV_GENERIC(candidates,rdev,tmp) {
+- list_del_init(&rdev->same_set);
+- if (bind_rdev_to_array(rdev, mddev))
+- export_rdev(rdev);
+- }
+- autorun_array(mddev);
+- mddev_unlock(mddev);
+- }
+- /* on success, candidates will be empty, on error
+- * it won't...
+- */
+- ITERATE_RDEV_GENERIC(candidates,rdev,tmp)
+- export_rdev(rdev);
+- mddev_put(mddev);
+- }
+- printk(KERN_INFO "md: ... autorun DONE.\n");
+-}
+-
+-/*
+- * import RAID devices based on one partition
+- * if possible, the array gets run as well.
+- */
+-
+-static int autostart_array(dev_t startdev)
+-{
+- int err = -EINVAL, i;
+- mdp_super_t *sb = NULL;
+- mdk_rdev_t *start_rdev = NULL, *rdev;
+-
+- start_rdev = md_import_device(startdev, 0, 0);
+- if (IS_ERR(start_rdev)) {
+- printk(KERN_WARNING "md: could not import %s!\n",
+- partition_name(startdev));
+- return err;
+- }
+-
+- /* NOTE: this can only work for 0.90.0 superblocks */
+- sb = (mdp_super_t*)page_address(start_rdev->sb_page);
+- if (sb->major_version != 0 ||
+- sb->minor_version != 90 ) {
+- printk(KERN_WARNING "md: can only autostart 0.90.0 arrays\n");
+- export_rdev(start_rdev);
+- return err;
+- }
+-
+- if (start_rdev->faulty) {
+- printk(KERN_WARNING
+- "md: can not autostart based on faulty %s!\n",
+- bdev_partition_name(start_rdev->bdev));
+- export_rdev(start_rdev);
+- return err;
+- }
+- list_add(&start_rdev->same_set, &pending_raid_disks);
+-
+- for (i = 0; i < MD_SB_DISKS; i++) {
+- mdp_disk_t *desc;
+- dev_t dev;
+-
+- desc = sb->disks + i;
+- dev = MKDEV(desc->major, desc->minor);
+-
+- if (!dev)
+- continue;
+- if (dev == startdev)
+- continue;
+- rdev = md_import_device(dev, 0, 0);
+- if (IS_ERR(rdev)) {
+- printk(KERN_WARNING "md: could not import %s,"
+- " trying to run array nevertheless.\n",
+- partition_name(dev));
+- continue;
+- }
+- list_add(&rdev->same_set, &pending_raid_disks);
+- }
+-
+- /*
+- * possibly return codes
+- */
+- autorun_devices();
+- return 0;
+-
+-}
+-
+-
+-static int get_version(void * arg)
+-{
+- mdu_version_t ver;
+-
+- ver.major = MD_MAJOR_VERSION;
+- ver.minor = MD_MINOR_VERSION;
+- ver.patchlevel = MD_PATCHLEVEL_VERSION;
+-
+- if (copy_to_user(arg, &ver, sizeof(ver)))
+- return -EFAULT;
+-
+- return 0;
+-}
+-
+-static int get_array_info(mddev_t * mddev, void * arg)
+-{
+- mdu_array_info_t info;
+- int nr,working,active,failed,spare;
+- mdk_rdev_t *rdev;
+- struct list_head *tmp;
+-
+- nr=working=active=failed=spare=0;
+- ITERATE_RDEV(mddev,rdev,tmp) {
+- nr++;
+- if (rdev->faulty)
+- failed++;
+- else {
+- working++;
+- if (rdev->in_sync)
+- active++;
+- else
+- spare++;
+- }
+- }
+-
+- info.major_version = mddev->major_version;
+- info.minor_version = mddev->minor_version;
+- info.patch_version = 1;
+- info.ctime = mddev->ctime;
+- info.level = mddev->level;
+- info.size = mddev->size;
+- info.nr_disks = nr;
+- info.raid_disks = mddev->raid_disks;
+- info.md_minor = mddev->__minor;
+- info.not_persistent= !mddev->persistent;
+-
+- info.utime = mddev->utime;
+- info.state = 0;
+- if (mddev->in_sync)
+- info.state = (1<<MD_SB_CLEAN);
+- info.active_disks = active;
+- info.working_disks = working;
+- info.failed_disks = failed;
+- info.spare_disks = spare;
+-
+- info.layout = mddev->layout;
+- info.chunk_size = mddev->chunk_size;
+-
+- if (copy_to_user(arg, &info, sizeof(info)))
+- return -EFAULT;
+-
+- return 0;
+-}
+-
+-static int get_disk_info(mddev_t * mddev, void * arg)
+-{
+- mdu_disk_info_t info;
+- unsigned int nr;
+- mdk_rdev_t *rdev;
+-
+- if (copy_from_user(&info, arg, sizeof(info)))
+- return -EFAULT;
+-
+- nr = info.number;
+-
+- rdev = find_rdev_nr(mddev, nr);
+- if (rdev) {
+- info.major = MAJOR(rdev->bdev->bd_dev);
+- info.minor = MINOR(rdev->bdev->bd_dev);
+- info.raid_disk = rdev->raid_disk;
+- info.state = 0;
+- if (rdev->faulty)
+- info.state |= (1<<MD_DISK_FAULTY);
+- else if (rdev->in_sync) {
+- info.state |= (1<<MD_DISK_ACTIVE);
+- info.state |= (1<<MD_DISK_SYNC);
+- }
+- } else {
+- info.major = info.minor = 0;
+- info.raid_disk = -1;
+- info.state = (1<<MD_DISK_REMOVED);
+- }
+-
+- if (copy_to_user(arg, &info, sizeof(info)))
+- return -EFAULT;
+-
+- return 0;
+-}
+-
+-static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info)
+-{
+- mdk_rdev_t *rdev;
+- dev_t dev;
+- dev = MKDEV(info->major,info->minor);
+- if (!mddev->raid_disks) {
+- int err;
+- /* expecting a device which has a superblock */
+- rdev = md_import_device(dev, mddev->major_version, mddev->minor_version);
+- if (IS_ERR(rdev)) {
+- printk(KERN_WARNING
+- "md: md_import_device returned %ld\n",
+- PTR_ERR(rdev));
+- return PTR_ERR(rdev);
+- }
+- if (!list_empty(&mddev->disks)) {
+- mdk_rdev_t *rdev0 = list_entry(mddev->disks.next,
+- mdk_rdev_t, same_set);
+- int err = super_types[mddev->major_version]
+- .load_super(rdev, rdev0, mddev->minor_version);
+- if (err < 0) {
+- printk(KERN_WARNING
+- "md: %s has different UUID to %s\n",
+- bdev_partition_name(rdev->bdev),
+- bdev_partition_name(rdev0->bdev));
+- export_rdev(rdev);
+- return -EINVAL;
+- }
+- }
+- err = bind_rdev_to_array(rdev, mddev);
+- if (err)
+- export_rdev(rdev);
+- return err;
+- }
+-
+- /*
+- * add_new_disk can be used once the array is assembled
+- * to add "hot spares". They must already have a superblock
+- * written
+- */
+- if (mddev->pers) {
+- int err;
+- if (!mddev->pers->hot_add_disk) {
+- printk(KERN_WARNING
+- "md%d: personality does not support diskops!\n",
+- mdidx(mddev));
+- return -EINVAL;
+- }
+- rdev = md_import_device(dev, mddev->major_version,
+- mddev->minor_version);
+- if (IS_ERR(rdev)) {
+- printk(KERN_WARNING
+- "md: md_import_device returned %ld\n",
+- PTR_ERR(rdev));
+- return PTR_ERR(rdev);
+- }
+- rdev->in_sync = 0; /* just to be sure */
+- rdev->raid_disk = -1;
+- err = bind_rdev_to_array(rdev, mddev);
+- if (err)
+- export_rdev(rdev);
+- if (mddev->thread)
+- md_wakeup_thread(mddev->thread);
+- return err;
+- }
+-
+- /* otherwise, add_new_disk is only allowed
+- * for major_version==0 superblocks
+- */
+- if (mddev->major_version != 0) {
+- printk(KERN_WARNING "md%d: ADD_NEW_DISK not supported\n",
+- mdidx(mddev));
+- return -EINVAL;
+- }
+-
+- if (!(info->state & (1<<MD_DISK_FAULTY))) {
+- int err;
+- rdev = md_import_device (dev, -1, 0);
+- if (IS_ERR(rdev)) {
+- printk(KERN_WARNING
+- "md: error, md_import_device() returned %ld\n",
+- PTR_ERR(rdev));
+- return PTR_ERR(rdev);
+- }
+- rdev->desc_nr = info->number;
+- if (info->raid_disk < mddev->raid_disks)
+- rdev->raid_disk = info->raid_disk;
+- else
+- rdev->raid_disk = -1;
+-
+- rdev->faulty = 0;
+- if (rdev->raid_disk < mddev->raid_disks)
+- rdev->in_sync = (info->state & (1<<MD_DISK_SYNC));
+- else
+- rdev->in_sync = 0;
+-
+- err = bind_rdev_to_array(rdev, mddev);
+- if (err) {
+- export_rdev(rdev);
+- return err;
+- }
+-
+- if (!mddev->persistent) {
+- printk(KERN_INFO "md: nonpersistent superblock ...\n");
+- rdev->sb_offset = rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS;
+- } else
+- rdev->sb_offset = calc_dev_sboffset(rdev->bdev);
+- rdev->size = calc_dev_size(rdev, mddev->chunk_size);
+-
+- if (!mddev->size || (mddev->size > rdev->size))
+- mddev->size = rdev->size;
+- }
+-
+- return 0;
+-}
+-
+-static int hot_generate_error(mddev_t * mddev, dev_t dev)
+-{
+- struct request_queue *q;
+- mdk_rdev_t *rdev;
+-
+- if (!mddev->pers)
+- return -ENODEV;
+-
+- printk(KERN_INFO "md: trying to generate %s error in md%d ... \n",
+- partition_name(dev), mdidx(mddev));
+-
+- rdev = find_rdev(mddev, dev);
+- if (!rdev) {
+- MD_BUG();
+- return -ENXIO;
+- }
+-
+- if (rdev->desc_nr == -1) {
+- MD_BUG();
+- return -EINVAL;
+- }
+- if (!rdev->in_sync)
+- return -ENODEV;
+-
+- q = bdev_get_queue(rdev->bdev);
+- if (!q) {
+- MD_BUG();
+- return -ENODEV;
+- }
+- printk(KERN_INFO "md: okay, generating error!\n");
+-// q->oneshot_error = 1; // disabled for now
+-
+- return 0;
+-}
+-
+-static int hot_remove_disk(mddev_t * mddev, dev_t dev)
+-{
+- mdk_rdev_t *rdev;
+-
+- if (!mddev->pers)
+- return -ENODEV;
+-
+- printk(KERN_INFO "md: trying to remove %s from md%d ... \n",
+- partition_name(dev), mdidx(mddev));
+-
+- rdev = find_rdev(mddev, dev);
+- if (!rdev)
+- return -ENXIO;
+-
+- if (rdev->raid_disk >= 0)
+- goto busy;
+-
+- kick_rdev_from_array(rdev);
+- md_update_sb(mddev);
+-
+- return 0;
+-busy:
+- printk(KERN_WARNING "md: cannot remove active disk %s from md%d ... \n",
+- bdev_partition_name(rdev->bdev), mdidx(mddev));
+- return -EBUSY;
+-}
+-
+-static int hot_add_disk(mddev_t * mddev, dev_t dev)
+-{
+- int err;
+- unsigned int size;
+- mdk_rdev_t *rdev;
+-
+- if (!mddev->pers)
+- return -ENODEV;
+-
+- printk(KERN_INFO "md: trying to hot-add %s to md%d ... \n",
+- partition_name(dev), mdidx(mddev));
+-
+- if (mddev->major_version != 0) {
+- printk(KERN_WARNING "md%d: HOT_ADD may only be used with"
+- " version-0 superblocks.\n",
+- mdidx(mddev));
+- return -EINVAL;
+- }
+- if (!mddev->pers->hot_add_disk) {
+- printk(KERN_WARNING
+- "md%d: personality does not support diskops!\n",
+- mdidx(mddev));
+- return -EINVAL;
+- }
+-
+- rdev = md_import_device (dev, -1, 0);
+- if (IS_ERR(rdev)) {
+- printk(KERN_WARNING
+- "md: error, md_import_device() returned %ld\n",
+- PTR_ERR(rdev));
+- return -EINVAL;
+- }
+-
+- rdev->sb_offset = calc_dev_sboffset(rdev->bdev);
+- size = calc_dev_size(rdev, mddev->chunk_size);
+- rdev->size = size;
+-
+- if (size < mddev->size) {
+- printk(KERN_WARNING
+- "md%d: disk size %llu blocks < array size %llu\n",
+- mdidx(mddev), (unsigned long long)size,
+- (unsigned long long)mddev->size);
+- err = -ENOSPC;
+- goto abort_export;
+- }
+-
+- if (rdev->faulty) {
+- printk(KERN_WARNING
+- "md: can not hot-add faulty %s disk to md%d!\n",
+- bdev_partition_name(rdev->bdev), mdidx(mddev));
+- err = -EINVAL;
+- goto abort_export;
+- }
+- rdev->in_sync = 0;
+- rdev->desc_nr = -1;
+- bind_rdev_to_array(rdev, mddev);
+-
+- /*
+-	 * The rest had better be atomic; we can have disk failures
+- * noticed in interrupt contexts ...
+- */
+-
+- if (rdev->desc_nr == mddev->max_disks) {
+- printk(KERN_WARNING "md%d: can not hot-add to full array!\n",
+- mdidx(mddev));
+- err = -EBUSY;
+- goto abort_unbind_export;
+- }
+-
+- rdev->raid_disk = -1;
+-
+- md_update_sb(mddev);
+-
+- /*
+- * Kick recovery, maybe this spare has to be added to the
+- * array immediately.
+- */
+- set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+- md_wakeup_thread(mddev->thread);
+-
+- return 0;
+-
+-abort_unbind_export:
+- unbind_rdev_from_array(rdev);
+-
+-abort_export:
+- export_rdev(rdev);
+- return err;
+-}
+-
+-/*
+- * set_array_info is used in two different ways.
+- * The original usage is when creating a new array.
+- * In this usage, raid_disks is > 0 and it together with
+- * level, size, not_persistent, layout, and chunksize determine the
+- * shape of the array.
+- * This will always create an array with a type-0.90.0 superblock.
+- * The newer usage is when assembling an array.
+- * In this case raid_disks will be 0, and the major_version field is
+- * used to determine which style super-blocks are to be found on the devices.
+- * The minor and patch _version numbers are also kept in case the
+- * super_block handler wishes to interpret them.
+- */
+-static int set_array_info(mddev_t * mddev, mdu_array_info_t *info)
+-{
+-
+- if (info->raid_disks == 0) {
+- /* just setting version number for superblock loading */
+- if (info->major_version < 0 ||
+- info->major_version >= sizeof(super_types)/sizeof(super_types[0]) ||
+- super_types[info->major_version].name == NULL) {
+- /* maybe try to auto-load a module? */
+- printk(KERN_INFO
+- "md: superblock version %d not known\n",
+- info->major_version);
+- return -EINVAL;
+- }
+- mddev->major_version = info->major_version;
+- mddev->minor_version = info->minor_version;
+- mddev->patch_version = info->patch_version;
+- return 0;
+- }
+- mddev->major_version = MD_MAJOR_VERSION;
+- mddev->minor_version = MD_MINOR_VERSION;
+- mddev->patch_version = MD_PATCHLEVEL_VERSION;
+- mddev->ctime = get_seconds();
+-
+- mddev->level = info->level;
+- mddev->size = info->size;
+- mddev->raid_disks = info->raid_disks;
+- /* don't set __minor, it is determined by which /dev/md* was
+-	 * opened
+- */
+- if (info->state & (1<<MD_SB_CLEAN))
+- mddev->recovery_cp = MaxSector;
+- else
+- mddev->recovery_cp = 0;
+- mddev->persistent = ! info->not_persistent;
+-
+- mddev->layout = info->layout;
+- mddev->chunk_size = info->chunk_size;
+-
+- mddev->max_disks = MD_SB_DISKS;
+-
+-
+- /*
+- * Generate a 128 bit UUID
+- */
+- get_random_bytes(mddev->uuid, 16);
+-
+- return 0;
+-}
+-
+-static int set_disk_faulty(mddev_t *mddev, dev_t dev)
+-{
+- mdk_rdev_t *rdev;
+-
+- rdev = find_rdev(mddev, dev);
+- if (!rdev)
+- return 0;
+-
+- md_error(mddev, rdev);
+- return 1;
+-}
+-
+-static int md_ioctl(struct inode *inode, struct file *file,
+- unsigned int cmd, unsigned long arg)
+-{
+- unsigned int minor;
+- int err = 0;
+- struct hd_geometry *loc = (struct hd_geometry *) arg;
+- mddev_t *mddev = NULL;
+- kdev_t dev;
+-
+- if (!capable(CAP_SYS_ADMIN))
+- return -EACCES;
+-
+- dev = inode->i_rdev;
+- minor = minor(dev);
+- if (minor >= MAX_MD_DEVS) {
+- MD_BUG();
+- return -EINVAL;
+- }
+-
+- /*
+- * Commands dealing with the RAID driver but not any
+- * particular array:
+- */
+- switch (cmd)
+- {
+- case RAID_VERSION:
+- err = get_version((void *)arg);
+- goto done;
+-
+- case PRINT_RAID_DEBUG:
+- err = 0;
+- md_print_devices();
+- goto done;
+-
+-#ifndef MODULE
+- case RAID_AUTORUN:
+- err = 0;
+- autostart_arrays();
+- goto done;
+-#endif
+- default:;
+- }
+-
+- /*
+- * Commands creating/starting a new array:
+- */
+-
+- mddev = inode->i_bdev->bd_inode->u.generic_ip;
+-
+- if (!mddev) {
+- BUG();
+- goto abort;
+- }
+-
+-
+- if (cmd == START_ARRAY) {
+- /* START_ARRAY doesn't need to lock the array as autostart_array
+- * does the locking, and it could even be a different array
+- */
+- err = autostart_array(arg);
+- if (err) {
+- printk(KERN_WARNING "md: autostart %s failed!\n",
+- partition_name(arg));
+- goto abort;
+- }
+- goto done;
+- }
+-
+- err = mddev_lock(mddev);
+- if (err) {
+- printk(KERN_INFO
+- "md: ioctl lock interrupted, reason %d, cmd %d\n",
+- err, cmd);
+- goto abort;
+- }
+-
+- switch (cmd)
+- {
+- case SET_ARRAY_INFO:
+-
+- if (!list_empty(&mddev->disks)) {
+- printk(KERN_WARNING
+- "md: array md%d already has disks!\n",
+- mdidx(mddev));
+- err = -EBUSY;
+- goto abort_unlock;
+- }
+- if (mddev->raid_disks) {
+- printk(KERN_WARNING
+- "md: array md%d already initialised!\n",
+- mdidx(mddev));
+- err = -EBUSY;
+- goto abort_unlock;
+- }
+- {
+- mdu_array_info_t info;
+- if (!arg)
+- memset(&info, 0, sizeof(info));
+- else if (copy_from_user(&info, (void*)arg, sizeof(info))) {
+- err = -EFAULT;
+- goto abort_unlock;
+- }
+- err = set_array_info(mddev, &info);
+- if (err) {
+- printk(KERN_WARNING "md: couldn't set"
+- " array info. %d\n", err);
+- goto abort_unlock;
+- }
+- }
+- goto done_unlock;
+-
+- default:;
+- }
+-
+- /*
+- * Commands querying/configuring an existing array:
+- */
+-	/* if we are not initialised yet, only ADD_NEW_DISK or STOP_ARRAY is allowed */
+- if (!mddev->raid_disks && cmd != ADD_NEW_DISK && cmd != STOP_ARRAY && cmd != RUN_ARRAY) {
+- err = -ENODEV;
+- goto abort_unlock;
+- }
+-
+- /*
+- * Commands even a read-only array can execute:
+- */
+- switch (cmd)
+- {
+- case GET_ARRAY_INFO:
+- err = get_array_info(mddev, (void *)arg);
+- goto done_unlock;
+-
+- case GET_DISK_INFO:
+- err = get_disk_info(mddev, (void *)arg);
+- goto done_unlock;
+-
+- case RESTART_ARRAY_RW:
+- err = restart_array(mddev);
+- goto done_unlock;
+-
+- case STOP_ARRAY:
+- err = do_md_stop (mddev, 0);
+- goto done_unlock;
+-
+- case STOP_ARRAY_RO:
+- err = do_md_stop (mddev, 1);
+- goto done_unlock;
+-
+- /*
+-		 * We have a problem here: there is no easy way to give a CHS
+-		 * virtual geometry. We currently pretend that we have 2 heads and
+-		 * 4 sectors (with a BIG number of cylinders...). This drives
+- * dosfs just mad... ;-)
+- */
+- case HDIO_GETGEO:
+- if (!loc) {
+- err = -EINVAL;
+- goto abort_unlock;
+- }
+- err = put_user (2, (char *) &loc->heads);
+- if (err)
+- goto abort_unlock;
+- err = put_user (4, (char *) &loc->sectors);
+- if (err)
+- goto abort_unlock;
+- err = put_user(get_capacity(disks[mdidx(mddev)])/8,
+- (short *) &loc->cylinders);
+- if (err)
+- goto abort_unlock;
+- err = put_user (get_start_sect(inode->i_bdev),
+- (long *) &loc->start);
+- goto done_unlock;
+- }
+-
+- /*
+- * The remaining ioctls are changing the state of the
+- * superblock, so we do not allow read-only arrays
+- * here:
+- */
+- if (mddev->ro) {
+- err = -EROFS;
+- goto abort_unlock;
+- }
+-
+- switch (cmd)
+- {
+- case ADD_NEW_DISK:
+- {
+- mdu_disk_info_t info;
+- if (copy_from_user(&info, (void*)arg, sizeof(info)))
+- err = -EFAULT;
+- else
+- err = add_new_disk(mddev, &info);
+- goto done_unlock;
+- }
+- case HOT_GENERATE_ERROR:
+- err = hot_generate_error(mddev, arg);
+- goto done_unlock;
+- case HOT_REMOVE_DISK:
+- err = hot_remove_disk(mddev, arg);
+- goto done_unlock;
+-
+- case HOT_ADD_DISK:
+- err = hot_add_disk(mddev, arg);
+- goto done_unlock;
+-
+- case SET_DISK_FAULTY:
+- err = set_disk_faulty(mddev, arg);
+- goto done_unlock;
+-
+- case RUN_ARRAY:
+- {
+- err = do_md_run (mddev);
+- /*
+- * we have to clean up the mess if
+- * the array cannot be run for some
+- * reason ...
+-			 * ->pers will not be set, so the superblock will
+- * not be updated.
+- */
+- if (err)
+- do_md_stop (mddev, 0);
+- goto done_unlock;
+- }
+-
+- default:
+- if (_IOC_TYPE(cmd) == MD_MAJOR)
+- printk(KERN_WARNING "md: %s(pid %d) used"
+- " obsolete MD ioctl, upgrade your"
+-				" software to use new ioctls.\n",
+- current->comm, current->pid);
+- err = -EINVAL;
+- goto abort_unlock;
+- }
+-
+-done_unlock:
+-abort_unlock:
+- mddev_unlock(mddev);
+-
+- return err;
+-done:
+- if (err)
+- MD_BUG();
+-abort:
+- return err;
+-}
+-
+-static int md_open(struct inode *inode, struct file *file)
+-{
+- /*
+- * Succeed if we can find or allocate a mddev structure.
+- */
+- mddev_t *mddev = mddev_find(minor(inode->i_rdev));
+- int err = -ENOMEM;
+-
+- if (!mddev)
+- goto out;
+-
+- if ((err = mddev_lock(mddev)))
+- goto put;
+-
+- err = 0;
+- mddev_unlock(mddev);
+- inode->i_bdev->bd_inode->u.generic_ip = mddev_get(mddev);
+- put:
+- mddev_put(mddev);
+- out:
+- return err;
+-}
+-
+-static int md_release(struct inode *inode, struct file * file)
+-{
+- mddev_t *mddev = inode->i_bdev->bd_inode->u.generic_ip;
+-
+- if (!mddev)
+- BUG();
+- mddev_put(mddev);
+-
+- return 0;
+-}
+-
+-static struct block_device_operations md_fops =
+-{
+- .owner = THIS_MODULE,
+- .open = md_open,
+- .release = md_release,
+- .ioctl = md_ioctl,
+-};
+-
+-int md_thread(void * arg)
+-{
+- mdk_thread_t *thread = arg;
+-
+- lock_kernel();
+-
+- /*
+- * Detach thread
+- */
+-
+- daemonize(thread->name, mdidx(thread->mddev));
+-
+- current->exit_signal = SIGCHLD;
+- allow_signal(SIGKILL);
+- thread->tsk = current;
+-
+- /*
+-	 * md_thread is a 'system-thread', its priority should be very
+- * high. We avoid resource deadlocks individually in each
+- * raid personality. (RAID5 does preallocation) We also use RR and
+- * the very same RT priority as kswapd, thus we will never get
+- * into a priority inversion deadlock.
+- *
+- * we definitely have to have equal or higher priority than
+- * bdflush, otherwise bdflush will deadlock if there are too
+- * many dirty RAID5 blocks.
+- */
+- unlock_kernel();
+-
+- complete(thread->event);
+- while (thread->run) {
+- void (*run)(mddev_t *);
+-
+- wait_event_interruptible(thread->wqueue,
+- test_bit(THREAD_WAKEUP, &thread->flags));
+- if (current->flags & PF_FREEZE)
+- refrigerator(PF_IOTHREAD);
+-
+- clear_bit(THREAD_WAKEUP, &thread->flags);
+-
+- run = thread->run;
+- if (run) {
+- run(thread->mddev);
+- blk_run_queues();
+- }
+- if (signal_pending(current))
+- flush_signals(current);
+- }
+- complete(thread->event);
+- return 0;
+-}
+-
+-void md_wakeup_thread(mdk_thread_t *thread)
+-{
+- if (thread) {
+- dprintk("md: waking up MD thread %p.\n", thread);
+- set_bit(THREAD_WAKEUP, &thread->flags);
+- wake_up(&thread->wqueue);
+- }
+-}
+-
+-mdk_thread_t *md_register_thread(void (*run) (mddev_t *), mddev_t *mddev,
+- const char *name)
+-{
+- mdk_thread_t *thread;
+- int ret;
+- struct completion event;
+-
+- thread = (mdk_thread_t *) kmalloc
+- (sizeof(mdk_thread_t), GFP_KERNEL);
+- if (!thread)
+- return NULL;
+-
+- memset(thread, 0, sizeof(mdk_thread_t));
+- init_waitqueue_head(&thread->wqueue);
+-
+- init_completion(&event);
+- thread->event = &event;
+- thread->run = run;
+- thread->mddev = mddev;
+- thread->name = name;
+- ret = kernel_thread(md_thread, thread, 0);
+- if (ret < 0) {
+- kfree(thread);
+- return NULL;
+- }
+- wait_for_completion(&event);
+- return thread;
+-}
+-
+-void md_interrupt_thread(mdk_thread_t *thread)
+-{
+- if (!thread->tsk) {
+- MD_BUG();
+- return;
+- }
+- dprintk("interrupting MD-thread pid %d\n", thread->tsk->pid);
+- send_sig(SIGKILL, thread->tsk, 1);
+-}
+-
+-void md_unregister_thread(mdk_thread_t *thread)
+-{
+- struct completion event;
+-
+- init_completion(&event);
+-
+- thread->event = &event;
+- thread->run = NULL;
+- thread->name = NULL;
+- md_interrupt_thread(thread);
+- wait_for_completion(&event);
+- kfree(thread);
+-}
+-
+-void md_error(mddev_t *mddev, mdk_rdev_t *rdev)
+-{
+- dprintk("md_error dev:(%d:%d), rdev:(%d:%d), (caller: %p,%p,%p,%p).\n",
+- MD_MAJOR,mdidx(mddev),
+- MAJOR(rdev->bdev->bd_dev), MINOR(rdev->bdev->bd_dev),
+- __builtin_return_address(0),__builtin_return_address(1),
+- __builtin_return_address(2),__builtin_return_address(3));
+-
+- if (!mddev) {
+- MD_BUG();
+- return;
+- }
+-
+- if (!rdev || rdev->faulty)
+- return;
+- if (!mddev->pers->error_handler)
+- return;
+- mddev->pers->error_handler(mddev,rdev);
+- set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+- md_wakeup_thread(mddev->thread);
+-}
+-
+-/* seq_file implementation /proc/mdstat */
+-
+-static void status_unused(struct seq_file *seq)
+-{
+- int i = 0;
+- mdk_rdev_t *rdev;
+- struct list_head *tmp;
+-
+- seq_printf(seq, "unused devices: ");
+-
+- ITERATE_RDEV_PENDING(rdev,tmp) {
+- i++;
+- seq_printf(seq, "%s ",
+- bdev_partition_name(rdev->bdev));
+- }
+- if (!i)
+- seq_printf(seq, "<none>");
+-
+- seq_printf(seq, "\n");
+-}
+-
+-
+-static void status_resync(struct seq_file *seq, mddev_t * mddev)
+-{
+- unsigned long max_blocks, resync, res, dt, db, rt;
+-
+- resync = (mddev->curr_resync - atomic_read(&mddev->recovery_active))/2;
+- max_blocks = mddev->size;
+-
+- /*
+- * Should not happen.
+- */
+- if (!max_blocks) {
+- MD_BUG();
+- return;
+- }
+- res = (resync/1024)*1000/(max_blocks/1024 + 1);
+- {
+- int i, x = res/50, y = 20-x;
+- seq_printf(seq, "[");
+- for (i = 0; i < x; i++)
+- seq_printf(seq, "=");
+- seq_printf(seq, ">");
+- for (i = 0; i < y; i++)
+- seq_printf(seq, ".");
+- seq_printf(seq, "] ");
+- }
+- seq_printf(seq, " %s =%3lu.%lu%% (%lu/%lu)",
+- (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ?
+- "resync" : "recovery"),
+- res/10, res % 10, resync, max_blocks);
+-
+- /*
+- * We do not want to overflow, so the order of operands and
+- * the * 100 / 100 trick are important. We do a +1 to be
+- * safe against division by zero. We only estimate anyway.
+- *
+- * dt: time from mark until now
+- * db: blocks written from mark until now
+- * rt: remaining time
+- */
+- dt = ((jiffies - mddev->resync_mark) / HZ);
+- if (!dt) dt++;
+- db = resync - (mddev->resync_mark_cnt/2);
+- rt = (dt * ((max_blocks-resync) / (db/100+1)))/100;
+-
+- seq_printf(seq, " finish=%lu.%lumin", rt / 60, (rt % 60)/6);
+-
+- seq_printf(seq, " speed=%ldK/sec", db/dt);
+-}
+-
+-static void *md_seq_start(struct seq_file *seq, loff_t *pos)
+-{
+- struct list_head *tmp;
+- loff_t l = *pos;
+- mddev_t *mddev;
+-
+- if (l > 0x10000)
+- return NULL;
+- if (!l--)
+- /* header */
+- return (void*)1;
+-
+- spin_lock(&all_mddevs_lock);
+- list_for_each(tmp,&all_mddevs)
+- if (!l--) {
+- mddev = list_entry(tmp, mddev_t, all_mddevs);
+- mddev_get(mddev);
+- spin_unlock(&all_mddevs_lock);
+- return mddev;
+- }
+- spin_unlock(&all_mddevs_lock);
+- return (void*)2;/* tail */
+-}
+-
+-static void *md_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+-{
+- struct list_head *tmp;
+- mddev_t *next_mddev, *mddev = v;
+-
+- ++*pos;
+- if (v == (void*)2)
+- return NULL;
+-
+- spin_lock(&all_mddevs_lock);
+- if (v == (void*)1)
+- tmp = all_mddevs.next;
+- else
+- tmp = mddev->all_mddevs.next;
+- if (tmp != &all_mddevs)
+- next_mddev = mddev_get(list_entry(tmp,mddev_t,all_mddevs));
+- else {
+- next_mddev = (void*)2;
+- *pos = 0x10000;
+- }
+- spin_unlock(&all_mddevs_lock);
+-
+- if (v != (void*)1)
+- mddev_put(mddev);
+- return next_mddev;
+-
+-}
+-
+-static void md_seq_stop(struct seq_file *seq, void *v)
+-{
+- mddev_t *mddev = v;
+-
+- if (mddev && v != (void*)1 && v != (void*)2)
+- mddev_put(mddev);
+-}
+-
+-static int md_seq_show(struct seq_file *seq, void *v)
+-{
+- mddev_t *mddev = v;
+- sector_t size;
+- struct list_head *tmp2;
+- mdk_rdev_t *rdev;
+- int i;
+-
+- if (v == (void*)1) {
+- seq_printf(seq, "Personalities : ");
+- spin_lock(&pers_lock);
+- for (i = 0; i < MAX_PERSONALITY; i++)
+- if (pers[i])
+- seq_printf(seq, "[%s] ", pers[i]->name);
+-
+- spin_unlock(&pers_lock);
+- seq_printf(seq, "\n");
+- return 0;
+- }
+- if (v == (void*)2) {
+- status_unused(seq);
+- return 0;
+- }
+-
+- if (mddev_lock(mddev)!=0)
+- return -EINTR;
+- if (mddev->pers || mddev->raid_disks || !list_empty(&mddev->disks)) {
+- seq_printf(seq, "md%d : %sactive", mdidx(mddev),
+- mddev->pers ? "" : "in");
+- if (mddev->pers) {
+- if (mddev->ro)
+- seq_printf(seq, " (read-only)");
+- seq_printf(seq, " %s", mddev->pers->name);
+- }
+-
+- size = 0;
+- ITERATE_RDEV(mddev,rdev,tmp2) {
+- seq_printf(seq, " %s[%d]",
+- bdev_partition_name(rdev->bdev), rdev->desc_nr);
+- if (rdev->faulty) {
+- seq_printf(seq, "(F)");
+- continue;
+- }
+- size += rdev->size;
+- }
+-
+- if (!list_empty(&mddev->disks)) {
+- if (mddev->pers)
+- seq_printf(seq, "\n %llu blocks",
+- (unsigned long long)mddev->array_size);
+- else
+- seq_printf(seq, "\n %llu blocks",
+- (unsigned long long)size);
+- }
+-
+- if (mddev->pers) {
+- mddev->pers->status (seq, mddev);
+- seq_printf(seq, "\n ");
+- if (mddev->curr_resync > 2)
+- status_resync (seq, mddev);
+- else if (mddev->curr_resync == 1 || mddev->curr_resync == 2)
+- seq_printf(seq, " resync=DELAYED");
+- }
+-
+- seq_printf(seq, "\n");
+- }
+- mddev_unlock(mddev);
+-
+- return 0;
+-}
+-
+-static struct seq_operations md_seq_ops = {
+- .start = md_seq_start,
+- .next = md_seq_next,
+- .stop = md_seq_stop,
+- .show = md_seq_show,
+-};
+-
+-static int md_seq_open(struct inode *inode, struct file *file)
+-{
+- int error;
+-
+- error = seq_open(file, &md_seq_ops);
+- return error;
+-}
+-
+-static struct file_operations md_seq_fops = {
+- .open = md_seq_open,
+- .read = seq_read,
+- .llseek = seq_lseek,
+- .release = seq_release,
+-};
+-
+-int register_md_personality(int pnum, mdk_personality_t *p)
+-{
+- if (pnum >= MAX_PERSONALITY) {
+- MD_BUG();
+- return -EINVAL;
+- }
+-
+- spin_lock(&pers_lock);
+- if (pers[pnum]) {
+- spin_unlock(&pers_lock);
+- MD_BUG();
+- return -EBUSY;
+- }
+-
+- pers[pnum] = p;
+- printk(KERN_INFO "md: %s personality registered as nr %d\n", p->name, pnum);
+- spin_unlock(&pers_lock);
+- return 0;
+-}
+-
+-int unregister_md_personality(int pnum)
+-{
+- if (pnum >= MAX_PERSONALITY) {
+- MD_BUG();
+- return -EINVAL;
+- }
+-
+- printk(KERN_INFO "md: %s personality unregistered\n", pers[pnum]->name);
+- spin_lock(&pers_lock);
+- pers[pnum] = NULL;
+- spin_unlock(&pers_lock);
+- return 0;
+-}
+-
+-void md_sync_acct(mdk_rdev_t *rdev, unsigned long nr_sectors)
+-{
+- rdev->bdev->bd_contains->bd_disk->sync_io += nr_sectors;
+-}
+-
+-static int is_mddev_idle(mddev_t *mddev)
+-{
+- mdk_rdev_t * rdev;
+- struct list_head *tmp;
+- int idle;
+- unsigned long curr_events;
+-
+- idle = 1;
+- ITERATE_RDEV(mddev,rdev,tmp) {
+- struct gendisk *disk = rdev->bdev->bd_contains->bd_disk;
+- curr_events = disk_stat_read(disk, read_sectors) +
+- disk_stat_read(disk, write_sectors) -
+- disk->sync_io;
+- if ((curr_events - rdev->last_events) > 32) {
+- rdev->last_events = curr_events;
+- idle = 0;
+- }
+- }
+- return idle;
+-}
+-
+-void md_done_sync(mddev_t *mddev, int blocks, int ok)
+-{
+- /* another "blocks" (512byte) blocks have been synced */
+- atomic_sub(blocks, &mddev->recovery_active);
+- wake_up(&mddev->recovery_wait);
+- if (!ok) {
+- set_bit(MD_RECOVERY_ERR, &mddev->recovery);
+- md_wakeup_thread(mddev->thread);
+- // stop recovery, signal do_sync ....
+- }
+-}
+-
+-
+-void md_write_start(mddev_t *mddev)
+-{
+- if (!atomic_read(&mddev->writes_pending)) {
+- mddev_lock_uninterruptible(mddev);
+- if (mddev->in_sync) {
+- mddev->in_sync = 0;
+- del_timer(&mddev->safemode_timer);
+- md_update_sb(mddev);
+- }
+- atomic_inc(&mddev->writes_pending);
+- mddev_unlock(mddev);
+- } else
+- atomic_inc(&mddev->writes_pending);
+-}
+-
+-void md_write_end(mddev_t *mddev)
+-{
+- if (atomic_dec_and_test(&mddev->writes_pending)) {
+- if (mddev->safemode == 2)
+- md_wakeup_thread(mddev->thread);
+- else
+- mod_timer(&mddev->safemode_timer, jiffies + mddev->safemode_delay);
+- }
+-}
+-
+-static inline void md_enter_safemode(mddev_t *mddev)
+-{
+- mddev_lock_uninterruptible(mddev);
+- if (mddev->safemode && !atomic_read(&mddev->writes_pending) &&
+- !mddev->in_sync && mddev->recovery_cp == MaxSector) {
+- mddev->in_sync = 1;
+- md_update_sb(mddev);
+- }
+- mddev_unlock(mddev);
+-
+- if (mddev->safemode == 1)
+- mddev->safemode = 0;
+-}
+-
+-void md_handle_safemode(mddev_t *mddev)
+-{
+- if (signal_pending(current)) {
+- printk(KERN_INFO "md: md%d in immediate safe mode\n",
+- mdidx(mddev));
+- mddev->safemode = 2;
+- flush_signals(current);
+- }
+- if (mddev->safemode)
+- md_enter_safemode(mddev);
+-}
+-
+-
+-DECLARE_WAIT_QUEUE_HEAD(resync_wait);
+-
+-#define SYNC_MARKS 10
+-#define SYNC_MARK_STEP (3*HZ)
+-static void md_do_sync(mddev_t *mddev)
+-{
+- mddev_t *mddev2;
+- unsigned int max_sectors, currspeed = 0,
+- j, window;
+- unsigned long mark[SYNC_MARKS];
+- unsigned long mark_cnt[SYNC_MARKS];
+- int last_mark,m;
+- struct list_head *tmp;
+- unsigned long last_check;
+-
+-	/* just in case the thread restarts... */
+- if (test_bit(MD_RECOVERY_DONE, &mddev->recovery))
+- return;
+-
+- /* we overload curr_resync somewhat here.
+- * 0 == not engaged in resync at all
+- * 2 == checking that there is no conflict with another sync
+- * 1 == like 2, but have yielded to allow conflicting resync to
+-	 *    commence
+- * other == active in resync - this many blocks
+- */
+- do {
+- mddev->curr_resync = 2;
+-
+- ITERATE_MDDEV(mddev2,tmp) {
+- if (mddev2 == mddev)
+- continue;
+- if (mddev2->curr_resync &&
+- match_mddev_units(mddev,mddev2)) {
+- printk(KERN_INFO "md: delaying resync of md%d"
+- " until md%d has finished resync (they"
+- " share one or more physical units)\n",
+- mdidx(mddev), mdidx(mddev2));
+- if (mddev < mddev2) {/* arbitrarily yield */
+- mddev->curr_resync = 1;
+- wake_up(&resync_wait);
+- }
+- if (wait_event_interruptible(resync_wait,
+- mddev2->curr_resync < mddev->curr_resync)) {
+- flush_signals(current);
+- mddev_put(mddev2);
+- goto skip;
+- }
+- }
+- if (mddev->curr_resync == 1) {
+- mddev_put(mddev2);
+- break;
+- }
+- }
+- } while (mddev->curr_resync < 2);
+-
+- max_sectors = mddev->size << 1;
+-
+- printk(KERN_INFO "md: syncing RAID array md%d\n", mdidx(mddev));
+- printk(KERN_INFO "md: minimum _guaranteed_ reconstruction speed:"
+- " %d KB/sec/disc.\n", sysctl_speed_limit_min);
+- printk(KERN_INFO "md: using maximum available idle IO bandwith "
+- "(but not more than %d KB/sec) for reconstruction.\n",
+- sysctl_speed_limit_max);
+-
+- is_mddev_idle(mddev); /* this also initializes IO event counters */
+- if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
+- j = mddev->recovery_cp;
+- else
+- j = 0;
+- for (m = 0; m < SYNC_MARKS; m++) {
+- mark[m] = jiffies;
+- mark_cnt[m] = j;
+- }
+- last_mark = 0;
+- mddev->resync_mark = mark[last_mark];
+- mddev->resync_mark_cnt = mark_cnt[last_mark];
+-
+- /*
+- * Tune reconstruction:
+- */
+- window = 32*(PAGE_SIZE/512);
+- printk(KERN_INFO "md: using %dk window, over a total of %d blocks.\n",
+- window/2,max_sectors/2);
+-
+- atomic_set(&mddev->recovery_active, 0);
+- init_waitqueue_head(&mddev->recovery_wait);
+- last_check = 0;
+-
+- if (j)
+- printk(KERN_INFO
+- "md: resuming recovery of md%d from checkpoint.\n",
+- mdidx(mddev));
+-
+- while (j < max_sectors) {
+- int sectors;
+-
+- sectors = mddev->pers->sync_request(mddev, j, currspeed < sysctl_speed_limit_min);
+- if (sectors < 0) {
+- set_bit(MD_RECOVERY_ERR, &mddev->recovery);
+- goto out;
+- }
+- atomic_add(sectors, &mddev->recovery_active);
+- j += sectors;
+- if (j>1) mddev->curr_resync = j;
+-
+- if (last_check + window > j)
+- continue;
+-
+- last_check = j;
+-
+- if (test_bit(MD_RECOVERY_INTR, &mddev->recovery) ||
+- test_bit(MD_RECOVERY_ERR, &mddev->recovery))
+- break;
+-
+- blk_run_queues();
+-
+- repeat:
+- if (jiffies >= mark[last_mark] + SYNC_MARK_STEP ) {
+- /* step marks */
+- int next = (last_mark+1) % SYNC_MARKS;
+-
+- mddev->resync_mark = mark[next];
+- mddev->resync_mark_cnt = mark_cnt[next];
+- mark[next] = jiffies;
+- mark_cnt[next] = j - atomic_read(&mddev->recovery_active);
+- last_mark = next;
+- }
+-
+-
+- if (signal_pending(current)) {
+- /*
+- * got a signal, exit.
+- */
+- printk(KERN_INFO
+- "md: md_do_sync() got signal ... exiting\n");
+- flush_signals(current);
+- set_bit(MD_RECOVERY_INTR, &mddev->recovery);
+- goto out;
+- }
+-
+- /*
+-		 * this loop exits only when either we are slower than
+- * the 'hard' speed limit, or the system was IO-idle for
+- * a jiffy.
+- * the system might be non-idle CPU-wise, but we only care
+- * about not overloading the IO subsystem. (things like an
+- * e2fsck being done on the RAID array should execute fast)
+- */
+- cond_resched();
+-
+- currspeed = (j-mddev->resync_mark_cnt)/2/((jiffies-mddev->resync_mark)/HZ +1) +1;
+-
+- if (currspeed > sysctl_speed_limit_min) {
+- if ((currspeed > sysctl_speed_limit_max) ||
+- !is_mddev_idle(mddev)) {
+- current->state = TASK_INTERRUPTIBLE;
+- schedule_timeout(HZ/4);
+- goto repeat;
+- }
+- }
+- }
+- printk(KERN_INFO "md: md%d: sync done.\n",mdidx(mddev));
+- /*
+- * this also signals 'finished resyncing' to md_stop
+- */
+- out:
+- wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active));
+-
+- /* tell personality that we are finished */
+- mddev->pers->sync_request(mddev, max_sectors, 1);
+-
+- if (!test_bit(MD_RECOVERY_ERR, &mddev->recovery) &&
+- mddev->curr_resync > 2 &&
+- mddev->curr_resync > mddev->recovery_cp) {
+- if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
+- printk(KERN_INFO
+- "md: checkpointing recovery of md%d.\n",
+- mdidx(mddev));
+- mddev->recovery_cp = mddev->curr_resync;
+- } else
+- mddev->recovery_cp = MaxSector;
+- }
+-
+- if (mddev->safemode)
+- md_enter_safemode(mddev);
+- skip:
+- mddev->curr_resync = 0;
+- set_bit(MD_RECOVERY_DONE, &mddev->recovery);
+- md_wakeup_thread(mddev->thread);
+-}
+-
+-
+-/*
+- * This routine is regularly called by all per-raid-array threads to
+- * deal with generic issues like resync and super-block update.
+- * Raid personalities that don't have a thread (linear/raid0) do not
+- * need this as they never do any recovery or update the superblock.
+- *
+- * It does not do any resync itself, but rather "forks" off other threads
+- * to do that as needed.
+- * When it is determined that resync is needed, we set MD_RECOVERY_RUNNING in
+- * "->recovery" and create a thread at ->sync_thread.
+- * When the thread finishes it sets MD_RECOVERY_DONE (and might set MD_RECOVERY_ERR)
+- * and wakes up this thread which will reap the thread and finish up.
+- * This thread also removes any faulty devices (with nr_pending == 0).
+- *
+- * The overall approach is:
+- * 1/ if the superblock needs updating, update it.
+- * 2/ If a recovery thread is running, don't do anything else.
+- * 3/ If recovery has finished, clean up, possibly marking spares active.
+- * 4/ If there are any faulty devices, remove them.
+- * 5/ If array is degraded, try to add spare devices
+- * 6/ If array has spares or is not in-sync, start a resync thread.
+- */
+-void md_check_recovery(mddev_t *mddev)
+-{
+- mdk_rdev_t *rdev;
+- struct list_head *rtmp;
+-
+-
+- dprintk(KERN_INFO "md: recovery thread got woken up ...\n");
+-
+- if (mddev->ro)
+- return;
+- if ( ! (
+- mddev->sb_dirty ||
+- test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) ||
+- test_bit(MD_RECOVERY_DONE, &mddev->recovery)
+- ))
+- return;
+- if (mddev_trylock(mddev)==0) {
+- int spares =0;
+- if (mddev->sb_dirty)
+- md_update_sb(mddev);
+- if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) &&
+- !test_bit(MD_RECOVERY_DONE, &mddev->recovery))
+- /* resync/recovery still happening */
+- goto unlock;
+- if (mddev->sync_thread) {
+- /* resync has finished, collect result */
+- md_unregister_thread(mddev->sync_thread);
+- mddev->sync_thread = NULL;
+- if (!test_bit(MD_RECOVERY_ERR, &mddev->recovery)) {
+- /* success...*/
+- /* activate any spares */
+- mddev->pers->spare_active(mddev);
+- }
+- md_update_sb(mddev);
+- mddev->recovery = 0;
+- wake_up(&resync_wait);
+- goto unlock;
+- }
+- if (mddev->recovery) {
+- /* that's odd.. */
+- mddev->recovery = 0;
+- wake_up(&resync_wait);
+- }
+-
+- /* no recovery is running.
+- * remove any failed drives, then
+- * add spares if possible
+- */
+- ITERATE_RDEV(mddev,rdev,rtmp) {
+- if (rdev->raid_disk >= 0 &&
+- rdev->faulty &&
+- atomic_read(&rdev->nr_pending)==0) {
+- mddev->pers->hot_remove_disk(mddev, rdev->raid_disk);
+- rdev->raid_disk = -1;
+- }
+- if (!rdev->faulty && rdev->raid_disk >= 0 && !rdev->in_sync)
+- spares++;
+- }
+- if (mddev->degraded) {
+- ITERATE_RDEV(mddev,rdev,rtmp)
+- if (rdev->raid_disk < 0
+- && !rdev->faulty) {
+- if (mddev->pers->hot_add_disk(mddev,rdev))
+- spares++;
+- else
+- break;
+- }
+- }
+-
+- if (!spares && (mddev->recovery_cp == MaxSector )) {
+- /* nothing we can do ... */
+- goto unlock;
+- }
+- if (mddev->pers->sync_request) {
+- set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
+- if (!spares)
+- set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
+- mddev->sync_thread = md_register_thread(md_do_sync,
+- mddev,
+- "md%d_resync");
+- if (!mddev->sync_thread) {
+- printk(KERN_ERR "md%d: could not start resync"
+- " thread...\n",
+- mdidx(mddev));
+- /* leave the spares where they are, it shouldn't hurt */
+- mddev->recovery = 0;
+- } else {
+- md_wakeup_thread(mddev->sync_thread);
+- }
+- }
+- unlock:
+- mddev_unlock(mddev);
+- }
+-}
+-
+-int md_notify_reboot(struct notifier_block *this,
+- unsigned long code, void *x)
+-{
+- struct list_head *tmp;
+- mddev_t *mddev;
+-
+- if ((code == SYS_DOWN) || (code == SYS_HALT) || (code == SYS_POWER_OFF)) {
+-
+- printk(KERN_INFO "md: stopping all md devices.\n");
+-
+- ITERATE_MDDEV(mddev,tmp)
+- if (mddev_trylock(mddev)==0)
+- do_md_stop (mddev, 1);
+- /*
+- * certain more exotic SCSI devices are known to be
+- * volatile wrt too early system reboots. While the
+- * right place to handle this issue is the given
+- * driver, we do want to have a safe RAID driver ...
+- */
+- mdelay(1000*1);
+- }
+- return NOTIFY_DONE;
+-}
+-
+-struct notifier_block md_notifier = {
+- .notifier_call = md_notify_reboot,
+- .next = NULL,
+- .priority = INT_MAX, /* before any real devices */
+-};
+-
+-static void md_geninit(void)
+-{
+- struct proc_dir_entry *p;
+-
+- dprintk("md: sizeof(mdp_super_t) = %d\n", (int)sizeof(mdp_super_t));
+-
+-#ifdef CONFIG_PROC_FS
+- p = create_proc_entry("mdstat", S_IRUGO, NULL);
+- if (p)
+- p->proc_fops = &md_seq_fops;
+-#endif
+-}
+-
+-int __init md_init(void)
+-{
+- int minor;
+-
+- printk(KERN_INFO "md: md driver %d.%d.%d MAX_MD_DEVS=%d,"
+- " MD_SB_DISKS=%d\n",
+- MD_MAJOR_VERSION, MD_MINOR_VERSION,
+- MD_PATCHLEVEL_VERSION, MAX_MD_DEVS, MD_SB_DISKS);
+-
+- if (register_blkdev(MAJOR_NR, "md"))
+- return -1;
+-
+- devfs_mk_dir("md");
+- blk_register_region(MKDEV(MAJOR_NR, 0), MAX_MD_DEVS, THIS_MODULE,
+- md_probe, NULL, NULL);
+- for (minor=0; minor < MAX_MD_DEVS; ++minor) {
+- char name[16];
+- sprintf(name, "md/%d", minor);
+- devfs_register(NULL, name, DEVFS_FL_DEFAULT, MAJOR_NR, minor,
+- S_IFBLK | S_IRUSR | S_IWUSR, &md_fops, NULL);
+- }
+-
+- register_reboot_notifier(&md_notifier);
+- raid_table_header = register_sysctl_table(raid_root_table, 1);
+-
+- md_geninit();
+- return (0);
+-}
+-
+-
+-#ifndef MODULE
+-
+-/*
+- * Searches all registered partitions for autorun RAID arrays
+- * at boot time.
+- */
+-static dev_t detected_devices[128];
+-static int dev_cnt;
+-
+-void md_autodetect_dev(dev_t dev)
+-{
+- if (dev_cnt >= 0 && dev_cnt < 127)
+- detected_devices[dev_cnt++] = dev;
+-}
+-
+-
+-static void autostart_arrays(void)
+-{
+- mdk_rdev_t *rdev;
+- int i;
+-
+- printk(KERN_INFO "md: Autodetecting RAID arrays.\n");
+-
+- for (i = 0; i < dev_cnt; i++) {
+- dev_t dev = detected_devices[i];
+-
+- rdev = md_import_device(dev,0, 0);
+- if (IS_ERR(rdev)) {
+- printk(KERN_ALERT "md: could not import %s!\n",
+- partition_name(dev));
+- continue;
+- }
+- if (rdev->faulty) {
+- MD_BUG();
+- continue;
+- }
+- list_add(&rdev->same_set, &pending_raid_disks);
+- }
+- dev_cnt = 0;
+-
+- autorun_devices();
+-}
+-
+-#endif
+-
+-static __exit void md_exit(void)
+-{
+- int i;
+- blk_unregister_region(MKDEV(MAJOR_NR,0), MAX_MD_DEVS);
+- for (i=0; i < MAX_MD_DEVS; i++)
+- devfs_remove("md/%d", i);
+- devfs_remove("md");
+-
+- unregister_blkdev(MAJOR_NR,"md");
+- unregister_reboot_notifier(&md_notifier);
+- unregister_sysctl_table(raid_table_header);
+-#ifdef CONFIG_PROC_FS
+- remove_proc_entry("mdstat", NULL);
+-#endif
+- for (i = 0; i < MAX_MD_DEVS; i++) {
+- struct gendisk *disk = disks[i];
+- mddev_t *mddev;
+- if (!disks[i])
+- continue;
+- mddev = disk->private_data;
+- del_gendisk(disk);
+- put_disk(disk);
+- mddev_put(mddev);
+- }
+-}
+-
+-module_init(md_init)
+-module_exit(md_exit)
+-
+-EXPORT_SYMBOL(register_md_personality);
+-EXPORT_SYMBOL(unregister_md_personality);
+-EXPORT_SYMBOL(md_error);
+-EXPORT_SYMBOL(md_sync_acct);
+-EXPORT_SYMBOL(md_done_sync);
+-EXPORT_SYMBOL(md_write_start);
+-EXPORT_SYMBOL(md_write_end);
+-EXPORT_SYMBOL(md_handle_safemode);
+-EXPORT_SYMBOL(md_register_thread);
+-EXPORT_SYMBOL(md_unregister_thread);
+-EXPORT_SYMBOL(md_wakeup_thread);
+-EXPORT_SYMBOL(md_print_devices);
+-EXPORT_SYMBOL(md_interrupt_thread);
+-EXPORT_SYMBOL(md_check_recovery);
+-MODULE_LICENSE("GPL");
diff --git a/tests/linux/md/lmerge b/tests/linux/md/lmerge
new file mode 100644
index 0000000..4238601
--- /dev/null
+++ b/tests/linux/md/lmerge
@@ -0,0 +1,3589 @@
+/*
+ md.c : Multiple Devices driver for Linux
+ Copyright (C) 1998, 1999, 2000 Ingo Molnar
+
+ completely rewritten, based on the MD driver code from Marc Zyngier
+
+ Changes:
+
+ - RAID-1/RAID-5 extensions by Miguel de Icaza, Gadi Oxman, Ingo Molnar
+ - boot support for linear and striped mode by Harald Hoyer <HarryH@Royal.Net>
+ - kerneld support by Boris Tobotras <boris@xtalk.msk.su>
+ - kmod support by: Cyrus Durgin
+ - RAID0 bugfixes: Mark Anthony Lisher <markal@iname.com>
+ - Devfs support by Richard Gooch <rgooch@atnf.csiro.au>
+
+ - lots of fixes and improvements to the RAID1/RAID5 and generic
+ RAID code (such as request based resynchronization):
+
+ Neil Brown <neilb@cse.unsw.edu.au>.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2, or (at your option)
+ any later version.
+
+ You should have received a copy of the GNU General Public License
+ (for example /usr/src/linux/COPYING); if not, write to the Free
+ Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+*/
+
+#include <linux/module.h>
+#include <linux/config.h>
+#include <linux/linkage.h>
+#include <linux/raid/md.h>
+#include <linux/sysctl.h>
+#include <linux/bio.h>
+#include <linux/devfs_fs_kernel.h>
+#include <linux/buffer_head.h> /* for invalidate_bdev */
+#include <linux/suspend.h>
+
+#include <linux/init.h>
+
+#ifdef CONFIG_KMOD
+#include <linux/kmod.h>
+#endif
+
+#define __KERNEL_SYSCALLS__
+#include <linux/unistd.h>
+
+#include <asm/unaligned.h>
+
+#define MAJOR_NR MD_MAJOR
+#define MD_DRIVER
+#define DEVICE_NR(device) (minor(device))
+
+#include <linux/blk.h>
+
+#define DEBUG 0
+#define dprintk(x...) ((void)(DEBUG && printk(x)))
+
+
+#ifndef MODULE
+static void autostart_arrays (void);
+#endif
+
+static mdk_personality_t *pers[MAX_PERSONALITY];
+static spinlock_t pers_lock = SPIN_LOCK_UNLOCKED;
+
+/*
+ * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit'
+ * is 1000 KB/sec, so the extra system load does not show up that much.
+ * Increase it if you want to have more _guaranteed_ speed. Note that
+ * the RAID driver will use the maximum available bandwidth if the IO
+ * subsystem is idle. There is also an 'absolute maximum' reconstruction
+ * speed limit - in case reconstruction slows down your system despite
+ * idle IO detection.
+ *
+ * you can change it via /proc/sys/dev/raid/speed_limit_min and _max.
+ */
+
+static int sysctl_speed_limit_min = 1000;
+static int sysctl_speed_limit_max = 200000;
+
+static struct ctl_table_header *raid_table_header;
+
+static ctl_table raid_table[] = {
+ {
+ .ctl_name = DEV_RAID_SPEED_LIMIT_MIN,
+ .procname = "speed_limit_min",
+ .data = &sysctl_speed_limit_min,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+ {
+ .ctl_name = DEV_RAID_SPEED_LIMIT_MAX,
+ .procname = "speed_limit_max",
+ .data = &sysctl_speed_limit_max,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+ { .ctl_name = 0 }
+};
+
+static ctl_table raid_dir_table[] = {
+ {
+ .ctl_name = DEV_RAID,
+ .procname = "raid",
+ .maxlen = 0,
+ .mode = 0555,
+ .child = raid_table,
+ },
+ { .ctl_name = 0 }
+};
+
+static ctl_table raid_root_table[] = {
+ {
+ .ctl_name = CTL_DEV,
+ .procname = "dev",
+ .maxlen = 0,
+ .mode = 0555,
+ .child = raid_dir_table,
+ },
+ { .ctl_name = 0 }
+};
+
+static struct block_device_operations md_fops;
+
+static struct gendisk *disks[MAX_MD_DEVS];
+
+/*
+ * Enables iteration over all existing md arrays
+ * all_mddevs_lock protects this list as well as mddev_map.
+ */
+static LIST_HEAD(all_mddevs);
+static spinlock_t all_mddevs_lock = SPIN_LOCK_UNLOCKED;
+
+
+/*
+ * iterates through all used mddevs in the system.
+ * We take care to grab the all_mddevs_lock whenever navigating
+ * the list, and to always hold a refcount when unlocked.
+ * Any code which breaks out of this loop while owning
+ * a reference to the current mddev must mddev_put it.
+ */
+#define ITERATE_MDDEV(mddev,tmp) \
+ \
+ for (({ spin_lock(&all_mddevs_lock); \
+ tmp = all_mddevs.next; \
+ mddev = NULL;}); \
+ ({ if (tmp != &all_mddevs) \
+ mddev_get(list_entry(tmp, mddev_t, all_mddevs));\
+ spin_unlock(&all_mddevs_lock); \
+ if (mddev) mddev_put(mddev); \
+ mddev = list_entry(tmp, mddev_t, all_mddevs); \
+ tmp != &all_mddevs;}); \
+ ({ spin_lock(&all_mddevs_lock); \
+ tmp = tmp->next;}) \
+ )
+
+static mddev_t *mddev_map[MAX_MD_DEVS];
+
+static int md_fail_request (request_queue_t *q, struct bio *bio)
+{
+ bio_io_error(bio, bio->bi_size);
+ return 0;
+}
+
+static inline mddev_t *mddev_get(mddev_t *mddev)
+{
+ atomic_inc(&mddev->active);
+ return mddev;
+}
+
+static void mddev_put(mddev_t *mddev)
+{
+ if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock))
+ return;
+ if (!mddev->raid_disks && list_empty(&mddev->disks)) {
+ list_del(&mddev->all_mddevs);
+ mddev_map[mdidx(mddev)] = NULL;
+ kfree(mddev);
+ MOD_DEC_USE_COUNT;
+ }
+ spin_unlock(&all_mddevs_lock);
+}
+
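+/*
+ * Find (or create) the mddev for a given unit number.  The allocation
+ * happens without all_mddevs_lock held; after allocating we jump back
+ * to "retry" and re-check mddev_map under the lock, so if another
+ * thread won the race the fresh allocation is simply freed again.
+ */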
+static mddev_t * mddev_find(int unit)
+{
+ mddev_t *mddev, *new = NULL;
+
+ retry:
+ spin_lock(&all_mddevs_lock);
+ if (mddev_map[unit]) {
+ mddev = mddev_get(mddev_map[unit]);
+ spin_unlock(&all_mddevs_lock);
+ if (new)
+ kfree(new);
+ return mddev;
+ }
+ if (new) {
+ mddev_map[unit] = new;
+ list_add(&new->all_mddevs, &all_mddevs);
+ spin_unlock(&all_mddevs_lock);
+ MOD_INC_USE_COUNT;
+ return new;
+ }
+ spin_unlock(&all_mddevs_lock);
+
+ new = (mddev_t *) kmalloc(sizeof(*new), GFP_KERNEL);
+ if (!new)
+ return NULL;
+
+ memset(new, 0, sizeof(*new));
+
+ new->__minor = unit;
+ init_MUTEX(&new->reconfig_sem);
+ INIT_LIST_HEAD(&new->disks);
+ INIT_LIST_HEAD(&new->all_mddevs);
+ init_timer(&new->safemode_timer);
+ atomic_set(&new->active, 1);
+ blk_queue_make_request(&new->queue, md_fail_request);
+
+ goto retry;
+}
+
+static inline int mddev_lock(mddev_t * mddev)
+{
+ return down_interruptible(&mddev->reconfig_sem);
+}
+
+static inline void mddev_lock_uninterruptible(mddev_t * mddev)
+{
+ down(&mddev->reconfig_sem);
+}
+
+static inline int mddev_trylock(mddev_t * mddev)
+{
+ return down_trylock(&mddev->reconfig_sem);
+}
+
+static inline void mddev_unlock(mddev_t * mddev)
+{
+ up(&mddev->reconfig_sem);
+}
+
+mdk_rdev_t * find_rdev_nr(mddev_t *mddev, int nr)
+{
+ mdk_rdev_t * rdev;
+ struct list_head *tmp;
+
+ ITERATE_RDEV(mddev,rdev,tmp) {
+ if (rdev->desc_nr == nr)
+ return rdev;
+ }
+ return NULL;
+}
+
+static mdk_rdev_t * find_rdev(mddev_t * mddev, dev_t dev)
+{
+ struct list_head *tmp;
+ mdk_rdev_t *rdev;
+
+ ITERATE_RDEV(mddev,rdev,tmp) {
+ if (rdev->bdev->bd_dev == dev)
+ return rdev;
+ }
+ return NULL;
+}
+
+inline static sector_t calc_dev_sboffset(struct block_device *bdev)
+{
+ sector_t size = bdev->bd_inode->i_size >> BLOCK_SIZE_BITS;
+ return MD_NEW_SIZE_BLOCKS(size);
+}
+
+static sector_t calc_dev_size(mdk_rdev_t *rdev, unsigned chunk_size)
+{
+ sector_t size;
+
+ size = rdev->sb_offset;
+
+ if (chunk_size)
+ size &= ~((sector_t)chunk_size/1024 - 1);
+ return size;
+}
+
+static int alloc_disk_sb(mdk_rdev_t * rdev)
+{
+ if (rdev->sb_page)
+ MD_BUG();
+
+ rdev->sb_page = alloc_page(GFP_KERNEL);
+ if (!rdev->sb_page) {
+ printk(KERN_ALERT "md: out of memory.\n");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static void free_disk_sb(mdk_rdev_t * rdev)
+{
+ if (rdev->sb_page) {
+ page_cache_release(rdev->sb_page);
+ rdev->sb_loaded = 0;
+ rdev->sb_page = NULL;
+ rdev->sb_offset = 0;
+ rdev->size = 0;
+ }
+}
+
+
+static int bi_complete(struct bio *bio, unsigned int bytes_done, int error)
+{
+ if (bio->bi_size)
+ return 1;
+
+ complete((struct completion*)bio->bi_private);
+ return 0;
+}
+
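+/*
+ * Read or write one page synchronously: build a single-vector bio on
+ * the stack, submit it, kick the queues and wait on a completion.
+ * Returns non-zero when the transfer finished with BIO_UPTODATE set.
+ */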
+static int sync_page_io(struct block_device *bdev, sector_t sector, int size,
+ struct page *page, int rw)
+{
+ struct bio bio;
+ struct bio_vec vec;
+ struct completion event;
+
+ bio_init(&bio);
+ bio.bi_io_vec = &vec;
+ vec.bv_page = page;
+ vec.bv_len = size;
+ vec.bv_offset = 0;
+ bio.bi_vcnt = 1;
+ bio.bi_idx = 0;
+ bio.bi_size = size;
+ bio.bi_bdev = bdev;
+ bio.bi_sector = sector;
+ init_completion(&event);
+ bio.bi_private = &event;
+ bio.bi_end_io = bi_complete;
+ submit_bio(rw, &bio);
+ blk_run_queues();
+ wait_for_completion(&event);
+
+ return test_bit(BIO_UPTODATE, &bio.bi_flags);
+}
+
+static int read_disk_sb(mdk_rdev_t * rdev)
+{
+
+ if (!rdev->sb_page) {
+ MD_BUG();
+ return -EINVAL;
+ }
+ if (rdev->sb_loaded)
+ return 0;
+
+
+ if (!sync_page_io(rdev->bdev, rdev->sb_offset<<1, MD_SB_BYTES, rdev->sb_page, READ))
+ goto fail;
+ rdev->sb_loaded = 1;
+ return 0;
+
+fail:
+ printk(KERN_ERR "md: disabled device %s, could not read superblock.\n",
+ bdev_partition_name(rdev->bdev));
+ return -EINVAL;
+}
+
+static int uuid_equal(mdp_super_t *sb1, mdp_super_t *sb2)
+{
+ if ( (sb1->set_uuid0 == sb2->set_uuid0) &&
+ (sb1->set_uuid1 == sb2->set_uuid1) &&
+ (sb1->set_uuid2 == sb2->set_uuid2) &&
+ (sb1->set_uuid3 == sb2->set_uuid3))
+
+ return 1;
+
+ return 0;
+}
+
+
+static int sb_equal(mdp_super_t *sb1, mdp_super_t *sb2)
+{
+ int ret;
+ mdp_super_t *tmp1, *tmp2;
+
+ tmp1 = kmalloc(sizeof(*tmp1),GFP_KERNEL);
+ tmp2 = kmalloc(sizeof(*tmp2),GFP_KERNEL);
+
+ if (!tmp1 || !tmp2) {
+ ret = 0;
+ printk(KERN_INFO "md.c: sb1 is not equal to sb2!\n");
+ goto abort;
+ }
+
+ *tmp1 = *sb1;
+ *tmp2 = *sb2;
+
+ /*
+ * nr_disks is not constant
+ */
+ tmp1->nr_disks = 0;
+ tmp2->nr_disks = 0;
+
+ if (memcmp(tmp1, tmp2, MD_SB_GENERIC_CONSTANT_WORDS * 4))
+ ret = 0;
+ else
+ ret = 1;
+
+abort:
+ if (tmp1)
+ kfree(tmp1);
+ if (tmp2)
+ kfree(tmp2);
+
+ return ret;
+}
+
+static unsigned int calc_sb_csum(mdp_super_t * sb)
+{
+ unsigned int disk_csum, csum;
+
+ disk_csum = sb->sb_csum;
+ sb->sb_csum = 0;
+ csum = csum_partial((void *)sb, MD_SB_BYTES, 0);
+ sb->sb_csum = disk_csum;
+ return csum;
+}
+
+/*
+ * Handle superblock details.
+ * We want to be able to handle multiple superblock formats
+ * so we have a common interface to them all, and an array of
+ * different handlers.
+ * We rely on user-space to write the initial superblock, and support
+ * reading and updating of superblocks.
+ * Interface methods are:
+ * int load_super(mdk_rdev_t *dev, mdk_rdev_t *refdev, int minor_version)
+ * loads and validates a superblock on dev.
+ * if refdev != NULL, compare superblocks on both devices
+ * Return:
+ * 0 - dev has a superblock that is compatible with refdev
+ * 1 - dev has a superblock that is compatible and newer than refdev
+ * so dev should be used as the refdev in future
+ * -EINVAL superblock incompatible or invalid
+ * -othererror e.g. -EIO
+ *
+ * int validate_super(mddev_t *mddev, mdk_rdev_t *dev)
+ * Verify that dev is acceptable into mddev.
+ * The first time, mddev->raid_disks will be 0, and data from
+ * dev should be merged in. Subsequent calls check that dev
+ * is new enough. Return 0 or -EINVAL
+ *
+ * void sync_super(mddev_t *mddev, mdk_rdev_t *dev)
+ * Update the superblock for rdev with data in mddev
+ * This does not write to disc.
+ *
+ */
+
+struct super_type {
+ char *name;
+ struct module *owner;
+ int (*load_super)(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version);
+ int (*validate_super)(mddev_t *mddev, mdk_rdev_t *rdev);
+ void (*sync_super)(mddev_t *mddev, mdk_rdev_t *rdev);
+};
+
+/*
+ * load_super for 0.90.0
+ */
+static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
+{
+ mdp_super_t *sb;
+ int ret;
+ sector_t sb_offset;
+
+ /*
+ * Calculate the position of the superblock,
+ * it's at the end of the disk.
+ *
+ * It also happens to be a multiple of 4Kb.
+ */
+ sb_offset = calc_dev_sboffset(rdev->bdev);
+ rdev->sb_offset = sb_offset;
+
+ ret = read_disk_sb(rdev);
+ if (ret) return ret;
+
+ ret = -EINVAL;
+
+ sb = (mdp_super_t*)page_address(rdev->sb_page);
+
+ if (sb->md_magic != MD_SB_MAGIC) {
+ printk(KERN_ERR "md: invalid raid superblock magic on %s\n",
+ bdev_partition_name(rdev->bdev));
+ goto abort;
+ }
+
+ if (sb->major_version != 0 ||
+ sb->minor_version != 90) {
+ printk(KERN_WARNING "Bad version number %d.%d on %s\n",
+ sb->major_version, sb->minor_version,
+ bdev_partition_name(rdev->bdev));
+ goto abort;
+ }
+
+ if (sb->md_minor >= MAX_MD_DEVS) {
+ printk(KERN_ERR "md: %s: invalid raid minor (%x)\n",
+ bdev_partition_name(rdev->bdev), sb->md_minor);
+ goto abort;
+ }
+ if (sb->raid_disks <= 0)
+ goto abort;
+
+ if (calc_sb_csum(sb) != sb->sb_csum) {
+ printk(KERN_WARNING "md: invalid superblock checksum on %s\n",
+ bdev_partition_name(rdev->bdev));
+ goto abort;
+ }
+
+ rdev->preferred_minor = sb->md_minor;
+ rdev->data_offset = 0;
+
+ if (sb->level == MULTIPATH)
+ rdev->desc_nr = -1;
+ else
+ rdev->desc_nr = sb->this_disk.number;
+
+ if (refdev == 0)
+ ret = 1;
+ else {
+ __u64 ev1, ev2;
+ mdp_super_t *refsb = (mdp_super_t*)page_address(refdev->sb_page);
+ if (!uuid_equal(refsb, sb)) {
+ printk(KERN_WARNING "md: %s has different UUID to %s\n",
+ bdev_partition_name(rdev->bdev),
+ bdev_partition_name(refdev->bdev));
+ goto abort;
+ }
+ if (!sb_equal(refsb, sb)) {
+ printk(KERN_WARNING "md: %s has same UUID"
+ " but different superblock to %s\n",
+ bdev_partition_name(rdev->bdev),
+ bdev_partition_name(refdev->bdev));
+ goto abort;
+ }
+ ev1 = md_event(sb);
+ ev2 = md_event(refsb);
+ if (ev1 > ev2)
+ ret = 1;
+ else
+ ret = 0;
+ }
+ rdev->size = calc_dev_size(rdev, sb->chunk_size);
+
+ abort:
+ return ret;
+}
+
+/*
+ * validate_super for 0.90.0
+ */
+static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev)
+{
+ mdp_disk_t *desc;
+ mdp_super_t *sb = (mdp_super_t *)page_address(rdev->sb_page);
+
+ if (mddev->raid_disks == 0) {
+ mddev->major_version = 0;
+ mddev->minor_version = sb->minor_version;
+ mddev->patch_version = sb->patch_version;
+ mddev->persistent = ! sb->not_persistent;
+ mddev->chunk_size = sb->chunk_size;
+ mddev->ctime = sb->ctime;
+ mddev->utime = sb->utime;
+ mddev->level = sb->level;
+ mddev->layout = sb->layout;
+ mddev->raid_disks = sb->raid_disks;
+ mddev->size = sb->size;
+ mddev->events = md_event(sb);
+
+ if (sb->state & (1<<MD_SB_CLEAN))
+ mddev->recovery_cp = MaxSector;
+ else {
+ if (sb->events_hi == sb->cp_events_hi &&
+ sb->events_lo == sb->cp_events_lo) {
+ mddev->recovery_cp = sb->recovery_cp;
+ } else
+ mddev->recovery_cp = 0;
+ }
+
+ memcpy(mddev->uuid+0, &sb->set_uuid0, 4);
+ memcpy(mddev->uuid+4, &sb->set_uuid1, 4);
+ memcpy(mddev->uuid+8, &sb->set_uuid2, 4);
+ memcpy(mddev->uuid+12,&sb->set_uuid3, 4);
+
+ mddev->max_disks = MD_SB_DISKS;
+ } else {
+ __u64 ev1;
+ ev1 = md_event(sb);
+ ++ev1;
+ if (ev1 < mddev->events)
+ return -EINVAL;
+ }
+ if (mddev->level != LEVEL_MULTIPATH) {
+ rdev->raid_disk = -1;
+ rdev->in_sync = rdev->faulty = 0;
+ desc = sb->disks + rdev->desc_nr;
+
+ if (desc->state & (1<<MD_DISK_FAULTY))
+ rdev->faulty = 1;
+ else if (desc->state & (1<<MD_DISK_SYNC) &&
+ desc->raid_disk < mddev->raid_disks) {
+ rdev->in_sync = 1;
+ rdev->raid_disk = desc->raid_disk;
+ }
+ }
+ return 0;
+}
+
+/*
+ * sync_super for 0.90.0
+ */
+static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev)
+{
+ mdp_super_t *sb;
+ struct list_head *tmp;
+ mdk_rdev_t *rdev2;
+ int next_spare = mddev->raid_disks;
+
+ /* make rdev->sb match mddev data..
+ *
+ * 1/ zero out disks
+ * 2/ Add info for each disk, keeping track of highest desc_nr
+ * 3/ any empty disks < highest become removed
+ *
+ * disks[0] gets initialised to REMOVED because
+ * we cannot be sure from other fields if it has
+ * been initialised or not.
+ */
+ int highest = 0;
+ int i;
+ int active=0, working=0,failed=0,spare=0,nr_disks=0;
+
+ sb = (mdp_super_t*)page_address(rdev->sb_page);
+
+ memset(sb, 0, sizeof(*sb));
+
+ sb->md_magic = MD_SB_MAGIC;
+ sb->major_version = mddev->major_version;
+ sb->minor_version = mddev->minor_version;
+ sb->patch_version = mddev->patch_version;
+ sb->gvalid_words = 0; /* ignored */
+ memcpy(&sb->set_uuid0, mddev->uuid+0, 4);
+ memcpy(&sb->set_uuid1, mddev->uuid+4, 4);
+ memcpy(&sb->set_uuid2, mddev->uuid+8, 4);
+ memcpy(&sb->set_uuid3, mddev->uuid+12,4);
+
+ sb->ctime = mddev->ctime;
+ sb->level = mddev->level;
+ sb->size = mddev->size;
+ sb->raid_disks = mddev->raid_disks;
+ sb->md_minor = mddev->__minor;
+ sb->not_persistent = !mddev->persistent;
+ sb->utime = mddev->utime;
+ sb->state = 0;
+ sb->events_hi = (mddev->events>>32);
+ sb->events_lo = (u32)mddev->events;
+
+ if (mddev->in_sync)
+ {
+ sb->recovery_cp = mddev->recovery_cp;
+ sb->cp_events_hi = (mddev->events>>32);
+ sb->cp_events_lo = (u32)mddev->events;
+ if (mddev->recovery_cp == MaxSector)
+ sb->state = (1<< MD_SB_CLEAN);
+ } else
+ sb->recovery_cp = 0;
+
+ sb->layout = mddev->layout;
+ sb->chunk_size = mddev->chunk_size;
+
+ sb->disks[0].state = (1<<MD_DISK_REMOVED);
+ ITERATE_RDEV(mddev,rdev2,tmp) {
+ mdp_disk_t *d;
+ if (rdev2->raid_disk >= 0 && rdev2->in_sync && !rdev2->faulty)
+ rdev2->desc_nr = rdev2->raid_disk;
+ else
+ rdev2->desc_nr = next_spare++;
+ d = &sb->disks[rdev2->desc_nr];
+ nr_disks++;
+ d->number = rdev2->desc_nr;
+ d->major = MAJOR(rdev2->bdev->bd_dev);
+ d->minor = MINOR(rdev2->bdev->bd_dev);
+ if (rdev2->raid_disk >= 0 && rdev2->in_sync && !rdev2->faulty)
+ d->raid_disk = rdev2->raid_disk;
+ else
+ d->raid_disk = rdev2->desc_nr; /* compatibility */
+ if (rdev2->faulty) {
+ d->state = (1<<MD_DISK_FAULTY);
+ failed++;
+ } else if (rdev2->in_sync) {
+ d->state = (1<<MD_DISK_ACTIVE);
+ d->state |= (1<<MD_DISK_SYNC);
+ active++;
+ working++;
+ } else {
+ d->state = 0;
+ spare++;
+ working++;
+ }
+ if (rdev2->desc_nr > highest)
+ highest = rdev2->desc_nr;
+ }
+
+ /* now set the "removed" bit on any non-trailing holes */
+ for (i=0; i<highest; i++) {
+ mdp_disk_t *d = &sb->disks[i];
+ if (d->state == 0 && d->number == 0) {
+ d->number = i;
+ d->raid_disk = i;
+ d->state = (1<<MD_DISK_REMOVED);
+ }
+ }
+ sb->nr_disks = nr_disks;
+ sb->active_disks = active;
+ sb->working_disks = working;
+ sb->failed_disks = failed;
+ sb->spare_disks = spare;
+
+ sb->this_disk = sb->disks[rdev->desc_nr];
+ sb->sb_csum = calc_sb_csum(sb);
+}
+
+/*
+ * version 1 superblock
+ */
+
+static unsigned int calc_sb_1_csum(struct mdp_superblock_1 * sb)
+{
+ unsigned int disk_csum, csum;
+ int size = 256 + sb->max_dev*2;
+
+ disk_csum = sb->sb_csum;
+ sb->sb_csum = 0;
+ csum = csum_partial((void *)sb, size, 0);
+ sb->sb_csum = disk_csum;
+ return csum;
+}
+
+static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
+{
+ struct mdp_superblock_1 *sb;
+ int ret;
+ sector_t sb_offset;
+
+ /*
+ * Calculate the position of the superblock.
+ * It is always aligned to a 4K boundary and
+ * depending on minor_version, it can be:
+ * 0: At least 8K, but less than 12K, from end of device
+ * 1: At start of device
+ * 2: 4K from start of device.
+ */
+ switch(minor_version) {
+ case 0:
+ sb_offset = rdev->bdev->bd_inode->i_size >> 9;
+ sb_offset -= 8*2;
+ sb_offset &= ~(4*2);
+ /* convert from sectors to K */
+ sb_offset /= 2;
+ break;
+ case 1:
+ sb_offset = 0;
+ break;
+ case 2:
+ sb_offset = 4;
+ break;
+ default:
+ return -EINVAL;
+ }
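+ /*
+ * Worked example (illustrative): with minor_version 0 on a device of
+ * 1000000 512-byte sectors, sb_offset starts as 1000000, drops by 16
+ * sectors (8K) to 999984, is masked, and is finally halved so that it
+ * is expressed in 1K blocks.
+ */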
+ rdev->sb_offset = sb_offset;
+
+ ret = read_disk_sb(rdev);
+ if (ret) return ret;
+
+
+ sb = (struct mdp_superblock_1*)page_address(rdev->sb_page);
+
+ if (sb->magic != cpu_to_le32(MD_SB_MAGIC) ||
+ sb->major_version != cpu_to_le32(1) ||
+ le32_to_cpu(sb->max_dev) > (4096-256)/2 ||
+ le64_to_cpu(sb->super_offset) != (rdev->sb_offset<<1) ||
+ sb->feature_map != 0)
+ return -EINVAL;
+
+ if (calc_sb_1_csum(sb) != sb->sb_csum) {
+ printk("md: invalid superblock checksum on %s\n",
+ bdev_partition_name(rdev->bdev));
+ return -EINVAL;
+ }
+ rdev->preferred_minor = 0xffff;
+ rdev->data_offset = le64_to_cpu(sb->data_offset);
+
+ if (refdev == 0)
+ return 1;
+ else {
+ __u64 ev1, ev2;
+ struct mdp_superblock_1 *refsb =
+ (struct mdp_superblock_1*)page_address(refdev->sb_page);
+
+ if (memcmp(sb->set_uuid, refsb->set_uuid, 16) != 0 ||
+ sb->level != refsb->level ||
+ sb->layout != refsb->layout ||
+ sb->chunksize != refsb->chunksize) {
+ printk(KERN_WARNING "md: %s has strangely different"
+ " superblock to %s\n",
+ bdev_partition_name(rdev->bdev),
+ bdev_partition_name(refdev->bdev));
+ return -EINVAL;
+ }
+ ev1 = le64_to_cpu(sb->events);
+ ev2 = le64_to_cpu(refsb->events);
+
+ if (ev1 > ev2)
+ return 1;
+ }
+ if (minor_version)
+ rdev->size = ((rdev->bdev->bd_inode->i_size>>9) - le64_to_cpu(sb->data_offset)) / 2;
+ else
+ rdev->size = rdev->sb_offset;
+ if (rdev->size < le64_to_cpu(sb->data_size)/2)
+ return -EINVAL;
+ rdev->size = le64_to_cpu(sb->data_size)/2;
+ if (le32_to_cpu(sb->chunksize))
+ rdev->size &= ~((sector_t)le32_to_cpu(sb->chunksize)/2 - 1);
+ return 0;
+}
+
+static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev)
+{
+ struct mdp_superblock_1 *sb = (struct mdp_superblock_1*)page_address(rdev->sb_page);
+
+ if (mddev->raid_disks == 0) {
+ mddev->major_version = 1;
+ mddev->minor_version = 0;
+ mddev->patch_version = 0;
+ mddev->persistent = 1;
+ mddev->chunk_size = le32_to_cpu(sb->chunksize) << 9;
+ mddev->ctime = le64_to_cpu(sb->ctime) & ((1ULL << 32)-1);
+ mddev->utime = le64_to_cpu(sb->utime) & ((1ULL << 32)-1);
+ mddev->level = le32_to_cpu(sb->level);
+ mddev->layout = le32_to_cpu(sb->layout);
+ mddev->raid_disks = le32_to_cpu(sb->raid_disks);
+ mddev->size = (u32)le64_to_cpu(sb->size);
+ mddev->events = le64_to_cpu(sb->events);
+
+ mddev->recovery_cp = le64_to_cpu(sb->resync_offset);
+ memcpy(mddev->uuid, sb->set_uuid, 16);
+
+ mddev->max_disks = (4096-256)/2;
+ } else {
+ __u64 ev1;
+ ev1 = le64_to_cpu(sb->events);
+ ++ev1;
+ if (ev1 < mddev->events)
+ return -EINVAL;
+ }
+
+ if (mddev->level != LEVEL_MULTIPATH) {
+ int role;
+ rdev->desc_nr = le32_to_cpu(sb->dev_number);
+ role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]);
+ switch(role) {
+ case 0xffff: /* spare */
+ rdev->in_sync = 0;
+ rdev->faulty = 0;
+ rdev->raid_disk = -1;
+ break;
+ case 0xfffe: /* faulty */
+ rdev->in_sync = 0;
+ rdev->faulty = 1;
+ rdev->raid_disk = -1;
+ break;
+ default:
+ rdev->in_sync = 1;
+ rdev->faulty = 0;
+ rdev->raid_disk = role;
+ break;
+ }
+ }
+ return 0;
+}
+
+static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev)
+{
+ struct mdp_superblock_1 *sb;
+ struct list_head *tmp;
+ mdk_rdev_t *rdev2;
+ int max_dev, i;
+ /* make rdev->sb match mddev and rdev data. */
+
+ sb = (struct mdp_superblock_1*)page_address(rdev->sb_page);
+
+ sb->feature_map = 0;
+ sb->pad0 = 0;
+ memset(sb->pad1, 0, sizeof(sb->pad1));
+ memset(sb->pad2, 0, sizeof(sb->pad2));
+ memset(sb->pad3, 0, sizeof(sb->pad3));
+
+ sb->utime = cpu_to_le64((__u64)mddev->utime);
+ sb->events = cpu_to_le64(mddev->events);
+ if (mddev->in_sync)
+ sb->resync_offset = cpu_to_le64(mddev->recovery_cp);
+ else
+ sb->resync_offset = cpu_to_le64(0);
+
+ max_dev = 0;
+ ITERATE_RDEV(mddev,rdev2,tmp)
+ if (rdev2->desc_nr > max_dev)
+ max_dev = rdev2->desc_nr;
+
+ sb->max_dev = max_dev;
+ for (i=0; i<max_dev;i++)
+ sb->dev_roles[i] = cpu_to_le16(0xfffe);
+
+ ITERATE_RDEV(mddev,rdev2,tmp) {
+ i = rdev2->desc_nr;
+ if (rdev2->faulty)
+ sb->dev_roles[i] = cpu_to_le16(0xfffe);
+ else if (rdev2->in_sync)
+ sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk);
+ else
+ sb->dev_roles[i] = cpu_to_le16(0xffff);
+ }
+
+ sb->recovery_offset = cpu_to_le64(0); /* not supported yet */
+}
+
+
+struct super_type super_types[] = {
+ [0] = {
+ .name = "0.90.0",
+ .owner = THIS_MODULE,
+ .load_super = super_90_load,
+ .validate_super = super_90_validate,
+ .sync_super = super_90_sync,
+ },
+ [1] = {
+ .name = "md-1",
+ .owner = THIS_MODULE,
+ .load_super = super_1_load,
+ .validate_super = super_1_validate,
+ .sync_super = super_1_sync,
+ },
+};
+
+static mdk_rdev_t * match_dev_unit(mddev_t *mddev, mdk_rdev_t *dev)
+{
+ struct list_head *tmp;
+ mdk_rdev_t *rdev;
+
+ ITERATE_RDEV(mddev,rdev,tmp)
+ if (rdev->bdev->bd_contains == dev->bdev->bd_contains)
+ return rdev;
+
+ return NULL;
+}
+
+static int match_mddev_units(mddev_t *mddev1, mddev_t *mddev2)
+{
+ struct list_head *tmp;
+ mdk_rdev_t *rdev;
+
+ ITERATE_RDEV(mddev1,rdev,tmp)
+ if (match_dev_unit(mddev2, rdev))
+ return 1;
+
+ return 0;
+}
+
+static LIST_HEAD(pending_raid_disks);
+
+static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev)
+{
+ mdk_rdev_t *same_pdev;
+
+ if (rdev->mddev) {
+ MD_BUG();
+ return -EINVAL;
+ }
+ same_pdev = match_dev_unit(mddev, rdev);
+ if (same_pdev)
+ printk(KERN_WARNING
+ "md%d: WARNING: %s appears to be on the same physical"
+ " disk as %s. True\n protection against single-disk"
+ " failure might be compromised.\n",
+ mdidx(mddev), bdev_partition_name(rdev->bdev),
+ bdev_partition_name(same_pdev->bdev));
+
+ /* Verify rdev->desc_nr is unique.
+ * If it is -1, assign a free number, else
+ * check number is not in use
+ */
+ if (rdev->desc_nr < 0) {
+ int choice = 0;
+ if (mddev->pers) choice = mddev->raid_disks;
+ while (find_rdev_nr(mddev, choice))
+ choice++;
+ rdev->desc_nr = choice;
+ } else {
+ if (find_rdev_nr(mddev, rdev->desc_nr))
+ return -EBUSY;
+ }
+
+ list_add(&rdev->same_set, &mddev->disks);
+ rdev->mddev = mddev;
+ printk(KERN_INFO "md: bind<%s>\n", bdev_partition_name(rdev->bdev));
+ return 0;
+}
+
+static void unbind_rdev_from_array(mdk_rdev_t * rdev)
+{
+ if (!rdev->mddev) {
+ MD_BUG();
+ return;
+ }
+ list_del_init(&rdev->same_set);
+ printk(KERN_INFO "md: unbind<%s>\n", bdev_partition_name(rdev->bdev));
+ rdev->mddev = NULL;
+}
+
+/*
+ * prevent the device from being mounted, repartitioned or
+ * otherwise reused by a RAID array (or any other kernel
+ * subsystem), by opening the device. [simply getting an
+ * inode is not enough, the SCSI module usage code needs
+ * an explicit open() on the device]
+ */
+static int lock_rdev(mdk_rdev_t *rdev, dev_t dev)
+{
+ int err = 0;
+ struct block_device *bdev;
+
+ bdev = bdget(dev);
+ if (!bdev)
+ return -ENOMEM;
+ err = blkdev_get(bdev, FMODE_READ|FMODE_WRITE, 0, BDEV_RAW);
+ if (err)
+ return err;
+ err = bd_claim(bdev, rdev);
+ if (err) {
+ blkdev_put(bdev, BDEV_RAW);
+ return err;
+ }
+ rdev->bdev = bdev;
+ return err;
+}
+
+static void unlock_rdev(mdk_rdev_t *rdev)
+{
+ struct block_device *bdev = rdev->bdev;
+ rdev->bdev = NULL;
+ if (!bdev)
+ MD_BUG();
+ bd_release(bdev);
+ blkdev_put(bdev, BDEV_RAW);
+}
+
+void md_autodetect_dev(dev_t dev);
+
+static void export_rdev(mdk_rdev_t * rdev)
+{
+ printk(KERN_INFO "md: export_rdev(%s)\n",
+ bdev_partition_name(rdev->bdev));
+ if (rdev->mddev)
+ MD_BUG();
+ free_disk_sb(rdev);
+ list_del_init(&rdev->same_set);
+#ifndef MODULE
+ md_autodetect_dev(rdev->bdev->bd_dev);
+#endif
+ unlock_rdev(rdev);
+ kfree(rdev);
+}
+
+static void kick_rdev_from_array(mdk_rdev_t * rdev)
+{
+ unbind_rdev_from_array(rdev);
+ export_rdev(rdev);
+}
+
+static void export_array(mddev_t *mddev)
+{
+ struct list_head *tmp;
+ mdk_rdev_t *rdev;
+
+ ITERATE_RDEV(mddev,rdev,tmp) {
+ if (!rdev->mddev) {
+ MD_BUG();
+ continue;
+ }
+ kick_rdev_from_array(rdev);
+ }
+ if (!list_empty(&mddev->disks))
+ MD_BUG();
+ mddev->raid_disks = 0;
+ mddev->major_version = 0;
+}
+
+static void print_desc(mdp_disk_t *desc)
+{
+ printk(" DISK<N:%d,%s(%d,%d),R:%d,S:%d>\n", desc->number,
+ partition_name(MKDEV(desc->major,desc->minor)),
+ desc->major,desc->minor,desc->raid_disk,desc->state);
+}
+
+static void print_sb(mdp_super_t *sb)
+{
+ int i;
+
+ printk(KERN_INFO
+ "md: SB: (V:%d.%d.%d) ID:<%08x.%08x.%08x.%08x> CT:%08x\n",
+ sb->major_version, sb->minor_version, sb->patch_version,
+ sb->set_uuid0, sb->set_uuid1, sb->set_uuid2, sb->set_uuid3,
+ sb->ctime);
+ printk(KERN_INFO "md: L%d S%08d ND:%d RD:%d md%d LO:%d CS:%d\n",
+ sb->level, sb->size, sb->nr_disks, sb->raid_disks,
+ sb->md_minor, sb->layout, sb->chunk_size);
+ printk(KERN_INFO "md: UT:%08x ST:%d AD:%d WD:%d"
+ " FD:%d SD:%d CSUM:%08x E:%08lx\n",
+ sb->utime, sb->state, sb->active_disks, sb->working_disks,
+ sb->failed_disks, sb->spare_disks,
+ sb->sb_csum, (unsigned long)sb->events_lo);
+
+ printk(KERN_INFO);
+ for (i = 0; i < MD_SB_DISKS; i++) {
+ mdp_disk_t *desc;
+
+ desc = sb->disks + i;
+ if (desc->number || desc->major || desc->minor ||
+ desc->raid_disk || (desc->state && (desc->state != 4))) {
+ printk(" D %2d: ", i);
+ print_desc(desc);
+ }
+ }
+ printk(KERN_INFO "md: THIS: ");
+ print_desc(&sb->this_disk);
+
+}
+
+static void print_rdev(mdk_rdev_t *rdev)
+{
+ printk(KERN_INFO "md: rdev %s, SZ:%08llu F:%d S:%d DN:%d ",
+ bdev_partition_name(rdev->bdev), (unsigned long long)rdev->size,
+ rdev->faulty, rdev->in_sync, rdev->desc_nr);
+ if (rdev->sb_loaded) {
+ printk(KERN_INFO "md: rdev superblock:\n");
+ print_sb((mdp_super_t*)page_address(rdev->sb_page));
+ } else
+ printk(KERN_INFO "md: no rdev superblock!\n");
+}
+
+void md_print_devices(void)
+{
+ struct list_head *tmp, *tmp2;
+ mdk_rdev_t *rdev;
+ mddev_t *mddev;
+
+ printk("\n");
+ printk("md: **********************************\n");
+ printk("md: * <COMPLETE RAID STATE PRINTOUT> *\n");
+ printk("md: **********************************\n");
+ ITERATE_MDDEV(mddev,tmp) {
+ printk("md%d: ", mdidx(mddev));
+
+ ITERATE_RDEV(mddev,rdev,tmp2)
+ printk("<%s>", bdev_partition_name(rdev->bdev));
+
+ ITERATE_RDEV(mddev,rdev,tmp2)
+ print_rdev(rdev);
+ }
+ printk("md: **********************************\n");
+ printk("\n");
+}
+
+
+static int write_disk_sb(mdk_rdev_t * rdev)
+{
+
+ if (!rdev->sb_loaded) {
+ MD_BUG();
+ return 1;
+ }
+ if (rdev->faulty) {
+ MD_BUG();
+ return 1;
+ }
+
+ dprintk(KERN_INFO "(write) %s's sb offset: %llu\n",
+ bdev_partition_name(rdev->bdev),
+ (unsigned long long)rdev->sb_offset);
+
+ if (sync_page_io(rdev->bdev, rdev->sb_offset<<1, MD_SB_BYTES, rdev->sb_page, WRITE))
+ return 0;
+
+ printk("md: write_disk_sb failed for device %s\n",
+ bdev_partition_name(rdev->bdev));
+ return 1;
+}
+
+static void sync_sbs(mddev_t * mddev)
+{
+ mdk_rdev_t *rdev;
+ struct list_head *tmp;
+
+ ITERATE_RDEV(mddev,rdev,tmp) {
+ super_types[mddev->major_version].
+ sync_super(mddev, rdev);
+ rdev->sb_loaded = 1;
+ }
+}
+
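+/*
+ * Bump the event counter, regenerate every superblock with sync_sbs()
+ * and write them all out; on write errors the whole update is repeated,
+ * up to 100 times, before giving up.
+ */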
+static void md_update_sb(mddev_t * mddev)
+{
+ int err, count = 100;
+ struct list_head *tmp;
+ mdk_rdev_t *rdev;
+
+ mddev->sb_dirty = 0;
+repeat:
+ mddev->utime = get_seconds();
+ mddev->events ++;
+
+ if (!mddev->events) {
+ /*
+ * oops, this 64-bit counter should never wrap.
+ * Either we are in around ~1 trillion A.C., assuming
+ * 1 reboot per second, or we have a bug:
+ */
+ MD_BUG();
+ mddev->events --;
+ }
+ sync_sbs(mddev);
+
+ /*
+ * do not write anything to disk if using
+ * nonpersistent superblocks
+ */
+ if (!mddev->persistent)
+ return;
+
+ dprintk(KERN_INFO
+ "md: updating md%d RAID superblock on device (in sync %d)\n",
+ mdidx(mddev),mddev->in_sync);
+
+ err = 0;
+ ITERATE_RDEV(mddev,rdev,tmp) {
+ dprintk(KERN_INFO "md: ");
+ if (rdev->faulty)
+ dprintk("(skipping faulty ");
+
+ dprintk("%s ", bdev_partition_name(rdev->bdev));
+ if (!rdev->faulty) {
+ err += write_disk_sb(rdev);
+ } else
+ dprintk(")\n");
+ if (!err && mddev->level == LEVEL_MULTIPATH)
+ /* only need to write one superblock... */
+ break;
+ }
+ if (err) {
+ if (--count) {
+ printk(KERN_ERR "md: errors occurred during superblock"
+ " update, repeating\n");
+ goto repeat;
+ }
+ printk(KERN_ERR \
+ "md: excessive errors occurred during superblock update, exiting\n");
+ }
+}
+
+/*
+ * Import a device. If 'super_format' >= 0, then sanity check the superblock
+ *
+ * mark the device faulty if:
+ *
+ * - the device is nonexistent (zero size)
+ * - the device has no valid superblock
+ *
+ * a faulty rdev _never_ has rdev->sb set.
+ */
+static mdk_rdev_t *md_import_device(dev_t newdev, int super_format, int super_minor)
+{
+ int err;
+ mdk_rdev_t *rdev;
+ sector_t size;
+
+ rdev = (mdk_rdev_t *) kmalloc(sizeof(*rdev), GFP_KERNEL);
+ if (!rdev) {
+ printk(KERN_ERR "md: could not alloc mem for %s!\n",
+ partition_name(newdev));
+ return ERR_PTR(-ENOMEM);
+ }
+ memset(rdev, 0, sizeof(*rdev));
+
+ if ((err = alloc_disk_sb(rdev)))
+ goto abort_free;
+
+ err = lock_rdev(rdev, newdev);
+ if (err) {
+ printk(KERN_ERR "md: could not lock %s.\n",
+ partition_name(newdev));
+ goto abort_free;
+ }
+ rdev->desc_nr = -1;
+ rdev->faulty = 0;
+ rdev->in_sync = 0;
+ rdev->data_offset = 0;
+ atomic_set(&rdev->nr_pending, 0);
+
+ size = rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS;
+ if (!size) {
+ printk(KERN_WARNING
+ "md: %s has zero or unknown size, marking faulty!\n",
+ bdev_partition_name(rdev->bdev));
+ err = -EINVAL;
+ goto abort_free;
+ }
+
+ if (super_format >= 0) {
+ err = super_types[super_format].
+ load_super(rdev, NULL, super_minor);
+ if (err == -EINVAL) {
+ printk(KERN_WARNING
+ "md: %s has invalid sb, not importing!\n",
+ bdev_partition_name(rdev->bdev));
+ goto abort_free;
+ }
+ if (err < 0) {
+ printk(KERN_WARNING
+ "md: could not read %s's sb, not importing!\n",
+ bdev_partition_name(rdev->bdev));
+ goto abort_free;
+ }
+ }
+ INIT_LIST_HEAD(&rdev->same_set);
+
+ return rdev;
+
+abort_free:
+ if (rdev->sb_page) {
+ if (rdev->bdev)
+ unlock_rdev(rdev);
+ free_disk_sb(rdev);
+ }
+ kfree(rdev);
+ return ERR_PTR(err);
+}
+
+/*
+ * Check a full RAID array for plausibility
+ */
+
+
+static int analyze_sbs(mddev_t * mddev)
+{
+ int i;
+ struct list_head *tmp;
+ mdk_rdev_t *rdev, *freshest;
+
+ freshest = NULL;
+ ITERATE_RDEV(mddev,rdev,tmp)
+ switch (super_types[mddev->major_version].
+ load_super(rdev, freshest, mddev->minor_version)) {
+ case 1:
+ freshest = rdev;
+ break;
+ case 0:
+ break;
+ default:
+ printk( KERN_ERR \
+ "md: fatal superblock inconsistency in %s"
+ " -- removing from array\n",
+ bdev_partition_name(rdev->bdev));
+ kick_rdev_from_array(rdev);
+ }
+
+
+ super_types[mddev->major_version].
+ validate_super(mddev, freshest);
+
+ i = 0;
+ ITERATE_RDEV(mddev,rdev,tmp) {
+ if (rdev != freshest)
+ if (super_types[mddev->major_version].
+ validate_super(mddev, rdev)) {
+ printk(KERN_WARNING "md: kicking non-fresh %s"
+ " from array!\n",
+ bdev_partition_name(rdev->bdev));
+ kick_rdev_from_array(rdev);
+ continue;
+ }
+ if (mddev->level == LEVEL_MULTIPATH) {
+ rdev->desc_nr = i++;
+ rdev->raid_disk = rdev->desc_nr;
+ rdev->in_sync = 1;
+ }
+ }
+
+
+ /*
+ * Check if we can support this RAID array
+ */
+ if (mddev->major_version != MD_MAJOR_VERSION ||
+ mddev->minor_version > MD_MINOR_VERSION) {
+ printk(KERN_ALERT
+ "md: md%d: unsupported raid array version %d.%d.%d\n",
+ mdidx(mddev), mddev->major_version,
+ mddev->minor_version, mddev->patch_version);
+ goto abort;
+ }
+
+ if ((mddev->recovery_cp != MaxSector) && ((mddev->level == 1) ||
+ (mddev->level == 4) || (mddev->level == 5)))
+ printk(KERN_ERR "md: md%d: raid array is not clean"
+ " -- starting background reconstruction\n",
+ mdidx(mddev));
+
+ return 0;
+abort:
+ return 1;
+}
+
+static struct gendisk *md_probe(dev_t dev, int *part, void *data)
+{
+ static DECLARE_MUTEX(disks_sem);
+ int unit = MINOR(dev);
+ mddev_t *mddev = mddev_find(unit);
+ struct gendisk *disk;
+
+ if (!mddev)
+ return NULL;
+
+ down(&disks_sem);
+ if (disks[unit]) {
+ up(&disks_sem);
+ mddev_put(mddev);
+ return NULL;
+ }
+ disk = alloc_disk(1);
+ if (!disk) {
+ up(&disks_sem);
+ mddev_put(mddev);
+ return NULL;
+ }
+ disk->major = MD_MAJOR;
+ disk->first_minor = mdidx(mddev);
+ sprintf(disk->disk_name, "md%d", mdidx(mddev));
+ disk->fops = &md_fops;
+ disk->private_data = mddev;
+ disk->queue = &mddev->queue;
+ add_disk(disk);
+ disks[mdidx(mddev)] = disk;
+ up(&disks_sem);
+ return NULL;
+}
+
+void md_wakeup_thread(mdk_thread_t *thread);
+
+static void md_safemode_timeout(unsigned long data)
+{
+ mddev_t *mddev = (mddev_t *) data;
+
+ mddev->safemode = 1;
+ md_wakeup_thread(mddev->thread);
+}
+
+
+static int do_md_run(mddev_t * mddev)
+{
+ int pnum, err;
+ int chunk_size;
+ struct list_head *tmp;
+ mdk_rdev_t *rdev;
+ struct gendisk *disk;
+
+ if (list_empty(&mddev->disks)) {
+ MD_BUG();
+ return -EINVAL;
+ }
+
+ if (mddev->pers)
+ return -EBUSY;
+
+ /*
+ * Analyze all RAID superblock(s)
+ */
+ if (!mddev->raid_disks && analyze_sbs(mddev)) {
+ MD_BUG();
+ return -EINVAL;
+ }
+
+ chunk_size = mddev->chunk_size;
+ pnum = level_to_pers(mddev->level);
+
+ if ((pnum != MULTIPATH) && (pnum != RAID1)) {
+ if (!chunk_size) {
+ /*
+ * 'default chunksize' in the old md code used to
+ * be PAGE_SIZE, baaad.
+ * we abort here to be on the safe side. We don't
+ * want to continue the bad practice.
+ */
+ printk(KERN_ERR
+ "no chunksize specified, see 'man raidtab'\n");
+ return -EINVAL;
+ }
+ if (chunk_size > MAX_CHUNK_SIZE) {
+ printk(KERN_ERR "too big chunk_size: %d > %d\n",
+ chunk_size, MAX_CHUNK_SIZE);
+ return -EINVAL;
+ }
+ /*
+ * chunk-size has to be a power of 2 and a multiple of PAGE_SIZE
+ */
+ if ( (1 << ffz(~chunk_size)) != chunk_size) {
+ MD_BUG();
+ return -EINVAL;
+ }
+ if (chunk_size < PAGE_SIZE) {
+ printk(KERN_ERR "too small chunk_size: %d < %ld\n",
+ chunk_size, PAGE_SIZE);
+ return -EINVAL;
+ }
+
+ /* devices must have minimum size of one chunk */
+ ITERATE_RDEV(mddev,rdev,tmp) {
+ if (rdev->faulty)
+ continue;
+ if (rdev->size < chunk_size / 1024) {
+ printk(KERN_WARNING
+ "md: Dev %s smaller than chunk_size:"
+ " %lluk < %dk\n",
+ bdev_partition_name(rdev->bdev),
+ (unsigned long long)rdev->size,
+ chunk_size / 1024);
+ return -EINVAL;
+ }
+ }
+ }
+ if (pnum >= MAX_PERSONALITY) {
+ MD_BUG();
+ return -EINVAL;
+ }
+
+#ifdef CONFIG_KMOD
+ if (!pers[pnum])
+ {
+ char module_name[80];
+ sprintf (module_name, "md-personality-%d", pnum);
+ request_module (module_name);
+ }
+#endif
+
+ /*
+ * Drop all container device buffers, from now on
+ * the only valid external interface is through the md
+ * device.
+ * Also find largest hardsector size
+ */
+ ITERATE_RDEV(mddev,rdev,tmp) {
+ if (rdev->faulty)
+ continue;
+ sync_blockdev(rdev->bdev);
+ invalidate_bdev(rdev->bdev, 0);
+ }
+
+ md_probe(mdidx(mddev), NULL, NULL);
+ disk = disks[mdidx(mddev)];
+ if (!disk)
+ return -ENOMEM;
+
+ spin_lock(&pers_lock);
+ if (!pers[pnum] || !try_module_get(pers[pnum]->owner)) {
+ spin_unlock(&pers_lock);
+ printk(KERN_ERR "md: personality %d is not loaded!\n",
+ pnum);
+ return -EINVAL;
+ }
+
+ mddev->pers = pers[pnum];
+ spin_unlock(&pers_lock);
+
+ blk_queue_make_request(&mddev->queue, mddev->pers->make_request);
+ printk("%s: setting max_sectors to %d, segment boundary to %d\n",
+ disk->disk_name,
+ chunk_size >> 9,
+ (chunk_size>>1)-1);
+ blk_queue_max_sectors(&mddev->queue, chunk_size >> 9);
+ blk_queue_segment_boundary(&mddev->queue, (chunk_size>>1) - 1);
+ mddev->queue.queuedata = mddev;
+
+ err = mddev->pers->run(mddev);
+ if (err) {
+ printk(KERN_ERR "md: pers->run() failed ...\n");
+ module_put(mddev->pers->owner);
+ mddev->pers = NULL;
+ return -EINVAL;
+ }
+ atomic_set(&mddev->writes_pending,0);
+ mddev->safemode = 0;
+ mddev->safemode_timer.function = md_safemode_timeout;
+ mddev->safemode_timer.data = (unsigned long) mddev;
+ mddev->safemode_delay = (20 * HZ)/1000 +1; /* 20 msec delay */
+ mddev->in_sync = 1;
+
+ set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+ md_wakeup_thread(mddev->thread);
+ set_capacity(disk, mddev->array_size<<1);
+ return 0;
+}
+
+static int restart_array(mddev_t *mddev)
+{
+ struct gendisk *disk = disks[mdidx(mddev)];
+ int err;
+
+ /*
+ * Complain if it has no devices
+ */
+ err = -ENXIO;
+ if (list_empty(&mddev->disks))
+ goto out;
+
+ if (mddev->pers) {
+ err = -EBUSY;
+ if (!mddev->ro)
+ goto out;
+
+ mddev->safemode = 0;
+ mddev->ro = 0;
+ set_disk_ro(disk, 0);
+
+ printk(KERN_INFO "md: md%d switched to read-write mode.\n",
+ mdidx(mddev));
+ /*
+ * Kick recovery or resync if necessary
+ */
+ set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+ md_wakeup_thread(mddev->thread);
+ err = 0;
+ } else {
+ printk(KERN_ERR "md: md%d has no personality assigned.\n",
+ mdidx(mddev));
+ err = -EINVAL;
+ }
+
+out:
+ return err;
+}
+
+static int do_md_stop(mddev_t * mddev, int ro)
+{
+ int err = 0;
+ struct gendisk *disk = disks[mdidx(mddev)];
+
+ if (atomic_read(&mddev->active)>2) {
+ printk("md: md%d still in use.\n",mdidx(mddev));
+ err = -EBUSY;
+ goto out;
+ }
+
+ if (mddev->pers) {
+ if (mddev->sync_thread) {
+ set_bit(MD_RECOVERY_INTR, &mddev->recovery);
+ md_unregister_thread(mddev->sync_thread);
+ mddev->sync_thread = NULL;
+ }
+
+ del_timer_sync(&mddev->safemode_timer);
+
+ invalidate_device(mk_kdev(disk->major, disk->first_minor), 1);
+
+ if (ro) {
+ err = -ENXIO;
+ if (mddev->ro)
+ goto out;
+ mddev->ro = 1;
+ } else {
+ if (mddev->ro)
+ set_disk_ro(disk, 0);
+ if (mddev->pers->stop(mddev)) {
+ err = -EBUSY;
+ if (mddev->ro)
+ set_disk_ro(disk, 1);
+ goto out;
+ }
+ module_put(mddev->pers->owner);
+ mddev->pers = NULL;
+ if (mddev->ro)
+ mddev->ro = 0;
+ }
+ if (mddev->raid_disks) {
+ /* mark array as shutdown cleanly */
+ mddev->in_sync = 1;
+ md_update_sb(mddev);
+ }
+ if (ro)
+ set_disk_ro(disk, 1);
+ }
+ /*
+ * Free resources if final stop
+ */
+ if (!ro) {
+ struct gendisk *disk;
+ printk(KERN_INFO "md: md%d stopped.\n", mdidx(mddev));
+
+ export_array(mddev);
+
+ mddev->array_size = 0;
+ disk = disks[mdidx(mddev)];
+ if (disk)
+ set_capacity(disk, 0);
+ } else
+ printk(KERN_INFO "md: md%d switched to read-only mode.\n",
+ mdidx(mddev));
+ err = 0;
+out:
+ return err;
+}
+
+static void autorun_array(mddev_t *mddev)
+{
+ mdk_rdev_t *rdev;
+ struct list_head *tmp;
+ int err;
+
+ if (list_empty(&mddev->disks)) {
+ MD_BUG();
+ return;
+ }
+
+ printk(KERN_INFO "md: running: ");
+
+ ITERATE_RDEV(mddev,rdev,tmp) {
+ printk("<%s>", bdev_partition_name(rdev->bdev));
+ }
+ printk("\n");
+
+ err = do_md_run (mddev);
+ if (err) {
+ printk(KERN_WARNING "md :do_md_run() returned %d\n", err);
+ do_md_stop (mddev, 0);
+ }
+}
+
+/*
+ * lets try to run arrays based on all disks that have arrived
+ * until now. (those are in pending_raid_disks)
+ *
+ * the method: pick the first pending disk, collect all disks with
+ * the same UUID, remove all from the pending list and put them into
+ * the 'same_array' list. Then order this list based on superblock
+ * update time (freshest comes first), kick out 'old' disks and
+ * compare superblocks. If everything's fine then run it.
+ *
+ * If "unit" is allocated, then bump its reference count
+ */
+static void autorun_devices(void)
+{
+ struct list_head candidates;
+ struct list_head *tmp;
+ mdk_rdev_t *rdev0, *rdev;
+ mddev_t *mddev;
+
+ printk(KERN_INFO "md: autorun ...\n");
+ while (!list_empty(&pending_raid_disks)) {
+ rdev0 = list_entry(pending_raid_disks.next,
+ mdk_rdev_t, same_set);
+
+ printk(KERN_INFO "md: considering %s ...\n",
+ bdev_partition_name(rdev0->bdev));
+ INIT_LIST_HEAD(&candidates);
+ ITERATE_RDEV_PENDING(rdev,tmp)
+ if (super_90_load(rdev, rdev0, 0) >= 0) {
+ printk(KERN_INFO "md: adding %s ...\n",
+ bdev_partition_name(rdev->bdev));
+ list_move(&rdev->same_set, &candidates);
+ }
+ /*
+ * now we have a set of devices, with all of them having
+ * mostly sane superblocks. It's time to allocate the
+ * mddev.
+ */
+
+ mddev = mddev_find(rdev0->preferred_minor);
+ if (!mddev) {
+ printk(KERN_ERR
+ "md: cannot allocate memory for md drive.\n");
+ break;
+ }
+ if (mddev_lock(mddev))
+ printk(KERN_WARNING "md: md%d locked, cannot run\n",
+ mdidx(mddev));
+ else if (mddev->raid_disks || mddev->major_version
+ || !list_empty(&mddev->disks)) {
+ printk(KERN_WARNING
+ "md: md%d already running, cannot run %s\n",
+ mdidx(mddev), bdev_partition_name(rdev0->bdev));
+ mddev_unlock(mddev);
+ } else {
+ printk(KERN_INFO "md: created md%d\n", mdidx(mddev));
+ ITERATE_RDEV_GENERIC(candidates,rdev,tmp) {
+ list_del_init(&rdev->same_set);
+ if (bind_rdev_to_array(rdev, mddev))
+ export_rdev(rdev);
+ }
+ autorun_array(mddev);
+ mddev_unlock(mddev);
+ }
+ /* on success, candidates will be empty, on error
+ * it won't...
+ */
+ ITERATE_RDEV_GENERIC(candidates,rdev,tmp)
+ export_rdev(rdev);
+ mddev_put(mddev);
+ }
+ printk(KERN_INFO "md: ... autorun DONE.\n");
+}
+
+/*
+ * import RAID devices based on one partition
+ * if possible, the array gets run as well.
+ */
+
+static int autostart_array(dev_t startdev)
+{
+ int err = -EINVAL, i;
+ mdp_super_t *sb = NULL;
+ mdk_rdev_t *start_rdev = NULL, *rdev;
+
+ start_rdev = md_import_device(startdev, 0, 0);
+ if (IS_ERR(start_rdev)) {
+ printk(KERN_WARNING "md: could not import %s!\n",
+ partition_name(startdev));
+ return err;
+ }
+
+ /* NOTE: this can only work for 0.90.0 superblocks */
+ sb = (mdp_super_t*)page_address(start_rdev->sb_page);
+ if (sb->major_version != 0 ||
+ sb->minor_version != 90 ) {
+ printk(KERN_WARNING "md: can only autostart 0.90.0 arrays\n");
+ export_rdev(start_rdev);
+ return err;
+ }
+
+ if (start_rdev->faulty) {
+ printk(KERN_WARNING
+ "md: can not autostart based on faulty %s!\n",
+ bdev_partition_name(start_rdev->bdev));
+ export_rdev(start_rdev);
+ return err;
+ }
+ list_add(&start_rdev->same_set, &pending_raid_disks);
+
+ for (i = 0; i < MD_SB_DISKS; i++) {
+ mdp_disk_t *desc;
+ dev_t dev;
+
+ desc = sb->disks + i;
+ dev = MKDEV(desc->major, desc->minor);
+
+ if (!dev)
+ continue;
+ if (dev == startdev)
+ continue;
+ rdev = md_import_device(dev, 0, 0);
+ if (IS_ERR(rdev)) {
+ printk(KERN_WARNING "md: could not import %s,"
+ " trying to run array nevertheless.\n",
+ partition_name(dev));
+ continue;
+ }
+ list_add(&rdev->same_set, &pending_raid_disks);
+ }
+
+ /*
+ * possibly return codes
+ */
+ autorun_devices();
+ return 0;
+
+}
+
+
+static int get_version(void * arg)
+{
+ mdu_version_t ver;
+
+ ver.major = MD_MAJOR_VERSION;
+ ver.minor = MD_MINOR_VERSION;
+ ver.patchlevel = MD_PATCHLEVEL_VERSION;
+
+ if (copy_to_user(arg, &ver, sizeof(ver)))
+ return -EFAULT;
+
+ return 0;
+}
+
+static int get_array_info(mddev_t * mddev, void * arg)
+{
+ mdu_array_info_t info;
+ int nr,working,active,failed,spare;
+ mdk_rdev_t *rdev;
+ struct list_head *tmp;
+
+ nr=working=active=failed=spare=0;
+ ITERATE_RDEV(mddev,rdev,tmp) {
+ nr++;
+ if (rdev->faulty)
+ failed++;
+ else {
+ working++;
+ if (rdev->in_sync)
+ active++;
+ else
+ spare++;
+ }
+ }
+
+ info.major_version = mddev->major_version;
+ info.minor_version = mddev->minor_version;
+ info.patch_version = 1;
+ info.ctime = mddev->ctime;
+ info.level = mddev->level;
+ info.size = mddev->size;
+ info.nr_disks = nr;
+ info.raid_disks = mddev->raid_disks;
+ info.md_minor = mddev->__minor;
+ info.not_persistent= !mddev->persistent;
+
+ info.utime = mddev->utime;
+ info.state = 0;
+ if (mddev->in_sync)
+ info.state = (1<<MD_SB_CLEAN);
+ info.active_disks = active;
+ info.working_disks = working;
+ info.failed_disks = failed;
+ info.spare_disks = spare;
+
+ info.layout = mddev->layout;
+ info.chunk_size = mddev->chunk_size;
+
+ if (copy_to_user(arg, &info, sizeof(info)))
+ return -EFAULT;
+
+ return 0;
+}
+
+static int get_disk_info(mddev_t * mddev, void * arg)
+{
+ mdu_disk_info_t info;
+ unsigned int nr;
+ mdk_rdev_t *rdev;
+
+ if (copy_from_user(&info, arg, sizeof(info)))
+ return -EFAULT;
+
+ nr = info.number;
+
+ rdev = find_rdev_nr(mddev, nr);
+ if (rdev) {
+ info.major = MAJOR(rdev->bdev->bd_dev);
+ info.minor = MINOR(rdev->bdev->bd_dev);
+ info.raid_disk = rdev->raid_disk;
+ info.state = 0;
+ if (rdev->faulty)
+ info.state |= (1<<MD_DISK_FAULTY);
+ else if (rdev->in_sync) {
+ info.state |= (1<<MD_DISK_ACTIVE);
+ info.state |= (1<<MD_DISK_SYNC);
+ }
+ } else {
+ info.major = info.minor = 0;
+ info.raid_disk = -1;
+ info.state = (1<<MD_DISK_REMOVED);
+ }
+
+ if (copy_to_user(arg, &info, sizeof(info)))
+ return -EFAULT;
+
+ return 0;
+}
+
+static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info)
+{
+ mdk_rdev_t *rdev;
+ dev_t dev;
+ dev = MKDEV(info->major,info->minor);
+ if (!mddev->raid_disks) {
+ int err;
+ /* expecting a device which has a superblock */
+ rdev = md_import_device(dev, mddev->major_version, mddev->minor_version);
+ if (IS_ERR(rdev)) {
+ printk(KERN_WARNING
+ "md: md_import_device returned %ld\n",
+ PTR_ERR(rdev));
+ return PTR_ERR(rdev);
+ }
+ if (!list_empty(&mddev->disks)) {
+ mdk_rdev_t *rdev0 = list_entry(mddev->disks.next,
+ mdk_rdev_t, same_set);
+ int err = super_types[mddev->major_version]
+ .load_super(rdev, rdev0, mddev->minor_version);
+ if (err < 0) {
+ printk(KERN_WARNING
+ "md: %s has different UUID to %s\n",
+ bdev_partition_name(rdev->bdev),
+ bdev_partition_name(rdev0->bdev));
+ export_rdev(rdev);
+ return -EINVAL;
+ }
+ }
+ err = bind_rdev_to_array(rdev, mddev);
+ if (err)
+ export_rdev(rdev);
+ return err;
+ }
+
+ /*
+ * add_new_disk can be used once the array is assembled
+ * to add "hot spares". They must already have a superblock
+ * written
+ */
+ if (mddev->pers) {
+ int err;
+ if (!mddev->pers->hot_add_disk) {
+ printk(KERN_WARNING
+ "md%d: personality does not support diskops!\n",
+ mdidx(mddev));
+ return -EINVAL;
+ }
+ rdev = md_import_device(dev, mddev->major_version,
+ mddev->minor_version);
+ if (IS_ERR(rdev)) {
+ printk(KERN_WARNING
+ "md: md_import_device returned %ld\n",
+ PTR_ERR(rdev));
+ return PTR_ERR(rdev);
+ }
+ rdev->in_sync = 0; /* just to be sure */
+ rdev->raid_disk = -1;
+ err = bind_rdev_to_array(rdev, mddev);
+ if (err)
+ export_rdev(rdev);
+ if (mddev->thread)
+ md_wakeup_thread(mddev->thread);
+ return err;
+ }
+
+ /* otherwise, add_new_disk is only allowed
+ * for major_version==0 superblocks
+ */
+ if (mddev->major_version != 0) {
+ printk(KERN_WARNING "md%d: ADD_NEW_DISK not supported\n",
+ mdidx(mddev));
+ return -EINVAL;
+ }
+
+ if (!(info->state & (1<<MD_DISK_FAULTY))) {
+ int err;
+ rdev = md_import_device (dev, -1, 0);
+ if (IS_ERR(rdev)) {
+ printk(KERN_WARNING
+ "md: error, md_import_device() returned %ld\n",
+ PTR_ERR(rdev));
+ return PTR_ERR(rdev);
+ }
+ rdev->desc_nr = info->number;
+ if (info->raid_disk < mddev->raid_disks)
+ rdev->raid_disk = info->raid_disk;
+ else
+ rdev->raid_disk = -1;
+
+ rdev->faulty = 0;
+ if (rdev->raid_disk < mddev->raid_disks)
+ rdev->in_sync = (info->state & (1<<MD_DISK_SYNC));
+ else
+ rdev->in_sync = 0;
+
+ err = bind_rdev_to_array(rdev, mddev);
+ if (err) {
+ export_rdev(rdev);
+ return err;
+ }
+
+ if (!mddev->persistent) {
+ printk(KERN_INFO "md: nonpersistent superblock ...\n");
+ rdev->sb_offset = rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS;
+ } else
+ rdev->sb_offset = calc_dev_sboffset(rdev->bdev);
+ rdev->size = calc_dev_size(rdev, mddev->chunk_size);
+
+ if (!mddev->size || (mddev->size > rdev->size))
+ mddev->size = rdev->size;
+ }
+
+ return 0;
+}
+
+static int hot_generate_error(mddev_t * mddev, dev_t dev)
+{
+ struct request_queue *q;
+ mdk_rdev_t *rdev;
+
+ if (!mddev->pers)
+ return -ENODEV;
+
+ printk(KERN_INFO "md: trying to generate %s error in md%d ... \n",
+ partition_name(dev), mdidx(mddev));
+
+ rdev = find_rdev(mddev, dev);
+ if (!rdev) {
+ MD_BUG();
+ return -ENXIO;
+ }
+
+ if (rdev->desc_nr == -1) {
+ MD_BUG();
+ return -EINVAL;
+ }
+ if (!rdev->in_sync)
+ return -ENODEV;
+
+ q = bdev_get_queue(rdev->bdev);
+ if (!q) {
+ MD_BUG();
+ return -ENODEV;
+ }
+ printk(KERN_INFO "md: okay, generating error!\n");
+// q->oneshot_error = 1; // disabled for now
+
+ return 0;
+}
+
+static int hot_remove_disk(mddev_t * mddev, dev_t dev)
+{
+ mdk_rdev_t *rdev;
+
+ if (!mddev->pers)
+ return -ENODEV;
+
+ printk(KERN_INFO "md: trying to remove %s from md%d ... \n",
+ partition_name(dev), mdidx(mddev));
+
+ rdev = find_rdev(mddev, dev);
+ if (!rdev)
+ return -ENXIO;
+
+ if (rdev->raid_disk >= 0)
+ goto busy;
+
+ kick_rdev_from_array(rdev);
+ md_update_sb(mddev);
+
+ return 0;
+busy:
+ printk(KERN_WARNING "md: cannot remove active disk %s from md%d ... \n",
+ bdev_partition_name(rdev->bdev), mdidx(mddev));
+ return -EBUSY;
+}
+
+static int hot_add_disk(mddev_t * mddev, dev_t dev)
+{
+ int err;
+ unsigned int size;
+ mdk_rdev_t *rdev;
+
+ if (!mddev->pers)
+ return -ENODEV;
+
+ printk(KERN_INFO "md: trying to hot-add %s to md%d ... \n",
+ partition_name(dev), mdidx(mddev));
+
+ if (mddev->major_version != 0) {
+ printk(KERN_WARNING "md%d: HOT_ADD may only be used with"
+ " version-0 superblocks.\n",
+ mdidx(mddev));
+ return -EINVAL;
+ }
+ if (!mddev->pers->hot_add_disk) {
+ printk(KERN_WARNING
+ "md%d: personality does not support diskops!\n",
+ mdidx(mddev));
+ return -EINVAL;
+ }
+
+ rdev = md_import_device (dev, -1, 0);
+ if (IS_ERR(rdev)) {
+ printk(KERN_WARNING
+ "md: error, md_import_device() returned %ld\n",
+ PTR_ERR(rdev));
+ return -EINVAL;
+ }
+
+ rdev->sb_offset = calc_dev_sboffset(rdev->bdev);
+ size = calc_dev_size(rdev, mddev->chunk_size);
+ rdev->size = size;
+
+ if (size < mddev->size) {
+ printk(KERN_WARNING
+ "md%d: disk size %llu blocks < array size %llu\n",
+ mdidx(mddev), (unsigned long long)size,
+ (unsigned long long)mddev->size);
+ err = -ENOSPC;
+ goto abort_export;
+ }
+
+ if (rdev->faulty) {
+ printk(KERN_WARNING
+ "md: can not hot-add faulty %s disk to md%d!\n",
+ bdev_partition_name(rdev->bdev), mdidx(mddev));
+ err = -EINVAL;
+ goto abort_export;
+ }
+ rdev->in_sync = 0;
+ rdev->desc_nr = -1;
+ bind_rdev_to_array(rdev, mddev);
+
+ /*
+ * The rest had better be atomic; we can have disk failures
+ * noticed in interrupt contexts ...
+ */
+
+ if (rdev->desc_nr == mddev->max_disks) {
+ printk(KERN_WARNING "md%d: can not hot-add to full array!\n",
+ mdidx(mddev));
+ err = -EBUSY;
+ goto abort_unbind_export;
+ }
+
+ rdev->raid_disk = -1;
+
+ md_update_sb(mddev);
+
+ /*
+ * Kick recovery, maybe this spare has to be added to the
+ * array immediately.
+ */
+ set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+ md_wakeup_thread(mddev->thread);
+
+ return 0;
+
+abort_unbind_export:
+ unbind_rdev_from_array(rdev);
+
+abort_export:
+ export_rdev(rdev);
+ return err;
+}
+
+/*
+ * set_array_info is used two different ways
+ * The original usage is when creating a new array.
+ * In this usage, raid_disks is > 0 and it together with
+ * level, size, not_persistent,layout,chunksize determine the
+ * shape of the array.
+ * This will always create an array with a type-0.90.0 superblock.
+ * The newer usage is when assembling an array.
+ * In this case raid_disks will be 0, and the major_version field is
+ * used to determine which style super-blocks are to be found on the devices.
+ * The minor and patch _version numbers are also kept in case the
+ * super_block handler wishes to interpret them.
+ */
+static int set_array_info(mddev_t * mddev, mdu_array_info_t *info)
+{
+
+ if (info->raid_disks == 0) {
+ /* just setting version number for superblock loading */
+ if (info->major_version < 0 ||
+ info->major_version >= sizeof(super_types)/sizeof(super_types[0]) ||
+ super_types[info->major_version].name == NULL) {
+ /* maybe try to auto-load a module? */
+ printk(KERN_INFO
+ "md: superblock version %d not known\n",
+ info->major_version);
+ return -EINVAL;
+ }
+ mddev->major_version = info->major_version;
+ mddev->minor_version = info->minor_version;
+ mddev->patch_version = info->patch_version;
+ return 0;
+ }
+ mddev->major_version = MD_MAJOR_VERSION;
+ mddev->minor_version = MD_MINOR_VERSION;
+ mddev->patch_version = MD_PATCHLEVEL_VERSION;
+ mddev->ctime = get_seconds();
+
+ mddev->level = info->level;
+ mddev->size = info->size;
+ mddev->raid_disks = info->raid_disks;
+ /* don't set __minor, it is determined by which /dev/md* was
+ * opened
+ */
+ if (info->state & (1<<MD_SB_CLEAN))
+ mddev->recovery_cp = MaxSector;
+ else
+ mddev->recovery_cp = 0;
+ mddev->persistent = ! info->not_persistent;
+
+ mddev->layout = info->layout;
+ mddev->chunk_size = info->chunk_size;
+
+ mddev->max_disks = MD_SB_DISKS;
+
+
+ /*
+ * Generate a 128 bit UUID
+ */
+ get_random_bytes(mddev->uuid, 16);
+
+ return 0;
+}
+
+static int set_disk_faulty(mddev_t *mddev, dev_t dev)
+{
+ mdk_rdev_t *rdev;
+
+ rdev = find_rdev(mddev, dev);
+ if (!rdev)
+ return 0;
+
+ md_error(mddev, rdev);
+ return 1;
+}
+
+static int md_ioctl(struct inode *inode, struct file *file,
+ unsigned int cmd, unsigned long arg)
+{
+ unsigned int minor;
+ int err = 0;
+ struct hd_geometry *loc = (struct hd_geometry *) arg;
+ mddev_t *mddev = NULL;
+ kdev_t dev;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EACCES;
+
+ dev = inode->i_rdev;
+ minor = minor(dev);
+ if (minor >= MAX_MD_DEVS) {
+ MD_BUG();
+ return -EINVAL;
+ }
+
+ /*
+ * Commands dealing with the RAID driver but not any
+ * particular array:
+ */
+ switch (cmd)
+ {
+ case RAID_VERSION:
+ err = get_version((void *)arg);
+ goto done;
+
+ case PRINT_RAID_DEBUG:
+ err = 0;
+ md_print_devices();
+ goto done;
+
+#ifndef MODULE
+ case RAID_AUTORUN:
+ err = 0;
+ autostart_arrays();
+ goto done;
+#endif
+ default:;
+ }
+
+ /*
+ * Commands creating/starting a new array:
+ */
+
+ mddev = inode->i_bdev->bd_inode->u.generic_ip;
+
+ if (!mddev) {
+ BUG();
+ goto abort;
+ }
+
+
+ if (cmd == START_ARRAY) {
+ /* START_ARRAY doesn't need to lock the array as autostart_array
+ * does the locking, and it could even be a different array
+ */
+ err = autostart_array(arg);
+ if (err) {
+ printk(KERN_WARNING "md: autostart %s failed!\n",
+ partition_name(arg));
+ goto abort;
+ }
+ goto done;
+ }
+
+ err = mddev_lock(mddev);
+ if (err) {
+ printk(KERN_INFO
+ "md: ioctl lock interrupted, reason %d, cmd %d\n",
+ err, cmd);
+ goto abort;
+ }
+
+ switch (cmd)
+ {
+ case SET_ARRAY_INFO:
+
+ if (!list_empty(&mddev->disks)) {
+ printk(KERN_WARNING
+ "md: array md%d already has disks!\n",
+ mdidx(mddev));
+ err = -EBUSY;
+ goto abort_unlock;
+ }
+ if (mddev->raid_disks) {
+ printk(KERN_WARNING
+ "md: array md%d already initialised!\n",
+ mdidx(mddev));
+ err = -EBUSY;
+ goto abort_unlock;
+ }
+ {
+ mdu_array_info_t info;
+ if (!arg)
+ memset(&info, 0, sizeof(info));
+ else if (copy_from_user(&info, (void*)arg, sizeof(info))) {
+ err = -EFAULT;
+ goto abort_unlock;
+ }
+ err = set_array_info(mddev, &info);
+ if (err) {
+ printk(KERN_WARNING "md: couldn't set"
+ " array info. %d\n", err);
+ goto abort_unlock;
+ }
+ }
+ goto done_unlock;
+
+ default:;
+ }
+
+ /*
+ * Commands querying/configuring an existing array:
+ */
+ /* if we are not initialised yet, only ADD_NEW_DISK, STOP_ARRAY or RUN_ARRAY is allowed */
+ if (!mddev->raid_disks && cmd != ADD_NEW_DISK && cmd != STOP_ARRAY && cmd != RUN_ARRAY) {
+ err = -ENODEV;
+ goto abort_unlock;
+ }
+
+ /*
+ * Commands even a read-only array can execute:
+ */
+ switch (cmd)
+ {
+ case GET_ARRAY_INFO:
+ err = get_array_info(mddev, (void *)arg);
+ goto done_unlock;
+
+ case GET_DISK_INFO:
+ err = get_disk_info(mddev, (void *)arg);
+ goto done_unlock;
+
+ case RESTART_ARRAY_RW:
+ err = restart_array(mddev);
+ goto done_unlock;
+
+ case STOP_ARRAY:
+ err = do_md_stop (mddev, 0);
+ goto done_unlock;
+
+ case STOP_ARRAY_RO:
+ err = do_md_stop (mddev, 1);
+ goto done_unlock;
+
+ /*
+ * We have a problem here : there is no easy way to give a CHS
+ * virtual geometry. We currently pretend that we have a 2 heads
+ * 4 sectors (with a BIG number of cylinders...). This drives
+ * dosfs just mad... ;-)
+ */
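+ /*
+ * With 2 heads and 4 sectors per track there are 8 sectors per
+ * cylinder, which is why the capacity is divided by 8 below to fake
+ * the cylinder count.
+ */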
+ case HDIO_GETGEO:
+ if (!loc) {
+ err = -EINVAL;
+ goto abort_unlock;
+ }
+ err = put_user (2, (char *) &loc->heads);
+ if (err)
+ goto abort_unlock;
+ err = put_user (4, (char *) &loc->sectors);
+ if (err)
+ goto abort_unlock;
+ err = put_user(get_capacity(disks[mdidx(mddev)])/8,
+ (short *) &loc->cylinders);
+ if (err)
+ goto abort_unlock;
+ err = put_user (get_start_sect(inode->i_bdev),
+ (long *) &loc->start);
+ goto done_unlock;
+ }
+
+ /*
+ * The remaining ioctls are changing the state of the
+ * superblock, so we do not allow read-only arrays
+ * here:
+ */
+ if (mddev->ro) {
+ err = -EROFS;
+ goto abort_unlock;
+ }
+
+ switch (cmd)
+ {
+ case ADD_NEW_DISK:
+ {
+ mdu_disk_info_t info;
+ if (copy_from_user(&info, (void*)arg, sizeof(info)))
+ err = -EFAULT;
+ else
+ err = add_new_disk(mddev, &info);
+ goto done_unlock;
+ }
+ case HOT_GENERATE_ERROR:
+ err = hot_generate_error(mddev, arg);
+ goto done_unlock;
+ case HOT_REMOVE_DISK:
+ err = hot_remove_disk(mddev, arg);
+ goto done_unlock;
+
+ case HOT_ADD_DISK:
+ err = hot_add_disk(mddev, arg);
+ goto done_unlock;
+
+ case SET_DISK_FAULTY:
+ err = set_disk_faulty(mddev, arg);
+ goto done_unlock;
+
+ case RUN_ARRAY:
+ {
+ err = do_md_run (mddev);
+ /*
+ * we have to clean up the mess if
+ * the array cannot be run for some
+ * reason ...
+ * ->pers will not be set, so the superblock will
+ * not be updated.
+ */
+ if (err)
+ do_md_stop (mddev, 0);
+ goto done_unlock;
+ }
+
+ default:
+ if (_IOC_TYPE(cmd) == MD_MAJOR)
+ printk(KERN_WARNING "md: %s(pid %d) used"
+ " obsolete MD ioctl, upgrade your"
+ " software to use new ictls.\n",
+ current->comm, current->pid);
+ err = -EINVAL;
+ goto abort_unlock;
+ }
+
+done_unlock:
+abort_unlock:
+ mddev_unlock(mddev);
+
+ return err;
+done:
+ if (err)
+ MD_BUG();
+abort:
+ return err;
+}
+
+static int md_open(struct inode *inode, struct file *file)
+{
+ /*
+ * Succeed if we can find or allocate a mddev structure.
+ */
+ mddev_t *mddev = mddev_find(minor(inode->i_rdev));
+ int err = -ENOMEM;
+
+ if (!mddev)
+ goto out;
+
+ if ((err = mddev_lock(mddev)))
+ goto put;
+
+ err = 0;
+ mddev_unlock(mddev);
+ inode->i_bdev->bd_inode->u.generic_ip = mddev_get(mddev);
+ put:
+ mddev_put(mddev);
+ out:
+ return err;
+}
+
+static int md_release(struct inode *inode, struct file * file)
+{
+ mddev_t *mddev = inode->i_bdev->bd_inode->u.generic_ip;
+
+ if (!mddev)
+ BUG();
+ mddev_put(mddev);
+
+ return 0;
+}
+
+static struct block_device_operations md_fops =
+{
+ .owner = THIS_MODULE,
+ .open = md_open,
+ .release = md_release,
+ .ioctl = md_ioctl,
+};
+
+int md_thread(void * arg)
+{
+ mdk_thread_t *thread = arg;
+
+ lock_kernel();
+
+ /*
+ * Detach thread
+ */
+
+ daemonize(thread->name, mdidx(thread->mddev));
+
+ current->exit_signal = SIGCHLD;
+ allow_signal(SIGKILL);
+ thread->tsk = current;
+
+ /*
+ * md_thread is a 'system-thread', its priority should be very
+ * high. We avoid resource deadlocks individually in each
+ * raid personality. (RAID5 does preallocation) We also use RR and
+ * the very same RT priority as kswapd, thus we will never get
+ * into a priority inversion deadlock.
+ *
+ * we definitely have to have equal or higher priority than
+ * bdflush, otherwise bdflush will deadlock if there are too
+ * many dirty RAID5 blocks.
+ */
+ unlock_kernel();
+
+ complete(thread->event);
+ while (thread->run) {
+ void (*run)(mddev_t *);
+
+ wait_event_interruptible(thread->wqueue,
+ test_bit(THREAD_WAKEUP, &thread->flags));
+ if (current->flags & PF_FREEZE)
+ refrigerator(PF_IOTHREAD);
+
+ clear_bit(THREAD_WAKEUP, &thread->flags);
+
+ run = thread->run;
+ if (run) {
+ run(thread->mddev);
+ blk_run_queues();
+ }
+ if (signal_pending(current))
+ flush_signals(current);
+ }
+ complete(thread->event);
+ return 0;
+}
+
+void md_wakeup_thread(mdk_thread_t *thread)
+{
+ if (thread) {
+ dprintk("md: waking up MD thread %p.\n", thread);
+ set_bit(THREAD_WAKEUP, &thread->flags);
+ wake_up(&thread->wqueue);
+ }
+}
+
+mdk_thread_t *md_register_thread(void (*run) (mddev_t *), mddev_t *mddev,
+ const char *name)
+{
+ mdk_thread_t *thread;
+ int ret;
+ struct completion event;
+
+ thread = (mdk_thread_t *) kmalloc
+ (sizeof(mdk_thread_t), GFP_KERNEL);
+ if (!thread)
+ return NULL;
+
+ memset(thread, 0, sizeof(mdk_thread_t));
+ init_waitqueue_head(&thread->wqueue);
+
+ init_completion(&event);
+ thread->event = &event;
+ thread->run = run;
+ thread->mddev = mddev;
+ thread->name = name;
+ ret = kernel_thread(md_thread, thread, 0);
+ if (ret < 0) {
+ kfree(thread);
+ return NULL;
+ }
+ wait_for_completion(&event);
+ return thread;
+}
+
+void md_interrupt_thread(mdk_thread_t *thread)
+{
+ if (!thread->tsk) {
+ MD_BUG();
+ return;
+ }
+ dprintk("interrupting MD-thread pid %d\n", thread->tsk->pid);
+ send_sig(SIGKILL, thread->tsk, 1);
+}
+
+void md_unregister_thread(mdk_thread_t *thread)
+{
+ struct completion event;
+
+ init_completion(&event);
+
+ thread->event = &event;
+ thread->run = NULL;
+ thread->name = NULL;
+ md_interrupt_thread(thread);
+ wait_for_completion(&event);
+ kfree(thread);
+}
+
+void md_error(mddev_t *mddev, mdk_rdev_t *rdev)
+{
+ dprintk("md_error dev:(%d:%d), rdev:(%d:%d), (caller: %p,%p,%p,%p).\n",
+ MD_MAJOR,mdidx(mddev),
+ MAJOR(rdev->bdev->bd_dev), MINOR(rdev->bdev->bd_dev),
+ __builtin_return_address(0),__builtin_return_address(1),
+ __builtin_return_address(2),__builtin_return_address(3));
+
+ if (!mddev) {
+ MD_BUG();
+ return;
+ }
+
+ if (!rdev || rdev->faulty)
+ return;
+ if (!mddev->pers->error_handler)
+ return;
+ mddev->pers->error_handler(mddev,rdev);
+ set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+ md_wakeup_thread(mddev->thread);
+}
+
+/* seq_file implementation /proc/mdstat */
+
+static void status_unused(struct seq_file *seq)
+{
+ int i = 0;
+ mdk_rdev_t *rdev;
+ struct list_head *tmp;
+
+ seq_printf(seq, "unused devices: ");
+
+ ITERATE_RDEV_PENDING(rdev,tmp) {
+ i++;
+ seq_printf(seq, "%s ",
+ bdev_partition_name(rdev->bdev));
+ }
+ if (!i)
+ seq_printf(seq, "<none>");
+
+ seq_printf(seq, "\n");
+}
+
+
+static void status_resync(struct seq_file *seq, mddev_t * mddev)
+{
+ unsigned long max_blocks, resync, res, dt, db, rt;
+
+ resync = (mddev->curr_resync - atomic_read(&mddev->recovery_active))/2;
+ max_blocks = mddev->size;
+
+ /*
+ * Should not happen.
+ */
+ if (!max_blocks) {
+ MD_BUG();
+ return;
+ }
+ res = (resync/1024)*1000/(max_blocks/1024 + 1);
+ {
+ int i, x = res/50, y = 20-x;
+ seq_printf(seq, "[");
+ for (i = 0; i < x; i++)
+ seq_printf(seq, "=");
+ seq_printf(seq, ">");
+ for (i = 0; i < y; i++)
+ seq_printf(seq, ".");
+ seq_printf(seq, "] ");
+ }
+ seq_printf(seq, " %s =%3lu.%lu%% (%lu/%lu)",
+ (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ?
+ "resync" : "recovery"),
+ res/10, res % 10, resync, max_blocks);
+
+ /*
+ * We do not want to overflow, so the order of operands and
+ * the * 100 / 100 trick are important. We do a +1 to be
+ * safe against division by zero. We only estimate anyway.
+ *
+ * dt: time from mark until now
+ * db: blocks written from mark until now
+ * rt: remaining time
+ */
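+	/*
+	 * A rough worked example: 15000 blocks written in the 30 seconds
+	 * since the mark, with 900000 blocks left, gives db/100+1 = 151 and
+	 * rt = (30 * (900000/151)) / 100 ~= 1788 seconds, i.e. close to
+	 * remaining/speed = 900000/500.
+	 */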
+ dt = ((jiffies - mddev->resync_mark) / HZ);
+ if (!dt) dt++;
+ db = resync - (mddev->resync_mark_cnt/2);
+ rt = (dt * ((max_blocks-resync) / (db/100+1)))/100;
+
+ seq_printf(seq, " finish=%lu.%lumin", rt / 60, (rt % 60)/6);
+
+ seq_printf(seq, " speed=%ldK/sec", db/dt);
+}
+
+static void *md_seq_start(struct seq_file *seq, loff_t *pos)
+{
+ struct list_head *tmp;
+ loff_t l = *pos;
+ mddev_t *mddev;
+
+ if (l > 0x10000)
+ return NULL;
+ if (!l--)
+ /* header */
+ return (void*)1;
+
+ spin_lock(&all_mddevs_lock);
+ list_for_each(tmp,&all_mddevs)
+ if (!l--) {
+ mddev = list_entry(tmp, mddev_t, all_mddevs);
+ mddev_get(mddev);
+ spin_unlock(&all_mddevs_lock);
+ return mddev;
+ }
+ spin_unlock(&all_mddevs_lock);
+ return (void*)2;/* tail */
+}
+
+static void *md_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ struct list_head *tmp;
+ mddev_t *next_mddev, *mddev = v;
+
+ ++*pos;
+ if (v == (void*)2)
+ return NULL;
+
+ spin_lock(&all_mddevs_lock);
+ if (v == (void*)1)
+ tmp = all_mddevs.next;
+ else
+ tmp = mddev->all_mddevs.next;
+ if (tmp != &all_mddevs)
+ next_mddev = mddev_get(list_entry(tmp,mddev_t,all_mddevs));
+ else {
+ next_mddev = (void*)2;
+ *pos = 0x10000;
+ }
+ spin_unlock(&all_mddevs_lock);
+
+ if (v != (void*)1)
+ mddev_put(mddev);
+ return next_mddev;
+
+}
+
+static void md_seq_stop(struct seq_file *seq, void *v)
+{
+ mddev_t *mddev = v;
+
+ if (mddev && v != (void*)1 && v != (void*)2)
+ mddev_put(mddev);
+}
+
+static int md_seq_show(struct seq_file *seq, void *v)
+{
+ mddev_t *mddev = v;
+ sector_t size;
+ struct list_head *tmp2;
+ mdk_rdev_t *rdev;
+ int i;
+
+ if (v == (void*)1) {
+ seq_printf(seq, "Personalities : ");
+ spin_lock(&pers_lock);
+ for (i = 0; i < MAX_PERSONALITY; i++)
+ if (pers[i])
+ seq_printf(seq, "[%s] ", pers[i]->name);
+
+ spin_unlock(&pers_lock);
+ seq_printf(seq, "\n");
+ return 0;
+ }
+ if (v == (void*)2) {
+ status_unused(seq);
+ return 0;
+ }
+
+ if (mddev_lock(mddev)!=0)
+ return -EINTR;
+ if (mddev->pers || mddev->raid_disks || !list_empty(&mddev->disks)) {
+ seq_printf(seq, "md%d : %sactive", mdidx(mddev),
+ mddev->pers ? "" : "in");
+ if (mddev->pers) {
+ if (mddev->ro)
+ seq_printf(seq, " (read-only)");
+ seq_printf(seq, " %s", mddev->pers->name);
+ }
+
+ size = 0;
+ ITERATE_RDEV(mddev,rdev,tmp2) {
+ seq_printf(seq, " %s[%d]",
+ bdev_partition_name(rdev->bdev), rdev->desc_nr);
+ if (rdev->faulty) {
+ seq_printf(seq, "(F)");
+ continue;
+ }
+ size += rdev->size;
+ }
+
+ if (!list_empty(&mddev->disks)) {
+ if (mddev->pers)
+ seq_printf(seq, "\n %llu blocks",
+ (unsigned long long)mddev->array_size);
+ else
+ seq_printf(seq, "\n %llu blocks",
+ (unsigned long long)size);
+ }
+
+ if (mddev->pers) {
+ mddev->pers->status (seq, mddev);
+ seq_printf(seq, "\n ");
+ if (mddev->curr_resync > 2)
+ status_resync (seq, mddev);
+ else if (mddev->curr_resync == 1 || mddev->curr_resync == 2)
+ seq_printf(seq, " resync=DELAYED");
+ }
+
+ seq_printf(seq, "\n");
+ }
+ mddev_unlock(mddev);
+
+ return 0;
+}
+
+static struct seq_operations md_seq_ops = {
+ .start = md_seq_start,
+ .next = md_seq_next,
+ .stop = md_seq_stop,
+ .show = md_seq_show,
+};
+
+static int md_seq_open(struct inode *inode, struct file *file)
+{
+ int error;
+
+ error = seq_open(file, &md_seq_ops);
+ return error;
+}
+
+static struct file_operations md_seq_fops = {
+ .open = md_seq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+int register_md_personality(int pnum, mdk_personality_t *p)
+{
+ if (pnum >= MAX_PERSONALITY) {
+ MD_BUG();
+ return -EINVAL;
+ }
+
+ spin_lock(&pers_lock);
+ if (pers[pnum]) {
+ spin_unlock(&pers_lock);
+ MD_BUG();
+ return -EBUSY;
+ }
+
+ pers[pnum] = p;
+ printk(KERN_INFO "md: %s personality registered as nr %d\n", p->name, pnum);
+ spin_unlock(&pers_lock);
+ return 0;
+}
+
+int unregister_md_personality(int pnum)
+{
+ if (pnum >= MAX_PERSONALITY) {
+ MD_BUG();
+ return -EINVAL;
+ }
+
+ printk(KERN_INFO "md: %s personality unregistered\n", pers[pnum]->name);
+ spin_lock(&pers_lock);
+ pers[pnum] = NULL;
+ spin_unlock(&pers_lock);
+ return 0;
+}
+
+void md_sync_acct(mdk_rdev_t *rdev, unsigned long nr_sectors)
+{
+ rdev->bdev->bd_contains->bd_disk->sync_io += nr_sectors;
+}
+
+static int is_mddev_idle(mddev_t *mddev)
+{
+ mdk_rdev_t * rdev;
+ struct list_head *tmp;
+ int idle;
+ unsigned long curr_events;
+
+ idle = 1;
+ ITERATE_RDEV(mddev,rdev,tmp) {
+ struct gendisk *disk = rdev->bdev->bd_contains->bd_disk;
+ curr_events = disk_stat_read(disk, read_sectors) +
+ disk_stat_read(disk, write_sectors) -
+ disk->sync_io;
+ if ((curr_events - rdev->last_events) > 32) {
+ rdev->last_events = curr_events;
+ idle = 0;
+ }
+ }
+ return idle;
+}
+
+void md_done_sync(mddev_t *mddev, int blocks, int ok)
+{
+ /* another "blocks" (512byte) blocks have been synced */
+ atomic_sub(blocks, &mddev->recovery_active);
+ wake_up(&mddev->recovery_wait);
+ if (!ok) {
+ set_bit(MD_RECOVERY_ERR, &mddev->recovery);
+ md_wakeup_thread(mddev->thread);
+ // stop recovery, signal do_sync ....
+ }
+}
+
+
+void md_write_start(mddev_t *mddev)
+{
+ if (!atomic_read(&mddev->writes_pending)) {
+ mddev_lock_uninterruptible(mddev);
+ if (mddev->in_sync) {
+ mddev->in_sync = 0;
+ del_timer(&mddev->safemode_timer);
+ md_update_sb(mddev);
+ }
+ atomic_inc(&mddev->writes_pending);
+ mddev_unlock(mddev);
+ } else
+ atomic_inc(&mddev->writes_pending);
+}
+
+void md_write_end(mddev_t *mddev)
+{
+ if (atomic_dec_and_test(&mddev->writes_pending)) {
+ if (mddev->safemode == 2)
+ md_wakeup_thread(mddev->thread);
+ else
+ mod_timer(&mddev->safemode_timer, jiffies + mddev->safemode_delay);
+ }
+}
+
+static inline void md_enter_safemode(mddev_t *mddev)
+{
+ mddev_lock_uninterruptible(mddev);
+ if (mddev->safemode && !atomic_read(&mddev->writes_pending) &&
+ !mddev->in_sync && mddev->recovery_cp == MaxSector) {
+ mddev->in_sync = 1;
+ md_update_sb(mddev);
+ }
+ mddev_unlock(mddev);
+
+ if (mddev->safemode == 1)
+ mddev->safemode = 0;
+}
+
+void md_handle_safemode(mddev_t *mddev)
+{
+ if (signal_pending(current)) {
+ printk(KERN_INFO "md: md%d in immediate safe mode\n",
+ mdidx(mddev));
+ mddev->safemode = 2;
+ flush_signals(current);
+ }
+ if (mddev->safemode)
+ md_enter_safemode(mddev);
+}
+
+
+DECLARE_WAIT_QUEUE_HEAD(resync_wait);
+
+#define SYNC_MARKS 10
+#define SYNC_MARK_STEP (3*HZ)
+static void md_do_sync(mddev_t *mddev)
+{
+ mddev_t *mddev2;
+ unsigned int max_sectors, currspeed = 0,
+ j, window;
+ unsigned long mark[SYNC_MARKS];
+ unsigned long mark_cnt[SYNC_MARKS];
+ int last_mark,m;
+ struct list_head *tmp;
+ unsigned long last_check;
+
+	/* just in case the thread restarts... */
+ if (test_bit(MD_RECOVERY_DONE, &mddev->recovery))
+ return;
+
+ /* we overload curr_resync somewhat here.
+ * 0 == not engaged in resync at all
+ * 2 == checking that there is no conflict with another sync
+ * 1 == like 2, but have yielded to allow conflicting resync to
+	 *		commence
+ * other == active in resync - this many blocks
+ */
+ do {
+ mddev->curr_resync = 2;
+
+ ITERATE_MDDEV(mddev2,tmp) {
+ if (mddev2 == mddev)
+ continue;
+ if (mddev2->curr_resync &&
+ match_mddev_units(mddev,mddev2)) {
+ printk(KERN_INFO "md: delaying resync of md%d"
+ " until md%d has finished resync (they"
+ " share one or more physical units)\n",
+ mdidx(mddev), mdidx(mddev2));
+ if (mddev < mddev2) {/* arbitrarily yield */
+ mddev->curr_resync = 1;
+ wake_up(&resync_wait);
+ }
+ if (wait_event_interruptible(resync_wait,
+ mddev2->curr_resync < mddev->curr_resync)) {
+ flush_signals(current);
+ mddev_put(mddev2);
+ goto skip;
+ }
+ }
+ if (mddev->curr_resync == 1) {
+ mddev_put(mddev2);
+ break;
+ }
+ }
+ } while (mddev->curr_resync < 2);
+
+ max_sectors = mddev->size << 1;
+
+ printk(KERN_INFO "md: syncing RAID array md%d\n", mdidx(mddev));
+ printk(KERN_INFO "md: minimum _guaranteed_ reconstruction speed:"
+ " %d KB/sec/disc.\n", sysctl_speed_limit_min);
+	printk(KERN_INFO "md: using maximum available idle IO bandwidth "
+ "(but not more than %d KB/sec) for reconstruction.\n",
+ sysctl_speed_limit_max);
+
+ is_mddev_idle(mddev); /* this also initializes IO event counters */
+ if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
+ j = mddev->recovery_cp;
+ else
+ j = 0;
+ for (m = 0; m < SYNC_MARKS; m++) {
+ mark[m] = jiffies;
+ mark_cnt[m] = j;
+ }
+ last_mark = 0;
+ mddev->resync_mark = mark[last_mark];
+ mddev->resync_mark_cnt = mark_cnt[last_mark];
+
+ /*
+ * Tune reconstruction:
+ */
+ window = 32*(PAGE_SIZE/512);
+ printk(KERN_INFO "md: using %dk window, over a total of %d blocks.\n",
+ window/2,max_sectors/2);
+
+ atomic_set(&mddev->recovery_active, 0);
+ init_waitqueue_head(&mddev->recovery_wait);
+ last_check = 0;
+
+ if (j)
+ printk(KERN_INFO
+ "md: resuming recovery of md%d from checkpoint.\n",
+ mdidx(mddev));
+
+ while (j < max_sectors) {
+ int sectors;
+
+ sectors = mddev->pers->sync_request(mddev, j, currspeed < sysctl_speed_limit_min);
+ if (sectors < 0) {
+ set_bit(MD_RECOVERY_ERR, &mddev->recovery);
+ goto out;
+ }
+ atomic_add(sectors, &mddev->recovery_active);
+ j += sectors;
+ if (j>1) mddev->curr_resync = j;
+
+ if (last_check + window > j)
+ continue;
+
+ last_check = j;
+
+ if (test_bit(MD_RECOVERY_INTR, &mddev->recovery) ||
+ test_bit(MD_RECOVERY_ERR, &mddev->recovery))
+ break;
+
+ blk_run_queues();
+
+ repeat:
+ if (jiffies >= mark[last_mark] + SYNC_MARK_STEP ) {
+ /* step marks */
+ int next = (last_mark+1) % SYNC_MARKS;
+
+ mddev->resync_mark = mark[next];
+ mddev->resync_mark_cnt = mark_cnt[next];
+ mark[next] = jiffies;
+ mark_cnt[next] = j - atomic_read(&mddev->recovery_active);
+ last_mark = next;
+ }
+
+
+ if (signal_pending(current)) {
+ /*
+ * got a signal, exit.
+ */
+ printk(KERN_INFO
+ "md: md_do_sync() got signal ... exiting\n");
+ flush_signals(current);
+ set_bit(MD_RECOVERY_INTR, &mddev->recovery);
+ goto out;
+ }
+
+ /*
+		 * this loop exits only if either we are slower than
+ * the 'hard' speed limit, or the system was IO-idle for
+ * a jiffy.
+ * the system might be non-idle CPU-wise, but we only care
+ * about not overloading the IO subsystem. (things like an
+ * e2fsck being done on the RAID array should execute fast)
+ */
+ cond_resched();
+
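+		/*
+		 * currspeed is in KB/sec: sectors synced since the last mark,
+		 * halved to get KB, divided by the (at least one) seconds
+		 * elapsed since that mark.  E.g. 30000 sectors in ~14 seconds
+		 * gives (30000/2)/(14+1) + 1 = 1001 KB/sec.
+		 */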
+ currspeed = (j-mddev->resync_mark_cnt)/2/((jiffies-mddev->resync_mark)/HZ +1) +1;
+
+ if (currspeed > sysctl_speed_limit_min) {
+ if ((currspeed > sysctl_speed_limit_max) ||
+ !is_mddev_idle(mddev)) {
+ current->state = TASK_INTERRUPTIBLE;
+ schedule_timeout(HZ/4);
+ goto repeat;
+ }
+ }
+ }
+ printk(KERN_INFO "md: md%d: sync done.\n",mdidx(mddev));
+ /*
+ * this also signals 'finished resyncing' to md_stop
+ */
+ out:
+ wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active));
+
+ /* tell personality that we are finished */
+ mddev->pers->sync_request(mddev, max_sectors, 1);
+
+ if (!test_bit(MD_RECOVERY_ERR, &mddev->recovery) &&
+ mddev->curr_resync > 2 &&
+ mddev->curr_resync > mddev->recovery_cp) {
+ if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
+ printk(KERN_INFO
+ "md: checkpointing recovery of md%d.\n",
+ mdidx(mddev));
+ mddev->recovery_cp = mddev->curr_resync;
+ } else
+ mddev->recovery_cp = MaxSector;
+ }
+
+ if (mddev->safemode)
+ md_enter_safemode(mddev);
+ skip:
+ mddev->curr_resync = 0;
+ set_bit(MD_RECOVERY_DONE, &mddev->recovery);
+ md_wakeup_thread(mddev->thread);
+}
+
+
+/*
+ * This routine is regularly called by all per-raid-array threads to
+ * deal with generic issues like resync and super-block update.
+ * Raid personalities that don't have a thread (linear/raid0) do not
+ * need this as they never do any recovery or update the superblock.
+ *
+ * It does not do any resync itself, but rather "forks" off other threads
+ * to do that as needed.
+ * When it is determined that resync is needed, we set MD_RECOVERY_RUNNING in
+ * "->recovery" and create a thread at ->sync_thread.
+ * When the thread finishes it sets MD_RECOVERY_DONE (and might set MD_RECOVERY_ERR)
+ * and wakes up this thread which will reap the thread and finish up.
+ * This thread also removes any faulty devices (with nr_pending == 0).
+ *
+ * The overall approach is:
+ * 1/ if the superblock needs updating, update it.
+ * 2/ If a recovery thread is running, don't do anything else.
+ * 3/ If recovery has finished, clean up, possibly marking spares active.
+ * 4/ If there are any faulty devices, remove them.
+ * 5/ If array is degraded, try to add spare devices
+ * 6/ If array has spares or is not in-sync, start a resync thread.
+ */
+void md_check_recovery(mddev_t *mddev)
+{
+ mdk_rdev_t *rdev;
+ struct list_head *rtmp;
+
+
+ dprintk(KERN_INFO "md: recovery thread got woken up ...\n");
+
+ if (mddev->ro)
+ return;
+ if ( ! (
+ mddev->sb_dirty ||
+ test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) ||
+ test_bit(MD_RECOVERY_DONE, &mddev->recovery)
+ ))
+ return;
+ if (mddev_trylock(mddev)==0) {
+ int spares =0;
+ if (mddev->sb_dirty)
+ md_update_sb(mddev);
+ if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) &&
+ !test_bit(MD_RECOVERY_DONE, &mddev->recovery))
+ /* resync/recovery still happening */
+ goto unlock;
+ if (mddev->sync_thread) {
+ /* resync has finished, collect result */
+ md_unregister_thread(mddev->sync_thread);
+ mddev->sync_thread = NULL;
+ if (!test_bit(MD_RECOVERY_ERR, &mddev->recovery)) {
+ /* success...*/
+ /* activate any spares */
+ mddev->pers->spare_active(mddev);
+ }
+ md_update_sb(mddev);
+ mddev->recovery = 0;
+ wake_up(&resync_wait);
+ goto unlock;
+ }
+ if (mddev->recovery) {
+ /* that's odd.. */
+ mddev->recovery = 0;
+ wake_up(&resync_wait);
+ }
+
+ /* no recovery is running.
+ * remove any failed drives, then
+ * add spares if possible
+ */
+ ITERATE_RDEV(mddev,rdev,rtmp) {
+ if (rdev->raid_disk >= 0 &&
+ rdev->faulty &&
+ atomic_read(&rdev->nr_pending)==0) {
+ mddev->pers->hot_remove_disk(mddev, rdev->raid_disk);
+ rdev->raid_disk = -1;
+ }
+ if (!rdev->faulty && rdev->raid_disk >= 0 && !rdev->in_sync)
+ spares++;
+ }
+ if (mddev->degraded) {
+ ITERATE_RDEV(mddev,rdev,rtmp)
+ if (rdev->raid_disk < 0
+ && !rdev->faulty) {
+ if (mddev->pers->hot_add_disk(mddev,rdev))
+ spares++;
+ else
+ break;
+ }
+ }
+
+ if (!spares && (mddev->recovery_cp == MaxSector )) {
+ /* nothing we can do ... */
+ goto unlock;
+ }
+ if (mddev->pers->sync_request) {
+ set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
+ if (!spares)
+ set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
+ mddev->sync_thread = md_register_thread(md_do_sync,
+ mddev,
+ "md%d_resync");
+ if (!mddev->sync_thread) {
+ printk(KERN_ERR "md%d: could not start resync"
+ " thread...\n",
+ mdidx(mddev));
+ /* leave the spares where they are, it shouldn't hurt */
+ mddev->recovery = 0;
+ } else {
+ md_wakeup_thread(mddev->sync_thread);
+ }
+ }
+ unlock:
+ mddev_unlock(mddev);
+ }
+}
+
+int md_notify_reboot(struct notifier_block *this,
+ unsigned long code, void *x)
+{
+ struct list_head *tmp;
+ mddev_t *mddev;
+
+ if ((code == SYS_DOWN) || (code == SYS_HALT) || (code == SYS_POWER_OFF)) {
+
+ printk(KERN_INFO "md: stopping all md devices.\n");
+
+ ITERATE_MDDEV(mddev,tmp)
+ if (mddev_trylock(mddev)==0)
+ do_md_stop (mddev, 1);
+ /*
+ * certain more exotic SCSI devices are known to be
+ * volatile wrt too early system reboots. While the
+ * right place to handle this issue is the given
+ * driver, we do want to have a safe RAID driver ...
+ */
+ mdelay(1000*1);
+ }
+ return NOTIFY_DONE;
+}
+
+struct notifier_block md_notifier = {
+ .notifier_call = md_notify_reboot,
+ .next = NULL,
+ .priority = INT_MAX, /* before any real devices */
+};
+
+static void md_geninit(void)
+{
+ struct proc_dir_entry *p;
+
+ dprintk("md: sizeof(mdp_super_t) = %d\n", (int)sizeof(mdp_super_t));
+
+#ifdef CONFIG_PROC_FS
+ p = create_proc_entry("mdstat", S_IRUGO, NULL);
+ if (p)
+ p->proc_fops = &md_seq_fops;
+#endif
+}
+
+int __init md_init(void)
+{
+ int minor;
+
+ printk(KERN_INFO "md: md driver %d.%d.%d MAX_MD_DEVS=%d,"
+ " MD_SB_DISKS=%d\n",
+ MD_MAJOR_VERSION, MD_MINOR_VERSION,
+ MD_PATCHLEVEL_VERSION, MAX_MD_DEVS, MD_SB_DISKS);
+
+ if (register_blkdev(MAJOR_NR, "md"))
+ return -1;
+
+ devfs_mk_dir("md");
+ blk_register_region(MKDEV(MAJOR_NR, 0), MAX_MD_DEVS, THIS_MODULE,
+ md_probe, NULL, NULL);
+ for (minor=0; minor < MAX_MD_DEVS; ++minor) {
+ char name[16];
+ sprintf(name, "md/%d", minor);
+ devfs_register(NULL, name, DEVFS_FL_DEFAULT, MAJOR_NR, minor,
+ S_IFBLK | S_IRUSR | S_IWUSR, &md_fops, NULL);
+ }
+
+ register_reboot_notifier(&md_notifier);
+ raid_table_header = register_sysctl_table(raid_root_table, 1);
+
+ md_geninit();
+ return (0);
+}
+
+
+#ifndef MODULE
+
+/*
+ * Searches all registered partitions for autorun RAID arrays
+ * at boot time.
+ */
+static dev_t detected_devices[128];
+static int dev_cnt;
+
+void md_autodetect_dev(dev_t dev)
+{
+ if (dev_cnt >= 0 && dev_cnt < 127)
+ detected_devices[dev_cnt++] = dev;
+}
+
+
+static void autostart_arrays(void)
+{
+ mdk_rdev_t *rdev;
+ int i;
+
+ printk(KERN_INFO "md: Autodetecting RAID arrays.\n");
+
+ for (i = 0; i < dev_cnt; i++) {
+ dev_t dev = detected_devices[i];
+
+ rdev = md_import_device(dev,0, 0);
+ if (IS_ERR(rdev)) {
+ printk(KERN_ALERT "md: could not import %s!\n",
+ partition_name(dev));
+ continue;
+ }
+ if (rdev->faulty) {
+ MD_BUG();
+ continue;
+ }
+ list_add(&rdev->same_set, &pending_raid_disks);
+ }
+ dev_cnt = 0;
+
+ autorun_devices();
+}
+
+#endif
+
+static __exit void md_exit(void)
+{
+ int i;
+ blk_unregister_region(MKDEV(MAJOR_NR,0), MAX_MD_DEVS);
+ for (i=0; i < MAX_MD_DEVS; i++)
+ devfs_remove("md/%d", i);
+ devfs_remove("md");
+
+ unregister_blkdev(MAJOR_NR,"md");
+ unregister_reboot_notifier(&md_notifier);
+ unregister_sysctl_table(raid_table_header);
+#ifdef CONFIG_PROC_FS
+ remove_proc_entry("mdstat", NULL);
+#endif
+ for (i = 0; i < MAX_MD_DEVS; i++) {
+ struct gendisk *disk = disks[i];
+ mddev_t *mddev;
+ if (!disks[i])
+ continue;
+ mddev = disk->private_data;
+ del_gendisk(disk);
+ put_disk(disk);
+ mddev_put(mddev);
+ }
+}
+
+module_init(md_init)
+module_exit(md_exit)
+
+EXPORT_SYMBOL(register_md_personality);
+EXPORT_SYMBOL(unregister_md_personality);
+EXPORT_SYMBOL(md_error);
+EXPORT_SYMBOL(md_sync_acct);
+EXPORT_SYMBOL(md_done_sync);
+EXPORT_SYMBOL(md_write_start);
+EXPORT_SYMBOL(md_write_end);
+EXPORT_SYMBOL(md_handle_safemode);
+EXPORT_SYMBOL(md_register_thread);
+EXPORT_SYMBOL(md_unregister_thread);
+EXPORT_SYMBOL(md_wakeup_thread);
+EXPORT_SYMBOL(md_print_devices);
+EXPORT_SYMBOL(md_interrupt_thread);
+EXPORT_SYMBOL(md_check_recovery);
+MODULE_LICENSE("GPL");
diff --git a/tests/linux/md/merge b/tests/linux/md/merge
new file mode 100644
index 0000000..4238601
--- /dev/null
+++ b/tests/linux/md/merge
@@ -0,0 +1,3589 @@
+/*
+ md.c : Multiple Devices driver for Linux
+ Copyright (C) 1998, 1999, 2000 Ingo Molnar
+
+ completely rewritten, based on the MD driver code from Marc Zyngier
+
+ Changes:
+
+ - RAID-1/RAID-5 extensions by Miguel de Icaza, Gadi Oxman, Ingo Molnar
+ - boot support for linear and striped mode by Harald Hoyer <HarryH@Royal.Net>
+ - kerneld support by Boris Tobotras <boris@xtalk.msk.su>
+ - kmod support by: Cyrus Durgin
+ - RAID0 bugfixes: Mark Anthony Lisher <markal@iname.com>
+ - Devfs support by Richard Gooch <rgooch@atnf.csiro.au>
+
+ - lots of fixes and improvements to the RAID1/RAID5 and generic
+ RAID code (such as request based resynchronization):
+
+ Neil Brown <neilb@cse.unsw.edu.au>.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2, or (at your option)
+ any later version.
+
+ You should have received a copy of the GNU General Public License
+ (for example /usr/src/linux/COPYING); if not, write to the Free
+ Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+*/
+
+#include <linux/module.h>
+#include <linux/config.h>
+#include <linux/linkage.h>
+#include <linux/raid/md.h>
+#include <linux/sysctl.h>
+#include <linux/bio.h>
+#include <linux/devfs_fs_kernel.h>
+#include <linux/buffer_head.h> /* for invalidate_bdev */
+#include <linux/suspend.h>
+
+#include <linux/init.h>
+
+#ifdef CONFIG_KMOD
+#include <linux/kmod.h>
+#endif
+
+#define __KERNEL_SYSCALLS__
+#include <linux/unistd.h>
+
+#include <asm/unaligned.h>
+
+#define MAJOR_NR MD_MAJOR
+#define MD_DRIVER
+#define DEVICE_NR(device) (minor(device))
+
+#include <linux/blk.h>
+
+#define DEBUG 0
+#define dprintk(x...) ((void)(DEBUG && printk(x)))
+
+
+#ifndef MODULE
+static void autostart_arrays (void);
+#endif
+
+static mdk_personality_t *pers[MAX_PERSONALITY];
+static spinlock_t pers_lock = SPIN_LOCK_UNLOCKED;
+
+/*
+ * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit'
+ * is 1000 KB/sec, so the extra system load does not show up that much.
+ * Increase it if you want to have more _guaranteed_ speed. Note that
+ * the RAID driver will use the maximum available bandwidth if the IO
+ * subsystem is idle. There is also an 'absolute maximum' reconstruction
+ * speed limit - in case reconstruction slows down your system despite
+ * idle IO detection.
+ *
+ * you can change it via /proc/sys/dev/raid/speed_limit_min and _max.
+ */
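+
+/*
+ * For example, to guarantee at least 5000 KB/sec per disc for
+ * reconstruction even while other IO is in progress:
+ *
+ *	echo 5000 > /proc/sys/dev/raid/speed_limit_min
+ */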
+
+static int sysctl_speed_limit_min = 1000;
+static int sysctl_speed_limit_max = 200000;
+
+static struct ctl_table_header *raid_table_header;
+
+static ctl_table raid_table[] = {
+ {
+ .ctl_name = DEV_RAID_SPEED_LIMIT_MIN,
+ .procname = "speed_limit_min",
+ .data = &sysctl_speed_limit_min,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+ {
+ .ctl_name = DEV_RAID_SPEED_LIMIT_MAX,
+ .procname = "speed_limit_max",
+ .data = &sysctl_speed_limit_max,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+ { .ctl_name = 0 }
+};
+
+static ctl_table raid_dir_table[] = {
+ {
+ .ctl_name = DEV_RAID,
+ .procname = "raid",
+ .maxlen = 0,
+ .mode = 0555,
+ .child = raid_table,
+ },
+ { .ctl_name = 0 }
+};
+
+static ctl_table raid_root_table[] = {
+ {
+ .ctl_name = CTL_DEV,
+ .procname = "dev",
+ .maxlen = 0,
+ .mode = 0555,
+ .child = raid_dir_table,
+ },
+ { .ctl_name = 0 }
+};
+
+static struct block_device_operations md_fops;
+
+static struct gendisk *disks[MAX_MD_DEVS];
+
+/*
+ * Lets us iterate over all existing md arrays.
+ * all_mddevs_lock protects this list as well as mddev_map.
+ */
+static LIST_HEAD(all_mddevs);
+static spinlock_t all_mddevs_lock = SPIN_LOCK_UNLOCKED;
+
+
+/*
+ * iterates through all used mddevs in the system.
+ * We take care to grab the all_mddevs_lock whenever navigating
+ * the list, and to always hold a refcount when unlocked.
+ * Any code which breaks out of this loop still owns a reference
+ * to the current mddev and must mddev_put it.
+ */
+#define ITERATE_MDDEV(mddev,tmp) \
+ \
+ for (({ spin_lock(&all_mddevs_lock); \
+ tmp = all_mddevs.next; \
+ mddev = NULL;}); \
+ ({ if (tmp != &all_mddevs) \
+ mddev_get(list_entry(tmp, mddev_t, all_mddevs));\
+ spin_unlock(&all_mddevs_lock); \
+ if (mddev) mddev_put(mddev); \
+ mddev = list_entry(tmp, mddev_t, all_mddevs); \
+ tmp != &all_mddevs;}); \
+ ({ spin_lock(&all_mddevs_lock); \
+ tmp = tmp->next;}) \
+ )
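+
+/*
+ * Typical use of ITERATE_MDDEV, e.g. (as in the reboot notifier)
+ * to stop every array:
+ *
+ *	ITERATE_MDDEV(mddev,tmp)
+ *		if (mddev_trylock(mddev)==0)
+ *			do_md_stop(mddev, 1);
+ */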
+
+static mddev_t *mddev_map[MAX_MD_DEVS];
+
+static int md_fail_request (request_queue_t *q, struct bio *bio)
+{
+ bio_io_error(bio, bio->bi_size);
+ return 0;
+}
+
+static inline mddev_t *mddev_get(mddev_t *mddev)
+{
+ atomic_inc(&mddev->active);
+ return mddev;
+}
+
+static void mddev_put(mddev_t *mddev)
+{
+ if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock))
+ return;
+ if (!mddev->raid_disks && list_empty(&mddev->disks)) {
+ list_del(&mddev->all_mddevs);
+ mddev_map[mdidx(mddev)] = NULL;
+ kfree(mddev);
+ MOD_DEC_USE_COUNT;
+ }
+ spin_unlock(&all_mddevs_lock);
+}
+
+static mddev_t * mddev_find(int unit)
+{
+ mddev_t *mddev, *new = NULL;
+
+ retry:
+ spin_lock(&all_mddevs_lock);
+ if (mddev_map[unit]) {
+ mddev = mddev_get(mddev_map[unit]);
+ spin_unlock(&all_mddevs_lock);
+ if (new)
+ kfree(new);
+ return mddev;
+ }
+ if (new) {
+ mddev_map[unit] = new;
+ list_add(&new->all_mddevs, &all_mddevs);
+ spin_unlock(&all_mddevs_lock);
+ MOD_INC_USE_COUNT;
+ return new;
+ }
+ spin_unlock(&all_mddevs_lock);
+
+ new = (mddev_t *) kmalloc(sizeof(*new), GFP_KERNEL);
+ if (!new)
+ return NULL;
+
+ memset(new, 0, sizeof(*new));
+
+ new->__minor = unit;
+ init_MUTEX(&new->reconfig_sem);
+ INIT_LIST_HEAD(&new->disks);
+ INIT_LIST_HEAD(&new->all_mddevs);
+ init_timer(&new->safemode_timer);
+ atomic_set(&new->active, 1);
+ blk_queue_make_request(&new->queue, md_fail_request);
+
+ goto retry;
+}
+
+static inline int mddev_lock(mddev_t * mddev)
+{
+ return down_interruptible(&mddev->reconfig_sem);
+}
+
+static inline void mddev_lock_uninterruptible(mddev_t * mddev)
+{
+ down(&mddev->reconfig_sem);
+}
+
+static inline int mddev_trylock(mddev_t * mddev)
+{
+ return down_trylock(&mddev->reconfig_sem);
+}
+
+static inline void mddev_unlock(mddev_t * mddev)
+{
+ up(&mddev->reconfig_sem);
+}
+
+mdk_rdev_t * find_rdev_nr(mddev_t *mddev, int nr)
+{
+ mdk_rdev_t * rdev;
+ struct list_head *tmp;
+
+ ITERATE_RDEV(mddev,rdev,tmp) {
+ if (rdev->desc_nr == nr)
+ return rdev;
+ }
+ return NULL;
+}
+
+static mdk_rdev_t * find_rdev(mddev_t * mddev, dev_t dev)
+{
+ struct list_head *tmp;
+ mdk_rdev_t *rdev;
+
+ ITERATE_RDEV(mddev,rdev,tmp) {
+ if (rdev->bdev->bd_dev == dev)
+ return rdev;
+ }
+ return NULL;
+}
+
+inline static sector_t calc_dev_sboffset(struct block_device *bdev)
+{
+ sector_t size = bdev->bd_inode->i_size >> BLOCK_SIZE_BITS;
+ return MD_NEW_SIZE_BLOCKS(size);
+}
+
+static sector_t calc_dev_size(mdk_rdev_t *rdev, unsigned chunk_size)
+{
+ sector_t size;
+
+ size = rdev->sb_offset;
+
+ if (chunk_size)
+ size &= ~((sector_t)chunk_size/1024 - 1);
+ return size;
+}
+
+static int alloc_disk_sb(mdk_rdev_t * rdev)
+{
+ if (rdev->sb_page)
+ MD_BUG();
+
+ rdev->sb_page = alloc_page(GFP_KERNEL);
+ if (!rdev->sb_page) {
+ printk(KERN_ALERT "md: out of memory.\n");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static void free_disk_sb(mdk_rdev_t * rdev)
+{
+ if (rdev->sb_page) {
+ page_cache_release(rdev->sb_page);
+ rdev->sb_loaded = 0;
+ rdev->sb_page = NULL;
+ rdev->sb_offset = 0;
+ rdev->size = 0;
+ }
+}
+
+
+static int bi_complete(struct bio *bio, unsigned int bytes_done, int error)
+{
+ if (bio->bi_size)
+ return 1;
+
+ complete((struct completion*)bio->bi_private);
+ return 0;
+}
+
+static int sync_page_io(struct block_device *bdev, sector_t sector, int size,
+ struct page *page, int rw)
+{
+ struct bio bio;
+ struct bio_vec vec;
+ struct completion event;
+
+ bio_init(&bio);
+ bio.bi_io_vec = &vec;
+ vec.bv_page = page;
+ vec.bv_len = size;
+ vec.bv_offset = 0;
+ bio.bi_vcnt = 1;
+ bio.bi_idx = 0;
+ bio.bi_size = size;
+ bio.bi_bdev = bdev;
+ bio.bi_sector = sector;
+ init_completion(&event);
+ bio.bi_private = &event;
+ bio.bi_end_io = bi_complete;
+ submit_bio(rw, &bio);
+ blk_run_queues();
+ wait_for_completion(&event);
+
+ return test_bit(BIO_UPTODATE, &bio.bi_flags);
+}
+
+static int read_disk_sb(mdk_rdev_t * rdev)
+{
+
+ if (!rdev->sb_page) {
+ MD_BUG();
+ return -EINVAL;
+ }
+ if (rdev->sb_loaded)
+ return 0;
+
+
+ if (!sync_page_io(rdev->bdev, rdev->sb_offset<<1, MD_SB_BYTES, rdev->sb_page, READ))
+ goto fail;
+ rdev->sb_loaded = 1;
+ return 0;
+
+fail:
+ printk(KERN_ERR "md: disabled device %s, could not read superblock.\n",
+ bdev_partition_name(rdev->bdev));
+ return -EINVAL;
+}
+
+static int uuid_equal(mdp_super_t *sb1, mdp_super_t *sb2)
+{
+ if ( (sb1->set_uuid0 == sb2->set_uuid0) &&
+ (sb1->set_uuid1 == sb2->set_uuid1) &&
+ (sb1->set_uuid2 == sb2->set_uuid2) &&
+ (sb1->set_uuid3 == sb2->set_uuid3))
+
+ return 1;
+
+ return 0;
+}
+
+
+static int sb_equal(mdp_super_t *sb1, mdp_super_t *sb2)
+{
+ int ret;
+ mdp_super_t *tmp1, *tmp2;
+
+ tmp1 = kmalloc(sizeof(*tmp1),GFP_KERNEL);
+ tmp2 = kmalloc(sizeof(*tmp2),GFP_KERNEL);
+
+ if (!tmp1 || !tmp2) {
+ ret = 0;
+ printk(KERN_INFO "md.c: sb1 is not equal to sb2!\n");
+ goto abort;
+ }
+
+ *tmp1 = *sb1;
+ *tmp2 = *sb2;
+
+ /*
+ * nr_disks is not constant
+ */
+ tmp1->nr_disks = 0;
+ tmp2->nr_disks = 0;
+
+ if (memcmp(tmp1, tmp2, MD_SB_GENERIC_CONSTANT_WORDS * 4))
+ ret = 0;
+ else
+ ret = 1;
+
+abort:
+ if (tmp1)
+ kfree(tmp1);
+ if (tmp2)
+ kfree(tmp2);
+
+ return ret;
+}
+
+static unsigned int calc_sb_csum(mdp_super_t * sb)
+{
+ unsigned int disk_csum, csum;
+
+ disk_csum = sb->sb_csum;
+ sb->sb_csum = 0;
+ csum = csum_partial((void *)sb, MD_SB_BYTES, 0);
+ sb->sb_csum = disk_csum;
+ return csum;
+}
+
+/*
+ * Handle superblock details.
+ * We want to be able to handle multiple superblock formats
+ * so we have a common interface to them all, and an array of
+ * different handlers.
+ * We rely on user-space to write the initial superblock, and support
+ * reading and updating of superblocks.
+ * Interface methods are:
+ * int load_super(mdk_rdev_t *dev, mdk_rdev_t *refdev, int minor_version)
+ * loads and validates a superblock on dev.
+ * if refdev != NULL, compare superblocks on both devices
+ * Return:
+ * 0 - dev has a superblock that is compatible with refdev
+ * 1 - dev has a superblock that is compatible and newer than refdev
+ * so dev should be used as the refdev in future
+ * -EINVAL superblock incompatible or invalid
+ * -othererror e.g. -EIO
+ *
+ * int validate_super(mddev_t *mddev, mdk_rdev_t *dev)
+ * Verify that dev is acceptable into mddev.
+ * The first time, mddev->raid_disks will be 0, and data from
+ * dev should be merged in. Subsequent calls check that dev
+ * is new enough. Return 0 or -EINVAL
+ *
+ * void sync_super(mddev_t *mddev, mdk_rdev_t *dev)
+ * Update the superblock for rdev with data in mddev
+ * This does not write to disc.
+ *
+ */
+
+struct super_type {
+ char *name;
+ struct module *owner;
+ int (*load_super)(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version);
+ int (*validate_super)(mddev_t *mddev, mdk_rdev_t *rdev);
+ void (*sync_super)(mddev_t *mddev, mdk_rdev_t *rdev);
+};
+
+/*
+ * load_super for 0.90.0
+ */
+static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
+{
+ mdp_super_t *sb;
+ int ret;
+ sector_t sb_offset;
+
+ /*
+ * Calculate the position of the superblock,
+ * it's at the end of the disk.
+ *
+ * It also happens to be a multiple of 4Kb.
+ */
+ sb_offset = calc_dev_sboffset(rdev->bdev);
+ rdev->sb_offset = sb_offset;
+
+ ret = read_disk_sb(rdev);
+ if (ret) return ret;
+
+ ret = -EINVAL;
+
+ sb = (mdp_super_t*)page_address(rdev->sb_page);
+
+ if (sb->md_magic != MD_SB_MAGIC) {
+ printk(KERN_ERR "md: invalid raid superblock magic on %s\n",
+ bdev_partition_name(rdev->bdev));
+ goto abort;
+ }
+
+ if (sb->major_version != 0 ||
+ sb->minor_version != 90) {
+ printk(KERN_WARNING "Bad version number %d.%d on %s\n",
+ sb->major_version, sb->minor_version,
+ bdev_partition_name(rdev->bdev));
+ goto abort;
+ }
+
+ if (sb->md_minor >= MAX_MD_DEVS) {
+ printk(KERN_ERR "md: %s: invalid raid minor (%x)\n",
+ bdev_partition_name(rdev->bdev), sb->md_minor);
+ goto abort;
+ }
+ if (sb->raid_disks <= 0)
+ goto abort;
+
+ if (calc_sb_csum(sb) != sb->sb_csum) {
+ printk(KERN_WARNING "md: invalid superblock checksum on %s\n",
+ bdev_partition_name(rdev->bdev));
+ goto abort;
+ }
+
+ rdev->preferred_minor = sb->md_minor;
+ rdev->data_offset = 0;
+
+ if (sb->level == MULTIPATH)
+ rdev->desc_nr = -1;
+ else
+ rdev->desc_nr = sb->this_disk.number;
+
+ if (refdev == 0)
+ ret = 1;
+ else {
+ __u64 ev1, ev2;
+ mdp_super_t *refsb = (mdp_super_t*)page_address(refdev->sb_page);
+ if (!uuid_equal(refsb, sb)) {
+ printk(KERN_WARNING "md: %s has different UUID to %s\n",
+ bdev_partition_name(rdev->bdev),
+ bdev_partition_name(refdev->bdev));
+ goto abort;
+ }
+ if (!sb_equal(refsb, sb)) {
+ printk(KERN_WARNING "md: %s has same UUID"
+ " but different superblock to %s\n",
+ bdev_partition_name(rdev->bdev),
+ bdev_partition_name(refdev->bdev));
+ goto abort;
+ }
+ ev1 = md_event(sb);
+ ev2 = md_event(refsb);
+ if (ev1 > ev2)
+ ret = 1;
+ else
+ ret = 0;
+ }
+ rdev->size = calc_dev_size(rdev, sb->chunk_size);
+
+ abort:
+ return ret;
+}
+
+/*
+ * validate_super for 0.90.0
+ */
+static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev)
+{
+ mdp_disk_t *desc;
+ mdp_super_t *sb = (mdp_super_t *)page_address(rdev->sb_page);
+
+ if (mddev->raid_disks == 0) {
+ mddev->major_version = 0;
+ mddev->minor_version = sb->minor_version;
+ mddev->patch_version = sb->patch_version;
+ mddev->persistent = ! sb->not_persistent;
+ mddev->chunk_size = sb->chunk_size;
+ mddev->ctime = sb->ctime;
+ mddev->utime = sb->utime;
+ mddev->level = sb->level;
+ mddev->layout = sb->layout;
+ mddev->raid_disks = sb->raid_disks;
+ mddev->size = sb->size;
+ mddev->events = md_event(sb);
+
+ if (sb->state & (1<<MD_SB_CLEAN))
+ mddev->recovery_cp = MaxSector;
+ else {
+ if (sb->events_hi == sb->cp_events_hi &&
+ sb->events_lo == sb->cp_events_lo) {
+ mddev->recovery_cp = sb->recovery_cp;
+ } else
+ mddev->recovery_cp = 0;
+ }
+
+ memcpy(mddev->uuid+0, &sb->set_uuid0, 4);
+ memcpy(mddev->uuid+4, &sb->set_uuid1, 4);
+ memcpy(mddev->uuid+8, &sb->set_uuid2, 4);
+ memcpy(mddev->uuid+12,&sb->set_uuid3, 4);
+
+ mddev->max_disks = MD_SB_DISKS;
+ } else {
+ __u64 ev1;
+ ev1 = md_event(sb);
+ ++ev1;
+ if (ev1 < mddev->events)
+ return -EINVAL;
+ }
+ if (mddev->level != LEVEL_MULTIPATH) {
+ rdev->raid_disk = -1;
+ rdev->in_sync = rdev->faulty = 0;
+ desc = sb->disks + rdev->desc_nr;
+
+ if (desc->state & (1<<MD_DISK_FAULTY))
+ rdev->faulty = 1;
+ else if (desc->state & (1<<MD_DISK_SYNC) &&
+ desc->raid_disk < mddev->raid_disks) {
+ rdev->in_sync = 1;
+ rdev->raid_disk = desc->raid_disk;
+ }
+ }
+ return 0;
+}
+
+/*
+ * sync_super for 0.90.0
+ */
+static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev)
+{
+ mdp_super_t *sb;
+ struct list_head *tmp;
+ mdk_rdev_t *rdev2;
+ int next_spare = mddev->raid_disks;
+
+ /* make rdev->sb match mddev data..
+ *
+ * 1/ zero out disks
+ * 2/ Add info for each disk, keeping track of highest desc_nr
+ * 3/ any empty disks < highest become removed
+ *
+ * disks[0] gets initialised to REMOVED because
+ * we cannot be sure from other fields if it has
+ * been initialised or not.
+ */
+ int highest = 0;
+ int i;
+ int active=0, working=0,failed=0,spare=0,nr_disks=0;
+
+ sb = (mdp_super_t*)page_address(rdev->sb_page);
+
+ memset(sb, 0, sizeof(*sb));
+
+ sb->md_magic = MD_SB_MAGIC;
+ sb->major_version = mddev->major_version;
+ sb->minor_version = mddev->minor_version;
+ sb->patch_version = mddev->patch_version;
+ sb->gvalid_words = 0; /* ignored */
+ memcpy(&sb->set_uuid0, mddev->uuid+0, 4);
+ memcpy(&sb->set_uuid1, mddev->uuid+4, 4);
+ memcpy(&sb->set_uuid2, mddev->uuid+8, 4);
+ memcpy(&sb->set_uuid3, mddev->uuid+12,4);
+
+ sb->ctime = mddev->ctime;
+ sb->level = mddev->level;
+ sb->size = mddev->size;
+ sb->raid_disks = mddev->raid_disks;
+ sb->md_minor = mddev->__minor;
+ sb->not_persistent = !mddev->persistent;
+ sb->utime = mddev->utime;
+ sb->state = 0;
+ sb->events_hi = (mddev->events>>32);
+ sb->events_lo = (u32)mddev->events;
+
+ if (mddev->in_sync)
+ {
+ sb->recovery_cp = mddev->recovery_cp;
+ sb->cp_events_hi = (mddev->events>>32);
+ sb->cp_events_lo = (u32)mddev->events;
+ if (mddev->recovery_cp == MaxSector)
+ sb->state = (1<< MD_SB_CLEAN);
+ } else
+ sb->recovery_cp = 0;
+
+ sb->layout = mddev->layout;
+ sb->chunk_size = mddev->chunk_size;
+
+ sb->disks[0].state = (1<<MD_DISK_REMOVED);
+ ITERATE_RDEV(mddev,rdev2,tmp) {
+ mdp_disk_t *d;
+ if (rdev2->raid_disk >= 0 && rdev2->in_sync && !rdev2->faulty)
+ rdev2->desc_nr = rdev2->raid_disk;
+ else
+ rdev2->desc_nr = next_spare++;
+ d = &sb->disks[rdev2->desc_nr];
+ nr_disks++;
+ d->number = rdev2->desc_nr;
+ d->major = MAJOR(rdev2->bdev->bd_dev);
+ d->minor = MINOR(rdev2->bdev->bd_dev);
+		if (rdev2->raid_disk >= 0 && rdev2->in_sync && !rdev2->faulty)
+ d->raid_disk = rdev2->raid_disk;
+ else
+ d->raid_disk = rdev2->desc_nr; /* compatibility */
+ if (rdev2->faulty) {
+ d->state = (1<<MD_DISK_FAULTY);
+ failed++;
+ } else if (rdev2->in_sync) {
+ d->state = (1<<MD_DISK_ACTIVE);
+ d->state |= (1<<MD_DISK_SYNC);
+ active++;
+ working++;
+ } else {
+ d->state = 0;
+ spare++;
+ working++;
+ }
+ if (rdev2->desc_nr > highest)
+ highest = rdev2->desc_nr;
+ }
+
+ /* now set the "removed" bit on any non-trailing holes */
+ for (i=0; i<highest; i++) {
+ mdp_disk_t *d = &sb->disks[i];
+ if (d->state == 0 && d->number == 0) {
+ d->number = i;
+ d->raid_disk = i;
+ d->state = (1<<MD_DISK_REMOVED);
+ }
+ }
+ sb->nr_disks = nr_disks;
+ sb->active_disks = active;
+ sb->working_disks = working;
+ sb->failed_disks = failed;
+ sb->spare_disks = spare;
+
+ sb->this_disk = sb->disks[rdev->desc_nr];
+ sb->sb_csum = calc_sb_csum(sb);
+}
+
+/*
+ * version 1 superblock
+ */
+
+static unsigned int calc_sb_1_csum(struct mdp_superblock_1 * sb)
+{
+ unsigned int disk_csum, csum;
+ int size = 256 + sb->max_dev*2;
+
+ disk_csum = sb->sb_csum;
+ sb->sb_csum = 0;
+ csum = csum_partial((void *)sb, size, 0);
+ sb->sb_csum = disk_csum;
+ return csum;
+}
+
+static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
+{
+ struct mdp_superblock_1 *sb;
+ int ret;
+ sector_t sb_offset;
+
+ /*
+ * Calculate the position of the superblock.
+ * It is always aligned to a 4K boundary and
+	 * depending on minor_version, it can be:
+ * 0: At least 8K, but less than 12K, from end of device
+ * 1: At start of device
+ * 2: 4K from start of device.
+ */
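+	/*
+	 * For example, with minor_version 0 on a 10MiB (20480 sector)
+	 * device: 20480 - 16 = 20464 sectors, already 4K aligned, so
+	 * sb_offset = 10232 (in K) and the superblock sits 8K below the
+	 * end of the device.
+	 */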
+ switch(minor_version) {
+ case 0:
+ sb_offset = rdev->bdev->bd_inode->i_size >> 9;
+ sb_offset -= 8*2;
+ sb_offset &= ~(4*2);
+ /* convert from sectors to K */
+ sb_offset /= 2;
+ break;
+ case 1:
+ sb_offset = 0;
+ break;
+ case 2:
+ sb_offset = 4;
+ break;
+ default:
+ return -EINVAL;
+ }
+ rdev->sb_offset = sb_offset;
+
+ ret = read_disk_sb(rdev);
+ if (ret) return ret;
+
+
+ sb = (struct mdp_superblock_1*)page_address(rdev->sb_page);
+
+ if (sb->magic != cpu_to_le32(MD_SB_MAGIC) ||
+ sb->major_version != cpu_to_le32(1) ||
+ le32_to_cpu(sb->max_dev) > (4096-256)/2 ||
+ le64_to_cpu(sb->super_offset) != (rdev->sb_offset<<1) ||
+ sb->feature_map != 0)
+ return -EINVAL;
+
+ if (calc_sb_1_csum(sb) != sb->sb_csum) {
+ printk("md: invalid superblock checksum on %s\n",
+ bdev_partition_name(rdev->bdev));
+ return -EINVAL;
+ }
+ rdev->preferred_minor = 0xffff;
+ rdev->data_offset = le64_to_cpu(sb->data_offset);
+
+ if (refdev == 0)
+ return 1;
+ else {
+ __u64 ev1, ev2;
+ struct mdp_superblock_1 *refsb =
+ (struct mdp_superblock_1*)page_address(refdev->sb_page);
+
+ if (memcmp(sb->set_uuid, refsb->set_uuid, 16) != 0 ||
+ sb->level != refsb->level ||
+ sb->layout != refsb->layout ||
+ sb->chunksize != refsb->chunksize) {
+ printk(KERN_WARNING "md: %s has strangely different"
+ " superblock to %s\n",
+ bdev_partition_name(rdev->bdev),
+ bdev_partition_name(refdev->bdev));
+ return -EINVAL;
+ }
+ ev1 = le64_to_cpu(sb->events);
+ ev2 = le64_to_cpu(refsb->events);
+
+ if (ev1 > ev2)
+ return 1;
+ }
+ if (minor_version)
+ rdev->size = ((rdev->bdev->bd_inode->i_size>>9) - le64_to_cpu(sb->data_offset)) / 2;
+ else
+ rdev->size = rdev->sb_offset;
+ if (rdev->size < le64_to_cpu(sb->data_size)/2)
+ return -EINVAL;
+ rdev->size = le64_to_cpu(sb->data_size)/2;
+ if (le32_to_cpu(sb->chunksize))
+ rdev->size &= ~((sector_t)le32_to_cpu(sb->chunksize)/2 - 1);
+ return 0;
+}
+
+static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev)
+{
+ struct mdp_superblock_1 *sb = (struct mdp_superblock_1*)page_address(rdev->sb_page);
+
+ if (mddev->raid_disks == 0) {
+ mddev->major_version = 1;
+ mddev->minor_version = 0;
+ mddev->patch_version = 0;
+ mddev->persistent = 1;
+ mddev->chunk_size = le32_to_cpu(sb->chunksize) << 9;
+ mddev->ctime = le64_to_cpu(sb->ctime) & ((1ULL << 32)-1);
+ mddev->utime = le64_to_cpu(sb->utime) & ((1ULL << 32)-1);
+ mddev->level = le32_to_cpu(sb->level);
+ mddev->layout = le32_to_cpu(sb->layout);
+ mddev->raid_disks = le32_to_cpu(sb->raid_disks);
+ mddev->size = (u32)le64_to_cpu(sb->size);
+ mddev->events = le64_to_cpu(sb->events);
+
+ mddev->recovery_cp = le64_to_cpu(sb->resync_offset);
+ memcpy(mddev->uuid, sb->set_uuid, 16);
+
+ mddev->max_disks = (4096-256)/2;
+ } else {
+ __u64 ev1;
+ ev1 = le64_to_cpu(sb->events);
+ ++ev1;
+ if (ev1 < mddev->events)
+ return -EINVAL;
+ }
+
+ if (mddev->level != LEVEL_MULTIPATH) {
+ int role;
+ rdev->desc_nr = le32_to_cpu(sb->dev_number);
+ role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]);
+ switch(role) {
+ case 0xffff: /* spare */
+ rdev->in_sync = 0;
+ rdev->faulty = 0;
+ rdev->raid_disk = -1;
+ break;
+ case 0xfffe: /* faulty */
+ rdev->in_sync = 0;
+ rdev->faulty = 1;
+ rdev->raid_disk = -1;
+ break;
+ default:
+ rdev->in_sync = 1;
+ rdev->faulty = 0;
+ rdev->raid_disk = role;
+ break;
+ }
+ }
+ return 0;
+}
+
+static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev)
+{
+ struct mdp_superblock_1 *sb;
+ struct list_head *tmp;
+ mdk_rdev_t *rdev2;
+ int max_dev, i;
+ /* make rdev->sb match mddev and rdev data. */
+
+ sb = (struct mdp_superblock_1*)page_address(rdev->sb_page);
+
+ sb->feature_map = 0;
+ sb->pad0 = 0;
+ memset(sb->pad1, 0, sizeof(sb->pad1));
+ memset(sb->pad2, 0, sizeof(sb->pad2));
+ memset(sb->pad3, 0, sizeof(sb->pad3));
+
+ sb->utime = cpu_to_le64((__u64)mddev->utime);
+ sb->events = cpu_to_le64(mddev->events);
+ if (mddev->in_sync)
+ sb->resync_offset = cpu_to_le64(mddev->recovery_cp);
+ else
+ sb->resync_offset = cpu_to_le64(0);
+
+ max_dev = 0;
+ ITERATE_RDEV(mddev,rdev2,tmp)
+ if (rdev2->desc_nr > max_dev)
+ max_dev = rdev2->desc_nr;
+
+ sb->max_dev = max_dev;
+ for (i=0; i<max_dev;i++)
+		sb->dev_roles[i] = cpu_to_le16(0xfffe);
+
+ ITERATE_RDEV(mddev,rdev2,tmp) {
+ i = rdev2->desc_nr;
+ if (rdev2->faulty)
+ sb->dev_roles[i] = cpu_to_le16(0xfffe);
+ else if (rdev2->in_sync)
+ sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk);
+ else
+ sb->dev_roles[i] = cpu_to_le16(0xffff);
+ }
+
+ sb->recovery_offset = cpu_to_le64(0); /* not supported yet */
+}
+
+
+struct super_type super_types[] = {
+ [0] = {
+ .name = "0.90.0",
+ .owner = THIS_MODULE,
+ .load_super = super_90_load,
+ .validate_super = super_90_validate,
+ .sync_super = super_90_sync,
+ },
+ [1] = {
+ .name = "md-1",
+ .owner = THIS_MODULE,
+ .load_super = super_1_load,
+ .validate_super = super_1_validate,
+ .sync_super = super_1_sync,
+ },
+};
+
+static mdk_rdev_t * match_dev_unit(mddev_t *mddev, mdk_rdev_t *dev)
+{
+ struct list_head *tmp;
+ mdk_rdev_t *rdev;
+
+ ITERATE_RDEV(mddev,rdev,tmp)
+ if (rdev->bdev->bd_contains == dev->bdev->bd_contains)
+ return rdev;
+
+ return NULL;
+}
+
+static int match_mddev_units(mddev_t *mddev1, mddev_t *mddev2)
+{
+ struct list_head *tmp;
+ mdk_rdev_t *rdev;
+
+ ITERATE_RDEV(mddev1,rdev,tmp)
+ if (match_dev_unit(mddev2, rdev))
+ return 1;
+
+ return 0;
+}
+
+static LIST_HEAD(pending_raid_disks);
+
+static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev)
+{
+ mdk_rdev_t *same_pdev;
+
+ if (rdev->mddev) {
+ MD_BUG();
+ return -EINVAL;
+ }
+ same_pdev = match_dev_unit(mddev, rdev);
+ if (same_pdev)
+ printk(KERN_WARNING
+ "md%d: WARNING: %s appears to be on the same physical"
+			" disk as %s. True protection against single-disk"
+ " failure might be compromised.\n",
+ mdidx(mddev), bdev_partition_name(rdev->bdev),
+ bdev_partition_name(same_pdev->bdev));
+
+ /* Verify rdev->desc_nr is unique.
+ * If it is -1, assign a free number, else
+ * check number is not in use
+ */
+ if (rdev->desc_nr < 0) {
+ int choice = 0;
+ if (mddev->pers) choice = mddev->raid_disks;
+ while (find_rdev_nr(mddev, choice))
+ choice++;
+ rdev->desc_nr = choice;
+ } else {
+ if (find_rdev_nr(mddev, rdev->desc_nr))
+ return -EBUSY;
+ }
+
+ list_add(&rdev->same_set, &mddev->disks);
+ rdev->mddev = mddev;
+ printk(KERN_INFO "md: bind<%s>\n", bdev_partition_name(rdev->bdev));
+ return 0;
+}
+
+static void unbind_rdev_from_array(mdk_rdev_t * rdev)
+{
+ if (!rdev->mddev) {
+ MD_BUG();
+ return;
+ }
+ list_del_init(&rdev->same_set);
+ printk(KERN_INFO "md: unbind<%s>\n", bdev_partition_name(rdev->bdev));
+ rdev->mddev = NULL;
+}
+
+/*
+ * prevent the device from being mounted, repartitioned or
+ * otherwise reused by a RAID array (or any other kernel
+ * subsystem), by opening the device. [simply getting an
+ * inode is not enough, the SCSI module usage code needs
+ * an explicit open() on the device]
+ */
+static int lock_rdev(mdk_rdev_t *rdev, dev_t dev)
+{
+ int err = 0;
+ struct block_device *bdev;
+
+ bdev = bdget(dev);
+ if (!bdev)
+ return -ENOMEM;
+ err = blkdev_get(bdev, FMODE_READ|FMODE_WRITE, 0, BDEV_RAW);
+ if (err)
+ return err;
+ err = bd_claim(bdev, rdev);
+ if (err) {
+ blkdev_put(bdev, BDEV_RAW);
+ return err;
+ }
+ rdev->bdev = bdev;
+ return err;
+}
+
+static void unlock_rdev(mdk_rdev_t *rdev)
+{
+ struct block_device *bdev = rdev->bdev;
+ rdev->bdev = NULL;
+ if (!bdev)
+ MD_BUG();
+ bd_release(bdev);
+ blkdev_put(bdev, BDEV_RAW);
+}
+
+void md_autodetect_dev(dev_t dev);
+
+static void export_rdev(mdk_rdev_t * rdev)
+{
+ printk(KERN_INFO "md: export_rdev(%s)\n",
+ bdev_partition_name(rdev->bdev));
+ if (rdev->mddev)
+ MD_BUG();
+ free_disk_sb(rdev);
+ list_del_init(&rdev->same_set);
+#ifndef MODULE
+ md_autodetect_dev(rdev->bdev->bd_dev);
+#endif
+ unlock_rdev(rdev);
+ kfree(rdev);
+}
+
+static void kick_rdev_from_array(mdk_rdev_t * rdev)
+{
+ unbind_rdev_from_array(rdev);
+ export_rdev(rdev);
+}
+
+static void export_array(mddev_t *mddev)
+{
+ struct list_head *tmp;
+ mdk_rdev_t *rdev;
+
+ ITERATE_RDEV(mddev,rdev,tmp) {
+ if (!rdev->mddev) {
+ MD_BUG();
+ continue;
+ }
+ kick_rdev_from_array(rdev);
+ }
+ if (!list_empty(&mddev->disks))
+ MD_BUG();
+ mddev->raid_disks = 0;
+ mddev->major_version = 0;
+}
+
+static void print_desc(mdp_disk_t *desc)
+{
+ printk(" DISK<N:%d,%s(%d,%d),R:%d,S:%d>\n", desc->number,
+ partition_name(MKDEV(desc->major,desc->minor)),
+ desc->major,desc->minor,desc->raid_disk,desc->state);
+}
+
+static void print_sb(mdp_super_t *sb)
+{
+ int i;
+
+ printk(KERN_INFO
+ "md: SB: (V:%d.%d.%d) ID:<%08x.%08x.%08x.%08x> CT:%08x\n",
+ sb->major_version, sb->minor_version, sb->patch_version,
+ sb->set_uuid0, sb->set_uuid1, sb->set_uuid2, sb->set_uuid3,
+ sb->ctime);
+ printk(KERN_INFO "md: L%d S%08d ND:%d RD:%d md%d LO:%d CS:%d\n",
+ sb->level, sb->size, sb->nr_disks, sb->raid_disks,
+ sb->md_minor, sb->layout, sb->chunk_size);
+ printk(KERN_INFO "md: UT:%08x ST:%d AD:%d WD:%d"
+ " FD:%d SD:%d CSUM:%08x E:%08lx\n",
+ sb->utime, sb->state, sb->active_disks, sb->working_disks,
+ sb->failed_disks, sb->spare_disks,
+ sb->sb_csum, (unsigned long)sb->events_lo);
+
+ printk(KERN_INFO);
+ for (i = 0; i < MD_SB_DISKS; i++) {
+ mdp_disk_t *desc;
+
+ desc = sb->disks + i;
+ if (desc->number || desc->major || desc->minor ||
+ desc->raid_disk || (desc->state && (desc->state != 4))) {
+ printk(" D %2d: ", i);
+ print_desc(desc);
+ }
+ }
+ printk(KERN_INFO "md: THIS: ");
+ print_desc(&sb->this_disk);
+
+}
+
+static void print_rdev(mdk_rdev_t *rdev)
+{
+ printk(KERN_INFO "md: rdev %s, SZ:%08llu F:%d S:%d DN:%d ",
+ bdev_partition_name(rdev->bdev), (unsigned long long)rdev->size,
+ rdev->faulty, rdev->in_sync, rdev->desc_nr);
+ if (rdev->sb_loaded) {
+ printk(KERN_INFO "md: rdev superblock:\n");
+ print_sb((mdp_super_t*)page_address(rdev->sb_page));
+ } else
+ printk(KERN_INFO "md: no rdev superblock!\n");
+}
+
+void md_print_devices(void)
+{
+ struct list_head *tmp, *tmp2;
+ mdk_rdev_t *rdev;
+ mddev_t *mddev;
+
+ printk("\n");
+ printk("md: **********************************\n");
+ printk("md: * <COMPLETE RAID STATE PRINTOUT> *\n");
+ printk("md: **********************************\n");
+ ITERATE_MDDEV(mddev,tmp) {
+ printk("md%d: ", mdidx(mddev));
+
+ ITERATE_RDEV(mddev,rdev,tmp2)
+ printk("<%s>", bdev_partition_name(rdev->bdev));
+
+ ITERATE_RDEV(mddev,rdev,tmp2)
+ print_rdev(rdev);
+ }
+ printk("md: **********************************\n");
+ printk("\n");
+}
+
+
+static int write_disk_sb(mdk_rdev_t * rdev)
+{
+
+ if (!rdev->sb_loaded) {
+ MD_BUG();
+ return 1;
+ }
+ if (rdev->faulty) {
+ MD_BUG();
+ return 1;
+ }
+
+ dprintk(KERN_INFO "(write) %s's sb offset: %llu\n",
+ bdev_partition_name(rdev->bdev),
+ (unsigned long long)rdev->sb_offset);
+
+ if (sync_page_io(rdev->bdev, rdev->sb_offset<<1, MD_SB_BYTES, rdev->sb_page, WRITE))
+ return 0;
+
+ printk("md: write_disk_sb failed for device %s\n",
+ bdev_partition_name(rdev->bdev));
+ return 1;
+}
+
+static void sync_sbs(mddev_t * mddev)
+{
+ mdk_rdev_t *rdev;
+ struct list_head *tmp;
+
+ ITERATE_RDEV(mddev,rdev,tmp) {
+ super_types[mddev->major_version].
+ sync_super(mddev, rdev);
+ rdev->sb_loaded = 1;
+ }
+}
+
+static void md_update_sb(mddev_t * mddev)
+{
+ int err, count = 100;
+ struct list_head *tmp;
+ mdk_rdev_t *rdev;
+
+ mddev->sb_dirty = 0;
+repeat:
+ mddev->utime = get_seconds();
+ mddev->events ++;
+
+ if (!mddev->events) {
+ /*
+ * oops, this 64-bit counter should never wrap.
+ * Either we are in around ~1 trillion A.C., assuming
+ * 1 reboot per second, or we have a bug:
+ */
+ MD_BUG();
+ mddev->events --;
+ }
+ sync_sbs(mddev);
+
+ /*
+ * do not write anything to disk if using
+ * nonpersistent superblocks
+ */
+ if (!mddev->persistent)
+ return;
+
+ dprintk(KERN_INFO
+ "md: updating md%d RAID superblock on device (in sync %d)\n",
+ mdidx(mddev),mddev->in_sync);
+
+ err = 0;
+ ITERATE_RDEV(mddev,rdev,tmp) {
+ dprintk(KERN_INFO "md: ");
+ if (rdev->faulty)
+ dprintk("(skipping faulty ");
+
+ dprintk("%s ", bdev_partition_name(rdev->bdev));
+ if (!rdev->faulty) {
+ err += write_disk_sb(rdev);
+ } else
+ dprintk(")\n");
+ if (!err && mddev->level == LEVEL_MULTIPATH)
+ /* only need to write one superblock... */
+ break;
+ }
+ if (err) {
+ if (--count) {
+ printk(KERN_ERR "md: errors occurred during superblock"
+ " update, repeating\n");
+ goto repeat;
+ }
+ printk(KERN_ERR \
+ "md: excessive errors occurred during superblock update, exiting\n");
+ }
+}
+
+/*
+ * Import a device. If 'super_format' >= 0, then sanity check the superblock
+ *
+ * mark the device faulty if:
+ *
+ * - the device is nonexistent (zero size)
+ * - the device has no valid superblock
+ *
+ * a faulty rdev _never_ has rdev->sb set.
+ */
+static mdk_rdev_t *md_import_device(dev_t newdev, int super_format, int super_minor)
+{
+ int err;
+ mdk_rdev_t *rdev;
+ sector_t size;
+
+ rdev = (mdk_rdev_t *) kmalloc(sizeof(*rdev), GFP_KERNEL);
+ if (!rdev) {
+ printk(KERN_ERR "md: could not alloc mem for %s!\n",
+ partition_name(newdev));
+ return ERR_PTR(-ENOMEM);
+ }
+ memset(rdev, 0, sizeof(*rdev));
+
+ if ((err = alloc_disk_sb(rdev)))
+ goto abort_free;
+
+ err = lock_rdev(rdev, newdev);
+ if (err) {
+ printk(KERN_ERR "md: could not lock %s.\n",
+ partition_name(newdev));
+ goto abort_free;
+ }
+ rdev->desc_nr = -1;
+ rdev->faulty = 0;
+ rdev->in_sync = 0;
+ rdev->data_offset = 0;
+ atomic_set(&rdev->nr_pending, 0);
+
+ size = rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS;
+ if (!size) {
+ printk(KERN_WARNING
+ "md: %s has zero or unknown size, marking faulty!\n",
+ bdev_partition_name(rdev->bdev));
+ err = -EINVAL;
+ goto abort_free;
+ }
+
+ if (super_format >= 0) {
+ err = super_types[super_format].
+ load_super(rdev, NULL, super_minor);
+ if (err == -EINVAL) {
+ printk(KERN_WARNING
+ "md: %s has invalid sb, not importing!\n",
+ bdev_partition_name(rdev->bdev));
+ goto abort_free;
+ }
+ if (err < 0) {
+ printk(KERN_WARNING
+ "md: could not read %s's sb, not importing!\n",
+ bdev_partition_name(rdev->bdev));
+ goto abort_free;
+ }
+ }
+ INIT_LIST_HEAD(&rdev->same_set);
+
+ return rdev;
+
+abort_free:
+ if (rdev->sb_page) {
+ if (rdev->bdev)
+ unlock_rdev(rdev);
+ free_disk_sb(rdev);
+ }
+ kfree(rdev);
+ return ERR_PTR(err);
+}
+
+/*
+ * Check a full RAID array for plausibility
+ */
+
+
+static int analyze_sbs(mddev_t * mddev)
+{
+ int i;
+ struct list_head *tmp;
+ mdk_rdev_t *rdev, *freshest;
+
+ freshest = NULL;
+ ITERATE_RDEV(mddev,rdev,tmp)
+ switch (super_types[mddev->major_version].
+ load_super(rdev, freshest, mddev->minor_version)) {
+ case 1:
+ freshest = rdev;
+ break;
+ case 0:
+ break;
+ default:
+ printk( KERN_ERR \
+ "md: fatal superblock inconsistency in %s"
+ " -- removing from array\n",
+ bdev_partition_name(rdev->bdev));
+ kick_rdev_from_array(rdev);
+ }
+
+
+ super_types[mddev->major_version].
+ validate_super(mddev, freshest);
+
+ i = 0;
+ ITERATE_RDEV(mddev,rdev,tmp) {
+ if (rdev != freshest)
+ if (super_types[mddev->major_version].
+ validate_super(mddev, rdev)) {
+ printk(KERN_WARNING "md: kicking non-fresh %s"
+ " from array!\n",
+ bdev_partition_name(rdev->bdev));
+ kick_rdev_from_array(rdev);
+ continue;
+ }
+ if (mddev->level == LEVEL_MULTIPATH) {
+ rdev->desc_nr = i++;
+ rdev->raid_disk = rdev->desc_nr;
+ rdev->in_sync = 1;
+ }
+ }
+
+
+ /*
+ * Check if we can support this RAID array
+ */
+ if (mddev->major_version != MD_MAJOR_VERSION ||
+ mddev->minor_version > MD_MINOR_VERSION) {
+ printk(KERN_ALERT
+ "md: md%d: unsupported raid array version %d.%d.%d\n",
+ mdidx(mddev), mddev->major_version,
+ mddev->minor_version, mddev->patch_version);
+ goto abort;
+ }
+
+ if ((mddev->recovery_cp != MaxSector) && ((mddev->level == 1) ||
+ (mddev->level == 4) || (mddev->level == 5)))
+ printk(KERN_ERR "md: md%d: raid array is not clean"
+ " -- starting background reconstruction\n",
+ mdidx(mddev));
+
+ return 0;
+abort:
+ return 1;
+}
+
+static struct gendisk *md_probe(dev_t dev, int *part, void *data)
+{
+ static DECLARE_MUTEX(disks_sem);
+ int unit = MINOR(dev);
+ mddev_t *mddev = mddev_find(unit);
+ struct gendisk *disk;
+
+ if (!mddev)
+ return NULL;
+
+ down(&disks_sem);
+ if (disks[unit]) {
+ up(&disks_sem);
+ mddev_put(mddev);
+ return NULL;
+ }
+ disk = alloc_disk(1);
+ if (!disk) {
+ up(&disks_sem);
+ mddev_put(mddev);
+ return NULL;
+ }
+ disk->major = MD_MAJOR;
+ disk->first_minor = mdidx(mddev);
+ sprintf(disk->disk_name, "md%d", mdidx(mddev));
+ disk->fops = &md_fops;
+ disk->private_data = mddev;
+ disk->queue = &mddev->queue;
+ add_disk(disk);
+ disks[mdidx(mddev)] = disk;
+ up(&disks_sem);
+ return NULL;
+}
+
+void md_wakeup_thread(mdk_thread_t *thread);
+
+static void md_safemode_timeout(unsigned long data)
+{
+ mddev_t *mddev = (mddev_t *) data;
+
+ mddev->safemode = 1;
+ md_wakeup_thread(mddev->thread);
+}
+
+
+static int do_md_run(mddev_t * mddev)
+{
+ int pnum, err;
+ int chunk_size;
+ struct list_head *tmp;
+ mdk_rdev_t *rdev;
+ struct gendisk *disk;
+
+ if (list_empty(&mddev->disks)) {
+ MD_BUG();
+ return -EINVAL;
+ }
+
+ if (mddev->pers)
+ return -EBUSY;
+
+ /*
+ * Analyze all RAID superblock(s)
+ */
+ if (!mddev->raid_disks && analyze_sbs(mddev)) {
+ MD_BUG();
+ return -EINVAL;
+ }
+
+ chunk_size = mddev->chunk_size;
+ pnum = level_to_pers(mddev->level);
+
+ if ((pnum != MULTIPATH) && (pnum != RAID1)) {
+ if (!chunk_size) {
+ /*
+ * 'default chunksize' in the old md code used to
+ * be PAGE_SIZE, baaad.
+ * we abort here to be on the safe side. We don't
+ * want to continue the bad practice.
+ */
+ printk(KERN_ERR
+ "no chunksize specified, see 'man raidtab'\n");
+ return -EINVAL;
+ }
+ if (chunk_size > MAX_CHUNK_SIZE) {
+ printk(KERN_ERR "too big chunk_size: %d > %d\n",
+ chunk_size, MAX_CHUNK_SIZE);
+ return -EINVAL;
+ }
+ /*
+		 * chunk-size has to be a power of 2 and a multiple of PAGE_SIZE
+ */
+ if ( (1 << ffz(~chunk_size)) != chunk_size) {
+ MD_BUG();
+ return -EINVAL;
+ }
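+		/*
+		 * Illustrative note (added; not in the original source):
+		 * ffz(~x) is the index of the lowest set bit of x, so the
+		 * expression above equals chunk_size only when chunk_size is
+		 * a power of two.  E.g. 65536 (64K) passes, while 98304 (96K)
+		 * gives 1 << 15 == 32768 and is rejected.
+		 */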
+ if (chunk_size < PAGE_SIZE) {
+ printk(KERN_ERR "too small chunk_size: %d < %ld\n",
+ chunk_size, PAGE_SIZE);
+ return -EINVAL;
+ }
+
+ /* devices must have minimum size of one chunk */
+ ITERATE_RDEV(mddev,rdev,tmp) {
+ if (rdev->faulty)
+ continue;
+ if (rdev->size < chunk_size / 1024) {
+ printk(KERN_WARNING
+ "md: Dev %s smaller than chunk_size:"
+ " %lluk < %dk\n",
+ bdev_partition_name(rdev->bdev),
+ (unsigned long long)rdev->size,
+ chunk_size / 1024);
+ return -EINVAL;
+ }
+ }
+ }
+ if (pnum >= MAX_PERSONALITY) {
+ MD_BUG();
+ return -EINVAL;
+ }
+
+#ifdef CONFIG_KMOD
+ if (!pers[pnum])
+ {
+ char module_name[80];
+ sprintf (module_name, "md-personality-%d", pnum);
+ request_module (module_name);
+ }
+#endif
+
+ /*
+ * Drop all container device buffers, from now on
+ * the only valid external interface is through the md
+ * device.
+ * Also find largest hardsector size
+ */
+ ITERATE_RDEV(mddev,rdev,tmp) {
+ if (rdev->faulty)
+ continue;
+ sync_blockdev(rdev->bdev);
+ invalidate_bdev(rdev->bdev, 0);
+ }
+
+ md_probe(mdidx(mddev), NULL, NULL);
+ disk = disks[mdidx(mddev)];
+ if (!disk)
+ return -ENOMEM;
+
+ spin_lock(&pers_lock);
+ if (!pers[pnum] || !try_module_get(pers[pnum]->owner)) {
+ spin_unlock(&pers_lock);
+ printk(KERN_ERR "md: personality %d is not loaded!\n",
+ pnum);
+ return -EINVAL;
+ }
+
+ mddev->pers = pers[pnum];
+ spin_unlock(&pers_lock);
+
+ blk_queue_make_request(&mddev->queue, mddev->pers->make_request);
+ printk("%s: setting max_sectors to %d, segment boundary to %d\n",
+ disk->disk_name,
+ chunk_size >> 9,
+ (chunk_size>>1)-1);
+ blk_queue_max_sectors(&mddev->queue, chunk_size >> 9);
+ blk_queue_segment_boundary(&mddev->queue, (chunk_size>>1) - 1);
+ mddev->queue.queuedata = mddev;
+
+ err = mddev->pers->run(mddev);
+ if (err) {
+ printk(KERN_ERR "md: pers->run() failed ...\n");
+ module_put(mddev->pers->owner);
+ mddev->pers = NULL;
+ return -EINVAL;
+ }
+ atomic_set(&mddev->writes_pending,0);
+ mddev->safemode = 0;
+ mddev->safemode_timer.function = md_safemode_timeout;
+ mddev->safemode_timer.data = (unsigned long) mddev;
+ mddev->safemode_delay = (20 * HZ)/1000 +1; /* 20 msec delay */
+ mddev->in_sync = 1;
+
+ set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+ md_wakeup_thread(mddev->thread);
+ set_capacity(disk, mddev->array_size<<1);
+ return 0;
+}
+
+static int restart_array(mddev_t *mddev)
+{
+ struct gendisk *disk = disks[mdidx(mddev)];
+ int err;
+
+ /*
+ * Complain if it has no devices
+ */
+ err = -ENXIO;
+ if (list_empty(&mddev->disks))
+ goto out;
+
+ if (mddev->pers) {
+ err = -EBUSY;
+ if (!mddev->ro)
+ goto out;
+
+ mddev->safemode = 0;
+ mddev->ro = 0;
+ set_disk_ro(disk, 0);
+
+ printk(KERN_INFO "md: md%d switched to read-write mode.\n",
+ mdidx(mddev));
+ /*
+ * Kick recovery or resync if necessary
+ */
+ set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+ md_wakeup_thread(mddev->thread);
+ err = 0;
+ } else {
+ printk(KERN_ERR "md: md%d has no personality assigned.\n",
+ mdidx(mddev));
+ err = -EINVAL;
+ }
+
+out:
+ return err;
+}
+
+static int do_md_stop(mddev_t * mddev, int ro)
+{
+ int err = 0;
+ struct gendisk *disk = disks[mdidx(mddev)];
+
+ if (atomic_read(&mddev->active)>2) {
+ printk("md: md%d still in use.\n",mdidx(mddev));
+ err = -EBUSY;
+ goto out;
+ }
+
+ if (mddev->pers) {
+ if (mddev->sync_thread) {
+ set_bit(MD_RECOVERY_INTR, &mddev->recovery);
+ md_unregister_thread(mddev->sync_thread);
+ mddev->sync_thread = NULL;
+ }
+
+ del_timer_sync(&mddev->safemode_timer);
+
+ invalidate_device(mk_kdev(disk->major, disk->first_minor), 1);
+
+ if (ro) {
+ err = -ENXIO;
+ if (mddev->ro)
+ goto out;
+ mddev->ro = 1;
+ } else {
+ if (mddev->ro)
+ set_disk_ro(disk, 0);
+ if (mddev->pers->stop(mddev)) {
+ err = -EBUSY;
+ if (mddev->ro)
+ set_disk_ro(disk, 1);
+ goto out;
+ }
+ module_put(mddev->pers->owner);
+ mddev->pers = NULL;
+ if (mddev->ro)
+ mddev->ro = 0;
+ }
+ if (mddev->raid_disks) {
+ /* mark array as shutdown cleanly */
+ mddev->in_sync = 1;
+ md_update_sb(mddev);
+ }
+ if (ro)
+ set_disk_ro(disk, 1);
+ }
+ /*
+ * Free resources if final stop
+ */
+ if (!ro) {
+ struct gendisk *disk;
+ printk(KERN_INFO "md: md%d stopped.\n", mdidx(mddev));
+
+ export_array(mddev);
+
+ mddev->array_size = 0;
+ disk = disks[mdidx(mddev)];
+ if (disk)
+ set_capacity(disk, 0);
+ } else
+ printk(KERN_INFO "md: md%d switched to read-only mode.\n",
+ mdidx(mddev));
+ err = 0;
+out:
+ return err;
+}
+
+static void autorun_array(mddev_t *mddev)
+{
+ mdk_rdev_t *rdev;
+ struct list_head *tmp;
+ int err;
+
+ if (list_empty(&mddev->disks)) {
+ MD_BUG();
+ return;
+ }
+
+ printk(KERN_INFO "md: running: ");
+
+ ITERATE_RDEV(mddev,rdev,tmp) {
+ printk("<%s>", bdev_partition_name(rdev->bdev));
+ }
+ printk("\n");
+
+ err = do_md_run (mddev);
+ if (err) {
+		printk(KERN_WARNING "md: do_md_run() returned %d\n", err);
+ do_md_stop (mddev, 0);
+ }
+}
+
+/*
+ * let's try to run arrays based on all disks that have arrived
+ * until now. (those are in pending_raid_disks)
+ *
+ * the method: pick the first pending disk, collect all disks with
+ * the same UUID, remove all from the pending list and put them into
+ * the 'same_array' list. Then order this list based on superblock
+ * update time (freshest comes first), kick out 'old' disks and
+ * compare superblocks. If everything's fine then run it.
+ *
+ * If "unit" is allocated, then bump its reference count
+ */
+static void autorun_devices(void)
+{
+ struct list_head candidates;
+ struct list_head *tmp;
+ mdk_rdev_t *rdev0, *rdev;
+ mddev_t *mddev;
+
+ printk(KERN_INFO "md: autorun ...\n");
+ while (!list_empty(&pending_raid_disks)) {
+ rdev0 = list_entry(pending_raid_disks.next,
+ mdk_rdev_t, same_set);
+
+ printk(KERN_INFO "md: considering %s ...\n",
+ bdev_partition_name(rdev0->bdev));
+ INIT_LIST_HEAD(&candidates);
+ ITERATE_RDEV_PENDING(rdev,tmp)
+ if (super_90_load(rdev, rdev0, 0) >= 0) {
+ printk(KERN_INFO "md: adding %s ...\n",
+ bdev_partition_name(rdev->bdev));
+ list_move(&rdev->same_set, &candidates);
+ }
+ /*
+ * now we have a set of devices, with all of them having
+ * mostly sane superblocks. It's time to allocate the
+ * mddev.
+ */
+
+ mddev = mddev_find(rdev0->preferred_minor);
+ if (!mddev) {
+ printk(KERN_ERR
+ "md: cannot allocate memory for md drive.\n");
+ break;
+ }
+ if (mddev_lock(mddev))
+ printk(KERN_WARNING "md: md%d locked, cannot run\n",
+ mdidx(mddev));
+ else if (mddev->raid_disks || mddev->major_version
+ || !list_empty(&mddev->disks)) {
+ printk(KERN_WARNING
+ "md: md%d already running, cannot run %s\n",
+ mdidx(mddev), bdev_partition_name(rdev0->bdev));
+ mddev_unlock(mddev);
+ } else {
+ printk(KERN_INFO "md: created md%d\n", mdidx(mddev));
+ ITERATE_RDEV_GENERIC(candidates,rdev,tmp) {
+ list_del_init(&rdev->same_set);
+ if (bind_rdev_to_array(rdev, mddev))
+ export_rdev(rdev);
+ }
+ autorun_array(mddev);
+ mddev_unlock(mddev);
+ }
+ /* on success, candidates will be empty, on error
+ * it won't...
+ */
+ ITERATE_RDEV_GENERIC(candidates,rdev,tmp)
+ export_rdev(rdev);
+ mddev_put(mddev);
+ }
+ printk(KERN_INFO "md: ... autorun DONE.\n");
+}
+
+/*
+ * import RAID devices based on one partition
+ * if possible, the array gets run as well.
+ */
+
+static int autostart_array(dev_t startdev)
+{
+ int err = -EINVAL, i;
+ mdp_super_t *sb = NULL;
+ mdk_rdev_t *start_rdev = NULL, *rdev;
+
+ start_rdev = md_import_device(startdev, 0, 0);
+ if (IS_ERR(start_rdev)) {
+ printk(KERN_WARNING "md: could not import %s!\n",
+ partition_name(startdev));
+ return err;
+ }
+
+ /* NOTE: this can only work for 0.90.0 superblocks */
+ sb = (mdp_super_t*)page_address(start_rdev->sb_page);
+ if (sb->major_version != 0 ||
+ sb->minor_version != 90 ) {
+ printk(KERN_WARNING "md: can only autostart 0.90.0 arrays\n");
+ export_rdev(start_rdev);
+ return err;
+ }
+
+ if (start_rdev->faulty) {
+ printk(KERN_WARNING
+ "md: can not autostart based on faulty %s!\n",
+ bdev_partition_name(start_rdev->bdev));
+ export_rdev(start_rdev);
+ return err;
+ }
+ list_add(&start_rdev->same_set, &pending_raid_disks);
+
+ for (i = 0; i < MD_SB_DISKS; i++) {
+ mdp_disk_t *desc;
+ dev_t dev;
+
+ desc = sb->disks + i;
+ dev = MKDEV(desc->major, desc->minor);
+
+ if (!dev)
+ continue;
+ if (dev == startdev)
+ continue;
+ rdev = md_import_device(dev, 0, 0);
+ if (IS_ERR(rdev)) {
+ printk(KERN_WARNING "md: could not import %s,"
+ " trying to run array nevertheless.\n",
+ partition_name(dev));
+ continue;
+ }
+ list_add(&rdev->same_set, &pending_raid_disks);
+ }
+
+ /*
+ * possibly return codes
+ */
+ autorun_devices();
+ return 0;
+
+}
+
+
+static int get_version(void * arg)
+{
+ mdu_version_t ver;
+
+ ver.major = MD_MAJOR_VERSION;
+ ver.minor = MD_MINOR_VERSION;
+ ver.patchlevel = MD_PATCHLEVEL_VERSION;
+
+ if (copy_to_user(arg, &ver, sizeof(ver)))
+ return -EFAULT;
+
+ return 0;
+}
+
+static int get_array_info(mddev_t * mddev, void * arg)
+{
+ mdu_array_info_t info;
+ int nr,working,active,failed,spare;
+ mdk_rdev_t *rdev;
+ struct list_head *tmp;
+
+ nr=working=active=failed=spare=0;
+ ITERATE_RDEV(mddev,rdev,tmp) {
+ nr++;
+ if (rdev->faulty)
+ failed++;
+ else {
+ working++;
+ if (rdev->in_sync)
+ active++;
+ else
+ spare++;
+ }
+ }
+
+ info.major_version = mddev->major_version;
+ info.minor_version = mddev->minor_version;
+ info.patch_version = 1;
+ info.ctime = mddev->ctime;
+ info.level = mddev->level;
+ info.size = mddev->size;
+ info.nr_disks = nr;
+ info.raid_disks = mddev->raid_disks;
+ info.md_minor = mddev->__minor;
+ info.not_persistent= !mddev->persistent;
+
+ info.utime = mddev->utime;
+ info.state = 0;
+ if (mddev->in_sync)
+ info.state = (1<<MD_SB_CLEAN);
+ info.active_disks = active;
+ info.working_disks = working;
+ info.failed_disks = failed;
+ info.spare_disks = spare;
+
+ info.layout = mddev->layout;
+ info.chunk_size = mddev->chunk_size;
+
+ if (copy_to_user(arg, &info, sizeof(info)))
+ return -EFAULT;
+
+ return 0;
+}
+
+static int get_disk_info(mddev_t * mddev, void * arg)
+{
+ mdu_disk_info_t info;
+ unsigned int nr;
+ mdk_rdev_t *rdev;
+
+ if (copy_from_user(&info, arg, sizeof(info)))
+ return -EFAULT;
+
+ nr = info.number;
+
+ rdev = find_rdev_nr(mddev, nr);
+ if (rdev) {
+ info.major = MAJOR(rdev->bdev->bd_dev);
+ info.minor = MINOR(rdev->bdev->bd_dev);
+ info.raid_disk = rdev->raid_disk;
+ info.state = 0;
+ if (rdev->faulty)
+ info.state |= (1<<MD_DISK_FAULTY);
+ else if (rdev->in_sync) {
+ info.state |= (1<<MD_DISK_ACTIVE);
+ info.state |= (1<<MD_DISK_SYNC);
+ }
+ } else {
+ info.major = info.minor = 0;
+ info.raid_disk = -1;
+ info.state = (1<<MD_DISK_REMOVED);
+ }
+
+ if (copy_to_user(arg, &info, sizeof(info)))
+ return -EFAULT;
+
+ return 0;
+}
+
+static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info)
+{
+ mdk_rdev_t *rdev;
+ dev_t dev;
+ dev = MKDEV(info->major,info->minor);
+ if (!mddev->raid_disks) {
+ int err;
+ /* expecting a device which has a superblock */
+ rdev = md_import_device(dev, mddev->major_version, mddev->minor_version);
+ if (IS_ERR(rdev)) {
+ printk(KERN_WARNING
+ "md: md_import_device returned %ld\n",
+ PTR_ERR(rdev));
+ return PTR_ERR(rdev);
+ }
+ if (!list_empty(&mddev->disks)) {
+ mdk_rdev_t *rdev0 = list_entry(mddev->disks.next,
+ mdk_rdev_t, same_set);
+ int err = super_types[mddev->major_version]
+ .load_super(rdev, rdev0, mddev->minor_version);
+ if (err < 0) {
+ printk(KERN_WARNING
+ "md: %s has different UUID to %s\n",
+ bdev_partition_name(rdev->bdev),
+ bdev_partition_name(rdev0->bdev));
+ export_rdev(rdev);
+ return -EINVAL;
+ }
+ }
+ err = bind_rdev_to_array(rdev, mddev);
+ if (err)
+ export_rdev(rdev);
+ return err;
+ }
+
+ /*
+ * add_new_disk can be used once the array is assembled
+ * to add "hot spares". They must already have a superblock
+ * written
+ */
+ if (mddev->pers) {
+ int err;
+ if (!mddev->pers->hot_add_disk) {
+ printk(KERN_WARNING
+ "md%d: personality does not support diskops!\n",
+ mdidx(mddev));
+ return -EINVAL;
+ }
+ rdev = md_import_device(dev, mddev->major_version,
+ mddev->minor_version);
+ if (IS_ERR(rdev)) {
+ printk(KERN_WARNING
+ "md: md_import_device returned %ld\n",
+ PTR_ERR(rdev));
+ return PTR_ERR(rdev);
+ }
+ rdev->in_sync = 0; /* just to be sure */
+ rdev->raid_disk = -1;
+ err = bind_rdev_to_array(rdev, mddev);
+ if (err)
+ export_rdev(rdev);
+ if (mddev->thread)
+ md_wakeup_thread(mddev->thread);
+ return err;
+ }
+
+ /* otherwise, add_new_disk is only allowed
+ * for major_version==0 superblocks
+ */
+ if (mddev->major_version != 0) {
+ printk(KERN_WARNING "md%d: ADD_NEW_DISK not supported\n",
+ mdidx(mddev));
+ return -EINVAL;
+ }
+
+ if (!(info->state & (1<<MD_DISK_FAULTY))) {
+ int err;
+ rdev = md_import_device (dev, -1, 0);
+ if (IS_ERR(rdev)) {
+ printk(KERN_WARNING
+ "md: error, md_import_device() returned %ld\n",
+ PTR_ERR(rdev));
+ return PTR_ERR(rdev);
+ }
+ rdev->desc_nr = info->number;
+ if (info->raid_disk < mddev->raid_disks)
+ rdev->raid_disk = info->raid_disk;
+ else
+ rdev->raid_disk = -1;
+
+ rdev->faulty = 0;
+ if (rdev->raid_disk < mddev->raid_disks)
+ rdev->in_sync = (info->state & (1<<MD_DISK_SYNC));
+ else
+ rdev->in_sync = 0;
+
+ err = bind_rdev_to_array(rdev, mddev);
+ if (err) {
+ export_rdev(rdev);
+ return err;
+ }
+
+ if (!mddev->persistent) {
+ printk(KERN_INFO "md: nonpersistent superblock ...\n");
+ rdev->sb_offset = rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS;
+ } else
+ rdev->sb_offset = calc_dev_sboffset(rdev->bdev);
+ rdev->size = calc_dev_size(rdev, mddev->chunk_size);
+
+ if (!mddev->size || (mddev->size > rdev->size))
+ mddev->size = rdev->size;
+ }
+
+ return 0;
+}
+
+static int hot_generate_error(mddev_t * mddev, dev_t dev)
+{
+ struct request_queue *q;
+ mdk_rdev_t *rdev;
+
+ if (!mddev->pers)
+ return -ENODEV;
+
+ printk(KERN_INFO "md: trying to generate %s error in md%d ... \n",
+ partition_name(dev), mdidx(mddev));
+
+ rdev = find_rdev(mddev, dev);
+ if (!rdev) {
+ MD_BUG();
+ return -ENXIO;
+ }
+
+ if (rdev->desc_nr == -1) {
+ MD_BUG();
+ return -EINVAL;
+ }
+ if (!rdev->in_sync)
+ return -ENODEV;
+
+ q = bdev_get_queue(rdev->bdev);
+ if (!q) {
+ MD_BUG();
+ return -ENODEV;
+ }
+ printk(KERN_INFO "md: okay, generating error!\n");
+// q->oneshot_error = 1; // disabled for now
+
+ return 0;
+}
+
+static int hot_remove_disk(mddev_t * mddev, dev_t dev)
+{
+ mdk_rdev_t *rdev;
+
+ if (!mddev->pers)
+ return -ENODEV;
+
+ printk(KERN_INFO "md: trying to remove %s from md%d ... \n",
+ partition_name(dev), mdidx(mddev));
+
+ rdev = find_rdev(mddev, dev);
+ if (!rdev)
+ return -ENXIO;
+
+ if (rdev->raid_disk >= 0)
+ goto busy;
+
+ kick_rdev_from_array(rdev);
+ md_update_sb(mddev);
+
+ return 0;
+busy:
+ printk(KERN_WARNING "md: cannot remove active disk %s from md%d ... \n",
+ bdev_partition_name(rdev->bdev), mdidx(mddev));
+ return -EBUSY;
+}
+
+static int hot_add_disk(mddev_t * mddev, dev_t dev)
+{
+ int err;
+ unsigned int size;
+ mdk_rdev_t *rdev;
+
+ if (!mddev->pers)
+ return -ENODEV;
+
+ printk(KERN_INFO "md: trying to hot-add %s to md%d ... \n",
+ partition_name(dev), mdidx(mddev));
+
+ if (mddev->major_version != 0) {
+ printk(KERN_WARNING "md%d: HOT_ADD may only be used with"
+ " version-0 superblocks.\n",
+ mdidx(mddev));
+ return -EINVAL;
+ }
+ if (!mddev->pers->hot_add_disk) {
+ printk(KERN_WARNING
+ "md%d: personality does not support diskops!\n",
+ mdidx(mddev));
+ return -EINVAL;
+ }
+
+ rdev = md_import_device (dev, -1, 0);
+ if (IS_ERR(rdev)) {
+ printk(KERN_WARNING
+ "md: error, md_import_device() returned %ld\n",
+ PTR_ERR(rdev));
+ return -EINVAL;
+ }
+
+ rdev->sb_offset = calc_dev_sboffset(rdev->bdev);
+ size = calc_dev_size(rdev, mddev->chunk_size);
+ rdev->size = size;
+
+ if (size < mddev->size) {
+ printk(KERN_WARNING
+ "md%d: disk size %llu blocks < array size %llu\n",
+ mdidx(mddev), (unsigned long long)size,
+ (unsigned long long)mddev->size);
+ err = -ENOSPC;
+ goto abort_export;
+ }
+
+ if (rdev->faulty) {
+ printk(KERN_WARNING
+ "md: can not hot-add faulty %s disk to md%d!\n",
+ bdev_partition_name(rdev->bdev), mdidx(mddev));
+ err = -EINVAL;
+ goto abort_export;
+ }
+ rdev->in_sync = 0;
+ rdev->desc_nr = -1;
+ bind_rdev_to_array(rdev, mddev);
+
+ /*
+ * The rest should better be atomic, we can have disk failures
+ * noticed in interrupt contexts ...
+ */
+
+ if (rdev->desc_nr == mddev->max_disks) {
+ printk(KERN_WARNING "md%d: can not hot-add to full array!\n",
+ mdidx(mddev));
+ err = -EBUSY;
+ goto abort_unbind_export;
+ }
+
+ rdev->raid_disk = -1;
+
+ md_update_sb(mddev);
+
+ /*
+ * Kick recovery, maybe this spare has to be added to the
+ * array immediately.
+ */
+ set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+ md_wakeup_thread(mddev->thread);
+
+ return 0;
+
+abort_unbind_export:
+ unbind_rdev_from_array(rdev);
+
+abort_export:
+ export_rdev(rdev);
+ return err;
+}
+
+/*
+ * set_array_info is used in two different ways.
+ * The original usage is when creating a new array.
+ * In this usage, raid_disks is > 0, and it together with
+ *  level, size, not_persistent, layout and chunksize determine the
+ *  shape of the array.
+ *  This will always create an array with a type-0.90.0 superblock.
+ * The newer usage is when assembling an array.
+ * In this case raid_disks will be 0, and the major_version field is
+ * used to determine which style of super-blocks are to be found on the devices.
+ * The minor and patch _version numbers are also kept in case the
+ * super_block handler wishes to interpret them.
+ */
+static int set_array_info(mddev_t * mddev, mdu_array_info_t *info)
+{
+
+ if (info->raid_disks == 0) {
+ /* just setting version number for superblock loading */
+ if (info->major_version < 0 ||
+ info->major_version >= sizeof(super_types)/sizeof(super_types[0]) ||
+ super_types[info->major_version].name == NULL) {
+ /* maybe try to auto-load a module? */
+ printk(KERN_INFO
+ "md: superblock version %d not known\n",
+ info->major_version);
+ return -EINVAL;
+ }
+ mddev->major_version = info->major_version;
+ mddev->minor_version = info->minor_version;
+ mddev->patch_version = info->patch_version;
+ return 0;
+ }
+ mddev->major_version = MD_MAJOR_VERSION;
+ mddev->minor_version = MD_MINOR_VERSION;
+ mddev->patch_version = MD_PATCHLEVEL_VERSION;
+ mddev->ctime = get_seconds();
+
+ mddev->level = info->level;
+ mddev->size = info->size;
+ mddev->raid_disks = info->raid_disks;
+ /* don't set __minor, it is determined by which /dev/md* was
+	 * opened
+ */
+ if (info->state & (1<<MD_SB_CLEAN))
+ mddev->recovery_cp = MaxSector;
+ else
+ mddev->recovery_cp = 0;
+ mddev->persistent = ! info->not_persistent;
+
+ mddev->layout = info->layout;
+ mddev->chunk_size = info->chunk_size;
+
+ mddev->max_disks = MD_SB_DISKS;
+
+
+ /*
+ * Generate a 128 bit UUID
+ */
+ get_random_bytes(mddev->uuid, 16);
+
+ return 0;
+}
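+
+/*
+ * Illustrative sketch (added; not part of the driver) of the two usages
+ * described above, from a hypothetical userspace caller with an open fd:
+ *
+ *	mdu_array_info_t info;
+ *
+ *	memset(&info, 0, sizeof(info));
+ *	info.major_version = 0;			(assemble: raid_disks == 0,
+ *	ioctl(fd, SET_ARRAY_INFO, &info);	 only selects the sb format)
+ *
+ *	memset(&info, 0, sizeof(info));
+ *	info.raid_disks = 2;			(create: together with level,
+ *	info.level = 1;				 size etc. this shapes the
+ *	ioctl(fd, SET_ARRAY_INFO, &info);	 array; 0.90.0 superblock)
+ */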
+
+static int set_disk_faulty(mddev_t *mddev, dev_t dev)
+{
+ mdk_rdev_t *rdev;
+
+ rdev = find_rdev(mddev, dev);
+ if (!rdev)
+ return 0;
+
+ md_error(mddev, rdev);
+ return 1;
+}
+
+static int md_ioctl(struct inode *inode, struct file *file,
+ unsigned int cmd, unsigned long arg)
+{
+ unsigned int minor;
+ int err = 0;
+ struct hd_geometry *loc = (struct hd_geometry *) arg;
+ mddev_t *mddev = NULL;
+ kdev_t dev;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EACCES;
+
+ dev = inode->i_rdev;
+ minor = minor(dev);
+ if (minor >= MAX_MD_DEVS) {
+ MD_BUG();
+ return -EINVAL;
+ }
+
+ /*
+ * Commands dealing with the RAID driver but not any
+ * particular array:
+ */
+ switch (cmd)
+ {
+ case RAID_VERSION:
+ err = get_version((void *)arg);
+ goto done;
+
+ case PRINT_RAID_DEBUG:
+ err = 0;
+ md_print_devices();
+ goto done;
+
+#ifndef MODULE
+ case RAID_AUTORUN:
+ err = 0;
+ autostart_arrays();
+ goto done;
+#endif
+ default:;
+ }
+
+ /*
+ * Commands creating/starting a new array:
+ */
+
+ mddev = inode->i_bdev->bd_inode->u.generic_ip;
+
+ if (!mddev) {
+ BUG();
+ goto abort;
+ }
+
+
+ if (cmd == START_ARRAY) {
+ /* START_ARRAY doesn't need to lock the array as autostart_array
+ * does the locking, and it could even be a different array
+ */
+ err = autostart_array(arg);
+ if (err) {
+ printk(KERN_WARNING "md: autostart %s failed!\n",
+ partition_name(arg));
+ goto abort;
+ }
+ goto done;
+ }
+
+ err = mddev_lock(mddev);
+ if (err) {
+ printk(KERN_INFO
+ "md: ioctl lock interrupted, reason %d, cmd %d\n",
+ err, cmd);
+ goto abort;
+ }
+
+ switch (cmd)
+ {
+ case SET_ARRAY_INFO:
+
+ if (!list_empty(&mddev->disks)) {
+ printk(KERN_WARNING
+ "md: array md%d already has disks!\n",
+ mdidx(mddev));
+ err = -EBUSY;
+ goto abort_unlock;
+ }
+ if (mddev->raid_disks) {
+ printk(KERN_WARNING
+ "md: array md%d already initialised!\n",
+ mdidx(mddev));
+ err = -EBUSY;
+ goto abort_unlock;
+ }
+ {
+ mdu_array_info_t info;
+ if (!arg)
+ memset(&info, 0, sizeof(info));
+ else if (copy_from_user(&info, (void*)arg, sizeof(info))) {
+ err = -EFAULT;
+ goto abort_unlock;
+ }
+ err = set_array_info(mddev, &info);
+ if (err) {
+ printk(KERN_WARNING "md: couldn't set"
+ " array info. %d\n", err);
+ goto abort_unlock;
+ }
+ }
+ goto done_unlock;
+
+ default:;
+ }
+
+ /*
+ * Commands querying/configuring an existing array:
+ */
+	/* if we are not initialised yet, only ADD_NEW_DISK, STOP_ARRAY or RUN_ARRAY is allowed */
+ if (!mddev->raid_disks && cmd != ADD_NEW_DISK && cmd != STOP_ARRAY && cmd != RUN_ARRAY) {
+ err = -ENODEV;
+ goto abort_unlock;
+ }
+
+ /*
+ * Commands even a read-only array can execute:
+ */
+ switch (cmd)
+ {
+ case GET_ARRAY_INFO:
+ err = get_array_info(mddev, (void *)arg);
+ goto done_unlock;
+
+ case GET_DISK_INFO:
+ err = get_disk_info(mddev, (void *)arg);
+ goto done_unlock;
+
+ case RESTART_ARRAY_RW:
+ err = restart_array(mddev);
+ goto done_unlock;
+
+ case STOP_ARRAY:
+ err = do_md_stop (mddev, 0);
+ goto done_unlock;
+
+ case STOP_ARRAY_RO:
+ err = do_md_stop (mddev, 1);
+ goto done_unlock;
+
+ /*
+		 * We have a problem here: there is no easy way to give a CHS
+		 * virtual geometry. We currently pretend that we have 2 heads and
+		 * 4 sectors (with a BIG number of cylinders...). This drives
+ * dosfs just mad... ;-)
+ */
+ case HDIO_GETGEO:
+ if (!loc) {
+ err = -EINVAL;
+ goto abort_unlock;
+ }
+ err = put_user (2, (char *) &loc->heads);
+ if (err)
+ goto abort_unlock;
+ err = put_user (4, (char *) &loc->sectors);
+ if (err)
+ goto abort_unlock;
+ err = put_user(get_capacity(disks[mdidx(mddev)])/8,
+ (short *) &loc->cylinders);
+ if (err)
+ goto abort_unlock;
+ err = put_user (get_start_sect(inode->i_bdev),
+ (long *) &loc->start);
+ goto done_unlock;
+ }
+
+ /*
+ * The remaining ioctls are changing the state of the
+ * superblock, so we do not allow read-only arrays
+ * here:
+ */
+ if (mddev->ro) {
+ err = -EROFS;
+ goto abort_unlock;
+ }
+
+ switch (cmd)
+ {
+ case ADD_NEW_DISK:
+ {
+ mdu_disk_info_t info;
+ if (copy_from_user(&info, (void*)arg, sizeof(info)))
+ err = -EFAULT;
+ else
+ err = add_new_disk(mddev, &info);
+ goto done_unlock;
+ }
+ case HOT_GENERATE_ERROR:
+ err = hot_generate_error(mddev, arg);
+ goto done_unlock;
+ case HOT_REMOVE_DISK:
+ err = hot_remove_disk(mddev, arg);
+ goto done_unlock;
+
+ case HOT_ADD_DISK:
+ err = hot_add_disk(mddev, arg);
+ goto done_unlock;
+
+ case SET_DISK_FAULTY:
+ err = set_disk_faulty(mddev, arg);
+ goto done_unlock;
+
+ case RUN_ARRAY:
+ {
+ err = do_md_run (mddev);
+ /*
+ * we have to clean up the mess if
+ * the array cannot be run for some
+ * reason ...
+		 * ->pers will not be set, so the superblock will
+		 * not be updated.
+ */
+ if (err)
+ do_md_stop (mddev, 0);
+ goto done_unlock;
+ }
+
+ default:
+ if (_IOC_TYPE(cmd) == MD_MAJOR)
+ printk(KERN_WARNING "md: %s(pid %d) used"
+ " obsolete MD ioctl, upgrade your"
+			       " software to use new ioctls.\n",
+ current->comm, current->pid);
+ err = -EINVAL;
+ goto abort_unlock;
+ }
+
+done_unlock:
+abort_unlock:
+ mddev_unlock(mddev);
+
+ return err;
+done:
+ if (err)
+ MD_BUG();
+abort:
+ return err;
+}
+
+static int md_open(struct inode *inode, struct file *file)
+{
+ /*
+ * Succeed if we can find or allocate a mddev structure.
+ */
+ mddev_t *mddev = mddev_find(minor(inode->i_rdev));
+ int err = -ENOMEM;
+
+ if (!mddev)
+ goto out;
+
+ if ((err = mddev_lock(mddev)))
+ goto put;
+
+ err = 0;
+ mddev_unlock(mddev);
+ inode->i_bdev->bd_inode->u.generic_ip = mddev_get(mddev);
+ put:
+ mddev_put(mddev);
+ out:
+ return err;
+}
+
+static int md_release(struct inode *inode, struct file * file)
+{
+ mddev_t *mddev = inode->i_bdev->bd_inode->u.generic_ip;
+
+ if (!mddev)
+ BUG();
+ mddev_put(mddev);
+
+ return 0;
+}
+
+static struct block_device_operations md_fops =
+{
+ .owner = THIS_MODULE,
+ .open = md_open,
+ .release = md_release,
+ .ioctl = md_ioctl,
+};
+
+int md_thread(void * arg)
+{
+ mdk_thread_t *thread = arg;
+
+ lock_kernel();
+
+ /*
+ * Detach thread
+ */
+
+ daemonize(thread->name, mdidx(thread->mddev));
+
+ current->exit_signal = SIGCHLD;
+ allow_signal(SIGKILL);
+ thread->tsk = current;
+
+ /*
+	 * md_thread is a 'system-thread'; its priority should be very
+ * high. We avoid resource deadlocks individually in each
+ * raid personality. (RAID5 does preallocation) We also use RR and
+ * the very same RT priority as kswapd, thus we will never get
+ * into a priority inversion deadlock.
+ *
+ * we definitely have to have equal or higher priority than
+ * bdflush, otherwise bdflush will deadlock if there are too
+ * many dirty RAID5 blocks.
+ */
+ unlock_kernel();
+
+ complete(thread->event);
+ while (thread->run) {
+ void (*run)(mddev_t *);
+
+ wait_event_interruptible(thread->wqueue,
+ test_bit(THREAD_WAKEUP, &thread->flags));
+ if (current->flags & PF_FREEZE)
+ refrigerator(PF_IOTHREAD);
+
+ clear_bit(THREAD_WAKEUP, &thread->flags);
+
+ run = thread->run;
+ if (run) {
+ run(thread->mddev);
+ blk_run_queues();
+ }
+ if (signal_pending(current))
+ flush_signals(current);
+ }
+ complete(thread->event);
+ return 0;
+}
+
+void md_wakeup_thread(mdk_thread_t *thread)
+{
+ if (thread) {
+ dprintk("md: waking up MD thread %p.\n", thread);
+ set_bit(THREAD_WAKEUP, &thread->flags);
+ wake_up(&thread->wqueue);
+ }
+}
+
+mdk_thread_t *md_register_thread(void (*run) (mddev_t *), mddev_t *mddev,
+ const char *name)
+{
+ mdk_thread_t *thread;
+ int ret;
+ struct completion event;
+
+ thread = (mdk_thread_t *) kmalloc
+ (sizeof(mdk_thread_t), GFP_KERNEL);
+ if (!thread)
+ return NULL;
+
+ memset(thread, 0, sizeof(mdk_thread_t));
+ init_waitqueue_head(&thread->wqueue);
+
+ init_completion(&event);
+ thread->event = &event;
+ thread->run = run;
+ thread->mddev = mddev;
+ thread->name = name;
+ ret = kernel_thread(md_thread, thread, 0);
+ if (ret < 0) {
+ kfree(thread);
+ return NULL;
+ }
+ wait_for_completion(&event);
+ return thread;
+}
+
+void md_interrupt_thread(mdk_thread_t *thread)
+{
+ if (!thread->tsk) {
+ MD_BUG();
+ return;
+ }
+ dprintk("interrupting MD-thread pid %d\n", thread->tsk->pid);
+ send_sig(SIGKILL, thread->tsk, 1);
+}
+
+void md_unregister_thread(mdk_thread_t *thread)
+{
+ struct completion event;
+
+ init_completion(&event);
+
+ thread->event = &event;
+ thread->run = NULL;
+ thread->name = NULL;
+ md_interrupt_thread(thread);
+ wait_for_completion(&event);
+ kfree(thread);
+}
+
+void md_error(mddev_t *mddev, mdk_rdev_t *rdev)
+{
+ dprintk("md_error dev:(%d:%d), rdev:(%d:%d), (caller: %p,%p,%p,%p).\n",
+ MD_MAJOR,mdidx(mddev),
+ MAJOR(rdev->bdev->bd_dev), MINOR(rdev->bdev->bd_dev),
+ __builtin_return_address(0),__builtin_return_address(1),
+ __builtin_return_address(2),__builtin_return_address(3));
+
+ if (!mddev) {
+ MD_BUG();
+ return;
+ }
+
+ if (!rdev || rdev->faulty)
+ return;
+ if (!mddev->pers->error_handler)
+ return;
+ mddev->pers->error_handler(mddev,rdev);
+ set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+ md_wakeup_thread(mddev->thread);
+}
+
+/* seq_file implementation /proc/mdstat */
+
+static void status_unused(struct seq_file *seq)
+{
+ int i = 0;
+ mdk_rdev_t *rdev;
+ struct list_head *tmp;
+
+ seq_printf(seq, "unused devices: ");
+
+ ITERATE_RDEV_PENDING(rdev,tmp) {
+ i++;
+ seq_printf(seq, "%s ",
+ bdev_partition_name(rdev->bdev));
+ }
+ if (!i)
+ seq_printf(seq, "<none>");
+
+ seq_printf(seq, "\n");
+}
+
+
+static void status_resync(struct seq_file *seq, mddev_t * mddev)
+{
+ unsigned long max_blocks, resync, res, dt, db, rt;
+
+ resync = (mddev->curr_resync - atomic_read(&mddev->recovery_active))/2;
+ max_blocks = mddev->size;
+
+ /*
+ * Should not happen.
+ */
+ if (!max_blocks) {
+ MD_BUG();
+ return;
+ }
+ res = (resync/1024)*1000/(max_blocks/1024 + 1);
+ {
+ int i, x = res/50, y = 20-x;
+ seq_printf(seq, "[");
+ for (i = 0; i < x; i++)
+ seq_printf(seq, "=");
+ seq_printf(seq, ">");
+ for (i = 0; i < y; i++)
+ seq_printf(seq, ".");
+ seq_printf(seq, "] ");
+ }
+ seq_printf(seq, " %s =%3lu.%lu%% (%lu/%lu)",
+ (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ?
+ "resync" : "recovery"),
+ res/10, res % 10, resync, max_blocks);
+
+ /*
+ * We do not want to overflow, so the order of operands and
+ * the * 100 / 100 trick are important. We do a +1 to be
+ * safe against division by zero. We only estimate anyway.
+ *
+ * dt: time from mark until now
+ * db: blocks written from mark until now
+ * rt: remaining time
+ */
+ dt = ((jiffies - mddev->resync_mark) / HZ);
+ if (!dt) dt++;
+ db = resync - (mddev->resync_mark_cnt/2);
+ rt = (dt * ((max_blocks-resync) / (db/100+1)))/100;
+
+ seq_printf(seq, " finish=%lu.%lumin", rt / 60, (rt % 60)/6);
+
+ seq_printf(seq, " speed=%ldK/sec", db/dt);
+}
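+
+/*
+ * Worked example for the estimate above (added; numbers are made up):
+ * with dt = 300 seconds since the mark, db = 3000000 blocks written since
+ * the mark and 6000000 blocks remaining, rt = (300 * (6000000/30001))/100
+ * = 597 seconds, printed as "finish=9.9min", and speed = db/dt = 10000K/sec.
+ */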
+
+static void *md_seq_start(struct seq_file *seq, loff_t *pos)
+{
+ struct list_head *tmp;
+ loff_t l = *pos;
+ mddev_t *mddev;
+
+ if (l > 0x10000)
+ return NULL;
+ if (!l--)
+ /* header */
+ return (void*)1;
+
+ spin_lock(&all_mddevs_lock);
+ list_for_each(tmp,&all_mddevs)
+ if (!l--) {
+ mddev = list_entry(tmp, mddev_t, all_mddevs);
+ mddev_get(mddev);
+ spin_unlock(&all_mddevs_lock);
+ return mddev;
+ }
+ spin_unlock(&all_mddevs_lock);
+ return (void*)2;/* tail */
+}
+
+static void *md_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ struct list_head *tmp;
+ mddev_t *next_mddev, *mddev = v;
+
+ ++*pos;
+ if (v == (void*)2)
+ return NULL;
+
+ spin_lock(&all_mddevs_lock);
+ if (v == (void*)1)
+ tmp = all_mddevs.next;
+ else
+ tmp = mddev->all_mddevs.next;
+ if (tmp != &all_mddevs)
+ next_mddev = mddev_get(list_entry(tmp,mddev_t,all_mddevs));
+ else {
+ next_mddev = (void*)2;
+ *pos = 0x10000;
+ }
+ spin_unlock(&all_mddevs_lock);
+
+ if (v != (void*)1)
+ mddev_put(mddev);
+ return next_mddev;
+
+}
+
+static void md_seq_stop(struct seq_file *seq, void *v)
+{
+ mddev_t *mddev = v;
+
+ if (mddev && v != (void*)1 && v != (void*)2)
+ mddev_put(mddev);
+}
+
+static int md_seq_show(struct seq_file *seq, void *v)
+{
+ mddev_t *mddev = v;
+ sector_t size;
+ struct list_head *tmp2;
+ mdk_rdev_t *rdev;
+ int i;
+
+ if (v == (void*)1) {
+ seq_printf(seq, "Personalities : ");
+ spin_lock(&pers_lock);
+ for (i = 0; i < MAX_PERSONALITY; i++)
+ if (pers[i])
+ seq_printf(seq, "[%s] ", pers[i]->name);
+
+ spin_unlock(&pers_lock);
+ seq_printf(seq, "\n");
+ return 0;
+ }
+ if (v == (void*)2) {
+ status_unused(seq);
+ return 0;
+ }
+
+ if (mddev_lock(mddev)!=0)
+ return -EINTR;
+ if (mddev->pers || mddev->raid_disks || !list_empty(&mddev->disks)) {
+ seq_printf(seq, "md%d : %sactive", mdidx(mddev),
+ mddev->pers ? "" : "in");
+ if (mddev->pers) {
+ if (mddev->ro)
+ seq_printf(seq, " (read-only)");
+ seq_printf(seq, " %s", mddev->pers->name);
+ }
+
+ size = 0;
+ ITERATE_RDEV(mddev,rdev,tmp2) {
+ seq_printf(seq, " %s[%d]",
+ bdev_partition_name(rdev->bdev), rdev->desc_nr);
+ if (rdev->faulty) {
+ seq_printf(seq, "(F)");
+ continue;
+ }
+ size += rdev->size;
+ }
+
+ if (!list_empty(&mddev->disks)) {
+ if (mddev->pers)
+ seq_printf(seq, "\n %llu blocks",
+ (unsigned long long)mddev->array_size);
+ else
+ seq_printf(seq, "\n %llu blocks",
+ (unsigned long long)size);
+ }
+
+ if (mddev->pers) {
+ mddev->pers->status (seq, mddev);
+ seq_printf(seq, "\n ");
+ if (mddev->curr_resync > 2)
+ status_resync (seq, mddev);
+ else if (mddev->curr_resync == 1 || mddev->curr_resync == 2)
+ seq_printf(seq, " resync=DELAYED");
+ }
+
+ seq_printf(seq, "\n");
+ }
+ mddev_unlock(mddev);
+
+ return 0;
+}
+
+static struct seq_operations md_seq_ops = {
+ .start = md_seq_start,
+ .next = md_seq_next,
+ .stop = md_seq_stop,
+ .show = md_seq_show,
+};
+
+static int md_seq_open(struct inode *inode, struct file *file)
+{
+ int error;
+
+ error = seq_open(file, &md_seq_ops);
+ return error;
+}
+
+static struct file_operations md_seq_fops = {
+ .open = md_seq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+int register_md_personality(int pnum, mdk_personality_t *p)
+{
+ if (pnum >= MAX_PERSONALITY) {
+ MD_BUG();
+ return -EINVAL;
+ }
+
+ spin_lock(&pers_lock);
+ if (pers[pnum]) {
+ spin_unlock(&pers_lock);
+ MD_BUG();
+ return -EBUSY;
+ }
+
+ pers[pnum] = p;
+ printk(KERN_INFO "md: %s personality registered as nr %d\n", p->name, pnum);
+ spin_unlock(&pers_lock);
+ return 0;
+}
+
+int unregister_md_personality(int pnum)
+{
+ if (pnum >= MAX_PERSONALITY) {
+ MD_BUG();
+ return -EINVAL;
+ }
+
+ printk(KERN_INFO "md: %s personality unregistered\n", pers[pnum]->name);
+ spin_lock(&pers_lock);
+ pers[pnum] = NULL;
+ spin_unlock(&pers_lock);
+ return 0;
+}
+
+void md_sync_acct(mdk_rdev_t *rdev, unsigned long nr_sectors)
+{
+ rdev->bdev->bd_contains->bd_disk->sync_io += nr_sectors;
+}
+
+static int is_mddev_idle(mddev_t *mddev)
+{
+ mdk_rdev_t * rdev;
+ struct list_head *tmp;
+ int idle;
+ unsigned long curr_events;
+
+ idle = 1;
+ ITERATE_RDEV(mddev,rdev,tmp) {
+ struct gendisk *disk = rdev->bdev->bd_contains->bd_disk;
+ curr_events = disk_stat_read(disk, read_sectors) +
+ disk_stat_read(disk, write_sectors) -
+ disk->sync_io;
+ if ((curr_events - rdev->last_events) > 32) {
+ rdev->last_events = curr_events;
+ idle = 0;
+ }
+ }
+ return idle;
+}
+
+void md_done_sync(mddev_t *mddev, int blocks, int ok)
+{
+	/* another "blocks" (512-byte) blocks have been synced */
+ atomic_sub(blocks, &mddev->recovery_active);
+ wake_up(&mddev->recovery_wait);
+ if (!ok) {
+ set_bit(MD_RECOVERY_ERR, &mddev->recovery);
+ md_wakeup_thread(mddev->thread);
+ // stop recovery, signal do_sync ....
+ }
+}
+
+
+void md_write_start(mddev_t *mddev)
+{
+ if (!atomic_read(&mddev->writes_pending)) {
+ mddev_lock_uninterruptible(mddev);
+ if (mddev->in_sync) {
+ mddev->in_sync = 0;
+ del_timer(&mddev->safemode_timer);
+ md_update_sb(mddev);
+ }
+ atomic_inc(&mddev->writes_pending);
+ mddev_unlock(mddev);
+ } else
+ atomic_inc(&mddev->writes_pending);
+}
+
+void md_write_end(mddev_t *mddev)
+{
+ if (atomic_dec_and_test(&mddev->writes_pending)) {
+ if (mddev->safemode == 2)
+ md_wakeup_thread(mddev->thread);
+ else
+ mod_timer(&mddev->safemode_timer, jiffies + mddev->safemode_delay);
+ }
+}
+
+static inline void md_enter_safemode(mddev_t *mddev)
+{
+ mddev_lock_uninterruptible(mddev);
+ if (mddev->safemode && !atomic_read(&mddev->writes_pending) &&
+ !mddev->in_sync && mddev->recovery_cp == MaxSector) {
+ mddev->in_sync = 1;
+ md_update_sb(mddev);
+ }
+ mddev_unlock(mddev);
+
+ if (mddev->safemode == 1)
+ mddev->safemode = 0;
+}
+
+void md_handle_safemode(mddev_t *mddev)
+{
+ if (signal_pending(current)) {
+ printk(KERN_INFO "md: md%d in immediate safe mode\n",
+ mdidx(mddev));
+ mddev->safemode = 2;
+ flush_signals(current);
+ }
+ if (mddev->safemode)
+ md_enter_safemode(mddev);
+}
+
+
+DECLARE_WAIT_QUEUE_HEAD(resync_wait);
+
+#define SYNC_MARKS 10
+#define SYNC_MARK_STEP (3*HZ)
+static void md_do_sync(mddev_t *mddev)
+{
+ mddev_t *mddev2;
+ unsigned int max_sectors, currspeed = 0,
+ j, window;
+ unsigned long mark[SYNC_MARKS];
+ unsigned long mark_cnt[SYNC_MARKS];
+ int last_mark,m;
+ struct list_head *tmp;
+ unsigned long last_check;
+
+	/* just in case the thread restarts... */
+ if (test_bit(MD_RECOVERY_DONE, &mddev->recovery))
+ return;
+
+ /* we overload curr_resync somewhat here.
+ * 0 == not engaged in resync at all
+ * 2 == checking that there is no conflict with another sync
+ * 1 == like 2, but have yielded to allow conflicting resync to
+	 *         commence
+ * other == active in resync - this many blocks
+ */
+ do {
+ mddev->curr_resync = 2;
+
+ ITERATE_MDDEV(mddev2,tmp) {
+ if (mddev2 == mddev)
+ continue;
+ if (mddev2->curr_resync &&
+ match_mddev_units(mddev,mddev2)) {
+ printk(KERN_INFO "md: delaying resync of md%d"
+ " until md%d has finished resync (they"
+ " share one or more physical units)\n",
+ mdidx(mddev), mdidx(mddev2));
+ if (mddev < mddev2) {/* arbitrarily yield */
+ mddev->curr_resync = 1;
+ wake_up(&resync_wait);
+ }
+ if (wait_event_interruptible(resync_wait,
+ mddev2->curr_resync < mddev->curr_resync)) {
+ flush_signals(current);
+ mddev_put(mddev2);
+ goto skip;
+ }
+ }
+ if (mddev->curr_resync == 1) {
+ mddev_put(mddev2);
+ break;
+ }
+ }
+ } while (mddev->curr_resync < 2);
+
+ max_sectors = mddev->size << 1;
+
+ printk(KERN_INFO "md: syncing RAID array md%d\n", mdidx(mddev));
+ printk(KERN_INFO "md: minimum _guaranteed_ reconstruction speed:"
+ " %d KB/sec/disc.\n", sysctl_speed_limit_min);
+	printk(KERN_INFO "md: using maximum available idle IO bandwidth "
+ "(but not more than %d KB/sec) for reconstruction.\n",
+ sysctl_speed_limit_max);
+
+ is_mddev_idle(mddev); /* this also initializes IO event counters */
+ if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
+ j = mddev->recovery_cp;
+ else
+ j = 0;
+ for (m = 0; m < SYNC_MARKS; m++) {
+ mark[m] = jiffies;
+ mark_cnt[m] = j;
+ }
+ last_mark = 0;
+ mddev->resync_mark = mark[last_mark];
+ mddev->resync_mark_cnt = mark_cnt[last_mark];
+
+ /*
+ * Tune reconstruction:
+ */
+ window = 32*(PAGE_SIZE/512);
+ printk(KERN_INFO "md: using %dk window, over a total of %d blocks.\n",
+ window/2,max_sectors/2);
+
+ atomic_set(&mddev->recovery_active, 0);
+ init_waitqueue_head(&mddev->recovery_wait);
+ last_check = 0;
+
+ if (j)
+ printk(KERN_INFO
+ "md: resuming recovery of md%d from checkpoint.\n",
+ mdidx(mddev));
+
+ while (j < max_sectors) {
+ int sectors;
+
+ sectors = mddev->pers->sync_request(mddev, j, currspeed < sysctl_speed_limit_min);
+ if (sectors < 0) {
+ set_bit(MD_RECOVERY_ERR, &mddev->recovery);
+ goto out;
+ }
+ atomic_add(sectors, &mddev->recovery_active);
+ j += sectors;
+ if (j>1) mddev->curr_resync = j;
+
+ if (last_check + window > j)
+ continue;
+
+ last_check = j;
+
+ if (test_bit(MD_RECOVERY_INTR, &mddev->recovery) ||
+ test_bit(MD_RECOVERY_ERR, &mddev->recovery))
+ break;
+
+ blk_run_queues();
+
+ repeat:
+ if (jiffies >= mark[last_mark] + SYNC_MARK_STEP ) {
+ /* step marks */
+ int next = (last_mark+1) % SYNC_MARKS;
+
+ mddev->resync_mark = mark[next];
+ mddev->resync_mark_cnt = mark_cnt[next];
+ mark[next] = jiffies;
+ mark_cnt[next] = j - atomic_read(&mddev->recovery_active);
+ last_mark = next;
+ }
+
+
+ if (signal_pending(current)) {
+ /*
+ * got a signal, exit.
+ */
+ printk(KERN_INFO
+ "md: md_do_sync() got signal ... exiting\n");
+ flush_signals(current);
+ set_bit(MD_RECOVERY_INTR, &mddev->recovery);
+ goto out;
+ }
+
+ /*
+		 * this loop exits only when we are slower than
+		 * the 'hard' speed limit, or when the system was IO-idle for
+		 * a jiffy.
+ * the system might be non-idle CPU-wise, but we only care
+ * about not overloading the IO subsystem. (things like an
+ * e2fsck being done on the RAID array should execute fast)
+ */
+ cond_resched();
+
+ currspeed = (j-mddev->resync_mark_cnt)/2/((jiffies-mddev->resync_mark)/HZ +1) +1;
+
+ if (currspeed > sysctl_speed_limit_min) {
+ if ((currspeed > sysctl_speed_limit_max) ||
+ !is_mddev_idle(mddev)) {
+ current->state = TASK_INTERRUPTIBLE;
+ schedule_timeout(HZ/4);
+ goto repeat;
+ }
+ }
+ }
+ printk(KERN_INFO "md: md%d: sync done.\n",mdidx(mddev));
+ /*
+ * this also signals 'finished resyncing' to md_stop
+ */
+ out:
+ wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active));
+
+ /* tell personality that we are finished */
+ mddev->pers->sync_request(mddev, max_sectors, 1);
+
+ if (!test_bit(MD_RECOVERY_ERR, &mddev->recovery) &&
+ mddev->curr_resync > 2 &&
+ mddev->curr_resync > mddev->recovery_cp) {
+ if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
+ printk(KERN_INFO
+ "md: checkpointing recovery of md%d.\n",
+ mdidx(mddev));
+ mddev->recovery_cp = mddev->curr_resync;
+ } else
+ mddev->recovery_cp = MaxSector;
+ }
+
+ if (mddev->safemode)
+ md_enter_safemode(mddev);
+ skip:
+ mddev->curr_resync = 0;
+ set_bit(MD_RECOVERY_DONE, &mddev->recovery);
+ md_wakeup_thread(mddev->thread);
+}
+
+
+/*
+ * This routine is regularly called by all per-raid-array threads to
+ * deal with generic issues like resync and super-block update.
+ * Raid personalities that don't have a thread (linear/raid0) do not
+ * need this as they never do any recovery or update the superblock.
+ *
+ * It does not do any resync itself, but rather "forks" off other threads
+ * to do that as needed.
+ * When it is determined that resync is needed, we set MD_RECOVERY_RUNNING in
+ * "->recovery" and create a thread at ->sync_thread.
+ * When the thread finishes it sets MD_RECOVERY_DONE (and might set MD_RECOVERY_ERR)
+ * and wakes up this thread, which will reap the thread and finish up.
+ * This thread also removes any faulty devices (with nr_pending == 0).
+ *
+ * The overall approach is:
+ * 1/ if the superblock needs updating, update it.
+ * 2/ If a recovery thread is running, don't do anything else.
+ * 3/ If recovery has finished, clean up, possibly marking spares active.
+ * 4/ If there are any faulty devices, remove them.
+ * 5/ If array is degraded, try to add spare devices.
+ * 6/ If array has spares or is not in-sync, start a resync thread.
+ */
+void md_check_recovery(mddev_t *mddev)
+{
+ mdk_rdev_t *rdev;
+ struct list_head *rtmp;
+
+
+ dprintk(KERN_INFO "md: recovery thread got woken up ...\n");
+
+ if (mddev->ro)
+ return;
+ if ( ! (
+ mddev->sb_dirty ||
+ test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) ||
+ test_bit(MD_RECOVERY_DONE, &mddev->recovery)
+ ))
+ return;
+ if (mddev_trylock(mddev)==0) {
+ int spares =0;
+ if (mddev->sb_dirty)
+ md_update_sb(mddev);
+ if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) &&
+ !test_bit(MD_RECOVERY_DONE, &mddev->recovery))
+ /* resync/recovery still happening */
+ goto unlock;
+ if (mddev->sync_thread) {
+ /* resync has finished, collect result */
+ md_unregister_thread(mddev->sync_thread);
+ mddev->sync_thread = NULL;
+ if (!test_bit(MD_RECOVERY_ERR, &mddev->recovery)) {
+ /* success...*/
+ /* activate any spares */
+ mddev->pers->spare_active(mddev);
+ }
+ md_update_sb(mddev);
+ mddev->recovery = 0;
+ wake_up(&resync_wait);
+ goto unlock;
+ }
+ if (mddev->recovery) {
+ /* that's odd.. */
+ mddev->recovery = 0;
+ wake_up(&resync_wait);
+ }
+
+ /* no recovery is running.
+ * remove any failed drives, then
+ * add spares if possible
+ */
+ ITERATE_RDEV(mddev,rdev,rtmp) {
+ if (rdev->raid_disk >= 0 &&
+ rdev->faulty &&
+ atomic_read(&rdev->nr_pending)==0) {
+ mddev->pers->hot_remove_disk(mddev, rdev->raid_disk);
+ rdev->raid_disk = -1;
+ }
+ if (!rdev->faulty && rdev->raid_disk >= 0 && !rdev->in_sync)
+ spares++;
+ }
+ if (mddev->degraded) {
+ ITERATE_RDEV(mddev,rdev,rtmp)
+ if (rdev->raid_disk < 0
+ && !rdev->faulty) {
+ if (mddev->pers->hot_add_disk(mddev,rdev))
+ spares++;
+ else
+ break;
+ }
+ }
+
+ if (!spares && (mddev->recovery_cp == MaxSector )) {
+ /* nothing we can do ... */
+ goto unlock;
+ }
+ if (mddev->pers->sync_request) {
+ set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
+ if (!spares)
+ set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
+ mddev->sync_thread = md_register_thread(md_do_sync,
+ mddev,
+ "md%d_resync");
+ if (!mddev->sync_thread) {
+ printk(KERN_ERR "md%d: could not start resync"
+ " thread...\n",
+ mdidx(mddev));
+ /* leave the spares where they are, it shouldn't hurt */
+ mddev->recovery = 0;
+ } else {
+ md_wakeup_thread(mddev->sync_thread);
+ }
+ }
+ unlock:
+ mddev_unlock(mddev);
+ }
+}
+
+int md_notify_reboot(struct notifier_block *this,
+ unsigned long code, void *x)
+{
+ struct list_head *tmp;
+ mddev_t *mddev;
+
+ if ((code == SYS_DOWN) || (code == SYS_HALT) || (code == SYS_POWER_OFF)) {
+
+ printk(KERN_INFO "md: stopping all md devices.\n");
+
+ ITERATE_MDDEV(mddev,tmp)
+ if (mddev_trylock(mddev)==0)
+ do_md_stop (mddev, 1);
+ /*
+ * certain more exotic SCSI devices are known to be
+ * volatile wrt too early system reboots. While the
+ * right place to handle this issue is the given
+ * driver, we do want to have a safe RAID driver ...
+ */
+ mdelay(1000*1);
+ }
+ return NOTIFY_DONE;
+}
+
+struct notifier_block md_notifier = {
+ .notifier_call = md_notify_reboot,
+ .next = NULL,
+ .priority = INT_MAX, /* before any real devices */
+};
+
+static void md_geninit(void)
+{
+ struct proc_dir_entry *p;
+
+ dprintk("md: sizeof(mdp_super_t) = %d\n", (int)sizeof(mdp_super_t));
+
+#ifdef CONFIG_PROC_FS
+ p = create_proc_entry("mdstat", S_IRUGO, NULL);
+ if (p)
+ p->proc_fops = &md_seq_fops;
+#endif
+}
+
+int __init md_init(void)
+{
+ int minor;
+
+ printk(KERN_INFO "md: md driver %d.%d.%d MAX_MD_DEVS=%d,"
+ " MD_SB_DISKS=%d\n",
+ MD_MAJOR_VERSION, MD_MINOR_VERSION,
+ MD_PATCHLEVEL_VERSION, MAX_MD_DEVS, MD_SB_DISKS);
+
+ if (register_blkdev(MAJOR_NR, "md"))
+ return -1;
+
+ devfs_mk_dir("md");
+ blk_register_region(MKDEV(MAJOR_NR, 0), MAX_MD_DEVS, THIS_MODULE,
+ md_probe, NULL, NULL);
+ for (minor=0; minor < MAX_MD_DEVS; ++minor) {
+ char name[16];
+ sprintf(name, "md/%d", minor);
+ devfs_register(NULL, name, DEVFS_FL_DEFAULT, MAJOR_NR, minor,
+ S_IFBLK | S_IRUSR | S_IWUSR, &md_fops, NULL);
+ }
+
+ register_reboot_notifier(&md_notifier);
+ raid_table_header = register_sysctl_table(raid_root_table, 1);
+
+ md_geninit();
+ return (0);
+}
+
+
+#ifndef MODULE
+
+/*
+ * Searches all registered partitions for autorun RAID arrays
+ * at boot time.
+ */
+static dev_t detected_devices[128];
+static int dev_cnt;
+
+void md_autodetect_dev(dev_t dev)
+{
+ if (dev_cnt >= 0 && dev_cnt < 127)
+ detected_devices[dev_cnt++] = dev;
+}
+
+
+static void autostart_arrays(void)
+{
+ mdk_rdev_t *rdev;
+ int i;
+
+ printk(KERN_INFO "md: Autodetecting RAID arrays.\n");
+
+ for (i = 0; i < dev_cnt; i++) {
+ dev_t dev = detected_devices[i];
+
+ rdev = md_import_device(dev,0, 0);
+ if (IS_ERR(rdev)) {
+ printk(KERN_ALERT "md: could not import %s!\n",
+ partition_name(dev));
+ continue;
+ }
+ if (rdev->faulty) {
+ MD_BUG();
+ continue;
+ }
+ list_add(&rdev->same_set, &pending_raid_disks);
+ }
+ dev_cnt = 0;
+
+ autorun_devices();
+}
+
+#endif
+
+static __exit void md_exit(void)
+{
+ int i;
+ blk_unregister_region(MKDEV(MAJOR_NR,0), MAX_MD_DEVS);
+ for (i=0; i < MAX_MD_DEVS; i++)
+ devfs_remove("md/%d", i);
+ devfs_remove("md");
+
+ unregister_blkdev(MAJOR_NR,"md");
+ unregister_reboot_notifier(&md_notifier);
+ unregister_sysctl_table(raid_table_header);
+#ifdef CONFIG_PROC_FS
+ remove_proc_entry("mdstat", NULL);
+#endif
+ for (i = 0; i < MAX_MD_DEVS; i++) {
+ struct gendisk *disk = disks[i];
+ mddev_t *mddev;
+ if (!disks[i])
+ continue;
+ mddev = disk->private_data;
+ del_gendisk(disk);
+ put_disk(disk);
+ mddev_put(mddev);
+ }
+}
+
+module_init(md_init)
+module_exit(md_exit)
+
+EXPORT_SYMBOL(register_md_personality);
+EXPORT_SYMBOL(unregister_md_personality);
+EXPORT_SYMBOL(md_error);
+EXPORT_SYMBOL(md_sync_acct);
+EXPORT_SYMBOL(md_done_sync);
+EXPORT_SYMBOL(md_write_start);
+EXPORT_SYMBOL(md_write_end);
+EXPORT_SYMBOL(md_handle_safemode);
+EXPORT_SYMBOL(md_register_thread);
+EXPORT_SYMBOL(md_unregister_thread);
+EXPORT_SYMBOL(md_wakeup_thread);
+EXPORT_SYMBOL(md_print_devices);
+EXPORT_SYMBOL(md_interrupt_thread);
+EXPORT_SYMBOL(md_check_recovery);
+MODULE_LICENSE("GPL");
diff --git a/tests/linux/md/orig b/tests/linux/md/orig
new file mode 100644
index 0000000..3f5b666
--- /dev/null
+++ b/tests/linux/md/orig
@@ -0,0 +1,3674 @@
+/*
+ md.c : Multiple Devices driver for Linux
+ Copyright (C) 1998, 1999, 2000 Ingo Molnar
+
+ completely rewritten, based on the MD driver code from Marc Zyngier
+
+ Changes:
+
+ - RAID-1/RAID-5 extensions by Miguel de Icaza, Gadi Oxman, Ingo Molnar
+ - boot support for linear and striped mode by Harald Hoyer <HarryH@Royal.Net>
+ - kerneld support by Boris Tobotras <boris@xtalk.msk.su>
+ - kmod support by: Cyrus Durgin
+ - RAID0 bugfixes: Mark Anthony Lisher <markal@iname.com>
+ - Devfs support by Richard Gooch <rgooch@atnf.csiro.au>
+
+ - lots of fixes and improvements to the RAID1/RAID5 and generic
+ RAID code (such as request based resynchronization):
+
+ Neil Brown <neilb@cse.unsw.edu.au>.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2, or (at your option)
+ any later version.
+
+ You should have received a copy of the GNU General Public License
+ (for example /usr/src/linux/COPYING); if not, write to the Free
+ Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+*/
+
+#include <linux/module.h>
+#include <linux/config.h>
+#include <linux/linkage.h>
+#include <linux/raid/md.h>
+#include <linux/sysctl.h>
+#include <linux/bio.h>
+#include <linux/devfs_fs_kernel.h>
+#include <linux/buffer_head.h> /* for invalidate_bdev */
+#include <linux/suspend.h>
+
+#include <linux/init.h>
+
+#ifdef CONFIG_KMOD
+#include <linux/kmod.h>
+#endif
+
+#define __KERNEL_SYSCALLS__
+#include <linux/unistd.h>
+
+#include <asm/unaligned.h>
+
+#define MAJOR_NR MD_MAJOR
+#define MD_DRIVER
+#define DEVICE_NR(device) (minor(device))
+
+#include <linux/blk.h>
+
+#define DEBUG 0
+#define dprintk(x...) ((void)(DEBUG && printk(x)))
+
+
+#ifndef MODULE
+static void autostart_arrays (void);
+#endif
+
+static mdk_personality_t *pers[MAX_PERSONALITY];
+static spinlock_t pers_lock = SPIN_LOCK_UNLOCKED;
+
+/*
+ * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit'
+ * is 1000 KB/sec, so the extra system load does not show up that much.
+ * Increase it if you want to have more _guaranteed_ speed. Note that
+ * the RAID driver will use the maximum available bandwidth if the IO
+ * subsystem is idle. There is also an 'absolute maximum' reconstruction
+ * speed limit - in case reconstruction slows down your system despite
+ * idle IO detection.
+ *
+ * you can change it via /proc/sys/dev/raid/speed_limit_min and _max.
+ */
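+
+/*
+ * For example (illustrative, added), with suitably chosen values:
+ *
+ *	echo 50000  > /proc/sys/dev/raid/speed_limit_min
+ *	echo 500000 > /proc/sys/dev/raid/speed_limit_max
+ *
+ * raises both the guaranteed minimum and the absolute maximum
+ * reconstruction speed.
+ */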
+
+static int sysctl_speed_limit_min = 1000;
+static int sysctl_speed_limit_max = 200000;
+
+static struct ctl_table_header *raid_table_header;
+
+static ctl_table raid_table[] = {
+ {
+ .ctl_name = DEV_RAID_SPEED_LIMIT_MIN,
+ .procname = "speed_limit_min",
+ .data = &sysctl_speed_limit_min,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+ {
+ .ctl_name = DEV_RAID_SPEED_LIMIT_MAX,
+ .procname = "speed_limit_max",
+ .data = &sysctl_speed_limit_max,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+ { .ctl_name = 0 }
+};
+
+static ctl_table raid_dir_table[] = {
+ {
+ .ctl_name = DEV_RAID,
+ .procname = "raid",
+ .maxlen = 0,
+ .mode = 0555,
+ .child = raid_table,
+ },
+ { .ctl_name = 0 }
+};
+
+static ctl_table raid_root_table[] = {
+ {
+ .ctl_name = CTL_DEV,
+ .procname = "dev",
+ .maxlen = 0,
+ .mode = 0555,
+ .child = raid_dir_table,
+ },
+ { .ctl_name = 0 }
+};
+
+static struct block_device_operations md_fops;
+
+static struct gendisk *disks[MAX_MD_DEVS];
+
+/*
+ * Enables iteration over all existing md arrays.
+ * all_mddevs_lock protects this list as well as mddev_map.
+ */
+static LIST_HEAD(all_mddevs);
+static spinlock_t all_mddevs_lock = SPIN_LOCK_UNLOCKED;
+
+
+/*
+ * Iterates through all used mddevs in the system.
+ * We take care to grab the all_mddevs_lock whenever navigating
+ * the list, and to always hold a refcount when unlocked.
+ * Any code which breaks out of this loop while owning
+ * a reference to the current mddev must mddev_put it.
+ */
+#define ITERATE_MDDEV(mddev,tmp) \
+ \
+ for (({ spin_lock(&all_mddevs_lock); \
+ tmp = all_mddevs.next; \
+ mddev = NULL;}); \
+ ({ if (tmp != &all_mddevs) \
+ mddev_get(list_entry(tmp, mddev_t, all_mddevs));\
+ spin_unlock(&all_mddevs_lock); \
+ if (mddev) mddev_put(mddev); \
+ mddev = list_entry(tmp, mddev_t, all_mddevs); \
+ tmp != &all_mddevs;}); \
+ ({ spin_lock(&all_mddevs_lock); \
+ tmp = tmp->next;}) \
+ )
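+
+/*
+ * Typical use of the iterator above (illustrative, added):
+ *
+ *	mddev_t *mddev;
+ *	struct list_head *tmp;
+ *
+ *	ITERATE_MDDEV(mddev,tmp) {
+ *		... inspect mddev here; the macro takes and drops the
+ *		reference itself, so no explicit mddev_get/mddev_put ...
+ *	}
+ */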
+
+static mddev_t *mddev_map[MAX_MD_DEVS];
+
+static int md_fail_request (request_queue_t *q, struct bio *bio)
+{
+ bio_io_error(bio, bio->bi_size);
+ return 0;
+}
+
+static inline mddev_t *mddev_get(mddev_t *mddev)
+{
+ atomic_inc(&mddev->active);
+ return mddev;
+}
+
+static void mddev_put(mddev_t *mddev)
+{
+ if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock))
+ return;
+ if (!mddev->raid_disks && list_empty(&mddev->disks)) {
+ list_del(&mddev->all_mddevs);
+ mddev_map[mdidx(mddev)] = NULL;
+ kfree(mddev);
+ MOD_DEC_USE_COUNT;
+ }
+ spin_unlock(&all_mddevs_lock);
+}
+
+static mddev_t * mddev_find(int unit)
+{
+ mddev_t *mddev, *new = NULL;
+
+ retry:
+ spin_lock(&all_mddevs_lock);
+ if (mddev_map[unit]) {
+ mddev = mddev_get(mddev_map[unit]);
+ spin_unlock(&all_mddevs_lock);
+ if (new)
+ kfree(new);
+ return mddev;
+ }
+ if (new) {
+ mddev_map[unit] = new;
+ list_add(&new->all_mddevs, &all_mddevs);
+ spin_unlock(&all_mddevs_lock);
+ MOD_INC_USE_COUNT;
+ return new;
+ }
+ spin_unlock(&all_mddevs_lock);
+
+ new = (mddev_t *) kmalloc(sizeof(*new), GFP_KERNEL);
+ if (!new)
+ return NULL;
+
+ memset(new, 0, sizeof(*new));
+
+ new->__minor = unit;
+ init_MUTEX(&new->reconfig_sem);
+ INIT_LIST_HEAD(&new->disks);
+ INIT_LIST_HEAD(&new->all_mddevs);
+ init_timer(&new->safemode_timer);
+ atomic_set(&new->active, 1);
+ blk_queue_make_request(&new->queue, md_fail_request);
+
+ goto retry;
+}
+
+static inline int mddev_lock(mddev_t * mddev)
+{
+ return down_interruptible(&mddev->reconfig_sem);
+}
+
+static inline void mddev_lock_uninterruptible(mddev_t * mddev)
+{
+ down(&mddev->reconfig_sem);
+}
+
+static inline int mddev_trylock(mddev_t * mddev)
+{
+ return down_trylock(&mddev->reconfig_sem);
+}
+
+static inline void mddev_unlock(mddev_t * mddev)
+{
+ up(&mddev->reconfig_sem);
+}
+
+mdk_rdev_t * find_rdev_nr(mddev_t *mddev, int nr)
+{
+ mdk_rdev_t * rdev;
+ struct list_head *tmp;
+
+ ITERATE_RDEV(mddev,rdev,tmp) {
+ if (rdev->desc_nr == nr)
+ return rdev;
+ }
+ return NULL;
+}
+
+static mdk_rdev_t * find_rdev(mddev_t * mddev, dev_t dev)
+{
+ struct list_head *tmp;
+ mdk_rdev_t *rdev;
+
+ ITERATE_RDEV(mddev,rdev,tmp) {
+ if (rdev->bdev->bd_dev == dev)
+ return rdev;
+ }
+ return NULL;
+}
+
+inline static sector_t calc_dev_sboffset(struct block_device *bdev)
+{
+ sector_t size = bdev->bd_inode->i_size >> BLOCK_SIZE_BITS;
+ return MD_NEW_SIZE_BLOCKS(size);
+}
+
+static sector_t calc_dev_size(mdk_rdev_t *rdev, unsigned chunk_size)
+{
+ sector_t size;
+
+ size = rdev->sb_offset;
+
+ if (chunk_size)
+ size &= ~((sector_t)chunk_size/1024 - 1);
+ return size;
+}
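+
+/*
+ * Example (numbers illustrative only): with a 64KiB chunk_size the mask
+ * above is ~(64-1), so a device whose sb_offset is 130000K is rounded
+ * down to a usable size of 129984K.  The rounding only works because
+ * chunk_size is a power of two, which do_md_run() checks before the
+ * array is started.
+ */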
+
+static int alloc_disk_sb(mdk_rdev_t * rdev)
+{
+ if (rdev->sb_page)
+ MD_BUG();
+
+ rdev->sb_page = alloc_page(GFP_KERNEL);
+ if (!rdev->sb_page) {
+ printk(KERN_ALERT "md: out of memory.\n");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static void free_disk_sb(mdk_rdev_t * rdev)
+{
+ if (rdev->sb_page) {
+ page_cache_release(rdev->sb_page);
+ rdev->sb_loaded = 0;
+ rdev->sb_page = NULL;
+ rdev->sb_offset = 0;
+ rdev->size = 0;
+ }
+}
+
+
+static int bi_complete(struct bio *bio, unsigned int bytes_done, int error)
+{
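+	/* a non-zero residual bi_size means the transfer is not complete yet */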
+ if (bio->bi_size)
+ return 1;
+
+ complete((struct completion*)bio->bi_private);
+ return 0;
+}
+
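+/*
+ * Synchronously read or write one page at 'sector' on 'bdev' using a bio
+ * built on the stack and a completion to wait for it; returns non-zero
+ * only if the transfer finished with BIO_UPTODATE set.
+ */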
+static int sync_page_io(struct block_device *bdev, sector_t sector, int size,
+ struct page *page, int rw)
+{
+ struct bio bio;
+ struct bio_vec vec;
+ struct completion event;
+
+ bio_init(&bio);
+ bio.bi_io_vec = &vec;
+ vec.bv_page = page;
+ vec.bv_len = size;
+ vec.bv_offset = 0;
+ bio.bi_vcnt = 1;
+ bio.bi_idx = 0;
+ bio.bi_size = size;
+ bio.bi_bdev = bdev;
+ bio.bi_sector = sector;
+ init_completion(&event);
+ bio.bi_private = &event;
+ bio.bi_end_io = bi_complete;
+ submit_bio(rw, &bio);
+ blk_run_queues();
+ wait_for_completion(&event);
+
+ return test_bit(BIO_UPTODATE, &bio.bi_flags);
+}
+
+static int read_disk_sb(mdk_rdev_t * rdev)
+{
+
+ if (!rdev->sb_page) {
+ MD_BUG();
+ return -EINVAL;
+ }
+ if (rdev->sb_loaded)
+ return 0;
+
+
+ if (!sync_page_io(rdev->bdev, rdev->sb_offset<<1, MD_SB_BYTES, rdev->sb_page, READ))
+ goto fail;
+ rdev->sb_loaded = 1;
+ return 0;
+
+fail:
+ printk(KERN_ERR "md: disabled device %s, could not read superblock.\n",
+ bdev_partition_name(rdev->bdev));
+ return -EINVAL;
+}
+
+static int uuid_equal(mdp_super_t *sb1, mdp_super_t *sb2)
+{
+ if ( (sb1->set_uuid0 == sb2->set_uuid0) &&
+ (sb1->set_uuid1 == sb2->set_uuid1) &&
+ (sb1->set_uuid2 == sb2->set_uuid2) &&
+ (sb1->set_uuid3 == sb2->set_uuid3))
+
+ return 1;
+
+ return 0;
+}
+
+
+static int sb_equal(mdp_super_t *sb1, mdp_super_t *sb2)
+{
+ int ret;
+ mdp_super_t *tmp1, *tmp2;
+
+ tmp1 = kmalloc(sizeof(*tmp1),GFP_KERNEL);
+ tmp2 = kmalloc(sizeof(*tmp2),GFP_KERNEL);
+
+ if (!tmp1 || !tmp2) {
+ ret = 0;
+ printk(KERN_INFO "md.c: sb1 is not equal to sb2!\n");
+ goto abort;
+ }
+
+ *tmp1 = *sb1;
+ *tmp2 = *sb2;
+
+ /*
+ * nr_disks is not constant
+ */
+ tmp1->nr_disks = 0;
+ tmp2->nr_disks = 0;
+
+ if (memcmp(tmp1, tmp2, MD_SB_GENERIC_CONSTANT_WORDS * 4))
+ ret = 0;
+ else
+ ret = 1;
+
+abort:
+ if (tmp1)
+ kfree(tmp1);
+ if (tmp2)
+ kfree(tmp2);
+
+ return ret;
+}
+
+static unsigned int calc_sb_csum(mdp_super_t * sb)
+{
+ unsigned int disk_csum, csum;
+
+ disk_csum = sb->sb_csum;
+ sb->sb_csum = 0;
+ csum = csum_partial((void *)sb, MD_SB_BYTES, 0);
+ sb->sb_csum = disk_csum;
+ return csum;
+}
+
+/*
+ * Handle superblock details.
+ * We want to be able to handle multiple superblock formats
+ * so we have a common interface to them all, and an array of
+ * different handlers.
+ * We rely on user-space to write the initial superblock, and support
+ * reading and updating of superblocks.
+ * Interface methods are:
+ * int load_super(mdk_rdev_t *dev, mdk_rdev_t *refdev, int minor_version)
+ * loads and validates a superblock on dev.
+ * if refdev != NULL, compare superblocks on both devices
+ * Return:
+ * 0 - dev has a superblock that is compatible with refdev
+ * 1 - dev has a superblock that is compatible and newer than refdev
+ * so dev should be used as the refdev in future
+ * -EINVAL superblock incompatible or invalid
+ * -othererror e.g. -EIO
+ *
+ * int validate_super(mddev_t *mddev, mdk_rdev_t *dev)
+ * Verify that dev is acceptable into mddev.
+ * The first time, mddev->raid_disks will be 0, and data from
+ * dev should be merged in. Subsequent calls check that dev
+ * is new enough. Return 0 or -EINVAL
+ *
+ * void sync_super(mddev_t *mddev, mdk_rdev_t *dev)
+ * Update the superblock for rdev with data in mddev
+ * This does not write to disc.
+ *
+ */
+
+struct super_type {
+ char *name;
+ struct module *owner;
+ int (*load_super)(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version);
+ int (*validate_super)(mddev_t *mddev, mdk_rdev_t *rdev);
+ void (*sync_super)(mddev_t *mddev, mdk_rdev_t *rdev);
+};
+
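+/*
+ * Dispatch sketch (illustration only): the generic code selects a handler
+ * through this table, indexed by mddev->major_version, e.g.
+ *
+ *	err = super_types[mddev->major_version].
+ *		load_super(rdev, refdev, mddev->minor_version);
+ *
+ * analyze_sbs() and sync_sbs() below use exactly this pattern.
+ */
+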
+/*
+ * load_super for 0.90.0
+ */
+static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
+{
+ mdp_super_t *sb;
+ int ret;
+ sector_t sb_offset;
+
+ /*
+ * Calculate the position of the superblock,
+ * it's at the end of the disk.
+ *
+ * It also happens to be a multiple of 4Kb.
+ */
+ sb_offset = calc_dev_sboffset(rdev->bdev);
+ rdev->sb_offset = sb_offset;
+
+ ret = read_disk_sb(rdev);
+ if (ret) return ret;
+
+ ret = -EINVAL;
+
+ sb = (mdp_super_t*)page_address(rdev->sb_page);
+
+ if (sb->md_magic != MD_SB_MAGIC) {
+ printk(KERN_ERR "md: invalid raid superblock magic on %s\n",
+ bdev_partition_name(rdev->bdev));
+ goto abort;
+ }
+
+ if (sb->major_version != 0 ||
+ sb->minor_version != 90) {
+ printk(KERN_WARNING "Bad version number %d.%d on %s\n",
+ sb->major_version, sb->minor_version,
+ bdev_partition_name(rdev->bdev));
+ goto abort;
+ }
+
+ if (sb->md_minor >= MAX_MD_DEVS) {
+ printk(KERN_ERR "md: %s: invalid raid minor (%x)\n",
+ bdev_partition_name(rdev->bdev), sb->md_minor);
+ goto abort;
+ }
+ if (sb->raid_disks <= 0)
+ goto abort;
+
+ if (calc_sb_csum(sb) != sb->sb_csum) {
+ printk(KERN_WARNING "md: invalid superblock checksum on %s\n",
+ bdev_partition_name(rdev->bdev));
+ goto abort;
+ }
+
+ rdev->preferred_minor = sb->md_minor;
+ rdev->data_offset = 0;
+
+ if (sb->level == MULTIPATH)
+ rdev->desc_nr = -1;
+ else
+ rdev->desc_nr = sb->this_disk.number;
+
+ if (refdev == 0)
+ ret = 1;
+ else {
+ __u64 ev1, ev2;
+ mdp_super_t *refsb = (mdp_super_t*)page_address(refdev->sb_page);
+ if (!uuid_equal(refsb, sb)) {
+ printk(KERN_WARNING "md: %s has different UUID to %s\n",
+ bdev_partition_name(rdev->bdev),
+ bdev_partition_name(refdev->bdev));
+ goto abort;
+ }
+ if (!sb_equal(refsb, sb)) {
+ printk(KERN_WARNING "md: %s has same UUID"
+ " but different superblock to %s\n",
+ bdev_partition_name(rdev->bdev),
+ bdev_partition_name(refdev->bdev));
+ goto abort;
+ }
+ ev1 = md_event(sb);
+ ev2 = md_event(refsb);
+ if (ev1 > ev2)
+ ret = 1;
+ else
+ ret = 0;
+ }
+ rdev->size = calc_dev_size(rdev, sb->chunk_size);
+
+ abort:
+ return ret;
+}
+
+/*
+ * validate_super for 0.90.0
+ */
+static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev)
+{
+ mdp_disk_t *desc;
+ mdp_super_t *sb = (mdp_super_t *)page_address(rdev->sb_page);
+
+ if (mddev->raid_disks == 0) {
+ mddev->major_version = 0;
+ mddev->minor_version = sb->minor_version;
+ mddev->patch_version = sb->patch_version;
+ mddev->persistent = ! sb->not_persistent;
+ mddev->chunk_size = sb->chunk_size;
+ mddev->ctime = sb->ctime;
+ mddev->utime = sb->utime;
+ mddev->level = sb->level;
+ mddev->layout = sb->layout;
+ mddev->raid_disks = sb->raid_disks;
+ mddev->size = sb->size;
+ mddev->events = md_event(sb);
+
+ if (sb->state & (1<<MD_SB_CLEAN))
+ mddev->recovery_cp = MaxSector;
+ else {
+ if (sb->events_hi == sb->cp_events_hi &&
+ sb->events_lo == sb->cp_events_lo) {
+ mddev->recovery_cp = sb->recovery_cp;
+ } else
+ mddev->recovery_cp = 0;
+ }
+
+ memcpy(mddev->uuid+0, &sb->set_uuid0, 4);
+ memcpy(mddev->uuid+4, &sb->set_uuid1, 4);
+ memcpy(mddev->uuid+8, &sb->set_uuid2, 4);
+ memcpy(mddev->uuid+12,&sb->set_uuid3, 4);
+
+ mddev->max_disks = MD_SB_DISKS;
+ } else {
+ __u64 ev1;
+ ev1 = md_event(sb);
+ ++ev1;
+ if (ev1 < mddev->events)
+ return -EINVAL;
+ }
+ if (mddev->level != LEVEL_MULTIPATH) {
+ rdev->raid_disk = -1;
+ rdev->in_sync = rdev->faulty = 0;
+ desc = sb->disks + rdev->desc_nr;
+
+ if (desc->state & (1<<MD_DISK_FAULTY))
+ rdev->faulty = 1;
+ else if (desc->state & (1<<MD_DISK_SYNC) &&
+ desc->raid_disk < mddev->raid_disks) {
+ rdev->in_sync = 1;
+ rdev->raid_disk = desc->raid_disk;
+ }
+ }
+ return 0;
+}
+
+/*
+ * sync_super for 0.90.0
+ */
+static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev)
+{
+ mdp_super_t *sb;
+ struct list_head *tmp;
+ mdk_rdev_t *rdev2;
+ int next_spare = mddev->raid_disks;
+
+ /* make rdev->sb match mddev data..
+ *
+ * 1/ zero out disks
+ * 2/ Add info for each disk, keeping track of highest desc_nr
+ * 3/ any empty disks < highest become removed
+ *
+ * disks[0] gets initialised to REMOVED because
+ * we cannot be sure from other fields if it has
+ * been initialised or not.
+ */
+ int highest = 0;
+ int i;
+ int active=0, working=0,failed=0,spare=0,nr_disks=0;
+
+ sb = (mdp_super_t*)page_address(rdev->sb_page);
+
+ memset(sb, 0, sizeof(*sb));
+
+ sb->md_magic = MD_SB_MAGIC;
+ sb->major_version = mddev->major_version;
+ sb->minor_version = mddev->minor_version;
+ sb->patch_version = mddev->patch_version;
+ sb->gvalid_words = 0; /* ignored */
+ memcpy(&sb->set_uuid0, mddev->uuid+0, 4);
+ memcpy(&sb->set_uuid1, mddev->uuid+4, 4);
+ memcpy(&sb->set_uuid2, mddev->uuid+8, 4);
+ memcpy(&sb->set_uuid3, mddev->uuid+12,4);
+
+ sb->ctime = mddev->ctime;
+ sb->level = mddev->level;
+ sb->size = mddev->size;
+ sb->raid_disks = mddev->raid_disks;
+ sb->md_minor = mddev->__minor;
+ sb->not_persistent = !mddev->persistent;
+ sb->utime = mddev->utime;
+ sb->state = 0;
+ sb->events_hi = (mddev->events>>32);
+ sb->events_lo = (u32)mddev->events;
+
+ if (mddev->in_sync)
+ {
+ sb->recovery_cp = mddev->recovery_cp;
+ sb->cp_events_hi = (mddev->events>>32);
+ sb->cp_events_lo = (u32)mddev->events;
+ if (mddev->recovery_cp == MaxSector)
+ sb->state = (1<< MD_SB_CLEAN);
+ } else
+ sb->recovery_cp = 0;
+
+ sb->layout = mddev->layout;
+ sb->chunk_size = mddev->chunk_size;
+
+ sb->disks[0].state = (1<<MD_DISK_REMOVED);
+ ITERATE_RDEV(mddev,rdev2,tmp) {
+ mdp_disk_t *d;
+ if (rdev2->raid_disk >= 0 && rdev2->in_sync && !rdev2->faulty)
+ rdev2->desc_nr = rdev2->raid_disk;
+ else
+ rdev2->desc_nr = next_spare++;
+ d = &sb->disks[rdev2->desc_nr];
+ nr_disks++;
+ d->number = rdev2->desc_nr;
+ d->major = MAJOR(rdev2->bdev->bd_dev);
+ d->minor = MINOR(rdev2->bdev->bd_dev);
+		if (rdev2->raid_disk >= 0 && rdev2->in_sync && !rdev2->faulty)
+ d->raid_disk = rdev2->raid_disk;
+ else
+ d->raid_disk = rdev2->desc_nr; /* compatibility */
+ if (rdev2->faulty) {
+ d->state = (1<<MD_DISK_FAULTY);
+ failed++;
+ } else if (rdev2->in_sync) {
+ d->state = (1<<MD_DISK_ACTIVE);
+ d->state |= (1<<MD_DISK_SYNC);
+ active++;
+ working++;
+ } else {
+ d->state = 0;
+ spare++;
+ working++;
+ }
+ if (rdev2->desc_nr > highest)
+ highest = rdev2->desc_nr;
+ }
+
+ /* now set the "removed" bit on any non-trailing holes */
+ for (i=0; i<highest; i++) {
+ mdp_disk_t *d = &sb->disks[i];
+ if (d->state == 0 && d->number == 0) {
+ d->number = i;
+ d->raid_disk = i;
+ d->state = (1<<MD_DISK_REMOVED);
+ }
+ }
+ sb->nr_disks = nr_disks;
+ sb->active_disks = active;
+ sb->working_disks = working;
+ sb->failed_disks = failed;
+ sb->spare_disks = spare;
+
+ sb->this_disk = sb->disks[rdev->desc_nr];
+ sb->sb_csum = calc_sb_csum(sb);
+}
+
+/*
+ * version 1 superblock
+ */
+
+static unsigned int calc_sb_1_csum(struct mdp_superblock_1 * sb)
+{
+ unsigned int disk_csum, csum;
+ int size = 256 + sb->max_dev*2;
+
+ disk_csum = sb->sb_csum;
+ sb->sb_csum = 0;
+ csum = csum_partial((void *)sb, size, 0);
+ sb->sb_csum = disk_csum;
+ return csum;
+}
+
+static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
+{
+ struct mdp_superblock_1 *sb;
+ int ret;
+ sector_t sb_offset;
+
+ /*
+ * Calculate the position of the superblock.
+ * It is always aligned to a 4K boundary and
+	 * depending on minor_version, it can be:
+ * 0: At least 8K, but less than 12K, from end of device
+ * 1: At start of device
+ * 2: 4K from start of device.
+ */
+ switch(minor_version) {
+ case 0:
+ sb_offset = rdev->bdev->bd_inode->i_size >> 9;
+ sb_offset -= 8*2;
+		sb_offset &= ~(4*2-1);
+ /* convert from sectors to K */
+ sb_offset /= 2;
+ break;
+ case 1:
+ sb_offset = 0;
+ break;
+ case 2:
+ sb_offset = 4;
+ break;
+ default:
+ return -EINVAL;
+ }
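+	/*
+	 * Example for minor_version 0 (numbers illustrative, assuming the
+	 * 4K rounding above): a device of 1000003 sectors gives
+	 * 1000003 - 16 = 999987, rounded down to 999984 sectors, i.e.
+	 * sb_offset = 499992K, which lands between 8K and 12K from the
+	 * end of the device.
+	 */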
+ rdev->sb_offset = sb_offset;
+
+ ret = read_disk_sb(rdev);
+ if (ret) return ret;
+
+
+ sb = (struct mdp_superblock_1*)page_address(rdev->sb_page);
+
+ if (sb->magic != cpu_to_le32(MD_SB_MAGIC) ||
+ sb->major_version != cpu_to_le32(1) ||
+ le32_to_cpu(sb->max_dev) > (4096-256)/2 ||
+ le64_to_cpu(sb->super_offset) != (rdev->sb_offset<<1) ||
+ sb->feature_map != 0)
+ return -EINVAL;
+
+ if (calc_sb_1_csum(sb) != sb->sb_csum) {
+ printk("md: invalid superblock checksum on %s\n",
+ bdev_partition_name(rdev->bdev));
+ return -EINVAL;
+ }
+ rdev->preferred_minor = 0xffff;
+ rdev->data_offset = le64_to_cpu(sb->data_offset);
+
+ if (refdev == 0)
+ return 1;
+ else {
+ __u64 ev1, ev2;
+ struct mdp_superblock_1 *refsb =
+ (struct mdp_superblock_1*)page_address(refdev->sb_page);
+
+ if (memcmp(sb->set_uuid, refsb->set_uuid, 16) != 0 ||
+ sb->level != refsb->level ||
+ sb->layout != refsb->layout ||
+ sb->chunksize != refsb->chunksize) {
+ printk(KERN_WARNING "md: %s has strangely different"
+ " superblock to %s\n",
+ bdev_partition_name(rdev->bdev),
+ bdev_partition_name(refdev->bdev));
+ return -EINVAL;
+ }
+ ev1 = le64_to_cpu(sb->events);
+ ev2 = le64_to_cpu(refsb->events);
+
+ if (ev1 > ev2)
+ return 1;
+ }
+ if (minor_version)
+ rdev->size = ((rdev->bdev->bd_inode->i_size>>9) - le64_to_cpu(sb->data_offset)) / 2;
+ else
+ rdev->size = rdev->sb_offset;
+ if (rdev->size < le64_to_cpu(sb->data_size)/2)
+ return -EINVAL;
+ rdev->size = le64_to_cpu(sb->data_size)/2;
+ if (le32_to_cpu(sb->chunksize))
+ rdev->size &= ~((sector_t)le32_to_cpu(sb->chunksize)/2 - 1);
+ return 0;
+}
+
+static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev)
+{
+ struct mdp_superblock_1 *sb = (struct mdp_superblock_1*)page_address(rdev->sb_page);
+
+ if (mddev->raid_disks == 0) {
+ mddev->major_version = 1;
+ mddev->minor_version = 0;
+ mddev->patch_version = 0;
+ mddev->persistent = 1;
+ mddev->chunk_size = le32_to_cpu(sb->chunksize) << 9;
+ mddev->ctime = le64_to_cpu(sb->ctime) & ((1ULL << 32)-1);
+ mddev->utime = le64_to_cpu(sb->utime) & ((1ULL << 32)-1);
+ mddev->level = le32_to_cpu(sb->level);
+ mddev->layout = le32_to_cpu(sb->layout);
+ mddev->raid_disks = le32_to_cpu(sb->raid_disks);
+ mddev->size = (u32)le64_to_cpu(sb->size);
+ mddev->events = le64_to_cpu(sb->events);
+
+ mddev->recovery_cp = le64_to_cpu(sb->resync_offset);
+ memcpy(mddev->uuid, sb->set_uuid, 16);
+
+ mddev->max_disks = (4096-256)/2;
+ } else {
+ __u64 ev1;
+ ev1 = le64_to_cpu(sb->events);
+ ++ev1;
+ if (ev1 < mddev->events)
+ return -EINVAL;
+ }
+
+ if (mddev->level != LEVEL_MULTIPATH) {
+ int role;
+ rdev->desc_nr = le32_to_cpu(sb->dev_number);
+ role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]);
+ switch(role) {
+ case 0xffff: /* spare */
+ rdev->in_sync = 0;
+ rdev->faulty = 0;
+ rdev->raid_disk = -1;
+ break;
+ case 0xfffe: /* faulty */
+ rdev->in_sync = 0;
+ rdev->faulty = 1;
+ rdev->raid_disk = -1;
+ break;
+ default:
+ rdev->in_sync = 1;
+ rdev->faulty = 0;
+ rdev->raid_disk = role;
+ break;
+ }
+ }
+ return 0;
+}
+
+static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev)
+{
+ struct mdp_superblock_1 *sb;
+ struct list_head *tmp;
+ mdk_rdev_t *rdev2;
+ int max_dev, i;
+ /* make rdev->sb match mddev and rdev data. */
+
+ sb = (struct mdp_superblock_1*)page_address(rdev->sb_page);
+
+ sb->feature_map = 0;
+ sb->pad0 = 0;
+ memset(sb->pad1, 0, sizeof(sb->pad1));
+ memset(sb->pad2, 0, sizeof(sb->pad2));
+ memset(sb->pad3, 0, sizeof(sb->pad3));
+
+ sb->utime = cpu_to_le64((__u64)mddev->utime);
+ sb->events = cpu_to_le64(mddev->events);
+ if (mddev->in_sync)
+ sb->resync_offset = cpu_to_le64(mddev->recovery_cp);
+ else
+ sb->resync_offset = cpu_to_le64(0);
+
+ max_dev = 0;
+ ITERATE_RDEV(mddev,rdev2,tmp)
+ if (rdev2->desc_nr > max_dev)
+ max_dev = rdev2->desc_nr;
+
+ sb->max_dev = max_dev;
+ for (i=0; i<max_dev;i++)
+		sb->dev_roles[i] = cpu_to_le16(0xfffe);
+
+ ITERATE_RDEV(mddev,rdev2,tmp) {
+ i = rdev2->desc_nr;
+ if (rdev2->faulty)
+ sb->dev_roles[i] = cpu_to_le16(0xfffe);
+ else if (rdev2->in_sync)
+ sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk);
+ else
+ sb->dev_roles[i] = cpu_to_le16(0xffff);
+ }
+
+ sb->recovery_offset = cpu_to_le64(0); /* not supported yet */
+}
+
+
+struct super_type super_types[] = {
+ [0] = {
+ .name = "0.90.0",
+ .owner = THIS_MODULE,
+ .load_super = super_90_load,
+ .validate_super = super_90_validate,
+ .sync_super = super_90_sync,
+ },
+ [1] = {
+ .name = "md-1",
+ .owner = THIS_MODULE,
+ .load_super = super_1_load,
+ .validate_super = super_1_validate,
+ .sync_super = super_1_sync,
+ },
+};
+
+static mdk_rdev_t * match_dev_unit(mddev_t *mddev, mdk_rdev_t *dev)
+{
+ struct list_head *tmp;
+ mdk_rdev_t *rdev;
+
+ ITERATE_RDEV(mddev,rdev,tmp)
+ if (rdev->bdev->bd_contains == dev->bdev->bd_contains)
+ return rdev;
+
+ return NULL;
+}
+
+static int match_mddev_units(mddev_t *mddev1, mddev_t *mddev2)
+{
+ struct list_head *tmp;
+ mdk_rdev_t *rdev;
+
+ ITERATE_RDEV(mddev1,rdev,tmp)
+ if (match_dev_unit(mddev2, rdev))
+ return 1;
+
+ return 0;
+}
+
+static LIST_HEAD(pending_raid_disks);
+
+static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev)
+{
+ mdk_rdev_t *same_pdev;
+
+ if (rdev->mddev) {
+ MD_BUG();
+ return -EINVAL;
+ }
+ same_pdev = match_dev_unit(mddev, rdev);
+ if (same_pdev)
+ printk(KERN_WARNING
+ "md%d: WARNING: %s appears to be on the same physical"
+			" disk as %s. True protection against single-disk"
+ " failure might be compromised.\n",
+ mdidx(mddev), bdev_partition_name(rdev->bdev),
+ bdev_partition_name(same_pdev->bdev));
+
+ /* Verify rdev->desc_nr is unique.
+ * If it is -1, assign a free number, else
+ * check number is not in use
+ */
+ if (rdev->desc_nr < 0) {
+ int choice = 0;
+ if (mddev->pers) choice = mddev->raid_disks;
+ while (find_rdev_nr(mddev, choice))
+ choice++;
+ rdev->desc_nr = choice;
+ } else {
+ if (find_rdev_nr(mddev, rdev->desc_nr))
+ return -EBUSY;
+ }
+
+ list_add(&rdev->same_set, &mddev->disks);
+ rdev->mddev = mddev;
+ printk(KERN_INFO "md: bind<%s>\n", bdev_partition_name(rdev->bdev));
+ return 0;
+}
+
+static void unbind_rdev_from_array(mdk_rdev_t * rdev)
+{
+ if (!rdev->mddev) {
+ MD_BUG();
+ return;
+ }
+ list_del_init(&rdev->same_set);
+ printk(KERN_INFO "md: unbind<%s>\n", bdev_partition_name(rdev->bdev));
+ rdev->mddev = NULL;
+}
+
+/*
+ * prevent the device from being mounted, repartitioned or
+ * otherwise reused by a RAID array (or any other kernel
+ * subsystem), by opening the device. [simply getting an
+ * inode is not enough, the SCSI module usage code needs
+ * an explicit open() on the device]
+ */
+static int lock_rdev(mdk_rdev_t *rdev, dev_t dev)
+{
+ int err = 0;
+ struct block_device *bdev;
+
+ bdev = bdget(dev);
+ if (!bdev)
+ return -ENOMEM;
+ err = blkdev_get(bdev, FMODE_READ|FMODE_WRITE, 0, BDEV_RAW);
+ if (err)
+ return err;
+ err = bd_claim(bdev, rdev);
+ if (err) {
+ blkdev_put(bdev, BDEV_RAW);
+ return err;
+ }
+ rdev->bdev = bdev;
+ return err;
+}
+
+static void unlock_rdev(mdk_rdev_t *rdev)
+{
+ struct block_device *bdev = rdev->bdev;
+ rdev->bdev = NULL;
+ if (!bdev)
+ MD_BUG();
+ bd_release(bdev);
+ blkdev_put(bdev, BDEV_RAW);
+}
+
+void md_autodetect_dev(dev_t dev);
+
+static void export_rdev(mdk_rdev_t * rdev)
+{
+ printk(KERN_INFO "md: export_rdev(%s)\n",
+ bdev_partition_name(rdev->bdev));
+ if (rdev->mddev)
+ MD_BUG();
+ free_disk_sb(rdev);
+ list_del_init(&rdev->same_set);
+#ifndef MODULE
+ md_autodetect_dev(rdev->bdev->bd_dev);
+#endif
+ unlock_rdev(rdev);
+ kfree(rdev);
+}
+
+static void kick_rdev_from_array(mdk_rdev_t * rdev)
+{
+ unbind_rdev_from_array(rdev);
+ export_rdev(rdev);
+}
+
+static void export_array(mddev_t *mddev)
+{
+ struct list_head *tmp;
+ mdk_rdev_t *rdev;
+
+ ITERATE_RDEV(mddev,rdev,tmp) {
+ if (!rdev->mddev) {
+ MD_BUG();
+ continue;
+ }
+ kick_rdev_from_array(rdev);
+ }
+ if (!list_empty(&mddev->disks))
+ MD_BUG();
+ mddev->raid_disks = 0;
+ mddev->major_version = 0;
+}
+
+static void print_desc(mdp_disk_t *desc)
+{
+ printk(" DISK<N:%d,%s(%d,%d),R:%d,S:%d>\n", desc->number,
+ partition_name(MKDEV(desc->major,desc->minor)),
+ desc->major,desc->minor,desc->raid_disk,desc->state);
+}
+
+static void print_sb(mdp_super_t *sb)
+{
+ int i;
+
+ printk(KERN_INFO
+ "md: SB: (V:%d.%d.%d) ID:<%08x.%08x.%08x.%08x> CT:%08x\n",
+ sb->major_version, sb->minor_version, sb->patch_version,
+ sb->set_uuid0, sb->set_uuid1, sb->set_uuid2, sb->set_uuid3,
+ sb->ctime);
+ printk(KERN_INFO "md: L%d S%08d ND:%d RD:%d md%d LO:%d CS:%d\n",
+ sb->level, sb->size, sb->nr_disks, sb->raid_disks,
+ sb->md_minor, sb->layout, sb->chunk_size);
+ printk(KERN_INFO "md: UT:%08x ST:%d AD:%d WD:%d"
+ " FD:%d SD:%d CSUM:%08x E:%08lx\n",
+ sb->utime, sb->state, sb->active_disks, sb->working_disks,
+ sb->failed_disks, sb->spare_disks,
+ sb->sb_csum, (unsigned long)sb->events_lo);
+
+ printk(KERN_INFO);
+ for (i = 0; i < MD_SB_DISKS; i++) {
+ mdp_disk_t *desc;
+
+ desc = sb->disks + i;
+ if (desc->number || desc->major || desc->minor ||
+ desc->raid_disk || (desc->state && (desc->state != 4))) {
+ printk(" D %2d: ", i);
+ print_desc(desc);
+ }
+ }
+ printk(KERN_INFO "md: THIS: ");
+ print_desc(&sb->this_disk);
+
+}
+
+static void print_rdev(mdk_rdev_t *rdev)
+{
+ printk(KERN_INFO "md: rdev %s, SZ:%08llu F:%d S:%d DN:%d ",
+ bdev_partition_name(rdev->bdev), (unsigned long long)rdev->size,
+ rdev->faulty, rdev->in_sync, rdev->desc_nr);
+ if (rdev->sb_loaded) {
+ printk(KERN_INFO "md: rdev superblock:\n");
+ print_sb((mdp_super_t*)page_address(rdev->sb_page));
+ } else
+ printk(KERN_INFO "md: no rdev superblock!\n");
+}
+
+void md_print_devices(void)
+{
+ struct list_head *tmp, *tmp2;
+ mdk_rdev_t *rdev;
+ mddev_t *mddev;
+
+ printk("\n");
+ printk("md: **********************************\n");
+ printk("md: * <COMPLETE RAID STATE PRINTOUT> *\n");
+ printk("md: **********************************\n");
+ ITERATE_MDDEV(mddev,tmp) {
+ printk("md%d: ", mdidx(mddev));
+
+ ITERATE_RDEV(mddev,rdev,tmp2)
+ printk("<%s>", bdev_partition_name(rdev->bdev));
+
+ ITERATE_RDEV(mddev,rdev,tmp2)
+ print_rdev(rdev);
+ }
+ printk("md: **********************************\n");
+ printk("\n");
+}
+
+
+static int write_disk_sb(mdk_rdev_t * rdev)
+{
+
+ if (!rdev->sb_loaded) {
+ MD_BUG();
+ return 1;
+ }
+ if (rdev->faulty) {
+ MD_BUG();
+ return 1;
+ }
+
+ dprintk(KERN_INFO "(write) %s's sb offset: %llu\n",
+ bdev_partition_name(rdev->bdev),
+ (unsigned long long)rdev->sb_offset);
+
+ if (sync_page_io(rdev->bdev, rdev->sb_offset<<1, MD_SB_BYTES, rdev->sb_page, WRITE))
+ return 0;
+
+ printk("md: write_disk_sb failed for device %s\n",
+ bdev_partition_name(rdev->bdev));
+ return 1;
+}
+
+static void sync_sbs(mddev_t * mddev)
+{
+ mdk_rdev_t *rdev;
+ struct list_head *tmp;
+
+ ITERATE_RDEV(mddev,rdev,tmp) {
+ super_types[mddev->major_version].
+ sync_super(mddev, rdev);
+ rdev->sb_loaded = 1;
+ }
+}
+
+static void md_update_sb(mddev_t * mddev)
+{
+ int err, count = 100;
+ struct list_head *tmp;
+ mdk_rdev_t *rdev;
+
+ mddev->sb_dirty = 0;
+repeat:
+ mddev->utime = get_seconds();
+ mddev->events ++;
+
+ if (!mddev->events) {
+ /*
+ * oops, this 64-bit counter should never wrap.
+ * Either we are in around ~1 trillion A.C., assuming
+ * 1 reboot per second, or we have a bug:
+ */
+ MD_BUG();
+ mddev->events --;
+ }
+ sync_sbs(mddev);
+
+ /*
+ * do not write anything to disk if using
+ * nonpersistent superblocks
+ */
+ if (!mddev->persistent)
+ return;
+
+ dprintk(KERN_INFO
+ "md: updating md%d RAID superblock on device (in sync %d)\n",
+ mdidx(mddev),mddev->in_sync);
+
+ err = 0;
+ ITERATE_RDEV(mddev,rdev,tmp) {
+ dprintk(KERN_INFO "md: ");
+ if (rdev->faulty)
+ dprintk("(skipping faulty ");
+
+ dprintk("%s ", bdev_partition_name(rdev->bdev));
+ if (!rdev->faulty) {
+ err += write_disk_sb(rdev);
+ } else
+ dprintk(")\n");
+ if (!err && mddev->level == LEVEL_MULTIPATH)
+ /* only need to write one superblock... */
+ break;
+ }
+ if (err) {
+ if (--count) {
+ printk(KERN_ERR "md: errors occurred during superblock"
+ " update, repeating\n");
+ goto repeat;
+ }
+ printk(KERN_ERR \
+ "md: excessive errors occurred during superblock update, exiting\n");
+ }
+}
+
+/*
+ * Import a device. If 'super_format' >= 0, then sanity check the superblock
+ *
+ * mark the device faulty if:
+ *
+ * - the device is nonexistent (zero size)
+ * - the device has no valid superblock
+ *
+ * a faulty rdev _never_ has rdev->sb set.
+ */
+static mdk_rdev_t *md_import_device(dev_t newdev, int super_format, int super_minor)
+{
+ int err;
+ mdk_rdev_t *rdev;
+ sector_t size;
+
+ rdev = (mdk_rdev_t *) kmalloc(sizeof(*rdev), GFP_KERNEL);
+ if (!rdev) {
+ printk(KERN_ERR "md: could not alloc mem for %s!\n",
+ partition_name(newdev));
+ return ERR_PTR(-ENOMEM);
+ }
+ memset(rdev, 0, sizeof(*rdev));
+
+ if ((err = alloc_disk_sb(rdev)))
+ goto abort_free;
+
+ err = lock_rdev(rdev, newdev);
+ if (err) {
+ printk(KERN_ERR "md: could not lock %s.\n",
+ partition_name(newdev));
+ goto abort_free;
+ }
+ rdev->desc_nr = -1;
+ rdev->faulty = 0;
+ rdev->in_sync = 0;
+ rdev->data_offset = 0;
+ atomic_set(&rdev->nr_pending, 0);
+
+ size = rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS;
+ if (!size) {
+ printk(KERN_WARNING
+ "md: %s has zero or unknown size, marking faulty!\n",
+ bdev_partition_name(rdev->bdev));
+ err = -EINVAL;
+ goto abort_free;
+ }
+
+ if (super_format >= 0) {
+ err = super_types[super_format].
+ load_super(rdev, NULL, super_minor);
+ if (err == -EINVAL) {
+ printk(KERN_WARNING
+ "md: %s has invalid sb, not importing!\n",
+ bdev_partition_name(rdev->bdev));
+ goto abort_free;
+ }
+ if (err < 0) {
+ printk(KERN_WARNING
+ "md: could not read %s's sb, not importing!\n",
+ bdev_partition_name(rdev->bdev));
+ goto abort_free;
+ }
+ }
+ INIT_LIST_HEAD(&rdev->same_set);
+
+ return rdev;
+
+abort_free:
+ if (rdev->sb_page) {
+ if (rdev->bdev)
+ unlock_rdev(rdev);
+ free_disk_sb(rdev);
+ }
+ kfree(rdev);
+ return ERR_PTR(err);
+}
+
+/*
+ * Check a full RAID array for plausibility
+ */
+
+
+static int analyze_sbs(mddev_t * mddev)
+{
+ int i;
+ struct list_head *tmp;
+ mdk_rdev_t *rdev, *freshest;
+
+ freshest = NULL;
+ ITERATE_RDEV(mddev,rdev,tmp)
+ switch (super_types[mddev->major_version].
+ load_super(rdev, freshest, mddev->minor_version)) {
+ case 1:
+ freshest = rdev;
+ break;
+ case 0:
+ break;
+ default:
+ printk( KERN_ERR \
+ "md: fatal superblock inconsistency in %s"
+ " -- removing from array\n",
+ bdev_partition_name(rdev->bdev));
+ kick_rdev_from_array(rdev);
+ }
+
+
+ super_types[mddev->major_version].
+ validate_super(mddev, freshest);
+
+ i = 0;
+ ITERATE_RDEV(mddev,rdev,tmp) {
+ if (rdev != freshest)
+ if (super_types[mddev->major_version].
+ validate_super(mddev, rdev)) {
+ printk(KERN_WARNING "md: kicking non-fresh %s"
+ " from array!\n",
+ bdev_partition_name(rdev->bdev));
+ kick_rdev_from_array(rdev);
+ continue;
+ }
+ if (mddev->level == LEVEL_MULTIPATH) {
+ rdev->desc_nr = i++;
+ rdev->raid_disk = rdev->desc_nr;
+ rdev->in_sync = 1;
+ }
+ }
+
+
+ /*
+ * Check if we can support this RAID array
+ */
+ if (mddev->major_version != MD_MAJOR_VERSION ||
+ mddev->minor_version > MD_MINOR_VERSION) {
+ printk(KERN_ALERT
+ "md: md%d: unsupported raid array version %d.%d.%d\n",
+ mdidx(mddev), mddev->major_version,
+ mddev->minor_version, mddev->patch_version);
+ goto abort;
+ }
+
+ if ((mddev->recovery_cp != MaxSector) && ((mddev->level == 1) ||
+ (mddev->level == 4) || (mddev->level == 5)))
+ printk(KERN_ERR "md: md%d: raid array is not clean"
+ " -- starting background reconstruction\n",
+ mdidx(mddev));
+
+ return 0;
+abort:
+ return 1;
+}
+
+static int device_size_calculation(mddev_t * mddev)
+{
+ int data_disks = 0;
+ unsigned int readahead;
+ struct list_head *tmp;
+ mdk_rdev_t *rdev;
+
+ /*
+ * Do device size calculation. Bail out if too small.
+ * (we have to do this after having validated chunk_size,
+ * because device size has to be modulo chunk_size)
+ */
+
+ ITERATE_RDEV(mddev,rdev,tmp) {
+ if (rdev->faulty)
+ continue;
+ if (rdev->size < mddev->chunk_size / 1024) {
+ printk(KERN_WARNING
+ "md: Dev %s smaller than chunk_size:"
+ " %lluk < %dk\n",
+ bdev_partition_name(rdev->bdev),
+ (unsigned long long)rdev->size,
+ mddev->chunk_size / 1024);
+ return -EINVAL;
+ }
+ }
+
+ switch (mddev->level) {
+ case LEVEL_MULTIPATH:
+ data_disks = 1;
+ break;
+ case -3:
+ data_disks = 1;
+ break;
+ case -2:
+ data_disks = 1;
+ break;
+ case LEVEL_LINEAR:
+ zoned_raid_size(mddev);
+ data_disks = 1;
+ break;
+ case 0:
+ zoned_raid_size(mddev);
+ data_disks = mddev->raid_disks;
+ break;
+ case 1:
+ data_disks = 1;
+ break;
+ case 4:
+ case 5:
+ data_disks = mddev->raid_disks-1;
+ break;
+ default:
+ printk(KERN_ERR "md: md%d: unsupported raid level %d\n",
+ mdidx(mddev), mddev->level);
+ goto abort;
+ }
+ if (!md_size[mdidx(mddev)])
+ md_size[mdidx(mddev)] = mddev->size * data_disks;
+
+ readahead = (VM_MAX_READAHEAD * 1024) / PAGE_SIZE;
+ if (!mddev->level || (mddev->level == 4) || (mddev->level == 5)) {
+ readahead = (mddev->chunk_size>>PAGE_SHIFT) * 4 * data_disks;
+ if (readahead < data_disks * (MAX_SECTORS>>(PAGE_SHIFT-9))*2)
+ readahead = data_disks * (MAX_SECTORS>>(PAGE_SHIFT-9))*2;
+ } else {
+ // (no multipath branch - it uses the default setting)
+ if (mddev->level == -3)
+ readahead = 0;
+ }
+
+ printk(KERN_INFO "md%d: max total readahead window set to %ldk\n",
+ mdidx(mddev), readahead*(PAGE_SIZE/1024));
+
+ printk(KERN_INFO
+ "md%d: %d data-disks, max readahead per data-disk: %ldk\n",
+ mdidx(mddev), data_disks, readahead/data_disks*(PAGE_SIZE/1024));
+ return 0;
+abort:
+ return 1;
+}
+
+static struct gendisk *md_probe(dev_t dev, int *part, void *data)
+{
+ static DECLARE_MUTEX(disks_sem);
+ int unit = MINOR(dev);
+ mddev_t *mddev = mddev_find(unit);
+ struct gendisk *disk;
+
+ if (!mddev)
+ return NULL;
+
+ down(&disks_sem);
+ if (disks[unit]) {
+ up(&disks_sem);
+ mddev_put(mddev);
+ return NULL;
+ }
+ disk = alloc_disk(1);
+ if (!disk) {
+ up(&disks_sem);
+ mddev_put(mddev);
+ return NULL;
+ }
+ disk->major = MD_MAJOR;
+ disk->first_minor = mdidx(mddev);
+ sprintf(disk->disk_name, "md%d", mdidx(mddev));
+ disk->fops = &md_fops;
+ disk->private_data = mddev;
+ disk->queue = &mddev->queue;
+ add_disk(disk);
+ disks[mdidx(mddev)] = disk;
+ up(&disks_sem);
+ return NULL;
+}
+
+void md_wakeup_thread(mdk_thread_t *thread);
+
+static void md_safemode_timeout(unsigned long data)
+{
+ mddev_t *mddev = (mddev_t *) data;
+
+ mddev->safemode = 1;
+ md_wakeup_thread(mddev->thread);
+}
+
+
+static int do_md_run(mddev_t * mddev)
+{
+ int pnum, err;
+ int chunk_size;
+ struct list_head *tmp;
+ mdk_rdev_t *rdev;
+ struct gendisk *disk;
+
+ if (list_empty(&mddev->disks)) {
+ MD_BUG();
+ return -EINVAL;
+ }
+
+ if (mddev->pers)
+ return -EBUSY;
+
+ /*
+ * Analyze all RAID superblock(s)
+ */
+ if (!mddev->raid_disks && analyze_sbs(mddev)) {
+ MD_BUG();
+ return -EINVAL;
+ }
+
+ chunk_size = mddev->chunk_size;
+ pnum = level_to_pers(mddev->level);
+
+ if ((pnum != MULTIPATH) && (pnum != RAID1)) {
+ if (!chunk_size) {
+ /*
+ * 'default chunksize' in the old md code used to
+ * be PAGE_SIZE, baaad.
+ * we abort here to be on the safe side. We don't
+ * want to continue the bad practice.
+ */
+ printk(KERN_ERR
+ "no chunksize specified, see 'man raidtab'\n");
+ return -EINVAL;
+ }
+ if (chunk_size > MAX_CHUNK_SIZE) {
+ printk(KERN_ERR "too big chunk_size: %d > %d\n",
+ chunk_size, MAX_CHUNK_SIZE);
+ return -EINVAL;
+ }
+ /*
+ * chunk-size has to be a power of 2 and multiples of PAGE_SIZE
+ */
+ if ( (1 << ffz(~chunk_size)) != chunk_size) {
+ MD_BUG();
+ return -EINVAL;
+ }
+ if (chunk_size < PAGE_SIZE) {
+ printk(KERN_ERR "too small chunk_size: %d < %ld\n",
+ chunk_size, PAGE_SIZE);
+ return -EINVAL;
+ }
+
+ /* devices must have minimum size of one chunk */
+ ITERATE_RDEV(mddev,rdev,tmp) {
+ if (rdev->faulty)
+ continue;
+ if (rdev->size < chunk_size / 1024) {
+ printk(KERN_WARNING
+ "md: Dev %s smaller than chunk_size:"
+ " %lluk < %dk\n",
+ bdev_partition_name(rdev->bdev),
+ (unsigned long long)rdev->size,
+ chunk_size / 1024);
+ return -EINVAL;
+ }
+ }
+ }
+ if (pnum >= MAX_PERSONALITY) {
+ MD_BUG();
+ return -EINVAL;
+ }
+
+#ifdef CONFIG_KMOD
+ if (!pers[pnum])
+ {
+ char module_name[80];
+ sprintf (module_name, "md-personality-%d", pnum);
+ request_module (module_name);
+ }
+#endif
+
+ if (device_size_calculation(mddev))
+ return -EINVAL;
+
+ /*
+ * Drop all container device buffers, from now on
+ * the only valid external interface is through the md
+ * device.
+ * Also find largest hardsector size
+ */
+ ITERATE_RDEV(mddev,rdev,tmp) {
+ if (rdev->faulty)
+ continue;
+ sync_blockdev(rdev->bdev);
+ invalidate_bdev(rdev->bdev, 0);
+ }
+
+ md_probe(mdidx(mddev), NULL, NULL);
+ disk = disks[mdidx(mddev)];
+ if (!disk)
+ return -ENOMEM;
+
+ spin_lock(&pers_lock);
+ if (!pers[pnum] || !try_module_get(pers[pnum]->owner)) {
+ spin_unlock(&pers_lock);
+ printk(KERN_ERR "md: personality %d is not loaded!\n",
+ pnum);
+ return -EINVAL;
+ }
+
+ mddev->pers = pers[pnum];
+ spin_unlock(&pers_lock);
+
+ blk_queue_make_request(&mddev->queue, mddev->pers->make_request);
+ printk("%s: setting max_sectors to %d, segment boundary to %d\n",
+ disk->disk_name,
+ chunk_size >> 9,
+ (chunk_size>>1)-1);
+ blk_queue_max_sectors(&mddev->queue, chunk_size >> 9);
+ blk_queue_segment_boundary(&mddev->queue, (chunk_size>>1) - 1);
+ mddev->queue.queuedata = mddev;
+
+ err = mddev->pers->run(mddev);
+ if (err) {
+ printk(KERN_ERR "md: pers->run() failed ...\n");
+ module_put(mddev->pers->owner);
+ mddev->pers = NULL;
+ return -EINVAL;
+ }
+ atomic_set(&mddev->writes_pending,0);
+ mddev->safemode = 0;
+ mddev->safemode_timer.function = md_safemode_timeout;
+ mddev->safemode_timer.data = (unsigned long) mddev;
+ mddev->safemode_delay = (20 * HZ)/1000 +1; /* 20 msec delay */
+ mddev->in_sync = 1;
+
+ set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+ md_wakeup_thread(mddev->thread);
+ set_capacity(disk, mddev->array_size<<1);
+ return 0;
+}
+
+static int restart_array(mddev_t *mddev)
+{
+ struct gendisk *disk = disks[mdidx(mddev)];
+ int err;
+
+ /*
+ * Complain if it has no devices
+ */
+ err = -ENXIO;
+ if (list_empty(&mddev->disks))
+ goto out;
+
+ if (mddev->pers) {
+ err = -EBUSY;
+ if (!mddev->ro)
+ goto out;
+
+ mddev->safemode = 0;
+ mddev->ro = 0;
+ set_disk_ro(disk, 0);
+
+ printk(KERN_INFO "md: md%d switched to read-write mode.\n",
+ mdidx(mddev));
+ /*
+ * Kick recovery or resync if necessary
+ */
+ set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+ md_wakeup_thread(mddev->thread);
+ err = 0;
+ } else {
+ printk(KERN_ERR "md: md%d has no personality assigned.\n",
+ mdidx(mddev));
+ err = -EINVAL;
+ }
+
+out:
+ return err;
+}
+
+static int do_md_stop(mddev_t * mddev, int ro)
+{
+ int err = 0;
+ struct gendisk *disk = disks[mdidx(mddev)];
+
+ if (atomic_read(&mddev->active)>2) {
+ printk("md: md%d still in use.\n",mdidx(mddev));
+ err = -EBUSY;
+ goto out;
+ }
+
+ if (mddev->pers) {
+ if (mddev->sync_thread) {
+ set_bit(MD_RECOVERY_INTR, &mddev->recovery);
+ md_unregister_thread(mddev->sync_thread);
+ mddev->sync_thread = NULL;
+ }
+
+ del_timer_sync(&mddev->safemode_timer);
+
+ invalidate_device(mk_kdev(disk->major, disk->first_minor), 1);
+
+ if (ro) {
+ err = -ENXIO;
+ if (mddev->ro)
+ goto out;
+ mddev->ro = 1;
+ } else {
+ if (mddev->ro)
+ set_disk_ro(disk, 0);
+ if (mddev->pers->stop(mddev)) {
+ err = -EBUSY;
+ if (mddev->ro)
+ set_disk_ro(disk, 1);
+ goto out;
+ }
+ module_put(mddev->pers->owner);
+ mddev->pers = NULL;
+ if (mddev->ro)
+ mddev->ro = 0;
+ }
+ if (mddev->raid_disks) {
+ /* mark array as shutdown cleanly */
+ mddev->in_sync = 1;
+ md_update_sb(mddev);
+ }
+ if (ro)
+ set_disk_ro(disk, 1);
+ }
+ /*
+ * Free resources if final stop
+ */
+ if (!ro) {
+ struct gendisk *disk;
+ printk(KERN_INFO "md: md%d stopped.\n", mdidx(mddev));
+
+ export_array(mddev);
+
+ mddev->array_size = 0;
+ disk = disks[mdidx(mddev)];
+ if (disk)
+ set_capacity(disk, 0);
+ } else
+ printk(KERN_INFO "md: md%d switched to read-only mode.\n",
+ mdidx(mddev));
+ err = 0;
+out:
+ return err;
+}
+
+static void autorun_array(mddev_t *mddev)
+{
+ mdk_rdev_t *rdev;
+ struct list_head *tmp;
+ int err;
+
+ if (list_empty(&mddev->disks)) {
+ MD_BUG();
+ return;
+ }
+
+ printk(KERN_INFO "md: running: ");
+
+ ITERATE_RDEV(mddev,rdev,tmp) {
+ printk("<%s>", bdev_partition_name(rdev->bdev));
+ }
+ printk("\n");
+
+ err = do_md_run (mddev);
+ if (err) {
+		printk(KERN_WARNING "md: do_md_run() returned %d\n", err);
+ do_md_stop (mddev, 0);
+ }
+}
+
+/*
+ * let's try to run arrays based on all disks that have arrived
+ * until now. (those are in pending_raid_disks)
+ *
+ * the method: pick the first pending disk, collect all disks with
+ * the same UUID, remove all from the pending list and put them into
+ * the 'same_array' list. Then order this list based on superblock
+ * update time (freshest comes first), kick out 'old' disks and
+ * compare superblocks. If everything's fine then run it.
+ *
+ * If "unit" is allocated, then bump its reference count
+ */
+static void autorun_devices(void)
+{
+ struct list_head candidates;
+ struct list_head *tmp;
+ mdk_rdev_t *rdev0, *rdev;
+ mddev_t *mddev;
+
+ printk(KERN_INFO "md: autorun ...\n");
+ while (!list_empty(&pending_raid_disks)) {
+ rdev0 = list_entry(pending_raid_disks.next,
+ mdk_rdev_t, same_set);
+
+ printk(KERN_INFO "md: considering %s ...\n",
+ bdev_partition_name(rdev0->bdev));
+ INIT_LIST_HEAD(&candidates);
+ ITERATE_RDEV_PENDING(rdev,tmp)
+ if (super_90_load(rdev, rdev0, 0) >= 0) {
+ printk(KERN_INFO "md: adding %s ...\n",
+ bdev_partition_name(rdev->bdev));
+ list_move(&rdev->same_set, &candidates);
+ }
+ /*
+ * now we have a set of devices, with all of them having
+ * mostly sane superblocks. It's time to allocate the
+ * mddev.
+ */
+
+ mddev = mddev_find(rdev0->preferred_minor);
+ if (!mddev) {
+ printk(KERN_ERR
+ "md: cannot allocate memory for md drive.\n");
+ break;
+ }
+ if (mddev_lock(mddev))
+ printk(KERN_WARNING "md: md%d locked, cannot run\n",
+ mdidx(mddev));
+ else if (mddev->raid_disks || mddev->major_version
+ || !list_empty(&mddev->disks)) {
+ printk(KERN_WARNING
+ "md: md%d already running, cannot run %s\n",
+ mdidx(mddev), bdev_partition_name(rdev0->bdev));
+ mddev_unlock(mddev);
+ } else {
+ printk(KERN_INFO "md: created md%d\n", mdidx(mddev));
+ ITERATE_RDEV_GENERIC(candidates,rdev,tmp) {
+ list_del_init(&rdev->same_set);
+ if (bind_rdev_to_array(rdev, mddev))
+ export_rdev(rdev);
+ }
+ autorun_array(mddev);
+ mddev_unlock(mddev);
+ }
+ /* on success, candidates will be empty, on error
+ * it won't...
+ */
+ ITERATE_RDEV_GENERIC(candidates,rdev,tmp)
+ export_rdev(rdev);
+ mddev_put(mddev);
+ }
+ printk(KERN_INFO "md: ... autorun DONE.\n");
+}
+
+/*
+ * import RAID devices based on one partition.
+ * If possible, the array gets run as well.
+ */
+
+static int autostart_array(dev_t startdev)
+{
+ int err = -EINVAL, i;
+ mdp_super_t *sb = NULL;
+ mdk_rdev_t *start_rdev = NULL, *rdev;
+
+ start_rdev = md_import_device(startdev, 0, 0);
+ if (IS_ERR(start_rdev)) {
+ printk(KERN_WARNING "md: could not import %s!\n",
+ partition_name(startdev));
+ return err;
+ }
+
+ /* NOTE: this can only work for 0.90.0 superblocks */
+ sb = (mdp_super_t*)page_address(start_rdev->sb_page);
+ if (sb->major_version != 0 ||
+ sb->minor_version != 90 ) {
+ printk(KERN_WARNING "md: can only autostart 0.90.0 arrays\n");
+ export_rdev(start_rdev);
+ return err;
+ }
+
+ if (start_rdev->faulty) {
+ printk(KERN_WARNING
+ "md: can not autostart based on faulty %s!\n",
+ bdev_partition_name(start_rdev->bdev));
+ export_rdev(start_rdev);
+ return err;
+ }
+ list_add(&start_rdev->same_set, &pending_raid_disks);
+
+ for (i = 0; i < MD_SB_DISKS; i++) {
+ mdp_disk_t *desc;
+ dev_t dev;
+
+ desc = sb->disks + i;
+ dev = MKDEV(desc->major, desc->minor);
+
+ if (!dev)
+ continue;
+ if (dev == startdev)
+ continue;
+ rdev = md_import_device(dev, 0, 0);
+ if (IS_ERR(rdev)) {
+ printk(KERN_WARNING "md: could not import %s,"
+ " trying to run array nevertheless.\n",
+ partition_name(dev));
+ continue;
+ }
+ list_add(&rdev->same_set, &pending_raid_disks);
+ }
+
+ /*
+ * possibly return codes
+ */
+ autorun_devices();
+ return 0;
+
+}
+
+
+static int get_version(void * arg)
+{
+ mdu_version_t ver;
+
+ ver.major = MD_MAJOR_VERSION;
+ ver.minor = MD_MINOR_VERSION;
+ ver.patchlevel = MD_PATCHLEVEL_VERSION;
+
+ if (copy_to_user(arg, &ver, sizeof(ver)))
+ return -EFAULT;
+
+ return 0;
+}
+
+static int get_array_info(mddev_t * mddev, void * arg)
+{
+ mdu_array_info_t info;
+ int nr,working,active,failed,spare;
+ mdk_rdev_t *rdev;
+ struct list_head *tmp;
+
+ nr=working=active=failed=spare=0;
+ ITERATE_RDEV(mddev,rdev,tmp) {
+ nr++;
+ if (rdev->faulty)
+ failed++;
+ else {
+ working++;
+ if (rdev->in_sync)
+ active++;
+ else
+ spare++;
+ }
+ }
+
+ info.major_version = mddev->major_version;
+ info.minor_version = mddev->minor_version;
+ info.patch_version = 1;
+ info.ctime = mddev->ctime;
+ info.level = mddev->level;
+ info.size = mddev->size;
+ info.nr_disks = nr;
+ info.raid_disks = mddev->raid_disks;
+ info.md_minor = mddev->__minor;
+ info.not_persistent= !mddev->persistent;
+
+ info.utime = mddev->utime;
+ info.state = 0;
+ if (mddev->in_sync)
+ info.state = (1<<MD_SB_CLEAN);
+ info.active_disks = active;
+ info.working_disks = working;
+ info.failed_disks = failed;
+ info.spare_disks = spare;
+
+ info.layout = mddev->layout;
+ info.chunk_size = mddev->chunk_size;
+
+ if (copy_to_user(arg, &info, sizeof(info)))
+ return -EFAULT;
+
+ return 0;
+}
+
+static int get_disk_info(mddev_t * mddev, void * arg)
+{
+ mdu_disk_info_t info;
+ unsigned int nr;
+ mdk_rdev_t *rdev;
+
+ if (copy_from_user(&info, arg, sizeof(info)))
+ return -EFAULT;
+
+ nr = info.number;
+
+ rdev = find_rdev_nr(mddev, nr);
+ if (rdev) {
+ info.major = MAJOR(rdev->bdev->bd_dev);
+ info.minor = MINOR(rdev->bdev->bd_dev);
+ info.raid_disk = rdev->raid_disk;
+ info.state = 0;
+ if (rdev->faulty)
+ info.state |= (1<<MD_DISK_FAULTY);
+ else if (rdev->in_sync) {
+ info.state |= (1<<MD_DISK_ACTIVE);
+ info.state |= (1<<MD_DISK_SYNC);
+ }
+ } else {
+ info.major = info.minor = 0;
+ info.raid_disk = -1;
+ info.state = (1<<MD_DISK_REMOVED);
+ }
+
+ if (copy_to_user(arg, &info, sizeof(info)))
+ return -EFAULT;
+
+ return 0;
+}
+
+static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info)
+{
+ mdk_rdev_t *rdev;
+ dev_t dev;
+ dev = MKDEV(info->major,info->minor);
+ if (!mddev->raid_disks) {
+ int err;
+ /* expecting a device which has a superblock */
+ rdev = md_import_device(dev, mddev->major_version, mddev->minor_version);
+ if (IS_ERR(rdev)) {
+ printk(KERN_WARNING
+ "md: md_import_device returned %ld\n",
+ PTR_ERR(rdev));
+ return PTR_ERR(rdev);
+ }
+ if (!list_empty(&mddev->disks)) {
+ mdk_rdev_t *rdev0 = list_entry(mddev->disks.next,
+ mdk_rdev_t, same_set);
+ int err = super_types[mddev->major_version]
+ .load_super(rdev, rdev0, mddev->minor_version);
+ if (err < 0) {
+ printk(KERN_WARNING
+ "md: %s has different UUID to %s\n",
+ bdev_partition_name(rdev->bdev),
+ bdev_partition_name(rdev0->bdev));
+ export_rdev(rdev);
+ return -EINVAL;
+ }
+ }
+ err = bind_rdev_to_array(rdev, mddev);
+ if (err)
+ export_rdev(rdev);
+ return err;
+ }
+
+ /*
+ * add_new_disk can be used once the array is assembled
+ * to add "hot spares". They must already have a superblock
+ * written
+ */
+ if (mddev->pers) {
+ int err;
+ if (!mddev->pers->hot_add_disk) {
+ printk(KERN_WARNING
+ "md%d: personality does not support diskops!\n",
+ mdidx(mddev));
+ return -EINVAL;
+ }
+ rdev = md_import_device(dev, mddev->major_version,
+ mddev->minor_version);
+ if (IS_ERR(rdev)) {
+ printk(KERN_WARNING
+ "md: md_import_device returned %ld\n",
+ PTR_ERR(rdev));
+ return PTR_ERR(rdev);
+ }
+ rdev->in_sync = 0; /* just to be sure */
+ rdev->raid_disk = -1;
+ err = bind_rdev_to_array(rdev, mddev);
+ if (err)
+ export_rdev(rdev);
+ if (mddev->thread)
+ md_wakeup_thread(mddev->thread);
+ return err;
+ }
+
+ /* otherwise, add_new_disk is only allowed
+ * for major_version==0 superblocks
+ */
+ if (mddev->major_version != 0) {
+ printk(KERN_WARNING "md%d: ADD_NEW_DISK not supported\n",
+ mdidx(mddev));
+ return -EINVAL;
+ }
+
+ if (!(info->state & (1<<MD_DISK_FAULTY))) {
+ int err;
+ rdev = md_import_device (dev, -1, 0);
+ if (IS_ERR(rdev)) {
+ printk(KERN_WARNING
+ "md: error, md_import_device() returned %ld\n",
+ PTR_ERR(rdev));
+ return PTR_ERR(rdev);
+ }
+ rdev->desc_nr = info->number;
+ if (info->raid_disk < mddev->raid_disks)
+ rdev->raid_disk = info->raid_disk;
+ else
+ rdev->raid_disk = -1;
+
+ rdev->faulty = 0;
+ if (rdev->raid_disk < mddev->raid_disks)
+ rdev->in_sync = (info->state & (1<<MD_DISK_SYNC));
+ else
+ rdev->in_sync = 0;
+
+ err = bind_rdev_to_array(rdev, mddev);
+ if (err) {
+ export_rdev(rdev);
+ return err;
+ }
+
+ if (!mddev->persistent) {
+ printk(KERN_INFO "md: nonpersistent superblock ...\n");
+ rdev->sb_offset = rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS;
+ } else
+ rdev->sb_offset = calc_dev_sboffset(rdev->bdev);
+ rdev->size = calc_dev_size(rdev, mddev->chunk_size);
+
+ if (!mddev->size || (mddev->size > rdev->size))
+ mddev->size = rdev->size;
+ }
+
+ return 0;
+}
+
+static int hot_generate_error(mddev_t * mddev, dev_t dev)
+{
+ struct request_queue *q;
+ mdk_rdev_t *rdev;
+
+ if (!mddev->pers)
+ return -ENODEV;
+
+ printk(KERN_INFO "md: trying to generate %s error in md%d ... \n",
+ partition_name(dev), mdidx(mddev));
+
+ rdev = find_rdev(mddev, dev);
+ if (!rdev) {
+ MD_BUG();
+ return -ENXIO;
+ }
+
+ if (rdev->desc_nr == -1) {
+ MD_BUG();
+ return -EINVAL;
+ }
+ if (!rdev->in_sync)
+ return -ENODEV;
+
+ q = bdev_get_queue(rdev->bdev);
+ if (!q) {
+ MD_BUG();
+ return -ENODEV;
+ }
+ printk(KERN_INFO "md: okay, generating error!\n");
+// q->oneshot_error = 1; // disabled for now
+
+ return 0;
+}
+
+static int hot_remove_disk(mddev_t * mddev, dev_t dev)
+{
+ mdk_rdev_t *rdev;
+
+ if (!mddev->pers)
+ return -ENODEV;
+
+ printk(KERN_INFO "md: trying to remove %s from md%d ... \n",
+ partition_name(dev), mdidx(mddev));
+
+ rdev = find_rdev(mddev, dev);
+ if (!rdev)
+ return -ENXIO;
+
+ if (rdev->raid_disk >= 0)
+ goto busy;
+
+ kick_rdev_from_array(rdev);
+ md_update_sb(mddev);
+
+ return 0;
+busy:
+ printk(KERN_WARNING "md: cannot remove active disk %s from md%d ... \n",
+ bdev_partition_name(rdev->bdev), mdidx(mddev));
+ return -EBUSY;
+}
+
+static int hot_add_disk(mddev_t * mddev, dev_t dev)
+{
+ int err;
+ unsigned int size;
+ mdk_rdev_t *rdev;
+
+ if (!mddev->pers)
+ return -ENODEV;
+
+ printk(KERN_INFO "md: trying to hot-add %s to md%d ... \n",
+ partition_name(dev), mdidx(mddev));
+
+ if (mddev->major_version != 0) {
+ printk(KERN_WARNING "md%d: HOT_ADD may only be used with"
+ " version-0 superblocks.\n",
+ mdidx(mddev));
+ return -EINVAL;
+ }
+ if (!mddev->pers->hot_add_disk) {
+ printk(KERN_WARNING
+ "md%d: personality does not support diskops!\n",
+ mdidx(mddev));
+ return -EINVAL;
+ }
+
+ rdev = md_import_device (dev, -1, 0);
+ if (IS_ERR(rdev)) {
+ printk(KERN_WARNING
+ "md: error, md_import_device() returned %ld\n",
+ PTR_ERR(rdev));
+ return -EINVAL;
+ }
+
+ rdev->sb_offset = calc_dev_sboffset(rdev->bdev);
+ size = calc_dev_size(rdev, mddev->chunk_size);
+ rdev->size = size;
+
+ if (size < mddev->size) {
+ printk(KERN_WARNING
+ "md%d: disk size %llu blocks < array size %llu\n",
+ mdidx(mddev), (unsigned long long)size,
+ (unsigned long long)mddev->size);
+ err = -ENOSPC;
+ goto abort_export;
+ }
+
+ if (rdev->faulty) {
+ printk(KERN_WARNING
+ "md: can not hot-add faulty %s disk to md%d!\n",
+ bdev_partition_name(rdev->bdev), mdidx(mddev));
+ err = -EINVAL;
+ goto abort_export;
+ }
+ rdev->in_sync = 0;
+ rdev->desc_nr = -1;
+ bind_rdev_to_array(rdev, mddev);
+
+ /*
+ * The rest should better be atomic, we can have disk failures
+ * noticed in interrupt contexts ...
+ */
+
+ if (rdev->desc_nr == mddev->max_disks) {
+ printk(KERN_WARNING "md%d: can not hot-add to full array!\n",
+ mdidx(mddev));
+ err = -EBUSY;
+ goto abort_unbind_export;
+ }
+
+ rdev->raid_disk = -1;
+
+ md_update_sb(mddev);
+
+ /*
+ * Kick recovery, maybe this spare has to be added to the
+ * array immediately.
+ */
+ set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+ md_wakeup_thread(mddev->thread);
+
+ return 0;
+
+abort_unbind_export:
+ unbind_rdev_from_array(rdev);
+
+abort_export:
+ export_rdev(rdev);
+ return err;
+}
+
+/*
+ * set_array_info is used two different ways
+ * The original usage is when creating a new array.
+ * In this usage, raid_disks is > 0 and it, together with
+ * level, size, not_persistent, layout and chunksize, determines the
+ * shape of the array.
+ * This will always create an array with a type-0.90.0 superblock.
+ * The newer usage is when assembling an array.
+ * In this case raid_disks will be 0, and the major_version field is
+ * used to determine which style super-blocks are to be found on the devices.
+ * The minor and patch _version numbers are also kept in case the
+ * super_block handler wishes to interpret them.
+ */
+static int set_array_info(mddev_t * mddev, mdu_array_info_t *info)
+{
+
+ if (info->raid_disks == 0) {
+ /* just setting version number for superblock loading */
+ if (info->major_version < 0 ||
+ info->major_version >= sizeof(super_types)/sizeof(super_types[0]) ||
+ super_types[info->major_version].name == NULL) {
+ /* maybe try to auto-load a module? */
+ printk(KERN_INFO
+ "md: superblock version %d not known\n",
+ info->major_version);
+ return -EINVAL;
+ }
+ mddev->major_version = info->major_version;
+ mddev->minor_version = info->minor_version;
+ mddev->patch_version = info->patch_version;
+ return 0;
+ }
+ mddev->major_version = MD_MAJOR_VERSION;
+ mddev->minor_version = MD_MINOR_VERSION;
+ mddev->patch_version = MD_PATCHLEVEL_VERSION;
+ mddev->ctime = get_seconds();
+
+ mddev->level = info->level;
+ mddev->size = info->size;
+ mddev->raid_disks = info->raid_disks;
+ /* don't set __minor, it is determined by which /dev/md* was
+	 * opened
+ */
+ if (info->state & (1<<MD_SB_CLEAN))
+ mddev->recovery_cp = MaxSector;
+ else
+ mddev->recovery_cp = 0;
+ mddev->persistent = ! info->not_persistent;
+
+ mddev->layout = info->layout;
+ mddev->chunk_size = info->chunk_size;
+
+ mddev->max_disks = MD_SB_DISKS;
+
+
+ /*
+ * Generate a 128 bit UUID
+ */
+ get_random_bytes(mddev->uuid, 16);
+
+ return 0;
+}
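+
+/*
+ * Illustration (field values made up): the two ways SET_ARRAY_INFO reaches
+ * set_array_info() above.  When assembling an existing array only the
+ * superblock version is passed:
+ *
+ *	info.raid_disks = 0; info.major_version = 1;
+ *
+ * When creating a new array (always 0.90.0 format) the shape is passed:
+ *
+ *	info.raid_disks = 4; info.level = 5; info.size = 1048576;
+ *	info.chunk_size = 64*1024; info.not_persistent = 0;
+ */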
+
+static int set_disk_faulty(mddev_t *mddev, dev_t dev)
+{
+ mdk_rdev_t *rdev;
+
+ rdev = find_rdev(mddev, dev);
+ if (!rdev)
+ return 0;
+
+ md_error(mddev, rdev);
+ return 1;
+}
+
+static int md_ioctl(struct inode *inode, struct file *file,
+ unsigned int cmd, unsigned long arg)
+{
+ unsigned int minor;
+ int err = 0;
+ struct hd_geometry *loc = (struct hd_geometry *) arg;
+ mddev_t *mddev = NULL;
+ kdev_t dev;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EACCES;
+
+ dev = inode->i_rdev;
+ minor = minor(dev);
+ if (minor >= MAX_MD_DEVS) {
+ MD_BUG();
+ return -EINVAL;
+ }
+
+ /*
+ * Commands dealing with the RAID driver but not any
+ * particular array:
+ */
+ switch (cmd)
+ {
+ case RAID_VERSION:
+ err = get_version((void *)arg);
+ goto done;
+
+ case PRINT_RAID_DEBUG:
+ err = 0;
+ md_print_devices();
+ goto done;
+
+#ifndef MODULE
+ case RAID_AUTORUN:
+ err = 0;
+ autostart_arrays();
+ goto done;
+#endif
+ default:;
+ }
+
+ /*
+ * Commands creating/starting a new array:
+ */
+
+ mddev = inode->i_bdev->bd_inode->u.generic_ip;
+
+ if (!mddev) {
+ BUG();
+ goto abort;
+ }
+
+
+ if (cmd == START_ARRAY) {
+ /* START_ARRAY doesn't need to lock the array as autostart_array
+ * does the locking, and it could even be a different array
+ */
+ err = autostart_array(arg);
+ if (err) {
+ printk(KERN_WARNING "md: autostart %s failed!\n",
+ partition_name(arg));
+ goto abort;
+ }
+ goto done;
+ }
+
+ err = mddev_lock(mddev);
+ if (err) {
+ printk(KERN_INFO
+ "md: ioctl lock interrupted, reason %d, cmd %d\n",
+ err, cmd);
+ goto abort;
+ }
+
+ switch (cmd)
+ {
+ case SET_ARRAY_INFO:
+
+ if (!list_empty(&mddev->disks)) {
+ printk(KERN_WARNING
+ "md: array md%d already has disks!\n",
+ mdidx(mddev));
+ err = -EBUSY;
+ goto abort_unlock;
+ }
+ if (mddev->raid_disks) {
+ printk(KERN_WARNING
+ "md: array md%d already initialised!\n",
+ mdidx(mddev));
+ err = -EBUSY;
+ goto abort_unlock;
+ }
+ {
+ mdu_array_info_t info;
+ if (!arg)
+ memset(&info, 0, sizeof(info));
+ else if (copy_from_user(&info, (void*)arg, sizeof(info))) {
+ err = -EFAULT;
+ goto abort_unlock;
+ }
+ err = set_array_info(mddev, &info);
+ if (err) {
+ printk(KERN_WARNING "md: couldn't set"
+ " array info. %d\n", err);
+ goto abort_unlock;
+ }
+ }
+ goto done_unlock;
+
+ default:;
+ }
+
+ /*
+ * Commands querying/configuring an existing array:
+ */
+	/* if we are not initialised yet, only ADD_NEW_DISK, STOP_ARRAY or RUN_ARRAY is allowed */
+ if (!mddev->raid_disks && cmd != ADD_NEW_DISK && cmd != STOP_ARRAY && cmd != RUN_ARRAY) {
+ err = -ENODEV;
+ goto abort_unlock;
+ }
+
+ /*
+ * Commands even a read-only array can execute:
+ */
+ switch (cmd)
+ {
+ case GET_ARRAY_INFO:
+ err = get_array_info(mddev, (void *)arg);
+ goto done_unlock;
+
+ case GET_DISK_INFO:
+ err = get_disk_info(mddev, (void *)arg);
+ goto done_unlock;
+
+ case RESTART_ARRAY_RW:
+ err = restart_array(mddev);
+ goto done_unlock;
+
+ case STOP_ARRAY:
+ err = do_md_stop (mddev, 0);
+ goto done_unlock;
+
+ case STOP_ARRAY_RO:
+ err = do_md_stop (mddev, 1);
+ goto done_unlock;
+
+		/*
+		 * We have a problem here: there is no easy way to give a CHS
+		 * virtual geometry. We currently pretend that we have 2 heads
+		 * and 4 sectors per track (so cylinders = capacity / 8), with a
+		 * BIG number of cylinders...  This drives dosfs just mad... ;-)
+		 */
+ case HDIO_GETGEO:
+ if (!loc) {
+ err = -EINVAL;
+ goto abort_unlock;
+ }
+ err = put_user (2, (char *) &loc->heads);
+ if (err)
+ goto abort_unlock;
+ err = put_user (4, (char *) &loc->sectors);
+ if (err)
+ goto abort_unlock;
+ err = put_user(get_capacity(disks[mdidx(mddev)])/8,
+ (short *) &loc->cylinders);
+ if (err)
+ goto abort_unlock;
+ err = put_user (get_start_sect(inode->i_bdev),
+ (long *) &loc->start);
+ goto done_unlock;
+ }
+
+ /*
+ * The remaining ioctls are changing the state of the
+ * superblock, so we do not allow read-only arrays
+ * here:
+ */
+ if (mddev->ro) {
+ err = -EROFS;
+ goto abort_unlock;
+ }
+
+ switch (cmd)
+ {
+ case ADD_NEW_DISK:
+ {
+ mdu_disk_info_t info;
+ if (copy_from_user(&info, (void*)arg, sizeof(info)))
+ err = -EFAULT;
+ else
+ err = add_new_disk(mddev, &info);
+ goto done_unlock;
+ }
+ case HOT_GENERATE_ERROR:
+ err = hot_generate_error(mddev, arg);
+ goto done_unlock;
+ case HOT_REMOVE_DISK:
+ err = hot_remove_disk(mddev, arg);
+ goto done_unlock;
+
+ case HOT_ADD_DISK:
+ err = hot_add_disk(mddev, arg);
+ goto done_unlock;
+
+ case SET_DISK_FAULTY:
+ err = set_disk_faulty(mddev, arg);
+ goto done_unlock;
+
+ case RUN_ARRAY:
+ {
+ err = do_md_run (mddev);
+ /*
+ * we have to clean up the mess if
+ * the array cannot be run for some
+ * reason ...
+			 * ->pers will not be set, so the superblock will
+ * not be updated.
+ */
+ if (err)
+ do_md_stop (mddev, 0);
+ goto done_unlock;
+ }
+
+ default:
+ if (_IOC_TYPE(cmd) == MD_MAJOR)
+ printk(KERN_WARNING "md: %s(pid %d) used"
+ " obsolete MD ioctl, upgrade your"
+					" software to use new ioctls.\n",
+ current->comm, current->pid);
+ err = -EINVAL;
+ goto abort_unlock;
+ }
+
+done_unlock:
+abort_unlock:
+ mddev_unlock(mddev);
+
+ return err;
+done:
+ if (err)
+ MD_BUG();
+abort:
+ return err;
+}
+
+static int md_open(struct inode *inode, struct file *file)
+{
+ /*
+ * Succeed if we can find or allocate a mddev structure.
+ */
+ mddev_t *mddev = mddev_find(minor(inode->i_rdev));
+ int err = -ENOMEM;
+
+ if (!mddev)
+ goto out;
+
+ if ((err = mddev_lock(mddev)))
+ goto put;
+
+ err = 0;
+ mddev_unlock(mddev);
+ inode->i_bdev->bd_inode->u.generic_ip = mddev_get(mddev);
+ put:
+ mddev_put(mddev);
+ out:
+ return err;
+}
+
+static int md_release(struct inode *inode, struct file * file)
+{
+ mddev_t *mddev = inode->i_bdev->bd_inode->u.generic_ip;
+
+ if (!mddev)
+ BUG();
+ mddev_put(mddev);
+
+ return 0;
+}
+
+static struct block_device_operations md_fops =
+{
+ .owner = THIS_MODULE,
+ .open = md_open,
+ .release = md_release,
+ .ioctl = md_ioctl,
+};
+
+int md_thread(void * arg)
+{
+ mdk_thread_t *thread = arg;
+
+ lock_kernel();
+
+ /*
+ * Detach thread
+ */
+
+ daemonize(thread->name, mdidx(thread->mddev));
+
+ current->exit_signal = SIGCHLD;
+ allow_signal(SIGKILL);
+ thread->tsk = current;
+
+ /*
+	 * md_thread is a 'system-thread'; its priority should be very
+ * high. We avoid resource deadlocks individually in each
+ * raid personality. (RAID5 does preallocation) We also use RR and
+ * the very same RT priority as kswapd, thus we will never get
+ * into a priority inversion deadlock.
+ *
+ * we definitely have to have equal or higher priority than
+ * bdflush, otherwise bdflush will deadlock if there are too
+ * many dirty RAID5 blocks.
+ */
+ unlock_kernel();
+
+ complete(thread->event);
+ while (thread->run) {
+ void (*run)(mddev_t *);
+
+ wait_event_interruptible(thread->wqueue,
+ test_bit(THREAD_WAKEUP, &thread->flags));
+ if (current->flags & PF_FREEZE)
+ refrigerator(PF_IOTHREAD);
+
+ clear_bit(THREAD_WAKEUP, &thread->flags);
+
+ run = thread->run;
+ if (run) {
+ run(thread->mddev);
+ blk_run_queues();
+ }
+ if (signal_pending(current))
+ flush_signals(current);
+ }
+ complete(thread->event);
+ return 0;
+}
+
+void md_wakeup_thread(mdk_thread_t *thread)
+{
+ if (thread) {
+ dprintk("md: waking up MD thread %p.\n", thread);
+ set_bit(THREAD_WAKEUP, &thread->flags);
+ wake_up(&thread->wqueue);
+ }
+}
+
+mdk_thread_t *md_register_thread(void (*run) (mddev_t *), mddev_t *mddev,
+ const char *name)
+{
+ mdk_thread_t *thread;
+ int ret;
+ struct completion event;
+
+ thread = (mdk_thread_t *) kmalloc
+ (sizeof(mdk_thread_t), GFP_KERNEL);
+ if (!thread)
+ return NULL;
+
+ memset(thread, 0, sizeof(mdk_thread_t));
+ init_waitqueue_head(&thread->wqueue);
+
+ init_completion(&event);
+ thread->event = &event;
+ thread->run = run;
+ thread->mddev = mddev;
+ thread->name = name;
+ ret = kernel_thread(md_thread, thread, 0);
+ if (ret < 0) {
+ kfree(thread);
+ return NULL;
+ }
+ wait_for_completion(&event);
+ return thread;
+}
+
+void md_interrupt_thread(mdk_thread_t *thread)
+{
+ if (!thread->tsk) {
+ MD_BUG();
+ return;
+ }
+ dprintk("interrupting MD-thread pid %d\n", thread->tsk->pid);
+ send_sig(SIGKILL, thread->tsk, 1);
+}
+
+void md_unregister_thread(mdk_thread_t *thread)
+{
+ struct completion event;
+
+ init_completion(&event);
+
+ thread->event = &event;
+ thread->run = NULL;
+ thread->name = NULL;
+ md_interrupt_thread(thread);
+ wait_for_completion(&event);
+ kfree(thread);
+}
+
+void md_error(mddev_t *mddev, mdk_rdev_t *rdev)
+{
+ dprintk("md_error dev:(%d:%d), rdev:(%d:%d), (caller: %p,%p,%p,%p).\n",
+ MD_MAJOR,mdidx(mddev),
+ MAJOR(rdev->bdev->bd_dev), MINOR(rdev->bdev->bd_dev),
+ __builtin_return_address(0),__builtin_return_address(1),
+ __builtin_return_address(2),__builtin_return_address(3));
+
+ if (!mddev) {
+ MD_BUG();
+ return;
+ }
+
+ if (!rdev || rdev->faulty)
+ return;
+ if (!mddev->pers->error_handler)
+ return;
+ mddev->pers->error_handler(mddev,rdev);
+ set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+ md_wakeup_thread(mddev->thread);
+}
+
+/* seq_file implementation /proc/mdstat */
+
+static void status_unused(struct seq_file *seq)
+{
+ int i = 0;
+ mdk_rdev_t *rdev;
+ struct list_head *tmp;
+
+ seq_printf(seq, "unused devices: ");
+
+ ITERATE_RDEV_PENDING(rdev,tmp) {
+ i++;
+ seq_printf(seq, "%s ",
+ bdev_partition_name(rdev->bdev));
+ }
+ if (!i)
+ seq_printf(seq, "<none>");
+
+ seq_printf(seq, "\n");
+}
+
+
+static void status_resync(struct seq_file *seq, mddev_t * mddev)
+{
+ unsigned long max_blocks, resync, res, dt, db, rt;
+
+ resync = (mddev->curr_resync - atomic_read(&mddev->recovery_active))/2;
+ max_blocks = mddev->size;
+
+ /*
+ * Should not happen.
+ */
+ if (!max_blocks) {
+ MD_BUG();
+ return;
+ }
+ res = (resync/1024)*1000/(max_blocks/1024 + 1);
+ {
+ int i, x = res/50, y = 20-x;
+ seq_printf(seq, "[");
+ for (i = 0; i < x; i++)
+ seq_printf(seq, "=");
+ seq_printf(seq, ">");
+ for (i = 0; i < y; i++)
+ seq_printf(seq, ".");
+ seq_printf(seq, "] ");
+ }
+ seq_printf(seq, " %s =%3lu.%lu%% (%lu/%lu)",
+ (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ?
+ "resync" : "recovery"),
+ res/10, res % 10, resync, max_blocks);
+
+ /*
+ * We do not want to overflow, so the order of operands and
+ * the * 100 / 100 trick are important. We do a +1 to be
+ * safe against division by zero. We only estimate anyway.
+ *
+ * dt: time from mark until now
+ * db: blocks written from mark until now
+ * rt: remaining time
+ */
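+ /*
+ * Illustrative numbers only (not from a real run): with dt = 100s,
+ * db = 5000 blocks and 1000000 blocks still to go, the formula below
+ * gives rt = (100 * (1000000 / 51)) / 100 = ~19607 seconds remaining.
+ */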
+ dt = ((jiffies - mddev->resync_mark) / HZ);
+ if (!dt) dt++;
+ db = resync - (mddev->resync_mark_cnt/2);
+ rt = (dt * ((max_blocks-resync) / (db/100+1)))/100;
+
+ seq_printf(seq, " finish=%lu.%lumin", rt / 60, (rt % 60)/6);
+
+ seq_printf(seq, " speed=%ldK/sec", db/dt);
+}
+
+static void *md_seq_start(struct seq_file *seq, loff_t *pos)
+{
+ struct list_head *tmp;
+ loff_t l = *pos;
+ mddev_t *mddev;
+
+ if (l > 0x10000)
+ return NULL;
+ if (!l--)
+ /* header */
+ return (void*)1;
+
+ spin_lock(&all_mddevs_lock);
+ list_for_each(tmp,&all_mddevs)
+ if (!l--) {
+ mddev = list_entry(tmp, mddev_t, all_mddevs);
+ mddev_get(mddev);
+ spin_unlock(&all_mddevs_lock);
+ return mddev;
+ }
+ spin_unlock(&all_mddevs_lock);
+ return (void*)2;/* tail */
+}
+
+static void *md_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ struct list_head *tmp;
+ mddev_t *next_mddev, *mddev = v;
+
+ ++*pos;
+ if (v == (void*)2)
+ return NULL;
+
+ spin_lock(&all_mddevs_lock);
+ if (v == (void*)1)
+ tmp = all_mddevs.next;
+ else
+ tmp = mddev->all_mddevs.next;
+ if (tmp != &all_mddevs)
+ next_mddev = mddev_get(list_entry(tmp,mddev_t,all_mddevs));
+ else {
+ next_mddev = (void*)2;
+ *pos = 0x10000;
+ }
+ spin_unlock(&all_mddevs_lock);
+
+ if (v != (void*)1)
+ mddev_put(mddev);
+ return next_mddev;
+
+}
+
+static void md_seq_stop(struct seq_file *seq, void *v)
+{
+ mddev_t *mddev = v;
+
+ if (mddev && v != (void*)1 && v != (void*)2)
+ mddev_put(mddev);
+}
+
+static int md_seq_show(struct seq_file *seq, void *v)
+{
+ mddev_t *mddev = v;
+ sector_t size;
+ struct list_head *tmp2;
+ mdk_rdev_t *rdev;
+ int i;
+
+ if (v == (void*)1) {
+ seq_printf(seq, "Personalities : ");
+ spin_lock(&pers_lock);
+ for (i = 0; i < MAX_PERSONALITY; i++)
+ if (pers[i])
+ seq_printf(seq, "[%s] ", pers[i]->name);
+
+ spin_unlock(&pers_lock);
+ seq_printf(seq, "\n");
+ return 0;
+ }
+ if (v == (void*)2) {
+ status_unused(seq);
+ return 0;
+ }
+
+ if (mddev_lock(mddev)!=0)
+ return -EINTR;
+ if (mddev->pers || mddev->raid_disks || !list_empty(&mddev->disks)) {
+ seq_printf(seq, "md%d : %sactive", mdidx(mddev),
+ mddev->pers ? "" : "in");
+ if (mddev->pers) {
+ if (mddev->ro)
+ seq_printf(seq, " (read-only)");
+ seq_printf(seq, " %s", mddev->pers->name);
+ }
+
+ size = 0;
+ ITERATE_RDEV(mddev,rdev,tmp2) {
+ seq_printf(seq, " %s[%d]",
+ bdev_partition_name(rdev->bdev), rdev->desc_nr);
+ if (rdev->faulty) {
+ seq_printf(seq, "(F)");
+ continue;
+ }
+ size += rdev->size;
+ }
+
+ if (!list_empty(&mddev->disks)) {
+ if (mddev->pers)
+ seq_printf(seq, "\n %llu blocks",
+ (unsigned long long)mddev->array_size);
+ else
+ seq_printf(seq, "\n %llu blocks",
+ (unsigned long long)size);
+ }
+
+ if (mddev->pers) {
+ mddev->pers->status (seq, mddev);
+ seq_printf(seq, "\n ");
+ if (mddev->curr_resync > 2)
+ status_resync (seq, mddev);
+ else if (mddev->curr_resync == 1 || mddev->curr_resync == 2)
+ seq_printf(seq, " resync=DELAYED");
+ }
+
+ seq_printf(seq, "\n");
+ }
+ mddev_unlock(mddev);
+
+ return 0;
+}
+
+static struct seq_operations md_seq_ops = {
+ .start = md_seq_start,
+ .next = md_seq_next,
+ .stop = md_seq_stop,
+ .show = md_seq_show,
+};
+
+static int md_seq_open(struct inode *inode, struct file *file)
+{
+ int error;
+
+ error = seq_open(file, &md_seq_ops);
+ return error;
+}
+
+static struct file_operations md_seq_fops = {
+ .open = md_seq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+int register_md_personality(int pnum, mdk_personality_t *p)
+{
+ if (pnum >= MAX_PERSONALITY) {
+ MD_BUG();
+ return -EINVAL;
+ }
+
+ spin_lock(&pers_lock);
+ if (pers[pnum]) {
+ spin_unlock(&pers_lock);
+ MD_BUG();
+ return -EBUSY;
+ }
+
+ pers[pnum] = p;
+ printk(KERN_INFO "md: %s personality registered as nr %d\n", p->name, pnum);
+ spin_unlock(&pers_lock);
+ return 0;
+}
+
+int unregister_md_personality(int pnum)
+{
+ if (pnum >= MAX_PERSONALITY) {
+ MD_BUG();
+ return -EINVAL;
+ }
+
+ printk(KERN_INFO "md: %s personality unregistered\n", pers[pnum]->name);
+ spin_lock(&pers_lock);
+ pers[pnum] = NULL;
+ spin_unlock(&pers_lock);
+ return 0;
+}
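+
+/*
+ * Sketch of how a personality module is expected to pair these calls;
+ * the identifiers FOO and foo_personality are illustrative and are not
+ * defined in this file:
+ *
+ *	static int __init raid_foo_init(void)
+ *	{
+ *		return register_md_personality(FOO, &foo_personality);
+ *	}
+ *
+ *	static void __exit raid_foo_exit(void)
+ *	{
+ *		unregister_md_personality(FOO);
+ *	}
+ */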
+
+void md_sync_acct(mdk_rdev_t *rdev, unsigned long nr_sectors)
+{
+ rdev->bdev->bd_contains->bd_disk->sync_io += nr_sectors;
+}
+
+static int is_mddev_idle(mddev_t *mddev)
+{
+ mdk_rdev_t * rdev;
+ struct list_head *tmp;
+ int idle;
+ unsigned long curr_events;
+
+ idle = 1;
+ ITERATE_RDEV(mddev,rdev,tmp) {
+ struct gendisk *disk = rdev->bdev->bd_contains->bd_disk;
+ curr_events = disk_stat_read(disk, read_sectors) +
+ disk_stat_read(disk, write_sectors) -
+ disk->sync_io;
+ if ((curr_events - rdev->last_events) > 32) {
+ rdev->last_events = curr_events;
+ idle = 0;
+ }
+ }
+ return idle;
+}
+
+void md_done_sync(mddev_t *mddev, int blocks, int ok)
+{
+ /* another "blocks" (512byte) blocks have been synced */
+ atomic_sub(blocks, &mddev->recovery_active);
+ wake_up(&mddev->recovery_wait);
+ if (!ok) {
+ set_bit(MD_RECOVERY_ERR, &mddev->recovery);
+ md_wakeup_thread(mddev->thread);
+ // stop recovery, signal do_sync ....
+ }
+}
+
+
+void md_write_start(mddev_t *mddev)
+{
+ if (!atomic_read(&mddev->writes_pending)) {
+ mddev_lock_uninterruptible(mddev);
+ if (mddev->in_sync) {
+ mddev->in_sync = 0;
+ del_timer(&mddev->safemode_timer);
+ md_update_sb(mddev);
+ }
+ atomic_inc(&mddev->writes_pending);
+ mddev_unlock(mddev);
+ } else
+ atomic_inc(&mddev->writes_pending);
+}
+
+void md_write_end(mddev_t *mddev)
+{
+ if (atomic_dec_and_test(&mddev->writes_pending)) {
+ if (mddev->safemode == 2)
+ md_wakeup_thread(mddev->thread);
+ else
+ mod_timer(&mddev->safemode_timer, jiffies + mddev->safemode_delay);
+ }
+}
+
+static inline void md_enter_safemode(mddev_t *mddev)
+{
+ mddev_lock_uninterruptible(mddev);
+ if (mddev->safemode && !atomic_read(&mddev->writes_pending) &&
+ !mddev->in_sync && mddev->recovery_cp == MaxSector) {
+ mddev->in_sync = 1;
+ md_update_sb(mddev);
+ }
+ mddev_unlock(mddev);
+
+ if (mddev->safemode == 1)
+ mddev->safemode = 0;
+}
+
+void md_handle_safemode(mddev_t *mddev)
+{
+ if (signal_pending(current)) {
+ printk(KERN_INFO "md: md%d in immediate safe mode\n",
+ mdidx(mddev));
+ mddev->safemode = 2;
+ flush_signals(current);
+ }
+ if (mddev->safemode)
+ md_enter_safemode(mddev);
+}
+
+
+DECLARE_WAIT_QUEUE_HEAD(resync_wait);
+
+#define SYNC_MARKS 10
+#define SYNC_MARK_STEP (3*HZ)
+static void md_do_sync(mddev_t *mddev)
+{
+ mddev_t *mddev2;
+ unsigned int max_sectors, currspeed = 0,
+ j, window;
+ unsigned long mark[SYNC_MARKS];
+ unsigned long mark_cnt[SYNC_MARKS];
+ int last_mark,m;
+ struct list_head *tmp;
+ unsigned long last_check;
+
+ /* just in case the thread restarts... */
+ if (test_bit(MD_RECOVERY_DONE, &mddev->recovery))
+ return;
+
+ /* we overload curr_resync somewhat here.
+ * 0 == not engaged in resync at all
+ * 2 == checking that there is no conflict with another sync
+ * 1 == like 2, but have yielded to allow conflicting resync to
+ * commence
+ * other == active in resync - this many blocks
+ */
+ do {
+ mddev->curr_resync = 2;
+
+ ITERATE_MDDEV(mddev2,tmp) {
+ if (mddev2 == mddev)
+ continue;
+ if (mddev2->curr_resync &&
+ match_mddev_units(mddev,mddev2)) {
+ printk(KERN_INFO "md: delaying resync of md%d"
+ " until md%d has finished resync (they"
+ " share one or more physical units)\n",
+ mdidx(mddev), mdidx(mddev2));
+ if (mddev < mddev2) {/* arbitrarily yield */
+ mddev->curr_resync = 1;
+ wake_up(&resync_wait);
+ }
+ if (wait_event_interruptible(resync_wait,
+ mddev2->curr_resync < mddev->curr_resync)) {
+ flush_signals(current);
+ mddev_put(mddev2);
+ goto skip;
+ }
+ }
+ if (mddev->curr_resync == 1) {
+ mddev_put(mddev2);
+ break;
+ }
+ }
+ } while (mddev->curr_resync < 2);
+
+ max_sectors = mddev->size << 1;
+
+ printk(KERN_INFO "md: syncing RAID array md%d\n", mdidx(mddev));
+ printk(KERN_INFO "md: minimum _guaranteed_ reconstruction speed:"
+ " %d KB/sec/disc.\n", sysctl_speed_limit_min);
+ printk(KERN_INFO "md: using maximum available idle IO bandwith "
+ "(but not more than %d KB/sec) for reconstruction.\n",
+ sysctl_speed_limit_max);
+
+ is_mddev_idle(mddev); /* this also initializes IO event counters */
+ if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
+ j = mddev->recovery_cp;
+ else
+ j = 0;
+ for (m = 0; m < SYNC_MARKS; m++) {
+ mark[m] = jiffies;
+ mark_cnt[m] = j;
+ }
+ last_mark = 0;
+ mddev->resync_mark = mark[last_mark];
+ mddev->resync_mark_cnt = mark_cnt[last_mark];
+
+ /*
+ * Tune reconstruction:
+ */
+ window = 32*(PAGE_SIZE/512);
+ printk(KERN_INFO "md: using %dk window, over a total of %d blocks.\n",
+ window/2,max_sectors/2);
+
+ atomic_set(&mddev->recovery_active, 0);
+ init_waitqueue_head(&mddev->recovery_wait);
+ last_check = 0;
+
+ if (j)
+ printk(KERN_INFO
+ "md: resuming recovery of md%d from checkpoint.\n",
+ mdidx(mddev));
+
+ while (j < max_sectors) {
+ int sectors;
+
+ sectors = mddev->pers->sync_request(mddev, j, currspeed < sysctl_speed_limit_min);
+ if (sectors < 0) {
+ set_bit(MD_RECOVERY_ERR, &mddev->recovery);
+ goto out;
+ }
+ atomic_add(sectors, &mddev->recovery_active);
+ j += sectors;
+ if (j>1) mddev->curr_resync = j;
+
+ if (last_check + window > j)
+ continue;
+
+ last_check = j;
+
+ if (test_bit(MD_RECOVERY_INTR, &mddev->recovery) ||
+ test_bit(MD_RECOVERY_ERR, &mddev->recovery))
+ break;
+
+ blk_run_queues();
+
+ repeat:
+ if (jiffies >= mark[last_mark] + SYNC_MARK_STEP ) {
+ /* step marks */
+ int next = (last_mark+1) % SYNC_MARKS;
+
+ mddev->resync_mark = mark[next];
+ mddev->resync_mark_cnt = mark_cnt[next];
+ mark[next] = jiffies;
+ mark_cnt[next] = j - atomic_read(&mddev->recovery_active);
+ last_mark = next;
+ }
+
+
+ if (signal_pending(current)) {
+ /*
+ * got a signal, exit.
+ */
+ printk(KERN_INFO
+ "md: md_do_sync() got signal ... exiting\n");
+ flush_signals(current);
+ set_bit(MD_RECOVERY_INTR, &mddev->recovery);
+ goto out;
+ }
+
+ /*
+ * this loop exits only when either we are slower than
+ * the 'hard' speed limit, or the system was IO-idle for
+ * a jiffy.
+ * the system might be non-idle CPU-wise, but we only care
+ * about not overloading the IO subsystem. (things like an
+ * e2fsck being done on the RAID array should execute fast)
+ */
+ cond_resched();
+
+ currspeed = (j-mddev->resync_mark_cnt)/2/((jiffies-mddev->resync_mark)/HZ +1) +1;
+
+ if (currspeed > sysctl_speed_limit_min) {
+ if ((currspeed > sysctl_speed_limit_max) ||
+ !is_mddev_idle(mddev)) {
+ current->state = TASK_INTERRUPTIBLE;
+ schedule_timeout(HZ/4);
+ goto repeat;
+ }
+ }
+ }
+ printk(KERN_INFO "md: md%d: sync done.\n",mdidx(mddev));
+ /*
+ * this also signals 'finished resyncing' to md_stop
+ */
+ out:
+ wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active));
+
+ /* tell personality that we are finished */
+ mddev->pers->sync_request(mddev, max_sectors, 1);
+
+ if (!test_bit(MD_RECOVERY_ERR, &mddev->recovery) &&
+ mddev->curr_resync > 2 &&
+ mddev->curr_resync > mddev->recovery_cp) {
+ if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
+ printk(KERN_INFO
+ "md: checkpointing recovery of md%d.\n",
+ mdidx(mddev));
+ mddev->recovery_cp = mddev->curr_resync;
+ } else
+ mddev->recovery_cp = MaxSector;
+ }
+
+ if (mddev->safemode)
+ md_enter_safemode(mddev);
+ skip:
+ mddev->curr_resync = 0;
+ set_bit(MD_RECOVERY_DONE, &mddev->recovery);
+ md_wakeup_thread(mddev->thread);
+}
+
+
+/*
+ * This routine is regularly called by all per-raid-array threads to
+ * deal with generic issues like resync and super-block update.
+ * Raid personalities that don't have a thread (linear/raid0) do not
+ * need this as they never do any recovery or update the superblock.
+ *
+ * It does not do any resync itself, but rather "forks" off other threads
+ * to do that as needed.
+ * When it is determined that resync is needed, we set MD_RECOVERY_RUNNING in
+ * "->recovery" and create a thread at ->sync_thread.
+ * When the thread finishes it sets MD_RECOVERY_DONE (and might set MD_RECOVERY_ERR)
+ * and wakes up this thread, which will reap that thread and finish up.
+ * This thread also removes any faulty devices (with nr_pending == 0).
+ *
+ * The overall approach is:
+ * 1/ if the superblock needs updating, update it.
+ * 2/ If a recovery thread is running, don't do anything else.
+ * 3/ If recovery has finished, clean up, possibly marking spares active.
+ * 4/ If there are any faulty devices, remove them.
+ * 5/ If array is degraded, try to add spare devices.
+ * 6/ If array has spares or is not in-sync, start a resync thread.
+ */
+void md_check_recovery(mddev_t *mddev)
+{
+ mdk_rdev_t *rdev;
+ struct list_head *rtmp;
+
+
+ dprintk(KERN_INFO "md: recovery thread got woken up ...\n");
+
+ if (mddev->ro)
+ return;
+ if ( ! (
+ mddev->sb_dirty ||
+ test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) ||
+ test_bit(MD_RECOVERY_DONE, &mddev->recovery)
+ ))
+ return;
+ if (mddev_trylock(mddev)==0) {
+ int spares =0;
+ if (mddev->sb_dirty)
+ md_update_sb(mddev);
+ if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) &&
+ !test_bit(MD_RECOVERY_DONE, &mddev->recovery))
+ /* resync/recovery still happening */
+ goto unlock;
+ if (mddev->sync_thread) {
+ /* resync has finished, collect result */
+ md_unregister_thread(mddev->sync_thread);
+ mddev->sync_thread = NULL;
+ if (!test_bit(MD_RECOVERY_ERR, &mddev->recovery)) {
+ /* success...*/
+ /* activate any spares */
+ mddev->pers->spare_active(mddev);
+ }
+ md_update_sb(mddev);
+ mddev->recovery = 0;
+ wake_up(&resync_wait);
+ goto unlock;
+ }
+ if (mddev->recovery) {
+ /* that's odd.. */
+ mddev->recovery = 0;
+ wake_up(&resync_wait);
+ }
+
+ /* no recovery is running.
+ * remove any failed drives, then
+ * add spares if possible
+ */
+ ITERATE_RDEV(mddev,rdev,rtmp) {
+ if (rdev->raid_disk >= 0 &&
+ rdev->faulty &&
+ atomic_read(&rdev->nr_pending)==0) {
+ mddev->pers->hot_remove_disk(mddev, rdev->raid_disk);
+ rdev->raid_disk = -1;
+ }
+ if (!rdev->faulty && rdev->raid_disk >= 0 && !rdev->in_sync)
+ spares++;
+ }
+ if (mddev->degraded) {
+ ITERATE_RDEV(mddev,rdev,rtmp)
+ if (rdev->raid_disk < 0
+ && !rdev->faulty) {
+ if (mddev->pers->hot_add_disk(mddev,rdev))
+ spares++;
+ else
+ break;
+ }
+ }
+
+ if (!spares && (mddev->recovery_cp == MaxSector )) {
+ /* nothing we can do ... */
+ goto unlock;
+ }
+ if (mddev->pers->sync_request) {
+ set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
+ if (!spares)
+ set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
+ mddev->sync_thread = md_register_thread(md_do_sync,
+ mddev,
+ "md%d_resync");
+ if (!mddev->sync_thread) {
+ printk(KERN_ERR "md%d: could not start resync"
+ " thread...\n",
+ mdidx(mddev));
+ /* leave the spares where they are, it shouldn't hurt */
+ mddev->recovery = 0;
+ } else {
+ md_wakeup_thread(mddev->sync_thread);
+ }
+ }
+ unlock:
+ mddev_unlock(mddev);
+ }
+}
+
+int md_notify_reboot(struct notifier_block *this,
+ unsigned long code, void *x)
+{
+ struct list_head *tmp;
+ mddev_t *mddev;
+
+ if ((code == SYS_DOWN) || (code == SYS_HALT) || (code == SYS_POWER_OFF)) {
+
+ printk(KERN_INFO "md: stopping all md devices.\n");
+
+ ITERATE_MDDEV(mddev,tmp)
+ if (mddev_trylock(mddev)==0)
+ do_md_stop (mddev, 1);
+ /*
+ * certain more exotic SCSI devices are known to be
+ * volatile wrt too early system reboots. While the
+ * right place to handle this issue is the given
+ * driver, we do want to have a safe RAID driver ...
+ */
+ mdelay(1000*1);
+ }
+ return NOTIFY_DONE;
+}
+
+struct notifier_block md_notifier = {
+ .notifier_call = md_notify_reboot,
+ .next = NULL,
+ .priority = INT_MAX, /* before any real devices */
+};
+
+static void md_geninit(void)
+{
+ struct proc_dir_entry *p;
+
+ dprintk("md: sizeof(mdp_super_t) = %d\n", (int)sizeof(mdp_super_t));
+
+#ifdef CONFIG_PROC_FS
+ p = create_proc_entry("mdstat", S_IRUGO, NULL);
+ if (p)
+ p->proc_fops = &md_seq_fops;
+#endif
+}
+
+int __init md_init(void)
+{
+ int minor;
+
+ printk(KERN_INFO "md: md driver %d.%d.%d MAX_MD_DEVS=%d,"
+ " MD_SB_DISKS=%d\n",
+ MD_MAJOR_VERSION, MD_MINOR_VERSION,
+ MD_PATCHLEVEL_VERSION, MAX_MD_DEVS, MD_SB_DISKS);
+
+ if (register_blkdev(MAJOR_NR, "md"))
+ return -1;
+
+ devfs_mk_dir("md");
+ blk_register_region(MKDEV(MAJOR_NR, 0), MAX_MD_DEVS, THIS_MODULE,
+ md_probe, NULL, NULL);
+ for (minor=0; minor < MAX_MD_DEVS; ++minor) {
+ char name[16];
+ sprintf(name, "md/%d", minor);
+ devfs_register(NULL, name, DEVFS_FL_DEFAULT, MAJOR_NR, minor,
+ S_IFBLK | S_IRUSR | S_IWUSR, &md_fops, NULL);
+ }
+
+ register_reboot_notifier(&md_notifier);
+ raid_table_header = register_sysctl_table(raid_root_table, 1);
+
+ md_geninit();
+ return (0);
+}
+
+
+#ifndef MODULE
+
+/*
+ * Searches all registered partitions for autorun RAID arrays
+ * at boot time.
+ */
+static dev_t detected_devices[128];
+static int dev_cnt;
+
+void md_autodetect_dev(dev_t dev)
+{
+ if (dev_cnt >= 0 && dev_cnt < 127)
+ detected_devices[dev_cnt++] = dev;
+}
+
+
+static void autostart_arrays(void)
+{
+ mdk_rdev_t *rdev;
+ int i;
+
+ printk(KERN_INFO "md: Autodetecting RAID arrays.\n");
+
+ for (i = 0; i < dev_cnt; i++) {
+ dev_t dev = detected_devices[i];
+
+ rdev = md_import_device(dev,0, 0);
+ if (IS_ERR(rdev)) {
+ printk(KERN_ALERT "md: could not import %s!\n",
+ partition_name(dev));
+ continue;
+ }
+ if (rdev->faulty) {
+ MD_BUG();
+ continue;
+ }
+ list_add(&rdev->same_set, &pending_raid_disks);
+ }
+ dev_cnt = 0;
+
+ autorun_devices();
+}
+
+#endif
+
+static __exit void md_exit(void)
+{
+ int i;
+ blk_unregister_region(MKDEV(MAJOR_NR,0), MAX_MD_DEVS);
+ for (i=0; i < MAX_MD_DEVS; i++)
+ devfs_remove("md/%d", i);
+ devfs_remove("md");
+
+ unregister_blkdev(MAJOR_NR,"md");
+ unregister_reboot_notifier(&md_notifier);
+ unregister_sysctl_table(raid_table_header);
+#ifdef CONFIG_PROC_FS
+ remove_proc_entry("mdstat", NULL);
+#endif
+ for (i = 0; i < MAX_MD_DEVS; i++) {
+ struct gendisk *disk = disks[i];
+ mddev_t *mddev;
+ if (!disks[i])
+ continue;
+ mddev = disk->private_data;
+ del_gendisk(disk);
+ put_disk(disk);
+ mddev_put(mddev);
+ }
+}
+
+module_init(md_init)
+module_exit(md_exit)
+
+EXPORT_SYMBOL(register_md_personality);
+EXPORT_SYMBOL(unregister_md_personality);
+EXPORT_SYMBOL(md_error);
+EXPORT_SYMBOL(md_sync_acct);
+EXPORT_SYMBOL(md_done_sync);
+EXPORT_SYMBOL(md_write_start);
+EXPORT_SYMBOL(md_write_end);
+EXPORT_SYMBOL(md_handle_safemode);
+EXPORT_SYMBOL(md_register_thread);
+EXPORT_SYMBOL(md_unregister_thread);
+EXPORT_SYMBOL(md_wakeup_thread);
+EXPORT_SYMBOL(md_print_devices);
+EXPORT_SYMBOL(md_interrupt_thread);
+EXPORT_SYMBOL(md_check_recovery);
+MODULE_LICENSE("GPL");
diff --git a/tests/linux/md/patch b/tests/linux/md/patch
new file mode 100644
index 0000000..1370009
--- /dev/null
+++ b/tests/linux/md/patch
@@ -0,0 +1,117 @@
+***************
+*** 1453,1542 ****
+ return 1;
+ }
+
+- #undef OLD_LEVEL
+-
+- static int device_size_calculation(mddev_t * mddev)
+- {
+- int data_disks = 0;
+- unsigned int readahead;
+- struct list_head *tmp;
+- mdk_rdev_t *rdev;
+-
+- /*
+- * Do device size calculation. Bail out if too small.
+- * (we have to do this after having validated chunk_size,
+- * because device size has to be modulo chunk_size)
+- */
+-
+- ITERATE_RDEV(mddev,rdev,tmp) {
+- if (rdev->faulty)
+- continue;
+- if (rdev->size < mddev->chunk_size / 1024) {
+- printk(KERN_WARNING
+- "md: Dev %s smaller than chunk_size:"
+- " %lluk < %dk\n",
+- bdev_partition_name(rdev->bdev),
+- (unsigned long long)rdev->size,
+- mddev->chunk_size / 1024);
+- return -EINVAL;
+- }
+- }
+-
+- switch (mddev->level) {
+- case LEVEL_MULTIPATH:
+- data_disks = 1;
+- break;
+- case -3:
+- data_disks = 1;
+- break;
+- case -2:
+- data_disks = 1;
+- break;
+- case LEVEL_LINEAR:
+- zoned_raid_size(mddev);
+- data_disks = 1;
+- break;
+- case 0:
+- zoned_raid_size(mddev);
+- data_disks = mddev->raid_disks;
+- break;
+- case 1:
+- data_disks = 1;
+- break;
+- case 4:
+- case 5:
+- data_disks = mddev->raid_disks-1;
+- break;
+- default:
+- printk(KERN_ERR "md: md%d: unsupported raid level %d\n",
+- mdidx(mddev), mddev->level);
+- goto abort;
+- }
+- if (!md_size[mdidx(mddev)])
+- md_size[mdidx(mddev)] = mddev->size * data_disks;
+-
+- readahead = (VM_MAX_READAHEAD * 1024) / PAGE_SIZE;
+- if (!mddev->level || (mddev->level == 4) || (mddev->level == 5)) {
+- readahead = (mddev->chunk_size>>PAGE_SHIFT) * 4 * data_disks;
+- if (readahead < data_disks * (MAX_SECTORS>>(PAGE_SHIFT-9))*2)
+- readahead = data_disks * (MAX_SECTORS>>(PAGE_SHIFT-9))*2;
+- } else {
+- // (no multipath branch - it uses the default setting)
+- if (mddev->level == -3)
+- readahead = 0;
+- }
+-
+- printk(KERN_INFO "md%d: max total readahead window set to %ldk\n",
+- mdidx(mddev), readahead*(PAGE_SIZE/1024));
+-
+- printk(KERN_INFO
+- "md%d: %d data-disks, max readahead per data-disk: %ldk\n",
+- mdidx(mddev), data_disks, readahead/data_disks*(PAGE_SIZE/1024));
+- return 0;
+- abort:
+- return 1;
+- }
+-
+ static struct gendisk *md_probe(dev_t dev, int *part, void *data)
+ {
+ static DECLARE_MUTEX(disks_sem);
+--- 1436,1441 ----
+ return 1;
+ }
+
+ static struct gendisk *md_probe(dev_t dev, int *part, void *data)
+ {
+ static DECLARE_MUTEX(disks_sem);
+***************
+*** 1664,1672 ****
+ }
+ }
+
+- if (device_size_calculation(mddev))
+- return -EINVAL;
+-
+ /*
+ * Drop all container device buffers, from now on
+ * the only valid external interface is through the md
+--- 1571,1576 ----
+ }
+ }
+
+ /*
+ * Drop all container device buffers, from now on
+ * the only valid external interface is through the md
diff --git a/tests/linux/md/rediff b/tests/linux/md/rediff
new file mode 100644
index 0000000..fc27949
--- /dev/null
+++ b/tests/linux/md/rediff
@@ -0,0 +1,101 @@
+@@ -1453,90 +1436,6 @@
+ return 1;
+ }
+
+-#undef OLD_LEVEL
+-
+-static int device_size_calculation(mddev_t * mddev)
+-{
+- int data_disks = 0;
+- unsigned int readahead;
+- struct list_head *tmp;
+- mdk_rdev_t *rdev;
+-
+- /*
+- * Do device size calculation. Bail out if too small.
+- * (we have to do this after having validated chunk_size,
+- * because device size has to be modulo chunk_size)
+- */
+-
+- ITERATE_RDEV(mddev,rdev,tmp) {
+- if (rdev->faulty)
+- continue;
+- if (rdev->size < mddev->chunk_size / 1024) {
+- printk(KERN_WARNING
+- "md: Dev %s smaller than chunk_size:"
+- " %lluk < %dk\n",
+- bdev_partition_name(rdev->bdev),
+- (unsigned long long)rdev->size,
+- mddev->chunk_size / 1024);
+- return -EINVAL;
+- }
+- }
+-
+- switch (mddev->level) {
+- case LEVEL_MULTIPATH:
+- data_disks = 1;
+- break;
+- case -3:
+- data_disks = 1;
+- break;
+- case -2:
+- data_disks = 1;
+- break;
+- case LEVEL_LINEAR:
+- zoned_raid_size(mddev);
+- data_disks = 1;
+- break;
+- case 0:
+- zoned_raid_size(mddev);
+- data_disks = mddev->raid_disks;
+- break;
+- case 1:
+- data_disks = 1;
+- break;
+- case 4:
+- case 5:
+- data_disks = mddev->raid_disks-1;
+- break;
+- default:
+- printk(KERN_ERR "md: md%d: unsupported raid level %d\n",
+- mdidx(mddev), mddev->level);
+- goto abort;
+- }
+- if (!md_size[mdidx(mddev)])
+- md_size[mdidx(mddev)] = mddev->size * data_disks;
+-
+- readahead = (VM_MAX_READAHEAD * 1024) / PAGE_SIZE;
+- if (!mddev->level || (mddev->level == 4) || (mddev->level == 5)) {
+- readahead = (mddev->chunk_size>>PAGE_SHIFT) * 4 * data_disks;
+- if (readahead < data_disks * (MAX_SECTORS>>(PAGE_SHIFT-9))*2)
+- readahead = data_disks * (MAX_SECTORS>>(PAGE_SHIFT-9))*2;
+- } else {
+- // (no multipath branch - it uses the default setting)
+- if (mddev->level == -3)
+- readahead = 0;
+- }
+-
+- printk(KERN_INFO "md%d: max total readahead window set to %ldk\n",
+- mdidx(mddev), readahead*(PAGE_SIZE/1024));
+-
+- printk(KERN_INFO
+- "md%d: %d data-disks, max readahead per data-disk: %ldk\n",
+- mdidx(mddev), data_disks, readahead/data_disks*(PAGE_SIZE/1024));
+- return 0;
+-abort:
+- return 1;
+-}
+-
+ static struct gendisk *md_probe(dev_t dev, int *part, void *data)
+ {
+ static DECLARE_MUTEX(disks_sem);
+@@ -1664,9 +1571,6 @@
+ }
+ }
+
+- if (device_size_calculation(mddev))
+- return -EINVAL;
+-
+ /*
+ * Drop all container device buffers, from now on
+ * the only valid external interface is through the md
diff --git a/tests/linux/md/replace b/tests/linux/md/replace
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/tests/linux/md/replace
diff --git a/tests/linux/md/wmerge b/tests/linux/md/wmerge
new file mode 100644
index 0000000..4238601
--- /dev/null
+++ b/tests/linux/md/wmerge
@@ -0,0 +1,3589 @@
+/*
+ md.c : Multiple Devices driver for Linux
+ Copyright (C) 1998, 1999, 2000 Ingo Molnar
+
+ completely rewritten, based on the MD driver code from Marc Zyngier
+
+ Changes:
+
+ - RAID-1/RAID-5 extensions by Miguel de Icaza, Gadi Oxman, Ingo Molnar
+ - boot support for linear and striped mode by Harald Hoyer <HarryH@Royal.Net>
+ - kerneld support by Boris Tobotras <boris@xtalk.msk.su>
+ - kmod support by: Cyrus Durgin
+ - RAID0 bugfixes: Mark Anthony Lisher <markal@iname.com>
+ - Devfs support by Richard Gooch <rgooch@atnf.csiro.au>
+
+ - lots of fixes and improvements to the RAID1/RAID5 and generic
+ RAID code (such as request based resynchronization):
+
+ Neil Brown <neilb@cse.unsw.edu.au>.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2, or (at your option)
+ any later version.
+
+ You should have received a copy of the GNU General Public License
+ (for example /usr/src/linux/COPYING); if not, write to the Free
+ Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+*/
+
+#include <linux/module.h>
+#include <linux/config.h>
+#include <linux/linkage.h>
+#include <linux/raid/md.h>
+#include <linux/sysctl.h>
+#include <linux/bio.h>
+#include <linux/devfs_fs_kernel.h>
+#include <linux/buffer_head.h> /* for invalidate_bdev */
+#include <linux/suspend.h>
+
+#include <linux/init.h>
+
+#ifdef CONFIG_KMOD
+#include <linux/kmod.h>
+#endif
+
+#define __KERNEL_SYSCALLS__
+#include <linux/unistd.h>
+
+#include <asm/unaligned.h>
+
+#define MAJOR_NR MD_MAJOR
+#define MD_DRIVER
+#define DEVICE_NR(device) (minor(device))
+
+#include <linux/blk.h>
+
+#define DEBUG 0
+#define dprintk(x...) ((void)(DEBUG && printk(x)))
+
+
+#ifndef MODULE
+static void autostart_arrays (void);
+#endif
+
+static mdk_personality_t *pers[MAX_PERSONALITY];
+static spinlock_t pers_lock = SPIN_LOCK_UNLOCKED;
+
+/*
+ * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit'
+ * is 1000 KB/sec, so the extra system load does not show up that much.
+ * Increase it if you want to have more _guaranteed_ speed. Note that
+ * the RAID driver will use the maximum available bandwidth if the IO
+ * subsystem is idle. There is also an 'absolute maximum' reconstruction
+ * speed limit - in case reconstruction slows down your system despite
+ * idle IO detection.
+ *
+ * you can change it via /proc/sys/dev/raid/speed_limit_min and _max.
+ */
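+
+/*
+ * Example (illustrative values): raising the guaranteed floor to 5000 KB/sec
+ * and capping reconstruction at 100000 KB/sec from user space:
+ *
+ *	echo 5000   > /proc/sys/dev/raid/speed_limit_min
+ *	echo 100000 > /proc/sys/dev/raid/speed_limit_max
+ */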
+
+static int sysctl_speed_limit_min = 1000;
+static int sysctl_speed_limit_max = 200000;
+
+static struct ctl_table_header *raid_table_header;
+
+static ctl_table raid_table[] = {
+ {
+ .ctl_name = DEV_RAID_SPEED_LIMIT_MIN,
+ .procname = "speed_limit_min",
+ .data = &sysctl_speed_limit_min,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+ {
+ .ctl_name = DEV_RAID_SPEED_LIMIT_MAX,
+ .procname = "speed_limit_max",
+ .data = &sysctl_speed_limit_max,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+ { .ctl_name = 0 }
+};
+
+static ctl_table raid_dir_table[] = {
+ {
+ .ctl_name = DEV_RAID,
+ .procname = "raid",
+ .maxlen = 0,
+ .mode = 0555,
+ .child = raid_table,
+ },
+ { .ctl_name = 0 }
+};
+
+static ctl_table raid_root_table[] = {
+ {
+ .ctl_name = CTL_DEV,
+ .procname = "dev",
+ .maxlen = 0,
+ .mode = 0555,
+ .child = raid_dir_table,
+ },
+ { .ctl_name = 0 }
+};
+
+static struct block_device_operations md_fops;
+
+static struct gendisk *disks[MAX_MD_DEVS];
+
+/*
+ * Enables iteration over all existing md arrays.
+ * all_mddevs_lock protects this list as well as mddev_map.
+ */
+static LIST_HEAD(all_mddevs);
+static spinlock_t all_mddevs_lock = SPIN_LOCK_UNLOCKED;
+
+
+/*
+ * iterates through all used mddevs in the system.
+ * We take care to grab the all_mddevs_lock whenever navigating
+ * the list, and to always hold a refcount when unlocked.
+ * Any code which breaks out of this loop while still owning
+ * a reference to the current mddev must mddev_put it.
+ */
+#define ITERATE_MDDEV(mddev,tmp) \
+ \
+ for (({ spin_lock(&all_mddevs_lock); \
+ tmp = all_mddevs.next; \
+ mddev = NULL;}); \
+ ({ if (tmp != &all_mddevs) \
+ mddev_get(list_entry(tmp, mddev_t, all_mddevs));\
+ spin_unlock(&all_mddevs_lock); \
+ if (mddev) mddev_put(mddev); \
+ mddev = list_entry(tmp, mddev_t, all_mddevs); \
+ tmp != &all_mddevs;}); \
+ ({ spin_lock(&all_mddevs_lock); \
+ tmp = tmp->next;}) \
+ )
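+
+/*
+ * Typical use, as a sketch (the reboot notifier elsewhere in this driver
+ * follows this pattern):
+ *
+ *	struct list_head *tmp;
+ *	mddev_t *mddev;
+ *
+ *	ITERATE_MDDEV(mddev,tmp)
+ *		if (mddev_trylock(mddev)==0)
+ *			do_md_stop(mddev, 1);
+ */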
+
+static mddev_t *mddev_map[MAX_MD_DEVS];
+
+static int md_fail_request (request_queue_t *q, struct bio *bio)
+{
+ bio_io_error(bio, bio->bi_size);
+ return 0;
+}
+
+static inline mddev_t *mddev_get(mddev_t *mddev)
+{
+ atomic_inc(&mddev->active);
+ return mddev;
+}
+
+static void mddev_put(mddev_t *mddev)
+{
+ if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock))
+ return;
+ if (!mddev->raid_disks && list_empty(&mddev->disks)) {
+ list_del(&mddev->all_mddevs);
+ mddev_map[mdidx(mddev)] = NULL;
+ kfree(mddev);
+ MOD_DEC_USE_COUNT;
+ }
+ spin_unlock(&all_mddevs_lock);
+}
+
+static mddev_t * mddev_find(int unit)
+{
+ mddev_t *mddev, *new = NULL;
+
+ retry:
+ spin_lock(&all_mddevs_lock);
+ if (mddev_map[unit]) {
+ mddev = mddev_get(mddev_map[unit]);
+ spin_unlock(&all_mddevs_lock);
+ if (new)
+ kfree(new);
+ return mddev;
+ }
+ if (new) {
+ mddev_map[unit] = new;
+ list_add(&new->all_mddevs, &all_mddevs);
+ spin_unlock(&all_mddevs_lock);
+ MOD_INC_USE_COUNT;
+ return new;
+ }
+ spin_unlock(&all_mddevs_lock);
+
+ new = (mddev_t *) kmalloc(sizeof(*new), GFP_KERNEL);
+ if (!new)
+ return NULL;
+
+ memset(new, 0, sizeof(*new));
+
+ new->__minor = unit;
+ init_MUTEX(&new->reconfig_sem);
+ INIT_LIST_HEAD(&new->disks);
+ INIT_LIST_HEAD(&new->all_mddevs);
+ init_timer(&new->safemode_timer);
+ atomic_set(&new->active, 1);
+ blk_queue_make_request(&new->queue, md_fail_request);
+
+ goto retry;
+}
+
+static inline int mddev_lock(mddev_t * mddev)
+{
+ return down_interruptible(&mddev->reconfig_sem);
+}
+
+static inline void mddev_lock_uninterruptible(mddev_t * mddev)
+{
+ down(&mddev->reconfig_sem);
+}
+
+static inline int mddev_trylock(mddev_t * mddev)
+{
+ return down_trylock(&mddev->reconfig_sem);
+}
+
+static inline void mddev_unlock(mddev_t * mddev)
+{
+ up(&mddev->reconfig_sem);
+}
+
+mdk_rdev_t * find_rdev_nr(mddev_t *mddev, int nr)
+{
+ mdk_rdev_t * rdev;
+ struct list_head *tmp;
+
+ ITERATE_RDEV(mddev,rdev,tmp) {
+ if (rdev->desc_nr == nr)
+ return rdev;
+ }
+ return NULL;
+}
+
+static mdk_rdev_t * find_rdev(mddev_t * mddev, dev_t dev)
+{
+ struct list_head *tmp;
+ mdk_rdev_t *rdev;
+
+ ITERATE_RDEV(mddev,rdev,tmp) {
+ if (rdev->bdev->bd_dev == dev)
+ return rdev;
+ }
+ return NULL;
+}
+
+inline static sector_t calc_dev_sboffset(struct block_device *bdev)
+{
+ sector_t size = bdev->bd_inode->i_size >> BLOCK_SIZE_BITS;
+ return MD_NEW_SIZE_BLOCKS(size);
+}
+
+static sector_t calc_dev_size(mdk_rdev_t *rdev, unsigned chunk_size)
+{
+ sector_t size;
+
+ size = rdev->sb_offset;
+
+ if (chunk_size)
+ size &= ~((sector_t)chunk_size/1024 - 1);
+ return size;
+}
+
+static int alloc_disk_sb(mdk_rdev_t * rdev)
+{
+ if (rdev->sb_page)
+ MD_BUG();
+
+ rdev->sb_page = alloc_page(GFP_KERNEL);
+ if (!rdev->sb_page) {
+ printk(KERN_ALERT "md: out of memory.\n");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static void free_disk_sb(mdk_rdev_t * rdev)
+{
+ if (rdev->sb_page) {
+ page_cache_release(rdev->sb_page);
+ rdev->sb_loaded = 0;
+ rdev->sb_page = NULL;
+ rdev->sb_offset = 0;
+ rdev->size = 0;
+ }
+}
+
+
+static int bi_complete(struct bio *bio, unsigned int bytes_done, int error)
+{
+ if (bio->bi_size)
+ return 1;
+
+ complete((struct completion*)bio->bi_private);
+ return 0;
+}
+
+static int sync_page_io(struct block_device *bdev, sector_t sector, int size,
+ struct page *page, int rw)
+{
+ struct bio bio;
+ struct bio_vec vec;
+ struct completion event;
+
+ bio_init(&bio);
+ bio.bi_io_vec = &vec;
+ vec.bv_page = page;
+ vec.bv_len = size;
+ vec.bv_offset = 0;
+ bio.bi_vcnt = 1;
+ bio.bi_idx = 0;
+ bio.bi_size = size;
+ bio.bi_bdev = bdev;
+ bio.bi_sector = sector;
+ init_completion(&event);
+ bio.bi_private = &event;
+ bio.bi_end_io = bi_complete;
+ submit_bio(rw, &bio);
+ blk_run_queues();
+ wait_for_completion(&event);
+
+ return test_bit(BIO_UPTODATE, &bio.bi_flags);
+}
+
+static int read_disk_sb(mdk_rdev_t * rdev)
+{
+
+ if (!rdev->sb_page) {
+ MD_BUG();
+ return -EINVAL;
+ }
+ if (rdev->sb_loaded)
+ return 0;
+
+
+ if (!sync_page_io(rdev->bdev, rdev->sb_offset<<1, MD_SB_BYTES, rdev->sb_page, READ))
+ goto fail;
+ rdev->sb_loaded = 1;
+ return 0;
+
+fail:
+ printk(KERN_ERR "md: disabled device %s, could not read superblock.\n",
+ bdev_partition_name(rdev->bdev));
+ return -EINVAL;
+}
+
+static int uuid_equal(mdp_super_t *sb1, mdp_super_t *sb2)
+{
+ if ( (sb1->set_uuid0 == sb2->set_uuid0) &&
+ (sb1->set_uuid1 == sb2->set_uuid1) &&
+ (sb1->set_uuid2 == sb2->set_uuid2) &&
+ (sb1->set_uuid3 == sb2->set_uuid3))
+
+ return 1;
+
+ return 0;
+}
+
+
+static int sb_equal(mdp_super_t *sb1, mdp_super_t *sb2)
+{
+ int ret;
+ mdp_super_t *tmp1, *tmp2;
+
+ tmp1 = kmalloc(sizeof(*tmp1),GFP_KERNEL);
+ tmp2 = kmalloc(sizeof(*tmp2),GFP_KERNEL);
+
+ if (!tmp1 || !tmp2) {
+ ret = 0;
+ printk(KERN_INFO "md.c: sb1 is not equal to sb2!\n");
+ goto abort;
+ }
+
+ *tmp1 = *sb1;
+ *tmp2 = *sb2;
+
+ /*
+ * nr_disks is not constant
+ */
+ tmp1->nr_disks = 0;
+ tmp2->nr_disks = 0;
+
+ if (memcmp(tmp1, tmp2, MD_SB_GENERIC_CONSTANT_WORDS * 4))
+ ret = 0;
+ else
+ ret = 1;
+
+abort:
+ if (tmp1)
+ kfree(tmp1);
+ if (tmp2)
+ kfree(tmp2);
+
+ return ret;
+}
+
+static unsigned int calc_sb_csum(mdp_super_t * sb)
+{
+ unsigned int disk_csum, csum;
+
+ disk_csum = sb->sb_csum;
+ sb->sb_csum = 0;
+ csum = csum_partial((void *)sb, MD_SB_BYTES, 0);
+ sb->sb_csum = disk_csum;
+ return csum;
+}
+
+/*
+ * Handle superblock details.
+ * We want to be able to handle multiple superblock formats
+ * so we have a common interface to them all, and an array of
+ * different handlers.
+ * We rely on user-space to write the initial superblock, and support
+ * reading and updating of superblocks.
+ * Interface methods are:
+ * int load_super(mdk_rdev_t *dev, mdk_rdev_t *refdev, int minor_version)
+ * loads and validates a superblock on dev.
+ * if refdev != NULL, compare superblocks on both devices
+ * Return:
+ * 0 - dev has a superblock that is compatible with refdev
+ * 1 - dev has a superblock that is compatible and newer than refdev
+ * so dev should be used as the refdev in future
+ * -EINVAL superblock incompatible or invalid
+ * -othererror e.g. -EIO
+ *
+ * int validate_super(mddev_t *mddev, mdk_rdev_t *dev)
+ * Verify that dev is acceptable into mddev.
+ * The first time, mddev->raid_disks will be 0, and data from
+ * dev should be merged in. Subsequent calls check that dev
+ * is new enough. Return 0 or -EINVAL
+ *
+ * void sync_super(mddev_t *mddev, mdk_rdev_t *dev)
+ * Update the superblock for rdev with data in mddev
+ * This does not write to disc.
+ *
+ */
+
+struct super_type {
+ char *name;
+ struct module *owner;
+ int (*load_super)(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version);
+ int (*validate_super)(mddev_t *mddev, mdk_rdev_t *rdev);
+ void (*sync_super)(mddev_t *mddev, mdk_rdev_t *rdev);
+};
+
+/*
+ * load_super for 0.90.0
+ */
+static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
+{
+ mdp_super_t *sb;
+ int ret;
+ sector_t sb_offset;
+
+ /*
+ * Calculate the position of the superblock,
+ * it's at the end of the disk.
+ *
+ * It also happens to be a multiple of 4Kb.
+ */
+ sb_offset = calc_dev_sboffset(rdev->bdev);
+ rdev->sb_offset = sb_offset;
+
+ ret = read_disk_sb(rdev);
+ if (ret) return ret;
+
+ ret = -EINVAL;
+
+ sb = (mdp_super_t*)page_address(rdev->sb_page);
+
+ if (sb->md_magic != MD_SB_MAGIC) {
+ printk(KERN_ERR "md: invalid raid superblock magic on %s\n",
+ bdev_partition_name(rdev->bdev));
+ goto abort;
+ }
+
+ if (sb->major_version != 0 ||
+ sb->minor_version != 90) {
+ printk(KERN_WARNING "Bad version number %d.%d on %s\n",
+ sb->major_version, sb->minor_version,
+ bdev_partition_name(rdev->bdev));
+ goto abort;
+ }
+
+ if (sb->md_minor >= MAX_MD_DEVS) {
+ printk(KERN_ERR "md: %s: invalid raid minor (%x)\n",
+ bdev_partition_name(rdev->bdev), sb->md_minor);
+ goto abort;
+ }
+ if (sb->raid_disks <= 0)
+ goto abort;
+
+ if (calc_sb_csum(sb) != sb->sb_csum) {
+ printk(KERN_WARNING "md: invalid superblock checksum on %s\n",
+ bdev_partition_name(rdev->bdev));
+ goto abort;
+ }
+
+ rdev->preferred_minor = sb->md_minor;
+ rdev->data_offset = 0;
+
+ if (sb->level == MULTIPATH)
+ rdev->desc_nr = -1;
+ else
+ rdev->desc_nr = sb->this_disk.number;
+
+ if (refdev == 0)
+ ret = 1;
+ else {
+ __u64 ev1, ev2;
+ mdp_super_t *refsb = (mdp_super_t*)page_address(refdev->sb_page);
+ if (!uuid_equal(refsb, sb)) {
+ printk(KERN_WARNING "md: %s has different UUID to %s\n",
+ bdev_partition_name(rdev->bdev),
+ bdev_partition_name(refdev->bdev));
+ goto abort;
+ }
+ if (!sb_equal(refsb, sb)) {
+ printk(KERN_WARNING "md: %s has same UUID"
+ " but different superblock to %s\n",
+ bdev_partition_name(rdev->bdev),
+ bdev_partition_name(refdev->bdev));
+ goto abort;
+ }
+ ev1 = md_event(sb);
+ ev2 = md_event(refsb);
+ if (ev1 > ev2)
+ ret = 1;
+ else
+ ret = 0;
+ }
+ rdev->size = calc_dev_size(rdev, sb->chunk_size);
+
+ abort:
+ return ret;
+}
+
+/*
+ * validate_super for 0.90.0
+ */
+static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev)
+{
+ mdp_disk_t *desc;
+ mdp_super_t *sb = (mdp_super_t *)page_address(rdev->sb_page);
+
+ if (mddev->raid_disks == 0) {
+ mddev->major_version = 0;
+ mddev->minor_version = sb->minor_version;
+ mddev->patch_version = sb->patch_version;
+ mddev->persistent = ! sb->not_persistent;
+ mddev->chunk_size = sb->chunk_size;
+ mddev->ctime = sb->ctime;
+ mddev->utime = sb->utime;
+ mddev->level = sb->level;
+ mddev->layout = sb->layout;
+ mddev->raid_disks = sb->raid_disks;
+ mddev->size = sb->size;
+ mddev->events = md_event(sb);
+
+ if (sb->state & (1<<MD_SB_CLEAN))
+ mddev->recovery_cp = MaxSector;
+ else {
+ if (sb->events_hi == sb->cp_events_hi &&
+ sb->events_lo == sb->cp_events_lo) {
+ mddev->recovery_cp = sb->recovery_cp;
+ } else
+ mddev->recovery_cp = 0;
+ }
+
+ memcpy(mddev->uuid+0, &sb->set_uuid0, 4);
+ memcpy(mddev->uuid+4, &sb->set_uuid1, 4);
+ memcpy(mddev->uuid+8, &sb->set_uuid2, 4);
+ memcpy(mddev->uuid+12,&sb->set_uuid3, 4);
+
+ mddev->max_disks = MD_SB_DISKS;
+ } else {
+ __u64 ev1;
+ ev1 = md_event(sb);
+ ++ev1;
+ if (ev1 < mddev->events)
+ return -EINVAL;
+ }
+ if (mddev->level != LEVEL_MULTIPATH) {
+ rdev->raid_disk = -1;
+ rdev->in_sync = rdev->faulty = 0;
+ desc = sb->disks + rdev->desc_nr;
+
+ if (desc->state & (1<<MD_DISK_FAULTY))
+ rdev->faulty = 1;
+ else if (desc->state & (1<<MD_DISK_SYNC) &&
+ desc->raid_disk < mddev->raid_disks) {
+ rdev->in_sync = 1;
+ rdev->raid_disk = desc->raid_disk;
+ }
+ }
+ return 0;
+}
+
+/*
+ * sync_super for 0.90.0
+ */
+static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev)
+{
+ mdp_super_t *sb;
+ struct list_head *tmp;
+ mdk_rdev_t *rdev2;
+ int next_spare = mddev->raid_disks;
+
+ /* make rdev->sb match mddev data..
+ *
+ * 1/ zero out disks
+ * 2/ Add info for each disk, keeping track of highest desc_nr
+ * 3/ any empty disks < highest become removed
+ *
+ * disks[0] gets initialised to REMOVED because
+ * we cannot be sure from other fields if it has
+ * been initialised or not.
+ */
+ int highest = 0;
+ int i;
+ int active=0, working=0,failed=0,spare=0,nr_disks=0;
+
+ sb = (mdp_super_t*)page_address(rdev->sb_page);
+
+ memset(sb, 0, sizeof(*sb));
+
+ sb->md_magic = MD_SB_MAGIC;
+ sb->major_version = mddev->major_version;
+ sb->minor_version = mddev->minor_version;
+ sb->patch_version = mddev->patch_version;
+ sb->gvalid_words = 0; /* ignored */
+ memcpy(&sb->set_uuid0, mddev->uuid+0, 4);
+ memcpy(&sb->set_uuid1, mddev->uuid+4, 4);
+ memcpy(&sb->set_uuid2, mddev->uuid+8, 4);
+ memcpy(&sb->set_uuid3, mddev->uuid+12,4);
+
+ sb->ctime = mddev->ctime;
+ sb->level = mddev->level;
+ sb->size = mddev->size;
+ sb->raid_disks = mddev->raid_disks;
+ sb->md_minor = mddev->__minor;
+ sb->not_persistent = !mddev->persistent;
+ sb->utime = mddev->utime;
+ sb->state = 0;
+ sb->events_hi = (mddev->events>>32);
+ sb->events_lo = (u32)mddev->events;
+
+ if (mddev->in_sync)
+ {
+ sb->recovery_cp = mddev->recovery_cp;
+ sb->cp_events_hi = (mddev->events>>32);
+ sb->cp_events_lo = (u32)mddev->events;
+ if (mddev->recovery_cp == MaxSector)
+ sb->state = (1<< MD_SB_CLEAN);
+ } else
+ sb->recovery_cp = 0;
+
+ sb->layout = mddev->layout;
+ sb->chunk_size = mddev->chunk_size;
+
+ sb->disks[0].state = (1<<MD_DISK_REMOVED);
+ ITERATE_RDEV(mddev,rdev2,tmp) {
+ mdp_disk_t *d;
+ if (rdev2->raid_disk >= 0 && rdev2->in_sync && !rdev2->faulty)
+ rdev2->desc_nr = rdev2->raid_disk;
+ else
+ rdev2->desc_nr = next_spare++;
+ d = &sb->disks[rdev2->desc_nr];
+ nr_disks++;
+ d->number = rdev2->desc_nr;
+ d->major = MAJOR(rdev2->bdev->bd_dev);
+ d->minor = MINOR(rdev2->bdev->bd_dev);
+ if (rdev2->raid_disk >= 0 && rdev2->in_sync && !rdev2->faulty)
+ d->raid_disk = rdev2->raid_disk;
+ else
+ d->raid_disk = rdev2->desc_nr; /* compatibility */
+ if (rdev2->faulty) {
+ d->state = (1<<MD_DISK_FAULTY);
+ failed++;
+ } else if (rdev2->in_sync) {
+ d->state = (1<<MD_DISK_ACTIVE);
+ d->state |= (1<<MD_DISK_SYNC);
+ active++;
+ working++;
+ } else {
+ d->state = 0;
+ spare++;
+ working++;
+ }
+ if (rdev2->desc_nr > highest)
+ highest = rdev2->desc_nr;
+ }
+
+ /* now set the "removed" bit on any non-trailing holes */
+ for (i=0; i<highest; i++) {
+ mdp_disk_t *d = &sb->disks[i];
+ if (d->state == 0 && d->number == 0) {
+ d->number = i;
+ d->raid_disk = i;
+ d->state = (1<<MD_DISK_REMOVED);
+ }
+ }
+ sb->nr_disks = nr_disks;
+ sb->active_disks = active;
+ sb->working_disks = working;
+ sb->failed_disks = failed;
+ sb->spare_disks = spare;
+
+ sb->this_disk = sb->disks[rdev->desc_nr];
+ sb->sb_csum = calc_sb_csum(sb);
+}
+
+/*
+ * version 1 superblock
+ */
+
+static unsigned int calc_sb_1_csum(struct mdp_superblock_1 * sb)
+{
+ unsigned int disk_csum, csum;
+ int size = 256 + sb->max_dev*2;
+
+ disk_csum = sb->sb_csum;
+ sb->sb_csum = 0;
+ csum = csum_partial((void *)sb, size, 0);
+ sb->sb_csum = disk_csum;
+ return csum;
+}
+
+static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
+{
+ struct mdp_superblock_1 *sb;
+ int ret;
+ sector_t sb_offset;
+
+ /*
+ * Calculate the position of the superblock.
+ * It is always aligned to a 4K boundary and
+ * depending on minor_version, it can be:
+ * 0: At least 8K, but less than 12K, from end of device
+ * 1: At start of device
+ * 2: 4K from start of device.
+ */
+ switch(minor_version) {
+ case 0:
+ sb_offset = rdev->bdev->bd_inode->i_size >> 9;
+ sb_offset -= 8*2;
+ sb_offset &= ~(4*2);
+ /* convert from sectors to K */
+ sb_offset /= 2;
+ break;
+ case 1:
+ sb_offset = 0;
+ break;
+ case 2:
+ sb_offset = 4;
+ break;
+ default:
+ return -EINVAL;
+ }
+ rdev->sb_offset = sb_offset;
+
+ ret = read_disk_sb(rdev);
+ if (ret) return ret;
+
+
+ sb = (struct mdp_superblock_1*)page_address(rdev->sb_page);
+
+ if (sb->magic != cpu_to_le32(MD_SB_MAGIC) ||
+ sb->major_version != cpu_to_le32(1) ||
+ le32_to_cpu(sb->max_dev) > (4096-256)/2 ||
+ le64_to_cpu(sb->super_offset) != (rdev->sb_offset<<1) ||
+ sb->feature_map != 0)
+ return -EINVAL;
+
+ if (calc_sb_1_csum(sb) != sb->sb_csum) {
+ printk("md: invalid superblock checksum on %s\n",
+ bdev_partition_name(rdev->bdev));
+ return -EINVAL;
+ }
+ rdev->preferred_minor = 0xffff;
+ rdev->data_offset = le64_to_cpu(sb->data_offset);
+
+ if (refdev == 0)
+ return 1;
+ else {
+ __u64 ev1, ev2;
+ struct mdp_superblock_1 *refsb =
+ (struct mdp_superblock_1*)page_address(refdev->sb_page);
+
+ if (memcmp(sb->set_uuid, refsb->set_uuid, 16) != 0 ||
+ sb->level != refsb->level ||
+ sb->layout != refsb->layout ||
+ sb->chunksize != refsb->chunksize) {
+ printk(KERN_WARNING "md: %s has strangely different"
+ " superblock to %s\n",
+ bdev_partition_name(rdev->bdev),
+ bdev_partition_name(refdev->bdev));
+ return -EINVAL;
+ }
+ ev1 = le64_to_cpu(sb->events);
+ ev2 = le64_to_cpu(refsb->events);
+
+ if (ev1 > ev2)
+ return 1;
+ }
+ if (minor_version)
+ rdev->size = ((rdev->bdev->bd_inode->i_size>>9) - le64_to_cpu(sb->data_offset)) / 2;
+ else
+ rdev->size = rdev->sb_offset;
+ if (rdev->size < le64_to_cpu(sb->data_size)/2)
+ return -EINVAL;
+ rdev->size = le64_to_cpu(sb->data_size)/2;
+ if (le32_to_cpu(sb->chunksize))
+ rdev->size &= ~((sector_t)le32_to_cpu(sb->chunksize)/2 - 1);
+ return 0;
+}
+
+static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev)
+{
+ struct mdp_superblock_1 *sb = (struct mdp_superblock_1*)page_address(rdev->sb_page);
+
+ if (mddev->raid_disks == 0) {
+ mddev->major_version = 1;
+ mddev->minor_version = 0;
+ mddev->patch_version = 0;
+ mddev->persistent = 1;
+ mddev->chunk_size = le32_to_cpu(sb->chunksize) << 9;
+ mddev->ctime = le64_to_cpu(sb->ctime) & ((1ULL << 32)-1);
+ mddev->utime = le64_to_cpu(sb->utime) & ((1ULL << 32)-1);
+ mddev->level = le32_to_cpu(sb->level);
+ mddev->layout = le32_to_cpu(sb->layout);
+ mddev->raid_disks = le32_to_cpu(sb->raid_disks);
+ mddev->size = (u32)le64_to_cpu(sb->size);
+ mddev->events = le64_to_cpu(sb->events);
+
+ mddev->recovery_cp = le64_to_cpu(sb->resync_offset);
+ memcpy(mddev->uuid, sb->set_uuid, 16);
+
+ mddev->max_disks = (4096-256)/2;
+ } else {
+ __u64 ev1;
+ ev1 = le64_to_cpu(sb->events);
+ ++ev1;
+ if (ev1 < mddev->events)
+ return -EINVAL;
+ }
+
+ if (mddev->level != LEVEL_MULTIPATH) {
+ int role;
+ rdev->desc_nr = le32_to_cpu(sb->dev_number);
+ role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]);
+ switch(role) {
+ case 0xffff: /* spare */
+ rdev->in_sync = 0;
+ rdev->faulty = 0;
+ rdev->raid_disk = -1;
+ break;
+ case 0xfffe: /* faulty */
+ rdev->in_sync = 0;
+ rdev->faulty = 1;
+ rdev->raid_disk = -1;
+ break;
+ default:
+ rdev->in_sync = 1;
+ rdev->faulty = 0;
+ rdev->raid_disk = role;
+ break;
+ }
+ }
+ return 0;
+}
+
+static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev)
+{
+ struct mdp_superblock_1 *sb;
+ struct list_head *tmp;
+ mdk_rdev_t *rdev2;
+ int max_dev, i;
+ /* make rdev->sb match mddev and rdev data. */
+
+ sb = (struct mdp_superblock_1*)page_address(rdev->sb_page);
+
+ sb->feature_map = 0;
+ sb->pad0 = 0;
+ memset(sb->pad1, 0, sizeof(sb->pad1));
+ memset(sb->pad2, 0, sizeof(sb->pad2));
+ memset(sb->pad3, 0, sizeof(sb->pad3));
+
+ sb->utime = cpu_to_le64((__u64)mddev->utime);
+ sb->events = cpu_to_le64(mddev->events);
+ if (mddev->in_sync)
+ sb->resync_offset = cpu_to_le64(mddev->recovery_cp);
+ else
+ sb->resync_offset = cpu_to_le64(0);
+
+ max_dev = 0;
+ ITERATE_RDEV(mddev,rdev2,tmp)
+ if (rdev2->desc_nr > max_dev)
+ max_dev = rdev2->desc_nr;
+
+ sb->max_dev = max_dev;
+ for (i=0; i<max_dev;i++)
+ sb->dev_roles[i] = cpu_to_le16(0xfffe);
+
+ ITERATE_RDEV(mddev,rdev2,tmp) {
+ i = rdev2->desc_nr;
+ if (rdev2->faulty)
+ sb->dev_roles[i] = cpu_to_le16(0xfffe);
+ else if (rdev2->in_sync)
+ sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk);
+ else
+ sb->dev_roles[i] = cpu_to_le16(0xffff);
+ }
+
+ sb->recovery_offset = cpu_to_le64(0); /* not supported yet */
+}
+
+
+struct super_type super_types[] = {
+ [0] = {
+ .name = "0.90.0",
+ .owner = THIS_MODULE,
+ .load_super = super_90_load,
+ .validate_super = super_90_validate,
+ .sync_super = super_90_sync,
+ },
+ [1] = {
+ .name = "md-1",
+ .owner = THIS_MODULE,
+ .load_super = super_1_load,
+ .validate_super = super_1_validate,
+ .sync_super = super_1_sync,
+ },
+};
+
+static mdk_rdev_t * match_dev_unit(mddev_t *mddev, mdk_rdev_t *dev)
+{
+ struct list_head *tmp;
+ mdk_rdev_t *rdev;
+
+ ITERATE_RDEV(mddev,rdev,tmp)
+ if (rdev->bdev->bd_contains == dev->bdev->bd_contains)
+ return rdev;
+
+ return NULL;
+}
+
+static int match_mddev_units(mddev_t *mddev1, mddev_t *mddev2)
+{
+ struct list_head *tmp;
+ mdk_rdev_t *rdev;
+
+ ITERATE_RDEV(mddev1,rdev,tmp)
+ if (match_dev_unit(mddev2, rdev))
+ return 1;
+
+ return 0;
+}
+
+static LIST_HEAD(pending_raid_disks);
+
+static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev)
+{
+ mdk_rdev_t *same_pdev;
+
+ if (rdev->mddev) {
+ MD_BUG();
+ return -EINVAL;
+ }
+ same_pdev = match_dev_unit(mddev, rdev);
+ if (same_pdev)
+ printk(KERN_WARNING
+ "md%d: WARNING: %s appears to be on the same physical"
+ " disk as %s. True\n protection against single-disk"
+ " failure might be compromised.\n",
+ mdidx(mddev), bdev_partition_name(rdev->bdev),
+ bdev_partition_name(same_pdev->bdev));
+
+ /* Verify rdev->desc_nr is unique.
+ * If it is -1, assign a free number, else
+ * check number is not in use
+ */
+ if (rdev->desc_nr < 0) {
+ int choice = 0;
+ if (mddev->pers) choice = mddev->raid_disks;
+ while (find_rdev_nr(mddev, choice))
+ choice++;
+ rdev->desc_nr = choice;
+ } else {
+ if (find_rdev_nr(mddev, rdev->desc_nr))
+ return -EBUSY;
+ }
+
+ list_add(&rdev->same_set, &mddev->disks);
+ rdev->mddev = mddev;
+ printk(KERN_INFO "md: bind<%s>\n", bdev_partition_name(rdev->bdev));
+ return 0;
+}
+
+static void unbind_rdev_from_array(mdk_rdev_t * rdev)
+{
+ if (!rdev->mddev) {
+ MD_BUG();
+ return;
+ }
+ list_del_init(&rdev->same_set);
+ printk(KERN_INFO "md: unbind<%s>\n", bdev_partition_name(rdev->bdev));
+ rdev->mddev = NULL;
+}
+
+/*
+ * prevent the device from being mounted, repartitioned or
+ * otherwise reused by a RAID array (or any other kernel
+ * subsystem), by opening the device. [simply getting an
+ * inode is not enough, the SCSI module usage code needs
+ * an explicit open() on the device]
+ */
+static int lock_rdev(mdk_rdev_t *rdev, dev_t dev)
+{
+ int err = 0;
+ struct block_device *bdev;
+
+ bdev = bdget(dev);
+ if (!bdev)
+ return -ENOMEM;
+ err = blkdev_get(bdev, FMODE_READ|FMODE_WRITE, 0, BDEV_RAW);
+ if (err)
+ return err;
+ err = bd_claim(bdev, rdev);
+ if (err) {
+ blkdev_put(bdev, BDEV_RAW);
+ return err;
+ }
+ rdev->bdev = bdev;
+ return err;
+}
+
+static void unlock_rdev(mdk_rdev_t *rdev)
+{
+ struct block_device *bdev = rdev->bdev;
+ rdev->bdev = NULL;
+ if (!bdev)
+ MD_BUG();
+ bd_release(bdev);
+ blkdev_put(bdev, BDEV_RAW);
+}
+
+void md_autodetect_dev(dev_t dev);
+
+static void export_rdev(mdk_rdev_t * rdev)
+{
+ printk(KERN_INFO "md: export_rdev(%s)\n",
+ bdev_partition_name(rdev->bdev));
+ if (rdev->mddev)
+ MD_BUG();
+ free_disk_sb(rdev);
+ list_del_init(&rdev->same_set);
+#ifndef MODULE
+ md_autodetect_dev(rdev->bdev->bd_dev);
+#endif
+ unlock_rdev(rdev);
+ kfree(rdev);
+}
+
+static void kick_rdev_from_array(mdk_rdev_t * rdev)
+{
+ unbind_rdev_from_array(rdev);
+ export_rdev(rdev);
+}
+
+static void export_array(mddev_t *mddev)
+{
+ struct list_head *tmp;
+ mdk_rdev_t *rdev;
+
+ ITERATE_RDEV(mddev,rdev,tmp) {
+ if (!rdev->mddev) {
+ MD_BUG();
+ continue;
+ }
+ kick_rdev_from_array(rdev);
+ }
+ if (!list_empty(&mddev->disks))
+ MD_BUG();
+ mddev->raid_disks = 0;
+ mddev->major_version = 0;
+}
+
+static void print_desc(mdp_disk_t *desc)
+{
+ printk(" DISK<N:%d,%s(%d,%d),R:%d,S:%d>\n", desc->number,
+ partition_name(MKDEV(desc->major,desc->minor)),
+ desc->major,desc->minor,desc->raid_disk,desc->state);
+}
+
+static void print_sb(mdp_super_t *sb)
+{
+ int i;
+
+ printk(KERN_INFO
+ "md: SB: (V:%d.%d.%d) ID:<%08x.%08x.%08x.%08x> CT:%08x\n",
+ sb->major_version, sb->minor_version, sb->patch_version,
+ sb->set_uuid0, sb->set_uuid1, sb->set_uuid2, sb->set_uuid3,
+ sb->ctime);
+ printk(KERN_INFO "md: L%d S%08d ND:%d RD:%d md%d LO:%d CS:%d\n",
+ sb->level, sb->size, sb->nr_disks, sb->raid_disks,
+ sb->md_minor, sb->layout, sb->chunk_size);
+ printk(KERN_INFO "md: UT:%08x ST:%d AD:%d WD:%d"
+ " FD:%d SD:%d CSUM:%08x E:%08lx\n",
+ sb->utime, sb->state, sb->active_disks, sb->working_disks,
+ sb->failed_disks, sb->spare_disks,
+ sb->sb_csum, (unsigned long)sb->events_lo);
+
+ printk(KERN_INFO);
+ for (i = 0; i < MD_SB_DISKS; i++) {
+ mdp_disk_t *desc;
+
+ desc = sb->disks + i;
+ if (desc->number || desc->major || desc->minor ||
+ desc->raid_disk || (desc->state && (desc->state != 4))) {
+ printk(" D %2d: ", i);
+ print_desc(desc);
+ }
+ }
+ printk(KERN_INFO "md: THIS: ");
+ print_desc(&sb->this_disk);
+
+}
+
+static void print_rdev(mdk_rdev_t *rdev)
+{
+ printk(KERN_INFO "md: rdev %s, SZ:%08llu F:%d S:%d DN:%d ",
+ bdev_partition_name(rdev->bdev), (unsigned long long)rdev->size,
+ rdev->faulty, rdev->in_sync, rdev->desc_nr);
+ if (rdev->sb_loaded) {
+ printk(KERN_INFO "md: rdev superblock:\n");
+ print_sb((mdp_super_t*)page_address(rdev->sb_page));
+ } else
+ printk(KERN_INFO "md: no rdev superblock!\n");
+}
+
+void md_print_devices(void)
+{
+ struct list_head *tmp, *tmp2;
+ mdk_rdev_t *rdev;
+ mddev_t *mddev;
+
+ printk("\n");
+ printk("md: **********************************\n");
+ printk("md: * <COMPLETE RAID STATE PRINTOUT> *\n");
+ printk("md: **********************************\n");
+ ITERATE_MDDEV(mddev,tmp) {
+ printk("md%d: ", mdidx(mddev));
+
+ ITERATE_RDEV(mddev,rdev,tmp2)
+ printk("<%s>", bdev_partition_name(rdev->bdev));
+
+ ITERATE_RDEV(mddev,rdev,tmp2)
+ print_rdev(rdev);
+ }
+ printk("md: **********************************\n");
+ printk("\n");
+}
+
+
+static int write_disk_sb(mdk_rdev_t * rdev)
+{
+
+ if (!rdev->sb_loaded) {
+ MD_BUG();
+ return 1;
+ }
+ if (rdev->faulty) {
+ MD_BUG();
+ return 1;
+ }
+
+ dprintk(KERN_INFO "(write) %s's sb offset: %llu\n",
+ bdev_partition_name(rdev->bdev),
+ (unsigned long long)rdev->sb_offset);
+
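+	/*
+	 * sb_offset appears to be kept in 1K blocks; the <<1 below converts
+	 * it to the 512-byte sectors that sync_page_io() works in.
+	 */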
+ if (sync_page_io(rdev->bdev, rdev->sb_offset<<1, MD_SB_BYTES, rdev->sb_page, WRITE))
+ return 0;
+
+ printk("md: write_disk_sb failed for device %s\n",
+ bdev_partition_name(rdev->bdev));
+ return 1;
+}
+
+static void sync_sbs(mddev_t * mddev)
+{
+ mdk_rdev_t *rdev;
+ struct list_head *tmp;
+
+ ITERATE_RDEV(mddev,rdev,tmp) {
+ super_types[mddev->major_version].
+ sync_super(mddev, rdev);
+ rdev->sb_loaded = 1;
+ }
+}
+
+static void md_update_sb(mddev_t * mddev)
+{
+ int err, count = 100;
+ struct list_head *tmp;
+ mdk_rdev_t *rdev;
+
+ mddev->sb_dirty = 0;
+repeat:
+ mddev->utime = get_seconds();
+ mddev->events ++;
+
+ if (!mddev->events) {
+ /*
+ * oops, this 64-bit counter should never wrap.
+	 * Either it is around the year ~1 trillion A.D., assuming
+ * 1 reboot per second, or we have a bug:
+ */
+ MD_BUG();
+ mddev->events --;
+ }
+ sync_sbs(mddev);
+
+ /*
+ * do not write anything to disk if using
+ * nonpersistent superblocks
+ */
+ if (!mddev->persistent)
+ return;
+
+ dprintk(KERN_INFO
+ "md: updating md%d RAID superblock on device (in sync %d)\n",
+ mdidx(mddev),mddev->in_sync);
+
+ err = 0;
+ ITERATE_RDEV(mddev,rdev,tmp) {
+ dprintk(KERN_INFO "md: ");
+ if (rdev->faulty)
+ dprintk("(skipping faulty ");
+
+ dprintk("%s ", bdev_partition_name(rdev->bdev));
+ if (!rdev->faulty) {
+ err += write_disk_sb(rdev);
+ } else
+ dprintk(")\n");
+ if (!err && mddev->level == LEVEL_MULTIPATH)
+ /* only need to write one superblock... */
+ break;
+ }
+ if (err) {
+ if (--count) {
+ printk(KERN_ERR "md: errors occurred during superblock"
+ " update, repeating\n");
+ goto repeat;
+ }
+ printk(KERN_ERR \
+ "md: excessive errors occurred during superblock update, exiting\n");
+ }
+}
+
+/*
+ * Import a device. If 'super_format' >= 0, then sanity check the superblock
+ *
+ * mark the device faulty if:
+ *
+ * - the device is nonexistent (zero size)
+ * - the device has no valid superblock
+ *
+ * a faulty rdev _never_ has rdev->sb set.
+ */
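+/*
+ * Note that on failure this returns ERR_PTR(-errno) rather than NULL,
+ * so callers are expected to test the result with IS_ERR()/PTR_ERR().
+ */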
+static mdk_rdev_t *md_import_device(dev_t newdev, int super_format, int super_minor)
+{
+ int err;
+ mdk_rdev_t *rdev;
+ sector_t size;
+
+ rdev = (mdk_rdev_t *) kmalloc(sizeof(*rdev), GFP_KERNEL);
+ if (!rdev) {
+ printk(KERN_ERR "md: could not alloc mem for %s!\n",
+ partition_name(newdev));
+ return ERR_PTR(-ENOMEM);
+ }
+ memset(rdev, 0, sizeof(*rdev));
+
+ if ((err = alloc_disk_sb(rdev)))
+ goto abort_free;
+
+ err = lock_rdev(rdev, newdev);
+ if (err) {
+ printk(KERN_ERR "md: could not lock %s.\n",
+ partition_name(newdev));
+ goto abort_free;
+ }
+ rdev->desc_nr = -1;
+ rdev->faulty = 0;
+ rdev->in_sync = 0;
+ rdev->data_offset = 0;
+ atomic_set(&rdev->nr_pending, 0);
+
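+	/*
+	 * i_size is in bytes; shifting by BLOCK_SIZE_BITS (presumably 10
+	 * here) gives the size in 1K blocks, which is only checked for zero.
+	 */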
+ size = rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS;
+ if (!size) {
+ printk(KERN_WARNING
+ "md: %s has zero or unknown size, marking faulty!\n",
+ bdev_partition_name(rdev->bdev));
+ err = -EINVAL;
+ goto abort_free;
+ }
+
+ if (super_format >= 0) {
+ err = super_types[super_format].
+ load_super(rdev, NULL, super_minor);
+ if (err == -EINVAL) {
+ printk(KERN_WARNING
+ "md: %s has invalid sb, not importing!\n",
+ bdev_partition_name(rdev->bdev));
+ goto abort_free;
+ }
+ if (err < 0) {
+ printk(KERN_WARNING
+ "md: could not read %s's sb, not importing!\n",
+ bdev_partition_name(rdev->bdev));
+ goto abort_free;
+ }
+ }
+ INIT_LIST_HEAD(&rdev->same_set);
+
+ return rdev;
+
+abort_free:
+ if (rdev->sb_page) {
+ if (rdev->bdev)
+ unlock_rdev(rdev);
+ free_disk_sb(rdev);
+ }
+ kfree(rdev);
+ return ERR_PTR(err);
+}
+
+/*
+ * Check a full RAID array for plausibility
+ */
+
+
+static int analyze_sbs(mddev_t * mddev)
+{
+ int i;
+ struct list_head *tmp;
+ mdk_rdev_t *rdev, *freshest;
+
+ freshest = NULL;
+ ITERATE_RDEV(mddev,rdev,tmp)
+ switch (super_types[mddev->major_version].
+ load_super(rdev, freshest, mddev->minor_version)) {
+ case 1:
+ freshest = rdev;
+ break;
+ case 0:
+ break;
+ default:
+ printk( KERN_ERR \
+ "md: fatal superblock inconsistency in %s"
+ " -- removing from array\n",
+ bdev_partition_name(rdev->bdev));
+ kick_rdev_from_array(rdev);
+ }
+
+
+ super_types[mddev->major_version].
+ validate_super(mddev, freshest);
+
+ i = 0;
+ ITERATE_RDEV(mddev,rdev,tmp) {
+ if (rdev != freshest)
+ if (super_types[mddev->major_version].
+ validate_super(mddev, rdev)) {
+ printk(KERN_WARNING "md: kicking non-fresh %s"
+ " from array!\n",
+ bdev_partition_name(rdev->bdev));
+ kick_rdev_from_array(rdev);
+ continue;
+ }
+ if (mddev->level == LEVEL_MULTIPATH) {
+ rdev->desc_nr = i++;
+ rdev->raid_disk = rdev->desc_nr;
+ rdev->in_sync = 1;
+ }
+ }
+
+
+ /*
+ * Check if we can support this RAID array
+ */
+ if (mddev->major_version != MD_MAJOR_VERSION ||
+ mddev->minor_version > MD_MINOR_VERSION) {
+ printk(KERN_ALERT
+ "md: md%d: unsupported raid array version %d.%d.%d\n",
+ mdidx(mddev), mddev->major_version,
+ mddev->minor_version, mddev->patch_version);
+ goto abort;
+ }
+
+ if ((mddev->recovery_cp != MaxSector) && ((mddev->level == 1) ||
+ (mddev->level == 4) || (mddev->level == 5)))
+ printk(KERN_ERR "md: md%d: raid array is not clean"
+ " -- starting background reconstruction\n",
+ mdidx(mddev));
+
+ return 0;
+abort:
+ return 1;
+}
+
+static struct gendisk *md_probe(dev_t dev, int *part, void *data)
+{
+ static DECLARE_MUTEX(disks_sem);
+ int unit = MINOR(dev);
+ mddev_t *mddev = mddev_find(unit);
+ struct gendisk *disk;
+
+ if (!mddev)
+ return NULL;
+
+ down(&disks_sem);
+ if (disks[unit]) {
+ up(&disks_sem);
+ mddev_put(mddev);
+ return NULL;
+ }
+ disk = alloc_disk(1);
+ if (!disk) {
+ up(&disks_sem);
+ mddev_put(mddev);
+ return NULL;
+ }
+ disk->major = MD_MAJOR;
+ disk->first_minor = mdidx(mddev);
+ sprintf(disk->disk_name, "md%d", mdidx(mddev));
+ disk->fops = &md_fops;
+ disk->private_data = mddev;
+ disk->queue = &mddev->queue;
+ add_disk(disk);
+ disks[mdidx(mddev)] = disk;
+ up(&disks_sem);
+ return NULL;
+}
+
+void md_wakeup_thread(mdk_thread_t *thread);
+
+static void md_safemode_timeout(unsigned long data)
+{
+ mddev_t *mddev = (mddev_t *) data;
+
+ mddev->safemode = 1;
+ md_wakeup_thread(mddev->thread);
+}
+
+
+static int do_md_run(mddev_t * mddev)
+{
+ int pnum, err;
+ int chunk_size;
+ struct list_head *tmp;
+ mdk_rdev_t *rdev;
+ struct gendisk *disk;
+
+ if (list_empty(&mddev->disks)) {
+ MD_BUG();
+ return -EINVAL;
+ }
+
+ if (mddev->pers)
+ return -EBUSY;
+
+ /*
+ * Analyze all RAID superblock(s)
+ */
+ if (!mddev->raid_disks && analyze_sbs(mddev)) {
+ MD_BUG();
+ return -EINVAL;
+ }
+
+ chunk_size = mddev->chunk_size;
+ pnum = level_to_pers(mddev->level);
+
+ if ((pnum != MULTIPATH) && (pnum != RAID1)) {
+ if (!chunk_size) {
+ /*
+ * 'default chunksize' in the old md code used to
+ * be PAGE_SIZE, baaad.
+ * we abort here to be on the safe side. We don't
+ * want to continue the bad practice.
+ */
+ printk(KERN_ERR
+ "no chunksize specified, see 'man raidtab'\n");
+ return -EINVAL;
+ }
+ if (chunk_size > MAX_CHUNK_SIZE) {
+ printk(KERN_ERR "too big chunk_size: %d > %d\n",
+ chunk_size, MAX_CHUNK_SIZE);
+ return -EINVAL;
+ }
+ /*
+		 * chunk-size has to be a power of 2 and a multiple of PAGE_SIZE
+ */
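+		/*
+		 * ffz(~x) gives the index of the lowest set bit of x, so the
+		 * test below accepts only exact powers of two; e.g. a 12K chunk
+		 * fails because 1 << ffz(~0x3000) is 0x1000, not 0x3000.
+		 */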
+ if ( (1 << ffz(~chunk_size)) != chunk_size) {
+ MD_BUG();
+ return -EINVAL;
+ }
+ if (chunk_size < PAGE_SIZE) {
+ printk(KERN_ERR "too small chunk_size: %d < %ld\n",
+ chunk_size, PAGE_SIZE);
+ return -EINVAL;
+ }
+
+ /* devices must have minimum size of one chunk */
+ ITERATE_RDEV(mddev,rdev,tmp) {
+ if (rdev->faulty)
+ continue;
+ if (rdev->size < chunk_size / 1024) {
+ printk(KERN_WARNING
+ "md: Dev %s smaller than chunk_size:"
+ " %lluk < %dk\n",
+ bdev_partition_name(rdev->bdev),
+ (unsigned long long)rdev->size,
+ chunk_size / 1024);
+ return -EINVAL;
+ }
+ }
+ }
+ if (pnum >= MAX_PERSONALITY) {
+ MD_BUG();
+ return -EINVAL;
+ }
+
+#ifdef CONFIG_KMOD
+ if (!pers[pnum])
+ {
+ char module_name[80];
+ sprintf (module_name, "md-personality-%d", pnum);
+ request_module (module_name);
+ }
+#endif
+
+ /*
+ * Drop all container device buffers, from now on
+ * the only valid external interface is through the md
+ * device.
+ * Also find largest hardsector size
+ */
+ ITERATE_RDEV(mddev,rdev,tmp) {
+ if (rdev->faulty)
+ continue;
+ sync_blockdev(rdev->bdev);
+ invalidate_bdev(rdev->bdev, 0);
+ }
+
+ md_probe(mdidx(mddev), NULL, NULL);
+ disk = disks[mdidx(mddev)];
+ if (!disk)
+ return -ENOMEM;
+
+ spin_lock(&pers_lock);
+ if (!pers[pnum] || !try_module_get(pers[pnum]->owner)) {
+ spin_unlock(&pers_lock);
+ printk(KERN_ERR "md: personality %d is not loaded!\n",
+ pnum);
+ return -EINVAL;
+ }
+
+ mddev->pers = pers[pnum];
+ spin_unlock(&pers_lock);
+
+ blk_queue_make_request(&mddev->queue, mddev->pers->make_request);
+ printk("%s: setting max_sectors to %d, segment boundary to %d\n",
+ disk->disk_name,
+ chunk_size >> 9,
+ (chunk_size>>1)-1);
+ blk_queue_max_sectors(&mddev->queue, chunk_size >> 9);
+ blk_queue_segment_boundary(&mddev->queue, (chunk_size>>1) - 1);
+ mddev->queue.queuedata = mddev;
+
+ err = mddev->pers->run(mddev);
+ if (err) {
+ printk(KERN_ERR "md: pers->run() failed ...\n");
+ module_put(mddev->pers->owner);
+ mddev->pers = NULL;
+ return -EINVAL;
+ }
+ atomic_set(&mddev->writes_pending,0);
+ mddev->safemode = 0;
+ mddev->safemode_timer.function = md_safemode_timeout;
+ mddev->safemode_timer.data = (unsigned long) mddev;
+ mddev->safemode_delay = (20 * HZ)/1000 +1; /* 20 msec delay */
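+	/*
+	 * (20 * HZ)/1000 converts 20ms to jiffies; the +1 keeps the delay
+	 * non-zero even where the integer division would truncate to 0.
+	 */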
+ mddev->in_sync = 1;
+
+ set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+ md_wakeup_thread(mddev->thread);
+ set_capacity(disk, mddev->array_size<<1);
+ return 0;
+}
+
+static int restart_array(mddev_t *mddev)
+{
+ struct gendisk *disk = disks[mdidx(mddev)];
+ int err;
+
+ /*
+ * Complain if it has no devices
+ */
+ err = -ENXIO;
+ if (list_empty(&mddev->disks))
+ goto out;
+
+ if (mddev->pers) {
+ err = -EBUSY;
+ if (!mddev->ro)
+ goto out;
+
+ mddev->safemode = 0;
+ mddev->ro = 0;
+ set_disk_ro(disk, 0);
+
+ printk(KERN_INFO "md: md%d switched to read-write mode.\n",
+ mdidx(mddev));
+ /*
+ * Kick recovery or resync if necessary
+ */
+ set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+ md_wakeup_thread(mddev->thread);
+ err = 0;
+ } else {
+ printk(KERN_ERR "md: md%d has no personality assigned.\n",
+ mdidx(mddev));
+ err = -EINVAL;
+ }
+
+out:
+ return err;
+}
+
+static int do_md_stop(mddev_t * mddev, int ro)
+{
+ int err = 0;
+ struct gendisk *disk = disks[mdidx(mddev)];
+
+ if (atomic_read(&mddev->active)>2) {
+ printk("md: md%d still in use.\n",mdidx(mddev));
+ err = -EBUSY;
+ goto out;
+ }
+
+ if (mddev->pers) {
+ if (mddev->sync_thread) {
+ set_bit(MD_RECOVERY_INTR, &mddev->recovery);
+ md_unregister_thread(mddev->sync_thread);
+ mddev->sync_thread = NULL;
+ }
+
+ del_timer_sync(&mddev->safemode_timer);
+
+ invalidate_device(mk_kdev(disk->major, disk->first_minor), 1);
+
+ if (ro) {
+ err = -ENXIO;
+ if (mddev->ro)
+ goto out;
+ mddev->ro = 1;
+ } else {
+ if (mddev->ro)
+ set_disk_ro(disk, 0);
+ if (mddev->pers->stop(mddev)) {
+ err = -EBUSY;
+ if (mddev->ro)
+ set_disk_ro(disk, 1);
+ goto out;
+ }
+ module_put(mddev->pers->owner);
+ mddev->pers = NULL;
+ if (mddev->ro)
+ mddev->ro = 0;
+ }
+ if (mddev->raid_disks) {
+ /* mark array as shutdown cleanly */
+ mddev->in_sync = 1;
+ md_update_sb(mddev);
+ }
+ if (ro)
+ set_disk_ro(disk, 1);
+ }
+ /*
+ * Free resources if final stop
+ */
+ if (!ro) {
+ struct gendisk *disk;
+ printk(KERN_INFO "md: md%d stopped.\n", mdidx(mddev));
+
+ export_array(mddev);
+
+ mddev->array_size = 0;
+ disk = disks[mdidx(mddev)];
+ if (disk)
+ set_capacity(disk, 0);
+ } else
+ printk(KERN_INFO "md: md%d switched to read-only mode.\n",
+ mdidx(mddev));
+ err = 0;
+out:
+ return err;
+}
+
+static void autorun_array(mddev_t *mddev)
+{
+ mdk_rdev_t *rdev;
+ struct list_head *tmp;
+ int err;
+
+ if (list_empty(&mddev->disks)) {
+ MD_BUG();
+ return;
+ }
+
+ printk(KERN_INFO "md: running: ");
+
+ ITERATE_RDEV(mddev,rdev,tmp) {
+ printk("<%s>", bdev_partition_name(rdev->bdev));
+ }
+ printk("\n");
+
+ err = do_md_run (mddev);
+ if (err) {
+ printk(KERN_WARNING "md :do_md_run() returned %d\n", err);
+ do_md_stop (mddev, 0);
+ }
+}
+
+/*
+ * let's try to run arrays based on all disks that have arrived
+ * until now. (those are in pending_raid_disks)
+ *
+ * the method: pick the first pending disk, collect all disks with
+ * the same UUID, remove all from the pending list and put them into
+ * the 'same_array' list. Then order this list based on superblock
+ * update time (freshest comes first), kick out 'old' disks and
+ * compare superblocks. If everything's fine then run it.
+ *
+ * If "unit" is allocated, then bump its reference count
+ */
+static void autorun_devices(void)
+{
+ struct list_head candidates;
+ struct list_head *tmp;
+ mdk_rdev_t *rdev0, *rdev;
+ mddev_t *mddev;
+
+ printk(KERN_INFO "md: autorun ...\n");
+ while (!list_empty(&pending_raid_disks)) {
+ rdev0 = list_entry(pending_raid_disks.next,
+ mdk_rdev_t, same_set);
+
+ printk(KERN_INFO "md: considering %s ...\n",
+ bdev_partition_name(rdev0->bdev));
+ INIT_LIST_HEAD(&candidates);
+ ITERATE_RDEV_PENDING(rdev,tmp)
+ if (super_90_load(rdev, rdev0, 0) >= 0) {
+ printk(KERN_INFO "md: adding %s ...\n",
+ bdev_partition_name(rdev->bdev));
+ list_move(&rdev->same_set, &candidates);
+ }
+ /*
+ * now we have a set of devices, with all of them having
+ * mostly sane superblocks. It's time to allocate the
+ * mddev.
+ */
+
+ mddev = mddev_find(rdev0->preferred_minor);
+ if (!mddev) {
+ printk(KERN_ERR
+ "md: cannot allocate memory for md drive.\n");
+ break;
+ }
+ if (mddev_lock(mddev))
+ printk(KERN_WARNING "md: md%d locked, cannot run\n",
+ mdidx(mddev));
+ else if (mddev->raid_disks || mddev->major_version
+ || !list_empty(&mddev->disks)) {
+ printk(KERN_WARNING
+ "md: md%d already running, cannot run %s\n",
+ mdidx(mddev), bdev_partition_name(rdev0->bdev));
+ mddev_unlock(mddev);
+ } else {
+ printk(KERN_INFO "md: created md%d\n", mdidx(mddev));
+ ITERATE_RDEV_GENERIC(candidates,rdev,tmp) {
+ list_del_init(&rdev->same_set);
+ if (bind_rdev_to_array(rdev, mddev))
+ export_rdev(rdev);
+ }
+ autorun_array(mddev);
+ mddev_unlock(mddev);
+ }
+ /* on success, candidates will be empty, on error
+ * it won't...
+ */
+ ITERATE_RDEV_GENERIC(candidates,rdev,tmp)
+ export_rdev(rdev);
+ mddev_put(mddev);
+ }
+ printk(KERN_INFO "md: ... autorun DONE.\n");
+}
+
+/*
+ * import RAID devices based on one partition
+ * if possible, the array gets run as well.
+ */
+
+static int autostart_array(dev_t startdev)
+{
+ int err = -EINVAL, i;
+ mdp_super_t *sb = NULL;
+ mdk_rdev_t *start_rdev = NULL, *rdev;
+
+ start_rdev = md_import_device(startdev, 0, 0);
+ if (IS_ERR(start_rdev)) {
+ printk(KERN_WARNING "md: could not import %s!\n",
+ partition_name(startdev));
+ return err;
+ }
+
+ /* NOTE: this can only work for 0.90.0 superblocks */
+ sb = (mdp_super_t*)page_address(start_rdev->sb_page);
+ if (sb->major_version != 0 ||
+ sb->minor_version != 90 ) {
+ printk(KERN_WARNING "md: can only autostart 0.90.0 arrays\n");
+ export_rdev(start_rdev);
+ return err;
+ }
+
+ if (start_rdev->faulty) {
+ printk(KERN_WARNING
+ "md: can not autostart based on faulty %s!\n",
+ bdev_partition_name(start_rdev->bdev));
+ export_rdev(start_rdev);
+ return err;
+ }
+ list_add(&start_rdev->same_set, &pending_raid_disks);
+
+ for (i = 0; i < MD_SB_DISKS; i++) {
+ mdp_disk_t *desc;
+ dev_t dev;
+
+ desc = sb->disks + i;
+ dev = MKDEV(desc->major, desc->minor);
+
+ if (!dev)
+ continue;
+ if (dev == startdev)
+ continue;
+ rdev = md_import_device(dev, 0, 0);
+ if (IS_ERR(rdev)) {
+ printk(KERN_WARNING "md: could not import %s,"
+ " trying to run array nevertheless.\n",
+ partition_name(dev));
+ continue;
+ }
+ list_add(&rdev->same_set, &pending_raid_disks);
+ }
+
+ /*
+	 * possibly return error codes one day; for now this always succeeds
+ */
+ autorun_devices();
+ return 0;
+
+}
+
+
+static int get_version(void * arg)
+{
+ mdu_version_t ver;
+
+ ver.major = MD_MAJOR_VERSION;
+ ver.minor = MD_MINOR_VERSION;
+ ver.patchlevel = MD_PATCHLEVEL_VERSION;
+
+ if (copy_to_user(arg, &ver, sizeof(ver)))
+ return -EFAULT;
+
+ return 0;
+}
+
+static int get_array_info(mddev_t * mddev, void * arg)
+{
+ mdu_array_info_t info;
+ int nr,working,active,failed,spare;
+ mdk_rdev_t *rdev;
+ struct list_head *tmp;
+
+ nr=working=active=failed=spare=0;
+ ITERATE_RDEV(mddev,rdev,tmp) {
+ nr++;
+ if (rdev->faulty)
+ failed++;
+ else {
+ working++;
+ if (rdev->in_sync)
+ active++;
+ else
+ spare++;
+ }
+ }
+
+ info.major_version = mddev->major_version;
+ info.minor_version = mddev->minor_version;
+ info.patch_version = 1;
+ info.ctime = mddev->ctime;
+ info.level = mddev->level;
+ info.size = mddev->size;
+ info.nr_disks = nr;
+ info.raid_disks = mddev->raid_disks;
+ info.md_minor = mddev->__minor;
+ info.not_persistent= !mddev->persistent;
+
+ info.utime = mddev->utime;
+ info.state = 0;
+ if (mddev->in_sync)
+ info.state = (1<<MD_SB_CLEAN);
+ info.active_disks = active;
+ info.working_disks = working;
+ info.failed_disks = failed;
+ info.spare_disks = spare;
+
+ info.layout = mddev->layout;
+ info.chunk_size = mddev->chunk_size;
+
+ if (copy_to_user(arg, &info, sizeof(info)))
+ return -EFAULT;
+
+ return 0;
+}
+
+static int get_disk_info(mddev_t * mddev, void * arg)
+{
+ mdu_disk_info_t info;
+ unsigned int nr;
+ mdk_rdev_t *rdev;
+
+ if (copy_from_user(&info, arg, sizeof(info)))
+ return -EFAULT;
+
+ nr = info.number;
+
+ rdev = find_rdev_nr(mddev, nr);
+ if (rdev) {
+ info.major = MAJOR(rdev->bdev->bd_dev);
+ info.minor = MINOR(rdev->bdev->bd_dev);
+ info.raid_disk = rdev->raid_disk;
+ info.state = 0;
+ if (rdev->faulty)
+ info.state |= (1<<MD_DISK_FAULTY);
+ else if (rdev->in_sync) {
+ info.state |= (1<<MD_DISK_ACTIVE);
+ info.state |= (1<<MD_DISK_SYNC);
+ }
+ } else {
+ info.major = info.minor = 0;
+ info.raid_disk = -1;
+ info.state = (1<<MD_DISK_REMOVED);
+ }
+
+ if (copy_to_user(arg, &info, sizeof(info)))
+ return -EFAULT;
+
+ return 0;
+}
+
+static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info)
+{
+ mdk_rdev_t *rdev;
+ dev_t dev;
+ dev = MKDEV(info->major,info->minor);
+ if (!mddev->raid_disks) {
+ int err;
+ /* expecting a device which has a superblock */
+ rdev = md_import_device(dev, mddev->major_version, mddev->minor_version);
+ if (IS_ERR(rdev)) {
+ printk(KERN_WARNING
+ "md: md_import_device returned %ld\n",
+ PTR_ERR(rdev));
+ return PTR_ERR(rdev);
+ }
+ if (!list_empty(&mddev->disks)) {
+ mdk_rdev_t *rdev0 = list_entry(mddev->disks.next,
+ mdk_rdev_t, same_set);
+ int err = super_types[mddev->major_version]
+ .load_super(rdev, rdev0, mddev->minor_version);
+ if (err < 0) {
+ printk(KERN_WARNING
+ "md: %s has different UUID to %s\n",
+ bdev_partition_name(rdev->bdev),
+ bdev_partition_name(rdev0->bdev));
+ export_rdev(rdev);
+ return -EINVAL;
+ }
+ }
+ err = bind_rdev_to_array(rdev, mddev);
+ if (err)
+ export_rdev(rdev);
+ return err;
+ }
+
+ /*
+ * add_new_disk can be used once the array is assembled
+ * to add "hot spares". They must already have a superblock
+ * written
+ */
+ if (mddev->pers) {
+ int err;
+ if (!mddev->pers->hot_add_disk) {
+ printk(KERN_WARNING
+ "md%d: personality does not support diskops!\n",
+ mdidx(mddev));
+ return -EINVAL;
+ }
+ rdev = md_import_device(dev, mddev->major_version,
+ mddev->minor_version);
+ if (IS_ERR(rdev)) {
+ printk(KERN_WARNING
+ "md: md_import_device returned %ld\n",
+ PTR_ERR(rdev));
+ return PTR_ERR(rdev);
+ }
+ rdev->in_sync = 0; /* just to be sure */
+ rdev->raid_disk = -1;
+ err = bind_rdev_to_array(rdev, mddev);
+ if (err)
+ export_rdev(rdev);
+ if (mddev->thread)
+ md_wakeup_thread(mddev->thread);
+ return err;
+ }
+
+ /* otherwise, add_new_disk is only allowed
+ * for major_version==0 superblocks
+ */
+ if (mddev->major_version != 0) {
+ printk(KERN_WARNING "md%d: ADD_NEW_DISK not supported\n",
+ mdidx(mddev));
+ return -EINVAL;
+ }
+
+ if (!(info->state & (1<<MD_DISK_FAULTY))) {
+ int err;
+ rdev = md_import_device (dev, -1, 0);
+ if (IS_ERR(rdev)) {
+ printk(KERN_WARNING
+ "md: error, md_import_device() returned %ld\n",
+ PTR_ERR(rdev));
+ return PTR_ERR(rdev);
+ }
+ rdev->desc_nr = info->number;
+ if (info->raid_disk < mddev->raid_disks)
+ rdev->raid_disk = info->raid_disk;
+ else
+ rdev->raid_disk = -1;
+
+ rdev->faulty = 0;
+ if (rdev->raid_disk < mddev->raid_disks)
+ rdev->in_sync = (info->state & (1<<MD_DISK_SYNC));
+ else
+ rdev->in_sync = 0;
+
+ err = bind_rdev_to_array(rdev, mddev);
+ if (err) {
+ export_rdev(rdev);
+ return err;
+ }
+
+ if (!mddev->persistent) {
+ printk(KERN_INFO "md: nonpersistent superblock ...\n");
+ rdev->sb_offset = rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS;
+ } else
+ rdev->sb_offset = calc_dev_sboffset(rdev->bdev);
+ rdev->size = calc_dev_size(rdev, mddev->chunk_size);
+
+ if (!mddev->size || (mddev->size > rdev->size))
+ mddev->size = rdev->size;
+ }
+
+ return 0;
+}
+
+static int hot_generate_error(mddev_t * mddev, dev_t dev)
+{
+ struct request_queue *q;
+ mdk_rdev_t *rdev;
+
+ if (!mddev->pers)
+ return -ENODEV;
+
+ printk(KERN_INFO "md: trying to generate %s error in md%d ... \n",
+ partition_name(dev), mdidx(mddev));
+
+ rdev = find_rdev(mddev, dev);
+ if (!rdev) {
+ MD_BUG();
+ return -ENXIO;
+ }
+
+ if (rdev->desc_nr == -1) {
+ MD_BUG();
+ return -EINVAL;
+ }
+ if (!rdev->in_sync)
+ return -ENODEV;
+
+ q = bdev_get_queue(rdev->bdev);
+ if (!q) {
+ MD_BUG();
+ return -ENODEV;
+ }
+ printk(KERN_INFO "md: okay, generating error!\n");
+// q->oneshot_error = 1; // disabled for now
+
+ return 0;
+}
+
+static int hot_remove_disk(mddev_t * mddev, dev_t dev)
+{
+ mdk_rdev_t *rdev;
+
+ if (!mddev->pers)
+ return -ENODEV;
+
+ printk(KERN_INFO "md: trying to remove %s from md%d ... \n",
+ partition_name(dev), mdidx(mddev));
+
+ rdev = find_rdev(mddev, dev);
+ if (!rdev)
+ return -ENXIO;
+
+ if (rdev->raid_disk >= 0)
+ goto busy;
+
+ kick_rdev_from_array(rdev);
+ md_update_sb(mddev);
+
+ return 0;
+busy:
+ printk(KERN_WARNING "md: cannot remove active disk %s from md%d ... \n",
+ bdev_partition_name(rdev->bdev), mdidx(mddev));
+ return -EBUSY;
+}
+
+static int hot_add_disk(mddev_t * mddev, dev_t dev)
+{
+ int err;
+ unsigned int size;
+ mdk_rdev_t *rdev;
+
+ if (!mddev->pers)
+ return -ENODEV;
+
+ printk(KERN_INFO "md: trying to hot-add %s to md%d ... \n",
+ partition_name(dev), mdidx(mddev));
+
+ if (mddev->major_version != 0) {
+ printk(KERN_WARNING "md%d: HOT_ADD may only be used with"
+ " version-0 superblocks.\n",
+ mdidx(mddev));
+ return -EINVAL;
+ }
+ if (!mddev->pers->hot_add_disk) {
+ printk(KERN_WARNING
+ "md%d: personality does not support diskops!\n",
+ mdidx(mddev));
+ return -EINVAL;
+ }
+
+ rdev = md_import_device (dev, -1, 0);
+ if (IS_ERR(rdev)) {
+ printk(KERN_WARNING
+ "md: error, md_import_device() returned %ld\n",
+ PTR_ERR(rdev));
+ return -EINVAL;
+ }
+
+ rdev->sb_offset = calc_dev_sboffset(rdev->bdev);
+ size = calc_dev_size(rdev, mddev->chunk_size);
+ rdev->size = size;
+
+ if (size < mddev->size) {
+ printk(KERN_WARNING
+ "md%d: disk size %llu blocks < array size %llu\n",
+ mdidx(mddev), (unsigned long long)size,
+ (unsigned long long)mddev->size);
+ err = -ENOSPC;
+ goto abort_export;
+ }
+
+ if (rdev->faulty) {
+ printk(KERN_WARNING
+ "md: can not hot-add faulty %s disk to md%d!\n",
+ bdev_partition_name(rdev->bdev), mdidx(mddev));
+ err = -EINVAL;
+ goto abort_export;
+ }
+ rdev->in_sync = 0;
+ rdev->desc_nr = -1;
+ bind_rdev_to_array(rdev, mddev);
+
+ /*
+	 * The rest had better be atomic: disk failures can be
+	 * noticed in interrupt context ...
+ */
+
+ if (rdev->desc_nr == mddev->max_disks) {
+ printk(KERN_WARNING "md%d: can not hot-add to full array!\n",
+ mdidx(mddev));
+ err = -EBUSY;
+ goto abort_unbind_export;
+ }
+
+ rdev->raid_disk = -1;
+
+ md_update_sb(mddev);
+
+ /*
+ * Kick recovery, maybe this spare has to be added to the
+ * array immediately.
+ */
+ set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+ md_wakeup_thread(mddev->thread);
+
+ return 0;
+
+abort_unbind_export:
+ unbind_rdev_from_array(rdev);
+
+abort_export:
+ export_rdev(rdev);
+ return err;
+}
+
+/*
+ * set_array_info is used in two different ways.
+ * The original usage is when creating a new array.
+ * In this usage, raid_disks is > 0, and together with
+ * level, size, not_persistent, layout and chunk_size it determines
+ * the shape of the array.
+ * This will always create an array with a type-0.90.0 superblock.
+ * The newer usage is when assembling an array.
+ * In this case raid_disks will be 0, and the major_version field is
+ * used to determine which style of super-blocks are to be found on the devices.
+ * The minor and patch _version numbers are also kept in case the
+ * super_block handler wishes to interpret them.
+ */
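+/*
+ * In short, the two call shapes look like:
+ *   create:   raid_disks > 0; level, size, layout, chunk_size and
+ *             not_persistent describe the new (0.90.0) array
+ *   assemble: raid_disks == 0; only major/minor/patch_version are used
+ */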
+static int set_array_info(mddev_t * mddev, mdu_array_info_t *info)
+{
+
+ if (info->raid_disks == 0) {
+ /* just setting version number for superblock loading */
+ if (info->major_version < 0 ||
+ info->major_version >= sizeof(super_types)/sizeof(super_types[0]) ||
+ super_types[info->major_version].name == NULL) {
+ /* maybe try to auto-load a module? */
+ printk(KERN_INFO
+ "md: superblock version %d not known\n",
+ info->major_version);
+ return -EINVAL;
+ }
+ mddev->major_version = info->major_version;
+ mddev->minor_version = info->minor_version;
+ mddev->patch_version = info->patch_version;
+ return 0;
+ }
+ mddev->major_version = MD_MAJOR_VERSION;
+ mddev->minor_version = MD_MINOR_VERSION;
+ mddev->patch_version = MD_PATCHLEVEL_VERSION;
+ mddev->ctime = get_seconds();
+
+ mddev->level = info->level;
+ mddev->size = info->size;
+ mddev->raid_disks = info->raid_disks;
+ /* don't set __minor, it is determined by which /dev/md* was
+	 * opened
+ */
+ if (info->state & (1<<MD_SB_CLEAN))
+ mddev->recovery_cp = MaxSector;
+ else
+ mddev->recovery_cp = 0;
+ mddev->persistent = ! info->not_persistent;
+
+ mddev->layout = info->layout;
+ mddev->chunk_size = info->chunk_size;
+
+ mddev->max_disks = MD_SB_DISKS;
+
+
+ /*
+ * Generate a 128 bit UUID
+ */
+ get_random_bytes(mddev->uuid, 16);
+
+ return 0;
+}
+
+static int set_disk_faulty(mddev_t *mddev, dev_t dev)
+{
+ mdk_rdev_t *rdev;
+
+ rdev = find_rdev(mddev, dev);
+ if (!rdev)
+ return 0;
+
+ md_error(mddev, rdev);
+ return 1;
+}
+
+static int md_ioctl(struct inode *inode, struct file *file,
+ unsigned int cmd, unsigned long arg)
+{
+ unsigned int minor;
+ int err = 0;
+ struct hd_geometry *loc = (struct hd_geometry *) arg;
+ mddev_t *mddev = NULL;
+ kdev_t dev;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EACCES;
+
+ dev = inode->i_rdev;
+ minor = minor(dev);
+ if (minor >= MAX_MD_DEVS) {
+ MD_BUG();
+ return -EINVAL;
+ }
+
+ /*
+ * Commands dealing with the RAID driver but not any
+ * particular array:
+ */
+ switch (cmd)
+ {
+ case RAID_VERSION:
+ err = get_version((void *)arg);
+ goto done;
+
+ case PRINT_RAID_DEBUG:
+ err = 0;
+ md_print_devices();
+ goto done;
+
+#ifndef MODULE
+ case RAID_AUTORUN:
+ err = 0;
+ autostart_arrays();
+ goto done;
+#endif
+ default:;
+ }
+
+ /*
+ * Commands creating/starting a new array:
+ */
+
+ mddev = inode->i_bdev->bd_inode->u.generic_ip;
+
+ if (!mddev) {
+ BUG();
+ goto abort;
+ }
+
+
+ if (cmd == START_ARRAY) {
+ /* START_ARRAY doesn't need to lock the array as autostart_array
+ * does the locking, and it could even be a different array
+ */
+ err = autostart_array(arg);
+ if (err) {
+ printk(KERN_WARNING "md: autostart %s failed!\n",
+ partition_name(arg));
+ goto abort;
+ }
+ goto done;
+ }
+
+ err = mddev_lock(mddev);
+ if (err) {
+ printk(KERN_INFO
+ "md: ioctl lock interrupted, reason %d, cmd %d\n",
+ err, cmd);
+ goto abort;
+ }
+
+ switch (cmd)
+ {
+ case SET_ARRAY_INFO:
+
+ if (!list_empty(&mddev->disks)) {
+ printk(KERN_WARNING
+ "md: array md%d already has disks!\n",
+ mdidx(mddev));
+ err = -EBUSY;
+ goto abort_unlock;
+ }
+ if (mddev->raid_disks) {
+ printk(KERN_WARNING
+ "md: array md%d already initialised!\n",
+ mdidx(mddev));
+ err = -EBUSY;
+ goto abort_unlock;
+ }
+ {
+ mdu_array_info_t info;
+ if (!arg)
+ memset(&info, 0, sizeof(info));
+ else if (copy_from_user(&info, (void*)arg, sizeof(info))) {
+ err = -EFAULT;
+ goto abort_unlock;
+ }
+ err = set_array_info(mddev, &info);
+ if (err) {
+ printk(KERN_WARNING "md: couldn't set"
+ " array info. %d\n", err);
+ goto abort_unlock;
+ }
+ }
+ goto done_unlock;
+
+ default:;
+ }
+
+ /*
+ * Commands querying/configuring an existing array:
+ */
+	/* if we are not initialised yet, only ADD_NEW_DISK, STOP_ARRAY or RUN_ARRAY is allowed */
+ if (!mddev->raid_disks && cmd != ADD_NEW_DISK && cmd != STOP_ARRAY && cmd != RUN_ARRAY) {
+ err = -ENODEV;
+ goto abort_unlock;
+ }
+
+ /*
+ * Commands even a read-only array can execute:
+ */
+ switch (cmd)
+ {
+ case GET_ARRAY_INFO:
+ err = get_array_info(mddev, (void *)arg);
+ goto done_unlock;
+
+ case GET_DISK_INFO:
+ err = get_disk_info(mddev, (void *)arg);
+ goto done_unlock;
+
+ case RESTART_ARRAY_RW:
+ err = restart_array(mddev);
+ goto done_unlock;
+
+ case STOP_ARRAY:
+ err = do_md_stop (mddev, 0);
+ goto done_unlock;
+
+ case STOP_ARRAY_RO:
+ err = do_md_stop (mddev, 1);
+ goto done_unlock;
+
+ /*
+ * We have a problem here : there is no easy way to give a CHS
+		 * virtual geometry. We currently pretend that we have 2 heads and
+		 * 4 sectors per track (with a BIG number of cylinders...). This drives
+ * dosfs just mad... ;-)
+ */
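+		/*
+		 * With 2 heads and 4 sectors there are 8 sectors per cylinder,
+		 * hence the get_capacity()/8 below (capacity is in 512-byte
+		 * sectors).
+		 */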
+ case HDIO_GETGEO:
+ if (!loc) {
+ err = -EINVAL;
+ goto abort_unlock;
+ }
+ err = put_user (2, (char *) &loc->heads);
+ if (err)
+ goto abort_unlock;
+ err = put_user (4, (char *) &loc->sectors);
+ if (err)
+ goto abort_unlock;
+ err = put_user(get_capacity(disks[mdidx(mddev)])/8,
+ (short *) &loc->cylinders);
+ if (err)
+ goto abort_unlock;
+ err = put_user (get_start_sect(inode->i_bdev),
+ (long *) &loc->start);
+ goto done_unlock;
+ }
+
+ /*
+ * The remaining ioctls are changing the state of the
+ * superblock, so we do not allow read-only arrays
+ * here:
+ */
+ if (mddev->ro) {
+ err = -EROFS;
+ goto abort_unlock;
+ }
+
+ switch (cmd)
+ {
+ case ADD_NEW_DISK:
+ {
+ mdu_disk_info_t info;
+ if (copy_from_user(&info, (void*)arg, sizeof(info)))
+ err = -EFAULT;
+ else
+ err = add_new_disk(mddev, &info);
+ goto done_unlock;
+ }
+ case HOT_GENERATE_ERROR:
+ err = hot_generate_error(mddev, arg);
+ goto done_unlock;
+ case HOT_REMOVE_DISK:
+ err = hot_remove_disk(mddev, arg);
+ goto done_unlock;
+
+ case HOT_ADD_DISK:
+ err = hot_add_disk(mddev, arg);
+ goto done_unlock;
+
+ case SET_DISK_FAULTY:
+ err = set_disk_faulty(mddev, arg);
+ goto done_unlock;
+
+ case RUN_ARRAY:
+ {
+ err = do_md_run (mddev);
+ /*
+ * we have to clean up the mess if
+ * the array cannot be run for some
+ * reason ...
+			 * ->pers will not be set, so the superblock will
+ * not be updated.
+ */
+ if (err)
+ do_md_stop (mddev, 0);
+ goto done_unlock;
+ }
+
+ default:
+ if (_IOC_TYPE(cmd) == MD_MAJOR)
+ printk(KERN_WARNING "md: %s(pid %d) used"
+ " obsolete MD ioctl, upgrade your"
+ " software to use new ictls.\n",
+ current->comm, current->pid);
+ err = -EINVAL;
+ goto abort_unlock;
+ }
+
+done_unlock:
+abort_unlock:
+ mddev_unlock(mddev);
+
+ return err;
+done:
+ if (err)
+ MD_BUG();
+abort:
+ return err;
+}
+
+static int md_open(struct inode *inode, struct file *file)
+{
+ /*
+ * Succeed if we can find or allocate a mddev structure.
+ */
+ mddev_t *mddev = mddev_find(minor(inode->i_rdev));
+ int err = -ENOMEM;
+
+ if (!mddev)
+ goto out;
+
+ if ((err = mddev_lock(mddev)))
+ goto put;
+
+ err = 0;
+ mddev_unlock(mddev);
+ inode->i_bdev->bd_inode->u.generic_ip = mddev_get(mddev);
+ put:
+ mddev_put(mddev);
+ out:
+ return err;
+}
+
+static int md_release(struct inode *inode, struct file * file)
+{
+ mddev_t *mddev = inode->i_bdev->bd_inode->u.generic_ip;
+
+ if (!mddev)
+ BUG();
+ mddev_put(mddev);
+
+ return 0;
+}
+
+static struct block_device_operations md_fops =
+{
+ .owner = THIS_MODULE,
+ .open = md_open,
+ .release = md_release,
+ .ioctl = md_ioctl,
+};
+
+int md_thread(void * arg)
+{
+ mdk_thread_t *thread = arg;
+
+ lock_kernel();
+
+ /*
+ * Detach thread
+ */
+
+ daemonize(thread->name, mdidx(thread->mddev));
+
+ current->exit_signal = SIGCHLD;
+ allow_signal(SIGKILL);
+ thread->tsk = current;
+
+ /*
+	 * md_thread is a 'system-thread', its priority should be very
+ * high. We avoid resource deadlocks individually in each
+ * raid personality. (RAID5 does preallocation) We also use RR and
+ * the very same RT priority as kswapd, thus we will never get
+ * into a priority inversion deadlock.
+ *
+ * we definitely have to have equal or higher priority than
+ * bdflush, otherwise bdflush will deadlock if there are too
+ * many dirty RAID5 blocks.
+ */
+ unlock_kernel();
+
+ complete(thread->event);
+ while (thread->run) {
+ void (*run)(mddev_t *);
+
+ wait_event_interruptible(thread->wqueue,
+ test_bit(THREAD_WAKEUP, &thread->flags));
+ if (current->flags & PF_FREEZE)
+ refrigerator(PF_IOTHREAD);
+
+ clear_bit(THREAD_WAKEUP, &thread->flags);
+
+ run = thread->run;
+ if (run) {
+ run(thread->mddev);
+ blk_run_queues();
+ }
+ if (signal_pending(current))
+ flush_signals(current);
+ }
+ complete(thread->event);
+ return 0;
+}
+
+void md_wakeup_thread(mdk_thread_t *thread)
+{
+ if (thread) {
+ dprintk("md: waking up MD thread %p.\n", thread);
+ set_bit(THREAD_WAKEUP, &thread->flags);
+ wake_up(&thread->wqueue);
+ }
+}
+
+mdk_thread_t *md_register_thread(void (*run) (mddev_t *), mddev_t *mddev,
+ const char *name)
+{
+ mdk_thread_t *thread;
+ int ret;
+ struct completion event;
+
+ thread = (mdk_thread_t *) kmalloc
+ (sizeof(mdk_thread_t), GFP_KERNEL);
+ if (!thread)
+ return NULL;
+
+ memset(thread, 0, sizeof(mdk_thread_t));
+ init_waitqueue_head(&thread->wqueue);
+
+ init_completion(&event);
+ thread->event = &event;
+ thread->run = run;
+ thread->mddev = mddev;
+ thread->name = name;
+ ret = kernel_thread(md_thread, thread, 0);
+ if (ret < 0) {
+ kfree(thread);
+ return NULL;
+ }
+ wait_for_completion(&event);
+ return thread;
+}
+
+void md_interrupt_thread(mdk_thread_t *thread)
+{
+ if (!thread->tsk) {
+ MD_BUG();
+ return;
+ }
+ dprintk("interrupting MD-thread pid %d\n", thread->tsk->pid);
+ send_sig(SIGKILL, thread->tsk, 1);
+}
+
+void md_unregister_thread(mdk_thread_t *thread)
+{
+ struct completion event;
+
+ init_completion(&event);
+
+ thread->event = &event;
+ thread->run = NULL;
+ thread->name = NULL;
+ md_interrupt_thread(thread);
+ wait_for_completion(&event);
+ kfree(thread);
+}
+
+void md_error(mddev_t *mddev, mdk_rdev_t *rdev)
+{
+ dprintk("md_error dev:(%d:%d), rdev:(%d:%d), (caller: %p,%p,%p,%p).\n",
+ MD_MAJOR,mdidx(mddev),
+ MAJOR(rdev->bdev->bd_dev), MINOR(rdev->bdev->bd_dev),
+ __builtin_return_address(0),__builtin_return_address(1),
+ __builtin_return_address(2),__builtin_return_address(3));
+
+ if (!mddev) {
+ MD_BUG();
+ return;
+ }
+
+ if (!rdev || rdev->faulty)
+ return;
+ if (!mddev->pers->error_handler)
+ return;
+ mddev->pers->error_handler(mddev,rdev);
+ set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+ md_wakeup_thread(mddev->thread);
+}
+
+/* seq_file implementation for /proc/mdstat */
+
+static void status_unused(struct seq_file *seq)
+{
+ int i = 0;
+ mdk_rdev_t *rdev;
+ struct list_head *tmp;
+
+ seq_printf(seq, "unused devices: ");
+
+ ITERATE_RDEV_PENDING(rdev,tmp) {
+ i++;
+ seq_printf(seq, "%s ",
+ bdev_partition_name(rdev->bdev));
+ }
+ if (!i)
+ seq_printf(seq, "<none>");
+
+ seq_printf(seq, "\n");
+}
+
+
+static void status_resync(struct seq_file *seq, mddev_t * mddev)
+{
+ unsigned long max_blocks, resync, res, dt, db, rt;
+
+ resync = (mddev->curr_resync - atomic_read(&mddev->recovery_active))/2;
+ max_blocks = mddev->size;
+
+ /*
+ * Should not happen.
+ */
+ if (!max_blocks) {
+ MD_BUG();
+ return;
+ }
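+	/*
+	 * res is the completed fraction in tenths of a percent (0..1000);
+	 * dividing by 1024 before scaling keeps the arithmetic within an
+	 * unsigned long, and res/50 below maps it onto a 20-character bar.
+	 */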
+ res = (resync/1024)*1000/(max_blocks/1024 + 1);
+ {
+ int i, x = res/50, y = 20-x;
+ seq_printf(seq, "[");
+ for (i = 0; i < x; i++)
+ seq_printf(seq, "=");
+ seq_printf(seq, ">");
+ for (i = 0; i < y; i++)
+ seq_printf(seq, ".");
+ seq_printf(seq, "] ");
+ }
+ seq_printf(seq, " %s =%3lu.%lu%% (%lu/%lu)",
+ (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ?
+ "resync" : "recovery"),
+ res/10, res % 10, resync, max_blocks);
+
+ /*
+ * We do not want to overflow, so the order of operands and
+ * the * 100 / 100 trick are important. We do a +1 to be
+ * safe against division by zero. We only estimate anyway.
+ *
+ * dt: time from mark until now
+ * db: blocks written from mark until now
+ * rt: remaining time
+ */
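+	/*
+	 * For example, with dt = 10s, db = 20480 blocks and 1,000,000
+	 * blocks left, rt comes out around 487s (printed as "8.1min") and
+	 * the speed line reports db/dt = 2048K/sec.
+	 */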
+ dt = ((jiffies - mddev->resync_mark) / HZ);
+ if (!dt) dt++;
+ db = resync - (mddev->resync_mark_cnt/2);
+ rt = (dt * ((max_blocks-resync) / (db/100+1)))/100;
+
+ seq_printf(seq, " finish=%lu.%lumin", rt / 60, (rt % 60)/6);
+
+ seq_printf(seq, " speed=%ldK/sec", db/dt);
+}
+
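+/*
+ * The iterator uses two sentinel cookies in place of real mddevs:
+ * (void*)1 stands for the "Personalities" header line and (void*)2
+ * for the trailing "unused devices" line; md_seq_show() checks for
+ * both before treating its argument as an mddev.
+ */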
+static void *md_seq_start(struct seq_file *seq, loff_t *pos)
+{
+ struct list_head *tmp;
+ loff_t l = *pos;
+ mddev_t *mddev;
+
+ if (l > 0x10000)
+ return NULL;
+ if (!l--)
+ /* header */
+ return (void*)1;
+
+ spin_lock(&all_mddevs_lock);
+ list_for_each(tmp,&all_mddevs)
+ if (!l--) {
+ mddev = list_entry(tmp, mddev_t, all_mddevs);
+ mddev_get(mddev);
+ spin_unlock(&all_mddevs_lock);
+ return mddev;
+ }
+ spin_unlock(&all_mddevs_lock);
+ return (void*)2;/* tail */
+}
+
+static void *md_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ struct list_head *tmp;
+ mddev_t *next_mddev, *mddev = v;
+
+ ++*pos;
+ if (v == (void*)2)
+ return NULL;
+
+ spin_lock(&all_mddevs_lock);
+ if (v == (void*)1)
+ tmp = all_mddevs.next;
+ else
+ tmp = mddev->all_mddevs.next;
+ if (tmp != &all_mddevs)
+ next_mddev = mddev_get(list_entry(tmp,mddev_t,all_mddevs));
+ else {
+ next_mddev = (void*)2;
+ *pos = 0x10000;
+ }
+ spin_unlock(&all_mddevs_lock);
+
+ if (v != (void*)1)
+ mddev_put(mddev);
+ return next_mddev;
+
+}
+
+static void md_seq_stop(struct seq_file *seq, void *v)
+{
+ mddev_t *mddev = v;
+
+ if (mddev && v != (void*)1 && v != (void*)2)
+ mddev_put(mddev);
+}
+
+static int md_seq_show(struct seq_file *seq, void *v)
+{
+ mddev_t *mddev = v;
+ sector_t size;
+ struct list_head *tmp2;
+ mdk_rdev_t *rdev;
+ int i;
+
+ if (v == (void*)1) {
+ seq_printf(seq, "Personalities : ");
+ spin_lock(&pers_lock);
+ for (i = 0; i < MAX_PERSONALITY; i++)
+ if (pers[i])
+ seq_printf(seq, "[%s] ", pers[i]->name);
+
+ spin_unlock(&pers_lock);
+ seq_printf(seq, "\n");
+ return 0;
+ }
+ if (v == (void*)2) {
+ status_unused(seq);
+ return 0;
+ }
+
+ if (mddev_lock(mddev)!=0)
+ return -EINTR;
+ if (mddev->pers || mddev->raid_disks || !list_empty(&mddev->disks)) {
+ seq_printf(seq, "md%d : %sactive", mdidx(mddev),
+ mddev->pers ? "" : "in");
+ if (mddev->pers) {
+ if (mddev->ro)
+ seq_printf(seq, " (read-only)");
+ seq_printf(seq, " %s", mddev->pers->name);
+ }
+
+ size = 0;
+ ITERATE_RDEV(mddev,rdev,tmp2) {
+ seq_printf(seq, " %s[%d]",
+ bdev_partition_name(rdev->bdev), rdev->desc_nr);
+ if (rdev->faulty) {
+ seq_printf(seq, "(F)");
+ continue;
+ }
+ size += rdev->size;
+ }
+
+ if (!list_empty(&mddev->disks)) {
+ if (mddev->pers)
+ seq_printf(seq, "\n %llu blocks",
+ (unsigned long long)mddev->array_size);
+ else
+ seq_printf(seq, "\n %llu blocks",
+ (unsigned long long)size);
+ }
+
+ if (mddev->pers) {
+ mddev->pers->status (seq, mddev);
+ seq_printf(seq, "\n ");
+ if (mddev->curr_resync > 2)
+ status_resync (seq, mddev);
+ else if (mddev->curr_resync == 1 || mddev->curr_resync == 2)
+ seq_printf(seq, " resync=DELAYED");
+ }
+
+ seq_printf(seq, "\n");
+ }
+ mddev_unlock(mddev);
+
+ return 0;
+}
+
+static struct seq_operations md_seq_ops = {
+ .start = md_seq_start,
+ .next = md_seq_next,
+ .stop = md_seq_stop,
+ .show = md_seq_show,
+};
+
+static int md_seq_open(struct inode *inode, struct file *file)
+{
+ int error;
+
+ error = seq_open(file, &md_seq_ops);
+ return error;
+}
+
+static struct file_operations md_seq_fops = {
+ .open = md_seq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+int register_md_personality(int pnum, mdk_personality_t *p)
+{
+ if (pnum >= MAX_PERSONALITY) {
+ MD_BUG();
+ return -EINVAL;
+ }
+
+ spin_lock(&pers_lock);
+ if (pers[pnum]) {
+ spin_unlock(&pers_lock);
+ MD_BUG();
+ return -EBUSY;
+ }
+
+ pers[pnum] = p;
+ printk(KERN_INFO "md: %s personality registered as nr %d\n", p->name, pnum);
+ spin_unlock(&pers_lock);
+ return 0;
+}
+
+int unregister_md_personality(int pnum)
+{
+ if (pnum >= MAX_PERSONALITY) {
+ MD_BUG();
+ return -EINVAL;
+ }
+
+ printk(KERN_INFO "md: %s personality unregistered\n", pers[pnum]->name);
+ spin_lock(&pers_lock);
+ pers[pnum] = NULL;
+ spin_unlock(&pers_lock);
+ return 0;
+}
+
+void md_sync_acct(mdk_rdev_t *rdev, unsigned long nr_sectors)
+{
+ rdev->bdev->bd_contains->bd_disk->sync_io += nr_sectors;
+}
+
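+/*
+ * Roughly: non-resync IO per disk is estimated as total sectors read
+ * and written minus the sync_io count kept by md_sync_acct(); more
+ * than 32 new sectors since the last check means the disk is busy
+ * with something other than the resync.
+ */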
+static int is_mddev_idle(mddev_t *mddev)
+{
+ mdk_rdev_t * rdev;
+ struct list_head *tmp;
+ int idle;
+ unsigned long curr_events;
+
+ idle = 1;
+ ITERATE_RDEV(mddev,rdev,tmp) {
+ struct gendisk *disk = rdev->bdev->bd_contains->bd_disk;
+ curr_events = disk_stat_read(disk, read_sectors) +
+ disk_stat_read(disk, write_sectors) -
+ disk->sync_io;
+ if ((curr_events - rdev->last_events) > 32) {
+ rdev->last_events = curr_events;
+ idle = 0;
+ }
+ }
+ return idle;
+}
+
+void md_done_sync(mddev_t *mddev, int blocks, int ok)
+{
+	/* another "blocks" (512-byte) blocks have been synced */
+ atomic_sub(blocks, &mddev->recovery_active);
+ wake_up(&mddev->recovery_wait);
+ if (!ok) {
+ set_bit(MD_RECOVERY_ERR, &mddev->recovery);
+ md_wakeup_thread(mddev->thread);
+ // stop recovery, signal do_sync ....
+ }
+}
+
+
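+/*
+ * writes_pending counts in-flight array writes: the first write marks
+ * the array dirty (in_sync = 0) and rewrites the superblocks, and when
+ * the count drops back to zero md_write_end() either arms the safemode
+ * timer or wakes the thread so the array can be marked clean again.
+ */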
+void md_write_start(mddev_t *mddev)
+{
+ if (!atomic_read(&mddev->writes_pending)) {
+ mddev_lock_uninterruptible(mddev);
+ if (mddev->in_sync) {
+ mddev->in_sync = 0;
+ del_timer(&mddev->safemode_timer);
+ md_update_sb(mddev);
+ }
+ atomic_inc(&mddev->writes_pending);
+ mddev_unlock(mddev);
+ } else
+ atomic_inc(&mddev->writes_pending);
+}
+
+void md_write_end(mddev_t *mddev)
+{
+ if (atomic_dec_and_test(&mddev->writes_pending)) {
+ if (mddev->safemode == 2)
+ md_wakeup_thread(mddev->thread);
+ else
+ mod_timer(&mddev->safemode_timer, jiffies + mddev->safemode_delay);
+ }
+}
+
+static inline void md_enter_safemode(mddev_t *mddev)
+{
+ mddev_lock_uninterruptible(mddev);
+ if (mddev->safemode && !atomic_read(&mddev->writes_pending) &&
+ !mddev->in_sync && mddev->recovery_cp == MaxSector) {
+ mddev->in_sync = 1;
+ md_update_sb(mddev);
+ }
+ mddev_unlock(mddev);
+
+ if (mddev->safemode == 1)
+ mddev->safemode = 0;
+}
+
+void md_handle_safemode(mddev_t *mddev)
+{
+ if (signal_pending(current)) {
+ printk(KERN_INFO "md: md%d in immediate safe mode\n",
+ mdidx(mddev));
+ mddev->safemode = 2;
+ flush_signals(current);
+ }
+ if (mddev->safemode)
+ md_enter_safemode(mddev);
+}
+
+
+DECLARE_WAIT_QUEUE_HEAD(resync_wait);
+
+#define SYNC_MARKS 10
+#define SYNC_MARK_STEP (3*HZ)
+static void md_do_sync(mddev_t *mddev)
+{
+ mddev_t *mddev2;
+ unsigned int max_sectors, currspeed = 0,
+ j, window;
+ unsigned long mark[SYNC_MARKS];
+ unsigned long mark_cnt[SYNC_MARKS];
+ int last_mark,m;
+ struct list_head *tmp;
+ unsigned long last_check;
+
+	/* just in case the thread restarts... */
+ if (test_bit(MD_RECOVERY_DONE, &mddev->recovery))
+ return;
+
+ /* we overload curr_resync somewhat here.
+ * 0 == not engaged in resync at all
+ * 2 == checking that there is no conflict with another sync
+ * 1 == like 2, but have yielded to allow conflicting resync to
+	 *		commence
+ * other == active in resync - this many blocks
+ */
+ do {
+ mddev->curr_resync = 2;
+
+ ITERATE_MDDEV(mddev2,tmp) {
+ if (mddev2 == mddev)
+ continue;
+ if (mddev2->curr_resync &&
+ match_mddev_units(mddev,mddev2)) {
+ printk(KERN_INFO "md: delaying resync of md%d"
+ " until md%d has finished resync (they"
+ " share one or more physical units)\n",
+ mdidx(mddev), mdidx(mddev2));
+ if (mddev < mddev2) {/* arbitrarily yield */
+ mddev->curr_resync = 1;
+ wake_up(&resync_wait);
+ }
+ if (wait_event_interruptible(resync_wait,
+ mddev2->curr_resync < mddev->curr_resync)) {
+ flush_signals(current);
+ mddev_put(mddev2);
+ goto skip;
+ }
+ }
+ if (mddev->curr_resync == 1) {
+ mddev_put(mddev2);
+ break;
+ }
+ }
+ } while (mddev->curr_resync < 2);
+
+ max_sectors = mddev->size << 1;
+
+ printk(KERN_INFO "md: syncing RAID array md%d\n", mdidx(mddev));
+ printk(KERN_INFO "md: minimum _guaranteed_ reconstruction speed:"
+ " %d KB/sec/disc.\n", sysctl_speed_limit_min);
+ printk(KERN_INFO "md: using maximum available idle IO bandwith "
+ "(but not more than %d KB/sec) for reconstruction.\n",
+ sysctl_speed_limit_max);
+
+ is_mddev_idle(mddev); /* this also initializes IO event counters */
+ if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
+ j = mddev->recovery_cp;
+ else
+ j = 0;
+ for (m = 0; m < SYNC_MARKS; m++) {
+ mark[m] = jiffies;
+ mark_cnt[m] = j;
+ }
+ last_mark = 0;
+ mddev->resync_mark = mark[last_mark];
+ mddev->resync_mark_cnt = mark_cnt[last_mark];
+
+ /*
+ * Tune reconstruction:
+ */
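+	/*
+	 * 32 pages worth of sectors; with 4K pages that is 256 sectors
+	 * (128K), which is what the "using %dk window" message reports.
+	 */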
+ window = 32*(PAGE_SIZE/512);
+ printk(KERN_INFO "md: using %dk window, over a total of %d blocks.\n",
+ window/2,max_sectors/2);
+
+ atomic_set(&mddev->recovery_active, 0);
+ init_waitqueue_head(&mddev->recovery_wait);
+ last_check = 0;
+
+ if (j)
+ printk(KERN_INFO
+ "md: resuming recovery of md%d from checkpoint.\n",
+ mdidx(mddev));
+
+ while (j < max_sectors) {
+ int sectors;
+
+ sectors = mddev->pers->sync_request(mddev, j, currspeed < sysctl_speed_limit_min);
+ if (sectors < 0) {
+ set_bit(MD_RECOVERY_ERR, &mddev->recovery);
+ goto out;
+ }
+ atomic_add(sectors, &mddev->recovery_active);
+ j += sectors;
+ if (j>1) mddev->curr_resync = j;
+
+ if (last_check + window > j)
+ continue;
+
+ last_check = j;
+
+ if (test_bit(MD_RECOVERY_INTR, &mddev->recovery) ||
+ test_bit(MD_RECOVERY_ERR, &mddev->recovery))
+ break;
+
+ blk_run_queues();
+
+ repeat:
+ if (jiffies >= mark[last_mark] + SYNC_MARK_STEP ) {
+ /* step marks */
+ int next = (last_mark+1) % SYNC_MARKS;
+
+ mddev->resync_mark = mark[next];
+ mddev->resync_mark_cnt = mark_cnt[next];
+ mark[next] = jiffies;
+ mark_cnt[next] = j - atomic_read(&mddev->recovery_active);
+ last_mark = next;
+ }
+
+
+ if (signal_pending(current)) {
+ /*
+ * got a signal, exit.
+ */
+ printk(KERN_INFO
+ "md: md_do_sync() got signal ... exiting\n");
+ flush_signals(current);
+ set_bit(MD_RECOVERY_INTR, &mddev->recovery);
+ goto out;
+ }
+
+ /*
+		 * this loop only exits when we are slower than
+		 * the 'hard' speed limit, or the system has been IO-idle for
+		 * a jiffy.
+ * the system might be non-idle CPU-wise, but we only care
+ * about not overloading the IO subsystem. (things like an
+ * e2fsck being done on the RAID array should execute fast)
+ */
+ cond_resched();
+
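+		/*
+		 * Sectors done since the last mark, halved into K and divided
+		 * by the elapsed seconds (+1 guards against a zero divisor);
+		 * the trailing +1 just keeps currspeed non-zero for the limit
+		 * checks below.
+		 */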
+ currspeed = (j-mddev->resync_mark_cnt)/2/((jiffies-mddev->resync_mark)/HZ +1) +1;
+
+ if (currspeed > sysctl_speed_limit_min) {
+ if ((currspeed > sysctl_speed_limit_max) ||
+ !is_mddev_idle(mddev)) {
+ current->state = TASK_INTERRUPTIBLE;
+ schedule_timeout(HZ/4);
+ goto repeat;
+ }
+ }
+ }
+ printk(KERN_INFO "md: md%d: sync done.\n",mdidx(mddev));
+ /*
+ * this also signals 'finished resyncing' to md_stop
+ */
+ out:
+ wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active));
+
+ /* tell personality that we are finished */
+ mddev->pers->sync_request(mddev, max_sectors, 1);
+
+ if (!test_bit(MD_RECOVERY_ERR, &mddev->recovery) &&
+ mddev->curr_resync > 2 &&
+ mddev->curr_resync > mddev->recovery_cp) {
+ if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
+ printk(KERN_INFO
+ "md: checkpointing recovery of md%d.\n",
+ mdidx(mddev));
+ mddev->recovery_cp = mddev->curr_resync;
+ } else
+ mddev->recovery_cp = MaxSector;
+ }
+
+ if (mddev->safemode)
+ md_enter_safemode(mddev);
+ skip:
+ mddev->curr_resync = 0;
+ set_bit(MD_RECOVERY_DONE, &mddev->recovery);
+ md_wakeup_thread(mddev->thread);
+}
+
+
+/*
+ * This routine is regularly called by all per-raid-array threads to
+ * deal with generic issues like resync and super-block update.
+ * Raid personalities that don't have a thread (linear/raid0) do not
+ * need this as they never do any recovery or update the superblock.
+ *
+ * It does not do any resync itself, but rather "forks" off other threads
+ * to do that as needed.
+ * When it is determined that resync is needed, we set MD_RECOVERY_RUNNING in
+ * "->recovery" and create a thread at ->sync_thread.
+ * When the thread finishes it sets MD_RECOVERY_DONE (and might set MD_RECOVERY_ERR)
+ * and wakes up this thread which will reap the thread and finish up.
+ * This thread also removes any faulty devices (with nr_pending == 0).
+ *
+ * The overall approach is:
+ * 1/ if the superblock needs updating, update it.
+ * 2/ If a recovery thread is running, don't do anything else.
+ * 3/ If recovery has finished, clean up, possibly marking spares active.
+ * 4/ If there are any faulty devices, remove them.
+ *  5/ If array is degraded, try to add spare devices
+ * 6/ If array has spares or is not in-sync, start a resync thread.
+ */
+void md_check_recovery(mddev_t *mddev)
+{
+ mdk_rdev_t *rdev;
+ struct list_head *rtmp;
+
+
+ dprintk(KERN_INFO "md: recovery thread got woken up ...\n");
+
+ if (mddev->ro)
+ return;
+ if ( ! (
+ mddev->sb_dirty ||
+ test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) ||
+ test_bit(MD_RECOVERY_DONE, &mddev->recovery)
+ ))
+ return;
+ if (mddev_trylock(mddev)==0) {
+ int spares =0;
+ if (mddev->sb_dirty)
+ md_update_sb(mddev);
+ if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) &&
+ !test_bit(MD_RECOVERY_DONE, &mddev->recovery))
+ /* resync/recovery still happening */
+ goto unlock;
+ if (mddev->sync_thread) {
+ /* resync has finished, collect result */
+ md_unregister_thread(mddev->sync_thread);
+ mddev->sync_thread = NULL;
+ if (!test_bit(MD_RECOVERY_ERR, &mddev->recovery)) {
+ /* success...*/
+ /* activate any spares */
+ mddev->pers->spare_active(mddev);
+ }
+ md_update_sb(mddev);
+ mddev->recovery = 0;
+ wake_up(&resync_wait);
+ goto unlock;
+ }
+ if (mddev->recovery) {
+ /* that's odd.. */
+ mddev->recovery = 0;
+ wake_up(&resync_wait);
+ }
+
+ /* no recovery is running.
+ * remove any failed drives, then
+ * add spares if possible
+ */
+ ITERATE_RDEV(mddev,rdev,rtmp) {
+ if (rdev->raid_disk >= 0 &&
+ rdev->faulty &&
+ atomic_read(&rdev->nr_pending)==0) {
+ mddev->pers->hot_remove_disk(mddev, rdev->raid_disk);
+ rdev->raid_disk = -1;
+ }
+ if (!rdev->faulty && rdev->raid_disk >= 0 && !rdev->in_sync)
+ spares++;
+ }
+ if (mddev->degraded) {
+ ITERATE_RDEV(mddev,rdev,rtmp)
+ if (rdev->raid_disk < 0
+ && !rdev->faulty) {
+ if (mddev->pers->hot_add_disk(mddev,rdev))
+ spares++;
+ else
+ break;
+ }
+ }
+
+ if (!spares && (mddev->recovery_cp == MaxSector )) {
+ /* nothing we can do ... */
+ goto unlock;
+ }
+ if (mddev->pers->sync_request) {
+ set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
+ if (!spares)
+ set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
+ mddev->sync_thread = md_register_thread(md_do_sync,
+ mddev,
+ "md%d_resync");
+ if (!mddev->sync_thread) {
+ printk(KERN_ERR "md%d: could not start resync"
+ " thread...\n",
+ mdidx(mddev));
+ /* leave the spares where they are, it shouldn't hurt */
+ mddev->recovery = 0;
+ } else {
+ md_wakeup_thread(mddev->sync_thread);
+ }
+ }
+ unlock:
+ mddev_unlock(mddev);
+ }
+}
+
+int md_notify_reboot(struct notifier_block *this,
+ unsigned long code, void *x)
+{
+ struct list_head *tmp;
+ mddev_t *mddev;
+
+ if ((code == SYS_DOWN) || (code == SYS_HALT) || (code == SYS_POWER_OFF)) {
+
+ printk(KERN_INFO "md: stopping all md devices.\n");
+
+ ITERATE_MDDEV(mddev,tmp)
+ if (mddev_trylock(mddev)==0)
+ do_md_stop (mddev, 1);
+ /*
+ * certain more exotic SCSI devices are known to be
+ * volatile wrt too early system reboots. While the
+ * right place to handle this issue is the given
+ * driver, we do want to have a safe RAID driver ...
+ */
+ mdelay(1000*1);
+ }
+ return NOTIFY_DONE;
+}
+
+struct notifier_block md_notifier = {
+ .notifier_call = md_notify_reboot,
+ .next = NULL,
+ .priority = INT_MAX, /* before any real devices */
+};
+
+static void md_geninit(void)
+{
+ struct proc_dir_entry *p;
+
+ dprintk("md: sizeof(mdp_super_t) = %d\n", (int)sizeof(mdp_super_t));
+
+#ifdef CONFIG_PROC_FS
+ p = create_proc_entry("mdstat", S_IRUGO, NULL);
+ if (p)
+ p->proc_fops = &md_seq_fops;
+#endif
+}
+
+int __init md_init(void)
+{
+ int minor;
+
+ printk(KERN_INFO "md: md driver %d.%d.%d MAX_MD_DEVS=%d,"
+ " MD_SB_DISKS=%d\n",
+ MD_MAJOR_VERSION, MD_MINOR_VERSION,
+ MD_PATCHLEVEL_VERSION, MAX_MD_DEVS, MD_SB_DISKS);
+
+ if (register_blkdev(MAJOR_NR, "md"))
+ return -1;
+
+ devfs_mk_dir("md");
+ blk_register_region(MKDEV(MAJOR_NR, 0), MAX_MD_DEVS, THIS_MODULE,
+ md_probe, NULL, NULL);
+ for (minor=0; minor < MAX_MD_DEVS; ++minor) {
+ char name[16];
+ sprintf(name, "md/%d", minor);
+ devfs_register(NULL, name, DEVFS_FL_DEFAULT, MAJOR_NR, minor,
+ S_IFBLK | S_IRUSR | S_IWUSR, &md_fops, NULL);
+ }
+
+ register_reboot_notifier(&md_notifier);
+ raid_table_header = register_sysctl_table(raid_root_table, 1);
+
+ md_geninit();
+ return (0);
+}
+
+
+#ifndef MODULE
+
+/*
+ * Searches all registered partitions for autorun RAID arrays
+ * at boot time.
+ */
+static dev_t detected_devices[128];
+static int dev_cnt;
+
+void md_autodetect_dev(dev_t dev)
+{
+ if (dev_cnt >= 0 && dev_cnt < 127)
+ detected_devices[dev_cnt++] = dev;
+}
+
+
+static void autostart_arrays(void)
+{
+ mdk_rdev_t *rdev;
+ int i;
+
+ printk(KERN_INFO "md: Autodetecting RAID arrays.\n");
+
+ for (i = 0; i < dev_cnt; i++) {
+ dev_t dev = detected_devices[i];
+
+ rdev = md_import_device(dev,0, 0);
+ if (IS_ERR(rdev)) {
+ printk(KERN_ALERT "md: could not import %s!\n",
+ partition_name(dev));
+ continue;
+ }
+ if (rdev->faulty) {
+ MD_BUG();
+ continue;
+ }
+ list_add(&rdev->same_set, &pending_raid_disks);
+ }
+ dev_cnt = 0;
+
+ autorun_devices();
+}
+
+#endif
+
+static __exit void md_exit(void)
+{
+ int i;
+ blk_unregister_region(MKDEV(MAJOR_NR,0), MAX_MD_DEVS);
+ for (i=0; i < MAX_MD_DEVS; i++)
+ devfs_remove("md/%d", i);
+ devfs_remove("md");
+
+ unregister_blkdev(MAJOR_NR,"md");
+ unregister_reboot_notifier(&md_notifier);
+ unregister_sysctl_table(raid_table_header);
+#ifdef CONFIG_PROC_FS
+ remove_proc_entry("mdstat", NULL);
+#endif
+ for (i = 0; i < MAX_MD_DEVS; i++) {
+ struct gendisk *disk = disks[i];
+ mddev_t *mddev;
+ if (!disks[i])
+ continue;
+ mddev = disk->private_data;
+ del_gendisk(disk);
+ put_disk(disk);
+ mddev_put(mddev);
+ }
+}
+
+module_init(md_init)
+module_exit(md_exit)
+
+EXPORT_SYMBOL(register_md_personality);
+EXPORT_SYMBOL(unregister_md_personality);
+EXPORT_SYMBOL(md_error);
+EXPORT_SYMBOL(md_sync_acct);
+EXPORT_SYMBOL(md_done_sync);
+EXPORT_SYMBOL(md_write_start);
+EXPORT_SYMBOL(md_write_end);
+EXPORT_SYMBOL(md_handle_safemode);
+EXPORT_SYMBOL(md_register_thread);
+EXPORT_SYMBOL(md_unregister_thread);
+EXPORT_SYMBOL(md_wakeup_thread);
+EXPORT_SYMBOL(md_print_devices);
+EXPORT_SYMBOL(md_interrupt_thread);
+EXPORT_SYMBOL(md_check_recovery);
+MODULE_LICENSE("GPL");
diff --git a/tests/linux/nfsd-defines/merge b/tests/linux/nfsd-defines/merge
new file mode 100644
index 0000000..379b771
--- /dev/null
+++ b/tests/linux/nfsd-defines/merge
@@ -0,0 +1,270 @@
+/*
+ * linux/include/linux/nfsd/nfsd.h
+ *
+ * Hodge-podge collection of knfsd-related stuff.
+ * I will sort this out later.
+ *
+ * Copyright (C) 1995-1997 Olaf Kirch <okir@monad.swb.de>
+ */
+
+#ifndef LINUX_NFSD_NFSD_H
+#define LINUX_NFSD_NFSD_H
+
+#include <linux/config.h>
+#include <linux/types.h>
+#include <linux/unistd.h>
+#include <linux/dirent.h>
+#include <linux/fs.h>
+
+#include <linux/nfsd/debug.h>
+#include <linux/nfsd/nfsfh.h>
+#include <linux/nfsd/export.h>
+#include <linux/nfsd/auth.h>
+#include <linux/nfsd/stats.h>
+#include <linux/nfsd/interface.h>
+/*
+ * nfsd version
+ */
+#define NFSD_VERSION "0.5"
+#define NFSD_SUPPORTED_MINOR_VERSION 0
+
+#ifdef __KERNEL__
+/*
+ * Special flags for nfsd_permission. These must be different from MAY_READ,
+ * MAY_WRITE, and MAY_EXEC.
+ */
+#define MAY_NOP 0
+#define MAY_SATTR 8
+#define MAY_TRUNC 16
+#define MAY_LOCK 32
+#define MAY_OWNER_OVERRIDE 64
+#define MAY_LOCAL_ACCESS 128 /* IRIX doing local access check on device special file*/
+#if (MAY_SATTR | MAY_TRUNC | MAY_LOCK | MAY_OWNER_OVERRIDE | MAY_LOCAL_ACCESS) & (MAY_READ | MAY_WRITE | MAY_EXEC)
+# error "please use a different value for MAY_SATTR or MAY_TRUNC or MAY_LOCK or MAY_LOCAL_ACCESS or MAY_OWNER_OVERRIDE."
+#endif
+#define MAY_CREATE (MAY_EXEC|MAY_WRITE)
+#define MAY_REMOVE (MAY_EXEC|MAY_WRITE|MAY_TRUNC)
+
+/*
+ * Callback function for readdir
+ */
+struct readdir_cd {
+ int err; /* 0, nfserr, or nfserr_eof */
+};
+typedef int (*encode_dent_fn)(struct readdir_cd *, const char *,
+ int, loff_t, ino_t, unsigned int);
+typedef int (*nfsd_dirop_t)(struct inode *, struct dentry *, int, int);
+
+extern struct svc_program nfsd_program;
+extern struct svc_version nfsd_version2, nfsd_version3,
+ nfsd_version4;
+
+/*
+ * Function prototypes.
+ */
+int nfsd_svc(unsigned short port, int nrservs);
+int nfsd_dispatch(struct svc_rqst *rqstp, u32 *statp);
+
+/* nfsd/vfs.c */
+int fh_lock_parent(struct svc_fh *, struct dentry *);
+int nfsd_racache_init(int);
+void nfsd_racache_shutdown(void);
+int nfsd_lookup(struct svc_rqst *, struct svc_fh *,
+ const char *, int, struct svc_fh *);
+int nfsd_setattr(struct svc_rqst *, struct svc_fh *,
+ struct iattr *, int, time_t);
+int nfsd_create(struct svc_rqst *, struct svc_fh *,
+ char *name, int len, struct iattr *attrs,
+ int type, dev_t rdev, struct svc_fh *res);
+#ifdef CONFIG_NFSD_V3
+int nfsd_access(struct svc_rqst *, struct svc_fh *, u32 *, u32 *);
+int nfsd_create_v3(struct svc_rqst *, struct svc_fh *,
+ char *name, int len, struct iattr *attrs,
+ struct svc_fh *res, int createmode,
+ u32 *verifier, int *truncp);
+int nfsd_commit(struct svc_rqst *, struct svc_fh *,
+ off_t, unsigned long);
+#endif /* CONFIG_NFSD_V3 */
+int nfsd_open(struct svc_rqst *, struct svc_fh *, int,
+ int, struct file *);
+void nfsd_close(struct file *);
+int nfsd_read(struct svc_rqst *, struct svc_fh *,
+ loff_t, struct iovec *,int, unsigned long *);
+int nfsd_write(struct svc_rqst *, struct svc_fh *,
+ loff_t, struct iovec *,int, unsigned long, int *);
+int nfsd_readlink(struct svc_rqst *, struct svc_fh *,
+ char *, int *);
+int nfsd_symlink(struct svc_rqst *, struct svc_fh *,
+ char *name, int len, char *path, int plen,
+ struct svc_fh *res, struct iattr *);
+int nfsd_link(struct svc_rqst *, struct svc_fh *,
+ char *, int, struct svc_fh *);
+int nfsd_rename(struct svc_rqst *,
+ struct svc_fh *, char *, int,
+ struct svc_fh *, char *, int);
+int nfsd_remove(struct svc_rqst *,
+ struct svc_fh *, char *, int);
+int nfsd_unlink(struct svc_rqst *, struct svc_fh *, int type,
+ char *name, int len);
+int nfsd_truncate(struct svc_rqst *, struct svc_fh *,
+ unsigned long size);
+int nfsd_readdir(struct svc_rqst *, struct svc_fh *,
+ loff_t *, struct readdir_cd *, encode_dent_fn);
+int nfsd_statfs(struct svc_rqst *, struct svc_fh *,
+ struct statfs *);
+
+int nfsd_notify_change(struct inode *, struct iattr *);
+int nfsd_permission(struct svc_export *, struct dentry *, int);
+
+
+/*
+ * NFSv4 State
+ */
+#ifdef CONFIG_NFSD_V4
+void nfs4_state_init(void);
+void nfs4_state_shutdown(void);
+#else
+void static inline nfs4_state_init(void){}
+void static inline nfs4_state_shutdown(void){}
+#endif
+
+/*
+ * lockd binding
+ */
+void nfsd_lockd_init(void);
+void nfsd_lockd_shutdown(void);
+
+
+/*
+ * These macros provide pre-xdr'ed values for faster operation.
+ */
+#define nfs_ok __constant_htonl(NFS_OK)
+#define nfserr_perm __constant_htonl(NFSERR_PERM)
+#define nfserr_noent __constant_htonl(NFSERR_NOENT)
+#define nfserr_io __constant_htonl(NFSERR_IO)
+#define nfserr_nxio __constant_htonl(NFSERR_NXIO)
+#define nfserr_eagain __constant_htonl(NFSERR_EAGAIN)
+#define nfserr_acces __constant_htonl(NFSERR_ACCES)
+#define nfserr_exist __constant_htonl(NFSERR_EXIST)
+#define nfserr_xdev __constant_htonl(NFSERR_XDEV)
+#define nfserr_nodev __constant_htonl(NFSERR_NODEV)
+#define nfserr_notdir __constant_htonl(NFSERR_NOTDIR)
+#define nfserr_isdir __constant_htonl(NFSERR_ISDIR)
+#define nfserr_inval __constant_htonl(NFSERR_INVAL)
+#define nfserr_fbig __constant_htonl(NFSERR_FBIG)
+#define nfserr_nospc __constant_htonl(NFSERR_NOSPC)
+#define nfserr_rofs __constant_htonl(NFSERR_ROFS)
+#define nfserr_mlink __constant_htonl(NFSERR_MLINK)
+#define nfserr_opnotsupp __constant_htonl(NFSERR_OPNOTSUPP)
+#define nfserr_nametoolong __constant_htonl(NFSERR_NAMETOOLONG)
+#define nfserr_notempty __constant_htonl(NFSERR_NOTEMPTY)
+#define nfserr_dquot __constant_htonl(NFSERR_DQUOT)
+#define nfserr_stale __constant_htonl(NFSERR_STALE)
+#define nfserr_remote __constant_htonl(NFSERR_REMOTE)
+#define nfserr_wflush __constant_htonl(NFSERR_WFLUSH)
+#define nfserr_badhandle __constant_htonl(NFSERR_BADHANDLE)
+#define nfserr_notsync __constant_htonl(NFSERR_NOT_SYNC)
+#define nfserr_badcookie __constant_htonl(NFSERR_BAD_COOKIE)
+#define nfserr_notsupp __constant_htonl(NFSERR_NOTSUPP)
+#define nfserr_toosmall __constant_htonl(NFSERR_TOOSMALL)
+#define nfserr_serverfault __constant_htonl(NFSERR_SERVERFAULT)
+#define nfserr_badtype __constant_htonl(NFSERR_BADTYPE)
+#define nfserr_jukebox __constant_htonl(NFSERR_JUKEBOX)
+#define nfserr_bad_cookie __constant_htonl(NFSERR_BAD_COOKIE)
+#define nfserr_same __constant_htonl(NFSERR_SAME)
+#define nfserr_clid_inuse __constant_htonl(NFSERR_CLID_INUSE)
+#define nfserr_stale_clientid __constant_htonl(NFSERR_STALE_CLIENTID)
+#define nfserr_resource __constant_htonl(NFSERR_RESOURCE)
+#define nfserr_nofilehandle __constant_htonl(NFSERR_NOFILEHANDLE)
+#define nfserr_minor_vers_mismatch __constant_htonl(NFSERR_MINOR_VERS_MISMATCH)
+#define nfserr_symlink __constant_htonl(NFSERR_SYMLINK)
+#define nfserr_not_same __constant_htonl(NFSERR_NOT_SAME)
+#define nfserr_readdir_nospc __constant_htonl(NFSERR_READDIR_NOSPC)
+#define nfserr_bad_xdr __constant_htonl(NFSERR_BAD_XDR)
+
+/* error codes for internal use */
+/* if a request fails due to kmalloc failure, it gets dropped.
+ * Client should resend eventually
+ */
+#define nfserr_dropit __constant_htonl(30000)
+/* end-of-file indicator in readdir */
+#define nfserr_eof __constant_htonl(30001)
+
+/* Check for dir entries '.' and '..' */
+#define isdotent(n, l) (l < 3 && n[0] == '.' && (l == 1 || n[1] == '.'))
+
+/*
+ * Time of server startup
+ */
+extern struct timeval nfssvc_boot;
+
+
+#ifdef CONFIG_NFSD_V4
+
+/* before processing a COMPOUND operation, we have to check that there
+ * is enough space in the buffer for XDR encode to succeed. otherwise,
+ * we might process an operation with side effects, and be unable to
+ * tell the client that the operation succeeded.
+ *
+ * COMPOUND_SLACK_SPACE - this is the minimum amount of buffer space
+ * needed to encode an "ordinary" _successful_ operation. (GETATTR,
+ * READ, READDIR, and READLINK have their own buffer checks.) if we
+ * fall below this level, we fail the next operation with NFS4ERR_RESOURCE.
+ *
+ * COMPOUND_ERR_SLACK_SPACE - this is the minimum amount of buffer space
+ * needed to encode an operation which has failed with NFS4ERR_RESOURCE.
+ * care is taken to ensure that we never fall below this level for any
+ * reason.
+ */
+#define COMPOUND_SLACK_SPACE 140 /* OP_GETFH */
+#define COMPOUND_ERR_SLACK_SPACE 12 /* OP_SETATTR */
+
+#define NFSD_LEASE_TIME 60 /* seconds */
+
+/*
+ * The following attributes are currently not supported by the NFSv4 server:
+ * ACL (will be supported in a forthcoming patch)
+ * ARCHIVE (deprecated anyway)
+ * FS_LOCATIONS (will be supported eventually)
+ * HIDDEN (unlikely to be supported any time soon)
+ * MIMETYPE (unlikely to be supported any time soon)
+ * QUOTA_* (will be supported in a forthcoming patch)
+ * SYSTEM (unlikely to be supported any time soon)
+ * TIME_BACKUP (unlikely to be supported any time soon)
+ * TIME_CREATE (unlikely to be supported any time soon)
+ */
+#define NFSD_SUPPORTED_ATTRS_WORD0 \
+(FATTR4_WORD0_SUPPORTED_ATTRS | FATTR4_WORD0_TYPE | FATTR4_WORD0_FH_EXPIRE_TYPE \
+ | FATTR4_WORD0_CHANGE | FATTR4_WORD0_SIZE | FATTR4_WORD0_LINK_SUPPORT \
+ | FATTR4_WORD0_SYMLINK_SUPPORT | FATTR4_WORD0_NAMED_ATTR | FATTR4_WORD0_FSID \
+ | FATTR4_WORD0_UNIQUE_HANDLES | FATTR4_WORD0_LEASE_TIME | FATTR4_WORD0_RDATTR_ERROR \
+ | FATTR4_WORD0_ACLSUPPORT | FATTR4_WORD0_CANSETTIME | FATTR4_WORD0_CASE_INSENSITIVE \
+ | FATTR4_WORD0_CASE_PRESERVING | FATTR4_WORD0_CHOWN_RESTRICTED \
+ | FATTR4_WORD0_FILEHANDLE | FATTR4_WORD0_FILEID | FATTR4_WORD0_FILES_AVAIL \
+ | FATTR4_WORD0_FILES_FREE | FATTR4_WORD0_FILES_TOTAL | FATTR4_WORD0_HOMOGENEOUS \
+ | FATTR4_WORD0_MAXFILESIZE | FATTR4_WORD0_MAXLINK | FATTR4_WORD0_MAXNAME \
+ | FATTR4_WORD0_MAXREAD | FATTR4_WORD0_MAXWRITE)
+
+#define NFSD_SUPPORTED_ATTRS_WORD1 \
+(FATTR4_WORD1_MODE | FATTR4_WORD1_NO_TRUNC | FATTR4_WORD1_NUMLINKS \
+ | FATTR4_WORD1_OWNER | FATTR4_WORD1_OWNER_GROUP | FATTR4_WORD1_RAWDEV \
+ | FATTR4_WORD1_SPACE_AVAIL | FATTR4_WORD1_SPACE_FREE | FATTR4_WORD1_SPACE_TOTAL \
+ | FATTR4_WORD1_SPACE_USED | FATTR4_WORD1_TIME_ACCESS | FATTR4_WORD1_TIME_ACCESS_SET \
+ | FATTR4_WORD1_TIME_CREATE | FATTR4_WORD1_TIME_DELTA | FATTR4_WORD1_TIME_METADATA \
+ | FATTR4_WORD1_TIME_MODIFY | FATTR4_WORD1_TIME_MODIFY_SET)
+
+/* These will return ERR_INVAL if specified in GETATTR or READDIR. */
+#define NFSD_WRITEONLY_ATTRS_WORD1 \
+(FATTR4_WORD1_TIME_ACCESS_SET | FATTR4_WORD1_TIME_MODIFY_SET)
+
+/* These are the only attrs allowed in CREATE/OPEN/SETATTR. */
+#define NFSD_WRITEABLE_ATTRS_WORD0 FATTR4_WORD0_SIZE
+#define NFSD_WRITEABLE_ATTRS_WORD1 \
+(FATTR4_WORD1_MODE | FATTR4_WORD1_OWNER | FATTR4_WORD1_OWNER_GROUP \
+ | FATTR4_WORD1_TIME_ACCESS_SET | FATTR4_WORD1_TIME_METADATA | FATTR4_WORD1_TIME_MODIFY_SET)
+
+#endif /* CONFIG_NFSD_V4 */
+
+#endif /* __KERNEL__ */
+
+#endif /* LINUX_NFSD_NFSD_H */
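The MAY_* block in the header above uses a preprocessor guard so that nfsd's private permission flags can never share bits with the VFS MAY_READ/MAY_WRITE/MAY_EXEC values: any overlap becomes a build error instead of a silent permission bug. The same pattern in a minimal stand-alone form, with hypothetical CORE_*/EXT_* flags instead of the kernel's definitions:

#include <stdio.h>

/* hypothetical "core" flags, analogous to MAY_READ/MAY_WRITE/MAY_EXEC */
#define CORE_READ   1
#define CORE_WRITE  2
#define CORE_EXEC   4

/* hypothetical "extension" flags, analogous to MAY_SATTR, MAY_TRUNC, ... */
#define EXT_SATTR   8
#define EXT_TRUNC  16

/* fail the build if any extension flag overlaps a core permission bit */
#if (EXT_SATTR | EXT_TRUNC) & (CORE_READ | CORE_WRITE | CORE_EXEC)
# error "extension flags overlap the core permission bits"
#endif

int main(void)
{
	/* with disjoint bit spaces, combined masks remain unambiguous */
	int mode = CORE_WRITE | EXT_TRUNC;
	printf("write=%d trunc=%d\n", !!(mode & CORE_WRITE), !!(mode & EXT_TRUNC));
	return 0;
}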
diff --git a/tests/linux/nfsd-defines/orig b/tests/linux/nfsd-defines/orig
new file mode 100644
index 0000000..f4b2784
--- /dev/null
+++ b/tests/linux/nfsd-defines/orig
@@ -0,0 +1,270 @@
+/*
+ * linux/include/linux/nfsd/nfsd.h
+ *
+ * Hodge-podge collection of knfsd-related stuff.
+ * I will sort this out later.
+ *
+ * Copyright (C) 1995-1997 Olaf Kirch <okir@monad.swb.de>
+ */
+
+#ifndef LINUX_NFSD_NFSD_H
+#define LINUX_NFSD_NFSD_H
+
+#include <linux/config.h>
+#include <linux/types.h>
+#include <linux/unistd.h>
+#include <linux/dirent.h>
+#include <linux/fs.h>
+
+#include <linux/nfsd/debug.h>
+#include <linux/nfsd/nfsfh.h>
+#include <linux/nfsd/export.h>
+#include <linux/nfsd/auth.h>
+#include <linux/nfsd/stats.h>
+#include <linux/nfsd/interface.h>
+/*
+ * nfsd version
+ */
+#define NFSD_VERSION "0.5"
+#define NFSD_SUPPORTED_MINOR_VERSION 0
+
+#ifdef __KERNEL__
+/*
+ * Special flags for nfsd_permission. These must be different from MAY_READ,
+ * MAY_WRITE, and MAY_EXEC.
+ */
+#define MAY_NOP 0
+#define MAY_SATTR 8
+#define MAY_TRUNC 16
+#define MAY_LOCK 32
+#define MAY_OWNER_OVERRIDE 64
+#define MAY_LOCAL_ACCESS 128 /* IRIX doing local access check on device special file*/
+#if (MAY_SATTR | MAY_TRUNC | MAY_LOCK | MAY_OWNER_OVERRIDE | MAY_LOCAL_ACCESS) & (MAY_READ | MAY_WRITE | MAY_EXEC)
+# error "please use a different value for MAY_SATTR or MAY_TRUNC or MAY_LOCK or MAY_OWNER_OVERRIDE."
+#endif
+#define MAY_CREATE (MAY_EXEC|MAY_WRITE)
+#define MAY_REMOVE (MAY_EXEC|MAY_WRITE|MAY_TRUNC)
+
+/*
+ * Callback function for readdir
+ */
+struct readdir_cd {
+ int err; /* 0, nfserr, or nfserr_eof */
+};
+typedef int (*encode_dent_fn)(struct readdir_cd *, const char *,
+ int, loff_t, ino_t, unsigned int);
+typedef int (*nfsd_dirop_t)(struct inode *, struct dentry *, int, int);
+
+extern struct svc_program nfsd_program;
+extern struct svc_version nfsd_version2, nfsd_version3,
+ nfsd_version4;
+
+/*
+ * Function prototypes.
+ */
+int nfsd_svc(unsigned short port, int nrservs);
+int nfsd_dispatch(struct svc_rqst *rqstp, u32 *statp);
+
+/* nfsd/vfs.c */
+int fh_lock_parent(struct svc_fh *, struct dentry *);
+int nfsd_racache_init(int);
+void nfsd_racache_shutdown(void);
+int nfsd_lookup(struct svc_rqst *, struct svc_fh *,
+ const char *, int, struct svc_fh *);
+int nfsd_setattr(struct svc_rqst *, struct svc_fh *,
+ struct iattr *, int, time_t);
+int nfsd_create(struct svc_rqst *, struct svc_fh *,
+ char *name, int len, struct iattr *attrs,
+ int type, dev_t rdev, struct svc_fh *res);
+#ifdef CONFIG_NFSD_V3
+int nfsd_access(struct svc_rqst *, struct svc_fh *, u32 *, u32 *);
+int nfsd_create_v3(struct svc_rqst *, struct svc_fh *,
+ char *name, int len, struct iattr *attrs,
+ struct svc_fh *res, int createmode,
+ u32 *verifier, int *truncp);
+int nfsd_commit(struct svc_rqst *, struct svc_fh *,
+ off_t, unsigned long);
+#endif /* CONFIG_NFSD_V3 */
+int nfsd_open(struct svc_rqst *, struct svc_fh *, int,
+ int, struct file *);
+void nfsd_close(struct file *);
+int nfsd_read(struct svc_rqst *, struct svc_fh *,
+ loff_t, struct iovec *,int, unsigned long *);
+int nfsd_write(struct svc_rqst *, struct svc_fh *,
+ loff_t, struct iovec *,int, unsigned long, int *);
+int nfsd_readlink(struct svc_rqst *, struct svc_fh *,
+ char *, int *);
+int nfsd_symlink(struct svc_rqst *, struct svc_fh *,
+ char *name, int len, char *path, int plen,
+ struct svc_fh *res, struct iattr *);
+int nfsd_link(struct svc_rqst *, struct svc_fh *,
+ char *, int, struct svc_fh *);
+int nfsd_rename(struct svc_rqst *,
+ struct svc_fh *, char *, int,
+ struct svc_fh *, char *, int);
+int nfsd_remove(struct svc_rqst *,
+ struct svc_fh *, char *, int);
+int nfsd_unlink(struct svc_rqst *, struct svc_fh *, int type,
+ char *name, int len);
+int nfsd_truncate(struct svc_rqst *, struct svc_fh *,
+ unsigned long size);
+int nfsd_readdir(struct svc_rqst *, struct svc_fh *,
+ loff_t *, struct readdir_cd *, encode_dent_fn);
+int nfsd_statfs(struct svc_rqst *, struct svc_fh *,
+ struct statfs *);
+
+int nfsd_notify_change(struct inode *, struct iattr *);
+int nfsd_permission(struct svc_export *, struct dentry *, int);
+
+
+/*
+ * NFSv4 State
+ */
+#ifdef CONFIG_NFSD_V4
+void nfs4_state_init(void);
+void nfs4_state_shutdown(void);
+#else
+void static inline nfs4_state_init(void){}
+void static inline nfs4_state_shutdown(void){}
+#endif
+
+/*
+ * lockd binding
+ */
+void nfsd_lockd_init(void);
+void nfsd_lockd_shutdown(void);
+
+
+/*
+ * These macros provide pre-xdr'ed values for faster operation.
+ */
+#define nfs_ok __constant_htonl(NFS_OK)
+#define nfserr_perm __constant_htonl(NFSERR_PERM)
+#define nfserr_noent __constant_htonl(NFSERR_NOENT)
+#define nfserr_io __constant_htonl(NFSERR_IO)
+#define nfserr_nxio __constant_htonl(NFSERR_NXIO)
+#define nfserr_eagain __constant_htonl(NFSERR_EAGAIN)
+#define nfserr_acces __constant_htonl(NFSERR_ACCES)
+#define nfserr_exist __constant_htonl(NFSERR_EXIST)
+#define nfserr_xdev __constant_htonl(NFSERR_XDEV)
+#define nfserr_nodev __constant_htonl(NFSERR_NODEV)
+#define nfserr_notdir __constant_htonl(NFSERR_NOTDIR)
+#define nfserr_isdir __constant_htonl(NFSERR_ISDIR)
+#define nfserr_inval __constant_htonl(NFSERR_INVAL)
+#define nfserr_fbig __constant_htonl(NFSERR_FBIG)
+#define nfserr_nospc __constant_htonl(NFSERR_NOSPC)
+#define nfserr_rofs __constant_htonl(NFSERR_ROFS)
+#define nfserr_mlink __constant_htonl(NFSERR_MLINK)
+#define nfserr_opnotsupp __constant_htonl(NFSERR_OPNOTSUPP)
+#define nfserr_nametoolong __constant_htonl(NFSERR_NAMETOOLONG)
+#define nfserr_notempty __constant_htonl(NFSERR_NOTEMPTY)
+#define nfserr_dquot __constant_htonl(NFSERR_DQUOT)
+#define nfserr_stale __constant_htonl(NFSERR_STALE)
+#define nfserr_remote __constant_htonl(NFSERR_REMOTE)
+#define nfserr_wflush __constant_htonl(NFSERR_WFLUSH)
+#define nfserr_badhandle __constant_htonl(NFSERR_BADHANDLE)
+#define nfserr_notsync __constant_htonl(NFSERR_NOT_SYNC)
+#define nfserr_badcookie __constant_htonl(NFSERR_BAD_COOKIE)
+#define nfserr_notsupp __constant_htonl(NFSERR_NOTSUPP)
+#define nfserr_toosmall __constant_htonl(NFSERR_TOOSMALL)
+#define nfserr_serverfault __constant_htonl(NFSERR_SERVERFAULT)
+#define nfserr_badtype __constant_htonl(NFSERR_BADTYPE)
+#define nfserr_jukebox __constant_htonl(NFSERR_JUKEBOX)
+#define nfserr_bad_cookie __constant_htonl(NFSERR_BAD_COOKIE)
+#define nfserr_same __constant_htonl(NFSERR_SAME)
+#define nfserr_clid_inuse __constant_htonl(NFSERR_CLID_INUSE)
+#define nfserr_stale_clientid __constant_htonl(NFSERR_STALE_CLIENTID)
+#define nfserr_resource __constant_htonl(NFSERR_RESOURCE)
+#define nfserr_nofilehandle __constant_htonl(NFSERR_NOFILEHANDLE)
+#define nfserr_minor_vers_mismatch __constant_htonl(NFSERR_MINOR_VERS_MISMATCH)
+#define nfserr_symlink __constant_htonl(NFSERR_SYMLINK)
+#define nfserr_not_same __constant_htonl(NFSERR_NOT_SAME)
+#define nfserr_readdir_nospc __constant_htonl(NFSERR_READDIR_NOSPC)
+#define nfserr_bad_xdr __constant_htonl(NFSERR_BAD_XDR)
+
+/* error codes for internal use */
+/* if a request fails due to kmalloc failure, it gets dropped.
+ * Client should resend eventually
+ */
+#define nfserr_dropit __constant_htonl(30000)
+/* end-of-file indicator in readdir */
+#define nfserr_eof __constant_htonl(30001)
+
+/* Check for dir entries '.' and '..' */
+#define isdotent(n, l) (l < 3 && n[0] == '.' && (l == 1 || n[1] == '.'))
+
+/*
+ * Time of server startup
+ */
+extern struct timeval nfssvc_boot;
+
+
+#ifdef CONFIG_NFSD_V4
+
+/* before processing a COMPOUND operation, we have to check that there
+ * is enough space in the buffer for XDR encode to succeed. otherwise,
+ * we might process an operation with side effects, and be unable to
+ * tell the client that the operation succeeded.
+ *
+ * COMPOUND_SLACK_SPACE - this is the minimum amount of buffer space
+ * needed to encode an "ordinary" _successful_ operation. (GETATTR,
+ * READ, READDIR, and READLINK have their own buffer checks.) if we
+ * fall below this level, we fail the next operation with NFS4ERR_RESOURCE.
+ *
+ * COMPOUND_ERR_SLACK_SPACE - this is the minimum amount of buffer space
+ * needed to encode an operation which has failed with NFS4ERR_RESOURCE.
+ * care is taken to ensure that we never fall below this level for any
+ * reason.
+ */
+#define COMPOUND_SLACK_SPACE 140 /* OP_GETFH */
+#define COMPOUND_ERR_SLACK_SPACE 12 /* OP_SETATTR */
+
+#define NFSD_LEASE_TIME 60 /* seconds */
+
+/*
+ * The following attributes are currently not supported by the NFSv4 server:
+ * ACL (will be supported in a forthcoming patch)
+ * ARCHIVE (deprecated anyway)
+ * FS_LOCATIONS (will be supported eventually)
+ * HIDDEN (unlikely to be supported any time soon)
+ * MIMETYPE (unlikely to be supported any time soon)
+ * QUOTA_* (will be supported in a forthcoming patch)
+ * SYSTEM (unlikely to be supported any time soon)
+ * TIME_BACKUP (unlikely to be supported any time soon)
+ * TIME_CREATE (unlikely to be supported any time soon)
+ */
+#define NFSD_SUPPORTED_ATTRS_WORD0 \
+(FATTR4_WORD0_SUPPORTED_ATTRS | FATTR4_WORD0_TYPE | FATTR4_WORD0_FH_EXPIRE_TYPE \
+ | FATTR4_WORD0_CHANGE | FATTR4_WORD0_SIZE | FATTR4_WORD0_LINK_SUPPORT \
+ | FATTR4_WORD0_SYMLINK_SUPPORT | FATTR4_WORD0_NAMED_ATTR | FATTR4_WORD0_FSID \
+ | FATTR4_WORD0_UNIQUE_HANDLES | FATTR4_WORD0_LEASE_TIME | FATTR4_WORD0_RDATTR_ERROR \
+ | FATTR4_WORD0_ACLSUPPORT | FATTR4_WORD0_CANSETTIME | FATTR4_WORD0_CASE_INSENSITIVE \
+ | FATTR4_WORD0_CASE_PRESERVING | FATTR4_WORD0_CHOWN_RESTRICTED \
+ | FATTR4_WORD0_FILEHANDLE | FATTR4_WORD0_FILEID | FATTR4_WORD0_FILES_AVAIL \
+ | FATTR4_WORD0_FILES_FREE | FATTR4_WORD0_FILES_TOTAL | FATTR4_WORD0_HOMOGENEOUS \
+ | FATTR4_WORD0_MAXFILESIZE | FATTR4_WORD0_MAXLINK | FATTR4_WORD0_MAXNAME \
+ | FATTR4_WORD0_MAXREAD | FATTR4_WORD0_MAXWRITE)
+
+#define NFSD_SUPPORTED_ATTRS_WORD1 \
+(FATTR4_WORD1_MODE | FATTR4_WORD1_NO_TRUNC | FATTR4_WORD1_NUMLINKS \
+ | FATTR4_WORD1_OWNER | FATTR4_WORD1_OWNER_GROUP | FATTR4_WORD1_RAWDEV \
+ | FATTR4_WORD1_SPACE_AVAIL | FATTR4_WORD1_SPACE_FREE | FATTR4_WORD1_SPACE_TOTAL \
+ | FATTR4_WORD1_SPACE_USED | FATTR4_WORD1_TIME_ACCESS | FATTR4_WORD1_TIME_ACCESS_SET \
+ | FATTR4_WORD1_TIME_CREATE | FATTR4_WORD1_TIME_DELTA | FATTR4_WORD1_TIME_METADATA \
+ | FATTR4_WORD1_TIME_MODIFY | FATTR4_WORD1_TIME_MODIFY_SET)
+
+/* These will return ERR_INVAL if specified in GETATTR or READDIR. */
+#define NFSD_WRITEONLY_ATTRS_WORD1 \
+(FATTR4_WORD1_TIME_ACCESS_SET | FATTR4_WORD1_TIME_MODIFY_SET)
+
+/* These are the only attrs allowed in CREATE/OPEN/SETATTR. */
+#define NFSD_WRITEABLE_ATTRS_WORD0 FATTR4_WORD0_SIZE
+#define NFSD_WRITEABLE_ATTRS_WORD1 \
+(FATTR4_WORD1_MODE | FATTR4_WORD1_OWNER | FATTR4_WORD1_OWNER_GROUP \
+ | FATTR4_WORD1_TIME_ACCESS_SET | FATTR4_WORD1_TIME_METADATA | FATTR4_WORD1_TIME_MODIFY_SET)
+
+#endif /* CONFIG_NFSD_V4 */
+
+#endif /* __KERNEL__ */
+
+#endif /* LINUX_NFSD_NFSD_H */
diff --git a/tests/linux/nfsd-defines/patch b/tests/linux/nfsd-defines/patch
new file mode 100644
index 0000000..506a370
--- /dev/null
+++ b/tests/linux/nfsd-defines/patch
@@ -0,0 +1,24 @@
+Status: trivial
+
+Fix errors with MAY_* value checking
+
+Typos and silliness...
+
+ ----------- Diffstat output ------------
+ ./include/linux/nfsd/nfsd.h | 4 ++--
+ 1 files changed, 2 insertions(+), 2 deletions(-)
+
+diff ./include/linux/nfsd/nfsd.h~current~ ./include/linux/nfsd/nfsd.h
+--- ./include/linux/nfsd/nfsd.h~current~ 2003-04-17 10:31:15.000000000 +1000
++++ ./include/linux/nfsd/nfsd.h 2003-04-17 10:31:08.000000000 +1000
+@@ -39,8 +39,8 @@
+ #define MAY_LOCK 32
+ #define MAY_OWNER_OVERRIDE 64
+ #define MAY_LOCAL_ACCESS 128 /* IRIX doing local access check on device special file*/
+-#if (MAY_SATTR | MAY_TRUNC | MAY_LOCK | MAX_OWNER_OVERRIDE | MAY_LOCAL_ACCESS) & (MAY_READ | MAY_WRITE | MAY_EXEC | MAY_OWNER_OVERRIDE)
+-# error "please use a different value for MAY_SATTR or MAY_TRUNC or MAY_LOCK or MAY_OWNER_OVERRIDE."
++#if (MAY_SATTR | MAY_TRUNC | MAY_LOCK | MAY_OWNER_OVERRIDE | MAY_LOCAL_ACCESS) & (MAY_READ | MAY_WRITE | MAY_EXEC)
++# error "please use a different value for MAY_SATTR or MAY_TRUNC or MAY_LOCK or MAY_LOCAL_ACCESS or MAY_OWNER_OVERRIDE."
+ #endif
+ #define MAY_CREATE (MAY_EXEC|MAY_WRITE)
+ #define MAY_REMOVE (MAY_EXEC|MAY_WRITE|MAY_TRUNC)
diff --git a/tests/linux/raid1-A/merge b/tests/linux/raid1-A/merge
new file mode 100644
index 0000000..86abd0b
--- /dev/null
+++ b/tests/linux/raid1-A/merge
@@ -0,0 +1,2333 @@
+/*
+ * raid1.c : Multiple Devices driver for Linux
+ *
+ * Copyright (C) 1999, 2000, 2001 Ingo Molnar, Red Hat
+ *
+ * Copyright (C) 1996, 1997, 1998 Ingo Molnar, Miguel de Icaza, Gadi Oxman
+ *
+ * RAID-1 management functions.
+ *
+ * Better read-balancing code written by Mika Kuoppala <miku@iki.fi>, 2000
+ *
+ * Fixes to reconstruction by Jakob Østergaard <jakob@ostenfeld.dk>
+ * Various fixes by Neil Brown <neilb@cse.unsw.edu.au>
+ *
+ * Changes by Peter T. Breuer <ptb@it.uc3m.es> 31/1/2003 to support
+ * bitmapped intelligence in resync:
+ *
+ * - bitmap marked during normal i/o
+ * - bitmap used to skip nondirty blocks during sync
+ *
+ * Additions to bitmap code, (C) 2003-2004 Paul Clements, SteelEye Technology:
+ * - persistent bitmap code
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ *
+ * You should have received a copy of the GNU General Public License
+ * (for example /usr/src/linux/COPYING); if not, write to the Free
+ * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include <linux/slab.h>
+#include <linux/delay.h>
+#include <linux/blkdev.h>
+#include <linux/seq_file.h>
+#include "md.h"
+#include "raid1.h"
+#include "bitmap.h"
+
+#define DEBUG 0
+#if DEBUG
+#define PRINTK(x...) printk(x)
+#else
+#define PRINTK(x...)
+#endif
+
+/*
+ * Number of guaranteed r1bios in case of extreme VM load:
+ */
+#define NR_RAID1_BIOS 256
+
+
+static void allow_barrier(conf_t *conf);
+static void lower_barrier(conf_t *conf);
+
+static void * r1bio_pool_alloc(gfp_t gfp_flags, void *data)
+{
+ struct pool_info *pi = data;
+ int size = offsetof(r1bio_t, bios[pi->raid_disks]);
+
+ /* allocate a r1bio with room for raid_disks entries in the bios array */
+ return kzalloc(size, gfp_flags);
+}
+
+static void r1bio_pool_free(void *r1_bio, void *data)
+{
+ kfree(r1_bio);
+}
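/*
 * r1bio_pool_alloc() above sizes a single allocation to hold the fixed
 * r1bio fields plus one bios[] entry per raid disk, using offsetof()
 * on the trailing array.  A stand-alone sketch of that sizing idiom
 * with hypothetical demo_* types (not the md/raid1 structures):
 */
#include <stddef.h>
#include <stdlib.h>

struct demo_bio;			/* opaque stand-in for struct bio */

struct demo_r1bio {
	long sector;			/* stand-in for the fixed fields */
	struct demo_bio *bios[];	/* one slot per raid disk */
};

static struct demo_r1bio *demo_r1bio_alloc(int raid_disks)
{
	/* size of the header plus raid_disks trailing pointers,
	 * computed the same way as the pool allocator above */
	size_t size = offsetof(struct demo_r1bio, bios[raid_disks]);

	return calloc(1, size);		/* zeroed, like kzalloc() */
}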
+
+#define RESYNC_BLOCK_SIZE (64*1024)
+//#define RESYNC_BLOCK_SIZE PAGE_SIZE
+#define RESYNC_SECTORS (RESYNC_BLOCK_SIZE >> 9)
+#define RESYNC_PAGES ((RESYNC_BLOCK_SIZE + PAGE_SIZE-1) / PAGE_SIZE)
+#define RESYNC_WINDOW (2048*1024)
+
+static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data)
+{
+ struct pool_info *pi = data;
+ struct page *page;
+ r1bio_t *r1_bio;
+ struct bio *bio;
+ int i, j;
+
+ r1_bio = r1bio_pool_alloc(gfp_flags, pi);
+ if (!r1_bio)
+ return NULL;
+
+ /*
+ * Allocate bios : 1 for reading, n-1 for writing
+ */
+ for (j = pi->raid_disks ; j-- ; ) {
+ bio = bio_kmalloc(gfp_flags, RESYNC_PAGES);
+ if (!bio)
+ goto out_free_bio;
+ r1_bio->bios[j] = bio;
+ }
+ /*
+ * Allocate RESYNC_PAGES data pages and attach them to
+ * the first bio.
+ * If this is a user-requested check/repair, allocate
+ * RESYNC_PAGES for each bio.
+ */
+ if (test_bit(MD_RECOVERY_REQUESTED, &pi->mddev->recovery))
+ j = pi->raid_disks;
+ else
+ j = 1;
+ while(j--) {
+ bio = r1_bio->bios[j];
+ for (i = 0; i < RESYNC_PAGES; i++) {
+ page = alloc_page(gfp_flags);
+ if (unlikely(!page))
+ goto out_free_pages;
+
+ bio->bi_io_vec[i].bv_page = page;
+ bio->bi_vcnt = i+1;
+ }
+ }
+ /* If not user-requested, copy the page pointers to all bios */
+ if (!test_bit(MD_RECOVERY_REQUESTED, &pi->mddev->recovery)) {
+ for (i=0; i<RESYNC_PAGES ; i++)
+ for (j=1; j<pi->raid_disks; j++)
+ r1_bio->bios[j]->bi_io_vec[i].bv_page =
+ r1_bio->bios[0]->bi_io_vec[i].bv_page;
+ }
+
+ r1_bio->master_bio = NULL;
+
+ return r1_bio;
+
+out_free_pages:
+ for (j=0 ; j < pi->raid_disks; j++)
+ for (i=0; i < r1_bio->bios[j]->bi_vcnt ; i++)
+ put_page(r1_bio->bios[j]->bi_io_vec[i].bv_page);
+ j = -1;
+out_free_bio:
+ while ( ++j < pi->raid_disks )
+ bio_put(r1_bio->bios[j]);
+ r1bio_pool_free(r1_bio, data);
+ return NULL;
+}
+
+static void r1buf_pool_free(void *__r1_bio, void *data)
+{
+ struct pool_info *pi = data;
+ int i,j;
+ r1bio_t *r1bio = __r1_bio;
+
+ for (i = 0; i < RESYNC_PAGES; i++)
+ for (j = pi->raid_disks; j-- ;) {
+ if (j == 0 ||
+ r1bio->bios[j]->bi_io_vec[i].bv_page !=
+ r1bio->bios[0]->bi_io_vec[i].bv_page)
+ safe_put_page(r1bio->bios[j]->bi_io_vec[i].bv_page);
+ }
+ for (i=0 ; i < pi->raid_disks; i++)
+ bio_put(r1bio->bios[i]);
+
+ r1bio_pool_free(r1bio, data);
+}
+
+static void put_all_bios(conf_t *conf, r1bio_t *r1_bio)
+{
+ int i;
+
+ for (i = 0; i < conf->raid_disks; i++) {
+ struct bio **bio = r1_bio->bios + i;
+ if (*bio && *bio != IO_BLOCKED)
+ bio_put(*bio);
+ *bio = NULL;
+ }
+}
+
+static void free_r1bio(r1bio_t *r1_bio)
+{
+ conf_t *conf = r1_bio->mddev->private;
+
+ /*
+ * Wake up any possible resync thread that waits for the device
+ * to go idle.
+ */
+ allow_barrier(conf);
+
+ put_all_bios(conf, r1_bio);
+ mempool_free(r1_bio, conf->r1bio_pool);
+}
+
+static void put_buf(r1bio_t *r1_bio)
+{
+ conf_t *conf = r1_bio->mddev->private;
+ int i;
+
+ for (i=0; i<conf->raid_disks; i++) {
+ struct bio *bio = r1_bio->bios[i];
+ if (bio->bi_end_io)
+ rdev_dec_pending(conf->mirrors[i].rdev, r1_bio->mddev);
+ }
+
+ mempool_free(r1_bio, conf->r1buf_pool);
+
+ lower_barrier(conf);
+}
+
+static void reschedule_retry(r1bio_t *r1_bio)
+{
+ unsigned long flags;
+ mddev_t *mddev = r1_bio->mddev;
+ conf_t *conf = mddev->private;
+
+ spin_lock_irqsave(&conf->device_lock, flags);
+ list_add(&r1_bio->retry_list, &conf->retry_list);
+ conf->nr_queued ++;
+ spin_unlock_irqrestore(&conf->device_lock, flags);
+
+ wake_up(&conf->wait_barrier);
+ md_wakeup_thread(mddev->thread);
+}
+
+/*
+ * raid_end_bio_io() is called when we have finished servicing a mirrored
+ * operation and are ready to return a success/failure code to the buffer
+ * cache layer.
+ */
+static void raid_end_bio_io(r1bio_t *r1_bio)
+{
+ struct bio *bio = r1_bio->master_bio;
+
+ /* if nobody has done the final endio yet, do it now */
+ if (!test_and_set_bit(R1BIO_Returned, &r1_bio->state)) {
+ PRINTK(KERN_DEBUG "raid1: sync end %s on sectors %llu-%llu\n",
+ (bio_data_dir(bio) == WRITE) ? "write" : "read",
+ (unsigned long long) bio->bi_sector,
+ (unsigned long long) bio->bi_sector +
+ (bio->bi_size >> 9) - 1);
+
+ bio_endio(bio,
+ test_bit(R1BIO_Uptodate, &r1_bio->state) ? 0 : -EIO);
+ }
+ free_r1bio(r1_bio);
+}
+
+/*
+ * Update disk head position estimator based on IRQ completion info.
+ */
+static inline void update_head_pos(int disk, r1bio_t *r1_bio)
+{
+ conf_t *conf = r1_bio->mddev->private;
+
+ conf->mirrors[disk].head_position =
+ r1_bio->sector + (r1_bio->sectors);
+}
+
+static void raid1_end_read_request(struct bio *bio, int error)
+{
+ int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
+ r1bio_t *r1_bio = bio->bi_private;
+ int mirror;
+ conf_t *conf = r1_bio->mddev->private;
+
+ mirror = r1_bio->read_disk;
+ /*
+ * this branch is our 'one mirror IO has finished' event handler:
+ */
+ update_head_pos(mirror, r1_bio);
+
+ if (uptodate)
+ set_bit(R1BIO_Uptodate, &r1_bio->state);
+ else {
+ /* If all other devices have failed, we want to return
+ * the error upwards rather than fail the last device.
+ * Here we redefine "uptodate" to mean "Don't want to retry"
+ */
+ unsigned long flags;
+ spin_lock_irqsave(&conf->device_lock, flags);
+ if (r1_bio->mddev->degraded == conf->raid_disks ||
+ (r1_bio->mddev->degraded == conf->raid_disks-1 &&
+ !test_bit(Faulty, &conf->mirrors[mirror].rdev->flags)))
+ uptodate = 1;
+ spin_unlock_irqrestore(&conf->device_lock, flags);
+ }
+
+ if (uptodate)
+ raid_end_bio_io(r1_bio);
+ else {
+ /*
+ * oops, read error:
+ */
+ char b[BDEVNAME_SIZE];
+ if (printk_ratelimit())
+ printk(KERN_ERR "md/raid1:%s: %s: rescheduling sector %llu\n",
+ mdname(conf->mddev),
+ bdevname(conf->mirrors[mirror].rdev->bdev,b), (unsigned long long)r1_bio->sector);
+ reschedule_retry(r1_bio);
+ }
+
+ rdev_dec_pending(conf->mirrors[mirror].rdev, conf->mddev);
+}
+
+static void r1_bio_write_done(r1bio_t *r1_bio)
+{
+ if (atomic_dec_and_test(&r1_bio->remaining))
+ {
+ /* it really is the end of this request */
+ if (test_bit(R1BIO_BehindIO, &r1_bio->state)) {
+ /* free extra copy of the data pages */
+ int i = r1_bio->behind_page_count;
+ while (i--)
+ safe_put_page(r1_bio->behind_pages[i]);
+ kfree(r1_bio->behind_pages);
+ r1_bio->behind_pages = NULL;
+ }
+ /* clear the bitmap if all writes complete successfully */
+ bitmap_endwrite(r1_bio->mddev->bitmap, r1_bio->sector,
+ r1_bio->sectors,
+ !test_bit(R1BIO_Degraded, &r1_bio->state),
+ test_bit(R1BIO_BehindIO, &r1_bio->state));
+ md_write_end(r1_bio->mddev);
+ raid_end_bio_io(r1_bio);
+ }
+}
+
+static void raid1_end_write_request(struct bio *bio, int error)
+{
+ int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
+ r1bio_t *r1_bio = bio->bi_private;
+ int mirror, behind = test_bit(R1BIO_BehindIO, &r1_bio->state);
+ conf_t *conf = r1_bio->mddev->private;
+ struct bio *to_put = NULL;
+
+
+ for (mirror = 0; mirror < conf->raid_disks; mirror++)
+ if (r1_bio->bios[mirror] == bio)
+ break;
+
+ /*
+ * 'one mirror IO has finished' event handler:
+ */
+ r1_bio->bios[mirror] = NULL;
+ to_put = bio;
+ if (!uptodate) {
+ md_error(r1_bio->mddev, conf->mirrors[mirror].rdev);
+ /* an I/O failed, we can't clear the bitmap */
+ set_bit(R1BIO_Degraded, &r1_bio->state);
+ } else
+ /*
+ * Set R1BIO_Uptodate in our master bio, so that we
+ * will return a good error code to the higher
+ * levels even if IO on some other mirrored buffer
+ * fails.
+ *
+ * The 'master' represents the composite IO operation
+ * to user-side. So if something waits for IO, then it
+ * will wait for the 'master' bio.
+ */
+ set_bit(R1BIO_Uptodate, &r1_bio->state);
+
+ update_head_pos(mirror, r1_bio);
+
+ if (behind) {
+ if (test_bit(WriteMostly, &conf->mirrors[mirror].rdev->flags))
+ atomic_dec(&r1_bio->behind_remaining);
+
+ /*
+ * In behind mode, we ACK the master bio once the I/O
+ * has safely reached all non-writemostly
+ * disks. Setting the Returned bit ensures that this
+ * gets done only once -- we don't ever want to return
+ * -EIO here, instead we'll wait
+ */
+ if (atomic_read(&r1_bio->behind_remaining) >= (atomic_read(&r1_bio->remaining)-1) &&
+ test_bit(R1BIO_Uptodate, &r1_bio->state)) {
+ /* Maybe we can return now */
+ if (!test_and_set_bit(R1BIO_Returned, &r1_bio->state)) {
+ struct bio *mbio = r1_bio->master_bio;
+ PRINTK(KERN_DEBUG "raid1: behind end write sectors %llu-%llu\n",
+ (unsigned long long) mbio->bi_sector,
+ (unsigned long long) mbio->bi_sector +
+ (mbio->bi_size >> 9) - 1);
+ bio_endio(mbio, 0);
+ }
+ }
+ }
+ rdev_dec_pending(conf->mirrors[mirror].rdev, conf->mddev);
+
+ /*
+ * Let's see if all mirrored write operations have finished
+ * already.
+ */
+ r1_bio_write_done(r1_bio);
+
+ if (to_put)
+ bio_put(to_put);
+}
+
+
+/*
+ * This routine returns the disk from which the requested read should
+ * be done. There is a per-array 'next expected sequential IO' sector
+ * number - if this matches on the next IO then we use the last disk.
+ * There is also a per-disk 'last known head position' sector that is
+ * maintained from IRQ contexts; both the normal and the resync IO
+ * completion handlers update this position correctly. If there is no
+ * perfect sequential match then we pick the disk whose head is closest.
+ *
+ * If there are 2 mirrors in the same 2 devices, performance degrades
+ * because position is mirror, not device based.
+ *
+ * The rdev for the device selected will have nr_pending incremented.
+ */
+static int read_balance(conf_t *conf, r1bio_t *r1_bio)
+{
+ const sector_t this_sector = r1_bio->sector;
+ const int sectors = r1_bio->sectors;
+ int start_disk;
+ int best_disk;
+ int i;
+ sector_t best_dist;
+ mdk_rdev_t *rdev;
+ int choose_first;
+
+ rcu_read_lock();
+ /*
+ * Check if we can balance. We can balance on the whole
+ * device if no resync is going on, or below the resync window.
+ * We take the first readable disk when above the resync window.
+ */
+ retry:
+ best_disk = -1;
+ best_dist = MaxSector;
+ if (conf->mddev->recovery_cp < MaxSector &&
+ (this_sector + sectors >= conf->next_resync)) {
+ choose_first = 1;
+ start_disk = 0;
+ } else {
+ choose_first = 0;
+ start_disk = conf->last_used;
+ }
+
+ for (i = 0 ; i < conf->raid_disks ; i++) {
+ sector_t dist;
+ int disk = start_disk + i;
+ if (disk >= conf->raid_disks)
+ disk -= conf->raid_disks;
+
+ rdev = rcu_dereference(conf->mirrors[disk].rdev);
+ if (r1_bio->bios[disk] == IO_BLOCKED
+ || rdev == NULL
+ || test_bit(Faulty, &rdev->flags))
+ continue;
+ if (!test_bit(In_sync, &rdev->flags) &&
+ rdev->recovery_offset < this_sector + sectors)
+ continue;
+ if (test_bit(WriteMostly, &rdev->flags)) {
+ /* Don't balance among write-mostly, just
+ * use the first as a last resort */
+ if (best_disk < 0)
+ best_disk = disk;
+ continue;
+ }
+ /* This is a reasonable device to use. It might
+ * even be best.
+ */
+ dist = abs(this_sector - conf->mirrors[disk].head_position);
+ if (choose_first
+ /* Don't change to another disk for sequential reads */
+ || conf->next_seq_sect == this_sector
+ || dist == 0
+ /* If device is idle, use it */
+ || atomic_read(&rdev->nr_pending) == 0) {
+ best_disk = disk;
+ break;
+ }
+ if (dist < best_dist) {
+ best_dist = dist;
+ best_disk = disk;
+ }
+ }
+
+ if (best_disk >= 0) {
+ rdev = rcu_dereference(conf->mirrors[best_disk].rdev);
+ if (!rdev)
+ goto retry;
+ atomic_inc(&rdev->nr_pending);
+ if (test_bit(Faulty, &rdev->flags)) {
+ /* cannot risk returning a device that failed
+ * before we inc'ed nr_pending
+ */
+ rdev_dec_pending(rdev, conf->mddev);
+ goto retry;
+ }
+ conf->next_seq_sect = this_sector + sectors;
+ conf->last_used = best_disk;
+ }
+ rcu_read_unlock();
+
+ return best_disk;
+}
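/*
 * Condensed form of the policy read_balance() implements above: stick
 * with the current disk for sequential reads, otherwise prefer an idle
 * disk or the one whose last known head position is nearest the target
 * sector.  The arrays and names below are illustrative only, not the
 * real conf_t layout:
 */
#include <stdlib.h>

static int pick_mirror(const long head_pos[], const int pending[],
		       int ndisks, long sector, long next_seq_sect)
{
	long best_dist = -1;
	int best = -1;
	int d;

	for (d = 0; d < ndisks; d++) {
		long dist = labs(sector - head_pos[d]);

		/* sequential read, exact head match or idle disk: use it now */
		if (sector == next_seq_sect || dist == 0 || pending[d] == 0)
			return d;
		if (best < 0 || dist < best_dist) {
			best_dist = dist;
			best = d;
		}
	}
	return best;	/* -1 only if ndisks == 0 */
}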
+
+int md_raid1_congested(mddev_t *mddev, int bits)
+{
+ conf_t *conf = mddev->private;
+ int i, ret = 0;
+
+ rcu_read_lock();
+ for (i = 0; i < mddev->raid_disks; i++) {
+ mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev);
+ if (rdev && !test_bit(Faulty, &rdev->flags)) {
+ struct request_queue *q = bdev_get_queue(rdev->bdev);
+
+ BUG_ON(!q);
+
+ /* Note the '|| 1' - when read_balance prefers
+ * non-congested targets, it can be removed
+ */
+ if ((bits & (1<<BDI_async_congested)) || 1)
+ ret |= bdi_congested(&q->backing_dev_info, bits);
+ else
+ ret &= bdi_congested(&q->backing_dev_info, bits);
+ }
+ }
+ rcu_read_unlock();
+ return ret;
+}
+EXPORT_SYMBOL_GPL(md_raid1_congested);
+
+static int max_queued = INT_MAX;
+static int raid1_congested(void *data, int bits)
+{
+ mddev_t *mddev = data;
+
+ return mddev_congested(mddev, bits) ||
+ md_raid1_congested(mddev, bits);
+}
+
+static void flush_pending_writes(conf_t *conf)
+{
+ /* Any writes that have been queued but are awaiting
+ * bitmap updates get flushed here.
+ */
+ spin_lock_irq(&conf->device_lock);
+
+ if (conf->pending_bio_list.head) {
+ struct bio *bio;
+ bio = bio_list_get(&conf->pending_bio_list);
+ conf->pending_count = 0;
+ spin_unlock_irq(&conf->device_lock);
+ wake_up(&conf->wait_barrier);
+ /* flush any pending bitmap writes to
+ * disk before proceeding w/ I/O */
+ bitmap_unplug(conf->mddev->bitmap);
+
+ while (bio) { /* submit pending writes */
+ struct bio *next = bio->bi_next;
+ bio->bi_next = NULL;
+ generic_make_request(bio);
+ bio = next;
+ }
+ } else
+ spin_unlock_irq(&conf->device_lock);
+}
+
+/* Barriers....
+ * Sometimes we need to suspend IO while we do something else,
+ * either some resync/recovery, or reconfigure the array.
+ * To do this we raise a 'barrier'.
+ * The 'barrier' is a counter that can be raised multiple times
+ * to count how many activities are happening which preclude
+ * normal IO.
+ * We can only raise the barrier if there is no pending IO.
+ * i.e. if nr_pending == 0.
+ * We choose only to raise the barrier if no-one is waiting for the
+ * barrier to go down. This means that as soon as an IO request
+ * is ready, no other operations which require a barrier will start
+ * until the IO request has had a chance.
+ *
+ * So: regular IO calls 'wait_barrier'. When that returns there
+ * is no background IO happening. It must arrange to call
+ * allow_barrier when it has finished its IO.
+ * background IO calls must call raise_barrier. Once that returns
+ * there is no normal IO happening. It must arrange to call
+ * lower_barrier when the particular background IO completes.
+ */
+#define RESYNC_DEPTH 32
+
+static void raise_barrier(conf_t *conf)
+{
+ spin_lock_irq(&conf->resync_lock);
+
+ /* Wait until no block IO is waiting */
+ wait_event_lock_irq(conf->wait_barrier, !conf->nr_waiting,
+ conf->resync_lock, );
+
+ /* block any new IO from starting */
+ conf->barrier++;
+
+ /* Now wait for all pending IO to complete */
+ wait_event_lock_irq(conf->wait_barrier,
+ !conf->nr_pending && conf->barrier < RESYNC_DEPTH,
+ conf->resync_lock, );
+
+ spin_unlock_irq(&conf->resync_lock);
+}
+
+static void lower_barrier(conf_t *conf)
+{
+ unsigned long flags;
+ BUG_ON(conf->barrier <= 0);
+ spin_lock_irqsave(&conf->resync_lock, flags);
+ conf->barrier--;
+ spin_unlock_irqrestore(&conf->resync_lock, flags);
+ wake_up(&conf->wait_barrier);
+}
+
+static void wait_barrier(conf_t *conf)
+{
+ spin_lock_irq(&conf->resync_lock);
+ if (conf->barrier) {
+ conf->nr_waiting++;
+ wait_event_lock_irq(conf->wait_barrier, !conf->barrier,
+ conf->resync_lock,
+ );
+ conf->nr_waiting--;
+ }
+ conf->nr_pending++;
+ spin_unlock_irq(&conf->resync_lock);
+}
+
+static void allow_barrier(conf_t *conf)
+{
+ unsigned long flags;
+ spin_lock_irqsave(&conf->resync_lock, flags);
+ conf->nr_pending--;
+ spin_unlock_irqrestore(&conf->resync_lock, flags);
+ wake_up(&conf->wait_barrier);
+}
+
+static void freeze_array(conf_t *conf)
+{
+ /* stop syncio and normal IO and wait for everything to
+ * go quiet.
+ * We increment barrier and nr_waiting, and then
+ * wait until nr_pending matches nr_queued+1
+ * This is called in the context of one normal IO request
+ * that has failed. Thus any sync request that might be pending
+ * will be blocked by nr_pending, and we need to wait for
+ * pending IO requests to complete or be queued for re-try.
+ * Thus the number queued (nr_queued) plus this request (1)
+ * must match the number of pending IOs (nr_pending) before
+ * we continue.
+ */
+ spin_lock_irq(&conf->resync_lock);
+ conf->barrier++;
+ conf->nr_waiting++;
+ wait_event_lock_irq(conf->wait_barrier,
+ conf->nr_pending == conf->nr_queued+1,
+ conf->resync_lock,
+ flush_pending_writes(conf));
+ spin_unlock_irq(&conf->resync_lock);
+}
+static void unfreeze_array(conf_t *conf)
+{
+ /* reverse the effect of the freeze */
+ spin_lock_irq(&conf->resync_lock);
+ conf->barrier--;
+ conf->nr_waiting--;
+ wake_up(&conf->wait_barrier);
+ spin_unlock_irq(&conf->resync_lock);
+}
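/*
 * The "Barriers...." comment above boils down to a simple calling
 * convention: normal I/O brackets its work with wait_barrier()/
 * allow_barrier(), while resync/recovery brackets each window with
 * raise_barrier()/lower_barrier().  Schematic sketch only; the work
 * done inside the brackets is a placeholder, not md code:
 */
static void example_regular_io(conf_t *conf)
{
	wait_barrier(conf);	/* blocks while a resync window is in force */
	/* ... queue the normal read or write here ... */
	allow_barrier(conf);	/* drops nr_pending so a resync can start */
}

static void example_resync_window(conf_t *conf)
{
	raise_barrier(conf);	/* waits for in-flight normal I/O to drain */
	/* ... issue one window of resync I/O here ... */
	lower_barrier(conf);	/* wakes any normal I/O queued behind it */
}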
+
+
+/* duplicate the data pages for behind I/O
+ */
+static void alloc_behind_pages(struct bio *bio, r1bio_t *r1_bio)
+{
+ int i;
+ struct bio_vec *bvec;
+ struct page **pages = kzalloc(bio->bi_vcnt * sizeof(struct page*),
+ GFP_NOIO);
+ if (unlikely(!pages))
+ return;
+
+ bio_for_each_segment(bvec, bio, i) {
+ pages[i] = alloc_page(GFP_NOIO);
+ if (unlikely(!pages[i]))
+ goto do_sync_io;
+ memcpy(kmap(pages[i]) + bvec->bv_offset,
+ kmap(bvec->bv_page) + bvec->bv_offset, bvec->bv_len);
+ kunmap(pages[i]);
+ kunmap(bvec->bv_page);
+ }
+ r1_bio->behind_pages = pages;
+ r1_bio->behind_page_count = bio->bi_vcnt;
+ set_bit(R1BIO_BehindIO, &r1_bio->state);
+ return;
+
+do_sync_io:
+ for (i = 0; i < bio->bi_vcnt; i++)
+ if (pages[i])
+ put_page(pages[i]);
+ kfree(pages);
+ PRINTK("%dB behind alloc failed, doing sync I/O\n", bio->bi_size);
+}
+
+static int make_request(mddev_t *mddev, struct bio * bio)
+{
+ conf_t *conf = mddev->private;
+ mirror_info_t *mirror;
+ r1bio_t *r1_bio;
+ struct bio *read_bio;
+ int i, targets = 0, disks;
+ struct bitmap *bitmap;
+<<<<<<<
+ unsigned long flags;
+|||||||
+ unsigned long flags;
+ struct bio_list bl;
+ struct page **behind_pages = NULL;
+=======
+ unsigned long flags;
+ struct bio_list bl;
+ int bl_count;
+ struct page **behind_pages = NULL;
+>>>>>>>
+ const int rw = bio_data_dir(bio);
+ const unsigned long do_sync = (bio->bi_rw & REQ_SYNC);
+ const unsigned long do_flush_fua = (bio->bi_rw & (REQ_FLUSH | REQ_FUA));
+ mdk_rdev_t *blocked_rdev;
+ int plugged;
+
+ /*
+ * Register the new request and wait if the reconstruction
+ * thread has put up a bar for new requests.
+ * Continue immediately if no resync is active currently.
+ */
+
+ md_write_start(mddev, bio); /* wait on superblock update early */
+
+ if (bio_data_dir(bio) == WRITE &&
+ bio->bi_sector + bio->bi_size/512 > mddev->suspend_lo &&
+ bio->bi_sector < mddev->suspend_hi) {
+ /* As the suspend_* range is controlled by
+ * userspace, we want an interruptible
+ * wait.
+ */
+ DEFINE_WAIT(w);
+ for (;;) {
+ flush_signals(current);
+ prepare_to_wait(&conf->wait_barrier,
+ &w, TASK_INTERRUPTIBLE);
+ if (bio->bi_sector + bio->bi_size/512 <= mddev->suspend_lo ||
+ bio->bi_sector >= mddev->suspend_hi)
+ break;
+ schedule();
+ }
+ finish_wait(&conf->wait_barrier, &w);
+ }
+
+ wait_barrier(conf);
+
+ bitmap = mddev->bitmap;
+
+ /*
+ * make_request() can abort the operation when READA is being
+ * used and no empty request is available.
+ *
+ */
+ r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO);
+
+ r1_bio->master_bio = bio;
+ r1_bio->sectors = bio->bi_size >> 9;
+ r1_bio->state = 0;
+ r1_bio->mddev = mddev;
+ r1_bio->sector = bio->bi_sector;
+
+ if (rw == READ) {
+ /*
+ * read balancing logic:
+ */
+ int rdisk = read_balance(conf, r1_bio);
+
+ if (rdisk < 0) {
+ /* couldn't find anywhere to read from */
+ raid_end_bio_io(r1_bio);
+ return 0;
+ }
+ mirror = conf->mirrors + rdisk;
+
+ if (test_bit(WriteMostly, &mirror->rdev->flags) &&
+ bitmap) {
+ /* Reading from a write-mostly device must
+ * take care not to over-take any writes
+ * that are 'behind'
+ */
+ wait_event(bitmap->behind_wait,
+ atomic_read(&bitmap->behind_writes) == 0);
+ }
+ r1_bio->read_disk = rdisk;
+
+ read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev);
+
+ r1_bio->bios[rdisk] = read_bio;
+
+ read_bio->bi_sector = r1_bio->sector + mirror->rdev->data_offset;
+ read_bio->bi_bdev = mirror->rdev->bdev;
+ read_bio->bi_end_io = raid1_end_read_request;
+ read_bio->bi_rw = READ | do_sync;
+ read_bio->bi_private = r1_bio;
+
+ generic_make_request(read_bio);
+ return 0;
+ }
+
+ /*
+ * WRITE:
+ */
+ if (conf->pending_count >= max_queued) {
+ md_wakeup_thread(mddev->thread);
+ wait_event(conf->wait_barrier,
+ conf->pending_count < max_queued);
+ }
+ /* first select target devices under spinlock and
+ * inc refcount on their rdev. Record them by setting
+ * bios[x] to bio
+ */
+ plugged = mddev_check_plugged(mddev);
+
+ disks = conf->raid_disks;
+ retry_write:
+ blocked_rdev = NULL;
+ rcu_read_lock();
+ for (i = 0; i < disks; i++) {
+ mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev);
+ if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) {
+ atomic_inc(&rdev->nr_pending);
+ blocked_rdev = rdev;
+ break;
+ }
+ if (rdev && !test_bit(Faulty, &rdev->flags)) {
+ atomic_inc(&rdev->nr_pending);
+ if (test_bit(Faulty, &rdev->flags)) {
+ rdev_dec_pending(rdev, mddev);
+ r1_bio->bios[i] = NULL;
+ } else {
+ r1_bio->bios[i] = bio;
+ targets++;
+ }
+ } else
+ r1_bio->bios[i] = NULL;
+ }
+ rcu_read_unlock();
+
+ if (unlikely(blocked_rdev)) {
+ /* Wait for this device to become unblocked */
+ int j;
+
+ for (j = 0; j < i; j++)
+ if (r1_bio->bios[j])
+ rdev_dec_pending(conf->mirrors[j].rdev, mddev);
+
+ allow_barrier(conf);
+ md_wait_for_blocked_rdev(blocked_rdev, mddev);
+ wait_barrier(conf);
+ goto retry_write;
+ }
+
+ BUG_ON(targets == 0); /* we never fail the last device */
+
+ if (targets < conf->raid_disks) {
+ /* array is degraded, we will not clear the bitmap
+ * on I/O completion (see raid1_end_write_request) */
+ set_bit(R1BIO_Degraded, &r1_bio->state);
+ }
+
+ /* do behind I/O ?
+ * Not if there are too many, or cannot allocate memory,
+ * or a reader on WriteMostly is waiting for behind writes
+ * to flush */
+ if (bitmap &&
+ (atomic_read(&bitmap->behind_writes)
+ < mddev->bitmap_info.max_write_behind) &&
+ !waitqueue_active(&bitmap->behind_wait))
+ alloc_behind_pages(bio, r1_bio);
+
+ atomic_set(&r1_bio->remaining, 1);
+ atomic_set(&r1_bio->behind_remaining, 0);
+
+ bitmap_startwrite(bitmap, bio->bi_sector, r1_bio->sectors,
+ test_bit(R1BIO_BehindIO, &r1_bio->state));
+ bl_count = 0;
+ for (i = 0; i < disks; i++) {
+ struct bio *mbio;
+ if (!r1_bio->bios[i])
+ continue;
+
+ mbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
+ r1_bio->bios[i] = mbio;
+
+ mbio->bi_sector = r1_bio->sector + conf->mirrors[i].rdev->data_offset;
+ mbio->bi_bdev = conf->mirrors[i].rdev->bdev;
+ mbio->bi_end_io = raid1_end_write_request;
+ mbio->bi_rw = WRITE | do_flush_fua | do_sync;
+ mbio->bi_private = r1_bio;
+
+ if (r1_bio->behind_pages) {
+ struct bio_vec *bvec;
+ int j;
+
+ /* Yes, I really want the '__' version so that
+ * we clear any unused pointer in the io_vec, rather
+ * than leave them unchanged. This is important
+ * because when we come to free the pages, we won't
+ * know the original bi_idx, so we just free
+ * them all
+ */
+ __bio_for_each_segment(bvec, mbio, j, 0)
+ bvec->bv_page = r1_bio->behind_pages[j];
+ if (test_bit(WriteMostly, &conf->mirrors[i].rdev->flags))
+ atomic_inc(&r1_bio->behind_remaining);
+<<<<<<<
+ }
+
+ atomic_inc(&r1_bio->remaining);
+|||||||
+ bio_list_add(&bl, mbio);
+ }
+ kfree(behind_pages); /* the behind pages are attached to the bios now */
+
+=======
+ bio_list_add(&bl, mbio);
+ bl_count++;
+ }
+ kfree(behind_pages); /* the behind pages are attached to the bios now */
+
+>>>>>>>
+<<<<<<<
+ spin_lock_irqsave(&conf->device_lock, flags);
+ bio_list_add(&conf->pending_bio_list, mbio);
+ spin_unlock_irqrestore(&conf->device_lock, flags);
+ }
+|||||||
+ test_bit(R1BIO_BehindIO, &r1_bio->state));
+ spin_lock_irqsave(&conf->device_lock, flags);
+ bio_list_merge(&conf->pending_bio_list, &bl);
+ bio_list_init(&bl);
+
+=======
+ test_bit(R1BIO_BehindIO, &r1_bio->state));
+ spin_lock_irqsave(&conf->device_lock, flags);
+ bio_list_merge(&conf->pending_bio_list, &bl);
+ conf->pending_count += bl_count;
+ bio_list_init(&bl);
+
+>>>>>>>
+ r1_bio_write_done(r1_bio);
+
+ /* In case raid1d snuck in to freeze_array */
+ wake_up(&conf->wait_barrier);
+
+ if (do_sync || !bitmap || !plugged)
+ md_wakeup_thread(mddev->thread);
+
+ return 0;
+}
+
+static void status(struct seq_file *seq, mddev_t *mddev)
+{
+ conf_t *conf = mddev->private;
+ int i;
+
+ seq_printf(seq, " [%d/%d] [", conf->raid_disks,
+ conf->raid_disks - mddev->degraded);
+ rcu_read_lock();
+ for (i = 0; i < conf->raid_disks; i++) {
+ mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev);
+ seq_printf(seq, "%s",
+ rdev && test_bit(In_sync, &rdev->flags) ? "U" : "_");
+ }
+ rcu_read_unlock();
+ seq_printf(seq, "]");
+}
+
+
+static void error(mddev_t *mddev, mdk_rdev_t *rdev)
+{
+ char b[BDEVNAME_SIZE];
+ conf_t *conf = mddev->private;
+
+ /*
+ * If it is not operational, then we have already marked it as dead
+ * else if it is the last working disk, ignore the error and let the
+ * next level up know.
+ * else mark the drive as failed
+ */
+ if (test_bit(In_sync, &rdev->flags)
+ && (conf->raid_disks - mddev->degraded) == 1) {
+ /*
+ * Don't fail the drive, act as though we were just a
+ * normal single drive.
+ * However don't try a recovery from this drive as
+ * it is very likely to fail.
+ */
+ mddev->recovery_disabled = 1;
+ return;
+ }
+ if (test_and_clear_bit(In_sync, &rdev->flags)) {
+ unsigned long flags;
+ spin_lock_irqsave(&conf->device_lock, flags);
+ mddev->degraded++;
+ set_bit(Faulty, &rdev->flags);
+ spin_unlock_irqrestore(&conf->device_lock, flags);
+ /*
+ * if recovery is running, make sure it aborts.
+ */
+ set_bit(MD_RECOVERY_INTR, &mddev->recovery);
+ } else
+ set_bit(Faulty, &rdev->flags);
+ set_bit(MD_CHANGE_DEVS, &mddev->flags);
+ printk(KERN_ALERT
+ "md/raid1:%s: Disk failure on %s, disabling device.\n"
+ "md/raid1:%s: Operation continuing on %d devices.\n",
+ mdname(mddev), bdevname(rdev->bdev, b),
+ mdname(mddev), conf->raid_disks - mddev->degraded);
+}
+
+static void print_conf(conf_t *conf)
+{
+ int i;
+
+ printk(KERN_DEBUG "RAID1 conf printout:\n");
+ if (!conf) {
+ printk(KERN_DEBUG "(!conf)\n");
+ return;
+ }
+ printk(KERN_DEBUG " --- wd:%d rd:%d\n", conf->raid_disks - conf->mddev->degraded,
+ conf->raid_disks);
+
+ if ((bits & (1 << BDI_async_congested)) &&
+ conf->pending_count >= max_queued)
+ return 1;
+
+ rcu_read_lock();
+ for (i = 0; i < conf->raid_disks; i++) {
+ char b[BDEVNAME_SIZE];
+ mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev);
+ if (rdev)
+ printk(KERN_DEBUG " disk %d, wo:%d, o:%d, dev:%s\n",
+ i, !test_bit(In_sync, &rdev->flags),
+ !test_bit(Faulty, &rdev->flags),
+ bdevname(rdev->bdev,b));
+ }
+ rcu_read_unlock();
+}
+
+static void close_sync(conf_t *conf)
+{
+ wait_barrier(conf);
+ allow_barrier(conf);
+
+ mempool_destroy(conf->r1buf_pool);
+ conf->r1buf_pool = NULL;
+}
+
+static int raid1_spare_active(mddev_t *mddev)
+{
+ int i;
+ conf_t *conf = mddev->private;
+ int count = 0;
+ unsigned long flags;
+
+ /*
+ * Find all failed disks within the RAID1 configuration
+ * and mark them readable.
+ * Called under mddev lock, so rcu protection not needed.
+ */
+ for (i = 0; i < conf->raid_disks; i++) {
+ mdk_rdev_t *rdev = conf->mirrors[i].rdev;
+ if (rdev
+ && !test_bit(Faulty, &rdev->flags)
+ && !test_and_set_bit(In_sync, &rdev->flags)) {
+ count++;
+ sysfs_notify_dirent(rdev->sysfs_state);
+ }
+ }
+ spin_lock_irqsave(&conf->device_lock, flags);
+ mddev->degraded -= count;
+ spin_unlock_irqrestore(&conf->device_lock, flags);
+
+ print_conf(conf);
+ return count;
+}
+
+
+static int raid1_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
+{
+ conf_t *conf = mddev->private;
+ int err = -EEXIST;
+ int mirror = 0;
+ mirror_info_t *p;
+ int first = 0;
+ int last = mddev->raid_disks - 1;
+
+ if (rdev->raid_disk >= 0)
+ first = last = rdev->raid_disk;
+
+ for (mirror = first; mirror <= last; mirror++)
+ if ( !(p=conf->mirrors+mirror)->rdev) {
+
+ disk_stack_limits(mddev->gendisk, rdev->bdev,
+ rdev->data_offset << 9);
+ /* as we don't honour merge_bvec_fn, we must
+ * never risk violating it, so limit
+ * ->max_segments to one lying with a single
+ * page, as a one page request is never in
+ * violation.
+ */
+ if (rdev->bdev->bd_disk->queue->merge_bvec_fn) {
+ blk_queue_max_segments(mddev->queue, 1);
+ blk_queue_segment_boundary(mddev->queue,
+ PAGE_CACHE_SIZE - 1);
+ }
+
+ p->head_position = 0;
+ rdev->raid_disk = mirror;
+ err = 0;
+ /* As all devices are equivalent, we don't need a full recovery
+ * if this drive was recently part of the array
+ */
+ if (rdev->saved_raid_disk < 0)
+ conf->fullsync = 1;
+ rcu_assign_pointer(p->rdev, rdev);
+ break;
+ }
+ md_integrity_add_rdev(rdev, mddev);
+ print_conf(conf);
+ return err;
+}
+
+static int raid1_remove_disk(mddev_t *mddev, int number)
+{
+ conf_t *conf = mddev->private;
+ int err = 0;
+ mdk_rdev_t *rdev;
+ mirror_info_t *p = conf->mirrors+ number;
+
+ print_conf(conf);
+ rdev = p->rdev;
+ if (rdev) {
+ if (test_bit(In_sync, &rdev->flags) ||
+ atomic_read(&rdev->nr_pending)) {
+ err = -EBUSY;
+ goto abort;
+ }
+ /* Only remove non-faulty devices if recovery
+ * is not possible.
+ */
+ if (!test_bit(Faulty, &rdev->flags) &&
+ !mddev->recovery_disabled &&
+ mddev->degraded < conf->raid_disks) {
+ err = -EBUSY;
+ goto abort;
+ }
+ p->rdev = NULL;
+ synchronize_rcu();
+ if (atomic_read(&rdev->nr_pending)) {
+ /* lost the race, try later */
+ err = -EBUSY;
+ p->rdev = rdev;
+ goto abort;
+ }
+ err = md_integrity_register(mddev);
+ }
+abort:
+
+ print_conf(conf);
+ return err;
+}
+
+
+static void end_sync_read(struct bio *bio, int error)
+{
+ r1bio_t *r1_bio = bio->bi_private;
+ int i;
+
+ for (i=r1_bio->mddev->raid_disks; i--; )
+ if (r1_bio->bios[i] == bio)
+ break;
+ BUG_ON(i < 0);
+ update_head_pos(i, r1_bio);
+ /*
+ * we have read a block, now it needs to be re-written,
+ * or re-read if the read failed.
+ * We don't do much here, just schedule handling by raid1d
+ */
+ if (test_bit(BIO_UPTODATE, &bio->bi_flags))
+ set_bit(R1BIO_Uptodate, &r1_bio->state);
+
+ if (atomic_dec_and_test(&r1_bio->remaining))
+ reschedule_retry(r1_bio);
+}
+
+static void end_sync_write(struct bio *bio, int error)
+{
+ int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
+ r1bio_t *r1_bio = bio->bi_private;
+ mddev_t *mddev = r1_bio->mddev;
+ conf_t *conf = mddev->private;
+ int i;
+ int mirror=0;
+
+ for (i = 0; i < conf->raid_disks; i++)
+ if (r1_bio->bios[i] == bio) {
+ mirror = i;
+ break;
+ }
+ if (!uptodate) {
+ sector_t sync_blocks = 0;
+ sector_t s = r1_bio->sector;
+ long sectors_to_go = r1_bio->sectors;
+ /* make sure these bits don't get cleared. */
+ do {
+ bitmap_end_sync(mddev->bitmap, s,
+ &sync_blocks, 1);
+ s += sync_blocks;
+ sectors_to_go -= sync_blocks;
+ } while (sectors_to_go > 0);
+ md_error(mddev, conf->mirrors[mirror].rdev);
+ }
+
+ update_head_pos(mirror, r1_bio);
+
+ if (atomic_dec_and_test(&r1_bio->remaining)) {
+ sector_t s = r1_bio->sectors;
+ put_buf(r1_bio);
+ md_done_sync(mddev, s, uptodate);
+ }
+}
+
+static int fix_sync_read_error(r1bio_t *r1_bio)
+{
+ /* Try some synchronous reads of other devices to get
+ * good data, much like with normal read errors. Only
+ * read into the pages we already have so we don't
+ * need to re-issue the read request.
+ * We don't need to freeze the array, because being in an
+ * active sync request, there is no normal IO, and
+ * no overlapping syncs.
+ */
+ mddev_t *mddev = r1_bio->mddev;
+ conf_t *conf = mddev->private;
+ struct bio *bio = r1_bio->bios[r1_bio->read_disk];
+ sector_t sect = r1_bio->sector;
+ int sectors = r1_bio->sectors;
+ int idx = 0;
+
+ while(sectors) {
+ int s = sectors;
+ int d = r1_bio->read_disk;
+ int success = 0;
+ mdk_rdev_t *rdev;
+ int start;
+
+ if (s > (PAGE_SIZE>>9))
+ s = PAGE_SIZE >> 9;
+ do {
+ if (r1_bio->bios[d]->bi_end_io == end_sync_read) {
+ /* No rcu protection needed here devices
+ * can only be removed when no resync is
+ * active, and resync is currently active
+ */
+ rdev = conf->mirrors[d].rdev;
+ if (sync_page_io(rdev,
+ sect,
+ s<<9,
+ bio->bi_io_vec[idx].bv_page,
+ READ, false)) {
+ success = 1;
+ break;
+ }
+ }
+ d++;
+ if (d == conf->raid_disks)
+ d = 0;
+ } while (!success && d != r1_bio->read_disk);
+
+ if (!success) {
+ char b[BDEVNAME_SIZE];
+ /* Cannot read from anywhere, array is toast */
+ md_error(mddev, conf->mirrors[r1_bio->read_disk].rdev);
+ printk(KERN_ALERT "md/raid1:%s: %s: unrecoverable I/O read error"
+ " for block %llu\n",
+ mdname(mddev),
+ bdevname(bio->bi_bdev, b),
+ (unsigned long long)r1_bio->sector);
+ md_done_sync(mddev, r1_bio->sectors, 0);
+ put_buf(r1_bio);
+ return 0;
+ }
+
+ start = d;
+ /* write it back and re-read */
+ while (d != r1_bio->read_disk) {
+ if (d == 0)
+ d = conf->raid_disks;
+ d--;
+ if (r1_bio->bios[d]->bi_end_io != end_sync_read)
+ continue;
+ rdev = conf->mirrors[d].rdev;
+ if (sync_page_io(rdev,
+ sect,
+ s<<9,
+ bio->bi_io_vec[idx].bv_page,
+ WRITE, false) == 0) {
+ r1_bio->bios[d]->bi_end_io = NULL;
+ rdev_dec_pending(rdev, mddev);
+ md_error(mddev, rdev);
+ } else
+ atomic_add(s, &rdev->corrected_errors);
+ }
+ d = start;
+ while (d != r1_bio->read_disk) {
+ if (d == 0)
+ d = conf->raid_disks;
+ d--;
+ if (r1_bio->bios[d]->bi_end_io != end_sync_read)
+ continue;
+ rdev = conf->mirrors[d].rdev;
+ if (sync_page_io(rdev,
+ sect,
+ s<<9,
+ bio->bi_io_vec[idx].bv_page,
+ READ, false) == 0)
+ md_error(mddev, rdev);
+ }
+ sectors -= s;
+ sect += s;
+ idx ++;
+ }
+ set_bit(R1BIO_Uptodate, &r1_bio->state);
+ set_bit(BIO_UPTODATE, &bio->bi_flags);
+ return 1;
+}
+
+static int process_checks(r1bio_t *r1_bio)
+{
+ /* We have read all readable devices. If we haven't
+ * got the block, then there is no hope left.
+ * If we have, then we want to do a comparison
+ * and skip the write if everything is the same.
+ * If any blocks failed to read, then we need to
+ * attempt an over-write
+ */
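+ /* Roughly: the first successful read becomes the 'primary' copy;
+ * every other successful read is compared against it page by page,
+ * and only mismatching or failed copies are set up to be rewritten
+ * from the primary (unless this is a check-only pass).
+ */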
+ mddev_t *mddev = r1_bio->mddev;
+ conf_t *conf = mddev->private;
+ int primary;
+ int i;
+
+ for (primary = 0; primary < conf->raid_disks; primary++)
+ if (r1_bio->bios[primary]->bi_end_io == end_sync_read &&
+ test_bit(BIO_UPTODATE, &r1_bio->bios[primary]->bi_flags)) {
+ r1_bio->bios[primary]->bi_end_io = NULL;
+ rdev_dec_pending(conf->mirrors[primary].rdev, mddev);
+ break;
+ }
+ r1_bio->read_disk = primary;
+ for (i = 0; i < conf->raid_disks; i++) {
+ int j;
+ int vcnt = r1_bio->sectors >> (PAGE_SHIFT- 9);
+ struct bio *pbio = r1_bio->bios[primary];
+ struct bio *sbio = r1_bio->bios[i];
+ int size;
+
+ if (r1_bio->bios[i]->bi_end_io != end_sync_read)
+ continue;
+
+ if (test_bit(BIO_UPTODATE, &sbio->bi_flags)) {
+ for (j = vcnt; j-- ; ) {
+ struct page *p, *s;
+ p = pbio->bi_io_vec[j].bv_page;
+ s = sbio->bi_io_vec[j].bv_page;
+ if (memcmp(page_address(p),
+ page_address(s),
+ PAGE_SIZE))
+ break;
+ }
+ } else
+ j = 0;
+ if (j >= 0)
+ mddev->resync_mismatches += r1_bio->sectors;
+ if (j < 0 || (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)
+ && test_bit(BIO_UPTODATE, &sbio->bi_flags))) {
+ /* No need to write to this device. */
+ sbio->bi_end_io = NULL;
+ rdev_dec_pending(conf->mirrors[i].rdev, mddev);
+ continue;
+ }
+ /* fixup the bio for reuse */
+ sbio->bi_vcnt = vcnt;
+ sbio->bi_size = r1_bio->sectors << 9;
+ sbio->bi_idx = 0;
+ sbio->bi_phys_segments = 0;
+ sbio->bi_flags &= ~(BIO_POOL_MASK - 1);
+ sbio->bi_flags |= 1 << BIO_UPTODATE;
+ sbio->bi_next = NULL;
+ sbio->bi_sector = r1_bio->sector +
+ conf->mirrors[i].rdev->data_offset;
+ sbio->bi_bdev = conf->mirrors[i].rdev->bdev;
+ size = sbio->bi_size;
+ for (j = 0; j < vcnt ; j++) {
+ struct bio_vec *bi;
+ bi = &sbio->bi_io_vec[j];
+ bi->bv_offset = 0;
+ if (size > PAGE_SIZE)
+ bi->bv_len = PAGE_SIZE;
+ else
+ bi->bv_len = size;
+ size -= PAGE_SIZE;
+ memcpy(page_address(bi->bv_page),
+ page_address(pbio->bi_io_vec[j].bv_page),
+ PAGE_SIZE);
+ }
+ }
+ return 0;
+}
+
+static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio)
+{
+ conf_t *conf = mddev->private;
+ int i;
+ int disks = conf->raid_disks;
+ struct bio *bio, *wbio;
+
+ bio = r1_bio->bios[r1_bio->read_disk];
+
+ if (!test_bit(R1BIO_Uptodate, &r1_bio->state))
+ /* ouch - failed to read all of that. */
+ if (!fix_sync_read_error(r1_bio))
+ return;
+
+ if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
+ if (process_checks(r1_bio) < 0)
+ return;
+ /*
+ * schedule writes
+ */
+ atomic_set(&r1_bio->remaining, 1);
+ for (i = 0; i < disks ; i++) {
+ wbio = r1_bio->bios[i];
+ if (wbio->bi_end_io == NULL ||
+ (wbio->bi_end_io == end_sync_read &&
+ (i == r1_bio->read_disk ||
+ !test_bit(MD_RECOVERY_SYNC, &mddev->recovery))))
+ continue;
+
+ wbio->bi_rw = WRITE;
+ wbio->bi_end_io = end_sync_write;
+ atomic_inc(&r1_bio->remaining);
+ md_sync_acct(conf->mirrors[i].rdev->bdev, wbio->bi_size >> 9);
+
+ generic_make_request(wbio);
+ }
+
+ if (atomic_dec_and_test(&r1_bio->remaining)) {
+ /* if we're here, all write(s) have completed, so clean up */
+ md_done_sync(mddev, r1_bio->sectors, 1);
+ put_buf(r1_bio);
+ }
+}
+
+/*
+ * This is a kernel thread which:
+ *
+ * 1. Retries failed read operations on working mirrors.
+ * 2. Updates the raid superblock when problems are encountered.
+ * 3. Performs writes following reads for array synchronising.
+ */
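+/*
+ * Of the duties above, fix_read_error() below covers the read retries,
+ * while raid1d() walks the retry list and calls sync_request_write()
+ * for the resync writes.
+ */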
+
+static void fix_read_error(conf_t *conf, int read_disk,
+ sector_t sect, int sectors)
+{
+ mddev_t *mddev = conf->mddev;
+ while(sectors) {
+ int s = sectors;
+ int d = read_disk;
+ int success = 0;
+ int start;
+ mdk_rdev_t *rdev;
+
+ if (s > (PAGE_SIZE>>9))
+ s = PAGE_SIZE >> 9;
+
+ do {
+ /* Note: no rcu protection needed here
+ * as this is synchronous in the raid1d thread
+ * which is the thread that might remove
+ * a device. If raid1d ever becomes multi-threaded....
+ */
+ rdev = conf->mirrors[d].rdev;
+ if (rdev &&
+ test_bit(In_sync, &rdev->flags) &&
+ sync_page_io(rdev, sect, s<<9,
+ conf->tmppage, READ, false))
+ success = 1;
+ else {
+ d++;
+ if (d == conf->raid_disks)
+ d = 0;
+ }
+ } while (!success && d != read_disk);
+
+ if (!success) {
+ /* Cannot read from anywhere -- bye bye array */
+ md_error(mddev, conf->mirrors[read_disk].rdev);
+ break;
+ }
+ /* write it back and re-read */
+ start = d;
+ while (d != read_disk) {
+ if (d==0)
+ d = conf->raid_disks;
+ d--;
+ rdev = conf->mirrors[d].rdev;
+ if (rdev &&
+ test_bit(In_sync, &rdev->flags)) {
+ if (sync_page_io(rdev, sect, s<<9,
+ conf->tmppage, WRITE, false)
+ == 0)
+ /* Well, this device is dead */
+ md_error(mddev, rdev);
+ }
+ }
+ d = start;
+ while (d != read_disk) {
+ char b[BDEVNAME_SIZE];
+ if (d==0)
+ d = conf->raid_disks;
+ d--;
+ rdev = conf->mirrors[d].rdev;
+ if (rdev &&
+ test_bit(In_sync, &rdev->flags)) {
+ if (sync_page_io(rdev, sect, s<<9,
+ conf->tmppage, READ, false)
+ == 0)
+ /* Well, this device is dead */
+ md_error(mddev, rdev);
+ else {
+ atomic_add(s, &rdev->corrected_errors);
+ printk(KERN_INFO
+ "md/raid1:%s: read error corrected "
+ "(%d sectors at %llu on %s)\n",
+ mdname(mddev), s,
+ (unsigned long long)(sect +
+ rdev->data_offset),
+ bdevname(rdev->bdev, b));
+ }
+ }
+ }
+ sectors -= s;
+ sect += s;
+ }
+}
+
+static void raid1d(mddev_t *mddev)
+{
+ r1bio_t *r1_bio;
+ struct bio *bio;
+ unsigned long flags;
+ conf_t *conf = mddev->private;
+ struct list_head *head = &conf->retry_list;
+ mdk_rdev_t *rdev;
+ struct blk_plug plug;
+
+ md_check_recovery(mddev);
+
+ blk_start_plug(&plug);
+ for (;;) {
+ char b[BDEVNAME_SIZE];
+
+ if (atomic_read(&mddev->plug_cnt) == 0)
+ flush_pending_writes(conf);
+
+ spin_lock_irqsave(&conf->device_lock, flags);
+ if (list_empty(head)) {
+ spin_unlock_irqrestore(&conf->device_lock, flags);
+ break;
+ }
+ r1_bio = list_entry(head->prev, r1bio_t, retry_list);
+ list_del(head->prev);
+ conf->nr_queued--;
+ spin_unlock_irqrestore(&conf->device_lock, flags);
+
+ mddev = r1_bio->mddev;
+ conf = mddev->private;
+ if (test_bit(R1BIO_IsSync, &r1_bio->state))
+ sync_request_write(mddev, r1_bio);
+ else {
+ int disk;
+
+ /* we got a read error. Maybe the drive is bad, or maybe just
+ * this block, in which case we can fix it.
+ * We freeze all other IO, and try reading the block from
+ * other devices. When we find one, we re-write
+ * and check whether that fixes the read error.
+ * This is all done synchronously while the array is
+ * frozen
+ */
+ if (mddev->ro == 0) {
+ freeze_array(conf);
+ fix_read_error(conf, r1_bio->read_disk,
+ r1_bio->sector,
+ r1_bio->sectors);
+ unfreeze_array(conf);
+ } else
+ md_error(mddev,
+ conf->mirrors[r1_bio->read_disk].rdev);
+
+ bio = r1_bio->bios[r1_bio->read_disk];
+ if ((disk=read_balance(conf, r1_bio)) == -1) {
+ printk(KERN_ALERT "md/raid1:%s: %s: unrecoverable I/O"
+ " read error for block %llu\n",
+ mdname(mddev),
+ bdevname(bio->bi_bdev,b),
+ (unsigned long long)r1_bio->sector);
+ raid_end_bio_io(r1_bio);
+ } else {
+ const unsigned long do_sync = r1_bio->master_bio->bi_rw & REQ_SYNC;
+ r1_bio->bios[r1_bio->read_disk] =
+ mddev->ro ? IO_BLOCKED : NULL;
+ r1_bio->read_disk = disk;
+ bio_put(bio);
+ bio = bio_clone_mddev(r1_bio->master_bio,
+ GFP_NOIO, mddev);
+ r1_bio->bios[r1_bio->read_disk] = bio;
+ rdev = conf->mirrors[disk].rdev;
+ if (printk_ratelimit())
+ printk(KERN_ERR "md/raid1:%s: redirecting sector %llu to"
+ " other mirror: %s\n",
+ mdname(mddev),
+ (unsigned long long)r1_bio->sector,
+ bdevname(rdev->bdev,b));
+ bio->bi_sector = r1_bio->sector + rdev->data_offset;
+ bio->bi_bdev = rdev->bdev;
+ bio->bi_end_io = raid1_end_read_request;
+ bio->bi_rw = READ | do_sync;
+ bio->bi_private = r1_bio;
+ generic_make_request(bio);
+ }
+ }
+ cond_resched();
+ }
+ blk_finish_plug(&plug);
+}
+
+
+static int init_resync(conf_t *conf)
+{
+ int buffs;
+
+ buffs = RESYNC_WINDOW / RESYNC_BLOCK_SIZE;
+ BUG_ON(conf->r1buf_pool);
+ conf->r1buf_pool = mempool_create(buffs, r1buf_pool_alloc, r1buf_pool_free,
+ conf->poolinfo);
+ if (!conf->r1buf_pool)
+ return -ENOMEM;
+ conf->next_resync = 0;
+ return 0;
+}
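+
+/*
+ * With the RESYNC_WINDOW and RESYNC_BLOCK_SIZE values used by this
+ * driver (a 2MB window of 64KB blocks), 'buffs' above works out to 32
+ * reserved r1buf entries.
+ */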
+
+/*
+ * perform a "sync" on one "block"
+ *
+ * We need to make sure that no normal I/O request - particularly write
+ * requests - conflict with active sync requests.
+ *
+ * This is achieved by tracking pending requests and a 'barrier' concept
+ * that can be installed to exclude normal IO requests.
+ */
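+/*
+ * Concretely: sync_request() raises the barrier before issuing resync
+ * IO, and the matching lower_barrier() happens from put_buf() when the
+ * r1bio is released; normal IO uses the wait_barrier()/allow_barrier()
+ * pair instead.
+ */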
+
+static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, int go_faster)
+{
+ conf_t *conf = mddev->private;
+ r1bio_t *r1_bio;
+ struct bio *bio;
+ sector_t max_sector, nr_sectors;
+ int disk = -1;
+ int i;
+ int wonly = -1;
+ int write_targets = 0, read_targets = 0;
+ sector_t sync_blocks;
+ int still_degraded = 0;
+
+ if (!conf->r1buf_pool)
+ if (init_resync(conf))
+ return 0;
+
+ max_sector = mddev->dev_sectors;
+ if (sector_nr >= max_sector) {
+ /* If we aborted, we need to abort the
+ * sync on the 'current' bitmap chunk (there will
+ * only be one in raid1 resync).
+ * We can find the current address in mddev->curr_resync
+ */
+ if (mddev->curr_resync < max_sector) /* aborted */
+ bitmap_end_sync(mddev->bitmap, mddev->curr_resync,
+ &sync_blocks, 1);
+ else /* completed sync */
+ conf->fullsync = 0;
+
+ bitmap_close_sync(mddev->bitmap);
+ close_sync(conf);
+ return 0;
+ }
+
+ if (mddev->bitmap == NULL &&
+ mddev->recovery_cp == MaxSector &&
+ !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) &&
+ conf->fullsync == 0) {
+ *skipped = 1;
+ return max_sector - sector_nr;
+ }
+ /* before building a request, check if we can skip these blocks.
+ * This call to bitmap_start_sync doesn't actually record anything
+ */
+ if (!bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 1) &&
+ !conf->fullsync && !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
+ /* We can skip this block, and probably several more */
+ *skipped = 1;
+ return sync_blocks;
+ }
+ /*
+ * If there is non-resync activity waiting for a turn,
+ * and resync is going fast enough,
+ * then let it through before starting on this new sync request.
+ */
+ if (!go_faster && conf->nr_waiting)
+ msleep_interruptible(1000);
+
+ bitmap_cond_end_sync(mddev->bitmap, sector_nr);
+ r1_bio = mempool_alloc(conf->r1buf_pool, GFP_NOIO);
+ raise_barrier(conf);
+
+ conf->next_resync = sector_nr;
+
+ rcu_read_lock();
+ /*
+ * If we get a correctably read error during resync or recovery,
+ * we might want to read from a different device. So we
+ * flag all drives that could conceivably be read from for READ,
+ * and any others (which will be non-In_sync devices) for WRITE.
+ * If a read fails, we try reading from something else for which READ
+ * is OK.
+ */
+
+ r1_bio->mddev = mddev;
+ r1_bio->sector = sector_nr;
+ r1_bio->state = 0;
+ set_bit(R1BIO_IsSync, &r1_bio->state);
+
+ for (i=0; i < conf->raid_disks; i++) {
+ mdk_rdev_t *rdev;
+ bio = r1_bio->bios[i];
+
+ /* take from bio_init */
+ bio->bi_next = NULL;
+ bio->bi_flags &= ~(BIO_POOL_MASK-1);
+ bio->bi_flags |= 1 << BIO_UPTODATE;
+ bio->bi_comp_cpu = -1;
+ bio->bi_rw = READ;
+ bio->bi_vcnt = 0;
+ bio->bi_idx = 0;
+ bio->bi_phys_segments = 0;
+ bio->bi_size = 0;
+ bio->bi_end_io = NULL;
+ bio->bi_private = NULL;
+
+ rdev = rcu_dereference(conf->mirrors[i].rdev);
+ if (rdev == NULL ||
+ test_bit(Faulty, &rdev->flags)) {
+ still_degraded = 1;
+ continue;
+ } else if (!test_bit(In_sync, &rdev->flags)) {
+ bio->bi_rw = WRITE;
+ bio->bi_end_io = end_sync_write;
+ write_targets ++;
+ } else {
+ /* may need to read from here */
+ bio->bi_rw = READ;
+ bio->bi_end_io = end_sync_read;
+ if (test_bit(WriteMostly, &rdev->flags)) {
+ if (wonly < 0)
+ wonly = i;
+ } else {
+ if (disk < 0)
+ disk = i;
+ }
+ read_targets++;
+ }
+ atomic_inc(&rdev->nr_pending);
+ bio->bi_sector = sector_nr + rdev->data_offset;
+ bio->bi_bdev = rdev->bdev;
+ bio->bi_private = r1_bio;
+ }
+ rcu_read_unlock();
+ if (disk < 0)
+ disk = wonly;
+ r1_bio->read_disk = disk;
+
+ if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) && read_targets > 0)
+ /* extra read targets are also write targets */
+ write_targets += read_targets-1;
+
+ if (write_targets == 0 || read_targets == 0) {
+ /* There is nowhere to write, so all non-sync
+ * drives must be failed - so we are finished
+ */
+ sector_t rv = max_sector - sector_nr;
+ *skipped = 1;
+ put_buf(r1_bio);
+ return rv;
+ }
+
+ if (max_sector > mddev->resync_max)
+ max_sector = mddev->resync_max; /* Don't do IO beyond here */
+ nr_sectors = 0;
+ sync_blocks = 0;
+ do {
+ struct page *page;
+ int len = PAGE_SIZE;
+ if (sector_nr + (len>>9) > max_sector)
+ len = (max_sector - sector_nr) << 9;
+ if (len == 0)
+ break;
+ if (sync_blocks == 0) {
+ if (!bitmap_start_sync(mddev->bitmap, sector_nr,
+ &sync_blocks, still_degraded) &&
+ !conf->fullsync &&
+ !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
+ break;
+ BUG_ON(sync_blocks < (PAGE_SIZE>>9));
+ if ((len >> 9) > sync_blocks)
+ len = sync_blocks<<9;
+ }
+
+ for (i=0 ; i < conf->raid_disks; i++) {
+ bio = r1_bio->bios[i];
+ if (bio->bi_end_io) {
+ page = bio->bi_io_vec[bio->bi_vcnt].bv_page;
+ if (bio_add_page(bio, page, len, 0) == 0) {
+ /* stop here */
+ bio->bi_io_vec[bio->bi_vcnt].bv_page = page;
+ while (i > 0) {
+ i--;
+ bio = r1_bio->bios[i];
+ if (bio->bi_end_io==NULL)
+ continue;
+ /* remove last page from this bio */
+ bio->bi_vcnt--;
+ bio->bi_size -= len;
+ bio->bi_flags &= ~(1<< BIO_SEG_VALID);
+ }
+ goto bio_full;
+ }
+ }
+ }
+ nr_sectors += len>>9;
+ sector_nr += len>>9;
+ sync_blocks -= (len>>9);
+ } while (r1_bio->bios[disk]->bi_vcnt < RESYNC_PAGES);
+ bio_full:
+ r1_bio->sectors = nr_sectors;
+
+ /* For a user-requested sync, we read all readable devices and do a
+ * compare
+ */
+ if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
+ atomic_set(&r1_bio->remaining, read_targets);
+ for (i=0; i<conf->raid_disks; i++) {
+ bio = r1_bio->bios[i];
+ if (bio->bi_end_io == end_sync_read) {
+ md_sync_acct(bio->bi_bdev, nr_sectors);
+ generic_make_request(bio);
+ }
+ }
+ } else {
+ atomic_set(&r1_bio->remaining, 1);
+ bio = r1_bio->bios[r1_bio->read_disk];
+ md_sync_acct(bio->bi_bdev, nr_sectors);
+ generic_make_request(bio);
+
+ }
+ return nr_sectors;
+}
+
+static sector_t raid1_size(mddev_t *mddev, sector_t sectors, int raid_disks)
+{
+ if (sectors)
+ return sectors;
+
+ return mddev->dev_sectors;
+}
+
+static conf_t *setup_conf(mddev_t *mddev)
+{
+ conf_t *conf;
+ int i;
+ mirror_info_t *disk;
+ mdk_rdev_t *rdev;
+ int err = -ENOMEM;
+
+ conf = kzalloc(sizeof(conf_t), GFP_KERNEL);
+ if (!conf)
+ goto abort;
+
+ conf->mirrors = kzalloc(sizeof(struct mirror_info)*mddev->raid_disks,
+ GFP_KERNEL);
+ if (!conf->mirrors)
+ goto abort;
+
+ conf->tmppage = alloc_page(GFP_KERNEL);
+ if (!conf->tmppage)
+ goto abort;
+
+ conf->poolinfo = kzalloc(sizeof(*conf->poolinfo), GFP_KERNEL);
+ if (!conf->poolinfo)
+ goto abort;
+ conf->poolinfo->raid_disks = mddev->raid_disks;
+ conf->r1bio_pool = mempool_create(NR_RAID1_BIOS, r1bio_pool_alloc,
+ r1bio_pool_free,
+ conf->poolinfo);
+ if (!conf->r1bio_pool)
+ goto abort;
+
+ conf->poolinfo->mddev = mddev;
+
+ spin_lock_init(&conf->device_lock);
+ list_for_each_entry(rdev, &mddev->disks, same_set) {
+ int disk_idx = rdev->raid_disk;
+ if (disk_idx >= mddev->raid_disks
+ || disk_idx < 0)
+ continue;
+ disk = conf->mirrors + disk_idx;
+
+ disk->rdev = rdev;
+
+ disk->head_position = 0;
+ }
+ conf->raid_disks = mddev->raid_disks;
+ conf->mddev = mddev;
+ INIT_LIST_HEAD(&conf->retry_list);
+
+ spin_lock_init(&conf->resync_lock);
+ init_waitqueue_head(&conf->wait_barrier);
+
+<<<<<<<
+ bio_list_init(&conf->pending_bio_list);
+
+ conf->last_used = -1;
+ for (i = 0; i < conf->raid_disks; i++) {
+
+ disk = conf->mirrors + i;
+
+ if (!disk->rdev ||
+ !test_bit(In_sync, &disk->rdev->flags)) {
+ disk->head_position = 0;
+ if (disk->rdev)
+ conf->fullsync = 1;
+ } else if (conf->last_used < 0)
+ /*
+ * The first working device is used as a
+ * starting point for read balancing.
+ */
+ conf->last_used = i;
+ }
+
+ err = -EIO;
+ if (conf->last_used < 0) {
+ printk(KERN_ERR "md/raid1:%s: no operational mirrors\n",
+ mdname(mddev));
+ goto abort;
+ }
+ err = -ENOMEM;
+ conf->thread = md_register_thread(raid1d, mddev, NULL);
+ if (!conf->thread) {
+ printk(KERN_ERR
+ "md/raid1:%s: couldn't allocate thread\n",
+ mdname(mddev));
+ goto abort;
+ }
+
+ return conf;
+
+ abort:
+ if (conf) {
+ if (conf->r1bio_pool)
+ mempool_destroy(conf->r1bio_pool);
+ kfree(conf->mirrors);
+ safe_put_page(conf->tmppage);
+ kfree(conf->poolinfo);
+ kfree(conf);
+ }
+ return ERR_PTR(err);
+}
+
+static int run(mddev_t *mddev)
+{
+ conf_t *conf;
+ int i;
+ mdk_rdev_t *rdev;
+
+ if (mddev->level != 1) {
+ printk(KERN_ERR "md/raid1:%s: raid level not set to mirroring (%d)\n",
+ mdname(mddev), mddev->level);
+ return -EIO;
+ }
+ if (mddev->reshape_position != MaxSector) {
+ printk(KERN_ERR "md/raid1:%s: reshape_position set but not supported\n",
+ mdname(mddev));
+ return -EIO;
+ }
+ /*
+ * copy the already verified devices into our private RAID1
+ * bookkeeping area. [whatever we allocate in run(),
+ * should be freed in stop()]
+ */
+ if (mddev->private == NULL)
+ conf = setup_conf(mddev);
+ else
+ conf = mddev->private;
+
+ if (IS_ERR(conf))
+ return PTR_ERR(conf);
+
+ list_for_each_entry(rdev, &mddev->disks, same_set) {
+ if (!mddev->gendisk)
+ continue;
+ disk_stack_limits(mddev->gendisk, rdev->bdev,
+ rdev->data_offset << 9);
+ /* as we don't honour merge_bvec_fn, we must never risk
+ * violating it, so limit ->max_segments to 1 lying within
+ * a single page, as a one page request is never in violation.
+ */
+ if (rdev->bdev->bd_disk->queue->merge_bvec_fn) {
+ blk_queue_max_segments(mddev->queue, 1);
+ blk_queue_segment_boundary(mddev->queue,
+ PAGE_CACHE_SIZE - 1);
+ }
+ }
+
+ mddev->degraded = 0;
+ for (i=0; i < conf->raid_disks; i++)
+ if (conf->mirrors[i].rdev == NULL ||
+ !test_bit(In_sync, &conf->mirrors[i].rdev->flags) ||
+ test_bit(Faulty, &conf->mirrors[i].rdev->flags))
+ mddev->degraded++;
+
+ if (conf->raid_disks - mddev->degraded == 1)
+ mddev->recovery_cp = MaxSector;
+
+ if (mddev->recovery_cp != MaxSector)
+ printk(KERN_NOTICE "md/raid1:%s: not clean"
+ " -- starting background reconstruction\n",
+ mdname(mddev));
+ printk(KERN_INFO
+ "md/raid1:%s: active with %d out of %d mirrors\n",
+ mdname(mddev), mddev->raid_disks - mddev->degraded,
+ mddev->raid_disks);
+
+ /*
+ * Ok, everything is just fine now
+ */
+ mddev->thread = conf->thread;
+ conf->thread = NULL;
+ mddev->private = conf;
+
+ md_set_array_sectors(mddev, raid1_size(mddev, 0, 0));
+
+ if (mddev->queue) {
+ mddev->queue->backing_dev_info.congested_fn = raid1_congested;
+ mddev->queue->backing_dev_info.congested_data = mddev;
+ }
+ return md_integrity_register(mddev);
+}
+
+static int stop(mddev_t *mddev)
+{
+ conf_t *conf = mddev->private;
+ struct bitmap *bitmap = mddev->bitmap;
+
+ /* wait for behind writes to complete */
+ if (bitmap && atomic_read(&bitmap->behind_writes) > 0) {
+ printk(KERN_INFO "md/raid1:%s: behind writes in progress - waiting to stop.\n",
+ mdname(mddev));
+ /* need to kick something here to make sure I/O goes? */
+ wait_event(bitmap->behind_wait,
+ atomic_read(&bitmap->behind_writes) == 0);
+ }
+
+ raise_barrier(conf);
+ lower_barrier(conf);
+
+ md_unregister_thread(mddev->thread);
+ mddev->thread = NULL;
+ if (conf->r1bio_pool)
+ mempool_destroy(conf->r1bio_pool);
+ kfree(conf->mirrors);
+ kfree(conf->poolinfo);
+ kfree(conf);
+ mddev->private = NULL;
+ return 0;
+}
+
+static int raid1_resize(mddev_t *mddev, sector_t sectors)
+{
+ /* no resync is happening, and there is enough space
+ * on all devices, so we can resize.
+ * We need to make sure resync covers any new space.
+ * If the array is shrinking we should possibly wait until
+ * any io in the removed space completes, but it hardly seems
+ * worth it.
+ */
+ md_set_array_sectors(mddev, raid1_size(mddev, sectors, 0));
+ if (mddev->array_sectors > raid1_size(mddev, sectors, 0))
+ return -EINVAL;
+ set_capacity(mddev->gendisk, mddev->array_sectors);
+ revalidate_disk(mddev->gendisk);
+ if (sectors > mddev->dev_sectors &&
+ mddev->recovery_cp > mddev->dev_sectors) {
+ mddev->recovery_cp = mddev->dev_sectors;
+ set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+ }
+ mddev->dev_sectors = sectors;
+ mddev->resync_max_sectors = sectors;
+ return 0;
+}
+
+static int raid1_reshape(mddev_t *mddev)
+{
+ /* We need to:
+ * 1/ resize the r1bio_pool
+ * 2/ resize conf->mirrors
+ *
+ * We allocate a new r1bio_pool if we can.
+ * Then raise a device barrier and wait until all IO stops.
+ * Then resize conf->mirrors and swap in the new r1bio pool.
+ *
+ * At the same time, we "pack" the devices so that all the missing
+ * devices have the higher raid_disk numbers.
+ */
+ mempool_t *newpool, *oldpool;
+ struct pool_info *newpoolinfo;
+ mirror_info_t *newmirrors;
+ conf_t *conf = mddev->private;
+ int cnt, raid_disks;
+ unsigned long flags;
+ int d, d2, err;
+
+ /* Cannot change chunk_size, layout, or level */
+ if (mddev->chunk_sectors != mddev->new_chunk_sectors ||
+ mddev->layout != mddev->new_layout ||
+ mddev->level != mddev->new_level) {
+ mddev->new_chunk_sectors = mddev->chunk_sectors;
+ mddev->new_layout = mddev->layout;
+ mddev->new_level = mddev->level;
+ return -EINVAL;
+ }
+
+ err = md_allow_write(mddev);
+ if (err)
+ return err;
+
+ raid_disks = mddev->raid_disks + mddev->delta_disks;
+
+ if (raid_disks < conf->raid_disks) {
+ cnt=0;
+ for (d= 0; d < conf->raid_disks; d++)
+ if (conf->mirrors[d].rdev)
+ cnt++;
+ if (cnt > raid_disks)
+ return -EBUSY;
+ }
+
+ newpoolinfo = kmalloc(sizeof(*newpoolinfo), GFP_KERNEL);
+ if (!newpoolinfo)
+ return -ENOMEM;
+ newpoolinfo->mddev = mddev;
+ newpoolinfo->raid_disks = raid_disks;
+
+ newpool = mempool_create(NR_RAID1_BIOS, r1bio_pool_alloc,
+ r1bio_pool_free, newpoolinfo);
+ if (!newpool) {
+ kfree(newpoolinfo);
+ return -ENOMEM;
+ }
+ newmirrors = kzalloc(sizeof(struct mirror_info) * raid_disks, GFP_KERNEL);
+ if (!newmirrors) {
+ kfree(newpoolinfo);
+ mempool_destroy(newpool);
+ return -ENOMEM;
+ }
+
+ raise_barrier(conf);
+
+ /* ok, everything is stopped */
+ oldpool = conf->r1bio_pool;
+ conf->r1bio_pool = newpool;
+
+ for (d = d2 = 0; d < conf->raid_disks; d++) {
+ mdk_rdev_t *rdev = conf->mirrors[d].rdev;
+ if (rdev && rdev->raid_disk != d2) {
+ char nm[20];
+ sprintf(nm, "rd%d", rdev->raid_disk);
+ sysfs_remove_link(&mddev->kobj, nm);
+ rdev->raid_disk = d2;
+ sprintf(nm, "rd%d", rdev->raid_disk);
+ sysfs_remove_link(&mddev->kobj, nm);
+ if (sysfs_create_link(&mddev->kobj,
+ &rdev->kobj, nm))
+ printk(KERN_WARNING
+ "md/raid1:%s: cannot register "
+ "%s\n",
+ mdname(mddev), nm);
+ }
+ if (rdev)
+ newmirrors[d2++].rdev = rdev;
+ }
+ kfree(conf->mirrors);
+ conf->mirrors = newmirrors;
+ kfree(conf->poolinfo);
+ conf->poolinfo = newpoolinfo;
+
+ spin_lock_irqsave(&conf->device_lock, flags);
+ mddev->degraded += (raid_disks - conf->raid_disks);
+ spin_unlock_irqrestore(&conf->device_lock, flags);
+ conf->raid_disks = mddev->raid_disks = raid_disks;
+ mddev->delta_disks = 0;
+
+ conf->last_used = 0; /* just make sure it is in-range */
+ lower_barrier(conf);
+
+ set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+ md_wakeup_thread(mddev->thread);
+
+ mempool_destroy(oldpool);
+ return 0;
+}
+
+static void raid1_quiesce(mddev_t *mddev, int state)
+{
+ conf_t *conf = mddev->private;
+
+ switch(state) {
+ case 2: /* wake for suspend */
+ wake_up(&conf->wait_barrier);
+ break;
+ case 1:
+ raise_barrier(conf);
+ break;
+ case 0:
+ lower_barrier(conf);
+ break;
+ }
+}
+
+static void *raid1_takeover(mddev_t *mddev)
+{
+ /* raid1 can take over:
+ * raid5 with 2 devices, any layout or chunk size
+ */
+ if (mddev->level == 5 && mddev->raid_disks == 2) {
+ conf_t *conf;
+ mddev->new_level = 1;
+ mddev->new_layout = 0;
+ mddev->new_chunk_sectors = 0;
+ conf = setup_conf(mddev);
+ if (!IS_ERR(conf))
+ conf->barrier = 1;
+ return conf;
+ }
+ return ERR_PTR(-EINVAL);
+}
+
+static struct mdk_personality raid1_personality =
+{
+ .name = "raid1",
+ .level = 1,
+ .owner = THIS_MODULE,
+ .make_request = make_request,
+ .run = run,
+ .stop = stop,
+ .status = status,
+ .error_handler = error,
+ .hot_add_disk = raid1_add_disk,
+ .hot_remove_disk= raid1_remove_disk,
+ .spare_active = raid1_spare_active,
+ .sync_request = sync_request,
+ .resize = raid1_resize,
+ .size = raid1_size,
+ .check_reshape = raid1_reshape,
+ .quiesce = raid1_quiesce,
+ .takeover = raid1_takeover,
+};
+
+static int __init raid_init(void)
+{
+ return register_md_personality(&raid1_personality);
+}
+
+static void raid_exit(void)
+{
+ unregister_md_personality(&raid1_personality);
+}
+
+module_init(raid_init);
+module_exit(raid_exit);
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("RAID1 (mirroring) personality for MD");
+MODULE_ALIAS("md-personality-3"); /* RAID1 */
+MODULE_ALIAS("md-raid1");
+MODULE_ALIAS("md-level-1");
+
+module_param(max_queued, int, S_IRUGO|S_IWUSR);
+|||||||
+ bio_list_init(&conf->pending_bio_list);
+ bio_list_init(&conf->flushing_bio_list);
+
+
+=======
+ bio_list_init(&conf->pending_bio_list);
+ conf->pending_count = 0;
+ bio_list_init(&conf->flushing_bio_list);
+
+
+>>>>>>>
diff --git a/tests/linux/raid1-A/orig b/tests/linux/raid1-A/orig
new file mode 100644
index 0000000..fe7bb27
--- /dev/null
+++ b/tests/linux/raid1-A/orig
@@ -0,0 +1,2273 @@
+/*
+ * raid1.c : Multiple Devices driver for Linux
+ *
+ * Copyright (C) 1999, 2000, 2001 Ingo Molnar, Red Hat
+ *
+ * Copyright (C) 1996, 1997, 1998 Ingo Molnar, Miguel de Icaza, Gadi Oxman
+ *
+ * RAID-1 management functions.
+ *
+ * Better read-balancing code written by Mika Kuoppala <miku@iki.fi>, 2000
+ *
+ * Fixes to reconstruction by Jakob Østergaard <jakob@ostenfeld.dk>
+ * Various fixes by Neil Brown <neilb@cse.unsw.edu.au>
+ *
+ * Changes by Peter T. Breuer <ptb@it.uc3m.es> 31/1/2003 to support
+ * bitmapped intelligence in resync:
+ *
+ * - bitmap marked during normal i/o
+ * - bitmap used to skip nondirty blocks during sync
+ *
+ * Additions to bitmap code, (C) 2003-2004 Paul Clements, SteelEye Technology:
+ * - persistent bitmap code
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ *
+ * You should have received a copy of the GNU General Public License
+ * (for example /usr/src/linux/COPYING); if not, write to the Free
+ * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include <linux/slab.h>
+#include <linux/delay.h>
+#include <linux/blkdev.h>
+#include <linux/seq_file.h>
+#include "md.h"
+#include "raid1.h"
+#include "bitmap.h"
+
+#define DEBUG 0
+#if DEBUG
+#define PRINTK(x...) printk(x)
+#else
+#define PRINTK(x...)
+#endif
+
+/*
+ * Number of guaranteed r1bios in case of extreme VM load:
+ */
+#define NR_RAID1_BIOS 256
+
+
+static void allow_barrier(conf_t *conf);
+static void lower_barrier(conf_t *conf);
+
+static void * r1bio_pool_alloc(gfp_t gfp_flags, void *data)
+{
+ struct pool_info *pi = data;
+ int size = offsetof(r1bio_t, bios[pi->raid_disks]);
+
+ /* allocate a r1bio with room for raid_disks entries in the bios array */
+ return kzalloc(size, gfp_flags);
+}
+
+static void r1bio_pool_free(void *r1_bio, void *data)
+{
+ kfree(r1_bio);
+}
+
+#define RESYNC_BLOCK_SIZE (64*1024)
+//#define RESYNC_BLOCK_SIZE PAGE_SIZE
+#define RESYNC_SECTORS (RESYNC_BLOCK_SIZE >> 9)
+#define RESYNC_PAGES ((RESYNC_BLOCK_SIZE + PAGE_SIZE-1) / PAGE_SIZE)
+#define RESYNC_WINDOW (2048*1024)
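+/*
+ * Assuming 4KB pages, these give RESYNC_SECTORS = 128 and
+ * RESYNC_PAGES = 16, and RESYNC_WINDOW / RESYNC_BLOCK_SIZE = 32 is the
+ * number of r1buf entries reserved by init_resync().
+ */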
+
+static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data)
+{
+ struct pool_info *pi = data;
+ struct page *page;
+ r1bio_t *r1_bio;
+ struct bio *bio;
+ int i, j;
+
+ r1_bio = r1bio_pool_alloc(gfp_flags, pi);
+ if (!r1_bio)
+ return NULL;
+
+ /*
+ * Allocate bios : 1 for reading, n-1 for writing
+ */
+ for (j = pi->raid_disks ; j-- ; ) {
+ bio = bio_kmalloc(gfp_flags, RESYNC_PAGES);
+ if (!bio)
+ goto out_free_bio;
+ r1_bio->bios[j] = bio;
+ }
+ /*
+ * Allocate RESYNC_PAGES data pages and attach them to
+ * the first bio.
+ * If this is a user-requested check/repair, allocate
+ * RESYNC_PAGES for each bio.
+ */
+ if (test_bit(MD_RECOVERY_REQUESTED, &pi->mddev->recovery))
+ j = pi->raid_disks;
+ else
+ j = 1;
+ while(j--) {
+ bio = r1_bio->bios[j];
+ for (i = 0; i < RESYNC_PAGES; i++) {
+ page = alloc_page(gfp_flags);
+ if (unlikely(!page))
+ goto out_free_pages;
+
+ bio->bi_io_vec[i].bv_page = page;
+ bio->bi_vcnt = i+1;
+ }
+ }
+ /* If not user-requested, copy the page pointers to all bios */
+ if (!test_bit(MD_RECOVERY_REQUESTED, &pi->mddev->recovery)) {
+ for (i=0; i<RESYNC_PAGES ; i++)
+ for (j=1; j<pi->raid_disks; j++)
+ r1_bio->bios[j]->bi_io_vec[i].bv_page =
+ r1_bio->bios[0]->bi_io_vec[i].bv_page;
+ }
+
+ r1_bio->master_bio = NULL;
+
+ return r1_bio;
+
+out_free_pages:
+ for (j=0 ; j < pi->raid_disks; j++)
+ for (i=0; i < r1_bio->bios[j]->bi_vcnt ; i++)
+ put_page(r1_bio->bios[j]->bi_io_vec[i].bv_page);
+ j = -1;
+out_free_bio:
+ while ( ++j < pi->raid_disks )
+ bio_put(r1_bio->bios[j]);
+ r1bio_pool_free(r1_bio, data);
+ return NULL;
+}
+
+static void r1buf_pool_free(void *__r1_bio, void *data)
+{
+ struct pool_info *pi = data;
+ int i,j;
+ r1bio_t *r1bio = __r1_bio;
+
+ for (i = 0; i < RESYNC_PAGES; i++)
+ for (j = pi->raid_disks; j-- ;) {
+ if (j == 0 ||
+ r1bio->bios[j]->bi_io_vec[i].bv_page !=
+ r1bio->bios[0]->bi_io_vec[i].bv_page)
+ safe_put_page(r1bio->bios[j]->bi_io_vec[i].bv_page);
+ }
+ for (i=0 ; i < pi->raid_disks; i++)
+ bio_put(r1bio->bios[i]);
+
+ r1bio_pool_free(r1bio, data);
+}
+
+static void put_all_bios(conf_t *conf, r1bio_t *r1_bio)
+{
+ int i;
+
+ for (i = 0; i < conf->raid_disks; i++) {
+ struct bio **bio = r1_bio->bios + i;
+ if (*bio && *bio != IO_BLOCKED)
+ bio_put(*bio);
+ *bio = NULL;
+ }
+}
+
+static void free_r1bio(r1bio_t *r1_bio)
+{
+ conf_t *conf = r1_bio->mddev->private;
+
+ /*
+ * Wake up any possible resync thread that waits for the device
+ * to go idle.
+ */
+ allow_barrier(conf);
+
+ put_all_bios(conf, r1_bio);
+ mempool_free(r1_bio, conf->r1bio_pool);
+}
+
+static void put_buf(r1bio_t *r1_bio)
+{
+ conf_t *conf = r1_bio->mddev->private;
+ int i;
+
+ for (i=0; i<conf->raid_disks; i++) {
+ struct bio *bio = r1_bio->bios[i];
+ if (bio->bi_end_io)
+ rdev_dec_pending(conf->mirrors[i].rdev, r1_bio->mddev);
+ }
+
+ mempool_free(r1_bio, conf->r1buf_pool);
+
+ lower_barrier(conf);
+}
+
+static void reschedule_retry(r1bio_t *r1_bio)
+{
+ unsigned long flags;
+ mddev_t *mddev = r1_bio->mddev;
+ conf_t *conf = mddev->private;
+
+ spin_lock_irqsave(&conf->device_lock, flags);
+ list_add(&r1_bio->retry_list, &conf->retry_list);
+ conf->nr_queued ++;
+ spin_unlock_irqrestore(&conf->device_lock, flags);
+
+ wake_up(&conf->wait_barrier);
+ md_wakeup_thread(mddev->thread);
+}
+
+/*
+ * raid_end_bio_io() is called when we have finished servicing a mirrored
+ * operation and are ready to return a success/failure code to the buffer
+ * cache layer.
+ */
+static void raid_end_bio_io(r1bio_t *r1_bio)
+{
+ struct bio *bio = r1_bio->master_bio;
+
+ /* if nobody has done the final endio yet, do it now */
+ if (!test_and_set_bit(R1BIO_Returned, &r1_bio->state)) {
+ PRINTK(KERN_DEBUG "raid1: sync end %s on sectors %llu-%llu\n",
+ (bio_data_dir(bio) == WRITE) ? "write" : "read",
+ (unsigned long long) bio->bi_sector,
+ (unsigned long long) bio->bi_sector +
+ (bio->bi_size >> 9) - 1);
+
+ bio_endio(bio,
+ test_bit(R1BIO_Uptodate, &r1_bio->state) ? 0 : -EIO);
+ }
+ free_r1bio(r1_bio);
+}
+
+/*
+ * Update disk head position estimator based on IRQ completion info.
+ */
+static inline void update_head_pos(int disk, r1bio_t *r1_bio)
+{
+ conf_t *conf = r1_bio->mddev->private;
+
+ conf->mirrors[disk].head_position =
+ r1_bio->sector + (r1_bio->sectors);
+}
+
+static void raid1_end_read_request(struct bio *bio, int error)
+{
+ int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
+ r1bio_t *r1_bio = bio->bi_private;
+ int mirror;
+ conf_t *conf = r1_bio->mddev->private;
+
+ mirror = r1_bio->read_disk;
+ /*
+ * this branch is our 'one mirror IO has finished' event handler:
+ */
+ update_head_pos(mirror, r1_bio);
+
+ if (uptodate)
+ set_bit(R1BIO_Uptodate, &r1_bio->state);
+ else {
+ /* If all other devices have failed, we want to return
+ * the error upwards rather than fail the last device.
+ * Here we redefine "uptodate" to mean "Don't want to retry"
+ */
+ unsigned long flags;
+ spin_lock_irqsave(&conf->device_lock, flags);
+ if (r1_bio->mddev->degraded == conf->raid_disks ||
+ (r1_bio->mddev->degraded == conf->raid_disks-1 &&
+ !test_bit(Faulty, &conf->mirrors[mirror].rdev->flags)))
+ uptodate = 1;
+ spin_unlock_irqrestore(&conf->device_lock, flags);
+ }
+
+ if (uptodate)
+ raid_end_bio_io(r1_bio);
+ else {
+ /*
+ * oops, read error:
+ */
+ char b[BDEVNAME_SIZE];
+ if (printk_ratelimit())
+ printk(KERN_ERR "md/raid1:%s: %s: rescheduling sector %llu\n",
+ mdname(conf->mddev),
+ bdevname(conf->mirrors[mirror].rdev->bdev,b), (unsigned long long)r1_bio->sector);
+ reschedule_retry(r1_bio);
+ }
+
+ rdev_dec_pending(conf->mirrors[mirror].rdev, conf->mddev);
+}
+
+static void r1_bio_write_done(r1bio_t *r1_bio)
+{
+ if (atomic_dec_and_test(&r1_bio->remaining))
+ {
+ /* it really is the end of this request */
+ if (test_bit(R1BIO_BehindIO, &r1_bio->state)) {
+ /* free extra copy of the data pages */
+ int i = r1_bio->behind_page_count;
+ while (i--)
+ safe_put_page(r1_bio->behind_pages[i]);
+ kfree(r1_bio->behind_pages);
+ r1_bio->behind_pages = NULL;
+ }
+ /* clear the bitmap if all writes complete successfully */
+ bitmap_endwrite(r1_bio->mddev->bitmap, r1_bio->sector,
+ r1_bio->sectors,
+ !test_bit(R1BIO_Degraded, &r1_bio->state),
+ test_bit(R1BIO_BehindIO, &r1_bio->state));
+ md_write_end(r1_bio->mddev);
+ raid_end_bio_io(r1_bio);
+ }
+}
+
+static void raid1_end_write_request(struct bio *bio, int error)
+{
+ int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
+ r1bio_t *r1_bio = bio->bi_private;
+ int mirror, behind = test_bit(R1BIO_BehindIO, &r1_bio->state);
+ conf_t *conf = r1_bio->mddev->private;
+ struct bio *to_put = NULL;
+
+
+ for (mirror = 0; mirror < conf->raid_disks; mirror++)
+ if (r1_bio->bios[mirror] == bio)
+ break;
+
+ /*
+ * 'one mirror IO has finished' event handler:
+ */
+ r1_bio->bios[mirror] = NULL;
+ to_put = bio;
+ if (!uptodate) {
+ md_error(r1_bio->mddev, conf->mirrors[mirror].rdev);
+ /* an I/O failed, we can't clear the bitmap */
+ set_bit(R1BIO_Degraded, &r1_bio->state);
+ } else
+ /*
+ * Set R1BIO_Uptodate in our master bio, so that we
+ * will return a good error code to the higher
+ * levels even if IO on some other mirrored buffer
+ * fails.
+ *
+ * The 'master' represents the composite IO operation
+ * to user-side. So if something waits for IO, then it
+ * will wait for the 'master' bio.
+ */
+ set_bit(R1BIO_Uptodate, &r1_bio->state);
+
+ update_head_pos(mirror, r1_bio);
+
+ if (behind) {
+ if (test_bit(WriteMostly, &conf->mirrors[mirror].rdev->flags))
+ atomic_dec(&r1_bio->behind_remaining);
+
+ /*
+ * In behind mode, we ACK the master bio once the I/O
+ * has safely reached all non-writemostly
+ * disks. Setting the Returned bit ensures that this
+ * gets done only once -- we don't ever want to return
+ * -EIO here, instead we'll wait
+ */
+ if (atomic_read(&r1_bio->behind_remaining) >= (atomic_read(&r1_bio->remaining)-1) &&
+ test_bit(R1BIO_Uptodate, &r1_bio->state)) {
+ /* Maybe we can return now */
+ if (!test_and_set_bit(R1BIO_Returned, &r1_bio->state)) {
+ struct bio *mbio = r1_bio->master_bio;
+ PRINTK(KERN_DEBUG "raid1: behind end write sectors %llu-%llu\n",
+ (unsigned long long) mbio->bi_sector,
+ (unsigned long long) mbio->bi_sector +
+ (mbio->bi_size >> 9) - 1);
+ bio_endio(mbio, 0);
+ }
+ }
+ }
+ rdev_dec_pending(conf->mirrors[mirror].rdev, conf->mddev);
+
+ /*
+ * Let's see if all mirrored write operations have finished
+ * already.
+ */
+ r1_bio_write_done(r1_bio);
+
+ if (to_put)
+ bio_put(to_put);
+}
+
+
+/*
+ * This routine returns the disk from which the requested read should
+ * be done. There is a per-array 'next expected sequential IO' sector
+ * number - if this matches on the next IO then we use the last disk.
+ * There is also a per-disk 'last known head position' sector that is
+ * maintained from IRQ contexts, both the normal and the resync IO
+ * completion handlers update this position correctly. If there is no
+ * perfect sequential match then we pick the disk whose head is closest.
+ *
+ * If there are 2 mirrors in the same 2 devices, performance degrades
+ * because position is mirror, not device based.
+ *
+ * The rdev for the device selected will have nr_pending incremented.
+ */
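+/*
+ * The nr_pending reference taken here is dropped again via
+ * rdev_dec_pending() in the completion handlers (for example in
+ * raid1_end_read_request()) once the IO finishes.
+ */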
+static int read_balance(conf_t *conf, r1bio_t *r1_bio)
+{
+ const sector_t this_sector = r1_bio->sector;
+ const int sectors = r1_bio->sectors;
+ int start_disk;
+ int best_disk;
+ int i;
+ sector_t best_dist;
+ mdk_rdev_t *rdev;
+ int choose_first;
+
+ rcu_read_lock();
+ /*
+ * Check if we can balance. We can balance on the whole
+ * device if no resync is going on, or below the resync window.
+ * We take the first readable disk when above the resync window.
+ */
+ retry:
+ best_disk = -1;
+ best_dist = MaxSector;
+ if (conf->mddev->recovery_cp < MaxSector &&
+ (this_sector + sectors >= conf->next_resync)) {
+ choose_first = 1;
+ start_disk = 0;
+ } else {
+ choose_first = 0;
+ start_disk = conf->last_used;
+ }
+
+ for (i = 0 ; i < conf->raid_disks ; i++) {
+ sector_t dist;
+ int disk = start_disk + i;
+ if (disk >= conf->raid_disks)
+ disk -= conf->raid_disks;
+
+ rdev = rcu_dereference(conf->mirrors[disk].rdev);
+ if (r1_bio->bios[disk] == IO_BLOCKED
+ || rdev == NULL
+ || test_bit(Faulty, &rdev->flags))
+ continue;
+ if (!test_bit(In_sync, &rdev->flags) &&
+ rdev->recovery_offset < this_sector + sectors)
+ continue;
+ if (test_bit(WriteMostly, &rdev->flags)) {
+ /* Don't balance among write-mostly, just
+ * use the first as a last resort */
+ if (best_disk < 0)
+ best_disk = disk;
+ continue;
+ }
+ /* This is a reasonable device to use. It might
+ * even be best.
+ */
+ dist = abs(this_sector - conf->mirrors[disk].head_position);
+ if (choose_first
+ /* Don't change to another disk for sequential reads */
+ || conf->next_seq_sect == this_sector
+ || dist == 0
+ /* If device is idle, use it */
+ || atomic_read(&rdev->nr_pending) == 0) {
+ best_disk = disk;
+ break;
+ }
+ if (dist < best_dist) {
+ best_dist = dist;
+ best_disk = disk;
+ }
+ }
+
+ if (best_disk >= 0) {
+ rdev = rcu_dereference(conf->mirrors[best_disk].rdev);
+ if (!rdev)
+ goto retry;
+ atomic_inc(&rdev->nr_pending);
+ if (test_bit(Faulty, &rdev->flags)) {
+ /* cannot risk returning a device that failed
+ * before we inc'ed nr_pending
+ */
+ rdev_dec_pending(rdev, conf->mddev);
+ goto retry;
+ }
+ conf->next_seq_sect = this_sector + sectors;
+ conf->last_used = best_disk;
+ }
+ rcu_read_unlock();
+
+ return best_disk;
+}
+
+int md_raid1_congested(mddev_t *mddev, int bits)
+{
+ conf_t *conf = mddev->private;
+ int i, ret = 0;
+
+ rcu_read_lock();
+ for (i = 0; i < mddev->raid_disks; i++) {
+ mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev);
+ if (rdev && !test_bit(Faulty, &rdev->flags)) {
+ struct request_queue *q = bdev_get_queue(rdev->bdev);
+
+ BUG_ON(!q);
+
+ /* Note the '|| 1' - when read_balance prefers
+ * non-congested targets, it can be removed
+ */
+ if ((bits & (1<<BDI_async_congested)) || 1)
+ ret |= bdi_congested(&q->backing_dev_info, bits);
+ else
+ ret &= bdi_congested(&q->backing_dev_info, bits);
+ }
+ }
+ rcu_read_unlock();
+ return ret;
+}
+EXPORT_SYMBOL_GPL(md_raid1_congested);
+
+static int max_queued = INT_MAX;
+static int raid1_congested(void *data, int bits)
+{
+ mddev_t *mddev = data;
+
+ return mddev_congested(mddev, bits) ||
+ md_raid1_congested(mddev, bits);
+}
+
+static void flush_pending_writes(conf_t *conf)
+{
+ /* Any writes that have been queued but are awaiting
+ * bitmap updates get flushed here.
+ */
+ spin_lock_irq(&conf->device_lock);
+
+ if (conf->pending_bio_list.head) {
+ struct bio *bio;
+ bio = bio_list_get(&conf->pending_bio_list);
+ spin_unlock_irq(&conf->device_lock);
+ /* flush any pending bitmap writes to
+ * disk before proceeding w/ I/O */
+ bitmap_unplug(conf->mddev->bitmap);
+
+ while (bio) { /* submit pending writes */
+ struct bio *next = bio->bi_next;
+ bio->bi_next = NULL;
+ generic_make_request(bio);
+ bio = next;
+ }
+ } else
+ spin_unlock_irq(&conf->device_lock);
+}
+
+/* Barriers....
+ * Sometimes we need to suspend IO while we do something else,
+ * either some resync/recovery, or reconfigure the array.
+ * To do this we raise a 'barrier'.
+ * The 'barrier' is a counter that can be raised multiple times
+ * to count how many activities are happening which preclude
+ * normal IO.
+ * We can only raise the barrier if there is no pending IO.
+ * i.e. if nr_pending == 0.
+ * We choose only to raise the barrier if no-one is waiting for the
+ * barrier to go down. This means that as soon as an IO request
+ * is ready, no other operations which require a barrier will start
+ * until the IO request has had a chance.
+ *
+ * So: regular IO calls 'wait_barrier'. When that returns there
+ * is no background IO happening. It must arrange to call
+ * allow_barrier when it has finished its IO.
+ * Background IO calls must call raise_barrier. Once that returns
+ * there is no normal IO happening. It must arrange to call
+ * lower_barrier when the particular background IO completes.
+ */
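+/*
+ * Illustrative pairing of the calls described above:
+ *   regular IO:      wait_barrier(conf);  ... submit bio ...  allow_barrier(conf);
+ *   background IO:   raise_barrier(conf); ... resync IO ...   lower_barrier(conf);
+ */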
+#define RESYNC_DEPTH 32
+
+static void raise_barrier(conf_t *conf)
+{
+ spin_lock_irq(&conf->resync_lock);
+
+ /* Wait until no block IO is waiting */
+ wait_event_lock_irq(conf->wait_barrier, !conf->nr_waiting,
+ conf->resync_lock, );
+
+ /* block any new IO from starting */
+ conf->barrier++;
+
+ /* Now wait for all pending IO to complete */
+ wait_event_lock_irq(conf->wait_barrier,
+ !conf->nr_pending && conf->barrier < RESYNC_DEPTH,
+ conf->resync_lock, );
+
+ spin_unlock_irq(&conf->resync_lock);
+}
+
+static void lower_barrier(conf_t *conf)
+{
+ unsigned long flags;
+ BUG_ON(conf->barrier <= 0);
+ spin_lock_irqsave(&conf->resync_lock, flags);
+ conf->barrier--;
+ spin_unlock_irqrestore(&conf->resync_lock, flags);
+ wake_up(&conf->wait_barrier);
+}
+
+static void wait_barrier(conf_t *conf)
+{
+ spin_lock_irq(&conf->resync_lock);
+ if (conf->barrier) {
+ conf->nr_waiting++;
+ wait_event_lock_irq(conf->wait_barrier, !conf->barrier,
+ conf->resync_lock,
+ );
+ conf->nr_waiting--;
+ }
+ conf->nr_pending++;
+ spin_unlock_irq(&conf->resync_lock);
+}
+
+static void allow_barrier(conf_t *conf)
+{
+ unsigned long flags;
+ spin_lock_irqsave(&conf->resync_lock, flags);
+ conf->nr_pending--;
+ spin_unlock_irqrestore(&conf->resync_lock, flags);
+ wake_up(&conf->wait_barrier);
+}
+
+static void freeze_array(conf_t *conf)
+{
+ /* stop syncio and normal IO and wait for everything to
+ * go quiet.
+ * We increment barrier and nr_waiting, and then
+ * wait until nr_pending matches nr_queued+1
+ * This is called in the context of one normal IO request
+ * that has failed. Thus any sync request that might be pending
+ * will be blocked by nr_pending, and we need to wait for
+ * pending IO requests to complete or be queued for re-try.
+ * Thus the number queued (nr_queued) plus this request (1)
+ * must match the number of pending IOs (nr_pending) before
+ * we continue.
+ */
+ spin_lock_irq(&conf->resync_lock);
+ conf->barrier++;
+ conf->nr_waiting++;
+ wait_event_lock_irq(conf->wait_barrier,
+ conf->nr_pending == conf->nr_queued+1,
+ conf->resync_lock,
+ flush_pending_writes(conf));
+ spin_unlock_irq(&conf->resync_lock);
+}
+static void unfreeze_array(conf_t *conf)
+{
+ /* reverse the effect of the freeze */
+ spin_lock_irq(&conf->resync_lock);
+ conf->barrier--;
+ conf->nr_waiting--;
+ wake_up(&conf->wait_barrier);
+ spin_unlock_irq(&conf->resync_lock);
+}
+
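+/*
+ * freeze_array()/unfreeze_array() are used, for example, around
+ * fix_read_error() in raid1d() so that error correction runs with no
+ * other IO in flight.
+ */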
+
+/* duplicate the data pages for behind I/O
+ */
+static void alloc_behind_pages(struct bio *bio, r1bio_t *r1_bio)
+{
+ int i;
+ struct bio_vec *bvec;
+ struct page **pages = kzalloc(bio->bi_vcnt * sizeof(struct page*),
+ GFP_NOIO);
+ if (unlikely(!pages))
+ return;
+
+ bio_for_each_segment(bvec, bio, i) {
+ pages[i] = alloc_page(GFP_NOIO);
+ if (unlikely(!pages[i]))
+ goto do_sync_io;
+ memcpy(kmap(pages[i]) + bvec->bv_offset,
+ kmap(bvec->bv_page) + bvec->bv_offset, bvec->bv_len);
+ kunmap(pages[i]);
+ kunmap(bvec->bv_page);
+ }
+ r1_bio->behind_pages = pages;
+ r1_bio->behind_page_count = bio->bi_vcnt;
+ set_bit(R1BIO_BehindIO, &r1_bio->state);
+ return;
+
+do_sync_io:
+ for (i = 0; i < bio->bi_vcnt; i++)
+ if (pages[i])
+ put_page(pages[i]);
+ kfree(pages);
+ PRINTK("%dB behind alloc failed, doing sync I/O\n", bio->bi_size);
+}
+
+static int make_request(mddev_t *mddev, struct bio * bio)
+{
+ conf_t *conf = mddev->private;
+ mirror_info_t *mirror;
+ r1bio_t *r1_bio;
+ struct bio *read_bio;
+ int i, targets = 0, disks;
+ struct bitmap *bitmap;
+ unsigned long flags;
+ const int rw = bio_data_dir(bio);
+ const unsigned long do_sync = (bio->bi_rw & REQ_SYNC);
+ const unsigned long do_flush_fua = (bio->bi_rw & (REQ_FLUSH | REQ_FUA));
+ mdk_rdev_t *blocked_rdev;
+ int plugged;
+
+ /*
+ * Register the new request and wait if the reconstruction
+ * thread has put up a bar for new requests.
+ * Continue immediately if no resync is active currently.
+ */
+
+ md_write_start(mddev, bio); /* wait on superblock update early */
+
+ if (bio_data_dir(bio) == WRITE &&
+ bio->bi_sector + bio->bi_size/512 > mddev->suspend_lo &&
+ bio->bi_sector < mddev->suspend_hi) {
+ /* As the suspend_* range is controlled by
+ * userspace, we want an interruptible
+ * wait.
+ */
+ DEFINE_WAIT(w);
+ for (;;) {
+ flush_signals(current);
+ prepare_to_wait(&conf->wait_barrier,
+ &w, TASK_INTERRUPTIBLE);
+ if (bio->bi_sector + bio->bi_size/512 <= mddev->suspend_lo ||
+ bio->bi_sector >= mddev->suspend_hi)
+ break;
+ schedule();
+ }
+ finish_wait(&conf->wait_barrier, &w);
+ }
+
+ wait_barrier(conf);
+
+ bitmap = mddev->bitmap;
+
+ /*
+ * make_request() can abort the operation when READA is being
+ * used and no empty request is available.
+ *
+ */
+ r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO);
+
+ r1_bio->master_bio = bio;
+ r1_bio->sectors = bio->bi_size >> 9;
+ r1_bio->state = 0;
+ r1_bio->mddev = mddev;
+ r1_bio->sector = bio->bi_sector;
+
+ if (rw == READ) {
+ /*
+ * read balancing logic:
+ */
+ int rdisk = read_balance(conf, r1_bio);
+
+ if (rdisk < 0) {
+ /* couldn't find anywhere to read from */
+ raid_end_bio_io(r1_bio);
+ return 0;
+ }
+ mirror = conf->mirrors + rdisk;
+
+ if (test_bit(WriteMostly, &mirror->rdev->flags) &&
+ bitmap) {
+ /* Reading from a write-mostly device must
+ * take care not to over-take any writes
+ * that are 'behind'
+ */
+ wait_event(bitmap->behind_wait,
+ atomic_read(&bitmap->behind_writes) == 0);
+ }
+ r1_bio->read_disk = rdisk;
+
+ read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev);
+
+ r1_bio->bios[rdisk] = read_bio;
+
+ read_bio->bi_sector = r1_bio->sector + mirror->rdev->data_offset;
+ read_bio->bi_bdev = mirror->rdev->bdev;
+ read_bio->bi_end_io = raid1_end_read_request;
+ read_bio->bi_rw = READ | do_sync;
+ read_bio->bi_private = r1_bio;
+
+ generic_make_request(read_bio);
+ return 0;
+ }
+
+ /*
+ * WRITE:
+ */
+ /* first select target devices under spinlock and
+ * inc refcount on their rdev. Record them by setting
+ * bios[x] to bio
+ */
+ plugged = mddev_check_plugged(mddev);
+
+ disks = conf->raid_disks;
+ retry_write:
+ blocked_rdev = NULL;
+ rcu_read_lock();
+ for (i = 0; i < disks; i++) {
+ mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev);
+ if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) {
+ atomic_inc(&rdev->nr_pending);
+ blocked_rdev = rdev;
+ break;
+ }
+ if (rdev && !test_bit(Faulty, &rdev->flags)) {
+ atomic_inc(&rdev->nr_pending);
+ if (test_bit(Faulty, &rdev->flags)) {
+ rdev_dec_pending(rdev, mddev);
+ r1_bio->bios[i] = NULL;
+ } else {
+ r1_bio->bios[i] = bio;
+ targets++;
+ }
+ } else
+ r1_bio->bios[i] = NULL;
+ }
+ rcu_read_unlock();
+
+ if (unlikely(blocked_rdev)) {
+ /* Wait for this device to become unblocked */
+ int j;
+
+ for (j = 0; j < i; j++)
+ if (r1_bio->bios[j])
+ rdev_dec_pending(conf->mirrors[j].rdev, mddev);
+
+ allow_barrier(conf);
+ md_wait_for_blocked_rdev(blocked_rdev, mddev);
+ wait_barrier(conf);
+ goto retry_write;
+ }
+
+ BUG_ON(targets == 0); /* we never fail the last device */
+
+ if (targets < conf->raid_disks) {
+ /* array is degraded, we will not clear the bitmap
+ * on I/O completion (see raid1_end_write_request) */
+ set_bit(R1BIO_Degraded, &r1_bio->state);
+ }
+
+ /* do behind I/O ?
+ * Not if there are too many, or cannot allocate memory,
+ * or a reader on WriteMostly is waiting for behind writes
+ * to flush */
+ if (bitmap &&
+ (atomic_read(&bitmap->behind_writes)
+ < mddev->bitmap_info.max_write_behind) &&
+ !waitqueue_active(&bitmap->behind_wait))
+ alloc_behind_pages(bio, r1_bio);
+
+ atomic_set(&r1_bio->remaining, 1);
+ atomic_set(&r1_bio->behind_remaining, 0);
+
+ bitmap_startwrite(bitmap, bio->bi_sector, r1_bio->sectors,
+ test_bit(R1BIO_BehindIO, &r1_bio->state));
+ for (i = 0; i < disks; i++) {
+ struct bio *mbio;
+ if (!r1_bio->bios[i])
+ continue;
+
+ mbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
+ r1_bio->bios[i] = mbio;
+
+ mbio->bi_sector = r1_bio->sector + conf->mirrors[i].rdev->data_offset;
+ mbio->bi_bdev = conf->mirrors[i].rdev->bdev;
+ mbio->bi_end_io = raid1_end_write_request;
+ mbio->bi_rw = WRITE | do_flush_fua | do_sync;
+ mbio->bi_private = r1_bio;
+
+ if (r1_bio->behind_pages) {
+ struct bio_vec *bvec;
+ int j;
+
+ /* Yes, I really want the '__' version so that
+ * we clear any unused pointer in the io_vec, rather
+ * than leave them unchanged. This is important
+ * because when we come to free the pages, we won't
+ * know the original bi_idx, so we just free
+ * them all
+ */
+ __bio_for_each_segment(bvec, mbio, j, 0)
+ bvec->bv_page = r1_bio->behind_pages[j];
+ if (test_bit(WriteMostly, &conf->mirrors[i].rdev->flags))
+ atomic_inc(&r1_bio->behind_remaining);
+ }
+
+ atomic_inc(&r1_bio->remaining);
+ spin_lock_irqsave(&conf->device_lock, flags);
+ bio_list_add(&conf->pending_bio_list, mbio);
+ spin_unlock_irqrestore(&conf->device_lock, flags);
+ }
+ r1_bio_write_done(r1_bio);
+
+ /* In case raid1d snuck in to freeze_array */
+ wake_up(&conf->wait_barrier);
+
+ if (do_sync || !bitmap || !plugged)
+ md_wakeup_thread(mddev->thread);
+
+ return 0;
+}
+
+static void status(struct seq_file *seq, mddev_t *mddev)
+{
+ conf_t *conf = mddev->private;
+ int i;
+
+ seq_printf(seq, " [%d/%d] [", conf->raid_disks,
+ conf->raid_disks - mddev->degraded);
+ rcu_read_lock();
+ for (i = 0; i < conf->raid_disks; i++) {
+ mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev);
+ seq_printf(seq, "%s",
+ rdev && test_bit(In_sync, &rdev->flags) ? "U" : "_");
+ }
+ rcu_read_unlock();
+ seq_printf(seq, "]");
+}
+
+
+static void error(mddev_t *mddev, mdk_rdev_t *rdev)
+{
+ char b[BDEVNAME_SIZE];
+ conf_t *conf = mddev->private;
+
+ /*
+ * If it is not operational, then we have already marked it as dead
+ * else if it is the last working disk, ignore the error, let the
+ * next level up know.
+ * else mark the drive as failed
+ */
+ if (test_bit(In_sync, &rdev->flags)
+ && (conf->raid_disks - mddev->degraded) == 1) {
+ /*
+ * Don't fail the drive, act as though we were just a
+ * normal single drive.
+ * However don't try a recovery from this drive as
+ * it is very likely to fail.
+ */
+ mddev->recovery_disabled = 1;
+ return;
+ }
+ if (test_and_clear_bit(In_sync, &rdev->flags)) {
+ unsigned long flags;
+ spin_lock_irqsave(&conf->device_lock, flags);
+ mddev->degraded++;
+ set_bit(Faulty, &rdev->flags);
+ spin_unlock_irqrestore(&conf->device_lock, flags);
+ /*
+ * if recovery is running, make sure it aborts.
+ */
+ set_bit(MD_RECOVERY_INTR, &mddev->recovery);
+ } else
+ set_bit(Faulty, &rdev->flags);
+ set_bit(MD_CHANGE_DEVS, &mddev->flags);
+ printk(KERN_ALERT
+ "md/raid1:%s: Disk failure on %s, disabling device.\n"
+ "md/raid1:%s: Operation continuing on %d devices.\n",
+ mdname(mddev), bdevname(rdev->bdev, b),
+ mdname(mddev), conf->raid_disks - mddev->degraded);
+}
+
+static void print_conf(conf_t *conf)
+{
+ int i;
+
+ printk(KERN_DEBUG "RAID1 conf printout:\n");
+ if (!conf) {
+ printk(KERN_DEBUG "(!conf)\n");
+ return;
+ }
+ printk(KERN_DEBUG " --- wd:%d rd:%d\n", conf->raid_disks - conf->mddev->degraded,
+ conf->raid_disks);
+
+ if ((bits & (1 << BDI_async_congested)) &&
+ conf->pending_count >= max_queued)
+ return 1;
+
+ rcu_read_lock();
+ for (i = 0; i < conf->raid_disks; i++) {
+ char b[BDEVNAME_SIZE];
+ mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev);
+ if (rdev)
+ printk(KERN_DEBUG " disk %d, wo:%d, o:%d, dev:%s\n",
+ i, !test_bit(In_sync, &rdev->flags),
+ !test_bit(Faulty, &rdev->flags),
+ bdevname(rdev->bdev,b));
+ }
+ rcu_read_unlock();
+}
+
+static void close_sync(conf_t *conf)
+{
+ wait_barrier(conf);
+ allow_barrier(conf);
+
+ mempool_destroy(conf->r1buf_pool);
+ conf->r1buf_pool = NULL;
+}
+
+static int raid1_spare_active(mddev_t *mddev)
+{
+ int i;
+ conf_t *conf = mddev->private;
+ int count = 0;
+ unsigned long flags;
+
+ /*
+ * Find all failed disks within the RAID1 configuration
+ * and mark them readable.
+ * Called under mddev lock, so rcu protection not needed.
+ */
+ for (i = 0; i < conf->raid_disks; i++) {
+ mdk_rdev_t *rdev = conf->mirrors[i].rdev;
+ if (rdev
+ && !test_bit(Faulty, &rdev->flags)
+ && !test_and_set_bit(In_sync, &rdev->flags)) {
+ count++;
+ sysfs_notify_dirent(rdev->sysfs_state);
+ }
+ }
+ spin_lock_irqsave(&conf->device_lock, flags);
+ mddev->degraded -= count;
+ spin_unlock_irqrestore(&conf->device_lock, flags);
+
+ print_conf(conf);
+ return count;
+}
+
+
+static int raid1_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
+{
+ conf_t *conf = mddev->private;
+ int err = -EEXIST;
+ int mirror = 0;
+ mirror_info_t *p;
+ int first = 0;
+ int last = mddev->raid_disks - 1;
+
+ if (rdev->raid_disk >= 0)
+ first = last = rdev->raid_disk;
+
+ for (mirror = first; mirror <= last; mirror++)
+ if ( !(p=conf->mirrors+mirror)->rdev) {
+
+ disk_stack_limits(mddev->gendisk, rdev->bdev,
+ rdev->data_offset << 9);
+ /* as we don't honour merge_bvec_fn, we must
+ * never risk violating it, so limit
+ * ->max_segments to one lying with a single
+ * page, as a one page request is never in
+ * violation.
+ */
+ if (rdev->bdev->bd_disk->queue->merge_bvec_fn) {
+ blk_queue_max_segments(mddev->queue, 1);
+ blk_queue_segment_boundary(mddev->queue,
+ PAGE_CACHE_SIZE - 1);
+ }
+
+ p->head_position = 0;
+ rdev->raid_disk = mirror;
+ err = 0;
+ /* As all devices are equivalent, we don't need a full recovery
+ * if this device was recently part of the array
+ */
+ if (rdev->saved_raid_disk < 0)
+ conf->fullsync = 1;
+ rcu_assign_pointer(p->rdev, rdev);
+ break;
+ }
+ md_integrity_add_rdev(rdev, mddev);
+ print_conf(conf);
+ return err;
+}
+
+static int raid1_remove_disk(mddev_t *mddev, int number)
+{
+ conf_t *conf = mddev->private;
+ int err = 0;
+ mdk_rdev_t *rdev;
+ mirror_info_t *p = conf->mirrors+ number;
+
+ print_conf(conf);
+ rdev = p->rdev;
+ if (rdev) {
+ if (test_bit(In_sync, &rdev->flags) ||
+ atomic_read(&rdev->nr_pending)) {
+ err = -EBUSY;
+ goto abort;
+ }
+ /* Only remove non-faulty devices if recovery
+ * is not possible.
+ */
+ if (!test_bit(Faulty, &rdev->flags) &&
+ !mddev->recovery_disabled &&
+ mddev->degraded < conf->raid_disks) {
+ err = -EBUSY;
+ goto abort;
+ }
+ p->rdev = NULL;
+ synchronize_rcu();
+ if (atomic_read(&rdev->nr_pending)) {
+ /* lost the race, try later */
+ err = -EBUSY;
+ p->rdev = rdev;
+ goto abort;
+ }
+ err = md_integrity_register(mddev);
+ }
+abort:
+
+ print_conf(conf);
+ return err;
+}
+
+
+static void end_sync_read(struct bio *bio, int error)
+{
+ r1bio_t *r1_bio = bio->bi_private;
+ int i;
+
+ for (i=r1_bio->mddev->raid_disks; i--; )
+ if (r1_bio->bios[i] == bio)
+ break;
+ BUG_ON(i < 0);
+ update_head_pos(i, r1_bio);
+ /*
+ * we have read a block, now it needs to be re-written,
+ * or re-read if the read failed.
+ * We don't do much here, just schedule handling by raid1d
+ */
+ if (test_bit(BIO_UPTODATE, &bio->bi_flags))
+ set_bit(R1BIO_Uptodate, &r1_bio->state);
+
+ if (atomic_dec_and_test(&r1_bio->remaining))
+ reschedule_retry(r1_bio);
+}
+
+static void end_sync_write(struct bio *bio, int error)
+{
+ int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
+ r1bio_t *r1_bio = bio->bi_private;
+ mddev_t *mddev = r1_bio->mddev;
+ conf_t *conf = mddev->private;
+ int i;
+ int mirror=0;
+
+ for (i = 0; i < conf->raid_disks; i++)
+ if (r1_bio->bios[i] == bio) {
+ mirror = i;
+ break;
+ }
+ if (!uptodate) {
+ sector_t sync_blocks = 0;
+ sector_t s = r1_bio->sector;
+ long sectors_to_go = r1_bio->sectors;
+		/* make sure these bits don't get cleared. */
+ do {
+ bitmap_end_sync(mddev->bitmap, s,
+ &sync_blocks, 1);
+ s += sync_blocks;
+ sectors_to_go -= sync_blocks;
+ } while (sectors_to_go > 0);
+ md_error(mddev, conf->mirrors[mirror].rdev);
+ }
+
+ update_head_pos(mirror, r1_bio);
+
+ if (atomic_dec_and_test(&r1_bio->remaining)) {
+ sector_t s = r1_bio->sectors;
+ put_buf(r1_bio);
+ md_done_sync(mddev, s, uptodate);
+ }
+}
+
+static int fix_sync_read_error(r1bio_t *r1_bio)
+{
+ /* Try some synchronous reads of other devices to get
+ * good data, much like with normal read errors. Only
+ * read into the pages we already have so we don't
+ * need to re-issue the read request.
+ * We don't need to freeze the array, because being in an
+ * active sync request, there is no normal IO, and
+ * no overlapping syncs.
+ */
+ mddev_t *mddev = r1_bio->mddev;
+ conf_t *conf = mddev->private;
+ struct bio *bio = r1_bio->bios[r1_bio->read_disk];
+ sector_t sect = r1_bio->sector;
+ int sectors = r1_bio->sectors;
+ int idx = 0;
+
+ while(sectors) {
+ int s = sectors;
+ int d = r1_bio->read_disk;
+ int success = 0;
+ mdk_rdev_t *rdev;
+ int start;
+
+ if (s > (PAGE_SIZE>>9))
+ s = PAGE_SIZE >> 9;
+ do {
+ if (r1_bio->bios[d]->bi_end_io == end_sync_read) {
+				/* No rcu protection needed here; devices
+				 * can only be removed when no resync is
+				 * active, and resync is currently active.
+ */
+ rdev = conf->mirrors[d].rdev;
+ if (sync_page_io(rdev,
+ sect,
+ s<<9,
+ bio->bi_io_vec[idx].bv_page,
+ READ, false)) {
+ success = 1;
+ break;
+ }
+ }
+ d++;
+ if (d == conf->raid_disks)
+ d = 0;
+ } while (!success && d != r1_bio->read_disk);
+
+ if (!success) {
+ char b[BDEVNAME_SIZE];
+ /* Cannot read from anywhere, array is toast */
+ md_error(mddev, conf->mirrors[r1_bio->read_disk].rdev);
+ printk(KERN_ALERT "md/raid1:%s: %s: unrecoverable I/O read error"
+ " for block %llu\n",
+ mdname(mddev),
+ bdevname(bio->bi_bdev, b),
+ (unsigned long long)r1_bio->sector);
+ md_done_sync(mddev, r1_bio->sectors, 0);
+ put_buf(r1_bio);
+ return 0;
+ }
+
+ start = d;
+ /* write it back and re-read */
+ while (d != r1_bio->read_disk) {
+ if (d == 0)
+ d = conf->raid_disks;
+ d--;
+ if (r1_bio->bios[d]->bi_end_io != end_sync_read)
+ continue;
+ rdev = conf->mirrors[d].rdev;
+ if (sync_page_io(rdev,
+ sect,
+ s<<9,
+ bio->bi_io_vec[idx].bv_page,
+ WRITE, false) == 0) {
+ r1_bio->bios[d]->bi_end_io = NULL;
+ rdev_dec_pending(rdev, mddev);
+ md_error(mddev, rdev);
+ } else
+ atomic_add(s, &rdev->corrected_errors);
+ }
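+		/* Re-read the data that was just written, to verify that the
+		 * rewrite actually took; a device that fails this re-read is
+		 * marked faulty via md_error().
+		 */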
+ d = start;
+ while (d != r1_bio->read_disk) {
+ if (d == 0)
+ d = conf->raid_disks;
+ d--;
+ if (r1_bio->bios[d]->bi_end_io != end_sync_read)
+ continue;
+ rdev = conf->mirrors[d].rdev;
+ if (sync_page_io(rdev,
+ sect,
+ s<<9,
+ bio->bi_io_vec[idx].bv_page,
+ READ, false) == 0)
+ md_error(mddev, rdev);
+ }
+ sectors -= s;
+ sect += s;
+		idx++;
+ }
+ set_bit(R1BIO_Uptodate, &r1_bio->state);
+ set_bit(BIO_UPTODATE, &bio->bi_flags);
+ return 1;
+}
+
+static int process_checks(r1bio_t *r1_bio)
+{
+ /* We have read all readable devices. If we haven't
+ * got the block, then there is no hope left.
+ * If we have, then we want to do a comparison
+ * and skip the write if everything is the same.
+ * If any blocks failed to read, then we need to
+ * attempt an over-write
+ */
+ mddev_t *mddev = r1_bio->mddev;
+ conf_t *conf = mddev->private;
+ int primary;
+ int i;
+
+ for (primary = 0; primary < conf->raid_disks; primary++)
+ if (r1_bio->bios[primary]->bi_end_io == end_sync_read &&
+ test_bit(BIO_UPTODATE, &r1_bio->bios[primary]->bi_flags)) {
+ r1_bio->bios[primary]->bi_end_io = NULL;
+ rdev_dec_pending(conf->mirrors[primary].rdev, mddev);
+ break;
+ }
+ r1_bio->read_disk = primary;
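+	/* Compare every other successfully read copy against the chosen
+	 * primary.  Matching copies are dropped; the rest are set up to be
+	 * rewritten from the primary's pages (unless this is a check-only
+	 * pass and the read itself succeeded).
+	 */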
+ for (i = 0; i < conf->raid_disks; i++) {
+ int j;
+		int vcnt = r1_bio->sectors >> (PAGE_SHIFT - 9);
+ struct bio *pbio = r1_bio->bios[primary];
+ struct bio *sbio = r1_bio->bios[i];
+ int size;
+
+ if (r1_bio->bios[i]->bi_end_io != end_sync_read)
+ continue;
+
+ if (test_bit(BIO_UPTODATE, &sbio->bi_flags)) {
+ for (j = vcnt; j-- ; ) {
+ struct page *p, *s;
+ p = pbio->bi_io_vec[j].bv_page;
+ s = sbio->bi_io_vec[j].bv_page;
+ if (memcmp(page_address(p),
+ page_address(s),
+ PAGE_SIZE))
+ break;
+ }
+ } else
+ j = 0;
+ if (j >= 0)
+ mddev->resync_mismatches += r1_bio->sectors;
+ if (j < 0 || (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)
+ && test_bit(BIO_UPTODATE, &sbio->bi_flags))) {
+ /* No need to write to this device. */
+ sbio->bi_end_io = NULL;
+ rdev_dec_pending(conf->mirrors[i].rdev, mddev);
+ continue;
+ }
+ /* fixup the bio for reuse */
+ sbio->bi_vcnt = vcnt;
+ sbio->bi_size = r1_bio->sectors << 9;
+ sbio->bi_idx = 0;
+ sbio->bi_phys_segments = 0;
+ sbio->bi_flags &= ~(BIO_POOL_MASK - 1);
+ sbio->bi_flags |= 1 << BIO_UPTODATE;
+ sbio->bi_next = NULL;
+ sbio->bi_sector = r1_bio->sector +
+ conf->mirrors[i].rdev->data_offset;
+ sbio->bi_bdev = conf->mirrors[i].rdev->bdev;
+ size = sbio->bi_size;
+ for (j = 0; j < vcnt ; j++) {
+ struct bio_vec *bi;
+ bi = &sbio->bi_io_vec[j];
+ bi->bv_offset = 0;
+ if (size > PAGE_SIZE)
+ bi->bv_len = PAGE_SIZE;
+ else
+ bi->bv_len = size;
+ size -= PAGE_SIZE;
+ memcpy(page_address(bi->bv_page),
+ page_address(pbio->bi_io_vec[j].bv_page),
+ PAGE_SIZE);
+ }
+ }
+ return 0;
+}
+
+static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio)
+{
+ conf_t *conf = mddev->private;
+ int i;
+ int disks = conf->raid_disks;
+ struct bio *bio, *wbio;
+
+ bio = r1_bio->bios[r1_bio->read_disk];
+
+ if (!test_bit(R1BIO_Uptodate, &r1_bio->state))
+ /* ouch - failed to read all of that. */
+ if (!fix_sync_read_error(r1_bio))
+ return;
+
+ if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
+ if (process_checks(r1_bio) < 0)
+ return;
+ /*
+ * schedule writes
+ */
+ atomic_set(&r1_bio->remaining, 1);
+ for (i = 0; i < disks ; i++) {
+ wbio = r1_bio->bios[i];
+ if (wbio->bi_end_io == NULL ||
+ (wbio->bi_end_io == end_sync_read &&
+ (i == r1_bio->read_disk ||
+ !test_bit(MD_RECOVERY_SYNC, &mddev->recovery))))
+ continue;
+
+ wbio->bi_rw = WRITE;
+ wbio->bi_end_io = end_sync_write;
+ atomic_inc(&r1_bio->remaining);
+ md_sync_acct(conf->mirrors[i].rdev->bdev, wbio->bi_size >> 9);
+
+ generic_make_request(wbio);
+ }
+
+ if (atomic_dec_and_test(&r1_bio->remaining)) {
+ /* if we're here, all write(s) have completed, so clean up */
+ md_done_sync(mddev, r1_bio->sectors, 1);
+ put_buf(r1_bio);
+ }
+}
+
+/*
+ * This is a kernel thread which:
+ *
+ * 1. Retries failed read operations on working mirrors.
+ * 2. Updates the raid superblock when problems are encountered.
+ * 3. Performs writes following reads for array synchronising.
+ */
+
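+/* Try to repair a read error on read_disk: find a good copy of each
+ * affected page on another In_sync device, write it back over the bad
+ * sectors, then re-read to confirm the fix.  Works on at most one page
+ * at a time; if no device can supply the data, the failing disk is
+ * marked faulty.
+ */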
+static void fix_read_error(conf_t *conf, int read_disk,
+ sector_t sect, int sectors)
+{
+ mddev_t *mddev = conf->mddev;
+ while(sectors) {
+ int s = sectors;
+ int d = read_disk;
+ int success = 0;
+ int start;
+ mdk_rdev_t *rdev;
+
+ if (s > (PAGE_SIZE>>9))
+ s = PAGE_SIZE >> 9;
+
+ do {
+ /* Note: no rcu protection needed here
+ * as this is synchronous in the raid1d thread
+ * which is the thread that might remove
+ * a device. If raid1d ever becomes multi-threaded....
+ */
+ rdev = conf->mirrors[d].rdev;
+ if (rdev &&
+ test_bit(In_sync, &rdev->flags) &&
+ sync_page_io(rdev, sect, s<<9,
+ conf->tmppage, READ, false))
+ success = 1;
+ else {
+ d++;
+ if (d == conf->raid_disks)
+ d = 0;
+ }
+ } while (!success && d != read_disk);
+
+ if (!success) {
+ /* Cannot read from anywhere -- bye bye array */
+ md_error(mddev, conf->mirrors[read_disk].rdev);
+ break;
+ }
+ /* write it back and re-read */
+ start = d;
+ while (d != read_disk) {
+ if (d==0)
+ d = conf->raid_disks;
+ d--;
+ rdev = conf->mirrors[d].rdev;
+ if (rdev &&
+ test_bit(In_sync, &rdev->flags)) {
+ if (sync_page_io(rdev, sect, s<<9,
+ conf->tmppage, WRITE, false)
+ == 0)
+ /* Well, this device is dead */
+ md_error(mddev, rdev);
+ }
+ }
+ d = start;
+ while (d != read_disk) {
+ char b[BDEVNAME_SIZE];
+ if (d==0)
+ d = conf->raid_disks;
+ d--;
+ rdev = conf->mirrors[d].rdev;
+ if (rdev &&
+ test_bit(In_sync, &rdev->flags)) {
+ if (sync_page_io(rdev, sect, s<<9,
+ conf->tmppage, READ, false)
+ == 0)
+ /* Well, this device is dead */
+ md_error(mddev, rdev);
+ else {
+ atomic_add(s, &rdev->corrected_errors);
+ printk(KERN_INFO
+ "md/raid1:%s: read error corrected "
+ "(%d sectors at %llu on %s)\n",
+ mdname(mddev), s,
+ (unsigned long long)(sect +
+ rdev->data_offset),
+ bdevname(rdev->bdev, b));
+ }
+ }
+ }
+ sectors -= s;
+ sect += s;
+ }
+}
+
+static void raid1d(mddev_t *mddev)
+{
+ r1bio_t *r1_bio;
+ struct bio *bio;
+ unsigned long flags;
+ conf_t *conf = mddev->private;
+ struct list_head *head = &conf->retry_list;
+ mdk_rdev_t *rdev;
+ struct blk_plug plug;
+
+ md_check_recovery(mddev);
+
+ blk_start_plug(&plug);
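+	/* Main service loop: flush any queued writes, then take r1_bios off
+	 * the retry list one at a time and either complete a sync write or
+	 * retry/repair a failed read, until the list is empty.
+	 */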
+ for (;;) {
+ char b[BDEVNAME_SIZE];
+
+ if (atomic_read(&mddev->plug_cnt) == 0)
+ flush_pending_writes(conf);
+
+ spin_lock_irqsave(&conf->device_lock, flags);
+ if (list_empty(head)) {
+ spin_unlock_irqrestore(&conf->device_lock, flags);
+ break;
+ }
+ r1_bio = list_entry(head->prev, r1bio_t, retry_list);
+ list_del(head->prev);
+ conf->nr_queued--;
+ spin_unlock_irqrestore(&conf->device_lock, flags);
+
+ mddev = r1_bio->mddev;
+ conf = mddev->private;
+ if (test_bit(R1BIO_IsSync, &r1_bio->state))
+ sync_request_write(mddev, r1_bio);
+ else {
+ int disk;
+
+			/* We got a read error. Maybe the drive is bad, or maybe
+			 * just this block is bad and we can fix it.
+			 * We freeze all other IO, and try reading the block from
+			 * other devices. When we find one, we re-write
+			 * the block and check whether that fixes the read error.
+			 * This is all done synchronously while the array is
+			 * frozen.
+			 */
+ if (mddev->ro == 0) {
+ freeze_array(conf);
+ fix_read_error(conf, r1_bio->read_disk,
+ r1_bio->sector,
+ r1_bio->sectors);
+ unfreeze_array(conf);
+ } else
+ md_error(mddev,
+ conf->mirrors[r1_bio->read_disk].rdev);
+
+ bio = r1_bio->bios[r1_bio->read_disk];
+ if ((disk=read_balance(conf, r1_bio)) == -1) {
+ printk(KERN_ALERT "md/raid1:%s: %s: unrecoverable I/O"
+ " read error for block %llu\n",
+ mdname(mddev),
+ bdevname(bio->bi_bdev,b),
+ (unsigned long long)r1_bio->sector);
+ raid_end_bio_io(r1_bio);
+ } else {
+ const unsigned long do_sync = r1_bio->master_bio->bi_rw & REQ_SYNC;
+ r1_bio->bios[r1_bio->read_disk] =
+ mddev->ro ? IO_BLOCKED : NULL;
+ r1_bio->read_disk = disk;
+ bio_put(bio);
+ bio = bio_clone_mddev(r1_bio->master_bio,
+ GFP_NOIO, mddev);
+ r1_bio->bios[r1_bio->read_disk] = bio;
+ rdev = conf->mirrors[disk].rdev;
+ if (printk_ratelimit())
+ printk(KERN_ERR "md/raid1:%s: redirecting sector %llu to"
+ " other mirror: %s\n",
+ mdname(mddev),
+ (unsigned long long)r1_bio->sector,
+ bdevname(rdev->bdev,b));
+ bio->bi_sector = r1_bio->sector + rdev->data_offset;
+ bio->bi_bdev = rdev->bdev;
+ bio->bi_end_io = raid1_end_read_request;
+ bio->bi_rw = READ | do_sync;
+ bio->bi_private = r1_bio;
+ generic_make_request(bio);
+ }
+ }
+ cond_resched();
+ }
+ blk_finish_plug(&plug);
+}
+
+
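+/* Allocate the mempool of r1bio buffers used by resync: enough buffers
+ * to cover one RESYNC_WINDOW worth of RESYNC_BLOCK_SIZE requests.
+ */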
+static int init_resync(conf_t *conf)
+{
+ int buffs;
+
+ buffs = RESYNC_WINDOW / RESYNC_BLOCK_SIZE;
+ BUG_ON(conf->r1buf_pool);
+ conf->r1buf_pool = mempool_create(buffs, r1buf_pool_alloc, r1buf_pool_free,
+ conf->poolinfo);
+ if (!conf->r1buf_pool)
+ return -ENOMEM;
+ conf->next_resync = 0;
+ return 0;
+}
+
+/*
+ * perform a "sync" on one "block"
+ *
+ * We need to make sure that no normal I/O request - particularly write
+ * requests - conflict with active sync requests.
+ *
+ * This is achieved by tracking pending requests and a 'barrier' concept
+ * that can be installed to exclude normal IO requests.
+ */
+
+static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, int go_faster)
+{
+ conf_t *conf = mddev->private;
+ r1bio_t *r1_bio;
+ struct bio *bio;
+ sector_t max_sector, nr_sectors;
+ int disk = -1;
+ int i;
+ int wonly = -1;
+ int write_targets = 0, read_targets = 0;
+ sector_t sync_blocks;
+ int still_degraded = 0;
+
+ if (!conf->r1buf_pool)
+ if (init_resync(conf))
+ return 0;
+
+ max_sector = mddev->dev_sectors;
+ if (sector_nr >= max_sector) {
+ /* If we aborted, we need to abort the
+ * sync on the 'current' bitmap chunk (there will
+		 * only be one in raid1 resync).
+		 * We can find the current address in mddev->curr_resync.
+ */
+ if (mddev->curr_resync < max_sector) /* aborted */
+ bitmap_end_sync(mddev->bitmap, mddev->curr_resync,
+ &sync_blocks, 1);
+ else /* completed sync */
+ conf->fullsync = 0;
+
+ bitmap_close_sync(mddev->bitmap);
+ close_sync(conf);
+ return 0;
+ }
+
+ if (mddev->bitmap == NULL &&
+ mddev->recovery_cp == MaxSector &&
+ !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) &&
+ conf->fullsync == 0) {
+ *skipped = 1;
+ return max_sector - sector_nr;
+ }
+	/* before building a request, check if we can skip these blocks.
+	 * This call to bitmap_start_sync doesn't actually record anything.
+ */
+ if (!bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 1) &&
+ !conf->fullsync && !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
+ /* We can skip this block, and probably several more */
+ *skipped = 1;
+ return sync_blocks;
+ }
+ /*
+ * If there is non-resync activity waiting for a turn,
+ * and resync is going fast enough,
+	 * then let it through before starting on this new sync request.
+ */
+ if (!go_faster && conf->nr_waiting)
+ msleep_interruptible(1000);
+
+ bitmap_cond_end_sync(mddev->bitmap, sector_nr);
+ r1_bio = mempool_alloc(conf->r1buf_pool, GFP_NOIO);
+ raise_barrier(conf);
+
+ conf->next_resync = sector_nr;
+
+ rcu_read_lock();
+ /*
+	 * If we get a correctable read error during resync or recovery,
+ * we might want to read from a different device. So we
+ * flag all drives that could conceivably be read from for READ,
+ * and any others (which will be non-In_sync devices) for WRITE.
+ * If a read fails, we try reading from something else for which READ
+ * is OK.
+ */
+
+ r1_bio->mddev = mddev;
+ r1_bio->sector = sector_nr;
+ r1_bio->state = 0;
+ set_bit(R1BIO_IsSync, &r1_bio->state);
+
+ for (i=0; i < conf->raid_disks; i++) {
+ mdk_rdev_t *rdev;
+ bio = r1_bio->bios[i];
+
+ /* take from bio_init */
+ bio->bi_next = NULL;
+ bio->bi_flags &= ~(BIO_POOL_MASK-1);
+ bio->bi_flags |= 1 << BIO_UPTODATE;
+ bio->bi_comp_cpu = -1;
+ bio->bi_rw = READ;
+ bio->bi_vcnt = 0;
+ bio->bi_idx = 0;
+ bio->bi_phys_segments = 0;
+ bio->bi_size = 0;
+ bio->bi_end_io = NULL;
+ bio->bi_private = NULL;
+
+ rdev = rcu_dereference(conf->mirrors[i].rdev);
+ if (rdev == NULL ||
+ test_bit(Faulty, &rdev->flags)) {
+ still_degraded = 1;
+ continue;
+ } else if (!test_bit(In_sync, &rdev->flags)) {
+ bio->bi_rw = WRITE;
+ bio->bi_end_io = end_sync_write;
+ write_targets ++;
+ } else {
+ /* may need to read from here */
+ bio->bi_rw = READ;
+ bio->bi_end_io = end_sync_read;
+ if (test_bit(WriteMostly, &rdev->flags)) {
+ if (wonly < 0)
+ wonly = i;
+ } else {
+ if (disk < 0)
+ disk = i;
+ }
+ read_targets++;
+ }
+ atomic_inc(&rdev->nr_pending);
+ bio->bi_sector = sector_nr + rdev->data_offset;
+ bio->bi_bdev = rdev->bdev;
+ bio->bi_private = r1_bio;
+ }
+ rcu_read_unlock();
+ if (disk < 0)
+ disk = wonly;
+ r1_bio->read_disk = disk;
+
+ if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) && read_targets > 0)
+ /* extra read targets are also write targets */
+ write_targets += read_targets-1;
+
+ if (write_targets == 0 || read_targets == 0) {
+ /* There is nowhere to write, so all non-sync
+ * drives must be failed - so we are finished
+ */
+ sector_t rv = max_sector - sector_nr;
+ *skipped = 1;
+ put_buf(r1_bio);
+ return rv;
+ }
+
+ if (max_sector > mddev->resync_max)
+ max_sector = mddev->resync_max; /* Don't do IO beyond here */
+ nr_sectors = 0;
+ sync_blocks = 0;
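+	/* Build the resync request a page at a time, attaching each page to
+	 * every active bio, until the r1_bio is full (RESYNC_PAGES pages),
+	 * we reach max_sector, or the bitmap says the rest can be skipped.
+	 */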
+ do {
+ struct page *page;
+ int len = PAGE_SIZE;
+ if (sector_nr + (len>>9) > max_sector)
+ len = (max_sector - sector_nr) << 9;
+ if (len == 0)
+ break;
+ if (sync_blocks == 0) {
+ if (!bitmap_start_sync(mddev->bitmap, sector_nr,
+ &sync_blocks, still_degraded) &&
+ !conf->fullsync &&
+ !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
+ break;
+ BUG_ON(sync_blocks < (PAGE_SIZE>>9));
+ if ((len >> 9) > sync_blocks)
+ len = sync_blocks<<9;
+ }
+
+ for (i=0 ; i < conf->raid_disks; i++) {
+ bio = r1_bio->bios[i];
+ if (bio->bi_end_io) {
+ page = bio->bi_io_vec[bio->bi_vcnt].bv_page;
+ if (bio_add_page(bio, page, len, 0) == 0) {
+ /* stop here */
+ bio->bi_io_vec[bio->bi_vcnt].bv_page = page;
+ while (i > 0) {
+ i--;
+ bio = r1_bio->bios[i];
+ if (bio->bi_end_io==NULL)
+ continue;
+ /* remove last page from this bio */
+ bio->bi_vcnt--;
+ bio->bi_size -= len;
+ bio->bi_flags &= ~(1<< BIO_SEG_VALID);
+ }
+ goto bio_full;
+ }
+ }
+ }
+ nr_sectors += len>>9;
+ sector_nr += len>>9;
+ sync_blocks -= (len>>9);
+ } while (r1_bio->bios[disk]->bi_vcnt < RESYNC_PAGES);
+ bio_full:
+ r1_bio->sectors = nr_sectors;
+
+ /* For a user-requested sync, we read all readable devices and do a
+ * compare
+ */
+ if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
+ atomic_set(&r1_bio->remaining, read_targets);
+ for (i=0; i<conf->raid_disks; i++) {
+ bio = r1_bio->bios[i];
+ if (bio->bi_end_io == end_sync_read) {
+ md_sync_acct(bio->bi_bdev, nr_sectors);
+ generic_make_request(bio);
+ }
+ }
+ } else {
+ atomic_set(&r1_bio->remaining, 1);
+ bio = r1_bio->bios[r1_bio->read_disk];
+ md_sync_acct(bio->bi_bdev, nr_sectors);
+ generic_make_request(bio);
+
+ }
+ return nr_sectors;
+}
+
+static sector_t raid1_size(mddev_t *mddev, sector_t sectors, int raid_disks)
+{
+ if (sectors)
+ return sectors;
+
+ return mddev->dev_sectors;
+}
+
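+/* Allocate and initialise the per-array conf_t: the mirrors table, a
+ * temporary page used for read-error correction, the r1bio mempool,
+ * locks and wait queues, and the raid1d thread.  Returns an ERR_PTR
+ * on failure.
+ */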
+static conf_t *setup_conf(mddev_t *mddev)
+{
+ conf_t *conf;
+ int i;
+ mirror_info_t *disk;
+ mdk_rdev_t *rdev;
+ int err = -ENOMEM;
+
+ conf = kzalloc(sizeof(conf_t), GFP_KERNEL);
+ if (!conf)
+ goto abort;
+
+ conf->mirrors = kzalloc(sizeof(struct mirror_info)*mddev->raid_disks,
+ GFP_KERNEL);
+ if (!conf->mirrors)
+ goto abort;
+
+ conf->tmppage = alloc_page(GFP_KERNEL);
+ if (!conf->tmppage)
+ goto abort;
+
+ conf->poolinfo = kzalloc(sizeof(*conf->poolinfo), GFP_KERNEL);
+ if (!conf->poolinfo)
+ goto abort;
+ conf->poolinfo->raid_disks = mddev->raid_disks;
+ conf->r1bio_pool = mempool_create(NR_RAID1_BIOS, r1bio_pool_alloc,
+ r1bio_pool_free,
+ conf->poolinfo);
+ if (!conf->r1bio_pool)
+ goto abort;
+
+ conf->poolinfo->mddev = mddev;
+
+ spin_lock_init(&conf->device_lock);
+ list_for_each_entry(rdev, &mddev->disks, same_set) {
+ int disk_idx = rdev->raid_disk;
+ if (disk_idx >= mddev->raid_disks
+ || disk_idx < 0)
+ continue;
+ disk = conf->mirrors + disk_idx;
+
+ disk->rdev = rdev;
+
+ disk->head_position = 0;
+ }
+ conf->raid_disks = mddev->raid_disks;
+ conf->mddev = mddev;
+ INIT_LIST_HEAD(&conf->retry_list);
+
+ spin_lock_init(&conf->resync_lock);
+ init_waitqueue_head(&conf->wait_barrier);
+
+ bio_list_init(&conf->pending_bio_list);
+
+ conf->last_used = -1;
+ for (i = 0; i < conf->raid_disks; i++) {
+
+ disk = conf->mirrors + i;
+
+ if (!disk->rdev ||
+ !test_bit(In_sync, &disk->rdev->flags)) {
+ disk->head_position = 0;
+ if (disk->rdev)
+ conf->fullsync = 1;
+ } else if (conf->last_used < 0)
+ /*
+ * The first working device is used as a
+			 * starting point for read balancing.
+ */
+ conf->last_used = i;
+ }
+
+ err = -EIO;
+ if (conf->last_used < 0) {
+ printk(KERN_ERR "md/raid1:%s: no operational mirrors\n",
+ mdname(mddev));
+ goto abort;
+ }
+ err = -ENOMEM;
+ conf->thread = md_register_thread(raid1d, mddev, NULL);
+ if (!conf->thread) {
+ printk(KERN_ERR
+ "md/raid1:%s: couldn't allocate thread\n",
+ mdname(mddev));
+ goto abort;
+ }
+
+ return conf;
+
+ abort:
+ if (conf) {
+ if (conf->r1bio_pool)
+ mempool_destroy(conf->r1bio_pool);
+ kfree(conf->mirrors);
+ safe_put_page(conf->tmppage);
+ kfree(conf->poolinfo);
+ kfree(conf);
+ }
+ return ERR_PTR(err);
+}
+
+static int run(mddev_t *mddev)
+{
+ conf_t *conf;
+ int i;
+ mdk_rdev_t *rdev;
+
+ if (mddev->level != 1) {
+ printk(KERN_ERR "md/raid1:%s: raid level not set to mirroring (%d)\n",
+ mdname(mddev), mddev->level);
+ return -EIO;
+ }
+ if (mddev->reshape_position != MaxSector) {
+ printk(KERN_ERR "md/raid1:%s: reshape_position set but not supported\n",
+ mdname(mddev));
+ return -EIO;
+ }
+ /*
+ * copy the already verified devices into our private RAID1
+ * bookkeeping area. [whatever we allocate in run(),
+ * should be freed in stop()]
+ */
+ if (mddev->private == NULL)
+ conf = setup_conf(mddev);
+ else
+ conf = mddev->private;
+
+ if (IS_ERR(conf))
+ return PTR_ERR(conf);
+
+ list_for_each_entry(rdev, &mddev->disks, same_set) {
+ if (!mddev->gendisk)
+ continue;
+ disk_stack_limits(mddev->gendisk, rdev->bdev,
+ rdev->data_offset << 9);
+ /* as we don't honour merge_bvec_fn, we must never risk
+ * violating it, so limit ->max_segments to 1 lying within
+ * a single page, as a one page request is never in violation.
+ */
+ if (rdev->bdev->bd_disk->queue->merge_bvec_fn) {
+ blk_queue_max_segments(mddev->queue, 1);
+ blk_queue_segment_boundary(mddev->queue,
+ PAGE_CACHE_SIZE - 1);
+ }
+ }
+
+ mddev->degraded = 0;
+ for (i=0; i < conf->raid_disks; i++)
+ if (conf->mirrors[i].rdev == NULL ||
+ !test_bit(In_sync, &conf->mirrors[i].rdev->flags) ||
+ test_bit(Faulty, &conf->mirrors[i].rdev->flags))
+ mddev->degraded++;
+
+ if (conf->raid_disks - mddev->degraded == 1)
+ mddev->recovery_cp = MaxSector;
+
+ if (mddev->recovery_cp != MaxSector)
+ printk(KERN_NOTICE "md/raid1:%s: not clean"
+ " -- starting background reconstruction\n",
+ mdname(mddev));
+ printk(KERN_INFO
+ "md/raid1:%s: active with %d out of %d mirrors\n",
+ mdname(mddev), mddev->raid_disks - mddev->degraded,
+ mddev->raid_disks);
+
+ /*
+ * Ok, everything is just fine now
+ */
+ mddev->thread = conf->thread;
+ conf->thread = NULL;
+ mddev->private = conf;
+
+ md_set_array_sectors(mddev, raid1_size(mddev, 0, 0));
+
+ if (mddev->queue) {
+ mddev->queue->backing_dev_info.congested_fn = raid1_congested;
+ mddev->queue->backing_dev_info.congested_data = mddev;
+ }
+ return md_integrity_register(mddev);
+}
+
+static int stop(mddev_t *mddev)
+{
+ conf_t *conf = mddev->private;
+ struct bitmap *bitmap = mddev->bitmap;
+
+ /* wait for behind writes to complete */
+ if (bitmap && atomic_read(&bitmap->behind_writes) > 0) {
+ printk(KERN_INFO "md/raid1:%s: behind writes in progress - waiting to stop.\n",
+ mdname(mddev));
+ /* need to kick something here to make sure I/O goes? */
+ wait_event(bitmap->behind_wait,
+ atomic_read(&bitmap->behind_writes) == 0);
+ }
+
+ raise_barrier(conf);
+ lower_barrier(conf);
+
+ md_unregister_thread(mddev->thread);
+ mddev->thread = NULL;
+ if (conf->r1bio_pool)
+ mempool_destroy(conf->r1bio_pool);
+ kfree(conf->mirrors);
+ kfree(conf->poolinfo);
+ kfree(conf);
+ mddev->private = NULL;
+ return 0;
+}
+
+static int raid1_resize(mddev_t *mddev, sector_t sectors)
+{
+ /* no resync is happening, and there is enough space
+ * on all devices, so we can resize.
+ * We need to make sure resync covers any new space.
+ * If the array is shrinking we should possibly wait until
+ * any io in the removed space completes, but it hardly seems
+ * worth it.
+ */
+ md_set_array_sectors(mddev, raid1_size(mddev, sectors, 0));
+ if (mddev->array_sectors > raid1_size(mddev, sectors, 0))
+ return -EINVAL;
+ set_capacity(mddev->gendisk, mddev->array_sectors);
+ revalidate_disk(mddev->gendisk);
+ if (sectors > mddev->dev_sectors &&
+ mddev->recovery_cp > mddev->dev_sectors) {
+ mddev->recovery_cp = mddev->dev_sectors;
+ set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+ }
+ mddev->dev_sectors = sectors;
+ mddev->resync_max_sectors = sectors;
+ return 0;
+}
+
+static int raid1_reshape(mddev_t *mddev)
+{
+ /* We need to:
+ * 1/ resize the r1bio_pool
+ * 2/ resize conf->mirrors
+ *
+ * We allocate a new r1bio_pool if we can.
+ * Then raise a device barrier and wait until all IO stops.
+ * Then resize conf->mirrors and swap in the new r1bio pool.
+ *
+ * At the same time, we "pack" the devices so that all the missing
+ * devices have the higher raid_disk numbers.
+ */
+ mempool_t *newpool, *oldpool;
+ struct pool_info *newpoolinfo;
+ mirror_info_t *newmirrors;
+ conf_t *conf = mddev->private;
+ int cnt, raid_disks;
+ unsigned long flags;
+ int d, d2, err;
+
+ /* Cannot change chunk_size, layout, or level */
+ if (mddev->chunk_sectors != mddev->new_chunk_sectors ||
+ mddev->layout != mddev->new_layout ||
+ mddev->level != mddev->new_level) {
+ mddev->new_chunk_sectors = mddev->chunk_sectors;
+ mddev->new_layout = mddev->layout;
+ mddev->new_level = mddev->level;
+ return -EINVAL;
+ }
+
+ err = md_allow_write(mddev);
+ if (err)
+ return err;
+
+ raid_disks = mddev->raid_disks + mddev->delta_disks;
+
+ if (raid_disks < conf->raid_disks) {
+ cnt=0;
+ for (d= 0; d < conf->raid_disks; d++)
+ if (conf->mirrors[d].rdev)
+ cnt++;
+ if (cnt > raid_disks)
+ return -EBUSY;
+ }
+
+ newpoolinfo = kmalloc(sizeof(*newpoolinfo), GFP_KERNEL);
+ if (!newpoolinfo)
+ return -ENOMEM;
+ newpoolinfo->mddev = mddev;
+ newpoolinfo->raid_disks = raid_disks;
+
+ newpool = mempool_create(NR_RAID1_BIOS, r1bio_pool_alloc,
+ r1bio_pool_free, newpoolinfo);
+ if (!newpool) {
+ kfree(newpoolinfo);
+ return -ENOMEM;
+ }
+ newmirrors = kzalloc(sizeof(struct mirror_info) * raid_disks, GFP_KERNEL);
+ if (!newmirrors) {
+ kfree(newpoolinfo);
+ mempool_destroy(newpool);
+ return -ENOMEM;
+ }
+
+ raise_barrier(conf);
+
+ /* ok, everything is stopped */
+ oldpool = conf->r1bio_pool;
+ conf->r1bio_pool = newpool;
+
+ for (d = d2 = 0; d < conf->raid_disks; d++) {
+ mdk_rdev_t *rdev = conf->mirrors[d].rdev;
+ if (rdev && rdev->raid_disk != d2) {
+ char nm[20];
+ sprintf(nm, "rd%d", rdev->raid_disk);
+ sysfs_remove_link(&mddev->kobj, nm);
+ rdev->raid_disk = d2;
+ sprintf(nm, "rd%d", rdev->raid_disk);
+ sysfs_remove_link(&mddev->kobj, nm);
+ if (sysfs_create_link(&mddev->kobj,
+ &rdev->kobj, nm))
+ printk(KERN_WARNING
+ "md/raid1:%s: cannot register "
+ "%s\n",
+ mdname(mddev), nm);
+ }
+ if (rdev)
+ newmirrors[d2++].rdev = rdev;
+ }
+ kfree(conf->mirrors);
+ conf->mirrors = newmirrors;
+ kfree(conf->poolinfo);
+ conf->poolinfo = newpoolinfo;
+
+ spin_lock_irqsave(&conf->device_lock, flags);
+ mddev->degraded += (raid_disks - conf->raid_disks);
+ spin_unlock_irqrestore(&conf->device_lock, flags);
+ conf->raid_disks = mddev->raid_disks = raid_disks;
+ mddev->delta_disks = 0;
+
+ conf->last_used = 0; /* just make sure it is in-range */
+ lower_barrier(conf);
+
+ set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+ md_wakeup_thread(mddev->thread);
+
+ mempool_destroy(oldpool);
+ return 0;
+}
+
+static void raid1_quiesce(mddev_t *mddev, int state)
+{
+ conf_t *conf = mddev->private;
+
+ switch(state) {
+ case 2: /* wake for suspend */
+ wake_up(&conf->wait_barrier);
+ break;
+ case 1:
+ raise_barrier(conf);
+ break;
+ case 0:
+ lower_barrier(conf);
+ break;
+ }
+}
+
+static void *raid1_takeover(mddev_t *mddev)
+{
+ /* raid1 can take over:
+ * raid5 with 2 devices, any layout or chunk size
+ */
+ if (mddev->level == 5 && mddev->raid_disks == 2) {
+ conf_t *conf;
+ mddev->new_level = 1;
+ mddev->new_layout = 0;
+ mddev->new_chunk_sectors = 0;
+ conf = setup_conf(mddev);
+ if (!IS_ERR(conf))
+ conf->barrier = 1;
+ return conf;
+ }
+ return ERR_PTR(-EINVAL);
+}
+
+static struct mdk_personality raid1_personality =
+{
+ .name = "raid1",
+ .level = 1,
+ .owner = THIS_MODULE,
+ .make_request = make_request,
+ .run = run,
+ .stop = stop,
+ .status = status,
+ .error_handler = error,
+ .hot_add_disk = raid1_add_disk,
+ .hot_remove_disk= raid1_remove_disk,
+ .spare_active = raid1_spare_active,
+ .sync_request = sync_request,
+ .resize = raid1_resize,
+ .size = raid1_size,
+ .check_reshape = raid1_reshape,
+ .quiesce = raid1_quiesce,
+ .takeover = raid1_takeover,
+};
+
+static int __init raid_init(void)
+{
+ return register_md_personality(&raid1_personality);
+}
+
+static void raid_exit(void)
+{
+ unregister_md_personality(&raid1_personality);
+}
+
+module_init(raid_init);
+module_exit(raid_exit);
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("RAID1 (mirroring) personality for MD");
+MODULE_ALIAS("md-personality-3"); /* RAID1 */
+MODULE_ALIAS("md-raid1");
+MODULE_ALIAS("md-level-1");
+
+module_param(max_queued, int, S_IRUGO|S_IWUSR);
diff --git a/tests/linux/raid1-A/patch b/tests/linux/raid1-A/patch
new file mode 100644
index 0000000..cb9a29e
--- /dev/null
+++ b/tests/linux/raid1-A/patch
@@ -0,0 +1,64 @@
+--- drivers/md/raid1.c
++++ drivers/md/raid1.c
+@@ -618,7 +623,9 @@
+ struct bio *bio;
+ bio = bio_list_get(&conf->pending_bio_list);
+ blk_remove_plug(conf->mddev->queue);
++ conf->pending_count = 0;
+ spin_unlock_irq(&conf->device_lock);
++ wake_up(&conf->wait_barrier);
+ /* flush any pending bitmap writes to
+ * disk before proceeding w/ I/O */
+ bitmap_unplug(conf->mddev->bitmap);
+@@ -788,6 +795,7 @@
+ struct bitmap *bitmap;
+ unsigned long flags;
+ struct bio_list bl;
++ int bl_count;
+ struct page **behind_pages = NULL;
+ const int rw = bio_data_dir(bio);
+ const bool do_sync = bio_rw_flagged(bio, BIO_RW_SYNCIO);
+@@ -878,6 +886,11 @@
+ /*
+ * WRITE:
+ */
++ if (conf->pending_count >= max_queued) {
++ md_wakeup_thread(mddev->thread);
++ wait_event(conf->wait_barrier,
++ conf->pending_count < max_queued);
++ }
+ /* first select target devices under spinlock and
+ * inc refcount on their rdev. Record them by setting
+ * bios[x] to bio
+@@ -954,6 +967,7 @@
+ set_bit(R1BIO_Barrier, &r1_bio->state);
+
+ bio_list_init(&bl);
++ bl_count = 0;
+ for (i = 0; i < disks; i++) {
+ struct bio *mbio;
+ if (!r1_bio->bios[i])
+@@ -989,6 +1003,7 @@
+ atomic_inc(&r1_bio->remaining);
+
+ bio_list_add(&bl, mbio);
++ bl_count++;
+ }
+ kfree(behind_pages); /* the behind pages are attached to the bios now */
+
+@@ -996,6 +1011,7 @@
+ test_bit(R1BIO_BehindIO, &r1_bio->state));
+ spin_lock_irqsave(&conf->device_lock, flags);
+ bio_list_merge(&conf->pending_bio_list, &bl);
++ conf->pending_count += bl_count;
+ bio_list_init(&bl);
+
+ blk_plug_device(mddev->queue);
+@@ -2040,6 +2056,7 @@
+ init_waitqueue_head(&conf->wait_barrier);
+
+ bio_list_init(&conf->pending_bio_list);
++ conf->pending_count = 0;
+ bio_list_init(&conf->flushing_bio_list);
+
+
diff --git a/tests/linux/raid5/orig b/tests/linux/raid5/orig
new file mode 100644
index 0000000..40204c9
--- /dev/null
+++ b/tests/linux/raid5/orig
@@ -0,0 +1,2079 @@
+/*
+ * raid5.c : Multiple Devices driver for Linux
+ * Copyright (C) 1996, 1997 Ingo Molnar, Miguel de Icaza, Gadi Oxman
+ * Copyright (C) 1999, 2000 Ingo Molnar
+ *
+ * RAID-5 management functions.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ *
+ * You should have received a copy of the GNU General Public License
+ * (for example /usr/src/linux/COPYING); if not, write to the Free
+ * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/locks.h>
+#include <linux/slab.h>
+#include <linux/raid/raid5.h>
+#include <asm/bitops.h>
+#include <asm/atomic.h>
+
+/*
+ * Stripe cache
+ */
+
+#define NR_STRIPES 256
+#define STRIPE_SIZE PAGE_SIZE
+#define STRIPE_SECTORS (STRIPE_SIZE>>9)
+#define IO_THRESHOLD 1
+#define HASH_PAGES 1
+#define HASH_PAGES_ORDER 0
+#define NR_HASH (HASH_PAGES * PAGE_SIZE / sizeof(struct stripe_head *))
+#define HASH_MASK (NR_HASH - 1)
+#define stripe_hash(conf, sect) ((conf)->stripe_hashtbl[((sect) / STRIPE_SECTORS) & HASH_MASK])
+
+/*
+ * The following can be used to debug the driver
+ */
+#define RAID5_DEBUG 0
+#define RAID5_PARANOIA 1
+#if RAID5_PARANOIA && CONFIG_SMP
+# define CHECK_DEVLOCK() if (!spin_is_locked(&conf->device_lock)) BUG()
+#else
+# define CHECK_DEVLOCK()
+#endif
+
+#if RAID5_DEBUG
+#define PRINTK(x...) printk(x)
+#define inline
+#define __inline__
+#else
+#define PRINTK(x...) do { } while (0)
+#endif
+
+static void print_raid5_conf (raid5_conf_t *conf);
+
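+/* Drop one reference to a stripe.  When the last reference goes, the
+ * stripe is queued for handling (handle_list or delayed_list) if work
+ * is pending, otherwise it returns to the inactive list.  Called with
+ * conf->device_lock held; release_stripe() below is the locked wrapper.
+ */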
+static inline void __release_stripe(raid5_conf_t *conf, struct stripe_head *sh)
+{
+ if (atomic_dec_and_test(&sh->count)) {
+ if (!list_empty(&sh->lru))
+ BUG();
+ if (atomic_read(&conf->active_stripes)==0)
+ BUG();
+ if (test_bit(STRIPE_HANDLE, &sh->state)) {
+ if (test_bit(STRIPE_DELAYED, &sh->state))
+ list_add_tail(&sh->lru, &conf->delayed_list);
+ else
+ list_add_tail(&sh->lru, &conf->handle_list);
+ md_wakeup_thread(conf->thread);
+ } else {
+ if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
+ atomic_dec(&conf->preread_active_stripes);
+ if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD)
+ md_wakeup_thread(conf->thread);
+ }
+ list_add_tail(&sh->lru, &conf->inactive_list);
+ atomic_dec(&conf->active_stripes);
+ if (!conf->inactive_blocked ||
+ atomic_read(&conf->active_stripes) < (NR_STRIPES*3/4))
+ wake_up(&conf->wait_for_stripe);
+ }
+ }
+}
+static void release_stripe(struct stripe_head *sh)
+{
+ raid5_conf_t *conf = sh->raid_conf;
+ unsigned long flags;
+
+ spin_lock_irqsave(&conf->device_lock, flags);
+ __release_stripe(conf, sh);
+ spin_unlock_irqrestore(&conf->device_lock, flags);
+}
+
+static void remove_hash(struct stripe_head *sh)
+{
+ PRINTK("remove_hash(), stripe %lu\n", sh->sector);
+
+ if (sh->hash_pprev) {
+ if (sh->hash_next)
+ sh->hash_next->hash_pprev = sh->hash_pprev;
+ *sh->hash_pprev = sh->hash_next;
+ sh->hash_pprev = NULL;
+ }
+}
+
+static __inline__ void insert_hash(raid5_conf_t *conf, struct stripe_head *sh)
+{
+ struct stripe_head **shp = &stripe_hash(conf, sh->sector);
+
+ PRINTK("insert_hash(), stripe %lu\n",sh->sector);
+
+ CHECK_DEVLOCK();
+ if ((sh->hash_next = *shp) != NULL)
+ (*shp)->hash_pprev = &sh->hash_next;
+ *shp = sh;
+ sh->hash_pprev = shp;
+}
+
+
+/* find an idle stripe, make sure it is unhashed, and return it. */
+static struct stripe_head *get_free_stripe(raid5_conf_t *conf)
+{
+ struct stripe_head *sh = NULL;
+ struct list_head *first;
+
+ CHECK_DEVLOCK();
+ if (list_empty(&conf->inactive_list))
+ goto out;
+ first = conf->inactive_list.next;
+ sh = list_entry(first, struct stripe_head, lru);
+ list_del_init(first);
+ remove_hash(sh);
+ atomic_inc(&conf->active_stripes);
+out:
+ return sh;
+}
+
+static void shrink_buffers(struct stripe_head *sh, int num)
+{
+ struct buffer_head *bh;
+ int i;
+
+ for (i=0; i<num ; i++) {
+ bh = sh->bh_cache[i];
+ if (!bh)
+ return;
+ sh->bh_cache[i] = NULL;
+ free_page((unsigned long) bh->b_data);
+ kfree(bh);
+ }
+}
+
+static int grow_buffers(struct stripe_head *sh, int num, int b_size, int priority)
+{
+ struct buffer_head *bh;
+ int i;
+
+ for (i=0; i<num; i++) {
+ struct page *page;
+ bh = kmalloc(sizeof(struct buffer_head), priority);
+ if (!bh)
+ return 1;
+ memset(bh, 0, sizeof (struct buffer_head));
+ init_waitqueue_head(&bh->b_wait);
+ if ((page = alloc_page(priority)))
+ bh->b_data = page_address(page);
+ else {
+ kfree(bh);
+ return 1;
+ }
+ atomic_set(&bh->b_count, 0);
+ bh->b_page = page;
+ sh->bh_cache[i] = bh;
+
+ }
+ return 0;
+}
+
+static struct buffer_head *raid5_build_block (struct stripe_head *sh, int i);
+
+static inline void init_stripe(struct stripe_head *sh, unsigned long sector)
+{
+ raid5_conf_t *conf = sh->raid_conf;
+ int disks = conf->raid_disks, i;
+
+ if (atomic_read(&sh->count) != 0)
+ BUG();
+ if (test_bit(STRIPE_HANDLE, &sh->state))
+ BUG();
+
+ CHECK_DEVLOCK();
+ PRINTK("init_stripe called, stripe %lu\n", sh->sector);
+
+ remove_hash(sh);
+
+ sh->sector = sector;
+ sh->size = conf->buffer_size;
+ sh->state = 0;
+
+ for (i=disks; i--; ) {
+ if (sh->bh_read[i] || sh->bh_write[i] || sh->bh_written[i] ||
+ buffer_locked(sh->bh_cache[i])) {
+ printk("sector=%lx i=%d %p %p %p %d\n",
+ sh->sector, i, sh->bh_read[i],
+ sh->bh_write[i], sh->bh_written[i],
+ buffer_locked(sh->bh_cache[i]));
+ BUG();
+ }
+ clear_bit(BH_Uptodate, &sh->bh_cache[i]->b_state);
+ raid5_build_block(sh, i);
+ }
+ insert_hash(conf, sh);
+}
+
+/* the buffer size has changed, so unhash all stripes;
+ * as active stripes complete, they will go onto the inactive list
+ */
+static void shrink_stripe_cache(raid5_conf_t *conf)
+{
+ int i;
+ CHECK_DEVLOCK();
+ if (atomic_read(&conf->active_stripes))
+ BUG();
+ for (i=0; i < NR_HASH; i++) {
+ struct stripe_head *sh;
+ while ((sh = conf->stripe_hashtbl[i]))
+ remove_hash(sh);
+ }
+}
+
+static struct stripe_head *__find_stripe(raid5_conf_t *conf, unsigned long sector)
+{
+ struct stripe_head *sh;
+
+ CHECK_DEVLOCK();
+ PRINTK("__find_stripe, sector %lu\n", sector);
+ for (sh = stripe_hash(conf, sector); sh; sh = sh->hash_next)
+ if (sh->sector == sector)
+ return sh;
+ PRINTK("__stripe %lu not in cache\n", sector);
+ return NULL;
+}
+
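+/* Find the stripe_head covering 'sector' in the cache, or claim a free
+ * one and (re)initialise it.  May sleep waiting for an inactive stripe;
+ * if 'noblock' is set it returns NULL instead of sleeping.
+ */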
+static struct stripe_head *get_active_stripe(raid5_conf_t *conf, unsigned long sector,
+ int pd_idx, int noblock)
+{
+ struct stripe_head *sh;
+
+ PRINTK("get_stripe, sector %lu\n", sector);
+
+ spin_lock_irq(&conf->device_lock);
+
+ do {
+ sh = __find_stripe(conf, sector);
+ if (!sh) {
+ if (!conf->inactive_blocked)
+ sh = get_free_stripe(conf);
+ if (noblock && sh == NULL)
+ break;
+ if (!sh) {
+ conf->inactive_blocked = 1;
+ wait_event_lock_irq(conf->wait_for_stripe,
+ !list_empty(&conf->inactive_list) &&
+ (atomic_read(&conf->active_stripes) < (NR_STRIPES *3/4)
+ || !conf->inactive_blocked),
+ conf->device_lock);
+ conf->inactive_blocked = 0;
+ } else
+ init_stripe(sh, sector, pd_idx);
+ } else {
+ if (atomic_read(&sh->count)) {
+ if (!list_empty(&sh->lru))
+ BUG();
+ } else {
+ if (!test_bit(STRIPE_HANDLE, &sh->state))
+ atomic_inc(&conf->active_stripes);
+ if (list_empty(&sh->lru))
+ BUG();
+ list_del_init(&sh->lru);
+ }
+ }
+ } while (sh == NULL);
+
+ if (sh)
+ atomic_inc(&sh->count);
+
+ spin_unlock_irq(&conf->device_lock);
+ return sh;
+}
+
+static int grow_stripes(raid5_conf_t *conf, int num)
+{
+ struct stripe_head *sh;
+ kmem_cache_t *sc;
+ int devs = conf->raid_disks;
+
+ sprintf(conf->cache_name, "md/raid5-%d", conf->mddev->__minor);
+
+ sc = kmem_cache_create(conf->cache_name,
+ sizeof(struct stripe_head)+(devs-1)*sizeof(struct r5dev),
+ 0, 0, NULL, NULL);
+ if (!sc)
+ return 1;
+ conf->slab_cache = sc;
+ while (num--) {
+ sh = kmem_cache_alloc(sc, GFP_KERNEL);
+ if (!sh)
+ return 1;
+ memset(sh, 0, sizeof(*sh) + (devs-1)*sizeof(struct r5dev));
+ sh->raid_conf = conf;
+ sh->lock = SPIN_LOCK_UNLOCKED;
+
+ if (grow_buffers(sh, conf->raid_disks)) {
+ shrink_buffers(sh, conf->raid_disks);
+ kmem_cache_free(sc, sh);
+ return 1;
+ }
+ /* we just created an active stripe so... */
+ atomic_set(&sh->count, 1);
+ atomic_inc(&conf->active_stripes);
+ INIT_LIST_HEAD(&sh->lru);
+ release_stripe(sh);
+ }
+ return 0;
+}
+
+static void shrink_stripes(raid5_conf_t *conf)
+{
+ struct stripe_head *sh;
+
+ while (1) {
+ spin_lock_irq(&conf->device_lock);
+ sh = get_free_stripe(conf);
+ spin_unlock_irq(&conf->device_lock);
+ if (!sh)
+ break;
+ if (atomic_read(&sh->count))
+ BUG();
+ shrink_buffers(sh, conf->raid_disks);
+ kmem_cache_free(conf->slab_cache, sh);
+ atomic_dec(&conf->active_stripes);
+ }
+ kmem_cache_destroy(conf->slab_cache);
+ conf->slab_cache = NULL;
+}
+
+static void raid5_end_read_request (struct bio * bi)
+{
+ struct stripe_head *sh = bi->bi_private;
+ raid5_conf_t *conf = sh->raid_conf;
+ int disks = conf->raid_disks, i;
+ int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags);
+
+ for (i=0 ; i<disks; i++)
+ if (bi == &sh->dev[i].req)
+ break;
+
+ PRINTK("end_read_request %lu/%d, count: %d, uptodate %d.\n", sh->sector, i, atomic_read(&sh->count), uptodate);
+ if (i == disks) {
+ BUG();
+ return;
+ }
+
+ if (uptodate) {
+#if 0
+ struct bio *bio;
+ unsigned long flags;
+ spin_lock_irqsave(&conf->device_lock, flags);
+ /* we can return a buffer if we bypassed the cache or
+ * if the top buffer is not in highmem. If there are
+ * multiple buffers, leave the extra work to
+ * handle_stripe
+ */
+ buffer = sh->bh_read[i];
+ if (buffer &&
+ (!PageHighMem(buffer->b_page)
+ || buffer->b_page == bh->b_page )
+ ) {
+ sh->bh_read[i] = buffer->b_reqnext;
+ buffer->b_reqnext = NULL;
+ } else
+ buffer = NULL;
+ spin_unlock_irqrestore(&conf->device_lock, flags);
+ if (sh->bh_page[i]==NULL)
+ set_bit(BH_Uptodate, &bh->b_state);
+ if (buffer) {
+ if (buffer->b_page != bh->b_page)
+ memcpy(buffer->b_data, bh->b_data, bh->b_size);
+ buffer->b_end_io(buffer, 1);
+ }
+ } else {
+ md_error(conf->mddev, bh->b_dev);
+ clear_bit(BH_Uptodate, &bh->b_state);
+ }
+ /* must restore b_page before unlocking buffer... */
+ if (sh->bh_page[i]) {
+ bh->b_page = sh->bh_page[i];
+ bh->b_data = page_address(bh->b_page);
+ sh->bh_page[i] = NULL;
+ clear_bit(BH_Uptodate, &bh->b_state);
+ }
+ clear_bit(BH_Lock, &bh->b_state);
+ set_bit(STRIPE_HANDLE, &sh->state);
+ release_stripe(sh);
+}
+
+static void raid5_end_write_request (struct buffer_head *bh, int uptodate)
+{
+ struct stripe_head *sh = bh->b_private;
+ raid5_conf_t *conf = sh->raid_conf;
+ int disks = conf->raid_disks, i;
+ unsigned long flags;
+
+ for (i=0 ; i<disks; i++)
+ if (bh == sh->bh_cache[i])
+ break;
+
+ PRINTK("end_write_request %lu/%d, count %d, uptodate: %d.\n", sh->sector, i, atomic_read(&sh->count), uptodate);
+ if (i == disks) {
+ BUG();
+ return;
+ }
+
+ spin_lock_irqsave(&conf->device_lock, flags);
+ if (!uptodate)
+ md_error(conf->mddev, bh->b_dev);
+ clear_bit(BH_Lock, &bh->b_state);
+ set_bit(STRIPE_HANDLE, &sh->state);
+ __release_stripe(conf, sh);
+ spin_unlock_irqrestore(&conf->device_lock, flags);
+}
+
+
+
+static struct buffer_head *raid5_build_block (struct stripe_head *sh, int i)
+{
+ raid5_conf_t *conf = sh->raid_conf;
+ struct buffer_head *bh = sh->bh_cache[i];
+ unsigned long block = sh->sector / (sh->size >> 9);
+
+ init_buffer(bh, raid5_end_read_request, sh);
+ bh->b_dev = conf->disks[i].dev;
+ bh->b_blocknr = block;
+
+ bh->b_state = (1 << BH_Req) | (1 << BH_Mapped);
+ bh->b_size = sh->size;
+ bh->b_list = BUF_LOCKED;
+ return bh;
+}
+
+static int error (mddev_t *mddev, kdev_t dev)
+{
+ raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
+ mdp_super_t *sb = mddev->sb;
+ struct disk_info *disk;
+ int i;
+
+ PRINTK("raid5: error called\n");
+
+ for (i = 0, disk = conf->disks; i < conf->raid_disks; i++, disk++) {
+ if (disk->dev == dev) {
+ if (disk->operational) {
+ disk->operational = 0;
+ mark_disk_faulty(sb->disks+disk->number);
+ mark_disk_nonsync(sb->disks+disk->number);
+ mark_disk_inactive(sb->disks+disk->number);
+ sb->active_disks--;
+ sb->working_disks--;
+ sb->failed_disks++;
+ mddev->sb_dirty = 1;
+ conf->working_disks--;
+ conf->failed_disks++;
+ md_wakeup_thread(conf->thread);
+ printk (KERN_ALERT
+ "raid5: Disk failure on %s, disabling device."
+ " Operation continuing on %d devices\n",
+ partition_name (dev), conf->working_disks);
+ }
+ return 0;
+ }
+ }
+ /*
+ * handle errors in spares (during reconstruction)
+ */
+ if (conf->spare) {
+ disk = conf->spare;
+ if (disk->dev == dev) {
+ printk (KERN_ALERT
+ "raid5: Disk failure on spare %s\n",
+ partition_name (dev));
+ if (!conf->spare->operational) {
+ /* probably a SET_DISK_FAULTY ioctl */
+ return -EIO;
+ }
+ disk->operational = 0;
+ disk->write_only = 0;
+ conf->spare = NULL;
+ mark_disk_faulty(sb->disks+disk->number);
+ mark_disk_nonsync(sb->disks+disk->number);
+ mark_disk_inactive(sb->disks+disk->number);
+ sb->spare_disks--;
+ sb->working_disks--;
+ sb->failed_disks++;
+
+ mddev->sb_dirty = 1;
+ md_wakeup_thread(conf->thread);
+
+ return 0;
+ }
+ }
+ MD_BUG();
+ return -EIO;
+}
+
+/*
+ * Input: a 'big' sector number,
+ * Output: index of the data and parity disk, and the sector # in them.
+ */
+static unsigned long raid5_compute_sector(sector_t r_sector, unsigned int raid_disks,
+ unsigned int data_disks, unsigned int * dd_idx,
+ unsigned int * pd_idx, raid5_conf_t *conf)
+{
+ sector_t stripe;
+ unsigned long chunk_number;
+ unsigned int chunk_offset;
+ sector_t new_sector;
+ int sectors_per_chunk = conf->chunk_size >> 9;
+
+ /* First compute the information on this sector */
+
+ /*
+ * Compute the chunk number and the sector offset inside the chunk
+ */
+ chunk_number = r_sector / sectors_per_chunk;
+ chunk_offset = r_sector % sectors_per_chunk;
+
+ /*
+ * Compute the stripe number
+ */
+ stripe = chunk_number / data_disks;
+
+ /*
+ * Compute the data disk and parity disk indexes inside the stripe
+ */
+ *dd_idx = chunk_number % data_disks;
+
+ /*
+ * Select the parity disk based on the user selected algorithm.
+ */
+ if (conf->level == 4)
+ *pd_idx = data_disks;
+ else switch (conf->algorithm) {
+ case ALGORITHM_LEFT_ASYMMETRIC:
+ *pd_idx = data_disks - stripe % raid_disks;
+ if (*dd_idx >= *pd_idx)
+ (*dd_idx)++;
+ break;
+ case ALGORITHM_RIGHT_ASYMMETRIC:
+ *pd_idx = stripe % raid_disks;
+ if (*dd_idx >= *pd_idx)
+ (*dd_idx)++;
+ break;
+ case ALGORITHM_LEFT_SYMMETRIC:
+ *pd_idx = data_disks - stripe % raid_disks;
+ *dd_idx = (*pd_idx + 1 + *dd_idx) % raid_disks;
+ break;
+ case ALGORITHM_RIGHT_SYMMETRIC:
+ *pd_idx = stripe % raid_disks;
+ *dd_idx = (*pd_idx + 1 + *dd_idx) % raid_disks;
+ break;
+ default:
+ printk ("raid5: unsupported algorithm %d\n", conf->algorithm);
+ }
+
+ /*
+ * Finally, compute the new sector number
+ */
+ new_sector = stripe * sectors_per_chunk + chunk_offset;
+ return new_sector;
+}
+
+
+static sector_t compute_blocknr(struct stripe_head *sh, int i)
+{
+ raid5_conf_t *conf = sh->raid_conf;
+ int raid_disks = conf->raid_disks, data_disks = raid_disks - 1;
+ sector_t new_sector = sh->sector, check;
+ int sectors_per_chunk = conf->chunk_size >> 9;
+ sector_t stripe = new_sector / sectors_per_chunk;
+ int chunk_offset = new_sector % sectors_per_chunk;
+ int chunk_number, dummy1, dummy2, dd_idx = i;
+ sector_t r_sector;
+
+ switch (conf->algorithm) {
+ case ALGORITHM_LEFT_ASYMMETRIC:
+ case ALGORITHM_RIGHT_ASYMMETRIC:
+ if (i > sh->pd_idx)
+ i--;
+ break;
+ case ALGORITHM_LEFT_SYMMETRIC:
+ case ALGORITHM_RIGHT_SYMMETRIC:
+ if (i < sh->pd_idx)
+ i += raid_disks;
+ i -= (sh->pd_idx + 1);
+ break;
+ default:
+ printk ("raid5: unsupported algorithm %d\n", conf->algorithm);
+ }
+
+ chunk_number = stripe * data_disks + i;
+ r_sector = chunk_number * sectors_per_chunk + chunk_offset;
+
+ check = raid5_compute_sector (r_sector, raid_disks, data_disks, &dummy1, &dummy2, conf);
+ if (check != sh->sector || dummy1 != dd_idx || dummy2 != sh->pd_idx) {
+ printk("compute_blocknr: map not correct\n");
+ return 0;
+ }
+ return r_sector;
+}
+
+
+
+/*
+ * Copy data between a page in the stripe cache, and one or more bion.
+ * The page could align with the middle of the bio, or there could be
+ * several bion, each with several bio_vecs, which cover part of the page.
+ * Multiple bion are linked together on bi_next. There may be extras
+ * at the end of this list. We ignore them.
+ */
+static void copy_data(int frombio, struct bio *bio,
+ struct page *page,
+ sector_t sector)
+{
+ char *pa = page_address(page);
+ struct bio_vec *bvl;
+ int i;
+
+ for (;bio && bio->bi_sector < sector+STRIPE_SECTORS;
+ bio = bio->bi_next) {
+ int page_offset;
+ if (bio->bi_sector >= sector)
+ page_offset = (signed)(bio->bi_sector - sector) * 512;
+ else
+ page_offset = (signed)(sector - bio->bi_sector) * -512;
+ bio_for_each_segment(bvl, bio, i) {
+ char *ba = __bio_kmap(bio, i);
+ int len = bio_iovec_idx(bio,i)->bv_len;
+ int clen;
+ int b_offset = 0;
+
+ if (page_offset < 0) {
+ b_offset = -page_offset;
+ page_offset += b_offset;
+ len -= b_offset;
+ }
+
+ if (len > 0 && page_offset + len > STRIPE_SIZE)
+ clen = STRIPE_SIZE - page_offset;
+ else clen = len;
+
+ if (len > 0) {
+ if (frombio)
+ memcpy(pa+page_offset, ba+b_offset, clen);
+ else
+ memcpy(ba+b_offset, pa+page_offset, clen);
+ }
+ __bio_kunmap(bio, i);
+ page_offset += len;
+ }
+ }
+}
+
+#define check_xor() do { \
+ if (count == MAX_XOR_BLOCKS) { \
+ xor_block(count, STRIPE_SIZE, ptr); \
+ count = 1; \
+ } \
+ } while(0)
+
+
+static void compute_block(struct stripe_head *sh, int dd_idx)
+{
+ raid5_conf_t *conf = sh->raid_conf;
+ int i, count, disks = conf->raid_disks;
+ struct buffer_head *bh_ptr[MAX_XOR_BLOCKS], *bh;
+
+ PRINTK("compute_block, stripe %lu, idx %d\n", sh->sector, dd_idx);
+
+
+ memset(sh->bh_cache[dd_idx]->b_data, 0, sh->size);
+ bh_ptr[0] = sh->bh_cache[dd_idx];
+ count = 1;
+ for (i = disks ; i--; ) {
+ if (i == dd_idx)
+ continue;
+ bh = sh->bh_cache[i];
+ if (buffer_uptodate(bh))
+ bh_ptr[count++] = bh;
+ else
+ printk("compute_block() %d, stripe %lu, %d not present\n", dd_idx, sh->sector, i);
+
+ check_xor();
+ }
+ if (count != 1)
+ xor_block(count, bh_ptr);
+ set_bit(BH_Uptodate, &sh->bh_cache[dd_idx]->b_state);
+}
+
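+/* (Re)compute the parity block of a stripe.  READ_MODIFY_WRITE xors the
+ * old parity with the old and new contents of the blocks being written;
+ * RECONSTRUCT_WRITE rebuilds parity from all data blocks; CHECK_PARITY
+ * xors every data block into the parity buffer so the result can be
+ * verified (it should be all zeroes if parity was correct).
+ */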
+static void compute_parity(struct stripe_head *sh, int method)
+{
+ raid5_conf_t *conf = sh->raid_conf;
+ int i, pd_idx = sh->pd_idx, disks = conf->raid_disks, count;
+ struct buffer_head *bh_ptr[MAX_XOR_BLOCKS];
+ struct buffer_head *chosen[MD_SB_DISKS];
+
+ PRINTK("compute_parity, stripe %lu, method %d\n", sh->sector, method);
+ memset(chosen, 0, sizeof(chosen));
+
+ count = 1;
+ bh_ptr[0] = sh->bh_cache[pd_idx];
+ switch(method) {
+ case READ_MODIFY_WRITE:
+ if (!buffer_uptodate(sh->bh_cache[pd_idx]))
+ BUG();
+ for (i=disks ; i-- ;) {
+ if (i==pd_idx)
+ continue;
+ if (sh->bh_write[i] &&
+ buffer_uptodate(sh->bh_cache[i])) {
+ bh_ptr[count++] = sh->bh_cache[i];
+ chosen[i] = sh->bh_write[i];
+ sh->bh_write[i] = sh->bh_write[i]->b_reqnext;
+ chosen[i]->b_reqnext = sh->bh_written[i];
+ sh->bh_written[i] = chosen[i];
+ check_xor();
+ }
+ }
+ break;
+ case RECONSTRUCT_WRITE:
+ memset(sh->bh_cache[pd_idx]->b_data, 0, sh->size);
+ for (i= disks; i-- ;)
+ if (i!=pd_idx && sh->bh_write[i]) {
+ chosen[i] = sh->bh_write[i];
+ sh->bh_write[i] = sh->bh_write[i]->b_reqnext;
+ chosen[i]->b_reqnext = sh->bh_written[i];
+ sh->bh_written[i] = chosen[i];
+ }
+ break;
+ case CHECK_PARITY:
+ break;
+ }
+ if (count>1) {
+ xor_block(count, bh_ptr);
+ count = 1;
+ }
+
+ for (i = disks; i--;)
+ if (chosen[i]) {
+ struct buffer_head *bh = sh->bh_cache[i];
+ char *bdata;
+ bdata = bh_kmap(chosen[i]);
+ memcpy(bh->b_data,
+ bdata,sh->size);
+ bh_kunmap(chosen[i]);
+ set_bit(BH_Lock, &bh->b_state);
+ mark_buffer_uptodate(bh, 1);
+ }
+
+ switch(method) {
+ case RECONSTRUCT_WRITE:
+ case CHECK_PARITY:
+ for (i=disks; i--;)
+ if (i != pd_idx) {
+ bh_ptr[count++] = sh->bh_cache[i];
+ check_xor();
+ }
+ break;
+ case READ_MODIFY_WRITE:
+ for (i = disks; i--;)
+ if (chosen[i]) {
+ bh_ptr[count++] = sh->bh_cache[i];
+ check_xor();
+ }
+ }
+ if (count != 1)
+ xor_block(count, bh_ptr);
+
+ if (method != CHECK_PARITY) {
+ mark_buffer_uptodate(sh->bh_cache[pd_idx], 1);
+ set_bit(BH_Lock, &sh->bh_cache[pd_idx]->b_state);
+ } else
+ mark_buffer_uptodate(sh->bh_cache[pd_idx], 0);
+}
+
+static void add_stripe_bh (struct stripe_head *sh, struct buffer_head *bh, int dd_idx, int rw)
+{
+ struct buffer_head **bhp;
+ raid5_conf_t *conf = sh->raid_conf;
+
+ PRINTK("adding bh b#%lu to stripe s#%lu\n", bh->b_blocknr, sh->sector);
+
+
+ spin_lock(&sh->lock);
+ spin_lock_irq(&conf->device_lock);
+ bh->b_reqnext = NULL;
+ if (rw == READ)
+ bhp = &sh->bh_read[dd_idx];
+ else
+ bhp = &sh->bh_write[dd_idx];
+ while (*bhp) {
+ printk(KERN_NOTICE "raid5: multiple %d requests for sector %ld\n", rw, sh->sector);
+ bhp = & (*bhp)->b_reqnext;
+ }
+ *bhp = bh;
+ spin_unlock_irq(&conf->device_lock);
+ spin_unlock(&sh->lock);
+
+ PRINTK("added bh b#%lu to stripe s#%lu, disk %d.\n", bh->b_blocknr, sh->sector, dd_idx);
+}
+
+
+
+
+
+/*
+ * handle_stripe - do things to a stripe.
+ *
+ * We lock the stripe and then examine the state of various bits
+ * to see what needs to be done.
+ * Possible results:
+ *    return some read requests which now have data
+ * return some write requests which are safely on disc
+ * schedule a read on some buffers
+ * schedule a write of some buffers
+ * return confirmation of parity correctness
+ *
+ * Parity calculations are done inside the stripe lock.
+ * Buffers are taken off read_list or write_list, and bh_cache buffers
+ * get BH_Lock set before the stripe lock is released.
+ *
+ */
+
+static void handle_stripe(struct stripe_head *sh)
+{
+ raid5_conf_t *conf = sh->raid_conf;
+ int disks = conf->raid_disks;
+ struct bio *return_bi= NULL;
+ struct bio *bi;
+ int action[MD_SB_DISKS];
+ int i;
+ int syncing;
+ int locked=0, uptodate=0, to_read=0, to_write=0, failed=0, written=0;
+ int failed_num=0;
+ struct r5dev *dev;
+
+ PRINTK("handling stripe %ld, cnt=%d, pd_idx=%d\n", sh->sector, atomic_read(&sh->count), sh->pd_idx);
+ memset(action, 0, sizeof(action));
+
+ spin_lock(&sh->lock);
+ clear_bit(STRIPE_HANDLE, &sh->state);
+ clear_bit(STRIPE_DELAYED, &sh->state);
+
+ syncing = test_bit(STRIPE_SYNCING, &sh->state);
+ /* Now to look around and see what can be done */
+
+ for (i=disks; i--; ) {
+ dev = &sh->dev[i];
+ PRINTK("check %d: state 0x%lx read %p write %p written %p\n", i,
+ dev->flags, dev->toread, dev->towrite, dev->written);
+ /* maybe we can reply to a read */
+ if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread) {
+ struct bio *rbi, *rbi2;
+ PRINTK("Return read for disc %d\n", i);
+ spin_lock_irq(&conf->device_lock);
+ rbi = dev->toread;
+ dev->toread = NULL;
+ spin_unlock_irq(&conf->device_lock);
+ while (rbi && rbi->bi_sector < dev->sector + STRIPE_SECTORS) {
+ copy_data(0, rbi, dev->page, dev->sector);
+ rbi2 = rbi->bi_next;
+ spin_lock_irq(&conf->device_lock);
+ if (--rbi->bi_phys_segments == 0) {
+ rbi->bi_next = return_bi;
+ return_bi = rbi;
+ }
+ spin_unlock_irq(&conf->device_lock);
+ rbi = rbi2;
+ }
+ }
+
+ /* now count some things */
+ if (test_bit(R5_LOCKED, &dev->flags)) locked++;
+ if (test_bit(R5_UPTODATE, &dev->flags)) uptodate++;
+
+
+ if (dev->toread) to_read++;
+ if (dev->towrite) to_write++;
+ if (dev->written) written++;
+ if (!conf->disks[i].operational) {
+ failed++;
+ failed_num = i;
+ }
+ }
+ PRINTK("locked=%d uptodate=%d to_read=%d to_write=%d failed=%d failed_num=%d\n",
+ locked, uptodate, to_read, to_write, failed, failed_num);
+ /* check if the array has lost two devices and, if so, some requests might
+ * need to be failed
+ */
+ if (failed > 1 && to_read+to_write) {
+ spin_lock_irq(&conf->device_lock);
+ for (i=disks; i--; ) {
+ /* fail all writes first */
+ bi = sh->dev[i].towrite;
+ sh->dev[i].towrite = NULL;
+ if (bi) to_write--;
+
+ while (bi && bi->bi_sector < sh->dev[i].sector + STRIPE_SECTORS){
+ struct bio *nextbi = bi->bi_next;
+ clear_bit(BIO_UPTODATE, &bi->bi_flags);
+ if (--bi->bi_phys_segments == 0) {
+ bi->bi_next = return_bi;
+ return_bi = bi;
+ }
+ bi = nextbi;
+ }
+ /* fail any reads if this device is non-operational */
+ if (!conf->disks[i].operational) {
+ bi = sh->dev[i].toread;
+ sh->dev[i].toread = NULL;
+ if (bi) to_read--;
+ while (bi && bi->bi_sector < sh->dev[i].sector + STRIPE_SECTORS){
+ struct bio *nextbi = bi->bi_next;
+ clear_bit(BIO_UPTODATE, &bi->bi_flags);
+ if (--bi->bi_phys_segments == 0) {
+ bi->bi_next = return_bi;
+ return_bi = bi;
+ }
+ bi = nextbi;
+ }
+ }
+ }
+ spin_unlock_irq(&conf->device_lock);
+ }
+ if (failed > 1 && syncing) {
+ md_done_sync(conf->mddev, STRIPE_SECTORS,0);
+ clear_bit(STRIPE_SYNCING, &sh->state);
+ syncing = 0;
+ }
+
+ /* might be able to return some write requests if the parity block
+ * is safe, or on a failed drive
+ */
+ dev = &sh->dev[sh->pd_idx];
+ if ( written &&
+ ( (conf->disks[sh->pd_idx].operational && !test_bit(R5_LOCKED, &dev->flags) &&
+ test_bit(R5_UPTODATE, &dev->flags))
+ || (failed == 1 && failed_num == sh->pd_idx))
+ ) {
+ /* any written block on an uptodate or failed drive can be returned */
+ for (i=disks; i--; )
+ if (sh->dev[i].written) {
+ dev = &sh->dev[i];
+ if (!conf->disks[sh->pd_idx].operational ||
+ (!test_bit(R5_LOCKED, &dev->flags) && test_bit(R5_UPTODATE, &dev->flags)) ) {
+ /* maybe we can return some write requests */
+ struct bio *wbi, *wbi2;
+ PRINTK("Return write for disc %d\n", i);
+ wbi = dev->written;
+ dev->written = NULL;
+ while (wbi && wbi->bi_sector < dev->sector + STRIPE_SECTORS) {
+ wbi2 = wbi->bi_next;
+ if (--wbi->bi_phys_segments == 0) {
+ wbi->bi_next = return_bi;
+ return_bi = wbi;
+ }
+ wbi = wbi2;
+ }
+ }
+ }
+ }
+
+ /* Now we might consider reading some blocks, either to check/generate
+ * parity, or to satisfy requests
+ */
+ if (to_read || (syncing && (uptodate+failed < disks))) {
+ for (i=disks; i--;) {
+ dev = &sh->dev[i];
+ if (!test_bit(R5_LOCKED, &dev->flags) && !test_bit(R5_UPTODATE, &dev->flags) &&
+ (dev->toread || syncing || (failed && sh->dev[failed_num].toread))) {
+ /* we would like to get this block, possibly
+ * by computing it, but we might not be able to
+ */
+ if (uptodate == disks-1) {
+ PRINTK("Computing block %d\n", i);
+ compute_block(sh, i);
+ uptodate++;
+ } else if (conf->disks[i].operational) {
+ set_bit(BH_Lock, &bh->b_state);
+ action[i] = READ+1;
+ /* if I am just reading this block and we don't have
+ a failed drive, or any pending writes then sidestep the cache */
+ if (sh->bh_page[i]) BUG();
+ if (sh->bh_read[i] && !sh->bh_read[i]->b_reqnext &&
+ ! syncing && !failed && !to_write) {
+ sh->bh_page[i] = sh->bh_cache[i]->b_page;
+ sh->bh_cache[i]->b_page = sh->bh_read[i]->b_page;
+ sh->bh_cache[i]->b_data = sh->bh_read[i]->b_data;
+ }
+ locked++;
+ PRINTK("Reading block %d (sync=%d)\n", i, syncing);
+ if (syncing)
+ md_sync_acct(conf->disks[i].dev, bh->b_size>>9);
+ }
+ }
+ }
+ set_bit(STRIPE_HANDLE, &sh->state);
+ }
+
+ /* now to consider writing and what else, if anything should be read */
+ if (to_write) {
+ int rmw=0, rcw=0;
+ for (i=disks ; i--;) {
+ /* would I have to read this buffer for read_modify_write */
+ dev = &sh->dev[i];
+ if ((dev->towrite || i == sh->pd_idx) &&
+ (!test_bit(R5_LOCKED, &dev->flags)
+#if 0
+|| sh->bh_page[i]!=bh->b_page
+#endif
+ ) &&
+ !test_bit(R5_UPTODATE, &dev->flags)) {
+ if (conf->disks[i].operational
+/* && !(conf->resync_parity && i == sh->pd_idx) */
+ )
+ rmw++;
+ else rmw += 2*disks; /* cannot read it */
+ }
+ /* Would I have to read this buffer for reconstruct_write */
+ if (!test_bit(R5_OVERWRITE, &dev->flags) && i != sh->pd_idx &&
+ (!test_bit(R5_LOCKED, &dev->flags)
+#if 0
+|| sh->bh_page[i] != bh->b_page
+#endif
+ ) &&
+ !test_bit(R5_UPTODATE, &dev->flags)) {
+ if (conf->disks[i].operational) rcw++;
+ else rcw += 2*disks;
+ }
+ }
+ PRINTK("for sector %ld, rmw=%d rcw=%d\n", sh->sector, rmw, rcw);
+ set_bit(STRIPE_HANDLE, &sh->state);
+ if (rmw < rcw && rmw > 0)
+ /* prefer read-modify-write, but need to get some data */
+ for (i=disks; i--;) {
+ bh = sh->bh_cache[i];
+ if ((sh->bh_write[i] || i == sh->pd_idx) &&
+ !buffer_locked(bh) && !buffer_uptodate(bh) &&
+ conf->disks[i].operational) {
+ if (test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
+ {
+ PRINTK("Read_old block %d for r-m-w\n", i);
+ set_bit(BH_Lock, &bh->b_state);
+ action[i] = READ+1;
+ locked++;
+ } else {
+ set_bit(STRIPE_DELAYED, &sh->state);
+ set_bit(STRIPE_HANDLE, &sh->state);
+ }
+ }
+ }
+ if (rcw <= rmw && rcw > 0)
+ /* want reconstruct write, but need to get some data */
+ for (i=disks; i--;) {
+ bh = sh->bh_cache[i];
+ if (!sh->bh_write[i] && i != sh->pd_idx &&
+ !buffer_locked(bh) && !buffer_uptodate(bh) &&
+ conf->disks[i].operational) {
+ if (test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
+ {
+ PRINTK("Read_old block %d for Reconstruct\n", i);
+ set_bit(BH_Lock, &bh->b_state);
+ action[i] = READ+1;
+ locked++;
+ } else {
+ set_bit(STRIPE_DELAYED, &sh->state);
+ set_bit(STRIPE_HANDLE, &sh->state);
+ }
+ }
+ }
+ /* now if nothing is locked, and if we have enough data, we can start a write request */
+ if (locked == 0 && (rcw == 0 ||rmw == 0)) {
+ PRINTK("Computing parity...\n");
+ compute_parity(sh, rcw==0 ? RECONSTRUCT_WRITE : READ_MODIFY_WRITE);
+ /* now every locked buffer is ready to be written */
+ for (i=disks; i--;)
+ if (test_bit(R5_LOCKED, &sh->dev[i].flags)) {
+ PRINTK("Writing block %d\n", i);
+ locked++;
+ action[i] = WRITE+1;
+ if (!conf->disks[i].operational
+ || (i==sh->pd_idx && failed == 0))
+ set_bit(STRIPE_INSYNC, &sh->state);
+ }
+ if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
+ atomic_dec(&conf->preread_active_stripes);
+ if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD)
+ md_wakeup_thread(conf->thread);
+ }
+ }
+ }
+
+ /* maybe we need to check and possibly fix the parity for this stripe
+ * Any reads will already have been scheduled, so we just see if enough data
+ * is available
+ */
+ if (syncing && locked == 0 &&
+ !test_bit(STRIPE_INSYNC, &sh->state) && failed <= 1) {
+ set_bit(STRIPE_HANDLE, &sh->state);
+ if (failed == 0) {
+ char *pagea;
+ if (uptodate != disks)
+ BUG();
+ compute_parity(sh, CHECK_PARITY);
+ uptodate--;
+ pagea = page_address(sh->dev[sh->pd_idx].page);
+ if ((*(u32*)pagea) == 0 &&
+ !memcmp(pagea, pagea+4, STRIPE_SIZE-4)) {
+ /* parity is correct (on disc, not in buffer any more) */
+ set_bit(STRIPE_INSYNC, &sh->state);
+ }
+ }
+ if (!test_bit(STRIPE_INSYNC, &sh->state)) {
+ struct disk_info *spare;
+ if (failed==0)
+ failed_num = sh->pd_idx;
+ /* should be able to compute the missing block and write it to spare */
+ if (!test_bit(R5_UPTODATE, &sh->dev[failed_num].flags)) {
+ if (uptodate+1 != disks)
+ BUG();
+ compute_block(sh, failed_num);
+ uptodate++;
+ }
+ if (uptodate != disks)
+ BUG();
+ bh = sh->bh_cache[failed_num];
+ set_bit(BH_Lock, &bh->b_state);
+ action[failed_num] = WRITE+1;
+ locked++;
+ set_bit(STRIPE_INSYNC, &sh->state);
+ if (conf->disks[failed_num].operational)
+ md_sync_acct(conf->disks[failed_num].dev, bh->b_size>>9);
+ else if ((spare=conf->spare))
+ md_sync_acct(spare->dev, bh->b_size>>9);
+
+ }
+ }
+ if (syncing && locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) {
+ md_done_sync(conf->mddev, (sh->size>>9) - sh->sync_redone,1);
+ clear_bit(STRIPE_SYNCING, &sh->state);
+ }
+
+
+ spin_unlock(&sh->lock);
+
+ while ((bh=return_ok)) {
+ return_ok = bh->b_reqnext;
+ bh->b_reqnext = NULL;
+ bh->b_end_io(bh, 1);
+ }
+ while ((bh=return_fail)) {
+ return_fail = bh->b_reqnext;
+ bh->b_reqnext = NULL;
+ bh->b_end_io(bh, 0);
+ }
+ for (i=disks; i-- ;)
+ if (action[i]) {
+ struct buffer_head *bh = sh->bh_cache[i];
+ struct disk_info *spare = conf->spare;
+ int skip = 0;
+ if (action[i] == READ+1)
+ bh->b_end_io = raid5_end_read_request;
+ else
+ bh->b_end_io = raid5_end_write_request;
+ if (conf->disks[i].operational)
+ bh->b_dev = conf->disks[i].dev;
+ else if (spare && action[i] == WRITE+1)
+ bh->b_dev = spare->dev;
+ else skip=1;
+ if (!skip) {
+ PRINTK("for %ld schedule op %d on disc %d\n", sh->sector, action[i]-1, i);
+ atomic_inc(&sh->count);
+ bh->b_rdev = bh->b_dev;
+ bh->b_rsector = bh->b_blocknr * (bh->b_size>>9);
+ generic_make_request(action[i]-1, bh);
+ } else {
+ PRINTK("skip op %d on disc %d for sector %ld\n", action[i]-1, i, sh->sector);
+ clear_bit(BH_Lock, &bh->b_state);
+ set_bit(STRIPE_HANDLE, &sh->state);
+ }
+ }
+}
+
+static inline void raid5_activate_delayed(raid5_conf_t *conf)
+{
+ if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) {
+ while (!list_empty(&conf->delayed_list)) {
+ struct list_head *l = conf->delayed_list.next;
+ struct stripe_head *sh;
+ sh = list_entry(l, struct stripe_head, lru);
+ list_del_init(l);
+ clear_bit(STRIPE_DELAYED, &sh->state);
+ if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
+ atomic_inc(&conf->preread_active_stripes);
+ list_add_tail(&sh->lru, &conf->handle_list);
+ }
+ }
+}
+static void raid5_unplug_device(void *data)
+{
+ raid5_conf_t *conf = (raid5_conf_t *)data;
+ unsigned long flags;
+
+ spin_lock_irqsave(&conf->device_lock, flags);
+
+ raid5_activate_delayed(conf);
+
+ conf->plugged = 0;
+ md_wakeup_thread(conf->thread);
+
+ spin_unlock_irqrestore(&conf->device_lock, flags);
+}
+
+static inline void raid5_plug_device(raid5_conf_t *conf)
+{
+ spin_lock_irq(&conf->device_lock);
+ if (list_empty(&conf->delayed_list))
+ if (!conf->plugged) {
+ conf->plugged = 1;
+ queue_task(&conf->plug_tq, &tq_disk);
+ }
+ spin_unlock_irq(&conf->device_lock);
+}
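+
+/*
+ * Sketch of the plug/unplug cycle implemented above (assuming the 2.4
+ * tq_disk task-queue semantics; illustrative, not taken verbatim from
+ * the driver):
+ *
+ *	raid5_plug_device(conf);	// queues conf->plug_tq on tq_disk
+ *					// unless the device is already plugged
+ *	...				// further requests may be marked
+ *					// STRIPE_DELAYED in the meantime
+ *	run_task_queue(&tq_disk);	// block layer unplugs: this runs
+ *					// raid5_unplug_device(), which moves
+ *					// delayed stripes to handle_list and
+ *					// wakes the raid5d thread
+ */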
+
+static int make_request (mddev_t *mddev, int rw, struct bio * bi)
+{
+ raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
+ const unsigned int raid_disks = conf->raid_disks;
+ const unsigned int data_disks = raid_disks - 1;
+ unsigned int dd_idx, pd_idx;
+ sector_t new_sector;
+ sector_t logical_sector, last_sector;
+ int read_ahead = 0;
+
+ struct stripe_head *sh;
+
+ if (rw == READA) {
+ rw = READ;
+ read_ahead=1;
+ }
+
+ new_sector = raid5_compute_sector(bh->b_rsector,
+ raid_disks, data_disks, &dd_idx, &pd_idx, conf);
+
+ PRINTK("raid5: make_request, sector %lu\n", new_sector);
+ sh = get_active_stripe(conf, new_sector, bh->b_size, read_ahead);
+ if (sh) {
+ sh->pd_idx = pd_idx;
+
+ add_stripe_bh(sh, bh, dd_idx, rw);
+
+ raid5_plug_device(conf);
+ handle_stripe(sh);
+ release_stripe(sh);
+ } else
+ bh->b_end_io(bh, test_bit(BH_Uptodate, &bh->b_state));
+ return 0;
+}
+
+/*
+ * Determine correct block size for this device.
+ */
+unsigned int device_bsize (kdev_t dev)
+{
+ unsigned int i, correct_size;
+
+ correct_size = BLOCK_SIZE;
+ if (blksize_size[MAJOR(dev)]) {
+ i = blksize_size[MAJOR(dev)][MINOR(dev)];
+ if (i)
+ correct_size = i;
+ }
+
+ return correct_size;
+}
+
+static int sync_request (mddev_t *mddev, unsigned long sector_nr)
+{
+ raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
+ struct stripe_head *sh;
+ int sectors_per_chunk = conf->chunk_size >> 9;
+ unsigned long stripe = sector_nr/sectors_per_chunk;
+ int chunk_offset = sector_nr % sectors_per_chunk;
+ int dd_idx, pd_idx;
+ unsigned long first_sector;
+ int raid_disks = conf->raid_disks;
+ int data_disks = raid_disks-1;
+
+ first_sector = raid5_compute_sector(stripe*data_disks*sectors_per_chunk
+ + chunk_offset, raid_disks, data_disks, &dd_idx, &pd_idx, conf);
+ sh = get_active_stripe(conf, sector_nr, pd_idx, 0);
+ spin_lock(&sh->lock);
+ set_bit(STRIPE_SYNCING, &sh->state);
+ clear_bit(STRIPE_INSYNC, &sh->state);
+ spin_unlock(&sh->lock);
+
+ handle_stripe(sh);
+ release_stripe(sh);
+
+ return STRIPE_SECTORS;
+}
+
+/*
+ * This is our raid5 kernel thread.
+ *
+ * We scan the hash table for stripes which can be handled now.
+ * During the scan, completed stripes are saved for us by the interrupt
+ * handler, so that they will not have to wait for our next wakeup.
+ */
+static void raid5d (void *data)
+{
+ struct stripe_head *sh;
+ raid5_conf_t *conf = data;
+ mddev_t *mddev = conf->mddev;
+ int handled;
+
+ PRINTK("+++ raid5d active\n");
+
+ handled = 0;
+
+ if (mddev->sb_dirty)
+ md_update_sb(mddev);
+ spin_lock_irq(&conf->device_lock);
+ while (1) {
+ struct list_head *first;
+
+ if (list_empty(&conf->handle_list) &&
+ atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD &&
+ !conf->plugged &&
+ !list_empty(&conf->delayed_list))
+ raid5_activate_delayed(conf);
+
+ if (list_empty(&conf->handle_list))
+ break;
+
+ first = conf->handle_list.next;
+ sh = list_entry(first, struct stripe_head, lru);
+
+ list_del_init(first);
+ atomic_inc(&sh->count);
+ if (atomic_read(&sh->count)!= 1)
+ BUG();
+ spin_unlock_irq(&conf->device_lock);
+
+ handled++;
+ handle_stripe(sh);
+ release_stripe(sh);
+
+ spin_lock_irq(&conf->device_lock);
+ }
+ PRINTK("%d stripes handled\n", handled);
+
+ spin_unlock_irq(&conf->device_lock);
+
+ PRINTK("--- raid5d inactive\n");
+}
+
+/*
+ * Private kernel thread for parity reconstruction after an unclean
+ * shutdown. Reconstruction on spare drives in case of a failed drive
+ * is done by the generic mdsyncd.
+ */
+static void raid5syncd (void *data)
+{
+ raid5_conf_t *conf = data;
+ mddev_t *mddev = conf->mddev;
+
+ if (!conf->resync_parity)
+ return;
+ if (conf->resync_parity == 2)
+ return;
+ down(&mddev->recovery_sem);
+ if (md_do_sync(mddev,NULL)) {
+ up(&mddev->recovery_sem);
+ printk("raid5: resync aborted!\n");
+ return;
+ }
+ conf->resync_parity = 0;
+ up(&mddev->recovery_sem);
+ printk("raid5: resync finished.\n");
+}
+
+static int run (mddev_t *mddev)
+{
+ raid5_conf_t *conf;
+ int i, j, raid_disk, memory;
+ mdp_super_t *sb = mddev->sb;
+ mdp_disk_t *desc;
+ mdk_rdev_t *rdev;
+ struct disk_info *disk;
+ struct list_head *tmp;
+ int start_recovery = 0;
+
+ MOD_INC_USE_COUNT;
+
+ if (sb->level != 5 && sb->level != 4) {
+ printk("raid5: md%d: raid level not set to 4/5 (%d)\n", mdidx(mddev), sb->level);
+ MOD_DEC_USE_COUNT;
+ return -EIO;
+ }
+
+ mddev->private = kmalloc (sizeof (raid5_conf_t), GFP_KERNEL);
+ if ((conf = mddev->private) == NULL)
+ goto abort;
+ memset (conf, 0, sizeof (*conf));
+ conf->mddev = mddev;
+
+ if ((conf->stripe_hashtbl = (struct stripe_head **) __get_free_pages(GFP_ATOMIC, HASH_PAGES_ORDER)) == NULL)
+ goto abort;
+ memset(conf->stripe_hashtbl, 0, HASH_PAGES * PAGE_SIZE);
+
+ conf->device_lock = SPIN_LOCK_UNLOCKED;
+ init_waitqueue_head(&conf->wait_for_stripe);
+ INIT_LIST_HEAD(&conf->handle_list);
+ INIT_LIST_HEAD(&conf->delayed_list);
+ INIT_LIST_HEAD(&conf->inactive_list);
+ atomic_set(&conf->active_stripes, 0);
+ atomic_set(&conf->preread_active_stripes, 0);
+
+ conf->plugged = 0;
+ conf->plug_tq.sync = 0;
+ conf->plug_tq.routine = &raid5_unplug_device;
+ conf->plug_tq.data = conf;
+
+ PRINTK("raid5: run(md%d) called.\n", mdidx(mddev));
+
+ ITERATE_RDEV(mddev,rdev,tmp) {
+ /*
+ * This is important -- we are using the descriptor on
+ * the disk only to get a pointer to the descriptor on
+ * the main superblock, which might be more recent.
+ */
+ desc = sb->disks + rdev->desc_nr;
+ raid_disk = desc->raid_disk;
+ disk = conf->disks + raid_disk;
+
+ if (disk_faulty(desc)) {
+ printk(KERN_ERR "raid5: disabled device %s (errors detected)\n", partition_name(rdev->dev));
+ if (!rdev->faulty) {
+ MD_BUG();
+ goto abort;
+ }
+ disk->number = desc->number;
+ disk->raid_disk = raid_disk;
+ disk->dev = rdev->dev;
+ disk->bdev = rdev->bdev;
+
+ disk->operational = 0;
+ disk->write_only = 0;
+ disk->spare = 0;
+ disk->used_slot = 1;
+ continue;
+ }
+ if (disk_active(desc)) {
+ if (!disk_sync(desc)) {
+ printk(KERN_ERR "raid5: disabled device %s (not in sync)\n", partition_name(rdev->dev));
+ MD_BUG();
+ goto abort;
+ }
+ if (raid_disk > sb->raid_disks) {
+ printk(KERN_ERR "raid5: disabled device %s (inconsistent descriptor)\n", partition_name(rdev->dev));
+ continue;
+ }
+ if (disk->operational) {
+ printk(KERN_ERR "raid5: disabled device %s (device %d already operational)\n", partition_name(rdev->dev), raid_disk);
+ continue;
+ }
+ printk(KERN_INFO "raid5: device %s operational as raid disk %d\n", partition_name(rdev->dev), raid_disk);
+
+ disk->number = desc->number;
+ disk->raid_disk = raid_disk;
+ disk->dev = rdev->dev;
+ disk->bdev = rdev->bdev;
+ disk->operational = 1;
+ disk->used_slot = 1;
+
+ conf->working_disks++;
+ } else {
+ /*
+ * Must be a spare disk ..
+ */
+ printk(KERN_INFO "raid5: spare disk %s\n", partition_name(rdev->dev));
+ disk->number = desc->number;
+ disk->raid_disk = raid_disk;
+ disk->dev = rdev->dev;
+ disk->bdev = rdev->bdev;
+
+ disk->operational = 0;
+ disk->write_only = 0;
+ disk->spare = 1;
+ disk->used_slot = 1;
+ }
+ }
+
+ for (i = 0; i < MD_SB_DISKS; i++) {
+ desc = sb->disks + i;
+ raid_disk = desc->raid_disk;
+ disk = conf->disks + raid_disk;
+
+ if (disk_faulty(desc) && (raid_disk < sb->raid_disks) &&
+ !conf->disks[raid_disk].used_slot) {
+
+ disk->number = desc->number;
+ disk->raid_disk = raid_disk;
+ disk->dev = MKDEV(0,0);
+
+ disk->operational = 0;
+ disk->write_only = 0;
+ disk->spare = 0;
+ disk->used_slot = 1;
+ }
+ }
+
+ conf->raid_disks = sb->raid_disks;
+ /*
+ * 0 for a fully functional array, 1 for a degraded array.
+ */
+ conf->failed_disks = conf->raid_disks - conf->working_disks;
+ conf->mddev = mddev;
+ conf->chunk_size = sb->chunk_size;
+ conf->level = sb->level;
+ conf->algorithm = sb->layout;
+ conf->max_nr_stripes = NR_STRIPES;
+
+#if 0
+ for (i = 0; i < conf->raid_disks; i++) {
+ if (!conf->disks[i].used_slot) {
+ MD_BUG();
+ goto abort;
+ }
+ }
+#endif
+ if (!conf->chunk_size || conf->chunk_size % 4) {
+ printk(KERN_ERR "raid5: invalid chunk size %d for md%d\n", conf->chunk_size, mdidx(mddev));
+ goto abort;
+ }
+ if (conf->algorithm > ALGORITHM_RIGHT_SYMMETRIC) {
+ printk(KERN_ERR "raid5: unsupported parity algorithm %d for md%d\n", conf->algorithm, mdidx(mddev));
+ goto abort;
+ }
+ if (conf->failed_disks > 1) {
+ printk(KERN_ERR "raid5: not enough operational devices for md%d (%d/%d failed)\n", mdidx(mddev), conf->failed_disks, conf->raid_disks);
+ goto abort;
+ }
+
+ if (conf->working_disks != sb->raid_disks) {
+ printk(KERN_ALERT "raid5: md%d, not all disks are operational -- trying to recover array\n", mdidx(mddev));
+ start_recovery = 1;
+ }
+
+ {
+ const char * name = "raid5d";
+
+ conf->thread = md_register_thread(raid5d, conf, name);
+ if (!conf->thread) {
+ printk(KERN_ERR "raid5: couldn't allocate thread for md%d\n", mdidx(mddev));
+ goto abort;
+ }
+ }
+
+ memory = conf->max_nr_stripes * (sizeof(struct stripe_head) +
+ conf->raid_disks * ((sizeof(struct buffer_head) + PAGE_SIZE))) / 1024;
+ if (grow_stripes(conf, conf->max_nr_stripes)) {
+ printk(KERN_ERR "raid5: couldn't allocate %dkB for buffers\n", memory);
+ shrink_stripes(conf);
+ goto abort;
+ } else
+ printk(KERN_INFO "raid5: allocated %dkB for md%d\n", memory, mdidx(mddev));
+
+ /*
+ * Regenerate the "device is in sync with the raid set" bit for
+ * each device.
+ */
+ for (i = 0; i < MD_SB_DISKS ; i++) {
+ mark_disk_nonsync(sb->disks + i);
+ for (j = 0; j < sb->raid_disks; j++) {
+ if (!conf->disks[j].operational)
+ continue;
+ if (sb->disks[i].number == conf->disks[j].number)
+ mark_disk_sync(sb->disks + i);
+ }
+ }
+ sb->active_disks = conf->working_disks;
+
+ if (sb->active_disks == sb->raid_disks)
+ printk("raid5: raid level %d set md%d active with %d out of %d devices, algorithm %d\n", conf->level, mdidx(mddev), sb->active_disks, sb->raid_disks, conf->algorithm);
+ else
+ printk(KERN_ALERT "raid5: raid level %d set md%d active with %d out of %d devices, algorithm %d\n", conf->level, mdidx(mddev), sb->active_disks, sb->raid_disks, conf->algorithm);
+
+ if (!start_recovery && !(sb->state & (1 << MD_SB_CLEAN))) {
+ const char * name = "raid5syncd";
+
+ conf->resync_thread = md_register_thread(raid5syncd, conf,name);
+ if (!conf->resync_thread) {
+ printk(KERN_ERR "raid5: couldn't allocate thread for md%d\n", mdidx(mddev));
+ goto abort;
+ }
+
+ printk("raid5: raid set md%d not clean; reconstructing parity\n", mdidx(mddev));
+ conf->resync_parity = 1;
+ md_wakeup_thread(conf->resync_thread);
+ }
+
+ print_raid5_conf(conf);
+ if (start_recovery)
+ md_recover_arrays();
+ print_raid5_conf(conf);
+
+ /* Ok, everything is just fine now */
+ return (0);
+abort:
+ if (conf) {
+ print_raid5_conf(conf);
+ if (conf->stripe_hashtbl)
+ free_pages((unsigned long) conf->stripe_hashtbl,
+ HASH_PAGES_ORDER);
+ kfree(conf);
+ }
+ mddev->private = NULL;
+ printk(KERN_ALERT "raid5: failed to run raid set md%d\n", mdidx(mddev));
+ MOD_DEC_USE_COUNT;
+ return -EIO;
+}
+
+static int stop_resync (mddev_t *mddev)
+{
+ raid5_conf_t *conf = mddev_to_conf(mddev);
+ mdk_thread_t *thread = conf->resync_thread;
+
+ if (thread) {
+ if (conf->resync_parity) {
+ conf->resync_parity = 2;
+ md_interrupt_thread(thread);
+ printk(KERN_INFO "raid5: parity resync was not fully finished, restarting next time.\n");
+ return 1;
+ }
+ return 0;
+ }
+ return 0;
+}
+
+static int restart_resync (mddev_t *mddev)
+{
+ raid5_conf_t *conf = mddev_to_conf(mddev);
+
+ if (conf->resync_parity) {
+ if (!conf->resync_thread) {
+ MD_BUG();
+ return 0;
+ }
+ printk("raid5: waking up raid5resync.\n");
+ conf->resync_parity = 1;
+ md_wakeup_thread(conf->resync_thread);
+ return 1;
+ } else
+ printk("raid5: no restart-resync needed.\n");
+ return 0;
+}
+
+
+static int stop (mddev_t *mddev)
+{
+ raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
+
+ if (conf->resync_thread)
+ md_unregister_thread(conf->resync_thread);
+ md_unregister_thread(conf->thread);
+ shrink_stripes(conf);
+ free_pages((unsigned long) conf->stripe_hashtbl, HASH_PAGES_ORDER);
+ kfree(conf);
+ mddev->private = NULL;
+ MOD_DEC_USE_COUNT;
+ return 0;
+}
+
+#if RAID5_DEBUG
+static void print_sh (struct stripe_head *sh)
+{
+ int i;
+
+ printk("sh %lu, pd_idx %d, state %ld.\n", sh->sector, sh->pd_idx, sh->state);
+ printk("sh %lu, count %d.\n", sh->sector, atomic_read(&sh->count));
+ printk("sh %lu, ", sh->sector);
+ for (i = 0; i < sh->raid_conf->raid_disks; i++) {
+ printk("(cache%d: %p %ld) ", i, sh->dev[i].page, sh->dev[i].flags);
+ }
+ printk("\n");
+}
+
+static void printall (raid5_conf_t *conf)
+{
+ struct stripe_head *sh;
+ int i;
+
+ spin_lock_irq(&conf->device_lock);
+ for (i = 0; i < NR_HASH; i++) {
+ sh = conf->stripe_hashtbl[i];
+ for (; sh; sh = sh->hash_next) {
+ if (sh->raid_conf != conf)
+ continue;
+ print_sh(sh);
+ }
+ }
+ spin_unlock_irq(&conf->device_lock);
+
+ PRINTK("--- raid5d inactive\n");
+}
+#endif
+
+static void status (struct seq_file *seq, mddev_t *mddev)
+{
+ raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
+ mdp_super_t *sb = mddev->sb;
+ int i;
+
+ seq_printf (seq, " level %d, %dk chunk, algorithm %d", sb->level, sb->chunk_size >> 10, sb->layout);
+ seq_printf (seq, " [%d/%d] [", conf->raid_disks, conf->working_disks);
+ for (i = 0; i < conf->raid_disks; i++)
+ seq_printf (seq, "%s", conf->disks[i].operational ? "U" : "_");
+ seq_printf (seq, "]");
+#if RAID5_DEBUG
+#define D(x) \
+ seq_printf (seq, "<"#x":%d>", atomic_read(&conf->x))
+ printall(conf);
+#endif
+
+}
+
+static void print_raid5_conf (raid5_conf_t *conf)
+{
+ int i;
+ struct disk_info *tmp;
+
+ printk("RAID5 conf printout:\n");
+ if (!conf) {
+ printk("(conf==NULL)\n");
+ return;
+ }
+ printk(" --- rd:%d wd:%d fd:%d\n", conf->raid_disks,
+ conf->working_disks, conf->failed_disks);
+
+#if RAID5_DEBUG
+ for (i = 0; i < MD_SB_DISKS; i++) {
+#else
+ for (i = 0; i < conf->working_disks+conf->failed_disks; i++) {
+#endif
+ tmp = conf->disks + i;
+ printk(" disk %d, s:%d, o:%d, n:%d rd:%d us:%d dev:%s\n",
+ i, tmp->spare,tmp->operational,
+ tmp->number,tmp->raid_disk,tmp->used_slot,
+ partition_name(tmp->dev));
+ }
+}
+
+static int diskop(mddev_t *mddev, mdp_disk_t **d, int state)
+{
+ int err = 0;
+ int i, failed_disk=-1, spare_disk=-1, removed_disk=-1, added_disk=-1;
+ raid5_conf_t *conf = mddev->private;
+ struct disk_info *tmp, *sdisk, *fdisk, *rdisk, *adisk;
+ mdp_super_t *sb = mddev->sb;
+ mdp_disk_t *failed_desc, *spare_desc, *added_desc;
+ mdk_rdev_t *spare_rdev, *failed_rdev;
+
+ print_raid5_conf(conf);
+ spin_lock_irq(&conf->device_lock);
+ /*
+ * find the disk ...
+ */
+ switch (state) {
+
+ case DISKOP_SPARE_ACTIVE:
+
+ /*
+ * Find the failed disk within the RAID5 configuration ...
+ * (this can only be in the first conf->raid_disks part)
+ */
+ for (i = 0; i < conf->raid_disks; i++) {
+ tmp = conf->disks + i;
+ if ((!tmp->operational && !tmp->spare) ||
+ !tmp->used_slot) {
+ failed_disk = i;
+ break;
+ }
+ }
+ /*
+ * When we activate a spare disk we _must_ have a disk in
+ * the lower (active) part of the array to replace.
+ */
+ if ((failed_disk == -1) || (failed_disk >= conf->raid_disks)) {
+ MD_BUG();
+ err = 1;
+ goto abort;
+ }
+ /* fall through */
+
+ case DISKOP_SPARE_WRITE:
+ case DISKOP_SPARE_INACTIVE:
+
+ /*
+ * Find the spare disk ... (can only be in the 'high'
+ * area of the array)
+ */
+ for (i = conf->raid_disks; i < MD_SB_DISKS; i++) {
+ tmp = conf->disks + i;
+ if (tmp->spare && tmp->number == (*d)->number) {
+ spare_disk = i;
+ break;
+ }
+ }
+ if (spare_disk == -1) {
+ MD_BUG();
+ err = 1;
+ goto abort;
+ }
+ break;
+
+ case DISKOP_HOT_REMOVE_DISK:
+
+ for (i = 0; i < MD_SB_DISKS; i++) {
+ tmp = conf->disks + i;
+ if (tmp->used_slot && (tmp->number == (*d)->number)) {
+ if (tmp->operational) {
+ err = -EBUSY;
+ goto abort;
+ }
+ removed_disk = i;
+ break;
+ }
+ }
+ if (removed_disk == -1) {
+ MD_BUG();
+ err = 1;
+ goto abort;
+ }
+ break;
+
+ case DISKOP_HOT_ADD_DISK:
+
+ for (i = conf->raid_disks; i < MD_SB_DISKS; i++) {
+ tmp = conf->disks + i;
+ if (!tmp->used_slot) {
+ added_disk = i;
+ break;
+ }
+ }
+ if (added_disk == -1) {
+ MD_BUG();
+ err = 1;
+ goto abort;
+ }
+ break;
+ }
+
+ switch (state) {
+ /*
+ * Switch the spare disk to write-only mode:
+ */
+ case DISKOP_SPARE_WRITE:
+ if (conf->spare) {
+ MD_BUG();
+ err = 1;
+ goto abort;
+ }
+ sdisk = conf->disks + spare_disk;
+ sdisk->operational = 1;
+ sdisk->write_only = 1;
+ conf->spare = sdisk;
+ break;
+ /*
+ * Deactivate a spare disk:
+ */
+ case DISKOP_SPARE_INACTIVE:
+ sdisk = conf->disks + spare_disk;
+ sdisk->operational = 0;
+ sdisk->write_only = 0;
+ /*
+ * Was the spare being resynced?
+ */
+ if (conf->spare == sdisk)
+ conf->spare = NULL;
+ break;
+ /*
+ * Activate (mark read-write) the (now sync) spare disk,
+	 * which means we switch its 'raid position' (->raid_disk)
+ * with the failed disk. (only the first 'conf->raid_disks'
+ * slots are used for 'real' disks and we must preserve this
+ * property)
+ */
+ case DISKOP_SPARE_ACTIVE:
+ if (!conf->spare) {
+ MD_BUG();
+ err = 1;
+ goto abort;
+ }
+ sdisk = conf->disks + spare_disk;
+ fdisk = conf->disks + failed_disk;
+
+ spare_desc = &sb->disks[sdisk->number];
+ failed_desc = &sb->disks[fdisk->number];
+
+ if (spare_desc != *d) {
+ MD_BUG();
+ err = 1;
+ goto abort;
+ }
+
+ if (spare_desc->raid_disk != sdisk->raid_disk) {
+ MD_BUG();
+ err = 1;
+ goto abort;
+ }
+
+ if (sdisk->raid_disk != spare_disk) {
+ MD_BUG();
+ err = 1;
+ goto abort;
+ }
+
+ if (failed_desc->raid_disk != fdisk->raid_disk) {
+ MD_BUG();
+ err = 1;
+ goto abort;
+ }
+
+ if (fdisk->raid_disk != failed_disk) {
+ MD_BUG();
+ err = 1;
+ goto abort;
+ }
+
+ /*
+ * do the switch finally
+ */
+ spare_rdev = find_rdev_nr(mddev, spare_desc->number);
+ failed_rdev = find_rdev_nr(mddev, failed_desc->number);
+
+ /* There must be a spare_rdev, but there may not be a
+ * failed_rdev. That slot might be empty...
+ */
+ spare_rdev->desc_nr = failed_desc->number;
+ if (failed_rdev)
+ failed_rdev->desc_nr = spare_desc->number;
+
+ xchg_values(*spare_desc, *failed_desc);
+ xchg_values(*fdisk, *sdisk);
+
+ /*
+ * (careful, 'failed' and 'spare' are switched from now on)
+ *
+ * we want to preserve linear numbering and we want to
+ * give the proper raid_disk number to the now activated
+ * disk. (this means we switch back these values)
+ */
+
+ xchg_values(spare_desc->raid_disk, failed_desc->raid_disk);
+ xchg_values(sdisk->raid_disk, fdisk->raid_disk);
+ xchg_values(spare_desc->number, failed_desc->number);
+ xchg_values(sdisk->number, fdisk->number);
+
+ *d = failed_desc;
+
+ if (sdisk->dev == MKDEV(0,0))
+ sdisk->used_slot = 0;
+
+ /*
+ * this really activates the spare.
+ */
+ fdisk->spare = 0;
+ fdisk->write_only = 0;
+
+ /*
+ * if we activate a spare, we definitely replace a
+ * non-operational disk slot in the 'low' area of
+ * the disk array.
+ */
+ conf->failed_disks--;
+ conf->working_disks++;
+ conf->spare = NULL;
+
+ break;
+
+ case DISKOP_HOT_REMOVE_DISK:
+ rdisk = conf->disks + removed_disk;
+
+ if (rdisk->spare && (removed_disk < conf->raid_disks)) {
+ MD_BUG();
+ err = 1;
+ goto abort;
+ }
+ rdisk->dev = MKDEV(0,0);
+ rdisk->used_slot = 0;
+
+ break;
+
+ case DISKOP_HOT_ADD_DISK:
+ adisk = conf->disks + added_disk;
+ added_desc = *d;
+
+ if (added_disk != added_desc->number) {
+ MD_BUG();
+ err = 1;
+ goto abort;
+ }
+
+ adisk->number = added_desc->number;
+ adisk->raid_disk = added_desc->raid_disk;
+ adisk->dev = MKDEV(added_desc->major,added_desc->minor);
+
+ adisk->operational = 0;
+ adisk->write_only = 0;
+ adisk->spare = 1;
+ adisk->used_slot = 1;
+
+
+ break;
+
+ default:
+ MD_BUG();
+ err = 1;
+ goto abort;
+ }
+abort:
+ spin_unlock_irq(&conf->device_lock);
+ print_raid5_conf(conf);
+ return err;
+}
+
+static mdk_personality_t raid5_personality=
+{
+ name: "raid5",
+ make_request: make_request,
+ run: run,
+ stop: stop,
+ status: status,
+ error_handler: error,
+ diskop: diskop,
+ stop_resync: stop_resync,
+ restart_resync: restart_resync,
+ sync_request: sync_request
+};
+
+static int __init raid5_init (void)
+{
+ return register_md_personality (RAID5, &raid5_personality);
+}
+
+static void raid5_exit (void)
+{
+ unregister_md_personality (RAID5);
+}
+
+module_init(raid5_init);
+module_exit(raid5_exit);
+MODULE_LICENSE("GPL");
diff --git a/tests/linux/raid5/patch b/tests/linux/raid5/patch
new file mode 100644
index 0000000..d149229
--- /dev/null
+++ b/tests/linux/raid5/patch
@@ -0,0 +1,962 @@
+***************
+*** 142,188 ****
+
+ static void shrink_buffers(struct stripe_head *sh, int num)
+ {
+- struct buffer_head *bh;
+ int i;
+
+ for (i=0; i<num ; i++) {
+- bh = sh->bh_cache[i];
+- if (!bh)
+- return;
+- sh->bh_cache[i] = NULL;
+- free_page((unsigned long) bh->b_data);
+- kfree(bh);
+ }
+ }
+
+- static int grow_buffers(struct stripe_head *sh, int num, int b_size, int priority)
+ {
+- struct buffer_head *bh;
+ int i;
+
+ for (i=0; i<num; i++) {
+ struct page *page;
+- bh = kmalloc(sizeof(struct buffer_head), priority);
+- if (!bh)
+- return 1;
+- memset(bh, 0, sizeof (struct buffer_head));
+- if ((page = alloc_page(priority)))
+- bh->b_data = page_address(page);
+- else {
+- kfree(bh);
+ return 1;
+ }
+- atomic_set(&bh->b_count, 0);
+- bh->b_page = page;
+- sh->bh_cache[i] = bh;
+-
+ }
+ return 0;
+ }
+
+- static struct buffer_head *raid5_build_block (struct stripe_head *sh, int i);
+
+- static inline void init_stripe(struct stripe_head *sh, unsigned long sector)
+ {
+ raid5_conf_t *conf = sh->raid_conf;
+ int disks = conf->raid_disks, i;
+--- 142,177 ----
+
+ static void shrink_buffers(struct stripe_head *sh, int num)
+ {
++ struct page *p;
+ int i;
+
+ for (i=0; i<num ; i++) {
++ p = sh->dev[i].page;
++ if (!p)
++ continue;
++ sh->dev[i].page = NULL;
++ page_cache_release(p);
+ }
+ }
+
++ static int grow_buffers(struct stripe_head *sh, int num)
+ {
+ int i;
+
+ for (i=0; i<num; i++) {
+ struct page *page;
++
++ if (!(page = alloc_page(GFP_KERNEL))) {
+ return 1;
+ }
++ sh->dev[i].page = page;
+ }
+ return 0;
+ }
+
++ static void raid5_build_block (struct stripe_head *sh, int i);
+
++ static inline void init_stripe(struct stripe_head *sh, unsigned long sector, int pd_idx)
+ {
+ raid5_conf_t *conf = sh->raid_conf;
+ int disks = conf->raid_disks, i;
+***************
+*** 198,237 ****
+ remove_hash(sh);
+
+ sh->sector = sector;
+- sh->size = conf->buffer_size;
+ sh->state = 0;
+
+ for (i=disks; i--; ) {
+- if (sh->bh_read[i] || sh->bh_write[i] || sh->bh_written[i] ||
+- buffer_locked(sh->bh_cache[i])) {
+ printk("sector=%lx i=%d %p %p %p %d\n",
+- sh->sector, i, sh->bh_read[i],
+- sh->bh_write[i], sh->bh_written[i],
+- buffer_locked(sh->bh_cache[i]));
+ BUG();
+ }
+- clear_buffer_uptodate(sh->bh_cache[i]);
+ raid5_build_block(sh, i);
+ }
+ insert_hash(conf, sh);
+ }
+
+- /* the buffer size has changed, so unhash all stripes
+- * as active stripes complete, they will go onto inactive list
+- */
+- static void shrink_stripe_cache(raid5_conf_t *conf)
+- {
+- int i;
+- CHECK_DEVLOCK();
+- if (atomic_read(&conf->active_stripes))
+- BUG();
+- for (i=0; i < NR_HASH; i++) {
+- struct stripe_head *sh;
+- while ((sh = conf->stripe_hashtbl[i]))
+- remove_hash(sh);
+- }
+- }
+-
+ static struct stripe_head *__find_stripe(raid5_conf_t *conf, unsigned long sector)
+ {
+ struct stripe_head *sh;
+--- 187,212 ----
+ remove_hash(sh);
+
+ sh->sector = sector;
++ sh->pd_idx = pd_idx;
+ sh->state = 0;
+
+ for (i=disks; i--; ) {
++ struct r5dev *dev = &sh->dev[i];
++
++ if (dev->toread || dev->towrite || dev->written ||
++ test_bit(R5_LOCKED, &dev->flags)) {
+ printk("sector=%lx i=%d %p %p %p %d\n",
++ sh->sector, i, dev->toread,
++ dev->towrite, dev->written,
++ test_bit(R5_LOCKED, &dev->flags));
+ BUG();
+ }
++ dev->flags = 0;
+ raid5_build_block(sh, i);
+ }
+ insert_hash(conf, sh);
+ }
+
+ static struct stripe_head *__find_stripe(raid5_conf_t *conf, unsigned long sector)
+ {
+ struct stripe_head *sh;
+***************
+*** 410,447 ****
+ } else
+ buffer = NULL;
+ spin_unlock_irqrestore(&conf->device_lock, flags);
+- if (sh->bh_page[i]==NULL)
+ set_buffer_uptodate(bh);
+ if (buffer) {
+ if (buffer->b_page != bh->b_page)
+ memcpy(buffer->b_data, bh->b_data, bh->b_size);
+ buffer->b_end_io(buffer, 1);
+ }
+ } else {
+- md_error(conf->mddev, bh->b_bdev);
+- clear_buffer_uptodate(bh);
+ }
+ /* must restore b_page before unlocking buffer... */
+- if (sh->bh_page[i]) {
+ bh->b_page = sh->bh_page[i];
+ bh->b_data = page_address(bh->b_page);
+- sh->bh_page[i] = NULL;
+ clear_buffer_uptodate(bh);
+ }
+- clear_buffer_locked(bh);
+ set_bit(STRIPE_HANDLE, &sh->state);
+ release_stripe(sh);
+ }
+
+- static void raid5_end_write_request (struct buffer_head *bh, int uptodate)
+ {
+- struct stripe_head *sh = bh->b_private;
+ raid5_conf_t *conf = sh->raid_conf;
+ int disks = conf->raid_disks, i;
+ unsigned long flags;
+
+ for (i=0 ; i<disks; i++)
+- if (bh == sh->bh_cache[i])
+ break;
+
+ PRINTK("end_write_request %lu/%d, count %d, uptodate: %d.\n", sh->sector, i, atomic_read(&sh->count), uptodate);
+--- 361,403 ----
+ } else
+ buffer = NULL;
+ spin_unlock_irqrestore(&conf->device_lock, flags);
++ if (sh->bh_page[i]==bh->b_page)
+ set_buffer_uptodate(bh);
+ if (buffer) {
+ if (buffer->b_page != bh->b_page)
+ memcpy(buffer->b_data, bh->b_data, bh->b_size);
+ buffer->b_end_io(buffer, 1);
+ }
++ #else
++ set_bit(R5_UPTODATE, &sh->dev[i].flags);
++ #endif
+ } else {
++ md_error(conf->mddev, bi->bi_bdev);
++ clear_bit(R5_UPTODATE, &sh->dev[i].flags);
+ }
++ #if 0
+ /* must restore b_page before unlocking buffer... */
++ if (sh->bh_page[i] != bh->b_page) {
+ bh->b_page = sh->bh_page[i];
+ bh->b_data = page_address(bh->b_page);
+ clear_buffer_uptodate(bh);
+ }
++ #endif
++ clear_bit(R5_LOCKED, &sh->dev[i].flags);
+ set_bit(STRIPE_HANDLE, &sh->state);
+ release_stripe(sh);
+ }
+
++ static void raid5_end_write_request (struct bio *bi)
+ {
++ struct stripe_head *sh = bi->bi_private;
+ raid5_conf_t *conf = sh->raid_conf;
+ int disks = conf->raid_disks, i;
+ unsigned long flags;
++ int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags);
+
+ for (i=0 ; i<disks; i++)
++ if (bi == &sh->dev[i].req)
+ break;
+
+ PRINTK("end_write_request %lu/%d, count %d, uptodate: %d.\n", sh->sector, i, atomic_read(&sh->count), uptodate);
+***************
+*** 452,480 ****
+
+ spin_lock_irqsave(&conf->device_lock, flags);
+ if (!uptodate)
+- md_error(conf->mddev, bh->b_bdev);
+- clear_buffer_locked(bh);
+ set_bit(STRIPE_HANDLE, &sh->state);
+ __release_stripe(conf, sh);
+ spin_unlock_irqrestore(&conf->device_lock, flags);
+ }
+-
+
+
+- static struct buffer_head *raid5_build_block (struct stripe_head *sh, int i)
+ {
+ raid5_conf_t *conf = sh->raid_conf;
+- struct buffer_head *bh = sh->bh_cache[i];
+- unsigned long block = sh->sector / (sh->size >> 9);
+
+- init_buffer(bh, raid5_end_read_request, sh);
+- bh->b_dev = conf->disks[i].dev;
+- /* FIXME - later we will need bdev here */
+- bh->b_blocknr = block;
+-
+- bh->b_state = (1 << BH_Req) | (1 << BH_Mapped);
+- bh->b_size = sh->size;
+- return bh;
+ }
+
+ static int error (mddev_t *mddev, kdev_t dev)
+--- 408,443 ----
+
+ spin_lock_irqsave(&conf->device_lock, flags);
+ if (!uptodate)
++ md_error(conf->mddev, bi->bi_bdev);
++
++ clear_bit(R5_LOCKED, &sh->dev[i].flags);
+ set_bit(STRIPE_HANDLE, &sh->state);
+ __release_stripe(conf, sh);
+ spin_unlock_irqrestore(&conf->device_lock, flags);
+ }
+
+
++ static unsigned long compute_blocknr(struct stripe_head *sh, int i);
++
++ static void raid5_build_block (struct stripe_head *sh, int i)
+ {
+ raid5_conf_t *conf = sh->raid_conf;
++ struct r5dev *dev = &sh->dev[i];
+
++ bio_init(&dev->req);
++ dev->req.bi_io_vec = &dev->vec;
++ dev->req.bi_vcnt++;
++ dev->vec.bv_page = dev->page;
++ dev->vec.bv_len = STRIPE_SIZE;
++ dev->vec.bv_offset = 0;
++
++ dev->req.bi_bdev = conf->disks[i].bdev;
++ dev->req.bi_sector = sh->sector;
++ dev->req.bi_private = sh;
++
++ dev->flags = 0;
++ if (i != sh->pd_idx)
++ dev->sector = compute_blocknr(sh, i);
+ }
+
+ static int error (mddev_t *mddev, kdev_t dev)
+***************
+*** 661,748 ****
+ {
+ raid5_conf_t *conf = sh->raid_conf;
+ int i, count, disks = conf->raid_disks;
+- struct buffer_head *bh_ptr[MAX_XOR_BLOCKS], *bh;
+
+ PRINTK("compute_block, stripe %lu, idx %d\n", sh->sector, dd_idx);
+
+-
+- memset(sh->bh_cache[dd_idx]->b_data, 0, sh->size);
+- bh_ptr[0] = sh->bh_cache[dd_idx];
+ count = 1;
+ for (i = disks ; i--; ) {
+ if (i == dd_idx)
+ continue;
+- bh = sh->bh_cache[i];
+- if (buffer_uptodate(bh))
+- bh_ptr[count++] = bh;
+ else
+ printk("compute_block() %d, stripe %lu, %d not present\n", dd_idx, sh->sector, i);
+
+ check_xor();
+ }
+ if (count != 1)
+- xor_block(count, bh_ptr);
+- set_buffer_uptodate(sh->bh_cache[dd_idx]);
+ }
+
+ static void compute_parity(struct stripe_head *sh, int method)
+ {
+ raid5_conf_t *conf = sh->raid_conf;
+ int i, pd_idx = sh->pd_idx, disks = conf->raid_disks, count;
+- struct buffer_head *bh_ptr[MAX_XOR_BLOCKS];
+- struct buffer_head *chosen[MD_SB_DISKS];
+
+ PRINTK("compute_parity, stripe %lu, method %d\n", sh->sector, method);
+ memset(chosen, 0, sizeof(chosen));
+
+ count = 1;
+- bh_ptr[0] = sh->bh_cache[pd_idx];
+ switch(method) {
+ case READ_MODIFY_WRITE:
+- if (!buffer_uptodate(sh->bh_cache[pd_idx]))
+ BUG();
+ for (i=disks ; i-- ;) {
+ if (i==pd_idx)
+ continue;
+- if (sh->bh_write[i] &&
+- buffer_uptodate(sh->bh_cache[i])) {
+- bh_ptr[count++] = sh->bh_cache[i];
+- chosen[i] = sh->bh_write[i];
+- sh->bh_write[i] = sh->bh_write[i]->b_reqnext;
+- chosen[i]->b_reqnext = sh->bh_written[i];
+- sh->bh_written[i] = chosen[i];
+ check_xor();
+ }
+ }
+ break;
+ case RECONSTRUCT_WRITE:
+- memset(sh->bh_cache[pd_idx]->b_data, 0, sh->size);
+ for (i= disks; i-- ;)
+- if (i!=pd_idx && sh->bh_write[i]) {
+- chosen[i] = sh->bh_write[i];
+- sh->bh_write[i] = sh->bh_write[i]->b_reqnext;
+- chosen[i]->b_reqnext = sh->bh_written[i];
+- sh->bh_written[i] = chosen[i];
+ }
+ break;
+ case CHECK_PARITY:
+ break;
+ }
+ if (count>1) {
+- xor_block(count, bh_ptr);
+ count = 1;
+ }
+
+ for (i = disks; i--;)
+ if (chosen[i]) {
+- struct buffer_head *bh = sh->bh_cache[i];
+- char *bdata;
+- bdata = bh_kmap(chosen[i]);
+- memcpy(bh->b_data,
+- bdata,sh->size);
+- bh_kunmap(chosen[i]);
+- set_buffer_locked(bh);
+- set_buffer_uptodate(bh);
+ }
+
+ switch(method) {
+--- 674,757 ----
+ {
+ raid5_conf_t *conf = sh->raid_conf;
+ int i, count, disks = conf->raid_disks;
++ void *ptr[MAX_XOR_BLOCKS], *p;
+
+ PRINTK("compute_block, stripe %lu, idx %d\n", sh->sector, dd_idx);
+
++ ptr[0] = page_address(sh->dev[dd_idx].page);
++ memset(ptr[0], 0, STRIPE_SIZE);
+ count = 1;
+ for (i = disks ; i--; ) {
+ if (i == dd_idx)
+ continue;
++ p = page_address(sh->dev[i].page);
++ if (test_bit(R5_UPTODATE, &sh->dev[i].flags))
++ ptr[count++] = p;
+ else
+ printk("compute_block() %d, stripe %lu, %d not present\n", dd_idx, sh->sector, i);
+
+ check_xor();
+ }
+ if (count != 1)
++ xor_block(count, STRIPE_SIZE, ptr);
++ set_bit(R5_UPTODATE, &sh->dev[i].flags);
+ }
+
+ static void compute_parity(struct stripe_head *sh, int method)
+ {
+ raid5_conf_t *conf = sh->raid_conf;
+ int i, pd_idx = sh->pd_idx, disks = conf->raid_disks, count;
++ void *ptr[MAX_XOR_BLOCKS];
++ struct bio *chosen[MD_SB_DISKS];
+
+ PRINTK("compute_parity, stripe %lu, method %d\n", sh->sector, method);
+ memset(chosen, 0, sizeof(chosen));
+
+ count = 1;
++ ptr[0] = page_address(sh->dev[pd_idx].page);
+ switch(method) {
+ case READ_MODIFY_WRITE:
++ if (!test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags))
+ BUG();
+ for (i=disks ; i-- ;) {
+ if (i==pd_idx)
+ continue;
++ if (sh->dev[i].towrite &&
++ test_bit(R5_UPTODATE, &sh->dev[i].flags)) {
++ ptr[count++] = page_address(sh->dev[i].page);
++ chosen[i] = sh->dev[i].towrite;
++ sh->dev[i].towrite = NULL;
++ if (sh->dev[i].written) BUG();
++ sh->dev[i].written = chosen[i];
+ check_xor();
+ }
+ }
+ break;
+ case RECONSTRUCT_WRITE:
++ memset(ptr[0], 0, STRIPE_SIZE);
+ for (i= disks; i-- ;)
++ if (i!=pd_idx && sh->dev[i].towrite) {
++ chosen[i] = sh->dev[i].towrite;
++ sh->dev[i].towrite = NULL;
++ if (sh->dev[i].written) BUG();
++ sh->dev[i].written = chosen[i];
+ }
+ break;
+ case CHECK_PARITY:
+ break;
+ }
+ if (count>1) {
++ xor_block(count, STRIPE_SIZE, ptr);
+ count = 1;
+ }
+
+ for (i = disks; i--;)
+ if (chosen[i]) {
++ sector_t sector = sh->dev[i].sector;
++ copy_data(1, chosen[i], sh->dev[i].page, sector);
++
++ set_bit(R5_LOCKED, &sh->dev[i].flags);
++ set_bit(R5_UPTODATE, &sh->dev[i].flags);
+ }
+
+ switch(method) {
+***************
+*** 750,804 ****
+ case CHECK_PARITY:
+ for (i=disks; i--;)
+ if (i != pd_idx) {
+- bh_ptr[count++] = sh->bh_cache[i];
+ check_xor();
+ }
+ break;
+ case READ_MODIFY_WRITE:
+ for (i = disks; i--;)
+ if (chosen[i]) {
+- bh_ptr[count++] = sh->bh_cache[i];
+ check_xor();
+ }
+ }
+ if (count != 1)
+- xor_block(count, bh_ptr);
+
+ if (method != CHECK_PARITY) {
+- set_buffer_uptodate(sh->bh_cache[pd_idx]);
+- set_buffer_locked(sh->bh_cache[pd_idx]);
+ } else
+- clear_buffer_uptodate(sh->bh_cache[pd_idx]);
+ }
+
+- static void add_stripe_bh (struct stripe_head *sh, struct buffer_head *bh, int dd_idx, int rw)
+ {
+- struct buffer_head **bhp;
+ raid5_conf_t *conf = sh->raid_conf;
+
+- PRINTK("adding bh b#%lu to stripe s#%lu\n", bh->b_blocknr, sh->sector);
+
+
+ spin_lock(&sh->lock);
+ spin_lock_irq(&conf->device_lock);
+- bh->b_reqnext = NULL;
+- if (rw == READ)
+- bhp = &sh->bh_read[dd_idx];
+ else
+- bhp = &sh->bh_write[dd_idx];
+- while (*bhp) {
+- printk(KERN_NOTICE "raid5: multiple %d requests for sector %ld\n", rw, sh->sector);
+- bhp = & (*bhp)->b_reqnext;
+- }
+- *bhp = bh;
+ spin_unlock_irq(&conf->device_lock);
+ spin_unlock(&sh->lock);
+
+- PRINTK("added bh b#%lu to stripe s#%lu, disk %d.\n", bh->b_blocknr, sh->sector, dd_idx);
+- }
+-
+-
+
+
+
+ /*
+--- 759,832 ----
+ case CHECK_PARITY:
+ for (i=disks; i--;)
+ if (i != pd_idx) {
++ ptr[count++] = page_address(sh->dev[i].page);
+ check_xor();
+ }
+ break;
+ case READ_MODIFY_WRITE:
+ for (i = disks; i--;)
+ if (chosen[i]) {
++ ptr[count++] = page_address(sh->dev[i].page);
+ check_xor();
+ }
+ }
+ if (count != 1)
++ xor_block(count, STRIPE_SIZE, ptr);
+
+ if (method != CHECK_PARITY) {
++ set_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
++ set_bit(R5_LOCKED, &sh->dev[pd_idx].flags);
+ } else
++ clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
+ }
+
++ /*
++ * Each stripe/dev can have one or more bion attached.
++ * toread/towrite point to the first in a chain.
++ * The bi_next chain must be in order.
++ */
++ static void add_stripe_bio (struct stripe_head *sh, struct bio *bi, int dd_idx, int forwrite)
+ {
++ struct bio **bip;
+ raid5_conf_t *conf = sh->raid_conf;
+
++ PRINTK("adding bh b#%lu to stripe s#%lu\n", bi->bi_sector, sh->sector);
+
+
+ spin_lock(&sh->lock);
+ spin_lock_irq(&conf->device_lock);
++ if (forwrite)
++ bip = &sh->dev[dd_idx].towrite;
+ else
++ bip = &sh->dev[dd_idx].toread;
++ while (*bip && (*bip)->bi_sector < bi->bi_sector)
++ bip = & (*bip)->bi_next;
++ /* FIXME do I need to worry about overlapping bion */
++ if (*bip && bi->bi_next && (*bip) != bi->bi_next)
++ BUG();
++ if (*bip)
++ bi->bi_next = *bip;
++ *bip = bi;
++ bi->bi_phys_segments ++;
+ spin_unlock_irq(&conf->device_lock);
+ spin_unlock(&sh->lock);
+
++ if (forwrite) {
++ 		/* check if page is covered */
++ sector_t sector = sh->dev[dd_idx].sector;
++ for (bi=sh->dev[dd_idx].towrite;
++ sector < sh->dev[dd_idx].sector + STRIPE_SECTORS &&
++ bi && bi->bi_sector <= sector;
++ bi = bi->bi_next) {
++ if (bi->bi_sector + (bi->bi_size>>9) >= sector)
++ sector = bi->bi_sector + (bi->bi_size>>9);
++ }
++ if (sector >= sh->dev[dd_idx].sector + STRIPE_SECTORS)
++ set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags);
++ }
+
++ PRINTK("added bi b#%lu to stripe s#%lu, disk %d.\n", bi->bi_sector, sh->sector, dd_idx);
++ }
+
+
+ /*
+***************
+*** 955,975 ****
+ compute_block(sh, i);
+ uptodate++;
+ } else if (conf->disks[i].operational) {
+- set_buffer_locked(bh);
+ action[i] = READ+1;
+ /* if I am just reading this block and we don't have
+ a failed drive, or any pending writes then sidestep the cache */
+- if (sh->bh_page[i]) BUG();
+ if (sh->bh_read[i] && !sh->bh_read[i]->b_reqnext &&
+ ! syncing && !failed && !to_write) {
+- sh->bh_page[i] = sh->bh_cache[i]->b_page;
+ sh->bh_cache[i]->b_page = sh->bh_read[i]->b_page;
+ sh->bh_cache[i]->b_data = sh->bh_read[i]->b_data;
+ }
+ locked++;
+ PRINTK("Reading block %d (sync=%d)\n", i, syncing);
+ if (syncing)
+- md_sync_acct(conf->disks[i].dev, bh->b_size>>9);
+ }
+ }
+ }
+--- 1002,1022 ----
+ compute_block(sh, i);
+ uptodate++;
+ } else if (conf->disks[i].operational) {
++ set_bit(R5_LOCKED, &dev->flags);
+ action[i] = READ+1;
++ #if 0
+ /* if I am just reading this block and we don't have
+ a failed drive, or any pending writes then sidestep the cache */
+ if (sh->bh_read[i] && !sh->bh_read[i]->b_reqnext &&
+ ! syncing && !failed && !to_write) {
+ sh->bh_cache[i]->b_page = sh->bh_read[i]->b_page;
+ sh->bh_cache[i]->b_data = sh->bh_read[i]->b_data;
+ }
++ #endif
+ locked++;
+ PRINTK("Reading block %d (sync=%d)\n", i, syncing);
+ if (syncing)
++ md_sync_acct(conf->disks[i].dev, STRIPE_SECTORS);
+ }
+ }
+ }
+***************
+*** 1004,1017 ****
+ if (rmw < rcw && rmw > 0)
+ /* prefer read-modify-write, but need to get some data */
+ for (i=disks; i--;) {
+- bh = sh->bh_cache[i];
+- if ((sh->bh_write[i] || i == sh->pd_idx) &&
+- !buffer_locked(bh) && !buffer_uptodate(bh) &&
+ conf->disks[i].operational) {
+ if (test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
+ {
+ PRINTK("Read_old block %d for r-m-w\n", i);
+- set_buffer_locked(bh);
+ action[i] = READ+1;
+ locked++;
+ } else {
+--- 1059,1072 ----
+ if (rmw < rcw && rmw > 0)
+ /* prefer read-modify-write, but need to get some data */
+ for (i=disks; i--;) {
++ dev = &sh->dev[i];
++ if ((dev->towrite || i == sh->pd_idx) &&
++ !test_bit(R5_LOCKED, &dev->flags) && !test_bit(R5_UPTODATE, &dev->flags) &&
+ conf->disks[i].operational) {
+ if (test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
+ {
+ PRINTK("Read_old block %d for r-m-w\n", i);
++ set_bit(R5_LOCKED, &dev->flags);
+ action[i] = READ+1;
+ locked++;
+ } else {
+***************
+*** 1023,1036 ****
+ if (rcw <= rmw && rcw > 0)
+ /* want reconstruct write, but need to get some data */
+ for (i=disks; i--;) {
+- bh = sh->bh_cache[i];
+- if (!sh->bh_write[i] && i != sh->pd_idx &&
+- !buffer_locked(bh) && !buffer_uptodate(bh) &&
+ conf->disks[i].operational) {
+ if (test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
+ {
+ PRINTK("Read_old block %d for Reconstruct\n", i);
+- set_buffer_locked(bh);
+ action[i] = READ+1;
+ locked++;
+ } else {
+--- 1078,1091 ----
+ if (rcw <= rmw && rcw > 0)
+ /* want reconstruct write, but need to get some data */
+ for (i=disks; i--;) {
++ dev = &sh->dev[i];
++ if (!test_bit(R5_OVERWRITE, &dev->flags) && i != sh->pd_idx &&
++ !test_bit(R5_LOCKED, &dev->flags) && !test_bit(R5_UPTODATE, &dev->flags) &&
+ conf->disks[i].operational) {
+ if (test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
+ {
+ PRINTK("Read_old block %d for Reconstruct\n", i);
++ set_bit(R5_LOCKED, &dev->flags);
+ action[i] = READ+1;
+ locked++;
+ } else {
+***************
+*** 1093,1152 ****
+ }
+ if (uptodate != disks)
+ BUG();
+- bh = sh->bh_cache[failed_num];
+- set_buffer_locked(bh);
+ action[failed_num] = WRITE+1;
+ locked++;
+ set_bit(STRIPE_INSYNC, &sh->state);
+ if (conf->disks[failed_num].operational)
+- md_sync_acct(conf->disks[failed_num].dev, bh->b_size>>9);
+ else if ((spare=conf->spare))
+- md_sync_acct(spare->dev, bh->b_size>>9);
+
+ }
+ }
+ if (syncing && locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) {
+- md_done_sync(conf->mddev, (sh->size>>9) - sh->sync_redone,1);
+ clear_bit(STRIPE_SYNCING, &sh->state);
+ }
+
+-
+ spin_unlock(&sh->lock);
+
+- while ((bh=return_ok)) {
+- return_ok = bh->b_reqnext;
+- bh->b_reqnext = NULL;
+- bh->b_end_io(bh, 1);
+- }
+- while ((bh=return_fail)) {
+- return_fail = bh->b_reqnext;
+- bh->b_reqnext = NULL;
+- bh->b_end_io(bh, 0);
+ }
+ for (i=disks; i-- ;)
+ if (action[i]) {
+- struct buffer_head *bh = sh->bh_cache[i];
+ struct disk_info *spare = conf->spare;
+ int skip = 0;
+ if (action[i] == READ+1)
+- bh->b_end_io = raid5_end_read_request;
+ else
+- bh->b_end_io = raid5_end_write_request;
+ if (conf->disks[i].operational)
+- bh->b_dev = conf->disks[i].dev;
+ else if (spare && action[i] == WRITE+1)
+- bh->b_dev = spare->dev;
+ else skip=1;
+- /* FIXME - later we will need bdev here */
+ if (!skip) {
+ PRINTK("for %ld schedule op %d on disc %d\n", sh->sector, action[i]-1, i);
+ atomic_inc(&sh->count);
+- bh->b_rdev = bh->b_dev;
+- bh->b_rsector = bh->b_blocknr * (bh->b_size>>9);
+- generic_make_request(action[i]-1, bh);
+ } else {
+ PRINTK("skip op %d on disc %d for sector %ld\n", action[i]-1, i, sh->sector);
+- clear_buffer_locked(bh);
+ set_bit(STRIPE_HANDLE, &sh->state);
+ }
+ }
+--- 1149,1210 ----
+ }
+ if (uptodate != disks)
+ BUG();
++ dev = &sh->dev[failed_num];
++ set_bit(R5_LOCKED, &dev->flags);
+ action[failed_num] = WRITE+1;
+ locked++;
+ set_bit(STRIPE_INSYNC, &sh->state);
+ if (conf->disks[failed_num].operational)
++ md_sync_acct(conf->disks[failed_num].dev, STRIPE_SECTORS);
+ else if ((spare=conf->spare))
++ md_sync_acct(spare->dev, STRIPE_SECTORS);
+
+ }
+ }
+ if (syncing && locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) {
++ md_done_sync(conf->mddev, STRIPE_SECTORS,1);
+ clear_bit(STRIPE_SYNCING, &sh->state);
+ }
+
+ spin_unlock(&sh->lock);
+
++ while ((bi=return_bi)) {
++ return_bi = bi->bi_next;
++ bi->bi_next = NULL;
++ bi->bi_end_io(bi);
+ }
+ for (i=disks; i-- ;)
+ if (action[i]) {
++ struct bio *bi = &sh->dev[i].req;
+ struct disk_info *spare = conf->spare;
+ int skip = 0;
+ if (action[i] == READ+1)
++ bi->bi_end_io = raid5_end_read_request;
+ else
++ bi->bi_end_io = raid5_end_write_request;
+ if (conf->disks[i].operational)
++ bi->bi_bdev = conf->disks[i].bdev;
+ else if (spare && action[i] == WRITE+1)
++ bi->bi_bdev = spare->bdev;
+ else skip=1;
+ if (!skip) {
+ PRINTK("for %ld schedule op %d on disc %d\n", sh->sector, action[i]-1, i);
+ atomic_inc(&sh->count);
++ bi->bi_sector = sh->sector;
++ if (action[i] == READ+1)
++ bi->bi_rw = 0;
++ else
++ bi->bi_rw = 1;
++ bi->bi_flags = 0;
++ bi->bi_vcnt = 1;
++ bi->bi_idx = 0;
++ bi->bi_io_vec = &sh->dev[i].vec;
++ bi->bi_size = STRIPE_SIZE;
++ bi->bi_next = NULL;
++ generic_make_request(bi);
+ } else {
+ PRINTK("skip op %d on disc %d for sector %ld\n", action[i]-1, i, sh->sector);
++ clear_bit(R5_LOCKED, &dev->flags);
+ set_bit(STRIPE_HANDLE, &sh->state);
+ }
+ }
+***************
+*** 1208,1232 ****
+ read_ahead=1;
+ }
+
+- new_sector = raid5_compute_sector(bh->b_rsector,
+- raid_disks, data_disks, &dd_idx, &pd_idx, conf);
+
+- PRINTK("raid5: make_request, sector %lu\n", new_sector);
+- sh = get_active_stripe(conf, new_sector, bh->b_size, read_ahead);
+- if (sh) {
+- sh->pd_idx = pd_idx;
+
+- add_stripe_bh(sh, bh, dd_idx, rw);
+
+- raid5_plug_device(conf);
+- handle_stripe(sh);
+- release_stripe(sh);
+- } else
+- bh->b_end_io(bh, buffer_uptodate(bh));
+ return 0;
+ }
+
+- static int sync_request (mddev_t *mddev, unsigned long sector_nr)
+ {
+ raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
+ struct stripe_head *sh;
+--- 1267,1305 ----
+ read_ahead=1;
+ }
+
++ logical_sector = bi->bi_sector & ~(STRIPE_SECTORS-1);
++ last_sector = bi->bi_sector + (bi->bi_size>>9);
+
++ bi->bi_next = NULL;
++ set_bit(BIO_UPTODATE, &bi->bi_flags); /* will be cleared if error detected */
++ bi->bi_phys_segments = 1; /* over-loaded to count active stripes */
++ for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) {
++
++ new_sector = raid5_compute_sector(logical_sector,
++ raid_disks, data_disks, &dd_idx, &pd_idx, conf);
+
++ PRINTK("raid5: make_request, sector %ul logical %ul\n",
++ new_sector, logical_sector);
+
++ sh = get_active_stripe(conf, new_sector, pd_idx, read_ahead);
++ if (sh) {
++
++ add_stripe_bio(sh, bi, dd_idx, rw);
++
++ raid5_plug_device(conf);
++ handle_stripe(sh);
++ release_stripe(sh);
++ }
++ }
++ spin_lock_irq(&conf->device_lock);
++ if (--bi->bi_phys_segments == 0)
++ bi->bi_end_io(bi);
++ spin_unlock_irq(&conf->device_lock);
+ return 0;
+ }
+
++ /* FIXME go_faster isn't used */
++ static int sync_request (mddev_t *mddev, sector_t sector_nr, int go_faster)
+ {
+ raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
+ struct stripe_head *sh;
+***************
+*** 1476,1481 ****
+ disk->number = desc->number;
+ disk->raid_disk = raid_disk;
+ disk->dev = NODEV;
+
+ disk->operational = 0;
+ disk->write_only = 0;
+--- 1545,1551 ----
+ disk->number = desc->number;
+ disk->raid_disk = raid_disk;
+ disk->dev = NODEV;
++ disk->bdev = NULL;
+
+ disk->operational = 0;
+ disk->write_only = 0;
+***************
+*** 1963,1968 ****
+ goto abort;
+ }
+ rdisk->dev = NODEV;
+ rdisk->used_slot = 0;
+
+ break;
+--- 2032,2038 ----
+ goto abort;
+ }
+ rdisk->dev = NODEV;
++ rdisk->bdev = NULL;
+ rdisk->used_slot = 0;
+
+ break;
+***************
+*** 1980,1985 ****
+ adisk->number = added_desc->number;
+ adisk->raid_disk = added_desc->raid_disk;
+ adisk->dev = mk_kdev(added_desc->major,added_desc->minor);
+
+ adisk->operational = 0;
+ adisk->write_only = 0;
+--- 2050,2057 ----
+ adisk->number = added_desc->number;
+ adisk->raid_disk = added_desc->raid_disk;
+ adisk->dev = mk_kdev(added_desc->major,added_desc->minor);
++ /* it will be held open by rdev */
++ adisk->bdev = bdget(kdev_t_to_nr(adisk->dev));
+
+ adisk->operational = 0;
+ adisk->write_only = 0;
diff --git a/tests/linux/raid5build/merge b/tests/linux/raid5build/merge
new file mode 100644
index 0000000..0fe41cd
--- /dev/null
+++ b/tests/linux/raid5build/merge
@@ -0,0 +1,38 @@
+static void raid5_build_block (struct stripe_head *sh, int i)
+{
+ raid5_conf_t *conf = sh->raid_conf;
+ struct r5dev *dev = &sh->dev[i];
+
+ bio_init(&dev->req);
+ dev->req.bi_io_vec = &dev->vec;
+ dev->req.bi_vcnt++;
+ dev->vec.bv_page = dev->page;
+ dev->vec.bv_len = STRIPE_SIZE;
+ dev->vec.bv_offset = 0;
+
+<<<<<<<
+ bh->b_dev = conf->disks[i].dev;
+|||||||
+ bh->b_dev = conf->disks[i].dev;
+ /* FIXME - later we will need bdev here */
+=======
+ dev->req.bi_bdev = conf->disks[i].bdev;
+ dev->req.bi_sector = sh->sector;
+>>>>>>>
+ dev->req.bi_private = sh;
+
+ dev->flags = 0;
+ if (i != sh->pd_idx)
+<<<<<<<
+ bh->b_size = sh->size;
+ bh->b_list = BUF_LOCKED;
+ return bh;
+}
+|||||||
+ bh->b_size = sh->size;
+ return bh;
+}
+=======
+ dev->sector = compute_blocknr(sh, i);
+}
+>>>>>>>
diff --git a/tests/linux/raid5build/orig b/tests/linux/raid5build/orig
new file mode 100644
index 0000000..3738f06
--- /dev/null
+++ b/tests/linux/raid5build/orig
@@ -0,0 +1,15 @@
+static struct buffer_head *raid5_build_block (struct stripe_head *sh, int i)
+{
+ raid5_conf_t *conf = sh->raid_conf;
+ struct buffer_head *bh = sh->bh_cache[i];
+ unsigned long block = sh->sector / (sh->size >> 9);
+
+ init_buffer(bh, raid5_end_read_request, sh);
+ bh->b_dev = conf->disks[i].dev;
+ bh->b_blocknr = block;
+
+ bh->b_state = (1 << BH_Req) | (1 << BH_Mapped);
+ bh->b_size = sh->size;
+ bh->b_list = BUF_LOCKED;
+ return bh;
+}
diff --git a/tests/linux/raid5build/patch b/tests/linux/raid5build/patch
new file mode 100644
index 0000000..69cb527
--- /dev/null
+++ b/tests/linux/raid5build/patch
@@ -0,0 +1,31 @@
+@@ -1,15 +1,20 @@@
+-static struct buffer_head *raid5_build_block (struct stripe_head *sh, int i)
++static void raid5_build_block (struct stripe_head *sh, int i)
+ {
+ raid5_conf_t *conf = sh->raid_conf;
+- struct buffer_head *bh = sh->bh_cache[i];
+- unsigned long block = sh->sector / (sh->size >> 9);
++ struct r5dev *dev = &sh->dev[i];
+
+- init_buffer(bh, raid5_end_read_request, sh);
+- bh->b_dev = conf->disks[i].dev;
+- /* FIXME - later we will need bdev here */
+- bh->b_blocknr = block;
++ bio_init(&dev->req);
++ dev->req.bi_io_vec = &dev->vec;
++ dev->req.bi_vcnt++;
++ dev->vec.bv_page = dev->page;
++ dev->vec.bv_len = STRIPE_SIZE;
++ dev->vec.bv_offset = 0;
+
+- bh->b_state = (1 << BH_Req) | (1 << BH_Mapped);
+- bh->b_size = sh->size;
+- return bh;
++ dev->req.bi_bdev = conf->disks[i].bdev;
++ dev->req.bi_sector = sh->sector;
++ dev->req.bi_private = sh;
++
++ dev->flags = 0;
++ if (i != sh->pd_idx)
++ dev->sector = compute_blocknr(sh, i);
+ }
diff --git a/tests/linux/raid5line/lmerge b/tests/linux/raid5line/lmerge
new file mode 100644
index 0000000..4d8dba6
--- /dev/null
+++ b/tests/linux/raid5line/lmerge
@@ -0,0 +1,7 @@
+ clear_bit(BH_Uptodate, &sh->bh_cache[i]->b_state);
+<<<<<<<
+|||||||
+ clear_buffer_uptodate(sh->bh_cache[i]);
+=======
+ dev->flags = 0;
+>>>>>>>
diff --git a/tests/linux/raid5line/merge b/tests/linux/raid5line/merge
new file mode 100644
index 0000000..e6ffa40
--- /dev/null
+++ b/tests/linux/raid5line/merge
@@ -0,0 +1,7 @@
+<<<<<<<
+ clear_bit(BH_Uptodate, &sh->bh_cache[i]->b_state);
+|||||||
+ clear_buffer_uptodate(sh->bh_cache[i]);
+=======
+ dev->flags = 0;
+>>>>>>>
diff --git a/tests/linux/raid5line/orig b/tests/linux/raid5line/orig
new file mode 100644
index 0000000..8b28be2
--- /dev/null
+++ b/tests/linux/raid5line/orig
@@ -0,0 +1 @@
+ clear_bit(BH_Uptodate, &sh->bh_cache[i]->b_state);
diff --git a/tests/linux/raid5line/patch b/tests/linux/raid5line/patch
new file mode 100644
index 0000000..c0ebfeb
--- /dev/null
+++ b/tests/linux/raid5line/patch
@@ -0,0 +1,3 @@
+@@ -1,1 +1,1 @@
+- clear_buffer_uptodate(sh->bh_cache[i]);
++ dev->flags = 0;
diff --git a/tests/linux/raid5line/wmerge b/tests/linux/raid5line/wmerge
new file mode 100644
index 0000000..e0b1530
--- /dev/null
+++ b/tests/linux/raid5line/wmerge
@@ -0,0 +1 @@
+<<<--- clear_bit(BH_Uptodate, &||| clear_buffer_uptodate(=== dev--->>>-><<<---->b_state|||===flags = 0--->>>;
diff --git a/tests/linux/rpc_tcp_nonagle/merge b/tests/linux/rpc_tcp_nonagle/merge
new file mode 100644
index 0000000..2f9aa46
--- /dev/null
+++ b/tests/linux/rpc_tcp_nonagle/merge
@@ -0,0 +1,1528 @@
+/*
+ * linux/net/sunrpc/svcsock.c
+ *
+ * These are the RPC server socket internals.
+ *
+ * The server scheduling algorithm does not always distribute the load
+ * evenly when servicing a single client. May need to modify the
+ * svc_sock_enqueue procedure...
+ *
+ * TCP support is largely untested and may be a little slow. The problem
+ * is that we currently do two separate recvfrom's, one for the 4-byte
+ * record length, and the second for the actual record. This could possibly
+ * be improved by always reading a minimum size of around 100 bytes and
+ * tucking any superfluous bytes away in a temporary store. Still, that
+ * leaves write requests out in the rain. An alternative may be to peek at
+ * the first skb in the queue, and if it matches the next TCP sequence
+ * number, to extract the record marker. Yuck.
+ *
+ * Copyright (C) 1995, 1996 Olaf Kirch <okir@monad.swb.de>
+ */
+
+#include <linux/sched.h>
+#include <linux/errno.h>
+#include <linux/fcntl.h>
+#include <linux/net.h>
+#include <linux/in.h>
+#include <linux/inet.h>
+#include <linux/udp.h>
+#include <linux/tcp.h>
+#include <linux/version.h>
+#include <linux/unistd.h>
+#include <linux/slab.h>
+#include <linux/netdevice.h>
+#include <linux/skbuff.h>
+#include <net/sock.h>
+#include <net/checksum.h>
+#include <net/ip.h>
+#include <asm/uaccess.h>
+#include <asm/ioctls.h>
+
+#include <linux/sunrpc/types.h>
+#include <linux/sunrpc/xdr.h>
+#include <linux/sunrpc/svcsock.h>
+#include <linux/sunrpc/stats.h>
+
+/* SMP locking strategy:
+ *
+ * svc_serv->sv_lock protects most stuff for that service.
+ *
+ * Some flags can be set to certain values at any time
+ * providing that certain rules are followed:
+ *
+ * SK_BUSY can be set to 0 at any time.
+ * svc_sock_enqueue must be called afterwards
+ * SK_CONN, SK_DATA, can be set or cleared at any time.
+ * after a set, svc_sock_enqueue must be called.
+ * after a clear, the socket must be read/accepted
+ * if this succeeds, it must be set again.
+ * SK_CLOSE can be set at any time. It is never cleared.
+ *
+ */
+
+#define RPCDBG_FACILITY RPCDBG_SVCSOCK
+
+
+static struct svc_sock *svc_setup_socket(struct svc_serv *, struct socket *,
+ int *errp, int pmap_reg);
+static void svc_udp_data_ready(struct sock *, int);
+static int svc_udp_recvfrom(struct svc_rqst *);
+static int svc_udp_sendto(struct svc_rqst *);
+
+static struct svc_deferred_req *svc_deferred_dequeue(struct svc_sock *svsk);
+static int svc_deferred_recv(struct svc_rqst *rqstp);
+static struct cache_deferred_req *svc_defer(struct cache_req *req);
+
+/*
+ * Queue up an idle server thread. Must have serv->sv_lock held.
+ * Note: this is really a stack rather than a queue, so that we only
+ * use as many different threads as we need, and the rest don't pollute
+ * the cache.
+ */
+static inline void
+svc_serv_enqueue(struct svc_serv *serv, struct svc_rqst *rqstp)
+{
+ list_add(&rqstp->rq_list, &serv->sv_threads);
+}
+
+/*
+ * Dequeue an nfsd thread. Must have serv->sv_lock held.
+ */
+static inline void
+svc_serv_dequeue(struct svc_serv *serv, struct svc_rqst *rqstp)
+{
+ list_del(&rqstp->rq_list);
+}
+
+/*
+ * Release an skbuff after use
+ */
+static inline void
+svc_release_skb(struct svc_rqst *rqstp)
+{
+ struct sk_buff *skb = rqstp->rq_skbuff;
+ struct svc_deferred_req *dr = rqstp->rq_deferred;
+
+ if (skb) {
+ rqstp->rq_skbuff = NULL;
+
+ dprintk("svc: service %p, releasing skb %p\n", rqstp, skb);
+ skb_free_datagram(rqstp->rq_sock->sk_sk, skb);
+ }
+ if (dr) {
+ rqstp->rq_deferred = NULL;
+ kfree(dr);
+ }
+}
+
+/*
+ * Queue up a socket with data pending. If there are idle nfsd
+ * processes, wake 'em up.
+ *
+ */
+static void
+svc_sock_enqueue(struct svc_sock *svsk)
+{
+ struct svc_serv *serv = svsk->sk_server;
+ struct svc_rqst *rqstp;
+
+ if (!(svsk->sk_flags &
+ ( (1<<SK_CONN)|(1<<SK_DATA)|(1<<SK_CLOSE)|(1<<SK_DEFERRED)) ))
+ return;
+
+ spin_lock_bh(&serv->sv_lock);
+
+ if (!list_empty(&serv->sv_threads) &&
+ !list_empty(&serv->sv_sockets))
+ printk(KERN_ERR
+ "svc_sock_enqueue: threads and sockets both waiting??\n");
+
+ if (test_bit(SK_DEAD, &svsk->sk_flags)) {
+ /* Don't enqueue dead sockets */
+ dprintk("svc: socket %p is dead, not enqueued\n", svsk->sk_sk);
+ goto out_unlock;
+ }
+
+ if (test_bit(SK_BUSY, &svsk->sk_flags)) {
+ /* Don't enqueue socket while daemon is receiving */
+ dprintk("svc: socket %p busy, not enqueued\n", svsk->sk_sk);
+ goto out_unlock;
+ }
+
+ if (((svsk->sk_reserved + serv->sv_bufsz)*2
+ > sock_wspace(svsk->sk_sk))
+ && !test_bit(SK_CLOSE, &svsk->sk_flags)
+ && !test_bit(SK_CONN, &svsk->sk_flags)) {
+ /* Don't enqueue while not enough space for reply */
+ dprintk("svc: socket %p no space, %d*2 > %ld, not enqueued\n",
+ svsk->sk_sk, svsk->sk_reserved+serv->sv_bufsz,
+ sock_wspace(svsk->sk_sk));
+ goto out_unlock;
+ }
+
+ /* Mark socket as busy. It will remain in this state until the
+ * server has processed all pending data and put the socket back
+ * on the idle list.
+ */
+ set_bit(SK_BUSY, &svsk->sk_flags);
+
+ if (!list_empty(&serv->sv_threads)) {
+ rqstp = list_entry(serv->sv_threads.next,
+ struct svc_rqst,
+ rq_list);
+ dprintk("svc: socket %p served by daemon %p\n",
+ svsk->sk_sk, rqstp);
+ svc_serv_dequeue(serv, rqstp);
+ if (rqstp->rq_sock)
+ printk(KERN_ERR
+ "svc_sock_enqueue: server %p, rq_sock=%p!\n",
+ rqstp, rqstp->rq_sock);
+ rqstp->rq_sock = svsk;
+ svsk->sk_inuse++;
+ rqstp->rq_reserved = serv->sv_bufsz;
+ svsk->sk_reserved += rqstp->rq_reserved;
+ wake_up(&rqstp->rq_wait);
+ } else {
+ dprintk("svc: socket %p put into queue\n", svsk->sk_sk);
+ list_add_tail(&svsk->sk_ready, &serv->sv_sockets);
+ }
+
+out_unlock:
+ spin_unlock_bh(&serv->sv_lock);
+}
+
+/*
+ * Dequeue the first socket. Must be called with the serv->sv_lock held.
+ */
+static inline struct svc_sock *
+svc_sock_dequeue(struct svc_serv *serv)
+{
+ struct svc_sock *svsk;
+
+ if (list_empty(&serv->sv_sockets))
+ return NULL;
+
+ svsk = list_entry(serv->sv_sockets.next,
+ struct svc_sock, sk_ready);
+ list_del_init(&svsk->sk_ready);
+
+ dprintk("svc: socket %p dequeued, inuse=%d\n",
+ svsk->sk_sk, svsk->sk_inuse);
+
+ return svsk;
+}
+
+/*
+ * Having read something from a socket, check whether it
+ * needs to be re-enqueued.
+ * Note: SK_DATA only gets cleared when a read-attempt finds
+ * no (or insufficient) data.
+ */
+static inline void
+svc_sock_received(struct svc_sock *svsk)
+{
+ clear_bit(SK_BUSY, &svsk->sk_flags);
+ svc_sock_enqueue(svsk);
+}
+
+
+/**
+ * svc_reserve - change the space reserved for the reply to a request.
+ * @rqstp: The request in question
+ * @space: new max space to reserve
+ *
+ * Each request reserves some space on the output queue of the socket
+ * to make sure the reply fits. This function reduces that reserved
+ * space to be the amount of space used already, plus @space.
+ *
+ */
+void svc_reserve(struct svc_rqst *rqstp, int space)
+{
+ space += rqstp->rq_res.head[0].iov_len;
+
+ if (space < rqstp->rq_reserved) {
+ struct svc_sock *svsk = rqstp->rq_sock;
+ spin_lock_bh(&svsk->sk_server->sv_lock);
+ svsk->sk_reserved -= (rqstp->rq_reserved - space);
+ rqstp->rq_reserved = space;
+ spin_unlock_bh(&svsk->sk_server->sv_lock);
+
+ svc_sock_enqueue(svsk);
+ }
+}
+
+/*
+ * Release a socket after use.
+ */
+static inline void
+svc_sock_put(struct svc_sock *svsk)
+{
+ struct svc_serv *serv = svsk->sk_server;
+
+ spin_lock_bh(&serv->sv_lock);
+ if (!--(svsk->sk_inuse) && test_bit(SK_DEAD, &svsk->sk_flags)) {
+ spin_unlock_bh(&serv->sv_lock);
+ dprintk("svc: releasing dead socket\n");
+ sock_release(svsk->sk_sock);
+ kfree(svsk);
+ }
+ else
+ spin_unlock_bh(&serv->sv_lock);
+}
+
+static void
+svc_sock_release(struct svc_rqst *rqstp)
+{
+ struct svc_sock *svsk = rqstp->rq_sock;
+
+ svc_release_skb(rqstp);
+
+ svc_free_allpages(rqstp);
+ rqstp->rq_res.page_len = 0;
+ rqstp->rq_res.page_base = 0;
+
+
+ /* Reset response buffer and release
+ * the reservation.
+ * But first, check that enough space was reserved
+ * for the reply, otherwise we have a bug!
+ */
+ if ((rqstp->rq_res.len) > rqstp->rq_reserved)
+ printk(KERN_ERR "RPC request reserved %d but used %d\n",
+ rqstp->rq_reserved,
+ rqstp->rq_res.len);
+
+ rqstp->rq_res.head[0].iov_len = 0;
+ svc_reserve(rqstp, 0);
+ rqstp->rq_sock = NULL;
+
+ svc_sock_put(svsk);
+}
+
+/*
+ * External function to wake up a server waiting for data
+ */
+void
+svc_wake_up(struct svc_serv *serv)
+{
+ struct svc_rqst *rqstp;
+
+ spin_lock_bh(&serv->sv_lock);
+ if (!list_empty(&serv->sv_threads)) {
+ rqstp = list_entry(serv->sv_threads.next,
+ struct svc_rqst,
+ rq_list);
+ dprintk("svc: daemon %p woken up.\n", rqstp);
+ /*
+ svc_serv_dequeue(serv, rqstp);
+ rqstp->rq_sock = NULL;
+ */
+ wake_up(&rqstp->rq_wait);
+ }
+ spin_unlock_bh(&serv->sv_lock);
+}
+
+/*
+ * Generic sendto routine
+ */
+static int
+svc_sendto(struct svc_rqst *rqstp, struct xdr_buf *xdr)
+{
+ struct svc_sock *svsk = rqstp->rq_sock;
+ struct socket *sock = svsk->sk_sock;
+ int slen;
+ int len = 0;
+ int result;
+ int size;
+ struct page **ppage = xdr->pages;
+ size_t base = xdr->page_base;
+ unsigned int pglen = xdr->page_len;
+ unsigned int flags = MSG_MORE;
+
+ slen = xdr->len;
+
+ /* Grab svsk->sk_sem to serialize outgoing data. */
+ down(&svsk->sk_sem);
+
+ if (rqstp->rq_prot == IPPROTO_UDP) {
+ /* set the destination */
+ struct msghdr msg;
+ msg.msg_name = &rqstp->rq_addr;
+ msg.msg_namelen = sizeof(rqstp->rq_addr);
+ msg.msg_iov = NULL;
+ msg.msg_iovlen = 0;
+ msg.msg_control = NULL;
+ msg.msg_controllen = 0;
+ msg.msg_flags = MSG_MORE;
+
+ if (sock_sendmsg(sock, &msg, 0) < 0)
+ goto out;
+ }
+
+ /* send head */
+ if (slen == xdr->head[0].iov_len)
+ flags = 0;
+ len = sock->ops->sendpage(sock, rqstp->rq_respages[0], 0, xdr->head[0].iov_len, flags);
+ if (len != xdr->head[0].iov_len)
+ goto out;
+ slen -= xdr->head[0].iov_len;
+ if (slen == 0)
+ goto out;
+
+ /* send page data */
+ size = PAGE_SIZE - base < pglen ? PAGE_SIZE - base : pglen;
+ while (pglen > 0) {
+ if (slen == size)
+ flags = 0;
+ result = sock->ops->sendpage(sock, *ppage, base, size, flags);
+ if (result > 0)
+ len += result;
+ if (result != size)
+ goto out;
+ slen -= size;
+ pglen -= size;
+ size = PAGE_SIZE < pglen ? PAGE_SIZE : pglen;
+ base = 0;
+ ppage++;
+ }
+ /* send tail */
+ if (xdr->tail[0].iov_len) {
+ /* The tail *will* be in respages[0]; */
+ result = sock->ops->sendpage(sock, rqstp->rq_respages[rqstp->rq_restailpage],
+ ((unsigned long)xdr->tail[0].iov_base)& (PAGE_SIZE-1),
+ xdr->tail[0].iov_len, 0);
+
+ if (result > 0)
+ len += result;
+ }
+out:
+ up(&svsk->sk_sem);
+
+ dprintk("svc: socket %p sendto([%p %Zu... ], %d) = %d (addr %x)\n",
+ rqstp->rq_sock, xdr->head[0].iov_base, xdr->head[0].iov_len, xdr->len, len,
+ rqstp->rq_addr.sin_addr.s_addr);
+
+ return len;
+}
+
+/*
+ * Check input queue length
+ */
+static int
+svc_recv_available(struct svc_sock *svsk)
+{
+ mm_segment_t oldfs;
+ struct socket *sock = svsk->sk_sock;
+ int avail, err;
+
+ oldfs = get_fs(); set_fs(KERNEL_DS);
+ err = sock->ops->ioctl(sock, TIOCINQ, (unsigned long) &avail);
+ set_fs(oldfs);
+
+ return (err >= 0)? avail : err;
+}
+
+/*
+ * Generic recvfrom routine.
+ */
+static int
+svc_recvfrom(struct svc_rqst *rqstp, struct iovec *iov, int nr, int buflen)
+{
+ mm_segment_t oldfs;
+ struct msghdr msg;
+ struct socket *sock;
+ int len, alen;
+
+ rqstp->rq_addrlen = sizeof(rqstp->rq_addr);
+ sock = rqstp->rq_sock->sk_sock;
+
+ msg.msg_name = &rqstp->rq_addr;
+ msg.msg_namelen = sizeof(rqstp->rq_addr);
+ msg.msg_iov = iov;
+ msg.msg_iovlen = nr;
+ msg.msg_control = NULL;
+ msg.msg_controllen = 0;
+
+ msg.msg_flags = MSG_DONTWAIT;
+
+ oldfs = get_fs(); set_fs(KERNEL_DS);
+ len = sock_recvmsg(sock, &msg, buflen, MSG_DONTWAIT);
+ set_fs(oldfs);
+
+ /* sock_recvmsg doesn't fill in the name/namelen, so we must..
+ * possibly we should cache this in the svc_sock structure
+ * at accept time. FIXME
+ */
+ alen = sizeof(rqstp->rq_addr);
+ sock->ops->getname(sock, (struct sockaddr *)&rqstp->rq_addr, &alen, 1);
+
+ dprintk("svc: socket %p recvfrom(%p, %Zu) = %d\n",
+ rqstp->rq_sock, iov[0].iov_base, iov[0].iov_len, len);
+
+ return len;
+}
+
+/*
+ * Set socket snd and rcv buffer lengths
+ */
+static inline void
+svc_sock_setbufsize(struct socket *sock, unsigned int snd, unsigned int rcv)
+{
+#if 0
+ mm_segment_t oldfs;
+ oldfs = get_fs(); set_fs(KERNEL_DS);
+ sock_setsockopt(sock, SOL_SOCKET, SO_SNDBUF,
+ (char*)&snd, sizeof(snd));
+ sock_setsockopt(sock, SOL_SOCKET, SO_RCVBUF,
+ (char*)&rcv, sizeof(rcv));
+#else
+ /* sock_setsockopt limits use to sysctl_?mem_max,
+ * which isn't acceptable. Until that is made conditional
+ * on not having CAP_SYS_RESOURCE or similar, we go direct...
+ * DaveM said I could!
+ */
+ lock_sock(sock->sk);
+ sock->sk->sndbuf = snd * 2;
+ sock->sk->rcvbuf = rcv * 2;
+ sock->sk->userlocks |= SOCK_SNDBUF_LOCK|SOCK_RCVBUF_LOCK;
+ release_sock(sock->sk);
+#endif
+}
+/*
+ * INET callback when data has been received on the socket.
+ */
+static void
+svc_udp_data_ready(struct sock *sk, int count)
+{
+ struct svc_sock *svsk = (struct svc_sock *)(sk->user_data);
+
+ if (!svsk)
+ goto out;
+ dprintk("svc: socket %p(inet %p), count=%d, busy=%d\n",
+ svsk, sk, count, test_bit(SK_BUSY, &svsk->sk_flags));
+ set_bit(SK_DATA, &svsk->sk_flags);
+ svc_sock_enqueue(svsk);
+ out:
+ if (sk->sleep && waitqueue_active(sk->sleep))
+ wake_up_interruptible(sk->sleep);
+}
+
+/*
+ * INET callback when space is newly available on the socket.
+ */
+static void
+svc_write_space(struct sock *sk)
+{
+ struct svc_sock *svsk = (struct svc_sock *)(sk->user_data);
+
+ if (svsk) {
+ dprintk("svc: socket %p(inet %p), write_space busy=%d\n",
+ svsk, sk, test_bit(SK_BUSY, &svsk->sk_flags));
+ svc_sock_enqueue(svsk);
+ }
+
+ if (sk->sleep && waitqueue_active(sk->sleep)) {
+ printk(KERN_WARNING "RPC svc_write_space: some sleeping on %p\n",
+ svsk);
+ wake_up_interruptible(sk->sleep);
+ }
+}
+
+/*
+ * Receive a datagram from a UDP socket.
+ */
+extern int
+csum_partial_copy_to_xdr(struct xdr_buf *xdr, struct sk_buff *skb);
+
+static int
+svc_udp_recvfrom(struct svc_rqst *rqstp)
+{
+ struct svc_sock *svsk = rqstp->rq_sock;
+ struct svc_serv *serv = svsk->sk_server;
+ struct sk_buff *skb;
+ int err, len;
+
+ if (test_and_clear_bit(SK_CHNGBUF, &svsk->sk_flags))
+ /* udp sockets need large rcvbuf as all pending
+ * requests are still in that buffer. sndbuf must
+ * also be large enough that there is enough space
+ * for one reply per thread.
+ */
+ svc_sock_setbufsize(svsk->sk_sock,
+ (serv->sv_nrthreads+3) * serv->sv_bufsz,
+ (serv->sv_nrthreads+3) * serv->sv_bufsz);
+
+ if ((rqstp->rq_deferred = svc_deferred_dequeue(svsk)))
+ return svc_deferred_recv(rqstp);
+
+ clear_bit(SK_DATA, &svsk->sk_flags);
+ while ((skb = skb_recv_datagram(svsk->sk_sk, 0, 1, &err)) == NULL) {
+ svc_sock_received(svsk);
+ if (err == -EAGAIN)
+ return err;
+ /* possibly an icmp error */
+ dprintk("svc: recvfrom returned error %d\n", -err);
+ }
+ set_bit(SK_DATA, &svsk->sk_flags); /* there may be more data... */
+
+ len = skb->len - sizeof(struct udphdr);
+ rqstp->rq_arg.len = len;
+
+ rqstp->rq_prot = IPPROTO_UDP;
+
+ /* Get sender address */
+ rqstp->rq_addr.sin_family = AF_INET;
+ rqstp->rq_addr.sin_port = skb->h.uh->source;
+ rqstp->rq_addr.sin_addr.s_addr = skb->nh.iph->saddr;
+
+ if (skb_is_nonlinear(skb)) {
+ /* we have to copy */
+ local_bh_disable();
+ if (csum_partial_copy_to_xdr(&rqstp->rq_arg, skb)) {
+ local_bh_enable();
+ /* checksum error */
+ skb_free_datagram(svsk->sk_sk, skb);
+ svc_sock_received(svsk);
+ return 0;
+ }
+ local_bh_enable();
+ skb_free_datagram(svsk->sk_sk, skb);
+ } else {
+ /* we can use it in-place */
+ rqstp->rq_arg.head[0].iov_base = skb->data + sizeof(struct udphdr);
+ rqstp->rq_arg.head[0].iov_len = len;
+ if (skb->ip_summed != CHECKSUM_UNNECESSARY) {
+ if ((unsigned short)csum_fold(skb_checksum(skb, 0, skb->len, skb->csum))) {
+ skb_free_datagram(svsk->sk_sk, skb);
+ svc_sock_received(svsk);
+ return 0;
+ }
+ skb->ip_summed = CHECKSUM_UNNECESSARY;
+ }
+ rqstp->rq_skbuff = skb;
+ }
+
+ rqstp->rq_arg.page_base = 0;
+ if (len <= rqstp->rq_arg.head[0].iov_len) {
+ rqstp->rq_arg.head[0].iov_len = len;
+ rqstp->rq_arg.page_len = 0;
+ } else {
+ rqstp->rq_arg.page_len = len - rqstp->rq_arg.head[0].iov_len;
+ rqstp->rq_argused += (rqstp->rq_arg.page_len + PAGE_SIZE - 1)/ PAGE_SIZE;
+ }
+
+ if (serv->sv_stats)
+ serv->sv_stats->netudpcnt++;
+
+ /* One down, maybe more to go... */
+ svsk->sk_sk->stamp = skb->stamp;
+ svc_sock_received(svsk);
+
+ return len;
+}
+
+static int
+svc_udp_sendto(struct svc_rqst *rqstp)
+{
+ int error;
+
+ error = svc_sendto(rqstp, &rqstp->rq_res);
+ if (error == -ECONNREFUSED)
+ /* ICMP error on earlier request. */
+ error = svc_sendto(rqstp, &rqstp->rq_res);
+
+ return error;
+}
+
+static void
+svc_udp_init(struct svc_sock *svsk)
+{
+ svsk->sk_sk->data_ready = svc_udp_data_ready;
+ svsk->sk_sk->write_space = svc_write_space;
+ svsk->sk_recvfrom = svc_udp_recvfrom;
+ svsk->sk_sendto = svc_udp_sendto;
+
+ /* initialise setting must have enough space to
+ * receive and respond to one request.
+ * svc_udp_recvfrom will re-adjust if necessary
+ */
+ svc_sock_setbufsize(svsk->sk_sock,
+ 3 * svsk->sk_server->sv_bufsz,
+ 3 * svsk->sk_server->sv_bufsz);
+
+ set_bit(SK_DATA, &svsk->sk_flags); /* might have come in before data_ready set up */
+ set_bit(SK_CHNGBUF, &svsk->sk_flags);
+}
+
+/*
+ * A data_ready event on a listening socket means there's a connection
+ * pending. Do not use state_change as a substitute for it.
+ */
+static void
+svc_tcp_listen_data_ready(struct sock *sk, int count_unused)
+{
+ struct svc_sock *svsk;
+
+ dprintk("svc: socket %p TCP (listen) state change %d\n",
+ sk, sk->state);
+
+ if (sk->state != TCP_ESTABLISHED) {
+ /* Aborted connection, SYN_RECV or whatever... */
+ goto out;
+ }
+ if (!(svsk = (struct svc_sock *) sk->user_data)) {
+ printk("svc: socket %p: no user data\n", sk);
+ goto out;
+ }
+ set_bit(SK_CONN, &svsk->sk_flags);
+ svc_sock_enqueue(svsk);
+ out:
+ if (sk->sleep && waitqueue_active(sk->sleep))
+ wake_up_interruptible_all(sk->sleep);
+}
+
+/*
+ * A state change on a connected socket means it's dying or dead.
+ */
+static void
+svc_tcp_state_change(struct sock *sk)
+{
+ struct svc_sock *svsk;
+
+ dprintk("svc: socket %p TCP (connected) state change %d (svsk %p)\n",
+ sk, sk->state, sk->user_data);
+
+ if (!(svsk = (struct svc_sock *) sk->user_data)) {
+ printk("svc: socket %p: no user data\n", sk);
+ goto out;
+ }
+ set_bit(SK_CLOSE, &svsk->sk_flags);
+ svc_sock_enqueue(svsk);
+ out:
+ if (sk->sleep && waitqueue_active(sk->sleep))
+ wake_up_interruptible_all(sk->sleep);
+}
+
+static void
+svc_tcp_data_ready(struct sock *sk, int count)
+{
+ struct svc_sock * svsk;
+
+ dprintk("svc: socket %p TCP data ready (svsk %p)\n",
+ sk, sk->user_data);
+ if (!(svsk = (struct svc_sock *)(sk->user_data)))
+ goto out;
+ set_bit(SK_DATA, &svsk->sk_flags);
+ svc_sock_enqueue(svsk);
+ out:
+ if (sk->sleep && waitqueue_active(sk->sleep))
+ wake_up_interruptible(sk->sleep);
+}
+
+/*
+ * Accept a TCP connection
+ */
+static void
+svc_tcp_accept(struct svc_sock *svsk)
+{
+ struct sockaddr_in sin;
+ struct svc_serv *serv = svsk->sk_server;
+ struct socket *sock = svsk->sk_sock;
+ struct socket *newsock;
+ struct proto_ops *ops;
+ struct svc_sock *newsvsk;
+ int err, slen;
+
+ dprintk("svc: tcp_accept %p sock %p\n", svsk, sock);
+ if (!sock)
+ return;
+
+ if (!(newsock = sock_alloc())) {
+ printk(KERN_WARNING "%s: no more sockets!\n", serv->sv_name);
+ return;
+ }
+ dprintk("svc: tcp_accept %p allocated\n", newsock);
+
+ newsock->type = sock->type;
+ newsock->ops = ops = sock->ops;
+
+ clear_bit(SK_CONN, &svsk->sk_flags);
+ if ((err = ops->accept(sock, newsock, O_NONBLOCK)) < 0) {
+ if (err != -EAGAIN && net_ratelimit())
+ printk(KERN_WARNING "%s: accept failed (err %d)!\n",
+ serv->sv_name, -err);
+ goto failed; /* aborted connection or whatever */
+ }
+ set_bit(SK_CONN, &svsk->sk_flags);
+ svc_sock_enqueue(svsk);
+
+ slen = sizeof(sin);
+ err = ops->getname(newsock, (struct sockaddr *) &sin, &slen, 1);
+ if (err < 0) {
+ if (net_ratelimit())
+ printk(KERN_WARNING "%s: peername failed (err %d)!\n",
+ serv->sv_name, -err);
+ goto failed; /* aborted connection or whatever */
+ }
+
+ /* Ideally, we would want to reject connections from unauthorized
+ * hosts here, but when we get encryption, the IP of the host won't
+ * tell us anything. For now just warn about unpriv connections.
+ */
+ if (ntohs(sin.sin_port) >= 1024) {
+ dprintk(KERN_WARNING
+ "%s: connect from unprivileged port: %u.%u.%u.%u:%d\n",
+ serv->sv_name,
+ NIPQUAD(sin.sin_addr.s_addr), ntohs(sin.sin_port));
+ }
+
+ dprintk("%s: connect from %u.%u.%u.%u:%04x\n", serv->sv_name,
+ NIPQUAD(sin.sin_addr.s_addr), ntohs(sin.sin_port));
+
+ /* make sure that a write doesn't block forever when
+ * low on memory
+ */
+ newsock->sk->sndtimeo = HZ*30;
+
+ if (!(newsvsk = svc_setup_socket(serv, newsock, &err, 0)))
+ goto failed;
+
+
+ /* make sure that we don't have too many active connections.
+ * If we have, something must be dropped.
+ * We randomly choose between newest and oldest (in terms
+ * of recent activity) and drop it.
+ */
+ if (serv->sv_tmpcnt > (serv->sv_nrthreads+3)*5) {
+ struct svc_sock *svsk = NULL;
+ spin_lock_bh(&serv->sv_lock);
+ if (!list_empty(&serv->sv_tempsocks)) {
+ if (net_random()&1)
+ svsk = list_entry(serv->sv_tempsocks.prev,
+ struct svc_sock,
+ sk_list);
+ else
+ svsk = list_entry(serv->sv_tempsocks.next,
+ struct svc_sock,
+ sk_list);
+ set_bit(SK_CLOSE, &svsk->sk_flags);
+ svsk->sk_inuse ++;
+ }
+ spin_unlock_bh(&serv->sv_lock);
+
+ if (svsk) {
+ svc_sock_enqueue(svsk);
+ svc_sock_put(svsk);
+ }
+
+ }
+
+ if (serv->sv_stats)
+ serv->sv_stats->nettcpconn++;
+
+ return;
+
+failed:
+ sock_release(newsock);
+ return;
+}
+
+/*
+ * Receive data from a TCP socket.
+ */
+static int
+svc_tcp_recvfrom(struct svc_rqst *rqstp)
+{
+ struct svc_sock *svsk = rqstp->rq_sock;
+ struct svc_serv *serv = svsk->sk_server;
+ int len;
+ struct iovec vec[RPCSVC_MAXPAGES];
+ int pnum, vlen;
+
+ dprintk("svc: tcp_recv %p data %d conn %d close %d\n",
+ svsk, test_bit(SK_DATA, &svsk->sk_flags),
+ test_bit(SK_CONN, &svsk->sk_flags),
+ test_bit(SK_CLOSE, &svsk->sk_flags));
+
+ if ((rqstp->rq_deferred = svc_deferred_dequeue(svsk)))
+ return svc_deferred_recv(rqstp);
+
+ if (test_bit(SK_CLOSE, &svsk->sk_flags)) {
+ svc_delete_socket(svsk);
+ return 0;
+ }
+
+ if (test_bit(SK_CONN, &svsk->sk_flags)) {
+ svc_tcp_accept(svsk);
+ svc_sock_received(svsk);
+ return 0;
+ }
+
+ if (test_and_clear_bit(SK_CHNGBUF, &svsk->sk_flags))
+ /* sndbuf needs to have room for one request
+ * per thread, otherwise we can stall even when the
+ * network isn't a bottleneck.
+ * rcvbuf just needs to be able to hold a few requests.
+ * Normally they will be removed from the queue
+ * as soon as a complete request arrives.
+ */
+ svc_sock_setbufsize(svsk->sk_sock,
+ (serv->sv_nrthreads+3) * serv->sv_bufsz,
+ 3 * serv->sv_bufsz);
+
+ clear_bit(SK_DATA, &svsk->sk_flags);
+
+ /* Receive data. If we haven't got the record length yet, get
+ * the next four bytes. Otherwise try to gobble up as much as
+ * possible up to the complete record length.
+ */
+ if (svsk->sk_tcplen < 4) {
+ unsigned long want = 4 - svsk->sk_tcplen;
+ struct iovec iov;
+
+ iov.iov_base = ((char *) &svsk->sk_reclen) + svsk->sk_tcplen;
+ iov.iov_len = want;
+ if ((len = svc_recvfrom(rqstp, &iov, 1, want)) < 0)
+ goto error;
+ svsk->sk_tcplen += len;
+
+ if (len < want) {
+ dprintk("svc: short recvfrom while reading record length (%d of %d)\n",
+ len, want);
+ svc_sock_received(svsk);
+ return -EAGAIN; /* record header not complete */
+ }
+
+ svsk->sk_reclen = ntohl(svsk->sk_reclen);
+ if (!(svsk->sk_reclen & 0x80000000)) {
+ /* FIXME: technically, a record can be fragmented,
+ * and non-terminal fragments will not have the top
+ * bit set in the fragment length header.
+ * But apparently no known nfs clients send fragmented
+ * records. */
+ printk(KERN_NOTICE "RPC: bad TCP reclen 0x%08lx (non-terminal)\n",
+ (unsigned long) svsk->sk_reclen);
+ goto err_delete;
+ }
+ svsk->sk_reclen &= 0x7fffffff;
+ dprintk("svc: TCP record, %d bytes\n", svsk->sk_reclen);
+ if (svsk->sk_reclen > serv->sv_bufsz) {
+ printk(KERN_NOTICE "RPC: bad TCP reclen 0x%08lx (large)\n",
+ (unsigned long) svsk->sk_reclen);
+ goto err_delete;
+ }
+ }
+
+ /* Check whether enough data is available */
+ len = svc_recv_available(svsk);
+ if (len < 0)
+ goto error;
+
+ if (len < svsk->sk_reclen) {
+ dprintk("svc: incomplete TCP record (%d of %d)\n",
+ len, svsk->sk_reclen);
+ svc_sock_received(svsk);
+ return -EAGAIN; /* record not complete */
+ }
+ len = svsk->sk_reclen;
+ set_bit(SK_DATA, &svsk->sk_flags);
+
+ vec[0] = rqstp->rq_arg.head[0];
+ vlen = PAGE_SIZE;
+ pnum = 1;
+ while (vlen < len) {
+ vec[pnum].iov_base = page_address(rqstp->rq_argpages[rqstp->rq_argused++]);
+ vec[pnum].iov_len = PAGE_SIZE;
+ pnum++;
+ vlen += PAGE_SIZE;
+ }
+
+ /* Now receive data */
+ len = svc_recvfrom(rqstp, vec, pnum, len);
+ if (len < 0)
+ goto error;
+
+ dprintk("svc: TCP complete record (%d bytes)\n", len);
+ rqstp->rq_arg.len = len;
+ rqstp->rq_arg.page_base = 0;
+ if (len <= rqstp->rq_arg.head[0].iov_len) {
+ rqstp->rq_arg.head[0].iov_len = len;
+ rqstp->rq_arg.page_len = 0;
+ } else {
+ rqstp->rq_arg.page_len = len - rqstp->rq_arg.head[0].iov_len;
+ }
+
+ rqstp->rq_skbuff = 0;
+ rqstp->rq_prot = IPPROTO_TCP;
+
+ /* Reset TCP read info */
+ svsk->sk_reclen = 0;
+ svsk->sk_tcplen = 0;
+
+ svc_sock_received(svsk);
+ if (serv->sv_stats)
+ serv->sv_stats->nettcpcnt++;
+
+ return len;
+
+ err_delete:
+ svc_delete_socket(svsk);
+ return -EAGAIN;
+
+ error:
+ if (len == -EAGAIN) {
+ dprintk("RPC: TCP recvfrom got EAGAIN\n");
+ svc_sock_received(svsk);
+ } else {
+ printk(KERN_NOTICE "%s: recvfrom returned errno %d\n",
+ svsk->sk_server->sv_name, -len);
+ svc_sock_received(svsk);
+ }
+
+ return len;
+}
+
+/*
+ * Send out data on TCP socket.
+ */
+static int
+svc_tcp_sendto(struct svc_rqst *rqstp)
+{
+ struct xdr_buf *xbufp = &rqstp->rq_res;
+ int sent;
+ u32 reclen;
+
+ /* Set up the first element of the reply iovec.
+ * Any other iovecs that may be in use have been taken
+ * care of by the server implementation itself.
+ */
+ reclen = htonl(0x80000000|((xbufp->len ) - 4));
+ memcpy(xbufp->head[0].iov_base, &reclen, 4);
+
+ sent = svc_sendto(rqstp, &rqstp->rq_res);
+ if (sent != xbufp->len) {
+ printk(KERN_NOTICE "rpc-srv/tcp: %s: %s %d when sending %d bytes - shutting down socket\n",
+ rqstp->rq_sock->sk_server->sv_name,
+ (sent<0)?"got error":"sent only",
+ sent, xbufp->len);
+ svc_delete_socket(rqstp->rq_sock);
+ sent = -EAGAIN;
+ }
+ return sent;
+}
+
+static void
+svc_tcp_init(struct svc_sock *svsk)
+{
+ struct sock *sk = svsk->sk_sk;
+ struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
+
+ svsk->sk_recvfrom = svc_tcp_recvfrom;
+ svsk->sk_sendto = svc_tcp_sendto;
+
+ if (sk->state == TCP_LISTEN) {
+ dprintk("setting up TCP socket for listening\n");
+ sk->data_ready = svc_tcp_listen_data_ready;
+ set_bit(SK_CONN, &svsk->sk_flags);
+ } else {
+ dprintk("setting up TCP socket for reading\n");
+ sk->state_change = svc_tcp_state_change;
+ sk->data_ready = svc_tcp_data_ready;
+ sk->write_space = svc_write_space;
+
+ svsk->sk_reclen = 0;
+<<<<<<<
+ svsk->sk_tcplen = 0;
+
+ /* initialise setting must have enough space to
+ * receive and respond to one request.
+ * svc_tcp_recvfrom will re-adjust if necessary
+|||||||
+ svsk->sk_tcplen = 0;
+
+ /* initialise setting must have enough space to
+ * receive and respond to one request.
+ * svc_tcp_recvfrom will re-adjust if necessary
+=======
+ svsk->sk_tcplen = 0;
+
+ tp->nonagle = 1; /* disable Nagle's algorithm */
+
+ /* initialise setting must have enough space to
+ * receive and respond to one request.
+ * svc_tcp_recvfrom will re-adjust if necessary
+>>>>>>>
+ */
+ svc_sock_setbufsize(svsk->sk_sock,
+ 3 * svsk->sk_server->sv_bufsz,
+ 3 * svsk->sk_server->sv_bufsz);
+
+ set_bit(SK_CHNGBUF, &svsk->sk_flags);
+ set_bit(SK_DATA, &svsk->sk_flags);
+ }
+}
+
+void
+svc_sock_update_bufs(struct svc_serv *serv)
+{
+ /*
+ * The number of server threads has changed. Update
+ * rcvbuf and sndbuf accordingly on all sockets
+ */
+ struct list_head *le;
+
+ spin_lock_bh(&serv->sv_lock);
+ list_for_each(le, &serv->sv_permsocks) {
+ struct svc_sock *svsk =
+ list_entry(le, struct svc_sock, sk_list);
+ set_bit(SK_CHNGBUF, &svsk->sk_flags);
+ }
+ list_for_each(le, &serv->sv_tempsocks) {
+ struct svc_sock *svsk =
+ list_entry(le, struct svc_sock, sk_list);
+ set_bit(SK_CHNGBUF, &svsk->sk_flags);
+ }
+ spin_unlock_bh(&serv->sv_lock);
+}
+
+/*
+ * Receive the next request on any socket.
+ */
+int
+svc_recv(struct svc_serv *serv, struct svc_rqst *rqstp, long timeout)
+{
+ struct svc_sock *svsk =NULL;
+ int len;
+ int pages;
+ struct xdr_buf *arg;
+ DECLARE_WAITQUEUE(wait, current);
+
+ dprintk("svc: server %p waiting for data (to = %ld)\n",
+ rqstp, timeout);
+
+ if (rqstp->rq_sock)
+ printk(KERN_ERR
+ "svc_recv: service %p, socket not NULL!\n",
+ rqstp);
+ if (waitqueue_active(&rqstp->rq_wait))
+ printk(KERN_ERR
+ "svc_recv: service %p, wait queue active!\n",
+ rqstp);
+
+ /* Initialize the buffers */
+ /* first reclaim pages that were moved to response list */
+ svc_pushback_allpages(rqstp);
+
+ /* now allocate needed pages. If we get a failure, sleep briefly */
+ pages = 2 + (serv->sv_bufsz + PAGE_SIZE -1) / PAGE_SIZE;
+ while (rqstp->rq_arghi < pages) {
+ struct page *p = alloc_page(GFP_KERNEL);
+ if (!p) {
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ schedule_timeout(HZ/2);
+ current->state = TASK_RUNNING;
+ continue;
+ }
+ rqstp->rq_argpages[rqstp->rq_arghi++] = p;
+ }
+
+ /* Make arg->head point to first page and arg->pages point to rest */
+ arg = &rqstp->rq_arg;
+ arg->head[0].iov_base = page_address(rqstp->rq_argpages[0]);
+ arg->head[0].iov_len = PAGE_SIZE;
+ rqstp->rq_argused = 1;
+ arg->pages = rqstp->rq_argpages + 1;
+ arg->page_base = 0;
+ /* save at least one page for response */
+ arg->page_len = (pages-2)*PAGE_SIZE;
+ arg->len = (pages-1)*PAGE_SIZE;
+ arg->tail[0].iov_len = 0;
+
+ if (signalled())
+ return -EINTR;
+
+ spin_lock_bh(&serv->sv_lock);
+ if (!list_empty(&serv->sv_tempsocks)) {
+ svsk = list_entry(serv->sv_tempsocks.next,
+ struct svc_sock, sk_list);
+ /* apparently the "standard" is that clients close
+ * idle connections after 5 minutes, servers after
+ * 6 minutes
+ * http://www.connectathon.org/talks96/nfstcp.pdf
+ */
+ if (get_seconds() - svsk->sk_lastrecv < 6*60
+ || test_bit(SK_BUSY, &svsk->sk_flags))
+ svsk = NULL;
+ }
+ if (svsk) {
+ set_bit(SK_BUSY, &svsk->sk_flags);
+ set_bit(SK_CLOSE, &svsk->sk_flags);
+ rqstp->rq_sock = svsk;
+ svsk->sk_inuse++;
+ } else if ((svsk = svc_sock_dequeue(serv)) != NULL) {
+ rqstp->rq_sock = svsk;
+ svsk->sk_inuse++;
+ rqstp->rq_reserved = serv->sv_bufsz;
+ svsk->sk_reserved += rqstp->rq_reserved;
+ } else {
+ /* No data pending. Go to sleep */
+ svc_serv_enqueue(serv, rqstp);
+
+ /*
+ * We have to be able to interrupt this wait
+ * to bring down the daemons ...
+ */
+ set_current_state(TASK_INTERRUPTIBLE);
+ add_wait_queue(&rqstp->rq_wait, &wait);
+ spin_unlock_bh(&serv->sv_lock);
+
+ schedule_timeout(timeout);
+
+ spin_lock_bh(&serv->sv_lock);
+ remove_wait_queue(&rqstp->rq_wait, &wait);
+
+ if (!(svsk = rqstp->rq_sock)) {
+ svc_serv_dequeue(serv, rqstp);
+ spin_unlock_bh(&serv->sv_lock);
+ dprintk("svc: server %p, no data yet\n", rqstp);
+ return signalled()? -EINTR : -EAGAIN;
+ }
+ }
+ spin_unlock_bh(&serv->sv_lock);
+
+ dprintk("svc: server %p, socket %p, inuse=%d\n",
+ rqstp, svsk, svsk->sk_inuse);
+ len = svsk->sk_recvfrom(rqstp);
+ dprintk("svc: got len=%d\n", len);
+
+ /* No data, incomplete (TCP) read, or accept() */
+ if (len == 0 || len == -EAGAIN) {
+ svc_sock_release(rqstp);
+ return -EAGAIN;
+ }
+ svsk->sk_lastrecv = get_seconds();
+ if (test_bit(SK_TEMP, &svsk->sk_flags)) {
+ /* push active sockets to end of list */
+ spin_lock_bh(&serv->sv_lock);
+ if (!list_empty(&svsk->sk_list))
+ list_move_tail(&svsk->sk_list, &serv->sv_tempsocks);
+ spin_unlock_bh(&serv->sv_lock);
+ }
+
+ rqstp->rq_secure = ntohs(rqstp->rq_addr.sin_port) < 1024;
+ rqstp->rq_userset = 0;
+ rqstp->rq_chandle.defer = svc_defer;
+
+ if (serv->sv_stats)
+ serv->sv_stats->netcnt++;
+ return len;
+}
+
+/*
+ * Drop request
+ */
+void
+svc_drop(struct svc_rqst *rqstp)
+{
+ dprintk("svc: socket %p dropped request\n", rqstp->rq_sock);
+ svc_sock_release(rqstp);
+}
+
+/*
+ * Return reply to client.
+ */
+int
+svc_send(struct svc_rqst *rqstp)
+{
+ struct svc_sock *svsk;
+ int len;
+ struct xdr_buf *xb;
+
+ if ((svsk = rqstp->rq_sock) == NULL) {
+ printk(KERN_WARNING "NULL socket pointer in %s:%d\n",
+ __FILE__, __LINE__);
+ return -EFAULT;
+ }
+
+ /* release the receive skb before sending the reply */
+ svc_release_skb(rqstp);
+
+ /* calculate over-all length */
+ xb = & rqstp->rq_res;
+ xb->len = xb->head[0].iov_len +
+ xb->page_len +
+ xb->tail[0].iov_len;
+
+ len = svsk->sk_sendto(rqstp);
+ svc_sock_release(rqstp);
+
+ if (len == -ECONNREFUSED || len == -ENOTCONN || len == -EAGAIN)
+ return 0;
+ return len;
+}
+
+/*
+ * Initialize socket for RPC use and create svc_sock struct
+ * XXX: May want to setsockopt SO_SNDBUF and SO_RCVBUF.
+ */
+static struct svc_sock *
+svc_setup_socket(struct svc_serv *serv, struct socket *sock,
+ int *errp, int pmap_register)
+{
+ struct svc_sock *svsk;
+ struct sock *inet;
+
+ dprintk("svc: svc_setup_socket %p\n", sock);
+ if (!(svsk = kmalloc(sizeof(*svsk), GFP_KERNEL))) {
+ *errp = -ENOMEM;
+ return NULL;
+ }
+ memset(svsk, 0, sizeof(*svsk));
+
+ inet = sock->sk;
+
+ /* Register socket with portmapper */
+ if (*errp >= 0 && pmap_register)
+ *errp = svc_register(serv, inet->protocol,
+ ntohs(inet_sk(inet)->sport));
+
+ if (*errp < 0) {
+ kfree(svsk);
+ return NULL;
+ }
+
+ set_bit(SK_BUSY, &svsk->sk_flags);
+ inet->user_data = svsk;
+ svsk->sk_sock = sock;
+ svsk->sk_sk = inet;
+ svsk->sk_ostate = inet->state_change;
+ svsk->sk_odata = inet->data_ready;
+ svsk->sk_owspace = inet->write_space;
+ svsk->sk_server = serv;
+ svsk->sk_lastrecv = get_seconds();
+ INIT_LIST_HEAD(&svsk->sk_deferred);
+ INIT_LIST_HEAD(&svsk->sk_ready);
+ sema_init(&svsk->sk_sem, 1);
+
+ /* Initialize the socket */
+ if (sock->type == SOCK_DGRAM)
+ svc_udp_init(svsk);
+ else
+ svc_tcp_init(svsk);
+
+ spin_lock_bh(&serv->sv_lock);
+ if (!pmap_register) {
+ set_bit(SK_TEMP, &svsk->sk_flags);
+ list_add(&svsk->sk_list, &serv->sv_tempsocks);
+ serv->sv_tmpcnt++;
+ } else {
+ clear_bit(SK_TEMP, &svsk->sk_flags);
+ list_add(&svsk->sk_list, &serv->sv_permsocks);
+ }
+ spin_unlock_bh(&serv->sv_lock);
+
+ dprintk("svc: svc_setup_socket created %p (inet %p)\n",
+ svsk, svsk->sk_sk);
+
+ clear_bit(SK_BUSY, &svsk->sk_flags);
+ svc_sock_enqueue(svsk);
+ return svsk;
+}
+
+/*
+ * Create socket for RPC service.
+ */
+static int
+svc_create_socket(struct svc_serv *serv, int protocol, struct sockaddr_in *sin)
+{
+ struct svc_sock *svsk;
+ struct socket *sock;
+ int error;
+ int type;
+
+ dprintk("svc: svc_create_socket(%s, %d, %u.%u.%u.%u:%d)\n",
+ serv->sv_program->pg_name, protocol,
+ NIPQUAD(sin->sin_addr.s_addr),
+ ntohs(sin->sin_port));
+
+ if (protocol != IPPROTO_UDP && protocol != IPPROTO_TCP) {
+ printk(KERN_WARNING "svc: only UDP and TCP "
+ "sockets supported\n");
+ return -EINVAL;
+ }
+ type = (protocol == IPPROTO_UDP)? SOCK_DGRAM : SOCK_STREAM;
+
+ if ((error = sock_create(PF_INET, type, protocol, &sock)) < 0)
+ return error;
+
+ if (sin != NULL) {
+ sock->sk->reuse = 1; /* allow address reuse */
+ error = sock->ops->bind(sock, (struct sockaddr *) sin,
+ sizeof(*sin));
+ if (error < 0)
+ goto bummer;
+ }
+
+ if (protocol == IPPROTO_TCP) {
+ if ((error = sock->ops->listen(sock, 64)) < 0)
+ goto bummer;
+ }
+
+ if ((svsk = svc_setup_socket(serv, sock, &error, 1)) != NULL)
+ return 0;
+
+bummer:
+ dprintk("svc: svc_create_socket error = %d\n", -error);
+ sock_release(sock);
+ return error;
+}
+
+/*
+ * Remove a dead socket
+ */
+void
+svc_delete_socket(struct svc_sock *svsk)
+{
+ struct svc_serv *serv;
+ struct sock *sk;
+
+ dprintk("svc: svc_delete_socket(%p)\n", svsk);
+
+ serv = svsk->sk_server;
+ sk = svsk->sk_sk;
+
+ sk->state_change = svsk->sk_ostate;
+ sk->data_ready = svsk->sk_odata;
+ sk->write_space = svsk->sk_owspace;
+
+ spin_lock_bh(&serv->sv_lock);
+
+ list_del_init(&svsk->sk_list);
+ list_del_init(&svsk->sk_ready);
+ if (!test_and_set_bit(SK_DEAD, &svsk->sk_flags))
+ if (test_bit(SK_TEMP, &svsk->sk_flags))
+ serv->sv_tmpcnt--;
+
+ if (!svsk->sk_inuse) {
+ spin_unlock_bh(&serv->sv_lock);
+ sock_release(svsk->sk_sock);
+ kfree(svsk);
+ } else {
+ spin_unlock_bh(&serv->sv_lock);
+ dprintk(KERN_NOTICE "svc: server socket destroy delayed\n");
+ /* svsk->sk_server = NULL; */
+ }
+}
+
+/*
+ * Make a socket for nfsd and lockd
+ */
+int
+svc_makesock(struct svc_serv *serv, int protocol, unsigned short port)
+{
+ struct sockaddr_in sin;
+
+ dprintk("svc: creating socket proto = %d\n", protocol);
+ sin.sin_family = AF_INET;
+ sin.sin_addr.s_addr = INADDR_ANY;
+ sin.sin_port = htons(port);
+ return svc_create_socket(serv, protocol, &sin);
+}
+
+/*
+ * Handle defer and revisit of requests
+ */
+
+static void svc_revisit(struct cache_deferred_req *dreq, int too_many)
+{
+ struct svc_deferred_req *dr = container_of(dreq, struct svc_deferred_req, handle);
+ struct svc_serv *serv = dr->serv;
+ struct svc_sock *svsk;
+
+ if (too_many) {
+ svc_sock_put(dr->svsk);
+ kfree(dr);
+ return;
+ }
+ dprintk("revisit queued\n");
+ svsk = dr->svsk;
+ dr->svsk = NULL;
+ spin_lock(&serv->sv_lock);
+ list_add(&dr->handle.recent, &svsk->sk_deferred);
+ spin_unlock(&serv->sv_lock);
+ set_bit(SK_DEFERRED, &svsk->sk_flags);
+ svc_sock_enqueue(svsk);
+ svc_sock_put(svsk);
+}
+
+static struct cache_deferred_req *
+svc_defer(struct cache_req *req)
+{
+ struct svc_rqst *rqstp = container_of(req, struct svc_rqst, rq_chandle);
+ int size = sizeof(struct svc_deferred_req) + (rqstp->rq_arg.len);
+ struct svc_deferred_req *dr;
+
+ if (rqstp->rq_arg.page_len)
+ return NULL; /* if more than a page, give up FIXME */
+ if (rqstp->rq_deferred) {
+ dr = rqstp->rq_deferred;
+ rqstp->rq_deferred = NULL;
+ } else {
+ int skip = rqstp->rq_arg.len - rqstp->rq_arg.head[0].iov_len;
+ /* FIXME maybe discard if size too large */
+ dr = kmalloc(size, GFP_KERNEL);
+ if (dr == NULL)
+ return NULL;
+
+ dr->serv = rqstp->rq_server;
+ dr->prot = rqstp->rq_prot;
+ dr->addr = rqstp->rq_addr;
+ dr->argslen = rqstp->rq_arg.len >> 2;
+ memcpy(dr->args, rqstp->rq_arg.head[0].iov_base-skip, dr->argslen<<2);
+ }
+ spin_lock(&rqstp->rq_server->sv_lock);
+ rqstp->rq_sock->sk_inuse++;
+ dr->svsk = rqstp->rq_sock;
+ spin_unlock(&rqstp->rq_server->sv_lock);
+
+ dr->handle.revisit = svc_revisit;
+ return &dr->handle;
+}
+
+/*
+ * recv data from a deferred request into an active one
+ */
+static int svc_deferred_recv(struct svc_rqst *rqstp)
+{
+ struct svc_deferred_req *dr = rqstp->rq_deferred;
+
+ rqstp->rq_arg.head[0].iov_base = dr->args;
+ rqstp->rq_arg.head[0].iov_len = dr->argslen<<2;
+ rqstp->rq_arg.page_len = 0;
+ rqstp->rq_arg.len = dr->argslen<<2;
+ rqstp->rq_prot = dr->prot;
+ rqstp->rq_addr = dr->addr;
+ return dr->argslen<<2;
+}
+
+
+static struct svc_deferred_req *svc_deferred_dequeue(struct svc_sock *svsk)
+{
+ struct svc_deferred_req *dr = NULL;
+ struct svc_serv *serv = svsk->sk_server;
+
+ if (!test_bit(SK_DEFERRED, &svsk->sk_flags))
+ return NULL;
+ spin_lock(&serv->sv_lock);
+ clear_bit(SK_DEFERRED, &svsk->sk_flags);
+ if (!list_empty(&svsk->sk_deferred)) {
+ dr = list_entry(svsk->sk_deferred.next,
+ struct svc_deferred_req,
+ handle.recent);
+ list_del_init(&dr->handle.recent);
+ set_bit(SK_DEFERRED, &svsk->sk_flags);
+ }
+ spin_unlock(&serv->sv_lock);
+ svc_sock_received(svsk);
+ return dr;
+}
diff --git a/tests/linux/rpc_tcp_nonagle/orig b/tests/linux/rpc_tcp_nonagle/orig
new file mode 100644
index 0000000..983322e
--- /dev/null
+++ b/tests/linux/rpc_tcp_nonagle/orig
@@ -0,0 +1,1511 @@
+/*
+ * linux/net/sunrpc/svcsock.c
+ *
+ * These are the RPC server socket internals.
+ *
+ * The server scheduling algorithm does not always distribute the load
+ * evenly when servicing a single client. May need to modify the
+ * svc_sock_enqueue procedure...
+ *
+ * TCP support is largely untested and may be a little slow. The problem
+ * is that we currently do two separate recvfrom's, one for the 4-byte
+ * record length, and the second for the actual record. This could possibly
+ * be improved by always reading a minimum size of around 100 bytes and
+ * tucking any superfluous bytes away in a temporary store. Still, that
+ * leaves write requests out in the rain. An alternative may be to peek at
+ * the first skb in the queue, and if it matches the next TCP sequence
+ * number, to extract the record marker. Yuck.
+ *
+ * Copyright (C) 1995, 1996 Olaf Kirch <okir@monad.swb.de>
+ */
+
+#include <linux/sched.h>
+#include <linux/errno.h>
+#include <linux/fcntl.h>
+#include <linux/net.h>
+#include <linux/in.h>
+#include <linux/inet.h>
+#include <linux/udp.h>
+#include <linux/tcp.h>
+#include <linux/version.h>
+#include <linux/unistd.h>
+#include <linux/slab.h>
+#include <linux/netdevice.h>
+#include <linux/skbuff.h>
+#include <net/sock.h>
+#include <net/checksum.h>
+#include <net/ip.h>
+#include <asm/uaccess.h>
+#include <asm/ioctls.h>
+
+#include <linux/sunrpc/types.h>
+#include <linux/sunrpc/xdr.h>
+#include <linux/sunrpc/svcsock.h>
+#include <linux/sunrpc/stats.h>
+
+/* SMP locking strategy:
+ *
+ * svc_serv->sv_lock protects most stuff for that service.
+ *
+ * Some flags can be set to certain values at any time
+ * providing that certain rules are followed:
+ *
+ * SK_BUSY can be set to 0 at any time.
+ * svc_sock_enqueue must be called afterwards
+ * SK_CONN, SK_DATA, can be set or cleared at any time.
+ * after a set, svc_sock_enqueue must be called.
+ * after a clear, the socket must be read/accepted
+ * if this succeeds, it must be set again.
+ * SK_CLOSE can be set at any time. It is never cleared.
+ *
+ */
+
+#define RPCDBG_FACILITY RPCDBG_SVCSOCK
+
+
+static struct svc_sock *svc_setup_socket(struct svc_serv *, struct socket *,
+ int *errp, int pmap_reg);
+static void svc_udp_data_ready(struct sock *, int);
+static int svc_udp_recvfrom(struct svc_rqst *);
+static int svc_udp_sendto(struct svc_rqst *);
+
+static struct svc_deferred_req *svc_deferred_dequeue(struct svc_sock *svsk);
+static int svc_deferred_recv(struct svc_rqst *rqstp);
+static struct cache_deferred_req *svc_defer(struct cache_req *req);
+
+/*
+ * Queue up an idle server thread. Must have serv->sv_lock held.
+ * Note: this is really a stack rather than a queue, so that we only
+ * use as many different threads as we need, and the rest don't pollute
+ * the cache.
+ */
+static inline void
+svc_serv_enqueue(struct svc_serv *serv, struct svc_rqst *rqstp)
+{
+ list_add(&rqstp->rq_list, &serv->sv_threads);
+}
+
+/*
+ * Dequeue an nfsd thread. Must have serv->sv_lock held.
+ */
+static inline void
+svc_serv_dequeue(struct svc_serv *serv, struct svc_rqst *rqstp)
+{
+ list_del(&rqstp->rq_list);
+}
+
+/*
+ * Release an skbuff after use
+ */
+static inline void
+svc_release_skb(struct svc_rqst *rqstp)
+{
+ struct sk_buff *skb = rqstp->rq_skbuff;
+ struct svc_deferred_req *dr = rqstp->rq_deferred;
+
+ if (skb) {
+ rqstp->rq_skbuff = NULL;
+
+ dprintk("svc: service %p, releasing skb %p\n", rqstp, skb);
+ skb_free_datagram(rqstp->rq_sock->sk_sk, skb);
+ }
+ if (dr) {
+ rqstp->rq_deferred = NULL;
+ kfree(dr);
+ }
+}
+
+/*
+ * Queue up a socket with data pending. If there are idle nfsd
+ * processes, wake 'em up.
+ *
+ */
+static void
+svc_sock_enqueue(struct svc_sock *svsk)
+{
+ struct svc_serv *serv = svsk->sk_server;
+ struct svc_rqst *rqstp;
+
+ if (!(svsk->sk_flags &
+ ( (1<<SK_CONN)|(1<<SK_DATA)|(1<<SK_CLOSE)|(1<<SK_DEFERRED)) ))
+ return;
+
+ spin_lock_bh(&serv->sv_lock);
+
+ if (!list_empty(&serv->sv_threads) &&
+ !list_empty(&serv->sv_sockets))
+ printk(KERN_ERR
+ "svc_sock_enqueue: threads and sockets both waiting??\n");
+
+ if (test_bit(SK_DEAD, &svsk->sk_flags)) {
+ /* Don't enqueue dead sockets */
+ dprintk("svc: socket %p is dead, not enqueued\n", svsk->sk_sk);
+ goto out_unlock;
+ }
+
+ if (test_bit(SK_BUSY, &svsk->sk_flags)) {
+ /* Don't enqueue socket while daemon is receiving */
+ dprintk("svc: socket %p busy, not enqueued\n", svsk->sk_sk);
+ goto out_unlock;
+ }
+
+ if (((svsk->sk_reserved + serv->sv_bufsz)*2
+ > sock_wspace(svsk->sk_sk))
+ && !test_bit(SK_CLOSE, &svsk->sk_flags)
+ && !test_bit(SK_CONN, &svsk->sk_flags)) {
+ /* Don't enqueue while not enough space for reply */
+ dprintk("svc: socket %p no space, %d*2 > %ld, not enqueued\n",
+ svsk->sk_sk, svsk->sk_reserved+serv->sv_bufsz,
+ sock_wspace(svsk->sk_sk));
+ goto out_unlock;
+ }
+
+ /* Mark socket as busy. It will remain in this state until the
+ * server has processed all pending data and put the socket back
+ * on the idle list.
+ */
+ set_bit(SK_BUSY, &svsk->sk_flags);
+
+ if (!list_empty(&serv->sv_threads)) {
+ rqstp = list_entry(serv->sv_threads.next,
+ struct svc_rqst,
+ rq_list);
+ dprintk("svc: socket %p served by daemon %p\n",
+ svsk->sk_sk, rqstp);
+ svc_serv_dequeue(serv, rqstp);
+ if (rqstp->rq_sock)
+ printk(KERN_ERR
+ "svc_sock_enqueue: server %p, rq_sock=%p!\n",
+ rqstp, rqstp->rq_sock);
+ rqstp->rq_sock = svsk;
+ svsk->sk_inuse++;
+ rqstp->rq_reserved = serv->sv_bufsz;
+ svsk->sk_reserved += rqstp->rq_reserved;
+ wake_up(&rqstp->rq_wait);
+ } else {
+ dprintk("svc: socket %p put into queue\n", svsk->sk_sk);
+ list_add_tail(&svsk->sk_ready, &serv->sv_sockets);
+ }
+
+out_unlock:
+ spin_unlock_bh(&serv->sv_lock);
+}
+
+/*
+ * Dequeue the first socket. Must be called with the serv->sv_lock held.
+ */
+static inline struct svc_sock *
+svc_sock_dequeue(struct svc_serv *serv)
+{
+ struct svc_sock *svsk;
+
+ if (list_empty(&serv->sv_sockets))
+ return NULL;
+
+ svsk = list_entry(serv->sv_sockets.next,
+ struct svc_sock, sk_ready);
+ list_del_init(&svsk->sk_ready);
+
+ dprintk("svc: socket %p dequeued, inuse=%d\n",
+ svsk->sk_sk, svsk->sk_inuse);
+
+ return svsk;
+}
+
+/*
+ * Having read something from a socket, check whether it
+ * needs to be re-enqueued.
+ * Note: SK_DATA only gets cleared when a read-attempt finds
+ * no (or insufficient) data.
+ */
+static inline void
+svc_sock_received(struct svc_sock *svsk)
+{
+ clear_bit(SK_BUSY, &svsk->sk_flags);
+ svc_sock_enqueue(svsk);
+}
+
+
+/**
+ * svc_reserve - change the space reserved for the reply to a request.
+ * @rqstp: The request in question
+ * @space: new max space to reserve
+ *
+ * Each request reserves some space on the output queue of the socket
+ * to make sure the reply fits. This function reduces that reserved
+ * space to be the amount of space used already, plus @space.
+ *
+ */
+void svc_reserve(struct svc_rqst *rqstp, int space)
+{
+ space += rqstp->rq_res.head[0].iov_len;
+
+ if (space < rqstp->rq_reserved) {
+ struct svc_sock *svsk = rqstp->rq_sock;
+ spin_lock_bh(&svsk->sk_server->sv_lock);
+ svsk->sk_reserved -= (rqstp->rq_reserved - space);
+ rqstp->rq_reserved = space;
+ spin_unlock_bh(&svsk->sk_server->sv_lock);
+
+ svc_sock_enqueue(svsk);
+ }
+}
+
+/*
+ * Release a socket after use.
+ */
+static inline void
+svc_sock_put(struct svc_sock *svsk)
+{
+ struct svc_serv *serv = svsk->sk_server;
+
+ spin_lock_bh(&serv->sv_lock);
+ if (!--(svsk->sk_inuse) && test_bit(SK_DEAD, &svsk->sk_flags)) {
+ spin_unlock_bh(&serv->sv_lock);
+ dprintk("svc: releasing dead socket\n");
+ sock_release(svsk->sk_sock);
+ kfree(svsk);
+ }
+ else
+ spin_unlock_bh(&serv->sv_lock);
+}
+
+static void
+svc_sock_release(struct svc_rqst *rqstp)
+{
+ struct svc_sock *svsk = rqstp->rq_sock;
+
+ svc_release_skb(rqstp);
+
+ svc_free_allpages(rqstp);
+ rqstp->rq_res.page_len = 0;
+ rqstp->rq_res.page_base = 0;
+
+
+ /* Reset response buffer and release
+ * the reservation.
+ * But first, check that enough space was reserved
+ * for the reply, otherwise we have a bug!
+ */
+ if ((rqstp->rq_res.len) > rqstp->rq_reserved)
+ printk(KERN_ERR "RPC request reserved %d but used %d\n",
+ rqstp->rq_reserved,
+ rqstp->rq_res.len);
+
+ rqstp->rq_res.head[0].iov_len = 0;
+ svc_reserve(rqstp, 0);
+ rqstp->rq_sock = NULL;
+
+ svc_sock_put(svsk);
+}
+
+/*
+ * External function to wake up a server waiting for data
+ */
+void
+svc_wake_up(struct svc_serv *serv)
+{
+ struct svc_rqst *rqstp;
+
+ spin_lock_bh(&serv->sv_lock);
+ if (!list_empty(&serv->sv_threads)) {
+ rqstp = list_entry(serv->sv_threads.next,
+ struct svc_rqst,
+ rq_list);
+ dprintk("svc: daemon %p woken up.\n", rqstp);
+ /*
+ svc_serv_dequeue(serv, rqstp);
+ rqstp->rq_sock = NULL;
+ */
+ wake_up(&rqstp->rq_wait);
+ }
+ spin_unlock_bh(&serv->sv_lock);
+}
+
+/*
+ * Generic sendto routine
+ */
+static int
+svc_sendto(struct svc_rqst *rqstp, struct xdr_buf *xdr)
+{
+ struct svc_sock *svsk = rqstp->rq_sock;
+ struct socket *sock = svsk->sk_sock;
+ int slen;
+ int len = 0;
+ int result;
+ int size;
+ struct page **ppage = xdr->pages;
+ size_t base = xdr->page_base;
+ unsigned int pglen = xdr->page_len;
+ unsigned int flags = MSG_MORE;
+
+ slen = xdr->len;
+
+ /* Grab svsk->sk_sem to serialize outgoing data. */
+ down(&svsk->sk_sem);
+
+ if (rqstp->rq_prot == IPPROTO_UDP) {
+ /* set the destination */
+ struct msghdr msg;
+ msg.msg_name = &rqstp->rq_addr;
+ msg.msg_namelen = sizeof(rqstp->rq_addr);
+ msg.msg_iov = NULL;
+ msg.msg_iovlen = 0;
+ msg.msg_control = NULL;
+ msg.msg_controllen = 0;
+ msg.msg_flags = MSG_MORE;
+
+ if (sock_sendmsg(sock, &msg, 0) < 0)
+ goto out;
+ }
+
+ /* send head */
+ if (slen == xdr->head[0].iov_len)
+ flags = 0;
+ len = sock->ops->sendpage(sock, rqstp->rq_respages[0], 0, xdr->head[0].iov_len, flags);
+ if (len != xdr->head[0].iov_len)
+ goto out;
+ slen -= xdr->head[0].iov_len;
+ if (slen == 0)
+ goto out;
+
+ /* send page data */
+ size = PAGE_SIZE - base < pglen ? PAGE_SIZE - base : pglen;
+ while (pglen > 0) {
+ if (slen == size)
+ flags = 0;
+ result = sock->ops->sendpage(sock, *ppage, base, size, flags);
+ if (result > 0)
+ len += result;
+ if (result != size)
+ goto out;
+ slen -= size;
+ pglen -= size;
+ size = PAGE_SIZE < pglen ? PAGE_SIZE : pglen;
+ base = 0;
+ ppage++;
+ }
+ /* send tail */
+ if (xdr->tail[0].iov_len) {
+ /* The tail *will* be in respages[0]; */
+ result = sock->ops->sendpage(sock, rqstp->rq_respages[rqstp->rq_restailpage],
+ ((unsigned long)xdr->tail[0].iov_base)& (PAGE_SIZE-1),
+ xdr->tail[0].iov_len, 0);
+
+ if (result > 0)
+ len += result;
+ }
+out:
+ up(&svsk->sk_sem);
+
+ dprintk("svc: socket %p sendto([%p %Zu... ], %d) = %d (addr %x)\n",
+ rqstp->rq_sock, xdr->head[0].iov_base, xdr->head[0].iov_len, xdr->len, len,
+ rqstp->rq_addr.sin_addr.s_addr);
+
+ return len;
+}
+
+/*
+ * Check input queue length
+ */
+static int
+svc_recv_available(struct svc_sock *svsk)
+{
+ mm_segment_t oldfs;
+ struct socket *sock = svsk->sk_sock;
+ int avail, err;
+
+ oldfs = get_fs(); set_fs(KERNEL_DS);
+ err = sock->ops->ioctl(sock, TIOCINQ, (unsigned long) &avail);
+ set_fs(oldfs);
+
+ return (err >= 0)? avail : err;
+}
+
+/*
+ * Generic recvfrom routine.
+ */
+static int
+svc_recvfrom(struct svc_rqst *rqstp, struct iovec *iov, int nr, int buflen)
+{
+ mm_segment_t oldfs;
+ struct msghdr msg;
+ struct socket *sock;
+ int len, alen;
+
+ rqstp->rq_addrlen = sizeof(rqstp->rq_addr);
+ sock = rqstp->rq_sock->sk_sock;
+
+ msg.msg_name = &rqstp->rq_addr;
+ msg.msg_namelen = sizeof(rqstp->rq_addr);
+ msg.msg_iov = iov;
+ msg.msg_iovlen = nr;
+ msg.msg_control = NULL;
+ msg.msg_controllen = 0;
+
+ msg.msg_flags = MSG_DONTWAIT;
+
+ oldfs = get_fs(); set_fs(KERNEL_DS);
+ len = sock_recvmsg(sock, &msg, buflen, MSG_DONTWAIT);
+ set_fs(oldfs);
+
+ /* sock_recvmsg doesn't fill in the name/namelen, so we must..
+ * possibly we should cache this in the svc_sock structure
+ * at accept time. FIXME
+ */
+ alen = sizeof(rqstp->rq_addr);
+ sock->ops->getname(sock, (struct sockaddr *)&rqstp->rq_addr, &alen, 1);
+
+ dprintk("svc: socket %p recvfrom(%p, %Zu) = %d\n",
+ rqstp->rq_sock, iov[0].iov_base, iov[0].iov_len, len);
+
+ return len;
+}
+
+/*
+ * Set socket snd and rcv buffer lengths
+ */
+static inline void
+svc_sock_setbufsize(struct socket *sock, unsigned int snd, unsigned int rcv)
+{
+#if 0
+ mm_segment_t oldfs;
+ oldfs = get_fs(); set_fs(KERNEL_DS);
+ sock_setsockopt(sock, SOL_SOCKET, SO_SNDBUF,
+ (char*)&snd, sizeof(snd));
+ sock_setsockopt(sock, SOL_SOCKET, SO_RCVBUF,
+ (char*)&rcv, sizeof(rcv));
+#else
+ /* sock_setsockopt limits use to sysctl_?mem_max,
+ * which isn't acceptable. Until that is made conditional
+ * on not having CAP_SYS_RESOURCE or similar, we go direct...
+ * DaveM said I could!
+ */
+ lock_sock(sock->sk);
+ sock->sk->sndbuf = snd * 2;
+ sock->sk->rcvbuf = rcv * 2;
+ sock->sk->userlocks |= SOCK_SNDBUF_LOCK|SOCK_RCVBUF_LOCK;
+ release_sock(sock->sk);
+#endif
+}
+/*
+ * INET callback when data has been received on the socket.
+ */
+static void
+svc_udp_data_ready(struct sock *sk, int count)
+{
+ struct svc_sock *svsk = (struct svc_sock *)(sk->user_data);
+
+ if (!svsk)
+ goto out;
+ dprintk("svc: socket %p(inet %p), count=%d, busy=%d\n",
+ svsk, sk, count, test_bit(SK_BUSY, &svsk->sk_flags));
+ set_bit(SK_DATA, &svsk->sk_flags);
+ svc_sock_enqueue(svsk);
+ out:
+ if (sk->sleep && waitqueue_active(sk->sleep))
+ wake_up_interruptible(sk->sleep);
+}
+
+/*
+ * INET callback when space is newly available on the socket.
+ */
+static void
+svc_write_space(struct sock *sk)
+{
+ struct svc_sock *svsk = (struct svc_sock *)(sk->user_data);
+
+ if (svsk) {
+ dprintk("svc: socket %p(inet %p), write_space busy=%d\n",
+ svsk, sk, test_bit(SK_BUSY, &svsk->sk_flags));
+ svc_sock_enqueue(svsk);
+ }
+
+ if (sk->sleep && waitqueue_active(sk->sleep)) {
+ printk(KERN_WARNING "RPC svc_write_space: some sleeping on %p\n",
+ svsk);
+ wake_up_interruptible(sk->sleep);
+ }
+}
+
+/*
+ * Receive a datagram from a UDP socket.
+ */
+extern int
+csum_partial_copy_to_xdr(struct xdr_buf *xdr, struct sk_buff *skb);
+
+static int
+svc_udp_recvfrom(struct svc_rqst *rqstp)
+{
+ struct svc_sock *svsk = rqstp->rq_sock;
+ struct svc_serv *serv = svsk->sk_server;
+ struct sk_buff *skb;
+ int err, len;
+
+ if (test_and_clear_bit(SK_CHNGBUF, &svsk->sk_flags))
+ /* udp sockets need large rcvbuf as all pending
+ * requests are still in that buffer. sndbuf must
+ * also be large enough that there is enough space
+ * for one reply per thread.
+ */
+ svc_sock_setbufsize(svsk->sk_sock,
+ (serv->sv_nrthreads+3) * serv->sv_bufsz,
+ (serv->sv_nrthreads+3) * serv->sv_bufsz);
+
+ if ((rqstp->rq_deferred = svc_deferred_dequeue(svsk)))
+ return svc_deferred_recv(rqstp);
+
+ clear_bit(SK_DATA, &svsk->sk_flags);
+ while ((skb = skb_recv_datagram(svsk->sk_sk, 0, 1, &err)) == NULL) {
+ svc_sock_received(svsk);
+ if (err == -EAGAIN)
+ return err;
+ /* possibly an icmp error */
+ dprintk("svc: recvfrom returned error %d\n", -err);
+ }
+ set_bit(SK_DATA, &svsk->sk_flags); /* there may be more data... */
+
+ len = skb->len - sizeof(struct udphdr);
+ rqstp->rq_arg.len = len;
+
+ rqstp->rq_prot = IPPROTO_UDP;
+
+ /* Get sender address */
+ rqstp->rq_addr.sin_family = AF_INET;
+ rqstp->rq_addr.sin_port = skb->h.uh->source;
+ rqstp->rq_addr.sin_addr.s_addr = skb->nh.iph->saddr;
+
+ if (skb_is_nonlinear(skb)) {
+ /* we have to copy */
+ local_bh_disable();
+ if (csum_partial_copy_to_xdr(&rqstp->rq_arg, skb)) {
+ local_bh_enable();
+ /* checksum error */
+ skb_free_datagram(svsk->sk_sk, skb);
+ svc_sock_received(svsk);
+ return 0;
+ }
+ local_bh_enable();
+ skb_free_datagram(svsk->sk_sk, skb);
+ } else {
+ /* we can use it in-place */
+ rqstp->rq_arg.head[0].iov_base = skb->data + sizeof(struct udphdr);
+ rqstp->rq_arg.head[0].iov_len = len;
+ if (skb->ip_summed != CHECKSUM_UNNECESSARY) {
+ if ((unsigned short)csum_fold(skb_checksum(skb, 0, skb->len, skb->csum))) {
+ skb_free_datagram(svsk->sk_sk, skb);
+ svc_sock_received(svsk);
+ return 0;
+ }
+ skb->ip_summed = CHECKSUM_UNNECESSARY;
+ }
+ rqstp->rq_skbuff = skb;
+ }
+
+ rqstp->rq_arg.page_base = 0;
+ if (len <= rqstp->rq_arg.head[0].iov_len) {
+ rqstp->rq_arg.head[0].iov_len = len;
+ rqstp->rq_arg.page_len = 0;
+ } else {
+ rqstp->rq_arg.page_len = len - rqstp->rq_arg.head[0].iov_len;
+ rqstp->rq_argused += (rqstp->rq_arg.page_len + PAGE_SIZE - 1)/ PAGE_SIZE;
+ }
+
+ if (serv->sv_stats)
+ serv->sv_stats->netudpcnt++;
+
+ /* One down, maybe more to go... */
+ svsk->sk_sk->stamp = skb->stamp;
+ svc_sock_received(svsk);
+
+ return len;
+}
+
+static int
+svc_udp_sendto(struct svc_rqst *rqstp)
+{
+ int error;
+
+ error = svc_sendto(rqstp, &rqstp->rq_res);
+ if (error == -ECONNREFUSED)
+ /* ICMP error on earlier request. */
+ error = svc_sendto(rqstp, &rqstp->rq_res);
+
+ return error;
+}
+
+static void
+svc_udp_init(struct svc_sock *svsk)
+{
+ svsk->sk_sk->data_ready = svc_udp_data_ready;
+ svsk->sk_sk->write_space = svc_write_space;
+ svsk->sk_recvfrom = svc_udp_recvfrom;
+ svsk->sk_sendto = svc_udp_sendto;
+
+ /* initialise setting must have enough space to
+ * receive and respond to one request.
+ * svc_udp_recvfrom will re-adjust if necessary
+ */
+ svc_sock_setbufsize(svsk->sk_sock,
+ 3 * svsk->sk_server->sv_bufsz,
+ 3 * svsk->sk_server->sv_bufsz);
+
+ set_bit(SK_DATA, &svsk->sk_flags); /* might have come in before data_ready set up */
+ set_bit(SK_CHNGBUF, &svsk->sk_flags);
+}
+
+/*
+ * A data_ready event on a listening socket means there's a connection
+ * pending. Do not use state_change as a substitute for it.
+ */
+static void
+svc_tcp_listen_data_ready(struct sock *sk, int count_unused)
+{
+ struct svc_sock *svsk;
+
+ dprintk("svc: socket %p TCP (listen) state change %d\n",
+ sk, sk->state);
+
+ if (sk->state != TCP_ESTABLISHED) {
+ /* Aborted connection, SYN_RECV or whatever... */
+ goto out;
+ }
+ if (!(svsk = (struct svc_sock *) sk->user_data)) {
+ printk("svc: socket %p: no user data\n", sk);
+ goto out;
+ }
+ set_bit(SK_CONN, &svsk->sk_flags);
+ svc_sock_enqueue(svsk);
+ out:
+ if (sk->sleep && waitqueue_active(sk->sleep))
+ wake_up_interruptible_all(sk->sleep);
+}
+
+/*
+ * A state change on a connected socket means it's dying or dead.
+ */
+static void
+svc_tcp_state_change(struct sock *sk)
+{
+ struct svc_sock *svsk;
+
+ dprintk("svc: socket %p TCP (connected) state change %d (svsk %p)\n",
+ sk, sk->state, sk->user_data);
+
+ if (!(svsk = (struct svc_sock *) sk->user_data)) {
+ printk("svc: socket %p: no user data\n", sk);
+ goto out;
+ }
+ set_bit(SK_CLOSE, &svsk->sk_flags);
+ svc_sock_enqueue(svsk);
+ out:
+ if (sk->sleep && waitqueue_active(sk->sleep))
+ wake_up_interruptible_all(sk->sleep);
+}
+
+static void
+svc_tcp_data_ready(struct sock *sk, int count)
+{
+ struct svc_sock * svsk;
+
+ dprintk("svc: socket %p TCP data ready (svsk %p)\n",
+ sk, sk->user_data);
+ if (!(svsk = (struct svc_sock *)(sk->user_data)))
+ goto out;
+ set_bit(SK_DATA, &svsk->sk_flags);
+ svc_sock_enqueue(svsk);
+ out:
+ if (sk->sleep && waitqueue_active(sk->sleep))
+ wake_up_interruptible(sk->sleep);
+}
+
+/*
+ * Accept a TCP connection
+ */
+static void
+svc_tcp_accept(struct svc_sock *svsk)
+{
+ struct sockaddr_in sin;
+ struct svc_serv *serv = svsk->sk_server;
+ struct socket *sock = svsk->sk_sock;
+ struct socket *newsock;
+ struct proto_ops *ops;
+ struct svc_sock *newsvsk;
+ int err, slen;
+
+ dprintk("svc: tcp_accept %p sock %p\n", svsk, sock);
+ if (!sock)
+ return;
+
+ if (!(newsock = sock_alloc())) {
+ printk(KERN_WARNING "%s: no more sockets!\n", serv->sv_name);
+ return;
+ }
+ dprintk("svc: tcp_accept %p allocated\n", newsock);
+
+ newsock->type = sock->type;
+ newsock->ops = ops = sock->ops;
+
+ clear_bit(SK_CONN, &svsk->sk_flags);
+ if ((err = ops->accept(sock, newsock, O_NONBLOCK)) < 0) {
+ if (err != -EAGAIN && net_ratelimit())
+ printk(KERN_WARNING "%s: accept failed (err %d)!\n",
+ serv->sv_name, -err);
+ goto failed; /* aborted connection or whatever */
+ }
+ set_bit(SK_CONN, &svsk->sk_flags);
+ svc_sock_enqueue(svsk);
+
+ slen = sizeof(sin);
+ err = ops->getname(newsock, (struct sockaddr *) &sin, &slen, 1);
+ if (err < 0) {
+ if (net_ratelimit())
+ printk(KERN_WARNING "%s: peername failed (err %d)!\n",
+ serv->sv_name, -err);
+ goto failed; /* aborted connection or whatever */
+ }
+
+ /* Ideally, we would want to reject connections from unauthorized
+ * hosts here, but when we get encryption, the IP of the host won't
+ * tell us anything. For now just warn about unpriv connections.
+ */
+ if (ntohs(sin.sin_port) >= 1024) {
+ dprintk(KERN_WARNING
+ "%s: connect from unprivileged port: %u.%u.%u.%u:%d\n",
+ serv->sv_name,
+ NIPQUAD(sin.sin_addr.s_addr), ntohs(sin.sin_port));
+ }
+
+ dprintk("%s: connect from %u.%u.%u.%u:%04x\n", serv->sv_name,
+ NIPQUAD(sin.sin_addr.s_addr), ntohs(sin.sin_port));
+
+ /* make sure that a write doesn't block forever when
+ * low on memory
+ */
+ newsock->sk->sndtimeo = HZ*30;
+
+ if (!(newsvsk = svc_setup_socket(serv, newsock, &err, 0)))
+ goto failed;
+
+
+ /* make sure that we don't have too many active connections.
+ * If we have, something must be dropped.
+ * We randomly choose between newest and oldest (in terms
+ * of recent activity) and drop it.
+ */
+ if (serv->sv_tmpcnt > (serv->sv_nrthreads+3)*5) {
+ struct svc_sock *svsk = NULL;
+ spin_lock_bh(&serv->sv_lock);
+ if (!list_empty(&serv->sv_tempsocks)) {
+ if (net_random()&1)
+ svsk = list_entry(serv->sv_tempsocks.prev,
+ struct svc_sock,
+ sk_list);
+ else
+ svsk = list_entry(serv->sv_tempsocks.next,
+ struct svc_sock,
+ sk_list);
+ set_bit(SK_CLOSE, &svsk->sk_flags);
+ svsk->sk_inuse ++;
+ }
+ spin_unlock_bh(&serv->sv_lock);
+
+ if (svsk) {
+ svc_sock_enqueue(svsk);
+ svc_sock_put(svsk);
+ }
+
+ }
+
+ if (serv->sv_stats)
+ serv->sv_stats->nettcpconn++;
+
+ return;
+
+failed:
+ sock_release(newsock);
+ return;
+}
+
+/*
+ * Receive data from a TCP socket.
+ */
+static int
+svc_tcp_recvfrom(struct svc_rqst *rqstp)
+{
+ struct svc_sock *svsk = rqstp->rq_sock;
+ struct svc_serv *serv = svsk->sk_server;
+ int len;
+ struct iovec vec[RPCSVC_MAXPAGES];
+ int pnum, vlen;
+
+ dprintk("svc: tcp_recv %p data %d conn %d close %d\n",
+ svsk, test_bit(SK_DATA, &svsk->sk_flags),
+ test_bit(SK_CONN, &svsk->sk_flags),
+ test_bit(SK_CLOSE, &svsk->sk_flags));
+
+ if ((rqstp->rq_deferred = svc_deferred_dequeue(svsk)))
+ return svc_deferred_recv(rqstp);
+
+ if (test_bit(SK_CLOSE, &svsk->sk_flags)) {
+ svc_delete_socket(svsk);
+ return 0;
+ }
+
+ if (test_bit(SK_CONN, &svsk->sk_flags)) {
+ svc_tcp_accept(svsk);
+ svc_sock_received(svsk);
+ return 0;
+ }
+
+ if (test_and_clear_bit(SK_CHNGBUF, &svsk->sk_flags))
+ /* sndbuf needs to have room for one request
+ * per thread, otherwise we can stall even when the
+ * network isn't a bottleneck.
+ * rcvbuf just needs to be able to hold a few requests.
+ * Normally they will be removed from the queue
+ * as soon as a complete request arrives.
+ */
+ svc_sock_setbufsize(svsk->sk_sock,
+ (serv->sv_nrthreads+3) * serv->sv_bufsz,
+ 3 * serv->sv_bufsz);
+
+ clear_bit(SK_DATA, &svsk->sk_flags);
+
+ /* Receive data. If we haven't got the record length yet, get
+ * the next four bytes. Otherwise try to gobble up as much as
+ * possible up to the complete record length.
+ */
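+ /* Editorial note, not part of the original source: the RPC-over-TCP
+ * record marker read below is a 4-byte big-endian word whose top bit
+ * flags the final fragment and whose low 31 bits give the fragment
+ * length. For example, a marker of 0x8000001c announces a final
+ * fragment of 28 bytes; sk_reclen is decoded on exactly that basis.
+ */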
+ if (svsk->sk_tcplen < 4) {
+ unsigned long want = 4 - svsk->sk_tcplen;
+ struct iovec iov;
+
+ iov.iov_base = ((char *) &svsk->sk_reclen) + svsk->sk_tcplen;
+ iov.iov_len = want;
+ if ((len = svc_recvfrom(rqstp, &iov, 1, want)) < 0)
+ goto error;
+ svsk->sk_tcplen += len;
+
+ if (len < want) {
+ dprintk("svc: short recvfrom while reading record length (%d of %d)\n",
+ len, want);
+ svc_sock_received(svsk);
+ return -EAGAIN; /* record header not complete */
+ }
+
+ svsk->sk_reclen = ntohl(svsk->sk_reclen);
+ if (!(svsk->sk_reclen & 0x80000000)) {
+ /* FIXME: technically, a record can be fragmented,
+ * and non-terminal fragments will not have the top
+ * bit set in the fragment length header.
+ * But apparently no known nfs clients send fragmented
+ * records. */
+ printk(KERN_NOTICE "RPC: bad TCP reclen 0x%08lx (non-terminal)\n",
+ (unsigned long) svsk->sk_reclen);
+ goto err_delete;
+ }
+ svsk->sk_reclen &= 0x7fffffff;
+ dprintk("svc: TCP record, %d bytes\n", svsk->sk_reclen);
+ if (svsk->sk_reclen > serv->sv_bufsz) {
+ printk(KERN_NOTICE "RPC: bad TCP reclen 0x%08lx (large)\n",
+ (unsigned long) svsk->sk_reclen);
+ goto err_delete;
+ }
+ }
+
+ /* Check whether enough data is available */
+ len = svc_recv_available(svsk);
+ if (len < 0)
+ goto error;
+
+ if (len < svsk->sk_reclen) {
+ dprintk("svc: incomplete TCP record (%d of %d)\n",
+ len, svsk->sk_reclen);
+ svc_sock_received(svsk);
+ return -EAGAIN; /* record not complete */
+ }
+ len = svsk->sk_reclen;
+ set_bit(SK_DATA, &svsk->sk_flags);
+
+ vec[0] = rqstp->rq_arg.head[0];
+ vlen = PAGE_SIZE;
+ pnum = 1;
+ while (vlen < len) {
+ vec[pnum].iov_base = page_address(rqstp->rq_argpages[rqstp->rq_argused++]);
+ vec[pnum].iov_len = PAGE_SIZE;
+ pnum++;
+ vlen += PAGE_SIZE;
+ }
+
+ /* Now receive data */
+ len = svc_recvfrom(rqstp, vec, pnum, len);
+ if (len < 0)
+ goto error;
+
+ dprintk("svc: TCP complete record (%d bytes)\n", len);
+ rqstp->rq_arg.len = len;
+ rqstp->rq_arg.page_base = 0;
+ if (len <= rqstp->rq_arg.head[0].iov_len) {
+ rqstp->rq_arg.head[0].iov_len = len;
+ rqstp->rq_arg.page_len = 0;
+ } else {
+ rqstp->rq_arg.page_len = len - rqstp->rq_arg.head[0].iov_len;
+ }
+
+ rqstp->rq_skbuff = 0;
+ rqstp->rq_prot = IPPROTO_TCP;
+
+ /* Reset TCP read info */
+ svsk->sk_reclen = 0;
+ svsk->sk_tcplen = 0;
+
+ svc_sock_received(svsk);
+ if (serv->sv_stats)
+ serv->sv_stats->nettcpcnt++;
+
+ return len;
+
+ err_delete:
+ svc_delete_socket(svsk);
+ return -EAGAIN;
+
+ error:
+ if (len == -EAGAIN) {
+ dprintk("RPC: TCP recvfrom got EAGAIN\n");
+ svc_sock_received(svsk);
+ } else {
+ printk(KERN_NOTICE "%s: recvfrom returned errno %d\n",
+ svsk->sk_server->sv_name, -len);
+ svc_sock_received(svsk);
+ }
+
+ return len;
+}
+
+/*
+ * Send out data on TCP socket.
+ */
+static int
+svc_tcp_sendto(struct svc_rqst *rqstp)
+{
+ struct xdr_buf *xbufp = &rqstp->rq_res;
+ int sent;
+ u32 reclen;
+
+ /* Set up the first element of the reply iovec.
+ * Any other iovecs that may be in use have been taken
+ * care of by the server implementation itself.
+ */
+ reclen = htonl(0x80000000|((xbufp->len ) - 4));
+ memcpy(xbufp->head[0].iov_base, &reclen, 4);
+
+ sent = svc_sendto(rqstp, &rqstp->rq_res);
+ if (sent != xbufp->len) {
+ printk(KERN_NOTICE "rpc-srv/tcp: %s: %s %d when sending %d bytes - shutting down socket\n",
+ rqstp->rq_sock->sk_server->sv_name,
+ (sent<0)?"got error":"sent only",
+ sent, xbufp->len);
+ svc_delete_socket(rqstp->rq_sock);
+ sent = -EAGAIN;
+ }
+ return sent;
+}
+
+static void
+svc_tcp_init(struct svc_sock *svsk)
+{
+ struct sock *sk = svsk->sk_sk;
+
+ svsk->sk_recvfrom = svc_tcp_recvfrom;
+ svsk->sk_sendto = svc_tcp_sendto;
+
+ if (sk->state == TCP_LISTEN) {
+ dprintk("setting up TCP socket for listening\n");
+ sk->data_ready = svc_tcp_listen_data_ready;
+ set_bit(SK_CONN, &svsk->sk_flags);
+ } else {
+ dprintk("setting up TCP socket for reading\n");
+ sk->state_change = svc_tcp_state_change;
+ sk->data_ready = svc_tcp_data_ready;
+ sk->write_space = svc_write_space;
+
+ svsk->sk_reclen = 0;
+ svsk->sk_tcplen = 0;
+
+ /* initialise setting must have enough space to
+ * receive and respond to one request.
+ * svc_tcp_recvfrom will re-adjust if necessary
+ */
+ svc_sock_setbufsize(svsk->sk_sock,
+ 3 * svsk->sk_server->sv_bufsz,
+ 3 * svsk->sk_server->sv_bufsz);
+
+ set_bit(SK_CHNGBUF, &svsk->sk_flags);
+ set_bit(SK_DATA, &svsk->sk_flags);
+ }
+}
+
+void
+svc_sock_update_bufs(struct svc_serv *serv)
+{
+ /*
+ * The number of server threads has changed. Update
+ * rcvbuf and sndbuf accordingly on all sockets
+ */
+ struct list_head *le;
+
+ spin_lock_bh(&serv->sv_lock);
+ list_for_each(le, &serv->sv_permsocks) {
+ struct svc_sock *svsk =
+ list_entry(le, struct svc_sock, sk_list);
+ set_bit(SK_CHNGBUF, &svsk->sk_flags);
+ }
+ list_for_each(le, &serv->sv_tempsocks) {
+ struct svc_sock *svsk =
+ list_entry(le, struct svc_sock, sk_list);
+ set_bit(SK_CHNGBUF, &svsk->sk_flags);
+ }
+ spin_unlock_bh(&serv->sv_lock);
+}
+
+/*
+ * Receive the next request on any socket.
+ */
+int
+svc_recv(struct svc_serv *serv, struct svc_rqst *rqstp, long timeout)
+{
+ struct svc_sock *svsk =NULL;
+ int len;
+ int pages;
+ struct xdr_buf *arg;
+ DECLARE_WAITQUEUE(wait, current);
+
+ dprintk("svc: server %p waiting for data (to = %ld)\n",
+ rqstp, timeout);
+
+ if (rqstp->rq_sock)
+ printk(KERN_ERR
+ "svc_recv: service %p, socket not NULL!\n",
+ rqstp);
+ if (waitqueue_active(&rqstp->rq_wait))
+ printk(KERN_ERR
+ "svc_recv: service %p, wait queue active!\n",
+ rqstp);
+
+ /* Initialize the buffers */
+ /* first reclaim pages that were moved to response list */
+ svc_pushback_allpages(rqstp);
+
+ /* now allocate needed pages. If we get a failure, sleep briefly */
+ pages = 2 + (serv->sv_bufsz + PAGE_SIZE -1) / PAGE_SIZE;
+ while (rqstp->rq_arghi < pages) {
+ struct page *p = alloc_page(GFP_KERNEL);
+ if (!p) {
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ schedule_timeout(HZ/2);
+ current->state = TASK_RUNNING;
+ continue;
+ }
+ rqstp->rq_argpages[rqstp->rq_arghi++] = p;
+ }
+
+ /* Make arg->head point to first page and arg->pages point to rest */
+ arg = &rqstp->rq_arg;
+ arg->head[0].iov_base = page_address(rqstp->rq_argpages[0]);
+ arg->head[0].iov_len = PAGE_SIZE;
+ rqstp->rq_argused = 1;
+ arg->pages = rqstp->rq_argpages + 1;
+ arg->page_base = 0;
+ /* save at least one page for response */
+ arg->page_len = (pages-2)*PAGE_SIZE;
+ arg->len = (pages-1)*PAGE_SIZE;
+ arg->tail[0].iov_len = 0;
+
+ if (signalled())
+ return -EINTR;
+
+ spin_lock_bh(&serv->sv_lock);
+ if (!list_empty(&serv->sv_tempsocks)) {
+ svsk = list_entry(serv->sv_tempsocks.next,
+ struct svc_sock, sk_list);
+ /* apparently the "standard" is that clients close
+ * idle connections after 5 minutes, servers after
+ * 6 minutes
+ * http://www.connectathon.org/talks96/nfstcp.pdf
+ */
+ if (get_seconds() - svsk->sk_lastrecv < 6*60
+ || test_bit(SK_BUSY, &svsk->sk_flags))
+ svsk = NULL;
+ }
+ if (svsk) {
+ set_bit(SK_BUSY, &svsk->sk_flags);
+ set_bit(SK_CLOSE, &svsk->sk_flags);
+ rqstp->rq_sock = svsk;
+ svsk->sk_inuse++;
+ } else if ((svsk = svc_sock_dequeue(serv)) != NULL) {
+ rqstp->rq_sock = svsk;
+ svsk->sk_inuse++;
+ rqstp->rq_reserved = serv->sv_bufsz;
+ svsk->sk_reserved += rqstp->rq_reserved;
+ } else {
+ /* No data pending. Go to sleep */
+ svc_serv_enqueue(serv, rqstp);
+
+ /*
+ * We have to be able to interrupt this wait
+ * to bring down the daemons ...
+ */
+ set_current_state(TASK_INTERRUPTIBLE);
+ add_wait_queue(&rqstp->rq_wait, &wait);
+ spin_unlock_bh(&serv->sv_lock);
+
+ schedule_timeout(timeout);
+
+ spin_lock_bh(&serv->sv_lock);
+ remove_wait_queue(&rqstp->rq_wait, &wait);
+
+ if (!(svsk = rqstp->rq_sock)) {
+ svc_serv_dequeue(serv, rqstp);
+ spin_unlock_bh(&serv->sv_lock);
+ dprintk("svc: server %p, no data yet\n", rqstp);
+ return signalled()? -EINTR : -EAGAIN;
+ }
+ }
+ spin_unlock_bh(&serv->sv_lock);
+
+ dprintk("svc: server %p, socket %p, inuse=%d\n",
+ rqstp, svsk, svsk->sk_inuse);
+ len = svsk->sk_recvfrom(rqstp);
+ dprintk("svc: got len=%d\n", len);
+
+ /* No data, incomplete (TCP) read, or accept() */
+ if (len == 0 || len == -EAGAIN) {
+ svc_sock_release(rqstp);
+ return -EAGAIN;
+ }
+ svsk->sk_lastrecv = get_seconds();
+ if (test_bit(SK_TEMP, &svsk->sk_flags)) {
+ /* push active sockets to end of list */
+ spin_lock_bh(&serv->sv_lock);
+ if (!list_empty(&svsk->sk_list))
+ list_move_tail(&svsk->sk_list, &serv->sv_tempsocks);
+ spin_unlock_bh(&serv->sv_lock);
+ }
+
+ rqstp->rq_secure = ntohs(rqstp->rq_addr.sin_port) < 1024;
+ rqstp->rq_userset = 0;
+ rqstp->rq_chandle.defer = svc_defer;
+
+ if (serv->sv_stats)
+ serv->sv_stats->netcnt++;
+ return len;
+}
+
+/*
+ * Drop request
+ */
+void
+svc_drop(struct svc_rqst *rqstp)
+{
+ dprintk("svc: socket %p dropped request\n", rqstp->rq_sock);
+ svc_sock_release(rqstp);
+}
+
+/*
+ * Return reply to client.
+ */
+int
+svc_send(struct svc_rqst *rqstp)
+{
+ struct svc_sock *svsk;
+ int len;
+ struct xdr_buf *xb;
+
+ if ((svsk = rqstp->rq_sock) == NULL) {
+ printk(KERN_WARNING "NULL socket pointer in %s:%d\n",
+ __FILE__, __LINE__);
+ return -EFAULT;
+ }
+
+ /* release the receive skb before sending the reply */
+ svc_release_skb(rqstp);
+
+ /* calculate over-all length */
+ xb = & rqstp->rq_res;
+ xb->len = xb->head[0].iov_len +
+ xb->page_len +
+ xb->tail[0].iov_len;
+
+ len = svsk->sk_sendto(rqstp);
+ svc_sock_release(rqstp);
+
+ if (len == -ECONNREFUSED || len == -ENOTCONN || len == -EAGAIN)
+ return 0;
+ return len;
+}
+
+/*
+ * Initialize socket for RPC use and create svc_sock struct
+ * XXX: May want to setsockopt SO_SNDBUF and SO_RCVBUF.
+ */
+static struct svc_sock *
+svc_setup_socket(struct svc_serv *serv, struct socket *sock,
+ int *errp, int pmap_register)
+{
+ struct svc_sock *svsk;
+ struct sock *inet;
+
+ dprintk("svc: svc_setup_socket %p\n", sock);
+ if (!(svsk = kmalloc(sizeof(*svsk), GFP_KERNEL))) {
+ *errp = -ENOMEM;
+ return NULL;
+ }
+ memset(svsk, 0, sizeof(*svsk));
+
+ inet = sock->sk;
+
+ /* Register socket with portmapper */
+ if (*errp >= 0 && pmap_register)
+ *errp = svc_register(serv, inet->protocol,
+ ntohs(inet_sk(inet)->sport));
+
+ if (*errp < 0) {
+ kfree(svsk);
+ return NULL;
+ }
+
+ set_bit(SK_BUSY, &svsk->sk_flags);
+ inet->user_data = svsk;
+ svsk->sk_sock = sock;
+ svsk->sk_sk = inet;
+ svsk->sk_ostate = inet->state_change;
+ svsk->sk_odata = inet->data_ready;
+ svsk->sk_owspace = inet->write_space;
+ svsk->sk_server = serv;
+ svsk->sk_lastrecv = get_seconds();
+ INIT_LIST_HEAD(&svsk->sk_deferred);
+ INIT_LIST_HEAD(&svsk->sk_ready);
+ sema_init(&svsk->sk_sem, 1);
+
+ /* Initialize the socket */
+ if (sock->type == SOCK_DGRAM)
+ svc_udp_init(svsk);
+ else
+ svc_tcp_init(svsk);
+
+ spin_lock_bh(&serv->sv_lock);
+ if (!pmap_register) {
+ set_bit(SK_TEMP, &svsk->sk_flags);
+ list_add(&svsk->sk_list, &serv->sv_tempsocks);
+ serv->sv_tmpcnt++;
+ } else {
+ clear_bit(SK_TEMP, &svsk->sk_flags);
+ list_add(&svsk->sk_list, &serv->sv_permsocks);
+ }
+ spin_unlock_bh(&serv->sv_lock);
+
+ dprintk("svc: svc_setup_socket created %p (inet %p)\n",
+ svsk, svsk->sk_sk);
+
+ clear_bit(SK_BUSY, &svsk->sk_flags);
+ svc_sock_enqueue(svsk);
+ return svsk;
+}
+
+/*
+ * Create socket for RPC service.
+ */
+static int
+svc_create_socket(struct svc_serv *serv, int protocol, struct sockaddr_in *sin)
+{
+ struct svc_sock *svsk;
+ struct socket *sock;
+ int error;
+ int type;
+
+ dprintk("svc: svc_create_socket(%s, %d, %u.%u.%u.%u:%d)\n",
+ serv->sv_program->pg_name, protocol,
+ NIPQUAD(sin->sin_addr.s_addr),
+ ntohs(sin->sin_port));
+
+ if (protocol != IPPROTO_UDP && protocol != IPPROTO_TCP) {
+ printk(KERN_WARNING "svc: only UDP and TCP "
+ "sockets supported\n");
+ return -EINVAL;
+ }
+ type = (protocol == IPPROTO_UDP)? SOCK_DGRAM : SOCK_STREAM;
+
+ if ((error = sock_create(PF_INET, type, protocol, &sock)) < 0)
+ return error;
+
+ if (sin != NULL) {
+ sock->sk->reuse = 1; /* allow address reuse */
+ error = sock->ops->bind(sock, (struct sockaddr *) sin,
+ sizeof(*sin));
+ if (error < 0)
+ goto bummer;
+ }
+
+ if (protocol == IPPROTO_TCP) {
+ if ((error = sock->ops->listen(sock, 64)) < 0)
+ goto bummer;
+ }
+
+ if ((svsk = svc_setup_socket(serv, sock, &error, 1)) != NULL)
+ return 0;
+
+bummer:
+ dprintk("svc: svc_create_socket error = %d\n", -error);
+ sock_release(sock);
+ return error;
+}
+
+/*
+ * Remove a dead socket
+ */
+void
+svc_delete_socket(struct svc_sock *svsk)
+{
+ struct svc_serv *serv;
+ struct sock *sk;
+
+ dprintk("svc: svc_delete_socket(%p)\n", svsk);
+
+ serv = svsk->sk_server;
+ sk = svsk->sk_sk;
+
+ sk->state_change = svsk->sk_ostate;
+ sk->data_ready = svsk->sk_odata;
+ sk->write_space = svsk->sk_owspace;
+
+ spin_lock_bh(&serv->sv_lock);
+
+ list_del_init(&svsk->sk_list);
+ list_del_init(&svsk->sk_ready);
+ if (!test_and_set_bit(SK_DEAD, &svsk->sk_flags))
+ if (test_bit(SK_TEMP, &svsk->sk_flags))
+ serv->sv_tmpcnt--;
+
+ if (!svsk->sk_inuse) {
+ spin_unlock_bh(&serv->sv_lock);
+ sock_release(svsk->sk_sock);
+ kfree(svsk);
+ } else {
+ spin_unlock_bh(&serv->sv_lock);
+ dprintk(KERN_NOTICE "svc: server socket destroy delayed\n");
+ /* svsk->sk_server = NULL; */
+ }
+}
+
+/*
+ * Make a socket for nfsd and lockd
+ */
+int
+svc_makesock(struct svc_serv *serv, int protocol, unsigned short port)
+{
+ struct sockaddr_in sin;
+
+ dprintk("svc: creating socket proto = %d\n", protocol);
+ sin.sin_family = AF_INET;
+ sin.sin_addr.s_addr = INADDR_ANY;
+ sin.sin_port = htons(port);
+ return svc_create_socket(serv, protocol, &sin);
+}
+
+/*
+ * Handle defer and revisit of requests
+ */
+
+static void svc_revisit(struct cache_deferred_req *dreq, int too_many)
+{
+ struct svc_deferred_req *dr = container_of(dreq, struct svc_deferred_req, handle);
+ struct svc_serv *serv = dr->serv;
+ struct svc_sock *svsk;
+
+ if (too_many) {
+ svc_sock_put(dr->svsk);
+ kfree(dr);
+ return;
+ }
+ dprintk("revisit queued\n");
+ svsk = dr->svsk;
+ dr->svsk = NULL;
+ spin_lock(&serv->sv_lock);
+ list_add(&dr->handle.recent, &svsk->sk_deferred);
+ spin_unlock(&serv->sv_lock);
+ set_bit(SK_DEFERRED, &svsk->sk_flags);
+ svc_sock_enqueue(svsk);
+ svc_sock_put(svsk);
+}
+
+static struct cache_deferred_req *
+svc_defer(struct cache_req *req)
+{
+ struct svc_rqst *rqstp = container_of(req, struct svc_rqst, rq_chandle);
+ int size = sizeof(struct svc_deferred_req) + (rqstp->rq_arg.len);
+ struct svc_deferred_req *dr;
+
+ if (rqstp->rq_arg.page_len)
+ return NULL; /* if more than a page, give up FIXME */
+ if (rqstp->rq_deferred) {
+ dr = rqstp->rq_deferred;
+ rqstp->rq_deferred = NULL;
+ } else {
+ int skip = rqstp->rq_arg.len - rqstp->rq_arg.head[0].iov_len;
+ /* FIXME maybe discard if size too large */
+ dr = kmalloc(size, GFP_KERNEL);
+ if (dr == NULL)
+ return NULL;
+
+ dr->serv = rqstp->rq_server;
+ dr->prot = rqstp->rq_prot;
+ dr->addr = rqstp->rq_addr;
+ dr->argslen = rqstp->rq_arg.len >> 2;
+ memcpy(dr->args, rqstp->rq_arg.head[0].iov_base-skip, dr->argslen<<2);
+ }
+ spin_lock(&rqstp->rq_server->sv_lock);
+ rqstp->rq_sock->sk_inuse++;
+ dr->svsk = rqstp->rq_sock;
+ spin_unlock(&rqstp->rq_server->sv_lock);
+
+ dr->handle.revisit = svc_revisit;
+ return &dr->handle;
+}
+
+/*
+ * recv data from a deferred request into an active one
+ */
+static int svc_deferred_recv(struct svc_rqst *rqstp)
+{
+ struct svc_deferred_req *dr = rqstp->rq_deferred;
+
+ rqstp->rq_arg.head[0].iov_base = dr->args;
+ rqstp->rq_arg.head[0].iov_len = dr->argslen<<2;
+ rqstp->rq_arg.page_len = 0;
+ rqstp->rq_arg.len = dr->argslen<<2;
+ rqstp->rq_prot = dr->prot;
+ rqstp->rq_addr = dr->addr;
+ return dr->argslen<<2;
+}
+
+
+static struct svc_deferred_req *svc_deferred_dequeue(struct svc_sock *svsk)
+{
+ struct svc_deferred_req *dr = NULL;
+ struct svc_serv *serv = svsk->sk_server;
+
+ if (!test_bit(SK_DEFERRED, &svsk->sk_flags))
+ return NULL;
+ spin_lock(&serv->sv_lock);
+ clear_bit(SK_DEFERRED, &svsk->sk_flags);
+ if (!list_empty(&svsk->sk_deferred)) {
+ dr = list_entry(svsk->sk_deferred.next,
+ struct svc_deferred_req,
+ handle.recent);
+ list_del_init(&dr->handle.recent);
+ set_bit(SK_DEFERRED, &svsk->sk_flags);
+ }
+ spin_unlock(&serv->sv_lock);
+ svc_sock_received(svsk);
+ return dr;
+}
diff --git a/tests/linux/rpc_tcp_nonagle/patch b/tests/linux/rpc_tcp_nonagle/patch
new file mode 100644
index 0000000..bafda29
--- /dev/null
+++ b/tests/linux/rpc_tcp_nonagle/patch
@@ -0,0 +1,33 @@
+***************
+*** 932,937 ****
+ svc_tcp_init(struct svc_sock *svsk)
+ {
+ struct sock *sk = svsk->sk_sk;
+
+ svsk->sk_recvfrom = svc_tcp_recvfrom;
+ svsk->sk_sendto = svc_tcp_sendto;
+--- 932,938 ----
+ svc_tcp_init(struct svc_sock *svsk)
+ {
+ struct sock *sk = svsk->sk_sk;
++ struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
+
+ svsk->sk_recvfrom = svc_tcp_recvfrom;
+ svsk->sk_sendto = svc_tcp_sendto;
+***************
+*** 948,953 ****
+ svsk->sk_reclen = 0;
+ svsk->sk_tcplen = 0;
+
+ /* initialise setting must have enough space to
+ * receive and respond to one request.
+ * svc_tcp_recvfrom will re-adjust if necessary
+--- 949,956 ----
+ svsk->sk_reclen = 0;
+ svsk->sk_tcplen = 0;
+
++ tp->nonagle = 1; /* disable Nagle's algorithm */
++
+ /* initialise setting must have enough space to
+ * receive and respond to one request.
+ * svc_tcp_recvfrom will re-adjust if necessary
diff --git a/tests/simple/all-different-2/lmerge b/tests/simple/all-different-2/lmerge
new file mode 100644
index 0000000..65606f9
--- /dev/null
+++ b/tests/simple/all-different-2/lmerge
@@ -0,0 +1,34 @@
+<<<<<<<
+1
+2
+3
+4
+5
+6
+7
+8
+9
+0
+|||||||
+a
+b
+c
+d
+e
+f
+g
+h
+i
+j
+=======
+A
+B
+C
+D
+E
+F
+G
+H
+I
+J
+>>>>>>>
diff --git a/tests/simple/all-different-2/merge b/tests/simple/all-different-2/merge
new file mode 100644
index 0000000..65606f9
--- /dev/null
+++ b/tests/simple/all-different-2/merge
@@ -0,0 +1,34 @@
+<<<<<<<
+1
+2
+3
+4
+5
+6
+7
+8
+9
+0
+|||||||
+a
+b
+c
+d
+e
+f
+g
+h
+i
+j
+=======
+A
+B
+C
+D
+E
+F
+G
+H
+I
+J
+>>>>>>>
diff --git a/tests/simple/all-different-2/new b/tests/simple/all-different-2/new
new file mode 100644
index 0000000..92dfa21
--- /dev/null
+++ b/tests/simple/all-different-2/new
@@ -0,0 +1,10 @@
+a
+b
+c
+d
+e
+f
+g
+h
+i
+j
diff --git a/tests/simple/all-different-2/new2 b/tests/simple/all-different-2/new2
new file mode 100644
index 0000000..719a59f
--- /dev/null
+++ b/tests/simple/all-different-2/new2
@@ -0,0 +1,10 @@
+A
+B
+C
+D
+E
+F
+G
+H
+I
+J
diff --git a/tests/simple/all-different-2/orig b/tests/simple/all-different-2/orig
new file mode 100644
index 0000000..e53eaa1
--- /dev/null
+++ b/tests/simple/all-different-2/orig
@@ -0,0 +1,10 @@
+1
+2
+3
+4
+5
+6
+7
+8
+9
+0
diff --git a/tests/simple/all-different-2/wmerge b/tests/simple/all-different-2/wmerge
new file mode 100644
index 0000000..d84a11f
--- /dev/null
+++ b/tests/simple/all-different-2/wmerge
@@ -0,0 +1,10 @@
+<<<---1|||a===A--->>>
+<<<---2|||b===B--->>>
+<<<---3|||c===C--->>>
+<<<---4|||d===D--->>>
+<<<---5|||e===E--->>>
+<<<---6|||f===F--->>>
+<<<---7|||g===G--->>>
+<<<---8|||h===H--->>>
+<<<---9|||i===I--->>>
+<<<---0|||j===J--->>>
diff --git a/tests/simple/all-different/lmerge b/tests/simple/all-different/lmerge
new file mode 100644
index 0000000..ab83c87
--- /dev/null
+++ b/tests/simple/all-different/lmerge
@@ -0,0 +1,35 @@
+<<<<<<<
+1
+2
+3
+4
+5
+6
+7
+8
+9
+0
+|||||||
+a
+b
+c
+d
+e
+f
+g
+h
+i
+j
+=======
+A
+B
+C
+D
+E
+F
+G
+H
+I
+J
+>>>>>>>
+yes
diff --git a/tests/simple/all-different/merge b/tests/simple/all-different/merge
new file mode 100644
index 0000000..28ee454
--- /dev/null
+++ b/tests/simple/all-different/merge
@@ -0,0 +1,37 @@
+<<<<<<<
+1
+2
+3
+4
+5
+6
+7
+8
+9
+0
+yes
+|||||||
+a
+b
+c
+d
+e
+f
+g
+h
+i
+j
+yes
+=======
+A
+B
+C
+D
+E
+F
+G
+H
+I
+J
+yes
+>>>>>>>
diff --git a/tests/simple/all-different/new b/tests/simple/all-different/new
new file mode 100644
index 0000000..2e93219
--- /dev/null
+++ b/tests/simple/all-different/new
@@ -0,0 +1,11 @@
+a
+b
+c
+d
+e
+f
+g
+h
+i
+j
+yes
diff --git a/tests/simple/all-different/new2 b/tests/simple/all-different/new2
new file mode 100644
index 0000000..6186f49
--- /dev/null
+++ b/tests/simple/all-different/new2
@@ -0,0 +1,11 @@
+A
+B
+C
+D
+E
+F
+G
+H
+I
+J
+yes
diff --git a/tests/simple/all-different/orig b/tests/simple/all-different/orig
new file mode 100644
index 0000000..9db162b
--- /dev/null
+++ b/tests/simple/all-different/orig
@@ -0,0 +1,11 @@
+1
+2
+3
+4
+5
+6
+7
+8
+9
+0
+yes
diff --git a/tests/simple/all-different/wmerge b/tests/simple/all-different/wmerge
new file mode 100644
index 0000000..ac32368
--- /dev/null
+++ b/tests/simple/all-different/wmerge
@@ -0,0 +1,11 @@
+<<<---1|||a===A--->>>
+<<<---2|||b===B--->>>
+<<<---3|||c===C--->>>
+<<<---4|||d===D--->>>
+<<<---5|||e===E--->>>
+<<<---6|||f===F--->>>
+<<<---7|||g===G--->>>
+<<<---8|||h===H--->>>
+<<<---9|||i===I--->>>
+<<<---0|||j===J--->>>
+yes
diff --git a/tests/simple/already-applied/merge b/tests/simple/already-applied/merge
new file mode 100644
index 0000000..5532005
--- /dev/null
+++ b/tests/simple/already-applied/merge
@@ -0,0 +1,3 @@
+This is the
+current version of the file
+which has already had the word 'current' updated.
diff --git a/tests/simple/already-applied/new b/tests/simple/already-applied/new
new file mode 100644
index 0000000..cfd09a2
--- /dev/null
+++ b/tests/simple/already-applied/new
@@ -0,0 +1,2 @@
+This is the
+old version of the files
diff --git a/tests/simple/already-applied/new2 b/tests/simple/already-applied/new2
new file mode 100644
index 0000000..1680c7e
--- /dev/null
+++ b/tests/simple/already-applied/new2
@@ -0,0 +1,2 @@
+This is the
+current version of the file
diff --git a/tests/simple/already-applied/orig b/tests/simple/already-applied/orig
new file mode 100644
index 0000000..3702563
--- /dev/null
+++ b/tests/simple/already-applied/orig
@@ -0,0 +1,3 @@
+This is the
+current version of the files
+which has already had the word 'current' updated.
diff --git a/tests/simple/base/diff b/tests/simple/base/diff
new file mode 100644
index 0000000..ed409b8
--- /dev/null
+++ b/tests/simple/base/diff
@@ -0,0 +1,23 @@
+@@ -1,20 +1,21 @@
+-
+ This is a base file
+ some changes are going to happen to it
+ but it has
++had
+ several lines
+ so that alll
+ the changes
+ don't h...
+|I don't know <<<--waht-->>><<<++what++>>> I am saying.
+|This <<<--lion-->>><<<++line++>>> will have some changes made.
+ but this one wont
+ stuf stuf stuff
+ thing thing
+ xxxxx
+ that is all
+ except
+ for
+ this
+ last
+ bit
++x
diff --git a/tests/simple/base/ldiff b/tests/simple/base/ldiff
new file mode 100644
index 0000000..2b0ef69
--- /dev/null
+++ b/tests/simple/base/ldiff
@@ -0,0 +1,25 @@
+@@ -1,20 +1,21 @@
+-
+ This is a base file
+ some changes are going to happen to it
+ but it has
++had
+ several lines
+ so that alll
+ the changes
+ don't h...
+-I don't know waht I am saying.
+-This lion will have some changes made.
++I don't know what I am saying.
++This line will have some changes made.
+ but this one wont
+ stuf stuf stuff
+ thing thing
+ xxxxx
+ that is all
+ except
+ for
+ this
+ last
+ bit
++x
diff --git a/tests/simple/base/merge b/tests/simple/base/merge
new file mode 100644
index 0000000..fdd9823
--- /dev/null
+++ b/tests/simple/base/merge
@@ -0,0 +1,20 @@
+
+This is a base file
+some changes are going to happen to it
+but it has
+several lines
+so that alll
+the changes
+don't h...
+I don't know waht I am saying.
+This lion will have some modifications made.
+but this one wont
+stuf stuf stuff
+thing thing
+xxxxx
+that is all
+except
+for
+this
+last
+bit
diff --git a/tests/simple/base/new b/tests/simple/base/new
new file mode 100644
index 0000000..0ea0d92
--- /dev/null
+++ b/tests/simple/base/new
@@ -0,0 +1,21 @@
+This is a base file
+some changes are going to happen to it
+but it has
+had
+several lines
+so that alll
+the changes
+don't h...
+I don't know what I am saying.
+This line will have some changes made.
+but this one wont
+stuf stuf stuff
+thing thing
+xxxxx
+that is all
+except
+for
+this
+last
+bit
+x
diff --git a/tests/simple/base/new2 b/tests/simple/base/new2
new file mode 100644
index 0000000..cf8f75c
--- /dev/null
+++ b/tests/simple/base/new2
@@ -0,0 +1,21 @@
+This is a base file
+some changes are going to happen to it
+but it has
+had
+several lines
+so that alll
+the changes
+don't h...
+I don't know what I am saying.
+This line will have some modifications made.
+but this one wont
+stuf stuf stuff
+thing thing
+xxxxx
+that is all
+except
+for
+this
+last
+bit
+x
diff --git a/tests/simple/base/orig b/tests/simple/base/orig
new file mode 100644
index 0000000..46c9ab9
--- /dev/null
+++ b/tests/simple/base/orig
@@ -0,0 +1,20 @@
+
+This is a base file
+some changes are going to happen to it
+but it has
+several lines
+so that alll
+the changes
+don't h...
+I don't know waht I am saying.
+This lion will have some changes made.
+but this one wont
+stuf stuf stuff
+thing thing
+xxxxx
+that is all
+except
+for
+this
+last
+bit
diff --git a/tests/simple/bothadd/lmerge b/tests/simple/bothadd/lmerge
new file mode 100644
index 0000000..163d8fb
--- /dev/null
+++ b/tests/simple/bothadd/lmerge
@@ -0,0 +1,4 @@
+this is a
+line of text
+that was added
+to the file
diff --git a/tests/simple/bothadd/merge b/tests/simple/bothadd/merge
new file mode 100644
index 0000000..163d8fb
--- /dev/null
+++ b/tests/simple/bothadd/merge
@@ -0,0 +1,4 @@
+this is a
+line of text
+that was added
+to the file
diff --git a/tests/simple/bothadd/new b/tests/simple/bothadd/new
new file mode 100644
index 0000000..b28b04f
--- /dev/null
+++ b/tests/simple/bothadd/new
@@ -0,0 +1,3 @@
+
+
+
diff --git a/tests/simple/bothadd/new2 b/tests/simple/bothadd/new2
new file mode 100644
index 0000000..163d8fb
--- /dev/null
+++ b/tests/simple/bothadd/new2
@@ -0,0 +1,4 @@
+this is a
+line of text
+that was added
+to the file
diff --git a/tests/simple/bothadd/orig b/tests/simple/bothadd/orig
new file mode 100644
index 0000000..163d8fb
--- /dev/null
+++ b/tests/simple/bothadd/orig
@@ -0,0 +1,4 @@
+this is a
+line of text
+that was added
+to the file
diff --git a/tests/simple/brokenlines/diff b/tests/simple/brokenlines/diff
new file mode 100644
index 0000000..e04a44d
--- /dev/null
+++ b/tests/simple/brokenlines/diff
@@ -0,0 +1,7 @@
+@@ -1,5 +1,3 @@
+|This is a long line that <<<--might-->>><<<++has++>>> <<<--be -->>><<<++been
+|++>>>broken
+|and this is<<<--
+|-->>><<<++ ++>>>a broken line<<<--
+|-->>><<<++ ++>>>that <<<--might-->>><<<++will++>>> be<<<--
+|-->>><<<++ ++>>>joined
diff --git a/tests/simple/brokenlines/merge b/tests/simple/brokenlines/merge
new file mode 100644
index 0000000..ae3d3e3
--- /dev/null
+++ b/tests/simple/brokenlines/merge
@@ -0,0 +1,5 @@
+This is a longish line that might be split up
+and this is
+a broken line
+that might be
+catenated
diff --git a/tests/simple/brokenlines/new b/tests/simple/brokenlines/new
new file mode 100644
index 0000000..9ce96e0
--- /dev/null
+++ b/tests/simple/brokenlines/new
@@ -0,0 +1,3 @@
+This is a long line that has been
+broken
+and this is a broken line that will be joined
diff --git a/tests/simple/brokenlines/new2 b/tests/simple/brokenlines/new2
new file mode 100644
index 0000000..1548622
--- /dev/null
+++ b/tests/simple/brokenlines/new2
@@ -0,0 +1,3 @@
+This is a longish line that has been
+split up
+and this is a broken line that will be catenated
diff --git a/tests/simple/brokenlines/orig b/tests/simple/brokenlines/orig
new file mode 100644
index 0000000..9a2e13a
--- /dev/null
+++ b/tests/simple/brokenlines/orig
@@ -0,0 +1,5 @@
+This is a long line that might be broken
+and this is
+a broken line
+that might be
+joined
diff --git a/tests/simple/changeafteradd/merge b/tests/simple/changeafteradd/merge
new file mode 100644
index 0000000..88b2138
--- /dev/null
+++ b/tests/simple/changeafteradd/merge
@@ -0,0 +1,5 @@
+here
+is
+the
+inaugural
+file
diff --git a/tests/simple/changeafteradd/new b/tests/simple/changeafteradd/new
new file mode 100644
index 0000000..a5eefce
--- /dev/null
+++ b/tests/simple/changeafteradd/new
@@ -0,0 +1,6 @@
+here
+is
+the
+new version of the
+original
+file
diff --git a/tests/simple/changeafteradd/new2 b/tests/simple/changeafteradd/new2
new file mode 100644
index 0000000..39e2ee8
--- /dev/null
+++ b/tests/simple/changeafteradd/new2
@@ -0,0 +1,6 @@
+here
+is
+the
+new version of the
+inaugural
+file
diff --git a/tests/simple/changeafteradd/orig b/tests/simple/changeafteradd/orig
new file mode 100644
index 0000000..c37acc3
--- /dev/null
+++ b/tests/simple/changeafteradd/orig
@@ -0,0 +1,5 @@
+here
+is
+the
+original
+file
diff --git a/tests/simple/conflict/diff b/tests/simple/conflict/diff
new file mode 100644
index 0000000..8ecf042
--- /dev/null
+++ b/tests/simple/conflict/diff
@@ -0,0 +1,5 @@
+@@ -1,4 +1,4 @@
+ this is a file
+ with the word
+|<<<--two-->>><<<++to++>>> which is
+ misspelt
diff --git a/tests/simple/conflict/ldiff b/tests/simple/conflict/ldiff
new file mode 100644
index 0000000..4772aae
--- /dev/null
+++ b/tests/simple/conflict/ldiff
@@ -0,0 +1,6 @@
+@@ -1,4 +1,4 @@
+ this is a file
+ with the word
+-two which is
++to which is
+ misspelt
diff --git a/tests/simple/conflict/merge b/tests/simple/conflict/merge
new file mode 100644
index 0000000..8bbd487
--- /dev/null
+++ b/tests/simple/conflict/merge
@@ -0,0 +1,16 @@
+<<<<<<<
+this is a file
+with the word
+two which is
+misspelt
+|||||||
+this is a file
+with the word
+to which is
+misspelt
+=======
+this is a file
+with the word
+too which is
+misspelt
+>>>>>>>
diff --git a/tests/simple/conflict/new b/tests/simple/conflict/new
new file mode 100644
index 0000000..5c346ba
--- /dev/null
+++ b/tests/simple/conflict/new
@@ -0,0 +1,4 @@
+this is a file
+with the word
+to which is
+misspelt
diff --git a/tests/simple/conflict/new2 b/tests/simple/conflict/new2
new file mode 100644
index 0000000..cb8ea09
--- /dev/null
+++ b/tests/simple/conflict/new2
@@ -0,0 +1,4 @@
+this is a file
+with the word
+too which is
+misspelt
diff --git a/tests/simple/conflict/orig b/tests/simple/conflict/orig
new file mode 100644
index 0000000..bc856ca
--- /dev/null
+++ b/tests/simple/conflict/orig
@@ -0,0 +1,4 @@
+this is a file
+with the word
+two which is
+misspelt
diff --git a/tests/simple/conflict/wmerge b/tests/simple/conflict/wmerge
new file mode 100644
index 0000000..6af56bc
--- /dev/null
+++ b/tests/simple/conflict/wmerge
@@ -0,0 +1,4 @@
+this is a file
+with the word
+<<<---two|||to===too--->>> which is
+misspelt
diff --git a/tests/simple/conflictmixed/diff b/tests/simple/conflictmixed/diff
new file mode 100644
index 0000000..8ecf042
--- /dev/null
+++ b/tests/simple/conflictmixed/diff
@@ -0,0 +1,5 @@
+@@ -1,4 +1,4 @@
+ this is a file
+ with the word
+|<<<--two-->>><<<++to++>>> which is
+ misspelt
diff --git a/tests/simple/conflictmixed/ldiff b/tests/simple/conflictmixed/ldiff
new file mode 100644
index 0000000..4772aae
--- /dev/null
+++ b/tests/simple/conflictmixed/ldiff
@@ -0,0 +1,6 @@
+@@ -1,4 +1,4 @@
+ this is a file
+ with the word
+-two which is
++to which is
+ misspelt
diff --git a/tests/simple/conflictmixed/lmerge b/tests/simple/conflictmixed/lmerge
new file mode 100644
index 0000000..6d7071e
--- /dev/null
+++ b/tests/simple/conflictmixed/lmerge
@@ -0,0 +1,14 @@
+<<<<<<<
+this is a file
+with the word
+two which is
+|||||||
+this is a file
+with the word
+to which is
+=======
+this is a file
+with the word
+too which was
+>>>>>>>
+misspelt
diff --git a/tests/simple/conflictmixed/merge b/tests/simple/conflictmixed/merge
new file mode 100644
index 0000000..bb38d8a
--- /dev/null
+++ b/tests/simple/conflictmixed/merge
@@ -0,0 +1,16 @@
+<<<<<<<
+this is a file
+with the word
+two which is
+misspelt
+|||||||
+this is a file
+with the word
+to which is
+misspelt
+=======
+this is a file
+with the word
+too which was
+misspelt
+>>>>>>>
diff --git a/tests/simple/conflictmixed/new b/tests/simple/conflictmixed/new
new file mode 100644
index 0000000..5c346ba
--- /dev/null
+++ b/tests/simple/conflictmixed/new
@@ -0,0 +1,4 @@
+this is a file
+with the word
+to which is
+misspelt
diff --git a/tests/simple/conflictmixed/new2 b/tests/simple/conflictmixed/new2
new file mode 100644
index 0000000..24e7c78
--- /dev/null
+++ b/tests/simple/conflictmixed/new2
@@ -0,0 +1,4 @@
+this is a file
+with the word
+too which was
+misspelt
diff --git a/tests/simple/conflictmixed/orig b/tests/simple/conflictmixed/orig
new file mode 100644
index 0000000..bc856ca
--- /dev/null
+++ b/tests/simple/conflictmixed/orig
@@ -0,0 +1,4 @@
+this is a file
+with the word
+two which is
+misspelt
diff --git a/tests/simple/conflictmixed/wmerge b/tests/simple/conflictmixed/wmerge
new file mode 100644
index 0000000..d10fc02
--- /dev/null
+++ b/tests/simple/conflictmixed/wmerge
@@ -0,0 +1,4 @@
+this is a file
+with the word
+<<<---two|||to===too--->>> which was
+misspelt
diff --git a/tests/simple/multideletes/lmerge b/tests/simple/multideletes/lmerge
new file mode 100644
index 0000000..d1849fe
--- /dev/null
+++ b/tests/simple/multideletes/lmerge
@@ -0,0 +1,2 @@
+First line
+last line
diff --git a/tests/simple/multideletes/merge b/tests/simple/multideletes/merge
new file mode 100644
index 0000000..d1849fe
--- /dev/null
+++ b/tests/simple/multideletes/merge
@@ -0,0 +1,2 @@
+First line
+last line
diff --git a/tests/simple/multideletes/new b/tests/simple/multideletes/new
new file mode 100644
index 0000000..66ddf08
--- /dev/null
+++ b/tests/simple/multideletes/new
@@ -0,0 +1,8 @@
+Some padding
+this line will go
+Some more padding
+this one too
+This stuff is padding too
+and this
+Guess what you find here?
+last line
diff --git a/tests/simple/multideletes/new2 b/tests/simple/multideletes/new2
new file mode 100644
index 0000000..ead2f24
--- /dev/null
+++ b/tests/simple/multideletes/new2
@@ -0,0 +1,5 @@
+Some padding
+Some more padding
+This stuff is padding too
+Guess what you find here?
+last line
diff --git a/tests/simple/multideletes/orig b/tests/simple/multideletes/orig
new file mode 100644
index 0000000..084d8d8
--- /dev/null
+++ b/tests/simple/multideletes/orig
@@ -0,0 +1,5 @@
+First line
+this line will go
+this one too
+and this
+last line
diff --git a/tests/simple/multiple-add/lmerge b/tests/simple/multiple-add/lmerge
new file mode 100644
index 0000000..5827de2
--- /dev/null
+++ b/tests/simple/multiple-add/lmerge
@@ -0,0 +1,17 @@
+This
+is
+the
+current
+version
+of
+<<<<<<<
+the
+file.
+|||||||
+the
+file
+=======
+the
+file that has changed
+>>>>>>>
+
diff --git a/tests/simple/multiple-add/merge b/tests/simple/multiple-add/merge
new file mode 100644
index 0000000..312609f
--- /dev/null
+++ b/tests/simple/multiple-add/merge
@@ -0,0 +1,17 @@
+This
+is
+the
+current
+version
+of
+the
+<<<<<<<
+file.
+
+|||||||
+file
+
+=======
+file that has changed
+
+>>>>>>>
diff --git a/tests/simple/multiple-add/new b/tests/simple/multiple-add/new
new file mode 100644
index 0000000..f34b7b2
--- /dev/null
+++ b/tests/simple/multiple-add/new
@@ -0,0 +1,9 @@
+This
+is
+the
+old
+version
+of
+the
+file
+
diff --git a/tests/simple/multiple-add/new2 b/tests/simple/multiple-add/new2
new file mode 100644
index 0000000..234da11
--- /dev/null
+++ b/tests/simple/multiple-add/new2
@@ -0,0 +1,9 @@
+This
+is
+the
+old
+version
+of
+the
+file that has changed
+
diff --git a/tests/simple/multiple-add/orig b/tests/simple/multiple-add/orig
new file mode 100644
index 0000000..c6ed59c
--- /dev/null
+++ b/tests/simple/multiple-add/orig
@@ -0,0 +1,9 @@
+This
+is
+the
+current
+version
+of
+the
+file.
+
diff --git a/tests/simple/multiple-add/wmerge b/tests/simple/multiple-add/wmerge
new file mode 100644
index 0000000..27f6ce8
--- /dev/null
+++ b/tests/simple/multiple-add/wmerge
@@ -0,0 +1,9 @@
+This
+is
+the
+current
+version
+of
+the
+file<<<---.|||=== that has changed--->>>
+
diff --git a/tests/simple/show-wiggle-1/Wmerge b/tests/simple/show-wiggle-1/Wmerge
new file mode 100644
index 0000000..d5cba67
--- /dev/null
+++ b/tests/simple/show-wiggle-1/Wmerge
@@ -0,0 +1,20 @@
+<<<<<<<
+
+This is one line of the file
+
+|||||||
+
+This is 1 line of the file
+
+=======
+
+This is 1 line of the document
+
+&&&&&&&
+
+This is one line of the document
+
+>>>>>>>
+I think this is another line
+
+So is this
diff --git a/tests/simple/show-wiggle-1/new b/tests/simple/show-wiggle-1/new
new file mode 100644
index 0000000..6f588b7
--- /dev/null
+++ b/tests/simple/show-wiggle-1/new
@@ -0,0 +1,5 @@
+
+This is 1 line of the file
+
+I think this is another line
+
diff --git a/tests/simple/show-wiggle-1/new2 b/tests/simple/show-wiggle-1/new2
new file mode 100644
index 0000000..7f6b98d
--- /dev/null
+++ b/tests/simple/show-wiggle-1/new2
@@ -0,0 +1,5 @@
+
+This is 1 line of the document
+
+I think this is another line
+
diff --git a/tests/simple/show-wiggle-1/orig b/tests/simple/show-wiggle-1/orig
new file mode 100644
index 0000000..4f791aa
--- /dev/null
+++ b/tests/simple/show-wiggle-1/orig
@@ -0,0 +1,6 @@
+
+This is one line of the file
+
+I think this is another line
+
+So is this
diff --git a/tests/simple/show-wiggle-2/Wmerge b/tests/simple/show-wiggle-2/Wmerge
new file mode 100644
index 0000000..dadabc6
--- /dev/null
+++ b/tests/simple/show-wiggle-2/Wmerge
@@ -0,0 +1,13 @@
+Openning line
+
+<<<<<<<
+content line with content
+|||||||
+content line content
+=======
+middle line content
+&&&&&&&
+middle line with content
+>>>>>>>
+
+closing line
diff --git a/tests/simple/show-wiggle-2/new b/tests/simple/show-wiggle-2/new
new file mode 100644
index 0000000..c3e9e7e
--- /dev/null
+++ b/tests/simple/show-wiggle-2/new
@@ -0,0 +1,5 @@
+Openning line
+
+content line content
+
+closing line
diff --git a/tests/simple/show-wiggle-2/new2 b/tests/simple/show-wiggle-2/new2
new file mode 100644
index 0000000..ce25b4c
--- /dev/null
+++ b/tests/simple/show-wiggle-2/new2
@@ -0,0 +1,5 @@
+Openning line
+
+middle line content
+
+closing line
diff --git a/tests/simple/show-wiggle-2/orig b/tests/simple/show-wiggle-2/orig
new file mode 100644
index 0000000..c15140e
--- /dev/null
+++ b/tests/simple/show-wiggle-2/orig
@@ -0,0 +1,5 @@
+Openning line
+
+content line with content
+
+closing line
diff --git a/tests/simple/trivial-conflict/merge b/tests/simple/trivial-conflict/merge
new file mode 100644
index 0000000..4e532df
--- /dev/null
+++ b/tests/simple/trivial-conflict/merge
@@ -0,0 +1,7 @@
+<<<<<<<
+c
+|||||||
+a
+=======
+b
+>>>>>>>
diff --git a/tests/simple/trivial-conflict/orig b/tests/simple/trivial-conflict/orig
new file mode 100644
index 0000000..f2ad6c7
--- /dev/null
+++ b/tests/simple/trivial-conflict/orig
@@ -0,0 +1 @@
+c
diff --git a/tests/simple/trivial-conflict/patch b/tests/simple/trivial-conflict/patch
new file mode 100644
index 0000000..3f94cc8
--- /dev/null
+++ b/tests/simple/trivial-conflict/patch
@@ -0,0 +1,5 @@
+--- a 2009-03-05 16:33:02.000000000 +1100
++++ b 2009-03-05 16:33:04.000000000 +1100
+@@ -1 +1 @@
+-a
++b
diff --git a/vpatch.c b/vpatch.c
new file mode 100644
index 0000000..44aa898
--- /dev/null
+++ b/vpatch.c
@@ -0,0 +1,2409 @@
+/*
+ * wiggle - apply rejected patches
+ *
+ * Copyright (C) 2005 Neil Brown <neilb@cse.unsw.edu.au>
+ * Copyright (C) 2010-2011 Neil Brown <neilb@suse.de>
+ *
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Author: Neil Brown
+ * Email: <neilb@suse.de>
+ */
+
+/*
+ * vpatch - visual front end for wiggle - aka Browse mode.
+ *
+ * "files" display, lists all files with statistics
+ * - can hide various lines including subdirectories
+ * and files without wiggles or conflicts
+ * "merge" display shows various views of merged file with different
+ * parts in different colours.
+ *
+ * The window can be split horizontally to show the original and result
+ * beside the diff, and each different branch can be shown alone.
+ *
+ */
+
+#include "wiggle.h"
+#include <curses.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <signal.h>
+#include <fcntl.h>
+#include <ctype.h>
+
+static void term_init(void);
+
+/* global attributes */
+unsigned int a_delete, a_added, a_common, a_sep, a_void,
+ a_unmatched, a_extra, a_already;
+unsigned int a_has_conflicts, a_has_wiggles, a_no_wiggles;
+
+/******************************************************************
+ * Help window
+ * We display help in an insert, leaving 5 columns left and right,
+ * and 2 rows top and bottom, but at most 58x15 plus border
+ * In help mode:
+ * SPC or RTN moves down or to next page
+ * BKSPC goes backwards
+ * 'q' returns to origin screen
+ * '?' show help on help
+ * left and right scroll help view
+ *
+ * A help text is an array of lines of text
+ */
+
+char *help_help[] = {
+ " You are viewing the help page for the help viewer.",
+ "You normally get here by typing '?'",
+ "",
+ "The following keystrokes work in the help viewer:",
+ " ? display this help message",
+ " q return to previous view",
+ " SPC move forward through help document",
+ " RTN same as SPC",
+ " BKSP move backward through help document",
+ " RIGHT scroll help window so text on the right appears",
+ " LEFT scroll help window so text on the left appears",
+ NULL
+};
+
+char *help_missing[] = {
+ "The file that this patch applies to appears",
+ "to be missing.",
+ "Please type 'q' to continue",
+ NULL
+};
+
+char *help_corrupt[] = {
+ "This patch appears to be corrupt",
+ "Please type 'q' to continue",
+ NULL
+};
+
+/* We can give one or two pages to display in the help window.
+ * The first is specific to the current context. The second
+ * is optional and may provide help in a more broad context.
+ */
+static void help_window(char *page1[], char *page2[])
+{
+ int rows, cols;
+ int top, left;
+ int r, c;
+ int ch;
+ char **page = page1;
+ int line = 0;
+ int shift = 0;
+
+ getmaxyx(stdscr, rows, cols);
+
+ if (cols < 70) {
+ left = 6;
+ cols = cols-12;
+ } else {
+ left = (cols-58)/2 - 1;
+ cols = 58;
+ }
+
+ if (rows < 21) {
+ top = 3;
+ rows = rows - 6;
+ } else {
+ top = (rows-15)/2 - 1;
+ rows = 15;
+ }
+
+ /* Draw a border around the 'help' area */
+ (void)attrset(A_STANDOUT);
+ for (c = left; c < left+cols; c++) {
+ mvaddch(top-1, c, '-');
+ mvaddch(top+rows, c, '-');
+ }
+ for (r = top; r < top + rows ; r++) {
+ mvaddch(r, left-1, '|');
+ mvaddch(r, left+cols, '|');
+ }
+ mvaddch(top-1, left-1, '/');
+ mvaddch(top-1, left+cols, '\\');
+ mvaddch(top+rows, left-1, '\\');
+ mvaddch(top+rows, left+cols, '/');
+ mvaddstr(top-1, left + cols/2 - 9, "HELP - 'q' to exit");
+ mvaddstr(top+rows, left+cols/2 - 17, "Press SPACE for more, '?' for help");
+ (void)attrset(A_NORMAL);
+
+ while (1) {
+ char **lnp = page + line;
+
+ /* Draw as much of the page at the current offset
+ * as fits.
+ */
+ for (r = 0; r < rows; r++) {
+ char *ln = *lnp;
+ int sh = shift;
+ if (ln)
+ lnp++;
+ else
+ ln = "";
+
+ while (*ln && sh > 0) {
+ ln++;
+ sh--;
+ }
+ for (c = 0; c < cols; c++) {
+ int chr = *ln;
+ if (chr)
+ ln++;
+ else
+ chr = ' ';
+ mvaddch(top+r, left+c, chr);
+ }
+ }
+ move(top+rows-1, left);
+ ch = getch();
+
+ switch (ch) {
+ case 'q':
+ return;
+ case '?':
+ if (page1 != help_help)
+ help_window(help_help, NULL);
+ break;
+ case ' ':
+ case '\r': /* page-down */
+ for (r = 0; r < rows-2; r++)
+ if (page[line])
+ line++;
+ if (!page[line]) {
+ line = 0;
+ if (page == page1)
+ page = page2;
+ else
+ page = NULL;
+ if (page == NULL)
+ return;
+ }
+ break;
+
+ case '\b': /* page up */
+ if (line > 0) {
+ line -= (rows-2);
+ if (line < 0)
+ line = 0;
+ } else {
+ if (page == page2)
+ page = page1;
+ else
+ page = page2;
+ if (page == NULL)
+ page = page1;
+ line = 0;
+ }
+ break;
+
+ case KEY_LEFT:
+ if (shift > 0)
+ shift--;
+ break;
+ case KEY_RIGHT:
+ shift++;
+ break;
+
+ case KEY_UP:
+ if (line > 0)
+ line--;
+ break;
+ case KEY_DOWN:
+ if (page[line])
+ line++;
+ break;
+ }
+ }
+}
+
+static char *typenames[] = {
+ [End] = "End",
+ [Unmatched] = "Unmatched",
+ [Unchanged] = "Unchanged",
+ [Extraneous] = "Extraneous",
+ [Changed] = "Changed",
+ [Conflict] = "Conflict",
+ [AlreadyApplied] = "AlreadyApplied",
+};
+
+/* When we merge the original and the diff together we need
+ * to keep track of where everything came from.
+ * When we display the different views, we need to be able to
+ * select certain portions of the whole document.
+ * These flags are used to identify what is present, and to
+ * request different parts be extracted. They also help
+ * guide choice of colour.
+ */
+#define BEFORE 1
+#define AFTER 2
+#define ORIG 4
+#define RESULT 8
+#define CHANGED 16 /* The RESULT is different to ORIG */
+#define CHANGES 32 /* AFTER is different to BEFORE */
+#define WIGGLED 64 /* a conflict that was successfully resolved */
+#define CONFLICTED 128 /* a conflict that was not successfully resolved */
+
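+/* Editorial note, not part of the original source: the 'mode' argument
+ * tested by visible() below is a bitwise combination of these flags.
+ * Following the pass interpretations described in the next comment, a
+ * merged-result view would pass ORIG|RESULT, a patch-oriented view
+ * BEFORE|AFTER, and the conflict pass ORIG|AFTER.
+ */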
+/* Displaying a Merge.
+ * The first step is to linearise the merge. The merge is inherently
+ * parallel with before/after streams. However, much of the whole document
+ * is linear, as normally much of the original is unchanged.
+ * All parallelism comes from the patch. This normally produces two
+ * parallel streams, but in the case of a conflict can produce three.
+ * For browsing the merge we only ever show two alternates in-line.
+ * When there are three we use two panes with 1 or 2 alternates in each.
+ * So to linearise the two streams we find lines that are completely
+ * unchanged (same for all 3 streams, or missing in 2nd and 3rd) which bound
+ * a region where there are changes. We include everything between
+ * these twice, in two separate passes. The exact interpretation of the
+ * passes is handled at a higher level but will be one of:
+ * original and result
+ * before and after
+ * original and after (for a conflict)
+ * This is all encoded in the 'struct merge'. An array of these describes
+ * the whole document.
+ *
+ * At any position in the merge we can be in one of 3 states:
+ * 0: unchanged section
+ * 1: first pass
+ * 2: second pass
+ *
+ * So to walk a merge in display order we need a position in the merge,
+ * a current state, and when in a changed section, we need to know the
+ * bounds of that changed section.
+ * This is all encoded in 'struct mpos'.
+ *
+ * Each location may or may not be visible depending on certain
+ * display options.
+ *
+ * Also, some locations might be 'invalid' in that they don't need to be displayed.
+ * For example when the patch leaves a section of the original unchanged,
+ * we only need to see the original - the before/after sections are treated
+ * as invalid and are not displayed.
+ * The visibility of newlines is crucial and guides the display. One line
+ * of displayed text is all the visible sections between two visible newlines.
+ *
+ * Counting lines is a bit tricky. We only worry about line numbers in the
+ * original (stream 0) as these could compare with line numbers mentioned in
+ * patch chunks.
+ * We count 2 for every line: 1 for everything before the newline and 1 for the newline.
+ * That way we don't get a full counted line until we see the first char after the
+ * newline, so '+' lines are counted with the previous line.
+ *
+ */
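+/* Editorial illustration, not part of the original source: with the
+ * counting scheme described above, walking an original line "a" followed
+ * by a line "b" proceeds as: the element ending line "a" sets the lsb
+ * (lineno becomes 1), the first element of line "b" bumps it to 2, and
+ * the element ending line "b" makes it 3.  A line is therefore only
+ * fully counted once text beyond its newline has been seen, which is
+ * why '+' lines added by a patch count with the preceding line.
+ */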
+struct mp {
+ int m; /* merger index */
+ int s; /* stream 0,1,2 for a,b,c */
+ int o; /* offset in that stream */
+ int lineno; /* Counts newlines in stream 0
+ * set lsb when see newline.
+ * add one when not newline and lsb set
+ */
+};
+struct mpos {
+ struct mp p, /* the current point (end of a line) */
+ lo, /* eol for start of the current group */
+ hi; /* eol for end of the current group */
+ int state; /*
+ * 0 if on an unchanged (lo/hi not meaningful)
+ * 1 if on the '-' of a diff,
+ * 2 if on the '+' of a diff
+ */
+};
+
+struct cursor {
+ struct mp pos; /* where in the document we are (an element) */
+ int offset; /* which char in that element */
+ int target; /* display column - or -1 if we are looking for 'pos' */
+ int col; /* where we found pos or target */
+ int width; /* Size of char, for moving to the right */
+ int alt; /* Cursor is in alternate window */
+};
+
+/* used for checking location during search */
+static int same_mp(struct mp a, struct mp b)
+{
+ return a.m == b.m &&
+ a.s == b.s &&
+ a.o == b.o;
+}
+static int same_mpos(struct mpos a, struct mpos b)
+{
+ return same_mp(a.p, b.p) &&
+ (a.state == b.state || a.state == 0 || b.state == 0);
+}
+
+/* Check if a particular stream is meaningful in a particular merge
+ * section. e.g. in an Unchanged section, only stream 0, the
+ * original, is meaningful. This is used to avoid walking down
+ * pointless paths.
+ */
+static int stream_valid(int s, enum mergetype type)
+{
+ switch (type) {
+ case End:
+ return 1;
+ case Unmatched:
+ return s == 0;
+ case Unchanged:
+ return s == 0;
+ case Extraneous:
+ return s == 2;
+ case Changed:
+ return s != 1;
+ case Conflict:
+ return 1;
+ case AlreadyApplied:
+ return 1;
+ }
+ return 0;
+}
+
+/*
+ * Advance the 'pos' in the current mergepos returning the next
+ * element (word).
+ * This walks the merges in sequence, and the streams within
+ * each merge.
+ */
+static struct elmnt next_melmnt(struct mp *pos,
+ struct file fm, struct file fb, struct file fa,
+ struct merge *m)
+{
+ pos->o++;
+ while (1) {
+ int l = 0; /* Length remaining in current merge section */
+ if (pos->m >= 0)
+ switch (pos->s) {
+ case 0:
+ l = m[pos->m].al;
+ break;
+ case 1:
+ l = m[pos->m].bl;
+ break;
+ case 2:
+ l = m[pos->m].cl;
+ break;
+ }
+ if (pos->o >= l) {
+ /* Offset has reached length, choose new stream or
+ * new merge */
+ pos->o = 0;
+ do {
+ pos->s++;
+ if (pos->s > 2) {
+ pos->s = 0;
+ pos->m++;
+ }
+ } while (!stream_valid(pos->s, m[pos->m].type));
+ } else
+ break;
+ }
+ if (pos->m == -1 || m[pos->m].type == End) {
+ struct elmnt e;
+ e.start = NULL; e.len = 0;
+ return e;
+ }
+ switch (pos->s) {
+ default: /* keep compiler happy */
+ case 0:
+ if (pos->lineno & 1)
+ pos->lineno++;
+ if (ends_mline(fm.list[m[pos->m].a + pos->o]))
+ pos->lineno++;
+ return fm.list[m[pos->m].a + pos->o];
+ case 1: return fb.list[m[pos->m].b + pos->o];
+ case 2: return fa.list[m[pos->m].c + pos->o];
+ }
+}
+
+/* step current position.p backwards */
+static struct elmnt prev_melmnt(struct mp *pos,
+ struct file fm, struct file fb, struct file fa,
+ struct merge *m)
+{
+ if (pos->s == 0) {
+ if (ends_mline(fm.list[m[pos->m].a + pos->o]))
+ pos->lineno--;
+ if (pos->lineno & 1)
+ pos->lineno--;
+ }
+
+ pos->o--;
+ while (pos->m >= 0 && pos->o < 0) {
+ do {
+ pos->s--;
+ if (pos->s < 0) {
+ pos->s = 2;
+ pos->m--;
+ }
+ } while (pos->m >= 0 &&
+ !stream_valid(pos->s, m[pos->m].type));
+ if (pos->m >= 0) {
+ switch (pos->s) {
+ case 0:
+ pos->o = m[pos->m].al-1;
+ break;
+ case 1:
+ pos->o = m[pos->m].bl-1;
+ break;
+ case 2:
+ pos->o = m[pos->m].cl-1;
+ break;
+ }
+ }
+ }
+ if (pos->m < 0) {
+ struct elmnt e;
+ e.start = NULL; e.len = 0;
+ return e;
+ }
+ switch (pos->s) {
+ default: /* keep compiler happy */
+ case 0: return fm.list[m[pos->m].a + pos->o];
+ case 1: return fb.list[m[pos->m].b + pos->o];
+ case 2: return fa.list[m[pos->m].c + pos->o];
+ }
+}
+
+/* 'visible' not only checks if this stream in this merge should be
+ * visible in this mode, but also chooses which colour/highlight to use
+ * to display it.
+ */
+static int visible(int mode, enum mergetype type, int stream)
+{
+ if (mode == 0)
+ return -1;
+ /* mode can be any combination of ORIG RESULT BEFORE AFTER */
+ switch (type) {
+ case End: /* The END is always visible */
+ return A_NORMAL;
+ case Unmatched: /* Visible in ORIG and RESULT */
+ if (mode & (ORIG|RESULT))
+ return a_unmatched;
+ break;
+ case Unchanged: /* visible everywhere, but only show stream 0 */
+ if (stream == 0)
+ return a_common;
+ break;
+ case Extraneous: /* stream 2 is visible in BEFORE and AFTER */
+ if ((mode & (BEFORE|AFTER))
+ && stream == 2)
+ return a_extra;
+ break;
+ case Changed: /* stream zero visible ORIG and BEFORE, stream 2 elsewhere */
+ if (stream == 0 &&
+ (mode & (ORIG|BEFORE)))
+ return a_delete;
+ if (stream == 2 &&
+ (mode & (RESULT|AFTER)))
+ return a_added;
+ break;
+ case Conflict:
+ switch (stream) {
+ case 0:
+ if (mode & ORIG)
+ return a_unmatched | A_REVERSE;
+ break;
+ case 1:
+ if (mode & BEFORE)
+ return a_extra | A_UNDERLINE;
+ break;
+ case 2:
+ if (mode & (AFTER|RESULT))
+ return a_added | A_UNDERLINE;
+ break;
+ }
+ break;
+ case AlreadyApplied:
+ switch (stream) {
+ case 0:
+ if (mode & (ORIG|RESULT))
+ return a_already;
+ break;
+ case 1:
+ if (mode & BEFORE)
+ return a_delete | A_UNDERLINE;
+ break;
+ case 2:
+ if (mode & AFTER)
+ return a_added | A_UNDERLINE;
+ break;
+ }
+ break;
+ }
+ return -1;
+}
+
+/* check_line creates a summary of the sort of changes that
+ * are in a line, returning an "or" of
+ * CHANGED
+ * CHANGES
+ * WIGGLED
+ * CONFLICTED
+ */
+static int check_line(struct mpos pos, struct file fm, struct file fb,
+ struct file fa,
+ struct merge *m, int mode)
+{
+ int rv = 0;
+ struct elmnt e;
+ int unmatched = 0;
+
+ do {
+ if (m[pos.p.m].type == Changed)
+ rv |= CHANGED | CHANGES;
+ else if ((m[pos.p.m].type == AlreadyApplied ||
+ m[pos.p.m].type == Conflict))
+ rv |= CONFLICTED | CHANGES;
+ else if (m[pos.p.m].type == Extraneous &&
+ /* hunk headers don't count as wiggles */
+ fb.list[m[pos.p.m].b].start[0] != '\0')
+ rv |= WIGGLED;
+ else if (m[pos.p.m].type == Unmatched)
+ unmatched = 1;
+ if (m[pos.p.m].in_conflict)
+ rv |= CONFLICTED | CHANGES;
+ e = prev_melmnt(&pos.p, fm, fb, fa, m);
+ } while (e.start != NULL &&
+ (!ends_mline(e)
+ || visible(mode, m[pos.p.m].type, pos.p.s) == -1));
+
+ if (unmatched && (rv & CHANGES))
+ rv |= WIGGLED;
+ return rv;
+}
+
+/* Find the next line in the merge which is visible.
+ * If we hit the end of a conflicted set during pass-1
+ * we rewind for pass-2.
+ * 'mode' tells which bits we want to see, possibly one of
+ * the 4 parts (before/after/orig/result) or one of the pairs
+ * before+after or orig+result.
+ */
+static void next_mline(struct mpos *pos, struct file fm, struct file fb,
+ struct file fa,
+ struct merge *m, int mode)
+{
+ int mask;
+ do {
+ struct mp prv;
+ int mode2;
+
+ prv = pos->p;
+ while (1) {
+ struct elmnt e = next_melmnt(&pos->p, fm, fb, fa, m);
+ if (e.start == NULL)
+ break;
+ if (ends_mline(e) &&
+ visible(mode, m[pos->p.m].type, pos->p.s) >= 0)
+ break;
+ }
+ mode2 = check_line(*pos, fm, fb, fa, m, mode);
+
+ if ((mode2 & CHANGES) && pos->state == 0) {
+ /* Just entered a diff-set */
+ pos->lo = pos->p;
+ pos->state = 1;
+ } else if (!(mode2 & CHANGES) && pos->state) {
+ /* Come to the end of a diff-set */
+ switch (pos->state) {
+ case 1:
+ /* Need to record the end */
+ pos->hi = prv;
+ /* time for another pass */
+ pos->p = pos->lo;
+ pos->state++;
+ break;
+ case 2:
+ /* finished final pass */
+ pos->state = 0;
+ break;
+ }
+ }
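+ /* During pass 1 (state 1) hide the RESULT/AFTER streams, and during
+  * pass 2 (state 2) hide ORIG/BEFORE, so each diff-set is walked
+  * twice: first showing its '-' lines, then its '+' lines.
+  */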
+ mask = ORIG|RESULT|BEFORE|AFTER|CHANGES|CHANGED;
+ switch (pos->state) {
+ case 1:
+ mask &= ~(RESULT|AFTER);
+ break;
+ case 2:
+ mask &= ~(ORIG|BEFORE);
+ break;
+ }
+ } while (visible(mode&mask, m[pos->p.m].type, pos->p.s) < 0);
+
+}
+
+/* Move to previous line - simply the reverse of next_mline */
+static void prev_mline(struct mpos *pos, struct file fm, struct file fb,
+ struct file fa,
+ struct merge *m, int mode)
+{
+ int mask;
+ do {
+ struct mp prv;
+ int mode2;
+
+ prv = pos->p;
+ if (pos->p.m < 0)
+ return;
+ while (1) {
+ struct elmnt e = prev_melmnt(&pos->p, fm, fb, fa, m);
+ if (e.start == NULL)
+ break;
+ if (ends_mline(e) &&
+ visible(mode, m[pos->p.m].type, pos->p.s) >= 0)
+ break;
+ }
+ mode2 = check_line(*pos, fm, fb, fa, m, mode);
+
+ if ((mode2 & CHANGES) && pos->state == 0) {
+ /* Just entered a diff-set */
+ pos->hi = pos->p;
+ pos->state = 2;
+ } else if (!(mode2 & CHANGES) && pos->state) {
+ /* Come to the end (start) of a diff-set */
+ switch (pos->state) {
+ case 1:
+ /* finished final pass */
+ pos->state = 0;
+ break;
+ case 2:
+ /* Need to record the start */
+ pos->lo = prv;
+ /* time for another pass */
+ pos->p = pos->hi;
+ pos->state--;
+ break;
+ }
+ }
+ mask = ORIG|RESULT|BEFORE|AFTER|CHANGES|CHANGED;
+ switch (pos->state) {
+ case 1:
+ mask &= ~(RESULT|AFTER);
+ break;
+ case 2:
+ mask &= ~(ORIG|BEFORE);
+ break;
+ }
+ } while (visible(mode&mask, m[pos->p.m].type, pos->p.s) < 0);
+}
+
+/* blank a whole row of display */
+static void blank(int row, int start, int cols, unsigned int attr)
+{
+ (void)attrset(attr);
+ move(row, start);
+ while (cols-- > 0)
+ addch(' ');
+}
+
+/* Search for a string on one display line. If found, update the
+ * cursor.
+ */
+
+static int mcontains(struct mpos pos,
+ struct file fm, struct file fb, struct file fa,
+ struct merge *m,
+ int mode, char *search, struct cursor *curs,
+ int dir, int ignore_case)
+{
+ /* See if any of the files, between start of this line and here,
+ * contain the search string.
+ * However this is modified by dir:
+ * -2: find last match *before* curs
+ * -1: find last match at-or-before curs
+ * 1: find first match at-or-after curs
+ * 2: find first match *after* curs
+ *
+ * We only test for equality with curs, so if it is on a different
+ * line it will not be found and everything is before/after.
+ * As we search from end-of-line to start we find the last
+ * match first.
+ * For a forward search, we stop when we find curs.
+ * For a backward search, we forget anything found when we find curs.
+ */
+ struct elmnt e;
+ int found = 0;
+ struct mp mp;
+ int o;
+ int len = strlen(search);
+
+ do {
+ e = prev_melmnt(&pos.p, fm, fb, fa, m);
+ if (e.start && e.start[0]) {
+ int i;
+ int curs_i;
+ if (same_mp(pos.p, curs->pos))
+ curs_i = curs->offset;
+ else
+ curs_i = -1;
+ for (i = e.len-1; i >= 0; i--) {
+ if (i == curs_i && dir == -1)
+ /* next match is the one we want */
+ found = 0;
+ if (i == curs_i && dir == 2)
+ /* future matches not accepted */
+ goto break_while;
+ if ((!found || dir > 0) &&
+ (ignore_case ? strncasecmp : strncmp)
+ (e.start+i, search, len) == 0) {
+ mp = pos.p;
+ o = i;
+ found = 1;
+ }
+ if (i == curs_i && dir == -2)
+ /* next match is the one we want */
+ found = 0;
+ if (i == curs_i && dir == 1)
+ /* future matches not accepted */
+ goto break_while;
+ }
+ }
+ } while (e.start != NULL &&
+ (!ends_mline(e)
+ || visible(mode, m[pos.p.m].type, pos.p.s) == -1));
+break_while:
+ if (found) {
+ curs->pos = mp;
+ curs->offset = o;
+ }
+ return found;
+}
+
+/* Drawing the display window.
+ * There are 7 different ways we can display the data, each
+ * of which can be configured by a keystroke:
+ * o original - just show the original file with no changes, but still
+ * with highlights of what is changed or unmatched
+ * r result - show just the result of the merge. Conflicts just show
+ * the original, not the before/after options
+ * b before - show the 'before' stream of the patch
+ * a after - show the 'after' stream of the patch
+ * d diff - show just the patch, both before and after
+ * m merge - show the full merge with -+ sections for changes.
+ * If point is in a wiggled or conflicted section the
+ * window is split horizontally and the diff is shown
+ * in the bottom window
+ * | sidebyside - two panes, left and right. Left holds the merge,
+ * right holds the diff. In the case of a conflict,
+ * left holds orig/after, right holds before/after
+ *
+ * The horizontal split for 'merge' mode is managed as follows.
+ * - The window is split when we first visit a line that contains
+ * a wiggle or a conflict, and the second pane is removed when
+ * we next visit a line that contains no changes (is fully Unchanged).
+ * - to display the second pane, we find a visible end-of-line in the
+ * (BEFORE|AFTER) mode at-or-before the current end-of-line and
+ * then we centre that line.
+ * - We need to rewind to an unchanged section, and wind forward again
+ * to make sure that 'lo' and 'hi' are set properly.
+ * - every time we move, we redraw the second pane (see how that goes).
+ */
+
+/* draw_mside draws one text line or, in the case of sidebyside, one side
+ * of a textline.
+ * The 'mode' tells us what to draw via the 'visible()' function.
+ * It is one of ORIG RESULT BEFORE AFTER or ORIG|RESULT or BEFORE|AFTER
+ * It may also have WIGGLED or CONFLICTED ored in to trigger extra highlights.
+ * The desired cursor position is given in curs->target; the actual
+ * cursor column (allowing e.g. for tabs) is returned in curs->col.
+ */
+static void draw_mside(int mode, int row, int offset, int start, int cols,
+ struct file fm, struct file fb, struct file fa,
+ struct merge *m,
+ struct mpos pos,
+ struct cursor *curs)
+{
+ struct elmnt e;
+ int col = 0;
+ char tag;
+ unsigned int tag_attr;
+
+ switch (pos.state) {
+ case 0: /* unchanged line */
+ tag = ' ';
+ tag_attr = A_NORMAL;
+ break;
+ case 1: /* 'before' text */
+ tag = '-';
+ tag_attr = a_delete;
+ if ((mode & ORIG) && (mode & CONFLICTED)) {
+ tag = '|';
+ tag_attr = a_delete;
+ }
+ mode &= (ORIG|BEFORE);
+ break;
+ case 2: /* the 'after' part */
+ tag = '+';
+ tag_attr = a_added;
+ mode &= (AFTER|RESULT);
+ break;
+ }
+
+ if (visible(mode, m[pos.p.m].type, pos.p.s) < 0) {
+ /* Not visible, just draw a blank */
+ blank(row, offset, cols, a_void);
+ if (curs) {
+ curs->width = -1;
+ curs->col = 0;
+ curs->pos = pos.p;
+ curs->offset = 0;
+ }
+ return;
+ }
+
+ (void)attrset(tag_attr);
+ mvaddch(row, offset, tag);
+ offset++;
+ cols--;
+ (void)attrset(A_NORMAL);
+
+ /* find previous visible newline, or start of file */
+ do
+ e = prev_melmnt(&pos.p, fm, fb, fa, m);
+ while (e.start != NULL &&
+ (!ends_mline(e) ||
+ visible(mode, m[pos.p.m].type, pos.p.s) == -1));
+
+ while (1) {
+ unsigned char *c;
+ int l;
+ e = next_melmnt(&pos.p, fm, fb, fa, m);
+ if (e.start == NULL ||
+ (ends_mline(e)
+ && visible(mode, m[pos.p.m].type, pos.p.s) != -1)) {
+ /* We have reached the end of visible line, or end of file */
+ if (curs) {
+ curs->col = col;
+ if (col >= start + cols)
+ curs->width = 0;
+ else
+ curs->width = -1; /* end of line */
+ if (curs->target >= 0) {
+ curs->pos = pos.p;
+ curs->offset = 0;
+ } else if (same_mp(pos.p, curs->pos))
+ curs->target = col;
+ }
+ if (col < start)
+ col = start;
+ if (e.start && e.start[0] == 0) {
+ char b[40];
+ struct elmnt e1;
+ if (pos.p.s == 2 && m[pos.p.m].type == Extraneous) {
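+ /* A NUL-prefixed element in an Extraneous section is a stored
+  * hunk header: the matching stream-1 element carries the
+  * 'before' numbers and this stream-2 element the 'after' ones.
+  * Recombine the second and third number of each into a unified
+  * "@@ -start,len +start,len @@" header for display.
+  */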
+ int A, B, C, D, E, F;
+ e1 = fb.list[m[pos.p.m].b + pos.p.o];
+ sscanf(e1.start+1, "%d %d %d", &A, &B, &C);
+ sscanf(e.start+1, "%d %d %d", &D, &E, &F);
+ sprintf(b, "@@ -%d,%d +%d,%d @@\n", B, C, E, F);
+ (void)attrset(a_sep);
+ } else {
+ (void)attrset(visible(mode, m[pos.p.m].type, pos.p.s));
+ sprintf(b, "<%.17s>", e.start+1);
+ }
+ mvaddstr(row, col-start+offset, b);
+ col += strlen(b);
+ }
+ blank(row, col-start+offset, start+cols-col,
+ e.start
+ ? (unsigned)visible(mode, m[pos.p.m].type, pos.p.s)
+ : A_NORMAL);
+ return;
+ }
+ if (visible(mode, m[pos.p.m].type, pos.p.s) == -1)
+ continue;
+ if (e.start[0] == 0)
+ continue;
+ (void)attrset(visible(mode, m[pos.p.m].type, pos.p.s));
+ c = (unsigned char *)e.start;
+ for (l = 0; l < e.len; l++) {
+ int scol = col;
+ if (*c >= ' ' && *c != 0x7f) {
+ if (col >= start && col < start+cols)
+ mvaddch(row, col-start+offset, *c);
+ col++;
+ } else if (*c == '\t') {
+ do {
+ if (col >= start && col < start+cols) {
+ mvaddch(row, col-start+offset, ' ');
+ } col++;
+ } while ((col&7) != 0);
+ } else {
+ if (col >= start && col < start+cols)
+ mvaddch(row, col-start+offset, '?');
+ col++;
+ }
+ if (curs) {
+ if (curs->target >= 0) {
+ if (curs->target < col) {
+ /* Found target column */
+ curs->pos = pos.p;
+ curs->offset = l;
+ curs->col = scol;
+ if (scol >= start + cols)
+ /* Didn't appear on screen */
+ curs->width = 0;
+ else
+ curs->width = col - scol;
+ curs = NULL;
+ }
+ } else if (l == curs->offset &&
+ same_mp(pos.p, curs->pos)) {
+ /* Found the pos */
+ curs->target = scol;
+ curs->col = scol;
+ if (scol >= start + cols)
+ /* Didn't appear on screen */
+ curs->width = 0;
+ else
+ curs->width = col - scol;
+ curs = NULL;
+ }
+ }
+ c++;
+ }
+ }
+}
+
+/* Draw either 1 or 2 sides depending on the mode. */
+
+static void draw_mline(int mode, int row, int start, int cols,
+ struct file fm, struct file fb, struct file fa,
+ struct merge *m,
+ struct mpos pos,
+ struct cursor *curs)
+{
+ /*
+ * Draw the left and right images of this line
+ * One side might be blank depending on the
+ * visibility of this newline
+ */
+ int lcols, rcols;
+
+ mode |= check_line(pos, fm, fb, fa, m, mode);
+
+ if ((mode & (BEFORE|AFTER)) &&
+ (mode & (ORIG|RESULT))) {
+
+ lcols = (cols-1)/2;
+ rcols = cols - lcols - 1;
+
+ (void)attrset(A_STANDOUT);
+ mvaddch(row, lcols, '|');
+
+ draw_mside(mode&~(BEFORE|AFTER), row, 0, start, lcols,
+ fm, fb, fa, m, pos, curs && !curs->alt ? curs : NULL);
+
+ draw_mside(mode&~(ORIG|RESULT), row, lcols+1, start, rcols,
+ fm, fb, fa, m, pos, curs && curs->alt ? curs : NULL);
+ } else
+ draw_mside(mode, row, 0, start, cols,
+ fm, fb, fa, m, pos, curs);
+}
+
+static char *merge_help[] = {
+ "This view shows the merge of the patch with the",
+ "original file. It is like a full-context diff showing",
+ "removed lines with a '-' prefix and added lines with a",
+ "'+' prefix.",
+ "In cases where a patch chunk could not be successfully",
+ "applied, the original text is prefixed with a '|', and",
+ "the text that the patch wanted to add is prefixed with",
+ "a '+'.",
+ "When the cursor is over such a conflict, or over a chunk",
+ "which required wiggling to apply (i.e. there was unmatched",
+ "text in the original, or extraneous unchanged text in",
+ "the patch), the terminal is split and the bottom pane is",
+ "use to display the part of the patch that applied to",
+ "this section of the original. This allows you to confirm",
+ "that a wiggled patch applied correctly, and to see",
+ "why there was a conflict",
+ NULL
+};
+static char *diff_help[] = {
+ "This is the 'diff' or 'patch' view. It shows",
+ "only the patch that is being applied without the",
+ "original to which it is being applied.",
+ "Underlined text indicates parts of the patch which",
+ "resulted in a conflict when applied to the",
+ "original.",
+ NULL
+};
+static char *orig_help[] = {
+ "This is the 'original' view which simply shows",
+ "the original file before applying the patch.",
+ "Sections of code that would be changed by the patch",
+ "are highlighted in red.",
+ NULL
+};
+static char *result_help[] = {
+ "This is the 'result' view which shows just the",
+ "result of applying the patch. When a conflict",
+ "occurred this view does not show the full conflict",
+ "but only the 'after' part of the patch. To see",
+ "the full conflict, use the 'merge' or 'sidebyside'",
+ "views.",
+ NULL
+};
+static char *before_help[] = {
+ "This view shows the 'before' section of a patch.",
+ "It allows the expected match text to be seen uncluttered",
+ "by text that is meant to replaced it."
+ "Red text is text that will be removed by the patch",
+ NULL
+};
+static char *after_help[] = {
+ "This view shows the 'after' section of a patch.",
+ "It allows the intended result to be seen uncluttered",
+ "by text that was meant to be matched and replaced."
+ "Green text is text that was added by the patch - it",
+ "was not present in the 'before' part of the patch",
+ NULL
+};
+static char *sidebyside_help[] = {
+ "This is the Side By Side view of a patched file.",
+ "The left side shows the original and the result.",
+ "The right side shows the patch which was applied",
+ "and lines up with the original/result as much as",
+ "possible.",
+ "",
+ "Where one side has no line which matches the",
+ "other side it is displayed as a solid colour in the",
+ "yellow family (depending on your terminal window).",
+ NULL
+};
+static char *merge_window_help[] = {
+ " Highlight Colours and Keystroke commands",
+ "",
+ "In all different views of a merge, highlight colours",
+ "are used to show which parts of lines were added,",
+ "removed, already changed, unchanged or in conflict.",
+ "Colours and their use are:",
+ " normal unchanged text",
+ " red text that was removed or changed",
+ " green text that was added or the result",
+ " of a change",
+ " yellow background used in side-by-side for a line",
+ " which has no match on the other",
+ " side",
+ " blue text in the original which did not",
+ " match anything in the patch",
+ " cyan text in the patch which did not",
+ " match anything in the original",
+ " cyan background already changed text: the result",
+ " of the patch matches the original",
+ " underline remove or added text can also be",
+ " underlined indicating that it",
+ " was involved in a conflict",
+ ""
+ "While viewing a merge various keystroke commands can",
+ "be used to move around and change the view. Basic",
+ "movement commands from both 'vi' and 'emacs' are",
+ "available:",
+ "",
+ " p control-p k UP Move to previous line",
+ " n control-n j DOWN Move to next line",
+ " l LEFT Move one char to right",
+ " h RIGHT Move one char to left",
+ " / control-s Enter incremental search mode",
+ " control-r Enter reverse-search mode",
+ " control-g Search again",
+ " ? Display help message",
+ " ESC-< 0-G Go to start of file",
+ " ESC-> G Go to end of file",
+ " q Return to list of files or exit",
+ " control-L recenter current line",
+ " control-V page down",
+ " ESC-v page up",
+ " N go to next patch chunk",
+ " P go to previous patch chunk",
+ " O move cursor to alternate pane",
+ " ^ control-A go to start of line",
+ " $ control-E go to end of line",
+ "",
+ " a display 'after' view",
+ " b display 'before' view",
+ " o display 'original' view",
+ " r display 'result' view",
+ " d display 'diff' or 'patch' view",
+ " m display 'merge' view",
+ " | display side-by-side view",
+ NULL
+};
+
+static void merge_window(struct plist *p, FILE *f, int reverse)
+{
+ /* Display the merge window in one of the selectable modes,
+ * starting with the 'merge' mode.
+ *
+ * Newlines are the key to display.
+ * 'pos' is always a visible newline (or eof).
+ * In sidebyside mode it might only be visible on one side,
+ * in which case the other side will be blank.
+ * Where the newline is visible, we rewind to the previous visible
+ * newline and display the stuff in between.
+ *
+ * A 'position' is a struct mpos
+ */
+
+ struct stream sm, sb, sa, sp; /* main, before, after, patch */
+ struct file fm, fb, fa;
+ struct csl *csl1, *csl2;
+ struct ci ci;
+ int ch; /* count of chunks */
+ /* Always refresh the current line.
+ * If refresh == 1, refresh all lines; if == 2, clear the screen first;
+ * if == 3, the display mode has just changed, so re-validate the
+ * position before doing a full refresh.
+ */
+ int refresh = 2;
+ int rows = 0, cols = 0;
+ int splitrow = -1; /* screen row for split - diff appears below */
+ int lastrow = 0; /* end of screen, or just above 'splitrow' */
+ int i, c, cswitch;
+ int mode = ORIG|RESULT;
+ int mmode = mode; /* Mode for moving - used when in 'other' pane */
+ char *modename = "merge";
+ char **modehelp = merge_help;
+
+ int row, start = 0;
+ int trow; /* screen-row while searching. If we cannot find,
+ * we forget this number */
+ struct cursor curs;
+ struct mpos pos; /* current point */
+ struct mpos tpos, /* temp point while drawing lines above and below pos */
+ toppos, /* pos at top of screen - for page-up */
+ botpos; /* pos at bottom of screen - for page-down */
+ struct mpos vpos, tvpos;
+ int botrow = 0;
+ int meta = 0, /* mode for multi-key commands - SEARCH or META */
+ tmeta;
+ int num = -1, /* numeric arg being typed. */
+ tnum;
+ char search[80]; /* string we are searching for */
+ unsigned int searchlen = 0;
+ int search_notfound = 0;
+ int searchdir = 0;
+ /* ignore_case:
+ * 0 == no
+ * 1 == no because there are upper-case chars
+ * 2 == yes as there are no upper-case chars
+ * 3 == yes
+ */
+ int ignore_case = 2;
+ /* We record all the places we find so 'backspace'
+ * can easily return to the previous one
+ */
+ struct search_anchor {
+ struct search_anchor *next;
+ struct mpos pos;
+ struct cursor curs;
+ int notfound;
+ int row, start;
+ unsigned int searchlen;
+ } *anchor = NULL;
+
+ if (f == NULL) {
+ /* three separate files */
+ sm = load_file(p->file);
+ sb = load_file(p->before);
+ sa = load_file(p->after);
+ ch = 0;
+ } else {
+ sp = load_segment(f, p->start, p->end);
+ if (p->is_merge) {
+ if (reverse)
+ split_merge(sp, &sm, &sa, &sb);
+ else
+ split_merge(sp, &sm, &sb, &sa);
+ ch = 0;
+ } else {
+ if (reverse)
+ ch = split_patch(sp, &sa, &sb);
+ else
+ ch = split_patch(sp, &sb, &sa);
+
+ sm = load_file(p->file);
+ }
+ }
+ if (!sm.body || !sb.body || !sa.body) {
+ term_init();
+ if (!sm.body)
+ help_window(help_missing, NULL);
+ else
+ help_window(help_corrupt, NULL);
+ return;
+ }
+ /* FIXME check for errors in the stream */
+ fm = split_stream(sm, ByWord);
+ fb = split_stream(sb, ByWord);
+ fa = split_stream(sa, ByWord);
+
+ if (ch)
+ csl1 = pdiff(fm, fb, ch);
+ else
+ csl1 = diff(fm, fb);
+ csl2 = diff(fb, fa);
+
+ ci = make_merger(fm, fb, fa, csl1, csl2, 0, 1, 0);
+
+ term_init();
+
+ row = 1;
+ pos.p.m = 0; /* merge node */
+ pos.p.s = 0; /* stream number */
+ pos.p.o = -1; /* offset */
+ pos.p.lineno = 1;
+ pos.state = 0;
+ next_mline(&pos, fm, fb, fa, ci.merger, mode);
+ memset(&curs, 0, sizeof(curs));
+ vpos = pos;
+ while (1) {
+ if (refresh >= 2) {
+ char buf[100];
+ clear();
+ snprintf(buf, 100, "File: %s%s Mode: %s\n",
+ p->file, reverse ? " - reversed" : "", modename);
+ (void)attrset(A_BOLD);
+ mvaddstr(0, 0, buf);
+ clrtoeol();
+ (void)attrset(A_NORMAL);
+ refresh = 1;
+ }
+ if (row < 1 || row >= lastrow)
+ refresh = 1;
+ if (curs.alt)
+ refresh = 1;
+
+ if (mode == (ORIG|RESULT)) {
+ int cmode = check_line(pos, fm, fb, fa, ci.merger, mode);
+ if (cmode & (WIGGLED | CONFLICTED)) {
+ if (splitrow < 0) {
+ splitrow = (rows+1)/2;
+ lastrow = splitrow - 1;
+ refresh = 1;
+ }
+ } else if (!curs.alt && splitrow >= 0) {
+ splitrow = -1;
+ lastrow = rows-1;
+ refresh = 1;
+ }
+ } else if (splitrow >= 0) {
+ splitrow = -1;
+ lastrow = rows-1;
+ refresh = 1;
+ }
+
+ if (refresh) {
+ getmaxyx(stdscr, rows, cols);
+ rows--; /* keep last row clear */
+ if (splitrow >= 0) {
+ splitrow = (rows+1)/2;
+ lastrow = splitrow - 1;
+ } else
+ lastrow = rows - 1;
+
+ if (row < -3)
+ row = lastrow/2+1;
+ if (row < 1)
+ row = 1;
+ if (row > lastrow+3)
+ row = lastrow/2+1;
+ if (row >= lastrow)
+ row = lastrow-1;
+ }
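+ /* Debugging aid: when WIGGLE_VTRACE is set in the environment,
+  * show the state, stream, offset, merge index and merge type of
+  * the element at 'vpos' on the top line of the screen.
+  */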
+ if (getenv("WIGGLE_VTRACE")) {
+ char b[100];
+ char *e, e2[7];
+ int i;
+ switch (vpos.p.s) {
+ case 0:
+ e = fm.list[ci.merger[vpos.p.m].a + vpos.p.o].start;
+ break;
+ case 1:
+ e = fb.list[ci.merger[vpos.p.m].b + vpos.p.o].start;
+ break;
+ case 2:
+ e = fa.list[ci.merger[vpos.p.m].c + vpos.p.o].start;
+ break;
+ }
+ for (i = 0; i < 6; i++) {
+ e2[i] = e[i];
+ if (e2[i] < 32 || e2[i] >= 127)
+ e2[i] = '?';
+ }
+ sprintf(b, "st=%d str=%d o=%d m=%d mt=%s(%d,%d,%d) ic=%d <%.3s>", vpos.state,
+ vpos.p.s, vpos.p.o,
+ vpos.p.m, typenames[ci.merger[vpos.p.m].type],
+ ci.merger[vpos.p.m].al,
+ ci.merger[vpos.p.m].bl,
+ ci.merger[vpos.p.m].cl,
+ ci.merger[vpos.p.m].in_conflict,
+ e2
+ );
+ (void)attrset(A_NORMAL);
+ mvaddstr(0, 50, b);
+ clrtoeol();
+ }
+
+ /* Always refresh the line */
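+ /* Adjust the horizontal scroll offset 'start' in 8-column steps
+  * until the cursor's target column is on screen; the 'retry'
+  * label below redraws after each adjustment.
+  */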
+ while (start > curs.target) {
+ start -= 8;
+ refresh = 1;
+ }
+ if (start < 0)
+ start = 0;
+ retry:
+ draw_mline(mode, row, start, cols, fm, fb, fa, ci.merger,
+ pos, (splitrow >= 0 && curs.alt) ? NULL : &curs);
+ if (curs.width == 0 && start < curs.col) {
+ /* width == 0 implies it appears after end-of-screen */
+ start += 8;
+ refresh = 1;
+ goto retry;
+ }
+ if (curs.col < start) {
+ start -= 8;
+ refresh = 1;
+ if (start < 0)
+ start = 0;
+ goto retry;
+ }
+ if (refresh) {
+ refresh = 0;
+ tpos = pos;
+
+ for (i = row-1; i >= 1 && tpos.p.m >= 0; ) {
+ prev_mline(&tpos, fm, fb, fa, ci.merger, mode);
+ draw_mline(mode, i--, start, cols,
+ fm, fb, fa, ci.merger,
+ tpos, NULL);
+
+ }
+ if (i) {
+ row -= (i+1);
+ refresh = 1;
+ goto retry;
+ }
+ toppos = tpos;
+ while (i >= 1)
+ blank(i--, 0, cols, a_void);
+ tpos = pos;
+ for (i = row; i <= lastrow && ci.merger[tpos.p.m].type != End; ) {
+ draw_mline(mode, i++, start, cols,
+ fm, fb, fa, ci.merger,
+ tpos, NULL);
+ next_mline(&tpos, fm, fb, fa, ci.merger, mode);
+ }
+ botpos = tpos; botrow = i;
+ while (i <= lastrow)
+ blank(i++, 0, cols, a_void);
+ }
+
+ if (splitrow >= 0) {
+ struct mpos spos = pos;
+ int smode = BEFORE|AFTER;
+ int srow = (rows + splitrow)/2;
+ if (visible(smode, ci.merger[spos.p.m].type,
+ spos.p.s) < 0)
+ prev_mline(&spos, fm, fb, fa, ci.merger, smode);
+ /* Now hi/lo might be wrong, so lets fix it. */
+ tpos = spos;
+ while (spos.p.m >= 0 && spos.state != 0)
+ prev_mline(&spos, fm, fb, fa, ci.merger, smode);
+ while (!same_mpos(spos, tpos))
+ next_mline(&spos, fm, fb, fa, ci.merger, smode);
+
+ (void)attrset(a_sep);
+ for (i = 0; i < cols; i++)
+ mvaddstr(splitrow, i, "-");
+
+ tpos = spos;
+ for (i = srow-1; i > splitrow; i--) {
+ prev_mline(&tpos, fm, fb, fa, ci.merger, smode);
+ draw_mline(smode, i, start, cols, fm, fb, fa, ci.merger,
+ tpos, NULL);
+ }
+ while (i > splitrow)
+ blank(i--, 0, cols, a_void);
+ tpos = spos;
+ for (i = srow;
+ i < rows && ci.merger[tpos.p.m].type != End;
+ i++) {
+ draw_mline(smode, i, start, cols, fm, fb, fa, ci.merger,
+ tpos,
+ (i == srow && curs.alt) ? &curs : NULL);
+ next_mline(&tpos, fm, fb, fa, ci.merger, smode);
+ }
+ while (i < rows)
+ blank(i++, 0, cols, a_void);
+ }
+ /* Now that curs is accurate, report the type */
+ {
+ char lbuf[30];
+ (void)attrset(A_BOLD);
+ snprintf(lbuf, 29, "%s ln:%d",
+ typenames[ci.merger[curs.pos.m].type],
+ (pos.p.lineno-1)/2);
+ mvaddstr(0, cols - strlen(lbuf) - 4, " ");
+ mvaddstr(0, cols - strlen(lbuf) - 1, lbuf);
+ }
+#define META(c) ((c)|0x1000)
+#define SEARCH(c) ((c)|0x2000)
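+ /* META() and SEARCH() fold the pending multi-key state (an ESC
+  * prefix or incremental-search mode) into the keystroke value, so
+  * the single switch on (c | tmeta) below can dispatch plain, ESC-
+  * and search-mode keys together.
+  */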
+ move(rows, 0);
+ (void)attrset(A_NORMAL);
+ if (num >= 0) {
+ char buf[10];
+ snprintf(buf, 10, "%d ", num);
+ addstr(buf);
+ }
+ if (meta & META(0))
+ addstr("ESC...");
+ if (meta & SEARCH(0)) {
+ if (searchdir < 0)
+ addstr("Backwards ");
+ addstr("Search: ");
+ addstr(search);
+ if (search_notfound)
+ addstr(" - Not Found.");
+ search_notfound = 0;
+ }
+ clrtoeol();
+ /* '+1' to skip over the leading +/-/| char */
+ if (curs.alt && splitrow > 0)
+ move((rows + splitrow)/2, curs.col - start + 1);
+ else if (curs.alt && ((mode & (BEFORE|AFTER)) &&
+ (mode & (ORIG|RESULT))))
+ move(row, curs.col-start + (cols-1)/2+2);
+ else
+ move(row, curs.col-start+1);
+ c = getch();
+ tmeta = meta; meta = 0;
+ tnum = num; num = -1;
+ tvpos = vpos; vpos = pos;
+ cswitch = c | tmeta;
+ /* Handle some ranges */
+ /* case '0' ... '9': */
+ if (cswitch >= '0' && cswitch <= '9')
+ cswitch = '0';
+ /* case SEARCH(' ') ... SEARCH('~'): */
+ if (cswitch >= SEARCH(' ') && cswitch <= SEARCH('~'))
+ cswitch = SEARCH(' ');
+
+ switch (cswitch) {
+ case 27: /* escape */
+ case META(27):
+ meta = META(0);
+ break;
+ case META('<'): /* start of file */
+ start:
+ tpos = pos; row++;
+ do {
+ pos = tpos; row--;
+ prev_mline(&tpos, fm, fb, fa, ci.merger, mmode);
+ } while (tpos.p.m >= 0);
+ if (row <= 0)
+ row = 0;
+ break;
+ case META('>'): /* end of file */
+ case 'G':
+ if (tnum >= 0)
+ goto start;
+ tpos = pos; row--;
+ do {
+ pos = tpos; row++;
+ next_mline(&tpos, fm, fb, fa, ci.merger, mmode);
+ } while (ci.merger[tpos.p.m].type != End);
+ if (row >= lastrow)
+ row = lastrow;
+ break;
+ case '0': /* actually '0'...'9' */
+ if (tnum < 0)
+ tnum = 0;
+ num = tnum*10 + (c-'0');
+ break;
+ case 'q':
+ return;
+
+ case '/':
+ case 'S'-64:
+ /* incr search forward */
+ meta = SEARCH(0);
+ searchlen = 0;
+ search[searchlen] = 0;
+ searchdir = 1;
+ break;
+ case '\\':
+ case 'R'-64:
+ /* incr search backwards */
+ meta = SEARCH(0);
+ searchlen = 0;
+ search[searchlen] = 0;
+ searchdir = -1;
+ break;
+ case SEARCH('G'-64):
+ case SEARCH('S'-64):
+ case SEARCH('R'-64):
+ /* search again */
+ if ((c|tmeta) == SEARCH('R'-64))
+ searchdir = -2;
+ else if ((c|tmeta) == SEARCH('S'-64))
+ searchdir = 2;
+ else
+ searchdir *= 2;
+ meta = SEARCH(0);
+ tpos = pos; trow = row;
+ goto search_again;
+
+ case SEARCH('H'-64):
+ case SEARCH(KEY_BACKSPACE):
+ meta = SEARCH(0);
+ if (anchor) {
+ struct search_anchor *a;
+ a = anchor;
+ anchor = a->next;
+ free(a);
+ }
+ if (anchor) {
+ struct search_anchor *a;
+ a = anchor;
+ anchor = a->next;
+ pos = a->pos;
+ row = a->row;
+ start = a->start;
+ curs = a->curs;
+ curs.target = -1;
+ search_notfound = a->notfound;
+ searchlen = a->searchlen;
+ search[searchlen] = 0;
+ free(a);
+ refresh = 1;
+ }
+ break;
+ case SEARCH(' '): /* actually ' '...'~' */
+ case SEARCH('\t'):
+ meta = SEARCH(0);
+ if (searchlen < sizeof(search)-1)
+ search[searchlen++] = c & (0x7f);
+ search[searchlen] = 0;
+ tpos = pos; trow = row;
+ search_again:
+ search_notfound = 1;
+ if (ignore_case == 1 || ignore_case == 2) {
+ unsigned int i;
+ ignore_case = 2;
+ for (i=0; i < searchlen; i++)
+ if (isupper(search[i])) {
+ ignore_case = 1;
+ break;
+ }
+ }
+ do {
+ if (mcontains(tpos, fm, fb, fa, ci.merger,
+ mmode, search, &curs, searchdir,
+ ignore_case >= 2)) {
+ curs.target = -1;
+ pos = tpos;
+ row = trow;
+ search_notfound = 0;
+ break;
+ }
+ if (searchdir < 0) {
+ trow--;
+ prev_mline(&tpos, fm, fb, fa, ci.merger, mmode);
+ } else {
+ trow++;
+ next_mline(&tpos, fm, fb, fa, ci.merger, mmode);
+ }
+ } while (tpos.p.m >= 0 && ci.merger[tpos.p.m].type != End);
+ searchdir /= abs(searchdir);
+
+ break;
+ case 'L'-64:
+ refresh = 2;
+ row = lastrow / 2;
+ break;
+
+ case 'V'-64: /* page down */
+ pos = botpos;
+ if (botrow <= lastrow)
+ row = botrow;
+ else
+ row = 2;
+ refresh = 1;
+ break;
+ case META('v'): /* page up */
+ pos = toppos;
+ row = lastrow-1;
+ refresh = 1;
+ break;
+
+ case 'j':
+ case 'n':
+ case 'N'-64:
+ case KEY_DOWN:
+ if (tnum < 0)
+ tnum = 1;
+ for (; tnum > 0 ; tnum--) {
+ tpos = pos;
+ next_mline(&tpos, fm, fb, fa, ci.merger, mmode);
+ if (ci.merger[tpos.p.m].type != End) {
+ pos = tpos;
+ row++;
+ }
+ }
+ break;
+ case 'N':
+ /* Next diff */
+ tpos = pos; row--;
+ do {
+ pos = tpos; row++;
+ next_mline(&tpos, fm, fb, fa, ci.merger, mmode);
+ } while (pos.state != 0 && ci.merger[tpos.p.m].type != End);
+ tpos = pos; row--;
+ do {
+ pos = tpos; row++;
+ next_mline(&tpos, fm, fb, fa, ci.merger, mmode);
+ } while (pos.state == 0 && ci.merger[tpos.p.m].type != End);
+
+ break;
+ case 'P':
+ /* Previous diff */
+ tpos = pos; row++;
+ do {
+ pos = tpos; row--;
+ prev_mline(&tpos, fm, fb, fa, ci.merger, mmode);
+ } while (tpos.state == 0 && tpos.p.m >= 0);
+ tpos = pos; row++;
+ do {
+ pos = tpos; row--;
+ prev_mline(&tpos, fm, fb, fa, ci.merger, mmode);
+ } while (tpos.state != 0 && tpos.p.m >= 0);
+ break;
+
+ case 'k':
+ case 'p':
+ case 'P'-64:
+ case KEY_UP:
+ if (tnum < 0)
+ tnum = 1;
+ for (; tnum > 0 ; tnum--) {
+ tpos = pos;
+ prev_mline(&tpos, fm, fb, fa, ci.merger, mmode);
+ if (tpos.p.m >= 0) {
+ pos = tpos;
+ row--;
+ }
+ }
+ break;
+
+ case KEY_LEFT:
+ case 'h':
+ /* left */
+ curs.target = curs.col - 1;
+ if (curs.target < 0) {
+ /* Try to go to end of previous line */
+ tpos = pos;
+ prev_mline(&tpos, fm, fb, fa, ci.merger, mmode);
+ if (tpos.p.m >= 0) {
+ pos = tpos;
+ row--;
+ curs.pos = pos.p;
+ curs.target = -1;
+ } else
+ curs.target = 0;
+ }
+ break;
+ case KEY_RIGHT:
+ case 'l':
+ /* right */
+ if (curs.width >= 0)
+ curs.target = curs.col + curs.width;
+ else {
+ /* end of line, go to next */
+ tpos = pos;
+ next_mline(&tpos, fm, fb, fa, ci.merger, mmode);
+ if (ci.merger[tpos.p.m].type != End) {
+ pos = tpos;
+ curs.pos = pos.p;
+ row++;
+ curs.target = 0;
+ }
+ }
+ break;
+
+ case '^':
+ case 'A'-64:
+ /* Start of line */
+ curs.target = 0;
+ break;
+ case '$':
+ case 'E'-64:
+ /* End of line */
+ curs.target = 1000;
+ break;
+
+ case 'O':
+ curs.alt = !curs.alt;
+ if (curs.alt && mode == (ORIG|RESULT))
+ mmode = (BEFORE|AFTER);
+ else
+ mmode = mode;
+ break;
+
+ case 'a': /* 'after' view in patch window */
+ mode = AFTER; modename = "after"; modehelp = after_help;
+ mmode = mode; curs.alt = 0;
+ refresh = 3;
+ break;
+ case 'b': /* 'before' view in patch window */
+ mode = BEFORE; modename = "before"; modehelp = before_help;
+ mmode = mode; curs.alt = 0;
+ refresh = 3;
+ break;
+ case 'o': /* 'original' view in the merge window */
+ mode = ORIG; modename = "original"; modehelp = orig_help;
+ mmode = mode; curs.alt = 0;
+ refresh = 3;
+ break;
+ case 'r': /* the 'result' view in the merge window */
+ mode = RESULT; modename = "result"; modehelp = result_help;
+ mmode = mode; curs.alt = 0;
+ refresh = 3;
+ break;
+ case 'd':
+ mode = BEFORE|AFTER; modename = "diff"; modehelp = diff_help;
+ mmode = mode; curs.alt = 0;
+ refresh = 3;
+ break;
+ case 'm':
+ mode = ORIG|RESULT; modename = "merge"; modehelp = merge_help;
+ mmode = mode; curs.alt = 0;
+ refresh = 3;
+ break;
+
+ case '|':
+ mode = ORIG|RESULT|BEFORE|AFTER; modename = "sidebyside"; modehelp = sidebyside_help;
+ mmode = mode; curs.alt = 0;
+ refresh = 3;
+ break;
+
+ case 'H': /* scroll window to the right */
+ if (start > 0)
+ start--;
+ curs.target = start + 1;
+ refresh = 1;
+ break;
+ case 'L': /* scroll window to the left */
+ if (start < cols)
+ start++;
+ curs.target = start + 1;
+ refresh = 1;
+ break;
+
+ case '<':
+ prev_melmnt(&tvpos.p, fm, fb, fa, ci.merger);
+ if (tvpos.p.m >= 0)
+ vpos = tvpos;
+ break;
+ case '>':
+ next_melmnt(&tvpos.p, fm, fb, fa, ci.merger);
+ if (ci.merger[tvpos.p.m].type != End)
+ vpos = tvpos;
+ break;
+
+ case '?':
+ help_window(modehelp, merge_window_help);
+ refresh = 2;
+ break;
+
+ case KEY_RESIZE:
+ refresh = 2;
+ break;
+ }
+
+ if (meta == SEARCH(0)) {
+ if (anchor == NULL ||
+ !same_mpos(anchor->pos, pos) ||
+ anchor->searchlen != searchlen ||
+ !same_mp(anchor->curs.pos, curs.pos)) {
+ struct search_anchor *a = xmalloc(sizeof(*a));
+ a->pos = pos;
+ a->row = row;
+ a->start = start;
+ a->curs = curs;
+ a->searchlen = searchlen;
+ a->notfound = search_notfound;
+ a->next = anchor;
+ anchor = a;
+ }
+ } else {
+ while (anchor) {
+ struct search_anchor *a = anchor;
+ anchor = a->next;
+ free(a);
+ }
+ }
+ if (refresh == 3) {
+ /* move backward and forward to make sure we
+ * are on a visible line
+ */
+ tpos = pos;
+ prev_mline(&tpos, fm, fb, fa, ci.merger, mmode);
+ if (tpos.p.m >= 0)
+ pos = tpos;
+ tpos = pos;
+ next_mline(&tpos, fm, fb, fa, ci.merger, mmode);
+ if (ci.merger[tpos.p.m].type != End)
+ pos = tpos;
+ }
+ }
+}
+
+static void show_merge(char *origname, FILE *patch, int reverse,
+ int is_merge, char *before, char *after)
+{
+ struct plist p;
+
+ p.file = origname;
+ if (patch) {
+ p.start = 0;
+ fseek(patch, 0, SEEK_END);
+ p.end = ftell(patch);
+ fseek(patch, 0, SEEK_SET);
+ }
+ p.calced = 0;
+ p.is_merge = is_merge;
+ p.before = before;
+ p.after = after;
+
+ freopen("/dev/null","w",stderr);
+ merge_window(&p, patch, reverse);
+}
+
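+/* calc_one re-runs the split/diff/merge machinery on a single patch-list
+ * entry purely to count how many chunks needed wiggling and how many
+ * conflicted, so that the browser can fill in the Wi and Co columns.
+ */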
+static void calc_one(struct plist *pl, FILE *f, int reverse)
+{
+ struct stream s1, s2;
+ struct stream s = load_segment(f, pl->start, pl->end);
+ struct stream sf;
+ if (pl->is_merge) {
+ if (reverse)
+ split_merge(s, &sf, &s2, &s1);
+ else
+ split_merge(s, &sf, &s1, &s2);
+ pl->chunks = 0;
+ } else {
+ sf = load_file(pl->file);
+ if (reverse)
+ pl->chunks = split_patch(s, &s2, &s1);
+ else
+ pl->chunks = split_patch(s, &s1, &s2);
+ }
+ if (sf.body == NULL || s1.body == NULL || s2.body == NULL) {
+ pl->wiggles = pl->conflicts = -1;
+ } else {
+ struct file ff, fp1, fp2;
+ struct csl *csl1, *csl2;
+ struct ci ci;
+ ff = split_stream(sf, ByWord);
+ fp1 = split_stream(s1, ByWord);
+ fp2 = split_stream(s2, ByWord);
+ if (pl->chunks)
+ csl1 = pdiff(ff, fp1, pl->chunks);
+ else
+ csl1 = diff(ff, fp1);
+ csl2 = diff(fp1, fp2);
+ ci = make_merger(ff, fp1, fp2, csl1, csl2, 0, 1, 0);
+ pl->wiggles = ci.wiggles;
+ pl->conflicts = ci.conflicts;
+ free(csl1);
+ free(csl2);
+ free(ff.list);
+ free(fp1.list);
+ free(fp2.list);
+ }
+
+ free(s1.body);
+ free(s2.body);
+ free(s.body);
+ free(sf.body);
+ pl->calced = 1;
+}
+
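+/* get_prev and get_next walk the patch list to the previous/next entry
+ * that should be displayed for the given filter mode:
+ *   0 - show every file,
+ *   1 - only files that needed a wiggle or have a conflict,
+ *   2 - only files with a conflict.
+ * Directories are always shown.
+ */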
+static int get_prev(int pos, struct plist *pl, int n, int mode)
+{
+ int found = 0;
+ if (pos == -1)
+ return pos;
+ do {
+ if (pl[pos].prev == -1)
+ return pl[pos].parent;
+ pos = pl[pos].prev;
+ while (pl[pos].open &&
+ pl[pos].last >= 0)
+ pos = pl[pos].last;
+ if (pl[pos].last >= 0)
+ /* always see directories */
+ found = 1;
+ else if (mode == 0)
+ found = 1;
+ else if (mode <= 1 && pl[pos].wiggles > 0)
+ found = 1;
+ else if (mode <= 2 && pl[pos].conflicts > 0)
+ found = 1;
+ } while (pos >= 0 && !found);
+ return pos;
+}
+
+static int get_next(int pos, struct plist *pl, int n, int mode,
+ FILE *f, int reverse)
+{
+ int found = 0;
+ if (pos == -1)
+ return pos;
+ do {
+ if (pl[pos].open) {
+ if (pos + 1 < n)
+ pos = pos+1;
+ else
+ return -1;
+ } else {
+ while (pos >= 0 && pl[pos].next == -1)
+ pos = pl[pos].parent;
+ if (pos >= 0)
+ pos = pl[pos].next;
+ }
+ if (pos < 0)
+ return -1;
+ if (pl[pos].calced == 0 && pl[pos].end)
+ calc_one(pl+pos, f, reverse);
+ if (pl[pos].last >= 0)
+ /* always see directories */
+ found = 1;
+ else if (mode == 0)
+ found = 1;
+ else if (mode <= 1 && pl[pos].wiggles > 0)
+ found = 1;
+ else if (mode <= 2 && pl[pos].conflicts > 0)
+ found = 1;
+ } while (pos >= 0 && !found);
+ return pos;
+}
+
+static void draw_one(int row, struct plist *pl, FILE *f, int reverse)
+{
+ char hdr[12];
+ hdr[0] = 0;
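+ /* hdr holds three 2-digit counts (chunks, wiggles, conflicts)
+  * followed by a marker: "= " for a file, "+ " for an open
+  * directory, "- " for a closed one.  Counts above 99 show as XX.
+  */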
+
+ if (pl == NULL) {
+ move(row, 0);
+ clrtoeol();
+ return;
+ }
+ if (pl->calced == 0 && pl->end)
+ /* better load the patch and count the chunks */
+ calc_one(pl, f, reverse);
+ if (pl->end == 0) {
+ strcpy(hdr, " ");
+ } else {
+ if (pl->chunks > 99)
+ strcpy(hdr, "XX");
+ else
+ sprintf(hdr, "%2d", pl->chunks);
+ if (pl->wiggles > 99)
+ strcpy(hdr+2, " XX");
+ else
+ sprintf(hdr+2, " %2d", pl->wiggles);
+ if (pl->conflicts > 99)
+ strcpy(hdr+5, " XX ");
+ else
+ sprintf(hdr+5, " %2d ", pl->conflicts);
+ }
+ if (pl->end)
+ strcpy(hdr+9, "= ");
+ else if (pl->open)
+ strcpy(hdr+9, "+ ");
+ else
+ strcpy(hdr+9, "- ");
+
+ if (!pl->end)
+ attrset(0);
+ else if (pl->conflicts)
+ attrset(a_has_conflicts);
+ else if (pl->wiggles)
+ attrset(a_has_wiggles);
+ else
+ attrset(a_no_wiggles);
+
+ mvaddstr(row, 0, hdr);
+ mvaddstr(row, 11, pl->file);
+ clrtoeol();
+}
+
+static char *main_help[] = {
+ " You are using the \"browse\" mode of wiggle.",
+ "This page shows a list of files in a patch together with",
+ "the directories that contain them.",
+ "A directory is indicated by a '+' if the contents are",
+ "listed or a '-' if the contents are hidden. A file is",
+ "indicated by an '='. Typing <space> or <return> will",
+ "expose or hide a directory, and will visit a file.",
+ "",
+ "The three columns of numbers are:",
+ " Ch The number of patch chunks which applied to",
+ " this file",
+ " Wi The number of chunks that needed to be wiggled",
+ " in to place",
+ " Co The number of chunks that created an unresolvable",
+ " conflict",
+ ""
+ "Keystrokes recognised in this page are:",
+ " ? Display this help",
+ " SPC On a directory, toggle hiding of contents",
+ " On file, visit the file",
+ " RTN Same as SPC",
+ " q Quit program",
+ " n,j,DOWN Go to next line",
+ " p,k,UP Go to previous line",
+ "",
+ " A list All files",
+ " W only list files with a wiggle or a conflict",
+ " C only list files with a conflict",
+ NULL
+};
+
+static void main_window(struct plist *pl, int n, FILE *f, int reverse)
+{
+ /* The main window lists all files together with summary information:
+ * number of chunks, number of wiggles, number of conflicts.
+ * The list is scrollable
+ * When an entry is 'selected', we switch to the 'file' window
+ * The list can be condensed by removing files with no conflict
+ * or no wiggles, or removing subdirectories
+ *
+ * We record which file in the list is 'current', and which
+ * screen line it is on. We try to keep things stable while
+ * moving.
+ *
+ * Counts are printed before the name using at most 2 digits.
+ * Numbers greater than 99 are XX
+ * Ch Wi Co File
+ * 27 5 1 drivers/md/md.c
+ *
+ * A directory shows the sum over all its children.
+ *
+ * Commands:
+ * select: enter, space, mouseclick
+ * on file, go to file window
+ * on directory, toggle open
+ * up: k, p, control-p uparrow
+ * Move to previous open object
+ * down: j, n, control-n, downarrow
+ * Move to next open object
+ *
+ * A W C: select All Wiggles or Conflicts
+ * mode
+ *
+ */
+ int pos = 0; /* position in file */
+ int row = 1; /* position on screen */
+ int rows = 0; /* size of screen in rows */
+ int cols = 0;
+ int tpos, i;
+ int refresh = 2;
+ int c = 0;
+ int mode = 0; /* 0=all, 1= only wiggled, 2=only conflicted */
+
+ freopen("/dev/null","w",stderr);
+ term_init();
+ pl = sort_patches(pl, &n);
+
+ while (1) {
+ if (refresh == 2) {
+ clear(); (void)attrset(0);
+ attron(A_BOLD);
+ mvaddstr(0, 0, "Ch Wi Co Patched Files");
+ move(2, 0);
+ attroff(A_BOLD);
+ refresh = 1;
+ }
+ if (row < 1 || row >= rows)
+ refresh = 1;
+ if (refresh) {
+ refresh = 0;
+ getmaxyx(stdscr, rows, cols);
+ cols = cols; /* Silence warning that 'cols' isn't used */
+ if (row >= rows + 3)
+ row = (rows+1)/2;
+ if (row >= rows)
+ row = rows-1;
+ tpos = pos;
+ for (i = row; i > 1; i--) {
+ tpos = get_prev(tpos, pl, n, mode);
+ if (tpos == -1) {
+ row = row - i + 1;
+ break;
+ }
+ }
+ /* Ok, row and pos could be trustworthy now */
+ tpos = pos;
+ for (i = row; i >= 1; i--) {
+ draw_one(i, &pl[tpos], f, reverse);
+ tpos = get_prev(tpos, pl, n, mode);
+ }
+ tpos = pos;
+ for (i = row+1; i < rows; i++) {
+ tpos = get_next(tpos, pl, n, mode, f, reverse);
+ if (tpos >= 0)
+ draw_one(i, &pl[tpos], f, reverse);
+ else
+ draw_one(i, NULL, f, reverse);
+ }
+ }
+ {char bb[20];
+ sprintf(bb, "%d", c);
+ mvaddstr(0, 70, bb);
+ clrtoeol();
+ }
+ move(row, 9);
+ c = getch();
+ switch (c) {
+ case 'j':
+ case 'n':
+ case 'N':
+ case 'N'-64:
+ case KEY_DOWN:
+ tpos = get_next(pos, pl, n, mode, f, reverse);
+ if (tpos >= 0) {
+ pos = tpos;
+ row++;
+ }
+ break;
+ case 'k':
+ case 'p':
+ case 'P':
+ case 'P'-64:
+ case KEY_UP:
+ tpos = get_prev(pos, pl, n, mode);
+ if (tpos >= 0) {
+ pos = tpos;
+ row--;
+ }
+ break;
+
+ case ' ':
+ case 13:
+ if (pl[pos].end == 0) {
+ pl[pos].open = !pl[pos].open;
+ refresh = 1;
+ } else {
+ /* diff_window(&pl[pos], f); */
+ merge_window(&pl[pos], f, reverse);
+ refresh = 2;
+ }
+ break;
+ case 27: /* escape */
+ mvaddstr(0, 70, "ESC..."); clrtoeol();
+ c = getch();
+ switch (c) {
+ }
+ break;
+ case 'q':
+ return;
+
+ case 'A':
+ mode = 0; refresh = 1;
+ break;
+ case 'W':
+ mode = 1; refresh = 1;
+ break;
+ case 'C':
+ mode = 2; refresh = 1;
+ break;
+
+ case '?':
+ help_window(main_help, NULL);
+ refresh = 2;
+ break;
+
+ case KEY_RESIZE:
+ refresh = 2;
+ break;
+ }
+ }
+}
+
+static void catch(int sig)
+{
+ if (sig == SIGINT) {
+ signal(sig, catch);
+ return;
+ }
+ nocbreak();
+ nl();
+ endwin();
+ printf("Died on signal %d\n", sig);
+ exit(2);
+}
+
+static void term_init(void)
+{
+
+ static int init_done = 0;
+
+ if (init_done)
+ return;
+ init_done = 1;
+
+ signal(SIGINT, catch);
+ signal(SIGQUIT, catch);
+ signal(SIGTERM, catch);
+ signal(SIGBUS, catch);
+ signal(SIGSEGV, catch);
+
+ initscr(); cbreak(); noecho();
+ start_color();
+ use_default_colors();
+ if (!has_colors()) {
+ a_delete = A_UNDERLINE;
+ a_added = A_BOLD;
+ a_common = A_NORMAL;
+ a_sep = A_STANDOUT;
+ a_already = A_STANDOUT;
+ a_has_conflicts = A_UNDERLINE;
+ a_has_wiggles = A_BOLD;
+ a_no_wiggles = A_NORMAL;
+ } else {
+ init_pair(1, COLOR_RED, -1);
+ a_delete = COLOR_PAIR(1);
+ init_pair(2, COLOR_GREEN, -1);
+ a_added = COLOR_PAIR(2);
+ a_common = A_NORMAL;
+ init_pair(3, COLOR_WHITE, COLOR_GREEN);
+ a_sep = COLOR_PAIR(3); a_sep = A_STANDOUT;
+ init_pair(4, -1, COLOR_YELLOW);
+ a_void = COLOR_PAIR(4);
+ init_pair(5, COLOR_BLUE, -1);
+ a_unmatched = COLOR_PAIR(5);
+ init_pair(6, COLOR_CYAN, -1);
+ a_extra = COLOR_PAIR(6);
+
+ init_pair(7, COLOR_BLACK, COLOR_CYAN);
+ a_already = COLOR_PAIR(7);
+
+ a_has_conflicts = a_delete;
+ a_has_wiggles = a_added;
+ a_no_wiggles = a_unmatched;
+ }
+ nonl(); intrflush(stdscr, FALSE); keypad(stdscr, TRUE);
+ mousemask(ALL_MOUSE_EVENTS, NULL);
+}
+
+int vpatch(int argc, char *argv[], int patch, int strip,
+ int reverse, int replace)
+{
+ /* NOTE argv[0] is first arg...
+ * Behaviour depends on number of args:
+ * 0: A multi-file patch is read from stdin
+ * 1: if 'patch', parse it as a multi-file patch and allow
+ * the files to be browsed.
+ * if filename ends '.rej', then treat it as a patch against
+ * a file with the same basename.
+ * Else treat the file as a merge (with conflicts) and view it.
+ * 2: First file is original, second is patch
+ * 3: Files are: original previous new. The diff between 'previous' and
+ * 'new' needs to be applied to 'original'.
+ *
+ * If a multi-file patch is being read, 'strip' tells how many
+ * path components to strip. If it is -1, we guess based on
+ * existing files.
+ * If 'reverse' is given, then we invert any patch or diff.
+ * If 'replace' then we save the resulting merge.
+ */
+ FILE *in;
+ FILE *f;
+ struct plist *pl;
+ int num_patches;
+
+ switch (argc) {
+ default:
+ fprintf(stderr, "%s: too many file names given.\n", Cmd);
+ exit(1);
+
+ case 0: /* stdin is a patch */
+ if (lseek(fileno(stdin), 0L, 1) == -1) {
+ /* cannot seek, so need to copy to a temp file */
+ f = tmpfile();
+ if (!f) {
+ fprintf(stderr, "%s: Cannot create temp file\n", Cmd);
+ exit(1);
+ }
+ pl = parse_patch(stdin, f, &num_patches);
+ in = f;
+ } else {
+ pl = parse_patch(stdin, NULL, &num_patches);
+ in = fdopen(dup(0), "r");
+ }
+ /* use stderr for keyboard input */
+ dup2(2, 0);
+ if (set_prefix(pl, num_patches, strip) == 0) {
+ fprintf(stderr, "%s: aborting\n", Cmd);
+ exit(2);
+ }
+ main_window(pl, num_patches, in, reverse);
+ break;
+
+ case 1: /* a patch, a .rej, or a merge file */
+ f = fopen(argv[0], "r");
+ if (!f) {
+ fprintf(stderr, "%s: cannot open %s\n", Cmd, argv[0]);
+ exit(1);
+ }
+ if (patch) {
+ pl = parse_patch(f, NULL, &num_patches);
+ if (set_prefix(pl, num_patches, strip) == 0) {
+ fprintf(stderr, "%s: aborting\n", Cmd);
+ exit(2);
+ }
+ main_window(pl, num_patches, f, reverse);
+ } else if (strlen(argv[0]) > 4 &&
+ strcmp(argv[0]+strlen(argv[0])-4, ".rej") == 0) {
+ char *origname = strdup(argv[0]);
+ origname[strlen(origname) - 4] = '\0';
+ show_merge(origname, f, reverse, 0, NULL, NULL);
+ } else
+ show_merge(argv[0], f, reverse, 1, NULL, NULL);
+
+ break;
+ case 2: /* an orig and a diff/.rej */
+ f = fopen(argv[1], "r");
+ if (!f) {
+ fprintf(stderr, "%s: cannot open %s\n", Cmd, argv[0]);
+ exit(1);
+ }
+ show_merge(argv[0], f, reverse, 0, NULL, NULL);
+ break;
+ case 3: /* orig, before, after */
+ show_merge(argv[0], NULL, reverse, 1, argv[1], argv[2]);
+ break;
+ }
+
+ nocbreak();
+ nl();
+ endwin();
+ exit(0);
+}
diff --git a/wiggle.1 b/wiggle.1
new file mode 100644
index 0000000..7060456
--- /dev/null
+++ b/wiggle.1
@@ -0,0 +1,544 @@
+.\" -*- nroff -*-
+.\" wiggle - apply rejected patches
+.\"
+.\" Copyright (C) 2003 Neil Brown <neilb@cse.unsw.edu.au>
+.\" Copyright (C) 2010 Neil Brown <neilb@suse.de>
+.\"
+.\"
+.\" This program is free software; you can redistribute it and/or modify
+.\" it under the terms of the GNU General Public License as published by
+.\" the Free Software Foundation; either version 2 of the License, or
+.\" (at your option) any later version.
+.\"
+.\" This program is distributed in the hope that it will be useful,
+.\" but WITHOUT ANY WARRANTY; without even the implied warranty of
+.\" MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+.\" GNU General Public License for more details.
+.\"
+.\" You should have received a copy of the GNU General Public License
+.\" along with this program; if not, write to the Free Software
+.\" Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+.\"
+.\" Author: Neil Brown
+.\" Email: <neilb@suse.de>
+.\"
+.TH WIGGLE 1 "" v0.9.1
+.SH NAME
+wiggle \- apply rejected patches and perform word-wise diffs
+
+.SH SYNOPSIS
+
+.BI wiggle " [function] [options] file [files]"
+
+.SH DESCRIPTION
+The main function of
+.I wiggle
+is to apply a patch to a file in a similar manner to the
+.BR patch (1)
+program.
+
+The distinctive difference of
+.I wiggle
+is that it will attempt to apply a patch even if the "before" part of
+the patch doesn't match the target file perfectly.
+This is achieved by breaking the file and patch into words and finding
+the best alignment of words in the file with words in the patch.
+Once this alignment has been found, any differences (word-wise) in the
+patch are applied to the file as best as possible.
+
+Also,
+.I wiggle
+will (in some cases) detect changes that have already been applied,
+and will ignore them.
+
+.I wiggle
+ensures that every change in the patch is applied to the target
+file somehow. If a particular change cannot be made in the file, the
+file is annotated to show where the change should be made in a similar
+way to the
+.BR merge (1)
+program with the
+.B \-A
+option.
+Each annotation contains 3 components: a portion of the original file
+where the change should be applied, a portion of the patch that
+couldn't be matched precisely in the file, and the text that should
+replace that portion of the patch. These are separated by lines
+containing precisely 7 identical characters, either '<', '|', '=', or '>', so
+.in +5
+.nf
+.ft CW
+<<<<<<<
+Some portion of the original file
+|||||||
+text to replace
+=======
+text to replace it with
+>>>>>>>
+.ft
+.fi
+.in -5
+
+indicates that "text to replace" should be replaced by "text to
+replace it with" somewhere in the portion of the original file.
+However
+.I wiggle
+was not able to find a place to make this change.
+
+.I wiggle
+can also produce conflict reports showing only the words that are
+involved rather than showing whole lines.
+In this case the output looks like:
+.ft CW
+.ti +5
+<<<---original|||old===new--->>>
+.ft
+
+A typical usage of
+.I wiggle
+is to run
+.I patch
+to apply some patch, and to collect a list of rejects by monitoring
+the error messages from patch. Then for each file for which a
+reject was found, run
+.ti +5
+wiggle \-\-replace originalfile originalfile.rej
+
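+For example, a small shell loop such as the following (an illustrative
+sketch only; adapt the
+.B find
+expression to suit) will wiggle in every reject file that
+.I patch
+left behind:
+.in +5
+.nf
+.ft CW
+for rej in $(find . -name '*.rej'); do
+    wiggle \-\-replace "${rej%.rej}" "$rej"
+done
+.ft
+.fi
+.in -5
+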
+Finally each file must be examined to resolve any unresolved
+conflicts, and to make sure the applied patch is semantically correct.
+
+.SS OPTIONS
+The following options are understood by
+.IR wiggle .
+Some of these are explained in more detail in the following sections
+on MERGE, DIFF, EXTRACT, and BROWSE.
+
+.TP
+.BR \-m ", " \-\-merge
+Select the "merge" function. This is the default function.
+
+.TP
+.BR \-d ", " \-\-diff
+Select the "diff" function. This displays the differences between files.
+
+.TP
+.BR \-x ", " \-\-extract
+Select the "extract" function. This extracts one branch of a patch or
+merge file.
+
+.TP
+.BR \-B ", " \-\-browse
+Select the "browse" function. This is similar to "merge" only with a
+different presentation. Instead of the result simply being sent to
+standard output, it is presented using an ncurses-based GUI so that
+each hunk of the patch can be examined to understand what conflicts
+where involved and what needed to be ignored in order of the patch to
+be wiggled in to place.
+
+.TP
+.BR \-w ", " \-\-words
+Request that all operations and display be word based. This is the
+default for the "diff" function.
+
+.TP
+.BR \-l ", " \-\-lines
+Request that all operations and display be line based.
+
+.TP
+.BR \-p ", " \-\-patch
+Treat the last named file as a patch instead of a file (with \-\-diff)
+or a merge (\-\-extract).
+In
+.I merge
+or
+.B browse
+mode,
+.B \-p
+requires there be exactly one file which is a patch and which can
+contain patches to multiple files. The patches are merged into each
+file. When used in
+.I merge
+mode, this usage requires the
+.B \-\-replace
+option as writing lots of merged files to standard-out is impractical.
+
+When processing a multi-file patch,
+.B \-p
+can be followed by a numeric argument indicating how many file name
+components should be stripped from files named in the patch file. If no
+numeric argument is given,
+.I wiggle
+will deduce an appropriate number based on what files are visible.
+
+.TP
+.BR \-r ", " \-\-replace
+Normally the merged output is written to standard-output. With
+\-\-replace, the original file is replaced with the merge output.
+
+.TP
+.BR \-R ", " \-\-reverse
+When used with the "diff" function, swap the files before calculating
+the differences.
+When used with the "merge" function,
+.I wiggle
+attempts to revert changes rather than apply them.
+
+.TP
+.BR \-i ", " \-\-no\-ignore
+Normally wiggle will ignore changes in the patch which appear to
+already have been applied in the original. With this flag those
+changes are reported as conflicts rather than being ignored.
+
+.TP
+.BR \-W ", " \-\-show\-wiggle
+When used with
+.IR \-\-merge ,
+conflicts that can be wiggled into place are reported as conflicts
+with an extra stanza which shows what the result would be if this flag
+had not been used. The extra stanza is introduced with a line
+containing 7 ampersand
+.RB ( & )
+characters thus:
+.in +5
+.nf
+.ft CW
+<<<<<<<
+Some portion of the original file
+|||||||
+text to replace
+=======
+text to replace it with
+&&&&&&&
+Text that would result from a successful wiggle
+>>>>>>>
+.ft
+.fi
+.in -5
+
+.TP
+.BR -h ", " \-\-help
+Print a simple help message. If given after one of the function
+selectors (\-\-merge, \-\-diff, \-\-extract) help specific to that function
+is displayed.
+
+.TP
+.BR -V ", " \-\-version
+Display the version number of
+.IR wiggle .
+
+.TP
+.BR -v ", " \-\-verbose
+Enable verbose mode. Currently this makes no difference.
+
+.TP
+.BR -q ", " \-\-quiet
+Enable quiet mode. This suppresses the message from the merge
+function when there are unresolvable conflicts.
+
+.SS WORDS
+.I wiggle
+can divide a text into lines or words when performing its tasks.
+A line is simply a string of characters terminated by a newline.
+A word is either a maximal contiguous string of alphanumerics
+(including underscore), a maximal contiguous string of space or tab
+characters, or any other single character.
+
+.SS MERGE
+The merge function modifies a given text by finding all changes between
+two other texts and imposing those changes on the given text.
+
+Normally
+.I wiggle
+focuses on which words have changed so as to maximise the possibility
+of finding a good match in the given text for the context of a given
+change. However it can consider only whole lines instead.
+
+.I wiggle
+extracts the three texts that it needs from files listed on the
+command line. Either 1, 2, or 3 files may be listed, and any one of
+them may be a lone hyphen signifying standard-input.
+
+If one file is given and the
+.B \-p
+option is not present, the file is treated as a
+.B merge
+file, i.e. the output of "merge \-A" or "wiggle". Such a file
+implicitly contains three streams and these are extracted and
+compared.
+
+If two files are given, then the first simply contains the primary
+text, and the second is treated as a patch file (the output of "diff\ \-u"
+or "diff\ \-c", or a ".rej" file from
+.IR patch )
+and the two other texts
+are extracted from that.
+
+If one file is given together with the
+.B \-p
+option, the file is treated as a patch file containing the names of
+the files that it patches. In this case multiple merge operations can
+happen and each takes one stream from a file named in the patch, and
+the other two from the patch itself. The
+.B \-\-replace
+option is required and the results are written back to the
+target files.
+
+Finally if three files are listed, they are taken to contain the given
+text and the two other texts, in order.
+
+Normally the result of the merge is written to standard-output.
+However, if the "\-r" flag is given, the output is written to a file
+which replaces the original given file. In this case the original file
+is renamed to have a
+.B .porig
+suffix (for "patched original" which makes sense if you first use
+.I patch
+to apply a patch, and then use
+.I wiggle
+to wiggle the rejects in).
+
+If no errors occur (such as file access errors)
+.I wiggle
+will exit with a status of 0 if all changes were successfully merged,
+and with an exit status of 1 and a brief message if any changes could
+not be fully merged and were instead inserted as annotations.
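+In a script the exit status can be tested directly; for example (with
+hypothetical file names):
+.in +5
+.nf
+.ft CW
+wiggle \-\-replace file.c file.c.rej || echo "wiggle reported conflicts or errors"
+.ft
+.fi
+.in -5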
+
+The merge function can operate in three different modes with respect
+to lines or words.
+
+With the
+.B \-\-lines
+option, whole lines are compared and any conflicts
+are reported as whole lines that need to be replaced.
+
+With the
+.B \-\-words
+option, individual words are compared and any
+conflicts are reported just covering the words affected. This uses
+the \f(CW <<<|||===>>> \fP conflict format.
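+A single conflicting word might, for example, be reported inline as
+.in +5
+.nf
+.ft CW
+a <<<quick|||fast===speedy>>> brown fox
+.ft
+.fi
+.in -5
+where "quick" is the text found in the original, "fast" is the text
+the patch expected to replace, and "speedy" is its replacement (the
+words here are only illustrative).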
+
+Without either of these options, a hybrid approach is taken.
+Individual words are compared and merged, but when a conflict is found
+the whole surrounding line is reported as being in conflict.
+
+.I wiggle
+will ensure that every change between the two other texts is reflected
+in the result of the merge somehow. There are four different ways
+that a change can be reflected.
+.IP 1
+If a change converts
+.B A
+to
+.B B
+and
+.B A
+is found at a suitable place in the original file, it is
+replaced with
+.BR B .
+This includes the possibility that
+.B B
+is empty, but
+not that
+.B A
+is empty.
+
+.IP 2
+If a change is found which simply adds
+.B B
+and the texts immediately preceding and following the insertion are
+found adjacent in the original file in a suitable place, then
+.B B
+is inserted between those adjacent texts.
+
+.IP 3
+If a change is found which changes
+.B A
+to
+.B B
+and this appears (based on context) to align with
+.B B
+in the original, then it is assumed that this change has already been
+applied, and the change is ignored. When this happens, a message
+reflecting the number of ignored changes is printed by
+.IR wiggle .
+This optimisation can be suppressed with the
+.B \-i
+flag.
+
+.IP 4
+If a change is found that does not fit any of the above possibilities,
+then a conflict is reported as described earlier.
+
+.SS DIFF
+
+The diff function is provided primarily to allow inspection of the
+alignments that
+.I wiggle
+computes between texts and that it uses when performing a merge.
+
+The output of the diff function is similar to the unified output of
+diff. However, while diff does not output long stretches of common text,
+.IR wiggle 's
+diff mode outputs everything.
+
+When calculating a word-based alignment (the default),
+.I wiggle
+may need to show these word-based differences. This is done using an
+extension to the unified-diff format. If a line starts with a
+vertical bar, then it may contain sections surrounded by special
+multi-character brackets. The brackets "<<<++" and "++>>>" surround
+added text while "<<<--" and "-->>>" surround removed text.
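+For example, a line in which one word was replaced might (illustratively)
+appear as
+.in +5
+.nf
+.ft CW
+|a <<<--quick-->>><<<++speedy++>>> brown fox
+.ft
+.fi
+.in -5
+showing "quick" being removed and "speedy" being added in its place.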
+
+.I wiggle
+can be given the two texts to compare in one of three ways.
+
+If only one file is given, then it is treated as a patch and the two
+branches of that diff are compared. This effectively allows a patch
+to be refined from a line-based patch to a word-based patch.
+
+If two files are given, then they are normally assumed to be simple
+texts to be compared.
+
+If two files are given along with the \-\-patch option, then the second
+file is assumed to be a patch and either the first (with \-1) or the
+second (with \-2) branch is extracted and compared with text found in
+the first file.
+
+This last option causes
+.I wiggle
+to apply a "best-fit" algorithm for aligning patch hunks with the
+file before computing the differences. This algorithm is used when
+merging a patch with a file, and its value can be seen by comparing
+the difference produced this way with the difference produced by first
+extracting one branch of a patch into a file, and then computing the
+difference of that file with the main file.
+
+
+.SS EXTRACT
+
+The extract function of
+.I wiggle
+simply exposes the internal functionality for extracting one branch of
+a patch or a merge file.
+
+Precisely one file should be given, and it will be assumed to be a
+merge file unless
+.B \-\-patch
+is given, in which case a patch is assumed.
+
+The choice of branch is made by providing one of
+.BR -1 ,
+.BR -2 ,
+or
+.B -3
+with obvious meanings.
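+For example (with a hypothetical patch name),
+.in +5
+.nf
+.ft CW
+wiggle \-\-extract \-p \-2 fix.patch
+.ft
+.fi
+.in -5
+writes to standard-output the second ("after") branch of fix.patch,
+i.e. the text that the patch expects to produce. Note that \-3 cannot
+be extracted from a patch, only from a merge file.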
+
+.SS BROWSE
+
+The browse function of
+.I wiggle
+presents the result of a merge in a text-based GUI that can be
+navigated using keystrokes similar to vi(1) or emacs(1).
+
+The browser allows each of the three streams to be viewed individually,
+with colours used to highlight different sorts of text - green for
+added text, red for deleted text, and so on. It can also show the patch by
+itself, the full result of the merge, or the merge and the patch
+side-by-side.
+
+The browser provides a number of context-sensitive help pages which
+can be accessed by typing '?'.
+
+.SH WARNING
+
+Caution should always be exercised when applying a rejected patch with
+.IR wiggle .
+When
+.I patch
+rejects a patch, it does so for a good reason. Even though
+.I wiggle
+may be able to find a believable place to apply each textual change,
+there is no guarantee that the result is correct in any semantic
+sense. The result should always be inspected to make sure it is
+correct.
+
+.SH EXAMPLES
+
+.B " wiggle \-\-replace file file.rej"
+.br
+This is the normal usage of
+.I wiggle
+and will take any changes in
+.B file.rej
+that
+.I patch
+could not apply, and merge them into
+.BR file .
+
+.B " wiggle -dp1 file file.rej"
+.br
+This will perform a word-wise comparison between the
+.B file
+and the
+.I before
+branch of the diff in
+.B file.rej
+and display the differences. This allows you to see where a given
+patch would apply.
+
+.B " wiggle \-\-merge \-\-help"
+.br
+Get help about the merge function of
+.IR wiggle .
+
+.B " wiggle --browse --patch update.patch"
+.br
+Parse the
+.B update.patch
+file for patches and present a list of patched files which can be
+browsed to examine each patch in detail.
+
+.SH QUOTE
+The name of
+.I wiggle
+was inspired by the following quote. However,
+.I wiggle
+does not yet
+.B help
+you to wiggle a patch into place. It either does the wiggle itself,
+or leaves it for you to finish off.
+
+.nf
+The problem I find is that I often want to take
+ (file1+patch) -> file2,
+when I don't have file1. But merge tools want to take
+ (file1|file2) -> file3.
+I haven't seen a graphical tool which helps you to wiggle a patch
+into a file.
+
+\-\- Andrew Morton - 2002
+.fi
+
+.SH SHORTCOMINGS
+.IP -
+.I wiggle
+cannot read the extended unified-diff output that it produces for
+\-\-diff \-\-words.
+
+.IP -
+.I wiggle
+cannot read the word-based merge format that it produces for \-\-merge
+\-\-words.
+
+.SH AUTHOR
+
+Neil Brown at Computer Science and Engineering at
+The University of New South Wales, Sydney, Australia;
+and later at SUSE, still in Sydney, Australia.
+
+.SH SEE ALSO
+.IR patch (1),
+.IR diff (1),
+.IR merge (1),
+.IR wdiff (1),
+.IR diff3 (1).
diff --git a/wiggle.c b/wiggle.c
new file mode 100644
index 0000000..a3cba3e
--- /dev/null
+++ b/wiggle.c
@@ -0,0 +1,813 @@
+/*
+ * wiggle - apply rejected patches
+ *
+ * Copyright (C) 2003 Neil Brown <neilb@cse.unsw.edu.au>
+ * Copyright (C) 2010 Neil Brown <neilb@suse.de>
+ *
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Author: Neil Brown
+ * Email: <neilb@suse.de>
+ */
+
+/*
+ * Wiggle is a tool for working with patches that don't quite apply properly.
+ * It provides functionality similar to 'diff' and 'merge' but can
+ * work at the level of individual words thus allowing the merging of
+ * two changes that affect the same line, but not the same parts of that line.
+ *
+ * Wiggle can also read patch and merge files. Unlike 'merge' it does not
+ * need to be given three separate files, but can be given a file and a patch
+ * and it will extract the pieces of the two other files that it needs from
+ * the patch.
+ *
+ * Wiggle performs one of three core functions:
+ * --extract -x extract part of a patch or merge file
+ * --diff -d report differences between two files
+ * --merge -m merge the changes between two files into a third file
+ *
+ * There is also a --browse (-B) mode which provides interactive access
+ * to the merger.
+ *
+ * To perform these, wiggle requires 1, 2, or 3 input streams respectively.
+ * It can get these from individual files, from a diff (unified or context) or
+ * from a merge file.
+ *
+ * For merge:
+ * If one file is given, it is a merge file (output of 'merge').
+ * If two files are given, the second is assumed to be a patch,
+ * the first is a normal file.
+ * If three files are given, they are taken to be normal files.
+ *
+ * For diff:
+ * If one file is given, it is a patch
+ * If two files are given, they are normal files.
+ *
+ * For extract:
+ * Only one file can be given. -p indicates it is a patch,
+ * otherwise it is a merge.
+ * One of the flags -1 -2 or -3 must also be given and they indicate which
+ * part of the patch or merge to extract.
+ *
+ * Difference calculation and merging is performed on lines (-l) or words (-w).
+ * Each 'word' is either 1/ all alphanumeric (or '_'), 2/ all space or tab,
+ * or 3/ any other single character.
+ *
+ * In the case of -w, an initial diff is computed based on non-trivial words
+ * which includes alphanumeric words and newlines.
+ *
+ * This diff is computed from the ends of the file and is used to find
+ * a suitable starting point and range. Then a more precise diff is
+ * computed over that restricted range.
+ *
+ * Other options available are:
+ * --replace -r replace first file with result of merge.
+ * --help -h provide help
+ *    --version -V   version
+ *
+ * Defaults are --merge --words
+ *
+ */
+#define _GNU_SOURCE
+#include "wiggle.h"
+#include <errno.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <ctype.h>
+
+char *Cmd = "wiggle";
+int do_trace = 0;
+
+void die()
+{
+ fprintf(stderr, "%s: fatal error\n", Cmd);
+ abort();
+ exit(3);
+}
+
+void *xmalloc(int size)
+{
+ void *rv = malloc(size);
+ if (size && !rv) {
+ char *msg = "Failed to allocate memory - aborting\n";
+ write(2, msg, strlen(msg));
+ exit(3);
+ }
+ return rv;
+}
+
+void printword(FILE *f, struct elmnt e)
+{
+ if (e.start[0])
+ fprintf(f, "%.*s", e.len, e.start);
+ else {
+ int a, b, c;
+ sscanf(e.start+1, "%d %d %d", &a, &b, &c);
+ fprintf(f, "*** %d,%d **** %d\n", b, c, a);
+ }
+}
+
+static void printsep(struct elmnt e1, struct elmnt e2)
+{
+ int a, b, c, d, e, f;
+ sscanf(e1.start+1, "%d %d %d", &a, &b, &c);
+ sscanf(e2.start+1, "%d %d %d", &d, &e, &f);
+ printf("@@ -%d,%d +%d,%d @@\n", b, c, e, f);
+}
+
+static int extract(int argc, char *argv[], int ispatch, int which)
+{
+ /* extract a branch of a diff or diff3 or merge output
+ * We need one file
+ */
+ struct stream f, flist[3];
+
+ if (argc == 0) {
+ fprintf(stderr,
+ "%s: no file given for --extract\n", Cmd);
+ return 2;
+ }
+ if (argc > 1) {
+ fprintf(stderr,
+ "%s: only give one file for --extract\n", Cmd);
+ return 2;
+ }
+ f = load_file(argv[0]);
+ if (f.body == NULL) {
+ fprintf(stderr,
+ "%s: cannot load file '%s' - %s\n", Cmd,
+ argv[0], strerror(errno));
+ return 2;
+ }
+ if (ispatch) {
+ if (split_patch(f, &flist[0], &flist[1]) == 0) {
+ fprintf(stderr,
+ "%s: No chunk found in patch: %s\n", Cmd,
+ argv[0]);
+ return 0;
+ }
+ } else {
+ if (!split_merge(f, &flist[0], &flist[1], &flist[2])) {
+ fprintf(stderr,
+ "%s: merge file %s looks bad.\n", Cmd,
+ argv[0]);
+ return 2;
+ }
+ }
+ if (flist[which-'1'].body == NULL) {
+ fprintf(stderr,
+ "%s: %s has no -%c component.\n", Cmd,
+ argv[0], which);
+ return 2;
+ } else {
+ if (write(1, flist[which-'1'].body,
+ flist[which-'1'].len)
+ != flist[which-'1'].len)
+ return 2;
+ }
+ return 0;
+}
+
+static int do_diff_lines(struct file fl[2], struct csl *csl)
+{
+ int a, b;
+ int exit_status = 0;
+ a = b = 0;
+ while (a < fl[0].elcnt || b < fl[1].elcnt) {
+ if (a < csl->a) {
+ if (fl[0].list[a].start[0]) {
+ printf("-");
+ printword(stdout,
+ fl[0].list[a]);
+ }
+ a++;
+ exit_status++;
+ } else if (b < csl->b) {
+ if (fl[1].list[b].start[0]) {
+ printf("+");
+ printword(stdout,
+ fl[1].list[b]);
+ }
+ b++;
+ exit_status++;
+ } else {
+ if (fl[0].list[a].start[0] == '\0')
+ printsep(fl[0].list[a],
+ fl[1].list[b]);
+ else {
+ printf(" ");
+ printword(stdout,
+ fl[0].list[a]);
+ }
+ a++;
+ b++;
+ if (a >= csl->a+csl->len)
+ csl++;
+ }
+ }
+ return exit_status;
+}
+
+static int do_diff_words(struct file fl[2], struct csl *csl)
+{
+ int a, b;
+ int exit_status = 0;
+ int sol = 1; /* start of line */
+ a = b = 0;
+ while (a < fl[0].elcnt || b < fl[1].elcnt) {
+ if (a < csl->a) {
+ exit_status++;
+ if (sol) {
+ int a1;
+ /* If we remove a
+ * whole line, output
+ * +line else clear
+ * sol and retry */
+ sol = 0;
+ for (a1 = a; a1 < csl->a ; a1++)
+ if (ends_line(fl[0].list[a1])) {
+ sol = 1;
+ break;
+ }
+ if (sol) {
+ printf("-");
+ for (; a < csl->a ; a++) {
+ printword(stdout, fl[0].list[a]);
+ if (ends_line(fl[0].list[a])) {
+ a++;
+ break;
+ }
+ }
+ } else
+ printf("|");
+ }
+ if (!sol) {
+ printf("<<<--");
+ do {
+ if (sol)
+ printf("|");
+ printword(stdout, fl[0].list[a]);
+ sol = ends_line(fl[0].list[a]);
+ a++;
+ } while (a < csl->a);
+ printf("%s-->>>", sol ? "|" : "");
+ sol = 0;
+ }
+ } else if (b < csl->b) {
+ exit_status++;
+ if (sol) {
+ int b1;
+ sol = 0;
+ for (b1 = b; b1 < csl->b; b1++)
+ if (ends_line(fl[1].list[b1])) {
+ sol = 1;
+ break;
+ }
+ if (sol) {
+ printf("+");
+ for (; b < csl->b ; b++) {
+ printword(stdout, fl[1].list[b]);
+ if (ends_line(fl[1].list[b])) {
+ b++;
+ break;
+ }
+ }
+ } else
+ printf("|");
+ }
+ if (!sol) {
+ printf("<<<++");
+ do {
+ if (sol)
+ printf("|");
+ printword(stdout, fl[1].list[b]);
+ sol = ends_line(fl[1].list[b]);
+ b++;
+ } while (b < csl->b);
+ printf("%s++>>>", sol ? "|" : "");
+ sol = 0;
+ }
+ } else {
+ if (sol) {
+ int a1;
+ sol = 0;
+ for (a1 = a; a1 < csl->a+csl->len; a1++)
+ if (ends_line(fl[0].list[a1]))
+ sol = 1;
+ if (sol) {
+ if (fl[0].list[a].start[0]) {
+ printf(" ");
+ for (; a < csl->a+csl->len; a++, b++) {
+ printword(stdout, fl[0].list[a]);
+ if (ends_line(fl[0].list[a])) {
+ a++, b++;
+ break;
+ }
+ }
+ } else {
+ printsep(fl[0].list[a], fl[1].list[b]);
+ a++; b++;
+ }
+ } else
+ printf("|");
+ }
+ if (!sol) {
+ printword(stdout, fl[0].list[a]);
+ if (ends_line(fl[0].list[a]))
+ sol = 1;
+ a++;
+ b++;
+ }
+ if (a >= csl->a+csl->len)
+ csl++;
+ }
+ }
+ return exit_status;
+}
+
+static int do_diff(int argc, char *argv[], int obj, int ispatch,
+ int which, int reverse)
+{
+ /* create a diff (line or char) of two streams */
+ struct stream f, flist[3];
+ int chunks1 = 0, chunks2 = 0, chunks3 = 0;
+ int exit_status = 0;
+ struct file fl[2];
+ struct csl *csl;
+
+ switch (argc) {
+ case 0:
+ fprintf(stderr, "%s: no file given for --diff\n", Cmd);
+ return 2;
+ case 1:
+ f = load_file(argv[0]);
+ if (f.body == NULL) {
+ fprintf(stderr,
+ "%s: cannot load file '%s' - %s\n", Cmd,
+ argv[0], strerror(errno));
+ return 2;
+ }
+ chunks1 = chunks2 =
+ split_patch(f, &flist[0], &flist[1]);
+ if (!flist[0].body || !flist[1].body) {
+ fprintf(stderr,
+ "%s: couldn't parse patch %s\n", Cmd,
+ argv[0]);
+ return 2;
+ }
+ break;
+ case 2:
+ flist[0] = load_file(argv[0]);
+ if (flist[0].body == NULL) {
+ fprintf(stderr,
+ "%s: cannot load file '%s' - %s\n", Cmd,
+ argv[0], strerror(errno));
+ return 2;
+ }
+ if (ispatch) {
+ f = load_file(argv[1]);
+ if (f.body == NULL) {
+ fprintf(stderr,
+ "%s: cannot load patch"
+ " '%s' - %s\n", Cmd,
+ argv[1], strerror(errno));
+ return 2;
+ }
+ if (which == '2')
+ chunks2 = chunks3 =
+ split_patch(f, &flist[2],
+ &flist[1]);
+ else
+ chunks2 = chunks3 =
+ split_patch(f, &flist[1],
+ &flist[2]);
+
+ } else
+ flist[1] = load_file(argv[1]);
+ if (flist[1].body == NULL) {
+ fprintf(stderr,
+ "%s: cannot load file"
+ " '%s' - %s\n", Cmd,
+ argv[1], strerror(errno));
+ return 2;
+ }
+ break;
+ default:
+ fprintf(stderr,
+ "%s: too many files given for --diff\n", Cmd);
+ return 2;
+ }
+ if (reverse) {
+ f = flist[1];
+ flist[1] = flist[2];
+ flist[2] = f;
+ }
+ fl[0] = split_stream(flist[0], obj == 'l' ? ByLine : ByWord);
+ fl[1] = split_stream(flist[1], obj == 'l' ? ByLine : ByWord);
+ if (chunks2 && !chunks1)
+ csl = pdiff(fl[0], fl[1], chunks2);
+ else
+ csl = diff(fl[0], fl[1]);
+ if (obj == 'l') {
+ if (!chunks1)
+ printf("@@ -1,%d +1,%d @@\n",
+ fl[0].elcnt, fl[1].elcnt);
+ exit_status = do_diff_lines(fl, csl);
+ } else {
+ if (!chunks1) {
+ /* count lines in each file */
+ int l1, l2, i;
+ l1 = l2 = 0;
+ for (i = 0 ; i < fl[0].elcnt ; i++)
+ if (ends_line(fl[0].list[i]))
+ l1++;
+ for (i = 0 ; i < fl[1].elcnt ; i++)
+ if (ends_line(fl[1].list[i]))
+ l2++;
+ printf("@@ -1,%d +1,%d @@\n", l1, l2);
+ }
+ exit_status = do_diff_words(fl, csl);
+ }
+ return exit_status;
+}
+
+static int do_merge(int argc, char *argv[], int obj,
+ int reverse, int replace, int ignore, int show_wiggles,
+ int quiet)
+{
+ /* merge three files, A B C, so changed between B and C get made to A
+ */
+ struct stream f, flist[3];
+ struct file fl[3];
+ int i;
+ int chunks1 = 0, chunks2 = 0, chunks3 = 0;
+ char *replacename = NULL, *orignew = NULL;
+ struct csl *csl1, *csl2;
+ struct ci ci;
+ FILE *outfile = stdout;
+
+ switch (argc) {
+ case 0:
+ fprintf(stderr, "%s: no files given for --merge\n", Cmd);
+ return 2;
+ case 3:
+ case 2:
+ case 1:
+ for (i = 0; i < argc; i++) {
+ flist[i] = load_file(argv[i]);
+ if (flist[i].body == NULL) {
+ fprintf(stderr, "%s: cannot load file '%s' - %s\n",
+ Cmd,
+ argv[i], strerror(errno));
+ return 2;
+ }
+ }
+ break;
+ default:
+ fprintf(stderr, "%s: too many files given for --merge\n",
+ Cmd);
+ return 2;
+ }
+ switch (argc) {
+ case 1: /* a merge file */
+ f = flist[0];
+ if (!split_merge(f, &flist[0], &flist[1], &flist[2])) {
+ fprintf(stderr, "%s: merge file %s looks bad.\n",
+ Cmd,
+ argv[0]);
+ return 2;
+ }
+ break;
+ case 2: /* a file and a patch */
+ f = flist[1];
+ chunks2 = chunks3 = split_patch(f, &flist[1], &flist[2]);
+ break;
+ case 3: /* three separate files */
+ break;
+ }
+ if (reverse) {
+ f = flist[1];
+ flist[1] = flist[2];
+ flist[2] = f;
+ }
+
+ for (i = 0; i < 3; i++) {
+ if (flist[i].body == NULL) {
+ fprintf(stderr, "%s: file %d missing\n", Cmd, i);
+ return 2;
+ }
+ }
+ if (replace) {
+ int fd;
+ replacename = xmalloc(strlen(argv[0]) + 20);
+ orignew = xmalloc(strlen(argv[0]) + 20);
+ strcpy(replacename, argv[0]);
+ strcpy(orignew, argv[0]);
+ strcat(orignew, ".porig");
+ if (open(orignew, O_RDONLY) >= 0 ||
+ errno != ENOENT) {
+ fprintf(stderr, "%s: %s already exists\n",
+ Cmd,
+ orignew);
+ return 2;
+ }
+ strcat(replacename, "XXXXXX");
+ fd = mkstemp(replacename);
+ if (fd == -1) {
+ fprintf(stderr,
+ "%s: could not create temporary file for %s\n",
+ Cmd,
+ replacename);
+ return 2;
+ }
+ outfile = fdopen(fd, "w");
+ }
+
+ if (obj == 'l') {
+ fl[0] = split_stream(flist[0], ByLine);
+ fl[1] = split_stream(flist[1], ByLine);
+ fl[2] = split_stream(flist[2], ByLine);
+ } else {
+ fl[0] = split_stream(flist[0], ByWord);
+ fl[1] = split_stream(flist[1], ByWord);
+ fl[2] = split_stream(flist[2], ByWord);
+ }
+ if (chunks2 && !chunks1)
+ csl1 = pdiff(fl[0], fl[1], chunks2);
+ else
+ csl1 = diff(fl[0], fl[1]);
+ csl2 = diff(fl[1], fl[2]);
+
+ ci = print_merge2(outfile, &fl[0], &fl[1], &fl[2],
+ csl1, csl2, obj == 'w',
+ ignore, show_wiggles);
+ if (!quiet && ci.conflicts)
+ fprintf(stderr,
+ "%d unresolved conflict%s found\n",
+ ci.conflicts,
+ ci.conflicts == 1 ? "" : "s");
+ if (!quiet && ci.ignored)
+ fprintf(stderr,
+ "%d already-applied change%s ignored\n",
+ ci.ignored,
+ ci.ignored == 1 ? "" : "s");
+
+ if (replace) {
+ fclose(outfile);
+ if (rename(argv[0], orignew) == 0 &&
+ rename(replacename, argv[0]) == 0)
+ /* all ok */;
+ else {
+ fprintf(stderr,
+ "%s: failed to move new file into place.\n",
+ Cmd);
+ return 2;
+ }
+ }
+ return (ci.conflicts > 0);
+}
+
+static int multi_merge(int argc, char *argv[], int obj,
+ int reverse, int ignore, int show_wiggles,
+ int replace, int strip,
+ int quiet)
+{
+ FILE *f;
+ char *filename;
+ struct plist *pl;
+ int num_patches;
+ int rv = 0;
+ int i;
+
+ if (!replace) {
+ fprintf(stderr,
+ "%s: -p in merge mode requires -r\n",
+ Cmd);
+ return 2;
+ }
+ if (argc != 1) {
+ fprintf(stderr,
+ "%s: -p in merge mode requires exactly one file\n",
+ Cmd);
+ return 2;
+ }
+ filename = argv[0];
+ f = fopen(filename, "r");
+ if (!f) {
+ fprintf(stderr, "%s: cannot open %s\n",
+ Cmd, filename);
+ return 2;
+ }
+ pl = parse_patch(f, NULL, &num_patches);
+ fclose(f);
+ if (set_prefix(pl, num_patches, strip) == 0) {
+ fprintf(stderr, "%s: aborting\n", Cmd);
+ return 2;
+ }
+ for (i = 0; i < num_patches; i++) {
+ char *name;
+ char *av[2];
+ asprintf(&name, "_wiggle_:%d:%d:%s",
+ pl[i].start, pl[i].end, filename);
+ av[0] = pl[i].file;
+ av[1] = name;
+ rv |= do_merge(2, av, obj, reverse, 1, ignore,
+ show_wiggles, quiet);
+ }
+ return rv;
+}
+
+int main(int argc, char *argv[])
+{
+ int opt;
+ int option_index;
+ int mode = 0;
+ int obj = 0;
+ int replace = 0;
+ int which = 0;
+ int ispatch = 0;
+ int reverse = 0;
+ int verbose = 0, quiet = 0;
+ int strip = -1;
+ int exit_status = 0;
+ int ignore = 1;
+ int show_wiggles = 0;
+ char *helpmsg;
+ char *trace;
+
+ trace = getenv("WIGGLE_TRACE");
+ if (trace && *trace)
+ do_trace = 1;
+
+ while ((opt = getopt_long(argc, argv,
+ short_options, long_options,
+ &option_index)) != -1)
+ switch (opt) {
+ case 'h':
+ helpmsg = Help;
+ switch (mode) {
+ case 'x':
+ helpmsg = HelpExtract;
+ break;
+ case 'd':
+ helpmsg = HelpDiff;
+ break;
+ case 'm':
+ helpmsg = HelpMerge;
+ break;
+ case 'B':
+ helpmsg = HelpBrowse;
+ break;
+ }
+ fputs(helpmsg, stderr);
+ exit(0);
+
+ case 'V':
+ fputs(Version, stderr);
+ exit(0);
+ case ':':
+ case '?':
+ default:
+ fputs(Usage, stderr);
+ exit(2);
+
+ case 'B':
+ case 'x':
+ case 'd':
+ case 'm':
+ if (mode == 0) {
+ mode = opt;
+ continue;
+ }
+ fprintf(stderr,
+ "%s: mode is '%c' - cannot set to '%c'\n",
+ Cmd, mode, opt);
+ exit(2);
+
+ case 'w':
+ case 'l':
+ if (obj == 0 || obj == opt) {
+ obj = opt;
+ continue;
+ }
+ fprintf(stderr,
+ "%s: cannot select both words and lines.\n", Cmd);
+ exit(2);
+
+ case 'r':
+ replace = 1;
+ continue;
+ case 'R':
+ reverse = 1;
+ continue;
+
+ case 'i':
+ ignore = 0;
+ continue;
+ case 'W':
+ show_wiggles = 1;
+ ignore = 0;
+ continue;
+
+ case '1':
+ case '2':
+ case '3':
+ if (which == 0 || which == opt) {
+ which = opt;
+ continue;
+ }
+ fprintf(stderr,
+ "%s: can only select one of -1, -2, -3\n", Cmd);
+ exit(2);
+
+ case 'p': /* 'patch' or 'strip' */
+ if (optarg)
+ strip = atol(optarg);
+ ispatch = 1;
+ continue;
+
+ case 'v':
+ verbose++;
+ continue;
+ case 'q':
+ quiet = 1;
+ continue;
+ }
+ if (!mode)
+ mode = 'm';
+
+ if (mode == 'B') {
+ vpatch(argc-optind, argv+optind, ispatch,
+ strip, reverse, replace);
+ /* should not return */
+ exit(1);
+ }
+
+ if (obj && mode == 'x') {
+ fprintf(stderr,
+ "%s: cannot specify --line or --word with --extract\n",
+ Cmd);
+ exit(2);
+ }
+ if (mode != 'm' && !obj)
+ obj = 'w';
+ if (replace && mode != 'm') {
+ fprintf(stderr,
+ "%s: --replace only allowed with --merge\n", Cmd);
+ exit(2);
+ }
+ if (mode == 'x' && !which) {
+ fprintf(stderr,
+ "%s: must specify -1, -2 or -3 with --extract\n", Cmd);
+ exit(2);
+ }
+ if (mode != 'x' && mode != 'd' && which) {
+ fprintf(stderr,
+ "%s: -1, -2 or -3 only allowed with --extract or --diff\n",
+ Cmd);
+ exit(2);
+ }
+
+ if (ispatch && which == '3') {
+ fprintf(stderr,
+ "%s: cannot extract -3 from a patch.\n", Cmd);
+ exit(2);
+ }
+
+ switch (mode) {
+ case 'x':
+ exit_status = extract(argc-optind, argv+optind, ispatch, which);
+ break;
+ case 'd':
+ exit_status = do_diff(argc-optind, argv+optind, obj, ispatch, which, reverse);
+ break;
+ case 'm':
+ if (ispatch)
+ exit_status = multi_merge(argc-optind,
+ argv+optind, obj,
+ reverse, ignore,
+ show_wiggles,
+ replace, strip,
+ quiet);
+ else
+ exit_status = do_merge(argc-optind, argv+optind, obj, reverse, replace,
+ ignore, show_wiggles, quiet);
+ break;
+ }
+ exit(exit_status);
+}
diff --git a/wiggle.h b/wiggle.h
new file mode 100644
index 0000000..91b4110
--- /dev/null
+++ b/wiggle.h
@@ -0,0 +1,201 @@
+/*
+ * wiggle - apply rejected patches
+ *
+ * Copyright (C) 2003 Neil Brown <neilb@cse.unsw.edu.au>
+ *
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Author: Neil Brown
+ * Email: <neilb@suse.de>
+ */
+
+#include <stdio.h>
+#include <string.h>
+#include <memory.h>
+#include <getopt.h>
+#include <stdlib.h>
+
+static inline void assert(int a)
+{
+ if (!a)
+ abort();
+}
+
+struct stream {
+ char *body;
+ int len;
+};
+
+/* an 'elmnt' is a word or a line from the file.
+ * 'start' points into a 'body' in a stream.
+ * When a stream is made of 'diff' hunks, there is a special
+ * elmnt at the start of each hunk which starts with '\0' and
+ * records the line offsets of the hunk. These are 20 bytes long.
+ * "\0\d{5} \d{5} \d{5}\n\0"
+ * The 3 numbers are: chunk number, starting line, number of lines.
+ * An element with len==0 marks EOF.
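+ * For example (illustrative only), the sentinel for hunk 3 of a stream,
+ * starting at line 42 and spanning 7 lines, carries the numbers 3, 42
+ * and 7 in those fixed-width fields; printword() and printsep() in
+ * wiggle.c recover them with sscanf("%d %d %d", ...).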
+ */
+struct elmnt {
+ char *start;
+ int hash;
+ int len;
+};
+
+static inline int match(struct elmnt *a, struct elmnt *b)
+{
+ return
+ a->hash == b->hash &&
+ a->len == b->len &&
+ strncmp(a->start, b->start, a->len) == 0;
+}
+
+/* end-of-line is important for narrowing conflicts.
+ * In line mode, every element is a line and 'ends a line'.
+ * In word mode, the newline element and the diff-hunk element
+ * end a line.
+ */
+static inline int ends_line(struct elmnt e)
+{
+ if (e.len == 20 && e.start[0] == 0)
+ return 1;
+ return e.len && e.start[e.len-1] == '\n';
+}
+
+static inline int ends_mline(struct elmnt e)
+{
+ return e.len && (e.start[0] == '\n' || e.start[0] == 0);
+}
+
+struct csl {
+ int a, b;
+ int len;
+};
+
+struct file {
+ struct elmnt *list;
+ int elcnt;
+};
+
+/* The result of a merger is a series of text sections.
+ * Each section may occur in one or more of the three streams,
+ * and may be different in different streams (e.g. for changed text)
+ * or the same.
+ * When a conflict occurs we need to treat some surrounding
+ * sections as being involved in that conflict. For
+ * line-based merging, all surrounding sections until an Unchanged
+ * section are part of the conflict - the Unchanged isn't.
+ * For word-based merging, we need to find Unchanged sections
+ * that include a newline. Further, text within the unchanged
+ * section up to the newline (in whichever direction) is treated
+ * as part of the whole conflict.
+ * Actually... it is possible for a 'Changed' section to bound
+ * a conflict as it indicates a successful match of A and B.
+ * For line-wise merges, any Changed or Unchanged section bounds a conflict
+ * For word-wise merges, any Changed or Unchanged section that matches
+ * a newline, or immediately follows a newline (in all files) can bound
+ * a conflict.
+ */
+struct merge {
+ enum mergetype {
+ End, Unmatched, Unchanged, Extraneous,
+ Changed, Conflict, AlreadyApplied,
+ } type;
+ int a, b, c; /* start of ranges */
+ int al, bl, cl; /* length of ranges */
+ int c1, c2; /* this or next common-sequence */
+ int in_conflict;
+ int lo, hi; /* region of a Changed or Unchanged that is not involved
+ * in a conflict.
+ * These are distances from start of the "before" section,
+ * not indexes into any file.
+ */
+
+};
+
+/* plist stores a list of patched files in an array
+ * Each entry identifies a file, the range of the
+ * original patch which applies to this file, some
+ * statistics concerning how many conflicts etc, and
+ * some linkage information so the list can be viewed
+ * as a directory-tree.
+ */
+struct plist {
+ char *file;
+ unsigned int start, end;
+ int parent;
+ int next, prev, last;
+ int open;
+ int chunks, wiggles, conflicts;
+ int calced;
+ int is_merge;
+ char *before, *after;
+};
+
+extern struct plist *sort_patches(struct plist *pl, int *np);
+extern struct plist *parse_patch(FILE *f, FILE *of, int *np);
+extern struct stream load_segment(FILE *f, unsigned int start,
+ unsigned int end);
+extern int set_prefix(struct plist *pl, int n, int strip);
+extern struct stream load_file(char *name);
+extern int split_patch(struct stream, struct stream*, struct stream*);
+extern int split_merge(struct stream, struct stream*, struct stream*,
+ struct stream*);
+extern struct file split_stream(struct stream s, int type);
+extern struct csl *pdiff(struct file a, struct file b, int chunks);
+extern struct csl *diff(struct file a, struct file b);
+extern struct csl *diff_partial(struct file a, struct file b,
+ int alo, int ahi, int blo, int bhi);
+extern struct csl *worddiff(struct stream f1, struct stream f2,
+ struct file *fl1p, struct file *fl2p);
+
+struct ci {
+ int conflicts, wiggles, ignored;
+ struct merge *merger;
+};
+extern struct ci print_merge2(FILE *out,
+ struct file *a, struct file *b, struct file *c,
+ struct csl *c1, struct csl *c2,
+ int words, int ignore_already, int show_wiggles);
+extern void printword(FILE *f, struct elmnt e);
+
+extern struct ci make_merger(struct file a, struct file b, struct file c,
+ struct csl *c1, struct csl *c2, int words,
+ int ignore_already, int show_wiggles);
+
+extern void die(void);
+extern void *xmalloc(int len);
+extern int do_trace;
+
+extern int vpatch(int argc, char *argv[], int patch, int strip,
+ int reverse, int replace);
+
+extern char *Cmd;
+extern char Version[];
+extern char short_options[];
+extern struct option long_options[];
+extern char Usage[];
+extern char Help[];
+extern char HelpExtract[];
+extern char HelpDiff[];
+extern char HelpMerge[];
+extern char HelpBrowse[];
+
+extern void cleanlist(struct file a, struct file b, struct csl *list);
+
+enum {
+ ByLine,
+ ByWord,
+};
diff --git a/wiggle.spec b/wiggle.spec
new file mode 100644
index 0000000..a700f1c
--- /dev/null
+++ b/wiggle.spec
@@ -0,0 +1,55 @@
+Summary: A tool for applying patches with conflicts
+Name: wiggle
+Version: 0.9.1
+Release: 1
+License: GPL
+Group: Development/Tools
+URL: http://neil.brown.name/wiggle/
+Source0: http://neil.brown.name/wiggle/%{name}-%{version}.tar.gz
+BuildRoot: %{_tmppath}/%{name}-%{version}-%{release}-buildroot
+
+%description
+Wiggle is a program for applying patches that 'patch' cannot
+apply due to conflicting changes in the original.
+
+Wiggle will always apply all changes in the patch to the original.
+If it cannot find a way to cleanly apply a patch, it inserts it
+in the original in a manner similar to 'merge', and reports an
+unresolvable conflict.
+
+%prep
+%setup -q
+
+%build
+make BINDIR=/usr/bin \
+ MANDIR=%{_mandir} MAN1DIR=%{_mandir}/man1 MAN5DIR=%{_mandir}/man5 \
+ CFLAGS="$RPM_OPT_FLAGS" \
+ wiggle
+
+%install
+rm -rf $RPM_BUILD_ROOT
+mkdir -p $RPM_BUILD_ROOT/usr/bin
+mkdir -p $RPM_BUILD_ROOT%{_mandir}/man{1,5}
+
+make BINDIR=$RPM_BUILD_ROOT/usr/bin \
+ MANDIR=$RPM_BUILD_ROOT%{_mandir} \
+ MAN1DIR=$RPM_BUILD_ROOT%{_mandir}/man1 \
+ MAN5DIR=$RPM_BUILD_ROOT%{_mandir}/man5 \
+ install
+
+%clean
+rm -rf $RPM_BUILD_ROOT
+
+%files
+%defattr(-,root,root,-)
+/usr/bin/wiggle
+%{_mandir}/man1/wiggle.1*
+%doc ANNOUNCE TODO notes
+%doc p p.help
+
+
+%changelog
+* Thu May 22 2003 Horst von Brand <vonbrand@inf.utfsm.cl> 0.6-1
+- Initial build.
+
+