summaryrefslogtreecommitdiff
path: root/src/de/lmu/ifi/dbs/elki/math/statistics/dependence
diff options
context:
space:
mode:
Diffstat (limited to 'src/de/lmu/ifi/dbs/elki/math/statistics/dependence')
-rw-r--r--src/de/lmu/ifi/dbs/elki/math/statistics/dependence/AbstractDependenceMeasure.java262
-rw-r--r--src/de/lmu/ifi/dbs/elki/math/statistics/dependence/CorrelationDependenceMeasure.java141
-rw-r--r--src/de/lmu/ifi/dbs/elki/math/statistics/dependence/DependenceMeasure.java98
-rw-r--r--src/de/lmu/ifi/dbs/elki/math/statistics/dependence/DistanceCorrelationDependenceMeasure.java206
-rw-r--r--src/de/lmu/ifi/dbs/elki/math/statistics/dependence/HSMDependenceMeasure.java245
-rw-r--r--src/de/lmu/ifi/dbs/elki/math/statistics/dependence/HiCSDependenceMeasure.java243
-rw-r--r--src/de/lmu/ifi/dbs/elki/math/statistics/dependence/HoeffdingsDDependenceMeasure.java207
-rw-r--r--src/de/lmu/ifi/dbs/elki/math/statistics/dependence/JensenShannonEquiwidthDependenceMeasure.java143
-rw-r--r--src/de/lmu/ifi/dbs/elki/math/statistics/dependence/MCEDependenceMeasure.java274
-rw-r--r--src/de/lmu/ifi/dbs/elki/math/statistics/dependence/MutualInformationEquiwidthDependenceMeasure.java137
-rw-r--r--src/de/lmu/ifi/dbs/elki/math/statistics/dependence/SURFINGDependenceMeasure.java133
-rw-r--r--src/de/lmu/ifi/dbs/elki/math/statistics/dependence/SlopeDependenceMeasure.java153
-rw-r--r--src/de/lmu/ifi/dbs/elki/math/statistics/dependence/SlopeInversionDependenceMeasure.java157
-rw-r--r--src/de/lmu/ifi/dbs/elki/math/statistics/dependence/SpearmanCorrelationDependenceMeasure.java78
-rw-r--r--src/de/lmu/ifi/dbs/elki/math/statistics/dependence/package-info.java26
15 files changed, 2503 insertions, 0 deletions
diff --git a/src/de/lmu/ifi/dbs/elki/math/statistics/dependence/AbstractDependenceMeasure.java b/src/de/lmu/ifi/dbs/elki/math/statistics/dependence/AbstractDependenceMeasure.java
new file mode 100644
index 00000000..ba23588e
--- /dev/null
+++ b/src/de/lmu/ifi/dbs/elki/math/statistics/dependence/AbstractDependenceMeasure.java
@@ -0,0 +1,262 @@
+package de.lmu.ifi.dbs.elki.math.statistics.dependence;
+
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2014
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+import java.util.Collection;
+import java.util.Iterator;
+import java.util.List;
+
+import de.lmu.ifi.dbs.elki.math.MathUtil;
+import de.lmu.ifi.dbs.elki.utilities.datastructures.arraylike.ArrayLikeUtil;
+import de.lmu.ifi.dbs.elki.utilities.datastructures.arraylike.NumberArrayAdapter;
+import de.lmu.ifi.dbs.elki.utilities.datastructures.arrays.IntegerArrayQuickSort;
+import de.lmu.ifi.dbs.elki.utilities.datastructures.arrays.IntegerComparator;
+import de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException;
+
+/**
+ * Abstract base class for dependence measures.
+ *
+ * @author Erich Schubert
+ */
public abstract class AbstractDependenceMeasure implements DependenceMeasure {
  // In your subclass, you will need to implement at least this method:
  // Sorry for the complex use of generics, but this way we can use it with any
  // type of array, including vectors in rows or columns etc.
  @Override
  abstract public <A, B> double dependence(NumberArrayAdapter<?, A> adapter1, A data1, NumberArrayAdapter<?, B> adapter2, B data2);

  @Override
  public double dependence(double[] data1, double[] data2) {
    // Convenience method: delegate to the generic version using the shared
    // double[] adapter for both arguments.
    return dependence(ArrayLikeUtil.DOUBLEARRAYADAPTER, data1, ArrayLikeUtil.DOUBLEARRAYADAPTER, data2);
  }

  @Override
  public <A> double dependence(NumberArrayAdapter<?, A> adapter, A data1, A data2) {
    // Convenience method: same adapter for both arrays.
    return dependence(adapter, data1, adapter, data2);
  }

  @Override
  public <A> double[] dependence(NumberArrayAdapter<?, A> adapter, List<? extends A> data) {
    final int dims = data.size();
    // Serialized lower triangular matrix (without the diagonal), in the layout
    // documented on the DependenceMeasure interface; see also index(x, y).
    double[] out = new double[(dims * (dims - 1)) >> 1];
    int o = 0;
    for(int y = 1; y < dims; y++) {
      A dy = data.get(y);
      for(int x = 0; x < y; x++) {
        out[o++] = dependence(adapter, data.get(x), adapter, dy);
      }
    }
    return out;
  }

  /**
   * Clamp values to a given minimum and maximum.
   *
   * @param value True value
   * @param min Minimum
   * @param max Maximum
   * @return {@code value}, unless smaller than {@code min} or larger than
   *         {@code max}.
   */
  protected static double clamp(double value, double min, double max) {
    return value < min ? min : value > max ? max : value;
  }

  /**
   * Compute ranks of all objects, normalized to [0;1] (where 0 is the smallest
   * value, 1 is the largest).
   *
   * Tied values all receive the same score: the average of their positions,
   * normalized by (len - 1).
   *
   * NOTE(review): for len == 1 the normalization factor divides by zero —
   * confirm that all callers guarantee len &gt; 1.
   *
   * @param adapter Data adapter
   * @param data Data array
   * @param len Length of data
   * @param <A> Array type
   * @return Array of scores
   */
  protected static <A> double[] computeNormalizedRanks(final NumberArrayAdapter<?, A> adapter, final A data, int len) {
    // Sort the objects:
    int[] s1 = sortedIndex(adapter, data, len);
    // .5 factor: we average the first and last position of each tie group.
    final double norm = .5 / (len - 1);
    double[] ret = new double[len];
    for(int i = 0; i < len;) {
      final int start = i++;
      final double val = adapter.getDouble(data, s1[start]);
      // Data is sorted, so <= here means "equal": advance over the tie group.
      while(i < len && adapter.getDouble(data, s1[i]) <= val) {
        i++;
      }
      // Average position of the tie group [start, i-1], times the norm factor.
      final double score = (start + i - 1) * norm;
      for(int j = start; j < i; j++) {
        ret[s1[j]] = score;
      }
    }
    return ret;
  }

  /**
   * Compute ranks of all objects, ranging from 1 to len.
   *
   * Ties are given the average rank.
   *
   * @param adapter Data adapter
   * @param data Data array
   * @param len Length of data
   * @param <A> Array type
   * @return Array of scores
   */
  protected static <A> double[] ranks(final NumberArrayAdapter<?, A> adapter, final A data, int len) {
    // Sort the objects:
    int[] s1 = sortedIndex(adapter, data, len);
    return ranks(adapter, data, s1);
  }

  /**
   * Compute ranks of all objects, ranging from 1 to len.
   *
   * Ties are given the average rank.
   *
   * @param adapter Data adapter
   * @param data Data array
   * @param idx Data index (sorted by value, as from {@link #sortedIndex})
   * @param <A> Array type
   * @return Array of scores
   */
  protected static <A> double[] ranks(final NumberArrayAdapter<?, A> adapter, final A data, int[] idx) {
    final int len = idx.length;
    double[] ret = new double[len];
    for(int i = 0; i < len;) {
      final int start = i++;
      final double val = adapter.getDouble(data, idx[start]);
      // Include ties:
      while(i < len && adapter.getDouble(data, idx[i]) <= val) {
        i++;
      }
      // Average 1-based rank of the tie group [start+1, i]:
      // ((start+1) + i) / 2 == (start + i - 1) * .5 + 1
      final double score = (start + i - 1) * .5 + 1;
      for(int j = start; j < i; j++) {
        ret[idx[j]] = score;
      }
    }
    return ret;
  }

  /**
   * Build a sorted index of objects (ascending by value; NaNs sort last, per
   * {@link Double#compare} semantics).
   *
   * @param adapter Data adapter
   * @param data Data array
   * @param len Length of data
   * @param <A> Array type
   * @return Sorted index
   */
  protected static <A> int[] sortedIndex(final NumberArrayAdapter<?, A> adapter, final A data, int len) {
    int[] s1 = MathUtil.sequence(0, len);
    IntegerArrayQuickSort.sort(s1, new IntegerComparator() {
      @Override
      public int compare(int x, int y) {
        return Double.compare(adapter.getDouble(data, x), adapter.getDouble(data, y));
      }
    });
    return s1;
  }

  /**
   * Discretize a data set into equi-width bin numbers.
   *
   * Bin numbers are clamped into [0; bins-1]; a constant attribute maps
   * everything to bin 0.
   *
   * @param adapter Data adapter
   * @param data Data array
   * @param len Length of data
   * @param bins Number of bins
   * @param <A> Array type
   * @return Array of bin numbers [0;bin[
   */
  protected static <A> int[] discretize(NumberArrayAdapter<?, A> adapter, A data, final int len, final int bins) {
    // First pass: value range of the attribute.
    double min = adapter.getDouble(data, 0), max = min;
    for(int i = 1; i < len; i++) {
      double v = adapter.getDouble(data, i);
      if(v < min) {
        min = v;
      }
      else if(v > max) {
        max = v;
      }
    }
    final double scale = (max > min) ? bins / (max - min) : 1;
    int[] discData = new int[len];
    for(int i = 0; i < len; i++) {
      int bin = (int) Math.floor((adapter.getDouble(data, i) - min) * scale);
      // Clamp: the maximum value would otherwise map to bin == bins.
      discData[i] = bin < 0 ? 0 : bin >= bins ? bins - 1 : bin;
    }
    return discData;
  }

  /**
   * Index into the serialized lower triangular matrix (without diagonal), as
   * produced by {@link #dependence(NumberArrayAdapter, List)}.
   *
   * Row y has y entries before it, at offset y*(y-1)/2.
   *
   * @param x Column
   * @param y Row
   * @return Index in serialized array
   */
  protected static int index(int x, int y) {
    assert (x < y) : "Only lower triangle is allowed.";
    return ((y * (y - 1)) >> 1) + x;
  }

  /**
   * Validate the length of the two data sets (must be the same, and non-zero)
   *
   * @param adapter1 First data adapter
   * @param data1 First data set
   * @param adapter2 Second data adapter
   * @param data2 Second data set
   * @param <A> First array type
   * @param <B> Second array type
   * @return Shared length of the two arrays
   */
  public static <A, B> int size(NumberArrayAdapter<?, A> adapter1, A data1, NumberArrayAdapter<?, B> adapter2, B data2) {
    final int len = adapter1.size(data1);
    if(len != adapter2.size(data2)) {
      throw new AbortException("Array sizes do not match!");
    }
    if(len == 0) {
      throw new AbortException("Empty array!");
    }
    return len;
  }

  /**
   * Validate the length of a collection of data sets (must all be the same,
   * and non-zero), and require at least two of them.
   *
   * @param adapter Data adapter
   * @param data Data sets
   * @param <A> First array type
   * @return Shared length of all arrays
   */
  public static <A> int size(NumberArrayAdapter<?, A> adapter, Collection<? extends A> data) {
    if(data.size() < 2) {
      throw new AbortException("Need at least two axes to compute dependence measures.");
    }
    Iterator<? extends A> iter = data.iterator();
    final int len = adapter.size(iter.next());
    while(iter.hasNext()) {
      if(len != adapter.size(iter.next())) {
        throw new AbortException("Array sizes do not match!");
      }
    }
    if(len == 0) {
      throw new AbortException("Empty array!");
    }
    return len;
  }
}
diff --git a/src/de/lmu/ifi/dbs/elki/math/statistics/dependence/CorrelationDependenceMeasure.java b/src/de/lmu/ifi/dbs/elki/math/statistics/dependence/CorrelationDependenceMeasure.java
new file mode 100644
index 00000000..e03601f7
--- /dev/null
+++ b/src/de/lmu/ifi/dbs/elki/math/statistics/dependence/CorrelationDependenceMeasure.java
@@ -0,0 +1,141 @@
+package de.lmu.ifi.dbs.elki.math.statistics.dependence;
+
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2014
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+import java.util.List;
+
+import de.lmu.ifi.dbs.elki.logging.Logging;
+import de.lmu.ifi.dbs.elki.utilities.datastructures.arraylike.NumberArrayAdapter;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
+
+/**
+ * Pearson product-moment correlation coefficient.
+ *
+ * @author Erich Schubert
+ */
+public class CorrelationDependenceMeasure extends AbstractDependenceMeasure {
+ /**
+ * Class logger.
+ */
+ private static final Logging LOG = Logging.getLogger(CorrelationDependenceMeasure.class);
+
+ /**
+ * Static instance.
+ */
+ public static final CorrelationDependenceMeasure STATIC = new CorrelationDependenceMeasure();
+
+ /**
+ * Constructor - use {@link #STATIC} instance.
+ */
+ protected CorrelationDependenceMeasure() {
+ super();
+ }
+
+ @Override
+ public <A, B> double dependence(NumberArrayAdapter<?, A> adapter1, A data1, NumberArrayAdapter<?, B> adapter2, B data2) {
+ final int len = size(adapter1, data1, adapter2, data2);
+ // Perform two-pass estimation, which is numerically stable and often faster
+ // than the Knuth-Welford approach (see PearsonCorrelation class)
+ double m1 = 0., m2 = 0;
+ for(int i = 0; i < len; i++) {
+ m1 += adapter1.getDouble(data1, i);
+ m2 += adapter2.getDouble(data2, i);
+ }
+ m1 /= len;
+ m2 /= len;
+ // Second pass: variances and covariance
+ double v1 = 0., v2 = 0., cov = 0.;
+ for(int i = 0; i < len; i++) {
+ double d1 = adapter1.getDouble(data1, i) - m1;
+ double d2 = adapter2.getDouble(data2, i) - m2;
+ v1 += d1 * d1;
+ v2 += d2 * d2;
+ cov += d1 * d2;
+ }
+ // Note: we did not normalize by len, as this cancels out.
+ return cov / Math.sqrt(v1 * v2);
+ }
+
+ @Override
+ public <A> double[] dependence(NumberArrayAdapter<?, A> adapter, List<? extends A> data) {
+ final int dims = data.size();
+ final int len = size(adapter, data);
+ double[] means = new double[dims];
+ // Two passes - often faster due to the lower numerical cost
+ // And accurate, don't use sum-of-squares.
+ for(int j = 0; j < dims; j++) {
+ double m = 0.;
+ A da = data.get(j);
+ for(int i = 0; i < len; i++) {
+ m += adapter.getDouble(da, i);
+ }
+ means[j] = m / len;
+ }
+ // Build the covariance matrix, lower triangular half
+ double[] vst = new double[dims];
+ double[] cov = new double[(dims * (dims - 1)) >> 1];
+ double[] buf = new double[dims];
+ for(int i = 0; i < len; i++) {
+ for(int j = 0; j < dims; j++) {
+ buf[j] = adapter.getDouble(data.get(j), i) - means[j];
+ }
+ for(int y = 0, c = 0; y < dims; y++) {
+ for(int x = 0; x < y; x++) {
+ cov[c++] += buf[x] * buf[y];
+ }
+ vst[y] += buf[y] * buf[y];
+ }
+ }
+ // Compute standard deviations (times sqrt(len)!):
+ for(int y = 0; y < dims; y++) {
+ if(vst[y] == 0.) {
+ LOG.warning("Correlation is not well defined for constant attributes.");
+ }
+ vst[y] = Math.sqrt(vst[y]);
+ }
+ for(int y = 1, c = 0; y < dims; y++) {
+ for(int x = 0; x < y; x++) {
+ // We don't need to divide by sqrt(len), because it will cancel out with
+ // the division we skipped just above.
+ cov[c] = cov[c] / (vst[x] * vst[y]);
+ c++;
+ }
+ }
+ return cov;
+ }
+
+ /**
+ * Parameterization class
+ *
+ * @author Erich Schubert
+ *
+ * @apiviz.exclude
+ */
+ public static class Parameterizer extends AbstractParameterizer {
+ @Override
+ protected CorrelationDependenceMeasure makeInstance() {
+ return STATIC;
+ }
+ }
+}
diff --git a/src/de/lmu/ifi/dbs/elki/math/statistics/dependence/DependenceMeasure.java b/src/de/lmu/ifi/dbs/elki/math/statistics/dependence/DependenceMeasure.java
new file mode 100644
index 00000000..0e9e02c0
--- /dev/null
+++ b/src/de/lmu/ifi/dbs/elki/math/statistics/dependence/DependenceMeasure.java
@@ -0,0 +1,98 @@
+package de.lmu.ifi.dbs.elki.math.statistics.dependence;
+
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2014
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+import java.util.List;
+
+import de.lmu.ifi.dbs.elki.utilities.datastructures.arraylike.NumberArrayAdapter;
+
+/**
+ * Measure the dependence of two variables.
+ *
+ * @author Erich Schubert
+ */
public interface DependenceMeasure {
  /**
   * Measure the dependence of two variables.
   *
   * This is the more flexible API, which allows using different internal data
   * representations.
   *
   * @param adapter1 First data adapter
   * @param data1 First data set
   * @param adapter2 Second data adapter
   * @param data2 Second data set
   * @param <A> First array type
   * @param <B> Second array type
   * @return Dependence measure
   */
  <A, B> double dependence(NumberArrayAdapter<?, A> adapter1, A data1, NumberArrayAdapter<?, B> adapter2, B data2);

  /**
   * Measure the dependence of two variables, stored in arrays of the same
   * type.
   *
   * This is the more flexible API, which allows using different internal data
   * representations.
   *
   * @param adapter Array type adapter
   * @param data1 First data set
   * @param data2 Second data set
   * @param <A> Array type
   * @return Dependence measure
   */
  <A> double dependence(NumberArrayAdapter<?, A> adapter, A data1, A data2);

  /**
   * Measure the pairwise dependence of multiple variables at once.
   *
   * This is the more flexible API, which allows using different internal data
   * representations.
   *
   * The resulting data is a serialized lower triangular matrix, excluding the
   * diagonal (marked X below; S marks the symmetric, not-stored upper half):
   *
   * <pre>
   *  X  S  S  S  S  S
   *  0  X  S  S  S  S
   *  1  2  X  S  S  S
   *  3  4  5  X  S  S
   *  6  7  8  9  X  S
   * 10 11 12 13 14  X
   * </pre>
   *
   * @param adapter Data adapter
   * @param data Data sets. Must have fast random access!
   * @param <A> Array type
   * @return Lower triangular serialized matrix
   */
  <A> double[] dependence(NumberArrayAdapter<?, A> adapter, List<? extends A> data);

  /**
   * Measure the dependence of two variables, given as primitive double arrays.
   *
   * @param data1 First data set
   * @param data2 Second data set
   * @return Dependence measure
   */
  double dependence(double[] data1, double[] data2);
}
diff --git a/src/de/lmu/ifi/dbs/elki/math/statistics/dependence/DistanceCorrelationDependenceMeasure.java b/src/de/lmu/ifi/dbs/elki/math/statistics/dependence/DistanceCorrelationDependenceMeasure.java
new file mode 100644
index 00000000..1ae8d6b5
--- /dev/null
+++ b/src/de/lmu/ifi/dbs/elki/math/statistics/dependence/DistanceCorrelationDependenceMeasure.java
@@ -0,0 +1,206 @@
+package de.lmu.ifi.dbs.elki.math.statistics.dependence;
+
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2014
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+import java.util.List;
+
+import de.lmu.ifi.dbs.elki.utilities.datastructures.arraylike.NumberArrayAdapter;
+import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
+
+/**
+ * Distance correlation.
+ *
+ * The value returned is the square root of the dCor² value. This matches the R
+ * implementation by the original authors.
+ *
+ * Reference:
+ * <p>
+ * Székely, G. J., Rizzo, M. L., & Bakirov, N. K.<br />
+ * Measuring and testing dependence by correlation of distances<br />
+ * The Annals of Statistics, 35(6), 2769-2794
+ * </p>
+ *
+ * Implementation notice: we exploit symmetry, and thus use diagonal matrixes.
+ * While initially the diagonal is zero, after double-centering the matrix these
+ * values can become non-zero!
+ *
+ * @author Marie Kiermeier
+ * @author Erich Schubert
+ */
@Reference(authors = "Székely, G. J., Rizzo, M. L., & Bakirov, N. K.", //
title = "Measuring and testing dependence by correlation of distances", //
booktitle = "The Annals of Statistics, 35(6), 2769-2794", //
url = "http://dx.doi.org/10.1214/009053607000000505")
public class DistanceCorrelationDependenceMeasure extends AbstractDependenceMeasure {
  /**
   * Static instance.
   */
  public static final DistanceCorrelationDependenceMeasure STATIC = new DistanceCorrelationDependenceMeasure();

  /**
   * Constructor - use {@link #STATIC} instance instead!
   */
  protected DistanceCorrelationDependenceMeasure() {
    super();
  }

  @Override
  public <A, B> double dependence(NumberArrayAdapter<?, A> adapter1, A data1, NumberArrayAdapter<?, B> adapter2, B data2) {
    final int len = size(adapter1, data1, adapter2, data2);
    // Pairwise distance matrices, already double-centered.
    double[] dMatrixA = computeDistances(adapter1, data1);
    double[] dMatrixB = computeDistances(adapter2, data2);

    // distance variance
    // Negated tests: return 0 for zero (or NaN) variance, where the
    // distance correlation is not defined (e.g. constant attributes).
    double dVarA = computeDCovar(dMatrixA, dMatrixA, len);
    if(!(dVarA > 0.)) {
      return 0.;
    }
    double dVarB = computeDCovar(dMatrixB, dMatrixB, len);
    if(!(dVarB > 0.)) {
      return 0.;
    }
    double dCovar = computeDCovar(dMatrixA, dMatrixB, len);
    // distance correlation: sqrt of dCor², as in the original R code.
    return Math.sqrt(dCovar / Math.sqrt(dVarA * dVarB));
  }

  @Override
  public <A> double[] dependence(NumberArrayAdapter<?, A> adapter, List<? extends A> data) {
    final int dims = data.size();
    final int len = size(adapter, data);
    // Precompute the centered distance matrix of each dimension once,
    // so each is reused for all pairs involving that dimension.
    double[][] dMatrix = new double[dims][];
    for(int i = 0; i < dims; i++) {
      dMatrix[i] = computeDistances(adapter, data.get(i));
    }
    double[] dVar = new double[dims];
    for(int i = 0; i < dims; i++) {
      dVar[i] = computeDCovar(dMatrix[i], dMatrix[i], len);
    }
    // Serialized lower triangular result matrix:
    double[] dCor = new double[(dims * (dims - 1)) >> 1];
    for(int y = 1, c = 0; y < dims; y++) {
      for(int x = 0; x < y; x++) {
        // Undefined (zero or NaN variance): report 0.
        if(!(dVar[x] * dVar[y] > 0.)) {
          dCor[c++] = 0.;
          continue;
        }
        double dCovar = computeDCovar(dMatrix[x], dMatrix[y], len);
        dCor[c++] = Math.sqrt(dCovar / Math.sqrt(dVar[x] * dVar[y]));
      }
    }
    return dCor;
  }

  /**
   * Compute the double-centered delta matrix.
   *
   * The matrix is stored as a serialized lower triangle <em>including</em> the
   * diagonal, i.e. size*(size+1)/2 entries. The diagonal entries are zero
   * before centering (distance to self), but may become non-zero afterwards.
   *
   * @param adapter Data adapter
   * @param data Input data
   * @param <A> Array type
   * @return Double-centered delta matrix.
   */
  protected static <A> double[] computeDistances(NumberArrayAdapter<?, A> adapter, A data) {
    final int size = adapter.size(data);
    double[] dMatrix = new double[(size * (size + 1)) >> 1];
    for(int i = 0, c = 0; i < size; i++) {
      for(int j = 0; j < i; j++) {
        double dx = adapter.getDouble(data, i) - adapter.getDouble(data, j);
        dMatrix[c++] = (dx < 0) ? -dx : dx; // Absolute difference.
      }
      c++; // Diagonal entry: zero
    }
    doubleCenterMatrix(dMatrix, size);
    return dMatrix;
  }

  /**
   * Double-center the given serialized triangular matrix in place:
   * a_ij := a_ij - rowmean_i - colmean_j + totalmean.
   *
   * @param dMatrix distance matrix of the axis (lower triangle, with diagonal)
   * @param size Dimensionality
   */
  public static void doubleCenterMatrix(double[] dMatrix, int size) {
    double[] rowMean = new double[size];
    // row sum
    // By symmetry, each off-diagonal entry contributes to two row sums.
    for(int i = 0, c = 0; i < size; i++) {
      for(int j = 0; j < i; j++) {
        double v = dMatrix[c++];
        rowMean[i] += v;
        rowMean[j] += v;
      }
      assert (dMatrix[c] == 0.);
      c++; // Diagonal entry. Must be zero!
    }
    // Normalize averages:
    // matrixMean sums all row sums = the full (symmetric) matrix sum.
    double matrixMean = 0.;
    for(int i = 0; i < size; i++) {
      matrixMean += rowMean[i];
      rowMean[i] /= size;
    }
    matrixMean /= size * size;

    for(int o = 0, c = 0; o < size; o++) {
      // Including row mean!
      // a_op -= rowMean[o] + rowMean[p] - matrixMean, also on the diagonal.
      for(int p = 0; p <= o; p++) {
        dMatrix[c++] -= rowMean[o] + rowMean[p] - matrixMean;
      }
    }
  }

  /**
   * Computes the distance covariance for two axis. Can also be used to compute
   * the distance variance of one axis (dVarMatrixA = dVarMatrixB).
   *
   * Because only the lower triangle is stored, off-diagonal products are
   * counted twice (symmetry); diagonal products once. Result is normalized
   * by n².
   *
   * @param dVarMatrixA double-centered distance matrix of the first axis
   * @param dVarMatrixB double-centered distance matrix of the second axis
   * @param n number of points
   * @return distance covariance
   */
  protected double computeDCovar(double[] dVarMatrixA, double[] dVarMatrixB, int n) {
    double result = 0.;
    for(int i = 0, c = 0; i < n; i++) {
      for(int j = 0; j < i; j++) {
        result += 2. * dVarMatrixA[c] * dVarMatrixB[c];
        c++;
      }
      // Diagonal entry.
      result += dVarMatrixA[c] * dVarMatrixB[c];
      c++;
    }
    return result / (n * n);
  }

  /**
   * Parameterization class
   *
   * @author Marie Kiermeier
   *
   * @apiviz.exclude
   */
  public static class Parameterizer extends AbstractParameterizer {
    @Override
    protected DistanceCorrelationDependenceMeasure makeInstance() {
      return STATIC;
    }
  }
}
diff --git a/src/de/lmu/ifi/dbs/elki/math/statistics/dependence/HSMDependenceMeasure.java b/src/de/lmu/ifi/dbs/elki/math/statistics/dependence/HSMDependenceMeasure.java
new file mode 100644
index 00000000..aaacb888
--- /dev/null
+++ b/src/de/lmu/ifi/dbs/elki/math/statistics/dependence/HSMDependenceMeasure.java
@@ -0,0 +1,245 @@
+package de.lmu.ifi.dbs.elki.math.statistics.dependence;
+
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2014
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+import de.lmu.ifi.dbs.elki.math.SinCosTable;
+import de.lmu.ifi.dbs.elki.utilities.datastructures.arraylike.NumberArrayAdapter;
+import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
+
+/**
+ * Compute the similarity of dimensions by using a hough transformation.
+ *
+ * Reference: <br>
+ * <p>
+ * A. Tatu, G. Albuquerque, M. Eisemann, P. Bak, H. Theisel, M. A. Magnor, and
+ * D. A. Keim.<br />
+ * Automated Analytical Methods to Support Visual Exploration of High-
+ * Dimensional Data. <br/>
+ * IEEEVisualization and Computer Graphics, 2011.
+ * </p>
+ *
+ * FIXME: This needs serious TESTING before release. Large parts have been
+ * rewritten, but could not be tested at the time of rewriting.
+ *
+ * @author Erich Schubert
+ * @author Robert Rödler
+ */
@Reference(authors = "A. Tatu, G. Albuquerque, M. Eisemann, P. Bak, H. Theisel, M. A. Magnor, and D. A. Keim", //
title = "Automated Analytical Methods to Support Visual Exploration of High-Dimensional Data", //
booktitle = "IEEE Trans. Visualization and Computer Graphics, 2011", //
url = "http://dx.doi.org/10.1109/TVCG.2010.242")
public class HSMDependenceMeasure extends AbstractDependenceMeasure {
  /**
   * Static instance.
   */
  public static final HSMDependenceMeasure STATIC = new HSMDependenceMeasure();

  /**
   * Angular resolution of the Hough accumulator. Best if divisible by 4:
   * smaller tables.
   *
   * The original publication used 50.
   */
  private final static int STEPS = 48; // 64;

  /**
   * Resolution of the (square) rasterized line image.
   */
  private final int resolution = 512;

  /**
   * Precomputed sine and cosine table for the STEPS angles.
   */
  private final static SinCosTable table = SinCosTable.make(STEPS);

  @Override
  public <A, B> double dependence(NumberArrayAdapter<?, A> adapter1, A data1, NumberArrayAdapter<?, B> adapter2, B data2) {
    final int len = size(adapter1, data1, adapter2, data2);
    // Binary raster image the lines are drawn into.
    boolean[][] pic = new boolean[resolution][resolution];

    // Get attribute value range:
    final double off1, scale1, off2, scale2;
    {
      // Min/max of the first attribute, for scaling to [0;1].
      double mi = adapter1.getDouble(data1, 0), ma = mi;
      for(int i = 1; i < len; ++i) {
        double v = adapter1.getDouble(data1, i);
        if(v < mi) {
          mi = v;
        }
        else if(v > ma) {
          ma = v;
        }
      }
      off1 = mi;
      scale1 = (ma > mi) ? (1. / (ma - mi)) : 1.;
      // Second data
      mi = adapter2.getDouble(data2, 0);
      ma = mi;
      for(int i = 1; i < len; ++i) {
        double v = adapter2.getDouble(data2, i);
        if(v < mi) {
          mi = v;
        }
        else if(v > ma) {
          ma = v;
        }
      }
      off2 = mi;
      scale2 = (ma > mi) ? (1. / (ma - mi)) : 1.;
    }
    // Iterate dataset
    // Each point becomes a line from its (scaled) first-attribute position on
    // the left edge (x=0) to its second-attribute position on the right edge
    // (x=resolution-1), as in a two-axis parallel-coordinates plot.
    for(int i = 0; i < len; ++i) {
      double xi = (adapter1.getDouble(data1, i) - off1) * scale1;
      double xj = (adapter2.getDouble(data2, i) - off2) * scale2;
      drawLine(0, (int) (resolution * xi), resolution - 1, (int) (resolution * xj), pic);
    }

    final double stepsq = (double) STEPS * (double) STEPS;
    int[][] hough = houghTransformation(pic);
    // The original publication said "median", but judging from the text,
    // meant "mean". Otherwise, always half of the cells are above the
    // threshold, which doesn't match the explanation there.
    double mean = sumMatrix(hough) / stepsq;
    int abovemean = countAboveThreshold(hough, mean);

    // Fewer cells above the mean = more concentrated accumulator = higher
    // dependence score.
    return 1. - (abovemean / stepsq);
  }

  /**
   * Compute the sum of a matrix.
   *
   * @param mat Matrix
   * @return Sum of all elements
   */
  private long sumMatrix(int[][] mat) {
    long ret = 0;
    for(int i = 0; i < mat.length; i++) {
      final int[] row = mat[i];
      for(int j = 0; j < row.length; j++) {
        ret += row[j];
      }
    }
    return ret;
  }

  /**
   * Count the number of cells above (or equal to) the threshold.
   *
   * @param mat Matrix
   * @param threshold Threshold
   * @return Number of elements above the threshold.
   */
  private int countAboveThreshold(int[][] mat, double threshold) {
    int ret = 0;
    for(int i = 0; i < mat.length; i++) {
      int[] row = mat[i];
      for(int j = 0; j < row.length; j++) {
        if(row[j] >= threshold) {
          ret++;
        }
      }
    }
    return ret;
  }

  /**
   * Perform a Hough transformation on the binary image in "mat".
   *
   * Produces a STEPS x STEPS accumulator indexed by (scaled distance, angle).
   *
   * @param mat Binary image
   * @return Hough transformation of image.
   */
  private int[][] houghTransformation(boolean[][] mat) {
    final int xres = mat.length, yres = mat[0].length;
    // Scale factor mapping x*cos+y*sin into the accumulator's distance axis.
    final double tscale = STEPS * .5 / (xres + yres);
    final int[][] ret = new int[STEPS][STEPS];

    for(int x = 0; x < mat.length; x++) {
      for(int y = 0; y < mat[0].length; y++) {
        if(mat[x][y]) {
          // Vote for every discretized angle this pixel may lie on.
          for(int i = 0; i < STEPS; i++) {
            final int d = (STEPS >> 1) + (int) (tscale * (x * table.cos(i) + y * table.sin(i)));
            // NOTE(review): d == 0 is excluded here; confirm whether
            // "d >= 0" was intended.
            if(d > 0 && d < STEPS) {
              ret[d][i]++;
            }
          }
        }
      }
    }

    return ret;
  }

  /**
   * Draw a line onto the array, using the classic Bresenham algorithm.
   *
   * Endpoints are clamped to the image bounds first.
   *
   * @param x0 Start X
   * @param y0 Start Y
   * @param x1 End X
   * @param y1 End Y
   * @param pic Picture array
   */
  private static void drawLine(int x0, int y0, int x1, int y1, boolean[][] pic) {
    final int xres = pic.length, yres = pic[0].length;
    // Ensure bounds
    y0 = (y0 < 0) ? 0 : (y0 >= yres) ? (yres - 1) : y0;
    y1 = (y1 < 0) ? 0 : (y1 >= yres) ? (yres - 1) : y1;
    x0 = (x0 < 0) ? 0 : (x0 >= xres) ? (xres - 1) : x0;
    x1 = (x1 < 0) ? 0 : (x1 >= xres) ? (xres - 1) : x1;
    // Default slope
    final int dx = +Math.abs(x1 - x0), sx = x0 < x1 ? 1 : -1;
    final int dy = -Math.abs(y1 - y0), sy = y0 < y1 ? 1 : -1;
    // Error counter
    int err = dx + dy;

    for(;;) {
      pic[x0][y0] = true;
      if(x0 == x1 && y0 == y1) {
        break;
      }

      final int e2 = err << 1;
      if(e2 > dy) {
        err += dy;
        x0 += sx;
      }
      if(e2 < dx) {
        err += dx;
        y0 += sy;
      }
    }
  }

  /**
   * Parameterization class.
   *
   * @author Erich Schubert
   *
   * @apiviz.exclude
   */
  public static class Parameterizer extends AbstractParameterizer {
    @Override
    protected HSMDependenceMeasure makeInstance() {
      return STATIC;
    }
  }
}
diff --git a/src/de/lmu/ifi/dbs/elki/math/statistics/dependence/HiCSDependenceMeasure.java b/src/de/lmu/ifi/dbs/elki/math/statistics/dependence/HiCSDependenceMeasure.java
new file mode 100644
index 00000000..e760ef83
--- /dev/null
+++ b/src/de/lmu/ifi/dbs/elki/math/statistics/dependence/HiCSDependenceMeasure.java
@@ -0,0 +1,243 @@
+package de.lmu.ifi.dbs.elki.math.statistics.dependence;
+
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2014
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+import java.util.Random;
+
+import de.lmu.ifi.dbs.elki.algorithm.outlier.meta.HiCS;
+import de.lmu.ifi.dbs.elki.math.MathUtil;
+import de.lmu.ifi.dbs.elki.math.random.RandomFactory;
+import de.lmu.ifi.dbs.elki.math.statistics.tests.GoodnessOfFitTest;
+import de.lmu.ifi.dbs.elki.math.statistics.tests.KolmogorovSmirnovTest;
+import de.lmu.ifi.dbs.elki.utilities.datastructures.arraylike.NumberArrayAdapter;
+import de.lmu.ifi.dbs.elki.utilities.datastructures.arrays.IntegerArrayQuickSort;
+import de.lmu.ifi.dbs.elki.utilities.datastructures.arrays.IntegerComparator;
+import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
+import de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DoubleParameter;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.RandomParameter;
+
+/**
+ * Use the statistical tests as used by HiCS to measure dependence of variables.
+ *
+ * Reference:
+ * <p>
+ * Elke Achtert, Hans-Peter Kriegel, Erich Schubert, Arthur Zimek:<br />
+ * Interactive Data Mining with 3D-Parallel-Coordinate-Trees.<br />
+ * Proceedings of the 2013 ACM International Conference on Management of Data
+ * (SIGMOD), New York City, NY, 2013.
+ * </p>
+ *
+ * Based on:
+ * <p>
+ * F. Keller, E. Müller, and K. Böhm.<br />
+ * HiCS: High Contrast Subspaces for Density-Based Outlier Ranking. <br />
+ * In ICDE, pages 1037–1048, 2012.
+ * </p>
+ *
+ * @author Erich Schubert
+ * @author Robert Rödler
+ */
+@Reference(authors = "Elke Achtert, Hans-Peter Kriegel, Erich Schubert, Arthur Zimek", //
+title = "Interactive Data Mining with 3D-Parallel-Coordinate-Trees", //
+booktitle = "Proc. of the 2013 ACM International Conference on Management of Data (SIGMOD)", //
+url = "http://dx.doi.org/10.1145/2463676.2463696")
+public class HiCSDependenceMeasure extends AbstractDependenceMeasure {
+ /**
+ * Monte-Carlo iterations
+ */
+ private int m = 50;
+
+ /**
+ * Alpha threshold
+ */
+ private double alphasqrt = Math.sqrt(0.1);
+
+ /**
+ * Statistical test to use
+ */
+ private GoodnessOfFitTest statTest;
+
+ /**
+ * Random generator
+ */
+ private RandomFactory rnd;
+
+ /**
+ * Constructor.
+ *
+ * @param statTest Test function
+ * @param m Number of monte-carlo iterations
+ * @param alpha Alpha threshold
+ * @param rnd Random source
+ */
+ public HiCSDependenceMeasure(GoodnessOfFitTest statTest, int m, double alpha, RandomFactory rnd) {
+ super();
+ this.statTest = statTest;
+ this.m = m;
+ this.alphasqrt = Math.sqrt(alpha);
+ this.rnd = rnd;
+ }
+
+ @Override
+ public <A, B> double dependence(final NumberArrayAdapter<?, A> adapter1, final A data1, final NumberArrayAdapter<?, B> adapter2, final B data2) {
+ final int len = size(adapter1, data1, adapter2, data2);
+ final int windowsize = (int) (len * alphasqrt);
+ final Random random = rnd.getSingleThreadedRandom();
+
+ // Sorted copies for slicing.
+ int[] s1 = MathUtil.sequence(0, len), s2 = MathUtil.sequence(0, len);
+ IntegerArrayQuickSort.sort(s1, new IntegerComparator() {
+ @Override
+ public int compare(int x, int y) {
+ return Double.compare(adapter1.getDouble(data1, x), adapter1.getDouble(data1, y));
+ }
+ });
+ IntegerArrayQuickSort.sort(s2, new IntegerComparator() {
+ @Override
+ public int compare(int x, int y) {
+ return Double.compare(adapter2.getDouble(data2, x), adapter2.getDouble(data2, y));
+ }
+ });
+
+ // Distributions for testing
+ double[] fullValues = new double[len];
+ double[] sampleValues = new double[windowsize];
+ double deviationSum = 0.;
+
+ // For the first half, we use the first dimension as reference
+ for(int i = 0; i < len; i++) {
+ fullValues[i] = adapter1.getDouble(data1, i);
+ if(fullValues[i] != fullValues[i]) {
+ throw new AbortException("NaN values are not allowed by this implementation!");
+ }
+ }
+
+ int half = m >> 1; // TODO: remove bias?
+ for(int i = 0; i < half; ++i) {
+ // Build the sample
+ for(int j = random.nextInt(len - windowsize), k = 0; k < windowsize; ++k, ++j) {
+ sampleValues[k] = adapter2.getDouble(data2, j);
+ }
+ double contrast = statTest.deviation(fullValues, sampleValues);
+ if(Double.isNaN(contrast)) {
+ --i; // Retry.
+ continue;
+ }
+ deviationSum += contrast;
+ }
+
+ // For the second half, we use the second dimension as reference
+ for(int i = 0; i < len; i++) {
+ fullValues[i] = adapter2.getDouble(data2, i);
+ if(fullValues[i] != fullValues[i]) {
+ throw new AbortException("NaN values are not allowed by this implementation!");
+ }
+ }
+
+ for(int i = half; i < m; ++i) {
+ // Build the sample
+ for(int j = random.nextInt(len - windowsize), k = 0; k < windowsize; ++k, ++j) {
+ sampleValues[k] = adapter1.getDouble(data1, j);
+ }
+ double contrast = statTest.deviation(fullValues, sampleValues);
+ if(Double.isNaN(contrast)) {
+ --i; // Retry.
+ continue;
+ }
+ deviationSum += contrast;
+ }
+
+ return deviationSum / m;
+ }
+
+ /**
+ * Parameterization class.
+ *
+ * @author Erich Schubert
+ *
+ * @apiviz.exclude
+ */
+ public static class Parameterizer extends AbstractParameterizer {
+ /**
+ * Statistical test to use
+ */
+ private GoodnessOfFitTest statTest;
+
+ /**
+ * Holds the value of
+ * {@link de.lmu.ifi.dbs.elki.algorithm.outlier.meta.HiCS.Parameterizer#M_ID}
+ * .
+ */
+ private int m = 50;
+
+ /**
+ * Holds the value of
+ * {@link de.lmu.ifi.dbs.elki.algorithm.outlier.meta.HiCS.Parameterizer#ALPHA_ID}
+ * .
+ */
+ private double alpha = 0.1;
+
+ /**
+ * Random generator.
+ */
+ private RandomFactory rnd;
+
+ @Override
+ protected void makeOptions(Parameterization config) {
+ super.makeOptions(config);
+ final IntParameter mP = new IntParameter(HiCS.Parameterizer.M_ID, 50);
+ mP.addConstraint(CommonConstraints.GREATER_THAN_ONE_INT);
+ if(config.grab(mP)) {
+ m = mP.intValue();
+ }
+
+ final DoubleParameter alphaP = new DoubleParameter(HiCS.Parameterizer.ALPHA_ID, 0.1);
+ alphaP.addConstraint(CommonConstraints.GREATER_THAN_ZERO_DOUBLE);
+ if(config.grab(alphaP)) {
+ alpha = alphaP.doubleValue();
+ }
+
+ final ObjectParameter<GoodnessOfFitTest> testP = new ObjectParameter<>(HiCS.Parameterizer.TEST_ID, GoodnessOfFitTest.class, KolmogorovSmirnovTest.class);
+ if(config.grab(testP)) {
+ statTest = testP.instantiateClass(config);
+ }
+
+ final RandomParameter rndP = new RandomParameter(HiCS.Parameterizer.SEED_ID);
+ if(config.grab(rndP)) {
+ rnd = rndP.getValue();
+ }
+ }
+
+ @Override
+ protected HiCSDependenceMeasure makeInstance() {
+ return new HiCSDependenceMeasure(statTest, m, alpha, rnd);
+ }
+ }
+}
diff --git a/src/de/lmu/ifi/dbs/elki/math/statistics/dependence/HoeffdingsDDependenceMeasure.java b/src/de/lmu/ifi/dbs/elki/math/statistics/dependence/HoeffdingsDDependenceMeasure.java
new file mode 100644
index 00000000..7f1f9fc4
--- /dev/null
+++ b/src/de/lmu/ifi/dbs/elki/math/statistics/dependence/HoeffdingsDDependenceMeasure.java
@@ -0,0 +1,207 @@
+package de.lmu.ifi.dbs.elki.math.statistics.dependence;
+
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2014
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+import de.lmu.ifi.dbs.elki.math.MathUtil;
+import de.lmu.ifi.dbs.elki.utilities.datastructures.arraylike.NumberArrayAdapter;
+import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
+
+/**
+ * Calculate Hoeffding's D as a measure of dependence.
+ *
+ * References:
+ * <p>
+ * W. Hoeffding:<br />
+ * A non-parametric test of independence<br />
+ * The Annals of Mathematical Statistics 19:546–57
+ * </p>
+ *
+ * The resulting value is scaled by 30, so it is in the range {@code [-.5;1]}.
+ *
+ * @author Yinchong Yang
+ * @author Erich Schubert
+ */
+@Reference(authors = "W. Hoeffding", //
+title = "A non-parametric test of independence", //
+booktitle = "The Annals of Mathematical Statistics 19", //
+url = "http://www.jstor.org/stable/2236021")
+public class HoeffdingsDDependenceMeasure extends AbstractDependenceMeasure {
+  /**
+   * Static instance.
+   */
+  public static final HoeffdingsDDependenceMeasure STATIC = new HoeffdingsDDependenceMeasure();
+
+  /**
+   * Constructor - use {@link #STATIC} instance.
+   */
+  protected HoeffdingsDDependenceMeasure() {
+    super();
+  }
+
+  /**
+   * Compute Hoeffding's D from the marginal ranks and the bivariate ranks.
+   * The result is scaled by 30, i.e. approximately in {@code [-.5;1]};
+   * returns {@code NaN} for fewer than 5 observations.
+   */
+  @Override
+  public <A, B> double dependence(NumberArrayAdapter<?, A> adapter1, A data1, NumberArrayAdapter<?, B> adapter2, B data2) {
+    final int n = size(adapter1, data1, adapter2, data2);
+    // The statistic is undefined for n <= 4 (the divisor below is 0).
+    assert (n > 4) : "Hoeffdings D needs at least 5 elements!";
+    if(n <= 4) {
+      return Double.NaN;
+    }
+    // Marginal ranks, from the base class (presumably midranks for ties -
+    // confirm in AbstractDependenceMeasure#ranks).
+    double[] r = ranks(adapter1, data1, n);
+    double[] s = ranks(adapter2, data2, n);
+    // TODO: is it possible to exploit sorting to accelerate computing q?
+    double[] q = computeBivariateRanks(adapter1, data1, adapter2, data2, n);
+
+    // The three sums of Hoeffding's statistic:
+    // d1 ~ sum Q(Q-1), d2 ~ sum (R-1)(R-2)(S-1)(S-2), d3 ~ sum (R-2)(S-2)Q
+    double d1 = 0, d2 = 0, d3 = 0;
+    for(int i = 0; i < n; i++) {
+      d1 += q[i] * (q[i] - 1); // Note: our q is 0-indexed.
+      d2 += (r[i] - 1) * (r[i] - 2) * (s[i] - 1) * (s[i] - 2);
+      d3 += (r[i] - 2) * (s[i] - 2) * q[i];
+    }
+
+    // Factor n-2 was moved for better numerical behavior.
+    double nom = (n - 3.) * d1 + d2 / (n - 2) - 2. * d3;
+    double div = n * (n - 1.) * (n - 3.) * (n - 4.);
+    double d = 30 * nom / div;
+    // With many ties the estimate may slightly exceed 1 - clamp the top.
+    return d < 1. ? d : 1.;
+  }
+
+  /**
+   * Compute bivariate ranks.
+   *
+   * q[i] is the number of objects such that x[j] &lt; x[i] and y[j] &lt; y[i];
+   * ties in one coordinate are counted with weight .5, ties in both
+   * coordinates with weight .25.
+   *
+   * @param adapter1 First adapter
+   * @param data1 First data set
+   * @param adapter2 Second adapter
+   * @param data2 Second data set
+   * @param len Length
+   * @return Bivariate rank statistics.
+   */
+  protected static <A, B> double[] computeBivariateRanks(NumberArrayAdapter<?, A> adapter1, A data1, NumberArrayAdapter<?, B> adapter2, B data2, int len) {
+    double[] ret = new double[len];
+    // Compare every unordered pair once; update the dominated side only.
+    for(int i = 0; i < len; i++) {
+      for(int j = i + 1; j < len; j++) {
+        double xi = adapter1.getDouble(data1, i), xj = adapter1.getDouble(data1, j);
+        double yi = adapter2.getDouble(data2, i), yj = adapter2.getDouble(data2, j);
+        if(xi < xj) {
+          ret[j] += (yi < yj) ? 1 : (yi == yj) ? .5 : 0;
+        }
+        else if(xj < xi) {
+          ret[i] += (yj < yi) ? 1 : (yj == yi) ? .5 : 0;
+        }
+        else { // tied on x
+          if(yi < yj) {
+            ret[j] += .5;
+          }
+          else if(yj < yi) {
+            ret[i] += .5;
+          }
+          else { // Double tied
+            ret[i] += .25;
+            ret[j] += .25;
+          }
+        }
+      }
+    }
+    return ret;
+  }
+
+  // Tabular approximation of the p-value, used by toPValue for z values
+  // between 1.1 and 8.5; positions of the entries are given in TABPOS.
+  private final static double[] TABVAL = { //
+  0.5297, 0.4918, 0.4565, 0.4236, 0.393, //
+  0.3648, 0.3387, 0.3146, 0.2924, 0.2719, // 10
+  0.253, 0.2355, 0.2194, 0.2045, 0.1908, //
+  0.1781, 0.1663, 0.1554, 0.1453, 0.1359, // 20
+  0.1273, 0.1192, 0.1117, 0.1047, 0.0982, //
+  0.0921, 0.0864, 0.0812, 0.0762, 0.0716, // 30
+  0.0673, 0.0633, 0.0595, 0.056, 0.0527, //
+  0.0496, 0.0467, 0.044, 0.0414, 0.039, // 40
+  0.0368, 0.0347, 0.0327, 0.0308, 0.0291, //
+  0.0274, 0.0259, 0.0244, 0.023, 0.0217, // 50
+  0.0205, 0.0194, 0.0183, 0.0173, 0.0163, //
+  0.0154, 0.0145, 0.0137, 0.013, 0.0123, // 60
+  0.0116, 0.011, 0.0104, 0.0098, 0.0093, //
+  0.0087, 0.0083, 0.0078, 0.0074, 0.007, // 70
+  0.0066, 0.0063, 0.0059, 0.0056, 0.0053, //
+  0.005, 0.0047, 0.0045, 0.0042, 0.0025, // 80
+  0.0014, 0.0008, 0.0005, 0.0003, 0.0002, 0.0001 };
+
+  // Table positions (z values) for the TABVAL entries above; note the
+  // coarser spacing of 0.5 beyond z = 5.0.
+  private final static double[] TABPOS = new double[] { //
+  1.10, 1.15, 1.20, 1.25, 1.30, 1.35, 1.40, 1.45, 1.50, 1.55, //
+  1.60, 1.65, 1.70, 1.75, 1.80, 1.85, 1.90, 1.95, 2.00, 2.05, //
+  2.10, 2.15, 2.20, 2.25, 2.30, 2.35, 2.40, 2.45, 2.50, 2.55, //
+  2.60, 2.65, 2.70, 2.75, 2.80, 2.85, 2.90, 2.95, 3.00, 3.05, //
+  3.10, 3.15, 3.20, 3.25, 3.30, 3.35, 3.40, 3.45, 3.50, 3.55, //
+  3.60, 3.65, 3.70, 3.75, 3.80, 3.85, 3.90, 3.95, 4.00, 4.05, //
+  4.10, 4.15, 4.20, 4.25, 4.30, 4.35, 4.40, 4.45, 4.50, 4.55, //
+  4.60, 4.65, 4.70, 4.75, 4.80, 4.85, 4.90, 4.95, 5.00, //
+  5.50, 6.00, 6.50, 7.00, 7.50, 8.00, 8.50 };
+
+  /**
+   * Convert Hoeffding D value to a p-value.
+   *
+   * @param d D value
+   * @param n Data set size
+   * @return p-value
+   */
+  public double toPValue(double d, int n) {
+    // Undo the scaling by 30; 1/(36n) appears to be a small-sample
+    // correction term - TODO confirm against the original reference.
+    double b = d / 30 + 1. / (36 * n);
+    double z = .5 * MathUtil.PISQUARE * MathUtil.PISQUARE * n * b;
+
+    // Exponential approximation, outside the tabulated range of z.
+    if(z < 1.1 || z > 8.5) {
+      double e = Math.exp(0.3885037 - 1.164879 * z);
+      return (e > 1) ? 1 : (e < 0) ? 0 : e;
+    }
+    // Tabular approximation
+    for(int i = 0; i < 86; i++) {
+      if(TABPOS[i] >= z) {
+        // Exact table value
+        if(TABPOS[i] == z) {
+          return TABVAL[i];
+        }
+        // Linear interpolation
+        double x1 = TABPOS[i], x0 = TABPOS[i - 1];
+        double y1 = TABVAL[i], y0 = TABVAL[i - 1];
+        return y0 + (y1 - y0) * (z - x0) / (x1 - x0);
+      }
+    }
+    // Not reachable: the table covers the full range (1.1, 8.5].
+    return -1;
+  }
+
+  /**
+   * Parameterization class.
+   *
+   * @author Erich Schubert
+   *
+   * @apiviz.exclude
+   */
+  public static class Parameterizer extends AbstractParameterizer {
+    @Override
+    protected HoeffdingsDDependenceMeasure makeInstance() {
+      // Stateless measure: share the singleton.
+      return STATIC;
+    }
+  }
+}
diff --git a/src/de/lmu/ifi/dbs/elki/math/statistics/dependence/JensenShannonEquiwidthDependenceMeasure.java b/src/de/lmu/ifi/dbs/elki/math/statistics/dependence/JensenShannonEquiwidthDependenceMeasure.java
new file mode 100644
index 00000000..7b0000a3
--- /dev/null
+++ b/src/de/lmu/ifi/dbs/elki/math/statistics/dependence/JensenShannonEquiwidthDependenceMeasure.java
@@ -0,0 +1,143 @@
+package de.lmu.ifi.dbs.elki.math.statistics.dependence;
+
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2014
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+import de.lmu.ifi.dbs.elki.math.MathUtil;
+import de.lmu.ifi.dbs.elki.utilities.datastructures.arraylike.NumberArrayAdapter;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
+
+/**
+ * Jensen-Shannon Divergence is closely related to mutual information.
+ *
+ * The output value is normalized, such that an evenly distributed and identical
+ * distribution will yield a value of 1. Independent distributions may still
+ * yield values close to .25, though.
+ *
+ * TODO: Offer normalized and non-normalized variants?
+ *
+ * @author Erich Schubert
+ */
+public class JensenShannonEquiwidthDependenceMeasure extends AbstractDependenceMeasure {
+  /**
+   * Static instance of this (stateless) measure.
+   */
+  public static final JensenShannonEquiwidthDependenceMeasure STATIC = new JensenShannonEquiwidthDependenceMeasure();
+
+  /**
+   * Constructor - use the {@link #STATIC} instance instead.
+   */
+  protected JensenShannonEquiwidthDependenceMeasure() {
+    super();
+  }
+
+  @Override
+  public <A, B> double dependence(NumberArrayAdapter<?, A> adapter1, A data1, NumberArrayAdapter<?, B> adapter2, B data2) {
+    final int size = size(adapter1, data1, adapter2, data2);
+    // Heuristic: use sqrt(n) equal-width bins per attribute.
+    final int numbins = (int) Math.round(Math.sqrt(size));
+    final int top = numbins - 1;
+
+    // Determine the value range of each attribute:
+    double lo1 = adapter1.getDouble(data1, 0), hi1 = lo1;
+    double lo2 = adapter2.getDouble(data2, 0), hi2 = lo2;
+    for(int i = 1; i < size; i++) {
+      final double v1 = adapter1.getDouble(data1, i);
+      if(v1 < lo1) {
+        lo1 = v1;
+      }
+      else if(v1 > hi1) {
+        hi1 = v1;
+      }
+      final double v2 = adapter2.getDouble(data2, i);
+      if(v2 < lo2) {
+        lo2 = v2;
+      }
+      else if(v2 > hi2) {
+        hi2 = v2;
+      }
+    }
+    // Scale factors mapping values to bins; constant attributes go to bin 0.
+    final double f1 = (hi1 > lo1) ? numbins / (hi1 - lo1) : 1;
+    final double f2 = (hi2 > lo2) ? numbins / (hi2 - lo2) : 1;
+
+    // Build the joint histogram and both marginal histograms:
+    final int[] hist1 = new int[numbins], hist2 = new int[numbins];
+    final int[][] joint = new int[numbins][numbins];
+    for(int i = 0; i < size; i++) {
+      int b1 = (int) Math.floor((adapter1.getDouble(data1, i) - lo1) * f1);
+      int b2 = (int) Math.floor((adapter2.getDouble(data2, i) - lo2) * f2);
+      // The maximum value would land in bin numbins - clamp to the last bin.
+      b1 = (b1 < numbins) ? b1 : top;
+      b2 = (b2 < numbins) ? b2 : top;
+      hist1[b1]++;
+      hist2[b2]++;
+      joint[b1][b2]++;
+    }
+
+    // Jensen-Shannon divergence of the joint distribution pXY and the
+    // product of the marginal distributions pX * pY.
+    double e = 0;
+    for(int b1 = 0; b1 < joint.length; b1++) {
+      final int c1 = hist1[b1];
+      if(c1 == 0) {
+        continue; // Empty row: both pXY and pXpY are zero.
+      }
+      final double pX = c1 / (double) size;
+      final int[] row = joint[b1];
+      for(int b2 = 0; b2 < row.length; b2++) {
+        final int c2 = hist2[b2];
+        if(c2 > 0) {
+          final double pXY = row[b2] / (double) size;
+          final double pXpY = pX * c2 / size;
+          // Inverse of the average distribution (pXY + pXpY) / 2:
+          final double iavg = 2. / (pXY + pXpY);
+          if(pXY > 0.) {
+            e += pXY * Math.log(pXY * iavg);
+          }
+          e += pXpY * Math.log(pXpY * iavg);
+        }
+      }
+    }
+    // Expected value for evenly distributed and identical:
+    // pX = pY = 1/b and thus pXpY = 1/b^2.
+    // for i==j, pXY=1/b and thus iavg = 2*b*b/(b+1)
+    // otherwise, pXY=0 and thus iavg = 2*b*b
+    // pXY: e += log(b*2/(b+1)) = log(b) + log(2/(b+1))
+    // pXpY1: e += 1/b*log(2/(b+1)) = 1/b*log(2/(b+1))
+    // pXpY2: e += (b-1)/b*log(2) = (b-1)/b*log(2)
+    final double expected = Math.log(numbins) + (1. + 1. / numbins) * Math.log(2. / (numbins + 1)) + (numbins - 1.) / numbins * MathUtil.LOG2;
+    // e *= .5; // Average, but we need to adjust expected then, too!
+    return e / expected;
+  }
+
+  /**
+   * Parameterization class.
+   *
+   * @author Erich Schubert
+   *
+   * @apiviz.exclude
+   */
+  public static class Parameterizer extends AbstractParameterizer {
+    @Override
+    protected JensenShannonEquiwidthDependenceMeasure makeInstance() {
+      // Stateless: share the singleton.
+      return STATIC;
+    }
+  }
+}
diff --git a/src/de/lmu/ifi/dbs/elki/math/statistics/dependence/MCEDependenceMeasure.java b/src/de/lmu/ifi/dbs/elki/math/statistics/dependence/MCEDependenceMeasure.java
new file mode 100644
index 00000000..899aff9a
--- /dev/null
+++ b/src/de/lmu/ifi/dbs/elki/math/statistics/dependence/MCEDependenceMeasure.java
@@ -0,0 +1,274 @@
+package de.lmu.ifi.dbs.elki.math.statistics.dependence;
+
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2014
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+import java.util.ArrayList;
+import java.util.Arrays;
+
+import de.lmu.ifi.dbs.elki.math.MathUtil;
+import de.lmu.ifi.dbs.elki.utilities.datastructures.arraylike.NumberArrayAdapter;
+import de.lmu.ifi.dbs.elki.utilities.datastructures.arrays.IntegerArrayQuickSort;
+import de.lmu.ifi.dbs.elki.utilities.datastructures.arrays.IntegerComparator;
+import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
+
+/**
+ * Compute a mutual information based dependence measure using a nested means
+ * discretization, originally proposed for ordering axes in parallel coordinate
+ * plots.
+ *
+ * Reference:
+ * <p>
+ * D. Guo<br />
+ * Coordinating computational and visual approaches for interactive feature
+ * selection and multivariate clustering<br />
+ * Information Visualization, 2(4), 2003.
+ * </p>
+ *
+ * @author Erich Schubert
+ */
+@Reference(authors = "D. Guo", //
+title = "Coordinating computational and visual approaches for interactive feature selection and multivariate clustering", //
+booktitle = "Information Visualization, 2(4)", //
+url = "http://dx.doi.org/10.1057/palgrave.ivs.9500053")
+public class MCEDependenceMeasure extends AbstractDependenceMeasure {
+ /**
+ * Static instance.
+ */
+ public static final MCEDependenceMeasure STATIC = new MCEDependenceMeasure();
+
+ /**
+ * Desired size: 35 observations.
+ *
+ * While this could trivially be made parameterizable, it is a reasonable rule
+ * of thumb and not expected to have a major effect.
+ */
+ public static final int TARGET = 35;
+
+ @Override
+ public <A, B> double dependence(NumberArrayAdapter<?, A> adapter1, A data1, NumberArrayAdapter<?, B> adapter2, B data2) {
+ final int len = size(adapter1, data1, adapter2, data2);
+ // Find a number of bins as recommended by Cheng et al.
+ double p = MathUtil.log2(len / (double) TARGET);
+ // As we are in 2d, take the root (*.5) But let's use at least 1, too.
+ // Check: for 10000 this should give 4, for 150 it gives 1.
+ int power = Math.max(1, (int) Math.floor(p * .5));
+ int gridsize = 1 << power;
+ double loggrid = Math.log((double) gridsize);
+
+ ArrayList<int[]> parts1 = buildPartitions(adapter1, data1, len, power);
+ ArrayList<int[]> parts2 = buildPartitions(adapter2, data2, len, power);
+
+ int[][] res = new int[gridsize][gridsize];
+ intersectionMatrix(res, parts1, parts2, gridsize);
+ return 1. - getMCEntropy(res, parts1, parts2, len, gridsize, loggrid);
+ }
+
+ /**
+ * Partitions an attribute.
+ *
+ * @param adapter1 Data adapter
+ * @param data1 Data set
+ * @param len Length of data
+ * @param depth Splitting depth
+ * @return List of sorted objects
+ */
+ private <A> ArrayList<int[]> buildPartitions(NumberArrayAdapter<?, A> adapter1, A data1, int len, int depth) {
+ final int[] idx = new int[len];
+ final double[] tmp = new double[len];
+ for(int i = 0; i < len; ++i) {
+ idx[i] = i;
+ tmp[i] = adapter1.getDouble(data1, i);
+ }
+ // Sort indexes:
+ IntegerArrayQuickSort.sort(idx, new IntegerComparator() {
+ @Override
+ public int compare(int x, int y) {
+ return Double.compare(tmp[x], tmp[y]);
+ }
+ });
+ Arrays.sort(tmp); // Should yield the same ordering
+
+ ArrayList<int[]> ret = new ArrayList<>(1 << depth);
+ divide(idx, tmp, ret, 0, tmp.length, depth);
+ return ret;
+ }
+
+ /**
+ * Recursive call to further subdivide the array.
+ *
+ * @param idx Object indexes.
+ * @param data 1D data, sorted
+ * @param ret Output index
+ * @param start Interval start
+ * @param end Interval end
+ * @param depth Depth
+ */
+ private void divide(int[] idx, double[] data, ArrayList<int[]> ret, int start, int end, int depth) {
+ if(depth == 0) {
+ int[] a = Arrays.copyOfRange(idx, start, end);
+ Arrays.sort(a);
+ ret.add(a);
+ return;
+ }
+ final int count = end - start;
+ if(count == 0) {
+ // Corner case, that should barely happen. But for ties, we currently
+ // Do not yet assure that it doesn't happen!
+ for(int j = 1 << depth; j > 0; --j) {
+ ret.add(new int[0]);
+ }
+ return;
+ }
+ double m = 0.;
+ for(int i = start; i < end; i++) {
+ m += data[i];
+ }
+ m /= count;
+ int pos = Arrays.binarySearch(data, start, end, m);
+ if(pos >= 0) {
+ // Ties: try to choose the most central element.
+ final int opt = (start + end) >> 1;
+ while(data[pos] == m) {
+ if(pos < opt) {
+ pos++;
+ }
+ else if(pos > opt) {
+ pos--;
+ }
+ else {
+ break;
+ }
+ }
+ }
+ else {
+ pos = (-pos - 1);
+ }
+ divide(idx, data, ret, start, pos, depth - 1);
+ divide(idx, data, ret, pos, end, depth - 1);
+ }
+
+ /**
+ * Intersect the two 1d grid decompositions, to obtain a 2d matrix.
+ *
+ * @param res Output matrix to fill
+ * @param partsx Partitions in first component
+ * @param partsy Partitions in second component.
+ * @param gridsize Size of partition decomposition
+ */
+ private void intersectionMatrix(int[][] res, ArrayList<int[]> partsx, ArrayList<int[]> partsy, int gridsize) {
+ for(int x = 0; x < gridsize; x++) {
+ final int[] px = partsx.get(x);
+ final int[] rowx = res[x];
+ for(int y = 0; y < gridsize; y++) {
+ int[] py = partsy.get(y);
+ rowx[y] = intersectionSize(px, py);
+ }
+ }
+ }
+
+ /**
+ * Compute the intersection of two sorted integer lists.
+ *
+ * @param px First list
+ * @param py Second list
+ * @return Intersection size.
+ */
+ private int intersectionSize(int[] px, int[] py) {
+ int i = 0, j = 0, c = 0;
+ while(i < px.length && j < py.length) {
+ final int vx = px[i], vy = py[i];
+ if(vx < vy) {
+ ++i;
+ }
+ else if(vx > vy) {
+ ++j;
+ }
+ else {
+ ++c;
+ ++i;
+ ++j;
+ }
+ }
+ return c;
+ }
+
+ /**
+ * Compute the MCE entropy value.
+ *
+ * @param mat Partition size matrix
+ * @param partsx Partitions on X
+ * @param partsy Partitions on Y
+ * @param size Data set size
+ * @param gridsize Size of grids
+ * @param loggrid Logarithm of grid sizes, for normalization
+ * @return MCE score.
+ */
+ private double getMCEntropy(int[][] mat, ArrayList<int[]> partsx, ArrayList<int[]> partsy, int size, int gridsize, double loggrid) {
+ // Margin entropies:
+ double[] mx = new double[gridsize];
+ double[] my = new double[gridsize];
+
+ for(int i = 0; i < gridsize; i++) {
+ // Note: indexes are a bit tricky here, because we compute both margin
+ // entropies at the same time!
+ final double sumx = (double) partsx.get(i).length;
+ final double sumy = (double) partsy.get(i).length;
+ for(int j = 0; j < gridsize; j++) {
+ double px = mat[i][j] / sumx;
+ double py = mat[j][i] / sumy;
+
+ if(px > 0.) {
+ mx[i] -= px * Math.log(px);
+ }
+ if(py > 0.) {
+ my[i] -= py * Math.log(py);
+ }
+ }
+ }
+
+ // Weighted sums of margin entropies.
+ double sumx = 0., sumy = 0.;
+ for(int i = 0; i < gridsize; i++) {
+ sumx += mx[i] * partsx.get(i).length;
+ sumy += my[i] * partsy.get(i).length;
+ }
+
+ double max = ((sumx > sumy) ? sumx : sumy);
+ return max / (size * loggrid);
+ }
+
+ /**
+ * Parameterization class.
+ *
+ * @author Erich Schubert
+ *
+ * @apiviz.exclude
+ */
+ public static class Parameterizer extends AbstractParameterizer {
+ @Override
+ protected MCEDependenceMeasure makeInstance() {
+ return STATIC;
+ }
+ }
+}
diff --git a/src/de/lmu/ifi/dbs/elki/math/statistics/dependence/MutualInformationEquiwidthDependenceMeasure.java b/src/de/lmu/ifi/dbs/elki/math/statistics/dependence/MutualInformationEquiwidthDependenceMeasure.java
new file mode 100644
index 00000000..26a1759a
--- /dev/null
+++ b/src/de/lmu/ifi/dbs/elki/math/statistics/dependence/MutualInformationEquiwidthDependenceMeasure.java
@@ -0,0 +1,137 @@
+package de.lmu.ifi.dbs.elki.math.statistics.dependence;
+
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2014
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+import de.lmu.ifi.dbs.elki.utilities.datastructures.arraylike.NumberArrayAdapter;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
+
+/**
+ * Mutual Information (MI) dependence measure by dividing each attribute into
+ * equal-width bins. MI can be seen as Kullback–Leibler divergence of the joint
+ * distribution and the product of the marginal distributions.
+ *
+ * For normalization, the resulting values are scaled by {@code mi/log(nbins)}.
+ * This both cancels out the logarithm base, and normalizes for the number of
+ * bins (a uniform distribution will yield a MI with itself of 1).
+ *
+ * TODO: Offer normalized and non-normalized variants?
+ *
+ * For a median-based discretization, see {@link MCEDependenceMeasure}.
+ *
+ * @author Erich Schubert
+ */
+public class MutualInformationEquiwidthDependenceMeasure extends AbstractDependenceMeasure {
+  /**
+   * Static instance of this (stateless) measure.
+   */
+  public static final MutualInformationEquiwidthDependenceMeasure STATIC = new MutualInformationEquiwidthDependenceMeasure();
+
+  /**
+   * Constructor - use the {@link #STATIC} instance instead.
+   */
+  protected MutualInformationEquiwidthDependenceMeasure() {
+    super();
+  }
+
+  @Override
+  public <A, B> double dependence(NumberArrayAdapter<?, A> adapter1, A data1, NumberArrayAdapter<?, B> adapter2, B data2) {
+    final int size = size(adapter1, data1, adapter2, data2);
+    // Heuristic: use sqrt(n) equal-width bins per attribute.
+    final int numbins = (int) Math.round(Math.sqrt(size));
+    final int top = numbins - 1;
+
+    // Determine the value range of each attribute:
+    double lo1 = adapter1.getDouble(data1, 0), hi1 = lo1;
+    double lo2 = adapter2.getDouble(data2, 0), hi2 = lo2;
+    for(int i = 1; i < size; i++) {
+      final double v1 = adapter1.getDouble(data1, i);
+      if(v1 < lo1) {
+        lo1 = v1;
+      }
+      else if(v1 > hi1) {
+        hi1 = v1;
+      }
+      final double v2 = adapter2.getDouble(data2, i);
+      if(v2 < lo2) {
+        lo2 = v2;
+      }
+      else if(v2 > hi2) {
+        hi2 = v2;
+      }
+    }
+    // Scale factors mapping values to bins; constant attributes go to bin 0.
+    final double f1 = (hi1 > lo1) ? numbins / (hi1 - lo1) : 1;
+    final double f2 = (hi2 > lo2) ? numbins / (hi2 - lo2) : 1;
+
+    // Build the joint histogram and both marginal histograms:
+    final int[] hist1 = new int[numbins], hist2 = new int[numbins];
+    final int[][] joint = new int[numbins][numbins];
+    for(int i = 0; i < size; i++) {
+      int b1 = (int) Math.floor((adapter1.getDouble(data1, i) - lo1) * f1);
+      int b2 = (int) Math.floor((adapter2.getDouble(data2, i) - lo2) * f2);
+      // The maximum value would land in bin numbins - clamp to the last bin.
+      b1 = (b1 < numbins) ? b1 : top;
+      b2 = (b2 < numbins) ? b2 : top;
+      hist1[b1]++;
+      hist2[b2]++;
+      joint[b1][b2]++;
+    }
+
+    // Accumulate mutual information from the relative frequencies.
+    double mi = 0;
+    for(int b1 = 0; b1 < joint.length; b1++) {
+      final int c1 = hist1[b1];
+      if(c1 == 0) {
+        continue; // Empty rows contribute nothing.
+      }
+      final double ipX = size / (double) c1; // = 1 / pX
+      final int[] row = joint[b1];
+      for(int b2 = 0; b2 < row.length; b2++) {
+        final int cell = row[b2];
+        if(cell != 0) { // Skip empty cells.
+          final double pXY = cell / (double) size;
+          final double ipXpY = ipX * size / hist2[b2]; // = 1 / (pX * pY)
+          mi += pXY * Math.log(pXY * ipXpY);
+        }
+      }
+    }
+    // Normalize: a uniform attribute has MI of log(bins) with itself.
+    return mi / Math.log(numbins);
+  }
+
+  /**
+   * Parameterization class.
+   *
+   * @author Erich Schubert
+   *
+   * @apiviz.exclude
+   */
+  public static class Parameterizer extends AbstractParameterizer {
+    @Override
+    protected MutualInformationEquiwidthDependenceMeasure makeInstance() {
+      // Stateless: share the singleton.
+      return STATIC;
+    }
+  }
+}
diff --git a/src/de/lmu/ifi/dbs/elki/math/statistics/dependence/SURFINGDependenceMeasure.java b/src/de/lmu/ifi/dbs/elki/math/statistics/dependence/SURFINGDependenceMeasure.java
new file mode 100644
index 00000000..ce866c16
--- /dev/null
+++ b/src/de/lmu/ifi/dbs/elki/math/statistics/dependence/SURFINGDependenceMeasure.java
@@ -0,0 +1,133 @@
+package de.lmu.ifi.dbs.elki.math.statistics.dependence;
+
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2014
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
import java.util.Arrays;

import de.lmu.ifi.dbs.elki.utilities.datastructures.arraylike.NumberArrayAdapter;
import de.lmu.ifi.dbs.elki.utilities.datastructures.heap.DoubleMinHeap;
import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
+
+/**
+ * Compute the similarity of dimensions using the SURFING score. The parameter k
+ * for the k nearest neighbors is currently hard-coded to 10% of the set size.
+ *
+ * Note that the complexity is roughly O(n n k), so this is a rather slow
+ * method, and with k at 10% of n, is actually cubic: O(0.1 * n^3).
+ *
+ * This version cannot use index support, as the API operates without database
+ * attachment. However, it should be possible to implement some trivial
+ * sorted-list indexes to get a reasonable speedup!
+ *
+ * Reference:
+ * <p>
+ * Elke Achtert, Hans-Peter Kriegel, Erich Schubert, Arthur Zimek:<br />
+ * Interactive Data Mining with 3D-Parallel-Coordinate-Trees.<br />
+ * Proceedings of the 2013 ACM International Conference on Management of Data
+ * (SIGMOD), New York City, NY, 2013.
+ * </p>
+ *
+ * Based on:
+ * <p>
+ * Christian Baumgartner, Claudia Plant, Karin Kailing, Hans-Peter Kriegel, and
+ * Peer Kröger<br />
+ * Subspace Selection for Clustering High-Dimensional Data<br />
+ * In IEEE International Conference on Data Mining, 2004.
+ * </p>
+ *
+ * TODO: make the subspace distance function and k parameterizable.
+ *
+ * @author Robert Rödler
+ * @author Erich Schubert
+ *
+ * @apiviz.uses SubspaceEuclideanDistanceFunction
+ */
+@Reference(authors = "Elke Achtert, Hans-Peter Kriegel, Erich Schubert, Arthur Zimek", //
+title = "Interactive Data Mining with 3D-Parallel-Coordinate-Trees", //
+booktitle = "Proc. of the 2013 ACM International Conference on Management of Data (SIGMOD)", //
+url = "http://dx.doi.org/10.1145/2463676.2463696")
+public class SURFINGDependenceMeasure extends AbstractDependenceMeasure {
+ /**
+ * Static instance.
+ */
+ public static final SURFINGDependenceMeasure STATIC = new SURFINGDependenceMeasure();
+
+ /**
+ * Constructor. Use static instance instead!
+ */
+ protected SURFINGDependenceMeasure() {
+ super();
+ }
+
+ @Reference(authors = "Christian Baumgartner, Claudia Plant, Karin Kailing, Hans-Peter Kriegel, and Peer Kröger", //
+ title = "Subspace Selection for Clustering High-Dimensional Data", //
+ booktitle = "IEEE International Conference on Data Mining, 2004", //
+ url = "http://dx.doi.org/10.1109/ICDM.2004.10112")
+ @Override
+ public <A, B> double dependence(NumberArrayAdapter<?, A> adapter1, A data1, NumberArrayAdapter<?, B> adapter2, B data2) {
+ final int len = size(adapter1, data1, adapter2, data2);
+ final int k = Math.max(1, len / 10);
+
+ double[] knns = new double[len];
+
+ DoubleMinHeap heap = new DoubleMinHeap(k);
+ double kdistmean = 0.;
+ for(int i = 0; i < len; ++i) {
+ double ix = adapter1.getDouble(data1, i), iy = adapter2.getDouble(data2, i);
+ heap.clear();
+ for(int j = 0; j < len; ++j) {
+ double jx = adapter1.getDouble(data1, j), jy = adapter2.getDouble(data2, j);
+ double dx = ix - jx, dy = iy - jy;
+ heap.add(dx * dx + dy * dy); // Squared Euclidean.
+ }
+ double kdist = Math.sqrt(heap.peek()); // Euclidean
+ knns[i] = kdist;
+ kdistmean += kdist;
+ }
+ kdistmean /= len;
+ // Deviation from mean:
+ double diff = 0.;
+ int below = 0;
+ for(int l = 0; l < knns.length; l++) {
+ diff += Math.abs(kdistmean - knns[l]);
+ if(knns[l] < kdistmean) {
+ below++;
+ }
+ }
+ return (below > 0) ? diff / (2. * kdistmean * below) : 0;
+ }
+
+ /**
+ * Parameterization class.
+ *
+ * @author Erich Schubert
+ *
+ * @apiviz.exclude
+ */
+ public static class Parameterizer extends AbstractParameterizer {
+ @Override
+ protected SURFINGDependenceMeasure makeInstance() {
+ return STATIC;
+ }
+ }
+}
diff --git a/src/de/lmu/ifi/dbs/elki/math/statistics/dependence/SlopeDependenceMeasure.java b/src/de/lmu/ifi/dbs/elki/math/statistics/dependence/SlopeDependenceMeasure.java
new file mode 100644
index 00000000..3dd4fcaa
--- /dev/null
+++ b/src/de/lmu/ifi/dbs/elki/math/statistics/dependence/SlopeDependenceMeasure.java
@@ -0,0 +1,153 @@
+package de.lmu.ifi.dbs.elki.math.statistics.dependence;
+
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2014
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+import de.lmu.ifi.dbs.elki.utilities.datastructures.arraylike.NumberArrayAdapter;
+import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
+
+/**
+ * Arrange dimensions based on the entropy of the slope spectrum.
+ *
+ * Reference:
+ * <p>
+ * Elke Achtert, Hans-Peter Kriegel, Erich Schubert, Arthur Zimek:<br />
+ * Interactive Data Mining with 3D-Parallel-Coordinate-Trees.<br />
+ * Proceedings of the 2013 ACM International Conference on Management of Data
+ * (SIGMOD), New York City, NY, 2013.
+ * </p>
+ *
+ * TODO: shouldn't this be normalized by the single-dimension entropies or so?
+ *
+ * @author Erich Schubert
+ * @author Robert Rödler
+ */
+@Reference(authors = "Elke Achtert, Hans-Peter Kriegel, Erich Schubert, Arthur Zimek", //
+title = "Interactive Data Mining with 3D-Parallel-Coordinate-Trees", //
+booktitle = "Proc. of the 2013 ACM International Conference on Management of Data (SIGMOD)", //
+url = "http://dx.doi.org/10.1145/2463676.2463696")
+public class SlopeDependenceMeasure extends AbstractDependenceMeasure {
+ /**
+ * Static instance.
+ */
+ public static final SlopeDependenceMeasure STATIC = new SlopeDependenceMeasure();
+
+ /**
+ * Full precision.
+ */
+ protected final static int PRECISION = 40;
+
+ /**
+ * Precision for entropy normalization.
+ */
+ protected final static double LOG_PRECISION = Math.log(PRECISION);
+
+ /**
+ * Scaling factor.
+ */
+ protected final static double RESCALE = PRECISION * .5;
+
+ /**
+ * Constructor. Use static instance instead!
+ */
+ protected SlopeDependenceMeasure() {
+ super();
+ }
+
+ @Override
+ public <A, B> double dependence(NumberArrayAdapter<?, A> adapter1, A data1, NumberArrayAdapter<?, B> adapter2, B data2) {
+ final int len = size(adapter1, data1, adapter2, data2);
+
+ // Get attribute value range:
+ final double off1, scale1, off2, scale2;
+ {
+ double mi = adapter1.getDouble(data1, 0), ma = mi;
+ for(int i = 1; i < len; ++i) {
+ double v = adapter1.getDouble(data1, i);
+ if(v < mi) {
+ mi = v;
+ }
+ else if(v > ma) {
+ ma = v;
+ }
+ }
+ off1 = mi;
+ scale1 = (ma > mi) ? (1. / (ma - mi)) : 1.;
+ // Second data
+ mi = adapter2.getDouble(data2, 0);
+ ma = mi;
+ for(int i = 1; i < len; ++i) {
+ double v = adapter2.getDouble(data2, i);
+ if(v < mi) {
+ mi = v;
+ }
+ else if(v > ma) {
+ ma = v;
+ }
+ }
+ off2 = mi;
+ scale2 = (ma > mi) ? (1. / (ma - mi)) : 1.;
+ }
+
+ // Collect angular histograms.
+ // Note, we only fill half of the matrix
+ int[] angles = new int[PRECISION];
+
+ // Scratch buffer
+ for(int i = 0; i < len; i++) {
+ double x = adapter1.getDouble(data1, i), y = adapter2.getDouble(data2, i);
+ x = (x - off1) * scale1;
+ y = (y - off2) * scale2;
+ final double delta = x - y + 1;
+ int div = (int) Math.round(delta * RESCALE);
+ // TODO: do we really need this check?
+ div = (div < 0) ? 0 : (div >= PRECISION) ? PRECISION - 1 : div;
+ angles[div] += 1;
+ }
+
+ // Compute entropy:
+ double entropy = 0.;
+ for(int l = 0; l < PRECISION; l++) {
+ if(angles[l] > 0) {
+ final double p = angles[l] / (double) len;
+ entropy += p * Math.log(p);
+ }
+ }
+ return 1 + entropy / LOG_PRECISION;
+ }
+
+ /**
+ * Parameterization class.
+ *
+ * @author Erich Schubert
+ *
+ * @apiviz.exclude
+ */
+ public static class Parameterizer extends AbstractParameterizer {
+ @Override
+ protected SlopeDependenceMeasure makeInstance() {
+ return STATIC;
+ }
+ }
+}
diff --git a/src/de/lmu/ifi/dbs/elki/math/statistics/dependence/SlopeInversionDependenceMeasure.java b/src/de/lmu/ifi/dbs/elki/math/statistics/dependence/SlopeInversionDependenceMeasure.java
new file mode 100644
index 00000000..01ad78bc
--- /dev/null
+++ b/src/de/lmu/ifi/dbs/elki/math/statistics/dependence/SlopeInversionDependenceMeasure.java
@@ -0,0 +1,157 @@
+package de.lmu.ifi.dbs.elki.math.statistics.dependence;
+
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2014
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+import de.lmu.ifi.dbs.elki.utilities.datastructures.arraylike.NumberArrayAdapter;
+import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
+
+/**
+ * Arrange dimensions based on the entropy of the slope spectrum.
+ *
+ * Reference:
+ * <p>
+ * Elke Achtert, Hans-Peter Kriegel, Erich Schubert, Arthur Zimek:<br />
+ * Interactive Data Mining with 3D-Parallel-Coordinate-Trees.<br />
+ * Proceedings of the 2013 ACM International Conference on Management of Data
+ * (SIGMOD), New York City, NY, 2013.
+ * </p>
+ *
+ * TODO: shouldn't this be normalized by the single-dimension entropies or so?
+ *
+ * @author Erich Schubert
+ * @author Robert Rödler
+ */
+@Reference(authors = "Elke Achtert, Hans-Peter Kriegel, Erich Schubert, Arthur Zimek", //
+title = "Interactive Data Mining with 3D-Parallel-Coordinate-Trees", //
+booktitle = "Proc. of the 2013 ACM International Conference on Management of Data (SIGMOD)", //
+url = "http://dx.doi.org/10.1145/2463676.2463696")
+public class SlopeInversionDependenceMeasure extends SlopeDependenceMeasure {
+ /**
+ * Static instance.
+ */
+ public static final SlopeInversionDependenceMeasure STATIC = new SlopeInversionDependenceMeasure();
+
+ /**
+ * Constructor. Use static instance instead!
+ */
+ protected SlopeInversionDependenceMeasure() {
+ super();
+ }
+
+ @Override
+ public <A, B> double dependence(NumberArrayAdapter<?, A> adapter1, A data1, NumberArrayAdapter<?, B> adapter2, B data2) {
+ final int len = size(adapter1, data1, adapter2, data2);
+
+ // Get attribute value range:
+ final double off1, scale1, off2, scale2;
+ {
+ double mi = adapter1.getDouble(data1, 0), ma = mi;
+ for(int i = 1; i < len; ++i) {
+ double v = adapter1.getDouble(data1, i);
+ if(v < mi) {
+ mi = v;
+ }
+ else if(v > ma) {
+ ma = v;
+ }
+ }
+ off1 = mi;
+ scale1 = (ma > mi) ? (1. / (ma - mi)) : 1.;
+ // Second data
+ mi = adapter2.getDouble(data2, 0);
+ ma = mi;
+ for(int i = 1; i < len; ++i) {
+ double v = adapter2.getDouble(data2, i);
+ if(v < mi) {
+ mi = v;
+ }
+ else if(v > ma) {
+ ma = v;
+ }
+ }
+ off2 = mi;
+ scale2 = (ma > mi) ? (1. / (ma - mi)) : 1.;
+ }
+
+ // Collect angular histograms.
+ // Note, we only fill half of the matrix
+ int[] angles = new int[PRECISION];
+ int[] angleI = new int[PRECISION];
+
+ // Scratch buffer
+ for(int i = 0; i < len; i++) {
+ double x = adapter1.getDouble(data1, i), y = adapter2.getDouble(data2, i);
+ x = (x - off1) * scale1;
+ y = (y - off2) * scale2;
+ {
+ final double delta = x - y + 1;
+ int div = (int) Math.round(delta * RESCALE);
+ // TODO: do we really need this check?
+ div = (div < 0) ? 0 : (div >= PRECISION) ? PRECISION - 1 : div;
+ angles[div] += 1;
+ }
+ {
+ final double delta = x + y;
+ int div = (int) Math.round(delta * RESCALE);
+ // TODO: do we really need this check?
+ div = (div < 0) ? 0 : (div >= PRECISION) ? PRECISION - 1 : div;
+ angleI[div] += 1;
+ }
+ }
+
+ // Compute entropy:
+ double entropy = 0., entropyI = 0.;
+ for(int l = 0; l < PRECISION; l++) {
+ if(angles[l] > 0) {
+ final double p = angles[l] / (double) len;
+ entropy += p * Math.log(p);
+ }
+ if(angleI[l] > 0) {
+ final double p = angleI[l] / (double) len;
+ entropyI += p * Math.log(p);
+ }
+ }
+ if(entropy >= entropyI) {
+ return 1 + entropy / LOG_PRECISION;
+ }
+ else {
+ return 1 + entropyI / LOG_PRECISION;
+ }
+ }
+
+ /**
+ * Parameterization class.
+ *
+ * @author Erich Schubert
+ *
+ * @apiviz.exclude
+ */
+ public static class Parameterizer extends AbstractParameterizer {
+ @Override
+ protected SlopeInversionDependenceMeasure makeInstance() {
+ return STATIC;
+ }
+ }
+}
diff --git a/src/de/lmu/ifi/dbs/elki/math/statistics/dependence/SpearmanCorrelationDependenceMeasure.java b/src/de/lmu/ifi/dbs/elki/math/statistics/dependence/SpearmanCorrelationDependenceMeasure.java
new file mode 100644
index 00000000..24202dce
--- /dev/null
+++ b/src/de/lmu/ifi/dbs/elki/math/statistics/dependence/SpearmanCorrelationDependenceMeasure.java
@@ -0,0 +1,78 @@
+package de.lmu.ifi.dbs.elki.math.statistics.dependence;
+
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2014
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+import de.lmu.ifi.dbs.elki.utilities.datastructures.arraylike.NumberArrayAdapter;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
+
+/**
+ * Spearman rank-correlation coefficient, also known as Spearman's rho.
+ *
+ * @author Erich Schubert
+ */
+public class SpearmanCorrelationDependenceMeasure extends AbstractDependenceMeasure {
+ /**
+ * Static instance.
+ */
+ public static final SpearmanCorrelationDependenceMeasure STATIC = new SpearmanCorrelationDependenceMeasure();
+
+ /**
+ * Constructor - use {@link #STATIC} instance.
+ */
+ protected SpearmanCorrelationDependenceMeasure() {
+ super();
+ }
+
+ @Override
+ public <A, B> double dependence(NumberArrayAdapter<?, A> adapter1, A data1, NumberArrayAdapter<?, B> adapter2, B data2) {
+ final int len = size(adapter1, data1, adapter2, data2);
+ double[] ranks1 = computeNormalizedRanks(adapter1, data1, len);
+ double[] ranks2 = computeNormalizedRanks(adapter2, data2, len);
+
+ // Second pass: variances and covariance
+ double v1 = 0., v2 = 0., cov = 0.;
+ for(int i = 0; i < len; i++) {
+ double d1 = ranks1[i] - .5, d2 = ranks2[i] - .5;
+ v1 += d1 * d1;
+ v2 += d2 * d2;
+ cov += d1 * d2;
+ }
+ // Note: we did not normalize by len, as this cancels out.
+ return cov / Math.sqrt(v1 * v2);
+ }
+
+ /**
+ * Parameterization class
+ *
+ * @author Erich Schubert
+ *
+ * @apiviz.exclude
+ */
+ public static class Parameterizer extends AbstractParameterizer {
+ @Override
+ protected SpearmanCorrelationDependenceMeasure makeInstance() {
+ return STATIC;
+ }
+ }
+}
diff --git a/src/de/lmu/ifi/dbs/elki/math/statistics/dependence/package-info.java b/src/de/lmu/ifi/dbs/elki/math/statistics/dependence/package-info.java
new file mode 100644
index 00000000..a1f0bd55
--- /dev/null
+++ b/src/de/lmu/ifi/dbs/elki/math/statistics/dependence/package-info.java
@@ -0,0 +1,26 @@
+/**
+ * <p>Statistical measures of dependence, such as correlation.</p>
+ */
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2014
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+package de.lmu.ifi.dbs.elki.math.statistics.dependence; \ No newline at end of file