diff options
Diffstat (limited to 'src/de/lmu/ifi/dbs/elki/math/statistics/dependence')
15 files changed, 2503 insertions, 0 deletions
diff --git a/src/de/lmu/ifi/dbs/elki/math/statistics/dependence/AbstractDependenceMeasure.java b/src/de/lmu/ifi/dbs/elki/math/statistics/dependence/AbstractDependenceMeasure.java new file mode 100644 index 00000000..ba23588e --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/math/statistics/dependence/AbstractDependenceMeasure.java @@ -0,0 +1,262 @@ +package de.lmu.ifi.dbs.elki.math.statistics.dependence; + +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2014 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +import java.util.Collection; +import java.util.Iterator; +import java.util.List; + +import de.lmu.ifi.dbs.elki.math.MathUtil; +import de.lmu.ifi.dbs.elki.utilities.datastructures.arraylike.ArrayLikeUtil; +import de.lmu.ifi.dbs.elki.utilities.datastructures.arraylike.NumberArrayAdapter; +import de.lmu.ifi.dbs.elki.utilities.datastructures.arrays.IntegerArrayQuickSort; +import de.lmu.ifi.dbs.elki.utilities.datastructures.arrays.IntegerComparator; +import de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException; + +/** + * Abstract base class for dependence measures. 
+ * + * @author Erich Schubert + */ +public abstract class AbstractDependenceMeasure implements DependenceMeasure { + // In your subclass, you will need to implement at least this method: + // Sorry for the complex use of generics, but this way we can use it with any + // type of array, including vectors in rows or columns etc. + @Override + abstract public <A, B> double dependence(NumberArrayAdapter<?, A> adapter1, A data1, NumberArrayAdapter<?, B> adapter2, B data2); + + @Override + public double dependence(double[] data1, double[] data2) { + return dependence(ArrayLikeUtil.DOUBLEARRAYADAPTER, data1, ArrayLikeUtil.DOUBLEARRAYADAPTER, data2); + } + + @Override + public <A> double dependence(NumberArrayAdapter<?, A> adapter, A data1, A data2) { + return dependence(adapter, data1, adapter, data2); + } + + @Override + public <A> double[] dependence(NumberArrayAdapter<?, A> adapter, List<? extends A> data) { + final int dims = data.size(); + double[] out = new double[(dims * (dims - 1)) >> 1]; + int o = 0; + for(int y = 1; y < dims; y++) { + A dy = data.get(y); + for(int x = 0; x < y; x++) { + out[o++] = dependence(adapter, data.get(x), adapter, dy); + } + } + return out; + } + + /** + * Clamp values to a given minimum and maximum. + * + * @param value True value + * @param min Minimum + * @param max Maximum + * @return {@code value}, unless smaller than {@code min} or larger than + * {@code max}. + */ + protected static double clamp(double value, double min, double max) { + return value < min ? min : value > max ? max : value; + } + + /** + * Compute ranks of all objects, normalized to [0;1] (where 0 is the smallest + * value, 1 is the largest). 
+ * + * @param adapter Data adapter + * @param data Data array + * @param len Length of data + * @return Array of scores + */ + protected static <A> double[] computeNormalizedRanks(final NumberArrayAdapter<?, A> adapter, final A data, int len) { + // Sort the objects: + int[] s1 = sortedIndex(adapter, data, len); + final double norm = .5 / (len - 1); + double[] ret = new double[len]; + for(int i = 0; i < len;) { + final int start = i++; + final double val = adapter.getDouble(data, s1[start]); + while(i < len && adapter.getDouble(data, s1[i]) <= val) { + i++; + } + final double score = (start + i - 1) * norm; + for(int j = start; j < i; j++) { + ret[s1[j]] = score; + } + } + return ret; + } + + /** + * Compute ranks of all objects, ranging from 1 to len. + * + * Ties are given the average rank. + * + * @param adapter Data adapter + * @param data Data array + * @param len Length of data + * @return Array of scores + */ + protected static <A> double[] ranks(final NumberArrayAdapter<?, A> adapter, final A data, int len) { + // Sort the objects: + int[] s1 = sortedIndex(adapter, data, len); + return ranks(adapter, data, s1); + } + + /** + * Compute ranks of all objects, ranging from 1 to len. + * + * Ties are given the average rank. + * + * @param adapter Data adapter + * @param data Data array + * @param idx Data index + * @return Array of scores + */ + protected static <A> double[] ranks(final NumberArrayAdapter<?, A> adapter, final A data, int[] idx) { + final int len = idx.length; + double[] ret = new double[len]; + for(int i = 0; i < len;) { + final int start = i++; + final double val = adapter.getDouble(data, idx[start]); + // Include ties: + while(i < len && adapter.getDouble(data, idx[i]) <= val) { + i++; + } + final double score = (start + i - 1) * .5 + 1; + for(int j = start; j < i; j++) { + ret[idx[j]] = score; + } + } + return ret; + } + + /** + * Build a sorted index of objects. 
+ * + * @param adapter Data adapter + * @param data Data array + * @param len Length of data + * @return Sorted index + */ + protected static <A> int[] sortedIndex(final NumberArrayAdapter<?, A> adapter, final A data, int len) { + int[] s1 = MathUtil.sequence(0, len); + IntegerArrayQuickSort.sort(s1, new IntegerComparator() { + @Override + public int compare(int x, int y) { + return Double.compare(adapter.getDouble(data, x), adapter.getDouble(data, y)); + } + }); + return s1; + } + + /** + * Discretize a data set into equi-width bin numbers. + * + * @param adapter Data adapter + * @param data Data array + * @param len Length of data + * @param bins Number of bins + * @return Array of bin numbers [0;bin[ + */ + protected static <A> int[] discretize(NumberArrayAdapter<?, A> adapter, A data, final int len, final int bins) { + double min = adapter.getDouble(data, 0), max = min; + for(int i = 1; i < len; i++) { + double v = adapter.getDouble(data, i); + if(v < min) { + min = v; + } + else if(v > max) { + max = v; + } + } + final double scale = (max > min) ? bins / (max - min) : 1; + int[] discData = new int[len]; + for(int i = 0; i < len; i++) { + int bin = (int) Math.floor((adapter.getDouble(data, i) - min) * scale); + discData[i] = bin < 0 ? 0 : bin >= bins ? bins - 1 : bin; + } + return discData; + } + + /** + * Index into the serialized array. 
+ * + * @param x Column + * @param y Row + * @return Index in serialized array + */ + protected static int index(int x, int y) { + assert (x < y) : "Only lower triangle is allowed."; + return ((y * (y - 1)) >> 1) + x; + } + + /** + * Validate the length of the two data sets (must be the same, and non-zero) + * + * @param adapter1 First data adapter + * @param data1 First data set + * @param adapter2 Second data adapter + * @param data2 Second data set + * @param <A> First array type + * @param <B> Second array type + */ + public static <A, B> int size(NumberArrayAdapter<?, A> adapter1, A data1, NumberArrayAdapter<?, B> adapter2, B data2) { + final int len = adapter1.size(data1); + if(len != adapter2.size(data2)) { + throw new AbortException("Array sizes do not match!"); + } + if(len == 0) { + throw new AbortException("Empty array!"); + } + return len; + } + + /** + * Validate the length of the two data sets (must be the same, and non-zero) + * + * @param adapter Data adapter + * @param data Data sets + * @param <A> First array type + */ + public static <A> int size(NumberArrayAdapter<?, A> adapter, Collection<? extends A> data) { + if(data.size() < 2) { + throw new AbortException("Need at least two axes to compute dependence measures."); + } + Iterator<? 
extends A> iter = data.iterator(); + final int len = adapter.size(iter.next()); + while(iter.hasNext()) { + if(len != adapter.size(iter.next())) { + throw new AbortException("Array sizes do not match!"); + } + } + if(len == 0) { + throw new AbortException("Empty array!"); + } + return len; + } +} diff --git a/src/de/lmu/ifi/dbs/elki/math/statistics/dependence/CorrelationDependenceMeasure.java b/src/de/lmu/ifi/dbs/elki/math/statistics/dependence/CorrelationDependenceMeasure.java new file mode 100644 index 00000000..e03601f7 --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/math/statistics/dependence/CorrelationDependenceMeasure.java @@ -0,0 +1,141 @@ +package de.lmu.ifi.dbs.elki.math.statistics.dependence; + +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2014 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +import java.util.List; + +import de.lmu.ifi.dbs.elki.logging.Logging; +import de.lmu.ifi.dbs.elki.utilities.datastructures.arraylike.NumberArrayAdapter; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer; + +/** + * Pearson product-moment correlation coefficient. 
+ * + * @author Erich Schubert + */ +public class CorrelationDependenceMeasure extends AbstractDependenceMeasure { + /** + * Class logger. + */ + private static final Logging LOG = Logging.getLogger(CorrelationDependenceMeasure.class); + + /** + * Static instance. + */ + public static final CorrelationDependenceMeasure STATIC = new CorrelationDependenceMeasure(); + + /** + * Constructor - use {@link #STATIC} instance. + */ + protected CorrelationDependenceMeasure() { + super(); + } + + @Override + public <A, B> double dependence(NumberArrayAdapter<?, A> adapter1, A data1, NumberArrayAdapter<?, B> adapter2, B data2) { + final int len = size(adapter1, data1, adapter2, data2); + // Perform two-pass estimation, which is numerically stable and often faster + // than the Knuth-Welford approach (see PearsonCorrelation class) + double m1 = 0., m2 = 0; + for(int i = 0; i < len; i++) { + m1 += adapter1.getDouble(data1, i); + m2 += adapter2.getDouble(data2, i); + } + m1 /= len; + m2 /= len; + // Second pass: variances and covariance + double v1 = 0., v2 = 0., cov = 0.; + for(int i = 0; i < len; i++) { + double d1 = adapter1.getDouble(data1, i) - m1; + double d2 = adapter2.getDouble(data2, i) - m2; + v1 += d1 * d1; + v2 += d2 * d2; + cov += d1 * d2; + } + // Note: we did not normalize by len, as this cancels out. + return cov / Math.sqrt(v1 * v2); + } + + @Override + public <A> double[] dependence(NumberArrayAdapter<?, A> adapter, List<? extends A> data) { + final int dims = data.size(); + final int len = size(adapter, data); + double[] means = new double[dims]; + // Two passes - often faster due to the lower numerical cost + // And accurate, don't use sum-of-squares. 
+ for(int j = 0; j < dims; j++) { + double m = 0.; + A da = data.get(j); + for(int i = 0; i < len; i++) { + m += adapter.getDouble(da, i); + } + means[j] = m / len; + } + // Build the covariance matrix, lower triangular half + double[] vst = new double[dims]; + double[] cov = new double[(dims * (dims - 1)) >> 1]; + double[] buf = new double[dims]; + for(int i = 0; i < len; i++) { + for(int j = 0; j < dims; j++) { + buf[j] = adapter.getDouble(data.get(j), i) - means[j]; + } + for(int y = 0, c = 0; y < dims; y++) { + for(int x = 0; x < y; x++) { + cov[c++] += buf[x] * buf[y]; + } + vst[y] += buf[y] * buf[y]; + } + } + // Compute standard deviations (times sqrt(len)!): + for(int y = 0; y < dims; y++) { + if(vst[y] == 0.) { + LOG.warning("Correlation is not well defined for constant attributes."); + } + vst[y] = Math.sqrt(vst[y]); + } + for(int y = 1, c = 0; y < dims; y++) { + for(int x = 0; x < y; x++) { + // We don't need to divide by sqrt(len), because it will cancel out with + // the division we skipped just above. 
+ cov[c] = cov[c] / (vst[x] * vst[y]); + c++; + } + } + return cov; + } + + /** + * Parameterization class + * + * @author Erich Schubert + * + * @apiviz.exclude + */ + public static class Parameterizer extends AbstractParameterizer { + @Override + protected CorrelationDependenceMeasure makeInstance() { + return STATIC; + } + } +} diff --git a/src/de/lmu/ifi/dbs/elki/math/statistics/dependence/DependenceMeasure.java b/src/de/lmu/ifi/dbs/elki/math/statistics/dependence/DependenceMeasure.java new file mode 100644 index 00000000..0e9e02c0 --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/math/statistics/dependence/DependenceMeasure.java @@ -0,0 +1,98 @@ +package de.lmu.ifi.dbs.elki.math.statistics.dependence; + +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2014 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +import java.util.List; + +import de.lmu.ifi.dbs.elki.utilities.datastructures.arraylike.NumberArrayAdapter; + +/** + * Measure the dependence of two variables. + * + * @author Erich Schubert + */ +public interface DependenceMeasure { + /** + * Measure the dependence of two variables. 
+ * + * This is the more flexible API, which allows using different internal data + * representations. + * + * @param adapter1 First data adapter + * @param data1 First data set + * @param adapter2 Second data adapter + * @param data2 Second data set + * @param <A> First array type + * @param <B> Second array type + * @return Dependence measure + */ + <A, B> double dependence(NumberArrayAdapter<?, A> adapter1, A data1, NumberArrayAdapter<?, B> adapter2, B data2); + + /** + * Measure the dependence of two variables. + * + * This is the more flexible API, which allows using different internal data + * representations. + * + * @param adapter Array type adapter + * @param data1 First data set + * @param data2 Second data set + * @param <A> Array type + * @return Dependence measure + */ + <A> double dependence(NumberArrayAdapter<?, A> adapter, A data1, A data2); + + /** + * Measure the dependence of two variables. + * + * This is the more flexible API, which allows using different internal data + * representations. + * + * The resulting data is a serialized lower triangular matrix: + * + * <pre> + * X S S S S S + * 0 X S S S S + * 1 2 X S S S + * 3 4 5 X S S + * 6 7 8 9 X S + * 10 11 12 13 14 X + * </pre> + * + * @param adapter Data adapter + * @param data Data sets. Must have fast random access! + * @param <A> Array type + * @return Lower triangular serialized matrix + */ + <A> double[] dependence(NumberArrayAdapter<?, A> adapter, List<? extends A> data); + + /** + * Measure the dependence of two variables. 
+ * + * @param data1 First data set + * @param data2 Second data set + * @return Dependence measure + */ + double dependence(double[] data1, double[] data2); +} diff --git a/src/de/lmu/ifi/dbs/elki/math/statistics/dependence/DistanceCorrelationDependenceMeasure.java b/src/de/lmu/ifi/dbs/elki/math/statistics/dependence/DistanceCorrelationDependenceMeasure.java new file mode 100644 index 00000000..1ae8d6b5 --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/math/statistics/dependence/DistanceCorrelationDependenceMeasure.java @@ -0,0 +1,206 @@ +package de.lmu.ifi.dbs.elki.math.statistics.dependence;
+
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2014
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+import java.util.List;
+
+import de.lmu.ifi.dbs.elki.utilities.datastructures.arraylike.NumberArrayAdapter;
+import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
+
+/**
+ * Distance correlation.
+ *
+ * The value returned is the square root of the dCor² value. This matches the R
+ * implementation by the original authors.
+ *
+ * Reference:
+ * <p>
+ * Székely, G. J., Rizzo, M. L., & Bakirov, N. K.<br />
+ * Measuring and testing dependence by correlation of distances<br />
+ * The Annals of Statistics, 35(6), 2769-2794
+ * </p>
+ *
+ * Implementation notice: we exploit symmetry, and thus store the matrixes as
+ * packed lower triangles including the diagonal. While initially the diagonal
+ * is zero, after double-centering the matrix these values can become non-zero!
+ *
+ * @author Marie Kiermeier
+ * @author Erich Schubert
+ */
+@Reference(authors = "Székely, G. J., Rizzo, M. L., & Bakirov, N. K.", //
+title = "Measuring and testing dependence by correlation of distances", //
+booktitle = "The Annals of Statistics, 35(6), 2769-2794", //
+url = "http://dx.doi.org/10.1214/009053607000000505")
+public class DistanceCorrelationDependenceMeasure extends AbstractDependenceMeasure {
+ /**
+ * Static instance.
+ */
+ public static final DistanceCorrelationDependenceMeasure STATIC = new DistanceCorrelationDependenceMeasure();
+
+ /**
+ * Constructor - use {@link #STATIC} instance instead!
+ */
+ protected DistanceCorrelationDependenceMeasure() {
+ super();
+ }
+
+ @Override
+ public <A, B> double dependence(NumberArrayAdapter<?, A> adapter1, A data1, NumberArrayAdapter<?, B> adapter2, B data2) {
+ final int len = size(adapter1, data1, adapter2, data2);
+ double[] dMatrixA = computeDistances(adapter1, data1);
+ double[] dMatrixB = computeDistances(adapter2, data2);
+
+ // distance variance
+ // Note: negated comparison also catches NaN. A zero (or non-finite)
+ // variance means a constant attribute; report no dependence then.
+ double dVarA = computeDCovar(dMatrixA, dMatrixA, len);
+ if(!(dVarA > 0.)) {
+ return 0.;
+ }
+ double dVarB = computeDCovar(dMatrixB, dMatrixB, len);
+ if(!(dVarB > 0.)) {
+ return 0.;
+ }
+ double dCovar = computeDCovar(dMatrixA, dMatrixB, len);
+ // distance correlation
+ // dCor = sqrt(dCov² / sqrt(dVarA * dVarB)), cf. class Javadoc.
+ return Math.sqrt(dCovar / Math.sqrt(dVarA * dVarB));
+ }
+
+ @Override
+ public <A> double[] dependence(NumberArrayAdapter<?, A> adapter, List<? extends A> data) {
+ final int dims = data.size();
+ final int len = size(adapter, data);
+ // Precompute one double-centered distance matrix per dimension.
+ double[][] dMatrix = new double[dims][];
+ for(int i = 0; i < dims; i++) {
+ dMatrix[i] = computeDistances(adapter, data.get(i));
+ }
+ double[] dVar = new double[dims];
+ for(int i = 0; i < dims; i++) {
+ dVar[i] = computeDCovar(dMatrix[i], dMatrix[i], len);
+ }
+ // Output: serialized lower triangular matrix, row by row (x < y).
+ double[] dCor = new double[(dims * (dims - 1)) >> 1];
+ for(int y = 1, c = 0; y < dims; y++) {
+ for(int x = 0; x < y; x++) {
+ // Negated comparison also catches NaN; constant attributes yield 0.
+ if(!(dVar[x] * dVar[y] > 0.)) {
+ dCor[c++] = 0.;
+ continue;
+ }
+ double dCovar = computeDCovar(dMatrix[x], dMatrix[y], len);
+ dCor[c++] = Math.sqrt(dCovar / Math.sqrt(dVar[x] * dVar[y]));
+ }
+ }
+ return dCor;
+ }
+
+ /**
+ * Compute the double-centered delta matrix.
+ *
+ * Pairwise absolute differences (one-dimensional distances) are stored as a
+ * packed lower triangular matrix including the diagonal, and then
+ * double-centered via {@link #doubleCenterMatrix}.
+ *
+ * @param adapter Data adapter
+ * @param data Input data
+ * @return Double-centered delta matrix, packed lower triangular.
+ */
+ protected static <A> double[] computeDistances(NumberArrayAdapter<?, A> adapter, A data) {
+ final int size = adapter.size(data);
+ // Packed lower triangle, diagonal included: size*(size+1)/2 entries.
+ double[] dMatrix = new double[(size * (size + 1)) >> 1];
+ for(int i = 0, c = 0; i < size; i++) {
+ for(int j = 0; j < i; j++) {
+ double dx = adapter.getDouble(data, i) - adapter.getDouble(data, j);
+ dMatrix[c++] = (dx < 0) ? -dx : dx; // Absolute difference.
+ }
+ c++; // Diagonal entry: zero
+ }
+ doubleCenterMatrix(dMatrix, size);
+ return dMatrix;
+ }
+
+ /**
+ * Double-center the given distance matrix in place: from every entry,
+ * subtract the corresponding row mean and column mean, and add the overall
+ * matrix mean. Afterwards, diagonal entries may be non-zero.
+ *
+ * @param dMatrix packed lower triangular distance matrix of one axis,
+ * modified in place
+ * @param size number of data points (rows and columns of the matrix)
+ */
+ public static void doubleCenterMatrix(double[] dMatrix, int size) {
+ double[] rowMean = new double[size];
+ // row sum
+ // Each off-diagonal value contributes to both its row and its column
+ // (symmetry); the diagonal is zero and contributes nothing.
+ for(int i = 0, c = 0; i < size; i++) {
+ for(int j = 0; j < i; j++) {
+ double v = dMatrix[c++];
+ rowMean[i] += v;
+ rowMean[j] += v;
+ }
+ assert (dMatrix[c] == 0.);
+ c++; // Diagonal entry. Must be zero!
+ }
+ // Normalize averages:
+ // matrixMean sums the (un-normalized) row sums, i.e. the full matrix.
+ double matrixMean = 0.;
+ for(int i = 0; i < size; i++) {
+ matrixMean += rowMean[i];
+ rowMean[i] /= size;
+ }
+ matrixMean /= size * size;
+
+ for(int o = 0, c = 0; o < size; o++) {
+ // Including row mean!
+ for(int p = 0; p <= o; p++) {
+ dMatrix[c++] -= rowMean[o] + rowMean[p] - matrixMean;
+ }
+ }
+ }
+
+ /**
+ * Computes the distance covariance for two axis. Can also be used to compute
+ * the distance variance of one axis (dVarMatrixA = dVarMatrixB).
+ *
+ * @param dVarMatrixA double-centered distance matrix of the first axis,
+ * packed lower triangular
+ * @param dVarMatrixB double-centered distance matrix of the second axis,
+ * packed lower triangular
+ * @param n number of points
+ * @return distance covariance
+ */
+ protected double computeDCovar(double[] dVarMatrixA, double[] dVarMatrixB, int n) {
+ double result = 0.;
+ for(int i = 0, c = 0; i < n; i++) {
+ for(int j = 0; j < i; j++) {
+ // Off-diagonal entries count twice, by symmetry.
+ result += 2. * dVarMatrixA[c] * dVarMatrixB[c];
+ c++;
+ }
+ // Diagonal entry.
+ result += dVarMatrixA[c] * dVarMatrixB[c];
+ c++;
+ }
+ return result / (n * n);
+ }
+
+ /**
+ * Parameterization class
+ *
+ * @author Marie Kiermeier
+ *
+ * @apiviz.exclude
+ */
+ public static class Parameterizer extends AbstractParameterizer {
+ @Override
+ protected DistanceCorrelationDependenceMeasure makeInstance() {
+ return STATIC;
+ }
+ }
+}
diff --git a/src/de/lmu/ifi/dbs/elki/math/statistics/dependence/HSMDependenceMeasure.java b/src/de/lmu/ifi/dbs/elki/math/statistics/dependence/HSMDependenceMeasure.java new file mode 100644 index 00000000..aaacb888 --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/math/statistics/dependence/HSMDependenceMeasure.java @@ -0,0 +1,245 @@ +package de.lmu.ifi.dbs.elki.math.statistics.dependence; + +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2014 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + */ +import de.lmu.ifi.dbs.elki.math.SinCosTable; +import de.lmu.ifi.dbs.elki.utilities.datastructures.arraylike.NumberArrayAdapter; +import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer; + +/** + * Compute the similarity of dimensions by using a hough transformation. + * + * Reference: <br> + * <p> + * A. Tatu, G. Albuquerque, M. Eisemann, P. Bak, H. Theisel, M. A. Magnor, and + * D. A. Keim.<br /> + * Automated Analytical Methods to Support Visual Exploration of High- + * Dimensional Data. <br/> + * IEEEVisualization and Computer Graphics, 2011. + * </p> + * + * FIXME: This needs serious TESTING before release. 
Large parts have been + * rewritten, but could not be tested at the time of rewriting. + * + * @author Erich Schubert + * @author Robert Rödler + */ +@Reference(authors = "A. Tatu, G. Albuquerque, M. Eisemann, P. Bak, H. Theisel, M. A. Magnor, and D. A. Keim", // +title = "Automated Analytical Methods to Support Visual Exploration of High-Dimensional Data", // +booktitle = "IEEE Trans. Visualization and Computer Graphics, 2011", // +url = "http://dx.doi.org/10.1109/TVCG.2010.242") +public class HSMDependenceMeasure extends AbstractDependenceMeasure { + /** + * Static instance. + */ + public static final HSMDependenceMeasure STATIC = new HSMDependenceMeasure(); + + /** + * Angular resolution. Best if divisible by 4: smaller tables. + * + * The original publication used 50. + */ + private final static int STEPS = 48; // 64; + + /** + * Resolution of image. + */ + private final int resolution = 512; + + /** + * Precompute sinus and cosinus + */ + private final static SinCosTable table = SinCosTable.make(STEPS); + + @Override + public <A, B> double dependence(NumberArrayAdapter<?, A> adapter1, A data1, NumberArrayAdapter<?, B> adapter2, B data2) { + final int len = size(adapter1, data1, adapter2, data2); + boolean[][] pic = new boolean[resolution][resolution]; + + // Get attribute value range: + final double off1, scale1, off2, scale2; + { + double mi = adapter1.getDouble(data1, 0), ma = mi; + for(int i = 1; i < len; ++i) { + double v = adapter1.getDouble(data1, i); + if(v < mi) { + mi = v; + } + else if(v > ma) { + ma = v; + } + } + off1 = mi; + scale1 = (ma > mi) ? (1. / (ma - mi)) : 1.; + // Second data + mi = adapter2.getDouble(data2, 0); + ma = mi; + for(int i = 1; i < len; ++i) { + double v = adapter2.getDouble(data2, i); + if(v < mi) { + mi = v; + } + else if(v > ma) { + ma = v; + } + } + off2 = mi; + scale2 = (ma > mi) ? (1. 
/ (ma - mi)) : 1.; + } + // Iterate dataset + for(int i = 0; i < len; ++i) { + double xi = (adapter1.getDouble(data1, i) - off1) * scale1; + double xj = (adapter2.getDouble(data2, i) - off2) * scale2; + drawLine(0, (int) (resolution * xi), resolution - 1, (int) (resolution * xj), pic); + } + + final double stepsq = (double) STEPS * (double) STEPS; + int[][] hough = houghTransformation(pic); + // The original publication said "median", but judging from the text, + // meant "mean". Otherwise, always half of the cells are above the + // threshold, which doesn't match the explanation there. + double mean = sumMatrix(hough) / stepsq; + int abovemean = countAboveThreshold(hough, mean); + + return 1. - (abovemean / stepsq); + } + + /** + * Compute the sum of a matrix. + * + * @param mat Matrix + * @return Sum of all elements + */ + private long sumMatrix(int[][] mat) { + long ret = 0; + for(int i = 0; i < mat.length; i++) { + final int[] row = mat[i]; + for(int j = 0; j < row.length; j++) { + ret += row[j]; + } + } + return ret; + } + + /** + * Count the number of cells above the threshold. + * + * @param mat Matrix + * @param threshold Threshold + * @return Number of elements above the threshold. + */ + private int countAboveThreshold(int[][] mat, double threshold) { + int ret = 0; + for(int i = 0; i < mat.length; i++) { + int[] row = mat[i]; + for(int j = 0; j < row.length; j++) { + if(row[j] >= threshold) { + ret++; + } + } + } + return ret; + } + + /** + * Perform a hough transformation on the binary image in "mat". + * + * @param mat Binary image + * @return Hough transformation of image. 
+ */ + private int[][] houghTransformation(boolean[][] mat) { + final int xres = mat.length, yres = mat[0].length; + final double tscale = STEPS * .5 / (xres + yres); + final int[][] ret = new int[STEPS][STEPS]; + + for(int x = 0; x < mat.length; x++) { + for(int y = 0; y < mat[0].length; y++) { + if(mat[x][y]) { + for(int i = 0; i < STEPS; i++) { + final int d = (STEPS >> 1) + (int) (tscale * (x * table.cos(i) + y * table.sin(i))); + if(d > 0 && d < STEPS) { + ret[d][i]++; + } + } + } + } + } + + return ret; + } + + /** + * Draw a line onto the array, using the classic Bresenham algorithm. + * + * @param x0 Start X + * @param y0 Start Y + * @param x1 End X + * @param y1 End Y + * @param pic Picture array + */ + private static void drawLine(int x0, int y0, int x1, int y1, boolean[][] pic) { + final int xres = pic.length, yres = pic[0].length; + // Ensure bounds + y0 = (y0 < 0) ? 0 : (y0 >= yres) ? (yres - 1) : y0; + y1 = (y1 < 0) ? 0 : (y1 >= yres) ? (yres - 1) : y1; + x0 = (x0 < 0) ? 0 : (x0 >= xres) ? (xres - 1) : x0; + x1 = (x1 < 0) ? 0 : (x1 >= xres) ? (xres - 1) : x1; + // Default slope + final int dx = +Math.abs(x1 - x0), sx = x0 < x1 ? 1 : -1; + final int dy = -Math.abs(y1 - y0), sy = y0 < y1 ? 1 : -1; + // Error counter + int err = dx + dy; + + for(;;) { + pic[x0][y0] = true; + if(x0 == x1 && y0 == y1) { + break; + } + + final int e2 = err << 1; + if(e2 > dy) { + err += dy; + x0 += sx; + } + if(e2 < dx) { + err += dx; + y0 += sy; + } + } + } + + /** + * Parameterization class. 
+ * + * @author Erich Schubert + * + * @apiviz.exclude + */ + public static class Parameterizer extends AbstractParameterizer { + @Override + protected HSMDependenceMeasure makeInstance() { + return STATIC; + } + } +} diff --git a/src/de/lmu/ifi/dbs/elki/math/statistics/dependence/HiCSDependenceMeasure.java b/src/de/lmu/ifi/dbs/elki/math/statistics/dependence/HiCSDependenceMeasure.java new file mode 100644 index 00000000..e760ef83 --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/math/statistics/dependence/HiCSDependenceMeasure.java @@ -0,0 +1,243 @@ +package de.lmu.ifi.dbs.elki.math.statistics.dependence; + +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2014 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ */ + +import java.util.Random; + +import de.lmu.ifi.dbs.elki.algorithm.outlier.meta.HiCS; +import de.lmu.ifi.dbs.elki.math.MathUtil; +import de.lmu.ifi.dbs.elki.math.random.RandomFactory; +import de.lmu.ifi.dbs.elki.math.statistics.tests.GoodnessOfFitTest; +import de.lmu.ifi.dbs.elki.math.statistics.tests.KolmogorovSmirnovTest; +import de.lmu.ifi.dbs.elki.utilities.datastructures.arraylike.NumberArrayAdapter; +import de.lmu.ifi.dbs.elki.utilities.datastructures.arrays.IntegerArrayQuickSort; +import de.lmu.ifi.dbs.elki.utilities.datastructures.arrays.IntegerComparator; +import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; +import de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DoubleParameter; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.RandomParameter; + +/** + * Use the statistical tests as used by HiCS to measure dependence of variables. + * + * Reference: + * <p> + * Elke Achtert, Hans-Peter Kriegel, Erich Schubert, Arthur Zimek:<br /> + * Interactive Data Mining with 3D-Parallel-Coordinate-Trees.<br /> + * Proceedings of the 2013 ACM International Conference on Management of Data + * (SIGMOD), New York City, NY, 2013. + * </p> + * + * Based on: + * <p> + * F. Keller, E. Müller, and K. Böhm.<br /> + * HiCS: High Contrast Subspaces for Density-Based Outlier Ranking. <br /> + * In ICDE, pages 1037–1048, 2012. 
+ * </p> + * + * @author Erich Schubert + * @author Robert Rödler + */ +@Reference(authors = "Elke Achtert, Hans-Peter Kriegel, Erich Schubert, Arthur Zimek", // +title = "Interactive Data Mining with 3D-Parallel-Coordinate-Trees", // +booktitle = "Proc. of the 2013 ACM International Conference on Management of Data (SIGMOD)", // +url = "http://dx.doi.org/10.1145/2463676.2463696") +public class HiCSDependenceMeasure extends AbstractDependenceMeasure { + /** + * Monte-Carlo iterations + */ + private int m = 50; + + /** + * Alpha threshold + */ + private double alphasqrt = Math.sqrt(0.1); + + /** + * Statistical test to use + */ + private GoodnessOfFitTest statTest; + + /** + * Random generator + */ + private RandomFactory rnd; + + /** + * Constructor. + * + * @param statTest Test function + * @param m Number of monte-carlo iterations + * @param alpha Alpha threshold + * @param rnd Random source + */ + public HiCSDependenceMeasure(GoodnessOfFitTest statTest, int m, double alpha, RandomFactory rnd) { + super(); + this.statTest = statTest; + this.m = m; + this.alphasqrt = Math.sqrt(alpha); + this.rnd = rnd; + } + + @Override + public <A, B> double dependence(final NumberArrayAdapter<?, A> adapter1, final A data1, final NumberArrayAdapter<?, B> adapter2, final B data2) { + final int len = size(adapter1, data1, adapter2, data2); + final int windowsize = (int) (len * alphasqrt); + final Random random = rnd.getSingleThreadedRandom(); + + // Sorted copies for slicing. 
+ int[] s1 = MathUtil.sequence(0, len), s2 = MathUtil.sequence(0, len); + IntegerArrayQuickSort.sort(s1, new IntegerComparator() { + @Override + public int compare(int x, int y) { + return Double.compare(adapter1.getDouble(data1, x), adapter1.getDouble(data1, y)); + } + }); + IntegerArrayQuickSort.sort(s2, new IntegerComparator() { + @Override + public int compare(int x, int y) { + return Double.compare(adapter2.getDouble(data2, x), adapter2.getDouble(data2, y)); + } + }); + + // Distributions for testing + double[] fullValues = new double[len]; + double[] sampleValues = new double[windowsize]; + double deviationSum = 0.; + + // For the first half, we use the first dimension as reference + for(int i = 0; i < len; i++) { + fullValues[i] = adapter1.getDouble(data1, i); + if(fullValues[i] != fullValues[i]) { + throw new AbortException("NaN values are not allowed by this implementation!"); + } + } + + int half = m >> 1; // TODO: remove bias? + for(int i = 0; i < half; ++i) { + // Build the sample + for(int j = random.nextInt(len - windowsize), k = 0; k < windowsize; ++k, ++j) { + sampleValues[k] = adapter2.getDouble(data2, j); + } + double contrast = statTest.deviation(fullValues, sampleValues); + if(Double.isNaN(contrast)) { + --i; // Retry. + continue; + } + deviationSum += contrast; + } + + // For the second half, we use the second dimension as reference + for(int i = 0; i < len; i++) { + fullValues[i] = adapter2.getDouble(data2, i); + if(fullValues[i] != fullValues[i]) { + throw new AbortException("NaN values are not allowed by this implementation!"); + } + } + + for(int i = half; i < m; ++i) { + // Build the sample + for(int j = random.nextInt(len - windowsize), k = 0; k < windowsize; ++k, ++j) { + sampleValues[k] = adapter1.getDouble(data1, j); + } + double contrast = statTest.deviation(fullValues, sampleValues); + if(Double.isNaN(contrast)) { + --i; // Retry. 
+ continue; + } + deviationSum += contrast; + } + + return deviationSum / m; + } + + /** + * Parameterization class. + * + * @author Erich Schubert + * + * @apiviz.exclude + */ + public static class Parameterizer extends AbstractParameterizer { + /** + * Statistical test to use + */ + private GoodnessOfFitTest statTest; + + /** + * Holds the value of + * {@link de.lmu.ifi.dbs.elki.algorithm.outlier.meta.HiCS.Parameterizer#M_ID} + * . + */ + private int m = 50; + + /** + * Holds the value of + * {@link de.lmu.ifi.dbs.elki.algorithm.outlier.meta.HiCS.Parameterizer#ALPHA_ID} + * . + */ + private double alpha = 0.1; + + /** + * Random generator. + */ + private RandomFactory rnd; + + @Override + protected void makeOptions(Parameterization config) { + super.makeOptions(config); + final IntParameter mP = new IntParameter(HiCS.Parameterizer.M_ID, 50); + mP.addConstraint(CommonConstraints.GREATER_THAN_ONE_INT); + if(config.grab(mP)) { + m = mP.intValue(); + } + + final DoubleParameter alphaP = new DoubleParameter(HiCS.Parameterizer.ALPHA_ID, 0.1); + alphaP.addConstraint(CommonConstraints.GREATER_THAN_ZERO_DOUBLE); + if(config.grab(alphaP)) { + alpha = alphaP.doubleValue(); + } + + final ObjectParameter<GoodnessOfFitTest> testP = new ObjectParameter<>(HiCS.Parameterizer.TEST_ID, GoodnessOfFitTest.class, KolmogorovSmirnovTest.class); + if(config.grab(testP)) { + statTest = testP.instantiateClass(config); + } + + final RandomParameter rndP = new RandomParameter(HiCS.Parameterizer.SEED_ID); + if(config.grab(rndP)) { + rnd = rndP.getValue(); + } + } + + @Override + protected HiCSDependenceMeasure makeInstance() { + return new HiCSDependenceMeasure(statTest, m, alpha, rnd); + } + } +} diff --git a/src/de/lmu/ifi/dbs/elki/math/statistics/dependence/HoeffdingsDDependenceMeasure.java b/src/de/lmu/ifi/dbs/elki/math/statistics/dependence/HoeffdingsDDependenceMeasure.java new file mode 100644 index 00000000..7f1f9fc4 --- /dev/null +++ 
b/src/de/lmu/ifi/dbs/elki/math/statistics/dependence/HoeffdingsDDependenceMeasure.java @@ -0,0 +1,207 @@ +package de.lmu.ifi.dbs.elki.math.statistics.dependence; + +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2014 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +import de.lmu.ifi.dbs.elki.math.MathUtil; +import de.lmu.ifi.dbs.elki.utilities.datastructures.arraylike.NumberArrayAdapter; +import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer; + +/** + * Calculate Hoeffding's D as a measure of dependence. + * + * References: + * <p> + * W. Hoeffding:<br /> + * A non-parametric test of independence<br /> + * The Annals of Mathematical Statistics 19:546–57 + * </p> + * + * The resulting value is scaled by 30, so it is in the range {@code [-.5;1]}. + * + * @author Yinchong Yang + * @author Erich Schubert + */ +@Reference(authors = "W. 
Hoeffding", // +title = "A non-parametric test of independence", // +booktitle = "The Annals of Mathematical Statistics 19", // +url = "http://www.jstor.org/stable/2236021") +public class HoeffdingsDDependenceMeasure extends AbstractDependenceMeasure { + /** + * Static instance. + */ + public static final HoeffdingsDDependenceMeasure STATIC = new HoeffdingsDDependenceMeasure(); + + /** + * Constructor - use {@link #STATIC} instance. + */ + protected HoeffdingsDDependenceMeasure() { + super(); + } + + @Override + public <A, B> double dependence(NumberArrayAdapter<?, A> adapter1, A data1, NumberArrayAdapter<?, B> adapter2, B data2) { + final int n = size(adapter1, data1, adapter2, data2); + assert (n > 4) : "Hoeffdings D needs at least 5 elements!"; + if(n <= 4) { + return Double.NaN; + } + double[] r = ranks(adapter1, data1, n); + double[] s = ranks(adapter2, data2, n); + // TODO: is it possible to exploit sorting to accelerate computing q? + double[] q = computeBivariateRanks(adapter1, data1, adapter2, data2, n); + + double d1 = 0, d2 = 0, d3 = 0; + for(int i = 0; i < n; i++) { + d1 += q[i] * (q[i] - 1); // Note: our q is 0-indexed. + d2 += (r[i] - 1) * (r[i] - 2) * (s[i] - 1) * (s[i] - 2); + d3 += (r[i] - 2) * (s[i] - 2) * q[i]; + } + + // Factor n-2 was moved for better numerical behavior. + double nom = (n - 3.) * d1 + d2 / (n - 2) - 2. * d3; + double div = n * (n - 1.) * (n - 3.) * (n - 4.); + double d = 30 * nom / div; + return d < 1. ? d : 1.; + } + + /** + * Compute bivariate ranks. + * + * q[i] is the number of objects such that x[j] < x[i] and y[j] < y[i] + * + * @param adapter1 First adapter + * @param data1 First data set + * @param adapter2 Second adapter + * @param data2 Second data set + * @param len Length + * @return Bivariate rank statistics. 
+ */ + protected static <A, B> double[] computeBivariateRanks(NumberArrayAdapter<?, A> adapter1, A data1, NumberArrayAdapter<?, B> adapter2, B data2, int len) { + double[] ret = new double[len]; + for(int i = 0; i < len; i++) { + for(int j = i + 1; j < len; j++) { + double xi = adapter1.getDouble(data1, i), xj = adapter1.getDouble(data1, j); + double yi = adapter2.getDouble(data2, i), yj = adapter2.getDouble(data2, j); + if(xi < xj) { + ret[j] += (yi < yj) ? 1 : (yi == yj) ? .5 : 0; + } + else if(xj < xi) { + ret[i] += (yj < yi) ? 1 : (yj == yi) ? .5 : 0; + } + else { // tied on x + if(yi < yj) { + ret[j] += .5; + } + else if(yj < yi) { + ret[i] += .5; + } + else { // Double tied + ret[i] += .25; + ret[j] += .25; + } + } + } + } + return ret; + } + + // Tabular approximation + private final static double[] TABVAL = { // + 0.5297, 0.4918, 0.4565, 0.4236, 0.393, // + 0.3648, 0.3387, 0.3146, 0.2924, 0.2719, // 10 + 0.253, 0.2355, 0.2194, 0.2045, 0.1908, // + 0.1781, 0.1663, 0.1554, 0.1453, 0.1359, // 20 + 0.1273, 0.1192, 0.1117, 0.1047, 0.0982, // + 0.0921, 0.0864, 0.0812, 0.0762, 0.0716, // 30 + 0.0673, 0.0633, 0.0595, 0.056, 0.0527, // + 0.0496, 0.0467, 0.044, 0.0414, 0.039, // 40 + 0.0368, 0.0347, 0.0327, 0.0308, 0.0291, // + 0.0274, 0.0259, 0.0244, 0.023, 0.0217, // 50 + 0.0205, 0.0194, 0.0183, 0.0173, 0.0163, // + 0.0154, 0.0145, 0.0137, 0.013, 0.0123, // 60 + 0.0116, 0.011, 0.0104, 0.0098, 0.0093, // + 0.0087, 0.0083, 0.0078, 0.0074, 0.007, // 70 + 0.0066, 0.0063, 0.0059, 0.0056, 0.0053, // + 0.005, 0.0047, 0.0045, 0.0042, 0.0025, // 80 + 0.0014, 0.0008, 0.0005, 0.0003, 0.0002, 0.0001 }; + + // Table positions + private final static double[] TABPOS = new double[] { // + 1.10, 1.15, 1.20, 1.25, 1.30, 1.35, 1.40, 1.45, 1.50, 1.55, // + 1.60, 1.65, 1.70, 1.75, 1.80, 1.85, 1.90, 1.95, 2.00, 2.05, // + 2.10, 2.15, 2.20, 2.25, 2.30, 2.35, 2.40, 2.45, 2.50, 2.55, // + 2.60, 2.65, 2.70, 2.75, 2.80, 2.85, 2.90, 2.95, 3.00, 3.05, // + 3.10, 3.15, 3.20, 3.25, 3.30, 3.35, 
3.40, 3.45, 3.50, 3.55, // + 3.60, 3.65, 3.70, 3.75, 3.80, 3.85, 3.90, 3.95, 4.00, 4.05, // + 4.10, 4.15, 4.20, 4.25, 4.30, 4.35, 4.40, 4.45, 4.50, 4.55, // + 4.60, 4.65, 4.70, 4.75, 4.80, 4.85, 4.90, 4.95, 5.00, // + 5.50, 6.00, 6.50, 7.00, 7.50, 8.00, 8.50 }; + + /** + * Convert Hoeffding D value to a p-value. + * + * @param d D value + * @param n Data set size + * @return p-value + */ + public double toPValue(double d, int n) { + double b = d / 30 + 1. / (36 * n); + double z = .5 * MathUtil.PISQUARE * MathUtil.PISQUARE * n * b; + + // Exponential approximation + if(z < 1.1 || z > 8.5) { + double e = Math.exp(0.3885037 - 1.164879 * z); + return (e > 1) ? 1 : (e < 0) ? 0 : e; + } + // Tabular approximation + for(int i = 0; i < 86; i++) { + if(TABPOS[i] >= z) { + // Exact table value + if(TABPOS[i] == z) { + return TABVAL[i]; + } + // Linear interpolation + double x1 = TABPOS[i], x0 = TABPOS[i - 1]; + double y1 = TABVAL[i], y0 = TABVAL[i - 1]; + return y0 + (y1 - y0) * (z - x0) / (x1 - x0); + } + } + return -1; + } + + /** + * Parameterization class. 
+ * + * @author Erich Schubert + * + * @apiviz.exclude + */ + public static class Parameterizer extends AbstractParameterizer { + @Override + protected HoeffdingsDDependenceMeasure makeInstance() { + return STATIC; + } + } +} diff --git a/src/de/lmu/ifi/dbs/elki/math/statistics/dependence/JensenShannonEquiwidthDependenceMeasure.java b/src/de/lmu/ifi/dbs/elki/math/statistics/dependence/JensenShannonEquiwidthDependenceMeasure.java new file mode 100644 index 00000000..7b0000a3 --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/math/statistics/dependence/JensenShannonEquiwidthDependenceMeasure.java @@ -0,0 +1,143 @@ +package de.lmu.ifi.dbs.elki.math.statistics.dependence; + +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2014 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +import de.lmu.ifi.dbs.elki.math.MathUtil; +import de.lmu.ifi.dbs.elki.utilities.datastructures.arraylike.NumberArrayAdapter; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer; + +/** + * Jensen-Shannon Divergence is closely related to mutual information. + * + * The output value is normalized, such that an evenly distributed and identical + * distribution will yield a value of 1. 
Independent distributions may still + * yield values close to .25, though. + * + * TODO: Offer normalized and non-normalized variants? + * + * @author Erich Schubert + */ +public class JensenShannonEquiwidthDependenceMeasure extends AbstractDependenceMeasure { + /** + * Static instance. + */ + public static final JensenShannonEquiwidthDependenceMeasure STATIC = new JensenShannonEquiwidthDependenceMeasure(); + + /** + * Constructor - use {@link #STATIC} instance. + */ + protected JensenShannonEquiwidthDependenceMeasure() { + super(); + } + + @Override + public <A, B> double dependence(NumberArrayAdapter<?, A> adapter1, A data1, NumberArrayAdapter<?, B> adapter2, B data2) { + final int len = size(adapter1, data1, adapter2, data2); + final int bins = (int) Math.round(Math.sqrt(len)); + final int maxbin = bins - 1; + + double min1 = adapter1.getDouble(data1, 0), max1 = min1; + double min2 = adapter2.getDouble(data2, 0), max2 = min2; + for(int i = 1; i < len; i++) { + final double vx = adapter1.getDouble(data1, i); + final double vy = adapter2.getDouble(data2, i); + if(vx < min1) { + min1 = vx; + } + else if(vx > max1) { + max1 = vx; + } + if(vy < min2) { + min2 = vy; + } + else if(vy > max2) { + max2 = vy; + } + } + final double scale1 = (max1 > min1) ? bins / (max1 - min1) : 1; + final double scale2 = (max2 > min2) ? bins / (max2 - min2) : 1; + + int[] margin1 = new int[bins], margin2 = new int[bins]; + int[][] counts = new int[bins][bins]; + for(int i = 0; i < len; i++) { + int bin1 = (int) Math.floor((adapter1.getDouble(data1, i) - min1) * scale1); + int bin2 = (int) Math.floor((adapter2.getDouble(data2, i) - min2) * scale2); + bin1 = bin1 < bins ? bin1 : maxbin; + bin2 = bin2 < bins ? bin2 : maxbin; + margin1[bin1]++; + margin2[bin2]++; + counts[bin1][bin2]++; + } + + // calculating relative frequencies + double e = 0; + for(int bin1 = 0; bin1 < counts.length; bin1++) { + // Margin value for row i. + final int sum1 = margin1[bin1]; + // Skip empty rows early. 
+        if(sum1 == 0) {
+          continue;
+        }
+        // pX: marginal (relative) frequency of row bin1.
+        final double pX = sum1 / (double) len;
+        final int[] row = counts[bin1];
+        for(int bin2 = 0; bin2 < row.length; bin2++) {
+          final int sum2 = margin2[bin2];
+          if(sum2 > 0) {
+            final int cell = row[bin2];
+            // JS divergence of pXY and (pX * pY)
+            double pXY = cell / (double) len;
+            final double pXpY = pX * sum2 / len;
+            final double iavg = 2. / (pXY + pXpY);
+            e += pXY > 0. ? pXY * Math.log(pXY * iavg) : 0.;
+            e += pXpY * Math.log(pXpY * iavg);
+          }
+        }
+      }
+      // Expected value for evenly distributed and identical:
+      // pX = pY = 1/b and thus pXpY = 1/b^2.
+      // for i==j, pXY=1/b and thus iavg = 2*b*b/(b+1)
+      // otherwise, pXY=0 and thus iavg = 2*b*b
+      // pXY: e += log(b*2/(b+1)) = log(b) + log(2/(b+1))
+      // pXpY1: e += 1/b*log(2/(b+1)) = 1/b*log(2/(b+1))
+      // pXpY2: e += (b-1)/b*log(2) = (b-1)/b*log(2)
+      final double exp = Math.log(bins) + (1. + 1. / bins) * Math.log(2. / (bins + 1)) + (bins - 1.) / bins * MathUtil.LOG2;
+      // e *= .5; // Average, but we need to adjust exp then, too!
+      return e / exp;
+    }
+
+    /**
+     * Parameterization class.
+ * + * @author Erich Schubert + * + * @apiviz.exclude + */ + public static class Parameterizer extends AbstractParameterizer { + @Override + protected JensenShannonEquiwidthDependenceMeasure makeInstance() { + return STATIC; + } + } +} diff --git a/src/de/lmu/ifi/dbs/elki/math/statistics/dependence/MCEDependenceMeasure.java b/src/de/lmu/ifi/dbs/elki/math/statistics/dependence/MCEDependenceMeasure.java new file mode 100644 index 00000000..899aff9a --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/math/statistics/dependence/MCEDependenceMeasure.java @@ -0,0 +1,274 @@ +package de.lmu.ifi.dbs.elki.math.statistics.dependence; + +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2014 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ */ +import java.util.ArrayList; +import java.util.Arrays; + +import de.lmu.ifi.dbs.elki.math.MathUtil; +import de.lmu.ifi.dbs.elki.utilities.datastructures.arraylike.NumberArrayAdapter; +import de.lmu.ifi.dbs.elki.utilities.datastructures.arrays.IntegerArrayQuickSort; +import de.lmu.ifi.dbs.elki.utilities.datastructures.arrays.IntegerComparator; +import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer; + +/** + * Compute a mutual information based dependence measure using a nested means + * discretization, originally proposed for ordering axes in parallel coordinate + * plots. + * + * Reference: + * <p> + * D. Guo<br /> + * Coordinating computational and visual approaches for interactive feature + * selection and multivariate clustering<br /> + * Information Visualization, 2(4), 2003. + * </p> + * + * @author Erich Schubert + */ +@Reference(authors = "D. Guo", // +title = "Coordinating computational and visual approaches for interactive feature selection and multivariate clustering", // +booktitle = "Information Visualization, 2(4)", // +url = "http://dx.doi.org/10.1057/palgrave.ivs.9500053") +public class MCEDependenceMeasure extends AbstractDependenceMeasure { + /** + * Static instance. + */ + public static final MCEDependenceMeasure STATIC = new MCEDependenceMeasure(); + + /** + * Desired size: 35 observations. + * + * While this could trivially be made parameterizable, it is a reasonable rule + * of thumb and not expected to have a major effect. + */ + public static final int TARGET = 35; + + @Override + public <A, B> double dependence(NumberArrayAdapter<?, A> adapter1, A data1, NumberArrayAdapter<?, B> adapter2, B data2) { + final int len = size(adapter1, data1, adapter2, data2); + // Find a number of bins as recommended by Cheng et al. + double p = MathUtil.log2(len / (double) TARGET); + // As we are in 2d, take the root (*.5) But let's use at least 1, too. 
+ // Check: for 10000 this should give 4, for 150 it gives 1. + int power = Math.max(1, (int) Math.floor(p * .5)); + int gridsize = 1 << power; + double loggrid = Math.log((double) gridsize); + + ArrayList<int[]> parts1 = buildPartitions(adapter1, data1, len, power); + ArrayList<int[]> parts2 = buildPartitions(adapter2, data2, len, power); + + int[][] res = new int[gridsize][gridsize]; + intersectionMatrix(res, parts1, parts2, gridsize); + return 1. - getMCEntropy(res, parts1, parts2, len, gridsize, loggrid); + } + + /** + * Partitions an attribute. + * + * @param adapter1 Data adapter + * @param data1 Data set + * @param len Length of data + * @param depth Splitting depth + * @return List of sorted objects + */ + private <A> ArrayList<int[]> buildPartitions(NumberArrayAdapter<?, A> adapter1, A data1, int len, int depth) { + final int[] idx = new int[len]; + final double[] tmp = new double[len]; + for(int i = 0; i < len; ++i) { + idx[i] = i; + tmp[i] = adapter1.getDouble(data1, i); + } + // Sort indexes: + IntegerArrayQuickSort.sort(idx, new IntegerComparator() { + @Override + public int compare(int x, int y) { + return Double.compare(tmp[x], tmp[y]); + } + }); + Arrays.sort(tmp); // Should yield the same ordering + + ArrayList<int[]> ret = new ArrayList<>(1 << depth); + divide(idx, tmp, ret, 0, tmp.length, depth); + return ret; + } + + /** + * Recursive call to further subdivide the array. + * + * @param idx Object indexes. + * @param data 1D data, sorted + * @param ret Output index + * @param start Interval start + * @param end Interval end + * @param depth Depth + */ + private void divide(int[] idx, double[] data, ArrayList<int[]> ret, int start, int end, int depth) { + if(depth == 0) { + int[] a = Arrays.copyOfRange(idx, start, end); + Arrays.sort(a); + ret.add(a); + return; + } + final int count = end - start; + if(count == 0) { + // Corner case, that should barely happen. But for ties, we currently + // Do not yet assure that it doesn't happen! 
+ for(int j = 1 << depth; j > 0; --j) { + ret.add(new int[0]); + } + return; + } + double m = 0.; + for(int i = start; i < end; i++) { + m += data[i]; + } + m /= count; + int pos = Arrays.binarySearch(data, start, end, m); + if(pos >= 0) { + // Ties: try to choose the most central element. + final int opt = (start + end) >> 1; + while(data[pos] == m) { + if(pos < opt) { + pos++; + } + else if(pos > opt) { + pos--; + } + else { + break; + } + } + } + else { + pos = (-pos - 1); + } + divide(idx, data, ret, start, pos, depth - 1); + divide(idx, data, ret, pos, end, depth - 1); + } + + /** + * Intersect the two 1d grid decompositions, to obtain a 2d matrix. + * + * @param res Output matrix to fill + * @param partsx Partitions in first component + * @param partsy Partitions in second component. + * @param gridsize Size of partition decomposition + */ + private void intersectionMatrix(int[][] res, ArrayList<int[]> partsx, ArrayList<int[]> partsy, int gridsize) { + for(int x = 0; x < gridsize; x++) { + final int[] px = partsx.get(x); + final int[] rowx = res[x]; + for(int y = 0; y < gridsize; y++) { + int[] py = partsy.get(y); + rowx[y] = intersectionSize(px, py); + } + } + } + + /** + * Compute the intersection of two sorted integer lists. + * + * @param px First list + * @param py Second list + * @return Intersection size. + */ + private int intersectionSize(int[] px, int[] py) { + int i = 0, j = 0, c = 0; + while(i < px.length && j < py.length) { + final int vx = px[i], vy = py[i]; + if(vx < vy) { + ++i; + } + else if(vx > vy) { + ++j; + } + else { + ++c; + ++i; + ++j; + } + } + return c; + } + + /** + * Compute the MCE entropy value. + * + * @param mat Partition size matrix + * @param partsx Partitions on X + * @param partsy Partitions on Y + * @param size Data set size + * @param gridsize Size of grids + * @param loggrid Logarithm of grid sizes, for normalization + * @return MCE score. 
+ */ + private double getMCEntropy(int[][] mat, ArrayList<int[]> partsx, ArrayList<int[]> partsy, int size, int gridsize, double loggrid) { + // Margin entropies: + double[] mx = new double[gridsize]; + double[] my = new double[gridsize]; + + for(int i = 0; i < gridsize; i++) { + // Note: indexes are a bit tricky here, because we compute both margin + // entropies at the same time! + final double sumx = (double) partsx.get(i).length; + final double sumy = (double) partsy.get(i).length; + for(int j = 0; j < gridsize; j++) { + double px = mat[i][j] / sumx; + double py = mat[j][i] / sumy; + + if(px > 0.) { + mx[i] -= px * Math.log(px); + } + if(py > 0.) { + my[i] -= py * Math.log(py); + } + } + } + + // Weighted sums of margin entropies. + double sumx = 0., sumy = 0.; + for(int i = 0; i < gridsize; i++) { + sumx += mx[i] * partsx.get(i).length; + sumy += my[i] * partsy.get(i).length; + } + + double max = ((sumx > sumy) ? sumx : sumy); + return max / (size * loggrid); + } + + /** + * Parameterization class. 
+ * + * @author Erich Schubert + * + * @apiviz.exclude + */ + public static class Parameterizer extends AbstractParameterizer { + @Override + protected MCEDependenceMeasure makeInstance() { + return STATIC; + } + } +} diff --git a/src/de/lmu/ifi/dbs/elki/math/statistics/dependence/MutualInformationEquiwidthDependenceMeasure.java b/src/de/lmu/ifi/dbs/elki/math/statistics/dependence/MutualInformationEquiwidthDependenceMeasure.java new file mode 100644 index 00000000..26a1759a --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/math/statistics/dependence/MutualInformationEquiwidthDependenceMeasure.java @@ -0,0 +1,137 @@ +package de.lmu.ifi.dbs.elki.math.statistics.dependence; + +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2014 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +import de.lmu.ifi.dbs.elki.utilities.datastructures.arraylike.NumberArrayAdapter; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer; + +/** + * Mutual Information (MI) dependence measure by dividing each attribute into + * equal-width bins. MI can be seen as Kullback–Leibler divergence of the joint + * distribution and the product of the marginal distributions. 
+ * + * For normalization, the resulting values are scaled by {@code mi/log(nbins)}. + * This both cancels out the logarithm base, and normalizes for the number of + * bins (a uniform distribution will yield a MI with itself of 1). + * + * TODO: Offer normalized and non-normalized variants? + * + * For a median-based discretization, see {@link MCEDependenceMeasure}. + * + * @author Erich Schubert + */ +public class MutualInformationEquiwidthDependenceMeasure extends AbstractDependenceMeasure { + /** + * Static instance. + */ + public static final MutualInformationEquiwidthDependenceMeasure STATIC = new MutualInformationEquiwidthDependenceMeasure(); + + /** + * Constructor - use {@link #STATIC} instance. + */ + protected MutualInformationEquiwidthDependenceMeasure() { + super(); + } + + @Override + public <A, B> double dependence(NumberArrayAdapter<?, A> adapter1, A data1, NumberArrayAdapter<?, B> adapter2, B data2) { + final int len = size(adapter1, data1, adapter2, data2); + final int bins = (int) Math.round(Math.sqrt(len)); + final int maxbin = bins - 1; + + double min1 = adapter1.getDouble(data1, 0), max1 = min1; + double min2 = adapter2.getDouble(data2, 0), max2 = min2; + for(int i = 1; i < len; i++) { + final double vx = adapter1.getDouble(data1, i); + final double vy = adapter2.getDouble(data2, i); + if(vx < min1) { + min1 = vx; + } + else if(vx > max1) { + max1 = vx; + } + if(vy < min2) { + min2 = vy; + } + else if(vy > max2) { + max2 = vy; + } + } + final double scale1 = (max1 > min1) ? bins / (max1 - min1) : 1; + final double scale2 = (max2 > min2) ? bins / (max2 - min2) : 1; + + int[] margin1 = new int[bins], margin2 = new int[bins]; + int[][] counts = new int[bins][bins]; + for(int i = 0; i < len; i++) { + int bin1 = (int) Math.floor((adapter1.getDouble(data1, i) - min1) * scale1); + int bin2 = (int) Math.floor((adapter2.getDouble(data2, i) - min2) * scale2); + bin1 = bin1 < bins ? bin1 : maxbin; + bin2 = bin2 < bins ? 
bin2 : maxbin; + margin1[bin1]++; + margin2[bin2]++; + counts[bin1][bin2]++; + } + + // calculating relative frequencies + double e = 0; + for(int bin1 = 0; bin1 < counts.length; bin1++) { + // Margin value for row i. + final int sum1 = margin1[bin1]; + // Skip empty rows early. + if(sum1 == 0) { + continue; + } + // Inverse pX + final double ipX = len / (double) sum1; + final int[] row = counts[bin1]; + for(int bin2 = 0; bin2 < row.length; bin2++) { + final int cell = row[bin2]; + // Skip empty cells. + if(cell != 0) { + // Mutual information pXY / (pX * pY) + double pXY = cell / (double) len; + // Inverse pXpY: 1 / (pX*pY) + final double ipXpY = ipX * len / margin2[bin2]; + e += pXY * Math.log(pXY * ipXpY); + } + } + } + // Expected value for uniform identical: log(bins)! + return e / Math.log(bins); + } + + /** + * Parameterization class. + * + * @author Erich Schubert + * + * @apiviz.exclude + */ + public static class Parameterizer extends AbstractParameterizer { + @Override + protected MutualInformationEquiwidthDependenceMeasure makeInstance() { + return STATIC; + } + } +} diff --git a/src/de/lmu/ifi/dbs/elki/math/statistics/dependence/SURFINGDependenceMeasure.java b/src/de/lmu/ifi/dbs/elki/math/statistics/dependence/SURFINGDependenceMeasure.java new file mode 100644 index 00000000..ce866c16 --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/math/statistics/dependence/SURFINGDependenceMeasure.java @@ -0,0 +1,133 @@ +package de.lmu.ifi.dbs.elki.math.statistics.dependence;
+
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2014
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+import de.lmu.ifi.dbs.elki.utilities.datastructures.arraylike.NumberArrayAdapter;
+import de.lmu.ifi.dbs.elki.utilities.datastructures.heap.DoubleMaxHeap;
+import de.lmu.ifi.dbs.elki.utilities.datastructures.heap.DoubleMinHeap;
+import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
+
+/**
+ * Compute the similarity of dimensions using the SURFING score. The parameter k
+ * for the k nearest neighbors is currently hard-coded to 10% of the set size.
+ *
+ * Note that the complexity is roughly O(n n k), so this is a rather slow
+ * method, and with k at 10% of n, is actually cubic: O(0.1 * n^3).
+ *
+ * This version cannot use index support, as the API operates without database
+ * attachment. However, it should be possible to implement some trivial
+ * sorted-list indexes to get a reasonable speedup!
+ *
+ * Reference:
+ * <p>
+ * Elke Achtert, Hans-Peter Kriegel, Erich Schubert, Arthur Zimek:<br />
+ * Interactive Data Mining with 3D-Parallel-Coordinate-Trees.<br />
+ * Proceedings of the 2013 ACM International Conference on Management of Data
+ * (SIGMOD), New York City, NY, 2013.
+ * </p>
+ *
+ * Based on:
+ * <p>
+ * Christian Baumgartner, Claudia Plant, Karin Kailing, Hans-Peter Kriegel, and
+ * Peer Kröger<br />
+ * Subspace Selection for Clustering High-Dimensional Data<br />
+ * In IEEE International Conference on Data Mining, 2004.
+ * </p>
+ *
+ * TODO: make the subspace distance function and k parameterizable.
+ *
+ * @author Robert Rödler
+ * @author Erich Schubert
+ *
+ * @apiviz.uses SubspaceEuclideanDistanceFunction
+ */
+@Reference(authors = "Elke Achtert, Hans-Peter Kriegel, Erich Schubert, Arthur Zimek", //
+title = "Interactive Data Mining with 3D-Parallel-Coordinate-Trees", //
+booktitle = "Proc. of the 2013 ACM International Conference on Management of Data (SIGMOD)", //
+url = "http://dx.doi.org/10.1145/2463676.2463696")
+public class SURFINGDependenceMeasure extends AbstractDependenceMeasure {
+  /**
+   * Static instance.
+   */
+  public static final SURFINGDependenceMeasure STATIC = new SURFINGDependenceMeasure();
+
+  /**
+   * Constructor. Use static instance instead!
+   */
+  protected SURFINGDependenceMeasure() {
+    super();
+  }
+
+  /**
+   * Compute the SURFING score of the 2d subspace spanned by the two given
+   * attributes: the (normalized) mean deviation of the k-nearest-neighbor
+   * distances, which is large when the data has uneven density (structure).
+   *
+   * @param adapter1 Adapter for the first attribute
+   * @param data1 First attribute data
+   * @param adapter2 Adapter for the second attribute
+   * @param data2 Second attribute data
+   * @return SURFING score; larger indicates stronger dependence
+   */
+  @Reference(authors = "Christian Baumgartner, Claudia Plant, Karin Kailing, Hans-Peter Kriegel, and Peer Kröger", //
+  title = "Subspace Selection for Clustering High-Dimensional Data", //
+  booktitle = "IEEE International Conference on Data Mining, 2004", //
+  url = "http://dx.doi.org/10.1109/ICDM.2004.10112")
+  @Override
+  public <A, B> double dependence(NumberArrayAdapter<?, A> adapter1, A data1, NumberArrayAdapter<?, B> adapter2, B data2) {
+    final int len = size(adapter1, data1, adapter2, data2);
+    final int k = Math.max(1, len / 10); // k hard-coded to 10% of data size.
+
+    double[] knns = new double[len];
+
+    // Bounded MAX-heap retaining the k smallest squared distances; its top
+    // then is the k-nearest-neighbor distance. (An unbounded min-heap peek
+    // would return the overall minimum instead, degenerating the score.)
+    DoubleMaxHeap heap = new DoubleMaxHeap(k);
+    double kdistmean = 0.;
+    for(int i = 0; i < len; ++i) {
+      double ix = adapter1.getDouble(data1, i), iy = adapter2.getDouble(data2, i);
+      heap.clear();
+      for(int j = 0; j < len; ++j) {
+        if(j == i) {
+          continue; // Do not count a point as its own neighbor (distance 0).
+        }
+        double jx = adapter1.getDouble(data1, j), jy = adapter2.getDouble(data2, j);
+        double dx = ix - jx, dy = iy - jy;
+        // Squared Euclidean; the bounded add keeps only the k smallest.
+        heap.add(dx * dx + dy * dy, k);
+      }
+      double kdist = Math.sqrt(heap.peek()); // Euclidean kNN distance.
+      knns[i] = kdist;
+      kdistmean += kdist;
+    }
+    kdistmean /= len;
+    // Mean absolute deviation from the mean kNN distance, normalized by the
+    // number of below-mean points (as in the SURFING quality criterion):
+    double diff = 0.;
+    int below = 0;
+    for(int l = 0; l < knns.length; l++) {
+      diff += Math.abs(kdistmean - knns[l]);
+      if(knns[l] < kdistmean) {
+        below++;
+      }
+    }
+    // If no point is below the mean (e.g. all kNN distances equal), there is
+    // no structure to report.
+    return (below > 0) ? diff / (2. * kdistmean * below) : 0;
+  }
+
+  /**
+   * Parameterization class.
+   *
+   * @author Erich Schubert
+   *
+   * @apiviz.exclude
+   */
+  public static class Parameterizer extends AbstractParameterizer {
+    @Override
+    protected SURFINGDependenceMeasure makeInstance() {
+      return STATIC;
+    }
+  }
+}
diff --git a/src/de/lmu/ifi/dbs/elki/math/statistics/dependence/SlopeDependenceMeasure.java b/src/de/lmu/ifi/dbs/elki/math/statistics/dependence/SlopeDependenceMeasure.java new file mode 100644 index 00000000..3dd4fcaa --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/math/statistics/dependence/SlopeDependenceMeasure.java @@ -0,0 +1,153 @@ +package de.lmu.ifi.dbs.elki.math.statistics.dependence;
+
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2014
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+import de.lmu.ifi.dbs.elki.utilities.datastructures.arraylike.NumberArrayAdapter;
+import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
+
+/**
+ * Arrange dimensions based on the entropy of the slope spectrum.
+ *
+ * Reference:
+ * <p>
+ * Elke Achtert, Hans-Peter Kriegel, Erich Schubert, Arthur Zimek:<br />
+ * Interactive Data Mining with 3D-Parallel-Coordinate-Trees.<br />
+ * Proceedings of the 2013 ACM International Conference on Management of Data
+ * (SIGMOD), New York City, NY, 2013.
+ * </p>
+ *
+ * TODO: shouldn't this be normalized by the single-dimension entropies or so?
+ *
+ * @author Erich Schubert
+ * @author Robert Rödler
+ */
+@Reference(authors = "Elke Achtert, Hans-Peter Kriegel, Erich Schubert, Arthur Zimek", //
+title = "Interactive Data Mining with 3D-Parallel-Coordinate-Trees", //
+booktitle = "Proc. of the 2013 ACM International Conference on Management of Data (SIGMOD)", //
+url = "http://dx.doi.org/10.1145/2463676.2463696")
+public class SlopeDependenceMeasure extends AbstractDependenceMeasure {
+  /**
+   * Static instance.
+   */
+  public static final SlopeDependenceMeasure STATIC = new SlopeDependenceMeasure();
+
+  /**
+   * Number of bins in the slope histogram.
+   */
+  protected final static int PRECISION = 40;
+
+  /**
+   * Precision for entropy normalization.
+   */
+  protected final static double LOG_PRECISION = Math.log(PRECISION);
+
+  /**
+   * Scaling factor.
+   */
+  protected final static double RESCALE = PRECISION * .5;
+
+  /**
+   * Constructor. Use static instance instead!
+   */
+  protected SlopeDependenceMeasure() {
+    super();
+  }
+
+  @Override
+  public <A, B> double dependence(NumberArrayAdapter<?, A> adapter1, A data1, NumberArrayAdapter<?, B> adapter2, B data2) {
+    final int len = size(adapter1, data1, adapter2, data2);
+
+    // Determine the value range of each attribute, to rescale into [0;1].
+    double min1 = adapter1.getDouble(data1, 0), max1 = min1;
+    for(int i = 1; i < len; ++i) {
+      final double v = adapter1.getDouble(data1, i);
+      min1 = (v < min1) ? v : min1;
+      max1 = (v > max1) ? v : max1;
+    }
+    double min2 = adapter2.getDouble(data2, 0), max2 = min2;
+    for(int i = 1; i < len; ++i) {
+      final double v = adapter2.getDouble(data2, i);
+      min2 = (v < min2) ? v : min2;
+      max2 = (v > max2) ? v : max2;
+    }
+    // Degenerate (constant) attributes keep scale 1 to avoid division by 0.
+    final double off1 = min1, scale1 = (max1 > min1) ? 1. / (max1 - min1) : 1.;
+    final double off2 = min2, scale2 = (max2 > min2) ? 1. / (max2 - min2) : 1.;
+
+    // Histogram of the discretized slope spectrum:
+    final int[] histogram = new int[PRECISION];
+    for(int i = 0; i < len; i++) {
+      final double x = (adapter1.getDouble(data1, i) - off1) * scale1;
+      final double y = (adapter2.getDouble(data2, i) - off2) * scale2;
+      // The slope proxy x - y lies in [-1;1]; shift by +1 into [0;2] and
+      // discretize into PRECISION bins (clamped for rounding edge cases).
+      int bucket = (int) Math.round((x - y + 1) * RESCALE);
+      bucket = (bucket < 0) ? 0 : (bucket >= PRECISION) ? PRECISION - 1 : bucket;
+      ++histogram[bucket];
+    }
+
+    // Shannon entropy (negative) of the spectrum, normalized to [0;1]:
+    double entropy = 0.;
+    for(int b = 0; b < PRECISION; b++) {
+      if(histogram[b] > 0) {
+        final double p = histogram[b] / (double) len;
+        entropy += p * Math.log(p);
+      }
+    }
+    return 1 + entropy / LOG_PRECISION;
+  }
+
+  /**
+   * Parameterization class.
+   *
+   * @author Erich Schubert
+   *
+   * @apiviz.exclude
+   */
+  public static class Parameterizer extends AbstractParameterizer {
+    @Override
+    protected SlopeDependenceMeasure makeInstance() {
+      return STATIC;
+    }
+  }
+}
diff --git a/src/de/lmu/ifi/dbs/elki/math/statistics/dependence/SlopeInversionDependenceMeasure.java b/src/de/lmu/ifi/dbs/elki/math/statistics/dependence/SlopeInversionDependenceMeasure.java new file mode 100644 index 00000000..01ad78bc --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/math/statistics/dependence/SlopeInversionDependenceMeasure.java @@ -0,0 +1,157 @@ +package de.lmu.ifi.dbs.elki.math.statistics.dependence;
+
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2014
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+import de.lmu.ifi.dbs.elki.utilities.datastructures.arraylike.NumberArrayAdapter;
+import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
+
+/**
+ * Arrange dimensions based on the entropy of the slope spectrum.
+ *
+ * Reference:
+ * <p>
+ * Elke Achtert, Hans-Peter Kriegel, Erich Schubert, Arthur Zimek:<br />
+ * Interactive Data Mining with 3D-Parallel-Coordinate-Trees.<br />
+ * Proceedings of the 2013 ACM International Conference on Management of Data
+ * (SIGMOD), New York City, NY, 2013.
+ * </p>
+ *
+ * TODO: shouldn't this be normalized by the single-dimension entropies or so?
+ *
+ * @author Erich Schubert
+ * @author Robert Rödler
+ */
+@Reference(authors = "Elke Achtert, Hans-Peter Kriegel, Erich Schubert, Arthur Zimek", //
+title = "Interactive Data Mining with 3D-Parallel-Coordinate-Trees", //
+booktitle = "Proc. of the 2013 ACM International Conference on Management of Data (SIGMOD)", //
+url = "http://dx.doi.org/10.1145/2463676.2463696")
+public class SlopeInversionDependenceMeasure extends SlopeDependenceMeasure {
+  /**
+   * Static instance.
+   */
+  public static final SlopeInversionDependenceMeasure STATIC = new SlopeInversionDependenceMeasure();
+
+  /**
+   * Constructor. Use static instance instead!
+   */
+  protected SlopeInversionDependenceMeasure() {
+    super();
+  }
+
+  @Override
+  public <A, B> double dependence(NumberArrayAdapter<?, A> adapter1, A data1, NumberArrayAdapter<?, B> adapter2, B data2) {
+    final int len = size(adapter1, data1, adapter2, data2);
+
+    // Determine the value range of each attribute, to rescale into [0;1].
+    double min1 = adapter1.getDouble(data1, 0), max1 = min1;
+    for(int i = 1; i < len; ++i) {
+      final double v = adapter1.getDouble(data1, i);
+      min1 = (v < min1) ? v : min1;
+      max1 = (v > max1) ? v : max1;
+    }
+    double min2 = adapter2.getDouble(data2, 0), max2 = min2;
+    for(int i = 1; i < len; ++i) {
+      final double v = adapter2.getDouble(data2, i);
+      min2 = (v < min2) ? v : min2;
+      max2 = (v > max2) ? v : max2;
+    }
+    // Degenerate (constant) attributes keep scale 1 to avoid division by 0.
+    final double off1 = min1, scale1 = (max1 > min1) ? 1. / (max1 - min1) : 1.;
+    final double off2 = min2, scale2 = (max2 > min2) ? 1. / (max2 - min2) : 1.;
+
+    // Two slope histograms: regular orientation (x - y) and the inverted
+    // orientation (x + y), to also detect negative correlations.
+    final int[] slopeHist = new int[PRECISION];
+    final int[] inversionHist = new int[PRECISION];
+    for(int i = 0; i < len; i++) {
+      final double x = (adapter1.getDouble(data1, i) - off1) * scale1;
+      final double y = (adapter2.getDouble(data2, i) - off2) * scale2;
+      // x - y lies in [-1;1], shifted by +1 into [0;2]; x + y is already in
+      // [0;2]. Both are discretized into PRECISION bins (clamped).
+      int b1 = (int) Math.round((x - y + 1) * RESCALE);
+      b1 = (b1 < 0) ? 0 : (b1 >= PRECISION) ? PRECISION - 1 : b1;
+      ++slopeHist[b1];
+      int b2 = (int) Math.round((x + y) * RESCALE);
+      b2 = (b2 < 0) ? 0 : (b2 >= PRECISION) ? PRECISION - 1 : b2;
+      ++inversionHist[b2];
+    }
+
+    // Shannon entropy (negative) of each spectrum:
+    double entropy = 0., entropyI = 0.;
+    for(int b = 0; b < PRECISION; b++) {
+      if(slopeHist[b] > 0) {
+        final double p = slopeHist[b] / (double) len;
+        entropy += p * Math.log(p);
+      }
+      if(inversionHist[b] > 0) {
+        final double p = inversionHist[b] / (double) len;
+        entropyI += p * Math.log(p);
+      }
+    }
+    // Report the stronger of the two orientations, normalized to [0;1]:
+    final double best = (entropy >= entropyI) ? entropy : entropyI;
+    return 1 + best / LOG_PRECISION;
+  }
+
+  /**
+   * Parameterization class.
+   *
+   * @author Erich Schubert
+   *
+   * @apiviz.exclude
+   */
+  public static class Parameterizer extends AbstractParameterizer {
+    @Override
+    protected SlopeInversionDependenceMeasure makeInstance() {
+      return STATIC;
+    }
+  }
+}
diff --git a/src/de/lmu/ifi/dbs/elki/math/statistics/dependence/SpearmanCorrelationDependenceMeasure.java b/src/de/lmu/ifi/dbs/elki/math/statistics/dependence/SpearmanCorrelationDependenceMeasure.java new file mode 100644 index 00000000..24202dce --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/math/statistics/dependence/SpearmanCorrelationDependenceMeasure.java @@ -0,0 +1,78 @@ +package de.lmu.ifi.dbs.elki.math.statistics.dependence; + +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2014 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +import de.lmu.ifi.dbs.elki.utilities.datastructures.arraylike.NumberArrayAdapter; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer; + +/** + * Spearman rank-correlation coefficient, also known as Spearmans Rho. + * + * @author Erich Schubert + */ +public class SpearmanCorrelationDependenceMeasure extends AbstractDependenceMeasure { + /** + * Static instance. + */ + public static final SpearmanCorrelationDependenceMeasure STATIC = new SpearmanCorrelationDependenceMeasure(); + + /** + * Constructor - use {@link #STATIC} instance. 
+ */ + protected SpearmanCorrelationDependenceMeasure() { + super(); + } + + @Override + public <A, B> double dependence(NumberArrayAdapter<?, A> adapter1, A data1, NumberArrayAdapter<?, B> adapter2, B data2) { + final int len = size(adapter1, data1, adapter2, data2); + double[] ranks1 = computeNormalizedRanks(adapter1, data1, len); + double[] ranks2 = computeNormalizedRanks(adapter2, data2, len); + + // Second pass: variances and covariance + double v1 = 0., v2 = 0., cov = 0.; + for(int i = 0; i < len; i++) { + double d1 = ranks1[i] - .5, d2 = ranks2[i] - .5; + v1 += d1 * d1; + v2 += d2 * d2; + cov += d1 * d2; + } + // Note: we did not normalize by len, as this cancels out. + return cov / Math.sqrt(v1 * v2); + } + + /** + * Parameterization class + * + * @author Erich Schubert + * + * @apiviz.exclude + */ + public static class Parameterizer extends AbstractParameterizer { + @Override + protected SpearmanCorrelationDependenceMeasure makeInstance() { + return STATIC; + } + } +} diff --git a/src/de/lmu/ifi/dbs/elki/math/statistics/dependence/package-info.java b/src/de/lmu/ifi/dbs/elki/math/statistics/dependence/package-info.java new file mode 100644 index 00000000..a1f0bd55 --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/math/statistics/dependence/package-info.java @@ -0,0 +1,26 @@ +/** + * <p>Statistical measures of dependence, such as correlation.</p> + */ +/* + This file is part of ELKI: + Environment for Developing KDD-Applications Supported by Index-Structures + + Copyright (C) 2014 + Ludwig-Maximilians-Universität München + Lehr- und Forschungseinheit für Datenbanksysteme + ELKI Development Team + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. 
+ + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + */ +package de.lmu.ifi.dbs.elki.math.statistics.dependence;
\ No newline at end of file |