summaryrefslogtreecommitdiff
path: root/src/de/lmu/ifi/dbs/elki/math/dimensionsimilarity
diff options
context:
space:
mode:
Diffstat (limited to 'src/de/lmu/ifi/dbs/elki/math/dimensionsimilarity')
-rw-r--r--src/de/lmu/ifi/dbs/elki/math/dimensionsimilarity/CovarianceDimensionSimilarity.java82
-rw-r--r--src/de/lmu/ifi/dbs/elki/math/dimensionsimilarity/DimensionSimilarity.java46
-rw-r--r--src/de/lmu/ifi/dbs/elki/math/dimensionsimilarity/DimensionSimilarityMatrix.java247
-rw-r--r--src/de/lmu/ifi/dbs/elki/math/dimensionsimilarity/HSMDimensionSimilarity.java251
-rw-r--r--src/de/lmu/ifi/dbs/elki/math/dimensionsimilarity/HiCSDimensionSimilarity.java269
-rw-r--r--src/de/lmu/ifi/dbs/elki/math/dimensionsimilarity/MCEDimensionSimilarity.java288
-rw-r--r--src/de/lmu/ifi/dbs/elki/math/dimensionsimilarity/SURFINGDimensionSimilarity.java133
-rw-r--r--src/de/lmu/ifi/dbs/elki/math/dimensionsimilarity/SlopeDimensionSimilarity.java142
-rw-r--r--src/de/lmu/ifi/dbs/elki/math/dimensionsimilarity/SlopeInversionDimensionSimilarity.java158
-rw-r--r--src/de/lmu/ifi/dbs/elki/math/dimensionsimilarity/package-info.java26
10 files changed, 1642 insertions, 0 deletions
diff --git a/src/de/lmu/ifi/dbs/elki/math/dimensionsimilarity/CovarianceDimensionSimilarity.java b/src/de/lmu/ifi/dbs/elki/math/dimensionsimilarity/CovarianceDimensionSimilarity.java
new file mode 100644
index 00000000..ad59ebfd
--- /dev/null
+++ b/src/de/lmu/ifi/dbs/elki/math/dimensionsimilarity/CovarianceDimensionSimilarity.java
@@ -0,0 +1,82 @@
+package de.lmu.ifi.dbs.elki.math.dimensionsimilarity;
+
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2012
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+import de.lmu.ifi.dbs.elki.data.NumberVector;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDs;
+import de.lmu.ifi.dbs.elki.database.relation.Relation;
+import de.lmu.ifi.dbs.elki.math.linearalgebra.CovarianceMatrix;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
+
+/**
+ * Class to compute the dimension similarity based on covariances.
+ *
+ * @author Erich Schubert
+ */
+public class CovarianceDimensionSimilarity implements DimensionSimilarity<NumberVector<?>> {
+ /**
+ * Static instance
+ */
+ public static final CovarianceDimensionSimilarity STATIC = new CovarianceDimensionSimilarity();
+
+ /**
+ * Constructor. Use static instance.
+ */
+ protected CovarianceDimensionSimilarity() {
+ super();
+ }
+
+ @Override
+ public void computeDimensionSimilarites(Relation<? extends NumberVector<?>> relation, DBIDs subset, DimensionSimilarityMatrix matrix) {
+ final int dim = matrix.size();
+ // FIXME: Use only necessary dimensions!
+ CovarianceMatrix covmat = CovarianceMatrix.make(relation, subset);
+ double[][] mat = covmat.destroyToSampleMatrix().getArrayRef();
+ // Transform diagonal to 1 / stddev
+ for (int i = 0; i < mat.length; i++) {
+ mat[i][i] = 1. / Math.sqrt(mat[i][i]);
+ }
+ // Fill output matrix:
+ for (int x = 0; x < dim; x++) {
+ final int i = matrix.dim(x);
+ for (int y = x + 1; y < dim; y++) {
+ final int j = matrix.dim(y);
+ matrix.set(x, y, mat[i][j] * mat[i][i] * mat[j][j]);
+ }
+ }
+ }
+
+ /**
+ * Parameterization class.
+ *
+ * @author Erich Schubert
+ *
+ * @apiviz.exclude
+ */
+ public static class Parameterizer extends AbstractParameterizer {
+ @Override
+ protected CovarianceDimensionSimilarity makeInstance() {
+ return STATIC;
+ }
+ }
+}
diff --git a/src/de/lmu/ifi/dbs/elki/math/dimensionsimilarity/DimensionSimilarity.java b/src/de/lmu/ifi/dbs/elki/math/dimensionsimilarity/DimensionSimilarity.java
new file mode 100644
index 00000000..6b01b3f3
--- /dev/null
+++ b/src/de/lmu/ifi/dbs/elki/math/dimensionsimilarity/DimensionSimilarity.java
@@ -0,0 +1,46 @@
+package de.lmu.ifi.dbs.elki.math.dimensionsimilarity;
+
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2012
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+import de.lmu.ifi.dbs.elki.database.ids.DBIDs;
+import de.lmu.ifi.dbs.elki.database.relation.Relation;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.Parameterizable;
+
+/**
+ * Interface for computing pairwise dimension similarities, used for arranging
+ * dimensions in parallel coordinate plots.
+ *
+ * @author Erich Schubert
+ *
+ * @param <V> Object type
+ */
+public interface DimensionSimilarity<V> extends Parameterizable {
+ /**
+ * Compute the dimension similarity matrix
+ *
+ * @param relation Relation
+ * @param subset DBID subset (for sampling / selection)
+ * @param matrix Matrix to fill
+ */
+ public void computeDimensionSimilarites(Relation<? extends V> relation, DBIDs subset, DimensionSimilarityMatrix matrix);
+}
diff --git a/src/de/lmu/ifi/dbs/elki/math/dimensionsimilarity/DimensionSimilarityMatrix.java b/src/de/lmu/ifi/dbs/elki/math/dimensionsimilarity/DimensionSimilarityMatrix.java
new file mode 100644
index 00000000..9f7a9707
--- /dev/null
+++ b/src/de/lmu/ifi/dbs/elki/math/dimensionsimilarity/DimensionSimilarityMatrix.java
@@ -0,0 +1,247 @@
+package de.lmu.ifi.dbs.elki.math.dimensionsimilarity;
+
+import de.lmu.ifi.dbs.elki.math.geometry.PrimsMinimumSpanningTree;
+
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2012
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+/**
+ * Class representing a similarity matrix between dimensions.
+ *
+ * @author Erich Schubert
+ */
+public abstract class DimensionSimilarityMatrix {
+ /**
+ * Adapter class for running Prim's minimum spanning tree algorithm.
+ */
+ public static final PrimAdapter PRIM_ADAPTER = new PrimAdapter();
+
+ /**
+ * Flat, symmetric storage. We use a lower triangle matrix.
+ *
+ * Basic memory layout (X = undef, S = symmetric)
+ *
+ * <pre>
+ * X S S S S S
+ * 0 X S S S S
+ * 1 2 X S S S
+ * 3 4 5 X S S
+ * 6 7 8 9 X S
+ * 10 11 12 13 14 X
+ * </pre>
+ *
+ *
+ */
+ private final double[] sim;
+
+ /**
+ * Constructor.
+ *
+ * @param dims Number of dimensions to allocate.
+ */
+ protected DimensionSimilarityMatrix(int dims) {
+ super();
+ this.sim = new double[index(0, dims)];
+ }
+
+ /**
+ * Number of dimensions.
+ *
+ * @return Size of dimensions array.
+ */
+ public abstract int size();
+
+ /**
+ * Get the dimension at position idx.
+ *
+ * @param idx Position
+ * @return Dimension
+ */
+ public abstract int dim(int idx);
+
+ /**
+ * Set the value of the given matrix position.
+ *
+ * Note that {@code x == y} is invalid!
+ *
+ * @param x X index coordinate
+ * @param y Y index coordinate
+ * @param val Value
+ */
+ public void set(int x, int y, double val) {
+ sim[index(x, y)] = val;
+ }
+
+ /**
+ * Get the value of the given matrix position.
+ *
+ * Note that {@code x == y} is invalid!
+ *
+ * @param x X index coordinate
+ * @param y Y index coordinate
+ * @return Value
+ */
+ public double get(int x, int y) {
+ return sim[index(x, y)];
+ }
+
+ /**
+ * Indexing function for triangular matrix.
+ *
+ * @param x X coordinate
+ * @param y Y coordinate
+ * @return Array index
+ */
+ private int index(int x, int y) {
+ assert (x != y);
+ if (x > y) {
+ return index(y, x);
+ }
+ return ((y * (y - 1)) >> 1) + x;
+ }
+
+ @Override
+ public String toString() {
+ StringBuffer buf = new StringBuffer();
+ final int d = size();
+ for (int x = 1; x < d; x++) {
+ for (int y = 0; y < x; y++) {
+ if (y > 0) {
+ buf.append(' ');
+ }
+ buf.append(get(x, y));
+ }
+ buf.append('\n');
+ }
+ return buf.toString();
+ }
+
+ /**
+ * Complete matrix of pairwise dimension similarities.
+ *
+ * @author Erich Schubert
+ */
+ public static class FullDimensionSimilarityMatrix extends DimensionSimilarityMatrix {
+ /**
+ * Number of dimensions.
+ */
+ final int dims;
+
+ /**
+ * Constructor.
+ *
+ * @param dims Number of dimensions
+ */
+ public FullDimensionSimilarityMatrix(int dims) {
+ super(dims);
+ this.dims = dims;
+ }
+
+ @Override
+ public int size() {
+ return dims;
+ }
+
+ @Override
+ public int dim(int idx) {
+ return idx;
+ }
+ }
+
+ /**
+ * Partial matrix of pairwise dimension similarities.
+ *
+ * @author Erich Schubert
+ */
+ public static class PartialDimensionSimilarityMatrix extends DimensionSimilarityMatrix {
+ /**
+ * Enumeration of dimensions to use (so we could use a subset only!)
+ */
+ final int[] dims;
+
+ /**
+ * Constructor.
+ *
+ * @param dims Array of dimensions to process.
+ */
+ public PartialDimensionSimilarityMatrix(int[] dims) {
+ super(dims.length);
+ this.dims = dims;
+ }
+
+ @Override
+ public int size() {
+ return dims.length;
+ }
+
+ @Override
+ public int dim(int idx) {
+ return dims[idx];
+ }
+ }
+
+ /**
+ * Adapter class for running prim's algorithm.
+ *
+ * @author Erich Schubert
+ */
+ public static class PrimAdapter implements PrimsMinimumSpanningTree.Adapter<DimensionSimilarityMatrix> {
+ /**
+ * Constructor. Use static instance!
+ */
+ protected PrimAdapter() {
+ super();
+ }
+
+ @Override
+ public double distance(DimensionSimilarityMatrix data, int i, int j) {
+ return -Math.abs(data.get(i, j));
+ }
+
+ @Override
+ public int size(DimensionSimilarityMatrix data) {
+ return data.size();
+ }
+
+ }
+
+ /**
+ * Make a full dimension similarity matrix.
+ *
+ * @param dims Number of dimensions.
+ * @return Matrix
+ */
+ public static DimensionSimilarityMatrix make(int dims) {
+ return new FullDimensionSimilarityMatrix(dims);
+ }
+
+ /**
+ * Make a partial dimension similarity matrix.
+ *
+ * @param dims Array of relevant dimensions
+ * @return Matrix
+ */
+ public static DimensionSimilarityMatrix make(int[] dims) {
+ return new PartialDimensionSimilarityMatrix(dims);
+ }
+}
diff --git a/src/de/lmu/ifi/dbs/elki/math/dimensionsimilarity/HSMDimensionSimilarity.java b/src/de/lmu/ifi/dbs/elki/math/dimensionsimilarity/HSMDimensionSimilarity.java
new file mode 100644
index 00000000..b221866c
--- /dev/null
+++ b/src/de/lmu/ifi/dbs/elki/math/dimensionsimilarity/HSMDimensionSimilarity.java
@@ -0,0 +1,251 @@
+package de.lmu.ifi.dbs.elki.math.dimensionsimilarity;
+
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2012
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+import de.lmu.ifi.dbs.elki.data.NumberVector;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDIter;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDs;
+import de.lmu.ifi.dbs.elki.database.relation.Relation;
+import de.lmu.ifi.dbs.elki.math.SinCosTable;
+import de.lmu.ifi.dbs.elki.utilities.DatabaseUtil;
+import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
+import de.lmu.ifi.dbs.elki.utilities.pairs.Pair;
+
+/**
+ * FIXME: This needs serious TESTING before release. Large parts have been
+ * rewritten, but could not be tested at the time of rewriting.
+ *
+ * Compute the similarity of dimensions by using a hough transformation.
+ *
+ * Reference: <br>
+ * <p>
+ * A. Tatu, G. Albuquerque, M. Eisemann, P. Bak, H. Theisel, M. A. Magnor, and
+ * D. A. Keim.<br />
+ * Automated Analytical Methods to Support Visual Exploration of High-
+ * Dimensional Data. <br/>
+ * IEEEVisualization and Computer Graphics, 2011.
+ * </p>
+ *
+ * @author Erich Schubert
+ * @author Robert Rödler
+ */
+@Reference(authors = "A. Tatu, G. Albuquerque, M. Eisemann, P. Bak, H. Theisel, M. A. Magnor, and D. A. Keim.", title = "Automated Analytical Methods to Support Visual Exploration of High-Dimensional Data", booktitle = "IEEE Trans. Visualization and Computer Graphics, 2011", url = "http://dx.doi.org/10.1109/TVCG.2010.242")
+public class HSMDimensionSimilarity implements DimensionSimilarity<NumberVector<?>> {
+ /**
+ * Static instance.
+ */
+ public static final HSMDimensionSimilarity STATIC = new HSMDimensionSimilarity();
+
+ /**
+ * Angular resolution. Best if divisible by 4: smaller tables.
+ *
+ * The original publication used 50.
+ */
+ private final static int STEPS = 64;
+
+ /**
+ * Precompute sinus and cosinus
+ */
+ private final static SinCosTable table = SinCosTable.make(STEPS);
+
+ /**
+ * Constructor. Use static instance instead!
+ */
+ protected HSMDimensionSimilarity() {
+ super();
+ }
+
+ @Override
+ public void computeDimensionSimilarites(Relation<? extends NumberVector<?>> relation, DBIDs subset, DimensionSimilarityMatrix matrix) {
+ final int dim = matrix.size();
+ final int resolution = 500;
+ byte[][][][] pics = new byte[dim][dim][][]; // [resolution][resolution];
+
+ // Initialize / allocate "pictures":
+ for (int i = 0; i < dim - 1; i++) {
+ for (int j = i + 1; j < dim; j++) {
+ pics[i][j] = new byte[resolution][resolution];
+ }
+ }
+ // FIXME: Get/keep these statistics in the relation, or compute for the
+ // sample only.
+ double[] off = new double[dim], scale = new double[dim];
+ {
+ Pair<? extends NumberVector<?>, ? extends NumberVector<?>> mm = DatabaseUtil.computeMinMax(relation);
+ NumberVector<?> min = mm.first;
+ NumberVector<?> max = mm.second;
+ for (int d = 0; d < dim; d++) {
+ off[d] = min.doubleValue(matrix.dim(d));
+ final double m = max.doubleValue(matrix.dim(d));
+ scale[d] = (m > off[d]) ? 1. / (m - off[d]) : 1;
+ }
+ }
+ // Iterate dataset
+ for (DBIDIter id = subset.iter(); id.valid(); id.advance()) {
+ NumberVector<?> pvec = relation.get(id);
+ for (int i = 0; i < dim - 1; i++) {
+ for (int j = i + 1; j < dim; j++) {
+ double xi = (pvec.doubleValue(matrix.dim(i)) - off[i]) * scale[i];
+ double xj = (pvec.doubleValue(matrix.dim(j)) - off[j]) * scale[j];
+ drawLine(0, (int) (resolution * xi), resolution - 1, (int) (resolution * xj), pics[i][j]);
+ }
+ }
+ }
+
+ final double stepsq = (double) STEPS * (double) STEPS;
+ for (int x = 0; x < dim; x++) {
+ final int i = matrix.dim(x);
+ for (int y = x + 1; y < dim; y++) {
+ final int j = matrix.dim(y);
+ int[][] hough = houghTransformation(pics[i][j]);
+ pics[i][j] = null; // Release picture
+ // The original publication said "median", but judging from the text,
+ // meant "mean". Otherwise, always half of the cells are above the
+ // threshold, which doesn't match the explanation there.
+ double mean = sumMatrix(hough) / stepsq;
+ int abovemean = countAboveThreshold(hough, mean);
+
+ matrix.set(x, y, 1. - (abovemean / stepsq));
+ }
+ }
+ }
+
+ /**
+ * Compute the sum of a matix.
+ *
+ * @param mat Matrix
+ * @return Sum of all elements
+ */
+ private long sumMatrix(int[][] mat) {
+ long ret = 0;
+ for (int i = 0; i < mat[0].length; i++) {
+ for (int j = 0; j < mat.length; j++) {
+ ret += mat[i][j];
+ }
+ }
+ return ret;
+ }
+
+ /**
+ * Count the number of cells above the threshold.
+ *
+ * @param mat Matrix
+ * @param threshold Threshold
+ * @return Number of elements above the threshold.
+ */
+ private int countAboveThreshold(int[][] mat, double threshold) {
+ int ret = 0;
+ for (int i = 0; i < mat.length; i++) {
+ int[] row = mat[i];
+ for (int j = 0; j < row.length; j++) {
+ if (row[j] >= threshold) {
+ ret++;
+ }
+ }
+ }
+ return ret;
+ }
+
+ /**
+ * Perform a hough transformation on the binary image in "mat".
+ *
+ * @param mat Binary image
+ * @return Hough transformation of image.
+ */
+ private int[][] houghTransformation(byte[][] mat) {
+ final int xres = mat.length, yres = mat[0].length;
+ final double tscale = STEPS / Math.sqrt(xres * xres + yres * yres);
+ final int[][] ret = new int[STEPS][STEPS];
+
+ for (int x = 0; x < mat.length; x++) {
+ for (int y = 0; y < mat[0].length; y++) {
+ if (mat[x][y] > 0) {
+ for (int i = 0; i < STEPS; i++) {
+ final int d = (int) (tscale * (x * table.cos(i) + y * table.sin(i)));
+ if (d > 0 && d < STEPS) {
+ ret[d][i] += mat[x][y];
+ }
+ }
+ }
+ }
+ }
+
+ return ret;
+ }
+
+ /**
+ * Draw a line onto the array, using the classic Bresenham algorithm.
+ *
+ * @param x0 Start X
+ * @param y0 Start Y
+ * @param x1 End X
+ * @param y1 End Y
+ * @param pic Picture array
+ */
+ private static void drawLine(int x0, int y0, int x1, int y1, byte[][] pic) {
+ final int xres = pic.length, yres = pic[0].length;
+ // Ensure bounds
+ y0 = (y0 < 0) ? 0 : (y0 >= yres) ? (yres - 1) : y0;
+ y1 = (y1 < 0) ? 0 : (y1 >= yres) ? (yres - 1) : y1;
+ x0 = (x0 < 0) ? 0 : (x0 >= xres) ? (xres - 1) : x0;
+ x1 = (x1 < 0) ? 0 : (x1 >= xres) ? (xres - 1) : x1;
+ // Default slope
+ final int dx = +Math.abs(x1 - x0), sx = x0 < x1 ? 1 : -1;
+ final int dy = -Math.abs(y1 - y0), sy = y0 < y1 ? 1 : -1;
+ // Error counter
+ int err = dx + dy;
+
+ for (;;) {
+ pic[x0][y0] = 1;
+ if (x0 == x1 && y0 == y1) {
+ break;
+ }
+
+ final int e2 = err << 1;
+ if (e2 > dy) {
+ err += dy;
+ x0 += sx;
+ }
+ if (e2 < dx) {
+ err += dx;
+ y0 += sy;
+ }
+ }
+ }
+
+ /**
+ * Parameterization class.
+ *
+ * @author Erich Schubert
+ *
+ * @apiviz.exclude
+ */
+ public static class Parameterizer extends AbstractParameterizer {
+ @Override
+ protected HSMDimensionSimilarity makeInstance() {
+ return STATIC;
+ }
+ }
+}
diff --git a/src/de/lmu/ifi/dbs/elki/math/dimensionsimilarity/HiCSDimensionSimilarity.java b/src/de/lmu/ifi/dbs/elki/math/dimensionsimilarity/HiCSDimensionSimilarity.java
new file mode 100644
index 00000000..468db679
--- /dev/null
+++ b/src/de/lmu/ifi/dbs/elki/math/dimensionsimilarity/HiCSDimensionSimilarity.java
@@ -0,0 +1,269 @@
+package de.lmu.ifi.dbs.elki.math.dimensionsimilarity;
+
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2012
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+import java.util.ArrayList;
+import java.util.Random;
+
+import de.lmu.ifi.dbs.elki.algorithm.outlier.meta.HiCS;
+import de.lmu.ifi.dbs.elki.data.NumberVector;
+import de.lmu.ifi.dbs.elki.data.VectorUtil;
+import de.lmu.ifi.dbs.elki.data.VectorUtil.SortDBIDsBySingleDimension;
+import de.lmu.ifi.dbs.elki.database.ids.ArrayDBIDs;
+import de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDArrayIter;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDIter;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDs;
+import de.lmu.ifi.dbs.elki.database.ids.HashSetModifiableDBIDs;
+import de.lmu.ifi.dbs.elki.database.relation.Relation;
+import de.lmu.ifi.dbs.elki.math.statistics.tests.GoodnessOfFitTest;
+import de.lmu.ifi.dbs.elki.math.statistics.tests.KolmogorovSmirnovTest;
+import de.lmu.ifi.dbs.elki.utilities.RandomFactory;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DoubleParameter;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.RandomParameter;
+
+/**
+ * Use the statistical tests as used by HiCS to arrange dimensions.
+ *
+ * <p>
+ * Based on:<br />
+ * Fabian Keller, Emmanuel Müller, and Klemens Böhm.<br />
+ * HiCS: High Contrast Subspaces for Density-Based Outlier Ranking. <br />
+ * In ICDE, pages 1037–1048, 2012.
+ * </p>
+ *
+ * @author Erich Schubert
+ * @author Robert Rödler
+ */
+public class HiCSDimensionSimilarity implements DimensionSimilarity<NumberVector<?>> {
+ /**
+ * Monte-Carlo iterations
+ */
+ private int m = 50;
+
+ /**
+ * Alpha threshold
+ */
+ private double alpha = 0.1;
+
+ /**
+ * Statistical test to use
+ */
+ private GoodnessOfFitTest statTest;
+
+ /**
+ * Random generator
+ */
+ private RandomFactory rnd;
+
+ /**
+ * Constructor.
+ *
+ * @param statTest Test function
+ * @param m Number of monte-carlo iterations
+ * @param alpha Alpha threshold
+ * @param rnd Random source
+ */
+ public HiCSDimensionSimilarity(GoodnessOfFitTest statTest, int m, double alpha, RandomFactory rnd) {
+ super();
+ this.statTest = statTest;
+ this.m = m;
+ this.alpha = alpha;
+ this.rnd = rnd;
+ }
+
+ @Override
+ public void computeDimensionSimilarites(Relation<? extends NumberVector<?>> relation, DBIDs subset, DimensionSimilarityMatrix matrix) {
+ final Random random = rnd.getRandom();
+ final int dim = matrix.size();
+
+ // FIXME: only compute indexes necessary.
+ ArrayList<ArrayDBIDs> subspaceIndex = buildOneDimIndexes(relation, subset, matrix);
+
+ // compute two-element sets of subspaces
+ for (int x = 0; x < dim; x++) {
+ final int i = matrix.dim(x);
+ for (int y = x + 1; y < dim; y++) {
+ final int j = matrix.dim(y);
+ matrix.set(x, y, calculateContrast(relation, subset, subspaceIndex.get(x), subspaceIndex.get(y), i, j, random));
+ }
+ }
+ }
+
+ /**
+ * Calculates "index structures" for every attribute, i.e. sorts a
+ * ModifiableArray of every DBID in the database for every dimension and
+ * stores them in a list
+ *
+ * @param relation Relation to index
+ * @param ids IDs to use
+ * @param matrix Matrix (for dimension subset)
+ * @return List of sorted objects
+ */
+ private ArrayList<ArrayDBIDs> buildOneDimIndexes(Relation<? extends NumberVector<?>> relation, DBIDs ids, DimensionSimilarityMatrix matrix) {
+ final int dim = matrix.size();
+ ArrayList<ArrayDBIDs> subspaceIndex = new ArrayList<ArrayDBIDs>(dim);
+
+ SortDBIDsBySingleDimension comp = new VectorUtil.SortDBIDsBySingleDimension(relation);
+ for (int i = 0; i < dim; i++) {
+ ArrayModifiableDBIDs amDBIDs = DBIDUtil.newArray(ids);
+ comp.setDimension(matrix.dim(i));
+ amDBIDs.sort(comp);
+ subspaceIndex.add(amDBIDs);
+ }
+
+ return subspaceIndex;
+ }
+
+ /**
+ * Calculates the actual contrast of a given subspace
+ *
+ * @param relation Data relation
+ * @param subset Subset to process
+ * @param subspaceIndex1 Index of first subspace
+ * @param subspaceIndex2 Index of second subspace
+ * @param dim1 First dimension
+ * @param dim2 Second dimension
+ * @param random Random generator
+ * @return Contrast
+ */
+ private double calculateContrast(Relation<? extends NumberVector<?>> relation, DBIDs subset, ArrayDBIDs subspaceIndex1, ArrayDBIDs subspaceIndex2, int dim1, int dim2, Random random) {
+ final double alpha1 = Math.pow(alpha, .5);
+ final int windowsize = (int) (relation.size() * alpha1);
+
+ // TODO: speed up by keeping marginal distributions prepared.
+ // Instead of doing the random switch, do half-half.
+ double deviationSum = 0.0;
+ for (int i = 0; i < m; i++) {
+ // Randomly switch dimensions
+ final int cdim1;
+ ArrayDBIDs cindex1, cindex2;
+ if (random.nextDouble() > .5) {
+ cdim1 = dim1;
+ cindex1 = subspaceIndex1;
+ cindex2 = subspaceIndex2;
+ } else {
+ cdim1 = dim2;
+ cindex1 = subspaceIndex2;
+ cindex2 = subspaceIndex1;
+ }
+ // Build the sample
+ DBIDArrayIter iter = cindex2.iter();
+ HashSetModifiableDBIDs conditionalSample = DBIDUtil.newHashSet();
+ iter.seek(random.nextInt(subset.size() - windowsize));
+ for (int k = 0; k < windowsize && iter.valid(); k++, iter.advance()) {
+ conditionalSample.add(iter);
+ }
+ // Project the data
+ double[] fullValues = new double[subset.size()];
+ double[] sampleValues = new double[conditionalSample.size()];
+ {
+ int l = 0, s = 0;
+ // Note: we use the sorted index sets.
+ for (DBIDIter id = cindex1.iter(); id.valid(); id.advance(), l++) {
+ final double val = relation.get(id).doubleValue(cdim1);
+ fullValues[l] = val;
+ if (conditionalSample.contains(id)) {
+ sampleValues[s] = val;
+ s++;
+ }
+ }
+ assert (s == conditionalSample.size());
+ }
+ double contrast = statTest.deviation(fullValues, sampleValues);
+ if (Double.isNaN(contrast)) {
+ i--;
+ continue;
+ }
+ deviationSum += contrast;
+ }
+ return deviationSum / m;
+ }
+
+ /**
+ * Parameterization class.
+ *
+ * @author Erich Schubert
+ *
+ * @apiviz.exclude
+ */
+ public static class Parameterizer extends AbstractParameterizer {
+ /**
+ * Statistical test to use
+ */
+ private GoodnessOfFitTest statTest;
+
+ /**
+ * Holds the value of {@link de.lmu.ifi.dbs.elki.algorithm.outlier.meta.HiCS.Parameterizer#M_ID}.
+ */
+ private int m = 50;
+
+ /**
+ * Holds the value of {@link de.lmu.ifi.dbs.elki.algorithm.outlier.meta.HiCS.Parameterizer#ALPHA_ID}.
+ */
+ private double alpha = 0.1;
+
+ /**
+ * Random generator.
+ */
+ private RandomFactory rnd;
+
+ @Override
+ protected void makeOptions(Parameterization config) {
+ super.makeOptions(config);
+ final IntParameter mP = new IntParameter(HiCS.Parameterizer.M_ID, 50);
+ mP.addConstraint(new GreaterConstraint(1));
+ if (config.grab(mP)) {
+ m = mP.intValue();
+ }
+
+ final DoubleParameter alphaP = new DoubleParameter(HiCS.Parameterizer.ALPHA_ID, 0.1);
+ alphaP.addConstraint(new GreaterConstraint(0));
+ if (config.grab(alphaP)) {
+ alpha = alphaP.doubleValue();
+ }
+
+ final ObjectParameter<GoodnessOfFitTest> testP = new ObjectParameter<GoodnessOfFitTest>(HiCS.Parameterizer.TEST_ID, GoodnessOfFitTest.class, KolmogorovSmirnovTest.class);
+ if (config.grab(testP)) {
+ statTest = testP.instantiateClass(config);
+ }
+
+ final RandomParameter rndP = new RandomParameter(HiCS.Parameterizer.SEED_ID);
+ if (config.grab(rndP)) {
+ rnd = rndP.getValue();
+ }
+ }
+
+ @Override
+ protected HiCSDimensionSimilarity makeInstance() {
+ return new HiCSDimensionSimilarity(statTest, m, alpha, rnd);
+ }
+ }
+}
diff --git a/src/de/lmu/ifi/dbs/elki/math/dimensionsimilarity/MCEDimensionSimilarity.java b/src/de/lmu/ifi/dbs/elki/math/dimensionsimilarity/MCEDimensionSimilarity.java
new file mode 100644
index 00000000..aecdf857
--- /dev/null
+++ b/src/de/lmu/ifi/dbs/elki/math/dimensionsimilarity/MCEDimensionSimilarity.java
@@ -0,0 +1,288 @@
+package de.lmu.ifi.dbs.elki.math.dimensionsimilarity;
+
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2012
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+import java.util.ArrayList;
+import java.util.Arrays;
+
+import de.lmu.ifi.dbs.elki.data.NumberVector;
+import de.lmu.ifi.dbs.elki.data.VectorUtil;
+import de.lmu.ifi.dbs.elki.data.VectorUtil.SortDBIDsBySingleDimension;
+import de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDArrayIter;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDs;
+import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs;
+import de.lmu.ifi.dbs.elki.database.relation.Relation;
+import de.lmu.ifi.dbs.elki.math.MathUtil;
+import de.lmu.ifi.dbs.elki.math.Mean;
+import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
+
+/**
+ * Compute dimension similarity by using a nested means discretization.
+ *
+ * Reference:
+ * <p>
+ * Diansheng Guo<br />
+ * Coordinating computational and visual approaches for interactive feature
+ * selection and multivariate clustering<br />
+ * Information Visualization, 2(4), 2003.
+ * </p>
+ *
+ * @author Erich Schubert
+ */
+@Reference(authors = "Diansheng Guo", title = "Coordinating computational and visual approaches for interactive feature selection and multivariate clustering", booktitle = "Information Visualization, 2(4)", url = "http://dx.doi.org/10.1057/palgrave.ivs.9500053")
+public class MCEDimensionSimilarity implements DimensionSimilarity<NumberVector<?>> {
+ /**
+ * Static instance.
+ */
+ public static final MCEDimensionSimilarity STATIC = new MCEDimensionSimilarity();
+
+ /**
+ * Desired size: 35 observations.
+ *
+ * While this could trivially be made parameterizable, it is a reasonable rule
+ * of thumb and not expected to have a major effect.
+ */
+ public static final int TARGET = 35;
+
+ /**
+ * Constructor. Use static instance instead!
+ */
+ protected MCEDimensionSimilarity() {
+ super();
+ }
+
+ @Override
+ public void computeDimensionSimilarites(Relation<? extends NumberVector<?>> relation, DBIDs subset, DimensionSimilarityMatrix matrix) {
+ final int dim = matrix.size();
+
+ // Find a number of bins as recommended by Cheng et al.
+ double p = Math.log(subset.size() / (double) TARGET) / MathUtil.LOG2;
+ // As we are in 2d, take the root (*.5) But let's use at least 1, too.
+ // Check: for 10000 this should give 4, for 150 it gives 1.
+ int power = Math.max(1, (int) Math.floor(p * .5));
+ int gridsize = 1 << power;
+ double loggrid = Math.log((double) gridsize);
+
+ ArrayList<ArrayList<DBIDs>> parts = buildPartitions(relation, subset, power, matrix);
+
+ // Partition sizes
+ int[][] psizes = new int[dim][gridsize];
+ for (int d = 0; d < dim; d++) {
+ final ArrayList<DBIDs> partsd = parts.get(d);
+ final int[] sizesp = psizes[d];
+ for (int i = 0; i < gridsize; i++) {
+ sizesp[i] = partsd.get(i).size();
+ }
+ }
+
+ int[][] res = new int[gridsize][gridsize];
+ for (int x = 0; x < dim; x++) {
+ ArrayList<DBIDs> partsi = parts.get(x);
+ for (int y = x + 1; y < dim; y++) {
+ ArrayList<DBIDs> partsj = parts.get(y);
+ // Fill the intersection matrix
+ intersectionMatrix(res, partsi, partsj, gridsize);
+ matrix.set(x, y, 1. - getMCEntropy(res, psizes[x], psizes[y], subset.size(), gridsize, loggrid));
+ }
+ }
+ }
+
+ /**
+ * Intersect the two 1d grid decompositions, to obtain a 2d matrix.
+ *
+ * @param res Output matrix to fill
+ * @param partsx Partitions in first component
+ * @param partsy Partitions in second component.
+ * @param gridsize Size of partition decomposition
+ */
+ private void intersectionMatrix(int[][] res, ArrayList<? extends DBIDs> partsx, ArrayList<? extends DBIDs> partsy, int gridsize) {
+ for (int x = 0; x < gridsize; x++) {
+ final DBIDs px = partsx.get(x);
+ final int[] rowx = res[x];
+ for (int y = 0; y < gridsize; y++) {
+ rowx[y] = DBIDUtil.intersectionSize(px, partsy.get(y));
+ }
+ }
+ }
+
+ /**
+ * Compute the MCE entropy value.
+ *
+ * @param mat Partition size matrix
+ * @param psizesx Partition sizes on X
+ * @param psizesy Partition sizes on Y
+ * @param size Data set size
+ * @param gridsize Size of grids
+ * @param loggrid Logarithm of grid sizes, for normalization
+ * @return MCE score.
+ */
+ private double getMCEntropy(int[][] mat, int[] psizesx, int[] psizesy, int size, int gridsize, double loggrid) {
+ // Margin entropies:
+ double[] mx = new double[gridsize];
+ double[] my = new double[gridsize];
+
+ for (int i = 0; i < gridsize; i++) {
+ // Note: indexes are a bit tricky here, because we compute both margin
+ // entropies at the same time!
+ final double sumx = (double) psizesx[i];
+ final double sumy = (double) psizesy[i];
+ for (int j = 0; j < gridsize; j++) {
+ double px = mat[i][j] / sumx;
+ double py = mat[j][i] / sumy;
+
+ if (px > 0.) {
+ mx[i] -= px * Math.log(px);
+ }
+ if (py > 0.) {
+ my[i] -= py * Math.log(py);
+ }
+ }
+ }
+
+ // Weighted sums of margin entropies.
+ double sumx = 0., sumy = 0.;
+ for (int i = 0; i < gridsize; i++) {
+ sumx += mx[i] * psizesx[i];
+ sumy += my[i] * psizesy[i];
+ }
+
+ double max = ((sumx > sumy) ? sumx : sumy);
+ return max / (size * loggrid);
+ }
+
+ /**
+ * Calculates "index structures" for every attribute, i.e. sorts a
+ * ModifiableArray of every DBID in the database for every dimension and
+ * stores them in a list.
+ *
+ * @param relation Relation to index
+ * @param ids IDs to use
+ * @param matrix Matrix for dimension information
+ * @return List of sorted objects
+ */
+ private ArrayList<ArrayList<DBIDs>> buildPartitions(Relation<? extends NumberVector<?>> relation, DBIDs ids, int depth, DimensionSimilarityMatrix matrix) {
+ final int dim = matrix.size();
+ ArrayList<ArrayList<DBIDs>> subspaceIndex = new ArrayList<ArrayList<DBIDs>>(dim);
+ SortDBIDsBySingleDimension comp = new VectorUtil.SortDBIDsBySingleDimension(relation);
+ double[] tmp = new double[ids.size()];
+ Mean mean = new Mean();
+
+ for (int i = 0; i < dim; i++) {
+ final int d = matrix.dim(i);
+ // Index for a single dimension:
+ ArrayList<DBIDs> idx = new ArrayList<DBIDs>(1 << depth);
+ // First, we need a copy of the DBIDs and sort it.
+ ArrayModifiableDBIDs sids = DBIDUtil.newArray(ids);
+ comp.setDimension(d);
+ sids.sort(comp);
+ // Now we build the temp array, and compute the first mean.
+ DBIDArrayIter it = sids.iter();
+ for (int j = 0; j < tmp.length; j++, it.advance()) {
+ assert (it.valid());
+ tmp[j] = relation.get(it).doubleValue(d);
+ }
+ divide(it, tmp, idx, 0, tmp.length, depth, mean);
+ assert (idx.size() == (1 << depth));
+ subspaceIndex.add(idx);
+ }
+
+ return subspaceIndex;
+ }
+
+ /**
+ * Recursive call to further subdivide the array.
+ *
+ * @param it Iterator (will be reset!)
+ * @param data 1D data, sorted
+ * @param idx Output index
+ * @param start Interval start
+ * @param end Interval end
+ * @param depth Depth
+ * @param mean Mean working variable (will be reset!)
+ */
+ private void divide(DBIDArrayIter it, double[] data, ArrayList<DBIDs> idx, int start, int end, int depth, Mean mean) {
+ final int count = end - start;
+ if (depth == 0) {
+ if (count > 0) {
+ ModifiableDBIDs out = DBIDUtil.newHashSet(count);
+ it.seek(start);
+ for (int i = count; i > 0; i--, it.advance()) {
+ out.add(it);
+ }
+ idx.add(out);
+ } else {
+ idx.add(DBIDUtil.EMPTYDBIDS);
+ }
+ return;
+ } else {
+ if (count > 0) {
+ mean.reset();
+ for (int i = start; i < end; i++) {
+ mean.put(data[i]);
+ }
+ final double m = mean.getMean();
+ int pos = Arrays.binarySearch(data, start, end, m);
+ if (pos >= 0) {
+ // Ties: try to choose the most central element.
+ int opt = (start + end) >> 1;
+ while (Double.compare(data[pos], m) == 0) {
+ if (pos < opt) {
+ pos++;
+ } else if (pos > opt) {
+ pos--;
+ } else {
+ break;
+ }
+ }
+ } else {
+ pos = (-pos - 1);
+ }
+ divide(it, data, idx, start, pos, depth - 1, mean);
+ divide(it, data, idx, pos, end, depth - 1, mean);
+ } else {
+ // Corner case, that should barely happen. But for ties, we currently
+ // Do not yet assure that it doesn't happen!
+ divide(it, data, idx, start, end, depth - 1, mean);
+ divide(it, data, idx, start, end, depth - 1, mean);
+ }
+ }
+ }
+
+ /**
+ * Parameterization class.
+ *
+ * @author Erich Schubert
+ *
+ * @apiviz.exclude
+ */
+ public static class Parameterizer extends AbstractParameterizer {
+ @Override
+ protected MCEDimensionSimilarity makeInstance() {
+ return STATIC;
+ }
+ }
+}
diff --git a/src/de/lmu/ifi/dbs/elki/math/dimensionsimilarity/SURFINGDimensionSimilarity.java b/src/de/lmu/ifi/dbs/elki/math/dimensionsimilarity/SURFINGDimensionSimilarity.java
new file mode 100644
index 00000000..551b4759
--- /dev/null
+++ b/src/de/lmu/ifi/dbs/elki/math/dimensionsimilarity/SURFINGDimensionSimilarity.java
@@ -0,0 +1,133 @@
+package de.lmu.ifi.dbs.elki.math.dimensionsimilarity;
+
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2012
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+import java.util.BitSet;
+
+import de.lmu.ifi.dbs.elki.data.NumberVector;
+import de.lmu.ifi.dbs.elki.database.Database;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDIter;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDs;
+import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery;
+import de.lmu.ifi.dbs.elki.database.query.knn.KNNQuery;
+import de.lmu.ifi.dbs.elki.database.relation.Relation;
+import de.lmu.ifi.dbs.elki.distance.distancefunction.subspace.SubspaceEuclideanDistanceFunction;
+import de.lmu.ifi.dbs.elki.distance.distancevalue.DoubleDistance;
+import de.lmu.ifi.dbs.elki.math.Mean;
+import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
+
+/**
+ * Compute the similarity of dimensions using the SURFING score. The parameter k
+ * for the k nearest neighbors is currently hard-coded to 10% of the set size.
+ *
+ * Note that the complexity is roughly O(n n k) * O(d^2), so this is a rather
+ * slow method without index support, and with k at 10% of n, is actually cubic.
+ * So try to use an appropriate index!
+ *
+ * Reference:
+ * <p>
+ * Christian Baumgartner, Claudia Plant, Karin Kailing, Hans-Peter Kriegel, and
+ * Peer Kröger<br />
+ * Subspace Selection for Clustering High-Dimensional Data<br />
+ * In IEEE International Conference on Data Mining, 2004.
+ * </p>
+ *
+ * TODO: make the subspace distance function and k parameterizable.
+ *
+ * @author Robert Rödler
+ * @author Erich Schubert
+ *
+ * @apiviz.uses SubspaceEuclideanDistanceFunction
+ */
+@Reference(authors = "Christian Baumgartner, Claudia Plant, Karin Kailing, Hans-Peter Kriegel, and Peer Kröger", title = "Subspace Selection for Clustering High-Dimensional Data", booktitle = "IEEE International Conference on Data Mining, 2004", url = "http://dx.doi.org/10.1109/ICDM.2004.10112")
+public class SURFINGDimensionSimilarity implements DimensionSimilarity<NumberVector<?>> {
+ /**
+ * Static instance.
+ */
+ public static final SURFINGDimensionSimilarity STATIC = new SURFINGDimensionSimilarity();
+
+ /**
+ * Constructor. Use static instance instead!
+ */
+ protected SURFINGDimensionSimilarity() {
+ super();
+ }
+
+ @Override
+ public void computeDimensionSimilarites(Relation<? extends NumberVector<?>> relation, DBIDs subset, DimensionSimilarityMatrix matrix) {
+ final int dim = matrix.size();
+ final Database db = relation.getDatabase();
+ Mean kdistmean = new Mean();
+ final int k = Math.max(1, subset.size() / 10);
+
+ double[] knns = new double[subset.size()];
+
+ // TODO: optimize by using 1d indexes?
+ for (int x = 0; x < dim; x++) {
+ final int i = matrix.dim(x);
+ for (int y = x + 1; y < dim; y++) {
+ final int j = matrix.dim(y);
+ BitSet dims = new BitSet(dim);
+ dims.set(i);
+ dims.set(j);
+ DistanceQuery<? extends NumberVector<?>, DoubleDistance> dq = db.getDistanceQuery(relation, new SubspaceEuclideanDistanceFunction(dims));
+ KNNQuery<? extends NumberVector<?>, DoubleDistance> knnq = db.getKNNQuery(dq, k);
+
+ kdistmean.reset();
+ int knn = 0;
+ for (DBIDIter id1 = subset.iter(); id1.valid(); id1.advance(), knn++) {
+ final double kdist = knnq.getKNNForDBID(id1, k).getKNNDistance().doubleValue();
+ kdistmean.put(kdist);
+ knns[knn] = kdist;
+ }
+ double mean = kdistmean.getMean();
+ // Deviation from mean:
+ double diff = 0.;
+ int below = 0;
+ for (int l = 0; l < knns.length; l++) {
+ diff += Math.abs(mean - knns[l]);
+ if (knns[l] < mean) {
+ below++;
+ }
+ }
+ matrix.set(x, y, (below > 0) ? diff / (2. * mean * below) : 0);
+ }
+ }
+ }
+
+ /**
+ * Parameterization class.
+ *
+ * @author Erich Schubert
+ *
+ * @apiviz.exclude
+ */
+ public static class Parameterizer extends AbstractParameterizer {
+ @Override
+ protected SURFINGDimensionSimilarity makeInstance() {
+ return STATIC;
+ }
+ }
+}
diff --git a/src/de/lmu/ifi/dbs/elki/math/dimensionsimilarity/SlopeDimensionSimilarity.java b/src/de/lmu/ifi/dbs/elki/math/dimensionsimilarity/SlopeDimensionSimilarity.java
new file mode 100644
index 00000000..3c81da63
--- /dev/null
+++ b/src/de/lmu/ifi/dbs/elki/math/dimensionsimilarity/SlopeDimensionSimilarity.java
@@ -0,0 +1,142 @@
+package de.lmu.ifi.dbs.elki.math.dimensionsimilarity;
+
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2012
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+import de.lmu.ifi.dbs.elki.data.NumberVector;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDIter;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDs;
+import de.lmu.ifi.dbs.elki.database.relation.Relation;
+import de.lmu.ifi.dbs.elki.utilities.DatabaseUtil;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
+import de.lmu.ifi.dbs.elki.utilities.pairs.Pair;
+
+/**
+ * Arrange dimensions based on the entropy of the slope spectrum.
+ *
+ * @author Erich Schubert
+ * @author Robert Rödler
+ */
+public class SlopeDimensionSimilarity implements DimensionSimilarity<NumberVector<?>> {
+ /**
+ * Static instance.
+ */
+ public static final SlopeDimensionSimilarity STATIC = new SlopeDimensionSimilarity();
+
+ /**
+ * Full precision.
+ */
+ protected final static int PRECISION = 40;
+
+ /**
+ * Precision for entropy normalization.
+ */
+ protected final static double LOG_PRECISION = Math.log(PRECISION);
+
+ /**
+ * Scaling factor.
+ */
+ protected final static double RESCALE = PRECISION * .5;
+
+ /**
+ * Constructor. Use static instance instead!
+ */
+ protected SlopeDimensionSimilarity() {
+ super();
+ }
+
+ @Override
+ public void computeDimensionSimilarites(Relation<? extends NumberVector<?>> relation, DBIDs subset, DimensionSimilarityMatrix matrix) {
+ final int dim = matrix.size();
+ final int size = subset.size();
+
+ // FIXME: Get/keep these statistics in the relation, or compute for the
+ // sample only.
+ double[] off = new double[dim], scale = new double[dim];
+ {
+ Pair<? extends NumberVector<?>, ? extends NumberVector<?>> mm = DatabaseUtil.computeMinMax(relation);
+ NumberVector<?> min = mm.first;
+ NumberVector<?> max = mm.second;
+ for (int d = 0; d < dim; d++) {
+ off[d] = min.doubleValue(matrix.dim(d));
+ final double m = max.doubleValue(matrix.dim(d));
+ scale[d] = (m > off[d]) ? 1. / (m - off[d]) : 1;
+ }
+ }
+
+ // Collect angular histograms.
+ // Note, we only fill half of the matrix
+ int[][][] angles = new int[dim][dim][PRECISION];
+
+ // Scratch buffer
+ double[] vec = new double[dim];
+ for (DBIDIter id = subset.iter(); id.valid(); id.advance()) {
+ final NumberVector<?> obj = relation.get(id);
+ // Map values to 0..1
+ for (int d = 0; d < dim; d++) {
+ vec[d] = (obj.doubleValue(matrix.dim(d)) - off[d]) * scale[d];
+ }
+ for (int i = 0; i < dim - 1; i++) {
+ for (int j = i + 1; j < dim; j++) {
+ // This will be on a scale of 0 .. 2:
+ final double delta = vec[j] - vec[i] + 1;
+ int div = (int) Math.round(delta * RESCALE);
+ // TODO: do we really need this check?
+ div = (div < 0) ? 0 : (div >= PRECISION) ? PRECISION - 1 : div;
+ angles[i][j][div] += 1;
+ }
+ }
+ }
+
+ // Compute entropy in each combination:
+ for (int x = 0; x < dim; x++) {
+ for (int y = x + 1; y < dim; y++) {
+ double entropy = 0.;
+ int[] as = angles[x][y];
+ for (int l = 0; l < PRECISION; l++) {
+ if (as[l] > 0) {
+ final double p = as[l] / (double) size;
+ entropy += p * Math.log(p);
+ }
+ }
+ entropy /= LOG_PRECISION;
+
+ matrix.set(x, y, 1 + entropy);
+ }
+ }
+ }
+
+ /**
+ * Parameterization class.
+ *
+ * @author Erich Schubert
+ *
+ * @apiviz.exclude
+ */
+ public static class Parameterizer extends AbstractParameterizer {
+ @Override
+ protected SlopeDimensionSimilarity makeInstance() {
+ return STATIC;
+ }
+ }
+}
diff --git a/src/de/lmu/ifi/dbs/elki/math/dimensionsimilarity/SlopeInversionDimensionSimilarity.java b/src/de/lmu/ifi/dbs/elki/math/dimensionsimilarity/SlopeInversionDimensionSimilarity.java
new file mode 100644
index 00000000..aad58448
--- /dev/null
+++ b/src/de/lmu/ifi/dbs/elki/math/dimensionsimilarity/SlopeInversionDimensionSimilarity.java
@@ -0,0 +1,158 @@
+package de.lmu.ifi.dbs.elki.math.dimensionsimilarity;
+
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2012
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+import de.lmu.ifi.dbs.elki.data.NumberVector;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDIter;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDs;
+import de.lmu.ifi.dbs.elki.database.relation.Relation;
+import de.lmu.ifi.dbs.elki.utilities.DatabaseUtil;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
+import de.lmu.ifi.dbs.elki.utilities.pairs.Pair;
+
+/**
+ * Arrange dimensions based on the entropy of the slope spectrum. In contrast to
+ * {@link SlopeDimensionSimilarity}, we also take the option of inverting an
+ * axis into account.
+ *
+ * TODO: shouldn't this be normalized by the single-dimension entropies or so?
+ *
+ * @author Erich Schubert
+ * @author Robert Rödler
+ */
+public class SlopeInversionDimensionSimilarity extends SlopeDimensionSimilarity {
+ /**
+ * Static instance.
+ */
+ public static final SlopeInversionDimensionSimilarity STATIC = new SlopeInversionDimensionSimilarity();
+
+ /**
+ * Constructor. Use static instance instead!
+ */
+ protected SlopeInversionDimensionSimilarity() {
+ super();
+ }
+
+ @Override
+ public void computeDimensionSimilarites(Relation<? extends NumberVector<?>> relation, DBIDs subset, DimensionSimilarityMatrix matrix) {
+ final int dim = matrix.size();
+ final int size = subset.size();
+
+ // Collect angular histograms.
+ // Note, we only fill half of the matrix
+ int[][][] angles = new int[dim][dim][PRECISION];
+ int[][][] angleI = new int[dim][dim][PRECISION];
+
+ // FIXME: Get/keep these statistics in the relation, or compute for the
+ // sample only.
+ double[] off = new double[dim], scale = new double[dim];
+ {
+ Pair<? extends NumberVector<?>, ? extends NumberVector<?>> mm = DatabaseUtil.computeMinMax(relation);
+ NumberVector<?> min = mm.first;
+ NumberVector<?> max = mm.second;
+ for (int d = 0; d < dim; d++) {
+ off[d] = min.doubleValue(matrix.dim(d));
+ final double m = max.doubleValue(matrix.dim(d));
+ scale[d] = (m > off[d]) ? 1. / (m - off[d]) : 1;
+ }
+ }
+
+ // Scratch buffer
+ double[] vec = new double[dim];
+ for (DBIDIter id = subset.iter(); id.valid(); id.advance()) {
+ final NumberVector<?> obj = relation.get(id);
+ // Map values to 0..1
+ for (int d = 0; d < dim; d++) {
+ vec[d] = (obj.doubleValue(matrix.dim(d)) - off[d]) * scale[d];
+ }
+ for (int i = 0; i < dim - 1; i++) {
+ for (int j = i + 1; j < dim; j++) {
+ {
+ // This will be on a scale of 0 .. 2:
+ final double delta = vec[j] - vec[i] + 1;
+ int div = (int) Math.round(delta * RESCALE);
+ // TODO: do we really need this check?
+ div = (div < 0) ? 0 : (div >= PRECISION) ? PRECISION - 1 : div;
+ angles[i][j][div] += 1;
+ }
+ {
+ // This will be on a scale of 0 .. 2:
+ final double delta = vec[j] + vec[i];
+ int div = (int) Math.round(delta * RESCALE);
+ // TODO: do we really need this check?
+ div = (div < 0) ? 0 : (div >= PRECISION) ? PRECISION - 1 : div;
+ angleI[i][j][div] += 1;
+ }
+ }
+ }
+ }
+
+ // Compute entropy in each combination:
+ for (int x = 0; x < dim; x++) {
+ for (int y = x + 1; y < dim; y++) {
+ double entropy = 0., entropyI = 0;
+ {
+ int[] as = angles[x][y];
+ for (int l = 0; l < PRECISION; l++) {
+ if (as[l] > 0) {
+ final double p = as[l] / (double) size;
+ entropy += p * Math.log(p);
+ }
+ }
+ }
+ {
+ int[] as = angleI[x][y];
+ for (int l = 0; l < PRECISION; l++) {
+ if (as[l] > 0) {
+ final double p = as[l] / (double) size;
+ entropyI += p * Math.log(p);
+ }
+ }
+ }
+ if (entropy >= entropyI) {
+ entropy = 1 + entropy / LOG_PRECISION;
+ matrix.set(x, y, entropy);
+ } else {
+ entropyI = 1 + entropyI / LOG_PRECISION;
+ // Negative sign to indicate the axes might be inversely related
+ matrix.set(x, y, -entropyI);
+ }
+ }
+ }
+ }
+
+ /**
+ * Parameterization class.
+ *
+ * @author Erich Schubert
+ *
+ * @apiviz.exclude
+ */
+ public static class Parameterizer extends AbstractParameterizer {
+ @Override
+ protected SlopeInversionDimensionSimilarity makeInstance() {
+ return STATIC;
+ }
+ }
+}
diff --git a/src/de/lmu/ifi/dbs/elki/math/dimensionsimilarity/package-info.java b/src/de/lmu/ifi/dbs/elki/math/dimensionsimilarity/package-info.java
new file mode 100644
index 00000000..b6c27c57
--- /dev/null
+++ b/src/de/lmu/ifi/dbs/elki/math/dimensionsimilarity/package-info.java
@@ -0,0 +1,26 @@
+/**
+ * <p>Functions to compute the similarity of dimensions (or the interestingness of the combination).</p>
+ */
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2012
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+package de.lmu.ifi.dbs.elki.math.dimensionsimilarity; \ No newline at end of file