package de.lmu.ifi.dbs.elki.math.statistics.dependence; /* This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures Copyright (C) 2015 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team This program is free software: you can redistribute it and/or modify it under the terms of the GNU Affero General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more details. You should have received a copy of the GNU Affero General Public License along with this program. If not, see . */ import de.lmu.ifi.dbs.elki.utilities.datastructures.arraylike.NumberArrayAdapter; import de.lmu.ifi.dbs.elki.utilities.datastructures.heap.DoubleMinHeap; import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer; /** * Compute the similarity of dimensions using the SURFING score. The parameter k * for the k nearest neighbors is currently hard-coded to 10% of the set size. * * Note that the complexity is roughly O(n n k), so this is a rather slow * method, and with k at 10% of n, is actually cubic: O(0.1 * n^3). * * This version cannot use index support, as the API operates without database * attachment. However, it should be possible to implement some trivial * sorted-list indexes to get a reasonable speedup! * * Reference: *

* Elke Achtert, Hans-Peter Kriegel, Erich Schubert, Arthur Zimek:
* Interactive Data Mining with 3D-Parallel-Coordinate-Trees.
* Proceedings of the 2013 ACM International Conference on Management of Data * (SIGMOD), New York City, NY, 2013. *

* * Based on: *

* Christian Baumgartner, Claudia Plant, Karin Kailing, Hans-Peter Kriegel, and * Peer Kröger
* Subspace Selection for Clustering High-Dimensional Data
* In IEEE International Conference on Data Mining, 2004. *

* * TODO: make the subspace distance function and k parameterizable. * * @author Robert Rödler * @author Erich Schubert */ @Reference(authors = "Elke Achtert, Hans-Peter Kriegel, Erich Schubert, Arthur Zimek", // title = "Interactive Data Mining with 3D-Parallel-Coordinate-Trees", // booktitle = "Proc. of the 2013 ACM International Conference on Management of Data (SIGMOD)", // url = "http://dx.doi.org/10.1145/2463676.2463696") public class SURFINGDependenceMeasure extends AbstractDependenceMeasure { /** * Static instance. */ public static final SURFINGDependenceMeasure STATIC = new SURFINGDependenceMeasure(); /** * Constructor. Use static instance instead! */ protected SURFINGDependenceMeasure() { super(); } @Reference(authors = "Christian Baumgartner, Claudia Plant, Karin Kailing, Hans-Peter Kriegel, and Peer Kröger", // title = "Subspace Selection for Clustering High-Dimensional Data", // booktitle = "IEEE International Conference on Data Mining, 2004", // url = "http://dx.doi.org/10.1109/ICDM.2004.10112") @Override public double dependence(NumberArrayAdapter adapter1, A data1, NumberArrayAdapter adapter2, B data2) { final int len = size(adapter1, data1, adapter2, data2); final int k = Math.max(1, len / 10); double[] knns = new double[len]; DoubleMinHeap heap = new DoubleMinHeap(k); double kdistmean = 0.; for(int i = 0; i < len; ++i) { double ix = adapter1.getDouble(data1, i), iy = adapter2.getDouble(data2, i); heap.clear(); for(int j = 0; j < len; ++j) { double jx = adapter1.getDouble(data1, j), jy = adapter2.getDouble(data2, j); double dx = ix - jx, dy = iy - jy; heap.add(dx * dx + dy * dy); // Squared Euclidean. } double kdist = Math.sqrt(heap.peek()); // Euclidean knns[i] = kdist; kdistmean += kdist; } kdistmean /= len; // Deviation from mean: double diff = 0.; int below = 0; for(int l = 0; l < knns.length; l++) { diff += Math.abs(kdistmean - knns[l]); if(knns[l] < kdistmean) { below++; } } return (below > 0) ? diff / (2. * kdistmean * below) : 0; } /** * Parameterization class. * * @author Erich Schubert * * @apiviz.exclude */ public static class Parameterizer extends AbstractParameterizer { @Override protected SURFINGDependenceMeasure makeInstance() { return STATIC; } } }