package de.lmu.ifi.dbs.elki.math.statistics.dependence; /* This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures Copyright (C) 2015 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team This program is free software: you can redistribute it and/or modify it under the terms of the GNU Affero General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more details. You should have received a copy of the GNU Affero General Public License along with this program. If not, see . */ import java.util.Random; import de.lmu.ifi.dbs.elki.algorithm.outlier.meta.HiCS; import de.lmu.ifi.dbs.elki.math.MathUtil; import de.lmu.ifi.dbs.elki.math.random.RandomFactory; import de.lmu.ifi.dbs.elki.math.statistics.tests.GoodnessOfFitTest; import de.lmu.ifi.dbs.elki.math.statistics.tests.KolmogorovSmirnovTest; import de.lmu.ifi.dbs.elki.utilities.datastructures.arraylike.NumberArrayAdapter; import de.lmu.ifi.dbs.elki.utilities.datastructures.arrays.IntegerArrayQuickSort; import de.lmu.ifi.dbs.elki.utilities.datastructures.arrays.IntegerComparator; import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; import de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException; import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer; import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DoubleParameter; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.RandomParameter; /** * Use the statistical tests as used by HiCS to measure dependence of variables. * * Reference: *

* Elke Achtert, Hans-Peter Kriegel, Erich Schubert, Arthur Zimek:
* Interactive Data Mining with 3D-Parallel-Coordinate-Trees.
* Proceedings of the 2013 ACM International Conference on Management of Data * (SIGMOD), New York City, NY, 2013. *

* * Based on: *

* F. Keller, E. Müller, and K. Böhm.
* HiCS: High Contrast Subspaces for Density-Based Outlier Ranking.
* In ICDE, pages 1037–1048, 2012. *

* * @author Erich Schubert * @author Robert Rödler * @since 0.7.0 */ @Reference(authors = "Elke Achtert, Hans-Peter Kriegel, Erich Schubert, Arthur Zimek", // title = "Interactive Data Mining with 3D-Parallel-Coordinate-Trees", // booktitle = "Proc. of the 2013 ACM International Conference on Management of Data (SIGMOD)", // url = "http://dx.doi.org/10.1145/2463676.2463696") public class HiCSDependenceMeasure extends AbstractDependenceMeasure { /** * Monte-Carlo iterations */ private int m = 50; /** * Alpha threshold */ private double alphasqrt = Math.sqrt(0.1); /** * Statistical test to use */ private GoodnessOfFitTest statTest; /** * Random generator */ private RandomFactory rnd; /** * Constructor. * * @param statTest Test function * @param m Number of monte-carlo iterations * @param alpha Alpha threshold * @param rnd Random source */ public HiCSDependenceMeasure(GoodnessOfFitTest statTest, int m, double alpha, RandomFactory rnd) { super(); this.statTest = statTest; this.m = m; this.alphasqrt = Math.sqrt(alpha); this.rnd = rnd; } @Override public double dependence(final NumberArrayAdapter adapter1, final A data1, final NumberArrayAdapter adapter2, final B data2) { final int len = size(adapter1, data1, adapter2, data2); final int windowsize = (int) (len * alphasqrt); final Random random = rnd.getSingleThreadedRandom(); // Sorted copies for slicing. int[] s1 = MathUtil.sequence(0, len), s2 = MathUtil.sequence(0, len); IntegerArrayQuickSort.sort(s1, new IntegerComparator() { @Override public int compare(int x, int y) { return Double.compare(adapter1.getDouble(data1, x), adapter1.getDouble(data1, y)); } }); IntegerArrayQuickSort.sort(s2, new IntegerComparator() { @Override public int compare(int x, int y) { return Double.compare(adapter2.getDouble(data2, x), adapter2.getDouble(data2, y)); } }); // Distributions for testing double[] fullValues = new double[len]; double[] sampleValues = new double[windowsize]; double deviationSum = 0.; // For the first half, we use the first dimension as reference for(int i = 0; i < len; i++) { fullValues[i] = adapter1.getDouble(data1, i); if(fullValues[i] != fullValues[i]) { throw new AbortException("NaN values are not allowed by this implementation!"); } } int half = m >> 1; // TODO: remove bias? for(int i = 0; i < half; ++i) { // Build the sample for(int j = random.nextInt(len - windowsize), k = 0; k < windowsize; ++k, ++j) { sampleValues[k] = adapter2.getDouble(data2, j); } double contrast = statTest.deviation(fullValues, sampleValues); if(Double.isNaN(contrast)) { --i; // Retry. continue; } deviationSum += contrast; } // For the second half, we use the second dimension as reference for(int i = 0; i < len; i++) { fullValues[i] = adapter2.getDouble(data2, i); if(fullValues[i] != fullValues[i]) { throw new AbortException("NaN values are not allowed by this implementation!"); } } for(int i = half; i < m; ++i) { // Build the sample for(int j = random.nextInt(len - windowsize), k = 0; k < windowsize; ++k, ++j) { sampleValues[k] = adapter1.getDouble(data1, j); } double contrast = statTest.deviation(fullValues, sampleValues); if(Double.isNaN(contrast)) { --i; // Retry. continue; } deviationSum += contrast; } return deviationSum / m; } /** * Parameterization class. * * @author Erich Schubert * * @apiviz.exclude */ public static class Parameterizer extends AbstractParameterizer { /** * Statistical test to use */ private GoodnessOfFitTest statTest; /** * Holds the value of * {@link de.lmu.ifi.dbs.elki.algorithm.outlier.meta.HiCS.Parameterizer#M_ID} * . */ private int m = 50; /** * Holds the value of * {@link de.lmu.ifi.dbs.elki.algorithm.outlier.meta.HiCS.Parameterizer#ALPHA_ID} * . */ private double alpha = 0.1; /** * Random generator. */ private RandomFactory rnd; @Override protected void makeOptions(Parameterization config) { super.makeOptions(config); final IntParameter mP = new IntParameter(HiCS.Parameterizer.M_ID, 50); mP.addConstraint(CommonConstraints.GREATER_THAN_ONE_INT); if(config.grab(mP)) { m = mP.intValue(); } final DoubleParameter alphaP = new DoubleParameter(HiCS.Parameterizer.ALPHA_ID, 0.1); alphaP.addConstraint(CommonConstraints.GREATER_THAN_ZERO_DOUBLE); if(config.grab(alphaP)) { alpha = alphaP.doubleValue(); } final ObjectParameter testP = new ObjectParameter<>(HiCS.Parameterizer.TEST_ID, GoodnessOfFitTest.class, KolmogorovSmirnovTest.class); if(config.grab(testP)) { statTest = testP.instantiateClass(config); } final RandomParameter rndP = new RandomParameter(HiCS.Parameterizer.SEED_ID); if(config.grab(rndP)) { rnd = rndP.getValue(); } } @Override protected HiCSDependenceMeasure makeInstance() { return new HiCSDependenceMeasure(statTest, m, alpha, rnd); } } }