summaryrefslogtreecommitdiff
path: root/src/de/lmu/ifi/dbs/elki/math/dimensionsimilarity/HiCSDimensionSimilarity.java
diff options
context:
space:
mode:
Diffstat (limited to 'src/de/lmu/ifi/dbs/elki/math/dimensionsimilarity/HiCSDimensionSimilarity.java')
-rw-r--r--src/de/lmu/ifi/dbs/elki/math/dimensionsimilarity/HiCSDimensionSimilarity.java269
1 files changed, 269 insertions, 0 deletions
diff --git a/src/de/lmu/ifi/dbs/elki/math/dimensionsimilarity/HiCSDimensionSimilarity.java b/src/de/lmu/ifi/dbs/elki/math/dimensionsimilarity/HiCSDimensionSimilarity.java
new file mode 100644
index 00000000..468db679
--- /dev/null
+++ b/src/de/lmu/ifi/dbs/elki/math/dimensionsimilarity/HiCSDimensionSimilarity.java
@@ -0,0 +1,269 @@
+package de.lmu.ifi.dbs.elki.math.dimensionsimilarity;
+
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2012
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+import java.util.ArrayList;
+import java.util.Random;
+
+import de.lmu.ifi.dbs.elki.algorithm.outlier.meta.HiCS;
+import de.lmu.ifi.dbs.elki.data.NumberVector;
+import de.lmu.ifi.dbs.elki.data.VectorUtil;
+import de.lmu.ifi.dbs.elki.data.VectorUtil.SortDBIDsBySingleDimension;
+import de.lmu.ifi.dbs.elki.database.ids.ArrayDBIDs;
+import de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDArrayIter;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDIter;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDs;
+import de.lmu.ifi.dbs.elki.database.ids.HashSetModifiableDBIDs;
+import de.lmu.ifi.dbs.elki.database.relation.Relation;
+import de.lmu.ifi.dbs.elki.math.statistics.tests.GoodnessOfFitTest;
+import de.lmu.ifi.dbs.elki.math.statistics.tests.KolmogorovSmirnovTest;
+import de.lmu.ifi.dbs.elki.utilities.RandomFactory;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DoubleParameter;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.RandomParameter;
+
+/**
+ * Use the statistical tests as used by HiCS to arrange dimensions.
+ *
+ * <p>
+ * Based on:<br />
+ * Fabian Keller, Emmanuel Müller, and Klemens Böhm.<br />
+ * HiCS: High Contrast Subspaces for Density-Based Outlier Ranking. <br />
+ * In ICDE, pages 1037–1048, 2012.
+ * </p>
+ *
+ * @author Erich Schubert
+ * @author Robert Rödler
+ */
+public class HiCSDimensionSimilarity implements DimensionSimilarity<NumberVector<?>> {
+ /**
+ * Monte-Carlo iterations
+ */
+ private int m = 50;
+
+ /**
+ * Alpha threshold
+ */
+ private double alpha = 0.1;
+
+ /**
+ * Statistical test to use
+ */
+ private GoodnessOfFitTest statTest;
+
+ /**
+ * Random generator
+ */
+ private RandomFactory rnd;
+
+ /**
+ * Constructor.
+ *
+ * @param statTest Test function
+ * @param m Number of monte-carlo iterations
+ * @param alpha Alpha threshold
+ * @param rnd Random source
+ */
+ public HiCSDimensionSimilarity(GoodnessOfFitTest statTest, int m, double alpha, RandomFactory rnd) {
+ super();
+ this.statTest = statTest;
+ this.m = m;
+ this.alpha = alpha;
+ this.rnd = rnd;
+ }
+
+ @Override
+ public void computeDimensionSimilarites(Relation<? extends NumberVector<?>> relation, DBIDs subset, DimensionSimilarityMatrix matrix) {
+ final Random random = rnd.getRandom();
+ final int dim = matrix.size();
+
+ // FIXME: only compute indexes necessary.
+ ArrayList<ArrayDBIDs> subspaceIndex = buildOneDimIndexes(relation, subset, matrix);
+
+ // compute two-element sets of subspaces
+ for (int x = 0; x < dim; x++) {
+ final int i = matrix.dim(x);
+ for (int y = x + 1; y < dim; y++) {
+ final int j = matrix.dim(y);
+ matrix.set(x, y, calculateContrast(relation, subset, subspaceIndex.get(x), subspaceIndex.get(y), i, j, random));
+ }
+ }
+ }
+
+ /**
+ * Calculates "index structures" for every attribute, i.e. sorts a
+ * ModifiableArray of every DBID in the database for every dimension and
+ * stores them in a list
+ *
+ * @param relation Relation to index
+ * @param ids IDs to use
+ * @param matrix Matrix (for dimension subset)
+ * @return List of sorted objects
+ */
+ private ArrayList<ArrayDBIDs> buildOneDimIndexes(Relation<? extends NumberVector<?>> relation, DBIDs ids, DimensionSimilarityMatrix matrix) {
+ final int dim = matrix.size();
+ ArrayList<ArrayDBIDs> subspaceIndex = new ArrayList<ArrayDBIDs>(dim);
+
+ SortDBIDsBySingleDimension comp = new VectorUtil.SortDBIDsBySingleDimension(relation);
+ for (int i = 0; i < dim; i++) {
+ ArrayModifiableDBIDs amDBIDs = DBIDUtil.newArray(ids);
+ comp.setDimension(matrix.dim(i));
+ amDBIDs.sort(comp);
+ subspaceIndex.add(amDBIDs);
+ }
+
+ return subspaceIndex;
+ }
+
+ /**
+ * Calculates the actual contrast of a given subspace
+ *
+ * @param relation Data relation
+ * @param subset Subset to process
+ * @param subspaceIndex1 Index of first subspace
+ * @param subspaceIndex2 Index of second subspace
+ * @param dim1 First dimension
+ * @param dim2 Second dimension
+ * @param random Random generator
+ * @return Contrast
+ */
+ private double calculateContrast(Relation<? extends NumberVector<?>> relation, DBIDs subset, ArrayDBIDs subspaceIndex1, ArrayDBIDs subspaceIndex2, int dim1, int dim2, Random random) {
+ final double alpha1 = Math.pow(alpha, .5);
+ final int windowsize = (int) (relation.size() * alpha1);
+
+ // TODO: speed up by keeping marginal distributions prepared.
+ // Instead of doing the random switch, do half-half.
+ double deviationSum = 0.0;
+ for (int i = 0; i < m; i++) {
+ // Randomly switch dimensions
+ final int cdim1;
+ ArrayDBIDs cindex1, cindex2;
+ if (random.nextDouble() > .5) {
+ cdim1 = dim1;
+ cindex1 = subspaceIndex1;
+ cindex2 = subspaceIndex2;
+ } else {
+ cdim1 = dim2;
+ cindex1 = subspaceIndex2;
+ cindex2 = subspaceIndex1;
+ }
+ // Build the sample
+ DBIDArrayIter iter = cindex2.iter();
+ HashSetModifiableDBIDs conditionalSample = DBIDUtil.newHashSet();
+ iter.seek(random.nextInt(subset.size() - windowsize));
+ for (int k = 0; k < windowsize && iter.valid(); k++, iter.advance()) {
+ conditionalSample.add(iter);
+ }
+ // Project the data
+ double[] fullValues = new double[subset.size()];
+ double[] sampleValues = new double[conditionalSample.size()];
+ {
+ int l = 0, s = 0;
+ // Note: we use the sorted index sets.
+ for (DBIDIter id = cindex1.iter(); id.valid(); id.advance(), l++) {
+ final double val = relation.get(id).doubleValue(cdim1);
+ fullValues[l] = val;
+ if (conditionalSample.contains(id)) {
+ sampleValues[s] = val;
+ s++;
+ }
+ }
+ assert (s == conditionalSample.size());
+ }
+ double contrast = statTest.deviation(fullValues, sampleValues);
+ if (Double.isNaN(contrast)) {
+ i--;
+ continue;
+ }
+ deviationSum += contrast;
+ }
+ return deviationSum / m;
+ }
+
+ /**
+ * Parameterization class.
+ *
+ * @author Erich Schubert
+ *
+ * @apiviz.exclude
+ */
+ public static class Parameterizer extends AbstractParameterizer {
+ /**
+ * Statistical test to use
+ */
+ private GoodnessOfFitTest statTest;
+
+ /**
+ * Holds the value of {@link de.lmu.ifi.dbs.elki.algorithm.outlier.meta.HiCS.Parameterizer#M_ID}.
+ */
+ private int m = 50;
+
+ /**
+ * Holds the value of {@link de.lmu.ifi.dbs.elki.algorithm.outlier.meta.HiCS.Parameterizer#ALPHA_ID}.
+ */
+ private double alpha = 0.1;
+
+ /**
+ * Random generator.
+ */
+ private RandomFactory rnd;
+
+ @Override
+ protected void makeOptions(Parameterization config) {
+ super.makeOptions(config);
+ final IntParameter mP = new IntParameter(HiCS.Parameterizer.M_ID, 50);
+ mP.addConstraint(new GreaterConstraint(1));
+ if (config.grab(mP)) {
+ m = mP.intValue();
+ }
+
+ final DoubleParameter alphaP = new DoubleParameter(HiCS.Parameterizer.ALPHA_ID, 0.1);
+ alphaP.addConstraint(new GreaterConstraint(0));
+ if (config.grab(alphaP)) {
+ alpha = alphaP.doubleValue();
+ }
+
+ final ObjectParameter<GoodnessOfFitTest> testP = new ObjectParameter<GoodnessOfFitTest>(HiCS.Parameterizer.TEST_ID, GoodnessOfFitTest.class, KolmogorovSmirnovTest.class);
+ if (config.grab(testP)) {
+ statTest = testP.instantiateClass(config);
+ }
+
+ final RandomParameter rndP = new RandomParameter(HiCS.Parameterizer.SEED_ID);
+ if (config.grab(rndP)) {
+ rnd = rndP.getValue();
+ }
+ }
+
+ @Override
+ protected HiCSDimensionSimilarity makeInstance() {
+ return new HiCSDimensionSimilarity(statTest, m, alpha, rnd);
+ }
+ }
+}