Import Upstream version 0.4.0~beta1

author: Andrej Shadura <andrewsh@debian.org> 2019-03-09 22:30:28 +0000
committer: Andrej Shadura <andrewsh@debian.org> 2019-03-09 22:30:28 +0000
commit: cde76aeb42240f7270bc6605c606ae07d2dc5a7d (patch)
tree: c3ebf1d7745224f524da31dbabc5d76b9ea75916 /src/de/lmu/ifi/dbs/elki/algorithm/clustering/SNNClustering.java
1 files changed, 343 insertions, 0 deletions
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/SNNClustering.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/SNNClustering.java
new file mode 100644
index 00000000..3bdb224e
--- /dev/null
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/SNNClustering.java
@@ -0,0 +1,343 @@
+package de.lmu.ifi.dbs.elki.algorithm.clustering;
+/*
+This file is part of ELKI:
+Environment for Developing KDD-Applications Supported by Index-Structures
+
+Copyright (C) 2011
+Ludwig-Maximilians-Universität München
+Lehr- und Forschungseinheit für Datenbanksysteme
+ELKI Development Team
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU Affero General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU Affero General Public License for more details.
+
+You should have received a copy of the GNU Affero General Public License
+along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.LinkedList;
+import java.util.List;
+
+import de.lmu.ifi.dbs.elki.algorithm.AbstractAlgorithm;
+import de.lmu.ifi.dbs.elki.data.Cluster;
+import de.lmu.ifi.dbs.elki.data.Clustering;
+import de.lmu.ifi.dbs.elki.data.model.ClusterModel;
+import de.lmu.ifi.dbs.elki.data.model.Model;
+import de.lmu.ifi.dbs.elki.data.type.TypeInformation;
+import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
+import de.lmu.ifi.dbs.elki.database.Database;
+import de.lmu.ifi.dbs.elki.database.ids.DBID;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil;
+import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs;
+import de.lmu.ifi.dbs.elki.database.query.similarity.SimilarityQuery;
+import de.lmu.ifi.dbs.elki.database.relation.Relation;
+import de.lmu.ifi.dbs.elki.distance.distancevalue.Distance;
+import de.lmu.ifi.dbs.elki.distance.distancevalue.IntegerDistance;
+import de.lmu.ifi.dbs.elki.distance.similarityfunction.SharedNearestNeighborSimilarityFunction;
+import de.lmu.ifi.dbs.elki.logging.Logging;
+import de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress;
+import de.lmu.ifi.dbs.elki.logging.progress.IndefiniteProgress;
+import de.lmu.ifi.dbs.elki.utilities.ClassGenericsUtil;
+import de.lmu.ifi.dbs.elki.utilities.documentation.Description;
+import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
+import de.lmu.ifi.dbs.elki.utilities.documentation.Title;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DistanceParameter;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter;
+
+/**
+ * <p>
+ * Shared nearest neighbor clustering.
+ * </p>
+ * <p>
+ * Reference: L. Ertöz, M. Steinbach, V. Kumar: Finding Clusters of Different
+ * Sizes, Shapes, and Densities in Noisy, High Dimensional Data. <br>
+ * In: Proc. of SIAM Data Mining (SDM), 2003.
+ * </p>
+ * 
+ * @author Arthur Zimek
+ * 
+ * @apiviz.uses SharedNearestNeighborSimilarityFunction
+ * 
+ * @param <O> the type of DatabaseObject the algorithm is applied on
+ * @param <D> the type of Distance used for the preprocessing of the shared
+ *        nearest neighbors neighborhood lists
+ */
+@Title("SNN: Shared Nearest Neighbor Clustering")
+@Description("Algorithm to find shared-nearest-neighbors-density-connected sets in a database based on the " + "parameters 'minPts' and 'epsilon' (specifying a volume). " + "These two parameters determine a density threshold for clustering.")
+@Reference(authors = "L. Ertöz, M. Steinbach, V. Kumar", title = "Finding Clusters of Different Sizes, Shapes, and Densities in Noisy, High Dimensional Data", booktitle = "Proc. of SIAM Data Mining (SDM), 2003", url = "http://www.siam.org/meetings/sdm03/proceedings/sdm03_05.pdf")
+public class SNNClustering<O, D extends Distance<D>> extends AbstractAlgorithm<Clustering<Model>> implements ClusteringAlgorithm<Clustering<Model>> {
+  /**
+   * The logger for this class.
+   */
+  private static final Logging logger = Logging.getLogger(SNNClustering.class);
+
+  /**
+   * Parameter to specify the minimum SNN density, must be an integer greater
+   * than 0.
+   */
+  public static final OptionID EPSILON_ID = OptionID.getOrCreateOptionID("snn.epsilon", "The minimum SNN density.");
+
+  /**
+   * Holds the value of {@link #EPSILON_ID}.
+   */
+  private IntegerDistance epsilon;
+
+  /**
+   * Parameter to specify the threshold for minimum number of points in the
+   * epsilon-SNN-neighborhood of a point, must be an integer greater than 0.
+   */
+  public static final OptionID MINPTS_ID = OptionID.getOrCreateOptionID("snn.minpts", "Threshold for minimum number of points in " + "the epsilon-SNN-neighborhood of a point.");
+
+  /**
+   * Holds the value of {@link #MINPTS_ID}.
+   */
+  private int minpts;
+
+  /**
+   * Holds a list of clusters found.
+   */
+  protected List<ModifiableDBIDs> resultList;
+
+  /**
+   * Holds a set of noise.
+   */
+  protected ModifiableDBIDs noise;
+
+  /**
+   * Holds a set of processed ids.
+   */
+  protected ModifiableDBIDs processedIDs;
+
+  /**
+   * The similarity function for the shared nearest neighbor similarity.
+   */
+  private SharedNearestNeighborSimilarityFunction<O, D> similarityFunction;
+
+  /**
+   * Constructor.
+   * 
+   * @param similarityFunction Similarity function
+   * @param epsilon Epsilon
+   * @param minpts Minpts
+   */
+  public SNNClustering(SharedNearestNeighborSimilarityFunction<O, D> similarityFunction, IntegerDistance epsilon, int minpts) {
+    super();
+    this.similarityFunction = similarityFunction;
+    this.epsilon = epsilon;
+    this.minpts = minpts;
+  }
+
+  /**
+   * Perform SNN clustering
+   * 
+   * @param database Database
+   * @param relation Relation
+   * @return Result
+   */
+  public Clustering<Model> run(Database database, Relation<O> relation) {
+    SimilarityQuery<O, IntegerDistance> snnInstance = similarityFunction.instantiate(relation);
+
+    FiniteProgress objprog = logger.isVerbose() ? new FiniteProgress("SNNClustering", relation.size(), logger) : null;
+    IndefiniteProgress clusprog = logger.isVerbose() ? new IndefiniteProgress("Number of clusters", logger) : null;
+    resultList = new ArrayList<ModifiableDBIDs>();
+    noise = DBIDUtil.newHashSet();
+    processedIDs = DBIDUtil.newHashSet(relation.size());
+    if(relation.size() >= minpts) {
+      for(DBID id : snnInstance.getRelation().iterDBIDs()) {
+        if(!processedIDs.contains(id)) {
+          expandCluster(snnInstance, id, objprog, clusprog);
+          if(processedIDs.size() == relation.size() && noise.size() == 0) {
+            break;
+          }
+        }
+        if(objprog != null && clusprog != null) {
+          objprog.setProcessed(processedIDs.size(), logger);
+          clusprog.setProcessed(resultList.size(), logger);
+        }
+      }
+    }
+    else {
+      for(DBID id : snnInstance.getRelation().iterDBIDs()) {
+        noise.add(id);
+        if(objprog != null && clusprog != null) {
+          objprog.setProcessed(noise.size(), logger);
+          clusprog.setProcessed(resultList.size(), logger);
+        }
+      }
+    }
+    // Finish progress logging
+    if(objprog != null && clusprog != null) {
+      objprog.ensureCompleted(logger);
+      clusprog.setCompleted(logger);
+    }
+
+    Clustering<Model> result = new Clustering<Model>("Shared-Nearest-Neighbor Clustering", "snn-clustering");
+    for(Iterator<ModifiableDBIDs> resultListIter = resultList.iterator(); resultListIter.hasNext();) {
+      result.addCluster(new Cluster<Model>(resultListIter.next(), ClusterModel.CLUSTER));
+    }
+    result.addCluster(new Cluster<Model>(noise, true, ClusterModel.CLUSTER));
+
+    return result;
+  }
+
+  /**
+   * Returns the shared nearest neighbors of the specified query object in the
+   * given database.
+   * 
+   * @param snnInstance shared nearest neighbors
+   * @param queryObject the query object
+   * @return the shared nearest neighbors of the specified query object in the
+   *         given database
+   */
+  protected List<DBID> findSNNNeighbors(SimilarityQuery<O, IntegerDistance> snnInstance, DBID queryObject) {
+    List<DBID> neighbors = new LinkedList<DBID>();
+    for(DBID id : snnInstance.getRelation().iterDBIDs()) {
+      if(snnInstance.similarity(queryObject, id).compareTo(epsilon) >= 0) {
+        neighbors.add(id);
+      }
+    }
+    return neighbors;
+  }
+
+  /**
+   * DBSCAN-function expandCluster adapted to SNN criterion.
+   * <p/>
+   * <p/>
+   * Border-Objects become members of the first possible cluster.
+   * 
+   * @param snnInstance shared nearest neighbors
+   * @param startObjectID potential seed of a new potential cluster
+   * @param objprog the progress object to report about the progress of
+   *        clustering
+   */
+  protected void expandCluster(SimilarityQuery<O, IntegerDistance> snnInstance, DBID startObjectID, FiniteProgress objprog, IndefiniteProgress clusprog) {
+    List<DBID> seeds = findSNNNeighbors(snnInstance, startObjectID);
+
+    // startObject is no core-object
+    if(seeds.size() < minpts) {
+      noise.add(startObjectID);
+      processedIDs.add(startObjectID);
+      if(objprog != null && clusprog != null) {
+        objprog.setProcessed(processedIDs.size(), logger);
+        clusprog.setProcessed(resultList.size(), logger);
+      }
+      return;
+    }
+
+    // try to expand the cluster
+    ModifiableDBIDs currentCluster = DBIDUtil.newArray();
+    for(DBID seed : seeds) {
+      if(!processedIDs.contains(seed)) {
+        currentCluster.add(seed);
+        processedIDs.add(seed);
+      }
+      else if(noise.contains(seed)) {
+        currentCluster.add(seed);
+        noise.remove(seed);
+      }
+    }
+    seeds.remove(0);
+
+    while(seeds.size() > 0) {
+      DBID o = seeds.remove(0);
+      List<DBID> neighborhood = findSNNNeighbors(snnInstance, o);
+
+      if(neighborhood.size() >= minpts) {
+        for(DBID p : neighborhood) {
+          boolean inNoise = noise.contains(p);
+          boolean unclassified = !processedIDs.contains(p);
+          if(inNoise || unclassified) {
+            if(unclassified) {
+              seeds.add(p);
+            }
+            currentCluster.add(p);
+            processedIDs.add(p);
+            if(inNoise) {
+              noise.remove(p);
+            }
+          }
+        }
+      }
+
+      if(objprog != null && clusprog != null) {
+        objprog.setProcessed(processedIDs.size(), logger);
+        int numClusters = currentCluster.size() > minpts ? resultList.size() + 1 : resultList.size();
+        clusprog.setProcessed(numClusters, logger);
+      }
+
+      if(processedIDs.size() == snnInstance.getRelation().size() && noise.size() == 0) {
+        break;
+      }
+    }
+    if(currentCluster.size() >= minpts) {
+      resultList.add(currentCluster);
+    }
+    else {
+      for(DBID id : currentCluster) {
+        noise.add(id);
+      }
+      noise.add(startObjectID);
+      processedIDs.add(startObjectID);
+    }
+  }
+
+  @Override
+  public TypeInformation[] getInputTypeRestriction() {
+    return TypeUtil.array(similarityFunction.getInputTypeRestriction());
+  }
+
+  @Override
+  protected Logging getLogger() {
+    return logger;
+  }
+
+  /**
+   * Parameterization class.
+   * 
+   * @author Erich Schubert
+   * 
+   * @apiviz.exclude
+   */
+  public static class Parameterizer<O, D extends Distance<D>> extends AbstractParameterizer {
+    protected IntegerDistance epsilon;
+
+    protected int minpts;
+
+    private SharedNearestNeighborSimilarityFunction<O, D> similarityFunction;
+
+    @Override
+    protected void makeOptions(Parameterization config) {
+      super.makeOptions(config);
+      Class<SharedNearestNeighborSimilarityFunction<O, D>> cls = ClassGenericsUtil.uglyCastIntoSubclass(SharedNearestNeighborSimilarityFunction.class);
+      similarityFunction = config.tryInstantiate(cls);
+
+      DistanceParameter<IntegerDistance> epsilonP = new DistanceParameter<IntegerDistance>(EPSILON_ID, IntegerDistance.FACTORY);
+      if(config.grab(epsilonP)) {
+        epsilon = epsilonP.getValue();
+      }
+
+      IntParameter minptsP = new IntParameter(MINPTS_ID, new GreaterConstraint(0));
+      if(config.grab(minptsP)) {
+        minpts = minptsP.getValue();
+      }
+    }
+
+    @Override
+    protected SNNClustering<O, D> makeInstance() {
+      return new SNNClustering<O, D>(similarityFunction, epsilon, minpts);
+    }
+  }
+}
+\ No newline at end of file
author	Andrej Shadura <andrewsh@debian.org>	2019-03-09 22:30:28 +0000
committer	Andrej Shadura <andrewsh@debian.org>	2019-03-09 22:30:28 +0000
commit	cde76aeb42240f7270bc6605c606ae07d2dc5a7d (patch)
tree	c3ebf1d7745224f524da31dbabc5d76b9ea75916 /src/de/lmu/ifi/dbs/elki/algorithm/clustering/SNNClustering.java