diff options
author | Andrej Shadura <andrewsh@debian.org> | 2019-03-09 22:30:28 +0000 |
---|---|---|
committer | Andrej Shadura <andrewsh@debian.org> | 2019-03-09 22:30:28 +0000 |
commit | cde76aeb42240f7270bc6605c606ae07d2dc5a7d (patch) | |
tree | c3ebf1d7745224f524da31dbabc5d76b9ea75916 /src/de/lmu/ifi/dbs/elki/algorithm/clustering/SNNClustering.java |
Import Upstream version 0.4.0~beta1
Diffstat (limited to 'src/de/lmu/ifi/dbs/elki/algorithm/clustering/SNNClustering.java')
-rw-r--r-- | src/de/lmu/ifi/dbs/elki/algorithm/clustering/SNNClustering.java | 343 |
1 files changed, 343 insertions, 0 deletions
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/SNNClustering.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/SNNClustering.java new file mode 100644 index 00000000..3bdb224e --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/SNNClustering.java @@ -0,0 +1,343 @@ +package de.lmu.ifi.dbs.elki.algorithm.clustering; +/* +This file is part of ELKI: +Environment for Developing KDD-Applications Supported by Index-Structures + +Copyright (C) 2011 +Ludwig-Maximilians-Universität München +Lehr- und Forschungseinheit für Datenbanksysteme +ELKI Development Team + +This program is free software: you can redistribute it and/or modify +it under the terms of the GNU Affero General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU Affero General Public License for more details. + +You should have received a copy of the GNU Affero General Public License +along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +import java.util.ArrayList; +import java.util.Iterator; +import java.util.LinkedList; +import java.util.List; + +import de.lmu.ifi.dbs.elki.algorithm.AbstractAlgorithm; +import de.lmu.ifi.dbs.elki.data.Cluster; +import de.lmu.ifi.dbs.elki.data.Clustering; +import de.lmu.ifi.dbs.elki.data.model.ClusterModel; +import de.lmu.ifi.dbs.elki.data.model.Model; +import de.lmu.ifi.dbs.elki.data.type.TypeInformation; +import de.lmu.ifi.dbs.elki.data.type.TypeUtil; +import de.lmu.ifi.dbs.elki.database.Database; +import de.lmu.ifi.dbs.elki.database.ids.DBID; +import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; +import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs; +import de.lmu.ifi.dbs.elki.database.query.similarity.SimilarityQuery; +import de.lmu.ifi.dbs.elki.database.relation.Relation; +import de.lmu.ifi.dbs.elki.distance.distancevalue.Distance; +import de.lmu.ifi.dbs.elki.distance.distancevalue.IntegerDistance; +import de.lmu.ifi.dbs.elki.distance.similarityfunction.SharedNearestNeighborSimilarityFunction; +import de.lmu.ifi.dbs.elki.logging.Logging; +import de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress; +import de.lmu.ifi.dbs.elki.logging.progress.IndefiniteProgress; +import de.lmu.ifi.dbs.elki.utilities.ClassGenericsUtil; +import de.lmu.ifi.dbs.elki.utilities.documentation.Description; +import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; +import de.lmu.ifi.dbs.elki.utilities.documentation.Title; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DistanceParameter; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter; + +/** + * <p> + * Shared nearest neighbor clustering. + * </p> + * <p> + * Reference: L. Ertöz, M. Steinbach, V. Kumar: Finding Clusters of Different + * Sizes, Shapes, and Densities in Noisy, High Dimensional Data. <br> + * In: Proc. of SIAM Data Mining (SDM), 2003. + * </p> + * + * @author Arthur Zimek + * + * @apiviz.uses SharedNearestNeighborSimilarityFunction + * + * @param <O> the type of DatabaseObject the algorithm is applied on + * @param <D> the type of Distance used for the preprocessing of the shared + * nearest neighbors neighborhood lists + */ +@Title("SNN: Shared Nearest Neighbor Clustering") +@Description("Algorithm to find shared-nearest-neighbors-density-connected sets in a database based on the " + "parameters 'minPts' and 'epsilon' (specifying a volume). " + "These two parameters determine a density threshold for clustering.") +@Reference(authors = "L. Ertöz, M. Steinbach, V. Kumar", title = "Finding Clusters of Different Sizes, Shapes, and Densities in Noisy, High Dimensional Data", booktitle = "Proc. of SIAM Data Mining (SDM), 2003", url = "http://www.siam.org/meetings/sdm03/proceedings/sdm03_05.pdf") +public class SNNClustering<O, D extends Distance<D>> extends AbstractAlgorithm<Clustering<Model>> implements ClusteringAlgorithm<Clustering<Model>> { + /** + * The logger for this class. + */ + private static final Logging logger = Logging.getLogger(SNNClustering.class); + + /** + * Parameter to specify the minimum SNN density, must be an integer greater + * than 0. + */ + public static final OptionID EPSILON_ID = OptionID.getOrCreateOptionID("snn.epsilon", "The minimum SNN density."); + + /** + * Holds the value of {@link #EPSILON_ID}. + */ + private IntegerDistance epsilon; + + /** + * Parameter to specify the threshold for minimum number of points in the + * epsilon-SNN-neighborhood of a point, must be an integer greater than 0. + */ + public static final OptionID MINPTS_ID = OptionID.getOrCreateOptionID("snn.minpts", "Threshold for minimum number of points in " + "the epsilon-SNN-neighborhood of a point."); + + /** + * Holds the value of {@link #MINPTS_ID}. + */ + private int minpts; + + /** + * Holds a list of clusters found. + */ + protected List<ModifiableDBIDs> resultList; + + /** + * Holds a set of noise. + */ + protected ModifiableDBIDs noise; + + /** + * Holds a set of processed ids. + */ + protected ModifiableDBIDs processedIDs; + + /** + * The similarity function for the shared nearest neighbor similarity. + */ + private SharedNearestNeighborSimilarityFunction<O, D> similarityFunction; + + /** + * Constructor. + * + * @param similarityFunction Similarity function + * @param epsilon Epsilon + * @param minpts Minpts + */ + public SNNClustering(SharedNearestNeighborSimilarityFunction<O, D> similarityFunction, IntegerDistance epsilon, int minpts) { + super(); + this.similarityFunction = similarityFunction; + this.epsilon = epsilon; + this.minpts = minpts; + } + + /** + * Perform SNN clustering + * + * @param database Database + * @param relation Relation + * @return Result + */ + public Clustering<Model> run(Database database, Relation<O> relation) { + SimilarityQuery<O, IntegerDistance> snnInstance = similarityFunction.instantiate(relation); + + FiniteProgress objprog = logger.isVerbose() ? new FiniteProgress("SNNClustering", relation.size(), logger) : null; + IndefiniteProgress clusprog = logger.isVerbose() ? new IndefiniteProgress("Number of clusters", logger) : null; + resultList = new ArrayList<ModifiableDBIDs>(); + noise = DBIDUtil.newHashSet(); + processedIDs = DBIDUtil.newHashSet(relation.size()); + if(relation.size() >= minpts) { + for(DBID id : snnInstance.getRelation().iterDBIDs()) { + if(!processedIDs.contains(id)) { + expandCluster(snnInstance, id, objprog, clusprog); + if(processedIDs.size() == relation.size() && noise.size() == 0) { + break; + } + } + if(objprog != null && clusprog != null) { + objprog.setProcessed(processedIDs.size(), logger); + clusprog.setProcessed(resultList.size(), logger); + } + } + } + else { + for(DBID id : snnInstance.getRelation().iterDBIDs()) { + noise.add(id); + if(objprog != null && clusprog != null) { + objprog.setProcessed(noise.size(), logger); + clusprog.setProcessed(resultList.size(), logger); + } + } + } + // Finish progress logging + if(objprog != null && clusprog != null) { + objprog.ensureCompleted(logger); + clusprog.setCompleted(logger); + } + + Clustering<Model> result = new Clustering<Model>("Shared-Nearest-Neighbor Clustering", "snn-clustering"); + for(Iterator<ModifiableDBIDs> resultListIter = resultList.iterator(); resultListIter.hasNext();) { + result.addCluster(new Cluster<Model>(resultListIter.next(), ClusterModel.CLUSTER)); + } + result.addCluster(new Cluster<Model>(noise, true, ClusterModel.CLUSTER)); + + return result; + } + + /** + * Returns the shared nearest neighbors of the specified query object in the + * given database. + * + * @param snnInstance shared nearest neighbors + * @param queryObject the query object + * @return the shared nearest neighbors of the specified query object in the + * given database + */ + protected List<DBID> findSNNNeighbors(SimilarityQuery<O, IntegerDistance> snnInstance, DBID queryObject) { + List<DBID> neighbors = new LinkedList<DBID>(); + for(DBID id : snnInstance.getRelation().iterDBIDs()) { + if(snnInstance.similarity(queryObject, id).compareTo(epsilon) >= 0) { + neighbors.add(id); + } + } + return neighbors; + } + + /** + * DBSCAN-function expandCluster adapted to SNN criterion. + * <p/> + * <p/> + * Border-Objects become members of the first possible cluster. + * + * @param snnInstance shared nearest neighbors + * @param startObjectID potential seed of a new potential cluster + * @param objprog the progress object to report about the progress of + * clustering + */ + protected void expandCluster(SimilarityQuery<O, IntegerDistance> snnInstance, DBID startObjectID, FiniteProgress objprog, IndefiniteProgress clusprog) { + List<DBID> seeds = findSNNNeighbors(snnInstance, startObjectID); + + // startObject is no core-object + if(seeds.size() < minpts) { + noise.add(startObjectID); + processedIDs.add(startObjectID); + if(objprog != null && clusprog != null) { + objprog.setProcessed(processedIDs.size(), logger); + clusprog.setProcessed(resultList.size(), logger); + } + return; + } + + // try to expand the cluster + ModifiableDBIDs currentCluster = DBIDUtil.newArray(); + for(DBID seed : seeds) { + if(!processedIDs.contains(seed)) { + currentCluster.add(seed); + processedIDs.add(seed); + } + else if(noise.contains(seed)) { + currentCluster.add(seed); + noise.remove(seed); + } + } + seeds.remove(0); + + while(seeds.size() > 0) { + DBID o = seeds.remove(0); + List<DBID> neighborhood = findSNNNeighbors(snnInstance, o); + + if(neighborhood.size() >= minpts) { + for(DBID p : neighborhood) { + boolean inNoise = noise.contains(p); + boolean unclassified = !processedIDs.contains(p); + if(inNoise || unclassified) { + if(unclassified) { + seeds.add(p); + } + currentCluster.add(p); + processedIDs.add(p); + if(inNoise) { + noise.remove(p); + } + } + } + } + + if(objprog != null && clusprog != null) { + objprog.setProcessed(processedIDs.size(), logger); + int numClusters = currentCluster.size() > minpts ? resultList.size() + 1 : resultList.size(); + clusprog.setProcessed(numClusters, logger); + } + + if(processedIDs.size() == snnInstance.getRelation().size() && noise.size() == 0) { + break; + } + } + if(currentCluster.size() >= minpts) { + resultList.add(currentCluster); + } + else { + for(DBID id : currentCluster) { + noise.add(id); + } + noise.add(startObjectID); + processedIDs.add(startObjectID); + } + } + + @Override + public TypeInformation[] getInputTypeRestriction() { + return TypeUtil.array(similarityFunction.getInputTypeRestriction()); + } + + @Override + protected Logging getLogger() { + return logger; + } + + /** + * Parameterization class. + * + * @author Erich Schubert + * + * @apiviz.exclude + */ + public static class Parameterizer<O, D extends Distance<D>> extends AbstractParameterizer { + protected IntegerDistance epsilon; + + protected int minpts; + + private SharedNearestNeighborSimilarityFunction<O, D> similarityFunction; + + @Override + protected void makeOptions(Parameterization config) { + super.makeOptions(config); + Class<SharedNearestNeighborSimilarityFunction<O, D>> cls = ClassGenericsUtil.uglyCastIntoSubclass(SharedNearestNeighborSimilarityFunction.class); + similarityFunction = config.tryInstantiate(cls); + + DistanceParameter<IntegerDistance> epsilonP = new DistanceParameter<IntegerDistance>(EPSILON_ID, IntegerDistance.FACTORY); + if(config.grab(epsilonP)) { + epsilon = epsilonP.getValue(); + } + + IntParameter minptsP = new IntParameter(MINPTS_ID, new GreaterConstraint(0)); + if(config.grab(minptsP)) { + minpts = minptsP.getValue(); + } + } + + @Override + protected SNNClustering<O, D> makeInstance() { + return new SNNClustering<O, D>(similarityFunction, epsilon, minpts); + } + } +}
\ No newline at end of file |