package de.lmu.ifi.dbs.elki.algorithm.clustering; /* This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures Copyright (C) 2013 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team This program is free software: you can redistribute it and/or modify it under the terms of the GNU Affero General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more details. You should have received a copy of the GNU Affero General Public License along with this program. If not, see . */ import java.util.ArrayList; import java.util.Iterator; import java.util.List; import de.lmu.ifi.dbs.elki.algorithm.AbstractAlgorithm; import de.lmu.ifi.dbs.elki.data.Cluster; import de.lmu.ifi.dbs.elki.data.Clustering; import de.lmu.ifi.dbs.elki.data.model.ClusterModel; import de.lmu.ifi.dbs.elki.data.model.Model; import de.lmu.ifi.dbs.elki.data.type.TypeInformation; import de.lmu.ifi.dbs.elki.data.type.TypeUtil; import de.lmu.ifi.dbs.elki.database.Database; import de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs; import de.lmu.ifi.dbs.elki.database.ids.DBID; import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs; import de.lmu.ifi.dbs.elki.database.query.similarity.SimilarityQuery; import de.lmu.ifi.dbs.elki.database.relation.Relation; import de.lmu.ifi.dbs.elki.distance.distancevalue.IntegerDistance; import de.lmu.ifi.dbs.elki.distance.similarityfunction.SharedNearestNeighborSimilarityFunction; import de.lmu.ifi.dbs.elki.logging.Logging; import de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress; import de.lmu.ifi.dbs.elki.logging.progress.IndefiniteProgress; import de.lmu.ifi.dbs.elki.utilities.ClassGenericsUtil; import de.lmu.ifi.dbs.elki.utilities.documentation.Description; import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; import de.lmu.ifi.dbs.elki.utilities.documentation.Title; import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer; import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DistanceParameter; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter; /** *

* Shared nearest neighbor clustering. *

*

* Reference: L. Ertöz, M. Steinbach, V. Kumar: Finding Clusters of Different * Sizes, Shapes, and Densities in Noisy, High Dimensional Data.
* In: Proc. of SIAM Data Mining (SDM), 2003. *

* * @author Arthur Zimek * * @apiviz.uses SharedNearestNeighborSimilarityFunction * * @param the type of Object the algorithm is applied on */ @Title("SNN: Shared Nearest Neighbor Clustering") @Description("Algorithm to find shared-nearest-neighbors-density-connected sets in a database based on the " + "parameters 'minPts' and 'epsilon' (specifying a volume). " + "These two parameters determine a density threshold for clustering.") @Reference(authors = "L. Ertöz, M. Steinbach, V. Kumar", title = "Finding Clusters of Different Sizes, Shapes, and Densities in Noisy, High Dimensional Data", booktitle = "Proc. of SIAM Data Mining (SDM), 2003", url = "http://www.siam.org/meetings/sdm03/proceedings/sdm03_05.pdf") public class SNNClustering extends AbstractAlgorithm> implements ClusteringAlgorithm> { /** * The logger for this class. */ private static final Logging LOG = Logging.getLogger(SNNClustering.class); /** * Parameter to specify the minimum SNN density, must be an integer greater * than 0. */ public static final OptionID EPSILON_ID = new OptionID("snn.epsilon", "The minimum SNN density."); /** * Holds the value of {@link #EPSILON_ID}. */ private IntegerDistance epsilon; /** * Parameter to specify the threshold for minimum number of points in the * epsilon-SNN-neighborhood of a point, must be an integer greater than 0. */ public static final OptionID MINPTS_ID = new OptionID("snn.minpts", "Threshold for minimum number of points in " + "the epsilon-SNN-neighborhood of a point."); /** * Holds the value of {@link #MINPTS_ID}. */ private int minpts; /** * Holds a list of clusters found. */ protected List resultList; /** * Holds a set of noise. */ protected ModifiableDBIDs noise; /** * Holds a set of processed ids. */ protected ModifiableDBIDs processedIDs; /** * The similarity function for the shared nearest neighbor similarity. */ private SharedNearestNeighborSimilarityFunction similarityFunction; /** * Constructor. * * @param similarityFunction Similarity function * @param epsilon Epsilon * @param minpts Minpts */ public SNNClustering(SharedNearestNeighborSimilarityFunction similarityFunction, IntegerDistance epsilon, int minpts) { super(); this.similarityFunction = similarityFunction; this.epsilon = epsilon; this.minpts = minpts; } /** * Perform SNN clustering * * @param database Database * @param relation Relation * @return Result */ public Clustering run(Database database, Relation relation) { SimilarityQuery snnInstance = similarityFunction.instantiate(relation); FiniteProgress objprog = LOG.isVerbose() ? new FiniteProgress("SNNClustering", relation.size(), LOG) : null; IndefiniteProgress clusprog = LOG.isVerbose() ? new IndefiniteProgress("Number of clusters", LOG) : null; resultList = new ArrayList<>(); noise = DBIDUtil.newHashSet(); processedIDs = DBIDUtil.newHashSet(relation.size()); if(relation.size() >= minpts) { for(DBIDIter id = snnInstance.getRelation().iterDBIDs(); id.valid(); id.advance()) { if(!processedIDs.contains(id)) { expandCluster(snnInstance, DBIDUtil.deref(id), objprog, clusprog); if(processedIDs.size() == relation.size() && noise.size() == 0) { break; } } if(objprog != null && clusprog != null) { objprog.setProcessed(processedIDs.size(), LOG); clusprog.setProcessed(resultList.size(), LOG); } } } else { for(DBIDIter id = snnInstance.getRelation().iterDBIDs(); id.valid(); id.advance()) { noise.add(id); if(objprog != null && clusprog != null) { objprog.setProcessed(noise.size(), LOG); clusprog.setProcessed(resultList.size(), LOG); } } } // Finish progress logging if(objprog != null && clusprog != null) { objprog.ensureCompleted(LOG); clusprog.setCompleted(LOG); } Clustering result = new Clustering<>("Shared-Nearest-Neighbor Clustering", "snn-clustering"); for(Iterator resultListIter = resultList.iterator(); resultListIter.hasNext();) { result.addToplevelCluster(new Cluster(resultListIter.next(), ClusterModel.CLUSTER)); } result.addToplevelCluster(new Cluster(noise, true, ClusterModel.CLUSTER)); return result; } /** * Returns the shared nearest neighbors of the specified query object in the * given database. * * @param snnInstance shared nearest neighbors * @param queryObject the query object * @return the shared nearest neighbors of the specified query object in the * given database */ protected ArrayModifiableDBIDs findSNNNeighbors(SimilarityQuery snnInstance, DBID queryObject) { ArrayModifiableDBIDs neighbors = DBIDUtil.newArray(); for(DBIDIter iditer = snnInstance.getRelation().iterDBIDs(); iditer.valid(); iditer.advance()) { if(snnInstance.similarity(queryObject, iditer).compareTo(epsilon) >= 0) { neighbors.add(iditer); } } return neighbors; } /** * DBSCAN-function expandCluster adapted to SNN criterion. *

*

* Border-Objects become members of the first possible cluster. * * @param snnInstance shared nearest neighbors * @param startObjectID potential seed of a new potential cluster * @param objprog the progress object to report about the progress of * clustering */ protected void expandCluster(SimilarityQuery snnInstance, DBID startObjectID, FiniteProgress objprog, IndefiniteProgress clusprog) { ArrayModifiableDBIDs seeds = findSNNNeighbors(snnInstance, startObjectID); // startObject is no core-object if(seeds.size() < minpts) { noise.add(startObjectID); processedIDs.add(startObjectID); if(objprog != null && clusprog != null) { objprog.setProcessed(processedIDs.size(), LOG); clusprog.setProcessed(resultList.size(), LOG); } return; } // try to expand the cluster ModifiableDBIDs currentCluster = DBIDUtil.newArray(); for(DBIDIter seed = seeds.iter(); seed.valid(); seed.advance()) { if(!processedIDs.contains(seed)) { currentCluster.add(seed); processedIDs.add(seed); } else if(noise.contains(seed)) { currentCluster.add(seed); noise.remove(seed); } } while(seeds.size() > 0) { DBID o = seeds.remove(seeds.size() - 1); ArrayModifiableDBIDs neighborhood = findSNNNeighbors(snnInstance, o); if(neighborhood.size() >= minpts) { for(DBIDIter iter = neighborhood.iter(); iter.valid(); iter.advance()) { boolean inNoise = noise.contains(iter); boolean unclassified = !processedIDs.contains(iter); if(inNoise || unclassified) { if(unclassified) { seeds.add(iter); } currentCluster.add(iter); processedIDs.add(iter); if(inNoise) { noise.remove(iter); } } } } if(objprog != null && clusprog != null) { objprog.setProcessed(processedIDs.size(), LOG); int numClusters = currentCluster.size() > minpts ? resultList.size() + 1 : resultList.size(); clusprog.setProcessed(numClusters, LOG); } if(processedIDs.size() == snnInstance.getRelation().size() && noise.size() == 0) { break; } } if(currentCluster.size() >= minpts) { resultList.add(currentCluster); } else { noise.addDBIDs(currentCluster); noise.add(startObjectID); processedIDs.add(startObjectID); } } @Override public TypeInformation[] getInputTypeRestriction() { return TypeUtil.array(similarityFunction.getInputTypeRestriction()); } @Override protected Logging getLogger() { return LOG; } /** * Parameterization class. * * @author Erich Schubert * * @apiviz.exclude * * @param object type */ public static class Parameterizer extends AbstractParameterizer { protected IntegerDistance epsilon; protected int minpts; private SharedNearestNeighborSimilarityFunction similarityFunction; @Override protected void makeOptions(Parameterization config) { super.makeOptions(config); Class> cls = ClassGenericsUtil.uglyCastIntoSubclass(SharedNearestNeighborSimilarityFunction.class); similarityFunction = config.tryInstantiate(cls); DistanceParameter epsilonP = new DistanceParameter<>(EPSILON_ID, IntegerDistance.FACTORY); if(config.grab(epsilonP)) { epsilon = epsilonP.getValue(); } IntParameter minptsP = new IntParameter(MINPTS_ID); minptsP.addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT); if(config.grab(minptsP)) { minpts = minptsP.intValue(); } } @Override protected SNNClustering makeInstance() { return new SNNClustering<>(similarityFunction, epsilon, minpts); } } }