package de.lmu.ifi.dbs.elki.algorithm.clustering; /* This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures Copyright (C) 2013 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team This program is free software: you can redistribute it and/or modify it under the terms of the GNU Affero General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more details. You should have received a copy of the GNU Affero General Public License along with this program. If not, see . */ import de.lmu.ifi.dbs.elki.algorithm.AbstractDistanceBasedAlgorithm; import de.lmu.ifi.dbs.elki.data.type.TypeInformation; import de.lmu.ifi.dbs.elki.data.type.TypeUtil; import de.lmu.ifi.dbs.elki.database.Database; import de.lmu.ifi.dbs.elki.database.QueryUtil; import de.lmu.ifi.dbs.elki.database.ids.DBID; import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs; import de.lmu.ifi.dbs.elki.database.ids.distance.DistanceDBIDList; import de.lmu.ifi.dbs.elki.database.ids.distance.DistanceDBIDListIter; import de.lmu.ifi.dbs.elki.database.ids.distance.DistanceDBIDPair; import de.lmu.ifi.dbs.elki.database.ids.distance.DoubleDistanceDBIDListIter; import de.lmu.ifi.dbs.elki.database.ids.distance.DoubleDistanceDBIDPair; import de.lmu.ifi.dbs.elki.database.query.range.RangeQuery; import de.lmu.ifi.dbs.elki.database.relation.Relation; import de.lmu.ifi.dbs.elki.distance.DistanceUtil; import de.lmu.ifi.dbs.elki.distance.distancefunction.DistanceFunction; import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDoubleDistanceFunction; import de.lmu.ifi.dbs.elki.distance.distancevalue.Distance; import de.lmu.ifi.dbs.elki.distance.distancevalue.DoubleDistance; import de.lmu.ifi.dbs.elki.logging.Logging; import de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress; import de.lmu.ifi.dbs.elki.result.optics.ClusterOrderEntry; import de.lmu.ifi.dbs.elki.result.optics.ClusterOrderResult; import de.lmu.ifi.dbs.elki.result.optics.DoubleDistanceClusterOrderEntry; import de.lmu.ifi.dbs.elki.result.optics.GenericClusterOrderEntry; import de.lmu.ifi.dbs.elki.utilities.datastructures.heap.UpdatableHeap; import de.lmu.ifi.dbs.elki.utilities.documentation.Description; import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; import de.lmu.ifi.dbs.elki.utilities.documentation.Title; import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DistanceParameter; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter; /** * OPTICS provides the OPTICS algorithm. *

* Reference: M. Ankerst, M. Breunig, H.-P. Kriegel, and J. Sander: OPTICS: * Ordering Points to Identify the Clustering Structure.
* In: Proc. ACM SIGMOD Int. Conf. on Management of Data (SIGMOD '99). *

* * @author Elke Achtert * @param the type of DatabaseObjects handled by the algorithm * @param the type of Distance used to discern objects */ @Title("OPTICS: Density-Based Hierarchical Clustering") @Description("Algorithm to find density-connected sets in a database based on the parameters 'minPts' and 'epsilon' (specifying a volume). These two parameters determine a density threshold for clustering.") @Reference(authors = "M. Ankerst, M. Breunig, H.-P. Kriegel, and J. Sander", title = "OPTICS: Ordering Points to Identify the Clustering Structure", booktitle = "Proc. ACM SIGMOD Int. Conf. on Management of Data (SIGMOD '99)", url = "http://dx.doi.org/10.1145/304181.304187") public class OPTICS> extends AbstractDistanceBasedAlgorithm> implements OPTICSTypeAlgorithm { /** * The logger for this class. */ private static final Logging LOG = Logging.getLogger(OPTICS.class); /** * Parameter to specify the maximum radius of the neighborhood to be * considered, must be suitable to the distance function specified. */ public static final OptionID EPSILON_ID = new OptionID("optics.epsilon", "The maximum radius of the neighborhood to be considered."); /** * Parameter to specify the threshold for minimum number of points in the * epsilon-neighborhood of a point, must be an integer greater than 0. */ public static final OptionID MINPTS_ID = new OptionID("optics.minpts", "Threshold for minimum number of points in the epsilon-neighborhood of a point."); /** * Hold the value of {@link #EPSILON_ID}. */ private D epsilon; /** * Holds the value of {@link #MINPTS_ID}. */ private int minpts; /** * Holds a set of processed ids. */ private ModifiableDBIDs processedIDs; /** * Constructor. * * @param distanceFunction Distance function * @param epsilon Epsilon value * @param minpts Minpts value */ public OPTICS(DistanceFunction distanceFunction, D epsilon, int minpts) { super(distanceFunction); this.epsilon = epsilon; this.minpts = minpts; } /** * Run OPTICS on the database. * * @param database Database * @param relation Relation * @return Result */ public ClusterOrderResult run(Database database, Relation relation) { // Default value is infinite distance if(epsilon == null) { epsilon = getDistanceFunction().getDistanceFactory().infiniteDistance(); } RangeQuery rangeQuery = QueryUtil.getRangeQuery(relation, getDistanceFunction(), epsilon); int size = relation.size(); final FiniteProgress progress = LOG.isVerbose() ? new FiniteProgress("OPTICS", size, LOG) : null; processedIDs = DBIDUtil.newHashSet(size); ClusterOrderResult clusterOrder = new ClusterOrderResult<>("OPTICS Clusterorder", "optics-clusterorder"); if(getDistanceFunction() instanceof PrimitiveDoubleDistanceFunction && DoubleDistance.class.isInstance(epsilon)) { // Optimized codepath for double-based distances. Avoids Java // boxing/unboxing. for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { if(!processedIDs.contains(iditer)) { // We need to do some ugly casts to be able to run the optimized // version, unfortunately. @SuppressWarnings("unchecked") final ClusterOrderResult doubleClusterOrder = ClusterOrderResult.class.cast(clusterOrder); @SuppressWarnings("unchecked") final RangeQuery doubleRangeQuery = RangeQuery.class.cast(rangeQuery); final DoubleDistance depsilon = DoubleDistance.class.cast(epsilon); expandClusterOrderDouble(doubleClusterOrder, database, doubleRangeQuery, DBIDUtil.deref(iditer), depsilon, progress); } } } else { for(DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) { if(!processedIDs.contains(iditer)) { expandClusterOrder(clusterOrder, database, rangeQuery, DBIDUtil.deref(iditer), epsilon, progress); } } } if(progress != null) { progress.ensureCompleted(LOG); } return clusterOrder; } /** * OPTICS-function expandClusterOrder. * * @param clusterOrder Cluster order result to expand * @param database the database on which the algorithm is run * @param rangeQuery the range query to use * @param objectID the currently processed object * @param epsilon Epsilon range value * @param progress the progress object to actualize the current progress if * the algorithm */ protected void expandClusterOrder(ClusterOrderResult clusterOrder, Database database, RangeQuery rangeQuery, DBID objectID, D epsilon, FiniteProgress progress) { UpdatableHeap> heap = new UpdatableHeap<>(); heap.add(new GenericClusterOrderEntry<>(objectID, null, getDistanceFunction().getDistanceFactory().infiniteDistance())); while(!heap.isEmpty()) { final ClusterOrderEntry current = heap.poll(); clusterOrder.add(current); processedIDs.add(current.getID()); DistanceDBIDList neighbors = rangeQuery.getRangeForDBID(current.getID(), epsilon); if(neighbors.size() >= minpts) { final DistanceDBIDPair last = neighbors.get(minpts - 1); D coreDistance = last.getDistance(); for(DistanceDBIDListIter neighbor = neighbors.iter(); neighbor.valid(); neighbor.advance()) { if(processedIDs.contains(neighbor)) { continue; } D reachability = DistanceUtil.max(neighbor.getDistance(), coreDistance); heap.add(new GenericClusterOrderEntry<>(DBIDUtil.deref(neighbor), current.getID(), reachability)); } } if(progress != null) { progress.setProcessed(processedIDs.size(), LOG); } } } /** * OPTICS-function expandClusterOrder. * * @param clusterOrder Cluster order result to expand * @param database the database on which the algorithm is run * @param rangeQuery the range query to use * @param objectID the currently processed object * @param epsilon Query epsilon * @param progress the progress object to actualize the current progress if * the algorithm */ protected void expandClusterOrderDouble(ClusterOrderResult clusterOrder, Database database, RangeQuery rangeQuery, DBID objectID, DoubleDistance epsilon, FiniteProgress progress) { UpdatableHeap heap = new UpdatableHeap<>(); heap.add(new DoubleDistanceClusterOrderEntry(objectID, null, Double.POSITIVE_INFINITY)); while(!heap.isEmpty()) { final DoubleDistanceClusterOrderEntry current = heap.poll(); clusterOrder.add(current); processedIDs.add(current.getID()); DistanceDBIDList neighbors = rangeQuery.getRangeForDBID(current.getID(), epsilon); if(neighbors.size() >= minpts) { final DistanceDBIDPair last = neighbors.get(minpts - 1); if(last instanceof DoubleDistanceDBIDPair) { double coreDistance = ((DoubleDistanceDBIDPair) last).doubleDistance(); for(DistanceDBIDListIter neighbor = neighbors.iter(); neighbor.valid(); neighbor.advance()) { if(processedIDs.contains(neighbor)) { continue; } double reachability = Math.max(((DoubleDistanceDBIDListIter) neighbor).doubleDistance(), coreDistance); heap.add(new DoubleDistanceClusterOrderEntry(DBIDUtil.deref(neighbor), current.getID(), reachability)); } } else { // Actually we have little gains in this situation, // Only if we got an optimized result before. double coreDistance = last.getDistance().doubleValue(); for(DistanceDBIDListIter neighbor = neighbors.iter(); neighbor.valid(); neighbor.advance()) { if(processedIDs.contains(neighbor)) { continue; } double reachability = Math.max(neighbor.getDistance().doubleValue(), coreDistance); heap.add(new DoubleDistanceClusterOrderEntry(DBIDUtil.deref(neighbor), current.getID(), reachability)); } } } if(progress != null) { progress.setProcessed(processedIDs.size(), LOG); } } } @Override public int getMinPts() { return minpts; } @Override public D getDistanceFactory() { return getDistanceFunction().getDistanceFactory(); } @Override public TypeInformation[] getInputTypeRestriction() { return TypeUtil.array(getDistanceFunction().getInputTypeRestriction()); } @Override protected Logging getLogger() { return LOG; } /** * Parameterization class. * * @author Erich Schubert * * @apiviz.exclude */ public static class Parameterizer> extends AbstractDistanceBasedAlgorithm.Parameterizer { protected D epsilon = null; protected int minpts = 0; @Override protected void makeOptions(Parameterization config) { super.makeOptions(config); DistanceParameter epsilonP = new DistanceParameter<>(EPSILON_ID, distanceFunction, true); if(config.grab(epsilonP)) { epsilon = epsilonP.getValue(); } IntParameter minptsP = new IntParameter(MINPTS_ID); minptsP.addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT); if(config.grab(minptsP)) { minpts = minptsP.intValue(); } } @Override protected OPTICS makeInstance() { return new OPTICS<>(distanceFunction, epsilon, minpts); } } }