package de.lmu.ifi.dbs.elki.algorithm.clustering.optics; /* This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures Copyright (C) 2015 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team This program is free software: you can redistribute it and/or modify it under the terms of the GNU Affero General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more details. You should have received a copy of the GNU Affero General Public License along with this program. If not, see . */ import gnu.trove.set.TIntSet; import java.util.ArrayList; import java.util.Collection; import java.util.List; import de.lmu.ifi.dbs.elki.algorithm.AbstractDistanceBasedAlgorithm; import de.lmu.ifi.dbs.elki.algorithm.KNNJoin; import de.lmu.ifi.dbs.elki.data.NumberVector; import de.lmu.ifi.dbs.elki.data.type.TypeInformation; import de.lmu.ifi.dbs.elki.data.type.TypeUtil; import de.lmu.ifi.dbs.elki.database.Database; import de.lmu.ifi.dbs.elki.database.datastore.DataStore; import de.lmu.ifi.dbs.elki.database.ids.DBID; import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; import de.lmu.ifi.dbs.elki.database.ids.DBIDs; import de.lmu.ifi.dbs.elki.database.ids.KNNList; import de.lmu.ifi.dbs.elki.database.relation.Relation; import de.lmu.ifi.dbs.elki.distance.distancefunction.DistanceFunction; import de.lmu.ifi.dbs.elki.distance.distancefunction.SpatialPrimitiveDistanceFunction; import de.lmu.ifi.dbs.elki.index.tree.IndexTreePath; import de.lmu.ifi.dbs.elki.index.tree.LeafEntry; import de.lmu.ifi.dbs.elki.index.tree.spatial.SpatialDirectoryEntry; import de.lmu.ifi.dbs.elki.index.tree.spatial.SpatialEntry; import de.lmu.ifi.dbs.elki.index.tree.spatial.rstarvariants.deliclu.DeLiCluDirectoryEntry; import de.lmu.ifi.dbs.elki.index.tree.spatial.rstarvariants.deliclu.DeLiCluEntry; import de.lmu.ifi.dbs.elki.index.tree.spatial.rstarvariants.deliclu.DeLiCluNode; import de.lmu.ifi.dbs.elki.index.tree.spatial.rstarvariants.deliclu.DeLiCluTree; import de.lmu.ifi.dbs.elki.index.tree.spatial.rstarvariants.deliclu.DeLiCluTreeIndex; import de.lmu.ifi.dbs.elki.logging.Logging; import de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress; import de.lmu.ifi.dbs.elki.math.MathUtil; import de.lmu.ifi.dbs.elki.result.ResultUtil; import de.lmu.ifi.dbs.elki.utilities.Alias; import de.lmu.ifi.dbs.elki.utilities.datastructures.heap.UpdatableHeap; import de.lmu.ifi.dbs.elki.utilities.documentation.Description; import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; import de.lmu.ifi.dbs.elki.utilities.documentation.Title; import de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException; import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter; /** * DeliClu: Density-Based Hierarchical Clustering, a hierarchical algorithm to * find density-connected sets in a database. *

* Reference:
* E. Achtert, C. Böhm, P. Kröger: DeLiClu: Boosting Robustness, Completeness, * Usability, and Efficiency of Hierarchical Clustering by a Closest Pair * Ranking.
* In Proc. 10th Pacific-Asia Conference on Knowledge Discovery and Data Mining * (PAKDD 2006), Singapore, 2006. *

* * @author Elke Achtert * @param the type of NumberVector handled by this Algorithm */ @Title("DeliClu: Density-Based Hierarchical Clustering") @Description("Hierachical algorithm to find density-connected sets in a database based on the parameter 'minpts'.") @Reference(authors = "E. Achtert, C. Böhm, P. Kröger", // title = "DeLiClu: Boosting Robustness, Completeness, Usability, and Efficiency of Hierarchical Clustering by a Closest Pair Ranking", // booktitle = "Proc. 10th Pacific-Asia Conference on Knowledge Discovery and Data Mining (PAKDD 2006), Singapore, 2006", // url = "http://dx.doi.org/10.1007/11731139_16") @Alias({ "de.lmu.ifi.dbs.elki.algorithm.clustering.DeLiClu" }) public class DeLiClu extends AbstractDistanceBasedAlgorithmimplements OPTICSTypeAlgorithm { /** * The logger for this class. */ private static final Logging LOG = Logging.getLogger(DeLiClu.class); /** * The priority queue for the algorithm. */ private UpdatableHeap heap; /** * Holds the knnJoin algorithm. */ private KNNJoin knnJoin; /** * Density threshold in number of objects. */ private int minpts; /** * Constructor. * * @param distanceFunction Distance function * @param minpts MinPts */ public DeLiClu(DistanceFunction distanceFunction, int minpts) { super(distanceFunction); this.knnJoin = new KNNJoin<>(distanceFunction, minpts); this.minpts = minpts; } public ClusterOrder run(Database database, Relation relation) { Collection> indexes = ResultUtil.filterResults(database.getHierarchy(), relation, DeLiCluTreeIndex.class); if(indexes.size() != 1) { throw new AbortException("DeLiClu found " + indexes.size() + " DeLiCluTree indexes. DeLiClu needs a special index to operate, therefore you need to add this index to your database."); } DeLiCluTreeIndex index = indexes.iterator().next(); // FIXME: check that the index matches the relation! if(!(getDistanceFunction() instanceof SpatialPrimitiveDistanceFunction)) { throw new IllegalArgumentException("Distance Function must be an instance of " + SpatialPrimitiveDistanceFunction.class.getName()); } @SuppressWarnings("unchecked") SpatialPrimitiveDistanceFunction distFunction = (SpatialPrimitiveDistanceFunction) getDistanceFunction(); // first do the knn-Join if(LOG.isVerbose()) { LOG.verbose("knnJoin..."); } DataStore knns = knnJoin.run(relation); DBIDs ids = relation.getDBIDs(); final int size = ids.size(); FiniteProgress progress = LOG.isVerbose() ? new FiniteProgress("DeLiClu", size, LOG) : null; ClusterOrder clusterOrder = new ClusterOrder(ids, "DeLiClu Clustering", "deliclu-clustering"); heap = new UpdatableHeap<>(); // add start object to cluster order and (root, root) to priority queue DBID startID = DBIDUtil.deref(ids.iter()); clusterOrder.add(startID, Double.POSITIVE_INFINITY, null); int numHandled = 1; index.setHandled(startID, relation.get(startID)); SpatialDirectoryEntry rootEntry = (SpatialDirectoryEntry) index.getRootEntry(); SpatialObjectPair spatialObjectPair = new SpatialObjectPair(0., rootEntry, rootEntry, true); heap.add(spatialObjectPair); while(numHandled < size) { if(heap.isEmpty()) { throw new AbortException("DeLiClu heap was empty when it shouldn't have been."); } SpatialObjectPair dataPair = heap.poll(); // pair of nodes if(dataPair.isExpandable) { expandNodes(index, distFunction, dataPair, knns); } // pair of objects else { // set handled LeafEntry e1 = (LeafEntry) dataPair.entry1; LeafEntry e2 = (LeafEntry) dataPair.entry2; final DBID e1id = e1.getDBID(); IndexTreePath path = index.setHandled(e1id, relation.get(e1id)); if(path == null) { throw new RuntimeException("snh: parent(" + e1id + ") = null!!!"); } // add to cluster order clusterOrder.add(e1id, dataPair.distance, e2.getDBID()); numHandled++; // reinsert expanded leafs reinsertExpanded(distFunction, index, path, knns); if(progress != null) { progress.setProcessed(numHandled, LOG); } } } LOG.ensureCompleted(progress); return clusterOrder; } /** * Expands the spatial nodes of the specified pair. * * @param index the index storing the objects * @param distFunction the spatial distance function of this algorithm * @param nodePair the pair of nodes to be expanded * @param knns the knn list */ private void expandNodes(DeLiCluTree index, SpatialPrimitiveDistanceFunction distFunction, SpatialObjectPair nodePair, DataStore knns) { DeLiCluNode node1 = index.getNode(((SpatialDirectoryEntry) nodePair.entry1).getPageID()); DeLiCluNode node2 = index.getNode(((SpatialDirectoryEntry) nodePair.entry2).getPageID()); if(node1.isLeaf()) { expandLeafNodes(distFunction, node1, node2, knns); } else { expandDirNodes(distFunction, node1, node2); } index.setExpanded(nodePair.entry2, nodePair.entry1); } /** * Expands the specified directory nodes. * * @param distFunction the spatial distance function of this algorithm * @param node1 the first node * @param node2 the second node */ private void expandDirNodes(SpatialPrimitiveDistanceFunction distFunction, DeLiCluNode node1, DeLiCluNode node2) { if(LOG.isDebuggingFinest()) { LOG.debugFinest("ExpandDirNodes: " + node1.getPageID() + " + " + node2.getPageID()); } int numEntries_1 = node1.getNumEntries(); int numEntries_2 = node2.getNumEntries(); // insert all combinations of unhandled - handled children of // node1-node2 into pq for(int i = 0; i < numEntries_1; i++) { DeLiCluEntry entry1 = node1.getEntry(i); if(!entry1.hasUnhandled()) { continue; } for(int j = 0; j < numEntries_2; j++) { DeLiCluEntry entry2 = node2.getEntry(j); if(!entry2.hasHandled()) { continue; } double distance = distFunction.minDist(entry1, entry2); SpatialObjectPair nodePair = new SpatialObjectPair(distance, entry1, entry2, true); heap.add(nodePair); } } } /** * Expands the specified leaf nodes. * * @param distFunction the spatial distance function of this algorithm * @param node1 the first node * @param node2 the second node * @param knns the knn list */ private void expandLeafNodes(SpatialPrimitiveDistanceFunction distFunction, DeLiCluNode node1, DeLiCluNode node2, DataStore knns) { if(LOG.isDebuggingFinest()) { LOG.debugFinest("ExpandLeafNodes: " + node1.getPageID() + " + " + node2.getPageID()); } int numEntries_1 = node1.getNumEntries(); int numEntries_2 = node2.getNumEntries(); // insert all combinations of unhandled - handled children of // node1-node2 into pq for(int i = 0; i < numEntries_1; i++) { DeLiCluEntry entry1 = node1.getEntry(i); if(!entry1.hasUnhandled()) { continue; } for(int j = 0; j < numEntries_2; j++) { DeLiCluEntry entry2 = node2.getEntry(j); if(!entry2.hasHandled()) { continue; } double distance = distFunction.minDist(entry1, entry2); double reach = MathUtil.max(distance, knns.get(((LeafEntry) entry2).getDBID()).getKNNDistance()); SpatialObjectPair dataPair = new SpatialObjectPair(reach, entry1, entry2, false); heap.add(dataPair); } } } /** * Reinserts the objects of the already expanded nodes. * * @param distFunction the spatial distance function of this algorithm * @param index the index storing the objects * @param path the path of the object inserted last * @param knns the knn list */ private void reinsertExpanded(SpatialPrimitiveDistanceFunction distFunction, DeLiCluTree index, IndexTreePath path, DataStore knns) { int l = 0; // Count the number of components. for(IndexTreePath it = path; it != null; it = it.getParentPath()) { l++; } ArrayList> p = new ArrayList<>(l - 1); // All except the last (= root). IndexTreePath it = path; for(; it.getParentPath() != null; it = it.getParentPath()) { p.add(it); } assert(p.size() == l - 1); DeLiCluEntry rootEntry = it.getEntry(); reinsertExpanded(distFunction, index, p, l - 2, rootEntry, knns); } private void reinsertExpanded(SpatialPrimitiveDistanceFunction distFunction, DeLiCluTree index, List> path, int pos, DeLiCluEntry parentEntry, DataStore knns) { DeLiCluNode parentNode = index.getNode(parentEntry); SpatialEntry entry2 = path.get(pos).getEntry(); if(entry2.isLeafEntry()) { assert(pos == 0); for(int i = 0; i < parentNode.getNumEntries(); i++) { DeLiCluEntry entry1 = parentNode.getEntry(i); if(entry1.hasHandled()) { continue; } double distance = distFunction.minDist(entry1, entry2); double reach = MathUtil.max(distance, knns.get(((LeafEntry) entry2).getDBID()).getKNNDistance()); SpatialObjectPair dataPair = new SpatialObjectPair(reach, entry1, entry2, false); heap.add(dataPair); } return; } TIntSet expanded = index.getExpanded(entry2); for(int i = 0; i < parentNode.getNumEntries(); i++) { DeLiCluDirectoryEntry entry1 = (DeLiCluDirectoryEntry) parentNode.getEntry(i); // not yet expanded if(!expanded.contains(entry1.getPageID())) { double distance = distFunction.minDist(entry1, entry2); SpatialObjectPair nodePair = new SpatialObjectPair(distance, entry1, entry2, true); heap.add(nodePair); } // already expanded else { reinsertExpanded(distFunction, index, path, pos - 1, entry1, knns); } } } @Override public int getMinPts() { return minpts; } @Override public TypeInformation[] getInputTypeRestriction() { return TypeUtil.array(TypeUtil.NUMBER_VECTOR_FIELD); } @Override protected Logging getLogger() { return LOG; } /** * Encapsulates an entry in the cluster order. * * @apiviz.exclude */ public class SpatialObjectPair implements Comparable { /** * The first entry of this pair. */ SpatialEntry entry1; /** * The second entry of this pair. */ SpatialEntry entry2; /** * Indicates whether this pair is expandable or not. */ boolean isExpandable; /** * The current distance. */ double distance; /** * Creates a new entry with the specified parameters. * * @param entry1 the first entry of this pair * @param entry2 the second entry of this pair * @param isExpandable if true, this pair is expandable (a pair of nodes), * otherwise this pair is not expandable (a pair of objects) */ public SpatialObjectPair(double distance, SpatialEntry entry1, SpatialEntry entry2, boolean isExpandable) { this.distance = distance; this.entry1 = entry1; this.entry2 = entry2; this.isExpandable = isExpandable; } /** * Compares this object with the specified object for order. Returns a * negative integer, zero, or a positive integer as this object is less * than, equal to, or greater than the specified object. *

* * @param other the Object to be compared. * @return a negative integer, zero, or a positive integer as this object is * less than, equal to, or greater than the specified object. */ @Override public int compareTo(SpatialObjectPair other) { /* * if(this.entry1.getEntryID().compareTo(other.entry1.getEntryID()) > 0) { * return -1; } * if(this.entry1.getEntryID().compareTo(other.entry1.getEntryID()) < 0) { * return 1; } * if(this.entry2.getEntryID().compareTo(other.entry2.getEntryID()) > 0) { * return -1; } * if(this.entry2.getEntryID().compareTo(other.entry2.getEntryID()) < 0) { * return 1; } return 0; */ // FIXME: inverted? return Double.compare(this.distance, other.distance); } /** * Returns a string representation of the object. * * @return a string representation of the object. */ @Override public String toString() { if(!isExpandable) { return entry1 + " - " + entry2; } return "n_" + entry1 + " - n_" + entry2; } /** equals is used in updating the heap! */ @SuppressWarnings("unchecked") @Override public boolean equals(Object obj) { if(!(SpatialObjectPair.class.isInstance(obj))) { return false; } SpatialObjectPair other = (SpatialObjectPair) obj; if(!isExpandable) { return this.entry1.equals(other.entry1); } else { return this.entry1.equals(other.entry1) && this.entry2.equals(other.entry2); } } /** hashCode is used in updating the heap! */ @Override public int hashCode() { final long prime = 2654435761L; if(!isExpandable) { return entry1.hashCode(); } long result = 0; result = prime * result + ((entry1 == null) ? 0 : entry1.hashCode()); result = prime * result + ((entry2 == null) ? 0 : entry2.hashCode()); return (int) result; } } /** * Parameterization class. * * @author Erich Schubert * * @apiviz.exclude */ public static class Parameterizer extends AbstractDistanceBasedAlgorithm.Parameterizer { /** * Parameter to specify the threshold for minimum number of points within a * cluster, must be an integer greater than 0. */ public static final OptionID MINPTS_ID = new OptionID("deliclu.minpts", "Threshold for minimum number of points within a cluster."); protected int minpts = 0; @Override protected void makeOptions(Parameterization config) { super.makeOptions(config); IntParameter minptsP = new IntParameter(MINPTS_ID); minptsP.addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT); if(config.grab(minptsP)) { minpts = minptsP.getValue(); } } @Override protected DeLiClu makeInstance() { return new DeLiClu<>(distanceFunction, minpts); } } }