summaryrefslogtreecommitdiff
path: root/src/de/lmu/ifi/dbs/elki/algorithm/clustering/DeLiClu.java
diff options
context:
space:
mode:
Diffstat (limited to 'src/de/lmu/ifi/dbs/elki/algorithm/clustering/DeLiClu.java')
-rw-r--r--src/de/lmu/ifi/dbs/elki/algorithm/clustering/DeLiClu.java501
1 files changed, 501 insertions, 0 deletions
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/DeLiClu.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/DeLiClu.java
new file mode 100644
index 00000000..d034c489
--- /dev/null
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/DeLiClu.java
@@ -0,0 +1,501 @@
+package de.lmu.ifi.dbs.elki.algorithm.clustering;
+/*
+This file is part of ELKI:
+Environment for Developing KDD-Applications Supported by Index-Structures
+
+Copyright (C) 2011
+Ludwig-Maximilians-Universität München
+Lehr- und Forschungseinheit für Datenbanksysteme
+ELKI Development Team
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU Affero General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU Affero General Public License for more details.
+
+You should have received a copy of the GNU Affero General Public License
+along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+import java.util.Collection;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Set;
+
+import de.lmu.ifi.dbs.elki.algorithm.AbstractDistanceBasedAlgorithm;
+import de.lmu.ifi.dbs.elki.algorithm.KNNJoin;
+import de.lmu.ifi.dbs.elki.data.NumberVector;
+import de.lmu.ifi.dbs.elki.data.type.TypeInformation;
+import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
+import de.lmu.ifi.dbs.elki.database.Database;
+import de.lmu.ifi.dbs.elki.database.datastore.DataStore;
+import de.lmu.ifi.dbs.elki.database.ids.DBID;
+import de.lmu.ifi.dbs.elki.database.relation.Relation;
+import de.lmu.ifi.dbs.elki.distance.DistanceUtil;
+import de.lmu.ifi.dbs.elki.distance.distancefunction.DistanceFunction;
+import de.lmu.ifi.dbs.elki.distance.distancefunction.SpatialPrimitiveDistanceFunction;
+import de.lmu.ifi.dbs.elki.distance.distancevalue.Distance;
+import de.lmu.ifi.dbs.elki.index.tree.LeafEntry;
+import de.lmu.ifi.dbs.elki.index.tree.TreeIndexPathComponent;
+import de.lmu.ifi.dbs.elki.index.tree.spatial.SpatialDirectoryEntry;
+import de.lmu.ifi.dbs.elki.index.tree.spatial.SpatialEntry;
+import de.lmu.ifi.dbs.elki.index.tree.spatial.rstarvariants.deliclu.DeLiCluEntry;
+import de.lmu.ifi.dbs.elki.index.tree.spatial.rstarvariants.deliclu.DeLiCluNode;
+import de.lmu.ifi.dbs.elki.index.tree.spatial.rstarvariants.deliclu.DeLiCluTree;
+import de.lmu.ifi.dbs.elki.index.tree.spatial.rstarvariants.deliclu.DeLiCluTreeIndex;
+import de.lmu.ifi.dbs.elki.logging.Logging;
+import de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress;
+import de.lmu.ifi.dbs.elki.result.ResultUtil;
+import de.lmu.ifi.dbs.elki.result.optics.ClusterOrderResult;
+import de.lmu.ifi.dbs.elki.utilities.datastructures.heap.KNNList;
+import de.lmu.ifi.dbs.elki.utilities.datastructures.heap.UpdatableHeap;
+import de.lmu.ifi.dbs.elki.utilities.documentation.Description;
+import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
+import de.lmu.ifi.dbs.elki.utilities.documentation.Title;
+import de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.GreaterConstraint;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter;
+
+/**
+ * DeLiClu provides the DeLiClu algorithm, a hierarchical algorithm to find
+ * density-connected sets in a database.
+ * <p>
+ * Reference: <br>
+ * E. Achtert, C. Böhm, P. Kröger: DeLiClu: Boosting Robustness, Completeness,
+ * Usability, and Efficiency of Hierarchical Clustering by a Closest Pair
+ * Ranking. <br>
+ * In Proc. 10th Pacific-Asia Conference on Knowledge Discovery and Data Mining
+ * (PAKDD 2006), Singapore, 2006.
+ * </p>
+ *
+ * @author Elke Achtert
+ * @param <NV> the type of NumberVector handled by this Algorithm
+ * @param <D> the type of Distance used
+ */
+@Title("DeliClu: Density-Based Hierarchical Clustering")
+@Description("Hierachical algorithm to find density-connected sets in a database based on the parameter 'minpts'.")
+@Reference(authors = "E. Achtert, C. Böhm, P. Kröger", title = "DeLiClu: Boosting Robustness, Completeness, Usability, and Efficiency of Hierarchical Clustering by a Closest Pair Ranking", booktitle = "Proc. 10th Pacific-Asia Conference on Knowledge Discovery and Data Mining (PAKDD 2006), Singapore, 2006", url = "http://dx.doi.org/10.1007/11731139_16")
+public class DeLiClu<NV extends NumberVector<NV, ?>, D extends Distance<D>> extends AbstractDistanceBasedAlgorithm<NV, D, ClusterOrderResult<D>> implements OPTICSTypeAlgorithm<D> {
+ /**
+ * The logger for this class.
+ */
+ private static final Logging logger = Logging.getLogger(DeLiClu.class);
+
+ /**
+ * Parameter to specify the threshold for minimum number of points within a
+ * cluster, must be an integer greater than 0.
+ */
+ public static final OptionID MINPTS_ID = OptionID.getOrCreateOptionID("deliclu.minpts", "Threshold for minimum number of points within a cluster.");
+
+ /**
+ * The priority queue for the algorithm.
+ */
+ private UpdatableHeap<SpatialObjectPair> heap;
+
+ /**
+ * Holds the knnJoin algorithm.
+ */
+ private KNNJoin<NV, D, DeLiCluNode, DeLiCluEntry> knnJoin;
+
+ /**
+ * Holds the value of {@link #MINPTS_ID}.
+ */
+ private int minpts;
+
+ /**
+ * Constructor.
+ *
+ * @param distanceFunction Distance function
+ * @param minpts MinPts
+ */
+ public DeLiClu(DistanceFunction<? super NV, D> distanceFunction, int minpts) {
+ super(distanceFunction);
+ this.knnJoin = new KNNJoin<NV, D, DeLiCluNode, DeLiCluEntry>(distanceFunction, minpts);
+ this.minpts = minpts;
+ }
+
+ public ClusterOrderResult<D> run(Database database, Relation<NV> relation) {
+ Collection<DeLiCluTreeIndex<NV>> indexes = ResultUtil.filterResults(database, DeLiCluTreeIndex.class);
+ if(indexes.size() != 1) {
+ throw new AbortException("DeLiClu found " + indexes.size() + " DeLiCluTree indexes, expected exactly one.");
+ }
+ DeLiCluTreeIndex<NV> index = indexes.iterator().next();
+ // FIXME: check that the index matches the relation!
+
+ if(!(getDistanceFunction() instanceof SpatialPrimitiveDistanceFunction<?, ?>)) {
+ throw new IllegalArgumentException("Distance Function must be an instance of " + SpatialPrimitiveDistanceFunction.class.getName());
+ }
+ @SuppressWarnings("unchecked")
+ SpatialPrimitiveDistanceFunction<NV, D> distFunction = (SpatialPrimitiveDistanceFunction<NV, D>) getDistanceFunction();
+
+ // first do the knn-Join
+ if(logger.isVerbose()) {
+ logger.verbose("knnJoin...");
+ }
+ DataStore<KNNList<D>> knns = knnJoin.run(database, relation);
+
+ FiniteProgress progress = logger.isVerbose() ? new FiniteProgress("DeLiClu", relation.size(), logger) : null;
+ final int size = relation.size();
+
+ ClusterOrderResult<D> clusterOrder = new ClusterOrderResult<D>("DeLiClu Clustering", "deliclu-clustering");
+ heap = new UpdatableHeap<SpatialObjectPair>();
+
+ // add start object to cluster order and (root, root) to priority queue
+ DBID startID = getStartObject(relation);
+ clusterOrder.add(startID, null, distFunction.getDistanceFactory().infiniteDistance());
+ int numHandled = 1;
+ index.setHandled(startID, relation.get(startID));
+ SpatialDirectoryEntry rootEntry = (SpatialDirectoryEntry)index.getRootEntry();
+ SpatialObjectPair spatialObjectPair = new SpatialObjectPair(distFunction.getDistanceFactory().nullDistance(), rootEntry, rootEntry, true);
+ heap.add(spatialObjectPair);
+
+ while(numHandled < size) {
+ if(heap.isEmpty()) {
+ throw new AbortException("DeLiClu heap was empty when it shouldn't have been.");
+ }
+ SpatialObjectPair dataPair = heap.poll();
+
+ // pair of nodes
+ if(dataPair.isExpandable) {
+ expandNodes(index, distFunction, dataPair, knns);
+ }
+ // pair of objects
+ else {
+ // set handled
+ LeafEntry e1 = (LeafEntry) dataPair.entry1;
+ LeafEntry e2 = (LeafEntry) dataPair.entry2;
+ final DBID e1id = e1.getDBID();
+ List<TreeIndexPathComponent<DeLiCluEntry>> path = index.setHandled(e1id, relation.get(e1id));
+ if(path == null) {
+ throw new RuntimeException("snh: parent(" + e1id + ") = null!!!");
+ }
+ // add to cluster order
+ clusterOrder.add(e1id, e2.getDBID(), dataPair.distance);
+ numHandled++;
+ // reinsert expanded leafs
+ reinsertExpanded(distFunction, index, path, knns);
+
+ if(progress != null) {
+ progress.setProcessed(numHandled, logger);
+ }
+ }
+ }
+ if(progress != null) {
+ progress.ensureCompleted(logger);
+ }
+ return clusterOrder;
+ }
+
+ /**
+ * Returns the id of the start object for the run method.
+ *
+ * @param relation the database relation storing the objects
+ * @return the id of the start object for the run method
+ */
+ private DBID getStartObject(Relation<NV> relation) {
+ Iterator<DBID> it = relation.iterDBIDs();
+ if(!it.hasNext()) {
+ return null;
+ }
+ return it.next();
+ }
+
+ /**
+ * Expands the spatial nodes of the specified pair.
+ *
+ * @param index the index storing the objects
+ * @param distFunction the spatial distance function of this algorithm
+ * @param nodePair the pair of nodes to be expanded
+ * @param knns the knn list
+ */
+ private void expandNodes(DeLiCluTree index, SpatialPrimitiveDistanceFunction<NV, D> distFunction, SpatialObjectPair nodePair, DataStore<KNNList<D>> knns) {
+ DeLiCluNode node1 = index.getNode(((SpatialDirectoryEntry)nodePair.entry1).getPageID());
+ DeLiCluNode node2 = index.getNode(((SpatialDirectoryEntry)nodePair.entry2).getPageID());
+
+ if(node1.isLeaf()) {
+ expandLeafNodes(distFunction, node1, node2, knns);
+ }
+ else {
+ expandDirNodes(distFunction, node1, node2);
+ }
+
+ index.setExpanded(nodePair.entry2, nodePair.entry1);
+ }
+
+ /**
+ * Expands the specified directory nodes.
+ *
+ * @param distFunction the spatial distance function of this algorithm
+ * @param node1 the first node
+ * @param node2 the second node
+ */
+ private void expandDirNodes(SpatialPrimitiveDistanceFunction<NV, D> distFunction, DeLiCluNode node1, DeLiCluNode node2) {
+ int numEntries_1 = node1.getNumEntries();
+ int numEntries_2 = node2.getNumEntries();
+
+ // insert all combinations of unhandled - handled children of
+ // node1-node2 into pq
+ for(int i = 0; i < numEntries_1; i++) {
+ DeLiCluEntry entry1 = node1.getEntry(i);
+ if(!entry1.hasUnhandled()) {
+ continue;
+ }
+ for(int j = 0; j < numEntries_2; j++) {
+ DeLiCluEntry entry2 = node2.getEntry(j);
+
+ if(!entry2.hasHandled()) {
+ continue;
+ }
+ D distance = distFunction.minDist(entry1, entry2);
+
+ SpatialObjectPair nodePair = new SpatialObjectPair(distance, entry1, entry2, true);
+ heap.add(nodePair);
+ }
+ }
+ }
+
+ /**
+ * Expands the specified leaf nodes.
+ *
+ * @param distFunction the spatial distance function of this algorithm
+ * @param node1 the first node
+ * @param node2 the second node
+ * @param knns the knn list
+ */
+ private void expandLeafNodes(SpatialPrimitiveDistanceFunction<NV, D> distFunction, DeLiCluNode node1, DeLiCluNode node2, DataStore<KNNList<D>> knns) {
+ int numEntries_1 = node1.getNumEntries();
+ int numEntries_2 = node2.getNumEntries();
+
+ // insert all combinations of unhandled - handled children of
+ // node1-node2 into pq
+ for(int i = 0; i < numEntries_1; i++) {
+ DeLiCluEntry entry1 = node1.getEntry(i);
+ if(!entry1.hasUnhandled()) {
+ continue;
+ }
+ for(int j = 0; j < numEntries_2; j++) {
+ DeLiCluEntry entry2 = node2.getEntry(j);
+ if(!entry2.hasHandled()) {
+ continue;
+ }
+
+ D distance = distFunction.minDist(entry1, entry2);
+ D reach = DistanceUtil.max(distance, knns.get(((LeafEntry) entry2).getDBID()).getKNNDistance());
+ SpatialObjectPair dataPair = new SpatialObjectPair(reach, entry1, entry2, false);
+ heap.add(dataPair);
+ }
+ }
+ }
+
+ /**
+ * Reinserts the objects of the already expanded nodes.
+ *
+ * @param distFunction the spatial distance function of this algorithm
+ * @param index the index storing the objects
+ * @param path the path of the object inserted last
+ * @param knns the knn list
+ */
+ private void reinsertExpanded(SpatialPrimitiveDistanceFunction<NV, D> distFunction, DeLiCluTree index, List<TreeIndexPathComponent<DeLiCluEntry>> path, DataStore<KNNList<D>> knns) {
+ SpatialDirectoryEntry rootEntry = (SpatialDirectoryEntry)path.remove(0).getEntry();
+ reinsertExpanded(distFunction, index, path, 0, rootEntry, knns);
+ }
+
+ private void reinsertExpanded(SpatialPrimitiveDistanceFunction<NV, D> distFunction, DeLiCluTree index, List<TreeIndexPathComponent<DeLiCluEntry>> path, int pos, SpatialDirectoryEntry parentEntry, DataStore<KNNList<D>> knns) {
+ DeLiCluNode parentNode = index.getNode(parentEntry.getPageID());
+ SpatialEntry entry2 = path.get(pos).getEntry();
+
+ if(entry2.isLeafEntry()) {
+ for(int i = 0; i < parentNode.getNumEntries(); i++) {
+ DeLiCluEntry entry1 = parentNode.getEntry(i);
+ if(entry1.hasHandled()) {
+ continue;
+ }
+ D distance = distFunction.minDist(entry1, entry2);
+ D reach = DistanceUtil.max(distance, knns.get(((LeafEntry) entry2).getDBID()).getKNNDistance());
+ SpatialObjectPair dataPair = new SpatialObjectPair(reach, entry1, entry2, false);
+ heap.add(dataPair);
+ }
+ }
+ else {
+ Set<Integer> expanded = index.getExpanded(entry2);
+ for(int i = 0; i < parentNode.getNumEntries(); i++) {
+ SpatialDirectoryEntry entry1 = (SpatialDirectoryEntry) parentNode.getEntry(i);
+
+ // not yet expanded
+ if(!expanded.contains(entry1.getPageID())) {
+ D distance = distFunction.minDist(entry1, entry2);
+ SpatialObjectPair nodePair = new SpatialObjectPair(distance, entry1, entry2, true);
+ heap.add(nodePair);
+ }
+
+ // already expanded
+ else {
+ reinsertExpanded(distFunction, index, path, pos + 1, entry1, knns);
+ }
+ }
+ }
+ }
+
+ @Override
+ public int getMinPts() {
+ return minpts;
+ }
+
+ @Override
+ public D getDistanceFactory() {
+ return getDistanceFunction().getDistanceFactory();
+ }
+
+ @Override
+ public TypeInformation[] getInputTypeRestriction() {
+ return TypeUtil.array(TypeUtil.NUMBER_VECTOR_FIELD);
+ }
+
+ @Override
+ protected Logging getLogger() {
+ return logger;
+ }
+
+ /**
+ * Encapsulates an entry in the cluster order.
+ *
+ * @apiviz.exclude
+ */
+ public class SpatialObjectPair implements Comparable<SpatialObjectPair> {
+ /**
+ * The first entry of this pair.
+ */
+ SpatialEntry entry1;
+
+ /**
+ * The second entry of this pair.
+ */
+ SpatialEntry entry2;
+
+ /**
+ * Indicates whether this pair is expandable or not.
+ */
+ boolean isExpandable;
+
+ /**
+ * The current distance.
+ */
+ D distance;
+
+ /**
+ * Creates a new entry with the specified parameters.
+ *
+ * @param entry1 the first entry of this pair
+ * @param entry2 the second entry of this pair
+ * @param isExpandable if true, this pair is expandable (a pair of nodes),
+ * otherwise this pair is not expandable (a pair of objects)
+ */
+ public SpatialObjectPair(D distance, SpatialEntry entry1, SpatialEntry entry2, boolean isExpandable) {
+ this.distance = distance;
+ this.entry1 = entry1;
+ this.entry2 = entry2;
+ this.isExpandable = isExpandable;
+ }
+
+ /**
+ * Compares this object with the specified object for order. Returns a
+ * negative integer, zero, or a positive integer as this object is less
+ * than, equal to, or greater than the specified object.
+ * <p/>
+ *
+ * @param other the Object to be compared.
+ * @return a negative integer, zero, or a positive integer as this object is
+ * less than, equal to, or greater than the specified object.
+ */
+ @Override
+ public int compareTo(SpatialObjectPair other) {
+ /*
+ * if(this.entry1.getEntryID().compareTo(other.entry1.getEntryID()) > 0) {
+ * return -1; }
+ * if(this.entry1.getEntryID().compareTo(other.entry1.getEntryID()) < 0) {
+ * return 1; }
+ * if(this.entry2.getEntryID().compareTo(other.entry2.getEntryID()) > 0) {
+ * return -1; }
+ * if(this.entry2.getEntryID().compareTo(other.entry2.getEntryID()) < 0) {
+ * return 1; } return 0;
+ */
+ // FIXME: inverted?
+ return this.distance.compareTo(other.distance);
+ }
+
+ /**
+ * Returns a string representation of the object.
+ *
+ * @return a string representation of the object.
+ */
+ @Override
+ public String toString() {
+ if(!isExpandable) {
+ return entry1 + " - " + entry2;
+ }
+ return "n_" + entry1 + " - n_" + entry2;
+ }
+
+ /** equals is used in updating the heap! */
+ @SuppressWarnings("unchecked")
+ @Override
+ public boolean equals(Object obj) {
+ if(!(SpatialObjectPair.class.isInstance(obj))) {
+ return false;
+ }
+ SpatialObjectPair other = (SpatialObjectPair) obj;
+ if(!isExpandable) {
+ return this.entry1.equals(other.entry1);
+ }
+ else {
+ return this.entry1.equals(other.entry1) && this.entry2.equals(other.entry2);
+ }
+ }
+
+ /** hashCode is used in updating the heap! */
+ @Override
+ public int hashCode() {
+ final long prime = 2654435761L;
+ if(!isExpandable) {
+ return entry1.hashCode();
+ }
+ long result = 0;
+ result = prime * result + ((entry1 == null) ? 0 : entry1.hashCode());
+ result = prime * result + ((entry2 == null) ? 0 : entry2.hashCode());
+ return (int) result;
+ }
+ }
+
+ /**
+ * Parameterization class.
+ *
+ * @author Erich Schubert
+ *
+ * @apiviz.exclude
+ */
+ public static class Parameterizer<NV extends NumberVector<NV, ?>, D extends Distance<D>> extends AbstractDistanceBasedAlgorithm.Parameterizer<NV, D> {
+ protected int minpts = 0;
+
+ @Override
+ protected void makeOptions(Parameterization config) {
+ super.makeOptions(config);
+ IntParameter minptsP = new IntParameter(MINPTS_ID);
+ minptsP.addConstraint(new GreaterConstraint(0));
+ if(config.grab(minptsP)) {
+ minpts = minptsP.getValue();
+ }
+ }
+
+ @Override
+ protected DeLiClu<NV, D> makeInstance() {
+ return new DeLiClu<NV, D>(distanceFunction, minpts);
+ }
+ }
+} \ No newline at end of file