Diffstat (limited to 'src/de/lmu/ifi/dbs/elki/algorithm/clustering/hierarchical')
14 files changed, 180 insertions, 590 deletions
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/hierarchical/CentroidLinkageMethod.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/hierarchical/CentroidLinkageMethod.java index 72b6fb57..f0be669f 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/hierarchical/CentroidLinkageMethod.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/hierarchical/CentroidLinkageMethod.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.hierarchical; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2013 + Copyright (C) 2014 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/hierarchical/CompleteLinkageMethod.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/hierarchical/CompleteLinkageMethod.java index 0cb47fa7..4f205001 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/hierarchical/CompleteLinkageMethod.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/hierarchical/CompleteLinkageMethod.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.hierarchical; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2013 + Copyright (C) 2014 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/hierarchical/ExtractFlatClusteringFromHierarchy.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/hierarchical/ExtractFlatClusteringFromHierarchy.java index f6dbc88f..3a26a96c 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/hierarchical/ExtractFlatClusteringFromHierarchy.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/hierarchical/ExtractFlatClusteringFromHierarchy.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.hierarchical; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2013 + Copyright (C) 2014 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team @@ -26,7 +26,6 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.hierarchical; import gnu.trove.list.array.TDoubleArrayList; import java.util.ArrayList; -import java.util.Comparator; import de.lmu.ifi.dbs.elki.algorithm.clustering.ClusteringAlgorithm; import de.lmu.ifi.dbs.elki.data.Cluster; @@ -35,10 +34,9 @@ import de.lmu.ifi.dbs.elki.data.model.DendrogramModel; import de.lmu.ifi.dbs.elki.data.type.TypeInformation; import de.lmu.ifi.dbs.elki.database.Database; import de.lmu.ifi.dbs.elki.database.datastore.DBIDDataStore; -import de.lmu.ifi.dbs.elki.database.datastore.DataStore; import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory; import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil; -import de.lmu.ifi.dbs.elki.database.datastore.DoubleDistanceDataStore; +import de.lmu.ifi.dbs.elki.database.datastore.DoubleDataStore; import de.lmu.ifi.dbs.elki.database.datastore.WritableIntegerDataStore; import de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs; import de.lmu.ifi.dbs.elki.database.ids.DBIDArrayIter; @@ -48,8 +46,6 @@ import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; import de.lmu.ifi.dbs.elki.database.ids.DBIDVar; import de.lmu.ifi.dbs.elki.database.ids.DBIDs; import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs; -import 
de.lmu.ifi.dbs.elki.distance.distancevalue.Distance; -import de.lmu.ifi.dbs.elki.distance.distancevalue.DoubleDistance; import de.lmu.ifi.dbs.elki.logging.Logging; import de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress; import de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException; @@ -57,7 +53,7 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.AbstractParameterizer; import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; -import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DistanceParameter; +import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.DoubleParameter; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.EnumParameter; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.Flag; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter; @@ -75,7 +71,7 @@ import de.lmu.ifi.dbs.elki.workflow.AlgorithmStep; * @apiviz.uses PointerHierarchyRepresentationResult * @apiviz.has Clustering */ -public class ExtractFlatClusteringFromHierarchy<D extends Distance<D>> implements ClusteringAlgorithm<Clustering<DendrogramModel<D>>> { +public class ExtractFlatClusteringFromHierarchy implements ClusteringAlgorithm<Clustering<DendrogramModel>> { /** * Class logger. */ @@ -119,7 +115,7 @@ public class ExtractFlatClusteringFromHierarchy<D extends Distance<D>> implement /** * Clustering algorithm to run to obtain the hierarchy. */ - private HierarchicalClusteringAlgorithm<D> algorithm; + private HierarchicalClusteringAlgorithm algorithm; /** * Include empty cluster in the hierarchy produced. @@ -129,7 +125,7 @@ public class ExtractFlatClusteringFromHierarchy<D extends Distance<D>> implement /** * Threshold for extracting clusters. */ - private D threshold = null; + private double threshold; /** * Disallow singleton clusters, but add them to the parent cluster instead. @@ -144,10 +140,10 @@ public class ExtractFlatClusteringFromHierarchy<D extends Distance<D>> implement * @param outputmode Output mode: truncated hierarchy or strict partitions. * @param singletons Allow producing singleton clusters. */ - public ExtractFlatClusteringFromHierarchy(HierarchicalClusteringAlgorithm<D> algorithm, int minclusters, OutputMode outputmode, boolean singletons) { + public ExtractFlatClusteringFromHierarchy(HierarchicalClusteringAlgorithm algorithm, int minclusters, OutputMode outputmode, boolean singletons) { super(); this.algorithm = algorithm; - this.threshold = null; + this.threshold = Double.NaN; this.minclusters = minclusters; this.outputmode = outputmode; this.singletons = singletons; @@ -161,7 +157,7 @@ public class ExtractFlatClusteringFromHierarchy<D extends Distance<D>> implement * @param outputmode Output mode: truncated hierarchy or strict partitions. * @param singletons Allow producing singleton clusters. 
*/ - public ExtractFlatClusteringFromHierarchy(HierarchicalClusteringAlgorithm<D> algorithm, D threshold, OutputMode outputmode, boolean singletons) { + public ExtractFlatClusteringFromHierarchy(HierarchicalClusteringAlgorithm algorithm, double threshold, OutputMode outputmode, boolean singletons) { super(); this.algorithm = algorithm; this.threshold = threshold; @@ -171,19 +167,13 @@ public class ExtractFlatClusteringFromHierarchy<D extends Distance<D>> implement } @Override - public Clustering<DendrogramModel<D>> run(Database database) { - PointerHierarchyRepresentationResult<D> pointerresult = algorithm.run(database); + public Clustering<DendrogramModel> run(Database database) { + PointerHierarchyRepresentationResult pointerresult = algorithm.run(database); DBIDs ids = pointerresult.getDBIDs(); DBIDDataStore pi = pointerresult.getParentStore(); - DataStore<D> lambda = pointerresult.getParentDistanceStore(); + DoubleDataStore lambda = pointerresult.getParentDistanceStore(); - Clustering<DendrogramModel<D>> result; - if(lambda instanceof DoubleDistanceDataStore) { - result = extractClustersDouble(ids, pi, (DoubleDistanceDataStore) lambda); - } - else { - result = extractClusters(ids, pi, lambda); - } + Clustering<DendrogramModel> result = extractClusters(ids, pi, lambda); result.addChildResult(pointerresult); return result; @@ -198,245 +188,14 @@ public class ExtractFlatClusteringFromHierarchy<D extends Distance<D>> implement * * @return Hierarchical clustering */ - private Clustering<DendrogramModel<D>> extractClusters(DBIDs ids, final DBIDDataStore pi, final DataStore<D> lambda) { - FiniteProgress progress = LOG.isVerbose() ? new FiniteProgress("Extracting clusters", ids.size(), LOG) : null; - - // Sort DBIDs by lambda. We need this for two things: - // a) to determine the stop distance from "minclusters" parameter - // b) to process arrows in decreasing / increasing order - ArrayModifiableDBIDs order = DBIDUtil.newArray(ids); - order.sort(new CompareByLambda<>(lambda)); - DBIDArrayIter it = order.iter(); // Used multiple times! - - int split; - if(minclusters > 0) { - split = Math.max(ids.size() - minclusters, 0); - // Stop distance: - final D stopdist = lambda.get(order.get(split)); - - // Tie handling: decrement split. - while(split > 0) { - it.seek(split - 1); - if(stopdist.compareTo(lambda.get(it)) <= 0) { - split--; - } - else { - break; - } - } - } - else if(threshold != null) { - split = ids.size(); - it.seek(split - 1); - while(threshold.compareTo(lambda.get(it)) <= 0 && it.valid()) { - split--; - it.retract(); - } - } - else { // full hierarchy - split = 0; - } - - // Extract the child clusters - int expcnum = ids.size() - split; - WritableIntegerDataStore cluster_map = DataStoreUtil.makeIntegerStorage(ids, DataStoreFactory.HINT_TEMP, -1); - ArrayList<ModifiableDBIDs> cluster_dbids = new ArrayList<>(expcnum); - ArrayList<D> cluster_dist = new ArrayList<>(expcnum); - ArrayModifiableDBIDs cluster_leads = DBIDUtil.newArray(expcnum); - - DBIDVar succ = DBIDUtil.newVar(); // Variable for successor. - // Go backwards on the lower part. 
- for(it.seek(split - 1); it.valid(); it.retract()) { - D dist = lambda.get(it); // Distance to successor - pi.assignVar(it, succ); // succ = pi(it) - int clusterid = cluster_map.intValue(succ); - // Successor cluster has already been created: - if(clusterid >= 0) { - cluster_dbids.get(clusterid).add(it); - cluster_map.putInt(it, clusterid); - // Update distance to maximum encountered: - if(cluster_dist.get(clusterid).compareTo(dist) < 0) { - cluster_dist.set(clusterid, dist); - } - } - else { - // Need to start a new cluster: - clusterid = cluster_dbids.size(); // next cluster number. - ModifiableDBIDs cids = DBIDUtil.newArray(); - // Add element and successor as initial members: - cids.add(succ); - cluster_map.putInt(succ, clusterid); - cids.add(it); - cluster_map.putInt(it, clusterid); - // Store new cluster. - cluster_dbids.add(cids); - cluster_leads.add(succ); - cluster_dist.add(dist); - } - - // Decrement counter - if(progress != null) { - progress.incrementProcessed(LOG); - } - } - final Clustering<DendrogramModel<D>> dendrogram; - switch(outputmode){ - case PARTIAL_HIERARCHY: { - // Build a hierarchy out of these clusters. - dendrogram = new Clustering<>("Hierarchical Clustering", "hierarchical-clustering"); - Cluster<DendrogramModel<D>> root = null; - ArrayList<Cluster<DendrogramModel<D>>> clusters = new ArrayList<>(expcnum); - // Convert initial clusters to cluster objects - { - int i = 0; - for(DBIDIter it2 = cluster_leads.iter(); it2.valid(); it2.advance(), i++) { - clusters.add(makeCluster(it2, cluster_dist.get(i), cluster_dbids.get(i))); - } - cluster_dist = null; // Invalidate - cluster_dbids = null; // Invalidate - } - // Process the upper part, bottom-up. - for(it.seek(split); it.valid(); it.advance()) { - int clusterid = cluster_map.intValue(it); - // The current cluster led by the current element: - final Cluster<DendrogramModel<D>> clus; - if(clusterid >= 0) { - clus = clusters.get(clusterid); - } - else if(!singletons && ids.size() != 1) { - clus = null; - } - else { - clus = makeCluster(it, null, DBIDUtil.deref(it)); - } - // The successor to join: - pi.assignVar(it, succ); // succ = pi(it) - if(DBIDUtil.equal(it, succ)) { - assert (root == null); - root = clus; - } - else { - // Parent cluster: - int parentid = cluster_map.intValue(succ); - D depth = lambda.get(it); - // Parent cluster exists - merge as a new cluster: - if(parentid >= 0) { - final Cluster<DendrogramModel<D>> pclus = clusters.get(parentid); - if(pclus.getModel().getDistance().equals(depth)) { - if(clus == null) { - ((ModifiableDBIDs) pclus.getIDs()).add(it); - } - else { - dendrogram.addChildCluster(pclus, clus); - } - } - else { - // Merge at new depth: - ModifiableDBIDs cids = DBIDUtil.newArray(clus == null ? 1 : 0); - if(clus == null) { - cids.add(it); - } - Cluster<DendrogramModel<D>> npclus = makeCluster(succ, depth, cids); - if(clus != null) { - dendrogram.addChildCluster(npclus, clus); - } - dendrogram.addChildCluster(npclus, pclus); - // Replace existing parent cluster: new depth - clusters.set(parentid, npclus); - } - } - else { - // Merge with parent at this depth: - final Cluster<DendrogramModel<D>> pclus; - if(!singletons) { - ModifiableDBIDs cids = DBIDUtil.newArray(clus == null ? 2 : 1); - cids.add(succ); - if(clus == null) { - cids.add(it); - } - // New cluster for parent and/or new point - pclus = makeCluster(succ, depth, cids); - } - else { - // Create a new, one-element cluster for parent, and a merged - // cluster on top. 
- pclus = makeCluster(succ, depth, DBIDUtil.EMPTYDBIDS); - dendrogram.addChildCluster(pclus, makeCluster(succ, null, DBIDUtil.deref(succ))); - } - if(clus != null) { - dendrogram.addChildCluster(pclus, clus); - } - // Store cluster: - parentid = clusters.size(); - clusters.add(pclus); // Remember parent cluster - cluster_map.putInt(succ, parentid); // Reference - } - } - - // Decrement counter - if(progress != null) { - progress.incrementProcessed(LOG); - } - } - assert (root != null); - // attach root - dendrogram.addToplevelCluster(root); - break; - } - case STRICT_PARTITIONS: { - // Build a hierarchy out of these clusters. - dendrogram = new Clustering<>("Flattened Hierarchical Clustering", "flattened-hierarchical-clustering"); - // Convert initial clusters to cluster objects - { - int i = 0; - for(DBIDIter it2 = cluster_leads.iter(); it2.valid(); it2.advance(), i++) { - dendrogram.addToplevelCluster(makeCluster(it2, cluster_dist.get(i), cluster_dbids.get(i))); - } - cluster_dist = null; // Invalidate - cluster_dbids = null; // Invalidate - } - // Process the upper part, bottom-up. - for(it.seek(split); it.valid(); it.advance()) { - int clusterid = cluster_map.intValue(it); - if(clusterid < 0) { - dendrogram.addToplevelCluster(makeCluster(it, null, DBIDUtil.deref(it))); - } - - // Decrement counter - if(progress != null) { - progress.incrementProcessed(LOG); - } - } - break; - } - default: - throw new AbortException("Unsupported output mode."); - } - - if(progress != null) { - progress.ensureCompleted(LOG); - } - - return dendrogram; - } - - /** - * Extract all clusters from the pi-lambda-representation. - * - * @param ids Object ids to process - * @param pi Pi store - * @param lambda Lambda store - * - * @return Hierarchical clustering - */ - private Clustering<DendrogramModel<D>> extractClustersDouble(DBIDs ids, final DBIDDataStore pi, final DoubleDistanceDataStore lambda) { + private Clustering<DendrogramModel> extractClusters(DBIDs ids, final DBIDDataStore pi, final DoubleDataStore lambda) { FiniteProgress progress = LOG.isVerbose() ? new FiniteProgress("Extracting clusters", ids.size(), LOG) : null; // Sort DBIDs by lambda. We need this for two things: // a) to determine the stop distance from "minclusters" parameter // b) to process arrows in decreasing / increasing order ArrayModifiableDBIDs order = DBIDUtil.newArray(ids); - order.sort(new CompareByDoubleLambda(lambda)); + order.sort(new DataStoreUtil.AscendingByDoubleDataStore(lambda)); DBIDArrayIter it = order.iter(); // Used multiple times! int split; @@ -456,11 +215,10 @@ public class ExtractFlatClusteringFromHierarchy<D extends Distance<D>> implement } } } - else if(threshold != null) { + else if(!Double.isNaN(threshold)) { split = ids.size(); it.seek(split - 1); - double stopdist = ((DoubleDistance) threshold).doubleValue(); - while(stopdist <= lambda.doubleValue(it) && it.valid()) { + while(threshold <= lambda.doubleValue(it) && it.valid()) { split--; it.retract(); } @@ -507,23 +265,20 @@ public class ExtractFlatClusteringFromHierarchy<D extends Distance<D>> implement } // Decrement counter - if(progress != null) { - progress.incrementProcessed(LOG); - } + LOG.incrementProcessed(progress); } - final Clustering<DendrogramModel<D>> dendrogram; + final Clustering<DendrogramModel> dendrogram; switch(outputmode){ case PARTIAL_HIERARCHY: { // Build a hierarchy out of these clusters. 
dendrogram = new Clustering<>("Hierarchical Clustering", "hierarchical-clustering"); - Cluster<DendrogramModel<D>> root = null; - ArrayList<Cluster<DendrogramModel<D>>> clusters = new ArrayList<>(expcnum); + Cluster<DendrogramModel> root = null; + ArrayList<Cluster<DendrogramModel>> clusters = new ArrayList<>(expcnum); // Convert initial clusters to cluster objects { int i = 0; for(DBIDIter it2 = cluster_leads.iter(); it2.valid(); it2.advance(), i++) { - @SuppressWarnings("unchecked") - D depth = (D) new DoubleDistance(cluster_dist.get(i)); + double depth = cluster_dist.get(i); clusters.add(makeCluster(it2, depth, cluster_dbids.get(i))); } cluster_dist = null; // Invalidate @@ -533,7 +288,7 @@ public class ExtractFlatClusteringFromHierarchy<D extends Distance<D>> implement for(it.seek(split); it.valid(); it.advance()) { int clusterid = cluster_map.intValue(it); // The current cluster led by the current element: - final Cluster<DendrogramModel<D>> clus; + final Cluster<DendrogramModel> clus; if(clusterid >= 0) { clus = clusters.get(clusterid); } @@ -541,7 +296,7 @@ public class ExtractFlatClusteringFromHierarchy<D extends Distance<D>> implement clus = null; } else { - clus = makeCluster(it, null, DBIDUtil.deref(it)); + clus = makeCluster(it, Double.NaN, DBIDUtil.deref(it)); } // The successor to join: pi.assignVar(it, succ); // succ = pi(it) @@ -552,12 +307,11 @@ public class ExtractFlatClusteringFromHierarchy<D extends Distance<D>> implement else { // Parent cluster: int parentid = cluster_map.intValue(succ); - @SuppressWarnings("unchecked") - D depth = (D) new DoubleDistance(lambda.doubleValue(it)); + double depth = lambda.doubleValue(it); // Parent cluster exists - merge as a new cluster: if(parentid >= 0) { - final Cluster<DendrogramModel<D>> pclus = clusters.get(parentid); - if(pclus.getModel().getDistance().equals(depth)) { + final Cluster<DendrogramModel> pclus = clusters.get(parentid); + if(pclus.getModel().getDistance() == depth) { if(clus == null) { ((ModifiableDBIDs) pclus.getIDs()).add(it); } @@ -571,7 +325,7 @@ public class ExtractFlatClusteringFromHierarchy<D extends Distance<D>> implement if(clus == null) { cids.add(it); } - Cluster<DendrogramModel<D>> npclus = makeCluster(succ, depth, cids); + Cluster<DendrogramModel> npclus = makeCluster(succ, depth, cids); if(clus != null) { dendrogram.addChildCluster(npclus, clus); } @@ -582,7 +336,7 @@ public class ExtractFlatClusteringFromHierarchy<D extends Distance<D>> implement } else { // Merge with parent at this depth: - final Cluster<DendrogramModel<D>> pclus; + final Cluster<DendrogramModel> pclus; if(!singletons) { ModifiableDBIDs cids = DBIDUtil.newArray(clus == null ? 2 : 1); cids.add(succ); @@ -596,7 +350,7 @@ public class ExtractFlatClusteringFromHierarchy<D extends Distance<D>> implement // Create a new, one-element cluster for parent, and a merged // cluster on top. 
pclus = makeCluster(succ, depth, DBIDUtil.EMPTYDBIDS); - dendrogram.addChildCluster(pclus, makeCluster(succ, null, DBIDUtil.deref(succ))); + dendrogram.addChildCluster(pclus, makeCluster(succ, Double.NaN, DBIDUtil.deref(succ))); } if(clus != null) { dendrogram.addChildCluster(pclus, clus); @@ -609,9 +363,7 @@ public class ExtractFlatClusteringFromHierarchy<D extends Distance<D>> implement } // Decrement counter - if(progress != null) { - progress.incrementProcessed(LOG); - } + LOG.incrementProcessed(progress); } assert (root != null); // attach root @@ -625,8 +377,7 @@ public class ExtractFlatClusteringFromHierarchy<D extends Distance<D>> implement { int i = 0; for(DBIDIter it2 = cluster_leads.iter(); it2.valid(); it2.advance(), i++) { - @SuppressWarnings("unchecked") - D depth = (D) new DoubleDistance(cluster_dist.get(i)); + double depth = cluster_dist.get(i); dendrogram.addToplevelCluster(makeCluster(it2, depth, cluster_dbids.get(i))); } cluster_dist = null; // Invalidate @@ -636,13 +387,11 @@ public class ExtractFlatClusteringFromHierarchy<D extends Distance<D>> implement for(it.seek(split); it.valid(); it.advance()) { int clusterid = cluster_map.intValue(it); if(clusterid < 0) { - dendrogram.addToplevelCluster(makeCluster(it, null, DBIDUtil.deref(it))); + dendrogram.addToplevelCluster(makeCluster(it, Double.NaN, DBIDUtil.deref(it))); } // Decrement counter - if(progress != null) { - progress.incrementProcessed(LOG); - } + LOG.incrementProcessed(progress); } break; } @@ -650,9 +399,7 @@ public class ExtractFlatClusteringFromHierarchy<D extends Distance<D>> implement throw new AbortException("Unsupported output mode."); } - if(progress != null) { - progress.ensureCompleted(LOG); - } + LOG.ensureCompleted(progress); return dendrogram; } @@ -665,22 +412,22 @@ public class ExtractFlatClusteringFromHierarchy<D extends Distance<D>> implement * @param members Member objects * @return Cluster */ - private Cluster<DendrogramModel<D>> makeCluster(DBIDRef lead, D depth, DBIDs members) { + private Cluster<DendrogramModel> makeCluster(DBIDRef lead, double depth, DBIDs members) { final String name; if(members.size() == 0) { name = "mrg_" + DBIDUtil.toString(lead) + "_" + depth; } - else if(depth != null && depth.isInfiniteDistance() || (members.size() == 1 && members.contains(lead))) { + else if(!Double.isNaN(depth) && Double.isInfinite(depth) || (members.size() == 1 && members.contains(lead))) { name = "obj_" + DBIDUtil.toString(lead); } - else if(depth != null) { + else if(!Double.isNaN(depth)) { name = "clu_" + DBIDUtil.toString(lead) + "_" + depth; } else { // Complete data set only? name = "clu_" + DBIDUtil.toString(lead); } - Cluster<DendrogramModel<D>> cluster = new Cluster<>(name, members, new DendrogramModel<>(depth)); + Cluster<DendrogramModel> cluster = new Cluster<>(name, members, new DendrogramModel(depth)); return cluster; } @@ -690,77 +437,13 @@ public class ExtractFlatClusteringFromHierarchy<D extends Distance<D>> implement } /** - * Order a DBID collection by the lambda value. - * - * @author Erich Schubert - * - * @apiviz.exclude - * - * @param <D> Distance type - */ - private static final class CompareByLambda<D extends Distance<D>> implements Comparator<DBIDRef> { - /** - * Lambda storage - */ - private final DataStore<D> lambda; - - /** - * Constructor. 
- * - * @param lambda Lambda storage - */ - protected CompareByLambda(DataStore<D> lambda) { - this.lambda = lambda; - } - - @Override - public int compare(DBIDRef id1, DBIDRef id2) { - D k1 = lambda.get(id1); - D k2 = lambda.get(id2); - assert (k1 != null); - assert (k2 != null); - return k1.compareTo(k2); - } - } - - /** - * Order a DBID collection by the lambda value. - * - * @author Erich Schubert - * - * @apiviz.exclude - */ - private static final class CompareByDoubleLambda implements Comparator<DBIDRef> { - /** - * Lambda storage - */ - private final DoubleDistanceDataStore lambda; - - /** - * Constructor. - * - * @param lambda Lambda storage - */ - protected CompareByDoubleLambda(DoubleDistanceDataStore lambda) { - this.lambda = lambda; - } - - @Override - public int compare(DBIDRef id1, DBIDRef id2) { - double k1 = lambda.doubleValue(id1); - double k2 = lambda.doubleValue(id2); - return Double.compare(k1, k2); - } - } - - /** * Parameterization class. * * @author Erich Schubert * * @apiviz.exclude */ - public static class Parameterizer<D extends Distance<D>> extends AbstractParameterizer { + public static class Parameterizer extends AbstractParameterizer { /** * Extraction mode to use. */ @@ -794,7 +477,7 @@ public class ExtractFlatClusteringFromHierarchy<D extends Distance<D>> implement /** * Threshold level. */ - D threshold = null; + double threshold = Double.NaN; /** * Flag to produce empty clusters to model the hierarchy above. @@ -804,7 +487,7 @@ public class ExtractFlatClusteringFromHierarchy<D extends Distance<D>> implement /** * The hierarchical clustering algorithm to run. */ - HierarchicalClusteringAlgorithm<D> algorithm; + HierarchicalClusteringAlgorithm algorithm; /** * Threshold mode. @@ -819,7 +502,7 @@ public class ExtractFlatClusteringFromHierarchy<D extends Distance<D>> implement @Override protected void makeOptions(Parameterization config) { super.makeOptions(config); - ObjectParameter<HierarchicalClusteringAlgorithm<D>> algorithmP = new ObjectParameter<>(AlgorithmStep.Parameterizer.ALGORITHM_ID, HierarchicalClusteringAlgorithm.class); + ObjectParameter<HierarchicalClusteringAlgorithm> algorithmP = new ObjectParameter<>(AlgorithmStep.Parameterizer.ALGORITHM_ID, HierarchicalClusteringAlgorithm.class); if(config.grab(algorithmP)) { algorithm = algorithmP.instantiateClass(config); } @@ -838,10 +521,7 @@ public class ExtractFlatClusteringFromHierarchy<D extends Distance<D>> implement } if(thresholdmode == null || ThresholdMode.BY_THRESHOLD.equals(thresholdmode)) { - // Fallback to double when no algorithm chosen yet: - @SuppressWarnings("unchecked") - final D factory = algorithm != null ? 
algorithm.getDistanceFactory() : (D) DoubleDistance.FACTORY; - DistanceParameter<D> distP = new DistanceParameter<>(THRESHOLD_ID, factory); + DoubleParameter distP = new DoubleParameter(THRESHOLD_ID); if(config.grab(distP)) { threshold = distP.getValue(); } @@ -866,13 +546,13 @@ public class ExtractFlatClusteringFromHierarchy<D extends Distance<D>> implement } @Override - protected ExtractFlatClusteringFromHierarchy<D> makeInstance() { + protected ExtractFlatClusteringFromHierarchy makeInstance() { switch(thresholdmode){ case NO_THRESHOLD: case BY_MINCLUSTERS: - return new ExtractFlatClusteringFromHierarchy<>(algorithm, minclusters, outputmode, singletons); + return new ExtractFlatClusteringFromHierarchy(algorithm, minclusters, outputmode, singletons); case BY_THRESHOLD: - return new ExtractFlatClusteringFromHierarchy<>(algorithm, threshold, outputmode, singletons); + return new ExtractFlatClusteringFromHierarchy(algorithm, threshold, outputmode, singletons); default: throw new AbortException("Unknown extraction mode."); } diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/hierarchical/GroupAverageLinkageMethod.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/hierarchical/GroupAverageLinkageMethod.java index 079fb69b..5795ba08 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/hierarchical/GroupAverageLinkageMethod.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/hierarchical/GroupAverageLinkageMethod.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.hierarchical; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2013 + Copyright (C) 2014 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/hierarchical/HierarchicalClusteringAlgorithm.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/hierarchical/HierarchicalClusteringAlgorithm.java index f3595d51..88c1bf46 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/hierarchical/HierarchicalClusteringAlgorithm.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/hierarchical/HierarchicalClusteringAlgorithm.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.hierarchical; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2013 + Copyright (C) 2014 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team @@ -24,7 +24,6 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.hierarchical; */ import de.lmu.ifi.dbs.elki.algorithm.Algorithm; import de.lmu.ifi.dbs.elki.database.Database; -import de.lmu.ifi.dbs.elki.distance.distancevalue.Distance; /** * Interface for hierarchical clustering algorithms. @@ -35,17 +34,8 @@ import de.lmu.ifi.dbs.elki.distance.distancevalue.Distance; * @author Erich Schubert * * @apiviz.has PointerHierarchyRepresentationResult - * - * @param <D> Distance type */ -public interface HierarchicalClusteringAlgorithm<D extends Distance<D>> extends Algorithm { +public interface HierarchicalClusteringAlgorithm extends Algorithm { @Override - public PointerHierarchyRepresentationResult<D> run(Database db); - - /** - * Return the distance type that will be used by the algorithm. - * - * @return Distance factory. 
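
The ExtractFlatClusteringFromHierarchy hunks above replace the generic Distance<D> threshold by a primitive double (with Double.NaN standing in for "no threshold") and keep only the double-based extractClusters path. A minimal illustrative sketch of the underlying idea, using plain arrays instead of ELKI's DBID data stores (the cutAtThreshold helper below is hypothetical, not part of this patch): following each object's parent pointer pi while its merge distance lambda stays below the cut yields the flat partition.

// Illustrative only (not ELKI API): cut a pi/lambda pointer hierarchy at a threshold.
// pi[i] is the parent of object i; lambda[i] is the distance at which i merges into pi[i];
// roots point to themselves. Objects connected by merges below `threshold` share a label.
final class PointerHierarchyCut {
  static int[] cutAtThreshold(int[] pi, double[] lambda, double threshold) {
    final int n = pi.length;
    int[] label = new int[n];
    java.util.Arrays.fill(label, -1);
    int next = 0;
    for(int i = 0; i < n; i++) {
      java.util.ArrayDeque<Integer> path = new java.util.ArrayDeque<>();
      int cur = i;
      // Follow parent pointers while the merge happened below the cut.
      while(label[cur] < 0 && pi[cur] != cur && lambda[cur] < threshold) {
        path.push(cur);
        cur = pi[cur];
      }
      final int c = (label[cur] >= 0) ? label[cur] : next++;
      label[cur] = c;
      while(!path.isEmpty()) {
        label[path.pop()] = c; // memoize labels along the traversed path
      }
    }
    return label;
  }
}
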
- */ - public D getDistanceFactory(); + public PointerHierarchyRepresentationResult run(Database db); } diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/hierarchical/LinkageMethod.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/hierarchical/LinkageMethod.java index 68d0b4d8..0327010c 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/hierarchical/LinkageMethod.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/hierarchical/LinkageMethod.java @@ -6,7 +6,7 @@ import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2013 + Copyright (C) 2014 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/hierarchical/MedianLinkageMethod.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/hierarchical/MedianLinkageMethod.java index fe167cec..eb4ea2a9 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/hierarchical/MedianLinkageMethod.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/hierarchical/MedianLinkageMethod.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.hierarchical; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2013 + Copyright (C) 2014 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/hierarchical/NaiveAgglomerativeHierarchicalClustering.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/hierarchical/NaiveAgglomerativeHierarchicalClustering.java index ee3052a4..083a13dd 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/hierarchical/NaiveAgglomerativeHierarchicalClustering.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/hierarchical/NaiveAgglomerativeHierarchicalClustering.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.hierarchical; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2013 + Copyright (C) 2014 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team @@ -30,7 +30,7 @@ import de.lmu.ifi.dbs.elki.database.Database; import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory; import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil; import de.lmu.ifi.dbs.elki.database.datastore.WritableDBIDDataStore; -import de.lmu.ifi.dbs.elki.database.datastore.WritableDoubleDistanceDataStore; +import de.lmu.ifi.dbs.elki.database.datastore.WritableDoubleDataStore; import de.lmu.ifi.dbs.elki.database.datastore.WritableIntegerDataStore; import de.lmu.ifi.dbs.elki.database.ids.ArrayDBIDs; import de.lmu.ifi.dbs.elki.database.ids.DBIDArrayIter; @@ -40,8 +40,6 @@ import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery; import de.lmu.ifi.dbs.elki.database.relation.Relation; import de.lmu.ifi.dbs.elki.distance.distancefunction.DistanceFunction; import de.lmu.ifi.dbs.elki.distance.distancefunction.minkowski.SquaredEuclideanDistanceFunction; -import de.lmu.ifi.dbs.elki.distance.distancevalue.DoubleDistance; -import de.lmu.ifi.dbs.elki.distance.distancevalue.NumberDistance; import de.lmu.ifi.dbs.elki.logging.Logging; import de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress; import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; @@ -51,11 +49,10 @@ import 
de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameteriz import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter; /** - * This tutorial will step you through implementing a well known clustering - * algorithm, agglomerative hierarchical clustering, in multiple steps. - * - * This is the third step, where we add support for different linkage - * strategies. + * Hierarchical Agglomerative Clustering (HAC) is a classic hierarchical + * clustering algorithm. Initially, each element is its own cluster; the closest + * clusters are merged at every step, until all the data has become a single + * cluster. * * This is the naive O(n^3) algorithm. See {@link SLINK} for a much faster * algorithm (however, only for single-linkage). @@ -81,8 +78,11 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter; * * @param <O> Object type */ -@Reference(authors = "G. N. Lance and W. T. Williams", title = "A general theory of classificatory sorting strategies 1. Hierarchical systems", booktitle = "The computer journal 9.4", url = "http://dx.doi.org/ 10.1093/comjnl/9.4.373") -public class NaiveAgglomerativeHierarchicalClustering<O, D extends NumberDistance<D, ?>> extends AbstractDistanceBasedAlgorithm<O, D, PointerHierarchyRepresentationResult<DoubleDistance>> implements HierarchicalClusteringAlgorithm<DoubleDistance> { +@Reference(authors = "G. N. Lance and W. T. Williams", // +title = "A general theory of classificatory sorting strategies 1. Hierarchical systems", // +booktitle = "The computer journal 9.4", // +url = "http://dx.doi.org/ 10.1093/comjnl/9.4.373") +public class NaiveAgglomerativeHierarchicalClustering<O> extends AbstractDistanceBasedAlgorithm<O, PointerHierarchyRepresentationResult> implements HierarchicalClusteringAlgorithm { /** * Class logger */ @@ -99,7 +99,7 @@ public class NaiveAgglomerativeHierarchicalClustering<O, D extends NumberDistanc * @param distanceFunction Distance function to use * @param linkage Linkage method */ - public NaiveAgglomerativeHierarchicalClustering(DistanceFunction<? super O, D> distanceFunction, LinkageMethod linkage) { + public NaiveAgglomerativeHierarchicalClustering(DistanceFunction<? super O> distanceFunction, LinkageMethod linkage) { super(distanceFunction); this.linkage = linkage; } @@ -111,15 +111,15 @@ public class NaiveAgglomerativeHierarchicalClustering<O, D extends NumberDistanc * @param relation Relation * @return Clustering hierarchy */ - public PointerHierarchyRepresentationResult<DoubleDistance> run(Database db, Relation<O> relation) { - DistanceQuery<O, D> dq = db.getDistanceQuery(relation, getDistanceFunction()); + public PointerHierarchyRepresentationResult run(Database db, Relation<O> relation) { + DistanceQuery<O> dq = db.getDistanceQuery(relation, getDistanceFunction()); ArrayDBIDs ids = DBIDUtil.ensureArray(relation.getDBIDs()); final int size = ids.size(); - if (size > 0x10000) { + if(size > 0x10000) { throw new AbortException("This implementation does not scale to data sets larger than " + 0x10000 + " instances (~17 GB RAM), which results in an integer overflow."); } - if (SingleLinkageMethod.class.isInstance(linkage)) { + if(SingleLinkageMethod.class.isInstance(linkage)) { LOG.verbose("Notice: SLINK is a much faster algorithm for single-linkage clustering!"); } @@ -129,11 +129,11 @@ public class NaiveAgglomerativeHierarchicalClustering<O, D extends NumberDistanc // Position counter - must agree with computeOffset! 
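
The Lance & Williams reference cited above gives the general recurrence used to update cluster distances after a merge: d(k, x∪y) = α_x d(k,x) + α_y d(k,y) + β d(x,y) + γ |d(k,x) − d(k,y)|. A hedged sketch of one instance, group-average (UPGMA) linkage; the helper is hypothetical, its argument order merely mirrors the linkage.combine(...) calls in the merge loop further down (cluster sizes plus the current distances of x and y to a third cluster j).

// Illustrative Lance-Williams update for group-average (UPGMA) linkage.
// alpha_x = |x|/(|x|+|y|), alpha_y = |y|/(|x|+|y|), beta = gamma = 0.
// Hypothetical helper; the argument order mirrors the linkage.combine(...) calls
// in the merge loop below, it is not the ELKI LinkageMethod implementation itself.
static double combineGroupAverage(int sizex, double dxj, int sizey, double dyj, int sizej, double dxy) {
  final double total = sizex + sizey;
  return (sizex / total) * dxj + (sizey / total) * dyj;
}
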
int pos = 0; boolean square = WardLinkageMethod.class.isInstance(linkage) && !(SquaredEuclideanDistanceFunction.class.isInstance(getDistanceFunction())); - for (ix.seek(0); ix.valid(); ix.advance()) { - for (iy.seek(0); iy.getOffset() < ix.getOffset(); iy.advance()) { - scratch[pos] = dq.distance(ix, iy).doubleValue(); + for(ix.seek(0); ix.valid(); ix.advance()) { + for(iy.seek(0); iy.getOffset() < ix.getOffset(); iy.advance()) { + scratch[pos] = dq.distance(ix, iy); // Ward uses variances -- i.e. squared values - if (square) { + if(square) { scratch[pos] *= scratch[pos]; } pos++; @@ -142,9 +142,9 @@ public class NaiveAgglomerativeHierarchicalClustering<O, D extends NumberDistanc // Initialize space for result: WritableDBIDDataStore pi = DataStoreUtil.makeDBIDStorage(ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_STATIC); - WritableDoubleDistanceDataStore lambda = DataStoreUtil.makeDoubleDistanceStorage(ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_STATIC); + WritableDoubleDataStore lambda = DataStoreUtil.makeDoubleStorage(ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_STATIC); WritableIntegerDataStore csize = DataStoreUtil.makeIntegerStorage(ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_TEMP); - for (DBIDIter it = ids.iter(); it.valid(); it.advance()) { + for(DBIDIter it = ids.iter(); it.valid(); it.advance()) { pi.put(it, it); lambda.put(it, Double.POSITIVE_INFINITY); csize.put(it, 1); @@ -152,20 +152,20 @@ public class NaiveAgglomerativeHierarchicalClustering<O, D extends NumberDistanc // Repeat until everything merged into 1 cluster FiniteProgress prog = LOG.isVerbose() ? new FiniteProgress("Agglomerative clustering", size - 1, LOG) : null; - for (int i = 1; i < size; i++) { + for(int i = 1; i < size; i++) { double mindist = Double.POSITIVE_INFINITY; int x = -1, y = -1; - for (ix.seek(0); ix.valid(); ix.advance()) { - if (lambda.doubleValue(ix) < Double.POSITIVE_INFINITY) { + for(ix.seek(0); ix.valid(); ix.advance()) { + if(lambda.doubleValue(ix) < Double.POSITIVE_INFINITY) { continue; } final int xbase = triangleSize(ix.getOffset()); - for (iy.seek(0); iy.getOffset() < ix.getOffset(); iy.advance()) { - if (lambda.doubleValue(iy) < Double.POSITIVE_INFINITY) { + for(iy.seek(0); iy.getOffset() < ix.getOffset(); iy.advance()) { + if(lambda.doubleValue(iy) < Double.POSITIVE_INFINITY) { continue; } final int idx = xbase + iy.getOffset(); - if (scratch[idx] <= mindist) { + if(scratch[idx] <= mindist) { mindist = scratch[idx]; x = ix.getOffset(); y = iy.getOffset(); @@ -176,7 +176,7 @@ public class NaiveAgglomerativeHierarchicalClustering<O, D extends NumberDistanc // Avoid allocating memory, by reusing existing iterators: ix.seek(x); iy.seek(y); - if (LOG.isDebuggingFine()) { + if(LOG.isDebuggingFine()) { LOG.debugFine("Merging: " + DBIDUtil.toString(ix) + " -> " + DBIDUtil.toString(iy)); } // Perform merge in data structure: x -> y @@ -195,8 +195,8 @@ public class NaiveAgglomerativeHierarchicalClustering<O, D extends NumberDistanc ij.seek(0); // Write to (y, j), with j < y - for (; ij.getOffset() < y; ij.advance()) { - if (lambda.doubleValue(ij) < Double.POSITIVE_INFINITY) { + for(; ij.getOffset() < y; ij.advance()) { + if(lambda.doubleValue(ij) < Double.POSITIVE_INFINITY) { continue; } final int sizej = csize.intValue(ij); @@ -204,8 +204,8 @@ public class NaiveAgglomerativeHierarchicalClustering<O, D extends NumberDistanc } ij.advance(); // Skip y // Write to (j, y), with y < j < x - for (; ij.getOffset() < x; ij.advance()) { - if (lambda.doubleValue(ij) < 
Double.POSITIVE_INFINITY) { + for(; ij.getOffset() < x; ij.advance()) { + if(lambda.doubleValue(ij) < Double.POSITIVE_INFINITY) { continue; } final int jbase = triangleSize(ij.getOffset()); @@ -214,23 +214,19 @@ public class NaiveAgglomerativeHierarchicalClustering<O, D extends NumberDistanc } ij.advance(); // Skip x // Write to (j, y), with y < x < j - for (; ij.valid(); ij.advance()) { - if (lambda.doubleValue(ij) < Double.POSITIVE_INFINITY) { + for(; ij.valid(); ij.advance()) { + if(lambda.doubleValue(ij) < Double.POSITIVE_INFINITY) { continue; } final int sizej = csize.intValue(ij); final int jbase = triangleSize(ij.getOffset()); scratch[jbase + y] = linkage.combine(sizex, scratch[jbase + x], sizey, scratch[jbase + y], sizej, mindist); } - if (prog != null) { - prog.incrementProcessed(LOG); - } - } - if (prog != null) { - prog.ensureCompleted(LOG); + LOG.incrementProcessed(prog); } + LOG.ensureCompleted(prog); - return new PointerHierarchyRepresentationResult<>(ids, pi, lambda); + return new PointerHierarchyRepresentationResult(ids, pi, lambda); } /** @@ -244,11 +240,6 @@ public class NaiveAgglomerativeHierarchicalClustering<O, D extends NumberDistanc } @Override - public DoubleDistance getDistanceFactory() { - return DoubleDistance.FACTORY; - } - - @Override public TypeInformation[] getInputTypeRestriction() { // The input relation must match our distance function: return TypeUtil.array(getDistanceFunction().getInputTypeRestriction()); @@ -267,9 +258,8 @@ public class NaiveAgglomerativeHierarchicalClustering<O, D extends NumberDistanc * @apiviz.exclude * * @param <O> Object type - * @param <D> Distance type */ - public static class Parameterizer<O, D extends NumberDistance<D, ?>> extends AbstractDistanceBasedAlgorithm.Parameterizer<O, D> { + public static class Parameterizer<O> extends AbstractDistanceBasedAlgorithm.Parameterizer<O> { /** * Option ID for linkage parameter. */ @@ -283,20 +273,20 @@ public class NaiveAgglomerativeHierarchicalClustering<O, D extends NumberDistanc @Override protected void makeOptions(Parameterization config) { // We don't call super, because we want a different default distance. 
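
The merge loop above keeps all pairwise distances in the one-dimensional scratch array, addressed through triangleSize(...): the distance of a pair (x, y) with y < x is stored at triangleSize(x) + y. A small sketch under the assumption that triangleSize(x) = x*(x-1)/2, which is consistent with the accesses shown above (the helper's body lies outside the hunks in this diff):

// Packed lower-triangular distance matrix, as used by the O(n^3) merge loop.
// Assumption: triangleSize(x) = x*(x-1)/2, consistent with the
// scratch[triangleSize(x) + y] accesses (y < x) above.
static int triangleSize(int x) {
  return (x * (x - 1)) >>> 1;
}

static double distance(double[] scratch, int x, int y) {
  assert x != y;
  return (y < x) ? scratch[triangleSize(x) + y] : scratch[triangleSize(y) + x];
}

// A data set of n objects needs triangleSize(n) entries:
// double[] scratch = new double[triangleSize(n)];
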
- ObjectParameter<DistanceFunction<O, D>> distanceFunctionP = makeParameterDistanceFunction(SquaredEuclideanDistanceFunction.class, DistanceFunction.class); - if (config.grab(distanceFunctionP)) { + ObjectParameter<DistanceFunction<O>> distanceFunctionP = makeParameterDistanceFunction(SquaredEuclideanDistanceFunction.class, DistanceFunction.class); + if(config.grab(distanceFunctionP)) { distanceFunction = distanceFunctionP.instantiateClass(config); } ObjectParameter<LinkageMethod> linkageP = new ObjectParameter<>(LINKAGE_ID, LinkageMethod.class); linkageP.setDefaultValue(WardLinkageMethod.class); - if (config.grab(linkageP)) { + if(config.grab(linkageP)) { linkage = linkageP.instantiateClass(config); } } @Override - protected NaiveAgglomerativeHierarchicalClustering<O, D> makeInstance() { + protected NaiveAgglomerativeHierarchicalClustering<O> makeInstance() { return new NaiveAgglomerativeHierarchicalClustering<>(distanceFunction, linkage); } } diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/hierarchical/PointerHierarchyRepresentationResult.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/hierarchical/PointerHierarchyRepresentationResult.java index c339fb09..3f6f4155 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/hierarchical/PointerHierarchyRepresentationResult.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/hierarchical/PointerHierarchyRepresentationResult.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.hierarchical; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2013 + Copyright (C) 2014 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team @@ -24,9 +24,8 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.hierarchical; */ import de.lmu.ifi.dbs.elki.database.datastore.DBIDDataStore; -import de.lmu.ifi.dbs.elki.database.datastore.DataStore; +import de.lmu.ifi.dbs.elki.database.datastore.DoubleDataStore; import de.lmu.ifi.dbs.elki.database.ids.DBIDs; -import de.lmu.ifi.dbs.elki.distance.distancevalue.Distance; import de.lmu.ifi.dbs.elki.result.BasicResult; /** @@ -35,10 +34,8 @@ import de.lmu.ifi.dbs.elki.result.BasicResult; * objects cluster. * * @author Erich Schubert - * - * @param <D> Distance type */ -public class PointerHierarchyRepresentationResult<D extends Distance<D>> extends BasicResult { +public class PointerHierarchyRepresentationResult extends BasicResult { /** * The DBIDs in this result. */ @@ -52,7 +49,7 @@ public class PointerHierarchyRepresentationResult<D extends Distance<D>> extends /** * Distance to the parent object. */ - DataStore<D> parentDistance; + DoubleDataStore parentDistance; /** * Constructor. @@ -61,7 +58,7 @@ public class PointerHierarchyRepresentationResult<D extends Distance<D>> extends * @param parent Parent pointer. * @param parentDistance Distance to parent. */ - public PointerHierarchyRepresentationResult(DBIDs ids, DBIDDataStore parent, DataStore<D> parentDistance) { + public PointerHierarchyRepresentationResult(DBIDs ids, DBIDDataStore parent, DoubleDataStore parentDistance) { super("Pointer Representation", "pointer-representation"); this.ids = ids; this.parent = parent; @@ -91,7 +88,7 @@ public class PointerHierarchyRepresentationResult<D extends Distance<D>> extends * * @return Parent distance. 
*/ - public DataStore<D> getParentDistanceStore() { + public DoubleDataStore getParentDistanceStore() { return parentDistance; } } diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/hierarchical/SLINK.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/hierarchical/SLINK.java index f1b58868..3e8f0fdb 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/hierarchical/SLINK.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/hierarchical/SLINK.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.hierarchical; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2013 + Copyright (C) 2014 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team @@ -30,8 +30,7 @@ import de.lmu.ifi.dbs.elki.database.Database; import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory; import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil; import de.lmu.ifi.dbs.elki.database.datastore.WritableDBIDDataStore; -import de.lmu.ifi.dbs.elki.database.datastore.WritableDataStore; -import de.lmu.ifi.dbs.elki.database.datastore.WritableDoubleDistanceDataStore; +import de.lmu.ifi.dbs.elki.database.datastore.WritableDoubleDataStore; import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; import de.lmu.ifi.dbs.elki.database.ids.DBIDRef; import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; @@ -40,10 +39,8 @@ import de.lmu.ifi.dbs.elki.database.ids.DBIDs; import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs; import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery; import de.lmu.ifi.dbs.elki.database.relation.Relation; -import de.lmu.ifi.dbs.elki.distance.DistanceUtil; import de.lmu.ifi.dbs.elki.distance.distancefunction.DistanceFunction; -import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDoubleDistanceFunction; -import de.lmu.ifi.dbs.elki.distance.distancevalue.Distance; +import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction; import de.lmu.ifi.dbs.elki.logging.Logging; import de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress; import de.lmu.ifi.dbs.elki.utilities.Alias; @@ -67,13 +64,12 @@ import de.lmu.ifi.dbs.elki.utilities.documentation.Title; * @apiviz.has SingleLinkageMethod * * @param <O> the type of DatabaseObject the algorithm is applied on - * @param <D> the type of Distance used */ @Title("SLINK: Single Link Clustering") @Description("Hierarchical clustering algorithm based on single-link connectivity.") @Reference(authors = "R. Sibson", title = "SLINK: An optimally efficient algorithm for the single-link cluster method", booktitle = "The Computer Journal 16 (1973), No. 1, p. 30-34.", url = "http://dx.doi.org/10.1093/comjnl/16.1.30") @Alias(value = { "de.lmu.ifi.dbs.elki.algorithm.clustering.SLINK", "clustering.SLINK", "SLINK", "single-link", "single-linkage" }) -public class SLINK<O, D extends Distance<D>> extends AbstractDistanceBasedAlgorithm<O, D, PointerHierarchyRepresentationResult<D>> implements HierarchicalClusteringAlgorithm<D> { +public class SLINK<O> extends AbstractDistanceBasedAlgorithm<O, PointerHierarchyRepresentationResult> implements HierarchicalClusteringAlgorithm { /** * The logger for this class. */ @@ -84,70 +80,60 @@ public class SLINK<O, D extends Distance<D>> extends AbstractDistanceBasedAlgori * * @param distanceFunction Distance function */ - public SLINK(DistanceFunction<? super O, D> distanceFunction) { + public SLINK(DistanceFunction<? 
super O> distanceFunction) { super(distanceFunction); } /** * Performs the SLINK algorithm on the given database. + * + * @param database Database to process + * @param relation Data relation to use */ - public PointerHierarchyRepresentationResult<D> run(Database database, Relation<O> relation) { + public PointerHierarchyRepresentationResult run(Database database, Relation<O> relation) { DBIDs ids = relation.getDBIDs(); - DistanceQuery<O, D> distQuery = database.getDistanceQuery(relation, getDistanceFunction()); - @SuppressWarnings("unchecked") - Class<D> distCls = (Class<D>) getDistanceFunction().getDistanceFactory().getClass(); WritableDBIDDataStore pi = DataStoreUtil.makeDBIDStorage(ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_STATIC); - WritableDataStore<D> lambda = DataStoreUtil.makeStorage(ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_STATIC, distCls); + WritableDoubleDataStore lambda = DataStoreUtil.makeDoubleStorage(ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_STATIC); // Temporary storage for m. - WritableDataStore<D> m = DataStoreUtil.makeStorage(ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_TEMP, distCls); + WritableDoubleDataStore m = DataStoreUtil.makeDoubleStorage(ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_TEMP); FiniteProgress progress = LOG.isVerbose() ? new FiniteProgress("Running SLINK", ids.size(), LOG) : null; // has to be an array for monotonicity reasons! ModifiableDBIDs processedIDs = DBIDUtil.newArray(ids.size()); - // Optimized code path for double distances - if (getDistanceFunction() instanceof PrimitiveDoubleDistanceFunction && lambda instanceof WritableDoubleDistanceDataStore && m instanceof WritableDoubleDistanceDataStore) { - @SuppressWarnings("unchecked") - PrimitiveDoubleDistanceFunction<? super O> dist = (PrimitiveDoubleDistanceFunction<? super O>) getDistanceFunction(); - WritableDoubleDistanceDataStore lambdad = (WritableDoubleDistanceDataStore) lambda; - WritableDoubleDistanceDataStore md = (WritableDoubleDistanceDataStore) m; - // apply the algorithm - for (DBIDIter id = ids.iter(); id.valid(); id.advance()) { - step1double(id, pi, lambdad); - step2double(id, processedIDs, distQuery.getRelation(), dist, md); - step3double(id, pi, lambdad, processedIDs, md); - step4double(id, pi, lambdad, processedIDs); + if(getDistanceFunction() instanceof PrimitiveDistanceFunction) { + PrimitiveDistanceFunction<? super O> distf = (PrimitiveDistanceFunction<? super O>) getDistanceFunction(); + for(DBIDIter id = ids.iter(); id.valid(); id.advance()) { + step1(id, pi, lambda); + step2primitive(id, processedIDs, relation, distf, m); + step3(id, pi, lambda, processedIDs, m); + step4(id, pi, lambda, processedIDs); processedIDs.add(id); - if (progress != null) { - progress.incrementProcessed(LOG); - } + LOG.incrementProcessed(progress); } - } else { - // apply the algorithm - for (DBIDIter id = ids.iter(); id.valid(); id.advance()) { + } + else { + DistanceQuery<O> distQ = database.getDistanceQuery(relation, getDistanceFunction()); + for(DBIDIter id = ids.iter(); id.valid(); id.advance()) { step1(id, pi, lambda); - step2(id, processedIDs, distQuery, m); + step2(id, processedIDs, distQ, m); step3(id, pi, lambda, processedIDs, m); step4(id, pi, lambda, processedIDs); processedIDs.add(id); - if (progress != null) { - progress.incrementProcessed(LOG); - } + LOG.incrementProcessed(progress); } } - if (progress != null) { - progress.ensureCompleted(LOG); - } + LOG.ensureCompleted(progress); // We don't need m anymore. 
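
SLINK builds the same pi/lambda pointer representation in O(n^2) time with O(n) working memory: after inserting point n+1, pi(i) is the cluster representative i is merged with, and lambda(i) the distance of that merge. A compact array-based sketch of the four steps (step1..step4) used above, assuming a plain symmetric double[][] distance matrix instead of ELKI's data stores and distance queries:

// Array-based SLINK sketch (Sibson 1973); mirrors step1..step4 above,
// but on a plain symmetric distance matrix d. pi and lambda must have length d.length.
static void slink(double[][] d, int[] pi, double[] lambda) {
  final int n = d.length;
  double[] m = new double[n];
  for(int i = 0; i < n; i++) {
    pi[i] = i;                                // step 1: P(n+1) = n+1
    lambda[i] = Double.POSITIVE_INFINITY;     //         L(n+1) = infinity
    for(int j = 0; j < i; j++) {
      m[j] = d[i][j];                         // step 2: M(j) = dist(j, n+1)
    }
    for(int j = 0; j < i; j++) {              // step 3: determine P and L
      if(lambda[j] >= m[j]) {
        m[pi[j]] = Math.min(m[pi[j]], lambda[j]);
        lambda[j] = m[j];
        pi[j] = i;
      }
      else {
        m[pi[j]] = Math.min(m[pi[j]], m[j]);
      }
    }
    for(int j = 0; j < i; j++) {              // step 4: actualize the clusters
      if(lambda[j] >= lambda[pi[j]]) {
        pi[j] = i;
      }
    }
  }
}
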
m.destroy(); m = null; - return new PointerHierarchyRepresentationResult<>(ids, pi, lambda); + return new PointerHierarchyRepresentationResult(ids, pi, lambda); } /** @@ -158,11 +144,11 @@ public class SLINK<O, D extends Distance<D>> extends AbstractDistanceBasedAlgori * @param pi Pi data store * @param lambda Lambda data store */ - private void step1(DBIDRef id, WritableDBIDDataStore pi, WritableDataStore<D> lambda) { + private void step1(DBIDRef id, WritableDBIDDataStore pi, WritableDoubleDataStore lambda) { // P(n+1) = n+1: pi.put(id, id); // L(n+1) = infinity - lambda.put(id, getDistanceFunction().getDistanceFactory().infiniteDistance()); + lambda.putDouble(id, Double.POSITIVE_INFINITY); } /** @@ -172,93 +158,17 @@ public class SLINK<O, D extends Distance<D>> extends AbstractDistanceBasedAlgori * @param id the id of the object to be inserted into the pointer * representation * @param processedIDs the already processed ids + * @param distQuery Distnace query * @param m Data store - * @param distFunc Distance function to use */ - private void step2(DBIDRef id, DBIDs processedIDs, DistanceQuery<O, D> distFunc, WritableDataStore<D> m) { - O newObj = distFunc.getRelation().get(id); - for (DBIDIter it = processedIDs.iter(); it.valid(); it.advance()) { + private void step2(DBIDRef id, DBIDs processedIDs, DistanceQuery<? super O> distQuery, WritableDoubleDataStore m) { + for(DBIDIter it = processedIDs.iter(); it.valid(); it.advance()) { // M(i) = dist(i, n+1) - m.put(it, distFunc.distance(it, newObj)); - } - } - - /** - * Third step: Determine the values for P and L - * - * @param id the id of the object to be inserted into the pointer - * representation - * @param pi Pi data store - * @param lambda Lambda data store - * @param processedIDs the already processed ids - * @param m Data store - */ - private void step3(DBIDRef id, WritableDBIDDataStore pi, WritableDataStore<D> lambda, DBIDs processedIDs, WritableDataStore<D> m) { - DBIDVar p_i = DBIDUtil.newVar(); - // for i = 1..n - for (DBIDIter it = processedIDs.iter(); it.valid(); it.advance()) { - D l_i = lambda.get(it); - D m_i = m.get(it); - pi.assignVar(it, p_i); // p_i = pi(it) - D mp_i = m.get(p_i); - - // if L(i) >= M(i) - if (l_i.compareTo(m_i) >= 0) { - // M(P(i)) = min { M(P(i)), L(i) } - m.put(p_i, DistanceUtil.min(mp_i, l_i)); - - // L(i) = M(i) - lambda.put(it, m_i); - - // P(i) = n+1; - pi.put(it, id); - } else { - // M(P(i)) = min { M(P(i)), M(i) } - m.put(p_i, DistanceUtil.min(mp_i, m_i)); - } + m.putDouble(it, distQuery.distance(it, id)); } } /** - * Fourth step: Actualize the clusters if necessary - * - * @param id the id of the current object - * @param pi Pi data store - * @param lambda Lambda data store - * @param processedIDs the already processed ids - */ - private void step4(DBIDRef id, WritableDBIDDataStore pi, WritableDataStore<D> lambda, DBIDs processedIDs) { - DBIDVar p_i = DBIDUtil.newVar(); - // for i = 1..n - for (DBIDIter it = processedIDs.iter(); it.valid(); it.advance()) { - D l_i = lambda.get(it); - pi.assignVar(it, p_i); // p_i = pi(it) - D lp_i = lambda.get(p_i); - - // if L(i) >= L(P(i)) - if (l_i.compareTo(lp_i) >= 0) { - // P(i) = n+1 - pi.put(it, id); - } - } - } - - /** - * First step: Initialize P(id) = id, L(id) = infinity. 
- * - * @param id the id of the object to be inserted into the pointer - * representation - * @param pi Pi data store - * @param lambda Lambda data store - */ - private void step1double(DBIDRef id, WritableDBIDDataStore pi, WritableDoubleDistanceDataStore lambda) { - // P(n+1) = n+1: - pi.put(id, id); - // L(n+1) = infinity - lambda.putDouble(id, Double.POSITIVE_INFINITY); - } - - /** * Second step: Determine the pairwise distances from all objects in the * pointer representation to the new object with the specified id. * @@ -269,11 +179,11 @@ public class SLINK<O, D extends Distance<D>> extends AbstractDistanceBasedAlgori * @param relation Data relation * @param distFunc Distance function to use */ - private void step2double(DBIDRef id, DBIDs processedIDs, Relation<? extends O> relation, PrimitiveDoubleDistanceFunction<? super O> distFunc, WritableDoubleDistanceDataStore m) { + private void step2primitive(DBIDRef id, DBIDs processedIDs, Relation<? extends O> relation, PrimitiveDistanceFunction<? super O> distFunc, WritableDoubleDataStore m) { O newObj = relation.get(id); - for (DBIDIter it = processedIDs.iter(); it.valid(); it.advance()) { + for(DBIDIter it = processedIDs.iter(); it.valid(); it.advance()) { // M(i) = dist(i, n+1) - m.putDouble(it, distFunc.doubleDistance(relation.get(it), newObj)); + m.putDouble(it, distFunc.distance(relation.get(it), newObj)); } } @@ -287,17 +197,17 @@ public class SLINK<O, D extends Distance<D>> extends AbstractDistanceBasedAlgori * @param processedIDs the already processed ids * @param m Data store */ - private void step3double(DBIDRef id, WritableDBIDDataStore pi, WritableDoubleDistanceDataStore lambda, DBIDs processedIDs, WritableDoubleDistanceDataStore m) { + private void step3(DBIDRef id, WritableDBIDDataStore pi, WritableDoubleDataStore lambda, DBIDs processedIDs, WritableDoubleDataStore m) { DBIDVar p_i = DBIDUtil.newVar(); // for i = 1..n - for (DBIDIter it = processedIDs.iter(); it.valid(); it.advance()) { + for(DBIDIter it = processedIDs.iter(); it.valid(); it.advance()) { double l_i = lambda.doubleValue(it); double m_i = m.doubleValue(it); pi.assignVar(it, p_i); // p_i = pi(it) double mp_i = m.doubleValue(p_i); // if L(i) >= M(i) - if (l_i >= m_i) { + if(l_i >= m_i) { // M(P(i)) = min { M(P(i)), L(i) } m.putDouble(p_i, Math.min(mp_i, l_i)); @@ -306,7 +216,8 @@ public class SLINK<O, D extends Distance<D>> extends AbstractDistanceBasedAlgori // P(i) = n+1; pi.put(it, id); - } else { + } + else { // M(P(i)) = min { M(P(i)), M(i) } m.putDouble(p_i, Math.min(mp_i, m_i)); } @@ -321,16 +232,16 @@ public class SLINK<O, D extends Distance<D>> extends AbstractDistanceBasedAlgori * @param lambda Lambda data store * @param processedIDs the already processed ids */ - private void step4double(DBIDRef id, WritableDBIDDataStore pi, WritableDoubleDistanceDataStore lambda, DBIDs processedIDs) { + private void step4(DBIDRef id, WritableDBIDDataStore pi, WritableDoubleDataStore lambda, DBIDs processedIDs) { DBIDVar p_i = DBIDUtil.newVar(); // for i = 1..n - for (DBIDIter it = processedIDs.iter(); it.valid(); it.advance()) { + for(DBIDIter it = processedIDs.iter(); it.valid(); it.advance()) { double l_i = lambda.doubleValue(it); pi.assignVar(it, p_i); // p_i = pi(it) double lp_i = lambda.doubleValue(p_i); // if L(i) >= L(P(i)) - if (l_i >= lp_i) { + if(l_i >= lp_i) { // P(i) = n+1 pi.put(it, id); } @@ -338,11 +249,6 @@ public class SLINK<O, D extends Distance<D>> extends AbstractDistanceBasedAlgori } @Override - public D getDistanceFactory() { - return 
getDistanceFunction().getDistanceFactory(); - } - - @Override public TypeInformation[] getInputTypeRestriction() { return TypeUtil.array(getDistanceFunction().getInputTypeRestriction()); } @@ -359,9 +265,9 @@ public class SLINK<O, D extends Distance<D>> extends AbstractDistanceBasedAlgori * * @apiviz.exclude */ - public static class Parameterizer<O, D extends Distance<D>> extends AbstractDistanceBasedAlgorithm.Parameterizer<O, D> { + public static class Parameterizer<O> extends AbstractDistanceBasedAlgorithm.Parameterizer<O> { @Override - protected SLINK<O, D> makeInstance() { + protected SLINK<O> makeInstance() { return new SLINK<>(distanceFunction); } } diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/hierarchical/SingleLinkageMethod.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/hierarchical/SingleLinkageMethod.java index 7ef81692..83543411 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/hierarchical/SingleLinkageMethod.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/hierarchical/SingleLinkageMethod.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.hierarchical; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2013 + Copyright (C) 2014 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/hierarchical/WardLinkageMethod.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/hierarchical/WardLinkageMethod.java index 488f011c..dc6f7079 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/hierarchical/WardLinkageMethod.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/hierarchical/WardLinkageMethod.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.hierarchical; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2013 + Copyright (C) 2014 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/hierarchical/WeightedAverageLinkageMethod.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/hierarchical/WeightedAverageLinkageMethod.java index ac0b17f5..df913a89 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/hierarchical/WeightedAverageLinkageMethod.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/hierarchical/WeightedAverageLinkageMethod.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.hierarchical; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2013 + Copyright (C) 2014 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/hierarchical/package-info.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/hierarchical/package-info.java new file mode 100644 index 00000000..1783800c --- /dev/null +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/hierarchical/package-info.java @@ -0,0 +1,27 @@ +/** + * Hierarchical agglomerative clustering (HAC). 
+ */
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2014
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+package de.lmu.ifi.dbs.elki.algorithm.clustering.hierarchical;
+
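
With the Distance abstraction gone, the pieces above compose as follows. A hedged usage sketch: the constructor and run(Database) signatures are taken from the hunks in this diff, while the nested OutputMode enum constant is inferred from the switch cases in extractClusters (it is not imported anywhere in the patch, so it is assumed to be declared inside ExtractFlatClusteringFromHierarchy).

import de.lmu.ifi.dbs.elki.algorithm.clustering.hierarchical.ExtractFlatClusteringFromHierarchy;
import de.lmu.ifi.dbs.elki.algorithm.clustering.hierarchical.SLINK;
import de.lmu.ifi.dbs.elki.data.Clustering;
import de.lmu.ifi.dbs.elki.data.model.DendrogramModel;
import de.lmu.ifi.dbs.elki.database.Database;
import de.lmu.ifi.dbs.elki.distance.distancefunction.DistanceFunction;

final class FlatSingleLinkExample {
  // Run single-linkage clustering, then cut the resulting pointer hierarchy
  // into a strict partition at the given distance threshold.
  // OutputMode.STRICT_PARTITIONS is assumed from the switch cases in the diff.
  static <O> Clustering<DendrogramModel> flatSingleLink(Database db, DistanceFunction<? super O> dist, double cut) {
    SLINK<O> slink = new SLINK<>(dist);
    ExtractFlatClusteringFromHierarchy extract = new ExtractFlatClusteringFromHierarchy(
        slink, cut, ExtractFlatClusteringFromHierarchy.OutputMode.STRICT_PARTITIONS, false);
    return extract.run(db);
  }
}
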