package de.lmu.ifi.dbs.elki.algorithm.clustering.hierarchical;
/*
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
Copyright (C) 2014
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
import de.lmu.ifi.dbs.elki.algorithm.AbstractDistanceBasedAlgorithm;
import de.lmu.ifi.dbs.elki.data.type.TypeInformation;
import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
import de.lmu.ifi.dbs.elki.database.Database;
import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory;
import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil;
import de.lmu.ifi.dbs.elki.database.datastore.WritableDBIDDataStore;
import de.lmu.ifi.dbs.elki.database.datastore.WritableDoubleDataStore;
import de.lmu.ifi.dbs.elki.database.datastore.WritableIntegerDataStore;
import de.lmu.ifi.dbs.elki.database.ids.ArrayDBIDs;
import de.lmu.ifi.dbs.elki.database.ids.DBIDArrayIter;
import de.lmu.ifi.dbs.elki.database.ids.DBIDIter;
import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil;
import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery;
import de.lmu.ifi.dbs.elki.database.relation.Relation;
import de.lmu.ifi.dbs.elki.distance.distancefunction.DistanceFunction;
import de.lmu.ifi.dbs.elki.distance.distancefunction.minkowski.SquaredEuclideanDistanceFunction;
import de.lmu.ifi.dbs.elki.logging.Logging;
import de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress;
import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
import de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter;
/**
* Hierarchical Agglomerative Clustering (HAC) is a classic hierarchical
* clustering algorithm. Initially, each element is its own cluster; the closest
* clusters are merged at every step, until all the data has become a single
* cluster.
*
* This is the naive O(n^3) algorithm. See {@link SLINK} for a much faster
* algorithm (however, only for single-linkage).
*
* Reference for the unified concept:
*
* G. N. Lance and W. T. Williams
* A general theory of classificatory sorting strategies 1. Hierarchical systems
*
* The computer journal 9.4 (1967): 373-380.
*
*
* See also:
*
* A Review of Classification
* R. M. Cormack
* Journal of the Royal Statistical Society. Series A, Vol. 134, No. 3
*
*
* @author Erich Schubert
*
* @apiviz.composedOf LinkageMethod
*
* @param Object type
*/
@Reference(authors = "G. N. Lance and W. T. Williams", //
title = "A general theory of classificatory sorting strategies 1. Hierarchical systems", //
booktitle = "The computer journal 9.4", //
url = "http://dx.doi.org/ 10.1093/comjnl/9.4.373")
public class NaiveAgglomerativeHierarchicalClustering extends AbstractDistanceBasedAlgorithm implements HierarchicalClusteringAlgorithm {
/**
* Class logger
*/
private static final Logging LOG = Logging.getLogger(NaiveAgglomerativeHierarchicalClustering.class);
/**
* Current linkage method in use.
*/
LinkageMethod linkage = WardLinkageMethod.STATIC;
/**
* Constructor.
*
* @param distanceFunction Distance function to use
* @param linkage Linkage method
*/
public NaiveAgglomerativeHierarchicalClustering(DistanceFunction super O> distanceFunction, LinkageMethod linkage) {
super(distanceFunction);
this.linkage = linkage;
}
/**
* Run the algorithm
*
* @param db Database
* @param relation Relation
* @return Clustering hierarchy
*/
public PointerHierarchyRepresentationResult run(Database db, Relation relation) {
DistanceQuery dq = db.getDistanceQuery(relation, getDistanceFunction());
ArrayDBIDs ids = DBIDUtil.ensureArray(relation.getDBIDs());
final int size = ids.size();
if(size > 0x10000) {
throw new AbortException("This implementation does not scale to data sets larger than " + 0x10000 + " instances (~17 GB RAM), which results in an integer overflow.");
}
if(SingleLinkageMethod.class.isInstance(linkage)) {
LOG.verbose("Notice: SLINK is a much faster algorithm for single-linkage clustering!");
}
// Compute the initial (lower triangular) distance matrix.
double[] scratch = new double[triangleSize(size)];
DBIDArrayIter ix = ids.iter(), iy = ids.iter(), ij = ids.iter();
// Position counter - must agree with computeOffset!
int pos = 0;
boolean square = WardLinkageMethod.class.isInstance(linkage) && !(SquaredEuclideanDistanceFunction.class.isInstance(getDistanceFunction()));
for(ix.seek(0); ix.valid(); ix.advance()) {
for(iy.seek(0); iy.getOffset() < ix.getOffset(); iy.advance()) {
scratch[pos] = dq.distance(ix, iy);
// Ward uses variances -- i.e. squared values
if(square) {
scratch[pos] *= scratch[pos];
}
pos++;
}
}
// Initialize space for result:
WritableDBIDDataStore pi = DataStoreUtil.makeDBIDStorage(ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_STATIC);
WritableDoubleDataStore lambda = DataStoreUtil.makeDoubleStorage(ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_STATIC);
WritableIntegerDataStore csize = DataStoreUtil.makeIntegerStorage(ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_TEMP);
for(DBIDIter it = ids.iter(); it.valid(); it.advance()) {
pi.put(it, it);
lambda.put(it, Double.POSITIVE_INFINITY);
csize.put(it, 1);
}
// Repeat until everything merged into 1 cluster
FiniteProgress prog = LOG.isVerbose() ? new FiniteProgress("Agglomerative clustering", size - 1, LOG) : null;
for(int i = 1; i < size; i++) {
double mindist = Double.POSITIVE_INFINITY;
int x = -1, y = -1;
for(ix.seek(0); ix.valid(); ix.advance()) {
if(lambda.doubleValue(ix) < Double.POSITIVE_INFINITY) {
continue;
}
final int xbase = triangleSize(ix.getOffset());
for(iy.seek(0); iy.getOffset() < ix.getOffset(); iy.advance()) {
if(lambda.doubleValue(iy) < Double.POSITIVE_INFINITY) {
continue;
}
final int idx = xbase + iy.getOffset();
if(scratch[idx] <= mindist) {
mindist = scratch[idx];
x = ix.getOffset();
y = iy.getOffset();
}
}
}
assert (x >= 0 && y >= 0);
// Avoid allocating memory, by reusing existing iterators:
ix.seek(x);
iy.seek(y);
if(LOG.isDebuggingFine()) {
LOG.debugFine("Merging: " + DBIDUtil.toString(ix) + " -> " + DBIDUtil.toString(iy));
}
// Perform merge in data structure: x -> y
// Since y < x, prefer keeping y, dropping x.
lambda.put(ix, mindist);
pi.put(ix, iy);
// Merge into cluster
int sizex = csize.intValue(ix), sizey = csize.intValue(iy);
csize.put(iy, sizex + sizey);
// Update distance matrix. Note: miny < minx
// Implementation note: most will not need sizej, and could save the
// hashmap lookup.
final int xbase = triangleSize(x), ybase = triangleSize(y);
ij.seek(0);
// Write to (y, j), with j < y
for(; ij.getOffset() < y; ij.advance()) {
if(lambda.doubleValue(ij) < Double.POSITIVE_INFINITY) {
continue;
}
final int sizej = csize.intValue(ij);
scratch[ybase + ij.getOffset()] = linkage.combine(sizex, scratch[xbase + ij.getOffset()], sizey, scratch[ybase + ij.getOffset()], sizej, mindist);
}
ij.advance(); // Skip y
// Write to (j, y), with y < j < x
for(; ij.getOffset() < x; ij.advance()) {
if(lambda.doubleValue(ij) < Double.POSITIVE_INFINITY) {
continue;
}
final int jbase = triangleSize(ij.getOffset());
final int sizej = csize.intValue(ij);
scratch[jbase + y] = linkage.combine(sizex, scratch[xbase + ij.getOffset()], sizey, scratch[jbase + y], sizej, mindist);
}
ij.advance(); // Skip x
// Write to (j, y), with y < x < j
for(; ij.valid(); ij.advance()) {
if(lambda.doubleValue(ij) < Double.POSITIVE_INFINITY) {
continue;
}
final int sizej = csize.intValue(ij);
final int jbase = triangleSize(ij.getOffset());
scratch[jbase + y] = linkage.combine(sizex, scratch[jbase + x], sizey, scratch[jbase + y], sizej, mindist);
}
LOG.incrementProcessed(prog);
}
LOG.ensureCompleted(prog);
return new PointerHierarchyRepresentationResult(ids, pi, lambda);
}
/**
* Compute the size of a complete x by x triangle (minus diagonal)
*
* @param x Offset
* @return Size of complete triangle
*/
protected static int triangleSize(int x) {
return (x * (x - 1)) >>> 1;
}
@Override
public TypeInformation[] getInputTypeRestriction() {
// The input relation must match our distance function:
return TypeUtil.array(getDistanceFunction().getInputTypeRestriction());
}
@Override
protected Logging getLogger() {
return LOG;
}
/**
* Parameterization class
*
* @author Erich Schubert
*
* @apiviz.exclude
*
* @param Object type
*/
public static class Parameterizer extends AbstractDistanceBasedAlgorithm.Parameterizer {
/**
* Option ID for linkage parameter.
*/
public static final OptionID LINKAGE_ID = new OptionID("hierarchical.linkage", "Linkage method to use (e.g. Ward, Single-Link)");
/**
* Current linkage in use.
*/
protected LinkageMethod linkage;
@Override
protected void makeOptions(Parameterization config) {
// We don't call super, because we want a different default distance.
ObjectParameter> distanceFunctionP = makeParameterDistanceFunction(SquaredEuclideanDistanceFunction.class, DistanceFunction.class);
if(config.grab(distanceFunctionP)) {
distanceFunction = distanceFunctionP.instantiateClass(config);
}
ObjectParameter linkageP = new ObjectParameter<>(LINKAGE_ID, LinkageMethod.class);
linkageP.setDefaultValue(WardLinkageMethod.class);
if(config.grab(linkageP)) {
linkage = linkageP.instantiateClass(config);
}
}
@Override
protected NaiveAgglomerativeHierarchicalClustering makeInstance() {
return new NaiveAgglomerativeHierarchicalClustering<>(distanceFunction, linkage);
}
}
}