package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans; /* This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures Copyright (C) 2015 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team This program is free software: you can redistribute it and/or modify it under the terms of the GNU Affero General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more details. You should have received a copy of the GNU Affero General Public License along with this program. If not, see . */ import java.util.Arrays; import de.lmu.ifi.dbs.elki.algorithm.AbstractDistanceBasedAlgorithm; import de.lmu.ifi.dbs.elki.algorithm.clustering.ClusteringAlgorithmUtil; import de.lmu.ifi.dbs.elki.algorithm.clustering.ClusteringAlgorithm; import de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.initialization.KMedoidsInitialization; import de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.initialization.PAMInitialMeans; import de.lmu.ifi.dbs.elki.data.Cluster; import de.lmu.ifi.dbs.elki.data.Clustering; import de.lmu.ifi.dbs.elki.data.model.MedoidModel; import de.lmu.ifi.dbs.elki.data.type.TypeInformation; import de.lmu.ifi.dbs.elki.data.type.TypeUtil; import de.lmu.ifi.dbs.elki.database.Database; import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory; import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil; import de.lmu.ifi.dbs.elki.database.datastore.WritableDoubleDataStore; import de.lmu.ifi.dbs.elki.database.datastore.WritableIntegerDataStore; import de.lmu.ifi.dbs.elki.database.ids.ArrayDBIDs; import de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs; import de.lmu.ifi.dbs.elki.database.ids.DBIDArrayIter; import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; import de.lmu.ifi.dbs.elki.database.ids.DBIDRange; import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; import de.lmu.ifi.dbs.elki.database.ids.DBIDVar; import de.lmu.ifi.dbs.elki.database.ids.DBIDs; import de.lmu.ifi.dbs.elki.database.query.DatabaseQuery; import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery; import de.lmu.ifi.dbs.elki.database.relation.Relation; import de.lmu.ifi.dbs.elki.distance.distancefunction.DistanceFunction; import de.lmu.ifi.dbs.elki.index.distancematrix.PrecomputedDistanceMatrix; import de.lmu.ifi.dbs.elki.logging.Logging; import de.lmu.ifi.dbs.elki.logging.progress.IndefiniteProgress; import de.lmu.ifi.dbs.elki.logging.statistics.DoubleStatistic; import de.lmu.ifi.dbs.elki.logging.statistics.LongStatistic; import de.lmu.ifi.dbs.elki.logging.statistics.StringStatistic; import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; import de.lmu.ifi.dbs.elki.utilities.documentation.Title; import de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException; import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter; /** * The original PAM algorithm or k-medoids clustering, as proposed by Kaufman * and Rousseeuw in "Partitioning Around Medoids". * * Reference: *

* Clustering my means of Medoids
* Kaufman, L. and Rousseeuw, P.J.
* in: Statistical Data Analysis Based on the L1-Norm and Related Methods *

* * @author Erich Schubert * * @apiviz.has MedoidModel * @apiviz.composedOf KMedoidsInitialization * * @param vector datatype */ @Title("Partioning Around Medoids") @Reference(title = "Clustering by means of Medoids", // authors = "Kaufman, L. and Rousseeuw, P.J.", // booktitle = "Statistical Data Analysis Based on the L1-Norm and Related Methods") public class KMedoidsPAM extends AbstractDistanceBasedAlgorithm>implements ClusteringAlgorithm> { /** * The logger for this class. */ private static final Logging LOG = Logging.getLogger(KMedoidsPAM.class); /** * Key for statistics logging. */ private static final String KEY = KMedoidsPAM.class.getName(); /** * The number of clusters to produce. */ protected int k; /** * The maximum number of iterations. */ protected int maxiter; /** * Method to choose initial means. */ protected KMedoidsInitialization initializer; /** * Constructor. * * @param distanceFunction distance function * @param k k parameter * @param maxiter Maxiter parameter * @param initializer Function to generate the initial means */ public KMedoidsPAM(DistanceFunction distanceFunction, int k, int maxiter, KMedoidsInitialization initializer) { super(distanceFunction); this.k = k; this.maxiter = maxiter; this.initializer = initializer; } /** * Run k-medoids * * @param database Database * @param relation relation to use * @return result */ public Clustering run(Database database, Relation relation) { if(relation.size() <= 0) { return new Clustering<>("PAM Clustering", "pam-clustering"); } DistanceQuery distQ = database.getDistanceQuery(relation, getDistanceFunction(), DatabaseQuery.HINT_OPTIMIZED_ONLY); DBIDs ids = relation.getDBIDs(); if(distQ == null && ids instanceof DBIDRange) { LOG.verbose("Adding a distance matrix index to accelerate PAM."); PrecomputedDistanceMatrix idx = new PrecomputedDistanceMatrix(relation, getDistanceFunction()); idx.initialize(); distQ = idx.getDistanceQuery(getDistanceFunction()); } if(distQ == null) { distQ = database.getDistanceQuery(relation, getDistanceFunction()); LOG.warning("PAM may be slow, because we do not have a precomputed distance matrix available."); } // Choose initial medoids if(LOG.isStatistics()) { LOG.statistics(new StringStatistic(KEY + ".initialization", initializer.toString())); } ArrayModifiableDBIDs medoids = DBIDUtil.newArray(initializer.chooseInitialMedoids(k, ids, distQ)); if(medoids.size() != k) { throw new AbortException("Initializer " + initializer.toString() + " did not return " + k + " means, but " + medoids.size()); } // Setup cluster assignment store WritableIntegerDataStore assignment = DataStoreUtil.makeIntegerStorage(ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_TEMP, -1); runPAMOptimization(distQ, ids, medoids, assignment); ArrayModifiableDBIDs[] clusters = ClusteringAlgorithmUtil.partitionsFromIntegerLabels(ids, assignment, k); // Wrap result Clustering result = new Clustering<>("PAM Clustering", "pam-clustering"); for(DBIDArrayIter it = medoids.iter(); it.valid(); it.advance()) { MedoidModel model = new MedoidModel(DBIDUtil.deref(it)); result.addToplevelCluster(new Cluster<>(clusters[it.getOffset()], model)); } return result; } /** * Run the PAM optimization phase. * * @param distQ Distance query * @param ids IDs to process * @param medoids Medoids list * @param assignment Cluster assignment */ protected void runPAMOptimization(DistanceQuery distQ, DBIDs ids, ArrayModifiableDBIDs medoids, WritableIntegerDataStore assignment) { WritableDoubleDataStore nearest = DataStoreUtil.makeDoubleStorage(ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_TEMP); WritableDoubleDataStore second = DataStoreUtil.makeDoubleStorage(ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_TEMP); // Initial assignment to nearest medoids // TODO: reuse distance information, from the build phase, when possible? double tc = assignToNearestCluster(medoids, ids, nearest, second, assignment, distQ); if(LOG.isStatistics()) { LOG.statistics(new DoubleStatistic(KEY + ".iteration-" + 0 + ".cost", tc)); } IndefiniteProgress prog = LOG.isVerbose() ? new IndefiniteProgress("PAM iteration", LOG) : null; // Swap phase DBIDVar bestid = DBIDUtil.newVar(); DBIDArrayIter m = medoids.iter(); int iteration = 1; double[] cost = new double[k]; for(; maxiter <= 0 || iteration <= maxiter; iteration++) { LOG.incrementProcessed(prog); // Try to swap a non-medoid with a medoid member: double best = Double.POSITIVE_INFINITY; int bestcluster = -1; // Iterate over all non-medoids: for(DBIDIter h = ids.iter(); h.valid(); h.advance()) { final int pm = assignment.intValue(h); m.seek(pm); double hdist = nearest.doubleValue(h); // Current assignment cost of h. if(DBIDUtil.equal(m, h) || hdist <= 0.) { continue; // Only consider non-selected items } // h is a non-medoid currently in cluster of medoid m. Arrays.fill(cost, -hdist); // Compute costs of reassigning other objects j: for(DBIDIter j = ids.iter(); j.valid(); j.advance()) { if(DBIDUtil.equal(h, j)) { continue; } final int pj = assignment.intValue(j); // distance(j, i) for pi == pj final double distcur = nearest.doubleValue(j); // second nearest, alternative reassignment final double distsec = second.doubleValue(j); // distance(j, h), the possible reassignment final double dist_h = distQ.distance(h, j); // We moved this distance computation out compared to original PAM. if(dist_h < distcur) { for(int pi = 0; pi < k; pi++) { if(pi == pj) { // The current nearest is lost. cost[pi] += ((dist_h < distsec) ? // dist_h // Case 1b1) j is closer to h : distsec // Case 1b2) j would switch to its second nearest ) - distcur; } else { // Case 1c) j is closer to h than its current medoid cost[pi] += dist_h - distcur; } // else Case 1a): j is closer to i than h and m, so no change. } } else { // Only need to consider pi == pj if(dist_h < distsec) { // Case 1b1) j is closer to h cost[pj] += dist_h - distcur; } else { // Case 1b2) j would switch to its second nearest cost[pj] += distsec - distcur; } } } // Consider all possible swaps: for(int pi = 0; pi < k; pi++) { if(cost[pi] < best) { best = cost[pi]; bestid.set(h); bestcluster = pi; } } } if(best >= 0.) { break; } medoids.set(bestcluster, bestid); // Reassign double nc = assignToNearestCluster(medoids, ids, nearest, second, assignment, distQ); if(LOG.isStatistics()) { LOG.statistics(new DoubleStatistic(KEY + ".iteration-" + iteration + ".cost", nc)); } if(nc > tc) { if(nc - tc < 1e-7 * tc) { LOG.warning("PAM failed to converge (numerical instability?)"); break; } LOG.warning("PAM failed to converge: costs increased by: " + (nc - tc) + " exepected a decrease by " + best); break; } tc = nc; } LOG.setCompleted(prog); if(LOG.isStatistics()) { LOG.statistics(new LongStatistic(KEY + ".iterations", iteration)); LOG.statistics(new DoubleStatistic(KEY + ".iteration-" + iteration + ".cost", tc)); } } /** * Returns a list of clusters. The kth cluster contains the ids of * those FeatureVectors, that are nearest to the kth mean. * * @param means Object centroids * @param ids Object ids * @param nearest Distance to nearest medoid * @param second Distance to second nearest medoid * @param assignment Cluster assignment * @param distQ distance query * @return Assignment cost */ protected double assignToNearestCluster(ArrayDBIDs means, DBIDs ids, WritableDoubleDataStore nearest, WritableDoubleDataStore second, WritableIntegerDataStore assignment, DistanceQuery distQ) { assert(means.size() == k); DBIDArrayIter miter = means.iter(); double cost = 0.; for(DBIDIter iditer = ids.iter(); iditer.valid(); iditer.advance()) { double mindist = Double.POSITIVE_INFINITY, mindist2 = Double.POSITIVE_INFINITY; int minIndex = -1; for(int i = 0; i < k; i++) { double dist = distQ.distance(iditer, miter.seek(i)); if(dist < mindist) { mindist2 = mindist; mindist = dist; minIndex = i; } else if(dist < mindist2) { mindist2 = dist; } } if(minIndex < 0) { throw new AbortException("Too many infinite distances. Cannot assign objects."); } assignment.put(iditer, minIndex); nearest.put(iditer, mindist); second.put(iditer, mindist2); cost += mindist; } return cost; } @Override public TypeInformation[] getInputTypeRestriction() { return TypeUtil.array(getDistanceFunction().getInputTypeRestriction()); } @Override protected Logging getLogger() { return LOG; } /** * Parameterization class. * * @author Erich Schubert * * @apiviz.exclude */ public static class Parameterizer extends AbstractDistanceBasedAlgorithm.Parameterizer { /** * The number of clusters to produce. */ protected int k; /** * The maximum number of iterations. */ protected int maxiter; /** * Method to choose initial means. */ protected KMedoidsInitialization initializer; @Override protected void makeOptions(Parameterization config) { super.makeOptions(config); IntParameter kP = new IntParameter(KMeans.K_ID) // .addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT); if(config.grab(kP)) { k = kP.intValue(); } ObjectParameter> initialP = new ObjectParameter<>(KMeans.INIT_ID, KMedoidsInitialization.class, PAMInitialMeans.class); if(config.grab(initialP)) { initializer = initialP.instantiateClass(config); } IntParameter maxiterP = new IntParameter(KMeans.MAXITER_ID, 0) // .addConstraint(CommonConstraints.GREATER_EQUAL_ZERO_INT); if(config.grab(maxiterP)) { maxiter = maxiterP.intValue(); } } @Override protected KMedoidsPAM makeInstance() { return new KMedoidsPAM<>(distanceFunction, k, maxiter, initializer); } } }