diff options
Diffstat (limited to 'src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMedoidsPAM.java')
-rw-r--r-- | src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMedoidsPAM.java | 163 |
1 files changed, 88 insertions, 75 deletions
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMedoidsPAM.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMedoidsPAM.java index c9e1dc47..30bb56e5 100644 --- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMedoidsPAM.java +++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMedoidsPAM.java @@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans; This file is part of ELKI: Environment for Developing KDD-Applications Supported by Index-Structures - Copyright (C) 2013 + Copyright (C) 2014 Ludwig-Maximilians-Universität München Lehr- und Forschungseinheit für Datenbanksysteme ELKI Development Team @@ -27,8 +27,9 @@ import java.util.ArrayList; import java.util.List; import de.lmu.ifi.dbs.elki.algorithm.AbstractDistanceBasedAlgorithm; -import de.lmu.ifi.dbs.elki.algorithm.AbstractPrimitiveDistanceBasedAlgorithm; import de.lmu.ifi.dbs.elki.algorithm.clustering.ClusteringAlgorithm; +import de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.initialization.KMedoidsInitialization; +import de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.initialization.PAMInitialMeans; import de.lmu.ifi.dbs.elki.data.Cluster; import de.lmu.ifi.dbs.elki.data.Clustering; import de.lmu.ifi.dbs.elki.data.model.MedoidModel; @@ -40,15 +41,15 @@ import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil; import de.lmu.ifi.dbs.elki.database.datastore.WritableDoubleDataStore; import de.lmu.ifi.dbs.elki.database.ids.ArrayDBIDs; import de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs; -import de.lmu.ifi.dbs.elki.database.ids.DBID; +import de.lmu.ifi.dbs.elki.database.ids.DBIDArrayIter; import de.lmu.ifi.dbs.elki.database.ids.DBIDIter; import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil; +import de.lmu.ifi.dbs.elki.database.ids.DBIDVar; import de.lmu.ifi.dbs.elki.database.ids.DBIDs; import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs; import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery; import de.lmu.ifi.dbs.elki.database.relation.Relation; -import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction; -import de.lmu.ifi.dbs.elki.distance.distancevalue.NumberDistance; +import de.lmu.ifi.dbs.elki.distance.distancefunction.DistanceFunction; import de.lmu.ifi.dbs.elki.logging.Logging; import de.lmu.ifi.dbs.elki.logging.progress.IndefiniteProgress; import de.lmu.ifi.dbs.elki.utilities.documentation.Reference; @@ -59,14 +60,14 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter; /** - * Provides the k-medoids clustering algorithm, using the - * "Partitioning Around Medoids" approach. + * The original PAM algorithm or k-medoids clustering, as proposed by Kaufman + * and Rousseeuw in "Partitioning Around Medoids". * * Reference: * <p> * Clustering my means of Medoids<br /> * Kaufman, L. and Rousseeuw, P.J.<br /> - * in: Statistical Data Analysis Based on the L_1–Norm and Related Methods + * in: Statistical Data Analysis Based on the L1-Norm and Related Methods * </p> * * @author Erich Schubert @@ -75,23 +76,24 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter; * @apiviz.composedOf KMedoidsInitialization * * @param <V> vector datatype - * @param <D> distance value type */ @Title("Partioning Around Medoids") -@Reference(title = "Clustering my means of Medoids", authors = "Kaufman, L. and Rousseeuw, P.J.", booktitle = "Statistical Data Analysis Based on the L_1–Norm and Related Methods") -public class KMedoidsPAM<V, D extends NumberDistance<D, ?>> extends AbstractDistanceBasedAlgorithm<V, D, Clustering<MedoidModel>> implements ClusteringAlgorithm<Clustering<MedoidModel>> { +@Reference(title = "Clustering by means of Medoids", // +authors = "Kaufman, L. and Rousseeuw, P.J.", // +booktitle = "Statistical Data Analysis Based on the L1-Norm and Related Methods") +public class KMedoidsPAM<V> extends AbstractDistanceBasedAlgorithm<V, Clustering<MedoidModel>> implements ClusteringAlgorithm<Clustering<MedoidModel>> { /** * The logger for this class. */ private static final Logging LOG = Logging.getLogger(KMedoidsPAM.class); /** - * Holds the value of {@link AbstractKMeans#K_ID}. + * The number of clusters to produce. */ protected int k; /** - * Holds the value of {@link AbstractKMeans#MAXITER_ID}. + * The maximum number of iterations. */ protected int maxiter; @@ -108,7 +110,7 @@ public class KMedoidsPAM<V, D extends NumberDistance<D, ?>> extends AbstractDist * @param maxiter Maxiter parameter * @param initializer Function to generate the initial means */ - public KMedoidsPAM(PrimitiveDistanceFunction<? super V, D> distanceFunction, int k, int maxiter, KMedoidsInitialization<V> initializer) { + public KMedoidsPAM(DistanceFunction<? super V> distanceFunction, int k, int maxiter, KMedoidsInitialization<V> initializer) { super(distanceFunction); this.k = k; this.maxiter = maxiter; @@ -126,16 +128,36 @@ public class KMedoidsPAM<V, D extends NumberDistance<D, ?>> extends AbstractDist if(relation.size() <= 0) { return new Clustering<>("k-Medoids Clustering", "kmedoids-clustering"); } - DistanceQuery<V, D> distQ = database.getDistanceQuery(relation, getDistanceFunction()); + DistanceQuery<V> distQ = database.getDistanceQuery(relation, getDistanceFunction()); DBIDs ids = relation.getDBIDs(); // Choose initial medoids - ArrayModifiableDBIDs medoids = DBIDUtil.newArray(initializer.chooseInitialMedoids(k, distQ)); + ArrayModifiableDBIDs medoids = DBIDUtil.newArray(initializer.chooseInitialMedoids(k, ids, distQ)); // Setup cluster assignment store List<ModifiableDBIDs> clusters = new ArrayList<>(); for(int i = 0; i < k; i++) { clusters.add(DBIDUtil.newHashSet(relation.size() / k)); } + runPAMOptimization(distQ, ids, medoids, clusters); + + // Wrap result + Clustering<MedoidModel> result = new Clustering<>("k-Medoids Clustering", "kmedoids-clustering"); + for(int i = 0; i < clusters.size(); i++) { + MedoidModel model = new MedoidModel(medoids.get(i)); + result.addToplevelCluster(new Cluster<>(clusters.get(i), model)); + } + return result; + } + + /** + * Run the PAM optimization phase. + * + * @param distQ Distance query + * @param ids IDs to process + * @param medoids Medoids list + * @param clusters Clusters + */ + protected void runPAMOptimization(DistanceQuery<V> distQ, DBIDs ids, ArrayModifiableDBIDs medoids, List<ModifiableDBIDs> clusters) { WritableDoubleDataStore second = DataStoreUtil.makeDoubleStorage(ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_TEMP); // Initial assignment to nearest medoids // TODO: reuse this information, from the build phase, when possible? @@ -143,15 +165,13 @@ public class KMedoidsPAM<V, D extends NumberDistance<D, ?>> extends AbstractDist IndefiniteProgress prog = LOG.isVerbose() ? new IndefiniteProgress("PAM iteration", LOG) : null; // Swap phase + DBIDVar bestid = DBIDUtil.newVar(); boolean changed = true; while(changed) { - if(prog != null) { - prog.incrementProcessed(LOG); - } + LOG.incrementProcessed(prog); changed = false; // Try to swap the medoid with a better cluster member: double best = 0; - DBID bestid = null; int bestcluster = -1; int i = 0; for(DBIDIter miter = medoids.iter(); miter.valid(); miter.advance(), i++) { @@ -159,30 +179,28 @@ public class KMedoidsPAM<V, D extends NumberDistance<D, ?>> extends AbstractDist if(DBIDUtil.equal(miter, iter)) { continue; } - // double disti = distQ.distance(id, med).doubleValue(); double cost = 0; - DBIDIter olditer = medoids.iter(); - for(int j = 0; j < k; j++, olditer.advance()) { + DBIDIter miter2 = medoids.iter(); + for(int j = 0; j < k; j++, miter2.advance()) { for(DBIDIter iter2 = clusters.get(j).iter(); iter2.valid(); iter2.advance()) { - double distcur = distQ.distance(iter2, olditer).doubleValue(); - double distnew = distQ.distance(iter2, iter).doubleValue(); + if(DBIDUtil.equal(miter2, iter2)) { + continue; + } + double distcur = distQ.distance(iter2, miter2); + double distnew = distQ.distance(iter2, iter); if(j == i) { // Cases 1 and 2. double distsec = second.doubleValue(iter2); - if(distcur > distsec) { - // Case 1, other would switch to a third medoid - cost += distsec - distcur; // Always positive! - } - else { // Would remain with the candidate - cost += distnew - distcur; // Could be negative - } + cost += (distcur > distsec) ? // + // Case 1, other would switch to a third medoid + distsec - distcur // Always positive! + : // Would remain with the candidate + distnew - distcur; // Could be negative } else { // Cases 3-4: objects from other clusters - if(distcur < distnew) { - // Case 3: no change - } - else { + // Case 3: is no change + if(distcur > distnew) { // Case 4: would switch to new medoid cost += distnew - distcur; // Always negative } @@ -191,18 +209,15 @@ public class KMedoidsPAM<V, D extends NumberDistance<D, ?>> extends AbstractDist } if(cost < best) { best = cost; - bestid = DBIDUtil.deref(iter); + bestid.set(iter); bestcluster = i; } } } - if(prog != null) { - prog.setCompleted(LOG); - } if(LOG.isDebugging()) { LOG.debug("Best cost: " + best); } - if(bestid != null) { + if(best < 0.) { changed = true; medoids.set(bestcluster, bestid); } @@ -212,14 +227,7 @@ public class KMedoidsPAM<V, D extends NumberDistance<D, ?>> extends AbstractDist assignToNearestCluster(medoids, ids, second, clusters, distQ); } } - - // Wrap result - Clustering<MedoidModel> result = new Clustering<>("k-Medoids Clustering", "kmedoids-clustering"); - for(int i = 0; i < clusters.size(); i++) { - MedoidModel model = new MedoidModel(medoids.get(i)); - result.addToplevelCluster(new Cluster<>(clusters.get(i), model)); - } - return result; + LOG.setCompleted(prog); } /** @@ -233,36 +241,32 @@ public class KMedoidsPAM<V, D extends NumberDistance<D, ?>> extends AbstractDist * @param distQ distance query * @return true when any object was reassigned */ - protected boolean assignToNearestCluster(ArrayDBIDs means, DBIDs ids, WritableDoubleDataStore second, List<? extends ModifiableDBIDs> clusters, DistanceQuery<V, D> distQ) { + protected boolean assignToNearestCluster(ArrayDBIDs means, DBIDs ids, WritableDoubleDataStore second, List<? extends ModifiableDBIDs> clusters, DistanceQuery<V> distQ) { boolean changed = false; + DBIDArrayIter miter = means.iter(); for(DBIDIter iditer = distQ.getRelation().iterDBIDs(); iditer.valid(); iditer.advance()) { + double mindist = Double.POSITIVE_INFINITY, mindist2 = Double.POSITIVE_INFINITY; int minIndex = 0; - double mindist = Double.POSITIVE_INFINITY; - double mindist2 = Double.POSITIVE_INFINITY; - { - int i = 0; - for(DBIDIter miter = means.iter(); miter.valid(); miter.advance(), i++) { - double dist = distQ.distance(iditer, miter).doubleValue(); - if(dist < mindist) { - minIndex = i; - mindist2 = mindist; - mindist = dist; - } - else if(dist < mindist2) { - mindist2 = dist; - } + miter.seek(0); // Reuse iterator. + for(int i = 0; miter.valid(); miter.advance(), i++) { + double dist = distQ.distance(iditer, miter); + if(dist < mindist) { + minIndex = i; + mindist2 = mindist; + mindist = dist; + } + else if(dist < mindist2) { + mindist2 = dist; } } if(clusters.get(minIndex).add(iditer)) { changed = true; // Remove from previous cluster // TODO: keep a list of cluster assignments to save this search? - for(int i = 0; i < k; i++) { - if(i != minIndex) { - if(clusters.get(i).remove(iditer)) { - break; - } + for(int j = 0; j < k; j++) { + if(j != minIndex && clusters.get(j).remove(iditer)) { + break; } } } @@ -288,18 +292,27 @@ public class KMedoidsPAM<V, D extends NumberDistance<D, ?>> extends AbstractDist * * @apiviz.exclude */ - public static class Parameterizer<V, D extends NumberDistance<D, ?>> extends AbstractPrimitiveDistanceBasedAlgorithm.Parameterizer<V, D> { + public static class Parameterizer<V> extends AbstractDistanceBasedAlgorithm.Parameterizer<V> { + /** + * The number of clusters to produce. + */ protected int k; + /** + * The maximum number of iterations. + */ protected int maxiter; + /** + * Method to choose initial means. + */ protected KMedoidsInitialization<V> initializer; @Override protected void makeOptions(Parameterization config) { super.makeOptions(config); - IntParameter kP = new IntParameter(KMeans.K_ID); - kP.addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT); + IntParameter kP = new IntParameter(KMeans.K_ID) // + .addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT); if(config.grab(kP)) { k = kP.intValue(); } @@ -309,15 +322,15 @@ public class KMedoidsPAM<V, D extends NumberDistance<D, ?>> extends AbstractDist initializer = initialP.instantiateClass(config); } - IntParameter maxiterP = new IntParameter(KMeans.MAXITER_ID, 0); - maxiterP.addConstraint(CommonConstraints.GREATER_EQUAL_ZERO_INT); + IntParameter maxiterP = new IntParameter(KMeans.MAXITER_ID, 0) // + .addConstraint(CommonConstraints.GREATER_EQUAL_ZERO_INT); if(config.grab(maxiterP)) { maxiter = maxiterP.intValue(); } } @Override - protected KMedoidsPAM<V, D> makeInstance() { + protected KMedoidsPAM<V> makeInstance() { return new KMedoidsPAM<>(distanceFunction, k, maxiter, initializer); } } |