summaryrefslogtreecommitdiff
path: root/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMedoidsPAM.java
diff options
context:
space:
mode:
Diffstat (limited to 'src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMedoidsPAM.java')
-rw-r--r--src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMedoidsPAM.java163
1 files changed, 88 insertions, 75 deletions
diff --git a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMedoidsPAM.java b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMedoidsPAM.java
index c9e1dc47..30bb56e5 100644
--- a/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMedoidsPAM.java
+++ b/src/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMedoidsPAM.java
@@ -4,7 +4,7 @@ package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans;
This file is part of ELKI:
Environment for Developing KDD-Applications Supported by Index-Structures
- Copyright (C) 2013
+ Copyright (C) 2014
Ludwig-Maximilians-Universität München
Lehr- und Forschungseinheit für Datenbanksysteme
ELKI Development Team
@@ -27,8 +27,9 @@ import java.util.ArrayList;
import java.util.List;
import de.lmu.ifi.dbs.elki.algorithm.AbstractDistanceBasedAlgorithm;
-import de.lmu.ifi.dbs.elki.algorithm.AbstractPrimitiveDistanceBasedAlgorithm;
import de.lmu.ifi.dbs.elki.algorithm.clustering.ClusteringAlgorithm;
+import de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.initialization.KMedoidsInitialization;
+import de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.initialization.PAMInitialMeans;
import de.lmu.ifi.dbs.elki.data.Cluster;
import de.lmu.ifi.dbs.elki.data.Clustering;
import de.lmu.ifi.dbs.elki.data.model.MedoidModel;
@@ -40,15 +41,15 @@ import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil;
import de.lmu.ifi.dbs.elki.database.datastore.WritableDoubleDataStore;
import de.lmu.ifi.dbs.elki.database.ids.ArrayDBIDs;
import de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs;
-import de.lmu.ifi.dbs.elki.database.ids.DBID;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDArrayIter;
import de.lmu.ifi.dbs.elki.database.ids.DBIDIter;
import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDVar;
import de.lmu.ifi.dbs.elki.database.ids.DBIDs;
import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs;
import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery;
import de.lmu.ifi.dbs.elki.database.relation.Relation;
-import de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction;
-import de.lmu.ifi.dbs.elki.distance.distancevalue.NumberDistance;
+import de.lmu.ifi.dbs.elki.distance.distancefunction.DistanceFunction;
import de.lmu.ifi.dbs.elki.logging.Logging;
import de.lmu.ifi.dbs.elki.logging.progress.IndefiniteProgress;
import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
@@ -59,14 +60,14 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter;
/**
- * Provides the k-medoids clustering algorithm, using the
- * "Partitioning Around Medoids" approach.
+ * The original PAM algorithm or k-medoids clustering, as proposed by Kaufman
+ * and Rousseeuw in "Partitioning Around Medoids".
*
* Reference:
* <p>
* Clustering my means of Medoids<br />
* Kaufman, L. and Rousseeuw, P.J.<br />
- * in: Statistical Data Analysis Based on the L_1–Norm and Related Methods
+ * in: Statistical Data Analysis Based on the L1-Norm and Related Methods
* </p>
*
* @author Erich Schubert
@@ -75,23 +76,24 @@ import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter;
* @apiviz.composedOf KMedoidsInitialization
*
* @param <V> vector datatype
- * @param <D> distance value type
*/
@Title("Partioning Around Medoids")
-@Reference(title = "Clustering my means of Medoids", authors = "Kaufman, L. and Rousseeuw, P.J.", booktitle = "Statistical Data Analysis Based on the L_1–Norm and Related Methods")
-public class KMedoidsPAM<V, D extends NumberDistance<D, ?>> extends AbstractDistanceBasedAlgorithm<V, D, Clustering<MedoidModel>> implements ClusteringAlgorithm<Clustering<MedoidModel>> {
+@Reference(title = "Clustering by means of Medoids", //
+authors = "Kaufman, L. and Rousseeuw, P.J.", //
+booktitle = "Statistical Data Analysis Based on the L1-Norm and Related Methods")
+public class KMedoidsPAM<V> extends AbstractDistanceBasedAlgorithm<V, Clustering<MedoidModel>> implements ClusteringAlgorithm<Clustering<MedoidModel>> {
/**
* The logger for this class.
*/
private static final Logging LOG = Logging.getLogger(KMedoidsPAM.class);
/**
- * Holds the value of {@link AbstractKMeans#K_ID}.
+ * The number of clusters to produce.
*/
protected int k;
/**
- * Holds the value of {@link AbstractKMeans#MAXITER_ID}.
+ * The maximum number of iterations.
*/
protected int maxiter;
@@ -108,7 +110,7 @@ public class KMedoidsPAM<V, D extends NumberDistance<D, ?>> extends AbstractDist
* @param maxiter Maxiter parameter
* @param initializer Function to generate the initial means
*/
- public KMedoidsPAM(PrimitiveDistanceFunction<? super V, D> distanceFunction, int k, int maxiter, KMedoidsInitialization<V> initializer) {
+ public KMedoidsPAM(DistanceFunction<? super V> distanceFunction, int k, int maxiter, KMedoidsInitialization<V> initializer) {
super(distanceFunction);
this.k = k;
this.maxiter = maxiter;
@@ -126,16 +128,36 @@ public class KMedoidsPAM<V, D extends NumberDistance<D, ?>> extends AbstractDist
if(relation.size() <= 0) {
return new Clustering<>("k-Medoids Clustering", "kmedoids-clustering");
}
- DistanceQuery<V, D> distQ = database.getDistanceQuery(relation, getDistanceFunction());
+ DistanceQuery<V> distQ = database.getDistanceQuery(relation, getDistanceFunction());
DBIDs ids = relation.getDBIDs();
// Choose initial medoids
- ArrayModifiableDBIDs medoids = DBIDUtil.newArray(initializer.chooseInitialMedoids(k, distQ));
+ ArrayModifiableDBIDs medoids = DBIDUtil.newArray(initializer.chooseInitialMedoids(k, ids, distQ));
// Setup cluster assignment store
List<ModifiableDBIDs> clusters = new ArrayList<>();
for(int i = 0; i < k; i++) {
clusters.add(DBIDUtil.newHashSet(relation.size() / k));
}
+ runPAMOptimization(distQ, ids, medoids, clusters);
+
+ // Wrap result
+ Clustering<MedoidModel> result = new Clustering<>("k-Medoids Clustering", "kmedoids-clustering");
+ for(int i = 0; i < clusters.size(); i++) {
+ MedoidModel model = new MedoidModel(medoids.get(i));
+ result.addToplevelCluster(new Cluster<>(clusters.get(i), model));
+ }
+ return result;
+ }
+
+ /**
+ * Run the PAM optimization phase.
+ *
+ * @param distQ Distance query
+ * @param ids IDs to process
+ * @param medoids Medoids list
+ * @param clusters Clusters
+ */
+ protected void runPAMOptimization(DistanceQuery<V> distQ, DBIDs ids, ArrayModifiableDBIDs medoids, List<ModifiableDBIDs> clusters) {
WritableDoubleDataStore second = DataStoreUtil.makeDoubleStorage(ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_TEMP);
// Initial assignment to nearest medoids
// TODO: reuse this information, from the build phase, when possible?
@@ -143,15 +165,13 @@ public class KMedoidsPAM<V, D extends NumberDistance<D, ?>> extends AbstractDist
IndefiniteProgress prog = LOG.isVerbose() ? new IndefiniteProgress("PAM iteration", LOG) : null;
// Swap phase
+ DBIDVar bestid = DBIDUtil.newVar();
boolean changed = true;
while(changed) {
- if(prog != null) {
- prog.incrementProcessed(LOG);
- }
+ LOG.incrementProcessed(prog);
changed = false;
// Try to swap the medoid with a better cluster member:
double best = 0;
- DBID bestid = null;
int bestcluster = -1;
int i = 0;
for(DBIDIter miter = medoids.iter(); miter.valid(); miter.advance(), i++) {
@@ -159,30 +179,28 @@ public class KMedoidsPAM<V, D extends NumberDistance<D, ?>> extends AbstractDist
if(DBIDUtil.equal(miter, iter)) {
continue;
}
- // double disti = distQ.distance(id, med).doubleValue();
double cost = 0;
- DBIDIter olditer = medoids.iter();
- for(int j = 0; j < k; j++, olditer.advance()) {
+ DBIDIter miter2 = medoids.iter();
+ for(int j = 0; j < k; j++, miter2.advance()) {
for(DBIDIter iter2 = clusters.get(j).iter(); iter2.valid(); iter2.advance()) {
- double distcur = distQ.distance(iter2, olditer).doubleValue();
- double distnew = distQ.distance(iter2, iter).doubleValue();
+ if(DBIDUtil.equal(miter2, iter2)) {
+ continue;
+ }
+ double distcur = distQ.distance(iter2, miter2);
+ double distnew = distQ.distance(iter2, iter);
if(j == i) {
// Cases 1 and 2.
double distsec = second.doubleValue(iter2);
- if(distcur > distsec) {
- // Case 1, other would switch to a third medoid
- cost += distsec - distcur; // Always positive!
- }
- else { // Would remain with the candidate
- cost += distnew - distcur; // Could be negative
- }
+ cost += (distcur > distsec) ? //
+ // Case 1, other would switch to a third medoid
+ distsec - distcur // Always positive!
+ : // Would remain with the candidate
+ distnew - distcur; // Could be negative
}
else {
// Cases 3-4: objects from other clusters
- if(distcur < distnew) {
- // Case 3: no change
- }
- else {
+ // Case 3: is no change
+ if(distcur > distnew) {
// Case 4: would switch to new medoid
cost += distnew - distcur; // Always negative
}
@@ -191,18 +209,15 @@ public class KMedoidsPAM<V, D extends NumberDistance<D, ?>> extends AbstractDist
}
if(cost < best) {
best = cost;
- bestid = DBIDUtil.deref(iter);
+ bestid.set(iter);
bestcluster = i;
}
}
}
- if(prog != null) {
- prog.setCompleted(LOG);
- }
if(LOG.isDebugging()) {
LOG.debug("Best cost: " + best);
}
- if(bestid != null) {
+ if(best < 0.) {
changed = true;
medoids.set(bestcluster, bestid);
}
@@ -212,14 +227,7 @@ public class KMedoidsPAM<V, D extends NumberDistance<D, ?>> extends AbstractDist
assignToNearestCluster(medoids, ids, second, clusters, distQ);
}
}
-
- // Wrap result
- Clustering<MedoidModel> result = new Clustering<>("k-Medoids Clustering", "kmedoids-clustering");
- for(int i = 0; i < clusters.size(); i++) {
- MedoidModel model = new MedoidModel(medoids.get(i));
- result.addToplevelCluster(new Cluster<>(clusters.get(i), model));
- }
- return result;
+ LOG.setCompleted(prog);
}
/**
@@ -233,36 +241,32 @@ public class KMedoidsPAM<V, D extends NumberDistance<D, ?>> extends AbstractDist
* @param distQ distance query
* @return true when any object was reassigned
*/
- protected boolean assignToNearestCluster(ArrayDBIDs means, DBIDs ids, WritableDoubleDataStore second, List<? extends ModifiableDBIDs> clusters, DistanceQuery<V, D> distQ) {
+ protected boolean assignToNearestCluster(ArrayDBIDs means, DBIDs ids, WritableDoubleDataStore second, List<? extends ModifiableDBIDs> clusters, DistanceQuery<V> distQ) {
boolean changed = false;
+ DBIDArrayIter miter = means.iter();
for(DBIDIter iditer = distQ.getRelation().iterDBIDs(); iditer.valid(); iditer.advance()) {
+ double mindist = Double.POSITIVE_INFINITY, mindist2 = Double.POSITIVE_INFINITY;
int minIndex = 0;
- double mindist = Double.POSITIVE_INFINITY;
- double mindist2 = Double.POSITIVE_INFINITY;
- {
- int i = 0;
- for(DBIDIter miter = means.iter(); miter.valid(); miter.advance(), i++) {
- double dist = distQ.distance(iditer, miter).doubleValue();
- if(dist < mindist) {
- minIndex = i;
- mindist2 = mindist;
- mindist = dist;
- }
- else if(dist < mindist2) {
- mindist2 = dist;
- }
+ miter.seek(0); // Reuse iterator.
+ for(int i = 0; miter.valid(); miter.advance(), i++) {
+ double dist = distQ.distance(iditer, miter);
+ if(dist < mindist) {
+ minIndex = i;
+ mindist2 = mindist;
+ mindist = dist;
+ }
+ else if(dist < mindist2) {
+ mindist2 = dist;
}
}
if(clusters.get(minIndex).add(iditer)) {
changed = true;
// Remove from previous cluster
// TODO: keep a list of cluster assignments to save this search?
- for(int i = 0; i < k; i++) {
- if(i != minIndex) {
- if(clusters.get(i).remove(iditer)) {
- break;
- }
+ for(int j = 0; j < k; j++) {
+ if(j != minIndex && clusters.get(j).remove(iditer)) {
+ break;
}
}
}
@@ -288,18 +292,27 @@ public class KMedoidsPAM<V, D extends NumberDistance<D, ?>> extends AbstractDist
*
* @apiviz.exclude
*/
- public static class Parameterizer<V, D extends NumberDistance<D, ?>> extends AbstractPrimitiveDistanceBasedAlgorithm.Parameterizer<V, D> {
+ public static class Parameterizer<V> extends AbstractDistanceBasedAlgorithm.Parameterizer<V> {
+ /**
+ * The number of clusters to produce.
+ */
protected int k;
+ /**
+ * The maximum number of iterations.
+ */
protected int maxiter;
+ /**
+ * Method to choose initial means.
+ */
protected KMedoidsInitialization<V> initializer;
@Override
protected void makeOptions(Parameterization config) {
super.makeOptions(config);
- IntParameter kP = new IntParameter(KMeans.K_ID);
- kP.addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT);
+ IntParameter kP = new IntParameter(KMeans.K_ID) //
+ .addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT);
if(config.grab(kP)) {
k = kP.intValue();
}
@@ -309,15 +322,15 @@ public class KMedoidsPAM<V, D extends NumberDistance<D, ?>> extends AbstractDist
initializer = initialP.instantiateClass(config);
}
- IntParameter maxiterP = new IntParameter(KMeans.MAXITER_ID, 0);
- maxiterP.addConstraint(CommonConstraints.GREATER_EQUAL_ZERO_INT);
+ IntParameter maxiterP = new IntParameter(KMeans.MAXITER_ID, 0) //
+ .addConstraint(CommonConstraints.GREATER_EQUAL_ZERO_INT);
if(config.grab(maxiterP)) {
maxiter = maxiterP.intValue();
}
}
@Override
- protected KMedoidsPAM<V, D> makeInstance() {
+ protected KMedoidsPAM<V> makeInstance() {
return new KMedoidsPAM<>(distanceFunction, k, maxiter, initializer);
}
}