summaryrefslogtreecommitdiff
path: root/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMedoidsPAM.java
diff options
context:
space:
mode:
authorAndrej Shadura <andrewsh@debian.org>2019-03-09 22:30:41 +0000
committerAndrej Shadura <andrewsh@debian.org>2019-03-09 22:30:41 +0000
commit38212b3127e90751fb39cda34250bc11be62b76c (patch)
treedc1397346030e9695bd763dddc93b3be527cd643 /elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMedoidsPAM.java
parent337087b668d3a54f3afee3a9adb597a32e9f7e94 (diff)
Import Upstream version 0.7.0
Diffstat (limited to 'elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMedoidsPAM.java')
-rw-r--r--elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMedoidsPAM.java391
1 files changed, 391 insertions, 0 deletions
diff --git a/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMedoidsPAM.java b/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMedoidsPAM.java
new file mode 100644
index 00000000..fc3bdd5f
--- /dev/null
+++ b/elki/src/main/java/de/lmu/ifi/dbs/elki/algorithm/clustering/kmeans/KMedoidsPAM.java
@@ -0,0 +1,391 @@
+package de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans;
+/*
+ This file is part of ELKI:
+ Environment for Developing KDD-Applications Supported by Index-Structures
+
+ Copyright (C) 2015
+ Ludwig-Maximilians-Universität München
+ Lehr- und Forschungseinheit für Datenbanksysteme
+ ELKI Development Team
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+import java.util.Arrays;
+
+import de.lmu.ifi.dbs.elki.algorithm.AbstractDistanceBasedAlgorithm;
+import de.lmu.ifi.dbs.elki.algorithm.clustering.ClusteringAlgorithmUtil;
+import de.lmu.ifi.dbs.elki.algorithm.clustering.ClusteringAlgorithm;
+import de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.initialization.KMedoidsInitialization;
+import de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.initialization.PAMInitialMeans;
+import de.lmu.ifi.dbs.elki.data.Cluster;
+import de.lmu.ifi.dbs.elki.data.Clustering;
+import de.lmu.ifi.dbs.elki.data.model.MedoidModel;
+import de.lmu.ifi.dbs.elki.data.type.TypeInformation;
+import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
+import de.lmu.ifi.dbs.elki.database.Database;
+import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory;
+import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil;
+import de.lmu.ifi.dbs.elki.database.datastore.WritableDoubleDataStore;
+import de.lmu.ifi.dbs.elki.database.datastore.WritableIntegerDataStore;
+import de.lmu.ifi.dbs.elki.database.ids.ArrayDBIDs;
+import de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDArrayIter;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDIter;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDRange;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDVar;
+import de.lmu.ifi.dbs.elki.database.ids.DBIDs;
+import de.lmu.ifi.dbs.elki.database.query.DatabaseQuery;
+import de.lmu.ifi.dbs.elki.database.query.distance.DistanceQuery;
+import de.lmu.ifi.dbs.elki.database.relation.Relation;
+import de.lmu.ifi.dbs.elki.distance.distancefunction.DistanceFunction;
+import de.lmu.ifi.dbs.elki.index.distancematrix.PrecomputedDistanceMatrix;
+import de.lmu.ifi.dbs.elki.logging.Logging;
+import de.lmu.ifi.dbs.elki.logging.progress.IndefiniteProgress;
+import de.lmu.ifi.dbs.elki.logging.statistics.DoubleStatistic;
+import de.lmu.ifi.dbs.elki.logging.statistics.LongStatistic;
+import de.lmu.ifi.dbs.elki.logging.statistics.StringStatistic;
+import de.lmu.ifi.dbs.elki.utilities.documentation.Reference;
+import de.lmu.ifi.dbs.elki.utilities.documentation.Title;
+import de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.constraints.CommonConstraints;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.IntParameter;
+import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter;
+
+/**
+ * The original PAM algorithm or k-medoids clustering, as proposed by Kaufman
+ * and Rousseeuw in "Partitioning Around Medoids".
+ *
+ * Reference:
+ * <p>
+ * Clustering my means of Medoids<br />
+ * Kaufman, L. and Rousseeuw, P.J.<br />
+ * in: Statistical Data Analysis Based on the L1-Norm and Related Methods
+ * </p>
+ *
+ * @author Erich Schubert
+ *
+ * @apiviz.has MedoidModel
+ * @apiviz.composedOf KMedoidsInitialization
+ *
+ * @param <V> vector datatype
+ */
+@Title("Partioning Around Medoids")
+@Reference(title = "Clustering by means of Medoids", //
+authors = "Kaufman, L. and Rousseeuw, P.J.", //
+booktitle = "Statistical Data Analysis Based on the L1-Norm and Related Methods")
+public class KMedoidsPAM<V> extends AbstractDistanceBasedAlgorithm<V, Clustering<MedoidModel>>implements ClusteringAlgorithm<Clustering<MedoidModel>> {
+ /**
+ * The logger for this class.
+ */
+ private static final Logging LOG = Logging.getLogger(KMedoidsPAM.class);
+
+ /**
+ * Key for statistics logging.
+ */
+ private static final String KEY = KMedoidsPAM.class.getName();
+
+ /**
+ * The number of clusters to produce.
+ */
+ protected int k;
+
+ /**
+ * The maximum number of iterations.
+ */
+ protected int maxiter;
+
+ /**
+ * Method to choose initial means.
+ */
+ protected KMedoidsInitialization<V> initializer;
+
+ /**
+ * Constructor.
+ *
+ * @param distanceFunction distance function
+ * @param k k parameter
+ * @param maxiter Maxiter parameter
+ * @param initializer Function to generate the initial means
+ */
+ public KMedoidsPAM(DistanceFunction<? super V> distanceFunction, int k, int maxiter, KMedoidsInitialization<V> initializer) {
+ super(distanceFunction);
+ this.k = k;
+ this.maxiter = maxiter;
+ this.initializer = initializer;
+ }
+
+ /**
+ * Run k-medoids
+ *
+ * @param database Database
+ * @param relation relation to use
+ * @return result
+ */
+ public Clustering<MedoidModel> run(Database database, Relation<V> relation) {
+ if(relation.size() <= 0) {
+ return new Clustering<>("PAM Clustering", "pam-clustering");
+ }
+ DistanceQuery<V> distQ = database.getDistanceQuery(relation, getDistanceFunction(), DatabaseQuery.HINT_OPTIMIZED_ONLY);
+ DBIDs ids = relation.getDBIDs();
+ if(distQ == null && ids instanceof DBIDRange) {
+ LOG.verbose("Adding a distance matrix index to accelerate PAM.");
+ PrecomputedDistanceMatrix<V> idx = new PrecomputedDistanceMatrix<V>(relation, getDistanceFunction());
+ idx.initialize();
+ distQ = idx.getDistanceQuery(getDistanceFunction());
+ }
+ if(distQ == null) {
+ distQ = database.getDistanceQuery(relation, getDistanceFunction());
+ LOG.warning("PAM may be slow, because we do not have a precomputed distance matrix available.");
+ }
+ // Choose initial medoids
+ if(LOG.isStatistics()) {
+ LOG.statistics(new StringStatistic(KEY + ".initialization", initializer.toString()));
+ }
+ ArrayModifiableDBIDs medoids = DBIDUtil.newArray(initializer.chooseInitialMedoids(k, ids, distQ));
+ if(medoids.size() != k) {
+ throw new AbortException("Initializer " + initializer.toString() + " did not return " + k + " means, but " + medoids.size());
+ }
+
+ // Setup cluster assignment store
+ WritableIntegerDataStore assignment = DataStoreUtil.makeIntegerStorage(ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_TEMP, -1);
+ runPAMOptimization(distQ, ids, medoids, assignment);
+
+ ArrayModifiableDBIDs[] clusters = ClusteringAlgorithmUtil.partitionsFromIntegerLabels(ids, assignment, k);
+
+ // Wrap result
+ Clustering<MedoidModel> result = new Clustering<>("PAM Clustering", "pam-clustering");
+ for(DBIDArrayIter it = medoids.iter(); it.valid(); it.advance()) {
+ MedoidModel model = new MedoidModel(DBIDUtil.deref(it));
+ result.addToplevelCluster(new Cluster<>(clusters[it.getOffset()], model));
+ }
+ return result;
+ }
+
+ /**
+ * Run the PAM optimization phase.
+ *
+ * @param distQ Distance query
+ * @param ids IDs to process
+ * @param medoids Medoids list
+ * @param assignment Cluster assignment
+ */
+ protected void runPAMOptimization(DistanceQuery<V> distQ, DBIDs ids, ArrayModifiableDBIDs medoids, WritableIntegerDataStore assignment) {
+ WritableDoubleDataStore nearest = DataStoreUtil.makeDoubleStorage(ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_TEMP);
+ WritableDoubleDataStore second = DataStoreUtil.makeDoubleStorage(ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_TEMP);
+ // Initial assignment to nearest medoids
+ // TODO: reuse distance information, from the build phase, when possible?
+ double tc = assignToNearestCluster(medoids, ids, nearest, second, assignment, distQ);
+ if(LOG.isStatistics()) {
+ LOG.statistics(new DoubleStatistic(KEY + ".iteration-" + 0 + ".cost", tc));
+ }
+
+ IndefiniteProgress prog = LOG.isVerbose() ? new IndefiniteProgress("PAM iteration", LOG) : null;
+ // Swap phase
+ DBIDVar bestid = DBIDUtil.newVar();
+ DBIDArrayIter m = medoids.iter();
+ int iteration = 1;
+ double[] cost = new double[k];
+ for(; maxiter <= 0 || iteration <= maxiter; iteration++) {
+ LOG.incrementProcessed(prog);
+ // Try to swap a non-medoid with a medoid member:
+ double best = Double.POSITIVE_INFINITY;
+ int bestcluster = -1;
+ // Iterate over all non-medoids:
+ for(DBIDIter h = ids.iter(); h.valid(); h.advance()) {
+ final int pm = assignment.intValue(h);
+ m.seek(pm);
+ double hdist = nearest.doubleValue(h); // Current assignment cost of h.
+ if(DBIDUtil.equal(m, h) || hdist <= 0.) {
+ continue; // Only consider non-selected items
+ }
+ // h is a non-medoid currently in cluster of medoid m.
+ Arrays.fill(cost, -hdist);
+ // Compute costs of reassigning other objects j:
+ for(DBIDIter j = ids.iter(); j.valid(); j.advance()) {
+ if(DBIDUtil.equal(h, j)) {
+ continue;
+ }
+ final int pj = assignment.intValue(j);
+ // distance(j, i) for pi == pj
+ final double distcur = nearest.doubleValue(j);
+ // second nearest, alternative reassignment
+ final double distsec = second.doubleValue(j);
+ // distance(j, h), the possible reassignment
+ final double dist_h = distQ.distance(h, j);
+ // We moved this distance computation out compared to original PAM.
+ if(dist_h < distcur) {
+ for(int pi = 0; pi < k; pi++) {
+ if(pi == pj) { // The current nearest is lost.
+ cost[pi] += ((dist_h < distsec) ? //
+ dist_h // Case 1b1) j is closer to h
+ : distsec // Case 1b2) j would switch to its second nearest
+ ) - distcur;
+ }
+ else { // Case 1c) j is closer to h than its current medoid
+ cost[pi] += dist_h - distcur;
+ } // else Case 1a): j is closer to i than h and m, so no change.
+ }
+ }
+ else { // Only need to consider pi == pj
+ if(dist_h < distsec) {
+ // Case 1b1) j is closer to h
+ cost[pj] += dist_h - distcur;
+ }
+ else {
+ // Case 1b2) j would switch to its second nearest
+ cost[pj] += distsec - distcur;
+ }
+ }
+ }
+
+ // Consider all possible swaps:
+ for(int pi = 0; pi < k; pi++) {
+ if(cost[pi] < best) {
+ best = cost[pi];
+ bestid.set(h);
+ bestcluster = pi;
+ }
+ }
+ }
+ if(best >= 0.) {
+ break;
+ }
+ medoids.set(bestcluster, bestid);
+ // Reassign
+ double nc = assignToNearestCluster(medoids, ids, nearest, second, assignment, distQ);
+ if(LOG.isStatistics()) {
+ LOG.statistics(new DoubleStatistic(KEY + ".iteration-" + iteration + ".cost", nc));
+ }
+ if(nc > tc) {
+ if(nc - tc < 1e-7 * tc) {
+ LOG.warning("PAM failed to converge (numerical instability?)");
+ break;
+ }
+ LOG.warning("PAM failed to converge: costs increased by: " + (nc - tc) + " exepected a decrease by " + best);
+ break;
+ }
+ tc = nc;
+ }
+ LOG.setCompleted(prog);
+ if(LOG.isStatistics()) {
+ LOG.statistics(new LongStatistic(KEY + ".iterations", iteration));
+ LOG.statistics(new DoubleStatistic(KEY + ".iteration-" + iteration + ".cost", tc));
+ }
+ }
+
+ /**
+ * Returns a list of clusters. The k<sup>th</sup> cluster contains the ids of
+ * those FeatureVectors, that are nearest to the k<sup>th</sup> mean.
+ *
+ * @param means Object centroids
+ * @param ids Object ids
+ * @param nearest Distance to nearest medoid
+ * @param second Distance to second nearest medoid
+ * @param assignment Cluster assignment
+ * @param distQ distance query
+ * @return Assignment cost
+ */
+ protected double assignToNearestCluster(ArrayDBIDs means, DBIDs ids, WritableDoubleDataStore nearest, WritableDoubleDataStore second, WritableIntegerDataStore assignment, DistanceQuery<V> distQ) {
+ assert(means.size() == k);
+ DBIDArrayIter miter = means.iter();
+ double cost = 0.;
+ for(DBIDIter iditer = ids.iter(); iditer.valid(); iditer.advance()) {
+ double mindist = Double.POSITIVE_INFINITY,
+ mindist2 = Double.POSITIVE_INFINITY;
+ int minIndex = -1;
+ for(int i = 0; i < k; i++) {
+ double dist = distQ.distance(iditer, miter.seek(i));
+ if(dist < mindist) {
+ mindist2 = mindist;
+ mindist = dist;
+ minIndex = i;
+ }
+ else if(dist < mindist2) {
+ mindist2 = dist;
+ }
+ }
+ if(minIndex < 0) {
+ throw new AbortException("Too many infinite distances. Cannot assign objects.");
+ }
+ assignment.put(iditer, minIndex);
+ nearest.put(iditer, mindist);
+ second.put(iditer, mindist2);
+ cost += mindist;
+ }
+ return cost;
+ }
+
+ @Override
+ public TypeInformation[] getInputTypeRestriction() {
+ return TypeUtil.array(getDistanceFunction().getInputTypeRestriction());
+ }
+
+ @Override
+ protected Logging getLogger() {
+ return LOG;
+ }
+
+ /**
+ * Parameterization class.
+ *
+ * @author Erich Schubert
+ *
+ * @apiviz.exclude
+ */
+ public static class Parameterizer<V> extends AbstractDistanceBasedAlgorithm.Parameterizer<V> {
+ /**
+ * The number of clusters to produce.
+ */
+ protected int k;
+
+ /**
+ * The maximum number of iterations.
+ */
+ protected int maxiter;
+
+ /**
+ * Method to choose initial means.
+ */
+ protected KMedoidsInitialization<V> initializer;
+
+ @Override
+ protected void makeOptions(Parameterization config) {
+ super.makeOptions(config);
+ IntParameter kP = new IntParameter(KMeans.K_ID) //
+ .addConstraint(CommonConstraints.GREATER_EQUAL_ONE_INT);
+ if(config.grab(kP)) {
+ k = kP.intValue();
+ }
+
+ ObjectParameter<KMedoidsInitialization<V>> initialP = new ObjectParameter<>(KMeans.INIT_ID, KMedoidsInitialization.class, PAMInitialMeans.class);
+ if(config.grab(initialP)) {
+ initializer = initialP.instantiateClass(config);
+ }
+
+ IntParameter maxiterP = new IntParameter(KMeans.MAXITER_ID, 0) //
+ .addConstraint(CommonConstraints.GREATER_EQUAL_ZERO_INT);
+ if(config.grab(maxiterP)) {
+ maxiter = maxiterP.intValue();
+ }
+ }
+
+ @Override
+ protected KMedoidsPAM<V> makeInstance() {
+ return new KMedoidsPAM<>(distanceFunction, k, maxiter, initializer);
+ }
+ }
+}